[uClibc] improved memcpy/memset on arm in uClibc-0.9.19

Markus Pietrek maillist at fsforth.de
Tue Sep 16 15:40:34 UTC 2003


Hi folks,

The C implementation of memcpy() and memset() is very slow on ARM. With
optimized assembler code, the performance for 32-bit aligned memory blocks can
be improved considerably. Test runs on the NS7520 ARM7 show that copying a
512 kB memory region takes only 20 ms instead of 320 ms. About
the same ratio can be achieved for memset().

My code only works under the following assumptions: the memory is 32-bit
aligned, no wide characters (Wchar) are used, the block is large enough (more
than 0x38 bytes) to compensate for the overhead, and of course the target is
an ARM.

So my questions are:
o With these restrictions, does it make sense to put the code in this general
  library?
o What is the best method for replacing C functions with custom assembler
  functions? Obviously, hacking libc/string/wstring.c is not good.

Here is a preliminary version of the optimized functions.

/*
 * Copy n elements from s2 to s1 and return s1.
 *
 * Fast path (ARM state only): when the elements are single bytes, both
 * pointers are 32-bit aligned and more than 0x38 bytes are requested, the
 * data is moved in 56-byte (0x38) chunks -- two ldmia/stmia pairs of seven
 * registers each -- followed by a word loop and a byte loop for the tail.
 * In every other case a plain element-by-element loop is used.
 *
 * The assembler treats n as a byte count, which is why the fast path is
 * only taken when sizeof(Wchar) == 1.
 */
Wvoid *Wmemcpy(Wvoid * __restrict s1, const Wvoid * __restrict s2, size_t n)
{
	register Wchar *r1 = s1;
	register const Wchar *r2 = s2;

#if defined(__arm__) && !defined(__thumb__)
	if (sizeof(Wchar) == 1
	    && !(((size_t) r1 | (size_t) r2) & 0x3)
	    && n > 0x38) {
		/*
		 * All three operands are advanced/decremented by the code, so
		 * they must be read-write ("+r"); otherwise the compiler may
		 * assume they are unchanged (e.g. share a register between r1
		 * and s1 and return the advanced pointer).  "memory" tells the
		 * compiler that the destination is written, "cc" that the
		 * condition flags are changed.
		 */
		__asm__ __volatile__ (
			"1:\n\t"			/* 56-byte chunks */
			"ldmia   %0!, {r3-r9}\n\t"
			"stmia   %1!, {r3-r9}\n\t"
			"ldmia   %0!, {r3-r9}\n\t"
			"stmia   %1!, {r3-r9}\n\t"
			"sub     %2, %2, #0x38\n\t"
			"cmp     %2, #0x38\n\t"
			"bhs     1b\n"			/* unsigned: n is a size_t */
			"2:\n\t"			/* remaining whole words */
			"cmp     %2, #4\n\t"
			"blo     3f\n\t"
			"ldr     r4, [%0], #4\n\t"
			"str     r4, [%1], #4\n\t"
			"sub     %2, %2, #4\n\t"
			"b       2b\n"
			"3:\n\t"			/* remaining bytes */
			"cmp     %2, #0\n\t"
			"beq     4f\n\t"
			"ldrb    r4, [%0], #1\n\t"
			"strb    r4, [%1], #1\n\t"
			"sub     %2, %2, #1\n\t"
			"b       3b\n"
			"4:\n"
			: "+r"(r2), "+r"(r1), "+r"(n)
			:
			: "r3", "r4", "r5", "r6", "r7", "r8", "r9",
			  "cc", "memory");
		return s1;
	}
#endif

	/* Generic path: unaligned, short, wide-character or non-ARM copies. */
	while (n) {
		*r1++ = *r2++;
		--n;
	}
	return s1;
}


/*
 * Fill n elements starting at s with the value c and return s.
 *
 * Fast path (ARM state only): when the elements are single bytes, s is
 * 32-bit aligned and more than 0x38 bytes are requested, the low byte of c
 * is replicated into seven registers and stored in 56-byte (0x38) chunks
 * -- two stmia of seven registers each -- followed by a word loop and a
 * byte loop for the tail.  In every other case a plain element-by-element
 * loop is used.
 *
 * The assembler replicates only the low 8 bits of c and treats n as a byte
 * count, which is why the fast path is only taken when sizeof(Wuchar) == 1.
 */
Wvoid *Wmemset(Wvoid *s, Wint c, size_t n)
{
	register Wuchar *p = (Wuchar *) s;
#ifdef __BCC__
	/* bcc can optimize the counter if it thinks it is a pointer... */
	register const char *np = (const char *) n;
#else
#define np n
#endif

#if defined(__arm__) && !defined(__thumb__)
	if (sizeof(Wuchar) == 1 && !((size_t) p & 0x3) && n > 0x38) {
		/*
		 * p and n are modified by the code and therefore read-write
		 * ("+r") operands; c is only read, and it is read before any
		 * output is written.  "memory" tells the compiler that the
		 * destination is written, "cc" that the flags are changed.
		 */
		__asm__ __volatile__ (
			/* Build the fill pattern once, outside the loop. */
			"and     r3, %2, #0xff\n\t"
			"orr     r3, r3, r3, lsl #8\n\t"
			"orr     r3, r3, r3, lsl #16\n\t"
			"mov     r4, r3\n\t"
			"mov     r5, r3\n\t"
			"mov     r6, r3\n\t"
			"mov     r7, r3\n\t"
			"mov     r8, r3\n\t"
			"mov     r9, r3\n"
			"1:\n\t"			/* 56-byte chunks */
			"stmia   %0!, {r3-r9}\n\t"
			"stmia   %0!, {r3-r9}\n\t"
			"sub     %1, %1, #0x38\n\t"
			"cmp     %1, #0x38\n\t"
			"bhs     1b\n"			/* unsigned: n is a size_t */
			"2:\n\t"			/* remaining whole words */
			"cmp     %1, #4\n\t"
			"blo     3f\n\t"
			"str     r3, [%0], #4\n\t"
			"sub     %1, %1, #4\n\t"
			"b       2b\n"
			"3:\n\t"			/* remaining bytes */
			"cmp     %1, #0\n\t"
			"beq     4f\n\t"
			"strb    r3, [%0], #1\n\t"
			"sub     %1, %1, #1\n\t"
			"b       3b\n"
			"4:\n"
			: "+r"(p), "+r"(n)
			: "r"(c)
			: "r3", "r4", "r5", "r6", "r7", "r8", "r9",
			  "cc", "memory");
		return s;
	}
#endif

	/* Generic path: unaligned, short, wide-character or non-ARM fills. */
	while (np) {
		*p++ = (Wuchar) c;
		--np;
	}
	return s;
}

Bye,

-- 
Markus Pietrek
FS Forth-Systeme GmbH
Phone: +49 (7667) 908 145,  FAX +49 (7667) 908 221





More information about the uClibc mailing list