memcpy

npissoawsome · Post by **npissoawsome** » September 30th, 2012, 6:39 pm

this is the C++ source

void * __cdecl memcpy (
        void * dst,
        const void * src,
        size_t count
        )
{
        void * ret = dst;

        /*
         * copy from lower addresses to higher addresses
         */
        while (count--) {
                *(char *)dst = *(char *)src;
                dst = (char *)dst + 1;
                src = (char *)src + 1;
        }

        return(ret);
}

and this is the asm source

Code: Select all

       page    ,132
        title   memcpy - Copy source memory bytes to destination
;***
;memcpy.asm - contains memcpy and memmove routines
;
;       Copyright (c) Microsoft Corporation. All rights reserved.
;
;Purpose:
;       memcpy() copies a source memory buffer to a destination buffer.
;       Overlapping buffers are not treated specially, so propogation may occur.
;       memmove() copies a source memory buffer to a destination buffer.
;       Overlapping buffers are treated specially, to avoid propogation.
;
;*******************************************************************************

        .xlist
        include cruntime.inc
        .list
        .xmm

M_EXIT  macro
        ret                     ; _cdecl return
        endm    ; M_EXIT

PALIGN_memcpy macro d
MovPalign&d&:
        movdqa      xmm1,xmmword ptr [esi-d]
        lea         esi, byte ptr [esi-d]
    align   @WordSize
PalignLoop&d&:
        movdqa  xmm3,xmmword ptr [esi+10h]
        sub     ecx,30h
        movdqa  xmm0,xmmword ptr [esi+20h]
        movdqa  xmm5,xmmword ptr [esi+30h]
        lea     esi, xmmword ptr [esi+30h]
        cmp     ecx,30h
        movdqa  xmm2,xmm3

        palignr xmm3,xmm1,d

        movdqa  xmmword ptr [edi],xmm3
        movdqa  xmm4,xmm0

        palignr xmm0,xmm2,d

        movdqa  xmmword ptr [edi+10h],xmm0
        movdqa  xmm1,xmm5

        palignr xmm5,xmm4,d

        movdqa  xmmword ptr [edi+20h],xmm5
        lea     edi, xmmword ptr [edi+30h]
        jge     PalignLoop&d&
        lea     esi, xmmword ptr [esi+d]

        endm    ; PALIGN_memcpy

        CODESEG

    extrn   __isa_available:dword
    extrn   __isa_enabled:dword
    extrn   __favor:dword

page
;***
;memcpy - Copy source buffer to destination buffer
;
;Purpose:
;       memcpy() copies a source memory buffer to a destination memory buffer.
;       This routine does NOT recognize overlapping buffers, and thus can lead
;       to propogation.
;       For cases where propogation must be avoided, memmove() must be used.
;
;       Algorithm:
;
;           Same as memmove. See Below
;
;
;memmove - Copy source buffer to destination buffer
;
;Purpose:
;       memmove() copies a source memory buffer to a destination memory buffer.
;       This routine recognize overlapping buffers to avoid propogation.
;       For cases where propogation is not a problem, memcpy() can be used.
;
;   Algorithm:
;
;       void * memmove(void * dst, void * src, size_t count)
;       {
;               void * ret = dst;
;
;               if (dst <= src || dst >= (src + count)) {
;                       /*
;                        * Non-Overlapping Buffers
;                        * copy from lower addresses to higher addresses
;                        */
;                       while (count--)
;                               *dst++ = *src++;
;                       }
;               else {
;                       /*
;                        * Overlapping Buffers
;                        * copy from higher addresses to lower addresses
;                        */
;                       dst += count - 1;
;                       src += count - 1;
;
;                       while (count--)
;                               *dst-- = *src--;
;                       }
;
;               return(ret);
;       }
;
;
;Entry:
;       void *dst = pointer to destination buffer
;       const void *src = pointer to source buffer
;       size_t count = number of bytes to copy
;
;Exit:
;       Returns a pointer to the destination buffer in AX/DX:AX
;
;Uses:
;       CX, DX
;
;Exceptions:
;*******************************************************************************

ifdef MEM_MOVE
        _MEM_     equ <memmove>
else  ; MEM_MOVE
        _MEM_     equ <memcpy>
endif  ; MEM_MOVE

%       public  _MEM_
_MEM_   proc \
        dst:ptr byte, \
        src:ptr byte, \
        count:IWORD

        ; destination pointer
        ; source pointer
        ; number of bytes to copy

        OPTION PROLOGUE:NONE, EPILOGUE:NONE

        push    edi             ;U - save edi
        push    esi             ;V - save esi

;                   size param/4   prolog byte  #reg saved
        .FPO ( 0, 3           , $-_MEM_     , 2, 0, 0 )

        mov     esi,[esp + 010h]     ;U - esi = source
        mov     ecx,[esp + 014h]     ;V - ecx = number of bytes to move
        mov     edi,[esp + 0Ch]      ;U - edi = dest

;
; Check for overlapping buffers:
;       If (dst <= src) Or (dst >= src + Count) Then
;               Do normal (Upwards) Copy
;       Else
;               Do Downwards Copy to avoid propagation
;

        mov     eax,ecx         ;V - eax = byte count...

        mov     edx,ecx         ;U - edx = byte count...
        add     eax,esi         ;V - eax = point past source end

        cmp     edi,esi         ;U - dst <= src ?
        jbe     short CopyUp    ;V - yes, copy toward higher addresses

        cmp     edi,eax         ;U - dst < (src + count) ?
        jb      CopyDown        ;V - yes, copy toward lower addresses

;
; Copy toward higher addresses.
;
CopyUp:
;
        ; See if Enhanced Fast Strings is supported.
        ; ENFSTRG supported?
        bt      __favor, __FAVOR_ENFSTRG
        jnc     CopyUpSSE2Check                 ; no jump
        ;
        ; use Enhanced Fast Strings
        rep     movsb
        jmp     TrailUp0         ; Done
CopyUpSSE2Check:
;
; Next, see if we can use a "fast" copy SSE2 routine
        ; block size greater than min threshold?
        cmp     ecx,080h
        jb      Dword_align  ; length too small go use dwords
        ; alignments equal?
        mov     eax,edi
        xor     eax,esi
        test    eax,15
        jne     AtomChk   ; Not aligned go check Atom
        bt      __isa_enabled, __ISA_AVAILABLE_SSE2
        jc      VEC_memcpy ; yes, go SSE2 copy (params already set)
AtomChk:
        ; Is Atom supported?
        bt      __favor, __FAVOR_ATOM
        jnc     Dword_align ; no,jump

        ; check if dst is 4 byte aligned
        test    edi, 3
        jne     CopyLeadUp

        ; check if src is 4 byte aligned
        test    esi, 3
        jne     Dword_align_Ok

; A software pipelining vectorized memcpy loop using PALIGN instructions

; (1) copy the first bytes to align dst up to the nearest 16-byte boundary
; 4 byte align -> 12 byte copy, 8 byte align -> 8 byte copy, 12 byte align -> 4 byte copy
PalignHead4:
        bt      edi, 2
        jae     PalignHead8
        mov     eax, dword ptr [esi]
        sub     ecx, 4
        lea     esi, byte ptr [esi+4]
        mov     dword ptr [edi], eax
        lea     edi, byte ptr [edi+4]

PalignHead8:
        bt      edi, 3
        jae     PalignLoop
        movq    xmm1, qword ptr [esi]
        sub     ecx, 8
        lea     esi, byte ptr [esi+8]
        movq    qword ptr [edi], xmm1
        lea     edi, byte ptr [edi+8]

;(2) Use SSE palign loop
PalignLoop:
        test    esi, 7
        je      MovPalign8
        bt      esi, 3
        jae     MovPalign4

PALIGN_memcpy 12
        jmp     PalignTail

PALIGN_memcpy 8
        jmp     PalignTail

PALIGN_memcpy 4

;(3) Copy the tailing bytes.
PalignTail:
        cmp    ecx,10h
        jl     PalignTail4
        movdqu xmm1,xmmword ptr [esi]
        sub    ecx, 10h
        lea    esi, xmmword ptr [esi+10h]
        movdqa xmmword ptr [edi],xmm1
        lea    edi, xmmword ptr [edi+10h]
        jmp    PalignTail

PalignTail4:
        bt      ecx, 2
        jae     PalignTail8
        mov     eax, dword ptr [esi]
        sub     ecx,4
        lea     esi, byte ptr [esi+4]
        mov     dword ptr [edi], eax
        lea     edi, byte ptr [edi+4]

PalignTail8:
        bt      ecx, 3
        jae     PalignTailLE3
        movq    xmm1, qword ptr [esi]
        sub     ecx,8
        lea     esi, byte ptr [esi+8]
        movq    qword ptr [edi], xmm1
        lea     edi, byte ptr [edi+8]

PalignTailLE3:
        mov     eax, dword ptr TrailUpVec[ecx*4]
        jmp     eax

; The algorithm for forward moves is to align the destination to a dword
; boundary and so we can move dwords with an aligned destination.  This
; occurs in 3 steps.
;
;   - move x = ((4 - Dest & 3) & 3) bytes
;   - move y = ((L-x) >> 2) dwords
;   - move (L - x - y*4) bytes
;

Dword_align:
        test    edi,11b         ;U - destination dword aligned?
        jnz     short CopyLeadUp ;V - if we are not dword aligned already, align
Dword_align_Ok:
        shr     ecx,2           ;U - shift down to dword count
        and     edx,11b         ;V - trailing byte count

        cmp     ecx,8           ;U - test if small enough for unwind copy
        jb      short CopyUnwindUp ;V - if so, then jump

        rep     movsd           ;N - move all of our dwords

        jmp     dword ptr TrailUpVec[edx*4] ;N - process trailing bytes

;
; Code to do optimal memory copies for non-dword-aligned destinations.
;

; The following length check is done for two reasons:
;
;    1. to ensure that the actual move length is greater than any possiale
;       alignment move, and
;
;    2. to skip the multiple move logic for small moves where it would
;       be faster to move the bytes with one instruction.
;

        align   @WordSize
CopyLeadUp:

        mov     eax,edi         ;U - get destination offset
        mov     edx,11b         ;V - prepare for mask

        sub     ecx,4           ;U - check for really short string - sub for adjust
        jb      short ByteCopyUp ;V - branch to just copy bytes

        and     eax,11b         ;U - get offset within first dword
        add     ecx,eax         ;V - update size after leading bytes copied

        jmp     dword ptr LeadUpVec[eax*4-4] ;N - process leading bytes

        align   @WordSize
ByteCopyUp:
        jmp     dword ptr TrailUpVec[ecx*4+16] ;N - process just bytes

        align   @WordSize
CopyUnwindUp:
        jmp     dword ptr UnwindUpVec[ecx*4] ;N - unwind dword copy

        align   @WordSize
LeadUpVec       dd      LeadUp1, LeadUp2, LeadUp3

        align   @WordSize
LeadUp1:
        and     edx,ecx         ;U - trailing byte count
        mov     al,[esi]        ;V - get first byte from source

        mov     [edi],al        ;U - write second byte to destination
        mov     al,[esi+1]      ;V - get second byte from source

        mov     [edi+1],al      ;U - write second byte to destination
        mov     al,[esi+2]      ;V - get third byte from source

        shr     ecx,2           ;U - shift down to dword count
        mov     [edi+2],al      ;V - write third byte to destination

        add     esi,3           ;U - advance source pointer
        add     edi,3           ;V - advance destination pointer

        cmp     ecx,8           ;U - test if small enough for unwind copy
        jb      short CopyUnwindUp ;V - if so, then jump

        rep     movsd           ;N - move all of our dwords

        jmp     dword ptr TrailUpVec[edx*4] ;N - process trailing bytes

        align   @WordSize
LeadUp2:
        and     edx,ecx         ;U - trailing byte count
        mov     al,[esi]        ;V - get first byte from source

        mov     [edi],al        ;U - write second byte to destination
        mov     al,[esi+1]      ;V - get second byte from source

        shr     ecx,2           ;U - shift down to dword count
        mov     [edi+1],al      ;V - write second byte to destination

        add     esi,2           ;U - advance source pointer
        add     edi,2           ;V - advance destination pointer

        cmp     ecx,8           ;U - test if small enough for unwind copy
        jb      short CopyUnwindUp ;V - if so, then jump

        rep     movsd           ;N - move all of our dwords

        jmp     dword ptr TrailUpVec[edx*4] ;N - process trailing bytes

        align   @WordSize
LeadUp3:
        and     edx,ecx         ;U - trailing byte count
        mov     al,[esi]        ;V - get first byte from source

        mov     [edi],al        ;U - write second byte to destination
        add     esi,1           ;V - advance source pointer

        shr     ecx,2           ;U - shift down to dword count
        add     edi,1           ;V - advance destination pointer

        cmp     ecx,8           ;U - test if small enough for unwind copy
        jb      short CopyUnwindUp ;V - if so, then jump

        rep     movsd           ;N - move all of our dwords

        jmp     dword ptr TrailUpVec[edx*4] ;N - process trailing bytes

        align   @WordSize
UnwindUpVec     dd      UnwindUp0, UnwindUp1, UnwindUp2, UnwindUp3
                dd      UnwindUp4, UnwindUp5, UnwindUp6, UnwindUp7

UnwindUp7:
        mov     eax,[esi+ecx*4-28] ;U - get dword from source
                                   ;V - spare
        mov     [edi+ecx*4-28],eax ;U - put dword into destination
UnwindUp6:
        mov     eax,[esi+ecx*4-24] ;U(entry)/V(not) - get dword from source
                                   ;V(entry) - spare
        mov     [edi+ecx*4-24],eax ;U - put dword into destination
UnwindUp5:
        mov     eax,[esi+ecx*4-20] ;U(entry)/V(not) - get dword from source
                                   ;V(entry) - spare
        mov     [edi+ecx*4-20],eax ;U - put dword into destination
UnwindUp4:
        mov     eax,[esi+ecx*4-16] ;U(entry)/V(not) - get dword from source
                                   ;V(entry) - spare
        mov     [edi+ecx*4-16],eax ;U - put dword into destination
UnwindUp3:
        mov     eax,[esi+ecx*4-12] ;U(entry)/V(not) - get dword from source
                                   ;V(entry) - spare
        mov     [edi+ecx*4-12],eax ;U - put dword into destination
UnwindUp2:
        mov     eax,[esi+ecx*4-8] ;U(entry)/V(not) - get dword from source
                                  ;V(entry) - spare
        mov     [edi+ecx*4-8],eax ;U - put dword into destination
UnwindUp1:
        mov     eax,[esi+ecx*4-4] ;U(entry)/V(not) - get dword from source
                                  ;V(entry) - spare
        mov     [edi+ecx*4-4],eax ;U - put dword into destination

        lea     eax,[ecx*4]     ;V - compute update for pointer

        add     esi,eax         ;U - update source pointer
        add     edi,eax         ;V - update destination pointer
UnwindUp0:
        jmp     dword ptr TrailUpVec[edx*4] ;N - process trailing bytes

;-----------------------------------------------------------------------------

        align   @WordSize
TrailUpVec      dd      TrailUp0, TrailUp1, TrailUp2, TrailUp3

        align   @WordSize
TrailUp0:
        mov     eax,[esp + 0Ch] ;U - return pointer to destination
        pop     esi             ;V - restore esi
        pop     edi             ;U - restore edi
                                ;V - spare
        M_EXIT

        align   @WordSize
TrailUp1:
        mov     al,[esi]        ;U - get byte from source
                                ;V - spare
        mov     [edi],al        ;U - put byte in destination
        mov     eax,[esp + 0Ch] ;V - return pointer to destination
        pop     esi             ;U - restore esi
        pop     edi             ;V - restore edi
        M_EXIT

        align   @WordSize
TrailUp2:
        mov     al,[esi]        ;U - get first byte from source
                                ;V - spare
        mov     [edi],al        ;U - put first byte into destination
        mov     al,[esi+1]      ;V - get second byte from source
        mov     [edi+1],al      ;U - put second byte into destination
        mov     eax,[esp + 0Ch] ;V - return pointer to destination
        pop     esi             ;U - restore esi
        pop     edi             ;V - restore edi
        M_EXIT

        align   @WordSize
TrailUp3:
        mov     al,[esi]        ;U - get first byte from source
                                ;V - spare
        mov     [edi],al        ;U - put first byte into destination
        mov     al,[esi+1]      ;V - get second byte from source
        mov     [edi+1],al      ;U - put second byte into destination
        mov     al,[esi+2]      ;V - get third byte from source
        mov     [edi+2],al      ;U - put third byte into destination
        mov     eax,[esp + 0Ch] ;V - return pointer to destination
        pop     esi             ;U - restore esi
        pop     edi             ;V - restore edi
        M_EXIT

;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------

;
; Copy down to avoid propogation in overlapping buffers.
;
        align   @WordSize
CopyDown:
        lea     esi,[esi+ecx-4] ;U - point to 4 bytes before src buffer end
        lea     edi,[edi+ecx-4] ;V - point to 4 bytes before dest buffer end
;
; See if the destination start is dword aligned
;

        test    edi,11b         ;U - test if dword aligned
        jnz     short CopyLeadDown ;V - if not, jump

        shr     ecx,2           ;U - shift down to dword count
        and     edx,11b         ;V - trailing byte count

        cmp     ecx,8           ;U - test if small enough for unwind copy
        jb      short CopyUnwindDown ;V - if so, then jump

        std                     ;N - set direction flag
        rep     movsd           ;N - move all of our dwords
        cld                     ;N - clear direction flag back

        jmp     dword ptr TrailDownVec[edx*4] ;N - process trailing bytes

        align   @WordSize
CopyUnwindDown:
        neg     ecx             ;U - negate dword count for table merging
                                ;V - spare

        jmp     dword ptr UnwindDownVec[ecx*4+28] ;N - unwind copy

        align   @WordSize
CopyLeadDown:

        mov     eax,edi         ;U - get destination offset
        mov     edx,11b         ;V - prepare for mask

        cmp     ecx,4           ;U - check for really short string
        jb      short ByteCopyDown ;V - branch to just copy bytes

        and     eax,11b         ;U - get offset within first dword
        sub     ecx,eax         ;U - to update size after lead copied

        jmp     dword ptr LeadDownVec[eax*4-4] ;N - process leading bytes

        align   @WordSize
ByteCopyDown:
        jmp     dword ptr TrailDownVec[ecx*4] ;N - process just bytes

        align   @WordSize
LeadDownVec     dd      LeadDown1, LeadDown2, LeadDown3

        align   @WordSize
LeadDown1:
        mov     al,[esi+3]      ;U - load first byte
        and     edx,ecx         ;V - trailing byte count

        mov     [edi+3],al      ;U - write out first byte
        sub     esi,1           ;V - point to last src dword

        shr     ecx,2           ;U - shift down to dword count
        sub     edi,1           ;V - point to last dest dword

        cmp     ecx,8           ;U - test if small enough for unwind copy
        jb      short CopyUnwindDown ;V - if so, then jump

        std                     ;N - set direction flag
        rep     movsd           ;N - move all of our dwords
        cld                     ;N - clear direction flag

        jmp     dword ptr TrailDownVec[edx*4] ;N - process trailing bytes

        align   @WordSize
LeadDown2:
        mov     al,[esi+3]      ;U - load first byte
        and     edx,ecx         ;V - trailing byte count

        mov     [edi+3],al      ;U - write out first byte
        mov     al,[esi+2]      ;V - get second byte from source

        shr     ecx,2           ;U - shift down to dword count
        mov     [edi+2],al      ;V - write second byte to destination

        sub     esi,2           ;U - point to last src dword
        sub     edi,2           ;V - point to last dest dword

        cmp     ecx,8           ;U - test if small enough for unwind copy
        jb      short CopyUnwindDown ;V - if so, then jump

        std                     ;N - set direction flag
        rep     movsd           ;N - move all of our dwords
        cld                     ;N - clear direction flag

        jmp     dword ptr TrailDownVec[edx*4] ;N - process trailing bytes

        align   @WordSize
LeadDown3:
        mov     al,[esi+3]      ;U - load first byte
        and     edx,ecx         ;V - trailing byte count

        mov     [edi+3],al      ;U - write out first byte
        mov     al,[esi+2]      ;V - get second byte from source

        mov     [edi+2],al      ;U - write second byte to destination
        mov     al,[esi+1]      ;V - get third byte from source

        shr     ecx,2           ;U - shift down to dword count
        mov     [edi+1],al      ;V - write third byte to destination

        sub     esi,3           ;U - point to last src dword
        sub     edi,3           ;V - point to last dest dword

        cmp     ecx,8           ;U - test if small enough for unwind copy
        jb      CopyUnwindDown  ;V - if so, then jump

        std                     ;N - set direction flag
        rep     movsd           ;N - move all of our dwords
        cld                     ;N - clear direction flag

        jmp     dword ptr TrailDownVec[edx*4] ;N - process trailing bytes

;------------------------------------------------------------------

        align   @WordSize
UnwindDownVec   dd      UnwindDown7, UnwindDown6, UnwindDown5, UnwindDown4
                dd      UnwindDown3, UnwindDown2, UnwindDown1, UnwindDown0

UnwindDown7:
        mov     eax,[esi+ecx*4+28] ;U - get dword from source
                                   ;V - spare
        mov     [edi+ecx*4+28],eax ;U - put dword into destination
UnwindDown6:
        mov     eax,[esi+ecx*4+24] ;U(entry)/V(not) - get dword from source
                                   ;V(entry) - spare
        mov     [edi+ecx*4+24],eax ;U - put dword into destination
UnwindDown5:
        mov     eax,[esi+ecx*4+20] ;U(entry)/V(not) - get dword from source
                                   ;V(entry) - spare
        mov     [edi+ecx*4+20],eax ;U - put dword into destination
UnwindDown4:
        mov     eax,[esi+ecx*4+16] ;U(entry)/V(not) - get dword from source
                                   ;V(entry) - spare
        mov     [edi+ecx*4+16],eax ;U - put dword into destination
UnwindDown3:
        mov     eax,[esi+ecx*4+12] ;U(entry)/V(not) - get dword from source
                                   ;V(entry) - spare
        mov     [edi+ecx*4+12],eax ;U - put dword into destination
UnwindDown2:
        mov     eax,[esi+ecx*4+8] ;U(entry)/V(not) - get dword from source
                                   ;V(entry) - spare
        mov     [edi+ecx*4+8],eax ;U - put dword into destination
UnwindDown1:
        mov     eax,[esi+ecx*4+4] ;U(entry)/V(not) - get dword from source
                                  ;V(entry) - spare
        mov     [edi+ecx*4+4],eax ;U - put dword into destination

        lea     eax,[ecx*4]     ;V - compute update for pointer

        add     esi,eax         ;U - update source pointer
        add     edi,eax         ;V - update destination pointer
UnwindDown0:
        jmp     dword ptr TrailDownVec[edx*4] ;N - process trailing bytes

;-----------------------------------------------------------------------------

        align   @WordSize
TrailDownVec    dd      TrailDown0, TrailDown1, TrailDown2, TrailDown3

        align   @WordSize
TrailDown0:
        mov     eax,[esp + 0Ch] ;U - return pointer to destination
                                ;V - spare
        pop     esi             ;U - restore esi
        pop     edi             ;V - restore edi
        M_EXIT

        align   @WordSize
TrailDown1:
        mov     al,[esi+3]      ;U - get byte from source
                                ;V - spare
        mov     [edi+3],al      ;U - put byte in destination
        mov     eax,[esp + 0Ch] ;V - return pointer to destination
        pop     esi             ;U - restore esi
        pop     edi             ;V - restore edi
        M_EXIT

        align   @WordSize
TrailDown2:
        mov     al,[esi+3]      ;U - get first byte from source
                                ;V - spare
        mov     [edi+3],al      ;U - put first byte into destination
        mov     al,[esi+2]      ;V - get second byte from source
        mov     [edi+2],al      ;U - put second byte into destination
        mov     eax,[esp + 0Ch] ;V - return pointer to destination
        pop     esi             ;U - restore esi
        pop     edi             ;V - restore edi
        M_EXIT

        align   @WordSize
TrailDown3:
        mov     al,[esi+3]      ;U - get first byte from source
                                ;V - spare
        mov     [edi+3],al      ;U - put first byte into destination
        mov     al,[esi+2]      ;V - get second byte from source
        mov     [edi+2],al      ;U - put second byte into destination
        mov     al,[esi+1]      ;V - get third byte from source
        mov     [edi+1],al      ;U - put third byte into destination
        mov     eax,[esp + 0Ch] ;V - return pointer to destination
        pop     esi             ;U - restore esi
        pop     edi             ;V - restore edi
        M_EXIT


align       16
VEC_memcpy:
        push        edi         ; save dst for returning
        mov         eax, esi
        and         eax, 0Fh
        ; eax = src and dst alignment (src mod 16)
        test        eax, eax
        jne         L_Notaligned

        ; in:
        ;  edi = dst (16 byte aligned)
        ;  esi = src (16 byte aligned)
        ;  ecx = len is >= (128 - head alignment bytes)
        ; do block copy using SSE2 stores
L_Aligned:
        mov         edx, ecx
        and         ecx, 7Fh
        shr         edx, 7
        je          L_1a
        ; ecx = loop count
        ; edx = remaining copy length
align       16
 L_1:
        movdqa      xmm0,xmmword ptr [esi]
        movdqa      xmm1,xmmword ptr [esi + 10h]
        movdqa      xmm2,xmmword ptr [esi + 20h]
        movdqa      xmm3,xmmword ptr [esi + 30h]
        movdqa      xmmword ptr [edi],xmm0
        movdqa      xmmword ptr [edi + 10h],xmm1
        movdqa      xmmword ptr [edi + 20h],xmm2
        movdqa      xmmword ptr [edi + 30h],xmm3
        movdqa      xmm4,xmmword ptr [esi + 40h]
        movdqa      xmm5,xmmword ptr [esi + 50h]
        movdqa      xmm6,xmmword ptr [esi + 60h]
        movdqa      xmm7,xmmword ptr [esi + 70h]
        movdqa      xmmword ptr [edi + 40h],xmm4
        movdqa      xmmword ptr [edi + 50h],xmm5
        movdqa      xmmword ptr [edi + 60h],xmm6
        movdqa      xmmword ptr [edi + 70h],xmm7
        lea         esi,[esi + 80h]
        lea         edi,[edi + 80h]
        dec         edx
        jne         L_1
L_1a:
        test        ecx, ecx
        je          L_Return

        ; ecx = length (< 128 bytes)
        mov         edx, ecx
        shr         edx, 4
        test        edx, edx
        je          L_Trailing
        ; if > 16 bytes do a loop (16 bytes at a time)
        ; edx - loop count
        ; edi = dst
        ; esi = src
align 16
L_2:
        movdqa      xmm0, xmmword ptr [esi]
        movdqa      xmmword ptr [edi], xmm0
        lea         esi, [esi + 10h]
        lea         edi, [edi + 10h]
        dec         edx
        jne         L_2

L_Trailing:

        ; last 1-15 bytes: step back according to dst and src alignment and do a 16-byte copy
        ; esi = src
        ; eax = src alignment  (set at the start of the procedure and preserved up to here)
        ; edi = dst
        and         ecx, 0Fh
        ; ecx = remaining len
        je          L_Return

        ; get dword aligned
        mov     eax, ecx  ; save remaining len and calc number of dwords
        shr     ecx, 2
        je      L_TrailBytes ; if none try bytes
L_TrailDword:
        mov     edx, dword ptr [esi]
        mov     dword ptr [edi], edx
        lea     esi, [esi+4]
        lea     edi, [edi+4]
        dec     ecx
        jne     L_TrailDword
L_TrailBytes:
        mov     ecx, eax
        and     ecx, 03h
        je      L_Return ; if none return
L_TrailNextByte:
        mov     al, byte ptr [esi]
        mov     byte ptr [edi], al
        inc     esi
        inc     edi
        dec     ecx
        jne     L_TrailNextByte
align 16
L_Return:
        ; return dst
        pop     eax      ; Get destination for return
        pop     esi
        pop     edi
        M_EXIT

; dst addr is not 16 byte aligned
align 16
L_Notaligned:

; copy the first the first 1-15 bytes to align both src and dst up to the nearest 16-byte boundary:

; in
; esi = src
; edi = dst
; eax = src and dst alignment
; ecx = length

        mov     edx, 010h
        sub     edx, eax   ; calc num bytes to get it aligned
        sub     ecx, edx   ; calc new length and save it
        push    ecx
        mov     eax, edx   ; save alignment byte count for dwords
        mov     ecx, eax   ; set exc to rep count
        and     ecx, 03h
        je      L_MovDword ; if no bytes go do dwords
L_Byte:
        mov     dl, byte ptr [esi]  ; move the bytes
        mov     byte ptr [edi], dl
        inc     esi       ; inc the adrs
        inc     edi
        dec     ecx       ; dec the counter
        jne     L_Byte
L_MovDword:
        shr     eax, 2     ; get dword count
        je      L_Adjustcnt ; if none go to main loop
L_Dword:
        mov     edx, dword ptr [esi] ; move the dwords
        mov     dword ptr [edi], edx
        lea     esi, [esi+4] ; inc the adrs
        lea     edi, [edi+4]
        dec     eax          ; dec the counter
        jne     L_Dword
L_Adjustcnt:
        pop     ecx       ; retrive the adjusted length
        jmp     L_Aligned


_MEM_   endp
        end

they do the exact same thing, just the asm version uses SSE, so it's gonna be faster. But the fastest way to do it would be to just copy the data is massive amounts... like 8 bytes per cycle. and then the remaining bytes (assuming the size isn't a divisible by 8) in single bytes

to be honest, the asm source just looks like someone wanted to try out SSE...

Post by **chili** » October 1st, 2012, 1:16 am

SSE is the fastest way to get the job done on most systems. That's why it's in there.

The non-SSE codepath is still hella fast though; it uses rep movsd.

In either case, your previous assertion that memcpy only copies byte-by-byte is incorrect np.

npissoawsome · Post by **npissoawsome** » October 1st, 2012, 2:14 am

it's still hella slow, it's slower than doing a loop lol

npissoawsome · Post by **npissoawsome** » October 1st, 2012, 2:17 am

Also, SSE is not the fastest way at all for moving content between memory, in fact it's actually the slowest way I could think of lol. in order to use SSE, you have to have your data inside SSE registers, and than there's nothing to do in an SSE register besides math. so the operation would be like this

move first half of data into register
move second half of data into register
move fist half of data out of register
move second half of data out of register

instead of
move into register
move out of register

Post by **chili** » October 1st, 2012, 2:26 am

You clearly lack a basic grasp of the SSE instruction set. MOVDQA can move 128 bits at once. I don't know where you are getting your information from, but you are under a grave misapprehension.

Additionally, if you weren't using SSE, you wouldn't have:

move into register
move out of register

You'd just have one rep movsd instruction.

npissoawsome · Post by **npissoawsome** » October 2nd, 2012, 12:12 am

what are you talking about lol, you wouldn't use MOVSD, that from xmm to xmm, or from m128 to xmm, if you're talking about moving 128 bits of data in and out of registers, than you want to use something like movaps or movups. movaps is preferable, but you need your data to be aligned, thus that sexy _aligned_malloc call

in the entire cpu instruction set there is no instruction that can move from memory to memory

And I actually do have an understanding of SSE lol, but hey I'm still here to learn, I would love SSE tutorials, but none of that bs inline shit, if you want performance, you should be using actually assembly

The code might be moving 128 bits into an xmm reg, than out into memory, wasn't thinking clearly, you're probably right

oh and this is where I get my instructions from, the only knowledge I really have is from looking at instructions, so I don't know that much, but I know how to use it effectively.

Post by **chili** » October 2nd, 2012, 1:19 am

npissoawsome wrote:what are you talking about lol, you wouldn't use MOVSD, that from xmm to xmm, or from m128 to xmm, if you're talking about moving 128 bits of data in and out of registers, than you want to use something like movaps or movups. movaps is preferable, but you need your data to be aligned, thus that sexy _aligned_malloc call

From Stackoverflow.com:

Q: I'm lookig Intel datasheet: Intel® 64 and IA-32 Architectures Software Developer’s Manual and I can't find the difference between MOVDQA and MOVAPS x86 instructions? (...) So can anybody explain the difference?

A: In functionality, they are identical.

On some (but not all) micro-architectures, there are timing differences due to "domain crossing penalties". For this reason, one should generally use movdqa when the data is being used with integer SSE instructions, and movaps when the data is being used with floating-point instructions. instructions.

npissoawsome wrote:in the entire cpu instruction set there is no instruction that can move from memory to memory

This is so wrong it hurts. Seems like somebody (you) bought a year-long commuter pass for the FailTrain.
Get schooled: http://faydoc.tripod.com/cpu/movsd.htm

Pretty much all of your other assertions are blatantly false as well, but I can't be assed argue with you about this shit anymore np.

npissoawsome · Post by **npissoawsome** » October 2nd, 2012, 1:47 am

huh fair, (it's actually the only function, so it's not that big of a fail) enough movsd is the first instruction I've seen that moves data between memory, and you didn't have to link me to a comparison between movdqa and movaps I know they're the same thing lol, I never said they weren't

and what other assertions are you talking about? Sounds to me like you're just trying to sound smart

your exaggerations are quite absurd, I guess everyone here worshiping you has gotten to your head

Post by **LuX** » October 2nd, 2012, 7:04 am

One, never start and argument with chili, you won't win. EVER!

Second, using SSE / assembly for like everything seems a bit over the top. I mean you would think the guys at the c++ committee have put fast enough function into the files. At least for level were making games. Besides the speed gain is not that impressive unless you will need some huge calculation like every ms or something, but other than that it will make your code pretty much unreadable and harder to edit.

Third, after we get to D3D we will be able to render graphics with just about no performance cost since it's done by other hardware, which leaves plenty of resources to use on other calculations.

Four, it seems like you are addicted to speed. What are you planning to use it on anyways? Make some epic nuclear explosion simulator?

But what ever... Carry on.

Asimov · Post by **Asimov** » October 2nd, 2012, 9:06 pm

Hi,

All I can say is that I watch the tutorials and learn as much as I can. I very often don't do it the same way as the tutorial, and I have my own style, and I get stuck a lot, but the tutorials are helpful.
It's like I watched the sprite tutorial and made my sprite routine a totally different way, but without the initial tutorial I wouldn't have been able to do it my own way.

I don't worship chilli, but his tutorials are a valuable resource, and Chilli is always helpful.

Although if God was on one side and Chilli was on the other I would go with Chilli LOL, rather than the old Beardy guy LOL

PS mind you I could have kissed Chili when he showed me how to load images from a resource file, but don't tell anyone LOL

Planet Chili

memcpy

memcpy

Re: memcpy

Re: memcpy

Re: memcpy

Re: memcpy

Re: memcpy

Re: memcpy

Re: memcpy

Re: memcpy

Re: memcpy