Old:
0: Dist: 4, Time: 288 ms
1: Dist: 4, Time: 270 ms
2: Dist: 4, Time: 280 ms
3: Dist: 4, Time: 9 ms
4: Dist: 40, Time: 285 ms
5: Dist: 4, Time: 157 ms
6: Dist: 9, Time: 243 ms
7: Dist: 6, Time: 289 ms
8: Dist: 4, Time: 88 ms
9: Dist: 6, Time: 258 ms
New:
0: Dist: 4, Time: 86 ms
1: Dist: 4, Time: 80 ms
2: Dist: 4, Time: 83 ms
3: Dist: 4, Time: 3 ms
4: Dist: 40, Time: 84 ms
5: Dist: 4, Time: 47 ms
6: Dist: 9, Time: 72 ms
7: Dist: 6, Time: 86 ms
8: Dist: 4, Time: 26 ms
9: Dist: 6, Time: 77 ms
Да-а-а, интересно – същият ефект. А са променени само 3 инструкции и е премахнато едно четене от паметта, което по принцип би трябвало да е кеширано (и явно на процесорите на Intel е):
.loop1:
mov edx, [.len16]
movdqa xmm3, [ebx + edx]
movdqa xmm2, xmm3
+ movdqa xmm5, xmm3
paddb xmm3, xmm1
movdqa [ebx + edx], xmm3
mov al, [edi]
lea edi, [edi+1]
mov ah, al
movd xmm0, eax
punpcklbw xmm0, xmm0
punpcklbw xmm0, xmm0
punpcklbw xmm0, xmm0
mov esi, [.pString16]
.loop2:
sub edx, 16
movdqa xmm3, [esi]
.................................................................
lea esi, [esi+16]
pcmpeqb xmm3, xmm0
paddb xmm2, xmm3
movdqa xmm4, [ebx + edx]
- pminub xmm2, [ebx + edx + 16]
+ pminub xmm2, xmm5
pminub xmm2, xmm4
paddb xmm2, xmm1
movdqa [ebx + edx], xmm2
+ movdqa xmm5, xmm2
movdqa xmm2, xmm4
jnz .loop2