晒しておきます。
予め言っておきますが遅いです。
しかも汚くて読みづらいです。
/*
 * Fixed-point cosine table used by DecodeDCT.
 *
 * Indexed as icos_xu[x][u] (see the reference loop inside DecodeDCT).
 * The entries appear to be floor(256 * c(u) * cos((2x + 1) * u * pi / 16))
 * with c(0) = 1/sqrt(2) and c(u) = 1 otherwise (e.g. 181 ~ 256/sqrt(2),
 * 251 ~ 256*cos(pi/16)); negative entries are one larger in magnitude than
 * their positive counterparts, which is consistent with rounding toward
 * minus infinity.  TODO confirm against the generator of this table.
 */
static int icos_xu[8][8] =
{
    { 181, 251, 236, 212, 181, 142, 97, 49 },
    { 181, 212, 97, -50, -182, -252, -237, -143 },
    { 181, 142, -98, -252, -182, 49, 236, 212 },
    { 181, 49, -237, -143, 181, 212, -98, -252 },
    { 181, -50, -237, 142, 181, -213, -98, 251 },
    { 181, -143, -98, 251, -182, -50, 236, -213 },
    { 181, -213, 97, 49, -182, 251, -237, 142 },
    { 181, -252, 236, -213, 181, -143, 97, -50 }
};

/*
 * Inverse-transforms one 8x8 coefficient block with MMX inline assembly.
 *
 * _dct  : 64 coefficients, read as 8 rows of 8 ints (row v starts at
 *         _dct + 8*v; the assembly steps 32 bytes per row, so a 4-byte int
 *         is assumed).
 * _data : receives 64 results, written 4 bytes at a time in y-major order
 *         (x varies fastest).  u_int is declared elsewhere; the assembly
 *         assumes it is 4 bytes wide.
 *
 * For every output position (y, x) the code computes
 *     tmp[u] = sum over v of icos_xu[y][v] * dct[v][u]      (stage 1)
 *     out    = sum over u of (tmp[u] >> 10) * icos_xu[x][u] (stage 2)
 * and stores out shifted right by 8.  With the table scaled by 256 twice
 * (2^16) and 18 bits of shifting, the net factor is 1/4, which presumably
 * corresponds to the 1/4 of the usual 8x8 IDCT formula -- TODO confirm.
 *
 * Requires a compiler that accepts MSVC-style `_asm { }` blocks on x86 and
 * a CPU with MMX.
 *
 * NOTE(review): stage 1 only depends on y (it reads _icos_y and _dct), yet
 * it is recomputed for each of the 8 x positions; likewise emms runs once
 * per output sample.  Both look like avoidable cost -- confirm before
 * restructuring, since the instruction order is hand-scheduled.
 */
void DecodeDCT( int* _dct, u_int* _data )
{
    /* Row of icos_xu used in stage 2; advanced inside the assembly block. */
    int* _icos_x = icos_xu[0];
    /* Row of icos_xu used in stage 1; advanced once per y below. */
    int* _icos_y = icos_xu[0];

    for(int y=0; y<8; y++)
    {
        for(int x=0; x<8; x++)
        {
            /*
             * Reference C for the assembly below, kept from the original
             * author as documentation only.  It treats _dct[v] as a pointer
             * to row v (i.e. _dct + 8*v) and omits the >> 10 and >> 8
             * scaling shifts that the assembly performs.

                int _tmp[8] = { 0 };
                for(int v=0; v<8; v++)
                {
                    int* __dct = _dct[v];
                    int ksu = _icos_y[v];
                    for(int u=0; u<8; u++)
                    {
                        _tmp[u] += ksu * __dct[u];
                    }
                }
                int sum = 0;
                for(int u=0; u<8; u++)
                    sum += _tmp[u] * icos_xu[x][u];
            */
            _asm
            {
                // Register allocation.  The original author notes follow
                // verbatim; English rendering:
                //   eax      = scratch
                //   ebx      = int* _icos_y
                //   ecx      = loop counter
                //   edx      = scratch
                //   esi      = int* __dct = _dct[v]
                //   edi      = int* icos_xu[x]
                //   mm0..mm3 = _tmp[0..7] (two 32-bit lanes per register)
                //   mm4      = ksu = _icos_y[v]
                //   mm5..mm7 = scratch
                ; eax = 汎用
                ; ebx = int* _icos_y
                ; ecx = ループカウンタ
                ; edx = 汎用
                ; esi = int* __dct = _dct[v]
                ; edi = int* icos_xu[x]
                ; mm0〜3 = _tmp[0〜7]
                ; mm4 = ksu = _icos_y[v]
                ; mm5〜7 = 汎用

                // Stage 1 setup: point at the coefficient rows, run 8
                // iterations (one per v) and clear the eight accumulators.
                mov esi, _dct
                mov ebx, _icos_y
                mov ecx, 8
                pxor mm0, mm0
                pxor mm1, mm1
                pxor mm2, mm2
                pxor mm3, mm3

            MMX_DCT_LOOP:
                // Broadcast the 32-bit ksu into both lanes of mm4.
                movd mm4, [ebx]
                movq mm5, mm4
                punpckldq mm4, mm5

                // Author scheduling sketch (kept verbatim): loads (RED),
                // multiplies (MUL) and adds (ADD) are interleaved so that
                // each add is issued a few instructions after its multiply.
                ; < Schedule Map >
                ; mm5 RED MUL --> --> ADD RED MUL --> --> ADD
                ; mm6 RED MUL --> --> ADD
                ; mm7 RED MUL --> --> ADD

                // pmaddwd multiplies signed 16-bit words and adds adjacent
                // products, so each 32-bit lane becomes
                //   lo16(a)*lo16(b) + hi16(a)*hi16(b).
                // NOTE(review): that equals the 32-bit product a*b only when
                // both values fit in signed 16 bits, and it is one too large
                // when both are negative (high words are -1 * -1).  Confirm
                // that the coefficient range and this error are acceptable.
                movq mm5, [esi]
                pmaddwd mm5, mm4
                movq mm6, [esi+8]
                pmaddwd mm6, mm4
                paddd mm0, mm5
                movq mm7, [esi+16]
                pmaddwd mm7, mm4
                movq mm5, [esi+24]
                pmaddwd mm5, mm4
                paddd mm1, mm6
                paddd mm2, mm7
                paddd mm3, mm5

                // Next coefficient row (8 ints = 32 bytes) and next ksu.
                add esi, 32
                add ebx, 4
                loop MMX_DCT_LOOP

                // Stage 2: scale tmp down by 2^10, multiply by the x row of
                // the cosine table and accumulate.
                mov edi, _icos_x

                // Author scheduling sketch (kept verbatim): SFT is the
                // arithmetic shift inserted between each load and multiply.
                ; < Schedule Map >
                ; RED SFT MUL --> --> RED SFT MUL --> --> ADD
                ; RED SFT MUL --> --> ADD
                ; RED SFT MUL --> --> ADD

                movq mm5, [edi]
                psrad mm0, 10
                pmaddwd mm0, mm5
                movq mm6, [edi+8]
                psrad mm1, 10
                pmaddwd mm1, mm6
                movq mm7, [edi+16]
                psrad mm2, 10
                pmaddwd mm2, mm7
                movq mm5, [edi+24]
                psrad mm3, 10
                pmaddwd mm3, mm5

                // Reduce the four registers to one: the low lane of mm0
                // holds the even-u terms, the high lane the odd-u terms.
                paddd mm0, mm1
                paddd mm2, mm3
                paddd mm0, mm2

                // Drop 8 fraction bits, then add the high lane to the low.
                // NOTE(review): psrlq is a logical shift of the whole 64-bit
                // register, so the low 8 bits of the high lane move into the
                // top of the low lane and negative lanes are not
                // sign-extended.  Verify whether adding the lanes first and
                // then using an arithmetic 32-bit shift was intended.
                psrlq mm0, 8
                movq mm1, mm0
                psrlq mm1, 32
                paddd mm0, mm1

                // Store the low 32 bits as the output sample, then write the
                // advanced output pointer and table-row pointer back to the
                // C variables for the next x iteration.
                mov esi, _data
                movd [esi], mm0
                add esi, 4
                add edi, 32
                mov _data, esi
                mov _icos_x, edi

                // Leave MMX state so later x87 floating-point code can run.
                emms
            }
        }
        /* Rewind the 8 rows (8 * 8 ints) the assembly advanced _icos_x by. */
        _icos_x -= 64;
        /* Move on to the cosine-table row for the next y. */
        _icos_y += 8;
    }
}