/****************************************************************************
 *
 *   Module Title :     OptFunctions.c
 *
 *   Description  :     Encoder system dependent functions.
 *
 ****************************************************************************/

/****************************************************************************
 *  Header Files
 ****************************************************************************/
#include "compdll.h"
#include "math.h"

/****************************************************************************
 *  Macros
 ****************************************************************************/
/* C4799 is MSVC's "function has no EMMS instruction" warning. The MMX
 * routines in this file deliberately return without executing emms. */
#pragma warning(disable:4799)

#define FILTER_WEIGHT 128
#define FILTER_SHIFT  7

/****************************************************************************
 *  Module Statics
 ****************************************************************************/
static __declspec(align(16)) short rd[] = { 64, 64, 64, 64, 64, 64, 64, 64 };

/****************************************************************************
 *  Imports
 ****************************************************************************/
extern INT16 BilinearFilters_mmx[8][16];

/****************************************************************************
 *
 *  ROUTINE       : MmxGetSAD
 *
 *  INPUTS        : UINT8 *NewDataPtr      : Pointer to first input data array.
 *                  INT32 PixelsPerLine    : Length of line for NewDataPtr.
 *                  UINT8 *RefDataPtr      : Pointer to second input data array.
 *                  INT32 RefPixelsPerLine : Length of line for RefDataPtr.
 *                  INT32 ErrorSoFar       : Error accumulated before this call.
 *                  INT32 BestSoFar        : (NOT USED).
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : INT32: ErrorSoFar plus the SAD for the two blocks.
 *
 *  FUNCTION      : Calculates the sum of the absolute differences between
 *                  two blocks of 8 rows by 8 bytes.
 *
 *  SPECIAL NOTES : Every one of the four 16-bit lanes of the accumulator
 *                  receives 2 absolute differences per row, i.e. 16 values
 *                  of at most 255 each (max 4080), so the lanes cannot
 *                  overflow.
 *
 *                  No emms is executed, so the MMX state is still live on
 *                  return. Presumably the caller clears it before any x87
 *                  floating point is used -- confirm against the callers.
 *
 ****************************************************************************/
INT32 MmxGetSAD
(
    UINT8 *NewDataPtr,
    INT32 PixelsPerLine,
    UINT8 *RefDataPtr,
    INT32 RefPixelsPerLine,
    INT32 ErrorSoFar,
    INT32 BestSoFar
)
{
    INT32 DiffVal = ErrorSoFar;
    INT16 DiffAcc[4] = { 0, 0, 0, 0 };      // Receives the four word lanes of mm7.

    // MMX code for SAD. mm6 is a zero register used for unpacking,
    // mm7 holds four running 16-bit partial sums.
    __asm
    {
        pxor        mm6, mm6                        ; mm6 = 0 (unpack helper)
        pxor        mm7, mm7                        ; mm7 = 0 (accumulator)

        mov         eax, dword ptr [NewDataPtr]     ; eax -> current row of new data
        mov         ebx, dword ptr [RefDataPtr]     ; ebx -> current row of ref data
        mov         ecx, dword ptr [PixelsPerLine]  ; ecx = stride of new data
        mov         edx, dword ptr [RefPixelsPerLine] ; edx = stride of ref data

        // Row 1
        movq        mm0, [eax]                      ; A: eight bytes of new data
        movq        mm1, [ebx]                      ; B: eight bytes of ref data
        movq        mm2, mm0                        ; keep a copy of A
        psubusb     mm0, mm1                        ; saturating A-B (0 where B > A)
        psubusb     mm1, mm2                        ; saturating B-A (0 where A > B)
        por         mm0, mm1                        ; one side is 0, so OR gives |A-B|
        movq        mm1, mm0                        ; keep a copy for the high half
        punpcklbw   mm0, mm6                        ; low four bytes -> words
        paddw       mm7, mm0                        ; accumulate
        punpckhbw   mm1, mm6                        ; high four bytes -> words
        add         eax, ecx                        ; next row of new data
        paddw       mm7, mm1                        ; accumulate
        add         ebx, edx                        ; next row of ref data

        // Row 2 (same sequence as row 1)
        movq        mm0, [eax]
        movq        mm1, [ebx]
        movq        mm2, mm0
        psubusb     mm0, mm1
        psubusb     mm1, mm2
        por         mm0, mm1
        movq        mm1, mm0
        punpcklbw   mm0, mm6
        paddw       mm7, mm0
        punpckhbw   mm1, mm6
        add         eax, ecx
        paddw       mm7, mm1
        add         ebx, edx

        // Row 3
        movq        mm0, [eax]
        movq        mm1, [ebx]
        movq        mm2, mm0
        psubusb     mm0, mm1
        psubusb     mm1, mm2
        por         mm0, mm1
        movq        mm1, mm0
        punpcklbw   mm0, mm6
        paddw       mm7, mm0
        punpckhbw   mm1, mm6
        add         eax, ecx
        paddw       mm7, mm1
        add         ebx, edx

        // Row 4
        movq        mm0, [eax]
        movq        mm1, [ebx]
        movq        mm2, mm0
        psubusb     mm0, mm1
        psubusb     mm1, mm2
        por         mm0, mm1
        movq        mm1, mm0
        punpcklbw   mm0, mm6
        paddw       mm7, mm0
        punpckhbw   mm1, mm6
        add         eax, ecx
        paddw       mm7, mm1
        add         ebx, edx

        // Row 5
        movq        mm0, [eax]
        movq        mm1, [ebx]
        movq        mm2, mm0
        psubusb     mm0, mm1
        psubusb     mm1, mm2
        por         mm0, mm1
        movq        mm1, mm0
        punpcklbw   mm0, mm6
        paddw       mm7, mm0
        punpckhbw   mm1, mm6
        add         eax, ecx
        paddw       mm7, mm1
        add         ebx, edx

        // Row 6
        movq        mm0, [eax]
        movq        mm1, [ebx]
        movq        mm2, mm0
        psubusb     mm0, mm1
        psubusb     mm1, mm2
        por         mm0, mm1
        movq        mm1, mm0
        punpcklbw   mm0, mm6
        paddw       mm7, mm0
        punpckhbw   mm1, mm6
        add         eax, ecx
        paddw       mm7, mm1
        add         ebx, edx

        // Row 7
        movq        mm0, [eax]
        movq        mm1, [ebx]
        movq        mm2, mm0
        psubusb     mm0, mm1
        psubusb     mm1, mm2
        por         mm0, mm1
        movq        mm1, mm0
        punpcklbw   mm0, mm6
        paddw       mm7, mm0
        punpckhbw   mm1, mm6
        add         eax, ecx
        paddw       mm7, mm1
        add         ebx, edx

        // Row 8 (last row: the pointers are not advanced any further)
        movq        mm0, [eax]
        movq        mm1, [ebx]
        movq        mm2, mm0
        psubusb     mm0, mm1
        psubusb     mm1, mm2
        por         mm0, mm1
        movq        mm1, mm0
        punpcklbw   mm0, mm6
        paddw       mm7, mm0
        punpckhbw   mm1, mm6
        paddw       mm7, mm1

        movq        DWORD PTR [DiffAcc], mm7        ; movq stores all 64 bits: the four word sums

        // emms                                     ; Deliberately not executed here.
    }

    // Fold the four partial sums into the running total.
    DiffVal += DiffAcc[0] + DiffAcc[1] + DiffAcc[2] + DiffAcc[3];

    return DiffVal;
}

/****************************************************************************
 *
 *  ROUTINE       : MmxGetHalfPixelSAD
 *
 *  INPUTS        : UINT8 *SrcData         : Pointer to the source data array.
 *                  INT32 PixelsPerLine    : Length of line for SrcData.
 *                  UINT8 *RefDataPtr1     : Pointer to first reference data array.
 *                  UINT8 *RefDataPtr2     : Pointer to second reference data array.
 *                  INT32 RefPixelsPerLine : Length of line for RefDataPtr1/2.
 *                  INT32 ErrorSoFar       : Error accumulated before this call.
 *                  INT32 BestSoFar        : Only forwarded to MmxGetSAD, which
 *                                           does not use it.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : INT32: ErrorSoFar plus the SAD at 1/2 pixel accuracy.
 *
 *  FUNCTION      : Calculates the sum of the absolute differences between
 *                  an 8x8 source block and the average of two 8x8 reference
 *                  blocks. The average is (Ref1 + Ref2) >> 1, i.e. it is
 *                  truncated, not rounded.
 *
 *  SPECIAL NOTES : When both reference pointers are the same no averaging
 *                  is required and the work is handed to MmxGetSAD. That
 *                  routine already adds ErrorSoFar to its result, so its
 *                  return value is passed straight back; adding it to a
 *                  total that was itself seeded with ErrorSoFar would count
 *                  ErrorSoFar twice.
 *
 *                  No emms is executed, so the MMX state is still live on
 *                  return.
 *
 ****************************************************************************/
INT32 MmxGetHalfPixelSAD
(
    UINT8 *SrcData,
    INT32 PixelsPerLine,
    UINT8 *RefDataPtr1,
    UINT8 *RefDataPtr2,
    INT32 RefPixelsPerLine,
    INT32 ErrorSoFar,
    INT32 BestSoFar
)
{
    INT32 DiffVal = ErrorSoFar;
    INT16 DiffAcc[4] = { 0, 0, 0, 0 };      // Receives the four word lanes of mm7.

    // Identical references: same result as the whole pixel case.
    // MmxGetSAD returns ErrorSoFar + SAD, so do not add ErrorSoFar again.
    if ( RefDataPtr1 == RefDataPtr2 )
        return MmxGetSAD ( SrcData, PixelsPerLine, RefDataPtr1, RefPixelsPerLine, ErrorSoFar, BestSoFar );

    // MMX code for SAD against the averaged reference. mm6 is a zero
    // register used for unpacking, mm7 holds four running 16-bit sums.
    __asm
    {
        pxor        mm6, mm6                        ; mm6 = 0 (unpack helper)
        pxor        mm7, mm7                        ; mm7 = 0 (accumulator)

        mov         eax, dword ptr [SrcData]        ; eax -> current row of source
        mov         ebx, dword ptr [RefDataPtr1]    ; ebx -> current row of ref 1
        mov         ecx, dword ptr [RefDataPtr2]    ; ecx -> current row of ref 2
        mov         edx, dword ptr [PixelsPerLine]  ; edx = source stride
        mov         esi, dword ptr [RefPixelsPerLine] ; esi = stride of both refs

        // Row 1
        movq        mm1, [ebx]                      ; eight bytes of ref 1
        movq        mm2, [ecx]                      ; eight bytes of ref 2
        movq        mm3, mm1                        ; copies for the high halves
        movq        mm4, mm2
        punpcklbw   mm1, mm6                        ; low four bytes -> words
        punpcklbw   mm2, mm6
        punpckhbw   mm3, mm6                        ; high four bytes -> words
        paddw       mm1, mm2                        ; ref1 + ref2 (low)
        punpckhbw   mm4, mm6
        psrlw       mm1, 1                          ; divide by two (truncating)
        paddw       mm3, mm4                        ; ref1 + ref2 (high)
        movq        mm0, [eax]                      ; A: eight bytes of source
        psrlw       mm3, 1                          ; divide by two (truncating)
        movq        mm2, mm0                        ; keep a copy of A
        packuswb    mm1, mm3                        ; B: averaged reference as bytes
        psubusb     mm0, mm1                        ; saturating A-B
        psubusb     mm1, mm2                        ; saturating B-A
        por         mm0, mm1                        ; one side is 0, so OR gives |A-B|
        movq        mm1, mm0                        ; keep a copy for the high half
        punpcklbw   mm0, mm6                        ; low four bytes -> words
        paddw       mm7, mm0                        ; accumulate
        punpckhbw   mm1, mm6                        ; high four bytes -> words
        add         eax, edx                        ; next row of source
        paddw       mm7, mm1                        ; accumulate
        add         ebx, esi                        ; next row of ref 1
        add         ecx, esi                        ; next row of ref 2

        // Row 2 (same sequence as row 1)
        movq        mm1, [ebx]
        movq        mm2, [ecx]
        movq        mm3, mm1
        movq        mm4, mm2
        punpcklbw   mm1, mm6
        punpcklbw   mm2, mm6
        punpckhbw   mm3, mm6
        paddw       mm1, mm2
        punpckhbw   mm4, mm6
        psrlw       mm1, 1
        paddw       mm3, mm4
        movq        mm0, [eax]
        psrlw       mm3, 1
        movq        mm2, mm0
        packuswb    mm1, mm3
        psubusb     mm0, mm1
        psubusb     mm1, mm2
        por         mm0, mm1
        movq        mm1, mm0
        punpcklbw   mm0, mm6
        paddw       mm7, mm0
        punpckhbw   mm1, mm6
        add         eax, edx
        paddw       mm7, mm1
        add         ebx, esi
        add         ecx, esi

        // Row 3
        movq        mm1, [ebx]
        movq        mm2, [ecx]
        movq        mm3, mm1
        movq        mm4, mm2
        punpcklbw   mm1, mm6
        punpcklbw   mm2, mm6
        punpckhbw   mm3, mm6
        paddw       mm1, mm2
        punpckhbw   mm4, mm6
        psrlw       mm1, 1
        paddw       mm3, mm4
        movq        mm0, [eax]
        psrlw       mm3, 1
        movq        mm2, mm0
        packuswb    mm1, mm3
        psubusb     mm0, mm1
        psubusb     mm1, mm2
        por         mm0, mm1
        movq        mm1, mm0
        punpcklbw   mm0, mm6
        paddw       mm7, mm0
        punpckhbw   mm1, mm6
        add         eax, edx
        paddw       mm7, mm1
        add         ebx, esi
        add         ecx, esi

        // Row 4
        movq        mm1, [ebx]
        movq        mm2, [ecx]
        movq        mm3, mm1
        movq        mm4, mm2
        punpcklbw   mm1, mm6
        punpcklbw   mm2, mm6
        punpckhbw   mm3, mm6
        paddw       mm1, mm2
        punpckhbw   mm4, mm6
        psrlw       mm1, 1
        paddw       mm3, mm4
        movq        mm0, [eax]
        psrlw       mm3, 1
        movq        mm2, mm0
        packuswb    mm1, mm3
        psubusb     mm0, mm1
        psubusb     mm1, mm2
        por         mm0, mm1
        movq        mm1, mm0
        punpcklbw   mm0, mm6
        paddw       mm7, mm0
        punpckhbw   mm1, mm6
        add         eax, edx
        paddw       mm7, mm1
        add         ebx, esi
        add         ecx, esi

        // Row 5
        movq        mm1, [ebx]
        movq        mm2, [ecx]
        movq        mm3, mm1
        movq        mm4, mm2
        punpcklbw   mm1, mm6
        punpcklbw   mm2, mm6
        punpckhbw   mm3, mm6
        paddw       mm1, mm2
        punpckhbw   mm4, mm6
        psrlw       mm1, 1
        paddw       mm3, mm4
        movq        mm0, [eax]
        psrlw       mm3, 1
        movq        mm2, mm0
        packuswb    mm1, mm3
        psubusb     mm0, mm1
        psubusb     mm1, mm2
        por         mm0, mm1
        movq        mm1, mm0
        punpcklbw   mm0, mm6
        paddw       mm7, mm0
        punpckhbw   mm1, mm6
        add         eax, edx
        paddw       mm7, mm1
        add         ebx, esi
        add         ecx, esi

        // Row 6
        movq        mm1, [ebx]
        movq        mm2, [ecx]
        movq        mm3, mm1
        movq        mm4, mm2
        punpcklbw   mm1, mm6
        punpcklbw   mm2, mm6
        punpckhbw   mm3, mm6
        paddw       mm1, mm2
        punpckhbw   mm4, mm6
        psrlw       mm1, 1
        paddw       mm3, mm4
        movq        mm0, [eax]
        psrlw       mm3, 1
        movq        mm2, mm0
        packuswb    mm1, mm3
        psubusb     mm0, mm1
        psubusb     mm1, mm2
        por         mm0, mm1
        movq        mm1, mm0
        punpcklbw   mm0, mm6
        paddw       mm7, mm0
        punpckhbw   mm1, mm6
        add         eax, edx
        paddw       mm7, mm1
        add         ebx, esi
        add         ecx, esi

        // Row 7
        movq        mm1, [ebx]
        movq        mm2, [ecx]
        movq        mm3, mm1
        movq        mm4, mm2
        punpcklbw   mm1, mm6
        punpcklbw   mm2, mm6
        punpckhbw   mm3, mm6
        paddw       mm1, mm2
        punpckhbw   mm4, mm6
        psrlw       mm1, 1
        paddw       mm3, mm4
        movq        mm0, [eax]
        psrlw       mm3, 1
        movq        mm2, mm0
        packuswb    mm1, mm3
        psubusb     mm0, mm1
        psubusb     mm1, mm2
        por         mm0, mm1
        movq        mm1, mm0
        punpcklbw   mm0, mm6
        paddw       mm7, mm0
        punpckhbw   mm1, mm6
        add         eax, edx
        paddw       mm7, mm1
        add         ebx, esi
        add         ecx, esi

        // Row 8 (last row: the pointers are not advanced any further)
        movq        mm1, [ebx]
        movq        mm2, [ecx]
        movq        mm3, mm1
        movq        mm4, mm2
        punpcklbw   mm1, mm6
        punpcklbw   mm2, mm6
        punpckhbw   mm3, mm6
        paddw       mm1, mm2
        punpckhbw   mm4, mm6
        psrlw       mm1, 1
        paddw       mm3, mm4
        movq        mm0, [eax]
        psrlw       mm3, 1
        movq        mm2, mm0
        packuswb    mm1, mm3
        psubusb     mm0, mm1
        psubusb     mm1, mm2
        por         mm0, mm1
        movq        mm1, mm0
        punpcklbw   mm0, mm6
        paddw       mm7, mm0
        punpckhbw   mm1, mm6
        paddw       mm7, mm1

        movq        DWORD PTR [DiffAcc], mm7        ; movq stores all 64 bits: the four word sums
    }

    // Fold the four partial sums into the running total.
    DiffVal += DiffAcc[0] + DiffAcc[1] + DiffAcc[2] + DiffAcc[3];

    return DiffVal;
}

/****************************************************************************
 *
 *  ROUTINE       : MmxGetInterErr
 *
 *  INPUTS        : UINT8 *NewDataPtr      : Pointer to first input data array.
 *                  INT32 PixelsPerLine    : Length of line for NewDataPtr.
 *                  UINT8 *RefDataPtr1     : Pointer to first reference data array.
 *                  UINT8 *RefDataPtr2     : Pointer to second reference data array.
 *                  INT32 RefPixelsPerLine : Length of line for RefDataPtr1/2.
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : UINT32: Error variance.
 *
 *  FUNCTION      : Calculates a difference error score for two blocks.
 *
 *  SPECIAL NOTES : None.
* ****************************************************************************/ UINT32 MmxGetInterErr ( UINT8 *NewDataPtr, INT32 PixelsPerLine, UINT8 *RefDataPtr1, UINT8 *RefDataPtr2, INT32 RefPixelsPerLine ) { UINT32 XSum = 0; UINT32 XXSum = 0; INT16 MmxXSum[4] = { 0, 0, 0, 0 }; // XSum accumulators INT32 MmxXXSum[2] = { 0, 0 }; // XXSum accumulators INT32 AbsRefOffset = abs( (int)(RefDataPtr1 - RefDataPtr2) ); // Mode of interpolation chosen based upon on the offset of the second reference pointer if ( AbsRefOffset == 0 ) { __asm { pxor mm5, mm5 ; Blank mmx6 pxor mm6, mm6 ; Blank mmx7 pxor mm7, mm7 ; Blank mmx7 mov eax,dword ptr [NewDataPtr] ; Load base addresses mov ebx,dword ptr [RefDataPtr1] mov ecx,dword ptr [PixelsPerLine] mov edx,dword ptr [RefPixelsPerLine] // Row 1 movq mm0, [eax] ; Copy eight bytes to mm0 movq mm1, [ebx] ; Copy eight bytes to mm1 movq mm2, mm0 ; Take copies movq mm3, mm1 ; Take copies punpcklbw mm0, mm6 ; unpack to higher precision punpcklbw mm1, mm6 punpckhbw mm2, mm6 ; unpack to higher precision punpckhbw mm3, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 psubsw mm2, mm3 ; A-B (high order) to MM2 paddw mm5, mm0 ; accumulate differences in mm5 paddw mm5, mm2 ; accumulate differences in mm5 pmaddwd mm0, mm0 ; square and accumulate pmaddwd mm2, mm2 ; square and accumulate add ebx,edx ; Inc pointer into ref data add eax,ecx ; Inc pointer into the new data movq mm1, [ebx] ; Copy eight bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 paddd mm7, mm2 ; accumulate in mm7 // Row 2 movq mm0, [eax] ; Copy eight bytes to mm0 movq mm2, mm0 ; Take copies movq mm3, mm1 ; Take copies punpcklbw mm0, mm6 ; unpack to higher precision movq DWORD PTR [MmxXSum], mm5 ; copy back accumulated results into normal memory punpcklbw mm1, mm6 movq DWORD PTR [MmxXXSum], mm7 ; copy back accumulated results into normal memory punpckhbw mm2, mm6 ; unpack to higher precision punpckhbw mm3, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 psubsw mm2, mm3 ; A-B (high order) to 
MM2 paddw mm5, mm0 ; accumulate differences in mm5 paddw mm5, mm2 ; accumulate differences in mm5 pmaddwd mm0, mm0 ; square and accumulate pmaddwd mm2, mm2 ; square and accumulate add ebx,edx ; Inc pointer into ref data add eax,ecx ; Inc pointer into the new data movq mm1, [ebx] ; Copy eight bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 paddd mm7, mm2 ; accumulate in mm7 // Row 3 movq mm0, [eax] ; Copy eight bytes to mm0 movq mm2, mm0 ; Take copies movq mm3, mm1 ; Take copies punpcklbw mm0, mm6 ; unpack to higher precision movq DWORD PTR [MmxXSum], mm5 ; copy back accumulated results into normal memory punpcklbw mm1, mm6 movq DWORD PTR [MmxXXSum], mm7 ; copy back accumulated results into normal memory punpckhbw mm2, mm6 ; unpack to higher precision punpckhbw mm3, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 psubsw mm2, mm3 ; A-B (high order) to MM2 paddw mm5, mm0 ; accumulate differences in mm5 paddw mm5, mm2 ; accumulate differences in mm5 pmaddwd mm0, mm0 ; square and accumulate pmaddwd mm2, mm2 ; square and accumulate add ebx,edx ; Inc pointer into ref data add eax,ecx ; Inc pointer into the new data movq mm1, [ebx] ; Copy eight bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 paddd mm7, mm2 ; accumulate in mm7 // Row 4 movq mm0, [eax] ; Copy eight bytes to mm0 movq mm2, mm0 ; Take copies movq mm3, mm1 ; Take copies punpcklbw mm0, mm6 ; unpack to higher precision movq DWORD PTR [MmxXSum], mm5 ; copy back accumulated results into normal memory punpcklbw mm1, mm6 movq DWORD PTR [MmxXXSum], mm7 ; copy back accumulated results into normal memory punpckhbw mm2, mm6 ; unpack to higher precision punpckhbw mm3, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 psubsw mm2, mm3 ; A-B (high order) to MM2 paddw mm5, mm0 ; accumulate differences in mm5 paddw mm5, mm2 ; accumulate differences in mm5 pmaddwd mm0, mm0 ; square and accumulate pmaddwd mm2, mm2 ; square and accumulate add ebx,edx ; Inc pointer into ref data add eax,ecx ; Inc pointer into the new data movq mm1, [ebx] ; 
Copy eight bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 paddd mm7, mm2 ; accumulate in mm7 // Row 5 movq mm0, [eax] ; Copy eight bytes to mm0 movq mm2, mm0 ; Take copies movq mm3, mm1 ; Take copies punpcklbw mm0, mm6 ; unpack to higher precision movq DWORD PTR [MmxXSum], mm5 ; copy back accumulated results into normal memory punpcklbw mm1, mm6 movq DWORD PTR [MmxXXSum], mm7 ; copy back accumulated results into normal memory punpckhbw mm2, mm6 ; unpack to higher precision punpckhbw mm3, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 psubsw mm2, mm3 ; A-B (high order) to MM2 paddw mm5, mm0 ; accumulate differences in mm5 paddw mm5, mm2 ; accumulate differences in mm5 pmaddwd mm0, mm0 ; square and accumulate pmaddwd mm2, mm2 ; square and accumulate add ebx,edx ; Inc pointer into ref data add eax,ecx ; Inc pointer into the new data movq mm1, [ebx] ; Copy eight bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 paddd mm7, mm2 ; accumulate in mm7 // Row 6 movq mm0, [eax] ; Copy eight bytes to mm0 movq mm2, mm0 ; Take copies movq mm3, mm1 ; Take copies punpcklbw mm0, mm6 ; unpack to higher precision movq DWORD PTR [MmxXSum], mm5 ; copy back accumulated results into normal memory punpcklbw mm1, mm6 movq DWORD PTR [MmxXXSum], mm7 ; copy back accumulated results into normal memory punpckhbw mm2, mm6 ; unpack to higher precision punpckhbw mm3, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 psubsw mm2, mm3 ; A-B (high order) to MM2 paddw mm5, mm0 ; accumulate differences in mm5 paddw mm5, mm2 ; accumulate differences in mm5 pmaddwd mm0, mm0 ; square and accumulate pmaddwd mm2, mm2 ; square and accumulate add ebx,edx ; Inc pointer into ref data add eax,ecx ; Inc pointer into the new data movq mm1, [ebx] ; Copy eight bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 paddd mm7, mm2 ; accumulate in mm7 // Row 7 movq mm0, [eax] ; Copy eight bytes to mm0 movq mm2, mm0 ; Take copies movq mm3, mm1 ; Take copies punpcklbw mm0, mm6 ; unpack to higher precision movq DWORD PTR [MmxXSum], mm5 ; 
copy back accumulated results into normal memory punpcklbw mm1, mm6 movq DWORD PTR [MmxXXSum], mm7 ; copy back accumulated results into normal memory punpckhbw mm2, mm6 ; unpack to higher precision punpckhbw mm3, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 psubsw mm2, mm3 ; A-B (high order) to MM2 paddw mm5, mm0 ; accumulate differences in mm5 paddw mm5, mm2 ; accumulate differences in mm5 pmaddwd mm0, mm0 ; square and accumulate pmaddwd mm2, mm2 ; square and accumulate add ebx,edx ; Inc pointer into ref data add eax,ecx ; Inc pointer into the new data movq mm1, [ebx] ; Copy eight bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 paddd mm7, mm2 ; accumulate in mm7 // Row 8 movq mm0, [eax] ; Copy eight bytes to mm0 movq mm2, mm0 ; Take copies movq mm3, mm1 ; Take copies punpcklbw mm0, mm6 ; unpack to higher precision movq DWORD PTR [MmxXSum], mm5 ; copy back accumulated results into normal memory punpcklbw mm1, mm6 movq DWORD PTR [MmxXXSum], mm7 ; copy back accumulated results into normal memory punpckhbw mm2, mm6 ; unpack to higher precision punpckhbw mm3, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 psubsw mm2, mm3 ; A-B (high order) to MM2 paddw mm5, mm0 ; accumulate differences in mm5 paddw mm5, mm2 ; accumulate differences in mm5 pmaddwd mm0, mm0 ; square and accumulate pmaddwd mm2, mm2 ; square and accumulate add ebx,edx ; Inc pointer into ref data add eax,ecx ; Inc pointer into the new data paddd mm7, mm0 ; accumulate in mm7 paddd mm7, mm2 ; accumulate in mm7 movq DWORD PTR [MmxXSum], mm5 ; copy back accumulated results into normal memory movq DWORD PTR [MmxXXSum], mm7 ; copy back accumulated results into normal memory } // Now accumulate the final results. 
XSum = MmxXSum[0] + MmxXSum[1] + MmxXSum[2] + MmxXSum[3]; XXSum = MmxXXSum[0] + MmxXXSum[1]; } // Simple half pixel reference data else { __asm { pxor mm5, mm5 ; Blank mmx6 pxor mm6, mm6 ; Blank mmx7 pxor mm7, mm7 ; Blank mmx7 mov eax,dword ptr [NewDataPtr] ; Load base addresses mov ebx,dword ptr [RefDataPtr1] mov ecx,dword ptr [RefDataPtr2] mov edx,dword ptr [PixelsPerLine] mov esi,dword ptr [RefPixelsPerLine] // Row 1 movq mm1, [ebx] ; Copy eight bytes from each of ref 1 and ref 2. movq mm2, [ecx] movq mm3, mm1 ; Take copies. movq mm4, mm2 punpcklbw mm1, mm6 ; unpack low four bytes to higher precision punpcklbw mm2, mm6 paddw mm1, mm2 ; Add word values together. psrlw mm1, 1 ; Devide by two (shift right 1) punpckhbw mm3, mm6 ; unpack high four bytes to higher precision punpckhbw mm4, mm6 paddw mm3, mm4 ; Add word values together. psrlw mm3, 1 ; Devide by two (shift right 1) movq mm0, [eax] ; Copy eight bytes to mm0 packuswb mm1, mm3 ; Repack to give 1/2 pixel averaged reference data movq mm2, mm0 ; Take copies movq mm3, mm1 ; Take copies punpcklbw mm0, mm6 ; unpack to higher precision punpcklbw mm1, mm6 punpckhbw mm2, mm6 ; unpack to higher precision punpckhbw mm3, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 psubsw mm2, mm3 ; A-B (high order) to MM2 paddw mm5, mm0 ; accumulate differences in mm5 paddw mm5, mm2 ; accumulate differences in mm5 pmaddwd mm0, mm0 ; square and accumulate pmaddwd mm2, mm2 ; square and accumulate paddd mm7, mm0 ; accumulate in mm7 paddd mm7, mm2 ; accumulate in mm7 movq DWORD PTR [MmxXSum], mm5 ; copy back accumulated results into normal memory movq DWORD PTR [MmxXXSum], mm7 ; copy back accumulated results into normal memory add eax,edx ; Inc pointer into the new data add ebx,esi ; Inc pointer into ref data add ecx,esi ; Inc pointer into ref2 data // Row 2 movq mm1, [ebx] ; Copy eight bytes from each of ref 1 and ref 2. movq mm2, [ecx] movq mm3, mm1 ; Take copies. 
movq mm4, mm2 punpcklbw mm1, mm6 ; unpack low four bytes to higher precision punpcklbw mm2, mm6 paddw mm1, mm2 ; Add word values together. psrlw mm1, 1 ; Devide by two (shift right 1) punpckhbw mm3, mm6 ; unpack high four bytes to higher precision punpckhbw mm4, mm6 paddw mm3, mm4 ; Add word values together. psrlw mm3, 1 ; Devide by two (shift right 1) movq mm0, [eax] ; Copy eight bytes to mm0 packuswb mm1, mm3 ; Repack to give 1/2 pixel averaged reference data movq mm2, mm0 ; Take copies movq mm3, mm1 ; Take copies punpcklbw mm0, mm6 ; unpack to higher precision punpcklbw mm1, mm6 punpckhbw mm2, mm6 ; unpack to higher precision punpckhbw mm3, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 psubsw mm2, mm3 ; A-B (high order) to MM2 paddw mm5, mm0 ; accumulate differences in mm5 paddw mm5, mm2 ; accumulate differences in mm5 pmaddwd mm0, mm0 ; square and accumulate pmaddwd mm2, mm2 ; square and accumulate paddd mm7, mm0 ; accumulate in mm7 paddd mm7, mm2 ; accumulate in mm7 movq DWORD PTR [MmxXSum], mm5 ; copy back accumulated results into normal memory movq DWORD PTR [MmxXXSum], mm7 ; copy back accumulated results into normal memory add eax,edx ; Inc pointer into the new data add ebx,esi ; Inc pointer into ref data add ecx,esi ; Inc pointer into ref2 data // Row 3 movq mm1, [ebx] ; Copy eight bytes from each of ref 1 and ref 2. movq mm2, [ecx] movq mm3, mm1 ; Take copies. movq mm4, mm2 punpcklbw mm1, mm6 ; unpack low four bytes to higher precision punpcklbw mm2, mm6 paddw mm1, mm2 ; Add word values together. psrlw mm1, 1 ; Devide by two (shift right 1) punpckhbw mm3, mm6 ; unpack high four bytes to higher precision punpckhbw mm4, mm6 paddw mm3, mm4 ; Add word values together. 
psrlw mm3, 1 ; Devide by two (shift right 1) movq mm0, [eax] ; Copy eight bytes to mm0 packuswb mm1, mm3 ; Repack to give 1/2 pixel averaged reference data movq mm2, mm0 ; Take copies movq mm3, mm1 ; Take copies punpcklbw mm0, mm6 ; unpack to higher precision punpcklbw mm1, mm6 punpckhbw mm2, mm6 ; unpack to higher precision punpckhbw mm3, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 psubsw mm2, mm3 ; A-B (high order) to MM2 paddw mm5, mm0 ; accumulate differences in mm5 paddw mm5, mm2 ; accumulate differences in mm5 pmaddwd mm0, mm0 ; square and accumulate pmaddwd mm2, mm2 ; square and accumulate paddd mm7, mm0 ; accumulate in mm7 paddd mm7, mm2 ; accumulate in mm7 movq DWORD PTR [MmxXSum], mm5 ; copy back accumulated results into normal memory movq DWORD PTR [MmxXXSum], mm7 ; copy back accumulated results into normal memory add eax,edx ; Inc pointer into the new data add ebx,esi ; Inc pointer into ref data add ecx,esi ; Inc pointer into ref2 data // Row 4 movq mm1, [ebx] ; Copy eight bytes from each of ref 1 and ref 2. movq mm2, [ecx] movq mm3, mm1 ; Take copies. movq mm4, mm2 punpcklbw mm1, mm6 ; unpack low four bytes to higher precision punpcklbw mm2, mm6 paddw mm1, mm2 ; Add word values together. psrlw mm1, 1 ; Devide by two (shift right 1) punpckhbw mm3, mm6 ; unpack high four bytes to higher precision punpckhbw mm4, mm6 paddw mm3, mm4 ; Add word values together. 
psrlw mm3, 1 ; Devide by two (shift right 1) movq mm0, [eax] ; Copy eight bytes to mm0 packuswb mm1, mm3 ; Repack to give 1/2 pixel averaged reference data movq mm2, mm0 ; Take copies movq mm3, mm1 ; Take copies punpcklbw mm0, mm6 ; unpack to higher precision punpcklbw mm1, mm6 punpckhbw mm2, mm6 ; unpack to higher precision punpckhbw mm3, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 psubsw mm2, mm3 ; A-B (high order) to MM2 paddw mm5, mm0 ; accumulate differences in mm5 paddw mm5, mm2 ; accumulate differences in mm5 pmaddwd mm0, mm0 ; square and accumulate pmaddwd mm2, mm2 ; square and accumulate paddd mm7, mm0 ; accumulate in mm7 paddd mm7, mm2 ; accumulate in mm7 movq DWORD PTR [MmxXSum], mm5 ; copy back accumulated results into normal memory movq DWORD PTR [MmxXXSum], mm7 ; copy back accumulated results into normal memory add eax,edx ; Inc pointer into the new data add ebx,esi ; Inc pointer into ref data add ecx,esi ; Inc pointer into ref2 data // Row 5 movq mm1, [ebx] ; Copy eight bytes from each of ref 1 and ref 2. movq mm2, [ecx] movq mm3, mm1 ; Take copies. movq mm4, mm2 punpcklbw mm1, mm6 ; unpack low four bytes to higher precision punpcklbw mm2, mm6 paddw mm1, mm2 ; Add word values together. psrlw mm1, 1 ; Devide by two (shift right 1) punpckhbw mm3, mm6 ; unpack high four bytes to higher precision punpckhbw mm4, mm6 paddw mm3, mm4 ; Add word values together. 
psrlw mm3, 1 ; Devide by two (shift right 1) movq mm0, [eax] ; Copy eight bytes to mm0 packuswb mm1, mm3 ; Repack to give 1/2 pixel averaged reference data movq mm2, mm0 ; Take copies movq mm3, mm1 ; Take copies punpcklbw mm0, mm6 ; unpack to higher precision punpcklbw mm1, mm6 punpckhbw mm2, mm6 ; unpack to higher precision punpckhbw mm3, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 psubsw mm2, mm3 ; A-B (high order) to MM2 paddw mm5, mm0 ; accumulate differences in mm5 paddw mm5, mm2 ; accumulate differences in mm5 pmaddwd mm0, mm0 ; square and accumulate pmaddwd mm2, mm2 ; square and accumulate paddd mm7, mm0 ; accumulate in mm7 paddd mm7, mm2 ; accumulate in mm7 movq DWORD PTR [MmxXSum], mm5 ; copy back accumulated results into normal memory movq DWORD PTR [MmxXXSum], mm7 ; copy back accumulated results into normal memory add eax,edx ; Inc pointer into the new data add ebx,esi ; Inc pointer into ref data add ecx,esi ; Inc pointer into ref2 data // Row 6 movq mm1, [ebx] ; Copy eight bytes from each of ref 1 and ref 2. movq mm2, [ecx] movq mm3, mm1 ; Take copies. movq mm4, mm2 punpcklbw mm1, mm6 ; unpack low four bytes to higher precision punpcklbw mm2, mm6 paddw mm1, mm2 ; Add word values together. psrlw mm1, 1 ; Devide by two (shift right 1) punpckhbw mm3, mm6 ; unpack high four bytes to higher precision punpckhbw mm4, mm6 paddw mm3, mm4 ; Add word values together. 
psrlw mm3, 1 ; Devide by two (shift right 1) movq mm0, [eax] ; Copy eight bytes to mm0 packuswb mm1, mm3 ; Repack to give 1/2 pixel averaged reference data movq mm2, mm0 ; Take copies movq mm3, mm1 ; Take copies punpcklbw mm0, mm6 ; unpack to higher precision punpcklbw mm1, mm6 punpckhbw mm2, mm6 ; unpack to higher precision punpckhbw mm3, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 psubsw mm2, mm3 ; A-B (high order) to MM2 paddw mm5, mm0 ; accumulate differences in mm5 paddw mm5, mm2 ; accumulate differences in mm5 pmaddwd mm0, mm0 ; square and accumulate pmaddwd mm2, mm2 ; square and accumulate paddd mm7, mm0 ; accumulate in mm7 paddd mm7, mm2 ; accumulate in mm7 movq DWORD PTR [MmxXSum], mm5 ; copy back accumulated results into normal memory movq DWORD PTR [MmxXXSum], mm7 ; copy back accumulated results into normal memory add eax,edx ; Inc pointer into the new data add ebx,esi ; Inc pointer into ref data add ecx,esi ; Inc pointer into ref2 data // Row 7 movq mm1, [ebx] ; Copy eight bytes from each of ref 1 and ref 2. movq mm2, [ecx] movq mm3, mm1 ; Take copies. movq mm4, mm2 punpcklbw mm1, mm6 ; unpack low four bytes to higher precision punpcklbw mm2, mm6 paddw mm1, mm2 ; Add word values together. psrlw mm1, 1 ; Devide by two (shift right 1) punpckhbw mm3, mm6 ; unpack high four bytes to higher precision punpckhbw mm4, mm6 paddw mm3, mm4 ; Add word values together. 
psrlw mm3, 1 ; Devide by two (shift right 1) movq mm0, [eax] ; Copy eight bytes to mm0 packuswb mm1, mm3 ; Repack to give 1/2 pixel averaged reference data movq mm2, mm0 ; Take copies movq mm3, mm1 ; Take copies punpcklbw mm0, mm6 ; unpack to higher precision punpcklbw mm1, mm6 punpckhbw mm2, mm6 ; unpack to higher precision punpckhbw mm3, mm6 psubsw mm0, mm1 ; A-B (low order) to MM0 psubsw mm2, mm3 ; A-B (high order) to MM2 paddw mm5, mm0 ; accumulate differences in mm5 paddw mm5, mm2 ; accumulate differences in mm5 pmaddwd mm0, mm0 ; square and accumulate pmaddwd mm2, mm2 ; square and accumulate paddd mm7, mm0 ; accumulate in mm7 paddd mm7, mm2 ; accumulate in mm7 movq DWORD PTR [MmxXSum], mm5 ; copy back accumulated results into normal memory movq DWORD PTR [MmxXXSum], mm7 ; copy back accumulated results into normal memory add eax,edx ; Inc pointer into the new data add ebx,esi ; Inc pointer into ref data add ecx,esi ; Inc pointer into ref2 data // Row 8 movq mm1, [ebx] ; Copy eight bytes from each of ref 1 and ref 2. movq mm2, [ecx] movq mm3, mm1 ; Take copies. movq mm4, mm2 punpcklbw mm1, mm6 ; unpack low four bytes to higher precision punpcklbw mm2, mm6 paddw mm1, mm2 ; Add word values together. psrlw mm1, 1 ; Devide by two (shift right 1) punpckhbw mm3, mm6 ; unpack high four bytes to higher precision punpckhbw mm4, mm6 paddw mm3, mm4 ; Add word values together. 
psrlw       mm3, 1                      ; Devide by two (shift right 1)

            movq        mm0, [eax]                  ; Copy eight bytes to mm0
            packuswb    mm1, mm3                    ; Repack to give 1/2 pixel averaged reference data
            movq        mm2, mm0                    ; Take copies
            movq        mm3, mm1                    ; Take copies

            punpcklbw   mm0, mm6                    ; unpack to higher precision
            punpcklbw   mm1, mm6
            punpckhbw   mm2, mm6                    ; unpack to higher precision
            punpckhbw   mm3, mm6
            psubsw      mm0, mm1                    ; A-B (low order) to MM0
            psubsw      mm2, mm3                    ; A-B (high order) to MM2

            paddw       mm5, mm0                    ; accumulate differences in mm5
            paddw       mm5, mm2                    ; accumulate differences in mm5

            pmaddwd     mm0, mm0                    ; square and accumulate
            pmaddwd     mm2, mm2                    ; square and accumulate
            paddd       mm7, mm0                    ; accumulate in mm7
            paddd       mm7, mm2                    ; accumulate in mm7

            movq        DWORD PTR [MmxXSum], mm5    ; copy back accumulated results into normal memory
            movq        DWORD PTR [MmxXXSum], mm7   ; copy back accumulated results into normal memory
        }

        // Now accumulate the final results.
        XSum = MmxXSum[0] + MmxXSum[1] + MmxXSum[2] + MmxXSum[3];
        XXSum = MmxXXSum[0] + MmxXXSum[1];
    }

    // Compute and return population variance as mis-match metric.
    return ( ((XXSum << 6) - XSum*XSum ) );
}

/****************************************************************************
 *
 *  ROUTINE       :     MmxGetIntraError
 *
 *  INPUTS        :     UINT8 *DataPtr      : Pointer to input block.
 *                      INT32 PixelsPerLine : Length of line for input block
 *                                            (byte step between rows).
 *
 *  OUTPUTS       :     None.
 *
 *  RETURNS       :     UINT32: 64*sum(x*x) - sum(x)*sum(x) taken over the
 *                      64 pixels read, i.e. the population variance of the
 *                      block scaled by 64*64.
 *
 *  FUNCTION      :     Calculates a variance score for the block. Eight
 *                      rows of eight bytes are read; the pixel sum is
 *                      accumulated in mm5 and the sum of squares in mm7.
 *
 *  SPECIAL NOTES :     No EMMS instruction is issued by this routine.
 *
 ****************************************************************************/
UINT32 MmxGetIntraError ( UINT8 *DataPtr, INT32 PixelsPerLine )
{
    UINT8   *DiffPtr;
    UINT32  XSum  = 0;
    UINT32  XXSum = 0;

    // Loop expanded out for speed.
    DiffPtr = DataPtr;

    __asm
    {
        pxor        mm5, mm5                    ; Clear mm5 (four 16-bit pixel sums)
        pxor        mm6, mm6                    ; Clear mm6 (zero source for the unpacks)
        pxor        mm7, mm7                    ; Clear mm7 (two 32-bit sums of squares)
        mov         eax,dword ptr [DiffPtr]     ; Load base addresses
        mov         ecx,dword ptr [PixelsPerLine]

        // Row 1
        movq        mm0, [eax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        punpcklbw   mm0, mm6                    ; low four bytes zero-extended to words
        punpckhbw   mm2, mm6                    ; high four bytes zero-extended to words
        paddw       mm5, mm0                    ; accumulate pixel values in mm5
        paddw       mm5, mm2                    ; accumulate pixel values in mm5
        pmaddwd     mm0, mm0                    ; square words and add adjacent pairs
        pmaddwd     mm2, mm2                    ; square words and add adjacent pairs
        add         eax,ecx                     ; step to the next row
        paddd       mm7, mm0                    ; accumulate squares in mm7
        paddd       mm7, mm2                    ; accumulate squares in mm7

        // Row 2
        movq        mm0, [eax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        punpcklbw   mm0, mm6                    ; low four bytes zero-extended to words
        punpckhbw   mm2, mm6                    ; high four bytes zero-extended to words
        paddw       mm5, mm0                    ; accumulate pixel values in mm5
        paddw       mm5, mm2                    ; accumulate pixel values in mm5
        pmaddwd     mm0, mm0                    ; square words and add adjacent pairs
        pmaddwd     mm2, mm2                    ; square words and add adjacent pairs
        add         eax,ecx                     ; step to the next row
        paddd       mm7, mm0                    ; accumulate squares in mm7
        paddd       mm7, mm2                    ; accumulate squares in mm7

        // Row 3
        movq        mm0, [eax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        punpcklbw   mm0, mm6                    ; low four bytes zero-extended to words
        punpckhbw   mm2, mm6                    ; high four bytes zero-extended to words
        paddw       mm5, mm0                    ; accumulate pixel values in mm5
        paddw       mm5, mm2                    ; accumulate pixel values in mm5
        pmaddwd     mm0, mm0                    ; square words and add adjacent pairs
        pmaddwd     mm2, mm2                    ; square words and add adjacent pairs
        add         eax,ecx                     ; step to the next row
        paddd       mm7, mm0                    ; accumulate squares in mm7
        paddd       mm7, mm2                    ; accumulate squares in mm7

        // Row 4
        movq        mm0, [eax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        punpcklbw   mm0, mm6                    ; low four bytes zero-extended to words
        punpckhbw   mm2, mm6                    ; high four bytes zero-extended to words
        paddw       mm5, mm0                    ; accumulate pixel values in mm5
        paddw       mm5, mm2                    ; accumulate pixel values in mm5
        pmaddwd     mm0, mm0                    ; square words and add adjacent pairs
        pmaddwd     mm2, mm2                    ; square words and add adjacent pairs
        add         eax,ecx                     ; step to the next row
        paddd       mm7, mm0                    ; accumulate squares in mm7
        paddd       mm7, mm2                    ; accumulate squares in mm7

        // Row 5
        movq        mm0, [eax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        punpcklbw   mm0, mm6                    ; low four bytes zero-extended to words
        punpckhbw   mm2, mm6                    ; high four bytes zero-extended to words
        paddw       mm5, mm0                    ; accumulate pixel values in mm5
        paddw       mm5, mm2                    ; accumulate pixel values in mm5
        pmaddwd     mm0, mm0                    ; square words and add adjacent pairs
        pmaddwd     mm2, mm2                    ; square words and add adjacent pairs
        add         eax,ecx                     ; step to the next row
        paddd       mm7, mm0                    ; accumulate squares in mm7
        paddd       mm7, mm2                    ; accumulate squares in mm7

        // Row 6
        movq        mm0, [eax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        punpcklbw   mm0, mm6                    ; low four bytes zero-extended to words
        punpckhbw   mm2, mm6                    ; high four bytes zero-extended to words
        paddw       mm5, mm0                    ; accumulate pixel values in mm5
        paddw       mm5, mm2                    ; accumulate pixel values in mm5
        pmaddwd     mm0, mm0                    ; square words and add adjacent pairs
        pmaddwd     mm2, mm2                    ; square words and add adjacent pairs
        add         eax,ecx                     ; step to the next row
        paddd       mm7, mm0                    ; accumulate squares in mm7
        paddd       mm7, mm2                    ; accumulate squares in mm7

        // Row 7
        movq        mm0, [eax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        punpcklbw   mm0, mm6                    ; low four bytes zero-extended to words
        punpckhbw   mm2, mm6                    ; high four bytes zero-extended to words
        paddw       mm5, mm0                    ; accumulate pixel values in mm5
        paddw       mm5, mm2                    ; accumulate pixel values in mm5
        pmaddwd     mm0, mm0                    ; square words and add adjacent pairs
        pmaddwd     mm2, mm2                    ; square words and add adjacent pairs
        add         eax,ecx                     ; step to the next row
        paddd       mm7, mm0                    ; accumulate squares in mm7
        paddd       mm7, mm2                    ; accumulate squares in mm7

        // Row 8
        movq        mm0, [eax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        punpcklbw   mm0, mm6                    ; low four bytes zero-extended to words
        punpckhbw   mm2, mm6                    ; high four bytes zero-extended to words
        paddw       mm5, mm0                    ; accumulate pixel values in mm5
        paddw       mm5, mm2                    ; accumulate pixel values in mm5
        pmaddwd     mm0, mm0                    ; square words and add adjacent pairs
        pmaddwd     mm2, mm2                    ; square words and add adjacent pairs
        add         eax,ecx                     ; step past the last row (pointer not used again)
        paddd       mm7, mm0                    ; accumulate squares in mm7
        paddd       mm7, mm2                    ; accumulate squares in mm7

        // Horizontal reduction of both accumulators (the two chains are interleaved).
        movq        mm4, mm5                    ; copy the four 16-bit pixel sums
        punpcklwd   mm5, mm6                    ; pixel sums 0,1 zero-extended to dwords
        punpckhwd   mm4, mm6                    ; pixel sums 2,3 zero-extended to dwords
        movq        mm0, mm7                    ; copy the two 32-bit squared sums
        paddw       mm5, mm4                    ; sums 0+2 and 1+3 (high word of each dword is zero)
        punpckhdq   mm0, mm6                    ; high squared sum moved to the low dword
        punpckldq   mm7, mm6                    ; low squared sum kept, high dword cleared
        movq        mm4, mm5
        paddd       mm0, mm7                    ; total sum of squares in the low dword
        punpckhdq   mm4, mm6                    ; partial pixel sum 1+3 moved to the low dword
        punpckldq   mm5, mm6                    ; partial pixel sum 0+2 kept, high dword cleared
        movd        DWORD PTR [XXSum], mm0      ; store sum of squares
        paddw       mm4, mm5                    ; total pixel sum in the low word
        movd        DWORD ptr [XSum], mm4       ; store pixel sum
    }

    // Compute population variance as mis-match metric.
    // (64*sum(x*x) - sum(x)^2, i.e. variance scaled by 64*64.)
    return ( (XXSum<<6) - XSum*XSum );
}

/****************************************************************************
 *
 *  ROUTINE       :     MmxGetMBFrameVertVar
 *
 *  INPUTS        :     CP_INSTANCE *cpi : Pointer to encoder instance.
 *
 *  OUTPUTS       :     None.
 *
 *  RETURNS       :     UINT32: Vertical variance for frame.
 *
 *  FUNCTION      :     Computes the vertical variance for a macroblock based
 *                      upon the sum of the local 2 pixel variances within
 *                      the entire frame. Concretely: the sum of squared
 *                      differences between vertically adjacent rows, over a
 *                      region 16 bytes wide, for row pairs (0,1) .. (13,14).
 *
 *  SPECIAL NOTES :     The difference between the last two rows in a MB
 *                      are not accounted for!
 *                      No EMMS instruction is issued by this routine.
 *
 ****************************************************************************/
UINT32 MmxGetMBFrameVertVar ( CP_INSTANCE *cpi )
{
    UINT32 FrameError;
    INT32  Stride = cpi->pb.Configuration.VideoFrameWidth;   // byte step between rows

    // UINT8 *SrcPtr = &cpi->yuv1ptr[cpi->pb.mbi.Source];
    //sjlhack
    UINT8 *SrcPtr = &cpi->yuv1ptr[cpi->pb.mbi.blockDxInfo[0].Source];

    __asm
    {
        mov         ecx, [Stride]
        mov         eax, DWORD PTR [SrcPtr]
        pxor        mm7, mm7                    ; clear mm7 (zero source for the unpacks)
        pxor        mm6, mm6                    ; clear mm6 (two 32-bit accumulators)
        mov         edx, 7                      ; seven passes, each covering rows r, r+1, r+2

    MmxGetMBFrameVertVarLoop:
        // Left eight columns. eax points at row r.
        movq        mm1, [eax]                  ; row r,   bytes 0..7
        movq        mm0, [eax+ecx]              ; row r+1, bytes 0..7
        movq        mm3, mm0                    ; copy of row r+1
        punpcklbw   mm0, mm7                    ; row r+1, low four as words
        punpckhbw   mm3, mm7                    ; row r+1, high four as words
        movq        mm2, [eax+ecx*2]            ; row r+2, bytes 0..7
        movq        mm4, mm1                    ; copy of row r
        punpcklbw   mm1, mm7                    ; row r, low four as words
        punpckhbw   mm4, mm7                    ; row r, high four as words
        movq        mm5, mm2                    ; copy of row r+2
        punpcklbw   mm2, mm7                    ; row r+2, low four as words
        psubw       mm1, mm0                    ; row r - row r+1, low four
        pmaddwd     mm1, mm1                    ; squared difference, low four
        psubw       mm4, mm3                    ; row r - row r+1, high four
        pmaddwd     mm4, mm4                    ; squared difference, high four
        punpckhbw   mm5, mm7                    ; row r+2, high four as words
        psubw       mm2, mm0                    ; row r+2 - row r+1, low four
        pmaddwd     mm2, mm2                    ; squared difference, low four
        psubw       mm5, mm3                    ; row r+2 - row r+1, high four
        pmaddwd     mm5, mm5                    ; squared difference, high four
        paddd       mm1, mm4                    ; (r, r+1) total for these eight columns
        paddd       mm2, mm5                    ; (r+1, r+2) total for these eight columns
        paddd       mm6, mm1                    ;
        paddd       mm6, mm2                    ; accumulated in mm6

        // done with the low eight
        // Right eight columns: same sequence at byte offset 8.
        movq        mm1, 8[eax]                 ; row r,   bytes 8..15
        movq        mm0, 8[eax+ecx]             ; row r+1, bytes 8..15
        movq        mm3, mm0                    ; copy of row r+1
        punpcklbw   mm0, mm7                    ; row r+1, low four as words
        punpckhbw   mm3, mm7                    ; row r+1, high four as words
        movq        mm2, 8[eax+ecx*2]           ; row r+2, bytes 8..15
        movq        mm4, mm1                    ; copy of row r
        punpcklbw   mm1, mm7                    ; row r, low four as words
        punpckhbw   mm4, mm7                    ; row r, high four as words
        movq        mm5, mm2                    ; copy of row r+2
        punpcklbw   mm2, mm7                    ; row r+2, low four as words
        psubw       mm1, mm0                    ; row r - row r+1, low four
        pmaddwd     mm1, mm1                    ; squared difference, low four
        psubw       mm4, mm3                    ; row r - row r+1, high four
        pmaddwd     mm4, mm4                    ; squared difference, high four
        punpckhbw   mm5, mm7                    ; row r+2, high four as words
        psubw       mm2, mm0                    ; row r+2 - row r+1, low four
        pmaddwd     mm2, mm2                    ; squared difference, low four
        psubw       mm5, mm3                    ; row r+2 - row r+1, high four
        pmaddwd     mm5, mm5                    ; squared difference, high four
        paddd       mm1, mm4                    ; (r, r+1) total for these eight columns
        paddd       mm2, mm5                    ; (r+1, r+2) total for these eight columns
        paddd       mm6, mm1                    ;
        paddd       mm6, mm2                    ; accumulated in mm6

        lea         eax, [eax + ecx *2]         ; advance two rows (row r+2 becomes the next row r)
        sub         edx, 1
        jnz         MmxGetMBFrameVertVarLoop

        movq        mm0, mm6                    ; fold the two dword accumulators
        psrlq       mm0, 32
        paddd       mm0, mm6
        movd        [FrameError], mm0           ; low dword = total
    }

    return FrameError;
}

/****************************************************************************
 *
 *  ROUTINE       :     MmxGetMBFieldVertVar
 *
 *  INPUTS        :     CP_INSTANCE *cpi : Pointer to encoder instance.
 *
 *  OUTPUTS       :     None.
 *
 *  RETURNS       :     UINT32: Vertical variance for fields within a frame.
 *
 *  FUNCTION      :     Computes the vertical variance for a macroblock based
 *                      upon the sum of the local 2 pixel variances within
 *                      the two fields of a frame.
 *
 *  SPECIAL NOTES :     The difference between the last two rows in a MB
 *                      are not accounted for!
*
 ****************************************************************************/
UINT32 MmxGetMBFieldVertVar ( CP_INSTANCE *cpi )
{
    UINT32 FieldError;
    INT32  Stride = cpi->pb.Configuration.VideoFrameWidth;   // byte step between rows

    // UINT8 *SrcPtr = &cpi->yuv1ptr[cpi->pb.mbi.Source];
    //sjlhack
    UINT8 *SrcPtr = &cpi->yuv1ptr[cpi->pb.mbi.blockDxInfo[0].Source];

    // Each pass of the loop accumulates the squared differences between rows
    // r and r+2, then between rows r+1 and r+3 (rows two apart, i.e. the same
    // field), over 16 columns, and then advances two rows. Seven passes visit
    // the row pairs (0,2) through (13,15). No EMMS instruction is issued here.
    __asm
    {
        mov         ecx, [Stride]
        mov         eax, DWORD PTR [SrcPtr]
        pxor        mm7, mm7                    ; clear mm7 (zero source for the unpacks)
        pxor        mm6, mm6                    ; clear mm6 (two 32-bit accumulators)
        mov         edx, 7                      ; seven passes

    MmxGetMBFieldVertVarLoop:
        // Rows r and r+2, left eight columns.
        movq        mm1, [eax]                  ; row r,   bytes 0..7
        movq        mm0, [eax+ecx*2]            ; row r+2, bytes 0..7
        movq        mm2, mm0                    ; copy of row r+2
        punpcklbw   mm0, mm7                    ; row r+2, low four as words
        movq        mm3, mm1                    ; copy of row r
        punpckhbw   mm2, mm7                    ; row r+2, high four as words
        punpcklbw   mm1, mm7                    ; row r, low four as words
        punpckhbw   mm3, mm7                    ; row r, high four as words
        psubw       mm0, mm1                    ; row r+2 - row r, low four
        pmaddwd     mm0, mm0                    ; squared difference, low four
        psubw       mm2, mm3                    ; row r+2 - row r, high four
        pmaddwd     mm2, mm2                    ; squared difference, high four
        paddd       mm0, mm2
        paddd       mm6, mm0                    ; accumulate in mm6

        // Rows r and r+2, right eight columns.
        movq        mm1, 8[eax]                 ; row r,   bytes 8..15
        movq        mm0, 8[eax+ecx*2]           ; row r+2, bytes 8..15
        movq        mm2, mm0                    ; copy of row r+2
        punpcklbw   mm0, mm7                    ; row r+2, low four as words
        movq        mm3, mm1                    ; copy of row r
        punpckhbw   mm2, mm7                    ; row r+2, high four as words
        punpcklbw   mm1, mm7                    ; row r, low four as words
        punpckhbw   mm3, mm7                    ; row r, high four as words
        psubw       mm0, mm1                    ; row r+2 - row r, low four
        pmaddwd     mm0, mm0                    ; squared difference, low four
        psubw       mm2, mm3                    ; row r+2 - row r, high four
        pmaddwd     mm2, mm2                    ; squared difference, high four
        paddd       mm0, mm2
        paddd       mm6, mm0                    ; accumulate in mm6

        lea         eax, [eax+ecx]              ; step one row: now at row r+1

        // Rows r+1 and r+3, left eight columns.
        movq        mm1, [eax]                  ; row r+1, bytes 0..7
        movq        mm0, [eax+ecx*2]            ; row r+3, bytes 0..7
        movq        mm2, mm0                    ; copy of row r+3
        punpcklbw   mm0, mm7                    ; row r+3, low four as words
        movq        mm3, mm1                    ; copy of row r+1
        punpckhbw   mm2, mm7                    ; row r+3, high four as words
        punpcklbw   mm1, mm7                    ; row r+1, low four as words
        punpckhbw   mm3, mm7                    ; row r+1, high four as words
        psubw       mm0, mm1                    ; row r+3 - row r+1, low four
        pmaddwd     mm0, mm0                    ; squared difference, low four
        psubw       mm2, mm3                    ; row r+3 - row r+1, high four
        pmaddwd     mm2, mm2                    ; squared difference, high four
        paddd       mm0, mm2
        paddd       mm6, mm0                    ; accumulate in mm6

        // Rows r+1 and r+3, right eight columns.
        movq        mm1, 8[eax]                 ; row r+1, bytes 8..15
        movq        mm0, 8[eax+ecx*2]           ; row r+3, bytes 8..15
        movq        mm2, mm0                    ; copy of row r+3
        punpcklbw   mm0, mm7                    ; row r+3, low four as words
        movq        mm3, mm1                    ; copy of row r+1
        punpckhbw   mm2, mm7                    ; row r+3, high four as words
        punpcklbw   mm1, mm7                    ; row r+1, low four as words
        punpckhbw   mm3, mm7                    ; row r+1, high four as words
        psubw       mm0, mm1                    ; row r+3 - row r+1, low four
        pmaddwd     mm0, mm0                    ; squared difference, low four
        psubw       mm2, mm3                    ; row r+3 - row r+1, high four
        pmaddwd     mm2, mm2                    ; squared difference, high four
        paddd       mm0, mm2
        paddd       mm6, mm0                    ; accumulate in mm6

        lea         eax, [eax + ecx ]           ; step one more row: next pass starts at row r+2
        sub         edx, 1
        jnz         MmxGetMBFieldVertVarLoop

        movq        mm0, mm6                    ; fold the two dword accumulators
        psrlq       mm0, 32
        paddd       mm0, mm6
        movd        [FieldError], mm0           ; low dword = total
    }

    return FieldError;
}

/****************************************************************************
 *
 *  ROUTINE       :     FilterBlock2dBil_SAD_mmx
 *
 *  INPUTS        :     UINT8 *SrcPtr            : Pointer to input block.
 *                      INT32 SrcStride          : Stride for input block.
 *                      UINT8 *RefPtr            : Pointer to reference block.
 *                      UINT32 SrcPixelsPerLine  : Stride for reference block.
 *                      INT16 *HFilter           : Pointer to horizontal filter taps.
 *                      INT16 *VFilter           : Pointer to vertical filter taps.
 *
 *  OUTPUTS       :     None.
 *
 *  RETURNS       :     UINT32: SAD error.
 *
 *  FUNCTION      :     Produces a filtered fractional block in 2-D
 *                      using bilinear filters and calculate the SAD.
 *
 *  SPECIAL NOTES :     The difference between the last two rows in a MB
 *                      are not accounted for!
* ****************************************************************************/ _inline UINT32 FilterBlock2dBil_SAD_mmx ( UINT8 *SrcPtr, INT32 SrcStride, UINT8 *RefPtr, UINT32 SrcPixelsPerLine, INT16 *HFilter, INT16 *VFilter ) { UINT32 Error=0; __asm { mov eax, HFilter ; mov edi, SrcPtr ; mov esi, RefPtr ; mov ecx, 8 ; mov edx, SrcPixelsPerLine ; movq mm1, [eax] ; movq mm2, [eax+16] ; mov eax, VFilter ; pxor mm0, mm0 ; // get the first horizontal line done ; movq mm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 movq mm4, mm3 ; make a copy of current line punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 punpckhbw mm4, mm0 ; pmullw mm3, mm1 ; pmullw mm4, mm1 ; movq mm5, [esi+1] ; movq mm6, mm5 ; punpcklbw mm5, mm0 ; punpckhbw mm6, mm0 ; pmullw mm5, mm2 ; pmullw mm6, mm2 ; paddw mm3, mm5 ; paddw mm4, mm6 ; paddw mm3, rd ; xmm3 += round value psraw mm3, FILTER_SHIFT ; xmm3 /= 128 paddw mm4, rd ; psraw mm4, FILTER_SHIFT ; movq mm7, mm3 ; packuswb mm7, mm4 ; add esi, edx ; next line NextRow: movq mm3, [esi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 movq mm4, mm3 ; make a copy of current line punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 punpckhbw mm4, mm0 ; pmullw mm3, mm1 ; pmullw mm4, mm1 ; movq mm5, [esi+1] ; movq mm6, mm5 ; punpcklbw mm5, mm0 ; punpckhbw mm6, mm0 ; pmullw mm5, mm2 ; pmullw mm6, mm2 ; paddw mm3, mm5 ; paddw mm4, mm6 ; movq mm5, mm7 ; movq mm6, mm7 ; punpcklbw mm5, mm0 ; punpckhbw mm6, mm0 pmullw mm5, [eax] ; pmullw mm6, [eax] ; paddw mm3, rd ; xmm3 += round value psraw mm3, FILTER_SHIFT ; xmm3 /= 128 paddw mm4, rd ; psraw mm4, FILTER_SHIFT ; movq mm7, mm3 ; packuswb mm7, mm4 ; pmullw mm3, [eax+16] ; pmullw mm4, [eax+16] ; paddw mm3, mm5 ; paddw mm4, mm6 ; paddw mm3, rd ; xmm3 += round value psraw mm3, FILTER_SHIFT ; xmm3 /= 128 paddw mm4, rd ; psraw mm4, FILTER_SHIFT ; packuswb mm3, mm4 movq mm4, [edi] ; psadbw mm3, mm4 ; movd mm4, Error ; paddd mm3, mm4 ; movd Error, mm3 ; add esi, edx ; next line add edi, SrcStride ; ; dec ecx ; jne 
NextRow } return Error; } /**************************************************************************** * * ROUTINE : FilterBlock1d_vb8_SAD_mmx * * INPUTS : UINT8 *SrcPtr : Pointer to input block. * INT32 SrcStride : Stride for input block. * UINT8 *RefPtr : Pointer to reference block. * UINT32 PixelsPerLine : Stride for reference block. * UINT32 PixelStep : Offset to move to next pixel in input. * INT16 *Filter : Pointer to filter taps. * * OUTPUTS : None. * * RETURNS : UINT32: SAD error. * * FUNCTION : Applies 1-D vertical bi-linear filter to input block. * * SPECIAL NOTES : None. * ****************************************************************************/ _inline UINT32 FilterBlock1d_vb8_SAD_mmx ( UINT8 *SrcPtr, INT32 SrcStride, UINT8 *RefPtr, UINT32 PixelsPerLine, UINT32 PixelStep, INT16 *Filter ) { UINT32 Error; __asm { mov edi, Filter movq mm1, [edi] ; mm3 *= kernel 0 modifiers. movq mm2, [edi + 16] ; mm3 *= kernel 0 modifiers. mov edi, SrcPtr mov esi, RefPtr mov ecx, 8 ; mov edx, SrcStride mov eax, PixelsPerLine; pxor mm7, mm7 pxor mm0, mm0 ; mm0 = 00000000 nextrow: movq mm3, [esi] ; mm3 = p0..p7 movq mm4, mm3 ; mm4 = p0..p7 punpcklbw mm3, mm0 ; mm3 = p0..p3 punpckhbw mm4, mm0 ; mm4 = p4..p7 pmullw mm3, mm1 ; mm3 *= kernel 0 modifiers. pmullw mm4, mm1 ; mm4 *= kernel 0 modifiers. movq mm5, [esi + eax] ; movq mm6, mm5 ; punpcklbw mm5, mm0 ; punpckhbw mm6, mm0 ; pmullw mm5, mm2 ; pmullw mm6, mm2 ; paddw mm3, mm5 ; paddw mm4, mm6 ; paddw mm3, rd ; xmm3 += round value psraw mm3, FILTER_SHIFT ; xmm3 /= 128 paddw mm4, rd ; psraw mm4, FILTER_SHIFT ; packuswb mm3, mm4 ; pack and unpack to saturate movq mm5, [edi] ; psadbw mm3, mm5 ; paddd mm7, mm3 // the subsequent iterations repeat 3 out of 4 of these reads. Since the // recon block should be in cache this shouldn't cost much. Its obviously // avoidable!!!. 
add esi, eax add edi, edx dec ecx ; decrement count jnz nextrow ; next row movd Error, mm7 } return Error; } /**************************************************************************** * * ROUTINE : FilterBlock1d_hb8_SAD_mmx * * INPUTS : UINT8 *SrcPtr : Pointer to input block. * INT32 SrcStride : Stride for input block. * UINT8 *RefPtr : Pointer to reference block. * UINT32 SrcPixelsPerLine : Stride for reference block. * UINT32 PixelStep : Offset to move to next pixel in input. * INT16 *Filter : Pointer to filter taps. * * OUTPUTS : None. * * RETURNS : UINT32: SAD error. * * FUNCTION : Applies 1-D horizontal bi-linear filter to input block. * * SPECIAL NOTES : None. * ****************************************************************************/ _inline UINT32 FilterBlock1d_hb8_SAD_mmx ( UINT8 *SrcPtr, INT32 SrcStride, UINT8 *RefPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, INT16 *Filter ) { UINT32 Error = 0; __asm { mov edi, Filter movq mm1, [edi] ; xmm3 *= kernel 0 modifiers. movq mm2, [edi + 16] ; xmm3 *= kernel 0 modifiers. mov edi, SrcPtr mov esi, RefPtr pxor mm0, mm0 ; mm0 = 00000000 pxor mm7, mm7 ; mm7 = 0 mov ecx, 8 ; mov edx, SrcStride mov eax, SrcPixelsPerLine; nextrow: movq mm3, [esi] ; mm3 = p-1..p6 movq mm4, mm3 ; make a copy punpcklbw mm3, mm0 ; pmullw mm3, mm1 ; movq mm5, [esi+1] ;mm5 = p0 ..... 
p7 punpckhbw mm4, mm0 pmullw mm4, mm1 ; movq mm6, mm5 ; punpcklbw mm5, mm0 ; mm5 = p0..p7 pmullw mm5, mm2 ; punpckhbw mm6, mm0 ; pmullw mm6, mm2 ; paddw mm3, mm5 ; paddw mm4, mm6 ; paddw mm3, rd ; xmm3 += round value psraw mm3, FILTER_SHIFT ; xmm3 /= 128 paddw mm4, rd ; psraw mm4, FILTER_SHIFT ; packuswb mm3, mm4 ; pack and unpack to saturate movq mm5, [edi] ; read src psadbw mm3, mm5 ; paddd mm7, mm3 add esi, eax ; next line add edi, edx ; dec ecx ; decrement count jnz nextrow ; next row movd Error, mm7; } return Error; } /**************************************************************************** * * ROUTINE : FiltBlockBilGetSad_mmx * * INPUTS : UINT8 *SrcPtr : Pointer to input block. * INT32 SrcStride : Stride for input block. * UINT8 *ReconPtr1 : Pointer to first reference block. * UINT8 *ReconPtr2 : Pointer to second reference block. * UINT32 PixelsPerLine : Stride for reference block. * INT32 ModX : Fractional part of x-component of MV. * INT32 ModY : Fractional part of x-component of MV. * UINT32 BestSoFar : Best error found so far. * * OUTPUTS : None. * * RETURNS : UINT32: SAD error. * * FUNCTION : Applies 2-D bi-linear filter to get prediction block * and computes SAD for prediction error. * * SPECIAL NOTES : None. * ****************************************************************************/ UINT32 FiltBlockBilGetSad_mmx ( UINT8 *SrcPtr, INT32 SrcStride, UINT8 *ReconPtr1, UINT8 *ReconPtr2, INT32 PixelsPerLine, INT32 ModX, INT32 ModY, UINT32 BestSoFar ) { INT32 diff; UINT32 Error; // swap pointers so ReconPtr1 smaller (above, left, above-right or above-left ) diff = ReconPtr2-ReconPtr1; // The ModX and ModY arguments are the bottom three bits of the signed motion vector components (at 1/8th pel precision). // This works out to be what we want... despite the pointer swapping that goes on below. // For example... if the X component of the vector is a +ve ModX = X%8. 
// if the X component of the vector is a -ve ModX = 8+(X%8) where X%8 is in the range -7 to -1. if ( diff<0 ) { // swap pointers so ReconPtr1 smaller UINT8 *temp = ReconPtr1; ReconPtr1 = ReconPtr2; ReconPtr2 = temp; diff = (int)(ReconPtr2-ReconPtr1); } if ( diff==1 ) Error = FilterBlock1d_hb8_SAD_mmx ( SrcPtr, SrcStride, ReconPtr1, PixelsPerLine, 1, BilinearFilters_mmx[ModX] ); else if (diff == (int)(PixelsPerLine) ) // Fractional pixel in vertical only Error = FilterBlock1d_vb8_SAD_mmx ( SrcPtr, SrcStride, ReconPtr1, PixelsPerLine, PixelsPerLine, BilinearFilters_mmx[ModY] ); else if(diff == (int)(PixelsPerLine - 1)) // ReconPtr1 is Top right Error = FilterBlock2dBil_SAD_mmx ( SrcPtr, SrcStride, ReconPtr1-1, PixelsPerLine, BilinearFilters_mmx[ModX], BilinearFilters_mmx[ModY] ); else if(diff == (int)(PixelsPerLine + 1) ) // ReconPtr1 is Top left Error = FilterBlock2dBil_SAD_mmx ( SrcPtr, SrcStride, ReconPtr1, PixelsPerLine, BilinearFilters_mmx[ModX], BilinearFilters_mmx[ModY] ); return Error; }