/*
 * Copyright (C) 2001-2002, Marcelo E. Magallon
 * You are free to use and distribute this code, with or without modifications,
 * in source or binary forms as long as you don't remove this copyright notice.
 *
 * This is gcc extended assembly, that means AT&T syntax.
 *
 * Both blending macros below use pshufw, which is not part of the original
 * MMX instruction set (it arrived with the SSE integer extensions), so the
 * target CPU must support it.
 */

#ifndef _BLEND_MMX_H_
#define _BLEND_MMX_H_

/*
 * Over operator implementation in MMX, this does, for each of the four
 * 8-bit channels of a 32-bit pixel:
 *
 *     r = a + (255 - aalpha)*b/255        (final add saturates at 255)
 *
 * where aalpha is the low byte of the 32-bit value loaded from a, and the
 * division by 255 is computed without a divide as:
 *
 *     t     = x + 128
 *     x/255 = (t + (t >> 8)) >> 8
 *
 * Parameters:
 *     a, b  pointers to the 32-bit source pixels (read only)
 *     r     pointer to the 32-bit destination pixel (written)
 *
 * Operand notes:
 *   - All three pointers are passed as plain register inputs. r is an input
 *     too: the asm stores *through* it, it never modifies the pointer itself.
 *   - The "memory" clobber tells the compiler that *a / *b are read and *r is
 *     written behind its back.
 *   - Clobbers eax and mm0-mm3.
 *
 * You have to call OVER_8F_FINISH() once you are done blending.
 */
#define OVER_8F(a,b,r)                                                        \
    __asm__ volatile(                                                         \
        "\n\t movd      (%1), %%mm0"      /* copy a to mm0                 */ \
        "\n\t mov       $255, %%eax"                                          \
        "\n\t movd      %%eax, %%mm1"                                         \
        "\n\t movd      %%eax, %%mm2"                                         \
        "\n\t pxor      %%mm0, %%mm1"     /* 255 ^ a                       */ \
        "\n\t pand      %%mm2, %%mm1"     /* keep low byte: 255 - aalpha   */ \
        "\n\t pxor      %%mm2, %%mm2"                                         \
        "\n\t pshufw    $0, %%mm1, %%mm2" /* broadcast 255-aalpha to 4 words */ \
        "\n\t movd      (%2), %%mm1"      /* copy b to mm1                 */ \
        "\n\t pxor      %%mm3, %%mm3"                                         \
        "\n\t punpcklbw %%mm3, %%mm1"     /* zero-extend b bytes to words  */ \
        "\n\t pmullw    %%mm2, %%mm1"     /* x = (255-aalpha)*b            */ \
        "\n\t mov       $128, %%eax"                                          \
        "\n\t movd      %%eax, %%mm2"                                         \
        "\n\t pshufw    $0, %%mm2, %%mm2" /* broadcast 128 to 4 words      */ \
        "\n\t paddusw   %%mm1, %%mm2"     /* t = x + 128                   */ \
        "\n\t movq      %%mm2, %%mm1"                                         \
        "\n\t psrlw     $8, %%mm1"        /* t >> 8                        */ \
        "\n\t paddusw   %%mm2, %%mm1"     /* t + (t >> 8)                  */ \
        "\n\t psrlw     $8, %%mm1"        /* (t + (t >> 8)) >> 8           */ \
        "\n\t packuswb  %%mm1, %%mm1"     /* pack words back to bytes      */ \
        "\n\t paddusb   %%mm1, %%mm0"     /* a + (255-aalpha)*b/255        */ \
        "\n\t movd      %%mm0, (%0)"      /* copy result to memory         */ \
        : /* no register outputs: the result is stored through r */           \
        : "r" (r),                        /* %0 */                            \
          "r" (a),                        /* %1 */                            \
          "r" (b)                         /* %2 */                            \
        : "eax", "mm0", "mm1", "mm2", "mm3", "memory"                         \
    )

/*
 * Leaves MMX state (emms) so that x87 floating-point code can run again.
 * Call once after a sequence of OVER_8F() invocations.
 */
#define OVER_8F_FINISH() \
    __asm__ volatile("\n\t emms")

/*
 * Blend operator:
 *
 *     r = aalpha*a/255 + (1-aalpha)*b/255
 *
 * computed per 8-bit channel as:
 *
 *     r = (a-b)*aalpha/255 + b
 *
 * where aalpha is the low byte of the 32-bit value loaded from a. The
 * product is biased by +128 or -128 depending on the sign of (a-b) before
 * the shift-based division by 255, and only the low byte of each channel
 * result is kept.
 *
 * Parameters:
 *     a, b  pointers to the 32-bit source pixels (read only)
 *     r     pointer to the 32-bit destination pixel (written)
 *
 * Operand notes:
 *   - All three pointers are passed as plain register inputs. r is an input
 *     too: the asm stores *through* it, it never modifies the pointer itself.
 *   - The "memory" clobber tells the compiler that *a / *b are read and *r is
 *     written behind its back.
 *   - Clobbers eax and mm0-mm3.
 *
 * You have to call BLEND_8F_FINISH() once you are done blending.
 */
#define BLEND_8F(a,b,r)                                                       \
    __asm__ volatile(                                                         \
        "\n\t movd      (%1), %%mm0"      /* copy a to mm0                 */ \
        "\n\t movd      (%2), %%mm1"      /* copy b to mm1                 */ \
        "\n\t pxor      %%mm3, %%mm3"                                         \
        "\n\t punpcklbw %%mm3, %%mm0"     /* unpack a in mm0               */ \
        "\n\t punpcklbw %%mm3, %%mm1"     /* unpack b in mm1               */ \
                                                                              \
        "\n\t movq      %%mm0, %%mm2"                                         \
        "\n\t punpcklwd %%mm2, %%mm2"                                         \
        "\n\t punpckldq %%mm2, %%mm2"     /* broadcast aalpha to 4 words   */ \
                                                                              \
        "\n\t psubw     %%mm1, %%mm0"     /* a = a - b                     */ \
                                                                              \
        "\n\t movq      %%mm0, %%mm3"     /* keep a - b for its sign bit   */ \
                                                                              \
        "\n\t pmullw    %%mm2, %%mm0"     /* (a - b)*aa in mm0             */ \
                                                                              \
        "\n\t psrlw     $15, %%mm3"                                           \
        "\n\t psllw     $8, %%mm3"        /* mm3 = 256 if a - b < 0        */ \
        "\n\t mov       $128, %%eax"                                          \
        "\n\t movd      %%eax, %%mm2"                                         \
        "\n\t pshufw    $0, %%mm2, %%mm2" /* 128x4 in mm2                  */ \
        "\n\t psubw     %%mm3, %%mm2"     /* sign(a-b) * 128 in mm2        */ \
                                                                              \
        "\n\t paddw     %%mm2, %%mm0"     /* mm0 = (a-b)*aa +- 128         */ \
                                                                              \
        "\n\t movq      %%mm0, %%mm2"     /* mm0 = mm2 = x                 */ \
        "\n\t psrlw     $8, %%mm2"        /* mm2 = x/256                   */ \
        "\n\t paddw     %%mm2, %%mm0"     /* mm0 = x + x/256               */ \
        "\n\t psrlw     $8, %%mm0"        /* mm0 = (x + x/256)/256         */ \
                                                                              \
        "\n\t paddw     %%mm1, %%mm0"     /* mm0 = (a-b)*aa/255 + b        */ \
                                                                              \
        "\n\t psllw     $8, %%mm0"                                            \
        "\n\t psrlw     $8, %%mm0"        /* mask upper byte in mm0        */ \
        "\n\t packuswb  %%mm0, %%mm0"     /* pack result                   */ \
        "\n\t movd      %%mm0, (%0)"      /* copy result to memory         */ \
        : /* no register outputs: the result is stored through r */           \
        : "r" (r),                        /* %0 */                            \
          "r" (a),                        /* %1 */                            \
          "r" (b)                         /* %2 */                            \
        : "eax", "mm0", "mm1", "mm2", "mm3", "memory"                         \
    )

/*
 * Leaves MMX state (emms) so that x87 floating-point code can run again.
 * Call once after a sequence of BLEND_8F() invocations.
 */
#define BLEND_8F_FINISH() \
    __asm__ volatile("\n\t emms")

#endif /* _BLEND_MMX_H_ */