| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; 16-bit coefficient tables used by FILTER. Each table is 32 bytes long, so a
; single table can feed a full-width memory operand at mmsize == 32 as well
; as at mmsize == 16.
;
; pw_coefhf   word pairs for pmaddwd against interleaved (far-tap sum, A+B)
; pw_coefhf1  single coefficient applied with pmullw/pmulhw to the +-2 line sum
; pw_coefsp   word pairs for pmaddwd against interleaved (+-1 sum, +-3 sum)
; pw_splfdif  masked per lane, then added to pw_coefsp to alter the pair
pw_coefhf:  times 8  dw 1016, 5570
pw_coefhf1: times 16 dw -3801
pw_coefsp:  times 8  dw 5077, -981
pw_splfdif: times 8  dw -768, 768

SECTION .text
| |
|
; LOAD8 dst, src
; Load 8-bit samples from memory operand src into vector register dst,
; widened to one 16-bit word per sample.
;   mmsize == 32 : pmovzxbw zero-extends while loading.
;   otherwise    : movh loads the bytes, punpcklbw interleaves them with m7.
;                  The result is a zero-extension only while m7 == 0, so the
;                  caller has to keep m7 cleared around every LOAD8.
%macro LOAD8 2
%if mmsize == 32
    pmovzxbw     %1, %2
%else
    movh         %1, %2
    punpcklbw    %1, m7
%endif
%endmacro
| |
|
; LOAD12 dst, src
; Load one full vector from memory operand src into dst with an unaligned
; move. No widening is done; the data is used as 16-bit words as-is.
%macro LOAD12 2
    movu         %1, %2
%endmacro
| |
|
; DISP8
; Store the 16-bit results held in m2 to [dstq] as unsigned-saturated bytes.
;   mmsize == 32 : the upper 128-bit half of m2 is extracted into xm1
;                  (clobbering it), packed together with the lower half and
;                  written as 16 bytes.
;   otherwise    : m2 is packed with itself and the low half is written.
; m2 is overwritten by the pack in both cases.
%macro DISP8 0
%if mmsize == 32
    vextracti128 xm1, m2, 1
    packuswb     xm2, xm1
    movu     [dstq], xm2
%else
    packuswb     m2, m2
    movh     [dstq], m2
%endif
%endmacro
| |
|
; DISP12
; Clamp the 16-bit results in m2 with CLIPW, using m7 as the lower bound and
; m12 as the upper bound, then store the whole vector to [dstq] unaligned.
; FILTER clears m7 immediately before expanding this macro; m12 is filled
; with the broadcast clip_max argument by the 12-bit function prologue.
%macro DISP12 0
    CLIPW        m2, m7, m12
    movu     [dstq], m2
%endmacro
| |
|
;------------------------------------------------------------------------------
; FILTER suffix, ptrA, ptrB, depth, scale
;   %1  label suffix; keeps .loop%1 unique across the two expansions in PROC
;   %2  pointer register A (PROC passes prevq or curq)
;   %3  pointer register B (PROC passes curq or nextq)
;   %4  8 or 12; selects the LOAD%4 / DISP%4 macro pair
;   %5  multiplier applied to the line offsets (PROC passes 1 for the 8-bit
;       build and 2 for the 12-bit build)
;
; Expects from the expansion site:
;   t0, t1      line offsets (PROC loads them from prefs / mrefs)
;   t2, t3      x86-64 only: line offsets loaded from prefs3 / mrefs3
;   m8..m11     scratch vectors (real registers on x86-64, stack slots on
;               x86-32 through the %defines in PROC)
;   STEP        byte increment applied to all four pointers per iteration
;   wm          remaining width; decremented by mmsize/2 per iteration
;
; Values that stay live inside one iteration:
;   m0  = cur[t0]         m1  = cur[t1]
;   m8  = A + B           m9  = |A - B|
;   m10 = sum of the four +-2*t samples of A and B
;   m11 = final clamp radius
; m7 is zero at every LOAD/DISP, but is borrowed as a temporary in between.
; All of m0..m7 are clobbered.
;------------------------------------------------------------------------------
%macro FILTER 5
    pxor         m7, m7                 ; zero for LOAD8 unpacking and compares
.loop%1:
    ; ---- stage 1: largest of three half-differences -> m2 -------------------
    LOAD%4       m0, [curq+t0*%5]
    LOAD%4       m1, [curq+t1*%5]
    LOAD%4       m2, [%2]
    LOAD%4       m3, [%3]
    mova         m4, m3
    paddw        m3, m2                 ; m3 = A + B
    psubw        m2, m4
    ABS1         m2, m4                 ; m2 = |A - B|
    mova         m8, m3                 ; keep A + B
    mova         m9, m2                 ; keep |A - B| (unshifted)
    LOAD%4       m3, [prevq+t0*%5]
    LOAD%4       m4, [prevq+t1*%5]
    psubw        m3, m0
    psubw        m4, m1
    ABS2         m3, m4, m5, m6
    paddw        m3, m4                 ; |prev[t0]-cur[t0]| + |prev[t1]-cur[t1]|
    psrlw        m2, 1
    psrlw        m3, 1
    pmaxsw       m2, m3
    LOAD%4       m3, [nextq+t0*%5]
    LOAD%4       m4, [nextq+t1*%5]
    psubw        m3, m0
    psubw        m4, m1
    ABS2         m3, m4, m5, m6
    paddw        m3, m4                 ; same measure against the next picture
    psrlw        m3, 1
    pmaxsw       m2, m3                 ; m2 = max of the three

    ; ---- stage 2: widen m2 using the +-2 line neighbours -> m11 -------------
    LOAD%4       m3, [%2+t0*2*%5]
    LOAD%4       m4, [%3+t0*2*%5]
    LOAD%4       m5, [%2+t1*2*%5]
    LOAD%4       m6, [%3+t1*2*%5]
    paddw        m3, m4                 ; A[2*t0] + B[2*t0]
    paddw        m5, m6                 ; A[2*t1] + B[2*t1]
    mova         m6, m3
    paddw        m6, m5
    mova        m10, m6                 ; keep the four-sample sum for stage 3
    psrlw        m3, 1
    psrlw        m5, 1
    psubw        m3, m0                 ; x = (A[2*t0]+B[2*t0])/2 - cur[t0]
    psubw        m5, m1                 ; y = (A[2*t1]+B[2*t1])/2 - cur[t1]
    mova         m6, m3
    pminsw       m3, m5                 ; m3 = min(x, y)
    pmaxsw       m5, m6                 ; m5 = max(x, y)
    mova         m4, m8
    psraw        m4, 1                  ; d = (A + B) >> 1
    mova         m6, m4
    psubw        m6, m0                 ; d - cur[t0]
    psubw        m4, m1                 ; d - cur[t1]
    pmaxsw       m3, m6
    pminsw       m5, m6
    pmaxsw       m3, m4                 ; m3 = max(min(x,y), d-cur[t0], d-cur[t1])
    pminsw       m5, m4                 ; m5 = min(max(x,y), d-cur[t0], d-cur[t1])
    mova         m6, m7
    psubw        m6, m3                 ; m6 = -m3 (m7 is zero here)
    pmaxsw       m6, m5                 ; m6 = max(-m3, m5)
    mova         m3, m2
    pcmpgtw      m3, m7                 ; lanes where m2 > 0
    pand         m6, m3                 ; only those lanes may be widened
    pmaxsw       m2, m6
    mova        m11, m2                 ; keep the clamp radius

    ; ---- stage 3: A/B terms of the interpolation, as dwords in m2:m3 --------
    LOAD%4       m2, [%2+t0*4*%5]
    LOAD%4       m3, [%3+t0*4*%5]
    LOAD%4       m4, [%2+t1*4*%5]
    LOAD%4       m5, [%3+t1*4*%5]
    paddw        m2, m3
    paddw        m4, m5
    paddw        m2, m4                 ; sum of the four +-4*t samples
    mova         m3, m2
    punpcklwd    m2, m8                 ; interleave (far sum, A+B) word pairs
    punpckhwd    m3, m8
    pmaddwd      m2, [pw_coefhf]        ; 1016*far sum + 5570*(A+B)
    pmaddwd      m3, [pw_coefhf]
    mova         m4, m10
    mova         m6, m4
    pmullw       m4, [pw_coefhf1]       ; low  16 bits of -3801 * m10
    pmulhw       m6, [pw_coefhf1]       ; high 16 bits of -3801 * m10
    mova         m5, m4
    punpcklwd    m4, m6                 ; rebuild the 32-bit products
    punpckhwd    m5, m6
    paddd        m2, m4
    paddd        m3, m5
    psrad        m2, 2
    psrad        m3, 2

    ; ---- stage 4: current-picture terms, per-lane coefficient choice --------
    mova         m4, m0
    paddw        m0, m1                 ; m0 = cur[t0] + cur[t1]
%if ARCH_X86_64
    LOAD%4       m5, [curq+t2*%5]
    LOAD%4       m6, [curq+t3*%5]
%else
    ; Only two offset registers exist here: point r4/r5 (= t0/t1) at the
    ; prefs3/mrefs3 arguments for these two loads, then restore them.
    mov          r4, prefs3mp
    mov          r5, mrefs3mp
    LOAD%4       m5, [curq+t0*%5]
    LOAD%4       m6, [curq+t1*%5]
    mov          r4, prefsmp
    mov          r5, mrefsmp
%endif
    paddw        m6, m5                 ; m6 = sum of the two +-3 line samples
    psubw        m1, m4
    ABS1         m1, m4                 ; |cur[t1] - cur[t0]|
    pcmpgtw      m1, m9                 ; mask: that difference > |A - B|
    mova         m4, m1
    punpcklwd    m1, m4                 ; widen the word mask to dword lanes
    punpckhwd    m4, m4
    pand         m2, m1                 ; stage 3 term survives only where set
    pand         m3, m4
    mova         m5, [pw_splfdif]
    mova         m7, m5                 ; m7 borrowed as a temporary
    pand         m5, m1
    pand         m7, m4
    paddw        m5, [pw_coefsp]        ; pw_coefsp, plus pw_splfdif where set
    paddw        m7, [pw_coefsp]
    mova         m4, m0
    punpcklwd    m0, m6                 ; interleave (+-1 sum, +-3 sum) pairs
    punpckhwd    m4, m6
    pmaddwd      m0, m5
    pmaddwd      m4, m7
    paddd        m2, m0
    paddd        m3, m4
    psrad        m2, 13
    psrad        m3, 13
    packssdw     m2, m3                 ; back to 16-bit words in m2

    ; ---- stage 5: clamp to d +- radius and store ----------------------------
    mova         m4, m8
    psraw        m4, 1                  ; d = (A + B) >> 1
    mova         m0, m11
    mova         m3, m4
    psubw        m4, m0                 ; lower bound = d - radius
    paddw        m3, m0                 ; upper bound = d + radius
    CLIPW        m2, m4, m3
    pxor         m7, m7                 ; m7 was a temporary above; re-zero it
    DISP%4

    add        dstq, STEP
    add       prevq, STEP
    add        curq, STEP
    add       nextq, STEP
    sub    DWORD wm, mmsize/2
    jg .loop%1
%endmacro
| |
|
; PROC depth, scale
;   %1  8 or 12, forwarded to FILTER to pick the LOAD/DISP macros
;   %2  line-offset multiplier, forwarded to FILTER
;
; Function body shared by both entry points declared in BWDIF. It prepares
; the line-offset registers and then expands FILTER twice, once per parity:
;   parity != 0 : FILTER 1 with the pointer pair (prevq, curq)
;   parity == 0 : FILTER 0 with the pointer pair (curq, nextq)
; Only one of the two loops runs per call; both end at the common RET.
%macro PROC 2
%if ARCH_X86_64
    ; Sign-extend the four 32-bit offsets to full width and expose them to
    ; FILTER as t0..t3.
    movsxd       r5, DWORD prefsm
    movsxd       r6, DWORD mrefsm
    movsxd       r7, DWORD prefs3m
    movsxd       r8, DWORD mrefs3m
    DECLARE_REG_TMP 5, 6, 7, 8
%else
    ; The cglobal lines for this path request only 8 vector registers, so
    ; m8..m11 are redirected to 16-byte slots in the stack area they reserve.
    %define m8   [rsp+ 0]
    %define m9   [rsp+16]
    %define m10  [rsp+32]
    %define m11  [rsp+48]
    ; Two offset registers only; FILTER swaps in prefs3/mrefs3 when needed.
    mov          r4, prefsmp
    mov          r5, mrefsmp
    DECLARE_REG_TMP 4, 5
%endif
    cmp DWORD paritym, 0
    je .parity0
    FILTER 1, prevq, curq, %1, %2
    jmp .ret
.parity0:
    FILTER 0, curq, nextq, %1, %2
.ret:
    RET
%endmacro
| |
|
; BWDIF
; Emits two functions for whichever instruction set the preceding INIT_*
; line selected:
;
;   bwdif_filter_line        STEP = mmsize/2, body = PROC 8, 1
;   bwdif_filter_line_12bit  STEP = mmsize,   body = PROC 12, 2
;
; Both take the same named arguments:
;   dst, prev, cur, next, w, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3,
;   prefs4, mrefs4, parity, clip_max
; The first four are loaded into registers by cglobal; the rest are read
; through their <name>m / <name>mp accessors.
;
; x86-64 : 9 GPRs, 12 (8-bit) or 13 (12-bit) vector registers, no stack area.
; x86-32 : 6 GPRs, 8 vector registers, and 64 (8-bit) or 80 (12-bit) bytes of
;          stack holding the m8..m11 spill slots, plus m12 at [rsp+64] in the
;          12-bit function.
;
; The 12-bit prologue broadcasts clip_max into every word of m12, which
; DISP12 uses as its upper clamp bound.
%macro BWDIF 0
%if ARCH_X86_64
cglobal bwdif_filter_line, 4, 9, 12, 0, dst, prev, cur, next, w, prefs, \
                                        mrefs, prefs2, mrefs2, prefs3, mrefs3, \
                                        prefs4, mrefs4, parity, clip_max
%else
cglobal bwdif_filter_line, 4, 6, 8, 64, dst, prev, cur, next, w, prefs, \
                                        mrefs, prefs2, mrefs2, prefs3, mrefs3, \
                                        prefs4, mrefs4, parity, clip_max
%endif
    %define STEP mmsize/2
    PROC 8, 1

%if ARCH_X86_64
cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \
                                              prefs, mrefs, prefs2, mrefs2, \
                                              prefs3, mrefs3, prefs4, \
                                              mrefs4, parity, clip_max
    %if mmsize == 32
    vpbroadcastw m12, WORD clip_maxm
    %else
    movd        m12, DWORD clip_maxm
    SPLATW      m12, m12, 0
    %endif
%else
cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, prefs2, mrefs2, \
                                              prefs3, mrefs3, prefs4, \
                                              mrefs4, parity, clip_max
    ; m12 is not a register on this path: build the broadcast in m0 and
    ; spill it to its stack slot.
    %define m12 [rsp+64]
    movd         m0, DWORD clip_maxm
    SPLATW       m0, m0, 0
    mova        m12, m0
%endif
    %define STEP mmsize
    PROC 12, 2
%endmacro
| |
|
; Instantiate both entry points once per instruction set. The 256-bit AVX2
; build is emitted only for x86-64 and only when HAVE_AVX2_EXTERNAL is set;
; BWDIF's x86-32 path has no mmsize == 32 handling.
INIT_XMM ssse3
BWDIF
INIT_XMM sse2
BWDIF

%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
INIT_YMM avx2
BWDIF
%endif
| |
|