| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | %include "libavutil/x86/x86util.asm" |
| |
|
; "pointer" is the reserve directive for one pointer-sized struc field:
; resq (8 bytes) when assembling for x86-64, resd (4 bytes) otherwise.
%if ARCH_X86_64
    %define pointer resq
%else
    %define pointer resd
%endif
| |
|
; One coefficient descriptor; the code below walks an array of these through
; coeffsq, using Coeffs.sizeof as the element stride.
; NOTE(review): presumably mirrors a C-side struct - keep field order and
; sizes in sync with it.
struc Coeffs
    .val:    pointer 1      ; coefficient array, read with aligned loads at [val + 4 * x]
    .start:  resd 1         ; 32-bit offset added to x to form the src index
    .len:    resd 1         ; 32-bit coefficient count; loops run while x < len
    .sizeof:                ; size of one descriptor in bytes
endstruc
| |
|
;------------------------------------------------------------------------------
; CQT_CALC a_even, a_odd, b_even, b_odd, t0, t1, t2, coeff, coeffs_offset
;
;   %1, %2  accumulators for the forward read at src[i ...]:
;             %1 += even-numbered floats * coeff, %2 += odd-numbered floats * coeff
;   %3, %4  accumulators for the mirrored read ending at src[fft_len - i]:
;             %3 += even-numbered floats * coeff, %4 += odd-numbered floats * coeff
;             (floats taken in reversed order)
;   %5..%7  scratch vector registers, clobbered
;   %8      vector register holding the coefficients for the current x
;   %9      byte offset added to coeffsq to pick the Coeffs entry whose
;           .start is used (0 or Coeffs.sizeof at the call sites)
;
; Reads srcq, coeffsq, xd and fft_lend from the enclosing function.
; Clobbers the i register and the flags.
; NOTE(review): src is indexed in 8-byte units, which looks like interleaved
; (re, im) float pairs - confirm against the caller.
;------------------------------------------------------------------------------
%macro CQT_CALC 9
    ; i = x + Coeffs.start of the selected entry
    mov         id, xd
    add         id, [coeffsq + Coeffs.start + %9]

    ; Forward read: 2 * mmsize bytes starting at src + 8 * i (aligned loads).
    movaps      m%5, [srcq + 8 * iq]
    movaps      m%7, [srcq + 8 * iq + mmsize]
    shufps      m%6, m%5, m%7, q3131            ; odd-numbered floats of each 128-bit lane
    shufps      m%5, m%5, m%7, q2020            ; even-numbered floats of each 128-bit lane

    ; The sub/neg pair computes i = fft_len - i; it is interleaved with the
    ; multiply-adds, which do not depend on it.
    sub         id, fft_lend
    FMULADD_PS  m%2, m%6, m%8, m%2, m%6         ; %2 += odd  * coeff (m%6 doubles as scratch)
    neg         id
    FMULADD_PS  m%1, m%5, m%8, m%1, m%5         ; %1 += even * coeff (m%5 doubles as scratch)

    ; Mirrored read: two unaligned mmsize-byte loads; the first covers the
    ; bytes ending with the 8-byte element at index i, the second the mmsize
    ; bytes directly before it.
    movups      m%5, [srcq + 8 * iq - mmsize + 8]
    movups      m%7, [srcq + 8 * iq - 2*mmsize + 8]
    %if mmsize == 32
    ; shufps only shuffles within 128-bit lanes, so swap the two halves of
    ; each ymm register first to complete the reversal.
    vperm2f128  m%5, m%5, m%5, 1
    vperm2f128  m%7, m%7, m%7, 1
    %endif
    shufps      m%6, m%5, m%7, q1313            ; odd-numbered floats, order reversed
    shufps      m%5, m%5, m%7, q0202            ; even-numbered floats, order reversed
    FMULADD_PS  m%4, m%6, m%8, m%4, m%6         ; %4 += odd  * coeff
    FMULADD_PS  m%3, m%5, m%8, m%3, m%5         ; %3 += even * coeff
%endmacro ; CQT_CALC
| |
|
;------------------------------------------------------------------------------
; CQT_SEPARATE a_even, a_odd, b_even, b_odd, t0, t1
;
; Combines the four accumulators produced by CQT_CALC (%1..%4) and reduces
; them horizontally. On exit xmm%1 holds, in lane order:
;     sum(%1 + %3), sum(%2 - %4), sum(%4 + %2), sum(%3 - %1)
; where sum() is the total over all lanes of the register.
; %2..%6 are clobbered; %3 and %2 are reused as HADDPS scratch operands once
; their values have been consumed.
;------------------------------------------------------------------------------
%macro CQT_SEPARATE 6
    addps       m%5, m%4, m%2                   ; t0 = %4 + %2
    subps       m%6, m%3, m%1                   ; t1 = %3 - %1
    addps       m%1, m%1, m%3                   ; %1 = %1 + %3
    subps       m%2, m%2, m%4                   ; %2 = %2 - %4

    ; Pairwise horizontal adds fold the four vectors into one.
    HADDPS      m%5, m%6, m%3
    HADDPS      m%1, m%2, m%3
    HADDPS      m%1, m%5, m%2
    %if mmsize == 32
    ; HADDPS works per 128-bit lane: add the upper half onto the lower one.
    vextractf128 xmm%2, m%1, 1
    addps       xmm%1, xmm%2
    %endif
%endmacro ; CQT_SEPARATE
| |
|
;------------------------------------------------------------------------------
; DECLARE_CQT_CALC
;
; Emits showcqt_cqt_calc(dst, src, coeffs, len, fft_len) for the instruction
; set selected by the preceding INIT_XMM / INIT_YMM.
;
;   dst      output, 8 bytes (two floats) written per Coeffs entry
;   src      input, indexed in 8-byte units (see CQT_CALC)
;   coeffs   array of Coeffs descriptors
;   len      number of Coeffs entries to process
;   fft_len  used by CQT_CALC to form the mirrored index fft_len - i
;
; Both variants use a bottom-tested outer loop, so len must be non-zero.
; The x86-64 variant consumes two entries per iteration ("a" = coeffs[0],
; "b" = coeffs[1]) and stores with movaps, so there len must also be even
; and dst 16-byte aligned.
; Coefficient arrays are read with movaps at a stride of mmsize bytes, i.e.
; they must be mmsize-aligned. NOTE(review): the inner loops advance x by
; mmsize/4 until x >= len, so len is presumably padded to that multiple by
; the caller - confirm.
;------------------------------------------------------------------------------
%macro DECLARE_CQT_CALC 0
%if ARCH_X86_64
; Vector register use: m0-m3 accumulate entry a, m8-m11 accumulate entry b,
; m4-m6 are CQT_CALC scratch, m7 holds the current coefficients.
cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
    align   16
    .loop_k:
        ; Clear all eight accumulators and compute
        ; coeffs_len = min(a.len, b.len) (unsigned); scalar and vector
        ; instructions are interleaved.
        mov     xd, [coeffsq + Coeffs.len]
        xorps   m0, m0, m0
        movaps  m1, m0
        movaps  m2, m0
        mov     coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof]
        movaps  m3, m0
        movaps  m8, m0
        cmp     coeffs_lend, xd
        movaps  m9, m0
        movaps  m10, m0
        movaps  m11, m0
        cmova   coeffs_lend, xd
        xor     xd, xd                          ; x = 0
        ; Load both coefficient pointers before branching: .loop_a and .loop_b
        ; are reachable through .check_loop_b even when the common length is
        ; zero, and they must see the pointers of the current pair.
        mov     coeffs_valq, [coeffsq + Coeffs.val]
        mov     coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof]
        test    coeffs_lend, coeffs_lend
        jz      .check_loop_b
    align   16
    ; Range shared by both entries: x in [0, min(a.len, b.len)).
    .loop_ab:
        movaps  m7, [coeffs_valq + 4 * xq]
        CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
        movaps  m7, [coeffs_val2q + 4 * xq]
        CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
        add     xd, mmsize/4
        cmp     xd, coeffs_lend
        jb      .loop_ab
    ; Remainder: at most one of the two entries still has coefficients left.
    .check_loop_b:
        cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
        jae     .check_loop_a
    align   16
    .loop_b:
        movaps  m7, [coeffs_val2q + 4 * xq]
        CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
        add     xd, mmsize/4
        cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
        jb      .loop_b
    .loop_end:
        ; Reduce each entry to four sums, square them, and add adjacent pairs:
        ; dst[0..1] belong to entry a, dst[2..3] to entry b.
        CQT_SEPARATE 0, 1, 2, 3, 4, 5
        CQT_SEPARATE 8, 9, 10, 11, 4, 5
        mulps   xmm0, xmm0
        mulps   xmm8, xmm8
        HADDPS  xmm0, xmm8, xmm1
        movaps  [dstq], xmm0
        sub     lend, 2
        ; lea leaves the flags from "sub lend, 2" intact for the jnz below.
        lea     dstq, [dstq + 16]
        lea     coeffsq, [coeffsq + 2*Coeffs.sizeof]
        jnz     .loop_k
        RET
    align   16
    ; Placed after RET so the path above falls straight through to .loop_end.
    .check_loop_a:
        cmp     xd, [coeffsq + Coeffs.len]
        jae     .loop_end
    align   16
    .loop_a:
        movaps  m7, [coeffs_valq + 4 * xq]
        CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
        add     xd, mmsize/4
        cmp     xd, [coeffsq + Coeffs.len]
        jb      .loop_a
        jmp     .loop_end
%else
; x86-32: one Coeffs entry per outer iteration. Only four arguments are loaded
; into registers; fft_len is read from its argument slot via r4m.
cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
%define fft_lend r4m
    align   16
    .loop_k:
        mov     xd, [coeffsq + Coeffs.len]
        xorps   m0, m0, m0
        movaps  m1, m0
        movaps  m2, m0
        movaps  m3, m0
        test    xd, xd
        jz      .store                          ; empty entry: store the zeroed xmm0
        mov     coeffs_valq, [coeffsq + Coeffs.val]
        xor     xd, xd                          ; x = 0
    align   16
    .loop_x:
        movaps  m7, [coeffs_valq + 4 * xq]
        CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
        add     xd, mmsize/4
        cmp     xd, [coeffsq + Coeffs.len]
        jb      .loop_x
        ; Reduce to four sums, square them, and add adjacent pairs.
        CQT_SEPARATE 0, 1, 2, 3, 4, 5
        mulps   xmm0, xmm0
        HADDPS  xmm0, xmm0, xmm1
    .store:
        movlps  [dstq], xmm0                    ; low two floats only
        sub     lend, 1
        ; lea leaves the flags from "sub lend, 1" intact for the jnz below.
        lea     dstq, [dstq + 8]
        lea     coeffsq, [coeffsq + Coeffs.sizeof]
        jnz     .loop_k
        RET
%endif ; ARCH_X86_64
%endmacro ; DECLARE_CQT_CALC
| |
|
; Instantiate the function once per instruction set. INIT_XMM / INIT_YMM
; select the vector width (mmsize) and the CPU suffix seen by cglobal.

; 128-bit variants, always built.
INIT_XMM sse
DECLARE_CQT_CALC
INIT_XMM sse3
DECLARE_CQT_CALC

; 256-bit variants, built only when the assembler supports them.
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECLARE_CQT_CALC
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
DECLARE_CQT_CALC
%endif

; FMA4 variant uses 128-bit registers.
%if HAVE_FMA4_EXTERNAL
INIT_XMM fma4
DECLARE_CQT_CALC
%endif
| |
|