| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| %include "libavutil/x86/x86util.asm" |
|
|
SECTION_RODATA 32

; Dword indices loaded into m15 and used as the index operand of vpermd in the
; 16-pixel loop: the result takes dwords alternately from the low and the high
; 128-bit lane of the packed vector (0,4,1,5,...).
swizzle:    dd 0, 4, 1, 5, 2, 6, 3, 7

; Eight dwords of 4, loaded into m14 by the X4 variant and added to every
; gather offset after each 4-tap pass.
four:       dd 4, 4, 4, 4, 4, 4, 4, 4

SECTION .text
|
|
| |
| |
| ; |
| |
| |
| |
| |
| |
| ; |
| |
| |
| |
| |
|
|
;-----------------------------------------------------------------------------
; SCALE_FUNC <suffix>
;
; Emits hscale8to15_<suffix>: a horizontal scaler that multiplies unsigned
; 8-bit source samples by signed 16-bit coefficients, sums the products,
; shifts the sum right by 7 (arithmetic) and stores it saturated to a signed
; 16-bit word.
;
;   <suffix> = X4    the taps are consumed four at a time in an inner loop
;                    that runs (fltsize >> 2) times (always at least once)
;   otherwise        exactly four taps per output, no inner loop
;
; Arguments, in the order named on the cglobal line:
;   pos0     not referenced by this code
;   dst      output buffer, 16-bit elements (addressed as dst + count*2)
;   w        number of outputs, 32-bit signed (sign-extended on entry)
;   srcmem   base address for the byte gathers
;   filter   16-bit coefficients, read sequentially (4 per output per pass)
;   fltpos   32-bit offsets into srcmem, one per output
;   fltsize  total tap count (only read by the X4 variant)
; Scratch GPRs: count (outputs produced so far), inner (X4 pass counter).
;
; Vector register roles:
;   m0  zero            m1/m2   gather offsets       m3/m4  gathered bytes
;   m5-m8 products      m9-m12  accumulators (X4)    m13    gather mask
;   m14 constant 4 (X4) m15     vpermd indices
;
; Outputs are produced 16 per iteration while at least 16 remain, then 4 per
; iteration. The 4-wide tail always stores four words, and it runs once even
; when entered with w <= 0, so dst must have room for w rounded up to a
; multiple of 4 (and for 4 words when w <= 0).
;
; NOTE(review): in the 16-wide loop the unpack instructions work per 128-bit
; lane and the packed result is reordered by vpermd, so that path presumably
; expects fltpos/filter to have been pre-arranged by the caller for each group
; of 16 outputs - confirm against the C-side setup code.
;-----------------------------------------------------------------------------
%macro SCALE_FUNC 1
cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, count, inner
    movsxd          wq, wd                          ; w arrives as a 32-bit signed int
    xor             countq, countq                  ; count = 0
    vpxor           m0, m0, m0                      ; zero source for byte -> word widening
    mova            m15, [swizzle]
%ifidn %1, X4
    shr             fltsized, 2                     ; taps -> number of 4-tap passes
    mova            m14, [four]
%endif
    cmp             wq, 16
    jl              .loop4                          ; fewer than 16 outputs: tail only
    sub             wq, 16                          ; w = index of the last full 16-output group

    ; ---- 16 outputs per iteration ----
.loop16:
    movu            m1, [fltposq]                   ; offsets for 8 outputs
    movu            m2, [fltposq + 32]              ; offsets for 8 more outputs
%ifidn %1, X4
    xor             innerq, innerq                  ; pass counter
    vpxor           m9,  m9,  m9                    ; clear the four accumulators
    vpxor           m10, m10, m10
    vpxor           m11, m11, m11
    vpxor           m12, m12, m12
.taps16:
%endif
    ; The gather clears its mask register, so the all-ones mask is rebuilt
    ; before each gather.
    vpcmpeqd        m13, m13, m13
    vpgatherdd      m3, [srcmemq + m1], m13         ; 4 source bytes per output
    vpcmpeqd        m13, m13, m13
    vpgatherdd      m4, [srcmemq + m2], m13
    ; Widen bytes to words by interleaving with zero.
    vpunpcklbw      m5, m3, m0
    vpunpckhbw      m6, m3, m0
    vpunpcklbw      m7, m4, m0
    vpunpckhbw      m8, m4, m0
    ; sample * coefficient, adjacent pairs summed into dwords.
    vpmaddwd        m5, m5, [filterq]
    vpmaddwd        m6, m6, [filterq + 32]
    vpmaddwd        m7, m7, [filterq + 64]
    vpmaddwd        m8, m8, [filterq + 96]
    add             filterq, 128                    ; 16 outputs * 4 taps * 2 bytes
%ifidn %1, X4
    vpaddd          m9,  m9,  m5
    vpaddd          m10, m10, m6
    vpaddd          m11, m11, m7
    vpaddd          m12, m12, m8
    vpaddd          m1, m1, m14                     ; move every offset on by 4 bytes
    vpaddd          m2, m2, m14
    add             innerq, 1
    cmp             innerq, fltsizeq
    jl              .taps16
    vphaddd         m5, m9, m10                     ; fold the two partial sums per output
    vphaddd         m6, m11, m12
%else
    vphaddd         m5, m5, m6                      ; fold the two partial sums per output
    vphaddd         m6, m7, m8
%endif
    vpsrad          m5, m5, 7
    vpsrad          m6, m6, 7
    vpackssdw       m5, m5, m6                      ; saturate to 16 bit (per 128-bit lane)
    vpermd          m5, m15, m5                     ; interleave dwords of the two lanes
    vmovdqu         [dstq + countq * 2], m5
    add             fltposq, 64                     ; 16 offsets consumed
    add             countq, 16
    cmp             countq, wq
    jle             .loop16                         ; another full group of 16 left

    add             wq, 16                          ; restore the real output count
    cmp             countq, wq
    jge             .done                           ; nothing left for the tail

    ; ---- 4 outputs per iteration ----
.loop4:
    movu            xm1, [fltposq]                  ; offsets for 4 outputs
%ifidn %1, X4
    xor             innerq, innerq
    vpxor           xm9,  xm9,  xm9
    vpxor           xm10, xm10, xm10
.taps4:
%endif
    vpcmpeqd        xm13, xm13, xm13                ; fresh all-ones gather mask
    vpgatherdd      xm3, [srcmemq + xm1], xm13
    vpunpcklbw      xm5, xm3, xm0
    vpunpckhbw      xm6, xm3, xm0
    vpmaddwd        xm5, xm5, [filterq]
    vpmaddwd        xm6, xm6, [filterq + 16]
    add             filterq, 32                     ; 4 outputs * 4 taps * 2 bytes
%ifidn %1, X4
    vpaddd          xm9,  xm9,  xm5
    vpaddd          xm10, xm10, xm6
    vpaddd          xm1, xm1, xm14                  ; move every offset on by 4 bytes
    add             innerq, 1
    cmp             innerq, fltsizeq
    jl              .taps4
    vphaddd         xm5, xm9, xm10
%else
    vphaddd         xm5, xm5, xm6
%endif
    vpsrad          xm5, xm5, 7
    vpackssdw       xm5, xm5, xm5
    vmovq           [dstq + countq * 2], xm5        ; store 4 words
    add             fltposq, 16
    add             countq, 4
    cmp             countq, wq
    jl              .loop4
.done:
    RET
%endmacro
|
|
; Instantiate the scalers. SCALE_FUNC uses vector registers m0-m15, which are
; only available in 64-bit mode, hence the ARCH_X86_64 guard.
; HAVE_AVX2_EXTERNAL is presumably the build-configuration switch saying the
; assembler can emit AVX2 - confirm against the build system.
%if ARCH_X86_64
    %if HAVE_AVX2_EXTERNAL
        INIT_YMM avx2
        ; fixed four-tap variant: hscale8to15_4
        SCALE_FUNC 4
        ; variant that loops over the taps four at a time: hscale8to15_X4
        SCALE_FUNC X4
    %endif ; HAVE_AVX2_EXTERNAL
%endif ; ARCH_X86_64
|
|