| | #include "mat.h" |
| | #if __ARM_NEON |
| | #include <arm_neon.h> |
| | #endif |
| | #include "platform.h" |
| |
|
| | namespace ncnn { |
| |
|
| | #if NCNN_PIXEL_ROTATE |
| | |
| | |
| | |
| |
|
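// Each kanna_rotate_<type>_c<channels> helper below implements one rotate
// type for 1/2/3/4-byte-per-pixel images; the numeric types appear to follow
// the EXIF orientation convention, and these static helpers are presumably
// dispatched from the public kanna_rotate_c* entry points declared in mat.h.
// Type 1 is the identity: rows are copied unchanged, two at a time. Every
// helper shares one pattern: a NEON main loop (intrinsics on aarch64, inline
// asm on armv7, where the "0"(nn)..."4"(dst1) input constraints tie each
// input to the same register as the matching output) handles the bulk, and
// a scalar tail copies the leftover `remain` bytes.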
static void kanna_rotate_1_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
{
    const int srcwgap = srcstride - srcw;
    const int wgap = stride - w;

    const unsigned char* src0 = src;
    const unsigned char* src1 = src + srcstride;
    unsigned char* dst0 = dst;
    unsigned char* dst1 = dst + stride;

    int y = 0;
    for (; y + 1 < srch; y += 2)
    {
#if __ARM_NEON
        int nn = srcw >> 5;
        int remain = srcw - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src0 = vld1q_u8(src0);
            uint8x16_t _src0n = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src0);
            vst1q_u8(dst0 + 16, _src0n);

            uint8x16_t _src1 = vld1q_u8(src1);
            uint8x16_t _src1n = vld1q_u8(src1 + 16);
            vst1q_u8(dst1, _src1);
            vst1q_u8(dst1 + 16, _src1n);

            src0 += 32;
            src1 += 32;
            dst0 += 32;
            dst1 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "pld [%2, #256] \n"
                "vld1.u8 {d4-d7}, [%2]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%3]! \n"
                "vst1.u8 {d4-d7}, [%4]! \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(src1),
                  "=r"(dst0),
                  "=r"(dst1)
                : "0"(nn),
                  "1"(src0),
                  "2"(src1),
                  "3"(dst0),
                  "4"(dst1)
                : "cc", "memory", "q0", "q1", "q2", "q3");
        }
#endif
#else
        int remain = srcw;
#endif

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
            *dst1++ = *src1++;
        }

        src0 += srcwgap + srcstride;
        src1 += srcwgap + srcstride;
        dst0 += wgap + stride;
        dst1 += wgap + stride;
    }

    for (; y < srch; y++)
    {
#if __ARM_NEON
        int nn = srcw >> 5;
        int remain = srcw - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src = vld1q_u8(src0);
            uint8x16_t _src2 = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src);
            vst1q_u8(dst0 + 16, _src2);

            src0 += 32;
            dst0 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%2]! \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(dst0)
                : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                : "cc", "memory", "q0", "q1");
        }
#endif
#else
        int remain = srcw;
#endif

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
        }

        src0 += srcwgap;
        dst0 += wgap;
    }
}

static void kanna_rotate_1_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
{
    const int srcwgap = srcstride - srcw * 2;
    const int wgap = stride - w * 2;

    int size = srcw * 2;

    const unsigned char* src0 = src;
    const unsigned char* src1 = src + srcstride;
    unsigned char* dst0 = dst;
    unsigned char* dst1 = dst + stride;

    int y = 0;
    for (; y + 1 < srch; y += 2)
    {
#if __ARM_NEON
        int nn = size >> 5;
        int remain = size - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src0 = vld1q_u8(src0);
            uint8x16_t _src0n = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src0);
            vst1q_u8(dst0 + 16, _src0n);

            uint8x16_t _src1 = vld1q_u8(src1);
            uint8x16_t _src1n = vld1q_u8(src1 + 16);
            vst1q_u8(dst1, _src1);
            vst1q_u8(dst1 + 16, _src1n);

            src0 += 32;
            src1 += 32;
            dst0 += 32;
            dst1 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "pld [%2, #256] \n"
                "vld1.u8 {d4-d7}, [%2]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%3]! \n"
                "vst1.u8 {d4-d7}, [%4]! \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(src1),
                  "=r"(dst0),
                  "=r"(dst1)
                : "0"(nn),
                  "1"(src0),
                  "2"(src1),
                  "3"(dst0),
                  "4"(dst1)
                : "cc", "memory", "q0", "q1", "q2", "q3");
        }
#endif
#else
        int remain = size;
#endif

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
            *dst1++ = *src1++;
        }

        src0 += srcwgap + srcstride;
        src1 += srcwgap + srcstride;
        dst0 += wgap + stride;
        dst1 += wgap + stride;
    }

    for (; y < srch; y++)
    {
#if __ARM_NEON
        int nn = size >> 5;
        int remain = size - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src = vld1q_u8(src0);
            uint8x16_t _src2 = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src);
            vst1q_u8(dst0 + 16, _src2);

            src0 += 32;
            dst0 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%2]! \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(dst0)
                : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                : "cc", "memory", "q0", "q1");
        }
#endif
#else
        int remain = size;
#endif

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
        }

        src0 += srcwgap;
        dst0 += wgap;
    }
}

static void kanna_rotate_1_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
{
    const int srcwgap = srcstride - srcw * 3;
    const int wgap = stride - w * 3;

    int size = srcw * 3;

    const unsigned char* src0 = src;
    const unsigned char* src1 = src + srcstride;
    unsigned char* dst0 = dst;
    unsigned char* dst1 = dst + stride;

    int y = 0;
    for (; y + 1 < srch; y += 2)
    {
#if __ARM_NEON
        int nn = size >> 5;
        int remain = size - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src0 = vld1q_u8(src0);
            uint8x16_t _src0n = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src0);
            vst1q_u8(dst0 + 16, _src0n);

            uint8x16_t _src1 = vld1q_u8(src1);
            uint8x16_t _src1n = vld1q_u8(src1 + 16);
            vst1q_u8(dst1, _src1);
            vst1q_u8(dst1 + 16, _src1n);

            src0 += 32;
            src1 += 32;
            dst0 += 32;
            dst1 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "pld [%2, #256] \n"
                "vld1.u8 {d4-d7}, [%2]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%3]! \n"
                "vst1.u8 {d4-d7}, [%4]! \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(src1),
                  "=r"(dst0),
                  "=r"(dst1)
                : "0"(nn),
                  "1"(src0),
                  "2"(src1),
                  "3"(dst0),
                  "4"(dst1)
                : "cc", "memory", "q0", "q1", "q2", "q3");
        }
#endif
#else
        int remain = size;
#endif

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
            *dst1++ = *src1++;
        }

        src0 += srcwgap + srcstride;
        src1 += srcwgap + srcstride;
        dst0 += wgap + stride;
        dst1 += wgap + stride;
    }

    for (; y < srch; y++)
    {
#if __ARM_NEON
        int nn = size >> 5;
        int remain = size - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src = vld1q_u8(src0);
            uint8x16_t _src2 = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src);
            vst1q_u8(dst0 + 16, _src2);

            src0 += 32;
            dst0 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%2]! \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(dst0)
                : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                : "cc", "memory", "q0", "q1");
        }
#endif
#else
        int remain = size;
#endif

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
        }

        src0 += srcwgap;
        dst0 += wgap;
    }
}

static void kanna_rotate_1_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
{
    const int srcwgap = srcstride - srcw * 4;
    const int wgap = stride - w * 4;

    int size = srcw * 4;

    const unsigned char* src0 = src;
    const unsigned char* src1 = src + srcstride;
    unsigned char* dst0 = dst;
    unsigned char* dst1 = dst + stride;

    int y = 0;
    for (; y + 1 < srch; y += 2)
    {
#if __ARM_NEON
        int nn = size >> 5;
        int remain = size - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src0 = vld1q_u8(src0);
            uint8x16_t _src0n = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src0);
            vst1q_u8(dst0 + 16, _src0n);

            uint8x16_t _src1 = vld1q_u8(src1);
            uint8x16_t _src1n = vld1q_u8(src1 + 16);
            vst1q_u8(dst1, _src1);
            vst1q_u8(dst1 + 16, _src1n);

            src0 += 32;
            src1 += 32;
            dst0 += 32;
            dst1 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "pld [%2, #256] \n"
                "vld1.u8 {d4-d7}, [%2]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%3]! \n"
                "vst1.u8 {d4-d7}, [%4]! \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(src1),
                  "=r"(dst0),
                  "=r"(dst1)
                : "0"(nn),
                  "1"(src0),
                  "2"(src1),
                  "3"(dst0),
                  "4"(dst1)
                : "cc", "memory", "q0", "q1", "q2", "q3");
        }
#endif
#else
        int remain = size;
#endif

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
            *dst1++ = *src1++;
        }

        src0 += srcwgap + srcstride;
        src1 += srcwgap + srcstride;
        dst0 += wgap + stride;
        dst1 += wgap + stride;
    }

    for (; y < srch; y++)
    {
#if __ARM_NEON
        int nn = size >> 5;
        int remain = size - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src = vld1q_u8(src0);
            uint8x16_t _src2 = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src);
            vst1q_u8(dst0 + 16, _src2);

            src0 += 32;
            dst0 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%2]! \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(dst0)
                : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                : "cc", "memory", "q0", "q1");
        }
#endif
#else
        int remain = size;
#endif

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
        }

        src0 += srcwgap;
        dst0 += wgap;
    }
}

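// Type 2 mirrors each row horizontally: dst0 starts at the last pixel of the
// output row and walks backwards. The NEON path loads 16 pixels per step,
// reverses each 8-pixel half with vrev64 and swaps the halves at store time;
// the multi-channel variants use interleaved vldN/vstN loads and stores so
// each channel is reversed without scrambling the pixel layout.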
static void kanna_rotate_2_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
{
    const int srcwgap = srcstride - srcw;
    const int wgap = stride + w;

    const unsigned char* src0 = src;
    unsigned char* dst0 = dst + w - 1;

    int y = 0;
    for (; y < srch; y++)
    {
#if __ARM_NEON
        dst0 -= 15;

        int nn = srcw >> 4;
        int remain = srcw - (nn << 4);

#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x8_t _src = vld1_u8(src0);
            uint8x8_t _src2 = vld1_u8(src0 + 8);

            _src = vrev64_u8(_src);
            _src2 = vrev64_u8(_src2);

            vst1_u8(dst0, _src2);
            vst1_u8(dst0 + 8, _src);

            src0 += 16;
            dst0 -= 16;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "mov r4, #-16 \n"
                "0: \n"
                "pld [%1, #128] \n"
                "vld1.u8 {d0-d1}, [%1]! \n"
                "vrev64.u8 d3, d0 \n"
                "vrev64.u8 d2, d1 \n"
                "subs %0, #1 \n"
                "vst1.u8 {d2-d3}, [%2], r4 \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(dst0)
                : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                : "cc", "memory", "q0", "q1", "r4");
        }
#endif

        dst0 += 15;
#else
        int remain = srcw;
#endif

        for (; remain > 0; remain--)
        {
            *dst0 = *src0;

            src0 += 1;
            dst0 -= 1;
        }

        src0 += srcwgap;
        dst0 += wgap;
    }
}

static void kanna_rotate_2_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
{
    const int srcwgap = srcstride - srcw * 2;
    const int wgap = stride + w * 2;

    const unsigned char* src0 = src;
    unsigned char* dst0 = dst + w * 2 - 2;

    int y = 0;
    for (; y < srch; y++)
    {
#if __ARM_NEON
        dst0 -= 7 * 2;

        int nn = srcw >> 4;
        int remain = srcw - (nn << 4);

#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x8x2_t _src = vld2_u8(src0);
            uint8x8x2_t _src2 = vld2_u8(src0 + 8 * 2);

            _src.val[0] = vrev64_u8(_src.val[0]);
            _src.val[1] = vrev64_u8(_src.val[1]);

            _src2.val[0] = vrev64_u8(_src2.val[0]);
            _src2.val[1] = vrev64_u8(_src2.val[1]);

            vst2_u8(dst0, _src);
            vst2_u8(dst0 - 8 * 2, _src2);

            src0 += 16 * 2;
            dst0 -= 16 * 2;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "mov r4, #-16 \n"
                "0: \n"
                "pld [%1, #128] \n"
                "vld2.u8 {d0-d1}, [%1]! \n"
                "vrev64.u8 d0, d0 \n"
                "pld [%1, #128] \n"
                "vld2.u8 {d2-d3}, [%1]! \n"
                "vrev64.u8 d1, d1 \n"
                "vrev64.u8 d2, d2 \n"
                "vst2.u8 {d0-d1}, [%2], r4 \n"
                "vrev64.u8 d3, d3 \n"
                "subs %0, #1 \n"
                "vst2.u8 {d2-d3}, [%2], r4 \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(dst0)
                : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                : "cc", "memory", "q0", "q1", "r4");
        }
#endif

        dst0 += 7 * 2;
#else
        int remain = srcw;
#endif

        for (; remain > 0; remain--)
        {
            dst0[0] = src0[0];
            dst0[1] = src0[1];

            src0 += 2;
            dst0 -= 2;
        }

        src0 += srcwgap;
        dst0 += wgap;
    }
}

static void kanna_rotate_2_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
{
    const int srcwgap = srcstride - srcw * 3;
    const int wgap = stride + w * 3;

    const unsigned char* src0 = src;
    unsigned char* dst0 = dst + w * 3 - 3;

    int y = 0;
    for (; y < srch; y++)
    {
#if __ARM_NEON
        dst0 -= 7 * 3;

        int nn = srcw >> 4;
        int remain = srcw - (nn << 4);

#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x8x3_t _src = vld3_u8(src0);
            uint8x8x3_t _src2 = vld3_u8(src0 + 8 * 3);

            _src.val[0] = vrev64_u8(_src.val[0]);
            _src.val[1] = vrev64_u8(_src.val[1]);
            _src.val[2] = vrev64_u8(_src.val[2]);

            _src2.val[0] = vrev64_u8(_src2.val[0]);
            _src2.val[1] = vrev64_u8(_src2.val[1]);
            _src2.val[2] = vrev64_u8(_src2.val[2]);

            vst3_u8(dst0, _src);
            vst3_u8(dst0 - 8 * 3, _src2);

            src0 += 16 * 3;
            dst0 -= 16 * 3;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "mov r4, #-24 \n"
                "0: \n"
                "pld [%1, #192] \n"
                "vld3.u8 {d0-d2}, [%1]! \n"
                "vrev64.u8 d0, d0 \n"
                "vrev64.u8 d1, d1 \n"
                "pld [%1, #192] \n"
                "vld3.u8 {d4-d6}, [%1]! \n"
                "vrev64.u8 d2, d2 \n"
                "vrev64.u8 d4, d4 \n"
                "vst3.u8 {d0-d2}, [%2], r4 \n"
                "vrev64.u8 d5, d5 \n"
                "vrev64.u8 d6, d6 \n"
                "subs %0, #1 \n"
                "vst3.u8 {d4-d6}, [%2], r4 \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(dst0)
                : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
        }
#endif

        dst0 += 7 * 3;
#else
        int remain = srcw;
#endif

        for (; remain > 0; remain--)
        {
            dst0[0] = src0[0];
            dst0[1] = src0[1];
            dst0[2] = src0[2];

            src0 += 3;
            dst0 -= 3;
        }

        src0 += srcwgap;
        dst0 += wgap;
    }
}

static void kanna_rotate_2_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
{
    const int srcwgap = srcstride - srcw * 4;
    const int wgap = stride + w * 4;

    const unsigned char* src0 = src;
    unsigned char* dst0 = dst + w * 4 - 4;

    int y = 0;
    for (; y < srch; y++)
    {
#if __ARM_NEON
        dst0 -= 7 * 4;

        int nn = srcw >> 4;
        int remain = srcw - (nn << 4);

#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x8x4_t _src = vld4_u8(src0);
            uint8x8x4_t _src2 = vld4_u8(src0 + 8 * 4);

            _src.val[0] = vrev64_u8(_src.val[0]);
            _src.val[1] = vrev64_u8(_src.val[1]);
            _src.val[2] = vrev64_u8(_src.val[2]);
            _src.val[3] = vrev64_u8(_src.val[3]);

            _src2.val[0] = vrev64_u8(_src2.val[0]);
            _src2.val[1] = vrev64_u8(_src2.val[1]);
            _src2.val[2] = vrev64_u8(_src2.val[2]);
            _src2.val[3] = vrev64_u8(_src2.val[3]);

            vst4_u8(dst0, _src);
            vst4_u8(dst0 - 8 * 4, _src2);

            src0 += 16 * 4;
            dst0 -= 16 * 4;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "mov r4, #-32 \n"
                "0: \n"
                "pld [%1, #256] \n"
                "vld4.u8 {d0-d3}, [%1]! \n"
                "vrev64.u8 d0, d0 \n"
                "vrev64.u8 d1, d1 \n"
                "vrev64.u8 d2, d2 \n"
                "pld [%1, #256] \n"
                "vld4.u8 {d4-d7}, [%1]! \n"
                "vrev64.u8 d3, d3 \n"
                "vrev64.u8 d4, d4 \n"
                "vrev64.u8 d5, d5 \n"
                "vst4.u8 {d0-d3}, [%2], r4 \n"
                "vrev64.u8 d6, d6 \n"
                "vrev64.u8 d7, d7 \n"
                "subs %0, #1 \n"
                "vst4.u8 {d4-d7}, [%2], r4 \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(dst0)
                : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
        }
#endif

        dst0 += 7 * 4;
#else
        int remain = srcw;
#endif

        for (; remain > 0; remain--)
        {
            dst0[0] = src0[0];
            dst0[1] = src0[1];
            dst0[2] = src0[2];
            dst0[3] = src0[3];

            src0 += 4;
            dst0 -= 4;
        }

        src0 += srcwgap;
        dst0 += wgap;
    }
}

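// Type 3 rotates by 180 degrees: dstend points one past the last output
// pixel, writing starts at dstend - pixelsize and proceeds backwards, so
// rows are reversed exactly as in type 2 while dst0 also steps upwards one
// output row per source row (dst0 -= wgap).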
static void kanna_rotate_3_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
{
    const int srcwgap = srcstride - srcw;
    const int wgap = stride - w;

    unsigned char* dstend = dst + stride * h - wgap;

    const unsigned char* src0 = src;
    unsigned char* dst0 = dstend - 1;

    int y = 0;
    for (; y < srch; y++)
    {
#if __ARM_NEON
        dst0 -= 15;

        int nn = srcw >> 4;
        int remain = srcw - (nn << 4);

#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x8_t _src = vld1_u8(src0);
            uint8x8_t _src2 = vld1_u8(src0 + 8);

            _src = vrev64_u8(_src);
            _src2 = vrev64_u8(_src2);

            vst1_u8(dst0, _src2);
            vst1_u8(dst0 + 8, _src);

            src0 += 16;
            dst0 -= 16;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "mov r4, #-16 \n"
                "0: \n"
                "pld [%1, #128] \n"
                "vld1.u8 {d0-d1}, [%1]! \n"
                "vrev64.u8 d3, d0 \n"
                "vrev64.u8 d2, d1 \n"
                "subs %0, #1 \n"
                "vst1.u8 {d2-d3}, [%2], r4 \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(dst0)
                : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                : "cc", "memory", "q0", "q1", "r4");
        }
#endif

        dst0 += 15;
#else
        int remain = srcw;
#endif

        for (; remain > 0; remain--)
        {
            *dst0 = *src0;

            src0 += 1;
            dst0 -= 1;
        }

        src0 += srcwgap;
        dst0 -= wgap;
    }
}

static void kanna_rotate_3_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
{
    const int srcwgap = srcstride - srcw * 2;
    const int wgap = stride - w * 2;

    unsigned char* dstend = dst + stride * h - wgap;

    const unsigned char* src0 = src;
    unsigned char* dst0 = dstend - 2;

    int y = 0;
    for (; y < srch; y++)
    {
#if __ARM_NEON
        dst0 -= 7 * 2;

        int nn = srcw >> 4;
        int remain = srcw - (nn << 4);

#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x8x2_t _src = vld2_u8(src0);
            uint8x8x2_t _src2 = vld2_u8(src0 + 8 * 2);

            _src.val[0] = vrev64_u8(_src.val[0]);
            _src.val[1] = vrev64_u8(_src.val[1]);

            _src2.val[0] = vrev64_u8(_src2.val[0]);
            _src2.val[1] = vrev64_u8(_src2.val[1]);

            vst2_u8(dst0, _src);
            vst2_u8(dst0 - 8 * 2, _src2);

            src0 += 16 * 2;
            dst0 -= 16 * 2;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "mov r4, #-16 \n"
                "0: \n"
                "pld [%1, #128] \n"
                "vld2.u8 {d0-d1}, [%1]! \n"
                "vrev64.u8 d0, d0 \n"
                "pld [%1, #128] \n"
                "vld2.u8 {d2-d3}, [%1]! \n"
                "vrev64.u8 d1, d1 \n"
                "vrev64.u8 d2, d2 \n"
                "vst2.u8 {d0-d1}, [%2], r4 \n"
                "vrev64.u8 d3, d3 \n"
                "subs %0, #1 \n"
                "vst2.u8 {d2-d3}, [%2], r4 \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(dst0)
                : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                : "cc", "memory", "q0", "q1", "r4");
        }
#endif

        dst0 += 7 * 2;
#else
        int remain = srcw;
#endif

        for (; remain > 0; remain--)
        {
            dst0[0] = src0[0];
            dst0[1] = src0[1];

            src0 += 2;
            dst0 -= 2;
        }

        src0 += srcwgap;
        dst0 -= wgap;
    }
}

static void kanna_rotate_3_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
{
    const int srcwgap = srcstride - srcw * 3;
    const int wgap = stride - w * 3;

    unsigned char* dstend = dst + stride * h - wgap;

    const unsigned char* src0 = src;
    unsigned char* dst0 = dstend - 3;

    int y = 0;
    for (; y < srch; y++)
    {
#if __ARM_NEON
        dst0 -= 7 * 3;

        int nn = srcw >> 4;
        int remain = srcw - (nn << 4);

#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x8x3_t _src = vld3_u8(src0);
            uint8x8x3_t _src2 = vld3_u8(src0 + 8 * 3);

            _src.val[0] = vrev64_u8(_src.val[0]);
            _src.val[1] = vrev64_u8(_src.val[1]);
            _src.val[2] = vrev64_u8(_src.val[2]);

            _src2.val[0] = vrev64_u8(_src2.val[0]);
            _src2.val[1] = vrev64_u8(_src2.val[1]);
            _src2.val[2] = vrev64_u8(_src2.val[2]);

            vst3_u8(dst0, _src);
            vst3_u8(dst0 - 8 * 3, _src2);

            src0 += 16 * 3;
            dst0 -= 16 * 3;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "mov r4, #-24 \n"
                "0: \n"
                "pld [%1, #192] \n"
                "vld3.u8 {d0-d2}, [%1]! \n"
                "vrev64.u8 d0, d0 \n"
                "vrev64.u8 d1, d1 \n"
                "pld [%1, #192] \n"
                "vld3.u8 {d4-d6}, [%1]! \n"
                "vrev64.u8 d2, d2 \n"
                "vrev64.u8 d4, d4 \n"
                "vst3.u8 {d0-d2}, [%2], r4 \n"
                "vrev64.u8 d5, d5 \n"
                "vrev64.u8 d6, d6 \n"
                "subs %0, #1 \n"
                "vst3.u8 {d4-d6}, [%2], r4 \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(dst0)
                : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
        }
#endif

        dst0 += 7 * 3;
#else
        int remain = srcw;
#endif

        for (; remain > 0; remain--)
        {
            dst0[0] = src0[0];
            dst0[1] = src0[1];
            dst0[2] = src0[2];

            src0 += 3;
            dst0 -= 3;
        }

        src0 += srcwgap;
        dst0 -= wgap;
    }
}

static void kanna_rotate_3_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
{
    const int srcwgap = srcstride - srcw * 4;
    const int wgap = stride - w * 4;

    unsigned char* dstend = dst + stride * h - wgap;

    const unsigned char* src0 = src;
    unsigned char* dst0 = dstend - 4;

    int y = 0;
    for (; y < srch; y++)
    {
#if __ARM_NEON
        dst0 -= 7 * 4;

        int nn = srcw >> 4;
        int remain = srcw - (nn << 4);

#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x8x4_t _src = vld4_u8(src0);
            uint8x8x4_t _src2 = vld4_u8(src0 + 8 * 4);

            _src.val[0] = vrev64_u8(_src.val[0]);
            _src.val[1] = vrev64_u8(_src.val[1]);
            _src.val[2] = vrev64_u8(_src.val[2]);
            _src.val[3] = vrev64_u8(_src.val[3]);

            _src2.val[0] = vrev64_u8(_src2.val[0]);
            _src2.val[1] = vrev64_u8(_src2.val[1]);
            _src2.val[2] = vrev64_u8(_src2.val[2]);
            _src2.val[3] = vrev64_u8(_src2.val[3]);

            vst4_u8(dst0, _src);
            vst4_u8(dst0 - 8 * 4, _src2);

            src0 += 16 * 4;
            dst0 -= 16 * 4;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "mov r4, #-32 \n"
                "0: \n"
                "pld [%1, #256] \n"
                "vld4.u8 {d0-d3}, [%1]! \n"
                "vrev64.u8 d0, d0 \n"
                "vrev64.u8 d1, d1 \n"
                "vrev64.u8 d2, d2 \n"
                "pld [%1, #256] \n"
                "vld4.u8 {d4-d7}, [%1]! \n"
                "vrev64.u8 d3, d3 \n"
                "vrev64.u8 d4, d4 \n"
                "vrev64.u8 d5, d5 \n"
                "vst4.u8 {d0-d3}, [%2], r4 \n"
                "vrev64.u8 d6, d6 \n"
                "vrev64.u8 d7, d7 \n"
                "subs %0, #1 \n"
                "vst4.u8 {d4-d7}, [%2], r4 \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(dst0)
                : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
        }
#endif

        dst0 += 7 * 4;
#else
        int remain = srcw;
#endif

        for (; remain > 0; remain--)
        {
            dst0[0] = src0[0];
            dst0[1] = src0[1];
            dst0[2] = src0[2];
            dst0[3] = src0[3];

            src0 += 4;
            dst0 -= 4;
        }

        src0 += srcwgap;
        dst0 -= wgap;
    }
}

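// Type 4 flips vertically: pixels within a row are copied in forward order,
// but dstend points at the start of the last output row and dst0/dst1 step
// upwards after each source row pair, so source row 0 lands on the bottom.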
static void kanna_rotate_4_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
{
    const int srcwgap = srcstride - srcw;
    const int wgap = stride + w;

    unsigned char* dstend = dst + stride * (h - 1);

    const unsigned char* src0 = src;
    const unsigned char* src1 = src + srcstride;
    unsigned char* dst0 = dstend;
    unsigned char* dst1 = dstend - stride;

    int y = 0;
    for (; y + 1 < srch; y += 2)
    {
#if __ARM_NEON
        int nn = srcw >> 5;
        int remain = srcw - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src0 = vld1q_u8(src0);
            uint8x16_t _src0n = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src0);
            vst1q_u8(dst0 + 16, _src0n);

            uint8x16_t _src1 = vld1q_u8(src1);
            uint8x16_t _src1n = vld1q_u8(src1 + 16);
            vst1q_u8(dst1, _src1);
            vst1q_u8(dst1 + 16, _src1n);

            src0 += 32;
            src1 += 32;
            dst0 += 32;
            dst1 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "pld [%2, #256] \n"
                "vld1.u8 {d4-d7}, [%2]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%3]! \n"
                "vst1.u8 {d4-d7}, [%4]! \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(src1),
                  "=r"(dst0),
                  "=r"(dst1)
                : "0"(nn),
                  "1"(src0),
                  "2"(src1),
                  "3"(dst0),
                  "4"(dst1)
                : "cc", "memory", "q0", "q1", "q2", "q3");
        }
#endif
#else
        int remain = srcw;
#endif

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
            *dst1++ = *src1++;
        }

        src0 += srcwgap + srcstride;
        src1 += srcwgap + srcstride;
        dst0 -= wgap + stride;
        dst1 -= wgap + stride;
    }

    for (; y < srch; y++)
    {
#if __ARM_NEON
        int nn = srcw >> 5;
        int remain = srcw - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src = vld1q_u8(src0);
            uint8x16_t _src2 = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src);
            vst1q_u8(dst0 + 16, _src2);

            src0 += 32;
            dst0 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%2]! \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(dst0)
                : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                : "cc", "memory", "q0", "q1");
        }
#endif
#else
        int remain = srcw;
#endif

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
        }

        src0 += srcwgap;
        dst0 -= wgap;
    }
}

static void kanna_rotate_4_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
{
    const int srcwgap = srcstride - srcw * 2;
    const int wgap = stride + w * 2;

    unsigned char* dstend = dst + stride * (h - 1);

    int size = srcw * 2;

    const unsigned char* src0 = src;
    const unsigned char* src1 = src + srcstride;
    unsigned char* dst0 = dstend;
    unsigned char* dst1 = dstend - stride;

    int y = 0;
    for (; y + 1 < srch; y += 2)
    {
#if __ARM_NEON
        int nn = size >> 5;
        int remain = size - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src0 = vld1q_u8(src0);
            uint8x16_t _src0n = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src0);
            vst1q_u8(dst0 + 16, _src0n);

            uint8x16_t _src1 = vld1q_u8(src1);
            uint8x16_t _src1n = vld1q_u8(src1 + 16);
            vst1q_u8(dst1, _src1);
            vst1q_u8(dst1 + 16, _src1n);

            src0 += 32;
            src1 += 32;
            dst0 += 32;
            dst1 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "pld [%2, #256] \n"
                "vld1.u8 {d4-d7}, [%2]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%3]! \n"
                "vst1.u8 {d4-d7}, [%4]! \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(src1),
                  "=r"(dst0),
                  "=r"(dst1)
                : "0"(nn),
                  "1"(src0),
                  "2"(src1),
                  "3"(dst0),
                  "4"(dst1)
                : "cc", "memory", "q0", "q1", "q2", "q3");
        }
#endif
#else
        int remain = size;
#endif

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
            *dst1++ = *src1++;
        }

        src0 += srcwgap + srcstride;
        src1 += srcwgap + srcstride;
        dst0 -= wgap + stride;
        dst1 -= wgap + stride;
    }

    for (; y < srch; y++)
    {
#if __ARM_NEON
        int nn = size >> 5;
        int remain = size - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src = vld1q_u8(src0);
            uint8x16_t _src2 = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src);
            vst1q_u8(dst0 + 16, _src2);

            src0 += 32;
            dst0 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%2]! \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(dst0)
                : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                : "cc", "memory", "q0", "q1");
        }
#endif
#else
        int remain = size;
#endif

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
        }

        src0 += srcwgap;
        dst0 -= wgap;
    }
}

static void kanna_rotate_4_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
{
    const int srcwgap = srcstride - srcw * 3;
    const int wgap = stride + w * 3;

    unsigned char* dstend = dst + stride * (h - 1);

    int size = srcw * 3;

    const unsigned char* src0 = src;
    const unsigned char* src1 = src + srcstride;
    unsigned char* dst0 = dstend;
    unsigned char* dst1 = dstend - stride;

    int y = 0;
    for (; y + 1 < srch; y += 2)
    {
#if __ARM_NEON
        int nn = size >> 5;
        int remain = size - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src0 = vld1q_u8(src0);
            uint8x16_t _src0n = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src0);
            vst1q_u8(dst0 + 16, _src0n);

            uint8x16_t _src1 = vld1q_u8(src1);
            uint8x16_t _src1n = vld1q_u8(src1 + 16);
            vst1q_u8(dst1, _src1);
            vst1q_u8(dst1 + 16, _src1n);

            src0 += 32;
            src1 += 32;
            dst0 += 32;
            dst1 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "pld [%2, #256] \n"
                "vld1.u8 {d4-d7}, [%2]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%3]! \n"
                "vst1.u8 {d4-d7}, [%4]! \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(src1),
                  "=r"(dst0),
                  "=r"(dst1)
                : "0"(nn),
                  "1"(src0),
                  "2"(src1),
                  "3"(dst0),
                  "4"(dst1)
                : "cc", "memory", "q0", "q1", "q2", "q3");
        }
#endif
#else
        int remain = size;
#endif

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
            *dst1++ = *src1++;
        }

        src0 += srcwgap + srcstride;
        src1 += srcwgap + srcstride;
        dst0 -= wgap + stride;
        dst1 -= wgap + stride;
    }

    for (; y < srch; y++)
    {
#if __ARM_NEON
        int nn = size >> 5;
        int remain = size - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src = vld1q_u8(src0);
            uint8x16_t _src2 = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src);
            vst1q_u8(dst0 + 16, _src2);

            src0 += 32;
            dst0 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%2]! \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(dst0)
                : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                : "cc", "memory", "q0", "q1");
        }
#endif
#else
        int remain = size;
#endif

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
        }

        src0 += srcwgap;
        dst0 -= wgap;
    }
}

static void kanna_rotate_4_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
{
    const int srcwgap = srcstride - srcw * 4;
    const int wgap = stride + w * 4;

    unsigned char* dstend = dst + stride * (h - 1);

    int size = srcw * 4;

    const unsigned char* src0 = src;
    const unsigned char* src1 = src + srcstride;
    unsigned char* dst0 = dstend;
    unsigned char* dst1 = dstend - stride;

    int y = 0;
    for (; y + 1 < srch; y += 2)
    {
#if __ARM_NEON
        int nn = size >> 5;
        int remain = size - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src0 = vld1q_u8(src0);
            uint8x16_t _src0n = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src0);
            vst1q_u8(dst0 + 16, _src0n);

            uint8x16_t _src1 = vld1q_u8(src1);
            uint8x16_t _src1n = vld1q_u8(src1 + 16);
            vst1q_u8(dst1, _src1);
            vst1q_u8(dst1 + 16, _src1n);

            src0 += 32;
            src1 += 32;
            dst0 += 32;
            dst1 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "pld [%2, #256] \n"
                "vld1.u8 {d4-d7}, [%2]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%3]! \n"
                "vst1.u8 {d4-d7}, [%4]! \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(src1),
                  "=r"(dst0),
                  "=r"(dst1)
                : "0"(nn),
                  "1"(src0),
                  "2"(src1),
                  "3"(dst0),
                  "4"(dst1)
                : "cc", "memory", "q0", "q1", "q2", "q3");
        }
#endif
#else
        int remain = size;
#endif

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
            *dst1++ = *src1++;
        }

        src0 += srcwgap + srcstride;
        src1 += srcwgap + srcstride;
        dst0 -= wgap + stride;
        dst1 -= wgap + stride;
    }

    for (; y < srch; y++)
    {
#if __ARM_NEON
        int nn = size >> 5;
        int remain = size - (nn << 5);
#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x16_t _src = vld1q_u8(src0);
            uint8x16_t _src2 = vld1q_u8(src0 + 16);
            vst1q_u8(dst0, _src);
            vst1q_u8(dst0 + 16, _src2);

            src0 += 32;
            dst0 += 32;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #256] \n"
                "vld1.u8 {d0-d3}, [%1]! \n"
                "subs %0, #1 \n"
                "vst1.u8 {d0-d3}, [%2]! \n"
                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(dst0)
                : "0"(nn),
                  "1"(src0),
                  "2"(dst0)
                : "cc", "memory", "q0", "q1");
        }
#endif
#else
        int remain = size;
#endif

        for (; remain > 0; remain--)
        {
            *dst0++ = *src0++;
        }

        src0 += srcwgap;
        dst0 -= wgap;
    }
}

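// Type 5 transposes the image: output column y receives source row y. The
// NEON path processes an 8x8 pixel tile at a time, loading from eight
// consecutive source rows, transposing in registers, and storing eight
// 8-pixel runs, which keeps both the loads and the stores mostly sequential.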
static void kanna_rotate_5_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
{
    const int srcwgap = srcstride - srcw;

    const unsigned char* src0 = src;

    int y = 0;
#if __ARM_NEON
    for (; y + 7 < srch; y += 8)
    {
        const unsigned char* src1 = src0 + srcstride;

        unsigned char* dst0 = dst + y;
        unsigned char* dst1 = dst + y + stride;

        int src_step = 2 * srcstride;
        int dst_step = 2 * stride;

        int nn = srcw >> 3;
        int remain = srcw - (nn << 3);

#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x8_t _src0 = vld1_u8(src0);
            uint8x8_t _src1 = vld1_u8(src1);

            uint8x8_t _src2 = vld1_u8(src0 + src_step);
            uint8x8_t _src3 = vld1_u8(src1 + src_step);

            uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
            uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);

            uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
            uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);

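            // 8x8 byte transpose in registers: vtrn_u8 swaps bytes between
            // row pairs, then vtrn_u16/vtrn_u32 on reinterpreted views swap
            // 2-byte and 4-byte blocks, completing the transpose.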
            uint8x8x2_t _src01t_r = vtrn_u8(_src0, _src1);
            uint8x8x2_t _src23t_r = vtrn_u8(_src2, _src3);
            uint8x8x2_t _src45t_r = vtrn_u8(_src4, _src5);
            uint8x8x2_t _src67t_r = vtrn_u8(_src6, _src7);

            uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
            uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
            uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
            uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));

            uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
            uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
            uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
            uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));

            uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
            uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
            uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
            uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
            uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
            uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
            uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
            uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[1]);

            vst1_u8(dst0, _dst0);
            vst1_u8(dst1, _dst1);
            vst1_u8(dst0 + dst_step, _dst2);
            vst1_u8(dst1 + dst_step, _dst3);
            vst1_u8(dst0 + 2 * dst_step, _dst4);
            vst1_u8(dst1 + 2 * dst_step, _dst5);
            vst1_u8(dst0 + 3 * dst_step, _dst6);
            vst1_u8(dst1 + 3 * dst_step, _dst7);

            src0 += 8;
            src1 += 8;

            dst0 += 4 * dst_step;
            dst1 += 4 * dst_step;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #64] \n"
                "vld1.u8 {d0}, [%1], %10 \n"

                "pld [%2, #64] \n"
                "vld1.u8 {d1}, [%2], %10 \n"

                "pld [%1, #64] \n"
                "vld1.u8 {d2}, [%1], %10 \n"

                "vtrn.u8 d0, d1 \n"

                "pld [%2, #64] \n"
                "vld1.u8 {d3}, [%2], %10 \n"

                "pld [%1, #64] \n"
                "vld1.u8 {d4}, [%1], %10 \n"

                "vtrn.u8 d2, d3 \n"

                "pld [%2, #64] \n"
                "vld1.u8 {d5}, [%2], %10 \n"

                "pld [%1, #64] \n"
                "vld1.u8 {d6}, [%1], %10 \n"

                "vtrn.u8 d4, d5 \n"

                "pld [%2, #64] \n"
                "vld1.u8 {d7}, [%2], %10 \n"

                "vtrn.u8 d6, d7 \n"

                "sub %1, %1, %10, lsl #2 \n"

                "vtrn.u16 q0, q1 \n"

                "sub %2, %2, %10, lsl #2 \n"

                "vtrn.u16 q2, q3 \n"

                "add %1, #8 \n"

                "vtrn.u32 q0, q2 \n"

                "add %2, #8 \n"

                "vtrn.u32 q1, q3 \n"
                "vst1.u8 {d0}, [%3], %11 \n"
                "vst1.u8 {d1}, [%4], %11 \n"

                "subs %0, #1 \n"

                "vst1.u8 {d2}, [%3], %11 \n"
                "vst1.u8 {d3}, [%4], %11 \n"
                "vst1.u8 {d4}, [%3], %11 \n"
                "vst1.u8 {d5}, [%4], %11 \n"
                "vst1.u8 {d6}, [%3], %11 \n"
                "vst1.u8 {d7}, [%4], %11 \n"

                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(src1),
                  "=r"(dst0),
                  "=r"(dst1)
                : "0"(nn),
                  "1"(src0),
                  "2"(src1),
                  "3"(dst0),
                  "4"(dst1),
                  "r"(src_step),
                  "r"(dst_step)
                : "cc", "memory", "q0", "q1", "q2", "q3");
        }
#endif
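        // scalar tail: each iteration gathers one source column (the same
        // byte offset in eight consecutive rows) and writes it out as eight
        // consecutive bytes of one transposed output row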
        for (; remain > 0; remain--)
        {
            dst0[0] = src0[0];
            dst0[1] = src1[0];
            dst0[2] = src0[0 + src_step];
            dst0[3] = src1[0 + src_step];
            dst0[4] = src0[0 + 2 * src_step];
            dst0[5] = src1[0 + 2 * src_step];
            dst0[6] = src0[0 + 3 * src_step];
            dst0[7] = src1[0 + 3 * src_step];

            src0 += 1;
            src1 += 1;

            dst0 += stride;
        }

        src0 += srcwgap + 7 * srcstride;
    }
#endif
    for (; y < srch; y++)
    {
        unsigned char* dst0 = dst + y;

        int x = 0;
        for (; x < srcw; x++)
        {
            *dst0 = *src0;

            src0 += 1;
            dst0 += stride;
        }

        src0 += srcwgap;
    }
}

static void kanna_rotate_5_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
{
    const int srcwgap = srcstride - srcw * 2;

    const unsigned char* src0 = src;

    int y = 0;
#if __ARM_NEON
    for (; y + 7 < srch; y += 8)
    {
        const unsigned char* src1 = src0 + srcstride;

        unsigned char* dst0 = dst + y * 2;
        unsigned char* dst1 = dst + y * 2 + stride;

        int src_step = 2 * srcstride;
        int dst_step = 2 * stride;

        int nn = srcw >> 3;
        int remain = srcw - (nn << 3);

#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x8x2_t _src0 = vld2_u8(src0);
            uint8x8x2_t _src1 = vld2_u8(src1);

            uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
            uint8x8x2_t _src3 = vld2_u8(src1 + src_step);

            uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
            uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);

            uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
            uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);

            uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
            uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
            uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
            uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);

            uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
            uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
            uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
            uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);

            uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
            uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
            uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
            uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));

            uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
            uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
            uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
            uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));

            uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
            uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
            uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
            uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));

            uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
            uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
            uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
            uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));

            uint8x8x2_t _dst0;
            uint8x8x2_t _dst1;
            uint8x8x2_t _dst2;
            uint8x8x2_t _dst3;
            uint8x8x2_t _dst4;
            uint8x8x2_t _dst5;
            uint8x8x2_t _dst6;
            uint8x8x2_t _dst7;

            _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
            _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
            _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
            _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
            _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
            _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
            _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
            _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);

            _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
            _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
            _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
            _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
            _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
            _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
            _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
            _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);

            vst2_u8(dst0, _dst0);
            vst2_u8(dst1, _dst1);
            vst2_u8(dst0 + dst_step, _dst2);
            vst2_u8(dst1 + dst_step, _dst3);
            vst2_u8(dst0 + 2 * dst_step, _dst4);
            vst2_u8(dst1 + 2 * dst_step, _dst5);
            vst2_u8(dst0 + 3 * dst_step, _dst6);
            vst2_u8(dst1 + 3 * dst_step, _dst7);

            src0 += 2 * 8;
            src1 += 2 * 8;

            dst0 += 4 * dst_step;
            dst1 += 4 * dst_step;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #128] \n"
                "vld2.u8 {d0-d1}, [%1], %10 \n"

                "pld [%2, #128] \n"
                "vld2.u8 {d2-d3}, [%2], %10 \n"

                "pld [%1, #128] \n"
                "vld2.u8 {d4-d5}, [%1], %10 \n"

                "vtrn.u8 q0, q1 \n"

                "pld [%2, #128] \n"
                "vld2.u8 {d6-d7}, [%2], %10 \n"

                "pld [%1, #128] \n"
                "vld2.u8 {d16-d17}, [%1], %10\n"

                "vtrn.u8 q2, q3 \n"

                "pld [%2, #128] \n"
                "vld2.u8 {d18-d19}, [%2], %10\n"

                "pld [%1, #128] \n"
                "vld2.u8 {d20-d21}, [%1], %10\n"

                "vtrn.u8 q8, q9 \n"

                "pld [%2, #128] \n"
                "vld2.u8 {d22-d23}, [%2], %10\n"

                "vtrn.u8 q10, q11 \n"

                "sub %1, %1, %10, lsl #2 \n"

                "vtrn.u16 q0, q2 \n"

                "sub %2, %2, %10, lsl #2 \n"

                "vtrn.u16 q1, q3 \n"

                "add %1, #16 \n"

                "vtrn.u16 q8, q10 \n"

                "add %2, #16 \n"

                "vtrn.u16 q9, q11 \n"

                "vtrn.u32 q0, q8 \n"

                "vtrn.u32 q1, q9 \n"
                "vst2.u8 {d0-d1}, [%3], %11 \n"

                "vtrn.u32 q2, q10 \n"
                "vst2.u8 {d2-d3}, [%4], %11 \n"

                "vtrn.u32 q3, q11 \n"
                "vst2.u8 {d4-d5}, [%3], %11 \n"

                "subs %0, #1 \n"

                "vst2.u8 {d6-d7}, [%4], %11 \n"
                "vst2.u8 {d16-d17}, [%3], %11\n"
                "vst2.u8 {d18-d19}, [%4], %11\n"
                "vst2.u8 {d20-d21}, [%3], %11\n"
                "vst2.u8 {d22-d23}, [%4], %11\n"

                "bne 0b \n"
                : "=r"(nn),
                  "=r"(src0),
                  "=r"(src1),
                  "=r"(dst0),
                  "=r"(dst1)
                : "0"(nn),
                  "1"(src0),
                  "2"(src1),
                  "3"(dst0),
                  "4"(dst1),
                  "r"(src_step),
                  "r"(dst_step)
                : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
        }
#endif
        for (; remain > 0; remain--)
        {
            dst0[0] = src0[0];
            dst0[1] = src0[1];
            dst0[2] = src1[0];
            dst0[3] = src1[1];
            dst0[4] = src0[0 + src_step];
            dst0[5] = src0[1 + src_step];
            dst0[6] = src1[0 + src_step];
            dst0[7] = src1[1 + src_step];
            dst0[8] = src0[0 + 2 * src_step];
            dst0[9] = src0[1 + 2 * src_step];
            dst0[10] = src1[0 + 2 * src_step];
            dst0[11] = src1[1 + 2 * src_step];
            dst0[12] = src0[0 + 3 * src_step];
            dst0[13] = src0[1 + 3 * src_step];
            dst0[14] = src1[0 + 3 * src_step];
            dst0[15] = src1[1 + 3 * src_step];

            src0 += 2;
            src1 += 2;

            dst0 += stride;
        }

        src0 += srcwgap + 7 * srcstride;
    }
#endif
    for (; y < srch; y++)
    {
        unsigned char* dst0 = dst + y * 2;

        int x = 0;
        for (; x < srcw; x++)
        {
            dst0[0] = src0[0];
            dst0[1] = src0[1];

            src0 += 2;
            dst0 += stride;
        }

        src0 += srcwgap;
    }
}

static void kanna_rotate_5_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
{
    const int srcwgap = srcstride - srcw * 3;

    const unsigned char* src0 = src;

    int y = 0;
#if __ARM_NEON
    for (; y + 7 < srch; y += 8)
    {
        const unsigned char* src1 = src0 + srcstride;

        unsigned char* dst0 = dst + y * 3;
        unsigned char* dst1 = dst + y * 3 + stride;

        int src_step = 2 * srcstride;
        int dst_step = 2 * stride;

        int nn = srcw >> 3;
        int remain = srcw - (nn << 3);

#if __aarch64__
        for (; nn > 0; nn--)
        {
            uint8x8x3_t _src0 = vld3_u8(src0);
            uint8x8x3_t _src1 = vld3_u8(src1);

            uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
            uint8x8x3_t _src3 = vld3_u8(src1 + src_step);

            uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
            uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);

            uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
            uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);

            uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
            uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
            uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
            uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);

            uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
            uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
            uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
            uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);

            uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
            uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
            uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
            uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);

            uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
            uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
            uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
            uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));

            uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
            uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
            uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
            uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));

            uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
            uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
            uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
            uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));

            uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
            uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
            uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
            uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));

            uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
            uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
            uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
            uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));

            uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
            uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
            uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
            uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));

            uint8x8x3_t _dst0;
            uint8x8x3_t _dst1;
            uint8x8x3_t _dst2;
            uint8x8x3_t _dst3;
            uint8x8x3_t _dst4;
            uint8x8x3_t _dst5;
            uint8x8x3_t _dst6;
            uint8x8x3_t _dst7;

            _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
            _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
            _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
            _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
            _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
            _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
            _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
            _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);

            _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
            _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
            _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
            _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
            _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
            _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
            _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
            _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);

            _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
            _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
            _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
            _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
            _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
            _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
            _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
            _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);

            vst3_u8(dst0, _dst0);
            vst3_u8(dst1, _dst1);
            vst3_u8(dst0 + dst_step, _dst2);
            vst3_u8(dst1 + dst_step, _dst3);
            vst3_u8(dst0 + 2 * dst_step, _dst4);
            vst3_u8(dst1 + 2 * dst_step, _dst5);
            vst3_u8(dst0 + 3 * dst_step, _dst6);
            vst3_u8(dst1 + 3 * dst_step, _dst7);

            src0 += 3 * 8;
            src1 += 3 * 8;

            dst0 += 4 * dst_step;
            dst1 += 4 * dst_step;
        }
#else
        if (nn > 0)
        {
            asm volatile(
                "0: \n"
                "pld [%1, #192] \n"
                "vld3.u8 {d0-d2}, [%1], %10 \n"

                "pld [%2, #192] \n"
                "vld3.u8 {d4-d6}, [%2], %10 \n"

                "pld [%1, #192] \n"
                "vld3.u8 {d8-d10}, [%1], %10 \n"

                "vtrn.u8 q0, q2 \n"
                "vtrn.u8 d2, d6 \n"

                "pld [%2, #192] \n"
                "vld3.u8 {d12-d14}, [%2], %10\n"

                "pld [%1, #192] \n"
                "vld3.u8 {d16-d18}, [%1], %10\n"

                "vtrn.u8 q4, q6 \n"
                "vtrn.u8 d10, d14 \n"

                "pld [%2, #192] \n"
                "vld3.u8 {d20-d22}, [%2], %10\n"

                "pld [%1, #192] \n"
                "vld3.u8 {d24-d26}, [%1], %10\n"

                "vtrn.u8 q8, q10 \n"
                "vtrn.u8 d18, d22 \n"

                "pld [%2, #192] \n"
                "vld3.u8 {d28-d30}, [%2], %10\n"

                "vtrn.u8 q12, q14 \n"
                "vtrn.u8 d26, d30 \n"

                "sub %1, %1, %10, lsl #2 \n"
| | |
| | "vtrn.u16 q0, q4 \n" |
| | "vtrn.u16 d2, d10 \n" |
| | |
| | "sub %2, %2, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q2, q6 \n" |
| | "vtrn.u16 d6, d14 \n" |
| | |
| | "add %1, #24 \n" |
| | |
| | "vtrn.u16 q8, q12 \n" |
| | "vtrn.u16 d18, d26 \n" |
| | |
| | "add %2, #24 \n" |
| | |
| | "vtrn.u16 q10, q14 \n" |
| | "vtrn.u16 d22, d30 \n" |
| | |
| | "vtrn.u32 q0, q8 \n" |
| | "vtrn.u32 d2, d18 \n" |
| | |
| | "vtrn.u32 q2, q10 \n" |
| | "vst3.u8 {d0-d2}, [%3], %11 \n" |
| | "vtrn.u32 d6, d22 \n" |
| | |
| | "vtrn.u32 q4, q12 \n" |
| | "vst3.u8 {d4-d6}, [%4], %11 \n" |
| | "vtrn.u32 d10, d26 \n" |
| | |
| | "vtrn.u32 q6, q14 \n" |
| | "vst3.u8 {d8-d10}, [%3], %11 \n" |
| | "vtrn.u32 d14, d30 \n" |
| | |
| | "subs %0, #1 \n" |
| | |
| | "vst3.u8 {d16-d18}, [%3], %11\n" |
| | "vst3.u8 {d12-d14}, [%4], %11\n" |
| | "vst3.u8 {d20-d22}, [%4], %11\n" |
| | "vst3.u8 {d24-d26}, [%3], %11\n" |
| | "vst3.u8 {d28-d30}, [%4], %11\n" |
| | |
| | "bne 0b \n" |
| | : "=r"(nn), |
| | "=r"(src0), |
| | "=r"(src1), |
| | "=r"(dst0), |
| | "=r"(dst1) |
| | : "0"(nn), |
| | "1"(src0), |
| | "2"(src1), |
| | "3"(dst0), |
| | "4"(dst1), |
| | "r"(src_step), |
| | "r"(dst_step) |
| | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); |
| | } |
| | #endif |
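| | // Scalar tail for the leftover (srcw % 8) columns: each iteration gathers |
| | // one source column across the 8 rows and writes it out as a single |
| | // 24-byte (8 pixels x 3 channels) destination row. |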
| | for (; remain > 0; remain--) |
| | { |
| | dst0[0] = src0[0]; |
| | dst0[1] = src0[1]; |
| | dst0[2] = src0[2]; |
| | dst0[3] = src1[0]; |
| | dst0[4] = src1[1]; |
| | dst0[5] = src1[2]; |
| | dst0[6] = src0[0 + src_step]; |
| | dst0[7] = src0[1 + src_step]; |
| | dst0[8] = src0[2 + src_step]; |
| | dst0[9] = src1[0 + src_step]; |
| | dst0[10] = src1[1 + src_step]; |
| | dst0[11] = src1[2 + src_step]; |
| | dst0[12] = src0[0 + 2 * src_step]; |
| | dst0[13] = src0[1 + 2 * src_step]; |
| | dst0[14] = src0[2 + 2 * src_step]; |
| | dst0[15] = src1[0 + 2 * src_step]; |
| | dst0[16] = src1[1 + 2 * src_step]; |
| | dst0[17] = src1[2 + 2 * src_step]; |
| | dst0[18] = src0[0 + 3 * src_step]; |
| | dst0[19] = src0[1 + 3 * src_step]; |
| | dst0[20] = src0[2 + 3 * src_step]; |
| | dst0[21] = src1[0 + 3 * src_step]; |
| | dst0[22] = src1[1 + 3 * src_step]; |
| | dst0[23] = src1[2 + 3 * src_step]; |
| |
|
| | src0 += 3; |
| | src1 += 3; |
| |
|
| | dst0 += stride; |
| | } |
| |
|
| | src0 += srcwgap + 7 * srcstride; |
| | } |
| | #endif |
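| | // Remaining rows, and the whole image when NEON is not available. |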
| | for (; y < srch; y++) |
| | { |
| | unsigned char* dst0 = dst + y * 3; |
| |
|
| | int x = 0; |
| | for (; x < srcw; x++) |
| | { |
| | dst0[0] = src0[0]; |
| | dst0[1] = src0[1]; |
| | dst0[2] = src0[2]; |
| |
|
| | src0 += 3; |
| | dst0 += stride; |
| | } |
| |
|
| | src0 += srcwgap; |
| | } |
| | } |
| |
|
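| | // 4-channel (e.g. RGBA) variant of the type-5 rotate. The addressing |
| | // (dst row index = src x, dst column = src y) makes type 5 a plain |
| | // transpose, which lines up with EXIF orientation 5. |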
| | static void kanna_rotate_5_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride) |
| | { |
| | const int srcwgap = srcstride - srcw * 4; |
| |
|
| | const unsigned char* src0 = src; |
| |
|
| | int y = 0; |
| | #if __ARM_NEON |
| | for (; y + 7 < srch; y += 8) |
| | { |
| | const unsigned char* src1 = src0 + srcstride; |
| |
|
| | unsigned char* dst0 = dst + y * 4; |
| | unsigned char* dst1 = dst + y * 4 + stride; |
| |
|
| | int src_step = 2 * srcstride; |
| | int dst_step = 2 * stride; |
| |
|
| | int nn = srcw >> 3; |
| | int remain = srcw - (nn << 3); |
| |
|
| | #if __aarch64__ |
| | for (; nn > 0; nn--) |
| | { |
| | uint8x8x4_t _src0 = vld4_u8(src0); |
| | uint8x8x4_t _src1 = vld4_u8(src1); |
| |
|
| | uint8x8x4_t _src2 = vld4_u8(src0 + src_step); |
| | uint8x8x4_t _src3 = vld4_u8(src1 + src_step); |
| |
|
| | uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step); |
| | uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step); |
| |
|
| | uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step); |
| | uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step); |
| |
|
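| | // Same three-stage transpose as the c3 path, extended with a fourth set |
| | // of vtrn chains for the alpha lane (val[3]). |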
| | uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]); |
| | uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]); |
| | uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]); |
| | uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]); |
| |
|
| | uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]); |
| | uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]); |
| | uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]); |
| | uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]); |
| |
|
| | uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]); |
| | uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]); |
| | uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]); |
| | uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]); |
| |
|
| | uint8x8x2_t _src01t_a = vtrn_u8(_src0.val[3], _src1.val[3]); |
| | uint8x8x2_t _src23t_a = vtrn_u8(_src2.val[3], _src3.val[3]); |
| | uint8x8x2_t _src45t_a = vtrn_u8(_src4.val[3], _src5.val[3]); |
| | uint8x8x2_t _src67t_a = vtrn_u8(_src6.val[3], _src7.val[3]); |
| |
|
| | uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); |
| | uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); |
| | uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); |
| | uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); |
| |
|
| | uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0])); |
| | uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1])); |
| | uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0])); |
| | uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1])); |
| |
|
| | uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0])); |
| | uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1])); |
| | uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0])); |
| | uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1])); |
| |
|
| | uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[0]), vreinterpret_u16_u8(_src23t_a.val[0])); |
| | uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[1]), vreinterpret_u16_u8(_src23t_a.val[1])); |
| | uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[0]), vreinterpret_u16_u8(_src67t_a.val[0])); |
| | uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[1]), vreinterpret_u16_u8(_src67t_a.val[1])); |
| |
|
| | uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); |
| | uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); |
| | uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); |
| | uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); |
| |
|
| | uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0])); |
| | uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0])); |
| | uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1])); |
| | uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1])); |
| |
|
| | uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0])); |
| | uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0])); |
| | uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1])); |
| | uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1])); |
| |
|
| | uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[0]), vreinterpret_u32_u16(_src46tt_a.val[0])); |
| | uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[0]), vreinterpret_u32_u16(_src57tt_a.val[0])); |
| | uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[1]), vreinterpret_u32_u16(_src46tt_a.val[1])); |
| | uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[1]), vreinterpret_u32_u16(_src57tt_a.val[1])); |
| |
|
| | uint8x8x4_t _dst0; |
| | uint8x8x4_t _dst1; |
| | uint8x8x4_t _dst2; |
| | uint8x8x4_t _dst3; |
| | uint8x8x4_t _dst4; |
| | uint8x8x4_t _dst5; |
| | uint8x8x4_t _dst6; |
| | uint8x8x4_t _dst7; |
| |
|
| | _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); |
| | _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); |
| | _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); |
| | _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); |
| | _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); |
| | _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); |
| | _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); |
| | _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); |
| |
|
| | _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); |
| | _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); |
| | _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); |
| | _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); |
| | _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); |
| | _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); |
| | _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); |
| | _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); |
| |
|
| | _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); |
| | _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); |
| | _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); |
| | _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); |
| | _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); |
| | _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); |
| | _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); |
| | _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); |
| |
|
| | _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]); |
| | _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]); |
| | _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]); |
| | _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]); |
| | _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]); |
| | _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]); |
| | _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]); |
| | _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]); |
| |
|
| | vst4_u8(dst0, _dst0); |
| | vst4_u8(dst1, _dst1); |
| | vst4_u8(dst0 + dst_step, _dst2); |
| | vst4_u8(dst1 + dst_step, _dst3); |
| | vst4_u8(dst0 + 2 * dst_step, _dst4); |
| | vst4_u8(dst1 + 2 * dst_step, _dst5); |
| | vst4_u8(dst0 + 3 * dst_step, _dst6); |
| | vst4_u8(dst1 + 3 * dst_step, _dst7); |
| |
|
| | src0 += 4 * 8; |
| | src1 += 4 * 8; |
| |
|
| | dst0 += 4 * dst_step; |
| | dst1 += 4 * dst_step; |
| | } |
| | #else |
| | if (nn > 0) |
| | { |
| | asm volatile( |
| | "0: \n" |
| | "pld [%1, #256] \n" |
| | "vld4.u8 {d0-d3}, [%1], %10 \n" |
| | |
| | "pld [%2, #256] \n" |
| | "vld4.u8 {d4-d7}, [%2], %10 \n" |
| | |
| | "pld [%1, #256] \n" |
| | "vld4.u8 {d8-d11}, [%1], %10 \n" |
| | |
| | "vtrn.u8 q0, q2 \n" |
| | "vtrn.u8 q1, q3 \n" |
| | |
| | "pld [%2, #256] \n" |
| | "vld4.u8 {d12-d15}, [%2], %10\n" |
| | |
| | "pld [%1, #256] \n" |
| | "vld4.u8 {d16-d19}, [%1], %10\n" |
| | |
| | "vtrn.u8 q4, q6 \n" |
| | "vtrn.u8 q5, q7 \n" |
| | |
| | "pld [%2, #256] \n" |
| | "vld4.u8 {d20-d23}, [%2], %10\n" |
| | |
| | "pld [%1, #256] \n" |
| | "vld4.u8 {d24-d27}, [%1], %10\n" |
| | |
| | "vtrn.u8 q8, q10 \n" |
| | "vtrn.u8 q9, q11 \n" |
| | |
| | "pld [%2, #256] \n" |
| | "vld4.u8 {d28-d31}, [%2], %10\n" |
| | |
| | "vtrn.u8 q12, q14 \n" |
| | "vtrn.u8 q13, q15 \n" |
| | |
| | "sub %1, %1, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q0, q4 \n" |
| | "vtrn.u16 q1, q5 \n" |
| | |
| | "sub %2, %2, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q2, q6 \n" |
| | "vtrn.u16 q3, q7 \n" |
| | |
| | "add %1, #32 \n" |
| | |
| | "vtrn.u16 q8, q12 \n" |
| | "vtrn.u16 q9, q13 \n" |
| | |
| | "add %2, #32 \n" |
| | |
| | "vtrn.u16 q10, q14 \n" |
| | "vtrn.u16 q11, q15 \n" |
| | |
| | "vtrn.u32 q0, q8 \n" |
| | "vtrn.u32 q1, q9 \n" |
| | |
| | "vtrn.u32 q2, q10 \n" |
| | "vst4.u8 {d0-d3}, [%3], %11 \n" |
| | "vtrn.u32 q3, q11 \n" |
| | |
| | "vtrn.u32 q4, q12 \n" |
| | "vst4.u8 {d4-d7}, [%4], %11 \n" |
| | "vtrn.u32 q5, q13 \n" |
| | |
| | "vtrn.u32 q6, q14 \n" |
| | "vst4.u8 {d8-d11}, [%3], %11 \n" |
| | "vtrn.u32 q7, q15 \n" |
| | |
| | "subs %0, #1 \n" |
| | |
| | "vst4.u8 {d16-d19}, [%3], %11\n" |
| | "vst4.u8 {d12-d15}, [%4], %11\n" |
| | "vst4.u8 {d20-d23}, [%4], %11\n" |
| | "vst4.u8 {d24-d27}, [%3], %11\n" |
| | "vst4.u8 {d28-d31}, [%4], %11\n" |
| | |
| | "bne 0b \n" |
| | : "=r"(nn), |
| | "=r"(src0), |
| | "=r"(src1), |
| | "=r"(dst0), |
| | "=r"(dst1) |
| | : "0"(nn), |
| | "1"(src0), |
| | "2"(src1), |
| | "3"(dst0), |
| | "4"(dst1), |
| | "r"(src_step), |
| | "r"(dst_step) |
| | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); |
| | } |
| | #endif |
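| | // Same tail pattern as the c3 variant: 32 bytes (8 pixels x 4 channels) |
| | // per leftover column. |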
| | for (; remain > 0; remain--) |
| | { |
| | dst0[0] = src0[0]; |
| | dst0[1] = src0[1]; |
| | dst0[2] = src0[2]; |
| | dst0[3] = src0[3]; |
| | dst0[4] = src1[0]; |
| | dst0[5] = src1[1]; |
| | dst0[6] = src1[2]; |
| | dst0[7] = src1[3]; |
| | dst0[8] = src0[0 + src_step]; |
| | dst0[9] = src0[1 + src_step]; |
| | dst0[10] = src0[2 + src_step]; |
| | dst0[11] = src0[3 + src_step]; |
| | dst0[12] = src1[0 + src_step]; |
| | dst0[13] = src1[1 + src_step]; |
| | dst0[14] = src1[2 + src_step]; |
| | dst0[15] = src1[3 + src_step]; |
| | dst0[16] = src0[0 + 2 * src_step]; |
| | dst0[17] = src0[1 + 2 * src_step]; |
| | dst0[18] = src0[2 + 2 * src_step]; |
| | dst0[19] = src0[3 + 2 * src_step]; |
| | dst0[20] = src1[0 + 2 * src_step]; |
| | dst0[21] = src1[1 + 2 * src_step]; |
| | dst0[22] = src1[2 + 2 * src_step]; |
| | dst0[23] = src1[3 + 2 * src_step]; |
| | dst0[24] = src0[0 + 3 * src_step]; |
| | dst0[25] = src0[1 + 3 * src_step]; |
| | dst0[26] = src0[2 + 3 * src_step]; |
| | dst0[27] = src0[3 + 3 * src_step]; |
| | dst0[28] = src1[0 + 3 * src_step]; |
| | dst0[29] = src1[1 + 3 * src_step]; |
| | dst0[30] = src1[2 + 3 * src_step]; |
| | dst0[31] = src1[3 + 3 * src_step]; |
| |
|
| | src0 += 4; |
| | src1 += 4; |
| |
|
| | dst0 += stride; |
| | } |
| |
|
| | src0 += srcwgap + 7 * srcstride; |
| | } |
| | #endif |
| | for (; y < srch; y++) |
| | { |
| | unsigned char* dst0 = dst + y * 4; |
| |
|
| | int x = 0; |
| | for (; x < srcw; x++) |
| | { |
| | dst0[0] = src0[0]; |
| | dst0[1] = src0[1]; |
| | dst0[2] = src0[2]; |
| | dst0[3] = src0[3]; |
| |
|
| | src0 += 4; |
| | dst0 += stride; |
| | } |
| |
|
| | src0 += srcwgap; |
| | } |
| | } |
| |
|
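| | // Type-6 rotate for 1-channel data. The addressing below (dst column = |
| | // w-1-y, dst row = x) is a 90-degree clockwise rotation, which lines up |
| | // with EXIF orientation 6. |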
| | static void kanna_rotate_6_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) |
| | { |
| | const int srcwgap = srcstride - srcw; |
| |
|
| | // dstend points one past the last pixel of the first dst row |
| | unsigned char* dstend = dst + w; |
| |
|
| | const unsigned char* src0 = src; |
| |
|
| | int y = 0; |
| | #if __ARM_NEON |
| | for (; y + 7 < srch; y += 8) |
| | { |
| | const unsigned char* src1 = src0 + srcstride; |
| |
|
| | unsigned char* dst0 = dstend - y - 8; |
| | unsigned char* dst1 = dstend - y - 8 + stride; |
| |
|
| | int src_step = 2 * srcstride; |
| | int dst_step = 2 * stride; |
| |
|
| | int nn = srcw >> 3; |
| | int remain = srcw - (nn << 3); |
| |
|
| | #if __aarch64__ |
| | for (; nn > 0; nn--) |
| | { |
| | uint8x8_t _src0 = vld1_u8(src0); |
| | uint8x8_t _src1 = vld1_u8(src1); |
| |
|
| | uint8x8_t _src2 = vld1_u8(src0 + src_step); |
| | uint8x8_t _src3 = vld1_u8(src1 + src_step); |
| |
|
| | uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step); |
| | uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step); |
| |
|
| | uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step); |
| | uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step); |
| |
|
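| | // Note the swapped vtrn operands (row 1 before row 0) compared with the |
| | // type-5 path: together with the reversed store order below, this flips |
| | // the block's column order and turns the transpose into a rotation. |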
| | uint8x8x2_t _src01t_r = vtrn_u8(_src1, _src0); |
| | uint8x8x2_t _src23t_r = vtrn_u8(_src3, _src2); |
| | uint8x8x2_t _src45t_r = vtrn_u8(_src5, _src4); |
| | uint8x8x2_t _src67t_r = vtrn_u8(_src7, _src6); |
| |
|
| | uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); |
| | uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); |
| | uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); |
| | uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); |
| | uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); |
| | uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); |
| | uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); |
| |
|
| | uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[1]); |
| | uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[1]); |
| | uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[1]); |
| | uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[1]); |
| | uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[0]); |
| | uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[0]); |
| | uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[0]); |
| | uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[0]); |
| |
|
| | vst1_u8(dst0, _dst7); |
| | vst1_u8(dst1, _dst6); |
| | vst1_u8(dst0 + dst_step, _dst5); |
| | vst1_u8(dst1 + dst_step, _dst4); |
| | vst1_u8(dst0 + 2 * dst_step, _dst3); |
| | vst1_u8(dst1 + 2 * dst_step, _dst2); |
| | vst1_u8(dst0 + 3 * dst_step, _dst1); |
| | vst1_u8(dst1 + 3 * dst_step, _dst0); |
| |
|
| | src0 += 8; |
| | src1 += 8; |
| |
|
| | dst0 += 4 * dst_step; |
| | dst1 += 4 * dst_step; |
| | } |
| | #else |
| | if (nn > 0) |
| | { |
| | asm volatile( |
| | "0: \n" |
| | "pld [%1, #64] \n" |
| | "vld1.u8 {d0}, [%1], %10 \n" |
| | |
| | "pld [%2, #64] \n" |
| | "vld1.u8 {d1}, [%2], %10 \n" |
| | |
| | "pld [%1, #64] \n" |
| | "vld1.u8 {d2}, [%1], %10 \n" |
| | |
| | "vtrn.u8 d1, d0 \n" |
| | |
| | "pld [%2, #64] \n" |
| | "vld1.u8 {d3}, [%2], %10 \n" |
| | |
| | "pld [%1, #64] \n" |
| | "vld1.u8 {d4}, [%1], %10 \n" |
| | |
| | "vtrn.u8 d3, d2 \n" |
| | |
| | "pld [%2, #64] \n" |
| | "vld1.u8 {d5}, [%2], %10 \n" |
| | |
| | "pld [%1, #64] \n" |
| | "vld1.u8 {d6}, [%1], %10 \n" |
| | |
| | "vtrn.u8 d5, d4 \n" |
| | |
| | "pld [%2, #64] \n" |
| | "vld1.u8 {d7}, [%2], %10 \n" |
| | |
| | "vtrn.u8 d7, d6 \n" |
| | |
| | "sub %1, %1, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q1, q0 \n" |
| | |
| | "sub %2, %2, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q3, q2 \n" |
| | |
| | "add %1, #8 \n" |
| | |
| | "vtrn.u32 q3, q1 \n" |
| | |
| | "add %2, #8 \n" |
| | |
| | "vtrn.u32 q2, q0 \n" |
| | "vst1.u8 {d6}, [%4], %11 \n" |
| | "vst1.u8 {d7}, [%3], %11 \n" |
| | |
| | "subs %0, #1 \n" |
| | |
| | "vst1.u8 {d4}, [%4], %11 \n" |
| | "vst1.u8 {d5}, [%3], %11 \n" |
| | "vst1.u8 {d2}, [%4], %11 \n" |
| | "vst1.u8 {d3}, [%3], %11 \n" |
| | "vst1.u8 {d0}, [%4], %11 \n" |
| | "vst1.u8 {d1}, [%3], %11 \n" |
| | |
| | "bne 0b \n" |
| | : "=r"(nn), |
| | "=r"(src0), |
| | "=r"(src1), |
| | "=r"(dst0), |
| | "=r"(dst1) |
| | : "0"(nn), |
| | "1"(src0), |
| | "2"(src1), |
| | "3"(dst0), |
| | "4"(dst1), |
| | "r"(src_step), |
| | "r"(dst_step) |
| | : "cc", "memory", "q0", "q1", "q2", "q3"); |
| | } |
| | #endif |
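| | // Leftover columns: one 8-byte dst row per source column, with the 8 |
| | // source rows read in reverse order to realize the rotation. |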
| | for (; remain > 0; remain--) |
| | { |
| | dst0[0] = src1[0 + 3 * src_step]; |
| | dst0[1] = src0[0 + 3 * src_step]; |
| | dst0[2] = src1[0 + 2 * src_step]; |
| | dst0[3] = src0[0 + 2 * src_step]; |
| | dst0[4] = src1[0 + src_step]; |
| | dst0[5] = src0[0 + src_step]; |
| | dst0[6] = src1[0]; |
| | dst0[7] = src0[0]; |
| |
|
| | src0 += 1; |
| | src1 += 1; |
| |
|
| | dst0 += stride; |
| | } |
| |
|
| | src0 += srcwgap + 7 * srcstride; |
| | } |
| | #endif |
| | for (; y < srch; y++) |
| | { |
| | unsigned char* dst0 = dstend - y - 1; |
| |
|
| | int x = 0; |
| | for (; x < srcw; x++) |
| | { |
| | *dst0 = *src0; |
| |
|
| | src0 += 1; |
| | dst0 += stride; |
| | } |
| |
|
| | src0 += srcwgap; |
| | } |
| | } |
| |
|
| | static void kanna_rotate_6_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) |
| | { |
| | const int srcwgap = srcstride - srcw * 2; |
| |
|
| | // dstend points one past the last pixel of the first dst row |
| | unsigned char* dstend = dst + w * 2; |
| |
|
| | const unsigned char* src0 = src; |
| |
|
| | int y = 0; |
| | #if __ARM_NEON |
| | for (; y + 7 < srch; y += 8) |
| | { |
| | const unsigned char* src1 = src0 + srcstride; |
| |
|
| | unsigned char* dst0 = dstend - y * 2 - 8 * 2; |
| | unsigned char* dst1 = dstend - y * 2 - 8 * 2 + stride; |
| |
|
| | int src_step = 2 * srcstride; |
| | int dst_step = 2 * stride; |
| |
|
| | int nn = srcw >> 3; |
| | int remain = srcw - (nn << 3); |
| |
|
| | #if __aarch64__ |
| | for (; nn > 0; nn--) |
| | { |
| | uint8x8x2_t _src0 = vld2_u8(src0); |
| | uint8x8x2_t _src1 = vld2_u8(src1); |
| |
|
| | uint8x8x2_t _src2 = vld2_u8(src0 + src_step); |
| | uint8x8x2_t _src3 = vld2_u8(src1 + src_step); |
| |
|
| | uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step); |
| | uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step); |
| |
|
| | uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step); |
| | uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step); |
| |
|
| | uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); |
| | uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); |
| | uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); |
| | uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); |
| |
|
| | uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); |
| | uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); |
| | uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); |
| | uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); |
| |
|
| | uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); |
| | uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); |
| | uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); |
| | uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); |
| |
|
| | uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); |
| | uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); |
| | uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); |
| | uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); |
| | uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); |
| | uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); |
| | uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); |
| | uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); |
| | uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); |
| | uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); |
| |
|
| | uint8x8x2_t _dst0; |
| | uint8x8x2_t _dst1; |
| | uint8x8x2_t _dst2; |
| | uint8x8x2_t _dst3; |
| | uint8x8x2_t _dst4; |
| | uint8x8x2_t _dst5; |
| | uint8x8x2_t _dst6; |
| | uint8x8x2_t _dst7; |
| |
|
| | _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); |
| | _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); |
| | _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); |
| | _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); |
| | _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); |
| | _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); |
| | _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); |
| | _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); |
| |
|
| | _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); |
| | _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); |
| | _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); |
| | _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); |
| | _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); |
| | _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); |
| | _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); |
| | _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); |
| |
|
| | vst2_u8(dst0, _dst7); |
| | vst2_u8(dst1, _dst6); |
| | vst2_u8(dst0 + dst_step, _dst5); |
| | vst2_u8(dst1 + dst_step, _dst4); |
| | vst2_u8(dst0 + 2 * dst_step, _dst3); |
| | vst2_u8(dst1 + 2 * dst_step, _dst2); |
| | vst2_u8(dst0 + 3 * dst_step, _dst1); |
| | vst2_u8(dst1 + 3 * dst_step, _dst0); |
| |
|
| | src0 += 2 * 8; |
| | src1 += 2 * 8; |
| |
|
| | dst0 += 4 * dst_step; |
| | dst1 += 4 * dst_step; |
| | } |
| | #else |
| | if (nn > 0) |
| | { |
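| | // armv7 twin of the aarch64 intrinsics path: the same shuffle runs in |
| | // the q registers, with stores interleaved between the final vtrn steps. |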
| | asm volatile( |
| | "0: \n" |
| | "pld [%1, #128] \n" |
| | "vld2.u8 {d0-d1}, [%1], %10 \n" |
| | |
| | "pld [%2, #128] \n" |
| | "vld2.u8 {d2-d3}, [%2], %10 \n" |
| | |
| | "pld [%1, #128] \n" |
| | "vld2.u8 {d4-d5}, [%1], %10 \n" |
| | |
| | "vtrn.u8 q1, q0 \n" |
| | |
| | "pld [%2, #128] \n" |
| | "vld2.u8 {d6-d7}, [%2], %10 \n" |
| | |
| | "pld [%1, #128] \n" |
| | "vld2.u8 {d16-d17}, [%1], %10\n" |
| | |
| | "vtrn.u8 q3, q2 \n" |
| | |
| | "pld [%2, #128] \n" |
| | "vld2.u8 {d18-d19}, [%2], %10\n" |
| | |
| | "pld [%1, #128] \n" |
| | "vld2.u8 {d20-d21}, [%1], %10\n" |
| | |
| | "vtrn.u8 q9, q8 \n" |
| | |
| | "pld [%2, #128] \n" |
| | "vld2.u8 {d22-d23}, [%2], %10\n" |
| | |
| | "vtrn.u8 q11, q10 \n" |
| | |
| | "sub %1, %1, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q2, q0 \n" |
| | |
| | "sub %2, %2, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q3, q1 \n" |
| | |
| | "add %1, #16 \n" |
| | |
| | "vtrn.u16 q10, q8 \n" |
| | |
| | "add %2, #16 \n" |
| | |
| | "vtrn.u16 q11, q9 \n" |
| | |
| | "vtrn.u32 q10, q2 \n" |
| | |
| | "vtrn.u32 q11, q3 \n" |
| | "vst2.u8 {d20-d21}, [%4], %11\n" |
| | |
| | "vtrn.u32 q8, q0 \n" |
| | "vst2.u8 {d22-d23}, [%3], %11\n" |
| | |
| | "vtrn.u32 q9, q1 \n" |
| | "vst2.u8 {d16-d17}, [%4], %11\n" |
| | |
| | "subs %0, #1 \n" |
| | |
| | "vst2.u8 {d18-d19}, [%3], %11\n" |
| | "vst2.u8 {d4-d5}, [%4], %11 \n" |
| | "vst2.u8 {d6-d7}, [%3], %11 \n" |
| | "vst2.u8 {d0-d1}, [%4], %11 \n" |
| | "vst2.u8 {d2-d3}, [%3], %11 \n" |
| | |
| | "bne 0b \n" |
| | : "=r"(nn), |
| | "=r"(src0), |
| | "=r"(src1), |
| | "=r"(dst0), |
| | "=r"(dst1) |
| | : "0"(nn), |
| | "1"(src0), |
| | "2"(src1), |
| | "3"(dst0), |
| | "4"(dst1), |
| | "r"(src_step), |
| | "r"(dst_step) |
| | : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); |
| | } |
| | #endif |
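| | // 2-channel leftovers: 16 bytes per dst row, source rows again taken in |
| | // reverse order. |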
| | for (; remain > 0; remain--) |
| | { |
| | dst0[0] = src1[0 + 3 * src_step]; |
| | dst0[1] = src1[1 + 3 * src_step]; |
| | dst0[2] = src0[0 + 3 * src_step]; |
| | dst0[3] = src0[1 + 3 * src_step]; |
| | dst0[4] = src1[0 + 2 * src_step]; |
| | dst0[5] = src1[1 + 2 * src_step]; |
| | dst0[6] = src0[0 + 2 * src_step]; |
| | dst0[7] = src0[1 + 2 * src_step]; |
| | dst0[8] = src1[0 + src_step]; |
| | dst0[9] = src1[1 + src_step]; |
| | dst0[10] = src0[0 + src_step]; |
| | dst0[11] = src0[1 + src_step]; |
| | dst0[12] = src1[0]; |
| | dst0[13] = src1[1]; |
| | dst0[14] = src0[0]; |
| | dst0[15] = src0[1]; |
| |
|
| | src0 += 2; |
| | src1 += 2; |
| |
|
| | dst0 += stride; |
| | } |
| |
|
| | src0 += srcwgap + 7 * srcstride; |
| | } |
| | #endif |
| | for (; y < srch; y++) |
| | { |
| | unsigned char* dst0 = dstend - y * 2 - 2; |
| |
|
| | int x = 0; |
| | for (; x < srcw; x++) |
| | { |
| | dst0[0] = src0[0]; |
| | dst0[1] = src0[1]; |
| |
|
| | src0 += 2; |
| | dst0 += stride; |
| | } |
| |
|
| | src0 += srcwgap; |
| | } |
| | } |
| |
|
| | static void kanna_rotate_6_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) |
| | { |
| | const int srcwgap = srcstride - srcw * 3; |
| |
|
| | // dstend points one past the last pixel of the first dst row |
| | unsigned char* dstend = dst + w * 3; |
| |
|
| | const unsigned char* src0 = src; |
| |
|
| | int y = 0; |
| | #if __ARM_NEON |
| | for (; y + 7 < srch; y += 8) |
| | { |
| | const unsigned char* src1 = src0 + srcstride; |
| |
|
| | unsigned char* dst0 = dstend - y * 3 - 8 * 3; |
| | unsigned char* dst1 = dstend - y * 3 - 8 * 3 + stride; |
| |
|
| | int src_step = 2 * srcstride; |
| | int dst_step = 2 * stride; |
| |
|
| | int nn = srcw >> 3; |
| | int remain = srcw - (nn << 3); |
| |
|
| | #if __aarch64__ |
| | for (; nn > 0; nn--) |
| | { |
| | uint8x8x3_t _src0 = vld3_u8(src0); |
| | uint8x8x3_t _src1 = vld3_u8(src1); |
| |
|
| | uint8x8x3_t _src2 = vld3_u8(src0 + src_step); |
| | uint8x8x3_t _src3 = vld3_u8(src1 + src_step); |
| |
|
| | uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step); |
| | uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step); |
| |
|
| | uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step); |
| | uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step); |
| |
|
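| | // Reversed-operand transpose again, now over three channel planes. |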
| | uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); |
| | uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); |
| | uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); |
| | uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); |
| |
|
| | uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); |
| | uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); |
| | uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); |
| | uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); |
| |
|
| | uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]); |
| | uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]); |
| | uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]); |
| | uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]); |
| |
|
| | uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); |
| | uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); |
| | uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); |
| | uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); |
| |
|
| | uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); |
| | uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); |
| | uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); |
| | uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0])); |
| |
|
| | uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1])); |
| | uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0])); |
| | uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1])); |
| | uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); |
| | uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); |
| | uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); |
| | uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); |
| | uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); |
| | uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); |
| | uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1])); |
| | uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1])); |
| | uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0])); |
| | uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0])); |
| |
|
| | uint8x8x3_t _dst0; |
| | uint8x8x3_t _dst1; |
| | uint8x8x3_t _dst2; |
| | uint8x8x3_t _dst3; |
| | uint8x8x3_t _dst4; |
| | uint8x8x3_t _dst5; |
| | uint8x8x3_t _dst6; |
| | uint8x8x3_t _dst7; |
| |
|
| | _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); |
| | _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); |
| | _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); |
| | _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); |
| | _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); |
| | _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); |
| | _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); |
| | _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); |
| |
|
| | _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); |
| | _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); |
| | _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); |
| | _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); |
| | _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); |
| | _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); |
| | _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); |
| | _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); |
| |
|
| | _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); |
| | _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); |
| | _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); |
| | _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); |
| | _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); |
| | _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); |
| | _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); |
| | _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); |
| |
|
| | vst3_u8(dst0, _dst7); |
| | vst3_u8(dst1, _dst6); |
| | vst3_u8(dst0 + dst_step, _dst5); |
| | vst3_u8(dst1 + dst_step, _dst4); |
| | vst3_u8(dst0 + 2 * dst_step, _dst3); |
| | vst3_u8(dst1 + 2 * dst_step, _dst2); |
| | vst3_u8(dst0 + 3 * dst_step, _dst1); |
| | vst3_u8(dst1 + 3 * dst_step, _dst0); |
| |
|
| | src0 += 3 * 8; |
| | src1 += 3 * 8; |
| |
|
| | dst0 += 4 * dst_step; |
| | dst1 += 4 * dst_step; |
| | } |
| | #else |
| | if (nn > 0) |
| | { |
| | asm volatile( |
| | "0: \n" |
| | "pld [%1, #192] \n" |
| | "vld3.u8 {d0-d2}, [%1], %10 \n" |
| | |
| | "pld [%2, #192] \n" |
| | "vld3.u8 {d4-d6}, [%2], %10 \n" |
| | |
| | "pld [%1, #192] \n" |
| | "vld3.u8 {d8-d10}, [%1], %10 \n" |
| | |
| | "vtrn.u8 q2, q0 \n" |
| | "vtrn.u8 d6, d2 \n" |
| | |
| | "pld [%2, #192] \n" |
| | "vld3.u8 {d12-d14}, [%2], %10\n" |
| | |
| | "pld [%1, #192] \n" |
| | "vld3.u8 {d16-d18}, [%1], %10\n" |
| | |
| | "vtrn.u8 q6, q4 \n" |
| | "vtrn.u8 d14, d10 \n" |
| | |
| | "pld [%2, #192] \n" |
| | "vld3.u8 {d20-d22}, [%2], %10\n" |
| | |
| | "pld [%1, #192] \n" |
| | "vld3.u8 {d24-d26}, [%1], %10\n" |
| | |
| | "vtrn.u8 q10, q8 \n" |
| | "vtrn.u8 d22, d18 \n" |
| | |
| | "pld [%2, #192] \n" |
| | "vld3.u8 {d28-d30}, [%2], %10\n" |
| | |
| | "vtrn.u8 q14, q12 \n" |
| | "vtrn.u8 d30, d26 \n" |
| | |
| | "sub %1, %1, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q4, q0 \n" |
| | "vtrn.u16 d10, d2 \n" |
| | |
| | "sub %2, %2, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q6, q2 \n" |
| | "vtrn.u16 d14, d6 \n" |
| | |
| | "add %1, #24 \n" |
| | |
| | "vtrn.u16 q12, q8 \n" |
| | "vtrn.u16 d26, d18 \n" |
| | |
| | "add %2, #24 \n" |
| | |
| | "vtrn.u16 q14, q10 \n" |
| | "vtrn.u16 d30, d22 \n" |
| | |
| | "vtrn.u32 q12, q4 \n" |
| | "vtrn.u32 d26, d10 \n" |
| | |
| | "vtrn.u32 q14, q6 \n" |
| | "vst3.u8 {d24-d26}, [%4], %11\n" |
| | "vtrn.u32 d30, d14 \n" |
| | |
| | "vtrn.u32 q8, q0 \n" |
| | "vst3.u8 {d28-d30}, [%3], %11\n" |
| | "vtrn.u32 d18, d2 \n" |
| | |
| | "vtrn.u32 q10, q2 \n" |
| | "vst3.u8 {d16-d18}, [%4], %11\n" |
| | "vtrn.u32 d22, d6 \n" |
| | |
| | "subs %0, #1 \n" |
| | |
| | "vst3.u8 {d20-d22}, [%3], %11\n" |
| | "vst3.u8 {d8-d10}, [%4], %11 \n" |
| | "vst3.u8 {d12-d14}, [%3], %11\n" |
| | "vst3.u8 {d0-d2}, [%4], %11 \n" |
| | "vst3.u8 {d4-d6}, [%3], %11 \n" |
| | |
| | "bne 0b \n" |
| | : "=r"(nn), |
| | "=r"(src0), |
| | "=r"(src1), |
| | "=r"(dst0), |
| | "=r"(dst1) |
| | : "0"(nn), |
| | "1"(src0), |
| | "2"(src1), |
| | "3"(dst0), |
| | "4"(dst1), |
| | "r"(src_step), |
| | "r"(dst_step) |
| | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); |
| | } |
| | #endif |
| | for (; remain > 0; remain--) |
| | { |
| | dst0[0] = src1[0 + 3 * src_step]; |
| | dst0[1] = src1[1 + 3 * src_step]; |
| | dst0[2] = src1[2 + 3 * src_step]; |
| | dst0[3] = src0[0 + 3 * src_step]; |
| | dst0[4] = src0[1 + 3 * src_step]; |
| | dst0[5] = src0[2 + 3 * src_step]; |
| | dst0[6] = src1[0 + 2 * src_step]; |
| | dst0[7] = src1[1 + 2 * src_step]; |
| | dst0[8] = src1[2 + 2 * src_step]; |
| | dst0[9] = src0[0 + 2 * src_step]; |
| | dst0[10] = src0[1 + 2 * src_step]; |
| | dst0[11] = src0[2 + 2 * src_step]; |
| | dst0[12] = src1[0 + src_step]; |
| | dst0[13] = src1[1 + src_step]; |
| | dst0[14] = src1[2 + src_step]; |
| | dst0[15] = src0[0 + src_step]; |
| | dst0[16] = src0[1 + src_step]; |
| | dst0[17] = src0[2 + src_step]; |
| | dst0[18] = src1[0]; |
| | dst0[19] = src1[1]; |
| | dst0[20] = src1[2]; |
| | dst0[21] = src0[0]; |
| | dst0[22] = src0[1]; |
| | dst0[23] = src0[2]; |
| |
|
| | src0 += 3; |
| | src1 += 3; |
| |
|
| | dst0 += stride; |
| | } |
| |
|
| | src0 += srcwgap + 7 * srcstride; |
| | } |
| | #endif |
| | for (; y < srch; y++) |
| | { |
| | unsigned char* dst0 = dstend - y * 3 - 3; |
| |
|
| | int x = 0; |
| | for (; x < srcw; x++) |
| | { |
| | dst0[0] = src0[0]; |
| | dst0[1] = src0[1]; |
| | dst0[2] = src0[2]; |
| |
|
| | src0 += 3; |
| | dst0 += stride; |
| | } |
| |
|
| | src0 += srcwgap; |
| | } |
| | } |
| |
|
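| | // 4-channel variant of the type-6 (90-degree clockwise) rotate. |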
| | static void kanna_rotate_6_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride) |
| | { |
| | const int srcwgap = srcstride - srcw * 4; |
| |
|
| | // dstend points one past the last pixel of the first dst row |
| | unsigned char* dstend = dst + w * 4; |
| |
|
| | const unsigned char* src0 = src; |
| |
|
| | int y = 0; |
| | #if __ARM_NEON |
| | for (; y + 7 < srch; y += 8) |
| | { |
| | const unsigned char* src1 = src0 + srcstride; |
| |
|
| | unsigned char* dst0 = dstend - y * 4 - 8 * 4; |
| | unsigned char* dst1 = dstend - y * 4 - 8 * 4 + stride; |
| |
|
| | int src_step = 2 * srcstride; |
| | int dst_step = 2 * stride; |
| |
|
| | int nn = srcw >> 3; |
| | int remain = srcw - (nn << 3); |
| |
|
| | #if __aarch64__ |
| | for (; nn > 0; nn--) |
| | { |
| | uint8x8x4_t _src0 = vld4_u8(src0); |
| | uint8x8x4_t _src1 = vld4_u8(src1); |
| |
|
| | uint8x8x4_t _src2 = vld4_u8(src0 + src_step); |
| | uint8x8x4_t _src3 = vld4_u8(src1 + src_step); |
| |
|
| | uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step); |
| | uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step); |
| |
|
| | uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step); |
| | uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step); |
| |
|
| | uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); |
| | uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); |
| | uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); |
| | uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); |
| |
|
| | uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); |
| | uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); |
| | uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); |
| | uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); |
| |
|
| | uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]); |
| | uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]); |
| | uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]); |
| | uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]); |
| |
|
| | uint8x8x2_t _src01t_a = vtrn_u8(_src1.val[3], _src0.val[3]); |
| | uint8x8x2_t _src23t_a = vtrn_u8(_src3.val[3], _src2.val[3]); |
| | uint8x8x2_t _src45t_a = vtrn_u8(_src5.val[3], _src4.val[3]); |
| | uint8x8x2_t _src67t_a = vtrn_u8(_src7.val[3], _src6.val[3]); |
| |
|
| | uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); |
| | uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); |
| | uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); |
| | uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); |
| |
|
| | uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); |
| | uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); |
| | uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); |
| | uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0])); |
| |
|
| | uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1])); |
| | uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0])); |
| | uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1])); |
| | uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0])); |
| |
|
| | uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[1]), vreinterpret_u16_u8(_src01t_a.val[1])); |
| | uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[0]), vreinterpret_u16_u8(_src01t_a.val[0])); |
| | uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[1]), vreinterpret_u16_u8(_src45t_a.val[1])); |
| | uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[0]), vreinterpret_u16_u8(_src45t_a.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); |
| | uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); |
| | uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); |
| | uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); |
| | uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); |
| | uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); |
| | uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1])); |
| | uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1])); |
| | uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0])); |
| | uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[1]), vreinterpret_u32_u16(_src02tt_a.val[1])); |
| | uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[1]), vreinterpret_u32_u16(_src13tt_a.val[1])); |
| | uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[0]), vreinterpret_u32_u16(_src02tt_a.val[0])); |
| | uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[0]), vreinterpret_u32_u16(_src13tt_a.val[0])); |
| |
|
| | uint8x8x4_t _dst0; |
| | uint8x8x4_t _dst1; |
| | uint8x8x4_t _dst2; |
| | uint8x8x4_t _dst3; |
| | uint8x8x4_t _dst4; |
| | uint8x8x4_t _dst5; |
| | uint8x8x4_t _dst6; |
| | uint8x8x4_t _dst7; |
| |
|
| | _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); |
| | _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); |
| | _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); |
| | _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); |
| | _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); |
| | _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); |
| | _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); |
| | _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); |
| |
|
| | _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); |
| | _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); |
| | _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); |
| | _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); |
| | _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); |
| | _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); |
| | _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); |
| | _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); |
| |
|
| | _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); |
| | _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); |
| | _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); |
| | _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); |
| | _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); |
| | _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); |
| | _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); |
| | _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); |
| |
|
| | _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]); |
| | _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]); |
| | _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]); |
| | _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]); |
| | _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]); |
| | _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]); |
| | _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]); |
| | _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]); |
| |
|
| | vst4_u8(dst0, _dst7); |
| | vst4_u8(dst1, _dst6); |
| | vst4_u8(dst0 + dst_step, _dst5); |
| | vst4_u8(dst1 + dst_step, _dst4); |
| | vst4_u8(dst0 + 2 * dst_step, _dst3); |
| | vst4_u8(dst1 + 2 * dst_step, _dst2); |
| | vst4_u8(dst0 + 3 * dst_step, _dst1); |
| | vst4_u8(dst1 + 3 * dst_step, _dst0); |
| |
|
| | src0 += 4 * 8; |
| | src1 += 4 * 8; |
| |
|
| | dst0 += 4 * dst_step; |
| | dst1 += 4 * dst_step; |
| | } |
| | #else |
| | if (nn > 0) |
| | { |
| | asm volatile( |
| | "0: \n" |
| | "pld [%1, #256] \n" |
| | "vld4.u8 {d0-d3}, [%1], %10 \n" |
| | |
| | "pld [%2, #256] \n" |
| | "vld4.u8 {d4-d7}, [%2], %10 \n" |
| | |
| | "pld [%1, #256] \n" |
| | "vld4.u8 {d8-d11}, [%1], %10 \n" |
| | |
| | "vtrn.u8 q2, q0 \n" |
| | "vtrn.u8 q3, q1 \n" |
| | |
| | "pld [%2, #256] \n" |
| | "vld4.u8 {d12-d15}, [%2], %10\n" |
| | |
| | "pld [%1, #256] \n" |
| | "vld4.u8 {d16-d19}, [%1], %10\n" |
| | |
| | "vtrn.u8 q6, q4 \n" |
| | "vtrn.u8 q7, q5 \n" |
| | |
| | "pld [%2, #256] \n" |
| | "vld4.u8 {d20-d23}, [%2], %10\n" |
| | |
| | "pld [%1, #256] \n" |
| | "vld4.u8 {d24-d27}, [%1], %10\n" |
| | |
| | "vtrn.u8 q10, q8 \n" |
| | "vtrn.u8 q11, q9 \n" |
| | |
| | "pld [%2, #256] \n" |
| | "vld4.u8 {d28-d31}, [%2], %10\n" |
| | |
| | "vtrn.u8 q14, q12 \n" |
| | "vtrn.u8 q15, q13 \n" |
| | |
| | "sub %1, %1, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q4, q0 \n" |
| | "vtrn.u16 q5, q1 \n" |
| | |
| | "sub %2, %2, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q6, q2 \n" |
| | "vtrn.u16 q7, q3 \n" |
| | |
| | "add %1, #32 \n" |
| | |
| | "vtrn.u16 q12, q8 \n" |
| | "vtrn.u16 q13, q9 \n" |
| | |
| | "add %2, #32 \n" |
| | |
| | "vtrn.u16 q14, q10 \n" |
| | "vtrn.u16 q15, q11 \n" |
| | |
| | "vtrn.u32 q12, q4 \n" |
| | "vtrn.u32 q13, q5 \n" |
| | |
| | "vtrn.u32 q14, q6 \n" |
| | "vst4.u8 {d24-d27}, [%4], %11\n" |
| | "vtrn.u32 q15, q7 \n" |
| | |
| | "vtrn.u32 q8, q0 \n" |
| | "vst4.u8 {d28-d31}, [%3], %11\n" |
| | "vtrn.u32 q9, q1 \n" |
| | |
| | "vtrn.u32 q10, q2 \n" |
| | "vst4.u8 {d16-d19}, [%4], %11\n" |
| | "vtrn.u32 q11, q3 \n" |
| | |
| | "subs %0, #1 \n" |
| | |
| | "vst4.u8 {d8-d11}, [%4], %11 \n" |
| | "vst4.u8 {d20-d23}, [%3], %11\n" |
| | "vst4.u8 {d12-d15}, [%3], %11\n" |
| | "vst4.u8 {d0-d3}, [%4], %11 \n" |
| | "vst4.u8 {d4-d7}, [%3], %11 \n" |
| | |
| | "bne 0b \n" |
| | : "=r"(nn), |
| | "=r"(src0), |
| | "=r"(src1), |
| | "=r"(dst0), |
| | "=r"(dst1) |
| | : "0"(nn), |
| | "1"(src0), |
| | "2"(src1), |
| | "3"(dst0), |
| | "4"(dst1), |
| | "r"(src_step), |
| | "r"(dst_step) |
| | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); |
| | } |
| | #endif |
| | for (; remain > 0; remain--) |
| | { |
| | dst0[0] = src1[0 + 3 * src_step]; |
| | dst0[1] = src1[1 + 3 * src_step]; |
| | dst0[2] = src1[2 + 3 * src_step]; |
| | dst0[3] = src1[3 + 3 * src_step]; |
| | dst0[4] = src0[0 + 3 * src_step]; |
| | dst0[5] = src0[1 + 3 * src_step]; |
| | dst0[6] = src0[2 + 3 * src_step]; |
| | dst0[7] = src0[3 + 3 * src_step]; |
| | dst0[8] = src1[0 + 2 * src_step]; |
| | dst0[9] = src1[1 + 2 * src_step]; |
| | dst0[10] = src1[2 + 2 * src_step]; |
| | dst0[11] = src1[3 + 2 * src_step]; |
| | dst0[12] = src0[0 + 2 * src_step]; |
| | dst0[13] = src0[1 + 2 * src_step]; |
| | dst0[14] = src0[2 + 2 * src_step]; |
| | dst0[15] = src0[3 + 2 * src_step]; |
| | dst0[16] = src1[0 + src_step]; |
| | dst0[17] = src1[1 + src_step]; |
| | dst0[18] = src1[2 + src_step]; |
| | dst0[19] = src1[3 + src_step]; |
| | dst0[20] = src0[0 + src_step]; |
| | dst0[21] = src0[1 + src_step]; |
| | dst0[22] = src0[2 + src_step]; |
| | dst0[23] = src0[3 + src_step]; |
| | dst0[24] = src1[0]; |
| | dst0[25] = src1[1]; |
| | dst0[26] = src1[2]; |
| | dst0[27] = src1[3]; |
| | dst0[28] = src0[0]; |
| | dst0[29] = src0[1]; |
| | dst0[30] = src0[2]; |
| | dst0[31] = src0[3]; |
| |
|
| | src0 += 4; |
| | src1 += 4; |
| |
|
| | dst0 += stride; |
| | } |
| |
|
| | src0 += srcwgap + 7 * srcstride; |
| | } |
| | #endif |
| | for (; y < srch; y++) |
| | { |
| | unsigned char* dst0 = dstend - y * 4 - 4; |
| |
|
| | int x = 0; |
| | for (; x < srcw; x++) |
| | { |
| | dst0[0] = src0[0]; |
| | dst0[1] = src0[1]; |
| | dst0[2] = src0[2]; |
| | dst0[3] = src0[3]; |
| |
|
| | src0 += 4; |
| | dst0 += stride; |
| | } |
| |
|
| | src0 += srcwgap; |
| | } |
| | } |
| |
|
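| | // Type-7 rotate, 1-channel: a transpose across the anti-diagonal, which |
| | // would correspond to EXIF orientation 7 (transverse). |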
| | static void kanna_rotate_7_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) |
| | { |
| | const int srcwgap = srcstride - srcw; |
| |
|
| | // dstend points one past the last pixel of the bottom dst row |
| | unsigned char* dstend = dst + stride * (h - 1) + w; |
| |
|
| | const unsigned char* src0 = src; |
| |
|
| | int y = 0; |
| | #if __ARM_NEON |
| | for (; y + 7 < srch; y += 8) |
| | { |
| | const unsigned char* src1 = src0 + srcstride; |
| |
|
| | unsigned char* dst6 = dstend - y - 8 - stride; |
| | unsigned char* dst7 = dstend - y - 8; |
| |
|
| | int src_step = 2 * srcstride; |
| | int dst_step = -2 * stride; |
| |
|
| | int nn = srcw >> 3; |
| | int remain = srcw - (nn << 3); |
| |
|
| | #if __aarch64__ |
| | for (; nn > 0; nn--) |
| | { |
| | uint8x8_t _src0 = vld1_u8(src0); |
| | uint8x8_t _src1 = vld1_u8(src1); |
| |
|
| | uint8x8_t _src2 = vld1_u8(src0 + src_step); |
| | uint8x8_t _src3 = vld1_u8(src1 + src_step); |
| |
|
| | uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step); |
| | uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step); |
| |
|
| | uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step); |
| | uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step); |
| |
|
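| | // in-register 8x8 byte transpose via three vtrn stages (u8, u16, u32);
| | // feeding the later row as the first operand at every stage also reverses
| | // the output order, producing the mirrored layout type 7 needs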
| | uint8x8x2_t _src01t_r = vtrn_u8(_src1, _src0); |
| | uint8x8x2_t _src23t_r = vtrn_u8(_src3, _src2); |
| | uint8x8x2_t _src45t_r = vtrn_u8(_src5, _src4); |
| | uint8x8x2_t _src67t_r = vtrn_u8(_src7, _src6); |
| |
|
| | uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); |
| | uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); |
| | uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); |
| | uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); |
| | uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); |
| | uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); |
| | uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); |
| |
|
| | uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[1]); |
| | uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[1]); |
| | uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[1]); |
| | uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[1]); |
| | uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[0]); |
| | uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[0]); |
| | uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[0]); |
| | uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[0]); |
| |
|
| | vst1_u8(dst7, _dst7); |
| | vst1_u8(dst6, _dst6); |
| | vst1_u8(dst7 + dst_step, _dst5); |
| | vst1_u8(dst6 + dst_step, _dst4); |
| | vst1_u8(dst7 + 2 * dst_step, _dst3); |
| | vst1_u8(dst6 + 2 * dst_step, _dst2); |
| | vst1_u8(dst7 + 3 * dst_step, _dst1); |
| | vst1_u8(dst6 + 3 * dst_step, _dst0); |
| |
|
| | src0 += 8; |
| | src1 += 8; |
| |
|
| | dst7 += 4 * dst_step; |
| | dst6 += 4 * dst_step; |
| | } |
| | #else |
| | if (nn > 0) |
| | { |
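| | // armv7 version of the same mirrored 8x8 transpose: d0-d7 hold eight
| | // source rows (loads post-increment by src_step in %10), the sub/add
| | // pairs rewind to the next 8-byte block of the same rows, and the
| | // stores walk up the destination with the negative dst_step in %11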
| | asm volatile( |
| | "0: \n" |
| | "pld [%1, #64] \n" |
| | "vld1.u8 {d0}, [%1], %10 \n" |
| | |
| | "pld [%2, #64] \n" |
| | "vld1.u8 {d1}, [%2], %10 \n" |
| | |
| | "pld [%1, #64] \n" |
| | "vld1.u8 {d2}, [%1], %10 \n" |
| | |
| | "vtrn.u8 d1, d0 \n" |
| | |
| | "pld [%2, #64] \n" |
| | "vld1.u8 {d3}, [%2], %10 \n" |
| | |
| | "pld [%1, #64] \n" |
| | "vld1.u8 {d4}, [%1], %10 \n" |
| | |
| | "vtrn.u8 d3, d2 \n" |
| | |
| | "pld [%2, #64] \n" |
| | "vld1.u8 {d5}, [%2], %10 \n" |
| | |
| | "pld [%1, #64] \n" |
| | "vld1.u8 {d6}, [%1], %10 \n" |
| | |
| | "vtrn.u8 d5, d4 \n" |
| | |
| | "pld [%2, #64] \n" |
| | "vld1.u8 {d7}, [%2], %10 \n" |
| | |
| | "vtrn.u8 d7, d6 \n" |
| | |
| | "sub %1, %1, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q1, q0 \n" |
| | |
| | "sub %2, %2, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q3, q2 \n" |
| | |
| | "add %1, #8 \n" |
| | |
| | "vtrn.u32 q3, q1 \n" |
| | |
| | "add %2, #8 \n" |
| | |
| | "vtrn.u32 q2, q0 \n" |
| | "vst1.u8 {d6}, [%4], %11 \n" |
| | "vst1.u8 {d7}, [%3], %11 \n" |
| | |
| | "subs %0, #1 \n" |
| | |
| | "vst1.u8 {d4}, [%4], %11 \n" |
| | "vst1.u8 {d5}, [%3], %11 \n" |
| | "vst1.u8 {d2}, [%4], %11 \n" |
| | "vst1.u8 {d3}, [%3], %11 \n" |
| | "vst1.u8 {d0}, [%4], %11 \n" |
| | "vst1.u8 {d1}, [%3], %11 \n" |
| | |
| | "bne 0b \n" |
| | : "=r"(nn), |
| | "=r"(src0), |
| | "=r"(src1), |
| | "=r"(dst7), |
| | "=r"(dst6) |
| | : "0"(nn), |
| | "1"(src0), |
| | "2"(src1), |
| | "3"(dst7), |
| | "4"(dst6), |
| | "r"(src_step), |
| | "r"(dst_step) |
| | : "cc", "memory", "q0", "q1", "q2", "q3"); |
| | } |
| | #endif |
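| | // scalar remainder: each leftover source column fills eight consecutive
| | // dst bytes, taken from source rows y+7 down to y, then dst7 climbs one
| | // dst row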
| | for (; remain > 0; remain--) |
| | { |
| | dst7[0] = src1[0 + 3 * src_step]; |
| | dst7[1] = src0[0 + 3 * src_step]; |
| | dst7[2] = src1[0 + 2 * src_step]; |
| | dst7[3] = src0[0 + 2 * src_step]; |
| | dst7[4] = src1[0 + src_step]; |
| | dst7[5] = src0[0 + src_step]; |
| | dst7[6] = src1[0]; |
| | dst7[7] = src0[0]; |
| |
|
| | src0 += 1; |
| | src1 += 1; |
| |
|
| | dst7 -= stride; |
| | } |
| |
|
| | src0 += srcwgap + 7 * srcstride; |
| | } |
| | #endif |
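| | // scalar tail for any rows left over: copy source row y into dst column
| | // w - 1 - y, from the bottom dst row upwards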
| | for (; y < srch; y++) |
| | { |
| | unsigned char* dst0 = dstend - y - 1; |
| |
|
| | int x = 0; |
| | for (; x < srcw; x++) |
| | { |
| | *dst0 = *src0; |
| |
|
| | src0 += 1; |
| | dst0 -= stride; |
| | } |
| |
|
| | src0 += srcwgap; |
| | } |
| | } |
| |
|
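| | // 2-channel rotate type 7: vld2/vst2 split each pixel into two planes
| | // so the same mirrored vtrn transpose can run per channel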
| | static void kanna_rotate_7_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) |
| | { |
| | const int srcwgap = srcstride - srcw * 2; |
| |
|
| | // point just past the last pixel of the bottom dst row
| | unsigned char* dstend = dst + stride * (h - 1) + w * 2; |
| |
|
| | const unsigned char* src0 = src; |
| |
|
| | int y = 0; |
| | #if __ARM_NEON |
| | for (; y + 7 < srch; y += 8) |
| | { |
| | const unsigned char* src1 = src0 + srcstride; |
| |
|
| | unsigned char* dst6 = dstend - y * 2 - 8 * 2 - stride; |
| | unsigned char* dst7 = dstend - y * 2 - 8 * 2; |
| |
|
| | int src_step = 2 * srcstride; |
| | int dst_step = -2 * stride; |
| |
|
| | int nn = srcw >> 3; |
| | int remain = srcw - (nn << 3); |
| |
|
| | #if __aarch64__ |
| | for (; nn > 0; nn--) |
| | { |
| | uint8x8x2_t _src0 = vld2_u8(src0); |
| | uint8x8x2_t _src1 = vld2_u8(src1); |
| |
|
| | uint8x8x2_t _src2 = vld2_u8(src0 + src_step); |
| | uint8x8x2_t _src3 = vld2_u8(src1 + src_step); |
| |
|
| | uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step); |
| | uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step); |
| |
|
| | uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step); |
| | uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step); |
| |
|
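| | // mirrored 8x8 transpose of each channel plane, same vtrn ladder as the
| | // 1-channel version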
| | uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); |
| | uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); |
| | uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); |
| | uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); |
| |
|
| | uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); |
| | uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); |
| | uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); |
| | uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); |
| |
|
| | uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); |
| | uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); |
| | uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); |
| | uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); |
| |
|
| | uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); |
| | uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); |
| | uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); |
| | uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); |
| | uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); |
| | uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); |
| | uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); |
| | uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); |
| | uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); |
| | uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); |
| |
|
| | uint8x8x2_t _dst0; |
| | uint8x8x2_t _dst1; |
| | uint8x8x2_t _dst2; |
| | uint8x8x2_t _dst3; |
| | uint8x8x2_t _dst4; |
| | uint8x8x2_t _dst5; |
| | uint8x8x2_t _dst6; |
| | uint8x8x2_t _dst7; |
| |
|
| | _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); |
| | _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); |
| | _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); |
| | _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); |
| | _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); |
| | _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); |
| | _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); |
| | _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); |
| |
|
| | _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); |
| | _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); |
| | _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); |
| | _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); |
| | _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); |
| | _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); |
| | _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); |
| | _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); |
| |
|
| | vst2_u8(dst7, _dst7); |
| | vst2_u8(dst6, _dst6); |
| | vst2_u8(dst7 + dst_step, _dst5); |
| | vst2_u8(dst6 + dst_step, _dst4); |
| | vst2_u8(dst7 + 2 * dst_step, _dst3); |
| | vst2_u8(dst6 + 2 * dst_step, _dst2); |
| | vst2_u8(dst7 + 3 * dst_step, _dst1); |
| | vst2_u8(dst6 + 3 * dst_step, _dst0); |
| |
|
| | src0 += 2 * 8; |
| | src1 += 2 * 8; |
| |
|
| | dst7 += 4 * dst_step; |
| | dst6 += 4 * dst_step; |
| | } |
| | #else |
| | if (nn > 0) |
| | { |
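| | // armv7 path: each q register holds one row's two deinterleaved planes
| | // (even d = first channel, odd d = second), so one q-sized vtrn
| | // transposes both planes at once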
| | asm volatile( |
| | "0: \n" |
| | "pld [%1, #128] \n" |
| | "vld2.u8 {d0-d1}, [%1], %10 \n" |
| | |
| | "pld [%2, #128] \n" |
| | "vld2.u8 {d2-d3}, [%2], %10 \n" |
| | |
| | "pld [%1, #128] \n" |
| | "vld2.u8 {d4-d5}, [%1], %10 \n" |
| | |
| | "vtrn.u8 q1, q0 \n" |
| | |
| | "pld [%2, #128] \n" |
| | "vld2.u8 {d6-d7}, [%2], %10 \n" |
| | |
| | "pld [%1, #128] \n" |
| | "vld2.u8 {d16-d17}, [%1], %10\n" |
| | |
| | "vtrn.u8 q3, q2 \n" |
| | |
| | "pld [%2, #128] \n" |
| | "vld2.u8 {d18-d19}, [%2], %10\n" |
| | |
| | "pld [%1, #128] \n" |
| | "vld2.u8 {d20-d21}, [%1], %10\n" |
| | |
| | "vtrn.u8 q9, q8 \n" |
| | |
| | "pld [%2, #128] \n" |
| | "vld2.u8 {d22-d23}, [%2], %10\n" |
| | |
| | "vtrn.u8 q11, q10 \n" |
| | |
| | "sub %1, %1, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q2, q0 \n" |
| | |
| | "sub %2, %2, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q3, q1 \n" |
| | |
| | "add %1, #16 \n" |
| | |
| | "vtrn.u16 q10, q8 \n" |
| | |
| | "add %2, #16 \n" |
| | |
| | "vtrn.u16 q11, q9 \n" |
| | |
| | "vtrn.u32 q10, q2 \n" |
| | |
| | "vtrn.u32 q11, q3 \n" |
| | "vst2.u8 {d20-d21}, [%4], %11\n" |
| | |
| | "vtrn.u32 q8, q0 \n" |
| | "vst2.u8 {d22-d23}, [%3], %11\n" |
| | |
| | "vtrn.u32 q9, q1 \n" |
| | "vst2.u8 {d16-d17}, [%4], %11\n" |
| | |
| | "subs %0, #1 \n" |
| | |
| | "vst2.u8 {d4-d5}, [%4], %11 \n" |
| | "vst2.u8 {d18-d19}, [%3], %11\n" |
| | "vst2.u8 {d6-d7}, [%3], %11 \n" |
| | "vst2.u8 {d0-d1}, [%4], %11 \n" |
| | "vst2.u8 {d2-d3}, [%3], %11 \n" |
| | |
| | "bne 0b \n" |
| | : "=r"(nn), |
| | "=r"(src0), |
| | "=r"(src1), |
| | "=r"(dst7), |
| | "=r"(dst6) |
| | : "0"(nn), |
| | "1"(src0), |
| | "2"(src1), |
| | "3"(dst7), |
| | "4"(dst6), |
| | "r"(src_step), |
| | "r"(dst_step) |
| | : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); |
| | } |
| | #endif |
| | for (; remain > 0; remain--) |
| | { |
| | dst7[0] = src1[0 + 3 * src_step]; |
| | dst7[1] = src1[1 + 3 * src_step]; |
| | dst7[2] = src0[0 + 3 * src_step]; |
| | dst7[3] = src0[1 + 3 * src_step]; |
| | dst7[4] = src1[0 + 2 * src_step]; |
| | dst7[5] = src1[1 + 2 * src_step]; |
| | dst7[6] = src0[0 + 2 * src_step]; |
| | dst7[7] = src0[1 + 2 * src_step]; |
| | dst7[8] = src1[0 + src_step]; |
| | dst7[9] = src1[1 + src_step]; |
| | dst7[10] = src0[0 + src_step]; |
| | dst7[11] = src0[1 + src_step]; |
| | dst7[12] = src1[0]; |
| | dst7[13] = src1[1]; |
| | dst7[14] = src0[0]; |
| | dst7[15] = src0[1]; |
| |
|
| | src0 += 2; |
| | src1 += 2; |
| |
|
| | dst7 -= stride; |
| | } |
| |
|
| | src0 += srcwgap + 7 * srcstride; |
| | } |
| | #endif |
| | for (; y < srch; y++) |
| | { |
| | unsigned char* dst0 = dstend - y * 2 - 2; |
| |
|
| | int x = 0; |
| | for (; x < srcw; x++) |
| | { |
| | dst0[0] = src0[0]; |
| | dst0[1] = src0[1]; |
| |
|
| | src0 += 2; |
| | dst0 -= stride; |
| | } |
| |
|
| | src0 += srcwgap; |
| | } |
| | } |
| |
|
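| | // 3-channel rotate type 7: vld3/vst3 deinterleave into three planes;
| | // only two planes fit in a q register, so the armv7 assembly pairs
| | // q-sized vtrn ops with extra d-sized ones for the third plane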
| | static void kanna_rotate_7_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) |
| | { |
| | const int srcwgap = srcstride - srcw * 3; |
| |
|
| | // point just past the last pixel of the bottom dst row
| | unsigned char* dstend = dst + stride * (h - 1) + w * 3; |
| |
|
| | const unsigned char* src0 = src; |
| |
|
| | int y = 0; |
| | #if __ARM_NEON |
| | for (; y + 7 < srch; y += 8) |
| | { |
| | const unsigned char* src1 = src0 + srcstride; |
| |
|
| | unsigned char* dst6 = dstend - y * 3 - 8 * 3 - stride; |
| | unsigned char* dst7 = dstend - y * 3 - 8 * 3; |
| |
|
| | int src_step = 2 * srcstride; |
| | int dst_step = -2 * stride; |
| |
|
| | int nn = srcw >> 3; |
| | int remain = srcw - (nn << 3); |
| |
|
| | #if __aarch64__ |
| | for (; nn > 0; nn--) |
| | { |
| | uint8x8x3_t _src0 = vld3_u8(src0); |
| | uint8x8x3_t _src1 = vld3_u8(src1); |
| |
|
| | uint8x8x3_t _src2 = vld3_u8(src0 + src_step); |
| | uint8x8x3_t _src3 = vld3_u8(src1 + src_step); |
| |
|
| | uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step); |
| | uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step); |
| |
|
| | uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step); |
| | uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step); |
| |
|
| | uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); |
| | uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); |
| | uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); |
| | uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); |
| |
|
| | uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); |
| | uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); |
| | uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); |
| | uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); |
| |
|
| | uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]); |
| | uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]); |
| | uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]); |
| | uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]); |
| |
|
| | uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); |
| | uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); |
| | uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); |
| | uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); |
| |
|
| | uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); |
| | uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); |
| | uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); |
| | uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0])); |
| |
|
| | uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1])); |
| | uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0])); |
| | uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1])); |
| | uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); |
| | uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); |
| | uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); |
| | uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); |
| | uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); |
| | uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); |
| | uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1])); |
| | uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1])); |
| | uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0])); |
| | uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0])); |
| |
|
| | uint8x8x3_t _dst0; |
| | uint8x8x3_t _dst1; |
| | uint8x8x3_t _dst2; |
| | uint8x8x3_t _dst3; |
| | uint8x8x3_t _dst4; |
| | uint8x8x3_t _dst5; |
| | uint8x8x3_t _dst6; |
| | uint8x8x3_t _dst7; |
| |
|
| | _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); |
| | _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); |
| | _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); |
| | _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); |
| | _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); |
| | _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); |
| | _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); |
| | _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); |
| |
|
| | _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); |
| | _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); |
| | _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); |
| | _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); |
| | _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); |
| | _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); |
| | _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); |
| | _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); |
| |
|
| | _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); |
| | _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); |
| | _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); |
| | _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); |
| | _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); |
| | _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); |
| | _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); |
| | _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); |
| |
|
| | vst3_u8(dst7, _dst7); |
| | vst3_u8(dst6, _dst6); |
| | vst3_u8(dst7 + dst_step, _dst5); |
| | vst3_u8(dst6 + dst_step, _dst4); |
| | vst3_u8(dst7 + 2 * dst_step, _dst3); |
| | vst3_u8(dst6 + 2 * dst_step, _dst2); |
| | vst3_u8(dst7 + 3 * dst_step, _dst1); |
| | vst3_u8(dst6 + 3 * dst_step, _dst0); |
| |
|
| | src0 += 3 * 8; |
| | src1 += 3 * 8; |
| |
|
| | dst7 += 4 * dst_step; |
| | dst6 += 4 * dst_step; |
| | } |
| | #else |
| | if (nn > 0) |
| | { |
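| | // 24 bytes = eight 3-channel pixels per row per iteration, hence
| | // "add %1, #24" to reach the next block after rewinding src_step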
| | asm volatile( |
| | "0: \n" |
| | "pld [%1, #192] \n" |
| | "vld3.u8 {d0-d2}, [%1], %10 \n" |
| | |
| | "pld [%2, #192] \n" |
| | "vld3.u8 {d4-d6}, [%2], %10 \n" |
| | |
| | "pld [%1, #192] \n" |
| | "vld3.u8 {d8-d10}, [%1], %10 \n" |
| | |
| | "vtrn.u8 q2, q0 \n" |
| | "vtrn.u8 d6, d2 \n" |
| | |
| | "pld [%2, #192] \n" |
| | "vld3.u8 {d12-d14}, [%2], %10\n" |
| | |
| | "pld [%1, #192] \n" |
| | "vld3.u8 {d16-d18}, [%1], %10\n" |
| | |
| | "vtrn.u8 q6, q4 \n" |
| | "vtrn.u8 d14, d10 \n" |
| | |
| | "pld [%2, #192] \n" |
| | "vld3.u8 {d20-d22}, [%2], %10\n" |
| | |
| | "pld [%1, #192] \n" |
| | "vld3.u8 {d24-d26}, [%1], %10\n" |
| | |
| | "vtrn.u8 q10, q8 \n" |
| | "vtrn.u8 d22, d18 \n" |
| | |
| | "pld [%2, #192] \n" |
| | "vld3.u8 {d28-d30}, [%2], %10\n" |
| | |
| | "vtrn.u8 q14, q12 \n" |
| | "vtrn.u8 d30, d26 \n" |
| | |
| | "sub %1, %1, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q4, q0 \n" |
| | "vtrn.u16 d10, d2 \n" |
| | |
| | "sub %2, %2, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q6, q2 \n" |
| | "vtrn.u16 d14, d6 \n" |
| | |
| | "add %1, #24 \n" |
| | |
| | "vtrn.u16 q12, q8 \n" |
| | "vtrn.u16 d26, d18 \n" |
| | |
| | "add %2, #24 \n" |
| | |
| | "vtrn.u16 q14, q10 \n" |
| | "vtrn.u16 d30, d22 \n" |
| | |
| | "vtrn.u32 q12, q4 \n" |
| | "vtrn.u32 d26, d10 \n" |
| | |
| | "vtrn.u32 q14, q6 \n" |
| | "vst3.u8 {d24-d26}, [%4], %11\n" |
| | "vtrn.u32 d30, d14 \n" |
| | |
| | "vtrn.u32 q8, q0 \n" |
| | "vst3.u8 {d28-d30}, [%3], %11\n" |
| | "vtrn.u32 d18, d2 \n" |
| | |
| | "vtrn.u32 q10, q2 \n" |
| | "vst3.u8 {d16-d18}, [%4], %11\n" |
| | "vtrn.u32 d22, d6 \n" |
| | |
| | "subs %0, #1 \n" |
| | |
| | "vst3.u8 {d8-d10}, [%4], %11 \n" |
| | "vst3.u8 {d20-d22}, [%3], %11\n" |
| | "vst3.u8 {d12-d14}, [%3], %11\n" |
| | "vst3.u8 {d0-d2}, [%4], %11 \n" |
| | "vst3.u8 {d4-d6}, [%3], %11 \n" |
| | |
| | "bne 0b \n" |
| | : "=r"(nn), |
| | "=r"(src0), |
| | "=r"(src1), |
| | "=r"(dst7), |
| | "=r"(dst6) |
| | : "0"(nn), |
| | "1"(src0), |
| | "2"(src1), |
| | "3"(dst7), |
| | "4"(dst6), |
| | "r"(src_step), |
| | "r"(dst_step) |
| | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); |
| | } |
| | #endif |
| | for (; remain > 0; remain--) |
| | { |
| | dst7[0] = src1[0 + 3 * src_step]; |
| | dst7[1] = src1[1 + 3 * src_step]; |
| | dst7[2] = src1[2 + 3 * src_step]; |
| | dst7[3] = src0[0 + 3 * src_step]; |
| | dst7[4] = src0[1 + 3 * src_step]; |
| | dst7[5] = src0[2 + 3 * src_step]; |
| | dst7[6] = src1[0 + 2 * src_step]; |
| | dst7[7] = src1[1 + 2 * src_step]; |
| | dst7[8] = src1[2 + 2 * src_step]; |
| | dst7[9] = src0[0 + 2 * src_step]; |
| | dst7[10] = src0[1 + 2 * src_step]; |
| | dst7[11] = src0[2 + 2 * src_step]; |
| | dst7[12] = src1[0 + src_step]; |
| | dst7[13] = src1[1 + src_step]; |
| | dst7[14] = src1[2 + src_step]; |
| | dst7[15] = src0[0 + src_step]; |
| | dst7[16] = src0[1 + src_step]; |
| | dst7[17] = src0[2 + src_step]; |
| | dst7[18] = src1[0]; |
| | dst7[19] = src1[1]; |
| | dst7[20] = src1[2]; |
| | dst7[21] = src0[0]; |
| | dst7[22] = src0[1]; |
| | dst7[23] = src0[2]; |
| |
|
| | src0 += 3; |
| | src1 += 3; |
| |
|
| | dst7 -= stride; |
| | } |
| |
|
| | src0 += srcwgap + 7 * srcstride; |
| | } |
| | #endif |
| | for (; y < srch; y++) |
| | { |
| | unsigned char* dst0 = dstend - y * 3 - 3; |
| |
|
| | int x = 0; |
| | for (; x < srcw; x++) |
| | { |
| | dst0[0] = src0[0]; |
| | dst0[1] = src0[1]; |
| | dst0[2] = src0[2]; |
| |
|
| | src0 += 3; |
| | dst0 -= stride; |
| | } |
| |
|
| | src0 += srcwgap; |
| | } |
| | } |
| |
|
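| | // 4-channel rotate type 7: four planes through the same mirrored
| | // transpose; eight pixels are 32 bytes per row, hence "add %1, #32"
| | // in the assembly below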
| | static void kanna_rotate_7_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride) |
| | { |
| | const int srcwgap = srcstride - srcw * 4; |
| |
|
| | // point just past the last pixel of the bottom dst row
| | unsigned char* dstend = dst + stride * (h - 1) + w * 4; |
| |
|
| | const unsigned char* src0 = src; |
| |
|
| | int y = 0; |
| | #if __ARM_NEON |
| | for (; y + 7 < srch; y += 8) |
| | { |
| | const unsigned char* src1 = src0 + srcstride; |
| |
|
| | unsigned char* dst6 = dstend - y * 4 - 8 * 4 - stride; |
| | unsigned char* dst7 = dstend - y * 4 - 8 * 4; |
| |
|
| | int src_step = 2 * srcstride; |
| | int dst_step = -2 * stride; |
| |
|
| | int nn = srcw >> 3; |
| | int remain = srcw - (nn << 3); |
| |
|
| | #if __aarch64__ |
| | for (; nn > 0; nn--) |
| | { |
| | uint8x8x4_t _src0 = vld4_u8(src0); |
| | uint8x8x4_t _src1 = vld4_u8(src1); |
| |
|
| | uint8x8x4_t _src2 = vld4_u8(src0 + src_step); |
| | uint8x8x4_t _src3 = vld4_u8(src1 + src_step); |
| |
|
| | uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step); |
| | uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step); |
| |
|
| | uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step); |
| | uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step); |
| |
|
| | uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]); |
| | uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]); |
| | uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]); |
| | uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]); |
| |
|
| | uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]); |
| | uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]); |
| | uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]); |
| | uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]); |
| |
|
| | uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]); |
| | uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]); |
| | uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]); |
| | uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]); |
| |
|
| | uint8x8x2_t _src01t_a = vtrn_u8(_src1.val[3], _src0.val[3]); |
| | uint8x8x2_t _src23t_a = vtrn_u8(_src3.val[3], _src2.val[3]); |
| | uint8x8x2_t _src45t_a = vtrn_u8(_src5.val[3], _src4.val[3]); |
| | uint8x8x2_t _src67t_a = vtrn_u8(_src7.val[3], _src6.val[3]); |
| |
|
| | uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1])); |
| | uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0])); |
| | uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1])); |
| | uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0])); |
| |
|
| | uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1])); |
| | uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0])); |
| | uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1])); |
| | uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0])); |
| |
|
| | uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1])); |
| | uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0])); |
| | uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1])); |
| | uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0])); |
| |
|
| | uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[1]), vreinterpret_u16_u8(_src01t_a.val[1])); |
| | uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[0]), vreinterpret_u16_u8(_src01t_a.val[0])); |
| | uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[1]), vreinterpret_u16_u8(_src45t_a.val[1])); |
| | uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[0]), vreinterpret_u16_u8(_src45t_a.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1])); |
| | uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1])); |
| | uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0])); |
| | uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1])); |
| | uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1])); |
| | uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0])); |
| | uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1])); |
| | uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1])); |
| | uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0])); |
| | uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0])); |
| |
|
| | uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[1]), vreinterpret_u32_u16(_src02tt_a.val[1])); |
| | uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[1]), vreinterpret_u32_u16(_src13tt_a.val[1])); |
| | uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[0]), vreinterpret_u32_u16(_src02tt_a.val[0])); |
| | uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[0]), vreinterpret_u32_u16(_src13tt_a.val[0])); |
| |
|
| | uint8x8x4_t _dst0; |
| | uint8x8x4_t _dst1; |
| | uint8x8x4_t _dst2; |
| | uint8x8x4_t _dst3; |
| | uint8x8x4_t _dst4; |
| | uint8x8x4_t _dst5; |
| | uint8x8x4_t _dst6; |
| | uint8x8x4_t _dst7; |
| |
|
| | _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); |
| | _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); |
| | _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); |
| | _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); |
| | _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); |
| | _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); |
| | _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); |
| | _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); |
| |
|
| | _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); |
| | _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); |
| | _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); |
| | _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); |
| | _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); |
| | _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); |
| | _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); |
| | _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); |
| |
|
| | _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); |
| | _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); |
| | _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); |
| | _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); |
| | _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); |
| | _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); |
| | _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); |
| | _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); |
| |
|
| | _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]); |
| | _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]); |
| | _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]); |
| | _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]); |
| | _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]); |
| | _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]); |
| | _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]); |
| | _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]); |
| |
|
| | vst4_u8(dst7, _dst7); |
| | vst4_u8(dst6, _dst6); |
| | vst4_u8(dst7 + dst_step, _dst5); |
| | vst4_u8(dst6 + dst_step, _dst4); |
| | vst4_u8(dst7 + 2 * dst_step, _dst3); |
| | vst4_u8(dst6 + 2 * dst_step, _dst2); |
| | vst4_u8(dst7 + 3 * dst_step, _dst1); |
| | vst4_u8(dst6 + 3 * dst_step, _dst0); |
| |
|
| | src0 += 4 * 8; |
| | src1 += 4 * 8; |
| |
|
| | dst7 += 4 * dst_step; |
| | dst6 += 4 * dst_step; |
| | } |
| | #else |
| | if (nn > 0) |
| | { |
| | asm volatile( |
| | "0: \n" |
| | "pld [%1, #256] \n" |
| | "vld4.u8 {d0-d3}, [%1], %10 \n" |
| | |
| | "pld [%2, #256] \n" |
| | "vld4.u8 {d4-d7}, [%2], %10 \n" |
| | |
| | "pld [%1, #256] \n" |
| | "vld4.u8 {d8-d11}, [%1], %10 \n" |
| | |
| | "vtrn.u8 q2, q0 \n" |
| | "vtrn.u8 q3, q1 \n" |
| | |
| | "pld [%2, #256] \n" |
| | "vld4.u8 {d12-d15}, [%2], %10\n" |
| | |
| | "pld [%1, #256] \n" |
| | "vld4.u8 {d16-d19}, [%1], %10\n" |
| | |
| | "vtrn.u8 q6, q4 \n" |
| | "vtrn.u8 q7, q5 \n" |
| | |
| | "pld [%2, #256] \n" |
| | "vld4.u8 {d20-d23}, [%2], %10\n" |
| | |
| | "pld [%1, #256] \n" |
| | "vld4.u8 {d24-d27}, [%1], %10\n" |
| | |
| | "vtrn.u8 q10, q8 \n" |
| | "vtrn.u8 q11, q9 \n" |
| | |
| | "pld [%2, #256] \n" |
| | "vld4.u8 {d28-d31}, [%2], %10\n" |
| | |
| | "vtrn.u8 q14, q12 \n" |
| | "vtrn.u8 q15, q13 \n" |
| | |
| | "sub %1, %1, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q4, q0 \n" |
| | "vtrn.u16 q5, q1 \n" |
| | |
| | "sub %2, %2, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q6, q2 \n" |
| | "vtrn.u16 q7, q3 \n" |
| | |
| | "add %1, #32 \n" |
| | |
| | "vtrn.u16 q12, q8 \n" |
| | "vtrn.u16 q13, q9 \n" |
| | |
| | "add %2, #32 \n" |
| | |
| | "vtrn.u16 q14, q10 \n" |
| | "vtrn.u16 q15, q11 \n" |
| | |
| | "vtrn.u32 q12, q4 \n" |
| | "vtrn.u32 q13, q5 \n" |
| | |
| | "vtrn.u32 q14, q6 \n" |
| | "vst4.u8 {d24-d27}, [%4], %11\n" |
| | "vtrn.u32 q15, q7 \n" |
| | |
| | "vtrn.u32 q8, q0 \n" |
| | "vst4.u8 {d28-d31}, [%3], %11\n" |
| | "vtrn.u32 q9, q1 \n" |
| | |
| | "vtrn.u32 q10, q2 \n" |
| | "vst4.u8 {d16-d19}, [%4], %11\n" |
| | "vtrn.u32 q11, q3 \n" |
| | |
| | "subs %0, #1 \n" |
| | |
| | "vst4.u8 {d8-d11}, [%4], %11 \n" |
| | "vst4.u8 {d20-d23}, [%3], %11\n" |
| | "vst4.u8 {d12-d15}, [%3], %11\n" |
| | "vst4.u8 {d0-d3}, [%4], %11 \n" |
| | "vst4.u8 {d4-d7}, [%3], %11 \n" |
| | |
| | "bne 0b \n" |
| | : "=r"(nn), |
| | "=r"(src0), |
| | "=r"(src1), |
| | "=r"(dst7), |
| | "=r"(dst6) |
| | : "0"(nn), |
| | "1"(src0), |
| | "2"(src1), |
| | "3"(dst7), |
| | "4"(dst6), |
| | "r"(src_step), |
| | "r"(dst_step) |
| | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); |
| | } |
| | #endif |
| | for (; remain > 0; remain--) |
| | { |
| | dst7[0] = src1[0 + 3 * src_step]; |
| | dst7[1] = src1[1 + 3 * src_step]; |
| | dst7[2] = src1[2 + 3 * src_step]; |
| | dst7[3] = src1[3 + 3 * src_step]; |
| | dst7[4] = src0[0 + 3 * src_step]; |
| | dst7[5] = src0[1 + 3 * src_step]; |
| | dst7[6] = src0[2 + 3 * src_step]; |
| | dst7[7] = src0[3 + 3 * src_step]; |
| | dst7[8] = src1[0 + 2 * src_step]; |
| | dst7[9] = src1[1 + 2 * src_step]; |
| | dst7[10] = src1[2 + 2 * src_step]; |
| | dst7[11] = src1[3 + 2 * src_step]; |
| | dst7[12] = src0[0 + 2 * src_step]; |
| | dst7[13] = src0[1 + 2 * src_step]; |
| | dst7[14] = src0[2 + 2 * src_step]; |
| | dst7[15] = src0[3 + 2 * src_step]; |
| | dst7[16] = src1[0 + src_step]; |
| | dst7[17] = src1[1 + src_step]; |
| | dst7[18] = src1[2 + src_step]; |
| | dst7[19] = src1[3 + src_step]; |
| | dst7[20] = src0[0 + src_step]; |
| | dst7[21] = src0[1 + src_step]; |
| | dst7[22] = src0[2 + src_step]; |
| | dst7[23] = src0[3 + src_step]; |
| | dst7[24] = src1[0]; |
| | dst7[25] = src1[1]; |
| | dst7[26] = src1[2]; |
| | dst7[27] = src1[3]; |
| | dst7[28] = src0[0]; |
| | dst7[29] = src0[1]; |
| | dst7[30] = src0[2]; |
| | dst7[31] = src0[3]; |
| |
|
| | src0 += 4; |
| | src1 += 4; |
| |
|
| | dst7 -= stride; |
| | } |
| |
|
| | src0 += srcwgap + 7 * srcstride; |
| | } |
| | #endif |
| | for (; y < srch; y++) |
| | { |
| | unsigned char* dst0 = dstend - y * 4 - 4; |
| |
|
| | int x = 0; |
| | for (; x < srcw; x++) |
| | { |
| | dst0[0] = src0[0]; |
| | dst0[1] = src0[1]; |
| | dst0[2] = src0[2]; |
| | dst0[3] = src0[3]; |
| |
|
| | src0 += 4; |
| | dst0 -= stride; |
| | } |
| |
|
| | src0 += srcwgap; |
| | } |
| | } |
| |
|
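| | // kanna rotate type 8: rotate 90 degrees counter-clockwise (presumably
| | // EXIF orientation 8); src(x, y) maps to dst(y, h - 1 - x) with w = srch
| | // and h = srcw, so dst columns fill from the bottom row upwards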
| | static void kanna_rotate_8_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int , int h, int stride) |
| | { |
| | const int srcwgap = srcstride - srcw; |
| |
|
| | // point to the first pixel of the bottom dst row
| | unsigned char* dstend = dst + stride * (h - 1); |
| |
|
| | const unsigned char* src0 = src; |
| |
|
| | int y = 0; |
| | #if __ARM_NEON |
| | for (; y + 7 < srch; y += 8) |
| | { |
| | const unsigned char* src1 = src0 + srcstride; |
| |
|
| | unsigned char* dst7 = dstend + y; |
| | unsigned char* dst6 = dstend + y - stride; |
| |
|
| | int src_step = 2 * srcstride; |
| | int dst_step = -2 * stride; |
| |
|
| | int nn = srcw >> 3; |
| | int remain = srcw - (nn << 3); |
| |
|
| | #if __aarch64__ |
| | for (; nn > 0; nn--) |
| | { |
| | uint8x8_t _src0 = vld1_u8(src0); |
| | uint8x8_t _src1 = vld1_u8(src1); |
| |
|
| | uint8x8_t _src2 = vld1_u8(src0 + src_step); |
| | uint8x8_t _src3 = vld1_u8(src1 + src_step); |
| |
|
| | uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step); |
| | uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step); |
| |
|
| | uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step); |
| | uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step); |
| |
|
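| | // plain (unmirrored) 8x8 transpose: operands in natural row order this
| | // time; the row reversal of rotate 8 comes from the negative dst_step
| | // used by the stores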
| | uint8x8x2_t _src01t_r = vtrn_u8(_src0, _src1); |
| | uint8x8x2_t _src23t_r = vtrn_u8(_src2, _src3); |
| | uint8x8x2_t _src45t_r = vtrn_u8(_src4, _src5); |
| | uint8x8x2_t _src67t_r = vtrn_u8(_src6, _src7); |
| |
|
| | uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); |
| | uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); |
| | uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); |
| | uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); |
| |
|
| | uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); |
| | uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); |
| | uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); |
| | uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); |
| |
|
| | uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[0]); |
| | uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[0]); |
| | uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[0]); |
| | uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[0]); |
| | uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[1]); |
| | uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[1]); |
| | uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[1]); |
| | uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[1]); |
| |
|
| | vst1_u8(dst7, _dst0); |
| | vst1_u8(dst6, _dst1); |
| | vst1_u8(dst7 + dst_step, _dst2); |
| | vst1_u8(dst6 + dst_step, _dst3); |
| | vst1_u8(dst7 + 2 * dst_step, _dst4); |
| | vst1_u8(dst6 + 2 * dst_step, _dst5); |
| | vst1_u8(dst7 + 3 * dst_step, _dst6); |
| | vst1_u8(dst6 + 3 * dst_step, _dst7); |
| |
|
| | src0 += 8; |
| | src1 += 8; |
| |
|
| | dst7 += 4 * dst_step; |
| | dst6 += 4 * dst_step; |
| | } |
| | #else |
| | if (nn > 0) |
| | { |
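| | // armv7 version of the plain transpose; stores alternate between dst7
| | // (%3) and dst6 (%4) while both pointers climb by dst_step (%11)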
| | asm volatile( |
| | "0: \n" |
| | "pld [%1, #64] \n" |
| | "vld1.u8 {d0}, [%1], %10 \n" |
| | |
| | "pld [%2, #64] \n" |
| | "vld1.u8 {d1}, [%2], %10 \n" |
| | |
| | "pld [%1, #64] \n" |
| | "vld1.u8 {d2}, [%1], %10 \n" |
| | |
| | "vtrn.u8 d0, d1 \n" |
| | |
| | "pld [%2, #64] \n" |
| | "vld1.u8 {d3}, [%2], %10 \n" |
| | |
| | "pld [%1, #64] \n" |
| | "vld1.u8 {d4}, [%1], %10 \n" |
| | |
| | "vtrn.u8 d2, d3 \n" |
| | |
| | "pld [%2, #64] \n" |
| | "vld1.u8 {d5}, [%2], %10 \n" |
| | |
| | "pld [%1, #64] \n" |
| | "vld1.u8 {d6}, [%1], %10 \n" |
| | |
| | "vtrn.u8 d4, d5 \n" |
| | |
| | "pld [%2, #64] \n" |
| | "vld1.u8 {d7}, [%2], %10 \n" |
| | |
| | "vtrn.u8 d6, d7 \n" |
| | |
| | "sub %1, %1, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q0, q1 \n" |
| | |
| | "sub %2, %2, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q2, q3 \n" |
| | |
| | "add %1, #8 \n" |
| | |
| | "vtrn.u32 q0, q2 \n" |
| | |
| | "add %2, #8 \n" |
| | |
| | "vtrn.u32 q1, q3 \n" |
| | "vst1.u8 {d0}, [%3], %11 \n" |
| | "vst1.u8 {d1}, [%4], %11 \n" |
| | |
| | "subs %0, #1 \n" |
| | |
| | "vst1.u8 {d2}, [%3], %11 \n" |
| | "vst1.u8 {d3}, [%4], %11 \n" |
| | "vst1.u8 {d4}, [%3], %11 \n" |
| | "vst1.u8 {d5}, [%4], %11 \n" |
| | "vst1.u8 {d6}, [%3], %11 \n" |
| | "vst1.u8 {d7}, [%4], %11 \n" |
| | |
| | "bne 0b \n" |
| | : "=r"(nn), |
| | "=r"(src0), |
| | "=r"(src1), |
| | "=r"(dst7), |
| | "=r"(dst6) |
| | : "0"(nn), |
| | "1"(src0), |
| | "2"(src1), |
| | "3"(dst7), |
| | "4"(dst6), |
| | "r"(src_step), |
| | "r"(dst_step) |
| | : "cc", "memory", "q0", "q1", "q2", "q3"); |
| | } |
| | #endif |
| | for (; remain > 0; remain--) |
| | { |
| | dst7[0] = src0[0]; |
| | dst7[1] = src1[0]; |
| | dst7[2] = src0[0 + src_step]; |
| | dst7[3] = src1[0 + src_step]; |
| | dst7[4] = src0[0 + 2 * src_step]; |
| | dst7[5] = src1[0 + 2 * src_step]; |
| | dst7[6] = src0[0 + 3 * src_step]; |
| | dst7[7] = src1[0 + 3 * src_step]; |
| |
|
| | src0 += 1; |
| | src1 += 1; |
| |
|
| | dst7 -= stride; |
| | } |
| |
|
| | src0 += srcwgap + 7 * srcstride; |
| | } |
| | #endif |
| | for (; y < srch; y++) |
| | { |
| | unsigned char* dst0 = dstend + y; |
| |
|
| | int x = 0; |
| | for (; x < srcw; x++) |
| | { |
| | *dst0 = *src0; |
| |
|
| | src0 += 1; |
| | dst0 -= stride; |
| | } |
| |
|
| | src0 += srcwgap; |
| | } |
| | } |
| |
|
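| | // 2-channel rotate type 8: same per-plane handling as the type 7
| | // version, but with the plain transpose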
| | static void kanna_rotate_8_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int , int h, int stride) |
| | { |
| | const int srcwgap = srcstride - srcw * 2; |
| |
|
| | // point to the first pixel of the bottom dst row
| | unsigned char* dstend = dst + stride * (h - 1); |
| |
|
| | const unsigned char* src0 = src; |
| |
|
| | int y = 0; |
| | #if __ARM_NEON |
| | for (; y + 7 < srch; y += 8) |
| | { |
| | const unsigned char* src1 = src0 + srcstride; |
| |
|
| | unsigned char* dst7 = dstend + y * 2; |
| | unsigned char* dst6 = dstend + y * 2 - stride; |
| |
|
| | int src_step = 2 * srcstride; |
| | int dst_step = -2 * stride; |
| |
|
| | int nn = srcw >> 3; |
| | int remain = srcw - (nn << 3); |
| |
|
| | #if __aarch64__ |
| | for (; nn > 0; nn--) |
| | { |
| | uint8x8x2_t _src0 = vld2_u8(src0); |
| | uint8x8x2_t _src1 = vld2_u8(src1); |
| |
|
| | uint8x8x2_t _src2 = vld2_u8(src0 + src_step); |
| | uint8x8x2_t _src3 = vld2_u8(src1 + src_step); |
| |
|
| | uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step); |
| | uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step); |
| |
|
| | uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step); |
| | uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step); |
| |
|
| | uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]); |
| | uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]); |
| | uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]); |
| | uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]); |
| |
|
| | uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]); |
| | uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]); |
| | uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]); |
| | uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]); |
| |
|
| | uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); |
| | uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); |
| | uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); |
| | uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); |
| |
|
| | uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0])); |
| | uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1])); |
| | uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0])); |
| | uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1])); |
| |
|
| | uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); |
| | uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); |
| | uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); |
| | uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); |
| |
|
| | uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0])); |
| | uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0])); |
| | uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1])); |
| | uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1])); |
| |
|
| | uint8x8x2_t _dst0; |
| | uint8x8x2_t _dst1; |
| | uint8x8x2_t _dst2; |
| | uint8x8x2_t _dst3; |
| | uint8x8x2_t _dst4; |
| | uint8x8x2_t _dst5; |
| | uint8x8x2_t _dst6; |
| | uint8x8x2_t _dst7; |
| |
|
| | _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); |
| | _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); |
| | _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); |
| | _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); |
| | _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); |
| | _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); |
| | _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); |
| | _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); |
| |
|
| | _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); |
| | _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); |
| | _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); |
| | _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); |
| | _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); |
| | _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); |
| | _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); |
| | _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); |
| |
|
| | vst2_u8(dst7, _dst0); |
| | vst2_u8(dst6, _dst1); |
| | vst2_u8(dst7 + dst_step, _dst2); |
| | vst2_u8(dst6 + dst_step, _dst3); |
| | vst2_u8(dst7 + 2 * dst_step, _dst4); |
| | vst2_u8(dst6 + 2 * dst_step, _dst5); |
| | vst2_u8(dst7 + 3 * dst_step, _dst6); |
| | vst2_u8(dst6 + 3 * dst_step, _dst7); |
| |
|
| | src0 += 2 * 8; |
| | src1 += 2 * 8; |
| |
|
| | dst7 += 4 * dst_step; |
| | dst6 += 4 * dst_step; |
| | } |
| | #else |
| | if (nn > 0) |
| | { |
| | asm volatile( |
| | "0: \n" |
| | "pld [%1, #128] \n" |
| | "vld2.u8 {d0-d1}, [%1], %10 \n" |
| | |
| | "pld [%2, #128] \n" |
| | "vld2.u8 {d2-d3}, [%2], %10 \n" |
| | |
| | "pld [%1, #128] \n" |
| | "vld2.u8 {d4-d5}, [%1], %10 \n" |
| | |
| | "vtrn.u8 q0, q1 \n" |
| | |
| | "pld [%2, #128] \n" |
| | "vld2.u8 {d6-d7}, [%2], %10 \n" |
| | |
| | "pld [%1, #128] \n" |
| | "vld2.u8 {d16-d17}, [%1], %10\n" |
| | |
| | "vtrn.u8 q2, q3 \n" |
| | |
| | "pld [%2, #128] \n" |
| | "vld2.u8 {d18-d19}, [%2], %10\n" |
| | |
| | "pld [%1, #128] \n" |
| | "vld2.u8 {d20-d21}, [%1], %10\n" |
| | |
| | "vtrn.u8 q8, q9 \n" |
| | |
| | "pld [%2, #128] \n" |
| | "vld2.u8 {d22-d23}, [%2], %10\n" |
| | |
| | "vtrn.u8 q10, q11 \n" |
| | |
| | "sub %1, %1, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q0, q2 \n" |
| | |
| | "sub %2, %2, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q1, q3 \n" |
| | |
| | "add %1, #16 \n" |
| | |
| | "vtrn.u16 q8, q10 \n" |
| | |
| | "add %2, #16 \n" |
| | |
| | "vtrn.u16 q9, q11 \n" |
| | |
| | "vtrn.u32 q0, q8 \n" |
| | |
| | "vtrn.u32 q1, q9 \n" |
| | "vst2.u8 {d0-d1}, [%3], %11 \n" |
| | |
| | "vtrn.u32 q2, q10 \n" |
| | "vst2.u8 {d2-d3}, [%4], %11 \n" |
| | |
| | "vtrn.u32 q3, q11 \n" |
| | "vst2.u8 {d4-d5}, [%3], %11 \n" |
| | |
| | "subs %0, #1 \n" |
| | |
| | "vst2.u8 {d16-d17}, [%3], %11\n" |
| | "vst2.u8 {d6-d7}, [%4], %11 \n" |
| | "vst2.u8 {d18-d19}, [%4], %11\n" |
| | "vst2.u8 {d20-d21}, [%3], %11\n" |
| | "vst2.u8 {d22-d23}, [%4], %11\n" |
| | |
| | "bne 0b \n" |
| | : "=r"(nn), |
| | "=r"(src0), |
| | "=r"(src1), |
| | "=r"(dst7), |
| | "=r"(dst6) |
| | : "0"(nn), |
| | "1"(src0), |
| | "2"(src1), |
| | "3"(dst7), |
| | "4"(dst6), |
| | "r"(src_step), |
| | "r"(dst_step) |
| | : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); |
| | } |
| | #endif |
| | for (; remain > 0; remain--) |
| | { |
| | dst7[0] = src0[0]; |
| | dst7[1] = src0[1]; |
| | dst7[2] = src1[0]; |
| | dst7[3] = src1[1]; |
| | dst7[4] = src0[0 + src_step]; |
| | dst7[5] = src0[1 + src_step]; |
| | dst7[6] = src1[0 + src_step]; |
| | dst7[7] = src1[1 + src_step]; |
| | dst7[8] = src0[0 + 2 * src_step]; |
| | dst7[9] = src0[1 + 2 * src_step]; |
| | dst7[10] = src1[0 + 2 * src_step]; |
| | dst7[11] = src1[1 + 2 * src_step]; |
| | dst7[12] = src0[0 + 3 * src_step]; |
| | dst7[13] = src0[1 + 3 * src_step]; |
| | dst7[14] = src1[0 + 3 * src_step]; |
| | dst7[15] = src1[1 + 3 * src_step]; |
| |
|
| | src0 += 2; |
| | src1 += 2; |
| |
|
| | dst7 -= stride; |
| | } |
| |
|
| | src0 += srcwgap + 7 * srcstride; |
| | } |
| | #endif |
| | for (; y < srch; y++) |
| | { |
| | unsigned char* dst0 = dstend + y * 2; |
| |
|
| | int x = 0; |
| | for (; x < srcw; x++) |
| | { |
| | dst0[0] = src0[0]; |
| | dst0[1] = src0[1]; |
| |
|
| | src0 += 2; |
| | dst0 -= stride; |
| | } |
| |
|
| | src0 += srcwgap; |
| | } |
| | } |
| |
|
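| | // 3-channel rotate type 8: three deinterleaved planes through the
| | // plain transpose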
| | static void kanna_rotate_8_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int , int h, int stride) |
| | { |
| | const int srcwgap = srcstride - srcw * 3; |
| |
|
| | // point to the first pixel of the bottom dst row
| | unsigned char* dstend = dst + stride * (h - 1); |
| |
|
| | const unsigned char* src0 = src; |
| |
|
| | int y = 0; |
| | #if __ARM_NEON |
| | for (; y + 7 < srch; y += 8) |
| | { |
| | const unsigned char* src1 = src0 + srcstride; |
| |
|
| | unsigned char* dst7 = dstend + y * 3; |
| | unsigned char* dst6 = dstend + y * 3 - stride; |
| |
|
| | int src_step = 2 * srcstride; |
| | int dst_step = -2 * stride; |
| |
|
| | int nn = srcw >> 3; |
| | int remain = srcw - (nn << 3); |
| |
|
| | #if __aarch64__ |
| | for (; nn > 0; nn--) |
| | { |
| | uint8x8x3_t _src0 = vld3_u8(src0); |
| | uint8x8x3_t _src1 = vld3_u8(src1); |
| |
|
| | uint8x8x3_t _src2 = vld3_u8(src0 + src_step); |
| | uint8x8x3_t _src3 = vld3_u8(src1 + src_step); |
| |
|
| | uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step); |
| | uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step); |
| |
|
| | uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step); |
| | uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step); |
| |
|
| | uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]); |
| | uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]); |
| | uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]); |
| | uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]); |
| |
|
| | uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]); |
| | uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]); |
| | uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]); |
| | uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]); |
| |
|
| | uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]); |
| | uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]); |
| | uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]); |
| | uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]); |
| |
|
| | uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); |
| | uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); |
| | uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); |
| | uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); |
| |
|
| | uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0])); |
| | uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1])); |
| | uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0])); |
| | uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1])); |
| |
|
| | uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0])); |
| | uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1])); |
| | uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0])); |
| | uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1])); |
| |
|
| | uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); |
| | uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); |
| | uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); |
| | uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); |
| |
|
| | uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0])); |
| | uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0])); |
| | uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1])); |
| | uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1])); |
| |
|
| | uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0])); |
| | uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0])); |
| | uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1])); |
| | uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1])); |
| |
|
| | uint8x8x3_t _dst0; |
| | uint8x8x3_t _dst1; |
| | uint8x8x3_t _dst2; |
| | uint8x8x3_t _dst3; |
| | uint8x8x3_t _dst4; |
| | uint8x8x3_t _dst5; |
| | uint8x8x3_t _dst6; |
| | uint8x8x3_t _dst7; |
| |
|
| | _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); |
| | _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); |
| | _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); |
| | _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); |
| | _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); |
| | _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); |
| | _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); |
| | _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); |
| |
|
| | _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); |
| | _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); |
| | _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); |
| | _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); |
| | _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); |
| | _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); |
| | _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); |
| | _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); |
| |
|
| | _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); |
| | _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); |
| | _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); |
| | _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); |
| | _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); |
| | _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); |
| | _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); |
| | _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); |
| |
|
| | vst3_u8(dst7, _dst0); |
| | vst3_u8(dst6, _dst1); |
| | vst3_u8(dst7 + dst_step, _dst2); |
| | vst3_u8(dst6 + dst_step, _dst3); |
| | vst3_u8(dst7 + 2 * dst_step, _dst4); |
| | vst3_u8(dst6 + 2 * dst_step, _dst5); |
| | vst3_u8(dst7 + 3 * dst_step, _dst6); |
| | vst3_u8(dst6 + 3 * dst_step, _dst7); |
| |
|
| | src0 += 3 * 8; |
| | src1 += 3 * 8; |
| |
|
| | dst7 += 4 * dst_step; |
| | dst6 += 4 * dst_step; |
| | } |
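| | // the armv7 inline assembly below performs the same 8x8 transpose: |
| | // vld3.u8 deinterleaves the three channels, vtrn.u8/u16/u32 swap |
| | // lanes, and vst3.u8 re-interleaves eight rotated rows per iteration |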
| | #else |
| | if (nn > 0) |
| | { |
| | asm volatile( |
| | "0: \n" |
| | "pld [%1, #192] \n" |
| | "vld3.u8 {d0-d2}, [%1], %10 \n" |
| | |
| | "pld [%2, #192] \n" |
| | "vld3.u8 {d4-d6}, [%2], %10 \n" |
| | |
| | "pld [%1, #192] \n" |
| | "vld3.u8 {d8-d10}, [%1], %10 \n" |
| | |
| | "vtrn.u8 q0, q2 \n" |
| | "vtrn.u8 d2, d6 \n" |
| | |
| | "pld [%2, #192] \n" |
| | "vld3.u8 {d12-d14}, [%2], %10\n" |
| | |
| | "pld [%1, #192] \n" |
| | "vld3.u8 {d16-d18}, [%1], %10\n" |
| | |
| | "vtrn.u8 q4, q6 \n" |
| | "vtrn.u8 d10, d14 \n" |
| | |
| | "pld [%2, #192] \n" |
| | "vld3.u8 {d20-d22}, [%2], %10\n" |
| | |
| | "pld [%1, #192] \n" |
| | "vld3.u8 {d24-d26}, [%1], %10\n" |
| | |
| | "vtrn.u8 q8, q10 \n" |
| | "vtrn.u8 d18, d22 \n" |
| | |
| | "pld [%2, #192] \n" |
| | "vld3.u8 {d28-d30}, [%2], %10\n" |
| | |
| | "vtrn.u8 q12, q14 \n" |
| | "vtrn.u8 d26, d30 \n" |
| | |
| | "sub %1, %1, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q0, q4 \n" |
| | "vtrn.u16 d2, d10 \n" |
| | |
| | "sub %2, %2, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q2, q6 \n" |
| | "vtrn.u16 d6, d14 \n" |
| | |
| | "add %1, #24 \n" |
| | |
| | "vtrn.u16 q8, q12 \n" |
| | "vtrn.u16 d18, d26 \n" |
| | |
| | "add %2, #24 \n" |
| | |
| | "vtrn.u16 q10, q14 \n" |
| | "vtrn.u16 d22, d30 \n" |
| | |
| | "vtrn.u32 q0, q8 \n" |
| | "vtrn.u32 d2, d18 \n" |
| | |
| | "vtrn.u32 q2, q10 \n" |
| | "vst3.u8 {d0-d2}, [%3], %11 \n" |
| | "vtrn.u32 d6, d22 \n" |
| | |
| | "vtrn.u32 q4, q12 \n" |
| | "vst3.u8 {d4-d6}, [%4], %11 \n" |
| | "vtrn.u32 d10, d26 \n" |
| | |
| | "vtrn.u32 q6, q14 \n" |
| | "vst3.u8 {d8-d10}, [%3], %11 \n" |
| | "vtrn.u32 d14, d30 \n" |
| | |
| | "subs %0, #1 \n" |
| | |
| | "vst3.u8 {d16-d18}, [%3], %11\n" |
| | "vst3.u8 {d12-d14}, [%4], %11\n" |
| | "vst3.u8 {d20-d22}, [%4], %11\n" |
| | "vst3.u8 {d24-d26}, [%3], %11\n" |
| | "vst3.u8 {d28-d30}, [%4], %11\n" |
| | |
| | "bne 0b \n" |
| | : "=r"(nn), |
| | "=r"(src0), |
| | "=r"(src1), |
| | "=r"(dst7), |
| | "=r"(dst6) |
| | : "0"(nn), |
| | "1"(src0), |
| | "2"(src1), |
| | "3"(dst7), |
| | "4"(dst6), |
| | "r"(src_step), |
| | "r"(dst_step) |
| | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); |
| | } |
| | #endif |
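| | // scalar remainder: each iteration gathers one source column from |
| | // eight rows (src0/src1 plus three src_step offsets) into a single |
| | // 24-byte dst row, then steps one dst row upwards |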
| | for (; remain > 0; remain--) |
| | { |
| | dst7[0] = src0[0]; |
| | dst7[1] = src0[1]; |
| | dst7[2] = src0[2]; |
| | dst7[3] = src1[0]; |
| | dst7[4] = src1[1]; |
| | dst7[5] = src1[2]; |
| | dst7[6] = src0[0 + src_step]; |
| | dst7[7] = src0[1 + src_step]; |
| | dst7[8] = src0[2 + src_step]; |
| | dst7[9] = src1[0 + src_step]; |
| | dst7[10] = src1[1 + src_step]; |
| | dst7[11] = src1[2 + src_step]; |
| | dst7[12] = src0[0 + 2 * src_step]; |
| | dst7[13] = src0[1 + 2 * src_step]; |
| | dst7[14] = src0[2 + 2 * src_step]; |
| | dst7[15] = src1[0 + 2 * src_step]; |
| | dst7[16] = src1[1 + 2 * src_step]; |
| | dst7[17] = src1[2 + 2 * src_step]; |
| | dst7[18] = src0[0 + 3 * src_step]; |
| | dst7[19] = src0[1 + 3 * src_step]; |
| | dst7[20] = src0[2 + 3 * src_step]; |
| | dst7[21] = src1[0 + 3 * src_step]; |
| | dst7[22] = src1[1 + 3 * src_step]; |
| | dst7[23] = src1[2 + 3 * src_step]; |
| |
|
| | src0 += 3; |
| | src1 += 3; |
| |
|
| | dst7 -= stride; |
| | } |
| |
|
| | src0 += srcwgap + 7 * srcstride; |
| | } |
| | #endif |
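| | // scalar path for the rows left over after the 8-row NEON blocks |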
| | for (; y < srch; y++) |
| | { |
| | unsigned char* dst0 = dstend + y * 3; |
| |
|
| | int x = 0; |
| | for (; x < srcw; x++) |
| | { |
| | dst0[0] = src0[0]; |
| | dst0[1] = src0[1]; |
| | dst0[2] = src0[2]; |
| |
|
| | src0 += 3; |
| | dst0 -= stride; |
| | } |
| |
|
| | src0 += srcwgap; |
| | } |
| | } |
| |
|
| | static void kanna_rotate_8_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride) |
| | { |
| | const int srcwgap = srcstride - srcw * 4; |
| |
|
| | // point to the last dst pixel row |
| | unsigned char* dstend = dst + stride * (h - 1); |
| |
|
| | const unsigned char* src0 = src; |
| |
|
| | int y = 0; |
| | #if __ARM_NEON |
| | for (; y + 7 < srch; y += 8) |
| | { |
| | const unsigned char* src1 = src0 + srcstride; |
| |
|
| | unsigned char* dst7 = dstend + y * 4; |
| | unsigned char* dst6 = dstend + y * 4 - stride; |
| |
|
| | int src_step = 2 * srcstride; |
| | int dst_step = -2 * stride; |
| |
|
| | int nn = srcw >> 3; |
| | int remain = srcw - (nn << 3); |
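| | // nn counts 8-pixel blocks per row, remain the leftover width; |
| | // src_step/dst_step stride over two rows at a time because src0 and |
| | // src1 (and dst7/dst6) already cover adjacent rows |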
| |
|
| | #if __aarch64__ |
| | for (; nn > 0; nn--) |
| | { |
| | uint8x8x4_t _src0 = vld4_u8(src0); |
| | uint8x8x4_t _src1 = vld4_u8(src1); |
| |
|
| | uint8x8x4_t _src2 = vld4_u8(src0 + src_step); |
| | uint8x8x4_t _src3 = vld4_u8(src1 + src_step); |
| |
|
| | uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step); |
| | uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step); |
| |
|
| | uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step); |
| | uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step); |
| |
|
| | uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]); |
| | uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]); |
| | uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]); |
| | uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]); |
| |
|
| | uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]); |
| | uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]); |
| | uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]); |
| | uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]); |
| |
|
| | uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]); |
| | uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]); |
| | uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]); |
| | uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]); |
| |
|
| | uint8x8x2_t _src01t_a = vtrn_u8(_src0.val[3], _src1.val[3]); |
| | uint8x8x2_t _src23t_a = vtrn_u8(_src2.val[3], _src3.val[3]); |
| | uint8x8x2_t _src45t_a = vtrn_u8(_src4.val[3], _src5.val[3]); |
| | uint8x8x2_t _src67t_a = vtrn_u8(_src6.val[3], _src7.val[3]); |
| |
|
| | uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0])); |
| | uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1])); |
| | uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0])); |
| | uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1])); |
| |
|
| | uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0])); |
| | uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1])); |
| | uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0])); |
| | uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1])); |
| |
|
| | uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0])); |
| | uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1])); |
| | uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0])); |
| | uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1])); |
| |
|
| | uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[0]), vreinterpret_u16_u8(_src23t_a.val[0])); |
| | uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[1]), vreinterpret_u16_u8(_src23t_a.val[1])); |
| | uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[0]), vreinterpret_u16_u8(_src67t_a.val[0])); |
| | uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[1]), vreinterpret_u16_u8(_src67t_a.val[1])); |
| |
|
| | uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0])); |
| | uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0])); |
| | uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1])); |
| | uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1])); |
| |
|
| | uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0])); |
| | uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0])); |
| | uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1])); |
| | uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1])); |
| |
|
| | uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0])); |
| | uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0])); |
| | uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1])); |
| | uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1])); |
| |
|
| | uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[0]), vreinterpret_u32_u16(_src46tt_a.val[0])); |
| | uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[0]), vreinterpret_u32_u16(_src57tt_a.val[0])); |
| | uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[1]), vreinterpret_u32_u16(_src46tt_a.val[1])); |
| | uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[1]), vreinterpret_u32_u16(_src57tt_a.val[1])); |
| |
|
| | uint8x8x4_t _dst0; |
| | uint8x8x4_t _dst1; |
| | uint8x8x4_t _dst2; |
| | uint8x8x4_t _dst3; |
| | uint8x8x4_t _dst4; |
| | uint8x8x4_t _dst5; |
| | uint8x8x4_t _dst6; |
| | uint8x8x4_t _dst7; |
| |
|
| | _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]); |
| | _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]); |
| | _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]); |
| | _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]); |
| | _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]); |
| | _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]); |
| | _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]); |
| | _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]); |
| |
|
| | _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]); |
| | _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]); |
| | _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]); |
| | _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]); |
| | _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]); |
| | _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]); |
| | _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]); |
| | _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]); |
| |
|
| | _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]); |
| | _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]); |
| | _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]); |
| | _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]); |
| | _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]); |
| | _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]); |
| | _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]); |
| | _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]); |
| |
|
| | _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]); |
| | _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]); |
| | _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]); |
| | _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]); |
| | _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]); |
| | _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]); |
| | _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]); |
| | _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]); |
| |
|
| | vst4_u8(dst7, _dst0); |
| | vst4_u8(dst6, _dst1); |
| | vst4_u8(dst7 + dst_step, _dst2); |
| | vst4_u8(dst6 + dst_step, _dst3); |
| | vst4_u8(dst7 + 2 * dst_step, _dst4); |
| | vst4_u8(dst6 + 2 * dst_step, _dst5); |
| | vst4_u8(dst7 + 3 * dst_step, _dst6); |
| | vst4_u8(dst6 + 3 * dst_step, _dst7); |
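| | // each vst4_u8 re-interleaves RGBA and writes one rotated row of |
| | // eight pixels; dst7/dst6 plus the dst_step offsets cover eight rows |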
| |
|
| | src0 += 4 * 8; |
| | src1 += 4 * 8; |
| |
|
| | dst7 += 4 * dst_step; |
| | dst6 += 4 * dst_step; |
| | } |
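| | // armv7 equivalent of the intrinsics above: vld4/vtrn/vst4 with the |
| | // same u8 -> u16 -> u32 transpose ladder |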
| | #else |
| | if (nn > 0) |
| | { |
| | asm volatile( |
| | "0: \n" |
| | "pld [%1, #256] \n" |
| | "vld4.u8 {d0-d3}, [%1], %10 \n" |
| | |
| | "pld [%2, #256] \n" |
| | "vld4.u8 {d4-d7}, [%2], %10 \n" |
| | |
| | "pld [%1, #256] \n" |
| | "vld4.u8 {d8-d11}, [%1], %10 \n" |
| | |
| | "vtrn.u8 q0, q2 \n" |
| | "vtrn.u8 q1, q3 \n" |
| | |
| | "pld [%2, #256] \n" |
| | "vld4.u8 {d12-d15}, [%2], %10\n" |
| | |
| | "pld [%1, #256] \n" |
| | "vld4.u8 {d16-d19}, [%1], %10\n" |
| | |
| | "vtrn.u8 q4, q6 \n" |
| | "vtrn.u8 q5, q7 \n" |
| | |
| | "pld [%2, #256] \n" |
| | "vld4.u8 {d20-d23}, [%2], %10\n" |
| | |
| | "pld [%1, #256] \n" |
| | "vld4.u8 {d24-d27}, [%1], %10\n" |
| | |
| | "vtrn.u8 q8, q10 \n" |
| | "vtrn.u8 q9, q11 \n" |
| | |
| | "pld [%2, #256] \n" |
| | "vld4.u8 {d28-d31}, [%2], %10\n" |
| | |
| | "vtrn.u8 q12, q14 \n" |
| | "vtrn.u8 q13, q15 \n" |
| | |
| | "sub %1, %1, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q0, q4 \n" |
| | "vtrn.u16 q1, q5 \n" |
| | |
| | "sub %2, %2, %10, lsl #2 \n" |
| | |
| | "vtrn.u16 q2, q6 \n" |
| | "vtrn.u16 q3, q7 \n" |
| | |
| | "add %1, #32 \n" |
| | |
| | "vtrn.u16 q8, q12 \n" |
| | "vtrn.u16 q9, q13 \n" |
| | |
| | "add %2, #32 \n" |
| | |
| | "vtrn.u16 q10, q14 \n" |
| | "vtrn.u16 q11, q15 \n" |
| | |
| | "vtrn.u32 q0, q8 \n" |
| | "vtrn.u32 q1, q9 \n" |
| | |
| | "vtrn.u32 q2, q10 \n" |
| | "vst4.u8 {d0-d3}, [%3], %11 \n" |
| | "vtrn.u32 q3, q11 \n" |
| | |
| | "vtrn.u32 q4, q12 \n" |
| | "vst4.u8 {d4-d7}, [%4], %11 \n" |
| | "vtrn.u32 q5, q13 \n" |
| | |
| | "vtrn.u32 q6, q14 \n" |
| | "vst4.u8 {d8-d11}, [%3], %11 \n" |
| | "vtrn.u32 q7, q15 \n" |
| | |
| | "subs %0, #1 \n" |
| | |
| | "vst4.u8 {d16-d19}, [%3], %11\n" |
| | "vst4.u8 {d12-d15}, [%4], %11\n" |
| | "vst4.u8 {d20-d23}, [%4], %11\n" |
| | "vst4.u8 {d24-d27}, [%3], %11\n" |
| | "vst4.u8 {d28-d31}, [%4], %11\n" |
| | |
| | "bne 0b \n" |
| | : "=r"(nn), |
| | "=r"(src0), |
| | "=r"(src1), |
| | "=r"(dst7), |
| | "=r"(dst6) |
| | : "0"(nn), |
| | "1"(src0), |
| | "2"(src1), |
| | "3"(dst7), |
| | "4"(dst6), |
| | "r"(src_step), |
| | "r"(dst_step) |
| | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); |
| | } |
| | #endif |
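| | // scalar remainder: copy one source column of eight RGBA pixels |
| | // (four row-pairs) into a 32-byte dst row per iteration |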
| | for (; remain > 0; remain--) |
| | { |
| | dst7[0] = src0[0]; |
| | dst7[1] = src0[1]; |
| | dst7[2] = src0[2]; |
| | dst7[3] = src0[3]; |
| | dst7[4] = src1[0]; |
| | dst7[5] = src1[1]; |
| | dst7[6] = src1[2]; |
| | dst7[7] = src1[3]; |
| | dst7[8] = src0[0 + src_step]; |
| | dst7[9] = src0[1 + src_step]; |
| | dst7[10] = src0[2 + src_step]; |
| | dst7[11] = src0[3 + src_step]; |
| | dst7[12] = src1[0 + src_step]; |
| | dst7[13] = src1[1 + src_step]; |
| | dst7[14] = src1[2 + src_step]; |
| | dst7[15] = src1[3 + src_step]; |
| | dst7[16] = src0[0 + 2 * src_step]; |
| | dst7[17] = src0[1 + 2 * src_step]; |
| | dst7[18] = src0[2 + 2 * src_step]; |
| | dst7[19] = src0[3 + 2 * src_step]; |
| | dst7[20] = src1[0 + 2 * src_step]; |
| | dst7[21] = src1[1 + 2 * src_step]; |
| | dst7[22] = src1[2 + 2 * src_step]; |
| | dst7[23] = src1[3 + 2 * src_step]; |
| | dst7[24] = src0[0 + 3 * src_step]; |
| | dst7[25] = src0[1 + 3 * src_step]; |
| | dst7[26] = src0[2 + 3 * src_step]; |
| | dst7[27] = src0[3 + 3 * src_step]; |
| | dst7[28] = src1[0 + 3 * src_step]; |
| | dst7[29] = src1[1 + 3 * src_step]; |
| | dst7[30] = src1[2 + 3 * src_step]; |
| | dst7[31] = src1[3 + 3 * src_step]; |
| |
|
| | src0 += 4; |
| | src1 += 4; |
| |
|
| | dst7 -= stride; |
| | } |
| |
|
| | src0 += srcwgap + 7 * srcstride; |
| | } |
| | #endif |
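| | // scalar path for the rows left over after the 8-row NEON blocks |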
| | for (; y < srch; y++) |
| | { |
| | unsigned char* dst0 = dstend + y * 4; |
| |
|
| | int x = 0; |
| | for (; x < srcw; x++) |
| | { |
| | dst0[0] = src0[0]; |
| | dst0[1] = src0[1]; |
| | dst0[2] = src0[2]; |
| | dst0[3] = src0[3]; |
| |
|
| | src0 += 4; |
| | dst0 -= stride; |
| | } |
| |
|
| | src0 += srcwgap; |
| | } |
| | } |
| |
|
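| | // convenience overloads: these assume tightly packed buffers, so |
| | // the stride is simply the width times the channel count |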
| | void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type) |
| | { |
| | return kanna_rotate_c1(src, srcw, srch, srcw, dst, w, h, w, type); |
| | } |
| |
|
| | void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type) |
| | { |
| | return kanna_rotate_c2(src, srcw, srch, srcw * 2, dst, w, h, w * 2, type); |
| | } |
| |
|
| | void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type) |
| | { |
| | return kanna_rotate_c3(src, srcw, srch, srcw * 3, dst, w, h, w * 3, type); |
| | } |
| |
|
| | void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type) |
| | { |
| | return kanna_rotate_c4(src, srcw, srch, srcw * 4, dst, w, h, w * 4, type); |
| | } |
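| | // illustrative usage (a sketch, not part of the library): rotate a |
| | // packed 640x480 RGB image by 90 degrees clockwise with type 6, |
| | // which follows EXIF orientation numbering; note that types 5-8 |
| | // swap the output dimensions: |
| | // |
| | //   std::vector<unsigned char> rgb(640 * 480 * 3); |
| | //   std::vector<unsigned char> rotated(480 * 640 * 3); |
| | //   ncnn::kanna_rotate_c3(rgb.data(), 640, 480, rotated.data(), 480, 640, 6); |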
| |
|
| | void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type) |
| | { |
| | // assert srcw == w && srch == h for type 1234 |
| | // assert srcw == h && srch == w for type 5678 |
| |
|
| | switch (type) |
| | { |
| | case 1: |
| | kanna_rotate_1_c1(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 2: |
| | kanna_rotate_2_c1(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 3: |
| | kanna_rotate_3_c1(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 4: |
| | kanna_rotate_4_c1(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 5: |
| | kanna_rotate_5_c1(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 6: |
| | kanna_rotate_6_c1(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 7: |
| | kanna_rotate_7_c1(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 8: |
| | kanna_rotate_8_c1(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | default: |
| | // unsupported rotate type |
| | break; |
| | } |
| | } |
| |
|
| | void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type) |
| | { |
| | // assert srcw == w && srch == h for type 1234 |
| | // assert srcw == h && srch == w for type 5678 |
| |
|
| | switch (type) |
| | { |
| | case 1: |
| | kanna_rotate_1_c2(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 2: |
| | kanna_rotate_2_c2(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 3: |
| | kanna_rotate_3_c2(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 4: |
| | kanna_rotate_4_c2(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 5: |
| | kanna_rotate_5_c2(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 6: |
| | kanna_rotate_6_c2(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 7: |
| | kanna_rotate_7_c2(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 8: |
| | kanna_rotate_8_c2(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | default: |
| | // unsupported rotate type |
| | break; |
| | } |
| | } |
| |
|
| | void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type) |
| | { |
| | // assert srcw == w && srch == h for type 1234 |
| | // assert srcw == h && srch == w for type 5678 |
| |
|
| | switch (type) |
| | { |
| | case 1: |
| | kanna_rotate_1_c3(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 2: |
| | kanna_rotate_2_c3(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 3: |
| | kanna_rotate_3_c3(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 4: |
| | kanna_rotate_4_c3(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 5: |
| | kanna_rotate_5_c3(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 6: |
| | kanna_rotate_6_c3(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 7: |
| | kanna_rotate_7_c3(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 8: |
| | kanna_rotate_8_c3(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | default: |
| | // unsupported rotate type |
| | break; |
| | } |
| | } |
| |
|
| | void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type) |
| | { |
| | // assert srcw == w && srch == h for type 1234 |
| | // assert srcw == h && srch == w for type 5678 |
| |
|
| | switch (type) |
| | { |
| | case 1: |
| | kanna_rotate_1_c4(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 2: |
| | kanna_rotate_2_c4(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 3: |
| | kanna_rotate_3_c4(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 4: |
| | kanna_rotate_4_c4(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 5: |
| | kanna_rotate_5_c4(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 6: |
| | kanna_rotate_6_c4(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 7: |
| | kanna_rotate_7_c4(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | case 8: |
| | kanna_rotate_8_c4(src, srcw, srch, srcstride, dst, w, h, stride); |
| | break; |
| | default: |
| | // unsupported rotate type |
| | break; |
| | } |
| | } |
| |
|
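| | // rotate the Y plane as single-channel data and the interleaved UV |
| | // plane as two-channel data so each U/V pair moves as one unit |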
| | void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type) |
| | { |
| | // assert srcw % 2 == 0 |
| | // assert srch % 2 == 0 |
| | // assert w % 2 == 0 |
| | // assert h % 2 == 0 |
| |
|
| | const unsigned char* srcY = src; |
| | unsigned char* dstY = dst; |
| | kanna_rotate_c1(srcY, srcw, srch, dstY, w, h, type); |
| |
|
| | const unsigned char* srcUV = src + srcw * srch; |
| | unsigned char* dstUV = dst + w * h; |
| | kanna_rotate_c2(srcUV, srcw / 2, srch / 2, dstUV, w / 2, h / 2, type); |
| | } |
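| | // illustrative usage (a sketch): rotate a 640x480 NV21/NV12 frame |
| | // by 180 degrees (type 3); the buffer holds the Y plane followed by |
| | // the interleaved UV plane, w * h * 3 / 2 bytes in total: |
| | // |
| | //   std::vector<unsigned char> yuv(640 * 480 * 3 / 2); |
| | //   std::vector<unsigned char> out(640 * 480 * 3 / 2); |
| | //   ncnn::kanna_rotate_yuv420sp(yuv.data(), 640, 480, out.data(), 640, 480, 3); |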
| | #endif // NCNN_PIXEL_ROTATE |
| |
|
| | } // namespace ncnn |
| |
|