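// int8 3x3 convolution kernels for ARM: a Winograd F(4x4, 3x3) path for
// stride 1 (the kernel transform lives here; the input/output transforms and
// the packed dot kernel it calls are defined alongside) and a direct packed
// path for stride 2; both paths accumulate raw int32 sums into top_blob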
static void conv3x3s1_winograd43_transform_kernel_int8_neon(const Mat& kernel, Mat& kernel_tm_packed, int inch, int outch, const Option& opt)
{
    Mat kernel_tm(6 * 6, inch, outch, (size_t)2u);

    const short ktm[6][3] = {
        {6, 0, 0},
        {-4, -4, -4},
        {-4, 4, -4},
        {1, 2, 4},
        {1, -2, 4},
        {0, 0, 6}
    };
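    // ktm is an integer-scaled Winograd F(4x4, 3x3) kernel-transform matrix G
    // (the compensating scale is applied during the output transform);
    // applying it twice below expands each 3x3 kernel g into a 6x6 tile
    // U = G * g * G^T, kept exact in 16-bit integer arithmetic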

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9;
            short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);

            const signed char* k0 = kernel0;
            const signed char* k1 = kernel0 + 3;
            const signed char* k2 = kernel0 + 6;

            // first pass: apply the 1-D transform to the three kernel rows
            short tmp[6][3];
            for (int i = 0; i < 6; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // second pass: transform along the other axis to get the 6x6 tile
            for (int j = 0; j < 6; j++)
            {
                short* tmpp = &tmp[j][0];

                for (int i = 0; i < 6; i++)
                {
                    kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // interleave: src is 36 x inch x outch; dst packs output channels into
    // blocks of 8/4/1 (NEON) or 2/1 (scalar) so the dot kernel can load them
    // contiguously
#if __ARM_NEON
    if (outch >= 8)
    {
        kernel_tm_packed.create(inch, 36, outch / 8 + (outch % 8) / 4 + outch % 4, (size_t)2u * 8, 8);
    }
    else if (outch >= 4)
    {
        kernel_tm_packed.create(inch, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4);
    }
#else
    if (outch >= 2)
    {
        kernel_tm_packed.create(inch, 36, outch / 2 + outch % 2, (size_t)2u * 2, 2);
    }
#endif
    // this else pairs with "else if (outch >= 4)" on NEON builds and with
    // "if (outch >= 2)" otherwise
    else
    {
        kernel_tm_packed.create(inch, 36, outch, (size_t)2u, 1);
    }
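    // example: outch = 10 yields 10/8 + (10%8)/4 + 10%4 = 1 + 0 + 2 blocks,
    // i.e. one 8-wide block for p = 0..7 and two single blocks for p = 8 and
    // p = 9; the single-channel index p / 8 + (p % 8) / 4 + p % 4 used below
    // addresses exactly those trailing blocks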

    int p = 0;
#if __ARM_NEON
    for (; p + 7 < outch; p += 8)
    {
        Mat g0 = kernel_tm_packed.channel(p / 8);

        for (int k = 0; k < 36; k++)
        {
            short* g00 = g0.row<short>(k);

            for (int q = 0; q < inch; q++)
            {
                for (int i = 0; i < 8; i++)
                {
                    g00[0] = kernel_tm.channel(p + i).row<const short>(q)[k];
                    g00++;
                }
            }
        }
    }
    for (; p + 3 < outch; p += 4)
    {
        const Mat k0 = kernel_tm.channel(p);
        const Mat k1 = kernel_tm.channel(p + 1);
        const Mat k2 = kernel_tm.channel(p + 2);
        const Mat k3 = kernel_tm.channel(p + 3);

        Mat g0 = kernel_tm_packed.channel(p / 8 + (p % 8) / 4);

        for (int k = 0; k < 36; k++)
        {
            short* g00 = g0.row<short>(k);

            for (int q = 0; q < inch; q++)
            {
                g00[0] = k0.row<const short>(q)[k];
                g00[1] = k1.row<const short>(q)[k];
                g00[2] = k2.row<const short>(q)[k];
                g00[3] = k3.row<const short>(q)[k];
                g00 += 4;
            }
        }
    }
#else
    for (; p + 1 < outch; p += 2)
    {
        const Mat k0 = kernel_tm.channel(p);
        const Mat k1 = kernel_tm.channel(p + 1);

        Mat g0 = kernel_tm_packed.channel(p / 2);

        for (int k = 0; k < 36; k++)
        {
            short* g00 = g0.row<short>(k);

            int q = 0;
#if __ARM_FEATURE_SIMD32
            for (; q + 1 < inch; q += 2)
            {
                g00[0] = k0.row<const short>(q)[k];
                g00[2] = k1.row<const short>(q)[k];
                g00[1] = k0.row<const short>(q + 1)[k];
                g00[3] = k1.row<const short>(q + 1)[k];
                g00 += 4;
            }
#endif
            for (; q < inch; q++)
            {
                g00[0] = k0.row<const short>(q)[k];
                g00[1] = k1.row<const short>(q)[k];
                g00 += 2;
            }
        }
    }
#endif
    for (; p < outch; p++)
    {
        const Mat k0 = kernel_tm.channel(p);

#if __ARM_NEON
        Mat g0 = kernel_tm_packed.channel(p / 8 + (p % 8) / 4 + p % 4);
#else
        Mat g0 = kernel_tm_packed.channel(p / 2 + p % 2);
#endif

        for (int k = 0; k < 36; k++)
        {
            short* g00 = g0.row<short>(k);

            for (int q = 0; q < inch; q++)
            {
                g00[0] = k0.row<const short>(q)[k];
                g00 += 1;
            }
        }
    }
}
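// resulting kernel_tm_packed layout: for each of the 36 tile positions, a
// pack8 block row holds inch x 8 shorts interleaved by output channel, a
// pack4 block inch x 4, and a trailing single block inch x 1 - the load
// order expected by convolution_winograd_dot_int8_neon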

static void conv3x3s1_winograd43_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // pad output extents to a multiple of the 4x4 tile, and the input to
    // that plus a 2-pixel apron for the overlapping 6x6 input tiles
    Mat bottom_blob_bordered = bottom_blob;

    outw = (outw + 3) / 4 * 4;
    outh = (outh + 3) / 4 * 4;

    w = outw + 2;
    h = outh + 2;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);

    // transform input: one 36-coefficient tile per 4x4 output block
    Mat bottom_blob_tm;
    {
        int w_tiles = outw / 4;
        int h_tiles = outh / 4;
        const int tiles = w_tiles * h_tiles;

        bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator);
        conv3x3s1_winograd43_transform_input_int8_neon(bottom_blob_bordered, bottom_blob_tm, opt);
    }
    bottom_blob_bordered = Mat();

    // element-wise dot product in the transformed domain
    Mat top_blob_tm;
    convolution_winograd_dot_int8_neon(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);

    // transform output back to the spatial domain
    Mat top_blob_bordered;
    if (outw == top_blob.w && outh == top_blob.h)
    {
        top_blob_bordered = top_blob;
    }
    else
    {
        top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator);
    }
    {
        conv3x3s1_winograd43_transform_output_int8_neon(top_blob_tm, top_blob_bordered, opt);
    }

    // crop the tile padding off the result
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}
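// example: a 30x30 int8 input convolved to a 28x28 output needs no extra
// tile padding here (28 is a multiple of 4), so the bordered input stays
// 30x30 (28 + 2 apron) and the input transform emits (28/4) * (28/4) = 49
// tiles of 36 coefficients per input channel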

static void conv3x3s2_transform_kernel_int8_neon(const Mat& _kernel, Mat& kernel_tm, int inch, int outch)
{
    kernel_tm.create(8 * 9, inch, outch / 8 + outch % 8, (size_t)1u);

    const signed char* kernel = _kernel;

    int p = 0;
    for (; p + 7 < outch; p += 8)
    {
        const signed char* k0 = kernel + (p + 0) * inch * 9;
        const signed char* k1 = kernel + (p + 1) * inch * 9;
        const signed char* k2 = kernel + (p + 2) * inch * 9;
        const signed char* k3 = kernel + (p + 3) * inch * 9;
        const signed char* k4 = kernel + (p + 4) * inch * 9;
        const signed char* k5 = kernel + (p + 5) * inch * 9;
        const signed char* k6 = kernel + (p + 6) * inch * 9;
        const signed char* k7 = kernel + (p + 7) * inch * 9;

        signed char* ktmp = kernel_tm.channel(p / 8);

        for (int q = 0; q < inch; q++)
        {
            for (int k = 0; k < 9; k++)
            {
                ktmp[0] = k0[k];
                ktmp[1] = k1[k];
                ktmp[2] = k2[k];
                ktmp[3] = k3[k];
                ktmp[4] = k4[k];
                ktmp[5] = k5[k];
                ktmp[6] = k6[k];
                ktmp[7] = k7[k];
                ktmp += 8;
            }

            k0 += 9;
            k1 += 9;
            k2 += 9;
            k3 += 9;
            k4 += 9;
            k5 += 9;
            k6 += 9;
            k7 += 9;
        }
    }
    for (; p < outch; p++)
    {
        const signed char* k0 = kernel + (p + 0) * inch * 9;

        signed char* ktmp = kernel_tm.channel(p / 8 + p % 8);

        for (int q = 0; q < inch; q++)
        {
            for (int k = 0; k < 9; k++)
            {
                ktmp[k] = k0[k];
            }
            ktmp += 9;

            k0 += 9;
        }
    }
}
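// packed layout produced above: each 8-channel block streams inch x 9 taps x
// 8 output channels (72 bytes per input channel); each leftover channel
// stores its 9 taps contiguously, 9 bytes per input channel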

static void conv3x3s2_packed_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int tailstep = w - 2 * outw + w;
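    // the inner loop advances each input row pointer by 2 * outw bytes;
    // tailstep skips the unread remainder of the row (w - 2 * outw) plus one
    // full row, since stride 2 also drops every other input row vertically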

    int nn_outch = outch >> 3;
    int remain_outch_start = nn_outch << 3;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 8;

        Mat out0 = top_blob.channel(p + 0);
        Mat out1 = top_blob.channel(p + 1);
        Mat out2 = top_blob.channel(p + 2);
        Mat out3 = top_blob.channel(p + 3);
        Mat out4 = top_blob.channel(p + 4);
        Mat out5 = top_blob.channel(p + 5);
        Mat out6 = top_blob.channel(p + 6);
        Mat out7 = top_blob.channel(p + 7);

        out0.fill(0);
        out1.fill(0);
        out2.fill(0);
        out3.fill(0);
        out4.fill(0);
        out5.fill(0);
        out6.fill(0);
        out7.fill(0);

        const signed char* ktmp = _kernel.channel(p / 8);

        for (int q = 0; q < inch; q++)
        {
            int* outptr0 = out0;
            int* outptr1 = out1;
            int* outptr2 = out2;
            int* outptr3 = out3;
            int* outptr4 = out4;
            int* outptr5 = out5;
            int* outptr6 = out6;
            int* outptr7 = out7;

            const signed char* img0 = bottom_blob.channel(q);

            const signed char* r0 = img0;
            const signed char* r1 = img0 + w;
            const signed char* r2 = img0 + w * 2;

            int i = 0;

            for (; i < outh; i++)
            {
#if __ARM_NEON
#if __aarch64__
                int nn = outw >> 3;
                int remain = outw & 7;
#else
                int nn = outw >> 2;
                int remain = outw & 3;
#endif
#else
                int remain = outw;
#endif

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
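                    // one pass of this loop produces 8 consecutive output
                    // pixels for all 8 output channels: v8-v23 carry the
                    // int32 partial sums loaded from the eight outptrs, the
                    // 72 kernel bytes (3 rows x 3 taps x 8 channels) are
                    // widened to 16 bits and multiply-accumulated against
                    // stride-2 deinterleaved input rows, and ktmp is rewound
                    // by 72 bytes at the end so the next iteration reuses the
                    // same kernel block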
                    asm volatile(
                        "0: \n"

                        "ld1 {v0.8b, v1.8b, v2.8b}, [%12], #24 \n"
                        "ld2 {v3.8b, v4.8b}, [%9], #16 \n"
                        "ld2 {v5.8b, v6.8b}, [%9] \n"

                        "ld1 {v8.4s, v9.4s}, [%1] \n"
                        "ld1 {v10.4s, v11.4s}, [%2] \n"
                        "ld1 {v12.4s, v13.4s}, [%3] \n"
                        "ld1 {v14.4s, v15.4s}, [%4] \n"
                        "ld1 {v16.4s, v17.4s}, [%5] \n"
                        "ld1 {v18.4s, v19.4s}, [%6] \n"
                        "ld1 {v20.4s, v21.4s}, [%7] \n"
                        "ld1 {v22.4s, v23.4s}, [%8] \n"

                        "ext v7.8b, v3.8b, v5.8b, #1 \n"

                        "sshll v0.8h, v0.8b, #0 \n"
                        "sshll v1.8h, v1.8b, #0 \n"
                        "sshll v2.8h, v2.8b, #0 \n"
                        "sshll v3.8h, v3.8b, #0 \n"
                        "sshll v4.8h, v4.8b, #0 \n"
                        "sshll v7.8h, v7.8b, #0 \n"

                        "smlal v8.4s, v3.4h, v0.h[0] \n"
                        "smlal2 v9.4s, v3.8h, v0.h[0] \n"
                        "smlal v10.4s, v3.4h, v0.h[1] \n"
                        "smlal2 v11.4s, v3.8h, v0.h[1] \n"
                        "smlal v12.4s, v3.4h, v0.h[2] \n"
                        "smlal2 v13.4s, v3.8h, v0.h[2] \n"
                        "smlal v14.4s, v3.4h, v0.h[3] \n"
                        "smlal2 v15.4s, v3.8h, v0.h[3] \n"
                        "smlal v16.4s, v3.4h, v0.h[4] \n"
                        "smlal2 v17.4s, v3.8h, v0.h[4] \n"
                        "smlal v18.4s, v3.4h, v0.h[5] \n"
                        "smlal2 v19.4s, v3.8h, v0.h[5] \n"
                        "smlal v20.4s, v3.4h, v0.h[6] \n"
                        "smlal2 v21.4s, v3.8h, v0.h[6] \n"
                        "smlal v22.4s, v3.4h, v0.h[7] \n"
                        "smlal2 v23.4s, v3.8h, v0.h[7] \n"

                        "smlal v8.4s, v4.4h, v1.h[0] \n"
                        "smlal2 v9.4s, v4.8h, v1.h[0] \n"
                        "smlal v10.4s, v4.4h, v1.h[1] \n"
                        "smlal2 v11.4s, v4.8h, v1.h[1] \n"
                        "smlal v12.4s, v4.4h, v1.h[2] \n"
                        "smlal2 v13.4s, v4.8h, v1.h[2] \n"
                        "smlal v14.4s, v4.4h, v1.h[3] \n"
                        "smlal2 v15.4s, v4.8h, v1.h[3] \n"
                        "smlal v16.4s, v4.4h, v1.h[4] \n"
                        "smlal2 v17.4s, v4.8h, v1.h[4] \n"
                        "smlal v18.4s, v4.4h, v1.h[5] \n"
                        "smlal2 v19.4s, v4.8h, v1.h[5] \n"
                        "smlal v20.4s, v4.4h, v1.h[6] \n"
                        "smlal2 v21.4s, v4.8h, v1.h[6] \n"
                        "smlal v22.4s, v4.4h, v1.h[7] \n"
                        "smlal2 v23.4s, v4.8h, v1.h[7] \n"

                        "smlal v8.4s, v7.4h, v2.h[0] \n"
                        "smlal2 v9.4s, v7.8h, v2.h[0] \n"
                        "smlal v10.4s, v7.4h, v2.h[1] \n"
                        "smlal2 v11.4s, v7.8h, v2.h[1] \n"
                        "smlal v12.4s, v7.4h, v2.h[2] \n"
                        "smlal2 v13.4s, v7.8h, v2.h[2] \n"
                        "smlal v14.4s, v7.4h, v2.h[3] \n"
                        "smlal2 v15.4s, v7.8h, v2.h[3] \n"
                        "smlal v16.4s, v7.4h, v2.h[4] \n"
                        "smlal2 v17.4s, v7.8h, v2.h[4] \n"
                        "smlal v18.4s, v7.4h, v2.h[5] \n"
                        "smlal2 v19.4s, v7.8h, v2.h[5] \n"
                        "smlal v20.4s, v7.4h, v2.h[6] \n"
                        "smlal2 v21.4s, v7.8h, v2.h[6] \n"
                        "smlal v22.4s, v7.4h, v2.h[7] \n"
                        "smlal2 v23.4s, v7.8h, v2.h[7] \n"

                        "ld1 {v0.8b, v1.8b, v2.8b}, [%12], #24 \n"
                        "ld2 {v3.8b, v4.8b}, [%10], #16 \n"
                        "ld2 {v5.8b, v6.8b}, [%10] \n"

                        "ext v7.8b, v3.8b, v5.8b, #1 \n"

                        "sshll v0.8h, v0.8b, #0 \n"
                        "sshll v1.8h, v1.8b, #0 \n"
                        "sshll v2.8h, v2.8b, #0 \n"
                        "sshll v3.8h, v3.8b, #0 \n"
                        "sshll v4.8h, v4.8b, #0 \n"
                        "sshll v7.8h, v7.8b, #0 \n"

                        "smlal v8.4s, v3.4h, v0.h[0] \n"
                        "smlal2 v9.4s, v3.8h, v0.h[0] \n"
                        "smlal v10.4s, v3.4h, v0.h[1] \n"
                        "smlal2 v11.4s, v3.8h, v0.h[1] \n"
                        "smlal v12.4s, v3.4h, v0.h[2] \n"
                        "smlal2 v13.4s, v3.8h, v0.h[2] \n"
                        "smlal v14.4s, v3.4h, v0.h[3] \n"
                        "smlal2 v15.4s, v3.8h, v0.h[3] \n"
                        "smlal v16.4s, v3.4h, v0.h[4] \n"
                        "smlal2 v17.4s, v3.8h, v0.h[4] \n"
                        "smlal v18.4s, v3.4h, v0.h[5] \n"
                        "smlal2 v19.4s, v3.8h, v0.h[5] \n"
                        "smlal v20.4s, v3.4h, v0.h[6] \n"
                        "smlal2 v21.4s, v3.8h, v0.h[6] \n"
                        "smlal v22.4s, v3.4h, v0.h[7] \n"
                        "smlal2 v23.4s, v3.8h, v0.h[7] \n"

                        "smlal v8.4s, v4.4h, v1.h[0] \n"
                        "smlal2 v9.4s, v4.8h, v1.h[0] \n"
                        "smlal v10.4s, v4.4h, v1.h[1] \n"
                        "smlal2 v11.4s, v4.8h, v1.h[1] \n"
                        "smlal v12.4s, v4.4h, v1.h[2] \n"
                        "smlal2 v13.4s, v4.8h, v1.h[2] \n"
                        "smlal v14.4s, v4.4h, v1.h[3] \n"
                        "smlal2 v15.4s, v4.8h, v1.h[3] \n"
                        "smlal v16.4s, v4.4h, v1.h[4] \n"
                        "smlal2 v17.4s, v4.8h, v1.h[4] \n"
                        "smlal v18.4s, v4.4h, v1.h[5] \n"
                        "smlal2 v19.4s, v4.8h, v1.h[5] \n"
                        "smlal v20.4s, v4.4h, v1.h[6] \n"
                        "smlal2 v21.4s, v4.8h, v1.h[6] \n"
                        "smlal v22.4s, v4.4h, v1.h[7] \n"
                        "smlal2 v23.4s, v4.8h, v1.h[7] \n"

                        "smlal v8.4s, v7.4h, v2.h[0] \n"
                        "smlal2 v9.4s, v7.8h, v2.h[0] \n"
                        "smlal v10.4s, v7.4h, v2.h[1] \n"
                        "smlal2 v11.4s, v7.8h, v2.h[1] \n"
                        "smlal v12.4s, v7.4h, v2.h[2] \n"
                        "smlal2 v13.4s, v7.8h, v2.h[2] \n"
                        "smlal v14.4s, v7.4h, v2.h[3] \n"
                        "smlal2 v15.4s, v7.8h, v2.h[3] \n"
                        "smlal v16.4s, v7.4h, v2.h[4] \n"
                        "smlal2 v17.4s, v7.8h, v2.h[4] \n"
                        "smlal v18.4s, v7.4h, v2.h[5] \n"
                        "smlal2 v19.4s, v7.8h, v2.h[5] \n"
                        "smlal v20.4s, v7.4h, v2.h[6] \n"
                        "smlal2 v21.4s, v7.8h, v2.h[6] \n"
                        "smlal v22.4s, v7.4h, v2.h[7] \n"
                        "smlal2 v23.4s, v7.8h, v2.h[7] \n"

                        "ld1 {v0.8b, v1.8b, v2.8b}, [%12], #24 \n"
                        "ld2 {v3.8b, v4.8b}, [%11], #16 \n"
                        "ld2 {v5.8b, v6.8b}, [%11] \n"

                        "ext v7.8b, v3.8b, v5.8b, #1 \n"

                        "sshll v0.8h, v0.8b, #0 \n"
                        "sshll v1.8h, v1.8b, #0 \n"
                        "sshll v2.8h, v2.8b, #0 \n"
                        "sshll v3.8h, v3.8b, #0 \n"
                        "sshll v4.8h, v4.8b, #0 \n"
                        "sshll v7.8h, v7.8b, #0 \n"

                        "smlal v8.4s, v3.4h, v0.h[0] \n"
                        "smlal2 v9.4s, v3.8h, v0.h[0] \n"
                        "smlal v10.4s, v3.4h, v0.h[1] \n"
                        "smlal2 v11.4s, v3.8h, v0.h[1] \n"
                        "smlal v12.4s, v3.4h, v0.h[2] \n"
                        "smlal2 v13.4s, v3.8h, v0.h[2] \n"
                        "smlal v14.4s, v3.4h, v0.h[3] \n"
                        "smlal2 v15.4s, v3.8h, v0.h[3] \n"
                        "smlal v16.4s, v3.4h, v0.h[4] \n"
                        "smlal2 v17.4s, v3.8h, v0.h[4] \n"
                        "smlal v18.4s, v3.4h, v0.h[5] \n"
                        "smlal2 v19.4s, v3.8h, v0.h[5] \n"
                        "smlal v20.4s, v3.4h, v0.h[6] \n"
                        "smlal2 v21.4s, v3.8h, v0.h[6] \n"
                        "smlal v22.4s, v3.4h, v0.h[7] \n"
                        "smlal2 v23.4s, v3.8h, v0.h[7] \n"

                        "smlal v8.4s, v4.4h, v1.h[0] \n"
                        "smlal2 v9.4s, v4.8h, v1.h[0] \n"
                        "smlal v10.4s, v4.4h, v1.h[1] \n"
                        "smlal2 v11.4s, v4.8h, v1.h[1] \n"
                        "smlal v12.4s, v4.4h, v1.h[2] \n"
                        "smlal2 v13.4s, v4.8h, v1.h[2] \n"
                        "smlal v14.4s, v4.4h, v1.h[3] \n"
                        "smlal2 v15.4s, v4.8h, v1.h[3] \n"
                        "smlal v16.4s, v4.4h, v1.h[4] \n"
                        "smlal2 v17.4s, v4.8h, v1.h[4] \n"
                        "smlal v18.4s, v4.4h, v1.h[5] \n"
                        "smlal2 v19.4s, v4.8h, v1.h[5] \n"
                        "smlal v20.4s, v4.4h, v1.h[6] \n"
                        "smlal2 v21.4s, v4.8h, v1.h[6] \n"
                        "smlal v22.4s, v4.4h, v1.h[7] \n"
                        "smlal2 v23.4s, v4.8h, v1.h[7] \n"

                        "smlal v8.4s, v7.4h, v2.h[0] \n"
                        "smlal2 v9.4s, v7.8h, v2.h[0] \n"
                        "smlal v10.4s, v7.4h, v2.h[1] \n"
                        "smlal2 v11.4s, v7.8h, v2.h[1] \n"
                        "smlal v12.4s, v7.4h, v2.h[2] \n"
                        "smlal2 v13.4s, v7.8h, v2.h[2] \n"
                        "smlal v14.4s, v7.4h, v2.h[3] \n"
                        "smlal2 v15.4s, v7.8h, v2.h[3] \n"
                        "smlal v16.4s, v7.4h, v2.h[4] \n"
                        "smlal2 v17.4s, v7.8h, v2.h[4] \n"
                        "smlal v18.4s, v7.4h, v2.h[5] \n"
                        "smlal2 v19.4s, v7.8h, v2.h[5] \n"
                        "smlal v20.4s, v7.4h, v2.h[6] \n"
                        "smlal2 v21.4s, v7.8h, v2.h[6] \n"
                        "smlal v22.4s, v7.4h, v2.h[7] \n"
                        "smlal2 v23.4s, v7.8h, v2.h[7] \n"

                        "st1 {v8.4s, v9.4s}, [%1], #32 \n"
                        "st1 {v10.4s, v11.4s}, [%2], #32 \n"
                        "st1 {v12.4s, v13.4s}, [%3], #32 \n"
                        "st1 {v14.4s, v15.4s}, [%4], #32 \n"
                        "st1 {v16.4s, v17.4s}, [%5], #32 \n"
                        "st1 {v18.4s, v19.4s}, [%6], #32 \n"
                        "st1 {v20.4s, v21.4s}, [%7], #32 \n"
                        "st1 {v22.4s, v23.4s}, [%8], #32 \n"

                        "subs %w0, %w0, #1 \n"
                        "sub %12, %12, #72 \n"

                        "bne 0b \n"

                        : "=r"(nn),
                        "=r"(outptr0),
                        "=r"(outptr1),
                        "=r"(outptr2),
                        "=r"(outptr3),
                        "=r"(outptr4),
                        "=r"(outptr5),
                        "=r"(outptr6),
                        "=r"(outptr7),
                        "=r"(r0),
                        "=r"(r1),
                        "=r"(r2),
                        "=r"(ktmp)
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(outptr2),
                        "4"(outptr3),
                        "5"(outptr4),
                        "6"(outptr5),
                        "7"(outptr6),
                        "8"(outptr7),
                        "9"(r0),
                        "10"(r1),
                        "11"(r2),
                        "12"(ktmp)
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
                }
#else
                if (nn > 0)
                {
                    asm volatile(
                        "0: \n"
                        "pld [%1, #128] \n"
                        "vld1.s32 {d16-d17}, [%1] \n"
                        "pld [%2, #128] \n"
                        "vld1.s32 {d18-d19}, [%2] \n"
                        "pld [%3, #128] \n"
                        "vld1.s32 {d20-d21}, [%3] \n"
                        "pld [%4, #128] \n"
                        "vld1.s32 {d22-d23}, [%4] \n"

                        "pld [%9, #64] \n"
                        "vld2.s8 {d8-d9}, [%9] \n"
                        "add %9, #8 \n"
                        "pld [%12, #64] \n"
                        "vld1.s8 {d0-d2}, [%12]! \n"

                        "pld [%5, #128] \n"
                        "vld1.s32 {d24-d25}, [%5] \n"
                        "pld [%6, #128] \n"
                        "vld1.s32 {d26-d27}, [%6] \n"

                        "vmovl.s8 q2, d2 \n"
                        "vmovl.s8 q1, d1 \n"
                        "vmovl.s8 q0, d0 \n"
                        "vext.s8 d12, d8, d8, #1 \n"

                        "pld [%7, #128] \n"
                        "vld1.s32 {d28-d29}, [%7] \n"

                        "vmovl.s8 q5, d9 \n"
                        "vmovl.s8 q4, d8 \n"
                        "vmovl.s8 q6, d12 \n"

                        "pld [%8, #128] \n"
                        "vld1.s32 {d30-d31}, [%8] \n"

                        "vmlal.s16 q8, d8, d0[0] \n"
                        "vmlal.s16 q9, d8, d0[1] \n"
                        "vmlal.s16 q10, d8, d0[2] \n"
                        "vmlal.s16 q11, d8, d0[3] \n"
                        "vmlal.s16 q12, d8, d1[0] \n"
                        "vmlal.s16 q13, d8, d1[1] \n"
                        "vmlal.s16 q14, d8, d1[2] \n"
                        "vmlal.s16 q15, d8, d1[3] \n"

                        "vmlal.s16 q8, d10, d2[0] \n"
                        "vmlal.s16 q9, d10, d2[1] \n"
                        "vmlal.s16 q10, d10, d2[2] \n"
                        "vmlal.s16 q11, d10, d2[3] \n"
                        "vmlal.s16 q12, d10, d3[0] \n"
                        "vmlal.s16 q13, d10, d3[1] \n"
                        "vmlal.s16 q14, d10, d3[2] \n"
                        "vmlal.s16 q15, d10, d3[3] \n"

                        "pld [%10, #64] \n"
                        "vld2.s8 {d8-d9}, [%10] \n"
                        "add %10, #8 \n"

                        "vmlal.s16 q8, d12, d4[0] \n"
                        "vmlal.s16 q9, d12, d4[1] \n"
                        "vmlal.s16 q10, d12, d4[2] \n"
                        "vmlal.s16 q11, d12, d4[3] \n"

                        "pld [%12, #64] \n"
                        "vld1.s8 {d0-d2}, [%12]! \n"

                        "vmlal.s16 q12, d12, d5[0] \n"
                        "vmlal.s16 q13, d12, d5[1] \n"
                        "vmlal.s16 q14, d12, d5[2] \n"
                        "vmlal.s16 q15, d12, d5[3] \n"

                        "vext.s8 d12, d8, d8, #1 \n"

                        "vmovl.s8 q2, d2 \n"
                        "vmovl.s8 q1, d1 \n"
                        "vmovl.s8 q0, d0 \n"
                        "vmovl.s8 q5, d9 \n"
                        "vmovl.s8 q4, d8 \n"
                        "vmovl.s8 q6, d12 \n"

                        "vmlal.s16 q8, d8, d0[0] \n"
                        "vmlal.s16 q9, d8, d0[1] \n"
                        "vmlal.s16 q10, d8, d0[2] \n"
                        "vmlal.s16 q11, d8, d0[3] \n"
                        "vmlal.s16 q12, d8, d1[0] \n"
                        "vmlal.s16 q13, d8, d1[1] \n"
                        "vmlal.s16 q14, d8, d1[2] \n"
                        "vmlal.s16 q15, d8, d1[3] \n"

                        "vmlal.s16 q8, d10, d2[0] \n"
                        "vmlal.s16 q9, d10, d2[1] \n"
                        "vmlal.s16 q10, d10, d2[2] \n"
                        "vmlal.s16 q11, d10, d2[3] \n"
                        "vmlal.s16 q12, d10, d3[0] \n"
                        "vmlal.s16 q13, d10, d3[1] \n"
                        "vmlal.s16 q14, d10, d3[2] \n"
                        "vmlal.s16 q15, d10, d3[3] \n"

                        "pld [%11, #64] \n"
                        "vld2.s8 {d8-d9}, [%11] \n"
                        "add %11, #8 \n"

                        "vmlal.s16 q8, d12, d4[0] \n"
                        "vmlal.s16 q9, d12, d4[1] \n"
                        "vmlal.s16 q10, d12, d4[2] \n"
                        "vmlal.s16 q11, d12, d4[3] \n"

                        "pld [%12, #64] \n"
                        "vld1.s8 {d0-d2}, [%12]! \n"

                        "vmlal.s16 q12, d12, d5[0] \n"
                        "vmlal.s16 q13, d12, d5[1] \n"
                        "vmlal.s16 q14, d12, d5[2] \n"
                        "vmlal.s16 q15, d12, d5[3] \n"

                        "vext.s8 d12, d8, d8, #1 \n"

                        "vmovl.s8 q2, d2 \n"
                        "vmovl.s8 q1, d1 \n"
                        "vmovl.s8 q0, d0 \n"
                        "vmovl.s8 q5, d9 \n"
                        "vmovl.s8 q4, d8 \n"
                        "vmovl.s8 q6, d12 \n"

                        "vmlal.s16 q8, d8, d0[0] \n"
                        "vmlal.s16 q9, d8, d0[1] \n"
                        "vmlal.s16 q10, d8, d0[2] \n"
                        "vmlal.s16 q11, d8, d0[3] \n"
                        "vmlal.s16 q12, d8, d1[0] \n"
                        "vmlal.s16 q13, d8, d1[1] \n"
                        "vmlal.s16 q14, d8, d1[2] \n"
                        "vmlal.s16 q15, d8, d1[3] \n"

                        "vmlal.s16 q8, d10, d2[0] \n"
                        "vmlal.s16 q9, d10, d2[1] \n"
                        "vmlal.s16 q10, d10, d2[2] \n"
                        "vmlal.s16 q11, d10, d2[3] \n"
                        "vmlal.s16 q12, d10, d3[0] \n"
                        "vmlal.s16 q13, d10, d3[1] \n"
                        "vmlal.s16 q14, d10, d3[2] \n"
                        "vmlal.s16 q15, d10, d3[3] \n"

                        "vmlal.s16 q8, d12, d4[0] \n"
                        "vmlal.s16 q9, d12, d4[1] \n"
                        "vmlal.s16 q10, d12, d4[2] \n"
                        "vmlal.s16 q11, d12, d4[3] \n"
                        "vmlal.s16 q12, d12, d5[0] \n"
                        "vmlal.s16 q13, d12, d5[1] \n"
                        "vmlal.s16 q14, d12, d5[2] \n"
                        "vmlal.s16 q15, d12, d5[3] \n"

                        "sub %12, %12, #72 \n"
                        "vst1.s32 {d16-d17}, [%1]! \n"
                        "vst1.s32 {d18-d19}, [%2]! \n"
                        "vst1.s32 {d20-d21}, [%3]! \n"
                        "vst1.s32 {d22-d23}, [%4]! \n"
                        "subs %0, #1 \n"
                        "vst1.s32 {d24-d25}, [%5]! \n"
                        "vst1.s32 {d26-d27}, [%6]! \n"
                        "vst1.s32 {d28-d29}, [%7]! \n"
                        "vst1.s32 {d30-d31}, [%8]! \n"

                        "bne 0b \n"
                        : "=r"(nn),
                        "=r"(outptr0),
                        "=r"(outptr1),
                        "=r"(outptr2),
                        "=r"(outptr3),
                        "=r"(outptr4),
                        "=r"(outptr5),
                        "=r"(outptr6),
                        "=r"(outptr7),
                        "=r"(r0),
                        "=r"(r1),
                        "=r"(r2),
                        "=r"(ktmp)
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(outptr2),
                        "4"(outptr3),
                        "5"(outptr4),
                        "6"(outptr5),
                        "7"(outptr6),
                        "8"(outptr7),
                        "9"(r0),
                        "10"(r1),
                        "11"(r2),
                        "12"(ktmp)
                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif
#endif
                for (; remain > 0; remain--)
                {
#if __ARM_NEON
#if __aarch64__
                    int8x8_t _r0_s8 = vld1_s8(r0);
                    int8x8_t _r1_s8 = vld1_s8(r1);
                    int8x8_t _r2_s8 = vld1_s8(r2);

                    int16x8_t _r0 = vmovl_s8(_r0_s8);
                    int16x8_t _r1 = vmovl_s8(_r1_s8);
                    int16x8_t _r2 = vmovl_s8(_r2_s8);

                    int32x4_t _sum03 = {};
                    int32x4_t _sum47 = {};

                    _sum03 = vld1q_lane_s32(outptr0, _sum03, 0);
                    _sum03 = vld1q_lane_s32(outptr1, _sum03, 1);
                    _sum03 = vld1q_lane_s32(outptr2, _sum03, 2);
                    _sum03 = vld1q_lane_s32(outptr3, _sum03, 3);
                    _sum47 = vld1q_lane_s32(outptr4, _sum47, 0);
                    _sum47 = vld1q_lane_s32(outptr5, _sum47, 1);
                    _sum47 = vld1q_lane_s32(outptr6, _sum47, 2);
                    _sum47 = vld1q_lane_s32(outptr7, _sum47, 3);

                    int8x8_t _k0_8 = vld1_s8(ktmp);
                    int8x8_t _k1_8 = vld1_s8(ktmp + 8);
                    int8x8_t _k2_8 = vld1_s8(ktmp + 16);

                    int16x8_t _k0 = vmovl_s8(_k0_8);
                    int16x8_t _k1 = vmovl_s8(_k1_8);
                    int16x8_t _k2 = vmovl_s8(_k2_8);

                    int32x4_t _sum0 = vmull_laneq_s16(vget_low_s16(_k0), _r0, 0);
                    int32x4_t _sum0n = vmull_laneq_s16(vget_high_s16(_k0), _r0, 0);
                    int32x4_t _sum1 = vmull_laneq_s16(vget_low_s16(_k1), _r0, 1);
                    int32x4_t _sum1n = vmull_laneq_s16(vget_high_s16(_k1), _r0, 1);
                    _sum03 = vmlal_laneq_s16(_sum03, vget_low_s16(_k2), _r0, 2);
                    _sum47 = vmlal_laneq_s16(_sum47, vget_high_s16(_k2), _r0, 2);

                    _k0_8 = vld1_s8(ktmp + 24);
                    _k1_8 = vld1_s8(ktmp + 32);
                    _k2_8 = vld1_s8(ktmp + 40);

                    _k0 = vmovl_s8(_k0_8);
                    _k1 = vmovl_s8(_k1_8);
                    _k2 = vmovl_s8(_k2_8);

                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_k0), _r1, 0);
                    _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_k0), _r1, 0);
                    _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_k1), _r1, 1);
                    _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r1, 1);
                    _sum03 = vmlal_laneq_s16(_sum03, vget_low_s16(_k2), _r1, 2);
                    _sum47 = vmlal_laneq_s16(_sum47, vget_high_s16(_k2), _r1, 2);

                    _k0_8 = vld1_s8(ktmp + 48);
                    _k1_8 = vld1_s8(ktmp + 56);
                    _k2_8 = vld1_s8(ktmp + 64);

                    _k0 = vmovl_s8(_k0_8);
                    _k1 = vmovl_s8(_k1_8);
                    _k2 = vmovl_s8(_k2_8);

                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_k0), _r2, 0);
                    _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_k0), _r2, 0);
                    _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_k1), _r2, 1);
                    _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r2, 1);
                    _sum03 = vmlal_laneq_s16(_sum03, vget_low_s16(_k2), _r2, 2);
                    _sum47 = vmlal_laneq_s16(_sum47, vget_high_s16(_k2), _r2, 2);

                    _sum0 = vaddq_s32(_sum0, _sum1);
                    _sum0n = vaddq_s32(_sum0n, _sum1n);
                    _sum03 = vaddq_s32(_sum03, _sum0);
                    _sum47 = vaddq_s32(_sum47, _sum0n);

                    vst1q_lane_s32(outptr0, _sum03, 0);
                    vst1q_lane_s32(outptr1, _sum03, 1);
                    vst1q_lane_s32(outptr2, _sum03, 2);
                    vst1q_lane_s32(outptr3, _sum03, 3);
                    vst1q_lane_s32(outptr4, _sum47, 0);
                    vst1q_lane_s32(outptr5, _sum47, 1);
                    vst1q_lane_s32(outptr6, _sum47, 2);
                    vst1q_lane_s32(outptr7, _sum47, 3);

                    outptr0++;
                    outptr1++;
                    outptr2++;
                    outptr3++;
                    outptr4++;
                    outptr5++;
                    outptr6++;
                    outptr7++;
#else
                    asm volatile(
                        "pld [%8, #64] \n"
                        "vld1.s8 {d0}, [%8] \n"
                        "pld [%9, #64] \n"
                        "vld1.s8 {d2}, [%9] \n"
                        "pld [%10, #64] \n"
                        "vld1.s8 {d4}, [%10] \n"

                        "pld [%11, #64] \n"
                        "vld1.s8 {d6-d8}, [%11]! \n"

                        "vmovl.s8 q0, d0 \n"
                        "vmovl.s8 q1, d2 \n"
                        "vmovl.s8 q2, d4 \n"

                        "vmovl.s8 q5, d8 \n"
                        "vmovl.s8 q4, d7 \n"
                        "vmovl.s8 q3, d6 \n"

                        "vld1.s32 {d20[0]}, [%0] \n"
                        "vld1.s32 {d20[1]}, [%1] \n"
                        "vld1.s32 {d21[0]}, [%2] \n"
                        "vld1.s32 {d21[1]}, [%3] \n"

                        "pld [%11, #64] \n"
                        "vld1.s8 {d24-d26}, [%11]! \n"
                        "vmovl.s8 q14, d26 \n"
                        "vmovl.s8 q13, d25 \n"
                        "vmovl.s8 q12, d24 \n"

                        "vld1.s32 {d22[0]}, [%4] \n"
                        "vld1.s32 {d22[1]}, [%5] \n"
                        "vld1.s32 {d23[0]}, [%6] \n"
                        "vld1.s32 {d23[1]}, [%7] \n"

                        "vmull.s16 q6, d6, d0[0] \n"
                        "vmull.s16 q7, d7, d0[0] \n"
                        "vmull.s16 q8, d8, d0[1] \n"
                        "vmull.s16 q9, d9, d0[1] \n"
                        "vmlal.s16 q10, d10, d0[2] \n"
                        "vmlal.s16 q11, d11, d0[2] \n"

                        "pld [%11, #64] \n"
                        "vld1.s8 {d6-d8}, [%11]! \n"
                        "vmovl.s8 q5, d8 \n"
                        "vmovl.s8 q4, d7 \n"
                        "vmovl.s8 q3, d6 \n"

                        "vmlal.s16 q6, d24, d2[0] \n"
                        "vmlal.s16 q7, d25, d2[0] \n"
                        "vmlal.s16 q8, d26, d2[1] \n"
                        "vmlal.s16 q9, d27, d2[1] \n"
                        "vmlal.s16 q10, d28, d2[2] \n"
                        "vmlal.s16 q11, d29, d2[2] \n"

                        "vmlal.s16 q6, d6, d4[0] \n"
                        "vmlal.s16 q7, d7, d4[0] \n"
                        "vmlal.s16 q8, d8, d4[1] \n"
                        "vmlal.s16 q9, d9, d4[1] \n"
                        "vmlal.s16 q10, d10, d4[2] \n"
                        "vmlal.s16 q11, d11, d4[2] \n"

                        "vadd.s32 q8, q8, q6 \n"
                        "vadd.s32 q9, q9, q7 \n"

                        "sub %11, %11, #72 \n"

                        "vadd.s32 q10, q10, q8 \n"
                        "vadd.s32 q11, q11, q9 \n"

                        "vst1.s32 {d20[0]}, [%0]! \n"
                        "vst1.s32 {d20[1]}, [%1]! \n"
                        "vst1.s32 {d21[0]}, [%2]! \n"
                        "vst1.s32 {d21[1]}, [%3]! \n"
                        "vst1.s32 {d22[0]}, [%4]! \n"
                        "vst1.s32 {d22[1]}, [%5]! \n"
                        "vst1.s32 {d23[0]}, [%6]! \n"
                        "vst1.s32 {d23[1]}, [%7]! \n"

                        : "=r"(outptr0),
                        "=r"(outptr1),
                        "=r"(outptr2),
                        "=r"(outptr3),
                        "=r"(outptr4),
                        "=r"(outptr5),
                        "=r"(outptr6),
                        "=r"(outptr7),
                        "=r"(r0),
                        "=r"(r1),
                        "=r"(r2),
                        "=r"(ktmp)
                        : "0"(outptr0),
                        "1"(outptr1),
                        "2"(outptr2),
                        "3"(outptr3),
                        "4"(outptr4),
                        "5"(outptr5),
                        "6"(outptr6),
                        "7"(outptr7),
                        "8"(r0),
                        "9"(r1),
                        "10"(r2),
                        "11"(ktmp)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif
#else
                    int sum0 = 0;
                    int sum1 = 0;
                    int sum2 = 0;
                    int sum3 = 0;
                    int sum4 = 0;
                    int sum5 = 0;
                    int sum6 = 0;
                    int sum7 = 0;

                    sum0 += (int)r0[0] * ktmp[0];
                    sum1 += (int)r0[0] * ktmp[1];
                    sum2 += (int)r0[0] * ktmp[2];
                    sum3 += (int)r0[0] * ktmp[3];
                    sum4 += (int)r0[0] * ktmp[4];
                    sum5 += (int)r0[0] * ktmp[5];
                    sum6 += (int)r0[0] * ktmp[6];
                    sum7 += (int)r0[0] * ktmp[7];
                    ktmp += 8;

                    sum0 += (int)r0[1] * ktmp[0];
                    sum1 += (int)r0[1] * ktmp[1];
                    sum2 += (int)r0[1] * ktmp[2];
                    sum3 += (int)r0[1] * ktmp[3];
                    sum4 += (int)r0[1] * ktmp[4];
                    sum5 += (int)r0[1] * ktmp[5];
                    sum6 += (int)r0[1] * ktmp[6];
                    sum7 += (int)r0[1] * ktmp[7];
                    ktmp += 8;

                    sum0 += (int)r0[2] * ktmp[0];
                    sum1 += (int)r0[2] * ktmp[1];
                    sum2 += (int)r0[2] * ktmp[2];
                    sum3 += (int)r0[2] * ktmp[3];
                    sum4 += (int)r0[2] * ktmp[4];
                    sum5 += (int)r0[2] * ktmp[5];
                    sum6 += (int)r0[2] * ktmp[6];
                    sum7 += (int)r0[2] * ktmp[7];
                    ktmp += 8;

                    sum0 += (int)r1[0] * ktmp[0];
                    sum1 += (int)r1[0] * ktmp[1];
                    sum2 += (int)r1[0] * ktmp[2];
                    sum3 += (int)r1[0] * ktmp[3];
                    sum4 += (int)r1[0] * ktmp[4];
                    sum5 += (int)r1[0] * ktmp[5];
                    sum6 += (int)r1[0] * ktmp[6];
                    sum7 += (int)r1[0] * ktmp[7];
                    ktmp += 8;

                    sum0 += (int)r1[1] * ktmp[0];
                    sum1 += (int)r1[1] * ktmp[1];
                    sum2 += (int)r1[1] * ktmp[2];
                    sum3 += (int)r1[1] * ktmp[3];
                    sum4 += (int)r1[1] * ktmp[4];
                    sum5 += (int)r1[1] * ktmp[5];
                    sum6 += (int)r1[1] * ktmp[6];
                    sum7 += (int)r1[1] * ktmp[7];
                    ktmp += 8;

                    sum0 += (int)r1[2] * ktmp[0];
                    sum1 += (int)r1[2] * ktmp[1];
                    sum2 += (int)r1[2] * ktmp[2];
                    sum3 += (int)r1[2] * ktmp[3];
                    sum4 += (int)r1[2] * ktmp[4];
                    sum5 += (int)r1[2] * ktmp[5];
                    sum6 += (int)r1[2] * ktmp[6];
                    sum7 += (int)r1[2] * ktmp[7];
                    ktmp += 8;

                    sum0 += (int)r2[0] * ktmp[0];
                    sum1 += (int)r2[0] * ktmp[1];
                    sum2 += (int)r2[0] * ktmp[2];
                    sum3 += (int)r2[0] * ktmp[3];
                    sum4 += (int)r2[0] * ktmp[4];
                    sum5 += (int)r2[0] * ktmp[5];
                    sum6 += (int)r2[0] * ktmp[6];
                    sum7 += (int)r2[0] * ktmp[7];
                    ktmp += 8;

                    sum0 += (int)r2[1] * ktmp[0];
                    sum1 += (int)r2[1] * ktmp[1];
                    sum2 += (int)r2[1] * ktmp[2];
                    sum3 += (int)r2[1] * ktmp[3];
                    sum4 += (int)r2[1] * ktmp[4];
                    sum5 += (int)r2[1] * ktmp[5];
                    sum6 += (int)r2[1] * ktmp[6];
                    sum7 += (int)r2[1] * ktmp[7];
                    ktmp += 8;

                    sum0 += (int)r2[2] * ktmp[0];
                    sum1 += (int)r2[2] * ktmp[1];
                    sum2 += (int)r2[2] * ktmp[2];
                    sum3 += (int)r2[2] * ktmp[3];
                    sum4 += (int)r2[2] * ktmp[4];
                    sum5 += (int)r2[2] * ktmp[5];
                    sum6 += (int)r2[2] * ktmp[6];
                    sum7 += (int)r2[2] * ktmp[7];
                    ktmp += 8;

                    *outptr0 += sum0;
                    *outptr1 += sum1;
                    *outptr2 += sum2;
                    *outptr3 += sum3;
                    *outptr4 += sum4;
                    *outptr5 += sum5;
                    *outptr6 += sum6;
                    *outptr7 += sum7;

                    ktmp -= 8 * 9;

                    outptr0++;
                    outptr1++;
                    outptr2++;
                    outptr3++;
                    outptr4++;
                    outptr5++;
                    outptr6++;
                    outptr7++;
#endif
                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            ktmp += 8 * 9;
        }
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        out.fill(0);

        const signed char* ktmp = _kernel.channel(p / 8 + p % 8);

        for (int q = 0; q < inch; q++)
        {
            int* outptr = out;

            const signed char* img0 = bottom_blob.channel(q);

            const signed char* r0 = img0;
            const signed char* r1 = img0 + w;
            const signed char* r2 = img0 + w * 2;

            int i = 0;

            for (; i < outh; i++)
            {
#if __ARM_NEON
                int nn = outw >> 3;
                int remain = outw & 7;
#else
                int remain = outw;
#endif

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
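                    // 8 output pixels per iteration for a single output
                    // channel: the 9 widened kernel taps sit in v0/v1, the
                    // three stride-2 deinterleaved input rows are multiplied
                    // in, and the int32 partial sums loaded into v14/v15 from
                    // outptr are accumulated before the store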
                    asm volatile(
                        "0: \n"

                        "ld1 {v0.8b, v1.8b}, [%5] \n"
                        "ld2 {v2.8b, v3.8b}, [%2], #16 \n"
                        "ld2 {v4.8b, v5.8b}, [%2] \n"

                        "ld2 {v6.8b, v7.8b}, [%3], #16 \n"
                        "ld2 {v8.8b, v9.8b}, [%3] \n"

                        "ld2 {v10.8b, v11.8b}, [%4], #16 \n"
                        "ld2 {v12.8b, v13.8b}, [%4] \n"

                        "ld1 {v14.4s, v15.4s}, [%1] \n"

                        "ext v4.8b, v2.8b, v4.8b, #1 \n"
                        "ext v8.8b, v6.8b, v8.8b, #1 \n"
                        "ext v12.8b, v10.8b, v12.8b, #1 \n"

                        "sshll v0.8h, v0.8b, #0 \n"
                        "sshll v1.8h, v1.8b, #0 \n"
                        "sshll v2.8h, v2.8b, #0 \n"
                        "sshll v3.8h, v3.8b, #0 \n"
                        "sshll v4.8h, v4.8b, #0 \n"
                        "sshll v6.8h, v6.8b, #0 \n"
                        "sshll v7.8h, v7.8b, #0 \n"
                        "sshll v8.8h, v8.8b, #0 \n"
                        "sshll v10.8h, v10.8b, #0 \n"
                        "sshll v11.8h, v11.8b, #0 \n"
                        "sshll v12.8h, v12.8b, #0 \n"

                        "smull v16.4s, v2.4h, v0.h[0] \n"
                        "smull2 v17.4s, v2.8h, v0.h[0] \n"
                        "smull v18.4s, v3.4h, v0.h[1] \n"
                        "smull2 v19.4s, v3.8h, v0.h[1] \n"
                        "smlal v16.4s, v4.4h, v0.h[2] \n"
                        "smlal2 v17.4s, v4.8h, v0.h[2] \n"
                        "smlal v18.4s, v6.4h, v0.h[3] \n"
                        "smlal2 v19.4s, v6.8h, v0.h[3] \n"
                        "smlal v16.4s, v7.4h, v0.h[4] \n"
                        "smlal2 v17.4s, v7.8h, v0.h[4] \n"
                        "smlal v18.4s, v8.4h, v0.h[5] \n"
                        "smlal2 v19.4s, v8.8h, v0.h[5] \n"
                        "smlal v16.4s, v10.4h, v0.h[6] \n"
                        "smlal2 v17.4s, v10.8h, v0.h[6] \n"
                        "smlal v18.4s, v11.4h, v0.h[7] \n"
                        "smlal2 v19.4s, v11.8h, v0.h[7] \n"
                        "smlal v16.4s, v12.4h, v1.h[0] \n"
                        "smlal2 v17.4s, v12.8h, v1.h[0] \n"

| | "add v8.4s, v16.4s, v18.4s \n" |
| | "add v9.4s, v17.4s, v19.4s \n" |
| | |
| | "st1 {v8.4s, v9.4s}, [%1], #32 \n" |

                        "subs %w0, %w0, #1 \n"

                        "bne 0b \n"

                        : "=r"(nn),
                        "=r"(outptr),
                        "=r"(r0),
                        "=r"(r1),
                        "=r"(r2),
                        "=r"(ktmp)
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(ktmp)
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
                }
#else
                if (nn > 0)
                {
                    asm volatile(
                        "vld1.s8 {d0-d1}, [%5] \n"
                        "vmovl.s8 q1, d1 \n"
                        "vmovl.s8 q0, d0 \n"
                        "0: \n"
                        "pld [%2, #192] \n"
                        "vld2.s8 {d4-d5}, [%2]! \n"
                        "vld2.s8 {d8-d9}, [%2] \n"
                        "vld2.s8 {d10-d11}, [%3]! \n"
                        "vld2.s8 {d14-d15}, [%3] \n"
                        "vld2.s8 {d16-d17}, [%4]! \n"
                        "vld2.s8 {d20-d21}, [%4] \n"
                        "vld1.s32 {d22-d25}, [%1] \n"

                        "vext.s8 d8, d4, d8, #1 \n"
                        "vext.s8 d14, d10, d14, #1 \n"
                        "vext.s8 d20, d16, d20, #1 \n"

                        "vmovl.s8 q3, d5 \n"
                        "vmovl.s8 q2, d4 \n"
                        "vmovl.s8 q4, d8 \n"

                        "vmovl.s8 q6, d11 \n"
                        "vmovl.s8 q5, d10 \n"
                        "vmovl.s8 q7, d14 \n"

                        "vmovl.s8 q9, d17 \n"
                        "vmovl.s8 q8, d16 \n"
                        "vmovl.s8 q10, d20 \n"

                        "vmlal.s16 q11, d4, d0[0] \n"
                        "vmlal.s16 q12, d5, d0[0] \n"
                        "vmull.s16 q13, d6, d0[1] \n"
                        "vmull.s16 q14, d7, d0[1] \n"
                        "vmlal.s16 q11, d8, d0[2] \n"
                        "vmlal.s16 q12, d9, d0[2] \n"

                        "vmlal.s16 q13, d12, d1[0] \n"
                        "vmlal.s16 q14, d13, d1[0] \n"
                        "vmlal.s16 q11, d10, d0[3] \n"
                        "vmlal.s16 q12, d11, d0[3] \n"
                        "vmlal.s16 q13, d14, d1[1] \n"
                        "vmlal.s16 q14, d15, d1[1] \n"

                        "vmlal.s16 q11, d16, d1[2] \n"
                        "vmlal.s16 q12, d17, d1[2] \n"
                        "vmlal.s16 q13, d18, d1[3] \n"
                        "vmlal.s16 q14, d19, d1[3] \n"
                        "vmlal.s16 q11, d20, d2[0] \n"
                        "vmlal.s16 q12, d21, d2[0] \n"

                        "vadd.s32 q11, q11, q13 \n"
                        "vadd.s32 q12, q12, q14 \n"

                        "vst1.32 {d22-d25}, [%1]! \n"

                        "subs %0, #1 \n"
                        "bne 0b \n"
                        : "=r"(nn),
                        "=r"(outptr),
                        "=r"(r0),
                        "=r"(r1),
                        "=r"(r2),
                        "=r"(ktmp)
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(ktmp)
                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif
#endif
                if (remain > 0)
                {
#if __ARM_NEON
                    int8x8_t _k01234567s8 = vld1_s8(ktmp);
                    int8x8_t _k8xxxxxxxs8 = vld1_s8(ktmp + 8);
                    int8x8_t _k34567xxxs8 = vext_s8(_k01234567s8, _k01234567s8, 3);
                    int8x8_t _k678xxxxxs8 = vext_s8(_k01234567s8, _k8xxxxxxxs8, 6);
                    int16x8_t _k0123_s16 = vmovl_s8(_k01234567s8);
                    int16x8_t _k3456_s16 = vmovl_s8(_k34567xxxs8);
                    int16x8_t _k678x_s16 = vmovl_s8(_k678xxxxxs8);
#endif
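                    // scalar-width tail, one output pixel at a time: vext
                    // builds tap vectors whose low lanes are {k0,k1,k2,k3},
                    // {k3,k4,k5,k6} and {k6,k7,k8,x}; lanes 0-2 of each
                    // product line up with one input row, while the junk in
                    // lane 3 is replaced by the previous output value before
                    // the horizontal add, so the reduction yields the full
                    // 3x3 dot product plus the accumulator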
                    for (; remain > 0; remain--)
                    {
#if __ARM_NEON
                        int8x8_t _r00s8 = vld1_s8(r0);
                        int8x8_t _r10s8 = vld1_s8(r1);
                        int8x8_t _r20s8 = vld1_s8(r2);

                        int16x8_t _r00s16 = vmovl_s8(_r00s8);
                        int16x8_t _r10s16 = vmovl_s8(_r10s8);
                        int16x8_t _r20s16 = vmovl_s8(_r20s8);

                        int32x4_t _sum = vmull_s16(vget_low_s16(_r00s16), vget_low_s16(_k0123_s16));
                        _sum = vmlal_s16(_sum, vget_low_s16(_r10s16), vget_low_s16(_k3456_s16));
                        _sum = vmlal_s16(_sum, vget_low_s16(_r20s16), vget_low_s16(_k678x_s16));

                        _sum = vsetq_lane_s32(*outptr, _sum, 3);

#if __aarch64__
                        *outptr = vaddvq_s32(_sum);
#else
                        int32x2_t _ss = vadd_s32(vget_low_s32(_sum), vget_high_s32(_sum));
                        _ss = vpadd_s32(_ss, _ss);

                        *outptr = vget_lane_s32(_ss, 0);
#endif
#else
                        int sum = 0;

                        sum += (int)r0[0] * ktmp[0];
                        sum += (int)r0[1] * ktmp[1];
                        sum += (int)r0[2] * ktmp[2];
                        sum += (int)r1[0] * ktmp[3];
                        sum += (int)r1[1] * ktmp[4];
                        sum += (int)r1[2] * ktmp[5];
                        sum += (int)r2[0] * ktmp[6];
                        sum += (int)r2[1] * ktmp[7];
                        sum += (int)r2[2] * ktmp[8];

                        *outptr += sum;
#endif
                        r0 += 2;
                        r1 += 2;
                        r2 += 2;
                        outptr++;
                    }
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            ktmp += 9;
        }
    }
}
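// usage sketch (illustrative only; in ncnn these statics are driven by the
// int8 Convolution layer):
//
//   Mat kernel_tm;
//   conv3x3s2_transform_kernel_int8_neon(weight_data, kernel_tm, inch, outch); // once, at load time
//   conv3x3s2_packed_int8_neon(bottom_int8, top_int32, kernel_tm, opt);        // per inference
//
// top_blob must be pre-allocated with 4-byte elements, since both paths write
// raw int32 accumulators; requantization happens in the caller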