| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | static void conv1x1s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) |
| | { |
| | int inch = bottom_blob.c; |
| |
|
| | int outw = top_blob.w; |
| | int outh = top_blob.h; |
| | int outch = top_blob.c; |
| |
|
| | const float* kernel = _kernel; |
| | const float* bias = _bias; |
| |
|
| | #pragma omp parallel for num_threads(opt.num_threads) |
| | for (int p = 0; p < outch; p++) |
| | { |
| | Mat out = top_blob.channel(p); |
| |
|
| | const float bias0 = bias ? bias[p] : 0.f; |
| |
|
| | out.fill(bias0); |
| |
|
| | int q = 0; |
| |
|
| | for (; q + 3 < inch; q += 4) |
| | { |
| | float* outptr = out; |
| |
|
| | const float* img0 = bottom_blob.channel(q); |
| | const float* img1 = bottom_blob.channel(q + 1); |
| | const float* img2 = bottom_blob.channel(q + 2); |
| | const float* img3 = bottom_blob.channel(q + 3); |
| |
|
| | const float* kernel0 = kernel + p * inch + q; |
| | const float k0 = kernel0[0]; |
| | const float k1 = kernel0[1]; |
| | const float k2 = kernel0[2]; |
| | const float k3 = kernel0[3]; |
| |
|
| | const float* r0 = img0; |
| | const float* r1 = img1; |
| | const float* r2 = img2; |
| | const float* r3 = img3; |
| |
|
| | int size = outw * outh; |
| |
|
| | int remain = size; |
| |
|
| | for (; remain > 0; remain--) |
| | { |
| | float sum = *r0 * k0; |
| | float sum1 = *r1 * k1; |
| | float sum2 = *r2 * k2; |
| | float sum3 = *r3 * k3; |
| |
|
| | *outptr += sum + sum1 + sum2 + sum3; |
| |
|
| | r0++; |
| | r1++; |
| | r2++; |
| | r3++; |
| | outptr++; |
| | } |
| | } |
| |
|
| | for (; q < inch; q++) |
| | { |
| | float* outptr = out; |
| |
|
| | const float* img0 = bottom_blob.channel(q); |
| |
|
| | const float* kernel0 = kernel + p * inch + q; |
| | const float k0 = kernel0[0]; |
| |
|
| | const float* r0 = img0; |
| |
|
| | int size = outw * outh; |
| |
|
| | int remain = size; |
| |
|
| | for (; remain > 0; remain--) |
| | { |
| | float sum = *r0 * k0; |
| |
|
| | *outptr += sum; |
| |
|
| | r0++; |
| | outptr++; |
| | } |
| | } |
| | } |
| | } |
| |
|
| | static void conv1x1s2_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) |
| | { |
| | int w = bottom_blob.w; |
| | int inch = bottom_blob.c; |
| |
|
| | int outw = top_blob.w; |
| | int outh = top_blob.h; |
| | int outch = top_blob.c; |
| |
|
| | const int tailstep = w - 2 * outw + w; |
| |
|
| | const float* kernel = _kernel; |
| | const float* bias = _bias; |
| |
|
| | #pragma omp parallel for num_threads(opt.num_threads) |
| | for (int p = 0; p < outch; p++) |
| | { |
| | Mat out = top_blob.channel(p); |
| |
|
| | const float bias0 = bias ? bias[p] : 0.f; |
| |
|
| | out.fill(bias0); |
| |
|
| | int q = 0; |
| |
|
| | for (; q + 3 < inch; q += 4) |
| | { |
| | float* outptr = out; |
| |
|
| | const float* img0 = bottom_blob.channel(q); |
| | const float* img1 = bottom_blob.channel(q + 1); |
| | const float* img2 = bottom_blob.channel(q + 2); |
| | const float* img3 = bottom_blob.channel(q + 3); |
| |
|
| | const float* kernel0 = kernel + p * inch + q; |
| | const float k0 = kernel0[0]; |
| | const float k1 = kernel0[1]; |
| | const float k2 = kernel0[2]; |
| | const float k3 = kernel0[3]; |
| |
|
| | const float* r0 = img0; |
| | const float* r1 = img1; |
| | const float* r2 = img2; |
| | const float* r3 = img3; |
| |
|
| | for (int i = 0; i < outh; i++) |
| | { |
| | int remain = outw; |
| |
|
| | for (; remain > 0; remain--) |
| | { |
| | float sum = *r0 * k0; |
| | float sum1 = *r1 * k1; |
| | float sum2 = *r2 * k2; |
| | float sum3 = *r3 * k3; |
| |
|
| | *outptr += sum + sum1 + sum2 + sum3; |
| |
|
| | r0 += 2; |
| | r1 += 2; |
| | r2 += 2; |
| | r3 += 2; |
| | outptr++; |
| | } |
| |
|
| | r0 += tailstep; |
| | r1 += tailstep; |
| | r2 += tailstep; |
| | r3 += tailstep; |
| | } |
| | } |
| |
|
| | for (; q < inch; q++) |
| | { |
| | float* outptr = out; |
| |
|
| | const float* img0 = bottom_blob.channel(q); |
| |
|
| | const float* kernel0 = kernel + p * inch + q; |
| | const float k0 = kernel0[0]; |
| |
|
| | const float* r0 = img0; |
| |
|
| | for (int i = 0; i < outh; i++) |
| | { |
| | int remain = outw; |
| |
|
| | for (; remain > 0; remain--) |
| | { |
| | float sum = *r0 * k0; |
| |
|
| | *outptr += sum; |
| |
|
| | r0 += 2; |
| | outptr++; |
| | } |
| |
|
| | r0 += tailstep; |
| | } |
| | } |
| | } |
| | } |
| |
|