| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | static void conv5x5s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) |
| | { |
| | int w = bottom_blob.w; |
| | int inch = bottom_blob.c; |
| |
|
| | int outw = top_blob.w; |
| | int outh = top_blob.h; |
| | int outch = top_blob.c; |
| |
|
| | const float* kernel = _kernel; |
| | const float* bias = _bias; |
| |
|
| | #pragma omp parallel for num_threads(opt.num_threads) |
| | for (int p = 0; p < outch; p++) |
| | { |
| | Mat out = top_blob.channel(p); |
| |
|
| | const float bias0 = bias ? bias[p] : 0.f; |
| |
|
| | out.fill(bias0); |
| |
|
| | for (int q = 0; q < inch; q++) |
| | { |
| | float* outptr = out; |
| | float* outptr2 = outptr + outw; |
| |
|
| | const float* img0 = bottom_blob.channel(q); |
| |
|
| | const float* kernel0 = kernel + p * inch * 25 + q * 25; |
| |
|
| | const float* r0 = img0; |
| | const float* r1 = img0 + w; |
| | const float* r2 = img0 + w * 2; |
| | const float* r3 = img0 + w * 3; |
| | const float* r4 = img0 + w * 4; |
| | const float* r5 = img0 + w * 5; |
| |
|
| | const float* k0 = kernel0; |
| | const float* k1 = kernel0 + 5; |
| | const float* k2 = kernel0 + 10; |
| | const float* k3 = kernel0 + 15; |
| | const float* k4 = kernel0 + 20; |
| |
|
| | int i = 0; |
| |
|
| | for (; i + 1 < outh; i += 2) |
| | { |
| | int remain = outw; |
| |
|
| | for (; remain > 0; remain--) |
| | { |
| | float sum = 0; |
| | float sum2 = 0; |
| |
|
| | sum += r0[0] * k0[0]; |
| | sum += r0[1] * k0[1]; |
| | sum += r0[2] * k0[2]; |
| | sum += r0[3] * k0[3]; |
| | sum += r0[4] * k0[4]; |
| |
|
| | sum += r1[0] * k1[0]; |
| | sum += r1[1] * k1[1]; |
| | sum += r1[2] * k1[2]; |
| | sum += r1[3] * k1[3]; |
| | sum += r1[4] * k1[4]; |
| |
|
| | sum += r2[0] * k2[0]; |
| | sum += r2[1] * k2[1]; |
| | sum += r2[2] * k2[2]; |
| | sum += r2[3] * k2[3]; |
| | sum += r2[4] * k2[4]; |
| |
|
| | sum += r3[0] * k3[0]; |
| | sum += r3[1] * k3[1]; |
| | sum += r3[2] * k3[2]; |
| | sum += r3[3] * k3[3]; |
| | sum += r3[4] * k3[4]; |
| |
|
| | sum += r4[0] * k4[0]; |
| | sum += r4[1] * k4[1]; |
| | sum += r4[2] * k4[2]; |
| | sum += r4[3] * k4[3]; |
| | sum += r4[4] * k4[4]; |
| |
|
| | sum2 += r1[0] * k0[0]; |
| | sum2 += r1[1] * k0[1]; |
| | sum2 += r1[2] * k0[2]; |
| | sum2 += r1[3] * k0[3]; |
| | sum2 += r1[4] * k0[4]; |
| |
|
| | sum2 += r2[0] * k1[0]; |
| | sum2 += r2[1] * k1[1]; |
| | sum2 += r2[2] * k1[2]; |
| | sum2 += r2[3] * k1[3]; |
| | sum2 += r2[4] * k1[4]; |
| |
|
| | sum2 += r3[0] * k2[0]; |
| | sum2 += r3[1] * k2[1]; |
| | sum2 += r3[2] * k2[2]; |
| | sum2 += r3[3] * k2[3]; |
| | sum2 += r3[4] * k2[4]; |
| |
|
| | sum2 += r4[0] * k3[0]; |
| | sum2 += r4[1] * k3[1]; |
| | sum2 += r4[2] * k3[2]; |
| | sum2 += r4[3] * k3[3]; |
| | sum2 += r4[4] * k3[4]; |
| |
|
| | sum2 += r5[0] * k4[0]; |
| | sum2 += r5[1] * k4[1]; |
| | sum2 += r5[2] * k4[2]; |
| | sum2 += r5[3] * k4[3]; |
| | sum2 += r5[4] * k4[4]; |
| |
|
| | *outptr += sum; |
| | *outptr2 += sum2; |
| |
|
| | r0++; |
| | r1++; |
| | r2++; |
| | r3++; |
| | r4++; |
| | r5++; |
| | outptr++; |
| | outptr2++; |
| | } |
| |
|
| | r0 += 4 + w; |
| | r1 += 4 + w; |
| | r2 += 4 + w; |
| | r3 += 4 + w; |
| | r4 += 4 + w; |
| | r5 += 4 + w; |
| |
|
| | outptr += outw; |
| | outptr2 += outw; |
| | } |
| |
|
| | for (; i < outh; i++) |
| | { |
| | int remain = outw; |
| |
|
| | for (; remain > 0; remain--) |
| | { |
| | float sum = 0; |
| |
|
| | sum += r0[0] * k0[0]; |
| | sum += r0[1] * k0[1]; |
| | sum += r0[2] * k0[2]; |
| | sum += r0[3] * k0[3]; |
| | sum += r0[4] * k0[4]; |
| |
|
| | sum += r1[0] * k1[0]; |
| | sum += r1[1] * k1[1]; |
| | sum += r1[2] * k1[2]; |
| | sum += r1[3] * k1[3]; |
| | sum += r1[4] * k1[4]; |
| |
|
| | sum += r2[0] * k2[0]; |
| | sum += r2[1] * k2[1]; |
| | sum += r2[2] * k2[2]; |
| | sum += r2[3] * k2[3]; |
| | sum += r2[4] * k2[4]; |
| |
|
| | sum += r3[0] * k3[0]; |
| | sum += r3[1] * k3[1]; |
| | sum += r3[2] * k3[2]; |
| | sum += r3[3] * k3[3]; |
| | sum += r3[4] * k3[4]; |
| |
|
| | sum += r4[0] * k4[0]; |
| | sum += r4[1] * k4[1]; |
| | sum += r4[2] * k4[2]; |
| | sum += r4[3] * k4[3]; |
| | sum += r4[4] * k4[4]; |
| |
|
| | *outptr += sum; |
| |
|
| | r0++; |
| | r1++; |
| | r2++; |
| | r3++; |
| | r4++; |
| | outptr++; |
| | } |
| |
|
| | r0 += 4; |
| | r1 += 4; |
| | r2 += 4; |
| | r3 += 4; |
| | r4 += 4; |
| | } |
| | } |
| | } |
| | } |
| |
|