| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
#version 450

// Shader interface: extensions, shape constants, resource bindings and
// push constants used by main().

// fp16 extensions are requested only when the corresponding NCNN_* macro
// (defined outside this file) evaluates to non-zero.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Specialization constants, ids 0..6, all defaulting to 0.
// main() reads every one of them through psc(...), which is defined
// elsewhere. NOTE(review): presumably psc() falls back to the push-constant
// member of the same name when the constant is left at 0 -- confirm in the
// shared shader preamble.
#define shape_constant_id_offset 0

// Input blob shape: width, height, channel count and per-channel stride
// (used as gz * cstep + y * w + x by the buffer path of main()).
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

// Stride between the 36 transformed planes in the output buffer.
layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0;

// Number of tiles along x and y; also the dispatch bounds checked in main().
layout (constant_id = shape_constant_id_offset + 5) const int block_x = 0;
layout (constant_id = shape_constant_id_offset + 6) const int block_y = 0;

// Binding 0 is the input, binding 1 the (write-only) transformed output.
// sfp / unfp / imfmtc1 are storage-type macros supplied from outside this file.
#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc1) writeonly uniform unfp image3D bottom_tm_blob;
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer bottom_tm_blob { sfp bottom_tm_blob_data[]; };
#endif

// Push constants: one member per specialization constant above, same names
// and same order.
layout (push_constant) uniform parameter
{
    int w;
    int h;
    int c;
    int cstep;

    int outcstep;

    int block_x;
    int block_y;
} p;
| |
|
// sqrt(2) and sqrt(2)/2, the irrational coefficients of the transform matrix.
// sq2_d2 is parenthesized so it stays a single operand wherever it expands.
#define sq2 1.41421356237
#define sq2_d2 (1.41421356237 / 2)

// Multiplies the 6-vector (s0..s5) by the 6x6 input-transform matrix
//
//     {  1,      0,  -2.5,      0,  1,  0 }
//     {  0,   -sq2,  -2.0,  sq2/2,  1,  0 }
//     {  0,    sq2,  -2.0, -sq2/2,  1,  0 }
//     {  0, -sq2/2,  -0.5,    sq2,  1,  0 }
//     {  0,  sq2/2,  -0.5,   -sq2,  1,  0 }
//     {  0,      1,     0,   -2.5,  0,  1 }
//
// and writes the six products to d0..d5.
// NOTE(review): this looks like the input transform of a Winograd F(4x4, 3x3)
// convolution (6x6 tiles stepping by 4) -- confirm against the host code.
void itm_apply6(afp s0, afp s1, afp s2, afp s3, afp s4, afp s5,
                out afp d0, out afp d1, out afp d2, out afp d3, out afp d4, out afp d5)
{
    d0 = s0 - s2 * afp(2.5) + s4;
    d1 = s4 - s2 * afp(2) + s3 * afp(sq2_d2) - s1 * afp(sq2);
    d2 = s4 - s2 * afp(2) - s3 * afp(sq2_d2) + s1 * afp(sq2);
    d3 = s4 - s2 * afp(0.5) + s3 * afp(sq2) - s1 * afp(sq2_d2);
    d4 = s4 - s2 * afp(0.5) - s3 * afp(sq2) + s1 * afp(sq2_d2);
    d5 = s1 - s3 * afp(2.5) + s5;
}

// One invocation per (tile x, tile y, channel): loads a 6x6 input tile,
// applies the transform matrix along both axes and stores the 36 results,
// one per output plane.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // Discard invocations outside the tile grid / channel range.
    if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c))
        return;

    // Top-left corner of this tile: tiles are 6 wide but advance by 4,
    // so neighbouring tiles overlap by 2.
    int sx = gx * 4;
    int sy = gy * 4;

    // ---- load the 6x6 tile; v<y><x> is the texel at (sx + x, sy + y) ----
#if NCNN_image_shader
    // NOTE(review): unlike the buffer path below, these fetches are not
    // bounds-checked -- presumably out-of-range reads are handled by
    // image3d_ld1 / the sampler; confirm.
    afp v00 = image3d_ld1(bottom_blob, ivec3(sx + 0, sy + 0, gz));
    afp v01 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy + 0, gz));
    afp v02 = image3d_ld1(bottom_blob, ivec3(sx + 2, sy + 0, gz));
    afp v03 = image3d_ld1(bottom_blob, ivec3(sx + 3, sy + 0, gz));
    afp v04 = image3d_ld1(bottom_blob, ivec3(sx + 4, sy + 0, gz));
    afp v05 = image3d_ld1(bottom_blob, ivec3(sx + 5, sy + 0, gz));

    afp v10 = image3d_ld1(bottom_blob, ivec3(sx + 0, sy + 1, gz));
    afp v11 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy + 1, gz));
    afp v12 = image3d_ld1(bottom_blob, ivec3(sx + 2, sy + 1, gz));
    afp v13 = image3d_ld1(bottom_blob, ivec3(sx + 3, sy + 1, gz));
    afp v14 = image3d_ld1(bottom_blob, ivec3(sx + 4, sy + 1, gz));
    afp v15 = image3d_ld1(bottom_blob, ivec3(sx + 5, sy + 1, gz));

    afp v20 = image3d_ld1(bottom_blob, ivec3(sx + 0, sy + 2, gz));
    afp v21 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy + 2, gz));
    afp v22 = image3d_ld1(bottom_blob, ivec3(sx + 2, sy + 2, gz));
    afp v23 = image3d_ld1(bottom_blob, ivec3(sx + 3, sy + 2, gz));
    afp v24 = image3d_ld1(bottom_blob, ivec3(sx + 4, sy + 2, gz));
    afp v25 = image3d_ld1(bottom_blob, ivec3(sx + 5, sy + 2, gz));

    afp v30 = image3d_ld1(bottom_blob, ivec3(sx + 0, sy + 3, gz));
    afp v31 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy + 3, gz));
    afp v32 = image3d_ld1(bottom_blob, ivec3(sx + 2, sy + 3, gz));
    afp v33 = image3d_ld1(bottom_blob, ivec3(sx + 3, sy + 3, gz));
    afp v34 = image3d_ld1(bottom_blob, ivec3(sx + 4, sy + 3, gz));
    afp v35 = image3d_ld1(bottom_blob, ivec3(sx + 5, sy + 3, gz));

    afp v40 = image3d_ld1(bottom_blob, ivec3(sx + 0, sy + 4, gz));
    afp v41 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy + 4, gz));
    afp v42 = image3d_ld1(bottom_blob, ivec3(sx + 2, sy + 4, gz));
    afp v43 = image3d_ld1(bottom_blob, ivec3(sx + 3, sy + 4, gz));
    afp v44 = image3d_ld1(bottom_blob, ivec3(sx + 4, sy + 4, gz));
    afp v45 = image3d_ld1(bottom_blob, ivec3(sx + 5, sy + 4, gz));

    afp v50 = image3d_ld1(bottom_blob, ivec3(sx + 0, sy + 5, gz));
    afp v51 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy + 5, gz));
    afp v52 = image3d_ld1(bottom_blob, ivec3(sx + 2, sy + 5, gz));
    afp v53 = image3d_ld1(bottom_blob, ivec3(sx + 3, sy + 5, gz));
    afp v54 = image3d_ld1(bottom_blob, ivec3(sx + 4, sy + 5, gz));
    afp v55 = image3d_ld1(bottom_blob, ivec3(sx + 5, sy + 5, gz));
#else
    // Element offset of each of the six tile rows inside the input buffer.
    int row_stride = psc(w);
    int r0 = gz * psc(cstep) + sy * row_stride + sx;
    int r1 = r0 + 1 * row_stride;
    int r2 = r0 + 2 * row_stride;
    int r3 = r0 + 3 * row_stride;
    int r4 = r0 + 4 * row_stride;
    int r5 = r0 + 5 * row_stride;

    // Columns / rows 1..5 of the tile may lie past the right / bottom edge;
    // such elements are replaced by zero instead of being read.
    // Column 0 and row 0 themselves are read without a check.
    bool in_x1 = sx + 1 < psc(w);
    bool in_x2 = sx + 2 < psc(w);
    bool in_x3 = sx + 3 < psc(w);
    bool in_x4 = sx + 4 < psc(w);
    bool in_x5 = sx + 5 < psc(w);

    bool in_y1 = sy + 1 < psc(h);
    bool in_y2 = sy + 2 < psc(h);
    bool in_y3 = sy + 3 < psc(h);
    bool in_y4 = sy + 4 < psc(h);
    bool in_y5 = sy + 5 < psc(h);

    afp v00 = buffer_ld1(bottom_blob_data, r0 + 0);
    afp v01 = in_x1 ? buffer_ld1(bottom_blob_data, r0 + 1) : afp(0.f);
    afp v02 = in_x2 ? buffer_ld1(bottom_blob_data, r0 + 2) : afp(0.f);
    afp v03 = in_x3 ? buffer_ld1(bottom_blob_data, r0 + 3) : afp(0.f);
    afp v04 = in_x4 ? buffer_ld1(bottom_blob_data, r0 + 4) : afp(0.f);
    afp v05 = in_x5 ? buffer_ld1(bottom_blob_data, r0 + 5) : afp(0.f);

    afp v10 = in_y1 ? buffer_ld1(bottom_blob_data, r1 + 0) : afp(0.f);
    afp v11 = in_y1 && in_x1 ? buffer_ld1(bottom_blob_data, r1 + 1) : afp(0.f);
    afp v12 = in_y1 && in_x2 ? buffer_ld1(bottom_blob_data, r1 + 2) : afp(0.f);
    afp v13 = in_y1 && in_x3 ? buffer_ld1(bottom_blob_data, r1 + 3) : afp(0.f);
    afp v14 = in_y1 && in_x4 ? buffer_ld1(bottom_blob_data, r1 + 4) : afp(0.f);
    afp v15 = in_y1 && in_x5 ? buffer_ld1(bottom_blob_data, r1 + 5) : afp(0.f);

    afp v20 = in_y2 ? buffer_ld1(bottom_blob_data, r2 + 0) : afp(0.f);
    afp v21 = in_y2 && in_x1 ? buffer_ld1(bottom_blob_data, r2 + 1) : afp(0.f);
    afp v22 = in_y2 && in_x2 ? buffer_ld1(bottom_blob_data, r2 + 2) : afp(0.f);
    afp v23 = in_y2 && in_x3 ? buffer_ld1(bottom_blob_data, r2 + 3) : afp(0.f);
    afp v24 = in_y2 && in_x4 ? buffer_ld1(bottom_blob_data, r2 + 4) : afp(0.f);
    afp v25 = in_y2 && in_x5 ? buffer_ld1(bottom_blob_data, r2 + 5) : afp(0.f);

    afp v30 = in_y3 ? buffer_ld1(bottom_blob_data, r3 + 0) : afp(0.f);
    afp v31 = in_y3 && in_x1 ? buffer_ld1(bottom_blob_data, r3 + 1) : afp(0.f);
    afp v32 = in_y3 && in_x2 ? buffer_ld1(bottom_blob_data, r3 + 2) : afp(0.f);
    afp v33 = in_y3 && in_x3 ? buffer_ld1(bottom_blob_data, r3 + 3) : afp(0.f);
    afp v34 = in_y3 && in_x4 ? buffer_ld1(bottom_blob_data, r3 + 4) : afp(0.f);
    afp v35 = in_y3 && in_x5 ? buffer_ld1(bottom_blob_data, r3 + 5) : afp(0.f);

    afp v40 = in_y4 ? buffer_ld1(bottom_blob_data, r4 + 0) : afp(0.f);
    afp v41 = in_y4 && in_x1 ? buffer_ld1(bottom_blob_data, r4 + 1) : afp(0.f);
    afp v42 = in_y4 && in_x2 ? buffer_ld1(bottom_blob_data, r4 + 2) : afp(0.f);
    afp v43 = in_y4 && in_x3 ? buffer_ld1(bottom_blob_data, r4 + 3) : afp(0.f);
    afp v44 = in_y4 && in_x4 ? buffer_ld1(bottom_blob_data, r4 + 4) : afp(0.f);
    afp v45 = in_y4 && in_x5 ? buffer_ld1(bottom_blob_data, r4 + 5) : afp(0.f);

    afp v50 = in_y5 ? buffer_ld1(bottom_blob_data, r5 + 0) : afp(0.f);
    afp v51 = in_y5 && in_x1 ? buffer_ld1(bottom_blob_data, r5 + 1) : afp(0.f);
    afp v52 = in_y5 && in_x2 ? buffer_ld1(bottom_blob_data, r5 + 2) : afp(0.f);
    afp v53 = in_y5 && in_x3 ? buffer_ld1(bottom_blob_data, r5 + 3) : afp(0.f);
    afp v54 = in_y5 && in_x4 ? buffer_ld1(bottom_blob_data, r5 + 4) : afp(0.f);
    afp v55 = in_y5 && in_x5 ? buffer_ld1(bottom_blob_data, r5 + 5) : afp(0.f);
#endif

    // ---- pass 1: transform every tile row along x ----
    // The result is stored transposed: m<k><y> is output k of input row y.
    afp m00, m10, m20, m30, m40, m50;
    afp m01, m11, m21, m31, m41, m51;
    afp m02, m12, m22, m32, m42, m52;
    afp m03, m13, m23, m33, m43, m53;
    afp m04, m14, m24, m34, m44, m54;
    afp m05, m15, m25, m35, m45, m55;

    itm_apply6(v00, v01, v02, v03, v04, v05, m00, m10, m20, m30, m40, m50);
    itm_apply6(v10, v11, v12, v13, v14, v15, m01, m11, m21, m31, m41, m51);
    itm_apply6(v20, v21, v22, v23, v24, v25, m02, m12, m22, m32, m42, m52);
    itm_apply6(v30, v31, v32, v33, v34, v35, m03, m13, m23, m33, m43, m53);
    itm_apply6(v40, v41, v42, v43, v44, v45, m04, m14, m24, m34, m44, m54);
    itm_apply6(v50, v51, v52, v53, v54, v55, m05, m15, m25, m35, m45, m55);

    // ---- pass 2: transform along the other axis, reusing v<k><j> as output ----
    itm_apply6(m00, m01, m02, m03, m04, m05, v00, v01, v02, v03, v04, v05);
    itm_apply6(m10, m11, m12, m13, m14, m15, v10, v11, v12, v13, v14, v15);
    itm_apply6(m20, m21, m22, m23, m24, m25, v20, v21, v22, v23, v24, v25);
    itm_apply6(m30, m31, m32, m33, m34, m35, v30, v31, v32, v33, v34, v35);
    itm_apply6(m40, m41, m42, m43, m44, m45, v40, v41, v42, v43, v44, v45);
    itm_apply6(m50, m51, m52, m53, m54, m55, v50, v51, v52, v53, v54, v55);

    // ---- store: result v<k><j> goes to output plane k * 6 + j ----
#if NCNN_image_shader
    // Image layout: x = linear tile index, y = channel, z = plane.
    int tile_x = gy * psc(block_x) + gx;
#define tm_store(k, val) image3d_st1(bottom_tm_blob, ivec3(tile_x, gz, k), val)
#else
    // Buffer layout: planes are outcstep elements apart; inside a plane the
    // element index is channel * (block_x * block_y) + linear tile index.
    int tm_base = gz * psc(block_x) * psc(block_y) + gy * psc(block_x) + gx;
#define tm_store(k, val) buffer_st1(bottom_tm_blob_data, tm_base + k * psc(outcstep), val)
#endif

    tm_store(0, v00);
    tm_store(1, v01);
    tm_store(2, v02);
    tm_store(3, v03);
    tm_store(4, v04);
    tm_store(5, v05);
    tm_store(6, v10);
    tm_store(7, v11);
    tm_store(8, v12);
    tm_store(9, v13);
    tm_store(10, v14);
    tm_store(11, v15);
    tm_store(12, v20);
    tm_store(13, v21);
    tm_store(14, v22);
    tm_store(15, v23);
    tm_store(16, v24);
    tm_store(17, v25);
    tm_store(18, v30);
    tm_store(19, v31);
    tm_store(20, v32);
    tm_store(21, v33);
    tm_store(22, v34);
    tm_store(23, v35);
    tm_store(24, v40);
    tm_store(25, v41);
    tm_store(26, v42);
    tm_store(27, v43);
    tm_store(28, v44);
    tm_store(29, v45);
    tm_store(30, v50);
    tm_store(31, v51);
    tm_store(32, v52);
    tm_store(33, v53);
    tm_store(34, v54);
    tm_store(35, v55);

#undef tm_store
}
| |
|