| | #include <stdint.h> |
| |
|
| | #include <cuda.h> |
| | #include <cuda_fp16.h> |
| | #include <cuda_runtime.h> |
| |
|
| | #include <ATen/cuda/CUDAContext.h> |
| | #include <torch/torch.h> |
| |
|
| | #include <algorithm> |
| | #include <stdexcept> |
| |
|
| | #include <cstdio> |
| |
|
| |
|
| | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") |
| | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be a contiguous tensor") |
| | #define CHECK_IS_INT(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Int, #x " must be an int tensor") |
| | #define CHECK_IS_FLOATING(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Float || x.scalar_type() == at::ScalarType::Half || x.scalar_type() == at::ScalarType::Double, #x " must be a floating tensor") |
| |
|
| |
|
| | template <typename T> |
| | __host__ __device__ T div_round_up(T val, T divisor) { |
| | return (val + divisor - 1) / divisor; |
| | } |
| |
|
| | template <typename scalar_t> |
| | __global__ void kernel_sh( |
| | const scalar_t * __restrict__ inputs, |
| | scalar_t * outputs, |
| | uint32_t B, uint32_t D, uint32_t C, |
| | scalar_t * dy_dx |
| | ) { |
| | const uint32_t b = threadIdx.x + blockIdx.x * blockDim.x; |
| | if (b >= B) return; |
| |
|
| | const uint32_t C2 = C * C; |
| |
|
| | |
| | inputs += b * D; |
| | outputs += b * C2; |
| |
|
| | scalar_t x = inputs[0], y = inputs[1], z = inputs[2]; |
| |
|
| | scalar_t xy=x*y, xz=x*z, yz=y*z, x2=x*x, y2=y*y, z2=z*z, xyz=xy*z; |
| | scalar_t x4=x2*x2, y4=y2*y2, z4=z2*z2; |
| | scalar_t x6=x4*x2, y6=y4*y2, z6=z4*z2; |
| |
|
| | auto write_sh = [&]() { |
| | outputs[0] = 0.28209479177387814f ; |
| | if (C <= 1) { return; } |
| | outputs[1] = -0.48860251190291987f*y ; |
| | outputs[2] = 0.48860251190291987f*z ; |
| | outputs[3] = -0.48860251190291987f*x ; |
| | if (C <= 2) { return; } |
| | outputs[4] = 1.0925484305920792f*xy ; |
| | outputs[5] = -1.0925484305920792f*yz ; |
| | outputs[6] = 0.94617469575755997f*z2 - 0.31539156525251999f ; |
| | outputs[7] = -1.0925484305920792f*xz ; |
| | outputs[8] = 0.54627421529603959f*x2 - 0.54627421529603959f*y2 ; |
| | if (C <= 3) { return; } |
| | outputs[9] = 0.59004358992664352f*y*(-3.0f*x2 + y2) ; |
| | outputs[10] = 2.8906114426405538f*xy*z ; |
| | outputs[11] = 0.45704579946446572f*y*(1.0f - 5.0f*z2) ; |
| | outputs[12] = 0.3731763325901154f*z*(5.0f*z2 - 3.0f) ; |
| | outputs[13] = 0.45704579946446572f*x*(1.0f - 5.0f*z2) ; |
| | outputs[14] = 1.4453057213202769f*z*(x2 - y2) ; |
| | outputs[15] = 0.59004358992664352f*x*(-x2 + 3.0f*y2) ; |
| | if (C <= 4) { return; } |
| | outputs[16] = 2.5033429417967046f*xy*(x2 - y2) ; |
| | outputs[17] = 1.7701307697799304f*yz*(-3.0f*x2 + y2) ; |
| | outputs[18] = 0.94617469575756008f*xy*(7.0f*z2 - 1.0f) ; |
| | outputs[19] = 0.66904654355728921f*yz*(3.0f - 7.0f*z2) ; |
| | outputs[20] = -3.1735664074561294f*z2 + 3.7024941420321507f*z4 + 0.31735664074561293f ; |
| | outputs[21] = 0.66904654355728921f*xz*(3.0f - 7.0f*z2) ; |
| | outputs[22] = 0.47308734787878004f*(x2 - y2)*(7.0f*z2 - 1.0f) ; |
| | outputs[23] = 1.7701307697799304f*xz*(-x2 + 3.0f*y2) ; |
| | outputs[24] = -3.7550144126950569f*x2*y2 + 0.62583573544917614f*x4 + 0.62583573544917614f*y4 ; |
| | if (C <= 5) { return; } |
| | outputs[25] = 0.65638205684017015f*y*(10.0f*x2*y2 - 5.0f*x4 - y4) ; |
| | outputs[26] = 8.3026492595241645f*xy*z*(x2 - y2) ; |
| | outputs[27] = -0.48923829943525038f*y*(3.0f*x2 - y2)*(9.0f*z2 - 1.0f) ; |
| | outputs[28] = 4.7935367849733241f*xy*z*(3.0f*z2 - 1.0f) ; |
| | outputs[29] = 0.45294665119569694f*y*(14.0f*z2 - 21.0f*z4 - 1.0f) ; |
| | outputs[30] = 0.1169503224534236f*z*(-70.0f*z2 + 63.0f*z4 + 15.0f) ; |
| | outputs[31] = 0.45294665119569694f*x*(14.0f*z2 - 21.0f*z4 - 1.0f) ; |
| | outputs[32] = 2.3967683924866621f*z*(x2 - y2)*(3.0f*z2 - 1.0f) ; |
| | outputs[33] = -0.48923829943525038f*x*(x2 - 3.0f*y2)*(9.0f*z2 - 1.0f) ; |
| | outputs[34] = 2.0756623148810411f*z*(-6.0f*x2*y2 + x4 + y4) ; |
| | outputs[35] = 0.65638205684017015f*x*(10.0f*x2*y2 - x4 - 5.0f*y4) ; |
| | if (C <= 6) { return; } |
| | outputs[36] = 1.3663682103838286f*xy*(-10.0f*x2*y2 + 3.0f*x4 + 3.0f*y4) ; |
| | outputs[37] = 2.3666191622317521f*yz*(10.0f*x2*y2 - 5.0f*x4 - y4) ; |
| | outputs[38] = 2.0182596029148963f*xy*(x2 - y2)*(11.0f*z2 - 1.0f) ; |
| | outputs[39] = -0.92120525951492349f*yz*(3.0f*x2 - y2)*(11.0f*z2 - 3.0f) ; |
| | outputs[40] = 0.92120525951492349f*xy*(-18.0f*z2 + 33.0f*z4 + 1.0f) ; |
| | outputs[41] = 0.58262136251873131f*yz*(30.0f*z2 - 33.0f*z4 - 5.0f) ; |
| | outputs[42] = 6.6747662381009842f*z2 - 20.024298714302954f*z4 + 14.684485723822165f*z6 - 0.31784601133814211f ; |
| | outputs[43] = 0.58262136251873131f*xz*(30.0f*z2 - 33.0f*z4 - 5.0f) ; |
| | outputs[44] = 0.46060262975746175f*(x2 - y2)*(11.0f*z2*(3.0f*z2 - 1.0f) - 7.0f*z2 + 1.0f) ; |
| | outputs[45] = -0.92120525951492349f*xz*(x2 - 3.0f*y2)*(11.0f*z2 - 3.0f) ; |
| | outputs[46] = 0.50456490072872406f*(11.0f*z2 - 1.0f)*(-6.0f*x2*y2 + x4 + y4) ; |
| | outputs[47] = 2.3666191622317521f*xz*(10.0f*x2*y2 - x4 - 5.0f*y4) ; |
| | outputs[48] = 10.247761577878714f*x2*y4 - 10.247761577878714f*x4*y2 + 0.6831841051919143f*x6 - 0.6831841051919143f*y6 ; |
| | if (C <= 7) { return; } |
| | outputs[49] = 0.70716273252459627f*y*(-21.0f*x2*y4 + 35.0f*x4*y2 - 7.0f*x6 + y6) ; |
| | outputs[50] = 5.2919213236038001f*xy*z*(-10.0f*x2*y2 + 3.0f*x4 + 3.0f*y4) ; |
| | outputs[51] = -0.51891557872026028f*y*(13.0f*z2 - 1.0f)*(-10.0f*x2*y2 + 5.0f*x4 + y4) ; |
| | outputs[52] = 4.1513246297620823f*xy*z*(x2 - y2)*(13.0f*z2 - 3.0f) ; |
| | outputs[53] = -0.15645893386229404f*y*(3.0f*x2 - y2)*(13.0f*z2*(11.0f*z2 - 3.0f) - 27.0f*z2 + 3.0f) ; |
| | outputs[54] = 0.44253269244498261f*xy*z*(-110.0f*z2 + 143.0f*z4 + 15.0f) ; |
| | outputs[55] = 0.090331607582517306f*y*(-135.0f*z2 + 495.0f*z4 - 429.0f*z6 + 5.0f) ; |
| | outputs[56] = 0.068284276912004949f*z*(315.0f*z2 - 693.0f*z4 + 429.0f*z6 - 35.0f) ; |
| | outputs[57] = 0.090331607582517306f*x*(-135.0f*z2 + 495.0f*z4 - 429.0f*z6 + 5.0f) ; |
| | outputs[58] = 0.07375544874083044f*z*(x2 - y2)*(143.0f*z2*(3.0f*z2 - 1.0f) - 187.0f*z2 + 45.0f) ; |
| | outputs[59] = -0.15645893386229404f*x*(x2 - 3.0f*y2)*(13.0f*z2*(11.0f*z2 - 3.0f) - 27.0f*z2 + 3.0f) ; |
| | outputs[60] = 1.0378311574405206f*z*(13.0f*z2 - 3.0f)*(-6.0f*x2*y2 + x4 + y4) ; |
| | outputs[61] = -0.51891557872026028f*x*(13.0f*z2 - 1.0f)*(-10.0f*x2*y2 + x4 + 5.0f*y4) ; |
| | outputs[62] = 2.6459606618019f*z*(15.0f*x2*y4 - 15.0f*x4*y2 + x6 - y6) ; |
| | outputs[63] = 0.70716273252459627f*x*(-35.0f*x2*y4 + 21.0f*x4*y2 - x6 + 7.0f*y6) ; |
| | }; |
| |
|
| | write_sh(); |
| |
|
| | if (dy_dx) { |
| | scalar_t *dx = dy_dx + b * D * C2; |
| | scalar_t *dy = dx + C2; |
| | scalar_t *dz = dy + C2; |
| |
|
| | auto write_sh_dx = [&]() { |
| | dx[0] = 0.0f ; |
| | if (C <= 1) { return; } |
| | dx[1] = 0.0f ; |
| | dx[2] = 0.0f ; |
| | dx[3] = -0.48860251190291992f ; |
| | if (C <= 2) { return; } |
| | dx[4] = 1.0925484305920792f*y ; |
| | dx[5] = 0.0f ; |
| | dx[6] = 0.0f ; |
| | dx[7] = -1.0925484305920792f*z ; |
| | dx[8] = 1.0925484305920792f*x ; |
| | if (C <= 3) { return; } |
| | dx[9] = -3.5402615395598609f*xy ; |
| | dx[10] = 2.8906114426405538f*yz ; |
| | dx[11] = 0.0f ; |
| | dx[12] = 0.0f ; |
| | dx[13] = 0.45704579946446572f - 2.2852289973223288f*z2 ; |
| | dx[14] = 2.8906114426405538f*xz ; |
| | dx[15] = -1.7701307697799304f*x2 + 1.7701307697799304f*y2 ; |
| | if (C <= 4) { return; } |
| | dx[16] = 2.5033429417967046f*y*(3.0f*x2 - y2) ; |
| | dx[17] = -10.620784618679583f*xy*z ; |
| | dx[18] = 0.94617469575756008f*y*(7.0f*z2 - 1.0f) ; |
| | dx[19] = 0.0f ; |
| | dx[20] = 0.0f ; |
| | dx[21] = 0.66904654355728921f*z*(3.0f - 7.0f*z2) ; |
| | dx[22] = 0.94617469575756008f*x*(7.0f*z2 - 1.0f) ; |
| | dx[23] = 5.3103923093397913f*z*(-x2 + y2) ; |
| | dx[24] = 2.5033429417967046f*x*(x2 - 3.0f*y2) ; |
| | if (C <= 5) { return; } |
| | dx[25] = 13.127641136803401f*xy*(-x2 + y2) ; |
| | dx[26] = 8.3026492595241645f*yz*(3.0f*x2 - y2) ; |
| | dx[27] = 2.9354297966115022f*xy*(1.0f - 9.0f*z2) ; |
| | dx[28] = 4.7935367849733241f*yz*(3.0f*z2 - 1.0f) ; |
| | dx[29] = 0.0f ; |
| | dx[30] = 0.0f ; |
| | dx[31] = 6.3412531167397574f*z2 - 9.5118796751096362f*z4 - 0.45294665119569694f ; |
| | dx[32] = 4.7935367849733241f*xz*(3.0f*z2 - 1.0f) ; |
| | dx[33] = -13.209434084751759f*x2*z2 + 1.4677148983057511f*x2 + 13.209434084751759f*y2*z2 - 1.4677148983057511f*y2 ; |
| | dx[34] = 8.3026492595241645f*xz*(x2 - 3.0f*y2) ; |
| | dx[35] = 19.6914617052051f*x2*y2 - 3.2819102842008503f*x4 - 3.2819102842008503f*y4 ; |
| | if (C <= 6) { return; } |
| | dx[36] = 4.0991046311514854f*y*(-10.0f*x2*y2 + 5.0f*x4 + y4) ; |
| | dx[37] = 47.332383244635047f*xy*z*(-x2 + y2) ; |
| | dx[38] = 2.0182596029148963f*y*(3.0f*x2 - y2)*(11.0f*z2 - 1.0f) ; |
| | dx[39] = 5.5272315570895412f*xy*z*(3.0f - 11.0f*z2) ; |
| | dx[40] = 0.92120525951492349f*y*(-18.0f*z2 + 33.0f*z4 + 1.0f) ; |
| | dx[41] = 0.0f ; |
| | dx[42] = 0.0f ; |
| | dx[43] = 0.58262136251873131f*z*(30.0f*z2 - 33.0f*z4 - 5.0f) ; |
| | dx[44] = 0.92120525951492349f*x*(-18.0f*z2 + 33.0f*z4 + 1.0f) ; |
| | dx[45] = -2.7636157785447706f*z*(x2 - y2)*(11.0f*z2 - 3.0f) ; |
| | dx[46] = 2.0182596029148963f*x*(x2 - 3.0f*y2)*(11.0f*z2 - 1.0f) ; |
| | dx[47] = 11.833095811158762f*z*(6.0f*x2*y2 - x4 - y4) ; |
| | dx[48] = 4.0991046311514854f*x*(-10.0f*x2*y2 + x4 + 5.0f*y4) ; |
| | if (C <= 7) { return; } |
| | dx[49] = 9.9002782553443485f*xy*(10.0f*x2*y2 - 3.0f*x4 - 3.0f*y4) ; |
| | dx[50] = 15.875763970811402f*yz*(-10.0f*x2*y2 + 5.0f*x4 + y4) ; |
| | dx[51] = -10.378311574405206f*xy*(x2 - y2)*(13.0f*z2 - 1.0f) ; |
| | dx[52] = 4.1513246297620823f*yz*(3.0f*x2 - y2)*(13.0f*z2 - 3.0f) ; |
| | dx[53] = 0.93875360317376422f*xy*(66.0f*z2 - 143.0f*z4 - 3.0f) ; |
| | dx[54] = 0.44253269244498261f*yz*(-110.0f*z2 + 143.0f*z4 + 15.0f) ; |
| | dx[55] = 0.0f ; |
| | dx[56] = 0.0f ; |
| | dx[57] = -12.194767023639836f*z2 + 44.714145753346067f*z4 - 38.752259652899923f*z6 + 0.45165803791258652f ; |
| | dx[58] = 0.44253269244498261f*xz*(-110.0f*z2 + 143.0f*z4 + 15.0f) ; |
| | dx[59] = 30.97886890473422f*x2*z2 - 67.120882626924143f*x2*z4 - 1.4081304047606462f*x2 - 30.97886890473422f*y2*z2 + 67.120882626924143f*y2*z4 + 1.4081304047606462f*y2 ; |
| | dx[60] = 4.1513246297620823f*xz*(x2 - 3.0f*y2)*(13.0f*z2 - 3.0f) ; |
| | dx[61] = -0.51891557872026028f*(13.0f*z2 - 1.0f)*(-10.0f*x2*y2 + 4.0f*x2*(x2 - 5.0f*y2) + x4 + 5.0f*y4) ; |
| | dx[62] = 15.875763970811402f*xz*(-10.0f*x2*y2 + x4 + 5.0f*y4) ; |
| | dx[63] = -74.252086915082614f*x2*y4 + 74.252086915082614f*x4*y2 - 4.9501391276721742f*x6 + 4.9501391276721742f*y6 ; |
| | }; |
| |
|
| | auto write_sh_dy = [&]() { |
| | dy[0] = 0.0f ; |
| | if (C <= 1) { return; } |
| | dy[1] = -0.48860251190291992f ; |
| | dy[2] = 0.0f ; |
| | dy[3] = 0.0f ; |
| | if (C <= 2) { return; } |
| | dy[4] = 1.0925484305920792f*x ; |
| | dy[5] = -1.0925484305920792f*z ; |
| | dy[6] = 0.0f ; |
| | dy[7] = 0.0f ; |
| | dy[8] = -1.0925484305920792f*y ; |
| | if (C <= 3) { return; } |
| | dy[9] = -1.7701307697799304f*x2 + 1.7701307697799304f*y2 ; |
| | dy[10] = 2.8906114426405538f*xz ; |
| | dy[11] = 0.45704579946446572f - 2.2852289973223288f*z2 ; |
| | dy[12] = 0.0f ; |
| | dy[13] = 0.0f ; |
| | dy[14] = -2.8906114426405538f*yz ; |
| | dy[15] = 3.5402615395598609f*xy ; |
| | if (C <= 4) { return; } |
| | dy[16] = 2.5033429417967046f*x*(x2 - 3.0f*y2) ; |
| | dy[17] = 5.3103923093397913f*z*(-x2 + y2) ; |
| | dy[18] = 0.94617469575756008f*x*(7.0f*z2 - 1.0f) ; |
| | dy[19] = 0.66904654355728921f*z*(3.0f - 7.0f*z2) ; |
| | dy[20] = 0.0f ; |
| | dy[21] = 0.0f ; |
| | dy[22] = 0.94617469575756008f*y*(1.0f - 7.0f*z2) ; |
| | dy[23] = 10.620784618679583f*xy*z ; |
| | dy[24] = 2.5033429417967046f*y*(-3.0f*x2 + y2) ; |
| | if (C <= 5) { return; } |
| | dy[25] = 19.6914617052051f*x2*y2 - 3.2819102842008503f*x4 - 3.2819102842008503f*y4 ; |
| | dy[26] = 8.3026492595241645f*xz*(x2 - 3.0f*y2) ; |
| | dy[27] = -1.4677148983057511f*(x2 - y2)*(9.0f*z2 - 1.0f) ; |
| | dy[28] = 4.7935367849733241f*xz*(3.0f*z2 - 1.0f) ; |
| | dy[29] = 6.3412531167397574f*z2 - 9.5118796751096362f*z4 - 0.45294665119569694f ; |
| | dy[30] = 0.0f ; |
| | dy[31] = 0.0f ; |
| | dy[32] = 4.7935367849733241f*yz*(1.0f - 3.0f*z2) ; |
| | dy[33] = 2.9354297966115022f*xy*(9.0f*z2 - 1.0f) ; |
| | dy[34] = 8.3026492595241645f*yz*(-3.0f*x2 + y2) ; |
| | dy[35] = 13.127641136803401f*xy*(x2 - y2) ; |
| | if (C <= 6) { return; } |
| | dy[36] = 4.0991046311514854f*x*(-10.0f*x2*y2 + x4 + 5.0f*y4) ; |
| | dy[37] = 11.833095811158762f*z*(6.0f*x2*y2 - x4 - y4) ; |
| | dy[38] = 2.0182596029148963f*x*(x2 - 3.0f*y2)*(11.0f*z2 - 1.0f) ; |
| | dy[39] = -2.7636157785447706f*z*(x2 - y2)*(11.0f*z2 - 3.0f) ; |
| | dy[40] = 0.92120525951492349f*x*(-18.0f*z2 + 33.0f*z4 + 1.0f) ; |
| | dy[41] = 0.58262136251873131f*z*(30.0f*z2 - 33.0f*z4 - 5.0f) ; |
| | dy[42] = 0.0f ; |
| | dy[43] = 0.0f ; |
| | dy[44] = 0.92120525951492349f*y*(18.0f*z2 - 33.0f*z4 - 1.0f) ; |
| | dy[45] = 5.5272315570895412f*xy*z*(11.0f*z2 - 3.0f) ; |
| | dy[46] = -2.0182596029148963f*y*(3.0f*x2 - y2)*(11.0f*z2 - 1.0f) ; |
| | dy[47] = 47.332383244635047f*xy*z*(x2 - y2) ; |
| | dy[48] = 4.0991046311514854f*y*(10.0f*x2*y2 - 5.0f*x4 - y4) ; |
| | if (C <= 7) { return; } |
| | dy[49] = -74.252086915082614f*x2*y4 + 74.252086915082614f*x4*y2 - 4.9501391276721742f*x6 + 4.9501391276721742f*y6 ; |
| | dy[50] = 15.875763970811402f*xz*(-10.0f*x2*y2 + x4 + 5.0f*y4) ; |
| | dy[51] = 0.51891557872026028f*(13.0f*z2 - 1.0f)*(10.0f*x2*y2 - 5.0f*x4 + 4.0f*y2*(5.0f*x2 - y2) - y4) ; |
| | dy[52] = 4.1513246297620823f*xz*(x2 - 3.0f*y2)*(13.0f*z2 - 3.0f) ; |
| | dy[53] = -0.46937680158688211f*(x2 - y2)*(13.0f*z2*(11.0f*z2 - 3.0f) - 27.0f*z2 + 3.0f) ; |
| | dy[54] = 0.44253269244498261f*xz*(-110.0f*z2 + 143.0f*z4 + 15.0f) ; |
| | dy[55] = -12.194767023639836f*z2 + 44.714145753346067f*z4 - 38.752259652899923f*z6 + 0.45165803791258652f ; |
| | dy[56] = 0.0f ; |
| | dy[57] = 0.0f ; |
| | dy[58] = 0.44253269244498261f*yz*(110.0f*z2 - 143.0f*z4 - 15.0f) ; |
| | dy[59] = 0.93875360317376422f*xy*(-66.0f*z2 + 143.0f*z4 + 3.0f) ; |
| | dy[60] = -4.1513246297620823f*yz*(3.0f*x2 - y2)*(13.0f*z2 - 3.0f) ; |
| | dy[61] = 10.378311574405206f*xy*(x2 - y2)*(13.0f*z2 - 1.0f) ; |
| | dy[62] = 15.875763970811402f*yz*(10.0f*x2*y2 - 5.0f*x4 - y4) ; |
| | dy[63] = 9.9002782553443485f*xy*(-10.0f*x2*y2 + 3.0f*x4 + 3.0f*y4) ; |
| | }; |
| |
|
| | auto write_sh_dz = [&]() { |
| | dz[0] = 0.0f ; |
| | if (C <= 1) { return; } |
| | dz[1] = 0.0f ; |
| | dz[2] = 0.48860251190291992f ; |
| | dz[3] = 0.0f ; |
| | if (C <= 2) { return; } |
| | dz[4] = 0.0f ; |
| | dz[5] = -1.0925484305920792f*y ; |
| | dz[6] = 1.8923493915151202f*z ; |
| | dz[7] = -1.0925484305920792f*x ; |
| | dz[8] = 0.0f ; |
| | if (C <= 3) { return; } |
| | dz[9] = 0.0f ; |
| | dz[10] = 2.8906114426405538f*xy ; |
| | dz[11] = -4.5704579946446566f*yz ; |
| | dz[12] = 5.597644988851731f*z2 - 1.1195289977703462f ; |
| | dz[13] = -4.5704579946446566f*xz ; |
| | dz[14] = 1.4453057213202769f*x2 - 1.4453057213202769f*y2 ; |
| | dz[15] = 0.0f ; |
| | if (C <= 4) { return; } |
| | dz[16] = 0.0f ; |
| | dz[17] = 1.7701307697799304f*y*(-3.0f*x2 + y2) ; |
| | dz[18] = 13.246445740605839f*xy*z ; |
| | dz[19] = 2.0071396306718676f*y*(1.0f - 7.0f*z2) ; |
| | dz[20] = 14.809976568128603f*z*z*z - 6.3471328149122579f*z ; |
| | dz[21] = 2.0071396306718676f*x*(1.0f - 7.0f*z2) ; |
| | dz[22] = 6.6232228703029197f*z*(x2 - y2) ; |
| | dz[23] = 1.7701307697799304f*x*(-x2 + 3.0f*y2) ; |
| | dz[24] = 0.0f ; |
| | if (C <= 5) { return; } |
| | dz[25] = 0.0f ; |
| | dz[26] = 8.3026492595241645f*xy*(x2 - y2) ; |
| | dz[27] = 8.8062893898345074f*yz*(-3.0f*x2 + y2) ; |
| | dz[28] = 4.7935367849733241f*xy*(9.0f*z2 - 1.0f) ; |
| | dz[29] = 12.682506233479513f*yz*(1.0f - 3.0f*z2) ; |
| | dz[30] = -24.559567715218954f*z2 + 36.839351572828434f*z4 + 1.754254836801354f ; |
| | dz[31] = 12.682506233479513f*xz*(1.0f - 3.0f*z2) ; |
| | dz[32] = 2.3967683924866621f*(x2 - y2)*(9.0f*z2 - 1.0f) ; |
| | dz[33] = 8.8062893898345074f*xz*(-x2 + 3.0f*y2) ; |
| | dz[34] = -12.453973889286246f*x2*y2 + 2.0756623148810411f*x4 + 2.0756623148810411f*y4 ; |
| | dz[35] = 0.0f ; |
| | if (C <= 6) { return; } |
| | dz[36] = 0.0f ; |
| | dz[37] = 2.3666191622317521f*y*(10.0f*x2*y2 - 5.0f*x4 - y4) ; |
| | dz[38] = 44.401711264127719f*xy*z*(x2 - y2) ; |
| | dz[39] = -2.7636157785447706f*y*(3.0f*x2 - y2)*(11.0f*z2 - 1.0f) ; |
| | dz[40] = 11.054463114179082f*xy*z*(11.0f*z2 - 3.0f) ; |
| | dz[41] = 2.9131068125936568f*y*(18.0f*z2 - 33.0f*z4 - 1.0f) ; |
| | dz[42] = 2.6699064952403937f*z*(-30.0f*z2 + 33.0f*z4 + 5.0f) ; |
| | dz[43] = 2.9131068125936568f*x*(18.0f*z2 - 33.0f*z4 - 1.0f) ; |
| | dz[44] = 5.5272315570895412f*z*(x2 - y2)*(11.0f*z2 - 3.0f) ; |
| | dz[45] = -2.7636157785447706f*x*(x2 - 3.0f*y2)*(11.0f*z2 - 1.0f) ; |
| | dz[46] = 11.10042781603193f*z*(-6.0f*x2*y2 + x4 + y4) ; |
| | dz[47] = 2.3666191622317521f*x*(10.0f*x2*y2 - x4 - 5.0f*y4) ; |
| | dz[48] = 0.0f ; |
| | if (C <= 7) { return; } |
| | dz[49] = 0.0f ; |
| | dz[50] = 5.2919213236038001f*xy*(-10.0f*x2*y2 + 3.0f*x4 + 3.0f*y4) ; |
| | dz[51] = 13.491805046726766f*yz*(10.0f*x2*y2 - 5.0f*x4 - y4) ; |
| | dz[52] = 12.453973889286248f*xy*(x2 - y2)*(13.0f*z2 - 1.0f) ; |
| | dz[53] = -6.8841930899409371f*yz*(3.0f*x2 - y2)*(13.0f*z2 - 3.0f) ; |
| | dz[54] = 2.2126634622249131f*xy*(-66.0f*z2 + 143.0f*z4 + 3.0f) ; |
| | dz[55] = 1.6259689364853116f*yz*(110.0f*z2 - 143.0f*z4 - 15.0f) ; |
| | dz[56] = 64.528641681844675f*z2 - 236.60501950009714f*z4 + 205.05768356675085f*z6 - 2.3899496919201733f ; |
| | dz[57] = 1.6259689364853116f*xz*(110.0f*z2 - 143.0f*z4 - 15.0f) ; |
| | dz[58] = 0.07375544874083044f*(x2 - y2)*(143.0f*z2*(3.0f*z2 - 1.0f) + 132.0f*z2*(13.0f*z2 - 5.0f) - 187.0f*z2 + 45.0f) ; |
| | dz[59] = -6.8841930899409371f*xz*(x2 - 3.0f*y2)*(13.0f*z2 - 3.0f) ; |
| | dz[60] = 3.1134934723215619f*(13.0f*z2 - 1.0f)*(-6.0f*x2*y2 + x4 + y4) ; |
| | dz[61] = 13.491805046726766f*xz*(10.0f*x2*y2 - x4 - 5.0f*y4) ; |
| | dz[62] = 39.6894099270285f*x2*y4 - 39.6894099270285f*x4*y2 + 2.6459606618019f*x6 - 2.6459606618019f*y6 ; |
| | dz[63] = 0.0f ; |
| | }; |
| | write_sh_dx(); |
| | write_sh_dy(); |
| | write_sh_dz(); |
| | } |
| | } |
| |
|
| |
|
| | template <typename scalar_t> |
| | __global__ void kernel_sh_backward( |
| | const scalar_t * __restrict__ grad, |
| | const scalar_t * __restrict__ inputs, |
| | uint32_t B, uint32_t D, uint32_t C, |
| | const scalar_t * __restrict__ dy_dx, |
| | scalar_t * grad_inputs |
| | ) { |
| | const uint32_t t = threadIdx.x + blockIdx.x * blockDim.x; |
| | const uint32_t b = t / D; |
| | if (b >= B) return; |
| |
|
| | const uint32_t d = t - b * D; |
| | const uint32_t C2 = C * C; |
| |
|
| | |
| | grad += b * C2; |
| | dy_dx += b * D * C2 + d * C2; |
| |
|
| | for (int ch = 0; ch < C2; ch++) { |
| | grad_inputs[t] += grad[ch] * dy_dx[ch]; |
| | |
| | } |
| |
|
| | } |
| |
|
| | |
| | |
| | template <typename scalar_t> |
| | void sh_encode_forward_cuda(const scalar_t *inputs, scalar_t *outputs, const uint32_t B, const uint32_t D, const uint32_t C, scalar_t *dy_dx) { |
| | static constexpr uint32_t N_THREADS = 256; |
| | kernel_sh<scalar_t><<<div_round_up(B, N_THREADS), N_THREADS>>>(inputs, outputs, B, D, C, dy_dx); |
| | } |
| |
|
| |
|
| | template <typename scalar_t> |
| | void sh_encode_backward_cuda(const scalar_t *grad, const scalar_t *inputs, const uint32_t B, const uint32_t D, const uint32_t C, scalar_t *dy_dx, scalar_t *grad_inputs) { |
| | static constexpr uint32_t N_THREADS = 256; |
| | kernel_sh_backward<scalar_t><<<div_round_up(B * D, N_THREADS), N_THREADS>>>(grad, inputs, B, D, C, dy_dx, grad_inputs); |
| | } |
| |
|
| |
|
| | void sh_encode_forward(at::Tensor inputs, at::Tensor outputs, const uint32_t B, const uint32_t D, const uint32_t C, at::optional<at::Tensor> dy_dx) { |
| | CHECK_CUDA(inputs); |
| | CHECK_CUDA(outputs); |
| | |
| | |
| | CHECK_CONTIGUOUS(inputs); |
| | CHECK_CONTIGUOUS(outputs); |
| | |
| |
|
| | CHECK_IS_FLOATING(inputs); |
| | CHECK_IS_FLOATING(outputs); |
| | |
| |
|
| | AT_DISPATCH_FLOATING_TYPES_AND_HALF( |
| | inputs.scalar_type(), "sh_encode_forward_cuda", ([&] { |
| | sh_encode_forward_cuda<scalar_t>(inputs.data_ptr<scalar_t>(), outputs.data_ptr<scalar_t>(), B, D, C, dy_dx.has_value() ? dy_dx.value().data_ptr<scalar_t>() : nullptr); |
| | })); |
| | } |
| |
|
| | void sh_encode_backward(at::Tensor grad, at::Tensor inputs, const uint32_t B, const uint32_t D, const uint32_t C, at::Tensor dy_dx, at::Tensor grad_inputs) { |
| | CHECK_CUDA(grad); |
| | CHECK_CUDA(inputs); |
| | CHECK_CUDA(dy_dx); |
| | CHECK_CUDA(grad_inputs); |
| | |
| | CHECK_CONTIGUOUS(grad); |
| | CHECK_CONTIGUOUS(inputs); |
| | CHECK_CONTIGUOUS(dy_dx); |
| | CHECK_CONTIGUOUS(grad_inputs); |
| |
|
| | CHECK_IS_FLOATING(grad); |
| | CHECK_IS_FLOATING(inputs); |
| | CHECK_IS_FLOATING(dy_dx); |
| | CHECK_IS_FLOATING(grad_inputs); |
| |
|
| | AT_DISPATCH_FLOATING_TYPES_AND_HALF( |
| | grad.scalar_type(), "sh_encode_backward_cuda", ([&] { |
| | sh_encode_backward_cuda<scalar_t>(grad.data_ptr<scalar_t>(), inputs.data_ptr<scalar_t>(), B, D, C, dy_dx.data_ptr<scalar_t>(), grad_inputs.data_ptr<scalar_t>()); |
| | })); |
| | } |