#include #include #include #include "rasterize_cuda_kernel.h" const int CUDA_NUM_THREADS = 1024; inline int GET_BLOCKS(const int N){ return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } // for the older gpus atomicAdd with double arguments does not exist #if __CUDA_ARCH__ < 600 and defined(__CUDA_ARCH__) static __inline__ __device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); } while (assumed != old); return __longlong_as_double(old); } #endif #define GET_DIRECT_4d(data, x0, x1, x2, x3, sd0, sd1, sd2, sd3) \ ((data)[(x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2) + (x3) * (sd3)]) #define ADD_ATOMIC_4d(data, x0, x1, x2, x3, sd0, sd1, sd2, sd3, v) \ atomicAdd( data + (x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2) + (x3) * (sd3), v ) #define ADD_ATOMIC_5d(data, x0, x1, x2, x3, x4, sd0, sd1, sd2, sd3, sd4, v) \ atomicAdd( data + (x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2) + (x3) * (sd3) + (x4)*(sd4), v ) #define SET_DIRECT_4d(data, x0, x1, x2, x3, sd0, sd1, sd2, sd3, v) \ ((data)[(x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2) + (x3) * (sd3)]) = v #define GET_DIRECT_3d(data, x0, x1, x2, sd0, sd1, sd2) \ ((data)[(x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2)]) #define SET_DIRECT_3d(data, x0, x1, x2, sd0, sd1, sd2, v) \ ((data)[(x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2) ]) = v #define GET_DIRECT_5d(data, x0, x1, x2, x3, x4, stride0, stride1, stride2, stride3, stride4) \ ((data)[(x0)*(stride0)+(x1)*(stride1)+(x2)*(stride2)+(x3)*(stride3)+(x4)*(stride4)]) #define SET_DIRECT_5d(data, x0, x1, x2, x3, x4, stride0, stride1, stride2, stride3, stride4, value) \ ((data)[(x0)*(stride0)+(x1)*(stride1)+(x2)*(stride2)+(x3)*(stride3)+(x4)*(stride4)] = (value)) #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) const int MODE_BOUNDARY = 0; const int MODE_MASK = 1; const int MODE_HARD_MASK = 2; template __global__ void inside_outside_cuda_kernel( const scalar_t* __restrict__ vertices, int batch_size, int number_vertices, scalar_t* rasterized, int height, int width) { // 1-D array of 1-D blocks. const int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= batch_size * width * height) { return; } const int w = width; const int h = height; const int nv = number_vertices; // batch index. const int bi = i / (w * h); // pixel number (linear index) const int pn = i % (w * h); const int yp = pn / w; const int xp = pn % w; // cast a ray: William Randolph Franklin. int j = 0; scalar_t c = 0; for (int vn = 0, j = nv - 1; vn < nv; j = vn++) { scalar_t from_x; scalar_t from_y; scalar_t to_x; scalar_t to_y; from_x = vertices[bi * (nv * 2) + vn * 2]; from_y = vertices[bi * (nv * 2) + vn * 2 + 1]; to_x = vertices[bi * (nv * 2) + j * 2]; to_y = vertices[bi * (nv * 2) + j * 2 + 1]; if (((from_y > yp) != (to_y > yp)) && (xp < (to_x - from_x) * (yp - from_y) / (to_y - from_y) + from_x)) { c = !c; } } rasterized[i] = c == 0 ? -1.0 : 1.0; } template __global__ void forward_rasterize_cuda_kernel( const scalar_t* __restrict__ vertices, int batch_size, int number_vertices, scalar_t* rasterized, int* contribution_map, int height, int width, float inv_smoothness, int mode) { // 1-D array of 1-D blocks. const int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= batch_size * width * height) { return; } const int w = width; const int h = height; const int nv = number_vertices; // batch index. const int bi = i / (w * h); // pixel number (linear index) const int pn = i % (w * h); const int yp = pn / w; const int xp = pn % w; // go through each vertex. // at some point, we'll need to record // which segment contributed the most // for backwards pass. scalar_t max_contribution = -2147483647; int max_vertex_number = -1; for (int vn = 0; vn < nv; vn++) { int from_index; int to_index; scalar_t from_x; scalar_t from_y; scalar_t to_x; scalar_t to_y; scalar_t x2_sub_x1; scalar_t y2_sub_y1; scalar_t square_segment_length; scalar_t x_sub_x1; scalar_t y_sub_y1; scalar_t x_sub_x2; scalar_t y_sub_y2; scalar_t dot; scalar_t x_proj; scalar_t y_proj; scalar_t contribution; // grid_x, grid_y = xp, yp. from_index = vn; to_index = (vn + 1) % number_vertices; from_x = vertices[bi * (nv * 2) + from_index * 2]; from_y = vertices[bi * (nv * 2) + from_index * 2 + 1]; to_x = vertices[bi * (nv * 2) + to_index * 2]; to_y = vertices[bi * (nv * 2) + to_index * 2 + 1]; x2_sub_x1 = to_x - from_x; y2_sub_y1 = to_y - from_y; square_segment_length = x2_sub_x1 * x2_sub_x1 + y2_sub_y1 * y2_sub_y1 + 0.00001; x_sub_x1 = xp - from_x; y_sub_y1 = yp - from_y; x_sub_x2 = xp - to_x; y_sub_y2 = yp - to_y; dot = ((x_sub_x1 * x2_sub_x1) + (y_sub_y1 * y2_sub_y1)) / square_segment_length; x_proj = xp - (from_x + dot * x2_sub_x1); y_proj = yp - (from_y + dot * y2_sub_y1); // Does it matter here to compute the squared distance or true Euclidean distance? if (dot < 0) { contribution = pow(x_sub_x1, 2) + pow(y_sub_y1, 2); } else if (dot > 1) { contribution = pow(x_sub_x2, 2) + pow(y_sub_y2, 2); } else { contribution = pow(x_proj, 2) + pow(y_proj, 2); } // we need contribution to be a decreasing function. // if (mode == MODE_MASK) { // // sign * -dist // contribution = 1.0 / (1.0 + exp(-rasterized[i] * contribution / inv_smoothness)); // } // else if (mode == MODE_HARD_MASK) { // // map the inside outside map to 0 or 1.0. // // technically, we don't need this preceeding loop. // contribution = rasterized[i] < 0 ? 0.0 : 1.0; // } // else { // contribution = exp(-contribution / inv_smoothness); // } contribution = -contribution; if (contribution > max_contribution) { max_contribution = contribution; max_vertex_number = vn; } } if (mode == MODE_MASK) { // sign * -dist max_contribution = 1.0 / (1.0 + exp(rasterized[i] * max_contribution / inv_smoothness)); } else if (mode == MODE_HARD_MASK) { // map the inside outside map to 0 or 1.0. // technically, we don't need this preceeding loop. max_contribution = rasterized[i] < 0 ? 0.0 : 1.0; } else { max_contribution = exp(max_contribution / inv_smoothness); } rasterized[i] = max_contribution; contribution_map[i] = max_vertex_number; } template __global__ void backward_rasterize_cuda_kernel( const scalar_t* __restrict__ vertices, const scalar_t* __restrict__ rasterized, const int* __restrict__ contribution_map, const scalar_t* __restrict__ grad_output, scalar_t* grad_vertices, int batch_size, int number_vertices, int width, int height, float inv_smoothness) { const int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= batch_size * width * height) { return; } const int w = width; const int h = height; const int nv = number_vertices; // batch index. const int bi = i / (w * h); // pixel number (linear index) const int pn = i % (w * h); const int yp = pn / w; const int xp = pn % w; // produce dR/dv. // since we use max over all vertices, we only need // to apply it to single vertex. int vn; int from_index; int to_index; scalar_t from_x; scalar_t from_y; scalar_t to_x; scalar_t to_y; scalar_t x2_sub_x1; scalar_t y2_sub_y1; scalar_t square_segment_length; scalar_t x_sub_x1; scalar_t y_sub_y1; scalar_t x_sub_x2; scalar_t y_sub_y2; scalar_t dot; scalar_t x_proj; scalar_t y_proj; scalar_t grad_x1 = 0.0; scalar_t grad_y1 = 0.0; scalar_t grad_x2 = 0.0; scalar_t grad_y2 = 0.0; scalar_t in_out = rasterized[i] >= 0.5 ? 1.0 : -1.0; vn = contribution_map[i]; from_index = vn; to_index = (vn + 1) % nv; // determine how we computed the distance to this segment. from_x = vertices[bi * (nv * 2) + from_index * 2]; from_y = vertices[bi * (nv * 2) + from_index * 2 + 1]; to_x = vertices[bi * (nv * 2) + to_index * 2]; to_y = vertices[bi * (nv * 2) + to_index * 2 + 1]; x2_sub_x1 = to_x - from_x; y2_sub_y1 = to_y - from_y; // grad: // dX1 = 2 * x2_sub_x1 * -1 // dX2 = 2 * x2_sub_x1 // dY1 = 2 * y2_sub_y1 * -1 // dY2 = 2 * y2_sub_y1 // possible this could NaN? square_segment_length = x2_sub_x1 * x2_sub_x1 + y2_sub_y1 * y2_sub_y1 + 0.00001; x_sub_x1 = xp - from_x; y_sub_y1 = yp - from_y; x_sub_x2 = xp - to_x; y_sub_y2 = yp - to_y; // grad numer: // dX1 = -1 * x2_sub_x1 + -1 * x_sub_x1 // dX2 = x_sub_x1 scalar_t dot_num = ((x_sub_x1 * x2_sub_x1) + (y_sub_y1 * y2_sub_y1)); dot = dot_num / square_segment_length; x_proj = xp - (from_x + dot * x2_sub_x1); y_proj = yp - (from_y + dot * y2_sub_y1); // negative sign? if (dot < 0) { // contribution = exp(-((xp - from_x) ** 2 + (yp - from_y) ** 2 / inv_smoothness) // grad_x1 = (rasterized[i] * 2 * x_sub_x1) / inv_smoothness; // grad_y1 = (rasterized[i] * 2 * y_sub_y1) / inv_smoothness; // grad_x1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * 2 * x_sub_x1 / inv_smoothness; // grad_y1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * 2 * y_sub_y1 / inv_smoothness; grad_x1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * -2 * x_sub_x1 / inv_smoothness; grad_y1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * -2 * y_sub_y1 / inv_smoothness; } else if (dot > 1) { // contribution = exp(-((xp - to_x) ** 2 + (yp - to_y) ** 2) / inv_smoothness) // grad_x2 = (rasterized[i] * 2 * x_sub_x2) / inv_smoothness; // grad_y2 = (rasterized[i] * 2 * y_sub_y2) / inv_smoothness; grad_x2 = in_out * rasterized[i] * (1.0 - rasterized[i]) * -2 * x_sub_x2 / inv_smoothness; grad_y2 = in_out * rasterized[i] * (1.0 - rasterized[i]) * -2 * y_sub_y2 / inv_smoothness; } else { // contribution = exp(-(xp - from_x) ** 2 / inv_smoothness) scalar_t ss_x1 = -2.0 * x2_sub_x1; scalar_t ss_x2 = 2.0 * x2_sub_x1; scalar_t ss_y1 = -2.0 * y2_sub_y1; scalar_t ss_y2 = 2.0 * y2_sub_y1; scalar_t dot_x1 = (square_segment_length * (-x2_sub_x1 - x_sub_x1) - dot_num * ss_x1) / pow(square_segment_length, 2); scalar_t dot_x2 = (square_segment_length * x_sub_x1 - dot_num * ss_x2) / pow(square_segment_length, 2); scalar_t dot_y1 = (square_segment_length * (-y2_sub_y1 - y_sub_y1) - dot_num * ss_y1) / pow(square_segment_length, 2); scalar_t dot_y2 = (square_segment_length * y_sub_y1 - dot_num * ss_y2) / pow(square_segment_length, 2); // d/dx() scalar_t x_proj_x1 = -1 - dot_x1 * x2_sub_x1 + dot; scalar_t x_proj_x2 = -(dot_x2 * x2_sub_x1 + dot); scalar_t y_proj_y1 = -1 - dot_y1 * y2_sub_y1 + dot; scalar_t y_proj_y2 = -(dot_y2 * y2_sub_y1 + dot); // we also need mixed. scalar_t y_proj_x1 = -dot_x1 * y2_sub_y1; scalar_t y_proj_x2 = -dot_x2 * y2_sub_y1; scalar_t x_proj_y1 = -dot_y1 * x2_sub_x1; scalar_t x_proj_y2 = -dot_y2 * x2_sub_x1; // - as well? grad_x1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * (2.0 * x_proj * x_proj_x1 + 2.0 * y_proj * y_proj_x1) / inv_smoothness; grad_x2 = in_out * rasterized[i] * (1.0 - rasterized[i]) * (2.0 * x_proj * x_proj_x2 + 2.0 * y_proj * y_proj_x2) / inv_smoothness; grad_y1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * (2.0 * x_proj * x_proj_y1 + 2.0 * y_proj * y_proj_y1) / inv_smoothness; grad_y2 = in_out * rasterized[i] * (1.0 - rasterized[i]) * (2.0 * x_proj * x_proj_y2 + 2.0 * y_proj * y_proj_y2) / inv_smoothness; // grad_x1 = -rasterized[i] * (2.0 * x_proj * x_proj_x1 + 2.0 * y_proj * y_proj_x1) / inv_smoothness; // grad_x2 = -rasterized[i] * (2.0 * x_proj * x_proj_x2 + 2.0 * y_proj * y_proj_x2) / inv_smoothness; // grad_y1 = -rasterized[i] * (2.0 * x_proj * x_proj_y1 + 2.0 * y_proj * y_proj_y1) / inv_smoothness; // grad_y2 = -rasterized[i] * (2.0 * x_proj * x_proj_y2 + 2.0 * y_proj * y_proj_y2) / inv_smoothness; } // apply the input gradients. grad_x1 = grad_x1 * grad_output[i]; grad_x2 = grad_x2 * grad_output[i]; grad_y1 = grad_y1 * grad_output[i]; grad_y2 = grad_y2 * grad_output[i]; // grad_vertices[bi * (height * width * nv * 2) + yp * (width * nv * 2) + xp * (nv * 2) + from_index * 2] = grad_x1; // grad_vertices[bi * (height * width * nv * 2) + yp * (width * nv * 2) + xp * (nv * 2) + from_index * 2 + 1] = grad_y1; // grad_vertices[bi * (height * width * nv * 2) + yp * (width * nv * 2) + xp * (nv * 2) + to_index * 2] = grad_x2; // grad_vertices[bi * (height * width * nv * 2) + yp * (width * nv * 2) + xp * (nv * 2) + to_index * 2 + 1] = grad_y2; // unsure if should be deferencing. atomicAdd(grad_vertices + bi * nv * 2 + from_index * 2, grad_x1); atomicAdd(grad_vertices + bi * nv * 2 + from_index * 2 + 1, grad_y1); atomicAdd(grad_vertices + bi * nv * 2 + to_index * 2, grad_x2); atomicAdd(grad_vertices + bi * nv * 2 + to_index * 2 + 1, grad_y2); } std::vector forward_rasterize_cuda( at::Tensor vertices, at::Tensor rasterized, at::Tensor contribution_map, int width, int height, float inv_smoothness, int mode) { const auto batch_size = vertices.size(0); const auto number_vertices = vertices.size(1); const int threads = 512; // each block processes some 512 sized chunk of the output image. const dim3 blocks ((batch_size * width * height - 1) / threads + 1); if ((mode == MODE_MASK) || (mode == MODE_HARD_MASK)) { // determine whether each point is inside or outside. AT_DISPATCH_FLOATING_TYPES(vertices.type(), "inside_outside_cuda", ([&] { inside_outside_cuda_kernel<<>>( vertices.data(), batch_size, number_vertices, rasterized.data(), height, width); })); } if (mode != MODE_HARD_MASK) { AT_DISPATCH_FLOATING_TYPES(vertices.type(), "forward_rasterize_cuda", ([&] { forward_rasterize_cuda_kernel<<>>( vertices.data(), batch_size, number_vertices, rasterized.data(), contribution_map.data(), height, width, inv_smoothness, mode); })); } cudaError_t err = cudaGetLastError(); err = cudaGetLastError(); if (err != cudaSuccess) printf("Error in forward_rasterize: %s\n", cudaGetErrorString(err)); return { rasterized, contribution_map }; } at::Tensor backward_rasterize_cuda( at::Tensor vertices, at::Tensor rasterized, at::Tensor contribution_map, at::Tensor grad_output, at::Tensor grad_vertices, int width, int height, float inv_smoothness, int mode) { const auto batch_size = vertices.size(0); const auto number_vertices = vertices.size(1); const int threads = 512; const dim3 blocks ((batch_size * width * height - 1) / threads + 1); AT_DISPATCH_FLOATING_TYPES(vertices.type(), "backward_rasterize_cuda", ([&] { backward_rasterize_cuda_kernel<<>>( vertices.data(), rasterized.data(), contribution_map.data(), grad_output.data(), grad_vertices.data(), batch_size, number_vertices, width, height, inv_smoothness); })); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) printf("Error in backward_rasterize: %s\n", cudaGetErrorString(err)); return grad_vertices; }