Spaces:
Runtime error
Runtime error
| const int CUDA_NUM_THREADS = 1024; | |
| inline int GET_BLOCKS(const int N){ | |
| return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; | |
| } | |
| // for the older gpus atomicAdd with double arguments does not exist | |
| static __inline__ __device__ double atomicAdd(double* address, double val) { | |
| unsigned long long int* address_as_ull = (unsigned long long int*)address; | |
| unsigned long long int old = *address_as_ull, assumed; | |
| do { | |
| assumed = old; | |
| old = atomicCAS(address_as_ull, assumed, | |
| __double_as_longlong(val + __longlong_as_double(assumed))); | |
| // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old); | |
| } while (assumed != old); | |
| return __longlong_as_double(old); | |
| } | |
| ((data)[(x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2) + (x3) * (sd3)]) | |
| atomicAdd( data + (x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2) + (x3) * (sd3), v ) | |
| atomicAdd( data + (x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2) + (x3) * (sd3) + (x4)*(sd4), v ) | |
| ((data)[(x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2) + (x3) * (sd3)]) = v | |
| ((data)[(x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2)]) | |
| ((data)[(x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2) ]) = v | |
| ((data)[(x0)*(stride0)+(x1)*(stride1)+(x2)*(stride2)+(x3)*(stride3)+(x4)*(stride4)]) | |
| ((data)[(x0)*(stride0)+(x1)*(stride1)+(x2)*(stride2)+(x3)*(stride3)+(x4)*(stride4)] = (value)) | |
| for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) | |
| const int MODE_BOUNDARY = 0; | |
| const int MODE_MASK = 1; | |
| const int MODE_HARD_MASK = 2; | |
| template <typename scalar_t> | |
| __global__ void inside_outside_cuda_kernel( | |
| const scalar_t* __restrict__ vertices, | |
| int batch_size, | |
| int number_vertices, | |
| scalar_t* rasterized, | |
| int height, | |
| int width) { | |
| // 1-D array of 1-D blocks. | |
| const int i = blockIdx.x * blockDim.x + threadIdx.x; | |
| if (i >= batch_size * width * height) { | |
| return; | |
| } | |
| const int w = width; | |
| const int h = height; | |
| const int nv = number_vertices; | |
| // batch index. | |
| const int bi = i / (w * h); | |
| // pixel number (linear index) | |
| const int pn = i % (w * h); | |
| const int yp = pn / w; | |
| const int xp = pn % w; | |
| // cast a ray: William Randolph Franklin. | |
| int j = 0; | |
| scalar_t c = 0; | |
| for (int vn = 0, j = nv - 1; vn < nv; j = vn++) { | |
| scalar_t from_x; | |
| scalar_t from_y; | |
| scalar_t to_x; | |
| scalar_t to_y; | |
| from_x = vertices[bi * (nv * 2) + vn * 2]; | |
| from_y = vertices[bi * (nv * 2) + vn * 2 + 1]; | |
| to_x = vertices[bi * (nv * 2) + j * 2]; | |
| to_y = vertices[bi * (nv * 2) + j * 2 + 1]; | |
| if (((from_y > yp) != (to_y > yp)) && (xp < (to_x - from_x) * (yp - from_y) / (to_y - from_y) + from_x)) { | |
| c = !c; | |
| } | |
| } | |
| rasterized[i] = c == 0 ? -1.0 : 1.0; | |
| } | |
| template <typename scalar_t> | |
| __global__ void forward_rasterize_cuda_kernel( | |
| const scalar_t* __restrict__ vertices, | |
| int batch_size, | |
| int number_vertices, | |
| scalar_t* rasterized, | |
| int* contribution_map, | |
| int height, | |
| int width, | |
| float inv_smoothness, | |
| int mode) { | |
| // 1-D array of 1-D blocks. | |
| const int i = blockIdx.x * blockDim.x + threadIdx.x; | |
| if (i >= batch_size * width * height) { | |
| return; | |
| } | |
| const int w = width; | |
| const int h = height; | |
| const int nv = number_vertices; | |
| // batch index. | |
| const int bi = i / (w * h); | |
| // pixel number (linear index) | |
| const int pn = i % (w * h); | |
| const int yp = pn / w; | |
| const int xp = pn % w; | |
| // go through each vertex. | |
| // at some point, we'll need to record | |
| // which segment contributed the most | |
| // for backwards pass. | |
| scalar_t max_contribution = -2147483647; | |
| int max_vertex_number = -1; | |
| for (int vn = 0; vn < nv; vn++) { | |
| int from_index; | |
| int to_index; | |
| scalar_t from_x; | |
| scalar_t from_y; | |
| scalar_t to_x; | |
| scalar_t to_y; | |
| scalar_t x2_sub_x1; | |
| scalar_t y2_sub_y1; | |
| scalar_t square_segment_length; | |
| scalar_t x_sub_x1; | |
| scalar_t y_sub_y1; | |
| scalar_t x_sub_x2; | |
| scalar_t y_sub_y2; | |
| scalar_t dot; | |
| scalar_t x_proj; | |
| scalar_t y_proj; | |
| scalar_t contribution; | |
| // grid_x, grid_y = xp, yp. | |
| from_index = vn; | |
| to_index = (vn + 1) % number_vertices; | |
| from_x = vertices[bi * (nv * 2) + from_index * 2]; | |
| from_y = vertices[bi * (nv * 2) + from_index * 2 + 1]; | |
| to_x = vertices[bi * (nv * 2) + to_index * 2]; | |
| to_y = vertices[bi * (nv * 2) + to_index * 2 + 1]; | |
| x2_sub_x1 = to_x - from_x; | |
| y2_sub_y1 = to_y - from_y; | |
| square_segment_length = x2_sub_x1 * x2_sub_x1 + y2_sub_y1 * y2_sub_y1 + 0.00001; | |
| x_sub_x1 = xp - from_x; | |
| y_sub_y1 = yp - from_y; | |
| x_sub_x2 = xp - to_x; | |
| y_sub_y2 = yp - to_y; | |
| dot = ((x_sub_x1 * x2_sub_x1) + (y_sub_y1 * y2_sub_y1)) / square_segment_length; | |
| x_proj = xp - (from_x + dot * x2_sub_x1); | |
| y_proj = yp - (from_y + dot * y2_sub_y1); | |
| // Does it matter here to compute the squared distance or true Euclidean distance? | |
| if (dot < 0) { | |
| contribution = pow(x_sub_x1, 2) + pow(y_sub_y1, 2); | |
| } | |
| else if (dot > 1) { | |
| contribution = pow(x_sub_x2, 2) + pow(y_sub_y2, 2); | |
| } | |
| else { | |
| contribution = pow(x_proj, 2) + pow(y_proj, 2); | |
| } | |
| // we need contribution to be a decreasing function. | |
| // if (mode == MODE_MASK) { | |
| // // sign * -dist | |
| // contribution = 1.0 / (1.0 + exp(-rasterized[i] * contribution / inv_smoothness)); | |
| // } | |
| // else if (mode == MODE_HARD_MASK) { | |
| // // map the inside outside map to 0 or 1.0. | |
| // // technically, we don't need this preceeding loop. | |
| // contribution = rasterized[i] < 0 ? 0.0 : 1.0; | |
| // } | |
| // else { | |
| // contribution = exp(-contribution / inv_smoothness); | |
| // } | |
| contribution = -contribution; | |
| if (contribution > max_contribution) { | |
| max_contribution = contribution; | |
| max_vertex_number = vn; | |
| } | |
| } | |
| if (mode == MODE_MASK) { | |
| // sign * -dist | |
| max_contribution = 1.0 / (1.0 + exp(rasterized[i] * max_contribution / inv_smoothness)); | |
| } | |
| else if (mode == MODE_HARD_MASK) { | |
| // map the inside outside map to 0 or 1.0. | |
| // technically, we don't need this preceeding loop. | |
| max_contribution = rasterized[i] < 0 ? 0.0 : 1.0; | |
| } | |
| else { | |
| max_contribution = exp(max_contribution / inv_smoothness); | |
| } | |
| rasterized[i] = max_contribution; | |
| contribution_map[i] = max_vertex_number; | |
| } | |
| template <typename scalar_t> | |
| __global__ void backward_rasterize_cuda_kernel( | |
| const scalar_t* __restrict__ vertices, | |
| const scalar_t* __restrict__ rasterized, | |
| const int* __restrict__ contribution_map, | |
| const scalar_t* __restrict__ grad_output, | |
| scalar_t* grad_vertices, | |
| int batch_size, | |
| int number_vertices, | |
| int width, | |
| int height, | |
| float inv_smoothness) { | |
| const int i = blockIdx.x * blockDim.x + threadIdx.x; | |
| if (i >= batch_size * width * height) { | |
| return; | |
| } | |
| const int w = width; | |
| const int h = height; | |
| const int nv = number_vertices; | |
| // batch index. | |
| const int bi = i / (w * h); | |
| // pixel number (linear index) | |
| const int pn = i % (w * h); | |
| const int yp = pn / w; | |
| const int xp = pn % w; | |
| // produce dR/dv. | |
| // since we use max over all vertices, we only need | |
| // to apply it to single vertex. | |
| int vn; | |
| int from_index; | |
| int to_index; | |
| scalar_t from_x; | |
| scalar_t from_y; | |
| scalar_t to_x; | |
| scalar_t to_y; | |
| scalar_t x2_sub_x1; | |
| scalar_t y2_sub_y1; | |
| scalar_t square_segment_length; | |
| scalar_t x_sub_x1; | |
| scalar_t y_sub_y1; | |
| scalar_t x_sub_x2; | |
| scalar_t y_sub_y2; | |
| scalar_t dot; | |
| scalar_t x_proj; | |
| scalar_t y_proj; | |
| scalar_t grad_x1 = 0.0; | |
| scalar_t grad_y1 = 0.0; | |
| scalar_t grad_x2 = 0.0; | |
| scalar_t grad_y2 = 0.0; | |
| scalar_t in_out = rasterized[i] >= 0.5 ? 1.0 : -1.0; | |
| vn = contribution_map[i]; | |
| from_index = vn; | |
| to_index = (vn + 1) % nv; | |
| // determine how we computed the distance to this segment. | |
| from_x = vertices[bi * (nv * 2) + from_index * 2]; | |
| from_y = vertices[bi * (nv * 2) + from_index * 2 + 1]; | |
| to_x = vertices[bi * (nv * 2) + to_index * 2]; | |
| to_y = vertices[bi * (nv * 2) + to_index * 2 + 1]; | |
| x2_sub_x1 = to_x - from_x; | |
| y2_sub_y1 = to_y - from_y; | |
| // grad: | |
| // dX1 = 2 * x2_sub_x1 * -1 | |
| // dX2 = 2 * x2_sub_x1 | |
| // dY1 = 2 * y2_sub_y1 * -1 | |
| // dY2 = 2 * y2_sub_y1 | |
| // possible this could NaN? | |
| square_segment_length = x2_sub_x1 * x2_sub_x1 + y2_sub_y1 * y2_sub_y1 + 0.00001; | |
| x_sub_x1 = xp - from_x; | |
| y_sub_y1 = yp - from_y; | |
| x_sub_x2 = xp - to_x; | |
| y_sub_y2 = yp - to_y; | |
| // grad numer: | |
| // dX1 = -1 * x2_sub_x1 + -1 * x_sub_x1 | |
| // dX2 = x_sub_x1 | |
| scalar_t dot_num = ((x_sub_x1 * x2_sub_x1) + (y_sub_y1 * y2_sub_y1)); | |
| dot = dot_num / square_segment_length; | |
| x_proj = xp - (from_x + dot * x2_sub_x1); | |
| y_proj = yp - (from_y + dot * y2_sub_y1); | |
| // negative sign? | |
| if (dot < 0) { | |
| // contribution = exp(-((xp - from_x) ** 2 + (yp - from_y) ** 2 / inv_smoothness) | |
| // grad_x1 = (rasterized[i] * 2 * x_sub_x1) / inv_smoothness; | |
| // grad_y1 = (rasterized[i] * 2 * y_sub_y1) / inv_smoothness; | |
| // grad_x1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * 2 * x_sub_x1 / inv_smoothness; | |
| // grad_y1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * 2 * y_sub_y1 / inv_smoothness; | |
| grad_x1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * -2 * x_sub_x1 / inv_smoothness; | |
| grad_y1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * -2 * y_sub_y1 / inv_smoothness; | |
| } | |
| else if (dot > 1) { | |
| // contribution = exp(-((xp - to_x) ** 2 + (yp - to_y) ** 2) / inv_smoothness) | |
| // grad_x2 = (rasterized[i] * 2 * x_sub_x2) / inv_smoothness; | |
| // grad_y2 = (rasterized[i] * 2 * y_sub_y2) / inv_smoothness; | |
| grad_x2 = in_out * rasterized[i] * (1.0 - rasterized[i]) * -2 * x_sub_x2 / inv_smoothness; | |
| grad_y2 = in_out * rasterized[i] * (1.0 - rasterized[i]) * -2 * y_sub_y2 / inv_smoothness; | |
| } | |
| else { | |
| // contribution = exp(-(xp - from_x) ** 2 / inv_smoothness) | |
| scalar_t ss_x1 = -2.0 * x2_sub_x1; | |
| scalar_t ss_x2 = 2.0 * x2_sub_x1; | |
| scalar_t ss_y1 = -2.0 * y2_sub_y1; | |
| scalar_t ss_y2 = 2.0 * y2_sub_y1; | |
| scalar_t dot_x1 = (square_segment_length * (-x2_sub_x1 - x_sub_x1) - dot_num * ss_x1) / pow(square_segment_length, 2); | |
| scalar_t dot_x2 = (square_segment_length * x_sub_x1 - dot_num * ss_x2) / pow(square_segment_length, 2); | |
| scalar_t dot_y1 = (square_segment_length * (-y2_sub_y1 - y_sub_y1) - dot_num * ss_y1) / pow(square_segment_length, 2); | |
| scalar_t dot_y2 = (square_segment_length * y_sub_y1 - dot_num * ss_y2) / pow(square_segment_length, 2); | |
| // d/dx() | |
| scalar_t x_proj_x1 = -1 - dot_x1 * x2_sub_x1 + dot; | |
| scalar_t x_proj_x2 = -(dot_x2 * x2_sub_x1 + dot); | |
| scalar_t y_proj_y1 = -1 - dot_y1 * y2_sub_y1 + dot; | |
| scalar_t y_proj_y2 = -(dot_y2 * y2_sub_y1 + dot); | |
| // we also need mixed. | |
| scalar_t y_proj_x1 = -dot_x1 * y2_sub_y1; | |
| scalar_t y_proj_x2 = -dot_x2 * y2_sub_y1; | |
| scalar_t x_proj_y1 = -dot_y1 * x2_sub_x1; | |
| scalar_t x_proj_y2 = -dot_y2 * x2_sub_x1; | |
| // - as well? | |
| grad_x1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * (2.0 * x_proj * x_proj_x1 + 2.0 * y_proj * y_proj_x1) / inv_smoothness; | |
| grad_x2 = in_out * rasterized[i] * (1.0 - rasterized[i]) * (2.0 * x_proj * x_proj_x2 + 2.0 * y_proj * y_proj_x2) / inv_smoothness; | |
| grad_y1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * (2.0 * x_proj * x_proj_y1 + 2.0 * y_proj * y_proj_y1) / inv_smoothness; | |
| grad_y2 = in_out * rasterized[i] * (1.0 - rasterized[i]) * (2.0 * x_proj * x_proj_y2 + 2.0 * y_proj * y_proj_y2) / inv_smoothness; | |
| // grad_x1 = -rasterized[i] * (2.0 * x_proj * x_proj_x1 + 2.0 * y_proj * y_proj_x1) / inv_smoothness; | |
| // grad_x2 = -rasterized[i] * (2.0 * x_proj * x_proj_x2 + 2.0 * y_proj * y_proj_x2) / inv_smoothness; | |
| // grad_y1 = -rasterized[i] * (2.0 * x_proj * x_proj_y1 + 2.0 * y_proj * y_proj_y1) / inv_smoothness; | |
| // grad_y2 = -rasterized[i] * (2.0 * x_proj * x_proj_y2 + 2.0 * y_proj * y_proj_y2) / inv_smoothness; | |
| } | |
| // apply the input gradients. | |
| grad_x1 = grad_x1 * grad_output[i]; | |
| grad_x2 = grad_x2 * grad_output[i]; | |
| grad_y1 = grad_y1 * grad_output[i]; | |
| grad_y2 = grad_y2 * grad_output[i]; | |
| // grad_vertices[bi * (height * width * nv * 2) + yp * (width * nv * 2) + xp * (nv * 2) + from_index * 2] = grad_x1; | |
| // grad_vertices[bi * (height * width * nv * 2) + yp * (width * nv * 2) + xp * (nv * 2) + from_index * 2 + 1] = grad_y1; | |
| // grad_vertices[bi * (height * width * nv * 2) + yp * (width * nv * 2) + xp * (nv * 2) + to_index * 2] = grad_x2; | |
| // grad_vertices[bi * (height * width * nv * 2) + yp * (width * nv * 2) + xp * (nv * 2) + to_index * 2 + 1] = grad_y2; | |
| // unsure if should be deferencing. | |
| atomicAdd(grad_vertices + bi * nv * 2 + from_index * 2, grad_x1); | |
| atomicAdd(grad_vertices + bi * nv * 2 + from_index * 2 + 1, grad_y1); | |
| atomicAdd(grad_vertices + bi * nv * 2 + to_index * 2, grad_x2); | |
| atomicAdd(grad_vertices + bi * nv * 2 + to_index * 2 + 1, grad_y2); | |
| } | |
| std::vector<at::Tensor> forward_rasterize_cuda( | |
| at::Tensor vertices, | |
| at::Tensor rasterized, | |
| at::Tensor contribution_map, | |
| int width, | |
| int height, | |
| float inv_smoothness, | |
| int mode) { | |
| const auto batch_size = vertices.size(0); | |
| const auto number_vertices = vertices.size(1); | |
| const int threads = 512; | |
| // each block processes some 512 sized chunk of the output image. | |
| const dim3 blocks ((batch_size * width * height - 1) / threads + 1); | |
| if ((mode == MODE_MASK) || (mode == MODE_HARD_MASK)) { | |
| // determine whether each point is inside or outside. | |
| AT_DISPATCH_FLOATING_TYPES(vertices.type(), "inside_outside_cuda", ([&] { | |
| inside_outside_cuda_kernel<scalar_t><<<blocks, threads>>>( | |
| vertices.data<scalar_t>(), | |
| batch_size, | |
| number_vertices, | |
| rasterized.data<scalar_t>(), | |
| height, | |
| width); | |
| })); | |
| } | |
| if (mode != MODE_HARD_MASK) { | |
| AT_DISPATCH_FLOATING_TYPES(vertices.type(), "forward_rasterize_cuda", ([&] { | |
| forward_rasterize_cuda_kernel<scalar_t><<<blocks, threads>>>( | |
| vertices.data<scalar_t>(), | |
| batch_size, | |
| number_vertices, | |
| rasterized.data<scalar_t>(), | |
| contribution_map.data<int>(), | |
| height, | |
| width, | |
| inv_smoothness, | |
| mode); | |
| })); | |
| } | |
| cudaError_t err = cudaGetLastError(); | |
| err = cudaGetLastError(); | |
| if (err != cudaSuccess) | |
| printf("Error in forward_rasterize: %s\n", cudaGetErrorString(err)); | |
| return { rasterized, contribution_map }; | |
| } | |
| at::Tensor backward_rasterize_cuda( | |
| at::Tensor vertices, | |
| at::Tensor rasterized, | |
| at::Tensor contribution_map, | |
| at::Tensor grad_output, | |
| at::Tensor grad_vertices, | |
| int width, | |
| int height, | |
| float inv_smoothness, | |
| int mode) { | |
| const auto batch_size = vertices.size(0); | |
| const auto number_vertices = vertices.size(1); | |
| const int threads = 512; | |
| const dim3 blocks ((batch_size * width * height - 1) / threads + 1); | |
| AT_DISPATCH_FLOATING_TYPES(vertices.type(), "backward_rasterize_cuda", ([&] { | |
| backward_rasterize_cuda_kernel<scalar_t><<<blocks, threads>>>( | |
| vertices.data<scalar_t>(), | |
| rasterized.data<scalar_t>(), | |
| contribution_map.data<int>(), | |
| grad_output.data<scalar_t>(), | |
| grad_vertices.data<scalar_t>(), | |
| batch_size, | |
| number_vertices, | |
| width, | |
| height, | |
| inv_smoothness); | |
| })); | |
| cudaError_t err = cudaGetLastError(); | |
| if (err != cudaSuccess) | |
| printf("Error in backward_rasterize: %s\n", cudaGetErrorString(err)); | |
| return grad_vertices; | |
| } | |