Spaces:

AGLO-AI
/

raster2seq

Runtime error

raster2seq / diff_ras /rasterize_cuda_kernel.cu

anas

Initial deployment of Raster2Seq floor plan vectorization API

fadb92b about 1 month ago

17.4 kB

	#include <ATen/ATen.h>

	#include <cuda.h>
	#include <cuda_runtime.h>

	#include "rasterize_cuda_kernel.h"

	const int CUDA_NUM_THREADS = 1024;

	inline int GET_BLOCKS(const int N){
	return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
	}

	// for the older gpus atomicAdd with double arguments does not exist
	#if __CUDA_ARCH__ < 600 and defined(__CUDA_ARCH__)
	static __inline__ __device__ double atomicAdd(double* address, double val) {
	unsigned long long int* address_as_ull = (unsigned long long int*)address;
	unsigned long long int old = *address_as_ull, assumed;
	do {
	assumed = old;
	old = atomicCAS(address_as_ull, assumed,
	__double_as_longlong(val + __longlong_as_double(assumed)));
	// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) } while (assumed != old);
	} while (assumed != old);
	return __longlong_as_double(old);
	}
	#endif

	#define GET_DIRECT_4d(data, x0, x1, x2, x3, sd0, sd1, sd2, sd3) \
	((data)[(x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2) + (x3) * (sd3)])

	#define ADD_ATOMIC_4d(data, x0, x1, x2, x3, sd0, sd1, sd2, sd3, v) \
	atomicAdd( data + (x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2) + (x3) * (sd3), v )
	#define ADD_ATOMIC_5d(data, x0, x1, x2, x3, x4, sd0, sd1, sd2, sd3, sd4, v) \
	atomicAdd( data + (x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2) + (x3) * (sd3) + (x4)*(sd4), v )
	#define SET_DIRECT_4d(data, x0, x1, x2, x3, sd0, sd1, sd2, sd3, v) \
	((data)[(x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2) + (x3) * (sd3)]) = v

	#define GET_DIRECT_3d(data, x0, x1, x2, sd0, sd1, sd2) \
	((data)[(x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2)])

	#define SET_DIRECT_3d(data, x0, x1, x2, sd0, sd1, sd2, v) \
	((data)[(x0) * (sd0) + (x1) * (sd1) + (x2) * (sd2) ]) = v

	#define GET_DIRECT_5d(data, x0, x1, x2, x3, x4, stride0, stride1, stride2, stride3, stride4) \
	((data)[(x0)(stride0)+(x1)(stride1)+(x2)(stride2)+(x3)(stride3)+(x4)*(stride4)])

	#define SET_DIRECT_5d(data, x0, x1, x2, x3, x4, stride0, stride1, stride2, stride3, stride4, value) \
	((data)[(x0)(stride0)+(x1)(stride1)+(x2)(stride2)+(x3)(stride3)+(x4)*(stride4)] = (value))

	#define CUDA_KERNEL_LOOP(i, n) \
	for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)


	const int MODE_BOUNDARY = 0;
	const int MODE_MASK = 1;
	const int MODE_HARD_MASK = 2;

	template <typename scalar_t>
	__global__ void inside_outside_cuda_kernel(
	const scalar_t* __restrict__ vertices,
	int batch_size,
	int number_vertices,
	scalar_t* rasterized,
	int height,
	int width) {
	// 1-D array of 1-D blocks.
	const int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= batch_size * width * height) {
	return;
	}

	const int w = width;
	const int h = height;
	const int nv = number_vertices;

	// batch index.
	const int bi = i / (w * h);

	// pixel number (linear index)
	const int pn = i % (w * h);
	const int yp = pn / w;
	const int xp = pn % w;

	// cast a ray: William Randolph Franklin.
	int j = 0;
	scalar_t c = 0;
	for (int vn = 0, j = nv - 1; vn < nv; j = vn++) {
	scalar_t from_x;
	scalar_t from_y;
	scalar_t to_x;
	scalar_t to_y;

	from_x = vertices[bi * (nv * 2) + vn * 2];
	from_y = vertices[bi * (nv * 2) + vn * 2 + 1];
	to_x = vertices[bi * (nv * 2) + j * 2];
	to_y = vertices[bi * (nv * 2) + j * 2 + 1];

	if (((from_y > yp) != (to_y > yp)) && (xp < (to_x - from_x) * (yp - from_y) / (to_y - from_y) + from_x)) {
	c = !c;
	}
	}

	rasterized[i] = c == 0 ? -1.0 : 1.0;
	}

	template <typename scalar_t>
	__global__ void forward_rasterize_cuda_kernel(
	const scalar_t* __restrict__ vertices,
	int batch_size,
	int number_vertices,
	scalar_t* rasterized,
	int* contribution_map,
	int height,
	int width,
	float inv_smoothness,
	int mode) {
	// 1-D array of 1-D blocks.
	const int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= batch_size * width * height) {
	return;
	}

	const int w = width;
	const int h = height;
	const int nv = number_vertices;

	// batch index.
	const int bi = i / (w * h);

	// pixel number (linear index)
	const int pn = i % (w * h);
	const int yp = pn / w;
	const int xp = pn % w;

	// go through each vertex.
	// at some point, we'll need to record
	// which segment contributed the most
	// for backwards pass.
	scalar_t max_contribution = -2147483647;
	int max_vertex_number = -1;
	for (int vn = 0; vn < nv; vn++) {
	int from_index;
	int to_index;
	scalar_t from_x;
	scalar_t from_y;
	scalar_t to_x;
	scalar_t to_y;
	scalar_t x2_sub_x1;
	scalar_t y2_sub_y1;
	scalar_t square_segment_length;
	scalar_t x_sub_x1;
	scalar_t y_sub_y1;
	scalar_t x_sub_x2;
	scalar_t y_sub_y2;
	scalar_t dot;
	scalar_t x_proj;
	scalar_t y_proj;
	scalar_t contribution;

	// grid_x, grid_y = xp, yp.
	from_index = vn;
	to_index = (vn + 1) % number_vertices;

	from_x = vertices[bi * (nv * 2) + from_index * 2];
	from_y = vertices[bi * (nv * 2) + from_index * 2 + 1];

	to_x = vertices[bi * (nv * 2) + to_index * 2];
	to_y = vertices[bi * (nv * 2) + to_index * 2 + 1];

	x2_sub_x1 = to_x - from_x;
	y2_sub_y1 = to_y - from_y;

	square_segment_length = x2_sub_x1 * x2_sub_x1 + y2_sub_y1 * y2_sub_y1 + 0.00001;

	x_sub_x1 = xp - from_x;
	y_sub_y1 = yp - from_y;
	x_sub_x2 = xp - to_x;
	y_sub_y2 = yp - to_y;

	dot = ((x_sub_x1 * x2_sub_x1) + (y_sub_y1 * y2_sub_y1)) / square_segment_length;
	x_proj = xp - (from_x + dot * x2_sub_x1);
	y_proj = yp - (from_y + dot * y2_sub_y1);

	// Does it matter here to compute the squared distance or true Euclidean distance?
	if (dot < 0) {
	contribution = pow(x_sub_x1, 2) + pow(y_sub_y1, 2);
	}
	else if (dot > 1) {
	contribution = pow(x_sub_x2, 2) + pow(y_sub_y2, 2);
	}
	else {
	contribution = pow(x_proj, 2) + pow(y_proj, 2);
	}

	// we need contribution to be a decreasing function.
	// if (mode == MODE_MASK) {
	// // sign * -dist
	// contribution = 1.0 / (1.0 + exp(-rasterized[i] * contribution / inv_smoothness));
	// }
	// else if (mode == MODE_HARD_MASK) {
	// // map the inside outside map to 0 or 1.0.
	// // technically, we don't need this preceeding loop.
	// contribution = rasterized[i] < 0 ? 0.0 : 1.0;
	// }
	// else {
	// contribution = exp(-contribution / inv_smoothness);
	// }

	contribution = -contribution;

	if (contribution > max_contribution) {
	max_contribution = contribution;
	max_vertex_number = vn;
	}
	}

	if (mode == MODE_MASK) {
	// sign * -dist
	max_contribution = 1.0 / (1.0 + exp(rasterized[i] * max_contribution / inv_smoothness));
	}
	else if (mode == MODE_HARD_MASK) {
	// map the inside outside map to 0 or 1.0.
	// technically, we don't need this preceeding loop.
	max_contribution = rasterized[i] < 0 ? 0.0 : 1.0;
	}
	else {
	max_contribution = exp(max_contribution / inv_smoothness);
	}

	rasterized[i] = max_contribution;
	contribution_map[i] = max_vertex_number;
	}

	template <typename scalar_t>
	__global__ void backward_rasterize_cuda_kernel(
	const scalar_t* __restrict__ vertices,
	const scalar_t* __restrict__ rasterized,
	const int* __restrict__ contribution_map,
	const scalar_t* __restrict__ grad_output,
	scalar_t* grad_vertices,
	int batch_size,
	int number_vertices,
	int width,
	int height,
	float inv_smoothness) {
	const int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= batch_size * width * height) {
	return;
	}

	const int w = width;
	const int h = height;
	const int nv = number_vertices;

	// batch index.
	const int bi = i / (w * h);

	// pixel number (linear index)
	const int pn = i % (w * h);
	const int yp = pn / w;
	const int xp = pn % w;

	// produce dR/dv.
	// since we use max over all vertices, we only need
	// to apply it to single vertex.
	int vn;
	int from_index;
	int to_index;
	scalar_t from_x;
	scalar_t from_y;
	scalar_t to_x;
	scalar_t to_y;
	scalar_t x2_sub_x1;
	scalar_t y2_sub_y1;
	scalar_t square_segment_length;
	scalar_t x_sub_x1;
	scalar_t y_sub_y1;
	scalar_t x_sub_x2;
	scalar_t y_sub_y2;
	scalar_t dot;
	scalar_t x_proj;
	scalar_t y_proj;
	scalar_t grad_x1 = 0.0;
	scalar_t grad_y1 = 0.0;
	scalar_t grad_x2 = 0.0;
	scalar_t grad_y2 = 0.0;

	scalar_t in_out = rasterized[i] >= 0.5 ? 1.0 : -1.0;

	vn = contribution_map[i];
	from_index = vn;
	to_index = (vn + 1) % nv;

	// determine how we computed the distance to this segment.
	from_x = vertices[bi * (nv * 2) + from_index * 2];
	from_y = vertices[bi * (nv * 2) + from_index * 2 + 1];

	to_x = vertices[bi * (nv * 2) + to_index * 2];
	to_y = vertices[bi * (nv * 2) + to_index * 2 + 1];

	x2_sub_x1 = to_x - from_x;
	y2_sub_y1 = to_y - from_y;

	// grad:
	// dX1 = 2 * x2_sub_x1 * -1
	// dX2 = 2 * x2_sub_x1
	// dY1 = 2 * y2_sub_y1 * -1
	// dY2 = 2 * y2_sub_y1
	// possible this could NaN?
	square_segment_length = x2_sub_x1 * x2_sub_x1 + y2_sub_y1 * y2_sub_y1 + 0.00001;

	x_sub_x1 = xp - from_x;
	y_sub_y1 = yp - from_y;
	x_sub_x2 = xp - to_x;
	y_sub_y2 = yp - to_y;

	// grad numer:
	// dX1 = -1 * x2_sub_x1 + -1 * x_sub_x1
	// dX2 = x_sub_x1
	scalar_t dot_num = ((x_sub_x1 * x2_sub_x1) + (y_sub_y1 * y2_sub_y1));
	dot = dot_num / square_segment_length;
	x_proj = xp - (from_x + dot * x2_sub_x1);
	y_proj = yp - (from_y + dot * y2_sub_y1);

	// negative sign?
	if (dot < 0) {
	// contribution = exp(-((xp - from_x) 2 + (yp - from_y) 2 / inv_smoothness)
	// grad_x1 = (rasterized[i] * 2 * x_sub_x1) / inv_smoothness;
	// grad_y1 = (rasterized[i] * 2 * y_sub_y1) / inv_smoothness;
	// grad_x1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * 2 * x_sub_x1 / inv_smoothness;
	// grad_y1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * 2 * y_sub_y1 / inv_smoothness;
	grad_x1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * -2 * x_sub_x1 / inv_smoothness;
	grad_y1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * -2 * y_sub_y1 / inv_smoothness;
	}
	else if (dot > 1) {
	// contribution = exp(-((xp - to_x) 2 + (yp - to_y) 2) / inv_smoothness)
	// grad_x2 = (rasterized[i] * 2 * x_sub_x2) / inv_smoothness;
	// grad_y2 = (rasterized[i] * 2 * y_sub_y2) / inv_smoothness;
	grad_x2 = in_out * rasterized[i] * (1.0 - rasterized[i]) * -2 * x_sub_x2 / inv_smoothness;
	grad_y2 = in_out * rasterized[i] * (1.0 - rasterized[i]) * -2 * y_sub_y2 / inv_smoothness;
	}
	else {
	// contribution = exp(-(xp - from_x) ** 2 / inv_smoothness)
	scalar_t ss_x1 = -2.0 * x2_sub_x1;
	scalar_t ss_x2 = 2.0 * x2_sub_x1;
	scalar_t ss_y1 = -2.0 * y2_sub_y1;
	scalar_t ss_y2 = 2.0 * y2_sub_y1;

	scalar_t dot_x1 = (square_segment_length * (-x2_sub_x1 - x_sub_x1) - dot_num * ss_x1) / pow(square_segment_length, 2);
	scalar_t dot_x2 = (square_segment_length * x_sub_x1 - dot_num * ss_x2) / pow(square_segment_length, 2);
	scalar_t dot_y1 = (square_segment_length * (-y2_sub_y1 - y_sub_y1) - dot_num * ss_y1) / pow(square_segment_length, 2);
	scalar_t dot_y2 = (square_segment_length * y_sub_y1 - dot_num * ss_y2) / pow(square_segment_length, 2);

	// d/dx()
	scalar_t x_proj_x1 = -1 - dot_x1 * x2_sub_x1 + dot;
	scalar_t x_proj_x2 = -(dot_x2 * x2_sub_x1 + dot);

	scalar_t y_proj_y1 = -1 - dot_y1 * y2_sub_y1 + dot;
	scalar_t y_proj_y2 = -(dot_y2 * y2_sub_y1 + dot);

	// we also need mixed.
	scalar_t y_proj_x1 = -dot_x1 * y2_sub_y1;
	scalar_t y_proj_x2 = -dot_x2 * y2_sub_y1;
	scalar_t x_proj_y1 = -dot_y1 * x2_sub_x1;
	scalar_t x_proj_y2 = -dot_y2 * x2_sub_x1;

	// - as well?
	grad_x1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * (2.0 * x_proj * x_proj_x1 + 2.0 * y_proj * y_proj_x1) / inv_smoothness;
	grad_x2 = in_out * rasterized[i] * (1.0 - rasterized[i]) * (2.0 * x_proj * x_proj_x2 + 2.0 * y_proj * y_proj_x2) / inv_smoothness;
	grad_y1 = in_out * rasterized[i] * (1.0 - rasterized[i]) * (2.0 * x_proj * x_proj_y1 + 2.0 * y_proj * y_proj_y1) / inv_smoothness;
	grad_y2 = in_out * rasterized[i] * (1.0 - rasterized[i]) * (2.0 * x_proj * x_proj_y2 + 2.0 * y_proj * y_proj_y2) / inv_smoothness;

	// grad_x1 = -rasterized[i] * (2.0 * x_proj * x_proj_x1 + 2.0 * y_proj * y_proj_x1) / inv_smoothness;
	// grad_x2 = -rasterized[i] * (2.0 * x_proj * x_proj_x2 + 2.0 * y_proj * y_proj_x2) / inv_smoothness;
	// grad_y1 = -rasterized[i] * (2.0 * x_proj * x_proj_y1 + 2.0 * y_proj * y_proj_y1) / inv_smoothness;
	// grad_y2 = -rasterized[i] * (2.0 * x_proj * x_proj_y2 + 2.0 * y_proj * y_proj_y2) / inv_smoothness;
	}

	// apply the input gradients.
	grad_x1 = grad_x1 * grad_output[i];
	grad_x2 = grad_x2 * grad_output[i];
	grad_y1 = grad_y1 * grad_output[i];
	grad_y2 = grad_y2 * grad_output[i];

	// grad_vertices[bi * (height * width * nv * 2) + yp * (width * nv * 2) + xp * (nv * 2) + from_index * 2] = grad_x1;
	// grad_vertices[bi * (height * width * nv * 2) + yp * (width * nv * 2) + xp * (nv * 2) + from_index * 2 + 1] = grad_y1;
	// grad_vertices[bi * (height * width * nv * 2) + yp * (width * nv * 2) + xp * (nv * 2) + to_index * 2] = grad_x2;
	// grad_vertices[bi * (height * width * nv * 2) + yp * (width * nv * 2) + xp * (nv * 2) + to_index * 2 + 1] = grad_y2;

	// unsure if should be deferencing.
	atomicAdd(grad_vertices + bi * nv * 2 + from_index * 2, grad_x1);
	atomicAdd(grad_vertices + bi * nv * 2 + from_index * 2 + 1, grad_y1);
	atomicAdd(grad_vertices + bi * nv * 2 + to_index * 2, grad_x2);
	atomicAdd(grad_vertices + bi * nv * 2 + to_index * 2 + 1, grad_y2);
	}

	std::vector<at::Tensor> forward_rasterize_cuda(
	at::Tensor vertices,
	at::Tensor rasterized,
	at::Tensor contribution_map,
	int width,
	int height,
	float inv_smoothness,
	int mode) {
	const auto batch_size = vertices.size(0);
	const auto number_vertices = vertices.size(1);
	const int threads = 512;

	// each block processes some 512 sized chunk of the output image.
	const dim3 blocks ((batch_size * width * height - 1) / threads + 1);

	if ((mode == MODE_MASK) \|\| (mode == MODE_HARD_MASK)) {
	// determine whether each point is inside or outside.
	AT_DISPATCH_FLOATING_TYPES(vertices.type(), "inside_outside_cuda", ([&] {
	inside_outside_cuda_kernel<scalar_t><<<blocks, threads>>>(
	vertices.data<scalar_t>(),
	batch_size,
	number_vertices,
	rasterized.data<scalar_t>(),
	height,
	width);
	}));
	}

	if (mode != MODE_HARD_MASK) {
	AT_DISPATCH_FLOATING_TYPES(vertices.type(), "forward_rasterize_cuda", ([&] {
	forward_rasterize_cuda_kernel<scalar_t><<<blocks, threads>>>(
	vertices.data<scalar_t>(),
	batch_size,
	number_vertices,
	rasterized.data<scalar_t>(),
	contribution_map.data<int>(),
	height,
	width,
	inv_smoothness,
	mode);
	}));
	}

	cudaError_t err = cudaGetLastError();
	err = cudaGetLastError();
	if (err != cudaSuccess)
	printf("Error in forward_rasterize: %s\n", cudaGetErrorString(err));

	return { rasterized, contribution_map };
	}

	at::Tensor backward_rasterize_cuda(
	at::Tensor vertices,
	at::Tensor rasterized,
	at::Tensor contribution_map,
	at::Tensor grad_output,
	at::Tensor grad_vertices,
	int width,
	int height,
	float inv_smoothness,
	int mode) {
	const auto batch_size = vertices.size(0);
	const auto number_vertices = vertices.size(1);
	const int threads = 512;
	const dim3 blocks ((batch_size * width * height - 1) / threads + 1);

	AT_DISPATCH_FLOATING_TYPES(vertices.type(), "backward_rasterize_cuda", ([&] {
	backward_rasterize_cuda_kernel<scalar_t><<<blocks, threads>>>(
	vertices.data<scalar_t>(),
	rasterized.data<scalar_t>(),
	contribution_map.data<int>(),
	grad_output.data<scalar_t>(),
	grad_vertices.data<scalar_t>(),
	batch_size,
	number_vertices,
	width,
	height,
	inv_smoothness);
	}));

	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
	printf("Error in backward_rasterize: %s\n", cudaGetErrorString(err));

	return grad_vertices;
	}