Spaces:

lablab-ai-amd-developer-hackathon
/

ROCmPort-AI

Running

App Files Files Community

ROCmPort-AI / backend /demo_kernels /convolution_2d.cu

tazwarrrr

Initial commit

1a6672d about 2 months ago

raw

history blame contribute delete

8 kB

	#include <cuda_runtime.h>
	#include <stdio.h>
	#include <stdlib.h>

	// 2D Convolution kernel with intentional warp size bug
	__global__ void convolution2D(const float input, const float kernel, float *output,
	int input_height, int input_width, int kernel_size, int output_height, int output_width) {
	int row = blockIdx.y * blockDim.y + threadIdx.y;
	int col = blockIdx.x * blockDim.x + threadIdx.x;

	if (row < output_height && col < output_width) {
	float sum = 0.0f;
	int kernel_radius = kernel_size / 2;

	// Apply convolution
	for (int i = -kernel_radius; i <= kernel_radius; i++) {
	for (int j = -kernel_radius; j <= kernel_radius; j++) {
	int input_row = row + i;
	int input_col = col + j;

	// Check bounds
	if (input_row >= 0 && input_row < input_height &&
	input_col >= 0 && input_col < input_width) {

	int kernel_row = i + kernel_radius;
	int kernel_col = j + kernel_radius;

	sum += input[input_row * input_width + input_col] *
	kernel[kernel_row * kernel_size + kernel_col];
	}
	}
	}

	output[row * output_width + col] = sum;

	// Intentional warp size bug - assumes 32 threads per warp
	// This will break on AMD wavefront (64 threads)
	if (threadIdx.x % 32 == 0 && threadIdx.y % 32 == 0) {
	// This warp-level operation only works for CUDA
	printf("Warp (%d,%d) processed output pixel (%d,%d) = %f\n",
	threadIdx.x / 32, threadIdx.y / 32, row, col, sum);
	}
	}
	}

	// Shared memory version for comparison
	__global__ void convolution2DShared(const float input, const float kernel, float *output,
	int input_height, int input_width, int kernel_size, int output_height, int output_width) {
	__shared__ float shared_input[32 + 6][32 + 6]; // +6 for 3x3 kernel padding
	__shared__ float shared_kernel[7][7]; // Max 7x7 kernel

	int row = blockIdx.y * blockDim.y + threadIdx.y;
	int col = blockIdx.x * blockDim.x + threadIdx.x;

	int kernel_radius = kernel_size / 2;

	// Load kernel into shared memory
	if (threadIdx.x < kernel_size && threadIdx.y < kernel_size) {
	shared_kernel[threadIdx.y][threadIdx.x] = kernel[threadIdx.y * kernel_size + threadIdx.x];
	}

	// Load input tile with padding
	int input_row = blockIdx.y * blockDim.y + threadIdx.y - kernel_radius;
	int input_col = blockIdx.x * blockDim.x + threadIdx.x - kernel_radius;

	if (input_row >= 0 && input_row < input_height && input_col >= 0 && input_col < input_width) {
	shared_input[threadIdx.y][threadIdx.x] = input[input_row * input_width + input_col];
	} else {
	shared_input[threadIdx.y][threadIdx.x] = 0.0f;
	}

	__syncthreads();

	// Compute convolution
	if (row < output_height && col < output_width) {
	float sum = 0.0f;

	for (int i = 0; i < kernel_size; i++) {
	for (int j = 0; j < kernel_size; j++) {
	sum += shared_input[threadIdx.y + i][threadIdx.x + j] * shared_kernel[i][j];
	}
	}

	output[row * output_width + col] = sum;
	}
	}

	int main(int argc, char **argv) {
	int input_height = 1024;
	int input_width = 1024;
	int kernel_size = 3;

	int output_height = input_height - kernel_size + 1;
	int output_width = input_width - kernel_size + 1;

	size_t input_size = input_height * input_width * sizeof(float);
	size_t kernel_size_bytes = kernel_size * kernel_size * sizeof(float);
	size_t output_size = output_height * output_width * sizeof(float);

	printf("Input: %dx%d, Kernel: %dx%d, Output: %dx%d\n",
	input_height, input_width, kernel_size, kernel_size, output_height, output_width);

	// Allocate host memory
	float h_input = (float )malloc(input_size);
	float h_kernel = (float )malloc(kernel_size_bytes);
	float h_output = (float )malloc(output_size);
	float h_output_ref = (float )malloc(output_size);

	// Initialize input and kernel
	for (int i = 0; i < input_height * input_width; i++) {
	h_input[i] = rand() / (float)RAND_MAX;
	}

	// Simple 3x3 edge detection kernel
	float kernel_3x3[9] = {-1, -1, -1, -1, 8, -1, -1, -1, -1};
	for (int i = 0; i < kernel_size * kernel_size; i++) {
	h_kernel[i] = kernel_3x3[i];
	}

	// Allocate device memory
	float d_input, d_kernel, d_output, d_output_ref;
	cudaMalloc(&d_input, input_size);
	cudaMalloc(&d_kernel, kernel_size_bytes);
	cudaMalloc(&d_output, output_size);
	cudaMalloc(&d_output_ref, output_size);

	// Copy to device
	cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice);
	cudaMemcpy(d_kernel, h_kernel, kernel_size_bytes, cudaMemcpyHostToDevice);

	// Setup kernel launch parameters
	dim3 threadsPerBlock(32, 32);
	dim3 blocksPerGrid((output_width + threadsPerBlock.x - 1) / threadsPerBlock.x,
	(output_height + threadsPerBlock.y - 1) / threadsPerBlock.y);

	printf("Launching kernel with grid (%d,%d) and block (%d,%d)\n",
	blocksPerGrid.x, blocksPerGrid.y, threadsPerBlock.x, threadsPerBlock.y);

	// Warmup
	convolution2D<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_kernel, d_output_ref,
	input_height, input_width, kernel_size,
	output_height, output_width);
	cudaDeviceSynchronize();

	// Time basic kernel
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	cudaEventRecord(start);
	convolution2D<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_kernel, d_output_ref,
	input_height, input_width, kernel_size,
	output_height, output_width);
	cudaEventRecord(stop);
	cudaEventSynchronize(stop);

	float basic_time = 0;
	cudaEventElapsedTime(&basic_time, start, stop);
	printf("Basic kernel time: %.3f ms\n", basic_time);

	// Time shared memory kernel
	cudaEventRecord(start);
	convolution2DShared<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_kernel, d_output,
	input_height, input_width, kernel_size,
	output_height, output_width);
	cudaEventRecord(stop);
	cudaEventSynchronize(stop);

	float shared_time = 0;
	cudaEventElapsedTime(&shared_time, start, stop);
	printf("Shared memory kernel time: %.3f ms\n", shared_time);

	printf("Speedup: %.2fx\n", basic_time / shared_time);

	// Copy results back
	cudaMemcpy(h_output_ref, d_output_ref, output_size, cudaMemcpyDeviceToHost);
	cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost);

	// Verify results (first few elements)
	bool correct = true;
	for (int i = 0; i < min(100, output_height * output_width); i++) {
	if (fabs(h_output[i] - h_output_ref[i]) > 1e-5) {
	printf("Mismatch at element %d: %f != %f\n", i, h_output[i], h_output_ref[i]);
	correct = false;
	break;
	}
	}

	if (correct) {
	printf("Verification PASSED (first 100 elements)\n");
	} else {
	printf("Verification FAILED\n");
	}

	// Cleanup
	cudaFree(d_input);
	cudaFree(d_kernel);
	cudaFree(d_output);
	cudaFree(d_output_ref);
	free(h_input);
	free(h_kernel);
	free(h_output);
	free(h_output_ref);

	printf("Done\n");
	return 0;
	}