ROCmPort-AI / backend /demo_kernels /convolution_2d.cu
tazwarrrr's picture
Initial commit
1a6672d
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
// 2D Convolution kernel with intentional warp size bug
__global__ void convolution2D(const float *input, const float *kernel, float *output,
int input_height, int input_width, int kernel_size, int output_height, int output_width) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < output_height && col < output_width) {
float sum = 0.0f;
int kernel_radius = kernel_size / 2;
// Apply convolution
for (int i = -kernel_radius; i <= kernel_radius; i++) {
for (int j = -kernel_radius; j <= kernel_radius; j++) {
int input_row = row + i;
int input_col = col + j;
// Check bounds
if (input_row >= 0 && input_row < input_height &&
input_col >= 0 && input_col < input_width) {
int kernel_row = i + kernel_radius;
int kernel_col = j + kernel_radius;
sum += input[input_row * input_width + input_col] *
kernel[kernel_row * kernel_size + kernel_col];
}
}
}
output[row * output_width + col] = sum;
// Intentional warp size bug - assumes 32 threads per warp
// This will break on AMD wavefront (64 threads)
if (threadIdx.x % 32 == 0 && threadIdx.y % 32 == 0) {
// This warp-level operation only works for CUDA
printf("Warp (%d,%d) processed output pixel (%d,%d) = %f\n",
threadIdx.x / 32, threadIdx.y / 32, row, col, sum);
}
}
}
// Shared memory version for comparison
__global__ void convolution2DShared(const float *input, const float *kernel, float *output,
int input_height, int input_width, int kernel_size, int output_height, int output_width) {
__shared__ float shared_input[32 + 6][32 + 6]; // +6 for 3x3 kernel padding
__shared__ float shared_kernel[7][7]; // Max 7x7 kernel
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int kernel_radius = kernel_size / 2;
// Load kernel into shared memory
if (threadIdx.x < kernel_size && threadIdx.y < kernel_size) {
shared_kernel[threadIdx.y][threadIdx.x] = kernel[threadIdx.y * kernel_size + threadIdx.x];
}
// Load input tile with padding
int input_row = blockIdx.y * blockDim.y + threadIdx.y - kernel_radius;
int input_col = blockIdx.x * blockDim.x + threadIdx.x - kernel_radius;
if (input_row >= 0 && input_row < input_height && input_col >= 0 && input_col < input_width) {
shared_input[threadIdx.y][threadIdx.x] = input[input_row * input_width + input_col];
} else {
shared_input[threadIdx.y][threadIdx.x] = 0.0f;
}
__syncthreads();
// Compute convolution
if (row < output_height && col < output_width) {
float sum = 0.0f;
for (int i = 0; i < kernel_size; i++) {
for (int j = 0; j < kernel_size; j++) {
sum += shared_input[threadIdx.y + i][threadIdx.x + j] * shared_kernel[i][j];
}
}
output[row * output_width + col] = sum;
}
}
int main(int argc, char **argv) {
int input_height = 1024;
int input_width = 1024;
int kernel_size = 3;
int output_height = input_height - kernel_size + 1;
int output_width = input_width - kernel_size + 1;
size_t input_size = input_height * input_width * sizeof(float);
size_t kernel_size_bytes = kernel_size * kernel_size * sizeof(float);
size_t output_size = output_height * output_width * sizeof(float);
printf("Input: %dx%d, Kernel: %dx%d, Output: %dx%d\n",
input_height, input_width, kernel_size, kernel_size, output_height, output_width);
// Allocate host memory
float *h_input = (float *)malloc(input_size);
float *h_kernel = (float *)malloc(kernel_size_bytes);
float *h_output = (float *)malloc(output_size);
float *h_output_ref = (float *)malloc(output_size);
// Initialize input and kernel
for (int i = 0; i < input_height * input_width; i++) {
h_input[i] = rand() / (float)RAND_MAX;
}
// Simple 3x3 edge detection kernel
float kernel_3x3[9] = {-1, -1, -1, -1, 8, -1, -1, -1, -1};
for (int i = 0; i < kernel_size * kernel_size; i++) {
h_kernel[i] = kernel_3x3[i];
}
// Allocate device memory
float *d_input, *d_kernel, *d_output, *d_output_ref;
cudaMalloc(&d_input, input_size);
cudaMalloc(&d_kernel, kernel_size_bytes);
cudaMalloc(&d_output, output_size);
cudaMalloc(&d_output_ref, output_size);
// Copy to device
cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_kernel, h_kernel, kernel_size_bytes, cudaMemcpyHostToDevice);
// Setup kernel launch parameters
dim3 threadsPerBlock(32, 32);
dim3 blocksPerGrid((output_width + threadsPerBlock.x - 1) / threadsPerBlock.x,
(output_height + threadsPerBlock.y - 1) / threadsPerBlock.y);
printf("Launching kernel with grid (%d,%d) and block (%d,%d)\n",
blocksPerGrid.x, blocksPerGrid.y, threadsPerBlock.x, threadsPerBlock.y);
// Warmup
convolution2D<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_kernel, d_output_ref,
input_height, input_width, kernel_size,
output_height, output_width);
cudaDeviceSynchronize();
// Time basic kernel
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
convolution2D<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_kernel, d_output_ref,
input_height, input_width, kernel_size,
output_height, output_width);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float basic_time = 0;
cudaEventElapsedTime(&basic_time, start, stop);
printf("Basic kernel time: %.3f ms\n", basic_time);
// Time shared memory kernel
cudaEventRecord(start);
convolution2DShared<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_kernel, d_output,
input_height, input_width, kernel_size,
output_height, output_width);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float shared_time = 0;
cudaEventElapsedTime(&shared_time, start, stop);
printf("Shared memory kernel time: %.3f ms\n", shared_time);
printf("Speedup: %.2fx\n", basic_time / shared_time);
// Copy results back
cudaMemcpy(h_output_ref, d_output_ref, output_size, cudaMemcpyDeviceToHost);
cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost);
// Verify results (first few elements)
bool correct = true;
for (int i = 0; i < min(100, output_height * output_width); i++) {
if (fabs(h_output[i] - h_output_ref[i]) > 1e-5) {
printf("Mismatch at element %d: %f != %f\n", i, h_output[i], h_output_ref[i]);
correct = false;
break;
}
}
if (correct) {
printf("Verification PASSED (first 100 elements)\n");
} else {
printf("Verification FAILED\n");
}
// Cleanup
cudaFree(d_input);
cudaFree(d_kernel);
cudaFree(d_output);
cudaFree(d_output_ref);
free(h_input);
free(h_kernel);
free(h_output);
free(h_output_ref);
printf("Done\n");
return 0;
}