#include #include #include // 2D Convolution kernel with intentional warp size bug __global__ void convolution2D(const float *input, const float *kernel, float *output, int input_height, int input_width, int kernel_size, int output_height, int output_width) { int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; if (row < output_height && col < output_width) { float sum = 0.0f; int kernel_radius = kernel_size / 2; // Apply convolution for (int i = -kernel_radius; i <= kernel_radius; i++) { for (int j = -kernel_radius; j <= kernel_radius; j++) { int input_row = row + i; int input_col = col + j; // Check bounds if (input_row >= 0 && input_row < input_height && input_col >= 0 && input_col < input_width) { int kernel_row = i + kernel_radius; int kernel_col = j + kernel_radius; sum += input[input_row * input_width + input_col] * kernel[kernel_row * kernel_size + kernel_col]; } } } output[row * output_width + col] = sum; // Intentional warp size bug - assumes 32 threads per warp // This will break on AMD wavefront (64 threads) if (threadIdx.x % 32 == 0 && threadIdx.y % 32 == 0) { // This warp-level operation only works for CUDA printf("Warp (%d,%d) processed output pixel (%d,%d) = %f\n", threadIdx.x / 32, threadIdx.y / 32, row, col, sum); } } } // Shared memory version for comparison __global__ void convolution2DShared(const float *input, const float *kernel, float *output, int input_height, int input_width, int kernel_size, int output_height, int output_width) { __shared__ float shared_input[32 + 6][32 + 6]; // +6 for 3x3 kernel padding __shared__ float shared_kernel[7][7]; // Max 7x7 kernel int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; int kernel_radius = kernel_size / 2; // Load kernel into shared memory if (threadIdx.x < kernel_size && threadIdx.y < kernel_size) { shared_kernel[threadIdx.y][threadIdx.x] = kernel[threadIdx.y * kernel_size + threadIdx.x]; } // Load input tile with padding int input_row = blockIdx.y * blockDim.y + threadIdx.y - kernel_radius; int input_col = blockIdx.x * blockDim.x + threadIdx.x - kernel_radius; if (input_row >= 0 && input_row < input_height && input_col >= 0 && input_col < input_width) { shared_input[threadIdx.y][threadIdx.x] = input[input_row * input_width + input_col]; } else { shared_input[threadIdx.y][threadIdx.x] = 0.0f; } __syncthreads(); // Compute convolution if (row < output_height && col < output_width) { float sum = 0.0f; for (int i = 0; i < kernel_size; i++) { for (int j = 0; j < kernel_size; j++) { sum += shared_input[threadIdx.y + i][threadIdx.x + j] * shared_kernel[i][j]; } } output[row * output_width + col] = sum; } } int main(int argc, char **argv) { int input_height = 1024; int input_width = 1024; int kernel_size = 3; int output_height = input_height - kernel_size + 1; int output_width = input_width - kernel_size + 1; size_t input_size = input_height * input_width * sizeof(float); size_t kernel_size_bytes = kernel_size * kernel_size * sizeof(float); size_t output_size = output_height * output_width * sizeof(float); printf("Input: %dx%d, Kernel: %dx%d, Output: %dx%d\n", input_height, input_width, kernel_size, kernel_size, output_height, output_width); // Allocate host memory float *h_input = (float *)malloc(input_size); float *h_kernel = (float *)malloc(kernel_size_bytes); float *h_output = (float *)malloc(output_size); float *h_output_ref = (float *)malloc(output_size); // Initialize input and kernel for (int i = 0; i < input_height * input_width; i++) { h_input[i] = rand() / (float)RAND_MAX; } // Simple 3x3 edge detection kernel float kernel_3x3[9] = {-1, -1, -1, -1, 8, -1, -1, -1, -1}; for (int i = 0; i < kernel_size * kernel_size; i++) { h_kernel[i] = kernel_3x3[i]; } // Allocate device memory float *d_input, *d_kernel, *d_output, *d_output_ref; cudaMalloc(&d_input, input_size); cudaMalloc(&d_kernel, kernel_size_bytes); cudaMalloc(&d_output, output_size); cudaMalloc(&d_output_ref, output_size); // Copy to device cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice); cudaMemcpy(d_kernel, h_kernel, kernel_size_bytes, cudaMemcpyHostToDevice); // Setup kernel launch parameters dim3 threadsPerBlock(32, 32); dim3 blocksPerGrid((output_width + threadsPerBlock.x - 1) / threadsPerBlock.x, (output_height + threadsPerBlock.y - 1) / threadsPerBlock.y); printf("Launching kernel with grid (%d,%d) and block (%d,%d)\n", blocksPerGrid.x, blocksPerGrid.y, threadsPerBlock.x, threadsPerBlock.y); // Warmup convolution2D<<>>(d_input, d_kernel, d_output_ref, input_height, input_width, kernel_size, output_height, output_width); cudaDeviceSynchronize(); // Time basic kernel cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start); convolution2D<<>>(d_input, d_kernel, d_output_ref, input_height, input_width, kernel_size, output_height, output_width); cudaEventRecord(stop); cudaEventSynchronize(stop); float basic_time = 0; cudaEventElapsedTime(&basic_time, start, stop); printf("Basic kernel time: %.3f ms\n", basic_time); // Time shared memory kernel cudaEventRecord(start); convolution2DShared<<>>(d_input, d_kernel, d_output, input_height, input_width, kernel_size, output_height, output_width); cudaEventRecord(stop); cudaEventSynchronize(stop); float shared_time = 0; cudaEventElapsedTime(&shared_time, start, stop); printf("Shared memory kernel time: %.3f ms\n", shared_time); printf("Speedup: %.2fx\n", basic_time / shared_time); // Copy results back cudaMemcpy(h_output_ref, d_output_ref, output_size, cudaMemcpyDeviceToHost); cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost); // Verify results (first few elements) bool correct = true; for (int i = 0; i < min(100, output_height * output_width); i++) { if (fabs(h_output[i] - h_output_ref[i]) > 1e-5) { printf("Mismatch at element %d: %f != %f\n", i, h_output[i], h_output_ref[i]); correct = false; break; } } if (correct) { printf("Verification PASSED (first 100 elements)\n"); } else { printf("Verification FAILED\n"); } // Cleanup cudaFree(d_input); cudaFree(d_kernel); cudaFree(d_output); cudaFree(d_output_ref); free(h_input); free(h_kernel); free(h_output); free(h_output_ref); printf("Done\n"); return 0; }