| #include <cuda_runtime.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
|
|
| |
| __global__ void convolution2D(const float *input, const float *kernel, float *output, |
| int input_height, int input_width, int kernel_size, int output_height, int output_width) { |
| int row = blockIdx.y * blockDim.y + threadIdx.y; |
| int col = blockIdx.x * blockDim.x + threadIdx.x; |
| |
| if (row < output_height && col < output_width) { |
| float sum = 0.0f; |
| int kernel_radius = kernel_size / 2; |
| |
| |
| for (int i = -kernel_radius; i <= kernel_radius; i++) { |
| for (int j = -kernel_radius; j <= kernel_radius; j++) { |
| int input_row = row + i; |
| int input_col = col + j; |
| |
| |
| if (input_row >= 0 && input_row < input_height && |
| input_col >= 0 && input_col < input_width) { |
| |
| int kernel_row = i + kernel_radius; |
| int kernel_col = j + kernel_radius; |
| |
| sum += input[input_row * input_width + input_col] * |
| kernel[kernel_row * kernel_size + kernel_col]; |
| } |
| } |
| } |
| |
| output[row * output_width + col] = sum; |
| |
| |
| |
| if (threadIdx.x % 32 == 0 && threadIdx.y % 32 == 0) { |
| |
| printf("Warp (%d,%d) processed output pixel (%d,%d) = %f\n", |
| threadIdx.x / 32, threadIdx.y / 32, row, col, sum); |
| } |
| } |
| } |
|
|
| |
| __global__ void convolution2DShared(const float *input, const float *kernel, float *output, |
| int input_height, int input_width, int kernel_size, int output_height, int output_width) { |
| __shared__ float shared_input[32 + 6][32 + 6]; |
| __shared__ float shared_kernel[7][7]; |
| |
| int row = blockIdx.y * blockDim.y + threadIdx.y; |
| int col = blockIdx.x * blockDim.x + threadIdx.x; |
| |
| int kernel_radius = kernel_size / 2; |
| |
| |
| if (threadIdx.x < kernel_size && threadIdx.y < kernel_size) { |
| shared_kernel[threadIdx.y][threadIdx.x] = kernel[threadIdx.y * kernel_size + threadIdx.x]; |
| } |
| |
| |
| int input_row = blockIdx.y * blockDim.y + threadIdx.y - kernel_radius; |
| int input_col = blockIdx.x * blockDim.x + threadIdx.x - kernel_radius; |
| |
| if (input_row >= 0 && input_row < input_height && input_col >= 0 && input_col < input_width) { |
| shared_input[threadIdx.y][threadIdx.x] = input[input_row * input_width + input_col]; |
| } else { |
| shared_input[threadIdx.y][threadIdx.x] = 0.0f; |
| } |
| |
| __syncthreads(); |
| |
| |
| if (row < output_height && col < output_width) { |
| float sum = 0.0f; |
| |
| for (int i = 0; i < kernel_size; i++) { |
| for (int j = 0; j < kernel_size; j++) { |
| sum += shared_input[threadIdx.y + i][threadIdx.x + j] * shared_kernel[i][j]; |
| } |
| } |
| |
| output[row * output_width + col] = sum; |
| } |
| } |
|
|
| int main(int argc, char **argv) { |
| int input_height = 1024; |
| int input_width = 1024; |
| int kernel_size = 3; |
| |
| int output_height = input_height - kernel_size + 1; |
| int output_width = input_width - kernel_size + 1; |
| |
| size_t input_size = input_height * input_width * sizeof(float); |
| size_t kernel_size_bytes = kernel_size * kernel_size * sizeof(float); |
| size_t output_size = output_height * output_width * sizeof(float); |
| |
| printf("Input: %dx%d, Kernel: %dx%d, Output: %dx%d\n", |
| input_height, input_width, kernel_size, kernel_size, output_height, output_width); |
| |
| |
| float *h_input = (float *)malloc(input_size); |
| float *h_kernel = (float *)malloc(kernel_size_bytes); |
| float *h_output = (float *)malloc(output_size); |
| float *h_output_ref = (float *)malloc(output_size); |
| |
| |
| for (int i = 0; i < input_height * input_width; i++) { |
| h_input[i] = rand() / (float)RAND_MAX; |
| } |
| |
| |
| float kernel_3x3[9] = {-1, -1, -1, -1, 8, -1, -1, -1, -1}; |
| for (int i = 0; i < kernel_size * kernel_size; i++) { |
| h_kernel[i] = kernel_3x3[i]; |
| } |
| |
| |
| float *d_input, *d_kernel, *d_output, *d_output_ref; |
| cudaMalloc(&d_input, input_size); |
| cudaMalloc(&d_kernel, kernel_size_bytes); |
| cudaMalloc(&d_output, output_size); |
| cudaMalloc(&d_output_ref, output_size); |
| |
| |
| cudaMemcpy(d_input, h_input, input_size, cudaMemcpyHostToDevice); |
| cudaMemcpy(d_kernel, h_kernel, kernel_size_bytes, cudaMemcpyHostToDevice); |
| |
| |
| dim3 threadsPerBlock(32, 32); |
| dim3 blocksPerGrid((output_width + threadsPerBlock.x - 1) / threadsPerBlock.x, |
| (output_height + threadsPerBlock.y - 1) / threadsPerBlock.y); |
| |
| printf("Launching kernel with grid (%d,%d) and block (%d,%d)\n", |
| blocksPerGrid.x, blocksPerGrid.y, threadsPerBlock.x, threadsPerBlock.y); |
| |
| |
| convolution2D<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_kernel, d_output_ref, |
| input_height, input_width, kernel_size, |
| output_height, output_width); |
| cudaDeviceSynchronize(); |
| |
| |
| cudaEvent_t start, stop; |
| cudaEventCreate(&start); |
| cudaEventCreate(&stop); |
| |
| cudaEventRecord(start); |
| convolution2D<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_kernel, d_output_ref, |
| input_height, input_width, kernel_size, |
| output_height, output_width); |
| cudaEventRecord(stop); |
| cudaEventSynchronize(stop); |
| |
| float basic_time = 0; |
| cudaEventElapsedTime(&basic_time, start, stop); |
| printf("Basic kernel time: %.3f ms\n", basic_time); |
| |
| |
| cudaEventRecord(start); |
| convolution2DShared<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_kernel, d_output, |
| input_height, input_width, kernel_size, |
| output_height, output_width); |
| cudaEventRecord(stop); |
| cudaEventSynchronize(stop); |
| |
| float shared_time = 0; |
| cudaEventElapsedTime(&shared_time, start, stop); |
| printf("Shared memory kernel time: %.3f ms\n", shared_time); |
| |
| printf("Speedup: %.2fx\n", basic_time / shared_time); |
| |
| |
| cudaMemcpy(h_output_ref, d_output_ref, output_size, cudaMemcpyDeviceToHost); |
| cudaMemcpy(h_output, d_output, output_size, cudaMemcpyDeviceToHost); |
| |
| |
| bool correct = true; |
| for (int i = 0; i < min(100, output_height * output_width); i++) { |
| if (fabs(h_output[i] - h_output_ref[i]) > 1e-5) { |
| printf("Mismatch at element %d: %f != %f\n", i, h_output[i], h_output_ref[i]); |
| correct = false; |
| break; |
| } |
| } |
| |
| if (correct) { |
| printf("Verification PASSED (first 100 elements)\n"); |
| } else { |
| printf("Verification FAILED\n"); |
| } |
| |
| |
| cudaFree(d_input); |
| cudaFree(d_kernel); |
| cudaFree(d_output); |
| cudaFree(d_output_ref); |
| free(h_input); |
| free(h_kernel); |
| free(h_output); |
| free(h_output_ref); |
| |
| printf("Done\n"); |
| return 0; |
| } |
|
|