| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #include <unistd.h> |
| #include <stdio.h> |
| #include <time.h> |
| #include <sys/time.h> |
| #include <stdlib.h> |
| #include <stdarg.h> |
| #include <string.h> |
| #include <cuda.h> |
|
|
| #define SMALL_FLOAT_VAL 0.00000001f |
|
|
| double rtclock() |
| { |
| struct timezone Tzp; |
| struct timeval Tp; |
| int stat; |
| stat = gettimeofday(&Tp, &Tzp); |
| if (stat != 0) |
| printf("Error return from gettimeofday: %d", stat); |
| return (Tp.tv_sec + Tp.tv_usec * 1.0e-6); |
| } |
|
|
| float absVal(float a) |
| { |
| if (a < 0) |
| { |
| return (a * -1); |
| } |
| else |
| { |
| return a; |
| } |
| } |
|
|
| float percentDiff(double val1, double val2) |
| { |
| if ((absVal(val1) < 0.01) && (absVal(val2) < 0.01)) |
| { |
| return 0.0f; |
| } |
|
|
| else |
| { |
| return 100.0f * (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL))); |
| } |
| } |
|
|
| #define GPU_DEVICE 0 |
|
|
| |
| #define PERCENT_DIFF_ERROR_THRESHOLD 0.05 |
|
|
| |
| #define SIZE 40960 |
| int NI; |
| int NJ; |
|
|
| |
| #define DIM_THREAD_BLOCK_X 32 |
| #define DIM_THREAD_BLOCK_Y 32 |
|
|
| #define SHMEM_SIZE (DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y) |
| |
|
|
| |
| #define ALPHA 1.1f |
| #define BETA 1.1f |
|
|
| |
| typedef float DATA_TYPE; |
| |
|
|
| void gemv(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C) |
| { |
| int i,j; |
| |
| for (i = 0; i < NI; i++) { |
| C[i] *= BETA; |
| for (j = 0; j < NJ; j++) { |
| C[i] += ALPHA * A[i*NJ + j] * B[j]; |
| } |
| } |
| } |
|
|
| void init(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C, DATA_TYPE *C_ref) |
| { |
| int i, j; |
|
|
| for (i = 0; i < NI; i++) |
| for (j = 0; j < NJ; j++) |
| A[i*NJ + j] = ((DATA_TYPE) i*j) / NI; |
|
|
| for (j = 0; j < NJ; j++) |
| B[j] = ((DATA_TYPE) j + 1) / NJ; |
|
|
| for (i = 0; i < NI; i++) { |
| C[i] = ((DATA_TYPE)i + 2) / NI; |
| C_ref[i] = ((DATA_TYPE)i + 2) / NI; |
| } |
|
|
| for (i = 0; i < NI; i++) |
| for (j = 0; j < NJ; j++) |
| A[i*NJ + j] = 1.0f; |
|
|
| |
| |
|
|
| |
| |
| |
| |
| } |
|
|
|
|
| void compareResults(DATA_TYPE* C, DATA_TYPE* C_outputFromGpu) |
| { |
| int i, fail; |
| fail = 0; |
| |
| |
| for (i=0; i < NI; i++) { |
| if (percentDiff(C[i], C_outputFromGpu[i]) > PERCENT_DIFF_ERROR_THRESHOLD) { |
| fail++; |
| printf("%d, GPU is %f, CPU is %f.\n", i, C[i], C_outputFromGpu[i]); |
| } |
| } |
| |
| |
| printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %d\n", PERCENT_DIFF_ERROR_THRESHOLD, fail); |
| } |
|
|
|
|
| void GPU_argv_init() |
| { |
| cudaDeviceProp deviceProp; |
| cudaGetDeviceProperties(&deviceProp, GPU_DEVICE); |
| printf("setting device %d with name %s\n",GPU_DEVICE,deviceProp.name); |
| cudaSetDevice( GPU_DEVICE ); |
| } |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
| |
|
|
| |
|
|
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| __global__ void gemv_kernel(DATA_TYPE *a, DATA_TYPE *b, DATA_TYPE *c, int NI, int NJ) |
| { |
| |
| int row = blockIdx.y * blockDim.y + threadIdx.y; |
| int col = blockIdx.x * blockDim.x + threadIdx.x; |
|
|
| |
| __shared__ DATA_TYPE s_a[DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y]; |
| __shared__ DATA_TYPE s_b[DIM_THREAD_BLOCK_X]; |
|
|
| |
|
|
| DATA_TYPE tmp = BETA * c[row]; |
| |
| |
| for (int i = 0; i < NJ; i += blockDim.x) |
| { |
| s_a[threadIdx.y * blockDim.x + threadIdx.x] = a[row * NJ + i + threadIdx.x]; |
| s_b[threadIdx.x] = b[i + threadIdx.x]; |
|
|
| |
|
|
| |
| __syncthreads(); |
|
|
| |
| for (int k = 0; k < blockDim.x; k++) |
| { |
| tmp += ALPHA * s_a[threadIdx.y * blockDim.x + k] * s_b[threadIdx.x]; |
| } |
|
|
| |
| __syncthreads(); |
| } |
|
|
| c[row] = tmp; |
| } |
|
|
| void gemvCuda(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C, DATA_TYPE *A_gpu, DATA_TYPE *B_gpu, DATA_TYPE *C_gpu, DATA_TYPE *C_gpu_tmp) |
| { |
| double t_start, t_end; |
|
|
| dim3 block(DIM_THREAD_BLOCK_X, DIM_THREAD_BLOCK_Y); |
| |
| dim3 grid((size_t)(ceil( ((float)NI)/ ((float)block.x) )),(size_t)(ceil( ((float)NJ)/ ((float)block.y) ))); |
|
|
| dim3 block_reduce(DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y); |
| dim3 grid_reduce((size_t)(ceil(((float)NJ) / ((float)(DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y))))); |
|
|
| t_start = rtclock(); |
| cudaMemcpy(A_gpu, A, sizeof(DATA_TYPE) * NI * NJ, cudaMemcpyHostToDevice); |
| cudaMemcpy(B_gpu, B, sizeof(DATA_TYPE) * NJ, cudaMemcpyHostToDevice); |
| cudaMemcpy(C_gpu, C, sizeof(DATA_TYPE) * NI, cudaMemcpyHostToDevice); |
| gemv_kernel<<<grid, block>>>(A_gpu, B_gpu, C_gpu, NI, NJ); |
| |
| cudaDeviceSynchronize(); |
| cudaMemcpy(C, C_gpu, sizeof(DATA_TYPE) * NI, cudaMemcpyDeviceToHost); |
| t_end = rtclock(); |
|
|
| fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start); |
| } |
| |
|
|
| int main(int argc, char *argv[]) |
| { |
| if (argc >= 3) { |
| NI = atoi(argv[1]); |
| NJ = atoi(argv[2]); |
| } else { |
| NI = SIZE; |
| NJ = SIZE; |
| } |
|
|
| double t_start, t_end; |
|
|
| DATA_TYPE* A; |
| DATA_TYPE* B; |
| DATA_TYPE* C; |
| DATA_TYPE *C_ref; |
|
|
| DATA_TYPE *A_gpu; |
| DATA_TYPE *B_gpu; |
| DATA_TYPE *C_gpu; |
| DATA_TYPE *C_gpu_tmp; |
|
|
| A = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE)); |
| B = (DATA_TYPE*)malloc(NJ*sizeof(DATA_TYPE)); |
| C = (DATA_TYPE*)malloc(NI*sizeof(DATA_TYPE)); |
| C_ref = (DATA_TYPE *)malloc(NI*sizeof(DATA_TYPE)); |
|
|
| |
| |
| |
| |
| cudaMalloc(&A_gpu, sizeof(DATA_TYPE) * NI * NJ); |
| cudaMalloc(&B_gpu, sizeof(DATA_TYPE) * NJ); |
| cudaMalloc(&C_gpu, sizeof(DATA_TYPE) * NI); |
| cudaMalloc(&C_gpu_tmp, sizeof(DATA_TYPE) * NI * DIM_THREAD_BLOCK_X); |
|
|
| init(A, B, C, C_ref); |
|
|
| GPU_argv_init(); |
|
|
| gemvCuda(A, B, C, A_gpu, B_gpu, C_gpu, C_gpu_tmp); |
|
|
| t_start = rtclock(); |
| gemv(A, B, C_ref); |
| t_end = rtclock(); |
| fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start); |
| |
| compareResults(C, C_ref); |
|
|
| free(A); |
| free(B); |
| free(C); |
| free(C_ref); |
| cudaFree(A_gpu); |
| cudaFree(B_gpu); |
| cudaFree(C_gpu); |
| cudaFree(C_gpu_tmp); |
| return 0; |
| } |
|
|
|
|