/**
 * gemv.cu: GEMV benchmark adapted from gemm.cu in the PolyBench/GPU 1.0
 * test suite.
 *
 * Contact: Scott Grauer-Gray <sgrauerg@gmail.com>
 * Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
 * Web address: http://www.cse.ohio-state.edu/~pouchet/software/polybench/GPU
 */
#include <unistd.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define SMALL_FLOAT_VAL 0.00000001f
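/* Optional sketch (an addition, not part of the original benchmark): a minimal
 * CUDA error-checking helper. Wrapping the runtime calls below, e.g.
 * CUDA_CHECK(cudaMalloc(...)), would surface failures that are currently
 * ignored. It is defined here but not used by default. */
#define CUDA_CHECK(call)                                                      \
	do {                                                                  \
		cudaError_t err_ = (call);                                    \
		if (err_ != cudaSuccess) {                                    \
			fprintf(stderr, "CUDA error %s at %s:%d\n",           \
			        cudaGetErrorString(err_), __FILE__, __LINE__);\
			exit(EXIT_FAILURE);                                   \
		}                                                             \
	} while (0)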
double rtclock()
{
struct timezone Tzp;
struct timeval Tp;
int stat;
stat = gettimeofday(&Tp, &Tzp);
if (stat != 0)
printf("Error return from gettimeofday: %d", stat);
return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
}
float absVal(float a)
{
if (a < 0)
{
return (a * -1);
}
else
{
return a;
}
}
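/* Relative difference between val1 and val2, in percent. Values that are both
   near zero are treated as matching; SMALL_FLOAT_VAL guards against division
   by zero when val1 is itself (near) zero. */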
float percentDiff(double val1, double val2)
{
if ((absVal(val1) < 0.01) && (absVal(val2) < 0.01))
{
return 0.0f;
}
else
{
return 100.0f * (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL)));
}
}
#define GPU_DEVICE 0
// Define the error threshold for the results "not matching"
#define PERCENT_DIFF_ERROR_THRESHOLD 0.05
/* Problem size */
#define SIZE 40960
int NI;
int NJ;
/* Thread block dimensions */
#define DIM_THREAD_BLOCK_X 32
#define DIM_THREAD_BLOCK_Y 32
#define SHMEM_SIZE (DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y)
// #define SHMEM_SIZE (NI)
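// Note: SHMEM_SIZE is not referenced by the active kernel below; the shared
// tiles are sized directly from DIM_THREAD_BLOCK_X and DIM_THREAD_BLOCK_Y.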
/* Declared constant values for ALPHA and BETA (same as values in PolyBench 2.0) */
#define ALPHA 1.1f
#define BETA 1.1f
/* Can switch DATA_TYPE between float and double */
typedef float DATA_TYPE;
// typedef int DATA_TYPE;
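/* Host reference: C := ALPHA * A * B + BETA * C, with A a row-major NI x NJ
   matrix, B a vector of length NJ, and C a vector of length NI. */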
void gemv(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C)
{
int i,j;
for (i = 0; i < NI; i++) {
C[i] *= BETA;
for (j = 0; j < NJ; j++) {
C[i] += ALPHA * A[i*NJ + j] * B[j];
}
}
}
void init(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C, DATA_TYPE *C_ref)
{
int i, j;
for (i = 0; i < NI; i++)
for (j = 0; j < NJ; j++)
A[i*NJ + j] = ((DATA_TYPE) i*j) / NI;
for (j = 0; j < NJ; j++)
B[j] = ((DATA_TYPE) j + 1) / NJ;
for (i = 0; i < NI; i++) {
C[i] = ((DATA_TYPE)i + 2) / NI;
C_ref[i] = ((DATA_TYPE)i + 2) / NI;
}
	// Debug override: fill A with ones, replacing the PolyBench values above
	for (i = 0; i < NI; i++)
		for (j = 0; j < NJ; j++)
			A[i*NJ + j] = 1.0f;
// for (j = 0; j < NJ; j++)
// B[j] = 1.0f;
// for (i = 0; i < NI; i++) {
// C[i] = 1.0f;
// C_ref[i] = 1.0f;
// }
}
void compareResults(DATA_TYPE* C_fromGpu, DATA_TYPE* C_ref)
{
	int i, fail;
	fail = 0;
	// Compare the GPU output against the CPU reference
	for (i=0; i < NI; i++) {
		if (percentDiff(C_fromGpu[i], C_ref[i]) > PERCENT_DIFF_ERROR_THRESHOLD) {
			fail++;
			printf("%d: GPU is %f, CPU is %f.\n", i, C_fromGpu[i], C_ref[i]);
		}
}
}
// Print results
printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %d\n", PERCENT_DIFF_ERROR_THRESHOLD, fail);
}
void GPU_argv_init()
{
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, GPU_DEVICE);
printf("setting device %d with name %s\n",GPU_DEVICE,deviceProp.name);
cudaSetDevice( GPU_DEVICE );
}
// __global__ void gemv_kernel_reduce(DATA_TYPE *c_tmp, DATA_TYPE *c, int NI, int NJ)
// {
// int index = blockIdx.x * blockDim.x + threadIdx.x;
// for (int i = 0; i < DIM_THREAD_BLOCK_X; i++) {
// c[index] += c_tmp[index * DIM_THREAD_BLOCK_X + i];
// }
// }
// __global__ void gemv_kernel(DATA_TYPE *a, DATA_TYPE *b, DATA_TYPE *c, int NI, int NJ)
// {
// // Compute each thread's global row and column index
// int row = blockIdx.y * blockDim.y + threadIdx.y;
// __shared__ DATA_TYPE tmp[DIM_THREAD_BLOCK_X];
// // Sweep tile across matrix
// for (int i = threadIdx.x; i < NJ; i += blockDim.x)
// {
// tmp[threadIdx.x] = b[i];
// }
// __syncthreads();
// for (int i = threadIdx.x; i < NJ; i += blockDim.x)
// {
// c[row * DIM_THREAD_BLOCK_X + threadIdx.x] += ALPHA * a[row * NJ + i] * tmp[threadIdx.x];
// }
// __syncthreads();
// }
// __global__ void gemv_kernel(DATA_TYPE *a, DATA_TYPE *b, DATA_TYPE *c)
// {
// // Compute each thread's global row and column index
// int row = blockIdx.y * blockDim.y + threadIdx.y;
// __shared__ DATA_TYPE tmp[DIM_THREAD_BLOCK_X];
// tmp[threadIdx.x] = 0;
// __syncthreads();
// // Sweep tile across matrix
// for (int i = 0; i < NJ; i+= blockDim.x) {
// // s_b[threadIdx.x] = b[i + threadIdx.x];
// // __syncthreads();
// tmp[threadIdx.x] += ALPHA * a[row * NJ + i + threadIdx.x] * b[i + threadIdx.x];
// __syncthreads();
// }
// __syncthreads();
// DATA_TYPE tmp_c = c[row] * BETA;
// __syncthreads();
// for (int i = 0; i < blockDim.x; i++) {
// tmp_c += tmp[i];
// __syncthreads();
// }
// __syncthreads();
// c[row] = tmp_c;
// __syncthreads();
// }
__global__ void gemv_kernel(DATA_TYPE *a, DATA_TYPE *b, DATA_TYPE *c, int NI, int NJ)
{
	// Each thread computes one output row (redundantly across threadIdx.x)
	int row = blockIdx.y * blockDim.y + threadIdx.y;
	// Statically allocated shared memory tiles for A and B
	__shared__ DATA_TYPE s_a[DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y];
	__shared__ DATA_TYPE s_b[DIM_THREAD_BLOCK_X];
	// Accumulate in a temporary variable, starting from the scaled C term
	DATA_TYPE tmp = (row < NI) ? BETA * c[row] : 0;
	// Sweep a DIM_THREAD_BLOCK_X-wide tile across the row
	for (int i = 0; i < NJ; i += blockDim.x)
	{
		int j = i + threadIdx.x;
		// Zero-pad the tiles at the right edge so the inner loop stays branch-free
		s_a[threadIdx.y * blockDim.x + threadIdx.x] = (row < NI && j < NJ) ? a[row * NJ + j] : 0;
		if (threadIdx.y == 0)
			s_b[threadIdx.x] = (j < NJ) ? b[j] : 0;
		// Wait for both tiles to be loaded in before doing computation
		__syncthreads();
		// Partial dot product of this row tile with the vector tile
		// (note s_b is indexed by k, not threadIdx.x)
		for (int k = 0; k < blockDim.x; k++)
		{
			tmp += ALPHA * s_a[threadIdx.y * blockDim.x + k] * s_b[k];
		}
		// Wait for all threads to finish using current tiles before loading in new ones
		__syncthreads();
	}
	if (row < NI)
		c[row] = tmp;
}
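/* A minimal untiled baseline (an addition for cross-checking, not part of the
 * original benchmark): one thread per output row, no shared memory. It is not
 * launched by default; a launch such as
 *   gemv_kernel_naive<<<(NI + 255) / 256, 256>>>(A_gpu, B_gpu, C_gpu, NI, NJ);
 * should reproduce the output of gemv_kernel above. */
__global__ void gemv_kernel_naive(DATA_TYPE *a, DATA_TYPE *b, DATA_TYPE *c, int NI, int NJ)
{
	int row = blockIdx.x * blockDim.x + threadIdx.x;
	if (row < NI)
	{
		// Accumulate the full dot product for this row in a register
		DATA_TYPE tmp = BETA * c[row];
		for (int j = 0; j < NJ; j++)
		{
			tmp += ALPHA * a[row * NJ + j] * b[j];
		}
		c[row] = tmp;
	}
}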
void gemvCuda(DATA_TYPE *A, DATA_TYPE *B, DATA_TYPE *C, DATA_TYPE *A_gpu, DATA_TYPE *B_gpu, DATA_TYPE *C_gpu, DATA_TYPE *C_gpu_tmp)
{
double t_start, t_end;
	dim3 block(DIM_THREAD_BLOCK_X, DIM_THREAD_BLOCK_Y);
	// Each block sweeps the full row width NJ itself, so the grid only needs
	// to cover the NI rows; extra blocks along x would redundantly recompute
	// (and race on) the same entries of C.
	dim3 grid(1, (size_t)(ceil(((float)NI) / ((float)block.y))));
	dim3 block_reduce(DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y);
	dim3 grid_reduce((size_t)(ceil(((float)NI) / ((float)(DIM_THREAD_BLOCK_X * DIM_THREAD_BLOCK_Y)))));
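	// Note: the timed region below includes the host<->device transfers as
	// well as the kernel itself.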
t_start = rtclock();
cudaMemcpy(A_gpu, A, sizeof(DATA_TYPE) * NI * NJ, cudaMemcpyHostToDevice);
cudaMemcpy(B_gpu, B, sizeof(DATA_TYPE) * NJ, cudaMemcpyHostToDevice);
cudaMemcpy(C_gpu, C, sizeof(DATA_TYPE) * NI, cudaMemcpyHostToDevice);
gemv_kernel<<<grid, block>>>(A_gpu, B_gpu, C_gpu, NI, NJ);
// gemv_kernel_reduce<<<grid_reduce, block_reduce>>>(C_gpu_tmp, C_gpu, NI, NJ);
cudaDeviceSynchronize();
cudaMemcpy(C, C_gpu, sizeof(DATA_TYPE) * NI, cudaMemcpyDeviceToHost);
t_end = rtclock();
fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);
}
int main(int argc, char *argv[])
{
if (argc >= 3) {
NI = atoi(argv[1]);
NJ = atoi(argv[2]);
} else {
NI = SIZE;
NJ = SIZE;
}
double t_start, t_end;
DATA_TYPE* A;
DATA_TYPE* B;
DATA_TYPE* C;
DATA_TYPE *C_ref;
DATA_TYPE *A_gpu;
DATA_TYPE *B_gpu;
DATA_TYPE *C_gpu;
DATA_TYPE *C_gpu_tmp;
A = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE));
B = (DATA_TYPE*)malloc(NJ*sizeof(DATA_TYPE));
C = (DATA_TYPE*)malloc(NI*sizeof(DATA_TYPE));
C_ref = (DATA_TYPE *)malloc(NI*sizeof(DATA_TYPE));
	//cudaMallocManaged(&A_gpu, sizeof(DATA_TYPE) * NI * NJ);
	//cudaMallocManaged(&B_gpu, sizeof(DATA_TYPE) * NJ);
	//cudaMallocManaged(&C_gpu, sizeof(DATA_TYPE) * NI);
cudaMalloc(&A_gpu, sizeof(DATA_TYPE) * NI * NJ);
cudaMalloc(&B_gpu, sizeof(DATA_TYPE) * NJ);
cudaMalloc(&C_gpu, sizeof(DATA_TYPE) * NI);
cudaMalloc(&C_gpu_tmp, sizeof(DATA_TYPE) * NI * DIM_THREAD_BLOCK_X);
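	// C_gpu_tmp is only consumed by the commented-out gemv_kernel_reduce
	// path; the active gemv_kernel does not use it.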
init(A, B, C, C_ref);
GPU_argv_init();
gemvCuda(A, B, C, A_gpu, B_gpu, C_gpu, C_gpu_tmp);
t_start = rtclock();
gemv(A, B, C_ref);
t_end = rtclock();
fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);
compareResults(C, C_ref);
free(A);
free(B);
free(C);
free(C_ref);
cudaFree(A_gpu);
cudaFree(B_gpu);
cudaFree(C_gpu);
cudaFree(C_gpu_tmp);
return 0;
}