/**
 * gemm.cu: This file is part of the PolyBench/GPU 1.0 test suite.
 *
 * Contact: Scott Grauer-Gray
 *          Louis-Noel Pouchet
 * Web address: http://www.cse.ohio-state.edu/~pouchet/software/polybench/GPU
 *
 * Random-access streaming benchmark: the GPU kernel pulls tiles of A through
 * shared memory with the nvcuda::experimental async-copy pipeline (gathering
 * each element from a curand-randomized offset inside the tile), applies the
 * LCG-style recurrence A[i] = LCG_A * A[i] + LCG_B `iterations` times, and
 * writes the tiles back.
 */
/* NOTE(review): the header names after each `#include` were lost in a prior
 * edit (the directives were empty).  The list below is reconstructed from the
 * identifiers this file uses (printf/malloc/gettimeofday/atoll, curand,
 * cooperative_groups, nvcuda::experimental pipeline) — verify against the
 * original sources. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/time.h>
#include <cuda.h>
#include <curand_kernel.h>
#include "../../../common/cupti_add.h"
#include "../../../common/cpu_timestamps.h"
#include <cooperative_groups.h>
#include <cuda_pipeline.h> /* nvcuda::experimental::pipeline / memcpy_async */

using namespace nvcuda::experimental;

#define PREFETCH_COUNT 2
#define SMALL_FLOAT_VAL 0.00000001f

/* Wall-clock time in seconds (gettimeofday resolution). */
double rtclock()
{
    struct timezone Tzp;
    struct timeval Tp;
    int stat; /* FIX: gettimeofday returns int; was uint64_t, which made the
               * %d conversion below undefined behavior. */
    stat = gettimeofday(&Tp, &Tzp);
    if (stat != 0)
        printf("Error return from gettimeofday: %d", stat);
    return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
}

/* Absolute value of a float. */
float absVal(float a)
{
    if (a < 0)
    {
        return (a * -1);
    }
    else
    {
        return a;
    }
}

/* Relative difference of val1 vs. val2 in percent; values that are both
 * below 0.01 in magnitude are treated as equal (returns 0). */
float percentDiff(double val1, double val2)
{
    if ((absVal(val1) < 0.01) && (absVal(val2) < 0.01))
    {
        return 0.0f;
    }
    else
    {
        return 100.0f *
               (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL)));
    }
}

//define the error threshold for the results "not matching"
#define PERCENT_DIFF_ERROR_THRESHOLD 0.005

/* Problem size */
#define SIZE 4096000
#define ITER 100
uint64_t NI;

/* Thread block dimensions */
#define DIM_THREAD_BLOCK 256
#define BATCH_SIZE 16

#define LCG_A 1.1f
#define LCG_B 1.1f

/* Can switch DATA_TYPE between float and double */
typedef float DATA_TYPE;
// typedef uint64_t DATA_TYPE;

/* CPU reference: apply the LCG recurrence `iterations` times to every
 * element of A (sequential counterpart of the GPU kernel's compute phase). */
void saxpy(DATA_TYPE *A, uint64_t iterations)
{
    for (uint64_t i = 0; i < NI; i++)
    {
        for (uint64_t iter = 0; iter < iterations; iter++)
        {
            A[i] = LCG_A * A[i] + LCG_B;
        }
    }
}

/* Initialize both buffers with the same deterministic ramp i/NI. */
void init(DATA_TYPE *A, DATA_TYPE *A_ref)
{
    for (uint64_t i = 0; i < NI; i++)
    {
        A[i] = ((DATA_TYPE)i) / NI;
        A_ref[i] = ((DATA_TYPE)i) / NI;
    }
}

/* Count (and report) elements whose CPU/GPU relative difference exceeds
 * PERCENT_DIFF_ERROR_THRESHOLD. */
void compareResults(DATA_TYPE *A, DATA_TYPE *A_outputFromGpu)
{
    uint64_t fail = 0;
    // Compare C1 and C2
    for (uint64_t i = 0; i < NI; i++)
    {
        // printf("%lld, GPU is %f, CPU is %f.\n", i, A[i], A_outputFromGpu[i]);
        if (percentDiff(A[i], A_outputFromGpu[i]) > PERCENT_DIFF_ERROR_THRESHOLD)
        {
            fail++;
            /* FIX: %lld with a uint64_t argument was a format mismatch; cast
             * explicitly for a portable conversion. */
            printf("%llu, GPU is %f, CPU is %f.\n", (unsigned long long)i,
                   A[i], A_outputFromGpu[i]);
        }
    }
    // Print results
    /* FIX: %d with a uint64_t `fail` was a format mismatch. */
    printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %llu\n",
           PERCENT_DIFF_ERROR_THRESHOLD, (unsigned long long)fail);
}

/*
 * vector_rand_kernel: each block owns `block_size` contiguous elements of `a`
 * and walks them in tiles of mem_size = DIM_THREAD_BLOCK * BATCH_SIZE
 * elements, double-buffered through shared memory (PREFETCH_COUNT buffers;
 * 2 * 16 KiB for float).  Each fetched element is gathered from a
 * curand-randomized offset within the source tile, updated `iterations`
 * times with the LCG recurrence, then the tile is written back.
 *
 * Launch: grid of NI / block_size blocks, DIM_THREAD_BLOCK threads each.
 * Preconditions (from the index math): NI and block_size must be multiples
 * of mem_size.
 */
__global__ void vector_rand_kernel(DATA_TYPE *a, uint64_t NI,
                                   uint64_t iterations, uint64_t block_size,
                                   size_t seed)
{
    cooperative_groups::thread_block block =
        cooperative_groups::this_thread_block();
    pipeline pipe;
    const uint64_t mem_size = DIM_THREAD_BLOCK * BATCH_SIZE;
    __shared__ DATA_TYPE tmp[mem_size * PREFETCH_COUNT];
    curandState_t randState;
    size_t tx = blockIdx.x * blockDim.x + threadIdx.x;
    curand_init(seed, tx, 0, &randState);
    size_t idx = 0;
    uint64_t total_tiles = NI / mem_size;
    uint64_t base_tiles = total_tiles / gridDim.x;
    uint64_t tiles_this_block = block_size / mem_size;
    uint64_t fetch = base_tiles * blockIdx.x;
    uint64_t end_tile = fetch + tiles_this_block;
    for (uint64_t compute = fetch; compute < end_tile; compute++)
    {
        /* Keep up to PREFETCH_COUNT tiles in flight ahead of `compute`. */
        for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
        {
            for (uint64_t i = threadIdx.x; i < mem_size; i += blockDim.x)
            {
                /* Build a 64-bit random value from two 32-bit curand draws;
                 * the gather offset is idx % mem_size within the tile. */
                idx = curand(&randState);
                idx <<= 32;
                idx |= curand(&randState);
                memcpy_async(tmp[(fetch % PREFETCH_COUNT) * mem_size + i],
                             a[fetch * mem_size + idx % mem_size], pipe);
            }
            pipe.commit();
        }
        if (fetch == end_tile)
        {
            /* Tail: pad with empty commits so wait_prior's depth still
             * matches once no new tiles remain to fetch. */
            for (uint64_t i = 0; i < PREFETCH_COUNT - 1; ++i)
            {
                pipe.commit();
            }
            ++fetch;
        }
        /* FIX: the experimental pipeline API takes the wait depth as a
         * template argument; `pipe.wait_prior();` does not compile.  Wait
         * for all but the most recent commit (double buffering). */
        pipe.wait_prior<PREFETCH_COUNT - 1>();
        block.sync();
        for (uint64_t i = threadIdx.x; i < mem_size; i += blockDim.x)
        {
            for (uint64_t iter = 0; iter < iterations; iter++)
            {
                tmp[(compute % PREFETCH_COUNT) * mem_size + i] =
                    LCG_A * tmp[(compute % PREFETCH_COUNT) * mem_size + i] +
                    LCG_B;
            }
        }
        block.sync();
        for (uint64_t i = threadIdx.x; i < mem_size; i += blockDim.x)
        {
            /* NOTE(review): the shared-memory source index uses
             * `idx % mem_size` — the last random value from the fetch loop,
             * constant for this thread across the whole loop — rather than
             * `i`.  Preserved as-is since randomized access appears to be the
             * point of this benchmark, but confirm it is not a typo for
             * `tmp[... + i]`. */
            a[compute * mem_size + i] =
                tmp[(compute % PREFETCH_COUNT) * mem_size + idx % mem_size];
        }
        block.sync();
    }
}

/* Host wrapper: copy A to the device, launch vector_rand_kernel, copy the
 * result back (both copies synchronous).  block_size is clamped up to at
 * least one thread block's worth of elements. */
void saxpyCuda(DATA_TYPE *A, DATA_TYPE *A_gpu, uint64_t iterations,
               uint64_t block_size)
{
    if (block_size <= DIM_THREAD_BLOCK)
        block_size = DIM_THREAD_BLOCK;
    dim3 block(DIM_THREAD_BLOCK);
    dim3 grid(NI / block_size);
    //t_start = rtclock();
    cudaMemcpy(A_gpu, A, sizeof(DATA_TYPE) * NI, cudaMemcpyHostToDevice);
    /* FIX: the launch configuration had been mangled to "<<>>"; restore the
     * grid/block computed above.  The constant 832945 seeds curand. */
    vector_rand_kernel<<<grid, block>>>(A_gpu, NI, iterations, block_size,
                                        832945);
    /* Launch-configuration errors only surface via cudaGetLastError(). */
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
    cudaDeviceSynchronize();
    cudaMemcpy(A, A_gpu, sizeof(DATA_TYPE) * NI, cudaMemcpyDeviceToHost);
    //t_end = rtclock();
    //fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);
}

/* Usage: prog [NI iterations block_size]; defaults SIZE/ITER/4096.
 * Caps the grid at 64 blocks by growing block_size to match. */
int main(int argc, char *argv[])
{
    uint64_t start_tsc = rdtsc();
    uint64_t start_tsp = rdtsp();
    printf("start_tsc %lu start_tsp %lu\n", start_tsc, start_tsp);
    uint64_t iterations = ITER;
    uint64_t block_size = DIM_THREAD_BLOCK * BATCH_SIZE;
    if (argc >= 4)
    {
        NI = atoll(argv[1]);
        iterations = atoi(argv[2]);
        block_size = atoi(argv[3]);
    }
    else
    {
        NI = SIZE;
        iterations = ITER;
        block_size = DIM_THREAD_BLOCK * BATCH_SIZE;
    }
    /* Cap the grid at 64 blocks; each block then covers more elements. */
    int nblocks = NI / block_size;
    if (nblocks > 64)
    {
        nblocks = 64;
        block_size = NI / nblocks;
    }
    DATA_TYPE *A;
    DATA_TYPE *A_ref;
    DATA_TYPE *A_gpu;
    A = (DATA_TYPE *)malloc(NI * sizeof(DATA_TYPE));
    A_ref = (DATA_TYPE *)malloc(NI * sizeof(DATA_TYPE));
    //cudaMallocManaged(&A_gpu, sizeof(DATA_TYPE) * NI * NK);
    //cudaMallocManaged(&B_gpu, sizeof(DATA_TYPE) * NK * NJ);
    //cudaMallocManaged(&C_gpu, sizeof(DATA_TYPE) * NI * NJ);
    init(A, A_ref);
    GPU_argv_init();
    initTrace();
    startCPU();
    cudaMalloc(&A_gpu, sizeof(DATA_TYPE) * NI);
    saxpyCuda(A, A_gpu, iterations, block_size);
    /* CPU reference + comparison kept disabled, as in the original: */
    // double t_start = rtclock();
    // saxpy(A_ref, iterations);
    // double t_end = rtclock();
    // fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);
    // compareResults(A, A_ref);
    cudaFree(A_gpu);
    endCPU();
    finiTrace();
    free(A);
    free(A_ref);
    return 0;
}