| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #include <unistd.h> |
| #include <stdio.h> |
| #include <time.h> |
| #include <sys/time.h> |
| #include <stdlib.h> |
| #include <stdarg.h> |
| #include <string.h> |
| #include <cuda.h> |
| #include "../../../common/cupti_add.h" |
| #include "../../../common/cpu_timestamps.h" |
|
|
| #include <cooperative_groups.h> |
| #include <cooperative_groups/memcpy_async.h> |
|
|
| using namespace nvcuda::experimental; |
|
|
| #define PREFETCH_COUNT 2 |
|
|
| #define SMALL_FLOAT_VAL 0.00000001f |
|
|
/*
 * Returns the current wall-clock time in seconds (microsecond resolution),
 * suitable for coarse host-side timing.
 */
double rtclock()
{
    struct timezone Tzp;
    struct timeval Tp;
    /* gettimeofday returns int; the original uint64_t made the %d printf
     * below undefined behavior and could never be negative. */
    int stat;
    stat = gettimeofday(&Tp, &Tzp);
    if (stat != 0)
        printf("Error return from gettimeofday: %d", stat);
    return (Tp.tv_sec + Tp.tv_usec * 1.0e-6);
}
|
|
/* Magnitude of a single float: negative inputs are sign-flipped,
 * non-negative inputs pass through unchanged. */
float absVal(float a)
{
    return (a < 0) ? (a * -1) : a;
}
|
|
/*
 * Percentage difference of val2 relative to val1.
 * Pairs where both magnitudes are below 0.01 are treated as matching (0%);
 * SMALL_FLOAT_VAL in the denominator guards against division by zero.
 */
float percentDiff(double val1, double val2)
{
    /* Guard clause: anything this close to zero on both sides counts as equal. */
    if (absVal(val1) >= 0.01 || absVal(val2) >= 0.01)
    {
        return 100.0f * (absVal(absVal(val1 - val2) / absVal(val1 + SMALL_FLOAT_VAL)));
    }
    return 0.0f;
}
|
|
| |
| #define PERCENT_DIFF_ERROR_THRESHOLD 0.005 |
|
|
| |
| #define SIZE 1073741824 |
| #define ITER 100 |
| uint64_t NI; |
|
|
| |
| #ifndef DIM_THREAD_BLOCK |
| #define DIM_THREAD_BLOCK 256 |
| #endif |
|
|
| #ifndef BATCH_SIZE |
| #define BATCH_SIZE 16 |
| #endif |
|
|
| #ifndef NBLOCKS |
| #define NBLOCKS 64 |
| #endif |
|
|
| #define LCG_A 1.1f |
| #define LCG_B 1.1f |
|
|
| |
| typedef float DATA_TYPE; |
| |
|
|
/*
 * CPU reference: applies `iterations` rounds of the LCG-style update
 * v = LCG_A * v + LCG_B to each of the NI elements of A, in place.
 * NI is the file-level global element count.
 */
void saxpy(DATA_TYPE *A, uint64_t iterations)
{
    for (uint64_t idx = 0; idx < NI; ++idx)
    {
        /* Work on a scalar copy; same float-op sequence as updating A[idx]
         * directly, written back once at the end. */
        DATA_TYPE v = A[idx];
        for (uint64_t round = 0; round < iterations; ++round)
        {
            v = LCG_A * v + LCG_B;
        }
        A[idx] = v;
    }
}
|
|
/* Fills both arrays with the same ramp i/NI in [0, 1), so A_ref can later
 * serve as the CPU reference copy of A. NI is the file-level global size. */
void init(DATA_TYPE *A, DATA_TYPE *A_ref)
{
    for (uint64_t idx = 0; idx < NI; ++idx)
    {
        DATA_TYPE value = ((DATA_TYPE) idx) / NI;
        A[idx] = value;
        A_ref[idx] = value;
    }
}
|
|
|
|
/*
 * Element-wise comparison of the two arrays (length NI, the file global):
 * counts and reports entries whose percent difference exceeds
 * PERCENT_DIFF_ERROR_THRESHOLD.
 *
 * NOTE(review): the per-mismatch message labels the first argument "GPU"
 * and the second "CPU" — confirm against the call site, since the parameter
 * names suggest the opposite order.
 */
void compareResults(DATA_TYPE* A, DATA_TYPE* A_outputFromGpu)
{
    uint64_t fail = 0;

    for (uint64_t i = 0; i < NI; i++) {
        if (percentDiff(A[i], A_outputFromGpu[i]) > PERCENT_DIFF_ERROR_THRESHOLD) {
            fail++;
            /* %lu matches uint64_t here (as in main's printf); the original
             * %lld was a signed/width mismatch. */
            printf("%lu, GPU is %f, CPU is %f.\n", i, A[i], A_outputFromGpu[i]);
        }
    }

    /* fail is uint64_t — %d was undefined behavior. */
    printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %lu\n", PERCENT_DIFF_ERROR_THRESHOLD, fail);
}
|
|
/*
 * Tiled in-place update of a[]: each tile of mem_size elements is staged in
 * dynamic shared memory, updated `iterations` times with
 * tmp[i] = LCG_A*tmp[i] + LCG_B, then written back.
 *
 * Launch contract (from the host wrapper saxpyCuda): blockDim.x ==
 * DIM_THREAD_BLOCK, dynamic shared memory of mem_size * sizeof(DATA_TYPE)
 * bytes, gridDim.x == NI / block_size.
 *
 * NOTE(review): assumes NI is a multiple of mem_size and block_size is a
 * multiple of mem_size — there is no tail handling; confirm with callers.
 */
__global__ void vector_seq_kernel(DATA_TYPE *a, uint64_t NI, uint64_t iterations, uint64_t block_size)
{
    /* Elements held in shared memory per tile. */
    const uint64_t mem_size = DIM_THREAD_BLOCK * BATCH_SIZE;

    /* Dynamic shared-memory staging buffer, sized by the launch. */
    extern __shared__ DATA_TYPE tmp[];

    uint64_t total_tiles = NI / mem_size;
    /* Even share of tiles per block, used only to position this block's start.
     * NOTE(review): base_tiles and tiles_this_block are computed from
     * different inputs (gridDim.x vs block_size); they coincide only when
     * block_size == NI / gridDim.x — verify the host keeps them consistent,
     * otherwise blocks overlap or leave gaps. */
    uint64_t base_tiles = total_tiles / gridDim.x;

    /* Number of tiles this block actually processes. */
    uint64_t tiles_this_block = block_size / mem_size;

    uint64_t tile = base_tiles * blockIdx.x;
    uint64_t end_tile = tile + tiles_this_block;

    for (; tile < end_tile; tile += 1)
    {
        /* Stage the tile: consecutive threads read consecutive elements
         * (stride blockDim.x), so global loads are coalesced. */
        for (uint64_t i = threadIdx.x; i < mem_size; i += blockDim.x)
        {
            tmp[i] = a[tile * mem_size + i];
        }

        /* Barrier between the shared-memory writes above and any use below. */
        __syncthreads();

        /* Each thread updates only the indices it loaded itself. */
        for (uint64_t i = threadIdx.x; i < mem_size; i += blockDim.x)
        {
            for (uint64_t iter = 0; iter < iterations; iter++)
            {
                tmp[i] = LCG_A * tmp[i] + LCG_B;
            }
        }

        /* No barrier needed before the next tile's loads: every phase uses
         * the same threadIdx.x-strided index set per thread, so no thread
         * ever reads a tmp[] slot written by another thread. */
        for (uint64_t i = threadIdx.x; i < mem_size; i += blockDim.x)
        {
            a[tile * mem_size + i] = tmp[i];
        }
    }
}
|
|
/*
 * Host wrapper: copies A (NI elements, file global) to the device buffer
 * A_gpu, launches vector_seq_kernel, and copies the result back into A.
 * block_size is the number of elements each thread block processes.
 *
 * Fixes vs. original: removed unused t_start/t_end locals and added error
 * checks after the launch — kernel failures are otherwise silent.
 */
void saxpyCuda(DATA_TYPE *A, DATA_TYPE *A_gpu, uint64_t iterations, uint64_t block_size)
{
    cudaError_t err;

    /* Each block must cover at least one thread block's worth of elements. */
    if (block_size <= DIM_THREAD_BLOCK)
        block_size = DIM_THREAD_BLOCK;

    dim3 block(DIM_THREAD_BLOCK);
    dim3 grid(NI / block_size);

    /* Dynamic shared memory per block; opt in explicitly in case
     * DIM_THREAD_BLOCK * BATCH_SIZE exceeds the default 48 KB limit. */
    int MaxBytesofSharedMemory = DIM_THREAD_BLOCK * BATCH_SIZE * sizeof(DATA_TYPE);
    cudaFuncSetAttribute(vector_seq_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, MaxBytesofSharedMemory);

    cudaMemcpy(A_gpu, A, NI * sizeof(DATA_TYPE), cudaMemcpyHostToDevice);
    vector_seq_kernel<<<grid, block, MaxBytesofSharedMemory>>>(A_gpu, NI, iterations, block_size);

    /* Launch-configuration errors surface here... */
    err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "vector_seq_kernel launch failed: %s\n", cudaGetErrorString(err));

    /* ...and asynchronous execution errors surface at the sync. */
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        fprintf(stderr, "vector_seq_kernel execution failed: %s\n", cudaGetErrorString(err));

    cudaMemcpy(A, A_gpu, NI * sizeof(DATA_TYPE), cudaMemcpyDeviceToHost);
}
|
|
/*
 * Driver: parses size/iterations/block-size from argv (falling back to the
 * compile-time defaults), initializes host data, and runs the GPU pass
 * bracketed by CUPTI tracing and CPU timestamp markers.
 *
 * Usage: prog [NI iterations block_size]
 *
 * Fixes vs. original: malloc results are now checked (NI defaults to 1 GiB
 * elements — 4 GiB per array — so allocation failure is realistic), and the
 * unused t_start/t_end locals were removed.
 */
int main(int argc, char *argv[])
{
    /* Raw timestamp counters (cpu_timestamps.h) for trace correlation. */
    uint64_t start_tsc = rdtsc();
    uint64_t start_tsp = rdtsp();
    printf("start_tsc %lu start_tsp %lu\n", start_tsc, start_tsp);

    uint64_t iterations = ITER;
    uint64_t block_size = DIM_THREAD_BLOCK * BATCH_SIZE;
    if (argc >= 4) {
        NI = atoll(argv[1]);
        iterations = atoi(argv[2]);
        block_size = atoi(argv[3]);
    }
    else {
        NI = SIZE;
        iterations = ITER;
        block_size = DIM_THREAD_BLOCK * BATCH_SIZE;
    }

    /* NOTE(review): block_size is unconditionally recomputed here, so the
     * argv[3] value above is never actually used — confirm this is intended
     * (behavior preserved from the original). */
    int nblocks = NBLOCKS;
    block_size = NI / nblocks;

    DATA_TYPE *A;
    DATA_TYPE *A_ref;
    DATA_TYPE *A_gpu;

    A = (DATA_TYPE *)malloc(NI * sizeof(DATA_TYPE));
    A_ref = (DATA_TYPE *)malloc(NI * sizeof(DATA_TYPE));
    if (A == NULL || A_ref == NULL) {
        /* init() would dereference NULL otherwise; free(NULL) is a no-op. */
        fprintf(stderr, "host allocation of %lu bytes failed\n", NI * sizeof(DATA_TYPE));
        free(A);
        free(A_ref);
        return 1;
    }

    init(A, A_ref);

    GPU_argv_init();

    /* CUPTI tracing + CPU timing window around the GPU work only. */
    initTrace();
    startCPU();

    cudaMalloc(&A_gpu, sizeof(DATA_TYPE) * NI);
    saxpyCuda(A, A_gpu, iterations, block_size);
    cudaFree(A_gpu);

    endCPU();
    finiTrace();

    free(A);
    free(A_ref);
    return 0;
}
|
|