| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| #include "cuda.h" |
| #include <cstdio> |
| #include <sys/time.h> |
| #include <time.h> |
| |
| #define BLOCK_DIM 16 |
|
|
| #include "../../../common/cupti_add.h" |
| #include "../../../common/cpu_timestamps.h" |
|
|
| #include <cooperative_groups.h> |
| #include <cooperative_groups/memcpy_async.h> |
|
|
| using namespace nvcuda::experimental; |
|
|
| #define PREFETCH_COUNT 2 |
|
|
| #define DIM_THREAD_BLOCK 256 |
|
|
| #ifndef SIZE |
| #define SIZE 4096 |
| #endif |
|
|
// Element-wise addition c = a + b, one element per block.
// Each block handles exactly one index (blockIdx.x); threadIdx is unused,
// so the launch is expected to use as many blocks as there are elements.
__global__ void add(float *a, float *b, float *c)
{
    const int elem = blockIdx.x;
    const float lhs = a[elem];
    const float rhs = b[elem];
    c[elem] = lhs + rhs;
}
|
|
// Divides every element of row `index` (to the right of the diagonal) of
// the size x size row-major matrix `a` by the diagonal (pivot) element
// a[index][index]. The pivot element itself is left untouched. Each
// launched thread performs the whole row serially.
__global__ void scale(float *a, int size, int index)
{
    // Flat position of the pivot element a[index][index].
    const int pivot_pos = index * size + index;
    // One past the last element of row `index`.
    const int row_end = index * size + size;

    // The pivot slot is never written inside the loop (it starts at
    // pivot_pos + 1), so caching it in a register is safe.
    const float pivot = a[pivot_pos];

    for (int col = pivot_pos + 1; col < row_end; ++col)
        a[col] /= pivot;
}
|
|
// One Gaussian-elimination step of an in-place LU decomposition.
//
// For every matrix row strictly below pivot row `index`, subtracts
// a[row][index] times the pivot row from that row for all columns right
// of `index`; the multiplier a[row][index] is left in place as the L
// factor. The pivot row is first staged into shared memory.
//
// Launch layout: 1-D grid of 1-D blocks; the thread (blockIdx.x,
// threadIdx.x) owns matrix row blockIdx.x * b_size + threadIdx.x, so
// `b_size` must equal blockDim.x and grid * block must cover `size` rows.
// Requires size <= SIZE (capacity of the shared staging buffer).
//
// Fix: the staging buffer was declared `extern __shared__ float
// pivot[SIZE];`. `extern __shared__` arrays alias the *dynamic* shared
// memory pool, whose size comes from the launch's third <<<>>> argument —
// the declared [SIZE] is ignored, and a launch that requests no dynamic
// bytes makes every access out of bounds. A statically sized __shared__
// array is what the explicit size intended. The pivot-row copy is also
// performed cooperatively by the whole block instead of serially by
// thread 0, and the unused pivot_start/pivot_end locals were removed.
__global__ void reduce(float *a, int size, int index, int b_size)
{
    __shared__ float pivot[SIZE];  // staging copy of pivot row `index`

    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int block_size = b_size;

    // Cooperative, coalesced copy of the pivot row into shared memory.
    for (int i = index + tid; i < size; i += blockDim.x)
        pivot[i] = a[(index * size) + i];

    // Every thread must see the staged pivot row before eliminating.
    __syncthreads();

    int pivot_row = (index * size);                    // flat start of pivot row
    int my_row = (((block_size * bid) + tid) * size);  // flat start of this thread's row
    int start = my_row + index;                        // multiplier slot a[row][index]
    int end = my_row + size;

    // Only rows strictly below the pivot row are eliminated; the
    // multiplier itself (at `start`) is intentionally left in place.
    if (my_row > pivot_row)
    {
        for (int i = start + 1; i < end; i++)
        {
            a[i] = a[i] - (a[start] * pivot[(i - my_row)]);
        }
    }
}
|
|
// Fills the N x N matrix `a` with pseudo-random integer values in [1, 10]
// (stored as floats). The fixed seed makes the data identical on every
// run, so results are reproducible.
void initCPU(float *a, int N)
{
    const int total = N * N;

    srand((unsigned)2);
    for (int idx = 0; idx < total; ++idx)
        a[idx] = (float)((rand() % 10) + 1);
}
|
|
// Copies the N x N row-major matrix `a` element by element into `a_dev`
// (equivalent to a memcpy of N * N floats).
void initGPU(float *a_dev, float *a, int N)
{
    const int count = N * N;

    for (int k = 0; k < count; ++k)
        a_dev[k] = a[k];
}
|
|
// In-place LU decomposition of the N x N row-major matrix `a`, using a
// software pipeline (nvcuda::experimental): while row `compute` is being
// eliminated, up to PREFETCH_COUNT pivot-row copies are staged into shared
// memory with memcpy_async.
//
// Launch layout: 1-D grid of 1-D blocks; the thread (blockIdx.x,
// threadIdx.x) owns matrix row blockIdx.x * blockDim.x + threadIdx.x.
// Requires N <= SIZE (shared staging capacity is SIZE * PREFETCH_COUNT
// floats per block).
//
// NOTE(review): the pivot-row scaling below is executed only by thread 0
// of block 0, but block.sync() is a block-local barrier — other blocks may
// stage/read row `compute` before block 0 finishes scaling it. This looks
// like a grid-wide race unless the kernel is effectively single-block or
// externally ordered — confirm.
__global__ void lud_kernel(float *a, int N)
{
    cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
    pipeline pipe;  // per-thread async-copy pipeline (nvcuda::experimental)
    // Staging area: one SIZE-float slot per in-flight pipeline stage.
    __shared__ float pivot[SIZE * PREFETCH_COUNT];

    int fetch = 0;
    int end_tile = fetch + N;  // one pipeline stage per pivot row

    for (int compute = fetch; compute < end_tile; compute++)
    {
        // Keep up to PREFETCH_COUNT stages in flight ahead of `compute`.
        // NOTE(review): on the first outer iteration this inner loop runs
        // PREFETCH_COUNT times, so the scale step below re-divides row 0
        // (a[start] itself is not modified by the loop) and both staging
        // slots are filled from the same row `compute` rather than row
        // `fetch` — verify this is the intended pipelining.
        for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
        {
            int tid = threadIdx.x;
            int bid = blockIdx.x;
            int block_size = blockDim.x;

            // Scale pivot row `compute` by its diagonal element; done
            // serially by a single thread of a single block.
            if (tid == 0 && bid == 0)
            {
                int start = (compute * N + compute);
                int end = (compute * N + N);

                for (int i = start + 1; i < end; i++)
                    a[i] = (a[i] / a[start]);
            }
            block.sync();  // block-local barrier only; see race note above

            // Stage row `compute` into the slot keyed by `fetch`.
            if (tid == 0)
            {
                for (int i = compute; i < N; i++)
                    memcpy_async(pivot[(fetch % PREFETCH_COUNT) * N + i], a[(compute * N) + i], pipe);
            }

            pipe.commit();  // close this pipeline stage
        }
        // After the final real fetch, push empty stages so wait_prior's
        // fixed-depth accounting still drains the pipeline; bump `fetch`
        // so this padding runs only once.
        if (fetch == end_tile)
        {
            for (int i = 0; i < PREFETCH_COUNT - 1; ++i)
            {
                pipe.commit();
            }
            ++fetch;
        }
        // Block until at most PREFETCH_COUNT-1 stages remain outstanding,
        // i.e. the stage staged for `compute` has landed in shared memory.
        pipe.wait_prior<PREFETCH_COUNT - 1>();
        block.sync();

        int tid = threadIdx.x;
        int bid = blockIdx.x;
        int block_size = blockDim.x;

        int pivot_row = (compute * N);                   // flat start of pivot row
        int my_row = (((block_size * bid) + tid) * N);   // flat start of this thread's row
        int start = my_row + compute;                    // multiplier slot a[row][compute]
        int end = my_row + N;

        // Eliminate: rows below the pivot subtract multiplier * staged
        // pivot row; the multiplier stays in place as the L factor.
        if (my_row > pivot_row)
        {
            for (int i = start + 1; i < end; i++)
            {
                a[i] = a[i] - (a[start] * pivot[(compute % PREFETCH_COUNT) * N + (i - my_row)]);
            }
        }
        block.sync();
    }
}
|
|
// Driver: builds a deterministic random N x N matrix, runs the pipelined
// LU kernel on a managed buffer, then verifies in place that the product
// of the extracted L and U factors reproduces the original input.
//
// Fixes over the previous version:
//  - the reconstruction a_ref = L*U was computed but never compared
//    against the input (`error` and `flag` were dead variables); the
//    comparison and a PASS/FAIL summary are now emitted;
//  - CUDA allocation / launch errors are now checked and reported;
//  - the grid size uses ceil-division so all rows are covered even when
//    N is not a multiple of DIM_THREAD_BLOCK (identical to the old value
//    for the default N == SIZE == 4096);
//  - all host allocations and the stream were leaked; they are released.
int main(int argc, char *argv[])
{
    uint64_t start_tsc = rdtsc();
    uint64_t start_tsp = rdtsp();
    printf("start_tsc %lu start_tsp %lu\n", start_tsc, start_tsp);

    float *a;           // host copy of the input matrix
    float *a_gpu;       // managed buffer the kernel factorizes in place
    float *c;           // host copy of the LU result
    float error = 0.0f; // max |(L*U)[i][j] - a[i][j]| seen during verification
    int N;
    int flag = 0;       // count of entries outside tolerance

    float **result;     // LU result as row-pointer 2-D array
    float **a_ref;      // reconstructed L*U product
    int blocks;

    int i;
    int j;
    int k;
    float l1;
    float u1;

    N = SIZE;

    a = (float *)malloc(sizeof(float) * N * N);
    c = (float *)malloc(sizeof(float) * N * N);
    result = (float **)malloc(sizeof(float *) * N);
    a_ref = (float **)malloc(sizeof(float *) * N);
    if (a == NULL || c == NULL || result == NULL || a_ref == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    for (i = 0; i < N; i++)
    {
        result[i] = (float *)malloc(sizeof(float) * N);
        a_ref[i] = (float *)malloc(sizeof(float) * N);
        if (result[i] == NULL || a_ref[i] == NULL)
        {
            fprintf(stderr, "host allocation failed\n");
            return 1;
        }
    }
    initCPU(a, N);

    GPU_argv_init();
    initTrace();
    startCPU();

    cudaError_t err = cudaMallocManaged(&a_gpu, N * N * sizeof(float));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMallocManaged: %s\n", cudaGetErrorString(err));
        return 1;
    }
    memcpy(a_gpu, a, N * N * sizeof(float));

    cudaStream_t stream1;
    cudaStreamCreate(&stream1);

    // Hint the managed pages onto the device before the kernel touches them.
    cudaMemPrefetchAsync(a_gpu, N * N * sizeof(float), GPU_DEVICE, stream1);
    cudaStreamSynchronize(stream1);

    // Ceil-division: one thread per matrix row, covering all N rows even
    // when N is not a multiple of the block size.
    blocks = (N + DIM_THREAD_BLOCK - 1) / DIM_THREAD_BLOCK;
    lud_kernel<<<blocks, DIM_THREAD_BLOCK, 0, stream1>>>(a_gpu, N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "lud_kernel launch: %s\n", cudaGetErrorString(err));
    cudaDeviceSynchronize();

    memcpy(c, a_gpu, N * N * sizeof(float));

    cudaFree(a_gpu);
    cudaStreamDestroy(stream1);

    endCPU();
    finiTrace();

    // Unpack the flat result into row-pointer form for verification.
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            result[i][j] = c[i * N + j];
        }
    }

    printf("=======================================================");
    printf("\n Performing inplace verification \n");

    // Reconstruct a_ref = L * U from the in-place factorization:
    // L is the strict lower triangle plus the diagonal of `result`,
    // U is the strict upper triangle with an implicit unit diagonal.
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            a_ref[i][j] = 0;
            for (k = 0; k < N; k++)
            {
                if (i >= k)
                    l1 = result[i][k];
                else
                    l1 = 0;

                if (k == j)
                    u1 = 1;
                else if (k < j)
                    u1 = result[k][j];
                else
                    u1 = 0.0;

                a_ref[i][j] = a_ref[i][j] + (l1 * u1);
            }
        }
    }

    // Compare the reconstruction against the original input (previously
    // computed but never checked). Tolerance is relative to the entry's
    // magnitude to absorb float round-off over N-term dot products.
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            float diff = fabsf(a_ref[i][j] - a[i][j]);
            if (diff > error)
                error = diff;
            if (diff > 0.01f * (1.0f + fabsf(a[i][j])))
                flag++;
        }
    }
    printf(" Verification %s: max abs error %f, %d entries out of tolerance\n",
           (flag == 0) ? "PASSED" : "FAILED", error, flag);

    // Release host memory (previously leaked).
    for (i = 0; i < N; i++)
    {
        free(result[i]);
        free(a_ref[i]);
    }
    free(result);
    free(a_ref);
    free(a);
    free(c);

    return 0;
}
|
|