| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| #include "cuda.h" |
| #include <cstdio> |
| #include <sys/time.h> |
| #include <time.h> |
| |
| #define BLOCK_DIM 16 |
|
|
| #include "../../../common/cupti_add.h" |
| #include "../../../common/cpu_timestamps.h" |
|
|
| #include <cooperative_groups.h> |
| #include <cooperative_groups/memcpy_async.h> |
|
|
| using namespace nvcuda::experimental; |
|
|
| #define PREFETCH_COUNT 2 |
|
|
| #define DIM_THREAD_BLOCK 256 |
|
|
| #ifndef SIZE |
| #define SIZE 4096 |
| #endif |
|
|
// Element-wise addition c = a + b, one element per block.
// Each block handles exactly one index (blockIdx.x); threadIdx is unused,
// so the launch is expected to use as many blocks as there are elements.
__global__ void add(float *a, float *b, float *c)
{
    const int elem = blockIdx.x;
    const float lhs = a[elem];
    const float rhs = b[elem];
    c[elem] = lhs + rhs;
}
|
|
// Divides every element of row `index` (to the right of the diagonal) of
// the size x size row-major matrix `a` by the diagonal (pivot) element
// a[index][index]. The pivot element itself is left untouched. Each
// launched thread performs the whole row serially.
__global__ void scale(float *a, int size, int index)
{
    // Flat position of the pivot element a[index][index].
    const int pivot_pos = index * size + index;
    // One past the last element of row `index`.
    const int row_end = index * size + size;

    // The pivot slot is never written inside the loop (it starts at
    // pivot_pos + 1), so caching it in a register is safe.
    const float pivot = a[pivot_pos];

    for (int col = pivot_pos + 1; col < row_end; ++col)
        a[col] /= pivot;
}
|
|
// One Gaussian-elimination step of an in-place LU decomposition.
//
// For every matrix row strictly below pivot row `index`, subtracts
// a[row][index] times the pivot row from that row for all columns right
// of `index`; the multiplier a[row][index] is left in place as the L
// factor. The pivot row is first staged into shared memory.
//
// Launch layout: 1-D grid of 1-D blocks; the thread (blockIdx.x,
// threadIdx.x) owns matrix row blockIdx.x * b_size + threadIdx.x, so
// `b_size` must equal blockDim.x and grid * block must cover `size` rows.
// Requires size <= SIZE (capacity of the shared staging buffer).
//
// Fix: the staging buffer was declared `extern __shared__ float
// pivot[SIZE];`. `extern __shared__` arrays alias the *dynamic* shared
// memory pool, whose size comes from the launch's third <<<>>> argument —
// the declared [SIZE] is ignored, and a launch that requests no dynamic
// bytes makes every access out of bounds. A statically sized __shared__
// array is what the explicit size intended. The pivot-row copy is also
// performed cooperatively by the whole block instead of serially by
// thread 0, and the unused pivot_start/pivot_end locals were removed.
__global__ void reduce(float *a, int size, int index, int b_size)
{
    __shared__ float pivot[SIZE];  // staging copy of pivot row `index`

    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int block_size = b_size;

    // Cooperative, coalesced copy of the pivot row into shared memory.
    for (int i = index + tid; i < size; i += blockDim.x)
        pivot[i] = a[(index * size) + i];

    // Every thread must see the staged pivot row before eliminating.
    __syncthreads();

    int pivot_row = (index * size);                    // flat start of pivot row
    int my_row = (((block_size * bid) + tid) * size);  // flat start of this thread's row
    int start = my_row + index;                        // multiplier slot a[row][index]
    int end = my_row + size;

    // Only rows strictly below the pivot row are eliminated; the
    // multiplier itself (at `start`) is intentionally left in place.
    if (my_row > pivot_row)
    {
        for (int i = start + 1; i < end; i++)
        {
            a[i] = a[i] - (a[start] * pivot[(i - my_row)]);
        }
    }
}
|
|
// Fills the N x N matrix `a` with pseudo-random integer values in [1, 10]
// (stored as floats). The fixed seed makes the data identical on every
// run, so results are reproducible.
void initCPU(float *a, int N)
{
    const int total = N * N;

    srand((unsigned)2);
    for (int idx = 0; idx < total; ++idx)
        a[idx] = (float)((rand() % 10) + 1);
}
|
|
// Copies the N x N row-major matrix `a` element by element into `a_dev`
// (equivalent to a memcpy of N * N floats).
void initGPU(float *a_dev, float *a, int N)
{
    const int count = N * N;

    for (int k = 0; k < count; ++k)
        a_dev[k] = a[k];
}
|
|
// In-place LU decomposition of the N x N row-major matrix `a`, using a
// software pipeline (nvcuda::experimental): while row `compute` is being
// eliminated, up to PREFETCH_COUNT pivot-row copies are staged into shared
// memory with memcpy_async.
//
// Launch layout: 1-D grid of 1-D blocks; the thread (blockIdx.x,
// threadIdx.x) owns matrix row blockIdx.x * blockDim.x + threadIdx.x.
// Requires N <= SIZE (shared staging capacity is SIZE * PREFETCH_COUNT
// floats per block).
//
// NOTE(review): the pivot-row scaling below is executed only by thread 0
// of block 0, but block.sync() is a block-local barrier — other blocks may
// stage/read row `compute` before block 0 finishes scaling it. This looks
// like a grid-wide race unless the kernel is effectively single-block or
// externally ordered — confirm.
__global__ void lud_kernel(float *a, int N)
{
    cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
    pipeline pipe;  // per-thread async-copy pipeline (nvcuda::experimental)
    // Staging area: one SIZE-float slot per in-flight pipeline stage.
    __shared__ float pivot[SIZE * PREFETCH_COUNT];

    int fetch = 0;
    int end_tile = fetch + N;  // one pipeline stage per pivot row

    for (int compute = fetch; compute < end_tile; compute++)
    {
        // Keep up to PREFETCH_COUNT stages in flight ahead of `compute`.
        // NOTE(review): on the first outer iteration this inner loop runs
        // PREFETCH_COUNT times, so the scale step below re-divides row 0
        // (a[start] itself is not modified by the loop) and both staging
        // slots are filled from the same row `compute` rather than row
        // `fetch` — verify this is the intended pipelining.
        for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
        {
            int tid = threadIdx.x;
            int bid = blockIdx.x;
            int block_size = blockDim.x;

            // Scale pivot row `compute` by its diagonal element; done
            // serially by a single thread of a single block.
            if (tid == 0 && bid == 0)
            {
                int start = (compute * N + compute);
                int end = (compute * N + N);

                for (int i = start + 1; i < end; i++)
                    a[i] = (a[i] / a[start]);
            }
            block.sync();  // block-local barrier only; see race note above

            // Stage row `compute` into the slot keyed by `fetch`.
            if (tid == 0)
            {
                for (int i = compute; i < N; i++)
                    memcpy_async(pivot[(fetch % PREFETCH_COUNT) * N + i], a[(compute * N) + i], pipe);
            }

            pipe.commit();  // close this pipeline stage
        }
        // After the final real fetch, push empty stages so wait_prior's
        // fixed-depth accounting still drains the pipeline; bump `fetch`
        // so this padding runs only once.
        if (fetch == end_tile)
        {
            for (int i = 0; i < PREFETCH_COUNT - 1; ++i)
            {
                pipe.commit();
            }
            ++fetch;
        }
        // Block until at most PREFETCH_COUNT-1 stages remain outstanding,
        // i.e. the stage staged for `compute` has landed in shared memory.
        pipe.wait_prior<PREFETCH_COUNT - 1>();
        block.sync();

        int tid = threadIdx.x;
        int bid = blockIdx.x;
        int block_size = blockDim.x;

        int pivot_row = (compute * N);                   // flat start of pivot row
        int my_row = (((block_size * bid) + tid) * N);   // flat start of this thread's row
        int start = my_row + compute;                    // multiplier slot a[row][compute]
        int end = my_row + N;

        // Eliminate: rows below the pivot subtract multiplier * staged
        // pivot row; the multiplier stays in place as the L factor.
        if (my_row > pivot_row)
        {
            for (int i = start + 1; i < end; i++)
            {
                a[i] = a[i] - (a[start] * pivot[(compute % PREFETCH_COUNT) * N + (i - my_row)]);
            }
        }
        block.sync();
    }
}
|
|
// Driver: builds a deterministic random N x N matrix, runs the pipelined
// LU kernel on a managed buffer, then verifies in place that the product
// of the extracted L and U factors reproduces the original input.
//
// Fixes over the previous version:
//  - the reconstruction a_ref = L*U was computed but never compared
//    against the input (`error` and `flag` were dead variables); the
//    comparison and a PASS/FAIL summary are now emitted;
//  - CUDA allocation / launch errors are now checked and reported;
//  - the grid size uses ceil-division so all rows are covered even when
//    N is not a multiple of DIM_THREAD_BLOCK (identical to the old value
//    for the default N == SIZE == 4096);
//  - all host allocations and the stream were leaked; they are released.
int main(int argc, char *argv[])
{
    uint64_t start_tsc = rdtsc();
    uint64_t start_tsp = rdtsp();
    printf("start_tsc %lu start_tsp %lu\n", start_tsc, start_tsp);

    float *a;           // host copy of the input matrix
    float *a_gpu;       // managed buffer the kernel factorizes in place
    float *c;           // host copy of the LU result
    float error = 0.0f; // max |(L*U)[i][j] - a[i][j]| seen during verification
    int N;
    int flag = 0;       // count of entries outside tolerance

    float **result;     // LU result as row-pointer 2-D array
    float **a_ref;      // reconstructed L*U product
    int blocks;

    int i;
    int j;
    int k;
    float l1;
    float u1;

    N = SIZE;

    a = (float *)malloc(sizeof(float) * N * N);
    c = (float *)malloc(sizeof(float) * N * N);
    result = (float **)malloc(sizeof(float *) * N);
    a_ref = (float **)malloc(sizeof(float *) * N);
    if (a == NULL || c == NULL || result == NULL || a_ref == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    for (i = 0; i < N; i++)
    {
        result[i] = (float *)malloc(sizeof(float) * N);
        a_ref[i] = (float *)malloc(sizeof(float) * N);
        if (result[i] == NULL || a_ref[i] == NULL)
        {
            fprintf(stderr, "host allocation failed\n");
            return 1;
        }
    }
    initCPU(a, N);

    GPU_argv_init();
    initTrace();
    startCPU();

    cudaError_t err = cudaMallocManaged(&a_gpu, N * N * sizeof(float));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMallocManaged: %s\n", cudaGetErrorString(err));
        return 1;
    }
    memcpy(a_gpu, a, N * N * sizeof(float));

    cudaStream_t stream1;
    cudaStreamCreate(&stream1);

    // Hint the managed pages onto the device before the kernel touches them.
    cudaMemPrefetchAsync(a_gpu, N * N * sizeof(float), GPU_DEVICE, stream1);
    cudaStreamSynchronize(stream1);

    // Ceil-division: one thread per matrix row, covering all N rows even
    // when N is not a multiple of the block size.
    blocks = (N + DIM_THREAD_BLOCK - 1) / DIM_THREAD_BLOCK;
    lud_kernel<<<blocks, DIM_THREAD_BLOCK, 0, stream1>>>(a_gpu, N);
    err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "lud_kernel launch: %s\n", cudaGetErrorString(err));
    cudaDeviceSynchronize();

    memcpy(c, a_gpu, N * N * sizeof(float));

    cudaFree(a_gpu);
    cudaStreamDestroy(stream1);

    endCPU();
    finiTrace();

    // Unpack the flat result into row-pointer form for verification.
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            result[i][j] = c[i * N + j];
        }
    }

    printf("=======================================================");
    printf("\n Performing inplace verification \n");

    // Reconstruct a_ref = L * U from the in-place factorization:
    // L is the strict lower triangle plus the diagonal of `result`,
    // U is the strict upper triangle with an implicit unit diagonal.
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            a_ref[i][j] = 0;
            for (k = 0; k < N; k++)
            {
                if (i >= k)
                    l1 = result[i][k];
                else
                    l1 = 0;

                if (k == j)
                    u1 = 1;
                else if (k < j)
                    u1 = result[k][j];
                else
                    u1 = 0.0;

                a_ref[i][j] = a_ref[i][j] + (l1 * u1);
            }
        }
    }

    // Compare the reconstruction against the original input (previously
    // computed but never checked). Tolerance is relative to the entry's
    // magnitude to absorb float round-off over N-term dot products.
    for (i = 0; i < N; i++)
    {
        for (j = 0; j < N; j++)
        {
            float diff = fabsf(a_ref[i][j] - a[i][j]);
            if (diff > error)
                error = diff;
            if (diff > 0.01f * (1.0f + fabsf(a[i][j])))
                flag++;
        }
    }
    printf(" Verification %s: max abs error %f, %d entries out of tolerance\n",
           (flag == 0) ? "PASSED" : "FAILED", error, flag);

    // Release host memory (previously leaked).
    for (i = 0; i < N; i++)
    {
        free(result[i]);
        free(a_ref[i]);
    }
    free(result);
    free(a_ref);
    free(a);
    free(c);

    return 0;
}
|
|