UvmPinAsync / workloads /realworld /async /BN /ordergraph_kernel.cu

lrh12580

first commit

5cb6c4b about 2 years ago

8.48 kB

	#ifndef _ORDERGRAPH_KERNEL_H_
	#define _ORDERGRAPH_KERNEL_H_

	#include <stdio.h>

	#ifdef DATA_25
	#include "data25.cu"
	#endif
	#ifdef DATA_30
	#include "data30.cu"
	#endif
	#ifdef DATA_40
	#include "data40.cu"
	#endif
	#ifdef DATA_45
	#include "data45.cu"
	#endif
	#ifdef DATA_50
	#include "data50.cu"
	#endif
	#ifdef DATA_125
	#include "data125.cu"
	#endif
	;

	#include <cooperative_groups.h>
	#include <cooperative_groups/memcpy_async.h>

	using namespace nvcuda::experimental;

	#define PREFETCH_COUNT 2

	#define BLOCK_SIZE 256
	#define MAX_NBLOCKS 1024
	#define MIN_NBATCHES 16


	__device__ void Dincr(int *bit, int n);
	__device__ void DincrS(int *bit, int n);
	__device__ bool D_getState(int parN, int *sta, int time);
	__device__ void D_findComb(int *comb, int l, int n);
	__device__ int D_findindex(int *arr, int size);
	__device__ int D_C(int n, int a);

	__global__ void genScoreKernel(int sizepernode, float *D_localscore,
	int D_data, float D_LG) {
	int id = blockIdx.x * BLOCK_SIZE + threadIdx.x;
	int node, index;
	bool flag;
	int parent[5] = {0};
	int pre[NODE_N] = {0};
	int state[5] = {0};
	int i, j, parN = 0, tmp, t;
	int t1 = 0, t2 = 0;
	float ls = 0;
	int Nij[STATE_N] = {0};

	if (id < sizepernode) {

	D_findComb(parent, id, NODE_N - 1);

	for (i = 0; i < 4; i++) {
	if (parent[i] > 0)
	parN++;
	}

	for (node = 0; node < NODE_N; node++) {

	j = 1;
	for (i = 0; i < NODE_N; i++) {
	if (i != node)
	pre[j++] = i;
	}

	for (tmp = 0; tmp < parN; tmp++)
	state[tmp] = 0;

	index = sizepernode * node + id;

	// priors
	/*
	for(tmp=1;tmp<=4;tmp++){
	localscore[index]+=100(prior[node][pre[parent[tmp]]]-0.5)(prior[node][pre[parent[tmp]]]-0.5)*(prior[node][pre[parent[tmp]]]-0.5);
	}
	*/
	t = 0;
	while (D_getState(parN, state, t++)) { // for get state
	// printf("test %u\n",id);
	ls = 0;
	for (tmp = 0; tmp < STATE_N; tmp++)
	Nij[tmp] = 0;

	for (t1 = 0; t1 < DATA_N; t1++) {
	flag = true;
	for (t2 = 0; t2 < parN; t2++) {
	if (D_data[t1 * NODE_N + pre[parent[t2]]] != state[t2]) {
	flag = false;
	break;
	}
	}
	if (!flag)
	continue;

	Nij[D_data[t1 * NODE_N + node]]++;
	}

	tmp = STATE_N - 1;

	for (t1 = 0; t1 < STATE_N; t1++) {
	ls += D_LG[Nij[t1]];
	tmp += Nij[t1];
	}

	ls -= D_LG[tmp];
	ls += D_LG[STATE_N - 1];

	D_localscore[index] += ls;
	}
	}
	}
	}

	__global__ void computeKernel(int taskperthr, int sizepernode,
	float D_localscore, bool D_parent, int node,
	int total, float D_Score, int D_resP,
	int nbatches) {
	cooperative_groups::thread_block block =
	cooperative_groups::this_thread_block();
	pipeline pipe;
	__shared__ float lsinblock[PREFETCH_COUNT][BLOCK_SIZE];

	int fetch = 0;
	int end_tile = fetch + nbatches;
	int bestparent[4] = {0}, parent[5] = {-1};

	for (int compute = fetch; compute < end_tile; compute++) {
	for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++) {
	unsigned int bid = blockIdx.x * nbatches + fetch;
	unsigned int tid = threadIdx.x;
	unsigned int id = bid * (BLOCK_SIZE * nbatches) + tid;

	int posN = 1, i, index, tmp;
	int pre[NODE_N] = {0};
	int parN = 0;

	float bestls = -999999999999999, ls;

	for (i = 0; i < NODE_N; i++) {
	if (D_parent[i] == 1) {
	pre[posN++] = i;
	}
	}

	for (i = 0; i < taskperthr && ((id * taskperthr + i) < total); i++) {

	D_findComb(parent, id * taskperthr + i, posN);

	for (parN = 0; parN < 4; parN++) {
	if (parent[parN] < 0)
	break;
	if (pre[parent[parN]] > node)
	parent[parN] = pre[parent[parN]];
	else
	parent[parN] = pre[parent[parN]] + 1;
	}

	for (tmp = parN; tmp > 0; tmp--) {
	parent[tmp] = parent[tmp - 1];
	}
	parent[0] = 0;

	index = D_findindex(parent, parN);
	index += sizepernode * node;

	ls = D_localscore[index];

	if (ls > bestls) {
	bestls = ls;
	for (tmp = 0; tmp < 4; tmp++)
	bestparent[tmp] = parent[tmp + 1];
	}
	}

	memcpy_async(lsinblock[fetch % PREFETCH_COUNT][tid], bestls, pipe);
	pipe.commit();
	}
	if (fetch == end_tile) {
	for (int i = 0; i < PREFETCH_COUNT - 1; ++i) {
	pipe.commit();
	}
	++fetch;
	}
	pipe.wait_prior<PREFETCH_COUNT - 1>();
	block.sync();

	int i, t;
	unsigned int bid = blockIdx.x * nbatches + compute;
	unsigned int tid = threadIdx.x;
	// unsigned int id = bid * (BLOCK_SIZE * nbatches) + tid;

	for (i = BLOCK_SIZE / 2; i >= 1; i /= 2) {
	if (tid < i) {
	if (lsinblock[compute % PREFETCH_COUNT][tid + i] >
	lsinblock[compute % PREFETCH_COUNT][tid] &&
	lsinblock[compute % PREFETCH_COUNT][tid + i] < 0) {
	lsinblock[compute % PREFETCH_COUNT][tid] =
	lsinblock[compute % PREFETCH_COUNT][tid + i];
	lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)(tid + i);
	} else if (lsinblock[compute % PREFETCH_COUNT][tid + i] <
	lsinblock[compute % PREFETCH_COUNT][tid] &&
	lsinblock[compute % PREFETCH_COUNT][tid] < 0) {
	lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)tid;
	} else if (lsinblock[tid] > 0 && lsinblock[tid + i] < 0) {
	lsinblock[compute % PREFETCH_COUNT][tid] =
	lsinblock[compute % PREFETCH_COUNT][tid + i];
	lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)(tid + i);
	} else if (lsinblock[compute % PREFETCH_COUNT][tid] < 0 &&
	lsinblock[compute % PREFETCH_COUNT][tid + i] > 0) {
	lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)tid;
	}
	}
	block.sync();
	}
	block.sync();

	if (tid == 0) {
	D_Score[bid] = lsinblock[compute % PREFETCH_COUNT][0];
	t = 0;
	for (i = 0; i < 7 && t < 128 && t >= 0; i++) {
	t = (int)lsinblock[compute % PREFETCH_COUNT][(int)powf(2.0, i) + t];
	}
	lsinblock[compute % PREFETCH_COUNT][0] = (float)t;
	}
	block.sync();

	if (tid == (int)lsinblock[compute % PREFETCH_COUNT][0]) {
	for (i = 0; i < 4; i++) {
	D_resP[bid * 4 + i] = bestparent[i];
	}
	}
	}
	}

	__device__ void Dincr(int *bit, int n) {

	while (n <= NODE_N) {
	bit[n]++;
	if (bit[n] >= 2) {
	bit[n] = 0;
	n++;
	} else {
	break;
	}
	}

	return;
	}

	__device__ void DincrS(int *bit, int n) {

	bit[n]++;
	if (bit[n] >= STATE_N) {
	bit[n] = 0;
	Dincr(bit, n + 1);
	}

	return;
	}

	__device__ bool D_getState(int parN, int *sta, int time) {
	int i, j = 1;

	for (i = 0; i < parN; i++) {
	j *= STATE_N;
	}
	j--;
	if (time > j)
	return false;

	if (time >= 1)
	DincrS(sta, 0);

	return true;
	}

	__device__ void D_findComb(int *comb, int l, int n) {
	const int len = 4;
	if (l == 0) {
	for (int i = 0; i < len; i++)
	comb[i] = -1;
	return;
	}
	int sum = 0;
	int k = 1;

	while (sum < l)
	sum += D_C(n, k++);
	l -= sum - D_C(n, --k);
	int low = 0;
	int pos = 0;
	while (k > 1) {
	sum = 0;
	int s = 1;
	while (sum < l)
	sum += D_C(n - s++, k - 1);
	l -= sum - D_C(n - (--s), --k);
	low += s;
	comb[pos++] = low;
	n -= s;
	}
	comb[pos] = low + l;
	for (int i = pos + 1; i < 4; i++)
	comb[i] = -1;
	}

	__device__ int D_findindex(int *arr,
	int size) { // reminder: arr[0] has to be 0 && size
	// == array size-1 && index start from 0
	int i, j, index = 0;

	for (i = 1; i < size; i++) {
	index += D_C(NODE_N - 1, i);
	}

	for (i = 1; i <= size - 1; i++) {
	for (j = arr[i - 1] + 1; j <= arr[i] - 1; j++) {
	index += D_C(NODE_N - 1 - j, size - i);
	}
	}

	index += arr[size] - arr[size - 1];

	return index;
	}

	__device__ int D_C(int n, int a) {
	int i, res = 1, atmp = a;

	for (i = 0; i < atmp; i++) {
	res *= n;
	n--;
	}

	for (i = 0; i < atmp; i++) {
	res /= a;
	a--;
	}

	return res;
	}

	#endif