UvmPinAsync / workloads /realworld /async /BN /ordergraph_kernel.cu
lrh12580
first commit
5cb6c4b
#ifndef _ORDERGRAPH_KERNEL_H_
#define _ORDERGRAPH_KERNEL_H_
#include <stdio.h>
#ifdef DATA_25
#include "data25.cu"
#endif
#ifdef DATA_30
#include "data30.cu"
#endif
#ifdef DATA_40
#include "data40.cu"
#endif
#ifdef DATA_45
#include "data45.cu"
#endif
#ifdef DATA_50
#include "data50.cu"
#endif
#ifdef DATA_125
#include "data125.cu"
#endif
;
#include <cooperative_groups.h>
#include <cooperative_groups/memcpy_async.h>
using namespace nvcuda::experimental;
#define PREFETCH_COUNT 2
#define BLOCK_SIZE 256
#define MAX_NBLOCKS 1024
#define MIN_NBATCHES 16
__device__ void Dincr(int *bit, int n);
__device__ void DincrS(int *bit, int n);
__device__ bool D_getState(int parN, int *sta, int time);
__device__ void D_findComb(int *comb, int l, int n);
__device__ int D_findindex(int *arr, int size);
__device__ int D_C(int n, int a);
__global__ void genScoreKernel(int sizepernode, float *D_localscore,
int *D_data, float *D_LG) {
int id = blockIdx.x * BLOCK_SIZE + threadIdx.x;
int node, index;
bool flag;
int parent[5] = {0};
int pre[NODE_N] = {0};
int state[5] = {0};
int i, j, parN = 0, tmp, t;
int t1 = 0, t2 = 0;
float ls = 0;
int Nij[STATE_N] = {0};
if (id < sizepernode) {
D_findComb(parent, id, NODE_N - 1);
for (i = 0; i < 4; i++) {
if (parent[i] > 0)
parN++;
}
for (node = 0; node < NODE_N; node++) {
j = 1;
for (i = 0; i < NODE_N; i++) {
if (i != node)
pre[j++] = i;
}
for (tmp = 0; tmp < parN; tmp++)
state[tmp] = 0;
index = sizepernode * node + id;
// priors
/*
for(tmp=1;tmp<=4;tmp++){
localscore[index]+=100*(prior[node][pre[parent[tmp]]]-0.5)*(prior[node][pre[parent[tmp]]]-0.5)*(prior[node][pre[parent[tmp]]]-0.5);
}
*/
t = 0;
while (D_getState(parN, state, t++)) { // for get state
// printf("test %u\n",id);
ls = 0;
for (tmp = 0; tmp < STATE_N; tmp++)
Nij[tmp] = 0;
for (t1 = 0; t1 < DATA_N; t1++) {
flag = true;
for (t2 = 0; t2 < parN; t2++) {
if (D_data[t1 * NODE_N + pre[parent[t2]]] != state[t2]) {
flag = false;
break;
}
}
if (!flag)
continue;
Nij[D_data[t1 * NODE_N + node]]++;
}
tmp = STATE_N - 1;
for (t1 = 0; t1 < STATE_N; t1++) {
ls += D_LG[Nij[t1]];
tmp += Nij[t1];
}
ls -= D_LG[tmp];
ls += D_LG[STATE_N - 1];
D_localscore[index] += ls;
}
}
}
}
__global__ void computeKernel(int taskperthr, int sizepernode,
float *D_localscore, bool *D_parent, int node,
int total, float *D_Score, int *D_resP,
int nbatches) {
cooperative_groups::thread_block block =
cooperative_groups::this_thread_block();
pipeline pipe;
__shared__ float lsinblock[PREFETCH_COUNT][BLOCK_SIZE];
int fetch = 0;
int end_tile = fetch + nbatches;
int bestparent[4] = {0}, parent[5] = {-1};
for (int compute = fetch; compute < end_tile; compute++) {
for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++) {
unsigned int bid = blockIdx.x * nbatches + fetch;
unsigned int tid = threadIdx.x;
unsigned int id = bid * (BLOCK_SIZE * nbatches) + tid;
int posN = 1, i, index, tmp;
int pre[NODE_N] = {0};
int parN = 0;
float bestls = -999999999999999, ls;
for (i = 0; i < NODE_N; i++) {
if (D_parent[i] == 1) {
pre[posN++] = i;
}
}
for (i = 0; i < taskperthr && ((id * taskperthr + i) < total); i++) {
D_findComb(parent, id * taskperthr + i, posN);
for (parN = 0; parN < 4; parN++) {
if (parent[parN] < 0)
break;
if (pre[parent[parN]] > node)
parent[parN] = pre[parent[parN]];
else
parent[parN] = pre[parent[parN]] + 1;
}
for (tmp = parN; tmp > 0; tmp--) {
parent[tmp] = parent[tmp - 1];
}
parent[0] = 0;
index = D_findindex(parent, parN);
index += sizepernode * node;
ls = D_localscore[index];
if (ls > bestls) {
bestls = ls;
for (tmp = 0; tmp < 4; tmp++)
bestparent[tmp] = parent[tmp + 1];
}
}
memcpy_async(lsinblock[fetch % PREFETCH_COUNT][tid], bestls, pipe);
pipe.commit();
}
if (fetch == end_tile) {
for (int i = 0; i < PREFETCH_COUNT - 1; ++i) {
pipe.commit();
}
++fetch;
}
pipe.wait_prior<PREFETCH_COUNT - 1>();
block.sync();
int i, t;
unsigned int bid = blockIdx.x * nbatches + compute;
unsigned int tid = threadIdx.x;
// unsigned int id = bid * (BLOCK_SIZE * nbatches) + tid;
for (i = BLOCK_SIZE / 2; i >= 1; i /= 2) {
if (tid < i) {
if (lsinblock[compute % PREFETCH_COUNT][tid + i] >
lsinblock[compute % PREFETCH_COUNT][tid] &&
lsinblock[compute % PREFETCH_COUNT][tid + i] < 0) {
lsinblock[compute % PREFETCH_COUNT][tid] =
lsinblock[compute % PREFETCH_COUNT][tid + i];
lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)(tid + i);
} else if (lsinblock[compute % PREFETCH_COUNT][tid + i] <
lsinblock[compute % PREFETCH_COUNT][tid] &&
lsinblock[compute % PREFETCH_COUNT][tid] < 0) {
lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)tid;
} else if (lsinblock[tid] > 0 && lsinblock[tid + i] < 0) {
lsinblock[compute % PREFETCH_COUNT][tid] =
lsinblock[compute % PREFETCH_COUNT][tid + i];
lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)(tid + i);
} else if (lsinblock[compute % PREFETCH_COUNT][tid] < 0 &&
lsinblock[compute % PREFETCH_COUNT][tid + i] > 0) {
lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)tid;
}
}
block.sync();
}
block.sync();
if (tid == 0) {
D_Score[bid] = lsinblock[compute % PREFETCH_COUNT][0];
t = 0;
for (i = 0; i < 7 && t < 128 && t >= 0; i++) {
t = (int)lsinblock[compute % PREFETCH_COUNT][(int)powf(2.0, i) + t];
}
lsinblock[compute % PREFETCH_COUNT][0] = (float)t;
}
block.sync();
if (tid == (int)lsinblock[compute % PREFETCH_COUNT][0]) {
for (i = 0; i < 4; i++) {
D_resP[bid * 4 + i] = bestparent[i];
}
}
}
}
__device__ void Dincr(int *bit, int n) {
while (n <= NODE_N) {
bit[n]++;
if (bit[n] >= 2) {
bit[n] = 0;
n++;
} else {
break;
}
}
return;
}
__device__ void DincrS(int *bit, int n) {
bit[n]++;
if (bit[n] >= STATE_N) {
bit[n] = 0;
Dincr(bit, n + 1);
}
return;
}
__device__ bool D_getState(int parN, int *sta, int time) {
int i, j = 1;
for (i = 0; i < parN; i++) {
j *= STATE_N;
}
j--;
if (time > j)
return false;
if (time >= 1)
DincrS(sta, 0);
return true;
}
__device__ void D_findComb(int *comb, int l, int n) {
const int len = 4;
if (l == 0) {
for (int i = 0; i < len; i++)
comb[i] = -1;
return;
}
int sum = 0;
int k = 1;
while (sum < l)
sum += D_C(n, k++);
l -= sum - D_C(n, --k);
int low = 0;
int pos = 0;
while (k > 1) {
sum = 0;
int s = 1;
while (sum < l)
sum += D_C(n - s++, k - 1);
l -= sum - D_C(n - (--s), --k);
low += s;
comb[pos++] = low;
n -= s;
}
comb[pos] = low + l;
for (int i = pos + 1; i < 4; i++)
comb[i] = -1;
}
__device__ int D_findindex(int *arr,
int size) { // reminder: arr[0] has to be 0 && size
// == array size-1 && index start from 0
int i, j, index = 0;
for (i = 1; i < size; i++) {
index += D_C(NODE_N - 1, i);
}
for (i = 1; i <= size - 1; i++) {
for (j = arr[i - 1] + 1; j <= arr[i] - 1; j++) {
index += D_C(NODE_N - 1 - j, size - i);
}
}
index += arr[size] - arr[size - 1];
return index;
}
__device__ int D_C(int n, int a) {
int i, res = 1, atmp = a;
for (i = 0; i < atmp; i++) {
res *= n;
n--;
}
for (i = 0; i < atmp; i++) {
res /= a;
a--;
}
return res;
}
#endif