#ifndef _ORDERGRAPH_KERNEL_H_ #define _ORDERGRAPH_KERNEL_H_ #include #ifdef DATA_25 #include "data25.cu" #endif #ifdef DATA_30 #include "data30.cu" #endif #ifdef DATA_40 #include "data40.cu" #endif #ifdef DATA_45 #include "data45.cu" #endif #ifdef DATA_50 #include "data50.cu" #endif #ifdef DATA_125 #include "data125.cu" #endif ; #include #include using namespace nvcuda::experimental; #define PREFETCH_COUNT 2 #define BLOCK_SIZE 256 #define MAX_NBLOCKS 1024 #define MIN_NBATCHES 16 __device__ void Dincr(int *bit, int n); __device__ void DincrS(int *bit, int n); __device__ bool D_getState(int parN, int *sta, int time); __device__ void D_findComb(int *comb, int l, int n); __device__ int D_findindex(int *arr, int size); __device__ int D_C(int n, int a); __global__ void genScoreKernel(int sizepernode, float *D_localscore, int *D_data, float *D_LG) { int id = blockIdx.x * BLOCK_SIZE + threadIdx.x; int node, index; bool flag; int parent[5] = {0}; int pre[NODE_N] = {0}; int state[5] = {0}; int i, j, parN = 0, tmp, t; int t1 = 0, t2 = 0; float ls = 0; int Nij[STATE_N] = {0}; if (id < sizepernode) { D_findComb(parent, id, NODE_N - 1); for (i = 0; i < 4; i++) { if (parent[i] > 0) parN++; } for (node = 0; node < NODE_N; node++) { j = 1; for (i = 0; i < NODE_N; i++) { if (i != node) pre[j++] = i; } for (tmp = 0; tmp < parN; tmp++) state[tmp] = 0; index = sizepernode * node + id; // priors /* for(tmp=1;tmp<=4;tmp++){ localscore[index]+=100*(prior[node][pre[parent[tmp]]]-0.5)*(prior[node][pre[parent[tmp]]]-0.5)*(prior[node][pre[parent[tmp]]]-0.5); } */ t = 0; while (D_getState(parN, state, t++)) { // for get state // printf("test %u\n",id); ls = 0; for (tmp = 0; tmp < STATE_N; tmp++) Nij[tmp] = 0; for (t1 = 0; t1 < DATA_N; t1++) { flag = true; for (t2 = 0; t2 < parN; t2++) { if (D_data[t1 * NODE_N + pre[parent[t2]]] != state[t2]) { flag = false; break; } } if (!flag) continue; Nij[D_data[t1 * NODE_N + node]]++; } tmp = STATE_N - 1; for (t1 = 0; t1 < STATE_N; t1++) { ls += D_LG[Nij[t1]]; tmp += Nij[t1]; } ls -= D_LG[tmp]; ls += D_LG[STATE_N - 1]; D_localscore[index] += ls; } } } } __global__ void computeKernel(int taskperthr, int sizepernode, float *D_localscore, bool *D_parent, int node, int total, float *D_Score, int *D_resP, int nbatches) { cooperative_groups::thread_block block = cooperative_groups::this_thread_block(); pipeline pipe; __shared__ float lsinblock[PREFETCH_COUNT][BLOCK_SIZE]; int fetch = 0; int end_tile = fetch + nbatches; int bestparent[4] = {0}, parent[5] = {-1}; for (int compute = fetch; compute < end_tile; compute++) { for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++) { unsigned int bid = blockIdx.x * nbatches + fetch; unsigned int tid = threadIdx.x; unsigned int id = bid * (BLOCK_SIZE * nbatches) + tid; int posN = 1, i, index, tmp; int pre[NODE_N] = {0}; int parN = 0; float bestls = -999999999999999, ls; for (i = 0; i < NODE_N; i++) { if (D_parent[i] == 1) { pre[posN++] = i; } } for (i = 0; i < taskperthr && ((id * taskperthr + i) < total); i++) { D_findComb(parent, id * taskperthr + i, posN); for (parN = 0; parN < 4; parN++) { if (parent[parN] < 0) break; if (pre[parent[parN]] > node) parent[parN] = pre[parent[parN]]; else parent[parN] = pre[parent[parN]] + 1; } for (tmp = parN; tmp > 0; tmp--) { parent[tmp] = parent[tmp - 1]; } parent[0] = 0; index = D_findindex(parent, parN); index += sizepernode * node; ls = D_localscore[index]; if (ls > bestls) { bestls = ls; for (tmp = 0; tmp < 4; tmp++) bestparent[tmp] = parent[tmp + 1]; } } memcpy_async(lsinblock[fetch % PREFETCH_COUNT][tid], bestls, pipe); pipe.commit(); } if (fetch == end_tile) { for (int i = 0; i < PREFETCH_COUNT - 1; ++i) { pipe.commit(); } ++fetch; } pipe.wait_prior(); block.sync(); int i, t; unsigned int bid = blockIdx.x * nbatches + compute; unsigned int tid = threadIdx.x; // unsigned int id = bid * (BLOCK_SIZE * nbatches) + tid; for (i = BLOCK_SIZE / 2; i >= 1; i /= 2) { if (tid < i) { if (lsinblock[compute % PREFETCH_COUNT][tid + i] > lsinblock[compute % PREFETCH_COUNT][tid] && lsinblock[compute % PREFETCH_COUNT][tid + i] < 0) { lsinblock[compute % PREFETCH_COUNT][tid] = lsinblock[compute % PREFETCH_COUNT][tid + i]; lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)(tid + i); } else if (lsinblock[compute % PREFETCH_COUNT][tid + i] < lsinblock[compute % PREFETCH_COUNT][tid] && lsinblock[compute % PREFETCH_COUNT][tid] < 0) { lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)tid; } else if (lsinblock[tid] > 0 && lsinblock[tid + i] < 0) { lsinblock[compute % PREFETCH_COUNT][tid] = lsinblock[compute % PREFETCH_COUNT][tid + i]; lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)(tid + i); } else if (lsinblock[compute % PREFETCH_COUNT][tid] < 0 && lsinblock[compute % PREFETCH_COUNT][tid + i] > 0) { lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)tid; } } block.sync(); } block.sync(); if (tid == 0) { D_Score[bid] = lsinblock[compute % PREFETCH_COUNT][0]; t = 0; for (i = 0; i < 7 && t < 128 && t >= 0; i++) { t = (int)lsinblock[compute % PREFETCH_COUNT][(int)powf(2.0, i) + t]; } lsinblock[compute % PREFETCH_COUNT][0] = (float)t; } block.sync(); if (tid == (int)lsinblock[compute % PREFETCH_COUNT][0]) { for (i = 0; i < 4; i++) { D_resP[bid * 4 + i] = bestparent[i]; } } } } __device__ void Dincr(int *bit, int n) { while (n <= NODE_N) { bit[n]++; if (bit[n] >= 2) { bit[n] = 0; n++; } else { break; } } return; } __device__ void DincrS(int *bit, int n) { bit[n]++; if (bit[n] >= STATE_N) { bit[n] = 0; Dincr(bit, n + 1); } return; } __device__ bool D_getState(int parN, int *sta, int time) { int i, j = 1; for (i = 0; i < parN; i++) { j *= STATE_N; } j--; if (time > j) return false; if (time >= 1) DincrS(sta, 0); return true; } __device__ void D_findComb(int *comb, int l, int n) { const int len = 4; if (l == 0) { for (int i = 0; i < len; i++) comb[i] = -1; return; } int sum = 0; int k = 1; while (sum < l) sum += D_C(n, k++); l -= sum - D_C(n, --k); int low = 0; int pos = 0; while (k > 1) { sum = 0; int s = 1; while (sum < l) sum += D_C(n - s++, k - 1); l -= sum - D_C(n - (--s), --k); low += s; comb[pos++] = low; n -= s; } comb[pos] = low + l; for (int i = pos + 1; i < 4; i++) comb[i] = -1; } __device__ int D_findindex(int *arr, int size) { // reminder: arr[0] has to be 0 && size // == array size-1 && index start from 0 int i, j, index = 0; for (i = 1; i < size; i++) { index += D_C(NODE_N - 1, i); } for (i = 1; i <= size - 1; i++) { for (j = arr[i - 1] + 1; j <= arr[i] - 1; j++) { index += D_C(NODE_N - 1 - j, size - i); } } index += arr[size] - arr[size - 1]; return index; } __device__ int D_C(int n, int a) { int i, res = 1, atmp = a; for (i = 0; i < atmp; i++) { res *= n; n--; } for (i = 0; i < atmp; i++) { res /= a; a--; } return res; } #endif