| #ifndef _ORDERGRAPH_KERNEL_H_ |
| #define _ORDERGRAPH_KERNEL_H_ |
|
|
| #include <stdio.h> |
|
|
| #ifdef DATA_25 |
| #include "data25.cu" |
| #endif |
| #ifdef DATA_30 |
| #include "data30.cu" |
| #endif |
| #ifdef DATA_40 |
| #include "data40.cu" |
| #endif |
| #ifdef DATA_45 |
| #include "data45.cu" |
| #endif |
| #ifdef DATA_50 |
| #include "data50.cu" |
| #endif |
| #ifdef DATA_125 |
| #include "data125.cu" |
| #endif |
| ; |
|
|
| #include <cooperative_groups.h> |
| #include <cooperative_groups/memcpy_async.h> |
|
|
| using namespace nvcuda::experimental; |
|
|
| #define PREFETCH_COUNT 2 |
|
|
| #define BLOCK_SIZE 256 |
| #define MAX_NBLOCKS 1024 |
| #define MIN_NBATCHES 16 |
|
|
|
|
| __device__ void Dincr(int *bit, int n); |
| __device__ void DincrS(int *bit, int n); |
| __device__ bool D_getState(int parN, int *sta, int time); |
| __device__ void D_findComb(int *comb, int l, int n); |
| __device__ int D_findindex(int *arr, int size); |
| __device__ int D_C(int n, int a); |
|
|
| __global__ void genScoreKernel(int sizepernode, float *D_localscore, |
| int *D_data, float *D_LG) { |
| int id = blockIdx.x * BLOCK_SIZE + threadIdx.x; |
| int node, index; |
| bool flag; |
| int parent[5] = {0}; |
| int pre[NODE_N] = {0}; |
| int state[5] = {0}; |
| int i, j, parN = 0, tmp, t; |
| int t1 = 0, t2 = 0; |
| float ls = 0; |
| int Nij[STATE_N] = {0}; |
|
|
| if (id < sizepernode) { |
|
|
| D_findComb(parent, id, NODE_N - 1); |
|
|
| for (i = 0; i < 4; i++) { |
| if (parent[i] > 0) |
| parN++; |
| } |
|
|
| for (node = 0; node < NODE_N; node++) { |
|
|
| j = 1; |
| for (i = 0; i < NODE_N; i++) { |
| if (i != node) |
| pre[j++] = i; |
| } |
|
|
| for (tmp = 0; tmp < parN; tmp++) |
| state[tmp] = 0; |
|
|
| index = sizepernode * node + id; |
|
|
| |
| |
| |
| |
| |
| |
| t = 0; |
| while (D_getState(parN, state, t++)) { |
| |
| ls = 0; |
| for (tmp = 0; tmp < STATE_N; tmp++) |
| Nij[tmp] = 0; |
|
|
| for (t1 = 0; t1 < DATA_N; t1++) { |
| flag = true; |
| for (t2 = 0; t2 < parN; t2++) { |
| if (D_data[t1 * NODE_N + pre[parent[t2]]] != state[t2]) { |
| flag = false; |
| break; |
| } |
| } |
| if (!flag) |
| continue; |
|
|
| Nij[D_data[t1 * NODE_N + node]]++; |
| } |
|
|
| tmp = STATE_N - 1; |
|
|
| for (t1 = 0; t1 < STATE_N; t1++) { |
| ls += D_LG[Nij[t1]]; |
| tmp += Nij[t1]; |
| } |
|
|
| ls -= D_LG[tmp]; |
| ls += D_LG[STATE_N - 1]; |
|
|
| D_localscore[index] += ls; |
| } |
| } |
| } |
| } |
|
|
| __global__ void computeKernel(int taskperthr, int sizepernode, |
| float *D_localscore, bool *D_parent, int node, |
| int total, float *D_Score, int *D_resP, |
| int nbatches) { |
| cooperative_groups::thread_block block = |
| cooperative_groups::this_thread_block(); |
| pipeline pipe; |
| __shared__ float lsinblock[PREFETCH_COUNT][BLOCK_SIZE]; |
|
|
| int fetch = 0; |
| int end_tile = fetch + nbatches; |
| int bestparent[4] = {0}, parent[5] = {-1}; |
|
|
| for (int compute = fetch; compute < end_tile; compute++) { |
| for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++) { |
| unsigned int bid = blockIdx.x * nbatches + fetch; |
| unsigned int tid = threadIdx.x; |
| unsigned int id = bid * (BLOCK_SIZE * nbatches) + tid; |
|
|
| int posN = 1, i, index, tmp; |
| int pre[NODE_N] = {0}; |
| int parN = 0; |
|
|
| float bestls = -999999999999999, ls; |
|
|
| for (i = 0; i < NODE_N; i++) { |
| if (D_parent[i] == 1) { |
| pre[posN++] = i; |
| } |
| } |
|
|
| for (i = 0; i < taskperthr && ((id * taskperthr + i) < total); i++) { |
|
|
| D_findComb(parent, id * taskperthr + i, posN); |
|
|
| for (parN = 0; parN < 4; parN++) { |
| if (parent[parN] < 0) |
| break; |
| if (pre[parent[parN]] > node) |
| parent[parN] = pre[parent[parN]]; |
| else |
| parent[parN] = pre[parent[parN]] + 1; |
| } |
|
|
| for (tmp = parN; tmp > 0; tmp--) { |
| parent[tmp] = parent[tmp - 1]; |
| } |
| parent[0] = 0; |
|
|
| index = D_findindex(parent, parN); |
| index += sizepernode * node; |
|
|
| ls = D_localscore[index]; |
|
|
| if (ls > bestls) { |
| bestls = ls; |
| for (tmp = 0; tmp < 4; tmp++) |
| bestparent[tmp] = parent[tmp + 1]; |
| } |
| } |
|
|
| memcpy_async(lsinblock[fetch % PREFETCH_COUNT][tid], bestls, pipe); |
| pipe.commit(); |
| } |
| if (fetch == end_tile) { |
| for (int i = 0; i < PREFETCH_COUNT - 1; ++i) { |
| pipe.commit(); |
| } |
| ++fetch; |
| } |
| pipe.wait_prior<PREFETCH_COUNT - 1>(); |
| block.sync(); |
|
|
| int i, t; |
| unsigned int bid = blockIdx.x * nbatches + compute; |
| unsigned int tid = threadIdx.x; |
| |
|
|
| for (i = BLOCK_SIZE / 2; i >= 1; i /= 2) { |
| if (tid < i) { |
| if (lsinblock[compute % PREFETCH_COUNT][tid + i] > |
| lsinblock[compute % PREFETCH_COUNT][tid] && |
| lsinblock[compute % PREFETCH_COUNT][tid + i] < 0) { |
| lsinblock[compute % PREFETCH_COUNT][tid] = |
| lsinblock[compute % PREFETCH_COUNT][tid + i]; |
| lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)(tid + i); |
| } else if (lsinblock[compute % PREFETCH_COUNT][tid + i] < |
| lsinblock[compute % PREFETCH_COUNT][tid] && |
| lsinblock[compute % PREFETCH_COUNT][tid] < 0) { |
| lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)tid; |
| } else if (lsinblock[tid] > 0 && lsinblock[tid + i] < 0) { |
| lsinblock[compute % PREFETCH_COUNT][tid] = |
| lsinblock[compute % PREFETCH_COUNT][tid + i]; |
| lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)(tid + i); |
| } else if (lsinblock[compute % PREFETCH_COUNT][tid] < 0 && |
| lsinblock[compute % PREFETCH_COUNT][tid + i] > 0) { |
| lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)tid; |
| } |
| } |
| block.sync(); |
| } |
| block.sync(); |
|
|
| if (tid == 0) { |
| D_Score[bid] = lsinblock[compute % PREFETCH_COUNT][0]; |
| t = 0; |
| for (i = 0; i < 7 && t < 128 && t >= 0; i++) { |
| t = (int)lsinblock[compute % PREFETCH_COUNT][(int)powf(2.0, i) + t]; |
| } |
| lsinblock[compute % PREFETCH_COUNT][0] = (float)t; |
| } |
| block.sync(); |
|
|
| if (tid == (int)lsinblock[compute % PREFETCH_COUNT][0]) { |
| for (i = 0; i < 4; i++) { |
| D_resP[bid * 4 + i] = bestparent[i]; |
| } |
| } |
| } |
| } |
|
|
| __device__ void Dincr(int *bit, int n) { |
|
|
| while (n <= NODE_N) { |
| bit[n]++; |
| if (bit[n] >= 2) { |
| bit[n] = 0; |
| n++; |
| } else { |
| break; |
| } |
| } |
|
|
| return; |
| } |
|
|
| __device__ void DincrS(int *bit, int n) { |
|
|
| bit[n]++; |
| if (bit[n] >= STATE_N) { |
| bit[n] = 0; |
| Dincr(bit, n + 1); |
| } |
|
|
| return; |
| } |
|
|
| __device__ bool D_getState(int parN, int *sta, int time) { |
| int i, j = 1; |
|
|
| for (i = 0; i < parN; i++) { |
| j *= STATE_N; |
| } |
| j--; |
| if (time > j) |
| return false; |
|
|
| if (time >= 1) |
| DincrS(sta, 0); |
|
|
| return true; |
| } |
|
|
| __device__ void D_findComb(int *comb, int l, int n) { |
| const int len = 4; |
| if (l == 0) { |
| for (int i = 0; i < len; i++) |
| comb[i] = -1; |
| return; |
| } |
| int sum = 0; |
| int k = 1; |
|
|
| while (sum < l) |
| sum += D_C(n, k++); |
| l -= sum - D_C(n, --k); |
| int low = 0; |
| int pos = 0; |
| while (k > 1) { |
| sum = 0; |
| int s = 1; |
| while (sum < l) |
| sum += D_C(n - s++, k - 1); |
| l -= sum - D_C(n - (--s), --k); |
| low += s; |
| comb[pos++] = low; |
| n -= s; |
| } |
| comb[pos] = low + l; |
| for (int i = pos + 1; i < 4; i++) |
| comb[i] = -1; |
| } |
|
|
| __device__ int D_findindex(int *arr, |
| int size) { |
| |
| int i, j, index = 0; |
|
|
| for (i = 1; i < size; i++) { |
| index += D_C(NODE_N - 1, i); |
| } |
|
|
| for (i = 1; i <= size - 1; i++) { |
| for (j = arr[i - 1] + 1; j <= arr[i] - 1; j++) { |
| index += D_C(NODE_N - 1 - j, size - i); |
| } |
| } |
|
|
| index += arr[size] - arr[size - 1]; |
|
|
| return index; |
| } |
|
|
| __device__ int D_C(int n, int a) { |
| int i, res = 1, atmp = a; |
|
|
| for (i = 0; i < atmp; i++) { |
| res *= n; |
| n--; |
| } |
|
|
| for (i = 0; i < atmp; i++) { |
| res /= a; |
| a--; |
| } |
|
|
| return res; |
| } |
|
|
| #endif |
|
|