| #include <math.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <time.h> |
| |
| |
| #include <cuda_runtime.h> |
| |
| #include "ordergraph_kernel.cu" |
| ; |
|
|
| #include "../../../common/cpu_timestamps.h" |
| #include "../../../common/cupti_add.h" |
|
|
| const int HIGHEST = 3; |
| int taskperthr = 1; |
| int sizepernode; |
| int ITER = 100; |
|
|
| |
| float preScore = -99999999999.0f; |
| float score = 0.0; |
| float maxScore[HIGHEST] = {-999999999.0f}; |
| bool orders[NODE_N][NODE_N]; |
| bool preOrders[NODE_N][NODE_N]; |
| bool preGraph[NODE_N][NODE_N]; |
| bool bestGraph[HIGHEST][NODE_N][NODE_N]; |
| bool graph[NODE_N][NODE_N]; |
| |
| float *localscore, *D_localscore, *D_Score, *scores; |
| float *LG; |
| bool *D_parent; |
| int *D_resP, *parents; |
|
|
| void initial(); |
| int genOrders(); |
| int ConCore(); |
| bool getparent(int *bit, int *pre, int posN, int *parent, int *parN, |
| int time); |
| void incr(int *bit, int n); |
| void incrS(int *bit, int n); |
| bool getState( |
| int parN, int *state, |
| int time); |
| float logGamma(int N); |
| float findBestGraph(); |
| void genScore(); |
| int convert(int *parent, int parN); |
| void sortGraph(); |
| void swap(int a, int b); |
| void Pre_logGamma(); |
| int findindex(int *arr, int size); |
| int C(int n, int a); |
|
|
| FILE *fpout; |
|
|
| int main(int argc, char *argv[]) { |
| |
| |
| |
| |
| |
| |
| uint64_t start_tsc = rdtsc(); |
| uint64_t start_tsp = rdtsp(); |
| printf("start_tsc %lu start_tsp %lu\n", start_tsc, start_tsp); |
| int i, j, c = 0, tmp, a, b; |
| float tmpd; |
| #ifdef DATA_25 |
| char name[20] = "25.out"; |
| #endif |
| #ifdef DATA_30 |
| char name[20] = "30.out"; |
| #endif |
| #ifdef DATA_40 |
| char name[20] = "40.out"; |
| #endif |
| #ifdef DATA_45 |
| char name[20] = "45.out"; |
| #endif |
| #ifdef DATA_50 |
| char name[20] = "50.out"; |
| #endif |
| #ifdef DATA_125 |
| char name[20] = "125.out"; |
| #endif |
|
|
| fpout = fopen(name, "w"); |
|
|
| clock_t start, finish, total = 0, pre1, pre2; |
| cudaDeviceSynchronize(); |
|
|
| printf("NODE_N=%d\nInitialization...\n", NODE_N); |
| pre1 = clock(); |
|
|
| srand(time(NULL)); |
| initial(); |
|
|
| GPU_argv_init(); |
| initTrace(); |
| startCPU(); |
|
|
| genScore(); |
| pre2 = clock(); |
| printf("OK, begin to generate orders.\n"); |
|
|
| i = 0; |
| while (i != ITER) { |
|
|
| start = clock(); |
|
|
| i++; |
| score = 0; |
|
|
| for (a = 0; a < NODE_N; a++) { |
| for (j = 0; j < NODE_N; j++) { |
| orders[a][j] = preOrders[a][j]; |
| } |
| } |
|
|
| tmp = rand() % 6; |
| for (j = 0; j < tmp; j++) |
| genOrders(); |
|
|
| score = findBestGraph(); |
|
|
| finish = clock(); |
| total += finish - start; |
|
|
| ConCore(); |
|
|
| |
| if (c < HIGHEST) { |
| tmp = 1; |
| for (j = 0; j < c; j++) { |
| if (maxScore[j] == preScore) { |
| tmp = 0; |
| } |
| } |
| if (tmp != 0) { |
| maxScore[c] = preScore; |
| for (a = 0; a < NODE_N; a++) { |
| for (b = 0; b < NODE_N; b++) { |
| bestGraph[c][a][b] = preGraph[a][b]; |
| } |
| } |
| c++; |
| } |
|
|
| } else if (c == HIGHEST) { |
| sortGraph(); |
| c++; |
| } else { |
|
|
| tmp = 1; |
| for (j = 0; j < HIGHEST; j++) { |
| if (maxScore[j] == preScore) { |
| tmp = 0; |
| break; |
| } |
| } |
| if (tmp != 0 && preScore > maxScore[HIGHEST - 1]) { |
| maxScore[HIGHEST - 1] = preScore; |
| for (a = 0; a < NODE_N; a++) { |
| for (b = 0; b < NODE_N; b++) { |
| bestGraph[HIGHEST - 1][a][b] = preGraph[a][b]; |
| } |
| } |
| b = HIGHEST - 1; |
| for (a = HIGHEST - 2; a >= 0; a--) { |
| if (maxScore[b] > maxScore[a]) { |
| swap(a, b); |
| tmpd = maxScore[a]; |
| maxScore[a] = maxScore[b]; |
| maxScore[b] = tmpd; |
| b = a; |
| } |
| } |
| } |
| } |
|
|
| } |
|
|
| cudaFreeHost(localscore); |
| cudaFree(D_localscore); |
| cudaFree(D_parent); |
|
|
| cudaFreeHost(scores); |
| cudaFreeHost(parents); |
| cudaFree(D_Score); |
| cudaFree(D_resP); |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| endCPU(); |
| finiTrace(); |
| fprintf(fpout, "Duration per interation is %f seconds.\n", |
| ((float)total / ITER) / CLOCKS_PER_SEC); |
| fprintf(fpout, "Total duration is %f seconds.\n", |
| (float)(pre2 - pre1 + total) / CLOCKS_PER_SEC); |
| fprintf(fpout, "Preprocessing duration is %f seconds.\n", |
| (float)(pre2 - pre1) / CLOCKS_PER_SEC); |
|
|
| printf("Duration per interation is %f seconds.\n", |
| ((float)total / ITER) / CLOCKS_PER_SEC); |
| printf("Total duration is %f seconds.\n", |
| (float)(pre2 - pre1 + total) / CLOCKS_PER_SEC); |
| printf("Preprocessing duration is %f seconds.\n", |
| (float)(pre2 - pre1) / CLOCKS_PER_SEC); |
|
|
| return 0; |
| } |
|
|
| void sortGraph() { |
| float max = -99999999999999; |
| int maxi, i, j; |
| float tmp; |
|
|
| for (j = 0; j < HIGHEST - 1; j++) { |
| max = maxScore[j]; |
| maxi = j; |
| for (i = j + 1; i < HIGHEST; i++) { |
| if (maxScore[i] > max) { |
| max = maxScore[i]; |
| maxi = i; |
| } |
| } |
|
|
| swap(j, maxi); |
| tmp = maxScore[j]; |
| maxScore[j] = max; |
| maxScore[maxi] = tmp; |
| } |
| } |
|
|
| void swap(int a, int b) { |
| int i, j; |
| bool tmp; |
|
|
| for (i = 0; i < NODE_N; i++) { |
| for (j = 0; j < NODE_N; j++) { |
|
|
| tmp = bestGraph[a][i][j]; |
| bestGraph[a][i][j] = bestGraph[b][i][j]; |
| bestGraph[b][i][j] = tmp; |
| } |
| } |
| } |
|
|
| void initial() { |
| int i, j, tmp, a, b, r; |
| bool tmpd; |
| tmp = 1; |
| for (i = 1; i <= 4; i++) { |
| tmp += C(NODE_N - 1, i); |
| } |
| sizepernode = tmp; |
| tmp *= NODE_N; |
|
|
| cudaMallocHost((void **)&localscore, tmp * sizeof(float)); |
|
|
| for (i = 0; i < tmp; i++) |
| localscore[i] = 0; |
|
|
| for (i = 0; i < NODE_N; i++) { |
| for (j = 0; j < NODE_N; j++) |
| orders[i][j] = 0; |
| } |
| for (i = 0; i < NODE_N; i++) { |
| for (j = 0; j < i; j++) |
| orders[i][j] = 1; |
| } |
| r = rand() % 10000; |
| for (i = 0; i < r; i++) { |
| a = rand() % NODE_N; |
| b = rand() % NODE_N; |
| for (j = 0; j < NODE_N; j++) { |
| tmpd = orders[j][a]; |
| orders[j][a] = orders[j][b]; |
| orders[j][b] = tmpd; |
| } |
|
|
| for (j = 0; j < NODE_N; j++) { |
| tmpd = orders[a][j]; |
| orders[a][j] = orders[b][j]; |
| orders[b][j] = tmpd; |
| } |
| } |
|
|
| for (i = 0; i < NODE_N; i++) { |
| for (j = 0; j < NODE_N; j++) { |
| preOrders[i][j] = orders[i][j]; |
| } |
| } |
| } |
|
|
| |
| int genOrders() { |
|
|
| int a, b, j; |
| bool tmp; |
| a = rand() % NODE_N; |
| b = rand() % NODE_N; |
|
|
| for (j = 0; j < NODE_N; j++) { |
| tmp = orders[a][j]; |
| orders[a][j] = orders[b][j]; |
| orders[b][j] = tmp; |
| } |
| for (j = 0; j < NODE_N; j++) { |
| tmp = orders[j][a]; |
| orders[j][a] = orders[j][b]; |
| orders[j][b] = tmp; |
| } |
|
|
| return 1; |
| } |
|
|
| |
| int ConCore() { |
| int i, j; |
| float tmp; |
| tmp = log((rand() % 100000) / 100000.0); |
| if (tmp < (score - preScore)) { |
|
|
| for (i = 0; i < NODE_N; i++) { |
| for (j = 0; j < NODE_N; j++) { |
| preOrders[i][j] = orders[i][j]; |
| preGraph[i][j] = graph[i][j]; |
| } |
| } |
| preScore = score; |
|
|
| return 1; |
| } |
|
|
| return 0; |
| } |
|
|
| void genScore() { |
| int *D_data; |
| float *D_LG; |
| dim3 grid(sizepernode / 256 + 1, 1, 1); |
| dim3 threads(256, 1, 1); |
|
|
| Pre_logGamma(); |
| |
| cudaMallocManaged((void **)&D_data, NODE_N * DATA_N * sizeof(int)); |
| cudaMallocManaged((void **)&D_localscore, NODE_N * sizepernode * sizeof(float)); |
| cudaMallocManaged((void **)&D_LG, (DATA_N + 2) * sizeof(float)); |
| cudaMemset(D_localscore, 0.0, NODE_N * sizepernode * sizeof(float)); |
| memcpy(D_data, data, NODE_N * DATA_N * sizeof(int)); |
| memcpy(D_LG, LG, (DATA_N + 2) * sizeof(float)); |
|
|
| cudaStream_t stream1; |
| cudaStream_t stream2; |
| cudaStream_t stream3; |
| cudaStreamCreate(&stream1); |
| cudaStreamCreate(&stream2); |
| cudaStreamCreate(&stream3); |
| cudaMemPrefetchAsync(D_data, NODE_N * DATA_N * sizeof(int), GPU_DEVICE, stream1); |
| cudaStreamSynchronize(stream1); |
| cudaMemPrefetchAsync(D_LG, (DATA_N + 2) * sizeof(float), GPU_DEVICE, stream2); |
| cudaStreamSynchronize(stream2); |
| cudaMemPrefetchAsync(D_localscore,NODE_N * sizepernode * sizeof(float), GPU_DEVICE, stream3); |
| cudaStreamSynchronize(stream3); |
|
|
| genScoreKernel<<<grid, threads, 0, stream3>>>(sizepernode, D_localscore, D_data, D_LG); |
| cudaDeviceSynchronize(); |
|
|
| memcpy(localscore, D_localscore, NODE_N * sizepernode * sizeof(float)); |
|
|
| |
| |
|
|
| cudaFreeHost(LG); |
| cudaFree(D_LG); |
| cudaFree(D_data); |
|
|
| cudaMallocHost((void **)&scores, |
| (sizepernode / (256 * taskperthr) + 1) * sizeof(float)); |
| cudaMallocHost((void **)&parents, |
| (sizepernode / (256 * taskperthr) + 1) * 4 * sizeof(int)); |
| cudaMallocManaged((void **)&D_Score, |
| (sizepernode / (256 * taskperthr) + 1) * sizeof(float)); |
| cudaMallocManaged((void **)&D_parent, NODE_N * sizeof(bool)); |
| cudaMallocManaged((void **)&D_resP, |
| (sizepernode / (256 * taskperthr) + 1) * 4 * sizeof(int)); |
| } |
|
|
| int convert(int *parent, int parN) { |
| int i, j, w = 1, tmp = 0; |
| j = 0; |
| for (i = 0; parN > 0 && i <= parent[parN - 1]; i++) { |
| if (parent[j] == i) { |
| j++; |
| tmp += w; |
| } |
| w *= 2; |
| } |
|
|
| return tmp; |
| } |
|
|
| void Pre_logGamma() { |
|
|
| cudaMallocHost((void **)&LG, (DATA_N + 2) * sizeof(float)); |
|
|
| LG[1] = log(1.0); |
| float i; |
| for (i = 2; i <= DATA_N + 1; i++) { |
| LG[(int)i] = LG[(int)i - 1] + log((float)i); |
| } |
| } |
|
|
| void incr(int *bit, int n) { |
|
|
| bit[n]++; |
| if (bit[n] >= 2) { |
| bit[n] = 0; |
| incr(bit, n + 1); |
| } |
|
|
| return; |
| } |
|
|
| void incrS(int *bit, int n) { |
|
|
| bit[n]++; |
| if (bit[n] >= STATE_N) { |
| bit[n] = 0; |
| incr(bit, n + 1); |
| } |
|
|
| return; |
| } |
|
|
| bool getState(int parN, int *state, int time) { |
| int j = 1; |
|
|
| j = pow(STATE_N, (float)parN) - 1; |
|
|
| if (time > j) |
| return false; |
|
|
| if (time >= 1) |
| incrS(state, 0); |
|
|
| return true; |
| } |
|
|
| bool getparent(int *bit, int *pre, int posN, int *parent, int *parN, int time) { |
| int i, j = 1; |
|
|
| *parN = 0; |
| if (time == 0) |
| return true; |
|
|
| for (i = 0; i < posN; i++) { |
| j = j * 2; |
| } |
| j--; |
|
|
| if (time > j) |
| return false; |
|
|
| incr(bit, 0); |
|
|
| for (i = 0; i < posN; i++) { |
| if (bit[i] == 1) { |
| parent[(*parN)++] = pre[i]; |
| } |
| } |
|
|
| return true; |
| } |
|
|
| float findBestGraph() { |
| float bestls = -99999999; |
| int bestparent[5]; |
| int bestpN, total; |
| int node, index; |
| int pre[NODE_N] = {0}; |
| int parent[NODE_N] = {0}; |
| int posN = 0, i, j, parN, tmp, k, l; |
| float ls = -99999999999, score = 0; |
| int blocknum; |
|
|
| for (i = 0; i < NODE_N; i++) |
| for (j = 0; j < NODE_N; j++) |
| graph[i][j] = 0; |
|
|
| for (node = 0; node < NODE_N; node++) { |
|
|
| bestls = -99999999; |
| posN = 0; |
|
|
| for (i = 0; i < NODE_N; i++) { |
| if (orders[node][i] == 1) { |
| pre[posN++] = i; |
| } |
| } |
|
|
| if (posN >= 0) { |
| total = C(posN, 4) + C(posN, 3) + C(posN, 2) + posN + 1; |
| taskperthr = 1; |
| blocknum = total / (256 * taskperthr) + 1; |
|
|
| int nbatches = MIN_NBATCHES; |
|
|
| int blocknum_max = total / (BLOCK_SIZE * MIN_NBATCHES * taskperthr) + 1; |
| if (blocknum_max >= MAX_NBLOCKS) { |
| blocknum = MAX_NBLOCKS; |
| nbatches = (total + 1) / (BLOCK_SIZE * MAX_NBLOCKS * taskperthr); |
| } else { |
| blocknum = blocknum_max; |
| } |
|
|
| cudaMemset(D_resP, 0, blocknum * 4 * sizeof(int)); |
| cudaMemset(D_Score, -999999.0, blocknum * nbatches * sizeof(float)); |
| memcpy(D_parent, orders[node], NODE_N * sizeof(bool)); |
|
|
| cudaStream_t stream1; |
| cudaStream_t stream2; |
| cudaStream_t stream3; |
| cudaStreamCreate(&stream1); |
| cudaStreamCreate(&stream2); |
| cudaStreamCreate(&stream3); |
| cudaMemPrefetchAsync(D_parent, NODE_N * sizeof(bool), GPU_DEVICE, stream1); |
| cudaStreamSynchronize(stream1); |
| cudaMemPrefetchAsync(D_resP, blocknum * 4 * sizeof(int), GPU_DEVICE, stream2); |
| cudaStreamSynchronize(stream2); |
| cudaMemPrefetchAsync(D_Score, blocknum * nbatches * sizeof(float), GPU_DEVICE, stream3); |
| cudaStreamSynchronize(stream3); |
|
|
| computeKernel<<<blocknum, 256, 0, stream3>>>( |
| taskperthr, sizepernode, D_localscore, D_parent, node, total, D_Score, |
| D_resP, nbatches); |
| cudaDeviceSynchronize(); |
|
|
| memcpy(parents, D_resP, blocknum * 4 * sizeof(int)); |
| memcpy(scores, D_Score, blocknum * sizeof(float)); |
|
|
| for (i = 0; i < blocknum * nbatches; i++) { |
|
|
| if (scores[i] > bestls) { |
|
|
| bestls = scores[i]; |
|
|
| parN = 0; |
| for (tmp = 0; tmp < 4; tmp++) { |
| if (parents[i * 4 + tmp] < 0) |
| break; |
|
|
| bestparent[tmp] = parents[i * 4 + tmp]; |
|
|
| parN++; |
| } |
|
|
| bestpN = parN; |
| } |
| } |
| } else { |
| if (posN >= 4) { |
| for (i = 0; i < posN; i++) { |
| for (j = i + 1; j < posN; j++) { |
| for (k = j + 1; k < posN; k++) { |
| for (l = k + 1; l < posN; l++) { |
| parN = 4; |
| if (pre[i] > node) |
| parent[1] = pre[i]; |
| else |
| parent[1] = pre[i] + 1; |
| if (pre[j] > node) |
| parent[2] = pre[j]; |
| else |
| parent[2] = pre[j] + 1; |
| if (pre[k] > node) |
| parent[3] = pre[k]; |
| else |
| parent[3] = pre[k] + 1; |
| if (pre[l] > node) |
| parent[4] = pre[l]; |
| else |
| parent[4] = pre[l] + 1; |
|
|
| index = findindex(parent, parN); |
| index += sizepernode * node; |
| ls = localscore[index]; |
|
|
| if (ls > bestls) { |
| bestls = ls; |
| bestpN = parN; |
| for (tmp = 0; tmp < parN; tmp++) |
| bestparent[tmp] = parent[tmp + 1]; |
| } |
| } |
| } |
| } |
| } |
| } |
|
|
| if (posN >= 3) { |
| for (i = 0; i < posN; i++) { |
| for (j = i + 1; j < posN; j++) { |
| for (k = j + 1; k < posN; k++) { |
|
|
| parN = 3; |
| if (pre[i] > node) |
| parent[1] = pre[i]; |
| else |
| parent[1] = pre[i] + 1; |
| if (pre[j] > node) |
| parent[2] = pre[j]; |
| else |
| parent[2] = pre[j] + 1; |
| if (pre[k] > node) |
| parent[3] = pre[k]; |
| else |
| parent[3] = pre[k] + 1; |
|
|
| index = findindex(parent, parN); |
| index += sizepernode * node; |
| ls = localscore[index]; |
|
|
| if (ls > bestls) { |
| bestls = ls; |
| bestpN = parN; |
| for (tmp = 0; tmp < parN; tmp++) |
| bestparent[tmp] = parent[tmp + 1]; |
| } |
| } |
| } |
| } |
| } |
|
|
| if (posN >= 2) { |
| for (i = 0; i < posN; i++) { |
| for (j = i + 1; j < posN; j++) { |
|
|
| parN = 2; |
| if (pre[i] > node) |
| parent[1] = pre[i]; |
| else |
| parent[1] = pre[i] + 1; |
| if (pre[j] > node) |
| parent[2] = pre[j]; |
| else |
| parent[2] = pre[j] + 1; |
|
|
| index = findindex(parent, parN); |
| index += sizepernode * node; |
| ls = localscore[index]; |
|
|
| if (ls > bestls) { |
| bestls = ls; |
| bestpN = parN; |
| for (tmp = 0; tmp < parN; tmp++) |
| bestparent[tmp] = parent[tmp + 1]; |
| } |
| } |
| } |
| } |
|
|
| if (posN >= 1) { |
| for (i = 0; i < posN; i++) { |
|
|
| parN = 1; |
| if (pre[i] > node) |
| parent[1] = pre[i]; |
| else |
| parent[1] = pre[i] + 1; |
|
|
| index = findindex(parent, parN); |
| index += sizepernode * node; |
| ls = localscore[index]; |
|
|
| if (ls > bestls) { |
| bestls = ls; |
| bestpN = parN; |
| for (tmp = 0; tmp < parN; tmp++) |
| bestparent[tmp] = parent[tmp + 1]; |
| } |
| } |
| } |
|
|
| parN = 0; |
| index = sizepernode * node; |
|
|
| ls = localscore[index]; |
|
|
| if (ls > bestls) { |
| bestls = ls; |
| bestpN = 0; |
| } |
| } |
| if (bestls > -99999999) { |
|
|
| for (i = 0; i < bestpN; i++) { |
| if (bestparent[i] < node) |
| graph[node][bestparent[i] - 1] = 1; |
| else |
| graph[node][bestparent[i]] = 1; |
| } |
| score += bestls; |
| } |
| } |
|
|
| return score; |
| } |
|
|
| int findindex(int *arr, int size) { |
| |
| int i, j, index = 0; |
|
|
| for (i = 1; i < size; i++) { |
| index += C(NODE_N - 1, i); |
| } |
|
|
| for (i = 1; i <= size - 1; i++) { |
| for (j = arr[i - 1] + 1; j <= arr[i] - 1; j++) { |
| index += C(NODE_N - 1 - j, size - i); |
| } |
| } |
|
|
| index += arr[size] - arr[size - 1]; |
|
|
| return index; |
| } |
|
|
| int C(int n, int a) { |
| int i, res = 1, atmp = a; |
|
|
| for (i = 0; i < atmp; i++) { |
| res *= n; |
| n--; |
| } |
|
|
| for (i = 0; i < atmp; i++) { |
| res /= a; |
| a--; |
| } |
|
|
| return res; |
| } |