| #define LIMIT -999 |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <string.h> |
| #include <math.h> |
| #include "needle.h" |
| #include <cuda.h> |
| #include <sys/time.h> |
| #include <cupti.h> |
| #include "../../../common/cupti_add.h" |
| #include "../../../common/cpu_timestamps.h" |
|
|
| // includes, kernels |
| #include "needle_kernel.cu" |
|
|
| #ifdef TIMING |
| #include "timing.h" |
|
|
| struct timeval tv; |
| struct timeval tv_total_start, tv_total_end; |
| struct timeval tv_h2d_start, tv_h2d_end; |
| struct timeval tv_d2h_start, tv_d2h_end; |
| struct timeval tv_kernel_start, tv_kernel_end; |
| struct timeval tv_mem_alloc_start, tv_mem_alloc_end; |
| struct timeval tv_close_start, tv_close_end; |
| float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0, |
| d2h_time = 0, close_time = 0, total_time = 0; |
| #endif |
|
|
| //////////////////////////////////////////////////////////////////////////////// |
| // declaration, forward |
| void runTest( int argc, char** argv); |
|
|
|
|
| int blosum62[24][24] = { |
| { 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4}, |
| {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4}, |
| {-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4}, |
| {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4}, |
| { 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4}, |
| {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4}, |
| {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, |
| { 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4}, |
| {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4}, |
| {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4}, |
| {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4}, |
| {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4}, |
| {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4}, |
| {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4}, |
| {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4}, |
| { 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4}, |
| { 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4}, |
| {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4}, |
| {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4}, |
| { 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4}, |
| {-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4}, |
| {-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4}, |
| { 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4}, |
| {-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1} |
| }; |
|
|
| double gettime() { |
| struct timeval t; |
| gettimeofday(&t,NULL); |
| return t.tv_sec+t.tv_usec*1e-6; |
| } |
|
|
| //////////////////////////////////////////////////////////////////////////////// |
| // Program main |
| //////////////////////////////////////////////////////////////////////////////// |
| int |
| main( int argc, char** argv) |
| { |
| uint64_t start_tsc = rdtsc(); |
| uint64_t start_tsp = rdtsp(); |
| printf("start_tsc %llu start_tsp %llu\n", start_tsc, start_tsp); |
| |
| printf("WG size of kernel = %d \n", BLOCK_SIZE); |
|
|
| runTest( argc, argv); |
|
|
| return EXIT_SUCCESS; |
| } |
|
|
| void usage(int argc, char **argv) |
| { |
| fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> \n", argv[0]); |
| fprintf(stderr, "\t<dimension> - x and y dimensions\n"); |
| fprintf(stderr, "\t<penalty> - penalty(positive integer)\n"); |
| exit(1); |
| } |
|
|
| void runTest( int argc, char** argv) |
| { |
| int max_rows, max_cols, penalty, nblocks; |
| int *input_itemsets, *output_itemsets, *referrence; |
| int *matrix_cuda, *referrence_cuda; |
| int size; |
| |
| |
| // the lengths of the two sequences should be able to divided by 16. |
| // And at current stage max_rows needs to equal max_cols |
| if (argc == 4) |
| { |
| max_rows = atoi(argv[1]); |
| max_cols = atoi(argv[1]); |
| penalty = atoi(argv[2]); |
| nblocks = atoi(argv[3]); |
| } |
| else{ |
| usage(argc, argv); |
| } |
| |
| if(atoi(argv[1])%16!=0){ |
| fprintf(stderr,"The dimension values must be a multiple of 16\n"); |
| exit(1); |
| } |
| |
|
|
| max_rows = max_rows + 1; |
| max_cols = max_cols + 1; |
| referrence = (int *)malloc( max_rows * max_cols * sizeof(int) ); |
| input_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) ); |
| output_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) ); |
| |
|
|
| if (!input_itemsets) |
| fprintf(stderr, "error: can not allocate memory"); |
|
|
| srand ( 7 ); |
| |
| |
| for (int i = 0 ; i < max_cols; i++){ |
| for (int j = 0 ; j < max_rows; j++){ |
| input_itemsets[i*max_cols+j] = 0; |
| } |
| } |
| |
| printf("Start Needleman-Wunsch\n"); |
| |
| for( int i=1; i< max_rows ; i++){ //please define your own sequence. |
| input_itemsets[i*max_cols] = rand() % 10 + 1; |
| } |
| for( int j=1; j< max_cols ; j++){ //please define your own sequence. |
| input_itemsets[j] = rand() % 10 + 1; |
| } |
|
|
|
|
| for (int i = 1 ; i < max_cols; i++){ |
| for (int j = 1 ; j < max_rows; j++){ |
| referrence[i*max_cols+j] = blosum62[input_itemsets[i*max_cols]][input_itemsets[j]]; |
| } |
| } |
|
|
| for( int i = 1; i< max_rows ; i++) |
| input_itemsets[i*max_cols] = -i * penalty; |
| for( int j = 1; j< max_cols ; j++) |
| input_itemsets[j] = -j * penalty; |
|
|
|
|
| size = max_cols * max_rows; |
| |
| GPU_argv_init(); |
| initTrace(); |
| startCPU(); |
|
|
| cudaMallocManaged((void**)& referrence_cuda, sizeof(int)*size); |
| cudaMallocManaged((void **)&matrix_cuda, sizeof(int) * size); |
|
|
| memcpy(referrence_cuda, referrence, sizeof(int) * size); |
| memcpy(matrix_cuda, input_itemsets, sizeof(int) * size); |
|
|
| cudaStream_t stream1; |
| cudaStream_t stream2; |
| cudaStreamCreate(&stream1); |
| cudaStreamCreate(&stream2); |
|
|
| cudaMemPrefetchAsync(referrence_cuda, sizeof(int) * size, GPU_DEVICE, stream1); |
| cudaStreamSynchronize(stream1); |
| cudaMemPrefetchAsync(matrix_cuda, sizeof(int) * size, GPU_DEVICE, stream2); |
| cudaStreamSynchronize(stream2); |
|
|
| dim3 dimGrid; |
| dim3 dimBlock(BLOCK_SIZE, 1); |
| // int block_width = ( max_cols - 1 )/BLOCK_SIZE; |
| int block_width = nblocks - 1; |
| int block_size = (max_cols - 1) / (nblocks * BLOCK_SIZE); |
|
|
| #ifdef TIMING |
| gettimeofday(&tv_kernel_start, NULL); |
| #endif |
|
|
| //printf("Processing top-left matrix\n"); |
| //process top-left matrix |
| for( int i = 1 ; i <= block_width ; i++) { |
| dimGrid.x = i; |
| dimGrid.y = 1; |
| needle_cuda_shared_1<<<dimGrid, dimBlock, 0, stream2>>>(referrence_cuda, matrix_cuda |
| ,max_cols, penalty, i, block_width, block_size); |
| } |
| //printf("Processing bottom-right matrix\n"); |
| //process bottom-right matrix |
| for( int i = block_width - 1 ; i >= 1 ; i--){ |
| dimGrid.x = i; |
| dimGrid.y = 1; |
| needle_cuda_shared_2<<<dimGrid, dimBlock, 0, stream2>>>(referrence_cuda, matrix_cuda |
| ,max_cols, penalty, i, block_width, block_size); |
| } |
| cudaDeviceSynchronize(); |
| memcpy(output_itemsets, matrix_cuda, sizeof(int) * size); |
|
|
| cudaFree(referrence_cuda); |
| cudaFree(matrix_cuda); |
|
|
| #ifdef TIMING |
| gettimeofday(&tv_kernel_end, NULL); |
| tvsub(&tv_kernel_end, &tv_kernel_start, &tv); |
| kernel_time += tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0; |
| #endif |
|
|
| // cudaMemcpy(output_itemsets, matrix_cuda, sizeof(int) * size, cudaMemcpyDeviceToHost); |
|
|
| //#define TRACEBACK |
| #ifdef TRACEBACK |
| |
| FILE *fpo = fopen("result.txt","w"); |
| fprintf(fpo, "print traceback value GPU:\n"); |
| |
| for (int i = max_rows - 2, j = max_rows - 2; i>=0, j>=0;){ |
| int nw, n, w, traceback; |
| if ( i == max_rows - 2 && j == max_rows - 2 ) |
| fprintf(fpo, "%d ", output_itemsets[ i * max_cols + j]); //print the first element |
| if ( i == 0 && j == 0 ) |
| break; |
| if ( i > 0 && j > 0 ){ |
| nw = output_itemsets[(i - 1) * max_cols + j - 1]; |
| w = output_itemsets[ i * max_cols + j - 1 ]; |
| n = output_itemsets[(i - 1) * max_cols + j]; |
| } |
| else if ( i == 0 ){ |
| nw = n = LIMIT; |
| w = output_itemsets[ i * max_cols + j - 1 ]; |
| } |
| else if ( j == 0 ){ |
| nw = w = LIMIT; |
| n = output_itemsets[(i - 1) * max_cols + j]; |
| } |
| else{ |
| } |
|
|
| //traceback = maximum(nw, w, n); |
| int new_nw, new_w, new_n; |
| new_nw = nw + referrence[i * max_cols + j]; |
| new_w = w - penalty; |
| new_n = n - penalty; |
| |
| traceback = maximum(new_nw, new_w, new_n); |
| if(traceback == new_nw) |
| traceback = nw; |
| if(traceback == new_w) |
| traceback = w; |
| if(traceback == new_n) |
| traceback = n; |
| |
| fprintf(fpo, "%d ", traceback); |
|
|
| if(traceback == nw ) |
| {i--; j--; continue;} |
|
|
| else if(traceback == w ) |
| {j--; continue;} |
|
|
| else if(traceback == n ) |
| {i--; continue;} |
|
|
| else |
| ; |
| } |
| |
| fclose(fpo); |
|
|
| #endif |
| endCPU(); |
| finiTrace(); |
| free(referrence); |
| free(input_itemsets); |
| free(output_itemsets); |
|
|
| #ifdef TIMING |
| printf("Exec: %f\n", kernel_time); |
| #endif |
| } |
|
|
|
|