lrh12580
first commit
5cb6c4b
#define LIMIT -999
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "needle.h"
#include <cuda.h>
#include <sys/time.h>
#include <cupti.h>
#include "../../../common/cupti_add.h"
#include "../../../common/cpu_timestamps.h"
// includes, kernels
#include "needle_kernel.cu"
#ifdef TIMING
#include "timing.h"
struct timeval tv;
struct timeval tv_total_start, tv_total_end;
struct timeval tv_h2d_start, tv_h2d_end;
struct timeval tv_d2h_start, tv_d2h_end;
struct timeval tv_kernel_start, tv_kernel_end;
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
struct timeval tv_close_start, tv_close_end;
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
d2h_time = 0, close_time = 0, total_time = 0;
#endif
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest( int argc, char** argv);
int blosum62[24][24] = {
{ 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4},
{-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4},
{-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4},
{-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4},
{ 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4},
{-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4},
{-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4},
{ 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4},
{-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4},
{-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4},
{-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4},
{-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4},
{-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4},
{-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4},
{-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4},
{ 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4},
{ 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4},
{-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4},
{-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4},
{ 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4},
{-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4},
{-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4},
{ 0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4},
{-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1}
};
double gettime() {
struct timeval t;
gettimeofday(&t,NULL);
return t.tv_sec+t.tv_usec*1e-6;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
uint64_t start_tsc = rdtsc();
uint64_t start_tsp = rdtsp();
printf("start_tsc %llu start_tsp %llu\n", start_tsc, start_tsp);
printf("WG size of kernel = %d \n", BLOCK_SIZE);
runTest( argc, argv);
return EXIT_SUCCESS;
}
void usage(int argc, char **argv)
{
fprintf(stderr, "Usage: %s <max_rows/max_cols> <penalty> \n", argv[0]);
fprintf(stderr, "\t<dimension> - x and y dimensions\n");
fprintf(stderr, "\t<penalty> - penalty(positive integer)\n");
exit(1);
}
void runTest( int argc, char** argv)
{
int max_rows, max_cols, penalty, nblocks;
int *input_itemsets, *output_itemsets, *referrence;
int *matrix_cuda, *referrence_cuda;
int size;
// the lengths of the two sequences should be able to divided by 16.
// And at current stage max_rows needs to equal max_cols
if (argc == 4)
{
max_rows = atoi(argv[1]);
max_cols = atoi(argv[1]);
penalty = atoi(argv[2]);
nblocks = atoi(argv[3]);
}
else{
usage(argc, argv);
}
if(atoi(argv[1])%16!=0){
fprintf(stderr,"The dimension values must be a multiple of 16\n");
exit(1);
}
max_rows = max_rows + 1;
max_cols = max_cols + 1;
referrence = (int *)malloc( max_rows * max_cols * sizeof(int) );
input_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
output_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) );
if (!input_itemsets)
fprintf(stderr, "error: can not allocate memory");
srand ( 7 );
for (int i = 0 ; i < max_cols; i++){
for (int j = 0 ; j < max_rows; j++){
input_itemsets[i*max_cols+j] = 0;
}
}
printf("Start Needleman-Wunsch\n");
for( int i=1; i< max_rows ; i++){ //please define your own sequence.
input_itemsets[i*max_cols] = rand() % 10 + 1;
}
for( int j=1; j< max_cols ; j++){ //please define your own sequence.
input_itemsets[j] = rand() % 10 + 1;
}
for (int i = 1 ; i < max_cols; i++){
for (int j = 1 ; j < max_rows; j++){
referrence[i*max_cols+j] = blosum62[input_itemsets[i*max_cols]][input_itemsets[j]];
}
}
for( int i = 1; i< max_rows ; i++)
input_itemsets[i*max_cols] = -i * penalty;
for( int j = 1; j< max_cols ; j++)
input_itemsets[j] = -j * penalty;
size = max_cols * max_rows;
GPU_argv_init();
initTrace();
startCPU();
cudaMallocManaged((void**)& referrence_cuda, sizeof(int)*size);
cudaMallocManaged((void **)&matrix_cuda, sizeof(int) * size);
memcpy(referrence_cuda, referrence, sizeof(int) * size);
memcpy(matrix_cuda, input_itemsets, sizeof(int) * size);
cudaStream_t stream1;
cudaStream_t stream2;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaMemPrefetchAsync(referrence_cuda, sizeof(int) * size, GPU_DEVICE, stream1);
cudaStreamSynchronize(stream1);
cudaMemPrefetchAsync(matrix_cuda, sizeof(int) * size, GPU_DEVICE, stream2);
cudaStreamSynchronize(stream2);
dim3 dimGrid;
dim3 dimBlock(BLOCK_SIZE, 1);
// int block_width = ( max_cols - 1 )/BLOCK_SIZE;
int block_width = nblocks - 1;
int block_size = (max_cols - 1) / (nblocks * BLOCK_SIZE);
#ifdef TIMING
gettimeofday(&tv_kernel_start, NULL);
#endif
//printf("Processing top-left matrix\n");
//process top-left matrix
for( int i = 1 ; i <= block_width ; i++) {
dimGrid.x = i;
dimGrid.y = 1;
needle_cuda_shared_1<<<dimGrid, dimBlock, 0, stream2>>>(referrence_cuda, matrix_cuda
,max_cols, penalty, i, block_width, block_size);
}
//printf("Processing bottom-right matrix\n");
//process bottom-right matrix
for( int i = block_width - 1 ; i >= 1 ; i--){
dimGrid.x = i;
dimGrid.y = 1;
needle_cuda_shared_2<<<dimGrid, dimBlock, 0, stream2>>>(referrence_cuda, matrix_cuda
,max_cols, penalty, i, block_width, block_size);
}
cudaDeviceSynchronize();
memcpy(output_itemsets, matrix_cuda, sizeof(int) * size);
cudaFree(referrence_cuda);
cudaFree(matrix_cuda);
#ifdef TIMING
gettimeofday(&tv_kernel_end, NULL);
tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
kernel_time += tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
#endif
// cudaMemcpy(output_itemsets, matrix_cuda, sizeof(int) * size, cudaMemcpyDeviceToHost);
//#define TRACEBACK
#ifdef TRACEBACK
FILE *fpo = fopen("result.txt","w");
fprintf(fpo, "print traceback value GPU:\n");
for (int i = max_rows - 2, j = max_rows - 2; i>=0, j>=0;){
int nw, n, w, traceback;
if ( i == max_rows - 2 && j == max_rows - 2 )
fprintf(fpo, "%d ", output_itemsets[ i * max_cols + j]); //print the first element
if ( i == 0 && j == 0 )
break;
if ( i > 0 && j > 0 ){
nw = output_itemsets[(i - 1) * max_cols + j - 1];
w = output_itemsets[ i * max_cols + j - 1 ];
n = output_itemsets[(i - 1) * max_cols + j];
}
else if ( i == 0 ){
nw = n = LIMIT;
w = output_itemsets[ i * max_cols + j - 1 ];
}
else if ( j == 0 ){
nw = w = LIMIT;
n = output_itemsets[(i - 1) * max_cols + j];
}
else{
}
//traceback = maximum(nw, w, n);
int new_nw, new_w, new_n;
new_nw = nw + referrence[i * max_cols + j];
new_w = w - penalty;
new_n = n - penalty;
traceback = maximum(new_nw, new_w, new_n);
if(traceback == new_nw)
traceback = nw;
if(traceback == new_w)
traceback = w;
if(traceback == new_n)
traceback = n;
fprintf(fpo, "%d ", traceback);
if(traceback == nw )
{i--; j--; continue;}
else if(traceback == w )
{j--; continue;}
else if(traceback == n )
{i--; continue;}
else
;
}
fclose(fpo);
#endif
endCPU();
finiTrace();
free(referrence);
free(input_itemsets);
free(output_itemsets);
#ifdef TIMING
printf("Exec: %f\n", kernel_time);
#endif
}