| // includes, system
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| // includes, project
|
|
|
|
|
| // includes, kernels
|
|
|
|
|
| void random_matrix(float *I, int rows, int cols);
|
| void runTest( int argc, char** argv);
|
| void usage(int argc, char **argv)
|
| {
|
| fprintf(stderr, "Usage: %s <rows> <cols> <y1> <y2> <x1> <x2> <lamda> <no. of iter>\n", argv[0]);
|
| fprintf(stderr, "\t<rows> - number of rows\n");
|
| fprintf(stderr, "\t<cols> - number of cols\n");
|
| fprintf(stderr, "\t<y1> - y1 value of the speckle\n");
|
| fprintf(stderr, "\t<y2> - y2 value of the speckle\n");
|
| fprintf(stderr, "\t<x1> - x1 value of the speckle\n");
|
| fprintf(stderr, "\t<x2> - x2 value of the speckle\n");
|
| fprintf(stderr, "\t<lamda> - lambda (0,1)\n");
|
| fprintf(stderr, "\t<no. of iter> - number of iterations\n");
|
|
|
| exit(1);
|
| }
|
| ////////////////////////////////////////////////////////////////////////////////
|
| // Program main
|
| ////////////////////////////////////////////////////////////////////////////////
|
| int main(int argc, char *argv[])
|
| {
|
| uint64_t start_tsc = rdtsc();
|
| uint64_t start_tsp = rdtsp();
|
| printf("start_tsc %lu start_tsp %lu\n", start_tsc, start_tsp);
|
| printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE);
|
| runTest( argc, argv);
|
|
|
| return EXIT_SUCCESS;
|
| }
|
|
|
| void
|
| runTest( int argc, char** argv)
|
| {
|
| int rows, cols, size_I, size_R, niter = 10, iter, nblocks;
|
| float *I, *J, lambda, q0sqr, sum, sum2, tmp, meanROI,varROI ;
|
|
|
|
|
| float Jc, G2, L, num, den, qsqr;
|
| int *iN,*iS,*jE,*jW, k;
|
| float *dN,*dS,*dW,*dE;
|
| float cN,cS,cW,cE,D;
|
|
|
|
|
|
|
|
|
| float *J_cuda;
|
| float *C_cuda;
|
| float *E_C, *W_C, *N_C, *S_C;
|
|
|
|
|
|
|
| unsigned int r1, r2, c1, c2;
|
| float *c;
|
|
|
|
|
|
|
| if (argc == 10)
|
| {
|
| rows = atoi(argv[1]); //number of rows in the domain
|
| cols = atoi(argv[2]); //number of cols in the domain
|
| if ((rows%16!=0) || (cols%16!=0)){
|
| fprintf(stderr, "rows and cols must be multiples of 16\n");
|
| exit(1);
|
| }
|
| r1 = atoi(argv[3]); //y1 position of the speckle
|
| r2 = atoi(argv[4]); //y2 position of the speckle
|
| c1 = atoi(argv[5]); //x1 position of the speckle
|
| c2 = atoi(argv[6]); //x2 position of the speckle
|
| lambda = atof(argv[7]); //Lambda value
|
| niter = atoi(argv[8]); //number of iterations
|
| nblocks = atoi(argv[9]); // number of blocks
|
| }
|
| else{
|
| usage(argc, argv);
|
| }
|
|
|
| size_I = cols * rows;
|
| size_R = (r2-r1+1)*(c2-c1+1);
|
|
|
| I = (float *)malloc( size_I * sizeof(float) );
|
| J = (float *)malloc( size_I * sizeof(float) );
|
| c = (float *)malloc(sizeof(float)* size_I) ;
|
|
|
|
|
|
|
|
|
| iN = (int *)malloc(sizeof(unsigned int*) * rows) ;
|
| iS = (int *)malloc(sizeof(unsigned int*) * rows) ;
|
| jW = (int *)malloc(sizeof(unsigned int*) * cols) ;
|
| jE = (int *)malloc(sizeof(unsigned int*) * cols) ;
|
|
|
|
|
| dN = (float *)malloc(sizeof(float)* size_I) ;
|
| dS = (float *)malloc(sizeof(float)* size_I) ;
|
| dW = (float *)malloc(sizeof(float)* size_I) ;
|
| dE = (float *)malloc(sizeof(float)* size_I) ;
|
|
|
|
|
| for (int i=0; i< rows; i++) {
|
| iN[i] = i-1;
|
| iS[i] = i+1;
|
| }
|
| for (int j=0; j< cols; j++) {
|
| jW[j] = j-1;
|
| jE[j] = j+1;
|
| }
|
| iN[0] = 0;
|
| iS[rows-1] = rows-1;
|
| jW[0] = 0;
|
| jE[cols-1] = cols-1;
|
|
|
|
|
| GPU_argv_init();
|
| initTrace();
|
| startCPU();
|
|
|
|
|
|
|
| //Allocate device memory
|
| cudaMalloc((void**)& J_cuda, sizeof(float)* size_I);
|
| cudaMalloc((void**)& C_cuda, sizeof(float)* size_I);
|
| cudaMalloc((void**)& E_C, sizeof(float)* size_I);
|
| cudaMalloc((void**)& W_C, sizeof(float)* size_I);
|
| cudaMalloc((void**)& S_C, sizeof(float)* size_I);
|
| cudaMalloc((void**)& N_C, sizeof(float)* size_I);
|
|
|
|
|
|
|
|
|
| printf("Randomizing the input matrix\n");
|
| //Generate a random matrix
|
| random_matrix(I, rows, cols);
|
|
|
| for (int k = 0; k < size_I; k++ ) {
|
| J[k] = (float)exp(I[k]) ;
|
| }
|
| printf("Start the SRAD main loop\n");
|
| for (iter=0; iter< niter; iter++){
|
| sum=0; sum2=0;
|
| for (int i=r1; i<=r2; i++) {
|
| for (int j=c1; j<=c2; j++) {
|
| tmp = J[i * cols + j];
|
| sum += tmp ;
|
| sum2 += tmp*tmp;
|
| }
|
| }
|
| meanROI = sum / size_R;
|
| varROI = (sum2 / size_R) - meanROI*meanROI;
|
| q0sqr = varROI / (meanROI*meanROI);
|
|
|
|
|
|
|
| for (int i = 0 ; i < rows ; i++) {
|
| for (int j = 0; j < cols; j++) {
|
|
|
| k = i * cols + j;
|
| Jc = J[k];
|
|
|
| // directional derivates
|
| dN[k] = J[iN[i] * cols + j] - Jc;
|
| dS[k] = J[iS[i] * cols + j] - Jc;
|
| dW[k] = J[i * cols + jW[j]] - Jc;
|
| dE[k] = J[i * cols + jE[j]] - Jc;
|
|
|
| G2 = (dN[k]*dN[k] + dS[k]*dS[k]
|
| + dW[k]*dW[k] + dE[k]*dE[k]) / (Jc*Jc);
|
|
|
| L = (dN[k] + dS[k] + dW[k] + dE[k]) / Jc;
|
|
|
| num = (0.5*G2) - ((1.0/16.0)*(L*L)) ;
|
| den = 1 + (.25*L);
|
| qsqr = num/(den*den);
|
|
|
| // diffusion coefficent (equ 33)
|
| den = (qsqr-q0sqr) / (q0sqr * (1+q0sqr)) ;
|
| c[k] = 1.0 / (1.0+den) ;
|
|
|
| // saturate diffusion coefficent
|
| if (c[k] < 0) {c[k] = 0;}
|
| else if (c[k] > 1) {c[k] = 1;}
|
| }
|
| }
|
|
|
| for (int i = 0; i < rows; i++) {
|
| for (int j = 0; j < cols; j++) {
|
|
|
| // current index
|
| k = i * cols + j;
|
|
|
| // diffusion coefficent
|
| cN = c[k];
|
| cS = c[iS[i] * cols + j];
|
| cW = c[k];
|
| cE = c[i * cols + jE[j]];
|
|
|
| // divergence (equ 58)
|
| D = cN * dN[k] + cS * dS[k] + cW * dW[k] + cE * dE[k];
|
|
|
| // image update (equ 61)
|
| J[k] = J[k] + 0.25*lambda*D;
|
| }
|
| }
|
|
|
|
|
|
|
|
|
|
|
|
|
| //Currently the input size must be divided by 16 - the block size
|
|
|
| // ruihao
|
| int block_x = cols/BLOCK_SIZE ;
|
| int block_y = rows/BLOCK_SIZE ;
|
|
|
| if (nblocks > block_x) nblocks = block_x;
|
|
|
| dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
|
| // dim3 dimGrid(block_x, block_y);
|
| dim3 dimGrid(nblocks, nblocks);
|
| // ruihao
|
|
|
| //Copy data from main memory to device memory
|
| cudaMemcpy(J_cuda, J, sizeof(float) * size_I, cudaMemcpyHostToDevice);
|
|
|
| //Run kernels
|
| // srad_cuda_1<<<dimGrid, dimBlock>>>(E_C, W_C, N_C, S_C, J_cuda, C_cuda, cols, rows, q0sqr);
|
| // srad_cuda_2<<<dimGrid, dimBlock>>>(E_C, W_C, N_C, S_C, J_cuda, C_cuda, cols, rows, lambda, q0sqr);
|
| srad_cuda_1<<<dimGrid, dimBlock>>>(E_C, W_C, N_C, S_C, J_cuda, C_cuda, cols, rows, q0sqr, cols / nblocks);
|
| srad_cuda_2<<<dimGrid, dimBlock>>>(E_C, W_C, N_C, S_C, J_cuda, C_cuda, cols, rows, lambda, q0sqr, cols / nblocks);
|
| //Copy data from device memory to main memory
|
| cudaMemcpy(J, J_cuda, sizeof(float) * size_I, cudaMemcpyDeviceToHost);
|
|
|
|
|
| }
|
|
|
| cudaThreadSynchronize();
|
|
|
| cudaFree(C_cuda);
|
| cudaFree(J_cuda);
|
| cudaFree(E_C);
|
| cudaFree(W_C);
|
| cudaFree(N_C);
|
| cudaFree(S_C);
|
|
|
| endCPU();
|
| finiTrace();
|
|
|
|
|
| //Printing output
|
| printf("Printing Output:\n");
|
| for( int i = 0 ; i < rows ; i++){
|
| for ( int j = 0 ; j < cols ; j++){
|
| printf("%.5f ", J[i * cols + j]);
|
| }
|
| printf("\n");
|
| }
|
|
|
|
|
| printf("Computation Done\n");
|
|
|
| free(I);
|
| free(J);
|
|
|
| free(iN); free(iS); free(jW); free(jE);
|
| free(dN); free(dS); free(dW); free(dE);
|
|
|
| free(c);
|
|
|
| }
|
|
|
|
|
| void random_matrix(float *I, int rows, int cols){
|
|
|
| srand(7);
|
|
|
| for( int i = 0 ; i < rows ; i++){
|
| for ( int j = 0 ; j < cols ; j++){
|
| I[i * cols + j] = rand()/(float)RAND_MAX ;
|
| }
|
| }
|
|
|
| }
|
|
|
|
|