| //========================================================================================================================================================================================================200 | |
| //======================================================================================================================================================150 | |
| //====================================================================================================100 | |
| //==================================================50 | |
| //========================================================================================================================================================================================================200 | |
| // UPDATE | |
| //========================================================================================================================================================================================================200 | |
| // 14 APR 2011 Lukasz G. Szafaryn | |
| //========================================================================================================================================================================================================200 | |
| // DEFINE/INCLUDE | |
| //========================================================================================================================================================================================================200 | |
| //======================================================================================================================================================150 | |
| // LIBRARIES | |
| //======================================================================================================================================================150 | |
| //======================================================================================================================================================150 | |
| // UTILITIES | |
| //======================================================================================================================================================150 | |
| //======================================================================================================================================================150 | |
| // MAIN FUNCTION HEADER | |
| //======================================================================================================================================================150 | |
| //======================================================================================================================================================150 | |
| // KERNEL | |
| //======================================================================================================================================================150 | |
| //========================================================================================================================================================================================================200 | |
| // MAIN FUNCTION | |
| //========================================================================================================================================================================================================200 | |
/*
 * Read the x86 time-stamp counter with the RDTSC instruction.
 *
 * RDTSC leaves the low 32 bits of the counter in EAX and the high 32 bits in
 * EDX. The two halves are captured in 32-bit variables and joined in
 * `unsigned long long` arithmetic, so the shift by 32 is well defined even
 * when `unsigned long` is only 32 bits wide (in that case the return type
 * keeps just the low half instead of the shift being undefined behaviour).
 *
 * Returns the combined counter value converted to `unsigned long`.
 */
extern inline __attribute__((always_inline)) unsigned long rdtsc(void)
{
    unsigned int lo, hi;

    /* volatile: every call must really execute the instruction */
    __asm__ volatile("rdtsc" : "=a" (lo), "=d" (hi));

    return (unsigned long)(((unsigned long long)hi << 32) | lo);
}
/*
 * Return the current CLOCK_REALTIME reading expressed in nanoseconds.
 *
 * The seconds field is widened to `unsigned long long` before it is scaled,
 * so the multiplication cannot overflow a signed `time_t` on targets where
 * that type is narrower than 64 bits.
 *
 * Returns seconds * 1e9 + nanoseconds converted to `unsigned long`, or
 * (unsigned long)-1 when clock_gettime() reports failure.
 */
extern inline __attribute__((always_inline)) unsigned long rdtsp(void)
{
    struct timespec tms;

    if (clock_gettime(CLOCK_REALTIME, &tms) != 0) {
        return (unsigned long)-1;
    }

    unsigned long long ns = (unsigned long long)tms.tv_sec * 1000000000ULL;
    ns += (unsigned long long)tms.tv_nsec;

    return (unsigned long)ns;
}
| int | |
| main( int argc, | |
| char *argv []) | |
| { | |
| uint64_t start_tsc = rdtsc(); | |
| uint64_t start_tsp = rdtsp(); | |
| printf("start_tsc %llu start_tsp %llu\n", start_tsc, start_tsp); | |
| printf("thread block size of kernel = %d \n", NUMBER_THREADS); | |
| //======================================================================================================================================================150 | |
| // CPU/MCPU VARIABLES | |
| //======================================================================================================================================================150 | |
| // timer | |
| long long time0; | |
| time0 = get_time(); | |
| // timer | |
| long long time1; | |
| long long time2; | |
| long long time3; | |
| long long time4; | |
| long long time5; | |
| long long time6; | |
| long long time7; | |
| // counters | |
| int i, j, k, l, m, n; | |
| // system memory | |
| par_str par_cpu; | |
| dim_str dim_cpu; | |
| box_str* box_cpu; | |
| FOUR_VECTOR* rv_cpu; | |
| fp* qv_cpu; | |
| FOUR_VECTOR* fv_cpu; | |
| int nh; | |
| time1 = get_time(); | |
| //======================================================================================================================================================150 | |
| // CHECK INPUT ARGUMENTS | |
| //======================================================================================================================================================150 | |
| // assing default values | |
| dim_cpu.boxes1d_arg = 1; | |
| // go through arguments | |
| dim_cpu.boxes1d_arg = atoi(argv[1]); | |
| int nblocks = atoi(argv[2]); | |
| // Print configuration | |
| printf("Configuration used: boxes1d = %d\n", dim_cpu.boxes1d_arg); | |
| time2 = get_time(); | |
| //======================================================================================================================================================150 | |
| // INPUTS | |
| //======================================================================================================================================================150 | |
| par_cpu.alpha = 0.5; | |
| time3 = get_time(); | |
| //======================================================================================================================================================150 | |
| // DIMENSIONS | |
| //======================================================================================================================================================150 | |
| // total number of boxes | |
| dim_cpu.number_boxes = dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg; | |
| // how many particles space has in each direction | |
| dim_cpu.space_elem = dim_cpu.number_boxes * NUMBER_PAR_PER_BOX; | |
| dim_cpu.space_mem = dim_cpu.space_elem * sizeof(FOUR_VECTOR); | |
| dim_cpu.space_mem2 = dim_cpu.space_elem * sizeof(fp); | |
| // box array | |
| dim_cpu.box_mem = dim_cpu.number_boxes * sizeof(box_str); | |
| time4 = get_time(); | |
| //======================================================================================================================================================150 | |
| // SYSTEM MEMORY | |
| //======================================================================================================================================================150 | |
| //====================================================================================================100 | |
| // BOX | |
| //====================================================================================================100 | |
| // allocate boxes | |
| box_cpu = (box_str*)malloc(dim_cpu.box_mem); | |
| // initialize number of home boxes | |
| nh = 0; | |
| // home boxes in z direction | |
| for(i=0; i<dim_cpu.boxes1d_arg; i++){ | |
| // home boxes in y direction | |
| for(j=0; j<dim_cpu.boxes1d_arg; j++){ | |
| // home boxes in x direction | |
| for(k=0; k<dim_cpu.boxes1d_arg; k++){ | |
| // current home box | |
| box_cpu[nh].x = k; | |
| box_cpu[nh].y = j; | |
| box_cpu[nh].z = i; | |
| box_cpu[nh].number = nh; | |
| box_cpu[nh].offset = nh * NUMBER_PAR_PER_BOX; | |
| // initialize number of neighbor boxes | |
| box_cpu[nh].nn = 0; | |
| // neighbor boxes in z direction | |
| for(l=-1; l<2; l++){ | |
| // neighbor boxes in y direction | |
| for(m=-1; m<2; m++){ | |
| // neighbor boxes in x direction | |
| for(n=-1; n<2; n++){ | |
| // check if (this neighbor exists) and (it is not the same as home box) | |
| if( (((i+l)>=0 && (j+m)>=0 && (k+n)>=0)==true && ((i+l)<dim_cpu.boxes1d_arg && (j+m)<dim_cpu.boxes1d_arg && (k+n)<dim_cpu.boxes1d_arg)==true) && | |
| (l==0 && m==0 && n==0)==false ){ | |
| // current neighbor box | |
| box_cpu[nh].nei[box_cpu[nh].nn].x = (k+n); | |
| box_cpu[nh].nei[box_cpu[nh].nn].y = (j+m); | |
| box_cpu[nh].nei[box_cpu[nh].nn].z = (i+l); | |
| box_cpu[nh].nei[box_cpu[nh].nn].number = (box_cpu[nh].nei[box_cpu[nh].nn].z * dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg) + | |
| (box_cpu[nh].nei[box_cpu[nh].nn].y * dim_cpu.boxes1d_arg) + | |
| box_cpu[nh].nei[box_cpu[nh].nn].x; | |
| box_cpu[nh].nei[box_cpu[nh].nn].offset = box_cpu[nh].nei[box_cpu[nh].nn].number * NUMBER_PAR_PER_BOX; | |
| // increment neighbor box | |
| box_cpu[nh].nn = box_cpu[nh].nn + 1; | |
| } | |
| } // neighbor boxes in x direction | |
| } // neighbor boxes in y direction | |
| } // neighbor boxes in z direction | |
| // increment home box | |
| nh = nh + 1; | |
| } // home boxes in x direction | |
| } // home boxes in y direction | |
| } // home boxes in z direction | |
| //====================================================================================================100 | |
| // PARAMETERS, DISTANCE, CHARGE AND FORCE | |
| //====================================================================================================100 | |
| // random generator seed set to random value - time in this case | |
| srand(time(NULL)); | |
| // input (distances) | |
| rv_cpu = (FOUR_VECTOR*)malloc(dim_cpu.space_mem); | |
| for(i=0; i<dim_cpu.space_elem; i=i+1){ | |
| rv_cpu[i].v = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0 | |
| rv_cpu[i].x = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0 | |
| rv_cpu[i].y = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0 | |
| rv_cpu[i].z = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0 | |
| } | |
| // input (charge) | |
| qv_cpu = (fp*)malloc(dim_cpu.space_mem2); | |
| for(i=0; i<dim_cpu.space_elem; i=i+1){ | |
| qv_cpu[i] = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0 | |
| } | |
| // output (forces) | |
| fv_cpu = (FOUR_VECTOR*)malloc(dim_cpu.space_mem); | |
| for(i=0; i<dim_cpu.space_elem; i=i+1){ | |
| fv_cpu[i].v = 0; // set to 0, because kernels keeps adding to initial value | |
| fv_cpu[i].x = 0; // set to 0, because kernels keeps adding to initial value | |
| fv_cpu[i].y = 0; // set to 0, because kernels keeps adding to initial value | |
| fv_cpu[i].z = 0; // set to 0, because kernels keeps adding to initial value | |
| } | |
| time5 = get_time(); | |
| //======================================================================================================================================================150 | |
| // KERNEL | |
| //======================================================================================================================================================150 | |
| //====================================================================================================100 | |
| // GPU_CUDA | |
| //====================================================================================================100 | |
| kernel_gpu_cuda_wrapper(par_cpu, | |
| dim_cpu, | |
| box_cpu, | |
| rv_cpu, | |
| qv_cpu, | |
| fv_cpu, | |
| nblocks); | |
| time6 = get_time(); | |
| //======================================================================================================================================================150 | |
| // SYSTEM MEMORY DEALLOCATION | |
| //======================================================================================================================================================150 | |
| // dump results | |
| FILE *fptr; | |
| fptr = fopen("result.txt", "w"); | |
| for(i=0; i<dim_cpu.space_elem; i=i+1){ | |
| fprintf(fptr, "%f, %f, %f, %f\n", fv_cpu[i].v, fv_cpu[i].x, fv_cpu[i].y, fv_cpu[i].z); | |
| } | |
| fclose(fptr); | |
| free(rv_cpu); | |
| free(qv_cpu); | |
| free(fv_cpu); | |
| free(box_cpu); | |
| time7 = get_time(); | |
| //======================================================================================================================================================150 | |
| // DISPLAY TIMING | |
| //======================================================================================================================================================150 | |
| // printf("Time spent in different stages of the application:\n"); | |
| // printf("%15.12f s, %15.12f % : VARIABLES\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time7-time0) * 100); | |
| // printf("%15.12f s, %15.12f % : INPUT ARGUMENTS\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time7-time0) * 100); | |
| // printf("%15.12f s, %15.12f % : INPUTS\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time7-time0) * 100); | |
| // printf("%15.12f s, %15.12f % : dim_cpu\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time7-time0) * 100); | |
| // printf("%15.12f s, %15.12f % : SYS MEM: ALO\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time7-time0) * 100); | |
| // printf("%15.12f s, %15.12f % : KERNEL: COMPUTE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time7-time0) * 100); | |
| // printf("%15.12f s, %15.12f % : SYS MEM: FRE\n", (float) (time7-time6) / 1000000, (float) (time7-time6) / (float) (time7-time0) * 100); | |
| // printf("Total time:\n"); | |
| // printf("%.12f s\n", (float) (time7-time0) / 1000000); | |
| //======================================================================================================================================================150 | |
| // RETURN | |
| //======================================================================================================================================================150 | |
| return 0.0; // always returns 0.0 | |
| } | |