lrh12580
first commit
5cb6c4b
//========================================================================================================================================================================================================200
//======================================================================================================================================================150
//====================================================================================================100
//==================================================50
//========================================================================================================================================================================================================200
// UPDATE
//========================================================================================================================================================================================================200
// 14 APR 2011 Lukasz G. Szafaryn
//========================================================================================================================================================================================================200
// DEFINE/INCLUDE
//========================================================================================================================================================================================================200
//======================================================================================================================================================150
// LIBRARIES
//======================================================================================================================================================150
#include <errno.h> // (in path known to compiler) needed by errno/ERANGE
#include <limits.h> // (in path known to compiler) needed by INT_MIN/INT_MAX
#include <stdbool.h> // (in path known to compiler) needed by true/false
#include <stdio.h> // (in path known to compiler) needed by printf
#include <stdlib.h> // (in path known to compiler) needed by malloc
//======================================================================================================================================================150
// UTILITIES
//======================================================================================================================================================150
#include "./util/timer/timer.h" // (in path specified here)
#include "./util/num/num.h" // (in path specified here)
//======================================================================================================================================================150
// MAIN FUNCTION HEADER
//======================================================================================================================================================150
#include "./main.h" // (in the current directory)
//======================================================================================================================================================150
// KERNEL
//======================================================================================================================================================150
#include "./kernel/kernel_gpu_cuda_wrapper.h" // (in library path specified here)
//========================================================================================================================================================================================================200
// MAIN FUNCTION
//========================================================================================================================================================================================================200
#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <time.h>
#include <stdint.h>
/*
 * Read the x86 time-stamp counter with the RDTSC instruction.
 *
 * RDTSC delivers the counter in two 32-bit halves: EAX (low) and EDX (high).
 * The halves are joined in a 64-bit temporary so the shift is well defined
 * even on targets where unsigned long is only 32 bits wide; on such targets
 * the return conversion keeps the low half of the counter.
 *
 * Returns: the counter value (x86 / x86-64 only).
 */
extern inline __attribute__((always_inline)) unsigned long rdtsc(void)
{
    uint32_t lo, hi;
    __asm__ volatile("rdtsc" : "=a" (lo), "=d" (hi));
    return (unsigned long)(((uint64_t)hi << 32) | lo);
}
/*
 * Return the current CLOCK_REALTIME reading expressed in nanoseconds.
 *
 * Returns: seconds * 1e9 + nanoseconds as an unsigned long, or
 *          (unsigned long)-1 if clock_gettime() reports an error.
 */
extern inline __attribute__((always_inline)) unsigned long rdtsp(void)
{
    struct timespec now;
    if (clock_gettime(CLOCK_REALTIME, &now) != 0) {
        return (unsigned long)-1;
    }
    // Convert to unsigned before multiplying: the product then wraps instead of
    // being a signed overflow (undefined behaviour) where time_t is narrow.
    unsigned long ns = (unsigned long)now.tv_sec * 1000000000UL;
    ns += (unsigned long)now.tv_nsec;
    return ns;
}
int
main( int argc,
char *argv [])
{
uint64_t start_tsc = rdtsc();
uint64_t start_tsp = rdtsp();
printf("start_tsc %llu start_tsp %llu\n", start_tsc, start_tsp);
printf("thread block size of kernel = %d \n", NUMBER_THREADS);
//======================================================================================================================================================150
// CPU/MCPU VARIABLES
//======================================================================================================================================================150
// timer
long long time0;
time0 = get_time();
// timer
long long time1;
long long time2;
long long time3;
long long time4;
long long time5;
long long time6;
long long time7;
// counters
int i, j, k, l, m, n;
// system memory
par_str par_cpu;
dim_str dim_cpu;
box_str* box_cpu;
FOUR_VECTOR* rv_cpu;
fp* qv_cpu;
FOUR_VECTOR* fv_cpu;
int nh;
time1 = get_time();
//======================================================================================================================================================150
// CHECK INPUT ARGUMENTS
//======================================================================================================================================================150
// assing default values
dim_cpu.boxes1d_arg = 1;
// go through arguments
dim_cpu.boxes1d_arg = atoi(argv[1]);
int nblocks = atoi(argv[2]);
// Print configuration
printf("Configuration used: boxes1d = %d\n", dim_cpu.boxes1d_arg);
time2 = get_time();
//======================================================================================================================================================150
// INPUTS
//======================================================================================================================================================150
par_cpu.alpha = 0.5;
time3 = get_time();
//======================================================================================================================================================150
// DIMENSIONS
//======================================================================================================================================================150
// total number of boxes
dim_cpu.number_boxes = dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg;
// how many particles space has in each direction
dim_cpu.space_elem = dim_cpu.number_boxes * NUMBER_PAR_PER_BOX;
dim_cpu.space_mem = dim_cpu.space_elem * sizeof(FOUR_VECTOR);
dim_cpu.space_mem2 = dim_cpu.space_elem * sizeof(fp);
// box array
dim_cpu.box_mem = dim_cpu.number_boxes * sizeof(box_str);
time4 = get_time();
//======================================================================================================================================================150
// SYSTEM MEMORY
//======================================================================================================================================================150
//====================================================================================================100
// BOX
//====================================================================================================100
// allocate boxes
box_cpu = (box_str*)malloc(dim_cpu.box_mem);
// initialize number of home boxes
nh = 0;
// home boxes in z direction
for(i=0; i<dim_cpu.boxes1d_arg; i++){
// home boxes in y direction
for(j=0; j<dim_cpu.boxes1d_arg; j++){
// home boxes in x direction
for(k=0; k<dim_cpu.boxes1d_arg; k++){
// current home box
box_cpu[nh].x = k;
box_cpu[nh].y = j;
box_cpu[nh].z = i;
box_cpu[nh].number = nh;
box_cpu[nh].offset = nh * NUMBER_PAR_PER_BOX;
// initialize number of neighbor boxes
box_cpu[nh].nn = 0;
// neighbor boxes in z direction
for(l=-1; l<2; l++){
// neighbor boxes in y direction
for(m=-1; m<2; m++){
// neighbor boxes in x direction
for(n=-1; n<2; n++){
// check if (this neighbor exists) and (it is not the same as home box)
if( (((i+l)>=0 && (j+m)>=0 && (k+n)>=0)==true && ((i+l)<dim_cpu.boxes1d_arg && (j+m)<dim_cpu.boxes1d_arg && (k+n)<dim_cpu.boxes1d_arg)==true) &&
(l==0 && m==0 && n==0)==false ){
// current neighbor box
box_cpu[nh].nei[box_cpu[nh].nn].x = (k+n);
box_cpu[nh].nei[box_cpu[nh].nn].y = (j+m);
box_cpu[nh].nei[box_cpu[nh].nn].z = (i+l);
box_cpu[nh].nei[box_cpu[nh].nn].number = (box_cpu[nh].nei[box_cpu[nh].nn].z * dim_cpu.boxes1d_arg * dim_cpu.boxes1d_arg) +
(box_cpu[nh].nei[box_cpu[nh].nn].y * dim_cpu.boxes1d_arg) +
box_cpu[nh].nei[box_cpu[nh].nn].x;
box_cpu[nh].nei[box_cpu[nh].nn].offset = box_cpu[nh].nei[box_cpu[nh].nn].number * NUMBER_PAR_PER_BOX;
// increment neighbor box
box_cpu[nh].nn = box_cpu[nh].nn + 1;
}
} // neighbor boxes in x direction
} // neighbor boxes in y direction
} // neighbor boxes in z direction
// increment home box
nh = nh + 1;
} // home boxes in x direction
} // home boxes in y direction
} // home boxes in z direction
//====================================================================================================100
// PARAMETERS, DISTANCE, CHARGE AND FORCE
//====================================================================================================100
// random generator seed set to random value - time in this case
srand(time(NULL));
// input (distances)
rv_cpu = (FOUR_VECTOR*)malloc(dim_cpu.space_mem);
for(i=0; i<dim_cpu.space_elem; i=i+1){
rv_cpu[i].v = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
rv_cpu[i].x = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
rv_cpu[i].y = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
rv_cpu[i].z = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
}
// input (charge)
qv_cpu = (fp*)malloc(dim_cpu.space_mem2);
for(i=0; i<dim_cpu.space_elem; i=i+1){
qv_cpu[i] = (rand()%10 + 1) / 10.0; // get a number in the range 0.1 - 1.0
}
// output (forces)
fv_cpu = (FOUR_VECTOR*)malloc(dim_cpu.space_mem);
for(i=0; i<dim_cpu.space_elem; i=i+1){
fv_cpu[i].v = 0; // set to 0, because kernels keeps adding to initial value
fv_cpu[i].x = 0; // set to 0, because kernels keeps adding to initial value
fv_cpu[i].y = 0; // set to 0, because kernels keeps adding to initial value
fv_cpu[i].z = 0; // set to 0, because kernels keeps adding to initial value
}
time5 = get_time();
//======================================================================================================================================================150
// KERNEL
//======================================================================================================================================================150
//====================================================================================================100
// GPU_CUDA
//====================================================================================================100
kernel_gpu_cuda_wrapper(par_cpu,
dim_cpu,
box_cpu,
rv_cpu,
qv_cpu,
fv_cpu,
nblocks);
time6 = get_time();
//======================================================================================================================================================150
// SYSTEM MEMORY DEALLOCATION
//======================================================================================================================================================150
// dump results
#ifdef OUTPUT
FILE *fptr;
fptr = fopen("result.txt", "w");
for(i=0; i<dim_cpu.space_elem; i=i+1){
fprintf(fptr, "%f, %f, %f, %f\n", fv_cpu[i].v, fv_cpu[i].x, fv_cpu[i].y, fv_cpu[i].z);
}
fclose(fptr);
#endif
free(rv_cpu);
free(qv_cpu);
free(fv_cpu);
free(box_cpu);
time7 = get_time();
//======================================================================================================================================================150
// DISPLAY TIMING
//======================================================================================================================================================150
// printf("Time spent in different stages of the application:\n");
// printf("%15.12f s, %15.12f % : VARIABLES\n", (float) (time1-time0) / 1000000, (float) (time1-time0) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : INPUT ARGUMENTS\n", (float) (time2-time1) / 1000000, (float) (time2-time1) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : INPUTS\n", (float) (time3-time2) / 1000000, (float) (time3-time2) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : dim_cpu\n", (float) (time4-time3) / 1000000, (float) (time4-time3) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : SYS MEM: ALO\n", (float) (time5-time4) / 1000000, (float) (time5-time4) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : KERNEL: COMPUTE\n", (float) (time6-time5) / 1000000, (float) (time6-time5) / (float) (time7-time0) * 100);
// printf("%15.12f s, %15.12f % : SYS MEM: FRE\n", (float) (time7-time6) / 1000000, (float) (time7-time6) / (float) (time7-time0) * 100);
// printf("Total time:\n");
// printf("%.12f s\n", (float) (time7-time0) / 1000000);
//======================================================================================================================================================150
// RETURN
//======================================================================================================================================================150
return 0.0; // always returns 0.0
}