//----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------200 // plasmaKernel_gpu_2 //----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------200 #include #include using namespace nvcuda::experimental; #define PREFETCH_COUNT 2 __global__ void kernel_gpu_cuda(par_str d_par_gpu, dim_str d_dim_gpu, box_str *d_box_gpu, FOUR_VECTOR *d_rv_gpu, fp *d_qv_gpu, FOUR_VECTOR *d_fv_gpu, int boxes_per_block) { cooperative_groups::thread_block block = cooperative_groups::this_thread_block(); pipeline pipe; //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180 // THREAD PARAMETERS //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180 int bx = blockIdx.x; // get current horizontal block index (0-n) int tx = threadIdx.x; // get current horizontal thread index (0-n) int wtx = tx; //------------------------------------------------------------------------------------------------------------------------------------------------------160 // Extract input parameters //------------------------------------------------------------------------------------------------------------------------------------------------------160 // parameters fp a2 = 2.0 * d_par_gpu.alpha * d_par_gpu.alpha; // home box int first_i; FOUR_VECTOR *rA; FOUR_VECTOR *fA; __shared__ FOUR_VECTOR rA_shared[100]; // nei box int pointer; int k = 0; int first_j; FOUR_VECTOR *rB; fp *qB; int j = 0; __shared__ FOUR_VECTOR rB_shared[NUMBER_PAR_PER_BOX * PREFETCH_COUNT]; __shared__ double qB_shared[NUMBER_PAR_PER_BOX * PREFETCH_COUNT]; // common fp r2; fp u2; fp vij; fp fs; fp fxij; fp fyij; fp fzij; THREE_VECTOR d; int box = bx * boxes_per_block; int end_box = box + boxes_per_block; //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180 // DO FOR THE NUMBER OF BOXES //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180 for (; box < end_box; box++) { //------------------------------------------------------------------------------------------------------------------------------------------------------160 // Home box //------------------------------------------------------------------------------------------------------------------------------------------------------160 //----------------------------------------------------------------------------------------------------------------------------------140 // Setup parameters //----------------------------------------------------------------------------------------------------------------------------------140 // home box - box parameters first_i = d_box_gpu[box].offset; // home box - distance, force, charge and type parameters rA = &d_rv_gpu[first_i]; fA = &d_fv_gpu[first_i]; //----------------------------------------------------------------------------------------------------------------------------------140 // Copy to shared memory //----------------------------------------------------------------------------------------------------------------------------------140 // home box - shared memory while (wtx < NUMBER_PAR_PER_BOX) { rA_shared[wtx] = rA[wtx]; wtx = wtx + NUMBER_THREADS; } wtx = tx; // synchronize threads - not needed, but just to be safe block.sync(); //------------------------------------------------------------------------------------------------------------------------------------------------------160 // nei box loop //------------------------------------------------------------------------------------------------------------------------------------------------------160 // if (wtx == 0) // printf("d_box_gpu[%d].nn is %d\n", bx, d_box_gpu[bx].nn); int fetch = 0; int end_tile = 1 + d_box_gpu[box].nn; // loop over neiing boxes of home box for (int compute = fetch; compute < end_tile; compute++) { for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++) { //----------------------------------------50 // nei box - get pointer to the right box //----------------------------------------50 if (fetch == 0) { pointer = box; // set first box to be processed to home box } else { pointer = d_box_gpu[box].nei[fetch - 1].number; // remaining boxes are nei boxes } //----------------------------------------------------------------------------------------------------------------------------------140 // Setup parameters //----------------------------------------------------------------------------------------------------------------------------------140 // nei box - box parameters first_j = d_box_gpu[pointer].offset; // nei box - distance, (force), charge and (type) parameters rB = &d_rv_gpu[first_j]; qB = &d_qv_gpu[first_j]; //----------------------------------------------------------------------------------------------------------------------------------140 // Setup parameters //----------------------------------------------------------------------------------------------------------------------------------140 // nei box - shared memory while (wtx < NUMBER_PAR_PER_BOX) { memcpy_async(rB_shared[(fetch % PREFETCH_COUNT) * NUMBER_PAR_PER_BOX + wtx], rB[wtx], pipe); memcpy_async(qB_shared[(fetch % PREFETCH_COUNT) * NUMBER_PAR_PER_BOX + wtx], qB[wtx], pipe); wtx = wtx + NUMBER_THREADS; } wtx = tx; // synchronize threads because in next section each thread accesses data brought in by different threads here pipe.commit(); } if (fetch == end_tile) { for (int i = 0; i < PREFETCH_COUNT - 1; ++i) { pipe.commit(); } ++fetch; } pipe.wait_prior(); block.sync(); //----------------------------------------------------------------------------------------------------------------------------------140 // Calculation //----------------------------------------------------------------------------------------------------------------------------------140 // loop for the number of particles in the home box // for (int i=0; i