//----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------200
// plasmaKernel_gpu_2
//----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------200
// needed for cooperative_groups::thread_block and the experimental asynchronous
// pipeline API (pipeline, memcpy_async) used below, if not already included above
#include <cooperative_groups.h>
#include <cuda_pipeline.h>

using namespace nvcuda::experimental;
__global__ void kernel_gpu_cuda(par_str d_par_gpu,
                                dim_str d_dim_gpu,
                                box_str *d_box_gpu,
                                FOUR_VECTOR *d_rv_gpu,
                                fp *d_qv_gpu,
                                FOUR_VECTOR *d_fv_gpu,
                                int boxes_per_block)
{
    cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
    pipeline pipe;
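
    // Software pipeline: neighbor-box tiles are staged into shared memory with
    // memcpy_async up to PREFETCH_COUNT batches ahead of the tile being computed,
    // overlapping global-memory traffic with the force computation below
    // (assumes PREFETCH_COUNT >= 1 and trivially copyable FOUR_VECTOR/fp).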
    //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180
    // THREAD PARAMETERS
    //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180

    int bx = blockIdx.x;  // block index in x (0..gridDim.x-1)
    int tx = threadIdx.x; // thread index within the block (0..NUMBER_THREADS-1)
    int wtx = tx;         // work-thread index, strided across the particles of a box
    //------------------------------------------------------------------------------------------------------------------------------------------------------160
    // Extract input parameters
    //------------------------------------------------------------------------------------------------------------------------------------------------------160

    // parameters
    fp a2 = 2.0 * d_par_gpu.alpha * d_par_gpu.alpha;
    // home box
    int first_i;
    FOUR_VECTOR *rA;
    FOUR_VECTOR *fA;
    __shared__ FOUR_VECTOR rA_shared[NUMBER_PAR_PER_BOX];

    // neighbor box
    int pointer;
    int first_j;
    FOUR_VECTOR *rB;
    fp *qB;
    int j = 0;
    // ring buffers holding the PREFETCH_COUNT in-flight neighbor-box tiles
    __shared__ FOUR_VECTOR rB_shared[NUMBER_PAR_PER_BOX * PREFETCH_COUNT];
    __shared__ fp qB_shared[NUMBER_PAR_PER_BOX * PREFETCH_COUNT]; // fp, matching d_qv_gpu's element type
    // common
    fp r2;
    fp u2;
    fp vij;
    fp fs;
    fp fxij;
    fp fyij;
    fp fzij;
    THREE_VECTOR d;
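
    // Each block handles a contiguous run of boxes_per_block home boxes
    // (assumes boxes_per_block evenly divides the total number of boxes).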
    int box = bx * boxes_per_block;
    int end_box = box + boxes_per_block;
    //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180
    // DO FOR THE NUMBER OF BOXES
    //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180

    for (; box < end_box; box++)
    {
        //------------------------------------------------------------------------------------------------------------------------------------------------------160
        // Home box
        //------------------------------------------------------------------------------------------------------------------------------------------------------160

        //----------------------------------------------------------------------------------------------------------------------------------140
        // Setup parameters
        //----------------------------------------------------------------------------------------------------------------------------------140

        // home box - box parameters
        first_i = d_box_gpu[box].offset;

        // home box - distance, force, charge and type parameters
        rA = &d_rv_gpu[first_i];
        fA = &d_fv_gpu[first_i];
        //----------------------------------------------------------------------------------------------------------------------------------140
        // Copy to shared memory
        //----------------------------------------------------------------------------------------------------------------------------------140

        // home box - cache home-box positions in shared memory, one strided chunk per thread
        while (wtx < NUMBER_PAR_PER_BOX)
        {
            rA_shared[wtx] = rA[wtx];
            wtx = wtx + NUMBER_THREADS;
        }
        wtx = tx;

        // defensive sync: each thread later reads only the rA_shared entries it wrote itself
        block.sync();
        //------------------------------------------------------------------------------------------------------------------------------------------------------160
        // Neighbor box loop
        //------------------------------------------------------------------------------------------------------------------------------------------------------160

        // tile 0 is the home box itself; tiles 1..nn are its neighbor boxes
        int fetch = 0;
        int end_tile = 1 + d_box_gpu[box].nn;
        // loop over the home box and its neighbor boxes, prefetching up to
        // PREFETCH_COUNT tiles ahead of the tile currently being computed
        for (int compute = fetch; compute < end_tile; compute++)
        {
            for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
            {
                //----------------------------------------50
                // nei box - get pointer to the right box
                //----------------------------------------50

                if (fetch == 0)
                {
                    pointer = box; // set first box to be processed to home box
                }
                else
                {
                    pointer = d_box_gpu[box].nei[fetch - 1].number; // remaining boxes are neighbor boxes
                }
                //----------------------------------------------------------------------------------------------------------------------------------140
                // Setup parameters
                //----------------------------------------------------------------------------------------------------------------------------------140

                // nei box - box parameters
                first_j = d_box_gpu[pointer].offset;

                // nei box - distance, (force), charge and (type) parameters
                rB = &d_rv_gpu[first_j];
                qB = &d_qv_gpu[first_j];

                //----------------------------------------------------------------------------------------------------------------------------------140
                // Copy to shared memory
                //----------------------------------------------------------------------------------------------------------------------------------140

                // nei box - issue async copies into this tile's slot of the shared ring buffer
                while (wtx < NUMBER_PAR_PER_BOX)
                {
                    memcpy_async(rB_shared[(fetch % PREFETCH_COUNT) * NUMBER_PAR_PER_BOX + wtx], rB[wtx], pipe);
                    memcpy_async(qB_shared[(fetch % PREFETCH_COUNT) * NUMBER_PAR_PER_BOX + wtx], qB[wtx], pipe);
                    wtx = wtx + NUMBER_THREADS;
                }
                wtx = tx;
                // commit the copies issued for this tile as one pipeline batch;
                // completion is awaited via wait_prior() before the compute phase reads them
                pipe.commit();
            }

            if (fetch == end_tile)
            {
                // all tiles have been fetched: push PREFETCH_COUNT-1 empty batches
                // so the remaining wait_prior() calls can retire the real ones
                for (int i = 0; i < PREFETCH_COUNT - 1; ++i)
                {
                    pipe.commit();
                }
                ++fetch; // advance past end_tile so this drain runs only once
            }

            // wait until at most PREFETCH_COUNT-1 batches are still in flight,
            // i.e. the batch for the current compute tile has arrived
            pipe.wait_prior<PREFETCH_COUNT - 1>();
            block.sync();
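
            // compute phase: all shared-memory reads below target the tile that
            // wait_prior() just guaranteed resident (slot compute % PREFETCH_COUNT)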
            //----------------------------------------------------------------------------------------------------------------------------------140
            // Calculation
            //----------------------------------------------------------------------------------------------------------------------------------140

            // offset of the current compute tile inside the shared ring buffers
            int tile = (compute % PREFETCH_COUNT) * NUMBER_PAR_PER_BOX;

            // loop for the number of particles in the home box
            while (wtx < NUMBER_PAR_PER_BOX)
            {
                // loop for the number of particles in the current nei box
                for (j = 0; j < NUMBER_PAR_PER_BOX; j++)
                {
                    r2 = (fp)rA_shared[wtx].v + (fp)rB_shared[tile + j].v - DOT((fp)rA_shared[wtx], (fp)rB_shared[tile + j]);
                    u2 = a2 * r2;
                    vij = exp(-u2);
                    fs = 2 * vij;
                    d.x = (fp)rA_shared[wtx].x - (fp)rB_shared[tile + j].x;
                    fxij = fs * d.x;
                    d.y = (fp)rA_shared[wtx].y - (fp)rB_shared[tile + j].y;
                    fyij = fs * d.y;
                    d.z = (fp)rA_shared[wtx].z - (fp)rB_shared[tile + j].z;
                    fzij = fs * d.z;
                    // accumulate potential and force contributions, weighted by the charge
                    fA[wtx].v += (double)((fp)qB_shared[tile + j] * vij);
                    fA[wtx].x += (double)((fp)qB_shared[tile + j] * fxij);
                    fA[wtx].y += (double)((fp)qB_shared[tile + j] * fyij);
                    fA[wtx].z += (double)((fp)qB_shared[tile + j] * fzij);
                }
                // increment work thread index
                wtx = wtx + NUMBER_THREADS;
            }
            // reset work index
            wtx = tx;

            // synchronize after finishing the force contributions from the current tile,
            // so its ring-buffer slot can safely be overwritten by a later prefetch
            block.sync();

            //----------------------------------------------------------------------------------------------------------------------------------140
            // Calculation END
            //----------------------------------------------------------------------------------------------------------------------------------140
        }
    }
}
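
//----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------200
// Illustrative host-side launch sketch (not part of the original kernel): shows
// how a caller might size the grid for this kernel. launch_kernel_gpu_cuda is a
// hypothetical helper; it assumes d_dim_gpu.number_boxes holds the total box
// count and that boxes_per_block divides it evenly.
//----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------200
static void launch_kernel_gpu_cuda(par_str par_gpu, dim_str dim_gpu, box_str *d_box_gpu,
                                   FOUR_VECTOR *d_rv_gpu, fp *d_qv_gpu, FOUR_VECTOR *d_fv_gpu,
                                   int boxes_per_block)
{
    dim3 threads(NUMBER_THREADS, 1, 1);                                     // one block strides over a box's particles
    dim3 blocks((unsigned)(dim_gpu.number_boxes / boxes_per_block), 1, 1);  // one block per boxes_per_block home boxes
    kernel_gpu_cuda<<<blocks, threads>>>(par_gpu, dim_gpu, d_box_gpu,
                                         d_rv_gpu, d_qv_gpu, d_fv_gpu, boxes_per_block);
    cudaDeviceSynchronize(); // block until the kernel finishes
}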