//----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------200
// plasmaKernel_gpu_2
//----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------200
#include <cooperative_groups.h>
#include <cooperative_groups/memcpy_async.h>
#include <cuda_pipeline.h> // declares nvcuda::experimental::pipeline / memcpy_async used below (CUDA 11+)

using namespace nvcuda::experimental;

#define PREFETCH_COUNT 2
// kernel_gpu_cuda
//
// Per-box particle interaction kernel with a software-pipelined, asynchronous
// prefetch of neighbor-box data into shared memory (PREFETCH_COUNT-deep
// double buffering via the nvcuda::experimental pipeline; requires CUDA 11+,
// async-copy hardware path on SM80+).
//
// Launch expectations (inferred from the indexing — confirm at the host
// launch site):
//   - 1-D grid, 1-D blocks of NUMBER_THREADS threads;
//   - block bx processes the `boxes_per_block` consecutive home boxes
//     starting at bx * boxes_per_block (assumes the total box count is
//     divisible by boxes_per_block — TODO confirm).
//
// Parameters:
//   d_par_gpu       simulation parameters (only .alpha is read here)
//   d_dim_gpu       dimension descriptor (unused in this kernel body)
//   d_box_gpu       per-box descriptors: .offset, .nn, .nei[]
//   d_rv_gpu        particle positions/extents (FOUR_VECTOR per particle)
//   d_qv_gpu        particle charges (fp per particle)
//   d_fv_gpu        output force accumulators, updated in place
//   boxes_per_block number of home boxes handled by each block
__global__ void kernel_gpu_cuda(par_str d_par_gpu,
                                dim_str d_dim_gpu,
                                box_str *d_box_gpu,
                                FOUR_VECTOR *d_rv_gpu,
                                fp *d_qv_gpu,
                                FOUR_VECTOR *d_fv_gpu,
                                int boxes_per_block)
{
    cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
    pipeline pipe; // per-thread handle into the async-copy pipeline

    //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180
    // THREAD PARAMETERS
    //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180
    int bx = blockIdx.x;  // current block index (0-n)
    int tx = threadIdx.x; // current thread index within the block (0-n)
    int wtx = tx;         // work index, strided by NUMBER_THREADS over particles

    //------------------------------------------------------------------------------------------------------------------------------------------------------160
    // Extract input parameters
    //------------------------------------------------------------------------------------------------------------------------------------------------------160
    fp a2 = 2.0 * d_par_gpu.alpha * d_par_gpu.alpha;

    // home box
    int first_i;
    FOUR_VECTOR *rA;
    FOUR_VECTOR *fA;
    // FIX: size by NUMBER_PAR_PER_BOX instead of the hard-coded 100 so the
    // staging buffer cannot be overrun if the per-box particle count changes.
    __shared__ FOUR_VECTOR rA_shared[NUMBER_PAR_PER_BOX];

    // nei box
    int pointer;
    int first_j;
    FOUR_VECTOR *rB;
    fp *qB;
    int j = 0;
    // PREFETCH_COUNT-deep staging tiles for neighbor positions and charges.
    __shared__ FOUR_VECTOR rB_shared[NUMBER_PAR_PER_BOX * PREFETCH_COUNT];
    // FIX: declared as fp (matching d_qv_gpu) rather than double; with
    // fp == float the old declaration mismatched the memcpy_async element
    // type and wasted shared memory.
    __shared__ fp qB_shared[NUMBER_PAR_PER_BOX * PREFETCH_COUNT];

    // common
    fp r2;
    fp u2;
    fp vij;
    fp fs;
    fp fxij;
    fp fyij;
    fp fzij;
    THREE_VECTOR d;

    int box = bx * boxes_per_block;      // first home box owned by this block
    int end_box = box + boxes_per_block; // one past the last home box

    //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180
    // DO FOR THE NUMBER OF BOXES
    //--------------------------------------------------------------------------------------------------------------------------------------------------------------------------180
    for (; box < end_box; box++)
    {
        //------------------------------------------------------------------------------------------------------------------------------------------------------160
        // Home box - setup and copy to shared memory
        //------------------------------------------------------------------------------------------------------------------------------------------------------160
        first_i = d_box_gpu[box].offset;
        rA = &d_rv_gpu[first_i];
        fA = &d_fv_gpu[first_i];

        // Stage home-box positions into shared memory (block-strided copy).
        while (wtx < NUMBER_PAR_PER_BOX)
        {
            rA_shared[wtx] = rA[wtx];
            wtx = wtx + NUMBER_THREADS;
        }
        wtx = tx;
        // This barrier IS required: the compute phase below reads rA_shared
        // entries written by other threads.
        block.sync();

        //------------------------------------------------------------------------------------------------------------------------------------------------------160
        // nei box loop
        //------------------------------------------------------------------------------------------------------------------------------------------------------160
        int fetch = 0;
        int end_tile = 1 + d_box_gpu[box].nn; // tile 0 = home box, tiles 1..nn = neighbors

        // Software pipeline: while tile `compute` is processed, tiles up to
        // PREFETCH_COUNT-1 ahead are fetched asynchronously.
        for (int compute = fetch; compute < end_tile; compute++)
        {
            // Issue async copies for up to PREFETCH_COUNT tiles in flight.
            for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
            {
                //----------------------------------------50
                // nei box - get pointer to the right box
                //----------------------------------------50
                if (fetch == 0)
                {
                    pointer = box; // first tile processed is the home box itself
                }
                else
                {
                    pointer = d_box_gpu[box].nei[fetch - 1].number; // remaining tiles are nei boxes
                }

                // nei box - box parameters
                first_j = d_box_gpu[pointer].offset;
                rB = &d_rv_gpu[first_j];
                qB = &d_qv_gpu[first_j];

                // Stage neighbor positions/charges into the staging slot for
                // this tile (fetch % PREFETCH_COUNT) without blocking.
                while (wtx < NUMBER_PAR_PER_BOX)
                {
                    memcpy_async(rB_shared[(fetch % PREFETCH_COUNT) * NUMBER_PAR_PER_BOX + wtx], rB[wtx], pipe);
                    memcpy_async(qB_shared[(fetch % PREFETCH_COUNT) * NUMBER_PAR_PER_BOX + wtx], qB[wtx], pipe);
                    wtx = wtx + NUMBER_THREADS;
                }
                wtx = tx;
                pipe.commit(); // close the async-copy batch for this tile
            }
            // Tail of the pipeline: pad with empty commits so the
            // wait_prior<PREFETCH_COUNT - 1> below still drains the last real
            // batches; ++fetch makes this padding happen exactly once.
            if (fetch == end_tile)
            {
                for (int i = 0; i < PREFETCH_COUNT - 1; ++i)
                {
                    pipe.commit();
                }
                ++fetch;
            }
            // Wait until the batch holding tile `compute` has landed, then
            // make the staged data visible to the whole block.
            pipe.wait_prior<PREFETCH_COUNT - 1>();
            block.sync();

            //----------------------------------------------------------------------------------------------------------------------------------140
            // Calculation
            //----------------------------------------------------------------------------------------------------------------------------------140
            // Base of the staging slot holding tile `compute` (hoisted out of
            // the particle loops; loop-invariant).
            int slot = (compute % PREFETCH_COUNT) * NUMBER_PAR_PER_BOX;
            FOUR_VECTOR *rB_tile = &rB_shared[slot];
            fp *qB_tile = &qB_shared[slot];

            // Accumulate force contributions on each home-box particle from
            // every particle of the current tile.
            while (wtx < NUMBER_PAR_PER_BOX)
            {
                for (j = 0; j < NUMBER_PAR_PER_BOX; j++)
                {
                    r2 = (fp)rA_shared[wtx].v + (fp)rB_tile[j].v - DOT((fp)rA_shared[wtx], (fp)rB_tile[j]);
                    u2 = a2 * r2;
                    vij = exp(-u2); // NOTE(review): exp (double) even when fp == float — kept to preserve results
                    fs = 2 * vij;
                    d.x = (fp)rA_shared[wtx].x - (fp)rB_tile[j].x;
                    fxij = fs * d.x;
                    d.y = (fp)rA_shared[wtx].y - (fp)rB_tile[j].y;
                    fyij = fs * d.y;
                    d.z = (fp)rA_shared[wtx].z - (fp)rB_tile[j].z;
                    fzij = fs * d.z;
                    // Each home-box particle is owned by exactly one thread
                    // (wtx-strided), so these read-modify-writes do not race.
                    fA[wtx].v += (double)((fp)qB_tile[j] * vij);
                    fA[wtx].x += (double)((fp)qB_tile[j] * fxij);
                    fA[wtx].y += (double)((fp)qB_tile[j] * fyij);
                    fA[wtx].z += (double)((fp)qB_tile[j] * fzij);
                }
                wtx = wtx + NUMBER_THREADS;
            }
            wtx = tx;
            // Barrier before the staging slot is overwritten by the next
            // tile's prefetch.
            block.sync();
            //----------------------------------------------------------------------------------------------------------------------------------140
            // Calculation END
            //----------------------------------------------------------------------------------------------------------------------------------140
        }
    }
}