UvmPinAsync / workloads /realworld /async /srad /srad_kernel.cu
lrh12580
first commit
5cb6c4b
#include "srad.h"
#include <stdio.h>
#include <cooperative_groups.h>
#include <cooperative_groups/memcpy_async.h>
using namespace nvcuda::experimental;
#define PREFETCH_COUNT 2
__global__ void
srad_cuda_1(
float *E_C,
float *W_C,
float *N_C,
float *S_C,
float *J_cuda,
float *C_cuda,
int cols,
int rows,
float q0sqr,
int block_size)
{
cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
pipeline pipe;
// shared memory allocation
__shared__ float temp[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
__shared__ float temp_result[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
__shared__ float north[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
__shared__ float south[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
__shared__ float east[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
__shared__ float west[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
int tile_dim_x = cols / BLOCK_SIZE;
int total_tiles = tile_dim_x * tile_dim_x;
int tiles_this_block = (block_size / BLOCK_SIZE) * (block_size / BLOCK_SIZE);
int base_tile = (blockIdx.y * gridDim.x + blockIdx.x) * tiles_this_block;
int fetch = base_tile;
int end_tile = fetch + tiles_this_block;
for (int compute = fetch; compute < end_tile; compute++)
{
// thread id
int tx = threadIdx.x;
int ty = threadIdx.y;
for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
{
// block id
int bx = fetch % tile_dim_x;
int by = fetch / tile_dim_x;
// indices
int index = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx;
int index_n = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + tx - cols;
int index_s = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * BLOCK_SIZE + tx;
int index_w = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty - 1;
int index_e = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + BLOCK_SIZE;
if (index_n < 0) index_n = 0;
if (index_s >= (cols * rows)) index_s = cols * rows - 1;
if (index_w < 0) index_w = 0;
if (index_e >= (cols * rows)) index_e = cols * rows - 1;
// load data to shared memory
memcpy_async(north[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx], J_cuda[index_n], pipe);
memcpy_async(south[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx], J_cuda[index_s], pipe);
if (by == 0)
{
memcpy_async(north[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx], J_cuda[BLOCK_SIZE * bx + tx], pipe);
}
else if (by == tile_dim_x - 1)
{
memcpy_async(south[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx],
J_cuda[cols * BLOCK_SIZE * (tile_dim_x - 1) + BLOCK_SIZE * bx + cols * (BLOCK_SIZE - 1) + tx], pipe);
}
block.sync();
memcpy_async(west[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx], J_cuda[index_w], pipe);
memcpy_async(east[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx], J_cuda[index_e], pipe);
if (bx == 0)
{
memcpy_async(west[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx], J_cuda[cols * BLOCK_SIZE * by + cols * ty], pipe);
}
else if (bx == tile_dim_x - 1)
{
memcpy_async(east[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx],
J_cuda[cols * BLOCK_SIZE * by + BLOCK_SIZE * (tile_dim_x - 1) + cols * ty + BLOCK_SIZE - 1], pipe);
}
block.sync();
memcpy_async(temp[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx], J_cuda[index], pipe);
pipe.commit();
}
if (fetch == end_tile)
{
for (int i = 0; i < PREFETCH_COUNT - 1; ++i)
{
pipe.commit();
}
++fetch;
}
pipe.wait_prior<PREFETCH_COUNT - 1>();
block.sync();
// block id
int bx = compute % tile_dim_x;
int by = compute / tile_dim_x;
// indices
int index = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx;
int index_n = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + tx - cols;
int index_s = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * BLOCK_SIZE + tx;
int index_w = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty - 1;
int index_e = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + BLOCK_SIZE;
if (index_n < 0) index_n = 0;
if (index_s >= (cols * rows)) index_s = cols * rows - 1;
if (index_w < 0) index_w = 0;
if (index_e >= (cols * rows)) index_e = cols * rows - 1;
float n, w, e, s, jc, g2, l, num, den, qsqr, c;
jc = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx];
if (ty == 0 && tx == 0)
{ // nw
n = north[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
s = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty + 1) * BLOCK_SIZE + tx] - jc;
w = west[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
e = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx + 1] - jc;
}
else if (ty == 0 && tx == BLOCK_SIZE - 1)
{ // ne
n = north[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
s = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty + 1) * BLOCK_SIZE + tx] - jc;
w = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx - 1] - jc;
e = east[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
}
else if (ty == BLOCK_SIZE - 1 && tx == BLOCK_SIZE - 1)
{ // se
n = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty - 1) * BLOCK_SIZE + tx] - jc;
s = south[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
w = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx - 1] - jc;
e = east[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
}
else if (ty == BLOCK_SIZE - 1 && tx == 0)
{ // sw
n = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty - 1) * BLOCK_SIZE + tx] - jc;
s = south[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
w = west[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
e = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx + 1] - jc;
}
else if (ty == 0)
{ // n
n = north[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
s = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty + 1) * BLOCK_SIZE + tx] - jc;
w = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx - 1] - jc;
e = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx + 1] - jc;
}
else if (tx == BLOCK_SIZE - 1)
{ // e
n = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty - 1) * BLOCK_SIZE + tx] - jc;
s = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty + 1) * BLOCK_SIZE + tx] - jc;
w = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx - 1] - jc;
e = east[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
}
else if (ty == BLOCK_SIZE - 1)
{ // s
n = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty - 1) * BLOCK_SIZE + tx] - jc;
s = south[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
w = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx - 1] - jc;
e = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx + 1] - jc;
}
else if (tx == 0)
{ // w
n = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty - 1) * BLOCK_SIZE + tx] - jc;
s = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty + 1) * BLOCK_SIZE + tx] - jc;
w = west[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
e = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx + 1] - jc;
}
else
{ // the data elements which are not on the borders
n = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty - 1) * BLOCK_SIZE + tx] - jc;
s = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty + 1) * BLOCK_SIZE + tx] - jc;
w = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx - 1] - jc;
e = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx + 1] - jc;
}
g2 = (n * n + s * s + w * w + e * e) / (jc * jc);
l = (n + s + w + e) / jc;
num = (0.5 * g2) - ((1.0 / 16.0) * (l * l));
den = 1 + (.25 * l);
qsqr = num / (den * den);
// diffusion coefficent (equ 33)
den = (qsqr - q0sqr) / (q0sqr * (1 + q0sqr));
c = 1.0 / (1.0 + den);
// saturate diffusion coefficent
if (c < 0)
{
temp_result[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = 0;
}
else if (c > 1)
{
temp_result[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = 1;
}
else
{
temp_result[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = c;
}
block.sync();
C_cuda[index] = temp_result[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx];
E_C[index] = e;
W_C[index] = w;
S_C[index] = s;
N_C[index] = n;
}
}
__global__ void
srad_cuda_2(
float *E_C,
float *W_C,
float *N_C,
float *S_C,
float *J_cuda,
float *C_cuda,
int cols,
int rows,
float lambda,
float q0sqr,
int block_size)
{
cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
pipeline pipe;
// shared memory allocation
__shared__ float south_c[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
__shared__ float east_c[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
__shared__ float c_cuda_temp[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
__shared__ float c_cuda_result[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
__shared__ float temp[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
int tile_dim_x = cols / BLOCK_SIZE;
int total_tiles = tile_dim_x * tile_dim_x;
int tiles_this_block = (block_size / BLOCK_SIZE) * (block_size / BLOCK_SIZE);
int tiles_this_block_x = (block_size / BLOCK_SIZE);
int base_tile = (blockIdx.y * gridDim.x + blockIdx.x) * tiles_this_block;
int fetch = base_tile;
int end_tile = fetch + tiles_this_block;
for (int compute = fetch; compute < end_tile; compute++)
{
// thread id
int tx = threadIdx.x;
int ty = threadIdx.y;
for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
{
// block id
int offset = fetch - base_tile;
int block_id = fetch / tiles_this_block;
int bx = block_id % gridDim.x * tiles_this_block_x + offset % tiles_this_block_x;
int by = block_id / gridDim.x * tiles_this_block_x + offset / tiles_this_block_x;
// indices
int index = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx;
int index_s = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * BLOCK_SIZE + tx;
int index_e = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + BLOCK_SIZE;
if (index_s >= (cols * rows)) index_s = cols * rows - 1;
if (index_e >= (cols * rows)) index_e = cols * rows - 1;
// load data to shared memory
temp[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = J_cuda[index];
block.sync();
south_c[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = C_cuda[index_s];
if (by == tile_dim_x - 1)
{
south_c[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = C_cuda[cols * BLOCK_SIZE * (tile_dim_x - 1) + BLOCK_SIZE * bx + cols * (BLOCK_SIZE - 1) + tx];
}
block.sync();
east_c[ty * BLOCK_SIZE + tx] = C_cuda[index_e];
if (bx == tile_dim_x - 1)
{
east_c[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = C_cuda[cols * BLOCK_SIZE * by + BLOCK_SIZE * (tile_dim_x - 1) + cols * ty + BLOCK_SIZE - 1];
}
block.sync();
c_cuda_temp[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = C_cuda[index];
block.sync();
}
if (fetch == end_tile)
{
for (int i = 0; i < PREFETCH_COUNT - 1; ++i)
{
pipe.commit();
}
++fetch;
}
pipe.wait_prior<PREFETCH_COUNT - 1>();
block.sync();
// block id
int bx = compute % tile_dim_x;
int by = compute / tile_dim_x;
// indices
int index = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx;
float cc, cn, cs, ce, cw, d_sum;
cc = c_cuda_temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx];
if (ty == BLOCK_SIZE - 1 && tx == BLOCK_SIZE - 1)
{ // se
cn = cc;
cs = south_c[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx];
cw = cc;
ce = east_c[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx];
}
else if (tx == BLOCK_SIZE - 1)
{ // e
cn = cc;
cs = c_cuda_temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty + 1) * BLOCK_SIZE + tx];
cw = cc;
ce = east_c[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx];
}
else if (ty == BLOCK_SIZE - 1)
{ // s
cn = cc;
cs = south_c[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx];
cw = cc;
ce = c_cuda_temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx + 1];
}
else
{ // the data elements which are not on the borders
cn = cc;
cs = c_cuda_temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty + 1) * BLOCK_SIZE + tx];
cw = cc;
ce = c_cuda_temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx + 1];
}
// divergence (equ 58)
d_sum = cn * N_C[index] + cs * S_C[index] + cw * W_C[index] + ce * E_C[index];
// image update (equ 61)
c_cuda_result[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] + 0.25 * lambda * d_sum;
block.sync();
J_cuda[index] = c_cuda_result[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx];
}
}