| #include "srad.h"
|
| #include <stdio.h>
|
|
|
| #include <cooperative_groups.h>
|
| #include <cooperative_groups/memcpy_async.h>
|
|
|
| using namespace nvcuda::experimental;
|
|
|
| #define PREFETCH_COUNT 2
|
|
|
| __global__ void
|
| srad_cuda_1(
|
| float *E_C,
|
| float *W_C,
|
| float *N_C,
|
| float *S_C,
|
| float *J_cuda,
|
| float *C_cuda,
|
| int cols,
|
| int rows,
|
| float q0sqr,
|
| int block_size)
|
| {
|
| cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
|
| pipeline pipe;
|
| // shared memory allocation
|
| __shared__ float temp[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
|
| __shared__ float temp_result[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
|
|
|
| __shared__ float north[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
|
| __shared__ float south[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
|
| __shared__ float east[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
|
| __shared__ float west[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
|
|
|
| int tile_dim_x = cols / BLOCK_SIZE;
|
|
|
| int total_tiles = tile_dim_x * tile_dim_x;
|
| int tiles_this_block = (block_size / BLOCK_SIZE) * (block_size / BLOCK_SIZE);
|
|
|
| int base_tile = (blockIdx.y * gridDim.x + blockIdx.x) * tiles_this_block;
|
| int fetch = base_tile;
|
| int end_tile = fetch + tiles_this_block;
|
|
|
| for (int compute = fetch; compute < end_tile; compute++)
|
| {
|
| // thread id
|
| int tx = threadIdx.x;
|
| int ty = threadIdx.y;
|
|
|
| for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
|
| {
|
| // block id
|
| int bx = fetch % tile_dim_x;
|
| int by = fetch / tile_dim_x;
|
|
|
| // indices
|
| int index = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx;
|
| int index_n = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + tx - cols;
|
| int index_s = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * BLOCK_SIZE + tx;
|
| int index_w = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty - 1;
|
| int index_e = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + BLOCK_SIZE;
|
|
|
| if (index_n < 0) index_n = 0;
|
| if (index_s >= (cols * rows)) index_s = cols * rows - 1;
|
| if (index_w < 0) index_w = 0;
|
| if (index_e >= (cols * rows)) index_e = cols * rows - 1;
|
|
|
| // load data to shared memory
|
| memcpy_async(north[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx], J_cuda[index_n], pipe);
|
| memcpy_async(south[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx], J_cuda[index_s], pipe);
|
| if (by == 0)
|
| {
|
| memcpy_async(north[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx], J_cuda[BLOCK_SIZE * bx + tx], pipe);
|
| }
|
| else if (by == tile_dim_x - 1)
|
| {
|
| memcpy_async(south[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx],
|
| J_cuda[cols * BLOCK_SIZE * (tile_dim_x - 1) + BLOCK_SIZE * bx + cols * (BLOCK_SIZE - 1) + tx], pipe);
|
| }
|
| block.sync();
|
|
|
| memcpy_async(west[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx], J_cuda[index_w], pipe);
|
| memcpy_async(east[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx], J_cuda[index_e], pipe);
|
|
|
| if (bx == 0)
|
| {
|
| memcpy_async(west[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx], J_cuda[cols * BLOCK_SIZE * by + cols * ty], pipe);
|
| }
|
| else if (bx == tile_dim_x - 1)
|
| {
|
| memcpy_async(east[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx],
|
| J_cuda[cols * BLOCK_SIZE * by + BLOCK_SIZE * (tile_dim_x - 1) + cols * ty + BLOCK_SIZE - 1], pipe);
|
| }
|
| block.sync();
|
| memcpy_async(temp[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx], J_cuda[index], pipe);
|
| pipe.commit();
|
| }
|
| if (fetch == end_tile)
|
| {
|
| for (int i = 0; i < PREFETCH_COUNT - 1; ++i)
|
| {
|
| pipe.commit();
|
| }
|
| ++fetch;
|
| }
|
| pipe.wait_prior<PREFETCH_COUNT - 1>();
|
| block.sync();
|
|
|
| // block id
|
| int bx = compute % tile_dim_x;
|
| int by = compute / tile_dim_x;
|
|
|
| // indices
|
| int index = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx;
|
| int index_n = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + tx - cols;
|
| int index_s = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * BLOCK_SIZE + tx;
|
| int index_w = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty - 1;
|
| int index_e = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + BLOCK_SIZE;
|
|
|
| if (index_n < 0) index_n = 0;
|
| if (index_s >= (cols * rows)) index_s = cols * rows - 1;
|
| if (index_w < 0) index_w = 0;
|
| if (index_e >= (cols * rows)) index_e = cols * rows - 1;
|
|
|
| float n, w, e, s, jc, g2, l, num, den, qsqr, c;
|
| jc = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx];
|
|
|
| if (ty == 0 && tx == 0)
|
| { // nw
|
| n = north[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
|
| s = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty + 1) * BLOCK_SIZE + tx] - jc;
|
| w = west[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
|
| e = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx + 1] - jc;
|
| }
|
| else if (ty == 0 && tx == BLOCK_SIZE - 1)
|
| { // ne
|
| n = north[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
|
| s = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty + 1) * BLOCK_SIZE + tx] - jc;
|
| w = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx - 1] - jc;
|
| e = east[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
|
| }
|
| else if (ty == BLOCK_SIZE - 1 && tx == BLOCK_SIZE - 1)
|
| { // se
|
| n = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty - 1) * BLOCK_SIZE + tx] - jc;
|
| s = south[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
|
| w = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx - 1] - jc;
|
| e = east[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
|
| }
|
| else if (ty == BLOCK_SIZE - 1 && tx == 0)
|
| { // sw
|
| n = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty - 1) * BLOCK_SIZE + tx] - jc;
|
| s = south[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
|
| w = west[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
|
| e = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx + 1] - jc;
|
| }
|
|
|
| else if (ty == 0)
|
| { // n
|
| n = north[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
|
| s = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty + 1) * BLOCK_SIZE + tx] - jc;
|
| w = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx - 1] - jc;
|
| e = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx + 1] - jc;
|
| }
|
| else if (tx == BLOCK_SIZE - 1)
|
| { // e
|
| n = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty - 1) * BLOCK_SIZE + tx] - jc;
|
| s = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty + 1) * BLOCK_SIZE + tx] - jc;
|
| w = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx - 1] - jc;
|
| e = east[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
|
| }
|
| else if (ty == BLOCK_SIZE - 1)
|
| { // s
|
| n = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty - 1) * BLOCK_SIZE + tx] - jc;
|
| s = south[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
|
| w = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx - 1] - jc;
|
| e = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx + 1] - jc;
|
| }
|
| else if (tx == 0)
|
| { // w
|
| n = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty - 1) * BLOCK_SIZE + tx] - jc;
|
| s = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty + 1) * BLOCK_SIZE + tx] - jc;
|
| w = west[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] - jc;
|
| e = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx + 1] - jc;
|
| }
|
| else
|
| { // the data elements which are not on the borders
|
| n = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty - 1) * BLOCK_SIZE + tx] - jc;
|
| s = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty + 1) * BLOCK_SIZE + tx] - jc;
|
| w = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx - 1] - jc;
|
| e = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx + 1] - jc;
|
| }
|
|
|
| g2 = (n * n + s * s + w * w + e * e) / (jc * jc);
|
|
|
| l = (n + s + w + e) / jc;
|
|
|
| num = (0.5 * g2) - ((1.0 / 16.0) * (l * l));
|
| den = 1 + (.25 * l);
|
| qsqr = num / (den * den);
|
|
|
| // diffusion coefficent (equ 33)
|
| den = (qsqr - q0sqr) / (q0sqr * (1 + q0sqr));
|
| c = 1.0 / (1.0 + den);
|
|
|
| // saturate diffusion coefficent
|
| if (c < 0)
|
| {
|
| temp_result[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = 0;
|
| }
|
| else if (c > 1)
|
| {
|
| temp_result[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = 1;
|
| }
|
| else
|
| {
|
| temp_result[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = c;
|
| }
|
| block.sync();
|
|
|
| C_cuda[index] = temp_result[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx];
|
| E_C[index] = e;
|
| W_C[index] = w;
|
| S_C[index] = s;
|
| N_C[index] = n;
|
| }
|
| }
|
|
|
|
|
|
|
| __global__ void
|
| srad_cuda_2(
|
| float *E_C,
|
| float *W_C,
|
| float *N_C,
|
| float *S_C,
|
| float *J_cuda,
|
| float *C_cuda,
|
| int cols,
|
| int rows,
|
| float lambda,
|
| float q0sqr,
|
| int block_size)
|
| {
|
| cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
|
| pipeline pipe;
|
| // shared memory allocation
|
| __shared__ float south_c[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
|
| __shared__ float east_c[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
|
|
|
| __shared__ float c_cuda_temp[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
|
| __shared__ float c_cuda_result[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
|
| __shared__ float temp[BLOCK_SIZE * BLOCK_SIZE * PREFETCH_COUNT];
|
|
|
| int tile_dim_x = cols / BLOCK_SIZE;
|
|
|
| int total_tiles = tile_dim_x * tile_dim_x;
|
| int tiles_this_block = (block_size / BLOCK_SIZE) * (block_size / BLOCK_SIZE);
|
| int tiles_this_block_x = (block_size / BLOCK_SIZE);
|
|
|
| int base_tile = (blockIdx.y * gridDim.x + blockIdx.x) * tiles_this_block;
|
| int fetch = base_tile;
|
| int end_tile = fetch + tiles_this_block;
|
|
|
| for (int compute = fetch; compute < end_tile; compute++)
|
| {
|
| // thread id
|
| int tx = threadIdx.x;
|
| int ty = threadIdx.y;
|
|
|
| for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
|
| {
|
| // block id
|
| int offset = fetch - base_tile;
|
| int block_id = fetch / tiles_this_block;
|
| int bx = block_id % gridDim.x * tiles_this_block_x + offset % tiles_this_block_x;
|
| int by = block_id / gridDim.x * tiles_this_block_x + offset / tiles_this_block_x;
|
|
|
| // indices
|
| int index = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx;
|
| int index_s = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * BLOCK_SIZE + tx;
|
| int index_e = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + BLOCK_SIZE;
|
|
|
| if (index_s >= (cols * rows)) index_s = cols * rows - 1;
|
| if (index_e >= (cols * rows)) index_e = cols * rows - 1;
|
|
|
| // load data to shared memory
|
| temp[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = J_cuda[index];
|
| block.sync();
|
|
|
| south_c[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = C_cuda[index_s];
|
| if (by == tile_dim_x - 1)
|
| {
|
| south_c[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = C_cuda[cols * BLOCK_SIZE * (tile_dim_x - 1) + BLOCK_SIZE * bx + cols * (BLOCK_SIZE - 1) + tx];
|
| }
|
| block.sync();
|
|
|
| east_c[ty * BLOCK_SIZE + tx] = C_cuda[index_e];
|
| if (bx == tile_dim_x - 1)
|
| {
|
| east_c[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = C_cuda[cols * BLOCK_SIZE * by + BLOCK_SIZE * (tile_dim_x - 1) + cols * ty + BLOCK_SIZE - 1];
|
| }
|
| block.sync();
|
|
|
| c_cuda_temp[(fetch % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = C_cuda[index];
|
| block.sync();
|
| }
|
| if (fetch == end_tile)
|
| {
|
| for (int i = 0; i < PREFETCH_COUNT - 1; ++i)
|
| {
|
| pipe.commit();
|
| }
|
| ++fetch;
|
| }
|
| pipe.wait_prior<PREFETCH_COUNT - 1>();
|
| block.sync();
|
|
|
| // block id
|
| int bx = compute % tile_dim_x;
|
| int by = compute / tile_dim_x;
|
|
|
| // indices
|
| int index = cols * BLOCK_SIZE * by + BLOCK_SIZE * bx + cols * ty + tx;
|
|
|
| float cc, cn, cs, ce, cw, d_sum;
|
| cc = c_cuda_temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx];
|
|
|
| if (ty == BLOCK_SIZE - 1 && tx == BLOCK_SIZE - 1)
|
| { // se
|
| cn = cc;
|
| cs = south_c[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx];
|
| cw = cc;
|
| ce = east_c[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx];
|
| }
|
| else if (tx == BLOCK_SIZE - 1)
|
| { // e
|
| cn = cc;
|
| cs = c_cuda_temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty + 1) * BLOCK_SIZE + tx];
|
| cw = cc;
|
| ce = east_c[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx];
|
| }
|
| else if (ty == BLOCK_SIZE - 1)
|
| { // s
|
| cn = cc;
|
| cs = south_c[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx];
|
| cw = cc;
|
| ce = c_cuda_temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx + 1];
|
| }
|
| else
|
| { // the data elements which are not on the borders
|
| cn = cc;
|
| cs = c_cuda_temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + (ty + 1) * BLOCK_SIZE + tx];
|
| cw = cc;
|
| ce = c_cuda_temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx + 1];
|
| }
|
|
|
| // divergence (equ 58)
|
| d_sum = cn * N_C[index] + cs * S_C[index] + cw * W_C[index] + ce * E_C[index];
|
|
|
| // image update (equ 61)
|
| c_cuda_result[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] = temp[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx] + 0.25 * lambda * d_sum;
|
|
|
| block.sync();
|
|
|
| J_cuda[index] = c_cuda_result[(compute % PREFETCH_COUNT) * BLOCK_SIZE * BLOCK_SIZE + ty * BLOCK_SIZE + tx];
|
| }
|
| } |