#ifndef _BACKPROP_CUDA_KERNEL_H_ #define _BACKPROP_CUDA_KERNEL_H_ #include #include "backprop.h" #include "math.h" #include "cuda.h" #include #include using namespace nvcuda::experimental; #define PREFETCH_COUNT 2 __global__ void bpnn_layerforward_CUDA(float *input_cuda, float *output_hidden_cuda, float *input_hidden_cuda, float *hidden_partial_sum, int in, int hid, int tile_size) { cooperative_groups::thread_block block = cooperative_groups::this_thread_block(); pipeline pipe; int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; __shared__ float input_node[HEIGHT * PREFETCH_COUNT]; __shared__ float weight_matrix[HEIGHT * WIDTH * PREFETCH_COUNT]; int batches = tile_size / WIDTH; int fetch = batches * by; int end_tile = fetch + batches; for (int compute = fetch; compute < end_tile; compute++) { for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++) { int fetch_index = (hid + 1) * HEIGHT * fetch + (hid + 1) * ty + tx + 1 + (hid + 1); int index_in = HEIGHT * fetch + ty + 1; if (tx == 0) memcpy_async(input_node[(fetch % PREFETCH_COUNT) * HEIGHT + ty], input_cuda[index_in], pipe); memcpy_async(weight_matrix[(fetch % PREFETCH_COUNT) * HEIGHT * WIDTH + ty * WIDTH + tx], input_hidden_cuda[fetch_index], pipe); pipe.commit(); } if (fetch == end_tile) { for (int i = 0; i < PREFETCH_COUNT - 1; ++i) { pipe.commit(); } ++fetch; } pipe.wait_prior(); block.sync(); int compute_index = (hid + 1) * HEIGHT * compute + (hid + 1) * ty + tx + 1 + (hid + 1); weight_matrix[(compute % PREFETCH_COUNT) * HEIGHT * WIDTH + ty * WIDTH + tx] *= input_node[(compute % PREFETCH_COUNT) * HEIGHT + ty]; block.sync(); for (int i = 1; i <= __log2f(HEIGHT); i++) { int power_two = __powf(2, i); if (ty % power_two == 0) weight_matrix[(compute % PREFETCH_COUNT) * HEIGHT * WIDTH + ty * WIDTH + tx] += weight_matrix[(compute % PREFETCH_COUNT) * HEIGHT * WIDTH + (ty + power_two / 2) * WIDTH + tx]; block.sync(); } input_hidden_cuda[compute_index] = weight_matrix[(compute % PREFETCH_COUNT) * HEIGHT * WIDTH + ty * WIDTH + tx]; block.sync(); if (tx == 0) { hidden_partial_sum[compute * hid + ty] = weight_matrix[(compute % PREFETCH_COUNT) * HEIGHT * WIDTH + tx * WIDTH + ty]; } } } // __global__ void // bpnn_layerforward_CUDA(float *input_cuda, // float *output_hidden_cuda, // float *input_hidden_cuda, // float *hidden_partial_sum, // int in, // int hid, // int tile_size) // { // int by = blockIdx.y; // int tx = threadIdx.x; // int ty = threadIdx.y; // int batches = tile_size / WIDTH; // __shared__ float input_node[HEIGHT]; // __shared__ float weight_matrix[HEIGHT * WIDTH]; // for (int b = 0; b < batches; b++) // { // int index = (hid + 1) * HEIGHT * (batches * by + b) + (hid + 1) * ty + tx + 1 + (hid + 1); // int index_in = HEIGHT * (batches * by + b) + ty + 1; // if (tx == 0) // input_node[ty] = input_cuda[index_in]; // __syncthreads(); // weight_matrix[ty * WIDTH + tx] = input_hidden_cuda[index]; // __syncthreads(); // weight_matrix[ty * WIDTH + tx] = weight_matrix[ty * WIDTH + tx] * input_node[ty]; // __syncthreads(); // for (int i = 1; i <= __log2f(HEIGHT); i++) // { // int power_two = __powf(2, i); // if (ty % power_two == 0) // weight_matrix[ty * WIDTH + tx] += weight_matrix[(ty + power_two / 2) * WIDTH + tx]; // __syncthreads(); // } // input_hidden_cuda[index] = weight_matrix[ty * WIDTH + tx]; // __syncthreads(); // if (tx == 0) // { // hidden_partial_sum[(batches * by + b) * hid + ty] = weight_matrix[tx * WIDTH + ty]; // } // } // } __global__ void bpnn_adjust_weights_cuda(float * delta, int hid, float * ly, int in, float * w, float * oldw, int tile_size) { int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; int batches = tile_size / WIDTH; for (int b = 0; b < batches; b++) { int index = (hid + 1) * HEIGHT * (batches * by + b) + (hid + 1) * ty + tx + 1 + (hid + 1); int index_y = HEIGHT * (batches * by + b) + ty + 1; int index_x = tx + 1; // eta = 0.3; // momentum = 0.3; w[index] += ((ETA * delta[index_x] * ly[index_y]) + (MOMENTUM * oldw[index])); oldw[index] = ((ETA * delta[index_x] * ly[index_y]) + (MOMENTUM * oldw[index])); __syncthreads(); if (ty == 0 && by == 0 && b == 0) { w[index_x] += ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x])); oldw[index_x] = ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x])); } } } #endif