UvmPinAsync / workloads /realworld /async /backprop /backprop_cuda_kernel.cu

lrh12580

first commit

5cb6c4b about 2 years ago

5.61 kB



	#ifndef _BACKPROP_CUDA_KERNEL_H_
	#define _BACKPROP_CUDA_KERNEL_H_

	#include <stdio.h>
	#include "backprop.h"
	#include "math.h"
	#include "cuda.h"

	#include <cooperative_groups.h>
	#include <cooperative_groups/memcpy_async.h>

	using namespace nvcuda::experimental;

	#define PREFETCH_COUNT 2

	__global__ void
	bpnn_layerforward_CUDA(float *input_cuda,
	float *output_hidden_cuda,
	float *input_hidden_cuda,
	float *hidden_partial_sum,
	int in,
	int hid,
	int tile_size)
	{
	cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
	pipeline pipe;

	int by = blockIdx.y;
	int tx = threadIdx.x;
	int ty = threadIdx.y;

	__shared__ float input_node[HEIGHT * PREFETCH_COUNT];
	__shared__ float weight_matrix[HEIGHT * WIDTH * PREFETCH_COUNT];

	int batches = tile_size / WIDTH;

	int fetch = batches * by;
	int end_tile = fetch + batches;

	for (int compute = fetch; compute < end_tile; compute++)
	{
	for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++)
	{
	int fetch_index = (hid + 1) * HEIGHT * fetch + (hid + 1) * ty + tx + 1 + (hid + 1);
	int index_in = HEIGHT * fetch + ty + 1;

	if (tx == 0)
	memcpy_async(input_node[(fetch % PREFETCH_COUNT) * HEIGHT + ty], input_cuda[index_in], pipe);

	memcpy_async(weight_matrix[(fetch % PREFETCH_COUNT) * HEIGHT * WIDTH + ty * WIDTH + tx], input_hidden_cuda[fetch_index], pipe);
	pipe.commit();
	}
	if (fetch == end_tile)
	{
	for (int i = 0; i < PREFETCH_COUNT - 1; ++i)
	{
	pipe.commit();
	}
	++fetch;
	}
	pipe.wait_prior<PREFETCH_COUNT - 1>();
	block.sync();



	int compute_index = (hid + 1) * HEIGHT * compute + (hid + 1) * ty + tx + 1 + (hid + 1);
	weight_matrix[(compute % PREFETCH_COUNT) * HEIGHT * WIDTH + ty * WIDTH + tx] = input_node[(compute % PREFETCH_COUNT) HEIGHT + ty];
	block.sync();

	for (int i = 1; i <= __log2f(HEIGHT); i++) {
	int power_two = __powf(2, i);
	if (ty % power_two == 0)
	weight_matrix[(compute % PREFETCH_COUNT) * HEIGHT * WIDTH + ty * WIDTH + tx] += weight_matrix[(compute % PREFETCH_COUNT) * HEIGHT * WIDTH + (ty + power_two / 2) * WIDTH + tx];
	block.sync();
	}

	input_hidden_cuda[compute_index] = weight_matrix[(compute % PREFETCH_COUNT) * HEIGHT * WIDTH + ty * WIDTH + tx];
	block.sync();

	if (tx == 0)
	{
	hidden_partial_sum[compute * hid + ty] = weight_matrix[(compute % PREFETCH_COUNT) * HEIGHT * WIDTH + tx * WIDTH + ty];
	}
	}
	}

	// __global__ void
	// bpnn_layerforward_CUDA(float *input_cuda,
	// float *output_hidden_cuda,
	// float *input_hidden_cuda,
	// float *hidden_partial_sum,
	// int in,
	// int hid,
	// int tile_size)
	// {
	// int by = blockIdx.y;
	// int tx = threadIdx.x;
	// int ty = threadIdx.y;

	// int batches = tile_size / WIDTH;

	// __shared__ float input_node[HEIGHT];
	// __shared__ float weight_matrix[HEIGHT * WIDTH];

	// for (int b = 0; b < batches; b++)
	// {
	// int index = (hid + 1) * HEIGHT * (batches * by + b) + (hid + 1) * ty + tx + 1 + (hid + 1);

	// int index_in = HEIGHT * (batches * by + b) + ty + 1;

	// if (tx == 0)
	// input_node[ty] = input_cuda[index_in];

	// __syncthreads();

	// weight_matrix[ty * WIDTH + tx] = input_hidden_cuda[index];

	// __syncthreads();

	// weight_matrix[ty * WIDTH + tx] = weight_matrix[ty * WIDTH + tx] * input_node[ty];

	// __syncthreads();

	// for (int i = 1; i <= __log2f(HEIGHT); i++)
	// {

	// int power_two = __powf(2, i);

	// if (ty % power_two == 0)
	// weight_matrix[ty * WIDTH + tx] += weight_matrix[(ty + power_two / 2) * WIDTH + tx];

	// __syncthreads();
	// }

	// input_hidden_cuda[index] = weight_matrix[ty * WIDTH + tx];

	// __syncthreads();

	// if (tx == 0)
	// {
	// hidden_partial_sum[(batches * by + b) * hid + ty] = weight_matrix[tx * WIDTH + ty];
	// }
	// }
	// }

	__global__ void bpnn_adjust_weights_cuda(float * delta,
	int hid,
	float * ly,
	int in,
	float * w,
	float * oldw,
	int tile_size)
	{
	int by = blockIdx.y;

	int tx = threadIdx.x;
	int ty = threadIdx.y;

	int batches = tile_size / WIDTH;

	for (int b = 0; b < batches; b++) {
	int index = (hid + 1) * HEIGHT * (batches * by + b) + (hid + 1) * ty + tx + 1 + (hid + 1);
	int index_y = HEIGHT * (batches * by + b) + ty + 1;
	int index_x = tx + 1;
	// eta = 0.3;
	// momentum = 0.3;

	w[index] += ((ETA * delta[index_x] * ly[index_y]) + (MOMENTUM * oldw[index]));
	oldw[index] = ((ETA * delta[index_x] * ly[index_y]) + (MOMENTUM * oldw[index]));

	__syncthreads();

	if (ty == 0 && by == 0 && b == 0)
	{
	w[index_x] += ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x]));
	oldw[index_x] = ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x]));
	}
	}
	}
	#endif