UvmPinAsync / workloads /realworld /async /backprop /backprop_cuda_kernel.cu.bp
lrh12580
first commit
5cb6c4b
#ifndef _BACKPROP_CUDA_KERNEL_H_
#define _BACKPROP_CUDA_KERNEL_H_
#include <stdio.h>
#include "backprop.h"
#include "math.h"
#include "cuda.h"
__global__ void
bpnn_layerforward_CUDA(float *input_cuda,
float *output_hidden_cuda,
float *input_hidden_cuda,
float *hidden_partial_sum,
int in,
int hid)
{
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int index = ( hid + 1 ) * HEIGHT * by + ( hid + 1 ) * ty + tx + 1 + ( hid + 1 ) ;
int index_in = HEIGHT * by + ty + 1;
__shared__ float input_node[HEIGHT];
__shared__ float weight_matrix[HEIGHT][WIDTH];
if ( tx == 0 )
input_node[ty] = input_cuda[index_in] ;
__syncthreads();
weight_matrix[ty][tx] = input_hidden_cuda[index];
__syncthreads();
weight_matrix[ty][tx] = weight_matrix[ty][tx] * input_node[ty];
__syncthreads();
for ( int i = 1 ; i <= __log2f(HEIGHT) ; i++){
int power_two = __powf(2, i);
if( ty % power_two == 0 )
weight_matrix[ty][tx] = weight_matrix[ty][tx] + weight_matrix[ty + power_two/2][tx];
__syncthreads();
}
input_hidden_cuda[index] = weight_matrix[ty][tx];
__syncthreads();
if ( tx == 0 ) {
hidden_partial_sum[by * hid + ty] = weight_matrix[tx][ty];
}
}
__global__ void bpnn_adjust_weights_cuda(float * delta,
int hid,
float * ly,
int in,
float * w,
float * oldw)
{
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int index = ( hid + 1 ) * HEIGHT * by + ( hid + 1 ) * ty + tx + 1 + ( hid + 1 ) ;
int index_y = HEIGHT * by + ty + 1;
int index_x = tx + 1;
//eta = 0.3;
//momentum = 0.3;
w[index] += ((ETA * delta[index_x] * ly[index_y]) + (MOMENTUM * oldw[index]));
oldw[index] = ((ETA * delta[index_x] * ly[index_y]) + (MOMENTUM * oldw[index]));
__syncthreads();
if (ty == 0 && by ==0){
w[index_x] += ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x]));
oldw[index_x] = ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x]));
}
}
#endif