| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
// Sums `v` over the lanes of one warp; the total is valid in lane 0 only.
//
//   v      - this lane's contribution.
//   lane   - this thread's lane index within its warp (0..31).
//   active - number of lanes that exist in this warp (1..32).
//
// The participation mask names only the lanes that exist, and a shuffled
// value is added only when its source lane is in range. A partially filled
// warp therefore never folds an undefined value into the sum.
static __device__ __forceinline__ float sp_boost_warp_sum(
    float v,
    unsigned int lane,
    unsigned int active
) {
    const unsigned int mask =
        (active >= 32u) ? 0xffffffffu : ((1u << active) - 1u);
    for (unsigned int off = 16u; off > 0u; off >>= 1) {
        const float other = __shfl_down_sync(mask, v, off);
        if (lane + off < active) {
            v += other;
        }
    }
    return v;
}

// Computes boost[i] = expf(-boost_strength * (active_duty[i] - mean)) for
// i in [0, n), where mean is the arithmetic mean of active_duty[0..n).
//
// Parameters:
//   active_duty    - input array of n floats (read-only).
//   boost          - output array of n floats; must not alias active_duty
//                    (both pointers are __restrict__).
//   boost_strength - scale applied to the deviation from the mean.
//   n              - number of elements; n == 0 is a no-op.
//
// Launch contract:
//   * Indexing uses only threadIdx.x / blockDim.x, so every block walks the
//     whole array by itself. NOTE(review): this looks intended for a single
//     1-D block; extra blocks would only repeat the same work - confirm
//     against the launcher.
//   * Dynamic shared memory: at least ceil(blockDim.x / 32) floats, one slot
//     per warp for the partial sums.
//   * blockDim.x does not have to be a multiple of 32.
extern "C" __global__
void sp_boost_from_duty(
    const float * __restrict__ active_duty,
    float * __restrict__ boost,
    float boost_strength,
    unsigned int n
) {
    extern __shared__ float smem_raw[];
    const unsigned int tid = threadIdx.x;
    const unsigned int bsz = blockDim.x;
    const unsigned int lane = tid & 31u;
    const unsigned int warp = tid >> 5;

    // n is the same for every thread, so the whole block leaves together and
    // no thread is left waiting at a barrier. This also avoids the 0/0 mean.
    if (n == 0u) {
        return;
    }

    // Number of lanes that exist in this thread's warp; only the last warp
    // of the block can have fewer than 32.
    const unsigned int remaining = bsz - (warp << 5);
    const unsigned int lanes_here = (remaining < 32u) ? remaining : 32u;

    // Phase 1: each thread accumulates a strided partial sum, then every warp
    // reduces its partials and lane 0 publishes the warp total.
    float local_sum = 0.0f;
    for (unsigned int i = tid; i < n; i += bsz) {
        local_sum += active_duty[i];
    }
    local_sum = sp_boost_warp_sum(local_sum, lane, lanes_here);
    if (lane == 0u) {
        smem_raw[warp] = local_sum;
    }
    __syncthreads();

    // Phase 2: warp 0 reduces the per-warp totals and derives the mean.
    // A block has at most 1024 threads, i.e. at most 32 warps, so one warp
    // can hold every total. Every lane with lane < nwarps exists in warp 0.
    __shared__ float mean_s;
    if (warp == 0u) {
        const unsigned int nwarps = (bsz + 31u) / 32u;
        float v = (lane < nwarps) ? smem_raw[lane] : 0.0f;
        v = sp_boost_warp_sum(v, lane, lanes_here);
        if (tid == 0u) {
            mean_s = v / (float)n;
        }
    }
    __syncthreads();

    // Phase 3: every thread reads the shared mean and writes its outputs.
    const float mean = mean_s;
    for (unsigned int i = tid; i < n; i += bsz) {
        const float d = active_duty[i] - mean;
        boost[i] = expf(-boost_strength * d);
    }
}
|
|