icarus112's picture
Upload folder using huggingface_hub
1c59946 verified
// SP overlap kernel.
//
// For each column c (one CUDA block), compute:
// overlap[c] = sum over its synapse list of {inp[bit[c][s]] && perm[c][s] >= conn_thr}
// boosted[c] = overlap[c] * boost[c]
// raw_overlap[c] = overlap[c] (also returned so host can drive duty cycle)
//
// Memory layout (flat, column-major with per-column stride = synapses_per_col):
// syn_bit[c * S + s] : u32 index into input SDR
// syn_perm[c * S + s] : f32 permanence in [0, 1]
// boost[c] : f32
// inp[b] : u8 0/1
// Output:
// raw[c] : u32
// boosted[c] : f32
//
// Launch:
// grid = n_columns
// block = 128 (or 256) — one warp-sweep across synapses; many warps give
// parallel reduction across S (typically S=40).
//
// At S=40 this is completely latency-bound; we coalesce reads and do a
// warp-shuffle reduction. For clarity we use a simple block-wide shared-mem
// reduction which is sufficient for S <= 1024 and has zero correctness risk.
extern "C" __global__
void sp_overlap(
const unsigned char * __restrict__ inp, // (input_bits,)
const unsigned int * __restrict__ syn_bit, // (n_columns * S,)
const float * __restrict__ syn_perm,// (n_columns * S,)
const float * __restrict__ boost, // (n_columns,)
float conn_thr,
unsigned int synapses_per_col, // S
unsigned int n_columns,
unsigned int * __restrict__ raw_out, // (n_columns,)
float * __restrict__ boosted_out // (n_columns,)
) {
const unsigned int c = blockIdx.x;
if (c >= n_columns) return;
const unsigned int base = c * synapses_per_col;
const unsigned int tid = threadIdx.x;
const unsigned int bsz = blockDim.x;
// Per-thread partial count.
unsigned int local = 0;
for (unsigned int s = tid; s < synapses_per_col; s += bsz) {
unsigned int b = syn_bit[base + s];
float p = syn_perm[base + s];
// Branchless: only counts when input active AND perm connected.
// Using (inp != 0) to tolerate u8 layout.
unsigned int hit = ((inp[b] != 0) && (p >= conn_thr)) ? 1u : 0u;
local += hit;
}
// Block-wide reduction in shared memory.
__shared__ unsigned int smem[32];
// Warp-level reduction via shuffle.
unsigned int lane = tid & 31;
unsigned int warp = tid >> 5;
for (int off = 16; off > 0; off >>= 1) {
local += __shfl_down_sync(0xffffffff, local, off);
}
if (lane == 0) smem[warp] = local;
__syncthreads();
if (warp == 0) {
unsigned int v = (tid < (bsz + 31) / 32) ? smem[lane] : 0;
for (int off = 16; off > 0; off >>= 1) {
v += __shfl_down_sync(0xffffffff, v, off);
}
if (tid == 0) {
raw_out[c] = v;
boosted_out[c] = (float)v * boost[c];
}
}
}