| // SP overlap kernel. | |
| // | |
| // For each column c (one CUDA block), compute: | |
| // overlap[c] = sum over its synapse list of {inp[bit[c][s]] && perm[c][s] >= conn_thr} | |
| // boosted[c] = overlap[c] * boost[c] | |
| // raw_overlap[c] = overlap[c] (also returned so host can drive duty cycle) | |
| // | |
| // Memory layout (flat, column-major with per-column stride = synapses_per_col): | |
| // syn_bit[c * S + s] : u32 index into input SDR | |
| // syn_perm[c * S + s] : f32 permanence in [0, 1] | |
| // boost[c] : f32 | |
| // inp[b] : u8 0/1 | |
| // Output: | |
| // raw[c] : u32 | |
| // boosted[c] : f32 | |
| // | |
| // Launch: | |
| // grid = n_columns | |
| // block = 128 (or 256) — one warp-sweep across synapses; many warps give | |
| // parallel reduction across S (typically S=40). | |
| // | |
| // At S=40 this is completely latency-bound; we coalesce reads and do a | |
| // warp-shuffle reduction. For clarity we use a simple block-wide shared-mem | |
| // reduction which is sufficient for S <= 1024 and has zero correctness risk. | |
| extern "C" __global__ | |
| void sp_overlap( | |
| const unsigned char * __restrict__ inp, // (input_bits,) | |
| const unsigned int * __restrict__ syn_bit, // (n_columns * S,) | |
| const float * __restrict__ syn_perm,// (n_columns * S,) | |
| const float * __restrict__ boost, // (n_columns,) | |
| float conn_thr, | |
| unsigned int synapses_per_col, // S | |
| unsigned int n_columns, | |
| unsigned int * __restrict__ raw_out, // (n_columns,) | |
| float * __restrict__ boosted_out // (n_columns,) | |
| ) { | |
| const unsigned int c = blockIdx.x; | |
| if (c >= n_columns) return; | |
| const unsigned int base = c * synapses_per_col; | |
| const unsigned int tid = threadIdx.x; | |
| const unsigned int bsz = blockDim.x; | |
| // Per-thread partial count. | |
| unsigned int local = 0; | |
| for (unsigned int s = tid; s < synapses_per_col; s += bsz) { | |
| unsigned int b = syn_bit[base + s]; | |
| float p = syn_perm[base + s]; | |
| // Branchless: only counts when input active AND perm connected. | |
| // Using (inp != 0) to tolerate u8 layout. | |
| unsigned int hit = ((inp[b] != 0) && (p >= conn_thr)) ? 1u : 0u; | |
| local += hit; | |
| } | |
| // Block-wide reduction in shared memory. | |
| __shared__ unsigned int smem[32]; | |
| // Warp-level reduction via shuffle. | |
| unsigned int lane = tid & 31; | |
| unsigned int warp = tid >> 5; | |
| for (int off = 16; off > 0; off >>= 1) { | |
| local += __shfl_down_sync(0xffffffff, local, off); | |
| } | |
| if (lane == 0) smem[warp] = local; | |
| __syncthreads(); | |
| if (warp == 0) { | |
| unsigned int v = (tid < (bsz + 31) / 32) ? smem[lane] : 0; | |
| for (int off = 16; off > 0; off >>= 1) { | |
| v += __shfl_down_sync(0xffffffff, v, off); | |
| } | |
| if (tid == 0) { | |
| raw_out[c] = v; | |
| boosted_out[c] = (float)v * boost[c]; | |
| } | |
| } | |
| } | |