// SP overlap kernel. // // For each column c (one CUDA block), compute: // overlap[c] = sum over its synapse list of {inp[bit[c][s]] && perm[c][s] >= conn_thr} // boosted[c] = overlap[c] * boost[c] // raw_overlap[c] = overlap[c] (also returned so host can drive duty cycle) // // Memory layout (flat, column-major with per-column stride = synapses_per_col): // syn_bit[c * S + s] : u32 index into input SDR // syn_perm[c * S + s] : f32 permanence in [0, 1] // boost[c] : f32 // inp[b] : u8 0/1 // Output: // raw[c] : u32 // boosted[c] : f32 // // Launch: // grid = n_columns // block = 128 (or 256) — one warp-sweep across synapses; many warps give // parallel reduction across S (typically S=40). // // At S=40 this is completely latency-bound; we coalesce reads and do a // warp-shuffle reduction. For clarity we use a simple block-wide shared-mem // reduction which is sufficient for S <= 1024 and has zero correctness risk. extern "C" __global__ void sp_overlap( const unsigned char * __restrict__ inp, // (input_bits,) const unsigned int * __restrict__ syn_bit, // (n_columns * S,) const float * __restrict__ syn_perm,// (n_columns * S,) const float * __restrict__ boost, // (n_columns,) float conn_thr, unsigned int synapses_per_col, // S unsigned int n_columns, unsigned int * __restrict__ raw_out, // (n_columns,) float * __restrict__ boosted_out // (n_columns,) ) { const unsigned int c = blockIdx.x; if (c >= n_columns) return; const unsigned int base = c * synapses_per_col; const unsigned int tid = threadIdx.x; const unsigned int bsz = blockDim.x; // Per-thread partial count. unsigned int local = 0; for (unsigned int s = tid; s < synapses_per_col; s += bsz) { unsigned int b = syn_bit[base + s]; float p = syn_perm[base + s]; // Branchless: only counts when input active AND perm connected. // Using (inp != 0) to tolerate u8 layout. unsigned int hit = ((inp[b] != 0) && (p >= conn_thr)) ? 1u : 0u; local += hit; } // Block-wide reduction in shared memory. __shared__ unsigned int smem[32]; // Warp-level reduction via shuffle. unsigned int lane = tid & 31; unsigned int warp = tid >> 5; for (int off = 16; off > 0; off >>= 1) { local += __shfl_down_sync(0xffffffff, local, off); } if (lane == 0) smem[warp] = local; __syncthreads(); if (warp == 0) { unsigned int v = (tid < (bsz + 31) / 32) ? smem[lane] : 0; for (int off = 16; off > 0; off >>= 1) { v += __shfl_down_sync(0xffffffff, v, off); } if (tid == 0) { raw_out[c] = v; boosted_out[c] = (float)v * boost[c]; } } }