// SP overlap kernel.
//
// For each column c (one CUDA block), compute:
//   overlap[c] = sum over its synapse list of {inp[bit[c][s]] && perm[c][s] >= conn_thr}
//   boosted[c] = overlap[c] * boost[c]
//   raw_overlap[c] = overlap[c]   (also returned so host can drive duty cycle)
//
// Memory layout (flat, column-major with per-column stride = synapses_per_col):
//   syn_bit[c * S + s]  : u32 index into input SDR
//   syn_perm[c * S + s] : f32 permanence in [0, 1]
//   boost[c]           : f32
//   inp[b]             : u8 0/1
// Output:
//   raw[c]     : u32
//   boosted[c] : f32
//
// Launch:
//   grid  = n_columns
//   block = 128 (or 256) — one warp-sweep across synapses; many warps give
//                          parallel reduction across S (typically S=40).
//
// At S=40 this is completely latency-bound; we coalesce reads and do a
// warp-shuffle reduction. For clarity we use a simple block-wide shared-mem
// reduction which is sufficient for S <= 1024 and has zero correctness risk.

extern "C" __global__
void sp_overlap(
    const unsigned char * __restrict__ inp,     // (input_bits,)
    const unsigned int  * __restrict__ syn_bit, // (n_columns * S,)
    const float         * __restrict__ syn_perm,// (n_columns * S,)
    const float         * __restrict__ boost,   // (n_columns,)
    float                 conn_thr,
    unsigned int          synapses_per_col,     // S
    unsigned int          n_columns,
    unsigned int        * __restrict__ raw_out,     // (n_columns,)
    float               * __restrict__ boosted_out  // (n_columns,)
) {
    const unsigned int c = blockIdx.x;
    if (c >= n_columns) return;

    const unsigned int base = c * synapses_per_col;
    const unsigned int tid = threadIdx.x;
    const unsigned int bsz = blockDim.x;

    // Per-thread partial count.
    unsigned int local = 0;
    for (unsigned int s = tid; s < synapses_per_col; s += bsz) {
        unsigned int b = syn_bit[base + s];
        float p = syn_perm[base + s];
        // Branchless: only counts when input active AND perm connected.
        // Using (inp != 0) to tolerate u8 layout.
        unsigned int hit = ((inp[b] != 0) && (p >= conn_thr)) ? 1u : 0u;
        local += hit;
    }

    // Block-wide reduction in shared memory.
    __shared__ unsigned int smem[32];

    // Warp-level reduction via shuffle.
    unsigned int lane = tid & 31;
    unsigned int warp = tid >> 5;
    for (int off = 16; off > 0; off >>= 1) {
        local += __shfl_down_sync(0xffffffff, local, off);
    }
    if (lane == 0) smem[warp] = local;
    __syncthreads();

    if (warp == 0) {
        unsigned int v = (tid < (bsz + 31) / 32) ? smem[lane] : 0;
        for (int off = 16; off > 0; off >>= 1) {
            v += __shfl_down_sync(0xffffffff, v, off);
        }
        if (tid == 0) {
            raw_out[c] = v;
            boosted_out[c] = (float)v * boost[c];
        }
    }
}