// SP Hebbian learning kernel.
//
// For each active (winner) column c, for each of its synapses s:
//   if input[bit[c][s]] active: perm += inc
//   else:                       perm -= dec
// Clamp to [0, 1].
//
// Launch: one block per column (2048 blocks). Each block checks
// active_mask[c] and does nothing for an inactive column, so the grid size
// does not depend on how many columns are active.
//
// This matches the CPU reference line-for-line:
//   src/sp.rs lines 157-169.

// Applies the Hebbian permanence update to every active column, in place.
//
// For each column c with active_mask[c] != 0 and each synapse s of that
// column, the kernel reads the input-bit index b = syn_bit[c * S + s]
// (S = synapses_per_col). If inp[b] is non-zero the permanence is raised by
// `inc` and capped at 1.0f; otherwise it is lowered by `dec` and floored at
// 0.0f. Columns whose mask byte is 0 are left untouched.
//
// Launch layout: 1-D grid, 1-D block. Blocks stride over columns by
// gridDim.x and threads stride over a column's synapses by blockDim.x, so
// any grid/block size covers every column; one block per column remains the
// natural configuration. No shared memory and no barriers are used.
//
// Preconditions (not checked here): every syn_bit entry must be a valid
// index into inp, and syn_bit / syn_perm must each hold
// n_columns * synapses_per_col elements.
extern "C" __global__
void sp_learn(
    const unsigned char * __restrict__ active_mask,  // (n_columns,) 0/1
    const unsigned char * __restrict__ inp,          // (input_bits,)
    const unsigned int  * __restrict__ syn_bit,      // (n_columns * S,)
    float               * __restrict__ syn_perm,     // (n_columns * S,) in-place
    float                 inc,
    float                 dec,
    unsigned int          synapses_per_col,
    unsigned int          n_columns
) {
    // All index arithmetic is done in 64 bits so that neither
    // c * synapses_per_col nor the loop increments can wrap around.
    const unsigned long long n_cols      = n_columns;
    const unsigned long long n_syn       = synapses_per_col;
    const unsigned long long col_stride  = gridDim.x;
    const unsigned long long lane_stride = blockDim.x;

    // Block-stride loop over columns: with gridDim.x >= n_columns each block
    // handles exactly one column, and smaller grids still visit all of them.
    for (unsigned long long c = blockIdx.x; c < n_cols; c += col_stride) {
        // c is uniform across the block, so the whole block skips together.
        if (active_mask[c] == 0) continue;

        const unsigned long long base = c * n_syn;

        // Threads of the block share the column's synapses; consecutive
        // threads touch consecutive elements of syn_bit / syn_perm.
        for (unsigned long long s = threadIdx.x; s < n_syn; s += lane_stride) {
            const unsigned long long idx = base + s;
            const unsigned int b = syn_bit[idx];
            float p = syn_perm[idx];
            if (inp[b] != 0) {
                // Connected input was active: reinforce, cap at 1.
                p += inc;
                if (p > 1.0f) p = 1.0f;
            } else {
                // Connected input was inactive: weaken, floor at 0.
                p -= dec;
                if (p < 0.0f) p = 0.0f;
            }
            syn_perm[idx] = p;
        }
    }
}