| // SP Hebbian learning kernel. | |
| // | |
| // For each active (winner) column c, for each of its synapses s: | |
| // if input[bit[c][s]] active: perm += inc | |
| // else: perm -= dec | |
| // Clamp to [0, 1]. | |
| // | |
| // Launch: one block per column (2048 blocks), but we predicate on | |
| // active_mask[c] to avoid launching k-specific blocks. | |
| // | |
| // This matches the CPU reference line-for-line: | |
| // src/sp.rs lines 157-169. | |
| extern "C" __global__ | |
| void sp_learn( | |
| const unsigned char * __restrict__ active_mask, // (n_columns,) 0/1 | |
| const unsigned char * __restrict__ inp, // (input_bits,) | |
| const unsigned int * __restrict__ syn_bit, // (n_columns * S,) | |
| float * __restrict__ syn_perm, // (n_columns * S,) in-place | |
| float inc, | |
| float dec, | |
| unsigned int synapses_per_col, | |
| unsigned int n_columns | |
| ) { | |
| const unsigned int c = blockIdx.x; | |
| if (c >= n_columns) return; | |
| if (active_mask[c] == 0) return; | |
| const unsigned int base = c * synapses_per_col; | |
| const unsigned int tid = threadIdx.x; | |
| const unsigned int bsz = blockDim.x; | |
| for (unsigned int s = tid; s < synapses_per_col; s += bsz) { | |
| unsigned int b = syn_bit[base + s]; | |
| float p = syn_perm[base + s]; | |
| if (inp[b] != 0) { | |
| p += inc; | |
| if (p > 1.0f) p = 1.0f; | |
| } else { | |
| p -= dec; | |
| if (p < 0.0f) p = 0.0f; | |
| } | |
| syn_perm[base + s] = p; | |
| } | |
| } | |