icarus112's picture
Upload folder using huggingface_hub
1c59946 verified
// TM predict kernel — cell-grouped launch.
//
// Grid: n_cells blocks (one per cell).
// Block: 32 threads (one warp).
//
// Each block iterates the segments owned by its cell (count in cell_seg_count[cell]).
// For each live segment, counts active connected/potential synapses against
// prev_active_bits. Updates per-segment counters, cell_predictive bit, and
// col_predicted flag.
struct TmConfig {
unsigned int activation_threshold;
unsigned int learning_threshold;
unsigned int cells_per_column;
unsigned int synapses_per_segment;
unsigned int n_segments;
unsigned int n_cells;
unsigned int max_segments_per_cell;
unsigned int max_new_synapses;
int conn_thr_i16;
int perm_inc_i16;
int perm_dec_i16;
int predicted_seg_dec_i16;
int initial_perm_i16;
unsigned int iter_seed;
unsigned int n_cols;
unsigned int bits_words;
};
extern "C" __global__
void tm_predict(
const unsigned int * __restrict__ seg_cell_id,
const unsigned int * __restrict__ seg_syn_count,
const unsigned int * __restrict__ syn_presyn,
const short * __restrict__ syn_perm,
const unsigned int * __restrict__ cell_active_bits,
unsigned int * __restrict__ cell_predictive_bits,
unsigned char * __restrict__ col_predicted,
unsigned int * __restrict__ seg_num_active_connected,
unsigned int * __restrict__ seg_num_active_potential,
unsigned int * __restrict__ col_best_match,
const unsigned int * __restrict__ cell_seg_count,
TmConfig cfg
) {
const unsigned int cell = blockIdx.x;
if (cell >= cfg.n_cells) return;
const unsigned int n_segs_here = min(cell_seg_count[cell], cfg.max_segments_per_cell);
if (n_segs_here == 0) return;
const unsigned int tid = threadIdx.x;
const unsigned int col = cell / cfg.cells_per_column;
const unsigned int seg_base_id = cell * cfg.max_segments_per_cell;
for (unsigned int local_seg = 0; local_seg < n_segs_here; local_seg++) {
const unsigned int seg = seg_base_id + local_seg;
const unsigned int n_syn = seg_syn_count[seg];
if (n_syn == 0) {
if (tid == 0) {
seg_num_active_connected[seg] = 0;
seg_num_active_potential[seg] = 0;
}
continue;
}
const unsigned int syn_base = seg * cfg.synapses_per_segment;
unsigned int local_conn = 0;
unsigned int local_pot = 0;
for (unsigned int s = tid; s < n_syn; s += 32u) {
unsigned int presyn = syn_presyn[syn_base + s];
unsigned int word = cell_active_bits[presyn >> 5];
unsigned int bit = (word >> (presyn & 31u)) & 1u;
if (bit) {
local_pot += 1u;
int p = (int)syn_perm[syn_base + s];
if (p >= cfg.conn_thr_i16) {
local_conn += 1u;
}
}
}
for (int off = 16; off > 0; off >>= 1) {
local_conn += __shfl_down_sync(0xffffffffu, local_conn, off);
local_pot += __shfl_down_sync(0xffffffffu, local_pot, off);
}
if (tid == 0) {
seg_num_active_connected[seg] = local_conn;
seg_num_active_potential[seg] = local_pot;
if (local_conn >= cfg.activation_threshold) {
unsigned int word_idx = cell >> 5;
unsigned int bit_mask = 1u << (cell & 31u);
atomicOr(&cell_predictive_bits[word_idx], bit_mask);
col_predicted[col] = 1;
}
if (local_pot >= cfg.learning_threshold) {
unsigned int pot_c = local_pot > 2047u ? 2047u : local_pot;
unsigned int key = (pot_c << 21) | (seg & 0x1FFFFFu);
atomicMax(&col_best_match[col], key);
}
}
}
}