Spaces:

Jackoatmon
/

feather-runtime

Runtime error

App Files Files Community

feather-runtime / overlay /htm_rust /src /gpu /kernels /tm_predict.cu

Jackoatmon

Update Feather H200 runtime: Nemotron streaming and HTM force-CPU canary fixes

c2bf4b6 verified 14 days ago

raw

history blame contribute delete

3.89 kB

	// TM predict kernel — cell-grouped launch.
	//
	// Grid: n_cells blocks (one per cell).
	// Block: 32 threads (one warp).
	//
	// Each block iterates the segments owned by its cell (count in cell_seg_count[cell]).
	// For each live segment, counts active connected/potential synapses against
	// prev_active_bits (passed to the kernel as cell_active_bits). Updates the
	// per-segment counters, the cell_predictive bit, the col_predicted flag, and
	// the per-column col_best_match key.

	// Parameter block passed by value to tm_predict.
	// NOTE(review): presumably mirrored by a host-side struct with the same field
	// order and types — confirm before reordering or resizing any field.
	// Fields marked "not read by tm_predict" are unused by the kernel in this file;
	// they are presumably consumed by other kernels sharing this struct (TODO confirm).
	struct TmConfig {
	// Minimum active-connected synapse count for a segment to mark its cell predictive.
	unsigned int activation_threshold;
	// Minimum active-potential synapse count for a segment to compete in col_best_match.
	unsigned int learning_threshold;
	// Divisor mapping a cell index to its column index (col = cell / cells_per_column).
	unsigned int cells_per_column;
	// Stride, in synapse slots, between consecutive segments in syn_presyn / syn_perm.
	unsigned int synapses_per_segment;
	// Not read by tm_predict.
	unsigned int n_segments;
	// Number of cells; blocks with blockIdx.x >= n_cells exit immediately.
	unsigned int n_cells;
	// Stride, in segment slots, between consecutive cells; also caps cell_seg_count.
	unsigned int max_segments_per_cell;
	// Not read by tm_predict.
	unsigned int max_new_synapses;
	// Permanence threshold: a synapse counts as connected when its perm >= this value.
	int conn_thr_i16;
	// Not read by tm_predict.
	int perm_inc_i16;
	// Not read by tm_predict.
	int perm_dec_i16;
	// Not read by tm_predict.
	int predicted_seg_dec_i16;
	// Not read by tm_predict.
	int initial_perm_i16;
	// Not read by tm_predict.
	unsigned int iter_seed;
	// Not read by tm_predict.
	unsigned int n_cols;
	// Not read by tm_predict.
	unsigned int bits_words;
	};

	// tm_predict — scores every segment owned by one cell against the active-cell bitset.
	//
	// Launch contract (see file header): one block per cell, 32 threads (one warp) per
	// block. The full-mask warp shuffles below are only valid when all 32 lanes of the
	// warp execute them, so the block must be exactly one full warp.
	//
	// Parameters:
	//   seg_cell_id              - not read by this kernel.
	//   seg_syn_count            - per segment slot: number of synapses in use.
	//   syn_presyn               - per synapse slot: presynaptic cell index.
	//   syn_perm                 - per synapse slot: permanence (16-bit signed).
	//   cell_active_bits         - bitset of active cells, 32 cells per word.
	//   cell_predictive_bits     - bitset output; the cell's bit is OR-ed in when any
	//                              of its segments reaches activation_threshold.
	//   col_predicted            - per column flag, set to 1 when a cell in it is predictive.
	//   seg_num_active_connected - per segment output: active synapses with perm >= conn_thr_i16.
	//   seg_num_active_potential - per segment output: active synapses regardless of perm.
	//   col_best_match           - per column: atomicMax of a packed (potential count, segment id) key.
	//   cell_seg_count           - per cell: number of segment slots in use.
	//   cfg                      - thresholds and strides (see TmConfig).
	//
	// NOTE(review): cell_predictive_bits, col_predicted and col_best_match are only
	// raised here, never cleared — assumes the caller resets them before launch (TODO confirm).
	extern "C" __global__
	void tm_predict(
	    const unsigned int * __restrict__ seg_cell_id,
	    const unsigned int * __restrict__ seg_syn_count,
	    const unsigned int * __restrict__ syn_presyn,
	    const short * __restrict__ syn_perm,
	    const unsigned int * __restrict__ cell_active_bits,
	    unsigned int * __restrict__ cell_predictive_bits,
	    unsigned char * __restrict__ col_predicted,
	    unsigned int * __restrict__ seg_num_active_connected,
	    unsigned int * __restrict__ seg_num_active_potential,
	    unsigned int * __restrict__ col_best_match,
	    const unsigned int * __restrict__ cell_seg_count,
	    TmConfig cfg
	) {
	    // Both early exits depend only on blockIdx, so the whole warp leaves together.
	    const unsigned int cell = blockIdx.x;
	    if (cell >= cfg.n_cells) return;

	    const unsigned int n_segs_here = min(cell_seg_count[cell], cfg.max_segments_per_cell);
	    if (n_segs_here == 0) return;

	    const unsigned int tid = threadIdx.x;
	    const unsigned int col = cell / cfg.cells_per_column;
	    const unsigned int seg_base_id = cell * cfg.max_segments_per_cell;

	    for (unsigned int local_seg = 0; local_seg < n_segs_here; local_seg++) {
	        const unsigned int seg = seg_base_id + local_seg;

	        // Clamp to the per-segment slot capacity, mirroring the clamp applied to
	        // cell_seg_count above, so a bad count cannot read into the next segment.
	        const unsigned int n_syn = min(seg_syn_count[seg], cfg.synapses_per_segment);

	        // Every lane reads the same n_syn, so this branch is uniform across the warp
	        // and no lane skips the shuffles further down.
	        if (n_syn == 0) {
	            if (tid == 0) {
	                seg_num_active_connected[seg] = 0;
	                seg_num_active_potential[seg] = 0;
	            }
	            continue;
	        }

	        // 64-bit base: the product seg * synapses_per_segment can exceed 32 bits
	        // for large segment tables.
	        const unsigned long long syn_base =
	            (unsigned long long)seg * (unsigned long long)cfg.synapses_per_segment;

	        // Each lane handles synapses tid, tid + 32, tid + 64, ...
	        unsigned int local_conn = 0;
	        unsigned int local_pot = 0;
	        for (unsigned int s = tid; s < n_syn; s += 32u) {
	            const unsigned int presyn = syn_presyn[syn_base + s];
	            const unsigned int word = cell_active_bits[presyn >> 5];
	            const unsigned int bit = (word >> (presyn & 31u)) & 1u;
	            if (bit) {
	                local_pot += 1u;
	                const int p = (int)syn_perm[syn_base + s];
	                if (p >= cfg.conn_thr_i16) {
	                    local_conn += 1u;
	                }
	            }
	        }

	        // Warp tree reduction; after the loop lane 0 holds the segment totals.
	        for (int off = 16; off > 0; off >>= 1) {
	            local_conn += __shfl_down_sync(0xffffffffu, local_conn, off);
	            local_pot += __shfl_down_sync(0xffffffffu, local_pot, off);
	        }

	        if (tid == 0) {
	            seg_num_active_connected[seg] = local_conn;
	            seg_num_active_potential[seg] = local_pot;

	            if (local_conn >= cfg.activation_threshold) {
	                const unsigned int word_idx = cell >> 5;
	                const unsigned int bit_mask = 1u << (cell & 31u);
	                // Other blocks (cells) share this word, hence the atomic OR.
	                atomicOr(&cell_predictive_bits[word_idx], bit_mask);
	                // Cells of the same column may all store here; they store the same value.
	                col_predicted[col] = 1;
	            }

	            if (local_pot >= cfg.learning_threshold) {
	                // Key layout: bits 31..21 = potential count saturated at 2047,
	                // bits 20..0 = low 21 bits of the segment id. atomicMax therefore
	                // keeps the segment with the highest potential count, ties going
	                // to the larger (masked) segment id.
	                const unsigned int pot_c = local_pot > 2047u ? 2047u : local_pot;
	                const unsigned int key = (pot_c << 21) | (seg & 0x1FFFFFu);
	                atomicMax(&col_best_match[col], key);
	            }
	        }
	    }
	}