Spaces:

GAInTech
/

feather-a10g-large-runtime

Paused

App Files Files Community

feather-a10g-large-runtime / overlay /htm_rust /src /gpu /kernels /htm_fused_step.cu

icarus112

Upload folder using huggingface_hub

1c59946 verified 6 days ago

raw

history blame contribute delete

31.7 kB

	// Fused HTM megakernel — SP + TM, all T timesteps in a single launch.
	//
	// Design rationale:
	// - Global top-K column selection requires cross-block synchronization at
	// every timestep (grid.sync is unreliable on WSL2/sm_86 without rdc=true).
	// - Replace with per-column threshold activation using local lateral
	// inhibition: column c activates if overlap[c]*boost[c] > threshold[c].
	// Threshold is a per-column running-EMA learned scalar that steers the
	// column's long-run activation rate toward the global sparsity target.
	// - This is biologically grounded (GABAergic local inhibition) and supported
	// by HTM theory (duty-cycle boost already drives this loop; we just
	// change which lever the EMA pulls).
	//
	// Launch shape:
	// grid = min(device SM count, 16) // hard cap — see below
	// block = 1024 threads = 32 warps
	// Each warp of 32 owns a contiguous column slice (n_columns / total_warps).
	//
	// Cross-block coherence:
	// - Ping-pong buffers for cell_active/cell_winner: write _a at even t,
	// read _b; reversed at odd t.
	// - Preferred path: cooperative launch + hardware whole-grid sync.
	// - Fallback path: software 3-slot rotating grid barrier for devices/drivers
	// that cannot do cooperative launch.
	//
	// 2026-04-16: grid_dim reduced from 28 to 16 after deadlock RCA. The previous
	// cap of 28 relied on all blocks being concurrently resident on a 30-SM RTX
	// 3060 Laptop. Under thermal throttling effective residency dropped to ~20-24,
	// leaving scheduled blocks spinning on the software grid barrier waiting for
	// peer blocks that would never run. 16 blocks is below any realistic residency
	// floor and preserves enough warp parallelism (16*32 = 512 warps) to saturate
	// memory bandwidth on the spatial-pooler stage.
	//
	// Kernel signature uses struct-by-value for pointers and config to stay
	// inside cudarc's launch-arg count limit.

	#include <cooperative_groups.h>
	#include <cooperative_groups/memcpy_async.h>

	namespace cg = cooperative_groups;

	// Maximum columns owned per cluster-block in DSMEM.
	// Supports n_columns up to COLS_PER_CLUSTER_BLOCK_MAX * cluster_size.
	// At cluster_size=16: supports up to 256*16=4096 columns.
	// Each array costs 256*4 = 1024 bytes; three arrays = 3072 bytes per SM —
	// well under the 228 KB H200 shared-memory cap.
	#define COLS_PER_CLUSTER_BLOCK_MAX 256u

	// Maximum input_bits supported by the TMA-multicast staging tile.
	// At 32 KB this covers the production SDR width (16384 bits) with 2× headroom.
	// Total shared per SM: 32768 (tile) + 3072 (DSMEM float arrays) = ~35 KB —
	// well under the 228 KB H200 limit.
	//
	// Expected speedup from TMA multicast input staging (T9/T11):
	// - Without staging: 16 SMs × T × (input_bits GMEM reads per timestep)
	// - With staging: 1 TMA DMA per timestep, shared reads from L1 thereafter
	// - Theoretical DRAM bandwidth reduction: ~16× on input reads
	// - Wall-clock reduction estimate: -20 to -40 ms from reduced input fetch latency
	#define INPUT_BITS_MAX 32768u

	extern "C" {

	struct FusedPtrs {
	unsigned long long syn_bit;
	unsigned long long syn_perm;
	unsigned long long boost;
	unsigned long long active_duty;
	unsigned long long inhibition_threshold;
	unsigned long long seg_cell_id;
	unsigned long long seg_syn_count;
	unsigned long long syn_presyn;
	unsigned long long tm_syn_perm;
	unsigned long long cell_seg_count;
	unsigned long long cell_active_a;
	unsigned long long cell_active_b;
	unsigned long long cell_winner_a;
	unsigned long long cell_winner_b;
	unsigned long long inputs;
	unsigned long long cols_out;
	unsigned long long anom_out;
	unsigned long long barrier_counters;
	unsigned long long step_scratch;
	};

	struct FusedConfig {
	// SP constants
	unsigned int input_bits;
	unsigned int n_columns;
	unsigned int synapses_per_col;
	float conn_thr;
	float sp_inc;
	float sp_dec;
	float sparsity_target;
	float duty_alpha;
	float thr_adapt_rate;
	// TM constants
	unsigned int cells_per_column;
	unsigned int n_cells;
	unsigned int bits_words;
	unsigned int max_segments_per_cell;
	unsigned int synapses_per_segment;
	unsigned int activation_threshold;
	unsigned int learning_threshold;
	unsigned int max_new_synapses;
	int conn_thr_i16;
	int perm_inc_i16;
	int perm_dec_i16;
	int predicted_seg_dec_i16;
	int initial_perm_i16;
	// Loop constants
	unsigned int T;
	unsigned int learn;
	unsigned int iter_seed;
	unsigned int cooperative_grid_sync;
	};

	// Hardware cluster barrier using Hopper sm_90a cooperative_groups::this_cluster().sync().
	// Replaces the former software Decoupled Look-Back (DLB) atomic-spin barrier.
	//
	// cluster::sync() is a single PTX instruction (barrier.cluster) that resolves
	// in ~10-40 ns inside the cluster, with no device-level serialization.
	// Multiple clusters (one per HTM region) run fully concurrently — bounded
	// only by SM count (8 clusters × 16 SMs = 128 ≤ 132 on H200).
	//
	// The flags / expected / phase / cooperative_grid_sync parameters are kept
	// in the signature for call-site compatibility but are unused.
	__device__ static inline void fused_grid_barrier(cg::grid_group grid,
	unsigned int * /* flags — unused */,
	unsigned int /* expected — unused */,
	unsigned int /* phase — unused */,
	unsigned int /* cooperative_grid_sync — unused */) {
	#if __CUDA_ARCH__ >= 900
	// Hopper+ : hardware cluster barrier (~10-40 ns)
	auto cluster = cg::this_cluster();
	cluster.sync();
	#else
	// Pre-Hopper (sm_80, sm_86, sm_89): grid-level cooperative sync.
	// Requires cooperative kernel launch. ~us-ms range, adequate for HTM
	// workload (kernel launch frequency is low).
	grid.sync();
	#endif
	}

	__device__ static inline unsigned int warp_sum_u32(unsigned int v) {
	for (int off = 16; off > 0; off >>= 1) {
	v += __shfl_down_sync(0xffffffffu, v, off);
	}
	return v;
	}

	// Core kernel body — works for both single-region and batched launches.
	// Single-region: caller passes the one FusedPtrs struct.
	// Batched: each block reads its region's FusedPtrs via blockIdx.y before
	// calling this. State is independent per region (each region owns its own
	// GPU buffers); grid.sync() is the only cross-block primitive and it
	// spans ALL blocks in the grid (harmless over-sync across regions).
	__device__ static inline
	void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
	cg::grid_group grid = cg::this_grid();
	// Cast pointers.
	const unsigned int * __restrict__ syn_bit = (const unsigned int*)P.syn_bit;
	float * __restrict__ syn_perm = (float*)P.syn_perm;
	float * __restrict__ boost = (float*)P.boost;
	float * __restrict__ active_duty = (float*)P.active_duty;
	float * __restrict__ inhibition_threshold = (float*)P.inhibition_threshold;
	unsigned int * __restrict__ seg_cell_id = (unsigned int*)P.seg_cell_id;
	unsigned int * __restrict__ seg_syn_count = (unsigned int*)P.seg_syn_count;
	unsigned int * __restrict__ syn_presyn = (unsigned int*)P.syn_presyn;
	short * __restrict__ tm_syn_perm = (short*)P.tm_syn_perm;
	unsigned int * __restrict__ cell_seg_count = (unsigned int*)P.cell_seg_count;
	unsigned int * __restrict__ cell_active_a = (unsigned int*)P.cell_active_a;
	unsigned int * __restrict__ cell_active_b = (unsigned int*)P.cell_active_b;
	unsigned int * __restrict__ cell_winner_a = (unsigned int*)P.cell_winner_a;
	unsigned int * __restrict__ cell_winner_b = (unsigned int*)P.cell_winner_b;
	const unsigned char * __restrict__ inputs = (const unsigned char*)P.inputs;
	unsigned char * __restrict__ cols_out = (unsigned char*)P.cols_out;
	float * __restrict__ anom_out = (float*)P.anom_out;
	unsigned int * __restrict__ barrier_counters = (unsigned int*)P.barrier_counters;
	unsigned int * __restrict__ step_scratch = (unsigned int*)P.step_scratch;

	const unsigned int tid = threadIdx.x;
	const unsigned int lane = tid & 31u;
	const unsigned int warp = tid >> 5;
	const unsigned int warps_per_block = blockDim.x >> 5;
	const unsigned int gwarp = blockIdx.x * warps_per_block + warp;
	const unsigned int n_warps = gridDim.x * warps_per_block;

	const unsigned int n_cols = cfg.n_columns;
	const unsigned int col_lo = (gwarp * n_cols) / n_warps;
	const unsigned int col_hi = ((gwarp + 1) * n_cols) / n_warps;

	unsigned int phase = 0u;

	// =========================================================
	// DSMEM: Cluster-distributed shared memory for hot per-column
	// state (inhibition_threshold, boost, active_duty).
	//
	// On Hopper (sm_90+): Each block in the cluster owns a contiguous
	// slice of columns in its own __shared__ arrays. Any block can
	// peer-read another block's slice via cluster.map_shared_rank().
	//
	// On Ampere (sm_86) and other pre-Hopper: No cluster support.
	// Read/write directly from/to global memory (inhibition_threshold,
	// boost, active_duty device pointers). Slightly higher latency but
	// functionally correct.
	// =========================================================

	#if __CUDA_ARCH__ >= 900
	// Hopper+ cluster path
	auto cluster = cg::this_cluster();
	const unsigned int cluster_block_rank = cluster.block_rank(); // 0..cluster_size-1
	const unsigned int cluster_sz = cluster.num_blocks(); // == gridDim.x (≤16)
	#else
	// Pre-Hopper: no cluster, each block is independent.
	const unsigned int cluster_block_rank = blockIdx.x;
	const unsigned int cluster_sz = gridDim.x;
	#endif

	// Partition n_cols evenly across cluster blocks.
	// Each block owns cols_per_block columns starting at my_col_start.
	const unsigned int cols_per_block =
	(n_cols + cluster_sz - 1u) / cluster_sz; // ceil div
	const unsigned int my_col_start =
	cluster_block_rank * cols_per_block;
	const unsigned int my_col_end =
	(my_col_start + cols_per_block < n_cols)
	? (my_col_start + cols_per_block) : n_cols; // clamp

	#if __CUDA_ARCH__ >= 900
	// Cluster-distributed shared memory arrays.
	// Each block holds at most COLS_PER_CLUSTER_BLOCK_MAX floats per array.
	// Peer blocks address into each other's smem via map_shared_rank.
	__shared__ float s_inhib_thr [COLS_PER_CLUSTER_BLOCK_MAX];
	__shared__ float s_boost [COLS_PER_CLUSTER_BLOCK_MAX];
	__shared__ float s_active_duty[COLS_PER_CLUSTER_BLOCK_MAX];
	#endif

	// TMA multicast input staging tile (T9) — HOPPER ONLY.
	//
	// On Hopper: cg::memcpy_async with cluster scope multicasts input to all
	// 16 SMs, reducing DRAM traffic by ~16×.
	// On Ampere: 32 KB smem allocation exceeds per-block budget when
	// cooperatively launched (48 KB total, registers eat the rest). Skip the
	// tile entirely — Stage A reads from GMEM directly (original path).
	#if __CUDA_ARCH__ >= 900
	__shared__ __align__(16) unsigned char s_input_tile[INPUT_BITS_MAX];
	#endif

	#if __CUDA_ARCH__ >= 900
	// Initial GMEM → smem load (reads state from previous forward call).
	// Each block loads only its own slice; tid strides across the slice.
	for (unsigned int c = my_col_start + tid; c < my_col_end; c += blockDim.x) {
	const unsigned int off = c - my_col_start;
	s_inhib_thr [off] = inhibition_threshold[c];
	s_boost [off] = boost[c];
	s_active_duty[off] = active_duty[c];
	}

	// All blocks in the cluster must finish loading before any block
	// starts reading peer smem inside the T-loop.
	cluster.sync();
	#else
	// Pre-Hopper: no smem caching needed — reads go directly to GMEM.
	// Grid sync ensures all blocks have completed Phase 0 init before T-loop.
	grid.sync();
	#endif

	const unsigned int S = cfg.synapses_per_col;
	const unsigned int cpc = cfg.cells_per_column;
	const unsigned int SPS = cfg.synapses_per_segment;
	const unsigned int MSC = cfg.max_segments_per_cell;

	// Main timestep loop.
	for (unsigned int t = 0u; t < cfg.T; t++) {
	const unsigned int inp_off = t * cfg.input_bits;
	const unsigned int col_base_out = t * n_cols;

	unsigned int * curr_active = (t & 1u) ? cell_active_b : cell_active_a;
	unsigned int * prev_active = (t & 1u) ? cell_active_a : cell_active_b;
	unsigned int * curr_winner = (t & 1u) ? cell_winner_b : cell_winner_a;
	unsigned int * prev_winner = (t & 1u) ? cell_winner_a : cell_winner_b;

	// ---- Phase 0: clear curr bitsets for my cell range ----
	const unsigned int my_cell_lo = col_lo * cpc;
	const unsigned int my_cell_hi = col_hi * cpc;
	if (cpc == 32u) {
	// Fast path: one word per column.
	for (unsigned int c = col_lo + lane; c < col_hi; c += 32u) {
	curr_active[c] = 0u;
	curr_winner[c] = 0u;
	}
	} else {
	for (unsigned int cell = my_cell_lo + lane; cell < my_cell_hi; cell += 32u) {
	unsigned int w = cell >> 5;
	unsigned int m = 1u << (cell & 31u);
	atomicAnd(&curr_active[w], ~m);
	atomicAnd(&curr_winner[w], ~m);
	}
	}

	// Block 0, lane 0, warp 0 resets step-scratch counters.
	if (blockIdx.x == 0u && tid == 0u) {
	step_scratch[0] = 0u;
	step_scratch[1] = 0u;
	}

	// ---- BARRIER 1 ----
	// Fence: make the above clear-bitsets + scratch writes globally
	// visible before peer blocks observe "barrier arrived".
	__threadfence();
	fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);

	// =========================================================
	// T9: TMA MULTICAST INPUT STAGING
	//
	// Issue a single cluster-scope async DMA to broadcast this
	// timestep's input slice into s_input_tile across all 16 SMs
	// in the cluster simultaneously. On Hopper sm_90a,
	// cg::memcpy_async with cluster scope maps to the TMA
	// hardware unit (cp.async.bulk.tensor multicast), reducing
	// DRAM input traffic by ~16× vs each block fetching its own
	// copy from GMEM.
	//
	// The staging is gated on cfg.input_bits <= INPUT_BITS_MAX.
	// If the tile is too small (custom large input_bits), we fall
	// back to per-thread GMEM reads in Stage A (identical to the
	// original path; use_input_tile==false).
	//
	// Ordering: BARRIER 1 completes before we issue the DMA.
	// The DMA completes before Stage A reads s_input_tile.
	// =========================================================
	#if __CUDA_ARCH__ >= 900
	const bool use_input_tile = (cfg.input_bits <= INPUT_BITS_MAX);
	if (use_input_tile) {
	auto tb = cg::this_thread_block();
	cg::memcpy_async(tb, s_input_tile,
	inputs + inp_off,
	cfg.input_bits);
	cg::wait(tb);
	cluster.sync();
	}
	#else
	const bool use_input_tile = false;
	#endif

	// =========================================================
	// STAGE A: Spatial Pooler
	//
	// Hot per-column state (boost, inhibition_threshold,
	// active_duty) is served from cluster DSMEM rather than
	// GMEM for each of the T timesteps. GMEM is written on
	// update so state persists across forward calls.
	// =========================================================
	for (unsigned int c = col_lo; c < col_hi; c++) {
	unsigned int base = c * S;
	unsigned int local = 0u;
	for (unsigned int s = lane; s < S; s += 32u) {
	unsigned int b = syn_bit[base + s];
	float p = syn_perm[base + s];
	// T9: read from cluster-broadcast tile when available;
	// fall back to direct GMEM when input_bits > INPUT_BITS_MAX.
	#if __CUDA_ARCH__ >= 900
	unsigned int inp_byte = use_input_tile
	? (unsigned int)s_input_tile[b]
	: (unsigned int)inputs[inp_off + b];
	#else
	unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
	#endif
	unsigned int hit = ((inp_byte != 0u) && (p >= cfg.conn_thr)) ? 1u : 0u;
	local += hit;
	}
	unsigned int overlap = warp_sum_u32(local);
	overlap = __shfl_sync(0xffffffffu, overlap, 0);

	// Read boost + threshold for column c.
	#if __CUDA_ARCH__ >= 900
	// Hopper: read from cluster-distributed shared memory.
	const unsigned int owner_block = c / cols_per_block;
	const unsigned int owner_offset = c - owner_block * cols_per_block;
	float boost_val = cluster.map_shared_rank(s_boost, owner_block)[owner_offset];
	float thr = cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset];
	#else
	// Pre-Hopper: read directly from global memory.
	float boost_val = boost[c];
	float thr = inhibition_threshold[c];
	#endif

	float boosted = (float)overlap * boost_val;
	unsigned int is_active = (boosted > thr) ? 1u : 0u;

	if (lane == 0) {
	cols_out[col_base_out + c] = (unsigned char)is_active;
	if (is_active) {
	atomicAdd(&step_scratch[0], 1u);
	}
	}

	// SP learn (Hebbian) on active columns.
	// T9: use tile for input reads here too.
	if (cfg.learn && is_active) {
	for (unsigned int s = lane; s < S; s += 32u) {
	unsigned int b = syn_bit[base + s];
	float p = syn_perm[base + s];
	#if __CUDA_ARCH__ >= 900
	unsigned int inp_byte = use_input_tile
	? (unsigned int)s_input_tile[b]
	: (unsigned int)inputs[inp_off + b];
	#else
	unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
	#endif
	if (inp_byte != 0u) {
	p += cfg.sp_inc;
	if (p > 1.0f) p = 1.0f;
	} else {
	p -= cfg.sp_dec;
	if (p < 0.0f) p = 0.0f;
	}
	syn_perm[base + s] = p;
	}
	}

	// active_duty EMA + threshold adaptation.
	// Writes go to both DSMEM (hot path, Hopper only) and GMEM (persistence).
	if (lane == 0) {
	#if __CUDA_ARCH__ >= 900
	float ad = cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset];
	#else
	float ad = active_duty[c];
	#endif
	float sample = is_active ? 1.0f : 0.0f;
	ad = (1.0f - cfg.duty_alpha) * ad + cfg.duty_alpha * sample;

	#if __CUDA_ARCH__ >= 900
	// Writeback: peer smem (for next timestep read) + GMEM (persistence).
	cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad;
	#endif
	active_duty[c] = ad;

	// Threshold steers toward target sparsity.
	float err = ad - cfg.sparsity_target;
	float new_thr = thr + cfg.thr_adapt_rate * err * 100.0f;
	if (new_thr < 0.1f) new_thr = 0.1f;
	if (new_thr > 1000.0f) new_thr = 1000.0f;

	#if __CUDA_ARCH__ >= 900
	// Writeback: peer smem (for next timestep read) + GMEM (persistence).
	cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] = new_thr;
	#endif
	inhibition_threshold[c] = new_thr;
	}
	}

	// ---- DSMEM WRITEBACK SYNC: peer-smem writes must be visible cluster-wide ----
	//
	// On Hopper: cluster.sync() ensures all peer smem writes from this
	// timestep are visible to all blocks before Stage B / next t.
	// On pre-Hopper: no smem peer writes occur (all state in GMEM),
	// so no extra sync needed here — the grid barrier below suffices.
	#if __CUDA_ARCH__ >= 900
	cluster.sync();
	#endif

	// ---- BARRIER 2: SP active_mask must be visible before TM reads ----
	// Fence: flush cols_out + active_duty + inhibition_threshold + step_scratch
	// writes to global memory before peers advance past this barrier.
	__threadfence();
	fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);

	// =========================================================
	// STAGE B: Temporal Memory
	// =========================================================
	for (unsigned int c = col_lo; c < col_hi; c++) {
	unsigned int col_active = cols_out[col_base_out + c];
	if (col_active == 0u) continue;

	unsigned int base_cell = c * cpc;
	unsigned int any_predicted = 0u;
	unsigned int best_seg_id_for_grow = 0xFFFFFFFFu;
	unsigned int best_pot_count = 0u;

	for (unsigned int k = 0u; k < cpc; k++) {
	unsigned int cell = base_cell + k;
	unsigned int n_segs_here = cell_seg_count[cell];
	if (n_segs_here > MSC) n_segs_here = MSC;
	if (n_segs_here == 0u) continue;

	unsigned int seg_base_id = cell * MSC;
	unsigned int cell_is_predictive = 0u;

	for (unsigned int ls = 0u; ls < n_segs_here; ls++) {
	unsigned int seg = seg_base_id + ls;
	unsigned int n_syn = seg_syn_count[seg];
	if (n_syn == 0u) continue;
	unsigned int syn_base = seg * SPS;

	unsigned int l_conn = 0u;
	unsigned int l_pot = 0u;
	for (unsigned int s = lane; s < n_syn; s += 32u) {
	unsigned int presyn = syn_presyn[syn_base + s];
	unsigned int w = prev_active[presyn >> 5];
	unsigned int bit = (w >> (presyn & 31u)) & 1u;
	if (bit) {
	l_pot += 1u;
	int p = (int)tm_syn_perm[syn_base + s];
	if (p >= cfg.conn_thr_i16) l_conn += 1u;
	}
	}
	unsigned int tot_conn = warp_sum_u32(l_conn);
	unsigned int tot_pot = warp_sum_u32(l_pot);
	tot_conn = __shfl_sync(0xffffffffu, tot_conn, 0);
	tot_pot = __shfl_sync(0xffffffffu, tot_pot, 0);

	if (tot_conn >= cfg.activation_threshold) cell_is_predictive = 1u;
	if (tot_pot >= cfg.learning_threshold && tot_pot > best_pot_count) {
	best_pot_count = tot_pot;
	best_seg_id_for_grow = seg;
	}

	// Reinforce predicted-and-correct segment.
	if (cfg.learn && tot_conn >= cfg.activation_threshold) {
	for (unsigned int s = lane; s < n_syn; s += 32u) {
	unsigned int presyn = syn_presyn[syn_base + s];
	unsigned int w = prev_active[presyn >> 5];
	unsigned int bit = (w >> (presyn & 31u)) & 1u;
	int p = (int)tm_syn_perm[syn_base + s];
	if (bit) {
	int np = p + cfg.perm_inc_i16;
	if (np > 32767) np = 32767;
	tm_syn_perm[syn_base + s] = (short)np;
	} else {
	int np = p - cfg.perm_dec_i16;
	if (np < 0) np = 0;
	tm_syn_perm[syn_base + s] = (short)np;
	}
	}
	}
	}

	if (cell_is_predictive) {
	any_predicted = 1u;
	if (lane == 0) {
	unsigned int w = cell >> 5;
	unsigned int m = 1u << (cell & 31u);
	atomicOr(&curr_active[w], m);
	atomicOr(&curr_winner[w], m);
	}
	}
	}

	// BURST if no predicted.
	if (!any_predicted) {
	if (lane == 0) {
	for (unsigned int k = 0u; k < cpc; k++) {
	unsigned int cell = base_cell + k;
	unsigned int w = cell >> 5;
	unsigned int m = 1u << (cell & 31u);
	atomicOr(&curr_active[w], m);
	}
	unsigned int win = base_cell;
	unsigned int ww = win >> 5;
	unsigned int wm = 1u << (win & 31u);
	atomicOr(&curr_winner[ww], wm);
	atomicAdd(&step_scratch[1], 1u);
	}

	if (cfg.learn) {
	unsigned int target_seg;
	unsigned int existing_syn;
	if (best_seg_id_for_grow != 0xFFFFFFFFu) {
	// Reuse best matching segment.
	target_seg = best_seg_id_for_grow;
	existing_syn = seg_syn_count[target_seg];
	target_seg = __shfl_sync(0xffffffffu, target_seg, 0);
	existing_syn = __shfl_sync(0xffffffffu, existing_syn, 0);

	// Reinforce its existing synapses.
	unsigned int syn_base = target_seg * SPS;
	for (unsigned int s = lane; s < existing_syn; s += 32u) {
	unsigned int presyn = syn_presyn[syn_base + s];
	unsigned int w = prev_active[presyn >> 5];
	unsigned int bit = (w >> (presyn & 31u)) & 1u;
	int p = (int)tm_syn_perm[syn_base + s];
	if (bit) {
	int np = p + cfg.perm_inc_i16;
	if (np > 32767) np = 32767;
	tm_syn_perm[syn_base + s] = (short)np;
	} else {
	int np = p - cfg.perm_dec_i16;
	if (np < 0) np = 0;
	tm_syn_perm[syn_base + s] = (short)np;
	}
	}
	} else {
	// Allocate new segment on winner cell (cell 0 of col).
	unsigned int new_seg = 0u;
	if (lane == 0) {
	unsigned int winner_cell = base_cell;
	unsigned int slot = atomicAdd(&cell_seg_count[winner_cell], 1u);
	if (slot >= MSC) slot = slot % MSC;
	new_seg = winner_cell * MSC + slot;
	seg_cell_id[new_seg] = winner_cell;
	seg_syn_count[new_seg] = 0u;
	}
	target_seg = __shfl_sync(0xffffffffu, new_seg, 0);
	existing_syn = 0u;
	}

	// Grow synapses to prev_winner cells — lane 0 serialized.
	unsigned int room = (SPS > existing_syn) ? (SPS - existing_syn) : 0u;
	unsigned int max_grow = (cfg.max_new_synapses < room) ? cfg.max_new_synapses : room;
	if (lane == 0 && max_grow > 0u) {
	unsigned int syn_base = target_seg * SPS;
	unsigned int grown = 0u;
	unsigned int start_off = (c * 2654435761u + cfg.iter_seed + t) % cfg.bits_words;
	for (unsigned int w_off = 0u;
	w_off < cfg.bits_words && grown < max_grow;
	w_off++) {
	unsigned int widx = (start_off + w_off) % cfg.bits_words;
	unsigned int word = prev_winner[widx];
	while (word != 0u && grown < max_grow) {
	unsigned int bit_pos = __ffs(word) - 1u;
	word &= ~(1u << bit_pos);
	unsigned int cell_id = widx * 32u + bit_pos;
	if (cell_id >= cfg.n_cells) continue;
	bool exists = false;
	for (unsigned int es = 0u; es < existing_syn + grown; es++) {
	if (syn_presyn[syn_base + es] == cell_id) { exists = true; break; }
	}
	if (exists) continue;
	unsigned int write_idx = existing_syn + grown;
	if (write_idx >= SPS) break;
	syn_presyn[syn_base + write_idx] = cell_id;
	tm_syn_perm[syn_base + write_idx] = (short)cfg.initial_perm_i16;
	grown++;
	}
	}
	if (grown > 0u) {
	seg_syn_count[target_seg] = existing_syn + grown;
	}
	}
	}
	}
	}

	// ---- BARRIER 3: TM writes complete before anomaly + next-step read ----
	// Fence: flush curr_active/curr_winner bitsets + tm_syn_perm +
	// seg_syn_count + syn_presyn before peers advance and consume them as
	// prev_active/prev_winner at t+1.
	__threadfence();
	fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);

	// Write anomaly for step t.
	if (blockIdx.x == 0u && tid == 0u) {
	unsigned int total = step_scratch[0];
	unsigned int bad = step_scratch[1];
	float anom = (total > 0u) ? ((float)bad / (float)total) : 0.0f;
	anom_out[t] = anom;
	}
	}
	}

	// Single-region kernel (legacy call site).
	__global__ __launch_bounds__(256, 2)
	void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
	htm_fused_step_body(P, cfg);
	}

	// Batched kernel: one cooperative launch for B regions. grid.y = B,
	// grid.x = per-region block count. Each block reads its region's
	// FusedPtrs from the device array via blockIdx.y.
	__global__ __launch_bounds__(256, 2)
	void htm_fused_step_batched(const FusedPtrs* __restrict__ P_arr, FusedConfig cfg) {
	const FusedPtrs P = P_arr[blockIdx.y];
	htm_fused_step_body(P, cfg);
	}

	} // extern "C"