icarus112's picture
Upload folder using huggingface_hub
1c59946 verified
// Fused HTM megakernel — SP + TM, all T timesteps in a single launch.
//
// Design rationale:
// - Global top-K column selection requires cross-block synchronization at
// every timestep (grid.sync is unreliable on WSL2/sm_86 without rdc=true).
// - Replace with per-column threshold activation using local lateral
// inhibition: column c activates if overlap[c]*boost[c] > threshold[c].
// Threshold is a per-column running-EMA learned scalar that steers the
// column's long-run activation rate toward the global sparsity target.
// - This is biologically grounded (GABAergic local inhibition) and supported
// by HTM theory (duty-cycle boost already drives this loop; we just
// change which lever the EMA pulls).
//
// Launch shape:
// grid = min(device SM count, 16) // hard cap — see below
// block = 1024 threads = 32 warps
// Each warp of 32 owns a contiguous column slice (n_columns / total_warps).
//
// Cross-block coherence:
// - Ping-pong buffers for cell_active/cell_winner: write _a at even t,
// read _b; reversed at odd t.
// - Preferred path: cooperative launch + hardware whole-grid sync.
// - Fallback path: software 3-slot rotating grid barrier for devices/drivers
// that cannot do cooperative launch.
//
// 2026-04-16: grid_dim reduced from 28 to 16 after deadlock RCA. The previous
// cap of 28 relied on all blocks being concurrently resident on a 30-SM RTX
// 3060 Laptop. Under thermal throttling effective residency dropped to ~20-24,
// leaving scheduled blocks spinning on the software grid barrier waiting for
// peer blocks that would never run. 16 blocks is below any realistic residency
// floor and preserves enough warp parallelism (16*32 = 512 warps) to saturate
// memory bandwidth on the spatial-pooler stage.
//
// Kernel signature uses struct-by-value for pointers and config to stay
// inside cudarc's launch-arg count limit.
#include <cooperative_groups.h>
#include <cooperative_groups/memcpy_async.h>
namespace cg = cooperative_groups;
// Maximum columns owned per cluster-block in DSMEM.
// Supports n_columns up to COLS_PER_CLUSTER_BLOCK_MAX * cluster_size.
// At cluster_size=16: supports up to 256*16=4096 columns.
// Each array costs 256*4 = 1024 bytes; three arrays = 3072 bytes per SM —
// well under the 228 KB H200 shared-memory cap.
#define COLS_PER_CLUSTER_BLOCK_MAX 256u
// Maximum input_bits supported by the TMA-multicast staging tile.
// At 32 KB this covers the production SDR width (16384 bits) with 2× headroom.
// Total shared per SM: 32768 (tile) + 3072 (DSMEM float arrays) = ~35 KB —
// well under the 228 KB H200 limit.
//
// Expected speedup from TMA multicast input staging (T9/T11):
// - Without staging: 16 SMs × T × (input_bits GMEM reads per timestep)
// - With staging: 1 TMA DMA per timestep, shared reads from L1 thereafter
// - Theoretical DRAM bandwidth reduction: ~16× on input reads
// - Wall-clock reduction estimate: -20 to -40 ms from reduced input fetch latency
#define INPUT_BITS_MAX 32768u
extern "C" {
struct FusedPtrs {
unsigned long long syn_bit;
unsigned long long syn_perm;
unsigned long long boost;
unsigned long long active_duty;
unsigned long long inhibition_threshold;
unsigned long long seg_cell_id;
unsigned long long seg_syn_count;
unsigned long long syn_presyn;
unsigned long long tm_syn_perm;
unsigned long long cell_seg_count;
unsigned long long cell_active_a;
unsigned long long cell_active_b;
unsigned long long cell_winner_a;
unsigned long long cell_winner_b;
unsigned long long inputs;
unsigned long long cols_out;
unsigned long long anom_out;
unsigned long long barrier_counters;
unsigned long long step_scratch;
};
struct FusedConfig {
// SP constants
unsigned int input_bits;
unsigned int n_columns;
unsigned int synapses_per_col;
float conn_thr;
float sp_inc;
float sp_dec;
float sparsity_target;
float duty_alpha;
float thr_adapt_rate;
// TM constants
unsigned int cells_per_column;
unsigned int n_cells;
unsigned int bits_words;
unsigned int max_segments_per_cell;
unsigned int synapses_per_segment;
unsigned int activation_threshold;
unsigned int learning_threshold;
unsigned int max_new_synapses;
int conn_thr_i16;
int perm_inc_i16;
int perm_dec_i16;
int predicted_seg_dec_i16;
int initial_perm_i16;
// Loop constants
unsigned int T;
unsigned int learn;
unsigned int iter_seed;
unsigned int cooperative_grid_sync;
};
// Hardware cluster barrier using Hopper sm_90a cooperative_groups::this_cluster().sync().
// Replaces the former software Decoupled Look-Back (DLB) atomic-spin barrier.
//
// cluster::sync() is a single PTX instruction (barrier.cluster) that resolves
// in ~10-40 ns inside the cluster, with no device-level serialization.
// Multiple clusters (one per HTM region) run fully concurrently — bounded
// only by SM count (8 clusters × 16 SMs = 128 ≤ 132 on H200).
//
// The flags / expected / phase / cooperative_grid_sync parameters are kept
// in the signature for call-site compatibility but are unused.
__device__ static inline void fused_grid_barrier(cg::grid_group grid,
unsigned int * /* flags — unused */,
unsigned int /* expected — unused */,
unsigned int /* phase — unused */,
unsigned int /* cooperative_grid_sync — unused */) {
#if __CUDA_ARCH__ >= 900
// Hopper+ : hardware cluster barrier (~10-40 ns)
auto cluster = cg::this_cluster();
cluster.sync();
#else
// Pre-Hopper (sm_80, sm_86, sm_89): grid-level cooperative sync.
// Requires cooperative kernel launch. ~us-ms range, adequate for HTM
// workload (kernel launch frequency is low).
grid.sync();
#endif
}
__device__ static inline unsigned int warp_sum_u32(unsigned int v) {
for (int off = 16; off > 0; off >>= 1) {
v += __shfl_down_sync(0xffffffffu, v, off);
}
return v;
}
// Core kernel body — works for both single-region and batched launches.
// Single-region: caller passes the one FusedPtrs struct.
// Batched: each block reads its region's FusedPtrs via blockIdx.y before
// calling this. State is independent per region (each region owns its own
// GPU buffers); grid.sync() is the only cross-block primitive and it
// spans ALL blocks in the grid (harmless over-sync across regions).
__device__ static inline
void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
cg::grid_group grid = cg::this_grid();
// Cast pointers.
const unsigned int * __restrict__ syn_bit = (const unsigned int*)P.syn_bit;
float * __restrict__ syn_perm = (float*)P.syn_perm;
float * __restrict__ boost = (float*)P.boost;
float * __restrict__ active_duty = (float*)P.active_duty;
float * __restrict__ inhibition_threshold = (float*)P.inhibition_threshold;
unsigned int * __restrict__ seg_cell_id = (unsigned int*)P.seg_cell_id;
unsigned int * __restrict__ seg_syn_count = (unsigned int*)P.seg_syn_count;
unsigned int * __restrict__ syn_presyn = (unsigned int*)P.syn_presyn;
short * __restrict__ tm_syn_perm = (short*)P.tm_syn_perm;
unsigned int * __restrict__ cell_seg_count = (unsigned int*)P.cell_seg_count;
unsigned int * __restrict__ cell_active_a = (unsigned int*)P.cell_active_a;
unsigned int * __restrict__ cell_active_b = (unsigned int*)P.cell_active_b;
unsigned int * __restrict__ cell_winner_a = (unsigned int*)P.cell_winner_a;
unsigned int * __restrict__ cell_winner_b = (unsigned int*)P.cell_winner_b;
const unsigned char * __restrict__ inputs = (const unsigned char*)P.inputs;
unsigned char * __restrict__ cols_out = (unsigned char*)P.cols_out;
float * __restrict__ anom_out = (float*)P.anom_out;
unsigned int * __restrict__ barrier_counters = (unsigned int*)P.barrier_counters;
unsigned int * __restrict__ step_scratch = (unsigned int*)P.step_scratch;
const unsigned int tid = threadIdx.x;
const unsigned int lane = tid & 31u;
const unsigned int warp = tid >> 5;
const unsigned int warps_per_block = blockDim.x >> 5;
const unsigned int gwarp = blockIdx.x * warps_per_block + warp;
const unsigned int n_warps = gridDim.x * warps_per_block;
const unsigned int n_cols = cfg.n_columns;
const unsigned int col_lo = (gwarp * n_cols) / n_warps;
const unsigned int col_hi = ((gwarp + 1) * n_cols) / n_warps;
unsigned int phase = 0u;
// =========================================================
// DSMEM: Cluster-distributed shared memory for hot per-column
// state (inhibition_threshold, boost, active_duty).
//
// On Hopper (sm_90+): Each block in the cluster owns a contiguous
// slice of columns in its own __shared__ arrays. Any block can
// peer-read another block's slice via cluster.map_shared_rank().
//
// On Ampere (sm_86) and other pre-Hopper: No cluster support.
// Read/write directly from/to global memory (inhibition_threshold,
// boost, active_duty device pointers). Slightly higher latency but
// functionally correct.
// =========================================================
#if __CUDA_ARCH__ >= 900
// Hopper+ cluster path
auto cluster = cg::this_cluster();
const unsigned int cluster_block_rank = cluster.block_rank(); // 0..cluster_size-1
const unsigned int cluster_sz = cluster.num_blocks(); // == gridDim.x (≤16)
#else
// Pre-Hopper: no cluster, each block is independent.
const unsigned int cluster_block_rank = blockIdx.x;
const unsigned int cluster_sz = gridDim.x;
#endif
// Partition n_cols evenly across cluster blocks.
// Each block owns cols_per_block columns starting at my_col_start.
const unsigned int cols_per_block =
(n_cols + cluster_sz - 1u) / cluster_sz; // ceil div
const unsigned int my_col_start =
cluster_block_rank * cols_per_block;
const unsigned int my_col_end =
(my_col_start + cols_per_block < n_cols)
? (my_col_start + cols_per_block) : n_cols; // clamp
#if __CUDA_ARCH__ >= 900
// Cluster-distributed shared memory arrays.
// Each block holds at most COLS_PER_CLUSTER_BLOCK_MAX floats per array.
// Peer blocks address into each other's smem via map_shared_rank.
__shared__ float s_inhib_thr [COLS_PER_CLUSTER_BLOCK_MAX];
__shared__ float s_boost [COLS_PER_CLUSTER_BLOCK_MAX];
__shared__ float s_active_duty[COLS_PER_CLUSTER_BLOCK_MAX];
#endif
// TMA multicast input staging tile (T9) — HOPPER ONLY.
//
// On Hopper: cg::memcpy_async with cluster scope multicasts input to all
// 16 SMs, reducing DRAM traffic by ~16×.
// On Ampere: 32 KB smem allocation exceeds per-block budget when
// cooperatively launched (48 KB total, registers eat the rest). Skip the
// tile entirely — Stage A reads from GMEM directly (original path).
#if __CUDA_ARCH__ >= 900
__shared__ __align__(16) unsigned char s_input_tile[INPUT_BITS_MAX];
#endif
#if __CUDA_ARCH__ >= 900
// Initial GMEM → smem load (reads state from previous forward call).
// Each block loads only its own slice; tid strides across the slice.
for (unsigned int c = my_col_start + tid; c < my_col_end; c += blockDim.x) {
const unsigned int off = c - my_col_start;
s_inhib_thr [off] = inhibition_threshold[c];
s_boost [off] = boost[c];
s_active_duty[off] = active_duty[c];
}
// All blocks in the cluster must finish loading before any block
// starts reading peer smem inside the T-loop.
cluster.sync();
#else
// Pre-Hopper: no smem caching needed — reads go directly to GMEM.
// Grid sync ensures all blocks have completed Phase 0 init before T-loop.
grid.sync();
#endif
const unsigned int S = cfg.synapses_per_col;
const unsigned int cpc = cfg.cells_per_column;
const unsigned int SPS = cfg.synapses_per_segment;
const unsigned int MSC = cfg.max_segments_per_cell;
// Main timestep loop.
for (unsigned int t = 0u; t < cfg.T; t++) {
const unsigned int inp_off = t * cfg.input_bits;
const unsigned int col_base_out = t * n_cols;
unsigned int * curr_active = (t & 1u) ? cell_active_b : cell_active_a;
unsigned int * prev_active = (t & 1u) ? cell_active_a : cell_active_b;
unsigned int * curr_winner = (t & 1u) ? cell_winner_b : cell_winner_a;
unsigned int * prev_winner = (t & 1u) ? cell_winner_a : cell_winner_b;
// ---- Phase 0: clear curr bitsets for my cell range ----
const unsigned int my_cell_lo = col_lo * cpc;
const unsigned int my_cell_hi = col_hi * cpc;
if (cpc == 32u) {
// Fast path: one word per column.
for (unsigned int c = col_lo + lane; c < col_hi; c += 32u) {
curr_active[c] = 0u;
curr_winner[c] = 0u;
}
} else {
for (unsigned int cell = my_cell_lo + lane; cell < my_cell_hi; cell += 32u) {
unsigned int w = cell >> 5;
unsigned int m = 1u << (cell & 31u);
atomicAnd(&curr_active[w], ~m);
atomicAnd(&curr_winner[w], ~m);
}
}
// Block 0, lane 0, warp 0 resets step-scratch counters.
if (blockIdx.x == 0u && tid == 0u) {
step_scratch[0] = 0u;
step_scratch[1] = 0u;
}
// ---- BARRIER 1 ----
// Fence: make the above clear-bitsets + scratch writes globally
// visible before peer blocks observe "barrier arrived".
__threadfence();
fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);
// =========================================================
// T9: TMA MULTICAST INPUT STAGING
//
// Issue a single cluster-scope async DMA to broadcast this
// timestep's input slice into s_input_tile across all 16 SMs
// in the cluster simultaneously. On Hopper sm_90a,
// cg::memcpy_async with cluster scope maps to the TMA
// hardware unit (cp.async.bulk.tensor multicast), reducing
// DRAM input traffic by ~16× vs each block fetching its own
// copy from GMEM.
//
// The staging is gated on cfg.input_bits <= INPUT_BITS_MAX.
// If the tile is too small (custom large input_bits), we fall
// back to per-thread GMEM reads in Stage A (identical to the
// original path; use_input_tile==false).
//
// Ordering: BARRIER 1 completes before we issue the DMA.
// The DMA completes before Stage A reads s_input_tile.
// =========================================================
#if __CUDA_ARCH__ >= 900
const bool use_input_tile = (cfg.input_bits <= INPUT_BITS_MAX);
if (use_input_tile) {
auto tb = cg::this_thread_block();
cg::memcpy_async(tb, s_input_tile,
inputs + inp_off,
cfg.input_bits);
cg::wait(tb);
cluster.sync();
}
#else
const bool use_input_tile = false;
#endif
// =========================================================
// STAGE A: Spatial Pooler
//
// Hot per-column state (boost, inhibition_threshold,
// active_duty) is served from cluster DSMEM rather than
// GMEM for each of the T timesteps. GMEM is written on
// update so state persists across forward calls.
// =========================================================
for (unsigned int c = col_lo; c < col_hi; c++) {
unsigned int base = c * S;
unsigned int local = 0u;
for (unsigned int s = lane; s < S; s += 32u) {
unsigned int b = syn_bit[base + s];
float p = syn_perm[base + s];
// T9: read from cluster-broadcast tile when available;
// fall back to direct GMEM when input_bits > INPUT_BITS_MAX.
#if __CUDA_ARCH__ >= 900
unsigned int inp_byte = use_input_tile
? (unsigned int)s_input_tile[b]
: (unsigned int)inputs[inp_off + b];
#else
unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
#endif
unsigned int hit = ((inp_byte != 0u) && (p >= cfg.conn_thr)) ? 1u : 0u;
local += hit;
}
unsigned int overlap = warp_sum_u32(local);
overlap = __shfl_sync(0xffffffffu, overlap, 0);
// Read boost + threshold for column c.
#if __CUDA_ARCH__ >= 900
// Hopper: read from cluster-distributed shared memory.
const unsigned int owner_block = c / cols_per_block;
const unsigned int owner_offset = c - owner_block * cols_per_block;
float boost_val = cluster.map_shared_rank(s_boost, owner_block)[owner_offset];
float thr = cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset];
#else
// Pre-Hopper: read directly from global memory.
float boost_val = boost[c];
float thr = inhibition_threshold[c];
#endif
float boosted = (float)overlap * boost_val;
unsigned int is_active = (boosted > thr) ? 1u : 0u;
if (lane == 0) {
cols_out[col_base_out + c] = (unsigned char)is_active;
if (is_active) {
atomicAdd(&step_scratch[0], 1u);
}
}
// SP learn (Hebbian) on active columns.
// T9: use tile for input reads here too.
if (cfg.learn && is_active) {
for (unsigned int s = lane; s < S; s += 32u) {
unsigned int b = syn_bit[base + s];
float p = syn_perm[base + s];
#if __CUDA_ARCH__ >= 900
unsigned int inp_byte = use_input_tile
? (unsigned int)s_input_tile[b]
: (unsigned int)inputs[inp_off + b];
#else
unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
#endif
if (inp_byte != 0u) {
p += cfg.sp_inc;
if (p > 1.0f) p = 1.0f;
} else {
p -= cfg.sp_dec;
if (p < 0.0f) p = 0.0f;
}
syn_perm[base + s] = p;
}
}
// active_duty EMA + threshold adaptation.
// Writes go to both DSMEM (hot path, Hopper only) and GMEM (persistence).
if (lane == 0) {
#if __CUDA_ARCH__ >= 900
float ad = cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset];
#else
float ad = active_duty[c];
#endif
float sample = is_active ? 1.0f : 0.0f;
ad = (1.0f - cfg.duty_alpha) * ad + cfg.duty_alpha * sample;
#if __CUDA_ARCH__ >= 900
// Writeback: peer smem (for next timestep read) + GMEM (persistence).
cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad;
#endif
active_duty[c] = ad;
// Threshold steers toward target sparsity.
float err = ad - cfg.sparsity_target;
float new_thr = thr + cfg.thr_adapt_rate * err * 100.0f;
if (new_thr < 0.1f) new_thr = 0.1f;
if (new_thr > 1000.0f) new_thr = 1000.0f;
#if __CUDA_ARCH__ >= 900
// Writeback: peer smem (for next timestep read) + GMEM (persistence).
cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] = new_thr;
#endif
inhibition_threshold[c] = new_thr;
}
}
// ---- DSMEM WRITEBACK SYNC: peer-smem writes must be visible cluster-wide ----
//
// On Hopper: cluster.sync() ensures all peer smem writes from this
// timestep are visible to all blocks before Stage B / next t.
// On pre-Hopper: no smem peer writes occur (all state in GMEM),
// so no extra sync needed here — the grid barrier below suffices.
#if __CUDA_ARCH__ >= 900
cluster.sync();
#endif
// ---- BARRIER 2: SP active_mask must be visible before TM reads ----
// Fence: flush cols_out + active_duty + inhibition_threshold + step_scratch
// writes to global memory before peers advance past this barrier.
__threadfence();
fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);
// =========================================================
// STAGE B: Temporal Memory
// =========================================================
for (unsigned int c = col_lo; c < col_hi; c++) {
unsigned int col_active = cols_out[col_base_out + c];
if (col_active == 0u) continue;
unsigned int base_cell = c * cpc;
unsigned int any_predicted = 0u;
unsigned int best_seg_id_for_grow = 0xFFFFFFFFu;
unsigned int best_pot_count = 0u;
for (unsigned int k = 0u; k < cpc; k++) {
unsigned int cell = base_cell + k;
unsigned int n_segs_here = cell_seg_count[cell];
if (n_segs_here > MSC) n_segs_here = MSC;
if (n_segs_here == 0u) continue;
unsigned int seg_base_id = cell * MSC;
unsigned int cell_is_predictive = 0u;
for (unsigned int ls = 0u; ls < n_segs_here; ls++) {
unsigned int seg = seg_base_id + ls;
unsigned int n_syn = seg_syn_count[seg];
if (n_syn == 0u) continue;
unsigned int syn_base = seg * SPS;
unsigned int l_conn = 0u;
unsigned int l_pot = 0u;
for (unsigned int s = lane; s < n_syn; s += 32u) {
unsigned int presyn = syn_presyn[syn_base + s];
unsigned int w = prev_active[presyn >> 5];
unsigned int bit = (w >> (presyn & 31u)) & 1u;
if (bit) {
l_pot += 1u;
int p = (int)tm_syn_perm[syn_base + s];
if (p >= cfg.conn_thr_i16) l_conn += 1u;
}
}
unsigned int tot_conn = warp_sum_u32(l_conn);
unsigned int tot_pot = warp_sum_u32(l_pot);
tot_conn = __shfl_sync(0xffffffffu, tot_conn, 0);
tot_pot = __shfl_sync(0xffffffffu, tot_pot, 0);
if (tot_conn >= cfg.activation_threshold) cell_is_predictive = 1u;
if (tot_pot >= cfg.learning_threshold && tot_pot > best_pot_count) {
best_pot_count = tot_pot;
best_seg_id_for_grow = seg;
}
// Reinforce predicted-and-correct segment.
if (cfg.learn && tot_conn >= cfg.activation_threshold) {
for (unsigned int s = lane; s < n_syn; s += 32u) {
unsigned int presyn = syn_presyn[syn_base + s];
unsigned int w = prev_active[presyn >> 5];
unsigned int bit = (w >> (presyn & 31u)) & 1u;
int p = (int)tm_syn_perm[syn_base + s];
if (bit) {
int np = p + cfg.perm_inc_i16;
if (np > 32767) np = 32767;
tm_syn_perm[syn_base + s] = (short)np;
} else {
int np = p - cfg.perm_dec_i16;
if (np < 0) np = 0;
tm_syn_perm[syn_base + s] = (short)np;
}
}
}
}
if (cell_is_predictive) {
any_predicted = 1u;
if (lane == 0) {
unsigned int w = cell >> 5;
unsigned int m = 1u << (cell & 31u);
atomicOr(&curr_active[w], m);
atomicOr(&curr_winner[w], m);
}
}
}
// BURST if no predicted.
if (!any_predicted) {
if (lane == 0) {
for (unsigned int k = 0u; k < cpc; k++) {
unsigned int cell = base_cell + k;
unsigned int w = cell >> 5;
unsigned int m = 1u << (cell & 31u);
atomicOr(&curr_active[w], m);
}
unsigned int win = base_cell;
unsigned int ww = win >> 5;
unsigned int wm = 1u << (win & 31u);
atomicOr(&curr_winner[ww], wm);
atomicAdd(&step_scratch[1], 1u);
}
if (cfg.learn) {
unsigned int target_seg;
unsigned int existing_syn;
if (best_seg_id_for_grow != 0xFFFFFFFFu) {
// Reuse best matching segment.
target_seg = best_seg_id_for_grow;
existing_syn = seg_syn_count[target_seg];
target_seg = __shfl_sync(0xffffffffu, target_seg, 0);
existing_syn = __shfl_sync(0xffffffffu, existing_syn, 0);
// Reinforce its existing synapses.
unsigned int syn_base = target_seg * SPS;
for (unsigned int s = lane; s < existing_syn; s += 32u) {
unsigned int presyn = syn_presyn[syn_base + s];
unsigned int w = prev_active[presyn >> 5];
unsigned int bit = (w >> (presyn & 31u)) & 1u;
int p = (int)tm_syn_perm[syn_base + s];
if (bit) {
int np = p + cfg.perm_inc_i16;
if (np > 32767) np = 32767;
tm_syn_perm[syn_base + s] = (short)np;
} else {
int np = p - cfg.perm_dec_i16;
if (np < 0) np = 0;
tm_syn_perm[syn_base + s] = (short)np;
}
}
} else {
// Allocate new segment on winner cell (cell 0 of col).
unsigned int new_seg = 0u;
if (lane == 0) {
unsigned int winner_cell = base_cell;
unsigned int slot = atomicAdd(&cell_seg_count[winner_cell], 1u);
if (slot >= MSC) slot = slot % MSC;
new_seg = winner_cell * MSC + slot;
seg_cell_id[new_seg] = winner_cell;
seg_syn_count[new_seg] = 0u;
}
target_seg = __shfl_sync(0xffffffffu, new_seg, 0);
existing_syn = 0u;
}
// Grow synapses to prev_winner cells — lane 0 serialized.
unsigned int room = (SPS > existing_syn) ? (SPS - existing_syn) : 0u;
unsigned int max_grow = (cfg.max_new_synapses < room) ? cfg.max_new_synapses : room;
if (lane == 0 && max_grow > 0u) {
unsigned int syn_base = target_seg * SPS;
unsigned int grown = 0u;
unsigned int start_off = (c * 2654435761u + cfg.iter_seed + t) % cfg.bits_words;
for (unsigned int w_off = 0u;
w_off < cfg.bits_words && grown < max_grow;
w_off++) {
unsigned int widx = (start_off + w_off) % cfg.bits_words;
unsigned int word = prev_winner[widx];
while (word != 0u && grown < max_grow) {
unsigned int bit_pos = __ffs(word) - 1u;
word &= ~(1u << bit_pos);
unsigned int cell_id = widx * 32u + bit_pos;
if (cell_id >= cfg.n_cells) continue;
bool exists = false;
for (unsigned int es = 0u; es < existing_syn + grown; es++) {
if (syn_presyn[syn_base + es] == cell_id) { exists = true; break; }
}
if (exists) continue;
unsigned int write_idx = existing_syn + grown;
if (write_idx >= SPS) break;
syn_presyn[syn_base + write_idx] = cell_id;
tm_syn_perm[syn_base + write_idx] = (short)cfg.initial_perm_i16;
grown++;
}
}
if (grown > 0u) {
seg_syn_count[target_seg] = existing_syn + grown;
}
}
}
}
}
// ---- BARRIER 3: TM writes complete before anomaly + next-step read ----
// Fence: flush curr_active/curr_winner bitsets + tm_syn_perm +
// seg_syn_count + syn_presyn before peers advance and consume them as
// prev_active/prev_winner at t+1.
__threadfence();
fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);
// Write anomaly for step t.
if (blockIdx.x == 0u && tid == 0u) {
unsigned int total = step_scratch[0];
unsigned int bad = step_scratch[1];
float anom = (total > 0u) ? ((float)bad / (float)total) : 0.0f;
anom_out[t] = anom;
}
}
}
// Single-region kernel (legacy call site).
__global__ __launch_bounds__(256, 2)
void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
htm_fused_step_body(P, cfg);
}
// Batched kernel: one cooperative launch for B regions. grid.y = B,
// grid.x = per-region block count. Each block reads its region's
// FusedPtrs from the device array via blockIdx.y.
__global__ __launch_bounds__(256, 2)
void htm_fused_step_batched(const FusedPtrs* __restrict__ P_arr, FusedConfig cfg) {
const FusedPtrs P = P_arr[blockIdx.y];
htm_fused_step_body(P, cfg);
}
} // extern "C"