Spaces:

GAInTech
/

feather-a10g-large-runtime

Paused

File size: 31,733 Bytes

1c59946

// Fused HTM megakernel — SP + TM, all T timesteps in a single launch.
//
// Design rationale:
//   - Global top-K column selection requires cross-block synchronization at
//     every timestep (grid.sync is unreliable on WSL2/sm_86 without rdc=true).
//   - Replace with per-column threshold activation using local lateral
//     inhibition: column c activates if overlap[c]*boost[c] > threshold[c].
//     Threshold is a per-column running-EMA learned scalar that steers the
//     column's long-run activation rate toward the global sparsity target.
//   - This is biologically grounded (GABAergic local inhibition) and supported
//     by HTM theory (duty-cycle boost already drives this loop; we just
//     change which lever the EMA pulls).
//
// Launch shape:
//   grid  = min(device SM count, 16)  // hard cap — see below
//   block = 1024 threads = 32 warps
//   Each warp of 32 owns a contiguous column slice (n_columns / total_warps).
//
// Cross-block coherence:
//   - Ping-pong buffers for cell_active/cell_winner: write _a at even t,
//     read _b; reversed at odd t.
//   - Preferred path: cooperative launch + hardware whole-grid sync.
//   - Fallback path: software 3-slot rotating grid barrier for devices/drivers
//     that cannot do cooperative launch.
//
// 2026-04-16: grid_dim reduced from 28 to 16 after deadlock RCA. The previous
// cap of 28 relied on all blocks being concurrently resident on a 30-SM RTX
// 3060 Laptop. Under thermal throttling effective residency dropped to ~20-24,
// leaving scheduled blocks spinning on the software grid barrier waiting for
// peer blocks that would never run. 16 blocks is below any realistic residency
// floor and preserves enough warp parallelism (16*32 = 512 warps) to saturate
// memory bandwidth on the spatial-pooler stage.
//
// Kernel signature uses struct-by-value for pointers and config to stay
// inside cudarc's launch-arg count limit.

#include <cooperative_groups.h>
#include <cooperative_groups/memcpy_async.h>

namespace cg = cooperative_groups;

// Maximum columns owned per cluster-block in DSMEM.
// Supports n_columns up to COLS_PER_CLUSTER_BLOCK_MAX * cluster_size.
// At cluster_size=16: supports up to 256*16=4096 columns.
// Each array costs 256*4 = 1024 bytes; three arrays = 3072 bytes per SM —
// well under the 228 KB H200 shared-memory cap.
#define COLS_PER_CLUSTER_BLOCK_MAX 256u

// Maximum input_bits supported by the TMA-multicast staging tile.
// At 32 KB this covers the production SDR width (16384 bits) with 2× headroom.
// Total shared per SM: 32768 (tile) + 3072 (DSMEM float arrays) = ~35 KB —
// well under the 228 KB H200 limit.
//
// Expected speedup from TMA multicast input staging (T9/T11):
//   - Without staging: 16 SMs × T × (input_bits GMEM reads per timestep)
//   - With staging:    1 TMA DMA per timestep, shared reads from L1 thereafter
//   - Theoretical DRAM bandwidth reduction: ~16× on input reads
//   - Wall-clock reduction estimate: -20 to -40 ms from reduced input fetch latency
#define INPUT_BITS_MAX 32768u

extern "C" {

struct FusedPtrs {
    unsigned long long syn_bit;
    unsigned long long syn_perm;
    unsigned long long boost;
    unsigned long long active_duty;
    unsigned long long inhibition_threshold;
    unsigned long long seg_cell_id;
    unsigned long long seg_syn_count;
    unsigned long long syn_presyn;
    unsigned long long tm_syn_perm;
    unsigned long long cell_seg_count;
    unsigned long long cell_active_a;
    unsigned long long cell_active_b;
    unsigned long long cell_winner_a;
    unsigned long long cell_winner_b;
    unsigned long long inputs;
    unsigned long long cols_out;
    unsigned long long anom_out;
    unsigned long long barrier_counters;
    unsigned long long step_scratch;
};

struct FusedConfig {
    // SP constants
    unsigned int input_bits;
    unsigned int n_columns;
    unsigned int synapses_per_col;
    float        conn_thr;
    float        sp_inc;
    float        sp_dec;
    float        sparsity_target;
    float        duty_alpha;
    float        thr_adapt_rate;
    // TM constants
    unsigned int cells_per_column;
    unsigned int n_cells;
    unsigned int bits_words;
    unsigned int max_segments_per_cell;
    unsigned int synapses_per_segment;
    unsigned int activation_threshold;
    unsigned int learning_threshold;
    unsigned int max_new_synapses;
    int          conn_thr_i16;
    int          perm_inc_i16;
    int          perm_dec_i16;
    int          predicted_seg_dec_i16;
    int          initial_perm_i16;
    // Loop constants
    unsigned int T;
    unsigned int learn;
    unsigned int iter_seed;
    unsigned int cooperative_grid_sync;
};

// Hardware cluster barrier using Hopper sm_90a cooperative_groups::this_cluster().sync().
// Replaces the former software Decoupled Look-Back (DLB) atomic-spin barrier.
//
// cluster::sync() is a single PTX instruction (barrier.cluster) that resolves
// in ~10-40 ns inside the cluster, with no device-level serialization.
// Multiple clusters (one per HTM region) run fully concurrently — bounded
// only by SM count (8 clusters × 16 SMs = 128 ≤ 132 on H200).
//
// The flags / expected / phase / cooperative_grid_sync parameters are kept
// in the signature for call-site compatibility but are unused.
__device__ static inline void fused_grid_barrier(cg::grid_group grid,
                                                 unsigned int * /* flags — unused */,
                                                 unsigned int /* expected — unused */,
                                                 unsigned int /* phase — unused */,
                                                 unsigned int /* cooperative_grid_sync — unused */) {
#if __CUDA_ARCH__ >= 900
    // Hopper+ : hardware cluster barrier (~10-40 ns)
    auto cluster = cg::this_cluster();
    cluster.sync();
#else
    // Pre-Hopper (sm_80, sm_86, sm_89): grid-level cooperative sync.
    // Requires cooperative kernel launch. ~us-ms range, adequate for HTM
    // workload (kernel launch frequency is low).
    grid.sync();
#endif
}

__device__ static inline unsigned int warp_sum_u32(unsigned int v) {
    for (int off = 16; off > 0; off >>= 1) {
        v += __shfl_down_sync(0xffffffffu, v, off);
    }
    return v;
}

// Core kernel body — works for both single-region and batched launches.
// Single-region: caller passes the one FusedPtrs struct.
// Batched: each block reads its region's FusedPtrs via blockIdx.y before
// calling this. State is independent per region (each region owns its own
// GPU buffers); grid.sync() is the only cross-block primitive and it
// spans ALL blocks in the grid (harmless over-sync across regions).
__device__ static inline
void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
    cg::grid_group grid = cg::this_grid();
    // Cast pointers.
    const unsigned int  * __restrict__ syn_bit               = (const unsigned int*)P.syn_bit;
    float               * __restrict__ syn_perm              = (float*)P.syn_perm;
    float               * __restrict__ boost                 = (float*)P.boost;
    float               * __restrict__ active_duty           = (float*)P.active_duty;
    float               * __restrict__ inhibition_threshold  = (float*)P.inhibition_threshold;
    unsigned int        * __restrict__ seg_cell_id           = (unsigned int*)P.seg_cell_id;
    unsigned int        * __restrict__ seg_syn_count         = (unsigned int*)P.seg_syn_count;
    unsigned int        * __restrict__ syn_presyn            = (unsigned int*)P.syn_presyn;
    short               * __restrict__ tm_syn_perm           = (short*)P.tm_syn_perm;
    unsigned int        * __restrict__ cell_seg_count        = (unsigned int*)P.cell_seg_count;
    unsigned int        * __restrict__ cell_active_a         = (unsigned int*)P.cell_active_a;
    unsigned int        * __restrict__ cell_active_b         = (unsigned int*)P.cell_active_b;
    unsigned int        * __restrict__ cell_winner_a         = (unsigned int*)P.cell_winner_a;
    unsigned int        * __restrict__ cell_winner_b         = (unsigned int*)P.cell_winner_b;
    const unsigned char * __restrict__ inputs                = (const unsigned char*)P.inputs;
    unsigned char       * __restrict__ cols_out              = (unsigned char*)P.cols_out;
    float               * __restrict__ anom_out              = (float*)P.anom_out;
    unsigned int        * __restrict__ barrier_counters      = (unsigned int*)P.barrier_counters;
    unsigned int        * __restrict__ step_scratch          = (unsigned int*)P.step_scratch;

    const unsigned int tid     = threadIdx.x;
    const unsigned int lane    = tid & 31u;
    const unsigned int warp    = tid >> 5;
    const unsigned int warps_per_block = blockDim.x >> 5;
    const unsigned int gwarp   = blockIdx.x * warps_per_block + warp;
    const unsigned int n_warps = gridDim.x * warps_per_block;

    const unsigned int n_cols  = cfg.n_columns;
    const unsigned int col_lo  = (gwarp * n_cols) / n_warps;
    const unsigned int col_hi  = ((gwarp + 1) * n_cols) / n_warps;

    unsigned int phase = 0u;

    // =========================================================
    // DSMEM: Cluster-distributed shared memory for hot per-column
    // state (inhibition_threshold, boost, active_duty).
    //
    // On Hopper (sm_90+): Each block in the cluster owns a contiguous
    // slice of columns in its own __shared__ arrays. Any block can
    // peer-read another block's slice via cluster.map_shared_rank().
    //
    // On Ampere (sm_86) and other pre-Hopper: No cluster support.
    // Read/write directly from/to global memory (inhibition_threshold,
    // boost, active_duty device pointers). Slightly higher latency but
    // functionally correct.
    // =========================================================

#if __CUDA_ARCH__ >= 900
    // Hopper+ cluster path
    auto cluster = cg::this_cluster();
    const unsigned int cluster_block_rank = cluster.block_rank();  // 0..cluster_size-1
    const unsigned int cluster_sz         = cluster.num_blocks();  // == gridDim.x (≤16)
#else
    // Pre-Hopper: no cluster, each block is independent.
    const unsigned int cluster_block_rank = blockIdx.x;
    const unsigned int cluster_sz         = gridDim.x;
#endif

    // Partition n_cols evenly across cluster blocks.
    // Each block owns cols_per_block columns starting at my_col_start.
    const unsigned int cols_per_block =
        (n_cols + cluster_sz - 1u) / cluster_sz;               // ceil div
    const unsigned int my_col_start =
        cluster_block_rank * cols_per_block;
    const unsigned int my_col_end =
        (my_col_start + cols_per_block < n_cols)
            ? (my_col_start + cols_per_block) : n_cols;        // clamp

#if __CUDA_ARCH__ >= 900
    // Cluster-distributed shared memory arrays.
    // Each block holds at most COLS_PER_CLUSTER_BLOCK_MAX floats per array.
    // Peer blocks address into each other's smem via map_shared_rank.
    __shared__ float s_inhib_thr [COLS_PER_CLUSTER_BLOCK_MAX];
    __shared__ float s_boost     [COLS_PER_CLUSTER_BLOCK_MAX];
    __shared__ float s_active_duty[COLS_PER_CLUSTER_BLOCK_MAX];
#endif

    // TMA multicast input staging tile (T9) — HOPPER ONLY.
    //
    // On Hopper: cg::memcpy_async with cluster scope multicasts input to all
    // 16 SMs, reducing DRAM traffic by ~16×.
    // On Ampere: 32 KB smem allocation exceeds per-block budget when
    // cooperatively launched (48 KB total, registers eat the rest). Skip the
    // tile entirely — Stage A reads from GMEM directly (original path).
#if __CUDA_ARCH__ >= 900
    __shared__ __align__(16) unsigned char s_input_tile[INPUT_BITS_MAX];
#endif

#if __CUDA_ARCH__ >= 900
    // Initial GMEM → smem load (reads state from previous forward call).
    // Each block loads only its own slice; tid strides across the slice.
    for (unsigned int c = my_col_start + tid; c < my_col_end; c += blockDim.x) {
        const unsigned int off = c - my_col_start;
        s_inhib_thr [off] = inhibition_threshold[c];
        s_boost     [off] = boost[c];
        s_active_duty[off] = active_duty[c];
    }

    // All blocks in the cluster must finish loading before any block
    // starts reading peer smem inside the T-loop.
    cluster.sync();
#else
    // Pre-Hopper: no smem caching needed — reads go directly to GMEM.
    // Grid sync ensures all blocks have completed Phase 0 init before T-loop.
    grid.sync();
#endif

    const unsigned int S   = cfg.synapses_per_col;
    const unsigned int cpc = cfg.cells_per_column;
    const unsigned int SPS = cfg.synapses_per_segment;
    const unsigned int MSC = cfg.max_segments_per_cell;

    // Main timestep loop.
    for (unsigned int t = 0u; t < cfg.T; t++) {
        const unsigned int inp_off      = t * cfg.input_bits;
        const unsigned int col_base_out = t * n_cols;

        unsigned int * curr_active = (t & 1u) ? cell_active_b : cell_active_a;
        unsigned int * prev_active = (t & 1u) ? cell_active_a : cell_active_b;
        unsigned int * curr_winner = (t & 1u) ? cell_winner_b : cell_winner_a;
        unsigned int * prev_winner = (t & 1u) ? cell_winner_a : cell_winner_b;

        // ---- Phase 0: clear curr bitsets for my cell range ----
        const unsigned int my_cell_lo = col_lo * cpc;
        const unsigned int my_cell_hi = col_hi * cpc;
        if (cpc == 32u) {
            // Fast path: one word per column.
            for (unsigned int c = col_lo + lane; c < col_hi; c += 32u) {
                curr_active[c] = 0u;
                curr_winner[c] = 0u;
            }
        } else {
            for (unsigned int cell = my_cell_lo + lane; cell < my_cell_hi; cell += 32u) {
                unsigned int w = cell >> 5;
                unsigned int m = 1u << (cell & 31u);
                atomicAnd(&curr_active[w], ~m);
                atomicAnd(&curr_winner[w], ~m);
            }
        }

        // Block 0, lane 0, warp 0 resets step-scratch counters.
        if (blockIdx.x == 0u && tid == 0u) {
            step_scratch[0] = 0u;
            step_scratch[1] = 0u;
        }

        // ---- BARRIER 1 ----
        // Fence: make the above clear-bitsets + scratch writes globally
        // visible before peer blocks observe "barrier arrived".
        __threadfence();
        fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);

        // =========================================================
        // T9: TMA MULTICAST INPUT STAGING
        //
        // Issue a single cluster-scope async DMA to broadcast this
        // timestep's input slice into s_input_tile across all 16 SMs
        // in the cluster simultaneously.  On Hopper sm_90a,
        // cg::memcpy_async with cluster scope maps to the TMA
        // hardware unit (cp.async.bulk.tensor multicast), reducing
        // DRAM input traffic by ~16× vs each block fetching its own
        // copy from GMEM.
        //
        // The staging is gated on cfg.input_bits <= INPUT_BITS_MAX.
        // If the tile is too small (custom large input_bits), we fall
        // back to per-thread GMEM reads in Stage A (identical to the
        // original path; use_input_tile==false).
        //
        // Ordering: BARRIER 1 completes before we issue the DMA.
        // The DMA completes before Stage A reads s_input_tile.
        // =========================================================
#if __CUDA_ARCH__ >= 900
        const bool use_input_tile = (cfg.input_bits <= INPUT_BITS_MAX);
        if (use_input_tile) {
            auto tb = cg::this_thread_block();
            cg::memcpy_async(tb, s_input_tile,
                             inputs + inp_off,
                             cfg.input_bits);
            cg::wait(tb);
            cluster.sync();
        }
#else
        const bool use_input_tile = false;
#endif

        // =========================================================
        // STAGE A: Spatial Pooler
        //
        // Hot per-column state (boost, inhibition_threshold,
        // active_duty) is served from cluster DSMEM rather than
        // GMEM for each of the T timesteps.  GMEM is written on
        // update so state persists across forward calls.
        // =========================================================
        for (unsigned int c = col_lo; c < col_hi; c++) {
            unsigned int base = c * S;
            unsigned int local = 0u;
            for (unsigned int s = lane; s < S; s += 32u) {
                unsigned int b = syn_bit[base + s];
                float p = syn_perm[base + s];
                // T9: read from cluster-broadcast tile when available;
                // fall back to direct GMEM when input_bits > INPUT_BITS_MAX.
#if __CUDA_ARCH__ >= 900
                unsigned int inp_byte = use_input_tile
                    ? (unsigned int)s_input_tile[b]
                    : (unsigned int)inputs[inp_off + b];
#else
                unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
#endif
                unsigned int hit = ((inp_byte != 0u) && (p >= cfg.conn_thr)) ? 1u : 0u;
                local += hit;
            }
            unsigned int overlap = warp_sum_u32(local);
            overlap = __shfl_sync(0xffffffffu, overlap, 0);

            // Read boost + threshold for column c.
#if __CUDA_ARCH__ >= 900
            // Hopper: read from cluster-distributed shared memory.
            const unsigned int owner_block  = c / cols_per_block;
            const unsigned int owner_offset = c - owner_block * cols_per_block;
            float boost_val = cluster.map_shared_rank(s_boost,      owner_block)[owner_offset];
            float thr       = cluster.map_shared_rank(s_inhib_thr,  owner_block)[owner_offset];
#else
            // Pre-Hopper: read directly from global memory.
            float boost_val = boost[c];
            float thr       = inhibition_threshold[c];
#endif

            float boosted = (float)overlap * boost_val;
            unsigned int is_active = (boosted > thr) ? 1u : 0u;

            if (lane == 0) {
                cols_out[col_base_out + c] = (unsigned char)is_active;
                if (is_active) {
                    atomicAdd(&step_scratch[0], 1u);
                }
            }

            // SP learn (Hebbian) on active columns.
            // T9: use tile for input reads here too.
            if (cfg.learn && is_active) {
                for (unsigned int s = lane; s < S; s += 32u) {
                    unsigned int b = syn_bit[base + s];
                    float p = syn_perm[base + s];
#if __CUDA_ARCH__ >= 900
                    unsigned int inp_byte = use_input_tile
                        ? (unsigned int)s_input_tile[b]
                        : (unsigned int)inputs[inp_off + b];
#else
                    unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
#endif
                    if (inp_byte != 0u) {
                        p += cfg.sp_inc;
                        if (p > 1.0f) p = 1.0f;
                    } else {
                        p -= cfg.sp_dec;
                        if (p < 0.0f) p = 0.0f;
                    }
                    syn_perm[base + s] = p;
                }
            }

            // active_duty EMA + threshold adaptation.
            // Writes go to both DSMEM (hot path, Hopper only) and GMEM (persistence).
            if (lane == 0) {
#if __CUDA_ARCH__ >= 900
                float ad = cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset];
#else
                float ad = active_duty[c];
#endif
                float sample = is_active ? 1.0f : 0.0f;
                ad = (1.0f - cfg.duty_alpha) * ad + cfg.duty_alpha * sample;

#if __CUDA_ARCH__ >= 900
                // Writeback: peer smem (for next timestep read) + GMEM (persistence).
                cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad;
#endif
                active_duty[c] = ad;

                // Threshold steers toward target sparsity.
                float err = ad - cfg.sparsity_target;
                float new_thr = thr + cfg.thr_adapt_rate * err * 100.0f;
                if (new_thr < 0.1f) new_thr = 0.1f;
                if (new_thr > 1000.0f) new_thr = 1000.0f;

#if __CUDA_ARCH__ >= 900
                // Writeback: peer smem (for next timestep read) + GMEM (persistence).
                cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] = new_thr;
#endif
                inhibition_threshold[c] = new_thr;
            }
        }

        // ---- DSMEM WRITEBACK SYNC: peer-smem writes must be visible cluster-wide ----
        //
        // On Hopper: cluster.sync() ensures all peer smem writes from this
        // timestep are visible to all blocks before Stage B / next t.
        // On pre-Hopper: no smem peer writes occur (all state in GMEM),
        // so no extra sync needed here — the grid barrier below suffices.
#if __CUDA_ARCH__ >= 900
        cluster.sync();
#endif

        // ---- BARRIER 2: SP active_mask must be visible before TM reads ----
        // Fence: flush cols_out + active_duty + inhibition_threshold + step_scratch
        // writes to global memory before peers advance past this barrier.
        __threadfence();
        fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);

        // =========================================================
        // STAGE B: Temporal Memory
        // =========================================================
        for (unsigned int c = col_lo; c < col_hi; c++) {
            unsigned int col_active = cols_out[col_base_out + c];
            if (col_active == 0u) continue;

            unsigned int base_cell = c * cpc;
            unsigned int any_predicted = 0u;
            unsigned int best_seg_id_for_grow = 0xFFFFFFFFu;
            unsigned int best_pot_count = 0u;

            for (unsigned int k = 0u; k < cpc; k++) {
                unsigned int cell = base_cell + k;
                unsigned int n_segs_here = cell_seg_count[cell];
                if (n_segs_here > MSC) n_segs_here = MSC;
                if (n_segs_here == 0u) continue;

                unsigned int seg_base_id = cell * MSC;
                unsigned int cell_is_predictive = 0u;

                for (unsigned int ls = 0u; ls < n_segs_here; ls++) {
                    unsigned int seg = seg_base_id + ls;
                    unsigned int n_syn = seg_syn_count[seg];
                    if (n_syn == 0u) continue;
                    unsigned int syn_base = seg * SPS;

                    unsigned int l_conn = 0u;
                    unsigned int l_pot  = 0u;
                    for (unsigned int s = lane; s < n_syn; s += 32u) {
                        unsigned int presyn = syn_presyn[syn_base + s];
                        unsigned int w = prev_active[presyn >> 5];
                        unsigned int bit = (w >> (presyn & 31u)) & 1u;
                        if (bit) {
                            l_pot += 1u;
                            int p = (int)tm_syn_perm[syn_base + s];
                            if (p >= cfg.conn_thr_i16) l_conn += 1u;
                        }
                    }
                    unsigned int tot_conn = warp_sum_u32(l_conn);
                    unsigned int tot_pot  = warp_sum_u32(l_pot);
                    tot_conn = __shfl_sync(0xffffffffu, tot_conn, 0);
                    tot_pot  = __shfl_sync(0xffffffffu, tot_pot, 0);

                    if (tot_conn >= cfg.activation_threshold) cell_is_predictive = 1u;
                    if (tot_pot >= cfg.learning_threshold && tot_pot > best_pot_count) {
                        best_pot_count = tot_pot;
                        best_seg_id_for_grow = seg;
                    }

                    // Reinforce predicted-and-correct segment.
                    if (cfg.learn && tot_conn >= cfg.activation_threshold) {
                        for (unsigned int s = lane; s < n_syn; s += 32u) {
                            unsigned int presyn = syn_presyn[syn_base + s];
                            unsigned int w = prev_active[presyn >> 5];
                            unsigned int bit = (w >> (presyn & 31u)) & 1u;
                            int p = (int)tm_syn_perm[syn_base + s];
                            if (bit) {
                                int np = p + cfg.perm_inc_i16;
                                if (np > 32767) np = 32767;
                                tm_syn_perm[syn_base + s] = (short)np;
                            } else {
                                int np = p - cfg.perm_dec_i16;
                                if (np < 0) np = 0;
                                tm_syn_perm[syn_base + s] = (short)np;
                            }
                        }
                    }
                }

                if (cell_is_predictive) {
                    any_predicted = 1u;
                    if (lane == 0) {
                        unsigned int w = cell >> 5;
                        unsigned int m = 1u << (cell & 31u);
                        atomicOr(&curr_active[w], m);
                        atomicOr(&curr_winner[w], m);
                    }
                }
            }

            // BURST if no predicted.
            if (!any_predicted) {
                if (lane == 0) {
                    for (unsigned int k = 0u; k < cpc; k++) {
                        unsigned int cell = base_cell + k;
                        unsigned int w = cell >> 5;
                        unsigned int m = 1u << (cell & 31u);
                        atomicOr(&curr_active[w], m);
                    }
                    unsigned int win = base_cell;
                    unsigned int ww = win >> 5;
                    unsigned int wm = 1u << (win & 31u);
                    atomicOr(&curr_winner[ww], wm);
                    atomicAdd(&step_scratch[1], 1u);
                }

                if (cfg.learn) {
                    unsigned int target_seg;
                    unsigned int existing_syn;
                    if (best_seg_id_for_grow != 0xFFFFFFFFu) {
                        // Reuse best matching segment.
                        target_seg = best_seg_id_for_grow;
                        existing_syn = seg_syn_count[target_seg];
                        target_seg = __shfl_sync(0xffffffffu, target_seg, 0);
                        existing_syn = __shfl_sync(0xffffffffu, existing_syn, 0);

                        // Reinforce its existing synapses.
                        unsigned int syn_base = target_seg * SPS;
                        for (unsigned int s = lane; s < existing_syn; s += 32u) {
                            unsigned int presyn = syn_presyn[syn_base + s];
                            unsigned int w = prev_active[presyn >> 5];
                            unsigned int bit = (w >> (presyn & 31u)) & 1u;
                            int p = (int)tm_syn_perm[syn_base + s];
                            if (bit) {
                                int np = p + cfg.perm_inc_i16;
                                if (np > 32767) np = 32767;
                                tm_syn_perm[syn_base + s] = (short)np;
                            } else {
                                int np = p - cfg.perm_dec_i16;
                                if (np < 0) np = 0;
                                tm_syn_perm[syn_base + s] = (short)np;
                            }
                        }
                    } else {
                        // Allocate new segment on winner cell (cell 0 of col).
                        unsigned int new_seg = 0u;
                        if (lane == 0) {
                            unsigned int winner_cell = base_cell;
                            unsigned int slot = atomicAdd(&cell_seg_count[winner_cell], 1u);
                            if (slot >= MSC) slot = slot % MSC;
                            new_seg = winner_cell * MSC + slot;
                            seg_cell_id[new_seg] = winner_cell;
                            seg_syn_count[new_seg] = 0u;
                        }
                        target_seg = __shfl_sync(0xffffffffu, new_seg, 0);
                        existing_syn = 0u;
                    }

                    // Grow synapses to prev_winner cells — lane 0 serialized.
                    unsigned int room = (SPS > existing_syn) ? (SPS - existing_syn) : 0u;
                    unsigned int max_grow = (cfg.max_new_synapses < room) ? cfg.max_new_synapses : room;
                    if (lane == 0 && max_grow > 0u) {
                        unsigned int syn_base = target_seg * SPS;
                        unsigned int grown = 0u;
                        unsigned int start_off = (c * 2654435761u + cfg.iter_seed + t) % cfg.bits_words;
                        for (unsigned int w_off = 0u;
                             w_off < cfg.bits_words && grown < max_grow;
                             w_off++) {
                            unsigned int widx = (start_off + w_off) % cfg.bits_words;
                            unsigned int word = prev_winner[widx];
                            while (word != 0u && grown < max_grow) {
                                unsigned int bit_pos = __ffs(word) - 1u;
                                word &= ~(1u << bit_pos);
                                unsigned int cell_id = widx * 32u + bit_pos;
                                if (cell_id >= cfg.n_cells) continue;
                                bool exists = false;
                                for (unsigned int es = 0u; es < existing_syn + grown; es++) {
                                    if (syn_presyn[syn_base + es] == cell_id) { exists = true; break; }
                                }
                                if (exists) continue;
                                unsigned int write_idx = existing_syn + grown;
                                if (write_idx >= SPS) break;
                                syn_presyn[syn_base + write_idx] = cell_id;
                                tm_syn_perm[syn_base + write_idx] = (short)cfg.initial_perm_i16;
                                grown++;
                            }
                        }
                        if (grown > 0u) {
                            seg_syn_count[target_seg] = existing_syn + grown;
                        }
                    }
                }
            }
        }

        // ---- BARRIER 3: TM writes complete before anomaly + next-step read ----
        // Fence: flush curr_active/curr_winner bitsets + tm_syn_perm +
        // seg_syn_count + syn_presyn before peers advance and consume them as
        // prev_active/prev_winner at t+1.
        __threadfence();
        fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);

        // Write anomaly for step t.
        if (blockIdx.x == 0u && tid == 0u) {
            unsigned int total = step_scratch[0];
            unsigned int bad   = step_scratch[1];
            float anom = (total > 0u) ? ((float)bad / (float)total) : 0.0f;
            anom_out[t] = anom;
        }
    }
}

// Single-region kernel (legacy call site).
__global__ __launch_bounds__(256, 2)
void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
    htm_fused_step_body(P, cfg);
}

// Batched kernel: one cooperative launch for B regions. grid.y = B,
// grid.x = per-region block count. Each block reads its region's
// FusedPtrs from the device array via blockIdx.y.
__global__ __launch_bounds__(256, 2)
void htm_fused_step_batched(const FusedPtrs* __restrict__ P_arr, FusedConfig cfg) {
    const FusedPtrs P = P_arr[blockIdx.y];
    htm_fused_step_body(P, cfg);
}

} // extern "C"