Spaces:
Build error
Build error
| // Fused HTM megakernel — SP + TM, all T timesteps in a single launch. | |
| // | |
| // Design rationale: | |
| // - Global top-K column selection requires cross-block synchronization at | |
| // every timestep (grid.sync is unreliable on WSL2/sm_86 without rdc=true). | |
| // - Replace with per-column threshold activation using local lateral | |
| // inhibition: column c activates if overlap[c]*boost[c] > threshold[c]. | |
| // Threshold is a per-column running-EMA learned scalar that steers the | |
| // column's long-run activation rate toward the global sparsity target. | |
| // - This is biologically grounded (GABAergic local inhibition) and supported | |
| // by HTM theory (duty-cycle boost already drives this loop; we just | |
| // change which lever the EMA pulls). | |
| // | |
| // Launch shape: | |
| // grid = min(device SM count, 16) // hard cap — see below | |
| // block = 1024 threads = 32 warps | |
| // Each warp of 32 owns a contiguous column slice (n_columns / total_warps). | |
| // | |
| // Cross-block coherence: | |
| // - Ping-pong buffers for cell_active/cell_winner: write _a at even t, | |
| // read _b; reversed at odd t. | |
| // - Preferred path: cooperative launch + hardware whole-grid sync. | |
| // - Fallback path: software 3-slot rotating grid barrier for devices/drivers | |
| // that cannot do cooperative launch. | |
| // | |
| // 2026-04-16: grid_dim reduced from 28 to 16 after deadlock RCA. The previous | |
| // cap of 28 relied on all blocks being concurrently resident on a 30-SM RTX | |
| // 3060 Laptop. Under thermal throttling effective residency dropped to ~20-24, | |
| // leaving scheduled blocks spinning on the software grid barrier waiting for | |
| // peer blocks that would never run. 16 blocks is below any realistic residency | |
| // floor and preserves enough warp parallelism (16*32 = 512 warps) to saturate | |
| // memory bandwidth on the spatial-pooler stage. | |
| // | |
| // Kernel signature uses struct-by-value for pointers and config to stay | |
| // inside cudarc's launch-arg count limit. | |
| namespace cg = cooperative_groups; | |
| // Maximum columns owned per cluster-block in DSMEM. | |
| // Supports n_columns up to COLS_PER_CLUSTER_BLOCK_MAX * cluster_size. | |
| // At cluster_size=16: supports up to 256*16=4096 columns. | |
| // Each array costs 256*4 = 1024 bytes; three arrays = 3072 bytes per SM — | |
| // well under the 228 KB H200 shared-memory cap. | |
| // Maximum input_bits supported by the TMA-multicast staging tile. | |
| // At 32 KB this covers the production SDR width (16384 bits) with 2× headroom. | |
| // Total shared per SM: 32768 (tile) + 3072 (DSMEM float arrays) = ~35 KB — | |
| // well under the 228 KB H200 limit. | |
| // | |
| // Expected speedup from TMA multicast input staging (T9/T11): | |
| // - Without staging: 16 SMs × T × (input_bits GMEM reads per timestep) | |
| // - With staging: 1 TMA DMA per timestep, shared reads from L1 thereafter | |
| // - Theoretical DRAM bandwidth reduction: ~16× on input reads | |
| // - Wall-clock reduction estimate: -20 to -40 ms from reduced input fetch latency | |
| extern "C" { | |
| struct FusedPtrs { | |
| unsigned long long syn_bit; | |
| unsigned long long syn_perm; | |
| unsigned long long boost; | |
| unsigned long long active_duty; | |
| unsigned long long inhibition_threshold; | |
| unsigned long long seg_cell_id; | |
| unsigned long long seg_syn_count; | |
| unsigned long long syn_presyn; | |
| unsigned long long tm_syn_perm; | |
| unsigned long long cell_seg_count; | |
| unsigned long long cell_active_a; | |
| unsigned long long cell_active_b; | |
| unsigned long long cell_winner_a; | |
| unsigned long long cell_winner_b; | |
| unsigned long long inputs; | |
| unsigned long long cols_out; | |
| unsigned long long anom_out; | |
| unsigned long long barrier_counters; | |
| unsigned long long step_scratch; | |
| }; | |
| struct FusedConfig { | |
| // SP constants | |
| unsigned int input_bits; | |
| unsigned int n_columns; | |
| unsigned int synapses_per_col; | |
| float conn_thr; | |
| float sp_inc; | |
| float sp_dec; | |
| float sparsity_target; | |
| float duty_alpha; | |
| float thr_adapt_rate; | |
| // TM constants | |
| unsigned int cells_per_column; | |
| unsigned int n_cells; | |
| unsigned int bits_words; | |
| unsigned int max_segments_per_cell; | |
| unsigned int synapses_per_segment; | |
| unsigned int activation_threshold; | |
| unsigned int learning_threshold; | |
| unsigned int max_new_synapses; | |
| int conn_thr_i16; | |
| int perm_inc_i16; | |
| int perm_dec_i16; | |
| int predicted_seg_dec_i16; | |
| int initial_perm_i16; | |
| // Loop constants | |
| unsigned int T; | |
| unsigned int learn; | |
| unsigned int iter_seed; | |
| unsigned int cooperative_grid_sync; | |
| }; | |
| // Hardware cluster barrier using Hopper sm_90a cooperative_groups::this_cluster().sync(). | |
| // Replaces the former software Decoupled Look-Back (DLB) atomic-spin barrier. | |
| // | |
| // cluster::sync() is a single PTX instruction (barrier.cluster) that resolves | |
| // in ~10-40 ns inside the cluster, with no device-level serialization. | |
| // Multiple clusters (one per HTM region) run fully concurrently — bounded | |
| // only by SM count (8 clusters × 16 SMs = 128 ≤ 132 on H200). | |
| // | |
| // The flags / expected / phase / cooperative_grid_sync parameters are kept | |
| // in the signature for call-site compatibility but are unused. | |
| __device__ static inline void fused_grid_barrier(cg::grid_group grid, | |
| unsigned int * /* flags — unused */, | |
| unsigned int /* expected — unused */, | |
| unsigned int /* phase — unused */, | |
| unsigned int /* cooperative_grid_sync — unused */) { | |
| // Hopper+ : hardware cluster barrier (~10-40 ns) | |
| auto cluster = cg::this_cluster(); | |
| cluster.sync(); | |
| // Pre-Hopper (sm_80, sm_86, sm_89): grid-level cooperative sync. | |
| // Requires cooperative kernel launch. ~us-ms range, adequate for HTM | |
| // workload (kernel launch frequency is low). | |
| grid.sync(); | |
| } | |
| __device__ static inline unsigned int warp_sum_u32(unsigned int v) { | |
| for (int off = 16; off > 0; off >>= 1) { | |
| v += __shfl_down_sync(0xffffffffu, v, off); | |
| } | |
| return v; | |
| } | |
| // Core kernel body — works for both single-region and batched launches. | |
| // Single-region: caller passes the one FusedPtrs struct. | |
| // Batched: each block reads its region's FusedPtrs via blockIdx.y before | |
| // calling this. State is independent per region (each region owns its own | |
| // GPU buffers); grid.sync() is the only cross-block primitive and it | |
| // spans ALL blocks in the grid (harmless over-sync across regions). | |
| __device__ static inline | |
| void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) { | |
| cg::grid_group grid = cg::this_grid(); | |
| // Cast pointers. | |
| const unsigned int * __restrict__ syn_bit = (const unsigned int*)P.syn_bit; | |
| float * __restrict__ syn_perm = (float*)P.syn_perm; | |
| float * __restrict__ boost = (float*)P.boost; | |
| float * __restrict__ active_duty = (float*)P.active_duty; | |
| float * __restrict__ inhibition_threshold = (float*)P.inhibition_threshold; | |
| unsigned int * __restrict__ seg_cell_id = (unsigned int*)P.seg_cell_id; | |
| unsigned int * __restrict__ seg_syn_count = (unsigned int*)P.seg_syn_count; | |
| unsigned int * __restrict__ syn_presyn = (unsigned int*)P.syn_presyn; | |
| short * __restrict__ tm_syn_perm = (short*)P.tm_syn_perm; | |
| unsigned int * __restrict__ cell_seg_count = (unsigned int*)P.cell_seg_count; | |
| unsigned int * __restrict__ cell_active_a = (unsigned int*)P.cell_active_a; | |
| unsigned int * __restrict__ cell_active_b = (unsigned int*)P.cell_active_b; | |
| unsigned int * __restrict__ cell_winner_a = (unsigned int*)P.cell_winner_a; | |
| unsigned int * __restrict__ cell_winner_b = (unsigned int*)P.cell_winner_b; | |
| const unsigned char * __restrict__ inputs = (const unsigned char*)P.inputs; | |
| unsigned char * __restrict__ cols_out = (unsigned char*)P.cols_out; | |
| float * __restrict__ anom_out = (float*)P.anom_out; | |
| unsigned int * __restrict__ barrier_counters = (unsigned int*)P.barrier_counters; | |
| unsigned int * __restrict__ step_scratch = (unsigned int*)P.step_scratch; | |
| const unsigned int tid = threadIdx.x; | |
| const unsigned int lane = tid & 31u; | |
| const unsigned int warp = tid >> 5; | |
| const unsigned int warps_per_block = blockDim.x >> 5; | |
| const unsigned int gwarp = blockIdx.x * warps_per_block + warp; | |
| const unsigned int n_warps = gridDim.x * warps_per_block; | |
| const unsigned int n_cols = cfg.n_columns; | |
| const unsigned int col_lo = (gwarp * n_cols) / n_warps; | |
| const unsigned int col_hi = ((gwarp + 1) * n_cols) / n_warps; | |
| unsigned int phase = 0u; | |
| // ========================================================= | |
| // DSMEM: Cluster-distributed shared memory for hot per-column | |
| // state (inhibition_threshold, boost, active_duty). | |
| // | |
| // On Hopper (sm_90+): Each block in the cluster owns a contiguous | |
| // slice of columns in its own __shared__ arrays. Any block can | |
| // peer-read another block's slice via cluster.map_shared_rank(). | |
| // | |
| // On Ampere (sm_86) and other pre-Hopper: No cluster support. | |
| // Read/write directly from/to global memory (inhibition_threshold, | |
| // boost, active_duty device pointers). Slightly higher latency but | |
| // functionally correct. | |
| // ========================================================= | |
| // Hopper+ cluster path | |
| auto cluster = cg::this_cluster(); | |
| const unsigned int cluster_block_rank = cluster.block_rank(); // 0..cluster_size-1 | |
| const unsigned int cluster_sz = cluster.num_blocks(); // == gridDim.x (≤16) | |
| // Pre-Hopper: no cluster, each block is independent. | |
| const unsigned int cluster_block_rank = blockIdx.x; | |
| const unsigned int cluster_sz = gridDim.x; | |
| // Partition n_cols evenly across cluster blocks. | |
| // Each block owns cols_per_block columns starting at my_col_start. | |
| const unsigned int cols_per_block = | |
| (n_cols + cluster_sz - 1u) / cluster_sz; // ceil div | |
| const unsigned int my_col_start = | |
| cluster_block_rank * cols_per_block; | |
| const unsigned int my_col_end = | |
| (my_col_start + cols_per_block < n_cols) | |
| ? (my_col_start + cols_per_block) : n_cols; // clamp | |
| // Cluster-distributed shared memory arrays. | |
| // Each block holds at most COLS_PER_CLUSTER_BLOCK_MAX floats per array. | |
| // Peer blocks address into each other's smem via map_shared_rank. | |
| __shared__ float s_inhib_thr [COLS_PER_CLUSTER_BLOCK_MAX]; | |
| __shared__ float s_boost [COLS_PER_CLUSTER_BLOCK_MAX]; | |
| __shared__ float s_active_duty[COLS_PER_CLUSTER_BLOCK_MAX]; | |
| // TMA multicast input staging tile (T9) — HOPPER ONLY. | |
| // | |
| // On Hopper: cg::memcpy_async with cluster scope multicasts input to all | |
| // 16 SMs, reducing DRAM traffic by ~16×. | |
| // On Ampere: 32 KB smem allocation exceeds per-block budget when | |
| // cooperatively launched (48 KB total, registers eat the rest). Skip the | |
| // tile entirely — Stage A reads from GMEM directly (original path). | |
| __shared__ __align__(16) unsigned char s_input_tile[INPUT_BITS_MAX]; | |
| // Initial GMEM → smem load (reads state from previous forward call). | |
| // Each block loads only its own slice; tid strides across the slice. | |
| for (unsigned int c = my_col_start + tid; c < my_col_end; c += blockDim.x) { | |
| const unsigned int off = c - my_col_start; | |
| s_inhib_thr [off] = inhibition_threshold[c]; | |
| s_boost [off] = boost[c]; | |
| s_active_duty[off] = active_duty[c]; | |
| } | |
| // All blocks in the cluster must finish loading before any block | |
| // starts reading peer smem inside the T-loop. | |
| cluster.sync(); | |
| // Pre-Hopper: no smem caching needed — reads go directly to GMEM. | |
| // Grid sync ensures all blocks have completed Phase 0 init before T-loop. | |
| grid.sync(); | |
| const unsigned int S = cfg.synapses_per_col; | |
| const unsigned int cpc = cfg.cells_per_column; | |
| const unsigned int SPS = cfg.synapses_per_segment; | |
| const unsigned int MSC = cfg.max_segments_per_cell; | |
| // Main timestep loop. | |
| for (unsigned int t = 0u; t < cfg.T; t++) { | |
| const unsigned int inp_off = t * cfg.input_bits; | |
| const unsigned int col_base_out = t * n_cols; | |
| unsigned int * curr_active = (t & 1u) ? cell_active_b : cell_active_a; | |
| unsigned int * prev_active = (t & 1u) ? cell_active_a : cell_active_b; | |
| unsigned int * curr_winner = (t & 1u) ? cell_winner_b : cell_winner_a; | |
| unsigned int * prev_winner = (t & 1u) ? cell_winner_a : cell_winner_b; | |
| // ---- Phase 0: clear curr bitsets for my cell range ---- | |
| const unsigned int my_cell_lo = col_lo * cpc; | |
| const unsigned int my_cell_hi = col_hi * cpc; | |
| if (cpc == 32u) { | |
| // Fast path: one word per column. | |
| for (unsigned int c = col_lo + lane; c < col_hi; c += 32u) { | |
| curr_active[c] = 0u; | |
| curr_winner[c] = 0u; | |
| } | |
| } else { | |
| for (unsigned int cell = my_cell_lo + lane; cell < my_cell_hi; cell += 32u) { | |
| unsigned int w = cell >> 5; | |
| unsigned int m = 1u << (cell & 31u); | |
| atomicAnd(&curr_active[w], ~m); | |
| atomicAnd(&curr_winner[w], ~m); | |
| } | |
| } | |
| // Block 0, lane 0, warp 0 resets step-scratch counters. | |
| if (blockIdx.x == 0u && tid == 0u) { | |
| step_scratch[0] = 0u; | |
| step_scratch[1] = 0u; | |
| } | |
| // ---- BARRIER 1 ---- | |
| // Fence: make the above clear-bitsets + scratch writes globally | |
| // visible before peer blocks observe "barrier arrived". | |
| __threadfence(); | |
| fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync); | |
| // ========================================================= | |
| // T9: TMA MULTICAST INPUT STAGING | |
| // | |
| // Issue a single cluster-scope async DMA to broadcast this | |
| // timestep's input slice into s_input_tile across all 16 SMs | |
| // in the cluster simultaneously. On Hopper sm_90a, | |
| // cg::memcpy_async with cluster scope maps to the TMA | |
| // hardware unit (cp.async.bulk.tensor multicast), reducing | |
| // DRAM input traffic by ~16× vs each block fetching its own | |
| // copy from GMEM. | |
| // | |
| // The staging is gated on cfg.input_bits <= INPUT_BITS_MAX. | |
| // If the tile is too small (custom large input_bits), we fall | |
| // back to per-thread GMEM reads in Stage A (identical to the | |
| // original path; use_input_tile==false). | |
| // | |
| // Ordering: BARRIER 1 completes before we issue the DMA. | |
| // The DMA completes before Stage A reads s_input_tile. | |
| // ========================================================= | |
| const bool use_input_tile = (cfg.input_bits <= INPUT_BITS_MAX); | |
| if (use_input_tile) { | |
| auto tb = cg::this_thread_block(); | |
| cg::memcpy_async(tb, s_input_tile, | |
| inputs + inp_off, | |
| cfg.input_bits); | |
| cg::wait(tb); | |
| cluster.sync(); | |
| } | |
| const bool use_input_tile = false; | |
| // ========================================================= | |
| // STAGE A: Spatial Pooler | |
| // | |
| // Hot per-column state (boost, inhibition_threshold, | |
| // active_duty) is served from cluster DSMEM rather than | |
| // GMEM for each of the T timesteps. GMEM is written on | |
| // update so state persists across forward calls. | |
| // ========================================================= | |
| for (unsigned int c = col_lo; c < col_hi; c++) { | |
| unsigned int base = c * S; | |
| unsigned int local = 0u; | |
| for (unsigned int s = lane; s < S; s += 32u) { | |
| unsigned int b = syn_bit[base + s]; | |
| float p = syn_perm[base + s]; | |
| // T9: read from cluster-broadcast tile when available; | |
| // fall back to direct GMEM when input_bits > INPUT_BITS_MAX. | |
| unsigned int inp_byte = use_input_tile | |
| ? (unsigned int)s_input_tile[b] | |
| : (unsigned int)inputs[inp_off + b]; | |
| unsigned int inp_byte = (unsigned int)inputs[inp_off + b]; | |
| unsigned int hit = ((inp_byte != 0u) && (p >= cfg.conn_thr)) ? 1u : 0u; | |
| local += hit; | |
| } | |
| unsigned int overlap = warp_sum_u32(local); | |
| overlap = __shfl_sync(0xffffffffu, overlap, 0); | |
| // Read boost + threshold for column c. | |
| // Hopper: read from cluster-distributed shared memory. | |
| const unsigned int owner_block = c / cols_per_block; | |
| const unsigned int owner_offset = c - owner_block * cols_per_block; | |
| float boost_val = cluster.map_shared_rank(s_boost, owner_block)[owner_offset]; | |
| float thr = cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset]; | |
| // Pre-Hopper: read directly from global memory. | |
| float boost_val = boost[c]; | |
| float thr = inhibition_threshold[c]; | |
| float boosted = (float)overlap * boost_val; | |
| unsigned int is_active = (boosted > thr) ? 1u : 0u; | |
| if (lane == 0) { | |
| cols_out[col_base_out + c] = (unsigned char)is_active; | |
| if (is_active) { | |
| atomicAdd(&step_scratch[0], 1u); | |
| } | |
| } | |
| // SP learn (Hebbian) on active columns. | |
| // T9: use tile for input reads here too. | |
| if (cfg.learn && is_active) { | |
| for (unsigned int s = lane; s < S; s += 32u) { | |
| unsigned int b = syn_bit[base + s]; | |
| float p = syn_perm[base + s]; | |
| unsigned int inp_byte = use_input_tile | |
| ? (unsigned int)s_input_tile[b] | |
| : (unsigned int)inputs[inp_off + b]; | |
| unsigned int inp_byte = (unsigned int)inputs[inp_off + b]; | |
| if (inp_byte != 0u) { | |
| p += cfg.sp_inc; | |
| if (p > 1.0f) p = 1.0f; | |
| } else { | |
| p -= cfg.sp_dec; | |
| if (p < 0.0f) p = 0.0f; | |
| } | |
| syn_perm[base + s] = p; | |
| } | |
| } | |
| // active_duty EMA + threshold adaptation. | |
| // Writes go to both DSMEM (hot path, Hopper only) and GMEM (persistence). | |
| if (lane == 0) { | |
| float ad = cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset]; | |
| float ad = active_duty[c]; | |
| float sample = is_active ? 1.0f : 0.0f; | |
| ad = (1.0f - cfg.duty_alpha) * ad + cfg.duty_alpha * sample; | |
| // Writeback: peer smem (for next timestep read) + GMEM (persistence). | |
| cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad; | |
| active_duty[c] = ad; | |
| // Threshold steers toward target sparsity. | |
| float err = ad - cfg.sparsity_target; | |
| float new_thr = thr + cfg.thr_adapt_rate * err * 100.0f; | |
| if (new_thr < 0.1f) new_thr = 0.1f; | |
| if (new_thr > 1000.0f) new_thr = 1000.0f; | |
| // Writeback: peer smem (for next timestep read) + GMEM (persistence). | |
| cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] = new_thr; | |
| inhibition_threshold[c] = new_thr; | |
| } | |
| } | |
| // ---- DSMEM WRITEBACK SYNC: peer-smem writes must be visible cluster-wide ---- | |
| // | |
| // On Hopper: cluster.sync() ensures all peer smem writes from this | |
| // timestep are visible to all blocks before Stage B / next t. | |
| // On pre-Hopper: no smem peer writes occur (all state in GMEM), | |
| // so no extra sync needed here — the grid barrier below suffices. | |
| cluster.sync(); | |
| // ---- BARRIER 2: SP active_mask must be visible before TM reads ---- | |
| // Fence: flush cols_out + active_duty + inhibition_threshold + step_scratch | |
| // writes to global memory before peers advance past this barrier. | |
| __threadfence(); | |
| fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync); | |
| // ========================================================= | |
| // STAGE B: Temporal Memory | |
| // ========================================================= | |
| for (unsigned int c = col_lo; c < col_hi; c++) { | |
| unsigned int col_active = cols_out[col_base_out + c]; | |
| if (col_active == 0u) continue; | |
| unsigned int base_cell = c * cpc; | |
| unsigned int any_predicted = 0u; | |
| unsigned int best_seg_id_for_grow = 0xFFFFFFFFu; | |
| unsigned int best_pot_count = 0u; | |
| for (unsigned int k = 0u; k < cpc; k++) { | |
| unsigned int cell = base_cell + k; | |
| unsigned int n_segs_here = cell_seg_count[cell]; | |
| if (n_segs_here > MSC) n_segs_here = MSC; | |
| if (n_segs_here == 0u) continue; | |
| unsigned int seg_base_id = cell * MSC; | |
| unsigned int cell_is_predictive = 0u; | |
| for (unsigned int ls = 0u; ls < n_segs_here; ls++) { | |
| unsigned int seg = seg_base_id + ls; | |
| unsigned int n_syn = seg_syn_count[seg]; | |
| if (n_syn == 0u) continue; | |
| unsigned int syn_base = seg * SPS; | |
| unsigned int l_conn = 0u; | |
| unsigned int l_pot = 0u; | |
| for (unsigned int s = lane; s < n_syn; s += 32u) { | |
| unsigned int presyn = syn_presyn[syn_base + s]; | |
| unsigned int w = prev_active[presyn >> 5]; | |
| unsigned int bit = (w >> (presyn & 31u)) & 1u; | |
| if (bit) { | |
| l_pot += 1u; | |
| int p = (int)tm_syn_perm[syn_base + s]; | |
| if (p >= cfg.conn_thr_i16) l_conn += 1u; | |
| } | |
| } | |
| unsigned int tot_conn = warp_sum_u32(l_conn); | |
| unsigned int tot_pot = warp_sum_u32(l_pot); | |
| tot_conn = __shfl_sync(0xffffffffu, tot_conn, 0); | |
| tot_pot = __shfl_sync(0xffffffffu, tot_pot, 0); | |
| if (tot_conn >= cfg.activation_threshold) cell_is_predictive = 1u; | |
| if (tot_pot >= cfg.learning_threshold && tot_pot > best_pot_count) { | |
| best_pot_count = tot_pot; | |
| best_seg_id_for_grow = seg; | |
| } | |
| // Reinforce predicted-and-correct segment. | |
| if (cfg.learn && tot_conn >= cfg.activation_threshold) { | |
| for (unsigned int s = lane; s < n_syn; s += 32u) { | |
| unsigned int presyn = syn_presyn[syn_base + s]; | |
| unsigned int w = prev_active[presyn >> 5]; | |
| unsigned int bit = (w >> (presyn & 31u)) & 1u; | |
| int p = (int)tm_syn_perm[syn_base + s]; | |
| if (bit) { | |
| int np = p + cfg.perm_inc_i16; | |
| if (np > 32767) np = 32767; | |
| tm_syn_perm[syn_base + s] = (short)np; | |
| } else { | |
| int np = p - cfg.perm_dec_i16; | |
| if (np < 0) np = 0; | |
| tm_syn_perm[syn_base + s] = (short)np; | |
| } | |
| } | |
| } | |
| } | |
| if (cell_is_predictive) { | |
| any_predicted = 1u; | |
| if (lane == 0) { | |
| unsigned int w = cell >> 5; | |
| unsigned int m = 1u << (cell & 31u); | |
| atomicOr(&curr_active[w], m); | |
| atomicOr(&curr_winner[w], m); | |
| } | |
| } | |
| } | |
| // BURST if no predicted. | |
| if (!any_predicted) { | |
| if (lane == 0) { | |
| for (unsigned int k = 0u; k < cpc; k++) { | |
| unsigned int cell = base_cell + k; | |
| unsigned int w = cell >> 5; | |
| unsigned int m = 1u << (cell & 31u); | |
| atomicOr(&curr_active[w], m); | |
| } | |
| unsigned int win = base_cell; | |
| unsigned int ww = win >> 5; | |
| unsigned int wm = 1u << (win & 31u); | |
| atomicOr(&curr_winner[ww], wm); | |
| atomicAdd(&step_scratch[1], 1u); | |
| } | |
| if (cfg.learn) { | |
| unsigned int target_seg; | |
| unsigned int existing_syn; | |
| if (best_seg_id_for_grow != 0xFFFFFFFFu) { | |
| // Reuse best matching segment. | |
| target_seg = best_seg_id_for_grow; | |
| existing_syn = seg_syn_count[target_seg]; | |
| target_seg = __shfl_sync(0xffffffffu, target_seg, 0); | |
| existing_syn = __shfl_sync(0xffffffffu, existing_syn, 0); | |
| // Reinforce its existing synapses. | |
| unsigned int syn_base = target_seg * SPS; | |
| for (unsigned int s = lane; s < existing_syn; s += 32u) { | |
| unsigned int presyn = syn_presyn[syn_base + s]; | |
| unsigned int w = prev_active[presyn >> 5]; | |
| unsigned int bit = (w >> (presyn & 31u)) & 1u; | |
| int p = (int)tm_syn_perm[syn_base + s]; | |
| if (bit) { | |
| int np = p + cfg.perm_inc_i16; | |
| if (np > 32767) np = 32767; | |
| tm_syn_perm[syn_base + s] = (short)np; | |
| } else { | |
| int np = p - cfg.perm_dec_i16; | |
| if (np < 0) np = 0; | |
| tm_syn_perm[syn_base + s] = (short)np; | |
| } | |
| } | |
| } else { | |
| // Allocate new segment on winner cell (cell 0 of col). | |
| unsigned int new_seg = 0u; | |
| if (lane == 0) { | |
| unsigned int winner_cell = base_cell; | |
| unsigned int slot = atomicAdd(&cell_seg_count[winner_cell], 1u); | |
| if (slot >= MSC) slot = slot % MSC; | |
| new_seg = winner_cell * MSC + slot; | |
| seg_cell_id[new_seg] = winner_cell; | |
| seg_syn_count[new_seg] = 0u; | |
| } | |
| target_seg = __shfl_sync(0xffffffffu, new_seg, 0); | |
| existing_syn = 0u; | |
| } | |
| // Grow synapses to prev_winner cells — lane 0 serialized. | |
| unsigned int room = (SPS > existing_syn) ? (SPS - existing_syn) : 0u; | |
| unsigned int max_grow = (cfg.max_new_synapses < room) ? cfg.max_new_synapses : room; | |
| if (lane == 0 && max_grow > 0u) { | |
| unsigned int syn_base = target_seg * SPS; | |
| unsigned int grown = 0u; | |
| unsigned int start_off = (c * 2654435761u + cfg.iter_seed + t) % cfg.bits_words; | |
| for (unsigned int w_off = 0u; | |
| w_off < cfg.bits_words && grown < max_grow; | |
| w_off++) { | |
| unsigned int widx = (start_off + w_off) % cfg.bits_words; | |
| unsigned int word = prev_winner[widx]; | |
| while (word != 0u && grown < max_grow) { | |
| unsigned int bit_pos = __ffs(word) - 1u; | |
| word &= ~(1u << bit_pos); | |
| unsigned int cell_id = widx * 32u + bit_pos; | |
| if (cell_id >= cfg.n_cells) continue; | |
| bool exists = false; | |
| for (unsigned int es = 0u; es < existing_syn + grown; es++) { | |
| if (syn_presyn[syn_base + es] == cell_id) { exists = true; break; } | |
| } | |
| if (exists) continue; | |
| unsigned int write_idx = existing_syn + grown; | |
| if (write_idx >= SPS) break; | |
| syn_presyn[syn_base + write_idx] = cell_id; | |
| tm_syn_perm[syn_base + write_idx] = (short)cfg.initial_perm_i16; | |
| grown++; | |
| } | |
| } | |
| if (grown > 0u) { | |
| seg_syn_count[target_seg] = existing_syn + grown; | |
| } | |
| } | |
| } | |
| } | |
| } | |
| // ---- BARRIER 3: TM writes complete before anomaly + next-step read ---- | |
| // Fence: flush curr_active/curr_winner bitsets + tm_syn_perm + | |
| // seg_syn_count + syn_presyn before peers advance and consume them as | |
| // prev_active/prev_winner at t+1. | |
| __threadfence(); | |
| fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync); | |
| // Write anomaly for step t. | |
| if (blockIdx.x == 0u && tid == 0u) { | |
| unsigned int total = step_scratch[0]; | |
| unsigned int bad = step_scratch[1]; | |
| float anom = (total > 0u) ? ((float)bad / (float)total) : 0.0f; | |
| anom_out[t] = anom; | |
| } | |
| } | |
| } | |
| // Single-region kernel (legacy call site). | |
| __global__ __launch_bounds__(256, 2) | |
| void htm_fused_step(FusedPtrs P, FusedConfig cfg) { | |
| htm_fused_step_body(P, cfg); | |
| } | |
| // Batched kernel: one cooperative launch for B regions. grid.y = B, | |
| // grid.x = per-region block count. Each block reads its region's | |
| // FusedPtrs from the device array via blockIdx.y. | |
| __global__ __launch_bounds__(256, 2) | |
| void htm_fused_step_batched(const FusedPtrs* __restrict__ P_arr, FusedConfig cfg) { | |
| const FusedPtrs P = P_arr[blockIdx.y]; | |
| htm_fused_step_body(P, cfg); | |
| } | |
| } // extern "C" | |