Spaces:

Jackoatmon
/

feather-h200-runtime

Runtime error

App Files Files Community

Jackoatmon committed on Apr 22

Commit

5e5bc2d

verified ·

1 Parent(s): 4a1d6e7

Update Feather H200 training runtime image

Browse files

Files changed (26) hide show

Dockerfile +116 -0
entrypoint.py +227 -0
mamba_ssm_init.py +69 -0
overlay/htm_rust/src/gpu/fused.rs +650 -0
overlay/htm_rust/src/gpu/kernels/htm_fused_step.cu +677 -0
overlay/htm_rust/src/gpu/kernels/sp_boost_fused.cu +59 -0
overlay/htm_rust/src/gpu/kernels/sp_duty.cu +45 -0
overlay/htm_rust/src/gpu/kernels/sp_learn.cu +45 -0
overlay/htm_rust/src/gpu/kernels/sp_overlap.cu +78 -0
overlay/htm_rust/src/gpu/kernels/sp_topk.cu +117 -0
overlay/htm_rust/src/gpu/kernels/tm_activate.cu +66 -0
overlay/htm_rust/src/gpu/kernels/tm_anomaly.cu +43 -0
overlay/htm_rust/src/gpu/kernels/tm_grow.cu +155 -0
overlay/htm_rust/src/gpu/kernels/tm_learn.cu +75 -0
overlay/htm_rust/src/gpu/kernels/tm_predict.cu +102 -0
overlay/htm_rust/src/gpu/kernels/tm_punish.cu +64 -0
overlay/htm_rust/src/gpu/kernels/tm_reset.cu +36 -0
overlay/htm_rust/src/gpu/mod.rs +549 -0
overlay/htm_rust/src/gpu/sp_gpu.rs +796 -0
overlay/htm_rust/src/gpu/tests.rs +643 -0
overlay/htm_rust/src/gpu/tm_gpu.rs +460 -0
overlay/hydra/eval.py +210 -0
overlay/hydra/model.py +659 -0
overlay/hydra/optimizer.py +252 -0
overlay/subsystems/htm.py +429 -0
overlay/subsystems/sdr_retina.py +632 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,116 @@

+FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel
+# Make every RUN fail when any stage of a pipeline fails (e.g. `curl | bash`).
+# The default `/bin/sh -c` only reports the exit status of the last command,
+# which would let a failed download go unnoticed.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+ENV DEBIAN_FRONTEND=noninteractive \
+    PIP_NO_CACHE_DIR=1 \
+    PYTHONUNBUFFERED=1 \
+    CARGO_HOME=/root/.cargo \
+    RUSTUP_HOME=/root/.rustup \
+    PATH=/root/.cargo/bin:${PATH}
+# OS build prerequisites; the apt lists are removed in the same layer so they
+# do not end up in the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git curl ca-certificates build-essential pkg-config libssl-dev && \
+    rm -rf /var/lib/apt/lists/*
+# Rust toolchain (used by the maturin build of htm_rust further down).
+# `--proto '=https' --tlsv1.2` refuses any downgrade to plain HTTP / old TLS.
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
+    bash -s -- -y --profile minimal --default-toolchain stable
+# Python build/runtime tooling. Packages are listed alphabetically, one per
+# line, so additions and removals show up as single-line diffs.
+# PIP_NO_CACHE_DIR=1 (set in ENV above) already disables the pip cache.
+RUN pip install --upgrade pip setuptools wheel && \
+    pip install \
+      datasets \
+      einops \
+      huggingface_hub \
+      maturin \
+      ninja \
+      packaging \
+      pandas \
+      pyarrow \
+      pydantic \
+      requests \
+      rustbpe \
+      tiktoken
+# Mamba-3 fused CUDA kernel stack (mandatory — NO fallback allowed).
+#
+# We install PRE-BUILT linux_x86_64 wheels from the official state-spaces/mamba
+# and Dao-AILab/causal-conv1d GitHub releases. Compiling mamba_ssm from source
+# on HF Spaces' cpu-basic builder (~16GB RAM) OOMKills even with MAX_JOBS=1 —
+# nvcc on the templated selective-scan/chunk-scan kernels needs 8–12GB per TU.
+#
+# Wheel selection for base image pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel:
+#   - Python 3.11 (cp311)                       — matches PyTorch 2.6.0 image
+#   - CUDA 12.x wheels (cu12)                   — matches the base image's CUDA 12.4
+#   - PyTorch 2.6 ABI (torch2.6)                — exact torch match
+#   - cxx11abiFALSE                             — standard PyTorch pip build
+#
+# Versions: mamba_ssm 2.3.1 (latest release; it does NOT ship the Mamba3
+# class, which Step B below grafts from main) + causal_conv1d release tag
+# v1.6.1.post4 (wheel version 1.6.1, matching ABI). Both are CUDA-compiled,
+# so no build toolchain is needed on the Space builder for them.
+#
+# NOTE(review): the __init__.py override installed later in this file reports
+# that selective_scan_cuda.so from this wheel fails to load on this base image
+# (undefined libtorch symbol). The "exact torch match" claim above should be
+# re-verified against that observation.
+#
+# Step A: install the published v2.3.1 prebuilt wheel (compiled CUDA ops
+# for selective_scan, layernorm_gated, ssd_*, causal_conv1d, etc).
+# The trailing `python -c` only reads package metadata (no import of the
+# packages themselves), so it works on a GPU-less builder.
+RUN pip install \
+      'https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.6.1.post4/causal_conv1d-1.6.1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl' \
+      'https://github.com/state-spaces/mamba/releases/download/v2.3.1/mamba_ssm-2.3.1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl' && \
+    python -c "import importlib.metadata as m; print('installed mamba_ssm=' + m.version('mamba_ssm') + ' causal_conv1d=' + m.version('causal_conv1d'))"
+#
+# Step B: graft the Mamba3 class + its pure-Triton ops subtree from mamba-ssm
+# main. v2.3.1 is the latest release but Mamba3 landed post-release; the new
+# files under ops/triton/mamba3/ are ALL pure Python @triton.jit kernels with
+# zero compiled-CUDA dependencies (verified: every import in that subtree is
+# triton/torch/python — no .so files, no nvcc). So we install the v2.3.1 wheel
+# (for its compiled ops) and overlay the main-branch Mamba3 sources on top.
+#
+# This avoids the source-build OOM on the cpu-basic HF Space builder and the
+# missing-file error the smoke test hit on the last attempt.
+#
+# MAMBA_GRAFT_REF selects the upstream git ref to graft from. It defaults to
+# `main` (the previous hard-coded behaviour); pass
+# `--build-arg MAMBA_GRAFT_REF=<commit-sha>` to make the graft reproducible.
+ARG MAMBA_GRAFT_REF=main
+# Download grafted mamba3 module + triton ops subtree.
+# Each curl inside the loop is followed by `|| exit 1`: a `for` loop only
+# reports the status of its LAST command, so without it a failed download of
+# any earlier file would be silently ignored.
+RUN SITE=/opt/conda/lib/python3.11/site-packages/mamba_ssm && \
+    BASE="https://raw.githubusercontent.com/state-spaces/mamba/${MAMBA_GRAFT_REF}" && \
+    curl -fsSL "$BASE/mamba_ssm/modules/mamba3.py" -o "$SITE/modules/mamba3.py" && \
+    mkdir -p "$SITE/ops/triton/mamba3" && \
+    for f in __init__.py angle_dt.py mamba3_mimo_rotary_step.py mamba3_mimo_utils.py mamba3_siso_bwd.py mamba3_siso_combined.py mamba3_siso_fwd.py mamba3_siso_step.py utils.py; do \
+        curl -fsSL "$BASE/mamba_ssm/ops/triton/mamba3/$f" -o "$SITE/ops/triton/mamba3/$f" || exit 1; \
+    done
+# Replace mamba_ssm/__init__.py with a minimal one that only imports Mamba3
+# (pure-Triton, works). The shipped __init__.py eagerly imports
+# selective_scan_cuda.so which has a libtorch C++ ABI mismatch on this base
+# image ("undefined symbol: _ZN3c107WarningC1E..."). Since training only needs
+# Mamba3 (grafted from main), we skip all compiled-CUDA imports.
+COPY mamba_ssm_init.py /opt/conda/lib/python3.11/site-packages/mamba_ssm/__init__.py
+# Structural check (no triton init — triton has no GPU on the builder).
+# Verifies the override, the Mamba3 module and EVERY file of the grafted
+# ops/triton/mamba3 subtree, so a partially downloaded graft fails the build
+# here instead of failing at import time on the GPU host.
+RUN SITE=/opt/conda/lib/python3.11/site-packages/mamba_ssm && \
+    test -f "$SITE/modules/mamba3.py" && \
+    test -s "$SITE/__init__.py" && \
+    for f in __init__.py angle_dt.py mamba3_mimo_rotary_step.py mamba3_mimo_utils.py mamba3_siso_bwd.py mamba3_siso_combined.py mamba3_siso_fwd.py mamba3_siso_step.py utils.py; do \
+        test -f "$SITE/ops/triton/mamba3/$f" || { echo "missing grafted file: ops/triton/mamba3/$f" >&2; exit 1; }; \
+    done && \
+    echo "mamba3 graft + __init__ override verified"
+# Optional tilelang for MIMO path — pure-python, cheap; SISO Mamba3 works without.
+# The `|| echo` turns an install failure into a log line so the build continues.
+RUN pip install tilelang || echo "[dockerfile] tilelang optional install failed — continuing"
+# Triton version decision: FORCE 3.5.1 — the only version with both mamba3
+# APIs (set_allocator + tl.make_tensor_descriptor). torch 2.6's _inductor
+# imports AttrsDescriptor from triton.compiler.compiler which was removed in
+# triton 3.4+, but mamba_ssm/__init__.py shims AttrsDescriptor as a stub
+# before any torch._inductor import path runs, so the incompatibility is
+# neutralized. Build-time assert verifies mamba3's two required APIs.
+# `--force-reinstall --no-deps` replaces whatever triton is already installed
+# without letting pip re-resolve dependencies of other packages.
+RUN pip install --force-reinstall --no-deps 'triton==3.5.1' && \
+    python -c "import triton; from triton import language as tl; \
+               assert hasattr(triton, 'set_allocator'), 'missing triton.set_allocator'; \
+               assert hasattr(tl, 'make_tensor_descriptor'), 'missing tl.make_tensor_descriptor'; \
+               print(f'triton={triton.__version__} set_allocator+make_tensor_descriptor OK, AttrsDescriptor shimmed in mamba_ssm/__init__.py')"
+# Copy the overlay sources to /workspace/feather and the launcher to /app.
+WORKDIR /workspace
+COPY overlay /workspace/feather
+COPY entrypoint.py /app/entrypoint.py
+WORKDIR /workspace/feather
+# Build-time sanity checks: byte-compile the Python entry points and
+# parse-check (`bash -n`, no execution) the launch script. All four paths
+# must exist under /workspace/feather or this step fails the build.
+RUN python -m py_compile hydra/training.py prepare.py train.py && \
+    bash -n scripts/run_domain_expanded_pretrain.sh
+# Build the htm_rust wheel with the `gpu` feature and install it.
+# HTM_CUDA_ARCH=sm_90 is presumably read by the crate's build script to pick
+# the CUDA target architecture — confirm against htm_rust/build.rs.
+RUN export LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} && \
+    export HTM_CUDA_ARCH=sm_90 && \
+    maturin build --release --features gpu --manifest-path htm_rust/Cargo.toml && \
+    pip install htm_rust/target/wheels/htm_rust-*.whl
+# Exec-form CMD: the Python launcher runs as PID 1 and receives signals directly.
+CMD ["python", "/app/entrypoint.py"]

entrypoint.py ADDED Viewed

	@@ -0,0 +1,227 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import json
+import os
+import subprocess
+import sys
+import time
+from http.server import BaseHTTPRequestHandler, HTTPServer
+from pathlib import Path
+from threading import Thread
+# =============================================================================
+# EARLY CUDA FABRIC MANAGER KICK (before ANY CUDA-touching imports)
+# =============================================================================
+# On H200 hosts, cudaGetDeviceCount can return Error 802 "system not yet
+# initialized" on first use, because nvidia-fabricmanager on the host
+# synchronizes with the container's first driver call. Once any NVML/CUDA
+# call succeeds once (even just nvidia-smi), the fabric is up for the rest
+# of the container lifetime.
+#
+# Our previous approach (wait in a subprocess before training) didn't work
+# because the "initialization failed" state persisted across calls in the
+# same container. The real fix: kick the driver exactly once with
+# nvidia-smi, which is what successfully-working baseline containers do
+# implicitly via their first torch.cuda call.
+#
+# Must happen BEFORE `import torch` (because any import that eagerly calls
+# cudaGetDeviceCount will cache the Error 802 state).
+def _early_cuda_kick() -> None:
+    deadline = time.time() + 120.0
+    attempt = 0
+    while time.time() < deadline:
+        attempt += 1
+        r = subprocess.run(['nvidia-smi'], capture_output=True, text=True, timeout=30)
+        if r.returncode == 0 and 'H200' in (r.stdout or '') or 'H100' in (r.stdout or '') \
+                or 'A100' in (r.stdout or '') or r.returncode == 0:
+            print(f'[boot] nvidia-smi OK on attempt {attempt}', flush=True)
+            break
+        print(f'[boot] nvidia-smi attempt {attempt} rc={r.returncode} stderr={(r.stderr or "")[:120]}',
+              flush=True)
+        time.sleep(2)
+    # After nvidia-smi, probe torch in a subprocess so any latent error state
+    # doesn't leak into the main process's CUDA context.
+    probe = 'import torch; import sys; sys.exit(0 if torch.cuda.is_available() else 1)'
+    torch_deadline = time.time() + 120.0
+    t_attempt = 0
+    while time.time() < torch_deadline:
+        t_attempt += 1
+        r = subprocess.run([sys.executable, '-c', probe], capture_output=True, text=True, timeout=60)
+        if r.returncode == 0:
+            print(f'[boot] torch.cuda.is_available() = True after {t_attempt} probe(s)', flush=True)
+            return
+        if t_attempt == 1:
+            print(f'[boot] torch cuda probe {t_attempt}: {(r.stderr or "")[:200]}', flush=True)
+        time.sleep(2)
+    print('[boot] WARNING: torch.cuda never became ready — training will likely fail', flush=True)
+_early_cuda_kick()
+# Hydrate triton compilation cache from HF Hub before any triton/mamba_ssm import.
+# triton_cache_setup.py is copied next to this file by the job bash command.
+try:
+    import triton_cache_setup as _tcs
+    _tcs.setup()
+except ImportError:
+    print('[boot] triton_cache_setup not found; skipping cache hydrate', flush=True)
+from huggingface_hub import HfApi  # noqa: E402  (import after cuda kick)
+# Directory the job changes into before launching the training script.
+REPO_ROOT = Path('/workspace/feather')
+# Location of the checkpoints (latest.pt / pretrain_final.pt) that get uploaded.
+CACHE_ROOT = Path.home() / '.cache' / 'autoresearch'
+# Training log uploaded alongside the checkpoints.
+LOG_FILE = REPO_ROOT / 'run_domain_expanded.log'
+# Used as the `jobs/<JOB_ID>/` upload prefix and echoed by the health endpoint.
+JOB_ID = os.environ.get('JOB_ID', 'local-job')
+# Hub model repo that receives the uploaded artifacts.
+OUTPUT_REPO = os.environ.get('HF_REPO_ID', 'icarus112/feather-pretrain-checkpoints')
+# Hub token; when unset, artifact upload is skipped.
+TOKEN = os.environ.get('HF_TOKEN')
+# 'job' runs the training pipeline; any other value serves the health endpoint only.
+RUNTIME_MODE = os.environ.get('FEATHER_RUNTIME_MODE', 'space')
+# TCP port the health server binds to.
+APP_PORT = int(os.environ.get('PORT', '7860'))
+class _HealthHandler(BaseHTTPRequestHandler):
+    def do_GET(self):
+        if self.path in ('/', '/health', '/healthz', '/ready'):
+            payload = {
+                'status': 'ok',
+                'mode': RUNTIME_MODE,
+                'job_id': JOB_ID,
+            }
+            body = json.dumps(payload).encode('utf-8')
+            self.send_response(200)
+            self.send_header('Content-Type', 'application/json')
+            self.send_header('Content-Length', str(len(body)))
+            self.end_headers()
+            self.wfile.write(body)
+            return
+        self.send_response(404)
+        self.end_headers()
+    def log_message(self, format, *args):
+        return
+def _start_health_server() -> HTTPServer:
+    server = HTTPServer(('0.0.0.0', APP_PORT), _HealthHandler)
+    thread = Thread(target=server.serve_forever, daemon=True)
+    thread.start()
+    print(f'[space] health server listening on 0.0.0.0:{APP_PORT}', flush=True)
+    return server
+def upload_artifact(api: HfApi, path: Path, dest: str) -> None:
+    if not path.exists():
+        print(f'[upload] skip missing {path}', flush=True)
+        return
+    api.upload_file(
+        path_or_fileobj=str(path),
+        path_in_repo=dest,
+        repo_id=OUTPUT_REPO,
+        repo_type='model',
+    )
+    print(f'[upload] uploaded {path} -> {OUTPUT_REPO}/{dest}', flush=True)
+def _wait_for_cuda_ready(timeout_s: int = 120) -> None:
+    """Block until CUDA is fully initialized or timeout.
+    On H200 hosts with NVSwitch/fabric manager, nvidia driver setup can race
+    with container start. cudaGetDeviceCount can return CUDA_ERROR_SYSTEM_NOT_READY
+    (error 802) for the first few seconds, and any import that triggers
+    @triton.autotune (e.g. mamba_ssm, torch amp utilities) blows up with
+    "0 active drivers" if it happens during that window.
+    We pre-init CUDA in a throwaway Python subprocess (so any error state does
+    not leak into the main training process) and retry until torch.cuda
+    reports ready.
+    """
+    import time as _t
+    probe = (
+        "import torch; "
+        "import sys; "
+        "avail = torch.cuda.is_available(); "
+        "count = torch.cuda.device_count() if avail else 0; "
+        "sys.exit(0 if (avail and count > 0) else 1)"
+    )
+    deadline = _t.time() + timeout_s
+    attempt = 0
+    while _t.time() < deadline:
+        attempt += 1
+        r = subprocess.run(['python', '-c', probe], capture_output=True, text=True)
+        if r.returncode == 0:
+            print(f'[job] CUDA ready after {attempt} probe(s)', flush=True)
+            return
+        if attempt == 1:
+            print(f'[job] CUDA not ready yet (will retry up to {timeout_s}s): {r.stderr.strip()[:200]}', flush=True)
+        _t.sleep(2)
+    print(f'[job] CUDA still not ready after {timeout_s}s — continuing anyway (training will likely fail)', flush=True)
+def run_job_mode() -> int:
+    os.chdir(REPO_ROOT)
+    os.environ.setdefault('HYDRA_TIME_BUDGET', '43200')
+    os.environ.setdefault('HYDRA_TARGET_SHARDS', '2048')
+    os.environ.setdefault('HYDRA_DOWNLOAD_WORKERS', '16')
+    os.environ.setdefault('HYDRA_CKPT_INTERVAL', '1000')
+    os.environ.setdefault('HYDRA_RESUME_CKPT', str(CACHE_ROOT / 'latest.pt'))
+    # CUDA readiness was kicked at module import via _early_cuda_kick. Keep
+    # the wait as a second safety net — no-op if CUDA already ready.
+    _wait_for_cuda_ready()
+    cmd = [
+        'bash',
+        './scripts/run_domain_expanded_pretrain.sh',
+        '--target-shards', os.environ['HYDRA_TARGET_SHARDS'],
+        '--download-workers', os.environ['HYDRA_DOWNLOAD_WORKERS'],
+    ]
+    print('[job] starting Feather domain-expanded pretrain', flush=True)
+    print(f'[job] command={cmd}', flush=True)
+    proc = subprocess.run(cmd, check=False)
+    # Push triton compilation cache back to HF Hub for next run.
+    try:
+        import triton_cache_setup as _tcs
+        _tcs.teardown()
+    except Exception as _tcs_err:
+        print(f'[triton_cache] teardown error (non-fatal): {_tcs_err}', flush=True)
+    if TOKEN:
+        api = HfApi(token=TOKEN)
+        try:
+            api.create_repo(repo_id=OUTPUT_REPO, repo_type='model', private=True, exist_ok=True)
+        except Exception as e:
+            print(f'[upload] create_repo warning: {type(e).__name__}: {e}', flush=True)
+        prefix = f'jobs/{JOB_ID}'
+        try:
+            upload_artifact(api, LOG_FILE, f'{prefix}/run_domain_expanded.log')
+            upload_artifact(api, CACHE_ROOT / 'latest.pt', f'{prefix}/latest.pt')
+            upload_artifact(api, CACHE_ROOT / 'pretrain_final.pt', f'{prefix}/pretrain_final.pt')
+        except Exception as e:
+            print(f'[upload] upload warning: {type(e).__name__}: {e}', flush=True)
+    else:
+        print('[upload] HF_TOKEN not set; skipping artifact upload', flush=True)
+    return proc.returncode
+def run_space_mode() -> int:
+    server = _start_health_server()
+    print('[space] Feather runtime image ready', flush=True)
+    try:
+        while True:
+            time.sleep(3600)
+    finally:
+        server.shutdown()
+        server.server_close()
+def main() -> int:
+    if RUNTIME_MODE == 'job':
+        return run_job_mode()
+    return run_space_mode()
+if __name__ == '__main__':
+    raise SystemExit(main())

mamba_ssm_init.py ADDED Viewed

	@@ -0,0 +1,69 @@

+# mamba_ssm package init — minimal override to avoid broken selective_scan_cuda.so
+# ABI mismatch with the base image's libtorch.
+#
+# The upstream __init__.py eagerly imports selective_scan_cuda which fails on
+# pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel (undefined c10::Warning ctor
+# symbol). We only need Mamba3 (grafted from main, pure-Triton), so we skip
+# all compiled-CUDA imports here and let Mamba3 load directly.
+__version__ = "2.3.1+feather-graft"
+# selective_scan_fn / mamba_inner_fn are shimmed to None — they are NOT used
+# by the Feather training path (which is Mamba3-only). Importing them from
+# this package succeeds; any attempt to CALL them raises
+# "TypeError: 'NoneType' object is not callable" instead of an obscure ImportError.
+selective_scan_fn = None
+mamba_inner_fn = None
+# --- triton API compatibility shims -----------------------------------------
+# Version matrix is hostile: torch 2.6 pins triton==3.2.0 because torch._inductor
+# imports AttrsDescriptor from triton.compiler.compiler — removed in triton 3.4+.
+# Grafted Mamba3 (from mamba-ssm main) needs triton.set_allocator and
+# tl.make_tensor_descriptor, both added in triton 3.3+. No single triton version
+# satisfies both simultaneously. We run on triton 3.5.1 (latest, has both mamba3
+# APIs) and shim AttrsDescriptor as a stub dataclass for torch._inductor. The
+# stub is never actually invoked at runtime because the codebase does not use
+# torch.compile — but importing torch._inductor.* still requires the symbol to
+# exist at module load time.
+import triton as _triton  # noqa: E402
+# Fallback only: if the installed triton has no set_allocator, install a no-op
+# that accepts the allocator callback and discards it.
+if not hasattr(_triton, "set_allocator"):
+    def _noop_set_allocator(_fn):  # pragma: no cover
+        return None
+    _triton.set_allocator = _noop_set_allocator
+import triton.compiler.compiler as _tcc  # noqa: E402
+# Patch the symbol onto triton.compiler.compiler itself so that a later
+# `from triton.compiler.compiler import AttrsDescriptor` resolves.
+if not hasattr(_tcc, "AttrsDescriptor"):
+    class _AttrsDescriptorShim:
+        """Stub for torch._inductor compatibility on triton >= 3.4.
+        torch._inductor.runtime.hints imports this at module load but the
+        constructor is only called inside torch.compile paths. Accept any
+        args/kwargs so the import itself succeeds."""
+        def __init__(self, *args, **kwargs):
+            # Arguments are stored verbatim; nothing interprets them.
+            self.args = args
+            self.kwargs = kwargs
+        @classmethod
+        def from_hints(cls, *args, **kwargs):
+            return cls(*args, **kwargs)
+    _tcc.AttrsDescriptor = _AttrsDescriptorShim
+# triton_key: removed in triton 3.5, used by torch._inductor.codecache for
+# FxGraphCache key derivation. Return a stable string so caching still works.
+if not hasattr(_tcc, "triton_key"):
+    def _triton_key_shim():
+        # Imported lazily so the key reflects the triton version at call time.
+        import triton as _t
+        return f"triton-{_t.__version__}-shim"
+    _tcc.triton_key = _triton_key_shim
+# Suppress torch.compile/_dynamo errors globally — we don't rely on torch.compile
+# for performance in this codebase (Muon + mamba3 CUDA kernels already fused),
+# so fall back to eager on any dynamo failure rather than crashing. This is
+# defense-in-depth against further triton API drift.
+# Best-effort: if torch._dynamo cannot be imported or configured, the failure
+# is swallowed on purpose and the package import continues.
+try:
+    import torch._dynamo  # noqa: F401 — triggers dynamo module init
+    torch._dynamo.config.suppress_errors = True
+except Exception:  # pragma: no cover
+    pass
+# Expose Mamba3 at top level to match `from mamba_ssm import Mamba3`.
+# This import must stay last: the shims above have to be in place before it runs.
+from mamba_ssm.modules.mamba3 import Mamba3  # noqa: E402

overlay/htm_rust/src/gpu/fused.rs ADDED Viewed

	@@ -0,0 +1,650 @@

+//! Fused HTM megakernel launcher.
+//!
+//! Collapses the 12-kernel per-timestep pipeline (and the outer T-loop) into
+//! a single kernel launch per forward. See `kernels/htm_fused_step.cu` for
+//! the kernel design and the cross-block coherence strategy (grid barrier
+//! via device counter with all blocks concurrently resident).
+//!
+//! Launch invariant: `grid_dim.x <= concurrent-block capacity`. Host code
+//! probes the device SM count at construction and caps grid_dim.x
+//! accordingly — otherwise the grid barrier deadlocks.
+//!
+//! Semantic change from the top-K pipeline: activation is per-column
+//! threshold-based (local lateral inhibition) instead of global top-K.
+//! A per-column `inhibition_threshold` is tracked and EMA-steered to hit
+//! the sparsity target. This is a real architectural change and is
+//! documented in `docs/GPU_HTM.md`.
+#![cfg(feature = "gpu")]
+use std::ffi::CString;
+use std::sync::Arc;
+use cudarc::driver::{result, sys, CudaDevice, CudaSlice, DeviceRepr, DevicePtr, DriverError,
+                      LaunchConfig};
+use cudarc::nvrtc::Ptx;
+use super::sp_gpu::SpatialPoolerGpu;
+use super::tm_gpu::{TemporalMemoryGpu, MAX_SEGMENTS_PER_CELL, MAX_SYN_PER_SEGMENT};
+const PTX_HTM_FUSED: &str =
+    include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/htm_fused_step.ptx"));
+/// Struct-by-value pointer pack — matches C-side `FusedPtrs`.
+///
+/// Every field is a plain `u64`; the values are presumably raw device
+/// addresses filled in by the launcher (confirm against `launch_fused`).
+/// Field ORDER is part of the ABI: the struct is `#[repr(C)]` and must stay
+/// in lock-step with the C-side definition.
+///
+/// NOTE: `barrier_counters` is kept as an ABI-compat dummy (always 0). The
+/// C-side `FusedPtrs` still has the field at the same byte offset; removing
+/// it here would shift all subsequent fields and break the layout. Worker A
+/// will eventually delete the field from both sides once the kernel is
+/// updated; until then we zero it.
+#[repr(C)]
+#[derive(Clone, Copy)]
+pub struct FusedPtrs {
+    pub syn_bit: u64,
+    pub syn_perm: u64,
+    pub boost: u64,
+    pub active_duty: u64,
+    pub inhibition_threshold: u64,
+    pub seg_cell_id: u64,
+    pub seg_syn_count: u64,
+    pub syn_presyn: u64,
+    pub tm_syn_perm: u64,
+    pub cell_seg_count: u64,
+    pub cell_active_a: u64,
+    pub cell_active_b: u64,
+    pub cell_winner_a: u64,
+    pub cell_winner_b: u64,
+    pub inputs: u64,
+    pub cols_out: u64,
+    pub anom_out: u64,
+    /// ABI-compat dummy — always 0. No device memory is allocated for this
+    /// field; the cluster barrier replaces the old software DLB barrier.
+    pub barrier_counters: u64,
+    pub step_scratch: u64,
+}
+// SAFETY (to be confirmed against cudarc's `DeviceRepr` contract): the type
+// is `#[repr(C)]`, `Copy`, and consists solely of `u64` fields.
+unsafe impl DeviceRepr for FusedPtrs {}
+/// Launch-time config — matches C-side `FusedConfig` 1:1.
+///
+/// `#[repr(C)]` with only 4-byte scalar fields (`u32` / `f32` / `i32`). Field
+/// order and types are part of the ABI shared with the kernel; do not reorder
+/// or retype a field without changing the C-side struct as well.
+#[repr(C)]
+#[derive(Clone, Copy)]
+pub struct FusedConfig {
+    pub input_bits: u32,
+    pub n_columns: u32,
+    pub synapses_per_col: u32,
+    pub conn_thr: f32,
+    pub sp_inc: f32,
+    pub sp_dec: f32,
+    pub sparsity_target: f32,
+    pub duty_alpha: f32,
+    pub thr_adapt_rate: f32,
+    pub cells_per_column: u32,
+    pub n_cells: u32,
+    pub bits_words: u32,
+    pub max_segments_per_cell: u32,
+    pub synapses_per_segment: u32,
+    pub activation_threshold: u32,
+    pub learning_threshold: u32,
+    pub max_new_synapses: u32,
+    pub conn_thr_i16: i32,
+    pub perm_inc_i16: i32,
+    pub perm_dec_i16: i32,
+    pub predicted_seg_dec_i16: i32,
+    pub initial_perm_i16: i32,
+    pub t: u32,
+    pub learn: u32,
+    pub iter_seed: u32,
+    pub cooperative_grid_sync: u32,
+}
+// SAFETY (to be confirmed against cudarc's `DeviceRepr` contract): the type
+// is `#[repr(C)]`, `Copy`, and consists solely of 4-byte scalar fields.
+unsafe impl DeviceRepr for FusedConfig {}
+/// Cluster launch parameters probed at construction time.
+///
+/// `FusedState::new` fills this in: `max_cluster_size` is set to 16 when the
+/// device reports `CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH > 0`, and to 0 otherwise
+/// (including when the attribute query itself fails).
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub(crate) struct ClusterInfo {
+    /// Maximum cluster size supported by this device (0 = cluster unsupported).
+    pub max_cluster_size: u32,
+}
+// There is only ONE launch mode: non-cooperative launch with Hopper Thread
+// Block Cluster attribute (`CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION`). The old
+// software DLB barrier and the cooperative-launch path are both removed.
+// Cluster barriers replace both.
+/// Launch geometry produced by `plan_fused_launch`.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub(crate) struct FusedLaunchPlan {
+    /// Number of blocks to launch: the resident bound clamped to the grid cap
+    /// (at most 16) and never below 1.
+    pub grid_dim_x: u32,
+    /// Threads per block; `plan_fused_launch` always sets 1024.
+    pub block_dim_x: u32,
+    /// Despite the name, this holds the resident-block bound computed by
+    /// `plan_fused_launch` (max of the probed limit and `sm_count * 2`), not
+    /// the raw probe value.
+    pub cooperative_grid_limit: u32,
+    /// SM count used for planning, clamped to at least 1.
+    pub sm_count: u32,
+}
+/// Reads the `HTM_FUSED_GRID_CAP` environment variable.
+///
+/// Returns `None` when the variable is unset, not valid Unicode, or does not
+/// parse as a `u32`; otherwise returns the parsed value raised to at least 1.
+fn fused_grid_cap_override() -> Option<u32> {
+    let raw = std::env::var("HTM_FUSED_GRID_CAP").ok()?;
+    let parsed: u32 = raw.parse().ok()?;
+    Some(parsed.max(1))
+}
+/// Derives the launch geometry for the fused kernel.
+///
+/// * `sm_count` — probed SM count; values below 1 are treated as 1.
+/// * `cooperative_supported` — only affects logging; the cluster path does
+///   not need cooperative launch.
+/// * `cooperative_grid_limit` — probed resident-block estimate (0 = unknown).
+/// * `grid_cap_override` — optional debugging cap; never allowed above 16.
+///
+/// The current implementation always returns `Ok`.
+pub(crate) fn plan_fused_launch(
+    sm_count: u32,
+    cooperative_supported: bool,
+    cooperative_grid_limit: u32,
+    grid_cap_override: Option<u32>,
+) -> Result<FusedLaunchPlan, String> {
+    const BLOCK_DIM_X: u32 = 1024;
+    // Cluster constraint: grid_dim_x must equal the cluster size (16) so that
+    // each region maps to exactly one cluster. An override may lower the cap
+    // for debugging but is clamped so it can never exceed the cluster size.
+    const CLUSTER_SIZE: u32 = 16;
+
+    // Cooperative launch is not required on the cluster path; the probe
+    // result is kept for residency estimation only.
+    if !cooperative_supported {
+        eprintln!("[htm_rust] INFO: cooperative launch unsupported; cluster path only.");
+    }
+
+    let sms = sm_count.max(1);
+    // `max` with a zero limit yields the floor itself, which covers the
+    // "limit unknown" case without a separate branch.
+    let resident_floor = sms * 2;
+    let resident_bound = cooperative_grid_limit.max(resident_floor);
+
+    let cap = match grid_cap_override {
+        Some(requested) => requested.min(CLUSTER_SIZE),
+        None => CLUSTER_SIZE,
+    };
+    let grid_dim_x = resident_bound.min(cap).max(1);
+
+    Ok(FusedLaunchPlan {
+        grid_dim_x,
+        block_dim_x: BLOCK_DIM_X,
+        cooperative_grid_limit: resident_bound,
+        sm_count: sms,
+    })
+}
+/// Raw driver-API handles for the fused kernel module.
+///
+/// Owns `module`; the two function handles are looked up from that module in
+/// `FusedState::new`. Dropping the value unloads the module.
+pub(super) struct RawFusedKernel {
+    module: sys::CUmodule,
+    pub(super) function: sys::CUfunction,
+    pub(super) function_batched: sys::CUfunction,
+}
+// NOTE(review): these impls assert that the raw handles may be moved to and
+// shared between threads; the justification is not visible in this file —
+// confirm against the CUDA driver API's threading rules.
+unsafe impl Send for RawFusedKernel {}
+unsafe impl Sync for RawFusedKernel {}
+impl Drop for RawFusedKernel {
+    fn drop(&mut self) {
+        unsafe {
+            // The unload result is discarded on purpose: `drop` cannot
+            // propagate an error.
+            let _ = result::module::unload(self.module);
+        }
+    }
+}
+/// Owns fused-path-only device state:
+///   - per-column inhibition threshold (replaces global top-K)
+///   - ping-pong cell_active/cell_winner bitsets
+///   - step_scratch (n_active, n_unpred per timestep)
+///   - cluster launch capability info
+pub struct FusedState {
+    dev: Arc<CudaDevice>,
+    pub(super) raw_kernel: RawFusedKernel,
+    /// One `f32` per column, initialised to `initial_threshold` in `new`.
+    pub inhibition_threshold: CudaSlice<f32>,
+    // The four bitsets below each hold `n_cells / 32` words (see `new`).
+    pub cell_active_bits_a: CudaSlice<u32>,
+    pub cell_active_bits_b: CudaSlice<u32>,
+    pub cell_winner_bits_a: CudaSlice<u32>,
+    pub cell_winner_bits_b: CudaSlice<u32>,
+    pub step_scratch: CudaSlice<u32>,       // length 6
+    // Launch geometry copied from the `FusedLaunchPlan` computed in `new`.
+    pub grid_dim_x: u32,
+    pub block_dim_x: u32,
+    pub cooperative_grid_limit: u32,
+    /// Starts at 0; `launch_fused` bumps it with `wrapping_add(1)` per launch.
+    pub iter_counter: u32,
+    /// Hopper cluster launch capability (0 = unsupported).
+    pub cluster_info: ClusterInfo,
+    // Config mirror (read-only after init).
+    #[allow(dead_code)]
+    pub initial_threshold: f32,
+}
+impl FusedState {
+    pub fn new(
+        dev: Arc<CudaDevice>,
+        n_columns: usize,
+        cells_per_column: usize,
+        initial_threshold: f32,
+    ) -> Result<Self, DriverError> {
+        let n_cells = n_columns * cells_per_column;
+        assert!(n_cells % 32 == 0, "n_cells must be divisible by 32 for bitsets");
+        let bits_words = n_cells / 32;
+        let mut inhibition_threshold = dev.alloc_zeros::<f32>(n_columns)?;
+        let init_vec = vec![initial_threshold; n_columns];
+        dev.htod_sync_copy_into(&init_vec, &mut inhibition_threshold)?;
+        let cell_active_bits_a = dev.alloc_zeros::<u32>(bits_words)?;
+        let cell_active_bits_b = dev.alloc_zeros::<u32>(bits_words)?;
+        let cell_winner_bits_a = dev.alloc_zeros::<u32>(bits_words)?;
+        let cell_winner_bits_b = dev.alloc_zeros::<u32>(bits_words)?;
+        let step_scratch = dev.alloc_zeros::<u32>(6)?;
+        unsafe {
+            result::ctx::set_current(*dev.cu_primary_ctx())?;
+        }
+        if dev.get_func("htm_fused", "htm_fused_step").is_none() {
+            dev.load_ptx(
+                Ptx::from_src(PTX_HTM_FUSED),
+                "htm_fused",
+                &["htm_fused_step", "htm_fused_step_batched"],
+            )?;
+        }
+        let ptx = CString::new(PTX_HTM_FUSED).expect("PTX contains no interior nul bytes");
+        let module = unsafe { result::module::load_data(ptx.as_ptr().cast()) }?;
+        let function = unsafe {
+            result::module::get_function(module, CString::new("htm_fused_step").unwrap())
+        }?;
+        let function_batched = unsafe {
+            result::module::get_function(module, CString::new("htm_fused_step_batched").unwrap())
+        }?;
+        // Cluster size 16 on Hopper is "non-portable" (> 8 requires opt-in).
+        // Must set CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED=1 on
+        // every launched kernel function, otherwise cuLaunchKernelEx rejects
+        // the cluster dim with CUDA_ERROR_INVALID_CLUSTER_SIZE.
+        unsafe {
+            let attr = sys::CUfunction_attribute::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED;
+            // Ignore errors: older CUDA may lack the attribute, in which case
+            // only portable sizes (<= 8) work — plan_fused_launch caps at 8.
+            let _ = sys::lib().cuFuncSetAttribute(function, attr, 1);
+            let _ = sys::lib().cuFuncSetAttribute(function_batched, attr, 1);
+        }
+        // Probe SM count.
+        let sm_count = match dev.attribute(
+            cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+        ) {
+            Ok(v) => v as u32,
+            Err(_) => 16u32,
+        };
+        // T1: Probe Hopper cluster launch capability.
+        let max_cluster_size = match dev.attribute(
+            cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH,
+        ) {
+            Ok(v) if v > 0 => {
+                // H200/sm_90a supports up to 16 blocks per cluster.
+                // There is no MAX_CLUSTER_SIZE attribute in CUDA 12.4; hard-code the
+                // Hopper maximum which is 16 (8 SMs × 2 blocks/SM = 16 blocks/cluster).
+                16u32
+            }
+            _ => 0u32,
+        };
+        eprintln!("[htm_rust] cluster: max_cluster_size={}", max_cluster_size);
+        let cluster_info = ClusterInfo { max_cluster_size };
+        let cooperative_supported = matches!(
+            dev.attribute(sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH),
+            Ok(v) if v > 0
+        );
+        let cooperative_grid_limit = if cooperative_supported {
+            let blocks_per_sm = unsafe {
+                result::occupancy::max_active_block_per_multiprocessor(function, 1024, 0)
+            }
+            .ok()
+            .map(|v| v.max(0) as u32)
+            .unwrap_or(0);
+            sm_count.saturating_mul(blocks_per_sm)
+        } else {
+            0
+        };
+        let launch_plan = plan_fused_launch(
+            sm_count,
+            cooperative_supported,
+            cooperative_grid_limit,
+            fused_grid_cap_override(),
+        )
+        .map_err(|msg| {
+            // Surface as a CUDA-ish error so callers can propagate.
+            eprintln!("[htm_rust] FATAL: {msg}");
+            DriverError(cudarc::driver::sys::CUresult::CUDA_ERROR_NOT_SUPPORTED)
+        })?;
+        eprintln!(
+            "[htm_rust] fused kernel: sm_count={} grid_dim_x={} cooperative_grid_limit={} cluster_max={}",
+            launch_plan.sm_count, launch_plan.grid_dim_x, launch_plan.cooperative_grid_limit,
+            cluster_info.max_cluster_size,
+        );
+        Ok(Self {
+            dev,
+            raw_kernel: RawFusedKernel { module, function, function_batched },
+            inhibition_threshold,
+            cell_active_bits_a,
+            cell_active_bits_b,
+            cell_winner_bits_a,
+            cell_winner_bits_b,
+            step_scratch,
+            grid_dim_x: launch_plan.grid_dim_x,
+            block_dim_x: launch_plan.block_dim_x,
+            cooperative_grid_limit: launch_plan.cooperative_grid_limit,
+            iter_counter: 0,
+            cluster_info,
+            initial_threshold,
+        })
+    }
+    /// Zero the transient device buffers owned by the fused state
+    /// (per the original note, this is invoked from `region.reset()`).
+    ///
+    /// Clears both halves of the active-cell and winner-cell ping-pong
+    /// bitsets as well as `step_scratch`. `inhibition_threshold` is
+    /// deliberately left untouched: it is learned state, and a hard reset
+    /// of TM state should not forget the sparsity calibration.
+    ///
+    /// Returns the first `DriverError` raised by a memset; buffers that
+    /// come after the failing one are not cleared.
+    pub fn reset(&mut self) -> Result<(), DriverError> {
+        let dev = &self.dev;
+        // Active-cell ping-pong pair.
+        dev.memset_zeros(&mut self.cell_active_bits_a)?;
+        dev.memset_zeros(&mut self.cell_active_bits_b)?;
+        // Winner-cell ping-pong pair.
+        dev.memset_zeros(&mut self.cell_winner_bits_a)?;
+        dev.memset_zeros(&mut self.cell_winner_bits_b)?;
+        // Per-step scratch counters.
+        dev.memset_zeros(&mut self.step_scratch)?;
+        Ok(())
+    }
+}
+/// Launch the fused megakernel for a single region, processing all `t`
+/// timesteps in one kernel invocation.
+///
+/// When `fused.cluster_info.max_cluster_size > 0` the kernel is launched
+/// through `cuLaunchKernelEx` with a `CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION`
+/// attribute; otherwise it falls back to a plain `launch_kernel`.
+///
+/// The cluster X dimension is `min(grid_dim_x, max_cluster_size)` instead of
+/// a hard-coded 16. The driver rejects a launch whose grid is not a whole
+/// multiple of the cluster dimension, and the kernel expects the whole
+/// single-region grid to sit in one cluster, so a planned grid smaller than
+/// 16 blocks (for example via the grid-cap override) must shrink the cluster
+/// with it.
+///
+/// # Arguments
+/// * `sp` / `tm` - spatial-pooler and temporal-memory state whose device
+///   buffers and scalar parameters are handed to the kernel.
+/// * `fused` - fused-kernel state; `step_scratch` is zeroed and
+///   `iter_counter` is incremented (wrapping) before every launch.
+/// * `inputs_flat` - device input buffer; the kernel indexes it as
+///   `timestep * input_bits + bit`.
+/// * `cols_out` - device output buffer; the kernel writes one byte per
+///   column at `timestep * n_columns + column`.
+/// * `anom_out` - device `f32` output buffer passed through to the kernel.
+/// * `t` - number of timesteps; must fit in `u32`.
+/// * `input_bits` - input width per timestep; must fit in `u32`.
+/// * `learn` - forwarded to the kernel as `1` / `0`.
+///
+/// # Errors
+/// Returns `CUDA_ERROR_INVALID_VALUE` when `t` or `input_bits` does not fit
+/// in `u32`, and otherwise propagates any `DriverError` from the memset,
+/// context switch or kernel launch.
+///
+/// NOTE(review): the kernel's barrier is a cluster barrier, so the
+/// non-cluster fallback presumably only synchronizes within one block -
+/// confirm that multi-block grids are valid on that path.
+#[allow(clippy::too_many_arguments)]
+pub fn launch_fused(
+    sp: &mut SpatialPoolerGpu,
+    tm: &mut TemporalMemoryGpu,
+    fused: &mut FusedState,
+    inputs_flat: &CudaSlice<u8>,
+    cols_out: &mut CudaSlice<u8>,
+    anom_out: &mut CudaSlice<f32>,
+    t: usize,
+    input_bits: usize,
+    learn: bool,
+) -> Result<(), DriverError> {
+    // FusedConfig carries both values as u32; refuse inputs that an `as`
+    // cast would silently truncate.
+    if t > u32::MAX as usize || input_bits > u32::MAX as usize {
+        return Err(DriverError(sys::CUresult::CUDA_ERROR_INVALID_VALUE));
+    }
+    // Reset step_scratch before each launch (safe re-entry).
+    sp.dev_ref().memset_zeros(&mut fused.step_scratch)?;
+    fused.iter_counter = fused.iter_counter.wrapping_add(1);
+    let cfg = FusedConfig {
+        input_bits: input_bits as u32,
+        n_columns: sp.n_columns_accessor() as u32,
+        synapses_per_col: sp.synapses_per_col_accessor() as u32,
+        conn_thr: sp.conn_thr_accessor(),
+        sp_inc: sp.inc_accessor(),
+        sp_dec: sp.dec_accessor(),
+        sparsity_target: sp.sparsity_accessor(),
+        // Guard against a duty period below 1 so alpha never exceeds 1.
+        duty_alpha: 1.0f32 / sp.duty_period_accessor().max(1.0),
+        thr_adapt_rate: 0.001f32,
+        cells_per_column: tm.cells_per_column as u32,
+        n_cells: tm.n_cells as u32,
+        bits_words: tm.bits_words as u32,
+        max_segments_per_cell: MAX_SEGMENTS_PER_CELL as u32,
+        synapses_per_segment: MAX_SYN_PER_SEGMENT as u32,
+        activation_threshold: tm.activation_threshold,
+        learning_threshold: tm.learning_threshold,
+        max_new_synapses: tm.max_new_synapse_count,
+        conn_thr_i16: tm.conn_thr_i16 as i32,
+        perm_inc_i16: tm.perm_inc_i16 as i32,
+        perm_dec_i16: tm.perm_dec_i16 as i32,
+        predicted_seg_dec_i16: tm.predicted_seg_dec_i16 as i32,
+        initial_perm_i16: tm.initial_perm_i16 as i32,
+        t: t as u32,
+        learn: if learn { 1 } else { 0 },
+        iter_seed: fused.iter_counter,
+        cooperative_grid_sync: 1,
+    };
+    let ptrs = FusedPtrs {
+        syn_bit: *sp.syn_bit_accessor().device_ptr(),
+        syn_perm: *sp.syn_perm_accessor().device_ptr(),
+        boost: *sp.boost_accessor().device_ptr(),
+        active_duty: *sp.active_duty_accessor().device_ptr(),
+        inhibition_threshold: *fused.inhibition_threshold.device_ptr(),
+        seg_cell_id: *tm.seg_cell_id_accessor().device_ptr(),
+        seg_syn_count: *tm.seg_syn_count_accessor().device_ptr(),
+        syn_presyn: *tm.syn_presyn_accessor().device_ptr(),
+        tm_syn_perm: *tm.syn_perm_accessor().device_ptr(),
+        cell_seg_count: *tm.cell_seg_count_accessor().device_ptr(),
+        cell_active_a: *fused.cell_active_bits_a.device_ptr(),
+        cell_active_b: *fused.cell_active_bits_b.device_ptr(),
+        cell_winner_a: *fused.cell_winner_bits_a.device_ptr(),
+        cell_winner_b: *fused.cell_winner_bits_b.device_ptr(),
+        inputs: *inputs_flat.device_ptr(),
+        cols_out: *cols_out.device_ptr(),
+        anom_out: *anom_out.device_ptr(),
+        barrier_counters: 0u64,  // ABI-compat dummy; cluster barrier replaces DLB.
+        step_scratch: *fused.step_scratch.device_ptr(),
+    };
+    let grid_x = fused.grid_dim_x;
+    let block_x = fused.block_dim_x;
+    let cu_stream = *sp.dev_ref().cu_stream();
+    let max_cluster_size = fused.cluster_info.max_cluster_size;
+    let use_cluster = max_cluster_size > 0;
+    unsafe {
+        result::ctx::set_current(*sp.dev_ref().cu_primary_ctx())?;
+        // The kernel takes (FusedPtrs, FusedConfig) by value; the driver
+        // reads the arguments through these pointers at launch time.
+        let mut kernel_params: [*mut std::ffi::c_void; 2] = [
+            (&ptrs as *const FusedPtrs).cast_mut().cast(),
+            (&cfg as *const FusedConfig).cast_mut().cast(),
+        ];
+        if use_cluster {
+            // T10: Hopper cluster launch with CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION.
+            // Size the cluster from the planned grid so the grid is always a
+            // whole multiple of the cluster dimension; with the default
+            // 16-block grid this is still (16,1,1).
+            let cluster_x = grid_x.min(max_cluster_size);
+            let mut attr: sys::CUlaunchAttribute = std::mem::zeroed();
+            attr.id = sys::CUlaunchAttributeID::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+            attr.value.clusterDim.x = cluster_x;
+            attr.value.clusterDim.y = 1;
+            attr.value.clusterDim.z = 1;
+            let mut launch_cfg: sys::CUlaunchConfig = std::mem::zeroed();
+            launch_cfg.gridDimX = grid_x;
+            launch_cfg.gridDimY = 1;
+            launch_cfg.gridDimZ = 1;
+            launch_cfg.blockDimX = block_x;
+            launch_cfg.blockDimY = 1;
+            launch_cfg.blockDimZ = 1;
+            launch_cfg.sharedMemBytes = 0;
+            launch_cfg.hStream = cu_stream;
+            launch_cfg.numAttrs = 1;
+            launch_cfg.attrs = &mut attr as *mut sys::CUlaunchAttribute;
+            let ret = sys::lib().cuLaunchKernelEx(
+                &launch_cfg as *const sys::CUlaunchConfig,
+                fused.raw_kernel.function,
+                kernel_params.as_mut_ptr(),
+                std::ptr::null_mut(),
+            );
+            if ret != sys::CUresult::CUDA_SUCCESS {
+                return Err(DriverError(ret));
+            }
+        } else {
+            // Fallback for devices that don't support cluster launch.
+            result::launch_kernel(
+                fused.raw_kernel.function,
+                (grid_x, 1, 1),
+                (block_x, 1, 1),
+                0,
+                cu_stream,
+                &mut kernel_params,
+            )?;
+        }
+    }
+    Ok(())
+}
+/// Launch the batched fused kernel once for `B = region_ptrs.len()` regions.
+///
+/// The grid is `(grid_dim_x, B, 1)`: each block picks its region's
+/// `FusedPtrs` from a device-side array indexed by `blockIdx.y`. A single
+/// `FusedConfig`, built from region 0, is shared by every region, so all
+/// regions must use identical SP/TM parameters (same
+/// input_bits/n_columns/etc.). The launch stream, context, kernel handle and
+/// grid/block dimensions are likewise taken from region 0.
+///
+/// When region 0 reports `max_cluster_size > 0` the launch goes through
+/// `cuLaunchKernelEx` with a cluster dimension of
+/// `(min(grid_dim_x, max_cluster_size), 1, 1)`, i.e. one cluster per region
+/// (Y slice) for the default 16-block grid. The cluster dimension follows
+/// the planned grid because the driver rejects a grid that is not a whole
+/// multiple of the cluster dimension. Without cluster support a plain
+/// `launch_kernel` is used.
+///
+/// One batched launch replaces B sequential single-region launches.
+///
+/// Low-level raw-pointer entry, called by the PyO3 binding which holds the
+/// mutable borrows.
+///
+/// # Safety contract (not enforced by the type system)
+/// Each `*mut HTMRegionGpu` must point to a live, uniquely-borrowed region,
+/// and all regions must be distinct. `inputs_per_region`, `cols_per_region`
+/// and `anom_per_region` hold raw device addresses that are forwarded to the
+/// kernel unchecked.
+///
+/// # Panics
+/// Panics if the per-region slices differ in length from `region_ptrs` or if
+/// `region_ptrs` is empty.
+///
+/// # Errors
+/// Returns `CUDA_ERROR_INVALID_VALUE` when `t` or `input_bits` does not fit
+/// in `u32`, and otherwise propagates any `DriverError` from the memsets,
+/// the host-to-device upload, the context switch or the kernel launch.
+///
+/// NOTE(review): the kernel's barrier is a cluster barrier, so the
+/// non-cluster fallback presumably only synchronizes within one block -
+/// confirm that multi-block grids are valid on that path.
+#[allow(clippy::too_many_arguments)]
+pub(super) fn launch_fused_batched_raw(
+    region_ptrs: &[*mut super::HTMRegionGpu],
+    inputs_per_region: &[u64],
+    cols_per_region: &[u64],
+    anom_per_region: &[u64],
+    t: usize,
+    input_bits: usize,
+    learn: bool,
+) -> Result<(), DriverError> {
+    let b = region_ptrs.len();
+    assert_eq!(inputs_per_region.len(), b);
+    assert_eq!(cols_per_region.len(), b);
+    assert_eq!(anom_per_region.len(), b);
+    assert!(b >= 1, "need at least one region");
+    // FusedConfig carries both values as u32; refuse inputs that an `as`
+    // cast would silently truncate.
+    if t > u32::MAX as usize || input_bits > u32::MAX as usize {
+        return Err(DriverError(sys::CUresult::CUDA_ERROR_INVALID_VALUE));
+    }
+    // Reset per-region step_scratch before each launch.
+    for &rp in region_ptrs.iter() {
+        let r = unsafe { &mut *rp };
+        // Clone the device handle so the region can be borrowed mutably below.
+        let dev = r.sp_gpu.dev_ref().clone();
+        dev.memset_zeros(&mut r.fused_state.step_scratch)?;
+        r.fused_state.iter_counter = r.fused_state.iter_counter.wrapping_add(1);
+    }
+    // Shared launch parameters - taken from region 0.
+    let (grid_x, block_x, function_batched, cu_stream, cu_ctx, max_cluster_size) = {
+        let r0 = unsafe { &*region_ptrs[0] };
+        (
+            r0.fused_state.grid_dim_x,
+            r0.fused_state.block_dim_x,
+            r0.fused_state.raw_kernel.function_batched,
+            *r0.sp_gpu.dev_ref().cu_stream(),
+            *r0.sp_gpu.dev_ref().cu_primary_ctx(),
+            r0.fused_state.cluster_info.max_cluster_size,
+        )
+    };
+    // Shared config — all regions use identical sp/tm parameters.
+    let cfg = {
+        let r = unsafe { &*region_ptrs[0] };
+        FusedConfig {
+            input_bits: input_bits as u32,
+            n_columns: r.sp_gpu.n_columns_accessor() as u32,
+            synapses_per_col: r.sp_gpu.synapses_per_col_accessor() as u32,
+            conn_thr: r.sp_gpu.conn_thr_accessor(),
+            sp_inc: r.sp_gpu.inc_accessor(),
+            sp_dec: r.sp_gpu.dec_accessor(),
+            sparsity_target: r.sp_gpu.sparsity_accessor(),
+            // Guard against a duty period below 1 so alpha never exceeds 1.
+            duty_alpha: 1.0f32 / r.sp_gpu.duty_period_accessor().max(1.0),
+            thr_adapt_rate: 0.001f32,
+            cells_per_column: r.tm_gpu.cells_per_column as u32,
+            n_cells: r.tm_gpu.n_cells as u32,
+            bits_words: r.tm_gpu.bits_words as u32,
+            max_segments_per_cell: MAX_SEGMENTS_PER_CELL as u32,
+            synapses_per_segment: MAX_SYN_PER_SEGMENT as u32,
+            activation_threshold: r.tm_gpu.activation_threshold,
+            learning_threshold: r.tm_gpu.learning_threshold,
+            max_new_synapses: r.tm_gpu.max_new_synapse_count,
+            conn_thr_i16: r.tm_gpu.conn_thr_i16 as i32,
+            perm_inc_i16: r.tm_gpu.perm_inc_i16 as i32,
+            perm_dec_i16: r.tm_gpu.perm_dec_i16 as i32,
+            predicted_seg_dec_i16: r.tm_gpu.predicted_seg_dec_i16 as i32,
+            initial_perm_i16: r.tm_gpu.initial_perm_i16 as i32,
+            t: t as u32,
+            learn: if learn { 1 } else { 0 },
+            // Every region in the batch shares region 0's seed.
+            iter_seed: r.fused_state.iter_counter,
+            cooperative_grid_sync: 1,
+        }
+    };
+    // Build B FusedPtrs per-region.
+    let ptrs_vec: Vec<FusedPtrs> = (0..b)
+        .map(|i| {
+            let r = unsafe { &*region_ptrs[i] };
+            FusedPtrs {
+                syn_bit: *r.sp_gpu.syn_bit_accessor().device_ptr(),
+                syn_perm: *r.sp_gpu.syn_perm_accessor().device_ptr(),
+                boost: *r.sp_gpu.boost_accessor().device_ptr(),
+                active_duty: *r.sp_gpu.active_duty_accessor().device_ptr(),
+                inhibition_threshold: *r.fused_state.inhibition_threshold.device_ptr(),
+                seg_cell_id: *r.tm_gpu.seg_cell_id_accessor().device_ptr(),
+                seg_syn_count: *r.tm_gpu.seg_syn_count_accessor().device_ptr(),
+                syn_presyn: *r.tm_gpu.syn_presyn_accessor().device_ptr(),
+                tm_syn_perm: *r.tm_gpu.syn_perm_accessor().device_ptr(),
+                cell_seg_count: *r.tm_gpu.cell_seg_count_accessor().device_ptr(),
+                cell_active_a: *r.fused_state.cell_active_bits_a.device_ptr(),
+                cell_active_b: *r.fused_state.cell_active_bits_b.device_ptr(),
+                cell_winner_a: *r.fused_state.cell_winner_bits_a.device_ptr(),
+                cell_winner_b: *r.fused_state.cell_winner_bits_b.device_ptr(),
+                inputs: inputs_per_region[i],
+                cols_out: cols_per_region[i],
+                anom_out: anom_per_region[i],
+                barrier_counters: 0u64,  // ABI-compat dummy; cluster barrier replaces DLB.
+                step_scratch: *r.fused_state.step_scratch.device_ptr(),
+            }
+        })
+        .collect();
+    // Upload FusedPtrs array to device (B * sizeof(FusedPtrs) bytes).
+    // FusedPtrs is repr(C) + DeviceRepr so htod_sync_copy handles it.
+    // `ptrs_dev` stays alive until this function returns, i.e. until after
+    // the launch below has been submitted.
+    let dev = unsafe { &*region_ptrs[0] }.sp_gpu.dev_ref().clone();
+    let ptrs_dev: CudaSlice<FusedPtrs> = dev.htod_sync_copy(&ptrs_vec)?;
+    let ptrs_dev_ptr: u64 = *ptrs_dev.device_ptr();
+    // T10: Cluster launch for batched regions.
+    // Grid = (grid_x, B, 1); the cluster spans the X dimension only, so each
+    // region (Y slice) gets its own cluster(s).
+    let use_cluster = max_cluster_size > 0;
+    unsafe {
+        result::ctx::set_current(cu_ctx)?;
+        // The batched kernel takes (device pointer to FusedPtrs[B], FusedConfig).
+        let mut kernel_params: [*mut std::ffi::c_void; 2] = [
+            (&ptrs_dev_ptr as *const u64).cast_mut().cast(),
+            (&cfg as *const FusedConfig).cast_mut().cast(),
+        ];
+        if use_cluster {
+            // Size the cluster from the planned grid so the grid is always a
+            // whole multiple of the cluster dimension; with the default
+            // 16-block grid this is still (16,1,1).
+            let cluster_x = grid_x.min(max_cluster_size);
+            let mut attr: sys::CUlaunchAttribute = std::mem::zeroed();
+            attr.id = sys::CUlaunchAttributeID::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+            attr.value.clusterDim.x = cluster_x;
+            attr.value.clusterDim.y = 1;
+            attr.value.clusterDim.z = 1;
+            let mut launch_cfg: sys::CUlaunchConfig = std::mem::zeroed();
+            launch_cfg.gridDimX = grid_x;
+            launch_cfg.gridDimY = b as u32;
+            launch_cfg.gridDimZ = 1;
+            launch_cfg.blockDimX = block_x;
+            launch_cfg.blockDimY = 1;
+            launch_cfg.blockDimZ = 1;
+            launch_cfg.sharedMemBytes = 0;
+            launch_cfg.hStream = cu_stream;
+            launch_cfg.numAttrs = 1;
+            launch_cfg.attrs = &mut attr as *mut sys::CUlaunchAttribute;
+            let ret = sys::lib().cuLaunchKernelEx(
+                &launch_cfg as *const sys::CUlaunchConfig,
+                function_batched,
+                kernel_params.as_mut_ptr(),
+                std::ptr::null_mut(),
+            );
+            if ret != sys::CUresult::CUDA_SUCCESS {
+                return Err(DriverError(ret));
+            }
+        } else {
+            // Fallback: plain non-cooperative launch for non-Hopper devices.
+            result::launch_kernel(
+                function_batched,
+                (grid_x, b as u32, 1),
+                (block_x, 1, 1),
+                0,
+                cu_stream,
+                &mut kernel_params,
+            )?;
+        }
+    }
+    Ok(())
+}

overlay/htm_rust/src/gpu/kernels/htm_fused_step.cu ADDED Viewed

	@@ -0,0 +1,677 @@

+// Fused HTM megakernel — SP + TM, all T timesteps in a single launch.
+//
+// Design rationale:
+//   - Global top-K column selection requires cross-block synchronization at
+//     every timestep (grid.sync is unreliable on WSL2/sm_86 without rdc=true).
+//   - Replace with per-column threshold activation using local lateral
+//     inhibition: column c activates if overlap[c]*boost[c] > threshold[c].
+//     Threshold is a per-column running-EMA learned scalar that steers the
+//     column's long-run activation rate toward the global sparsity target.
+//   - This is biologically grounded (GABAergic local inhibition) and supported
+//     by HTM theory (duty-cycle boost already drives this loop; we just
+//     change which lever the EMA pulls).
+//
+// Launch shape:
+//   grid  = min(device SM count, 16)  // hard cap — see below
+//   block = 1024 threads = 32 warps
+//   Each warp of 32 owns a contiguous column slice (n_columns / total_warps).
+//
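+// Cross-block coherence:
+//   - Ping-pong buffers for cell_active/cell_winner: write _a at even t,
+//     read _b; reversed at odd t.
+//   - Current path: fused_grid_barrier() calls the hardware cluster barrier
+//     (cg::this_cluster().sync()), so blocks that must synchronize with each
+//     other have to be launched in the same thread-block cluster.
+//   - Earlier designs, superseded by the cluster barrier: cooperative launch
+//     + hardware whole-grid sync as the preferred path, with a software
+//     3-slot rotating grid barrier as the fallback for devices/drivers
+//     that cannot do cooperative launch.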
+//
+// 2026-04-16: grid_dim reduced from 28 to 16 after deadlock RCA. The previous
+// cap of 28 relied on all blocks being concurrently resident on a 30-SM RTX
+// 3060 Laptop. Under thermal throttling effective residency dropped to ~20-24,
+// leaving scheduled blocks spinning on the software grid barrier waiting for
+// peer blocks that would never run. 16 blocks is below any realistic residency
+// floor and preserves enough warp parallelism (16*32 = 512 warps) to saturate
+// memory bandwidth on the spatial-pooler stage.
+//
+// Kernel signature uses struct-by-value for pointers and config to stay
+// inside cudarc's launch-arg count limit.
+#include <cooperative_groups.h>
+#include <cooperative_groups/memcpy_async.h>
+namespace cg = cooperative_groups;
+// Maximum columns owned per cluster-block in DSMEM.
+// Supports n_columns up to COLS_PER_CLUSTER_BLOCK_MAX * cluster_size.
+// At cluster_size=16: supports up to 256*16=4096 columns.
+// Each array costs 256*4 = 1024 bytes; three arrays = 3072 bytes per SM —
+// well under the 228 KB H200 shared-memory cap.
+#define COLS_PER_CLUSTER_BLOCK_MAX 256u
+// Maximum input_bits supported by the TMA-multicast staging tile.
+// At 32 KB this covers the production SDR width (16384 bits) with 2× headroom.
+// Total shared per SM: 32768 (tile) + 3072 (DSMEM float arrays) = ~35 KB —
+// well under the 228 KB H200 limit.
+//
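+// Projected speedup from TMA multicast input staging (T9/T11). These figures
+// describe the planned cluster-multicast path; the staging code in the kernel
+// body currently issues a per-block cg::memcpy_async, so each block still
+// fetches its own copy of the input slice.
+//   - Without staging: 16 SMs × T × (input_bits GMEM reads per timestep)
+//   - With multicast staging: 1 TMA DMA per timestep, then shared-memory reads
+//   - Theoretical DRAM bandwidth reduction: ~16× on input reads
+//   - Wall-clock reduction estimate: -20 to -40 ms from reduced input fetch latency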
+#define INPUT_BITS_MAX 32768u
+extern "C" {
+// Device addresses of one region's buffers, passed as 64-bit integers and
+// cast to typed pointers at the top of htm_fused_step_body. The per-field
+// notes give the pointer type used by that cast. Field order is part of the
+// host/device ABI and must match the host-side FusedPtrs.
+struct FusedPtrs {
+    // const unsigned int*: per-synapse input index, laid out as column * synapses_per_col + s.
+    unsigned long long syn_bit;
+    // float*: SP synapse permanence, same layout as syn_bit.
+    unsigned long long syn_perm;
+    // float*: per-column boost factor.
+    unsigned long long boost;
+    // float*: per-column active duty value.
+    unsigned long long active_duty;
+    // float*: per-column activation threshold.
+    unsigned long long inhibition_threshold;
+    // unsigned int*
+    unsigned long long seg_cell_id;
+    // unsigned int*
+    unsigned long long seg_syn_count;
+    // unsigned int*
+    unsigned long long syn_presyn;
+    // short*: TM synapse permanence.
+    unsigned long long tm_syn_perm;
+    // unsigned int*
+    unsigned long long cell_seg_count;
+    // unsigned int*: active-cell bitset, ping-pong half A (current at even t).
+    unsigned long long cell_active_a;
+    // unsigned int*: active-cell bitset, ping-pong half B (current at odd t).
+    unsigned long long cell_active_b;
+    // unsigned int*: winner-cell bitset, ping-pong half A (current at even t).
+    unsigned long long cell_winner_a;
+    // unsigned int*: winner-cell bitset, ping-pong half B (current at odd t).
+    unsigned long long cell_winner_b;
+    // const unsigned char*: inputs, indexed as t * input_bits + bit.
+    unsigned long long inputs;
+    // unsigned char*: per-column activity output, indexed as t * n_columns + column.
+    unsigned long long cols_out;
+    // float*: output buffer (presumably per-timestep anomaly scores - TODO confirm).
+    unsigned long long anom_out;
+    // unsigned int*: the host passes 0 here (ABI-compat dummy).
+    unsigned long long barrier_counters;
+    // unsigned int*: per-step counters; slots 0 and 1 are cleared every timestep.
+    unsigned long long step_scratch;
+};
+// Scalar launch parameters passed by value to the fused kernel. The host
+// fills the fields in this same order; the layout is part of the host/device
+// ABI, so fields must not be reordered or resized.
+struct FusedConfig {
+    // SP constants
+    unsigned int input_bits;        // input width per timestep (stride into `inputs`)
+    unsigned int n_columns;
+    unsigned int synapses_per_col;
+    float        conn_thr;          // a synapse counts toward overlap when its permanence >= conn_thr
+    float        sp_inc;            // permanence increment, result clamped to 1.0
+    float        sp_dec;            // permanence decrement, result clamped to 0.0
+    float        sparsity_target;
+    float        duty_alpha;        // host sets 1 / max(duty_period, 1)
+    float        thr_adapt_rate;    // host currently passes 0.001
+    // TM constants
+    unsigned int cells_per_column;
+    unsigned int n_cells;
+    unsigned int bits_words;
+    unsigned int max_segments_per_cell;
+    unsigned int synapses_per_segment;
+    unsigned int activation_threshold;
+    unsigned int learning_threshold;
+    unsigned int max_new_synapses;
+    // The *_i16 values are 16-bit quantities on the host, widened to int here.
+    int          conn_thr_i16;
+    int          perm_inc_i16;
+    int          perm_dec_i16;
+    int          predicted_seg_dec_i16;
+    int          initial_perm_i16;
+    // Loop constants
+    unsigned int T;                      // number of timesteps run by the kernel loop
+    unsigned int learn;                  // non-zero enables learning updates
+    unsigned int iter_seed;              // host launch counter (wrapping)
+    unsigned int cooperative_grid_sync;  // forwarded to fused_grid_barrier, which ignores it
+};
+// Cross-block barrier used between the phases of the fused kernel.
+//
+// Synchronizes the calling block's thread-block cluster through
+// cooperative_groups (cg::this_cluster().sync()). This supersedes the former
+// software Decoupled Look-Back (DLB) atomic-spin barrier.
+//
+// The grid handle and the flags / expected / phase / cooperative_grid_sync
+// arguments are ignored; they stay in the signature only so existing call
+// sites keep compiling unchanged.
+//
+// NOTE(review): the barrier only covers blocks of the same cluster, so it is
+// a whole-region barrier only when the launch places all of a region's
+// blocks in one cluster - confirm against the host launch configuration.
+__device__ static inline void fused_grid_barrier(cg::grid_group,
+                                                 unsigned int *,
+                                                 unsigned int,
+                                                 unsigned int,
+                                                 unsigned int) {
+    cg::this_cluster().sync();
+}
+// Warp-level sum using shuffle-down with a full 32-lane mask.
+// After the last round lane 0 holds the sum over all 32 lanes; the values
+// returned on other lanes are partial sums, so callers that need the total
+// everywhere must broadcast lane 0's result.
+__device__ static inline unsigned int warp_sum_u32(unsigned int v) {
+    // Shuffle distances 16, 8, 4, 2, 1.
+    int delta = 16;
+    while (delta > 0) {
+        const unsigned int from_upper_lane = __shfl_down_sync(0xffffffffu, v, delta);
+        v += from_upper_lane;
+        delta >>= 1;
+    }
+    return v;
+}
+// Core kernel body — works for both single-region and batched launches.
+// Single-region: caller passes the one FusedPtrs struct.
+// Batched: each block reads its region's FusedPtrs via blockIdx.y before
+// calling this. State is independent per region (each region owns its own
+// GPU buffers); grid.sync() is the only cross-block primitive and it
+// spans ALL blocks in the grid (harmless over-sync across regions).
+__device__ static inline
+void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
+    cg::grid_group grid = cg::this_grid();
+    // Cast pointers.
+    const unsigned int  * __restrict__ syn_bit               = (const unsigned int*)P.syn_bit;
+    float               * __restrict__ syn_perm              = (float*)P.syn_perm;
+    float               * __restrict__ boost                 = (float*)P.boost;
+    float               * __restrict__ active_duty           = (float*)P.active_duty;
+    float               * __restrict__ inhibition_threshold  = (float*)P.inhibition_threshold;
+    unsigned int        * __restrict__ seg_cell_id           = (unsigned int*)P.seg_cell_id;
+    unsigned int        * __restrict__ seg_syn_count         = (unsigned int*)P.seg_syn_count;
+    unsigned int        * __restrict__ syn_presyn            = (unsigned int*)P.syn_presyn;
+    short               * __restrict__ tm_syn_perm           = (short*)P.tm_syn_perm;
+    unsigned int        * __restrict__ cell_seg_count        = (unsigned int*)P.cell_seg_count;
+    unsigned int        * __restrict__ cell_active_a         = (unsigned int*)P.cell_active_a;
+    unsigned int        * __restrict__ cell_active_b         = (unsigned int*)P.cell_active_b;
+    unsigned int        * __restrict__ cell_winner_a         = (unsigned int*)P.cell_winner_a;
+    unsigned int        * __restrict__ cell_winner_b         = (unsigned int*)P.cell_winner_b;
+    const unsigned char * __restrict__ inputs                = (const unsigned char*)P.inputs;
+    unsigned char       * __restrict__ cols_out              = (unsigned char*)P.cols_out;
+    float               * __restrict__ anom_out              = (float*)P.anom_out;
+    unsigned int        * __restrict__ barrier_counters      = (unsigned int*)P.barrier_counters;
+    unsigned int        * __restrict__ step_scratch          = (unsigned int*)P.step_scratch;
+    const unsigned int tid     = threadIdx.x;
+    const unsigned int lane    = tid & 31u;
+    const unsigned int warp    = tid >> 5;
+    const unsigned int warps_per_block = blockDim.x >> 5;
+    const unsigned int gwarp   = blockIdx.x * warps_per_block + warp;
+    const unsigned int n_warps = gridDim.x * warps_per_block;
+    const unsigned int n_cols  = cfg.n_columns;
+    const unsigned int col_lo  = (gwarp * n_cols) / n_warps;
+    const unsigned int col_hi  = ((gwarp + 1) * n_cols) / n_warps;
+    unsigned int phase = 0u;
+    // =========================================================
+    // DSMEM: Cluster-distributed shared memory for hot per-column
+    // state (inhibition_threshold, boost, active_duty).
+    //
+    // Each block in the cluster owns a contiguous slice of
+    // [my_col_start, my_col_end) columns in its own __shared__
+    // arrays. Any block can peer-read another block's slice via
+    // cluster.map_shared_rank(ptr, owner_block_rank)[offset].
+    //
+    // This eliminates 2×n_cols×T GMEM reads per forward call
+    // (read + potential re-read of threshold/boost/duty per timestep).
+    // =========================================================
+    auto cluster = cg::this_cluster();
+    const unsigned int cluster_block_rank = cluster.block_rank();  // 0..cluster_size-1
+    const unsigned int cluster_sz         = cluster.num_blocks();  // == gridDim.x (≤16)
+    // Partition n_cols evenly across cluster blocks.
+    // Each block owns cols_per_block columns starting at my_col_start.
+    const unsigned int cols_per_block =
+        (n_cols + cluster_sz - 1u) / cluster_sz;               // ceil div
+    const unsigned int my_col_start =
+        cluster_block_rank * cols_per_block;
+    const unsigned int my_col_end =
+        (my_col_start + cols_per_block < n_cols)
+            ? (my_col_start + cols_per_block) : n_cols;        // clamp
+    // Cluster-distributed shared memory arrays.
+    // Each block holds at most COLS_PER_CLUSTER_BLOCK_MAX floats per array.
+    // Peer blocks address into each other's smem via map_shared_rank.
+    __shared__ float s_inhib_thr [COLS_PER_CLUSTER_BLOCK_MAX];
+    __shared__ float s_boost     [COLS_PER_CLUSTER_BLOCK_MAX];
+    __shared__ float s_active_duty[COLS_PER_CLUSTER_BLOCK_MAX];
+    // TMA multicast input staging tile (T9).
+    //
+    // On Hopper (sm_90a), cg::memcpy_async with cluster scope issues a single
+    // TMA DMA that multicasts the source data to all 16 SMs in the cluster
+    // simultaneously — replacing ~16 per-block GMEM reads per timestep with a
+    // single hardware DMA.  After cg::wait(cluster) every SM's s_input_tile
+    // is populated identically without any additional DRAM traffic.
+    //
+    // Fallback: when cfg.input_bits > INPUT_BITS_MAX the tile is bypassed
+    // and each thread reads directly from GMEM (original path).
+    //
+    // Alignment: 16-byte aligned to satisfy TMA descriptor requirements.
+    __shared__ __align__(16) unsigned char s_input_tile[INPUT_BITS_MAX];
+    // Initial GMEM → smem load (reads state from previous forward call).
+    // Each block loads only its own slice; tid strides across the slice.
+    for (unsigned int c = my_col_start + tid; c < my_col_end; c += blockDim.x) {
+        const unsigned int off = c - my_col_start;
+        s_inhib_thr [off] = inhibition_threshold[c];
+        s_boost     [off] = boost[c];
+        s_active_duty[off] = active_duty[c];
+    }
+    // All blocks in the cluster must finish loading before any block
+    // starts reading peer smem inside the T-loop.
+    cluster.sync();
+    const unsigned int S   = cfg.synapses_per_col;
+    const unsigned int cpc = cfg.cells_per_column;
+    const unsigned int SPS = cfg.synapses_per_segment;
+    const unsigned int MSC = cfg.max_segments_per_cell;
+    // Main timestep loop.
+    for (unsigned int t = 0u; t < cfg.T; t++) {
+        const unsigned int inp_off      = t * cfg.input_bits;
+        const unsigned int col_base_out = t * n_cols;
+        unsigned int * curr_active = (t & 1u) ? cell_active_b : cell_active_a;
+        unsigned int * prev_active = (t & 1u) ? cell_active_a : cell_active_b;
+        unsigned int * curr_winner = (t & 1u) ? cell_winner_b : cell_winner_a;
+        unsigned int * prev_winner = (t & 1u) ? cell_winner_a : cell_winner_b;
+        // ---- Phase 0: clear curr bitsets for my cell range ----
+        const unsigned int my_cell_lo = col_lo * cpc;
+        const unsigned int my_cell_hi = col_hi * cpc;
+        if (cpc == 32u) {
+            // Fast path: one word per column.
+            for (unsigned int c = col_lo + lane; c < col_hi; c += 32u) {
+                curr_active[c] = 0u;
+                curr_winner[c] = 0u;
+            }
+        } else {
+            for (unsigned int cell = my_cell_lo + lane; cell < my_cell_hi; cell += 32u) {
+                unsigned int w = cell >> 5;
+                unsigned int m = 1u << (cell & 31u);
+                atomicAnd(&curr_active[w], ~m);
+                atomicAnd(&curr_winner[w], ~m);
+            }
+        }
+        // Block 0, lane 0, warp 0 resets step-scratch counters.
+        if (blockIdx.x == 0u && tid == 0u) {
+            step_scratch[0] = 0u;
+            step_scratch[1] = 0u;
+        }
+        // ---- BARRIER 1 ----
+        // Fence: make the above clear-bitsets + scratch writes globally
+        // visible before peer blocks observe "barrier arrived".
+        __threadfence();
+        fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);
+        // =========================================================
+        // T9: TMA MULTICAST INPUT STAGING
+        //
+        // Issue a single cluster-scope async DMA to broadcast this
+        // timestep's input slice into s_input_tile across all 16 SMs
+        // in the cluster simultaneously.  On Hopper sm_90a,
+        // cg::memcpy_async with cluster scope maps to the TMA
+        // hardware unit (cp.async.bulk.tensor multicast), reducing
+        // DRAM input traffic by ~16× vs each block fetching its own
+        // copy from GMEM.
+        //
+        // The staging is gated on cfg.input_bits <= INPUT_BITS_MAX.
+        // If the tile is too small (custom large input_bits), we fall
+        // back to per-thread GMEM reads in Stage A (identical to the
+        // original path; use_input_tile==false).
+        //
+        // Ordering: BARRIER 1 completes before we issue the DMA.
+        // The DMA completes before Stage A reads s_input_tile.
+        // =========================================================
+        const bool use_input_tile = (cfg.input_bits <= INPUT_BITS_MAX);
+        if (use_input_tile) {
+            // Thread-block scope async copy: each SM independently loads
+            // its own input tile from GMEM into shared memory.
+            //
+            // NOTE: CUDA 12.1's cooperative_groups::memcpy_async() rejects
+            // cluster_group at compile time (static_assert in async.h:171).
+            // True TMA multicast (single DMA for all 16 SMs in the cluster)
+            // would require raw PTX cp.async.bulk.tensor with multicast mode,
+            // which needs cuTensorMap descriptors on the host side (T11).
+            //
+            // This per-SM path still gives a meaningful win: it converts
+            // the original per-synapse scattered GMEM reads (random access
+            // pattern hitting multiple cache lines) into one sequential DMA
+            // per SM, improving L2 hit rate and hardware prefetcher
+            // effectiveness.  The cluster.sync() below ensures all SMs in
+            // the cluster have finished loading before any SM enters Stage A.
+            auto tb = cg::this_thread_block();
+            cg::memcpy_async(tb, s_input_tile,
+                             inputs + inp_off,
+                             cfg.input_bits);
+            cg::wait(tb);
+            // Cluster barrier: all 16 SMs must have loaded their tile
+            // before any SM begins reading s_input_tile in Stage A.
+            cluster.sync();
+        }
+        // =========================================================
+        // STAGE A: Spatial Pooler
+        //
+        // Hot per-column state (boost, inhibition_threshold,
+        // active_duty) is served from cluster DSMEM rather than
+        // GMEM for each of the T timesteps.  GMEM is written on
+        // update so state persists across forward calls.
+        // =========================================================
+        for (unsigned int c = col_lo; c < col_hi; c++) {
+            unsigned int base = c * S;
+            unsigned int local = 0u;
+            for (unsigned int s = lane; s < S; s += 32u) {
+                unsigned int b = syn_bit[base + s];
+                float p = syn_perm[base + s];
+                // T9: read from cluster-broadcast tile when available;
+                // fall back to direct GMEM when input_bits > INPUT_BITS_MAX.
+                unsigned int inp_byte = use_input_tile
+                    ? (unsigned int)s_input_tile[b]
+                    : (unsigned int)inputs[inp_off + b];
+                unsigned int hit = ((inp_byte != 0u) && (p >= cfg.conn_thr)) ? 1u : 0u;
+                local += hit;
+            }
+            unsigned int overlap = warp_sum_u32(local);
+            overlap = __shfl_sync(0xffffffffu, overlap, 0);
+            // Determine which cluster block owns column c and read
+            // boost + threshold from that block's shared memory.
+            const unsigned int owner_block  = c / cols_per_block;
+            const unsigned int owner_offset = c - owner_block * cols_per_block;
+            float boost_val = cluster.map_shared_rank(s_boost,      owner_block)[owner_offset];
+            float thr       = cluster.map_shared_rank(s_inhib_thr,  owner_block)[owner_offset];
+            float boosted = (float)overlap * boost_val;
+            unsigned int is_active = (boosted > thr) ? 1u : 0u;
+            if (lane == 0) {
+                cols_out[col_base_out + c] = (unsigned char)is_active;
+                if (is_active) {
+                    atomicAdd(&step_scratch[0], 1u);
+                }
+            }
+            // SP learn (Hebbian) on active columns.
+            // T9: use tile for input reads here too.
+            if (cfg.learn && is_active) {
+                for (unsigned int s = lane; s < S; s += 32u) {
+                    unsigned int b = syn_bit[base + s];
+                    float p = syn_perm[base + s];
+                    unsigned int inp_byte = use_input_tile
+                        ? (unsigned int)s_input_tile[b]
+                        : (unsigned int)inputs[inp_off + b];
+                    if (inp_byte != 0u) {
+                        p += cfg.sp_inc;
+                        if (p > 1.0f) p = 1.0f;
+                    } else {
+                        p -= cfg.sp_dec;
+                        if (p < 0.0f) p = 0.0f;
+                    }
+                    syn_perm[base + s] = p;
+                }
+            }
+            // active_duty EMA + threshold adaptation.
+            // Writes go to both peer DSMEM (hot path for next timestep)
+            // and GMEM (persistence across forward calls).
+            if (lane == 0) {
+                float ad = cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset];
+                float sample = is_active ? 1.0f : 0.0f;
+                ad = (1.0f - cfg.duty_alpha) * ad + cfg.duty_alpha * sample;
+                // Writeback: peer smem (for next timestep read) + GMEM (persistence).
+                cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad;
+                active_duty[c] = ad;
+                // Threshold steers toward target sparsity.
+                float err = ad - cfg.sparsity_target;
+                float new_thr = thr + cfg.thr_adapt_rate * err * 100.0f;
+                if (new_thr < 0.1f) new_thr = 0.1f;
+                if (new_thr > 1000.0f) new_thr = 1000.0f;
+                // Writeback: peer smem (for next timestep read) + GMEM (persistence).
+                cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] = new_thr;
+                inhibition_threshold[c] = new_thr;
+            }
+        }
+        // ---- DSMEM WRITEBACK SYNC: peer-smem writes must be visible cluster-wide ----
+        //
+        // DATA FLOW PROOF (T-loop iteration invariant):
+        //
+        // WRITE SITES (lane==0 inside Stage A per-col loop):
+        //   Line 328: cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad
+        //   Line 338: cluster.map_shared_rank(s_inhib_thr,  owner_block)[owner_offset] = new_thr
+        //
+        // READ SITES (Stage A of the NEXT timestep t+1):
+        //   Line 290: cluster.map_shared_rank(s_boost,      owner_block)[owner_offset]  (read)
+        //   Line 291: cluster.map_shared_rank(s_inhib_thr,  owner_block)[owner_offset]  (read)
+        //   Line 323: cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset]  (read)
+        //
+        // PARTITION MISMATCH (root cause of T8 staleness):
+        //   cols_per_block = ceil(n_cols / cluster_sz)   [smem partition]
+        //   col_lo/col_hi  = floor(gwarp*n_cols/n_warps) [gwarp work partition]
+        //   These are NOT identical — up to 1 column can spill across partition boundaries.
+        //   Example: n_cols=1000, cluster_sz=16 → cols_per_block=63, block 1 col_lo=62
+        //   → block 1 processes column 62 but column 62 belongs to block 0's smem slice.
+        //   → block 1 issues a PEER WRITE to block 0's s_inhib_thr / s_active_duty.
+        //
+        // RACE WITHOUT SYNC:
+        //   Blocks run Stage A concurrently. Block 1 writes block 0's smem at column 62.
+        //   Block 0 may simultaneously READ s_inhib_thr[62] for its own column 62 in
+        //   Stage A of the same timestep → concurrent peer write + local read → undefined.
+        //   Additionally, without cluster.sync() after all peer writes complete, block 0's
+        //   t+1 Stage A reads might observe t-1 values still cached in its smem.
+        //
+        // FIX: cluster.sync() here, AFTER Stage A's per-column loop, ensures:
+        //   1. All peer smem writes from this timestep are globally visible to all blocks.
+        //   2. No block can enter Stage B (or start t+1 Stage A) with stale smem values.
+        //   3. GMEM writes (lines 329, 339) are already committed to L2; __threadfence()
+        //      below ensures they are visible to all SMs before the cluster barrier.
+        //
+        // ORDERING: write → cluster.sync() here → __threadfence() → cluster.sync() in
+        //           fused_grid_barrier → next-timestep reads.  Both visibility guarantees
+        //           are now satisfied.
+        cluster.sync();
+        // ---- BARRIER 2: SP active_mask must be visible before TM reads ----
+        // Fence: flush cols_out + active_duty + inhibition_threshold + step_scratch
+        // writes to global memory before peers advance past this barrier.
+        __threadfence();
+        fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);
+        // =========================================================
+        // STAGE B: Temporal Memory
+        // =========================================================
+        for (unsigned int c = col_lo; c < col_hi; c++) {
+            unsigned int col_active = cols_out[col_base_out + c];
+            if (col_active == 0u) continue;
+            unsigned int base_cell = c * cpc;
+            unsigned int any_predicted = 0u;
+            unsigned int best_seg_id_for_grow = 0xFFFFFFFFu;
+            unsigned int best_pot_count = 0u;
+            for (unsigned int k = 0u; k < cpc; k++) {
+                unsigned int cell = base_cell + k;
+                unsigned int n_segs_here = cell_seg_count[cell];
+                if (n_segs_here > MSC) n_segs_here = MSC;
+                if (n_segs_here == 0u) continue;
+                unsigned int seg_base_id = cell * MSC;
+                unsigned int cell_is_predictive = 0u;
+                for (unsigned int ls = 0u; ls < n_segs_here; ls++) {
+                    unsigned int seg = seg_base_id + ls;
+                    unsigned int n_syn = seg_syn_count[seg];
+                    if (n_syn == 0u) continue;
+                    unsigned int syn_base = seg * SPS;
+                    unsigned int l_conn = 0u;
+                    unsigned int l_pot  = 0u;
+                    for (unsigned int s = lane; s < n_syn; s += 32u) {
+                        unsigned int presyn = syn_presyn[syn_base + s];
+                        unsigned int w = prev_active[presyn >> 5];
+                        unsigned int bit = (w >> (presyn & 31u)) & 1u;
+                        if (bit) {
+                            l_pot += 1u;
+                            int p = (int)tm_syn_perm[syn_base + s];
+                            if (p >= cfg.conn_thr_i16) l_conn += 1u;
+                        }
+                    }
+                    unsigned int tot_conn = warp_sum_u32(l_conn);
+                    unsigned int tot_pot  = warp_sum_u32(l_pot);
+                    tot_conn = __shfl_sync(0xffffffffu, tot_conn, 0);
+                    tot_pot  = __shfl_sync(0xffffffffu, tot_pot, 0);
+                    if (tot_conn >= cfg.activation_threshold) cell_is_predictive = 1u;
+                    if (tot_pot >= cfg.learning_threshold && tot_pot > best_pot_count) {
+                        best_pot_count = tot_pot;
+                        best_seg_id_for_grow = seg;
+                    }
+                    // Reinforce predicted-and-correct segment.
+                    if (cfg.learn && tot_conn >= cfg.activation_threshold) {
+                        for (unsigned int s = lane; s < n_syn; s += 32u) {
+                            unsigned int presyn = syn_presyn[syn_base + s];
+                            unsigned int w = prev_active[presyn >> 5];
+                            unsigned int bit = (w >> (presyn & 31u)) & 1u;
+                            int p = (int)tm_syn_perm[syn_base + s];
+                            if (bit) {
+                                int np = p + cfg.perm_inc_i16;
+                                if (np > 32767) np = 32767;
+                                tm_syn_perm[syn_base + s] = (short)np;
+                            } else {
+                                int np = p - cfg.perm_dec_i16;
+                                if (np < 0) np = 0;
+                                tm_syn_perm[syn_base + s] = (short)np;
+                            }
+                        }
+                    }
+                }
+                if (cell_is_predictive) {
+                    any_predicted = 1u;
+                    if (lane == 0) {
+                        unsigned int w = cell >> 5;
+                        unsigned int m = 1u << (cell & 31u);
+                        atomicOr(&curr_active[w], m);
+                        atomicOr(&curr_winner[w], m);
+                    }
+                }
+            }
+            // BURST if no predicted.
+            if (!any_predicted) {
+                if (lane == 0) {
+                    for (unsigned int k = 0u; k < cpc; k++) {
+                        unsigned int cell = base_cell + k;
+                        unsigned int w = cell >> 5;
+                        unsigned int m = 1u << (cell & 31u);
+                        atomicOr(&curr_active[w], m);
+                    }
+                    unsigned int win = base_cell;
+                    unsigned int ww = win >> 5;
+                    unsigned int wm = 1u << (win & 31u);
+                    atomicOr(&curr_winner[ww], wm);
+                    atomicAdd(&step_scratch[1], 1u);
+                }
+                if (cfg.learn) {
+                    unsigned int target_seg;
+                    unsigned int existing_syn;
+                    if (best_seg_id_for_grow != 0xFFFFFFFFu) {
+                        // Reuse best matching segment.
+                        target_seg = best_seg_id_for_grow;
+                        existing_syn = seg_syn_count[target_seg];
+                        target_seg = __shfl_sync(0xffffffffu, target_seg, 0);
+                        existing_syn = __shfl_sync(0xffffffffu, existing_syn, 0);
+                        // Reinforce its existing synapses.
+                        unsigned int syn_base = target_seg * SPS;
+                        for (unsigned int s = lane; s < existing_syn; s += 32u) {
+                            unsigned int presyn = syn_presyn[syn_base + s];
+                            unsigned int w = prev_active[presyn >> 5];
+                            unsigned int bit = (w >> (presyn & 31u)) & 1u;
+                            int p = (int)tm_syn_perm[syn_base + s];
+                            if (bit) {
+                                int np = p + cfg.perm_inc_i16;
+                                if (np > 32767) np = 32767;
+                                tm_syn_perm[syn_base + s] = (short)np;
+                            } else {
+                                int np = p - cfg.perm_dec_i16;
+                                if (np < 0) np = 0;
+                                tm_syn_perm[syn_base + s] = (short)np;
+                            }
+                        }
+                    } else {
+                        // Allocate new segment on winner cell (cell 0 of col).
+                        unsigned int new_seg = 0u;
+                        if (lane == 0) {
+                            unsigned int winner_cell = base_cell;
+                            unsigned int slot = atomicAdd(&cell_seg_count[winner_cell], 1u);
+                            if (slot >= MSC) slot = slot % MSC;
+                            new_seg = winner_cell * MSC + slot;
+                            seg_cell_id[new_seg] = winner_cell;
+                            seg_syn_count[new_seg] = 0u;
+                        }
+                        target_seg = __shfl_sync(0xffffffffu, new_seg, 0);
+                        existing_syn = 0u;
+                    }
+                    // Grow synapses to prev_winner cells — lane 0 serialized.
+                    unsigned int room = (SPS > existing_syn) ? (SPS - existing_syn) : 0u;
+                    unsigned int max_grow = (cfg.max_new_synapses < room) ? cfg.max_new_synapses : room;
+                    if (lane == 0 && max_grow > 0u) {
+                        unsigned int syn_base = target_seg * SPS;
+                        unsigned int grown = 0u;
+                        unsigned int start_off = (c * 2654435761u + cfg.iter_seed + t) % cfg.bits_words;
+                        for (unsigned int w_off = 0u;
+                             w_off < cfg.bits_words && grown < max_grow;
+                             w_off++) {
+                            unsigned int widx = (start_off + w_off) % cfg.bits_words;
+                            unsigned int word = prev_winner[widx];
+                            while (word != 0u && grown < max_grow) {
+                                unsigned int bit_pos = __ffs(word) - 1u;
+                                word &= ~(1u << bit_pos);
+                                unsigned int cell_id = widx * 32u + bit_pos;
+                                if (cell_id >= cfg.n_cells) continue;
+                                bool exists = false;
+                                for (unsigned int es = 0u; es < existing_syn + grown; es++) {
+                                    if (syn_presyn[syn_base + es] == cell_id) { exists = true; break; }
+                                }
+                                if (exists) continue;
+                                unsigned int write_idx = existing_syn + grown;
+                                if (write_idx >= SPS) break;
+                                syn_presyn[syn_base + write_idx] = cell_id;
+                                tm_syn_perm[syn_base + write_idx] = (short)cfg.initial_perm_i16;
+                                grown++;
+                            }
+                        }
+                        if (grown > 0u) {
+                            seg_syn_count[target_seg] = existing_syn + grown;
+                        }
+                    }
+                }
+            }
+        }
+        // ---- BARRIER 3: TM writes complete before anomaly + next-step read ----
+        // Fence: flush curr_active/curr_winner bitsets + tm_syn_perm +
+        // seg_syn_count + syn_presyn before peers advance and consume them as
+        // prev_active/prev_winner at t+1.
+        __threadfence();
+        fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);
+        // Write anomaly for step t.
+        if (blockIdx.x == 0u && tid == 0u) {
+            unsigned int total = step_scratch[0];
+            unsigned int bad   = step_scratch[1];
+            float anom = (total > 0u) ? ((float)bad / (float)total) : 0.0f;
+            anom_out[t] = anom;
+        }
+    }
+}
+// Single-region kernel (legacy call site).
+//
+// Takes the pointer bundle `P` and the configuration `cfg` by value and
+// forwards both unchanged to htm_fused_step_body(); the kernel adds no logic
+// of its own.
+__global__
+void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
+    htm_fused_step_body(P, cfg);
+}
+// Batched kernel: one cooperative launch for B regions. grid.y = B,
+// grid.x = per-region block count. Each block reads its region's
+// FusedPtrs from the device array via blockIdx.y.
+//
+// P_arr : array of per-region pointer bundles, indexed by blockIdx.y.
+//         NOTE(review): presumably holds at least gridDim.y entries — the
+//         kernel does not bounds-check the index; confirm at the launch site.
+// cfg   : a single configuration, passed by value and handed to every region.
+__global__
+void htm_fused_step_batched(const FusedPtrs* __restrict__ P_arr, FusedConfig cfg) {
+    // Copy this block's bundle into a local before calling the shared body.
+    const FusedPtrs P = P_arr[blockIdx.y];
+    htm_fused_step_body(P, cfg);
+}
+} // extern "C"

overlay/htm_rust/src/gpu/kernels/sp_boost_fused.cu ADDED Viewed

	@@ -0,0 +1,59 @@

+// Boost refresh from duty cycles, fused with the mean reduction it needs.
+//
+// Reads:
+//   active_duty[n]  (f32)
+//   boost_strength  (f32)
+//
+// Writes:
+//   boost[i] = expf(-boost_strength * (active_duty[i] - mean(active_duty)))
+//
+// Launch (per the original note): a single block of 1024 threads, i.e. two
+// elements per thread at n=2048. Threads index only by threadIdx.x and share
+// the mean through block-local memory. Dynamic shared memory must provide one
+// float per warp of the block, and the full-mask shuffles expect complete warps.
+extern "C" __global__
+void sp_boost_from_duty(
+    const float * __restrict__ active_duty,  // (n,) duty cycles, read-only
+    float       * __restrict__ boost,        // (n,) boost factors, overwritten
+    float         boost_strength,
+    unsigned int  n
+) {
+    extern __shared__ float smem_raw[];
+    float * const warp_partials = smem_raw;  // one slot per warp
+    __shared__ float mean_s;                 // block-wide mean of active_duty
+    const unsigned int thread  = threadIdx.x;
+    const unsigned int stride  = blockDim.x;
+    const unsigned int lane_id = thread & 31;
+    const unsigned int warp_id = thread >> 5;
+    // Stage 1a: each thread accumulates its strided share of active_duty.
+    float partial = 0.0f;
+    for (unsigned int idx = thread; idx < n; idx += stride) {
+        partial += active_duty[idx];
+    }
+    // Stage 1b: fold the lanes of each warp; lane 0 ends up with the warp total.
+    for (int delta = 16; delta >= 1; delta /= 2) {
+        partial += __shfl_down_sync(0xffffffff, partial, delta);
+    }
+    if (lane_id == 0) {
+        warp_partials[warp_id] = partial;
+    }
+    __syncthreads();
+    // Stage 1c: the first warp folds the per-warp totals and thread 0
+    // publishes the mean for the whole block.
+    if (warp_id == 0) {
+        const unsigned int warp_count = (stride + 31) / 32;
+        float total = 0.0f;
+        if (lane_id < warp_count) {
+            total = warp_partials[lane_id];
+        }
+        for (int delta = 16; delta >= 1; delta /= 2) {
+            total += __shfl_down_sync(0xffffffff, total, delta);
+        }
+        if (thread == 0) {
+            mean_s = total / (float)n;
+        }
+    }
+    __syncthreads();
+    // Stage 2: every thread rewrites its strided share of boost[] from the
+    // deviation of each duty cycle against the shared mean.
+    const float mean = mean_s;
+    for (unsigned int idx = thread; idx < n; idx += stride) {
+        const float deviation = active_duty[idx] - mean;
+        boost[idx] = expf(-boost_strength * deviation);
+    }
+}

overlay/htm_rust/src/gpu/kernels/sp_duty.cu ADDED Viewed

	@@ -0,0 +1,45 @@

+// Per-column duty-cycle EMA update, with an optional boost refresh.
+//
+// One thread per column c:
+//   active_duty[c]  <- (1-alpha) * active_duty[c]  + alpha * [active_mask[c] != 0]
+//   overlap_duty[c] <- (1-alpha) * overlap_duty[c] + alpha * [raw_overlap[c] >= stim_thr]
+//
+// When learn_flag is non-zero and boost_strength > 0:
+//   boost[c] <- exp(-boost_strength * (active_duty[c] - mean_duty))
+// using the freshly updated active_duty[c]. mean_duty is supplied by the
+// caller (original note: computed on the host with one reduction).
+extern "C" __global__
+void sp_duty_update(
+    const unsigned char * __restrict__ active_mask,  // (n_columns,)
+    const unsigned int  * __restrict__ raw_overlap,  // (n_columns,)
+    float               * __restrict__ active_duty,  // (n_columns,) in-place
+    float               * __restrict__ overlap_duty, // (n_columns,) in-place
+    float               * __restrict__ boost,        // (n_columns,) in-place
+    float                 alpha,
+    float                 stim_thr,
+    float                 boost_strength,            // 0 to skip boost
+    float                 mean_duty,
+    unsigned int          learn_flag,                // 0 or 1
+    unsigned int          n_columns
+) {
+    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
+    // Threads past the last column have nothing to do.
+    if (col < n_columns) {
+        // This step's binary samples for the two moving averages.
+        const float was_active  = (active_mask[col] != 0) ? 1.0f : 0.0f;
+        const float had_overlap = ((float)raw_overlap[col] >= stim_thr) ? 1.0f : 0.0f;
+        const float new_active_duty  = (1.0f - alpha) * active_duty[col]  + alpha * was_active;
+        const float new_overlap_duty = (1.0f - alpha) * overlap_duty[col] + alpha * had_overlap;
+        active_duty[col]  = new_active_duty;
+        overlap_duty[col] = new_overlap_duty;
+        // Boost is only touched while learning with a positive strength.
+        const bool refresh_boost = (learn_flag != 0u) && (boost_strength > 0.0f);
+        if (refresh_boost) {
+            boost[col] = expf(-boost_strength * (new_active_duty - mean_duty));
+        }
+    }
+}

overlay/htm_rust/src/gpu/kernels/sp_learn.cu ADDED Viewed

	@@ -0,0 +1,45 @@

+// Hebbian permanence update for the spatial pooler.
+//
+// Block c handles column c. If active_mask[c] is zero the whole block exits;
+// otherwise its threads stride over the column's synapses and, for synapse s:
+//   input bit syn_bit[c*S + s] set    -> perm += inc, capped at 1
+//   input bit clear                   -> perm -= dec, floored at 0
+//
+// Launch (per the original note): one block per column (2048 blocks);
+// inactive columns are filtered here rather than by launching only winners.
+// The original note states this mirrors the CPU reference in src/sp.rs
+// (lines 157-169).
+extern "C" __global__
+void sp_learn(
+    const unsigned char * __restrict__ active_mask,  // (n_columns,) 0/1
+    const unsigned char * __restrict__ inp,          // (input_bits,)
+    const unsigned int  * __restrict__ syn_bit,      // (n_columns * S,)
+    float               * __restrict__ syn_perm,     // (n_columns * S,) in-place
+    float                 inc,
+    float                 dec,
+    unsigned int          synapses_per_col,
+    unsigned int          n_columns
+) {
+    const unsigned int col = blockIdx.x;
+    // Short-circuit keeps active_mask from being read for out-of-range blocks.
+    const bool column_learns = (col < n_columns) && (active_mask[col] != 0);
+    if (!column_learns) return;
+    const unsigned int first = col * synapses_per_col;
+    for (unsigned int s = threadIdx.x; s < synapses_per_col; s += blockDim.x) {
+        const unsigned int slot = first + s;
+        const bool input_on = inp[syn_bit[slot]] != 0;
+        float perm = syn_perm[slot];
+        if (input_on) {
+            // Strengthen; only the upper bound is enforced on this path.
+            perm += inc;
+            perm = (perm > 1.0f) ? 1.0f : perm;
+        } else {
+            // Weaken; only the lower bound is enforced on this path.
+            perm -= dec;
+            perm = (perm < 0.0f) ? 0.0f : perm;
+        }
+        syn_perm[slot] = perm;
+    }
+}

overlay/htm_rust/src/gpu/kernels/sp_overlap.cu ADDED Viewed

	@@ -0,0 +1,78 @@

+// Spatial-pooler overlap scoring.
+//
+// Block c scores column c:
+//   raw_out[c]     = #{ s : inp[syn_bit[c*S + s]] != 0 and syn_perm[c*S + s] >= conn_thr }
+//   boosted_out[c] = raw_out[c] * boost[c]
+// raw_out is emitted next to the boosted score (original note: so the host
+// can drive the duty-cycle update).
+//
+// Flat layout, each column owning a contiguous run of S = synapses_per_col
+// entries:
+//   syn_bit[c * S + s]   : u32 index into the input SDR
+//   syn_perm[c * S + s]  : f32 permanence in [0, 1]
+//   boost[c]             : f32
+//   inp[b]               : u8, any non-zero value counts as active
+//
+// Launch (per the original note):
+//   grid  = n_columns
+//   block = 128 (or 256)
+//
+// The reduction is two-level: a shuffle reduction inside each warp, then the
+// per-warp counts are staged in a 32-entry shared array and folded by warp 0.
+// That staging array caps the block at 32 warps, and the full-mask shuffles
+// expect complete warps.
+extern "C" __global__
+void sp_overlap(
+    const unsigned char * __restrict__ inp,     // (input_bits,)
+    const unsigned int  * __restrict__ syn_bit, // (n_columns * S,)
+    const float         * __restrict__ syn_perm,// (n_columns * S,)
+    const float         * __restrict__ boost,   // (n_columns,)
+    float                 conn_thr,
+    unsigned int          synapses_per_col,     // S
+    unsigned int          n_columns,
+    unsigned int        * __restrict__ raw_out,     // (n_columns,)
+    float               * __restrict__ boosted_out  // (n_columns,)
+) {
+    const unsigned int col = blockIdx.x;
+    // The exit is uniform across the block, so no thread is left at a barrier.
+    if (col >= n_columns) return;
+    __shared__ unsigned int warp_counts[32];
+    const unsigned int thread  = threadIdx.x;
+    const unsigned int threads = blockDim.x;
+    const unsigned int lane_id = thread & 31;
+    const unsigned int warp_id = thread >> 5;
+    const unsigned int first   = col * synapses_per_col;
+    // Each thread counts hits over its strided share of the column's synapses.
+    unsigned int hits = 0;
+    for (unsigned int s = thread; s < synapses_per_col; s += threads) {
+        const unsigned int slot = first + s;
+        const bool input_on  = inp[syn_bit[slot]] != 0;
+        const bool connected = syn_perm[slot] >= conn_thr;
+        if (input_on && connected) {
+            hits += 1u;
+        }
+    }
+    // Level 1: fold the lanes of each warp into lane 0.
+    for (int delta = 16; delta >= 1; delta /= 2) {
+        hits += __shfl_down_sync(0xffffffff, hits, delta);
+    }
+    if (lane_id == 0) {
+        warp_counts[warp_id] = hits;
+    }
+    __syncthreads();
+    // Level 2: only the first warp continues past this point.
+    if (warp_id != 0) return;
+    const unsigned int warp_count = (threads + 31) / 32;
+    unsigned int total = (thread < warp_count) ? warp_counts[lane_id] : 0;
+    for (int delta = 16; delta >= 1; delta /= 2) {
+        total += __shfl_down_sync(0xffffffff, total, delta);
+    }
+    if (thread == 0) {
+        raw_out[col] = total;
+        boosted_out[col] = (float)total * boost[col];
+    }
+}

overlay/htm_rust/src/gpu/kernels/sp_topk.cu ADDED Viewed

	@@ -0,0 +1,117 @@

+// Top-K column selection.
+//
+// Inputs:
+//   scores[n_columns] : f32 score
+//   k                 : number of winners requested
+// Output:
+//   active_out[n_columns] : u8 0/1 mask with min(k, n_columns) ones, provided
+//                           no score is NaN (a NaN score is never selected).
+//
+// Tie-breaking: when scores are equal, the LOWER column index wins (matches
+// CPU reference `select_nth_unstable_by` with secondary index comparator).
+//
+// Algorithm: single-block iterative arg-max (selection-sort style: find max →
+// mark → retire → repeat). Each round finds the best remaining (score, index)
+// pair with a warp-shuffle reduction followed by a shared-memory reduction
+// across warps, marks it in active_out and retires it from the working copy.
+// At n=2048, k=41 this is 41 passes over 2048 scores (~84K comparisons).
+//
+// Retired entries are overwritten with NaN rather than -INFINITY. NaN fails
+// both the `>` and the `==` comparison, so a retired entry can never win
+// again. A -INFINITY tombstone would tie with a genuine -INFINITY score and
+// let a lower-indexed, already-selected column win a second time, leaving
+// fewer than k ones in the mask.
+//
+// Launch: single block. Dynamic shared memory must hold n_columns floats (the
+// working copy of the scores). blockDim.x is expected to be a multiple of 32
+// (full-mask shuffles) and at most 1024 (32 per-warp scratch slots).
+extern "C" __global__
+void sp_topk_select(
+    const float * __restrict__ scores,    // (n_columns,)
+    unsigned int  n_columns,
+    unsigned int  k,
+    unsigned char * __restrict__ active_out  // (n_columns,)
+) {
+    // Dynamic shared memory: working copy of the scores, n_columns floats.
+    extern __shared__ float smem[];
+    float * work = smem;
+    // Static scratch: one (score, index) candidate per warp + the round winner.
+    __shared__ float warp_s[32];
+    __shared__ int   warp_i[32];
+    __shared__ int   winner_idx;
+    const unsigned int tid  = threadIdx.x;
+    const unsigned int bsz  = blockDim.x;
+    const unsigned int lane = tid & 31;
+    const unsigned int warp = tid >> 5;
+    const unsigned int mask = 0xffffffff;
+    // Tombstone for retired entries: a NaN bit pattern (never compares > or ==).
+    const float retired = __int_as_float(0x7fffffff);
+    // There are at most n_columns distinct winners; extra rounds would be wasted.
+    const unsigned int rounds = (k < n_columns) ? k : n_columns;
+    // Load scores into shared; also init active_out = 0.
+    for (unsigned int i = tid; i < n_columns; i += bsz) {
+        work[i] = scores[i];
+        active_out[i] = 0;
+    }
+    __syncthreads();
+    for (unsigned int iter = 0; iter < rounds; ++iter) {
+        // Per-thread scan: (max score, lowest index on ties).
+        float best_s = -INFINITY;
+        int   best_i = (int)n_columns;   // sentinel larger than any index
+        for (unsigned int i = tid; i < n_columns; i += bsz) {
+            float s = work[i];
+            if (s > best_s || (s == best_s && (int)i < best_i)) {
+                best_s = s;
+                best_i = (int)i;
+            }
+        }
+        // Warp reduction over (score, idx) pairs, same ordering rule.
+        for (int off = 16; off > 0; off >>= 1) {
+            float os = __shfl_down_sync(mask, best_s, off);
+            int   oi = __shfl_down_sync(mask, best_i, off);
+            if (os > best_s || (os == best_s && oi < best_i)) {
+                best_s = os;
+                best_i = oi;
+            }
+        }
+        // Lane 0 of every warp publishes its candidate for warp 0 to reduce.
+        if (lane == 0) {
+            warp_s[warp] = best_s;
+            warp_i[warp] = best_i;
+        }
+        __syncthreads();
+        if (warp == 0) {
+            unsigned int nwarps = (bsz + 31) / 32;
+            float s = (lane < nwarps) ? warp_s[lane] : -INFINITY;
+            int   i = (lane < nwarps) ? warp_i[lane] : (int)n_columns;
+            for (int off = 16; off > 0; off >>= 1) {
+                float os = __shfl_down_sync(mask, s, off);
+                int   oi = __shfl_down_sync(mask, i, off);
+                if (os > s || (os == s && oi < i)) {
+                    s = os;
+                    i = oi;
+                }
+            }
+            if (tid == 0) {
+                winner_idx = i;
+            }
+        }
+        __syncthreads();
+        if (tid == 0) {
+            // The sentinel index survives only when no selectable entry
+            // remained (e.g. every remaining score is NaN); skip the write then.
+            if (winner_idx < (int)n_columns) {
+                active_out[winner_idx] = 1;
+                work[winner_idx] = retired;
+            }
+        }
+        // The retirement must be visible before the next round's scan.
+        __syncthreads();
+    }
+}

overlay/htm_rust/src/gpu/kernels/tm_activate.cu ADDED Viewed

	@@ -0,0 +1,66 @@

+// TM activate kernel. See tm_predict.cu for TmConfig.
+//
+// Configuration block passed by value to tm_activate. Of these fields,
+// tm_activate itself reads only `n_cols` and `cells_per_column`.
+// NOTE(review): the struct is declared here even though the line above points
+// at tm_predict.cu; presumably field order and types must stay identical to
+// that copy and to the host-side layout — confirm before editing any field.
+struct TmConfig {
+    unsigned int activation_threshold;
+    unsigned int learning_threshold;
+    unsigned int cells_per_column;   // column `col` owns cells starting at col * cells_per_column
+    unsigned int synapses_per_segment;
+    unsigned int n_segments;
+    unsigned int n_cells;
+    unsigned int max_segments_per_cell;
+    unsigned int max_new_synapses;
+    // NOTE(review): the `_i16` fields are stored as int; presumably they carry
+    // values on a 16-bit permanence scale — confirm against the host code.
+    int conn_thr_i16;
+    int perm_inc_i16;
+    int perm_dec_i16;
+    int predicted_seg_dec_i16;
+    int initial_perm_i16;
+    unsigned int iter_seed;
+    unsigned int n_cols;             // threads with col >= n_cols exit immediately in tm_activate
+    unsigned int bits_words;
+};
+// Cell activation for the temporal memory; one thread per column.
+//
+// Columns with sp_active_mask[col] == 0 are skipped. For an active column:
+//   * col_predicted[col] != 0: every cell of the column whose bit is set in
+//     cell_predictive_bits is OR-ed into cell_active_bits and cell_winner_bits.
+//   * otherwise (burst): *unpredicted_count is incremented, all cells of the
+//     column are OR-ed into cell_active_bits, the column's first cell is OR-ed
+//     into cell_winner_bits, and col is stored in burst_cols_flat at the slot
+//     returned by atomically incrementing *burst_cols_count.
+// Cell bitsets pack 32 cells per unsigned int (word = cell >> 5,
+// bit = cell & 31).
+// NOTE(review): burst_cols_flat is written without a bounds check; presumably
+// it holds n_cols entries and the counter is reset per step — confirm on the
+// host side.
+extern "C" __global__
+void tm_activate(
+    const unsigned char * __restrict__ sp_active_mask,
+    const unsigned char * __restrict__ col_predicted,
+    const unsigned int  * __restrict__ cell_predictive_bits,
+    unsigned int        * __restrict__ cell_active_bits,
+    unsigned int        * __restrict__ cell_winner_bits,
+    unsigned int        * __restrict__ unpredicted_count,
+    unsigned int        * __restrict__ burst_cols_flat,
+    unsigned int        * __restrict__ burst_cols_count,
+    TmConfig              cfg
+) {
+    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
+    // Out-of-range threads and inactive columns do nothing.
+    if (col >= cfg.n_cols || sp_active_mask[col] == 0) return;
+    const unsigned int cpc        = cfg.cells_per_column;
+    const unsigned int first_cell = col * cpc;
+    if (!col_predicted[col]) {
+        // Burst path: count the miss, activate the whole column, pick cell 0
+        // of the column as winner and record the column for later stages.
+        atomicAdd(unpredicted_count, 1u);
+        for (unsigned int k = 0; k < cpc; k++) {
+            const unsigned int cell = first_cell + k;
+            atomicOr(&cell_active_bits[cell >> 5], 1u << (cell & 31u));
+        }
+        atomicOr(&cell_winner_bits[first_cell >> 5], 1u << (first_cell & 31u));
+        const unsigned int slot = atomicAdd(burst_cols_count, 1u);
+        burst_cols_flat[slot] = col;
+        return;
+    }
+    // Predicted path: only the cells flagged predictive become active winners.
+    for (unsigned int k = 0; k < cpc; k++) {
+        const unsigned int cell = first_cell + k;
+        const unsigned int word = cell >> 5;
+        const unsigned int bit  = 1u << (cell & 31u);
+        if ((cell_predictive_bits[word] & bit) == 0u) continue;
+        atomicOr(&cell_active_bits[word], bit);
+        atomicOr(&cell_winner_bits[word], bit);
+    }
+}

overlay/htm_rust/src/gpu/kernels/tm_anomaly.cu ADDED Viewed

	@@ -0,0 +1,43 @@

+// TM anomaly kernel.
+//
+// Computes:
+//   n_active = sum of sp_active_mask
+//   anomaly  = unpredicted_count / n_active   (if n_active > 0)
+//            = 0                              (else)
+//
+// Launch: single block, 256 threads.
+// Writes anomaly_out[t_slot] = unpredicted_count[0] / (number of non-zero
+// entries in sp_active_mask[0..n_cols)), or 0 when no column is active.
+// Each thread counts a strided share of the mask, warps reduce with shuffles,
+// and warp leaders accumulate into one shared counter.
+extern "C" __global__
+void tm_anomaly(
+    const unsigned char * __restrict__ sp_active_mask,
+    const unsigned int  * __restrict__ unpredicted_count,
+    float               * __restrict__ anomaly_out,       // (1,) or (t_slot,)
+    unsigned int          t_slot,
+    unsigned int          n_cols
+) {
+    __shared__ unsigned int block_active_total;
+    const unsigned int lane_in_block = threadIdx.x;
+    if (lane_in_block == 0) block_active_total = 0u;
+    __syncthreads();
+    // Per-thread partial count over a block-strided walk of the mask.
+    unsigned int partial = 0u;
+    unsigned int idx = lane_in_block;
+    while (idx < n_cols) {
+        if (sp_active_mask[idx]) partial += 1u;
+        idx += blockDim.x;
+    }
+    // Fold the 32 lanes of each warp; lane 0 of the warp ends up with the sum.
+    for (int delta = 16; delta >= 1; delta /= 2) {
+        partial += __shfl_down_sync(0xffffffffu, partial, delta);
+    }
+    if ((lane_in_block % 32u) == 0u) {
+        atomicAdd(&block_active_total, partial);
+    }
+    __syncthreads();
+    if (lane_in_block != 0) return;
+    const unsigned int n_active = block_active_total;
+    const unsigned int n_unpredicted = unpredicted_count[0];
+    float score = 0.0f;
+    if (n_active > 0u) {
+        score = (float)n_unpredicted / (float)n_active;
+    }
+    anomaly_out[t_slot] = score;
+}

overlay/htm_rust/src/gpu/kernels/tm_grow.cu ADDED Viewed

	@@ -0,0 +1,155 @@

+// TM grow+reinforce kernel.
+//
+// For each bursting column:
+//   If col_best_match[col] is non-zero (i.e. at least one matching segment
+//     with num_active_potential >= learning_threshold exists on cells in this col):
+//     Target = that matching segment.
+//     Reinforce its existing synapses: +inc if presyn in prev_active, -dec otherwise.
+//     Grow up to min(max_new_synapses, synapses_per_segment - current_syn_count) additional synapses to prev_winners.
+//   Else:
+//     Allocate a fresh segment slot on winner cell (cell 0 of col).
+//     Grow up to max_new synapses to prev_winners (no reinforce needed — new seg).
+//
+// This mirrors the CPU TM burst logic.
+// Temporal-memory parameters passed by value to the TM kernels.
+// NOTE(review): presumably mirrors a host-side struct with the same field order
+// and widths — confirm before reordering or resizing anything here.
+struct TmConfig {
+    unsigned int activation_threshold;   // min active connected synapses for a segment to count as active
+    unsigned int learning_threshold;     // min active potential synapses for a segment to count as matching
+    unsigned int cells_per_column;       // cell index = col * cells_per_column + k
+    unsigned int synapses_per_segment;   // synapse slots per segment; stride of syn_presyn / syn_perm
+    unsigned int n_segments;             // presumably total segment slots — not read by this kernel; TODO confirm
+    unsigned int n_cells;                // total cell count (upper bound for cell indices)
+    unsigned int max_segments_per_cell;  // segment id = cell * max_segments_per_cell + slot
+    unsigned int max_new_synapses;       // cap on synapses grown onto one segment per tm_grow call
+    int conn_thr_i16;                    // permanence >= this counts as connected
+    int perm_inc_i16;                    // added to permanence when the presynaptic cell was active
+    int perm_dec_i16;                    // subtracted from permanence when it was not
+    int predicted_seg_dec_i16;           // punishment decrement (used by the punish kernel)
+    int initial_perm_i16;                // permanence given to a freshly grown synapse
+    unsigned int iter_seed;              // mixed into the per-block scan start offset in tm_grow
+    unsigned int n_cols;                 // number of columns
+    unsigned int bits_words;             // number of 32-bit words in each per-cell bitset
+};
+// Grow/reinforce kernel: one block per bursting column (block b handles
+// burst_cols_flat[b]); blocks past burst_cols_count[0] exit immediately.
+//
+// Launch assumption: 32 threads per block — both strided loops step by 32.
+//
+// Thread 0 picks the target segment:
+//   - col_best_match[col] != 0: reuse the segment id stored in its low 21 bits;
+//   - otherwise: claim a slot on cell 0 of the column (wrapping modulo
+//     max_segments_per_cell once the cell is full) and reset its synapse count.
+// Phase 1 (reuse only) reinforces existing synapses against prev_active_bits.
+// Phase 2 scans prev_winner_bits, starting at a per-block pseudo-random word,
+// and appends up to min(max_new_synapses, free slots) new synapses.
+// Finally thread 0 publishes the new synapse count.
+extern "C" __global__
+void tm_grow(
+    unsigned int       * __restrict__ seg_cell_id,
+    unsigned int       * __restrict__ seg_syn_count,
+    unsigned int       * __restrict__ syn_presyn,
+    short              * __restrict__ syn_perm,
+    unsigned int       * __restrict__ cell_seg_count,
+    const unsigned int * __restrict__ burst_cols_flat,
+    const unsigned int * __restrict__ burst_cols_count,
+    const unsigned int * __restrict__ prev_winner_bits,
+    const unsigned int * __restrict__ prev_active_bits,
+    const unsigned int * __restrict__ col_best_match,
+    TmConfig             cfg
+) {
+    const unsigned int b = blockIdx.x;
+    const unsigned int n_burst_cols = burst_cols_count[0];
+    if (b >= n_burst_cols) return;
+    const unsigned int tid = threadIdx.x;
+    const unsigned int col = burst_cols_flat[b];
+    __shared__ unsigned int shared_seg_id;
+    __shared__ unsigned int shared_existing_syn_count;
+    __shared__ unsigned int shared_grown;
+    __shared__ unsigned int shared_is_new;
+    __shared__ unsigned int shared_start_offset;
+    if (tid == 0) {
+        unsigned int match_key = col_best_match[col];
+        if (match_key != 0u) {
+            // Reuse matching segment (low 21 bits of the key hold its id).
+            unsigned int seg_id = match_key & 0x1FFFFFu;
+            shared_seg_id = seg_id;
+            shared_existing_syn_count = seg_syn_count[seg_id];
+            shared_is_new = 0u;
+        } else {
+            // Allocate new segment on winner cell (cell 0 of col).
+            unsigned int winner_cell = col * cfg.cells_per_column;
+            unsigned int slot = atomicAdd(&cell_seg_count[winner_cell], 1u);
+            if (slot >= cfg.max_segments_per_cell) {
+                // Cell is full: recycle a slot round-robin.
+                slot = slot % cfg.max_segments_per_cell;
+            }
+            unsigned int seg_id = winner_cell * cfg.max_segments_per_cell + slot;
+            seg_cell_id[seg_id] = winner_cell;
+            seg_syn_count[seg_id] = 0;
+            shared_seg_id = seg_id;
+            shared_existing_syn_count = 0u;
+            shared_is_new = 1u;
+        }
+        shared_grown = 0u;
+        // Per-block start word so different columns sample different winners.
+        // Guarded so an empty bitset never triggers a modulo by zero.
+        shared_start_offset = (cfg.bits_words != 0u)
+            ? ((b * 2654435761u + cfg.iter_seed) % cfg.bits_words) : 0u;
+    }
+    __syncthreads();
+    const unsigned int seg_id = shared_seg_id;
+    const unsigned int seg_base = seg_id * cfg.synapses_per_segment;
+    const unsigned int existing_syn = shared_existing_syn_count;
+    const unsigned int is_new = shared_is_new;
+    const unsigned int start = shared_start_offset;
+    // PHASE 1: If reusing, reinforce existing synapses.
+    // `is_new` is block-uniform, so the barrier inside the branch is safe.
+    if (!is_new) {
+        for (unsigned int s = tid; s < existing_syn; s += 32u) {
+            unsigned int presyn = syn_presyn[seg_base + s];
+            unsigned int word = prev_active_bits[presyn >> 5];
+            unsigned int bit = (word >> (presyn & 31u)) & 1u;
+            int p = (int)syn_perm[seg_base + s];
+            if (bit) {
+                int np = p + cfg.perm_inc_i16;
+                if (np > 32767) np = 32767;
+                syn_perm[seg_base + s] = (short)np;
+            } else {
+                int np = p - cfg.perm_dec_i16;
+                if (np < 0) np = 0;
+                syn_perm[seg_base + s] = (short)np;
+            }
+        }
+        __syncthreads();
+    }
+    // PHASE 2: Grow up to `max_new_synapses` (or room) synapses to prev_winners
+    // that aren't already presynaptic to this segment.
+    const unsigned int room = (cfg.synapses_per_segment > existing_syn)
+        ? (cfg.synapses_per_segment - existing_syn) : 0u;
+    const unsigned int max_grow = (cfg.max_new_synapses < room) ? cfg.max_new_synapses : room;
+    for (unsigned int w_off = 0; w_off < cfg.bits_words; w_off += 32u) {
+        if (shared_grown >= max_grow) break;
+        // Each lane owns relative word `w_off + tid`. Lanes past the end of the
+        // bitset must stop instead of wrapping: wrapping would rescan words that
+        // another lane already handled and grow duplicate synapses to the same
+        // cell (the `exists` scan below only covers pre-existing synapses).
+        const unsigned int rel = w_off + tid;
+        if (rel >= cfg.bits_words) break;
+        unsigned int widx = (start + rel) % cfg.bits_words;
+        unsigned int word = prev_winner_bits[widx];
+        while (word != 0u) {
+            if (shared_grown >= max_grow) break;
+            unsigned int bit_pos = __ffs(word) - 1u;
+            word &= ~(1u << bit_pos);
+            unsigned int cell = widx * 32u + bit_pos;
+            if (cell >= cfg.n_cells) continue;
+            // Skip if already presynaptic (O(existing_syn) scan; usually small).
+            bool exists = false;
+            for (unsigned int s = 0; s < existing_syn; s++) {
+                if (syn_presyn[seg_base + s] == cell) { exists = true; break; }
+            }
+            if (exists) continue;
+            // Reserve a slot; the counter may overshoot max_grow, so re-check.
+            unsigned int slot = atomicAdd(&shared_grown, 1u);
+            if (slot >= max_grow) break;
+            unsigned int write_idx = existing_syn + slot;
+            if (write_idx >= cfg.synapses_per_segment) break;
+            syn_presyn[seg_base + write_idx] = cell;
+            syn_perm[seg_base + write_idx] = (short)cfg.initial_perm_i16;
+        }
+    }
+    __syncthreads();
+    if (tid == 0) {
+        // shared_grown counts reservation attempts; clamp to what was written.
+        unsigned int grown = shared_grown;
+        if (grown > max_grow) grown = max_grow;
+        unsigned int new_count = existing_syn + grown;
+        if (new_count > cfg.synapses_per_segment) new_count = cfg.synapses_per_segment;
+        seg_syn_count[seg_id] = new_count;
+    }
+}

overlay/htm_rust/src/gpu/kernels/tm_learn.cu ADDED Viewed

	@@ -0,0 +1,75 @@

+// TM learn (reinforce correctly predicted segments) — cell-grouped launch.
+//
+// Grid: n_cells.
+// For each cell in a predicted, SP-active column: iterate its segments.
+// For each segment with num_active_connected >= activation_threshold,
+// reinforce its synapses against prev_active_bits.
+// Temporal-memory parameters passed by value to the TM kernels.
+// NOTE(review): presumably mirrors a host-side struct with the same field order
+// and widths — confirm before reordering or resizing anything here.
+struct TmConfig {
+    unsigned int activation_threshold;   // min active connected synapses for a segment to be reinforced here
+    unsigned int learning_threshold;     // min active potential synapses for a segment to count as matching
+    unsigned int cells_per_column;       // column of a cell = cell / cells_per_column
+    unsigned int synapses_per_segment;   // synapse slots per segment; stride of syn_presyn / syn_perm
+    unsigned int n_segments;             // presumably total segment slots — not read by this kernel; TODO confirm
+    unsigned int n_cells;                // total cell count (upper bound for blockIdx.x)
+    unsigned int max_segments_per_cell;  // segment id = cell * max_segments_per_cell + slot
+    unsigned int max_new_synapses;       // growth cap (not read by this kernel)
+    int conn_thr_i16;                    // connected-permanence threshold (not read by this kernel)
+    int perm_inc_i16;                    // added to permanence when the presynaptic cell was active
+    int perm_dec_i16;                    // subtracted from permanence when it was not
+    int predicted_seg_dec_i16;           // punishment decrement (not read by this kernel)
+    int initial_perm_i16;                // permanence of a freshly grown synapse (not read by this kernel)
+    unsigned int iter_seed;              // per-iteration seed (not read by this kernel)
+    unsigned int n_cols;                 // number of columns
+    unsigned int bits_words;             // number of 32-bit words in each per-cell bitset
+};
+// Reinforces the active segments of cells in predicted, SP-active columns.
+// One block per cell; threads stride over a segment's synapses in steps of 32.
+// For each segment whose active-connected count reached activation_threshold,
+// a synapse gains perm_inc_i16 (capped at 32767) if its presynaptic cell is set
+// in prev_active_bits, and loses perm_dec_i16 (floored at 0) otherwise.
+extern "C" __global__
+void tm_learn_reinforce(
+    const unsigned int * __restrict__ seg_cell_id,
+    const unsigned int * __restrict__ seg_syn_count,
+    const unsigned int * __restrict__ syn_presyn,
+    short              * __restrict__ syn_perm,
+    const unsigned int * __restrict__ seg_num_active_connected,
+    const unsigned int * __restrict__ prev_active_bits,
+    const unsigned char * __restrict__ sp_active_mask,
+    const unsigned char * __restrict__ col_predicted,
+    const unsigned int * __restrict__ cell_seg_count,
+    TmConfig             cfg
+) {
+    const unsigned int cell = blockIdx.x;
+    if (cell >= cfg.n_cells) return;
+    const unsigned int column = cell / cfg.cells_per_column;
+    // Only cells of columns that are both SP-active and predicted learn here.
+    if (sp_active_mask[column] == 0 || col_predicted[column] == 0) return;
+    const unsigned int live_segments = min(cell_seg_count[cell], cfg.max_segments_per_cell);
+    const unsigned int first_seg = cell * cfg.max_segments_per_cell;
+    for (unsigned int slot = 0; slot != live_segments; ++slot) {
+        const unsigned int seg = first_seg + slot;
+        if (seg_num_active_connected[seg] < cfg.activation_threshold) continue;
+        const unsigned int syn_total = seg_syn_count[seg];
+        const unsigned int row = seg * cfg.synapses_per_segment;
+        for (unsigned int i = threadIdx.x; i < syn_total; i += 32u) {
+            const unsigned int src = syn_presyn[row + i];
+            const unsigned int src_word = prev_active_bits[src >> 5];
+            const bool src_was_active = ((src_word >> (src & 31u)) & 1u) != 0u;
+            const int old_perm = (int)syn_perm[row + i];
+            int new_perm;
+            if (src_was_active) {
+                new_perm = min(old_perm + cfg.perm_inc_i16, 32767);
+            } else {
+                new_perm = max(old_perm - cfg.perm_dec_i16, 0);
+            }
+            syn_perm[row + i] = (short)new_perm;
+        }
+    }
+}

overlay/htm_rust/src/gpu/kernels/tm_predict.cu ADDED Viewed

	@@ -0,0 +1,102 @@

+// TM predict kernel — cell-grouped launch.
+//
+// Grid: n_cells blocks (one per cell).
+// Block: 32 threads (one warp).
+//
+// Each block iterates the segments owned by its cell (count in cell_seg_count[cell]).
+// For each live segment, counts active connected/potential synapses against
+// prev_active_bits. Updates per-segment counters, cell_predictive bit, and
+// col_predicted flag.
+// Temporal-memory parameters passed by value to the TM kernels.
+// NOTE(review): presumably mirrors a host-side struct with the same field order
+// and widths — confirm before reordering or resizing anything here.
+struct TmConfig {
+    unsigned int activation_threshold;   // min active connected synapses for a segment to make its cell predictive
+    unsigned int learning_threshold;     // min active potential synapses for a segment to enter col_best_match
+    unsigned int cells_per_column;       // column of a cell = cell / cells_per_column
+    unsigned int synapses_per_segment;   // synapse slots per segment; stride of syn_presyn / syn_perm
+    unsigned int n_segments;             // presumably total segment slots — not read by this kernel; TODO confirm
+    unsigned int n_cells;                // total cell count (upper bound for blockIdx.x)
+    unsigned int max_segments_per_cell;  // segment id = cell * max_segments_per_cell + slot
+    unsigned int max_new_synapses;       // growth cap (not read by this kernel)
+    int conn_thr_i16;                    // permanence >= this counts as connected
+    int perm_inc_i16;                    // reinforcement increment (not read by this kernel)
+    int perm_dec_i16;                    // reinforcement decrement (not read by this kernel)
+    int predicted_seg_dec_i16;           // punishment decrement (not read by this kernel)
+    int initial_perm_i16;                // permanence of a freshly grown synapse (not read by this kernel)
+    unsigned int iter_seed;              // per-iteration seed (not read by this kernel)
+    unsigned int n_cols;                 // number of columns
+    unsigned int bits_words;             // number of 32-bit words in each per-cell bitset
+};
+// Per-cell prediction pass. Block `blockIdx.x` handles one cell and walks the
+// first min(cell_seg_count[cell], max_segments_per_cell) segment slots of it.
+// For every segment it counts synapses whose presynaptic cell is set in the
+// bitset passed as `cell_active_bits` ("potential"), and among those the ones
+// with permanence >= conn_thr_i16 ("connected"), then:
+//   - stores both counts in seg_num_active_connected / seg_num_active_potential;
+//   - if connected >= activation_threshold: sets the cell's predictive bit and
+//     marks its column in col_predicted;
+//   - if potential >= learning_threshold: offers the segment to col_best_match.
+// The synapse loop strides by 32 and the reduction uses full-warp shuffles, so
+// the kernel expects exactly one warp (32 threads) per block.
+// NOTE(review): the file header says counts are taken against prev_active_bits,
+// while the parameter is named cell_active_bits — confirm which buffer the
+// host actually passes.
+extern "C" __global__
+void tm_predict(
+    const unsigned int * __restrict__ seg_cell_id,
+    const unsigned int * __restrict__ seg_syn_count,
+    const unsigned int * __restrict__ syn_presyn,
+    const short        * __restrict__ syn_perm,
+    const unsigned int * __restrict__ cell_active_bits,
+    unsigned int       * __restrict__ cell_predictive_bits,
+    unsigned char      * __restrict__ col_predicted,
+    unsigned int       * __restrict__ seg_num_active_connected,
+    unsigned int       * __restrict__ seg_num_active_potential,
+    unsigned int       * __restrict__ col_best_match,
+    const unsigned int * __restrict__ cell_seg_count,
+    TmConfig             cfg
+) {
+    const unsigned int cell = blockIdx.x;
+    if (cell >= cfg.n_cells) return;
+    // cell_seg_count may exceed the slot count (tm_grow keeps incrementing it
+    // while recycling slots), hence the clamp.
+    const unsigned int n_segs_here = min(cell_seg_count[cell], cfg.max_segments_per_cell);
+    if (n_segs_here == 0) return;
+    const unsigned int tid = threadIdx.x;
+    const unsigned int col = cell / cfg.cells_per_column;
+    const unsigned int seg_base_id = cell * cfg.max_segments_per_cell;
+    for (unsigned int local_seg = 0; local_seg < n_segs_here; local_seg++) {
+        const unsigned int seg = seg_base_id + local_seg;
+        const unsigned int n_syn = seg_syn_count[seg];
+        if (n_syn == 0) {
+            // Empty segment: publish zero counts so stale values are not reused.
+            if (tid == 0) {
+                seg_num_active_connected[seg] = 0;
+                seg_num_active_potential[seg] = 0;
+            }
+            // n_syn is the same for every thread, so the whole warp skips together.
+            continue;
+        }
+        const unsigned int syn_base = seg * cfg.synapses_per_segment;
+        unsigned int local_conn = 0;
+        unsigned int local_pot = 0;
+        // Each lane tallies its strided share of the segment's synapses.
+        for (unsigned int s = tid; s < n_syn; s += 32u) {
+            unsigned int presyn = syn_presyn[syn_base + s];
+            unsigned int word = cell_active_bits[presyn >> 5];
+            unsigned int bit  = (word >> (presyn & 31u)) & 1u;
+            if (bit) {
+                local_pot += 1u;
+                int p = (int)syn_perm[syn_base + s];
+                if (p >= cfg.conn_thr_i16) {
+                    local_conn += 1u;
+                }
+            }
+        }
+        // Warp-wide sum; after the loop lane 0 holds the segment totals.
+        for (int off = 16; off > 0; off >>= 1) {
+            local_conn += __shfl_down_sync(0xffffffffu, local_conn, off);
+            local_pot  += __shfl_down_sync(0xffffffffu, local_pot,  off);
+        }
+        if (tid == 0) {
+            seg_num_active_connected[seg] = local_conn;
+            seg_num_active_potential[seg] = local_pot;
+            if (local_conn >= cfg.activation_threshold) {
+                unsigned int word_idx = cell >> 5;
+                unsigned int bit_mask = 1u << (cell & 31u);
+                atomicOr(&cell_predictive_bits[word_idx], bit_mask);
+                // Plain store: every block that writes this flag writes the same value.
+                col_predicted[col] = 1;
+            }
+            if (local_pot >= cfg.learning_threshold) {
+                // Pack (potential count, segment id) into one word so atomicMax
+                // keeps the segment with the most active potential synapses:
+                // high 11 bits = count clamped to 2047, low 21 bits = segment id
+                // (ids above 2^21 - 1 are truncated). Ties go to the larger id.
+                unsigned int pot_c = local_pot > 2047u ? 2047u : local_pot;
+                unsigned int key = (pot_c << 21) | (seg & 0x1FFFFFu);
+                atomicMax(&col_best_match[col], key);
+            }
+        }
+    }
+}

overlay/htm_rust/src/gpu/kernels/tm_punish.cu ADDED Viewed

	@@ -0,0 +1,64 @@

+// TM punish — cell-grouped launch.
+// Temporal-memory parameters passed by value to the TM kernels.
+// NOTE(review): presumably mirrors a host-side struct with the same field order
+// and widths — confirm before reordering or resizing anything here.
+struct TmConfig {
+    unsigned int activation_threshold;   // active-segment threshold (not read by this kernel)
+    unsigned int learning_threshold;     // min active potential synapses for a segment to be punished
+    unsigned int cells_per_column;       // column of a cell = cell / cells_per_column
+    unsigned int synapses_per_segment;   // synapse slots per segment; stride of syn_presyn / syn_perm
+    unsigned int n_segments;             // presumably total segment slots — not read by this kernel; TODO confirm
+    unsigned int n_cells;                // total cell count (upper bound for blockIdx.x)
+    unsigned int max_segments_per_cell;  // segment id = cell * max_segments_per_cell + slot
+    unsigned int max_new_synapses;       // growth cap (not read by this kernel)
+    int conn_thr_i16;                    // connected-permanence threshold (not read by this kernel)
+    int perm_inc_i16;                    // reinforcement increment (not read by this kernel)
+    int perm_dec_i16;                    // reinforcement decrement (not read by this kernel)
+    int predicted_seg_dec_i16;           // subtracted from permanence of punished synapses
+    int initial_perm_i16;                // permanence of a freshly grown synapse (not read by this kernel)
+    unsigned int iter_seed;              // per-iteration seed (not read by this kernel)
+    unsigned int n_cols;                 // number of columns
+    unsigned int bits_words;             // number of 32-bit words in each per-cell bitset
+};
+// Punishes matching segments of cells whose column did NOT become SP-active.
+// One block per cell; threads stride over a segment's synapses in steps of 32.
+// For each segment with at least learning_threshold active potential synapses,
+// every synapse whose presynaptic cell is set in prev_active_bits loses
+// predicted_seg_dec_i16 permanence, floored at 0.
+extern "C" __global__
+void tm_punish(
+    const unsigned int  * __restrict__ seg_cell_id,
+    const unsigned int  * __restrict__ seg_syn_count,
+    const unsigned int  * __restrict__ syn_presyn,
+    short               * __restrict__ syn_perm,
+    const unsigned int  * __restrict__ seg_num_active_potential,
+    const unsigned int  * __restrict__ prev_active_bits,
+    const unsigned char * __restrict__ sp_active_mask,
+    const unsigned int  * __restrict__ cell_seg_count,
+    TmConfig              cfg
+) {
+    const unsigned int cell = blockIdx.x;
+    if (cell >= cfg.n_cells) return;
+    const unsigned int column = cell / cfg.cells_per_column;
+    // Columns that did become active are handled by the learn path instead.
+    if (sp_active_mask[column] != 0) return;
+    const unsigned int live_segments = min(cell_seg_count[cell], cfg.max_segments_per_cell);
+    const unsigned int first_seg = cell * cfg.max_segments_per_cell;
+    for (unsigned int slot = 0; slot != live_segments; ++slot) {
+        const unsigned int seg = first_seg + slot;
+        if (seg_num_active_potential[seg] < cfg.learning_threshold) continue;
+        const unsigned int syn_total = seg_syn_count[seg];
+        const unsigned int row = seg * cfg.synapses_per_segment;
+        for (unsigned int i = threadIdx.x; i < syn_total; i += 32u) {
+            const unsigned int src = syn_presyn[row + i];
+            const unsigned int src_word = prev_active_bits[src >> 5];
+            // Only synapses that contributed to the wrong prediction are weakened.
+            if (((src_word >> (src & 31u)) & 1u) == 0u) continue;
+            const int lowered = (int)syn_perm[row + i] - cfg.predicted_seg_dec_i16;
+            syn_perm[row + i] = (short)(lowered < 0 ? 0 : lowered);
+        }
+    }
+}

overlay/htm_rust/src/gpu/kernels/tm_reset.cu ADDED Viewed

	@@ -0,0 +1,36 @@

+// TM reset-per-step kernel.
+// Start-of-step housekeeping, one thread per element:
+//   - threads below bits_words copy the current active/winner words into the
+//     prev_* bitsets and clear the active, winner and predictive words;
+//   - threads below n_cols clear col_predicted and col_best_match;
+//   - global thread 0 zeroes the unpredicted and burst-column counters.
+// The grid must therefore cover max(bits_words, n_cols) threads.
+extern "C" __global__
+void tm_reset_step(
+    unsigned int * __restrict__ cell_active_bits,
+    unsigned int * __restrict__ cell_winner_bits,
+    unsigned int * __restrict__ cell_predictive_bits,
+    unsigned int * __restrict__ prev_active_bits,
+    unsigned int * __restrict__ prev_winner_bits,
+    unsigned char * __restrict__ col_predicted,
+    unsigned int * __restrict__ unpredicted_count,
+    unsigned int * __restrict__ burst_cols_count,
+    unsigned int * __restrict__ col_best_match,
+    unsigned int   bits_words,
+    unsigned int   n_cols
+) {
+    const unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (gid == 0) {
+        unpredicted_count[0] = 0u;
+        burst_cols_count[0] = 0u;
+    }
+    if (gid < n_cols) {
+        col_best_match[gid] = 0u;
+        col_predicted[gid] = 0;
+    }
+    if (gid < bits_words) {
+        // Roll current -> previous before wiping the current-step bitsets.
+        const unsigned int active_now = cell_active_bits[gid];
+        const unsigned int winner_now = cell_winner_bits[gid];
+        prev_active_bits[gid] = active_now;
+        prev_winner_bits[gid] = winner_now;
+        cell_active_bits[gid] = 0u;
+        cell_winner_bits[gid] = 0u;
+        cell_predictive_bits[gid] = 0u;
+    }
+}

overlay/htm_rust/src/gpu/mod.rs ADDED Viewed

	@@ -0,0 +1,549 @@

+//! GPU backend for HTM.
+//!
+//! Full-GPU pipeline (SP + TM). Per-step state lives entirely on device; the
+//! batch API (`step_many_gpu`) uploads T steps of input once, runs T iterations
+//! of the full HTM pipeline on GPU, and copies (T, n_cols) u8 + (T,) f32 back
+//! to the host in one shot.
+//!
+//! TM parity with the CPU reference is approximate:
+//!   - Segment growth: winner = cell 0 of bursting column (CPU picks
+//!     least-used-cell with RNG tiebreak). This is a pragmatic simplification
+//!     for GPU atomicity; learning dynamics are preserved.
+//!   - Permanences stored as i16 (scaled 0..32767). Rounding differs from
+//!     f32 by <= 1 ULP of the scale factor (≈ 3e-5) — inside any meaningful
+//!     HTM learning quantum.
+#![cfg(feature = "gpu")]
+pub mod sp_gpu;
+pub mod tm_gpu;
+pub mod fused;
+#[cfg(test)]
+mod tests;
+use std::mem::ManuallyDrop;
+use pyo3::prelude::*;
+use pyo3::types::{PyDict, PyTuple};
+use numpy::{PyArray1, PyArray2, PyArrayMethods, PyReadonlyArray2, PyUntypedArrayMethods};
+use crate::region::HTMRegionCore;
+use crate::sp::SpatialPoolerConfig;
+use sp_gpu::SpatialPoolerGpu;
+use tm_gpu::TemporalMemoryGpu;
+use fused::FusedState;
+/// Pull the device pointer, shape and typestr out of a
+/// `__cuda_array_interface__` dictionary.
+///
+/// Returns `(ptr, shape, typestr)`. Fails with `ValueError` when `data`,
+/// `shape` or `typestr` is absent, when `data`/`shape` is not a tuple, or when
+/// `strides` is present and not `None` (only contiguous tensors are accepted).
+/// The pointer value itself is not inspected.
+fn cai_parse(cai: &Bound<'_, PyDict>) -> PyResult<(u64, Vec<usize>, String)> {
+    use pyo3::exceptions::PyValueError;
+    // `data` is a (ptr: int, readonly: bool) tuple; only the pointer is used.
+    let data_tup: Bound<'_, PyTuple> = match cai.get_item("data")? {
+        Some(obj) => obj
+            .downcast_into::<PyTuple>()
+            .map_err(|_| PyValueError::new_err("CAI 'data' must be a tuple"))?,
+        None => return Err(PyValueError::new_err("CAI missing 'data'")),
+    };
+    let ptr: u64 = data_tup.get_item(0)?.extract()?;
+    // `shape` is a tuple of non-negative ints.
+    let shape_tup: Bound<'_, PyTuple> = match cai.get_item("shape")? {
+        Some(obj) => obj
+            .downcast_into::<PyTuple>()
+            .map_err(|_| PyValueError::new_err("CAI 'shape' must be a tuple"))?,
+        None => return Err(PyValueError::new_err("CAI missing 'shape'")),
+    };
+    let n_dims = shape_tup.len();
+    let mut shape: Vec<usize> = Vec::with_capacity(n_dims);
+    for axis in 0..n_dims {
+        shape.push(shape_tup.get_item(axis)?.extract::<usize>()?);
+    }
+    // `typestr` is a numpy-style dtype string (e.g. "|u1", "<f4").
+    let typestr: String = match cai.get_item("typestr")? {
+        Some(obj) => obj.extract()?,
+        None => return Err(PyValueError::new_err("CAI missing 'typestr'")),
+    };
+    // Strides are not handled downstream, so anything but None/absent is refused.
+    let has_strides = match cai.get_item("strides")? {
+        Some(strides) => !strides.is_none(),
+        None => false,
+    };
+    if has_strides {
+        return Err(PyValueError::new_err(
+            "CAI 'strides' must be None (tensor must be contiguous)",
+        ));
+    }
+    Ok((ptr, shape, typestr))
+}
+/// Python-exposed GPU HTM region. Drop-in replacement for `HTMRegion`.
+#[pyclass(module = "htm_rust")]
+pub struct HTMRegionGpu {
+    // Spatial pooler on the GPU; also the source of the shared device handle
+    // (`dev_ref()`) used for every allocation and copy in this type.
+    pub(super) sp_gpu: SpatialPoolerGpu,
+    // Temporal memory state on the GPU, created on the same device as `sp_gpu`.
+    pub(super) tm_gpu: TemporalMemoryGpu,
+    // State for the fused-kernel path; cleared together with `tm_gpu` in `reset`.
+    pub(super) fused_state: FusedState,
+    // Number of columns; width of each row of the active-column output.
+    pub(super) n_columns: usize,
+    // Number of input bits; required width of each input row.
+    pub(super) input_bits: usize,
+    // Cells per column, as passed to the constructor.
+    pub(super) cells_per_column: usize,
+}
+#[pymethods]
+impl HTMRegionGpu {
+    // Constructor: rejects zero-sized dimensions, builds a CPU region to seed
+    // the GPU spatial pooler deterministically, then creates the TM and fused
+    // state on the pooler's device. GPU failures surface as RuntimeError.
+    #[new]
+    #[pyo3(signature = (input_bits, n_columns, cells_per_column, seed=42))]
+    fn new(
+        input_bits: usize,
+        n_columns: usize,
+        cells_per_column: usize,
+        seed: u64,
+    ) -> PyResult<Self> {
+        use pyo3::exceptions::{PyRuntimeError, PyValueError};
+        let any_zero = [input_bits, n_columns, cells_per_column]
+            .iter()
+            .any(|&dim| dim == 0);
+        if any_zero {
+            return Err(PyValueError::new_err(
+                "input_bits, n_columns, cells_per_column must all be > 0",
+            ));
+        }
+        // The CPU region exists only so the GPU pooler starts from the same
+        // seeded initial state; it is dropped when this function returns.
+        let cpu_ref = HTMRegionCore::new(input_bits, n_columns, cells_per_column, seed);
+        let sp_cfg: &SpatialPoolerConfig = &cpu_ref.sp.cfg;
+        let sp_gpu = match SpatialPoolerGpu::from_cpu(&cpu_ref.sp) {
+            Ok(pooler) => pooler,
+            Err(e) => {
+                return Err(PyRuntimeError::new_err(format!(
+                    "GPU SP init failed: {e:?}. Config: input_bits={}, n_columns={}",
+                    sp_cfg.input_bits, sp_cfg.n_columns,
+                )));
+            }
+        };
+        // TM and fused state share the pooler's device handle.
+        let dev = sp_gpu.dev_ref().clone();
+        let tm_gpu = match TemporalMemoryGpu::new(dev.clone(), n_columns, cells_per_column) {
+            Ok(tm) => tm,
+            Err(e) => {
+                return Err(PyRuntimeError::new_err(format!(
+                    "GPU TM init failed: {e:?}",
+                )));
+            }
+        };
+        let initial_threshold = sp_gpu.initial_threshold_estimate();
+        let fused_state = match FusedState::new(dev, n_columns, cells_per_column, initial_threshold) {
+            Ok(state) => state,
+            Err(e) => {
+                return Err(PyRuntimeError::new_err(format!(
+                    "GPU fused state init failed: {e:?}",
+                )));
+            }
+        };
+        Ok(Self {
+            sp_gpu,
+            tm_gpu,
+            fused_state,
+            n_columns,
+            input_bits,
+            cells_per_column,
+        })
+    }
+    // Read-only Python properties mirroring the constructor arguments.
+    #[getter]
+    fn input_bits(&self) -> usize {
+        self.input_bits
+    }
+    #[getter]
+    fn n_columns(&self) -> usize {
+        self.n_columns
+    }
+    #[getter]
+    fn cells_per_column(&self) -> usize {
+        self.cells_per_column
+    }
+    /// Process T timesteps in one call on GPU. Per-step state (SP + TM) stays
+    /// on device; only the final (T, n_cols) mask and (T,) anomaly are copied
+    /// to the host at the end.
+    #[pyo3(signature = (inputs, learn=true))]
+    fn step_many_gpu<'py>(
+        &mut self,
+        py: Python<'py>,
+        inputs: PyReadonlyArray2<'py, bool>,
+        learn: bool,
+    ) -> PyResult<(Bound<'py, PyArray2<f32>>, Bound<'py, PyArray1<f32>>)> {
+        // Inputs: a C-contiguous (T, input_bits) bool array; `learn` is passed
+        // through to the batch step. Returns the (T, n_columns) active-column
+        // mask as f32 0/1 values and the (T,) anomaly scores. Shape problems
+        // raise ValueError, GPU failures raise RuntimeError.
+        let shape = inputs.shape();
+        if shape.len() != 2 {
+            return Err(pyo3::exceptions::PyValueError::new_err(
+                "inputs must be 2-D (T, input_bits)",
+            ));
+        }
+        let t = shape[0];
+        let bits = shape[1];
+        if bits != self.input_bits {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "inputs last dim {bits} != expected input_bits {}",
+                self.input_bits,
+            )));
+        }
+        let n_cols = self.n_columns;
+        // Convert bool -> u8 straight from the borrowed numpy buffer. This has
+        // to happen while the GIL is held (the slice borrows Python memory),
+        // and doing it in one pass avoids materialising an intermediate
+        // Vec<bool> copy of the whole batch (32 MB at T=2048, bits=16384).
+        let sdr_u8_all: Vec<u8> = inputs.as_slice()?.iter().map(|&b| b as u8).collect();
+        let result = py.allow_threads(|| -> Result<(Vec<u8>, Vec<f32>), String> {
+            // 1. Upload T*input_bits bytes.
+            let inputs_dev = self
+                .sp_gpu
+                .dev_ref()
+                .htod_sync_copy(&sdr_u8_all)
+                .map_err(|e| format!("H2D inputs: {e:?}"))?;
+            // 2. Allocate output buffers on device.
+            let mut cols_dev = self.sp_gpu.dev_ref()
+                .alloc_zeros::<u8>(t * n_cols)
+                .map_err(|e| format!("alloc cols: {e:?}"))?;
+            let mut anom_dev = self.sp_gpu.dev_ref()
+                .alloc_zeros::<f32>(t)
+                .map_err(|e| format!("alloc anom: {e:?}"))?;
+            // 3. Run T steps of SP + TM on GPU with NO per-step host sync.
+            self.sp_gpu.step_batch_with_tm(
+                &inputs_dev,
+                t,
+                self.input_bits,
+                learn,
+                &mut cols_dev,
+                &mut anom_dev,
+                &mut self.tm_gpu,
+            ).map_err(|e| format!("step_batch_with_tm: {e:?}"))?;
+            // 4. ONE D2H for the whole run (T * n_cols bytes + T floats).
+            let cols_host: Vec<u8> = self.sp_gpu.dev_ref()
+                .dtoh_sync_copy(&cols_dev)
+                .map_err(|e| format!("D2H cols: {e:?}"))?;
+            let anom_host: Vec<f32> = self.sp_gpu.dev_ref()
+                .dtoh_sync_copy(&anom_dev)
+                .map_err(|e| format!("D2H anom: {e:?}"))?;
+            Ok((cols_host, anom_host))
+        });
+        let (cols_u8, anom) = result.map_err(pyo3::exceptions::PyRuntimeError::new_err)?;
+        // The Python side receives the column mask as float32 0.0/1.0.
+        let cols_f32: Vec<f32> = cols_u8.iter().map(|&b| b as f32).collect();
+        let cols_arr = numpy::PyArray1::from_vec_bound(py, cols_f32)
+            .reshape([t, n_cols])
+            .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("{e}")))?;
+        let anom_arr = numpy::PyArray1::from_vec_bound(py, anom);
+        Ok((cols_arr, anom_arr))
+    }
+    /// Zero-copy CUDA path: accept torch tensors via __cuda_array_interface__,
+    /// write outputs directly into caller-allocated torch tensors. Skips the
+    /// host round-trip that `step_many_gpu` pays on every call (sdr.cpu() +
+    /// two D2H copies at the end). This is the hot path for `train.py`.
+    ///
+    /// Contract:
+    ///   sdr_cai.shape  == (T, input_bits), dtype u8   (0/1 mask)
+    ///   cols_cai.shape == (T, n_columns),  dtype u8   (written)
+    ///   anom_cai.shape == (T,),            dtype f32  (written)
+    /// All three tensors must live on the SAME CUDA device as this region.
+    ///
+    /// The torch tensors still own their memory — this method only wraps
+    /// them as borrowed CudaSlice views (via ManuallyDrop) so cudarc's Drop
+    /// impl can't free pytorch's allocator.
+    #[pyo3(signature = (sdr_cai, cols_cai, anom_cai, learn=true))]
+    fn step_many_cuda(
+        &mut self,
+        py: Python<'_>,
+        sdr_cai: &Bound<'_, PyDict>,
+        cols_cai: &Bound<'_, PyDict>,
+        anom_cai: &Bound<'_, PyDict>,
+        learn: bool,
+    ) -> PyResult<()> {
+        // Each dict is validated structurally (required keys, tuple types,
+        // strides == None) by cai_parse; dtype and shape are checked below.
+        let (sdr_ptr, sdr_shape, sdr_type) = cai_parse(sdr_cai)?;
+        let (cols_ptr, cols_shape, cols_type) = cai_parse(cols_cai)?;
+        let (anom_ptr, anom_shape, anom_type) = cai_parse(anom_cai)?;
+        // typestr sanity. numpy u1 is what torch.uint8 exports.
+        if sdr_type != "|u1" {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "sdr_cai typestr must be '|u1' (uint8), got {sdr_type}",
+            )));
+        }
+        if cols_type != "|u1" {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "cols_cai typestr must be '|u1' (uint8), got {cols_type}",
+            )));
+        }
+        // Both the little-endian and the native-order spelling of f32 are accepted.
+        if anom_type != "<f4" && anom_type != "=f4" {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "anom_cai typestr must be '<f4' (float32), got {anom_type}",
+            )));
+        }
+        // Shape validation. T is taken from the input and the two outputs must
+        // agree with it, so the element counts used below are consistent.
+        if sdr_shape.len() != 2 || sdr_shape[1] != self.input_bits {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "sdr_cai shape {sdr_shape:?} != (T, {})",
+                self.input_bits,
+            )));
+        }
+        let t = sdr_shape[0];
+        if cols_shape != [t, self.n_columns] {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "cols_cai shape {cols_shape:?} != ({t}, {})",
+                self.n_columns,
+            )));
+        }
+        if anom_shape != [t] {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "anom_cai shape {anom_shape:?} != ({t},)",
+            )));
+        }
+        let dev = self.sp_gpu.dev_ref().clone();
+        let n_cols = self.n_columns;
+        let input_bits = self.input_bits;
+        let result = py.allow_threads(|| -> Result<(), String> {
+            // SAFETY:
+            // - ptrs are taken as-is from the __cuda_array_interface__ dicts.
+            //   NOTE(review): cai_parse does not check them for null, and
+            //   nothing here verifies that the tensors live on this region's
+            //   device or that the outputs are writable — the caller must
+            //   uphold the contract in the doc comment above.
+            // - lens computed from validated shapes.
+            // - We wrap the returned CudaSlice in ManuallyDrop so cudarc's
+            //   Drop (which calls cuMemFree) never runs against torch memory.
+            //   The underlying allocation is owned+freed by torch.
+            // - The slices are used only for the duration of this call;
+            //   the backing tensors are expected to stay live across it
+            //   (the Python caller holds refs on the wrapping tensors).
+            let inputs_dev = ManuallyDrop::new(unsafe {
+                dev.upgrade_device_ptr::<u8>(sdr_ptr, t * input_bits)
+            });
+            let mut cols_dev = ManuallyDrop::new(unsafe {
+                dev.upgrade_device_ptr::<u8>(cols_ptr, t * n_cols)
+            });
+            let mut anom_dev = ManuallyDrop::new(unsafe {
+                dev.upgrade_device_ptr::<f32>(anom_ptr, t)
+            });
+            self.sp_gpu.step_batch_with_tm(
+                &inputs_dev,
+                t,
+                input_bits,
+                learn,
+                &mut cols_dev,
+                &mut anom_dev,
+                &mut self.tm_gpu,
+            ).map_err(|e| format!("step_batch_with_tm: {e:?}"))?;
+            // No dev.synchronize() is issued here, so this method can return
+            // before the kernels have finished writing cols/anom. The caller
+            // is expected to synchronize explicitly (an earlier note points at
+            // a `device_sync()` method, not visible in this chunk) before
+            // relying on the outputs. Skipping the per-call barrier lets
+            // subsequent GPU work (mamba3 fwd, etc.) overlap in time.
+            // NOTE(review): the earlier note also claimed PyTorch auto-syncs
+            // when the output tensor is next consumed; that only holds if
+            // these kernels run on the stream PyTorch uses — confirm which
+            // stream cudarc launches on.
+            Ok(())
+        });
+        result.map_err(pyo3::exceptions::PyRuntimeError::new_err)?;
+        Ok(())
+    }
+    /// Reset the GPU-resident TM state, then the fused-kernel state.
+    ///
+    /// The TM reset runs first; if it fails the fused state is left
+    /// untouched. Either failure is reported to Python as a `RuntimeError`.
+    fn reset(&mut self) -> PyResult<()> {
+        if let Err(tm_err) = self.tm_gpu.reset() {
+            let msg = format!("GPU TM reset: {tm_err:?}");
+            return Err(pyo3::exceptions::PyRuntimeError::new_err(msg));
+        }
+        match self.fused_state.reset() {
+            Ok(()) => Ok(()),
+            Err(fused_err) => {
+                let msg = format!("GPU fused reset: {fused_err:?}");
+                Err(pyo3::exceptions::PyRuntimeError::new_err(msg))
+            }
+        }
+    }
+    /// Fused megakernel path: runs the whole T-step SP + TM forward pass
+    /// through one `fused::launch_fused` call. The three arguments are
+    /// `__cuda_array_interface__` dicts of caller-owned CUDA tensors; the
+    /// active-column mask and anomaly values are written in place (zero-copy).
+    ///
+    /// Expected tensors:
+    ///   sdr_cai  : uint8   (T, input_bits)  — input
+    ///   cols_cai : uint8   (T, n_columns)   — output
+    ///   anom_cai : float32 (T,)             — output
+    ///
+    /// Semantics diverge from `step_many_cuda` in one important way: column
+    /// activation uses per-column threshold inhibition (EMA-adapted toward
+    /// the sparsity target) instead of global top-K. See
+    /// `docs/GPU_HTM.md` §Fused Kernel.
+    ///
+    /// Raises `ValueError` on a dtype/shape mismatch and `RuntimeError` if
+    /// the launch fails. Does not synchronize: call `device_sync()` before
+    /// reading the outputs from a different CUDA stream.
+    #[pyo3(signature = (sdr_cai, cols_cai, anom_cai, learn=true))]
+    fn step_many_fused_cuda(
+        &mut self,
+        py: Python<'_>,
+        sdr_cai: &Bound<'_, PyDict>,
+        cols_cai: &Bound<'_, PyDict>,
+        anom_cai: &Bound<'_, PyDict>,
+        learn: bool,
+    ) -> PyResult<()> {
+        // Every validation failure below surfaces as a Python ValueError.
+        let value_err = |msg: String| pyo3::exceptions::PyValueError::new_err(msg);
+        let (sdr_ptr, sdr_shape, sdr_type) = cai_parse(sdr_cai)?;
+        let (cols_ptr, cols_shape, cols_type) = cai_parse(cols_cai)?;
+        let (anom_ptr, anom_shape, anom_type) = cai_parse(anom_cai)?;
+
+        // --- dtype checks -------------------------------------------------
+        if sdr_type != "|u1" {
+            return Err(value_err(format!(
+                "sdr_cai typestr must be '|u1' (uint8), got {sdr_type}",
+            )));
+        }
+        if cols_type != "|u1" {
+            return Err(value_err(format!(
+                "cols_cai typestr must be '|u1' (uint8), got {cols_type}",
+            )));
+        }
+        let anom_is_f32 = anom_type == "<f4" || anom_type == "=f4";
+        if !anom_is_f32 {
+            return Err(value_err(format!(
+                "anom_cai typestr must be '<f4' (float32), got {anom_type}",
+            )));
+        }
+
+        // --- shape checks -------------------------------------------------
+        // Copy the config out of `self` up front; the launch closure below
+        // needs `self` mutably.
+        let input_bits = self.input_bits;
+        let n_cols = self.n_columns;
+        let sdr_is_valid = sdr_shape.len() == 2 && sdr_shape[1] == input_bits;
+        if !sdr_is_valid {
+            return Err(value_err(format!(
+                "sdr_cai shape {sdr_shape:?} != (T, {})",
+                input_bits,
+            )));
+        }
+        let t = sdr_shape[0];
+        if cols_shape != [t, n_cols] {
+            return Err(value_err(format!(
+                "cols_cai shape {cols_shape:?} != ({t}, {})",
+                n_cols,
+            )));
+        }
+        if anom_shape != [t] {
+            return Err(value_err(format!(
+                "anom_cai shape {anom_shape:?} != ({t},)",
+            )));
+        }
+
+        let dev = self.sp_gpu.dev_ref().clone();
+        let launched = py.allow_threads(|| {
+            // SAFETY (same reasoning as the non-fused CUDA entry point):
+            // - the pointers come from the caller's CUDA array interface
+            //   dicts and the lengths from the shapes validated above;
+            // - each CudaSlice is wrapped in ManuallyDrop so cudarc's Drop
+            //   never frees memory that is owned by the caller's tensors;
+            // - the slices are only used for the duration of this call.
+            let inputs_dev = ManuallyDrop::new(unsafe {
+                dev.upgrade_device_ptr::<u8>(sdr_ptr, t * input_bits)
+            });
+            let mut cols_dev = ManuallyDrop::new(unsafe {
+                dev.upgrade_device_ptr::<u8>(cols_ptr, t * n_cols)
+            });
+            let mut anom_dev = ManuallyDrop::new(unsafe {
+                dev.upgrade_device_ptr::<f32>(anom_ptr, t)
+            });
+            // No dev.synchronize() after the launch: the caller syncs
+            // explicitly via `device_sync()`, which lets subsequent GPU work
+            // overlap with this one.
+            fused::launch_fused(
+                &mut self.sp_gpu,
+                &mut self.tm_gpu,
+                &mut self.fused_state,
+                &inputs_dev,
+                &mut cols_dev,
+                &mut anom_dev,
+                t,
+                input_bits,
+                learn,
+            )
+            .map(|_| ())
+            .map_err(|e| format!("launch_fused: {e:?}"))
+        });
+        launched.map_err(pyo3::exceptions::PyRuntimeError::new_err)
+    }
+    /// Explicit device synchronization. Call this once all batched
+    /// `step_many_*_cuda` calls have been issued and before the output
+    /// tensors are read from a different CUDA stream. It stands in for the
+    /// per-call `dev.synchronize()` that was removed to allow overlap.
+    ///
+    /// A driver failure is reported to Python as a `RuntimeError`.
+    fn device_sync(&self) -> PyResult<()> {
+        match self.sp_gpu.dev_ref().synchronize() {
+            Ok(_) => Ok(()),
+            Err(sync_err) => Err(pyo3::exceptions::PyRuntimeError::new_err(format!(
+                "sync: {sync_err:?}"
+            ))),
+        }
+    }
+}
+/// Batch B regions into ONE cooperative kernel launch. Breaks through the
+/// CUDA cooperative-kernel device-level serialization: a single cooperative
+/// launch with grid.y=B processes all regions concurrently — ~B× speedup
+/// over B sequential launches.
+///
+/// Arguments (all lists must have the same length B >= 1):
+///   regions   : the `HTMRegionGpu` instances to step; each must appear once
+///   sdr_cais  : per-region uint8   (T, input_bits) CUDA array interface dicts
+///   cols_cais : per-region uint8   (T, n_columns)  output dicts
+///   anom_cais : per-region float32 (T,)            output dicts
+///   learn     : forwarded to the batched launch
+///
+/// All regions must have the same config (input_bits, n_columns,
+/// cells_per_column). `input_bits` and `n_columns` are verified here against
+/// `regions[0]`; `T` is taken from `sdr_cais[0]`. Each region keeps its
+/// independent GPU state.
+///
+/// Errors: `ValueError` for empty/mismatched lists, wrong dtypes or shapes,
+/// or a region whose config differs from `regions[0]`; `RuntimeError` if a
+/// region is already borrowed (e.g. listed twice) or the launch fails.
+///
+/// Does NOT sync; caller must invoke `device_sync()` on any region
+/// afterwards (or rely on a downstream torch op to auto-sync).
+#[pyfunction]
+#[pyo3(signature = (regions, sdr_cais, cols_cais, anom_cais, learn=true))]
+fn step_batch_fused_cuda(
+    py: Python<'_>,
+    regions: Vec<Py<HTMRegionGpu>>,
+    sdr_cais: Vec<Bound<'_, PyDict>>,
+    cols_cais: Vec<Bound<'_, PyDict>>,
+    anom_cais: Vec<Bound<'_, PyDict>>,
+    learn: bool,
+) -> PyResult<()> {
+    let b = regions.len();
+    if b == 0 {
+        return Err(pyo3::exceptions::PyValueError::new_err("regions is empty"));
+    }
+    if sdr_cais.len() != b || cols_cais.len() != b || anom_cais.len() != b {
+        return Err(pyo3::exceptions::PyValueError::new_err(
+            "sdr_cais / cols_cais / anom_cais length must match regions",
+        ));
+    }
+    // Parse all CAI dicts; collect device pointers. Validate shapes/dtypes.
+    let mut sdr_ptrs = Vec::with_capacity(b);
+    let mut cols_ptrs = Vec::with_capacity(b);
+    let mut anom_ptrs = Vec::with_capacity(b);
+    // Reference config comes from region 0, T from the first SDR tensor.
+    // try_borrow: report "already borrowed" as a Python error, never a panic.
+    let (input_bits, n_columns, t) = {
+        let r0 = regions[0].bind(py).try_borrow()?;
+        (r0.input_bits, r0.n_columns, {
+            let (_p, sh, _ty) = cai_parse(&sdr_cais[0])?;
+            if sh.len() != 2 {
+                return Err(pyo3::exceptions::PyValueError::new_err(
+                    format!("sdr_cai must be 2-D (T, input_bits), got {sh:?}"),
+                ));
+            }
+            sh[0]
+        })
+    };
+    for i in 0..b {
+        // Every region must share region 0's config: the tensor shapes below
+        // are validated against that single config, so a region with other
+        // dimensions would be handed buffers of the wrong size.
+        {
+            let region = regions[i].bind(py).try_borrow()?;
+            if region.input_bits != input_bits || region.n_columns != n_columns {
+                return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                    "regions[{i}] config (input_bits={}, n_columns={}) != regions[0] config (input_bits={input_bits}, n_columns={n_columns})",
+                    region.input_bits, region.n_columns,
+                )));
+            }
+        }
+        let (sdr_ptr, sdr_shape, sdr_type) = cai_parse(&sdr_cais[i])?;
+        let (cols_ptr, cols_shape, cols_type) = cai_parse(&cols_cais[i])?;
+        let (anom_ptr, anom_shape, anom_type) = cai_parse(&anom_cais[i])?;
+        if sdr_type != "|u1" || cols_type != "|u1" {
+            return Err(pyo3::exceptions::PyValueError::new_err(
+                "sdr/cols typestr must be '|u1' (uint8)",
+            ));
+        }
+        if anom_type != "<f4" && anom_type != "=f4" {
+            return Err(pyo3::exceptions::PyValueError::new_err(
+                "anom typestr must be '<f4' (float32)",
+            ));
+        }
+        if sdr_shape != [t, input_bits] {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "sdr[{i}] shape {sdr_shape:?} != ({t}, {input_bits})"
+            )));
+        }
+        if cols_shape != [t, n_columns] {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "cols[{i}] shape {cols_shape:?} != ({t}, {n_columns})"
+            )));
+        }
+        if anom_shape != [t] {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "anom[{i}] shape {anom_shape:?} != ({t},)"
+            )));
+        }
+        sdr_ptrs.push(sdr_ptr);
+        cols_ptrs.push(cols_ptr);
+        anom_ptrs.push(anom_ptr);
+    }
+    // Exclusively borrow each region. PyRefMut guarantees uniqueness, so a
+    // region that is listed twice (or is in use elsewhere) fails here with a
+    // RuntimeError instead of producing aliased raw pointers or a panic.
+    let mut region_refs: Vec<pyo3::PyRefMut<HTMRegionGpu>> = regions
+        .iter()
+        .map(|p| p.bind(py).try_borrow_mut())
+        .collect::<Result<Vec<_>, _>>()
+        .map_err(|e| {
+            pyo3::exceptions::PyRuntimeError::new_err(format!(
+                "cannot exclusively borrow every region (is a region listed twice?): {e}"
+            ))
+        })?;
+    // Collect raw mutable pointers — each PyRefMut exclusively borrows its
+    // region for the lifetime of this call, so pointers stay valid and
+    // unique. launch_fused_batched_raw only dereferences one region at a
+    // time, not constructing an aliased slice.
+    let raw_ptrs: Vec<*mut HTMRegionGpu> = region_refs
+        .iter_mut()
+        .map(|r| &mut **r as *mut HTMRegionGpu)
+        .collect();
+    // No allow_threads: raw pointers aren't Send. The launch is GPU-queued
+    // and sync'd downstream; holding the GIL for the duration is cheap.
+    fused::launch_fused_batched_raw(
+        &raw_ptrs, &sdr_ptrs, &cols_ptrs, &anom_ptrs,
+        t, input_bits, learn,
+    )
+    .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("launch_fused_batched: {e:?}")))?;
+    Ok(())
+}
+/// Add the GPU bindings to module `m`: the `HTMRegionGpu` class and the
+/// `step_batch_fused_cuda` free function.
+pub fn register(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_class::<HTMRegionGpu>()?;
+    let batched_step = pyo3::wrap_pyfunction!(step_batch_fused_cuda, m)?;
+    m.add_function(batched_step)
+}

overlay/htm_rust/src/gpu/sp_gpu.rs ADDED Viewed

	@@ -0,0 +1,796 @@

+//! GPU implementation of the Spatial Pooler.
+//!
+//! One `SpatialPoolerGpu` owns a set of persistent device buffers + 5 PTX
+//! kernels. `compute(input, learn)` performs one SP step and returns the
+//! sorted active-column indices (host `Vec<u32>`) — this is what the CPU
+//! TemporalMemory consumes.
+//!
+//! Persistent state on device (per region):
+//!   syn_bit     : u32  [n_columns × S]  (constant after init)
+//!   syn_perm    : f32  [n_columns × S]  (updated by sp_learn)
+//!   boost       : f32  [n_columns]
+//!   active_duty : f32  [n_columns]
+//!   overlap_duty: f32  [n_columns]
+//!
+//! Per-step transient state:
+//!   inp_dev     : u8   [input_bits]     (H2D copy each step)
+//!   raw         : u32  [n_columns]
+//!   boosted     : f32  [n_columns]
+//!   active_mask : u8   [n_columns]      (topk output, D2H at the end)
+use std::sync::Arc;
+use cudarc::driver::{CudaDevice, CudaSlice, DeviceSlice, DriverError, LaunchAsync, LaunchConfig};
+use cudarc::nvrtc::Ptx;
+use crate::sp::SpatialPooler;
+// Embed each kernel's PTX at compile time from the directory named by the HTM_GPU_PTX_DIR env var (presumably exported by build.rs — confirm).
+const PTX_SP_OVERLAP: &str =
+    include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/sp_overlap.ptx"));
+const PTX_SP_TOPK: &str =
+    include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/sp_topk.ptx"));
+const PTX_SP_LEARN: &str =
+    include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/sp_learn.ptx"));
+const PTX_SP_DUTY: &str =
+    include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/sp_duty.ptx"));
+const PTX_SP_BOOST_FUSED: &str =
+    include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/sp_boost_fused.ptx"));
+pub struct SpatialPoolerGpu { // GPU-resident spatial pooler: device handle, config mirror, device buffers
+    dev: Arc<CudaDevice>, // used for every copy and kernel launch; exposed via `dev_ref`
+    // Config mirror, copied from the CPU pooler's `cfg` in `from_cpu` (the CPU SpatialPooler is not touched after init).
+    input_bits: usize, // number of u8 elements in one input SDR
+    n_columns: usize,
+    synapses_per_col: usize, // `cfg.potential_synapses`; "S" in the module docs
+    conn_thr: f32, // `cfg.connected_threshold`
+    inc: f32, // `cfg.syn_perm_active_inc`
+    dec: f32, // `cfg.syn_perm_inactive_dec`
+    sparsity: f32, // top-k size is round(sparsity * n_columns), at least 1
+    duty_period: f32, // duty update uses alpha = 1 / max(duty_period, 1)
+    boost_strength: f32, // boost update is skipped unless this is > 0
+    // Persistent device state (see the module docs for layouts).
+    syn_bit: CudaSlice<u32>, // [n_columns × S], uploaded once in `from_cpu`
+    syn_perm: CudaSlice<f32>, // [n_columns × S], written by sp_learn
+    boost: CudaSlice<f32>, // [n_columns]
+    active_duty: CudaSlice<f32>, // [n_columns]
+    overlap_duty: CudaSlice<f32>, // [n_columns]
+    // Transient scratch buffers, overwritten on every step.
+    inp_dev: CudaSlice<u8>, // [input_bits], current step's input SDR
+    raw: CudaSlice<u32>, // [n_columns], written by sp_overlap
+    boosted: CudaSlice<f32>, // [n_columns], written by sp_overlap, read by top-k
+    active_mask: CudaSlice<u8>, // [n_columns], zeroed then written by top-k
+    // Reusable host buffer (n_columns bytes) for the D2H copy of active_mask.
+    host_mask: Vec<u8>,
+    /// Strict bit-parity with the CPU reference; parity tests enable it.
+    /// When set, `step_batch` computes boost with host-side exp and runs the
+    /// overlap-duty bump check every step. Default false for max throughput.
+    strict_parity: bool,
+}
+impl SpatialPoolerGpu {
+    /// Build a GPU spatial pooler from an existing CPU `SpatialPooler`.
+    ///
+    /// Opens CUDA device 0, uploads the CPU pooler's proximal synapse inputs
+    /// and permanences (flattened one column after another) together with
+    /// its boost and duty-cycle vectors, allocates zeroed scratch buffers,
+    /// and loads any of the five SP PTX modules not yet on the device.
+    /// Because the seeded synapse layout and initial permanences are copied
+    /// verbatim, the GPU pooler starts from the same state as the CPU one.
+    ///
+    /// Returns the driver error of the first failing CUDA call.
+    pub fn from_cpu(cpu: &SpatialPooler) -> Result<Self, DriverError> {
+        let dev = CudaDevice::new(0)?;
+        let cfg = &cpu.cfg;
+        let n_cols = cfg.n_columns;
+        let syn_per_col = cfg.potential_synapses;
+
+        // Host staging: each column's synapses occupy one contiguous run of
+        // `syn_per_col` entries.
+        let total_synapses = n_cols * syn_per_col;
+        let mut flat_inputs: Vec<u32> = Vec::with_capacity(total_synapses);
+        let mut flat_perms: Vec<f32> = Vec::with_capacity(total_synapses);
+        for column in &cpu.columns {
+            debug_assert_eq!(column.inputs.len(), syn_per_col);
+            debug_assert_eq!(column.perms.len(), syn_per_col);
+            flat_inputs.extend_from_slice(&column.inputs);
+            flat_perms.extend_from_slice(&column.perms);
+        }
+
+        // Persistent state: copied from the CPU pooler.
+        let syn_bit = dev.htod_sync_copy(&flat_inputs)?;
+        let syn_perm = dev.htod_sync_copy(&flat_perms)?;
+        let boost = dev.htod_sync_copy(&cpu.boost)?;
+        let active_duty = dev.htod_sync_copy(&cpu.active_duty_cycle)?;
+        let overlap_duty = dev.htod_sync_copy(&cpu.overlap_duty_cycle)?;
+
+        // Per-step scratch: starts zeroed.
+        let inp_dev: CudaSlice<u8> = dev.alloc_zeros(cfg.input_bits)?;
+        let raw: CudaSlice<u32> = dev.alloc_zeros(n_cols)?;
+        let boosted: CudaSlice<f32> = dev.alloc_zeros(n_cols)?;
+        let active_mask: CudaSlice<u8> = dev.alloc_zeros(n_cols)?;
+
+        // One PTX module per kernel, registered under a fixed module name.
+        // load_ptx is NOT idempotent (a second call errors), so a module
+        // already registered by an earlier region is skipped.
+        let kernel_table = [
+            ("htm_sp_overlap", PTX_SP_OVERLAP, "sp_overlap"),
+            ("htm_sp_topk", PTX_SP_TOPK, "sp_topk_select"),
+            ("htm_sp_learn", PTX_SP_LEARN, "sp_learn"),
+            ("htm_sp_duty", PTX_SP_DUTY, "sp_duty_update"),
+            ("htm_sp_boost_fused", PTX_SP_BOOST_FUSED, "sp_boost_from_duty"),
+        ];
+        for (module_name, ptx_src, kernel_name) in kernel_table {
+            if dev.get_func(module_name, kernel_name).is_some() {
+                continue;
+            }
+            dev.load_ptx(Ptx::from_src(ptx_src), module_name, &[kernel_name])?;
+        }
+
+        let pooler = Self {
+            dev,
+            input_bits: cfg.input_bits,
+            n_columns: n_cols,
+            synapses_per_col: syn_per_col,
+            conn_thr: cfg.connected_threshold,
+            inc: cfg.syn_perm_active_inc,
+            dec: cfg.syn_perm_inactive_dec,
+            sparsity: cfg.sparsity,
+            duty_period: cfg.duty_cycle_period,
+            boost_strength: cfg.boost_strength,
+            syn_bit,
+            syn_perm,
+            boost,
+            active_duty,
+            overlap_duty,
+            inp_dev,
+            raw,
+            boosted,
+            active_mask,
+            host_mask: vec![0u8; n_cols],
+            strict_parity: false,
+        };
+        Ok(pooler)
+    }
+    /// Turn strict bit-parity mode on or off. Parity tests use this.
+    pub fn set_strict_parity(&mut self, strict: bool) {
+        self.strict_parity = strict
+    }
+
+    /// Borrow the underlying `CudaDevice` handle for host-side orchestration.
+    pub fn dev_ref(&self) -> &Arc<CudaDevice> {
+        let Self { dev, .. } = self;
+        dev
+    }
+
+    // --- Fused-path accessors: read-only views of config and buffers. ---
+
+    /// Number of columns.
+    pub fn n_columns_accessor(&self) -> usize {
+        self.n_columns
+    }
+
+    /// Number of elements in one input SDR.
+    #[allow(dead_code)]
+    pub fn input_bits_accessor(&self) -> usize {
+        self.input_bits
+    }
+
+    /// Proximal synapses per column.
+    pub fn synapses_per_col_accessor(&self) -> usize {
+        self.synapses_per_col
+    }
+
+    /// Connected-permanence threshold.
+    pub fn conn_thr_accessor(&self) -> f32 {
+        self.conn_thr
+    }
+
+    /// Permanence increment.
+    pub fn inc_accessor(&self) -> f32 {
+        self.inc
+    }
+
+    /// Permanence decrement.
+    pub fn dec_accessor(&self) -> f32 {
+        self.dec
+    }
+
+    /// Target sparsity.
+    pub fn sparsity_accessor(&self) -> f32 {
+        self.sparsity
+    }
+
+    /// Duty-cycle period.
+    pub fn duty_period_accessor(&self) -> f32 {
+        self.duty_period
+    }
+
+    /// Boost strength.
+    #[allow(dead_code)]
+    pub fn boost_strength_accessor(&self) -> f32 {
+        self.boost_strength
+    }
+
+    /// Device buffer of synapse input indices.
+    pub fn syn_bit_accessor(&self) -> &CudaSlice<u32> {
+        &self.syn_bit
+    }
+
+    /// Device buffer of synapse permanences.
+    pub fn syn_perm_accessor(&self) -> &CudaSlice<f32> {
+        &self.syn_perm
+    }
+
+    /// Device buffer of per-column boost factors.
+    pub fn boost_accessor(&self) -> &CudaSlice<f32> {
+        &self.boost
+    }
+
+    /// Device buffer of per-column active duty cycles.
+    pub fn active_duty_accessor(&self) -> &CudaSlice<f32> {
+        &self.active_duty
+    }
+    /// Starting value for the fused path's per-column `inhibition_threshold`,
+    /// chosen so the activation rate begins near the sparsity target.
+    ///
+    /// Placeholder: always returns the same conservative constant and does
+    /// not inspect any overlaps. A real warmup-based estimate is expected to
+    /// happen on the Rust orchestrator side.
+    pub fn initial_threshold_estimate(&self) -> f32 {
+        // Rationale for the constant: with conn_thr=0.5, initial permanences
+        // around 0.5±0.1, S=40 and a 2%-sparse SDR, the expected overlap is
+        // about 40 * 0.02 = 0.8 connected hits (boosted ~ 0.8). Top-K keeps
+        // the top 2%, i.e. roughly the 98th percentile of boosted overlap, so
+        // 2.0 is a conservative start; per-column adaptation then steers
+        // each column's threshold.
+        const CONSERVATIVE_START: f32 = 2.0;
+        CONSERVATIVE_START
+    }
+    /// Batched multi-step SP on the GPU. Processes `t` timesteps from a
+    /// pre-uploaded device buffer holding `t * input_bits` bytes. Each step's
+    /// active-column mask is copied into the host slice `cols_out` (`t * n`
+    /// bytes); its active indices are appended to `active_indices_host`
+    /// (cleared on entry), with a `u32::MAX` separator pushed after each step.
+    /// Per step: D2D input copy, sp_overlap, sp_topk, sp_learn (if `learn`),
+    /// sp_duty, then a boost update (host-side exp in `strict_parity` mode,
+    /// fused GPU kernel otherwise; only when `learn` and boost_strength > 0).
+    /// Every step ends with a synchronous D2H copy of the active mask.
+    /// This is the fast path used by `HTMRegionGpu.step_many_gpu`.
+    #[allow(clippy::too_many_arguments)]
+    pub fn step_batch(
+        &mut self,
+        inputs_flat_dev: &CudaSlice<u8>,
+        t: usize,
+        input_bits: usize,
+        learn: bool,
+        cols_out: &mut [u8],
+        active_indices_host: &mut Vec<u32>,
+    ) -> Result<(), DriverError> {
+        let n = self.n_columns;
+        let k = ((self.sparsity * n as f32).round() as usize).max(1); // columns selected per step, at least 1
+        debug_assert_eq!(cols_out.len(), t * n);
+        let overlap_fn = self.dev.get_func("htm_sp_overlap", "sp_overlap").unwrap(); // modules were loaded in from_cpu
+        let topk_fn = self.dev.get_func("htm_sp_topk", "sp_topk_select").unwrap();
+        let learn_fn = self.dev.get_func("htm_sp_learn", "sp_learn").unwrap();
+        let duty_fn = self.dev.get_func("htm_sp_duty", "sp_duty_update").unwrap();
+        let overlap_cfg = LaunchConfig {
+            grid_dim: (n as u32, 1, 1), // one block per column
+            block_dim: (128, 1, 1),
+            shared_mem_bytes: 0,
+        };
+        let topk_cfg = LaunchConfig {
+            grid_dim: (1, 1, 1),
+            block_dim: (256, 1, 1),
+            shared_mem_bytes: (n * std::mem::size_of::<f32>()) as u32, // one f32 of dynamic shared memory per column
+        };
+        let learn_cfg = overlap_cfg;
+        let duty_cfg = LaunchConfig {
+            grid_dim: ((n as u32 + 255) / 256, 1, 1), // ceil(n / 256) blocks of 256 threads
+            block_dim: (256, 1, 1),
+            shared_mem_bytes: 0,
+        };
+        let alpha = 1.0f32 / self.duty_period.max(1.0); // duty-cycle update rate; period is clamped to >= 1
+        // Reusable host buffer for the per-step active_mask D2H.
+        self.host_mask.resize(n, 0);
+        active_indices_host.clear();
+        for ti in 0..t {
+            // Point overlap kernel at the ti-th slice of the pre-uploaded input.
+            // The kernels read from the fixed inp_dev buffer, so the step's
+            // slice is copied into it first. This is a D2D copy — much
+            // faster than H2D.
+            // (Alternative: rewrite kernel to accept an offset; deferred.)
+            let in_off = ti * input_bits;
+            // `slice` gives a view of this step's window of the flat input.
+            let sub = inputs_flat_dev.slice(in_off..in_off + input_bits);
+            self.dev.dtod_copy(&sub, &mut self.inp_dev)?;
+            // 1. sp_overlap
+            unsafe {
+                overlap_fn.clone().launch(
+                    overlap_cfg,
+                    (
+                        &self.inp_dev,
+                        &self.syn_bit,
+                        &self.syn_perm,
+                        &self.boost,
+                        self.conn_thr,
+                        self.synapses_per_col as u32,
+                        n as u32,
+                        &mut self.raw,
+                        &mut self.boosted,
+                    ),
+                )?;
+            }
+            // 2. Clear active_mask, then sp_topk
+            self.dev.memset_zeros(&mut self.active_mask)?;
+            unsafe {
+                topk_fn.clone().launch(
+                    topk_cfg,
+                    (&self.boosted, n as u32, k as u32, &mut self.active_mask),
+                )?;
+            }
+            // 3. sp_learn
+            if learn {
+                unsafe {
+                    learn_fn.clone().launch(
+                        learn_cfg,
+                        (
+                            &self.active_mask,
+                            &self.inp_dev,
+                            &self.syn_bit,
+                            &mut self.syn_perm,
+                            self.inc,
+                            self.dec,
+                            self.synapses_per_col as u32,
+                            n as u32,
+                        ),
+                    )?;
+                }
+            }
+            // 4. duty update (device). NOTE(review): the four constants after `alpha` follow sp_duty.cu's signature, not visible here — confirm.
+            unsafe {
+                duty_fn.clone().launch(
+                    duty_cfg,
+                    (
+                        &self.active_mask,
+                        &self.raw,
+                        &mut self.active_duty,
+                        &mut self.overlap_duty,
+                        &mut self.boost,
+                        alpha,
+                        1.0f32,
+                        0.0f32,
+                        0.0f32,
+                        0u32,
+                        n as u32,
+                    ),
+                )?;
+            }
+            // 5. Boost update. Two modes:
+            //    * strict_parity (tests): host-side exp for bit-exact match.
+            //    * default (production): GPU expf is close enough and ~10x faster
+            //      since we skip the D2H/H2D round-trip.
+            if learn && self.boost_strength > 0.0 {
+                if self.strict_parity {
+                    let mut duty_host = vec![0f32; n];
+                    self.dev
+                        .dtoh_sync_copy_into(&self.active_duty, &mut duty_host)?;
+                    let sum: f32 = duty_host.iter().sum();
+                    let mean = sum / (n as f32);
+                    let mut boost_host = vec![0f32; n];
+                    for i in 0..n {
+                        boost_host[i] =
+                            (-self.boost_strength * (duty_host[i] - mean)).exp();
+                    }
+                    self.dev.htod_sync_copy_into(&boost_host, &mut self.boost)?;
+                    // Permanence bump (rare, strict mode only): columns whose overlap duty is below 0.1% of the max get every permanence raised by inc * 0.1, capped at 1.0.
+                    let mut ov_host = vec![0f32; n];
+                    self.dev
+                        .dtoh_sync_copy_into(&self.overlap_duty, &mut ov_host)?;
+                    let max_ov = ov_host.iter().cloned().fold(0f32, f32::max);
+                    if max_ov > 0.0 {
+                        let thr = 0.001f32 * max_ov;
+                        let bump = self.inc * 0.1f32;
+                        let bump_cols: Vec<u32> = ov_host
+                            .iter()
+                            .enumerate()
+                            .filter_map(|(i, &o)| {
+                                if o < thr { Some(i as u32) } else { None }
+                            })
+                            .collect();
+                        if !bump_cols.is_empty() {
+                            let s = self.synapses_per_col;
+                            let mut perm_host = vec![0f32; n * s]; // full permanence table is round-tripped through the host
+                            self.dev
+                                .dtoh_sync_copy_into(&self.syn_perm, &mut perm_host)?;
+                            for &c in &bump_cols {
+                                let base = (c as usize) * s;
+                                for p in &mut perm_host[base..base + s] {
+                                    *p = (*p + bump).min(1.0);
+                                }
+                            }
+                            self.dev.htod_sync_copy_into(&perm_host, &mut self.syn_perm)?;
+                        }
+                    }
+                } else {
+                    // Fast path: fused mean + boost = expf(-strength*(ad-mean))
+                    // in a single GPU block. No D2H or H2D copy is issued here.
+                    let boost_fn = self
+                        .dev
+                        .get_func("htm_sp_boost_fused", "sp_boost_from_duty")
+                        .expect("sp_boost_fused not loaded");
+                    let boost_cfg = LaunchConfig {
+                        grid_dim: (1, 1, 1),
+                        block_dim: (1024, 1, 1),
+                        shared_mem_bytes: 32 * std::mem::size_of::<f32>() as u32, // 32 f32 slots of dynamic shared memory
+                    };
+                    unsafe {
+                        boost_fn.launch(
+                            boost_cfg,
+                            (
+                                &self.active_duty,
+                                &mut self.boost,
+                                self.boost_strength,
+                                n as u32,
+                            ),
+                        )?;
+                    }
+                }
+            }
+            // D2H the active_mask for this step. This is the single
+            // unavoidable sync point per step — CPU TM needs the active
+            // indices for its next state update. At n bytes / step (e.g. 2048)
+            // this is tiny in bandwidth but costs a full synchronize (~5-10μs).
+            self.dev
+                .dtoh_sync_copy_into(&self.active_mask, &mut self.host_mask)?;
+            let co = ti * n;
+            cols_out[co..co + n].copy_from_slice(&self.host_mask);
+            // Extract active indices (ascending, since the mask is scanned in order).
+            for (i, &b) in self.host_mask.iter().enumerate() {
+                if b != 0 {
+                    active_indices_host.push(i as u32);
+                }
+            }
+            // Insert separator (u32::MAX) after each step's indices to demarcate step boundaries.
+            active_indices_host.push(u32::MAX);
+        }
+        Ok(())
+    }
+    /// Batched SP + TM issued entirely on the device: no D2H copy in the loop.
+    /// Boost always uses the fused GPU kernel (no `strict_parity` branch here).
+    /// Inputs:
+    ///   inputs_flat_dev : (T * input_bits) u8 already uploaded
+    ///   cols_dev        : (T * n_cols) u8 output — active-column mask per step
+    ///   anom_dev        : (T,) f32 output — passed to `tm.step` with the step index
+    ///   tm              : persistent GPU TemporalMemory for this region
+    #[allow(clippy::too_many_arguments)]
+    pub fn step_batch_with_tm(
+        &mut self,
+        inputs_flat_dev: &CudaSlice<u8>,
+        t: usize,
+        input_bits: usize,
+        learn: bool,
+        cols_dev: &mut CudaSlice<u8>,
+        anom_dev: &mut CudaSlice<f32>,
+        tm: &mut crate::gpu::tm_gpu::TemporalMemoryGpu,
+    ) -> Result<(), DriverError> {
+        let n = self.n_columns;
+        let k = ((self.sparsity * n as f32).round() as usize).max(1); // columns selected per step, at least 1
+        debug_assert_eq!(cols_dev.len(), t * n);
+        debug_assert_eq!(anom_dev.len(), t);
+        let overlap_fn = self.dev.get_func("htm_sp_overlap", "sp_overlap").unwrap(); // modules were loaded in from_cpu
+        let topk_fn    = self.dev.get_func("htm_sp_topk", "sp_topk_select").unwrap();
+        let learn_fn   = self.dev.get_func("htm_sp_learn", "sp_learn").unwrap();
+        let duty_fn    = self.dev.get_func("htm_sp_duty", "sp_duty_update").unwrap();
+        let overlap_cfg = LaunchConfig {
+            grid_dim: (n as u32, 1, 1), // one block per column
+            block_dim: (128, 1, 1),
+            shared_mem_bytes: 0,
+        };
+        let topk_cfg = LaunchConfig {
+            grid_dim: (1, 1, 1),
+            block_dim: (256, 1, 1),
+            shared_mem_bytes: (n * std::mem::size_of::<f32>()) as u32, // one f32 of dynamic shared memory per column
+        };
+        let learn_cfg = overlap_cfg;
+        let duty_cfg = LaunchConfig {
+            grid_dim: ((n as u32 + 255) / 256, 1, 1), // ceil(n / 256) blocks of 256 threads
+            block_dim: (256, 1, 1),
+            shared_mem_bytes: 0,
+        };
+        let alpha = 1.0f32 / self.duty_period.max(1.0); // duty-cycle update rate; period is clamped to >= 1
+        for ti in 0..t {
+            let in_off = ti * input_bits;
+            let sub = inputs_flat_dev.slice(in_off..in_off + input_bits); // this step's window of the flat input
+            self.dev.dtod_copy(&sub, &mut self.inp_dev)?;
+            // 1. sp_overlap
+            unsafe {
+                overlap_fn.clone().launch(
+                    overlap_cfg,
+                    (
+                        &self.inp_dev,
+                        &self.syn_bit,
+                        &self.syn_perm,
+                        &self.boost,
+                        self.conn_thr,
+                        self.synapses_per_col as u32,
+                        n as u32,
+                        &mut self.raw,
+                        &mut self.boosted,
+                    ),
+                )?;
+            }
+            // 2. clear + sp_topk
+            self.dev.memset_zeros(&mut self.active_mask)?;
+            unsafe {
+                topk_fn.clone().launch(
+                    topk_cfg,
+                    (&self.boosted, n as u32, k as u32, &mut self.active_mask),
+                )?;
+            }
+            // 3. sp_learn
+            if learn {
+                unsafe {
+                    learn_fn.clone().launch(
+                        learn_cfg,
+                        (
+                            &self.active_mask,
+                            &self.inp_dev,
+                            &self.syn_bit,
+                            &mut self.syn_perm,
+                            self.inc,
+                            self.dec,
+                            self.synapses_per_col as u32,
+                            n as u32,
+                        ),
+                    )?;
+                }
+            }
+            // 4. duty update (stage 1: no-boost write). NOTE(review): the constants after `alpha` follow sp_duty.cu's signature, not visible here — confirm.
+            unsafe {
+                duty_fn.clone().launch(
+                    duty_cfg,
+                    (
+                        &self.active_mask,
+                        &self.raw,
+                        &mut self.active_duty,
+                        &mut self.overlap_duty,
+                        &mut self.boost,
+                        alpha,
+                        1.0f32,
+                        0.0f32,
+                        0.0f32,
+                        0u32,
+                        n as u32,
+                    ),
+                )?;
+            }
+            // 5. Boost update: fused GPU kernel (no D2H); skipped unless learning with boost_strength > 0.
+            if learn && self.boost_strength > 0.0 {
+                let boost_fn = self.dev
+                    .get_func("htm_sp_boost_fused", "sp_boost_from_duty")
+                    .expect("sp_boost_fused not loaded");
+                let boost_cfg = LaunchConfig {
+                    grid_dim: (1, 1, 1),
+                    block_dim: (1024, 1, 1),
+                    shared_mem_bytes: 32 * std::mem::size_of::<f32>() as u32, // 32 f32 slots of dynamic shared memory
+                };
+                unsafe {
+                    boost_fn.launch(
+                        boost_cfg,
+                        (
+                            &self.active_duty,
+                            &mut self.boost,
+                            self.boost_strength,
+                            n as u32,
+                        ),
+                    )?;
+                }
+            }
+            // 6. Copy active_mask slice into cols_dev[ti*n .. (ti+1)*n].
+            let mut dst_slice = cols_dev.slice_mut(ti * n..(ti + 1) * n);
+            self.dev.dtod_copy(&self.active_mask, &mut dst_slice)?;
+            // 7. GPU TM step: predict + activate + anomaly + learn, all on device (per the author; tm.step is not visible here).
+            tm.step(&self.active_mask, anom_dev, ti as u32, learn)?;
+        }
+        Ok(())
+    }
+    /// One SP step on the GPU. Returns sorted active-column indices.
+    ///
+    /// Stages, in launch order:
+    /// 1. Upload `input` into `self.inp_dev`.
+    /// 2. `sp_overlap` fills `self.raw` and `self.boosted`.
+    /// 3. `sp_topk_select` writes the winner mask into `self.active_mask`
+    ///    (zeroed first).
+    /// 4. If `learn`, `sp_learn` updates `self.syn_perm`.
+    /// 5. `sp_duty_update` always runs; boost is refreshed only when `learn`
+    ///    is set and `self.boost_strength > 0.0`.
+    /// 6. The mask is downloaded and converted to an index list.
+    ///
+    /// * `input` - input SDR, one byte per input bit; its length is
+    ///   debug-asserted to equal `self.input_bits`.
+    /// * `learn` - enables the permanence update and the boost refresh.
+    ///
+    /// Returns the indices of the non-zero mask entries in ascending order, or
+    /// the first `DriverError` raised by a copy or launch. Panics if a
+    /// required kernel has not been loaded on the device.
+    pub fn compute(&mut self, input: &[u8], learn: bool) -> Result<Vec<u32>, DriverError> {
+        debug_assert_eq!(input.len(), self.input_bits);
+        let n = self.n_columns;
+        // Number of winners requested from top-k: round(sparsity * n), at least 1.
+        let k = ((self.sparsity * n as f32).round() as usize).max(1);
+        // 1. H2D input SDR.
+        self.dev.htod_sync_copy_into(input, &mut self.inp_dev)?;
+        // 2. Launch sp_overlap: grid=n_columns, block=128.
+        let overlap_fn = self
+            .dev
+            .get_func("htm_sp_overlap", "sp_overlap")
+            .expect("sp_overlap not loaded");
+        let overlap_cfg = LaunchConfig {
+            grid_dim: (n as u32, 1, 1),
+            block_dim: (128, 1, 1),
+            shared_mem_bytes: 0,
+        };
+        unsafe {
+            overlap_fn.launch(
+                overlap_cfg,
+                (
+                    &self.inp_dev,
+                    &self.syn_bit,
+                    &self.syn_perm,
+                    &self.boost,
+                    self.conn_thr,
+                    self.synapses_per_col as u32,
+                    n as u32,
+                    &mut self.raw,
+                    &mut self.boosted,
+                ),
+            )?;
+        }
+        // 3. Launch sp_topk: single block, shared mem = n_columns * f32.
+        let topk_fn = self
+            .dev
+            .get_func("htm_sp_topk", "sp_topk_select")
+            .expect("sp_topk not loaded");
+        let topk_cfg = LaunchConfig {
+            grid_dim: (1, 1, 1),
+            block_dim: (256, 1, 1),
+            shared_mem_bytes: (n * std::mem::size_of::<f32>()) as u32,
+        };
+        // Clear active_mask first. memset_zeros avoids an H2D of a host
+        // zeroes vector every step.
+        self.dev.memset_zeros(&mut self.active_mask)?;
+        unsafe {
+            topk_fn.launch(
+                topk_cfg,
+                (
+                    &self.boosted,
+                    n as u32,
+                    k as u32,
+                    &mut self.active_mask,
+                ),
+            )?;
+        }
+        // 4. Optional: sp_learn on active columns.
+        if learn {
+            let learn_fn = self
+                .dev
+                .get_func("htm_sp_learn", "sp_learn")
+                .expect("sp_learn not loaded");
+            let learn_cfg = LaunchConfig {
+                grid_dim: (n as u32, 1, 1),
+                block_dim: (128, 1, 1),
+                shared_mem_bytes: 0,
+            };
+            unsafe {
+                learn_fn.launch(
+                    learn_cfg,
+                    (
+                        &self.active_mask,
+                        &self.inp_dev,
+                        &self.syn_bit,
+                        &mut self.syn_perm,
+                        self.inc,
+                        self.dec,
+                        self.synapses_per_col as u32,
+                        n as u32,
+                    ),
+                )?;
+            }
+        }
+        // 5. Duty cycle + boost update. Always runs (matches CPU).
+        //    Per the CPU reference (sp.rs), duty cycles are updated first and
+        //    the mean used for boosting is taken over the POST-update values:
+        //      mean     = sum(active_duty_cycle) / n
+        //      boost[i] = exp(-strength * (active_duty[i] - mean))
+        //    Hence two stages: stage 1 launches the duty kernel with
+        //    boost_strength=0 and learn_flag=0 (no boost write); the updated
+        //    duties are then copied to the host to compute the mean; stage 2
+        //    writes boost either on the host (strict parity) or through a
+        //    second launch of the same kernel with the true mean.
+        //    The D2H is tiny (n × f32): 8KB per step at n=2048.
+        let alpha = 1.0f32 / self.duty_period.max(1.0);
+        let duty_fn = self
+            .dev
+            .get_func("htm_sp_duty", "sp_duty_update")
+            .expect("sp_duty not loaded");
+        let duty_cfg = LaunchConfig {
+            grid_dim: ((n as u32 + 255) / 256, 1, 1),
+            block_dim: (256, 1, 1),
+            shared_mem_bytes: 0,
+        };
+        // Stage 1: update duty cycles (boost_strength=0 -> no write).
+        unsafe {
+            duty_fn.launch(
+                duty_cfg,
+                (
+                    &self.active_mask,
+                    &self.raw,
+                    &mut self.active_duty,
+                    &mut self.overlap_duty,
+                    &mut self.boost,
+                    alpha,
+                    1.0f32,   // stim_thr
+                    0.0f32,   // boost_strength = 0 -> skip write
+                    0.0f32,   // mean_duty (unused)
+                    0u32,     // learn_flag = 0
+                    n as u32,
+                ),
+            )?;
+        }
+        if learn && self.boost_strength > 0.0 && self.strict_parity {
+            // Strict-parity path. Boost update must bit-match CPU `f32::exp`,
+            // so we compute it on the host and copy back. Cost per step:
+            // 8KB D2H + 8KB H2D at n=2048.
+            // Critical for learning parity — CUDA expf (even without fast-math)
+            // uses different rounding for some inputs than host libm.
+            let mut duty_host = vec![0f32; n];
+            self.dev
+                .dtoh_sync_copy_into(&self.active_duty, &mut duty_host)?;
+            let sum: f32 = duty_host.iter().sum();
+            let mean = sum / (n as f32);
+            let mut boost_host = vec![0f32; n];
+            for i in 0..n {
+                boost_host[i] = (-self.boost_strength * (duty_host[i] - mean)).exp();
+            }
+            self.dev.htod_sync_copy_into(&boost_host, &mut self.boost)?;
+            // CPU sp.rs 210-226: permanence bump for chronically under-stimulated
+            // columns. If overlap_duty_cycle[i] < 0.001 * max(overlap_duty_cycle),
+            // add inc*0.1 to every synapse of column i (clamped to 1.0).
+            // This runs only once per step and only for the rare cases, but we
+            // need it for bit-exact parity with CPU learn.
+            let mut ov_host = vec![0f32; n];
+            self.dev
+                .dtoh_sync_copy_into(&self.overlap_duty, &mut ov_host)?;
+            let max_ov = ov_host.iter().cloned().fold(0f32, f32::max);
+            // With max_ov == 0 the threshold would be 0 and no column could
+            // fall strictly below it, so the whole bump is skipped.
+            if max_ov > 0.0 {
+                let thr = 0.001f32 * max_ov;
+                let bump = self.inc * 0.1f32;
+                // Find columns needing a bump. Usually empty. Rare → D2H/H2D
+                // of syn_perm is cheap (n*S*4 = 320KB at n=2048,S=40).
+                let bump_cols: Vec<u32> = ov_host
+                    .iter()
+                    .enumerate()
+                    .filter_map(|(i, &o)| if o < thr { Some(i as u32) } else { None })
+                    .collect();
+                if !bump_cols.is_empty() {
+                    // Download, bump, upload. (Keeps implementation simple and
+                    // bit-exact. Could kernelize later.)
+                    let s = self.synapses_per_col;
+                    let mut perm_host = vec![0f32; n * s];
+                    self.dev.dtoh_sync_copy_into(&self.syn_perm, &mut perm_host)?;
+                    for &c in &bump_cols {
+                        // Column c owns the contiguous range [c*s, (c+1)*s).
+                        let base = (c as usize) * s;
+                        for p in &mut perm_host[base..base + s] {
+                            *p = (*p + bump).min(1.0);
+                        }
+                    }
+                    self.dev.htod_sync_copy_into(&perm_host, &mut self.syn_perm)?;
+                }
+            }
+        } else if learn && self.boost_strength > 0.0 {
+            // Fast path: boost is written by the already-loaded duty kernel.
+            // The mean still comes from a host-side sum over a D2H copy of
+            // active_duty. Unlike the strict-parity branch, no permanence
+            // bump is applied here.
+            let mut duty_host = vec![0f32; n];
+            self.dev
+                .dtoh_sync_copy_into(&self.active_duty, &mut duty_host)?;
+            let sum: f32 = duty_host.iter().sum();
+            let mean = sum / (n as f32);
+            // A fresh handle is fetched because the stage-1 handle was
+            // consumed by its launch call.
+            let boost_fn = self
+                .dev
+                .get_func("htm_sp_duty", "sp_duty_update")
+                .expect("sp_duty not loaded");
+            // Same argument order as stage 1: alpha=0.0, stim_thr=1.0, then
+            // boost_strength, mean_duty and learn_flag=1.
+            // NOTE(review): alpha=0 presumably leaves the duty cycles untouched
+            // on this second pass — confirm against sp_duty.cu.
+            unsafe {
+                boost_fn.launch(
+                    duty_cfg,
+                    (
+                        &self.active_mask,
+                        &self.raw,
+                        &mut self.active_duty,
+                        &mut self.overlap_duty,
+                        &mut self.boost,
+                        0.0f32,
+                        1.0f32,
+                        self.boost_strength,
+                        mean,
+                        1u32,
+                        n as u32,
+                    ),
+                )?;
+            }
+        }
+        // 6. D2H active_mask and convert to sorted index list. Indices come
+        //    out ascending because the mask is scanned front to back.
+        self.dev
+            .dtoh_sync_copy_into(&self.active_mask, &mut self.host_mask)?;
+        let mut active: Vec<u32> = Vec::with_capacity(k);
+        for (i, &b) in self.host_mask.iter().enumerate() {
+            if b != 0 {
+                active.push(i as u32);
+            }
+        }
+        debug_assert_eq!(active.len(), k, "SP must emit exactly k winners");
+        Ok(active)
+    }
+}

overlay/htm_rust/src/gpu/tests.rs ADDED Viewed

	@@ -0,0 +1,643 @@

+//! Parity tests: GPU SP vs CPU SP reference.
+//!
+//! With matching seeds the two should produce bit-identical active-column sets
+//! when `learn=false`, and remain bit-identical over repeated `learn=true`
+//! steps because the Hebbian update is deterministic (no RNG once initialised).
+//!
+//! Run with:  cargo test --release --features gpu
+#![cfg(test)]
+#![cfg(feature = "gpu")]
+use crate::sp::{SpatialPooler, SpatialPoolerConfig};
+use crate::gpu::sp_gpu::SpatialPoolerGpu;
+use crate::gpu::tm_gpu::TemporalMemoryGpu;
+use crate::gpu::fused::{
+    launch_fused, plan_fused_launch, FusedState,
+};
+use cudarc::driver::CudaSlice;
+use rand::{Rng, SeedableRng};
+use rand_xoshiro::Xoshiro256PlusPlus;
+/// Builds a random binary SDR of `bits` bytes with distinct positions set to 1.
+///
+/// The number of set positions is `sparsity * bits` truncated toward zero,
+/// raised to at least 1 and capped at `bits`. Positions are drawn from `rng`
+/// by rejection sampling, so the result depends only on the RNG state.
+///
+/// Returns an empty vector when `bits == 0`.
+fn make_sdr(rng: &mut Xoshiro256PlusPlus, bits: usize, sparsity: f32) -> Vec<u8> {
+    // Cap at `bits`: without it, `sparsity > 1.0` asks for more distinct
+    // positions than exist and the loop below never terminates, and
+    // `bits == 0` would reach `gen_range(0..0)`, which panics on an empty
+    // range. `.max(1).min(bits)` is used rather than `clamp(1, bits)` because
+    // `clamp` panics when `bits == 0`.
+    let on = ((sparsity * bits as f32) as usize).max(1).min(bits);
+    let mut v = vec![0u8; bits];
+    let mut placed = 0;
+    while placed < on {
+        let i = rng.gen_range(0..bits);
+        // Re-draw on collision so exactly `on` distinct positions are set.
+        if v[i] == 0 {
+            v[i] = 1;
+            placed += 1;
+        }
+    }
+    v
+}
+/// Inference-only parity: with learning disabled, the GPU pooler must report
+/// exactly the same active columns as the CPU pooler for 20 random SDRs.
+#[test]
+fn gpu_sp_matches_cpu_no_learn() {
+    let bits = SpatialPoolerConfig::default().input_bits;
+    // Two CPU poolers built from the same seed: one is the reference, the
+    // other is handed to `SpatialPoolerGpu::from_cpu`.
+    let mut reference = SpatialPooler::new(SpatialPoolerConfig::default(), 1234);
+    let gpu_source = SpatialPooler::new(SpatialPoolerConfig::default(), 1234);
+    let mut device_sp = SpatialPoolerGpu::from_cpu(&gpu_source)
+        .expect("gpu init (CUDA device available)");
+    device_sp.set_strict_parity(true);
+    let mut rng = Xoshiro256PlusPlus::seed_from_u64(99);
+    for step in 0..20 {
+        // The CPU side consumes bools, the GPU side the raw bytes.
+        let sdr_bytes = make_sdr(&mut rng, bits, 0.02);
+        let sdr_flags: Vec<bool> = sdr_bytes.iter().map(|&bit| bit != 0).collect();
+        let cpu_active: Vec<u32> = reference.compute(&sdr_flags, false);
+        let gpu_active: Vec<u32> = device_sp
+            .compute(&sdr_bytes, false)
+            .expect("gpu compute");
+        assert_eq!(
+            cpu_active,
+            gpu_active,
+            "mismatch at step {step}: len cpu={} gpu={}",
+            cpu_active.len(),
+            gpu_active.len()
+        );
+    }
+}
+/// Learning parity: with `learn=true` and strict parity enabled, the GPU and
+/// CPU poolers must keep reporting identical active columns over 50 steps.
+#[test]
+fn gpu_sp_matches_cpu_with_learn() {
+    let bits = SpatialPoolerConfig::default().input_bits;
+    // Same seed for the reference pooler and for the one handed to the GPU.
+    let mut reference = SpatialPooler::new(SpatialPoolerConfig::default(), 5678);
+    let gpu_source = SpatialPooler::new(SpatialPoolerConfig::default(), 5678);
+    let mut device_sp = SpatialPoolerGpu::from_cpu(&gpu_source).expect("gpu init");
+    device_sp.set_strict_parity(true);
+    let mut rng = Xoshiro256PlusPlus::seed_from_u64(42);
+    for step in 0..50 {
+        let sdr_bytes = make_sdr(&mut rng, bits, 0.02);
+        let sdr_flags: Vec<bool> = sdr_bytes.iter().map(|&bit| bit != 0).collect();
+        let cpu_active = reference.compute(&sdr_flags, true);
+        let gpu_active = device_sp
+            .compute(&sdr_bytes, true)
+            .expect("gpu compute");
+        assert_eq!(
+            cpu_active,
+            gpu_active,
+            "mismatch at step {step} with learning"
+        );
+    }
+}
+/// End-to-end GPU pipeline check: the SP feeds the TM through
+/// `step_batch_with_tm`, and a repeating three-symbol SDR sequence should
+/// drive the anomaly score down over time.
+///
+/// Asserts that every step has exactly `k` active columns and that the mean
+/// anomaly over the last nine steps is below the mean over steps 3..9.
+#[test]
+fn gpu_tm_anomaly_decays_on_repeating_sequence() {
+    // The pipeline is assembled directly from SP + TM rather than through the
+    // region wrapper.
+    let cfg = SpatialPoolerConfig::default();
+    let bits = cfg.input_bits;
+    let n_cols = cfg.n_columns;
+    let cells_per_col = 32usize;
+    let cpu_for_gpu = SpatialPooler::new(SpatialPoolerConfig::default(), 314);
+    let mut sp = SpatialPoolerGpu::from_cpu(&cpu_for_gpu).expect("gpu init");
+    let dev = sp.dev_ref().clone();
+    let mut tm = TemporalMemoryGpu::new(dev.clone(), n_cols, cells_per_col)
+        .expect("gpu tm init");
+    tm.reset().expect("tm reset");
+    // Build 3 fixed SDRs, feed them in a repeating sequence.
+    let mut rng = Xoshiro256PlusPlus::seed_from_u64(7);
+    let make = |rng: &mut Xoshiro256PlusPlus| make_sdr(rng, bits, 0.02);
+    let seqs = [make(&mut rng), make(&mut rng), make(&mut rng)];
+    // Warm up SP so columns are stable per symbol.
+    for _ in 0..100 {
+        for s in &seqs {
+            let _ = sp.compute(s, true).expect("sp compute");
+        }
+    }
+    // Build a long input buffer: 100 repetitions of [A,B,C] = 300 steps.
+    let repeats = 100usize;
+    let t = repeats * 3;
+    let mut inputs_flat = vec![0u8; t * bits];
+    for r in 0..repeats {
+        for (i, s) in seqs.iter().enumerate() {
+            let off = (r * 3 + i) * bits;
+            inputs_flat[off..off + bits].copy_from_slice(s);
+        }
+    }
+    let inputs_dev: CudaSlice<u8> = dev.htod_sync_copy(&inputs_flat).expect("htod");
+    let mut cols_dev = dev.alloc_zeros::<u8>(t * n_cols).expect("alloc cols");
+    let mut anom_dev = dev.alloc_zeros::<f32>(t).expect("alloc anom");
+    sp.step_batch_with_tm(
+        &inputs_dev,
+        t,
+        bits,
+        true,
+        &mut cols_dev,
+        &mut anom_dev,
+        &mut tm,
+    ).expect("step_batch_with_tm");
+    let anom: Vec<f32> = dev.dtoh_sync_copy(&anom_dev).expect("d2h anom");
+    let cols: Vec<u8> = dev.dtoh_sync_copy(&cols_dev).expect("d2h cols");
+    // Active column count per step must equal k for every step.
+    let k = ((cfg.sparsity * n_cols as f32).round() as usize).max(1);
+    for ti in 0..t {
+        let step_slice = &cols[ti * n_cols..(ti + 1) * n_cols];
+        let n_on = step_slice.iter().filter(|&&b| b != 0).count();
+        assert_eq!(n_on, k, "step {ti} has {n_on} active cols, expected {k}");
+    }
+    // Early window: steps 3..9, i.e. the second and third repetitions (the
+    // very first repetition is left out of the average). Little should have
+    // been learned yet, so anomaly is expected to be high here.
+    let early_avg: f32 = anom[3..9].iter().sum::<f32>() / 6.0;
+    // Late window: the final nine steps (last three repetitions); anomaly
+    // should be noticeably lower.
+    let late_avg: f32 = anom[(t - 9)..t].iter().sum::<f32>() / 9.0;
+    eprintln!("gpu tm: early anomaly = {early_avg:.3}, late = {late_avg:.3}");
+    assert!(
+        late_avg < early_avg,
+        "GPU TM should reduce anomaly on repeating sequence: early={early_avg:.3}, late={late_avg:.3}"
+    );
+}
+/// Cluster-sync smoke test for the fused megakernel.
+///
+/// The fused kernel is described as relying on hardware `cluster::sync()` /
+/// a grid barrier on H100/H200 (Hopper). This test drives the same
+/// `launch_fused` entry point used in production against real HTM state, so
+/// a deadlock in the cooperative-grid or DLB barrier would show up here as a
+/// hang. It then checks that the outputs are sane: anomaly scores finite and
+/// within [0, 1], and buffers of the expected size.
+///
+/// When no GPU (or no TM / fused state) can be created the test logs the
+/// reason with `eprintln!` and returns early, so it still passes in CI.
+#[test]
+fn cluster_sync_smoke_test() {
+    // Deliberately tiny region (1024 inputs, 256 columns, 4 cells/column) to
+    // keep VRAM usage minimal.
+    let input_bits = 1024usize;
+    let n_columns = 256usize;
+    let cells_per_col = 4usize;
+    let cpu_ref = SpatialPooler::new(
+        SpatialPoolerConfig {
+            input_bits,
+            n_columns,
+            sparsity: 0.04, // ~10 active cols out of 256
+            ..SpatialPoolerConfig::default()
+        },
+        42,
+    );
+    let mut sp = match SpatialPoolerGpu::from_cpu(&cpu_ref) {
+        Err(e) => {
+            eprintln!("[cluster_sync_smoke_test] No GPU available ({e:?}) — skipping");
+            return;
+        }
+        Ok(sp) => sp,
+    };
+    let dev = sp.dev_ref().clone();
+    // Only the cooperative-launch attribute is probed. The original notes say
+    // cluster launch (attribute 223) is not exposed symbolically by cudarc,
+    // and that hardware without cooperative launch uses the DLB barrier path.
+    let cooperative_ok = dev
+        .attribute(cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH)
+        .map_or(false, |v| v > 0);
+    if !cooperative_ok {
+        // Not a skip: the fallback path must not deadlock either.
+        eprintln!("[cluster_sync_smoke_test] CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH=0 — DLB path only, still running test");
+    }
+    let mut tm = match TemporalMemoryGpu::new(dev.clone(), n_columns, cells_per_col) {
+        Err(e) => {
+            eprintln!("[cluster_sync_smoke_test] TemporalMemoryGpu::new failed ({e:?}) — skipping");
+            return;
+        }
+        Ok(tm) => tm,
+    };
+    tm.reset().expect("tm reset");
+    let fused_init = FusedState::new(
+        dev.clone(),
+        n_columns,
+        cells_per_col,
+        sp.initial_threshold_estimate(),
+    );
+    let mut fused_st: FusedState = match fused_init {
+        Err(e) => {
+            eprintln!("[cluster_sync_smoke_test] FusedState::new failed ({e:?}) — skipping");
+            return;
+        }
+        Ok(f) => f,
+    };
+    fused_st.reset().expect("fused reset");
+    // T=4 timesteps of all-zero input SDRs.
+    let t = 4usize;
+    let zero_inputs = vec![0u8; t * input_bits];
+    let inputs_dev: CudaSlice<u8> = dev.htod_sync_copy(&zero_inputs).expect("htod inputs");
+    let mut cols_dev = dev.alloc_zeros::<u8>(t * n_columns).expect("alloc cols");
+    let mut anom_dev = dev.alloc_zeros::<f32>(t).expect("alloc anom");
+    // No timeout guard is installed here. The kernel is launched inline on
+    // this thread (not from a helper thread) because CUDA contexts are not
+    // safely shareable across threads without explicit setup, and a live
+    // kernel cannot be cancelled from Rust anyway. A deadlock therefore shows
+    // up as a hung test binary, which the CI job timeout has to kill.
+    launch_fused(
+        &mut sp,
+        &mut tm,
+        &mut fused_st,
+        &inputs_dev,
+        &mut cols_dev,
+        &mut anom_dev,
+        t,
+        input_bits,
+        false, // learn=false for determinism
+    ).expect("launch_fused (cluster_sync_smoke_test): deadlock or CUDA error");
+    dev.synchronize().expect("device sync after launch_fused");
+    // --- Correctness assertions ---
+    let cols_host: Vec<u8> = dev.dtoh_sync_copy(&cols_dev).expect("d2h cols");
+    let anom_host: Vec<f32> = dev.dtoh_sync_copy(&anom_dev).expect("d2h anom");
+    // Downloaded buffers must have exactly the allocated sizes.
+    assert_eq!(cols_host.len(), t * n_columns, "cols buffer size mismatch");
+    assert_eq!(anom_host.len(), t, "anom buffer size mismatch");
+    // Anomaly scores must be finite and lie in [0, 1].
+    for (i, &a) in anom_host.iter().enumerate() {
+        assert!(a.is_finite(), "anomaly[{i}] is not finite: {a}");
+        assert!((0.0..=1.0).contains(&a), "anomaly[{i}] out of [0,1]: {a}");
+    }
+    // Per-step active-column count. Inhibition is threshold-based, so zero
+    // winners is acceptable on a cold start; only the upper bound is checked.
+    for (ti, step_mask) in cols_host.chunks_exact(n_columns).enumerate() {
+        let n_on = step_mask.iter().filter(|&&b| b != 0).count();
+        assert!(
+            n_on <= n_columns,
+            "step {ti}: active columns {n_on} > n_columns {n_columns} (buffer overrun?)"
+        );
+    }
+    eprintln!(
+        "[cluster_sync_smoke_test] PASSED: T={t}, n_cols={n_columns}, \
+         input_bits={input_bits}, cooperative_supported={cooperative_ok}, \
+         anom={anom_host:?}"
+    );
+}
+/// Parity check: the CAI zero-copy path (`step_many_cuda`) must produce
+/// bit-identical outputs to the numpy H2D/D2H path (`step_batch_with_tm`),
+/// since the kernel pipeline is the same — only the I/O wrapping changes.
+/// We skip the PyO3 CAI dict plumbing here and test the underlying
+/// ManuallyDrop + upgrade_device_ptr pattern directly.
+///
+/// Both paths call `step_batch_with_tm` with `learn=false` on identically
+/// seeded (SP, TM) pairs; path A passes owned `CudaSlice`s, path B passes
+/// borrowed views rebuilt from raw device pointers. The column masks must be
+/// equal and the anomaly scores must agree to within 1e-7.
+#[test]
+fn gpu_cuda_vs_numpy_parity() {
+    use std::mem::ManuallyDrop;
+    let cfg = SpatialPoolerConfig::default();
+    let bits = cfg.input_bits;
+    let n_cols = cfg.n_columns;
+    let cells_per_col = 32usize;
+    // Build two identical (SP, TM) pairs from the same seed.
+    let build = || -> (SpatialPoolerGpu, TemporalMemoryGpu) {
+        let cpu_ref = SpatialPooler::new(SpatialPoolerConfig::default(), 271828);
+        let sp = SpatialPoolerGpu::from_cpu(&cpu_ref).expect("gpu init");
+        let dev = sp.dev_ref().clone();
+        let mut tm = TemporalMemoryGpu::new(dev, n_cols, cells_per_col).expect("tm init");
+        tm.reset().expect("tm reset");
+        (sp, tm)
+    };
+    // Deterministic SDR sequence, shared by both paths.
+    let mut rng = Xoshiro256PlusPlus::seed_from_u64(31337);
+    let t = 32usize;
+    let mut inputs_flat = vec![0u8; t * bits];
+    for i in 0..t {
+        let sdr = make_sdr(&mut rng, bits, 0.02);
+        inputs_flat[i * bits..(i + 1) * bits].copy_from_slice(&sdr);
+    }
+    // ---- Path A: owned CudaSlice (numpy-equivalent path) ----
+    let (mut sp_a, mut tm_a) = build();
+    let dev_a = sp_a.dev_ref().clone();
+    let inputs_a: CudaSlice<u8> = dev_a.htod_sync_copy(&inputs_flat).expect("htod");
+    let mut cols_a = dev_a.alloc_zeros::<u8>(t * n_cols).expect("alloc cols_a");
+    let mut anom_a = dev_a.alloc_zeros::<f32>(t).expect("alloc anom_a");
+    sp_a.step_batch_with_tm(&inputs_a, t, bits, false, &mut cols_a, &mut anom_a, &mut tm_a)
+        .expect("owned step_batch_with_tm");
+    dev_a.synchronize().expect("sync a");
+    let cols_a_host: Vec<u8> = dev_a.dtoh_sync_copy(&cols_a).expect("d2h cols_a");
+    let anom_a_host: Vec<f32> = dev_a.dtoh_sync_copy(&anom_a).expect("d2h anom_a");
+    // ---- Path B: borrowed device pointers via upgrade_device_ptr ----
+    // We allocate fresh owned CudaSlices through the second pair's device
+    // handle, then take their raw ptrs and re-wrap as ManuallyDrop borrowed
+    // views — mimicking what `step_many_cuda` does with torch-owned CUDA
+    // memory.
+    let (mut sp_b, mut tm_b) = build();
+    let dev_b = sp_b.dev_ref().clone();
+    let inputs_b_owned: CudaSlice<u8> = dev_b.htod_sync_copy(&inputs_flat).expect("htod");
+    let cols_b_owned = dev_b.alloc_zeros::<u8>(t * n_cols).expect("alloc cols_b");
+    let anom_b_owned = dev_b.alloc_zeros::<f32>(t).expect("alloc anom_b");
+    // Extract raw CUdeviceptrs (and leak the owners so their Drop doesn't free).
+    let inputs_ptr = inputs_b_owned.leak();
+    let cols_ptr = cols_b_owned.leak();
+    let anom_ptr = anom_b_owned.leak();
+    // Re-wrap as borrowed views. ManuallyDrop keeps these wrappers from
+    // freeing the allocations when they go out of scope.
+    let inputs_b = ManuallyDrop::new(unsafe { dev_b.upgrade_device_ptr::<u8>(inputs_ptr, t * bits) });
+    let mut cols_b = ManuallyDrop::new(unsafe { dev_b.upgrade_device_ptr::<u8>(cols_ptr, t * n_cols) });
+    let mut anom_b = ManuallyDrop::new(unsafe { dev_b.upgrade_device_ptr::<f32>(anom_ptr, t) });
+    sp_b.step_batch_with_tm(&inputs_b, t, bits, false, &mut cols_b, &mut anom_b, &mut tm_b)
+        .expect("borrowed step_batch_with_tm");
+    dev_b.synchronize().expect("sync b");
+    // `ManuallyDrop` doesn't auto-coerce to `&CudaSlice<T>` for the DevicePtr
+    // trait bound on `dtoh_sync_copy`; explicit deref.
+    let cols_b_host: Vec<u8> = dev_b.dtoh_sync_copy(&*cols_b).expect("d2h cols_b");
+    let anom_b_host: Vec<f32> = dev_b.dtoh_sync_copy(&*anom_b).expect("d2h anom_b");
+    // Re-own so Drop actually frees (we leaked above). This happens before
+    // the assertions below, so the memory is released even if they fail.
+    let _inputs_owned_again = unsafe { dev_b.upgrade_device_ptr::<u8>(inputs_ptr, t * bits) };
+    let _cols_owned_again = unsafe { dev_b.upgrade_device_ptr::<u8>(cols_ptr, t * n_cols) };
+    let _anom_owned_again = unsafe { dev_b.upgrade_device_ptr::<f32>(anom_ptr, t) };
+    assert_eq!(cols_a_host, cols_b_host, "active-column mask diverges between numpy and CAI paths");
+    assert_eq!(anom_a_host.len(), anom_b_host.len());
+    for (i, (a, b)) in anom_a_host.iter().zip(anom_b_host.iter()).enumerate() {
+        // Anomaly is a pure division of integer counts — bit-exact expected;
+        // the 1e-7 tolerance only leaves a tiny margin.
+        assert!((a - b).abs() < 1e-7, "anomaly mismatch at step {i}: a={a} b={b}");
+    }
+}
+/// Fused kernel: threshold-based activation should settle near the target
+/// sparsity after warmup.
+///
+/// The test runs a 1000-step learning warmup on random 2%-sparse SDRs, then
+/// measures 200 further learning steps and requires the mean number of active
+/// columns per step to fall within [0.25x, 4x] of `sparsity * n_columns`.
+/// The band is deliberately generous: the aim is to show the threshold loop
+/// is working (neither collapsed to zero nor all-active), not tight matching.
+/// (The original notes attribute the slack to a conservative starting
+/// threshold and a slow per-column adaptation rate; neither is visible here.)
+#[test]
+fn gpu_threshold_converges_to_sparsity() {
+    let cfg = SpatialPoolerConfig::default();
+    let bits = cfg.input_bits;
+    let n_cols = cfg.n_columns;
+    let cells_per_col = 32usize;
+    let target = cfg.sparsity; // target sparsity from the default config
+    let cpu_ref = SpatialPooler::new(SpatialPoolerConfig::default(), 111);
+    let mut sp = SpatialPoolerGpu::from_cpu(&cpu_ref).expect("gpu sp init");
+    let dev = sp.dev_ref().clone();
+    let mut tm = TemporalMemoryGpu::new(dev.clone(), n_cols, cells_per_col).expect("tm init");
+    let mut fused = FusedState::new(
+        dev.clone(),
+        n_cols,
+        cells_per_col,
+        sp.initial_threshold_estimate(),
+    ).expect("fused init");
+    tm.reset().expect("tm reset");
+    fused.reset().expect("fused reset");
+    // Produces `steps` consecutive random 2%-sparse SDRs, flattened row-major.
+    // Draws from the caller's RNG so warmup and measurement share one stream.
+    let random_batch = |rng: &mut Xoshiro256PlusPlus, steps: usize| -> Vec<u8> {
+        let mut flat = vec![0u8; steps * bits];
+        for row in flat.chunks_exact_mut(bits) {
+            row.copy_from_slice(&make_sdr(rng, bits, 0.02));
+        }
+        flat
+    };
+    let mut rng = Xoshiro256PlusPlus::seed_from_u64(31337);
+    // Warmup pass: 1000 steps with learning enabled.
+    let t_warm = 1000usize;
+    let warm_inputs = random_batch(&mut rng, t_warm);
+    let inputs_dev: CudaSlice<u8> = dev.htod_sync_copy(&warm_inputs).expect("htod");
+    let mut cols_dev = dev.alloc_zeros::<u8>(t_warm * n_cols).expect("alloc cols");
+    let mut anom_dev = dev.alloc_zeros::<f32>(t_warm).expect("alloc anom");
+    launch_fused(
+        &mut sp,
+        &mut tm,
+        &mut fused,
+        &inputs_dev,
+        &mut cols_dev,
+        &mut anom_dev,
+        t_warm,
+        bits,
+        true,
+    ).expect("warmup launch");
+    dev.synchronize().expect("sync");
+    // Measurement pass: another 200 steps, still learning.
+    let t_meas = 200usize;
+    let meas_inputs = random_batch(&mut rng, t_meas);
+    let meas_dev: CudaSlice<u8> = dev.htod_sync_copy(&meas_inputs).expect("htod meas");
+    let mut meas_cols = dev.alloc_zeros::<u8>(t_meas * n_cols).expect("alloc meas cols");
+    let mut meas_anom = dev.alloc_zeros::<f32>(t_meas).expect("alloc meas anom");
+    launch_fused(
+        &mut sp,
+        &mut tm,
+        &mut fused,
+        &meas_dev,
+        &mut meas_cols,
+        &mut meas_anom,
+        t_meas,
+        bits,
+        true,
+    ).expect("meas launch");
+    dev.synchronize().expect("sync meas");
+    let cols_host: Vec<u8> = dev.dtoh_sync_copy(&meas_cols).expect("d2h");
+    // Total active columns over all measured steps; the counts are small
+    // integers, so converting the total to f64 is exact.
+    let total_on: usize = cols_host
+        .chunks_exact(n_cols)
+        .map(|step_mask| step_mask.iter().filter(|&&b| b != 0).count())
+        .sum();
+    let mean_active: f64 = total_on as f64 / (t_meas as f64);
+    let target_active = target as f64 * n_cols as f64;
+    eprintln!(
+        "threshold-activation convergence: mean_active/step = {mean_active:.1} \
+         (target = {target_active:.1})"
+    );
+    // Generous band: only confirms the loop has not diverged to 0 or to
+    // all-active.
+    let band = (0.25 * target_active)..=(4.0 * target_active);
+    assert!(
+        band.contains(&mean_active),
+        "mean active {mean_active:.1} outside [0.25x, 4x] of target {target_active:.1}"
+    );
+}
+/// Fused kernel: TM should learn a repeating sequence — anomaly decays.
+///
+/// Runs a 300-step warmup launch on random SDRs, then one launch over 100
+/// repetitions of a fixed three-SDR cycle, and checks that the mean anomaly
+/// over the final 9 steps is both lower than over steps 3..12 and below 0.5.
+#[test]
+fn gpu_fused_tm_anomaly_decays_on_repeating_sequence() {
+    let cfg = SpatialPoolerConfig::default();
+    let bits = cfg.input_bits;
+    let n_cols = cfg.n_columns;
+    let cells_per_col = 32usize;
+    let cpu_ref = SpatialPooler::new(SpatialPoolerConfig::default(), 271);
+    let mut sp = SpatialPoolerGpu::from_cpu(&cpu_ref).expect("gpu sp init");
+    // SP, TM and fused state all share the device handle owned by the SP.
+    let dev = sp.dev_ref().clone();
+    let mut tm = TemporalMemoryGpu::new(dev.clone(), n_cols, cells_per_col).expect("tm init");
+    let mut fused = FusedState::new(
+        dev.clone(),
+        n_cols,
+        cells_per_col,
+        sp.initial_threshold_estimate(),
+    ).expect("fused init");
+    tm.reset().expect("tm reset");
+    fused.reset().expect("fused reset");
+    let mut rng = Xoshiro256PlusPlus::seed_from_u64(7);
+    // The three sequence SDRs are drawn from the RNG *before* the warmup
+    // SDRs; reordering these draws would change the generated inputs.
+    // NOTE(review): 0.02 is presumably the SDR on-bit density — confirm
+    // against `make_sdr`.
+    let make = |rng: &mut Xoshiro256PlusPlus| make_sdr(rng, bits, 0.02);
+    let seqs = [make(&mut rng), make(&mut rng), make(&mut rng)];
+    // Warmup SP threshold calibration with random SDRs first.
+    let warm = 300usize;
+    // Inputs are packed back to back: step `ti` occupies
+    // `warm_inputs[ti*bits..(ti+1)*bits]`.
+    let mut warm_inputs = vec![0u8; warm * bits];
+    for ti in 0..warm {
+        let sdr = make_sdr(&mut rng, bits, 0.02);
+        warm_inputs[ti*bits..(ti+1)*bits].copy_from_slice(&sdr);
+    }
+    let warm_dev: CudaSlice<u8> = dev.htod_sync_copy(&warm_inputs).expect("htod warm");
+    let mut warm_cols = dev.alloc_zeros::<u8>(warm * n_cols).expect("alloc warm cols");
+    let mut warm_anom = dev.alloc_zeros::<f32>(warm).expect("alloc warm anom");
+    launch_fused(
+        &mut sp, &mut tm, &mut fused,
+        &warm_dev, &mut warm_cols, &mut warm_anom,
+        warm, bits, true,
+    ).expect("warm launch");
+    dev.synchronize().expect("sync warm");
+    // Feed repeating A,B,C sequence for 100 reps.
+    let repeats = 100usize;
+    let t = repeats * 3;
+    let mut inputs = vec![0u8; t * bits];
+    for r in 0..repeats {
+        for (i, s) in seqs.iter().enumerate() {
+            // Step index within the launch is `r*3 + i`.
+            let off = (r*3 + i) * bits;
+            inputs[off..off+bits].copy_from_slice(s);
+        }
+    }
+    let inputs_dev: CudaSlice<u8> = dev.htod_sync_copy(&inputs).expect("htod rep");
+    let mut cols_dev = dev.alloc_zeros::<u8>(t * n_cols).expect("alloc rep cols");
+    let mut anom_dev = dev.alloc_zeros::<f32>(t).expect("alloc rep anom");
+    launch_fused(
+        &mut sp, &mut tm, &mut fused,
+        &inputs_dev, &mut cols_dev, &mut anom_dev,
+        t, bits, true,
+    ).expect("rep launch");
+    dev.synchronize().expect("sync rep");
+    let anom: Vec<f32> = dev.dtoh_sync_copy(&anom_dev).expect("d2h anom");
+    // Early window skips the first repetition (steps 0..3) and averages the
+    // next three repetitions; late window averages the final three.
+    let early_avg: f32 = anom[3..12].iter().sum::<f32>() / 9.0;
+    let late_avg: f32 = anom[(t-9)..t].iter().sum::<f32>() / 9.0;
+    eprintln!("fused TM anomaly: early={early_avg:.3} late={late_avg:.3}");
+    assert!(
+        late_avg < early_avg,
+        "anomaly must decay: early={early_avg:.3} late={late_avg:.3}"
+    );
+    assert!(
+        late_avg < 0.5,
+        "late anomaly must be < 0.5 (got {late_avg:.3})"
+    );
+}
+/// The GPU spatial pooler must return exactly `round(sparsity * n_columns)`
+/// winners (at least one) per call, as strictly ascending indices.
+#[test]
+fn gpu_sp_yields_k_winners() {
+    let config = SpatialPoolerConfig::default();
+    let input_bits = config.input_bits;
+    let column_count = config.n_columns;
+    let rounded = (config.sparsity * column_count as f32).round() as usize;
+    let want_winners = rounded.max(1);
+    let reference = SpatialPooler::new(SpatialPoolerConfig::default(), 7);
+    let mut pooler = SpatialPoolerGpu::from_cpu(&reference).expect("gpu init");
+    let mut sdr_rng = Xoshiro256PlusPlus::seed_from_u64(1);
+    (0..10).for_each(|_| {
+        let input = make_sdr(&mut sdr_rng, input_bits, 0.02);
+        let winners = pooler.compute(&input, false).expect("gpu compute");
+        assert_eq!(winners.len(), want_winners);
+        // Strictly increasing neighbours imply sorted and duplicate-free.
+        assert!(
+            winners.windows(2).all(|pair| pair[0] < pair[1]),
+            "duplicate or out-of-order winner indices"
+        );
+    });
+}
+/// With cooperative launch available and no cap override, the plan for
+/// these inputs uses a 16-block grid and a cooperative grid limit of 30.
+#[test]
+fn fused_launch_plan_uses_cooperative_grid_sync() {
+    let launch_plan = plan_fused_launch(30, true, 30, None)
+        .expect("cooperative supported");
+    let observed = (launch_plan.grid_dim_x, launch_plan.cooperative_grid_limit);
+    assert_eq!(observed, (16, 30));
+}
+/// On a large GPU the grid stays at the default cap of 16 unless an explicit
+/// override raises it.
+#[test]
+fn fused_launch_plan_scales_to_big_gpu() {
+    // H200-like: 132 SMs, high cooperative_grid_limit. Cap still applies.
+    // First case: default cap. Second case: override raises the cap.
+    for (cap_override, want_grid) in [(None, 16), (Some(64), 64)] {
+        let launch_plan = plan_fused_launch(132, true, 1000, cap_override)
+            .expect("cooperative supported");
+        assert_eq!(launch_plan.grid_dim_x, want_grid);
+    }
+}
+#[test]
+fn fused_launch_plan_refuses_non_cooperative_devices() {
+    // The slow path was removed. Devices without cooperative launch fail fast.
+    let err = plan_fused_launch(30, false, 0, None).unwrap_err();
+    assert!(err.contains("cooperative launch"));
+}
+/// `HTM_FUSED_GRID_CAP` must cap the fused kernel's grid width.
+///
+/// Sets the variable to 12, builds a `FusedState`, and expects
+/// `grid_dim_x == min(max(sm_count, 1), 12)`, where `sm_count` is read from
+/// the device (falling back to 16 if the attribute query fails).
+///
+/// The variable is removed by a drop guard, so it is cleaned up even if
+/// `FusedState::new` panics; otherwise the override would stay set for the
+/// remainder of the test process.
+/// NOTE(review): the environment is process-global, so this test can still
+/// interfere with tests running concurrently on other threads.
+#[test]
+fn fused_grid_cap_env_override_is_honored() {
+    /// Removes the named environment variable when dropped.
+    struct EnvVarGuard(&'static str);
+    impl Drop for EnvVarGuard {
+        fn drop(&mut self) {
+            unsafe { std::env::remove_var(self.0); }
+        }
+    }
+    let cfg = SpatialPoolerConfig::default();
+    let cpu_ref = SpatialPooler::new(SpatialPoolerConfig::default(), 5252);
+    let sp = SpatialPoolerGpu::from_cpu(&cpu_ref).expect("gpu sp init");
+    let dev = sp.dev_ref().clone();
+    unsafe { std::env::set_var("HTM_FUSED_GRID_CAP", "12"); }
+    let env_guard = EnvVarGuard("HTM_FUSED_GRID_CAP");
+    let fused = FusedState::new(
+        dev.clone(),
+        cfg.n_columns,
+        32usize,
+        sp.initial_threshold_estimate(),
+    ).expect("fused init");
+    // The override only needs to be visible during construction.
+    drop(env_guard);
+    let sm_count = match dev.attribute(
+        cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+    ) {
+        Ok(v) => v as u32,
+        Err(_) => 16u32,
+    };
+    let expected = sm_count.max(1).min(12);
+    assert_eq!(
+        fused.grid_dim_x,
+        expected,
+        "fused grid cap env override ignored: expected min(sm_count, 12) = {expected}, got {}",
+        fused.grid_dim_x,
+    );
+}

overlay/htm_rust/src/gpu/tm_gpu.rs ADDED Viewed

	@@ -0,0 +1,460 @@

+//! GPU Temporal Memory.
+//!
+//! Flat device storage. Pre-allocated segment slab:
+//!   n_cells = n_columns * cells_per_column
+//!   n_segments_max = n_cells * MAX_SEGMENTS_PER_CELL
+//!   n_synapses_max = n_segments_max * MAX_SYN_PER_SEGMENT
+//!
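+//! Sizing example (CPU parity targets relaxed on GPU to keep memory tractable).
+//! NOTE: the figures below assume the 16/32 limits shown here; the
+//! `MAX_SEGMENTS_PER_CELL` / `MAX_SYN_PER_SEGMENT` constants defined in this
+//! file are 4 and 20, which yield 262_144 segments and 5_242_880 synapse
+//! slots at n_cells = 65536.
+//!   MAX_SEGMENTS_PER_CELL = 16
+//!   MAX_SYN_PER_SEGMENT   = 32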
+//!
+//! At n_cells = 65536:
+//!   n_segments_max =  1_048_576   (~1M)
+//!   n_synapses_max = 33_554_432   (~33M)
+//! Storage:
+//!   syn_presyn : u32  × 33M = 128 MB
+//!   syn_perm   : i16  × 33M =  64 MB
+//!   seg_cell   : u32  ×  1M =   4 MB
+//!   seg_syn_n  : u32  ×  1M =   4 MB
+//!   misc bitsets etc        ~ <1 MB
+//!   -------------------------------
+//!   Total per region        ~200 MB
+//!
+//! Permanences are stored as i16 scaled by 32767 (→ [0, 32767] represents
+//! [0.0, 1.0]). inc/dec are provided pre-scaled.
+use std::sync::Arc;
+use cudarc::driver::{CudaDevice, CudaSlice, DriverError, DeviceRepr, LaunchAsync, LaunchConfig};
+use cudarc::nvrtc::Ptx;
+/// Packed config struct passed by value to TM kernels to stay under
+/// cudarc's 12-tuple launch limit. Layout must match the C-side
+/// `TmConfig` struct declared in each kernel.
+///
+/// Field order is part of the ABI (`#[repr(C)]`): do not reorder, insert or
+/// remove fields without updating every kernel's copy of the struct.
+#[repr(C)]
+#[derive(Clone, Copy)]
+pub struct TmConfig {
+    // Segment thresholds, copied from `TemporalMemoryGpu`.
+    pub activation_threshold: u32,
+    pub learning_threshold: u32,
+    pub cells_per_column: u32,
+    // Filled from `MAX_SYN_PER_SEGMENT`.
+    pub synapses_per_segment: u32,
+    // Filled from `n_segments_max` (slab capacity, not a live count).
+    pub n_segments: u32,
+    pub n_cells: u32,
+    // Filled from `MAX_SEGMENTS_PER_CELL`.
+    pub max_segments_per_cell: u32,
+    pub max_new_synapses: u32,
+    pub conn_thr_i16: i32,        // i16 widened to i32 for alignment
+    pub perm_inc_i16: i32,
+    pub perm_dec_i16: i32,
+    pub predicted_seg_dec_i16: i32,
+    pub initial_perm_i16: i32,
+    // Filled from the host-side step counter, which changes every step.
+    pub iter_seed: u32,
+    pub n_cols: u32,
+    // Number of u32 words in each per-cell bitset (`n_cells / 32`).
+    pub bits_words: u32,
+}
+// SAFETY(review): every field is a `u32` or `i32` in a `#[repr(C)]` struct,
+// so the value is plain data; correctness still depends on the kernels
+// declaring an identical layout.
+unsafe impl DeviceRepr for TmConfig {}
+// Embedded PTX.
+// Each kernel's PTX text is embedded at compile time via `include_str!`, from
+// the directory named by the compile-time environment variable
+// `HTM_GPU_PTX_DIR` (presumably exported by the crate's build script — TODO
+// confirm). The build fails if the variable or any file is missing.
+const PTX_TM_PREDICT:  &str = include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/tm_predict.ptx"));
+const PTX_TM_ACTIVATE: &str = include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/tm_activate.ptx"));
+const PTX_TM_LEARN:    &str = include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/tm_learn.ptx"));
+const PTX_TM_PUNISH:   &str = include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/tm_punish.ptx"));
+const PTX_TM_GROW:     &str = include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/tm_grow.ptx"));
+const PTX_TM_ANOMALY:  &str = include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/tm_anomaly.ptx"));
+const PTX_TM_RESET:    &str = include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/tm_reset.ptx"));
+/// Capacity trade-offs for 6 GB VRAM (RTX 3060) shared with the model:
+///   n_cells            = 2048 × 32 = 65_536
+///   n_segments_max     = n_cells × MAX_SEGMENTS_PER_CELL
+///   n_synapses_max     = n_segments_max × MAX_SYN_PER_SEGMENT
+///
+/// At 4/20 these are 262_144 segments and ~5.2M synapses (~50 MB per region).
+/// The training loop runs with `reset_each_forward=True`, so segment counts
+/// per window stay well below 32K (typical: ~n_cols new segs per step until
+/// the first matching segment is reused; in a 2048-step window that plateaus
+/// around ~5K total live segments). The 262K ceiling is generous headroom.
+pub const MAX_SEGMENTS_PER_CELL: usize = 4;
+/// Synapse slots reserved per segment: the `syn_presyn` and `syn_perm`
+/// buffers are allocated with `n_segments_max * MAX_SYN_PER_SEGMENT` entries.
+pub const MAX_SYN_PER_SEGMENT:   usize = 20;
+const PERM_SCALE: f32 = 32767.0;
+/// Convert a permanence in [0.0, 1.0] to its fixed-point `i16` form.
+///
+/// Out-of-range inputs are clamped first; the scaled value is then rounded
+/// to the nearest integer, so 0.0 maps to 0 and 1.0 maps to 32767.
+fn perm_f32_to_i16(x: f32) -> i16 {
+    let scaled = PERM_SCALE * x.clamp(0.0, 1.0);
+    scaled.round() as i16
+}
+/// Device-resident Temporal Memory state for one region.
+///
+/// Holds the device handle, a host-side mirror of the learning parameters,
+/// and every persistent device buffer the TM kernels read or write.
+pub struct TemporalMemoryGpu {
+    // Device used for all allocations, copies and kernel launches.
+    dev: Arc<CudaDevice>,
+    // Config mirror
+    pub n_columns: usize,
+    pub cells_per_column: usize,
+    pub activation_threshold: u32,
+    pub learning_threshold: u32,
+    // Permanence parameters in the i16 fixed-point form produced by
+    // `perm_f32_to_i16`.
+    pub initial_perm_i16: i16,
+    pub conn_thr_i16: i16,
+    pub perm_inc_i16: i16,
+    pub perm_dec_i16: i16,
+    pub predicted_seg_dec_i16: i16,
+    pub max_new_synapse_count: u32,
+    // Sizes
+    pub n_cells: usize,         // n_columns * cells_per_column
+    pub n_segments_max: usize,  // n_cells * MAX_SEGMENTS_PER_CELL
+    pub bits_words: usize,  // n_cells / 32
+    // Persistent device buffers
+    // Per-segment tables (n_segments_max entries). `seg_cell_id` starts out
+    // filled with u32::MAX, the "unused" sentinel.
+    seg_cell_id:       CudaSlice<u32>,
+    seg_syn_count:     CudaSlice<u32>,
+    // Per-synapse tables (n_segments_max * MAX_SYN_PER_SEGMENT entries).
+    syn_presyn:        CudaSlice<u32>,
+    syn_perm:          CudaSlice<i16>,
+    // One entry per cell.
+    cell_seg_count:    CudaSlice<u32>,
+    // Bitsets of `bits_words` u32 words each.
+    cell_active_bits:     CudaSlice<u32>,
+    cell_winner_bits:     CudaSlice<u32>,
+    cell_predictive_bits: CudaSlice<u32>,
+    prev_active_bits:     CudaSlice<u32>,
+    prev_winner_bits:     CudaSlice<u32>,
+    // One u8 per column.
+    col_predicted:         CudaSlice<u8>,
+    // Per-segment scratch counters (n_segments_max entries).
+    seg_num_active_conn:   CudaSlice<u32>,
+    seg_num_active_pot:    CudaSlice<u32>,
+    // Single-element counter.
+    unpredicted_count:     CudaSlice<u32>,
+    // Up to n_columns entries, with a single-element fill count.
+    burst_cols_flat:       CudaSlice<u32>,
+    burst_cols_count:      CudaSlice<u32>,
+    // One entry per column.
+    col_best_match:        CudaSlice<u32>,
+    // Host-side step counter: bumped once per `step`, zeroed by `reset`, and
+    // handed to the kernels as `TmConfig::iter_seed`.
+    iter_counter: u32,
+}
+impl TemporalMemoryGpu {
+    /// Allocate all device buffers for a TM region and load the TM kernels.
+    ///
+    /// `n_columns * cells_per_column` must be a multiple of 32 (panics
+    /// otherwise) because per-cell state is kept in 32-bit bitset words.
+    /// Thresholds and permanence parameters are fixed here rather than
+    /// supplied by the caller.
+    ///
+    /// Returns the first `DriverError` raised by an allocation, the initial
+    /// host-to-device copy, or a PTX load.
+    pub fn new(
+        dev: Arc<CudaDevice>,
+        n_columns: usize,
+        cells_per_column: usize,
+    ) -> Result<Self, DriverError> {
+        let n_cells = n_columns * cells_per_column;
+        assert!(n_cells % 32 == 0, "n_cells must be divisible by 32 for bitsets");
+        let n_segments_max = n_cells * MAX_SEGMENTS_PER_CELL;
+        let bits_words = n_cells / 32;
+        // Numenta defaults.
+        let activation_threshold = 15u32;
+        let learning_threshold   = 13u32;
+        let initial_perm_i16        = perm_f32_to_i16(0.21);
+        let conn_thr_i16            = perm_f32_to_i16(0.50);
+        let perm_inc_i16            = perm_f32_to_i16(0.10);
+        let perm_dec_i16            = perm_f32_to_i16(0.10);
+        let predicted_seg_dec_i16   = perm_f32_to_i16(0.10);
+        let max_new_synapse_count   = 20u32;
+        // Allocate buffers.
+        // `seg_cell_id` is the only buffer not zero-initialised: every slot
+        // starts as u32::MAX, the "unused" sentinel.
+        let seg_cell_id_host: Vec<u32> = vec![u32::MAX; n_segments_max];
+        let seg_cell_id = dev.htod_sync_copy(&seg_cell_id_host)?;
+        let seg_syn_count = dev.alloc_zeros::<u32>(n_segments_max)?;
+        let syn_presyn = dev.alloc_zeros::<u32>(n_segments_max * MAX_SYN_PER_SEGMENT)?;
+        let syn_perm = dev.alloc_zeros::<i16>(n_segments_max * MAX_SYN_PER_SEGMENT)?;
+        let cell_seg_count = dev.alloc_zeros::<u32>(n_cells)?;
+        let cell_active_bits     = dev.alloc_zeros::<u32>(bits_words)?;
+        let cell_winner_bits     = dev.alloc_zeros::<u32>(bits_words)?;
+        let cell_predictive_bits = dev.alloc_zeros::<u32>(bits_words)?;
+        let prev_active_bits     = dev.alloc_zeros::<u32>(bits_words)?;
+        let prev_winner_bits     = dev.alloc_zeros::<u32>(bits_words)?;
+        let col_predicted       = dev.alloc_zeros::<u8>(n_columns)?;
+        let seg_num_active_conn = dev.alloc_zeros::<u32>(n_segments_max)?;
+        let seg_num_active_pot  = dev.alloc_zeros::<u32>(n_segments_max)?;
+        let unpredicted_count   = dev.alloc_zeros::<u32>(1)?;
+        // Bursting columns for one step bounded by n_columns.
+        let burst_cols_flat   = dev.alloc_zeros::<u32>(n_columns)?;
+        let burst_cols_count  = dev.alloc_zeros::<u32>(1)?;
+        let col_best_match    = dev.alloc_zeros::<u32>(n_columns)?;
+        // Load PTX modules.
+        // Tuples are (module name, PTX source, kernel function name).
+        let modules = [
+            ("htm_tm_predict",  PTX_TM_PREDICT,  "tm_predict"),
+            ("htm_tm_activate", PTX_TM_ACTIVATE, "tm_activate"),
+            ("htm_tm_learn",    PTX_TM_LEARN,    "tm_learn_reinforce"),
+            ("htm_tm_punish",   PTX_TM_PUNISH,   "tm_punish"),
+            ("htm_tm_grow",     PTX_TM_GROW,     "tm_grow"),
+            ("htm_tm_anomaly",  PTX_TM_ANOMALY,  "tm_anomaly"),
+            ("htm_tm_reset",    PTX_TM_RESET,    "tm_reset_step"),
+        ];
+        for (modname, ptx, fnname) in modules {
+            // Skip modules already registered on this device, e.g. by an
+            // earlier `TemporalMemoryGpu` sharing the same handle.
+            if dev.get_func(modname, fnname).is_none() {
+                dev.load_ptx(Ptx::from_src(ptx), modname, &[fnname])?;
+            }
+        }
+        Ok(Self {
+            dev,
+            n_columns,
+            cells_per_column,
+            activation_threshold,
+            learning_threshold,
+            initial_perm_i16,
+            conn_thr_i16,
+            perm_inc_i16,
+            perm_dec_i16,
+            predicted_seg_dec_i16,
+            max_new_synapse_count,
+            n_cells,
+            n_segments_max,
+            bits_words,
+            seg_cell_id,
+            seg_syn_count,
+            syn_presyn,
+            syn_perm,
+            cell_seg_count,
+            cell_active_bits,
+            cell_winner_bits,
+            cell_predictive_bits,
+            prev_active_bits,
+            prev_winner_bits,
+            col_predicted,
+            seg_num_active_conn,
+            seg_num_active_pot,
+            unpredicted_count,
+            burst_cols_flat,
+            burst_cols_count,
+            col_best_match,
+            iter_counter: 0,
+        })
+    }
+    // --- Fused-path accessors ---
+    // Shared (read-only) borrows of the persistent segment/synapse buffers.
+    /// Borrow the `seg_cell_id` device buffer.
+    pub fn seg_cell_id_accessor(&self) -> &CudaSlice<u32> {
+        &self.seg_cell_id
+    }
+    /// Borrow the `seg_syn_count` device buffer.
+    pub fn seg_syn_count_accessor(&self) -> &CudaSlice<u32> {
+        &self.seg_syn_count
+    }
+    /// Borrow the `syn_presyn` device buffer.
+    pub fn syn_presyn_accessor(&self) -> &CudaSlice<u32> {
+        &self.syn_presyn
+    }
+    /// Borrow the `syn_perm` device buffer (i16 fixed-point permanences).
+    pub fn syn_perm_accessor(&self) -> &CudaSlice<i16> {
+        &self.syn_perm
+    }
+    /// Borrow the `cell_seg_count` device buffer.
+    pub fn cell_seg_count_accessor(&self) -> &CudaSlice<u32> {
+        &self.cell_seg_count
+    }
+    /// Hard reset — clear everything (predictive + active + segments).
+    ///
+    /// Rewrites `seg_cell_id` with the `u32::MAX` "unused" sentinel, zeroes
+    /// the segment counters, the cell bitsets and `col_best_match`, and
+    /// restarts `iter_counter` at 0. Stops at the first `DriverError`.
+    pub fn reset(&mut self) -> Result<(), DriverError> {
+        // Restore "unused" sentinel in seg_cell_id.
+        let sentinel_fill: Vec<u32> = vec![u32::MAX; self.n_segments_max];
+        self.dev.htod_sync_copy_into(&sentinel_fill, &mut self.seg_cell_id)?;
+        // Order: segment counters, then cell bitsets, then the per-column
+        // best-match table.
+        let zeroed_buffers = [
+            &mut self.seg_syn_count,
+            &mut self.cell_seg_count,
+            &mut self.cell_active_bits,
+            &mut self.cell_winner_bits,
+            &mut self.cell_predictive_bits,
+            &mut self.prev_active_bits,
+            &mut self.prev_winner_bits,
+            &mut self.col_best_match,
+        ];
+        for buffer in zeroed_buffers {
+            self.dev.memset_zeros(buffer)?;
+        }
+        self.iter_counter = 0;
+        Ok(())
+    }
+    /// Snapshot the current parameters into the by-value `TmConfig` that is
+    /// handed to the TM kernels.
+    fn build_cfg(&self) -> TmConfig {
+        TmConfig {
+            // Geometry and slab capacities.
+            n_cols: self.n_columns as u32,
+            cells_per_column: self.cells_per_column as u32,
+            n_cells: self.n_cells as u32,
+            bits_words: self.bits_words as u32,
+            n_segments: self.n_segments_max as u32,
+            max_segments_per_cell: MAX_SEGMENTS_PER_CELL as u32,
+            synapses_per_segment: MAX_SYN_PER_SEGMENT as u32,
+            // Thresholds and growth limit.
+            activation_threshold: self.activation_threshold,
+            learning_threshold: self.learning_threshold,
+            max_new_synapses: self.max_new_synapse_count,
+            // Fixed-point permanences, widened from i16 to i32.
+            initial_perm_i16: i32::from(self.initial_perm_i16),
+            conn_thr_i16: i32::from(self.conn_thr_i16),
+            perm_inc_i16: i32::from(self.perm_inc_i16),
+            perm_dec_i16: i32::from(self.perm_dec_i16),
+            predicted_seg_dec_i16: i32::from(self.predicted_seg_dec_i16),
+            // Per-step seed taken from the host-side step counter.
+            iter_seed: self.iter_counter,
+        }
+    }
+    /// Run one TM step on the GPU. Takes the SP active-column mask (u8, already
+    /// on device) and writes `anomaly_out[t_slot]`.
+    ///
+    /// Kernels are launched in a fixed order: per-step reset, predict,
+    /// activate, anomaly, and — only when `learn` is true — reinforce, punish
+    /// and grow. No device synchronisation is performed here.
+    ///
+    /// Returns the first `DriverError` from a launch. Panics if any TM kernel
+    /// function is not registered on the device (they are loaded by `new`).
+    /// NOTE(review): `t_slot` is not checked against the length of
+    /// `anomaly_out` in this function — callers must keep it in range.
+    pub fn step(
+        &mut self,
+        sp_active_mask: &CudaSlice<u8>,
+        anomaly_out: &mut CudaSlice<f32>,
+        t_slot: u32,
+        learn: bool,
+    ) -> Result<(), DriverError> {
+        let n_cells = self.n_cells;
+        let n_cols = self.n_columns;
+        let predict_fn  = self.dev.get_func("htm_tm_predict",  "tm_predict").unwrap();
+        let activate_fn = self.dev.get_func("htm_tm_activate", "tm_activate").unwrap();
+        let learn_fn    = self.dev.get_func("htm_tm_learn",    "tm_learn_reinforce").unwrap();
+        let punish_fn   = self.dev.get_func("htm_tm_punish",   "tm_punish").unwrap();
+        let grow_fn     = self.dev.get_func("htm_tm_grow",     "tm_grow").unwrap();
+        let anom_fn     = self.dev.get_func("htm_tm_anomaly",  "tm_anomaly").unwrap();
+        let reset_fn    = self.dev.get_func("htm_tm_reset",    "tm_reset_step").unwrap();
+        // Bump the step counter before building the config so `iter_seed`
+        // differs on every step; the same config value is shared by all
+        // kernels launched below.
+        self.iter_counter = self.iter_counter.wrapping_add(1);
+        let cfg_val = self.build_cfg();
+        // 0. Per-step reset.
+        // One thread per element of the larger of the bitset word count and
+        // the column count, in blocks of 256.
+        let reset_words = self.bits_words.max(n_cols);
+        let reset_cfg = LaunchConfig {
+            grid_dim: (((reset_words + 255) / 256) as u32, 1, 1),
+            block_dim: (256, 1, 1),
+            shared_mem_bytes: 0,
+        };
+        unsafe {
+            reset_fn.clone().launch(
+                reset_cfg,
+                (
+                    &mut self.cell_active_bits,
+                    &mut self.cell_winner_bits,
+                    &mut self.cell_predictive_bits,
+                    &mut self.prev_active_bits,
+                    &mut self.prev_winner_bits,
+                    &mut self.col_predicted,
+                    &mut self.unpredicted_count,
+                    &mut self.burst_cols_count,
+                    &mut self.col_best_match,
+                    self.bits_words as u32,
+                    n_cols as u32,
+                ),
+            )?;
+        }
+        // 1. Predict (grid = n_cells; each block iterates its cell's segments).
+        let predict_cfg = LaunchConfig {
+            grid_dim: (n_cells as u32, 1, 1),
+            block_dim: (32, 1, 1),
+            shared_mem_bytes: 0,
+        };
+        unsafe {
+            predict_fn.clone().launch(
+                predict_cfg,
+                (
+                    &self.seg_cell_id,
+                    &self.seg_syn_count,
+                    &self.syn_presyn,
+                    &self.syn_perm,
+                    &self.prev_active_bits,
+                    &mut self.cell_predictive_bits,
+                    &mut self.col_predicted,
+                    &mut self.seg_num_active_conn,
+                    &mut self.seg_num_active_pot,
+                    &mut self.col_best_match,
+                    &self.cell_seg_count,
+                    cfg_val,
+                ),
+            )?;
+        }
+        // 2. Activate.
+        // One thread per column, in blocks of 256.
+        let activate_cfg = LaunchConfig {
+            grid_dim: (((n_cols + 255) / 256) as u32, 1, 1),
+            block_dim: (256, 1, 1),
+            shared_mem_bytes: 0,
+        };
+        unsafe {
+            activate_fn.clone().launch(
+                activate_cfg,
+                (
+                    sp_active_mask,
+                    &self.col_predicted,
+                    &self.cell_predictive_bits,
+                    &mut self.cell_active_bits,
+                    &mut self.cell_winner_bits,
+                    &mut self.unpredicted_count,
+                    &mut self.burst_cols_flat,
+                    &mut self.burst_cols_count,
+                    cfg_val,
+                ),
+            )?;
+        }
+        // 3. Anomaly.
+        // Single block of 256 threads; receives the unpredicted-column count
+        // and the output slot index.
+        let anom_cfg = LaunchConfig {
+            grid_dim: (1, 1, 1),
+            block_dim: (256, 1, 1),
+            shared_mem_bytes: 0,
+        };
+        unsafe {
+            anom_fn.clone().launch(
+                anom_cfg,
+                (
+                    sp_active_mask,
+                    &self.unpredicted_count,
+                    anomaly_out,
+                    t_slot,
+                    n_cols as u32,
+                ),
+            )?;
+        }
+        if learn {
+            // 4. Reinforce (grid = n_cells).
+            let learn_cfg = LaunchConfig {
+                grid_dim: (n_cells as u32, 1, 1),
+                block_dim: (32, 1, 1),
+                shared_mem_bytes: 0,
+            };
+            unsafe {
+                learn_fn.clone().launch(
+                    learn_cfg,
+                    (
+                        &self.seg_cell_id,
+                        &self.seg_syn_count,
+                        &self.syn_presyn,
+                        &mut self.syn_perm,
+                        &self.seg_num_active_conn,
+                        &self.prev_active_bits,
+                        sp_active_mask,
+                        &self.col_predicted,
+                        &self.cell_seg_count,
+                        cfg_val,
+                    ),
+                )?;
+            }
+            // 5. Punish.
+            // Reuses the reinforce launch geometry (one 32-thread block per
+            // cell).
+            unsafe {
+                punish_fn.clone().launch(
+                    learn_cfg,
+                    (
+                        &self.seg_cell_id,
+                        &self.seg_syn_count,
+                        &self.syn_presyn,
+                        &mut self.syn_perm,
+                        &self.seg_num_active_pot,
+                        &self.prev_active_bits,
+                        sp_active_mask,
+                        &self.cell_seg_count,
+                        cfg_val,
+                    ),
+                )?;
+            }
+            // 6. Grow.
+            // One 32-thread block per column; the kernel is given the list
+            // and count of bursting columns recorded by the activate step.
+            let grow_cfg = LaunchConfig {
+                grid_dim: (n_cols as u32, 1, 1),
+                block_dim: (32, 1, 1),
+                shared_mem_bytes: 0,
+            };
+            unsafe {
+                grow_fn.clone().launch(
+                    grow_cfg,
+                    (
+                        &mut self.seg_cell_id,
+                        &mut self.seg_syn_count,
+                        &mut self.syn_presyn,
+                        &mut self.syn_perm,
+                        &mut self.cell_seg_count,
+                        &self.burst_cols_flat,
+                        &self.burst_cols_count,
+                        &self.prev_winner_bits,
+                        &self.prev_active_bits,
+                        &self.col_best_match,
+                        cfg_val,
+                    ),
+                )?;
+            }
+        }
+        Ok(())
+    }
+}

overlay/hydra/eval.py ADDED Viewed

	@@ -0,0 +1,210 @@

+"""Evaluation: factual probes + sampled factual English scoring.
+Extracted from train.py (W1 modularization). Semantics unchanged.
+Perf optimizations (eval_perf_fix):
+- Probe mode: single forward per prompt instead of autoregressive gen
+- Batch decode: all GPU work first, all CPU decode after
+- Factual probes: one forward per prompt (a single batched forward can exceed the cooperative launch limit; see the comment in run_factual_probes)
+"""
+from __future__ import annotations
+import os
+import re as _re
+import torch
+from hydra.config import FACTUAL_SAMPLES, FACTUAL_BATCH, FACTUAL_GEN_TOKENS
+# Default to probe mode (1 forward per prompt); set HYDRA_FACTUAL_MODE=gen for
+# the original autoregressive generation path.
+FACTUAL_MODE = os.environ.get("HYDRA_FACTUAL_MODE", "probe")
+# Scored prompts: each entry is (prompt, accepted_answers). Answers are
+# compared case-insensitively by the scoring functions below.
+FACTUAL_EVAL = [
+    # Hard factual recall — requires specific knowledge memorization
+    ("The capital of France is", ["Paris", "paris"]),
+    ("Water boils at", ["100", "boiling"]),
+    ("The largest planet in our solar system is", ["Jupiter", "jupiter"]),
+    # Easier completions — common collocations / patterns the model may pick up
+    ("Once upon a", ["time"]),
+    ("Hello, my name", ["is", "'s"]),
+    ("The cat sat on the", ["mat", "floor", "rug", "table", "couch", "chair", "ground"]),
+    ("She opened the door and", ["walked", "saw", "found", "stepped", "looked", "went", "ran"]),
+    # Original hard ones kept for completeness
+    ("The speed of light is approximately", ["299", "300", "186,000", "light speed"]),
+    ("Two plus two equals", ["4", "four"]),
+]
+# Unscored prompts whose top next-token predictions are printed by
+# run_factual_probes for manual inspection.
+_FACTUAL_PROBES = [
+    "The capital of France is",
+    "Water boils at",
+    "The largest planet in our solar system is",
+    "The speed of light is approximately",
+    "Shakespeare wrote",
+]
+def run_factual_probes(model, tokenizer, device, autocast_ctx) -> None:
+    """Print top next-token predictions for the canonical factual prompts.
+    Each prompt in ``_FACTUAL_PROBES`` gets its own batch-size-1 forward pass
+    (not a single padded batch). The top-5 tokens are computed and the first
+    three are printed with their probabilities.
+    Args:
+        model: callable as ``model(x)`` returning logits indexable as
+            ``logits[0, -1]``; switched to eval mode and left there.
+        tokenizer: object exposing ``encode(str)`` and ``decode(list[int])``.
+        device: device on which the prompt tensor is created.
+        autocast_ctx: context manager entered around each forward pass.
+    """
+    print("\n--- Factual Probes ---")
+    model.eval()
+    # Process probes one at a time to avoid cooperative launch limit
+    # (batched forward with B=len(probes) can exceed SM residency cap).
+    for prompt_text in _FACTUAL_PROBES:
+        ids = tokenizer.encode(prompt_text)
+        x = torch.tensor([ids], device=device)
+        with torch.no_grad(), autocast_ctx:
+            logits = model(x)
+        # Distribution over the token following the last prompt token.
+        probs = torch.softmax(logits[0, -1].float(), dim=-1)
+        top5 = torch.topk(probs, 5)
+        completions = [tokenizer.decode([idx.item()]) for idx in top5.indices]
+        # Only the three most probable candidates are reported.
+        probs_list = [f"{p:.4f}" for p in top5.values[:3].tolist()]
+        print(f'  "{prompt_text}" -> {completions[:3]} (p={probs_list})')
+    print("--- End Factual Probes ---\n")
+# ---------------------------------------------------------------------------
+# Probe mode: single forward per prompt (Fix D)
+# ---------------------------------------------------------------------------
+def _run_factual_english_probe(model, tokenizer, max_seq_len: int):
+    """Fast probe mode: for each (prompt, answers), encode prompt + each answer
+    candidate as a single sequence, do ONE forward pass, and check if the model's
+    argmax at the last prompt token matches the first answer token.
+    Falls back to checking top-K predictions to be generous (same as gen mode
+    which samples multiple temperatures).
+    """
+    print("---")
+    print("factual_english_samples: (probe mode)")
+    model.eval()
+    hits = 0
+    with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        for prompt, answers in FACTUAL_EVAL:
+            prompt_ids = tokenizer.encode(prompt)
+            prompt_len = len(prompt_ids)
+            x = torch.tensor([prompt_ids], device="cuda", dtype=torch.long)
+            logits = model(x, targets=None)
+            # logits shape: [1, seq_len, vocab] or [1, vocab]
+            if logits.dim() == 3:
+                last_logits = logits[0, -1, :]
+            else:
+                last_logits = logits[0]
+            probs = torch.softmax(last_logits.float(), dim=-1)
+            # Check top-K predictions (generous: K=20 to match multi-sample gen)
+            top_k = min(20, probs.shape[-1])
+            top_ids = torch.topk(probs, top_k).indices.tolist()
+            top_tokens = [tokenizer.decode([tid]).strip().lower() for tid in top_ids]
+            answers_lower = [a.lower() for a in answers]
+            any_hit = any(
+                any(a in tok for a in answers_lower)
+                for tok in top_tokens
+            )
+            if any_hit:
+                hits += 1
+            best_completion = tokenizer.decode([top_ids[0]])
+            print(f"  prompt: {prompt!r}")
+            print(f"  output: {(prompt + best_completion).replace(chr(10), ' ')!r}")
+            print(f"  hit:    {any_hit} (probe top-{top_k})")
+    score = hits / len(FACTUAL_EVAL)
+    print("---")
+    print(f"factual_english_score: {score:.4f}")
+    print(f"factual_english_hits:  {hits}/{len(FACTUAL_EVAL)}")
+    return score, hits, len(FACTUAL_EVAL)
+# ---------------------------------------------------------------------------
+# Gen mode: original autoregressive path (Fix F: batch decode)
+# ---------------------------------------------------------------------------
+def _run_factual_english_gen(model, tokenizer, max_seq_len: int):
+    """Original autoregressive generation path with batch decode optimization:
+    all GPU work runs first, then all CPU decoding happens after."""
+    print("---")
+    print("factual_english_samples: (gen mode)")
+    model.eval()
+    num_samples = FACTUAL_SAMPLES
+    batch = FACTUAL_BATCH
+    gen_tokens = FACTUAL_GEN_TOKENS
+    temps = [0.7, 0.9, 1.1]
+    hits = 0
+    with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        for prompt, answers in FACTUAL_EVAL:
+            ids = tokenizer.encode(prompt)
+            answers_lower = [a.lower() for a in answers]
+            # Collect all generated token sequences on GPU first
+            all_rows: list[list[int]] = []
+            samples_done = 0
+            batch_idx = 0
+            while samples_done < num_samples:
+                b = min(batch, num_samples - samples_done)
+                temp = temps[batch_idx % len(temps)]
+                batch_idx += 1
+                ctx = torch.tensor([ids] * b, device="cuda", dtype=torch.long)
+                for _ in range(gen_tokens):
+                    logits = model(ctx, targets=None)
+                    next_logits = logits[:, -1, :] if logits.dim() == 3 else logits
+                    probs = torch.softmax(next_logits.float() / temp, dim=-1)
+                    next_id = torch.multinomial(probs, num_samples=1)
+                    ctx = torch.cat([ctx, next_id], dim=1)
+                    if ctx.size(1) >= max_seq_len:
+                        break
+                # Transfer to CPU in one shot, no per-row sync
+                all_rows.extend(ctx.cpu().tolist())
+                samples_done += b
+            # CPU-side batch decode — no GPU sync between decodes
+            any_hit = False
+            first_gen = None
+            hit_gen = None
+            for row in all_rows:
+                generated = tokenizer.decode(row)
+                continuation = generated[len(prompt):].strip()
+                _words = set(w.lower() for w in _re.findall(r"\b[\w'-]+\b", continuation))
+                hit = any(a in _words for a in answers_lower)
+                if first_gen is None:
+                    first_gen = generated
+                if hit:
+                    any_hit = True
+                    if hit_gen is None:
+                        hit_gen = generated
+            if any_hit:
+                hits += 1
+            print(f"  prompt: {prompt!r}")
+            print(f"  output: {(first_gen or '').replace(chr(10), ' ')!r}")
+            print(f"  hit:    {any_hit} (any of {num_samples} samples, temps={temps}, gen={gen_tokens}tok)")
+            if hit_gen is not None and hit_gen != first_gen:
+                print(f"  hit_sample: {hit_gen.replace(chr(10), ' ')!r}")
+    score = hits / len(FACTUAL_EVAL)
+    print("---")
+    print(f"factual_english_score: {score:.4f}")
+    print(f"factual_english_hits:  {hits}/{len(FACTUAL_EVAL)}")
+    return score, hits, len(FACTUAL_EVAL)
+# ---------------------------------------------------------------------------
+# Public entry point
+# ---------------------------------------------------------------------------
+def run_factual_english(model, tokenizer, max_seq_len: int):
+    """Dispatch to probe (fast, default) or gen (original) mode.
+    Set HYDRA_FACTUAL_MODE=gen to use the autoregressive path.
+    """
+    if FACTUAL_MODE == "gen":
+        return _run_factual_english_gen(model, tokenizer, max_seq_len)
+    return _run_factual_english_probe(model, tokenizer, max_seq_len)

overlay/hydra/model.py ADDED Viewed

	@@ -0,0 +1,659 @@

+"""PostSemClawModel — full-architecture model assembly.
+Extracted from the monolithic train.py (W1 modularization). Semantics
+unchanged. Imports `GPUEngram` from `hydra.engram` and `MuonAdamW` from
+`hydra.optimizer`.
+Triton kernel integration status (Phase 2):
+  HYDRA_FUSED_BCNORM — DEFERRED. The bcnorm_fused Triton kernel fuses
+    LayerNorm + RoPE on B/C projections. However, mamba-ssm's Mamba3 block
+    uses RMSNormGated (not LayerNorm) for B/C, and RoPE is applied inside
+    the mamba3_siso_combined CUDA kernel via the Angles parameter. Replacing
+    would require either (a) monkey-patching RMSNormGated + intercepting the
+    fused CUDA scan — invasive, 50+ lines, high breakage risk — or (b) a
+    full custom Mamba3Block reimplementation. Both are out of scope for
+    Phase 2. The kernel is validated standalone; integration deferred to
+    Phase 3 when HYDRA moves to a custom SSM block.
+  HYDRA_FUSED_SSD — DEFERRED. The ssd_exp_trap Triton kernel implements
+    exponential-trapezoidal discretization as a sequential scan. mamba-ssm's
+    Mamba3 block delegates the entire scan + gating + output projection to
+    mamba3_siso_combined (a compiled CUDA kernel with tilelang). Replacing
+    it would require decomposing the combined kernel into constituent ops
+    and substituting only the scan — not feasible without a custom block.
+    Same Phase 3 gate as above.
+Both env vars are accepted but currently no-ops (gates read, logged, but
+the code path is unchanged). This avoids silent regression if someone
+sets them expecting a speedup.
+"""
+from __future__ import annotations
+import os
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mamba_ssm import Mamba3
+from subsystems.hestia_mini import HestiaQAT
+from subsystems.htm import HTMLayer
+from subsystems.mhc_mini import ManifoldHyperConnection
+from subsystems.sdr_semantic import SemanticFoldingSDR
+from hydra.engram import GPUEngram
+from hydra.optimizer import MuonAdamW
+def norm(x: torch.Tensor) -> torch.Tensor:
+    """RMSNorm over the last dim — stateless, autocast-friendly."""
+    return F.rms_norm(x, (x.size(-1),))
+class PostSemClawModel(nn.Module):
+    """Full Post-SEM-Claw model assembly.
+    Architecture:
+        Token Embedding -> [Mamba3 + residual] x n_layer
+        -> SDR + Engram (at configured layer) -> norm -> LM head
+    Interface (must match prepare.py evaluate_bpb):
+        model(x, y, reduction='none').view(-1)  -> per-token losses
+        model(x, y, reduction='mean')           -> scalar loss
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        # Token embedding
+        self.wte = nn.Embedding(config.vocab_size, config.d_model)
+        # Mamba-3 blocks — official mamba-ssm fused CUDA kernel. No fallbacks.
+        # RoPE is applied internally by the Mamba3 CUDA kernel via the Angles
+        # parameter; external cos/sin buffers are not needed.
+        self.blocks = nn.ModuleList([
+            Mamba3(
+                d_model=config.d_model,
+                d_state=config.d_state,
+                expand=config.expand,
+                headdim=config.headdim,
+                is_mimo=False,          # SISO path uses stable mamba3_siso_combined kernel
+                chunk_size=64,          # upstream-recommended SISO chunk; 16 violated tl.dot M>=16 constraint
+                is_outproj_norm=False,
+                dtype=torch.bfloat16,
+            )
+            for _ in range(config.n_layer)
+        ])
+        # Full-architecture SDR: offline semantic retina + STE (no-bypass).
+        self.sdr_semantic = SemanticFoldingSDR(
+            vocab_size=config.vocab_size,
+            n_bits=config.sdr_n_bits,
+            target_active=config.sdr_target_active,
+            delta_rank=config.sdr_delta_rank,
+            som_warmup_steps=config.sdr_som_warmup,
+            som_update_interval=config.sdr_som_interval,
+        )
+        # HTM spatial pooler + temporal memory (Rust, Hebbian).
+        self.htm = HTMLayer(
+            input_bits=config.sdr_n_bits,
+            n_columns=config.htm_n_columns,
+            cells_per_column=config.htm_cells_per_column,
+            batch_size=1,           # grows lazily to actual B on first forward
+            seed=42,
+            learn=True,
+            reset_each_forward=True,
+        )
+        # Gradient bridge: (n_columns + anomaly) -> d_model.
+        self.htm_proj = nn.Linear(config.htm_n_columns + 1, config.d_model, bias=False)
+        # GPU Engram with Hebbian writes — runs EVERY step.
+        self.engram = GPUEngram(
+            d_model=config.d_model,
+            n_columns=config.engram_n_columns,
+            max_ngram=3,
+        )
+        self.engram_layer_idx = config.engram_layer_idx
+        # Manifold-Constrained Hyper-Connections (one per Mamba-3 block).
+        self.mhc = nn.ModuleList([
+            ManifoldHyperConnection(d_model=config.d_model, n_streams=2, sinkhorn_iters=3)
+            for _ in range(config.n_layer)
+        ])
+        # Hestia QAT — ternary weight quantization applied post-optimizer-step.
+        self.hestia = HestiaQAT(enabled=True, bits=1.58)
+        # LM head
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+        # Residual dropout
+        self.drop = nn.Dropout(float(os.environ.get("HYDRA_DROPOUT", "0.2")))
+        # Logits soft-capping
+        self.softcap = 15.0
+        # Secondary metrics storage
+        self._metrics = {}
+        # Per-layer diagnostic panel. Env-gated; zero overhead when off.
+        # Emits residual-contribution (delta_ratio), feature std, effective rank,
+        # gradient norm per layer; used to identify minimum viable n_layer + find
+        # entropy leakage / dead layers. See docs/depth-sweep.md.
+        self._diag_enabled = os.environ.get("HYDRA_LAYER_DIAGNOSTICS", "0") == "1"
+        self._diag_step = 0
+        self._diag_svd_every = int(os.environ.get("HYDRA_LAYER_DIAG_SVD_EVERY", "100"))
+        if self._diag_enabled:
+            # Gradient-norm backward hooks on each Mamba3 block output.
+            for _i, _block in enumerate(self.blocks):
+                def _mk_grad_hook(_layer_idx):
+                    def _hook(module, grad_input, grad_output):
+                        if grad_output and grad_output[0] is not None:
+                            g = grad_output[0].detach()
+                            self._metrics[f'layer_{_layer_idx}_grad_norm'] = float(
+                                g.pow(2).mean().sqrt().item()
+                            )
+                    return _hook
+                _block.register_full_backward_hook(_mk_grad_hook(_i))
+            # Forward hooks on each Mamba3 block capture the block's OUTPUT
+            # directly. This is the clean measurement: unlike merge_streams()
+            # sampling which sees (streams + M*block_output) in bf16 — where
+            # small block contributions round to zero against unit-norm
+            # residuals — this captures `block_output` itself as produced.
+            # Reports both its absolute RMS norm and its ratio to the block
+            # INPUT's RMS norm (contribution magnitude relative to the
+            # residual it's added to).
+            for _i, _block in enumerate(self.blocks):
+                def _mk_fwd_hook(_layer_idx):
+                    def _hook(module, inputs, output):
+                        with torch.no_grad():
+                            inp = inputs[0].detach().float() if inputs else None
+                            out = output.detach().float() if isinstance(output, torch.Tensor) else None
+                            if out is not None:
+                                out_rms = out.pow(2).mean().sqrt().item()
+                                self._metrics[f'layer_{_layer_idx}_block_out_rms'] = float(out_rms)
+                                if inp is not None:
+                                    in_rms = inp.pow(2).mean().sqrt().item()
+                                    self._metrics[f'layer_{_layer_idx}_block_in_rms'] = float(in_rms)
+                                    self._metrics[f'layer_{_layer_idx}_contrib_ratio'] = float(
+                                        out_rms / (in_rms + 1e-8)
+                                    )
+                    return _hook
+                _block.register_forward_hook(_mk_fwd_hook(_i))
+        # Triton kernel integration gates (Phase 2 — deferred, see module docstring).
+        self._fused_bcnorm = os.environ.get("HYDRA_FUSED_BCNORM", "0") == "1"
+        self._fused_ssd = os.environ.get("HYDRA_FUSED_SSD", "0") == "1"
+        if self._fused_bcnorm or self._fused_ssd:
+            import sys
+            _active = []
+            if self._fused_bcnorm:
+                _active.append("HYDRA_FUSED_BCNORM")
+            if self._fused_ssd:
+                _active.append("HYDRA_FUSED_SSD")
+            print(
+                f"[HYDRA] Triton kernel gates set: {', '.join(_active)}. "
+                f"NOTE: Both are DEFERRED (mamba-ssm Mamba3 uses internal "
+                f"CUDA kernels). Gates accepted but currently no-ops.",
+                file=sys.stderr,
+            )
+        # R6 optional torch.compile on the impl forward. Gated (default OFF).
+        if os.environ.get("HYDRA_MODEL_COMPILE", "0") == "1":
+            self._forward_impl = torch.compile(
+                self._forward_impl,
+                fullgraph=False,
+                dynamic=True,
+                mode="default",
+            )
+    @torch.no_grad()
+    def init_weights(self) -> None:
+        s = 3 ** 0.5 * self.config.d_model ** -0.5
+        # Move SDR retina indices (plain attribute, not buffer) to same device as params.
+        # Required because to_empty() only moves params/buffers, and _retina_indices
+        # is loaded from numpy (always CPU) by SemanticFoldingSDR.__init__.
+        device = self.wte.weight.device
+        if hasattr(self.sdr_semantic, '_retina_indices'):
+            self.sdr_semantic._retina_indices = self.sdr_semantic._retina_indices.to(device)
+        # Embedding init: GPT-2 / LLaMA convention. std=1.0 was chosen for
+        # vocab=8192; at larger vocabs, smaller std prevents logit blowup.
+        # Use std = 1/sqrt(d_model) which scales sensibly with model width.
+        import math as _math
+        _d_model = self.wte.weight.shape[1]
+        wte_std = float(os.environ.get("HYDRA_WTE_STD", str(1.0 / _math.sqrt(_d_model))))
+        nn.init.normal_(self.wte.weight, mean=0.0, std=wte_std)
+        # LM head init: was std=0.001 — PATHOLOGICAL at vocab>=32k because
+        # logits collapse to zero, loss locks at log(V)~=11, gradient through
+        # head ∝ 1/V is too small to escape. GPT-2 uses std=0.02; LLaMA uses
+        # std=1/sqrt(d_model). Pick 0.02 as robust default, env-overridable.
+        lm_head_std = float(os.environ.get("HYDRA_LM_HEAD_STD", "0.02"))
+        nn.init.normal_(self.lm_head.weight, mean=0.0, std=lm_head_std)
+        # F8 (NOT APPLIED): Weight tying would save V*D params but current LR
+        # groups have embedding_lr=1.0 and unembedding_lr=0.005 × d_model_scale
+        # — tying forces the shared tensor under a single LR group and either
+        # the embeddings learn 200x too slow (under unembed LR) or the LM head
+        # becomes unstable (under embed LR). Short 15-step smoke with tying +
+        # embed-group update showed initial loss jump 9 -> 20. Deferred until
+        # LR groups are re-tuned; see docs/OPTIMIZATION_PLAN.md Post-plan.
+        for li, block in enumerate(self.blocks):
+            if hasattr(block, 'in_proj') and hasattr(block.in_proj, 'weight'):
+                nn.init.uniform_(block.in_proj.weight, -s, s)
+            if hasattr(block, 'out_proj') and hasattr(block.out_proj, 'weight'):
+                # GPT-2 residual init: std = 0.02 / sqrt(2 * n_layer).
+                # NOT zeros — zero init makes the block a permanent pass-through
+                # (block_out_rms=0, zero gradient flow to SSM internals).
+                # With non-zero init the block contributes to the residual stream
+                # from step 1, giving the SSM scan actual gradient signal.
+                n_layer = self.config.n_layer
+                out_std = float(os.environ.get(
+                    "HYDRA_OUT_PROJ_STD",
+                    str(0.02 / (2 * n_layer) ** 0.5),
+                ))
+                nn.init.normal_(block.out_proj.weight, mean=0.0, std=out_std)
+        nn.init.normal_(self.htm_proj.weight, mean=0.0, std=s)
+        # Cast to bf16 to match Mamba3 dtype; Muon groups by shape so mixed
+        # dtypes in the same shape group would break lerp_ dtype checks.
+        self.wte.to(dtype=torch.bfloat16)
+        self.htm_proj.to(dtype=torch.bfloat16)
+        self.engram.to(dtype=torch.bfloat16)
+    def estimate_flops(self) -> int:
+        nparams = sum(p.numel() for p in self.parameters())
+        embed_params = self.wte.weight.numel()
+        return 6 * (nparams - embed_params)
+    def num_scaling_params(self) -> dict:
+        wte = sum(p.numel() for p in self.wte.parameters())
+        lm_head = sum(p.numel() for p in self.lm_head.parameters())
+        blocks = sum(p.numel() for p in self.blocks.parameters())
+        sdr = sum(p.numel() for p in self.sdr_semantic.parameters())
+        htm_proj = sum(p.numel() for p in self.htm_proj.parameters())
+        engram = sum(p.numel() for p in self.engram.parameters())
+        total = sum(p.numel() for p in self.parameters())
+        return {
+            'wte': wte, 'lm_head': lm_head, 'blocks': blocks,
+            'sdr_semantic': sdr, 'htm_proj': htm_proj,
+            'engram': engram, 'total': total,
+        }
+    def get_secondary_metrics(self) -> dict:
+        """Flush any lingering CUDA tensors to host (single sync)."""
+        flushed = {}
+        for k, v in self._metrics.items():
+            if hasattr(v, 'item'):
+                try:
+                    flushed[k] = float(v.item())
+                except Exception:
+                    flushed[k] = v
+            else:
+                flushed[k] = v
+        return flushed
+    def setup_optimizer(self, unembedding_lr=0.004, embedding_lr=0.6, matrix_lr=0.04,
+                        weight_decay=0.2, adam_betas=(0.8, 0.95), scalar_lr=0.5):
+        """Setup MuonAdamW optimizer with per-component LR groups."""
+        model_dim = self.config.d_model
+        embedding_params = list(self.wte.parameters())
+        lm_head_params = list(self.lm_head.parameters())
+        # Matrix params -> Muon (exactly 2D weight matrices).
+        matrix_params = []
+        for p in self.blocks.parameters():
+            if p.dim() == 2:
+                matrix_params.append(p)
+        # NOTE (W1 audit REG-2): SemanticFoldingSDR.delta_u / delta_v are
+        # currently GRADIENT-DEAD. The forward path uses `binary_only(idx)` for
+        # HTM and stores it as `self._last_sdr`, but does NOT route the STE
+        # output through any downstream op. Including them in the Muon group
+        # burns compute (stack + orthogonalize + lerp) on zero-grad params
+        # every step. Excluded here; a later W5 pass can reconnect STE via a
+        # gated residual if the SDR signal is wanted back in-graph. The
+        # parameters still exist, so no state_dict break.
+        # for p in self.sdr_semantic.parameters():
+        #     if p.dim() == 2:
+        #         matrix_params.append(p)
+        for p in self.htm_proj.parameters():
+            if p.dim() == 2:
+                matrix_params.append(p)
+        for p in self.engram.parameters():
+            if p.dim() == 2:
+                matrix_params.append(p)
+        # SDR params are intentionally not in any optimizer group — they
+        # receive no gradient in the current forward, so any update would be
+        # pure noise (weight_decay × lr on a zero-grad param).
+        sdr_param_ids = set(id(p) for p in self.sdr_semantic.parameters())
+        assigned = set(id(p) for p in embedding_params + lm_head_params + matrix_params)
+        scalar_params = [
+            p for p in self.parameters()
+            if id(p) not in assigned and id(p) not in sdr_param_ids
+        ]
+        total_assigned = len(embedding_params) + len(lm_head_params) + len(matrix_params) + len(scalar_params)
+        total_params = len(list(self.parameters()))
+        sdr_excluded = len(list(self.sdr_semantic.parameters()))
+        assert total_assigned + sdr_excluded == total_params, (
+            f"Parameter count mismatch: assigned {total_assigned} + sdr_excluded "
+            f"{sdr_excluded} vs total {total_params}"
+        )
+        dmodel_lr_scale = (model_dim / 768) ** -0.5
+        print(f"Scaling AdamW LRs by 1/sqrt({model_dim}/768) = {dmodel_lr_scale:.6f}")
+        param_groups = [
+            dict(kind='adamw', params=lm_head_params,
+                 lr=unembedding_lr * dmodel_lr_scale, betas=adam_betas,
+                 eps=1e-10, weight_decay=0.0),
+            dict(kind='adamw', params=embedding_params,
+                 lr=embedding_lr * dmodel_lr_scale, betas=adam_betas,
+                 eps=1e-10, weight_decay=0.0),
+        ]
+        if scalar_params:
+            param_groups.append(
+                dict(kind='adamw', params=scalar_params,
+                     lr=scalar_lr * dmodel_lr_scale, betas=adam_betas,
+                     eps=1e-10, weight_decay=0.0)
+            )
+        for shape in sorted({p.shape for p in matrix_params}):
+            group_params = [p for p in matrix_params if p.shape == shape]
+            param_groups.append(dict(
+                kind='muon', params=group_params, lr=matrix_lr,
+                momentum=0.95, ns_steps=5, beta2=0.95, weight_decay=weight_decay,
+            ))
+        optimizer = MuonAdamW(param_groups)
+        for group in optimizer.param_groups:
+            group["initial_lr"] = group["lr"]
+        return optimizer
+    def forward(self, idx, targets=None, reduction='mean'):
+        """idx: (B, T) int64. Returns loss if targets given, else logits.
+        Nested bf16 autocast is a no-op when ambient autocast is already on;
+        when it's off (e.g. integration tests) we establish the dtype contract.
+        """
+        if torch.is_autocast_enabled():
+            return self._forward_impl(idx, targets=targets, reduction=reduction)
+        with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+            return self._forward_impl(idx, targets=targets, reduction=reduction)
+    def _forward_impl(self, idx, targets=None, reduction='mean'):
+        B, T = idx.shape
+        # Diagnostic: per-subsystem CUDA event timing. Env-gated; zero overhead
+        # when disabled. Logs one timing line per forward call. Used to isolate
+        # which subsystem is the tps bottleneck on paid hardware.
+        _profile = os.environ.get("HYDRA_PROFILE_FORWARD", "0") == "1"
+        if _profile:
+            def _ev():
+                e = torch.cuda.Event(enable_timing=True)
+                e.record()
+                return e
+            _t0 = _ev()
+        else:
+            _t0 = None
+        # Compute SDR binary ONCE and reuse for both HTM input and the stash.
+        sdr_binary = self.sdr_semantic.binary_only(idx)
+        self._last_sdr = sdr_binary  # uint8 stash (not bf16 → 256MB avoidance)
+        # HTM subsampling: run HTM on 1 of every N micro-batches within a
+        # gradient accumulation step, reuse the cached result for the other
+        # N-1 micro-batches. Cooperative launch monopolizes all SMs (grid.sync
+        # requires full-grid residency), so HTM and mamba can't overlap via
+        # streams. Subsampling removes HTM from most micro-batches' critical
+        # path instead.
+        #
+        # Math: N=8, 64 accum steps → 8 HTM calls (10.6ms each) + 56 fast
+        # calls (4ms each). Total = 84.8 + 224 = 309ms → 106k tps.
+        #
+        # HYDRA_HTM_SUBSAMPLE=N (default 8). Set =1 for every-microbatch HTM.
+        _htm_sub = int(os.environ.get("HYDRA_HTM_SUBSAMPLE", "8"))
+        if not hasattr(self, '_htm_call_idx'):
+            self._htm_call_idx = 0
+        _run_htm = (self._htm_call_idx % _htm_sub == 0)
+        self._htm_call_idx += 1
+        if _run_htm:
+            htm_handle = self.htm.forward_async(sdr_binary)
+        else:
+            htm_handle = None
+        if _profile: _t_htm_async = _ev()
+        dense_emb = self.wte(idx)  # (B, T, d_model) bf16
+        if _profile: _t_wte = _ev()
+        if _run_htm:
+            htm_out = self.htm.forward_await(htm_handle)
+            self._htm_cache = htm_out.detach()  # cache for non-HTM micro-batches
+        elif hasattr(self, '_htm_cache') and self._htm_cache is not None \
+                and self._htm_cache.shape[0] == B and self._htm_cache.shape[1] == T:
+            htm_out = self._htm_cache
+        else:
+            # Very first call with subsample > 1: run HTM anyway.
+            htm_handle = self.htm.forward_async(sdr_binary)
+            htm_out = self.htm.forward_await(htm_handle)
+            self._htm_cache = htm_out.detach()
+        if _profile: _t_htm_await = _ev()
+        with torch.no_grad():
+            sdr_active_bits = float(self.sdr_semantic.target_active)
+            htm_anomaly = htm_out[..., -1].mean()
+        # Gradient bridge: HTM columns+anomaly -> d_model.
+        htm_proj_out = self.htm_proj(htm_out.to(dense_emb.dtype))
+        x = dense_emb + htm_proj_out
+        x = norm(x)
+        if _profile: _t_htm_proj = _ev()
+        # mHC-routed Mamba-3 stack with Engram injection at configured layer.
+        streams = self.mhc[0].init_streams(x)
+        _engram_ev = None
+        # Per-layer diagnostic panel. The pre-layer merged state h_pre lets us
+        # measure residual contribution of each layer: delta_N = h_post - h_pre.
+        # All reads are detached no-grad to avoid autograd graph pollution.
+        _diag = self._diag_enabled
+        if _diag:
+            # Cast to float32 for the diagnostic arithmetic: the layer's
+            # residual contribution is small (~0.5 × rms-normed block output),
+            # which underflows in bf16 subtraction (3-digit mantissa) and
+            # reports delta_ratio=0 at the boundaries. float32 snapshot is
+            # ~3.8 MB extra memory per diag sample (B=1, T=2048, d=96) —
+            # negligible vs peak VRAM.
+            with torch.no_grad():
+                h_pre = self.mhc[0].merge_streams(streams).detach().float()
+            _run_svd = (self._diag_step % self._diag_svd_every) == 0
+        for i, (block, mhc_layer) in enumerate(zip(self.blocks, self.mhc)):
+            def _block_fn(h, _block=block):
+                return self.drop(_block(norm(h)))
+            streams = mhc_layer(streams, _block_fn)
+            if i == self.engram_layer_idx:
+                if _profile: _t_pre_engram = _ev()
+                x_mid = mhc_layer.merge_streams(streams)
+                x_mid, hit_rate = self.engram(x_mid, idx)
+                streams = mhc_layer.init_streams(x_mid)
+                self._metrics['engram_hit_rate'] = hit_rate
+                if _profile: _engram_ev = _ev()
+            if _diag:
+                with torch.no_grad():
+                    h_post = mhc_layer.merge_streams(streams).detach().float()
+                    in_n  = h_pre.pow(2).mean().sqrt()
+                    out_n = h_post.pow(2).mean().sqrt()
+                    d_n   = (h_post - h_pre).pow(2).mean().sqrt()
+                    self._metrics[f'layer_{i}_in_norm']     = float(in_n.item())
+                    self._metrics[f'layer_{i}_out_norm']    = float(out_n.item())
+                    self._metrics[f'layer_{i}_delta_ratio'] = float((d_n / (in_n + 1e-6)).item())
+                    self._metrics[f'layer_{i}_feat_std']    = float(h_post.std(dim=-1).mean().item())
+                    if _run_svd:
+                        # Effective rank via participation ratio of singular values.
+                        # eff_rank = (Σσ)^2 / Σσ² — smooth rank proxy, bounded by d_model.
+                        # Sampled to keep overhead low (SVD is O(min(B*T, D)^2·D)).
+                        flat = h_post.reshape(-1, h_post.shape[-1])[:512].float()
+                        try:
+                            s = torch.linalg.svdvals(flat)
+                            eff_rank = float(((s.sum() ** 2) / (s.pow(2).sum() + 1e-6)).item())
+                            self._metrics[f'layer_{i}_eff_rank'] = eff_rank
+                        except Exception:
+                            pass
+                    h_pre = h_post
+        if _diag:
+            self._diag_step += 1
+        if _profile: _t_blocks = _ev()
+        self._metrics['sdr_active_bits'] = sdr_active_bits
+        self._metrics['htm_anomaly'] = htm_anomaly
+        x = self.mhc[-1].merge_streams(streams)
+        x = norm(x)
+        if _profile: _t_merge = _ev()
+        softcap = self.softcap
+        _softcap_clamp = os.environ.get("HYDRA_SOFTCAP_CLAMP", "0") == "1"
+        if targets is not None:
+            smoothing = self.config.label_smoothing
+            V = self.config.vocab_size
+            # Sampled softmax: instead of computing logits for ALL V tokens,
+            # compute only for the target + K random negatives. Reduces the
+            # lm_head matmul from (B*T, d) × (d, V) to (B*T, d) × (d, K+1).
+            # At V=65536 and K=4096: 16× less compute, ~4× tps improvement.
+            # The log-sum-exp correction adjusts for the sampling bias.
+            # Set HYDRA_SAMPLED_SOFTMAX=0 to disable (full softmax).
+            K_neg = int(os.environ.get("HYDRA_SAMPLED_SOFTMAX", "4096"))
+            use_sampled = K_neg > 0 and K_neg < V and self.training
+            if use_sampled:
+                # Flatten hidden states + targets
+                h_flat = x.reshape(-1, x.shape[-1])            # (B*T, d)
+                t_flat = targets.reshape(-1)                    # (B*T,)
+                n = h_flat.shape[0]
+                # Sample K negatives uniformly from [0, V)
+                neg_ids = torch.randint(0, V, (K_neg,), device=x.device)
+                # Gather lm_head weights for target + negatives
+                all_ids = torch.cat([t_flat, neg_ids])          # (B*T + K,)
+                sampled_w = self.lm_head.weight[all_ids]        # (B*T + K, d)
+                # Compute sampled logits: for each position, dot with its
+                # target weight and all K negative weights.
+                # Target logit: dot product of h[i] with w[target[i]]
+                target_w = sampled_w[:n]                        # (B*T, d)
+                neg_w = sampled_w[n:]                           # (K, d)
+                target_logit = (h_flat * target_w).sum(-1)      # (B*T,)
+                neg_logits = h_flat @ neg_w.t()                 # (B*T, K)
+                if not _softcap_clamp:
+                    target_logit = softcap * torch.tanh(target_logit / softcap)
+                    neg_logits = softcap * torch.tanh(neg_logits / softcap)
+                # Sampled softmax loss: -log(exp(target) / (exp(target) + sum(exp(neg))))
+                # With log-sum-exp correction for sampling K of V negatives.
+                # Correction: add log(V/K) to negative logits to account for
+                # the fact that we're only seeing K of V possible negatives.
+                log_correction = torch.tensor(V / K_neg, device=x.device).log()
+                all_logits = torch.cat([
+                    target_logit.unsqueeze(-1),                 # (B*T, 1)
+                    neg_logits + log_correction,                # (B*T, K)
+                ], dim=-1).float()                              # (B*T, K+1)
+                # CE with target always at index 0
+                ce_targets = torch.zeros(n, dtype=torch.long, device=x.device)
+                if reduction == 'none':
+                    return F.cross_entropy(all_logits, ce_targets, reduction='none')
+                out = F.cross_entropy(all_logits, ce_targets, reduction='mean',
+                                      label_smoothing=smoothing)
+            else:
+                # Full softmax path (eval or HYDRA_SAMPLED_SOFTMAX=0)
+                chunk_size = int(os.environ.get("HYDRA_CE_CHUNK", "1024"))
+                if chunk_size <= 0:
+                    MAX_LOGITS_BYTES = 256 * 1024 * 1024
+                    tokens_per_chunk = max(V, MAX_LOGITS_BYTES // (V * 4))
+                    chunk_size = max(1, tokens_per_chunk // max(1, B))
+                chunk_size = min(chunk_size, T)
+                if reduction == 'none':
+                    loss_parts = []
+                    for start in range(0, T, chunk_size):
+                        end = min(start + chunk_size, T)
+                        chunk_logits = self.lm_head(x[:, start:end, :]).float()
+                        if _softcap_clamp:
+                            chunk_logits = torch.clamp(chunk_logits, -softcap, softcap)
+                        else:
+                            chunk_logits = softcap * torch.tanh(chunk_logits / softcap)
+                        chunk_targets = targets[:, start:end].reshape(-1)
+                        chunk_loss = F.cross_entropy(
+                            chunk_logits.view(-1, chunk_logits.size(-1)),
+                            chunk_targets, ignore_index=-1, reduction='none',
+                        )
+                        loss_parts.append(chunk_loss)
+                    return torch.cat(loss_parts)
+                total_loss = 0.0
+                total_tokens = 0
+                for start in range(0, T, chunk_size):
+                    end = min(start + chunk_size, T)
+                    chunk_logits = self.lm_head(x[:, start:end, :]).float()
+                    if _softcap_clamp:
+                        chunk_logits = torch.clamp(chunk_logits, -softcap, softcap)
+                    else:
+                        chunk_logits = softcap * torch.tanh(chunk_logits / softcap)
+                    chunk_targets = targets[:, start:end].reshape(-1)
+                    chunk_loss = F.cross_entropy(
+                        chunk_logits.view(-1, chunk_logits.size(-1)),
+                        chunk_targets, ignore_index=-1, reduction='sum',
+                        label_smoothing=smoothing,
+                    )
+                    total_loss = total_loss + chunk_loss
+                    total_tokens += (chunk_targets != -1).sum()
+                out = total_loss / total_tokens
+            if _profile:
+                _t_end = _ev()
+                torch.cuda.synchronize()
+                def _ms(a, b): return a.elapsed_time(b)
+                print(
+                    f"[PROFILE B={B} T={T}] "
+                    f"htm_launch={_ms(_t0, _t_htm_async):.2f} "
+                    f"wte={_ms(_t_htm_async, _t_wte):.2f} "
+                    f"htm_await={_ms(_t_wte, _t_htm_await):.2f} "
+                    f"htm_proj={_ms(_t_htm_await, _t_htm_proj):.2f} "
+                    f"mamba_mhc_engram={_ms(_t_htm_proj, _t_blocks):.2f} "
+                    f"merge={_ms(_t_blocks, _t_merge):.2f} "
+                    f"lm_head_loss={_ms(_t_merge, _t_end):.2f} "
+                    f"total={_ms(_t0, _t_end):.2f} ms",
+                    flush=True,
+                )
+            return out
+        logits = self.lm_head(x).float()
+        if _softcap_clamp:
+            logits = torch.clamp(logits, -softcap, softcap)
+        else:
+            logits = softcap * torch.tanh(logits / softcap)
+        return logits

overlay/hydra/optimizer.py ADDED Viewed

	@@ -0,0 +1,252 @@

+"""MuonAdamW optimizer — combined Muon (2D matrices) + AdamW (everything else).
+Extracted verbatim from train.py (W1 modularization). Semantics unchanged.
+F1-F15 state preserved:
+- F7 REVERTED: `stacked_params_buf` persistent across steps was REMOVED — each
+  step calls `torch.stack([p.grad for p in params])` / `torch.stack(params)`
+  fresh. Persistent copies of param storage would be mutated between forward
+  passes (via lerp_/sub_ on stacked tensors that share storage with params),
+  triggering "modified in-place" errors on grad_accum=2 backwards.
+- F11/F15: `torch.compile` on `muon_step_fused` intact (applied through
+  `_maybe_compile`); `adamw_step_fused` is the uncompiled per-param fallback.
+- F15 compile is default-ON (HYDRA_MUON_COMPILE=1), configured with
+  dynamic=True + mode="default" to avoid the step-17→18 cudagraphs
+  stream-capture deadlock. See .omc/muon_compile_bug.md for the full
+  investigation.
+"""
+from __future__ import annotations
+import os
+import torch
+# HYDRA_FUSED_ADAMW=1 (default) -> vectorized torch._fused_adamw_ kernel.
+# Any other value forces the per-parameter adamw_step_fused fallback in
+# MuonAdamW._step_adamw.
+_HYDRA_FUSED_ADAMW = os.environ.get("HYDRA_FUSED_ADAMW", "1") == "1"
+# True when this torch build exposes the private torch._fused_adamw_ op.
+_HAS_FUSED_ADAMW = hasattr(torch, "_fused_adamw_")
+# (a, b, c) coefficient triples for the polar-express orthogonalization loop
+# in muon_step_fused. Iteration i uses entry i, and only the first `ns_steps`
+# entries are consumed, so at most len(polar_express_coeffs) iterations run.
+polar_express_coeffs = [
+    (8.156554524902461, -22.48329292557795, 15.878769915207462),
+    (4.042929935166739, -2.808917465908714, 0.5000178451051316),
+    (3.8916678022926607, -2.772484153217685, 0.5060648178503393),
+    (3.285753657755655, -2.3681294933425376, 0.46449024233003106),
+    (2.3465413258596377, -1.7097828382687081, 0.42323551169305323),
+]
+def adamw_step_fused(p, grad, exp_avg, exp_avg_sq, step_t, lr_t, beta1_t, beta2_t, eps_t, wd_t):
+    # Per-param AdamW fallback. Fast path is torch._fused_adamw_ (1 CUDA launch
+    # for the whole group) driven from MuonAdamW._step_adamw below.
+    grad = grad.to(p.dtype)  # handle mixed bf16/fp32 from autocast
+    p.mul_(1 - lr_t * wd_t)
+    exp_avg.lerp_(grad, 1 - beta1_t)
+    exp_avg_sq.lerp_(grad.square(), 1 - beta2_t)
+    bias1 = 1 - beta1_t ** step_t
+    bias2 = 1 - beta2_t ** step_t
+    denom = (exp_avg_sq / bias2).sqrt() + eps_t
+    step_size = lr_t / bias1
+    p.add_(exp_avg / denom, alpha=-step_size)
+# ---------------------------------------------------------------------------
+# F15 muon_step_fused compile strategy.
+#
+# HYDRA_MUON_COMPILE env gate:
+#   "1" (default ON) — wrap with torch.compile(dynamic=True, mode="default").
+#       Dynamic=True collapses the per-shape specialization cache so that N
+#       Muon param-groups with N distinct shapes trigger 1 compile, not N.
+#       mode="default" keeps the inductor codegen but disables cudagraphs,
+#       which is what caused the step-17→18 silent deadlock observed under
+#       the original dynamic=False configuration: cudagraph stream capture
+#       can deadlock against HTM's CUDA kernels running on the default
+#       stream, and the failure mode at capture-time is a silent hang
+#       (100% GPU util, no log output, process state R).
+#   "0" — fall back to eager Python (slower, ~43k tps vs ~63k compiled).
+#       Keeps an escape hatch in case a future torch/inductor regression
+#       reintroduces a deadlock.
+#
+# Defensive .clone() on stacked_grads before in-place lerp_ eliminates the
+# alias-analysis edge case where inductor sees `g is stacked_grads` and
+# subsequent `stacked_grads.square()` operating on the post-lerp storage.
+# ---------------------------------------------------------------------------
+_MUON_COMPILE = os.environ.get("HYDRA_MUON_COMPILE", "1") == "1"
+def _maybe_compile(fn):
+    if _MUON_COMPILE:
+        # mode="default" explicitly opts OUT of cudagraphs (which reduce-overhead
+        # would enable) to avoid stream-capture deadlocks against HTM's CUDA
+        # kernels. dynamic=True minimizes recompile count across param-group
+        # shapes.
+        return torch.compile(fn, fullgraph=False, dynamic=True, mode="default")
+    return fn
+@_maybe_compile
+def muon_step_fused(stacked_grads, stacked_params, momentum_buffer, second_momentum_buffer,
+                    momentum_t, lr_t, wd_t, beta2_t, ns_steps, red_dim):
+    """Run one Muon update on a stack of same-shaped matrix parameters.
+    Steps, in order: Nesterov momentum, polar-express orthogonalization,
+    NorMuon variance reduction, then a cautious-weight-decay parameter update.
+    Args:
+        stacked_grads: stacked gradients; a converted clone is used internally,
+            so the caller's tensor is not mutated.
+        stacked_params: stacked parameters, updated in place.
+        momentum_buffer: first-moment state, updated in place.
+        second_momentum_buffer: NorMuon second-moment state, updated in place.
+        momentum_t, lr_t, wd_t, beta2_t: hyperparameters as tensors (MuonAdamW
+            passes 0-D CPU tensors).
+        ns_steps: how many entries of ``polar_express_coeffs`` to apply.
+        red_dim: dimension the squared update is averaged over (MuonAdamW
+            passes -1 or -2).
+    Returns:
+        None. ``stacked_params`` and both state buffers are mutated.
+    """
+    # Cast grads to param dtype AND clone defensively to break any alias
+    # between the (freshly-stacked) input and the in-place lerp_ below.
+    # Without this, inductor's alias analysis can emit code that reads from
+    # post-mutation storage when computing `v_mean = g.square().mean(...)`.
+    stacked_grads = stacked_grads.to(momentum_buffer.dtype).clone()
+    # Nesterov momentum
+    momentum = momentum_t.to(stacked_grads.dtype)
+    momentum_buffer.lerp_(stacked_grads, 1 - momentum)
+    g = stacked_grads.lerp_(momentum_buffer, momentum)
+    # Polar express orthogonalization
+    # Runs in bfloat16; X is first scaled by its norm over the last two dims
+    # (with a 1.02 safety factor and 1e-6 floor).
+    X = g.bfloat16()
+    X = X / (X.norm(dim=(-2, -1), keepdim=True) * 1.02 + 1e-6)
+    # Tall matrices (rows > cols) iterate on X^T X, others on X X^T, so the
+    # Gram matrix A is always built on the smaller side.
+    if g.size(-2) > g.size(-1):
+        for a, b, c in polar_express_coeffs[:ns_steps]:
+            A = X.mT @ X
+            B = b * A + c * (A @ A)
+            X = a * X + X @ B
+    else:
+        for a, b, c in polar_express_coeffs[:ns_steps]:
+            A = X @ X.mT
+            B = b * A + c * (A @ A)
+            X = a * X + B @ X
+    g = X
+    # NorMuon variance reduction
+    # Keep beta2 in the state-buffer dtype, not g.dtype (bfloat16 here), so
+    # lerp_ on second_momentum_buffer doesn't hit a dtype mismatch on h200.
+    beta2 = beta2_t.to(second_momentum_buffer.dtype)
+    v_mean = g.float().square().mean(dim=red_dim, keepdim=True)
+    red_dim_size = g.size(red_dim)
+    # v_norm is the per-matrix norm of g before the per-row/column rescale.
+    v_norm_sq = v_mean.sum(dim=(-2, -1), keepdim=True) * red_dim_size
+    v_norm = v_norm_sq.sqrt()
+    second_momentum_buffer.lerp_(v_mean.to(dtype=second_momentum_buffer.dtype), 1 - beta2)
+    step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt()
+    # v_norm_new is the norm g would have after scaling by step_size alone;
+    # final_scale folds in v_norm / v_norm_new so the overall norm is kept.
+    scaled_sq_sum = (v_mean * red_dim_size) * step_size.float().square()
+    v_norm_new = scaled_sq_sum.sum(dim=(-2, -1), keepdim=True).sqrt()
+    final_scale = step_size * (v_norm / v_norm_new.clamp_min(1e-10))
+    g = g * final_scale.to(g.dtype)
+    # Cautious weight decay + parameter update
+    lr = lr_t.to(g.dtype)
+    wd = wd_t.to(g.dtype)
+    # Decay is applied only where the update and the parameter share a sign.
+    mask = (g * stacked_params) >= 0
+    stacked_params.sub_(lr * g + lr * wd * stacked_params * mask)
+class MuonAdamW(torch.optim.Optimizer):
+    """Combined optimizer: Muon for 2D matrix params, AdamW for others."""
+    def __init__(self, param_groups):
+        super().__init__(param_groups, defaults={})
+        # 0-D CPU tensors to avoid torch.compile recompilation when values change
+        self._adamw_step_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_beta1_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_eps_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._muon_momentum_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._muon_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._muon_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._muon_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+    def _step_adamw(self, group):
+        params, grads, exp_avgs, exp_avg_sqs, state_steps = [], [], [], [], []
+        for p in group['params']:
+            if p.grad is None:
+                continue
+            state = self.state[p]
+            if not state:
+                state['step'] = 0
+                state['exp_avg'] = torch.zeros_like(p)
+                state['exp_avg_sq'] = torch.zeros_like(p)
+            if 'step_t' not in state:
+                # _fused_adamw_ wants a per-param float step tensor on-device.
+                state['step_t'] = torch.tensor(
+                    float(state['step']), dtype=torch.float32, device=p.device
+                )
+            state['step'] += 1
+            params.append(p)
+            grads.append(p.grad.to(p.dtype) if p.grad.dtype != p.dtype else p.grad)
+            exp_avgs.append(state['exp_avg'])
+            exp_avg_sqs.append(state['exp_avg_sq'])
+            state_steps.append(state['step_t'])
+        if not params:
+            return
+        if _HYDRA_FUSED_ADAMW and _HAS_FUSED_ADAMW and params[0].is_cuda:
+            # _fused_adamw_ needs uniform (device, dtype) within a call, so
+            # group by (device, dtype) — same pattern as PyTorch's own
+            # AdamW(fused=True) path (_group_tensors_by_device_and_dtype).
+            buckets = {}
+            for p, g, ea, es, st in zip(params, grads, exp_avgs, exp_avg_sqs, state_steps):
+                key = (p.device, p.dtype)
+                buckets.setdefault(key, ([], [], [], [], []))
+                b_p, b_g, b_ea, b_es, b_st = buckets[key]
+                b_p.append(p); b_g.append(g); b_ea.append(ea); b_es.append(es); b_st.append(st)
+            lr_f = float(group['lr'])
+            b1_f = float(group['betas'][0])
+            b2_f = float(group['betas'][1])
+            wd_f = float(group['weight_decay'])
+            eps_f = float(group['eps'])
+            for (_dev, _dt), (b_p, b_g, b_ea, b_es, b_st) in buckets.items():
+                torch._foreach_add_(b_st, 1.0)
+                torch._fused_adamw_(
+                    b_p, b_g, b_ea, b_es,
+                    [],  # max_exp_avg_sqs unused (amsgrad=False)
+                    b_st,
+                    amsgrad=False,
+                    lr=lr_f, beta1=b1_f, beta2=b2_f,
+                    weight_decay=wd_f, eps=eps_f,
+                    maximize=False,
+                    grad_scale=None, found_inf=None,
+                )
+            return
+        # Fallback per-param path.
+        self._adamw_lr_t.fill_(group['lr'])
+        self._adamw_beta1_t.fill_(group['betas'][0])
+        self._adamw_beta2_t.fill_(group['betas'][1])
+        self._adamw_eps_t.fill_(group['eps'])
+        self._adamw_wd_t.fill_(group['weight_decay'])
+        for p, grad, exp_avg, exp_avg_sq in zip(params, grads, exp_avgs, exp_avg_sqs):
+            self._adamw_step_t.fill_(self.state[p]['step'])
+            adamw_step_fused(p, grad, exp_avg, exp_avg_sq,
+                             self._adamw_step_t, self._adamw_lr_t, self._adamw_beta1_t,
+                             self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t)
+    def _step_muon(self, group):
+        params = [p for p in group['params'] if p.grad is not None]
+        if not params:
+            return
+        p = params[0]
+        state = self.state[p]
+        num_params = len(params)
+        shape, device, dtype = p.shape, p.device, p.dtype
+        if "momentum_buffer" not in state:
+            state["momentum_buffer"] = torch.zeros(num_params, *shape, dtype=dtype, device=device)
+        red_dim = -1 if shape[-2] >= shape[-1] else -2
+        if "second_momentum_buffer" not in state:
+            # Shape must match v_mean = stacked_grads.square().mean(dim=red_dim, keepdim=True)
+            full_shape = (num_params, *shape)
+            state_shape = list(full_shape)
+            state_shape[len(state_shape) + red_dim] = 1  # red_dim is negative
+            state["second_momentum_buffer"] = torch.zeros(state_shape, dtype=dtype, device=device)
+        # F7 REVERT: fresh stacks each step (no persistent stacked_params_buf).
+        # This was the autograd-safety fix that unblocks grad_accum>=2.
+        stacked_grads = torch.stack([p.grad for p in params])
+        stacked_params = torch.stack(params)
+        self._muon_momentum_t.fill_(group["momentum"])
+        self._muon_beta2_t.fill_(group["beta2"] if group["beta2"] is not None else 0.0)
+        self._muon_lr_t.fill_(group["lr"] * max(1.0, shape[-2] / shape[-1]) ** 0.5)
+        self._muon_wd_t.fill_(group["weight_decay"])
+        muon_step_fused(stacked_grads, stacked_params,
+                        state["momentum_buffer"], state["second_momentum_buffer"],
+                        self._muon_momentum_t, self._muon_lr_t, self._muon_wd_t,
+                        self._muon_beta2_t, group["ns_steps"], red_dim)
+        torch._foreach_copy_(params, list(stacked_params.unbind(0)))
+    @torch.no_grad()
+    def step(self):
+        for group in self.param_groups:
+            if group['kind'] == 'adamw':
+                self._step_adamw(group)
+            elif group['kind'] == 'muon':
+                self._step_muon(group)

overlay/subsystems/htm.py ADDED Viewed

	@@ -0,0 +1,429 @@

+"""
+HTM torch wrapper around the pyo3 ``htm_rust`` crate.
+Exposes ``HTMLayer``, a ``torch.nn.Module`` that batches calls to
+``htm_rust.HTMRegion.step`` across a ``(B, T, input_bits)`` boolean SDR stream
+and returns ``(B, T, n_columns + 1)`` where the last channel is the anomaly
+score. HTM learning is Hebbian (not gradient), so the wrapper runs under
+``torch.no_grad()``. Downstream layers carry gradients back to the embedding
+via their own learnable projection from the binary column output.
+Per-sequence state semantics
+---------------------------
+Training-time forward passes are independent windows of tokens (re-sampled
+every step), so carrying TM state across calls would mix unrelated contexts.
+This layer calls ``reset()`` on every region at the top of ``forward``; the
+TM learns within-window temporal patterns only. Users that want cross-window
+continuity (e.g. eval over a long document) should instead construct the
+layer and drive ``step_stream`` themselves (not implemented here; the
+single-forward contract is sufficient for the autoresearch loop).
+Device handling
+---------------
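+The CPU backend of ``htm_rust`` (``HTMRegion``) runs on the host. On that
+path, if ``sdr`` lives on CUDA we pay a ``sdr.cpu().numpy()`` round-trip per
+forward and the return tensor is moved back to ``sdr.device``. For expected
+use (batch<=32, T<=2048, bits=16384) this copy is small compared to the SP/TM
+compute. When the crate is built with the ``gpu`` feature and ``sdr`` is a
+CUDA tensor, the layer instead hands device pointers to ``HTMRegionGpu`` via
+``__cuda_array_interface__`` and skips the host round-trip.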
+"""
+from __future__ import annotations
+import time
+from concurrent.futures import ThreadPoolExecutor
+import numpy as np
+import torch
+import torch.nn as nn
+import htm_rust
+# step_many releases the GIL for the whole pass, so multiple threads can
+# truly run regions in parallel — wall-clock scales with B up to CPU cores.
+_HTM_HAS_STEP_MANY = hasattr(htm_rust.HTMRegion, "step_many")
+# GPU backend: built with `maturin develop --features gpu`. One CUDA region
+# per batch slot, persistent device state for SP synapses. Transparent
+# fallback to CPU when not available.
+_HTM_HAS_GPU = hasattr(htm_rust, "HTMRegionGpu")
+# Zero-copy CUDA path: consumes torch CUDA tensors directly via the
+# __cuda_array_interface__ protocol, skipping the sdr.cpu()/numpy round-trip
+# and the D2H of outputs. Huge win when the input SDR already lives on GPU
+# (which is the train.py hot path — retina is a device buffer).
+_HTM_HAS_CAI = _HTM_HAS_GPU and hasattr(htm_rust.HTMRegionGpu, "step_many_cuda")
+# Fused megakernel path: collapses all T timesteps + SP + TM into a single
+# CUDA launch per forward. Replaces global top-K with per-column threshold
+# inhibition (see htm_rust/docs/GPU_HTM.md §Fused Kernel).
+# Controlled by the HYDRA_HTM_FUSED env var: on by default when available,
+# set HYDRA_HTM_FUSED=0 to opt out. When the fused entry point exists the
+# value is parsed with int(), so a non-integer setting raises ValueError at
+# import time.
+import os as _os_fused
+_HTM_HAS_FUSED = _HTM_HAS_GPU and hasattr(htm_rust.HTMRegionGpu, "step_many_fused_cuda")
+_HTM_USE_FUSED = _HTM_HAS_FUSED and bool(int(_os_fused.environ.get("HYDRA_HTM_FUSED", "1")))
+class HTMLayer(nn.Module):
+    """Batched torch wrapper around ``htm_rust.HTMRegion``.
+    One independent region per batch slot so temporal memory learns
+    sequence-local patterns without cross-batch bleed. Regions grow
+    lazily if a larger batch shows up.
+    Output is ``(B, T, n_columns + 1)``: first ``n_columns`` channels are
+    the binary active-column mask (float32 0/1) and the last channel is
+    the per-timestep anomaly score in [0, 1].
+    """
+    def __init__(
+        self,
+        input_bits: int = 16384,
+        n_columns: int = 2048,
+        cells_per_column: int = 32,
+        batch_size: int = 1,
+        seed: int = 42,
+        learn: bool = True,
+        reset_each_forward: bool = True,
+        use_gpu: bool | None = None,
+    ) -> None:
+        super().__init__()
+        self.input_bits = input_bits
+        self.n_columns = n_columns
+        self.cells_per_column = cells_per_column
+        self.learn = learn
+        self.reset_each_forward = reset_each_forward
+        self._seed_base = seed
+        # Learn gating: HTM learn kernels (tm_punish, tm_learn_reinforce, tm_grow)
+        # are 56% of total HTM CUDA time. Gating them to run every N forwards
+        # instead of every forward cuts HTM cost ~2x. Hebbian learning still
+        # converges since the EMA accumulates over many calls. Env:
+        # HYDRA_HTM_LEARN_EVERY=N (default 1 = every forward, 0 = disabled).
+        import os as _os
+        self._learn_every = max(1, int(_os.environ.get("HYDRA_HTM_LEARN_EVERY", "1")))
+        self._forward_counter = 0
+        # GPU backend gate. Default: auto-detect — use GPU when the pyo3
+        # module was built with --features gpu AND CUDA is actually usable.
+        if use_gpu is None:
+            use_gpu = _HTM_HAS_GPU and torch.cuda.is_available()
+        elif use_gpu and not _HTM_HAS_GPU:
+            raise RuntimeError(
+                "HTMLayer(use_gpu=True) but htm_rust was not built with "
+                "--features gpu. Re-run `maturin develop --features gpu`."
+            )
+        self._use_gpu = bool(use_gpu)
+        cls = htm_rust.HTMRegionGpu if self._use_gpu else htm_rust.HTMRegion
+        self._region_cls = cls
+        self._regions = [
+            cls(input_bits, n_columns, cells_per_column, seed + i)
+            for i in range(batch_size)
+        ]
+        self.register_buffer("_dummy", torch.zeros(1), persistent=False)
+        import os as _os
+        self._htm_pool = ThreadPoolExecutor(max_workers=min(_os.cpu_count() or 4, 16))
+    def _ensure_regions(self, B: int) -> None:
+        while len(self._regions) < B:
+            idx = len(self._regions)
+            self._regions.append(
+                self._region_cls(
+                    self.input_bits,
+                    self.n_columns,
+                    self.cells_per_column,
+                    self._seed_base + idx,
+                )
+            )
+    def reset(self) -> None:
+        """Clear TM predictive state on every region (keeps SP synapses)."""
+        for r in self._regions:
+            r.reset()
+    @torch.no_grad()
+    def forward(self, sdr: torch.Tensor) -> torch.Tensor:
+        B, T, D = sdr.shape
+        if D != self.input_bits:
+            raise ValueError(f"expected input_bits={self.input_bits}, got {D}")
+        self._ensure_regions(B)
+        if self.reset_each_forward:
+            self.reset()
+        # Learn-gate: run learn kernels only every N forwards (skips 56% of
+        # HTM CUDA time on skip-forwards; Hebbian EMA still converges).
+        self._forward_counter += 1
+        learn = bool(
+            self.learn
+            and self.training
+            and (self._forward_counter % self._learn_every == 0)
+        )
+        # Zero-copy CUDA hot path. SDR already lives on GPU (retina buffer),
+        # so we skip sdr.cpu()/numpy round-trip AND the output D2H. The Rust
+        # kernel writes directly into torch-owned CUDA tensors via CAI.
+        # Gives 5-10x tok/s on train.py vs the numpy path below.
+        if _HTM_HAS_CAI and self._use_gpu and sdr.is_cuda:
+            sdr_u8 = sdr.contiguous().to(torch.uint8) if sdr.dtype != torch.uint8 else sdr.contiguous()
+            cols_out = torch.empty((B, T, self.n_columns), dtype=torch.uint8, device=sdr.device)
+            anom_out = torch.empty((B, T), dtype=torch.float32, device=sdr.device)
+            # Pick fused (1 launch) or legacy (12*T launches) path.
+            if _HTM_USE_FUSED:
+                for b in range(B):
+                    self._regions[b].step_many_fused_cuda(
+                        sdr_u8[b].__cuda_array_interface__,
+                        cols_out[b].__cuda_array_interface__,
+                        anom_out[b].__cuda_array_interface__,
+                        learn,
+                    )
+            else:
+                for b in range(B):
+                    self._regions[b].step_many_cuda(
+                        sdr_u8[b].__cuda_array_interface__,
+                        cols_out[b].__cuda_array_interface__,
+                        anom_out[b].__cuda_array_interface__,
+                        learn,
+                    )
+            # Assemble (B, T, n_cols+1) — keep bf16-friendly float32.
+            return torch.cat((cols_out.to(torch.float32), anom_out.unsqueeze(-1)), dim=-1)
+        # Fallback: CPU / numpy path. Kept for CPU-input case and for
+        # builds without CAI support.
+        sdr_np = sdr.detach().cpu().contiguous().numpy().view(np.bool_)
+        out = np.zeros((B, T, self.n_columns + 1), dtype=np.float32)
+        def _process_one(b: int) -> None:
+            region = self._regions[b]
+            if self._use_gpu:
+                cols, anom = region.step_many_gpu(sdr_np[b], learn)
+                out[b, :, : self.n_columns] = cols
+                out[b, :, self.n_columns] = anom
+            elif _HTM_HAS_STEP_MANY:
+                # Single Rust call: T steps with GIL released for the whole pass.
+                cols, anom = region.step_many(sdr_np[b], learn)  # cols (T, n_cols), anom (T,)
+                out[b, :, : self.n_columns] = cols
+                out[b, :, self.n_columns] = anom
+            else:
+                for t in range(T):
+                    active_cols, _ac, _pc, anomaly = region.step(sdr_np[b, t], learn)
+                    out[b, t, : self.n_columns] = active_cols
+                    out[b, t, self.n_columns] = float(anomaly)
+        if B == 1:
+            _process_one(0)
+        elif self._use_gpu:
+            # GPU regions share the CUDA context; serialise to avoid contention
+            # for stream 0. Per-region latency is dominated by kernel compute,
+            # not threadable on a single stream cheaply — future work: one
+            # CUDA stream per region.
+            for b in range(B):
+                _process_one(b)
+        else:
+            # Each thread runs in pure Rust under py.allow_threads, so they
+            # parallelise to wall-clock min(B, CPU_cores).
+            list(self._htm_pool.map(_process_one, range(B)))
+        return torch.from_numpy(out).to(sdr.device)
+    def forward_async(self, sdr: torch.Tensor):
+        """Submit HTM work and return a handle awaitable via ``forward_await``.
+        On the CAI zero-copy path (GPU tensor in, GPU region), the Rust
+        CUDA kernels are launched on cudarc's internal stream and control
+        returns **immediately** — no device synchronization.  The caller's
+        next GPU ops (embedding lookup, Mamba forward, etc.) are enqueued
+        on PyTorch's default stream and can execute while HTM kernels run
+        on the cudarc stream.  ``forward_await`` performs the cross-stream
+        sync (via ``device_sync``) and assembles the output tensor only
+        when the result is actually consumed.
+        For cooperative kernels (``step_many_fused_cuda``) the GPU can only
+        run one cooperative launch at a time, so kernel-level overlap with
+        default-stream work is limited.  The win is **CPU-side launch
+        overlap**: instead of the CPU blocking ~10 ms waiting for HTM
+        before it can even enqueue wte/mamba, it enqueues everything up
+        front and the GPU executes back-to-back without CPU stalls.
+        On the legacy CPU/numpy path, work is dispatched to a thread pool
+        as before."""
+        B, T, D = sdr.shape
+        if D != self.input_bits:
+            raise ValueError(f"expected input_bits={self.input_bits}, got {D}")
+        self._ensure_regions(B)
+        if self.reset_each_forward:
+            self.reset()
+        learn = bool(self.learn and self.training)
+        if _HTM_HAS_CAI and self._use_gpu and sdr.is_cuda:
+            sdr_u8 = sdr.contiguous().to(torch.uint8) if sdr.dtype != torch.uint8 else sdr.contiguous()
+            cols_out = torch.empty((B, T, self.n_columns), dtype=torch.uint8, device=sdr.device)
+            anom_out = torch.empty((B, T), dtype=torch.float32, device=sdr.device)
+            # ONE cooperative kernel launch for all B regions. Breaks past
+            # the CUDA cooperative-kernel device-level serialization (only
+            # one cooperative kernel runs at a time). A single launch with
+            # grid.y = B processes all regions concurrently — ~B× speedup.
+            # Falls back to sequential dispatch if the batched entry isn't
+            # available (older htm_rust wheel).
+            if _HTM_USE_FUSED and hasattr(htm_rust, "step_batch_fused_cuda"):
+                # Slice self._regions to match B: _ensure_regions may have
+                # allocated more regions than the current batch size needs
+                # (e.g. factual eval uses smaller batches than training).
+                try:
+                    htm_rust.step_batch_fused_cuda(
+                        self._regions[:B],
+                        [sdr_u8[b].__cuda_array_interface__ for b in range(B)],
+                        [cols_out[b].__cuda_array_interface__ for b in range(B)],
+                        [anom_out[b].__cuda_array_interface__ for b in range(B)],
+                        learn,
+                    )
+                except RuntimeError as _e:
+                    if "COOPERATIVE_LAUNCH_TOO_LARGE" in str(_e):
+                        # Batch too large for cooperative grid. Fall back to
+                        # sequential per-region fused launches (each B=1).
+                        for b in range(B):
+                            self._regions[b].step_many_fused_cuda(
+                                sdr_u8[b].__cuda_array_interface__,
+                                cols_out[b].__cuda_array_interface__,
+                                anom_out[b].__cuda_array_interface__,
+                                learn,
+                            )
+                    else:
+                        raise
+            elif _HTM_USE_FUSED:
+                for b in range(B):
+                    self._regions[b].step_many_fused_cuda(
+                        sdr_u8[b].__cuda_array_interface__,
+                        cols_out[b].__cuda_array_interface__,
+                        anom_out[b].__cuda_array_interface__,
+                        learn,
+                    )
+            else:
+                for b in range(B):
+                    self._regions[b].step_many_cuda(
+                        sdr_u8[b].__cuda_array_interface__,
+                        cols_out[b].__cuda_array_interface__,
+                        anom_out[b].__cuda_array_interface__,
+                        learn,
+                    )
+            # NO sync here — kernels are in-flight on cudarc's stream.
+            # forward_await() will sync before the output is consumed.
+            return {
+                'cuda_deferred': True,
+                'cols_out': cols_out,
+                'anom_out': anom_out,
+                'region0': self._regions[0],
+            }
+        sdr_np = sdr.detach().cpu().contiguous().numpy().view(np.bool_)
+        out = np.zeros((B, T, self.n_columns + 1), dtype=np.float32)
+        def _process_one(b):
+            region = self._regions[b]
+            if self._use_gpu:
+                cols, anom = region.step_many_gpu(sdr_np[b], learn)
+                out[b, :, : self.n_columns] = cols
+                out[b, :, self.n_columns] = anom
+            elif _HTM_HAS_STEP_MANY:
+                cols, anom = region.step_many(sdr_np[b], learn)
+                out[b, :, : self.n_columns] = cols
+                out[b, :, self.n_columns] = anom
+            else:
+                for t in range(T):
+                    active_cols, _ac, _pc, anomaly = region.step(sdr_np[b, t], learn)
+                    out[b, t, : self.n_columns] = active_cols
+                    out[b, t, self.n_columns] = float(anomaly)
+        fut = self._htm_pool.submit(lambda: [_process_one(b) for b in range(B)])
+        return {'fut': fut, 'out': out, 'device': sdr.device}
+    def forward_await(self, handle) -> torch.Tensor:
+        if handle.get('cuda_deferred'):
+            # Cross-stream sync: block until cudarc stream finishes HTM
+            # kernels so the output tensors are safe to read on the
+            # default stream.
+            region0 = handle['region0']
+            if hasattr(region0, "device_sync"):
+                region0.device_sync()
+            else:
+                torch.cuda.synchronize()
+            cols_out = handle['cols_out']
+            anom_out = handle['anom_out']
+            return torch.cat(
+                (cols_out.to(torch.float32), anom_out.unsqueeze(-1)), dim=-1
+            )
+        if 'cuda_result' in handle:
+            return handle['cuda_result']
+        handle['fut'].result()
+        return torch.from_numpy(handle['out']).to(handle['device'])
+# Smoke test, run with `python htm.py`: checks output shape/dtype, column
+# sparsity, the t=0 anomaly, and anomaly decay on a repeated pattern.
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    # Smoke test: (B=2, T=4, D=16384) random 2%-sparse SDR
+    B, T, D = 2, 4, 16384
+    n_columns = 2048
+    target_active_in = int(D * 0.02)  # 327
+    layer = HTMLayer(
+        input_bits=D,
+        n_columns=n_columns,
+        cells_per_column=32,
+        batch_size=B,
+        seed=42,
+        learn=True,
+    )
+    layer.train()
+    rng = np.random.default_rng(0)
+    sdr = np.zeros((B, T, D), dtype=bool)
+    for b in range(B):
+        for t in range(T):
+            idx = rng.choice(D, size=target_active_in, replace=False)
+            sdr[b, t, idx] = True
+    # CPU tensor input, so this exercises the numpy fallback path.
+    sdr_t = torch.from_numpy(sdr)
+    t0 = time.perf_counter()
+    out = layer(sdr_t)
+    dt_first = time.perf_counter() - t0
+    assert out.shape == (B, T, n_columns + 1), f"shape {out.shape}"
+    assert out.dtype == torch.float32, f"dtype {out.dtype}"
+    active_cols = out[..., :n_columns]
+    anomaly = out[..., n_columns]
+    col_sums = active_cols.sum(dim=-1)  # (B, T)
+    mean_active = col_sums.float().mean().item()
+    expected = n_columns * 0.02  # ≈ 40.96
+    assert 20 <= mean_active <= 60, (
+        f"active columns per step out of 2% band: {mean_active:.1f} (expected ~{expected:.1f})"
+    )
+    # t=0 has no TM prediction → anomaly = 1.0 on every batch slot.
+    assert torch.allclose(anomaly[:, 0], torch.ones(B)), f"t=0 anomaly {anomaly[:, 0]}"
+    # Second forward on same (reset) layer: only the output shape is checked;
+    # the timing is reported at the end.
+    t0 = time.perf_counter()
+    out2 = layer(sdr_t)
+    dt_second = time.perf_counter() - t0
+    assert out2.shape == out.shape
+    # Repeating-sequence anomaly decay check — one region, T=16 repeats of same pattern.
+    rep_layer = HTMLayer(
+        input_bits=D,
+        n_columns=n_columns,
+        batch_size=1,
+        seed=7,
+        learn=True,
+    )
+    rep_layer.train()
+    base = torch.zeros(D, dtype=torch.bool)
+    idx = rng.choice(D, size=target_active_in, replace=False)
+    base[idx] = True
+    rep = base.unsqueeze(0).unsqueeze(0).expand(1, 16, D).clone()
+    rep_out = rep_layer(rep)
+    rep_anom = rep_out[0, :, n_columns]
+    assert rep_anom[0].item() > 0.5, f"anomaly at t=0 should be high, got {rep_anom[0]:.3f}"
+    assert rep_anom[-1].item() < rep_anom[0].item(), (
+        f"anomaly should decay on repeats: first={rep_anom[0]:.3f} last={rep_anom[-1]:.3f}"
+    )
+    print("[OK] shape:", tuple(out.shape))
+    print(f"[OK] mean active cols/step: {mean_active:.2f} (target ~{expected:.1f})")
+    print(f"[OK] t=0 anomaly = 1.0 on all batch slots")
+    print(f"[OK] repeating-sequence anomaly: first={rep_anom[0]:.3f} -> last={rep_anom[-1]:.3f}")
+    print(f"[OK] forward wall-clock: first={dt_first*1000:.1f}ms second={dt_second*1000:.1f}ms "
+          f"on (B={B}, T={T}, D={D})")

overlay/subsystems/sdr_retina.py ADDED Viewed

	@@ -0,0 +1,632 @@

+"""
+Offline Semantic Folding SDR Retina (Cortical.io-grade).
+Builds a topographic, semantic-folding Sparse Distributed Representation (SDR)
+for every token in the vocabulary, following Webber 2015 ("Semantic Folding Theory").
+Pipeline:
+  1. Scan the tokenized training corpus (parquet shards at ~/.cache/autoresearch/data).
+     Documents are tokenized on the fly, starting from the first shards, until
+     HYDRA_RETINA_TRAIN_TOKENS tokens (default 20M) have been read.
+  2. For each token, build a context vector = top-K most-associated neighbors
+     (±8-token window, PMI ranking).
+  3. Train a 128x128 = 16384-bit Kohonen SOM on those context vectors so that
+     semantically related context features land on neighboring lattice cells.
+  4. For each token, compute its folded SDR: union of the lattice cells whose
+     BMUs are triggered by its top-K context features. Then per-row quantile
+     threshold to exactly 2% active bits (327 / 16384).
+  5. Save to ~/.cache/autoresearch/retina.npz.
+Entry point:
+    uv run python subsystems/sdr_retina.py --build --validate
+The validation asserts classic Cortical.io-style analogies:
+  - overlap("the", "a")    > overlap("the", "zebra")
+  - overlap("man", "woman") > overlap("man", "rock")
+  - overlap("king","queen") > overlap("king", "dinosaur")
+"""
+from __future__ import annotations
+import argparse
+import math
+import os
+import sys
+import time
+from dataclasses import dataclass
+import numpy as np
+import pyarrow.parquet as pq
+import torch
+# Make the parent repo importable so we can reuse the Tokenizer
+REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, REPO_ROOT)
+from prepare import CACHE_DIR, DATA_DIR, TOKENIZER_DIR, VAL_FILENAME, Tokenizer  # noqa: E402
+# ---------------------------------------------------------------------------
+# Build parameters
+# ---------------------------------------------------------------------------
+# Local path of the built retina archive (written and read by build_retina()).
+RETINA_PATH = os.path.join(CACHE_DIR, "retina.npz")
+# SOM lattice dimensions; one SDR bit per lattice cell.
+GRID_H = 128
+GRID_W = 128
+N_BITS = GRID_H * GRID_W          # 16384
+TARGET_SPARSITY = 0.02            # 2% (default, Cortical.io-style)
+# Number of active bits per token SDR.
+# Default = int(N_BITS * TARGET_SPARSITY) = 327 (truncated), matches Webber/Numenta.
+# Override via the HYDRA_SDR_TARGET_ACTIVE env var (read once, at import time).
+# The Hub cache key (_retina_cache_key) encodes target_active, so a changed
+# value never matches a Hub-cached retina built with a different one.
+TARGET_ACTIVE = int(os.environ.get(
+    "HYDRA_SDR_TARGET_ACTIVE",
+    str(int(N_BITS * TARGET_SPARSITY)),
+))
+CONTEXT_WINDOW = 8                # +/- 8 tokens
+TOP_K_FEATURES = 64               # top-K context features per token
+# Token budget for the co-occurrence scan; override via HYDRA_RETINA_TRAIN_TOKENS
+# (read once, at import time). The default below is 20M tokens.
+# SCALES WITH VOCAB — need ~100+ occurrences per token for stable cooccurrence.
+# At V=8k: 10M tokens = 1250/tok avg. At V=65k: 10M tokens = 153/tok avg
+# (borderline); rare tokens seen <30x → noisy retina. Recommended: V*150.
+# HF Hub cache makes this a one-time cost per vocab config anyway.
+TARGET_TRAIN_TOKENS = int(os.environ.get("HYDRA_RETINA_TRAIN_TOKENS", "20000000"))
+MAX_DOCS_PER_SHARD = 200_000      # safety cap per shard
+# Kohonen SOM hyper-parameters, passed to train_som() by build_retina().
+# sigma (neighborhood radius, in lattice cells) and alpha (learning rate) both
+# decay exponentially from their *_START to their *_END value over SOM_EPOCHS.
+SOM_EPOCHS = 50
+SOM_SIGMA_START = 32.0
+SOM_SIGMA_END = 1.0
+SOM_ALPHA_START = 0.1
+SOM_ALPHA_END = 0.001
+# ---------------------------------------------------------------------------
+# Small helpers
+# ---------------------------------------------------------------------------
+def _fmt(n):
+    if n >= 1_000_000:
+        return f"{n/1_000_000:.2f}M"
+    if n >= 1_000:
+        return f"{n/1_000:.1f}k"
+    return str(n)
+def _device() -> torch.device:
+    return torch.device("cuda" if torch.cuda.is_available() else "cpu")
+def _list_train_shards():
+    files = sorted(
+        f for f in os.listdir(DATA_DIR)
+        if f.endswith(".parquet") and not f.endswith(".tmp")
+    )
+    train = [os.path.join(DATA_DIR, f) for f in files if f != VAL_FILENAME]
+    assert len(train) > 0, f"No training shards at {DATA_DIR}. Run prepare.py first."
+    return train
+# ---------------------------------------------------------------------------
+# Stage 1: stream tokens from parquet shards and collect co-occurrences
+# ---------------------------------------------------------------------------
+def _iter_tokenized_shards(tokenizer: Tokenizer, target_tokens: int):
+    """Yield 1-D int32 numpy arrays of token ids until target_tokens reached.
+    Two paths:
+      - HYDRA_USE_NEMOTRON=1: stream docs from Nemotron HF datasets (no shards
+        on disk — matches the streaming training path).
+      - Default: iterate local parquet shards (legacy prepare.py path).
+    """
+    tok_encode = tokenizer.enc.encode_ordinary_batch
+    if os.environ.get("HYDRA_USE_NEMOTRON", "0") == "1":
+        # Streaming path: reuse prepare_nemotron's weighted stream.
+        import prepare_nemotron as _pn
+        stream = _pn._WeightedStream(_pn._phase_weights(), seed=0)
+        seen = 0
+        batch: list[str] = []
+        BATCH = 512
+        while seen < target_tokens:
+            text, _epoch = next(stream)
+            if not text:
+                continue
+            batch.append(text)
+            if len(batch) < BATCH:
+                continue
+            token_lists = tok_encode(batch, num_threads=8)
+            batch = []
+            for ids in token_lists:
+                if not ids:
+                    continue
+                arr = np.asarray(ids, dtype=np.int32)
+                yield arr
+                seen += arr.size
+                if seen >= target_tokens:
+                    print(f"  [nemotron-stream] yielded {_fmt(seen)} tokens, target reached")
+                    return
+        return
+    # Legacy shard path.
+    shards = _list_train_shards()
+    seen = 0
+    for shard_idx, path in enumerate(shards):
+        if seen >= target_tokens:
+            return
+        pf = pq.ParquetFile(path)
+        shard_tokens = 0
+        for rg_idx in range(pf.num_row_groups):
+            rg = pf.read_row_group(rg_idx)
+            docs = rg.column("text").to_pylist()
+            if len(docs) > MAX_DOCS_PER_SHARD:
+                docs = docs[:MAX_DOCS_PER_SHARD]
+            # Batch-encode for throughput
+            batch_size = 512
+            for i in range(0, len(docs), batch_size):
+                batch = docs[i:i + batch_size]
+                token_lists = tok_encode(batch, num_threads=8)
+                for ids in token_lists:
+                    if not ids:
+                        continue
+                    arr = np.asarray(ids, dtype=np.int32)
+                    yield arr
+                    shard_tokens += arr.size
+                    seen += arr.size
+                    if seen >= target_tokens:
+                        print(f"  shard {shard_idx}: yielded {_fmt(shard_tokens)} tokens "
+                              f"(total {_fmt(seen)}), target reached")
+                        return
+        print(f"  shard {shard_idx}: yielded {_fmt(shard_tokens)} tokens (total {_fmt(seen)})")
+def _cooccur_from_doc(ids: np.ndarray, window: int, vocab_size: int,
+                      counts: np.ndarray, cooc: np.ndarray) -> None:
+    """Update unigram counts and cooccurrence counts for one document. Vectorized."""
+    n = ids.size
+    if n < 2:
+        return
+    # unigram counts
+    np.add.at(counts, ids, 1)
+    # For each offset d in 1..window, count pairs (ids[:-d], ids[d:])
+    # Both directions are equivalent by symmetry; we add both to keep the
+    # matrix symmetric and treat it as undirected context.
+    for d in range(1, window + 1):
+        left = ids[:-d]
+        right = ids[d:]
+        # symmetric update
+        flat_lr = left.astype(np.int64) * vocab_size + right.astype(np.int64)
+        flat_rl = right.astype(np.int64) * vocab_size + left.astype(np.int64)
+        # use bincount-style scatter via np.add.at on the flat view
+        cooc_flat = cooc.ravel()
+        np.add.at(cooc_flat, flat_lr, 1)
+        np.add.at(cooc_flat, flat_rl, 1)
+def build_cooccurrence(tokenizer: Tokenizer, target_tokens: int, window: int) -> tuple[np.ndarray, np.ndarray, int]:
+    """
+    Stream tokens and build unigram + cooccurrence counts.
+    Returns (counts[V] int64, cooc[V,V] int32, total_tokens int).
+    """
+    vocab_size = tokenizer.get_vocab_size()
+    print(f"[1/4] Building cooccurrence (vocab={vocab_size}, window=+/-{window}, target={_fmt(target_tokens)} tokens)")
+    counts = np.zeros(vocab_size, dtype=np.int64)
+    # int32 is enough per-cell if we stay <= a few hundred million total tokens; guard with clip at save.
+    cooc = np.zeros((vocab_size, vocab_size), dtype=np.int32)
+    total = 0
+    n_docs = 0
+    t0 = time.time()
+    for ids in _iter_tokenized_shards(tokenizer, target_tokens):
+        _cooccur_from_doc(ids, window, vocab_size, counts, cooc)
+        total += ids.size
+        n_docs += 1
+        if n_docs % 5000 == 0:
+            dt = time.time() - t0
+            rate = total / max(dt, 1e-6)
+            print(f"    docs={_fmt(n_docs)} tokens={_fmt(total)} ({rate/1000:.0f}k tok/s)")
+    dt = time.time() - t0
+    print(f"[1/4] done: {_fmt(total)} tokens, {_fmt(n_docs)} docs, {dt:.1f}s, "
+          f"cooc_nnz={int((cooc > 0).sum())}")
+    return counts, cooc, total
+# ---------------------------------------------------------------------------
+# Stage 2: build top-K context features (PMI)
+# ---------------------------------------------------------------------------
+def compute_pmi_topk(counts: np.ndarray, cooc: np.ndarray, total_tokens: int,
+                     top_k: int) -> tuple[np.ndarray, np.ndarray]:
+    """
+    For each token, compute top-K context features by positive PMI.
+    Returns:
+      topk_idx   : int32 [V, K]   token ids of the top-K context features
+      topk_score : float32 [V, K] PMI scores (0 for padded missing features)
+    Missing features are padded with idx=token itself and score=0, so they
+    have a well-defined (but uninformative) column.
+    """
+    V = counts.shape[0]
+    print(f"[2/4] Computing PMI top-{top_k} per token (vocab={V})")
+    # window_pairs per occurrence: 2 * window (we added both directions, each offset twice).
+    # For the PMI denominator we need a total pair count; using coo.sum() is the clean
+    # per-matrix normalizer and avoids any constant confusion.
+    pair_total = float(cooc.sum())
+    if pair_total <= 0:
+        raise RuntimeError("Empty cooccurrence matrix")
+    # Run on GPU if available; this is ~8k x 8k float32 = 256MB each.
+    dev = _device()
+    cooc_t = torch.from_numpy(cooc.astype(np.float32)).to(dev)
+    counts_t = torch.from_numpy(counts.astype(np.float64)).to(dev).clamp_min(1.0)
+    # P(i) = counts[i] / total_tokens
+    # P(i, j) = cooc[i, j] / pair_total
+    # PMI = log(P(i,j) / (P(i) P(j)))
+    # Positive PMI = max(PMI, 0).
+    # We'll compute log-PMI in a numerically safe way:
+    #   log(cooc) + log(total_tokens^2 / pair_total) - log(c_i) - log(c_j)
+    # Keep numerator zero where cooc==0 and mask those out.
+    log_const = math.log(total_tokens) + math.log(total_tokens) - math.log(pair_total)
+    log_ci = torch.log(counts_t)                 # [V]
+    log_cj = log_ci.clone()                       # same vector (symmetric vocab)
+    # We'll do it in row blocks to cap memory of intermediate log() tensors.
+    topk_idx = np.zeros((V, top_k), dtype=np.int32)
+    topk_score = np.zeros((V, top_k), dtype=np.float32)
+    block = 512
+    t0 = time.time()
+    for start in range(0, V, block):
+        end = min(V, start + block)
+        rows = cooc_t[start:end]                           # [b, V] int-as-float
+        mask = rows > 0
+        # log(rows) where rows>0; else keep -inf then mask out
+        log_rows = torch.where(mask, torch.log(rows.clamp_min(1.0)),
+                               torch.full_like(rows, float("-inf")))
+        pmi = log_rows + log_const - log_ci[start:end].unsqueeze(1) - log_cj.unsqueeze(0)
+        ppmi = torch.where(mask, torch.clamp(pmi, min=0.0),
+                           torch.full_like(pmi, float("-inf")))
+        # top-K along dim=1
+        vals, idx = torch.topk(ppmi, k=top_k, dim=1)
+        # Replace any -inf valued slots with score 0 and idx = the token itself
+        bad = torch.isneginf(vals)
+        if bad.any():
+            self_idx = torch.arange(start, end, device=dev).unsqueeze(1).expand_as(idx)
+            idx = torch.where(bad, self_idx, idx)
+            vals = torch.where(bad, torch.zeros_like(vals), vals)
+        topk_idx[start:end] = idx.cpu().numpy().astype(np.int32)
+        topk_score[start:end] = vals.cpu().numpy().astype(np.float32)
+    del cooc_t, counts_t
+    if dev.type == "cuda":
+        torch.cuda.empty_cache()
+    print(f"[2/4] done: top-{top_k} PMI features per token in {time.time()-t0:.1f}s")
+    return topk_idx, topk_score
+# ---------------------------------------------------------------------------
+# Stage 3: Kohonen SOM on the context-vector representation
+# ---------------------------------------------------------------------------
+def _context_vectors_from_topk(topk_idx: np.ndarray, topk_score: np.ndarray,
+                               vocab_size: int) -> torch.Tensor:
+    """
+    Build the dense context matrix X [V, V] where X[i] is the top-K PMI context
+    vector for token i, L2-normalized. For V=8192 this is 8k x 8k float32 = 256 MB.
+    """
+    V = vocab_size
+    K = topk_idx.shape[1]
+    dev = _device()
+    X = torch.zeros((V, V), dtype=torch.float32, device=dev)
+    rows = torch.arange(V, device=dev).unsqueeze(1).expand(V, K)  # [V,K]
+    idx = torch.from_numpy(topk_idx).to(dev).long()
+    scores = torch.from_numpy(topk_score).to(dev)
+    # Scatter scores into X at positions (rows, idx). If duplicates, keep max.
+    X[rows, idx] = torch.maximum(X[rows, idx], scores)
+    # L2 normalize so Euclidean ~ cosine
+    norm = X.norm(dim=1, keepdim=True).clamp_min(1e-8)
+    X = X / norm
+    return X
+def train_som(X: torch.Tensor, grid_h: int, grid_w: int,
+              epochs: int, sigma_start: float, sigma_end: float,
+              alpha_start: float, alpha_end: float,
+              seed: int = 137) -> torch.Tensor:
+    """
+    Train a Kohonen SOM with rectangular grid and Gaussian neighborhood.
+    X: [V, F] features (L2 normalized). Returns weights W: [grid_h*grid_w, F].
+    """
+    dev = X.device
+    V, F = X.shape
+    N = grid_h * grid_w
+    torch.manual_seed(seed)
+    # Initialize SOM weights: small random linear combinations of data points
+    # (faster convergence than uniform random in the feature space).
+    init_pick = torch.randint(0, V, (N,), device=dev)
+    W = X[init_pick].clone()  # [N, F]
+    # Precompute grid coordinates
+    yy, xx = torch.meshgrid(
+        torch.arange(grid_h, device=dev, dtype=torch.float32),
+        torch.arange(grid_w, device=dev, dtype=torch.float32),
+        indexing="ij",
+    )
+    grid = torch.stack([yy.reshape(-1), xx.reshape(-1)], dim=1)  # [N, 2]
+    print(f"[3/4] Training Kohonen SOM: grid={grid_h}x{grid_w}, features={F}, "
+          f"epochs={epochs}, sigma {sigma_start}->{sigma_end}, alpha {alpha_start}->{alpha_end}")
+    t0 = time.time()
+    # Exponential decay schedules
+    def schedule(t_frac):
+        sigma = sigma_start * (sigma_end / sigma_start) ** t_frac
+        alpha = alpha_start * (alpha_end / alpha_start) ** t_frac
+        return sigma, alpha
+    # Batch-mode SOM: process a random permutation each epoch in mini-batches.
+    # For each mini-batch, compute BMUs then one vectorized neighborhood update.
+    batch_size = 256
+    for epoch in range(epochs):
+        t_frac = epoch / max(epochs - 1, 1)
+        sigma, alpha = schedule(t_frac)
+        two_sigma2 = 2.0 * sigma * sigma
+        perm = torch.randperm(V, device=dev)
+        for bstart in range(0, V, batch_size):
+            bidx = perm[bstart:bstart + batch_size]
+            xb = X[bidx]  # [b, F]
+            # BMU: argmax of cosine similarity = argmin of squared Euclidean
+            # ||x||=||w||=1 for data; W may drift but the formulation remains stable.
+            sim = xb @ W.t()             # [b, N]
+            bmu = sim.argmax(dim=1)      # [b]
+            # Neighborhood weights h[b, n] = exp(-|grid[bmu_b] - grid[n]|^2 / (2*sigma^2))
+            bmu_coords = grid[bmu]                       # [b, 2]
+            diff = bmu_coords.unsqueeze(1) - grid.unsqueeze(0)  # [b, N, 2]
+            dist2 = (diff * diff).sum(dim=2)             # [b, N]
+            h = torch.exp(-dist2 / two_sigma2)           # [b, N]
+            h = h * alpha                                 # include LR
+            # Vectorized SOM update:
+            # W <- W + sum_b h[b] * (x_b - W) / (sum_b h[b])
+            # Batched form: numerator = h^T x_b  [N, F],  denom = h.sum(0) [N]
+            numer = h.t() @ xb               # [N, F]
+            denom = h.sum(dim=0).unsqueeze(1).clamp_min(1e-8)  # [N, 1]
+            target = numer / denom
+            # Update weight: mix toward target with a unit step (h already scaled by alpha).
+            # To prevent over-shoot when the same BMU is hit heavily, scale by the
+            # mean-field gain min(1, denom). Empirically this behaves like classic SOM.
+            gain = torch.clamp(h.sum(dim=0), max=1.0).unsqueeze(1)  # [N,1]
+            W = (1 - gain) * W + gain * target
+            # Renormalize weights to unit sphere for stability
+            W = W / W.norm(dim=1, keepdim=True).clamp_min(1e-8)
+        if (epoch + 1) % max(1, epochs // 10) == 0 or epoch == 0:
+            dt = time.time() - t0
+            print(f"    epoch {epoch+1}/{epochs}  sigma={sigma:.2f}  alpha={alpha:.4f}  elapsed={dt:.1f}s")
+    print(f"[3/4] SOM trained in {time.time()-t0:.1f}s")
+    return W
+# ---------------------------------------------------------------------------
+# Stage 4: fold context vectors into SDRs
+# ---------------------------------------------------------------------------
+def fold_sdrs(X: torch.Tensor, W: torch.Tensor, topk_idx: np.ndarray,
+              topk_score: np.ndarray, target_active: int) -> np.ndarray:
+    """
+    For each token, activate the 'cell votes' on the lattice for each of its top-K
+    context features, then threshold to exactly target_active bits.
+    Implementation detail: every token in the vocabulary has a SOM BMU given its
+    context vector X[i]. We use those BMUs as the feature->cell map. For token t,
+    we accumulate votes at BMU(feature) weighted by the PMI score, then pick the
+    top target_active cells.
+    """
+    dev = X.device
+    V, F = X.shape
+    N = W.shape[0]
+    print(f"[4/4] Folding SDRs (V={V}, N={N}, target_active={target_active})")
+    # Per-feature BMU: for each token f as a feature, BMU_f = argmax_n W[n] . X[f]
+    # Chunked matmul to bound memory.
+    bmu = torch.empty(V, dtype=torch.long, device=dev)
+    chunk = 1024
+    for s in range(0, V, chunk):
+        e = min(V, s + chunk)
+        sim = X[s:e] @ W.t()            # [b, N]
+        bmu[s:e] = sim.argmax(dim=1)
+    # Now build votes tensor [V, N] = sum over k of score[i, k] delta(n = bmu[feat[i, k]])
+    K = topk_idx.shape[1]
+    feat = torch.from_numpy(topk_idx).to(dev).long()       # [V, K]
+    sc = torch.from_numpy(topk_score).to(dev)              # [V, K]
+    feat_bmu = bmu[feat]                                   # [V, K]
+    votes = torch.zeros((V, N), dtype=torch.float32, device=dev)
+    votes.scatter_add_(1, feat_bmu, sc)
+    # Tiny numerical nudge: add a local Gaussian kernel around each voted cell so
+    # near-neighbors accumulate mass (this is the "folding" smear). Kernel radius 1.
+    # Implement as a separable 3x3 blur on the 2D grid view.
+    grid_h = int(round(math.sqrt(N)))
+    grid_w = grid_h
+    assert grid_h * grid_w == N
+    votes_2d = votes.view(V, 1, grid_h, grid_w)
+    blur = torch.tensor([[[[0.5, 1.0, 0.5],
+                           [1.0, 2.0, 1.0],
+                           [0.5, 1.0, 0.5]]]], device=dev, dtype=torch.float32)
+    blur = blur / blur.sum()
+    votes_2d = torch.nn.functional.conv2d(votes_2d, blur, padding=1)
+    votes = votes_2d.view(V, N)
+    # Per-row top-target_active
+    _, top_cells = torch.topk(votes, k=target_active, dim=1)
+    sdr = torch.zeros((V, N), dtype=torch.bool, device=dev)
+    sdr.scatter_(1, top_cells, True)
+    # Sanity check
+    row_active = sdr.sum(dim=1)
+    assert int(row_active.min()) == target_active, "row active mismatch"
+    assert int(row_active.max()) == target_active, "row active mismatch"
+    return sdr.cpu().numpy()
+# ---------------------------------------------------------------------------
+# Build orchestration
+# ---------------------------------------------------------------------------
+@dataclass
+class BuildReport:
+    """Summary of a retina build, as returned by build_retina()."""
+    # Vocabulary size of the tokenizer the retina was built for.
+    vocab_size: int
+    # Number of bits (lattice cells) in each token's SDR.
+    n_bits: int
+    # Number of corpus tokens consumed while building the cooccurrence counts.
+    train_tokens: int
+    # Build duration in seconds; 0.0 when the retina was loaded from a cache.
+    wall_time_sec: float
+def _retina_cache_repo() -> str:
+    return os.environ.get("HYDRA_RETINA_CACHE_REPO", "icarus112/feather-retina-cache")
+def _retina_cache_key() -> str:
+    """Cache key encodes vocab_size + n_bits + target_active so we don't
+    accidentally restore a retina built for a different tokenizer/config."""
+    try:
+        from prepare import VOCAB_SIZE
+    except Exception:
+        VOCAB_SIZE = 0
+    return f"retina_v{VOCAB_SIZE}_n{N_BITS}_a{TARGET_ACTIVE}.npz"
+def _try_hydrate_retina_from_hub() -> bool:
+    """Attempt to download a pre-built retina matching our config from HF Hub.
+    Returns True if successful — caller should skip the rebuild."""
+    token = os.environ.get("HF_TOKEN")
+    if not token:
+        return False
+    cache_key = _retina_cache_key()
+    try:
+        from huggingface_hub import hf_hub_download
+        p = hf_hub_download(
+            repo_id=_retina_cache_repo(), repo_type="dataset",
+            filename=cache_key, token=token,
+        )
+        os.makedirs(CACHE_DIR, exist_ok=True)
+        import shutil
+        shutil.copy(p, RETINA_PATH)
+        # Quick verify shape
+        with np.load(RETINA_PATH) as npz:
+            if int(npz["n_bits"]) == N_BITS and int(npz["target_active"]) == TARGET_ACTIVE:
+                print(f"[retina-cache] hydrated {cache_key} from {_retina_cache_repo()} "
+                      f"(shape={npz['sdr'].shape})", flush=True)
+                return True
+        os.remove(RETINA_PATH)
+        return False
+    except Exception as e:
+        print(f"[retina-cache] miss: {e}", flush=True)
+        return False
+def _upload_retina_to_hub() -> None:
+    """Upload freshly-built retina.npz to HF Hub for reuse by future jobs."""
+    token = os.environ.get("HF_TOKEN")
+    if not token:
+        return
+    cache_key = _retina_cache_key()
+    try:
+        from huggingface_hub import HfApi, create_repo
+        create_repo(_retina_cache_repo(), repo_type="dataset", private=True,
+                    exist_ok=True, token=token)
+        HfApi(token=token).upload_file(
+            path_or_fileobj=RETINA_PATH,
+            path_in_repo=cache_key,
+            repo_id=_retina_cache_repo(), repo_type="dataset",
+            commit_message=f"retina build for {cache_key}", token=token,
+        )
+        print(f"[retina-cache] uploaded {cache_key} to {_retina_cache_repo()}", flush=True)
+    except Exception as e:
+        print(f"[retina-cache] upload failed: {e}", flush=True)
+def build_retina(target_tokens: int = TARGET_TRAIN_TOKENS) -> BuildReport:
+    # Try HF Hub-backed cache first — retina build takes 500+ seconds.
+    if os.path.exists(RETINA_PATH):
+        print(f"[retina-cache] using local {RETINA_PATH}", flush=True)
+        with np.load(RETINA_PATH) as npz:
+            return BuildReport(
+                vocab_size=int(npz["vocab_size"]),
+                n_bits=int(npz["n_bits"]),
+                train_tokens=int(npz["train_tokens"]),
+                wall_time_sec=0.0,
+            )
+    elif _try_hydrate_retina_from_hub():
+        # Local copy now populated; return stub report
+        with np.load(RETINA_PATH) as npz:
+            return BuildReport(
+                vocab_size=int(npz["vocab_size"]),
+                n_bits=int(npz["n_bits"]),
+                train_tokens=int(npz["train_tokens"]),
+                wall_time_sec=0.0,
+            )
+    tokenizer = Tokenizer.from_directory(TOKENIZER_DIR)
+    vocab_size = tokenizer.get_vocab_size()
+    t0 = time.time()
+    counts, cooc, total_tokens = build_cooccurrence(
+        tokenizer, target_tokens=target_tokens, window=CONTEXT_WINDOW,
+    )
+    topk_idx, topk_score = compute_pmi_topk(
+        counts, cooc, total_tokens=total_tokens, top_k=TOP_K_FEATURES,
+    )
+    # Free the big cooccurrence matrix before GPU-heavy stages
+    del cooc
+    X = _context_vectors_from_topk(topk_idx, topk_score, vocab_size)
+    W = train_som(
+        X, grid_h=GRID_H, grid_w=GRID_W,
+        epochs=SOM_EPOCHS,
+        sigma_start=SOM_SIGMA_START, sigma_end=SOM_SIGMA_END,
+        alpha_start=SOM_ALPHA_START, alpha_end=SOM_ALPHA_END,
+    )
+    sdr = fold_sdrs(X, W, topk_idx, topk_score, target_active=TARGET_ACTIVE)
+    wall = time.time() - t0
+    os.makedirs(CACHE_DIR, exist_ok=True)
+    np.savez_compressed(
+        RETINA_PATH,
+        sdr=sdr,
+        vocab_size=np.int64(vocab_size),
+        n_bits=np.int64(N_BITS),
+        grid_h=np.int64(GRID_H),
+        grid_w=np.int64(GRID_W),
+        target_active=np.int64(TARGET_ACTIVE),
+        context_window=np.int64(CONTEXT_WINDOW),
+        top_k_features=np.int64(TOP_K_FEATURES),
+        train_tokens=np.int64(total_tokens),
+    )
+    print(f"[save] wrote {RETINA_PATH}  sdr.shape={sdr.shape}  "
+          f"active_per_row={int(sdr.sum(axis=1).mean())}  wall={wall:.1f}s")
+    # Push to HF Hub so subsequent jobs (and parallel retina experiments)
+    # skip the 500+ second build entirely.
+    _upload_retina_to_hub()
+    return BuildReport(
+        vocab_size=vocab_size,
+        n_bits=N_BITS,
+        train_tokens=total_tokens,
+        wall_time_sec=wall,
+    )