Spaces:

Jackoatmon
/

feather-runtime

Runtime error

App Files Files Community

Jackoatmon commited on 14 days ago

Commit

e317e25

verified ·

1 Parent(s): 7de795d

Update Feather h200 training runtime image

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.dockerignore +16 -20
Dockerfile +118 -122
entrypoint.py +267 -227
mamba_ssm_init.py +62 -94
overlay/.dockerignore +20 -20
overlay/configs/__init__.py +5 -5
overlay/configs/hardware_config.py +104 -104
overlay/configs/harness_config.py +63 -63
overlay/configs/model_config.py +80 -80
overlay/harness/__init__.py +21 -21
overlay/harness/eval_agent.py +129 -257
overlay/harness/git_utils.py +94 -94
overlay/harness/health_monitor.py +86 -86
overlay/harness/meta_agent.py +139 -139
overlay/harness/orchestrator.py +281 -284
overlay/harness/search_strategy.py +153 -153
overlay/htm_rust/Cargo.lock +383 -383
overlay/htm_rust/Cargo.toml +37 -37
overlay/htm_rust/build.rs +168 -160
overlay/htm_rust/pyproject.toml +17 -17
overlay/htm_rust/src/gpu/fused.rs +702 -663
overlay/htm_rust/src/gpu/kernels/htm_fused_step.cu +677 -677
overlay/htm_rust/src/gpu/tests.rs +663 -643
overlay/htm_rust/src/lib.rs +198 -198
overlay/htm_rust/src/region.rs +94 -94
overlay/htm_rust/src/sp.rs +302 -302
overlay/htm_rust/src/tm.rs +545 -545
overlay/hydra/__init__.py +37 -31
overlay/hydra/config.py +225 -220
overlay/hydra/data_module.py +288 -288
overlay/hydra/diffusion_loss.py +236 -236
overlay/hydra/engram.py +160 -175
overlay/hydra/eval.py +210 -217
overlay/hydra/gdn_block.py +126 -126
overlay/hydra/hyena_block.py +68 -68
overlay/hydra/lightning_module.py +326 -326
overlay/hydra/model.py +0 -0
overlay/hydra/optimizer.py +252 -252
overlay/hydra/reality_bridge.py +71 -0
overlay/hydra/training.py +965 -946
overlay/kernels/cuda/decode_kernels.cu +10 -10
overlay/kernels/cuda/flashfftconv/LICENSE +201 -201
overlay/kernels/cuda/flashfftconv/README.md +57 -57
overlay/kernels/cuda/flashfftconv/UPSTREAM_COMMIT +1 -1
overlay/kernels/cuda/flashfftconv/csrc/.gitignore +9 -9
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly.h +373 -373
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_cuda.cu +698 -698
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_cuda_bf16.cu +724 -724
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_ifft_cuda.cu +723 -723
overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_ifft_cuda_bf16.cu +705 -705

.dockerignore CHANGED Viewed

@@ -1,20 +1,16 @@
-.git
-.github
-.venv
-.remember
-.letta
-.claude
-__pycache__
-*.pyc
-*.pyo
-*.pyd
-*.log
-run_*.log
-run*.log
-*.txt
-WORKER_COMPLETE
-autoresearch_loop.log
-overlay/data/
-overlay/state_store/
-overlay/htm_rust/target/
-overlay/hydra-core/target/

+# Keep HF runtime image context deterministic and small.
+**/__pycache__/
+**/*.py[cod]
+**/.pytest_cache/
+**/.mypy_cache/
+**/.ruff_cache/
+**/.venv/
+**/target/
+**/logs/
+**/*.log
+**/*.out
+**/*.pt
+**/*.safetensors
+**/*.parquet
+**/*.npz
+**/.git/

Dockerfile CHANGED Viewed

@@ -1,128 +1,124 @@
-FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel
-ARG HTM_CUDA_ARCH=sm_86
-ENV DEBIAN_FRONTEND=noninteractive \
-    PIP_NO_CACHE_DIR=1 \
-    PYTHONUNBUFFERED=1 \
-    CARGO_HOME=/root/.cargo \
-    RUSTUP_HOME=/root/.rustup \
-    PATH=/root/.cargo/bin:${PATH}
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    git curl ca-certificates build-essential pkg-config libssl-dev && \
-    rm -rf /var/lib/apt/lists/*
-RUN curl https://sh.rustup.rs -sSf | bash -s -- -y --profile minimal --default-toolchain stable
-RUN pip install --upgrade pip setuptools wheel && \
-    pip install \
-      maturin \
-      huggingface_hub \
-      datasets \
-      requests \
-      pyarrow \
-      rustbpe \
-      pandas \
-      tiktoken \
-      pydantic \
-      ninja \
-      packaging \
-      einops
-# Mamba-3 fused CUDA kernel stack (mandatory — NO fallback allowed).
-#
-# We install PRE-BUILT manylinux wheels from the official state-spaces/mamba
-# and Dao-AILab/causal-conv1d GitHub releases. Compiling mamba_ssm from source
-# on HF Spaces' cpu-basic builder (~16GB RAM) OOMKills even with MAX_JOBS=1 —
-# nvcc on the templated selective-scan/chunk-scan kernels needs 8–12GB per TU.
-#
-# Wheel selection for base image pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel:
-#   - Python 3.11 (cp311)                       — matches PyTorch 2.6.0 image
-#   - CUDA 12.x wheels (cu12)                   — matches host CUDA 12.4
-#   - PyTorch 2.6 ABI (torch2.6)                — exact torch match
-#   - cxx11abiFALSE                             — standard PyTorch pip build
-#
-# Versions: mamba_ssm 2.3.1 (first stable with Mamba3 class) + causal_conv1d
-# 1.6.1.post4 (matching ABI). Both are CUDA-compiled, no build toolchain needed
-# on the Space builder.
-#
-# Step A: install the published v2.3.1 prebuilt wheel (compiled CUDA ops
-# for selective_scan, layernorm_gated, ssd_*, causal_conv1d, etc).
-RUN pip install \
-      'https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.6.1.post4/causal_conv1d-1.6.1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl' \
-      'https://github.com/state-spaces/mamba/releases/download/v2.3.1/mamba_ssm-2.3.1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl' && \
-    python -c "import importlib.metadata as m; print('installed mamba_ssm=' + m.version('mamba_ssm') + ' causal_conv1d=' + m.version('causal_conv1d'))"
-#
-# Step B: graft the Mamba3 class + its pure-Triton ops subtree from mamba-ssm
-# main. v2.3.1 is the latest release but Mamba3 landed post-release; the new
-# files under ops/triton/mamba3/ are ALL pure Python @triton.jit kernels with
-# zero compiled-CUDA dependencies (verified: every import in that subtree is
-# triton/torch/python — no .so files, no nvcc). So we install the v2.3.1 wheel
-# (for its compiled ops) and overlay the main-branch Mamba3 sources on top.
-#
-# This avoids the source-build OOM on the cpu-basic HF Space builder and the
-# missing-file error the smoke hit on the last attempt.
-# Download grafted mamba3 module + triton ops subtree
-RUN SITE=/opt/conda/lib/python3.11/site-packages/mamba_ssm && \
-    BASE=https://raw.githubusercontent.com/state-spaces/mamba/main && \
-    curl -fsSL "$BASE/mamba_ssm/modules/mamba3.py" -o "$SITE/modules/mamba3.py" && \
-    mkdir -p "$SITE/ops/triton/mamba3" && \
-    for f in __init__.py angle_dt.py mamba3_mimo_rotary_step.py mamba3_mimo_utils.py mamba3_siso_bwd.py mamba3_siso_combined.py mamba3_siso_fwd.py mamba3_siso_step.py utils.py; do \
-        curl -fsSL "$BASE/mamba_ssm/ops/triton/mamba3/$f" -o "$SITE/ops/triton/mamba3/$f"; \
-    done
-# Replace mamba_ssm/__init__.py with a minimal one that only imports Mamba3
-# (pure-Triton, works). The shipped __init__.py eagerly imports
-# selective_scan_cuda.so which has a libtorch C++ ABI mismatch on this base
-# image ("undefined symbol: _ZN3c107WarningC1E..."). Since training only needs
-# Mamba3 (grafted from main), we skip all compiled-CUDA imports.
-COPY mamba_ssm_init.py /opt/conda/lib/python3.11/site-packages/mamba_ssm/__init__.py
-# Structural check (no triton init — triton has no GPU on the builder)
-RUN SITE=/opt/conda/lib/python3.11/site-packages/mamba_ssm && \
-    test -f "$SITE/modules/mamba3.py" && \
-    test -f "$SITE/ops/triton/mamba3/mamba3_siso_combined.py" && \
-    test -s "$SITE/__init__.py" && \
-    echo "mamba3 graft + __init__ override verified"
-# Optional tilelang for MIMO path — pure-python, cheap; SISO Mamba3 works without.
-RUN pip install tilelang || echo "[dockerfile] tilelang optional install failed — continuing"
-# Triton version decision: FORCE 3.5.1 — the only version with both mamba3
-# APIs (set_allocator + tl.make_tensor_descriptor). torch 2.6's _inductor
-# imports AttrsDescriptor from triton.compiler.compiler which was removed in
-# triton 3.4+, but mamba_ssm/__init__.py shims AttrsDescriptor as a stub
-# before any torch._inductor import path runs, so the incompatibility is
-# neutralized. Build-time assert verifies mamba3's two required APIs.
-RUN pip install --force-reinstall --no-deps 'triton==3.5.1' && \
-    python -c "import triton; from triton import language as tl; \
-               assert hasattr(triton, 'set_allocator'), 'missing triton.set_allocator'; \
-               assert hasattr(tl, 'make_tensor_descriptor'), 'missing tl.make_tensor_descriptor'; \
-               print(f'triton={triton.__version__} set_allocator+make_tensor_descriptor OK, AttrsDescriptor shimmed in mamba_ssm/__init__.py')"
-WORKDIR /workspace
-COPY overlay /workspace/feather
-COPY entrypoint.py /app/entrypoint.py
-WORKDIR /workspace/feather
-RUN python - <<'PY'
-from pathlib import Path
-for sh in Path('/workspace/feather/scripts').glob('*.sh'):
-    raw = sh.read_bytes()
-    norm = raw.replace(b'\r\n', b'\n')
-    if norm != raw:
-        sh.write_bytes(norm)
-PY
 RUN python -m py_compile hydra/training.py prepare.py train.py && \
     bash -n scripts/run_domain_expanded_pretrain.sh
 RUN export LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} && \
-    export HTM_CUDA_ARCH=${HTM_CUDA_ARCH} && \
-    export CARGO_BUILD_JOBS=1 && \
-    maturin build --release -j 1 --features gpu --manifest-path htm_rust/Cargo.toml && \
     pip install htm_rust/target/wheels/htm_rust-*.whl
-CMD ["python", "/app/entrypoint.py"]

+FROM pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel
+# Default target is HF Jobs a10g-large (NVIDIA A10G, Ampere GA102, sm_86).
+# Override at build time for other cards, e.g. --build-arg FEATHER_GPU_ARCH=sm_90a.
+ARG FEATHER_GPU_ARCH=sm_86
+ARG FEATHER_TORCH_CUDA_ARCH_LIST=8.6
+ENV DEBIAN_FRONTEND=noninteractive \
+    PIP_NO_CACHE_DIR=1 \
+    PYTHONUNBUFFERED=1 \
+    CARGO_HOME=/root/.cargo \
+    RUSTUP_HOME=/root/.rustup \
+    HTM_CUDA_ARCH=${FEATHER_GPU_ARCH} \
+    TORCH_CUDA_ARCH_LIST=${FEATHER_TORCH_CUDA_ARCH_LIST} \
+    PATH=/root/.cargo/bin:${PATH}
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git curl ca-certificates build-essential pkg-config libssl-dev && \
+    rm -rf /var/lib/apt/lists/*
+RUN curl https://sh.rustup.rs -sSf | bash -s -- -y --profile minimal --default-toolchain stable
+RUN pip install --upgrade pip setuptools wheel && \
+    pip install \
+      maturin \
+      huggingface_hub \
+      datasets \
+      requests \
+      pyarrow \
+      rustbpe \
+      pandas \
+      tiktoken \
+      pydantic \
+      ninja \
+      packaging \
+      einops
+# Mamba-3 fused CUDA kernel stack (mandatory — NO fallback allowed).
+#
+# We install PRE-BUILT manylinux wheels from the official state-spaces/mamba
+# and Dao-AILab/causal-conv1d GitHub releases. Compiling mamba_ssm from source
+# on HF Spaces' cpu-basic builder (~16GB RAM) OOMKills even with MAX_JOBS=1 —
+# nvcc on the templated selective-scan/chunk-scan kernels needs 8–12GB per TU.
+#
+# Wheel selection for base image pytorch/pytorch:2.5.1-cuda12.1-cudnn9-devel:
+#   - Python 3.11 (cp311)                       — matches PyTorch 2.5.1 image
+#   - CUDA 12.x wheels (cu12)                   — compatible with CUDA 12.1 base
+#   - PyTorch 2.5 ABI (torch2.5)                — exact torch match
+#   - cxx11abiFALSE                             — standard PyTorch pip build
+#
+# Versions: mamba_ssm 2.3.0 + causal_conv1d 1.6.0 (matching torch2.5 ABI).
+# Both are CUDA-compiled, no build toolchain needed
+# on the Space builder.
+#
+# Step A: install the published v2.3.0 prebuilt wheel (compiled CUDA ops
+# for selective_scan, layernorm_gated, ssd_*, causal_conv1d, etc).
+RUN pip install \
+      'https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.6.0/causal_conv1d-1.6.0+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl' \
+      'https://github.com/state-spaces/mamba/releases/download/v2.3.0/mamba_ssm-2.3.0+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl' && \
+    python -c "import importlib.metadata as m; print('installed mamba_ssm=' + m.version('mamba_ssm') + ' causal_conv1d=' + m.version('causal_conv1d'))"
+#
+# Step B: graft the Mamba3 class + its pure-Triton ops subtree from mamba-ssm
+# main. Mamba3 landed on main after the release pinned in Step A; the new
+# files under ops/triton/mamba3/ are ALL pure Python @triton.jit kernels with
+# zero compiled-CUDA dependencies (verified: every import in that subtree is
+# triton/torch/python — no .so files, no nvcc). So we install the v2.3.0 wheel
+# (for its compiled ops) and overlay the main-branch Mamba3 sources on top.
+#
+# This avoids the source-build OOM on the cpu-basic HF Space builder and the
+# missing-file error the smoke hit on the last attempt.
+# Download grafted mamba3 module + triton ops subtree
+RUN SITE=/opt/conda/lib/python3.11/site-packages/mamba_ssm && \
+    BASE=https://raw.githubusercontent.com/state-spaces/mamba/main && \
+    curl -fsSL "$BASE/mamba_ssm/modules/mamba3.py" -o "$SITE/modules/mamba3.py" && \
+    mkdir -p "$SITE/ops/triton/mamba3" && \
+    for f in __init__.py angle_dt.py mamba3_mimo_rotary_step.py mamba3_mimo_utils.py mamba3_siso_bwd.py mamba3_siso_combined.py mamba3_siso_fwd.py mamba3_siso_step.py utils.py; do \
+        curl -fsSL "$BASE/mamba_ssm/ops/triton/mamba3/$f" -o "$SITE/ops/triton/mamba3/$f"; \
+    done
+# Replace mamba_ssm/__init__.py with a minimal one that only imports Mamba3
+# (pure-Triton, works). The shipped __init__.py eagerly imports
+# selective_scan_cuda.so which has a libtorch C++ ABI mismatch on this base
+# image ("undefined symbol: _ZN3c107WarningC1E..."). Since training only needs
+# Mamba3 (grafted from main), we skip all compiled-CUDA imports.
+COPY mamba_ssm_init.py /opt/conda/lib/python3.11/site-packages/mamba_ssm/__init__.py
+# Structural check (no triton init — triton has no GPU on the builder)
+RUN SITE=/opt/conda/lib/python3.11/site-packages/mamba_ssm && \
+    test -f "$SITE/modules/mamba3.py" && \
+    test -f "$SITE/ops/triton/mamba3/mamba3_siso_combined.py" && \
+    test -s "$SITE/__init__.py" && \
+    echo "mamba3 graft + __init__ override verified"
+# Optional tilelang for MIMO path — pure-python, cheap; SISO Mamba3 works without.
+RUN pip install tilelang || echo "[dockerfile] tilelang optional install failed — continuing"
+# Triton version decision: FORCE 3.4.0 — first line with both mamba3
+# APIs (set_allocator + tl.make_tensor_descriptor) while avoiding the 3.5.x
+# driver-discovery regression seen on HF A10G (`0 active drivers` despite
+# torch.cuda being available). torch 2.5's _inductor expects older Triton
+# internals, but mamba_ssm/__init__.py shims AttrsDescriptor as a stub
+# before any torch._inductor import path runs, so the incompatibility is
+# neutralized. Build-time assert verifies mamba3's two required APIs.
+RUN pip install --force-reinstall --no-deps 'triton==3.4.0' && \
+    python -c "import triton; from triton import language as tl; \
+               assert hasattr(triton, 'set_allocator'), 'missing triton.set_allocator'; \
+               assert hasattr(tl, 'make_tensor_descriptor'), 'missing tl.make_tensor_descriptor'; \
+               print(f'triton={triton.__version__} set_allocator+make_tensor_descriptor OK, AttrsDescriptor shimmed in mamba_ssm/__init__.py')"
+WORKDIR /workspace
+COPY overlay /workspace/feather
+COPY entrypoint.py /app/entrypoint.py
+WORKDIR /workspace/feather
 RUN python -m py_compile hydra/training.py prepare.py train.py && \
     bash -n scripts/run_domain_expanded_pretrain.sh
 RUN export LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} && \
+    echo "building htm_rust GPU kernels for HTM_CUDA_ARCH=${HTM_CUDA_ARCH} TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}" && \
+    maturin build --release --features gpu --manifest-path htm_rust/Cargo.toml && \
     pip install htm_rust/target/wheels/htm_rust-*.whl
+CMD ["python", "/app/entrypoint.py"]

entrypoint.py CHANGED Viewed

@@ -1,227 +1,267 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-import json
-import os
-import subprocess
-import sys
-import time
-from http.server import BaseHTTPRequestHandler, HTTPServer
-from pathlib import Path
-from threading import Thread
-# =============================================================================
-# EARLY CUDA FABRIC MANAGER KICK (before ANY CUDA-touching imports)
-# =============================================================================
-# On H200 hosts, cudaGetDeviceCount can return Error 802 "system not yet
-# initialized" on first use, because nvidia-fabricmanager on the host
-# synchronizes with the container's first driver call. Once any NVML/CUDA
-# call succeeds once (even just nvidia-smi), the fabric is up for the rest
-# of the container lifetime.
-#
-# Our previous approach (wait in a subprocess before training) didn't work
-# because the "initialization failed" state persisted across calls in the
-# same container. The real fix: kick the driver exactly once with
-# nvidia-smi, which is what successfully-working baseline containers do
-# implicitly via their first torch.cuda call.
-#
-# Must happen BEFORE `import torch` (because any import that eagerly calls
-# cudaGetDeviceCount will cache the Error 802 state).
-def _early_cuda_kick() -> None:
-    deadline = time.time() + 120.0
-    attempt = 0
-    while time.time() < deadline:
-        attempt += 1
-        r = subprocess.run(['nvidia-smi'], capture_output=True, text=True, timeout=30)
-        if r.returncode == 0 and 'H200' in (r.stdout or '') or 'H100' in (r.stdout or '') \
-                or 'A100' in (r.stdout or '') or r.returncode == 0:
-            print(f'[boot] nvidia-smi OK on attempt {attempt}', flush=True)
-            break
-        print(f'[boot] nvidia-smi attempt {attempt} rc={r.returncode} stderr={(r.stderr or "")[:120]}',
-              flush=True)
-        time.sleep(2)
-    # After nvidia-smi, probe torch in a subprocess so any latent error state
-    # doesn't leak into the main process's CUDA context.
-    probe = 'import torch; import sys; sys.exit(0 if torch.cuda.is_available() else 1)'
-    torch_deadline = time.time() + 120.0
-    t_attempt = 0
-    while time.time() < torch_deadline:
-        t_attempt += 1
-        r = subprocess.run([sys.executable, '-c', probe], capture_output=True, text=True, timeout=60)
-        if r.returncode == 0:
-            print(f'[boot] torch.cuda.is_available() = True after {t_attempt} probe(s)', flush=True)
-            return
-        if t_attempt == 1:
-            print(f'[boot] torch cuda probe {t_attempt}: {(r.stderr or "")[:200]}', flush=True)
-        time.sleep(2)
-    print('[boot] WARNING: torch.cuda never became ready — training will likely fail', flush=True)
-_early_cuda_kick()
-# Hydrate triton compilation cache from HF Hub before any triton/mamba_ssm import.
-# triton_cache_setup.py is copied next to this file by the job bash command.
-try:
-    import triton_cache_setup as _tcs
-    _tcs.setup()
-except ImportError:
-    print('[boot] triton_cache_setup not found; skipping cache hydrate', flush=True)
-from huggingface_hub import HfApi  # noqa: E402  (import after cuda kick)
-REPO_ROOT = Path('/workspace/feather')
-CACHE_ROOT = Path.home() / '.cache' / 'autoresearch'
-LOG_FILE = REPO_ROOT / 'run_domain_expanded.log'
-JOB_ID = os.environ.get('JOB_ID', 'local-job')
-OUTPUT_REPO = os.environ.get('HF_REPO_ID', 'icarus112/feather-pretrain-checkpoints')
-TOKEN = os.environ.get('HF_TOKEN')
-RUNTIME_MODE = os.environ.get('FEATHER_RUNTIME_MODE', 'space')
-APP_PORT = int(os.environ.get('PORT', '7860'))
-class _HealthHandler(BaseHTTPRequestHandler):
-    def do_GET(self):
-        if self.path in ('/', '/health', '/healthz', '/ready'):
-            payload = {
-                'status': 'ok',
-                'mode': RUNTIME_MODE,
-                'job_id': JOB_ID,
-            }
-            body = json.dumps(payload).encode('utf-8')
-            self.send_response(200)
-            self.send_header('Content-Type', 'application/json')
-            self.send_header('Content-Length', str(len(body)))
-            self.end_headers()
-            self.wfile.write(body)
-            return
-        self.send_response(404)
-        self.end_headers()
-    def log_message(self, format, *args):
-        return
-def _start_health_server() -> HTTPServer:
-    server = HTTPServer(('0.0.0.0', APP_PORT), _HealthHandler)
-    thread = Thread(target=server.serve_forever, daemon=True)
-    thread.start()
-    print(f'[space] health server listening on 0.0.0.0:{APP_PORT}', flush=True)
-    return server
-def upload_artifact(api: HfApi, path: Path, dest: str) -> None:
-    if not path.exists():
-        print(f'[upload] skip missing {path}', flush=True)
-        return
-    api.upload_file(
-        path_or_fileobj=str(path),
-        path_in_repo=dest,
-        repo_id=OUTPUT_REPO,
-        repo_type='model',
-    )
-    print(f'[upload] uploaded {path} -> {OUTPUT_REPO}/{dest}', flush=True)
-def _wait_for_cuda_ready(timeout_s: int = 120) -> None:
-    """Block until CUDA is fully initialized or timeout.
-    On H200 hosts with NVSwitch/fabric manager, nvidia driver setup can race
-    with container start. cudaGetDeviceCount can return CUDA_ERROR_SYSTEM_NOT_READY
-    (error 802) for the first few seconds, and any import that triggers
-    @triton.autotune (e.g. mamba_ssm, torch amp utilities) blows up with
-    "0 active drivers" if it happens during that window.
-    We pre-init CUDA in a throwaway Python subprocess (so any error state does
-    not leak into the main training process) and retry until torch.cuda
-    reports ready.
-    """
-    import time as _t
-    probe = (
-        "import torch; "
-        "import sys; "
-        "avail = torch.cuda.is_available(); "
-        "count = torch.cuda.device_count() if avail else 0; "
-        "sys.exit(0 if (avail and count > 0) else 1)"
-    )
-    deadline = _t.time() + timeout_s
-    attempt = 0
-    while _t.time() < deadline:
-        attempt += 1
-        r = subprocess.run(['python', '-c', probe], capture_output=True, text=True)
-        if r.returncode == 0:
-            print(f'[job] CUDA ready after {attempt} probe(s)', flush=True)
-            return
-        if attempt == 1:
-            print(f'[job] CUDA not ready yet (will retry up to {timeout_s}s): {r.stderr.strip()[:200]}', flush=True)
-        _t.sleep(2)
-    print(f'[job] CUDA still not ready after {timeout_s}s — continuing anyway (training will likely fail)', flush=True)
-def run_job_mode() -> int:
-    os.chdir(REPO_ROOT)
-    os.environ.setdefault('HYDRA_TIME_BUDGET', '43200')
-    os.environ.setdefault('HYDRA_TARGET_SHARDS', '2048')
-    os.environ.setdefault('HYDRA_DOWNLOAD_WORKERS', '16')
-    os.environ.setdefault('HYDRA_CKPT_INTERVAL', '1000')
-    os.environ.setdefault('HYDRA_RESUME_CKPT', str(CACHE_ROOT / 'latest.pt'))
-    # CUDA readiness was kicked at module import via _early_cuda_kick. Keep
-    # the wait as a second safety net — no-op if CUDA already ready.
-    _wait_for_cuda_ready()
-    cmd = [
-        'bash',
-        './scripts/run_domain_expanded_pretrain.sh',
-        '--target-shards', os.environ['HYDRA_TARGET_SHARDS'],
-        '--download-workers', os.environ['HYDRA_DOWNLOAD_WORKERS'],
-    ]
-    print('[job] starting Feather domain-expanded pretrain', flush=True)
-    print(f'[job] command={cmd}', flush=True)
-    proc = subprocess.run(cmd, check=False)
-    # Push triton compilation cache back to HF Hub for next run.
-    try:
-        import triton_cache_setup as _tcs
-        _tcs.teardown()
-    except Exception as _tcs_err:
-        print(f'[triton_cache] teardown error (non-fatal): {_tcs_err}', flush=True)
-    if TOKEN:
-        api = HfApi(token=TOKEN)
-        try:
-            api.create_repo(repo_id=OUTPUT_REPO, repo_type='model', private=True, exist_ok=True)
-        except Exception as e:
-            print(f'[upload] create_repo warning: {type(e).__name__}: {e}', flush=True)
-        prefix = f'jobs/{JOB_ID}'
-        try:
-            upload_artifact(api, LOG_FILE, f'{prefix}/run_domain_expanded.log')
-            upload_artifact(api, CACHE_ROOT / 'latest.pt', f'{prefix}/latest.pt')
-            upload_artifact(api, CACHE_ROOT / 'pretrain_final.pt', f'{prefix}/pretrain_final.pt')
-        except Exception as e:
-            print(f'[upload] upload warning: {type(e).__name__}: {e}', flush=True)
-    else:
-        print('[upload] HF_TOKEN not set; skipping artifact upload', flush=True)
-    return proc.returncode
-def run_space_mode() -> int:
-    server = _start_health_server()
-    print('[space] Feather runtime image ready', flush=True)
-    try:
-        while True:
-            time.sleep(3600)
-    finally:
-        server.shutdown()
-        server.server_close()
-def main() -> int:
-    if RUNTIME_MODE == 'job':
-        return run_job_mode()
-    return run_space_mode()
-if __name__ == '__main__':
-    raise SystemExit(main())

+#!/usr/bin/env python3
+from __future__ import annotations
+import json
+import os
+import subprocess
+import sys
+import time
+from http.server import BaseHTTPRequestHandler, HTTPServer
+from pathlib import Path
+from threading import Thread
+def _prepend_library_path(*paths: str) -> None:
+    """Expose injected NVIDIA driver libraries before torch/triton imports."""
+    existing = [p for p in os.environ.get('LD_LIBRARY_PATH', '').split(':') if p]
+    merged = []
+    for p in paths:
+        if p and p not in merged:
+            merged.append(p)
+    for p in existing:
+        if p not in merged:
+            merged.append(p)
+    os.environ['LD_LIBRARY_PATH'] = ':'.join(merged)
+_prepend_library_path(
+    # HF Jobs injects the host driver under /usr/local/nvidia. Prefer that
+    # over CUDA toolkit/compat libcuda stubs; using /usr/local/cuda/compat here
+    # made A10G PyTorch report Error 803 despite nvidia-smi working.
+    '/usr/local/nvidia/lib64',
+    '/usr/local/nvidia/lib',
+    '/usr/lib/x86_64-linux-gnu',
+)
+# =============================================================================
+# EARLY CUDA FABRIC MANAGER KICK (before ANY CUDA-touching imports)
+# =============================================================================
+# On HF GPU hosts, cudaGetDeviceCount can transiently return not-ready errors
+# on first use. H200 is the worst case, because nvidia-fabricmanager on the
+# host synchronizes with the container's first driver call; A10G is usually
+# ready immediately, but the same early kick keeps the runtime deterministic.
+# Once any NVML/CUDA call succeeds once (even just nvidia-smi), the fabric is
+# up for the rest of the container lifetime.
+#
+# Our previous approach (wait in a subprocess before training) didn't work
+# because the "initialization failed" state persisted across calls in the
+# same container. The real fix: kick the driver exactly once with
+# nvidia-smi, which is what successfully-working baseline containers do
+# implicitly via their first torch.cuda call.
+#
+# Must happen BEFORE `import torch` (because any import that eagerly calls
+# cudaGetDeviceCount will cache the Error 802 state).
+def _early_cuda_kick() -> None:
+    deadline = time.time() + 120.0
+    attempt = 0
+    while time.time() < deadline:
+        attempt += 1
+        r = subprocess.run(['nvidia-smi'], capture_output=True, text=True, timeout=30)
+        if r.returncode == 0:
+            gpu_line = next((ln.strip() for ln in (r.stdout or '').splitlines() if any(g in ln for g in ('A10', 'A100', 'H100', 'H200', 'RTX'))), 'gpu=unknown')
+            print(f'[boot] nvidia-smi OK on attempt {attempt}: {gpu_line}', flush=True)
+            break
+        print(f'[boot] nvidia-smi attempt {attempt} rc={r.returncode} stderr={(r.stderr or "")[:120]}',
+              flush=True)
+        time.sleep(2)
+    # After nvidia-smi, probe torch in a subprocess so any latent error state
+    # doesn't leak into the main process's CUDA context.
+    probe = 'import torch; import sys; sys.exit(0 if torch.cuda.is_available() else 1)'
+    torch_deadline = time.time() + 120.0
+    t_attempt = 0
+    while time.time() < torch_deadline:
+        t_attempt += 1
+        r = subprocess.run([sys.executable, '-c', probe], capture_output=True, text=True, timeout=60)
+        if r.returncode == 0:
+            print(f'[boot] torch.cuda.is_available() = True after {t_attempt} probe(s)', flush=True)
+            return
+        if t_attempt == 1:
+            print(f'[boot] torch cuda probe {t_attempt}: {(r.stderr or "")[:200]}', flush=True)
+        time.sleep(2)
+    print('[boot] WARNING: torch.cuda never became ready — training will likely fail', flush=True)
+_early_cuda_kick()
+# Hydrate triton compilation cache from HF Hub before any triton/mamba_ssm import.
+# triton_cache_setup.py is copied next to this file by the job bash command.
+try:
+    import triton_cache_setup as _tcs
+    _tcs.setup()
+except ImportError:
+    print('[boot] triton_cache_setup not found; skipping cache hydrate', flush=True)
+from huggingface_hub import HfApi  # noqa: E402  (import after cuda kick)
+REPO_ROOT = Path('/workspace/feather')
+CACHE_ROOT = Path.home() / '.cache' / 'autoresearch'
+LOG_FILE = REPO_ROOT / 'run_domain_expanded.log'
+JOB_ID = os.environ.get('JOB_ID', 'local-job')
+OUTPUT_REPO = os.environ.get('HF_REPO_ID', 'icarus112/feather-pretrain-checkpoints')
+TOKEN = os.environ.get('HF_TOKEN')
+RUNTIME_MODE = os.environ.get('FEATHER_RUNTIME_MODE', 'space')
+APP_PORT = int(os.environ.get('PORT', '7860'))
+class _HealthHandler(BaseHTTPRequestHandler):
+    def do_GET(self):
+        if self.path in ('/', '/health', '/healthz', '/ready'):
+            payload = {
+                'status': 'ok',
+                'mode': RUNTIME_MODE,
+                'job_id': JOB_ID,
+            }
+            body = json.dumps(payload).encode('utf-8')
+            self.send_response(200)
+            self.send_header('Content-Type', 'application/json')
+            self.send_header('Content-Length', str(len(body)))
+            self.end_headers()
+            self.wfile.write(body)
+            return
+        self.send_response(404)
+        self.end_headers()
+    def log_message(self, format, *args):
+        return
+def _start_health_server() -> HTTPServer:
+    server = HTTPServer(('0.0.0.0', APP_PORT), _HealthHandler)
+    thread = Thread(target=server.serve_forever, daemon=True)
+    thread.start()
+    print(f'[space] health server listening on 0.0.0.0:{APP_PORT}', flush=True)
+    return server
+def upload_artifact(api: HfApi, path: Path, dest: str) -> None:
+    if not path.exists():
+        print(f'[upload] skip missing {path}', flush=True)
+        return
+    api.upload_file(
+        path_or_fileobj=str(path),
+        path_in_repo=dest,
+        repo_id=OUTPUT_REPO,
+        repo_type='model',
+    )
+    print(f'[upload] uploaded {path} -> {OUTPUT_REPO}/{dest}', flush=True)
+def _wait_for_cuda_ready(timeout_s: int = 120) -> None:
+    """Block until CUDA is fully initialized or timeout.
+    On H200 hosts with NVSwitch/fabric manager, nvidia driver setup can race
+    with container start. cudaGetDeviceCount can return CUDA_ERROR_SYSTEM_NOT_READY
+    (error 802) for the first few seconds, and any import that triggers
+    @triton.autotune (e.g. mamba_ssm, torch amp utilities) blows up with
+    "0 active drivers" if it happens during that window.
+    We pre-init CUDA in a throwaway Python subprocess (so any error state does
+    not leak into the main training process) and retry until torch.cuda
+    reports ready.
+    Args:
+        timeout_s: Total budget in seconds for all probes and sleeps. A
+            non-positive value skips probing entirely.
+    Returns:
+        None. A failed wait is only logged; the caller continues regardless.
+    """
+    import sys as _sys
+    import time as _t
+    probe = (
+        "import torch; "
+        "import sys; "
+        "avail = torch.cuda.is_available(); "
+        "count = torch.cuda.device_count() if avail else 0; "
+        "torch.empty(1, device='cuda') if (avail and count > 0) else None; "
+        "from triton.runtime import driver; "
+        "driver.active.get_current_device(); "
+        "sys.exit(0 if (avail and count > 0) else 1)"
+    )
+    # Probe with the interpreter running this script so the check sees the
+    # same torch/triton install as training; fall back to PATH lookup only if
+    # the interpreter path is unknown.
+    interpreter = _sys.executable or 'python'
+    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
+    deadline = _t.monotonic() + timeout_s
+    attempt = 0
+    last_error = ''
+    while True:
+        remaining = deadline - _t.monotonic()
+        if remaining <= 0:
+            break
+        attempt += 1
+        try:
+            # Bound each probe by the remaining budget so a hung probe cannot
+            # block the job past timeout_s.
+            r = subprocess.run(
+                [interpreter, '-c', probe],
+                capture_output=True,
+                text=True,
+                timeout=remaining,
+            )
+        except subprocess.TimeoutExpired:
+            last_error = f'probe {attempt} did not finish within the remaining {remaining:.0f}s'
+            break
+        if r.returncode == 0:
+            print(f'[job] CUDA/Triton ready after {attempt} probe(s)', flush=True)
+            return
+        last_error = r.stderr.strip()[:200]
+        if attempt == 1:
+            print(f'[job] CUDA not ready yet (will retry up to {timeout_s}s): {last_error}', flush=True)
+        # Never sleep past the deadline.
+        _t.sleep(max(0.0, min(2.0, deadline - _t.monotonic())))
+    print(f'[job] CUDA still not ready after {timeout_s}s — continuing anyway (training will likely fail)', flush=True)
+    if last_error:
+        print(f'[job] last CUDA probe error: {last_error}', flush=True)
+def run_job_mode() -> int:
+    """Run the domain-expanded pretrain script and upload its artifacts.
+    Fills in default environment variables without overriding values that are
+    already set, waits for CUDA, tries to build the retina, runs
+    ``scripts/run_domain_expanded_pretrain.sh``, tears down the Triton cache,
+    and uploads the log and checkpoints to ``OUTPUT_REPO`` when ``TOKEN`` is
+    set. Retina, cache-teardown and upload failures are logged, not raised.
+    Returns:
+        The exit code of the training script.
+    """
+    os.chdir(REPO_ROOT)
+    os.environ.setdefault('HYDRA_TIME_BUDGET', '43200')
+    os.environ.setdefault('HYDRA_TARGET_SHARDS', '2048')
+    os.environ.setdefault('HYDRA_DOWNLOAD_WORKERS', '16')
+    os.environ.setdefault('HYDRA_CKPT_INTERVAL', '1000')
+    os.environ.setdefault('HYDRA_RESUME_CKPT', str(CACHE_ROOT / 'latest.pt'))
+    os.environ.setdefault('FEATHER_GPU_PROFILE', 'a10g-large')
+    os.environ.setdefault('HTM_CUDA_ARCH', 'sm_86')
+    os.environ.setdefault('TORCH_CUDA_ARCH_LIST', '8.6')
+    # The Triton cache location and repo are keyed by GPU profile.
+    os.environ.setdefault('TRITON_CACHE_DIR', f"/workspace/triton_cache/{os.environ['FEATHER_GPU_PROFILE']}")
+    os.environ.setdefault('TRITON_CACHE_REPO', f"icarus112/feather-triton-cache-{os.environ['FEATHER_GPU_PROFILE']}")
+    print(f"[job] gpu_profile={os.environ['FEATHER_GPU_PROFILE']} htm_cuda_arch={os.environ['HTM_CUDA_ARCH']} torch_cuda_arch={os.environ['TORCH_CUDA_ARCH_LIST']}", flush=True)
+    # CUDA readiness was kicked at module import via _early_cuda_kick. Keep
+    # the wait as a second safety net — no-op if CUDA already ready.
+    _wait_for_cuda_ready()
+    cmd = [
+        'bash',
+        './scripts/run_domain_expanded_pretrain.sh',
+        '--target-shards', os.environ['HYDRA_TARGET_SHARDS'],
+        '--download-workers', os.environ['HYDRA_DOWNLOAD_WORKERS'],
+    ]
+    print('[job] ensuring retina.npz before training...', flush=True)
+    try:
+        sys.path.insert(0, str(REPO_ROOT))
+        from subsystems.sdr_retina import build_retina
+        build_retina()
+    except Exception as _retina_err:
+        print(f'[job] retina bootstrap warning (train.py may still build it): {_retina_err}', flush=True)
+    print('[job] starting Feather domain-expanded pretrain', flush=True)
+    print(f'[job] command={cmd}', flush=True)
+    # check=False: a failed run must still reach the upload step below.
+    proc = subprocess.run(cmd, check=False)
+    # Push triton compilation cache back to HF Hub for next run.
+    try:
+        import triton_cache_setup as _tcs
+        _tcs.teardown()
+    except Exception as _tcs_err:
+        print(f'[triton_cache] teardown error (non-fatal): {_tcs_err}', flush=True)
+    if TOKEN:
+        api = HfApi(token=TOKEN)
+        try:
+            api.create_repo(repo_id=OUTPUT_REPO, repo_type='model', private=True, exist_ok=True)
+        except Exception as e:
+            print(f'[upload] create_repo warning: {type(e).__name__}: {e}', flush=True)
+        prefix = f'jobs/{JOB_ID}'
+        artifacts = (
+            (LOG_FILE, f'{prefix}/run_domain_expanded.log'),
+            (CACHE_ROOT / 'latest.pt', f'{prefix}/latest.pt'),
+            (CACHE_ROOT / 'pretrain_final.pt', f'{prefix}/pretrain_final.pt'),
+        )
+        # Upload each artifact independently so a failure on one (e.g. the log)
+        # does not prevent the checkpoints from being uploaded.
+        for artifact_path, dest in artifacts:
+            try:
+                upload_artifact(api, artifact_path, dest)
+            except Exception as e:
+                print(f'[upload] upload warning: {type(e).__name__}: {e}', flush=True)
+    else:
+        print('[upload] HF_TOKEN not set; skipping artifact upload', flush=True)
+    return proc.returncode
+def run_space_mode() -> int:
+    """Serve the health endpoint and idle until an exception unwinds the loop.
+    The sleep loop has no normal exit; the ``finally`` clause shuts down and
+    closes the health server on the way out.
+    """
+    health_server = _start_health_server()
+    print('[space] Feather runtime image ready', flush=True)
+    idle_seconds = 3600
+    try:
+        while True:
+            time.sleep(idle_seconds)
+    finally:
+        health_server.shutdown()
+        health_server.server_close()
+def main() -> int:
+    """Run job mode when ``RUNTIME_MODE`` is ``'job'``, otherwise space mode."""
+    runner = run_job_mode if RUNTIME_MODE == 'job' else run_space_mode
+    return runner()
+# Script entry point: exit the process with the status returned by main().
+if __name__ == '__main__':
+    raise SystemExit(main())

mamba_ssm_init.py CHANGED Viewed

@@ -1,101 +1,69 @@
-# mamba_ssm package init — minimal override to avoid broken selective_scan_cuda.so
-# ABI mismatch with the base image's libtorch.
-#
-# The upstream __init__.py eagerly imports selective_scan_cuda which fails on
-# pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel (undefined c10::Warning ctor
-# symbol). We only need Mamba3 (grafted from main, pure-Triton), so we skip
-# all compiled-CUDA imports here and let Mamba3 load directly.
-__version__ = "2.3.1+feather-graft"
-# selective_scan_fn / mamba_inner_fn are shimmed to None — they are NOT used
-# by the Feather training path (which is Mamba3-only). If any import path
-# hits this, it will get a clear AttributeError instead of an obscure ImportError.
-selective_scan_fn = None
-mamba_inner_fn = None
-# --- triton API compatibility shims -----------------------------------------
-# Version matrix is hostile: torch 2.6 pins triton==3.2.0 because torch._inductor
-# imports AttrsDescriptor from triton.compiler.compiler — removed in triton 3.4+.
-# Grafted Mamba3 (from mamba-ssm main) needs triton.set_allocator and
-# tl.make_tensor_descriptor, both added in triton 3.3+. No single triton version
-# satisfies both simultaneously. We run on triton 3.5.1 (latest, has both mamba3
-# APIs) and shim AttrsDescriptor as a stub dataclass for torch._inductor. The
-# stub is never actually invoked at runtime because the codebase does not use
-# torch.compile — but importing torch._inductor.* still requires the symbol to
-# exist at module load time.
 import triton as _triton  # noqa: E402
 if not hasattr(_triton, "set_allocator"):
-    def _noop_set_allocator(_fn):  # pragma: no cover
-        return None
-    _triton.set_allocator = _noop_set_allocator
-import triton.compiler.compiler as _tcc  # noqa: E402
-if not hasattr(_tcc, "AttrsDescriptor"):
-    class _AttrsDescriptorShim:
-        """Stub for torch._inductor compatibility on triton >= 3.4.
-        torch._inductor.runtime.hints imports this at module load but the
-        constructor is only called inside torch.compile paths. Accept any
-        args/kwargs so the import itself succeeds."""
-        def __init__(self, *args, **kwargs):
-            self.args = args
-            self.kwargs = kwargs
-        @classmethod
-        def from_hints(cls, *args, **kwargs):
-            return cls(*args, **kwargs)
-    _tcc.AttrsDescriptor = _AttrsDescriptorShim
-# triton_key: removed in triton 3.5, used by torch._inductor.codecache for
-# FxGraphCache key derivation. Return a stable string so caching still works.
-if not hasattr(_tcc, "triton_key"):
-    def _triton_key_shim():
-        import triton as _t
-        return f"triton-{_t.__version__}-shim"
-    _tcc.triton_key = _triton_key_shim
-# Triton 3.5 wheels can occasionally load with an empty backend registry in
-# HF Jobs environments (driver.active -> "0 active drivers"), even though the
-# NVIDIA backend module is present and CudaDriver.is_active() is True.
-# Patch _create_driver to directly select CudaDriver when registry discovery
-# returns empty.
-import importlib as _importlib  # noqa: E402
-_triton_driver_mod = _importlib.import_module("triton.runtime.driver")
-if getattr(_triton_driver_mod, "backends", None) == {}:
-    from triton.backends.nvidia import driver as _nvidia_driver  # noqa: E402
-    def _create_driver_shim():
-        if hasattr(_nvidia_driver, "CudaDriver") and _nvidia_driver.CudaDriver.is_active():
-            return _nvidia_driver.CudaDriver()
-        raise RuntimeError(
-            "Triton backend registry is empty and NVIDIA CudaDriver is not active"
-        )
-    _triton_driver_mod._create_driver = _create_driver_shim
-    if hasattr(_triton_driver_mod, "driver") and hasattr(_triton_driver_mod.driver, "reset_active"):
-        _triton_driver_mod.driver.reset_active()
-_triton_compiler_mod = _importlib.import_module("triton.compiler.compiler")
-if getattr(_triton_compiler_mod, "backends", None) == {}:
-    from triton.backends import Backend as _Backend  # noqa: E402
-    from triton.backends.nvidia.compiler import CUDABackend as _CUDABackend  # noqa: E402
-    from triton.backends.nvidia.driver import CudaDriver as _CudaDriver  # noqa: E402
-    _triton_compiler_mod.backends["nvidia"] = _Backend(
-        compiler=_CUDABackend,
-        driver=_CudaDriver,
-    )
-# Suppress torch.compile/_dynamo errors globally — we don't rely on torch.compile
-# for performance in this codebase (Muon + mamba3 CUDA kernels already fused),
-# so fall back to eager on any dynamo failure rather than crashing. This is
-# defense-in-depth against further triton API drift.
-try:
-    import torch._dynamo  # noqa: F401 — triggers dynamo module init
-    torch._dynamo.config.suppress_errors = True
-except Exception:  # pragma: no cover
-    pass
-# Expose Mamba3 at top level to match `from mamba_ssm import Mamba3`.
-from mamba_ssm.modules.mamba3 import Mamba3  # noqa: E402

+# mamba_ssm package init — minimal override to avoid broken selective_scan_cuda.so
+# ABI mismatch with the base image's libtorch.
+#
+# The upstream __init__.py eagerly imports selective_scan_cuda which fails on
+# pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel (undefined c10::Warning ctor
+# symbol). We only need Mamba3 (grafted from main, pure-Triton), so we skip
+# all compiled-CUDA imports here and let Mamba3 load directly.
+__version__ = "2.3.1+feather-graft"
+# selective_scan_fn / mamba_inner_fn are shimmed to None — they are NOT used
+# by the Feather training path (which is Mamba3-only). Importing these names
+# succeeds and yields None; calling one raises TypeError ('NoneType' object is
+# not callable) instead of an obscure ImportError at import time.
+selective_scan_fn = None
+mamba_inner_fn = None
+# --- triton API compatibility shims -----------------------------------------
+# Version matrix is hostile: torch 2.6 pins triton==3.2.0 because torch._inductor
+# imports AttrsDescriptor from triton.compiler.compiler — removed in triton 3.4+.
+# Grafted Mamba3 (from mamba-ssm main) needs triton.set_allocator and
+# tl.make_tensor_descriptor, both added in triton 3.3+. No single triton version
+# satisfies both simultaneously. We run on triton 3.5.1 (latest, has both mamba3
+# APIs) and shim AttrsDescriptor as a plain stub class for torch._inductor. The
+# stub is never actually invoked at runtime because the codebase does not use
+# torch.compile — but importing torch._inductor.* still requires the symbol to
+# exist at module load time.
 import triton as _triton  # noqa: E402
 if not hasattr(_triton, "set_allocator"):
+    # Older triton without set_allocator: install a no-op that ignores its argument.
+    def _noop_set_allocator(_fn):  # pragma: no cover
+        return None
+    _triton.set_allocator = _noop_set_allocator
+import triton.compiler.compiler as _tcc  # noqa: E402
+if not hasattr(_tcc, "AttrsDescriptor"):
+    class _AttrsDescriptorShim:
+        """Stub for torch._inductor compatibility on triton >= 3.4.
+        torch._inductor.runtime.hints imports this at module load but the
+        constructor is only called inside torch.compile paths. Accept any
+        args/kwargs so the import itself succeeds."""
+        def __init__(self, *args, **kwargs):
+            # Keep whatever was passed; nothing in this file reads it back.
+            self.args = args
+            self.kwargs = kwargs
+        @classmethod
+        def from_hints(cls, *args, **kwargs):
+            # Same permissive contract as the constructor.
+            return cls(*args, **kwargs)
+    _tcc.AttrsDescriptor = _AttrsDescriptorShim
+# triton_key: removed in triton 3.5, used by torch._inductor.codecache for
+# FxGraphCache key derivation. Return a stable string so caching still works.
+if not hasattr(_tcc, "triton_key"):
+    def _triton_key_shim():
+        # The key changes only when the installed triton version changes.
+        import triton as _t
+        return f"triton-{_t.__version__}-shim"
+    _tcc.triton_key = _triton_key_shim
+# Suppress torch.compile/_dynamo errors globally — we don't rely on torch.compile
+# for performance in this codebase (Muon + mamba3 CUDA kernels already fused),
+# so fall back to eager on any dynamo failure rather than crashing. This is
+# defense-in-depth against further triton API drift.
+try:
+    import torch._dynamo  # noqa: F401 — triggers dynamo module init
+    torch._dynamo.config.suppress_errors = True
+except Exception:  # pragma: no cover
+    # Best effort: a failed dynamo import must not break importing mamba_ssm.
+    pass
+# Expose Mamba3 at top level to match `from mamba_ssm import Mamba3`.
+from mamba_ssm.modules.mamba3 import Mamba3  # noqa: E402

overlay/.dockerignore CHANGED Viewed

@@ -1,20 +1,20 @@
-.git
-.github
-.venv
-.remember
-.letta
-.claude
-__pycache__
-*.pyc
-*.pyo
-*.pyd
-*.log
-run_*.log
-run*.log
-*.txt
-WORKER_COMPLETE
-autoresearch_loop.log
-data/
-state_store/
-htm_rust/target/
-hydra-core/target/

+.git
+.github
+.venv
+.remember
+.letta
+.claude
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+*.log
+run_*.log
+run*.log
+*.txt
+WORKER_COMPLETE
+autoresearch_loop.log
+data/
+state_store/
+htm_rust/target/
+hydra-core/target/

overlay/configs/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
-from configs.hardware_config import HardwareConfig
-from configs.harness_config import HarnessConfig
-from configs.model_config import PostSemClawConfig
-__all__ = ["PostSemClawConfig", "HarnessConfig", "HardwareConfig"]

+from configs.hardware_config import HardwareConfig
+from configs.harness_config import HarnessConfig
+from configs.model_config import PostSemClawConfig
+__all__ = ["PostSemClawConfig", "HarnessConfig", "HardwareConfig"]

overlay/configs/hardware_config.py CHANGED Viewed

@@ -1,104 +1,104 @@
-"""Hardware detection and memory budget configuration."""
-from __future__ import annotations
-import torch
-from pydantic import BaseModel, Field
-class HardwareConfig(BaseModel):
-    """Auto-detected hardware configuration with memory budgets."""
-    gpu_name: str = Field(default="unknown", description="GPU device name")
-    gpu_memory_mb: int = Field(default=0, description="Total GPU memory in MB")
-    gpu_vram_mb: int = Field(default=0, description="Alias for gpu_memory_mb (legacy compat)")
-    compute_capability: tuple[int, int] = Field(
-        default=(0, 0), description="CUDA compute capability"
-    )
-    peak_flops: float = Field(
-        default=12.74e12, description="Peak FP32 FLOPS for MFU calculation"
-    )
-    bf16_peak_flops: float = Field(
-        default=38.1e12, description="Peak BF16 FLOPS (RTX 3060 default)"
-    )
-    # Memory budget
-    model_budget_mb: int = Field(
-        default=1500, description="Max MB for model params + optimizer"
-    )
-    activation_budget_mb: int = Field(
-        default=3000, description="Max MB for activations"
-    )
-    overhead_mb: int = Field(
-        default=500, description="Reserved for CUDA context + PyTorch overhead"
-    )
-    max_vram_usage_pct: float = Field(
-        default=90.0, description="Max VRAM usage as % of total"
-    )
-    gradient_checkpointing: bool = Field(
-        default=False, description="Enable gradient checkpointing to save VRAM"
-    )
-    @classmethod
-    def detect(cls) -> HardwareConfig:
-        """Auto-detect hardware from current CUDA device."""
-        if not torch.cuda.is_available():
-            return cls()
-        device = torch.cuda.current_device()
-        props = torch.cuda.get_device_properties(device)
-        cap = (props.major, props.minor)
-        mem_mb = props.total_memory // (1024 * 1024)
-        gpu_name = props.name
-        # Peak FP32 FLOPS lookup by compute capability (approximate)
-        fp32_flops_table: dict[tuple[int, int], float] = {
-            (8, 6): 12.74e12,  # RTX 3060
-            (8, 9): 40.09e12,  # RTX 4090
-            (9, 0): 989.5e12,  # H100 (BF16)
-        }
-        peak = fp32_flops_table.get(cap, 12.74e12)
-        # BF16 peak FLOPS lookup by GPU name substring
-        bf16_flops_table: dict[str, float] = {
-            "3060": 38.1e12,
-            "3090": 71.0e12,
-            "4090": 165.2e12,
-            "A100": 312e12,
-            "H100": 989.5e12,
-            "A10G": 70.0e12,
-        }
-        bf16_peak = 38.1e12  # default to RTX 3060
-        for key, val in bf16_flops_table.items():
-            if key in gpu_name:
-                bf16_peak = val
-                break
-        # Memory budget: leave overhead_mb for CUDA context
-        overhead = 500
-        available = mem_mb - overhead
-        model_budget = int(available * 0.3)      # 30% for params + optimizer
-        activation_budget = int(available * 0.7)  # 70% for activations
-        return cls(
-            gpu_name=gpu_name,
-            gpu_memory_mb=mem_mb,
-            gpu_vram_mb=mem_mb,
-            compute_capability=cap,
-            peak_flops=peak,
-            bf16_peak_flops=bf16_peak,
-            model_budget_mb=model_budget,
-            activation_budget_mb=activation_budget,
-        )
-    def suggest_batch_size(self, d_model: int, seq_len: int, n_layer: int) -> int:
-        """Suggest batch size based on activation budget.
-        Uses rough estimate: per-sample activation ~= n_layer * seq_len * d_model
-        * 4 bytes * 2 (fwd + bwd).
-        """
-        per_sample_mb = n_layer * seq_len * d_model * 4 * 2 / (1024 * 1024)
-        if per_sample_mb <= 0:
-            return 1
-        batch = max(1, int(self.activation_budget_mb / per_sample_mb))
-        # Round down to power of 2
-        return 2 ** (batch.bit_length() - 1) if batch > 1 else 1

+"""Hardware detection and memory budget configuration."""
+from __future__ import annotations
+import torch
+from pydantic import BaseModel, Field
+class HardwareConfig(BaseModel):
+    """Auto-detected hardware configuration with memory budgets.
+    Construct directly for explicit values, or call ``detect()`` to fill the
+    fields from the current CUDA device.
+    """
+    gpu_name: str = Field(default="unknown", description="GPU device name")
+    gpu_memory_mb: int = Field(default=0, description="Total GPU memory in MB")
+    gpu_vram_mb: int = Field(default=0, description="Alias for gpu_memory_mb (legacy compat)")
+    compute_capability: tuple[int, int] = Field(
+        default=(0, 0), description="CUDA compute capability"
+    )
+    peak_flops: float = Field(
+        default=12.74e12, description="Peak FP32 FLOPS for MFU calculation"
+    )
+    bf16_peak_flops: float = Field(
+        default=38.1e12, description="Peak BF16 FLOPS (RTX 3060 default)"
+    )
+    # Memory budget
+    model_budget_mb: int = Field(
+        default=1500, description="Max MB for model params + optimizer"
+    )
+    activation_budget_mb: int = Field(
+        default=3000, description="Max MB for activations"
+    )
+    overhead_mb: int = Field(
+        default=500, description="Reserved for CUDA context + PyTorch overhead"
+    )
+    max_vram_usage_pct: float = Field(
+        default=90.0, description="Max VRAM usage as % of total"
+    )
+    gradient_checkpointing: bool = Field(
+        default=False, description="Enable gradient checkpointing to save VRAM"
+    )
+    @classmethod
+    def detect(cls) -> HardwareConfig:
+        """Auto-detect hardware from current CUDA device.
+        Returns:
+            An all-defaults config when CUDA is unavailable; otherwise a config
+            with the device name, memory, compute capability, peak-FLOPS
+            lookups and a 30/70 split of the memory left after the overhead.
+        """
+        if not torch.cuda.is_available():
+            return cls()
+        device = torch.cuda.current_device()
+        props = torch.cuda.get_device_properties(device)
+        cap = (props.major, props.minor)
+        mem_mb = props.total_memory // (1024 * 1024)
+        gpu_name = props.name
+        # Peak FP32 FLOPS lookup by compute capability (approximate)
+        fp32_flops_table: dict[tuple[int, int], float] = {
+            (8, 6): 12.74e12,  # RTX 3060
+            (8, 9): 40.09e12,  # RTX 4090
+            # NOTE(review): this entry equals the H100 BF16 figure in the table
+            # below, not an FP32 peak — confirm that is intended for MFU.
+            (9, 0): 989.5e12,  # H100 (BF16)
+        }
+        # Unknown capabilities fall back to the RTX 3060 figure.
+        peak = fp32_flops_table.get(cap, 12.74e12)
+        # BF16 peak FLOPS lookup by GPU name substring
+        bf16_flops_table: dict[str, float] = {
+            "3060": 38.1e12,
+            "3090": 71.0e12,
+            "4090": 165.2e12,
+            "A100": 312e12,
+            "H100": 989.5e12,
+            # H200 device names do not contain "H100"; without this entry an
+            # H200 silently got the RTX 3060 default. Same value as H100.
+            "H200": 989.5e12,
+            "A10G": 70.0e12,
+        }
+        bf16_peak = 38.1e12  # default to RTX 3060
+        # First matching substring wins (dict insertion order).
+        for key, val in bf16_flops_table.items():
+            if key in gpu_name:
+                bf16_peak = val
+                break
+        # Memory budget: leave overhead_mb for CUDA context
+        overhead = 500
+        # Clamp at zero so a device reporting less than the overhead cannot
+        # produce negative budgets.
+        available = max(0, mem_mb - overhead)
+        model_budget = int(available * 0.3)      # 30% for params + optimizer
+        activation_budget = int(available * 0.7)  # 70% for activations
+        return cls(
+            gpu_name=gpu_name,
+            gpu_memory_mb=mem_mb,
+            gpu_vram_mb=mem_mb,
+            compute_capability=cap,
+            peak_flops=peak,
+            bf16_peak_flops=bf16_peak,
+            model_budget_mb=model_budget,
+            activation_budget_mb=activation_budget,
+        )
+    def suggest_batch_size(self, d_model: int, seq_len: int, n_layer: int) -> int:
+        """Suggest batch size based on activation budget.
+        Uses rough estimate: per-sample activation ~= n_layer * seq_len * d_model
+        * 4 bytes * 2 (fwd + bwd).
+        Returns:
+            The largest power of two not exceeding
+            ``activation_budget_mb / per_sample_mb``, and at least 1. Returns 1
+            when the per-sample estimate is not positive.
+        """
+        per_sample_mb = n_layer * seq_len * d_model * 4 * 2 / (1024 * 1024)
+        if per_sample_mb <= 0:
+            return 1
+        batch = max(1, int(self.activation_budget_mb / per_sample_mb))
+        # Round down to power of 2
+        return 2 ** (batch.bit_length() - 1) if batch > 1 else 1

overlay/configs/harness_config.py CHANGED Viewed

@@ -3,53 +3,53 @@ from typing import Literal
 from pydantic import BaseModel, Field
-type GateThresholds = dict[str, float]
-type GateConfig = dict[str, GateThresholds]
 class HarnessConfig(BaseModel):
-    """Configuration for the HYDRA harness behavior."""
-    # Inner loop
-    time_budget_seconds: int = Field(
-        default=300, ge=60, description="Training time budget per experiment in seconds"
-    )
-    max_experiments: int = Field(
-        default=1000, ge=0, description="Max experiments before stopping (0=infinite)"
-    )
-    # Meta-agent
-    meta_interval: int = Field(
-        default=20, ge=5, description="Run meta-agent every N experiments"
-    )
-    max_meta_changes: int = Field(
-        default=3, ge=1, le=10, description="Max changes per meta-iteration"
-    )
-    # Search strategy
-    exploration_mode: Literal["conservative", "balanced", "bold"] = "balanced"
-    exploration_budget: int = Field(
-        default=5, ge=1, description="Consecutive bold experiments when stuck"
-    )
-    stuck_threshold: int = Field(
-        default=10, ge=3, description="No improvement for N experiments = stuck"
-    )
-    crash_threshold: float = Field(
-        default=0.5,
-        ge=0.1,
-        le=1.0,
-        description="Crash rate threshold for BROKEN state",
-    )
-    regression_tolerance: float = Field(
-        default=0.05,
-        ge=0,
-        le=0.2,
-        description="Max val_bpb regression from best (fraction)",
-    )
-    max_regression_pct: float = Field(
-        default=5.0, description="Max % regression from best known val_bpb"
-    )
     # Keep/discard criteria
     primary_metric: str = "val_bpb"
     secondary_metrics: GateConfig = Field(
@@ -63,23 +63,23 @@ class HarnessConfig(BaseModel):
             "hestia_quant_error": {"max": 0.05},
         }
     )
-    # Experiment execution
-    experiment_timeout: int = Field(
-        default=600, ge=300, description="Kill experiment after N seconds"
-    )
-    warmup_steps: int = Field(
-        default=10, ge=0, description="Steps to exclude from timing"
-    )
-    # Git
-    branch_prefix: str = Field(default="autoresearch", description="Branch naming prefix")
-    results_file: str = Field(default="results.tsv", description="Experiment log file")
-    # Secondary metric gates (optional keep/discard criteria)
-    gate_mhc_spectral_norm: float | None = Field(
-        default=None, description="Max mhc_spectral_norm for keep (None=disabled)"
-    )
     gate_engram_hit_rate: float | None = Field(
         default=None, description="Min engram_hit_rate for keep (None=disabled)"
     )

 from pydantic import BaseModel, Field
+GateThresholds = dict[str, float]
+GateConfig = dict[str, GateThresholds]
 class HarnessConfig(BaseModel):
+    """Configuration for the HYDRA harness behavior."""
+    # Inner loop
+    time_budget_seconds: int = Field(
+        default=300, ge=60, description="Training time budget per experiment in seconds"
+    )
+    max_experiments: int = Field(
+        default=1000, ge=0, description="Max experiments before stopping (0=infinite)"
+    )
+    # Meta-agent
+    meta_interval: int = Field(
+        default=20, ge=5, description="Run meta-agent every N experiments"
+    )
+    max_meta_changes: int = Field(
+        default=3, ge=1, le=10, description="Max changes per meta-iteration"
+    )
+    # Search strategy
+    exploration_mode: Literal["conservative", "balanced", "bold"] = "balanced"
+    exploration_budget: int = Field(
+        default=5, ge=1, description="Consecutive bold experiments when stuck"
+    )
+    stuck_threshold: int = Field(
+        default=10, ge=3, description="No improvement for N experiments = stuck"
+    )
+    crash_threshold: float = Field(
+        default=0.5,
+        ge=0.1,
+        le=1.0,
+        description="Crash rate threshold for BROKEN state",
+    )
+    regression_tolerance: float = Field(
+        default=0.05,
+        ge=0,
+        le=0.2,
+        description="Max val_bpb regression from best (fraction)",
+    )
+    max_regression_pct: float = Field(
+        default=5.0, description="Max % regression from best known val_bpb"
+    )
     # Keep/discard criteria
     primary_metric: str = "val_bpb"
     secondary_metrics: GateConfig = Field(
             "hestia_quant_error": {"max": 0.05},
         }
     )
+    # Experiment execution
+    experiment_timeout: int = Field(
+        default=600, ge=300, description="Kill experiment after N seconds"
+    )
+    warmup_steps: int = Field(
+        default=10, ge=0, description="Steps to exclude from timing"
+    )
+    # Git
+    branch_prefix: str = Field(default="autoresearch", description="Branch naming prefix")
+    results_file: str = Field(default="results.tsv", description="Experiment log file")
+    # Secondary metric gates (optional keep/discard criteria)
+    gate_mhc_spectral_norm: float | None = Field(
+        default=None, description="Max mhc_spectral_norm for keep (None=disabled)"
+    )
     gate_engram_hit_rate: float | None = Field(
         default=None, description="Min engram_hit_rate for keep (None=disabled)"
     )

overlay/configs/model_config.py CHANGED Viewed

@@ -1,80 +1,80 @@
-"""Post-SEM-Claw model configuration with Pydantic validation."""
-from pydantic import BaseModel, Field, field_validator
-class PostSemClawConfig(BaseModel):
-    """Configuration for the Post-SEM-Claw architecture.
-    Default values mirror the @dataclass in train.py exactly.
-    train.py is the source of truth — this file must stay in sync with it.
-    """
-    # Sequence
-    sequence_len: int = Field(default=2048, description="Context length (from prepare.py MAX_SEQ_LEN)")
-    vocab_size: int = Field(default=8192, description="Vocabulary size (from prepare.py VOCAB_SIZE)")
-    # Mamba-3 SSM
-    n_layer: int = Field(default=4, ge=1, le=48, description="Number of Mamba-3 blocks")
-    d_model: int = Field(default=256, ge=64, description="Model embedding dimension")
-    d_state: int = Field(default=64, ge=16, description="SSM state dimension")
-    headdim: int = Field(default=32, ge=16, description="SSM head dimension")
-    n_heads: int = Field(default=8, ge=1, description="Number of SSM heads (d_model // headdim)")
-    expand: int = Field(default=2, ge=1, le=4, description="Inner dim multiplier (inner_dim = expand * d_model)")
-    # mHC (Manifold Hyper-Connection)
-    mhc_n_streams: int = Field(default=4, ge=2, le=8, description="Number of residual streams")
-    mhc_sinkhorn_iters: int = Field(default=5, ge=1, le=100, description="Sinkhorn-Knopp iterations")
-    # Engram (conditional memory)
-    engram_n_columns: int = Field(default=4096, ge=256, description="Hash table columns")
-    engram_key_dim: int = Field(default=64, ge=16, description="Engram key dimension")
-    engram_layer_idx: int = Field(default=1, ge=0, description="Which layer gets engram (0-indexed)")
-    # Hestia QAT (disabled Phase 1, skeleton only)
-    hestia_enabled: bool = Field(default=False, description="Enable Hestia quantization")
-    hestia_bits: float = Field(default=1.58, gt=0, description="Target quantization bits (1.58 = 1.58-bit ternary)")
-    # SDR (bypass-only in Phase 1)
-    sdr_enabled: bool = Field(default=False, description="Enable stochastic resonance")
-    sdr_k: int = Field(default=64, ge=1, description="Top-K sparsification")
-    sdr_noise_std: float = Field(default=0.1, ge=0.0, description="SR noise standard deviation")
-    @field_validator("n_heads")
-    @classmethod
-    def validate_heads(cls, v: int, info: "FieldValidationInfo") -> int:
-        """Ensure n_heads equals d_model // headdim."""
-        d_model = info.data.get("d_model", 256)
-        headdim = info.data.get("headdim", 32)
-        expected = d_model // headdim
-        if v != expected:
-            raise ValueError(
-                f"n_heads ({v}) must equal d_model // headdim ({expected})"
-            )
-        return v
-    def estimate_params(self) -> int:
-        """Rough parameter count estimate based on train.py architecture."""
-        inner = self.expand * self.d_model
-        # in_proj: d_model -> inner + inner + d_state + d_state + n_heads
-        in_proj = self.d_model * (inner + inner + self.d_state + self.d_state + self.n_heads)
-        out_proj = inner * self.d_model
-        # conv1d (kernel=4, groups=inner_dim)
-        conv = inner * 4
-        # A_log, lambda_theta, D: n_heads each (3 vectors)
-        ssm_params = self.n_heads * 3
-        # bc_norm: d_state * 2 (weight + bias)
-        bc_norm = self.d_state * 2
-        per_block = in_proj + out_proj + conv + ssm_params + bc_norm
-        blocks = per_block * self.n_layer
-        # Embedding + lm_head (tied or untied)
-        embed = self.vocab_size * self.d_model * 2
-        # Engram: one instance at engram_layer_idx
-        # columns * d_model keys + d_model * engram_key_dim projection
-        engram = self.engram_n_columns * self.d_model + self.d_model * self.engram_key_dim
-        # mHC mixing matrices: n_layer * mhc_n_streams^2
-        mhc = self.n_layer * self.mhc_n_streams ** 2
-        return embed + blocks + engram + mhc

+"""Post-SEM-Claw model configuration with Pydantic validation."""
+from pydantic import BaseModel, Field, field_validator
+class PostSemClawConfig(BaseModel):
+    """Configuration for the Post-SEM-Claw architecture.
+    Default values mirror the @dataclass in train.py exactly.
+    train.py is the source of truth — this file must stay in sync with it.
+    """
+    # Sequence
+    sequence_len: int = Field(default=2048, description="Context length (from prepare.py MAX_SEQ_LEN)")
+    vocab_size: int = Field(default=8192, description="Vocabulary size (from prepare.py VOCAB_SIZE)")
+    # Mamba-3 SSM
+    n_layer: int = Field(default=4, ge=1, le=48, description="Number of Mamba-3 blocks")
+    d_model: int = Field(default=256, ge=64, description="Model embedding dimension")
+    d_state: int = Field(default=64, ge=16, description="SSM state dimension")
+    headdim: int = Field(default=32, ge=16, description="SSM head dimension")
+    n_heads: int = Field(default=8, ge=1, description="Number of SSM heads (d_model // headdim)")
+    expand: int = Field(default=2, ge=1, le=4, description="Inner dim multiplier (inner_dim = expand * d_model)")
+    # mHC (Manifold Hyper-Connection)
+    mhc_n_streams: int = Field(default=4, ge=2, le=8, description="Number of residual streams")
+    mhc_sinkhorn_iters: int = Field(default=5, ge=1, le=100, description="Sinkhorn-Knopp iterations")
+    # Engram (conditional memory)
+    engram_n_columns: int = Field(default=4096, ge=256, description="Hash table columns")
+    engram_key_dim: int = Field(default=64, ge=16, description="Engram key dimension")
+    engram_layer_idx: int = Field(default=1, ge=0, description="Which layer gets engram (0-indexed)")
+    # Hestia QAT (disabled Phase 1, skeleton only)
+    hestia_enabled: bool = Field(default=False, description="Enable Hestia quantization")
+    hestia_bits: float = Field(default=1.58, gt=0, description="Target quantization bits (1.58 = 1.58-bit ternary)")
+    # SDR (bypass-only in Phase 1)
+    sdr_enabled: bool = Field(default=False, description="Enable stochastic resonance")
+    sdr_k: int = Field(default=64, ge=1, description="Top-K sparsification")
+    sdr_noise_std: float = Field(default=0.1, ge=0.0, description="SR noise standard deviation")
+    @field_validator("n_heads")
+    @classmethod
+    def validate_heads(cls, v: int, info: "FieldValidationInfo") -> int:
+        """Reject an n_heads value that differs from d_model // headdim.
+        d_model and headdim are read from the already-validated data and fall
+        back to 256 and 32 when absent from it.
+        """
+        expected = info.data.get("d_model", 256) // info.data.get("headdim", 32)
+        if v == expected:
+            return v
+        raise ValueError(
+            f"n_heads ({v}) must equal d_model // headdim ({expected})"
+        )
+    def estimate_params(self) -> int:
+        """Return a rough parameter count: embeddings + blocks + engram + mHC."""
+        inner = self.expand * self.d_model
+        # in_proj maps d_model -> inner + inner + d_state + d_state + n_heads
+        in_proj_width = 2 * inner + 2 * self.d_state + self.n_heads
+        per_block = sum((
+            self.d_model * in_proj_width,  # in_proj
+            inner * self.d_model,  # out_proj
+            inner * 4,  # conv1d (kernel=4, groups=inner_dim)
+            self.n_heads * 3,  # A_log, lambda_theta, D: n_heads each
+            self.d_state * 2,  # bc_norm: weight + bias
+        ))
+        # Embedding + lm_head (tied or untied)
+        embed = self.vocab_size * self.d_model * 2
+        # Engram: columns * d_model keys + d_model * engram_key_dim projection
+        engram = self.d_model * (self.engram_n_columns + self.engram_key_dim)
+        # mHC mixing matrices: n_layer * mhc_n_streams^2
+        mhc = self.n_layer * self.mhc_n_streams ** 2
+        return embed + per_block * self.n_layer + engram + mhc

overlay/harness/__init__.py CHANGED Viewed

@@ -1,21 +1,21 @@
-"""HYDRA harness package: orchestration infrastructure for autoresearch."""
-from harness.eval_agent import ExperimentResult, parse_run_log, should_keep
-from harness.git_utils import current_branch, current_commit_short
-from harness.health_monitor import check_health, get_gpu_stats
-from harness.meta_agent import run_meta_iteration
-from harness.orchestrator import run_loop
-from harness.search_strategy import ResearchState, diagnose
-__all__ = [
-    "run_loop",
-    "parse_run_log",
-    "ExperimentResult",
-    "should_keep",
-    "run_meta_iteration",
-    "diagnose",
-    "ResearchState",
-    "check_health",
-    "get_gpu_stats",
-    "current_branch",
-    "current_commit_short",
-]

+"""HYDRA harness package: orchestration infrastructure for autoresearch."""
+from harness.eval_agent import ExperimentResult, parse_run_log, should_keep
+from harness.git_utils import current_branch, current_commit_short
+from harness.health_monitor import check_health, get_gpu_stats
+from harness.meta_agent import run_meta_iteration
+from harness.orchestrator import run_loop
+from harness.search_strategy import ResearchState, diagnose
+__all__ = [
+    "run_loop",
+    "parse_run_log",
+    "ExperimentResult",
+    "should_keep",
+    "run_meta_iteration",
+    "diagnose",
+    "ResearchState",
+    "check_health",
+    "get_gpu_stats",
+    "current_branch",
+    "current_commit_short",
+]

overlay/harness/eval_agent.py CHANGED Viewed

@@ -1,300 +1,172 @@
 """Eval agent: parse run.log and extract metrics from training runs."""
 import re
-import statistics
-from dataclasses import dataclass
-type GateThresholds = dict[str, float]
-type GateConfig = dict[str, GateThresholds]
-@dataclass
 class ExperimentResult:
-    """Parsed result from a single experiment run.
-    All float fields default to 0.0; integer fields default to 0.
-    The ``crashed`` flag is set when the log indicates a failure or the
-    log file is missing entirely.
-    """
-    # Primary metric
-    val_bpb: float = 0.0
-    # Timing
-    training_seconds: float = 0.0
-    total_seconds: float = 0.0
-    # Hardware
-    peak_vram_mb: float = 0.0
-    mfu_percent: float = 0.0
     # Throughput
     total_tokens_m: float = 0.0
     num_steps: int = 0
-    tps_median: float = 0.0
-    tps_p10: float = 0.0
-    tps_min: float = 0.0
-    tps_max: float = 0.0
-    tps_samples: int = 0
-    # Model shape (echoed by train.py summary block)
-    num_params_m: float = 0.0
-    n_layer: int = 0
-    d_model: int = 0
     # Secondary health metrics
     mhc_spectral_norm: float = 0.0
     engram_hit_rate: float = 0.0
     sr_bypass_rate: float = 0.0
-    # Evaluation breadth metrics
-    factual_english_score: float = 0.0
-    instruction_following_score: float = 0.0
-    distinct_1: float = 0.0
-    distinct_2: float = 0.0
-    repetition_rate: float = 0.0
-    repetition_bigram_rate: float = 0.0
-    calibration_ece: float = 0.0
-    calibration_brier: float = 0.0
-    calibration_accuracy: float = 0.0
-    calibration_tokens: int = 0
-    eval_seed: int = 0
-    eval_seed_group: str = ""
-    # Status
-    crashed: bool = False
-    error_message: str = ""
-# Regex patterns keyed by ExperimentResult attribute name.
-# Format must match the ``--- Summary ---`` block printed by train.py.
-_PATTERNS: dict[str, str] = {
-    "val_bpb": r"^val_bpb:\s+([\d.]+)",
-    "training_seconds": r"^training_seconds:\s+([\d.]+)",
-    "total_seconds": r"^total_seconds:\s+([\d.]+)",
-    "peak_vram_mb": r"^peak_vram_mb:\s+([\d.]+)",
-    "mfu_percent": r"^mfu_percent:\s+([\d.]+)",
-    "total_tokens_m": r"^total_tokens_M:\s+([\d.]+)",
-    "num_steps": r"^num_steps:\s+(\d+)",
-    "num_params_m": r"^num_params_M:\s+([\d.]+)",
-    "n_layer": r"^n_layer:\s+(\d+)",
-    "d_model": r"^d_model:\s+(\d+)",
-    "mhc_spectral_norm": r"^mhc_spectral_norm:\s+([\d.]+)",
     "engram_hit_rate": r"^engram_hit_rate:\s+([\d.]+)",
     "sr_bypass_rate": r"^sr_bypass_rate:\s+([\d.]+)",
-    "factual_english_score": r"^factual_english_score:\s+([\d.]+)",
-    "instruction_following_score": r"^instruction_following_score:\s+([\d.]+)",
-    "distinct_1": r"^distinct_1:\s+([\d.]+)",
-    "distinct_2": r"^distinct_2:\s+([\d.]+)",
-    "repetition_rate": r"^repetition_rate:\s+([\d.]+)",
-    "repetition_bigram_rate": r"^repetition_bigram_rate:\s+([\d.]+)",
-    "calibration_ece": r"^calibration_ece:\s+([\d.]+)",
-    "calibration_brier": r"^calibration_brier:\s*([\d.]+)",
-    "calibration_accuracy": r"^calibration_accuracy:\s+([\d.]+)",
-    "calibration_tokens": r"^calibration_tokens:\s+(\d+)",
-    "eval_seed": r"^eval_seed:\s+(\d+)",
-    "eval_seed_group": r"^eval_seed_group:\s+(.+)",
 }
-# Attributes that should be parsed as int rather than float.
-_INT_ATTRS: frozenset[str] = frozenset(
-    {
-        "num_steps",
-        "n_layer",
-        "d_model",
-        "calibration_tokens",
-        "eval_seed",
-    }
-)
-_STR_ATTRS: frozenset[str] = frozenset({"eval_seed_group"})
-_STEP_TPS_PATTERN = re.compile(r"step=(\d+).*?\btps=(\d+)\b")
-_TPS_PATTERN = re.compile(r"\btps=(\d+)\b")
-def _percentile_linear(sorted_values: list[float], pct: float) -> float:
-    """Compute percentile via linear interpolation (0 <= pct <= 100)."""
-    if not sorted_values:
-        return 0.0
-    if len(sorted_values) == 1:
-        return sorted_values[0]
-    rank = (len(sorted_values) - 1) * (pct / 100.0)
-    lo = int(rank)
-    hi = min(lo + 1, len(sorted_values) - 1)
-    frac = rank - lo
-    return sorted_values[lo] * (1.0 - frac) + sorted_values[hi] * frac
-def parse_run_log(log_path: str) -> ExperimentResult:
-    """Parse a run.log file and extract all training metrics.
-    Args:
-        log_path: Absolute path to the run.log file.
-    Returns:
-        Populated ExperimentResult; sets ``crashed=True`` when the log
-        contains a traceback or the file is missing.
-    """
-    result = ExperimentResult()
-    try:
-        with open(log_path) as fh:
-            content = fh.read()
-    except FileNotFoundError:
-        result.crashed = True
-        result.error_message = f"Log file not found: {log_path}"
-        return result
-    # Detect crash signals in output. Keep this strict to avoid false positives
-    # from benign log lines that include "error" in a non-fatal context.
-    if (
-        "Traceback" in content
-        or "\nFAIL\n" in content
-        or "[TPS_GUARD] FAIL" in content
-        or "raise SystemExit(1)" in content
-    ):
         result.crashed = True
         lines = content.strip().splitlines()
         result.error_message = "\n".join(lines[-20:])
     for attr, pattern in _PATTERNS.items():
         match = re.search(pattern, content, re.MULTILINE)
         if match:
             raw = match.group(1)
-            if attr in _INT_ATTRS:
-                setattr(result, attr, int(raw))
-            elif attr in _STR_ATTRS:
-                setattr(result, attr, raw.strip())
-            else:
-                setattr(result, attr, float(raw))
-    warmup_steps = 10
-    warmup_match = re.search(r"\[TPS_GUARD\] enabled .*?warmup_steps=(\d+)", content)
-    if warmup_match:
-        warmup_steps = int(warmup_match.group(1))
-    step_tps_samples: list[tuple[int, int]] = []
-    for m in _STEP_TPS_PATTERN.finditer(content):
-        step_tps_samples.append((int(m.group(1)), int(m.group(2))))
-    tps_values: list[float] = []
-    if step_tps_samples:
-        for step, tps in step_tps_samples:
-            if step >= warmup_steps:
-                tps_values.append(float(tps))
-        if not tps_values:
-            tps_values = [float(tps) for _, tps in step_tps_samples]
-    else:
-        tps_values = [float(m.group(1)) for m in _TPS_PATTERN.finditer(content)]
-    if tps_values:
-        sorted_tps = sorted(tps_values)
-        result.tps_samples = len(tps_values)
-        result.tps_median = float(statistics.median(tps_values))
-        result.tps_p10 = float(_percentile_linear(sorted_tps, 10.0))
-        result.tps_min = float(sorted_tps[0])
-        result.tps_max = float(sorted_tps[-1])
     return result
 def check_secondary_alarms(result: ExperimentResult) -> list[str]:
-    """Check secondary metrics against fixed alarm thresholds.
-    Args:
-        result: Parsed experiment result.
-    Returns:
-        List of human-readable alarm strings (empty if all clear).
-    """
-    alarms: list[str] = []
-    if result.mhc_spectral_norm > 2.0:
-        alarms.append(
-            f"mhc_spectral_norm={result.mhc_spectral_norm:.4f} > 2.0 (ALARM)"
-        )
-    if 0 < result.engram_hit_rate < 0.1:
-        alarms.append(
-            f"engram_hit_rate={result.engram_hit_rate:.4f} < 0.1 (memory underused)"
-        )
-    if 0 < result.mfu_percent < 10:
         alarms.append(
-            f"mfu_percent={result.mfu_percent:.2f}% < 10% (GPU underutilized)"
         )
-    if result.calibration_ece > 0.35:
         alarms.append(
-            f"calibration_ece={result.calibration_ece:.4f} > 0.35 (poor calibration)"
         )
-    if result.tps_median > 0 and result.tps_median < 50000:
         alarms.append(
-            f"tps_median={result.tps_median:.0f} < 50000 (throughput below A10 objective)"
         )
     return alarms
-def _check_gate(
-    result: ExperimentResult,
-    gates: GateConfig,
-    metric: str,
-) -> tuple[bool, str] | None:
-    """Evaluate a single min/max gate against an ExperimentResult metric."""
-    gate = gates.get(metric, {})
-    value = getattr(result, metric)
-    max_value = gate.get("max")
-    if max_value is not None and value > max_value:
-        return False, f"{metric} {value:.4f} > gate {max_value}"
-    min_value = gate.get("min")
-    if min_value is not None and value < min_value:
-        return False, f"{metric} {value:.4f} < gate {min_value}"
-    return None
 def should_keep(
     result: ExperimentResult,
     best_bpb: float,
-    gates: GateConfig | None = None,
 ) -> tuple[bool, str]:
-    """Decide whether to keep or discard an experiment.
-    The primary criterion is strictly lower val_bpb than the current best.
-    Optional secondary gates (passed from HarnessConfig.secondary_metrics)
-    can reject an otherwise-improving result.
-    Args:
-        result: Parsed experiment result.
-        best_bpb: Current best val_bpb across all experiments.
-        gates: Optional dict mapping metric name to threshold dict with
-               ``"max"`` or ``"min"`` keys, e.g.
-               ``{"mhc_spectral_norm": {"max": 2.0}}``.
-    Returns:
-        Tuple of (keep: bool, reason: str).
-    """
-    if result.crashed:
-        return False, "crash"
-    if result.val_bpb <= 0:
-        return False, "invalid val_bpb"
-    if result.val_bpb >= best_bpb:
-        return False, "discard"
     # Secondary gate checks.
     if gates:
-        gate_metrics = (
-            "mhc_spectral_norm",
-            "engram_hit_rate",
-            "factual_english_score",
-            "instruction_following_score",
-            "distinct_1",
-            "distinct_2",
-            "repetition_rate",
-            "repetition_bigram_rate",
-            "calibration_ece",
-            "tps_median",
-            "tps_p10",
-        )
-        for metric in gate_metrics:
-            gate_result = _check_gate(result, gates, metric)
-            if gate_result is not None:
-                return gate_result
     return True, "keep"

 """Eval agent: parse run.log and extract metrics from training runs."""
 import re
+from dataclasses import dataclass, field
+@dataclass
 class ExperimentResult:
+    """Parsed result from a single experiment run.
+    Every field has a default, so a bare ``ExperimentResult()`` means
+    "nothing parsed": floats are 0.0, integers are 0, ``crashed`` is False
+    and ``error_message`` is empty.
+    The ``crashed`` flag is set when the log indicates a failure or the
+    log file is missing entirely.
+    """
+    # Primary metric
+    val_bpb: float = 0.0
+    # Timing
+    training_seconds: float = 0.0
+    total_seconds: float = 0.0
+    # Hardware
+    peak_vram_mb: float = 0.0
+    mfu_percent: float = 0.0
     # Throughput
+    # (total_tokens_m is read from the ``total_tokens_M`` log key)
     total_tokens_m: float = 0.0
     num_steps: int = 0
+    # Model shape (echoed by train.py summary block)
+    # (num_params_m is read from the ``num_params_M`` log key)
+    num_params_m: float = 0.0
+    n_layer: int = 0
+    d_model: int = 0
     # Secondary health metrics
+    # (mhc_spectral_norm and engram_hit_rate are also checked by
+    # check_secondary_alarms and by the optional gates in should_keep)
     mhc_spectral_norm: float = 0.0
     engram_hit_rate: float = 0.0
     sr_bypass_rate: float = 0.0
+    # Status
+    # error_message holds either a "Log file not found" note or the last
+    # 20 lines of the log, as written by parse_run_log.
+    crashed: bool = False
+    error_message: str = ""
+# Regex patterns keyed by ExperimentResult attribute name.
+# Format must match the ``--- Summary ---`` block printed by train.py.
+# parse_run_log applies each pattern with re.MULTILINE, so ``^`` anchors at
+# the start of any log line; capture group 1 is the raw value text.
+# Note that the log keys for total_tokens_m / num_params_m end in an
+# upper-case ``M`` while the attribute names are lower-case.
+# NOTE(review): ``[\d.]+`` accepts digits and dots only -- no sign, no
+# exponent, no "nan"/"inf". Confirm train.py never prints such values.
+_PATTERNS: dict[str, str] = {
+    "val_bpb": r"^val_bpb:\s+([\d.]+)",
+    "training_seconds": r"^training_seconds:\s+([\d.]+)",
+    "total_seconds": r"^total_seconds:\s+([\d.]+)",
+    "peak_vram_mb": r"^peak_vram_mb:\s+([\d.]+)",
+    "mfu_percent": r"^mfu_percent:\s+([\d.]+)",
+    "total_tokens_m": r"^total_tokens_M:\s+([\d.]+)",
+    "num_steps": r"^num_steps:\s+(\d+)",
+    "num_params_m": r"^num_params_M:\s+([\d.]+)",
+    "n_layer": r"^n_layer:\s+(\d+)",
+    "d_model": r"^d_model:\s+(\d+)",
+    "mhc_spectral_norm": r"^mhc_spectral_norm:\s+([\d.]+)",
     "engram_hit_rate": r"^engram_hit_rate:\s+([\d.]+)",
     "sr_bypass_rate": r"^sr_bypass_rate:\s+([\d.]+)",
 }
+# Attributes that should be parsed as int rather than float.
+# Their patterns above capture ``\d+`` only, so int() always succeeds on them.
+_INT_ATTRS: frozenset[str] = frozenset({"num_steps", "n_layer", "d_model"})
+def parse_run_log(log_path: str) -> ExperimentResult:
+    """Parse a run.log file and extract all training metrics.
+    Args:
+        log_path: Absolute path to the run.log file.
+    Returns:
+        Populated ExperimentResult; sets ``crashed=True`` when the log
+        contains a traceback or the file is missing.
+    """
+    result = ExperimentResult()
+    try:
+        with open(log_path) as fh:
+            content = fh.read()
+    except FileNotFoundError:
+        result.crashed = True
+        result.error_message = f"Log file not found: {log_path}"
+        return result
+    # Detect crash signals in output.
+    if "Traceback" in content or "FAIL" in content or "Error" in content:
         result.crashed = True
         lines = content.strip().splitlines()
         result.error_message = "\n".join(lines[-20:])
     for attr, pattern in _PATTERNS.items():
         match = re.search(pattern, content, re.MULTILINE)
         if match:
             raw = match.group(1)
+            setattr(result, attr, int(raw) if attr in _INT_ATTRS else float(raw))
     return result
 def check_secondary_alarms(result: ExperimentResult) -> list[str]:
+    """Check secondary metrics against fixed alarm thresholds.
+    Args:
+        result: Parsed experiment result.
+    Returns:
+        List of human-readable alarm strings (empty if all clear).
+    """
+    alarms: list[str] = []
+    if result.mhc_spectral_norm > 2.0:
         alarms.append(
+            f"mhc_spectral_norm={result.mhc_spectral_norm:.4f} > 2.0 (ALARM)"
         )
+    if 0 < result.engram_hit_rate < 0.1:
         alarms.append(
+            f"engram_hit_rate={result.engram_hit_rate:.4f} < 0.1 (memory underused)"
         )
+    if 0 < result.mfu_percent < 10:
         alarms.append(
+            f"mfu_percent={result.mfu_percent:.2f}% < 10% (GPU underutilized)"
         )
     return alarms
 def should_keep(
     result: ExperimentResult,
     best_bpb: float,
+    gates: dict | None = None,
 ) -> tuple[bool, str]:
+    """Decide whether to keep or discard an experiment.
+    The primary criterion is strictly lower val_bpb than the current best.
+    Optional secondary gates (passed from HarnessConfig.secondary_metrics)
+    can reject an otherwise-improving result.
+    Args:
+        result: Parsed experiment result.
+        best_bpb: Current best val_bpb across all experiments.
+        gates: Optional dict mapping metric name to threshold dict with
+               ``"max"`` or ``"min"`` keys, e.g.
+               ``{"mhc_spectral_norm": {"max": 2.0}}``.
+    Returns:
+        Tuple of (keep: bool, reason: str).
+    """
+    if result.crashed:
+        return False, "crash"
+    if result.val_bpb <= 0:
+        return False, "invalid val_bpb"
+    if result.val_bpb >= best_bpb:
+        return False, "discard"
     # Secondary gate checks.
     if gates:
+        gate_mhc = gates.get("mhc_spectral_norm", {}).get("max")
+        if gate_mhc is not None and result.mhc_spectral_norm > gate_mhc:
+            return (
+                False,
+                f"mhc_spectral_norm {result.mhc_spectral_norm:.4f} > gate {gate_mhc}",
+            )
+        gate_engram = gates.get("engram_hit_rate", {}).get("min")
+        if gate_engram is not None and result.engram_hit_rate < gate_engram:
+            return (
+                False,
+                f"engram_hit_rate {result.engram_hit_rate:.4f} < gate {gate_engram}",
+            )
     return True, "keep"

overlay/harness/git_utils.py CHANGED Viewed

@@ -1,94 +1,94 @@
-"""Git utilities for HYDRA autoresearch branch management."""
-import os
-import subprocess
-REPO_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-def run_git(*args: str, check: bool = True) -> subprocess.CompletedProcess:
-    """Run a git command in the repo directory.
-    Args:
-        *args: Git command arguments.
-        check: Whether to raise on non-zero exit code.
-    Returns:
-        Completed process with stdout/stderr captured.
-    """
-    return subprocess.run(
-        ["git"] + list(args),
-        cwd=REPO_DIR,
-        capture_output=True,
-        text=True,
-        check=check,
-    )
-def current_branch() -> str:
-    """Return the current git branch name.
-    Returns:
-        Branch name string.
-    """
-    result = run_git("rev-parse", "--abbrev-ref", "HEAD")
-    return result.stdout.strip()
-def current_commit_short() -> str:
-    """Return the current HEAD commit short hash (7 chars).
-    Returns:
-        7-character commit hash.
-    """
-    result = run_git("rev-parse", "--short=7", "HEAD")
-    return result.stdout.strip()
-def create_branch(name: str) -> None:
-    """Create and switch to a new branch.
-    Args:
-        name: Branch name to create.
-    """
-    run_git("checkout", "-b", name)
-def commit_all(message: str) -> str:
-    """Stage all changes, commit, and return short hash.
-    Args:
-        message: Commit message.
-    Returns:
-        Short commit hash after committing.
-    """
-    run_git("add", "-A")
-    run_git("commit", "-m", message, check=False)
-    return current_commit_short()
-def reset_to(commit: str) -> None:
-    """Hard reset to a specific commit, discarding all changes.
-    Args:
-        commit: Commit hash (short or full) to reset to.
-    """
-    run_git("reset", "--hard", commit)
-def get_last_n_diffs(n: int = 3) -> list[str]:
-    """Get the last N commit diffs (--stat format) for meta-agent context.
-    Args:
-        n: Number of recent commits to retrieve.
-    Returns:
-        List of diff stat strings, one per commit (truncated to 500 chars).
-    """
-    result = run_git("log", f"-{n}", "--format=%H", check=False)
-    hashes = [h for h in result.stdout.strip().split("\n") if h]
-    diffs: list[str] = []
-    for h in hashes:
-        diff_result = run_git("show", "--stat", h, check=False)
-        diffs.append(diff_result.stdout[:500])
-    return diffs

+"""Git utilities for HYDRA autoresearch branch management."""
+import os
+import subprocess
+REPO_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+def run_git(*args: str, check: bool = True) -> subprocess.CompletedProcess:
+    """Run a git command in the repo directory.
+    Args:
+        *args: Git command arguments.
+        check: Whether to raise on non-zero exit code.
+    Returns:
+        Completed process with stdout/stderr captured.
+    """
+    return subprocess.run(
+        ["git"] + list(args),
+        cwd=REPO_DIR,
+        capture_output=True,
+        text=True,
+        check=check,
+    )
+def current_branch() -> str:
+    """Return the current git branch name.
+    Returns:
+        Branch name string.
+    """
+    result = run_git("rev-parse", "--abbrev-ref", "HEAD")
+    return result.stdout.strip()
+def current_commit_short() -> str:
+    """Return the current HEAD commit short hash (7 chars).
+    Returns:
+        7-character commit hash.
+    """
+    result = run_git("rev-parse", "--short=7", "HEAD")
+    return result.stdout.strip()
+def create_branch(name: str) -> None:
+    """Create and switch to a new branch.
+    Args:
+        name: Branch name to create.
+    """
+    run_git("checkout", "-b", name)
+def commit_all(message: str) -> str:
+    """Stage all changes, commit, and return short hash.
+    Args:
+        message: Commit message.
+    Returns:
+        Short commit hash after committing.
+    """
+    run_git("add", "-A")
+    run_git("commit", "-m", message, check=False)
+    return current_commit_short()
+def reset_to(commit: str) -> None:
+    """Hard reset to a specific commit, discarding all changes.
+    Args:
+        commit: Commit hash (short or full) to reset to.
+    """
+    run_git("reset", "--hard", commit)
+def get_last_n_diffs(n: int = 3) -> list[str]:
+    """Get the last N commit diffs (--stat format) for meta-agent context.
+    Args:
+        n: Number of recent commits to retrieve.
+    Returns:
+        List of diff stat strings, one per commit (truncated to 500 chars).
+    """
+    result = run_git("log", f"-{n}", "--format=%H", check=False)
+    hashes = [h for h in result.stdout.strip().split("\n") if h]
+    diffs: list[str] = []
+    for h in hashes:
+        diff_result = run_git("show", "--stat", h, check=False)
+        diffs.append(diff_result.stdout[:500])
+    return diffs

overlay/harness/health_monitor.py CHANGED Viewed

@@ -1,86 +1,86 @@
-"""Hardware health monitoring for HYDRA experiments.
-Provides lightweight checks that the orchestrator runs before each
-experiment to avoid launching training into a degraded GPU state.
-"""
-import os
-import torch
-def get_gpu_stats() -> dict:
-    """Return current GPU memory statistics.
-    Returns:
-        Dict with keys: available (bool), and when available:
-        name, memory_allocated_mb, memory_reserved_mb,
-        max_memory_allocated_mb, memory_total_mb.
-    """
-    if not torch.cuda.is_available():
-        return {"available": False}
-    props = torch.cuda.get_device_properties(0)
-    return {
-        "available": True,
-        "name": torch.cuda.get_device_name(0),
-        "memory_allocated_mb": torch.cuda.memory_allocated(0) / (1024 * 1024),
-        "memory_reserved_mb": torch.cuda.memory_reserved(0) / (1024 * 1024),
-        "max_memory_allocated_mb": torch.cuda.max_memory_allocated(0) / (1024 * 1024),
-        "memory_total_mb": props.total_mem / (1024 * 1024),
-    }
-def check_health(
-    vram_pressure_pct: float = 90.0,
-    min_free_disk_gb: float = 1.0,
-) -> tuple[bool, list[str]]:
-    """Check GPU and disk health before launching an experiment.
-    Args:
-        vram_pressure_pct: Warn when GPU memory allocation exceeds this
-            percentage of total VRAM.
-        min_free_disk_gb: Warn when free disk space falls below this.
-    Returns:
-        Tuple of (healthy: bool, warnings: list[str]).
-        ``healthy`` is True when there are no warnings.
-    """
-    warnings: list[str] = []
-    stats = get_gpu_stats()
-    if not stats["available"]:
-        return False, ["No CUDA GPU available"]
-    # Memory pressure check.
-    used_pct = (
-        stats["memory_allocated_mb"] / stats["memory_total_mb"] * 100
-        if stats["memory_total_mb"] > 0
-        else 0.0
-    )
-    if used_pct > vram_pressure_pct:
-        warnings.append(
-            f"GPU memory pressure: {used_pct:.1f}% allocated "
-            f"({stats['memory_allocated_mb']:.0f} / {stats['memory_total_mb']:.0f} MB)"
-        )
-    # Disk space check.
-    try:
-        statvfs = os.statvfs(os.path.dirname(os.path.abspath(__file__)))
-        free_gb = (statvfs.f_bavail * statvfs.f_frsize) / (1024**3)
-        if free_gb < min_free_disk_gb:
-            warnings.append(f"Low disk space: {free_gb:.2f} GB free")
-    except (AttributeError, OSError):
-        # os.statvfs not available on all platforms (e.g. Windows).
-        pass
-    return len(warnings) == 0, warnings
-def reset_peak_stats() -> None:
-    """Reset GPU peak memory tracking for the next experiment.
-    Should be called immediately before launching each training run so
-    that peak_vram_mb reported in run.log reflects only that experiment.
-    """
-    if torch.cuda.is_available():
-        torch.cuda.reset_peak_memory_stats()

+"""Hardware health monitoring for HYDRA experiments.
+Provides lightweight checks that the orchestrator runs before each
+experiment to avoid launching training into a degraded GPU state.
+"""
+import os
+import torch
+def get_gpu_stats() -> dict:
+    """Return current GPU memory statistics.
+    Returns:
+        Dict with keys: available (bool), and when available:
+        name, memory_allocated_mb, memory_reserved_mb,
+        max_memory_allocated_mb, memory_total_mb.
+    """
+    if not torch.cuda.is_available():
+        return {"available": False}
+    props = torch.cuda.get_device_properties(0)
+    return {
+        "available": True,
+        "name": torch.cuda.get_device_name(0),
+        "memory_allocated_mb": torch.cuda.memory_allocated(0) / (1024 * 1024),
+        "memory_reserved_mb": torch.cuda.memory_reserved(0) / (1024 * 1024),
+        "max_memory_allocated_mb": torch.cuda.max_memory_allocated(0) / (1024 * 1024),
+        "memory_total_mb": props.total_mem / (1024 * 1024),
+    }
+def check_health(
+    vram_pressure_pct: float = 90.0,
+    min_free_disk_gb: float = 1.0,
+) -> tuple[bool, list[str]]:
+    """Check GPU and disk health before launching an experiment.
+    Args:
+        vram_pressure_pct: Warn when GPU memory allocation exceeds this
+            percentage of total VRAM.
+        min_free_disk_gb: Warn when free disk space falls below this.
+    Returns:
+        Tuple of (healthy: bool, warnings: list[str]).
+        ``healthy`` is True when there are no warnings.
+    """
+    warnings: list[str] = []
+    stats = get_gpu_stats()
+    if not stats["available"]:
+        return False, ["No CUDA GPU available"]
+    # Memory pressure check.
+    used_pct = (
+        stats["memory_allocated_mb"] / stats["memory_total_mb"] * 100
+        if stats["memory_total_mb"] > 0
+        else 0.0
+    )
+    if used_pct > vram_pressure_pct:
+        warnings.append(
+            f"GPU memory pressure: {used_pct:.1f}% allocated "
+            f"({stats['memory_allocated_mb']:.0f} / {stats['memory_total_mb']:.0f} MB)"
+        )
+    # Disk space check.
+    try:
+        statvfs = os.statvfs(os.path.dirname(os.path.abspath(__file__)))
+        free_gb = (statvfs.f_bavail * statvfs.f_frsize) / (1024**3)
+        if free_gb < min_free_disk_gb:
+            warnings.append(f"Low disk space: {free_gb:.2f} GB free")
+    except (AttributeError, OSError):
+        # os.statvfs not available on all platforms (e.g. Windows).
+        pass
+    return len(warnings) == 0, warnings
+def reset_peak_stats() -> None:
+    """Reset GPU peak memory tracking for the next experiment.
+    Should be called immediately before launching each training run so
+    that peak_vram_mb reported in run.log reflects only that experiment.
+    """
+    if torch.cuda.is_available():
+        torch.cuda.reset_peak_memory_stats()

overlay/harness/meta_agent.py CHANGED Viewed

@@ -1,139 +1,139 @@
-"""Meta-agent: evolves program.md based on experiment history.
-Runs every ``meta_interval`` inner-loop experiments (configured in
-HarnessConfig).  Reads the current research state from results.tsv,
-decides whether guidance is needed, and appends a directive to
-program.md.  Any previous auto-generated directive is replaced so
-the file stays clean.
-"""
-import os
-from harness.git_utils import REPO_DIR
-from harness.search_strategy import ResearchState, diagnose
-PROGRAM_PATH = os.path.join(REPO_DIR, "program.md")
-RESULTS_PATH = os.path.join(REPO_DIR, "results.tsv")
-# Sentinel that marks auto-generated content so it can be cleanly replaced.
-_DIRECTIVE_MARKER = "## Meta-Agent Directive (auto-generated)"
-def generate_directive(state: ResearchState) -> str | None:
-    """Generate a directive string to append to program.md, or None.
-    A directive is only produced when the research state is not EXPLORING
-    (i.e., something needs to change).
-    Args:
-        state: Current ResearchState diagnosis.
-    Returns:
-        Formatted directive string, or None when no change is needed.
-    """
-    if state.label == "EXPLORING":
-        return None
-    if state.label == "BROKEN":
-        return (
-            f"\n{_DIRECTIVE_MARKER}\n"
-            f"ALERT: Crash rate is {state.crash_rate:.0%} in the recent window. "
-            "Revert to the last stable commit. Reduce model complexity before "
-            "proposing further changes. Suggested actions:\n"
-            "- Reduce d_model or n_layer\n"
-            "- Reduce batch_size\n"
-            "- Disable experimental modules (Engram, mHC, Hestia) one at a time\n"
-        )
-    if state.label == "STUCK":
-        stale = state.total_experiments - state.last_improvement_at
-        return (
-            f"\n{_DIRECTIVE_MARKER}\n"
-            f"ALERT: No improvement for {stale} experiments "
-            f"(best_bpb={state.best_bpb:.6f}). "
-            "Apply BOLD changes for the next 5 experiments:\n"
-            "- Dramatically change d_model or n_layer (2× or ½)\n"
-            "- Toggle Engram or mHC on/off entirely\n"
-            "- Change optimizer hyperparameters by 3–5×\n"
-            "- Temporarily accept results within 0.5% of baseline\n"
-        )
-    if state.label == "EXPLOITING":
-        return (
-            f"\n{_DIRECTIVE_MARKER}\n"
-            "Search is converging too early. Inject diversity:\n"
-            "- If recent experiments tune LR, try architecture changes instead\n"
-            "- If tuning architecture, try optimizer or regularisation changes\n"
-            "- Try removing complexity (simplification wins are valuable)\n"
-            "- Explore a subsystem not touched in the last 10 experiments\n"
-        )
-    return None
-def _strip_previous_directive(content: str) -> str:
-    """Remove any prior auto-generated directive block from content.
-    Args:
-        content: Full text of program.md.
-    Returns:
-        Content with any previous directive stripped and trailing
-        whitespace normalised.
-    """
-    if _DIRECTIVE_MARKER in content:
-        content = content[: content.index(_DIRECTIVE_MARKER)].rstrip() + "\n"
-    return content
-def run_meta_iteration(
-    program_path: str = PROGRAM_PATH,
-    results_path: str = RESULTS_PATH,
-) -> dict:
-    """Run one meta-agent iteration.
-    Diagnoses the current research state and optionally rewrites
-    program.md with a new directive.
-    Args:
-        program_path: Path to program.md.
-        results_path: Path to results.tsv.
-    Returns:
-        Summary dict with keys: state, total_experiments, best_bpb,
-        crash_rate, changed, and optionally directive.
-    """
-    state = diagnose(results_path)
-    summary: dict = {
-        "state": state.label,
-        "total_experiments": state.total_experiments,
-        "best_bpb": state.best_bpb,
-        "crash_rate": state.crash_rate,
-        "changed": False,
-    }
-    directive = generate_directive(state)
-    if directive is None:
-        return summary
-    try:
-        with open(program_path) as fh:
-            content = fh.read()
-    except FileNotFoundError:
-        content = ""
-    content = _strip_previous_directive(content)
-    content = content + "\n" + directive
-    tmp_path = program_path + ".tmp"
-    try:
-        with open(tmp_path, "w") as fh:
-            fh.write(content)
-        os.replace(tmp_path, program_path)  # atomic on POSIX
-    finally:
-        if os.path.exists(tmp_path):
-            os.unlink(tmp_path)
-    summary["changed"] = True
-    summary["directive"] = directive.strip()
-    return summary

+"""Meta-agent: evolves program.md based on experiment history.
+Runs every ``meta_interval`` inner-loop experiments (configured in
+HarnessConfig).  Reads the current research state from results.tsv,
+decides whether guidance is needed, and appends a directive to
+program.md.  Any previous auto-generated directive is replaced so
+the file stays clean.
+"""
+import os
+from harness.git_utils import REPO_DIR
+from harness.search_strategy import ResearchState, diagnose
+PROGRAM_PATH = os.path.join(REPO_DIR, "program.md")
+RESULTS_PATH = os.path.join(REPO_DIR, "results.tsv")
+# Sentinel that marks auto-generated content so it can be cleanly replaced.
+_DIRECTIVE_MARKER = "## Meta-Agent Directive (auto-generated)"
+def generate_directive(state: ResearchState) -> str | None:
+    """Generate a directive string to append to program.md, or None.
+    A directive is only produced when the research state is not EXPLORING
+    (i.e., something needs to change).
+    Args:
+        state: Current ResearchState diagnosis.
+    Returns:
+        Formatted directive string, or None when no change is needed.
+    """
+    if state.label == "EXPLORING":
+        return None
+    if state.label == "BROKEN":
+        return (
+            f"\n{_DIRECTIVE_MARKER}\n"
+            f"ALERT: Crash rate is {state.crash_rate:.0%} in the recent window. "
+            "Revert to the last stable commit. Reduce model complexity before "
+            "proposing further changes. Suggested actions:\n"
+            "- Reduce d_model or n_layer\n"
+            "- Reduce batch_size\n"
+            "- Disable experimental modules (Engram, mHC, Hestia) one at a time\n"
+        )
+    if state.label == "STUCK":
+        stale = state.total_experiments - state.last_improvement_at
+        return (
+            f"\n{_DIRECTIVE_MARKER}\n"
+            f"ALERT: No improvement for {stale} experiments "
+            f"(best_bpb={state.best_bpb:.6f}). "
+            "Apply BOLD changes for the next 5 experiments:\n"
+            "- Dramatically change d_model or n_layer (2× or ½)\n"
+            "- Toggle Engram or mHC on/off entirely\n"
+            "- Change optimizer hyperparameters by 3–5×\n"
+            "- Temporarily accept results within 0.5% of baseline\n"
+        )
+    if state.label == "EXPLOITING":
+        return (
+            f"\n{_DIRECTIVE_MARKER}\n"
+            "Search is converging too early. Inject diversity:\n"
+            "- If recent experiments tune LR, try architecture changes instead\n"
+            "- If tuning architecture, try optimizer or regularisation changes\n"
+            "- Try removing complexity (simplification wins are valuable)\n"
+            "- Explore a subsystem not touched in the last 10 experiments\n"
+        )
+    return None
+def _strip_previous_directive(content: str) -> str:
+    """Remove any prior auto-generated directive block from content.
+    Args:
+        content: Full text of program.md.
+    Returns:
+        Content with any previous directive stripped and trailing
+        whitespace normalised.
+    """
+    if _DIRECTIVE_MARKER in content:
+        content = content[: content.index(_DIRECTIVE_MARKER)].rstrip() + "\n"
+    return content
+def run_meta_iteration(
+    program_path: str = PROGRAM_PATH,
+    results_path: str = RESULTS_PATH,
+) -> dict:
+    """Run one meta-agent iteration.
+    Diagnoses the current research state and optionally rewrites
+    program.md with a new directive.
+    Args:
+        program_path: Path to program.md.
+        results_path: Path to results.tsv.
+    Returns:
+        Summary dict with keys: state, total_experiments, best_bpb,
+        crash_rate, changed, and optionally directive.
+    """
+    state = diagnose(results_path)
+    summary: dict = {
+        "state": state.label,
+        "total_experiments": state.total_experiments,
+        "best_bpb": state.best_bpb,
+        "crash_rate": state.crash_rate,
+        "changed": False,
+    }
+    directive = generate_directive(state)
+    if directive is None:
+        return summary
+    try:
+        with open(program_path) as fh:
+            content = fh.read()
+    except FileNotFoundError:
+        content = ""
+    content = _strip_previous_directive(content)
+    content = content + "\n" + directive
+    tmp_path = program_path + ".tmp"
+    try:
+        with open(tmp_path, "w") as fh:
+            fh.write(content)
+        os.replace(tmp_path, program_path)  # atomic on POSIX
+    finally:
+        if os.path.exists(tmp_path):
+            os.unlink(tmp_path)
+    summary["changed"] = True
+    summary["directive"] = directive.strip()
+    return summary

overlay/harness/orchestrator.py CHANGED Viewed

@@ -1,296 +1,293 @@
-"""HYDRA Orchestrator: main loop for autonomous research.
-Usage::
-    python -m harness.orchestrator [--meta-interval N] [--max-experiments N]
-Loop:
-    1. Read current state (branch, results.tsv, program.md)
-    2. [Architect Agent] proposes and applies changes to train.py (external)
-    3. Git commit the changes
-    4. Run training: ``uv run train.py`` captured to run.log
-    5. [Eval Agent] extract metrics from run.log
-    6. Keep or discard based on val_bpb + secondary metric gates
-    7. Log to results.tsv
-    8. Every ``meta_interval`` experiments: [Meta Agent] evolves program.md
-    9. Repeat
-The orchestrator intentionally does NOT modify train.py itself -- it
-provides the infrastructure ("rails") that the autoresearch loop runs on.
-"""
-import argparse
-import csv
 import os
 import subprocess
 import time
-from configs.harness_config import HarnessConfig
 from harness.eval_agent import ExperimentResult, check_secondary_alarms, parse_run_log, should_keep
-from harness.git_utils import REPO_DIR, commit_all, current_commit_short, reset_to
-from harness.health_monitor import check_health, reset_peak_stats
-from harness.meta_agent import run_meta_iteration
-from harness.search_strategy import diagnose
-# ---------------------------------------------------------------------------
-# Paths
-# ---------------------------------------------------------------------------
-RESULTS_FILE = os.path.join(REPO_DIR, "results.tsv")
-RUN_LOG = os.path.join(REPO_DIR, "run.log")
-_TSV_HEADER = "commit\tval_bpb\tmemory_gb\tstatus\tdescription\n"
-# ---------------------------------------------------------------------------
-# TSV helpers
-# ---------------------------------------------------------------------------
-def init_results_tsv() -> None:
-    """Create results.tsv with header row if it does not yet exist."""
-    if not os.path.exists(RESULTS_FILE):
-        with open(RESULTS_FILE, "w") as fh:
-            fh.write(_TSV_HEADER)
-def log_result(
-    commit: str,
-    val_bpb: float,
-    memory_gb: float,
-    status: str,
-    description: str,
-) -> None:
-    """Append one row to results.tsv.
-    Args:
-        commit: Short git hash for this experiment.
-        val_bpb: Validation bits-per-byte (0.0 for crashes).
-        memory_gb: Peak VRAM usage in gigabytes.
-        status: One of keep / discard / crash / timeout.
-        description: Short human-readable description.
-    """
-    with open(RESULTS_FILE, "a") as fh:
-        fh.write(
-            f"{commit}\t{val_bpb:.6f}\t{memory_gb:.2f}\t{status}\t{description}\n"
-        )
-def count_experiments() -> int:
-    """Count the number of experiment rows in results.tsv.
-    Returns:
-        Row count excluding the header line (0 when file does not exist).
-    """
-    if not os.path.exists(RESULTS_FILE):
-        return 0
-    with open(RESULTS_FILE) as fh:
-        return max(0, sum(1 for _ in fh) - 1)
-def _load_best_bpb() -> float:
-    """Scan results.tsv for the best (lowest positive) val_bpb seen so far.
-    Returns:
-        Best val_bpb, or ``float("inf")`` when no valid result exists.
-    """
-    if not os.path.exists(RESULTS_FILE):
-        return float("inf")
-    best = float("inf")
-    with open(RESULTS_FILE) as fh:
-        reader = csv.DictReader(fh, delimiter="\t")
-        for row in reader:
-            try:
-                bpb = float(row.get("val_bpb", "0") or "0")
-            except ValueError:
-                continue
-            if 0 < bpb < best:
-                best = bpb
-    return best
-# ---------------------------------------------------------------------------
-# Experiment execution
-# ---------------------------------------------------------------------------
-def run_experiment(timeout: int = 600) -> str:
-    """Launch ``uv run train.py`` and capture all output to run.log.
-    Args:
-        timeout: Kill the process after this many seconds.
-    Returns:
-        One of ``"ok"``, ``"timeout"``, or ``"error"``.
-    """
-    try:
-        with open(RUN_LOG, "w") as log_file:
-            proc = subprocess.run(
-                ["uv", "run", "train.py"],
-                cwd=REPO_DIR,
-                stdout=log_file,
-                stderr=subprocess.STDOUT,
-                timeout=timeout,
-            )
-        return "ok" if proc.returncode == 0 else "error"
-    except subprocess.TimeoutExpired:
-        return "timeout"
-    except Exception as exc:  # noqa: BLE001
-        with open(RUN_LOG, "a") as log_file:
-            log_file.write(f"\nOrchestrator error: {exc}\n")
-        return "error"
-# ---------------------------------------------------------------------------
-# Main loop
-# ---------------------------------------------------------------------------
 def run_loop(
     meta_interval: int = 20,
     max_experiments: int | None = None,
     experiment_timeout: int = 600,
-    secondary_gates: dict[str, dict[str, float]] | None = None,
 ) -> None:
-    """Run the HYDRA autoresearch loop.
-    This function runs indefinitely (or until ``max_experiments`` is reached
-    or the user interrupts with Ctrl-C).
-    Args:
-        meta_interval: Run the meta-agent every N experiments.
-        max_experiments: Hard stop after this many experiments (None = infinite).
-        experiment_timeout: Seconds before a training run is killed.
-        secondary_gates: Optional gate thresholds forwarded to
-            :func:`~harness.eval_agent.should_keep`.
-    """
     init_results_tsv()
-    if secondary_gates is None:
-        secondary_gates = HarnessConfig().to_secondary_gates()
     best_bpb = _load_best_bpb()
-    experiment_num = count_experiments()
-    print(
-        f"HYDRA Orchestrator starting. "
-        f"Experiments so far: {experiment_num}, Best BPB: {best_bpb:.6f}"
-    )
-    while max_experiments is None or experiment_num < max_experiments:
-        experiment_num += 1
-        # ------------------------------------------------------------------
-        # Pre-flight health check
-        # ------------------------------------------------------------------
-        healthy, hw_warnings = check_health()
-        if hw_warnings:
-            print(f"  [health] {hw_warnings}")
-        # ------------------------------------------------------------------
-        # Periodic meta-agent update
-        # ------------------------------------------------------------------
-        if experiment_num > 1 and experiment_num % meta_interval == 0:
-            print(f"\n=== Meta-agent iteration at experiment {experiment_num} ===")
-            meta_result = run_meta_iteration()
-            print(
-                f"  state={meta_result['state']}  "
-                f"best_bpb={meta_result['best_bpb']:.6f}  "
-                f"changed={meta_result['changed']}"
-            )
-            if meta_result.get("directive"):
-                print(f"  directive: {meta_result['directive'][:120]}")
-        # ------------------------------------------------------------------
-        # Record baseline commit so we can reset on failure / discard
-        # ------------------------------------------------------------------
-        pre_commit = current_commit_short()
-        # ------------------------------------------------------------------
-        # Run experiment
-        # ------------------------------------------------------------------
-        print(f"\n--- Experiment {experiment_num} ---")
-        reset_peak_stats()
-        t0 = time.time()
-        run_status = run_experiment(timeout=experiment_timeout)
-        elapsed = time.time() - t0
-        print(f"  run_status={run_status}  elapsed={elapsed:.1f}s")
-        # ------------------------------------------------------------------
-        # Parse results
-        # ------------------------------------------------------------------
-        result: ExperimentResult = parse_run_log(RUN_LOG)
-        if result.crashed or run_status != "ok":
-            commit = current_commit_short()
-            err_short = (
-                "timeout"
-                if run_status == "timeout"
-                else result.error_message[:80].replace("\n", " ")
-            )
-            log_result(commit, 0.0, 0.0, "crash", err_short)
-            print(f"  CRASH: {err_short}")
-            reset_to(pre_commit)
-            continue
-        # ------------------------------------------------------------------
-        # Secondary alarms (non-blocking -- logged but do not abort)
-        # ------------------------------------------------------------------
-        alarms = check_secondary_alarms(result)
-        if alarms:
-            for alarm in alarms:
-                print(f"  [alarm] {alarm}")
-        # ------------------------------------------------------------------
-        # Keep / discard
-        # ------------------------------------------------------------------
-        keep, reason = should_keep(result, best_bpb, gates=secondary_gates)
-        commit = current_commit_short()
-        memory_gb = result.peak_vram_mb / 1024.0
-        if keep:
-            best_bpb = result.val_bpb
-            description = f"val_bpb improved to {result.val_bpb:.6f}"
-            log_result(commit, result.val_bpb, memory_gb, "keep", description)
-            print(f"  KEEP: val_bpb={result.val_bpb:.6f}  (new best)")
-        else:
-            description = f"{reason} val_bpb={result.val_bpb:.6f}"
-            log_result(commit, result.val_bpb, memory_gb, "discard", description)
-            print(f"  DISCARD: val_bpb={result.val_bpb:.6f}  ({reason})")
-            reset_to(pre_commit)
-    print(f"\nHYDRA finished after {experiment_num} experiments. Best BPB: {best_bpb:.6f}")
-# ---------------------------------------------------------------------------
-# CLI entry point
-# ---------------------------------------------------------------------------
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="HYDRA Autoresearch Orchestrator")
-    parser.add_argument(
-        "--meta-interval",
-        type=int,
-        default=20,
-        help="Run meta-agent every N experiments (default: 20)",
-    )
-    parser.add_argument(
-        "--max-experiments",
-        type=int,
-        default=None,
-        help="Stop after N experiments; omit for infinite (default: infinite)",
-    )
-    parser.add_argument(
-        "--experiment-timeout",
-        type=int,
-        default=600,
-        help="Kill training run after N seconds (default: 600)",
-    )
-    args = parser.parse_args()
-    try:
-        run_loop(
-            meta_interval=args.meta_interval,
-            max_experiments=args.max_experiments,
-            experiment_timeout=args.experiment_timeout,
-        )
-    except KeyboardInterrupt:
-        print("\nOrchestrator stopped by user.")

+"""HYDRA Orchestrator: main loop for autonomous research.
+Usage::
+    python -m harness.orchestrator [--meta-interval N] [--max-experiments N]
+Loop:
+    1. Read current state (branch, results.tsv, program.md)
+    2. [Architect Agent] proposes and applies changes to train.py (external)
+    3. Git commit the changes
+    4. Run training: ``uv run train.py`` captured to run.log
+    5. [Eval Agent] extract metrics from run.log
+    6. Keep or discard based on val_bpb + secondary metric gates
+    7. Log to results.tsv
+    8. Every ``meta_interval`` experiments: [Meta Agent] evolves program.md
+    9. Repeat
+The orchestrator intentionally does NOT modify train.py itself -- it
+provides the infrastructure ("rails") that the autoresearch loop runs on.
+"""
+import argparse
+import csv
 import os
 import subprocess
 import time
 from harness.eval_agent import ExperimentResult, check_secondary_alarms, parse_run_log, should_keep
+from harness.git_utils import REPO_DIR, commit_all, current_commit_short, reset_to
+from harness.health_monitor import check_health, reset_peak_stats
+from harness.meta_agent import run_meta_iteration
+from harness.search_strategy import diagnose
+# ---------------------------------------------------------------------------
+# Paths
+# ---------------------------------------------------------------------------
+RESULTS_FILE = os.path.join(REPO_DIR, "results.tsv")
+RUN_LOG = os.path.join(REPO_DIR, "run.log")
+_TSV_HEADER = "commit\tval_bpb\tmemory_gb\tstatus\tdescription\n"
+# ---------------------------------------------------------------------------
+# TSV helpers
+# ---------------------------------------------------------------------------
+def init_results_tsv() -> None:
+    """Create results.tsv with header row if it does not yet exist."""
+    if not os.path.exists(RESULTS_FILE):
+        with open(RESULTS_FILE, "w") as fh:
+            fh.write(_TSV_HEADER)
+def log_result(
+    commit: str,
+    val_bpb: float,
+    memory_gb: float,
+    status: str,
+    description: str,
+) -> None:
+    """Append one row to results.tsv.
+    Args:
+        commit: Short git hash for this experiment.
+        val_bpb: Validation bits-per-byte (0.0 for crashes).
+        memory_gb: Peak VRAM usage in gigabytes.
+        status: One of keep / discard / crash / timeout.
+        description: Short human-readable description.
+    """
+    with open(RESULTS_FILE, "a") as fh:
+        fh.write(
+            f"{commit}\t{val_bpb:.6f}\t{memory_gb:.2f}\t{status}\t{description}\n"
+        )
+def count_experiments() -> int:
+    """Count the number of experiment rows in results.tsv.
+    Returns:
+        Row count excluding the header line (0 when file does not exist).
+    """
+    if not os.path.exists(RESULTS_FILE):
+        return 0
+    with open(RESULTS_FILE) as fh:
+        return max(0, sum(1 for _ in fh) - 1)
+def _load_best_bpb() -> float:
+    """Scan results.tsv for the best (lowest positive) val_bpb seen so far.
+    Returns:
+        Best val_bpb, or ``float("inf")`` when no valid result exists.
+    """
+    if not os.path.exists(RESULTS_FILE):
+        return float("inf")
+    best = float("inf")
+    with open(RESULTS_FILE) as fh:
+        reader = csv.DictReader(fh, delimiter="\t")
+        for row in reader:
+            try:
+                bpb = float(row.get("val_bpb", "0") or "0")
+            except ValueError:
+                continue
+            if 0 < bpb < best:
+                best = bpb
+    return best
+# ---------------------------------------------------------------------------
+# Experiment execution
+# ---------------------------------------------------------------------------
+def run_experiment(timeout: int = 600) -> str:
+    """Launch ``uv run train.py`` and capture all output to run.log.
+    Args:
+        timeout: Kill the process after this many seconds.
+    Returns:
+        One of ``"ok"``, ``"timeout"``, or ``"error"``.
+    """
+    try:
+        with open(RUN_LOG, "w") as log_file:
+            proc = subprocess.run(
+                ["uv", "run", "train.py"],
+                cwd=REPO_DIR,
+                stdout=log_file,
+                stderr=subprocess.STDOUT,
+                timeout=timeout,
+            )
+        return "ok" if proc.returncode == 0 else "error"
+    except subprocess.TimeoutExpired:
+        return "timeout"
+    except Exception as exc:  # noqa: BLE001
+        with open(RUN_LOG, "a") as log_file:
+            log_file.write(f"\nOrchestrator error: {exc}\n")
+        return "error"
+# ---------------------------------------------------------------------------
+# Main loop
+# ---------------------------------------------------------------------------
 def run_loop(
     meta_interval: int = 20,
     max_experiments: int | None = None,
     experiment_timeout: int = 600,
+    secondary_gates: dict | None = None,
 ) -> None:
+    """Run the HYDRA autoresearch loop.
+    This function runs indefinitely (or until ``max_experiments`` is reached
+    or the user interrupts with Ctrl-C).
+    Args:
+        meta_interval: Run the meta-agent every N experiments.
+        max_experiments: Hard stop after this many experiments (None = infinite).
+        experiment_timeout: Seconds before a training run is killed.
+        secondary_gates: Optional gate thresholds forwarded to
+            :func:`~harness.eval_agent.should_keep`.
+    """
     init_results_tsv()
     best_bpb = _load_best_bpb()
+    experiment_num = count_experiments()
+    print(
+        f"HYDRA Orchestrator starting. "
+        f"Experiments so far: {experiment_num}, Best BPB: {best_bpb:.6f}"
+    )
+    while max_experiments is None or experiment_num < max_experiments:
+        experiment_num += 1
+        # ------------------------------------------------------------------
+        # Pre-flight health check
+        # ------------------------------------------------------------------
+        healthy, hw_warnings = check_health()
+        if hw_warnings:
+            print(f"  [health] {hw_warnings}")
+        # ------------------------------------------------------------------
+        # Periodic meta-agent update
+        # ------------------------------------------------------------------
+        if experiment_num > 1 and experiment_num % meta_interval == 0:
+            print(f"\n=== Meta-agent iteration at experiment {experiment_num} ===")
+            meta_result = run_meta_iteration()
+            print(
+                f"  state={meta_result['state']}  "
+                f"best_bpb={meta_result['best_bpb']:.6f}  "
+                f"changed={meta_result['changed']}"
+            )
+            if meta_result.get("directive"):
+                print(f"  directive: {meta_result['directive'][:120]}")
+        # ------------------------------------------------------------------
+        # Record baseline commit so we can reset on failure / discard
+        # ------------------------------------------------------------------
+        pre_commit = current_commit_short()
+        # ------------------------------------------------------------------
+        # Run experiment
+        # ------------------------------------------------------------------
+        print(f"\n--- Experiment {experiment_num} ---")
+        reset_peak_stats()
+        t0 = time.time()
+        run_status = run_experiment(timeout=experiment_timeout)
+        elapsed = time.time() - t0
+        print(f"  run_status={run_status}  elapsed={elapsed:.1f}s")
+        # ------------------------------------------------------------------
+        # Parse results
+        # ------------------------------------------------------------------
+        result: ExperimentResult = parse_run_log(RUN_LOG)
+        if result.crashed or run_status != "ok":
+            commit = current_commit_short()
+            err_short = (
+                "timeout"
+                if run_status == "timeout"
+                else result.error_message[:80].replace("\n", " ")
+            )
+            log_result(commit, 0.0, 0.0, "crash", err_short)
+            print(f"  CRASH: {err_short}")
+            reset_to(pre_commit)
+            continue
+        # ------------------------------------------------------------------
+        # Secondary alarms (non-blocking -- logged but do not abort)
+        # ------------------------------------------------------------------
+        alarms = check_secondary_alarms(result)
+        if alarms:
+            for alarm in alarms:
+                print(f"  [alarm] {alarm}")
+        # ------------------------------------------------------------------
+        # Keep / discard
+        # ------------------------------------------------------------------
+        keep, reason = should_keep(result, best_bpb, gates=secondary_gates)
+        commit = current_commit_short()
+        memory_gb = result.peak_vram_mb / 1024.0
+        if keep:
+            best_bpb = result.val_bpb
+            description = f"val_bpb improved to {result.val_bpb:.6f}"
+            log_result(commit, result.val_bpb, memory_gb, "keep", description)
+            print(f"  KEEP: val_bpb={result.val_bpb:.6f}  (new best)")
+        else:
+            description = f"{reason} val_bpb={result.val_bpb:.6f}"
+            log_result(commit, result.val_bpb, memory_gb, "discard", description)
+            print(f"  DISCARD: val_bpb={result.val_bpb:.6f}  ({reason})")
+            reset_to(pre_commit)
+    print(f"\nHYDRA finished after {experiment_num} experiments. Best BPB: {best_bpb:.6f}")
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="HYDRA Autoresearch Orchestrator")
+    parser.add_argument(
+        "--meta-interval",
+        type=int,
+        default=20,
+        help="Run meta-agent every N experiments (default: 20)",
+    )
+    parser.add_argument(
+        "--max-experiments",
+        type=int,
+        default=None,
+        help="Stop after N experiments; omit for infinite (default: infinite)",
+    )
+    parser.add_argument(
+        "--experiment-timeout",
+        type=int,
+        default=600,
+        help="Kill training run after N seconds (default: 600)",
+    )
+    args = parser.parse_args()
+    try:
+        run_loop(
+            meta_interval=args.meta_interval,
+            max_experiments=args.max_experiments,
+            experiment_timeout=args.experiment_timeout,
+        )
+    except KeyboardInterrupt:
+        print("\nOrchestrator stopped by user.")

overlay/harness/search_strategy.py CHANGED Viewed

@@ -1,153 +1,153 @@
-"""Search strategy for HYDRA's meta-evolution loop.
-Reads results.tsv and diagnoses the current research state as one of:
-  EXPLORING  -- active improvement trend with diverse experiments
-  EXPLOITING -- narrowing in on a local optimum (low diversity)
-  STUCK      -- no improvement for >= stuck_threshold experiments
-  BROKEN     -- crash rate exceeds crash_threshold
-"""
-import csv
-import os
-from dataclasses import dataclass
-@dataclass
-class ResearchState:
-    """Diagnosis of the current research trajectory.
-    Attributes:
-        label: One of EXPLORING, EXPLOITING, STUCK, BROKEN.
-        trend_improving: True when the second half of the recent window is
-            better (lower BPB) than the first half.
-        experiment_diversity: Rough 0–1 score based on unique description
-            prefixes in the recent window.
-        crash_rate: Fraction of recent experiments that crashed.
-        best_bpb: Lowest val_bpb seen across all experiments.
-        last_improvement_at: Ordinal of the experiment that set best_bpb.
-        total_experiments: Total rows in results.tsv (excluding header).
-    """
-    label: str
-    trend_improving: bool
-    experiment_diversity: float
-    crash_rate: float
-    best_bpb: float
-    last_improvement_at: int
-    total_experiments: int
-def diagnose(
-    results_path: str,
-    window: int = 20,
-    stuck_threshold: int = 10,
-    crash_threshold: float = 0.5,
-) -> ResearchState:
-    """Diagnose current research state from results.tsv.
-    Args:
-        results_path: Path to the tab-separated results file.
-        window: Number of recent experiments to consider for trend/diversity.
-        stuck_threshold: Experiments without improvement before labelling STUCK.
-        crash_threshold: Crash fraction above which state becomes BROKEN.
-    Returns:
-        ResearchState with diagnosis label and supporting statistics.
-    """
-    if not os.path.exists(results_path):
-        return ResearchState(
-            label="EXPLORING",
-            trend_improving=False,
-            experiment_diversity=0.0,
-            crash_rate=0.0,
-            best_bpb=float("inf"),
-            last_improvement_at=0,
-            total_experiments=0,
-        )
-    rows: list[dict] = []
-    with open(results_path) as fh:
-        reader = csv.DictReader(fh, delimiter="\t")
-        for row in reader:
-            rows.append(row)
-    if not rows:
-        return ResearchState(
-            label="EXPLORING",
-            trend_improving=False,
-            experiment_diversity=0.0,
-            crash_rate=0.0,
-            best_bpb=float("inf"),
-            last_improvement_at=0,
-            total_experiments=0,
-        )
-    total = len(rows)
-    recent = rows[-window:]
-    # Crash rate in the recent window.
-    crashes = sum(1 for r in recent if r.get("status") == "crash")
-    crash_rate = crashes / len(recent) if recent else 0.0
-    # Best BPB overall and which experiment achieved it.
-    best_bpb = float("inf")
-    last_improvement_at = 0
-    for i, row in enumerate(rows):
-        try:
-            bpb = float(row.get("val_bpb", "0") or "0")
-        except ValueError:
-            continue
-        if bpb > 0 and bpb < best_bpb:
-            best_bpb = bpb
-            last_improvement_at = i + 1
-    # Trend: is the second half of the recent window better than the first?
-    valid_bpbs = [
-        float(r.get("val_bpb", "0") or "0")
-        for r in recent
-        if float(r.get("val_bpb", "0") or "0") > 0
-    ]
-    trend_improving = False
-    if len(valid_bpbs) >= 4:
-        mid = len(valid_bpbs) // 2
-        first_half_mean = sum(valid_bpbs[:mid]) / mid
-        second_half_mean = sum(valid_bpbs[mid:]) / (len(valid_bpbs) - mid)
-        trend_improving = second_half_mean < first_half_mean
-    # Diversity: fraction of unique description prefixes (first 20 chars).
-    descriptions = {r.get("description", "")[:20] for r in recent}
-    diversity = min(1.0, len(descriptions) / max(1, len(recent)))
-    # Classify state.
-    stale = total - last_improvement_at
-    if crash_rate > crash_threshold:
-        label = "BROKEN"
-    elif stale >= stuck_threshold:
-        label = "STUCK"
-    elif trend_improving and diversity > 0.3:
-        label = "EXPLORING"
-    else:
-        label = "EXPLOITING"
-    return ResearchState(
-        label=label,
-        trend_improving=trend_improving,
-        experiment_diversity=diversity,
-        crash_rate=crash_rate,
-        best_bpb=best_bpb,
-        last_improvement_at=last_improvement_at,
-        total_experiments=total,
-    )
-def should_explore(results_path: str, n: int = 10) -> bool:
-    """Return True when no improvement has been seen in the last N experiments.
-    Args:
-        results_path: Path to results.tsv.
-        n: Look-back window for improvement check.
-    Returns:
-        True if the research loop should try bolder mutations.
-    """
-    state = diagnose(results_path, window=n, stuck_threshold=n)
-    return state.label in ("STUCK", "BROKEN")

+"""Search strategy for HYDRA's meta-evolution loop.
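+Reads results.tsv and diagnoses the current research state as one of:
+  EXPLORING  -- active improvement trend with diverse experiments
+  EXPLOITING -- narrowing in on a local optimum (trend not improving or low
+                diversity); this is the fallback when no other label applies
+  STUCK      -- no improvement for >= stuck_threshold experiments
+  BROKEN     -- recent crash rate exceeds crash_threshold
+Labels are checked in priority order: BROKEN, then STUCK, then EXPLORING.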
+"""
+import csv
+import os
+from dataclasses import dataclass
+@dataclass
+class ResearchState:
+    """Diagnosis of the current research trajectory, as returned by diagnose().
+    Attributes:
+        label: One of EXPLORING, EXPLOITING, STUCK, BROKEN.
+        trend_improving: True when the second half of the recent window is
+            better (lower mean BPB) than the first half.
+        experiment_diversity: Rough 0–1 score based on unique description
+            prefixes in the recent window.
+        crash_rate: Fraction of recent experiments that crashed.
+        best_bpb: Lowest positive val_bpb seen across all experiments;
+            float("inf") when no experiment has a usable val_bpb.
+        last_improvement_at: 1-based ordinal of the experiment that set
+            best_bpb; 0 when best_bpb was never set.
+        total_experiments: Total rows in results.tsv (excluding header).
+    """
+    label: str
+    trend_improving: bool
+    experiment_diversity: float
+    crash_rate: float
+    best_bpb: float
+    last_improvement_at: int
+    total_experiments: int
+def diagnose(
+    results_path: str,
+    window: int = 20,
+    stuck_threshold: int = 10,
+    crash_threshold: float = 0.5,
+) -> ResearchState:
+    """Diagnose current research state from results.tsv.
+    Args:
+        results_path: Path to the tab-separated results file.
+        window: Number of recent experiments to consider for trend/diversity.
+        stuck_threshold: Experiments without improvement before labelling STUCK.
+        crash_threshold: Crash fraction above which state becomes BROKEN.
+    Returns:
+        ResearchState with diagnosis label and supporting statistics.
+    """
+    if not os.path.exists(results_path):
+        return ResearchState(
+            label="EXPLORING",
+            trend_improving=False,
+            experiment_diversity=0.0,
+            crash_rate=0.0,
+            best_bpb=float("inf"),
+            last_improvement_at=0,
+            total_experiments=0,
+        )
+    rows: list[dict] = []
+    with open(results_path) as fh:
+        reader = csv.DictReader(fh, delimiter="\t")
+        for row in reader:
+            rows.append(row)
+    if not rows:
+        return ResearchState(
+            label="EXPLORING",
+            trend_improving=False,
+            experiment_diversity=0.0,
+            crash_rate=0.0,
+            best_bpb=float("inf"),
+            last_improvement_at=0,
+            total_experiments=0,
+        )
+    total = len(rows)
+    recent = rows[-window:]
+    # Crash rate in the recent window.
+    crashes = sum(1 for r in recent if r.get("status") == "crash")
+    crash_rate = crashes / len(recent) if recent else 0.0
+    # Best BPB overall and which experiment achieved it.
+    best_bpb = float("inf")
+    last_improvement_at = 0
+    for i, row in enumerate(rows):
+        try:
+            bpb = float(row.get("val_bpb", "0") or "0")
+        except ValueError:
+            continue
+        if bpb > 0 and bpb < best_bpb:
+            best_bpb = bpb
+            last_improvement_at = i + 1
+    # Trend: is the second half of the recent window better than the first?
+    valid_bpbs = [
+        float(r.get("val_bpb", "0") or "0")
+        for r in recent
+        if float(r.get("val_bpb", "0") or "0") > 0
+    ]
+    trend_improving = False
+    if len(valid_bpbs) >= 4:
+        mid = len(valid_bpbs) // 2
+        first_half_mean = sum(valid_bpbs[:mid]) / mid
+        second_half_mean = sum(valid_bpbs[mid:]) / (len(valid_bpbs) - mid)
+        trend_improving = second_half_mean < first_half_mean
+    # Diversity: fraction of unique description prefixes (first 20 chars).
+    descriptions = {r.get("description", "")[:20] for r in recent}
+    diversity = min(1.0, len(descriptions) / max(1, len(recent)))
+    # Classify state.
+    stale = total - last_improvement_at
+    if crash_rate > crash_threshold:
+        label = "BROKEN"
+    elif stale >= stuck_threshold:
+        label = "STUCK"
+    elif trend_improving and diversity > 0.3:
+        label = "EXPLORING"
+    else:
+        label = "EXPLOITING"
+    return ResearchState(
+        label=label,
+        trend_improving=trend_improving,
+        experiment_diversity=diversity,
+        crash_rate=crash_rate,
+        best_bpb=best_bpb,
+        last_improvement_at=last_improvement_at,
+        total_experiments=total,
+    )
+def should_explore(results_path: str, n: int = 10) -> bool:
+    """Return True when no improvement has been seen in the last N experiments.
+    Args:
+        results_path: Path to results.tsv.
+        n: Look-back window for improvement check.
+    Returns:
+        True if the research loop should try bolder mutations.
+    """
+    state = diagnose(results_path, window=n, stuck_threshold=n)
+    return state.label in ("STUCK", "BROKEN")

overlay/htm_rust/Cargo.lock CHANGED Viewed

@@ -1,383 +1,383 @@
-# This file is automatically @generated by Cargo.
-# It is not intended for manual editing.
-version = 4
-[[package]]
-name = "autocfg"
-version = "1.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
-[[package]]
-name = "cfg-if"
-version = "1.0.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
-[[package]]
-name = "cudarc"
-version = "0.12.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38cd60a9a42ec83a2ed7effb0b1f073270264ea99da7acfc44f7e8d74dee0384"
-dependencies = [
- "libloading",
-]
-[[package]]
-name = "getrandom"
-version = "0.2.17"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
-dependencies = [
- "cfg-if",
- "libc",
- "wasi",
-]
-[[package]]
-name = "heck"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
-[[package]]
-name = "htm_rust"
-version = "0.1.0"
-dependencies = [
- "cudarc",
- "ndarray",
- "numpy",
- "pyo3",
- "rand",
- "rand_xoshiro",
-]
-[[package]]
-name = "indoc"
-version = "2.0.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706"
-dependencies = [
- "rustversion",
-]
-[[package]]
-name = "libc"
-version = "0.2.185"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f"
-[[package]]
-name = "libloading"
-version = "0.8.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55"
-dependencies = [
- "cfg-if",
- "windows-link",
-]
-[[package]]
-name = "matrixmultiply"
-version = "0.3.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08"
-dependencies = [
- "autocfg",
- "rawpointer",
-]
-[[package]]
-name = "memoffset"
-version = "0.9.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
-dependencies = [
- "autocfg",
-]
-[[package]]
-name = "ndarray"
-version = "0.16.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841"
-dependencies = [
- "matrixmultiply",
- "num-complex",
- "num-integer",
- "num-traits",
- "portable-atomic",
- "portable-atomic-util",
- "rawpointer",
-]
-[[package]]
-name = "num-complex"
-version = "0.4.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
-dependencies = [
- "num-traits",
-]
-[[package]]
-name = "num-integer"
-version = "0.1.46"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
-dependencies = [
- "num-traits",
-]
-[[package]]
-name = "num-traits"
-version = "0.2.19"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
-dependencies = [
- "autocfg",
-]
-[[package]]
-name = "numpy"
-version = "0.22.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "edb929bc0da91a4d85ed6c0a84deaa53d411abfb387fc271124f91bf6b89f14e"
-dependencies = [
- "libc",
- "ndarray",
- "num-complex",
- "num-integer",
- "num-traits",
- "pyo3",
- "rustc-hash",
-]
-[[package]]
-name = "once_cell"
-version = "1.21.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
-[[package]]
-name = "portable-atomic"
-version = "1.13.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49"
-[[package]]
-name = "portable-atomic-util"
-version = "0.2.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3"
-dependencies = [
- "portable-atomic",
-]
-[[package]]
-name = "ppv-lite86"
-version = "0.2.21"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
-dependencies = [
- "zerocopy",
-]
-[[package]]
-name = "proc-macro2"
-version = "1.0.106"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
-dependencies = [
- "unicode-ident",
-]
-[[package]]
-name = "pyo3"
-version = "0.22.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884"
-dependencies = [
- "cfg-if",
- "indoc",
- "libc",
- "memoffset",
- "once_cell",
- "portable-atomic",
- "pyo3-build-config",
- "pyo3-ffi",
- "pyo3-macros",
- "unindent",
-]
-[[package]]
-name = "pyo3-build-config"
-version = "0.22.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38"
-dependencies = [
- "once_cell",
- "target-lexicon",
-]
-[[package]]
-name = "pyo3-ffi"
-version = "0.22.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636"
-dependencies = [
- "libc",
- "pyo3-build-config",
-]
-[[package]]
-name = "pyo3-macros"
-version = "0.22.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453"
-dependencies = [
- "proc-macro2",
- "pyo3-macros-backend",
- "quote",
- "syn",
-]
-[[package]]
-name = "pyo3-macros-backend"
-version = "0.22.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe"
-dependencies = [
- "heck",
- "proc-macro2",
- "pyo3-build-config",
- "quote",
- "syn",
-]
-[[package]]
-name = "quote"
-version = "1.0.45"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
-dependencies = [
- "proc-macro2",
-]
-[[package]]
-name = "rand"
-version = "0.8.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
-dependencies = [
- "libc",
- "rand_chacha",
- "rand_core",
-]
-[[package]]
-name = "rand_chacha"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
-dependencies = [
- "ppv-lite86",
- "rand_core",
-]
-[[package]]
-name = "rand_core"
-version = "0.6.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
-dependencies = [
- "getrandom",
-]
-[[package]]
-name = "rand_xoshiro"
-version = "0.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6f97cdb2a36ed4183de61b2f824cc45c9f1037f28afe0a322e9fff4c108b5aaa"
-dependencies = [
- "rand_core",
-]
-[[package]]
-name = "rawpointer"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
-[[package]]
-name = "rustc-hash"
-version = "1.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
-[[package]]
-name = "rustversion"
-version = "1.0.22"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
-[[package]]
-name = "syn"
-version = "2.0.117"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
-dependencies = [
- "proc-macro2",
- "quote",
- "unicode-ident",
-]
-[[package]]
-name = "target-lexicon"
-version = "0.12.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
-[[package]]
-name = "unicode-ident"
-version = "1.0.24"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
-[[package]]
-name = "unindent"
-version = "0.2.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
-[[package]]
-name = "wasi"
-version = "0.11.1+wasi-snapshot-preview1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
-[[package]]
-name = "windows-link"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
-[[package]]
-name = "zerocopy"
-version = "0.8.48"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
-dependencies = [
- "zerocopy-derive",
-]
-[[package]]
-name = "zerocopy-derive"
-version = "0.8.48"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
-]

+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+[[package]]
+name = "autocfg"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
+[[package]]
+name = "cfg-if"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
+[[package]]
+name = "cudarc"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "38cd60a9a42ec83a2ed7effb0b1f073270264ea99da7acfc44f7e8d74dee0384"
+dependencies = [
+ "libloading",
+]
+[[package]]
+name = "getrandom"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi",
+]
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+[[package]]
+name = "htm_rust"
+version = "0.1.0"
+dependencies = [
+ "cudarc",
+ "ndarray",
+ "numpy",
+ "pyo3",
+ "rand",
+ "rand_xoshiro",
+]
+[[package]]
+name = "indoc"
+version = "2.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706"
+dependencies = [
+ "rustversion",
+]
+[[package]]
+name = "libc"
+version = "0.2.185"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52ff2c0fe9bc6cb6b14a0592c2ff4fa9ceb83eea9db979b0487cd054946a2b8f"
+[[package]]
+name = "libloading"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55"
+dependencies = [
+ "cfg-if",
+ "windows-link",
+]
+[[package]]
+name = "matrixmultiply"
+version = "0.3.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08"
+dependencies = [
+ "autocfg",
+ "rawpointer",
+]
+[[package]]
+name = "memoffset"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
+dependencies = [
+ "autocfg",
+]
+[[package]]
+name = "ndarray"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841"
+dependencies = [
+ "matrixmultiply",
+ "num-complex",
+ "num-integer",
+ "num-traits",
+ "portable-atomic",
+ "portable-atomic-util",
+ "rawpointer",
+]
+[[package]]
+name = "num-complex"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
+dependencies = [
+ "num-traits",
+]
+[[package]]
+name = "num-integer"
+version = "0.1.46"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
+dependencies = [
+ "num-traits",
+]
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
+dependencies = [
+ "autocfg",
+]
+[[package]]
+name = "numpy"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edb929bc0da91a4d85ed6c0a84deaa53d411abfb387fc271124f91bf6b89f14e"
+dependencies = [
+ "libc",
+ "ndarray",
+ "num-complex",
+ "num-integer",
+ "num-traits",
+ "pyo3",
+ "rustc-hash",
+]
+[[package]]
+name = "once_cell"
+version = "1.21.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
+[[package]]
+name = "portable-atomic"
+version = "1.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49"
+[[package]]
+name = "portable-atomic-util"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3"
+dependencies = [
+ "portable-atomic",
+]
+[[package]]
+name = "ppv-lite86"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
+dependencies = [
+ "zerocopy",
+]
+[[package]]
+name = "proc-macro2"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
+dependencies = [
+ "unicode-ident",
+]
+[[package]]
+name = "pyo3"
+version = "0.22.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884"
+dependencies = [
+ "cfg-if",
+ "indoc",
+ "libc",
+ "memoffset",
+ "once_cell",
+ "portable-atomic",
+ "pyo3-build-config",
+ "pyo3-ffi",
+ "pyo3-macros",
+ "unindent",
+]
+[[package]]
+name = "pyo3-build-config"
+version = "0.22.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38"
+dependencies = [
+ "once_cell",
+ "target-lexicon",
+]
+[[package]]
+name = "pyo3-ffi"
+version = "0.22.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636"
+dependencies = [
+ "libc",
+ "pyo3-build-config",
+]
+[[package]]
+name = "pyo3-macros"
+version = "0.22.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453"
+dependencies = [
+ "proc-macro2",
+ "pyo3-macros-backend",
+ "quote",
+ "syn",
+]
+[[package]]
+name = "pyo3-macros-backend"
+version = "0.22.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "pyo3-build-config",
+ "quote",
+ "syn",
+]
+[[package]]
+name = "quote"
+version = "1.0.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
+dependencies = [
+ "proc-macro2",
+]
+[[package]]
+name = "rand"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
+dependencies = [
+ "libc",
+ "rand_chacha",
+ "rand_core",
+]
+[[package]]
+name = "rand_chacha"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
+dependencies = [
+ "ppv-lite86",
+ "rand_core",
+]
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+dependencies = [
+ "getrandom",
+]
+[[package]]
+name = "rand_xoshiro"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f97cdb2a36ed4183de61b2f824cc45c9f1037f28afe0a322e9fff4c108b5aaa"
+dependencies = [
+ "rand_core",
+]
+[[package]]
+name = "rawpointer"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
+[[package]]
+name = "rustc-hash"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
+[[package]]
+name = "rustversion"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
+[[package]]
+name = "syn"
+version = "2.0.117"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+[[package]]
+name = "target-lexicon"
+version = "0.12.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
+[[package]]
+name = "unicode-ident"
+version = "1.0.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
+[[package]]
+name = "unindent"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
+[[package]]
+name = "wasi"
+version = "0.11.1+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
+[[package]]
+name = "windows-link"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
+[[package]]
+name = "zerocopy"
+version = "0.8.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
+dependencies = [
+ "zerocopy-derive",
+]
+[[package]]
+name = "zerocopy-derive"
+version = "0.8.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]

overlay/htm_rust/Cargo.toml CHANGED Viewed

@@ -1,37 +1,37 @@
-[package]
-name = "htm_rust"
-version = "0.1.0"
-edition = "2021"
-authors = ["Feather/HYDRA"]
-description = "Numenta BAMI-spec Hierarchical Temporal Memory (Spatial Pooler + Temporal Memory) with pyo3 bindings"
-license = "MIT"
-[lib]
-name = "htm_rust"
-crate-type = ["cdylib", "rlib"]
-[dependencies]
-pyo3 = { version = "0.22", features = ["extension-module"] }
-numpy = "0.22"
-ndarray = "0.16"
-rand = "0.8"
-rand_xoshiro = "0.6"
-# cudarc: CUDA Rust bindings with dynamic-loading (no link-time dep on libcuda).
-# Kernels are embedded as PTX and JIT-compiled at runtime.
-cudarc = { version = "0.12", default-features = false, features = ["dynamic-linking", "driver", "cuda-12010"], optional = true }
-[build-dependencies]
-# Only required when building with --features gpu. We shell to nvcc directly
-# so we don't need cc's cuda support (which drags in extra deps).
-[features]
-default = []
-# `gpu` adds the HTMRegionGPU class, compiles .cu kernels to PTX at build time,
-# and links cudarc. Without this feature the crate is pure-CPU and has no
-# CUDA dependency at build or run time.
-gpu = ["cudarc"]
-[profile.release]
-opt-level = 3
-lto = "thin"
-codegen-units = 1

+[package]
+name = "htm_rust"
+version = "0.1.0"
+edition = "2021"
+authors = ["Feather/HYDRA"]
+description = "Numenta BAMI-spec Hierarchical Temporal Memory (Spatial Pooler + Temporal Memory) with pyo3 bindings"
+license = "MIT"
+[lib]
+name = "htm_rust"
+crate-type = ["cdylib", "rlib"]
+[dependencies]
+pyo3 = { version = "0.22", features = ["extension-module"] }
+numpy = "0.22"
+ndarray = "0.16"
+rand = "0.8"
+rand_xoshiro = "0.6"
+# cudarc: CUDA Rust bindings with dynamic-loading (no link-time dep on libcuda).
+# Kernels are embedded as PTX and JIT-compiled at runtime.
+cudarc = { version = "0.12", default-features = false, features = ["dynamic-linking", "driver", "cuda-12010"], optional = true }
+[build-dependencies]
+# Only required when building with --features gpu. We shell to nvcc directly
+# so we don't need cc's cuda support (which drags in extra deps).
+[features]
+default = []
+# `gpu` adds the HTMRegionGPU class, compiles .cu kernels to PTX at build time,
+# and links cudarc. Without this feature the crate is pure-CPU and has no
+# CUDA dependency at build or run time.
+gpu = ["cudarc"]
+[profile.release]
+opt-level = 3
+lto = "thin"
+codegen-units = 1

overlay/htm_rust/build.rs CHANGED Viewed

@@ -1,160 +1,168 @@
-//! Build script: compiles `.cu` kernel files to PTX when the `gpu` feature
-//! is enabled. PTX files are embedded into the final Rust binary via
-//! `include_str!` / `OUT_DIR` constants and JIT-loaded at runtime by cudarc.
-//!
-//! No-op when `gpu` feature is off — CPU-only builds have zero CUDA
-//! toolchain dependency.
-//!
-//! nvcc lookup order:
-//!   1. $NVCC env var
-//!   2. `nvcc` on PATH
-//!   3. `/usr/local/cuda-12.1/bin/nvcc`
-//!   4. `/usr/local/cuda/bin/nvcc`
-//!
-//! Target: sm_90a (Hopper, H200 — enables cluster::sync, TMA, wgmma). Override with $HTM_CUDA_ARCH.
-use std::env;
-use std::path::PathBuf;
-use std::process::Command;
-fn main() {
-    // Re-run whenever we edit the build script or any kernel source.
-    println!("cargo:rerun-if-changed=build.rs");
-    let gpu = env::var_os("CARGO_FEATURE_GPU").is_some();
-    if !gpu {
-        return;
-    }
-    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR"));
-    let arch = env::var("HTM_CUDA_ARCH").unwrap_or_else(|_| "sm_90a".into());
-    // Base kernels — compile for any sm_80+ GPU. Each .cu file → one .ptx file.
-    let base_kernels: &[&str] = &[
-        "sp_overlap",
-        "sp_topk",
-        "sp_learn",
-        "sp_duty",
-        "sp_boost_fused",
-        "tm_predict",
-        "tm_activate",
-        "tm_learn",
-        "tm_punish",
-        "tm_grow",
-        "tm_anomaly",
-        "tm_reset",
-    ];
-    // htm_fused_step now compiles for ALL architectures (sm_80+).
-    // On Hopper (sm_90+): uses cluster-distributed shared memory for hot state.
-    // On Ampere (sm_86) and other pre-Hopper: uses global memory reads/writes
-    // with grid.sync() for cross-block synchronization (cooperative launch).
-    let kernels: Vec<&str> = base_kernels.iter().chain(["htm_fused_step"].iter()).copied().collect();
-    let kernels_dir = PathBuf::from("src/gpu/kernels");
-    for k in &kernels {
-        let src = kernels_dir.join(format!("{k}.cu"));
-        println!("cargo:rerun-if-changed={}", src.display());
-    }
-    let nvcc = find_nvcc();
-    println!("cargo:warning=htm_rust: nvcc = {nvcc}");
-    println!("cargo:warning=htm_rust: target arch = {arch}");
-    // Prefer gcc-12 if present (CUDA 12.1 doesn't support gcc-13+ headers).
-    let host_compiler = env::var("HTM_CUDA_CCBIN")
-        .ok()
-        .or_else(|| {
-            for cand in ["/usr/bin/gcc-12", "/usr/bin/gcc-11"] {
-                if std::path::Path::new(cand).exists() {
-                    return Some(cand.to_string());
-                }
-            }
-            None
-        });
-    // Optionally patch the emitted PTX `.version` header down to match an
-    // older driver. Useful when the system driver (e.g. on WSL2) is older
-    // than the nvcc toolchain. Set HTM_PTX_VERSION to e.g. "7.8" or "8.0".
-    let ptx_version_override = env::var("HTM_PTX_VERSION").ok();
-    for k in kernels {
-        let src = kernels_dir.join(format!("{k}.cu"));
-        let ptx = out_dir.join(format!("{k}.ptx"));
-        if !src.exists() {
-            panic!("missing kernel source: {}", src.display());
-        }
-        let mut cmd = Command::new(&nvcc);
-        // Note: `--use_fast_math` breaks bit-parity with host `expf`, which
-        // in turn flips boost tie-breaks in SP learning. We accept the tiny
-        // perf loss for correctness; the hot overlap kernel has no transcendentals.
-        cmd.args([
-            "--ptx",
-            "-O3",
-            "-rdc=true",
-            "-arch",
-            &arch,
-        ]);
-        if let Some(cc) = &host_compiler {
-            cmd.args(["-ccbin", cc]);
-        }
-        cmd.arg("-o").arg(&ptx).arg(&src);
-        let status = cmd
-            .status()
-            .unwrap_or_else(|e| panic!("failed to spawn nvcc: {e}"));
-        if !status.success() {
-            panic!("nvcc failed for {}", src.display());
-        }
-        if let Some(ver) = &ptx_version_override {
-            // Read, patch, write.
-            let text = std::fs::read_to_string(&ptx)
-                .unwrap_or_else(|e| panic!("read {} failed: {e}", ptx.display()));
-            // Match `.version X.Y` where X and Y are digits. Replace whole line.
-            let patched: String = text
-                .lines()
-                .map(|line| {
-                    let t = line.trim_start();
-                    if t.starts_with(".version ") {
-                        format!(".version {ver}")
-                    } else {
-                        line.to_string()
-                    }
-                })
-                .collect::<Vec<_>>()
-                .join("\n");
-            std::fs::write(&ptx, patched)
-                .unwrap_or_else(|e| panic!("write {} failed: {e}", ptx.display()));
-        }
-    }
-    // Export OUT_DIR for include_str! in Rust.
-    println!(
-        "cargo:rustc-env=HTM_GPU_PTX_DIR={}",
-        out_dir.display()
-    );
-}
-fn find_nvcc() -> String {
-    if let Ok(n) = env::var("NVCC") {
-        return n;
-    }
-    // Try PATH.
-    if Command::new("nvcc").arg("--version").output().is_ok() {
-        return "nvcc".into();
-    }
-    for cand in [
-        "/usr/local/cuda-12.1/bin/nvcc",
-        "/usr/local/cuda/bin/nvcc",
-        "/usr/local/cuda-12/bin/nvcc",
-    ] {
-        if std::path::Path::new(cand).exists() {
-            return cand.into();
-        }
-    }
-    panic!(
-        "nvcc not found. Set $NVCC or install CUDA toolkit. \
-         Tried PATH, /usr/local/cuda-12.1, /usr/local/cuda."
-    );
-}

+//! Build script: compiles `.cu` kernel files to PTX when the `gpu` feature
+//! is enabled. PTX files are embedded into the final Rust binary via
+//! `include_str!` / `OUT_DIR` constants and JIT-loaded at runtime by cudarc.
+//!
+//! No-op when `gpu` feature is off — CPU-only builds have zero CUDA
+//! toolchain dependency.
+//!
+//! nvcc lookup order:
+//!   1. $NVCC env var
+//!   2. `nvcc` on PATH
+//!   3. `/usr/local/cuda-12.1/bin/nvcc`
+//!   4. `/usr/local/cuda/bin/nvcc`
+//!   5. `/usr/local/cuda-12/bin/nvcc`
+//!
+//! Default target: sm_86 (Ampere A10G / RTX 30xx). Override with $HTM_CUDA_ARCH (e.g. sm_90a for H200).
+use std::env;
+use std::path::PathBuf;
+use std::process::Command;
+/// Build-script entry point.
+///
+/// With the `gpu` feature off this only registers `build.rs` for change
+/// tracking and returns. With it on, every kernel listed below is compiled
+/// from `src/gpu/kernels/<name>.cu` to `$OUT_DIR/<name>.ptx` with nvcc, the
+/// PTX `.version` line is optionally rewritten, and `HTM_GPU_PTX_DIR` is
+/// exported to the crate being built.
+///
+/// Environment inputs: `NVCC` (read by `find_nvcc`), `HTM_CUDA_ARCH`,
+/// `HTM_CUDA_CCBIN`, `HTM_PTX_VERSION`.
+///
+/// Panics if a kernel source is missing, if nvcc cannot be spawned or exits
+/// unsuccessfully, or if a PTX file cannot be read back or rewritten.
+fn main() {
+    // Re-run whenever we edit the build script or any kernel source.
+    println!("cargo:rerun-if-changed=build.rs");
+    let gpu = env::var_os("CARGO_FEATURE_GPU").is_some();
+    if !gpu {
+        return;
+    }
+    // Cargo does not track environment variables unless told to. Without
+    // these directives, changing e.g. HTM_CUDA_ARCH between builds would not
+    // re-run this script and the previously generated PTX would be reused.
+    for var in ["NVCC", "HTM_CUDA_ARCH", "HTM_CUDA_CCBIN", "HTM_PTX_VERSION"] {
+        println!("cargo:rerun-if-env-changed={var}");
+    }
+    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR"));
+    let arch = env::var("HTM_CUDA_ARCH").unwrap_or_else(|_| "sm_86".into());
+    // Base kernels — compile for any sm_80+ GPU. Each .cu file → one .ptx file.
+    let base_kernels: &[&str] = &[
+        "sp_overlap",
+        "sp_topk",
+        "sp_learn",
+        "sp_duty",
+        "sp_boost_fused",
+        "tm_predict",
+        "tm_activate",
+        "tm_learn",
+        "tm_punish",
+        "tm_grow",
+        "tm_anomaly",
+        "tm_reset",
+    ];
+    // htm_fused_step now compiles for ALL architectures (sm_80+).
+    // On Hopper (sm_90+): uses cluster-distributed shared memory for hot state.
+    // On Ampere (sm_86) and other pre-Hopper: uses global memory reads/writes
+    // with grid.sync() for cross-block synchronization (cooperative launch).
+    let kernels: Vec<&str> = base_kernels.iter().chain(["htm_fused_step"].iter()).copied().collect();
+    let kernels_dir = PathBuf::from("src/gpu/kernels");
+    for k in &kernels {
+        let src = kernels_dir.join(format!("{k}.cu"));
+        println!("cargo:rerun-if-changed={}", src.display());
+    }
+    let nvcc = find_nvcc();
+    println!("cargo:warning=htm_rust: nvcc = {nvcc}");
+    println!("cargo:warning=htm_rust: target arch = {arch}");
+    // Prefer gcc-12 if present (CUDA 12.1 doesn't support gcc-13+ headers).
+    // An explicit HTM_CUDA_CCBIN always wins; with neither available nvcc is
+    // left to pick its default host compiler.
+    let host_compiler = env::var("HTM_CUDA_CCBIN")
+        .ok()
+        .or_else(|| {
+            for cand in ["/usr/bin/gcc-12", "/usr/bin/gcc-11"] {
+                if std::path::Path::new(cand).exists() {
+                    return Some(cand.to_string());
+                }
+            }
+            None
+        });
+    // Optionally patch the emitted PTX `.version` header down to match an
+    // older driver. Useful when the system driver (e.g. on WSL2) is older
+    // than the nvcc toolchain. Set HTM_PTX_VERSION to e.g. "7.8" or "8.0".
+    let ptx_version_override = env::var("HTM_PTX_VERSION").ok();
+    for k in kernels {
+        let src = kernels_dir.join(format!("{k}.cu"));
+        let ptx = out_dir.join(format!("{k}.ptx"));
+        if !src.exists() {
+            panic!("missing kernel source: {}", src.display());
+        }
+        let mut cmd = Command::new(&nvcc);
+        // Note: `--use_fast_math` breaks bit-parity with host `expf`, which
+        // in turn flips boost tie-breaks in SP learning. We accept the tiny
+        // perf loss for correctness; the hot overlap kernel has no transcendentals.
+        cmd.args([
+            "--ptx",
+            "-O3",
+            "-rdc=true",
+            "-arch",
+            &arch,
+        ]);
+        // `cooperative_groups::this_cluster()` is not declared for Ampere
+        // device compiles in CUDA 12.x, even if guarded by __CUDA_ARCH__ in
+        // some nvcc front-end phases. Define an explicit build-time kill
+        // switch for all non-Hopper targets so sm_86/A10G only sees the
+        // cooperative-grid path.
+        // NOTE(review): this is a plain prefix test, so every arch string
+        // that does not begin with "sm_90" (including any later `sm_*`
+        // value) also gets the kill switch — confirm that is intended.
+        if !arch.starts_with("sm_90") {
+            cmd.arg("-DHTM_DISABLE_CLUSTER=1");
+        }
+        if let Some(cc) = &host_compiler {
+            cmd.args(["-ccbin", cc]);
+        }
+        cmd.arg("-o").arg(&ptx).arg(&src);
+        let status = cmd
+            .status()
+            .unwrap_or_else(|e| panic!("failed to spawn nvcc: {e}"));
+        if !status.success() {
+            panic!("nvcc failed for {}", src.display());
+        }
+        if let Some(ver) = &ptx_version_override {
+            // Read, patch, write.
+            let text = std::fs::read_to_string(&ptx)
+                .unwrap_or_else(|e| panic!("read {} failed: {e}", ptx.display()));
+            // Replace every line that begins with `.version ` (ignoring
+            // leading whitespace) by `.version <override>`; all other lines
+            // pass through unchanged. `lines()` + `join("\n")` normalises
+            // line endings to `\n` and does not re-add a final newline.
+            let patched: String = text
+                .lines()
+                .map(|line| {
+                    let t = line.trim_start();
+                    if t.starts_with(".version ") {
+                        format!(".version {ver}")
+                    } else {
+                        line.to_string()
+                    }
+                })
+                .collect::<Vec<_>>()
+                .join("\n");
+            std::fs::write(&ptx, patched)
+                .unwrap_or_else(|e| panic!("write {} failed: {e}", ptx.display()));
+        }
+    }
+    // Export OUT_DIR for include_str! in Rust.
+    println!(
+        "cargo:rustc-env=HTM_GPU_PTX_DIR={}",
+        out_dir.display()
+    );
+}
+/// Locates the nvcc executable used to compile the kernels.
+///
+/// Lookup order:
+///   1. `$NVCC`, returned verbatim when set to a non-blank value (it is not
+///      checked for existence here);
+///   2. `nvcc` on `PATH`, accepted when `nvcc --version` can be spawned;
+///   3. the fixed toolkit locations in `FALLBACKS`, first existing path wins.
+///
+/// Panics when none of these yields a compiler.
+fn find_nvcc() -> String {
+    const FALLBACKS: [&str; 3] = [
+        "/usr/local/cuda-12.1/bin/nvcc",
+        "/usr/local/cuda/bin/nvcc",
+        "/usr/local/cuda-12/bin/nvcc",
+    ];
+    if let Ok(n) = env::var("NVCC") {
+        // A blank value (e.g. `NVCC=` from an unset build arg) is treated as
+        // "not set" so the lookup falls through instead of trying to spawn
+        // an empty program name later.
+        if !n.trim().is_empty() {
+            return n;
+        }
+    }
+    // Try PATH.
+    if Command::new("nvcc").arg("--version").output().is_ok() {
+        return "nvcc".into();
+    }
+    if let Some(found) = FALLBACKS
+        .iter()
+        .copied()
+        .find(|cand| std::path::Path::new(cand).exists())
+    {
+        return found.into();
+    }
+    // Build the message from FALLBACKS so it always names every location
+    // that was actually probed.
+    panic!(
+        "nvcc not found. Set $NVCC or install CUDA toolkit. Tried PATH, {}.",
+        FALLBACKS.join(", ")
+    );
+}

overlay/htm_rust/pyproject.toml CHANGED Viewed

@@ -1,17 +1,17 @@
-[build-system]
-requires = ["maturin>=1.4,<2.0"]
-build-backend = "maturin"
-[project]
-name = "htm_rust"
-version = "0.1.0"
-description = "Numenta BAMI-spec HTM (Spatial Pooler + Temporal Memory) in Rust with pyo3 bindings"
-requires-python = ">=3.11"
-classifiers = [
-    "Programming Language :: Rust",
-    "Programming Language :: Python :: Implementation :: CPython",
-]
-[tool.maturin]
-features = ["pyo3/extension-module"]
-module-name = "htm_rust"

+[build-system]
+requires = ["maturin>=1.4,<2.0"]
+build-backend = "maturin"
+[project]
+name = "htm_rust"
+version = "0.1.0"
+description = "Numenta BAMI-spec HTM (Spatial Pooler + Temporal Memory) in Rust with pyo3 bindings"
+requires-python = ">=3.11"
+classifiers = [
+    "Programming Language :: Rust",
+    "Programming Language :: Python :: Implementation :: CPython",
+]
+[tool.maturin]
+features = ["pyo3/extension-module"]
+module-name = "htm_rust"

overlay/htm_rust/src/gpu/fused.rs CHANGED Viewed

@@ -1,663 +1,702 @@
-//! Fused HTM megakernel launcher.
-//!
-//! Collapses the 12-kernel per-timestep pipeline (and the outer T-loop) into
-//! a single kernel launch per forward. See `kernels/htm_fused_step.cu` for
-//! the kernel design and the cross-block coherence strategy (grid barrier
-//! via device counter with all blocks concurrently resident).
-//!
-//! Launch invariant: `grid_dim.x <= concurrent-block capacity`. Host code
-//! probes the device SM count at construction and caps grid_dim.x
-//! accordingly — otherwise the grid barrier deadlocks.
-//!
-//! Semantic change from the top-K pipeline: activation is per-column
-//! threshold-based (local lateral inhibition) instead of global top-K.
-//! A per-column `inhibition_threshold` is tracked and EMA-steered to hit
-//! the sparsity target. This is a real architectural change and is
-//! documented in `docs/GPU_HTM.md`.
-#![cfg(feature = "gpu")]
-use std::ffi::CString;
-use std::sync::Arc;
-use cudarc::driver::{result, sys, CudaDevice, CudaSlice, DeviceRepr, DevicePtr, DriverError,
-                      LaunchConfig};
-use cudarc::nvrtc::Ptx;
-use super::sp_gpu::SpatialPoolerGpu;
-use super::tm_gpu::{TemporalMemoryGpu, MAX_SEGMENTS_PER_CELL, MAX_SYN_PER_SEGMENT};
-const PTX_HTM_FUSED: &str =
-    include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/htm_fused_step.ptx"));
-/// Struct-by-value pointer pack — matches C-side `FusedPtrs`.
-///
-/// NOTE: `barrier_counters` is kept as an ABI-compat dummy (always 0). The
-/// C-side `FusedPtrs` still has the field at the same byte offset; removing
-/// it here would shift all subsequent fields and break the layout. Worker A
-/// will eventually delete the field from both sides once the kernel is
-/// updated; until then we zero it.
-#[repr(C)]
-#[derive(Clone, Copy)]
-pub struct FusedPtrs {
-    pub syn_bit: u64,
-    pub syn_perm: u64,
-    pub boost: u64,
-    pub active_duty: u64,
-    pub inhibition_threshold: u64,
-    pub seg_cell_id: u64,
-    pub seg_syn_count: u64,
-    pub syn_presyn: u64,
-    pub tm_syn_perm: u64,
-    pub cell_seg_count: u64,
-    pub cell_active_a: u64,
-    pub cell_active_b: u64,
-    pub cell_winner_a: u64,
-    pub cell_winner_b: u64,
-    pub inputs: u64,
-    pub cols_out: u64,
-    pub anom_out: u64,
-    /// ABI-compat dummy — always 0. No device memory is allocated for this
-    /// field; the cluster barrier replaces the old software DLB barrier.
-    pub barrier_counters: u64,
-    pub step_scratch: u64,
-}
-unsafe impl DeviceRepr for FusedPtrs {}
-/// Launch-time config — matches C-side `FusedConfig` 1:1.
-#[repr(C)]
-#[derive(Clone, Copy)]
-pub struct FusedConfig {
-    pub input_bits: u32,
-    pub n_columns: u32,
-    pub synapses_per_col: u32,
-    pub conn_thr: f32,
-    pub sp_inc: f32,
-    pub sp_dec: f32,
-    pub sparsity_target: f32,
-    pub duty_alpha: f32,
-    pub thr_adapt_rate: f32,
-    pub cells_per_column: u32,
-    pub n_cells: u32,
-    pub bits_words: u32,
-    pub max_segments_per_cell: u32,
-    pub synapses_per_segment: u32,
-    pub activation_threshold: u32,
-    pub learning_threshold: u32,
-    pub max_new_synapses: u32,
-    pub conn_thr_i16: i32,
-    pub perm_inc_i16: i32,
-    pub perm_dec_i16: i32,
-    pub predicted_seg_dec_i16: i32,
-    pub initial_perm_i16: i32,
-    pub t: u32,
-    pub learn: u32,
-    pub iter_seed: u32,
-    pub cooperative_grid_sync: u32,
-}
-unsafe impl DeviceRepr for FusedConfig {}
-/// Cluster launch parameters probed at construction time.
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub(crate) struct ClusterInfo {
-    /// Maximum cluster size supported by this device (0 = cluster unsupported).
-    pub max_cluster_size: u32,
-}
-// There is only ONE launch mode: non-cooperative launch with Hopper Thread
-// Block Cluster attribute (`CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION`). The old
-// software DLB barrier and the cooperative-launch path are both removed.
-// Cluster barriers replace both.
-#[derive(Clone, Copy, Debug, PartialEq, Eq)]
-pub(crate) struct FusedLaunchPlan {
-    pub grid_dim_x: u32,
-    pub block_dim_x: u32,
-    pub cooperative_grid_limit: u32,
-    pub sm_count: u32,
-}
-fn fused_grid_cap_override() -> Option<u32> {
-    std::env::var("HTM_FUSED_GRID_CAP")
-        .ok()
-        .and_then(|s| s.parse::<u32>().ok())
-        .map(|v| v.max(1))
-}
-pub(crate) fn plan_fused_launch(
-    sm_count: u32,
-    cooperative_supported: bool,
-    cooperative_grid_limit: u32,
-    grid_cap_override: Option<u32>,
-) -> Result<FusedLaunchPlan, String> {
-    let sm_count = sm_count.max(1);
-    // 1024 threads/block exceeds the register file on Ampere (sm_86: 65536
-    // regs/SM ÷ 1024 = 64 regs/thread; fused kernel needs ~80+). 256 gives
-    // 256 regs/thread which is ample. Compensate with more blocks via
-    // cooperative launch. On Hopper (228 KB smem, 255 regs/thread baseline),
-    // 1024 works fine, but 256 is safe everywhere.
-    let block_dim_x = 256u32;
-    // Cluster launch path: cooperative launch is not required. Keep the probe
-    // result for residency estimation only.
-    if !cooperative_supported {
-        eprintln!("[htm_rust] INFO: cooperative launch unsupported; cluster path only.");
-    }
-    // Tested grid_cap: 4 blocks = 30ms (too serial), 16 blocks = 10.8ms (parallel wins).
-    // Parallelism in SP overlap + TM predict stages outweighs grid.sync() cost.
-    let default_grid_cap = 16u32;
-    let grid_cap = grid_cap_override.unwrap_or(default_grid_cap);
-    let resident_bound = if cooperative_grid_limit > 0 {
-        cooperative_grid_limit.max(sm_count * 2)
-    } else {
-        sm_count * 2
-    };
-    Ok(FusedLaunchPlan {
-        grid_dim_x: resident_bound.min(grid_cap).max(1),
-        block_dim_x,
-        cooperative_grid_limit: resident_bound,
-        sm_count,
-    })
-}
-pub(super) struct RawFusedKernel {
-    module: sys::CUmodule,
-    pub(super) function: sys::CUfunction,
-    pub(super) function_batched: sys::CUfunction,
-}
-unsafe impl Send for RawFusedKernel {}
-unsafe impl Sync for RawFusedKernel {}
-impl Drop for RawFusedKernel {
-    fn drop(&mut self) {
-        unsafe {
-            let _ = result::module::unload(self.module);
-        }
-    }
-}
-/// Owns fused-path-only device state:
-///   - per-column inhibition threshold (replaces global top-K)
-///   - ping-pong cell_active/cell_winner bitsets
-///   - step_scratch (n_active, n_unpred per timestep)
-///   - cluster launch capability info
-pub struct FusedState {
-    dev: Arc<CudaDevice>,
-    pub(super) raw_kernel: RawFusedKernel,
-    pub inhibition_threshold: CudaSlice<f32>,
-    pub cell_active_bits_a: CudaSlice<u32>,
-    pub cell_active_bits_b: CudaSlice<u32>,
-    pub cell_winner_bits_a: CudaSlice<u32>,
-    pub cell_winner_bits_b: CudaSlice<u32>,
-    pub step_scratch: CudaSlice<u32>,       // length 6
-    pub grid_dim_x: u32,
-    pub block_dim_x: u32,
-    pub cooperative_grid_limit: u32,
-    pub iter_counter: u32,
-    /// Hopper cluster launch capability (0 = unsupported).
-    pub cluster_info: ClusterInfo,
-    // Config mirror (read-only after init).
-    #[allow(dead_code)]
-    pub initial_threshold: f32,
-}
-impl FusedState {
-    pub fn new(
-        dev: Arc<CudaDevice>,
-        n_columns: usize,
-        cells_per_column: usize,
-        initial_threshold: f32,
-    ) -> Result<Self, DriverError> {
-        let n_cells = n_columns * cells_per_column;
-        assert!(n_cells % 32 == 0, "n_cells must be divisible by 32 for bitsets");
-        let bits_words = n_cells / 32;
-        let mut inhibition_threshold = dev.alloc_zeros::<f32>(n_columns)?;
-        let init_vec = vec![initial_threshold; n_columns];
-        dev.htod_sync_copy_into(&init_vec, &mut inhibition_threshold)?;
-        let cell_active_bits_a = dev.alloc_zeros::<u32>(bits_words)?;
-        let cell_active_bits_b = dev.alloc_zeros::<u32>(bits_words)?;
-        let cell_winner_bits_a = dev.alloc_zeros::<u32>(bits_words)?;
-        let cell_winner_bits_b = dev.alloc_zeros::<u32>(bits_words)?;
-        let step_scratch = dev.alloc_zeros::<u32>(6)?;
-        unsafe {
-            result::ctx::set_current(*dev.cu_primary_ctx())?;
-        }
-        if dev.get_func("htm_fused", "htm_fused_step").is_none() {
-            dev.load_ptx(
-                Ptx::from_src(PTX_HTM_FUSED),
-                "htm_fused",
-                &["htm_fused_step", "htm_fused_step_batched"],
-            )?;
-        }
-        let ptx = CString::new(PTX_HTM_FUSED).expect("PTX contains no interior nul bytes");
-        let module = unsafe { result::module::load_data(ptx.as_ptr().cast()) }?;
-        let function = unsafe {
-            result::module::get_function(module, CString::new("htm_fused_step").unwrap())
-        }?;
-        let function_batched = unsafe {
-            result::module::get_function(module, CString::new("htm_fused_step_batched").unwrap())
-        }?;
-        // Cluster size 16 on Hopper is "non-portable" (> 8 requires opt-in).
-        // Must set CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED=1 on
-        // every launched kernel function, otherwise cuLaunchKernelEx rejects
-        // the cluster dim with CUDA_ERROR_INVALID_CLUSTER_SIZE.
-        unsafe {
-            let attr = sys::CUfunction_attribute::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED;
-            // Ignore errors: older CUDA may lack the attribute, in which case
-            // only portable sizes (<= 8) work — plan_fused_launch caps at 8.
-            let _ = sys::lib().cuFuncSetAttribute(function, attr, 1);
-            let _ = sys::lib().cuFuncSetAttribute(function_batched, attr, 1);
-        }
-        // Probe SM count.
-        let sm_count = match dev.attribute(
-            cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-        ) {
-            Ok(v) => v as u32,
-            Err(_) => 16u32,
-        };
-        // T1: Probe Hopper cluster launch capability.
-        let max_cluster_size = match dev.attribute(
-            cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH,
-        ) {
-            Ok(v) if v > 0 => {
-                // H200/sm_90a supports up to 16 blocks per cluster.
-                // There is no MAX_CLUSTER_SIZE attribute in CUDA 12.4; hard-code the
-                // Hopper maximum which is 16 (8 SMs × 2 blocks/SM = 16 blocks/cluster).
-                16u32
-            }
-            _ => 0u32,
-        };
-        eprintln!("[htm_rust] cluster: max_cluster_size={}", max_cluster_size);
-        let cluster_info = ClusterInfo { max_cluster_size };
-        let cooperative_supported = matches!(
-            dev.attribute(sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH),
-            Ok(v) if v > 0
-        );
-        let cooperative_grid_limit = if cooperative_supported {
-            let blocks_per_sm = unsafe {
-                result::occupancy::max_active_block_per_multiprocessor(function, 1024, 0)
-            }
-            .ok()
-            .map(|v| v.max(0) as u32)
-            .unwrap_or(0);
-            sm_count.saturating_mul(blocks_per_sm)
-        } else {
-            0
-        };
-        let launch_plan = plan_fused_launch(
-            sm_count,
-            cooperative_supported,
-            cooperative_grid_limit,
-            fused_grid_cap_override(),
-        )
-        .map_err(|msg| {
-            // Surface as a CUDA-ish error so callers can propagate.
-            eprintln!("[htm_rust] FATAL: {msg}");
-            DriverError(cudarc::driver::sys::CUresult::CUDA_ERROR_NOT_SUPPORTED)
-        })?;
-        eprintln!(
-            "[htm_rust] fused kernel: sm_count={} grid_dim_x={} cooperative_grid_limit={} cluster_max={}",
-            launch_plan.sm_count, launch_plan.grid_dim_x, launch_plan.cooperative_grid_limit,
-            cluster_info.max_cluster_size,
-        );
-        Ok(Self {
-            dev,
-            raw_kernel: RawFusedKernel { module, function, function_batched },
-            inhibition_threshold,
-            cell_active_bits_a,
-            cell_active_bits_b,
-            cell_winner_bits_a,
-            cell_winner_bits_b,
-            step_scratch,
-            grid_dim_x: launch_plan.grid_dim_x,
-            block_dim_x: launch_plan.block_dim_x,
-            cooperative_grid_limit: launch_plan.cooperative_grid_limit,
-            iter_counter: 0,
-            cluster_info,
-            initial_threshold,
-        })
-    }
-    /// Reset fused state. Called at region.reset().
-    pub fn reset(&mut self) -> Result<(), DriverError> {
-        self.dev.memset_zeros(&mut self.cell_active_bits_a)?;
-        self.dev.memset_zeros(&mut self.cell_active_bits_b)?;
-        self.dev.memset_zeros(&mut self.cell_winner_bits_a)?;
-        self.dev.memset_zeros(&mut self.cell_winner_bits_b)?;
-        self.dev.memset_zeros(&mut self.step_scratch)?;
-        // Do NOT reset inhibition_threshold — it's learned state. A hard
-        // reset of TM state should NOT forget the sparsity calibration.
-        Ok(())
-    }
-}
-/// Launch the fused megakernel. Processes all T timesteps in one kernel.
-///
-/// Uses `cuLaunchKernelEx` with `CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION=(16,1,1)`
-/// when the device supports cluster launch, otherwise falls back to a plain
-/// `launch_kernel`. For single-region launches, grid_dim_x <= 16 ensures the
-/// entire grid fits in one cluster.
-#[allow(clippy::too_many_arguments)]
-pub fn launch_fused(
-    sp: &mut SpatialPoolerGpu,
-    tm: &mut TemporalMemoryGpu,
-    fused: &mut FusedState,
-    inputs_flat: &CudaSlice<u8>,
-    cols_out: &mut CudaSlice<u8>,
-    anom_out: &mut CudaSlice<f32>,
-    t: usize,
-    input_bits: usize,
-    learn: bool,
-) -> Result<(), DriverError> {
-    // Reset step_scratch before each launch (safe re-entry).
-    sp.dev_ref().memset_zeros(&mut fused.step_scratch)?;
-    fused.iter_counter = fused.iter_counter.wrapping_add(1);
-    let cfg = FusedConfig {
-        input_bits: input_bits as u32,
-        n_columns: sp.n_columns_accessor() as u32,
-        synapses_per_col: sp.synapses_per_col_accessor() as u32,
-        conn_thr: sp.conn_thr_accessor(),
-        sp_inc: sp.inc_accessor(),
-        sp_dec: sp.dec_accessor(),
-        sparsity_target: sp.sparsity_accessor(),
-        duty_alpha: 1.0f32 / sp.duty_period_accessor().max(1.0),
-        thr_adapt_rate: 0.001f32,
-        cells_per_column: tm.cells_per_column as u32,
-        n_cells: tm.n_cells as u32,
-        bits_words: tm.bits_words as u32,
-        max_segments_per_cell: MAX_SEGMENTS_PER_CELL as u32,
-        synapses_per_segment: MAX_SYN_PER_SEGMENT as u32,
-        activation_threshold: tm.activation_threshold,
-        learning_threshold: tm.learning_threshold,
-        max_new_synapses: tm.max_new_synapse_count,
-        conn_thr_i16: tm.conn_thr_i16 as i32,
-        perm_inc_i16: tm.perm_inc_i16 as i32,
-        perm_dec_i16: tm.perm_dec_i16 as i32,
-        predicted_seg_dec_i16: tm.predicted_seg_dec_i16 as i32,
-        initial_perm_i16: tm.initial_perm_i16 as i32,
-        t: t as u32,
-        learn: if learn { 1 } else { 0 },
-        iter_seed: fused.iter_counter,
-        cooperative_grid_sync: 1,
-    };
-    let ptrs = FusedPtrs {
-        syn_bit: *sp.syn_bit_accessor().device_ptr(),
-        syn_perm: *sp.syn_perm_accessor().device_ptr(),
-        boost: *sp.boost_accessor().device_ptr(),
-        active_duty: *sp.active_duty_accessor().device_ptr(),
-        inhibition_threshold: *fused.inhibition_threshold.device_ptr(),
-        seg_cell_id: *tm.seg_cell_id_accessor().device_ptr(),
-        seg_syn_count: *tm.seg_syn_count_accessor().device_ptr(),
-        syn_presyn: *tm.syn_presyn_accessor().device_ptr(),
-        tm_syn_perm: *tm.syn_perm_accessor().device_ptr(),
-        cell_seg_count: *tm.cell_seg_count_accessor().device_ptr(),
-        cell_active_a: *fused.cell_active_bits_a.device_ptr(),
-        cell_active_b: *fused.cell_active_bits_b.device_ptr(),
-        cell_winner_a: *fused.cell_winner_bits_a.device_ptr(),
-        cell_winner_b: *fused.cell_winner_bits_b.device_ptr(),
-        inputs: *inputs_flat.device_ptr(),
-        cols_out: *cols_out.device_ptr(),
-        anom_out: *anom_out.device_ptr(),
-        barrier_counters: 0u64,  // ABI-compat dummy; cluster barrier replaces DLB.
-        step_scratch: *fused.step_scratch.device_ptr(),
-    };
-    let grid_x = fused.grid_dim_x;
-    let block_x = fused.block_dim_x;
-    let cu_stream = *sp.dev_ref().cu_stream();
-    let use_cluster = fused.cluster_info.max_cluster_size > 0;
-    unsafe {
-        result::ctx::set_current(*sp.dev_ref().cu_primary_ctx())?;
-        let mut kernel_params: [*mut std::ffi::c_void; 2] = [
-            (&ptrs as *const FusedPtrs).cast_mut().cast(),
-            (&cfg as *const FusedConfig).cast_mut().cast(),
-        ];
-        if use_cluster {
-            // T10: Hopper cluster launch with CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION.
-            // cluster_dim=(16,1,1) maps the entire single-region grid into one cluster.
-            let mut attr: sys::CUlaunchAttribute = std::mem::zeroed();
-            attr.id = sys::CUlaunchAttributeID::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
-            attr.value.clusterDim.x = 16;
-            attr.value.clusterDim.y = 1;
-            attr.value.clusterDim.z = 1;
-            let mut launch_cfg: sys::CUlaunchConfig = std::mem::zeroed();
-            launch_cfg.gridDimX = grid_x;
-            launch_cfg.gridDimY = 1;
-            launch_cfg.gridDimZ = 1;
-            launch_cfg.blockDimX = block_x;
-            launch_cfg.blockDimY = 1;
-            launch_cfg.blockDimZ = 1;
-            launch_cfg.sharedMemBytes = 0;
-            launch_cfg.hStream = cu_stream;
-            launch_cfg.numAttrs = 1;
-            launch_cfg.attrs = &mut attr as *mut sys::CUlaunchAttribute;
-            let ret = sys::lib().cuLaunchKernelEx(
-                &launch_cfg as *const sys::CUlaunchConfig,
-                fused.raw_kernel.function,
-                kernel_params.as_mut_ptr(),
-                std::ptr::null_mut(),
-            );
-            if ret != sys::CUresult::CUDA_SUCCESS {
-                return Err(DriverError(ret));
-            }
-        } else {
-            // Pre-Hopper: cooperative kernel launch. The fused kernel uses
-            // grid.sync() for cross-block synchronization which REQUIRES
-            // cuLaunchCooperativeKernel (normal launch silently crashes on
-            // the first grid.sync() call).
-            let ret = sys::lib().cuLaunchCooperativeKernel(
-                fused.raw_kernel.function,
-                grid_x, 1, 1,
-                block_x, 1, 1,
-                0,  // sharedMemBytes
-                cu_stream,
-                kernel_params.as_mut_ptr(),
-            );
-            if ret != sys::CUresult::CUDA_SUCCESS {
-                return Err(DriverError(ret));
-            }
-        }
-    }
-    Ok(())
-}
-/// Single batched non-cooperative launch for B regions with DLB sync. Uses the same kernel
-/// body; each block reads its region's FusedPtrs from a device-side array
-/// indexed by blockIdx.y. All regions share the same config (same
-/// input_bits/n_columns/etc.) so we pass one FusedConfig.
-///
-/// This breaks through the CUDA cooperative-kernel device-level
-/// serialization: multiple cooperative launches are serialized regardless
-/// of stream, but one cooperative launch with grid.y=B processes all
-/// regions in a single invocation — ~B× speedup vs B sequential launches.
-#[allow(clippy::too_many_arguments)]
-/// Low-level raw-pointer entry, called by PyO3 binding which holds the
-/// mutable borrows. Safety: each `*mut HTMRegionGpu` must point to a live,
-/// uniquely-borrowed region. All regions must be distinct.
-pub(super) fn launch_fused_batched_raw(
-    region_ptrs: &[*mut super::HTMRegionGpu],
-    inputs_per_region: &[u64],
-    cols_per_region: &[u64],
-    anom_per_region: &[u64],
-    t: usize,
-    input_bits: usize,
-    learn: bool,
-) -> Result<(), DriverError> {
-    let b = region_ptrs.len();
-    assert_eq!(inputs_per_region.len(), b);
-    assert_eq!(cols_per_region.len(), b);
-    assert_eq!(anom_per_region.len(), b);
-    assert!(b >= 1, "need at least one region");
-    // Reset per-region step_scratch before each launch.
-    for &rp in region_ptrs.iter() {
-        let r = unsafe { &mut *rp };
-        let dev = r.sp_gpu.dev_ref().clone();
-        dev.memset_zeros(&mut r.fused_state.step_scratch)?;
-        r.fused_state.iter_counter = r.fused_state.iter_counter.wrapping_add(1);
-    }
-    // Shared config — all regions use identical sp/tm parameters.
-    let (grid_x, block_x, function_batched, cu_stream, cu_ctx) = {
-        let r0 = unsafe { &*region_ptrs[0] };
-        (
-            r0.fused_state.grid_dim_x,
-            r0.fused_state.block_dim_x,
-            r0.fused_state.raw_kernel.function_batched,
-            *r0.sp_gpu.dev_ref().cu_stream(),
-            *r0.sp_gpu.dev_ref().cu_primary_ctx(),
-        )
-    };
-    let cfg = {
-        let r = unsafe { &*region_ptrs[0] };
-        FusedConfig {
-            input_bits: input_bits as u32,
-            n_columns: r.sp_gpu.n_columns_accessor() as u32,
-            synapses_per_col: r.sp_gpu.synapses_per_col_accessor() as u32,
-            conn_thr: r.sp_gpu.conn_thr_accessor(),
-            sp_inc: r.sp_gpu.inc_accessor(),
-            sp_dec: r.sp_gpu.dec_accessor(),
-            sparsity_target: r.sp_gpu.sparsity_accessor(),
-            duty_alpha: 1.0f32 / r.sp_gpu.duty_period_accessor().max(1.0),
-            thr_adapt_rate: 0.001f32,
-            cells_per_column: r.tm_gpu.cells_per_column as u32,
-            n_cells: r.tm_gpu.n_cells as u32,
-            bits_words: r.tm_gpu.bits_words as u32,
-            max_segments_per_cell: MAX_SEGMENTS_PER_CELL as u32,
-            synapses_per_segment: MAX_SYN_PER_SEGMENT as u32,
-            activation_threshold: r.tm_gpu.activation_threshold,
-            learning_threshold: r.tm_gpu.learning_threshold,
-            max_new_synapses: r.tm_gpu.max_new_synapse_count,
-            conn_thr_i16: r.tm_gpu.conn_thr_i16 as i32,
-            perm_inc_i16: r.tm_gpu.perm_inc_i16 as i32,
-            perm_dec_i16: r.tm_gpu.perm_dec_i16 as i32,
-            predicted_seg_dec_i16: r.tm_gpu.predicted_seg_dec_i16 as i32,
-            initial_perm_i16: r.tm_gpu.initial_perm_i16 as i32,
-            t: t as u32,
-            learn: if learn { 1 } else { 0 },
-            iter_seed: r.fused_state.iter_counter,
-            cooperative_grid_sync: 1,
-        }
-    };
-    // Build B FusedPtrs per-region.
-    let ptrs_vec: Vec<FusedPtrs> = (0..b)
-        .map(|i| {
-            let r = unsafe { &*region_ptrs[i] };
-            FusedPtrs {
-                syn_bit: *r.sp_gpu.syn_bit_accessor().device_ptr(),
-                syn_perm: *r.sp_gpu.syn_perm_accessor().device_ptr(),
-                boost: *r.sp_gpu.boost_accessor().device_ptr(),
-                active_duty: *r.sp_gpu.active_duty_accessor().device_ptr(),
-                inhibition_threshold: *r.fused_state.inhibition_threshold.device_ptr(),
-                seg_cell_id: *r.tm_gpu.seg_cell_id_accessor().device_ptr(),
-                seg_syn_count: *r.tm_gpu.seg_syn_count_accessor().device_ptr(),
-                syn_presyn: *r.tm_gpu.syn_presyn_accessor().device_ptr(),
-                tm_syn_perm: *r.tm_gpu.syn_perm_accessor().device_ptr(),
-                cell_seg_count: *r.tm_gpu.cell_seg_count_accessor().device_ptr(),
-                cell_active_a: *r.fused_state.cell_active_bits_a.device_ptr(),
-                cell_active_b: *r.fused_state.cell_active_bits_b.device_ptr(),
-                cell_winner_a: *r.fused_state.cell_winner_bits_a.device_ptr(),
-                cell_winner_b: *r.fused_state.cell_winner_bits_b.device_ptr(),
-                inputs: inputs_per_region[i],
-                cols_out: cols_per_region[i],
-                anom_out: anom_per_region[i],
-                barrier_counters: 0u64,  // ABI-compat dummy; cluster barrier replaces DLB.
-                step_scratch: *r.fused_state.step_scratch.device_ptr(),
-            }
-        })
-        .collect();
-    // Upload FusedPtrs array to device (B * sizeof(FusedPtrs) bytes).
-    // FusedPtrs is repr(C) + DeviceRepr so htod_sync_copy handles it.
-    let dev = unsafe { &*region_ptrs[0] }.sp_gpu.dev_ref().clone();
-    let ptrs_dev: CudaSlice<FusedPtrs> = dev.htod_sync_copy(&ptrs_vec)?;
-    let ptrs_dev_ptr: u64 = *ptrs_dev.device_ptr();
-    // T10: Cluster launch for batched regions.
-    // Grid = (grid_x, B, 1) with cluster_dim=(16,1,1): each region (Y slice)
-    // occupies exactly one cluster of 16 blocks. All 8 clusters run concurrently
-    // on the H200's 132 SMs (8 × 16 = 128 blocks ≤ 132 SMs).
-    let use_cluster = {
-        let r0 = unsafe { &*region_ptrs[0] };
-        r0.fused_state.cluster_info.max_cluster_size > 0
-    };
-    unsafe {
-        result::ctx::set_current(cu_ctx)?;
-        let mut kernel_params: [*mut std::ffi::c_void; 2] = [
-            (&ptrs_dev_ptr as *const u64).cast_mut().cast(),
-            (&cfg as *const FusedConfig).cast_mut().cast(),
-        ];
-        if use_cluster {
-            let mut attr: sys::CUlaunchAttribute = std::mem::zeroed();
-            attr.id = sys::CUlaunchAttributeID::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
-            attr.value.clusterDim.x = 16;
-            attr.value.clusterDim.y = 1;
-            attr.value.clusterDim.z = 1;
-            let mut launch_cfg: sys::CUlaunchConfig = std::mem::zeroed();
-            launch_cfg.gridDimX = grid_x;
-            launch_cfg.gridDimY = b as u32;
-            launch_cfg.gridDimZ = 1;
-            launch_cfg.blockDimX = block_x;
-            launch_cfg.blockDimY = 1;
-            launch_cfg.blockDimZ = 1;
-            launch_cfg.sharedMemBytes = 0;
-            launch_cfg.hStream = cu_stream;
-            launch_cfg.numAttrs = 1;
-            launch_cfg.attrs = &mut attr as *mut sys::CUlaunchAttribute;
-            let ret = sys::lib().cuLaunchKernelEx(
-                &launch_cfg as *const sys::CUlaunchConfig,
-                function_batched,
-                kernel_params.as_mut_ptr(),
-                std::ptr::null_mut(),
-            );
-            if ret != sys::CUresult::CUDA_SUCCESS {
-                return Err(DriverError(ret));
-            }
-        } else {
-            // Pre-Hopper: cooperative kernel launch (grid.sync() requires it).
-            let ret = sys::lib().cuLaunchCooperativeKernel(
-                function_batched,
-                grid_x, b as u32, 1,
-                block_x, 1, 1,
-                0,  // sharedMemBytes
-                cu_stream,
-                kernel_params.as_mut_ptr(),
-            );
-            if ret != sys::CUresult::CUDA_SUCCESS {
-                return Err(DriverError(ret));
-            }
-        }
-    }
-    Ok(())
-}

+//! Fused HTM megakernel launcher.
+//!
+//! Collapses the 12-kernel per-timestep pipeline (and the outer T-loop) into
+//! a single kernel launch per forward. See `kernels/htm_fused_step.cu` for
+//! the kernel design and the cross-block coherence strategy (grid barrier
+//! via device counter with all blocks concurrently resident).
+//!
+//! Launch invariant: `grid_dim.x <= concurrent-block capacity`. Host code
+//! probes the device SM count at construction and caps grid_dim.x
+//! accordingly — otherwise the grid barrier deadlocks.
+//!
+//! Semantic change from the top-K pipeline: activation is per-column
+//! threshold-based (local lateral inhibition) instead of global top-K.
+//! A per-column `inhibition_threshold` is tracked and EMA-steered to hit
+//! the sparsity target. This is a real architectural change and is
+//! documented in `docs/GPU_HTM.md`.
+#![cfg(feature = "gpu")]
+use std::ffi::CString;
+use std::sync::Arc;
+use cudarc::driver::{result, sys, CudaDevice, CudaSlice, DeviceRepr, DevicePtr, DriverError,
+                      LaunchConfig};
+use cudarc::nvrtc::Ptx;
+use super::sp_gpu::SpatialPoolerGpu;
+use super::tm_gpu::{TemporalMemoryGpu, MAX_SEGMENTS_PER_CELL, MAX_SYN_PER_SEGMENT};
+/// PTX text of the fused kernel module, embedded into the binary at compile
+/// time from `$HTM_GPU_PTX_DIR/htm_fused_step.ptx`. `env!` is evaluated when
+/// this crate is compiled, so `HTM_GPU_PTX_DIR` must be set at build time.
+const PTX_HTM_FUSED: &str =
+    include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/htm_fused_step.ptx"));
+/// Struct-by-value pointer pack — matches C-side `FusedPtrs`.
+///
+/// Every field is a raw 64-bit device address. The launchers in this file
+/// fill them from `device_ptr()` of the owning `CudaSlice`s (or from
+/// caller-supplied `u64` addresses for `inputs` / `cols_out` / `anom_out`
+/// in the batched path). Field order is part of the ABI: do not reorder.
+///
+/// NOTE: `barrier_counters` is kept as an ABI-compat dummy (always 0). The
+/// C-side `FusedPtrs` still has the field at the same byte offset; removing
+/// it here would shift all subsequent fields and break the layout. Worker A
+/// will eventually delete the field from both sides once the kernel is
+/// updated; until then we zero it.
+#[repr(C)]
+#[derive(Clone, Copy)]
+pub struct FusedPtrs {
+    // --- Spatial pooler buffers ---
+    pub syn_bit: u64,
+    pub syn_perm: u64,
+    pub boost: u64,
+    pub active_duty: u64,
+    // Per-column threshold owned by `FusedState`.
+    pub inhibition_threshold: u64,
+    // --- Temporal memory buffers ---
+    pub seg_cell_id: u64,
+    pub seg_syn_count: u64,
+    pub syn_presyn: u64,
+    pub tm_syn_perm: u64,
+    pub cell_seg_count: u64,
+    // --- Ping-pong cell bitsets owned by `FusedState` ---
+    pub cell_active_a: u64,
+    pub cell_active_b: u64,
+    pub cell_winner_a: u64,
+    pub cell_winner_b: u64,
+    // --- Per-launch input/output buffers ---
+    pub inputs: u64,
+    pub cols_out: u64,
+    pub anom_out: u64,
+    /// ABI-compat dummy — always 0. No device memory is allocated for this
+    /// field; the cluster barrier replaces the old software DLB barrier.
+    pub barrier_counters: u64,
+    // Scratch counters owned by `FusedState`; zeroed before every launch.
+    pub step_scratch: u64,
+}
+// SAFETY: `FusedPtrs` is `repr(C)`, `Copy`, and consists solely of `u64`
+// fields, so it can be copied to the device / passed as a kernel parameter
+// as plain bytes.
+unsafe impl DeviceRepr for FusedPtrs {}
+/// Launch-time config — matches C-side `FusedConfig` 1:1.
+///
+/// Passed by value as the second kernel parameter. Field order and widths
+/// are part of the ABI shared with the kernel: do not reorder or resize.
+#[repr(C)]
+#[derive(Clone, Copy)]
+pub struct FusedConfig {
+    // --- Spatial pooler parameters ---
+    pub input_bits: u32,
+    pub n_columns: u32,
+    pub synapses_per_col: u32,
+    pub conn_thr: f32,
+    pub sp_inc: f32,
+    pub sp_dec: f32,
+    pub sparsity_target: f32,
+    // The launchers set this to `1.0 / max(duty_period, 1.0)`.
+    pub duty_alpha: f32,
+    // The launchers currently hard-code this to 0.001.
+    pub thr_adapt_rate: f32,
+    // --- Temporal memory parameters ---
+    pub cells_per_column: u32,
+    pub n_cells: u32,
+    pub bits_words: u32,
+    pub max_segments_per_cell: u32,
+    pub synapses_per_segment: u32,
+    pub activation_threshold: u32,
+    pub learning_threshold: u32,
+    pub max_new_synapses: u32,
+    // The `*_i16` values are widened to i32 by the launchers before upload.
+    pub conn_thr_i16: i32,
+    pub perm_inc_i16: i32,
+    pub perm_dec_i16: i32,
+    pub predicted_seg_dec_i16: i32,
+    pub initial_perm_i16: i32,
+    // --- Per-launch values ---
+    // Number of timesteps handled by this launch.
+    pub t: u32,
+    // Boolean flag encoded as 0 / 1.
+    pub learn: u32,
+    // Taken from `FusedState::iter_counter`, which is bumped on every launch.
+    pub iter_seed: u32,
+    // The launchers in this file always set this to 1.
+    pub cooperative_grid_sync: u32,
+}
+// SAFETY: `FusedConfig` is `repr(C)`, `Copy`, and consists solely of
+// `u32` / `i32` / `f32` fields, so it can be passed to the kernel as plain bytes.
+unsafe impl DeviceRepr for FusedConfig {}
+/// Cluster launch parameters probed at construction time.
+///
+/// `FusedState::new` fills this from the device's cluster-launch attribute:
+/// it stores 16 when the attribute reports support and 0 otherwise. The
+/// launchers treat any non-zero value as "use the cluster launch path".
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub(crate) struct ClusterInfo {
+    /// Maximum cluster size supported by this device (0 = cluster unsupported).
+    pub max_cluster_size: u32,
+}
+// Preferred launch mode: non-cooperative launch with the Hopper Thread
+// Block Cluster attribute (`CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION`). The old
+// software DLB barrier is removed. Note that `launch_fused` still falls back
+// to `cuLaunchCooperativeKernel` when the device reports no cluster support,
+// so the cooperative limit recorded here remains relevant on that path.
+/// Launch geometry computed once by `plan_fused_launch`.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub(crate) struct FusedLaunchPlan {
+    /// Number of blocks along X for a single-region launch (always >= 1).
+    pub grid_dim_x: u32,
+    /// Threads per block.
+    pub block_dim_x: u32,
+    /// Resident-block bound used to size cooperative (non-cluster) launches.
+    pub cooperative_grid_limit: u32,
+    /// SM count used for planning (clamped to >= 1).
+    pub sm_count: u32,
+}
+/// Reads the optional `HTM_FUSED_GRID_CAP` environment override.
+///
+/// Returns `None` when the variable is unset, not valid Unicode, or does not
+/// parse as a `u32`. Surrounding whitespace is ignored so that values such as
+/// `"16 "` (common with shell/env-file quoting) are accepted instead of being
+/// silently dropped. A parsed value of 0 is raised to 1 so the grid is never
+/// empty.
+fn fused_grid_cap_override() -> Option<u32> {
+    let raw = std::env::var("HTM_FUSED_GRID_CAP").ok()?;
+    raw.trim().parse::<u32>().ok().map(|cap| cap.max(1))
+}
+/// Computes the launch geometry for the fused kernel.
+///
+/// * `sm_count` — device SM count; values below 1 are treated as 1.
+/// * `cooperative_supported` — whether the device reports cooperative launch
+///   support. Only used for an informational log line.
+/// * `cooperative_grid_limit` — occupancy-derived block limit (0 = unknown).
+/// * `grid_cap_override` — optional cap on `grid_dim_x`; defaults to 16.
+///
+/// Returns a plan whose `grid_dim_x` is `min(resident_bound, grid_cap)`
+/// (never 0), where `resident_bound = max(cooperative_grid_limit, 2 * sm_count)`.
+/// The same `resident_bound` is reported as `cooperative_grid_limit` in the
+/// plan. The function currently never returns `Err`; the `Result` is kept for
+/// interface stability.
+pub(crate) fn plan_fused_launch(
+    sm_count: u32,
+    cooperative_supported: bool,
+    cooperative_grid_limit: u32,
+    grid_cap_override: Option<u32>,
+) -> Result<FusedLaunchPlan, String> {
+    let sm_count = sm_count.max(1);
+    // 1024 threads/block exceeds the register file on Ampere (sm_86: 65536
+    // regs/SM ÷ 1024 = 64 regs/thread; fused kernel needs ~80+). 256 gives
+    // 256 regs/thread which is ample. Compensate with more blocks via
+    // cooperative launch. On Hopper (228 KB smem, 255 regs/thread baseline),
+    // 1024 works fine, but 256 is safe everywhere.
+    let block_dim_x = 256u32;
+    // Cluster launch path: cooperative launch is not required. Keep the probe
+    // result for residency estimation only.
+    if !cooperative_supported {
+        eprintln!("[htm_rust] INFO: cooperative launch unsupported; cluster path only.");
+    }
+    // Tested grid_cap: 4 blocks = 30ms (too serial), 16 blocks = 10.8ms (parallel wins).
+    // Parallelism in SP overlap + TM predict stages outweighs grid.sync() cost.
+    let default_grid_cap = 16u32;
+    let grid_cap = grid_cap_override.unwrap_or(default_grid_cap);
+    // `saturating_mul` avoids a debug-build overflow panic (and a release-build
+    // wraparound) for very large `sm_count` inputs. Taking `max` with the
+    // probed limit also covers the "limit unknown" case, because a limit of 0
+    // is always below `2 * sm_count` (which is at least 2).
+    let resident_bound = cooperative_grid_limit.max(sm_count.saturating_mul(2));
+    Ok(FusedLaunchPlan {
+        grid_dim_x: resident_bound.min(grid_cap).max(1),
+        block_dim_x,
+        cooperative_grid_limit: resident_bound,
+        sm_count,
+    })
+}
+/// Picks `grid_dim.x` for a batched launch of `batch_regions` regions.
+///
+/// * Cluster path (`use_cluster == true`): the per-region grid width is
+///   returned unchanged (raised to at least 1); the cooperative limit is not
+///   consulted.
+/// * Cooperative path: the total block count `grid_dim_x * batch_regions`
+///   must fit within `cooperative_grid_limit`, so the width is clamped to
+///   `cooperative_grid_limit / batch_regions`.
+///
+/// A `batch_regions` of 0 is treated as 1.
+///
+/// # Errors
+/// Returns a `COOPERATIVE_LAUNCH_TOO_LARGE: ...` message when the limit is 0
+/// or when even one block per region would exceed the limit.
+pub(crate) fn plan_batched_grid_dim(
+    grid_dim_x: u32,
+    cooperative_grid_limit: u32,
+    batch_regions: usize,
+    use_cluster: bool,
+) -> Result<u32, String> {
+    if use_cluster {
+        return Ok(grid_dim_x.max(1));
+    }
+    // Widen to u64 instead of truncating to u32: a truncating cast could turn
+    // a huge region count into 0 and make the division below panic.
+    let batch_regions = batch_regions.max(1) as u64;
+    if cooperative_grid_limit == 0 {
+        return Err("COOPERATIVE_LAUNCH_TOO_LARGE: cooperative launch limit unavailable".into());
+    }
+    // The quotient never exceeds `cooperative_grid_limit`, so it fits in u32.
+    let max_grid_x = (u64::from(cooperative_grid_limit) / batch_regions) as u32;
+    if max_grid_x == 0 {
+        return Err(format!(
+            "COOPERATIVE_LAUNCH_TOO_LARGE: batch_regions={batch_regions} exceeds cooperative_grid_limit={cooperative_grid_limit}"
+        ));
+    }
+    Ok(grid_dim_x.min(max_grid_x).max(1))
+}
+/// Raw driver-API handles for the fused kernel module.
+///
+/// Owns `module` (unloaded on drop); the two function handles are looked up
+/// from that module in `FusedState::new` and are used by the launchers.
+pub(super) struct RawFusedKernel {
+    // Module handle loaded from `PTX_HTM_FUSED`; owned by this struct.
+    module: sys::CUmodule,
+    // Handle for `htm_fused_step` (single-region launch).
+    pub(super) function: sys::CUfunction,
+    // Handle for `htm_fused_step_batched` (batched launch).
+    pub(super) function_batched: sys::CUfunction,
+}
+// NOTE(review): the handles are raw driver pointers, so `Send`/`Sync` are
+// asserted manually here — presumably relying on driver handles being usable
+// from any thread once the context is made current; confirm.
+unsafe impl Send for RawFusedKernel {}
+unsafe impl Sync for RawFusedKernel {}
+impl Drop for RawFusedKernel {
+    /// Unloads the module. The result is deliberately discarded because
+    /// `drop` has no way to report a failure.
+    fn drop(&mut self) {
+        unsafe {
+            let _ = result::module::unload(self.module);
+        }
+    }
+}
+/// Owns fused-path-only device state:
+///   - per-column inhibition threshold (replaces global top-K)
+///   - ping-pong cell_active/cell_winner bitsets
+///   - step_scratch (n_active, n_unpred per timestep)
+///   - cluster launch capability info
+///
+/// Also caches the launch geometry computed by `plan_fused_launch` and the
+/// raw kernel handles used by the launchers.
+pub struct FusedState {
+    // Device the buffers below were allocated on.
+    dev: Arc<CudaDevice>,
+    pub(super) raw_kernel: RawFusedKernel,
+    // One f32 per column, initialised to `initial_threshold`.
+    pub inhibition_threshold: CudaSlice<f32>,
+    // Bitsets of `n_cells / 32` words each, zero-initialised.
+    pub cell_active_bits_a: CudaSlice<u32>,
+    pub cell_active_bits_b: CudaSlice<u32>,
+    pub cell_winner_bits_a: CudaSlice<u32>,
+    pub cell_winner_bits_b: CudaSlice<u32>,
+    pub step_scratch: CudaSlice<u32>,       // length 6
+    // Launch geometry copied from the `FusedLaunchPlan`.
+    pub grid_dim_x: u32,
+    pub block_dim_x: u32,
+    pub cooperative_grid_limit: u32,
+    // Incremented (wrapping) before every launch; passed to the kernel as `iter_seed`.
+    pub iter_counter: u32,
+    /// Hopper cluster launch capability (0 = unsupported).
+    pub cluster_info: ClusterInfo,
+    // Config mirror (read-only after init).
+    #[allow(dead_code)]
+    pub initial_threshold: f32,
+}
+impl FusedState {
+    /// Allocates the fused-path device buffers, loads the fused kernel module
+    /// and probes the device to derive the launch geometry.
+    ///
+    /// * `dev` — device to allocate on and launch from.
+    /// * `n_columns`, `cells_per_column` — region shape; their product must
+    ///   be divisible by 32 (panics otherwise) because cell state is stored
+    ///   as 32-bit bitset words.
+    /// * `initial_threshold` — starting value for every column's inhibition
+    ///   threshold.
+    ///
+    /// Returns a `DriverError` if any allocation, copy, module load or
+    /// function lookup fails.
+    pub fn new(
+        dev: Arc<CudaDevice>,
+        n_columns: usize,
+        cells_per_column: usize,
+        initial_threshold: f32,
+    ) -> Result<Self, DriverError> {
+        let n_cells = n_columns * cells_per_column;
+        assert!(n_cells % 32 == 0, "n_cells must be divisible by 32 for bitsets");
+        let bits_words = n_cells / 32;
+        // Per-column threshold starts at `initial_threshold` for every column.
+        let mut inhibition_threshold = dev.alloc_zeros::<f32>(n_columns)?;
+        let init_vec = vec![initial_threshold; n_columns];
+        dev.htod_sync_copy_into(&init_vec, &mut inhibition_threshold)?;
+        let cell_active_bits_a = dev.alloc_zeros::<u32>(bits_words)?;
+        let cell_active_bits_b = dev.alloc_zeros::<u32>(bits_words)?;
+        let cell_winner_bits_a = dev.alloc_zeros::<u32>(bits_words)?;
+        let cell_winner_bits_b = dev.alloc_zeros::<u32>(bits_words)?;
+        let step_scratch = dev.alloc_zeros::<u32>(6)?;
+        // Raw driver calls below need the device's primary context current.
+        unsafe {
+            result::ctx::set_current(*dev.cu_primary_ctx())?;
+        }
+        // Register the module with cudarc once per device (skipped if an
+        // earlier `FusedState` already loaded it).
+        if dev.get_func("htm_fused", "htm_fused_step").is_none() {
+            dev.load_ptx(
+                Ptx::from_src(PTX_HTM_FUSED),
+                "htm_fused",
+                &["htm_fused_step", "htm_fused_step_batched"],
+            )?;
+        }
+        // Additionally load the PTX through the raw driver API so this state
+        // owns its own module and raw function handles for the launchers.
+        let ptx = CString::new(PTX_HTM_FUSED).expect("PTX contains no interior nul bytes");
+        let module = unsafe { result::module::load_data(ptx.as_ptr().cast()) }?;
+        let function = unsafe {
+            result::module::get_function(module, CString::new("htm_fused_step").unwrap())
+        }?;
+        let function_batched = unsafe {
+            result::module::get_function(module, CString::new("htm_fused_step_batched").unwrap())
+        }?;
+        // Cluster size 16 on Hopper is "non-portable" (> 8 requires opt-in).
+        // Must set CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED=1 on
+        // every launched kernel function, otherwise cuLaunchKernelEx rejects
+        // the cluster dim with CUDA_ERROR_INVALID_CLUSTER_SIZE.
+        unsafe {
+            let attr = sys::CUfunction_attribute::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED;
+            // Errors are ignored: older CUDA may lack the attribute, in which
+            // case only portable sizes (<= 8) work.
+            // NOTE(review): plan_fused_launch's default grid cap is 16 (not 8),
+            // so a failure here would presumably surface later as a launch
+            // error — confirm that this is the intended handling.
+            let _ = sys::lib().cuFuncSetAttribute(function, attr, 1);
+            let _ = sys::lib().cuFuncSetAttribute(function_batched, attr, 1);
+        }
+        // Probe SM count; fall back to 16 if the attribute query fails.
+        let sm_count = match dev.attribute(
+            cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+        ) {
+            Ok(v) => v as u32,
+            Err(_) => 16u32,
+        };
+        // T1: Probe Hopper cluster launch capability.
+        let max_cluster_size = match dev.attribute(
+            cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH,
+        ) {
+            Ok(v) if v > 0 => {
+                // H200/sm_90a supports up to 16 blocks per cluster.
+                // There is no MAX_CLUSTER_SIZE attribute in CUDA 12.4; hard-code the
+                // Hopper maximum which is 16 (8 SMs × 2 blocks/SM = 16 blocks/cluster).
+                16u32
+            }
+            // Attribute unsupported, zero, or the query failed: no cluster path.
+            _ => 0u32,
+        };
+        eprintln!("[htm_rust] cluster: max_cluster_size={}", max_cluster_size);
+        let cluster_info = ClusterInfo { max_cluster_size };
+        let cooperative_supported = matches!(
+            dev.attribute(sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH),
+            Ok(v) if v > 0
+        );
+        // Cooperative limit = SM count × resident blocks per SM; 0 when
+        // cooperative launch is unsupported or the occupancy query fails.
+        let cooperative_grid_limit = if cooperative_supported {
+            let blocks_per_sm = unsafe {
+                // Must match plan_fused_launch(): the A10G/Ampere-safe fused
+                // kernel launch uses 256 threads/block, not the historical
+                // 1024-thread Hopper occupancy probe.
+                result::occupancy::max_active_block_per_multiprocessor(function, 256, 0)
+            }
+            .ok()
+            .map(|v| v.max(0) as u32)
+            .unwrap_or(0);
+            sm_count.saturating_mul(blocks_per_sm)
+        } else {
+            0
+        };
+        let launch_plan = plan_fused_launch(
+            sm_count,
+            cooperative_supported,
+            cooperative_grid_limit,
+            fused_grid_cap_override(),
+        )
+        .map_err(|msg| {
+            // Surface as a CUDA-ish error so callers can propagate.
+            eprintln!("[htm_rust] FATAL: {msg}");
+            DriverError(cudarc::driver::sys::CUresult::CUDA_ERROR_NOT_SUPPORTED)
+        })?;
+        eprintln!(
+            "[htm_rust] fused kernel: sm_count={} grid_dim_x={} cooperative_grid_limit={} cluster_max={}",
+            launch_plan.sm_count, launch_plan.grid_dim_x, launch_plan.cooperative_grid_limit,
+            cluster_info.max_cluster_size,
+        );
+        Ok(Self {
+            dev,
+            raw_kernel: RawFusedKernel { module, function, function_batched },
+            inhibition_threshold,
+            cell_active_bits_a,
+            cell_active_bits_b,
+            cell_winner_bits_a,
+            cell_winner_bits_b,
+            step_scratch,
+            grid_dim_x: launch_plan.grid_dim_x,
+            block_dim_x: launch_plan.block_dim_x,
+            cooperative_grid_limit: launch_plan.cooperative_grid_limit,
+            iter_counter: 0,
+            cluster_info,
+            initial_threshold,
+        })
+    }
+    /// Reset fused state. Called at region.reset().
+    ///
+    /// Zeroes the four cell bitsets and `step_scratch`. `iter_counter` and
+    /// `inhibition_threshold` are left untouched.
+    pub fn reset(&mut self) -> Result<(), DriverError> {
+        self.dev.memset_zeros(&mut self.cell_active_bits_a)?;
+        self.dev.memset_zeros(&mut self.cell_active_bits_b)?;
+        self.dev.memset_zeros(&mut self.cell_winner_bits_a)?;
+        self.dev.memset_zeros(&mut self.cell_winner_bits_b)?;
+        self.dev.memset_zeros(&mut self.step_scratch)?;
+        // Do NOT reset inhibition_threshold — it's learned state. A hard
+        // reset of TM state should NOT forget the sparsity calibration.
+        Ok(())
+    }
+}
+/// Launch the fused megakernel. Processes all T timesteps in one kernel.
+///
+/// Launch path:
+/// * When the device supports cluster launch
+///   (`fused.cluster_info.max_cluster_size > 0`), uses `cuLaunchKernelEx`
+///   with `CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION`. The cluster width is
+///   `min(grid_dim_x, max_cluster_size)`, so a single-region grid of up to
+///   16 blocks forms exactly one cluster. The grid width must be a multiple
+///   of the cluster width, which a hard-coded 16 violated whenever
+///   `grid_dim_x` was capped below 16 (e.g. via `HTM_FUSED_GRID_CAP`).
+/// * Otherwise falls back to `cuLaunchCooperativeKernel`.
+///
+/// `fused.step_scratch` is zeroed and `fused.iter_counter` is bumped before
+/// the launch. The kernel is enqueued on the device stream; no stream
+/// synchronisation is performed here.
+///
+/// # Errors
+/// Returns the `DriverError` from the memset, from making the context
+/// current, or from the launch call itself.
+#[allow(clippy::too_many_arguments)]
+pub fn launch_fused(
+    sp: &mut SpatialPoolerGpu,
+    tm: &mut TemporalMemoryGpu,
+    fused: &mut FusedState,
+    inputs_flat: &CudaSlice<u8>,
+    cols_out: &mut CudaSlice<u8>,
+    anom_out: &mut CudaSlice<f32>,
+    t: usize,
+    input_bits: usize,
+    learn: bool,
+) -> Result<(), DriverError> {
+    // Reset step_scratch before each launch (safe re-entry).
+    sp.dev_ref().memset_zeros(&mut fused.step_scratch)?;
+    fused.iter_counter = fused.iter_counter.wrapping_add(1);
+    let cfg = FusedConfig {
+        input_bits: input_bits as u32,
+        n_columns: sp.n_columns_accessor() as u32,
+        synapses_per_col: sp.synapses_per_col_accessor() as u32,
+        conn_thr: sp.conn_thr_accessor(),
+        sp_inc: sp.inc_accessor(),
+        sp_dec: sp.dec_accessor(),
+        sparsity_target: sp.sparsity_accessor(),
+        duty_alpha: 1.0f32 / sp.duty_period_accessor().max(1.0),
+        thr_adapt_rate: 0.001f32,
+        cells_per_column: tm.cells_per_column as u32,
+        n_cells: tm.n_cells as u32,
+        bits_words: tm.bits_words as u32,
+        max_segments_per_cell: MAX_SEGMENTS_PER_CELL as u32,
+        synapses_per_segment: MAX_SYN_PER_SEGMENT as u32,
+        activation_threshold: tm.activation_threshold,
+        learning_threshold: tm.learning_threshold,
+        max_new_synapses: tm.max_new_synapse_count,
+        conn_thr_i16: tm.conn_thr_i16 as i32,
+        perm_inc_i16: tm.perm_inc_i16 as i32,
+        perm_dec_i16: tm.perm_dec_i16 as i32,
+        predicted_seg_dec_i16: tm.predicted_seg_dec_i16 as i32,
+        initial_perm_i16: tm.initial_perm_i16 as i32,
+        t: t as u32,
+        learn: if learn { 1 } else { 0 },
+        iter_seed: fused.iter_counter,
+        cooperative_grid_sync: 1,
+    };
+    let ptrs = FusedPtrs {
+        syn_bit: *sp.syn_bit_accessor().device_ptr(),
+        syn_perm: *sp.syn_perm_accessor().device_ptr(),
+        boost: *sp.boost_accessor().device_ptr(),
+        active_duty: *sp.active_duty_accessor().device_ptr(),
+        inhibition_threshold: *fused.inhibition_threshold.device_ptr(),
+        seg_cell_id: *tm.seg_cell_id_accessor().device_ptr(),
+        seg_syn_count: *tm.seg_syn_count_accessor().device_ptr(),
+        syn_presyn: *tm.syn_presyn_accessor().device_ptr(),
+        tm_syn_perm: *tm.syn_perm_accessor().device_ptr(),
+        cell_seg_count: *tm.cell_seg_count_accessor().device_ptr(),
+        cell_active_a: *fused.cell_active_bits_a.device_ptr(),
+        cell_active_b: *fused.cell_active_bits_b.device_ptr(),
+        cell_winner_a: *fused.cell_winner_bits_a.device_ptr(),
+        cell_winner_b: *fused.cell_winner_bits_b.device_ptr(),
+        inputs: *inputs_flat.device_ptr(),
+        cols_out: *cols_out.device_ptr(),
+        anom_out: *anom_out.device_ptr(),
+        barrier_counters: 0u64,  // ABI-compat dummy; cluster barrier replaces DLB.
+        step_scratch: *fused.step_scratch.device_ptr(),
+    };
+    let grid_x = fused.grid_dim_x;
+    let block_x = fused.block_dim_x;
+    let cu_stream = *sp.dev_ref().cu_stream();
+    let max_cluster_size = fused.cluster_info.max_cluster_size;
+    let use_cluster = max_cluster_size > 0;
+    unsafe {
+        result::ctx::set_current(*sp.dev_ref().cu_primary_ctx())?;
+        // Kernel parameters are passed as an array of pointers to the values;
+        // `ptrs` and `cfg` stay alive on this stack frame until the launch
+        // call returns.
+        let mut kernel_params: [*mut std::ffi::c_void; 2] = [
+            (&ptrs as *const FusedPtrs).cast_mut().cast(),
+            (&cfg as *const FusedConfig).cast_mut().cast(),
+        ];
+        if use_cluster {
+            // T10: Hopper cluster launch with CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION.
+            // Size the cluster to the grid (capped at the device maximum) so
+            // the entire single-region grid maps into one cluster and the
+            // grid width is always a multiple of the cluster width when
+            // grid_x <= max_cluster_size.
+            let cluster_x = grid_x.min(max_cluster_size).max(1);
+            let mut attr: sys::CUlaunchAttribute = std::mem::zeroed();
+            attr.id = sys::CUlaunchAttributeID::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+            attr.value.clusterDim.x = cluster_x;
+            attr.value.clusterDim.y = 1;
+            attr.value.clusterDim.z = 1;
+            let mut launch_cfg: sys::CUlaunchConfig = std::mem::zeroed();
+            launch_cfg.gridDimX = grid_x;
+            launch_cfg.gridDimY = 1;
+            launch_cfg.gridDimZ = 1;
+            launch_cfg.blockDimX = block_x;
+            launch_cfg.blockDimY = 1;
+            launch_cfg.blockDimZ = 1;
+            launch_cfg.sharedMemBytes = 0;
+            launch_cfg.hStream = cu_stream;
+            launch_cfg.numAttrs = 1;
+            launch_cfg.attrs = &mut attr as *mut sys::CUlaunchAttribute;
+            let ret = sys::lib().cuLaunchKernelEx(
+                &launch_cfg as *const sys::CUlaunchConfig,
+                fused.raw_kernel.function,
+                kernel_params.as_mut_ptr(),
+                std::ptr::null_mut(),
+            );
+            if ret != sys::CUresult::CUDA_SUCCESS {
+                return Err(DriverError(ret));
+            }
+        } else {
+            // Pre-Hopper: cooperative kernel launch. The fused kernel uses
+            // grid.sync() for cross-block synchronization which REQUIRES
+            // cuLaunchCooperativeKernel (normal launch silently crashes on
+            // the first grid.sync() call).
+            let ret = sys::lib().cuLaunchCooperativeKernel(
+                fused.raw_kernel.function,
+                grid_x, 1, 1,
+                block_x, 1, 1,
+                0,  // sharedMemBytes
+                cu_stream,
+                kernel_params.as_mut_ptr(),
+            );
+            if ret != sys::CUresult::CUDA_SUCCESS {
+                return Err(DriverError(ret));
+            }
+        }
+    }
+    Ok(())
+}
+/// Single batched non-cooperative launch for B regions with DLB sync. Uses the same kernel
+/// body; each block reads its region's FusedPtrs from a device-side array
+/// indexed by blockIdx.y. All regions share the same config (same
+/// input_bits/n_columns/etc.) so we pass one FusedConfig.
+///
+/// This breaks through the CUDA cooperative-kernel device-level
+/// serialization: multiple cooperative launches are serialized regardless
+/// of stream, but one cooperative launch with grid.y=B processes all
+/// regions in a single invocation — ~B× speedup vs B sequential launches.
+#[allow(clippy::too_many_arguments)]
+/// Low-level raw-pointer entry, called by PyO3 binding which holds the
+/// mutable borrows. Safety: each `*mut HTMRegionGpu` must point to a live,
+/// uniquely-borrowed region. All regions must be distinct.
+pub(super) fn launch_fused_batched_raw(
+    region_ptrs: &[*mut super::HTMRegionGpu],
+    inputs_per_region: &[u64],
+    cols_per_region: &[u64],
+    anom_per_region: &[u64],
+    t: usize,
+    input_bits: usize,
+    learn: bool,
+) -> Result<(), DriverError> {
+    let b = region_ptrs.len();
+    assert_eq!(inputs_per_region.len(), b);
+    assert_eq!(cols_per_region.len(), b);
+    assert_eq!(anom_per_region.len(), b);
+    assert!(b >= 1, "need at least one region");
+    // Reset per-region step_scratch before each launch.
+    for &rp in region_ptrs.iter() {
+        let r = unsafe { &mut *rp };
+        let dev = r.sp_gpu.dev_ref().clone();
+        dev.memset_zeros(&mut r.fused_state.step_scratch)?;
+        r.fused_state.iter_counter = r.fused_state.iter_counter.wrapping_add(1);
+    }
+    // Shared config — all regions use identical sp/tm parameters.
+    let (grid_x, block_x, cooperative_grid_limit, function_batched, cu_stream, cu_ctx) = {
+        let r0 = unsafe { &*region_ptrs[0] };
+        (
+            r0.fused_state.grid_dim_x,
+            r0.fused_state.block_dim_x,
+            r0.fused_state.cooperative_grid_limit,
+            r0.fused_state.raw_kernel.function_batched,
+            *r0.sp_gpu.dev_ref().cu_stream(),
+            *r0.sp_gpu.dev_ref().cu_primary_ctx(),
+        )
+    };
+    let cfg = {
+        let r = unsafe { &*region_ptrs[0] };
+        FusedConfig {
+            input_bits: input_bits as u32,
+            n_columns: r.sp_gpu.n_columns_accessor() as u32,
+            synapses_per_col: r.sp_gpu.synapses_per_col_accessor() as u32,
+            conn_thr: r.sp_gpu.conn_thr_accessor(),
+            sp_inc: r.sp_gpu.inc_accessor(),
+            sp_dec: r.sp_gpu.dec_accessor(),
+            sparsity_target: r.sp_gpu.sparsity_accessor(),
+            duty_alpha: 1.0f32 / r.sp_gpu.duty_period_accessor().max(1.0),
+            thr_adapt_rate: 0.001f32,
+            cells_per_column: r.tm_gpu.cells_per_column as u32,
+            n_cells: r.tm_gpu.n_cells as u32,
+            bits_words: r.tm_gpu.bits_words as u32,
+            max_segments_per_cell: MAX_SEGMENTS_PER_CELL as u32,
+            synapses_per_segment: MAX_SYN_PER_SEGMENT as u32,
+            activation_threshold: r.tm_gpu.activation_threshold,
+            learning_threshold: r.tm_gpu.learning_threshold,
+            max_new_synapses: r.tm_gpu.max_new_synapse_count,
+            conn_thr_i16: r.tm_gpu.conn_thr_i16 as i32,
+            perm_inc_i16: r.tm_gpu.perm_inc_i16 as i32,
+            perm_dec_i16: r.tm_gpu.perm_dec_i16 as i32,
+            predicted_seg_dec_i16: r.tm_gpu.predicted_seg_dec_i16 as i32,
+            initial_perm_i16: r.tm_gpu.initial_perm_i16 as i32,
+            t: t as u32,
+            learn: if learn { 1 } else { 0 },
+            iter_seed: r.fused_state.iter_counter,
+            cooperative_grid_sync: 1,
+        }
+    };
+    // Build B FusedPtrs per-region.
+    let ptrs_vec: Vec<FusedPtrs> = (0..b)
+        .map(|i| {
+            let r = unsafe { &*region_ptrs[i] };
+            FusedPtrs {
+                syn_bit: *r.sp_gpu.syn_bit_accessor().device_ptr(),
+                syn_perm: *r.sp_gpu.syn_perm_accessor().device_ptr(),
+                boost: *r.sp_gpu.boost_accessor().device_ptr(),
+                active_duty: *r.sp_gpu.active_duty_accessor().device_ptr(),
+                inhibition_threshold: *r.fused_state.inhibition_threshold.device_ptr(),
+                seg_cell_id: *r.tm_gpu.seg_cell_id_accessor().device_ptr(),
+                seg_syn_count: *r.tm_gpu.seg_syn_count_accessor().device_ptr(),
+                syn_presyn: *r.tm_gpu.syn_presyn_accessor().device_ptr(),
+                tm_syn_perm: *r.tm_gpu.syn_perm_accessor().device_ptr(),
+                cell_seg_count: *r.tm_gpu.cell_seg_count_accessor().device_ptr(),
+                cell_active_a: *r.fused_state.cell_active_bits_a.device_ptr(),
+                cell_active_b: *r.fused_state.cell_active_bits_b.device_ptr(),
+                cell_winner_a: *r.fused_state.cell_winner_bits_a.device_ptr(),
+                cell_winner_b: *r.fused_state.cell_winner_bits_b.device_ptr(),
+                inputs: inputs_per_region[i],
+                cols_out: cols_per_region[i],
+                anom_out: anom_per_region[i],
+                barrier_counters: 0u64,  // ABI-compat dummy; cluster barrier replaces DLB.
+                step_scratch: *r.fused_state.step_scratch.device_ptr(),
+            }
+        })
+        .collect();
+    // Upload FusedPtrs array to device (B * sizeof(FusedPtrs) bytes).
+    // FusedPtrs is repr(C) + DeviceRepr so htod_sync_copy handles it.
+    let dev = unsafe { &*region_ptrs[0] }.sp_gpu.dev_ref().clone();
+    let ptrs_dev: CudaSlice<FusedPtrs> = dev.htod_sync_copy(&ptrs_vec)?;
+    let ptrs_dev_ptr: u64 = *ptrs_dev.device_ptr();
+    // T10: Cluster launch for batched regions.
+    // Grid = (grid_x, B, 1) with cluster_dim=(16,1,1): each region (Y slice)
+    // occupies exactly one cluster of 16 blocks. All 8 clusters run concurrently
+    // on the H200's 132 SMs (8 × 16 = 128 blocks ≤ 132 SMs).
+    let use_cluster = {
+        let r0 = unsafe { &*region_ptrs[0] };
+        r0.fused_state.cluster_info.max_cluster_size > 0
+    };
+    let grid_x = plan_batched_grid_dim(grid_x, cooperative_grid_limit, b, use_cluster)
+        .map_err(|msg| {
+            eprintln!("[htm_rust] FATAL: {msg}");
+            DriverError(cudarc::driver::sys::CUresult::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE)
+        })?;
+    unsafe {
+        result::ctx::set_current(cu_ctx)?;
+        let mut kernel_params: [*mut std::ffi::c_void; 2] = [
+            (&ptrs_dev_ptr as *const u64).cast_mut().cast(),
+            (&cfg as *const FusedConfig).cast_mut().cast(),
+        ];
+        if use_cluster {
+            let mut attr: sys::CUlaunchAttribute = std::mem::zeroed();
+            attr.id = sys::CUlaunchAttributeID::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+            attr.value.clusterDim.x = 16;
+            attr.value.clusterDim.y = 1;
+            attr.value.clusterDim.z = 1;
+            let mut launch_cfg: sys::CUlaunchConfig = std::mem::zeroed();
+            launch_cfg.gridDimX = grid_x;
+            launch_cfg.gridDimY = b as u32;
+            launch_cfg.gridDimZ = 1;
+            launch_cfg.blockDimX = block_x;
+            launch_cfg.blockDimY = 1;
+            launch_cfg.blockDimZ = 1;
+            launch_cfg.sharedMemBytes = 0;
+            launch_cfg.hStream = cu_stream;
+            launch_cfg.numAttrs = 1;
+            launch_cfg.attrs = &mut attr as *mut sys::CUlaunchAttribute;
+            let ret = sys::lib().cuLaunchKernelEx(
+                &launch_cfg as *const sys::CUlaunchConfig,
+                function_batched,
+                kernel_params.as_mut_ptr(),
+                std::ptr::null_mut(),
+            );
+            if ret != sys::CUresult::CUDA_SUCCESS {
+                return Err(DriverError(ret));
+            }
+        } else {
+            // Pre-Hopper: cooperative kernel launch (grid.sync() requires it).
+            let ret = sys::lib().cuLaunchCooperativeKernel(
+                function_batched,
+                grid_x, b as u32, 1,
+                block_x, 1, 1,
+                0,  // sharedMemBytes
+                cu_stream,
+                kernel_params.as_mut_ptr(),
+            );
+            if ret != sys::CUresult::CUDA_SUCCESS {
+                return Err(DriverError(ret));
+            }
+        }
+    }
+    // `ptrs_dev` is a per-call device array consumed by the async kernel.
+    // Keep it alive until the kernel has read it; otherwise dropping/freeing
+    // it immediately after launch can surface as a later unrelated CUDA error.
+    dev.synchronize()?;
+    Ok(())
+}

overlay/htm_rust/src/gpu/kernels/htm_fused_step.cu CHANGED Viewed

@@ -1,677 +1,677 @@
-// Fused HTM megakernel — SP + TM, all T timesteps in a single launch.
-//
-// Design rationale:
-//   - Global top-K column selection requires cross-block synchronization at
-//     every timestep (grid.sync is unreliable on WSL2/sm_86 without rdc=true).
-//   - Replace with per-column threshold activation using local lateral
-//     inhibition: column c activates if overlap[c]*boost[c] > threshold[c].
-//     Threshold is a per-column running-EMA learned scalar that steers the
-//     column's long-run activation rate toward the global sparsity target.
-//   - This is biologically grounded (GABAergic local inhibition) and supported
-//     by HTM theory (duty-cycle boost already drives this loop; we just
-//     change which lever the EMA pulls).
-//
-// Launch shape:
-//   grid  = min(device SM count, 16)  // hard cap — see below
-//   block = 1024 threads = 32 warps
-//   Each warp of 32 owns a contiguous column slice (n_columns / total_warps).
-//
-// Cross-block coherence:
-//   - Ping-pong buffers for cell_active/cell_winner: write _a at even t,
-//     read _b; reversed at odd t.
-//   - sm_90+ path: thread-block-cluster launch + hardware cluster barrier
-//     (cluster.sync()), scoped to the blocks of one cluster.
-//   - Pre-Hopper path: cooperative launch + whole-grid sync (grid.sync()).
-//
-// 2026-04-16: grid_dim reduced from 28 to 16 after deadlock RCA. The previous
-// cap of 28 relied on all blocks being concurrently resident on a 30-SM RTX
-// 3060 Laptop. Under thermal throttling effective residency dropped to ~20-24,
-// leaving scheduled blocks spinning on the software grid barrier waiting for
-// peer blocks that would never run. 16 blocks is below any realistic residency
-// floor and preserves enough warp parallelism (16*32 = 512 warps) to saturate
-// memory bandwidth on the spatial-pooler stage.
-//
-// Kernel signature uses struct-by-value for pointers and config to stay
-// inside cudarc's launch-arg count limit.
-#include <cooperative_groups.h>
-#include <cooperative_groups/memcpy_async.h>
-namespace cg = cooperative_groups;
-// Maximum columns owned per cluster-block in DSMEM.
-// Supports n_columns up to COLS_PER_CLUSTER_BLOCK_MAX * cluster_size.
-// At cluster_size=16: supports up to 256*16=4096 columns.
-// Each array costs 256*4 = 1024 bytes; three arrays = 3072 bytes per SM —
-// well under the 228 KB H200 shared-memory cap.
-#define COLS_PER_CLUSTER_BLOCK_MAX 256u
-// Maximum input_bits supported by the TMA-multicast staging tile.
-// At 32 KB this covers the production SDR width (16384 bits) with 2× headroom.
-// Total shared per SM: 32768 (tile) + 3072 (DSMEM float arrays) = ~35 KB —
-// well under the 228 KB H200 limit.
-//
-// Expected speedup from TMA multicast input staging (T9/T11):
-//   - Without staging: 16 SMs × T × (input_bits GMEM reads per timestep)
-//   - With staging:    1 TMA DMA per timestep, shared reads from L1 thereafter
-//   - Theoretical DRAM bandwidth reduction: ~16× on input reads
-//   - Wall-clock reduction estimate: -20 to -40 ms from reduced input fetch latency
-#define INPUT_BITS_MAX 32768u
-extern "C" {
-struct FusedPtrs {  // Device addresses carried as 64-bit integers; the kernel body casts each one to a typed pointer.
-    unsigned long long syn_bit;  // const unsigned int*: per SP synapse, the input index it samples; row stride synapses_per_col
-    unsigned long long syn_perm;  // float*: SP synapse permanence, same layout as syn_bit
-    unsigned long long boost;  // float*: per-column multiplier applied to the overlap count
-    unsigned long long active_duty;  // float*: per-column running average of activation
-    unsigned long long inhibition_threshold;  // float*: per-column threshold the boosted overlap must exceed
-    unsigned long long seg_cell_id;  // unsigned int*: per segment, the cell that owns it
-    unsigned long long seg_syn_count;  // unsigned int*: per segment, number of synapses in use
-    unsigned long long syn_presyn;  // unsigned int*: per TM synapse, presynaptic cell index; row stride synapses_per_segment
-    unsigned long long tm_syn_perm;  // short*: TM synapse permanence, same layout as syn_presyn
-    unsigned long long cell_seg_count;  // unsigned int*: per cell, count of segment allocations (clamped to max_segments_per_cell when read)
-    unsigned long long cell_active_a;  // unsigned int*: active-cell bitset written on even t, read on odd t
-    unsigned long long cell_active_b;  // unsigned int*: active-cell bitset written on odd t, read on even t
-    unsigned long long cell_winner_a;  // unsigned int*: winner-cell bitset, same even/odd alternation as cell_active_a
-    unsigned long long cell_winner_b;  // unsigned int*: winner-cell bitset, same even/odd alternation as cell_active_b
-    unsigned long long inputs;  // const unsigned char*: one byte per input bit, input_bits bytes per timestep
-    unsigned long long cols_out;  // unsigned char*: 0/1 activity flag per column, n_columns bytes per timestep
-    unsigned long long anom_out;  // float*: presumably a per-timestep anomaly output — TODO confirm against the code that writes it
-    unsigned long long barrier_counters;  // unsigned int*: forwarded to fused_grid_barrier, which ignores it
-    unsigned long long step_scratch;  // unsigned int*: [0] active-column count, [1] bursting-column count; zeroed every timestep
-};
-struct FusedConfig {  // Scalar parameters for one launch; mirrors the field list the host fills in under the same names.
-    // SP constants
-    unsigned int input_bits;  // bytes of `inputs` consumed per timestep
-    unsigned int n_columns;  // column count; also the per-timestep stride of cols_out
-    unsigned int synapses_per_col;  // row stride of syn_bit / syn_perm
-    float        conn_thr;  // a synapse counts toward overlap when its permanence is >= this
-    float        sp_inc;  // learning step added when the sampled input is nonzero (result capped at 1.0)
-    float        sp_dec;  // learning step subtracted when the sampled input is zero (result floored at 0.0)
-    float        sparsity_target;  // active_duty value the threshold adaptation steers toward
-    float        duty_alpha;  // EMA weight of the newest sample in active_duty
-    float        thr_adapt_rate;  // gain on (active_duty - sparsity_target); the kernel scales it by a further 100
-    // TM constants
-    unsigned int cells_per_column;  // 32 selects the one-word-per-column bitset clear
-    unsigned int n_cells;
-    unsigned int bits_words;  // modulus for word indices when scanning the previous-winner bitset
-    unsigned int max_segments_per_cell;  // segment id = cell * this + slot
-    unsigned int synapses_per_segment;  // synapse base = segment id * this
-    unsigned int activation_threshold;  // connected active synapses needed for a segment to predict its cell
-    unsigned int learning_threshold;  // active presynaptic hits needed before a segment is a reuse candidate
-    unsigned int max_new_synapses;  // cap on synapses grown per bursting column per timestep
-    int          conn_thr_i16;  // TM permanence at or above which an active synapse counts as connected
-    int          perm_inc_i16;  // TM reinforcement step (result capped at 32767)
-    int          perm_dec_i16;  // TM punishment step (result floored at 0)
-    int          predicted_seg_dec_i16;
-    int          initial_perm_i16;
-    // Loop constants
-    unsigned int T;  // number of timesteps run inside the kernel
-    unsigned int learn;  // nonzero enables the SP and TM learning branches
-    unsigned int iter_seed;  // mixed into the starting word of the synapse-growth scan
-    unsigned int cooperative_grid_sync;  // forwarded to fused_grid_barrier, which ignores it
-};
-// Cross-block barrier used between the phases of the fused HTM step.
-//
-// sm_90+ (Hopper): cooperative_groups::this_cluster().sync(), a hardware
-// barrier over the blocks of the calling cluster only — it does not wait
-// for other clusters. This took the place of the earlier software
-// Decoupled Look-Back (DLB) atomic-spin barrier.
-// Older architectures: grid.sync(), which is only valid when the kernel
-// was launched cooperatively.
-//
-// The four trailing parameters are ignored; they remain in the signature
-// purely so that existing call sites keep compiling.
-__device__ static inline void fused_grid_barrier(cg::grid_group grid,
-                                                 unsigned int * /* flags — unused */,
-                                                 unsigned int /* expected — unused */,
-                                                 unsigned int /* phase — unused */,
-                                                 unsigned int /* cooperative_grid_sync — unused */) {
-#if __CUDA_ARCH__ < 900
-    // No thread-block clusters here: wait on every block in the grid.
-    // Legal only under a cooperative kernel launch.
-    grid.sync();
-#else
-    // sm_90 and newer: barrier across the blocks of this cluster.
-    cg::this_cluster().sync();
-#endif
-}
-// Sums `v` across the 32 lanes of the calling warp with a shuffle-down tree.
-// Only lane 0 is guaranteed to hold the full total; every lane must call this (full mask).
-__device__ static inline unsigned int warp_sum_u32(unsigned int v) {
-    for (int delta = 16; delta != 0; delta /= 2) v += __shfl_down_sync(0xffffffffu, v, delta);
-    return v;
-}
-// Core kernel body — works for both single-region and batched launches.
-// Single-region: caller passes the one FusedPtrs struct.
-// Batched: each block reads its region's FusedPtrs via blockIdx.y before
-// calling this. State is independent per region (each region owns its own
-// GPU buffers). On pre-Hopper builds grid.sync() is the only cross-block
-// primitive and it spans ALL blocks in the grid (harmless over-sync across
-// regions); on sm_90+ the barrier is cluster.sync(), scoped to one cluster.
-__device__ static inline
-void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
-    cg::grid_group grid = cg::this_grid();
-    // Cast pointers.
-    const unsigned int  * __restrict__ syn_bit               = (const unsigned int*)P.syn_bit;
-    float               * __restrict__ syn_perm              = (float*)P.syn_perm;
-    float               * __restrict__ boost                 = (float*)P.boost;
-    float               * __restrict__ active_duty           = (float*)P.active_duty;
-    float               * __restrict__ inhibition_threshold  = (float*)P.inhibition_threshold;
-    unsigned int        * __restrict__ seg_cell_id           = (unsigned int*)P.seg_cell_id;
-    unsigned int        * __restrict__ seg_syn_count         = (unsigned int*)P.seg_syn_count;
-    unsigned int        * __restrict__ syn_presyn            = (unsigned int*)P.syn_presyn;
-    short               * __restrict__ tm_syn_perm           = (short*)P.tm_syn_perm;
-    unsigned int        * __restrict__ cell_seg_count        = (unsigned int*)P.cell_seg_count;
-    unsigned int        * __restrict__ cell_active_a         = (unsigned int*)P.cell_active_a;
-    unsigned int        * __restrict__ cell_active_b         = (unsigned int*)P.cell_active_b;
-    unsigned int        * __restrict__ cell_winner_a         = (unsigned int*)P.cell_winner_a;
-    unsigned int        * __restrict__ cell_winner_b         = (unsigned int*)P.cell_winner_b;
-    const unsigned char * __restrict__ inputs                = (const unsigned char*)P.inputs;
-    unsigned char       * __restrict__ cols_out              = (unsigned char*)P.cols_out;
-    float               * __restrict__ anom_out              = (float*)P.anom_out;
-    unsigned int        * __restrict__ barrier_counters      = (unsigned int*)P.barrier_counters;
-    unsigned int        * __restrict__ step_scratch          = (unsigned int*)P.step_scratch;
-    const unsigned int tid     = threadIdx.x;
-    const unsigned int lane    = tid & 31u;
-    const unsigned int warp    = tid >> 5;
-    const unsigned int warps_per_block = blockDim.x >> 5;
-    const unsigned int gwarp   = blockIdx.x * warps_per_block + warp;
-    const unsigned int n_warps = gridDim.x * warps_per_block;
-    const unsigned int n_cols  = cfg.n_columns;
-    const unsigned int col_lo  = (gwarp * n_cols) / n_warps;
-    const unsigned int col_hi  = ((gwarp + 1) * n_cols) / n_warps;
-    unsigned int phase = 0u;
-    // =========================================================
-    // DSMEM: Cluster-distributed shared memory for hot per-column
-    // state (inhibition_threshold, boost, active_duty).
-    //
-    // On Hopper (sm_90+): Each block in the cluster owns a contiguous
-    // slice of columns in its own __shared__ arrays. Any block can
-    // peer-read another block's slice via cluster.map_shared_rank().
-    //
-    // On Ampere (sm_86) and other pre-Hopper: No cluster support.
-    // Read/write directly from/to global memory (inhibition_threshold,
-    // boost, active_duty device pointers). Slightly higher latency but
-    // functionally correct.
-    // =========================================================
-#if __CUDA_ARCH__ >= 900
-    // Hopper+ cluster path
-    auto cluster = cg::this_cluster();
-    const unsigned int cluster_block_rank = cluster.block_rank();  // 0..cluster_size-1
-    const unsigned int cluster_sz         = cluster.num_blocks();  // == gridDim.x (≤16)
-#else
-    // Pre-Hopper: no cluster, each block is independent.
-    const unsigned int cluster_block_rank = blockIdx.x;
-    const unsigned int cluster_sz         = gridDim.x;
-#endif
-    // Partition n_cols evenly across cluster blocks.
-    // Each block owns cols_per_block columns starting at my_col_start.
-    const unsigned int cols_per_block =
-        (n_cols + cluster_sz - 1u) / cluster_sz;               // ceil div
-    const unsigned int my_col_start =
-        cluster_block_rank * cols_per_block;
-    const unsigned int my_col_end =
-        (my_col_start + cols_per_block < n_cols)
-            ? (my_col_start + cols_per_block) : n_cols;        // clamp
-#if __CUDA_ARCH__ >= 900
-    // Cluster-distributed shared memory arrays.
-    // Each block holds at most COLS_PER_CLUSTER_BLOCK_MAX floats per array.
-    // Peer blocks address into each other's smem via map_shared_rank.
-    __shared__ float s_inhib_thr [COLS_PER_CLUSTER_BLOCK_MAX];
-    __shared__ float s_boost     [COLS_PER_CLUSTER_BLOCK_MAX];
-    __shared__ float s_active_duty[COLS_PER_CLUSTER_BLOCK_MAX];
-#endif
-    // TMA multicast input staging tile (T9) — HOPPER ONLY.
-    //
-    // On Hopper: cg::memcpy_async with cluster scope multicasts input to all
-    // 16 SMs, reducing DRAM traffic by ~16×.
-    // On Ampere: 32 KB smem allocation exceeds per-block budget when
-    // cooperatively launched (48 KB total, registers eat the rest). Skip the
-    // tile entirely — Stage A reads from GMEM directly (original path).
-#if __CUDA_ARCH__ >= 900
-    __shared__ __align__(16) unsigned char s_input_tile[INPUT_BITS_MAX];
-#endif
-#if __CUDA_ARCH__ >= 900
-    // Initial GMEM → smem load (reads state from previous forward call).
-    // Each block loads only its own slice; tid strides across the slice.
-    for (unsigned int c = my_col_start + tid; c < my_col_end; c += blockDim.x) {
-        const unsigned int off = c - my_col_start;
-        s_inhib_thr [off] = inhibition_threshold[c];
-        s_boost     [off] = boost[c];
-        s_active_duty[off] = active_duty[c];
-    }
-    // All blocks in the cluster must finish loading before any block
-    // starts reading peer smem inside the T-loop.
-    cluster.sync();
-#else
-    // Pre-Hopper: no smem caching needed — reads go directly to GMEM.
-    // Grid sync ensures all blocks have completed Phase 0 init before T-loop.
-    grid.sync();
-#endif
-    const unsigned int S   = cfg.synapses_per_col;
-    const unsigned int cpc = cfg.cells_per_column;
-    const unsigned int SPS = cfg.synapses_per_segment;
-    const unsigned int MSC = cfg.max_segments_per_cell;
-    // Main timestep loop.
-    for (unsigned int t = 0u; t < cfg.T; t++) {
-        const unsigned int inp_off      = t * cfg.input_bits;
-        const unsigned int col_base_out = t * n_cols;
-        unsigned int * curr_active = (t & 1u) ? cell_active_b : cell_active_a;
-        unsigned int * prev_active = (t & 1u) ? cell_active_a : cell_active_b;
-        unsigned int * curr_winner = (t & 1u) ? cell_winner_b : cell_winner_a;
-        unsigned int * prev_winner = (t & 1u) ? cell_winner_a : cell_winner_b;
-        // ---- Phase 0: clear curr bitsets for my cell range ----
-        const unsigned int my_cell_lo = col_lo * cpc;
-        const unsigned int my_cell_hi = col_hi * cpc;
-        if (cpc == 32u) {
-            // Fast path: one word per column.
-            for (unsigned int c = col_lo + lane; c < col_hi; c += 32u) {
-                curr_active[c] = 0u;
-                curr_winner[c] = 0u;
-            }
-        } else {
-            for (unsigned int cell = my_cell_lo + lane; cell < my_cell_hi; cell += 32u) {
-                unsigned int w = cell >> 5;
-                unsigned int m = 1u << (cell & 31u);
-                atomicAnd(&curr_active[w], ~m);
-                atomicAnd(&curr_winner[w], ~m);
-            }
-        }
-        // Block 0, lane 0, warp 0 resets step-scratch counters.
-        if (blockIdx.x == 0u && tid == 0u) {
-            step_scratch[0] = 0u;
-            step_scratch[1] = 0u;
-        }
-        // ---- BARRIER 1 ----
-        // Fence: make the above clear-bitsets + scratch writes globally
-        // visible before peer blocks observe "barrier arrived".
-        __threadfence();
-        fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);
-        // =========================================================
-        // T9: TMA MULTICAST INPUT STAGING
-        //
-        // Stage this timestep's input slice into s_input_tile with an
-        // async global→shared copy. NOTE(review): the call below is
-        // cg::memcpy_async on this_thread_block(), i.e. a block-scope
-        // copy — every block in the cluster fills its own tile from
-        // GMEM. The intended design is a single cluster-scope TMA
-        // multicast (cp.async.bulk.tensor), which would cut DRAM input
-        // traffic by ~16×; confirm whether that is still to be wired up.
-        //
-        // The staging is gated on cfg.input_bits <= INPUT_BITS_MAX.
-        // If the tile is too small (custom large input_bits), we fall
-        // back to per-thread GMEM reads in Stage A (identical to the
-        // original path; use_input_tile==false).
-        //
-        // Ordering: BARRIER 1 completes before we issue the DMA.
-        // The DMA completes before Stage A reads s_input_tile.
-        // =========================================================
-#if __CUDA_ARCH__ >= 900
-        const bool use_input_tile = (cfg.input_bits <= INPUT_BITS_MAX);
-        if (use_input_tile) {
-            auto tb = cg::this_thread_block();
-            cg::memcpy_async(tb, s_input_tile,
-                             inputs + inp_off,
-                             cfg.input_bits);
-            cg::wait(tb);
-            cluster.sync();
-        }
-#else
-        const bool use_input_tile = false;
-#endif
-        // =========================================================
-        // STAGE A: Spatial Pooler
-        //
-        // Hot per-column state (boost, inhibition_threshold,
-        // active_duty) is served from cluster DSMEM rather than
-        // GMEM for each of the T timesteps.  GMEM is written on
-        // update so state persists across forward calls.
-        // =========================================================
-        for (unsigned int c = col_lo; c < col_hi; c++) {
-            unsigned int base = c * S;
-            unsigned int local = 0u;
-            for (unsigned int s = lane; s < S; s += 32u) {
-                unsigned int b = syn_bit[base + s];
-                float p = syn_perm[base + s];
-                // T9: read from cluster-broadcast tile when available;
-                // fall back to direct GMEM when input_bits > INPUT_BITS_MAX.
-#if __CUDA_ARCH__ >= 900
-                unsigned int inp_byte = use_input_tile
-                    ? (unsigned int)s_input_tile[b]
-                    : (unsigned int)inputs[inp_off + b];
-#else
-                unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
-#endif
-                unsigned int hit = ((inp_byte != 0u) && (p >= cfg.conn_thr)) ? 1u : 0u;
-                local += hit;
-            }
-            unsigned int overlap = warp_sum_u32(local);
-            overlap = __shfl_sync(0xffffffffu, overlap, 0);
-            // Read boost + threshold for column c.
-#if __CUDA_ARCH__ >= 900
-            // Hopper: read from cluster-distributed shared memory.
-            const unsigned int owner_block  = c / cols_per_block;
-            const unsigned int owner_offset = c - owner_block * cols_per_block;
-            float boost_val = cluster.map_shared_rank(s_boost,      owner_block)[owner_offset];
-            float thr       = cluster.map_shared_rank(s_inhib_thr,  owner_block)[owner_offset];
-#else
-            // Pre-Hopper: read directly from global memory.
-            float boost_val = boost[c];
-            float thr       = inhibition_threshold[c];
-#endif
-            float boosted = (float)overlap * boost_val;
-            unsigned int is_active = (boosted > thr) ? 1u : 0u;
-            if (lane == 0) {
-                cols_out[col_base_out + c] = (unsigned char)is_active;
-                if (is_active) {
-                    atomicAdd(&step_scratch[0], 1u);
-                }
-            }
-            // SP learn (Hebbian) on active columns.
-            // T9: use tile for input reads here too.
-            if (cfg.learn && is_active) {
-                for (unsigned int s = lane; s < S; s += 32u) {
-                    unsigned int b = syn_bit[base + s];
-                    float p = syn_perm[base + s];
-#if __CUDA_ARCH__ >= 900
-                    unsigned int inp_byte = use_input_tile
-                        ? (unsigned int)s_input_tile[b]
-                        : (unsigned int)inputs[inp_off + b];
-#else
-                    unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
-#endif
-                    if (inp_byte != 0u) {
-                        p += cfg.sp_inc;
-                        if (p > 1.0f) p = 1.0f;
-                    } else {
-                        p -= cfg.sp_dec;
-                        if (p < 0.0f) p = 0.0f;
-                    }
-                    syn_perm[base + s] = p;
-                }
-            }
-            // active_duty EMA + threshold adaptation.
-            // Writes go to both DSMEM (hot path, Hopper only) and GMEM (persistence).
-            if (lane == 0) {
-#if __CUDA_ARCH__ >= 900
-                float ad = cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset];
-#else
-                float ad = active_duty[c];
-#endif
-                float sample = is_active ? 1.0f : 0.0f;
-                ad = (1.0f - cfg.duty_alpha) * ad + cfg.duty_alpha * sample;
-#if __CUDA_ARCH__ >= 900
-                // Writeback: peer smem (for next timestep read) + GMEM (persistence).
-                cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad;
-#endif
-                active_duty[c] = ad;
-                // Threshold steers toward target sparsity.
-                float err = ad - cfg.sparsity_target;
-                float new_thr = thr + cfg.thr_adapt_rate * err * 100.0f;
-                if (new_thr < 0.1f) new_thr = 0.1f;
-                if (new_thr > 1000.0f) new_thr = 1000.0f;
-#if __CUDA_ARCH__ >= 900
-                // Writeback: peer smem (for next timestep read) + GMEM (persistence).
-                cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] = new_thr;
-#endif
-                inhibition_threshold[c] = new_thr;
-            }
-        }
-        // ---- DSMEM WRITEBACK SYNC: peer-smem writes must be visible cluster-wide ----
-        //
-        // On Hopper: cluster.sync() ensures all peer smem writes from this
-        // timestep are visible to all blocks before Stage B / next t.
-        // On pre-Hopper: no smem peer writes occur (all state in GMEM),
-        // so no extra sync needed here — the grid barrier below suffices.
-#if __CUDA_ARCH__ >= 900
-        cluster.sync();
-#endif
-        // ---- BARRIER 2: SP active_mask must be visible before TM reads ----
-        // Fence: flush cols_out + active_duty + inhibition_threshold + step_scratch
-        // writes to global memory before peers advance past this barrier.
-        __threadfence();
-        fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);
-        // =========================================================
-        // STAGE B: Temporal Memory
-        // =========================================================
-        for (unsigned int c = col_lo; c < col_hi; c++) {
-            unsigned int col_active = cols_out[col_base_out + c];
-            if (col_active == 0u) continue;
-            unsigned int base_cell = c * cpc;
-            unsigned int any_predicted = 0u;
-            unsigned int best_seg_id_for_grow = 0xFFFFFFFFu;
-            unsigned int best_pot_count = 0u;
-            for (unsigned int k = 0u; k < cpc; k++) {
-                unsigned int cell = base_cell + k;
-                unsigned int n_segs_here = cell_seg_count[cell];
-                if (n_segs_here > MSC) n_segs_here = MSC;
-                if (n_segs_here == 0u) continue;
-                unsigned int seg_base_id = cell * MSC;
-                unsigned int cell_is_predictive = 0u;
-                for (unsigned int ls = 0u; ls < n_segs_here; ls++) {
-                    unsigned int seg = seg_base_id + ls;
-                    unsigned int n_syn = seg_syn_count[seg];
-                    if (n_syn == 0u) continue;
-                    unsigned int syn_base = seg * SPS;
-                    unsigned int l_conn = 0u;
-                    unsigned int l_pot  = 0u;
-                    for (unsigned int s = lane; s < n_syn; s += 32u) {
-                        unsigned int presyn = syn_presyn[syn_base + s];
-                        unsigned int w = prev_active[presyn >> 5];
-                        unsigned int bit = (w >> (presyn & 31u)) & 1u;
-                        if (bit) {
-                            l_pot += 1u;
-                            int p = (int)tm_syn_perm[syn_base + s];
-                            if (p >= cfg.conn_thr_i16) l_conn += 1u;
-                        }
-                    }
-                    unsigned int tot_conn = warp_sum_u32(l_conn);
-                    unsigned int tot_pot  = warp_sum_u32(l_pot);
-                    tot_conn = __shfl_sync(0xffffffffu, tot_conn, 0);
-                    tot_pot  = __shfl_sync(0xffffffffu, tot_pot, 0);
-                    if (tot_conn >= cfg.activation_threshold) cell_is_predictive = 1u;
-                    if (tot_pot >= cfg.learning_threshold && tot_pot > best_pot_count) {
-                        best_pot_count = tot_pot;
-                        best_seg_id_for_grow = seg;
-                    }
-                    // Reinforce predicted-and-correct segment.
-                    if (cfg.learn && tot_conn >= cfg.activation_threshold) {
-                        for (unsigned int s = lane; s < n_syn; s += 32u) {
-                            unsigned int presyn = syn_presyn[syn_base + s];
-                            unsigned int w = prev_active[presyn >> 5];
-                            unsigned int bit = (w >> (presyn & 31u)) & 1u;
-                            int p = (int)tm_syn_perm[syn_base + s];
-                            if (bit) {
-                                int np = p + cfg.perm_inc_i16;
-                                if (np > 32767) np = 32767;
-                                tm_syn_perm[syn_base + s] = (short)np;
-                            } else {
-                                int np = p - cfg.perm_dec_i16;
-                                if (np < 0) np = 0;
-                                tm_syn_perm[syn_base + s] = (short)np;
-                            }
-                        }
-                    }
-                }
-                if (cell_is_predictive) {
-                    any_predicted = 1u;
-                    if (lane == 0) {
-                        unsigned int w = cell >> 5;
-                        unsigned int m = 1u << (cell & 31u);
-                        atomicOr(&curr_active[w], m);
-                        atomicOr(&curr_winner[w], m);
-                    }
-                }
-            }
-            // BURST if no predicted.
-            if (!any_predicted) {
-                if (lane == 0) {
-                    for (unsigned int k = 0u; k < cpc; k++) {
-                        unsigned int cell = base_cell + k;
-                        unsigned int w = cell >> 5;
-                        unsigned int m = 1u << (cell & 31u);
-                        atomicOr(&curr_active[w], m);
-                    }
-                    unsigned int win = base_cell;
-                    unsigned int ww = win >> 5;
-                    unsigned int wm = 1u << (win & 31u);
-                    atomicOr(&curr_winner[ww], wm);
-                    atomicAdd(&step_scratch[1], 1u);
-                }
-                if (cfg.learn) {
-                    unsigned int target_seg;
-                    unsigned int existing_syn;
-                    if (best_seg_id_for_grow != 0xFFFFFFFFu) {
-                        // Reuse best matching segment.
-                        target_seg = best_seg_id_for_grow;
-                        existing_syn = seg_syn_count[target_seg];
-                        target_seg = __shfl_sync(0xffffffffu, target_seg, 0);
-                        existing_syn = __shfl_sync(0xffffffffu, existing_syn, 0);
-                        // Reinforce its existing synapses.
-                        unsigned int syn_base = target_seg * SPS;
-                        for (unsigned int s = lane; s < existing_syn; s += 32u) {
-                            unsigned int presyn = syn_presyn[syn_base + s];
-                            unsigned int w = prev_active[presyn >> 5];
-                            unsigned int bit = (w >> (presyn & 31u)) & 1u;
-                            int p = (int)tm_syn_perm[syn_base + s];
-                            if (bit) {
-                                int np = p + cfg.perm_inc_i16;
-                                if (np > 32767) np = 32767;
-                                tm_syn_perm[syn_base + s] = (short)np;
-                            } else {
-                                int np = p - cfg.perm_dec_i16;
-                                if (np < 0) np = 0;
-                                tm_syn_perm[syn_base + s] = (short)np;
-                            }
-                        }
-                    } else {
-                        // Allocate new segment on winner cell (cell 0 of col).
-                        unsigned int new_seg = 0u;
-                        if (lane == 0) {
-                            unsigned int winner_cell = base_cell;
-                            unsigned int slot = atomicAdd(&cell_seg_count[winner_cell], 1u);
-                            if (slot >= MSC) slot = slot % MSC;
-                            new_seg = winner_cell * MSC + slot;
-                            seg_cell_id[new_seg] = winner_cell;
-                            seg_syn_count[new_seg] = 0u;
-                        }
-                        target_seg = __shfl_sync(0xffffffffu, new_seg, 0);
-                        existing_syn = 0u;
-                    }
-                    // Grow synapses to prev_winner cells — lane 0 serialized.
-                    unsigned int room = (SPS > existing_syn) ? (SPS - existing_syn) : 0u;
-                    unsigned int max_grow = (cfg.max_new_synapses < room) ? cfg.max_new_synapses : room;
-                    if (lane == 0 && max_grow > 0u) {
-                        unsigned int syn_base = target_seg * SPS;
-                        unsigned int grown = 0u;
-                        unsigned int start_off = (c * 2654435761u + cfg.iter_seed + t) % cfg.bits_words;
-                        for (unsigned int w_off = 0u;
-                             w_off < cfg.bits_words && grown < max_grow;
-                             w_off++) {
-                            unsigned int widx = (start_off + w_off) % cfg.bits_words;
-                            unsigned int word = prev_winner[widx];
-                            while (word != 0u && grown < max_grow) {
-                                unsigned int bit_pos = __ffs(word) - 1u;
-                                word &= ~(1u << bit_pos);
-                                unsigned int cell_id = widx * 32u + bit_pos;
-                                if (cell_id >= cfg.n_cells) continue;
-                                bool exists = false;
-                                for (unsigned int es = 0u; es < existing_syn + grown; es++) {
-                                    if (syn_presyn[syn_base + es] == cell_id) { exists = true; break; }
-                                }
-                                if (exists) continue;
-                                unsigned int write_idx = existing_syn + grown;
-                                if (write_idx >= SPS) break;
-                                syn_presyn[syn_base + write_idx] = cell_id;
-                                tm_syn_perm[syn_base + write_idx] = (short)cfg.initial_perm_i16;
-                                grown++;
-                            }
-                        }
-                        if (grown > 0u) {
-                            seg_syn_count[target_seg] = existing_syn + grown;
-                        }
-                    }
-                }
-            }
-        }
-        // ---- BARRIER 3: TM writes complete before anomaly + next-step read ----
-        // Fence: flush curr_active/curr_winner bitsets + tm_syn_perm +
-        // seg_syn_count + syn_presyn before peers advance and consume them as
-        // prev_active/prev_winner at t+1.
-        __threadfence();
-        fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);
-        // Write anomaly for step t.
-        if (blockIdx.x == 0u && tid == 0u) {
-            unsigned int total = step_scratch[0];
-            unsigned int bad   = step_scratch[1];
-            float anom = (total > 0u) ? ((float)bad / (float)total) : 0.0f;
-            anom_out[t] = anom;
-        }
-    }
-}
-// Single-region kernel (legacy call site): forwards its two by-value structs to htm_fused_step_body.
-__global__ __launch_bounds__(256, 2)  // at most 256 threads per block; requests >= 2 resident blocks per SM
-void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
-    htm_fused_step_body(P, cfg);
-}
-// Batched kernel: one cooperative launch for B regions. grid.y = B,
-// grid.x = per-region block count. Each block reads its region's
-// FusedPtrs from the device array via blockIdx.y, so P_arr needs at least
-// gridDim.y entries. The same cfg is applied to every region.
-__global__ __launch_bounds__(256, 2)  // at most 256 threads per block; requests >= 2 resident blocks per SM
-void htm_fused_step_batched(const FusedPtrs* __restrict__ P_arr, FusedConfig cfg) {
-    const FusedPtrs P = P_arr[blockIdx.y];  // local copy of this block's region descriptor
-    htm_fused_step_body(P, cfg);
-}
-} // extern "C"

+// Fused HTM megakernel — SP + TM, all T timesteps in a single launch.
+//
+// Design rationale:
+//   - Global top-K column selection requires cross-block synchronization at
+//     every timestep (grid.sync is unreliable on WSL2/sm_86 without rdc=true).
+//   - Replace with per-column threshold activation using local lateral
+//     inhibition: column c activates if overlap[c]*boost[c] > threshold[c].
+//     Threshold is a per-column running-EMA learned scalar that steers the
+//     column's long-run activation rate toward the global sparsity target.
+//   - This is biologically grounded (GABAergic local inhibition) and supported
+//     by HTM theory (duty-cycle boost already drives this loop; we just
+//     change which lever the EMA pulls).
+//
+// Launch shape:
+//   grid  = min(device SM count, 16)  // hard cap — see below
+//   block = 1024 threads = 32 warps
+//   Each warp of 32 owns a contiguous column slice (n_columns / total_warps).
+//
+// Cross-block coherence:
+//   - Ping-pong buffers for cell_active/cell_winner: write _a at even t,
+//     read _b; reversed at odd t.
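+//   - sm_90+ builds (HTM_DISABLE_CLUSTER undefined): hardware cluster barrier
+//     via cg::this_cluster().sync().
+//   - All other builds: cooperative launch + cg::grid_group::sync(). The
+//     former software 3-slot rotating grid barrier is no longer present in
+//     fused_grid_barrier, so a cooperative launch is required on this path.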
+//
+// 2026-04-16: grid_dim reduced from 28 to 16 after deadlock RCA. The previous
+// cap of 28 relied on all blocks being concurrently resident on a 30-SM RTX
+// 3060 Laptop. Under thermal throttling effective residency dropped to ~20-24,
+// leaving scheduled blocks spinning on the software grid barrier waiting for
+// peer blocks that would never run. 16 blocks is below any realistic residency
+// floor and preserves enough warp parallelism (16*32 = 512 warps) to saturate
+// memory bandwidth on the spatial-pooler stage.
+//
+// Kernel signature uses struct-by-value for pointers and config to stay
+// inside cudarc's launch-arg count limit.
+#include <cooperative_groups.h>
+#include <cooperative_groups/memcpy_async.h>
+namespace cg = cooperative_groups;
+// Maximum columns owned per cluster-block in DSMEM.
+// Supports n_columns up to COLS_PER_CLUSTER_BLOCK_MAX * cluster_size.
+// At cluster_size=16: supports up to 256*16=4096 columns.
+// Each array costs 256*4 = 1024 bytes; three arrays = 3072 bytes per SM —
+// well under the 228 KB H200 shared-memory cap.
+#define COLS_PER_CLUSTER_BLOCK_MAX 256u
+// Maximum input_bits supported by the TMA-multicast staging tile.
+// At 32 KB this covers the production SDR width (16384 bits) with 2× headroom.
+// Total shared per SM: 32768 (tile) + 3072 (DSMEM float arrays) = ~35 KB —
+// well under the 228 KB H200 limit.
+//
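+// Expected speedup from TMA multicast input staging (T9/T11), as estimated by
+// the original author (not measured here):
+//   - Without staging: 16 SMs × T × (input_bits GMEM reads per timestep)
+//   - With staging:    1 TMA DMA per timestep, reads served from shared memory thereafter
+//   - Theoretical DRAM bandwidth reduction: ~16× on input reads
+//   - Wall-clock reduction estimate: -20 to -40 ms from reduced input fetch latency
+// NOTE(review): the staging call in htm_fused_step_body passes
+// cg::this_thread_block() to cg::memcpy_async, so every block issues its own
+// copy of the input slice. Confirm that this really results in a single
+// multicast transfer before relying on the ~16× figure.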
+#define INPUT_BITS_MAX 32768u
+extern "C" {
+struct FusedPtrs {  // Device addresses carried as 64-bit integers; htm_fused_step_body casts each one to the pointer type noted per field.
+    unsigned long long syn_bit;  // const unsigned int*: input index of SP synapse s of column c, at [c * synapses_per_col + s]
+    unsigned long long syn_perm;  // float*: SP permanence, same indexing as syn_bit
+    unsigned long long boost;  // float*: per-column multiplier applied to the overlap count
+    unsigned long long active_duty;  // float*: per-column running average of activation
+    unsigned long long inhibition_threshold;  // float*: per-column activation threshold, adapted every timestep
+    unsigned long long seg_cell_id;  // unsigned int*: per-segment owning cell id
+    unsigned long long seg_syn_count;  // unsigned int*: per-segment number of synapses in use
+    unsigned long long syn_presyn;  // unsigned int*: presynaptic cell id, at [seg * synapses_per_segment + s]
+    unsigned long long tm_syn_perm;  // short*: TM permanence, same indexing as syn_presyn
+    unsigned long long cell_seg_count;  // unsigned int*: per-cell segment counter (clamped to max_segments_per_cell when read)
+    unsigned long long cell_active_a;  // unsigned int*: active-cell bitset; "current" buffer on even t, "previous" on odd t
+    unsigned long long cell_active_b;  // unsigned int*: active-cell bitset; "current" buffer on odd t, "previous" on even t
+    unsigned long long cell_winner_a;  // unsigned int*: winner-cell bitset; "current" buffer on even t, "previous" on odd t
+    unsigned long long cell_winner_b;  // unsigned int*: winner-cell bitset; "current" buffer on odd t, "previous" on even t
+    unsigned long long inputs;  // const unsigned char*: one byte per input bit, timestep t starts at t * input_bits
+    unsigned long long cols_out;  // unsigned char*: 0/1 column activity, written at [t * n_columns + c]
+    unsigned long long anom_out;  // float*: the removed kernel version wrote one anomaly value per timestep; TODO confirm for this version
+    unsigned long long barrier_counters;  // unsigned int*: handed to fused_grid_barrier, which ignores it
+    unsigned long long step_scratch;  // unsigned int*: [0] = active-column count, [1] = bursting-column count for the current step
+};
+struct FusedConfig {  // Scalar parameters passed by value to the fused kernels alongside FusedPtrs.
+    // SP constants
+    unsigned int input_bits;  // number of input elements per timestep; also the tile-staging size check against INPUT_BITS_MAX
+    unsigned int n_columns;  // number of columns; split across warps and used as the per-timestep stride of cols_out
+    unsigned int synapses_per_col;  // synapses per column; row stride of syn_bit / syn_perm
+    float        conn_thr;  // a synapse adds to overlap when its input is nonzero and permanence >= conn_thr
+    float        sp_inc;  // learning: added to permanence of synapses on nonzero inputs (clamped to 1.0)
+    float        sp_dec;  // learning: subtracted from permanence of synapses on zero inputs (clamped to 0.0)
+    float        sparsity_target;  // value the per-column active_duty average is steered toward
+    float        duty_alpha;  // weight of the newest sample in the active_duty running average
+    float        thr_adapt_rate;  // threshold step = thr_adapt_rate * (active_duty - sparsity_target) * 100
+    // TM constants
+    unsigned int cells_per_column;  // cells per column; the value 32 enables the one-word-per-column clear path
+    unsigned int n_cells;  // not referenced in the visible part of the body; removed version used it to bound cell ids — TODO confirm
+    unsigned int bits_words;  // not referenced in the visible part of the body; removed version: word count of the winner bitset scan — TODO confirm
+    unsigned int max_segments_per_cell;  // segment slots per cell; segment id = cell * max_segments_per_cell + slot
+    unsigned int synapses_per_segment;  // synapse slots per segment; first synapse of a segment is at seg * synapses_per_segment
+    unsigned int activation_threshold;  // connected active-synapse count at which a segment marks its cell predictive
+    unsigned int learning_threshold;  // minimum active-presynaptic count for a segment to be kept as the growth candidate
+    unsigned int max_new_synapses;  // not referenced in the visible part of the body; removed version: growth cap per burst — TODO confirm
+    int          conn_thr_i16;  // TM synapse counts as connected when its stored permanence >= this value
+    int          perm_inc_i16;  // TM reinforcement increment (result clamped to 32767)
+    int          perm_dec_i16;  // TM reinforcement decrement (result clamped to 0)
+    int          predicted_seg_dec_i16;  // not referenced in the visible code; purpose not confirmed from here
+    int          initial_perm_i16;  // not referenced in the visible part of the body; removed version: permanence of newly grown synapses — TODO confirm
+    // Loop constants
+    unsigned int T;  // number of timesteps processed by one launch
+    unsigned int learn;  // nonzero enables the SP and TM permanence updates
+    unsigned int iter_seed;  // not referenced in the visible part of the body; removed version mixed it into the growth scan start — TODO confirm
+    unsigned int cooperative_grid_sync;  // forwarded to fused_grid_barrier, which ignores it
+};
+// Cross-block barrier used between the per-timestep phases of the fused kernel.
+//
+// The primitive is selected at compile time:
+//   - Device code built for sm_90 or newer with HTM_DISABLE_CLUSTER undefined:
+//     cg::this_cluster().sync(). This waits only for the blocks of the calling
+//     thread-block cluster; the `grid` argument is not touched.
+//   - Every other build: grid.sync() on the supplied grid group, which is only
+//     valid when the kernel was launched cooperatively.
+//
+// The four trailing parameters (flags, expected, phase, cooperative_grid_sync)
+// are left over from the earlier software spin barrier. They are accepted so
+// existing call sites keep compiling, and they are ignored.
+//
+// Author's figures, not re-measured here: the cluster barrier resolves in
+// roughly 10-40 ns, and 8 clusters x 16 SMs = 128 fits within the 132 SMs of
+// an H200, so one cluster per HTM region can run concurrently.
+__device__ static inline void fused_grid_barrier(cg::grid_group grid,
+                                                 unsigned int * /* flags: ignored */,
+                                                 unsigned int   /* expected: ignored */,
+                                                 unsigned int   /* phase: ignored */,
+                                                 unsigned int   /* cooperative_grid_sync: ignored */) {
+#if defined(HTM_DISABLE_CLUSTER) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 900)
+    // Pre-Hopper targets (sm_80, sm_86, sm_89) or cluster support disabled at
+    // build time: whole-grid cooperative barrier.
+    grid.sync();
+#else
+    // Hopper and newer: barrier across the calling cluster only.
+    cg::this_cluster().sync();
+#endif
+}
+// Adds up `v` over the 32 lanes of the calling warp using a shuffle-down tree
+// (offsets 16, 8, 4, 2, 1). Every lane of the warp must call this together,
+// because the shuffles use the full 0xffffffff mask.
+// Only lane 0 is guaranteed to end up with the complete sum; callers that need
+// it in every lane re-broadcast lane 0's value with __shfl_sync afterwards.
+__device__ static inline unsigned int warp_sum_u32(unsigned int v) {
+    const unsigned int full_warp = 0xffffffffu;
+    int stride = 16;
+    while (stride > 0) {
+        v += __shfl_down_sync(full_warp, v, stride);
+        stride >>= 1;
+    }
+    return v;
+}
+// Core kernel body — works for both single-region and batched launches.
+// Single-region: caller passes the one FusedPtrs struct.
+// Batched: each block reads its region's FusedPtrs via blockIdx.y before
+// calling this. State is independent per region (each region owns its own
+// GPU buffers). Cross-block synchronization uses grid.sync() on pre-Hopper
+// builds, which spans ALL blocks in the grid (harmless over-sync across
+// regions), and cluster.sync() on sm_90+ builds (HTM_DISABLE_CLUSTER
+// undefined), which covers only the calling cluster.
+__device__ static inline
+void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
+    cg::grid_group grid = cg::this_grid();
+    // Cast pointers.
+    const unsigned int  * __restrict__ syn_bit               = (const unsigned int*)P.syn_bit;
+    float               * __restrict__ syn_perm              = (float*)P.syn_perm;
+    float               * __restrict__ boost                 = (float*)P.boost;
+    float               * __restrict__ active_duty           = (float*)P.active_duty;
+    float               * __restrict__ inhibition_threshold  = (float*)P.inhibition_threshold;
+    unsigned int        * __restrict__ seg_cell_id           = (unsigned int*)P.seg_cell_id;
+    unsigned int        * __restrict__ seg_syn_count         = (unsigned int*)P.seg_syn_count;
+    unsigned int        * __restrict__ syn_presyn            = (unsigned int*)P.syn_presyn;
+    short               * __restrict__ tm_syn_perm           = (short*)P.tm_syn_perm;
+    unsigned int        * __restrict__ cell_seg_count        = (unsigned int*)P.cell_seg_count;
+    unsigned int        * __restrict__ cell_active_a         = (unsigned int*)P.cell_active_a;
+    unsigned int        * __restrict__ cell_active_b         = (unsigned int*)P.cell_active_b;
+    unsigned int        * __restrict__ cell_winner_a         = (unsigned int*)P.cell_winner_a;
+    unsigned int        * __restrict__ cell_winner_b         = (unsigned int*)P.cell_winner_b;
+    const unsigned char * __restrict__ inputs                = (const unsigned char*)P.inputs;
+    unsigned char       * __restrict__ cols_out              = (unsigned char*)P.cols_out;
+    float               * __restrict__ anom_out              = (float*)P.anom_out;
+    unsigned int        * __restrict__ barrier_counters      = (unsigned int*)P.barrier_counters;
+    unsigned int        * __restrict__ step_scratch          = (unsigned int*)P.step_scratch;
+    const unsigned int tid     = threadIdx.x;
+    const unsigned int lane    = tid & 31u;
+    const unsigned int warp    = tid >> 5;
+    const unsigned int warps_per_block = blockDim.x >> 5;
+    const unsigned int gwarp   = blockIdx.x * warps_per_block + warp;
+    const unsigned int n_warps = gridDim.x * warps_per_block;
+    const unsigned int n_cols  = cfg.n_columns;
+    const unsigned int col_lo  = (gwarp * n_cols) / n_warps;
+    const unsigned int col_hi  = ((gwarp + 1) * n_cols) / n_warps;
+    unsigned int phase = 0u;
+    // =========================================================
+    // DSMEM: Cluster-distributed shared memory for hot per-column
+    // state (inhibition_threshold, boost, active_duty).
+    //
+    // On Hopper (sm_90+): Each block in the cluster owns a contiguous
+    // slice of columns in its own __shared__ arrays. Any block can
+    // peer-read another block's slice via cluster.map_shared_rank().
+    //
+    // On Ampere (sm_86) and other pre-Hopper: No cluster support.
+    // Read/write directly from/to global memory (inhibition_threshold,
+    // boost, active_duty device pointers). Slightly higher latency but
+    // functionally correct.
+    // =========================================================
+#if !defined(HTM_DISABLE_CLUSTER) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    // Hopper+ cluster path
+    auto cluster = cg::this_cluster();
+    const unsigned int cluster_block_rank = cluster.block_rank();  // 0..cluster_size-1
+    const unsigned int cluster_sz         = cluster.num_blocks();  // == gridDim.x (≤16)
+#else
+    // Pre-Hopper: no cluster, each block is independent.
+    const unsigned int cluster_block_rank = blockIdx.x;
+    const unsigned int cluster_sz         = gridDim.x;
+#endif
+    // Partition n_cols evenly across cluster blocks.
+    // Each block owns cols_per_block columns starting at my_col_start.
+    const unsigned int cols_per_block =
+        (n_cols + cluster_sz - 1u) / cluster_sz;               // ceil div
+    const unsigned int my_col_start =
+        cluster_block_rank * cols_per_block;
+    const unsigned int my_col_end =
+        (my_col_start + cols_per_block < n_cols)
+            ? (my_col_start + cols_per_block) : n_cols;        // clamp
+#if !defined(HTM_DISABLE_CLUSTER) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    // Cluster-distributed shared memory arrays.
+    // Each block holds at most COLS_PER_CLUSTER_BLOCK_MAX floats per array.
+    // Peer blocks address into each other's smem via map_shared_rank.
+    __shared__ float s_inhib_thr [COLS_PER_CLUSTER_BLOCK_MAX];
+    __shared__ float s_boost     [COLS_PER_CLUSTER_BLOCK_MAX];
+    __shared__ float s_active_duty[COLS_PER_CLUSTER_BLOCK_MAX];
+#endif
+    // TMA multicast input staging tile (T9) — HOPPER ONLY.
+    //
+    // On Hopper: cg::memcpy_async with cluster scope multicasts input to all
+    // 16 SMs, reducing DRAM traffic by ~16×.
+    // On Ampere: 32 KB smem allocation exceeds per-block budget when
+    // cooperatively launched (48 KB total, registers eat the rest). Skip the
+    // tile entirely — Stage A reads from GMEM directly (original path).
+#if !defined(HTM_DISABLE_CLUSTER) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __shared__ __align__(16) unsigned char s_input_tile[INPUT_BITS_MAX];
+#endif
+#if !defined(HTM_DISABLE_CLUSTER) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    // Initial GMEM → smem load (reads state from previous forward call).
+    // Each block loads only its own slice; tid strides across the slice.
+    for (unsigned int c = my_col_start + tid; c < my_col_end; c += blockDim.x) {
+        const unsigned int off = c - my_col_start;
+        s_inhib_thr [off] = inhibition_threshold[c];
+        s_boost     [off] = boost[c];
+        s_active_duty[off] = active_duty[c];
+    }
+    // All blocks in the cluster must finish loading before any block
+    // starts reading peer smem inside the T-loop.
+    cluster.sync();
+#else
+    // Pre-Hopper: no smem caching needed — reads go directly to GMEM.
+    // Grid sync ensures all blocks have completed Phase 0 init before T-loop.
+    grid.sync();
+#endif
+    const unsigned int S   = cfg.synapses_per_col;
+    const unsigned int cpc = cfg.cells_per_column;
+    const unsigned int SPS = cfg.synapses_per_segment;
+    const unsigned int MSC = cfg.max_segments_per_cell;
+    // Main timestep loop.
+    for (unsigned int t = 0u; t < cfg.T; t++) {
+        const unsigned int inp_off      = t * cfg.input_bits;
+        const unsigned int col_base_out = t * n_cols;
+        unsigned int * curr_active = (t & 1u) ? cell_active_b : cell_active_a;
+        unsigned int * prev_active = (t & 1u) ? cell_active_a : cell_active_b;
+        unsigned int * curr_winner = (t & 1u) ? cell_winner_b : cell_winner_a;
+        unsigned int * prev_winner = (t & 1u) ? cell_winner_a : cell_winner_b;
+        // ---- Phase 0: clear curr bitsets for my cell range ----
+        const unsigned int my_cell_lo = col_lo * cpc;
+        const unsigned int my_cell_hi = col_hi * cpc;
+        if (cpc == 32u) {
+            // Fast path: one word per column.
+            for (unsigned int c = col_lo + lane; c < col_hi; c += 32u) {
+                curr_active[c] = 0u;
+                curr_winner[c] = 0u;
+            }
+        } else {
+            for (unsigned int cell = my_cell_lo + lane; cell < my_cell_hi; cell += 32u) {
+                unsigned int w = cell >> 5;
+                unsigned int m = 1u << (cell & 31u);
+                atomicAnd(&curr_active[w], ~m);
+                atomicAnd(&curr_winner[w], ~m);
+            }
+        }
+        // Block 0, lane 0, warp 0 resets step-scratch counters.
+        if (blockIdx.x == 0u && tid == 0u) {
+            step_scratch[0] = 0u;
+            step_scratch[1] = 0u;
+        }
+        // ---- BARRIER 1 ----
+        // Fence: make the above clear-bitsets + scratch writes globally
+        // visible before peer blocks observe "barrier arrived".
+        __threadfence();
+        fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);
+        // =========================================================
+        // T9: TMA MULTICAST INPUT STAGING
+        //
+        // Issue a single cluster-scope async DMA to broadcast this
+        // timestep's input slice into s_input_tile across all 16 SMs
+        // in the cluster simultaneously.  On Hopper sm_90a,
+        // cg::memcpy_async with cluster scope maps to the TMA
+        // hardware unit (cp.async.bulk.tensor multicast), reducing
+        // DRAM input traffic by ~16× vs each block fetching its own
+        // copy from GMEM.
+        //
+        // The staging is gated on cfg.input_bits <= INPUT_BITS_MAX.
+        // If the tile is too small (custom large input_bits), we fall
+        // back to per-thread GMEM reads in Stage A (identical to the
+        // original path; use_input_tile==false).
+        //
+        // Ordering: BARRIER 1 completes before we issue the DMA.
+        // The DMA completes before Stage A reads s_input_tile.
+        // =========================================================
+#if !defined(HTM_DISABLE_CLUSTER) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+        const bool use_input_tile = (cfg.input_bits <= INPUT_BITS_MAX);
+        if (use_input_tile) {
+            auto tb = cg::this_thread_block();
+            cg::memcpy_async(tb, s_input_tile,
+                             inputs + inp_off,
+                             cfg.input_bits);
+            cg::wait(tb);
+            cluster.sync();
+        }
+#else
+        const bool use_input_tile = false;
+#endif
+        // =========================================================
+        // STAGE A: Spatial Pooler
+        //
+        // Hot per-column state (boost, inhibition_threshold,
+        // active_duty) is served from cluster DSMEM rather than
+        // GMEM for each of the T timesteps.  GMEM is written on
+        // update so state persists across forward calls.
+        // =========================================================
+        for (unsigned int c = col_lo; c < col_hi; c++) {
+            unsigned int base = c * S;
+            unsigned int local = 0u;
+            for (unsigned int s = lane; s < S; s += 32u) {
+                unsigned int b = syn_bit[base + s];
+                float p = syn_perm[base + s];
+                // T9: read from cluster-broadcast tile when available;
+                // fall back to direct GMEM when input_bits > INPUT_BITS_MAX.
+#if !defined(HTM_DISABLE_CLUSTER) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+                unsigned int inp_byte = use_input_tile
+                    ? (unsigned int)s_input_tile[b]
+                    : (unsigned int)inputs[inp_off + b];
+#else
+                unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
+#endif
+                unsigned int hit = ((inp_byte != 0u) && (p >= cfg.conn_thr)) ? 1u : 0u;
+                local += hit;
+            }
+            unsigned int overlap = warp_sum_u32(local);
+            overlap = __shfl_sync(0xffffffffu, overlap, 0);
+            // Read boost + threshold for column c.
+#if !defined(HTM_DISABLE_CLUSTER) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+            // Hopper: read from cluster-distributed shared memory.
+            const unsigned int owner_block  = c / cols_per_block;
+            const unsigned int owner_offset = c - owner_block * cols_per_block;
+            float boost_val = cluster.map_shared_rank(s_boost,      owner_block)[owner_offset];
+            float thr       = cluster.map_shared_rank(s_inhib_thr,  owner_block)[owner_offset];
+#else
+            // Pre-Hopper: read directly from global memory.
+            float boost_val = boost[c];
+            float thr       = inhibition_threshold[c];
+#endif
+            float boosted = (float)overlap * boost_val;
+            unsigned int is_active = (boosted > thr) ? 1u : 0u;
+            if (lane == 0) {
+                cols_out[col_base_out + c] = (unsigned char)is_active;
+                if (is_active) {
+                    atomicAdd(&step_scratch[0], 1u);
+                }
+            }
+            // SP learn (Hebbian) on active columns.
+            // T9: use tile for input reads here too.
+            if (cfg.learn && is_active) {
+                for (unsigned int s = lane; s < S; s += 32u) {
+                    unsigned int b = syn_bit[base + s];
+                    float p = syn_perm[base + s];
+#if !defined(HTM_DISABLE_CLUSTER) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+                    unsigned int inp_byte = use_input_tile
+                        ? (unsigned int)s_input_tile[b]
+                        : (unsigned int)inputs[inp_off + b];
+#else
+                    unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
+#endif
+                    if (inp_byte != 0u) {
+                        p += cfg.sp_inc;
+                        if (p > 1.0f) p = 1.0f;
+                    } else {
+                        p -= cfg.sp_dec;
+                        if (p < 0.0f) p = 0.0f;
+                    }
+                    syn_perm[base + s] = p;
+                }
+            }
+            // active_duty EMA + threshold adaptation.
+            // Writes go to both DSMEM (hot path, Hopper only) and GMEM (persistence).
+            if (lane == 0) {
+#if !defined(HTM_DISABLE_CLUSTER) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+                float ad = cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset];
+#else
+                float ad = active_duty[c];
+#endif
+                float sample = is_active ? 1.0f : 0.0f;
+                ad = (1.0f - cfg.duty_alpha) * ad + cfg.duty_alpha * sample;
+#if !defined(HTM_DISABLE_CLUSTER) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+                // Writeback: peer smem (for next timestep read) + GMEM (persistence).
+                cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad;
+#endif
+                active_duty[c] = ad;
+                // Threshold steers toward target sparsity.
+                float err = ad - cfg.sparsity_target;
+                float new_thr = thr + cfg.thr_adapt_rate * err * 100.0f;
+                if (new_thr < 0.1f) new_thr = 0.1f;
+                if (new_thr > 1000.0f) new_thr = 1000.0f;
+#if !defined(HTM_DISABLE_CLUSTER) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+                // Writeback: peer smem (for next timestep read) + GMEM (persistence).
+                cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] = new_thr;
+#endif
+                inhibition_threshold[c] = new_thr;
+            }
+        }
+        // ---- DSMEM WRITEBACK SYNC: peer-smem writes must be visible cluster-wide ----
+        //
+        // On Hopper: cluster.sync() ensures all peer smem writes from this
+        // timestep are visible to all blocks before Stage B / next t.
+        // On pre-Hopper: no smem peer writes occur (all state in GMEM),
+        // so no extra sync needed here — the grid barrier below suffices.
+#if !defined(HTM_DISABLE_CLUSTER) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+        cluster.sync();
+#endif
+        // ---- BARRIER 2: SP active_mask must be visible before TM reads ----
+        // Fence: flush cols_out + active_duty + inhibition_threshold + step_scratch
+        // writes to global memory before peers advance past this barrier.
+        __threadfence();
+        fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);
+        // =========================================================
+        // STAGE B: Temporal Memory
+        // =========================================================
+        for (unsigned int c = col_lo; c < col_hi; c++) {
+            unsigned int col_active = cols_out[col_base_out + c];
+            if (col_active == 0u) continue;
+            unsigned int base_cell = c * cpc;
+            unsigned int any_predicted = 0u;
+            unsigned int best_seg_id_for_grow = 0xFFFFFFFFu;
+            unsigned int best_pot_count = 0u;
+            for (unsigned int k = 0u; k < cpc; k++) {
+                unsigned int cell = base_cell + k;
+                unsigned int n_segs_here = cell_seg_count[cell];
+                if (n_segs_here > MSC) n_segs_here = MSC;
+                if (n_segs_here == 0u) continue;
+                unsigned int seg_base_id = cell * MSC;
+                unsigned int cell_is_predictive = 0u;
+                for (unsigned int ls = 0u; ls < n_segs_here; ls++) {
+                    unsigned int seg = seg_base_id + ls;
+                    unsigned int n_syn = seg_syn_count[seg];
+                    if (n_syn == 0u) continue;
+                    unsigned int syn_base = seg * SPS;
+                    unsigned int l_conn = 0u;
+                    unsigned int l_pot  = 0u;
+                    for (unsigned int s = lane; s < n_syn; s += 32u) {
+                        unsigned int presyn = syn_presyn[syn_base + s];
+                        unsigned int w = prev_active[presyn >> 5];
+                        unsigned int bit = (w >> (presyn & 31u)) & 1u;
+                        if (bit) {
+                            l_pot += 1u;
+                            int p = (int)tm_syn_perm[syn_base + s];
+                            if (p >= cfg.conn_thr_i16) l_conn += 1u;
+                        }
+                    }
+                    unsigned int tot_conn = warp_sum_u32(l_conn);
+                    unsigned int tot_pot  = warp_sum_u32(l_pot);
+                    tot_conn = __shfl_sync(0xffffffffu, tot_conn, 0);
+                    tot_pot  = __shfl_sync(0xffffffffu, tot_pot, 0);
+                    if (tot_conn >= cfg.activation_threshold) cell_is_predictive = 1u;
+                    if (tot_pot >= cfg.learning_threshold && tot_pot > best_pot_count) {
+                        best_pot_count = tot_pot;
+                        best_seg_id_for_grow = seg;
+                    }
+                    // Reinforce predicted-and-correct segment.
+                    if (cfg.learn && tot_conn >= cfg.activation_threshold) {
+                        for (unsigned int s = lane; s < n_syn; s += 32u) {
+                            unsigned int presyn = syn_presyn[syn_base + s];
+                            unsigned int w = prev_active[presyn >> 5];
+                            unsigned int bit = (w >> (presyn & 31u)) & 1u;
+                            int p = (int)tm_syn_perm[syn_base + s];
+                            if (bit) {
+                                int np = p + cfg.perm_inc_i16;
+                                if (np > 32767) np = 32767;
+                                tm_syn_perm[syn_base + s] = (short)np;
+                            } else {
+                                int np = p - cfg.perm_dec_i16;
+                                if (np < 0) np = 0;
+                                tm_syn_perm[syn_base + s] = (short)np;
+                            }
+                        }
+                    }
+                }
+                if (cell_is_predictive) {
+                    any_predicted = 1u;
+                    if (lane == 0) {
+                        unsigned int w = cell >> 5;
+                        unsigned int m = 1u << (cell & 31u);
+                        atomicOr(&curr_active[w], m);
+                        atomicOr(&curr_winner[w], m);
+                    }
+                }
+            }
+            // BURST if no predicted.
+            if (!any_predicted) {
+                if (lane == 0) {
+                    for (unsigned int k = 0u; k < cpc; k++) {
+                        unsigned int cell = base_cell + k;
+                        unsigned int w = cell >> 5;
+                        unsigned int m = 1u << (cell & 31u);
+                        atomicOr(&curr_active[w], m);
+                    }
+                    unsigned int win = base_cell;
+                    unsigned int ww = win >> 5;
+                    unsigned int wm = 1u << (win & 31u);
+                    atomicOr(&curr_winner[ww], wm);
+                    atomicAdd(&step_scratch[1], 1u);
+                }
+                if (cfg.learn) {
+                    unsigned int target_seg;
+                    unsigned int existing_syn;
+                    if (best_seg_id_for_grow != 0xFFFFFFFFu) {
+                        // Reuse best matching segment.
+                        target_seg = best_seg_id_for_grow;
+                        existing_syn = seg_syn_count[target_seg];
+                        target_seg = __shfl_sync(0xffffffffu, target_seg, 0);
+                        existing_syn = __shfl_sync(0xffffffffu, existing_syn, 0);
+                        // Reinforce its existing synapses.
+                        unsigned int syn_base = target_seg * SPS;
+                        for (unsigned int s = lane; s < existing_syn; s += 32u) {
+                            unsigned int presyn = syn_presyn[syn_base + s];
+                            unsigned int w = prev_active[presyn >> 5];
+                            unsigned int bit = (w >> (presyn & 31u)) & 1u;
+                            int p = (int)tm_syn_perm[syn_base + s];
+                            if (bit) {
+                                int np = p + cfg.perm_inc_i16;
+                                if (np > 32767) np = 32767;
+                                tm_syn_perm[syn_base + s] = (short)np;
+                            } else {
+                                int np = p - cfg.perm_dec_i16;
+                                if (np < 0) np = 0;
+                                tm_syn_perm[syn_base + s] = (short)np;
+                            }
+                        }
+                    } else {
+                        // Allocate new segment on winner cell (cell 0 of col).
+                        unsigned int new_seg = 0u;
+                        if (lane == 0) {
+                            unsigned int winner_cell = base_cell;
+                            unsigned int slot = atomicAdd(&cell_seg_count[winner_cell], 1u);
+                            if (slot >= MSC) slot = slot % MSC;
+                            new_seg = winner_cell * MSC + slot;
+                            seg_cell_id[new_seg] = winner_cell;
+                            seg_syn_count[new_seg] = 0u;
+                        }
+                        target_seg = __shfl_sync(0xffffffffu, new_seg, 0);
+                        existing_syn = 0u;
+                    }
+                    // Grow synapses to prev_winner cells — lane 0 serialized.
+                    unsigned int room = (SPS > existing_syn) ? (SPS - existing_syn) : 0u;
+                    unsigned int max_grow = (cfg.max_new_synapses < room) ? cfg.max_new_synapses : room;
+                    if (lane == 0 && max_grow > 0u) {
+                        unsigned int syn_base = target_seg * SPS;
+                        unsigned int grown = 0u;
+                        unsigned int start_off = (c * 2654435761u + cfg.iter_seed + t) % cfg.bits_words;
+                        for (unsigned int w_off = 0u;
+                             w_off < cfg.bits_words && grown < max_grow;
+                             w_off++) {
+                            unsigned int widx = (start_off + w_off) % cfg.bits_words;
+                            unsigned int word = prev_winner[widx];
+                            while (word != 0u && grown < max_grow) {
+                                unsigned int bit_pos = __ffs(word) - 1u;
+                                word &= ~(1u << bit_pos);
+                                unsigned int cell_id = widx * 32u + bit_pos;
+                                if (cell_id >= cfg.n_cells) continue;
+                                bool exists = false;
+                                for (unsigned int es = 0u; es < existing_syn + grown; es++) {
+                                    if (syn_presyn[syn_base + es] == cell_id) { exists = true; break; }
+                                }
+                                if (exists) continue;
+                                unsigned int write_idx = existing_syn + grown;
+                                if (write_idx >= SPS) break;
+                                syn_presyn[syn_base + write_idx] = cell_id;
+                                tm_syn_perm[syn_base + write_idx] = (short)cfg.initial_perm_i16;
+                                grown++;
+                            }
+                        }
+                        if (grown > 0u) {
+                            seg_syn_count[target_seg] = existing_syn + grown;
+                        }
+                    }
+                }
+            }
+        }
+        // ---- BARRIER 3: TM writes complete before anomaly + next-step read ----
+        // Fence: flush curr_active/curr_winner bitsets + tm_syn_perm +
+        // seg_syn_count + syn_presyn before peers advance and consume them as
+        // prev_active/prev_winner at t+1.
+        __threadfence();
+        fused_grid_barrier(grid, barrier_counters, 0u, phase++, cfg.cooperative_grid_sync);
+        // Write anomaly for step t.
+        if (blockIdx.x == 0u && tid == 0u) {
+            unsigned int total = step_scratch[0];
+            unsigned int bad   = step_scratch[1];
+            float anom = (total > 0u) ? ((float)bad / (float)total) : 0.0f;
+            anom_out[t] = anom;
+        }
+    }
+}
+// Single-region kernel (legacy call site): thin __global__ entry point that forwards P and cfg unchanged to htm_fused_step_body.
+__global__ __launch_bounds__(256, 2)  // compiled for at most 256 threads per block, asking for >= 2 resident blocks per SM
+void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
+    htm_fused_step_body(P, cfg);  // all work is delegated; the wrapper adds nothing of its own
+}
+// Batched kernel: one cooperative launch for B regions. grid.y = B,
+// grid.x = per-region block count. Each block reads its region's
+// FusedPtrs from the device array via blockIdx.y; the single cfg is shared by every region.
+__global__ __launch_bounds__(256, 2)  // same launch bounds as htm_fused_step
+void htm_fused_step_batched(const FusedPtrs* __restrict__ P_arr, FusedConfig cfg) {
+    const FusedPtrs P = P_arr[blockIdx.y];  // unchecked index: P_arr must hold at least gridDim.y entries
+    htm_fused_step_body(P, cfg);  // from here on identical to the single-region kernel
+}
+} // extern "C"

overlay/htm_rust/src/gpu/tests.rs CHANGED Viewed

@@ -1,643 +1,663 @@
-//! Parity tests: GPU SP vs CPU SP reference.
-//!
-//! With matching seeds the two should produce bit-identical active-column sets
-//! when `learn=false`, and remain bit-identical over repeated `learn=true`
-//! steps because the Hebbian update is deterministic (no RNG once initialised).
-//!
-//! Run with:  cargo test --release --features gpu
-#![cfg(test)]
-#![cfg(feature = "gpu")]
-use crate::sp::{SpatialPooler, SpatialPoolerConfig};
-use crate::gpu::sp_gpu::SpatialPoolerGpu;
-use crate::gpu::tm_gpu::TemporalMemoryGpu;
-use crate::gpu::fused::{
-    launch_fused, plan_fused_launch, FusedState,
-};
-use cudarc::driver::CudaSlice;
-use rand::{Rng, SeedableRng};
-use rand_xoshiro::Xoshiro256PlusPlus;
-/// Build a `bits`-long 0/1 byte vector with `sparsity * bits` (truncated, but
-/// at least one) randomly chosen positions set to 1.
-///
-/// The number of set positions is clamped to `bits`: a `sparsity` above 1.0
-/// yields an all-ones vector instead of looping forever, and `bits == 0`
-/// yields an empty vector instead of sampling from an empty range.
-fn make_sdr(rng: &mut Xoshiro256PlusPlus, bits: usize, sparsity: f32) -> Vec<u8> {
-    let on = ((sparsity * bits as f32) as usize).max(1).min(bits);
-    let mut v = vec![0u8; bits];
-    let mut placed = 0;
-    while placed < on {
-        // Rejection sampling: redraw whenever the position is already set.
-        let i = rng.gen_range(0..bits);
-        if v[i] == 0 {
-            v[i] = 1;
-            placed += 1;
-        }
-    }
-    v
-}
-/// Inference-only parity: with identical seeds and `learn=false`, the GPU
-/// spatial pooler in strict-parity mode must return exactly the CPU pooler's
-/// active columns for each of 20 random SDRs.
-#[test]
-fn gpu_sp_matches_cpu_no_learn() {
-    let input_bits = SpatialPoolerConfig::default().input_bits;
-    // Two CPU poolers from the same seed: one is the reference, the other only
-    // provides the initial state for the GPU pooler.
-    let mut reference = SpatialPooler::new(SpatialPoolerConfig::default(), 1234);
-    let gpu_source = SpatialPooler::new(SpatialPoolerConfig::default(), 1234);
-    let mut device_sp = SpatialPoolerGpu::from_cpu(&gpu_source)
-        .expect("gpu init (CUDA device available)");
-    device_sp.set_strict_parity(true);
-    let mut sdr_rng = Xoshiro256PlusPlus::seed_from_u64(99);
-    for step in 0..20 {
-        let sdr_bytes = make_sdr(&mut sdr_rng, input_bits, 0.02);
-        let sdr_flags: Vec<bool> = sdr_bytes.iter().map(|&byte| byte != 0).collect();
-        let from_cpu: Vec<u32> = reference.compute(&sdr_flags, false);
-        let from_gpu: Vec<u32> = device_sp.compute(&sdr_bytes, false).expect("gpu compute");
-        assert_eq!(
-            from_cpu, from_gpu,
-            "mismatch at step {step}: len cpu={} gpu={}",
-            from_cpu.len(), from_gpu.len()
-        );
-    }
-}
-/// Learning parity: with identical seeds and `learn=true`, the GPU spatial
-/// pooler in strict-parity mode must keep returning exactly the CPU pooler's
-/// active columns across 50 consecutive learning steps.
-#[test]
-fn gpu_sp_matches_cpu_with_learn() {
-    let input_bits = SpatialPoolerConfig::default().input_bits;
-    // Same seed for the reference pooler and for the pooler that seeds the GPU.
-    let mut reference = SpatialPooler::new(SpatialPoolerConfig::default(), 5678);
-    let gpu_source = SpatialPooler::new(SpatialPoolerConfig::default(), 5678);
-    let mut device_sp = SpatialPoolerGpu::from_cpu(&gpu_source).expect("gpu init");
-    device_sp.set_strict_parity(true);
-    let mut sdr_rng = Xoshiro256PlusPlus::seed_from_u64(42);
-    for step in 0..50 {
-        let sdr_bytes = make_sdr(&mut sdr_rng, input_bits, 0.02);
-        let sdr_flags: Vec<bool> = sdr_bytes.iter().map(|&byte| byte != 0).collect();
-        let from_cpu = reference.compute(&sdr_flags, true);
-        let from_gpu = device_sp.compute(&sdr_bytes, true).expect("gpu compute");
-        assert_eq!(
-            from_cpu, from_gpu,
-            "mismatch at step {step} with learning"
-        );
-    }
-}
-/// Non-fused GPU pipeline (SP feeding TM through `step_batch_with_tm`): a
-/// repeating three-symbol SDR sequence should drive the anomaly score down.
-#[test]
-fn gpu_tm_anomaly_decays_on_repeating_sequence() {
-    // The pipeline is assembled directly from the GPU SP and TM components.
-    let cfg = SpatialPoolerConfig::default();
-    let bits = cfg.input_bits;
-    let n_cols = cfg.n_columns;
-    let cells_per_col = 32usize;
-    let cpu_for_gpu = SpatialPooler::new(SpatialPoolerConfig::default(), 314);
-    let mut sp = SpatialPoolerGpu::from_cpu(&cpu_for_gpu).expect("gpu init");
-    let dev = sp.dev_ref().clone();
-    let mut tm = TemporalMemoryGpu::new(dev.clone(), n_cols, cells_per_col)
-        .expect("gpu tm init");
-    tm.reset().expect("tm reset");
-    // Build 3 fixed SDRs, feed them in a repeating sequence.
-    let mut rng = Xoshiro256PlusPlus::seed_from_u64(7);
-    let make = |rng: &mut Xoshiro256PlusPlus| make_sdr(rng, bits, 0.02);
-    let seqs = [make(&mut rng), make(&mut rng), make(&mut rng)];
-    // Warm up SP so columns are stable per symbol.
-    for _ in 0..100 {
-        for s in &seqs {
-            let _ = sp.compute(s, true).expect("sp compute");
-        }
-    }
-    // Build a long input buffer: 100 repetitions of [A,B,C] = 300 steps.
-    let repeats = 100usize;
-    let t = repeats * 3;
-    let mut inputs_flat = vec![0u8; t * bits];
-    for r in 0..repeats {
-        for (i, s) in seqs.iter().enumerate() {
-            let off = (r * 3 + i) * bits;
-            inputs_flat[off..off + bits].copy_from_slice(s);
-        }
-    }
-    let inputs_dev: CudaSlice<u8> = dev.htod_sync_copy(&inputs_flat).expect("htod");
-    let mut cols_dev = dev.alloc_zeros::<u8>(t * n_cols).expect("alloc cols");
-    let mut anom_dev = dev.alloc_zeros::<f32>(t).expect("alloc anom");
-    sp.step_batch_with_tm(
-        &inputs_dev,
-        t,
-        bits,
-        true,
-        &mut cols_dev,
-        &mut anom_dev,
-        &mut tm,
-    ).expect("step_batch_with_tm");
-    let anom: Vec<f32> = dev.dtoh_sync_copy(&anom_dev).expect("d2h anom");
-    let cols: Vec<u8> = dev.dtoh_sync_copy(&cols_dev).expect("d2h cols");
-    // Active column count per step must equal k for every step.
-    let k = ((cfg.sparsity * n_cols as f32).round() as usize).max(1);
-    for ti in 0..t {
-        let step_slice = &cols[ti * n_cols..(ti + 1) * n_cols];
-        let n_on = step_slice.iter().filter(|&&b| b != 0).count();
-        assert_eq!(n_on, k, "step {ti} has {n_on} active cols, expected {k}");
-    }
-    // Early window: steps 3..9, i.e. the second and third repetition of the
-    // sequence (the very first repetition is left out of the average).
-    let early_avg: f32 = anom[3..9].iter().sum::<f32>() / 6.0;
-    // Late window: the final 9 steps (last three repetitions); the anomaly
-    // there should be noticeably lower.
-    let late_avg: f32 = anom[(t - 9)..t].iter().sum::<f32>() / 9.0;
-    eprintln!("gpu tm: early anomaly = {early_avg:.3}, late = {late_avg:.3}");
-    assert!(
-        late_avg < early_avg,
-        "GPU TM should reduce anomaly on repeating sequence: early={early_avg:.3}, late={late_avg:.3}"
-    );
-}
-/// Cluster-sync smoke test: runs `launch_fused` once on a tiny region (T=4,
-/// all-zero inputs, learn=false) and checks that it returns, that the device
-/// synchronizes afterwards, and that the outputs are sane: anomaly scores are
-/// finite and within [0, 1], and the column/anomaly buffers have the right size.
-///
-/// This is an *integration* test, not a synthetic micro-benchmark: it calls
-/// `launch_fused` directly, so a deadlock in its grid barrier would show up
-/// here as a hang (there is no in-test timeout; see the note before the launch).
-///
-/// Skips (with an eprintln and an early return) when the GPU spatial pooler,
-/// temporal memory, or fused state cannot be created, so the test passes then.
-#[test]
-fn cluster_sync_smoke_test() {
-    // Build a tiny HTM region (1024 inputs, 256 columns, 4 cells/column).
-    // Small sizes keep device memory usage minimal for a smoke test.
-    let input_bits  = 1024usize;
-    let n_columns   = 256usize;
-    let cells_per_col = 4usize;
-    // Pooler config for the tiny region; the remaining fields use the defaults.
-    // The cooperative-launch attribute is probed further down, once a device
-    // handle exists; its result is only reported and never skips the test.
-    // NOTE(review): an earlier version of this comment described a separate
-    // cluster-launch attribute probe and a pre-Hopper "DLB" barrier fallback.
-    // Neither is visible in this test; confirm against `launch_fused` whether
-    // a non-cooperative fallback still exists.
-    let make_cfg = || SpatialPoolerConfig {
-        input_bits,
-        n_columns,
-        sparsity: 0.04,  // ~10 active cols out of 256
-        ..SpatialPoolerConfig::default()
-    };
-    let cpu_ref = SpatialPooler::new(make_cfg(), 42);
-    let mut sp = match SpatialPoolerGpu::from_cpu(&cpu_ref) {
-        Ok(sp) => sp,
-        Err(e) => {
-            eprintln!("[cluster_sync_smoke_test] No GPU available ({e:?}) — skipping");
-            return;
-        }
-    };
-    let dev = sp.dev_ref().clone();
-    // Check cooperative launch support; log a message if absent but keep going.
-    let cooperative_ok = matches!(
-        dev.attribute(cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH),
-        Ok(v) if v > 0
-    );
-    if !cooperative_ok {
-        eprintln!("[cluster_sync_smoke_test] CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH=0 — DLB path only, still running test");
-        // No early return: the launch below is attempted regardless of the probe result.
-    }
-    let mut tm = match TemporalMemoryGpu::new(dev.clone(), n_columns, cells_per_col) {
-        Ok(tm) => tm,
-        Err(e) => {
-            eprintln!("[cluster_sync_smoke_test] TemporalMemoryGpu::new failed ({e:?}) — skipping");
-            return;
-        }
-    };
-    tm.reset().expect("tm reset");
-    let mut fused_st: FusedState = match FusedState::new(
-        dev.clone(),
-        n_columns,
-        cells_per_col,
-        sp.initial_threshold_estimate(),
-    ) {
-        Ok(f) => f,
-        Err(e) => {
-            eprintln!("[cluster_sync_smoke_test] FusedState::new failed ({e:?}) — skipping");
-            return;
-        }
-    };
-    fused_st.reset().expect("fused reset");
-    // Build T=4 timesteps of all-zero input SDRs.
-    let t = 4usize;
-    let inputs_flat = vec![0u8; t * input_bits];
-    let inputs_dev: CudaSlice<u8> = dev.htod_sync_copy(&inputs_flat).expect("htod inputs");
-    let mut cols_dev  = dev.alloc_zeros::<u8>(t * n_columns).expect("alloc cols");
-    let mut anom_dev  = dev.alloc_zeros::<f32>(t).expect("alloc anom");
-    // No timeout guard is implemented here: the launch runs inline on the test
-    // thread, and a live CUDA kernel cannot be cancelled from Rust anyway.
-    // If the kernel deadlocks, this call (or the synchronize after it) never
-    // returns and the test binary hangs.
-    //
-    // Catching that therefore relies on an external timeout, e.g. the CI job
-    // limit, killing the hung process.
-    // NOTE(review): the original comment gave sharing CUDA contexts across
-    // threads as the reason for not using a watchdog thread — confirm before
-    // adding one.
-    launch_fused(
-        &mut sp,
-        &mut tm,
-        &mut fused_st,
-        &inputs_dev,
-        &mut cols_dev,
-        &mut anom_dev,
-        t,
-        input_bits,
-        false, // learn=false for determinism
-    ).expect("launch_fused (cluster_sync_smoke_test): deadlock or CUDA error");
-    dev.synchronize().expect("device sync after launch_fused");
-    // --- Correctness assertions ---
-    let cols_host: Vec<u8>  = dev.dtoh_sync_copy(&cols_dev).expect("d2h cols");
-    let anom_host: Vec<f32> = dev.dtoh_sync_copy(&anom_dev).expect("d2h anom");
-    // Output buffers must be exactly the right size.
-    assert_eq!(cols_host.len(), t * n_columns, "cols buffer size mismatch");
-    assert_eq!(anom_host.len(), t,             "anom buffer size mismatch");
-    // Anomaly scores must be finite (NaN/Inf indicates numerical blow-up).
-    for (i, &a) in anom_host.iter().enumerate() {
-        assert!(a.is_finite(), "anomaly[{i}] is not finite: {a}");
-        assert!(a >= 0.0 && a <= 1.0, "anomaly[{i}] out of [0,1]: {a}");
-    }
-    // Active-column count per step. NOTE(review): each slice has exactly
-    // n_columns entries, so `n_on <= n_columns` can never fail; the slicing
-    // itself only panics if the buffer is shorter than t * n_columns.
-    for ti in 0..t {
-        let n_on = cols_host[ti * n_columns..(ti + 1) * n_columns]
-            .iter()
-            .filter(|&&b| b != 0)
-            .count();
-        assert!(
-            n_on <= n_columns,
-            "step {ti}: active columns {n_on} > n_columns {n_columns} (buffer overrun?)"
-        );
-    }
-    eprintln!(
-        "[cluster_sync_smoke_test] PASSED: T={t}, n_cols={n_columns}, \
-         input_bits={input_bits}, cooperative_supported={cooperative_ok}, \
-         anom={anom_host:?}"
-    );
-}
-/// Parity check: the CAI zero-copy path (`step_many_cuda`) must produce
-/// the same outputs as the numpy H2D/D2H path (`step_batch_with_tm`),
-/// since the kernel pipeline is the same — only the I/O wrapping changes.
-/// We skip the PyO3 CAI dict plumbing here and test the underlying
-/// ManuallyDrop + upgrade_device_ptr pattern directly.
-#[test]
-fn gpu_cuda_vs_numpy_parity() {
-    use std::mem::ManuallyDrop;
-    let cfg = SpatialPoolerConfig::default();
-    let bits = cfg.input_bits;
-    let n_cols = cfg.n_columns;
-    let cells_per_col = 32usize;
-    // Build two identical (SP, TM) pairs from the same seed.
-    let build = || -> (SpatialPoolerGpu, TemporalMemoryGpu) {
-        let cpu_ref = SpatialPooler::new(SpatialPoolerConfig::default(), 271828);
-        let sp = SpatialPoolerGpu::from_cpu(&cpu_ref).expect("gpu init");
-        let dev = sp.dev_ref().clone();
-        let mut tm = TemporalMemoryGpu::new(dev, n_cols, cells_per_col).expect("tm init");
-        tm.reset().expect("tm reset");
-        (sp, tm)
-    };
-    // Deterministic SDR sequence.
-    let mut rng = Xoshiro256PlusPlus::seed_from_u64(31337);
-    let t = 32usize;
-    let mut inputs_flat = vec![0u8; t * bits];
-    for i in 0..t {
-        let sdr = make_sdr(&mut rng, bits, 0.02);
-        inputs_flat[i * bits..(i + 1) * bits].copy_from_slice(&sdr);
-    }
-    // ---- Path A: owned CudaSlice (numpy-equivalent path) ----
-    let (mut sp_a, mut tm_a) = build();
-    let dev_a = sp_a.dev_ref().clone();
-    let inputs_a: CudaSlice<u8> = dev_a.htod_sync_copy(&inputs_flat).expect("htod");
-    let mut cols_a = dev_a.alloc_zeros::<u8>(t * n_cols).expect("alloc cols_a");
-    let mut anom_a = dev_a.alloc_zeros::<f32>(t).expect("alloc anom_a");
-    sp_a.step_batch_with_tm(&inputs_a, t, bits, false, &mut cols_a, &mut anom_a, &mut tm_a)
-        .expect("owned step_batch_with_tm");
-    dev_a.synchronize().expect("sync a");
-    let cols_a_host: Vec<u8> = dev_a.dtoh_sync_copy(&cols_a).expect("d2h cols_a");
-    let anom_a_host: Vec<f32> = dev_a.dtoh_sync_copy(&anom_a).expect("d2h anom_a");
-    // ---- Path B: borrowed device pointers via upgrade_device_ptr ----
-    // We allocate owned CudaSlices through the second pair's device handle, then
-    // take their raw ptrs and re-wrap as ManuallyDrop borrowed views — mimicking
-    // what `step_many_cuda` does with torch-owned CUDA memory.
-    let (mut sp_b, mut tm_b) = build();
-    let dev_b = sp_b.dev_ref().clone();
-    let inputs_b_owned: CudaSlice<u8> = dev_b.htod_sync_copy(&inputs_flat).expect("htod");
-    let cols_b_owned = dev_b.alloc_zeros::<u8>(t * n_cols).expect("alloc cols_b");
-    let anom_b_owned = dev_b.alloc_zeros::<f32>(t).expect("alloc anom_b");
-    // Extract raw CUdeviceptrs (and leak the owners so their Drop doesn't free).
-    let inputs_ptr = inputs_b_owned.leak();
-    let cols_ptr = cols_b_owned.leak();
-    let anom_ptr = anom_b_owned.leak();
-    // Re-wrap as borrowed views; ManuallyDrop keeps these views from freeing the memory.
-    let inputs_b = ManuallyDrop::new(unsafe { dev_b.upgrade_device_ptr::<u8>(inputs_ptr, t * bits) });
-    let mut cols_b = ManuallyDrop::new(unsafe { dev_b.upgrade_device_ptr::<u8>(cols_ptr, t * n_cols) });
-    let mut anom_b = ManuallyDrop::new(unsafe { dev_b.upgrade_device_ptr::<f32>(anom_ptr, t) });
-    sp_b.step_batch_with_tm(&inputs_b, t, bits, false, &mut cols_b, &mut anom_b, &mut tm_b)
-        .expect("borrowed step_batch_with_tm");
-    dev_b.synchronize().expect("sync b");
-    // `ManuallyDrop` doesn't auto-coerce to `&CudaSlice<T>` for the DevicePtr
-    // trait bound on `dtoh_sync_copy`; explicit deref.
-    let cols_b_host: Vec<u8> = dev_b.dtoh_sync_copy(&*cols_b).expect("d2h cols_b");
-    let anom_b_host: Vec<f32> = dev_b.dtoh_sync_copy(&*anom_b).expect("d2h anom_b");
-    // Re-own so Drop actually frees (we leaked above); a panic before this point leaves the buffers leaked.
-    let _inputs_owned_again = unsafe { dev_b.upgrade_device_ptr::<u8>(inputs_ptr, t * bits) };
-    let _cols_owned_again = unsafe { dev_b.upgrade_device_ptr::<u8>(cols_ptr, t * n_cols) };
-    let _anom_owned_again = unsafe { dev_b.upgrade_device_ptr::<f32>(anom_ptr, t) };
-    assert_eq!(cols_a_host, cols_b_host, "active-column mask diverges between numpy and CAI paths");
-    assert_eq!(anom_a_host.len(), anom_b_host.len());
-    for (i, (a, b)) in anom_a_host.iter().zip(anom_b_host.iter()).enumerate() {
-        // Anomaly is a pure division of integer counts — identical values expected; 1e-7 is only a safety margin.
-        assert!((a - b).abs() < 1e-7, "anomaly mismatch at step {i}: a={a} b={b}");
-    }
-}
-/// Fused kernel: after a learning warmup, threshold-based column activation
-/// should sit in the neighbourhood of the configured sparsity.
-///
-/// The test warms up on 1000 random SDRs, then runs 200 more steps (still with
-/// learn=true) and requires the mean number of active columns per step to fall
-/// within [0.25x, 4x] of `sparsity * n_columns`. The band is deliberately wide:
-/// it only guards against the threshold loop collapsing to zero activity or
-/// saturating to all-active.
-#[test]
-fn gpu_threshold_converges_to_sparsity() {
-    let cfg = SpatialPoolerConfig::default();
-    let (bits, n_cols, target) = (cfg.input_bits, cfg.n_columns, cfg.sparsity);
-    let cells_per_col = 32usize;
-    let cpu_ref = SpatialPooler::new(SpatialPoolerConfig::default(), 111);
-    let mut sp = SpatialPoolerGpu::from_cpu(&cpu_ref).expect("gpu sp init");
-    let dev = sp.dev_ref().clone();
-    let mut tm = TemporalMemoryGpu::new(dev.clone(), n_cols, cells_per_col).expect("tm init");
-    let initial_threshold = sp.initial_threshold_estimate();
-    let mut fused = FusedState::new(dev.clone(), n_cols, cells_per_col, initial_threshold)
-        .expect("fused init");
-    tm.reset().expect("tm reset");
-    fused.reset().expect("fused reset");
-    // One RNG stream feeds both passes; each call appends `steps` fresh SDRs.
-    let mut rng = Xoshiro256PlusPlus::seed_from_u64(31337);
-    let mut random_steps = |steps: usize| -> Vec<u8> {
-        let mut flat = vec![0u8; steps * bits];
-        for step in 0..steps {
-            let sdr = make_sdr(&mut rng, bits, 0.02);
-            flat[step * bits..(step + 1) * bits].copy_from_slice(&sdr);
-        }
-        flat
-    };
-    // Warmup pass: 1000 steps with learning enabled.
-    let t_warm = 1000usize;
-    let inputs = random_steps(t_warm);
-    let inputs_dev: CudaSlice<u8> = dev.htod_sync_copy(&inputs).expect("htod");
-    let mut cols_dev = dev.alloc_zeros::<u8>(t_warm * n_cols).expect("alloc cols");
-    let mut anom_dev = dev.alloc_zeros::<f32>(t_warm).expect("alloc anom");
-    launch_fused(
-        &mut sp,
-        &mut tm,
-        &mut fused,
-        &inputs_dev,
-        &mut cols_dev,
-        &mut anom_dev,
-        t_warm,
-        bits,
-        true,
-    ).expect("warmup launch");
-    dev.synchronize().expect("sync");
-    // Measurement pass: 200 further steps whose column outputs are inspected.
-    let t_meas = 200usize;
-    let meas_inputs = random_steps(t_meas);
-    let meas_dev: CudaSlice<u8> = dev.htod_sync_copy(&meas_inputs).expect("htod meas");
-    let mut meas_cols = dev.alloc_zeros::<u8>(t_meas * n_cols).expect("alloc meas cols");
-    let mut meas_anom = dev.alloc_zeros::<f32>(t_meas).expect("alloc meas anom");
-    launch_fused(
-        &mut sp,
-        &mut tm,
-        &mut fused,
-        &meas_dev,
-        &mut meas_cols,
-        &mut meas_anom,
-        t_meas,
-        bits,
-        true,
-    ).expect("meas launch");
-    dev.synchronize().expect("sync meas");
-    let cols_host: Vec<u8> = dev.dtoh_sync_copy(&meas_cols).expect("d2h");
-    // Sum the per-step active-column counts, then average over the steps.
-    let active_total: f64 = (0..t_meas)
-        .map(|ti| {
-            cols_host[ti * n_cols..(ti + 1) * n_cols]
-                .iter()
-                .filter(|&&b| b != 0)
-                .count() as f64
-        })
-        .sum();
-    let mean_active: f64 = active_total / (t_meas as f64);
-    let target_active = target as f64 * n_cols as f64;
-    eprintln!(
-        "threshold-activation convergence: mean_active/step = {mean_active:.1} \
-         (target = {target_active:.1})"
-    );
-    // Wide acceptance band around the target count (inclusive on both ends).
-    let accepted = (0.25 * target_active)..=(4.0 * target_active);
-    assert!(
-        accepted.contains(&mean_active),
-        "mean active {mean_active:.1} outside [0.25x, 4x] of target {target_active:.1}"
-    );
-}
-/// Fused kernel: the TM should learn a repeating three-symbol sequence, so the
-/// anomaly averaged over the last 9 steps must be lower than over steps 3..12
-/// and must also drop below 0.5.
-#[test]
-fn gpu_fused_tm_anomaly_decays_on_repeating_sequence() {
-    let cfg = SpatialPoolerConfig::default();
-    let (bits, n_cols) = (cfg.input_bits, cfg.n_columns);
-    let cells_per_col = 32usize;
-    let cpu_ref = SpatialPooler::new(SpatialPoolerConfig::default(), 271);
-    let mut sp = SpatialPoolerGpu::from_cpu(&cpu_ref).expect("gpu sp init");
-    let dev = sp.dev_ref().clone();
-    let mut tm = TemporalMemoryGpu::new(dev.clone(), n_cols, cells_per_col).expect("tm init");
-    let initial_threshold = sp.initial_threshold_estimate();
-    let mut fused = FusedState::new(dev.clone(), n_cols, cells_per_col, initial_threshold)
-        .expect("fused init");
-    tm.reset().expect("tm reset");
-    fused.reset().expect("fused reset");
-    let mut rng = Xoshiro256PlusPlus::seed_from_u64(7);
-    // The three sequence symbols are drawn first, before any warmup SDR.
-    let symbol_a = make_sdr(&mut rng, bits, 0.02);
-    let symbol_b = make_sdr(&mut rng, bits, 0.02);
-    let symbol_c = make_sdr(&mut rng, bits, 0.02);
-    let seqs = [symbol_a, symbol_b, symbol_c];
-    // Warmup pass: 300 random SDRs with learning on, before the sequence starts.
-    let warm = 300usize;
-    let mut warm_inputs = vec![0u8; warm * bits];
-    for step in 0..warm {
-        let sdr = make_sdr(&mut rng, bits, 0.02);
-        warm_inputs[step * bits..(step + 1) * bits].copy_from_slice(&sdr);
-    }
-    let warm_dev: CudaSlice<u8> = dev.htod_sync_copy(&warm_inputs).expect("htod warm");
-    let mut warm_cols = dev.alloc_zeros::<u8>(warm * n_cols).expect("alloc warm cols");
-    let mut warm_anom = dev.alloc_zeros::<f32>(warm).expect("alloc warm anom");
-    launch_fused(
-        &mut sp,
-        &mut tm,
-        &mut fused,
-        &warm_dev,
-        &mut warm_cols,
-        &mut warm_anom,
-        warm,
-        bits,
-        true,
-    ).expect("warm launch");
-    dev.synchronize().expect("sync warm");
-    // Sequence pass: A,B,C repeated 100 times, i.e. step `s` carries symbol `s % 3`.
-    let repeats = 100usize;
-    let t = repeats * 3;
-    let mut inputs = vec![0u8; t * bits];
-    for step in 0..t {
-        let off = step * bits;
-        inputs[off..off + bits].copy_from_slice(&seqs[step % 3]);
-    }
-    let inputs_dev: CudaSlice<u8> = dev.htod_sync_copy(&inputs).expect("htod rep");
-    let mut cols_dev = dev.alloc_zeros::<u8>(t * n_cols).expect("alloc rep cols");
-    let mut anom_dev = dev.alloc_zeros::<f32>(t).expect("alloc rep anom");
-    launch_fused(
-        &mut sp,
-        &mut tm,
-        &mut fused,
-        &inputs_dev,
-        &mut cols_dev,
-        &mut anom_dev,
-        t,
-        bits,
-        true,
-    ).expect("rep launch");
-    dev.synchronize().expect("sync rep");
-    let anom: Vec<f32> = dev.dtoh_sync_copy(&anom_dev).expect("d2h anom");
-    // Two 9-step windows: right after the first repetition, and at the very end.
-    let early_sum = anom[3..12].iter().sum::<f32>();
-    let late_sum = anom[(t - 9)..t].iter().sum::<f32>();
-    let early_avg: f32 = early_sum / 9.0;
-    let late_avg: f32 = late_sum / 9.0;
-    eprintln!("fused TM anomaly: early={early_avg:.3} late={late_avg:.3}");
-    assert!(
-        late_avg < early_avg,
-        "anomaly must decay: early={early_avg:.3} late={late_avg:.3}"
-    );
-    assert!(
-        late_avg < 0.5,
-        "late anomaly must be < 0.5 (got {late_avg:.3})"
-    );
-}
-/// The GPU spatial pooler must return exactly `round(sparsity * n_columns)`
-/// (at least one) winner indices per input, in strictly increasing order.
-#[test]
-fn gpu_sp_yields_k_winners() {
-    let cfg = SpatialPoolerConfig::default();
-    let (bits, n) = (cfg.input_bits, cfg.n_columns);
-    let expected_k = ((cfg.sparsity * n as f32).round() as usize).max(1);
-    let cpu = SpatialPooler::new(SpatialPoolerConfig::default(), 7);
-    let mut gpu = SpatialPoolerGpu::from_cpu(&cpu).expect("gpu init");
-    let mut rng = Xoshiro256PlusPlus::seed_from_u64(1);
-    for _ in 0..10 {
-        let sdr_u8 = make_sdr(&mut rng, bits, 0.02);
-        let winners = gpu.compute(&sdr_u8, false).expect("gpu compute");
-        assert_eq!(winners.len(), expected_k);
-        // Strictly increasing neighbours imply both sortedness and uniqueness.
-        assert!(
-            winners.windows(2).all(|pair| pair[0] < pair[1]),
-            "duplicate or out-of-order winner indices"
-        );
-    }
-}
-/// With cooperative launch available and no override, the plan for arguments
-/// (30, true, 30) uses 16 blocks and records the cooperative grid limit of 30.
-#[test]
-fn fused_launch_plan_uses_cooperative_grid_sync() {
-    let planned = plan_fused_launch(30, true, 30, None);
-    let plan = planned.expect("cooperative supported");
-    assert_eq!(plan.grid_dim_x, 16);
-    assert_eq!(plan.cooperative_grid_limit, 30);
-}
-/// On an H200-like device (132 SMs, cooperative grid limit 1000) the default
-/// plan is still capped at 16 blocks, and an explicit override of 64 raises
-/// the grid to 64 blocks.
-#[test]
-fn fused_launch_plan_scales_to_big_gpu() {
-    // No override: the default cap applies.
-    let default_plan = plan_fused_launch(132, true, 1000, None).expect("cooperative supported");
-    assert_eq!(default_plan.grid_dim_x, 16);
-    // Explicit override: the cap is raised to the requested value.
-    let raised_plan = plan_fused_launch(132, true, 1000, Some(64)).expect("cooperative supported");
-    assert_eq!(raised_plan.grid_dim_x, 64);
-}
-/// Planning must fail when cooperative launch is reported as unsupported, and
-/// the error text must mention "cooperative launch".
-#[test]
-fn fused_launch_plan_refuses_non_cooperative_devices() {
-    // The slow path was removed, so such devices are rejected up front.
-    let outcome = plan_fused_launch(30, false, 0, None);
-    let message = outcome.unwrap_err();
-    assert!(message.contains("cooperative launch"));
-}
-/// `FusedState::new` must honor the `HTM_FUSED_GRID_CAP` environment variable:
-/// with the cap set to 12, `grid_dim_x` must equal `min(sm_count, 12)`.
-///
-/// The variable is removed again *before* the construction result is
-/// unwrapped, so a failed init cannot leave the override set for the rest of
-/// the test process.
-#[test]
-fn fused_grid_cap_env_override_is_honored() {
-    let cfg = SpatialPoolerConfig::default();
-    let cpu_ref = SpatialPooler::new(SpatialPoolerConfig::default(), 5252);
-    let sp = SpatialPoolerGpu::from_cpu(&cpu_ref).expect("gpu sp init");
-    let dev = sp.dev_ref().clone();
-    unsafe { std::env::set_var("HTM_FUSED_GRID_CAP", "12"); }
-    let fused_result = FusedState::new(
-        dev.clone(),
-        cfg.n_columns,
-        32usize,
-        sp.initial_threshold_estimate(),
-    );
-    // Clean up first, then unwrap: a panic here must not leak the env override.
-    unsafe { std::env::remove_var("HTM_FUSED_GRID_CAP"); }
-    let fused = fused_result.expect("fused init");
-    // Falls back to 16 SMs when the attribute query fails.
-    let sm_count = match dev.attribute(
-        cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-    ) {
-        Ok(v) => v as u32,
-        Err(_) => 16u32,
-    };
-    let expected = sm_count.max(1).min(12);
-    assert_eq!(
-        fused.grid_dim_x,
-        expected,
-        "fused grid cap env override ignored: expected min(sm_count, 12) = {expected}, got {}",
-        fused.grid_dim_x,
-    );
-}

+//! Parity tests: GPU SP vs CPU SP reference.
+//!
+//! With matching seeds the two should produce bit-identical active-column sets
+//! when `learn=false`, and remain bit-identical over repeated `learn=true`
+//! steps because the Hebbian update is deterministic (no RNG once initialised).
+//!
+//! Run with:  cargo test --release --features gpu
+#![cfg(test)]
+#![cfg(feature = "gpu")]
+use crate::sp::{SpatialPooler, SpatialPoolerConfig};
+use crate::gpu::sp_gpu::SpatialPoolerGpu;
+use crate::gpu::tm_gpu::TemporalMemoryGpu;
+use crate::gpu::fused::{
+    launch_fused, plan_batched_grid_dim, plan_fused_launch, FusedState,
+};
+use cudarc::driver::CudaSlice;
+use rand::{Rng, SeedableRng};
+use rand_xoshiro::Xoshiro256PlusPlus;
+/// Build a random binary SDR of length `bits` with `sparsity * bits`
+/// (truncated, at least one) distinct positions set to 1.
+///
+/// The number of ON bits is clamped to `bits`, so a `sparsity` above 1.0
+/// yields an all-ones vector instead of looping forever in search of a free
+/// slot, and `bits == 0` yields an empty vector instead of sampling from an
+/// empty range. For every other input the RNG draw sequence is unchanged.
+fn make_sdr(rng: &mut Xoshiro256PlusPlus, bits: usize, sparsity: f32) -> Vec<u8> {
+    // The clamp guarantees the rejection loop below can terminate.
+    let on = ((sparsity * bits as f32) as usize).max(1).min(bits);
+    let mut v = vec![0u8; bits];
+    let mut placed = 0;
+    // Rejection sampling: redraw whenever the chosen position is already set.
+    while placed < on {
+        let i = rng.gen_range(0..bits);
+        if v[i] == 0 {
+            v[i] = 1;
+            placed += 1;
+        }
+    }
+    v
+}
+/// Inference-only parity: seeded identically, the GPU spatial pooler must
+/// return exactly the same active columns as the CPU reference on each step.
+#[test]
+fn gpu_sp_matches_cpu_no_learn() {
+    let seed = 1234;
+    let input_bits = SpatialPoolerConfig::default().input_bits;
+    // Two CPU poolers from one seed: the reference, and the GPU's initial state.
+    let mut reference = SpatialPooler::new(SpatialPoolerConfig::default(), seed);
+    let gpu_source = SpatialPooler::new(SpatialPoolerConfig::default(), seed);
+    let mut device_sp =
+        SpatialPoolerGpu::from_cpu(&gpu_source).expect("gpu init (CUDA device available)");
+    device_sp.set_strict_parity(true);
+    let mut sdr_rng = Xoshiro256PlusPlus::seed_from_u64(99);
+    for step in 0..20 {
+        let sdr_bytes = make_sdr(&mut sdr_rng, input_bits, 0.02);
+        // The CPU pooler takes bools, the GPU pooler takes the raw bytes.
+        let sdr_flags: Vec<bool> = sdr_bytes.iter().map(|&bit| bit != 0).collect();
+        let cpu_active: Vec<u32> = reference.compute(&sdr_flags, false);
+        let gpu_active: Vec<u32> = device_sp
+            .compute(&sdr_bytes, false)
+            .expect("gpu compute");
+        assert_eq!(
+            cpu_active,
+            gpu_active,
+            "mismatch at step {step}: len cpu={} gpu={}",
+            cpu_active.len(),
+            gpu_active.len()
+        );
+    }
+}
+/// Learning parity: with learning enabled on both sides, the GPU spatial
+/// pooler must keep matching the CPU reference across 50 consecutive steps.
+#[test]
+fn gpu_sp_matches_cpu_with_learn() {
+    let seed = 5678;
+    let input_bits = SpatialPoolerConfig::default().input_bits;
+    // Same seed twice: one pooler stays on the CPU, one initialises the GPU.
+    let mut reference = SpatialPooler::new(SpatialPoolerConfig::default(), seed);
+    let gpu_source = SpatialPooler::new(SpatialPoolerConfig::default(), seed);
+    let mut device_sp = SpatialPoolerGpu::from_cpu(&gpu_source).expect("gpu init");
+    device_sp.set_strict_parity(true);
+    let mut sdr_rng = Xoshiro256PlusPlus::seed_from_u64(42);
+    for step in 0..50 {
+        let sdr_bytes = make_sdr(&mut sdr_rng, input_bits, 0.02);
+        let sdr_flags: Vec<bool> = sdr_bytes.iter().map(|&bit| bit != 0).collect();
+        let cpu_active = reference.compute(&sdr_flags, true);
+        let gpu_active = device_sp
+            .compute(&sdr_bytes, true)
+            .expect("gpu compute");
+        assert_eq!(
+            cpu_active,
+            gpu_active,
+            "mismatch at step {step} with learning"
+        );
+    }
+}
+/// End-to-end GPU pipeline (spatial pooler feeding temporal memory): a
+/// repeating three-symbol SDR sequence should drive the anomaly score down.
+///
+/// The pipeline is assembled directly from `SpatialPoolerGpu` and
+/// `TemporalMemoryGpu` and driven through `step_batch_with_tm`.
+#[test]
+fn gpu_tm_anomaly_decays_on_repeating_sequence() {
+    let cfg = SpatialPoolerConfig::default();
+    let bits = cfg.input_bits;
+    let n_cols = cfg.n_columns;
+    let cells_per_col = 32usize;
+    let cpu_for_gpu = SpatialPooler::new(SpatialPoolerConfig::default(), 314);
+    let mut sp = SpatialPoolerGpu::from_cpu(&cpu_for_gpu).expect("gpu init");
+    let dev = sp.dev_ref().clone();
+    let mut tm = TemporalMemoryGpu::new(dev.clone(), n_cols, cells_per_col)
+        .expect("gpu tm init");
+    tm.reset().expect("tm reset");
+    // Build 3 fixed SDRs (A, B, C), fed below in a repeating sequence.
+    let mut rng = Xoshiro256PlusPlus::seed_from_u64(7);
+    let make = |rng: &mut Xoshiro256PlusPlus| make_sdr(rng, bits, 0.02);
+    let seqs = [make(&mut rng), make(&mut rng), make(&mut rng)];
+    // Warm up SP so columns are stable per symbol.
+    for _ in 0..100 {
+        for s in &seqs {
+            let _ = sp.compute(s, true).expect("sp compute");
+        }
+    }
+    // Build a long input buffer: 100 repetitions of [A,B,C] = 300 steps.
+    let repeats = 100usize;
+    let t = repeats * 3;
+    let mut inputs_flat = vec![0u8; t * bits];
+    for r in 0..repeats {
+        for (i, s) in seqs.iter().enumerate() {
+            let off = (r * 3 + i) * bits;
+            inputs_flat[off..off + bits].copy_from_slice(s);
+        }
+    }
+    let inputs_dev: CudaSlice<u8> = dev.htod_sync_copy(&inputs_flat).expect("htod");
+    let mut cols_dev = dev.alloc_zeros::<u8>(t * n_cols).expect("alloc cols");
+    let mut anom_dev = dev.alloc_zeros::<f32>(t).expect("alloc anom");
+    sp.step_batch_with_tm(
+        &inputs_dev,
+        t,
+        bits,
+        true,
+        &mut cols_dev,
+        &mut anom_dev,
+        &mut tm,
+    ).expect("step_batch_with_tm");
+    let anom: Vec<f32> = dev.dtoh_sync_copy(&anom_dev).expect("d2h anom");
+    let cols: Vec<u8> = dev.dtoh_sync_copy(&cols_dev).expect("d2h cols");
+    // Active column count per step must equal k for every step.
+    let k = ((cfg.sparsity * n_cols as f32).round() as usize).max(1);
+    for ti in 0..t {
+        let step_slice = &cols[ti * n_cols..(ti + 1) * n_cols];
+        let n_on = step_slice.iter().filter(|&&b| b != 0).count();
+        assert_eq!(n_on, k, "step {ti} has {n_on} active cols, expected {k}");
+    }
+    let mean = |window: &[f32]| window.iter().sum::<f32>() / window.len() as f32;
+    // Early window: steps 3..9, i.e. the second and third passes over [A,B,C]
+    // (the very first pass, steps 0..3, is skipped).
+    let early_avg: f32 = mean(&anom[3..9]);
+    // Late window: the final 9 steps (last three passes); should be lower.
+    let late_avg: f32 = mean(&anom[(t - 9)..t]);
+    eprintln!("gpu tm: early anomaly = {early_avg:.3}, late = {late_avg:.3}");
+    assert!(
+        late_avg < early_avg,
+        "GPU TM should reduce anomaly on repeating sequence: early={early_avg:.3}, late={late_avg:.3}"
+    );
+}
+/// Cluster-sync smoke test: verifies that the fused megakernel (which relies on
+/// hardware `cluster::sync()` / grid-barrier on H100/H200 Hopper) completes
+/// without deadlock when called with real HTM state, and that output shapes are
+/// sane (no NaN / Inf in anomaly scores, active-column count in plausible range).
+///
+/// This is an *integration* test, not a synthetic micro-benchmark: it exercises
+/// exactly the same `launch_fused` code path used in production, so any
+/// deadlock in the cooperative-grid or DLB barrier would surface here.
+///
+/// Returns early (with an eprintln) when GPU, TM or FusedState init fails — the
+/// test then counts as passed, so CI stays green on machines without a GPU.
+#[test]
+fn cluster_sync_smoke_test() {
+    // Build a tiny HTM region (1024 inputs, 256 columns, 4 cells/column).
+    // This keeps VRAM usage minimal while still exercising all kernel paths.
+    let input_bits  = 1024usize;
+    let n_columns   = 256usize;
+    let cells_per_col = 4usize;
+    // Cooperative-launch support is probed further down, once a device exists.
+    // CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH = 223 (added in CUDA 11.8 for Hopper).
+    // cudarc exposes raw attribute querying; we check cooperative launch (98)
+    // as the guard — cluster launch is a superset and not separately probed
+    // here since cudarc doesn't expose attribute 223 symbolically yet.
+    // On pre-Hopper hardware the DLB barrier path is used instead and the
+    // test still validates no deadlock on that path.
+    let make_cfg = || SpatialPoolerConfig {
+        input_bits,
+        n_columns,
+        sparsity: 0.04,  // ~10 active cols out of 256
+        ..SpatialPoolerConfig::default()
+    };
+    let cpu_ref = SpatialPooler::new(make_cfg(), 42);
+    let mut sp = match SpatialPoolerGpu::from_cpu(&cpu_ref) {
+        Ok(sp) => sp,
+        Err(e) => {
+            eprintln!("[cluster_sync_smoke_test] No GPU available ({e:?}) — skipping");
+            return;
+        }
+    };
+    let dev = sp.dev_ref().clone();
+    // Query cooperative launch support; the result is only logged, never gated on.
+    let cooperative_ok = matches!(
+        dev.attribute(cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH),
+        Ok(v) if v > 0
+    );
+    if !cooperative_ok {
+        eprintln!("[cluster_sync_smoke_test] CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH=0 — DLB path only, still running test");
+        // NOTE(review): plan_fused_launch rejects non-cooperative devices — confirm a DLB fallback still exists.
+    }
+    let mut tm = match TemporalMemoryGpu::new(dev.clone(), n_columns, cells_per_col) {
+        Ok(tm) => tm,
+        Err(e) => {
+            eprintln!("[cluster_sync_smoke_test] TemporalMemoryGpu::new failed ({e:?}) — skipping");
+            return;
+        }
+    };
+    tm.reset().expect("tm reset");
+    let mut fused_st: FusedState = match FusedState::new(
+        dev.clone(),
+        n_columns,
+        cells_per_col,
+        sp.initial_threshold_estimate(),
+    ) {
+        Ok(f) => f,
+        Err(e) => {
+            eprintln!("[cluster_sync_smoke_test] FusedState::new failed ({e:?}) — skipping");
+            return;
+        }
+    };
+    fused_st.reset().expect("fused reset");
+    // Build T=4 timesteps of all-zero input SDRs.
+    let t = 4usize;
+    let inputs_flat = vec![0u8; t * input_bits];
+    let inputs_dev: CudaSlice<u8> = dev.htod_sync_copy(&inputs_flat).expect("htod inputs");
+    let mut cols_dev  = dev.alloc_zeros::<u8>(t * n_columns).expect("alloc cols");
+    let mut anom_dev  = dev.alloc_zeros::<f32>(t).expect("alloc anom");
+    // No timeout guard is implemented here: the launch runs inline on the
+    // test thread. A live CUDA kernel cannot be cancelled from Rust, so if
+    // the kernel deadlocks this call (or the synchronize below) never
+    // returns and the test binary hangs.
+    //
+    // The launch is kept inline (not on a separate thread) because CUDA contexts
+    // are not safely shareable across threads without explicit multi-threading
+    // setup. Detection of a deadlock therefore relies on an external timeout:
+    // the CI job limit (typically 5 min) kills the hung binary.
+    // For local dev, the deadlock would be immediately obvious.
+    launch_fused(
+        &mut sp,
+        &mut tm,
+        &mut fused_st,
+        &inputs_dev,
+        &mut cols_dev,
+        &mut anom_dev,
+        t,
+        input_bits,
+        false, // learn=false for determinism
+    ).expect("launch_fused (cluster_sync_smoke_test): deadlock or CUDA error");
+    dev.synchronize().expect("device sync after launch_fused");
+    // --- Correctness assertions ---
+    let cols_host: Vec<u8>  = dev.dtoh_sync_copy(&cols_dev).expect("d2h cols");
+    let anom_host: Vec<f32> = dev.dtoh_sync_copy(&anom_dev).expect("d2h anom");
+    // Output buffers must be exactly the right size.
+    assert_eq!(cols_host.len(), t * n_columns, "cols buffer size mismatch");
+    assert_eq!(anom_host.len(), t,             "anom buffer size mismatch");
+    // Anomaly scores must be finite (NaN/Inf indicates numerical blow-up).
+    for (i, &a) in anom_host.iter().enumerate() {
+        assert!(a.is_finite(), "anomaly[{i}] is not finite: {a}");
+        assert!(a >= 0.0 && a <= 1.0, "anomaly[{i}] out of [0,1]: {a}");
+    }
+    // Active-column count per step. NOTE(review): `n_on` counts bytes of an
+    // `n_columns`-long slice, so `n_on <= n_columns` always holds; this loop
+    // only proves the slicing is in bounds — consider a stronger check.
+    for ti in 0..t {
+        let n_on = cols_host[ti * n_columns..(ti + 1) * n_columns]
+            .iter()
+            .filter(|&&b| b != 0)
+            .count();
+        assert!(
+            n_on <= n_columns,
+            "step {ti}: active columns {n_on} > n_columns {n_columns} (buffer overrun?)"
+        );
+    }
+    eprintln!(
+        "[cluster_sync_smoke_test] PASSED: T={t}, n_cols={n_columns}, \
+         input_bits={input_bits}, cooperative_supported={cooperative_ok}, \
+         anom={anom_host:?}"
+    );
+}
+/// Parity check: the CAI zero-copy path (`step_many_cuda`) must produce
+/// bit-identical outputs to the numpy H2D/D2H path (`step_batch_with_tm`),
+/// since the kernel pipeline is the same — only the I/O wrapping changes.
+/// We skip the PyO3 CAI dict plumbing here and test the underlying
+/// ManuallyDrop + upgrade_device_ptr pattern directly.
+#[test]
+fn gpu_cuda_vs_numpy_parity() {
+    use std::mem::ManuallyDrop;
+    let cfg = SpatialPoolerConfig::default();
+    let bits = cfg.input_bits;
+    let n_cols = cfg.n_columns;
+    let cells_per_col = 32usize;
+    // Build identical (SP, TM) pairs: each call re-seeds from 271828.
+    let build = || -> (SpatialPoolerGpu, TemporalMemoryGpu) {
+        let cpu_ref = SpatialPooler::new(SpatialPoolerConfig::default(), 271828);
+        let sp = SpatialPoolerGpu::from_cpu(&cpu_ref).expect("gpu init");
+        let dev = sp.dev_ref().clone();
+        let mut tm = TemporalMemoryGpu::new(dev, n_cols, cells_per_col).expect("tm init");
+        tm.reset().expect("tm reset");
+        (sp, tm)
+    };
+    // Deterministic SDR sequence (fixed RNG seed), shared by both paths.
+    let mut rng = Xoshiro256PlusPlus::seed_from_u64(31337);
+    let t = 32usize;
+    let mut inputs_flat = vec![0u8; t * bits];
+    for i in 0..t {
+        let sdr = make_sdr(&mut rng, bits, 0.02);
+        inputs_flat[i * bits..(i + 1) * bits].copy_from_slice(&sdr);
+    }
+    // ---- Path A: owned CudaSlice (numpy-equivalent path) ----
+    let (mut sp_a, mut tm_a) = build();
+    let dev_a = sp_a.dev_ref().clone();
+    let inputs_a: CudaSlice<u8> = dev_a.htod_sync_copy(&inputs_flat).expect("htod");
+    let mut cols_a = dev_a.alloc_zeros::<u8>(t * n_cols).expect("alloc cols_a");
+    let mut anom_a = dev_a.alloc_zeros::<f32>(t).expect("alloc anom_a");
+    sp_a.step_batch_with_tm(&inputs_a, t, bits, false, &mut cols_a, &mut anom_a, &mut tm_a)
+        .expect("owned step_batch_with_tm");
+    dev_a.synchronize().expect("sync a");
+    let cols_a_host: Vec<u8> = dev_a.dtoh_sync_copy(&cols_a).expect("d2h cols_a");
+    let anom_a_host: Vec<f32> = dev_a.dtoh_sync_copy(&anom_a).expect("d2h anom_a");
+    // ---- Path B: borrowed device pointers via upgrade_device_ptr ----
+    // We allocate fresh owned CudaSlices on sp_b's device, then take their
+    // raw ptrs and re-wrap as ManuallyDrop borrowed views — mimicking what
+    // `step_many_cuda` does with torch-owned CUDA memory.
+    let (mut sp_b, mut tm_b) = build();
+    let dev_b = sp_b.dev_ref().clone();
+    let inputs_b_owned: CudaSlice<u8> = dev_b.htod_sync_copy(&inputs_flat).expect("htod");
+    let cols_b_owned = dev_b.alloc_zeros::<u8>(t * n_cols).expect("alloc cols_b");
+    let anom_b_owned = dev_b.alloc_zeros::<f32>(t).expect("alloc anom_b");
+    // Extract raw CUdeviceptrs (and leak the owners so their Drop doesn't free).
+    let inputs_ptr = inputs_b_owned.leak();
+    let cols_ptr = cols_b_owned.leak();
+    let anom_ptr = anom_b_owned.leak();
+    // Re-wrap as borrowed views; ManuallyDrop keeps these wrappers from freeing.
+    let inputs_b = ManuallyDrop::new(unsafe { dev_b.upgrade_device_ptr::<u8>(inputs_ptr, t * bits) });
+    let mut cols_b = ManuallyDrop::new(unsafe { dev_b.upgrade_device_ptr::<u8>(cols_ptr, t * n_cols) });
+    let mut anom_b = ManuallyDrop::new(unsafe { dev_b.upgrade_device_ptr::<f32>(anom_ptr, t) });
+    sp_b.step_batch_with_tm(&inputs_b, t, bits, false, &mut cols_b, &mut anom_b, &mut tm_b)
+        .expect("borrowed step_batch_with_tm");
+    dev_b.synchronize().expect("sync b");
+    // `ManuallyDrop` doesn't auto-coerce to `&CudaSlice<T>` for the DevicePtr
+    // trait bound on `dtoh_sync_copy`; explicit deref.
+    let cols_b_host: Vec<u8> = dev_b.dtoh_sync_copy(&*cols_b).expect("d2h cols_b");
+    let anom_b_host: Vec<f32> = dev_b.dtoh_sync_copy(&*anom_b).expect("d2h anom_b");
+    // Re-own before the assertions so Drop frees the buffers even if they fail.
+    let _inputs_owned_again = unsafe { dev_b.upgrade_device_ptr::<u8>(inputs_ptr, t * bits) };
+    let _cols_owned_again = unsafe { dev_b.upgrade_device_ptr::<u8>(cols_ptr, t * n_cols) };
+    let _anom_owned_again = unsafe { dev_b.upgrade_device_ptr::<f32>(anom_ptr, t) };
+    assert_eq!(cols_a_host, cols_b_host, "active-column mask diverges between numpy and CAI paths");
+    assert_eq!(anom_a_host.len(), anom_b_host.len());
+    for (i, (a, b)) in anom_a_host.iter().zip(anom_b_host.iter()).enumerate() {
+        // NOTE(review): header says bit-identical, but this allows a 1e-7 absolute tolerance.
+        assert!((a - b).abs() < 1e-7, "anomaly mismatch at step {i}: a={a} b={b}");
+    }
+}
+/// Fused kernel: threshold-based activation should settle near the configured
+/// target sparsity after a warmup run.
+///
+/// Procedure: 1000 learning steps on random 2%-sparse SDRs (warmup), then a
+/// further 200 learning steps over which the mean number of active columns
+/// per step is measured. Acceptance: that mean lies within
+/// [0.25 * target, 4 * target], where target = `cfg.sparsity * n_columns`.
+/// The band is intentionally wide — the test checks that the threshold loop
+/// has not collapsed to zero or to all-active, not that it matches tightly.
+#[test]
+fn gpu_threshold_converges_to_sparsity() {
+    let cfg = SpatialPoolerConfig::default();
+    let bits = cfg.input_bits;
+    let n_cols = cfg.n_columns;
+    let cells_per_col = 32usize;
+    let target = cfg.sparsity;
+    let cpu_ref = SpatialPooler::new(SpatialPoolerConfig::default(), 111);
+    let mut sp = SpatialPoolerGpu::from_cpu(&cpu_ref).expect("gpu sp init");
+    let dev = sp.dev_ref().clone();
+    let mut tm = TemporalMemoryGpu::new(dev.clone(), n_cols, cells_per_col).expect("tm init");
+    let mut fused = FusedState::new(
+        dev.clone(),
+        n_cols,
+        cells_per_col,
+        sp.initial_threshold_estimate(),
+    ).expect("fused init");
+    tm.reset().expect("tm reset");
+    fused.reset().expect("fused reset");
+    // Warmup: 1000 random 2%-sparse SDRs, learning enabled.
+    let mut rng = Xoshiro256PlusPlus::seed_from_u64(31337);
+    let t_warm = 1000usize;
+    let mut inputs = vec![0u8; t_warm * bits];
+    for ti in 0..t_warm {
+        let sdr = make_sdr(&mut rng, bits, 0.02);
+        inputs[ti*bits..(ti+1)*bits].copy_from_slice(&sdr);
+    }
+    let inputs_dev: CudaSlice<u8> = dev.htod_sync_copy(&inputs).expect("htod");
+    let mut cols_dev = dev.alloc_zeros::<u8>(t_warm * n_cols).expect("alloc cols");
+    let mut anom_dev = dev.alloc_zeros::<f32>(t_warm).expect("alloc anom");
+    launch_fused(
+        &mut sp, &mut tm, &mut fused,
+        &inputs_dev, &mut cols_dev, &mut anom_dev,
+        t_warm, bits, true,
+    ).expect("warmup launch");
+    dev.synchronize().expect("sync");
+    // Measurement pass: another 200 steps (same RNG stream, learning still on).
+    let t_meas = 200usize;
+    let mut meas_inputs = vec![0u8; t_meas * bits];
+    for ti in 0..t_meas {
+        let sdr = make_sdr(&mut rng, bits, 0.02);
+        meas_inputs[ti*bits..(ti+1)*bits].copy_from_slice(&sdr);
+    }
+    let meas_dev: CudaSlice<u8> = dev.htod_sync_copy(&meas_inputs).expect("htod meas");
+    let mut meas_cols = dev.alloc_zeros::<u8>(t_meas * n_cols).expect("alloc meas cols");
+    let mut meas_anom = dev.alloc_zeros::<f32>(t_meas).expect("alloc meas anom");
+    launch_fused(
+        &mut sp, &mut tm, &mut fused,
+        &meas_dev, &mut meas_cols, &mut meas_anom,
+        t_meas, bits, true,
+    ).expect("meas launch");
+    dev.synchronize().expect("sync meas");
+    let cols_host: Vec<u8> = dev.dtoh_sync_copy(&meas_cols).expect("d2h");
+    // Total non-zero bytes over all measured steps; integer sum, so the mean
+    // below is exact up to the final division.
+    let total_active: usize = (0..t_meas)
+        .map(|ti| {
+            cols_host[ti*n_cols..(ti+1)*n_cols]
+                .iter()
+                .filter(|&&b| b != 0)
+                .count()
+        })
+        .sum();
+    let mean_active: f64 = total_active as f64 / (t_meas as f64);
+    let target_active = target as f64 * n_cols as f64;
+    eprintln!(
+        "threshold-activation convergence: mean_active/step = {mean_active:.1} \
+         (target = {target_active:.1})"
+    );
+    // Very generous band — we just want to confirm the threshold loop is
+    // functioning (not diverged to 0 or to all-active).
+    assert!(
+        mean_active >= 0.25 * target_active && mean_active <= 4.0 * target_active,
+        "mean active {mean_active:.1} outside [0.25x, 4x] of target {target_active:.1}"
+    );
+}
+/// Fused kernel: after a random-input warmup, feeding a repeating A,B,C
+/// sequence must make the anomaly score fall — the late average has to be
+/// below both the early average and 0.5.
+#[test]
+fn gpu_fused_tm_anomaly_decays_on_repeating_sequence() {
+    let defaults = SpatialPoolerConfig::default();
+    let (bits, n_cols) = (defaults.input_bits, defaults.n_columns);
+    let cells_per_col = 32usize;
+    let seed_sp = SpatialPooler::new(SpatialPoolerConfig::default(), 271);
+    let mut sp = SpatialPoolerGpu::from_cpu(&seed_sp).expect("gpu sp init");
+    let dev = sp.dev_ref().clone();
+    let mut tm = TemporalMemoryGpu::new(dev.clone(), n_cols, cells_per_col).expect("tm init");
+    let threshold_guess = sp.initial_threshold_estimate();
+    let mut fused =
+        FusedState::new(dev.clone(), n_cols, cells_per_col, threshold_guess).expect("fused init");
+    tm.reset().expect("tm reset");
+    fused.reset().expect("fused reset");
+    // The three symbols are drawn first, then the warmup SDRs, from one RNG.
+    let mut rng = Xoshiro256PlusPlus::seed_from_u64(7);
+    let symbols = [
+        make_sdr(&mut rng, bits, 0.02),
+        make_sdr(&mut rng, bits, 0.02),
+        make_sdr(&mut rng, bits, 0.02),
+    ];
+    // Phase 1: 300 random SDRs to let the SP thresholds calibrate.
+    let warm = 300usize;
+    let mut warm_inputs = vec![0u8; warm * bits];
+    for slot in warm_inputs.chunks_exact_mut(bits) {
+        slot.copy_from_slice(&make_sdr(&mut rng, bits, 0.02));
+    }
+    let warm_dev: CudaSlice<u8> = dev.htod_sync_copy(&warm_inputs).expect("htod warm");
+    let mut warm_cols = dev.alloc_zeros::<u8>(warm * n_cols).expect("alloc warm cols");
+    let mut warm_anom = dev.alloc_zeros::<f32>(warm).expect("alloc warm anom");
+    launch_fused(
+        &mut sp,
+        &mut tm,
+        &mut fused,
+        &warm_dev,
+        &mut warm_cols,
+        &mut warm_anom,
+        warm,
+        bits,
+        true,
+    )
+    .expect("warm launch");
+    dev.synchronize().expect("sync warm");
+    // Phase 2: 100 repetitions of A,B,C laid out back to back.
+    let repeats = 100usize;
+    let t = repeats * 3;
+    let mut inputs = vec![0u8; t * bits];
+    for (slot, symbol) in inputs.chunks_exact_mut(bits).zip(symbols.iter().cycle()) {
+        slot.copy_from_slice(symbol);
+    }
+    let inputs_dev: CudaSlice<u8> = dev.htod_sync_copy(&inputs).expect("htod rep");
+    let mut cols_dev = dev.alloc_zeros::<u8>(t * n_cols).expect("alloc rep cols");
+    let mut anom_dev = dev.alloc_zeros::<f32>(t).expect("alloc rep anom");
+    launch_fused(
+        &mut sp,
+        &mut tm,
+        &mut fused,
+        &inputs_dev,
+        &mut cols_dev,
+        &mut anom_dev,
+        t,
+        bits,
+        true,
+    )
+    .expect("rep launch");
+    dev.synchronize().expect("sync rep");
+    let anom: Vec<f32> = dev.dtoh_sync_copy(&anom_dev).expect("d2h anom");
+    // Compare steps 3..12 against the final nine steps.
+    let early_window = &anom[3..12];
+    let late_window = &anom[(t - 9)..t];
+    let early_avg: f32 = early_window.iter().sum::<f32>() / 9.0;
+    let late_avg: f32 = late_window.iter().sum::<f32>() / 9.0;
+    eprintln!("fused TM anomaly: early={early_avg:.3} late={late_avg:.3}");
+    assert!(
+        late_avg < early_avg,
+        "anomaly must decay: early={early_avg:.3} late={late_avg:.3}"
+    );
+    assert!(
+        late_avg < 0.5,
+        "late anomaly must be < 0.5 (got {late_avg:.3})"
+    );
+}
+/// The GPU spatial pooler must return exactly k = round(sparsity * n_columns)
+/// (at least 1) winners per step, as strictly increasing column indices.
+#[test]
+fn gpu_sp_yields_k_winners() {
+    let defaults = SpatialPoolerConfig::default();
+    let input_bits = defaults.input_bits;
+    let column_count = defaults.n_columns;
+    let expected_k = ((defaults.sparsity * column_count as f32).round() as usize).max(1);
+    let reference = SpatialPooler::new(SpatialPoolerConfig::default(), 7);
+    let mut device_sp = SpatialPoolerGpu::from_cpu(&reference).expect("gpu init");
+    let mut sdr_rng = Xoshiro256PlusPlus::seed_from_u64(1);
+    for _ in 0..10 {
+        let sdr_bytes = make_sdr(&mut sdr_rng, input_bits, 0.02);
+        let winners = device_sp.compute(&sdr_bytes, false).expect("gpu compute");
+        assert_eq!(winners.len(), expected_k);
+        // Strictly increasing neighbours imply sorted and duplicate-free.
+        winners.windows(2).for_each(|pair| {
+            assert!(pair[0] < pair[1], "duplicate or out-of-order winner indices");
+        });
+    }
+}
+/// Cooperative-capable device, no override: the plan uses a grid width of 16
+/// and carries the cooperative grid limit (30) it was given.
+#[test]
+fn fused_launch_plan_uses_cooperative_grid_sync() {
+    let launch = plan_fused_launch(30, true, 30, None)
+        .expect("cooperative supported");
+    assert_eq!(launch.grid_dim_x, 16);
+    assert_eq!(launch.cooperative_grid_limit, 30);
+}
+/// H200-like inputs (132 SMs, large cooperative limit): the default cap keeps
+/// the grid at 16, and an explicit override of 64 raises it to 64.
+#[test]
+fn fused_launch_plan_scales_to_big_gpu() {
+    // Without an override the default cap still applies.
+    let default_plan = plan_fused_launch(132, true, 1000, None)
+        .expect("cooperative supported");
+    assert_eq!(default_plan.grid_dim_x, 16);
+    // With an override the cap is lifted to the requested value.
+    let raised_plan = plan_fused_launch(132, true, 1000, Some(64))
+        .expect("cooperative supported");
+    assert_eq!(raised_plan.grid_dim_x, 64);
+}
+#[test]
+fn fused_launch_plan_refuses_non_cooperative_devices() {
+    // The slow path was removed. Devices without cooperative launch fail fast.
+    let err = plan_fused_launch(30, false, 0, None).unwrap_err();
+    assert!(err.contains("cooperative launch"));
+}
+/// `HTM_FUSED_GRID_CAP` must cap the fused kernel's grid width: with the
+/// variable set to 12, `FusedState::new` should pick min(sm_count, 12).
+///
+/// The variable is process-global, so its previous state is restored before
+/// the construction result is unwrapped — a failed init must not leak the
+/// override into tests that run afterwards. Tests running concurrently in
+/// the same process can still observe the override while it is set.
+#[test]
+fn fused_grid_cap_env_override_is_honored() {
+    const CAP_VAR: &str = "HTM_FUSED_GRID_CAP";
+    let cfg = SpatialPoolerConfig::default();
+    let cpu_ref = SpatialPooler::new(SpatialPoolerConfig::default(), 5252);
+    let sp = SpatialPoolerGpu::from_cpu(&cpu_ref).expect("gpu sp init");
+    let dev = sp.dev_ref().clone();
+    // Remember any value the caller had exported so it can be put back.
+    let previous = std::env::var_os(CAP_VAR);
+    unsafe { std::env::set_var(CAP_VAR, "12"); }
+    let fused_result = FusedState::new(
+        dev.clone(),
+        cfg.n_columns,
+        32usize,
+        sp.initial_threshold_estimate(),
+    );
+    // Restore the environment first; only then may the result panic.
+    unsafe {
+        match previous {
+            Some(value) => std::env::set_var(CAP_VAR, value),
+            None => std::env::remove_var(CAP_VAR),
+        }
+    }
+    let fused = fused_result.expect("fused init");
+    // If the SM count cannot be queried, fall back to 16 as the original did.
+    let sm_count = match dev.attribute(
+        cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+    ) {
+        Ok(v) => v as u32,
+        Err(_) => 16u32,
+    };
+    let expected = sm_count.max(1).min(12);
+    assert_eq!(
+        fused.grid_dim_x,
+        expected,
+        "fused grid cap env override ignored: expected min(sm_count, 12) = {expected}, got {}",
+        fused.grid_dim_x,
+    );
+}
+/// A10G case seen in HF Jobs: cooperative limit 400 with batch 32. A grid of
+/// 16 would need 16 * 32 = 512 cooperative blocks, so the planner must clamp
+/// the grid width down to 12.
+#[test]
+fn batched_grid_plan_clamps_a10g_batch32_under_cooperative_limit() {
+    let clamped = plan_batched_grid_dim(16, 400, 32, false)
+        .expect("fits after clamp");
+    assert_eq!(clamped, 12);
+}
+/// With a cooperative limit (31) below the batch size (32) the planner must
+/// return an error mentioning `COOPERATIVE_LAUNCH_TOO_LARGE`.
+#[test]
+fn batched_grid_plan_reports_oversized_batch() {
+    let failure = plan_batched_grid_dim(16, 31, 32, false).unwrap_err();
+    assert!(failure.contains("COOPERATIVE_LAUNCH_TOO_LARGE"));
+}
+/// With the cluster flag set, the same undersized cooperative limit is not
+/// applied and the requested grid width of 16 comes back unchanged.
+#[test]
+fn batched_grid_plan_does_not_clamp_cluster_launches() {
+    let unclamped = plan_batched_grid_dim(16, 31, 32, true)
+        .expect("cluster path bypasses cooperative limit");
+    assert_eq!(unclamped, 16);
+}

overlay/htm_rust/src/lib.rs CHANGED Viewed

@@ -1,198 +1,198 @@
-//! pyo3 bindings for HTMRegion (Numenta BAMI-spec HTM).
-//!
-//! Exposed class:
-//!     HTMRegion(input_bits, n_columns, cells_per_column, seed) -> HTMRegion
-//!       .step(input_sdr: np.ndarray[bool; input_bits], learn: bool = True)
-//!           -> (active_columns: np.ndarray[bool; n_columns],
-//!               active_cells:   np.ndarray[bool; n_columns*cells_per_column],
-//!               predicted_cells:np.ndarray[bool; n_columns*cells_per_column],
-//!               anomaly: float)
-//!       .reset()
-//!       .n_columns -> int
-//!       .cells_per_column -> int
-//!       .input_bits -> int
-//!
-//! GIL is dropped during the heavy compute via `py.allow_threads(...)` so the
-//! region is effectively `Send` for Python-side threading.
-// pyo3 0.22 `#[pymethods]` expansion inserts an implicit `.into()` on the
-// returned `Result` to normalise the error type, which clippy reports as
-// `useless_conversion` when our methods already return `PyErr`. The emitted
-// code sits outside the user-written impl, so item-level allows don't reach
-// it; the module-wide allow is the documented workaround.
-#![allow(clippy::useless_conversion)]
-mod region;
-mod sp;
-mod tm;
-#[cfg(feature = "gpu")]
-mod gpu;
-use numpy::{
-    IntoPyArray, PyArray1, PyArray2, PyArrayMethods, PyReadonlyArray1, PyReadonlyArray2,
-    PyUntypedArrayMethods,
-};
-use pyo3::prelude::*;
-use crate::region::HTMRegionCore;
-/// Result of one HTM step: (active_columns, active_cells, predicted_cells, anomaly).
-type StepOutput<'py> = (
-    Bound<'py, PyArray1<bool>>,
-    Bound<'py, PyArray1<bool>>,
-    Bound<'py, PyArray1<bool>>,
-    f32,
-);
-#[pyclass(module = "htm_rust")]
-pub struct HTMRegion {
-    core: HTMRegionCore,
-}
-#[pymethods]
-impl HTMRegion {
-    /// Create a new HTM region.
-    ///
-    /// Args:
-    ///     input_bits: length of binary input SDR
-    ///     n_columns: number of mini-columns in the SP (e.g. 2048)
-    ///     cells_per_column: cells per column in the TM (e.g. 32)
-    ///     seed: RNG seed for reproducibility
-    #[new]
-    #[pyo3(signature = (input_bits, n_columns, cells_per_column, seed=42))]
-    fn new(
-        input_bits: usize,
-        n_columns: usize,
-        cells_per_column: usize,
-        seed: u64,
-    ) -> PyResult<Self> {
-        if input_bits == 0 {
-            return Err(pyo3::exceptions::PyValueError::new_err(
-                "input_bits must be > 0",
-            ));
-        }
-        if n_columns == 0 {
-            return Err(pyo3::exceptions::PyValueError::new_err(
-                "n_columns must be > 0",
-            ));
-        }
-        if cells_per_column == 0 {
-            return Err(pyo3::exceptions::PyValueError::new_err(
-                "cells_per_column must be > 0",
-            ));
-        }
-        Ok(Self {
-            core: HTMRegionCore::new(input_bits, n_columns, cells_per_column, seed),
-        })
-    }
-    #[getter]
-    fn input_bits(&self) -> usize { self.core.sp.cfg.input_bits }
-    #[getter]
-    fn n_columns(&self) -> usize { self.core.sp.cfg.n_columns }
-    #[getter]
-    fn cells_per_column(&self) -> usize { self.core.tm.cfg.cells_per_column }
-    /// Process one timestep.
-    ///
-    /// Args:
-    ///     input_sdr: 1-D numpy boolean array of length `input_bits`.
-    ///     learn: if True, update SP permanences and TM synapses.
-    ///
-    /// Returns:
-    ///     (active_columns, active_cells, predicted_cells, anomaly)
-    #[pyo3(signature = (input_sdr, learn=true))]
-    fn step<'py>(
-        &mut self,
-        py: Python<'py>,
-        input_sdr: PyReadonlyArray1<'py, bool>,
-        learn: bool,
-    ) -> PyResult<StepOutput<'py>> {
-        let expected = self.core.sp.cfg.input_bits;
-        let slice = input_sdr.as_slice()?;
-        let got = slice.len();
-        if got != expected {
-            return Err(pyo3::exceptions::PyValueError::new_err(format!(
-                "input_sdr length {got} != expected input_bits {expected}",
-            )));
-        }
-        // Copy input to an owned Vec so we can drop the GIL.
-        let input_vec: Vec<bool> = slice.to_vec();
-        let (active_cols, active_cells, predicted_cells, anomaly) =
-            py.allow_threads(|| self.core.step(&input_vec, learn));
-        let a: Bound<'py, PyArray1<bool>> = active_cols.into_pyarray_bound(py);
-        let c: Bound<'py, PyArray1<bool>> = active_cells.into_pyarray_bound(py);
-        let p: Bound<'py, PyArray1<bool>> = predicted_cells.into_pyarray_bound(py);
-        Ok((a, c, p, anomaly))
-    }
-    /// Clear TM predictive state. Does NOT unlearn synapses.
-    fn reset(&mut self) { self.core.reset(); }
-    /// Process T timesteps from a `(T, input_bits)` bool ndarray.
-    ///
-    /// Returns:
-    ///     cols: (T, n_columns) float32 0/1 active-column mask
-    ///     anom: (T,) float32 anomaly scores
-    ///
-    /// Single GIL release for the whole pass, avoiding T × Python-call overhead.
-    #[pyo3(signature = (inputs, learn=true))]
-    fn step_many<'py>(
-        &mut self,
-        py: Python<'py>,
-        inputs: PyReadonlyArray2<'py, bool>,
-        learn: bool,
-    ) -> PyResult<(Bound<'py, PyArray2<f32>>, Bound<'py, PyArray1<f32>>)> {
-        let shape = inputs.shape();
-        if shape.len() != 2 {
-            return Err(pyo3::exceptions::PyValueError::new_err(
-                "inputs must be 2-D (T, input_bits)",
-            ));
-        }
-        let t = shape[0];
-        let bits = shape[1];
-        let expected = self.core.sp.cfg.input_bits;
-        if bits != expected {
-            return Err(pyo3::exceptions::PyValueError::new_err(format!(
-                "inputs last dim {bits} != expected input_bits {expected}",
-            )));
-        }
-        let slice = inputs.as_slice()?;
-        let n_cols = self.core.sp.cfg.n_columns;
-        // Own the input buffer so we can drop the GIL.
-        let input_vec: Vec<bool> = slice.to_vec();
-        let (cols_u8, anom) =
-            py.allow_threads(|| self.core.step_many(&input_vec, bits, t, learn));
-        // Convert u8 mask to f32 for direct numpy consumption.
-        let cols_f32: Vec<f32> = cols_u8.iter().map(|&b| b as f32).collect();
-        // Build (T, n_cols) and (T,) arrays.
-        let cols_arr =
-            numpy::PyArray1::from_vec_bound(py, cols_f32)
-                .reshape([t, n_cols])
-                .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("{e}")))?;
-        let anom_arr = numpy::PyArray1::from_vec_bound(py, anom);
-        Ok((cols_arr, anom_arr))
-    }
-}
-/// Python module entry point.
-#[pymodule]
-fn htm_rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
-    m.add_class::<HTMRegion>()?;
-    #[cfg(feature = "gpu")]
-    {
-        gpu::register(m)?;
-    }
-    m.add("__version__", env!("CARGO_PKG_VERSION"))?;
-    Ok(())
-}

+//! pyo3 bindings for HTMRegion (Numenta BAMI-spec HTM).
+//!
+//! Exposed class:
+//!     HTMRegion(input_bits, n_columns, cells_per_column, seed) -> HTMRegion
+//!       .step(input_sdr: np.ndarray[bool; input_bits], learn: bool = True)
+//!           -> (active_columns: np.ndarray[bool; n_columns],
+//!               active_cells:   np.ndarray[bool; n_columns*cells_per_column],
+//!               predicted_cells:np.ndarray[bool; n_columns*cells_per_column],
+//!               anomaly: float)
+//!       .reset()
+//!       .n_columns -> int
+//!       .cells_per_column -> int
+//!       .input_bits -> int
+//!
+//! GIL is dropped during the heavy compute via `py.allow_threads(...)` so the
+//! region is effectively `Send` for Python-side threading.
+// pyo3 0.22 `#[pymethods]` expansion inserts an implicit `.into()` on the
+// returned `Result` to normalise the error type, which clippy reports as
+// `useless_conversion` when our methods already return `PyErr`. The emitted
+// code sits outside the user-written impl, so item-level allows don't reach
+// it; the module-wide allow is the documented workaround.
+#![allow(clippy::useless_conversion)]
+mod region;
+mod sp;
+mod tm;
+#[cfg(feature = "gpu")]
+mod gpu;
+use numpy::{
+    IntoPyArray, PyArray1, PyArray2, PyArrayMethods, PyReadonlyArray1, PyReadonlyArray2,
+    PyUntypedArrayMethods,
+};
+use pyo3::prelude::*;
+use crate::region::HTMRegionCore;
+/// Result of one HTM step: (active_columns, active_cells, predicted_cells, anomaly).
+///
+/// The three arrays are 1-D boolean numpy arrays bound to the `'py` GIL
+/// lifetime; the trailing `f32` is the anomaly score for the step. This is the
+/// tuple that `HTMRegion::step` hands back to Python.
+type StepOutput<'py> = (
+    Bound<'py, PyArray1<bool>>,
+    Bound<'py, PyArray1<bool>>,
+    Bound<'py, PyArray1<bool>>,
+    f32,
+);
+#[pyclass(module = "htm_rust")]
+pub struct HTMRegion {
+    /// Rust-side region (Spatial Pooler + Temporal Memory) that every Python
+    /// call is forwarded to.
+    core: HTMRegionCore,
+}
+#[pymethods]
+impl HTMRegion {
+    /// Create a new HTM region.
+    ///
+    /// Args:
+    ///     input_bits: length of binary input SDR
+    ///     n_columns: number of mini-columns in the SP (e.g. 2048)
+    ///     cells_per_column: cells per column in the TM (e.g. 32)
+    ///     seed: RNG seed for reproducibility
+    ///
+    /// Raises:
+    ///     ValueError: if any size is 0, or if `input_bits` is smaller than the
+    ///         default number of potential synapses per column.
+    #[new]
+    #[pyo3(signature = (input_bits, n_columns, cells_per_column, seed=42))]
+    fn new(
+        input_bits: usize,
+        n_columns: usize,
+        cells_per_column: usize,
+        seed: u64,
+    ) -> PyResult<Self> {
+        if input_bits == 0 {
+            return Err(pyo3::exceptions::PyValueError::new_err(
+                "input_bits must be > 0",
+            ));
+        }
+        if n_columns == 0 {
+            return Err(pyo3::exceptions::PyValueError::new_err(
+                "n_columns must be > 0",
+            ));
+        }
+        if cells_per_column == 0 {
+            return Err(pyo3::exceptions::PyValueError::new_err(
+                "cells_per_column must be > 0",
+            ));
+        }
+        // `HTMRegionCore::new` caps `potential_radius` at `input_bits` but keeps
+        // the default `potential_synapses`, and `SpatialPooler::new` asserts
+        // `potential_radius >= potential_synapses`. Reject inputs that are too
+        // narrow here so Python sees a ValueError instead of a Rust panic.
+        let min_input_bits = crate::sp::SpatialPoolerConfig::default().potential_synapses;
+        if input_bits < min_input_bits {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "input_bits must be >= {min_input_bits} (potential synapses per column), got {input_bits}",
+            )));
+        }
+        Ok(Self {
+            core: HTMRegionCore::new(input_bits, n_columns, cells_per_column, seed),
+        })
+    }
+    /// Length of the input SDR this region was built for.
+    #[getter]
+    fn input_bits(&self) -> usize { self.core.sp.cfg.input_bits }
+    /// Number of mini-columns in the Spatial Pooler.
+    #[getter]
+    fn n_columns(&self) -> usize { self.core.sp.cfg.n_columns }
+    /// Number of cells per column in the Temporal Memory.
+    #[getter]
+    fn cells_per_column(&self) -> usize { self.core.tm.cfg.cells_per_column }
+    /// Process one timestep.
+    ///
+    /// Args:
+    ///     input_sdr: 1-D numpy boolean array of length `input_bits`.
+    ///     learn: if True, update SP permanences and TM synapses.
+    ///
+    /// Returns:
+    ///     (active_columns, active_cells, predicted_cells, anomaly)
+    ///
+    /// Raises:
+    ///     ValueError: if `input_sdr` does not have exactly `input_bits` entries.
+    ///     An error from numpy is propagated when the array cannot be viewed
+    ///     as a slice.
+    #[pyo3(signature = (input_sdr, learn=true))]
+    fn step<'py>(
+        &mut self,
+        py: Python<'py>,
+        input_sdr: PyReadonlyArray1<'py, bool>,
+        learn: bool,
+    ) -> PyResult<StepOutput<'py>> {
+        let expected = self.core.sp.cfg.input_bits;
+        let slice = input_sdr.as_slice()?;
+        let got = slice.len();
+        if got != expected {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "input_sdr length {got} != expected input_bits {expected}",
+            )));
+        }
+        // Copy input to an owned Vec so we can drop the GIL.
+        let input_vec: Vec<bool> = slice.to_vec();
+        let (active_cols, active_cells, predicted_cells, anomaly) =
+            py.allow_threads(|| self.core.step(&input_vec, learn));
+        // Hand the owned vectors over to numpy without another copy.
+        let a: Bound<'py, PyArray1<bool>> = active_cols.into_pyarray_bound(py);
+        let c: Bound<'py, PyArray1<bool>> = active_cells.into_pyarray_bound(py);
+        let p: Bound<'py, PyArray1<bool>> = predicted_cells.into_pyarray_bound(py);
+        Ok((a, c, p, anomaly))
+    }
+    /// Clear TM predictive state. Does NOT unlearn synapses.
+    fn reset(&mut self) { self.core.reset(); }
+    /// Process T timesteps from a `(T, input_bits)` bool ndarray.
+    ///
+    /// Returns:
+    ///     cols: (T, n_columns) float32 0/1 active-column mask
+    ///     anom: (T,) float32 anomaly scores
+    ///
+    /// Raises:
+    ///     ValueError: if `inputs` is not 2-D or its last dimension differs
+    ///         from `input_bits`.
+    ///     RuntimeError: if the flat output cannot be reshaped to (T, n_columns).
+    ///
+    /// Single GIL release for the whole pass, avoiding T × Python-call overhead.
+    #[pyo3(signature = (inputs, learn=true))]
+    fn step_many<'py>(
+        &mut self,
+        py: Python<'py>,
+        inputs: PyReadonlyArray2<'py, bool>,
+        learn: bool,
+    ) -> PyResult<(Bound<'py, PyArray2<f32>>, Bound<'py, PyArray1<f32>>)> {
+        let shape = inputs.shape();
+        if shape.len() != 2 {
+            return Err(pyo3::exceptions::PyValueError::new_err(
+                "inputs must be 2-D (T, input_bits)",
+            ));
+        }
+        let t = shape[0];
+        let bits = shape[1];
+        let expected = self.core.sp.cfg.input_bits;
+        if bits != expected {
+            return Err(pyo3::exceptions::PyValueError::new_err(format!(
+                "inputs last dim {bits} != expected input_bits {expected}",
+            )));
+        }
+        let slice = inputs.as_slice()?;
+        let n_cols = self.core.sp.cfg.n_columns;
+        // Own the input buffer so we can drop the GIL.
+        let input_vec: Vec<bool> = slice.to_vec();
+        let (cols_u8, anom) =
+            py.allow_threads(|| self.core.step_many(&input_vec, bits, t, learn));
+        // Convert u8 mask to f32 for direct numpy consumption.
+        let cols_f32: Vec<f32> = cols_u8.iter().map(|&b| b as f32).collect();
+        // Build (T, n_cols) and (T,) arrays.
+        let cols_arr =
+            numpy::PyArray1::from_vec_bound(py, cols_f32)
+                .reshape([t, n_cols])
+                .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("{e}")))?;
+        let anom_arr = numpy::PyArray1::from_vec_bound(py, anom);
+        Ok((cols_arr, anom_arr))
+    }
+}
+/// Python module entry point: registers `HTMRegion`, the GPU bindings when the
+/// `gpu` feature is enabled, and the crate version as `__version__`.
+#[pymodule]
+fn htm_rust(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_class::<HTMRegion>()?;
+    #[cfg(feature = "gpu")]
+    {
+        gpu::register(m)?;
+    }
+    // The last registration's result doubles as the function's return value.
+    m.add("__version__", env!("CARGO_PKG_VERSION"))
+}

overlay/htm_rust/src/region.rs CHANGED Viewed

@@ -1,94 +1,94 @@
-//! HTMRegion: compose SpatialPooler + TemporalMemory into a single step().
-use crate::sp::{SpatialPooler, SpatialPoolerConfig};
-use crate::tm::{TemporalMemory, TemporalMemoryConfig};
-pub struct HTMRegionCore {
-    pub sp: SpatialPooler,
-    pub tm: TemporalMemory,
-}
-impl HTMRegionCore {
-    pub fn new(
-        input_bits: usize,
-        n_columns: usize,
-        cells_per_column: usize,
-        seed: u64,
-    ) -> Self {
-        let defaults = SpatialPoolerConfig::default();
-        let sp_cfg = SpatialPoolerConfig {
-            input_bits,
-            n_columns,
-            // Scale potential_radius to at most the input size.
-            potential_radius: defaults.potential_radius.min(input_bits),
-            ..defaults
-        };
-        let tm_cfg = TemporalMemoryConfig {
-            n_columns,
-            cells_per_column,
-            ..TemporalMemoryConfig::default()
-        };
-        Self {
-            sp: SpatialPooler::new(sp_cfg, seed),
-            tm: TemporalMemory::new(tm_cfg, seed.wrapping_add(0x9E3779B97F4A7C15)),
-        }
-    }
-    /// Process one timestep. Returns (active_columns_mask,
-    /// active_cells_mask, predicted_cells_mask, anomaly).
-    pub fn step(
-        &mut self,
-        input_sdr: &[bool],
-        learn: bool,
-    ) -> (Vec<bool>, Vec<bool>, Vec<bool>, f32) {
-        let active_cols = self.sp.compute(input_sdr, learn);
-        let mut active_cols_mask = vec![false; self.sp.cfg.n_columns];
-        for &c in &active_cols {
-            active_cols_mask[c as usize] = true;
-        }
-        let anomaly = self.tm.compute(&active_cols, learn);
-        // active_cells and predictive_cells are stored as Vec<bool> already.
-        let active_cells_mask = self.tm.active_cells.clone();
-        let predicted_cells_mask = self.tm.predictive_cells.clone();
-        (active_cols_mask, active_cells_mask, predicted_cells_mask, anomaly)
-    }
-    pub fn reset(&mut self) {
-        self.tm.reset();
-    }
-    /// Process T timesteps in one call. Returns flat `(T*n_columns)` active-column
-    /// mask (u8 0/1) and `(T,)` anomaly scores.
-    ///
-    /// Amortises the per-step Python round-trip for training: one GIL release,
-    /// one copy-out. Used by `HTMLayer.step_many`.
-    pub fn step_many(
-        &mut self,
-        inputs_flat: &[bool],
-        input_bits: usize,
-        t: usize,
-        learn: bool,
-    ) -> (Vec<u8>, Vec<f32>) {
-        let n_cols = self.sp.cfg.n_columns;
-        debug_assert_eq!(inputs_flat.len(), t * input_bits);
-        let mut cols = vec![0u8; t * n_cols];
-        let mut anom = vec![0f32; t];
-        for ti in 0..t {
-            let off = ti * input_bits;
-            let input = &inputs_flat[off..off + input_bits];
-            let active_cols = self.sp.compute(input, learn);
-            let co = ti * n_cols;
-            for &c in &active_cols {
-                cols[co + c as usize] = 1;
-            }
-            anom[ti] = self.tm.compute(&active_cols, learn);
-        }
-        (cols, anom)
-    }
-}

+//! HTMRegion: compose SpatialPooler + TemporalMemory into a single step().
+use crate::sp::{SpatialPooler, SpatialPoolerConfig};
+use crate::tm::{TemporalMemory, TemporalMemoryConfig};
+/// A Spatial Pooler feeding a Temporal Memory, driven together one timestep
+/// at a time.
+pub struct HTMRegionCore {
+    /// Maps an input SDR to a set of active columns.
+    pub sp: SpatialPooler,
+    /// Consumes the active columns and produces cell state plus an anomaly score.
+    pub tm: TemporalMemory,
+}
+impl HTMRegionCore {
+    /// Build a region from default SP/TM configurations, overriding only the
+    /// sizes supplied by the caller. The TM receives a seed derived from
+    /// `seed` by a fixed wrapping offset.
+    pub fn new(
+        input_bits: usize,
+        n_columns: usize,
+        cells_per_column: usize,
+        seed: u64,
+    ) -> Self {
+        let sp_defaults = SpatialPoolerConfig::default();
+        // The potential radius may never exceed the number of input bits.
+        let radius = sp_defaults.potential_radius.min(input_bits);
+        let sp_cfg = SpatialPoolerConfig {
+            input_bits,
+            n_columns,
+            potential_radius: radius,
+            ..sp_defaults
+        };
+        let tm_cfg = TemporalMemoryConfig {
+            n_columns,
+            cells_per_column,
+            ..TemporalMemoryConfig::default()
+        };
+        let tm_seed = seed.wrapping_add(0x9E3779B97F4A7C15);
+        let sp = SpatialPooler::new(sp_cfg, seed);
+        let tm = TemporalMemory::new(tm_cfg, tm_seed);
+        Self { sp, tm }
+    }
+    /// Run a single timestep and return
+    /// `(active_columns_mask, active_cells_mask, predicted_cells_mask, anomaly)`.
+    pub fn step(
+        &mut self,
+        input_sdr: &[bool],
+        learn: bool,
+    ) -> (Vec<bool>, Vec<bool>, Vec<bool>, f32) {
+        let winners = self.sp.compute(input_sdr, learn);
+        // Expand the winner index list into a dense per-column mask.
+        let mut column_mask = vec![false; self.sp.cfg.n_columns];
+        winners
+            .iter()
+            .for_each(|&col| column_mask[col as usize] = true);
+        let anomaly = self.tm.compute(&winners, learn);
+        // The TM already keeps its cell state as dense boolean vectors, so a
+        // clone is all that is needed to hand them out.
+        (
+            column_mask,
+            self.tm.active_cells.clone(),
+            self.tm.predictive_cells.clone(),
+            anomaly,
+        )
+    }
+    /// Forward a reset to the Temporal Memory.
+    pub fn reset(&mut self) {
+        self.tm.reset();
+    }
+    /// Run `t` timesteps back to back. Returns the flat `(T*n_columns)`
+    /// active-column mask (u8 0/1) and the `(T,)` anomaly scores.
+    ///
+    /// `inputs_flat` holds the `t` input SDRs concatenated, `input_bits`
+    /// entries each. Batching the steps lets the Python binding release the
+    /// GIL once and copy results out once. Used by `HTMLayer.step_many`.
+    pub fn step_many(
+        &mut self,
+        inputs_flat: &[bool],
+        input_bits: usize,
+        t: usize,
+        learn: bool,
+    ) -> (Vec<u8>, Vec<f32>) {
+        let n_cols = self.sp.cfg.n_columns;
+        debug_assert_eq!(inputs_flat.len(), t * input_bits);
+        let mut cols = vec![0u8; t * n_cols];
+        let mut anom = vec![0f32; t];
+        for (ti, score) in anom.iter_mut().enumerate() {
+            let start = ti * input_bits;
+            let frame = &inputs_flat[start..start + input_bits];
+            let winners = self.sp.compute(frame, learn);
+            // Row `ti` of the output mask begins at this flat offset.
+            let row_base = ti * n_cols;
+            for &col in &winners {
+                cols[row_base + col as usize] = 1;
+            }
+            *score = self.tm.compute(&winners, learn);
+        }
+        (cols, anom)
+    }
+}

overlay/htm_rust/src/sp.rs CHANGED Viewed

@@ -1,302 +1,302 @@
-//! Numenta BAMI-spec Spatial Pooler.
-//!
-//! Implements:
-//!   - 2048 (configurable) mini-columns with proximal dendrites
-//!   - `potential_synapses` (default 40) synapses per column sampled from
-//!     `potential_radius` (default 1024) random input bits
-//!   - Permanence in [0.0, 1.0] (f32), connected_threshold = 0.5
-//!   - syn_perm_active_inc = +0.04, syn_perm_inactive_dec = -0.008
-//!   - Global k-WTA inhibition (top `sparsity` fraction of columns)
-//!   - Boost factor with exponential duty-cycle tracking (Numenta formula)
-//!
-//! Reference: BAMI "Spatial Pooling Algorithm Details" (Numenta, 2017).
-use rand::Rng;
-use rand::SeedableRng;
-use rand::seq::SliceRandom;
-use rand_xoshiro::Xoshiro256PlusPlus;
-/// A single proximal dendrite: a sparse set of potential synapses onto
-/// specific input bit indices, with per-synapse permanence values.
-#[derive(Clone)]
-pub struct ProximalDendrite {
-    /// Indices into the input SDR.  Length == potential_synapses.
-    pub inputs: Vec<u32>,
-    /// Permanence for each potential synapse (same length as `inputs`).
-    pub perms: Vec<f32>,
-}
-pub struct SpatialPoolerConfig {
-    pub input_bits: usize,
-    pub n_columns: usize,
-    /// Size of the random input sample per column.
-    pub potential_radius: usize,
-    /// Number of potential synapses per column's proximal dendrite.
-    pub potential_synapses: usize,
-    pub connected_threshold: f32,
-    pub syn_perm_active_inc: f32,
-    pub syn_perm_inactive_dec: f32,
-    /// Target fraction of columns active per step (e.g. 0.02 for 2%).
-    pub sparsity: f32,
-    /// Duty cycle EMA period.
-    pub duty_cycle_period: f32,
-    /// Boost strength. Set to 0.0 to disable boosting.
-    pub boost_strength: f32,
-    /// Initial permanence span around the connected threshold.
-    pub init_perm_span: f32,
-}
-impl Default for SpatialPoolerConfig {
-    fn default() -> Self {
-        Self {
-            input_bits: 16384,
-            n_columns: 2048,
-            potential_radius: 1024,
-            potential_synapses: 40,
-            connected_threshold: 0.5,
-            syn_perm_active_inc: 0.04,
-            syn_perm_inactive_dec: 0.008,
-            sparsity: 0.02,
-            duty_cycle_period: 1000.0,
-            boost_strength: 1.0,
-            init_perm_span: 0.1,
-        }
-    }
-}
-pub struct SpatialPooler {
-    pub cfg: SpatialPoolerConfig,
-    pub columns: Vec<ProximalDendrite>,
-    /// Exponential moving average of "column was active" per step.
-    pub active_duty_cycle: Vec<f32>,
-    /// Exponential moving average of "overlap exceeded threshold" per step.
-    pub overlap_duty_cycle: Vec<f32>,
-    /// Boost factor per column.
-    pub boost: Vec<f32>,
-    rng: Xoshiro256PlusPlus,
-    iter_count: u64,
-}
-impl SpatialPooler {
-    pub fn new(cfg: SpatialPoolerConfig, seed: u64) -> Self {
-        assert!(cfg.input_bits >= cfg.potential_radius,
-            "input_bits ({}) must be >= potential_radius ({})",
-            cfg.input_bits, cfg.potential_radius);
-        assert!(cfg.potential_radius >= cfg.potential_synapses,
-            "potential_radius ({}) must be >= potential_synapses ({})",
-            cfg.potential_radius, cfg.potential_synapses);
-        let mut rng = Xoshiro256PlusPlus::seed_from_u64(seed);
-        let mut columns = Vec::with_capacity(cfg.n_columns);
-        for _ in 0..cfg.n_columns {
-            // Sample `potential_radius` distinct input indices, then from those
-            // pick `potential_synapses` as the actual proximal synapses.
-            // Using partial Fisher-Yates via shuffle on a pool index range.
-            let mut pool: Vec<u32> = (0..cfg.input_bits as u32).collect();
-            // Efficient partial shuffle: swap the first `potential_radius`
-            // items with random items from the rest (Durstenfeld step).
-            for i in 0..cfg.potential_radius.min(pool.len()) {
-                let j = rng.gen_range(i..pool.len());
-                pool.swap(i, j);
-            }
-            let window = &mut pool[..cfg.potential_radius];
-            window.shuffle(&mut rng);
-            let mut inputs: Vec<u32> = window[..cfg.potential_synapses].to_vec();
-            inputs.sort_unstable();
-            let perms: Vec<f32> = (0..cfg.potential_synapses)
-                .map(|_| {
-                    let delta: f32 = rng.gen_range(-cfg.init_perm_span..cfg.init_perm_span);
-                    (cfg.connected_threshold + delta).clamp(0.0, 1.0)
-                })
-                .collect();
-            columns.push(ProximalDendrite { inputs, perms });
-        }
-        let n = cfg.n_columns;
-        Self {
-            cfg,
-            columns,
-            active_duty_cycle: vec![0.0; n],
-            overlap_duty_cycle: vec![0.0; n],
-            boost: vec![1.0; n],
-            rng,
-            iter_count: 0,
-        }
-    }
-    /// Process one step: compute overlaps, inhibit, learn (if `learn`), update
-    /// duty cycles and boosts. Returns the set of active column indices.
-    pub fn compute(&mut self, input: &[bool], learn: bool) -> Vec<u32> {
-        assert_eq!(input.len(), self.cfg.input_bits);
-        // 1) Overlap score per column (sum of CONNECTED synapses onto active inputs).
-        //    Also track raw overlap for the overlap-duty-cycle.
-        let n = self.cfg.n_columns;
-        let mut overlaps: Vec<f32> = vec![0.0; n];
-        let mut raw_overlaps: Vec<u32> = vec![0; n];
-        for (ci, col) in self.columns.iter().enumerate() {
-            let mut s: u32 = 0;
-            for (syn_i, &inp) in col.inputs.iter().enumerate() {
-                if input[inp as usize] && col.perms[syn_i] >= self.cfg.connected_threshold {
-                    s += 1;
-                }
-            }
-            raw_overlaps[ci] = s;
-            overlaps[ci] = (s as f32) * self.boost[ci];
-        }
-        // 2) Global k-WTA inhibition. Select top-k columns by boosted overlap.
-        let k = ((self.cfg.sparsity * n as f32).round() as usize).max(1);
-        let active: Vec<u32> = top_k(&overlaps, k);
-        // 3) Hebbian learning on active columns.
-        if learn {
-            for &ci in &active {
-                let col = &mut self.columns[ci as usize];
-                for (syn_i, &inp) in col.inputs.iter().enumerate() {
-                    if input[inp as usize] {
-                        col.perms[syn_i] =
-                            (col.perms[syn_i] + self.cfg.syn_perm_active_inc).min(1.0);
-                    } else {
-                        col.perms[syn_i] =
-                            (col.perms[syn_i] - self.cfg.syn_perm_inactive_dec).max(0.0);
-                    }
-                }
-            }
-        }
-        // 4) Update duty cycles (EMA with period T -> alpha = 1/T).
-        let period = self.cfg.duty_cycle_period.max(1.0);
-        let alpha = 1.0 / period;
-        // Column is "overlapping enough" if raw overlap >= stimulus_threshold.
-        // Numenta uses min_overlap; we use 1 as a conservative floor.
-        let stimulus_threshold = 1.0_f32;
-        // Mark active columns.
-        let mut active_mask = vec![false; n];
-        for &ci in &active {
-            active_mask[ci as usize] = true;
-        }
-        for i in 0..n {
-            let active_sample = if active_mask[i] { 1.0 } else { 0.0 };
-            let overlap_sample = if (raw_overlaps[i] as f32) >= stimulus_threshold {
-                1.0
-            } else {
-                0.0
-            };
-            self.active_duty_cycle[i] =
-                (1.0 - alpha) * self.active_duty_cycle[i] + alpha * active_sample;
-            self.overlap_duty_cycle[i] =
-                (1.0 - alpha) * self.overlap_duty_cycle[i] + alpha * overlap_sample;
-        }
-        // 5) Boost factor: b_i = exp(-boost_strength * (duty_i - mean_duty)).
-        //    Under-used columns (duty < mean) get boost > 1.
-        if learn && self.cfg.boost_strength > 0.0 {
-            let mean_duty: f32 =
-                self.active_duty_cycle.iter().sum::<f32>() / (n as f32);
-            for i in 0..n {
-                self.boost[i] =
-                    (-self.cfg.boost_strength * (self.active_duty_cycle[i] - mean_duty)).exp();
-            }
-            // 6) Permanence bump for chronically under-stimulated columns.
-            //    If overlap_duty_cycle[i] < min_pct_overlap * max_duty_in_neighborhood,
-            //    bump all permanences by syn_perm_active_inc * 0.1.
-            //    With global inhibition, "neighborhood" = all columns.
-            let max_overlap_duty = self
-                .overlap_duty_cycle
-                .iter()
-                .cloned()
-                .fold(0.0_f32, f32::max);
-            let min_pct_overlap_duty = 0.001_f32 * max_overlap_duty;
-            if max_overlap_duty > 0.0 {
-                for i in 0..n {
-                    if self.overlap_duty_cycle[i] < min_pct_overlap_duty {
-                        for p in &mut self.columns[i].perms {
-                            *p = (*p + self.cfg.syn_perm_active_inc * 0.1).min(1.0);
-                        }
-                    }
-                }
-            }
-        }
-        self.iter_count = self.iter_count.wrapping_add(1);
-        let _ = &mut self.rng; // suppress unused-mut when learn=false
-        active
-    }
-}
-/// Return the indices of the top-k values in `scores`.
-/// Ties broken by index order. Output is sorted ascending.
-fn top_k(scores: &[f32], k: usize) -> Vec<u32> {
-    if k == 0 {
-        return Vec::new();
-    }
-    let mut idx: Vec<u32> = (0..scores.len() as u32).collect();
-    // Partial sort: put top-k at the front by descending score.
-    // Use select_nth_unstable_by on (desc score, asc index).
-    idx.select_nth_unstable_by(k - 1, |&a, &b| {
-        let sa = scores[a as usize];
-        let sb = scores[b as usize];
-        // Reverse for descending.
-        match sb.partial_cmp(&sa).unwrap_or(std::cmp::Ordering::Equal) {
-            std::cmp::Ordering::Equal => a.cmp(&b),
-            ord => ord,
-        }
-    });
-    let mut winners: Vec<u32> = idx[..k].to_vec();
-    winners.sort_unstable();
-    winners
-}
-// ---------------------------------------------------------------------------
-// Tests
-// ---------------------------------------------------------------------------
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use rand::Rng;
-    use rand::SeedableRng;
-    use rand_xoshiro::Xoshiro256PlusPlus;
-    #[test]
-    fn sp_sparsity_exact_2pct() {
-        // BAMI says "top ~2%"; with 2048 columns that's round(0.02*2048) = 41.
-        // The SP must produce *exactly* that count, no more, no less, and with
-        // no duplicate indices.
-        let cfg = SpatialPoolerConfig::default();
-        let expected_k = (cfg.sparsity * cfg.n_columns as f32).round() as usize;
-        assert!(expected_k > 0);
-        let input_bits = cfg.input_bits;
-        let mut sp = SpatialPooler::new(cfg, 42);
-        let mut rng = Xoshiro256PlusPlus::seed_from_u64(7);
-        for _ in 0..100 {
-            // 2% sparse random input SDR.
-            let on_bits = (0.02 * input_bits as f32) as usize;
-            let mut sdr = vec![false; input_bits];
-            for _ in 0..on_bits {
-                let i = rng.gen_range(0..input_bits);
-                sdr[i] = true;
-            }
-            let active = sp.compute(&sdr, true);
-            assert_eq!(
-                active.len(),
-                expected_k,
-                "SP must emit exactly {expected_k} active columns"
-            );
-            let mut a = active.clone();
-            a.sort_unstable();
-            a.dedup();
-            assert_eq!(a.len(), expected_k);
-        }
-    }
-}

+//! Numenta BAMI-spec Spatial Pooler.
+//!
+//! Implements:
+//!   - 2048 (configurable) mini-columns with proximal dendrites
+//!   - `potential_synapses` (default 40) synapses per column sampled from
+//!     `potential_radius` (default 1024) random input bits
+//!   - Permanence in [0.0, 1.0] (f32), connected_threshold = 0.5
+//!   - syn_perm_active_inc = +0.04, syn_perm_inactive_dec = -0.008
+//!   - Global k-WTA inhibition (top `sparsity` fraction of columns)
+//!   - Boost factor with exponential duty-cycle tracking (Numenta formula)
+//!
+//! Reference: BAMI "Spatial Pooling Algorithm Details" (Numenta, 2017).
+use rand::Rng;
+use rand::SeedableRng;
+use rand::seq::SliceRandom;
+use rand_xoshiro::Xoshiro256PlusPlus;
+/// A single proximal dendrite: a sparse set of potential synapses onto
+/// specific input bit indices, with per-synapse permanence values.
+/// `inputs` and `perms` are parallel vectors: entry `i` of each describes
+/// the same synapse. `SpatialPooler::new` stores `inputs` sorted ascending.
+#[derive(Clone)]
+pub struct ProximalDendrite {
+    /// Indices into the input SDR.  Length == potential_synapses.
+    pub inputs: Vec<u32>,
+    /// Permanence for each potential synapse (same length as `inputs`).
+    /// `SpatialPooler` keeps every value clamped to [0.0, 1.0].
+    pub perms: Vec<f32>,
+}
+/// Tunable parameters of the Spatial Pooler. See the `Default` impl for the
+/// values used when nothing is overridden.
+pub struct SpatialPoolerConfig {
+    /// Length of the input SDR. `SpatialPooler::compute` asserts that its
+    /// input slice has exactly this many entries.
+    pub input_bits: usize,
+    /// Number of mini-columns (one proximal dendrite each).
+    pub n_columns: usize,
+    /// Size of the random input sample per column.
+    /// Must be <= `input_bits` and >= `potential_synapses` (asserted in `new`).
+    pub potential_radius: usize,
+    /// Number of potential synapses per column's proximal dendrite.
+    pub potential_synapses: usize,
+    /// A synapse counts towards overlap when its permanence is >= this value.
+    pub connected_threshold: f32,
+    /// Added to the permanence of a winning column's synapse whose input bit
+    /// is on (result capped at 1.0).
+    pub syn_perm_active_inc: f32,
+    /// Subtracted from the permanence of a winning column's synapse whose
+    /// input bit is off (result floored at 0.0).
+    pub syn_perm_inactive_dec: f32,
+    /// Target fraction of columns active per step (e.g. 0.02 for 2%).
+    /// `compute` activates `round(sparsity * n_columns)` columns, at least 1.
+    pub sparsity: f32,
+    /// Duty cycle EMA period. Values below 1.0 are treated as 1.0.
+    pub duty_cycle_period: f32,
+    /// Boost strength. Set to 0.0 to disable boosting.
+    /// NOTE(review): 0.0 also skips the weak-column permanence bump in
+    /// `compute`, because both live under the same condition.
+    pub boost_strength: f32,
+    /// Initial permanence span around the connected threshold: initial
+    /// permanences are drawn from `connected_threshold ± init_perm_span`
+    /// and clamped to [0.0, 1.0].
+    pub init_perm_span: f32,
+}
+impl Default for SpatialPoolerConfig {
+    /// Defaults matching the values listed in this module's header comment:
+    /// 2048 columns over a 16384-bit input, 40 synapses per column drawn from
+    /// a 1024-bit sample, 2% sparsity, boosting enabled.
+    fn default() -> Self {
+        SpatialPoolerConfig {
+            // Geometry.
+            n_columns: 2048,
+            input_bits: 16384,
+            potential_synapses: 40,
+            potential_radius: 1024,
+            // Permanence dynamics.
+            init_perm_span: 0.1,
+            connected_threshold: 0.5,
+            syn_perm_inactive_dec: 0.008,
+            syn_perm_active_inc: 0.04,
+            // Inhibition and homeostasis.
+            sparsity: 0.02,
+            boost_strength: 1.0,
+            duty_cycle_period: 1000.0,
+        }
+    }
+}
+/// Spatial Pooler state: configuration, per-column proximal dendrites and
+/// the per-column statistics that drive boosting. All per-column vectors are
+/// created with `cfg.n_columns` entries by `new`.
+pub struct SpatialPooler {
+    /// Parameters this pooler was built with.
+    pub cfg: SpatialPoolerConfig,
+    /// One proximal dendrite per column, indexed by column id.
+    pub columns: Vec<ProximalDendrite>,
+    /// Exponential moving average of "column was active" per step.
+    pub active_duty_cycle: Vec<f32>,
+    /// Exponential moving average of "overlap exceeded threshold" per step
+    /// (a step counts when the column's unboosted overlap is at least 1).
+    pub overlap_duty_cycle: Vec<f32>,
+    /// Boost factor per column. Starts at 1.0 for every column.
+    pub boost: Vec<f32>,
+    /// RNG seeded in `new` and used there to wire the columns; `compute`
+    /// does not draw from it.
+    rng: Xoshiro256PlusPlus,
+    /// Number of `compute` calls so far (wrapping on overflow).
+    iter_count: u64,
+}
+impl SpatialPooler {
+    /// Build a Spatial Pooler whose columns are randomly wired to the input.
+    ///
+    /// For every column, `potential_radius` distinct input indices are drawn
+    /// uniformly from `0..input_bits`; `potential_synapses` of those become the
+    /// column's synapses (stored sorted ascending). Each synapse starts with a
+    /// permanence of `connected_threshold + delta`, clamped to [0.0, 1.0],
+    /// where `delta` is uniform in `[-init_perm_span, init_perm_span)`. A
+    /// non-positive (or NaN) `init_perm_span` gives `delta = 0.0` and draws
+    /// nothing from the RNG, instead of panicking on an empty range.
+    ///
+    /// The result is fully determined by `cfg` and `seed`.
+    ///
+    /// # Panics
+    /// Panics if `input_bits < potential_radius`, if
+    /// `potential_radius < potential_synapses`, or if `input_bits` does not
+    /// fit in a `u32` (synapse targets are stored as `u32`).
+    pub fn new(cfg: SpatialPoolerConfig, seed: u64) -> Self {
+        assert!(cfg.input_bits >= cfg.potential_radius,
+            "input_bits ({}) must be >= potential_radius ({})",
+            cfg.input_bits, cfg.potential_radius);
+        assert!(cfg.potential_radius >= cfg.potential_synapses,
+            "potential_radius ({}) must be >= potential_synapses ({})",
+            cfg.potential_radius, cfg.potential_synapses);
+        // `input_bits as u32` below would otherwise truncate silently.
+        assert!(cfg.input_bits <= u32::MAX as usize,
+            "input_bits ({}) must fit in a u32",
+            cfg.input_bits);
+        let mut rng = Xoshiro256PlusPlus::seed_from_u64(seed);
+        let mut columns = Vec::with_capacity(cfg.n_columns);
+        // One scratch buffer reused by every column; it is reset to the
+        // identity permutation each time, so the RNG draw sequence (and hence
+        // the wiring for a given seed) is the same as with a fresh allocation.
+        let mut pool: Vec<u32> = Vec::with_capacity(cfg.input_bits);
+        for _ in 0..cfg.n_columns {
+            pool.clear();
+            pool.extend(0..cfg.input_bits as u32);
+            // Partial Fisher-Yates (Durstenfeld): after this loop the first
+            // `potential_radius` slots hold a uniform sample without
+            // replacement. `i < potential_radius <= pool.len()` keeps the
+            // range non-empty.
+            for i in 0..cfg.potential_radius {
+                let j = rng.gen_range(i..pool.len());
+                pool.swap(i, j);
+            }
+            // Pick the actual synapses from the sampled window.
+            let window = &mut pool[..cfg.potential_radius];
+            window.shuffle(&mut rng);
+            let mut inputs: Vec<u32> = window[..cfg.potential_synapses].to_vec();
+            inputs.sort_unstable();
+            let perms: Vec<f32> = (0..cfg.potential_synapses)
+                .map(|_| {
+                    // gen_range panics on an empty range, so only sample when
+                    // the span is strictly positive.
+                    let delta: f32 = if cfg.init_perm_span > 0.0 {
+                        rng.gen_range(-cfg.init_perm_span..cfg.init_perm_span)
+                    } else {
+                        0.0
+                    };
+                    (cfg.connected_threshold + delta).clamp(0.0, 1.0)
+                })
+                .collect();
+            columns.push(ProximalDendrite { inputs, perms });
+        }
+        let n = cfg.n_columns;
+        Self {
+            cfg,
+            columns,
+            active_duty_cycle: vec![0.0; n],
+            overlap_duty_cycle: vec![0.0; n],
+            boost: vec![1.0; n],
+            rng,
+            iter_count: 0,
+        }
+    }
+    /// Run one Spatial Pooler step on `input` and return the active columns
+    /// (ascending column indices).
+    ///
+    /// Stages: overlap scoring, global top-k inhibition, Hebbian update of
+    /// the winners (only when `learn`), duty-cycle update (always), then
+    /// boost recomputation and the weak-column permanence bump (only when
+    /// `learn` and `boost_strength > 0.0`).
+    ///
+    /// Panics if `input.len() != cfg.input_bits`.
+    pub fn compute(&mut self, input: &[bool], learn: bool) -> Vec<u32> {
+        assert_eq!(input.len(), self.cfg.input_bits);
+        let n = self.cfg.n_columns;
+        let connected = self.cfg.connected_threshold;
+        // Stage 1: per column, count connected synapses that land on an "on"
+        // input bit. The unboosted count feeds the overlap duty cycle; the
+        // boosted score feeds inhibition.
+        let mut raw_overlaps: Vec<u32> = vec![0; n];
+        let mut overlaps: Vec<f32> = vec![0.0; n];
+        for (ci, col) in self.columns.iter().enumerate() {
+            let hits = col
+                .inputs
+                .iter()
+                .enumerate()
+                .filter(|&(syn_i, &bit)| input[bit as usize] && col.perms[syn_i] >= connected)
+                .count() as u32;
+            raw_overlaps[ci] = hits;
+            overlaps[ci] = (hits as f32) * self.boost[ci];
+        }
+        // Stage 2: global k-WTA inhibition on the boosted scores.
+        let k = ((self.cfg.sparsity * n as f32).round() as usize).max(1);
+        let active: Vec<u32> = top_k(&overlaps, k);
+        // Stage 3: Hebbian learning, winners only. Synapses on "on" bits are
+        // strengthened, all others weakened; permanences stay in [0, 1].
+        if learn {
+            let inc = self.cfg.syn_perm_active_inc;
+            let dec = self.cfg.syn_perm_inactive_dec;
+            for &winner in &active {
+                let col = &mut self.columns[winner as usize];
+                for (syn_i, &bit) in col.inputs.iter().enumerate() {
+                    let bit_on = input[bit as usize];
+                    let perm = &mut col.perms[syn_i];
+                    *perm = if bit_on {
+                        (*perm + inc).min(1.0)
+                    } else {
+                        (*perm - dec).max(0.0)
+                    };
+                }
+            }
+        }
+        // Stage 4: duty cycles as EMAs with alpha = 1 / period (period >= 1).
+        let alpha = 1.0 / self.cfg.duty_cycle_period.max(1.0);
+        let keep = 1.0 - alpha;
+        // A column counts as "overlapping" when its unboosted overlap reaches
+        // this floor (Numenta's min_overlap; fixed at 1 here).
+        let stimulus_threshold = 1.0_f32;
+        let mut is_active = vec![false; n];
+        for &winner in &active {
+            is_active[winner as usize] = true;
+        }
+        for i in 0..n {
+            let active_sample = if is_active[i] { 1.0 } else { 0.0 };
+            let overlap_sample = if (raw_overlaps[i] as f32) >= stimulus_threshold {
+                1.0
+            } else {
+                0.0
+            };
+            self.active_duty_cycle[i] = keep * self.active_duty_cycle[i] + alpha * active_sample;
+            self.overlap_duty_cycle[i] = keep * self.overlap_duty_cycle[i] + alpha * overlap_sample;
+        }
+        if learn && self.cfg.boost_strength > 0.0 {
+            // Stage 5: b_i = exp(-boost_strength * (duty_i - mean_duty)), so
+            // columns active less often than average get a boost above 1.
+            let strength = self.cfg.boost_strength;
+            let mean_duty: f32 = self.active_duty_cycle.iter().sum::<f32>() / (n as f32);
+            for i in 0..n {
+                self.boost[i] = (-strength * (self.active_duty_cycle[i] - mean_duty)).exp();
+            }
+            // Stage 6: columns whose overlap duty cycle is below 0.1% of the
+            // best column's get every permanence nudged up by a tenth of the
+            // active increment. With global inhibition the "neighborhood" is
+            // the whole column set.
+            let max_overlap_duty = self
+                .overlap_duty_cycle
+                .iter()
+                .copied()
+                .fold(0.0_f32, f32::max);
+            let min_pct_overlap_duty = 0.001_f32 * max_overlap_duty;
+            if max_overlap_duty > 0.0 {
+                let bump = self.cfg.syn_perm_active_inc * 0.1;
+                for i in 0..n {
+                    if self.overlap_duty_cycle[i] < min_pct_overlap_duty {
+                        for p in self.columns[i].perms.iter_mut() {
+                            *p = (*p + bump).min(1.0);
+                        }
+                    }
+                }
+            }
+        }
+        self.iter_count = self.iter_count.wrapping_add(1);
+        // The RNG is not drawn from here; this only touches the field.
+        let _ = &mut self.rng;
+        active
+    }
+}
+/// Return the indices of the `k` largest values in `scores`.
+///
+/// Columns are ranked by descending score; equal scores are ranked by
+/// ascending index, so the selection is deterministic. The returned indices
+/// are sorted ascending.
+///
+/// `k` is clamped to `scores.len()`: asking for more winners than there are
+/// scores returns every index rather than panicking, and `k == 0` or an empty
+/// `scores` returns an empty vector.
+///
+/// NaN scores compare as "equal" to everything, which is not a consistent
+/// ordering; callers should not pass NaN.
+fn top_k(scores: &[f32], k: usize) -> Vec<u32> {
+    // select_nth_unstable_by panics when its index is >= len, and the slice
+    // below would too, so never ask for more than we have.
+    let k = k.min(scores.len());
+    if k == 0 {
+        return Vec::new();
+    }
+    let mut idx: Vec<u32> = (0..scores.len() as u32).collect();
+    // Partial selection: afterwards idx[..k] holds the k best indices (in
+    // unspecified order) under (descending score, ascending index).
+    idx.select_nth_unstable_by(k - 1, |&a, &b| {
+        let sa = scores[a as usize];
+        let sb = scores[b as usize];
+        // Operands swapped for descending order; index breaks ties.
+        sb.partial_cmp(&sa)
+            .unwrap_or(std::cmp::Ordering::Equal)
+            .then_with(|| a.cmp(&b))
+    });
+    idx.truncate(k);
+    idx.sort_unstable();
+    idx
+}
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rand::Rng;
+    use rand::SeedableRng;
+    use rand_xoshiro::Xoshiro256PlusPlus;
+    /// With the default config (2048 columns, 2% sparsity) every step must
+    /// activate exactly round(0.02 * 2048) = 41 distinct columns, however
+    /// the random input looks.
+    #[test]
+    fn sp_sparsity_exact_2pct() {
+        let cfg = SpatialPoolerConfig::default();
+        let input_bits = cfg.input_bits;
+        let expected_k = (cfg.sparsity * cfg.n_columns as f32).round() as usize;
+        assert!(expected_k > 0);
+        let mut sp = SpatialPooler::new(cfg, 42);
+        let mut rng = Xoshiro256PlusPlus::seed_from_u64(7);
+        // Number of draws per input SDR (~2% of the bits; repeated draws of
+        // the same bit are allowed, so slightly fewer bits may end up on).
+        let on_bits = (0.02 * input_bits as f32) as usize;
+        for _step in 0..100 {
+            let mut sdr = vec![false; input_bits];
+            for _draw in 0..on_bits {
+                let bit = rng.gen_range(0..input_bits);
+                sdr[bit] = true;
+            }
+            let active = sp.compute(&sdr, true);
+            assert_eq!(
+                active.len(),
+                expected_k,
+                "SP must emit exactly {expected_k} active columns"
+            );
+            // No column may appear twice.
+            let mut unique = active.clone();
+            unique.sort_unstable();
+            unique.dedup();
+            assert_eq!(unique.len(), expected_k);
+        }
+    }
+}

overlay/htm_rust/src/tm.rs CHANGED Viewed

@@ -1,545 +1,545 @@
-//! Numenta BAMI-spec Temporal Memory.
-//!
-//! Key parameters (Numenta defaults):
-//!   - cells_per_column = 32
-//!   - max_segments_per_cell = 255
-//!   - max_synapses_per_segment = 32
-//!   - activation_threshold = 15  (CONNECTED synapses onto active cells)
-//!   - learning_threshold     = 13  (POTENTIAL synapses onto active cells)
-//!     (often called `minThreshold` / match threshold in BAMI)
-//!   - initial_permanence = 0.21
-//!   - connected_permanence = 0.50
-//!   - permanence_increment = 0.10
-//!   - permanence_decrement = 0.10
-//!   - predicted_segment_decrement = 0.10  (decay for segments that predicted
-//!     inactive columns; called `predictedSegmentDecrement` in BAMI)
-//!   - max_new_synapse_count = 20  (max synapses to grow on a new/reinforced seg)
-//!
-//! Algorithm (one step):
-//!   Given `active_columns` from the Spatial Pooler, and segment activity
-//!   caches `active_segments` and `matching_segments` computed *at the end of
-//!   the previous step*:
-//!
-//!   1. For each active column:
-//!        - If it contains any predicted cell (any cell with an active segment
-//!          from the previous depolarization), mark those cells active and
-//!          learn on the segment that predicted it.
-//!        - Else BURST the column: mark all cells in it active, and grow a new
-//!          segment on the best-matching cell in the column (or, if none,
-//!          on the cell with the fewest segments).
-//!   2. For every column that was predicted but did NOT become active
-//!      (matching segments on inactive columns), apply the
-//!      `predicted_segment_decrement` decay so spurious predictions fade.
-//!   3. Winner cells = active cells chosen for learning (1 per active column).
-//!   4. Compute segment activity for NEXT step:
-//!        - A segment's CONNECTED activity = #synapses with perm >= connected_perm
-//!          whose presynaptic cell is in `active_cells`. If >= activation_threshold
-//!          -> segment is "active" -> its cell is "predicted".
-//!        - A segment's POTENTIAL activity = #synapses whose presynaptic cell is
-//!          in `active_cells` (regardless of permanence). If >= learning_threshold
-//!          -> segment is "matching".
-//!
-//! Anomaly score = (active columns with no prior predicted cells)
-//!                  / (# active columns).
-use rand::Rng;
-use rand::SeedableRng;
-use rand_xoshiro::Xoshiro256PlusPlus;
-type CellIdx = u32;
-type SegmentIdx = u32;
-#[derive(Clone)]
-pub struct Synapse {
-    pub presynaptic_cell: CellIdx,
-    pub permanence: f32,
-}
-#[derive(Clone)]
-pub struct Segment {
-    pub cell: CellIdx,
-    pub synapses: Vec<Synapse>,
-    /// Cached counters; recomputed each step.
-    pub num_active_connected: u32,
-    pub num_active_potential: u32,
-    /// Simple "last iter touched" stat for least-used cell selection.
-    pub last_used_iteration: u64,
-}
-pub struct TemporalMemoryConfig {
-    pub n_columns: usize,
-    pub cells_per_column: usize,
-    pub activation_threshold: u32,
-    pub learning_threshold: u32,
-    pub initial_permanence: f32,
-    pub connected_permanence: f32,
-    pub permanence_increment: f32,
-    pub permanence_decrement: f32,
-    pub predicted_segment_decrement: f32,
-    pub max_segments_per_cell: usize,
-    pub max_synapses_per_segment: usize,
-    pub max_new_synapse_count: usize,
-}
-impl Default for TemporalMemoryConfig {
-    fn default() -> Self {
-        Self {
-            n_columns: 2048,
-            cells_per_column: 32,
-            activation_threshold: 15,
-            learning_threshold: 13,
-            initial_permanence: 0.21,
-            connected_permanence: 0.50,
-            permanence_increment: 0.10,
-            permanence_decrement: 0.10,
-            predicted_segment_decrement: 0.10,
-            max_segments_per_cell: 255,
-            max_synapses_per_segment: 32,
-            max_new_synapse_count: 20,
-        }
-    }
-}
-pub struct TemporalMemory {
-    pub cfg: TemporalMemoryConfig,
-    /// All segments in the region. Indexed by SegmentIdx.
-    pub segments: Vec<Segment>,
-    /// For each cell, the list of segments that belong to it.
-    pub cell_segments: Vec<Vec<SegmentIdx>>,
-    /// Active cells in the current step.
-    pub active_cells: Vec<bool>,
-    /// Winner cells (subset of active_cells, 1 per active column) for learning.
-    pub winner_cells: Vec<bool>,
-    /// Predictive cells for the current step = cells whose segment became
-    /// active at the end of the previous step.
-    pub predictive_cells: Vec<bool>,
-    /// Cached list of segment indices that were "active" last compute().
-    active_segments_prev: Vec<SegmentIdx>,
-    /// Cached list of segment indices that were "matching" last compute().
-    matching_segments_prev: Vec<SegmentIdx>,
-    rng: Xoshiro256PlusPlus,
-    iter_count: u64,
-}
-impl TemporalMemory {
-    pub fn new(cfg: TemporalMemoryConfig, seed: u64) -> Self {
-        let total = cfg.n_columns * cfg.cells_per_column;
-        Self {
-            cell_segments: vec![Vec::new(); total],
-            active_cells: vec![false; total],
-            winner_cells: vec![false; total],
-            predictive_cells: vec![false; total],
-            cfg,
-            segments: Vec::new(),
-            active_segments_prev: Vec::new(),
-            matching_segments_prev: Vec::new(),
-            rng: Xoshiro256PlusPlus::seed_from_u64(seed),
-            iter_count: 0,
-        }
-    }
-    pub fn reset(&mut self) {
-        for v in self.active_cells.iter_mut() { *v = false; }
-        for v in self.winner_cells.iter_mut() { *v = false; }
-        for v in self.predictive_cells.iter_mut() { *v = false; }
-        self.active_segments_prev.clear();
-        self.matching_segments_prev.clear();
-    }
-    #[inline]
-    fn col_of(&self, cell: CellIdx) -> usize {
-        (cell as usize) / self.cfg.cells_per_column
-    }
-    #[inline]
-    fn cells_in_col(&self, col: usize) -> std::ops::Range<CellIdx> {
-        let base = (col * self.cfg.cells_per_column) as CellIdx;
-        base..(base + self.cfg.cells_per_column as CellIdx)
-    }
-    /// Process one step.
-    ///
-    /// `active_columns` is the set of column indices activated by the Spatial
-    /// Pooler this step.  Returns the anomaly score in [0, 1].
-    pub fn compute(&mut self, active_columns: &[u32], learn: bool) -> f32 {
-        self.iter_count = self.iter_count.wrapping_add(1);
-        // Snapshot previous-step cell activity (for learning on segments).
-        let prev_active_cells = self.active_cells.clone();
-        let prev_winner_cells = self.winner_cells.clone();
-        // Move current "predictive" (computed at the end of the last step)
-        // into local variables; we'll overwrite predictive_cells later.
-        let predictive_prev = self.predictive_cells.clone();
-        // Group active segments and matching segments by column of their
-        // owning cell, for the columns that are active this step.
-        let n_cols = self.cfg.n_columns;
-        // active_segs_by_col[col] = segment indices whose cell is in col and
-        // which were "active" in the previous depolarization.
-        // matching_segs_by_col[col] = similarly for "matching".
-        let mut active_segs_by_col: Vec<Vec<SegmentIdx>> = vec![Vec::new(); n_cols];
-        let mut matching_segs_by_col: Vec<Vec<SegmentIdx>> = vec![Vec::new(); n_cols];
-        for &seg in &self.active_segments_prev {
-            let col = self.col_of(self.segments[seg as usize].cell);
-            active_segs_by_col[col].push(seg);
-        }
-        for &seg in &self.matching_segments_prev {
-            let col = self.col_of(self.segments[seg as usize].cell);
-            matching_segs_by_col[col].push(seg);
-        }
-        // Columns that are active this step (for O(1) lookup).
-        let mut active_col_mask = vec![false; n_cols];
-        for &c in active_columns { active_col_mask[c as usize] = true; }
-        // Zero out current cell activations.
-        for v in self.active_cells.iter_mut() { *v = false; }
-        for v in self.winner_cells.iter_mut() { *v = false; }
-        // Track anomaly.
-        let mut unpredicted_cols = 0u32;
-        // We'll collect (segment, learn_mode) pairs for segment reinforcement
-        // so we can batch-apply permanence adjustments using prev_active_cells.
-        // learn_mode: "reinforce_correctly_predicted", "punish_incorrectly_matched"
-        enum LearnOp {
-            Reinforce(SegmentIdx),       // correctly predicted
-            Grow {                        // bursting column: grow on chosen segment
-                segment: SegmentIdx,
-                #[allow(dead_code)]
-                winner_cell: CellIdx,
-            },
-            Punish(SegmentIdx),           // matching segment on inactive column
-        }
-        let mut ops: Vec<LearnOp> = Vec::new();
-        // ---- 1) Process active columns ----
-        for &col in active_columns {
-            let col = col as usize;
-            let active_segs = &active_segs_by_col[col];
-            if !active_segs.is_empty() {
-                // "Activate predicted column": each cell with an active segment
-                // becomes active and is a winner; reinforce that segment.
-                let mut seen_cells: Vec<CellIdx> = Vec::new();
-                for &seg_i in active_segs {
-                    let seg = &self.segments[seg_i as usize];
-                    let cell = seg.cell;
-                    if !seen_cells.contains(&cell) {
-                        self.active_cells[cell as usize] = true;
-                        self.winner_cells[cell as usize] = true;
-                        seen_cells.push(cell);
-                    }
-                    if learn {
-                        ops.push(LearnOp::Reinforce(seg_i));
-                    }
-                }
-            } else {
-                // ----- BURST -----
-                unpredicted_cols += 1;
-                for c in self.cells_in_col(col) {
-                    self.active_cells[c as usize] = true;
-                }
-                // Pick a winner cell + segment for learning.
-                if learn {
-                    let matching = &matching_segs_by_col[col];
-                    let (winner_cell, target_segment) = if !matching.is_empty() {
-                        // Best-matching segment = highest num_active_potential.
-                        let mut best = matching[0];
-                        let mut best_score = self.segments[best as usize].num_active_potential;
-                        for &s in &matching[1..] {
-                            let score = self.segments[s as usize].num_active_potential;
-                            if score > best_score {
-                                best_score = score;
-                                best = s;
-                            }
-                        }
-                        let wc = self.segments[best as usize].cell;
-                        (wc, Some(best))
-                    } else {
-                        // Least-used cell in column, then grow a new segment.
-                        let winner = self.least_used_cell(col);
-                        (winner, None)
-                    };
-                    self.winner_cells[winner_cell as usize] = true;
-                    let segment_id = match target_segment {
-                        Some(s) => s,
-                        None => {
-                            // Create a fresh empty segment on winner cell.
-                            self.create_segment(winner_cell)
-                        }
-                    };
-                    ops.push(LearnOp::Grow { segment: segment_id, winner_cell });
-                } else {
-                    // No learning: still pick some winner cell (arbitrary)
-                    // so downstream code that inspects winner_cells isn't empty.
-                    let matching = &matching_segs_by_col[col];
-                    let winner_cell = if !matching.is_empty() {
-                        self.segments[matching[0] as usize].cell
-                    } else {
-                        self.least_used_cell(col)
-                    };
-                    self.winner_cells[winner_cell as usize] = true;
-                }
-            }
-        }
-        // ---- 2) Punish matching segments on INACTIVE columns ----
-        if learn && self.cfg.predicted_segment_decrement > 0.0 {
-            for &seg_i in &self.matching_segments_prev {
-                let col = self.col_of(self.segments[seg_i as usize].cell);
-                if !active_col_mask[col] {
-                    ops.push(LearnOp::Punish(seg_i));
-                }
-            }
-        }
-        // ---- 3) Apply learning ----
-        if learn {
-            for op in ops {
-                match op {
-                    LearnOp::Reinforce(seg_i) => {
-                        self.reinforce_segment(seg_i, &prev_active_cells);
-                        // Optionally grow up to N new synapses to winner cells
-                        // of the previous step.
-                        self.grow_synapses_on_segment(seg_i, &prev_winner_cells);
-                    }
-                    LearnOp::Grow { segment, winner_cell: _ } => {
-                        self.reinforce_segment(segment, &prev_active_cells);
-                        self.grow_synapses_on_segment(segment, &prev_winner_cells);
-                    }
-                    LearnOp::Punish(seg_i) => {
-                        let dec = self.cfg.predicted_segment_decrement;
-                        for syn in &mut self.segments[seg_i as usize].synapses {
-                            if prev_active_cells[syn.presynaptic_cell as usize] {
-                                syn.permanence = (syn.permanence - dec).max(0.0);
-                            }
-                        }
-                    }
-                }
-            }
-        }
-        // ---- 4) Compute segment activity & predictive cells for NEXT step ----
-        // We have to use the *current* active_cells (just set above).
-        let mut next_active_segs: Vec<SegmentIdx> = Vec::new();
-        let mut next_matching_segs: Vec<SegmentIdx> = Vec::new();
-        for v in self.predictive_cells.iter_mut() { *v = false; }
-        let conn = self.cfg.connected_permanence;
-        let act_thr = self.cfg.activation_threshold;
-        let learn_thr = self.cfg.learning_threshold;
-        for (seg_i, seg) in self.segments.iter_mut().enumerate() {
-            let mut n_conn: u32 = 0;
-            let mut n_pot: u32 = 0;
-            for syn in &seg.synapses {
-                if self.active_cells[syn.presynaptic_cell as usize] {
-                    n_pot += 1;
-                    if syn.permanence >= conn { n_conn += 1; }
-                }
-            }
-            seg.num_active_connected = n_conn;
-            seg.num_active_potential = n_pot;
-            if n_conn >= act_thr {
-                next_active_segs.push(seg_i as SegmentIdx);
-                self.predictive_cells[seg.cell as usize] = true;
-            }
-            if n_pot >= learn_thr {
-                next_matching_segs.push(seg_i as SegmentIdx);
-            }
-        }
-        self.active_segments_prev = next_active_segs;
-        self.matching_segments_prev = next_matching_segs;
-        // Keep predictive_prev unused-guard; we no longer need it but
-        // retained to document intent.
-        let _ = predictive_prev;
-        // Anomaly.
-        if active_columns.is_empty() {
-            0.0
-        } else {
-            (unpredicted_cols as f32) / (active_columns.len() as f32)
-        }
-    }
-    /// Reinforce synapses on `seg`: +inc if presynaptic is active last step,
-    /// -dec otherwise.
-    fn reinforce_segment(&mut self, seg_i: SegmentIdx, prev_active_cells: &[bool]) {
-        let inc = self.cfg.permanence_increment;
-        let dec = self.cfg.permanence_decrement;
-        let seg = &mut self.segments[seg_i as usize];
-        seg.last_used_iteration = self.iter_count;
-        for syn in &mut seg.synapses {
-            if prev_active_cells[syn.presynaptic_cell as usize] {
-                syn.permanence = (syn.permanence + inc).min(1.0);
-            } else {
-                syn.permanence = (syn.permanence - dec).max(0.0);
-            }
-        }
-    }
-    /// Grow up to `max_new_synapse_count - current_potential` new synapses
-    /// from previous winner cells that are not already connected to this seg.
-    fn grow_synapses_on_segment(
-        &mut self,
-        seg_i: SegmentIdx,
-        prev_winner_cells: &[bool],
-    ) {
-        let initial_perm = self.cfg.initial_permanence;
-        let cap = self.cfg.max_synapses_per_segment;
-        let max_new = self.cfg.max_new_synapse_count;
-        // Gather candidate cells (prev winners not already presynaptic to this seg).
-        let already: Vec<CellIdx> = self.segments[seg_i as usize]
-            .synapses
-            .iter()
-            .map(|s| s.presynaptic_cell)
-            .collect();
-        let mut candidates: Vec<CellIdx> = Vec::new();
-        for (cell_i, &b) in prev_winner_cells.iter().enumerate() {
-            if b && !already.contains(&(cell_i as CellIdx)) {
-                candidates.push(cell_i as CellIdx);
-            }
-        }
-        // How many can we add?
-        let current_len = self.segments[seg_i as usize].synapses.len();
-        let room = cap.saturating_sub(current_len);
-        let mut to_add = max_new.min(candidates.len()).min(room);
-        // Random sample without replacement from candidates.
-        while to_add > 0 {
-            let idx = self.rng.gen_range(0..candidates.len());
-            let pre = candidates.swap_remove(idx);
-            self.segments[seg_i as usize].synapses.push(Synapse {
-                presynaptic_cell: pre,
-                permanence: initial_perm,
-            });
-            to_add -= 1;
-        }
-    }
-    fn create_segment(&mut self, cell: CellIdx) -> SegmentIdx {
-        // Enforce per-cell segment cap by evicting least-recently-used segment
-        // if necessary.
-        let cell_segs = &mut self.cell_segments[cell as usize];
-        if cell_segs.len() >= self.cfg.max_segments_per_cell {
-            // Find LRU segment.
-            let (lru_pos, &lru_id) = cell_segs
-                .iter()
-                .enumerate()
-                .min_by_key(|(_, &sid)| self.segments[sid as usize].last_used_iteration)
-                .expect("cell_segs non-empty");
-            // Clear that segment in place and reuse its index.
-            self.segments[lru_id as usize].synapses.clear();
-            self.segments[lru_id as usize].num_active_connected = 0;
-            self.segments[lru_id as usize].num_active_potential = 0;
-            self.segments[lru_id as usize].last_used_iteration = self.iter_count;
-            // Keep at same position in cell_segs.
-            let _ = lru_pos;
-            return lru_id;
-        }
-        let new_id = self.segments.len() as SegmentIdx;
-        self.segments.push(Segment {
-            cell,
-            synapses: Vec::with_capacity(self.cfg.max_new_synapse_count),
-            num_active_connected: 0,
-            num_active_potential: 0,
-            last_used_iteration: self.iter_count,
-        });
-        cell_segs.push(new_id);
-        new_id
-    }
-    fn least_used_cell(&mut self, col: usize) -> CellIdx {
-        // Cell with the fewest segments; break ties randomly.
-        let mut min_segs = usize::MAX;
-        let mut candidates: Vec<CellIdx> = Vec::new();
-        for c in self.cells_in_col(col) {
-            let n = self.cell_segments[c as usize].len();
-            if n < min_segs {
-                min_segs = n;
-                candidates.clear();
-                candidates.push(c);
-            } else if n == min_segs {
-                candidates.push(c);
-            }
-        }
-        let idx = self.rng.gen_range(0..candidates.len());
-        candidates[idx]
-    }
-}
-// ---------------------------------------------------------------------------
-// Tests
-// ---------------------------------------------------------------------------
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::sp::{SpatialPooler, SpatialPoolerConfig};
-    use rand::Rng;
-    use rand::SeedableRng;
-    use rand_xoshiro::Xoshiro256PlusPlus;
-    #[test]
-    fn tm_learns_repeating_sequence() {
-        // Sequence A -> B -> C -> A -> B -> C -> ... should drive anomaly down.
-        let cfg = SpatialPoolerConfig::default();
-        let mut sp = SpatialPooler::new(cfg, 123);
-        let mut tm = TemporalMemory::new(TemporalMemoryConfig::default(), 456);
-        // Build 3 fixed random SDRs of 2% sparsity.
-        let mut rng = Xoshiro256PlusPlus::seed_from_u64(99);
-        let input_bits = sp.cfg.input_bits;
-        let make_sdr = |rng: &mut Xoshiro256PlusPlus| {
-            let mut v = vec![false; input_bits];
-            let on = (0.02 * input_bits as f32) as usize;
-            let mut placed = 0;
-            while placed < on {
-                let i = rng.gen_range(0..input_bits);
-                if !v[i] {
-                    v[i] = true;
-                    placed += 1;
-                }
-            }
-            v
-        };
-        let seqs = [make_sdr(&mut rng), make_sdr(&mut rng), make_sdr(&mut rng)];
-        // Warm up SP first so that columns are reliable for each symbol.
-        for _ in 0..200 {
-            for s in &seqs {
-                sp.compute(s, true);
-            }
-        }
-        // Reset TM so prediction state is clean.
-        tm.reset();
-        // Record anomaly over a window early and late.
-        let mut early_anoms: Vec<f32> = Vec::new();
-        let mut late_anoms: Vec<f32> = Vec::new();
-        for iter in 0..250 {
-            for s in &seqs {
-                let active = sp.compute(s, false);
-                let anomaly = tm.compute(&active, true);
-                if iter == 10 { early_anoms.push(anomaly); }
-                if iter == 249 { late_anoms.push(anomaly); }
-            }
-        }
-        let mean = |v: &[f32]| v.iter().sum::<f32>() / (v.len() as f32);
-        let early = mean(&early_anoms);
-        let late = mean(&late_anoms);
-        println!("early_anomaly={early}, late_anomaly={late}");
-        assert!(
-            late < 0.5 * early + 1e-6,
-            "late anomaly ({late}) should be < 0.5 * early anomaly ({early})"
-        );
-    }
-}

+//! Numenta BAMI-spec Temporal Memory.
+//!
+//! Key parameters (Numenta defaults):
+//!   - cells_per_column = 32
+//!   - max_segments_per_cell = 255
+//!   - max_synapses_per_segment = 32
+//!   - activation_threshold = 15  (CONNECTED synapses onto active cells)
+//!   - learning_threshold     = 13  (POTENTIAL synapses onto active cells)
+//!     (often called `minThreshold` / match threshold in BAMI)
+//!   - initial_permanence = 0.21
+//!   - connected_permanence = 0.50
+//!   - permanence_increment = 0.10
+//!   - permanence_decrement = 0.10
+//!   - predicted_segment_decrement = 0.10  (decay for segments that predicted
+//!     inactive columns; called `predictedSegmentDecrement` in BAMI)
+//!   - max_new_synapse_count = 20  (max synapses to grow on a new/reinforced seg)
+//!
+//! Algorithm (one step):
+//!   Given `active_columns` from the Spatial Pooler, and segment activity
+//!   caches `active_segments` and `matching_segments` computed *at the end of
+//!   the previous step*:
+//!
+//!   1. For each active column:
+//!        - If it contains any predicted cell (any cell with an active segment
+//!          from the previous depolarization), mark those cells active and
+//!          learn on the segment that predicted it.
+//!        - Else BURST the column: mark all cells in it active, and grow a new
+//!          segment on the best-matching cell in the column (or, if none,
+//!          on the cell with the fewest segments).
+//!   2. For every column that was predicted but did NOT become active
+//!      (matching segments on inactive columns), apply the
+//!      `predicted_segment_decrement` decay so spurious predictions fade.
+//!   3. Winner cells = active cells chosen for learning: every predicted cell
+//!      in a correctly predicted column, and exactly one cell in a bursting column.
+//!   4. Compute segment activity for NEXT step:
+//!        - A segment's CONNECTED activity = #synapses with perm >= connected_perm
+//!          whose presynaptic cell is in `active_cells`. If >= activation_threshold
+//!          -> segment is "active" -> its cell is "predicted".
+//!        - A segment's POTENTIAL activity = #synapses whose presynaptic cell is
+//!          in `active_cells` (regardless of permanence). If >= learning_threshold
+//!          -> segment is "matching".
+//!
+//! Anomaly score = (active columns with no prior predicted cells)
+//!                  / (# active columns).
+use rand::Rng;
+use rand::SeedableRng;
+use rand_xoshiro::Xoshiro256PlusPlus;
+type CellIdx = u32;
+type SegmentIdx = u32;
+/// One synapse on a distal segment.
+#[derive(Clone)]
+pub struct Synapse {
+    /// Index of the cell this synapse listens to.
+    pub presynaptic_cell: CellIdx,
+    /// Permanence of the synapse. The learning code in this file clamps it to
+    /// [0, 1]; it counts as connected when >= `connected_permanence`.
+    pub permanence: f32,
+}
+/// A distal dendrite segment owned by a single cell.
+#[derive(Clone)]
+pub struct Segment {
+    /// Cell that owns this segment.
+    pub cell: CellIdx,
+    /// Synapses grown on this segment so far.
+    pub synapses: Vec<Synapse>,
+    /// Cached counters; recomputed each step.
+    pub num_active_connected: u32,
+    pub num_active_potential: u32,
+    /// Iteration at which this segment was created, recycled, or last
+    /// reinforced. Used to choose the least-recently-used segment to recycle
+    /// when its cell already holds `max_segments_per_cell` segments.
+    pub last_used_iteration: u64,
+}
+/// Tunable parameters of the Temporal Memory.
+pub struct TemporalMemoryConfig {
+    /// Number of columns; the region holds `n_columns * cells_per_column` cells.
+    pub n_columns: usize,
+    /// Cells in each column.
+    pub cells_per_column: usize,
+    /// Minimum number of active CONNECTED synapses for a segment to be "active".
+    pub activation_threshold: u32,
+    /// Minimum number of active POTENTIAL synapses for a segment to be "matching".
+    pub learning_threshold: u32,
+    /// Permanence assigned to newly grown synapses.
+    pub initial_permanence: f32,
+    /// Permanence at or above which a synapse counts as connected.
+    pub connected_permanence: f32,
+    /// Added to synapses whose presynaptic cell was active when reinforcing.
+    pub permanence_increment: f32,
+    /// Subtracted from synapses whose presynaptic cell was inactive when reinforcing.
+    pub permanence_decrement: f32,
+    /// Subtracted from active synapses of matching segments in columns that
+    /// did not become active; punishment is skipped unless this is > 0.
+    pub predicted_segment_decrement: f32,
+    /// Per-cell segment cap; reaching it recycles the least-recently-used segment.
+    pub max_segments_per_cell: usize,
+    /// Hard cap on the number of synapses a segment may hold.
+    pub max_synapses_per_segment: usize,
+    /// Upper bound on synapses grown on a segment in one learning update
+    /// (also the initial capacity reserved for a new segment's synapse list).
+    pub max_new_synapse_count: usize,
+}
+// Default parameters: the values listed as "Numenta defaults" in the module
+// header, for a region of 2048 columns.
+impl Default for TemporalMemoryConfig {
+    fn default() -> Self {
+        Self {
+            n_columns: 2048,
+            cells_per_column: 32,
+            activation_threshold: 15,
+            learning_threshold: 13,
+            initial_permanence: 0.21,
+            connected_permanence: 0.50,
+            permanence_increment: 0.10,
+            permanence_decrement: 0.10,
+            predicted_segment_decrement: 0.10,
+            max_segments_per_cell: 255,
+            max_synapses_per_segment: 32,
+            max_new_synapse_count: 20,
+        }
+    }
+}
+/// Temporal Memory region: learned segments plus the per-step cell state.
+///
+/// Per-cell vectors are indexed by `CellIdx`; the cells of column `c` occupy
+/// indices `c * cells_per_column .. (c + 1) * cells_per_column`.
+pub struct TemporalMemory {
+    /// Parameters this region was built with.
+    pub cfg: TemporalMemoryConfig,
+    /// All segments in the region. Indexed by SegmentIdx.
+    pub segments: Vec<Segment>,
+    /// For each cell, the list of segments that belong to it.
+    pub cell_segments: Vec<Vec<SegmentIdx>>,
+    /// Active cells in the current step.
+    pub active_cells: Vec<bool>,
+    /// Winner cells (subset of active_cells) chosen for learning.
+    pub winner_cells: Vec<bool>,
+    /// Predictive cells for the current step = cells whose segment became
+    /// active at the end of the previous step.
+    pub predictive_cells: Vec<bool>,
+    /// Cached list of segment indices that were "active" last compute().
+    active_segments_prev: Vec<SegmentIdx>,
+    /// Cached list of segment indices that were "matching" last compute().
+    matching_segments_prev: Vec<SegmentIdx>,
+    /// RNG used for tie-breaking among least-used cells and for sampling
+    /// presynaptic cells when growing synapses.
+    rng: Xoshiro256PlusPlus,
+    /// Number of compute() calls so far (wrapping); stamps segment usage.
+    iter_count: u64,
+}
+impl TemporalMemory {
+    /// Build an empty Temporal Memory: no segments, every per-cell flag
+    /// cleared, iteration counter at zero, and the RNG seeded from `seed`.
+    pub fn new(cfg: TemporalMemoryConfig, seed: u64) -> Self {
+        // One slot per cell in every per-cell vector.
+        let n_cells = cfg.n_columns * cfg.cells_per_column;
+        let cleared = vec![false; n_cells];
+        let rng = Xoshiro256PlusPlus::seed_from_u64(seed);
+        Self {
+            cfg,
+            segments: Vec::new(),
+            cell_segments: vec![Vec::new(); n_cells],
+            active_cells: cleared.clone(),
+            winner_cells: cleared.clone(),
+            predictive_cells: cleared,
+            active_segments_prev: Vec::new(),
+            matching_segments_prev: Vec::new(),
+            rng,
+            iter_count: 0,
+        }
+    }
+    /// Forget all per-step state (active / winner / predictive cells and the
+    /// cached active / matching segment lists). Learned segments, synapses,
+    /// the iteration counter and the RNG state are left untouched.
+    pub fn reset(&mut self) {
+        self.active_cells.fill(false);
+        self.winner_cells.fill(false);
+        self.predictive_cells.fill(false);
+        self.active_segments_prev.clear();
+        self.matching_segments_prev.clear();
+    }
+    /// Index of the column that contains `cell`.
+    #[inline]
+    fn col_of(&self, cell: CellIdx) -> usize {
+        let per_col = self.cfg.cells_per_column;
+        cell as usize / per_col
+    }
+    /// Half-open range of cell indices belonging to column `col`.
+    #[inline]
+    fn cells_in_col(&self, col: usize) -> std::ops::Range<CellIdx> {
+        let per_col = self.cfg.cells_per_column;
+        let first = (col * per_col) as CellIdx;
+        let past_last = first + per_col as CellIdx;
+        first..past_last
+    }
+    /// Process one step.
+    ///
+    /// `active_columns` is the set of column indices activated by the Spatial
+    /// Pooler this step.  Returns the anomaly score in [0, 1]: the fraction of
+    /// entries in `active_columns` whose column had no active segment from the
+    /// previous step (i.e. the column burst), or 0.0 when `active_columns` is
+    /// empty.
+    ///
+    /// With `learn == false` no permanence changes and no segment is created,
+    /// but cell activity, winner cells and the segment caches for the next
+    /// step are still updated, and the RNG may still be advanced when picking
+    /// a winner cell in a bursting column.
+    ///
+    /// Panics if an entry of `active_columns` is >= `cfg.n_columns`.
+    pub fn compute(&mut self, active_columns: &[u32], learn: bool) -> f32 {
+        self.iter_count = self.iter_count.wrapping_add(1);
+        // Snapshot previous-step cell activity (for learning on segments).
+        // Predictions for this step come from `active_segments_prev`, so the
+        // old `predictive_cells` vector does not need to be copied.
+        let prev_active_cells = self.active_cells.clone();
+        let prev_winner_cells = self.winner_cells.clone();
+        // Group active segments and matching segments by column of their
+        // owning cell, for the columns that are active this step.
+        let n_cols = self.cfg.n_columns;
+        // active_segs_by_col[col] = segment indices whose cell is in col and
+        // which were "active" in the previous depolarization.
+        // matching_segs_by_col[col] = similarly for "matching".
+        let mut active_segs_by_col: Vec<Vec<SegmentIdx>> = vec![Vec::new(); n_cols];
+        let mut matching_segs_by_col: Vec<Vec<SegmentIdx>> = vec![Vec::new(); n_cols];
+        for &seg in &self.active_segments_prev {
+            let col = self.col_of(self.segments[seg as usize].cell);
+            active_segs_by_col[col].push(seg);
+        }
+        for &seg in &self.matching_segments_prev {
+            let col = self.col_of(self.segments[seg as usize].cell);
+            matching_segs_by_col[col].push(seg);
+        }
+        // Columns that are active this step (for O(1) lookup).
+        let mut active_col_mask = vec![false; n_cols];
+        for &c in active_columns { active_col_mask[c as usize] = true; }
+        // Zero out current cell activations.
+        self.active_cells.fill(false);
+        self.winner_cells.fill(false);
+        // Track anomaly.
+        let mut unpredicted_cols = 0u32;
+        // Learning is deferred: step 1 and 2 only record what should happen,
+        // step 3 applies the permanence changes using the previous-step
+        // snapshots taken above.
+        enum LearnOp {
+            Reinforce(SegmentIdx),       // correctly predicted
+            Grow {                        // bursting column: grow on chosen segment
+                segment: SegmentIdx,
+                #[allow(dead_code)]
+                winner_cell: CellIdx,
+            },
+            Punish(SegmentIdx),           // matching segment on inactive column
+        }
+        let mut ops: Vec<LearnOp> = Vec::new();
+        // ---- 1) Process active columns ----
+        for &col in active_columns {
+            let col = col as usize;
+            let active_segs = &active_segs_by_col[col];
+            if !active_segs.is_empty() {
+                // "Activate predicted column": each cell with an active segment
+                // becomes active and is a winner; reinforce that segment.
+                let mut seen_cells: Vec<CellIdx> = Vec::new();
+                for &seg_i in active_segs {
+                    let seg = &self.segments[seg_i as usize];
+                    let cell = seg.cell;
+                    if !seen_cells.contains(&cell) {
+                        self.active_cells[cell as usize] = true;
+                        self.winner_cells[cell as usize] = true;
+                        seen_cells.push(cell);
+                    }
+                    if learn {
+                        ops.push(LearnOp::Reinforce(seg_i));
+                    }
+                }
+            } else {
+                // ----- BURST -----
+                unpredicted_cols += 1;
+                for c in self.cells_in_col(col) {
+                    self.active_cells[c as usize] = true;
+                }
+                // Pick a winner cell + segment for learning.
+                if learn {
+                    let matching = &matching_segs_by_col[col];
+                    let (winner_cell, target_segment) = if !matching.is_empty() {
+                        // Best-matching segment = highest num_active_potential
+                        // (first one wins on ties).
+                        let mut best = matching[0];
+                        let mut best_score = self.segments[best as usize].num_active_potential;
+                        for &s in &matching[1..] {
+                            let score = self.segments[s as usize].num_active_potential;
+                            if score > best_score {
+                                best_score = score;
+                                best = s;
+                            }
+                        }
+                        let wc = self.segments[best as usize].cell;
+                        (wc, Some(best))
+                    } else {
+                        // Least-used cell in column, then grow a new segment.
+                        let winner = self.least_used_cell(col);
+                        (winner, None)
+                    };
+                    self.winner_cells[winner_cell as usize] = true;
+                    let segment_id = match target_segment {
+                        Some(s) => s,
+                        None => {
+                            // Create a fresh empty segment on winner cell.
+                            self.create_segment(winner_cell)
+                        }
+                    };
+                    ops.push(LearnOp::Grow { segment: segment_id, winner_cell });
+                } else {
+                    // No learning: still pick some winner cell (arbitrary)
+                    // so downstream code that inspects winner_cells isn't empty.
+                    let matching = &matching_segs_by_col[col];
+                    let winner_cell = if !matching.is_empty() {
+                        self.segments[matching[0] as usize].cell
+                    } else {
+                        self.least_used_cell(col)
+                    };
+                    self.winner_cells[winner_cell as usize] = true;
+                }
+            }
+        }
+        // ---- 2) Punish matching segments on INACTIVE columns ----
+        if learn && self.cfg.predicted_segment_decrement > 0.0 {
+            for &seg_i in &self.matching_segments_prev {
+                let col = self.col_of(self.segments[seg_i as usize].cell);
+                if !active_col_mask[col] {
+                    ops.push(LearnOp::Punish(seg_i));
+                }
+            }
+        }
+        // ---- 3) Apply learning ----
+        if learn {
+            for op in ops {
+                match op {
+                    LearnOp::Reinforce(seg_i) => {
+                        self.reinforce_segment(seg_i, &prev_active_cells);
+                        // Optionally grow new synapses to winner cells of the
+                        // previous step.
+                        self.grow_synapses_on_segment(seg_i, &prev_winner_cells);
+                    }
+                    LearnOp::Grow { segment, winner_cell: _ } => {
+                        self.reinforce_segment(segment, &prev_active_cells);
+                        self.grow_synapses_on_segment(segment, &prev_winner_cells);
+                    }
+                    LearnOp::Punish(seg_i) => {
+                        // Only synapses that contributed to the wrong
+                        // prediction (presynaptic cell was active) decay.
+                        let dec = self.cfg.predicted_segment_decrement;
+                        for syn in &mut self.segments[seg_i as usize].synapses {
+                            if prev_active_cells[syn.presynaptic_cell as usize] {
+                                syn.permanence = (syn.permanence - dec).max(0.0);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        // ---- 4) Compute segment activity & predictive cells for NEXT step ----
+        // We have to use the *current* active_cells (just set above).
+        let mut next_active_segs: Vec<SegmentIdx> = Vec::new();
+        let mut next_matching_segs: Vec<SegmentIdx> = Vec::new();
+        self.predictive_cells.fill(false);
+        let conn = self.cfg.connected_permanence;
+        let act_thr = self.cfg.activation_threshold;
+        let learn_thr = self.cfg.learning_threshold;
+        for (seg_i, seg) in self.segments.iter_mut().enumerate() {
+            let mut n_conn: u32 = 0;
+            let mut n_pot: u32 = 0;
+            for syn in &seg.synapses {
+                if self.active_cells[syn.presynaptic_cell as usize] {
+                    n_pot += 1;
+                    if syn.permanence >= conn { n_conn += 1; }
+                }
+            }
+            seg.num_active_connected = n_conn;
+            seg.num_active_potential = n_pot;
+            if n_conn >= act_thr {
+                next_active_segs.push(seg_i as SegmentIdx);
+                self.predictive_cells[seg.cell as usize] = true;
+            }
+            if n_pot >= learn_thr {
+                next_matching_segs.push(seg_i as SegmentIdx);
+            }
+        }
+        self.active_segments_prev = next_active_segs;
+        self.matching_segments_prev = next_matching_segs;
+        // Anomaly.
+        if active_columns.is_empty() {
+            0.0
+        } else {
+            (unpredicted_cols as f32) / (active_columns.len() as f32)
+        }
+    }
+    /// Hebbian update of segment `seg_i`: synapses whose presynaptic cell is
+    /// set in `prev_active_cells` gain `permanence_increment` (capped at 1.0),
+    /// all others lose `permanence_decrement` (floored at 0.0). Also stamps
+    /// the segment as used in the current iteration.
+    fn reinforce_segment(&mut self, seg_i: SegmentIdx, prev_active_cells: &[bool]) {
+        let step_up = self.cfg.permanence_increment;
+        let step_down = self.cfg.permanence_decrement;
+        let now = self.iter_count;
+        let segment = &mut self.segments[seg_i as usize];
+        segment.last_used_iteration = now;
+        for synapse in segment.synapses.iter_mut() {
+            let was_active = prev_active_cells[synapse.presynaptic_cell as usize];
+            synapse.permanence = if was_active {
+                (synapse.permanence + step_up).min(1.0)
+            } else {
+                (synapse.permanence - step_down).max(0.0)
+            };
+        }
+    }
+    /// Grow up to `max_new_synapse_count - num_active_potential` new synapses
+    /// on segment `seg_i`, sampled at random (without replacement) from the
+    /// cells set in `prev_winner_cells` that are not already presynaptic to
+    /// this segment. New synapses start at `initial_permanence`.
+    ///
+    /// `num_active_potential` is the segment's cached overlap with the
+    /// previous step's active cells (0 for a freshly created or recycled
+    /// segment), so a segment that already matches the previous input well
+    /// grows few or no synapses, as in BAMI's
+    /// `newSynapseCount = SAMPLE_SIZE - numActivePotentialSynapses`.
+    /// The total is additionally limited by `max_synapses_per_segment`.
+    fn grow_synapses_on_segment(
+        &mut self,
+        seg_i: SegmentIdx,
+        prev_winner_cells: &[bool],
+    ) {
+        let initial_perm = self.cfg.initial_permanence;
+        let cap = self.cfg.max_synapses_per_segment;
+        let max_new = self.cfg.max_new_synapse_count;
+        let seg = &self.segments[seg_i as usize];
+        // Growth budget: what the sample size still allows, bounded by the
+        // free slots left on the segment.
+        let desired = max_new.saturating_sub(seg.num_active_potential as usize);
+        let room = cap.saturating_sub(seg.synapses.len());
+        let budget = desired.min(room);
+        if budget == 0 {
+            // Nothing to grow: skip the candidate scan and leave the RNG untouched.
+            return;
+        }
+        // Gather candidate cells (prev winners not already presynaptic to this seg),
+        // in ascending cell order so sampling is reproducible for a given seed.
+        let already: Vec<CellIdx> = seg.synapses.iter().map(|s| s.presynaptic_cell).collect();
+        let mut candidates: Vec<CellIdx> = Vec::new();
+        for (cell_i, &b) in prev_winner_cells.iter().enumerate() {
+            if b && !already.contains(&(cell_i as CellIdx)) {
+                candidates.push(cell_i as CellIdx);
+            }
+        }
+        let mut to_add = budget.min(candidates.len());
+        // Random sample without replacement from candidates.
+        while to_add > 0 {
+            let idx = self.rng.gen_range(0..candidates.len());
+            let pre = candidates.swap_remove(idx);
+            self.segments[seg_i as usize].synapses.push(Synapse {
+                presynaptic_cell: pre,
+                permanence: initial_perm,
+            });
+            to_add -= 1;
+        }
+    }
+    /// Return a blank segment on `cell`.
+    ///
+    /// While the cell holds fewer than `max_segments_per_cell` segments a new
+    /// one is appended. Otherwise the cell's least-recently-used segment (the
+    /// first one with the smallest `last_used_iteration`) is wiped and its
+    /// index is handed back, so the per-cell cap is never exceeded.
+    fn create_segment(&mut self, cell: CellIdx) -> SegmentIdx {
+        let owner = cell as usize;
+        let now = self.iter_count;
+        if self.cell_segments[owner].len() < self.cfg.max_segments_per_cell {
+            let fresh_id = self.segments.len() as SegmentIdx;
+            self.segments.push(Segment {
+                cell,
+                synapses: Vec::with_capacity(self.cfg.max_new_synapse_count),
+                num_active_connected: 0,
+                num_active_potential: 0,
+                last_used_iteration: now,
+            });
+            self.cell_segments[owner].push(fresh_id);
+            return fresh_id;
+        }
+        // Cap reached: recycle the least-recently-used segment in place. Its
+        // entry in cell_segments stays where it is.
+        let victim = self.cell_segments[owner]
+            .iter()
+            .copied()
+            .min_by_key(|&sid| self.segments[sid as usize].last_used_iteration)
+            .expect("cell_segs non-empty");
+        let recycled = &mut self.segments[victim as usize];
+        recycled.synapses.clear();
+        recycled.num_active_connected = 0;
+        recycled.num_active_potential = 0;
+        recycled.last_used_iteration = now;
+        victim
+    }
+    /// Pick the cell of column `col` that owns the fewest segments; when
+    /// several cells tie, one of them is drawn uniformly with the region RNG.
+    fn least_used_cell(&mut self, col: usize) -> CellIdx {
+        // Smallest segment count found in the column.
+        let fewest = self
+            .cells_in_col(col)
+            .map(|c| self.cell_segments[c as usize].len())
+            .min()
+            .unwrap_or(usize::MAX);
+        // Every cell that attains it, in ascending cell order.
+        let tied: Vec<CellIdx> = self
+            .cells_in_col(col)
+            .filter(|&c| self.cell_segments[c as usize].len() == fewest)
+            .collect();
+        let pick = self.rng.gen_range(0..tied.len());
+        tied[pick]
+    }
+}
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::sp::{SpatialPooler, SpatialPoolerConfig};
+    use rand::Rng;
+    use rand::SeedableRng;
+    use rand_xoshiro::Xoshiro256PlusPlus;
+    /// Feeding the cycle A -> B -> C -> A -> ... through a warmed-up Spatial
+    /// Pooler must make the Temporal Memory's anomaly score drop: the mean
+    /// anomaly in the last pass has to be below half of the mean in pass 10.
+    #[test]
+    fn tm_learns_repeating_sequence() {
+        let mut sp = SpatialPooler::new(SpatialPoolerConfig::default(), 123);
+        let mut tm = TemporalMemory::new(TemporalMemoryConfig::default(), 456);
+        // Three fixed random input SDRs with 2% of the bits set.
+        let mut rng = Xoshiro256PlusPlus::seed_from_u64(99);
+        let input_bits = sp.cfg.input_bits;
+        let make_sdr = |rng: &mut Xoshiro256PlusPlus| {
+            let wanted = (0.02 * input_bits as f32) as usize;
+            let mut bits = vec![false; input_bits];
+            let mut set_so_far = 0;
+            while set_so_far < wanted {
+                let pos = rng.gen_range(0..input_bits);
+                if bits[pos] {
+                    continue;
+                }
+                bits[pos] = true;
+                set_so_far += 1;
+            }
+            bits
+        };
+        let sym_a = make_sdr(&mut rng);
+        let sym_b = make_sdr(&mut rng);
+        let sym_c = make_sdr(&mut rng);
+        let seqs = [sym_a, sym_b, sym_c];
+        // Let the SP settle first so each symbol maps to stable columns.
+        for _ in 0..200 {
+            for s in &seqs {
+                sp.compute(s, true);
+            }
+        }
+        // Start the TM from a clean prediction state.
+        tm.reset();
+        // Collect anomaly scores for one early pass and for the final pass.
+        let mut early_anoms: Vec<f32> = Vec::new();
+        let mut late_anoms: Vec<f32> = Vec::new();
+        for iter in 0..250 {
+            for s in &seqs {
+                let active = sp.compute(s, false);
+                let anomaly = tm.compute(&active, true);
+                match iter {
+                    10 => early_anoms.push(anomaly),
+                    249 => late_anoms.push(anomaly),
+                    _ => {}
+                }
+            }
+        }
+        fn mean(v: &[f32]) -> f32 {
+            v.iter().sum::<f32>() / (v.len() as f32)
+        }
+        let early = mean(&early_anoms);
+        let late = mean(&late_anoms);
+        println!("early_anomaly={early}, late_anomaly={late}");
+        assert!(
+            late < 0.5 * early + 1e-6,
+            "late anomaly ({late}) should be < 0.5 * early anomaly ({early})"
+        );
+    }
+}

overlay/hydra/__init__.py CHANGED Viewed

@@ -1,31 +1,37 @@
-"""HYDRA training package.
-Thin facade re-exporting the public API used by train.py, the test suite,
-and external research scripts. Imports are lazy where possible to keep
-`import hydra` cheap (prepare.py and mamba-ssm are the heavy deps).
-"""
-from hydra.config import PostSemClawConfig
-from hydra.engram import GPUEngram
-from hydra.model import PostSemClawModel, norm
-from hydra.optimizer import MuonAdamW, adamw_step_fused, muon_step_fused
-# config_from_dict is imported lazily (via attribute access on hydra.training)
-# to keep `import hydra` cheap; re-export here for convenience.
-def __getattr__(name: str):
-    if name == "config_from_dict":
-        from hydra.training import config_from_dict as _cfd
-        return _cfd
-    raise AttributeError(name)
-__all__ = [
-    "PostSemClawConfig",
-    "GPUEngram",
-    "PostSemClawModel",
-    "norm",
-    "MuonAdamW",
-    "adamw_step_fused",
-    "muon_step_fused",
-    "config_from_dict",
-]

+"""HYDRA training package.
+Thin facade re-exporting the public API used by train.py, the test suite,
+and external research scripts. Imports are lazy where possible to keep
+`import hydra` cheap (prepare.py and mamba-ssm are the heavy deps).
+"""
+from hydra.config import PostSemClawConfig
+from hydra.engram import GPUEngram
+from hydra.optimizer import MuonAdamW, adamw_step_fused, muon_step_fused
+# Heavy imports are resolved lazily so `import hydra` and `import hydra.hyena_block`
+# keep working in local CPU/test environments that do not have the container-only
+# mamba-ssm wheel stack installed.
+def __getattr__(name: str):
+    if name == "PostSemClawModel":
+        from hydra.model import PostSemClawModel as _model
+        return _model
+    if name == "norm":
+        from hydra.model import norm as _norm
+        return _norm
+    if name == "config_from_dict":
+        from hydra.training import config_from_dict as _cfd
+        return _cfd
+    raise AttributeError(name)
+# Public API of the package. "PostSemClawModel", "norm" and "config_from_dict"
+# are not bound at import time; they are resolved on demand by the module-level
+# __getattr__ defined in this file.
+__all__ = [
+    "PostSemClawConfig",
+    "GPUEngram",
+    "PostSemClawModel",
+    "norm",
+    "MuonAdamW",
+    "adamw_step_fused",
+    "muon_step_fused",
+    "config_from_dict",
+]

overlay/hydra/config.py CHANGED Viewed

@@ -1,220 +1,225 @@
-"""HYDRA training configuration — dataclass + env-var constants.
-Extracted from the monolithic train.py as part of W1 modularization. All
-env-var reads and the PostSemClawConfig dataclass live here. The training
-body imports these constants; zero behavior change from the extraction.
-"""
-from __future__ import annotations
-import os
-from dataclasses import dataclass, field
-def _parse_hyena_layers_env() -> tuple[int, ...]:
-    """Parse HYDRA_HYENA_LAYERS env var into a sorted tuple of layer indices.
-    Used as the default_factory for PostSemClawConfig.hyena_layers so a fresh
-    config construction reads the current env var, but once constructed the
-    value is first-class and travels with checkpoints (see asdict(config) in
-    save_ckpt). Ckpt-load sets the dataclass field explicitly, overriding the
-    env-var default.
-    Returns empty tuple when env var is unset/empty (byte-identical to
-    pre-port behavior: no Hyena layers).
-    """
-    raw = os.environ.get("HYDRA_HYENA_LAYERS", "")
-    if not raw:
-        return ()
-    return tuple(sorted({int(s.strip()) for s in raw.split(",") if s.strip()}))
-def _parse_gdn_layers_env() -> tuple[int, ...]:
-    """Parse HYDRA_GDN_LAYERS env var into a sorted tuple of layer indices.
-    Same contract as _parse_hyena_layers_env: layers whose index is listed
-    here use GatedDeltaNet (fla.layers.GatedDeltaNet) as a drop-in
-    replacement for Mamba3. Empty tuple = no GDN layers (byte-identical
-    to baseline).
-    """
-    raw = os.environ.get("HYDRA_GDN_LAYERS", "")
-    if not raw:
-        return ()
-    return tuple(sorted({int(s.strip()) for s in raw.split(",") if s.strip()}))
-# ---------------------------------------------------------------------------
-# CUDA env — set before importing torch in entry point. Kept here so any
-# module that `from hydra.config import ...` also benefits (import order is
-# top-down in Python, and train.py used to set these at module top).
-# ---------------------------------------------------------------------------
-os.environ.setdefault("CUDA_HOME", "/usr/local/cuda")
-if "/usr/local/cuda/bin" not in os.environ.get("PATH", ""):
-    os.environ["PATH"] = "/usr/local/cuda/bin:" + os.environ.get("PATH", "")
-os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True")
-# ---------------------------------------------------------------------------
-# Model Configuration
-# ---------------------------------------------------------------------------
-@dataclass
-class PostSemClawConfig:
-    """Full-architecture model config. Defaults reflect Phase-1 baseline;
-    the training entry overrides d_model/n_layer/etc. from env vars."""
-    # Sequence
-    sequence_len: int = 2048
-    vocab_size: int = 8192  # Must match prepare.py VOCAB_SIZE
-    # Mamba-3 SSM
-    n_layer: int = 6
-    d_model: int = 384
-    d_state: int = 64       # SSM state dimension
-    headdim: int = 48       # head dimension for SSM
-    n_heads: int = 8        # d_model // headdim
-    expand: int = 2         # inner_dim = expand * d_model
-    # Engram (conditional memory with Hebbian writes)
-    engram_n_columns: int = 4096
-    engram_key_dim: int = 64
-    engram_layer_idx: int = 1  # which layer gets engram (0-indexed, mid-layer)
-    # SemanticFoldingSDR (offline retina with STE; no-bypass, runs every step)
-    sdr_n_bits: int = 16384          # retina width
-    # Default 327 = 2% sparsity (Webber/Numenta canonical). Override with
-    # HYDRA_SDR_TARGET_ACTIVE env var; value MUST match subsystems/sdr_retina.py
-    # TARGET_ACTIVE (same env var is read there, so just setting it once works).
-    sdr_target_active: int = int(os.environ.get("HYDRA_SDR_TARGET_ACTIVE", "327"))
-    sdr_delta_rank: int = 32         # low-rank STE delta rank
-    sdr_som_warmup: int = 500
-    sdr_som_interval: int = 100
-    # HTMLayer (Rust-backed, Hebbian; no-bypass, runs every step)
-    htm_n_columns: int = 2048
-    htm_cells_per_column: int = 32
-    # Hyena supplement layer indices (sorted tuple). Defaults to the
-    # HYDRA_HYENA_LAYERS env var at config-construction time, but once
-    # persisted in a checkpoint the value is first-class and survives even
-    # when the env var is unset at resume time. This fixes the ckpt-reload
-    # crash path where a model trained with `HYDRA_HYENA_LAYERS=3,7` saves
-    # HyenaBlock params but a fresh process without the env var would try
-    # to build a pure-Mamba3 architecture and reject the state_dict as
-    # `Missing/Unexpected key(s)`.
-    hyena_layers: tuple[int, ...] = field(default_factory=_parse_hyena_layers_env)
-    # GatedDeltaNet supplement layer indices (sorted tuple). Same semantics
-    # as hyena_layers — a layer index listed here uses GDNBlock (fla-backed
-    # Gated DeltaNet) instead of Mamba3. Selections are mutually exclusive
-    # with hyena_layers at construction time (hyena wins on overlap; the
-    # model loop checks hyena first).
-    gdn_layers: tuple[int, ...] = field(default_factory=_parse_gdn_layers_env)
-    # Label smoothing + Z-loss
-    label_smoothing: float = 0.0   # disabled: any smoothing hurts in 5-min budget
-    z_loss_weight: float = 1e-4
-# ---------------------------------------------------------------------------
-# Hyperparameters (autoresearch agent modifies these via env vars)
-# ---------------------------------------------------------------------------
-# Model architecture
-D_MODEL = int(os.environ.get("HYDRA_D_MODEL", "256"))
-N_LAYER = int(os.environ.get("HYDRA_N_LAYER", "4"))
-D_STATE = int(os.environ.get("HYDRA_D_STATE", "64"))
-HEADDIM = int(os.environ.get("HYDRA_HEADDIM", "32"))
-N_HEADS = D_MODEL // HEADDIM
-EXPAND = int(os.environ.get("HYDRA_EXPAND", "2"))
-# Engram
-ENGRAM_N_COLUMNS = int(os.environ.get("HYDRA_ENGRAM_N_COLUMNS", "1024"))
-ENGRAM_KEY_DIM = 64
-ENGRAM_LAYER_IDX = int(os.environ.get("HYDRA_ENGRAM_LAYER_IDX", "1"))
-# Optimization
-DEVICE_BATCH_SIZE = int(os.environ.get("HYDRA_BATCH_SIZE", "1"))
-TOTAL_BATCH_SIZE = int(os.environ.get("HYDRA_TOTAL_BATCH", "32768"))
-MATRIX_LR = float(os.environ.get("HYDRA_MATRIX_LR", "0.12"))
-EMBEDDING_LR = float(os.environ.get("HYDRA_EMBED_LR", "1.0"))
-UNEMBEDDING_LR = float(os.environ.get("HYDRA_UNEMBED_LR", "0.005"))
-SCALAR_LR = 0.5
-WEIGHT_DECAY = 0.01
-ADAM_BETAS = (0.9, 0.95)
-WARMUP_RATIO = 0.0
-WARMDOWN_RATIO = 0.5
-FINAL_LR_FRAC = float(os.environ.get("HYDRA_LR_MIN_MULT", "0.0"))
-# Runtime
-SEED = int(os.environ.get("HYDRA_SEED", "42"))
-# BF16 TFLOPS peak (RTX 3060=25.5, A100 SXM4=312, H100 SXM5=989)
-GPU_BF16_PEAK_FLOPS = float(os.environ.get("HYDRA_GPU_BF16_TFLOPS", "25.5")) * 1e12
-# Loss / inference knobs read by the model
-CE_CHUNK = int(os.environ.get("HYDRA_CE_CHUNK", "1024"))
-DROPOUT = float(os.environ.get("HYDRA_DROPOUT", "0.2"))
-FUSED_ADAMW = os.environ.get("HYDRA_FUSED_ADAMW", "1") == "1"
-# ---------------------------------------------------------------------------
-# Learnability knobs (all OFF by default — zero behavior change unless set)
-# ---------------------------------------------------------------------------
-# 1) Multi-Token Prediction (Llama-3 style). K=1 disables (next-1 only). K=4
-#    adds 3 extra weight-tied heads; loss = mean of K position-shifted CEs.
-MTP_K = int(os.environ.get("HYDRA_MTP_K", "1"))
-# 2) Exponential Moving Average of model weights (decay=0.999). Saves an
-#    additional latest_ema.pt at the end of training.
-USE_EMA = os.environ.get("HYDRA_USE_EMA", "0") == "1"
-EMA_DECAY = float(os.environ.get("HYDRA_EMA_DECAY", "0.999"))
-# 3) Gradient checkpointing on Mamba3 block forward. Trades ~30% compute for
-#    ~40% activation memory savings — lets you push B upward on a 3060.
-GRAD_CKPT = os.environ.get("HYDRA_GRAD_CKPT", "0") == "1"
-# 4) Doc-separator masking in packed sequences: at every packed-BOS position
-#    in the targets tensor, mask the loss (ignore_index=-1) so the model is
-#    not forced to predict doc B from doc A's context.
-DOC_SEP_MASK = os.environ.get("HYDRA_DOC_SEP_MASK", "0") == "1"
-# 5) Stop-gradient on HTM state (belt-and-braces: htm_rust already runs under
-#    torch.no_grad() so the tensor returned has requires_grad=False; this
-#    simply detaches explicitly to harden graph hygiene against future refactors).
-HTM_STOP_GRAD = os.environ.get("HYDRA_HTM_STOP_GRAD", "0") == "1"
-# 6) Output entropy penalty: loss += -lambda * H(softmax(logits)). Negative
-#    entropy penalizes peaked distributions and breaks repetition loops.
-ENTROPY_PENALTY = float(os.environ.get("HYDRA_ENTROPY_PENALTY", "0.0"))
-# 7) Curriculum: first N optimizer steps use short seq_len, then switch to
-#    full. 0 disables (no curriculum).
-CURRICULUM_SHORT_STEPS = int(os.environ.get("HYDRA_CURRICULUM_SHORT_STEPS", "0"))
-CURRICULUM_SHORT_SEQ_LEN = int(os.environ.get("HYDRA_CURRICULUM_SHORT_SEQ_LEN", "256"))
-# ---------------------------------------------------------------------------
-# Hyena supplement (additional block type for selected layer indices).
-# Hyena replaces Mamba3 at the specified layer indices while all other layers
-# remain Mamba3. Empty string (default) → no Hyena layers, byte-identical to
-# pre-port behavior.
-#   HYDRA_HYENA_LAYERS       "3,7"  — comma-separated 0-indexed layer ids
-#   HYDRA_HYENA_ORDER         2     — Hyena recurrence order (>= 2)
-#   HYDRA_HYENA_FILTER_DIM    64    — implicit-filter MLP hidden width
-# Hyena reference: https://arxiv.org/pdf/2302.10866.pdf (HazyResearch/safari).
-# ---------------------------------------------------------------------------
-HYENA_LAYERS = os.environ.get("HYDRA_HYENA_LAYERS", "")
-HYENA_ORDER = int(os.environ.get("HYDRA_HYENA_ORDER", "2"))
-HYENA_FILTER_DIM = int(os.environ.get("HYDRA_HYENA_FILTER_DIM", "64"))
-# Filter-rfft cache modes (see subsystems/hyena_pure.py):
-#   HYDRA_HYENA_FILTER_CACHE=1 — eval-only cache. Safe under torch.no_grad()
-#       where PyTorch never saves intermediate tensors. Off by default.
-#   HYDRA_HYENA_TRAIN_CACHE=1  — training-safe cache using a deferred
-#       gradient pattern. Cuts the implicit filter MLP forward to ONCE per
-#       optimizer step regardless of grad-accumulation factor. Requires the
-#       training loop (see hydra/lightning_module.py::optimizer_step) to
-#       call `model.flush_hyena_pending_grads()` before optimizer.step().
-#       Off by default.
-HYENA_FILTER_CACHE = os.environ.get("HYDRA_HYENA_FILTER_CACHE", "0") == "1"
-HYENA_TRAIN_CACHE = os.environ.get("HYDRA_HYENA_TRAIN_CACHE", "0") == "1"
-# Factual eval knobs
-FACTUAL_SAMPLES = int(os.environ.get("HYDRA_FACTUAL_SAMPLES", "3"))
-FACTUAL_BATCH = int(os.environ.get("HYDRA_FACTUAL_BATCH", "32"))
-# F6 (partial): Full incremental SSM decode integration deferred — would require
-# threading mamba_ssm InferenceParams through PostSemClawModel.forward and all
-# auxiliary subsystems (HTM, SDR, Engram) which currently run full-sequence each
-# call. As a stopgap we reduce default from 16 -> 4 so the per-prompt cost is
-# quartered (each gen-tok does a full re-encode of ctx+k tokens). Override with
-# HYDRA_FACTUAL_GEN_TOKENS to restore prior behavior. See docs/OPTIMIZATION_PLAN.md.
-FACTUAL_GEN_TOKENS = int(os.environ.get("HYDRA_FACTUAL_GEN_TOKENS", "2"))

+"""HYDRA training configuration — dataclass + env-var constants.
+Extracted from the monolithic train.py as part of W1 modularization. All
+env-var reads and the PostSemClawConfig dataclass live here. The training
+body imports these constants; zero behavior change from the extraction.
+"""
+from __future__ import annotations
+import os
+from dataclasses import dataclass, field
+def _parse_hyena_layers_env() -> tuple[int, ...]:
+    """Parse HYDRA_HYENA_LAYERS env var into a sorted tuple of layer indices.
+    Used as the default_factory for PostSemClawConfig.hyena_layers so a fresh
+    config construction reads the current env var, but once constructed the
+    value is first-class and travels with checkpoints (see asdict(config) in
+    save_ckpt). Ckpt-load sets the dataclass field explicitly, overriding the
+    env-var default.
+    Returns empty tuple when env var is unset/empty (byte-identical to
+    pre-port behavior: no Hyena layers).
+    """
+    raw = os.environ.get("HYDRA_HYENA_LAYERS", "")
+    if not raw:
+        return ()
+    return tuple(sorted({int(s.strip()) for s in raw.split(",") if s.strip()}))
+def _parse_gdn_layers_env() -> tuple[int, ...]:
+    """Parse HYDRA_GDN_LAYERS env var into a sorted tuple of layer indices.
+    Same contract as _parse_hyena_layers_env: layers whose index is listed
+    here use GatedDeltaNet (fla.layers.GatedDeltaNet) as a drop-in
+    replacement for Mamba3. Empty tuple = no GDN layers (byte-identical
+    to baseline).
+    """
+    raw = os.environ.get("HYDRA_GDN_LAYERS", "")
+    if not raw:
+        return ()
+    return tuple(sorted({int(s.strip()) for s in raw.split(",") if s.strip()}))
+# ---------------------------------------------------------------------------
+# CUDA env — set before importing torch in entry point. Kept here so any
+# module that `from hydra.config import ...` also benefits (import order is
+# top-down in Python, and train.py used to set these at module top).
+# ---------------------------------------------------------------------------
+os.environ.setdefault("CUDA_HOME", "/usr/local/cuda")
+if "/usr/local/cuda/bin" not in os.environ.get("PATH", ""):
+    os.environ["PATH"] = "/usr/local/cuda/bin:" + os.environ.get("PATH", "")
+os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True")
+# ---------------------------------------------------------------------------
+# Model Configuration
+# ---------------------------------------------------------------------------
+@dataclass
+class PostSemClawConfig:
+    """Full-architecture model config. Defaults reflect Phase-1 baseline;
+    the training entry overrides d_model/n_layer/etc. from env vars."""
+    # Sequence
+    sequence_len: int = 2048
+    vocab_size: int = 8192  # Must match prepare.py VOCAB_SIZE
+    # Mamba-3 SSM
+    n_layer: int = 6
+    d_model: int = 384
+    d_state: int = 64       # SSM state dimension
+    headdim: int = 48       # head dimension for SSM
+    n_heads: int = 8        # d_model // headdim
+    expand: int = 2         # inner_dim = expand * d_model
+    # Engram (conditional memory with Hebbian writes)
+    engram_n_columns: int = 4096
+    engram_key_dim: int = 64
+    engram_layer_idx: int = 1  # which layer gets engram (0-indexed, mid-layer)
+    # SemanticFoldingSDR (offline retina with STE; no-bypass, runs every step)
+    sdr_n_bits: int = 16384          # retina width
+    # Default 327 = 2% sparsity (Webber/Numenta canonical). Override with
+    # HYDRA_SDR_TARGET_ACTIVE env var; value MUST match subsystems/sdr_retina.py
+    # TARGET_ACTIVE (same env var is read there, so just setting it once works).
+    sdr_target_active: int = int(os.environ.get("HYDRA_SDR_TARGET_ACTIVE", "327"))
+    sdr_delta_rank: int = 32         # low-rank STE delta rank
+    sdr_som_warmup: int = 500
+    sdr_som_interval: int = 100
+    # HTMLayer (Rust-backed, Hebbian; no-bypass, runs every step)
+    htm_n_columns: int = 2048
+    htm_cells_per_column: int = 32
+    # Hyena supplement layer indices (sorted tuple). Defaults to the
+    # HYDRA_HYENA_LAYERS env var at config-construction time, but once
+    # persisted in a checkpoint the value is first-class and survives even
+    # when the env var is unset at resume time. This fixes the ckpt-reload
+    # crash path where a model trained with `HYDRA_HYENA_LAYERS=3,7` saves
+    # HyenaBlock params but a fresh process without the env var would try
+    # to build a pure-Mamba3 architecture and reject the state_dict as
+    # `Missing/Unexpected key(s)`.
+    hyena_layers: tuple[int, ...] = field(default_factory=_parse_hyena_layers_env)
+    # GatedDeltaNet supplement layer indices (sorted tuple). Same semantics
+    # as hyena_layers — a layer index listed here uses GDNBlock (fla-backed
+    # Gated DeltaNet) instead of Mamba3. Selections are mutually exclusive
+    # with hyena_layers at construction time (hyena wins on overlap; the
+    # model loop checks hyena first).
+    gdn_layers: tuple[int, ...] = field(default_factory=_parse_gdn_layers_env)
+    # Label smoothing + Z-loss
+    label_smoothing: float = field(default_factory=lambda: float(os.environ.get("HYDRA_LABEL_SMOOTHING", "0.0")))
+    z_loss_weight: float = field(default_factory=lambda: float(os.environ.get("HYDRA_Z_LOSS_WEIGHT", "1e-4")))
+# ---------------------------------------------------------------------------
+# Hyperparameters (autoresearch agent modifies these via env vars)
+# ---------------------------------------------------------------------------
+# Model architecture
+D_MODEL = int(os.environ.get("HYDRA_D_MODEL", "256"))
+N_LAYER = int(os.environ.get("HYDRA_N_LAYER", "4"))
+D_STATE = int(os.environ.get("HYDRA_D_STATE", "64"))
+HEADDIM = int(os.environ.get("HYDRA_HEADDIM", "32"))
+N_HEADS = D_MODEL // HEADDIM
+EXPAND = int(os.environ.get("HYDRA_EXPAND", "2"))
+# Engram
+ENGRAM_N_COLUMNS = int(os.environ.get("HYDRA_ENGRAM_N_COLUMNS", "1024"))
+ENGRAM_KEY_DIM = 64
+ENGRAM_LAYER_IDX = int(os.environ.get("HYDRA_ENGRAM_LAYER_IDX", "1"))
+# Optimization
+DEVICE_BATCH_SIZE = int(os.environ.get("HYDRA_BATCH_SIZE", "1"))
+TOTAL_BATCH_SIZE = int(os.environ.get("HYDRA_TOTAL_BATCH", "32768"))
+MATRIX_LR = float(os.environ.get("HYDRA_MATRIX_LR", "0.12"))
+EMBEDDING_LR = float(os.environ.get("HYDRA_EMBED_LR", "1.0"))
+UNEMBEDDING_LR = float(os.environ.get("HYDRA_UNEMBED_LR", "0.005"))
+# Scalar/vector params include Hyena implicit-filter vectors, norms, gate/bias
+# terms, and SDR delta_u/delta_v.  They are AdamW-scaled by d_model and can be
+# the hidden instability path when the high-throughput HF recipe pushes a large
+# device batch for hours.  Keep the historical default, but make it controllable
+# from launch scripts so cloud jobs can cool scalars without editing code.
+SCALAR_LR = float(os.environ.get("HYDRA_SCALAR_LR", "0.5"))
+WEIGHT_DECAY = float(os.environ.get("HYDRA_WEIGHT_DECAY", "0.01"))
+ADAM_BETAS = (0.9, 0.95)
+WARMUP_RATIO = float(os.environ.get("HYDRA_WARMUP_RATIO", "0.0"))
+WARMDOWN_RATIO = 0.5
+FINAL_LR_FRAC = float(os.environ.get("HYDRA_LR_MIN_MULT", "0.0"))
+# Runtime
+SEED = int(os.environ.get("HYDRA_SEED", "42"))
+# BF16 TFLOPS peak (RTX 3060=25.5, A100 SXM4=312, H100 SXM5=989)
+GPU_BF16_PEAK_FLOPS = float(os.environ.get("HYDRA_GPU_BF16_TFLOPS", "25.5")) * 1e12
+# Loss / inference knobs read by the model
+CE_CHUNK = int(os.environ.get("HYDRA_CE_CHUNK", "1024"))
+DROPOUT = float(os.environ.get("HYDRA_DROPOUT", "0.2"))
+FUSED_ADAMW = os.environ.get("HYDRA_FUSED_ADAMW", "1") == "1"
+# ---------------------------------------------------------------------------
+# Learnability knobs (all OFF by default — zero behavior change unless set)
+# ---------------------------------------------------------------------------
+# 1) Multi-Token Prediction (Llama-3 style). K=1 disables (next-1 only). K=4
+#    adds 3 extra weight-tied heads; loss = mean of K position-shifted CEs.
+MTP_K = int(os.environ.get("HYDRA_MTP_K", "1"))
+# 2) Exponential Moving Average of model weights (decay=0.999). Saves an
+#    additional latest_ema.pt at the end of training.
+USE_EMA = os.environ.get("HYDRA_USE_EMA", "0") == "1"
+EMA_DECAY = float(os.environ.get("HYDRA_EMA_DECAY", "0.999"))
+# 3) Gradient checkpointing on Mamba3 block forward. Trades ~30% compute for
+#    ~40% activation memory savings — lets you push B upward on a 3060.
+GRAD_CKPT = os.environ.get("HYDRA_GRAD_CKPT", "0") == "1"
+# 4) Doc-separator masking in packed sequences: at every packed-BOS position
+#    in the targets tensor, mask the loss (ignore_index=-1) so the model is
+#    not forced to predict doc B from doc A's context.
+DOC_SEP_MASK = os.environ.get("HYDRA_DOC_SEP_MASK", "0") == "1"
+# 5) Stop-gradient on HTM state (belt-and-braces: htm_rust already runs under
+#    torch.no_grad() so the tensor returned has requires_grad=False; this
+#    simply detaches explicitly to harden graph hygiene against future refactors).
+HTM_STOP_GRAD = os.environ.get("HYDRA_HTM_STOP_GRAD", "0") == "1"
+# 6) Output entropy penalty: loss += -lambda * H(softmax(logits)). Negative
+#    entropy penalizes peaked distributions and breaks repetition loops.
+ENTROPY_PENALTY = float(os.environ.get("HYDRA_ENTROPY_PENALTY", "0.0"))
+# 7) Curriculum: first N optimizer steps use short seq_len, then switch to
+#    full. 0 disables (no curriculum).
+CURRICULUM_SHORT_STEPS = int(os.environ.get("HYDRA_CURRICULUM_SHORT_STEPS", "0"))
+CURRICULUM_SHORT_SEQ_LEN = int(os.environ.get("HYDRA_CURRICULUM_SHORT_SEQ_LEN", "256"))
+# ---------------------------------------------------------------------------
+# Hyena supplement (additional block type for selected layer indices).
+# Hyena replaces Mamba3 at the specified layer indices while all other layers
+# remain Mamba3. Empty string (default) → no Hyena layers, byte-identical to
+# pre-port behavior.
+#   HYDRA_HYENA_LAYERS       "3,7"  — comma-separated 0-indexed layer ids
+#   HYDRA_HYENA_ORDER         2     — Hyena recurrence order (>= 2)
+#   HYDRA_HYENA_FILTER_DIM    64    — implicit-filter MLP hidden width
+# Hyena reference: https://arxiv.org/pdf/2302.10866.pdf (HazyResearch/safari).
+# ---------------------------------------------------------------------------
+HYENA_LAYERS = os.environ.get("HYDRA_HYENA_LAYERS", "")
+HYENA_ORDER = int(os.environ.get("HYDRA_HYENA_ORDER", "2"))
+HYENA_FILTER_DIM = int(os.environ.get("HYDRA_HYENA_FILTER_DIM", "64"))
+# Filter-rfft cache modes (see subsystems/hyena_pure.py):
+#   HYDRA_HYENA_FILTER_CACHE=1 — eval-only cache. Safe under torch.no_grad()
+#       where PyTorch never saves intermediate tensors. Off by default.
+#   HYDRA_HYENA_TRAIN_CACHE=1  — training-safe cache using a deferred
+#       gradient pattern. Cuts the implicit filter MLP forward to ONCE per
+#       optimizer step regardless of grad-accumulation factor. Requires the
+#       training loop (see hydra/lightning_module.py::optimizer_step) to
+#       call `model.flush_hyena_pending_grads()` before optimizer.step().
+#       Off by default.
+HYENA_FILTER_CACHE = os.environ.get("HYDRA_HYENA_FILTER_CACHE", "0") == "1"
+HYENA_TRAIN_CACHE = os.environ.get("HYDRA_HYENA_TRAIN_CACHE", "0") == "1"
+# Factual eval knobs
+FACTUAL_SAMPLES = int(os.environ.get("HYDRA_FACTUAL_SAMPLES", "3"))
+FACTUAL_BATCH = int(os.environ.get("HYDRA_FACTUAL_BATCH", "32"))
+# F6 (partial): Full incremental SSM decode integration deferred — would require
+# threading mamba_ssm InferenceParams through PostSemClawModel.forward and all
+# auxiliary subsystems (HTM, SDR, Engram) which currently run full-sequence each
+# call. As a stopgap we reduce default from 16 -> 2 so the per-prompt cost
+# drops about 8x (each gen-tok does a full re-encode of ctx+k tokens). Override with
+# HYDRA_FACTUAL_GEN_TOKENS to restore prior behavior. See docs/OPTIMIZATION_PLAN.md.
+FACTUAL_GEN_TOKENS = int(os.environ.get("HYDRA_FACTUAL_GEN_TOKENS", "2"))

overlay/hydra/data_module.py CHANGED Viewed

@@ -1,288 +1,288 @@
-"""Lightning DataModule + IterableDataset for HYDRA pretraining.
-Replaces the custom threading/queue pipeline in prepare_nemotron.make_dataloader
-with a standard multiprocessing DataLoader approach.
-Design:
-  • IterableStreamDataset: each worker opens its own HF streams for the 7-way
-    blend, tokenizes with rustbpe, packs into (T+1,) rows via best-fit, and
-    yields one row per __next__.
-  • HydraDataModule: wraps the dataset with a standard DataLoader using
-    num_workers>=1, prefetch_factor=4, pin_memory=True. Lightning handles
-    device transfer.
-  • Val stream: deterministic seed 12345, weights match training blend.
-The worker RNG is seeded per-worker so the weighted-sampling schedule is
-independent across workers (else all workers request the same config at
-the same step and prefetching serializes).
-Env vars (all preserved from prepare_nemotron):
-  HYDRA_SEQ_LEN                  — sequence length T (default 512)
-  HYDRA_BATCH_SIZE               — batch size B (default 1) — passed through
-                                    to DataLoader
-  HYDRA_STREAM_SHUFFLE_BUFFER    — HF shuffle buffer (default 2048)
-  HYDRA_USE_FULL_BLEND           — 7-way blend vs 5-way Nemotron phase
-  HYDRA_USE_NEMOTRON             — enables streaming path (else shard path)
-  HYDRA_FACTUAL_INJECT_RATE      — factual doc injection cadence
-  HYDRA_NEMOTRON_PHASE           — phase1|phase2 (when not full blend)
-  HYDRA_DATA_NUM_WORKERS         — DataLoader num_workers (default 2)
-  HYDRA_DATA_PREFETCH            — DataLoader prefetch_factor (default 4)
-  HYDRA_DATA_BUFFER              — doc_buffer size for best-fit packing
-                                    (default 1000)
-"""
-from __future__ import annotations
-import os
-import random
-from typing import Iterator
-import numpy as np
-import torch
-import lightning as L
-from torch.utils.data import DataLoader, IterableDataset, get_worker_info
-import prepare as _prepare
-import prepare_nemotron as _p_nemo
-from prepare_nemotron import (
-    FULL_BLEND_WEIGHTS,
-    PHASE1_WEIGHTS,
-    PHASE2_WEIGHTS,
-    _BLEND_REGISTRY,
-    _extract_text,
-    _open_stream,
-)
-# ---------------------------------------------------------------------------
-# Worker-local weighted stream. A stripped version of prepare_nemotron's
-# _WeightedStream that is constructed inside each worker. Adds worker sharding:
-# when num_workers > 1 the RNG is seeded per-worker, so different workers
-# sample different config sequences and pull disjoint shard assignments from
-# HF's shuffle buffer.
-# ---------------------------------------------------------------------------
-class _WorkerWeightedStream:
-    def __init__(self, weights: dict[str, float], base_seed: int, worker_id: int):
-        self.configs = list(weights.keys())
-        self.weights = [weights[c] for c in self.configs]
-        self.base_seed = base_seed
-        self.worker_id = worker_id
-        # Each worker opens its own HF streams. _open_stream returns an iter()
-        # over a streaming dataset, with an internal shuffle buffer.
-        self.streams = {c: _open_stream(c, "train") for c in self.configs}
-        # Per-worker RNG so the config-choice trajectory is independent.
-        self.rng = random.Random(base_seed + worker_id * 7919)
-        self.epoch = 1
-        # Lazy-init factual docs (once per worker). The main-process version
-        # in prepare_nemotron._WeightedStream reads these on first __next__.
-        self._factual_docs: list[str] | None = None
-        self._factual_idx = 0
-        self._inject_counter = 0
-        inject_rate = int(os.environ.get("HYDRA_FACTUAL_INJECT_RATE", "50"))
-        self._inject_rate = inject_rate
-        if inject_rate > 0:
-            factual_path = os.path.join(
-                os.path.dirname(os.path.abspath(_p_nemo.__file__)),
-                "data", "factual", "facts.txt",
-            )
-            if os.path.exists(factual_path):
-                with open(factual_path) as fh:
-                    self._factual_docs = fh.read().strip().split("\n")
-    def _reopen(self, config: str) -> None:
-        self.streams[config] = _open_stream(config, "train")
-        self.epoch += 1
-    def __iter__(self):
-        return self
-    def __next__(self) -> tuple[str, int]:
-        # Factual injection (preserves prepare_nemotron cadence).
-        if self._inject_rate > 0 and self._factual_docs:
-            self._inject_counter += 1
-            if self._inject_counter >= self._inject_rate:
-                self._inject_counter = 0
-                doc = self._factual_docs[self._factual_idx % len(self._factual_docs)]
-                self._factual_idx += 1
-                return doc, self.epoch
-        config = self.rng.choices(self.configs, weights=self.weights, k=1)[0]
-        try:
-            row = next(self.streams[config])
-        except StopIteration:
-            self._reopen(config)
-            row = next(self.streams[config])
-        return _extract_text(row), self.epoch
-# ---------------------------------------------------------------------------
-# IterableStreamDataset — yields (T+1,) packed rows. No threads. No queues.
-# Lives inside each DataLoader worker. DataLoader's own multiprocessing stacks
-# rows into batches of shape (B, T+1) and sends them to the main process.
-# ---------------------------------------------------------------------------
-class IterableStreamDataset(IterableDataset):
-    """Streams docs, tokenizes, packs into (T+1,) rows via best-fit.
-    Each worker gets its own instance (via fork/spawn) and initializes its
-    own HF streams + rustbpe tokenizer + factual injector. The tokenizer
-    pickled blob is small (~1 MB) and thread-safe per tiktoken docs.
-    """
-    def __init__(
-        self,
-        split: str,
-        seq_len: int,
-        *,
-        base_seed: int = 0,
-        doc_buffer_size: int = 1000,
-        tokenizer_batch: int = 128,
-    ):
-        super().__init__()
-        assert split in ("train", "val"), split
-        self.split = split
-        self.seq_len = seq_len
-        self.row_capacity = seq_len + 1
-        self.base_seed = base_seed
-        self.doc_buffer_size = doc_buffer_size
-        self.tokenizer_batch = tokenizer_batch
-    def _pick_weights(self) -> dict[str, float]:
-        if self.split == "val":
-            if os.environ.get("HYDRA_USE_FULL_BLEND", "0") == "1":
-                return FULL_BLEND_WEIGHTS
-            return {"Nemotron-Pretraining-Multiple-Choice": 1.0}
-        if os.environ.get("HYDRA_USE_FULL_BLEND", "0") == "1":
-            return FULL_BLEND_WEIGHTS
-        phase = os.environ.get("HYDRA_NEMOTRON_PHASE", "phase1").strip().lower()
-        return PHASE2_WEIGHTS if phase == "phase2" else PHASE1_WEIGHTS
-    def __iter__(self) -> Iterator[torch.Tensor]:
-        info = get_worker_info()
-        worker_id = 0 if info is None else info.id
-        # Each worker builds its own tokenizer instance. tiktoken's Encoding
-        # object is pickleable and the underlying C++ BPE is thread-safe;
-        # per-worker instantiation avoids cross-process sharing headaches.
-        tokenizer = _prepare.Tokenizer.from_directory()
-        bos = tokenizer.get_bos_token_id()
-        # Each worker gets its own weighted HF stream. Seed offset ensures
-        # disjoint config-choice trajectories; HF's own shuffle buffer handles
-        # shard randomization.
-        val_seed = 12345  # deterministic val
-        seed = val_seed if self.split == "val" else self.base_seed
-        stream = _WorkerWeightedStream(
-            self._pick_weights(), base_seed=seed, worker_id=worker_id,
-        )
-        row_capacity = self.row_capacity
-        doc_buffer: list[list[int]] = []
-        doc_batch_size = self.tokenizer_batch
-        def refill_buffer() -> None:
-            # Collect doc_batch_size text strings, then batch-tokenize.
-            texts: list[str] = []
-            for _ in range(doc_batch_size):
-                text, _epoch = next(stream)
-                if text:
-                    texts.append(text)
-            if texts:
-                token_lists = tokenizer.encode(texts, prepend=bos)
-                doc_buffer.extend(token_lists)
-        while True:
-            pos = 0
-            row = torch.empty(row_capacity, dtype=torch.long)
-            while pos < row_capacity:
-                while len(doc_buffer) < self.doc_buffer_size:
-                    refill_buffer()
-                remaining = row_capacity - pos
-                # Best-fit packing: largest doc that fully fits.
-                best_idx = -1
-                best_len = 0
-                for i, doc in enumerate(doc_buffer):
-                    dlen = len(doc)
-                    if dlen <= remaining and dlen > best_len:
-                        best_idx = i
-                        best_len = dlen
-                if best_idx >= 0:
-                    doc = doc_buffer.pop(best_idx)
-                    row[pos : pos + len(doc)] = torch.tensor(doc, dtype=torch.long)
-                    pos += len(doc)
-                else:
-                    # No doc fits remaining space — crop shortest to fill.
-                    shortest_idx = min(
-                        range(len(doc_buffer)),
-                        key=lambda i: len(doc_buffer[i]),
-                    )
-                    doc = doc_buffer.pop(shortest_idx)
-                    row[pos : pos + remaining] = torch.tensor(
-                        doc[:remaining], dtype=torch.long,
-                    )
-                    pos += remaining
-            yield row
-# ---------------------------------------------------------------------------
-# LightningDataModule
-# ---------------------------------------------------------------------------
-class HydraDataModule(L.LightningDataModule):
-    def __init__(
-        self,
-        batch_size: int | None = None,
-        seq_len: int | None = None,
-        num_workers: int | None = None,
-        prefetch_factor: int | None = None,
-    ):
-        super().__init__()
-        self.batch_size = batch_size or int(os.environ.get("HYDRA_BATCH_SIZE", "1"))
-        self.seq_len = seq_len or int(os.environ.get("HYDRA_SEQ_LEN", "512"))
-        self.num_workers = (
-            num_workers
-            if num_workers is not None
-            else int(os.environ.get("HYDRA_DATA_NUM_WORKERS", "2"))
-        )
-        self.prefetch_factor = (
-            prefetch_factor
-            if prefetch_factor is not None
-            else int(os.environ.get("HYDRA_DATA_PREFETCH", "4"))
-        )
-        self.doc_buffer = int(os.environ.get("HYDRA_DATA_BUFFER", "1000"))
-    def _make_loader(self, split: str, seed: int) -> DataLoader:
-        dataset = IterableStreamDataset(
-            split=split,
-            seq_len=self.seq_len,
-            base_seed=seed,
-            doc_buffer_size=self.doc_buffer,
-        )
-        # num_workers=0 → main-process iteration (useful for debugging). With
-        # IterableDataset the DataLoader batches the rows into (B, T+1) via
-        # default torch.stack-collate.
-        kw: dict = dict(
-            dataset=dataset,
-            batch_size=self.batch_size,
-            num_workers=self.num_workers,
-            pin_memory=True,
-            drop_last=True,
-        )
-        if self.num_workers > 0:
-            kw["prefetch_factor"] = self.prefetch_factor
-            kw["persistent_workers"] = True
-        return DataLoader(**kw)
-    def train_dataloader(self) -> DataLoader:
-        return self._make_loader("train", seed=0)
-    def val_dataloader(self) -> DataLoader:
-        return self._make_loader("val", seed=12345)

+"""Lightning DataModule + IterableDataset for HYDRA pretraining.
+Replaces the custom threading/queue pipeline in prepare_nemotron.make_dataloader
+with a standard multiprocessing DataLoader approach.
+Design:
+  • IterableStreamDataset: each worker opens its own HF streams for the 7-way
+    blend, tokenizes with rustbpe, packs into (T+1,) rows via best-fit, and
+    yields one row per __next__.
+  • HydraDataModule: wraps the dataset with a standard DataLoader using
+    num_workers>=1, prefetch_factor=4, pin_memory=True. Lightning handles
+    device transfer.
+  • Val stream: deterministic seed 12345. Uses the full blend when
+    HYDRA_USE_FULL_BLEND=1, otherwise only the
+    Nemotron-Pretraining-Multiple-Choice config.
+The worker RNG is seeded per-worker so the weighted-sampling schedule is
+independent across workers (else all workers request the same config at
+the same step and prefetching serializes).
+Env vars (all preserved from prepare_nemotron):
+  HYDRA_SEQ_LEN                  — sequence length T (default 512)
+  HYDRA_BATCH_SIZE               — batch size B (default 1) — passed through
+                                    to DataLoader
+  HYDRA_STREAM_SHUFFLE_BUFFER    — HF shuffle buffer (default 2048)
+  HYDRA_USE_FULL_BLEND           — 7-way blend vs 5-way Nemotron phase
+  HYDRA_USE_NEMOTRON             — enables streaming path (else shard path)
+  HYDRA_FACTUAL_INJECT_RATE      — factual doc injection cadence
+  HYDRA_NEMOTRON_PHASE           — phase1|phase2 (when not full blend)
+  HYDRA_DATA_NUM_WORKERS         — DataLoader num_workers (default 2)
+  HYDRA_DATA_PREFETCH            — DataLoader prefetch_factor (default 4)
+  HYDRA_DATA_BUFFER              — doc_buffer size for best-fit packing
+                                    (default 1000)
+"""
+from __future__ import annotations
+import os
+import random
+from typing import Iterator
+import numpy as np
+import torch
+import lightning as L
+from torch.utils.data import DataLoader, IterableDataset, get_worker_info
+import prepare as _prepare
+import prepare_nemotron as _p_nemo
+from prepare_nemotron import (
+    FULL_BLEND_WEIGHTS,
+    PHASE1_WEIGHTS,
+    PHASE2_WEIGHTS,
+    _BLEND_REGISTRY,
+    _extract_text,
+    _open_stream,
+)
+# ---------------------------------------------------------------------------
+# Worker-local weighted stream. A stripped version of prepare_nemotron's
+# _WeightedStream that is constructed inside each worker. Adds worker sharding:
+# when num_workers > 1 the RNG is seeded per-worker, so different workers
+# sample different config sequences and pull disjoint shard assignments from
+# HF's shuffle buffer.
+# ---------------------------------------------------------------------------
+class _WorkerWeightedStream:
+    """Endless weighted mix of text documents, built inside one DataLoader worker.
+    Each ``next()`` returns ``(text, epoch)``. The config to read from is drawn
+    with a worker-seeded RNG; every ``HYDRA_FACTUAL_INJECT_RATE``-th call returns
+    a line of the factual file instead, when that file exists and is non-empty.
+    Args:
+        weights: mapping of config name -> sampling weight.
+        base_seed: seed shared by all workers of one loader.
+        worker_id: DataLoader worker index; offsets the RNG seed.
+    """
+    def __init__(self, weights: dict[str, float], base_seed: int, worker_id: int):
+        self.configs = list(weights.keys())
+        self.weights = [weights[c] for c in self.configs]
+        self.base_seed = base_seed
+        self.worker_id = worker_id
+        # Each worker opens its own HF streams. _open_stream returns an iter()
+        # over a streaming dataset, with an internal shuffle buffer.
+        # NOTE(review): the "train" split is opened unconditionally, so the
+        # validation loader also reads from it — confirm this is intended.
+        self.streams = {c: _open_stream(c, "train") for c in self.configs}
+        # Per-worker RNG so the config-choice trajectory is independent.
+        self.rng = random.Random(base_seed + worker_id * 7919)
+        self.epoch = 1
+        # Factual docs are loaded eagerly below, once per worker. None or an
+        # empty list disables injection in __next__.
+        self._factual_docs: list[str] | None = None
+        self._factual_idx = 0
+        self._inject_counter = 0
+        inject_rate = int(os.environ.get("HYDRA_FACTUAL_INJECT_RATE", "50"))
+        self._inject_rate = inject_rate
+        if inject_rate > 0:
+            factual_path = os.path.join(
+                os.path.dirname(os.path.abspath(_p_nemo.__file__)),
+                "data", "factual", "facts.txt",
+            )
+            if os.path.exists(factual_path):
+                # Explicit UTF-8 so decoding does not depend on the host
+                # locale. Blank lines are dropped so an injection slot never
+                # yields an empty document.
+                with open(factual_path, encoding="utf-8") as fh:
+                    lines = fh.read().strip().split("\n")
+                self._factual_docs = [line for line in lines if line.strip()]
+    def _reopen(self, config: str) -> None:
+        """Replace the stream of ``config`` with a fresh one and bump ``epoch``."""
+        self.streams[config] = _open_stream(config, "train")
+        self.epoch += 1
+    def __iter__(self):
+        return self
+    def __next__(self) -> tuple[str, int]:
+        """Return the next ``(text, epoch)`` pair.
+        Raises:
+            RuntimeError: if a config's stream yields nothing right after reopen.
+        """
+        # Factual injection (preserves prepare_nemotron cadence).
+        if self._inject_rate > 0 and self._factual_docs:
+            self._inject_counter += 1
+            if self._inject_counter >= self._inject_rate:
+                self._inject_counter = 0
+                doc = self._factual_docs[self._factual_idx % len(self._factual_docs)]
+                self._factual_idx += 1
+                return doc, self.epoch
+        config = self.rng.choices(self.configs, weights=self.weights, k=1)[0]
+        try:
+            row = next(self.streams[config])
+        except StopIteration:
+            self._reopen(config)
+            try:
+                row = next(self.streams[config])
+            except StopIteration as exc:
+                # Do not let StopIteration escape: it would silently end a
+                # for-loop over this nominally endless stream.
+                raise RuntimeError(
+                    f"stream for config {config!r} is empty after reopen"
+                ) from exc
+        return _extract_text(row), self.epoch
+# ---------------------------------------------------------------------------
+# IterableStreamDataset — yields (T+1,) packed rows. No threads. No queues.
+# Lives inside each DataLoader worker. DataLoader's own multiprocessing stacks
+# rows into batches of shape (B, T+1) and sends them to the main process.
+# ---------------------------------------------------------------------------
+class IterableStreamDataset(IterableDataset):
+    """Streams docs, tokenizes, packs into (T+1,) rows via best-fit.
+    Iteration never ends. Every ``__iter__`` call builds its own tokenizer and
+    ``_WorkerWeightedStream``, so each DataLoader worker owns independent state.
+    Args:
+        split: "train" or "val". "val" forces seed 12345 and its own weights.
+        seq_len: sequence length T; yielded rows hold T + 1 token ids.
+        base_seed: seed for the training split's config sampling.
+        doc_buffer_size: number of tokenized docs kept on hand for packing.
+            Values below 1 are treated as 1.
+        tokenizer_batch: texts pulled from the stream per encode call.
+            Values below 1 are treated as 1.
+    """
+    def __init__(
+        self,
+        split: str,
+        seq_len: int,
+        *,
+        base_seed: int = 0,
+        doc_buffer_size: int = 1000,
+        tokenizer_batch: int = 128,
+    ):
+        super().__init__()
+        assert split in ("train", "val"), split
+        self.split = split
+        self.seq_len = seq_len
+        self.row_capacity = seq_len + 1
+        self.base_seed = base_seed
+        self.doc_buffer_size = doc_buffer_size
+        self.tokenizer_batch = tokenizer_batch
+    def _pick_weights(self) -> dict[str, float]:
+        """Choose the config -> weight table from the split and env vars."""
+        # HYDRA_USE_FULL_BLEND=1 wins for both splits.
+        if os.environ.get("HYDRA_USE_FULL_BLEND", "0") == "1":
+            return FULL_BLEND_WEIGHTS
+        if self.split == "val":
+            return {"Nemotron-Pretraining-Multiple-Choice": 1.0}
+        phase = os.environ.get("HYDRA_NEMOTRON_PHASE", "phase1").strip().lower()
+        return PHASE2_WEIGHTS if phase == "phase2" else PHASE1_WEIGHTS
+    def __iter__(self) -> Iterator[torch.Tensor]:
+        """Yield an endless sequence of int64 tensors of shape (seq_len + 1,)."""
+        info = get_worker_info()
+        worker_id = 0 if info is None else info.id
+        # Built here, per worker, rather than shared across processes.
+        tokenizer = _prepare.Tokenizer.from_directory()
+        bos = tokenizer.get_bos_token_id()
+        # Validation ignores base_seed so its config-choice sequence is fixed.
+        val_seed = 12345  # deterministic val
+        seed = val_seed if self.split == "val" else self.base_seed
+        stream = _WorkerWeightedStream(
+            self._pick_weights(), base_seed=seed, worker_id=worker_id,
+        )
+        row_capacity = self.row_capacity
+        doc_buffer: list[list[int]] = []
+        # A batch size below 1 would make refill_buffer a no-op and the refill
+        # loop below would spin forever.
+        doc_batch_size = max(1, self.tokenizer_batch)
+        # Packing needs at least one buffered doc. Without this floor a
+        # doc_buffer_size of 0 (e.g. HYDRA_DATA_BUFFER=0) skips the refill and
+        # min() over the empty buffer raises ValueError.
+        min_buffered = max(1, self.doc_buffer_size)
+        def refill_buffer() -> None:
+            # Collect doc_batch_size text strings, then batch-tokenize.
+            texts: list[str] = []
+            for _ in range(doc_batch_size):
+                text, _epoch = next(stream)
+                if text:
+                    texts.append(text)
+            if texts:
+                token_lists = tokenizer.encode(texts, prepend=bos)
+                # Skip empty token lists: best-fit can never place them and
+                # the crop branch would fail assigning an empty tensor to a
+                # non-empty slice.
+                doc_buffer.extend(toks for toks in token_lists if len(toks) > 0)
+        while True:
+            pos = 0
+            row = torch.empty(row_capacity, dtype=torch.long)
+            while pos < row_capacity:
+                while len(doc_buffer) < min_buffered:
+                    refill_buffer()
+                remaining = row_capacity - pos
+                # Best-fit packing: largest doc that fully fits.
+                best_idx = -1
+                best_len = 0
+                for i, doc in enumerate(doc_buffer):
+                    dlen = len(doc)
+                    if dlen <= remaining and dlen > best_len:
+                        best_idx = i
+                        best_len = dlen
+                if best_idx >= 0:
+                    doc = doc_buffer.pop(best_idx)
+                    row[pos : pos + len(doc)] = torch.tensor(doc, dtype=torch.long)
+                    pos += len(doc)
+                else:
+                    # No doc fits remaining space — crop shortest to fill.
+                    # The cropped-off tail of that doc is discarded.
+                    shortest_idx = min(
+                        range(len(doc_buffer)),
+                        key=lambda i: len(doc_buffer[i]),
+                    )
+                    doc = doc_buffer.pop(shortest_idx)
+                    row[pos : pos + remaining] = torch.tensor(
+                        doc[:remaining], dtype=torch.long,
+                    )
+                    pos += remaining
+            yield row
+# ---------------------------------------------------------------------------
+# LightningDataModule
+# ---------------------------------------------------------------------------
+class HydraDataModule(L.LightningDataModule):
+    """LightningDataModule serving packed HYDRA rows for training and validation.
+    Constructor arguments left unset fall back to environment variables:
+    HYDRA_BATCH_SIZE (1), HYDRA_SEQ_LEN (512), HYDRA_DATA_NUM_WORKERS (2) and
+    HYDRA_DATA_PREFETCH (4). The packing buffer size is always read from
+    HYDRA_DATA_BUFFER (1000).
+    """
+    def __init__(
+        self,
+        batch_size: int | None = None,
+        seq_len: int | None = None,
+        num_workers: int | None = None,
+        prefetch_factor: int | None = None,
+    ):
+        super().__init__()
+        def env_int(name: str, fallback: str) -> int:
+            # Only called when the matching argument was not supplied.
+            return int(os.environ.get(name, fallback))
+        # For these two, any falsy value (None or 0) defers to the environment.
+        self.batch_size = batch_size if batch_size else env_int("HYDRA_BATCH_SIZE", "1")
+        self.seq_len = seq_len if seq_len else env_int("HYDRA_SEQ_LEN", "512")
+        # For these two an explicit 0 is honoured; only None defers.
+        if num_workers is None:
+            num_workers = env_int("HYDRA_DATA_NUM_WORKERS", "2")
+        self.num_workers = num_workers
+        if prefetch_factor is None:
+            prefetch_factor = env_int("HYDRA_DATA_PREFETCH", "4")
+        self.prefetch_factor = prefetch_factor
+        self.doc_buffer = env_int("HYDRA_DATA_BUFFER", "1000")
+    def _make_loader(self, split: str, seed: int) -> DataLoader:
+        """Build a DataLoader for ``split`` over a fresh IterableStreamDataset."""
+        loader_args: dict = {
+            "dataset": IterableStreamDataset(
+                split=split,
+                seq_len=self.seq_len,
+                base_seed=seed,
+                doc_buffer_size=self.doc_buffer,
+            ),
+            "batch_size": self.batch_size,
+            "num_workers": self.num_workers,
+            "pin_memory": True,
+            "drop_last": True,
+        }
+        # num_workers=0 means main-process iteration (useful for debugging);
+        # the worker-only options are passed only when workers exist.
+        if self.num_workers > 0:
+            loader_args.update(
+                prefetch_factor=self.prefetch_factor,
+                persistent_workers=True,
+            )
+        return DataLoader(**loader_args)
+    def train_dataloader(self) -> DataLoader:
+        """Training loader, base seed 0."""
+        return self._make_loader("train", seed=0)
+    def val_dataloader(self) -> DataLoader:
+        """Validation loader, base seed 12345."""
+        return self._make_loader("val", seed=12345)

overlay/hydra/diffusion_loss.py CHANGED Viewed

@@ -1,236 +1,236 @@
-"""MDLM Rao-Blackwellized Masked Diffusion Loss.
-Implements the masked-diffusion ELBO from:
-    Sahoo et al., "Simple and Effective Masked Diffusion Language Models" (MDLM),
-    NeurIPS 2024, arXiv:2406.07524.
-Equations referenced:
-    - Forward process: eq. 2  (per-token Bernoulli masking at rate 1 - alpha_t)
-    - Log-linear schedule:    alpha_t = 1 - t,  t ~ Uniform(0, 1)
-    - RB-ELBO:     eq. 7-8   L_RB = E_t E_q [ (1/alpha_t) * CE(x_theta(x_t), x_0) ]
-                              where the expectation over masked positions.
-Key insight: the Rao-Blackwellized estimate replaces an average over all masks
-(exponential) by a closed-form weighted CE that applies weight 1/alpha_t only
-on the positions that were masked, and 0 on unmasked positions. This gives an
-unbiased estimator with lower variance than a naive Monte Carlo over mask
-patterns.
-Reference implementation cross-checked against:
-    https://github.com/kuleshov-group/mdlm  (diffusion.py::DiffusionModel._loss)
-"""
-from __future__ import annotations
-from typing import Literal
-import torch
-import torch.nn.functional as F
-# Clamping weight keeps gradients finite while still up-weighting high-noise
-# positions. Historical value 1/eps=1000 blew up HYDRA training on a 12h v2
-# launch (2026-04-22): loss 26 → 42 → NaN in 13 steps under Muon lr=7e-3
-# because per-token CE × 1000 saturated the 100-unit FAIL guard. The MDLM
-# paper reports stable training at Adam lr=1e-4; HYDRA uses Muon at 7e-3
-# (70× larger), so the weight clamp needs to compensate.
-#
-# Tunable via HYDRA_MDLM_MAX_WEIGHT (default 5.0). Set =1.0 to disable
-# weighting entirely (flat masked-LM CE, no RB reweighting — simpler and
-# more stable, sacrifices the theoretical ELBO property).
-import os as _os
-_MAX_WEIGHT: float = float(_os.environ.get("HYDRA_MDLM_MAX_WEIGHT", "5.0"))
-_MIN_ALPHA: float = 1.0 / _MAX_WEIGHT  # so clamp(alpha, min=_MIN_ALPHA) gives 1/alpha <= _MAX_WEIGHT
-# ---------------------------------------------------------------------------
-# Public API
-# ---------------------------------------------------------------------------
-def mdlm_masked_forward_process(
-    targets: torch.Tensor,
-    mask_token_id: int,
-    t: torch.Tensor | None = None,
-    alpha_schedule: Literal["linear", "loglinear"] = "loglinear",
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    """MDLM forward (noising) process: mask tokens and compute RB weights.
-    Args:
-        targets: (B, T) int64 token ids — the clean sequence x_0.
-        mask_token_id: The special token id used to represent a masked token.
-        t: (B,) float in (0, 1). If None, samples Uniform(0, 1) per batch
-            element. t=0 means fully clean; t=1 means fully masked.
-        alpha_schedule: Noise schedule.
-            "loglinear" (MDLM default): alpha_t = 1 - t
-            "linear": identical formula — both are provided for completeness
-            since the paper calls the 1-t schedule "log-linear" in the context
-            of the ELBO derivation.
-    Returns:
-        x_t           : (B, T) int64 — noised sequence; masked positions hold
-                        mask_token_id, unmasked positions equal targets.
-        mask_positions: (B, T) bool  — True where the token was masked.
-        loss_weights  : (B, T) float32 — RB weighting factor. On masked
-                        positions: 1/alpha_t (clamped to _MAX_WEIGHT). On
-                        unmasked positions: 0.0. Summing
-                        (CE * loss_weights * mask_positions).sum() / mask.sum()
-                        gives the per-sample RB-ELBO estimator.
-    """
-    B, T = targets.shape
-    device = targets.device
-    dtype = torch.float32
-    # --- sample or validate t ---
-    if t is None:
-        # Uniform(0, 1) per batch element; avoid exactly 0 and 1.
-        t = torch.rand(B, device=device, dtype=dtype)
-    else:
-        t = t.to(device=device, dtype=dtype)
-        if t.shape != (B,):
-            raise ValueError(f"t must be shape (B,)={(B,)}, got {t.shape}")
-        if (t < 0).any() or (t > 1).any():
-            raise ValueError("t must be in [0, 1]")
-    # --- noise schedule: alpha_t = probability that a token is NOT masked ---
-    # Both "linear" and "loglinear" in MDLM use alpha_t = 1 - t; the paper
-    # refers to "log-linear" because the schedule is linear in the *log* domain
-    # of the forward process probability. We expose both names for clarity.
-    if alpha_schedule in ("linear", "loglinear"):
-        alpha_t = 1.0 - t          # (B,) float, in [0, 1]
-    else:
-        raise ValueError(f"Unknown alpha_schedule: {alpha_schedule!r}. Use 'linear' or 'loglinear'.")
-    # --- per-token Bernoulli mask ---
-    # alpha_t[:, None] broadcasts to (B, T).
-    alpha_t_expanded = alpha_t[:, None]                # (B, 1)
-    # Bernoulli(1 - alpha_t) = 1 means "mask this token".
-    # We sample independently per token, per batch element.
-    rand = torch.rand(B, T, device=device, dtype=dtype)
-    mask_positions = rand > alpha_t_expanded           # (B, T) bool
-    # True  → masked position
-    # False → unmasked (kept as original)
-    # --- build x_t ---
-    x_t = targets.clone()
-    x_t = torch.where(mask_positions, torch.full_like(x_t, mask_token_id), x_t)
-    # --- RB loss weights: 1/alpha_t on masked positions, 0 elsewhere ---
-    # Clamp alpha_t so weights stay finite near t→1.
-    safe_alpha = alpha_t.clamp(min=_MIN_ALPHA)         # (B,)
-    weight_per_sample = 1.0 / safe_alpha               # (B,)
-    # Broadcast to (B, T) and zero out unmasked positions.
-    loss_weights = weight_per_sample[:, None].expand(B, T).to(dtype=dtype)  # (B, T)
-    loss_weights = loss_weights * mask_positions.float()
-    return x_t, mask_positions, loss_weights
-def mdlm_rb_loss(
-    logits: torch.Tensor,
-    targets: torch.Tensor,
-    mask_positions: torch.Tensor,
-    loss_weights: torch.Tensor,
-    ignore_index: int = -100,
-) -> torch.Tensor:
-    """Rao-Blackwellized negative ELBO.
-    Applies the MDLM loss: cross-entropy on masked positions only, weighted
-    per-token by loss_weights, averaged over the batch.
-    The formula (eq. 7-8 of arXiv:2406.07524):
-        L_RB = mean_B [ sum_T (weight_t * CE(logits_i, target_i) * mask_i)
-                        / max(sum_T(mask_i), 1) ]
-    Args:
-        logits        : (B, T, V) raw logits. May be bf16; internally cast to
-                        float32 for CE computation.
-        targets       : (B, T) int64 true token ids (x_0).
-        mask_positions: (B, T) bool — True = masked position.
-        loss_weights  : (B, T) float32 — 1/alpha_t on masked positions, 0 elsewhere.
-        ignore_index  : Passed to F.cross_entropy; positions with this label
-                        are excluded from the loss.
-    Returns:
-        Scalar float32 loss. Returns 0.0 tensor if no positions are masked.
-    """
-    B, T, V = logits.shape
-    # Ensure float32 for numerical stability; F.cross_entropy accepts fp16/bf16
-    # logits but accumulates in float internally anyway. Being explicit avoids
-    # silent precision surprises.
-    logits_f = logits.float()                          # (B, T, V)
-    # Build targets with ignore_index on UNmasked positions so CE only fires
-    # where mask_positions is True. We also honour any pre-existing -100 values
-    # (e.g. doc-separator masking upstream).
-    targets_masked = torch.where(
-        mask_positions & (targets != ignore_index),
-        targets,
-        torch.full_like(targets, ignore_index),
-    )
-    # Per-token CE; shape (B, T). Positions with ignore_index → 0 from CE.
-    per_tok_ce = F.cross_entropy(
-        logits_f.reshape(B * T, V),
-        targets_masked.reshape(B * T),
-        ignore_index=ignore_index,
-        reduction="none",
-    ).reshape(B, T)                                    # (B, T) float32
-    # Apply RB weight. loss_weights already has 0 on unmasked positions.
-    weighted = per_tok_ce * loss_weights               # (B, T)
-    # Per-sample mean over masked positions, then average over batch.
-    mask_f = mask_positions.float()                    # (B, T)
-    per_sample_mask_count = mask_f.sum(dim=1).clamp(min=1)   # (B,)
-    per_sample_loss = weighted.sum(dim=1) / per_sample_mask_count  # (B,)
-    return per_sample_loss.mean()                      # scalar float32
-def mdlm_loss(
-    logits: torch.Tensor,
-    targets: torch.Tensor,
-    mask_token_id: int,
-    t: torch.Tensor | None = None,
-    alpha_schedule: Literal["linear", "loglinear"] = "loglinear",
-    ignore_index: int = -100,
-) -> torch.Tensor:
-    """Convenience wrapper: forward process + RB-ELBO in one call.
-    Suitable for the common case where the caller has full-vocab logits and
-    wants a drop-in replacement for a standard masked-LM CE loss.
-    Args:
-        logits        : (B, T, V) raw logits.
-        targets       : (B, T) int64 clean token ids.
-        mask_token_id : The MASK token id used to corrupt the input.
-        t             : Optional (B,) timestep in (0, 1). Sampled if None.
-        alpha_schedule: "loglinear" (default) or "linear".
-        ignore_index  : Token id to ignore in the loss (e.g. padding).
-    Returns:
-        Scalar float32 MDLM RB-ELBO loss.
-    Note on sampled-softmax / partial logits:
-        If your model only computes logits for a subset of vocab positions
-        (e.g. HYDRA's sampled-softmax head), call mdlm_masked_forward_process
-        and mdlm_rb_loss separately. mdlm_rb_loss expects full-vocab logits.
-    """
-    x_t, mask_positions, loss_weights = mdlm_masked_forward_process(
-        targets=targets,
-        mask_token_id=mask_token_id,
-        t=t,
-        alpha_schedule=alpha_schedule,
-    )
-    # x_t is produced for the model's input (not used by this convenience
-    # wrapper since logits are already provided by the caller). In a real
-    # training loop the caller feeds x_t into the model to get logits, THEN
-    # calls this function. See the orchestrator wiring note in training.py.
-    return mdlm_rb_loss(
-        logits=logits,
-        targets=targets,
-        mask_positions=mask_positions,
-        loss_weights=loss_weights,
-        ignore_index=ignore_index,
-    )

+"""MDLM Rao-Blackwellized Masked Diffusion Loss.
+Implements the masked-diffusion ELBO from:
+    Sahoo et al., "Simple and Effective Masked Diffusion Language Models" (MDLM),
+    NeurIPS 2024, arXiv:2406.07524.
+Equations referenced:
+    - Forward process: eq. 2  (per-token Bernoulli masking at rate 1 - alpha_t)
+    - Log-linear schedule:    alpha_t = 1 - t,  t ~ Uniform(0, 1)
+    - RB-ELBO:     eq. 7-8   L_RB = E_t E_q [ (1/alpha_t) * CE(x_theta(x_t), x_0) ]
+                              where the expectation is taken over masked positions.
+Key insight: the Rao-Blackwellized estimate replaces an average over all masks
+(exponential) by a closed-form weighted CE that applies weight 1/alpha_t only
+on the positions that were masked, and 0 on unmasked positions. This gives an
+unbiased estimator with lower variance than a naive Monte Carlo over mask
+patterns.
+Reference implementation cross-checked against:
+    https://github.com/kuleshov-group/mdlm  (diffusion.py::DiffusionModel._loss)
+"""
+from __future__ import annotations
+from typing import Literal
+import torch
+import torch.nn.functional as F
+# Clamping weight keeps gradients finite while still up-weighting high-noise
+# positions. Historical value 1/eps=1000 blew up HYDRA training on a 12h v2
+# launch (2026-04-22): loss 26 → 42 → NaN in 13 steps under Muon lr=7e-3
+# because per-token CE × 1000 saturated the 100-unit FAIL guard. The MDLM
+# paper reports stable training at Adam lr=1e-4; HYDRA uses Muon at 7e-3
+# (70× larger), so the weight clamp needs to compensate.
+#
+# Tunable via HYDRA_MDLM_MAX_WEIGHT (default 5.0). Set =1.0 to disable
+# weighting entirely (flat masked-LM CE, no RB reweighting — simpler and
+# more stable, sacrifices the theoretical ELBO property).
+import os as _os
+_MAX_WEIGHT: float = float(_os.environ.get("HYDRA_MDLM_MAX_WEIGHT", "5.0"))
+# Reject zero, negative and NaN caps up front. Zero would raise a bare
+# ZeroDivisionError on the reciprocal below; a negative or NaN value would
+# silently produce a meaningless clamp floor.
+if not _MAX_WEIGHT > 0:
+    raise ValueError(
+        f"HYDRA_MDLM_MAX_WEIGHT must be a positive number, got {_MAX_WEIGHT!r}"
+    )
+_MIN_ALPHA: float = 1.0 / _MAX_WEIGHT  # so clamp(alpha, min=_MIN_ALPHA) gives 1/alpha <= _MAX_WEIGHT
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+def mdlm_masked_forward_process(
+    targets: torch.Tensor,
+    mask_token_id: int,
+    t: torch.Tensor | None = None,
+    alpha_schedule: Literal["linear", "loglinear"] = "loglinear",
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """MDLM forward (noising) process: mask tokens and compute loss weights.
+    Args:
+        targets: (B, T) int64 token ids — the clean sequence x_0.
+        mask_token_id: The special token id written into masked positions.
+        t: (B,) float in [0, 1]. If None, one value per batch element is drawn
+            with torch.rand, i.e. from [0, 1). t=0 leaves the row clean; at
+            t=1 every position is masked unless its uniform draw is exactly 0.
+        alpha_schedule: "loglinear" or "linear". Both names select the same
+            formula here, alpha_t = 1 - t.
+    Returns:
+        x_t           : (B, T) — noised sequence; masked positions hold
+                        mask_token_id, the rest equal targets.
+        mask_positions: (B, T) bool — True where the token was masked.
+        loss_weights  : (B, T) float32 — 1/alpha_t on masked positions, capped
+                        at _MAX_WEIGHT by clamping alpha_t to _MIN_ALPHA, and
+                        0.0 on unmasked positions.
+    Raises:
+        ValueError: if t has the wrong shape, contains NaN or values outside
+            [0, 1], or if alpha_schedule is not a known name.
+    """
+    B, T = targets.shape
+    device = targets.device
+    dtype = torch.float32
+    # --- sample or validate t ---
+    if t is None:
+        # torch.rand samples [0, 1): exactly 0 is possible, exactly 1 is not.
+        t = torch.rand(B, device=device, dtype=dtype)
+    else:
+        t = t.to(device=device, dtype=dtype)
+        if t.shape != (B,):
+            raise ValueError(f"t must be shape (B,)={(B,)}, got {t.shape}")
+        # Positive range test: NaN fails every comparison, so it is rejected
+        # here instead of flowing into the weights.
+        if not ((t >= 0) & (t <= 1)).all():
+            raise ValueError("t must be in [0, 1]")
+    # --- noise schedule: alpha_t = probability that a token is NOT masked ---
+    if alpha_schedule in ("linear", "loglinear"):
+        alpha_t = 1.0 - t          # (B,) float, in [0, 1]
+    else:
+        raise ValueError(f"Unknown alpha_schedule: {alpha_schedule!r}. Use 'linear' or 'loglinear'.")
+    # --- per-token Bernoulli mask ---
+    # One uniform draw per token; a token is masked when its draw exceeds
+    # alpha_t, which happens with probability 1 - alpha_t.
+    rand = torch.rand(B, T, device=device, dtype=dtype)
+    mask_positions = rand > alpha_t[:, None]           # (B, T) bool
+    # --- build x_t ---
+    # torch.where allocates its own output, so targets is left untouched.
+    x_t = torch.where(mask_positions, torch.full_like(targets, mask_token_id), targets)
+    # --- loss weights: 1/alpha_t on masked positions, 0 elsewhere ---
+    # Clamp alpha_t so weights stay finite near t→1.
+    # NOTE(review): the MDLM objective (arXiv:2406.07524) weights the CE by
+    # alpha_t' / (1 - alpha_t), which for alpha_t = 1 - t appears to be 1/t
+    # rather than 1/alpha_t. Confirm that 1/alpha_t is the intended weight.
+    safe_alpha = alpha_t.clamp(min=_MIN_ALPHA)         # (B,)
+    weight_per_sample = 1.0 / safe_alpha               # (B,)
+    # Broadcast (B, 1) against the (B, T) mask; unmasked positions become 0.
+    loss_weights = weight_per_sample[:, None] * mask_positions.to(dtype=dtype)
+    return x_t, mask_positions, loss_weights
+def mdlm_rb_loss(
+    logits: torch.Tensor,
+    targets: torch.Tensor,
+    mask_positions: torch.Tensor,
+    loss_weights: torch.Tensor,
+    ignore_index: int = -100,
+) -> torch.Tensor:
+    """Weighted masked cross-entropy (Rao-Blackwellized negative ELBO).
+    A position contributes only if it is masked AND its target is not
+    ignore_index. For each sample the weighted CE is summed over contributing
+    positions and divided by their count; the result is averaged over the batch:
+        loss = mean_B [ sum_T (weight_i * CE(logits_i, target_i) * valid_i)
+                        / max(sum_T(valid_i), 1) ]
+    Args:
+        logits        : (B, T, V) raw logits. May be bf16; cast to float32
+                        before the CE computation.
+        targets       : (B, T) int64 true token ids (x_0).
+        mask_positions: (B, T) bool — True = masked position.
+        loss_weights  : (B, T) float32 per-token weight.
+        ignore_index  : Label value excluded from both the loss and the
+                        per-sample normalization.
+    Returns:
+        Scalar float32 loss. A sample with no contributing position adds 0.
+    """
+    B, T, V = logits.shape
+    # Explicit float32 so the CE is not computed in fp16/bf16.
+    logits_f = logits.float()                          # (B, T, V)
+    # Contributing positions: masked and carrying a real label. Pre-existing
+    # ignore_index labels (e.g. doc-separator masking upstream) are honoured.
+    valid = mask_positions & (targets != ignore_index)  # (B, T) bool
+    targets_masked = torch.where(
+        valid,
+        targets,
+        torch.full_like(targets, ignore_index),
+    )
+    # Per-token CE; shape (B, T). Positions with ignore_index → 0 from CE.
+    per_tok_ce = F.cross_entropy(
+        logits_f.reshape(B * T, V),
+        targets_masked.reshape(B * T),
+        ignore_index=ignore_index,
+        reduction="none",
+    ).reshape(B, T)                                    # (B, T) float32
+    weighted = per_tok_ce * loss_weights               # (B, T)
+    # Normalize by contributing positions only. Counting masked positions
+    # whose label is ignore_index would shrink the mean, since they add 0
+    # to the numerator.
+    per_sample_count = valid.float().sum(dim=1).clamp(min=1)   # (B,)
+    per_sample_loss = weighted.sum(dim=1) / per_sample_count   # (B,)
+    return per_sample_loss.mean()                      # scalar float32
+def mdlm_loss(
+    logits: torch.Tensor,
+    targets: torch.Tensor,
+    mask_token_id: int,
+    t: torch.Tensor | None = None,
+    alpha_schedule: Literal["linear", "loglinear"] = "loglinear",
+    ignore_index: int = -100,
+) -> torch.Tensor:
+    """One-call wrapper: draw a mask with the forward process, then score it.
+    Runs mdlm_masked_forward_process on ``targets`` and passes the resulting
+    mask and weights, together with the caller's ``logits``, to mdlm_rb_loss.
+    Args:
+        logits        : (B, T, V) raw logits.
+        targets       : (B, T) int64 clean token ids.
+        mask_token_id : The MASK token id handed to the forward process.
+        t             : Optional (B,) timestep; sampled when None.
+        alpha_schedule: "loglinear" (default) or "linear".
+        ignore_index  : Label value excluded from the loss (e.g. padding).
+    Returns:
+        Scalar float32 loss from mdlm_rb_loss.
+    Note on sampled-softmax / partial logits:
+        If the model only produces logits for a subset of the vocabulary, call
+        mdlm_masked_forward_process and mdlm_rb_loss separately; mdlm_rb_loss
+        expects full-vocab logits.
+    NOTE(review): the mask is drawn inside this call, after ``logits`` already
+    exist, so it is independent of whatever corrupted input produced those
+    logits. Confirm callers intend that, or use the two-step API.
+    """
+    noised = mdlm_masked_forward_process(
+        targets=targets,
+        mask_token_id=mask_token_id,
+        t=t,
+        alpha_schedule=alpha_schedule,
+    )
+    # The first element is the corrupted sequence x_t. It is discarded because
+    # the logits are supplied by the caller.
+    _, masked, weights = noised
+    return mdlm_rb_loss(
+        logits=logits,
+        targets=targets,
+        mask_positions=masked,
+        loss_weights=weights,
+        ignore_index=ignore_index,
+    )

overlay/hydra/engram.py CHANGED Viewed

@@ -1,175 +1,160 @@
-"""GPU Engram — Top-k Sparse Hopfield retrieval, scales to n_columns >= 32768.
-## What changed (scatter-gather → top-k Hopfield)
-The original forward used `self.memory[indices]` (scatter-gather), which misses
-L2 cache at n_columns > 4096 and creates a hard tps ceiling.
-An earlier Hopfield implementation used `entmax15` for sparse attention, but
-entmax's internal `torch.sort` over the full n_columns dimension allocates
-~1 GB scratch at (B*T=8192, n_columns=32768) and OOMs on a 6 GB card.
-This module replaces the sort-based entmax with **top-k softmax**, which is
-O(B*T*K) in memory and O(B*T*K * log n_columns) in compute (the top-k is
-radix-selection under the hood — not a full sort). Sparsity is still exact:
-only K columns have non-zero weight per (batch, position).
-## Why this scales where entmax didn't
-- `scores = x @ memory.T` is (B, T, n_columns) — 268 MB at bf16 with n_columns=32768.
-- `scores.topk(K)` allocates only (B, T, K) — ~2 MB at K=64. No full sort.
-- `memory[topk_idx]` gathers (B, T, K, d_model) — ~32 MB at bf16. Gather is
-  on the LAST axis of memory (columns), contiguous stride-1 rows, cache-friendly.
-- `retrieved = einsum(topk_w, selected_mem)` — ~4 MB. Final reduction.
-Peak working set well under 400 MB at any reasonable n_columns + K. The weights
-tensor is never densified (which would have been the (B, T, n_columns) killer).
-## Gradient flow
-Both the topk gather and the einsum are autograd-tracked, so `self.memory`
-receives gradient from the LM loss (which the Hebbian scatter-gather path did
-not). `topk` indices are detached — gradient flows through `topk_vals` via the
-selected memory rows.
-## Sparsity
-Exactly K columns have non-zero weight per position. Default K=64, tunable via
-HYDRA_ENGRAM_TOPK.
-## token_ids argument
-Accepted for API compatibility with hydra/model.py; unused in retrieval. The
-optional Hebbian boost (hebbian_boost=True) uses the hash-indexed path for
-its EMA write only.
-## Checkpoint compatibility
-`self.memory` shape (n_columns, d_model) is unchanged; existing .pt/.ckpt
-files load without migration.
-"""
-from __future__ import annotations
-import os
-import torch
-import torch.nn as nn
-# Top-k width — how many memory columns get non-zero weight per position.
-# Default 64 matches the entmax sparsity fraction we observed empirically
-# (~0.2% of 32768 columns == 64). HYDRA_ENGRAM_TOPK env var overrides.
-_ENGRAM_TOPK = int(os.environ.get("HYDRA_ENGRAM_TOPK", "64"))
-class GPUEngram(nn.Module):
-    """GPU Engram: Top-k Sparse Hopfield retrieval.
-    Args:
-        d_model:       Model dimension — must match the surrounding transformer.
-        n_columns:     Number of memory columns (key-value pairs). Safe up to
-                       n_columns = 65536 at d_model = 384 on a 6 GB card with
-                       B*T <= 8192.
-        max_ngram:     Retained for API compatibility; unused in retrieval.
-        hebbian_boost: If True, also run a Hebbian EMA write on the memory bank
-                       during training. Default False — the top-k gradient path
-                       provides learning signal without this.
-    """
-    def __init__(
-        self,
-        d_model: int,
-        n_columns: int = 1024,
-        max_ngram: int = 3,
-        hebbian_boost: bool = False,
-    ) -> None:
-        super().__init__()
-        self.n_columns = n_columns
-        self.max_ngram = max_ngram
-        self.hebbian_boost = hebbian_boost
-        # Shape unchanged from original — existing checkpoints load cleanly.
-        self.memory = nn.Parameter(torch.randn(n_columns, d_model) * 0.01)
-        self.gate = nn.Linear(d_model, 1, bias=True)
-        nn.init.constant_(self.gate.bias, 0.0)  # START OPEN
-        # Clamp topk K to n_columns so topk doesn't error at small engram.
-        self.topk_k = min(_ENGRAM_TOPK, n_columns)
-        # Retained for any external code that reads these attrs.
-        self.primes = [2654435761, 2246822519, 3266489917]
-        self.hebbian_lr = 0.01
-    # ------------------------------------------------------------------
-    # _hash: retained for API/checkpoint compat; unused in retrieval path.
-    # ------------------------------------------------------------------
-    def _hash(self, token_ids: torch.Tensor) -> torch.Tensor:
-        """N-gram hash → column index (Hebbian-write target only, not retrieval)."""
-        B, T = token_ids.shape
-        h = token_ids * self.primes[0]
-        if T > 1:
-            shifted1 = torch.roll(token_ids, 1, dims=1)
-            shifted1[:, 0] = 0
-            h = h ^ (shifted1 * self.primes[1])
-        if T > 2:
-            shifted2 = torch.roll(token_ids, 2, dims=1)
-            shifted2[:, :2] = 0
-            h = h ^ (shifted2 * self.primes[2])
-        return h % self.n_columns
-    # ------------------------------------------------------------------
-    # forward
-    # ------------------------------------------------------------------
-    def forward(self, x: torch.Tensor, token_ids: torch.Tensor):
-        """Top-k Hopfield retrieve + soft gate + residual.
-        Args:
-            x:         (B, T, d_model) — input activations.
-            token_ids: (B, T) — accepted for API compat; only used in the
-                       optional Hebbian boost path.
-        Returns:
-            (x + alpha * retrieved, hit_rate)
-            - x + alpha * retrieved: (B, T, d_model)
-            - hit_rate: scalar tensor — fraction of gate values > 0.1
-        """
-        B, T, D = x.shape
-        # ---- 1. Similarity scores (coalesced GEMM) ----------------------
-        # scores[b, t, c] = dot(x[b,t], memory[c])
-        scores = x @ self.memory.T  # (B, T, n_columns)
-        # ---- 2. Top-k sparse attention ----------------------------------
-        # topk uses radix select, not a sort — O(n_columns) memory, not O(n_columns log n_columns).
-        # Never materializes a dense (B, T, n_columns) weights tensor.
-        topk_vals, topk_idx = scores.topk(self.topk_k, dim=-1)  # (B, T, K), (B, T, K)
-        topk_w = torch.softmax(topk_vals, dim=-1)                # (B, T, K)
-        # ---- 3. Gather selected memory rows -----------------------------
-        # memory[topk_idx] is a gather along axis 0 of memory (n_columns, d_model).
-        # Output shape (B, T, K, d_model) — K is small, so gather bandwidth is
-        # O(B*T*K*d_model), independent of n_columns.
-        selected_mem = self.memory[topk_idx]  # (B, T, K, d_model)
-        # ---- 4. Weighted sum → retrieved vector -------------------------
-        retrieved = torch.einsum('btk,btkd->btd', topk_w, selected_mem)  # (B, T, d_model)
-        # ---- 5. Soft gate -----------------------------------------------
-        alpha = torch.sigmoid(self.gate(x))  # (B, T, 1)
-        # ---- 6. Optional Hebbian EMA write ------------------------------
-        if self.training and self.hebbian_boost:
-            with torch.no_grad():
-                indices = self._hash(token_ids)
-                flat_idx = indices.reshape(-1)                # (B*T,)
-                flat_x = x.detach().reshape(-1, D)            # (B*T, d_model)
-                mem_dtype = self.memory.data.dtype
-                updates = (
-                    self.hebbian_lr * flat_x
-                    - self.hebbian_lr * self.memory.data[flat_idx]
-                ).to(mem_dtype)
-                self.memory.data.index_add_(0, flat_idx, updates)
-        # ---- 7. Residual + hit_rate -------------------------------------
-        hit_rate = (alpha.detach() > 0.1).float().mean()
-        return x + alpha * retrieved, hit_rate

+"""GPU Engram — Top-k Sparse Hopfield retrieval with optional Cantor/SDR nerve constraint."""
+from __future__ import annotations
+import os
+import torch
+import torch.nn as nn
+_ENGRAM_TOPK = int(os.environ.get("HYDRA_ENGRAM_TOPK", "64"))
+class GPUEngram(nn.Module):
+    """GPU Engram: top-k sparse Hopfield retrieval over a learned memory bank.
+    The routing mode is read once, at construction, from the
+    ``HYDRA_ENGRAM_ROUTING`` environment variable (default ``auto``, lower-cased):
+    - ``cantor_sdr``: when SDR indices, leaf ids and the leaf count are all
+      passed to ``forward``, score only the candidate columns derived from them.
+    - ``auto``: same as ``cantor_sdr``, but only when the number of SDR indices
+      per position is smaller than ``n_columns``.
+    - any other value (e.g. ``flat``): always score the full memory bank.
+    Both non-flat modes fall back to the full-memory path when any of the three
+    routing inputs is missing.
+    Args:
+        d_model:       Feature width of ``x`` and of every memory row.
+        n_columns:     Number of memory rows.
+        max_ngram:     Stored on the module; not read by any method of this class.
+        hebbian_boost: If True, ``forward`` also applies a no-grad EMA write to
+                       the hashed memory rows while the module is in training mode.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        n_columns: int = 1024,
+        max_ngram: int = 3,
+        hebbian_boost: bool = False,
+    ) -> None:
+        super().__init__()
+        self.n_columns = n_columns
+        self.max_ngram = max_ngram
+        self.hebbian_boost = hebbian_boost
+        self.memory = nn.Parameter(torch.randn(n_columns, d_model) * 0.01)
+        self.gate = nn.Linear(d_model, 1, bias=True)
+        # Zero bias => sigmoid(gate) starts around 0.5 for small pre-activations.
+        nn.init.constant_(self.gate.bias, 0.0)
+        # topk raises if k exceeds the scored dimension, so cap K at n_columns.
+        self.topk_k = min(_ENGRAM_TOPK, n_columns)
+        self.primes = [2654435761, 2246822519, 3266489917]
+        self.hebbian_lr = 0.01
+        self.routing_mode = os.environ.get("HYDRA_ENGRAM_ROUTING", "auto").lower()
+    def _hash(self, token_ids: torch.Tensor) -> torch.Tensor:
+        """Hash each token with up to two predecessors into a memory row index.
+        Args:
+            token_ids: 2-D integer tensor (the shape is unpacked as two dims).
+        Returns:
+            Tensor shaped like ``token_ids`` holding ``hash % n_columns``.
+        """
+        _, T = token_ids.shape
+        h = token_ids * self.primes[0]
+        if T > 1:
+            # torch.roll returns a new tensor, so zeroing the wrapped-around
+            # position does not modify the caller's token_ids.
+            prev1 = torch.roll(token_ids, 1, dims=1)
+            prev1[:, 0] = 0
+            h = h ^ (prev1 * self.primes[1])
+        if T > 2:
+            prev2 = torch.roll(token_ids, 2, dims=1)
+            prev2[:, :2] = 0
+            h = h ^ (prev2 * self.primes[2])
+        return h % self.n_columns
+    def _validate_active_indices(self, sdr_active_indices: torch.Tensor, x: torch.Tensor) -> None:
+        """Reject inputs that look like a dense SDR mask rather than index lists.
+        ``x`` is accepted for signature compatibility and is not inspected.
+        Raises:
+            ValueError: for float or bool dtype, for a rank other than 2 or 3,
+                or when the last dimension exceeds 1024 or ``n_columns``.
+        """
+        if torch.is_floating_point(sdr_active_indices) or sdr_active_indices.dtype == torch.bool:
+            raise ValueError("Engram Cantor/SDR routing expects compact active indices, not a dense SDR mask")
+        if sdr_active_indices.dim() not in (2, 3):
+            raise ValueError("compact active indices must have shape (B,T,K) or (B*T,K)")
+        # Dense SDR masks arrive with K ~= n_bits; compact buffers are small
+        # (retina target_active or RealityBridge l0_k). Refuse obviously dense
+        # masks so forced cantor_sdr cannot silently route 0/1 values as offsets.
+        width = sdr_active_indices.shape[-1]
+        if width > 1024 or width > self.n_columns:
+            raise ValueError("Engram Cantor/SDR routing expects compact active indices, not a dense SDR mask")
+    def _cantor_sdr_candidates(
+        self,
+        sdr_active_indices: torch.Tensor,
+        cantor_leaf_ids: torch.Tensor,
+        n_leaves: int,
+    ) -> torch.Tensor:
+        """Map SDR active offsets into each Cantor leaf's Engram column shard.
+        The memory is split into ``n_leaves`` equal shards of
+        ``n_columns // n_leaves`` rows (at least 1). Each SDR index is reduced
+        modulo the shard size and offset by its position's leaf shard start.
+        Indices that are congruent modulo the shard size map to the same row,
+        so the returned candidates may contain duplicates.
+        Returns:
+            Long tensor (B, T, K) of memory row indices in [0, n_columns - 1].
+        """
+        self._validate_active_indices(sdr_active_indices, cantor_leaf_ids)
+        if sdr_active_indices.dim() == 2:
+            # Flattened (B*T, K) input: recover (B, T, K) from the leaf-id shape.
+            B, T = cantor_leaf_ids.shape
+            sdr_active_indices = sdr_active_indices.view(B, T, -1)
+        sdr = sdr_active_indices.to(device=cantor_leaf_ids.device, dtype=torch.long)
+        # Out-of-range leaf ids are clamped into [0, n_leaves - 1].
+        leaves = cantor_leaf_ids.to(dtype=torch.long).clamp(min=0, max=max(0, n_leaves - 1))
+        cols_per_leaf = max(1, self.n_columns // max(1, n_leaves))
+        offsets = sdr.remainder(cols_per_leaf)
+        base = leaves.unsqueeze(-1) * cols_per_leaf
+        # When n_leaves > n_columns the shard start can pass the end of memory.
+        return (base + offsets).clamp(max=self.n_columns - 1)
+    def _flat_retrieve(self, x: torch.Tensor) -> torch.Tensor:
+        """Score ``x`` against every memory row and blend the top-k rows.
+        Returns:
+            (B, T, d_model) softmax-weighted sum of the ``topk_k`` best rows.
+        """
+        scores = x @ self.memory.T
+        topk_vals, topk_idx = scores.topk(self.topk_k, dim=-1)
+        topk_w = torch.softmax(topk_vals, dim=-1)
+        selected_mem = self.memory[topk_idx]
+        return torch.einsum('btk,btkd->btd', topk_w, selected_mem)
+    def _cantor_sdr_retrieve(
+        self,
+        x: torch.Tensor,
+        sdr_active_indices: torch.Tensor,
+        cantor_leaf_ids: torch.Tensor,
+        cantor_n_leaves: int,
+    ) -> torch.Tensor:
+        """Score ``x`` against the routed candidate rows only and blend the top-k.
+        Returns:
+            (B, T, d_model) softmax-weighted sum of the best
+            ``min(topk_k, K)`` candidate rows.
+        """
+        candidates = self._cantor_sdr_candidates(
+            sdr_active_indices,
+            cantor_leaf_ids,
+            n_leaves=cantor_n_leaves,
+        )
+        cand_mem = self.memory[candidates]
+        scores = torch.einsum('btd,btkd->btk', x, cand_mem)
+        k = min(self.topk_k, scores.shape[-1])
+        topk_vals, local_idx = scores.topk(k, dim=-1)
+        topk_w = torch.softmax(topk_vals, dim=-1)
+        # The winning rows are already in cand_mem; pick them from there instead
+        # of gathering from the full memory bank a second time.
+        row_idx = local_idx.unsqueeze(-1).expand(-1, -1, -1, cand_mem.shape[-1])
+        selected_mem = cand_mem.gather(2, row_idx)
+        return torch.einsum('btk,btkd->btd', topk_w, selected_mem)
+    def forward(
+        self,
+        x: torch.Tensor,
+        token_ids: torch.Tensor,
+        sdr_active_indices: torch.Tensor | None = None,
+        cantor_leaf_ids: torch.Tensor | None = None,
+        cantor_n_leaves: int | None = None,
+    ):
+        """Retrieve from memory, gate the result and add it to ``x``.
+        Args:
+            x:                  (B, T, d_model) activations.
+            token_ids:          (B, T) token ids; read only by the Hebbian write.
+            sdr_active_indices: Optional integer index lists, (B, T, K) or (B*T, K).
+            cantor_leaf_ids:    Optional (B, T) leaf id per position.
+            cantor_n_leaves:    Optional total number of leaves.
+        Returns:
+            ``(x + alpha * retrieved, hit_rate)`` where ``alpha`` is the sigmoid
+            gate and ``hit_rate`` is the fraction of gate values above 0.1.
+        """
+        _, _, D = x.shape
+        mode = self.routing_mode
+        routing_inputs_present = (
+            sdr_active_indices is not None
+            and cantor_leaf_ids is not None
+            and cantor_n_leaves is not None
+        )
+        use_cantor = mode in {"cantor_sdr", "auto"} and routing_inputs_present
+        if use_cantor and mode == "auto":
+            # Compare the candidate count with the full-memory scan. An earlier
+            # `(k_active * D) < n_columns` check mixed candidate count with the
+            # feature dimension, so d256/k64 fell back to flat retrieval even
+            # though Cantor/SDR scores only 64 candidates vs 8k-16k columns.
+            use_cantor = sdr_active_indices.shape[-1] < self.n_columns
+        if use_cantor:
+            retrieved = self._cantor_sdr_retrieve(x, sdr_active_indices, cantor_leaf_ids, cantor_n_leaves)
+        else:
+            retrieved = self._flat_retrieve(x)
+        alpha = torch.sigmoid(self.gate(x))
+        if self.training and self.hebbian_boost:
+            # EMA write: move each hashed row towards the activation that hit it.
+            # Done on .data under no_grad so autograd does not track it.
+            with torch.no_grad():
+                indices = self._hash(token_ids)
+                flat_idx = indices.reshape(-1)
+                flat_x = x.detach().reshape(-1, D)
+                mem_dtype = self.memory.data.dtype
+                updates = (
+                    self.hebbian_lr * flat_x
+                    - self.hebbian_lr * self.memory.data[flat_idx]
+                ).to(mem_dtype)
+                self.memory.data.index_add_(0, flat_idx, updates)
+        hit_rate = (alpha.detach() > 0.1).float().mean()
+        return x + alpha * retrieved, hit_rate

overlay/hydra/eval.py CHANGED Viewed

@@ -1,217 +1,210 @@
-"""Evaluation: factual probes + sampled factual English scoring.
-Extracted from train.py (W1 modularization). Semantics unchanged.
-Perf optimizations (eval_perf_fix):
-- Probe mode: single forward per prompt instead of autoregressive gen
-- Batch decode: all GPU work first, all CPU decode after
-- Batched factual probes: single padded forward instead of N sequential
-"""
-from __future__ import annotations
-import os
-import re as _re
-import torch
-from hydra.config import FACTUAL_SAMPLES, FACTUAL_BATCH, FACTUAL_GEN_TOKENS
-# Default to probe mode (1 forward per prompt); set HYDRA_FACTUAL_MODE=gen for
-# the original autoregressive generation path.
-FACTUAL_MODE = os.environ.get("HYDRA_FACTUAL_MODE", "probe")
-FACTUAL_EVAL = [
-    # Hard factual recall — requires specific knowledge memorization
-    ("The capital of France is", ["Paris", "paris"]),
-    ("Water boils at", ["100", "boiling"]),
-    ("The largest planet in our solar system is", ["Jupiter", "jupiter"]),
-    # Easier completions — common collocations / patterns the model may pick up
-    ("Once upon a", ["time"]),
-    ("Hello, my name", ["is", "'s"]),
-    ("The cat sat on the", ["mat", "floor", "rug", "table", "couch", "chair", "ground"]),
-    ("She opened the door and", ["walked", "saw", "found", "stepped", "looked", "went", "ran"]),
-    # Original hard ones kept for completeness
-    ("The speed of light is approximately", ["299", "300", "186,000", "light speed"]),
-    ("Two plus two equals", ["4", "four"]),
-]
-_FACTUAL_PROBES = [
-    "The capital of France is",
-    "Water boils at",
-    "The largest planet in our solar system is",
-    "The speed of light is approximately",
-    "Shakespeare wrote",
-]
-def run_factual_probes(model, tokenizer, device, autocast_ctx) -> None:
-    """Top-5 next-token predictions for canonical factual prompts.
-    Batched: pads all prompts into a single forward pass instead of N
-    sequential passes.
-    """
-    print("\n--- Factual Probes ---")
-    model.eval()
-    # Process probes one at a time to avoid cooperative launch limit
-    # (batched forward with B=len(probes) can exceed SM residency cap).
-    for prompt_text in _FACTUAL_PROBES:
-        ids = tokenizer.encode(prompt_text)
-        x = torch.tensor([ids], device=device)
-        with torch.no_grad(), autocast_ctx:
-            logits = model(x)
-        probs = torch.softmax(logits[0, -1].float(), dim=-1)
-        top5 = torch.topk(probs, 5)
-        completions = [tokenizer.decode([idx.item()]) for idx in top5.indices]
-        probs_list = [f"{p:.4f}" for p in top5.values[:3].tolist()]
-        print(f'  "{prompt_text}" -> {completions[:3]} (p={probs_list})')
-    print("--- End Factual Probes ---\n")
-# ---------------------------------------------------------------------------
-# Probe mode: single forward per prompt (Fix D)
-# ---------------------------------------------------------------------------
-def _run_factual_english_probe(model, tokenizer, max_seq_len: int):
-    """Fast probe mode: for each (prompt, answers), encode prompt + each answer
-    candidate as a single sequence, do ONE forward pass, and check if the model's
-    argmax at the last prompt token matches the first answer token.
-    Falls back to checking top-K predictions to be generous (same as gen mode
-    which samples multiple temperatures).
-    """
-    print("---")
-    print("factual_english_samples: (probe mode)")
-    model.eval()
-    hits = 0
-    with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
-        for prompt, answers in FACTUAL_EVAL:
-            prompt_ids = tokenizer.encode(prompt)
-            prompt_len = len(prompt_ids)
-            x = torch.tensor([prompt_ids], device="cuda", dtype=torch.long)
-            logits = model(x, targets=None)
-            # logits shape: [1, seq_len, vocab] or [1, vocab]
-            if logits.dim() == 3:
-                last_logits = logits[0, -1, :]
-            else:
-                last_logits = logits[0]
-            probs = torch.softmax(last_logits.float(), dim=-1)
-            # Check top-K predictions (generous: K=20 to match multi-sample gen)
-            top_k = min(20, probs.shape[-1])
-            top_ids = torch.topk(probs, top_k).indices.tolist()
-            top_tokens = [tokenizer.decode([tid]).strip().lower() for tid in top_ids]
-            answers_lower = [a.lower() for a in answers]
-            any_hit = any(
-                any(a in tok for a in answers_lower)
-                for tok in top_tokens
-            )
-            if any_hit:
-                hits += 1
-            best_completion = tokenizer.decode([top_ids[0]])
-            print(f"  prompt: {prompt!r}")
-            print(f"  output: {(prompt + best_completion).replace(chr(10), ' ')!r}")
-            print(f"  hit:    {any_hit} (probe top-{top_k})")
-    score = hits / len(FACTUAL_EVAL)
-    print("---")
-    print(f"factual_english_score: {score:.4f}")
-    print(f"factual_english_hits:  {hits}/{len(FACTUAL_EVAL)}")
-    return score, hits, len(FACTUAL_EVAL)
-# ---------------------------------------------------------------------------
-# Gen mode: original autoregressive path (Fix F: batch decode)
-# ---------------------------------------------------------------------------
-def _run_factual_english_gen(model, tokenizer, max_seq_len: int):
-    """Original autoregressive generation path with batch decode optimization:
-    all GPU work runs first, then all CPU decoding happens after."""
-    print("---")
-    print("factual_english_samples: (gen mode)")
-    model.eval()
-    num_samples = FACTUAL_SAMPLES
-    batch = FACTUAL_BATCH
-    gen_tokens = FACTUAL_GEN_TOKENS
-    # Optional fast incremental decode path for recurrence-capable backbones.
-    # If disabled, we preserve the original full-context re-forward behavior.
-    incremental_decode = os.environ.get("HYDRA_FACTUAL_GEN_INCREMENTAL", "1") == "1"
-    temps = [0.7, 0.9, 1.1]
-    hits = 0
-    with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
-        for prompt, answers in FACTUAL_EVAL:
-            ids = tokenizer.encode(prompt)
-            answers_lower = [a.lower() for a in answers]
-            # Collect all generated token sequences on GPU first
-            all_rows: list[list[int]] = []
-            samples_done = 0
-            batch_idx = 0
-            while samples_done < num_samples:
-                b = min(batch, num_samples - samples_done)
-                temp = temps[batch_idx % len(temps)]
-                batch_idx += 1
-                ctx = torch.tensor([ids] * b, device="cuda", dtype=torch.long)
-                logits = model(ctx, targets=None)
-                for _ in range(gen_tokens):
-                    next_logits = logits[:, -1, :] if logits.dim() == 3 else logits
-                    probs = torch.softmax(next_logits.float() / temp, dim=-1)
-                    next_id = torch.multinomial(probs, num_samples=1)
-                    ctx = torch.cat([ctx, next_id], dim=1)
-                    if ctx.size(1) >= max_seq_len:
-                        break
-                    if incremental_decode:
-                        logits = model(ctx[:, -1:], targets=None)
-                    else:
-                        logits = model(ctx, targets=None)
-                # Transfer to CPU in one shot, no per-row sync
-                all_rows.extend(ctx.cpu().tolist())
-                samples_done += b
-            # CPU-side batch decode — no GPU sync between decodes
-            any_hit = False
-            first_gen = None
-            hit_gen = None
-            for row in all_rows:
-                generated = tokenizer.decode(row)
-                continuation = generated[len(prompt):].strip()
-                _words = set(w.lower() for w in _re.findall(r"\b[\w'-]+\b", continuation))
-                hit = any(a in _words for a in answers_lower)
-                if first_gen is None:
-                    first_gen = generated
-                if hit:
-                    any_hit = True
-                    if hit_gen is None:
-                        hit_gen = generated
-            if any_hit:
-                hits += 1
-            print(f"  prompt: {prompt!r}")
-            print(f"  output: {(first_gen or '').replace(chr(10), ' ')!r}")
-            print(f"  hit:    {any_hit} (any of {num_samples} samples, temps={temps}, gen={gen_tokens}tok)")
-            if hit_gen is not None and hit_gen != first_gen:
-                print(f"  hit_sample: {hit_gen.replace(chr(10), ' ')!r}")
-    score = hits / len(FACTUAL_EVAL)
-    print("---")
-    print(f"factual_english_score: {score:.4f}")
-    print(f"factual_english_hits:  {hits}/{len(FACTUAL_EVAL)}")
-    return score, hits, len(FACTUAL_EVAL)
-# ---------------------------------------------------------------------------
-# Public entry point
-# ---------------------------------------------------------------------------
-def run_factual_english(model, tokenizer, max_seq_len: int):
-    """Dispatch to probe (fast, default) or gen (original) mode.
-    Set HYDRA_FACTUAL_MODE=gen to use the autoregressive path.
-    """
-    if FACTUAL_MODE == "gen":
-        return _run_factual_english_gen(model, tokenizer, max_seq_len)
-    return _run_factual_english_probe(model, tokenizer, max_seq_len)

+"""Evaluation: factual probes + sampled factual English scoring.
+Extracted from train.py (W1 modularization). Semantics unchanged.
+Perf optimizations (eval_perf_fix):
+- Probe mode: single forward per prompt instead of autoregressive gen
+- Batch decode: all GPU work first, all CPU decode after
+- Factual probes: one forward per prompt (a batched forward can exceed the cooperative launch limit)
+"""
+from __future__ import annotations
+import os
+import re as _re
+import torch
+from hydra.config import FACTUAL_SAMPLES, FACTUAL_BATCH, FACTUAL_GEN_TOKENS
+# Default to probe mode (1 forward per prompt); set HYDRA_FACTUAL_MODE=gen for
+# the original autoregressive generation path.
+FACTUAL_MODE = os.environ.get("HYDRA_FACTUAL_MODE", "probe")
+# (prompt, accepted answers) pairs scored by the factual-English evaluation.
+# Both scoring modes lower-case the answers before matching, and a prompt
+# counts as one hit as soon as any of its accepted answers matches.
+FACTUAL_EVAL = [
+    # Hard factual recall — requires specific knowledge memorization
+    ("The capital of France is", ["Paris", "paris"]),
+    ("Water boils at", ["100", "boiling"]),
+    ("The largest planet in our solar system is", ["Jupiter", "jupiter"]),
+    # Easier completions — common collocations / patterns the model may pick up
+    ("Once upon a", ["time"]),
+    ("Hello, my name", ["is", "'s"]),
+    ("The cat sat on the", ["mat", "floor", "rug", "table", "couch", "chair", "ground"]),
+    ("She opened the door and", ["walked", "saw", "found", "stepped", "looked", "went", "ran"]),
+    # Original hard ones kept for completeness
+    ("The speed of light is approximately", ["299", "300", "186,000", "light speed"]),
+    ("Two plus two equals", ["4", "four"]),
+]
+# Prompts whose top next-token predictions are printed by run_factual_probes.
+# These are display-only: no answers are attached and nothing is scored.
+_FACTUAL_PROBES = [
+    "The capital of France is",
+    "Water boils at",
+    "The largest planet in our solar system is",
+    "The speed of light is approximately",
+    "Shakespeare wrote",
+]
+def run_factual_probes(model, tokenizer, device, autocast_ctx) -> None:
+    """Print the top-3 next-token predictions for each canonical factual prompt.
+    Prompts are run one at a time (batch size 1), not as a padded batch: a
+    batched forward with B=len(probes) can exceed the cooperative launch limit.
+    Puts the model in eval mode and does not restore the previous mode.
+    Args:
+        model:        Callable mapping a (1, T) id tensor to logits shaped
+                      [1, T, V] or [1, V].
+        tokenizer:    Object providing ``encode(str)`` and ``decode(list[int])``.
+        device:       Device on which the prompt tensor is created.
+        autocast_ctx: Context manager entered around every forward pass.
+    """
+    print("\n--- Factual Probes ---")
+    model.eval()
+    for prompt_text in _FACTUAL_PROBES:
+        ids = tokenizer.encode(prompt_text)
+        x = torch.tensor([ids], device=device)
+        with torch.no_grad(), autocast_ctx:
+            logits = model(x)
+        # Accept [1, T, V] as well as [1, V], like _run_factual_english_probe.
+        last_logits = logits[0, -1] if logits.dim() == 3 else logits[0]
+        probs = torch.softmax(last_logits.float(), dim=-1)
+        # Cap k so a vocabulary smaller than 5 does not make topk raise.
+        top5 = torch.topk(probs, min(5, probs.shape[-1]))
+        # Only three entries are printed: decode just those, and move them to
+        # the host with one tolist() instead of one .item() sync per token.
+        shown_ids = top5.indices[:3].tolist()
+        completions = [tokenizer.decode([tid]) for tid in shown_ids]
+        probs_list = [f"{p:.4f}" for p in top5.values[:3].tolist()]
+        print(f'  "{prompt_text}" -> {completions} (p={probs_list})')
+    print("--- End Factual Probes ---\n")
+# ---------------------------------------------------------------------------
+# Probe mode: single forward per prompt (Fix D)
+# ---------------------------------------------------------------------------
+def _run_factual_english_probe(model, tokenizer, max_seq_len: int):
+    """Fast probe mode: one forward pass per prompt, scored on the next token.
+    For each (prompt, answers) pair only the prompt is encoded and run through
+    the model once. The prompt counts as a hit when any of the top-K (K=20,
+    capped at the vocabulary size) next-token candidates, decoded, stripped and
+    lower-cased, contains one of the lower-cased answers as a substring. The
+    wide K is meant to be as generous as gen mode, which samples several
+    temperatures.
+    Runs on "cuda" under bf16 autocast. ``max_seq_len`` is accepted for
+    signature parity with gen mode and is not used.
+    Returns:
+        (score, hits, total): hits / total, the hit count and the prompt count.
+    """
+    print("---")
+    print("factual_english_samples: (probe mode)")
+    model.eval()
+    hits = 0
+    with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        for prompt, answers in FACTUAL_EVAL:
+            prompt_ids = tokenizer.encode(prompt)
+            x = torch.tensor([prompt_ids], device="cuda", dtype=torch.long)
+            logits = model(x, targets=None)
+            # logits shape: [1, seq_len, vocab] or [1, vocab]
+            if logits.dim() == 3:
+                last_logits = logits[0, -1, :]
+            else:
+                last_logits = logits[0]
+            probs = torch.softmax(last_logits.float(), dim=-1)
+            # Check top-K predictions (generous: K=20 to match multi-sample gen)
+            top_k = min(20, probs.shape[-1])
+            top_ids = torch.topk(probs, top_k).indices.tolist()
+            top_tokens = [tokenizer.decode([tid]).strip().lower() for tid in top_ids]
+            answers_lower = [a.lower() for a in answers]
+            # Substring match: the decoded token must contain the whole answer.
+            any_hit = any(
+                any(a in tok for a in answers_lower)
+                for tok in top_tokens
+            )
+            if any_hit:
+                hits += 1
+            best_completion = tokenizer.decode([top_ids[0]])
+            print(f"  prompt: {prompt!r}")
+            print(f"  output: {(prompt + best_completion).replace(chr(10), ' ')!r}")
+            print(f"  hit:    {any_hit} (probe top-{top_k})")
+    score = hits / len(FACTUAL_EVAL)
+    print("---")
+    print(f"factual_english_score: {score:.4f}")
+    print(f"factual_english_hits:  {hits}/{len(FACTUAL_EVAL)}")
+    return score, hits, len(FACTUAL_EVAL)
+# ---------------------------------------------------------------------------
+# Gen mode: original autoregressive path (Fix F: batch decode)
+# ---------------------------------------------------------------------------
+def _answer_hit(continuation: str, answers_lower: list[str]) -> bool:
+    """Return True when any accepted (lower-cased) answer occurs in ``continuation``.
+    Answers that form a single regex word are matched as whole words. Answers
+    the word regex can never produce as one match (they contain a space or a
+    comma, or start with an apostrophe, e.g. "light speed", "186,000", "'s")
+    are matched as substrings instead; a whole-word lookup can never find them.
+    """
+    word_pattern = r"\b[\w'-]+\b"
+    words = set(w.lower() for w in _re.findall(word_pattern, continuation))
+    lowered = continuation.lower()
+    for answer in answers_lower:
+        if answer in words:
+            return True
+        if _re.fullmatch(word_pattern, answer) is None and answer in lowered:
+            return True
+    return False
+def _run_factual_english_gen(model, tokenizer, max_seq_len: int):
+    """Autoregressive generation mode with batched decode.
+    For every prompt, FACTUAL_SAMPLES continuations are sampled in batches of
+    FACTUAL_BATCH, cycling the temperature per batch through 0.7 / 0.9 / 1.1.
+    Each step re-runs the model on the full context and stops after
+    FACTUAL_GEN_TOKENS new tokens or once the context reaches ``max_seq_len``.
+    All GPU work for a prompt runs first; decoding and matching happen on the
+    CPU afterwards. A prompt is a hit when any sample contains an answer.
+    Runs on "cuda" under bf16 autocast.
+    Returns:
+        (score, hits, total): hits / total, the hit count and the prompt count.
+    """
+    print("---")
+    print("factual_english_samples: (gen mode)")
+    model.eval()
+    num_samples = FACTUAL_SAMPLES
+    batch = FACTUAL_BATCH
+    gen_tokens = FACTUAL_GEN_TOKENS
+    temps = [0.7, 0.9, 1.1]
+    hits = 0
+    with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        for prompt, answers in FACTUAL_EVAL:
+            ids = tokenizer.encode(prompt)
+            answers_lower = [a.lower() for a in answers]
+            # Collect all generated token sequences on GPU first
+            all_rows: list[list[int]] = []
+            samples_done = 0
+            batch_idx = 0
+            while samples_done < num_samples:
+                b = min(batch, num_samples - samples_done)
+                temp = temps[batch_idx % len(temps)]
+                batch_idx += 1
+                ctx = torch.tensor([ids] * b, device="cuda", dtype=torch.long)
+                for _ in range(gen_tokens):
+                    logits = model(ctx, targets=None)
+                    next_logits = logits[:, -1, :] if logits.dim() == 3 else logits
+                    probs = torch.softmax(next_logits.float() / temp, dim=-1)
+                    next_id = torch.multinomial(probs, num_samples=1)
+                    ctx = torch.cat([ctx, next_id], dim=1)
+                    if ctx.size(1) >= max_seq_len:
+                        break
+                # Transfer to CPU in one shot, no per-row sync
+                all_rows.extend(ctx.cpu().tolist())
+                samples_done += b
+            # CPU-side batch decode — no GPU sync between decodes
+            any_hit = False
+            first_gen = None
+            hit_gen = None
+            for row in all_rows:
+                generated = tokenizer.decode(row)
+                # Drops the first len(prompt) characters, i.e. assumes the
+                # decoded text starts with the prompt verbatim.
+                continuation = generated[len(prompt):].strip()
+                hit = _answer_hit(continuation, answers_lower)
+                if first_gen is None:
+                    first_gen = generated
+                if hit:
+                    any_hit = True
+                    if hit_gen is None:
+                        hit_gen = generated
+            if any_hit:
+                hits += 1
+            print(f"  prompt: {prompt!r}")
+            print(f"  output: {(first_gen or '').replace(chr(10), ' ')!r}")
+            print(f"  hit:    {any_hit} (any of {num_samples} samples, temps={temps}, gen={gen_tokens}tok)")
+            if hit_gen is not None and hit_gen != first_gen:
+                print(f"  hit_sample: {hit_gen.replace(chr(10), ' ')!r}")
+    score = hits / len(FACTUAL_EVAL)
+    print("---")
+    print(f"factual_english_score: {score:.4f}")
+    print(f"factual_english_hits:  {hits}/{len(FACTUAL_EVAL)}")
+    return score, hits, len(FACTUAL_EVAL)
+# ---------------------------------------------------------------------------
+# Public entry point
+# ---------------------------------------------------------------------------
+def run_factual_english(model, tokenizer, max_seq_len: int):
+    """Run the factual-English evaluation in the configured mode.
+    HYDRA_FACTUAL_MODE=gen selects the autoregressive path; any other value
+    (including the default, "probe") selects the single-forward probe path.
+    Returns whatever the selected runner returns.
+    """
+    runner = (
+        _run_factual_english_gen
+        if FACTUAL_MODE == "gen"
+        else _run_factual_english_probe
+    )
+    return runner(model, tokenizer, max_seq_len)

overlay/hydra/gdn_block.py CHANGED Viewed

@@ -1,126 +1,126 @@
-"""GDNBlock — Gated Delta Net block, drop-in shape-compatible with Mamba3Block and HyenaBlock.
-GatedDeltaNet (GDN) reference: arXiv:2412.06464 (ICLR 2025, NVLabs).
-Implementation: flash-linear-attention (fla) library, Triton kernels, sm86-compatible.
-Interface contract (MUST match how Mamba3/Hyena are called in hydra/model.py):
-    block = GDNBlock(d_model, ...)
-    y = block(x)    # x: [B, T, d_model]  ->  y: [B, T, d_model]
-The surrounding mHC layer does NOT pre-norm before calling this block (the
-raw hidden state is passed in); the block itself applies no input normalization,
-same as HyenaBlock.  We return the raw operator output; the mHC layer adds it
-as a residual stream contribution.
-NO attention, NO softmax-over-sequence-dim.  All state is stateless between
-.forward() calls by default (use_cache=False, past_key_values=None).
-"""
-from __future__ import annotations
-try:
-    from fla.layers.gated_deltanet import GatedDeltaNet as _GatedDeltaNet
-except ImportError as _fla_err:
-    raise ImportError(
-        "flash-linear-attention (fla) is required for GDNBlock but could not be imported. "
-        "Install it with:\n"
-        "    pip install flash-linear-attention\n"
-        "or from source:\n"
-        "    pip install git+https://github.com/fla-org/flash-linear-attention.git\n"
-        f"Original error: {_fla_err}"
-    ) from _fla_err
-import torch
-import torch.nn as nn
-class GDNBlock(nn.Module):
-    """Gated Delta Net block, drop-in shape-compatible with HYDRA's Mamba3Block and HyenaBlock.
-    Wraps `fla.layers.GatedDeltaNet` with the same external API that
-    `hydra.hyena_block.HyenaBlock` exposes:
-        forward(x: Tensor[B, T, d_model]) -> Tensor[B, T, d_model]
-    Internal GatedDeltaNet.forward returns a 3-tuple
-    (hidden_states, attn_weights, past_key_values); we extract [0] and
-    return only the hidden states, keeping the residual stream unchanged.
-    GDN outperforms Mamba-2 on in-context retrieval benchmarks (MQAR, etc.)
-    at equal or faster compute, making it a targeted fix for HYDRA's factual
-    plateau.
-    Parameter counts are deliberately kept within 2x of a Mamba3 block at the
-    same d_model/n_heads to be drop-in affordable.
-    """
-    def __init__(
-        self,
-        d_model: int,
-        n_heads: int = 6,
-        mode: str = "chunk",       # 'chunk' for training, 'fused_recurrent' for inference
-        expand_v: float = 2.0,     # value-projection expansion; controls KV memory
-        use_short_conv: bool = True,
-        conv_size: int = 4,
-    ):
-        super().__init__()
-        self.d_model = d_model
-        self.n_heads = n_heads
-        self.mode = mode
-        # head_dim must divide d_model.  GDN uses separate q/k head_dim from v;
-        # we set head_dim for q/k such that n_heads * head_dim == d_model.
-        if d_model % n_heads != 0:
-            raise ValueError(
-                f"d_model={d_model} must be divisible by n_heads={n_heads} "
-                "so that head_dim = d_model // n_heads is an integer."
-            )
-        head_dim = d_model // n_heads
-        self.gdn = _GatedDeltaNet(
-            hidden_size=d_model,
-            expand_v=expand_v,
-            head_dim=head_dim,
-            num_heads=n_heads,
-            mode=mode,
-            use_gate=True,          # gating is the key architectural feature of GDN
-            use_short_conv=use_short_conv,
-            conv_size=conv_size,
-            layer_idx=None,         # no KV-cache layer indexing; we manage state ourselves
-        )
-    # ------------------------------------------------------------------
-    # Forward
-    # ------------------------------------------------------------------
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """x: [B, T, d_model]  ->  y: [B, T, d_model].
-        Passes through GatedDeltaNet with use_cache=False so no recurrent
-        state leaks between independent forward() calls (important for
-        gradient-accumulation loops and eval).
-        """
-        # GatedDeltaNet.forward signature:
-        #   (hidden_states, attention_mask=None, past_key_values=None,
-        #    use_cache=False, output_attentions=False)
-        # Returns: tuple(hidden_states, attn_weights|None, past_kv|None)
-        out, _, _ = self.gdn(
-            hidden_states=x,
-            attention_mask=None,
-            past_key_values=None,
-            use_cache=False,
-            output_attentions=False,
-        )
-        return out
-    # ------------------------------------------------------------------
-    # API parity with HyenaBlock and Mamba3Block
-    # ------------------------------------------------------------------
-    def invalidate_caches(self) -> None:
-        """No-op — GDNBlock holds no persistent filter cache.
-        Provided for API parity with HyenaBlock, which invalidates its
-        Hyena filter cache here.  Calling this is always safe.
-        """
-        pass

+"""GDNBlock — Gated Delta Net block, drop-in shape-compatible with Mamba3Block and HyenaBlock.
+GatedDeltaNet (GDN) reference: arXiv:2412.06464 (ICLR 2025, NVLabs).
+Implementation: flash-linear-attention (fla) library, Triton kernels, sm86-compatible.
+Interface contract (MUST match how Mamba3/Hyena are called in hydra/model.py):
+    block = GDNBlock(d_model, ...)
+    y = block(x)    # x: [B, T, d_model]  ->  y: [B, T, d_model]
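+The surrounding mHC layer does NOT pre-norm before calling this block (the
+raw hidden state is passed in); the block itself applies no input normalization,
+same as HyenaBlock.  We return the raw operator output; the mHC layer adds it
+as a residual stream contribution.
+NOTE(review): hydra/hyena_block.py states that the mHC layer DOES pre-norm
+before calling its block, so the two module docstrings disagree on this point;
+confirm the actual behavior against hydra/model.py.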
+NO attention, NO softmax-over-sequence-dim.  All state is stateless between
+.forward() calls by default (use_cache=False, past_key_values=None).
+"""
+from __future__ import annotations
+try:
+    from fla.layers.gated_deltanet import GatedDeltaNet as _GatedDeltaNet
+except ImportError as _fla_err:
+    raise ImportError(
+        "flash-linear-attention (fla) is required for GDNBlock but could not be imported. "
+        "Install it with:\n"
+        "    pip install flash-linear-attention\n"
+        "or from source:\n"
+        "    pip install git+https://github.com/fla-org/flash-linear-attention.git\n"
+        f"Original error: {_fla_err}"
+    ) from _fla_err
+import torch
+import torch.nn as nn
+class GDNBlock(nn.Module):
+    """Gated Delta Net block, drop-in shape-compatible with HYDRA's Mamba3Block and HyenaBlock.
+    Wraps `fla.layers.GatedDeltaNet` with the same external API that
+    `hydra.hyena_block.HyenaBlock` exposes:
+        forward(x: Tensor[B, T, d_model]) -> Tensor[B, T, d_model]
+    Internal GatedDeltaNet.forward returns a 3-tuple
+    (hidden_states, attn_weights, past_key_values); we extract [0] and
+    return only the hidden states, keeping the residual stream unchanged.
+    GDN outperforms Mamba-2 on in-context retrieval benchmarks (MQAR, etc.)
+    at equal or faster compute, making it a targeted fix for HYDRA's factual
+    plateau.
+    Parameter counts are deliberately kept within 2x of a Mamba3 block at the
+    same d_model/n_heads to be drop-in affordable.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        n_heads: int = 6,
+        mode: str = "chunk",       # 'chunk' for training, 'fused_recurrent' for inference
+        expand_v: float = 2.0,     # value-projection expansion; controls KV memory
+        use_short_conv: bool = True,
+        conv_size: int = 4,
+    ):
+        super().__init__()
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.mode = mode
+        # head_dim must divide d_model.  GDN uses separate q/k head_dim from v;
+        # we set head_dim for q/k such that n_heads * head_dim == d_model.
+        if d_model % n_heads != 0:
+            raise ValueError(
+                f"d_model={d_model} must be divisible by n_heads={n_heads} "
+                "so that head_dim = d_model // n_heads is an integer."
+            )
+        head_dim = d_model // n_heads
+        self.gdn = _GatedDeltaNet(
+            hidden_size=d_model,
+            expand_v=expand_v,
+            head_dim=head_dim,
+            num_heads=n_heads,
+            mode=mode,
+            use_gate=True,          # gating is the key architectural feature of GDN
+            use_short_conv=use_short_conv,
+            conv_size=conv_size,
+            layer_idx=None,         # no KV-cache layer indexing; we manage state ourselves
+        )
+    # ------------------------------------------------------------------
+    # Forward
+    # ------------------------------------------------------------------
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """x: [B, T, d_model]  ->  y: [B, T, d_model].
+        Passes through GatedDeltaNet with use_cache=False so no recurrent
+        state leaks between independent forward() calls (important for
+        gradient-accumulation loops and eval).
+        """
+        # GatedDeltaNet.forward signature:
+        #   (hidden_states, attention_mask=None, past_key_values=None,
+        #    use_cache=False, output_attentions=False)
+        # Returns: tuple(hidden_states, attn_weights|None, past_kv|None)
+        out, _, _ = self.gdn(
+            hidden_states=x,
+            attention_mask=None,
+            past_key_values=None,
+            use_cache=False,
+            output_attentions=False,
+        )
+        return out
+    # ------------------------------------------------------------------
+    # API parity with HyenaBlock and Mamba3Block
+    # ------------------------------------------------------------------
+    def invalidate_caches(self) -> None:
+        """No-op — GDNBlock holds no persistent filter cache.
+        Provided for API parity with HyenaBlock, which invalidates its
+        Hyena filter cache here.  Calling this is always safe.
+        """
+        pass

overlay/hydra/hyena_block.py CHANGED Viewed

@@ -1,68 +1,68 @@
-"""HyenaBlock — drop-in block for HYDRA, supplement to Mamba3.
-Wraps `subsystems.hyena_pure.HyenaOperator` with a pre-norm + residual scheme
-consistent with how the mHC stack wraps Mamba3 in `hydra/model.py`.
-Interface contract (MUST match how Mamba3 is called in model.py):
-    block = HyenaBlock(d_model, seq_len)
-    y = block(x)   # x: [B, T, d_model]  ->  y: [B, T, d_model]
-The surrounding mHC layer does the pre-norm (`norm(h)`) BEFORE calling the
-block, so the block itself should NOT re-normalize at input — same as Mamba3
-in the current model. We return the raw operator output; the mHC layer then
-adds it as a residual stream contribution.
-NO attention, NO softmax-over-sequence-dim, NO KV-cache. All forbidden
-imports enumerated in tests/test_hyena.py (test #7) are absent.
-"""
-from __future__ import annotations
-import os
-import torch
-import torch.nn as nn
-from subsystems.hyena_pure import HyenaOperator
-class HyenaBlock(nn.Module):
-    """Single Hyena block, shape-compatible with Mamba3 in HYDRA."""
-    def __init__(
-        self,
-        d_model: int,
-        seq_len: int,
-        order: int | None = None,
-        filter_order: int | None = None,
-        dropout: float = 0.0,
-        filter_dropout: float = 0.0,
-        short_filter_order: int = 3,
-        activation: str = "id",
-    ):
-        super().__init__()
-        # Env overrides (documented in hydra/config.py).
-        if order is None:
-            order = int(os.environ.get("HYDRA_HYENA_ORDER", "2"))
-        if filter_order is None:
-            filter_order = int(os.environ.get("HYDRA_HYENA_FILTER_DIM", "64"))
-        self.d_model = d_model
-        self.seq_len = seq_len
-        self.order = order
-        self.filter_order = filter_order
-        self.operator = HyenaOperator(
-            d_model=d_model,
-            l_max=seq_len,
-            order=order,
-            filter_order=filter_order,
-            dropout=dropout,
-            filter_dropout=filter_dropout,
-            short_filter_order=short_filter_order,
-            activation=activation,
-        )
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """x: [B, T, d_model]  ->  y: [B, T, d_model]."""
-        return self.operator(x)

+"""HyenaBlock — drop-in block for HYDRA, supplement to Mamba3.
+Wraps `subsystems.hyena_pure.HyenaOperator` as a bare operator call: the block
+adds no normalization and no residual of its own, leaving both to the mHC stack,
+consistent with how that stack wraps Mamba3 in `hydra/model.py`.
+Interface contract (MUST match how Mamba3 is called in model.py):
+    block = HyenaBlock(d_model, seq_len)
+    y = block(x)   # x: [B, T, d_model]  ->  y: [B, T, d_model]
+The surrounding mHC layer does the pre-norm (`norm(h)`) BEFORE calling the
+block, so the block itself should NOT re-normalize at input — same as Mamba3
+in the current model. We return the raw operator output; the mHC layer then
+adds it as a residual stream contribution.
+NO attention, NO softmax-over-sequence-dim, NO KV-cache. All forbidden
+imports enumerated in tests/test_hyena.py (test #7) are absent.
+"""
+from __future__ import annotations
+import os
+import torch
+import torch.nn as nn
+from subsystems.hyena_pure import HyenaOperator
+class HyenaBlock(nn.Module):
+    """Single Hyena block, shape-compatible with Mamba3 in HYDRA."""
+    def __init__(
+        self,
+        d_model: int,
+        seq_len: int,
+        order: int | None = None,
+        filter_order: int | None = None,
+        dropout: float = 0.0,
+        filter_dropout: float = 0.0,
+        short_filter_order: int = 3,
+        activation: str = "id",
+    ):
+        super().__init__()
+        # Env overrides (documented in hydra/config.py).
+        if order is None:
+            order = int(os.environ.get("HYDRA_HYENA_ORDER", "2"))
+        if filter_order is None:
+            filter_order = int(os.environ.get("HYDRA_HYENA_FILTER_DIM", "64"))
+        self.d_model = d_model
+        self.seq_len = seq_len
+        self.order = order
+        self.filter_order = filter_order
+        self.operator = HyenaOperator(
+            d_model=d_model,
+            l_max=seq_len,
+            order=order,
+            filter_order=filter_order,
+            dropout=dropout,
+            filter_dropout=filter_dropout,
+            short_filter_order=short_filter_order,
+            activation=activation,
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """x: [B, T, d_model]  ->  y: [B, T, d_model]."""
+        return self.operator(x)

overlay/hydra/lightning_module.py CHANGED Viewed

@@ -1,326 +1,326 @@
-"""LightningModule wrapping PostSemClawModel.
-Thin adapter. The model and the MuonAdamW optimizer are unchanged. This
-module implements:
-  • configure_optimizers — returns the existing MuonAdamW (subclass of
-    torch.optim.Optimizer) built by model.setup_optimizer. Lightning accepts
-    this directly.
-  • training_step — splits (B, T+1) batches into (x, y), forwards through
-    the model, logs loss / bpb / tps / mfu / vram. Preserves the
-    sampled-softmax path inside PostSemClawModel (no changes there).
-  • optimizer_step — before each step we update LR + muon momentum + WD
-    using the same time-progress schedule as hydra/training.py
-    (get_lr_multiplier / get_muon_momentum / get_weight_decay). Lightning
-    handles grad accumulation via Trainer(accumulate_grad_batches=N).
-The SDR SOM update and Hestia QAT snap are called at the same cadence as
-the legacy loop, but inline on the main thread (Lightning provides its own
-callbacks for async work if we need to extract them later — keeping it
-simple for now).
-Env vars respected:
-  HYDRA_TIME_BUDGET          — wall-clock budget (s) used for LR schedule
-                                and as Trainer max_time
-  HYDRA_HESTIA_INTERVAL      — steps between Hestia snaps (default 100)
-  HYDRA_BATCH_SIZE           — device batch size (for throughput calc)
-  HYDRA_SEQ_LEN              — sequence length (for throughput calc)
-"""
-from __future__ import annotations
-import math
-import os
-import time
-import torch
-import lightning as L
-from hydra.config import (
-    ADAM_BETAS,
-    EMBEDDING_LR,
-    FINAL_LR_FRAC,
-    GPU_BF16_PEAK_FLOPS,
-    MATRIX_LR,
-    SCALAR_LR,
-    UNEMBEDDING_LR,
-    WARMUP_RATIO,
-    WEIGHT_DECAY,
-    PostSemClawConfig,
-)
-from hydra.model import PostSemClawModel
-# ---------------------------------------------------------------------------
-# LR / momentum / wd schedules — verbatim copy of hydra/training.py so the
-# curves match exactly. Kept here to avoid import cycles.
-# ---------------------------------------------------------------------------
-def _lr_multiplier(progress: float) -> float:
-    if progress < WARMUP_RATIO:
-        return progress / WARMUP_RATIO if WARMUP_RATIO > 0 else 1.0
-    decay_progress = (progress - WARMUP_RATIO) / max(1.0 - WARMUP_RATIO, 1e-9)
-    return FINAL_LR_FRAC + 0.5 * (1.0 - FINAL_LR_FRAC) * (
-        1 + math.cos(math.pi * decay_progress)
-    )
-def _muon_momentum(step: int) -> float:
-    frac = min(step / 300.0, 1.0)
-    return (1 - frac) * 0.85 + frac * 0.95
-def _weight_decay(progress: float) -> float:
-    return WEIGHT_DECAY * (1 - progress)
-# ---------------------------------------------------------------------------
-class HydraLightningModule(L.LightningModule):
-    """Lightning wrapper. Public attrs: self.model, self.config."""
-    def __init__(self, config: PostSemClawConfig):
-        super().__init__()
-        self.config = config
-        self.model = PostSemClawModel(config)
-        # Model weights init must be deferred to the correct device; done by
-        # caller after construction (to match the meta-device + to_empty()
-        # pattern used in the legacy loop).
-        # Time-based progress tracks the legacy loop's semantics: LR cosine
-        # is driven by wall-clock, not step count. We capture training start
-        # in on_train_start and TIME_BUDGET from env.
-        self.time_budget = float(
-            int(os.environ.get("HYDRA_TIME_BUDGET", "300"))
-        )
-        self._train_start_time: float | None = None
-        self._total_training_time = 0.0
-        self._last_step_end: float | None = None
-        self._hestia_interval = int(os.environ.get("HYDRA_HESTIA_INTERVAL", "100"))
-        self._flops_per_token = 0
-        self._tokens_per_step = 0
-        # Smoothed loss for the header-line log (matches legacy format).
-        self._ema_beta = 0.9
-        self._smooth_loss = 0.0
-        self._bpt_ema = 0.0
-        self._token_bytes: torch.Tensor | None = None
-    # ------------------------------------------------------------------
-    # Lifecycle
-    # ------------------------------------------------------------------
-    def on_train_start(self) -> None:
-        self._train_start_time = time.time()
-        self._last_step_end = self._train_start_time
-        self._flops_per_token = self.model.estimate_flops()
-        # Tokens processed per optimizer step (pre-accum).
-        B = int(os.environ.get("HYDRA_BATCH_SIZE", "1"))
-        T = int(os.environ.get("HYDRA_SEQ_LEN", "512"))
-        self._tokens_per_step = B * T
-        # Build/cache token_bytes LUT (for bits-per-byte live metric).
-        import prepare as _p
-        self._token_bytes = _p.get_token_bytes(device=self.device)
-    def configure_optimizers(self):
-        optimizer = self.model.setup_optimizer(
-            unembedding_lr=UNEMBEDDING_LR,
-            embedding_lr=EMBEDDING_LR,
-            scalar_lr=SCALAR_LR,
-            adam_betas=ADAM_BETAS,
-            matrix_lr=MATRIX_LR,
-            weight_decay=WEIGHT_DECAY,
-        )
-        return optimizer
-    # ------------------------------------------------------------------
-    # Training step. Lightning auto-handles: autocast (via precision flag
-    # on Trainer), backward, grad-accum, zero_grad. We only:
-    #   - split batch into (x, y)
-    #   - forward through model (autocast is established by Trainer)
-    #   - return loss (grads flow from return)
-    # ------------------------------------------------------------------
-    def training_step(self, batch: torch.Tensor, batch_idx: int):
-        # DataLoader produces (B, T+1) rows; split into input/target.
-        # Lightning's default collate already moved batch to self.device via
-        # the accelerator callback when pin_memory=True and device != cpu.
-        if batch.dim() != 2:
-            raise RuntimeError(f"Expected (B, T+1) batch, got shape {tuple(batch.shape)}")
-        x = batch[:, :-1].contiguous()
-        y = batch[:, 1:].contiguous()
-        loss = self.model(x, y)
-        # Lightning applies the grad-accum divisor automatically; we just
-        # return the raw loss. loss.detach() is stored for logging.
-        self._log_step(loss.detach(), y)
-        return loss
-    # ------------------------------------------------------------------
-    # Optimizer step hook: update LR / momentum / WD using time-progress.
-    # Runs once per optimizer step (after all accum micro-batches).
-    # ------------------------------------------------------------------
-    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure):
-        # Update schedules from wall-clock progress.
-        now = time.time()
-        if self._train_start_time is None:
-            self._train_start_time = now
-            self._last_step_end = now
-        progress = min(self._total_training_time / max(self.time_budget, 1.0), 1.0)
-        step = self.global_step
-        lrm = _lr_multiplier(progress)
-        mom = _muon_momentum(step)
-        wd = _weight_decay(progress)
-        for group in optimizer.param_groups:
-            group["lr"] = group["initial_lr"] * lrm
-            if group.get("kind") == "muon":
-                group["momentum"] = mom
-                group["weight_decay"] = wd
-        # Grad clip (matches legacy loop). Lightning provides this via
-        # Trainer(gradient_clip_val=1.0) but we want the exact call-site.
-        torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1.0)
-        # Hyena train-cache: we must flush accumulated micro-batch grads BACK
-        # into the filter MLP params AFTER the accum-backward closure has run
-        # but BEFORE the optimizer actually consumes the grads. Lightning
-        # composes these so the closure runs inside optimizer.step(). We wrap
-        # the closure to insert our flush at the exact right moment.
-        #
-        # Ordering within the wrapped closure:
-        #   1. optimizer_closure() — runs all micro-batch forwards + backwards.
-        #      Each Hyena micro-batch backward accumulates into _k_leaf.grad.
-        #   2. flush_hyena_pending_grads() — one-shot
-        #      torch.autograd.backward(_k_graph, _k_leaf.grad) per HyenaFilter.
-        #      Now filter MLP / pos_emb / bias params have their correct grads.
-        #
-        # No-op when HYDRA_HYENA_TRAIN_CACHE=0 or no Hyena blocks exist.
-        _has_flush = hasattr(self.model, "flush_hyena_pending_grads")
-        if _has_flush:
-            _orig_closure = optimizer_closure
-            def _wrapped_closure():
-                result = _orig_closure()
-                self.model.flush_hyena_pending_grads()
-                return result
-            effective_closure = _wrapped_closure
-        else:
-            effective_closure = optimizer_closure
-        # Run the step (this is what Lightning would have done for us).
-        optimizer.step(closure=effective_closure)
-        self.model.zero_grad(set_to_none=True)
-        # Hyena filter-rfft cache invalidation. No-op if:
-        #   (a) no Hyena layers are in the model, or
-        #   (b) HYDRA_HYENA_FILTER_CACHE=0 and HYDRA_HYENA_TRAIN_CACHE=0
-        #       (the operators never populated either cache)
-        # In either case this is a handful of Python attribute resets.
-        if hasattr(self.model, "invalidate_hyena_caches"):
-            self.model.invalidate_hyena_caches()
-        # Hestia QAT snap every N steps. Temperature anneals every step.
-        progress_now = (now - self._train_start_time) / max(self.time_budget, 1.0)
-        self.model.hestia.anneal_temperature(progress_now)
-        if self._hestia_interval > 0 and step % self._hestia_interval == 0:
-            self.model.hestia.apply_to(self.model)
-        # SDR SOM update when the model stashed an sdr in the last forward.
-        _last_sdr = getattr(self.model, "_last_sdr", None)
-        if _last_sdr is not None and hasattr(self.model.sdr_semantic, "maybe_som_update"):
-            # x from the last training_step is not available here without
-            # captured state; the legacy loop passed (x, _last_sdr). To keep
-            # the interface clean we pass the last batch's x via a buffer.
-            # Since _last_sdr is derived from idx, we reuse self._last_x.
-            if getattr(self, "_last_x", None) is not None:
-                self.model.sdr_semantic.maybe_som_update(self._last_x, _last_sdr)
-        # Advance the wall-clock counter for LR schedule (matches legacy
-        # behavior which incremented only after the first warm-up step).
-        dt = now - (self._last_step_end or now)
-        self._last_step_end = now
-        if step > 10:
-            self._total_training_time += dt
-    # ------------------------------------------------------------------
-    # Logging — mirrors the step=NNNNN line format of the legacy loop so
-    # grep/tee pipelines keep working.
-    # ------------------------------------------------------------------
-    def _log_step(self, loss: torch.Tensor, y: torch.Tensor) -> None:
-        # Stash the current x so optimizer_step can drive SOM update.
-        self._last_x = None  # reset; we will set it below.
-        # We don't have x here (already discarded); emit a None marker that
-        # the SOM hook will silently skip if absent.
-        loss_f = float(loss.item())
-        if not math.isfinite(loss_f) or loss_f > 100:
-            # Let Lightning raise / the trainer callbacks handle this.
-            self.log("train_loss_nan", 1.0)
-            return
-        step = self.global_step
-        self._smooth_loss = (
-            self._ema_beta * self._smooth_loss + (1 - self._ema_beta) * loss_f
-        )
-        debiased = self._smooth_loss / max(1 - self._ema_beta ** (step + 1), 1e-9)
-        dt = max(time.time() - (self._last_step_end or time.time()), 1e-6)
-        tps = int(self._tokens_per_step / dt) if dt > 0 else 0
-        mfu = (
-            100.0
-            * self._flops_per_token
-            * self._tokens_per_step
-            / dt
-            / GPU_BF16_PEAK_FLOPS
-            if dt > 0
-            else 0.0
-        )
-        # bpb live: y flat -> token_bytes LUT -> avg bytes/token
-        bpt = debiased / math.log(2)
-        if self._token_bytes is not None:
-            with torch.no_grad():
-                y_flat = y.reshape(-1)
-                nbytes = self._token_bytes[y_flat]
-                mask = nbytes > 0
-                denom = mask.sum().clamp(min=1).float()
-                avg_bpt = (nbytes.float() * mask.float()).sum() / denom
-                bpt_batch = float(avg_bpt.item())
-            if step == 0 or self._bpt_ema <= 0.0:
-                self._bpt_ema = bpt_batch
-            else:
-                self._bpt_ema = 0.98 * self._bpt_ema + 0.02 * bpt_batch
-        bpb = bpt / max(self._bpt_ema, 1e-6)
-        vram = (
-            torch.cuda.memory_allocated() / 1024 / 1024
-            if torch.cuda.is_available()
-            else 0.0
-        )
-        self.log_dict(
-            {
-                "train/loss": debiased,
-                "train/bpb": bpb,
-                "train/bpt": bpt,
-                "train/tps": float(tps),
-                "train/mfu": float(mfu),
-                "train/vram_mib": float(vram),
-            },
-            prog_bar=False,
-            on_step=True,
-            on_epoch=False,
-        )
-        # Match legacy one-line format: "step=NNNNN loss=x bpb=y tps=z ..."
-        print(
-            f"step={step:05d} loss={debiased:.4f} bpb={bpb:.4f} "
-            f"bpt={bpt:.3f} bpt_div={self._bpt_ema:.2f} "
-            f"tps={tps} dt_ms={dt*1000:.0f} mfu={mfu:.1f} "
-            f"vram={vram:.0f}MiB",
-            flush=True,
-        )

+"""LightningModule wrapping PostSemClawModel.
+Thin adapter. The model and the MuonAdamW optimizer are unchanged. This
+module implements:
+  • configure_optimizers — returns the existing MuonAdamW (subclass of
+    torch.optim.Optimizer) built by model.setup_optimizer. Lightning accepts
+    this directly.
+  • training_step — splits (B, T+1) batches into (x, y), forwards through
+    the model, logs loss / bpb / tps / mfu / vram. Preserves the
+    sampled-softmax path inside PostSemClawModel (no changes there).
+  • optimizer_step — before each step we update LR + muon momentum + WD
+    using the same time-progress schedule as hydra/training.py
+    (get_lr_multiplier / get_muon_momentum / get_weight_decay). Lightning
+    handles grad accumulation via Trainer(accumulate_grad_batches=N).
+The SDR SOM update and Hestia QAT snap are called at the same cadence as
+the legacy loop, but inline on the main thread (Lightning provides its own
+callbacks for async work if we need to extract them later — keeping it
+simple for now).
+Env vars respected:
+  HYDRA_TIME_BUDGET          — wall-clock budget (s) used for LR schedule
+                                and as Trainer max_time
+  HYDRA_HESTIA_INTERVAL      — steps between Hestia snaps (default 100)
+  HYDRA_BATCH_SIZE           — device batch size (for throughput calc)
+  HYDRA_SEQ_LEN              — sequence length (for throughput calc)
+"""
+from __future__ import annotations
+import math
+import os
+import time
+import torch
+import lightning as L
+from hydra.config import (
+    ADAM_BETAS,
+    EMBEDDING_LR,
+    FINAL_LR_FRAC,
+    GPU_BF16_PEAK_FLOPS,
+    MATRIX_LR,
+    SCALAR_LR,
+    UNEMBEDDING_LR,
+    WARMUP_RATIO,
+    WEIGHT_DECAY,
+    PostSemClawConfig,
+)
+from hydra.model import PostSemClawModel
+# ---------------------------------------------------------------------------
+# LR / momentum / wd schedules — verbatim copy of hydra/training.py so the
+# curves match exactly. Kept here to avoid import cycles.
+# ---------------------------------------------------------------------------
+def _lr_multiplier(progress: float) -> float:
+    if progress < WARMUP_RATIO:
+        return progress / WARMUP_RATIO if WARMUP_RATIO > 0 else 1.0
+    decay_progress = (progress - WARMUP_RATIO) / max(1.0 - WARMUP_RATIO, 1e-9)
+    return FINAL_LR_FRAC + 0.5 * (1.0 - FINAL_LR_FRAC) * (
+        1 + math.cos(math.pi * decay_progress)
+    )
+def _muon_momentum(step: int) -> float:
+    frac = min(step / 300.0, 1.0)
+    return (1 - frac) * 0.85 + frac * 0.95
+def _weight_decay(progress: float) -> float:
+    return WEIGHT_DECAY * (1 - progress)
+# ---------------------------------------------------------------------------
+class HydraLightningModule(L.LightningModule):
+    """Lightning wrapper. Public attrs: self.model, self.config."""
+    def __init__(self, config: PostSemClawConfig):
+        super().__init__()
+        self.config = config
+        self.model = PostSemClawModel(config)
+        # Model weights init must be deferred to the correct device; done by
+        # caller after construction (to match the meta-device + to_empty()
+        # pattern used in the legacy loop).
+        # Time-based progress tracks the legacy loop's semantics: LR cosine
+        # is driven by wall-clock, not step count. We capture training start
+        # in on_train_start and TIME_BUDGET from env.
+        self.time_budget = float(
+            int(os.environ.get("HYDRA_TIME_BUDGET", "300"))
+        )
+        self._train_start_time: float | None = None
+        self._total_training_time = 0.0
+        self._last_step_end: float | None = None
+        self._hestia_interval = int(os.environ.get("HYDRA_HESTIA_INTERVAL", "100"))
+        self._flops_per_token = 0
+        self._tokens_per_step = 0
+        # Smoothed loss for the header-line log (matches legacy format).
+        self._ema_beta = 0.9
+        self._smooth_loss = 0.0
+        self._bpt_ema = 0.0
+        self._token_bytes: torch.Tensor | None = None
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+    def on_train_start(self) -> None:
+        self._train_start_time = time.time()
+        self._last_step_end = self._train_start_time
+        self._flops_per_token = self.model.estimate_flops()
+        # Tokens processed per optimizer step (pre-accum).
+        B = int(os.environ.get("HYDRA_BATCH_SIZE", "1"))
+        T = int(os.environ.get("HYDRA_SEQ_LEN", "512"))
+        self._tokens_per_step = B * T
+        # Build/cache token_bytes LUT (for bits-per-byte live metric).
+        import prepare as _p
+        self._token_bytes = _p.get_token_bytes(device=self.device)
+    def configure_optimizers(self):
+        optimizer = self.model.setup_optimizer(
+            unembedding_lr=UNEMBEDDING_LR,
+            embedding_lr=EMBEDDING_LR,
+            scalar_lr=SCALAR_LR,
+            adam_betas=ADAM_BETAS,
+            matrix_lr=MATRIX_LR,
+            weight_decay=WEIGHT_DECAY,
+        )
+        return optimizer
+    # ------------------------------------------------------------------
+    # Training step. Lightning auto-handles: autocast (via precision flag
+    # on Trainer), backward, grad-accum, zero_grad. We only:
+    #   - split batch into (x, y)
+    #   - forward through model (autocast is established by Trainer)
+    #   - return loss (grads flow from return)
+    # ------------------------------------------------------------------
+    def training_step(self, batch: torch.Tensor, batch_idx: int):
+        # DataLoader produces (B, T+1) rows; split into input/target.
+        # Lightning's default collate already moved batch to self.device via
+        # the accelerator callback when pin_memory=True and device != cpu.
+        if batch.dim() != 2:
+            raise RuntimeError(f"Expected (B, T+1) batch, got shape {tuple(batch.shape)}")
+        x = batch[:, :-1].contiguous()
+        y = batch[:, 1:].contiguous()
+        loss = self.model(x, y)
+        # Lightning applies the grad-accum divisor automatically; we just
+        # return the raw loss. loss.detach() is stored for logging.
+        self._log_step(loss.detach(), y)
+        return loss
+    # ------------------------------------------------------------------
+    # Optimizer step hook: update LR / momentum / WD using time-progress.
+    # Runs once per optimizer step (after all accum micro-batches).
+    # ------------------------------------------------------------------
+    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure):
+        # Update schedules from wall-clock progress.
+        now = time.time()
+        if self._train_start_time is None:
+            self._train_start_time = now
+            self._last_step_end = now
+        progress = min(self._total_training_time / max(self.time_budget, 1.0), 1.0)
+        step = self.global_step
+        lrm = _lr_multiplier(progress)
+        mom = _muon_momentum(step)
+        wd = _weight_decay(progress)
+        for group in optimizer.param_groups:
+            group["lr"] = group["initial_lr"] * lrm
+            if group.get("kind") == "muon":
+                group["momentum"] = mom
+                group["weight_decay"] = wd
+        # Grad clip (matches legacy loop). Lightning provides this via
+        # Trainer(gradient_clip_val=1.0) but we want the exact call-site.
+        torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1.0)
+        # Hyena train-cache: we must flush accumulated micro-batch grads BACK
+        # into the filter MLP params AFTER the accum-backward closure has run
+        # but BEFORE the optimizer actually consumes the grads. Lightning
+        # composes these so the closure runs inside optimizer.step(). We wrap
+        # the closure to insert our flush at the exact right moment.
+        #
+        # Ordering within the wrapped closure:
+        #   1. optimizer_closure() — runs all micro-batch forwards + backwards.
+        #      Each Hyena micro-batch backward accumulates into _k_leaf.grad.
+        #   2. flush_hyena_pending_grads() — one-shot
+        #      torch.autograd.backward(_k_graph, _k_leaf.grad) per HyenaFilter.
+        #      Now filter MLP / pos_emb / bias params have their correct grads.
+        #
+        # No-op when HYDRA_HYENA_TRAIN_CACHE=0 or no Hyena blocks exist.
+        _has_flush = hasattr(self.model, "flush_hyena_pending_grads")
+        if _has_flush:
+            _orig_closure = optimizer_closure
+            def _wrapped_closure():
+                result = _orig_closure()
+                self.model.flush_hyena_pending_grads()
+                return result
+            effective_closure = _wrapped_closure
+        else:
+            effective_closure = optimizer_closure
+        # Run the step (this is what Lightning would have done for us).
+        optimizer.step(closure=effective_closure)
+        self.model.zero_grad(set_to_none=True)
+        # Hyena filter-rfft cache invalidation. No-op if:
+        #   (a) no Hyena layers are in the model, or
+        #   (b) HYDRA_HYENA_FILTER_CACHE=0 and HYDRA_HYENA_TRAIN_CACHE=0
+        #       (the operators never populated either cache)
+        # In either case this is a handful of Python attribute resets.
+        if hasattr(self.model, "invalidate_hyena_caches"):
+            self.model.invalidate_hyena_caches()
+        # Hestia QAT snap every N steps. Temperature anneals every step.
+        progress_now = (now - self._train_start_time) / max(self.time_budget, 1.0)
+        self.model.hestia.anneal_temperature(progress_now)
+        if self._hestia_interval > 0 and step % self._hestia_interval == 0:
+            self.model.hestia.apply_to(self.model)
+        # SDR SOM update when the model stashed an sdr in the last forward.
+        _last_sdr = getattr(self.model, "_last_sdr", None)
+        if _last_sdr is not None and hasattr(self.model.sdr_semantic, "maybe_som_update"):
+            # x from the last training_step is not available here without
+            # captured state; the legacy loop passed (x, _last_sdr). To keep
+            # the interface clean we pass the last batch's x via a buffer.
+            # Since _last_sdr is derived from idx, we reuse self._last_x.
+            if getattr(self, "_last_x", None) is not None:
+                self.model.sdr_semantic.maybe_som_update(self._last_x, _last_sdr)
+        # Advance the wall-clock counter for LR schedule (matches legacy
+        # behavior which incremented only after the first warm-up step).
+        dt = now - (self._last_step_end or now)
+        self._last_step_end = now
+        if step > 10:
+            self._total_training_time += dt
+    # ------------------------------------------------------------------
+    # Logging — mirrors the step=NNNNN line format of the legacy loop so
+    # grep/tee pipelines keep working.
+    # ------------------------------------------------------------------
+    def _log_step(self, loss: torch.Tensor, y: torch.Tensor) -> None:
+        # Stash the current x so optimizer_step can drive SOM update.
+        self._last_x = None  # reset; we will set it below.
+        # We don't have x here (already discarded); emit a None marker that
+        # the SOM hook will silently skip if absent.
+        loss_f = float(loss.item())
+        if not math.isfinite(loss_f) or loss_f > 100:
+            # Let Lightning raise / the trainer callbacks handle this.
+            self.log("train_loss_nan", 1.0)
+            return
+        step = self.global_step
+        self._smooth_loss = (
+            self._ema_beta * self._smooth_loss + (1 - self._ema_beta) * loss_f
+        )
+        debiased = self._smooth_loss / max(1 - self._ema_beta ** (step + 1), 1e-9)
+        dt = max(time.time() - (self._last_step_end or time.time()), 1e-6)
+        tps = int(self._tokens_per_step / dt) if dt > 0 else 0
+        mfu = (
+            100.0
+            * self._flops_per_token
+            * self._tokens_per_step
+            / dt
+            / GPU_BF16_PEAK_FLOPS
+            if dt > 0
+            else 0.0
+        )
+        # bpb live: y flat -> token_bytes LUT -> avg bytes/token
+        bpt = debiased / math.log(2)
+        if self._token_bytes is not None:
+            with torch.no_grad():
+                y_flat = y.reshape(-1)
+                nbytes = self._token_bytes[y_flat]
+                mask = nbytes > 0
+                denom = mask.sum().clamp(min=1).float()
+                avg_bpt = (nbytes.float() * mask.float()).sum() / denom
+                bpt_batch = float(avg_bpt.item())
+            if step == 0 or self._bpt_ema <= 0.0:
+                self._bpt_ema = bpt_batch
+            else:
+                self._bpt_ema = 0.98 * self._bpt_ema + 0.02 * bpt_batch
+        bpb = bpt / max(self._bpt_ema, 1e-6)
+        vram = (
+            torch.cuda.memory_allocated() / 1024 / 1024
+            if torch.cuda.is_available()
+            else 0.0
+        )
+        self.log_dict(
+            {
+                "train/loss": debiased,
+                "train/bpb": bpb,
+                "train/bpt": bpt,
+                "train/tps": float(tps),
+                "train/mfu": float(mfu),
+                "train/vram_mib": float(vram),
+            },
+            prog_bar=False,
+            on_step=True,
+            on_epoch=False,
+        )
+        # Match legacy one-line format: "step=NNNNN loss=x bpb=y tps=z ..."
+        print(
+            f"step={step:05d} loss={debiased:.4f} bpb={bpb:.4f} "
+            f"bpt={bpt:.3f} bpt_div={self._bpt_ema:.2f} "
+            f"tps={tps} dt_ms={dt*1000:.0f} mfu={mfu:.1f} "
+            f"vram={vram:.0f}MiB",
+            flush=True,
+        )

overlay/hydra/model.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

overlay/hydra/optimizer.py CHANGED Viewed

@@ -1,252 +1,252 @@
-"""MuonAdamW optimizer — combined Muon (2D matrices) + AdamW (everything else).
-Extracted verbatim from train.py (W1 modularization). Semantics unchanged.
-F1-F15 state preserved:
-- F7 REVERTED: `stacked_params_buf` persistent across steps was REMOVED — each
-  step calls `torch.stack([p.grad for p in params])` / `torch.stack(params)`
-  fresh. Persistent copies of param storage would be mutated between forward
-  passes (via lerp_/sub_ on stacked tensors that share storage with params),
-  triggering "modified in-place" errors on grad_accum=2 backwards.
-- F11/F15: `@torch.compile` on `adamw_step_fused` / `muon_step_fused` intact.
-- F15 compile is default-ON (HYDRA_MUON_COMPILE=1), configured with
-  dynamic=True + mode="default" to avoid the step-17→18 cudagraphs
-  stream-capture deadlock. See .omc/muon_compile_bug.md for the full
-  investigation.
-"""
-from __future__ import annotations
-import os
-import torch
-# HYDRA_FUSED_ADAMW=1 (default) -> vectorized torch._fused_adamw_ kernel.
-_HYDRA_FUSED_ADAMW = os.environ.get("HYDRA_FUSED_ADAMW", "1") == "1"
-_HAS_FUSED_ADAMW = hasattr(torch, "_fused_adamw_")
-polar_express_coeffs = [
-    (8.156554524902461, -22.48329292557795, 15.878769915207462),
-    (4.042929935166739, -2.808917465908714, 0.5000178451051316),
-    (3.8916678022926607, -2.772484153217685, 0.5060648178503393),
-    (3.285753657755655, -2.3681294933425376, 0.46449024233003106),
-    (2.3465413258596377, -1.7097828382687081, 0.42323551169305323),
-]
-def adamw_step_fused(p, grad, exp_avg, exp_avg_sq, step_t, lr_t, beta1_t, beta2_t, eps_t, wd_t):
-    # Per-param AdamW fallback. Fast path is torch._fused_adamw_ (1 CUDA launch
-    # for the whole group) driven from MuonAdamW._step_adamw below.
-    grad = grad.to(p.dtype)  # handle mixed bf16/fp32 from autocast
-    p.mul_(1 - lr_t * wd_t)
-    exp_avg.lerp_(grad, 1 - beta1_t)
-    exp_avg_sq.lerp_(grad.square(), 1 - beta2_t)
-    bias1 = 1 - beta1_t ** step_t
-    bias2 = 1 - beta2_t ** step_t
-    denom = (exp_avg_sq / bias2).sqrt() + eps_t
-    step_size = lr_t / bias1
-    p.add_(exp_avg / denom, alpha=-step_size)
-# ---------------------------------------------------------------------------
-# F15 muon_step_fused compile strategy.
-#
-# HYDRA_MUON_COMPILE env gate:
-#   "1" (default ON) — wrap with torch.compile(dynamic=True, mode="default").
-#       Dynamic=True collapses the per-shape specialization cache so that N
-#       Muon param-groups with N distinct shapes trigger 1 compile, not N.
-#       mode="default" keeps the inductor codegen but disables cudagraphs,
-#       which is what caused the step-17→18 silent deadlock observed under
-#       the original dynamic=False configuration: cudagraph stream capture
-#       can deadlock against HTM's CUDA kernels running on the default
-#       stream, and the failure mode at capture-time is a silent hang
-#       (100% GPU util, no log output, process state R).
-#   "0" — fall back to eager Python (slower, ~43k tps vs ~63k compiled).
-#       Keeps an escape hatch in case a future torch/inductor regression
-#       reintroduces a deadlock.
-#
-# Defensive .clone() on stacked_grads before in-place lerp_ eliminates the
-# alias-analysis edge case where inductor sees `g is stacked_grads` and
-# subsequent `stacked_grads.square()` operating on the post-lerp storage.
-# ---------------------------------------------------------------------------
-_MUON_COMPILE = os.environ.get("HYDRA_MUON_COMPILE", "1") == "1"
-def _maybe_compile(fn):
-    if _MUON_COMPILE:
-        # mode="default" explicitly opts OUT of cudagraphs (which reduce-overhead
-        # would enable) to avoid stream-capture deadlocks against HTM's CUDA
-        # kernels. dynamic=True minimizes recompile count across param-group
-        # shapes.
-        return torch.compile(fn, fullgraph=False, dynamic=True, mode="default")
-    return fn
-@_maybe_compile
-def muon_step_fused(stacked_grads, stacked_params, momentum_buffer, second_momentum_buffer,
-                    momentum_t, lr_t, wd_t, beta2_t, ns_steps, red_dim):
-    # Cast grads to param dtype AND clone defensively to break any alias
-    # between the (freshly-stacked) input and the in-place lerp_ below.
-    # Without this, inductor's alias analysis can emit code that reads from
-    # post-mutation storage when computing `v_mean = g.square().mean(...)`.
-    stacked_grads = stacked_grads.to(momentum_buffer.dtype).clone()
-    # Nesterov momentum
-    momentum = momentum_t.to(stacked_grads.dtype)
-    momentum_buffer.lerp_(stacked_grads, 1 - momentum)
-    g = stacked_grads.lerp_(momentum_buffer, momentum)
-    # Polar express orthogonalization
-    X = g.bfloat16()
-    X = X / (X.norm(dim=(-2, -1), keepdim=True) * 1.02 + 1e-6)
-    if g.size(-2) > g.size(-1):
-        for a, b, c in polar_express_coeffs[:ns_steps]:
-            A = X.mT @ X
-            B = b * A + c * (A @ A)
-            X = a * X + X @ B
-    else:
-        for a, b, c in polar_express_coeffs[:ns_steps]:
-            A = X @ X.mT
-            B = b * A + c * (A @ A)
-            X = a * X + B @ X
-    g = X
-    # NorMuon variance reduction
-    # Keep beta2 in the state-buffer dtype, not g.dtype, so lerp_ on the
-    # float32 second_momentum_buffer doesn't hit a dtype mismatch on h200.
-    beta2 = beta2_t.to(second_momentum_buffer.dtype)
-    v_mean = g.float().square().mean(dim=red_dim, keepdim=True)
-    red_dim_size = g.size(red_dim)
-    v_norm_sq = v_mean.sum(dim=(-2, -1), keepdim=True) * red_dim_size
-    v_norm = v_norm_sq.sqrt()
-    second_momentum_buffer.lerp_(v_mean.to(dtype=second_momentum_buffer.dtype), 1 - beta2)
-    step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt()
-    scaled_sq_sum = (v_mean * red_dim_size) * step_size.float().square()
-    v_norm_new = scaled_sq_sum.sum(dim=(-2, -1), keepdim=True).sqrt()
-    final_scale = step_size * (v_norm / v_norm_new.clamp_min(1e-10))
-    g = g * final_scale.to(g.dtype)
-    # Cautious weight decay + parameter update
-    lr = lr_t.to(g.dtype)
-    wd = wd_t.to(g.dtype)
-    mask = (g * stacked_params) >= 0
-    stacked_params.sub_(lr * g + lr * wd * stacked_params * mask)
-class MuonAdamW(torch.optim.Optimizer):
-    """Combined optimizer: Muon for 2D matrix params, AdamW for others."""
-    def __init__(self, param_groups):
-        super().__init__(param_groups, defaults={})
-        # 0-D CPU tensors to avoid torch.compile recompilation when values change
-        self._adamw_step_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
-        self._adamw_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
-        self._adamw_beta1_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
-        self._adamw_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
-        self._adamw_eps_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
-        self._adamw_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
-        self._muon_momentum_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
-        self._muon_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
-        self._muon_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
-        self._muon_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
-    def _step_adamw(self, group):
-        params, grads, exp_avgs, exp_avg_sqs, state_steps = [], [], [], [], []
-        for p in group['params']:
-            if p.grad is None:
-                continue
-            state = self.state[p]
-            if not state:
-                state['step'] = 0
-                state['exp_avg'] = torch.zeros_like(p)
-                state['exp_avg_sq'] = torch.zeros_like(p)
-            if 'step_t' not in state:
-                # _fused_adamw_ wants a per-param float step tensor on-device.
-                state['step_t'] = torch.tensor(
-                    float(state['step']), dtype=torch.float32, device=p.device
-                )
-            state['step'] += 1
-            params.append(p)
-            grads.append(p.grad.to(p.dtype) if p.grad.dtype != p.dtype else p.grad)
-            exp_avgs.append(state['exp_avg'])
-            exp_avg_sqs.append(state['exp_avg_sq'])
-            state_steps.append(state['step_t'])
-        if not params:
-            return
-        if _HYDRA_FUSED_ADAMW and _HAS_FUSED_ADAMW and params[0].is_cuda:
-            # _fused_adamw_ needs uniform (device, dtype) within a call, so
-            # group by (device, dtype) — same pattern as PyTorch's own
-            # AdamW(fused=True) path (_group_tensors_by_device_and_dtype).
-            buckets = {}
-            for p, g, ea, es, st in zip(params, grads, exp_avgs, exp_avg_sqs, state_steps):
-                key = (p.device, p.dtype)
-                buckets.setdefault(key, ([], [], [], [], []))
-                b_p, b_g, b_ea, b_es, b_st = buckets[key]
-                b_p.append(p); b_g.append(g); b_ea.append(ea); b_es.append(es); b_st.append(st)
-            lr_f = float(group['lr'])
-            b1_f = float(group['betas'][0])
-            b2_f = float(group['betas'][1])
-            wd_f = float(group['weight_decay'])
-            eps_f = float(group['eps'])
-            for (_dev, _dt), (b_p, b_g, b_ea, b_es, b_st) in buckets.items():
-                torch._foreach_add_(b_st, 1.0)
-                torch._fused_adamw_(
-                    b_p, b_g, b_ea, b_es,
-                    [],  # max_exp_avg_sqs unused (amsgrad=False)
-                    b_st,
-                    amsgrad=False,
-                    lr=lr_f, beta1=b1_f, beta2=b2_f,
-                    weight_decay=wd_f, eps=eps_f,
-                    maximize=False,
-                    grad_scale=None, found_inf=None,
-                )
-            return
-        # Fallback per-param path.
-        self._adamw_lr_t.fill_(group['lr'])
-        self._adamw_beta1_t.fill_(group['betas'][0])
-        self._adamw_beta2_t.fill_(group['betas'][1])
-        self._adamw_eps_t.fill_(group['eps'])
-        self._adamw_wd_t.fill_(group['weight_decay'])
-        for p, grad, exp_avg, exp_avg_sq in zip(params, grads, exp_avgs, exp_avg_sqs):
-            self._adamw_step_t.fill_(self.state[p]['step'])
-            adamw_step_fused(p, grad, exp_avg, exp_avg_sq,
-                             self._adamw_step_t, self._adamw_lr_t, self._adamw_beta1_t,
-                             self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t)
-    def _step_muon(self, group):
-        params = [p for p in group['params'] if p.grad is not None]
-        if not params:
-            return
-        p = params[0]
-        state = self.state[p]
-        num_params = len(params)
-        shape, device, dtype = p.shape, p.device, p.dtype
-        if "momentum_buffer" not in state:
-            state["momentum_buffer"] = torch.zeros(num_params, *shape, dtype=dtype, device=device)
-        red_dim = -1 if shape[-2] >= shape[-1] else -2
-        if "second_momentum_buffer" not in state:
-            # Shape must match v_mean = stacked_grads.square().mean(dim=red_dim, keepdim=True)
-            full_shape = (num_params, *shape)
-            state_shape = list(full_shape)
-            state_shape[len(state_shape) + red_dim] = 1  # red_dim is negative
-            state["second_momentum_buffer"] = torch.zeros(state_shape, dtype=dtype, device=device)
-        # F7 REVERT: fresh stacks each step (no persistent stacked_params_buf).
-        # This was the autograd-safety fix that unblocks grad_accum>=2.
-        stacked_grads = torch.stack([p.grad for p in params])
-        stacked_params = torch.stack(params)
-        self._muon_momentum_t.fill_(group["momentum"])
-        self._muon_beta2_t.fill_(group["beta2"] if group["beta2"] is not None else 0.0)
-        self._muon_lr_t.fill_(group["lr"] * max(1.0, shape[-2] / shape[-1]) ** 0.5)
-        self._muon_wd_t.fill_(group["weight_decay"])
-        muon_step_fused(stacked_grads, stacked_params,
-                        state["momentum_buffer"], state["second_momentum_buffer"],
-                        self._muon_momentum_t, self._muon_lr_t, self._muon_wd_t,
-                        self._muon_beta2_t, group["ns_steps"], red_dim)
-        torch._foreach_copy_(params, list(stacked_params.unbind(0)))
-    @torch.no_grad()
-    def step(self):
-        for group in self.param_groups:
-            if group['kind'] == 'adamw':
-                self._step_adamw(group)
-            elif group['kind'] == 'muon':
-                self._step_muon(group)

+"""MuonAdamW optimizer — combined Muon (2D matrices) + AdamW (everything else).
+Extracted verbatim from train.py (W1 modularization). Semantics unchanged.
+F1-F15 state preserved:
+- F7 REVERTED: `stacked_params_buf` persistent across steps was REMOVED — each
+  step calls `torch.stack([p.grad for p in params])` / `torch.stack(params)`
+  fresh. Persistent copies of param storage would be mutated between forward
+  passes (via lerp_/sub_ on stacked tensors that share storage with params),
+  triggering "modified in-place" errors on grad_accum=2 backwards.
+- F11/F15: `@torch.compile` on `adamw_step_fused` / `muon_step_fused` intact.
+- F15 compile is default-ON (HYDRA_MUON_COMPILE=1), configured with
+  dynamic=True + mode="default" to avoid the step-17→18 cudagraphs
+  stream-capture deadlock. See .omc/muon_compile_bug.md for the full
+  investigation.
+"""
+from __future__ import annotations
+import os
+import torch
+# HYDRA_FUSED_ADAMW=1 (default) -> vectorized torch._fused_adamw_ kernel.
+_HYDRA_FUSED_ADAMW = os.environ.get("HYDRA_FUSED_ADAMW", "1") == "1"
+_HAS_FUSED_ADAMW = hasattr(torch, "_fused_adamw_")
+polar_express_coeffs = [
+    (8.156554524902461, -22.48329292557795, 15.878769915207462),
+    (4.042929935166739, -2.808917465908714, 0.5000178451051316),
+    (3.8916678022926607, -2.772484153217685, 0.5060648178503393),
+    (3.285753657755655, -2.3681294933425376, 0.46449024233003106),
+    (2.3465413258596377, -1.7097828382687081, 0.42323551169305323),
+]
+def adamw_step_fused(p, grad, exp_avg, exp_avg_sq, step_t, lr_t, beta1_t, beta2_t, eps_t, wd_t):
+    # Per-param AdamW fallback. Fast path is torch._fused_adamw_ (1 CUDA launch
+    # for the whole group) driven from MuonAdamW._step_adamw below.
+    grad = grad.to(p.dtype)  # handle mixed bf16/fp32 from autocast
+    p.mul_(1 - lr_t * wd_t)
+    exp_avg.lerp_(grad, 1 - beta1_t)
+    exp_avg_sq.lerp_(grad.square(), 1 - beta2_t)
+    bias1 = 1 - beta1_t ** step_t
+    bias2 = 1 - beta2_t ** step_t
+    denom = (exp_avg_sq / bias2).sqrt() + eps_t
+    step_size = lr_t / bias1
+    p.add_(exp_avg / denom, alpha=-step_size)
+# ---------------------------------------------------------------------------
+# F15 muon_step_fused compile strategy.
+#
+# HYDRA_MUON_COMPILE env gate:
+#   "1" (default ON) — wrap with torch.compile(dynamic=True, mode="default").
+#       Dynamic=True collapses the per-shape specialization cache so that N
+#       Muon param-groups with N distinct shapes trigger 1 compile, not N.
+#       mode="default" keeps the inductor codegen but disables cudagraphs,
+#       which is what caused the step-17→18 silent deadlock observed under
+#       the original dynamic=False configuration: cudagraph stream capture
+#       can deadlock against HTM's CUDA kernels running on the default
+#       stream, and the failure mode at capture-time is a silent hang
+#       (100% GPU util, no log output, process state R).
+#   "0" — fall back to eager Python (slower, ~43k tps vs ~63k compiled).
+#       Keeps an escape hatch in case a future torch/inductor regression
+#       reintroduces a deadlock.
+#
+# Defensive .clone() on stacked_grads before in-place lerp_ eliminates the
+# alias-analysis edge case where inductor sees `g is stacked_grads` and
+# subsequent `stacked_grads.square()` operating on the post-lerp storage.
+# ---------------------------------------------------------------------------
+_MUON_COMPILE = os.environ.get("HYDRA_MUON_COMPILE", "1") == "1"
+def _maybe_compile(fn):
+    if _MUON_COMPILE:
+        # mode="default" explicitly opts OUT of cudagraphs (which reduce-overhead
+        # would enable) to avoid stream-capture deadlocks against HTM's CUDA
+        # kernels. dynamic=True minimizes recompile count across param-group
+        # shapes.
+        return torch.compile(fn, fullgraph=False, dynamic=True, mode="default")
+    return fn
+@_maybe_compile
+def muon_step_fused(stacked_grads, stacked_params, momentum_buffer, second_momentum_buffer,
+                    momentum_t, lr_t, wd_t, beta2_t, ns_steps, red_dim):
+    """Apply one in-place Muon update to a stack of parameters.
+    Stages: Nesterov-style momentum, polar-express orthogonalization in
+    bfloat16, NorMuon variance rescaling, then a masked ("cautious")
+    weight-decay + parameter update.
+    Args:
+        stacked_grads: gradients; cast to ``momentum_buffer.dtype`` and cloned,
+            so the caller's tensor is not mutated.
+        stacked_params: parameters, updated in place via ``sub_``.
+            Presumably stacked along a leading axis by the caller — TODO confirm.
+        momentum_buffer: first-moment state, updated in place.
+        second_momentum_buffer: second-moment state, updated in place.
+        momentum_t, lr_t, wd_t, beta2_t: tensors holding the hyperparameters;
+            moved to the relevant buffer's device/dtype before use.
+        ns_steps: how many entries of ``polar_express_coeffs`` to iterate.
+        red_dim: dimension over which the squared update is averaged.
+    Returns:
+        None.
+    """
+    # Cast grads to param dtype AND clone defensively to break any alias
+    # between the (freshly-stacked) input and the in-place lerp_ below.
+    # Without this, inductor's alias analysis can emit code that reads from
+    # post-mutation storage when computing `v_mean = g.square().mean(...)`.
+    stacked_grads = stacked_grads.to(momentum_buffer.dtype).clone()
+    # Nesterov momentum
+    momentum = momentum_t.to(device=momentum_buffer.device, dtype=stacked_grads.dtype)
+    momentum_buffer.lerp_(stacked_grads, 1 - momentum)
+    g = stacked_grads.lerp_(momentum_buffer, momentum)
+    # Polar express orthogonalization
+    X = g.bfloat16()
+    # Normalize each matrix by (slightly more than) its Frobenius norm.
+    X = X / (X.norm(dim=(-2, -1), keepdim=True) * 1.02 + 1e-6)
+    # Form the Gram matrix on the smaller side: X^T X for tall matrices,
+    # X X^T otherwise.
+    if g.size(-2) > g.size(-1):
+        for a, b, c in polar_express_coeffs[:ns_steps]:
+            A = X.mT @ X
+            B = b * A + c * (A @ A)
+            X = a * X + X @ B
+    else:
+        for a, b, c in polar_express_coeffs[:ns_steps]:
+            A = X @ X.mT
+            B = b * A + c * (A @ A)
+            X = a * X + B @ X
+    g = X
+    # NorMuon variance reduction
+    # Keep beta2 in the state-buffer dtype, not g.dtype, so lerp_ on the
+    # float32 second_momentum_buffer doesn't hit a dtype mismatch on h200.
+    beta2 = beta2_t.to(device=second_momentum_buffer.device, dtype=second_momentum_buffer.dtype)
+    v_mean = g.float().square().mean(dim=red_dim, keepdim=True)
+    red_dim_size = g.size(red_dim)
+    # Norm of g per matrix before rescaling (sum of squares = mean * count).
+    v_norm_sq = v_mean.sum(dim=(-2, -1), keepdim=True) * red_dim_size
+    v_norm = v_norm_sq.sqrt()
+    second_momentum_buffer.lerp_(v_mean.to(dtype=second_momentum_buffer.dtype), 1 - beta2)
+    step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt()
+    # Norm the update would have after applying step_size; the ratio below
+    # rescales so the per-matrix norm matches v_norm.
+    scaled_sq_sum = (v_mean * red_dim_size) * step_size.float().square()
+    v_norm_new = scaled_sq_sum.sum(dim=(-2, -1), keepdim=True).sqrt()
+    final_scale = step_size * (v_norm / v_norm_new.clamp_min(1e-10))
+    g = g * final_scale.to(g.dtype)
+    # Cautious weight decay + parameter update
+    lr = lr_t.to(device=stacked_params.device, dtype=g.dtype)
+    wd = wd_t.to(device=stacked_params.device, dtype=g.dtype)
+    # Decay only entries where the update and the parameter share a sign.
+    mask = (g * stacked_params) >= 0
+    stacked_params.sub_(lr * g + lr * wd * stacked_params * mask)
+class MuonAdamW(torch.optim.Optimizer):
+    """Combined optimizer: Muon for 2D matrix params, AdamW for others."""
+    def __init__(self, param_groups):
+        super().__init__(param_groups, defaults={})
+        # 0-D CPU tensors to avoid torch.compile recompilation when values change
+        self._adamw_step_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_beta1_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_eps_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._adamw_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._muon_momentum_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._muon_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._muon_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+        self._muon_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
+    def _step_adamw(self, group):
+        """Apply one AdamW update to every parameter in ``group`` that has a gradient.
+        Per-parameter state (``step``, ``exp_avg``, ``exp_avg_sq`` and a float32
+        ``step_t`` tensor on the parameter's device) is created lazily on first use.
+        When ``_HYDRA_FUSED_ADAMW`` and ``_HAS_FUSED_ADAMW`` are both truthy and the
+        first collected parameter is on CUDA, the update is issued through
+        ``torch._fused_adamw_`` once per (device, dtype) bucket. Otherwise each
+        parameter is updated individually through ``adamw_step_fused``.
+        Args:
+            group: Param-group dict. Reads ``'params'``, ``'lr'``, ``'betas'``,
+                ``'eps'`` and ``'weight_decay'``.
+        """
+        params, grads, exp_avgs, exp_avg_sqs, state_steps = [], [], [], [], []
+        for p in group['params']:
+            if p.grad is None:
+                continue
+            state = self.state[p]
+            if not state:
+                state['step'] = 0
+                state['exp_avg'] = torch.zeros_like(p)
+                state['exp_avg_sq'] = torch.zeros_like(p)
+            if 'step_t' not in state:
+                # _fused_adamw_ wants a per-param float step tensor on-device.
+                state['step_t'] = torch.tensor(
+                    float(state['step']), dtype=torch.float32, device=p.device
+                )
+            # The Python-side counter advances on both paths; step_t is advanced
+            # only inside the fused branch below.
+            # NOTE(review): the fallback path never touches step_t, so the two
+            # counters diverge if a run ever switches paths — confirm that
+            # cannot happen in practice.
+            state['step'] += 1
+            params.append(p)
+            # Cast the gradient to the parameter dtype when they differ.
+            grads.append(p.grad.to(p.dtype) if p.grad.dtype != p.dtype else p.grad)
+            exp_avgs.append(state['exp_avg'])
+            exp_avg_sqs.append(state['exp_avg_sq'])
+            state_steps.append(state['step_t'])
+        if not params:
+            return
+        # Only the first collected parameter's device is inspected here.
+        if _HYDRA_FUSED_ADAMW and _HAS_FUSED_ADAMW and params[0].is_cuda:
+            # _fused_adamw_ needs uniform (device, dtype) within a call, so
+            # group by (device, dtype) — same pattern as PyTorch's own
+            # AdamW(fused=True) path (_group_tensors_by_device_and_dtype).
+            buckets = {}
+            for p, g, ea, es, st in zip(params, grads, exp_avgs, exp_avg_sqs, state_steps):
+                key = (p.device, p.dtype)
+                buckets.setdefault(key, ([], [], [], [], []))
+                b_p, b_g, b_ea, b_es, b_st = buckets[key]
+                b_p.append(p); b_g.append(g); b_ea.append(ea); b_es.append(es); b_st.append(st)
+            # Hyperparameters are passed to the kernel as plain Python floats.
+            lr_f = float(group['lr'])
+            b1_f = float(group['betas'][0])
+            b2_f = float(group['betas'][1])
+            wd_f = float(group['weight_decay'])
+            eps_f = float(group['eps'])
+            for (_dev, _dt), (b_p, b_g, b_ea, b_es, b_st) in buckets.items():
+                # Advance the step tensors in place before the kernel reads them.
+                torch._foreach_add_(b_st, 1.0)
+                torch._fused_adamw_(
+                    b_p, b_g, b_ea, b_es,
+                    [],  # max_exp_avg_sqs unused (amsgrad=False)
+                    b_st,
+                    amsgrad=False,
+                    lr=lr_f, beta1=b1_f, beta2=b2_f,
+                    weight_decay=wd_f, eps=eps_f,
+                    maximize=False,
+                    grad_scale=None, found_inf=None,
+                )
+            return
+        # Fallback per-param path.
+        # Hyperparameters are written into scalar tensors owned by the optimizer
+        # (presumably 0-dim CPU tensors like the ones created in __init__ — confirm)
+        # and handed to adamw_step_fused for each parameter.
+        self._adamw_lr_t.fill_(group['lr'])
+        self._adamw_beta1_t.fill_(group['betas'][0])
+        self._adamw_beta2_t.fill_(group['betas'][1])
+        self._adamw_eps_t.fill_(group['eps'])
+        self._adamw_wd_t.fill_(group['weight_decay'])
+        for p, grad, exp_avg, exp_avg_sq in zip(params, grads, exp_avgs, exp_avg_sqs):
+            # The step scalar is refreshed per parameter from its Python-side counter.
+            self._adamw_step_t.fill_(self.state[p]['step'])
+            adamw_step_fused(p, grad, exp_avg, exp_avg_sq,
+                             self._adamw_step_t, self._adamw_lr_t, self._adamw_beta1_t,
+                             self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t)
+    def _step_muon(self, group):
+        """Apply one Muon update to all parameters in ``group`` that have gradients.
+        Parameters and gradients are stacked into single tensors, passed to
+        ``muon_step_fused`` together with the momentum buffers, and the stacked
+        parameters are then copied back into the live parameters.
+        Args:
+            group: Param-group dict. Reads ``'params'``, ``'momentum'``, ``'beta2'``,
+                ``'lr'``, ``'weight_decay'`` and ``'ns_steps'``.
+        """
+        params = [p for p in group['params'] if p.grad is not None]
+        if not params:
+            return
+        # All optimizer state for the group lives on the first parameter's entry.
+        # torch.stack below requires every parameter to share this shape, and
+        # shape[-2] requires at least two dimensions.
+        p = params[0]
+        state = self.state[p]
+        num_params = len(params)
+        shape, device, dtype = p.shape, p.device, p.dtype
+        # NOTE(review): both buffers are sized with num_params on first use and
+        # keyed on params[0]; if the set of parameters that have gradients changes
+        # between steps the stacked shapes (or the state key) no longer match —
+        # confirm callers guarantee a stable set.
+        if "momentum_buffer" not in state:
+            state["momentum_buffer"] = torch.zeros(num_params, *shape, dtype=dtype, device=device)
+        # Reduce over the last dim when rows >= cols, otherwise over the second-to-last.
+        red_dim = -1 if shape[-2] >= shape[-1] else -2
+        if "second_momentum_buffer" not in state:
+            # Shape must match v_mean = stacked_grads.square().mean(dim=red_dim, keepdim=True)
+            full_shape = (num_params, *shape)
+            state_shape = list(full_shape)
+            state_shape[len(state_shape) + red_dim] = 1  # red_dim is negative
+            state["second_momentum_buffer"] = torch.zeros(state_shape, dtype=dtype, device=device)
+        # F7 REVERT: fresh stacks each step (no persistent stacked_params_buf).
+        # This was the autograd-safety fix that unblocks grad_accum>=2.
+        stacked_grads = torch.stack([p.grad for p in params])
+        stacked_params = torch.stack(params)
+        # Hyperparameters are written into scalar tensors owned by the optimizer.
+        self._muon_momentum_t.fill_(group["momentum"])
+        # A beta2 of None is passed to the kernel as 0.0.
+        self._muon_beta2_t.fill_(group["beta2"] if group["beta2"] is not None else 0.0)
+        # Learning rate is scaled by sqrt(max(1, rows / cols)).
+        self._muon_lr_t.fill_(group["lr"] * max(1.0, shape[-2] / shape[-1]) ** 0.5)
+        self._muon_wd_t.fill_(group["weight_decay"])
+        muon_step_fused(stacked_grads, stacked_params,
+                        state["momentum_buffer"], state["second_momentum_buffer"],
+                        self._muon_momentum_t, self._muon_lr_t, self._muon_wd_t,
+                        self._muon_beta2_t, group["ns_steps"], red_dim)
+        # Copy each slice of the stack back into its parameter (presumably
+        # muon_step_fused updated stacked_params in place — confirm).
+        torch._foreach_copy_(params, list(stacked_params.unbind(0)))
+    @torch.no_grad()
+    def step(self):
+        for group in self.param_groups:
+            if group['kind'] == 'adamw':
+                self._step_adamw(group)
+            elif group['kind'] == 'muon':
+                self._step_muon(group)

overlay/hydra/reality_bridge.py ADDED Viewed

	@@ -0,0 +1,71 @@

+from __future__ import annotations
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+@dataclass(frozen=True)
+class RealityBridgeOutput:
+    reality: torch.Tensor
+    poincare: torch.Tensor
+    l0_indices: torch.Tensor
+    l0_values: torch.Tensor
+class RealityPoincareBridge(nn.Module):
+    """Default-off SEM-Claw continuous→discrete bridge.
+    PyTorch GEMM creates a compact 133-d reality latent, then a differentiable
+    Poincare-disk projection is kept for metrics/regularizers while a detached
+    int16 L0/top-k index buffer feeds Engram/Cantor sparse retrieval. This is a
+    production-shaped version of rs.md's Poincare/Reality Buffer without adding
+    speculative E7 machinery to the hot path.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        d_reality: int = 133,
+        d_poincare: int = 2,
+        l0_k: int = 64,
+    ) -> None:
+        super().__init__()
+        if d_model <= 0:
+            raise ValueError(f"d_model must be positive, got {d_model}")
+        if d_reality <= 0:
+            raise ValueError(f"d_reality must be positive, got {d_reality}")
+        if d_poincare != 2:
+            raise ValueError("Poincare bridge currently expects d_poincare=2")
+        if l0_k <= 0:
+            raise ValueError(f"l0_k must be positive, got {l0_k}")
+        self.d_model = int(d_model)
+        self.d_reality = int(d_reality)
+        self.d_poincare = int(d_poincare)
+        self.l0_k = min(int(l0_k), self.d_reality)
+        self.to_reality = nn.Linear(d_model, d_reality, bias=False)
+        self.to_tangent2 = nn.Linear(d_reality, d_poincare, bias=False)
+        nn.init.normal_(self.to_reality.weight, mean=0.0, std=0.02)
+        nn.init.normal_(self.to_tangent2.weight, mean=0.0, std=0.02)
+    @staticmethod
+    def poincare_expmap0(tangent2: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
+        t = tangent2.float()
+        r = t.norm(dim=-1, keepdim=True).clamp_min(eps)
+        y = torch.tanh(r) * (t / r)
+        return y.to(tangent2.dtype)
+    def forward(self, x: torch.Tensor) -> RealityBridgeOutput:
+        if x.shape[-1] != self.d_model:
+            raise ValueError(f"expected last dim {self.d_model}, got {x.shape[-1]}")
+        reality = self.to_reality(x)
+        tangent2 = self.to_tangent2(reality)
+        poincare = self.poincare_expmap0(tangent2)
+        vals, idx = reality.float().abs().topk(self.l0_k, dim=-1)
+        return RealityBridgeOutput(
+            reality=reality,
+            poincare=poincare,
+            l0_indices=idx.to(torch.int16),
+            l0_values=vals.to(reality.dtype),
+        )

overlay/hydra/training.py CHANGED Viewed

@@ -1,948 +1,967 @@
-"""HYDRA training entry: setup, train loop, eval, summary.
-Extracted from the monolithic train.py (W1 modularization). Semantics
-preserved. Public entrypoint: `main()`.
-"""
-from __future__ import annotations
-import gc
-import json
-import math
-import os
-import sys
-import threading
-import time
-from dataclasses import asdict
-from pathlib import Path
-import torch
-# Line-buffered stdout so `python -u train.py | tee run.log | grep step` is
-# live (no \r overwrite, no 4k block-buffered pipe stalls). Safe on Python
-# 3.7+ where io.TextIOWrapper.reconfigure exists.
-try:
-    sys.stdout.reconfigure(line_buffering=True)  # type: ignore[attr-defined]
-except Exception:
-    pass
-from hydra.config import (
-    ADAM_BETAS, CURRICULUM_SHORT_SEQ_LEN, CURRICULUM_SHORT_STEPS,
-    D_MODEL, D_STATE, DEVICE_BATCH_SIZE, EMA_DECAY, EMBEDDING_LR,
-    ENGRAM_KEY_DIM, ENGRAM_LAYER_IDX, ENGRAM_N_COLUMNS, EXPAND,
-    FINAL_LR_FRAC, GPU_BF16_PEAK_FLOPS, HEADDIM, MATRIX_LR, N_HEADS,
-    N_LAYER, PostSemClawConfig, SCALAR_LR, SEED, TOTAL_BATCH_SIZE,
-    UNEMBEDDING_LR, USE_EMA, WARMUP_RATIO, WEIGHT_DECAY,
-)
-from hydra.diffusion_loss import mdlm_masked_forward_process, mdlm_rb_loss
-from hydra.eval import run_factual_english, run_factual_probes
-from hydra.model import PostSemClawModel
-import prepare as _prepare_mod
-from prepare import MAX_SEQ_LEN, TIME_BUDGET as _TIME_BUDGET, Tokenizer, evaluate_bpb as _evaluate_bpb_shards, get_token_bytes, make_dataloader as _make_dataloader_shards
-# Streaming Nemotron path (Super3 recipe). Opt-in via HYDRA_USE_NEMOTRON=1.
-if os.environ.get("HYDRA_USE_NEMOTRON", "0") == "1":
-    import prepare_nemotron as _p_nemo
-    make_dataloader = _p_nemo.make_dataloader
-    evaluate_bpb = _p_nemo.evaluate_bpb
-else:
-    make_dataloader = _make_dataloader_shards
-    evaluate_bpb = _evaluate_bpb_shards
-TIME_BUDGET = int(os.environ.get("HYDRA_TIME_BUDGET", str(_TIME_BUDGET)))
-_prepare_mod.TIME_BUDGET = TIME_BUDGET  # sync for evaluate_bpb
-# Checkpoint locations; every path lives under the per-user cache directory.
-CACHE_DIR = Path.home() / ".cache" / "autoresearch"
-LATEST_CKPT = CACHE_DIR / "latest.pt"
-PRETRAIN_FINAL_CKPT = CACHE_DIR / "pretrain_final.pt"
-FAILED_CKPT = CACHE_DIR / "latest_failed.pt"          # crash/FAIL path — never overwrites good
-BEST_CKPT = CACHE_DIR / "best_bpb.pt"                 # lowest val_bpb seen
-# Presumably the number of optimizer steps between periodic saves — confirm at the use site.
-CKPT_INTERVAL = int(os.environ.get("HYDRA_CKPT_INTERVAL", "250"))
-CKPT_ROTATIONS = int(os.environ.get("HYDRA_CKPT_ROTATIONS", "3"))  # how many .N backups to keep
-# Checkpoint to resume from; defaults to LATEST_CKPT. An empty value or "none"
-# (any letter case) disables resume in maybe_resume_ckpt.
-RESUME_CKPT = os.environ.get("HYDRA_RESUME_CKPT", str(LATEST_CKPT))
-# MDLM (Masked Diffusion LM) Rao-Blackwellized ELBO loss path.
-#   HYDRA_USE_MDLM=1         : switch training loss from AR sampled-softmax CE
-#                              to MDLM RB weighted CE (arXiv:2406.07524).
-#   HYDRA_MDLM_MASK_ID=N     : token id used for the MASK sentinel (default:
-#                              last valid id, vocab_size - 1). Ensure this id
-#                              never appears in training targets — typical
-#                              practice is to reserve it.
-#   HYDRA_MDLM_SCHEDULE=loglinear|linear  : noise schedule (default loglinear).
-# When enabled, the per-step flow is:
-#   1. mdlm_masked_forward_process(y)  ->  (x_noised, mask_positions, weights)
-#   2. logits = model(x_noised)                          (no targets -> full V logits)
-#   3. loss = mdlm_rb_loss(logits, y, mask_positions, weights)
-# Sampled-softmax is bypassed in this path because the RB ELBO needs
-# full-vocab logits on masked positions.
-USE_MDLM = os.environ.get("HYDRA_USE_MDLM", "0") == "1"
-MDLM_MASK_ID = int(os.environ.get("HYDRA_MDLM_MASK_ID", "-1"))  # -1 => default to vocab_size-1 at runtime
-MDLM_SCHEDULE = os.environ.get("HYDRA_MDLM_SCHEDULE", "loglinear")
-# ---------------------------------------------------------------------------
-# Schedules
-# ---------------------------------------------------------------------------
-def get_lr_multiplier(progress: float) -> float:
-    if progress < WARMUP_RATIO:
-        return progress / WARMUP_RATIO if WARMUP_RATIO > 0 else 1.0
-    decay_progress = (progress - WARMUP_RATIO) / (1.0 - WARMUP_RATIO)
-    return FINAL_LR_FRAC + 0.5 * (1.0 - FINAL_LR_FRAC) * (1 + math.cos(math.pi * decay_progress))
-def get_muon_momentum(step: int) -> float:
-    frac = min(step / 300, 1)
-    return (1 - frac) * 0.85 + frac * 0.95
-def get_weight_decay(progress: float) -> float:
-    return WEIGHT_DECAY * (1 - progress)
-_CKPT_WORKER_THREAD: threading.Thread | None = None
-def _ckpt_snapshot_state_dicts(
-    model: PostSemClawModel,
-    optimizer: torch.optim.Optimizer,
-) -> tuple[dict, dict]:
-    """Detach + CPU-clone every tensor so a bg thread can serialize safely
-    while the main loop keeps mutating live weights/optimizer state."""
-    msd = {k: (v.detach().to("cpu", copy=True) if torch.is_tensor(v) else v)
-           for k, v in model.state_dict().items()}
-    # optimizer.state_dict() is a nested dict; walk it.
-    osd_raw = optimizer.state_dict()
-    def _to_cpu(obj):
-        if torch.is_tensor(obj):
-            return obj.detach().to("cpu", copy=True)
-        if isinstance(obj, dict):
-            return {k: _to_cpu(v) for k, v in obj.items()}
-        if isinstance(obj, list):
-            return [_to_cpu(v) for v in obj]
-        if isinstance(obj, tuple):
-            return tuple(_to_cpu(v) for v in obj)
-        return obj
-    osd = _to_cpu(osd_raw)
-    return msd, osd
-def save_ckpt(
-    model: PostSemClawModel,
-    optimizer: torch.optim.Optimizer,
-    config: PostSemClawConfig,
-    step: int,
-    total_training_time: float,
-    smooth_train_loss: float,
-    bpt_ema: float,
-    epoch: int,
-    path: Path,
-    *,
-    val_bpb: float | None = None,
-    blocking: bool = False,
-) -> None:
-    """Save a training checkpoint.
-    Default behavior is async: the GPU→CPU state_dict clone runs on the main
-    thread (unavoidable; needs to happen before the next optimizer.step that
-    mutates live weights), then `torch.save` is dispatched to a daemon
-    worker thread. The next call joins any still-running prior save so only
-    one disk write is in flight.
-    `blocking=True` restores the original synchronous behavior — used for
-    end-of-training saves where correctness on process exit matters.
-    """
-    global _CKPT_WORKER_THREAD
-    try:
-        CACHE_DIR.mkdir(parents=True, exist_ok=True)
-        msd, osd = _ckpt_snapshot_state_dicts(model, optimizer)
-        # asdict() recursively converts dataclass fields to a dict and
-        # renders tuples as lists. hyena_layers therefore round-trips as a
-        # JSON-safe list; config_from_dict normalizes it back to a tuple.
-        payload = {
-            "model_state_dict": msd,
-            "optimizer_state_dict": osd,
-            "config": asdict(config),
-            "step": step,
-            "epoch": epoch,
-            "train_seconds": total_training_time,
-            "smoothed_loss": smooth_train_loss,
-            "bpt_ema": bpt_ema,
-            "val_bpb": val_bpb,
-        }
-        path_str = str(path)
-        def _rotate(p: str) -> None:
-            """Keep up to CKPT_ROTATIONS previous versions as p.1, p.2, ..."""
-            if CKPT_ROTATIONS <= 0:
-                return
-            try:
-                # Walk from oldest to newest so we don't clobber newer with older.
-                for i in range(CKPT_ROTATIONS, 0, -1):
-                    src = f"{p}.{i-1}" if i > 1 else p
-                    dst = f"{p}.{i}"
-                    if os.path.exists(src):
-                        os.replace(src, dst)
-            except Exception as e:
-                # Rotation is best-effort; never block a save on it.
-                print(f"[ckpt] rotate warn {p}: {type(e).__name__}: {e}", flush=True)
-        def _write():
-            try:
-                _rotate(path_str)
-                tmp = path_str + ".tmp"
-                torch.save(payload, tmp)
-                os.replace(tmp, path_str)
-                print(f"[ckpt] saved {path_str} (step={step})", flush=True)
-            except Exception as e:
-                print(f"[ckpt] SAVE FAILED {path_str}: {type(e).__name__}: {e}", flush=True)
-        if blocking:
-            _write()
-            return
-        # Join previous writer so at most one torch.save runs at a time.
-        if _CKPT_WORKER_THREAD is not None and _CKPT_WORKER_THREAD.is_alive():
-            _CKPT_WORKER_THREAD.join()
-        _CKPT_WORKER_THREAD = threading.Thread(
-            target=_write, daemon=True, name=f"ckpt-save-{step}"
-        )
-        _CKPT_WORKER_THREAD.start()
-    except Exception as e:
-        print(f"[ckpt] SNAPSHOT FAILED {path}: {type(e).__name__}: {e}", flush=True)
-def config_from_dict(cfg_dict: dict) -> PostSemClawConfig:
-    """Reconstruct a PostSemClawConfig from a checkpoint's asdict() payload.
-    Newly-added fields (e.g. `hyena_layers`) are defaulted when absent in
-    older checkpoints, and list-ified tuples are coerced back to tuples so
-    the dataclass keeps its declared types.
-    This is the ckpt-safe inverse of `asdict(config)` used by save_ckpt and
-    guarantees that a resume path can rebuild the exact same model topology
-    (Mamba3 vs HyenaBlock per layer) regardless of env-var state at resume.
-    """
-    # Only keep keys that are actually declared on PostSemClawConfig — extra
-    # keys in older/newer checkpoints must not crash construction.
-    field_names = {f.name for f in PostSemClawConfig.__dataclass_fields__.values()}
-    filtered = {k: v for k, v in cfg_dict.items() if k in field_names}
-    # asdict renders tuple[int,...] as list[int]; coerce back so the model
-    # builder sees the declared type.
-    if "hyena_layers" in filtered and filtered["hyena_layers"] is not None:
-        filtered["hyena_layers"] = tuple(sorted(int(x) for x in filtered["hyena_layers"]))
-    return PostSemClawConfig(**filtered)
-def _try_load_ckpt(path: Path, model, optimizer, device):
-    """Attempt to load a single ckpt. Returns the tuple on success, None on any failure."""
-    if not path.exists():
-        return None
-    ckpt = torch.load(str(path), map_location=device, weights_only=False)
-    state = ckpt.get("model_state_dict", ckpt)
-    missing, unexpected = model.load_state_dict(state, strict=False)
-    if missing:
-        print(f"[ckpt] {path.name} missing={len(missing)}", flush=True)
-    if unexpected:
-        print(f"[ckpt] {path.name} unexpected={len(unexpected)}", flush=True)
-    optimizer_state = ckpt.get("optimizer_state_dict")
-    if optimizer_state is not None:
-        try:
-            optimizer.load_state_dict(optimizer_state)
-        except Exception as e:
-            print(f"[ckpt] optimizer restore failed from {path.name}: {type(e).__name__}: {e}", flush=True)
-    step = int(ckpt.get("step", 0))
-    total_training_time = float(ckpt.get("train_seconds", 0.0))
-    smooth_train_loss = float(ckpt.get("smoothed_loss", 0.0))
-    bpt_ema = float(ckpt.get("bpt_ema", 0.0))
-    epoch = int(ckpt.get("epoch", 0))
-    print(
-        f"[ckpt] resumed {path} step={step} train_seconds={total_training_time:.1f}",
-        flush=True,
-    )
-    # Warn if resuming a schedule-exhausted ckpt — user is probably warm-starting.
-    budget = float(os.environ.get("HYDRA_TIME_BUDGET", "0") or 0)
-    if budget and total_training_time >= 0.99 * budget:
-        print(
-            f"[ckpt] WARNING: resumed ckpt used {total_training_time:.0f}s of {budget:.0f}s "
-            f"budget. LR schedule is essentially exhausted. "
-            f"Set HYDRA_WARMSTART=1 to reset optimizer + scheduler and keep only weights.",
-            flush=True,
-        )
-    return step, total_training_time, smooth_train_loss, bpt_ema, epoch
-def maybe_resume_ckpt(
-    model: PostSemClawModel,
-    optimizer: torch.optim.Optimizer,
-    device: torch.device,
-) -> tuple[int, float, float, float, int]:
-    if not RESUME_CKPT or RESUME_CKPT.lower() == "none":
-        print("[ckpt] resume disabled; starting fresh", flush=True)
-        return 0, 0.0, 0.0, 0.0, 0
-    resume_path = Path(os.path.expanduser(RESUME_CKPT))
-    # Try the primary path, then rotated backups. This is crucial because a
-    # partial / killed torch.save on the primary path would leave a corrupt
-    # file. If that fails we fall back to latest.pt.1, .2, .3 automatically.
-    candidates: list[Path] = [resume_path]
-    for i in range(1, CKPT_ROTATIONS + 1):
-        candidates.append(Path(str(resume_path) + f".{i}"))
-    for cand in candidates:
-        if not cand.exists():
-            continue
-        try:
-            result = _try_load_ckpt(cand, model, optimizer, device)
-            if result is not None:
-                if cand != resume_path:
-                    print(f"[ckpt] fell back to rotation {cand.name}", flush=True)
-                return result
-        except Exception as e:
-            print(f"[ckpt] {cand.name} load failed: {type(e).__name__}: {e}", flush=True)
-            continue
-    print(f"[ckpt] no usable checkpoint in {resume_path} + rotations; starting fresh", flush=True)
-    return 0, 0.0, 0.0, 0.0, 0
-# ---------------------------------------------------------------------------
-# Main entry
-# ---------------------------------------------------------------------------
-def main() -> None:
-    t_start = time.time()
-    torch.manual_seed(SEED)
-    torch.cuda.manual_seed(SEED)
-    # Precision / kernel-selection knobs for peak throughput on Ampere.
-    # - high : matmul uses TF32 (Ampere's 10-bit mantissa accum) for fp32 ops
-    # - allow_tf32 : explicit for both matmul + cudnn paths
-    # - cudnn.benchmark : env-gated (HYDRA_CUDNN_BENCHMARK, default OFF).
-    #   TRUE can lock in a locally-better-but-globally-slower algorithm
-    #   after the autotune phase ends, causing tps to degrade 15-20%
-    #   over the first ~100 steps. Observed 2026-04-22 and confirmed by
-    #   differential profiling. Default is now FALSE; set =1 only if you
-    #   see a specific workload where benchmark helps sustained tps.
-    torch.set_float32_matmul_precision("high")
-    torch.backends.cuda.matmul.allow_tf32 = True
-    torch.backends.cudnn.allow_tf32 = True
-    torch.backends.cudnn.benchmark = os.environ.get("HYDRA_CUDNN_BENCHMARK", "0") == "1"
-    device = torch.device("cuda")
-    autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16)
-    # Streaming path skips prepare.py (which normally trains the tokenizer
-    # and builds the retina), so we must materialize both before model init.
     if os.environ.get("HYDRA_USE_NEMOTRON", "0") == "1":
         _p_nemo.ensure_tokenizer()
-        if os.environ.get("HYDRA_THROUGHPUT_MODE", "0") != "1":
-            # Retina: HF Hub cache hit for this (vocab, n_bits, target_active) combo
-            # returns in seconds; otherwise build_retina streams Nemotron docs to
-            # compute cooccurrence + train SOM, then uploads back to the cache.
-            import subsystems.sdr_retina as _sdr_retina
-            _sdr_retina.build_retina()
-    tokenizer = Tokenizer.from_directory()
-    vocab_size = tokenizer.get_vocab_size()
-    print(f"Vocab size: {vocab_size:,}")
-    config = PostSemClawConfig(
-        sequence_len=MAX_SEQ_LEN,
-        vocab_size=vocab_size,
-        n_layer=N_LAYER,
-        d_model=D_MODEL,
-        d_state=D_STATE,
-        headdim=HEADDIM,
-        n_heads=N_HEADS,
-        expand=EXPAND,
-        engram_n_columns=ENGRAM_N_COLUMNS,
-        engram_key_dim=ENGRAM_KEY_DIM,
-        engram_layer_idx=ENGRAM_LAYER_IDX,
-    )
-    print(f"Model config: {asdict(config)}")
-    with torch.device("meta"):
-        model = PostSemClawModel(config)
-    model.to_empty(device=device)
-    model.init_weights()
-    param_counts = model.num_scaling_params()
-    print("Parameter counts:")
-    for key, value in param_counts.items():
-        print(f"  {key:24s}: {value:,}")
-    num_params = param_counts['total']
-    num_flops_per_token = model.estimate_flops()
-    print(f"Estimated FLOPs per token: {num_flops_per_token:e}")
-    tokens_per_fwdbwd = DEVICE_BATCH_SIZE * MAX_SEQ_LEN
-    assert TOTAL_BATCH_SIZE % tokens_per_fwdbwd == 0
-    grad_accum_steps = TOTAL_BATCH_SIZE // tokens_per_fwdbwd
-    optimizer = model.setup_optimizer(
-        unembedding_lr=UNEMBEDDING_LR,
-        embedding_lr=EMBEDDING_LR,
-        scalar_lr=SCALAR_LR,
-        adam_betas=ADAM_BETAS,
-        matrix_lr=MATRIX_LR,
-        weight_decay=WEIGHT_DECAY,
-    )
-    step, total_training_time, smooth_train_loss, bpt_ema, resume_epoch = maybe_resume_ckpt(
-        model, optimizer, device,
-    )
-    # Learnability #4: inform the model of the BOS token id so it can mask
-    # doc-separator positions in packed sequences. Always set (the mask only
-    # fires when HYDRA_DOC_SEP_MASK=1 is also on).
-    if hasattr(model, 'set_bos_token_id'):
-        model.set_bos_token_id(tokenizer.get_bos_token_id())
-    # Learnability #2: EMA shadow copy of weights. AveragedModel clones every
-    # parameter; we update it after every optimizer step and save it at the
-    # end alongside the raw checkpoint. Defaults OFF.
-    ema_model = None
-    if USE_EMA:
-        try:
-            from torch.optim.swa_utils import AveragedModel, get_ema_multi_avg_fn
-            # decay=EMA_DECAY; avg_fn uses get_ema_multi_avg_fn for numerical
-            # stability across bf16/fp32 mixed parameter groups.
-            ema_model = AveragedModel(
-                model,
-                multi_avg_fn=get_ema_multi_avg_fn(EMA_DECAY),
-            )
-            print(f"[EMA] enabled with decay={EMA_DECAY}")
-        except Exception as _e:
-            print(f"[EMA] disabled — AveragedModel init failed: {_e}")
-            ema_model = None
-    print("torch.compile: Muon step compiled; AdamW uses torch._fused_adamw_ (model blocks use native CUDA kernels)")
-    # Learnability #7: curriculum short-then-long. If enabled, build the
-    # initial dataloader at the short seq_len; we swap to full MAX_SEQ_LEN
-    # after CURRICULUM_SHORT_STEPS optimizer steps (see loop below).
-    _curriculum_active = CURRICULUM_SHORT_STEPS > 0 and CURRICULUM_SHORT_SEQ_LEN < MAX_SEQ_LEN
-    _current_seq_len = CURRICULUM_SHORT_SEQ_LEN if _curriculum_active else MAX_SEQ_LEN
-    if _curriculum_active:
-        print(
-            f"[CURRICULUM] starting at T={_current_seq_len} for "
-            f"{CURRICULUM_SHORT_STEPS} steps, then switching to T={MAX_SEQ_LEN}"
-        )
-    train_loader = make_dataloader(tokenizer, DEVICE_BATCH_SIZE, _current_seq_len, "train")
-    x, y, epoch = next(train_loader)  # prefetch first batch
-    if resume_epoch > 0:
-        epoch = max(epoch, resume_epoch)
-    print(f"Time budget: {TIME_BUDGET}s")
-    print(f"Gradient accumulation steps: {grad_accum_steps}")
-    # Token→byte LUT for bits-per-byte computation. evaluate_bpb in prepare.py
-    # uses total_nats / (ln(2) * total_bytes); our live metric needs to match.
-    # Without this, `bpb = loss/ln(2)` is actually bits-per-TOKEN, which at
-    # vocab=8192 scales by ~4 and makes live train bpb non-comparable with
-    # val_bpb (champion 1.279 bpb vs train printing "8.04").
-    token_bytes = get_token_bytes(device=device)
-    # -----------------------------------------------------------------------
-    # Training loop
-    # -----------------------------------------------------------------------
-    t_start_training = time.time()
-    # Async postprocessing — run SOM + Hestia on background threads so
-    # the GPU doesn't idle during their CPU-bound work.
-    _ASYNC_POSTPROCESS = os.environ.get("HYDRA_ASYNC_POSTPROCESS", "1") == "1"
-    _som_thread: threading.Thread | None = None
-    _hestia_thread: threading.Thread | None = None
-    _hestia_stream: torch.cuda.Stream | None = (
-        torch.cuda.Stream() if _ASYNC_POSTPROCESS else None
-    )
-    # HYDRA_PROFILE_STEPS=N prints a per-phase cpu/gpu time breakdown for the
-    # first N steps (and every 100th step thereafter if N<0). Zero overhead
-    # when disabled. Used to find what's eating CPU budget when GPU should
-    # be the bottleneck.
-    _profile_steps = int(os.environ.get("HYDRA_PROFILE_STEPS", "0"))
-    while True:
-        torch.cuda.synchronize()
-        t0 = time.time()
-        _prof = _profile_steps and (step < _profile_steps or (_profile_steps < 0 and step % 100 == 0))
-        _gpu_ms = 0.0
-        _data_ms = 0.0
-        for micro_step in range(grad_accum_steps):
-            if _prof:
-                torch.cuda.synchronize(); _t_micro = time.time()
-            if USE_MDLM:
-                # MDLM path: corrupt y -> x_noised, run model to get full-V logits,
-                # compute RB weighted CE on masked positions. x (original input) is
-                # unused in this path — the model only sees the noised version of y.
-                _mask_id = MDLM_MASK_ID if MDLM_MASK_ID >= 0 else (vocab_size - 1)
-                x_noised, mask_positions, loss_weights = mdlm_masked_forward_process(
-                    y, mask_token_id=_mask_id, alpha_schedule=MDLM_SCHEDULE,
-                )
-                with autocast_ctx:
-                    logits = model(x_noised)  # targets=None -> (B, T, V) logits
-                loss = mdlm_rb_loss(logits, y, mask_positions, loss_weights)
-            else:
-                with autocast_ctx:
-                    loss = model(x, y)
-            train_loss = loss.detach()
-            loss = loss / grad_accum_steps
-            loss.backward()
-            if _prof:
-                torch.cuda.synchronize()
-                _gpu_ms += (time.time() - _t_micro) * 1000
-                _t_data = time.time()
-            x, y, epoch = next(train_loader)
-            if _prof:
-                _data_ms += (time.time() - _t_data) * 1000
-        if _prof:
-            torch.cuda.synchronize(); _t_fb = time.time()
-        # Progress and schedules
-        progress = min(total_training_time / TIME_BUDGET, 1.0)
-        lrm = get_lr_multiplier(progress)
-        muon_momentum = get_muon_momentum(step)
-        muon_weight_decay = get_weight_decay(progress)
-        for group in optimizer.param_groups:
-            group["lr"] = group["initial_lr"] * lrm
-            if group['kind'] == 'muon':
-                group["momentum"] = muon_momentum
-                group["weight_decay"] = muon_weight_decay
-        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
-        optimizer.step()
-        if _prof:
-            torch.cuda.synchronize(); _t_opt = time.time()
-        # Learnability #2: EMA update after every optimizer step.
-        if ema_model is not None:
-            try:
-                ema_model.update_parameters(model)
-            except Exception as _e:
-                print(f"[EMA] update failed at step {step}: {_e}", flush=True)
-        # Learnability #7: curriculum transition. After
-        # CURRICULUM_SHORT_STEPS optimizer steps, rebuild the dataloader at
-        # MAX_SEQ_LEN. Done once, then the flag flips off.
-        if _curriculum_active and step + 1 >= CURRICULUM_SHORT_STEPS:
-            print(
-                f"[CURRICULUM] step={step+1} — switching from T={_current_seq_len} "
-                f"to T={MAX_SEQ_LEN}",
-                flush=True,
-            )
-            _current_seq_len = MAX_SEQ_LEN
-            _curriculum_active = False
-            train_loader = make_dataloader(tokenizer, DEVICE_BATCH_SIZE, _current_seq_len, "train")
-            # Prefetch the next batch at the new seq_len so the following
-            # loop iteration consumes fresh data.
-            x, y, epoch = next(train_loader)
-        # Online SOM update — retina is now a plain Python attribute (not a
-        # registered buffer) so mutations do not invalidate torch.compile guards.
-        # Runs fully on CPU; safe to overlap with GPU forward pass.
-        _last_sdr = getattr(model, "_last_sdr", None)
-        if _last_sdr is not None:
-            if _ASYNC_POSTPROCESS:
-                if _som_thread is not None:
-                    _som_thread.join()
-                # Clone tensors before next step overwrites them
-                _som_x = x.clone()
-                _som_sdr = _last_sdr.clone()
-                _som_thread = threading.Thread(
-                    target=model.sdr_semantic.maybe_som_update,
-                    args=(_som_x, _som_sdr),
-                    daemon=True,
-                )
-                _som_thread.start()
-            else:
-                model.sdr_semantic.maybe_som_update(x, _last_sdr)
-        # Hestia QAT — anneal temperature every step, snap every N steps.
-        # apply_to walks all Linear modules (CPU) then does .data.copy_ (GPU).
-        # Background thread + separate CUDA stream lets this overlap with
-        # the next forward pass on the default stream.
-        _hestia_progress = (time.time() - t_start_training) / max(TIME_BUDGET, 1)
-        _hestia_interval = int(os.environ.get("HYDRA_HESTIA_INTERVAL", "100"))
-        if step % _hestia_interval == 0:
-            if _ASYNC_POSTPROCESS:
-                if _hestia_thread is not None:
-                    _hestia_thread.join()
-                def _hestia_bg(mdl: torch.nn.Module, prog: float) -> None:
-                    assert _hestia_stream is not None
-                    with torch.cuda.stream(_hestia_stream):
-                        mdl.hestia.anneal_temperature(prog)
-                        mdl.hestia.apply_to(mdl)
-                _hestia_thread = threading.Thread(
-                    target=_hestia_bg,
-                    args=(model, _hestia_progress),
-                    daemon=True,
-                )
-                _hestia_thread.start()
-            else:
-                model.hestia.anneal_temperature(_hestia_progress)
-                model.hestia.apply_to(model)
-        else:
-            # anneal_temperature is cheap (~1 us), keep inline
-            model.hestia.anneal_temperature(_hestia_progress)
-        model.zero_grad(set_to_none=True)
-        train_loss_f = train_loss.item()
-        if math.isnan(train_loss_f) or train_loss_f > 100:
-            print("FAIL")
-            # Save to a DIFFERENT file — never clobber a good latest.pt with
-            # a NaN/diverged state. The good ckpt from the last periodic save
-            # is the right place to resume from.
-            save_ckpt(
-                model,
-                optimizer,
-                config,
-                step,
-                total_training_time,
-                smooth_train_loss,
-                bpt_ema,
-                epoch,
-                FAILED_CKPT,
-                blocking=True,
-            )
-            raise SystemExit(1)
-        torch.cuda.synchronize()
-        t1 = time.time()
-        dt = t1 - t0
-        if _prof:
-            fb = (_t_fb - t0) * 1000
-            opt = (_t_opt - _t_fb) * 1000
-            rest = (t1 - _t_opt) * 1000
-            print(
-                f"[PROF step={step:05d}] gpu={_gpu_ms:.0f}ms data_fetch={_data_ms:.0f}ms "
-                f"(sum_fb={fb:.0f}) opt={opt:.0f}ms rest={rest:.0f}ms total={dt*1000:.0f}ms",
-                flush=True,
-            )
-        if step > 10:
-            total_training_time += dt
-        ema_beta = 0.9
-        smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss_f
-        debiased_smooth_loss = smooth_train_loss / (1 - ema_beta ** (step + 1))
-        pct_done = 100 * progress
-        tok_per_sec = int(TOTAL_BATCH_SIZE / dt)
-        mfu = 100 * num_flops_per_token * TOTAL_BATCH_SIZE / dt / GPU_BF16_PEAK_FLOPS
-        remaining = max(0, TIME_BUDGET - total_training_time)
-        # Bytes-per-token for the CURRENT batch. evaluate_bpb in prepare.py
-        # computes bits-per-BYTE (total_nats / (ln2 * total_bytes)); to match
-        # that semantics live, we EMA-smooth the per-batch bytes/token and
-        # divide. Without this, the old `bpb = loss/ln2` was actually
-        # bits-per-token — ~4× larger than val_bpb at vocab=8192 and
-        # therefore not comparable to the champion 1.279 bpb metric.
-        with torch.no_grad():
-            y_flat = y.view(-1)
-            nbytes_batch = token_bytes[y_flat]
-            mask = nbytes_batch > 0
-            mask_count = mask.sum().clamp(min=1).float()
-            avg_bytes_per_tok = (nbytes_batch.float() * mask.float()).sum() / mask_count
-            bpt_batch = float(avg_bytes_per_tok.item())
-        if step == 0 or bpt_ema <= 0.0:
-            bpt_ema = bpt_batch
-        else:
-            bpt_ema = 0.98 * bpt_ema + 0.02 * bpt_batch
-        # Dual metric: bpb (byte-normalized, comparable with val_bpb) AND
-        # bpt (bits per token, the raw loss in bits). bpt_div exposes the
-        # current avg bytes-per-token so the conversion is transparent.
-        bpt = debiased_smooth_loss / math.log(2)
-        bpb = bpt / max(bpt_ema, 1e-6)
-        vram_mib = torch.cuda.memory_allocated() / 1024 / 1024
-        current_lr = optimizer.param_groups[0]["lr"]
-        # Per-step line-buffered log. NOT \r-overwritten so tee/grep see it.
-        # Keep key=value pairs grep-friendly.
-        ppl = 2.0 ** bpb  # perplexity (byte-level)
-        print(
-            f"step={step:05d} loss={debiased_smooth_loss:.4f} bpb={bpb:.4f} ppl={ppl:.3f} "
-            f"bpt={bpt:.3f} bpt_div={bpt_ema:.2f} "
-            f"tps={tok_per_sec} dt_ms={dt*1000:.0f} mfu={mfu:.1f} "
-            f"lr={current_lr:.2e} vram={vram_mib:.0f}MiB "
-            f"pct={pct_done:.1f} epoch={epoch} remaining={remaining:.0f}s",
-            flush=True,
-        )
-        if step == 0:
-            gc.collect()
-            gc.freeze()
-            gc.disable()
-        # No periodic gc.collect() — we disabled+froze at step 0 on purpose,
-        # so a manual collect every 5k steps just re-scans frozen objects
-        # (burned ~900 ms/event in production) for no live-garbage reason.
-        if CKPT_INTERVAL > 0 and step > 0 and step % CKPT_INTERVAL == 0:
-            save_ckpt(
-                model,
-                optimizer,
-                config,
-                step,
-                total_training_time,
-                smooth_train_loss,
-                bpt_ema,
-                epoch,
-                LATEST_CKPT,
-            )
-        # Periodic mid-training validation so we can see the model learning
-        # English in real time (not just at the end). Small val batch so it
-        # doesn't eat significant training time.
-        mid_val_interval = int(os.environ.get("HYDRA_MID_VAL_INTERVAL", "500"))
-        if mid_val_interval > 0 and step > 0 and step % mid_val_interval == 0:
-            model.eval()
-            try:
-                # Defrag GPU memory before eval allocates fresh chunks —
-                # without this the eval path can OOM on 6GB cards even
-                # though total usage fits, because the allocator's free
-                # blocks are fragmented.
-                torch.cuda.empty_cache()
-                _orig_mid = _prepare_mod.EVAL_TOKENS
-                _prepare_mod.EVAL_TOKENS = 262144  # ~260K tokens, fast
-                with torch.no_grad():
-                    with autocast_ctx:
-                        mid_bpb = evaluate_bpb(model, tokenizer, DEVICE_BATCH_SIZE)
-                _prepare_mod.EVAL_TOKENS = _orig_mid
-                mid_ppl = 2.0 ** mid_bpb
-                print(f"[MID_VAL] step={step} val_bpb={mid_bpb:.4f} val_ppl={mid_ppl:.3f}", flush=True)
-                # Per-layer diagnostic panel. Only printed when HYDRA_LAYER_DIAGNOSTICS=1
-                # is set (otherwise the layer_* keys are absent from _metrics).
-                _diag_metrics = model.get_secondary_metrics()
-                _layer_keys = sorted([k for k in _diag_metrics.keys() if k.startswith('layer_')])
-                if _layer_keys:
-                    # Condense: one row per layer showing the four core signals.
-                    n_layers = len(model.blocks)
-                    print(f"[LAYER_DIAG] step={step}", flush=True)
-                    for li in range(n_layers):
-                        d_ratio = _diag_metrics.get(f'layer_{li}_delta_ratio', float('nan'))
-                        out_n   = _diag_metrics.get(f'layer_{li}_out_norm',    float('nan'))
-                        g_norm  = _diag_metrics.get(f'layer_{li}_grad_norm',   float('nan'))
-                        eff_r   = _diag_metrics.get(f'layer_{li}_eff_rank',    float('nan'))
-                        f_std   = _diag_metrics.get(f'layer_{li}_feat_std',    float('nan'))
-                        print(
-                            f"[LAYER_DIAG]   L{li:02d}  delta_ratio={d_ratio:.4f}  "
-                            f"out_norm={out_n:.4f}  grad_norm={g_norm:.3e}  "
-                            f"eff_rank={eff_r:.1f}  feat_std={f_std:.4f}",
-                            flush=True,
-                        )
-                    htm_proj_g = _diag_metrics.get('htm_proj_grad_norm', None)
-                    if htm_proj_g is not None:
-                        print(f"[LAYER_DIAG]   htm_proj grad_norm={htm_proj_g:.3e}", flush=True)
-            except Exception as e:
-                print(f"[MID_VAL] failed: {e}", flush=True)
-            model.train()
-        step += 1
-        if step > 10 and total_training_time >= TIME_BUDGET:
-            break
-    # Drain async postprocessing threads before eval
-    if _som_thread is not None:
-        _som_thread.join()
-    if _hestia_thread is not None:
-        _hestia_thread.join()
-    if _hestia_stream is not None:
-        _hestia_stream.synchronize()
-    total_tokens = step * TOTAL_BATCH_SIZE
-    # ----------------------------------------------------------------------
-    # SAVE ORDER (critical):
-    #   1. Save PRETRAIN_FINAL_CKPT with val_bpb=None  (hedge against eval OOM)
-    #   2. Save LATEST_CKPT with val_bpb=None          (hedge against eval OOM)
-    #   3. Run eval (may OOM on small GPUs; we survive it)
-    #   4. Re-save both ckpts with val_bpb filled in
-    # This way we NEVER lose the final trained weights to an eval crash.
-    # Previous ordering put eval first, so an eval-time OOM destroyed the
-    # only record of a 6h training run (2026-04-22 incident).
-    # ----------------------------------------------------------------------
-    save_ckpt(
-        model, optimizer, config, step, total_training_time,
-        smooth_train_loss, bpt_ema, epoch, PRETRAIN_FINAL_CKPT,
-        val_bpb=None, blocking=True,
-    )
-    save_ckpt(
-        model, optimizer, config, step, total_training_time,
-        smooth_train_loss, bpt_ema, epoch, LATEST_CKPT,
-        val_bpb=None, blocking=True,
-    )
-    # Now it's safe to eval — ckpts are on disk regardless of what happens here.
-    # HYDRA_EVAL_BATCH overrides DEVICE_BATCH_SIZE (env-tunable; default halves
-    # the training batch because eval holds activations for full sequence and
-    # does not benefit from overlap with backward). HYDRA_EVAL_TOKENS controls
-    # how many val tokens to sweep (default 2 M, short enough for autoresearch
-    # 5-min budgets).
-    val_bpb: float | None = None
-    _eval_B = int(os.environ.get("HYDRA_EVAL_BATCH", str(max(1, DEVICE_BATCH_SIZE // 2))))
-    _eval_tokens = int(os.environ.get("HYDRA_EVAL_TOKENS", str(2 * 524288)))
-    try:
-        # Aggressive VRAM reclaim for 6GB cards. Peak training VRAM = 5.1GB
-        # which leaves < 1GB for the eval forward — the driver can't satisfy
-        # the allocation. Free EVERY tensor we don't strictly need:
-        #   - optimizer grads (set_to_none releases tensor)
-        #   - optimizer.state (fp32 Muon NS workspace, AdamW moments — ~size-of-params each)
-        #   - model internal caches (HTM subsample cache, SDR stash)
-        # After this, VRAM should be ~params only (bf16 ≈ 120MB at 60M params).
-        optimizer.zero_grad(set_to_none=True)
-        if hasattr(optimizer, 'state') and optimizer.state:
-            for p, st in list(optimizer.state.items()):
-                st.clear()
-            optimizer.state.clear()
-        for p in model.parameters():
-            if p.grad is not None:
-                p.grad = None
-        if hasattr(model, '_htm_cache'):
-            model._htm_cache = None
-        if hasattr(model, '_last_sdr'):
-            model._last_sdr = None
-        import gc as _gc
-        _gc.collect()
-        torch.cuda.empty_cache()
-        torch.cuda.synchronize()
-        try:
-            _free_mb = torch.cuda.mem_get_info()[0] / 1024 / 1024
-            print(f"[VAL] free_vram_mb={_free_mb:.0f} (cleared optimizer state)", flush=True)
-        except Exception:
-            pass
-        print(f"[VAL] running eval on {_eval_tokens} tokens at B={_eval_B}...", flush=True)
-        model.eval()
-        _orig = _prepare_mod.EVAL_TOKENS
-        _prepare_mod.EVAL_TOKENS = _eval_tokens
-        with autocast_ctx:
-            val_bpb = evaluate_bpb(model, tokenizer, _eval_B)
-        _prepare_mod.EVAL_TOKENS = _orig
-        val_ppl = 2 ** val_bpb
-        print(f"[VAL] step={step} val_bpb={val_bpb:.4f} val_ppl={val_ppl:.3f}", flush=True)
-    except torch.cuda.OutOfMemoryError as e:
-        print(f"[VAL] SKIPPED (OOM): {e}", flush=True)
-        torch.cuda.empty_cache()
-    except Exception as e:
-        import traceback as _tb
-        print(f"[VAL] SKIPPED ({type(e).__name__}): {e}", flush=True)
-        _tb.print_exc()
-        try:
-            _free = torch.cuda.mem_get_info()[0] / 1024 / 1024
-            print(f"[VAL] post-crash free_vram_mb={_free:.0f}", flush=True)
-        except Exception:
-            pass
-    # Final ckpts with val_bpb filled in (if eval succeeded).
-    save_ckpt(
-        model, optimizer, config, step, total_training_time,
-        smooth_train_loss, bpt_ema, epoch, LATEST_CKPT,
-        val_bpb=val_bpb, blocking=True,
-    )
-    save_ckpt(
-        model, optimizer, config, step, total_training_time,
-        smooth_train_loss, bpt_ema, epoch, PRETRAIN_FINAL_CKPT,
-        val_bpb=val_bpb, blocking=True,
-    )
-    # Learnability #2: persist EMA weights alongside the raw checkpoint.
-    # latest_ema.pt contains ema_model.module (the Averaged params) so it
-    # can be loaded by evaluation / inference code that expects the same
-    # state_dict shape as the raw model.
-    if ema_model is not None:
-        try:
-            ema_ckpt_path = CACHE_DIR / "latest_ema.pt"
-            CACHE_DIR.mkdir(parents=True, exist_ok=True)
-            torch.save({
-                "model_state_dict": ema_model.module.state_dict(),
-                "config": asdict(config),
-                "step": step,
-                "epoch": epoch,
-                "train_seconds": total_training_time,
-                "val_bpb": val_bpb,
-                "ema_decay": EMA_DECAY,
-            }, str(ema_ckpt_path))
-            print(f"[EMA] saved {ema_ckpt_path} (step={step})", flush=True)
-        except Exception as _e:
-            print(f"[EMA] save failed: {_e}", flush=True)
-    run_factual_probes(model, tokenizer, device, autocast_ctx)
-    t_end = time.time()
-    startup_time = t_start_training - t_start
-    steady_state_mfu = (
-        100 * num_flops_per_token * TOTAL_BATCH_SIZE * (step - 10)
-        / total_training_time / GPU_BF16_PEAK_FLOPS
-        if total_training_time > 0 else 0
-    )
-    peak_vram_mb = torch.cuda.max_memory_allocated() / 1024 / 1024
-    metrics = model.get_secondary_metrics()
-    print("---")
-    print(f"val_bpb:          {val_bpb:.6f}" if val_bpb is not None else "val_bpb:          SKIPPED")
-    print(f"training_seconds: {total_training_time:.1f}")
-    print(f"total_seconds:    {t_end - t_start:.1f}")
-    print(f"peak_vram_mb:     {peak_vram_mb:.1f}")
-    print(f"mfu_percent:      {steady_state_mfu:.2f}")
-    print(f"total_tokens_M:   {total_tokens / 1e6:.1f}")
-    print(f"num_steps:        {step}")
-    print(f"num_params_M:     {num_params / 1e6:.1f}")
-    print(f"n_layer:          {N_LAYER}")
-    print(f"d_model:          {D_MODEL}")
-    print(f"engram_hit_rate:   {metrics.get('engram_hit_rate', 0.0):.4f}")
-    print(f"sdr_active_bits:  {metrics.get('sdr_active_bits', 0):.1f}")
-    print(f"htm_anomaly:      {metrics.get('htm_anomaly', 0):.4f}")
-    # Per-layer summary panel — only printed when diagnostics were active.
-    _layer_keys = sorted([k for k in metrics.keys() if k.startswith('layer_')])
-    if _layer_keys:
-        n_layers = len(model.blocks)
-        print("--- per-layer diagnostic panel ---")
-        for li in range(n_layers):
-            d_ratio = metrics.get(f'layer_{li}_delta_ratio', float('nan'))
-            out_n   = metrics.get(f'layer_{li}_out_norm',    float('nan'))
-            g_norm  = metrics.get(f'layer_{li}_grad_norm',   float('nan'))
-            eff_r   = metrics.get(f'layer_{li}_eff_rank',    float('nan'))
-            f_std   = metrics.get(f'layer_{li}_feat_std',    float('nan'))
-            print(
-                f"L{li:02d}  delta_ratio={d_ratio:.4f}  out_norm={out_n:.4f}  "
-                f"grad_norm={g_norm:.3e}  eff_rank={eff_r:.1f}  feat_std={f_std:.4f}"
-            )
-    # Emit full metrics dictionary as JSON for sweep aggregation. Path from
-    # HYDRA_METRICS_OUT env var; default=/tmp/hydra_run_metrics.json. Always
-    # written (even without diagnostics) so the aggregator can compare runs.
-    _metrics_out = os.environ.get("HYDRA_METRICS_OUT", "/tmp/hydra_run_metrics.json")
-    try:
-        _dump = dict(metrics)
-        _dump.update({
-            'val_bpb': float(val_bpb),
-            'val_ppl': float(val_ppl),
-            'n_layer': int(N_LAYER),
-            'd_model': int(D_MODEL),
-            'num_params_M': float(num_params / 1e6),
-            'num_steps': int(step),
-            'total_tokens_M': float(total_tokens / 1e6),
-            'peak_vram_mb': float(peak_vram_mb),
-            'training_seconds': float(total_training_time),
-            'sdr_target_active': int(os.environ.get("HYDRA_SDR_TARGET_ACTIVE", "327")),
-        })
-        Path(_metrics_out).parent.mkdir(parents=True, exist_ok=True)
-        with open(_metrics_out, 'w') as _f:
-            json.dump(_dump, _f, indent=2, sort_keys=True)
-        print(f"[METRICS] wrote {_metrics_out}", flush=True)
-        # Also emit a single-line JSON to stdout so the sweep aggregator can
-        # scrape it from HF Jobs logs without pulling files out of the container.
-        print("[METRICS_JSON] " + json.dumps(_dump, sort_keys=True), flush=True)
-    except Exception as _e:
-        print(f"[METRICS] write failed: {_e}", flush=True)
-    run_factual_english(model, tokenizer, MAX_SEQ_LEN)
-    # startup_time is informative but not printed (preserve historical output)
-    _ = startup_time

+"""HYDRA training entry: setup, train loop, eval, summary.
+Extracted from the monolithic train.py (W1 modularization). Semantics
+preserved. Public entrypoint: `main()`.
+"""
+from __future__ import annotations
+import gc
+import json
+import math
+import os
+import sys
+import threading
+import time
+from dataclasses import asdict
+from pathlib import Path
+import torch
+# Line-buffered stdout so `python -u train.py | tee run.log | grep step` is
+# live (no \r overwrite, no 4k block-buffered pipe stalls). Safe on Python
+# 3.7+ where io.TextIOWrapper.reconfigure exists.
+try:
+    sys.stdout.reconfigure(line_buffering=True)  # type: ignore[attr-defined]
+except Exception:
+    pass
+from hydra.config import (
+    ADAM_BETAS, CURRICULUM_SHORT_SEQ_LEN, CURRICULUM_SHORT_STEPS,
+    D_MODEL, D_STATE, DEVICE_BATCH_SIZE, EMA_DECAY, EMBEDDING_LR,
+    ENGRAM_KEY_DIM, ENGRAM_LAYER_IDX, ENGRAM_N_COLUMNS, EXPAND,
+    FINAL_LR_FRAC, GPU_BF16_PEAK_FLOPS, HEADDIM, MATRIX_LR, N_HEADS,
+    N_LAYER, PostSemClawConfig, SCALAR_LR, SEED, TOTAL_BATCH_SIZE,
+    UNEMBEDDING_LR, USE_EMA, WARMUP_RATIO, WEIGHT_DECAY,
+)
+from hydra.diffusion_loss import mdlm_masked_forward_process, mdlm_rb_loss
+from hydra.eval import run_factual_english, run_factual_probes
+from hydra.model import PostSemClawModel
+import prepare as _prepare_mod
+from prepare import MAX_SEQ_LEN, TIME_BUDGET as _TIME_BUDGET, Tokenizer, evaluate_bpb as _evaluate_bpb_shards, get_token_bytes, make_dataloader as _make_dataloader_shards
+# Streaming Nemotron path (Super3 recipe). Opt-in via HYDRA_USE_NEMOTRON=1.
+# Bind the data pipeline once, at import time: the prepare_nemotron
+# implementations when HYDRA_USE_NEMOTRON=1, otherwise the functions imported
+# from prepare. NOTE: `_p_nemo` is only defined when the flag is set, so any
+# later use of it must sit behind the same env check.
+if os.environ.get("HYDRA_USE_NEMOTRON", "0") == "1":
+    import prepare_nemotron as _p_nemo
+    make_dataloader = _p_nemo.make_dataloader
+    evaluate_bpb = _p_nemo.evaluate_bpb
+else:
+    make_dataloader = _make_dataloader_shards
+    evaluate_bpb = _evaluate_bpb_shards
+# Training time budget in seconds. HYDRA_TIME_BUDGET overrides the default
+# imported from prepare; it is parsed with int(), so a non-integer value
+# raises ValueError at import time.
+TIME_BUDGET = int(os.environ.get("HYDRA_TIME_BUDGET", str(_TIME_BUDGET)))
+_prepare_mod.TIME_BUDGET = TIME_BUDGET  # sync for evaluate_bpb
+# Checkpoint locations, all under ~/.cache/autoresearch.
+CACHE_DIR = Path.home() / ".cache" / "autoresearch"
+LATEST_CKPT = CACHE_DIR / "latest.pt"  # default resume target (see RESUME_CKPT)
+# NOTE(review): presumably the end-of-run snapshot; its writer is outside this view.
+PRETRAIN_FINAL_CKPT = CACHE_DIR / "pretrain_final.pt"
+FAILED_CKPT = CACHE_DIR / "latest_failed.pt"          # crash/FAIL path — never overwrites good
+BEST_CKPT = CACHE_DIR / "best_bpb.pt"                 # lowest val_bpb seen
+# NOTE(review): presumably the number of steps between periodic saves; the
+# consumer is not visible here — confirm against the training loop.
+CKPT_INTERVAL = int(os.environ.get("HYDRA_CKPT_INTERVAL", "250"))
+CKPT_ROTATIONS = int(os.environ.get("HYDRA_CKPT_ROTATIONS", "3"))  # how many .N backups to keep
+# Path to resume from. An empty string or "none" (any case) disables resume;
+# maybe_resume_ckpt() also probes the rotated backups <path>.1 .. <path>.N.
+RESUME_CKPT = os.environ.get("HYDRA_RESUME_CKPT", str(LATEST_CKPT))
+# MDLM (Masked Diffusion LM) Rao-Blackwellized ELBO loss path.
+#   HYDRA_USE_MDLM=1         : switch training loss from AR sampled-softmax CE
+#                              to MDLM RB weighted CE (arXiv:2406.07524).
+#   HYDRA_MDLM_MASK_ID=N     : token id used for the MASK sentinel (default:
+#                              last valid id, vocab_size - 1). Ensure this id
+#                              never appears in training targets — typical
+#                              practice is to reserve it.
+#   HYDRA_MDLM_SCHEDULE=loglinear|linear  : noise schedule (default loglinear).
+# When enabled, the per-step flow is:
+#   1. mdlm_masked_forward_process(y)  ->  (x_noised, mask_positions, weights)
+#   2. logits = model(x_noised)                          (no targets -> full V logits)
+#   3. loss = mdlm_rb_loss(logits, y, mask_positions, weights)
+# Sampled-softmax is bypassed in this path because the RB ELBO needs
+# full-vocab logits on masked positions.
+USE_MDLM = os.environ.get("HYDRA_USE_MDLM", "0") == "1"
+MDLM_MASK_ID = int(os.environ.get("HYDRA_MDLM_MASK_ID", "-1"))  # -1 => default to vocab_size-1 at runtime
+MDLM_SCHEDULE = os.environ.get("HYDRA_MDLM_SCHEDULE", "loglinear")
+# ---------------------------------------------------------------------------
+# Schedules
+# ---------------------------------------------------------------------------
+def get_lr_multiplier(progress: float) -> float:
+    """Return the learning-rate scale factor for a training-progress fraction.
+    The schedule is a linear warmup from 0 to 1 over the first
+    ``WARMUP_RATIO`` of progress, followed by a half-cosine decay from 1 down
+    to ``FINAL_LR_FRAC`` over the remainder.
+    Args:
+        progress: Fraction of the run completed, nominally in [0, 1].
+    Returns:
+        Multiplier to apply to each base learning rate.
+    Edge cases:
+        * ``WARMUP_RATIO >= 1`` leaves no decay span; once warmup is done the
+          multiplier stays at 1.0 instead of dividing by zero.
+        * ``progress`` past 1.0 is held at ``FINAL_LR_FRAC`` rather than
+          letting the cosine swing back up.
+    """
+    if progress < WARMUP_RATIO:
+        return progress / WARMUP_RATIO if WARMUP_RATIO > 0 else 1.0
+    decay_span = 1.0 - WARMUP_RATIO
+    if decay_span <= 0:
+        # Warmup covers the whole run: there is nothing to decay over.
+        return 1.0
+    # Clamp so an overshooting caller cannot push the cosine past its minimum.
+    decay_progress = min((progress - WARMUP_RATIO) / decay_span, 1.0)
+    return FINAL_LR_FRAC + 0.5 * (1.0 - FINAL_LR_FRAC) * (1 + math.cos(math.pi * decay_progress))
+def get_muon_momentum(step: int) -> float:
+    """Blend the Muon momentum from 0.85 at step 0 to 0.95 at step 300, then hold it there."""
+    ramp = step / 300
+    if ramp > 1:
+        # Past the ramp window: saturate at the final momentum.
+        ramp = 1
+    return (1 - ramp) * 0.85 + ramp * 0.95
+def get_weight_decay(progress: float) -> float:
+    """Scale ``WEIGHT_DECAY`` by the fraction of the run still remaining (``1 - progress``)."""
+    remaining_fraction = 1 - progress
+    return WEIGHT_DECAY * remaining_fraction
+# Handle to the most recent background checkpoint writer started by
+# save_ckpt(); stays None until the first non-blocking save.
+_CKPT_WORKER_THREAD: threading.Thread | None = None
+def _ckpt_snapshot_state_dicts(
+    model: PostSemClawModel,
+    optimizer: torch.optim.Optimizer,
+) -> tuple[dict, dict]:
+    """Return ``(model_state, optimizer_state)`` as independent CPU copies.
+    Every tensor is detached and copied to CPU, so a background thread can
+    serialize the snapshot while training keeps mutating the live weights
+    and optimizer state.
+    """
+    def _cpu_copy(tensor):
+        return tensor.detach().to("cpu", copy=True)
+    def _snapshot(node):
+        # The optimizer state_dict is a nested structure; rebuild it with the
+        # same container kinds, copying tensors and passing other leaves through.
+        if torch.is_tensor(node):
+            return _cpu_copy(node)
+        if isinstance(node, dict):
+            return {key: _snapshot(child) for key, child in node.items()}
+        if isinstance(node, list):
+            return [_snapshot(child) for child in node]
+        if isinstance(node, tuple):
+            return tuple(_snapshot(child) for child in node)
+        return node
+    # The model state_dict is handled one level deep only: tensor values are
+    # copied, any other value is kept as the same object.
+    model_state = {}
+    for name, value in model.state_dict().items():
+        model_state[name] = _cpu_copy(value) if torch.is_tensor(value) else value
+    optimizer_state = _snapshot(optimizer.state_dict())
+    return model_state, optimizer_state
+def save_ckpt(
+    model: PostSemClawModel,
+    optimizer: torch.optim.Optimizer,
+    config: PostSemClawConfig,
+    step: int,
+    total_training_time: float,
+    smooth_train_loss: float,
+    bpt_ema: float,
+    epoch: int,
+    path: Path,
+    *,
+    val_bpb: float | None = None,
+    blocking: bool = False,
+) -> None:
+    """Save a training checkpoint to ``path``.
+    The state_dict → CPU clone always runs on the calling thread (it has to
+    finish before the next optimizer.step mutates live weights). By default
+    the disk write is then handed to a daemon worker thread; any writer that
+    is still running is joined first, so at most one write is ever in flight.
+    Args:
+        model, optimizer: Objects whose state_dicts are snapshotted.
+        config: Dataclass config, stored via ``asdict``.
+        step, total_training_time, smooth_train_loss, bpt_ema, epoch:
+            Training-progress scalars stored alongside the weights.
+        path: Destination file. Up to ``CKPT_ROTATIONS`` previous versions
+            are kept as ``<path>.1``, ``<path>.2``, ...
+        val_bpb: Optional validation metric to record.
+        blocking: Write synchronously on the calling thread. Used for
+            end-of-training saves where durability on process exit matters.
+    Never raises: snapshot and write failures are printed and swallowed so a
+    checkpoint problem cannot kill a training run.
+    """
+    global _CKPT_WORKER_THREAD
+    try:
+        CACHE_DIR.mkdir(parents=True, exist_ok=True)
+        msd, osd = _ckpt_snapshot_state_dicts(model, optimizer)
+        # asdict() recursively converts dataclass fields to a dict and
+        # renders tuples as lists. hyena_layers therefore round-trips as a
+        # JSON-safe list; config_from_dict normalizes it back to a tuple.
+        payload = {
+            "model_state_dict": msd,
+            "optimizer_state_dict": osd,
+            "config": asdict(config),
+            "step": step,
+            "epoch": epoch,
+            "train_seconds": total_training_time,
+            "smoothed_loss": smooth_train_loss,
+            "bpt_ema": bpt_ema,
+            "val_bpb": val_bpb,
+        }
+        path_str = str(path)
+        def _rotate(p: str) -> None:
+            """Keep up to CKPT_ROTATIONS previous versions as p.1, p.2, ..."""
+            if CKPT_ROTATIONS <= 0:
+                return
+            try:
+                # Walk from oldest to newest so we don't clobber newer with older.
+                for i in range(CKPT_ROTATIONS, 0, -1):
+                    src = f"{p}.{i-1}" if i > 1 else p
+                    dst = f"{p}.{i}"
+                    if os.path.exists(src):
+                        os.replace(src, dst)
+            except Exception as e:
+                # Rotation is best-effort; never block a save on it.
+                print(f"[ckpt] rotate warn {p}: {type(e).__name__}: {e}", flush=True)
+        def _write():
+            try:
+                tmp = path_str + ".tmp"
+                # Serialize first, rotate second: if torch.save fails (disk
+                # full, unpicklable payload, ...) the current good checkpoint
+                # stays at its primary path instead of being shifted to .1.
+                torch.save(payload, tmp)
+                _rotate(path_str)
+                os.replace(tmp, path_str)
+                print(f"[ckpt] saved {path_str} (step={step})", flush=True)
+            except Exception as e:
+                print(f"[ckpt] SAVE FAILED {path_str}: {type(e).__name__}: {e}", flush=True)
+        # Join any in-flight writer before touching the disk, for blocking
+        # saves as well: two writers on the same path would otherwise rotate
+        # and write the same "<path>.tmp" file concurrently.
+        if _CKPT_WORKER_THREAD is not None and _CKPT_WORKER_THREAD.is_alive():
+            _CKPT_WORKER_THREAD.join()
+        if blocking:
+            _write()
+            return
+        _CKPT_WORKER_THREAD = threading.Thread(
+            target=_write, daemon=True, name=f"ckpt-save-{step}"
+        )
+        _CKPT_WORKER_THREAD.start()
+        # Non-default checkpoint paths are usually tests or one-off utilities that
+        # expect save_ckpt() to be durable when it returns. Keep the hot training
+        # path async for CACHE_DIR checkpoints, but make explicit custom paths
+        # deterministic.
+        if path.parent.resolve() != CACHE_DIR.resolve():
+            _CKPT_WORKER_THREAD.join()
+    except Exception as e:
+        print(f"[ckpt] SNAPSHOT FAILED {path}: {type(e).__name__}: {e}", flush=True)
+def config_from_dict(cfg_dict: dict) -> PostSemClawConfig:
+    """Rebuild a PostSemClawConfig from the ``asdict()`` payload stored in a checkpoint.
+    This is the inverse of the ``asdict(config)`` call made by save_ckpt:
+      * keys that are not declared on PostSemClawConfig are dropped, so
+        checkpoints written by older or newer code do not crash construction;
+      * fields absent from the payload fall back to the dataclass defaults;
+      * ``hyena_layers`` (rendered as a list by asdict) is coerced back to a
+        sorted tuple of ints so the dataclass keeps its declared type.
+    A resume can therefore rebuild the same model topology regardless of the
+    env-var state at resume time.
+    """
+    declared = PostSemClawConfig.__dataclass_fields__
+    kwargs = {}
+    for key, value in cfg_dict.items():
+        if key in declared:
+            kwargs[key] = value
+    hyena = kwargs.get("hyena_layers")
+    if hyena is not None:
+        kwargs["hyena_layers"] = tuple(sorted(map(int, hyena)))
+    return PostSemClawConfig(**kwargs)
+def _try_load_ckpt(path: Path, model, optimizer, device):
+    """Load one checkpoint file into ``model`` and ``optimizer``.
+    Args:
+        path: Checkpoint file to read.
+        model: Module that receives the weights (``strict=False``; the
+            counts of missing / unexpected keys are printed).
+        optimizer: Optimizer that receives the saved state. A failed
+            optimizer restore is logged and skipped; the weights are kept.
+        device: ``map_location`` passed to ``torch.load``.
+    Returns:
+        ``(step, train_seconds, smoothed_loss, bpt_ema, epoch)`` on success,
+        or ``None`` when ``path`` does not exist.
+    Raises:
+        Whatever ``torch.load`` / ``load_state_dict`` raise on a corrupt or
+        incompatible file. The caller is expected to catch these and try
+        the next candidate.
+    """
+    if not path.exists():
+        return None
+    # NOTE(security): weights_only=False unpickles arbitrary objects. Only
+    # point this at checkpoint files from a trusted source.
+    ckpt = torch.load(str(path), map_location=device, weights_only=False)
+    # Accept both the save_ckpt payload and a bare state_dict.
+    state = ckpt.get("model_state_dict", ckpt)
+    missing, unexpected = model.load_state_dict(state, strict=False)
+    if missing:
+        print(f"[ckpt] {path.name} missing={len(missing)}", flush=True)
+    if unexpected:
+        print(f"[ckpt] {path.name} unexpected={len(unexpected)}", flush=True)
+    optimizer_state = ckpt.get("optimizer_state_dict")
+    if optimizer_state is not None:
+        try:
+            optimizer.load_state_dict(optimizer_state)
+        except Exception as e:
+            print(f"[ckpt] optimizer restore failed from {path.name}: {type(e).__name__}: {e}", flush=True)
+    step = int(ckpt.get("step", 0))
+    total_training_time = float(ckpt.get("train_seconds", 0.0))
+    smooth_train_loss = float(ckpt.get("smoothed_loss", 0.0))
+    bpt_ema = float(ckpt.get("bpt_ema", 0.0))
+    epoch = int(ckpt.get("epoch", 0))
+    print(
+        f"[ckpt] resumed {path} step={step} train_seconds={total_training_time:.1f}",
+        flush=True,
+    )
+    # Warn if resuming a schedule-exhausted ckpt — user is probably warm-starting.
+    # Compare against the effective module-level TIME_BUDGET (env override or
+    # the default from prepare) so the warning also fires when
+    # HYDRA_TIME_BUDGET is not set explicitly.
+    budget = float(TIME_BUDGET)
+    if budget > 0 and total_training_time >= 0.99 * budget:
+        print(
+            f"[ckpt] WARNING: resumed ckpt used {total_training_time:.0f}s of {budget:.0f}s "
+            f"budget. LR schedule is essentially exhausted. "
+            f"Set HYDRA_WARMSTART=1 to reset optimizer + scheduler and keep only weights.",
+            flush=True,
+        )
+    return step, total_training_time, smooth_train_loss, bpt_ema, epoch
+def maybe_resume_ckpt(
+    model: PostSemClawModel,
+    optimizer: torch.optim.Optimizer,
+    device: torch.device,
+) -> tuple[int, float, float, float, int]:
+    """Restore training state from ``RESUME_CKPT`` or one of its rotated backups.
+    Candidates are tried in order: the primary path, then ``<path>.1`` up to
+    ``<path>.<CKPT_ROTATIONS>``. A partial or killed torch.save can leave
+    the primary file corrupt, so a candidate that fails to load is logged
+    and the next one is tried.
+    Returns ``(step, train_seconds, smoothed_loss, bpt_ema, epoch)`` from the
+    first candidate that loads, or all zeros when resume is disabled or no
+    candidate is usable.
+    """
+    fresh_start = (0, 0.0, 0.0, 0.0, 0)
+    resume_disabled = (not RESUME_CKPT) or RESUME_CKPT.lower() == "none"
+    if resume_disabled:
+        print("[ckpt] resume disabled; starting fresh", flush=True)
+        return fresh_start
+    resume_path = Path(os.path.expanduser(RESUME_CKPT))
+    rotations = [Path(f"{resume_path}.{n}") for n in range(1, CKPT_ROTATIONS + 1)]
+    for attempt in [resume_path, *rotations]:
+        if attempt.exists():
+            try:
+                loaded = _try_load_ckpt(attempt, model, optimizer, device)
+                if loaded is not None:
+                    if attempt != resume_path:
+                        print(f"[ckpt] fell back to rotation {attempt.name}", flush=True)
+                    return loaded
+            except Exception as e:
+                print(f"[ckpt] {attempt.name} load failed: {type(e).__name__}: {e}", flush=True)
+    print(f"[ckpt] no usable checkpoint in {resume_path} + rotations; starting fresh", flush=True)
+    return fresh_start
+# ---------------------------------------------------------------------------
+# Main entry
+# ---------------------------------------------------------------------------
+def main() -> None:
+    t_start = time.time()
+    torch.manual_seed(SEED)
+    torch.cuda.manual_seed(SEED)
+    # Precision / kernel-selection knobs for peak throughput on Ampere.
+    # - high : matmul uses TF32 (Ampere's 10-bit mantissa accum) for fp32 ops
+    # - allow_tf32 : explicit for both matmul + cudnn paths
+    # - cudnn.benchmark : env-gated (HYDRA_CUDNN_BENCHMARK, default OFF).
+    #   TRUE can lock in a locally-better-but-globally-slower algorithm
+    #   after the autotune phase ends, causing tps to degrade 15-20%
+    #   over the first ~100 steps. Observed 2026-04-22 and confirmed by
+    #   differential profiling. Default is now FALSE; set =1 only if you
+    #   see a specific workload where benchmark helps sustained tps.
+    torch.set_float32_matmul_precision("high")
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cudnn.benchmark = os.environ.get("HYDRA_CUDNN_BENCHMARK", "0") == "1"
+    device = torch.device("cuda")
+    autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16)
+    # Streaming path skips prepare.py (which normally trains the tokenizer
+    # and builds the retina), so we must materialize both before model init.
     if os.environ.get("HYDRA_USE_NEMOTRON", "0") == "1":
         _p_nemo.ensure_tokenizer()
+        # Retina: HF Hub cache hit for this (vocab, n_bits, target_active) combo
+        # returns in seconds; otherwise build_retina streams Nemotron docs to
+        # compute cooccurrence + train SOM, then uploads back to the cache.
+        import subsystems.sdr_retina as _sdr_retina
+        _sdr_retina.build_retina()
+    tokenizer = Tokenizer.from_directory()
+    vocab_size = tokenizer.get_vocab_size()
+    print(f"Vocab size: {vocab_size:,}")
+    config = PostSemClawConfig(
+        sequence_len=MAX_SEQ_LEN,
+        vocab_size=vocab_size,
+        n_layer=N_LAYER,
+        d_model=D_MODEL,
+        d_state=D_STATE,
+        headdim=HEADDIM,
+        n_heads=N_HEADS,
+        expand=EXPAND,
+        engram_n_columns=ENGRAM_N_COLUMNS,
+        engram_key_dim=ENGRAM_KEY_DIM,
+        engram_layer_idx=ENGRAM_LAYER_IDX,
+    )
+    print(f"Model config: {asdict(config)}")
+    with torch.device("meta"):
+        model = PostSemClawModel(config)
+    model.to_empty(device=device)
+    model.init_weights()
+    param_counts = model.num_scaling_params()
+    print("Parameter counts:")
+    for key, value in param_counts.items():
+        print(f"  {key:24s}: {value:,}")
+    num_params = param_counts['total']
+    num_flops_per_token = model.estimate_flops()
+    print(f"Estimated FLOPs per token: {num_flops_per_token:e}")
+    tokens_per_fwdbwd = DEVICE_BATCH_SIZE * MAX_SEQ_LEN
+    assert TOTAL_BATCH_SIZE % tokens_per_fwdbwd == 0
+    grad_accum_steps = TOTAL_BATCH_SIZE // tokens_per_fwdbwd
+    optimizer = model.setup_optimizer(
+        unembedding_lr=UNEMBEDDING_LR,
+        embedding_lr=EMBEDDING_LR,
+        scalar_lr=SCALAR_LR,
+        adam_betas=ADAM_BETAS,
+        matrix_lr=MATRIX_LR,
+        weight_decay=WEIGHT_DECAY,
+    )
+    step, total_training_time, smooth_train_loss, bpt_ema, resume_epoch = maybe_resume_ckpt(
+        model, optimizer, device,
+    )
+    # Learnability #4: inform the model of the BOS token id so it can mask
+    # doc-separator positions in packed sequences. Always set (the mask only
+    # fires when HYDRA_DOC_SEP_MASK=1 is also on).
+    if hasattr(model, 'set_bos_token_id'):
+        model.set_bos_token_id(tokenizer.get_bos_token_id())
+    # Learnability #2: EMA shadow copy of weights. AveragedModel clones every
+    # parameter; we update it after every optimizer step and save it at the
+    # end alongside the raw checkpoint. Defaults OFF.
+    ema_model = None
+    if USE_EMA:
+        try:
+            from torch.optim.swa_utils import AveragedModel, get_ema_multi_avg_fn
+            # decay=EMA_DECAY; avg_fn uses get_ema_multi_avg_fn for numerical
+            # stability across bf16/fp32 mixed parameter groups.
+            ema_model = AveragedModel(
+                model,
+                multi_avg_fn=get_ema_multi_avg_fn(EMA_DECAY),
+            )
+            print(f"[EMA] enabled with decay={EMA_DECAY}")
+        except Exception as _e:
+            print(f"[EMA] disabled — AveragedModel init failed: {_e}")
+            ema_model = None
+    print("torch.compile: Muon step compiled; AdamW uses torch._fused_adamw_ (model blocks use native CUDA kernels)")
+    # Learnability #7: curriculum short-then-long. If enabled, build the
+    # initial dataloader at the short seq_len; we swap to full MAX_SEQ_LEN
+    # after CURRICULUM_SHORT_STEPS optimizer steps (see loop below).
+    _curriculum_active = CURRICULUM_SHORT_STEPS > 0 and CURRICULUM_SHORT_SEQ_LEN < MAX_SEQ_LEN
+    _current_seq_len = CURRICULUM_SHORT_SEQ_LEN if _curriculum_active else MAX_SEQ_LEN
+    if _curriculum_active:
+        print(
+            f"[CURRICULUM] starting at T={_current_seq_len} for "
+            f"{CURRICULUM_SHORT_STEPS} steps, then switching to T={MAX_SEQ_LEN}"
+        )
+    train_loader = make_dataloader(tokenizer, DEVICE_BATCH_SIZE, _current_seq_len, "train")
+    x, y, epoch = next(train_loader)  # prefetch first batch
+    if resume_epoch > 0:
+        epoch = max(epoch, resume_epoch)
+    print(f"Time budget: {TIME_BUDGET}s")
+    print(f"Gradient accumulation steps: {grad_accum_steps}")
+    # Token→byte LUT for bits-per-byte computation. evaluate_bpb in prepare.py
+    # uses total_nats / (ln(2) * total_bytes); our live metric needs to match.
+    # Without this, `bpb = loss/ln(2)` is actually bits-per-TOKEN, which at
+    # vocab=8192 scales by ~4 and makes live train bpb non-comparable with
+    # val_bpb (champion 1.279 bpb vs train printing "8.04").
+    token_bytes = get_token_bytes(device=device)
+    # -----------------------------------------------------------------------
+    # Training loop
+    # -----------------------------------------------------------------------
+    t_start_training = time.time()
+    # Async postprocessing — run SOM + Hestia on background threads so
+    # the GPU doesn't idle during their CPU-bound work.
+    _ASYNC_POSTPROCESS = os.environ.get("HYDRA_ASYNC_POSTPROCESS", "1") == "1"
+    _som_thread: threading.Thread | None = None
+    _hestia_thread: threading.Thread | None = None
+    _hestia_stream: torch.cuda.Stream | None = (
+        torch.cuda.Stream() if _ASYNC_POSTPROCESS else None
+    )
+    # HYDRA_PROFILE_STEPS=N prints a per-phase cpu/gpu time breakdown for the
+    # first N steps (and every 100th step thereafter if N<0). Zero overhead
+    # when disabled. Used to find what's eating CPU budget when GPU should
+    # be the bottleneck.
+    _profile_steps = int(os.environ.get("HYDRA_PROFILE_STEPS", "0"))
+    while True:
+        torch.cuda.synchronize()
+        t0 = time.time()
+        _prof = _profile_steps and (step < _profile_steps or (_profile_steps < 0 and step % 100 == 0))
+        _gpu_ms = 0.0
+        _data_ms = 0.0
+        for micro_step in range(grad_accum_steps):
+            if _prof:
+                torch.cuda.synchronize(); _t_micro = time.time()
+            if USE_MDLM:
+                # MDLM path: corrupt y -> x_noised, run model to get full-V logits,
+                # compute RB weighted CE on masked positions. x (original input) is
+                # unused in this path — the model only sees the noised version of y.
+                _mask_id = MDLM_MASK_ID if MDLM_MASK_ID >= 0 else (vocab_size - 1)
+                x_noised, mask_positions, loss_weights = mdlm_masked_forward_process(
+                    y, mask_token_id=_mask_id, alpha_schedule=MDLM_SCHEDULE,
+                )
+                with autocast_ctx:
+                    logits = model(x_noised)  # targets=None -> (B, T, V) logits
+                loss = mdlm_rb_loss(logits, y, mask_positions, loss_weights)
+            else:
+                with autocast_ctx:
+                    loss = model(x, y)
+            train_loss = loss.detach()
+            loss = loss / grad_accum_steps
+            loss.backward()
+            if _prof:
+                torch.cuda.synchronize()
+                _gpu_ms += (time.time() - _t_micro) * 1000
+                _t_data = time.time()
+            x, y, epoch = next(train_loader)
+            if _prof:
+                _data_ms += (time.time() - _t_data) * 1000
+        if _prof:
+            torch.cuda.synchronize(); _t_fb = time.time()
+        # Progress and schedules
+        progress = min(total_training_time / TIME_BUDGET, 1.0)
+        lrm = get_lr_multiplier(progress)
+        muon_momentum = get_muon_momentum(step)
+        muon_weight_decay = get_weight_decay(progress)
+        for group in optimizer.param_groups:
+            group["lr"] = group["initial_lr"] * lrm
+            if group['kind'] == 'muon':
+                group["momentum"] = muon_momentum
+                group["weight_decay"] = muon_weight_decay
+        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+        optimizer.step()
+        if _prof:
+            torch.cuda.synchronize(); _t_opt = time.time()
+        # Learnability #2: EMA update after every optimizer step.
+        if ema_model is not None:
+            try:
+                ema_model.update_parameters(model)
+            except Exception as _e:
+                print(f"[EMA] update failed at step {step}: {_e}", flush=True)
+        # Learnability #7: curriculum transition. After
+        # CURRICULUM_SHORT_STEPS optimizer steps, rebuild the dataloader at
+        # MAX_SEQ_LEN. Done once, then the flag flips off.
+        if _curriculum_active and step + 1 >= CURRICULUM_SHORT_STEPS:
+            print(
+                f"[CURRICULUM] step={step+1} — switching from T={_current_seq_len} "
+                f"to T={MAX_SEQ_LEN}",
+                flush=True,
+            )
+            _current_seq_len = MAX_SEQ_LEN
+            _curriculum_active = False
+            train_loader = make_dataloader(tokenizer, DEVICE_BATCH_SIZE, _current_seq_len, "train")
+            # Prefetch the next batch at the new seq_len so the following
+            # loop iteration consumes fresh data.
+            x, y, epoch = next(train_loader)
+        # Online SOM update — retina is now a plain Python attribute (not a
+        # registered buffer) so mutations do not invalidate torch.compile guards.
+        # Runs fully on CPU; safe to overlap with GPU forward pass.
+        _last_sdr = getattr(model, "_last_sdr", None)
+        if _last_sdr is not None:
+            if _ASYNC_POSTPROCESS:
+                if _som_thread is not None:
+                    _som_thread.join()
+                # Clone tensors before next step overwrites them
+                _som_x = x.clone()
+                _som_sdr = _last_sdr.clone()
+                _som_thread = threading.Thread(
+                    target=model.sdr_semantic.maybe_som_update,
+                    args=(_som_x, _som_sdr),
+                    daemon=True,
+                )
+                _som_thread.start()
+            else:
+                model.sdr_semantic.maybe_som_update(x, _last_sdr)
+        # Hestia QAT — anneal temperature every step, snap every N steps.
+        # apply_to walks all Linear modules (CPU) then does .data.copy_ (GPU).
+        # Background thread + separate CUDA stream lets this overlap with
+        # the next forward pass on the default stream.
+        _hestia_progress = (time.time() - t_start_training) / max(TIME_BUDGET, 1)
+        _hestia_interval = int(os.environ.get("HYDRA_HESTIA_INTERVAL", "100"))
+        if step % _hestia_interval == 0:
+            if _ASYNC_POSTPROCESS:
+                if _hestia_thread is not None:
+                    _hestia_thread.join()
+                def _hestia_bg(mdl: torch.nn.Module, prog: float) -> None:
+                    assert _hestia_stream is not None
+                    with torch.cuda.stream(_hestia_stream):
+                        mdl.hestia.anneal_temperature(prog)
+                        mdl.hestia.apply_to(mdl)
+                _hestia_thread = threading.Thread(
+                    target=_hestia_bg,
+                    args=(model, _hestia_progress),
+                    daemon=True,
+                )
+                _hestia_thread.start()
+            else:
+                model.hestia.anneal_temperature(_hestia_progress)
+                model.hestia.apply_to(model)
+        else:
+            # anneal_temperature is cheap (~1 us), keep inline
+            model.hestia.anneal_temperature(_hestia_progress)
+        model.zero_grad(set_to_none=True)
+        train_loss_f = train_loss.item()
+        if math.isnan(train_loss_f) or train_loss_f > 100:
+            print("FAIL")
+            # Save to a DIFFERENT file — never clobber a good latest.pt with
+            # a NaN/diverged state. The good ckpt from the last periodic save
+            # is the right place to resume from.
+            save_ckpt(
+                model,
+                optimizer,
+                config,
+                step,
+                total_training_time,
+                smooth_train_loss,
+                bpt_ema,
+                epoch,
+                FAILED_CKPT,
+                blocking=True,
+            )
+            raise SystemExit(1)
+        torch.cuda.synchronize()
+        t1 = time.time()
+        dt = t1 - t0
+        if _prof:
+            fb = (_t_fb - t0) * 1000
+            opt = (_t_opt - _t_fb) * 1000
+            rest = (t1 - _t_opt) * 1000
+            print(
+                f"[PROF step={step:05d}] gpu={_gpu_ms:.0f}ms data_fetch={_data_ms:.0f}ms "
+                f"(sum_fb={fb:.0f}) opt={opt:.0f}ms rest={rest:.0f}ms total={dt*1000:.0f}ms",
+                flush=True,
+            )
+        if step > 10:
+            total_training_time += dt
+        ema_beta = 0.9
+        smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss_f
+        debiased_smooth_loss = smooth_train_loss / (1 - ema_beta ** (step + 1))
+        pct_done = 100 * progress
+        tok_per_sec = int(TOTAL_BATCH_SIZE / dt)
+        mfu = 100 * num_flops_per_token * TOTAL_BATCH_SIZE / dt / GPU_BF16_PEAK_FLOPS
+        remaining = max(0, TIME_BUDGET - total_training_time)
+        # Bytes-per-token for the CURRENT batch. evaluate_bpb in prepare.py
+        # computes bits-per-BYTE (total_nats / (ln2 * total_bytes)); to match
+        # that semantics live, we EMA-smooth the per-batch bytes/token and
+        # divide. Without this, the old `bpb = loss/ln2` was actually
+        # bits-per-token — ~4× larger than val_bpb at vocab=8192 and
+        # therefore not comparable to the champion 1.279 bpb metric.
+        with torch.no_grad():
+            y_flat = y.view(-1)
+            nbytes_batch = token_bytes[y_flat]
+            mask = nbytes_batch > 0
+            mask_count = mask.sum().clamp(min=1).float()
+            avg_bytes_per_tok = (nbytes_batch.float() * mask.float()).sum() / mask_count
+            bpt_batch = float(avg_bytes_per_tok.item())
+        if step == 0 or bpt_ema <= 0.0:
+            bpt_ema = bpt_batch
+        else:
+            bpt_ema = 0.98 * bpt_ema + 0.02 * bpt_batch
+        # Dual metric: bpb (byte-normalized, comparable with val_bpb) AND
+        # bpt (bits per token, the raw loss in bits). bpt_div exposes the
+        # current avg bytes-per-token so the conversion is transparent.
+        bpt = debiased_smooth_loss / math.log(2)
+        bpb = bpt / max(bpt_ema, 1e-6)
+        vram_mib = torch.cuda.memory_allocated() / 1024 / 1024
+        current_lr = optimizer.param_groups[0]["lr"]
+        # Per-step line-buffered log. NOT \r-overwritten so tee/grep see it.
+        # Keep key=value pairs grep-friendly.
+        ppl = 2.0 ** bpb  # perplexity (byte-level)
+        print(
+            f"step={step:05d} loss={debiased_smooth_loss:.4f} bpb={bpb:.4f} ppl={ppl:.3f} "
+            f"bpt={bpt:.3f} bpt_div={bpt_ema:.2f} "
+            f"tps={tok_per_sec} dt_ms={dt*1000:.0f} mfu={mfu:.1f} "
+            f"lr={current_lr:.2e} vram={vram_mib:.0f}MiB "
+            f"pct={pct_done:.1f} epoch={epoch} remaining={remaining:.0f}s",
+            flush=True,
+        )
+        if step == 0:
+            gc.collect()
+            gc.freeze()
+            gc.disable()
+        # No periodic gc.collect() — we disabled+froze at step 0 on purpose,
+        # so a manual collect every 5k steps just re-scans frozen objects
+        # (burned ~900 ms/event in production) for no live-garbage reason.
+        if CKPT_INTERVAL > 0 and step > 0 and step % CKPT_INTERVAL == 0:
+            save_ckpt(
+                model,
+                optimizer,
+                config,
+                step,
+                total_training_time,
+                smooth_train_loss,
+                bpt_ema,
+                epoch,
+                LATEST_CKPT,
+            )
+        # Periodic mid-training validation so we can see the model learning
+        # English in real time (not just at the end). Small val batch so it
+        # doesn't eat significant training time.
+        mid_val_interval = int(os.environ.get("HYDRA_MID_VAL_INTERVAL", "500"))
+        if mid_val_interval > 0 and step > 0 and step % mid_val_interval == 0:
+            model.eval()
+            try:
+                # Defrag GPU memory before eval allocates fresh chunks —
+                # without this the eval path can OOM on 6GB cards even
+                # though total usage fits, because the allocator's free
+                # blocks are fragmented.
+                torch.cuda.empty_cache()
+                _orig_mid = _prepare_mod.EVAL_TOKENS
+                _prepare_mod.EVAL_TOKENS = 262144  # ~260K tokens, fast
+                with torch.no_grad():
+                    with autocast_ctx:
+                        mid_bpb = evaluate_bpb(model, tokenizer, DEVICE_BATCH_SIZE)
+                _prepare_mod.EVAL_TOKENS = _orig_mid
+                mid_ppl = 2.0 ** mid_bpb
+                print(f"[MID_VAL] step={step} val_bpb={mid_bpb:.4f} val_ppl={mid_ppl:.3f}", flush=True)
+                # Per-layer diagnostic panel. Only printed when HYDRA_LAYER_DIAGNOSTICS=1
+                # is set (otherwise the layer_* keys are absent from _metrics).
+                _diag_metrics = model.get_secondary_metrics()
+                _layer_keys = sorted([k for k in _diag_metrics.keys() if k.startswith('layer_')])
+                if _layer_keys:
+                    # Condense: one row per layer showing the four core signals.
+                    n_layers = len(model.blocks)
+                    print(f"[LAYER_DIAG] step={step}", flush=True)
+                    for li in range(n_layers):
+                        d_ratio = _diag_metrics.get(f'layer_{li}_delta_ratio', float('nan'))
+                        out_n   = _diag_metrics.get(f'layer_{li}_out_norm',    float('nan'))
+                        g_norm  = _diag_metrics.get(f'layer_{li}_grad_norm',   float('nan'))
+                        eff_r   = _diag_metrics.get(f'layer_{li}_eff_rank',    float('nan'))
+                        f_std   = _diag_metrics.get(f'layer_{li}_feat_std',    float('nan'))
+                        print(
+                            f"[LAYER_DIAG]   L{li:02d}  delta_ratio={d_ratio:.4f}  "
+                            f"out_norm={out_n:.4f}  grad_norm={g_norm:.3e}  "
+                            f"eff_rank={eff_r:.1f}  feat_std={f_std:.4f}",
+                            flush=True,
+                        )
+                    htm_proj_g = _diag_metrics.get('htm_proj_grad_norm', None)
+                    if htm_proj_g is not None:
+                        print(f"[LAYER_DIAG]   htm_proj grad_norm={htm_proj_g:.3e}", flush=True)
+            except Exception as e:
+                print(f"[MID_VAL] failed: {e}", flush=True)
+            model.train()
+        step += 1
+        if step > 10 and total_training_time >= TIME_BUDGET:
+            break
+    # Drain async postprocessing threads before eval
+    if _som_thread is not None:
+        _som_thread.join()
+    if _hestia_thread is not None:
+        _hestia_thread.join()
+    if _hestia_stream is not None:
+        _hestia_stream.synchronize()
+    total_tokens = step * TOTAL_BATCH_SIZE
+    # ----------------------------------------------------------------------
+    # SAVE ORDER (critical):
+    #   1. Save PRETRAIN_FINAL_CKPT with val_bpb=None  (hedge against eval OOM)
+    #   2. Save LATEST_CKPT with val_bpb=None          (hedge against eval OOM)
+    #   3. Run eval (may OOM on small GPUs; we survive it)
+    #   4. Re-save both ckpts with val_bpb filled in
+    # This way we NEVER lose the final trained weights to an eval crash.
+    # Previous ordering put eval first, so an eval-time OOM destroyed the
+    # only record of a 6h training run (2026-04-22 incident).
+    # ----------------------------------------------------------------------
+    save_ckpt(
+        model, optimizer, config, step, total_training_time,
+        smooth_train_loss, bpt_ema, epoch, PRETRAIN_FINAL_CKPT,
+        val_bpb=None, blocking=True,
+    )
+    save_ckpt(
+        model, optimizer, config, step, total_training_time,
+        smooth_train_loss, bpt_ema, epoch, LATEST_CKPT,
+        val_bpb=None, blocking=True,
+    )
+    # Now it's safe to eval — ckpts are on disk regardless of what happens here.
+    # HYDRA_EVAL_BATCH overrides DEVICE_BATCH_SIZE (env-tunable; default halves
+    # the training batch because eval holds activations for full sequence and
+    # does not benefit from overlap with backward). HYDRA_EVAL_TOKENS controls
+    # how many val tokens to sweep (default 2 M, short enough for autoresearch
+    # 5-min budgets).
+    val_bpb: float | None = None
+    # Eval batch: default to 4 on cloud GPUs (enough freed VRAM after optimizer
+    # clear), fall back to DEVICE_BATCH_SIZE//2 on tiny cards. Env-overridable.
+    _eval_B = int(os.environ.get("HYDRA_EVAL_BATCH",
+        str(max(1, DEVICE_BATCH_SIZE // 2) if DEVICE_BATCH_SIZE <= 8 else 4)))
+    # Eval tokens: default 1M (1,048,576) — gives statistically meaningful BPB
+    # (256 forward passes at B=4, seq=1024). Env-overridable for fast/slow sweeps.
+    _eval_tokens = int(os.environ.get("HYDRA_EVAL_TOKENS", str(1048576)))
+    try:
+        # Aggressive VRAM reclaim for 6GB cards. Peak training VRAM = 5.1GB
+        # which leaves < 1GB for the eval forward — the driver can't satisfy
+        # the allocation. Free EVERY tensor we don't strictly need:
+        #   - optimizer grads (set_to_none releases tensor)
+        #   - optimizer.state (fp32 Muon NS workspace, AdamW moments — ~size-of-params each)
+        #   - model internal caches (HTM subsample cache, SDR stash)
+        # After this, VRAM should be ~params only (bf16 ≈ 120MB at 60M params).
+        optimizer.zero_grad(set_to_none=True)
+        if hasattr(optimizer, 'state') and optimizer.state:
+            for p, st in list(optimizer.state.items()):
+                st.clear()
+            optimizer.state.clear()
+        for p in model.parameters():
+            if p.grad is not None:
+                p.grad = None
+        if hasattr(model, '_htm_cache'):
+            model._htm_cache = None
+        if hasattr(model, '_last_sdr'):
+            model._last_sdr = None
+        import gc as _gc
+        _gc.collect()
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+        try:
+            _free_mb = torch.cuda.mem_get_info()[0] / 1024 / 1024
+            print(f"[VAL] free_vram_mb={_free_mb:.0f} (cleared optimizer state)", flush=True)
+        except Exception:
+            pass
+        print(f"[VAL] running eval on {_eval_tokens} tokens at B={_eval_B}...", flush=True)
+        model.eval()
+        _orig = _prepare_mod.EVAL_TOKENS
+        _prepare_mod.EVAL_TOKENS = _eval_tokens
+        # Nemotron path reads HYDRA_STREAM_EVAL_TOKENS env var directly,
+        # not _prepare_mod.EVAL_TOKENS. Sync both so eval budget is
+        # respected regardless of which dataloader path is active.
+        _orig_stream = os.environ.get("HYDRA_STREAM_EVAL_TOKENS")
+        os.environ["HYDRA_STREAM_EVAL_TOKENS"] = str(_eval_tokens)
+        with autocast_ctx:
+            val_bpb = evaluate_bpb(model, tokenizer, _eval_B)
+        _prepare_mod.EVAL_TOKENS = _orig
+        if _orig_stream is not None:
+            os.environ["HYDRA_STREAM_EVAL_TOKENS"] = _orig_stream
+        else:
+            os.environ.pop("HYDRA_STREAM_EVAL_TOKENS", None)
+        val_ppl = 2 ** val_bpb
+        print(f"[VAL] step={step} val_bpb={val_bpb:.4f} val_ppl={val_ppl:.3f}", flush=True)
+    except torch.cuda.OutOfMemoryError as e:
+        print(f"[VAL] SKIPPED (OOM): {e}", flush=True)
+        torch.cuda.empty_cache()
+    except Exception as e:
+        import traceback as _tb
+        print(f"[VAL] SKIPPED ({type(e).__name__}): {e}", flush=True)
+        _tb.print_exc()
+        try:
+            _free = torch.cuda.mem_get_info()[0] / 1024 / 1024
+            print(f"[VAL] post-crash free_vram_mb={_free:.0f}", flush=True)
+        except Exception:
+            pass
+    # Final ckpts with val_bpb filled in (if eval succeeded).
+    save_ckpt(
+        model, optimizer, config, step, total_training_time,
+        smooth_train_loss, bpt_ema, epoch, LATEST_CKPT,
+        val_bpb=val_bpb, blocking=True,
+    )
+    save_ckpt(
+        model, optimizer, config, step, total_training_time,
+        smooth_train_loss, bpt_ema, epoch, PRETRAIN_FINAL_CKPT,
+        val_bpb=val_bpb, blocking=True,
+    )
+    # Learnability #2: persist EMA weights alongside the raw checkpoint.
+    # latest_ema.pt contains ema_model.module (the Averaged params) so it
+    # can be loaded by evaluation / inference code that expects the same
+    # state_dict shape as the raw model.
+    if ema_model is not None:
+        try:
+            ema_ckpt_path = CACHE_DIR / "latest_ema.pt"
+            CACHE_DIR.mkdir(parents=True, exist_ok=True)
+            torch.save({
+                "model_state_dict": ema_model.module.state_dict(),
+                "config": asdict(config),
+                "step": step,
+                "epoch": epoch,
+                "train_seconds": total_training_time,
+                "val_bpb": val_bpb,
+                "ema_decay": EMA_DECAY,
+            }, str(ema_ckpt_path))
+            print(f"[EMA] saved {ema_ckpt_path} (step={step})", flush=True)
+        except Exception as _e:
+            print(f"[EMA] save failed: {_e}", flush=True)
+    run_factual_probes(model, tokenizer, device, autocast_ctx)
+    t_end = time.time()
+    startup_time = t_start_training - t_start
+    steady_state_mfu = (
+        100 * num_flops_per_token * TOTAL_BATCH_SIZE * (step - 10)
+        / total_training_time / GPU_BF16_PEAK_FLOPS
+        if total_training_time > 0 else 0
+    )
+    peak_vram_mb = torch.cuda.max_memory_allocated() / 1024 / 1024
+    metrics = model.get_secondary_metrics()
+    print("---")
+    print(f"val_bpb:          {val_bpb:.6f}" if val_bpb is not None else "val_bpb:          SKIPPED")
+    print(f"training_seconds: {total_training_time:.1f}")
+    print(f"total_seconds:    {t_end - t_start:.1f}")
+    print(f"peak_vram_mb:     {peak_vram_mb:.1f}")
+    print(f"mfu_percent:      {steady_state_mfu:.2f}")
+    print(f"total_tokens_M:   {total_tokens / 1e6:.1f}")
+    print(f"num_steps:        {step}")
+    print(f"num_params_M:     {num_params / 1e6:.1f}")
+    print(f"n_layer:          {N_LAYER}")
+    print(f"d_model:          {D_MODEL}")
+    print(f"engram_hit_rate:   {metrics.get('engram_hit_rate', 0.0):.4f}")
+    print(f"sdr_active_bits:  {metrics.get('sdr_active_bits', 0):.1f}")
+    print(f"htm_anomaly:      {metrics.get('htm_anomaly', 0):.4f}")
+    # Per-layer summary panel — only printed when diagnostics were active.
+    _layer_keys = sorted([k for k in metrics.keys() if k.startswith('layer_')])
+    if _layer_keys:
+        n_layers = len(model.blocks)
+        print("--- per-layer diagnostic panel ---")
+        for li in range(n_layers):
+            d_ratio = metrics.get(f'layer_{li}_delta_ratio', float('nan'))
+            out_n   = metrics.get(f'layer_{li}_out_norm',    float('nan'))
+            g_norm  = metrics.get(f'layer_{li}_grad_norm',   float('nan'))
+            eff_r   = metrics.get(f'layer_{li}_eff_rank',    float('nan'))
+            f_std   = metrics.get(f'layer_{li}_feat_std',    float('nan'))
+            print(
+                f"L{li:02d}  delta_ratio={d_ratio:.4f}  out_norm={out_n:.4f}  "
+                f"grad_norm={g_norm:.3e}  eff_rank={eff_r:.1f}  feat_std={f_std:.4f}"
+            )
+    # Emit full metrics dictionary as JSON for sweep aggregation. Path from
+    # HYDRA_METRICS_OUT env var; default=/tmp/hydra_run_metrics.json. Always
+    # written (even without diagnostics) so the aggregator can compare runs.
+    _metrics_out = os.environ.get("HYDRA_METRICS_OUT", "/tmp/hydra_run_metrics.json")
+    try:
+        _dump = dict(metrics)
+        _dump.update({
+            'val_bpb': (float(val_bpb) if val_bpb is not None else None),
+            'val_ppl': (float(val_ppl) if val_ppl is not None else None),
+            'n_layer': int(N_LAYER),
+            'd_model': int(D_MODEL),
+            'num_params_M': float(num_params / 1e6),
+            'num_steps': int(step),
+            'total_tokens_M': float(total_tokens / 1e6),
+            'peak_vram_mb': float(peak_vram_mb),
+            'training_seconds': float(total_training_time),
+            'sdr_target_active': int(os.environ.get("HYDRA_SDR_TARGET_ACTIVE", "327")),
+        })
+        Path(_metrics_out).parent.mkdir(parents=True, exist_ok=True)
+        with open(_metrics_out, 'w') as _f:
+            json.dump(_dump, _f, indent=2, sort_keys=True)
+        print(f"[METRICS] wrote {_metrics_out}", flush=True)
+        # Also emit a single-line JSON to stdout so the sweep aggregator can
+        # scrape it from HF Jobs logs without pulling files out of the container.
+        print("[METRICS_JSON] " + json.dumps(_dump, sort_keys=True), flush=True)
+    except Exception as _e:
+        print(f"[METRICS] write failed: {_e}", flush=True)
+    run_factual_english(model, tokenizer, MAX_SEQ_LEN)
+    # startup_time is informative but not printed (preserve historical output)
+    _ = startup_time

overlay/kernels/cuda/decode_kernels.cu CHANGED Viewed

@@ -1,10 +1,10 @@
-/*
- * CuTe DSL decode kernels for Mamba-3 autoregressive generation.
- *
- * Phase 2: Optimized single-token SSM step for inference.
- * Phase 1: Not needed (training only, no generation).
- *
- * Fuses: input_proj + conv_step + ssm_step + output_proj
- * into a single kernel launch for minimal latency.
- */
-// Stub: Phase 2 implementation

+/*
+ * CuTe DSL decode kernels for Mamba-3 autoregressive generation.
+ *
+ * Phase 2: Optimized single-token SSM step for inference.
+ * Phase 1: Not needed (training only, no generation).
+ *
+ * Fuses: input_proj + conv_step + ssm_step + output_proj
+ * into a single kernel launch for minimal latency.
+ */
+// Stub: Phase 2 implementation

overlay/kernels/cuda/flashfftconv/LICENSE CHANGED Viewed

@@ -1,201 +1,201 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-   1. Definitions.
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-   END OF TERMS AND CONDITIONS
-   APPENDIX: How to apply the Apache License to your work.
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-   Copyright [yyyy] [name of copyright owner]
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-       http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

overlay/kernels/cuda/flashfftconv/README.md CHANGED Viewed

@@ -1,57 +1,57 @@
-# flashfftconv (vendored)
-Vendored from https://github.com/HazyResearch/flash-fft-conv (Apache 2.0 license).
-**Upstream commit:** see `UPSTREAM_COMMIT`.
-## What this is
-HazyResearch's Monarch-matrix-decomposition FFT convolution CUDA kernel. Provides a
-drop-in replacement for `torch.fft.rfft + complex-mult + irfft` that runs ~2-3x
-faster than cuFFT for the specific power-of-two lengths it supports (256, 512,
-1024, 2048, 4096, 8192, ..., up to 4M).
-In HYDRA, we use it to accelerate `subsystems/hyena_pure.fftconv_ref`. The
-accelerated path is opt-in via `HYDRA_HYENA_FLASH_FFT=1`; default behavior is
-unchanged (pure PyTorch fallback).
-## How to build
-The vendored tree contains:
-- `flashfftconv/` — pure-Python wrappers (imports `monarch_cuda` CUDA extension)
-- `csrc/` — CUDA source files and setup.py for the native extension
-Build instructions:
-```bash
-cd /home/mikeb/work/feather/kernels/cuda/flashfftconv/csrc
-# Edit `csrc/setup.py` first: change the cc_flag line to match your GPU arch
-# (RTX 3060 = 8.6, A100 = 8.0, H100 = 9.0). Example for RTX 3060:
-#   cc_flag = ['--generate-code=arch=compute_86,code=compute_86']
-# Build with the local CUDA toolchain (must match your torch.version.cuda):
-CUDA_HOME=/usr/local/cuda-12.1 .venv/bin/pip install -e .
-```
-Then install the Python wrappers:
-```bash
-cd /home/mikeb/work/feather/kernels/cuda/flashfftconv
-.venv/bin/pip install -e .
-```
-## Runtime usage
-Once installed, set `HYDRA_HYENA_FLASH_FFT=1` and training will use it.
-`subsystems/hyena_pure.fftconv_ref` auto-detects via `try: import flashfftconv`
-and falls back to pure PyTorch on import failure.
-## Known caveats
-- Seqlen must be a power of 2 AND in the supported set: {256, 512, 1024, 2048,
-  4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304}.
-  For HYDRA, `fft_size = 2 * seq_len` → seq_len in {128, 256, 512, 1024, 2048, ...}.
-- dtype must be fp16 or bf16 (fp32 not supported).
-- GPU arch must be compiled into the extension (see setup.py cc_flag).
-- CUDA toolchain major.minor should match `torch.version.cuda` major (12.x ↔ 12.x).

+# flashfftconv (vendored)
+Vendored from https://github.com/HazyResearch/flash-fft-conv (Apache 2.0 license).
+**Upstream commit:** see `UPSTREAM_COMMIT`.
+## What this is
+HazyResearch's Monarch-matrix-decomposition FFT convolution CUDA kernel. Provides a
+drop-in replacement for `torch.fft.rfft + complex-mult + irfft` that runs ~2-3x
+faster than cuFFT for the specific power-of-two lengths it supports (256, 512,
+1024, 2048, 4096, 8192, ..., up to 4M).
+In HYDRA, we use it to accelerate `subsystems/hyena_pure.fftconv_ref`. The
+accelerated path is opt-in via `HYDRA_HYENA_FLASH_FFT=1`; default behavior is
+unchanged (pure PyTorch fallback).
+## How to build
+The vendored tree contains:
+- `flashfftconv/` — pure-Python wrappers (imports `monarch_cuda` CUDA extension)
+- `csrc/` — CUDA source files and setup.py for the native extension
+Build instructions:
+```bash
+cd /home/mikeb/work/feather/kernels/cuda/flashfftconv/csrc
+# Edit `csrc/setup.py` first: change the cc_flag line to match your GPU arch
+# (RTX 3060 = 8.6, A100 = 8.0, H100 = 9.0). Example for RTX 3060:
+#   cc_flag = ['--generate-code=arch=compute_86,code=compute_86']
+# Build with the local CUDA toolchain (must match your torch.version.cuda):
+CUDA_HOME=/usr/local/cuda-12.1 .venv/bin/pip install -e .
+```
+Then install the Python wrappers:
+```bash
+cd /home/mikeb/work/feather/kernels/cuda/flashfftconv
+.venv/bin/pip install -e .
+```
+## Runtime usage
+Once installed, set `HYDRA_HYENA_FLASH_FFT=1` and training will use it.
+`subsystems/hyena_pure.fftconv_ref` auto-detects via `try: import flashfftconv`
+and falls back to pure PyTorch on import failure.
+## Known caveats
+- The FFT length (`fft_size`) must be a power of 2 AND in the supported set: {256, 512, 1024, 2048,
+  4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304}.
+  For HYDRA, `fft_size = 2 * seq_len` → seq_len in {128, 256, 512, 1024, 2048, ...}.
+- dtype must be fp16 or bf16 (fp32 not supported).
+- GPU arch must be compiled into the extension (see setup.py cc_flag).
+- CUDA toolchain major version should match the major version of `torch.version.cuda` (12.x ↔ 12.x).

overlay/kernels/cuda/flashfftconv/UPSTREAM_COMMIT CHANGED Viewed

	@@ -1 +1 @@
1	- b8771028717f46d5b22cbb8e12833f35033d621b


1	+ b8771028717f46d5b22cbb8e12833f35033d621b

overlay/kernels/cuda/flashfftconv/csrc/.gitignore CHANGED Viewed

@@ -1,10 +1,10 @@
-*.npy
-*.json
-*.png
-*/*.npy
-*/*.json
-*/*.png
-*.DS_Store
 */*.DS_Store

+*.npy
+*.json
+*.png
+*/*.npy
+*/*.json
+*/*.png
+*.DS_Store
 */*.DS_Store

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly.h CHANGED Viewed

@@ -1,374 +1,374 @@
-// Copyright (c) 2023 Dan Fu, Hermann Kumbong
-#include <torch/extension.h>
-#include <vector>
-#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
-#define CHECK_IS_HALF_OR_BFLOAT(x) TORCH_CHECK(x.dtype() == torch::kFloat16 || x.dtype() == torch::kBFloat16, #x " must be float16 or bfloat16")
-#define CHECK_INPUT(x) \
-    CHECK_CUDA(x);     \
-    CHECK_CONTIGUOUS(x); \
-    CHECK_IS_HALF_OR_BFLOAT(x)
-#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
-std::vector<torch::Tensor> butterfly_cuda(
-    torch::Tensor x,
-    torch::Tensor d_f_T,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    std::optional<at::Tensor> x_gate = std::nullopt
-);
-std::vector<torch::Tensor> butterfly_bf16_cuda(
-    torch::Tensor x,
-    torch::Tensor d_f_T_real,
-    torch::Tensor d_f_T_imag,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    std::optional<at::Tensor> out_gate = std::nullopt
-);
-std::vector<torch::Tensor> butterfly_padded_cuda(
-    torch::Tensor x,
-    torch::Tensor d_f_T,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    int M,
-    std::optional<at::Tensor> x_gate = std::nullopt
-);
-std::vector<torch::Tensor> butterfly_padded_bf16_cuda(
-    torch::Tensor x,
-    torch::Tensor d_f_T_real,
-    torch::Tensor d_f_T_imag,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    int M,
-    std::optional<at::Tensor> x_gate = std::nullopt
-);
-torch::Tensor butterfly_ifft_cuda(
-    torch::Tensor x_real,
-    torch::Tensor x_imag,
-    torch::Tensor d_f_T,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    std::optional<at::Tensor> out_gate = std::nullopt
-);
-torch::Tensor butterfly_ifft_bf16_cuda(
-    torch::Tensor x_real,
-    torch::Tensor x_imag,
-    torch::Tensor d_f_real,
-    torch::Tensor d_f_imag,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    std::optional<at::Tensor> x_gate = std::nullopt
-);
-torch::Tensor butterfly_ifft_padded_cuda(
-    torch::Tensor x_real,
-    torch::Tensor x_imag,
-    torch::Tensor d_f,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    int N,
-    std::optional<at::Tensor> out_gate = std::nullopt
-);
-torch::Tensor butterfly_ifft_padded_bf16_cuda(
-    torch::Tensor x_real,
-    torch::Tensor x_imag,
-    torch::Tensor d_f_real,
-    torch::Tensor d_f_imag,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    int N,
-    std::optional<at::Tensor> out_gate = std::nullopt
-);
-std::vector<torch::Tensor> butterfly(
-    torch::Tensor x,
-    torch::Tensor d_f_T,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag
-){
-    CHECK_INPUT(x);
-    CHECK_INPUT(twiddle_factors_real);
-    CHECK_INPUT(twiddle_factors_imag);
-    return butterfly_cuda(x, d_f_T, twiddle_factors_real, twiddle_factors_imag);
-}
-std::vector<torch::Tensor> butterfly_gated(
-    torch::Tensor x,
-    torch::Tensor d_f_T,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    torch::Tensor x_gate
-){
-    CHECK_INPUT(x);
-    CHECK_INPUT(twiddle_factors_real);
-    CHECK_INPUT(twiddle_factors_imag);
-    CHECK_INPUT(x_gate);
-    return butterfly_cuda(x, d_f_T, twiddle_factors_real, twiddle_factors_imag, x_gate);
-}
-std::vector<torch::Tensor> butterfly_bf16(
-    torch::Tensor x,
-    torch::Tensor d_f_T_real,
-    torch::Tensor d_f_T_imag,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag
-){
-    CHECK_INPUT(x);
-    CHECK_INPUT(twiddle_factors_real);
-    CHECK_INPUT(twiddle_factors_imag);
-    CHECK_INPUT(d_f_T_real);
-    CHECK_INPUT(d_f_T_imag);
-    return butterfly_bf16_cuda(x, d_f_T_real, d_f_T_imag, twiddle_factors_real, twiddle_factors_imag);
-}
-std::vector<torch::Tensor> butterfly_gated_bf16(
-    torch::Tensor x,
-    torch::Tensor d_f_T_real,
-    torch::Tensor d_f_T_imag,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    torch::Tensor x_gate
-){
-    CHECK_INPUT(x);
-    CHECK_INPUT(twiddle_factors_real);
-    CHECK_INPUT(twiddle_factors_imag);
-    CHECK_INPUT(d_f_T_real);
-    CHECK_INPUT(d_f_T_imag);
-    CHECK_INPUT(x_gate);
-    return butterfly_bf16_cuda(x, d_f_T_real, d_f_T_imag, twiddle_factors_real, twiddle_factors_imag, x_gate);
-}
-torch::Tensor butterfly_ifft(
-    torch::Tensor x_real,
-    torch::Tensor x_imag,
-    torch::Tensor d_f_T,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag
-){
-    CHECK_INPUT(x_real);
-    CHECK_INPUT(x_imag);
-    CHECK_INPUT(twiddle_factors_real);
-    CHECK_INPUT(twiddle_factors_imag);
-    return butterfly_ifft_cuda(x_real, x_imag, d_f_T, twiddle_factors_real, twiddle_factors_imag);
-}
-torch::Tensor butterfly_ifft_gated(
-    torch::Tensor x_real,
-    torch::Tensor x_imag,
-    torch::Tensor d_f_T,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    torch::Tensor out_gate
-){
-    CHECK_INPUT(x_real);
-    CHECK_INPUT(x_imag);
-    CHECK_INPUT(twiddle_factors_real);
-    CHECK_INPUT(twiddle_factors_imag);
-    CHECK_INPUT(out_gate);
-    return butterfly_ifft_cuda(x_real, x_imag, d_f_T, twiddle_factors_real, twiddle_factors_imag, out_gate);
-}
-torch::Tensor butterfly_ifft_bf16(
-    torch::Tensor x_real,
-    torch::Tensor x_imag,
-    torch::Tensor d_f_real,
-    torch::Tensor d_f_imag,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag
-){
-    CHECK_INPUT(x_real);
-    CHECK_INPUT(x_imag);
-    CHECK_INPUT(d_f_real);
-    CHECK_INPUT(d_f_imag);
-    CHECK_INPUT(twiddle_factors_real);
-    CHECK_INPUT(twiddle_factors_imag);
-    return butterfly_ifft_bf16_cuda(x_real, x_imag, d_f_real, d_f_imag, twiddle_factors_real, twiddle_factors_imag);
-}
-torch::Tensor butterfly_ifft_gated_bf16(
-    torch::Tensor x_real,
-    torch::Tensor x_imag,
-    torch::Tensor d_f_real,
-    torch::Tensor d_f_imag,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    torch::Tensor out_gate
-){
-    CHECK_INPUT(x_real);
-    CHECK_INPUT(x_imag);
-    CHECK_INPUT(d_f_real);
-    CHECK_INPUT(d_f_imag);
-    CHECK_INPUT(twiddle_factors_real);
-    CHECK_INPUT(twiddle_factors_imag);
-    CHECK_INPUT(out_gate);
-    return butterfly_ifft_bf16_cuda(x_real, x_imag, d_f_real, d_f_imag, twiddle_factors_real, twiddle_factors_imag, out_gate);
-}
-std::vector<torch::Tensor> butterfly_padded(
-    torch::Tensor x,
-    torch::Tensor d_f_T,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    int M
-){
-    CHECK_INPUT(x);
-    CHECK_INPUT(twiddle_factors_real);
-    CHECK_INPUT(twiddle_factors_imag);
-    return butterfly_padded_cuda(x, d_f_T, twiddle_factors_real, twiddle_factors_imag, M);
-}
-std::vector<torch::Tensor> butterfly_padded_bf16(
-    torch::Tensor x,
-    torch::Tensor d_f_T_real,
-    torch::Tensor d_f_T_imag,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    int M
-){
-    CHECK_INPUT(x);
-    CHECK_INPUT(twiddle_factors_real);
-    CHECK_INPUT(twiddle_factors_imag);
-    return butterfly_padded_bf16_cuda(x, d_f_T_real, d_f_T_imag, twiddle_factors_real, twiddle_factors_imag, M);
-}
-std::vector<torch::Tensor> butterfly_padded_gated(
-    torch::Tensor x,
-    torch::Tensor d_f_T,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    int M,
-    torch::Tensor x_gate
-){
-    CHECK_INPUT(x);
-    CHECK_INPUT(twiddle_factors_real);
-    CHECK_INPUT(twiddle_factors_imag);
-    return butterfly_padded_cuda(x, d_f_T, twiddle_factors_real, twiddle_factors_imag, M, x_gate);
-}
-std::vector<torch::Tensor> butterfly_padded_gated_bf16(
-    torch::Tensor x,
-    torch::Tensor d_f_T_real,
-    torch::Tensor d_f_T_imag,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    int M,
-    torch::Tensor x_gate
-){
-    CHECK_INPUT(x);
-    CHECK_INPUT(d_f_T_real);
-    CHECK_INPUT(d_f_T_imag);
-    CHECK_INPUT(twiddle_factors_real);
-    CHECK_INPUT(twiddle_factors_imag);
-    return butterfly_padded_bf16_cuda(x, d_f_T_real, d_f_T_imag, twiddle_factors_real, twiddle_factors_imag, M, x_gate);
-}
-torch::Tensor butterfly_ifft_padded(
-    torch::Tensor x_real,
-    torch::Tensor x_imag,
-    torch::Tensor d_f,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    int N
-){
-    CHECK_INPUT(x_real);
-    CHECK_INPUT(x_imag);
-    CHECK_INPUT(twiddle_factors_real);
-    CHECK_INPUT(twiddle_factors_imag);
-    return butterfly_ifft_padded_cuda(x_real, x_imag, d_f, twiddle_factors_real, twiddle_factors_imag, N);
-}
-torch::Tensor butterfly_ifft_padded_gated(
-    torch::Tensor x_real,
-    torch::Tensor x_imag,
-    torch::Tensor d_f,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    int N,
-    torch::Tensor out_gate
-){
-    CHECK_INPUT(x_real);
-    CHECK_INPUT(x_imag);
-    CHECK_INPUT(twiddle_factors_real);
-    CHECK_INPUT(twiddle_factors_imag);
-    return butterfly_ifft_padded_cuda(x_real, x_imag, d_f, twiddle_factors_real, twiddle_factors_imag, N, out_gate);
-}
-torch::Tensor butterfly_ifft_padded_bf16(
-    torch::Tensor x_real,
-    torch::Tensor x_imag,
-    torch::Tensor d_f_real,
-    torch::Tensor d_f_imag,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    int N
-){
-    CHECK_INPUT(x_real);
-    CHECK_INPUT(x_imag);
-    CHECK_INPUT(d_f_real);
-    CHECK_INPUT(d_f_imag);
-    CHECK_INPUT(twiddle_factors_real);
-    CHECK_INPUT(twiddle_factors_imag);
-    return butterfly_ifft_padded_bf16_cuda(x_real, x_imag, d_f_real, d_f_imag, twiddle_factors_real, twiddle_factors_imag, N);
-}
-torch::Tensor butterfly_ifft_padded_gated_bf16(
-    torch::Tensor x_real,
-    torch::Tensor x_imag,
-    torch::Tensor d_f_real,
-    torch::Tensor d_f_imag,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    int N,
-    torch::Tensor out_gate
-){
-    CHECK_INPUT(x_real);
-    CHECK_INPUT(x_imag);
-    CHECK_INPUT(d_f_real);
-    CHECK_INPUT(d_f_imag);
-    CHECK_INPUT(twiddle_factors_real);
-    CHECK_INPUT(twiddle_factors_imag);
-    return butterfly_ifft_padded_bf16_cuda(x_real, x_imag, d_f_real, d_f_imag, twiddle_factors_real, twiddle_factors_imag, N, out_gate);
 }

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")  // fails unless x.device() reports a CUDA device
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")  // fails unless x.is_contiguous() is true
+#define CHECK_IS_HALF_OR_BFLOAT(x) TORCH_CHECK(x.dtype() == torch::kFloat16 || x.dtype() == torch::kBFloat16, #x " must be float16 or bfloat16")  // fails for every dtype other than kFloat16 / kBFloat16
+#define CHECK_INPUT(x) \
+    CHECK_CUDA(x);     \
+    CHECK_CONTIGUOUS(x); \
+    CHECK_IS_HALF_OR_BFLOAT(x)  // expands to three separate statements: use only as a standalone `CHECK_INPUT(t);`, never as the unbraced body of an if/else
+#define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")  // fails unless x.sizes() equals the listed dimensions
+std::vector<torch::Tensor> butterfly_cuda(  // declaration only (no body in this header); called by butterfly and butterfly_gated
+    torch::Tensor x,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> x_gate = std::nullopt  // optional gate; butterfly omits it, butterfly_gated supplies it
+);
+std::vector<torch::Tensor> butterfly_bf16_cuda(  // declaration only; called by butterfly_bf16 and butterfly_gated_bf16
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> out_gate = std::nullopt  // NOTE(review): named out_gate, but butterfly_gated_bf16 passes its x_gate here — confirm against the definition
+);
+std::vector<torch::Tensor> butterfly_padded_cuda(  // declaration only; called by butterfly_padded and butterfly_padded_gated
+    torch::Tensor x,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M,  // forwarded unchanged by the wrappers; its meaning is not visible in this header
+    std::optional<at::Tensor> x_gate = std::nullopt  // optional gate; omitted by butterfly_padded
+);
+std::vector<torch::Tensor> butterfly_padded_bf16_cuda(  // declaration only; called by butterfly_padded_bf16 and butterfly_padded_gated_bf16
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M,  // forwarded unchanged by the wrappers; its meaning is not visible in this header
+    std::optional<at::Tensor> x_gate = std::nullopt  // optional gate; omitted by butterfly_padded_bf16
+);
+torch::Tensor butterfly_ifft_cuda(  // declaration only; called by butterfly_ifft and butterfly_ifft_gated
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> out_gate = std::nullopt  // optional gate; omitted by butterfly_ifft
+);
+torch::Tensor butterfly_ifft_bf16_cuda(  // declaration only; called by butterfly_ifft_bf16 and butterfly_ifft_gated_bf16
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> x_gate = std::nullopt  // NOTE(review): named x_gate, but butterfly_ifft_gated_bf16 passes its out_gate here — confirm against the definition
+);
+torch::Tensor butterfly_ifft_padded_cuda(  // declaration only; called by butterfly_ifft_padded and butterfly_ifft_padded_gated
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N,  // forwarded unchanged by the wrappers; its meaning is not visible in this header
+    std::optional<at::Tensor> out_gate = std::nullopt  // optional gate; omitted by butterfly_ifft_padded
+);
+torch::Tensor butterfly_ifft_padded_bf16_cuda(  // declaration only; called by butterfly_ifft_padded_bf16 and butterfly_ifft_padded_gated_bf16
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N,  // forwarded unchanged by the wrappers; its meaning is not visible in this header
+    std::optional<at::Tensor> out_gate = std::nullopt  // optional gate; omitted by butterfly_ifft_padded_bf16
+);
+std::vector<torch::Tensor> butterfly(  // ungated forward wrapper: validates tensors, then returns butterfly_cuda's result
+    torch::Tensor x,
+    torch::Tensor d_f_T,  // forwarded without a CHECK_INPUT call; NOTE(review): presumably a complex dtype that CHECK_INPUT would reject — confirm
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag
+){
+    CHECK_INPUT(x);  // each CHECK_INPUT requires a contiguous CUDA tensor of dtype float16 or bfloat16
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_cuda(x, d_f_T, twiddle_factors_real, twiddle_factors_imag);  // x_gate is left at its std::nullopt default
+}
+std::vector<torch::Tensor> butterfly_gated(  // gated forward wrapper: validates tensors (gate included), then returns butterfly_cuda's result
+    torch::Tensor x,
+    torch::Tensor d_f_T,  // forwarded without a CHECK_INPUT call; NOTE(review): presumably a complex dtype that CHECK_INPUT would reject — confirm
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    torch::Tensor x_gate
+){
+    CHECK_INPUT(x);  // each CHECK_INPUT requires a contiguous CUDA tensor of dtype float16 or bfloat16
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(x_gate);
+    return butterfly_cuda(x, d_f_T, twiddle_factors_real, twiddle_factors_imag, x_gate);  // x_gate fills the optional gate parameter
+}
+std::vector<torch::Tensor> butterfly_bf16(  // ungated forward wrapper for the split real/imag d_f_T variant; returns butterfly_bf16_cuda's result
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag
+){
+    CHECK_INPUT(x);  // each CHECK_INPUT requires a contiguous CUDA tensor of dtype float16 or bfloat16
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(d_f_T_real);
+    CHECK_INPUT(d_f_T_imag);
+    return butterfly_bf16_cuda(x, d_f_T_real, d_f_T_imag, twiddle_factors_real, twiddle_factors_imag);  // gate parameter is left at its std::nullopt default
+}
+std::vector<torch::Tensor> butterfly_gated_bf16(  // gated forward wrapper for the split real/imag d_f_T variant; returns butterfly_bf16_cuda's result
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    torch::Tensor x_gate
+){
+    CHECK_INPUT(x);  // each CHECK_INPUT requires a contiguous CUDA tensor of dtype float16 or bfloat16
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(d_f_T_real);
+    CHECK_INPUT(d_f_T_imag);
+    CHECK_INPUT(x_gate);
+    return butterfly_bf16_cuda(x, d_f_T_real, d_f_T_imag, twiddle_factors_real, twiddle_factors_imag, x_gate);  // x_gate lands in the parameter the declaration names out_gate
+}
+torch::Tensor butterfly_ifft(  // ungated inverse wrapper: validates tensors, then returns butterfly_ifft_cuda's result
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_T,  // forwarded without a CHECK_INPUT call; NOTE(review): presumably a complex dtype that CHECK_INPUT would reject — confirm
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag
+){
+    CHECK_INPUT(x_real);  // each CHECK_INPUT requires a contiguous CUDA tensor of dtype float16 or bfloat16
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_ifft_cuda(x_real, x_imag, d_f_T, twiddle_factors_real, twiddle_factors_imag);  // out_gate is left at its std::nullopt default
+}
+torch::Tensor butterfly_ifft_gated(  // gated inverse wrapper: validates tensors (gate included), then returns butterfly_ifft_cuda's result
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_T,  // forwarded without a CHECK_INPUT call; NOTE(review): presumably a complex dtype that CHECK_INPUT would reject — confirm
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    torch::Tensor out_gate
+){
+    CHECK_INPUT(x_real);  // each CHECK_INPUT requires a contiguous CUDA tensor of dtype float16 or bfloat16
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(out_gate);
+    return butterfly_ifft_cuda(x_real, x_imag, d_f_T, twiddle_factors_real, twiddle_factors_imag, out_gate);  // out_gate fills the optional gate parameter
+}
+torch::Tensor butterfly_ifft_bf16(  // ungated inverse wrapper for the split real/imag d_f variant; returns butterfly_ifft_bf16_cuda's result
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag
+){
+    CHECK_INPUT(x_real);  // each CHECK_INPUT requires a contiguous CUDA tensor of dtype float16 or bfloat16
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(d_f_real);
+    CHECK_INPUT(d_f_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_ifft_bf16_cuda(x_real, x_imag, d_f_real, d_f_imag, twiddle_factors_real, twiddle_factors_imag);  // gate parameter is left at its std::nullopt default
+}
+torch::Tensor butterfly_ifft_gated_bf16(  // gated inverse wrapper for the split real/imag d_f variant; returns butterfly_ifft_bf16_cuda's result
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    torch::Tensor out_gate
+){
+    CHECK_INPUT(x_real);  // each CHECK_INPUT requires a contiguous CUDA tensor of dtype float16 or bfloat16
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(d_f_real);
+    CHECK_INPUT(d_f_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(out_gate);
+    return butterfly_ifft_bf16_cuda(x_real, x_imag, d_f_real, d_f_imag, twiddle_factors_real, twiddle_factors_imag, out_gate);  // out_gate lands in the parameter the declaration names x_gate
+}
+std::vector<torch::Tensor> butterfly_padded(  // ungated padded forward wrapper: validates tensors, then returns butterfly_padded_cuda's result
+    torch::Tensor x,
+    torch::Tensor d_f_T,  // forwarded without a CHECK_INPUT call; NOTE(review): presumably a complex dtype that CHECK_INPUT would reject — confirm
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M  // forwarded unchanged; NOTE(review): presumably a length used for padding — confirm against the kernel
+){
+    CHECK_INPUT(x);  // each CHECK_INPUT requires a contiguous CUDA tensor of dtype float16 or bfloat16
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_padded_cuda(x, d_f_T, twiddle_factors_real, twiddle_factors_imag, M);  // x_gate is left at its std::nullopt default
+}
+// Ungated, padded forward butterfly for the split real/imag d_f_T variant.
+//
+// Validates every tensor argument with CHECK_INPUT (CUDA device, contiguous,
+// dtype float16 or bfloat16; a violation raises through TORCH_CHECK) and then
+// forwards everything to butterfly_padded_bf16_cuda, leaving its optional gate
+// at the std::nullopt default. d_f_T_real / d_f_T_imag are validated the same
+// way butterfly_bf16 and butterfly_padded_gated_bf16 validate them, so a
+// wrong-device, strided or wrong-dtype matrix is rejected here instead of
+// being handed to the CUDA implementation.
+//
+// M is forwarded unchanged. Returns the result of butterfly_padded_bf16_cuda.
+std::vector<torch::Tensor> butterfly_padded_bf16(
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(d_f_T_real);
+    CHECK_INPUT(d_f_T_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_padded_bf16_cuda(x, d_f_T_real, d_f_T_imag, twiddle_factors_real, twiddle_factors_imag, M);
+}
+// Gated, padded forward butterfly.
+//
+// Validates x, both twiddle-factor tensors and x_gate with CHECK_INPUT (CUDA
+// device, contiguous, dtype float16 or bfloat16; a violation raises through
+// TORCH_CHECK) and then forwards everything to butterfly_padded_cuda. x_gate
+// is validated the same way butterfly_gated validates its gate, so a
+// wrong-device, strided or wrong-dtype gate is rejected here instead of being
+// handed to the CUDA implementation.
+//
+// d_f_T is forwarded without a CHECK_INPUT call, as in the other wrappers
+// that take a single d_f_T tensor. M is forwarded unchanged.
+// Returns the result of butterfly_padded_cuda.
+std::vector<torch::Tensor> butterfly_padded_gated(
+    torch::Tensor x,
+    torch::Tensor d_f_T,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M,
+    torch::Tensor x_gate
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(x_gate);
+    return butterfly_padded_cuda(x, d_f_T, twiddle_factors_real, twiddle_factors_imag, M, x_gate);
+}
+// Gated, padded forward butterfly for the split real/imag d_f_T variant.
+//
+// Validates every tensor argument, the gate included, with CHECK_INPUT (CUDA
+// device, contiguous, dtype float16 or bfloat16; a violation raises through
+// TORCH_CHECK) and then forwards everything to butterfly_padded_bf16_cuda.
+// x_gate is validated the same way butterfly_gated_bf16 validates its gate,
+// so a wrong-device, strided or wrong-dtype gate is rejected here instead of
+// being handed to the CUDA implementation.
+//
+// M is forwarded unchanged. Returns the result of butterfly_padded_bf16_cuda.
+std::vector<torch::Tensor> butterfly_padded_gated_bf16(
+    torch::Tensor x,
+    torch::Tensor d_f_T_real,
+    torch::Tensor d_f_T_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int M,
+    torch::Tensor x_gate
+){
+    CHECK_INPUT(x);
+    CHECK_INPUT(d_f_T_real);
+    CHECK_INPUT(d_f_T_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(x_gate);
+    return butterfly_padded_bf16_cuda(x, d_f_T_real, d_f_T_imag, twiddle_factors_real, twiddle_factors_imag, M, x_gate);
+}
+torch::Tensor butterfly_ifft_padded(  // ungated padded inverse wrapper: validates tensors, then returns butterfly_ifft_padded_cuda's result
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f,  // forwarded without a CHECK_INPUT call; NOTE(review): presumably a complex dtype that CHECK_INPUT would reject — confirm
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N  // forwarded unchanged; NOTE(review): presumably a length used for padding — confirm against the kernel
+){
+    CHECK_INPUT(x_real);  // each CHECK_INPUT requires a contiguous CUDA tensor of dtype float16 or bfloat16
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_ifft_padded_cuda(x_real, x_imag, d_f, twiddle_factors_real, twiddle_factors_imag, N);  // out_gate is left at its std::nullopt default
+}
+// Gated, padded inverse butterfly.
+//
+// Validates x_real, x_imag, both twiddle-factor tensors and out_gate with
+// CHECK_INPUT (CUDA device, contiguous, dtype float16 or bfloat16; a violation
+// raises through TORCH_CHECK) and then forwards everything to
+// butterfly_ifft_padded_cuda. out_gate is validated the same way
+// butterfly_ifft_gated validates its gate, so a wrong-device, strided or
+// wrong-dtype gate is rejected here instead of being handed to the CUDA
+// implementation.
+//
+// d_f is forwarded without a CHECK_INPUT call, as in the other wrappers that
+// take a single d_f tensor. N is forwarded unchanged.
+// Returns the result of butterfly_ifft_padded_cuda.
+torch::Tensor butterfly_ifft_padded_gated(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N,
+    torch::Tensor out_gate
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(out_gate);
+    return butterfly_ifft_padded_cuda(x_real, x_imag, d_f, twiddle_factors_real, twiddle_factors_imag, N, out_gate);
+}
+torch::Tensor butterfly_ifft_padded_bf16(  // ungated padded inverse wrapper for the split real/imag d_f variant; returns butterfly_ifft_padded_bf16_cuda's result
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N  // forwarded unchanged; NOTE(review): presumably a length used for padding — confirm against the kernel
+){
+    CHECK_INPUT(x_real);  // each CHECK_INPUT requires a contiguous CUDA tensor of dtype float16 or bfloat16
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(d_f_real);
+    CHECK_INPUT(d_f_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    return butterfly_ifft_padded_bf16_cuda(x_real, x_imag, d_f_real, d_f_imag, twiddle_factors_real, twiddle_factors_imag, N);  // out_gate is left at its std::nullopt default
+}
+// Gated, padded inverse butterfly for the split real/imag d_f variant.
+//
+// Validates every tensor argument, the gate included, with CHECK_INPUT (CUDA
+// device, contiguous, dtype float16 or bfloat16; a violation raises through
+// TORCH_CHECK) and then forwards everything to
+// butterfly_ifft_padded_bf16_cuda. out_gate is validated the same way
+// butterfly_ifft_gated_bf16 validates its gate, so a wrong-device, strided or
+// wrong-dtype gate is rejected here instead of being handed to the CUDA
+// implementation.
+//
+// N is forwarded unchanged. Returns the result of
+// butterfly_ifft_padded_bf16_cuda.
+torch::Tensor butterfly_ifft_padded_gated_bf16(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    int N,
+    torch::Tensor out_gate
+){
+    CHECK_INPUT(x_real);
+    CHECK_INPUT(x_imag);
+    CHECK_INPUT(d_f_real);
+    CHECK_INPUT(d_f_imag);
+    CHECK_INPUT(twiddle_factors_real);
+    CHECK_INPUT(twiddle_factors_imag);
+    CHECK_INPUT(out_gate);
+    return butterfly_ifft_padded_bf16_cuda(x_real, x_imag, d_f_real, d_f_imag, twiddle_factors_real, twiddle_factors_imag, N, out_gate);
 }

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_cuda.cu CHANGED Viewed

@@ -1,699 +1,699 @@
-// Copyright (c) 2023 Dan Fu, Hermann Kumbong
-#include <torch/extension.h>
-#include <vector>
-#include <stdio.h>
-#include <mma.h>
-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
-#include "shared.h"
-using namespace nvcuda;
-__global__ void butterfly_cuda_kernel_64(
-    const __half2 *__restrict__ x,
-    const __half2 *__restrict__ x_gate,
-    const complex_half_t *__restrict__ d_f,
-    const __half2 *__restrict__ twiddle_factors_real,
-    const __half2 *__restrict__ twiddle_factors_imag,
-    __half2 *__restrict__ out_real,
-    __half2 *__restrict__ out_imag,
-    uint B,
-    uint H,
-    int N)
-{
-    const int offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
-    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
-    int idx;
-    int shared_offset;
-    const int B_Y = blockDim.y;
-    const int n = N / B_Y;
-    extern __shared__ half x_shared[];
-    half *d_f_real = &x_shared[N * N];
-    half *d_f_imag = &d_f_real[N * N];
-    half *twiddles_real_shared = &d_f_imag[N * N];
-    half *twiddles_imag_shared = &twiddles_real_shared[N * N];
-    half *out_real_shared = &twiddles_imag_shared[N * N];
-    half *out_imag_shared = &out_real_shared[N * N];
-    // #pragma unroll
-    for (int i = 0; i < n; i++)
-    {
-        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
-        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
-        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
-        // #pragma unroll
-        shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x;
-        d_f_real[shared_offset] = d_f[shared_offset].real();
-        d_f_imag[shared_offset] = d_f[shared_offset].imag();
-        d_f_real[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].real();
-        d_f_imag[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].imag();
-    }
-    __half2 tmp_real, tmp_imag;
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[4];
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[4];
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[4];
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[4];
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[4][4];
-    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[4];
-    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[4];
-    __syncthreads();
-    for (int i = 0; i < 4; i++)
-    {
-        wmma::load_matrix_sync(a_frag_real[i], d_f_real + i * N * 16 + threadIdx.y * 16, N);
-        wmma::load_matrix_sync(a_frag_imag[i], d_f_imag + i * N * 16 + threadIdx.y * 16, N);
-        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + threadIdx.y * N * 16 + i * 16, N);
-        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + threadIdx.y * N * 16 + i * 16, N);
-    }
-    for (int t = 0; t < 16; t++)
-    {
-        for (int i = 0; i < n; i++)
-        {
-            idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
-            shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
-            if(x_gate != nullptr){
-                reinterpret_cast<__half2 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
-            }else{
-                reinterpret_cast<__half2 *>(x_shared)[shared_offset] = x[idx + offset];
-            }
-        }
-        __syncthreads();
-        for (int i = 0; i < 4; i++)
-        {
-            for (int j = 0; j < 4; j++)
-            {
-                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * N * 16 + j * 16, N);
-            }
-        }
-#pragma unroll
-        for (int j = 0; j < 4; j++)
-        {
-            wmma::fill_fragment(acc_frag_real[j], __float2half(0.0f));
-            for (int k = 0; k < 4; k++)
-            {
-                wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
-            }
-        }
-#pragma unroll
-        for (int j = 0; j < 4; j++)
-        {
-            wmma::fill_fragment(acc_frag_imag[j], __float2half(0.0f));
-            for (int k = 0; k < 4; k++)
-            {
-                wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
-            }
-        }
-#pragma unroll
-        for (int j = 0; j < 4; j++)
-        {
-            for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
-            {
-                tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k];
-                tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k];
-                reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]));
-                reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]));
-            }
-            wmma::store_matrix_sync(out_real_shared + threadIdx.y * N * 16 + j * 16, acc_frag_real[j], N, wmma::mem_row_major);
-            wmma::store_matrix_sync(out_imag_shared + threadIdx.y * N * 16 + j * 16, acc_frag_imag[j], N, wmma::mem_row_major);
-        }
-        __syncthreads();
-#pragma unroll
-        for (int i = 0; i < n; i++)
-        {
-            idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
-            out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
-            out_imag[idx] = reinterpret_cast<__half2 *>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
-        }
-        __syncthreads();
-    }
-}
-__global__ void butterfly_cuda_kernel_32(
-    const __half2 *__restrict__ x,
-    const __half2 *__restrict__ x_gate,
-    const complex_half_t *__restrict__ d_f,
-    const __half2 *__restrict__ twiddle_factors_real,
-    const __half2 *__restrict__ twiddle_factors_imag,
-    __half2 *__restrict__ out_real,
-    __half2 *__restrict__ out_imag,
-    uint B,
-    uint H,
-    int N)
-{
-    const int offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
-    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
-    int idx;
-    int shared_offset;
-    const int B_Y = blockDim.y;
-    const int n = N / B_Y;
-    __shared__ half x_shared[32 * 64];
-    __shared__ half d_f_real[32 * 32];
-    __shared__ half d_f_imag[32 * 32];
-    __shared__ half twiddles_real_shared[32 * 64];
-    __shared__ half twiddles_imag_shared[32 * 64];
-    __shared__ half out_real_shared[32 * 64];
-    __shared__ half out_imag_shared[32 * 64];
-    // #pragma unroll
-    for (int i = 0; i < n; i++)
-    {
-        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
-        if(x_gate == nullptr){
-            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = x[idx + offset];
-        }else{
-            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
-        }
-        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
-        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
-        // #pragma unroll
-        d_f_real[shared_offset] = d_f[shared_offset].real();
-        d_f_imag[shared_offset] = d_f[shared_offset].imag();
-    }
-    __syncthreads();
-    if (threadIdx.y < N / 16)
-    {
-        __half2 tmp_real, tmp_imag;
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[2][2];
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[2][2];
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[2][2];
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[2][2];
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[2][2];
-        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[2][2];
-        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[2][2];
-        int t = threadIdx.y * 32;
-        for (int i = 0; i < 2; i++)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
-                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
-                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
-                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
-                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
-            }
-        }
-#pragma unroll
-        for (int i = 0; i < 2; i++)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                wmma::fill_fragment(acc_frag_real[i][j], __float2half(0.0f));
-                for (int k = 0; k < 2; k++)
-                {
-                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag[k][j], acc_frag_real[i][j]);
-                }
-            }
-        }
-#pragma unroll
-        for (int i = 0; i < 2; i++)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                wmma::fill_fragment(acc_frag_imag[i][j], __float2half(0.0f));
-                for (int k = 0; k < 2; k++)
-                {
-                    wmma::mma_sync(acc_frag_imag[i][j], a_frag_imag[i][k], b_frag[k][j], acc_frag_imag[i][j]);
-                }
-            }
-        }
-#pragma unroll
-        for (int i = 0; i < 2; i++)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                for (int k = 0; k < acc_frag_real[i][j].num_elements / 2; k++)
-                {
-                    tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[i][j].x)[k];
-                    tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[i][j].x)[k];
-                    reinterpret_cast<__half2 *>(acc_frag_real[i][j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[i][j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[i][j].x)[k]));
-                    reinterpret_cast<__half2 *>(acc_frag_imag[i][j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[i][j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[i][j].x)[k]));
-                }
-                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
-                wmma::store_matrix_sync(out_imag_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_imag[i][j], 2 * N, wmma::mem_row_major);
-            }
-        }
-    }
-    __syncthreads();
-#pragma unroll
-    for (int i = 0; i < n; i++)
-    {
-        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
-        out_imag[idx] = reinterpret_cast<__half2 *>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
-    }
-}
-__global__ void butterfly_cuda_kernel_128( // N = 128 variant: WMMA products of the real and imaginary parts of d_f with an input tile, then an elementwise complex multiply by the twiddle factors
-    const __half2 *__restrict__ x, // input, read as __half2 pairs
-    const __half2 *__restrict__ x_gate, // optional gate multiplied elementwise into x; skipped when nullptr
-    const complex_half_t *__restrict__ d_f, // complex matrix, indexed below as 128 rows of 128 entries
-    const __half2 *__restrict__ twiddle_factors_real, // real parts of the twiddle factors
-    const __half2 *__restrict__ twiddle_factors_imag, // imaginary parts of the twiddle factors
-    __half2 *__restrict__ out_real, // output: real part of the result
-    __half2 *__restrict__ out_imag, // output: imaginary part of the result
-    uint B, // not referenced in this kernel body
-    uint H, // scales the blockIdx.y term of the base offset
-    int N) // only used to derive n = N / blockDim.y
-{
-    const int offset = blockIdx.y * H * 128 * 32 * gridDim.x * 2 + blockIdx.z * 16 * 128 * 32 * gridDim.x * 2 + blockIdx.x * 64 + threadIdx.x; // base index into x/out_real/out_imag (in __half2 units) for this block and thread
-    const int tw_offset = blockIdx.x * 64 + threadIdx.x; // twiddle base index; depends only on blockIdx.x and threadIdx.x
-    int idx;
-    int shared_offset;
-    const int B_Y = blockDim.y;
-    const int n = N / B_Y; // number of rows each threadIdx.y value copies in the staging loops
-    extern __shared__ half shared_real[]; // dynamic shared memory; the first 128 * 128 halves
-    half *shared_imag = &shared_real[128 * 128]; // the next 128 * 128 halves of the same allocation
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[8];
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[8];
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[8];
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[8];
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[8][8];
-    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[8];
-    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[8];
-    for (int i = 0; i < n; i++) // stage 1: split d_f into real and imaginary planes in shared memory
-    {
-        for(int j=0; j< 4; j++){ // four column chunks of blockDim.x entries per row
-            shared_offset = (threadIdx.y + i * B_Y) * 128 + threadIdx.x + j * blockDim.x;
-            shared_real[shared_offset] = d_f[shared_offset].real();
-            shared_imag[shared_offset] = d_f[shared_offset].imag();
-        }
-    }
-    __syncthreads();
-    for (int i = 0; i < 8; i++){ // keep the d_f tiles for column strip threadIdx.y * 16 in fragments
-        wmma::load_matrix_sync(a_frag_real[i], shared_real + i * 128 * 16 + threadIdx.y * 16, 128);
-        wmma::load_matrix_sync(a_frag_imag[i], shared_imag + i * 128 * 16 + threadIdx.y * 16, 128);
-    }
-    __syncthreads(); // every fragment load must finish before the shared buffers are reused for the twiddles
-    for (int i = 0; i < n; i++) // stage 2: reuse the same shared buffers for the twiddle factors
-    {
-        for(int j=0; j< 2; j++){
-            idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x;
-            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
-            reinterpret_cast<__half2*>(shared_real)[shared_offset] = twiddle_factors_real[tw_offset + idx];
-            reinterpret_cast<__half2*>(shared_imag)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
-        }
-    }
-    __syncthreads();
-    for (int i = 0; i < 8; i++){ // keep the twiddle tiles for row strip threadIdx.y * 16 in fragments
-        wmma::load_matrix_sync(tw_frag_real[i], shared_real + threadIdx.y * 128 * 16 + i * 16, 128);
-        wmma::load_matrix_sync(tw_frag_imag[i], shared_imag + threadIdx.y * 128 * 16 + i * 16, 128);
-    }
-    __syncthreads(); // twiddle fragments are loaded; shared buffers are free for the input tiles
-    for(int t=0; t< 16; t++){ // 16 slabs per block, each 128 * 32 * 2 * gridDim.x __half2 elements apart
-        for (int i = 0; i < n; i++)
-        {
-            for(int j=0; j< 2; j++){
-                idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
-                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
-                if(x_gate != nullptr){
-                    reinterpret_cast<__half2*>(shared_real)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]); // gated input
-                }else{
-                    reinterpret_cast<__half2*>(shared_real)[shared_offset] = x[offset + idx];
-                }
-            }
-        }
-        __syncthreads();
-        for (int i = 0; i < 8; i++) // every warp loads the full 8 x 8 grid of input tiles
-        {
-            for (int j = 0; j < 8; j++)
-            {
-                wmma::load_matrix_sync(b_frag[i][j], shared_real + i * 128 * 16 + j * 16, 128);
-            }
-        }
-        __syncthreads(); // all reads of the input tile must finish: shared_real is overwritten with results below
-        #pragma unroll
-            for (int j = 0; j < 8; j++) // real plane of d_f times the input
-            {
-                wmma::fill_fragment(acc_frag_real[j], __float2half(0.0f));
-                for (int k = 0; k < 8; k++)
-                {
-                    wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
-                }
-            }
-    #pragma unroll
-            for (int j = 0; j < 8; j++) // imaginary plane of d_f times the input
-            {
-                wmma::fill_fragment(acc_frag_imag[j], __float2half(0.0f));
-                for (int k = 0; k < 8; k++)
-                {
-                    wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
-                }
-            }
-            __half2 tmp_real, tmp_imag;
-    #pragma unroll
-            for (int j = 0; j < 8; j++)
-            {
-                for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++) // NOTE(review): accumulator and matrix_a fragments are indexed with the same k; this assumes both share a per-thread element layout - confirm
-                {
-                    tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k];
-                    tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k];
-                    reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k])); // real = re * tw_re - im * tw_im
-                    reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k])); // imag = re * tw_im + im * tw_re
-                }
-                wmma::store_matrix_sync(shared_real + threadIdx.y * 128 * 16 + j * 16, acc_frag_real[j], 128, wmma::mem_row_major);
-                wmma::store_matrix_sync(shared_imag + threadIdx.y * 128 * 16 + j * 16, acc_frag_imag[j], 128, wmma::mem_row_major);
-            }
-            __syncthreads(); // results from all warps must be in shared memory before the copy-out
-    #pragma unroll
-            for (int i = 0; i < n; i++) // copy the result tile to global memory using the same indexing as the input load
-            {
-                for(int j=0; j< 2; j++){
-                    idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
-                    shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
-                    out_real[offset + idx] = reinterpret_cast<__half2*>(shared_real)[shared_offset];
-                    out_imag[offset + idx] = reinterpret_cast<__half2*>(shared_imag)[shared_offset];
-                }
-            }
-            __syncthreads(); // copy-out must finish before the next iteration overwrites shared_real
-    }
-}
-__global__ void butterfly_cuda_kernel_16( // N = 16 variant: one 16 x 16 WMMA product per warp for the real and imaginary parts of d_f, then an elementwise complex multiply by the twiddle factors
-    const __half2 *__restrict__ x, // input, read as __half2 pairs
-    const __half2 *__restrict__ x_gate, // optional gate multiplied elementwise into x; skipped when NULL
-    const complex_half_t *__restrict__ d_f, // complex matrix, indexed below as rows of 16 entries
-    const __half2 *__restrict__ twiddle_factors_real, // real parts of the twiddle factors
-    const __half2 *__restrict__ twiddle_factors_imag, // imaginary parts of the twiddle factors
-    __half2 *__restrict__ out_real, // output: real part of the result
-    __half2 *__restrict__ out_imag, // output: imaginary part of the result
-    uint B, // not referenced in this kernel body
-    uint H, // scales the blockIdx.y term of the base offset
-    int N) // derives n = N / blockDim.y and is the leading dimension for the d_f fragment loads
-{
-    const int offset = blockIdx.y * H * 16 * 32 * gridDim.x + blockIdx.z * 16 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x; // base index into x/out_real/out_imag (in __half2 units) for this block and thread
-    const int tw_offset = blockIdx.x * 32 + threadIdx.x; // twiddle base index; depends only on blockIdx.x and threadIdx.x
-    int idx;
-    int shared_offset;
-    const int B_Y = blockDim.y;
-    const int n = N / B_Y; // number of rows each threadIdx.y value copies
-    __shared__ half x_shared[16 * 64]; // input tile: 16 rows of 64 halves
-    __shared__ half d_f_real[16 * 16];
-    __shared__ half d_f_imag[16 * 16];
-    __shared__ half twiddles_real_shared[16 * 64];
-    __shared__ half twiddles_imag_shared[16 * 64];
-    __shared__ half out_real_shared[16 * 64];
-    __shared__ half out_imag_shared[16 * 64];
-    // #pragma unroll
-    for (int i = 0; i < n; i++) // stage input, twiddles and d_f into shared memory
-    {
-        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
-        if(x_gate != NULL)
-            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]); // gated input
-        else
-            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = x[idx + offset];
-        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
-        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
-        // #pragma unroll
-        if(threadIdx.x  < 16 ){ // d_f rows hold only 16 entries, so only the first 16 threadIdx.x values copy
-            shared_offset = (threadIdx.y + i * B_Y) * 16 + threadIdx.x;
-            d_f_real[shared_offset] = d_f[shared_offset].real();
-            d_f_imag[shared_offset] = d_f[shared_offset].imag();
-        }
-    }
-    __syncthreads();
-    if (threadIdx.y < 4) // each participating threadIdx.y handles the 16-half column slice starting at threadIdx.y * 16
-    {
-        __half2 tmp_real, tmp_imag;
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real;
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real;
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag;
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag;
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
-        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real;
-        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag;
-        wmma::load_matrix_sync(a_frag_real, d_f_real, N);
-        wmma::load_matrix_sync(a_frag_imag, d_f_imag, N);
-        wmma::load_matrix_sync(b_frag, x_shared + threadIdx.y * 16, 64);
-        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
-        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
-        wmma::fill_fragment(acc_frag_real, __float2half(0.0f));
-        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag, acc_frag_real); // real plane of d_f times the input
-        wmma::fill_fragment(acc_frag_imag, __float2half(0.0f));
-        wmma::mma_sync(acc_frag_imag, a_frag_imag, b_frag, acc_frag_imag); // imaginary plane of d_f times the input
-        for (int k = 0; k < acc_frag_real.num_elements / 2; k++) // NOTE(review): accumulator and matrix_a fragments are indexed with the same k; this assumes both share a per-thread element layout - confirm
-        {
-            tmp_real = reinterpret_cast<__half2 *>(acc_frag_real.x)[k];
-            tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag.x)[k];
-            reinterpret_cast<__half2 *>(acc_frag_real.x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real.x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag.x)[k])); // real = re * tw_re - im * tw_im
-            reinterpret_cast<__half2 *>(acc_frag_imag.x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag.x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real.x)[k])); // imag = re * tw_im + im * tw_re
-        }
-        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
-        wmma::store_matrix_sync(out_imag_shared + threadIdx.y * 16, acc_frag_imag, 64, wmma::mem_row_major);
-    }
-    __syncthreads(); // outside the threadIdx.y guard, so every thread of the block reaches it
-#pragma unroll
-    for (int i = 0; i < n; i++) // copy the result tile to global memory using the same indexing as the input load
-    {
-        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
-        out_imag[idx] = reinterpret_cast<__half2 *>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
-    }
-}
-std::vector<torch::Tensor> butterfly_cuda( // Host launcher: picks launch dimensions from N and M, dispatches to the size-specific kernel and returns {out_real, out_imag}
-    torch::Tensor x, // 4-D input; sizes are read as (B, H, N, M)
-    torch::Tensor d_f, // passed to the kernels as complex_half_t data
-    torch::Tensor twiddle_factors_real, // passed to the kernels as __half2 data
-    torch::Tensor twiddle_factors_imag, // passed to the kernels as __half2 data
-    std::optional<at::Tensor> x_gate = std::nullopt) // optional gate; forwarded as nullptr when absent
-{
-    uint B = x.size(0);
-    uint H = x.size(1);
-    // uint m = x.size(1);
-    // const int TILE_SIZE = 16;
-    uint N = x.size(2);
-    uint M = x.size(3);
-    dim3 gridDim;
-    dim3 blockDim;
-    gridDim.y = B;
-    gridDim.z = H; // overridden with H / 16 for the N = 64 and N = 128 cases below
-    torch::Tensor out_real = torch::empty({B, H, N, M}, x.options()); // allocated uninitialized; expected to be filled by the kernel
-    torch::Tensor out_imag = torch::empty({B, H, N, M}, x.options());
-    //set blockDims
-    switch(N){
-        case 128:
-            blockDim.x = 32;
-            blockDim.y = 8;
-            break;
-        default:
-            blockDim.x = 32;
-            blockDim.y = 4;
-            break;
-    }
-    //set gridDim.x
-    switch(N){ // gridDim.x comes from a fixed table keyed on M; any M not listed falls through to the default entry
-        case 128:
-            switch (M){
-                case 16384:
-                    gridDim.x = 128;
-                    break;
-                case 8192:
-                    gridDim.x = 64;
-                    break;
-                case 4096:
-                    gridDim.x = 32;
-                    break;
-                default:
-                    gridDim.x = 256;
-                    break;
-            }
-            break;
-        default:
-            switch (M){
-                case 16384:
-                    gridDim.x = 256;
-                    break;
-                case 8192:
-                    gridDim.x = 128;
-                    break;
-                case 4096:
-                    gridDim.x = 64;
-                    break;
-                default:
-                    gridDim.x = 512;
-                    break;
-            }
-            break;
-    }
-    switch (N) // NOTE(review): no launch below is followed by an error check, and no stream is passed in the launch configuration
-    {
-    case 16:
-        butterfly_cuda_kernel_16<<<gridDim, blockDim>>>(
-            static_cast<__half2 *>(x.data_ptr()),
-            x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
-            static_cast<complex_half_t *>(d_f.data_ptr()),
-            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
-            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
-            static_cast<__half2 *>(out_real.data_ptr()),
-            static_cast<__half2 *>(out_imag.data_ptr()),
-            B,
-            H,
-            N);
-        break;
-    case 32:
-        butterfly_cuda_kernel_32<<<gridDim, blockDim>>>(
-            static_cast<__half2 *>(x.data_ptr()),
-            x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
-            static_cast<complex_half_t *>(d_f.data_ptr()),
-            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
-            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
-            static_cast<__half2 *>(out_real.data_ptr()),
-            static_cast<__half2 *>(out_imag.data_ptr()),
-            B,
-            H,
-            N);
-        break;
-    case 64:
-        gridDim.z = H / 16; // integer division; NOTE(review): any remainder of H / 16 is dropped - confirm H is a multiple of 16
-        cudaFuncSetAttribute(&butterfly_cuda_kernel_64, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // NOTE(review): return value is ignored
-        butterfly_cuda_kernel_64<<<gridDim, blockDim, 57344>>>( // 57344 bytes of dynamic shared memory
-            static_cast<__half2 *>(x.data_ptr()),
-            x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
-            static_cast<complex_half_t *>(d_f.data_ptr()),
-            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
-            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
-            static_cast<__half2 *>(out_real.data_ptr()),
-            static_cast<__half2 *>(out_imag.data_ptr()),
-            B,
-            H,
-            N);
-        break;
-    case 128:
-        gridDim.z = H / 16; // integer division; NOTE(review): any remainder of H / 16 is dropped - confirm H is a multiple of 16
-        cudaFuncSetAttribute(&butterfly_cuda_kernel_128, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // NOTE(review): return value is ignored
-        butterfly_cuda_kernel_128<<<gridDim, blockDim, 65536>>>( // 65536 bytes of dynamic shared memory
-            static_cast<__half2 *>(x.data_ptr()),
-            x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr,
-            static_cast<complex_half_t *>(d_f.data_ptr()),
-            static_cast<__half2 *>(twiddle_factors_real.data_ptr()),
-            static_cast<__half2 *>(twiddle_factors_imag.data_ptr()),
-            static_cast<__half2 *>(out_real.data_ptr()),
-            static_cast<__half2 *>(out_imag.data_ptr()),
-            B,
-            H,
-            N);
-        break;
-    default:
-    printf("Not yet implemented \n"); // NOTE(review): no kernel runs for this N, so the tensors returned below were never written
-        break;
-    }
-    return {out_real, out_imag};
 }

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "shared.h"
+using namespace nvcuda;
+__global__ void butterfly_cuda_kernel_64( // N = 64 variant: WMMA products of the real and imaginary parts of d_f with an input tile, then an elementwise complex multiply by the twiddle factors
+    const __half2 *__restrict__ x, // input, read as __half2 pairs
+    const __half2 *__restrict__ x_gate, // optional gate multiplied elementwise into x; skipped when nullptr
+    const complex_half_t *__restrict__ d_f, // complex matrix, indexed below as rows of 64 entries
+    const __half2 *__restrict__ twiddle_factors_real, // real parts of the twiddle factors
+    const __half2 *__restrict__ twiddle_factors_imag, // imaginary parts of the twiddle factors
+    __half2 *__restrict__ out_real, // output: real part of the result
+    __half2 *__restrict__ out_imag, // output: imaginary part of the result
+    uint B, // not referenced in this kernel body
+    uint H, // scales the blockIdx.y term of the base offset
+    int N) // sizes the shared-memory regions and is the leading dimension of every tile load and store
+{
+    const int offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x; // base index into x/out_real/out_imag (in __half2 units) for this block and thread
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x; // twiddle base index; depends only on blockIdx.x and threadIdx.x
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y; // number of rows each threadIdx.y value copies in the staging loops
+    extern __shared__ half x_shared[]; // dynamic shared memory, carved into seven consecutive regions of N * N halves
+    half *d_f_real = &x_shared[N * N];
+    half *d_f_imag = &d_f_real[N * N];
+    half *twiddles_real_shared = &d_f_imag[N * N];
+    half *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    half *out_real_shared = &twiddles_imag_shared[N * N];
+    half *out_imag_shared = &out_real_shared[N * N];
+    // #pragma unroll
+    for (int i = 0; i < n; i++) // stage the twiddles and d_f; both are reused for every t iteration below
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x; // d_f rows are 64 entries wide; each thread copies columns threadIdx.x and threadIdx.x + blockDim.x
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        d_f_real[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].real();
+        d_f_imag[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].imag();
+    }
+    __half2 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[4][4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[4];
+    __syncthreads(); // staged twiddles and d_f must be visible before the fragment loads
+    for (int i = 0; i < 4; i++) // d_f and twiddle fragments are loaded once and kept across the t loop
+    {
+        wmma::load_matrix_sync(a_frag_real[i], d_f_real + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(a_frag_imag[i], d_f_imag + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + threadIdx.y * N * 16 + i * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + threadIdx.y * N * 16 + i * 16, N);
+    }
+    for (int t = 0; t < 16; t++) // 16 slabs per block, each 64 * 32 * gridDim.x __half2 elements apart
+    {
+        for (int i = 0; i < n; i++)
+        {
+            idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+            if(x_gate != nullptr){
+                reinterpret_cast<__half2 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]); // gated input
+            }else{
+                reinterpret_cast<__half2 *>(x_shared)[shared_offset] = x[idx + offset];
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 4; i++) // every warp loads the full 4 x 4 grid of input tiles
+        {
+            for (int j = 0; j < 4; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * N * 16 + j * 16, N);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++) // real plane of d_f times the input
+        {
+            wmma::fill_fragment(acc_frag_real[j], __float2half(0.0f));
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++) // imaginary plane of d_f times the input
+        {
+            wmma::fill_fragment(acc_frag_imag[j], __float2half(0.0f));
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++) // NOTE(review): accumulator and matrix_a fragments are indexed with the same k; this assumes both share a per-thread element layout - confirm
+            {
+                tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k];
+                tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k];
+                reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k])); // real = re * tw_re - im * tw_im
+                reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k])); // imag = re * tw_im + im * tw_re
+            }
+            wmma::store_matrix_sync(out_real_shared + threadIdx.y * N * 16 + j * 16, acc_frag_real[j], N, wmma::mem_row_major);
+            wmma::store_matrix_sync(out_imag_shared + threadIdx.y * N * 16 + j * 16, acc_frag_imag[j], N, wmma::mem_row_major);
+        }
+        __syncthreads(); // results from all warps must be in shared memory before the copy-out
+#pragma unroll
+        for (int i = 0; i < n; i++) // copy the result tile to global memory using the same indexing as the input load
+        {
+            idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+            out_imag[idx] = reinterpret_cast<__half2 *>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+        }
+        __syncthreads(); // copy-out must finish before the next iteration rewrites the shared tiles
+    }
+}
+__global__ void butterfly_cuda_kernel_32( // N = 32 variant: 2 x 2 tiled WMMA products of the real and imaginary parts of d_f with an input tile, then an elementwise complex multiply by the twiddle factors
+    const __half2 *__restrict__ x, // input, read as __half2 pairs
+    const __half2 *__restrict__ x_gate, // optional gate multiplied elementwise into x; skipped when nullptr
+    const complex_half_t *__restrict__ d_f, // complex matrix, indexed below as rows of 32 entries
+    const __half2 *__restrict__ twiddle_factors_real, // real parts of the twiddle factors
+    const __half2 *__restrict__ twiddle_factors_imag, // imaginary parts of the twiddle factors
+    __half2 *__restrict__ out_real, // output: real part of the result
+    __half2 *__restrict__ out_imag, // output: imaginary part of the result
+    uint B, // not referenced in this kernel body
+    uint H, // scales the blockIdx.y term of the base offset
+    int N) // derives n, the warp guard N / 16, and the tile strides N and 2 * N
+{
+    const int offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x; // base index into x/out_real/out_imag (in __half2 units) for this block and thread
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x; // twiddle base index; depends only on blockIdx.x and threadIdx.x
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y; // number of rows each threadIdx.y value copies
+    __shared__ half x_shared[32 * 64]; // input tile: 32 rows of 64 halves
+    __shared__ half d_f_real[32 * 32];
+    __shared__ half d_f_imag[32 * 32];
+    __shared__ half twiddles_real_shared[32 * 64];
+    __shared__ half twiddles_imag_shared[32 * 64];
+    __shared__ half out_real_shared[32 * 64];
+    __shared__ half out_imag_shared[32 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++) // stage input, twiddles and d_f into shared memory
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        if(x_gate == nullptr){
+            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = x[idx + offset];
+        }else{
+            reinterpret_cast<__half2 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]); // gated input
+        }
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        d_f_real[shared_offset] = d_f[shared_offset].real(); // the same row * 32 + threadIdx.x index is reused, here in half units
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+    }
+    __syncthreads();
+    if (threadIdx.y < N / 16) // only the first N / 16 threadIdx.y values run the WMMA section
+    {
+        __half2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[2][2];
+        int t = threadIdx.y * 32; // column offset (in halves) of the 32-wide slice this threadIdx.y handles
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * 2 * N * 16 + j * 16 + t, 2 * N); // input and twiddle rows are 2 * N halves wide
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++) // real plane of d_f times the input
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], __float2half(0.0f));
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++) // imaginary plane of d_f times the input
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[i][j], __float2half(0.0f));
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[i][j], a_frag_imag[i][k], b_frag[k][j], acc_frag_imag[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                for (int k = 0; k < acc_frag_real[i][j].num_elements / 2; k++) // NOTE(review): accumulator and matrix_a fragments are indexed with the same k; this assumes both share a per-thread element layout - confirm
+                {
+                    tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[i][j].x)[k];
+                    tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[i][j].x)[k];
+                    reinterpret_cast<__half2 *>(acc_frag_real[i][j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[i][j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[i][j].x)[k])); // real = re * tw_re - im * tw_im
+                    reinterpret_cast<__half2 *>(acc_frag_imag[i][j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[i][j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[i][j].x)[k])); // imag = re * tw_im + im * tw_re
+                }
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+                wmma::store_matrix_sync(out_imag_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_imag[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads(); // outside the threadIdx.y guard, so every thread of the block reaches it
+#pragma unroll
+    for (int i = 0; i < n; i++) // copy the result tile to global memory using the same indexing as the input load
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+        out_imag[idx] = reinterpret_cast<__half2 *>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+    }
+}
+__global__ void butterfly_cuda_kernel_128(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 128 * 32 * gridDim.x * 2 + blockIdx.z * 16 * 128 * 32 * gridDim.x * 2 + blockIdx.x * 64 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 64 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    extern __shared__ half shared_real[];
+    half *shared_imag = &shared_real[128 * 128];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag[8][8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_imag[8];
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 4; j++){
+            shared_offset = (threadIdx.y + i * B_Y) * 128 + threadIdx.x + j * blockDim.x;
+            shared_real[shared_offset] = d_f[shared_offset].real();
+            shared_imag[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(a_frag_real[i], shared_real + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(a_frag_imag[i], shared_imag + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    __syncthreads();
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 2; j++){
+            idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__half2*>(shared_real)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+            reinterpret_cast<__half2*>(shared_imag)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], shared_real + threadIdx.y * 128 * 16 + i * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], shared_imag + threadIdx.y * 128 * 16 + i * 16, 128);
+    }
+    __syncthreads();
+    for(int t=0; t< 16; t++){
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                if(x_gate != nullptr){
+                    reinterpret_cast<__half2*>(shared_real)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+                }else{
+                    reinterpret_cast<__half2*>(shared_real)[shared_offset] = x[offset + idx];
+                }
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], shared_real + i * 128 * 16 + j * 16, 128);
+            }
+        }
+        __syncthreads();
+        #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[j], __float2half(0.0f));
+                for (int k = 0; k < 8; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+                }
+            }
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[j], __float2half(0.0f));
+                for (int k = 0; k < 8; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+                }
+            }
+            __half2 tmp_real, tmp_imag;
+    #pragma unroll
+            for (int j = 0; j < 8; j++)
+            {
+                for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+                {
+                    tmp_real = reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k];
+                    tmp_imag = reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k];
+                    reinterpret_cast<__half2 *>(acc_frag_real[j].x)[k] = __hsub2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]));
+                    reinterpret_cast<__half2 *>(acc_frag_imag[j].x)[k] = __hadd2(__hmul2(tmp_real, reinterpret_cast<__half2 *>(tw_frag_imag[j].x)[k]), __hmul2(tmp_imag, reinterpret_cast<__half2 *>(tw_frag_real[j].x)[k]));
+                }
+                wmma::store_matrix_sync(shared_real + threadIdx.y * 128 * 16 + j * 16, acc_frag_real[j], 128, wmma::mem_row_major);
+                wmma::store_matrix_sync(shared_imag + threadIdx.y * 128 * 16 + j * 16, acc_frag_imag[j], 128, wmma::mem_row_major);
+            }
+            __syncthreads();
+    #pragma unroll
+            for (int i = 0; i < n; i++)
+            {
+                for(int j=0; j< 2; j++){
+                    idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                    shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                    out_real[offset + idx] = reinterpret_cast<__half2*>(shared_real)[shared_offset];
+                    out_imag[offset + idx] = reinterpret_cast<__half2*>(shared_imag)[shared_offset];
+                }
+            }
+            __syncthreads();
+    }
+}
+// fp16 butterfly stage whose shared tiles and fragments are sized for 16 rows.
+//
+// Each block stages a 16-row by 32-__half2 tile of x (multiplied element-wise
+// by x_gate when x_gate is non-null), the matching twiddle tiles and the
+// 16x16 d_f matrix into shared memory. Warps with threadIdx.y < 4 then each
+// take a 16-column slice: they multiply the col_major-loaded real and
+// imaginary d_f fragments with the x slice, apply the twiddles as a complex
+// multiply and park the result in shared memory for the final write-out.
+//
+// B is not referenced in the body.
+// NOTE(review): presumably launched with N == 16 and blockDim == (32, 4);
+// the kernel itself does not check either - confirm against the launcher.
+__global__ void butterfly_cuda_kernel_16(
+    const __half2 *__restrict__ x,
+    const __half2 *__restrict__ x_gate,
+    const complex_half_t *__restrict__ d_f,
+    const __half2 *__restrict__ twiddle_factors_real,
+    const __half2 *__restrict__ twiddle_factors_imag,
+    __half2 *__restrict__ out_real,
+    __half2 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    // Column (in __half2 units) handled by this thread inside the twiddle rows.
+    const int strip_col = blockIdx.x * 32 + threadIdx.x;
+    // Start of the slab selected by (blockIdx.y, blockIdx.z) plus this thread's column.
+    const int slab_base = blockIdx.y * H * 16 * 32 * gridDim.x + blockIdx.z * 16 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int rows_per_pass = blockDim.y;
+    const int num_passes = N / rows_per_pass;
+    __shared__ half in_tile[16 * 64];
+    __shared__ half dft_re[16 * 16];
+    __shared__ half dft_im[16 * 16];
+    __shared__ half tw_re_tile[16 * 64];
+    __shared__ half tw_im_tile[16 * 64];
+    __shared__ half res_re_tile[16 * 64];
+    __shared__ half res_im_tile[16 * 64];
+    // Stage inputs: every pass covers blockDim.y rows of the tile.
+    for (int pass = 0; pass < num_passes; ++pass)
+    {
+        const int row = threadIdx.y + pass * rows_per_pass;
+        const int global_row = row * 32 * gridDim.x;
+        const int tile_slot = row * 32 + threadIdx.x;
+        __half2 value = x[global_row + slab_base];
+        if (x_gate != NULL)
+        {
+            value = __hmul2(value, x_gate[global_row + slab_base]);
+        }
+        reinterpret_cast<__half2 *>(in_tile)[tile_slot] = value;
+        reinterpret_cast<__half2 *>(tw_re_tile)[tile_slot] = twiddle_factors_real[strip_col + global_row];
+        reinterpret_cast<__half2 *>(tw_im_tile)[tile_slot] = twiddle_factors_imag[strip_col + global_row];
+        // Only the first 16 lanes copy d_f: its rows are 16 scalars wide.
+        if (threadIdx.x < 16)
+        {
+            const int dft_slot = row * 16 + threadIdx.x;
+            dft_re[dft_slot] = d_f[dft_slot].real();
+            dft_im[dft_slot] = d_f[dft_slot].imag();
+        }
+    }
+    __syncthreads();
+    if (threadIdx.y < 4)
+    {
+        // Each participating warp owns a 16-column slice of the 64-wide tiles.
+        const int slice = threadIdx.y * 16;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> dft_re_frag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> dft_im_frag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_re_frag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> tw_im_frag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> in_frag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_re_frag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_im_frag;
+        wmma::load_matrix_sync(dft_re_frag, dft_re, N);
+        wmma::load_matrix_sync(dft_im_frag, dft_im, N);
+        wmma::load_matrix_sync(in_frag, in_tile + slice, 64);
+        wmma::load_matrix_sync(tw_re_frag, tw_re_tile + slice, 64);
+        wmma::load_matrix_sync(tw_im_frag, tw_im_tile + slice, 64);
+        wmma::fill_fragment(acc_re_frag, __float2half(0.0f));
+        wmma::mma_sync(acc_re_frag, dft_re_frag, in_frag, acc_re_frag);
+        wmma::fill_fragment(acc_im_frag, __float2half(0.0f));
+        wmma::mma_sync(acc_im_frag, dft_im_frag, in_frag, acc_im_frag);
+        // Complex multiply by the twiddles, two halves at a time, directly on
+        // the per-thread fragment storage.
+        __half2 *acc_re = reinterpret_cast<__half2 *>(acc_re_frag.x);
+        __half2 *acc_im = reinterpret_cast<__half2 *>(acc_im_frag.x);
+        const __half2 *tw_re = reinterpret_cast<__half2 *>(tw_re_frag.x);
+        const __half2 *tw_im = reinterpret_cast<__half2 *>(tw_im_frag.x);
+        const int num_pairs = acc_re_frag.num_elements / 2;
+        for (int k = 0; k < num_pairs; ++k)
+        {
+            const __half2 re = acc_re[k];
+            const __half2 im = acc_im[k];
+            acc_re[k] = __hsub2(__hmul2(re, tw_re[k]), __hmul2(im, tw_im[k]));
+            acc_im[k] = __hadd2(__hmul2(re, tw_im[k]), __hmul2(im, tw_re[k]));
+        }
+        wmma::store_matrix_sync(res_re_tile + slice, acc_re_frag, 64, wmma::mem_row_major);
+        wmma::store_matrix_sync(res_im_tile + slice, acc_im_frag, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+    // Write the staged results back with the same row/column mapping used on input.
+#pragma unroll
+    for (int pass = 0; pass < num_passes; ++pass)
+    {
+        const int row = threadIdx.y + pass * rows_per_pass;
+        const int dst = slab_base + row * 32 * gridDim.x;
+        const int tile_slot = row * 32 + threadIdx.x;
+        out_real[dst] = reinterpret_cast<__half2 *>(res_re_tile)[tile_slot];
+        out_imag[dst] = reinterpret_cast<__half2 *>(res_im_tile)[tile_slot];
+    }
+}
+// Host entry point for the fp16 butterfly kernels.
+//
+// Reads x as a 4-D tensor and uses its sizes as (B, H, N, M). Allocates
+// out_real / out_imag with shape {B, H, N, M} and x's options, launches the
+// kernel that matches N (16, 32, 64 or 128) and returns {out_real, out_imag}.
+// When x_gate holds a tensor its data pointer is forwarded to the kernel;
+// otherwise the kernel receives nullptr for the gate.
+//
+// Raises (via TORCH_CHECK) when:
+//   * x is not 4-D,
+//   * N is not one of the implemented sizes,
+//   * M is not a multiple of the column width one block covers
+//     (128 scalars for N == 128, 64 otherwise),
+//   * H is not a multiple of 16 for N == 64 / 128 (the grid uses H / 16
+//     blocks along z for those sizes),
+//   * raising the dynamic shared-memory limit or the launch itself fails.
+//
+// NOTE(review): only raw data pointers are handed to the kernels, so the
+// tensors are assumed to be contiguous CUDA tensors of the matching dtype -
+// confirm at the call sites.
+std::vector<torch::Tensor> butterfly_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> x_gate = std::nullopt)
+{
+    TORCH_CHECK(x.dim() == 4, "butterfly_cuda: expected a 4-D input tensor, got ", x.dim(), " dimensions");
+    uint B = x.size(0);
+    uint H = x.size(1);
+    uint N = x.size(2);
+    uint M = x.size(3);
+    // Fail loudly instead of printing and returning uninitialized outputs.
+    TORCH_CHECK(N == 16 || N == 32 || N == 64 || N == 128,
+                "butterfly_cuda: N = ", N, " is not implemented (supported: 16, 32, 64, 128)");
+    // One block covers 64 scalar columns (32 __half2), or 128 for N == 128.
+    // The previous hard-coded table (M = 4096 / 8192 / 16384, default 32768)
+    // is exactly M / cols_per_block; deriving it avoids launching an
+    // oversized grid for any other M.
+    const uint cols_per_block = (N == 128) ? 128 : 64;
+    TORCH_CHECK(M % cols_per_block == 0,
+                "butterfly_cuda: M = ", M, " must be a multiple of ", cols_per_block, " for N = ", N);
+    // The N == 64 / 128 launches use H / 16 blocks along z; a remainder
+    // would leave trailing heads unwritten.
+    const bool sixteen_heads_per_block = (N == 64 || N == 128);
+    TORCH_CHECK(!sixteen_heads_per_block || H % 16 == 0,
+                "butterfly_cuda: H = ", H, " must be a multiple of 16 for N = ", N);
+    torch::Tensor out_real = torch::empty({B, H, N, M}, x.options());
+    torch::Tensor out_imag = torch::empty({B, H, N, M}, x.options());
+    // Nothing to compute; a zero-sized grid would be an invalid launch.
+    if (out_real.numel() == 0)
+    {
+        return {out_real, out_imag};
+    }
+    dim3 grid;
+    dim3 block;
+    grid.x = M / cols_per_block;
+    grid.y = B;
+    grid.z = sixteen_heads_per_block ? H / 16 : H;
+    block.x = 32;
+    block.y = (N == 128) ? 8 : 4;
+    const __half2 *x_ptr = static_cast<__half2 *>(x.data_ptr());
+    const __half2 *gate_ptr = x_gate ? static_cast<__half2 *>(x_gate.value().data_ptr()) : nullptr;
+    const complex_half_t *d_f_ptr = static_cast<complex_half_t *>(d_f.data_ptr());
+    const __half2 *tw_real_ptr = static_cast<__half2 *>(twiddle_factors_real.data_ptr());
+    const __half2 *tw_imag_ptr = static_cast<__half2 *>(twiddle_factors_imag.data_ptr());
+    __half2 *out_real_ptr = static_cast<__half2 *>(out_real.data_ptr());
+    __half2 *out_imag_ptr = static_cast<__half2 *>(out_imag.data_ptr());
+    cudaError_t err = cudaSuccess;
+    switch (N)
+    {
+    case 16:
+        butterfly_cuda_kernel_16<<<grid, block>>>(
+            x_ptr, gate_ptr, d_f_ptr, tw_real_ptr, tw_imag_ptr, out_real_ptr, out_imag_ptr, B, H, N);
+        break;
+    case 32:
+        butterfly_cuda_kernel_32<<<grid, block>>>(
+            x_ptr, gate_ptr, d_f_ptr, tw_real_ptr, tw_imag_ptr, out_real_ptr, out_imag_ptr, B, H, N);
+        break;
+    case 64:
+        // More than 48 KB of dynamic shared memory needs an explicit opt-in.
+        err = cudaFuncSetAttribute(&butterfly_cuda_kernel_64, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+        TORCH_CHECK(err == cudaSuccess,
+                    "butterfly_cuda: cudaFuncSetAttribute failed for N = 64: ", cudaGetErrorString(err));
+        butterfly_cuda_kernel_64<<<grid, block, 57344>>>(
+            x_ptr, gate_ptr, d_f_ptr, tw_real_ptr, tw_imag_ptr, out_real_ptr, out_imag_ptr, B, H, N);
+        break;
+    case 128:
+        err = cudaFuncSetAttribute(&butterfly_cuda_kernel_128, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536);
+        TORCH_CHECK(err == cudaSuccess,
+                    "butterfly_cuda: cudaFuncSetAttribute failed for N = 128: ", cudaGetErrorString(err));
+        butterfly_cuda_kernel_128<<<grid, block, 65536>>>(
+            x_ptr, gate_ptr, d_f_ptr, tw_real_ptr, tw_imag_ptr, out_real_ptr, out_imag_ptr, B, H, N);
+        break;
+    default:
+        // Unreachable: N was validated above.
+        break;
+    }
+    // Launch-configuration errors are only reported through cudaGetLastError().
+    err = cudaGetLastError();
+    TORCH_CHECK(err == cudaSuccess, "butterfly_cuda: kernel launch failed: ", cudaGetErrorString(err));
+    return {out_real, out_imag};
 }

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_cuda_bf16.cu CHANGED Viewed

@@ -1,725 +1,725 @@
-// Copyright (c) 2023 Dan Fu, Hermann Kumbong
-#include <torch/extension.h>
-#include <vector>
-#include <stdio.h>
-#include <mma.h>
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
-#include "shared.h"
-using namespace nvcuda;
-// bf16 butterfly stage with float accumulation; the fragment arrays hold 4
-// tiles of 16 and the slab stride uses a literal 64.
-//
-// A block first stages the twiddle tiles and the d_f real/imag matrices into
-// shared memory and loads them into WMMA fragments once. It then loops over
-// 16 consecutive slabs (index t): each slab's x tile is staged (multiplied
-// element-wise by x_gate when x_gate is non-null), multiplied by the
-// col_major-loaded d_f fragments, combined with the twiddles as a complex
-// multiply in float, and written out as bf16 pairs.
-//
-// Dynamic shared memory layout (in order): five N*N bf16 buffers (x, d_f
-// real, d_f imag, twiddles real, twiddles imag) followed by two N*N float
-// buffers (real and imaginary results).
-//
-// B is not referenced in the body.
-// NOTE(review): presumably launched with N == 64 and blockDim == (32, 4);
-// the kernel does not check either - confirm against the host launcher.
-__global__ void butterfly_cuda_kernel_64(
-    const __nv_bfloat162 *__restrict__ x,
-    const __nv_bfloat162 *__restrict__ x_gate,
-    const __nv_bfloat162 *__restrict__ d_f_real,
-    const __nv_bfloat162 *__restrict__ d_f_imag,
-    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
-    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
-    __nv_bfloat162 *__restrict__ out_real,
-    __nv_bfloat162 *__restrict__ out_imag,
-    uint B,
-    uint H,
-    int N)
-{
-    // Base index (in bf16-pair units) of this thread's column in the first slab
-    // handled by the block; blockIdx.z advances by 16 slabs at a time.
-    const int offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
-    // Column of this thread inside a twiddle row.
-    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
-    int idx;
-    int shared_offset;
-    const int B_Y = blockDim.y;
-    // Number of passes needed for blockDim.y rows per pass to cover N rows.
-    const int n = N / B_Y;
-    extern __shared__ __nv_bfloat16 x_shared[];
-    __nv_bfloat16 *d_f_real_shared = &x_shared[N * N];
-    __nv_bfloat16 *d_f_imag_shared = &d_f_real_shared[N * N];
-    __nv_bfloat16 *twiddles_real_shared = &d_f_imag_shared[N * N];
-    __nv_bfloat16 *twiddles_imag_shared = &twiddles_real_shared[N * N];
-    float *out_real_shared = reinterpret_cast<float*>(&twiddles_imag_shared[N * N]);
-    float *out_imag_shared = &out_real_shared[N * N];
-    // Stage the slab-independent operands (twiddles and d_f) once per block.
-    // #pragma unroll
-    for (int i = 0; i < n; i++)
-    {
-        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
-        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
-        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
-        // #pragma unroll
-        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
-        reinterpret_cast<__nv_bfloat162 *>(d_f_real_shared)[shared_offset] = d_f_real[shared_offset];
-        reinterpret_cast<__nv_bfloat162 *>(d_f_imag_shared)[shared_offset] = d_f_imag[shared_offset];
-    }
-    float2 tmp_real, tmp_imag;
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[4];
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[4];
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[4];
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[4];
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[4][4];
-    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[4];
-    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[4];
-    // Staged twiddles / d_f must be visible to every warp before fragment loads.
-    __syncthreads();
-    // threadIdx.y selects which 16-wide band of d_f and which 16-row band of
-    // the twiddles this warp keeps in fragments for the whole slab loop.
-    for (int i = 0; i < 4; i++)
-    {
-        wmma::load_matrix_sync(a_frag_real[i], d_f_real_shared + i * N * 16 + threadIdx.y * 16, N);
-        wmma::load_matrix_sync(a_frag_imag[i], d_f_imag_shared + i * N * 16 + threadIdx.y * 16, N);
-        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + threadIdx.y * N * 16 + i * 16, N);
-        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + threadIdx.y * N * 16 + i * 16, N);
-    }
-    // One iteration per slab; consecutive slabs are 64 * 32 * gridDim.x
-    // bf16 pairs apart.
-    for (int t = 0; t < 16; t++)
-    {
-        // Stage this slab's x tile, applying the gate when one was supplied.
-        for (int i = 0; i < n; i++)
-        {
-            idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
-            shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
-            if(x_gate != nullptr){
-                reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
-            }else{
-                reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = x[idx + offset];
-            }
-        }
-        __syncthreads();
-        // Every warp loads the full 4x4 grid of x tiles.
-        for (int i = 0; i < 4; i++)
-        {
-            for (int j = 0; j < 4; j++)
-            {
-                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * N * 16 + j * 16, N);
-            }
-        }
-        // Real part of the product: accumulate over the four k tiles.
-#pragma unroll
-        for (int j = 0; j < 4; j++)
-        {
-            wmma::fill_fragment(acc_frag_real[j], 0.0f);
-            for (int k = 0; k < 4; k++)
-            {
-                wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
-            }
-        }
-        // Imaginary part of the product, same tiling.
-#pragma unroll
-        for (int j = 0; j < 4; j++)
-        {
-            wmma::fill_fragment(acc_frag_imag[j], 0.0f);
-            for (int k = 0; k < 4; k++)
-            {
-                wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
-            }
-        }
-        // Complex multiply by the twiddles, two floats at a time.
-        // NOTE(review): the accumulator and the row_major matrix_a twiddle
-        // fragments are indexed with the same k, i.e. this relies on both
-        // fragment types mapping a matrix element to the same per-thread
-        // slot - confirm for the target architecture.
-#pragma unroll
-        for (int j = 0; j < 4; j++)
-        {
-            for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
-            {
-                tmp_real = reinterpret_cast<float2 *>(acc_frag_real[j].x)[k];
-                tmp_imag = reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k];
-                reinterpret_cast<float2 *>(acc_frag_real[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]);
-                reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]);
-            }
-            wmma::store_matrix_sync(out_real_shared + threadIdx.y * N * 16 + j * 16, acc_frag_real[j], N, wmma::mem_row_major);
-            wmma::store_matrix_sync(out_imag_shared + threadIdx.y * N * 16 + j * 16, acc_frag_imag[j], N, wmma::mem_row_major);
-        }
-        // Results are read back below by different threads than wrote them.
-        __syncthreads();
-        // Convert the float results to bf16 pairs and write them out.
-#pragma unroll
-        for (int i = 0; i < n; i++)
-        {
-            idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
-            out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
-            out_imag[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
-        }
-        // Keep the shared buffers stable until every thread finished this slab.
-        __syncthreads();
-    }
-}
-// bf16 butterfly stage with float accumulation; the shared tiles are sized
-// for 32 rows and the fragments form 2x2 grids of 16x16 tiles.
-//
-// Each block stages a 32-row by 32-pair tile of x (multiplied element-wise by
-// x_gate when x_gate is non-null), the matching twiddle tiles and the d_f
-// real/imag matrices into statically sized shared memory. Warps with
-// threadIdx.y < N / 16 then each process a 32-column half of the tile
-// (offset t = threadIdx.y * 32): they multiply the col_major-loaded d_f
-// fragments with the x fragments, apply the twiddles as a complex multiply in
-// float, and store the float results to shared memory. All threads finally
-// convert the results to bf16 pairs and write them out.
-//
-// B is not referenced in the body.
-// NOTE(review): presumably launched with N == 32 and blockDim.x == 32; the
-// kernel does not check either - confirm against the host launcher.
-__global__ void butterfly_cuda_kernel_32(
-    const __nv_bfloat162 *__restrict__ x,
-    const __nv_bfloat162 *__restrict__ x_gate,
-    const __nv_bfloat16 *__restrict__ d_f_real,
-    const __nv_bfloat16 *__restrict__ d_f_imag,
-    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
-    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
-    __nv_bfloat162 *__restrict__ out_real,
-    __nv_bfloat162 *__restrict__ out_imag,
-    uint B,
-    uint H,
-    int N)
-{
-    // Base index (in bf16-pair units) of this thread's column in the slab
-    // selected by (blockIdx.y, blockIdx.z).
-    const int offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
-    // Column of this thread inside a twiddle row.
-    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
-    int idx;
-    int shared_offset;
-    const int B_Y = blockDim.y;
-    // Number of passes needed for blockDim.y rows per pass to cover N rows.
-    const int n = N / B_Y;
-    __shared__ __nv_bfloat16 x_shared[32 * 64];
-    __shared__ __nv_bfloat16 d_f_real_shared[32 * 32];
-    __shared__ __nv_bfloat16 d_f_imag_shared[32 * 32];
-    __shared__ __nv_bfloat16 twiddles_real_shared[32 * 64];
-    __shared__ __nv_bfloat16 twiddles_imag_shared[32 * 64];
-    __shared__ float out_real_shared[32 * 64];
-    __shared__ float out_imag_shared[32 * 64];
-    // Stage x (optionally gated), the twiddles and d_f into shared memory.
-    // #pragma unroll
-    for (int i = 0; i < n; i++)
-    {
-        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
-        if(x_gate != nullptr){
-            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
-        }else{
-            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = x[idx + offset];
-        }
-        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
-        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
-        // d_f is copied as scalars with the same row * 32 + lane index.
-        // #pragma unroll
-        d_f_real_shared[shared_offset] = d_f_real[shared_offset];
-        d_f_imag_shared[shared_offset] = d_f_imag[shared_offset];
-    }
-    __syncthreads();
-    // Only the first N / 16 warps run the tensor-core section.
-    if (threadIdx.y < N / 16)
-    {
-        float2 tmp_real, tmp_imag;
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[2][2];
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[2][2];
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[2][2];
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[2][2];
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[2][2];
-        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[2][2];
-        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[2][2];
-        // Column offset (in scalars) of this warp's half of the 64-wide tiles.
-        int t = threadIdx.y * 32;
-        // Note the swapped tile indices for d_f: fragment [i][j] is read from
-        // tile row j, tile column i, with a col_major layout.
-        for (int i = 0; i < 2; i++)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real_shared + j * N * 16 + i * 16, N);
-                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag_shared + j * N * 16 + i * 16, N);
-                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
-                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
-                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
-            }
-        }
-        // Real part of the product: accumulate over the two k tiles.
-#pragma unroll
-        for (int i = 0; i < 2; i++)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                wmma::fill_fragment(acc_frag_real[i][j], 0.0f);
-                for (int k = 0; k < 2; k++)
-                {
-                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag[k][j], acc_frag_real[i][j]);
-                }
-            }
-        }
-        // Imaginary part of the product, same tiling.
-#pragma unroll
-        for (int i = 0; i < 2; i++)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                wmma::fill_fragment(acc_frag_imag[i][j], 0.0f);
-                for (int k = 0; k < 2; k++)
-                {
-                    wmma::mma_sync(acc_frag_imag[i][j], a_frag_imag[i][k], b_frag[k][j], acc_frag_imag[i][j]);
-                }
-            }
-        }
-        // Complex multiply by the twiddles, two floats at a time, then park
-        // the float results in shared memory.
-        // NOTE(review): the accumulator and the row_major matrix_a twiddle
-        // fragments are indexed with the same k, i.e. this relies on both
-        // fragment types mapping a matrix element to the same per-thread
-        // slot - confirm for the target architecture.
-#pragma unroll
-        for (int i = 0; i < 2; i++)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                 for (int k = 0; k < acc_frag_real[i][j].num_elements / 2; k++)
-                {
-                    tmp_real = 	reinterpret_cast<float2 *>(acc_frag_real[i][j].x)[k];
-                    tmp_imag = 	reinterpret_cast<float2 *>(acc_frag_imag[i][j].x)[k];
-                    reinterpret_cast<float2 *>(acc_frag_real[i][j].x)[k] = 	tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[i][j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[i][j].x)[k]);
-                    reinterpret_cast<float2 *>(acc_frag_imag[i][j].x)[k] =  tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[i][j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[i][j].x)[k]);
-                }
-                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
-                wmma::store_matrix_sync(out_imag_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_imag[i][j], 2 * N, wmma::mem_row_major);
-            }
-        }
-    }
-    // The barrier sits outside the branch so every thread of the block reaches it.
-    __syncthreads();
-    // Convert the float results to bf16 pairs and write them out.
-#pragma unroll
-    for (int i = 0; i < n; i++)
-    {
-        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
-        out_imag[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
-    }
-}
-// bf16 butterfly stage with float accumulation; shared-memory strides use a
-// literal 128 and the fragment arrays hold 8 tiles of 16.
-//
-// The dynamic shared buffer is reused for several purposes in sequence:
-//   1. d_f real / imag (two 128*128 bf16 halves) -> a_frag_real / a_frag_imag
-//   2. twiddles real / imag (same two halves)     -> tw_frag_real / tw_frag_imag
-//   3. per slab t (16 slabs per block):
-//        a. x tile (bf16, first half), gated by x_gate when non-null -> b_frag
-//        b. float real results (reinterpreted over both halves) -> out_real
-//        c. float imag results (same storage)                   -> out_imag
-// Because the same bytes are rewritten at every step, each hand-over between
-// a writer phase and a reader phase is separated by __syncthreads().
-//
-// Requires 128 * 128 * sizeof(float) bytes of dynamic shared memory: the
-// float results are stored with a row stride of 128 starting at shared_real.
-//
-// B is not referenced in the body.
-// NOTE(review): presumably launched with N == 128 and blockDim == (32, 8);
-// the kernel does not check either - confirm against the host launcher.
-__global__ void butterfly_cuda_kernel_128(
-    const __nv_bfloat162 *__restrict__ x,
-    const __nv_bfloat162 *__restrict__ x_gate,
-    const __nv_bfloat162 *__restrict__ d_f_real,
-    const __nv_bfloat162 *__restrict__ d_f_imag,
-    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
-    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
-    __nv_bfloat162 *__restrict__ out_real,
-    __nv_bfloat162 *__restrict__ out_imag,
-    uint B,
-    uint H,
-    int N)
-{
-    // Base index (in bf16-pair units) of this thread's first column in the
-    // first slab handled by the block; each block covers 64 pairs per row.
-    const int offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * 16 * 128 * 32 * 2 * gridDim.x + blockIdx.x * 64 + threadIdx.x;
-    // Column of this thread inside a twiddle row.
-    const int tw_offset = blockIdx.x * 64 + threadIdx.x;
-    int idx;
-    int shared_offset;
-    const int B_Y = blockDim.y;
-    // Number of passes needed for blockDim.y rows per pass to cover N rows.
-    const int n = N / B_Y;
-    extern __shared__ __nv_bfloat16 shared_real[];
-    __nv_bfloat16 *shared_imag = &shared_real[128 * 128];
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[8];
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[8];
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[8];
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[8];
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[8][8];
-    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[8];
-    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[8];
-    // Phase 1: stage d_f; each thread copies two bf16 pairs per row (j = 0, 1).
-    for (int i = 0; i < n; i++)
-    {
-        for(int j=0; j< 2; j++){
-            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
-            reinterpret_cast<__nv_bfloat162 *>(shared_real)[shared_offset] = d_f_real[shared_offset];
-            reinterpret_cast<__nv_bfloat162 *>(shared_imag)[shared_offset] = d_f_imag[shared_offset];
-        }
-    }
-    __syncthreads();
-    for (int i = 0; i < 8; i++){
-        wmma::load_matrix_sync(a_frag_real[i], shared_real + i * 128 * 16 + threadIdx.y * 16, 128);
-        wmma::load_matrix_sync(a_frag_imag[i], shared_imag + i * 128 * 16 + threadIdx.y * 16, 128);
-    }
-    // d_f fragments are in registers; the buffer may now be overwritten.
-    __syncthreads();
-    // Phase 2: stage the twiddles into the same storage.
-    for (int i = 0; i < n; i++)
-    {
-        for(int j=0; j< 2; j++){
-            idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x;
-            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
-            reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = twiddle_factors_real[tw_offset + idx];
-            reinterpret_cast<__nv_bfloat162*>(shared_imag)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
-        }
-    }
-    __syncthreads();
-    for (int i = 0; i < 8; i++){
-        wmma::load_matrix_sync(tw_frag_real[i], shared_real + threadIdx.y * 128 * 16 + i * 16, 128);
-        wmma::load_matrix_sync(tw_frag_imag[i], shared_imag + threadIdx.y * 128 * 16 + i * 16, 128);
-    }
-    __syncthreads();
-    // Phase 3: one iteration per slab; consecutive slabs are
-    // 128 * 32 * 2 * gridDim.x bf16 pairs apart.
-    for(int t=0; t< 16; t++){
-        // 3a. Stage this slab's x tile, applying the gate when one was supplied.
-        for (int i = 0; i < n; i++)
-        {
-            for(int j=0; j< 2; j++){
-                idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
-                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
-                if(x_gate != nullptr){
-                    reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
-                }else{
-                    reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = x[offset + idx];
-                }
-            }
-        }
-        __syncthreads();
-        for (int i = 0; i < 8; i++)
-        {
-            for (int j = 0; j < 8; j++)
-            {
-                wmma::load_matrix_sync(b_frag[i][j], shared_real + i * 128 * 16 + j * 16, 128);
-            }
-        }
-        // The x tile is in registers; the buffer is free for the float results.
-        __syncthreads();
-        #pragma unroll
-            for (int j = 0; j < 8; j++)
-            {
-                wmma::fill_fragment(acc_frag_real[j], 0.0f);
-                for (int k = 0; k < 8; k++)
-                {
-                    wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
-                }
-            }
-    #pragma unroll
-            for (int j = 0; j < 8; j++)
-            {
-                wmma::fill_fragment(acc_frag_imag[j], 0.0f);
-                for (int k = 0; k < 8; k++)
-                {
-                    wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
-                }
-            }
-            float2 tmp_real, tmp_imag;
-            // Complex multiply by the twiddles, two floats at a time.
-            // NOTE(review): the accumulator and the row_major matrix_a
-            // twiddle fragments are indexed with the same k, i.e. this
-            // relies on both fragment types mapping a matrix element to the
-            // same per-thread slot - confirm for the target architecture.
-    #pragma unroll
-            for (int j = 0; j < 8; j++)
-            {
-                for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
-                {
-                    tmp_real = reinterpret_cast<float2 *>(acc_frag_real[j].x)[k];
-                    tmp_imag = reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k];
-                    reinterpret_cast<float2 *>(acc_frag_real[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]);
-                    reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]);
-                }
-            }
-            // 3b. Real results: store as float over the whole buffer, then
-            // convert to bf16 pairs on the way out.
-            for (int j = 0; j < 8; j++)
-            {
-                wmma::store_matrix_sync(reinterpret_cast<float*>(shared_real) + threadIdx.y * 128 * 16 + j * 16, acc_frag_real[j], 128, wmma::mem_row_major);
-            }
-            __syncthreads();
-    #pragma unroll
-            for (int i = 0; i < n; i++)
-            {
-                for(int j=0; j< 2; j++){
-                    idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
-                    shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
-                    out_real[offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(shared_real)[shared_offset]);
-                }
-            }
-            __syncthreads();
-            // 3c. Imaginary results reuse the same float staging area.
-            for (int j = 0; j < 8; j++)
-            {
-                wmma::store_matrix_sync(reinterpret_cast<float*>(shared_real) + threadIdx.y * 128 * 16 + j * 16, acc_frag_imag[j], 128, wmma::mem_row_major);
-            }
-            __syncthreads();
-    #pragma unroll
-            for (int i = 0; i < n; i++)
-            {
-                for(int j=0; j< 2; j++){
-                    idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
-                    shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
-                    out_imag[offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(shared_real)[shared_offset]);
-                }
-            }
-            // Every warp must finish reading the float staging area before
-            // the next iteration overwrites the same bytes with the next x
-            // tile; without this barrier a fast warp can clobber rows that a
-            // slower warp is still converting.
-            __syncthreads();
-    }
-}
-__global__ void butterfly_cuda_kernel_16(
-    const __nv_bfloat162 *__restrict__ x,                     // input, read as packed bf16 pairs
-    const __nv_bfloat162 *__restrict__ x_gate,                // optional elementwise multiplier for x; nullptr disables it
-    const __nv_bfloat16 *__restrict__ d_f_real,               // real part of the left-hand matrix (copied into a 16*16 shared array)
-    const __nv_bfloat16 *__restrict__ d_f_imag,               // imaginary part of the left-hand matrix
-    const __nv_bfloat162 *__restrict__ twiddle_factors_real,  // real part of the elementwise post-multiplier
-    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,  // imaginary part of the elementwise post-multiplier
-    __nv_bfloat162 *__restrict__ out_real,                    // real part of the result, indexed like x
-    __nv_bfloat162 *__restrict__ out_imag,                    // imaginary part of the result, indexed like x
-    uint B,                                                   // not referenced in the body
-    uint H,                                                   // only enters the blockIdx.y stride of `offset`
-    int N)                                                    // row count; also the leading dimension used for the d_f fragments
-{   /* Multiplies one tile of x (optionally gated by x_gate) by the complex matrix
-     * (d_f_real, d_f_imag) with WMMA tensor-core ops, multiplies the complex
-     * product elementwise by (twiddle_factors_real, twiddle_factors_imag), and
-     * writes the real and imaginary parts to out_real / out_imag as bf16.
-     *
-     * All global indices are in units of __nv_bfloat162 (two bf16 values). A block
-     * covers 32 pairs per row and consecutive rows are 32 * gridDim.x pairs apart.
-     * The twiddle factors are indexed without the blockIdx.y / blockIdx.z terms,
-     * so every (y, z) block reads the same factors.
-     *
-     * NOTE(review): the shared arrays hold 16 rows and the WMMA section treats each
-     * threadIdx.y as one warp, so this looks written for N == 16 and
-     * blockDim = (32, 4) -- confirm against the launcher.
-     * NOTE(review): `offset` is a 32-bit int; very large inputs could overflow it.
-     */
-    const int offset = blockIdx.y * H * 16 * 32 * gridDim.x + blockIdx.z * 16 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
-    const int tw_offset = blockIdx.x * 32 + threadIdx.x;  // same column offset, without the y/z terms
-    int idx;
-    int shared_offset;
-    const int B_Y = blockDim.y;
-    const int n = N / B_Y;  // rows handled by each threadIdx.y
-    __shared__ __nv_bfloat16 x_shared[16 * 64];
-    __shared__ __nv_bfloat16 d_f_real_shared[16 * 16];
-    __shared__ __nv_bfloat16 d_f_imag_shared[16 * 16];
-    __shared__ __nv_bfloat16 twiddles_real_shared[16 * 64];
-    __shared__ __nv_bfloat16 twiddles_imag_shared[16 * 64];
-    __shared__ float out_real_shared[16 * 64];
-    __shared__ float out_imag_shared[16 * 64];
-    // #pragma unroll
-    for (int i = 0; i < n; i++)  // stage x, the twiddles and d_f in shared memory, one row per step
-    {
-        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
-        if(x_gate != nullptr){
-            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
-        }else{
-            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = x[idx + offset];
-        }
-        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
-        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
-        // #pragma unroll
-        if(threadIdx.x  < 16 ){  // d_f rows are 16 scalars wide, so only 16 lanes copy
-            shared_offset = (threadIdx.y + i * B_Y) * 16 + threadIdx.x;
-            d_f_real_shared[shared_offset] = d_f_real[shared_offset];
-            d_f_imag_shared[shared_offset] = d_f_imag[shared_offset];
-        }
-    }
-    __syncthreads();  // staged tiles must be complete before any fragment load
-    if (threadIdx.y < 4)  // each participating threadIdx.y handles the 16-column slice at threadIdx.y * 16
-    {
-        float2 tmp_real, tmp_imag;
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real;
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real;
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag;
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag;
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag;
-        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real;
-        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag;
-        wmma::load_matrix_sync(a_frag_real, d_f_real_shared, N);
-        wmma::load_matrix_sync(a_frag_imag, d_f_imag_shared, N);
-        wmma::load_matrix_sync(b_frag, x_shared + threadIdx.y * 16, 64);
-        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);  // twiddles are loaded as fragments only to be read per element below
-        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
-        wmma::fill_fragment(acc_frag_real, 0.0f);
-        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag, acc_frag_real);  // d_f_real * x
-        wmma::fill_fragment(acc_frag_imag, 0.0f);
-         wmma::mma_sync(acc_frag_imag, a_frag_imag, b_frag, acc_frag_imag);  // d_f_imag * x
-#pragma unroll
-        for (int k = 0; k < acc_frag_real.num_elements / 2; k++)  // complex multiply by the twiddles, two values at a time
-        {   /* NOTE(review): indexing the matrix_a twiddle fragments and the accumulator
-             * fragments with the same k assumes both use the same per-thread element
-             * layout, which WMMA leaves unspecified -- confirm on the target GPU.
-             * The float2 operators used here are not CUDA built-ins; presumably they
-             * come from "shared.h" -- confirm. */
-            tmp_real = 	reinterpret_cast<float2 *>(acc_frag_real.x)[k];
-            tmp_imag = 	reinterpret_cast<float2 *>(acc_frag_imag.x)[k];
-            reinterpret_cast<float2 *>(acc_frag_real.x)[k] = 	tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real.x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag.x)[k]);
-            reinterpret_cast<float2 *>(acc_frag_imag.x)[k] =  tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag.x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real.x)[k]);
-        }
-        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
-        wmma::store_matrix_sync(out_imag_shared + threadIdx.y * 16, acc_frag_imag, 64, wmma::mem_row_major);
-    }
-    __syncthreads();  // results must be in shared memory before the write-out below
-#pragma unroll
-    for (int i = 0; i < n; i++)
-    {
-        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);  // float pair -> bf16 pair
-        out_imag[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
-    }
-}
-// Host entry point: launches the bf16 butterfly kernel that matches N = x.size(2).
-//
-// x is read as (B, H, N, M). The kernels treat every tensor as a flat buffer of
-// packed bf16 pairs, so x (and x_gate, when given) are presumably expected to be
-// contiguous bf16 CUDA tensors -- contiguity is not checked here.
-//
-// Returns {out_real, out_imag}, both of shape (B, H, N, M) with x's options.
-//
-// Fails with a TORCH_CHECK error when
-//   * x is not a 4-D bf16 CUDA tensor,
-//   * N is not one of 16, 32, 64, 128,
-//   * M is not a multiple of the columns one block covers (64, or 128 for N == 128),
-//   * H is not a multiple of 16 for N == 64 / 128 (those kernels loop over 16 heads),
-//   * the CUDA runtime reports an error while configuring or launching the kernel.
-//
-// NOTE(review): kernels are launched on the default stream, as before; confirm
-// whether they should use PyTorch's current stream instead.
-std::vector<torch::Tensor> butterfly_bf16_cuda(
-    torch::Tensor x,
-    torch::Tensor d_f_real,
-    torch::Tensor d_f_imag,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    std::optional<at::Tensor> x_gate = std::nullopt
-    )
-{
-    TORCH_CHECK(x.dim() == 4, "butterfly_bf16_cuda: expected x with 4 dims (B, H, N, M), got ", x.dim());
-    TORCH_CHECK(x.is_cuda(), "butterfly_bf16_cuda: x must be a CUDA tensor");
-    TORCH_CHECK(x.scalar_type() == at::kBFloat16, "butterfly_bf16_cuda: x must be bfloat16");
-
-    const uint B = x.size(0);
-    const uint H = x.size(1);
-    const uint N = x.size(2);
-    const uint M = x.size(3);
-
-    torch::Tensor out_real = torch::empty({B, H, N, M}, x.options());
-    torch::Tensor out_imag = torch::empty({B, H, N, M}, x.options());
-    // A zero-sized grid is an invalid launch configuration; nothing to compute anyway.
-    if (x.numel() == 0)
-    {
-        return {out_real, out_imag};
-    }
-
-    TORCH_CHECK(N == 16 || N == 32 || N == 64 || N == 128,
-                "butterfly_bf16_cuda: N = ", N, " is not implemented (supported: 16, 32, 64, 128)");
-
-    const bool is_128 = (N == 128);
-    // The 64 and 128 kernels process 16 consecutive heads per block.
-    const bool loops_over_heads = (N == 64 || N == 128);
-    // One block covers 32 bf16 pairs per row (64 pairs in the 128 kernel), and the
-    // kernels use gridDim.x as the row stride, so gridDim.x has to be exactly
-    // M / columns-per-block. This reproduces the former table (4096, 8192, 16384,
-    // and 32768 through its default) and stays in bounds for every other M.
-    const uint cols_per_block = is_128 ? 128 : 64;
-    TORCH_CHECK(M % cols_per_block == 0,
-                "butterfly_bf16_cuda: M = ", M, " must be a multiple of ", cols_per_block, " for N = ", N);
-    TORCH_CHECK(!loops_over_heads || H % 16 == 0,
-                "butterfly_bf16_cuda: H = ", H, " must be a multiple of 16 for N = ", N);
-
-    dim3 grid;
-    dim3 block;
-    block.x = 32;
-    block.y = is_128 ? 8 : 4;
-    grid.x = M / cols_per_block;
-    grid.y = B;
-    grid.z = loops_over_heads ? H / 16 : H;
-
-    // Dynamic shared memory requested for the two large kernels (bytes).
-    constexpr int kSmem64 = 78000;
-    constexpr int kSmem128 = 65536;
-
-    __nv_bfloat162 *x_ptr = static_cast<__nv_bfloat162 *>(x.data_ptr());
-    __nv_bfloat162 *gate_ptr = x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr;
-    __nv_bfloat162 *tw_real_ptr = static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr());
-    __nv_bfloat162 *tw_imag_ptr = static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr());
-    __nv_bfloat162 *out_real_ptr = static_cast<__nv_bfloat162 *>(out_real.data_ptr());
-    __nv_bfloat162 *out_imag_ptr = static_cast<__nv_bfloat162 *>(out_imag.data_ptr());
-
-    switch (N)
-    {
-    case 16:
-        // The 16 and 32 kernels take d_f as scalar bf16.
-        butterfly_cuda_kernel_16<<<grid, block>>>(
-            x_ptr,
-            gate_ptr,
-            static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
-            static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
-            tw_real_ptr,
-            tw_imag_ptr,
-            out_real_ptr,
-            out_imag_ptr,
-            B,
-            H,
-            N);
-        break;
-    case 32:
-        butterfly_cuda_kernel_32<<<grid, block>>>(
-            x_ptr,
-            gate_ptr,
-            static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
-            static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
-            tw_real_ptr,
-            tw_imag_ptr,
-            out_real_ptr,
-            out_imag_ptr,
-            B,
-            H,
-            N);
-        break;
-    case 64:
-    {
-        // More than 48 KB of dynamic shared memory needs an explicit opt-in.
-        const cudaError_t attr_err = cudaFuncSetAttribute(&butterfly_cuda_kernel_64, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmem64);
-        TORCH_CHECK(attr_err == cudaSuccess,
-                    "butterfly_bf16_cuda: cudaFuncSetAttribute failed: ", cudaGetErrorString(attr_err));
-        butterfly_cuda_kernel_64<<<grid, block, kSmem64>>>(
-            x_ptr,
-            gate_ptr,
-            static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
-            static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
-            tw_real_ptr,
-            tw_imag_ptr,
-            out_real_ptr,
-            out_imag_ptr,
-            B,
-            H,
-            N);
-        break;
-    }
-    case 128:
-    {
-        const cudaError_t attr_err = cudaFuncSetAttribute(&butterfly_cuda_kernel_128, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmem128);
-        TORCH_CHECK(attr_err == cudaSuccess,
-                    "butterfly_bf16_cuda: cudaFuncSetAttribute failed: ", cudaGetErrorString(attr_err));
-        butterfly_cuda_kernel_128<<<grid, block, kSmem128>>>(
-            x_ptr,
-            gate_ptr,
-            static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
-            static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
-            tw_real_ptr,
-            tw_imag_ptr,
-            out_real_ptr,
-            out_imag_ptr,
-            B,
-            H,
-            N);
-        break;
-    }
-    default:
-        // Unreachable: N was validated above.
-        break;
-    }
-
-    // A kernel launch reports configuration errors only through the last-error state.
-    const cudaError_t launch_err = cudaGetLastError();
-    TORCH_CHECK(launch_err == cudaSuccess,
-                "butterfly_bf16_cuda: kernel launch failed: ", cudaGetErrorString(launch_err));
-
-    return {out_real, out_imag};
 }

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "shared.h"
+using namespace nvcuda;
+__global__ void butterfly_cuda_kernel_64(
+    const __nv_bfloat162 *__restrict__ x,                     // input, read as packed bf16 pairs
+    const __nv_bfloat162 *__restrict__ x_gate,                // optional elementwise multiplier for x; nullptr disables it
+    const __nv_bfloat162 *__restrict__ d_f_real,              // real part of the left-hand matrix, read as packed pairs
+    const __nv_bfloat162 *__restrict__ d_f_imag,              // imaginary part of the left-hand matrix
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,  // real part of the elementwise post-multiplier
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,  // imaginary part of the elementwise post-multiplier
+    __nv_bfloat162 *__restrict__ out_real,                    // real part of the result, indexed like x
+    __nv_bfloat162 *__restrict__ out_imag,                    // imaginary part of the result, indexed like x
+    uint B,                                                   // not referenced in the body
+    uint H,                                                   // only enters the blockIdx.y stride of `offset`
+    int N)                                                    // matrix size; sizes the shared regions and is the WMMA leading dimension
+{   /* For each of 16 consecutive slices (loop over t) computes, in complex
+     * arithmetic, (d_f * x_slice) multiplied elementwise by the twiddle factors,
+     * using bf16 WMMA ops with float accumulation, and writes the real and
+     * imaginary parts to out_real / out_imag as bf16.
+     *
+     * All global indices are in units of __nv_bfloat162. A block covers 32 pairs
+     * per row; consecutive rows are 32 * gridDim.x pairs apart and consecutive
+     * slices 64 rows apart. blockIdx.z selects a group of 16 slices.
+     *
+     * Dynamic shared memory is carved into five N*N bf16 regions (x, d_f real/imag,
+     * twiddle real/imag) followed by two N*N float regions (real/imag results),
+     * so the launch must supply at least 5*N*N*2 + 2*N*N*4 bytes.
+     *
+     * NOTE(review): the 32-pair shared row stride combined with leading dimension
+     * N, and one 16-row block per threadIdx.y, look written for N == 64 and
+     * blockDim = (32, 4) -- confirm against the launcher.
+     * NOTE(review): `offset` is a 32-bit int; very large inputs could overflow it.
+     */
+    const int offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;  // same column offset, without the y/z terms
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;  // rows handled by each threadIdx.y
+    extern __shared__ __nv_bfloat16 x_shared[];
+    __nv_bfloat16 *d_f_real_shared = &x_shared[N * N];
+    __nv_bfloat16 *d_f_imag_shared = &d_f_real_shared[N * N];
+    __nv_bfloat16 *twiddles_real_shared = &d_f_imag_shared[N * N];
+    __nv_bfloat16 *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    float *out_real_shared = reinterpret_cast<float*>(&twiddles_imag_shared[N * N]);
+    float *out_imag_shared = &out_real_shared[N * N];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)  // stage the slice-independent operands once: twiddles and d_f
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;  // recomputes the value it already holds
+        reinterpret_cast<__nv_bfloat162 *>(d_f_real_shared)[shared_offset] = d_f_real[shared_offset];
+        reinterpret_cast<__nv_bfloat162 *>(d_f_imag_shared)[shared_offset] = d_f_imag[shared_offset];
+    }
+    float2 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[4][4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[4];
+    __syncthreads();  // staged d_f and twiddles must be complete before the fragment loads
+    for (int i = 0; i < 4; i++)  // loaded once and reused for all 16 slices
+    {
+        wmma::load_matrix_sync(a_frag_real[i], d_f_real_shared + i * N * 16 + threadIdx.y * 16, N);  // row block threadIdx.y, k block i
+        wmma::load_matrix_sync(a_frag_imag[i], d_f_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + threadIdx.y * N * 16 + i * 16, N);  // row block threadIdx.y, column block i
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + threadIdx.y * N * 16 + i * 16, N);
+    }
+    for (int t = 0; t < 16; t++)  // one slice per iteration
+    {
+        for (int i = 0; i < n; i++)  // stage slice t of x, applying the gate when present
+        {
+            idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+            if(x_gate != nullptr){
+                reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+            }else{
+                reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = x[idx + offset];
+            }
+        }
+        __syncthreads();  // the x slice must be complete before b_frag is loaded
+        for (int i = 0; i < 4; i++)
+        {
+            for (int j = 0; j < 4; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * N * 16 + j * 16, N);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)  // real part: d_f_real * x for column block j
+        {
+            wmma::fill_fragment(acc_frag_real[j], 0.0f);
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)  // imaginary part: d_f_imag * x for column block j
+        {
+            wmma::fill_fragment(acc_frag_imag[j], 0.0f);
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 4; j++)  // complex multiply by the twiddles, then park the block in shared memory
+        {
+            for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+            {   /* NOTE(review): indexing matrix_a and accumulator fragments with the same k
+                 * assumes both use the same per-thread element layout, which WMMA leaves
+                 * unspecified -- confirm on the target GPU. The float2 operators are not
+                 * CUDA built-ins; presumably they come from "shared.h" -- confirm. */
+                tmp_real = reinterpret_cast<float2 *>(acc_frag_real[j].x)[k];
+                tmp_imag = reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k];
+                reinterpret_cast<float2 *>(acc_frag_real[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]);
+                reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]);
+            }
+            wmma::store_matrix_sync(out_real_shared + threadIdx.y * N * 16 + j * 16, acc_frag_real[j], N, wmma::mem_row_major);
+            wmma::store_matrix_sync(out_imag_shared + threadIdx.y * N * 16 + j * 16, acc_frag_imag[j], N, wmma::mem_row_major);
+        }
+        __syncthreads();  // all result blocks must be stored before the row-wise write-out
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);  // float pair -> bf16 pair
+            out_imag[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+        }
+        __syncthreads();  // block-wide barrier at the end of each slice; the shared buffers are reused by the next one
+    }
+}
+__global__ void butterfly_cuda_kernel_32(
+    const __nv_bfloat162 *__restrict__ x,                     // input, read as packed bf16 pairs
+    const __nv_bfloat162 *__restrict__ x_gate,                // optional elementwise multiplier for x; nullptr disables it
+    const __nv_bfloat16 *__restrict__ d_f_real,               // real part of the left-hand matrix (copied into a 32*32 shared array)
+    const __nv_bfloat16 *__restrict__ d_f_imag,               // imaginary part of the left-hand matrix
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,  // real part of the elementwise post-multiplier
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,  // imaginary part of the elementwise post-multiplier
+    __nv_bfloat162 *__restrict__ out_real,                    // real part of the result, indexed like x
+    __nv_bfloat162 *__restrict__ out_imag,                    // imaginary part of the result, indexed like x
+    uint B,                                                   // not referenced in the body
+    uint H,                                                   // only enters the blockIdx.y stride of `offset`
+    int N)                                                    // row count; N and 2 * N are used as WMMA leading dimensions
+{   /* Multiplies one tile of x (optionally gated by x_gate) by the complex matrix
+     * (d_f_real, d_f_imag) with bf16 WMMA ops and float accumulation, multiplies
+     * the complex product elementwise by the twiddle factors, and writes the real
+     * and imaginary parts to out_real / out_imag as bf16.
+     *
+     * All global indices are in units of __nv_bfloat162. A block covers 32 pairs
+     * per row and consecutive rows are 32 * gridDim.x pairs apart. The twiddle
+     * factors are indexed without the blockIdx.y / blockIdx.z terms.
+     *
+     * NOTE(review): the shared arrays hold 32 rows and 2 * N is used as the stride
+     * of a 64-wide row, so this looks written for N == 32 with blockDim.x == 32 --
+     * confirm against the launcher.
+     * NOTE(review): `offset` is a 32-bit int; very large inputs could overflow it.
+     */
+    const int offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;  // same column offset, without the y/z terms
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;  // rows handled by each threadIdx.y
+    __shared__ __nv_bfloat16 x_shared[32 * 64];
+    __shared__ __nv_bfloat16 d_f_real_shared[32 * 32];
+    __shared__ __nv_bfloat16 d_f_imag_shared[32 * 32];
+    __shared__ __nv_bfloat16 twiddles_real_shared[32 * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[32 * 64];
+    __shared__ float out_real_shared[32 * 64];
+    __shared__ float out_imag_shared[32 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)  // stage x, the twiddles and d_f in shared memory, one row per step
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        if(x_gate != nullptr){
+            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+        }else{
+            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = x[idx + offset];
+        }
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        d_f_real_shared[shared_offset] = d_f_real[shared_offset];  // same index, but in scalar bf16 units: one 32-wide row of d_f
+        d_f_imag_shared[shared_offset] = d_f_imag[shared_offset];
+    }
+    __syncthreads();  // staged tiles must be complete before any fragment load
+    if (threadIdx.y < N / 16)  // only the first N / 16 values of threadIdx.y compute; each takes a 32-column slice
+    {
+        float2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[2][2];
+        int t = threadIdx.y * 32;  // first column of this slice within the 64-wide shared rows
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real_shared + j * N * 16 + i * 16, N);  // column-major: row block i, column block j
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag_shared + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(b_frag[i][j], x_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);  // row-major: row block i, column block j of the slice
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)  // real part: acc[i][j] = sum over k of d_f_real[i][k] * x[k][j]
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], 0.0f);
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)  // imaginary part: same product with d_f_imag
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_imag[i][j], 0.0f);
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_imag[i][j], a_frag_imag[i][k], b_frag[k][j], acc_frag_imag[i][j]);
+                }
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 2; i++)  // complex multiply by the twiddles, then park each block in shared memory
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                 for (int k = 0; k < acc_frag_real[i][j].num_elements / 2; k++)
+                {   /* NOTE(review): indexing matrix_a and accumulator fragments with the same k
+                     * assumes both use the same per-thread element layout, which WMMA leaves
+                     * unspecified -- confirm on the target GPU. The float2 operators are not
+                     * CUDA built-ins; presumably they come from "shared.h" -- confirm. */
+                    tmp_real = 	reinterpret_cast<float2 *>(acc_frag_real[i][j].x)[k];
+                    tmp_imag = 	reinterpret_cast<float2 *>(acc_frag_imag[i][j].x)[k];
+                    reinterpret_cast<float2 *>(acc_frag_real[i][j].x)[k] = 	tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[i][j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[i][j].x)[k]);
+                    reinterpret_cast<float2 *>(acc_frag_imag[i][j].x)[k] =  tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[i][j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[i][j].x)[k]);
+                }
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+                wmma::store_matrix_sync(out_imag_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_imag[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();  // results must be in shared memory before the write-out below
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);  // float pair -> bf16 pair
+        out_imag[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+    }
+}
+// Applies the 128 x 128 complex matrix (d_f_real, d_f_imag) to 16 consecutive
+// 128-row slices of x with bf16 WMMA ops (float accumulation), multiplies each
+// complex product elementwise by the twiddle factors, and writes the real and
+// imaginary parts to out_real / out_imag as bf16.
+//
+// Indexing is in units of __nv_bfloat162. Every thread moves two pairs per row
+// (j = 0, 1, blockDim.x apart), so with blockDim.x == 32 a block covers 64 pairs
+// per row; consecutive rows are 64 * gridDim.x pairs apart. blockIdx.z selects a
+// group of 16 slices (loop over t).
+//
+// Shared memory: a single dynamic buffer. It is used as two 128 x 128 bf16 planes
+// (first d_f, then the twiddles, then x in the first plane) and is afterwards
+// reinterpreted as one 128 x 128 float plane for the results, i.e. 65536 bytes.
+// Because x and the results share storage, every phase is fenced by a block-wide
+// barrier, including one at the end of each slice.
+//
+// threadIdx.y owns output rows [16 * threadIdx.y, 16 * threadIdx.y + 16).
+// NOTE(review): covering all 128 rows therefore needs blockDim = (32, 8) --
+// confirm against the launcher.
+//
+// Parameters:
+//   x, x_gate             input and optional elementwise gate (nullptr = no gate)
+//   d_f_real, d_f_imag    real / imaginary parts of the left-hand matrix
+//   twiddle_factors_*     real / imaginary parts of the elementwise post-multiplier;
+//                         indexed without the blockIdx.y / blockIdx.z terms
+//   out_real, out_imag    results, indexed like x
+//   B                     not referenced in the body
+//   H                     only enters the blockIdx.y stride of `offset`
+//   N                     only used for the rows-per-threadIdx.y count N / blockDim.y
+//
+// NOTE(review): the twiddle multiply indexes matrix_a and accumulator fragments
+// with the same k, which assumes both use the same per-thread element layout;
+// WMMA leaves that layout unspecified -- confirm on the target GPU.
+// NOTE(review): `offset` is a 32-bit int; very large inputs could overflow it.
+__global__ void butterfly_cuda_kernel_128(
+    const __nv_bfloat162 *__restrict__ x,
+    const __nv_bfloat162 *__restrict__ x_gate,
+    const __nv_bfloat162 *__restrict__ d_f_real,
+    const __nv_bfloat162 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_imag,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * 16 * 128 * 32 * 2 * gridDim.x + blockIdx.x * 64 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 64 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    extern __shared__ __nv_bfloat16 shared_real[];
+    __nv_bfloat16 *shared_imag = &shared_real[128 * 128];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag[8][8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag[8];
+
+    // Phase 1: stage d_f in the two bf16 planes and pull it into fragments.
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 2; j++){
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162 *>(shared_real)[shared_offset] = d_f_real[shared_offset];
+            reinterpret_cast<__nv_bfloat162 *>(shared_imag)[shared_offset] = d_f_imag[shared_offset];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(a_frag_real[i], shared_real + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(a_frag_imag[i], shared_imag + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    __syncthreads();  // d_f is in registers; the planes may be overwritten
+
+    // Phase 2: same planes, now for the twiddle factors.
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 2; j++){
+            idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+            reinterpret_cast<__nv_bfloat162*>(shared_imag)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], shared_real + threadIdx.y * 128 * 16 + i * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], shared_imag + threadIdx.y * 128 * 16 + i * 16, 128);
+    }
+    __syncthreads();  // twiddles are in registers; the planes may be overwritten
+
+    // Phase 3: one slice of x per iteration.
+    for(int t=0; t< 16; t++){
+        // Stage slice t of x in the first bf16 plane, applying the gate when present.
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                if(x_gate != nullptr){
+                    reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+                }else{
+                    reinterpret_cast<__nv_bfloat162*>(shared_real)[shared_offset] = x[offset + idx];
+                }
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+            for (int j = 0; j < 8; j++)
+            {
+                wmma::load_matrix_sync(b_frag[i][j], shared_real + i * 128 * 16 + j * 16, 128);
+            }
+        }
+        __syncthreads();  // x is in registers; the buffer is about to hold float results
+#pragma unroll
+        for (int j = 0; j < 8; j++)
+        {
+            wmma::fill_fragment(acc_frag_real[j], 0.0f);
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[j], a_frag_real[k], b_frag[k][j], acc_frag_real[j]);
+            }
+        }
+#pragma unroll
+        for (int j = 0; j < 8; j++)
+        {
+            wmma::fill_fragment(acc_frag_imag[j], 0.0f);
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_imag[j], a_frag_imag[k], b_frag[k][j], acc_frag_imag[j]);
+            }
+        }
+        // Complex multiply by the twiddles, two values at a time.
+        float2 tmp_real, tmp_imag;
+#pragma unroll
+        for (int j = 0; j < 8; j++)
+        {
+            for (int k = 0; k < acc_frag_real[j].num_elements / 2; k++)
+            {
+                tmp_real = reinterpret_cast<float2 *>(acc_frag_real[j].x)[k];
+                tmp_imag = reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k];
+                reinterpret_cast<float2 *>(acc_frag_real[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]);
+                reinterpret_cast<float2 *>(acc_frag_imag[j].x)[k] = tmp_real * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag[j].x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real[j].x)[k]);
+            }
+        }
+        // Real part: park in the float plane, then write out row-wise.
+        for (int j = 0; j < 8; j++)
+        {
+            wmma::store_matrix_sync(reinterpret_cast<float*>(shared_real) + threadIdx.y * 128 * 16 + j * 16, acc_frag_real[j], 128, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                out_real[offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(shared_real)[shared_offset]);
+            }
+        }
+        __syncthreads();  // real results are written out; the plane is reused for the imaginary part
+        // Imaginary part: same float plane.
+        for (int j = 0; j < 8; j++)
+        {
+            wmma::store_matrix_sync(reinterpret_cast<float*>(shared_real) + threadIdx.y * 128 * 16 + j * 16, acc_frag_imag[j], 128, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                out_imag[offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(shared_real)[shared_offset]);
+            }
+        }
+        // The next iteration stores x into the storage that was just read as float
+        // results. Without this barrier a fast warp could overwrite values that a
+        // slower warp has not yet copied to out_imag.
+        __syncthreads();
+    }
+}
+__global__ void butterfly_cuda_kernel_16(
+    const __nv_bfloat162 *__restrict__ x,                     // input, read as packed bf16 pairs
+    const __nv_bfloat162 *__restrict__ x_gate,                // optional elementwise multiplier for x; nullptr disables it
+    const __nv_bfloat16 *__restrict__ d_f_real,               // real part of the left-hand matrix (copied into a 16*16 shared array)
+    const __nv_bfloat16 *__restrict__ d_f_imag,               // imaginary part of the left-hand matrix
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,  // real part of the elementwise post-multiplier
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,  // imaginary part of the elementwise post-multiplier
+    __nv_bfloat162 *__restrict__ out_real,                    // real part of the result, indexed like x
+    __nv_bfloat162 *__restrict__ out_imag,                    // imaginary part of the result, indexed like x
+    uint B,                                                   // not referenced in the body
+    uint H,                                                   // only enters the blockIdx.y stride of `offset`
+    int N)                                                    // row count; also the leading dimension used for the d_f fragments
+{   /* Multiplies one tile of x (optionally gated by x_gate) by the complex matrix
+     * (d_f_real, d_f_imag) with WMMA tensor-core ops, multiplies the complex
+     * product elementwise by (twiddle_factors_real, twiddle_factors_imag), and
+     * writes the real and imaginary parts to out_real / out_imag as bf16.
+     *
+     * All global indices are in units of __nv_bfloat162 (two bf16 values). A block
+     * covers 32 pairs per row and consecutive rows are 32 * gridDim.x pairs apart.
+     * The twiddle factors are indexed without the blockIdx.y / blockIdx.z terms,
+     * so every (y, z) block reads the same factors.
+     *
+     * NOTE(review): the shared arrays hold 16 rows and the WMMA section treats each
+     * threadIdx.y as one warp, so this looks written for N == 16 and
+     * blockDim = (32, 4) -- confirm against the launcher.
+     * NOTE(review): `offset` is a 32-bit int; very large inputs could overflow it.
+     */
+    const int offset = blockIdx.y * H * 16 * 32 * gridDim.x + blockIdx.z * 16 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;  // same column offset, without the y/z terms
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;  // rows handled by each threadIdx.y
+    __shared__ __nv_bfloat16 x_shared[16 * 64];
+    __shared__ __nv_bfloat16 d_f_real_shared[16 * 16];
+    __shared__ __nv_bfloat16 d_f_imag_shared[16 * 16];
+    __shared__ __nv_bfloat16 twiddles_real_shared[16 * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[16 * 64];
+    __shared__ float out_real_shared[16 * 64];
+    __shared__ float out_imag_shared[16 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)  // stage x, the twiddles and d_f in shared memory, one row per step
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        if(x_gate != nullptr){
+            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = __hmul2(x[idx + offset], x_gate[idx + offset]);
+        }else{
+            reinterpret_cast<__nv_bfloat162 *>(x_shared)[shared_offset] = x[idx + offset];
+        }
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        if(threadIdx.x  < 16 ){  // d_f rows are 16 scalars wide, so only 16 lanes copy
+            shared_offset = (threadIdx.y + i * B_Y) * 16 + threadIdx.x;
+            d_f_real_shared[shared_offset] = d_f_real[shared_offset];
+            d_f_imag_shared[shared_offset] = d_f_imag[shared_offset];
+        }
+    }
+    __syncthreads();  // staged tiles must be complete before any fragment load
+    if (threadIdx.y < 4)  // each participating threadIdx.y handles the 16-column slice at threadIdx.y * 16
+    {
+        float2 tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_imag;
+        wmma::load_matrix_sync(a_frag_real, d_f_real_shared, N);
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag_shared, N);
+        wmma::load_matrix_sync(b_frag, x_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);  // twiddles are loaded as fragments only to be read per element below
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        wmma::fill_fragment(acc_frag_real, 0.0f);
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag, acc_frag_real);  // d_f_real * x
+        wmma::fill_fragment(acc_frag_imag, 0.0f);
+         wmma::mma_sync(acc_frag_imag, a_frag_imag, b_frag, acc_frag_imag);  // d_f_imag * x
+#pragma unroll
+        for (int k = 0; k < acc_frag_real.num_elements / 2; k++)  // complex multiply by the twiddles, two values at a time
+        {   /* NOTE(review): indexing the matrix_a twiddle fragments and the accumulator
+             * fragments with the same k assumes both use the same per-thread element
+             * layout, which WMMA leaves unspecified -- confirm on the target GPU.
+             * The float2 operators used here are not CUDA built-ins; presumably they
+             * come from "shared.h" -- confirm. */
+            tmp_real = 	reinterpret_cast<float2 *>(acc_frag_real.x)[k];
+            tmp_imag = 	reinterpret_cast<float2 *>(acc_frag_imag.x)[k];
+            reinterpret_cast<float2 *>(acc_frag_real.x)[k] = 	tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real.x)[k]) - tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag.x)[k]);
+            reinterpret_cast<float2 *>(acc_frag_imag.x)[k] =  tmp_real  * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_imag.x)[k]) + tmp_imag * __bfloat1622float2(reinterpret_cast<__nv_bfloat162 *>(tw_frag_real.x)[k]);
+        }
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+        wmma::store_matrix_sync(out_imag_shared + threadIdx.y * 16, acc_frag_imag, 64, wmma::mem_row_major);
+    }
+    __syncthreads();  // results must be in shared memory before the write-out below
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);  // float pair -> bf16 pair
+        out_imag[idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(out_imag_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+    }
+}
+// Host-side launcher for the bf16 butterfly kernels.
+//
+// x is read as a 4-D tensor (B, H, N, M). Two tensors with the same shape and
+// options as x are allocated, filled by the kernel specialised for N, and
+// returned as {out_real, out_imag}.
+//
+// Parameters:
+//   x                     input; its storage is reinterpreted as __nv_bfloat162
+//   d_f_real, d_f_imag    real / imaginary d_f operands forwarded to the kernel
+//   twiddle_factors_real,
+//   twiddle_factors_imag  real / imaginary twiddle factors forwarded to the kernel
+//   x_gate                optional gate; the kernel receives nullptr when absent
+//
+// Throws (TORCH_CHECK) when x is not a contiguous 4-D CUDA bfloat16 tensor,
+// when N is not one of 16, 32, 64, 128, when H is not a multiple of 16 for
+// N == 64 / N == 128 (only H / 16 blocks are launched along z for those
+// sizes), or when the CUDA runtime reports an error. Previously an
+// unsupported N only printed a message and returned uninitialised tensors.
+//
+// NOTE(review): launches pass no stream argument, so they go to the default
+// stream rather than PyTorch's current stream -- confirm this is intended.
+std::vector<torch::Tensor> butterfly_bf16_cuda(
+    torch::Tensor x,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> x_gate = std::nullopt
+    )
+{
+    TORCH_CHECK(x.is_cuda(), "butterfly_bf16_cuda: x must be a CUDA tensor");
+    TORCH_CHECK(x.dim() == 4, "butterfly_bf16_cuda: x must be 4-D (B, H, N, M), got ", x.dim(), " dimensions");
+    TORCH_CHECK(x.scalar_type() == torch::kBFloat16, "butterfly_bf16_cuda: x must be bfloat16");
+    TORCH_CHECK(x.is_contiguous(), "butterfly_bf16_cuda: x must be contiguous");
+
+    uint B = x.size(0);
+    uint H = x.size(1);
+    uint N = x.size(2);
+    uint M = x.size(3);
+
+    TORCH_CHECK(N == 16 || N == 32 || N == 64 || N == 128,
+                "butterfly_bf16_cuda: N = ", N, " is not implemented (supported: 16, 32, 64, 128)");
+
+    // For N == 64 and N == 128 the z dimension of the grid is H / 16, so a
+    // remainder would leave part of the output unwritten and H < 16 would
+    // request an empty grid.
+    const bool heads_grouped_by_16 = (N == 64 || N == 128);
+    TORCH_CHECK(!heads_grouped_by_16 || (H >= 16 && H % 16 == 0),
+                "butterfly_bf16_cuda: H must be a positive multiple of 16 when N is 64 or 128, got ", H);
+
+    torch::Tensor out_real = torch::empty({B, H, N, M}, x.options());
+    torch::Tensor out_imag = torch::empty({B, H, N, M}, x.options());
+
+    const bool wide = (N == 128);
+
+    dim3 block;
+    block.x = 32;
+    block.y = wide ? 8 : 4;
+
+    dim3 grid;
+    grid.y = B;
+    grid.z = heads_grouped_by_16 ? H / 16 : H;
+
+    // Grid width: the tabulated widths give M / 128 blocks for N == 128 and
+    // M / 64 otherwise; every other M keeps the original fallback (256 / 512).
+    switch (M)
+    {
+    case 16384:
+    case 8192:
+    case 4096:
+        grid.x = M / (wide ? 128 : 64);
+        break;
+    default:
+        grid.x = wide ? 256 : 512;
+        break;
+    }
+
+    // Turns a CUDA runtime status into a C++ exception with context.
+    const auto check_cuda = [](cudaError_t status, const char *what) {
+        TORCH_CHECK(status == cudaSuccess,
+                    "butterfly_bf16_cuda: ", what, " failed: ", cudaGetErrorString(status));
+    };
+
+    // Device pointers shared by every specialisation.
+    auto *x_ptr = static_cast<__nv_bfloat162 *>(x.data_ptr());
+    auto *gate_ptr = x_gate ? static_cast<__nv_bfloat162 *>(x_gate.value().data_ptr()) : nullptr;
+    auto *tw_real_ptr = static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr());
+    auto *tw_imag_ptr = static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr());
+    auto *out_real_ptr = static_cast<__nv_bfloat162 *>(out_real.data_ptr());
+    auto *out_imag_ptr = static_cast<__nv_bfloat162 *>(out_imag.data_ptr());
+    // The 16 / 32 kernels take d_f as scalar bf16, the 64 / 128 kernels as
+    // packed pairs, so the cast is done per case.
+    void *d_f_real_raw = d_f_real.data_ptr();
+    void *d_f_imag_raw = d_f_imag.data_ptr();
+
+    switch (N)
+    {
+    case 16:
+        butterfly_cuda_kernel_16<<<grid, block>>>(
+            x_ptr,
+            gate_ptr,
+            static_cast<__nv_bfloat16 *>(d_f_real_raw),
+            static_cast<__nv_bfloat16 *>(d_f_imag_raw),
+            tw_real_ptr,
+            tw_imag_ptr,
+            out_real_ptr,
+            out_imag_ptr,
+            B,
+            H,
+            N);
+        break;
+    case 32:
+        butterfly_cuda_kernel_32<<<grid, block>>>(
+            x_ptr,
+            gate_ptr,
+            static_cast<__nv_bfloat16 *>(d_f_real_raw),
+            static_cast<__nv_bfloat16 *>(d_f_imag_raw),
+            tw_real_ptr,
+            tw_imag_ptr,
+            out_real_ptr,
+            out_imag_ptr,
+            B,
+            H,
+            N);
+        break;
+    case 64:
+        // More than 48 KB of dynamic shared memory requires an explicit opt-in.
+        check_cuda(cudaFuncSetAttribute(&butterfly_cuda_kernel_64, cudaFuncAttributeMaxDynamicSharedMemorySize, 78000),
+                   "cudaFuncSetAttribute(butterfly_cuda_kernel_64)");
+        butterfly_cuda_kernel_64<<<grid, block, 78000>>>(
+            x_ptr,
+            gate_ptr,
+            static_cast<__nv_bfloat162 *>(d_f_real_raw),
+            static_cast<__nv_bfloat162 *>(d_f_imag_raw),
+            tw_real_ptr,
+            tw_imag_ptr,
+            out_real_ptr,
+            out_imag_ptr,
+            B,
+            H,
+            N);
+        break;
+    case 128:
+        check_cuda(cudaFuncSetAttribute(&butterfly_cuda_kernel_128, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536),
+                   "cudaFuncSetAttribute(butterfly_cuda_kernel_128)");
+        butterfly_cuda_kernel_128<<<grid, block, 65536>>>(
+            x_ptr,
+            gate_ptr,
+            static_cast<__nv_bfloat162 *>(d_f_real_raw),
+            static_cast<__nv_bfloat162 *>(d_f_imag_raw),
+            tw_real_ptr,
+            tw_imag_ptr,
+            out_real_ptr,
+            out_imag_ptr,
+            B,
+            H,
+            N);
+        break;
+    default:
+        // Unreachable: N was validated above.
+        break;
+    }
+    // Launch-configuration errors are only visible through the runtime's
+    // last-error state.
+    check_cuda(cudaGetLastError(), "kernel launch");
+    return {out_real, out_imag};
 }

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_ifft_cuda.cu CHANGED Viewed

@@ -1,723 +1,723 @@
-// Copyright (c) 2023 Dan Fu, Hermann Kumbong
-#include <torch/extension.h>
-#include <vector>
-#include <stdio.h>
-#include <mma.h>
-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
-#include "shared.h"
-using namespace nvcuda;
-__global__ void butterfly_ifft_cuda_kernel_64( // fp16 butterfly iFFT step on 64-row tiles; each block walks 16 consecutive slices and writes only the real part of the result
-    const __half2 *__restrict__ x_real, // real part of the input, read as packed __half2
-    const __half2 *__restrict__ x_imag, // imaginary part of the input, read at the same indices as x_real
-    const complex_half_t *__restrict__ d_f, // complex matrix; its .real() / .imag() parts are split into two shared planes
-    const __half2 *__restrict__ twiddle_factors_real, // real part of the twiddle factors
-    const __half2 *__restrict__ twiddle_factors_imag, // imaginary part of the twiddle factors
-    __half2 *__restrict__ out_real, // output, written at the same indices x_real is read from
-    __half2 *__restrict__ out_gate, // optional element-wise multiplier for the output; nullptr disables gating
-    uint B, // not referenced in this kernel body
-    uint H, // scales the blockIdx.y stride in the offset below
-    int N) // side of the shared planes and wmma leading dimension; NOTE(review): global strides hard-code 64, so presumably N == 64 -- confirm
-{
-    const int offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x; // __half2 index of this thread's column in the first slice of the block: blockIdx.y steps over H slices, blockIdx.z over 16 slices, one slice = 64 rows of 32 * gridDim.x __half2
-    const int tw_offset = blockIdx.x * 32 + threadIdx.x; // column of this thread inside a twiddle row (no batch / slice term)
-    int idx;
-    int shared_offset;
-    const int B_Y = blockDim.y;
-    const int n = N / B_Y; // rows each threadIdx.y value covers in the copy loops
-    extern __shared__ half x_real_shared[]; // dynamic shared memory, carved below into seven consecutive N * N half planes
-    half *x_imag_shared = &x_real_shared[N * N];
-    half *d_f_real = &x_imag_shared[N * N];
-    half *d_f_imag = &d_f_real[N * N];
-    half *twiddles_real_shared = &d_f_imag[N * N];
-    half *twiddles_imag_shared = &twiddles_real_shared[N * N];
-    half *out_real_shared = &twiddles_imag_shared[N * N];
-    half tmp_real, tmp_imag;
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[4][4]; // 4 x 4 grid of 16 x 16 tiles of the real d_f plane
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[4][4]; // same tiling for the imaginary d_f plane
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[4]; // twiddle tiles, loaded once and reused for every slice
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[4];
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[4]; // input tiles of the current slice
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[4];
-    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[4]; // half-precision accumulators for the real output
-    // #pragma unroll
-    for (int i = 0; i < n; i++) // stage the twiddles and d_f into shared memory
-    {
-        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
-        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
-        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
-        // #pragma unroll
-        shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x; // d_f is indexed per half element, 64 per row
-        d_f_real[shared_offset] = d_f[shared_offset].real();
-        d_f_imag[shared_offset] = d_f[shared_offset].imag();
-        d_f_real[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].real(); // second column handled by this thread; covers all 64 columns only if blockDim.x == 32 -- TODO confirm launch config
-        d_f_imag[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].imag();
-    }
-    __syncthreads(); // shared planes must be complete before any fragment load
-    for (int i = 0; i < 4; i++)
-    {
-#pragma unroll
-        for (int j = 0; j < 4; j++)
-        {
-            wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
-            wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
-        }
-        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + i * N * 16 + threadIdx.y * 16, N); // tile at row block i, column block threadIdx.y
-        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + i * N * 16 + threadIdx.y * 16, N);
-    }
-    for (int t = 0; t < 16; t++) // one iteration per slice handled by this block
-    {
-        for (int i = 0; i < n; i++) // copy slice t of the input into shared memory
-        {
-            idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
-            shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
-            reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
-            reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
-        }
-        __syncthreads(); // input planes written by all threads before they are read as tiles
-        for (int i = 0; i < 4; i++)
-        {
-            wmma::load_matrix_sync(b_frag_real[i], x_real_shared + i * N * 16 + threadIdx.y * 16, N);
-            wmma::load_matrix_sync(b_frag_imag[i], x_imag_shared + i * N * 16 + threadIdx.y * 16, N);
-        }
-        for (int j = 0; j < 4; j++) // complex multiply of the input by the twiddles, element by element over fragment storage
-        {
-            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
-            {
-                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k])); // re = tw_re * x_re - tw_im * x_im
-                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k])); // im = tw_re * x_im + tw_im * x_re
-                b_frag_real[j].x[k] = tmp_real;
-                b_frag_imag[j].x[k] = tmp_imag;
-            }
-        }
-        for (int i = 0; i < 4; i++)
-        {
-            wmma::fill_fragment(acc_frag_real[i], __float2half(0.0f));
-// bd: accumulate the imaginary-by-imaginary product first
-#pragma unroll
-            for (int k = 0; k < 4; k++)
-            {
-                wmma::mma_sync(acc_frag_real[i], a_frag_imag[i][k], b_frag_imag[k], acc_frag_real[i]);
-            }
-            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
-            {
-                acc_frag_real[i].x[k] = __hneg(acc_frag_real[i].x[k]); // flip the sign so the next pass yields ac - bd
-            }
-        }
-        for (int i = 0; i < 4; i++)
-        {
-// ac - bd: add the real-by-real product onto the negated accumulator
-#pragma unroll
-            for (int k = 0; k < 4; k++)
-            {
-                wmma::mma_sync(acc_frag_real[i], a_frag_real[i][k], b_frag_real[k], acc_frag_real[i]);
-            }
-        }
-#pragma unroll
-        for (int i = 0; i < 4; i++)
-        {
-            wmma::store_matrix_sync(out_real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
-        }
-        __syncthreads(); // every tile stored before the row-wise copy to global memory
-#pragma unroll
-        for (int i = 0; i < n; i++)
-        {
-            idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
-            if(out_gate != nullptr){
-                out_real[idx] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x], out_gate[idx]); // gated write: result * out_gate
-            }
-            else{
-                out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
-            }
-        }
-        __syncthreads(); // shared planes are overwritten by the next slice
-    }
-}
-__global__ void butterfly_ifft_cuda_kernel_32( // fp16 butterfly iFFT step on 32-row tiles; one slice per block, only the real part of the result is written
-    const __half2 *__restrict__ x_real, // real part of the input, read as packed __half2
-    const __half2 *__restrict__ x_imag, // imaginary part of the input, read at the same indices as x_real
-    const complex_half_t *__restrict__ d_f, // complex matrix; its .real() / .imag() parts are split into two shared arrays
-    const __half2 *__restrict__ twiddle_factors_real, // real part of the twiddle factors
-    const __half2 *__restrict__ twiddle_factors_imag, // imaginary part of the twiddle factors
-    __half2 *__restrict__ out_real, // output, written at the same indices x_real is read from
-    __half2 *__restrict__ out_gate, // optional element-wise multiplier for the output; nullptr disables gating
-    uint B, // not referenced in this kernel body
-    uint H, // scales the blockIdx.y stride in the offset below
-    int N) // row count and wmma leading dimension; NOTE(review): shared arrays and global strides hard-code 32, so presumably N == 32 -- confirm
-{
-    const int offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x; // __half2 index of this thread's column in the slice of this block: one slice = 32 rows of 32 * gridDim.x __half2
-    const int tw_offset = blockIdx.x * 32 + threadIdx.x; // column of this thread inside a twiddle row
-    int idx;
-    int shared_offset;
-    const int B_Y = blockDim.y;
-    const int n = N / B_Y; // rows each threadIdx.y value covers in the copy loops
-    __shared__ half x_real_shared[32 * 64]; // 32 rows x 64 half columns of input (real)
-    __shared__ half x_imag_shared[32 * 64];
-    __shared__ half d_f_real[32 * 32];
-    __shared__ half d_f_imag[32 * 32];
-    __shared__ half twiddles_real_shared[32 * 64];
-    __shared__ half twiddles_imag_shared[32 * 64];
-    __shared__ half out_real_shared[32 * 64];
-    // #pragma unroll
-    for (int i = 0; i < n; i++) // stage input, twiddles and d_f into shared memory
-    {
-        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
-        reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
-        reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
-        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
-        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
-        // #pragma unroll
-        d_f_real[shared_offset] = d_f[shared_offset].real(); // same row * 32 + column index, here counted in half elements
-        d_f_imag[shared_offset] = d_f[shared_offset].imag();
-    }
-    __syncthreads(); // shared arrays must be complete before any fragment load
-    if (threadIdx.y < N / 16) // only the first N / 16 threadIdx.y rows do the matrix work
-    {
-        half tmp_real, tmp_imag;
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[2][2]; // 2 x 2 grid of 16 x 16 tiles of the real d_f array
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[2][2];
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[2][2];
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[2][2];
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[2][2];
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[2][2];
-        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[2][2]; // half-precision accumulators for the real output
-        int t = threadIdx.y * 32; // first of the 32 half columns handled by this threadIdx.y
-        for (int i = 0; i < 2; i++)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N);
-                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
-                wmma::load_matrix_sync(b_frag_real[i][j], x_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N); // rows are 2 * N halves wide
-                wmma::load_matrix_sync(b_frag_imag[i][j], x_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
-                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
-                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
-            }
-        }
-        for (int i = 0; i < 2; i++) // complex multiply of the input by the twiddles, element by element over fragment storage
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                for (int k = 0; k < tw_frag_real[i][j].num_elements; k++)
-                {
-                    tmp_real = __hsub(__hmul(tw_frag_real[i][j].x[k], b_frag_real[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_imag[i][j].x[k])); // re = tw_re * x_re - tw_im * x_im
-                    tmp_imag = __hadd(__hmul(tw_frag_real[i][j].x[k], b_frag_imag[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_real[i][j].x[k])); // im = tw_re * x_im + tw_im * x_re
-                    b_frag_real[i][j].x[k] = tmp_real;
-                    b_frag_imag[i][j].x[k] = tmp_imag;
-                }
-            }
-        }
-        for (int i = 0; i < 2; i++)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                wmma::fill_fragment(acc_frag_real[i][j], __float2half(0.0f));
-                // bd: accumulate the imaginary-by-imaginary product first
-                for (int k = 0; k < 2; k++)
-                {
-                    wmma::mma_sync(acc_frag_real[i][j], a_frag_imag[i][k], b_frag_imag[k][j], acc_frag_real[i][j]);
-                }
-                for (int k = 0; k < acc_frag_real[i][j].num_elements; k++)
-                {
-                    acc_frag_real[i][j].x[k] = __hneg(acc_frag_real[i][j].x[k]); // flip the sign so the next pass yields ac - bd
-                }
-            }
-        }
-        for (int i = 0; i < 2; i++)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                // ac - bd: add the real-by-real product onto the negated accumulator
-                for (int k = 0; k < 2; k++)
-                {
-                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag_real[k][j], acc_frag_real[i][j]);
-                }
-            }
-        }
-        for (int i = 0; i < 2; i++)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
-            }
-        }
-    }
-    __syncthreads(); // placed outside the branch so every thread reaches it; results are read by all rows below
-#pragma unroll
-    for (int i = 0; i < n; i++)
-    {
-        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        if(out_gate != nullptr){
-            out_real[idx] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x], out_gate[idx]); // gated write: result * out_gate
-        }
-        else{
-            out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
-        }
-    }
-}
-__global__ void butterfly_ifft_cuda_kernel_128( // fp16 butterfly iFFT step on 128-row tiles; each block walks 16 consecutive slices and writes only the real part of the result
-    const __half2 *__restrict__ x_real, // real part of the input, read as packed __half2
-    const __half2 *__restrict__ x_imag, // imaginary part of the input, read at the same indices as x_real
-    const complex_half_t *__restrict__ d_f, // complex matrix; its .real() / .imag() parts are split into two shared planes
-    const __half2 *__restrict__ twiddle_factors_real, // real part of the twiddle factors
-    const __half2 *__restrict__ twiddle_factors_imag, // imaginary part of the twiddle factors
-    __half2 *__restrict__ out_real, // output, written at the same indices x_real is read from
-    __half2 *__restrict__ out_gate, // optional element-wise multiplier for the output; nullptr disables gating
-    uint B, // not referenced in this kernel body
-    uint H, // scales the blockIdx.y stride in the offset below
-    int N) // used as wmma leading dimension for the input / output tiles; NOTE(review): everything else hard-codes 128, so presumably N == 128 -- confirm
-{
-     const int offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * 16 * 128 * 32 * 2 * gridDim.x + blockIdx.x * 64 + threadIdx.x; // __half2 index of this thread's first column in the first slice of the block: one slice = 128 rows of 64 * gridDim.x __half2
-    const int tw_offset = blockIdx.x * 64 + threadIdx.x; // first column of this thread inside a twiddle row
-    int idx;
-    int shared_offset;
-    const int B_Y = 8; // row step of the copy loops; NOTE(review): presumably must equal blockDim.y -- confirm against the launcher
-    const int n = 16; // 16 * B_Y = 128 rows per copy loop
-    extern __shared__ half real_shared[]; // dynamic shared memory, carved into four 128 * 128 half planes
-    half *imag_shared = &real_shared[128 * 128];
-    half *real_shared_2 = &imag_shared[128 * 128]; // holds the real part of d_f
-    half *imag_shared_2 = &real_shared_2[128 * 128]; // holds the imaginary part of d_f
-    __half2 tmp_real, tmp_imag;
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag[8][8]; // single 8 x 8 tile set, reloaded with the imaginary and then the real d_f plane for every slice
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[8]; // twiddle tiles, loaded once and reused for every slice
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[8];
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[8];
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[8];
-    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[8]; // half-precision accumulators for the real output
-    for (int i = 0; i < n; i++) // split d_f into its real / imaginary shared planes
-    {
-        for(int j=0; j< 4; j++){ // four columns per thread; covers all 128 columns only if blockDim.x == 32 -- TODO confirm launch config
-            shared_offset = (threadIdx.y + i * B_Y) * 128 + threadIdx.x + j * blockDim.x;
-            real_shared_2[shared_offset] = d_f[shared_offset].real();
-            imag_shared_2[shared_offset] = d_f[shared_offset].imag();
-        }
-    }
-    __syncthreads();
-    for (int i = 0; i < n; i++) // stage the twiddles in real_shared / imag_shared (these planes are reused for the input afterwards)
-    {
-        for(int j=0; j< 2; j++){
-            idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x;
-            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
-            reinterpret_cast<__half2*>(real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
-            reinterpret_cast<__half2*>(imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
-        }
-    }
-    __syncthreads(); // twiddles fully staged before they are read as tiles
-    for (int i = 0; i < 8; i++){
-        wmma::load_matrix_sync(tw_frag_real[i], real_shared + i * 128 * 16 + threadIdx.y * 16, 128); // tile at row block i, column block threadIdx.y
-        wmma::load_matrix_sync(tw_frag_imag[i], imag_shared + i * 128 * 16 + threadIdx.y * 16, 128);
-    }
-    __syncthreads(); // all twiddle tiles are in fragments before the planes are overwritten with input
-    for (int t = 0; t < 16; t++) // one iteration per slice handled by this block
-    {
-        for (int i = 0; i < n; i++) // copy slice t of the input into real_shared / imag_shared
-        {
-            for(int j=0; j< 2; j++){
-                idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
-                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
-                reinterpret_cast<__half2*>(real_shared)[shared_offset] = x_real[offset + idx];
-                reinterpret_cast<__half2*>(imag_shared)[shared_offset] = x_imag[offset + idx];
-            }
-        }
-        __syncthreads(); // input planes written by all threads before they are read as tiles
-        for (int i = 0; i < 8; i++)
-        {
-            wmma::load_matrix_sync(b_frag_real[i], real_shared + i * N * 16 + threadIdx.y * 16, N);
-            wmma::load_matrix_sync(b_frag_imag[i], imag_shared + i * N * 16 + threadIdx.y * 16, N);
-        }
-        for (int j = 0; j < 8; j++) // complex multiply of the input by the twiddles, two fragment elements at a time
-        {
-            for (int k = 0; k < tw_frag_real[j].num_elements/2; k++)
-            {
-                tmp_real = __hsub2(__hmul2(reinterpret_cast<__half2*>(tw_frag_real[j].x)[k], reinterpret_cast<__half2*>(b_frag_real[j].x)[k]), // re = tw_re * x_re - tw_im * x_im
-                 __hmul2(reinterpret_cast<__half2*>(tw_frag_imag[j].x)[k], reinterpret_cast<__half2*>(b_frag_imag[j].x)[k]));
-                tmp_imag = __hadd2(__hmul2(reinterpret_cast<__half2*>(tw_frag_real[j].x)[k], reinterpret_cast<__half2*>(b_frag_imag[j].x)[k]), // im = tw_re * x_im + tw_im * x_re
-                 __hmul2(reinterpret_cast<__half2*>(tw_frag_imag[j].x)[k], reinterpret_cast<__half2*>(b_frag_real[j].x)[k]));
-                reinterpret_cast<__half2*>(b_frag_real[j].x)[k] = tmp_real;
-                reinterpret_cast<__half2*>(b_frag_imag[j].x)[k] = tmp_imag;
-            }
-        }
-        for (int i = 0; i < 8; i++){ // load the imaginary d_f plane into a_frag
-            for (int j = 0; j < 8; j++){
-                wmma::load_matrix_sync(a_frag[i][j], imag_shared_2 + j * 128 * 16 + i * 16, 128);
-            }
-        }
-        __syncthreads();
-        for (int i = 0; i < 8; i++)
-        {
-            wmma::fill_fragment(acc_frag_real[i], __float2half(0.0f));
-// bd: accumulate the imaginary-by-imaginary product first
-#pragma unroll
-            for (int k = 0; k < 8; k++)
-            {
-                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_imag[k], acc_frag_real[i]);
-            }
-            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
-            {
-                acc_frag_real[i].x[k] = __hneg(acc_frag_real[i].x[k]); // flip the sign so the next pass yields ac - bd
-            }
-        }
-        for (int i = 0; i < 8; i++){ // reload a_frag, now with the real d_f plane
-            for (int j = 0; j < 8; j++){
-                wmma::load_matrix_sync(a_frag[i][j], real_shared_2 + j * 128 * 16 + i * 16, 128);
-            }
-        }
-        __syncthreads();
-        for (int i = 0; i < 8; i++)
-        {
-// ac - bd: add the real-by-real product onto the negated accumulator
-#pragma unroll
-            for (int k = 0; k < 8; k++)
-            {
-                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_real[k], acc_frag_real[i]);
-            }
-        }
-#pragma unroll
-        for (int i = 0; i < 8; i++)
-        {
-            wmma::store_matrix_sync(real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major); // result overwrites the input plane in real_shared
-        }
-        __syncthreads(); // every tile stored before the row-wise copy to global memory
-#pragma unroll
-        for (int i = 0; i < n; i++)
-        {
-            for(int j=0; j< 2; j++){
-                idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
-                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
-                if(out_gate != nullptr){
-                    out_real[offset + idx] = __hmul2(reinterpret_cast<__half2*>(real_shared)[shared_offset], out_gate[offset + idx]); // gated write: result * out_gate
-                }
-                else{
-                    out_real[offset + idx] = reinterpret_cast<__half2*>(real_shared)[shared_offset];
-                }
-            }
-        }
-        __syncthreads(); // real_shared is overwritten by the next slice
-    }
-}
-__global__ void butterfly_ifft_cuda_kernel_16( // fp16 butterfly iFFT step on 16-row tiles; one slice per block, only the real part of the result is written
-    const __half2 *__restrict__ x_real, // real part of the input, read as packed __half2
-    const __half2 *__restrict__ x_imag, // imaginary part of the input, read at the same indices as x_real
-    const complex_half_t *__restrict__ d_f, // complex matrix; its .real() / .imag() parts are split into two shared arrays
-    const __half2 *__restrict__ twiddle_factors_real, // real part of the twiddle factors
-    const __half2 *__restrict__ twiddle_factors_imag, // imaginary part of the twiddle factors
-    __half2 *__restrict__ out_real, // output, written at the same indices x_real is read from
-    __half2 *__restrict__ out_gate, // optional element-wise multiplier for the output; nullptr disables gating
-    uint B, // not referenced in this kernel body
-    uint H, // scales the blockIdx.y stride in the offset below
-    int N) // row count and d_f leading dimension; NOTE(review): shared arrays and global strides hard-code 16, so presumably N == 16 -- confirm
-{
-   const int offset = blockIdx.y * H * 16 * 32 * gridDim.x + blockIdx.z * 16 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x; // __half2 index of this thread's column in the slice of this block: one slice = 16 rows of 32 * gridDim.x __half2
-    const int tw_offset = blockIdx.x * 32 + threadIdx.x; // column of this thread inside a twiddle row
-    int idx;
-    int shared_offset;
-    const int B_Y = blockDim.y;
-    const int n = N / B_Y; // rows each threadIdx.y value covers in the copy loops
-    __shared__ half x_real_shared[16 * 64]; // 16 rows x 64 half columns of input (real)
-    __shared__ half x_imag_shared[16 * 64];
-    __shared__ half d_f_real[16 * 16];
-    __shared__ half d_f_imag[16 * 16];
-    __shared__ half twiddles_real_shared[16 * 64];
-    __shared__ half twiddles_imag_shared[16 * 64];
-    __shared__ half out_real_shared[16 * 64];
-    // #pragma unroll
-    for (int i = 0; i < n; i++) // stage input, twiddles and d_f into shared memory
-    {
-        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
-        reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
-        reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
-        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
-        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
-        if(threadIdx.x  < 16 ){ // d_f rows are only 16 elements wide, so only threads with threadIdx.x < 16 copy it
-            shared_offset = (threadIdx.y + i * B_Y) * 16 + threadIdx.x;
-            d_f_real[shared_offset] = d_f[shared_offset].real();
-            d_f_imag[shared_offset] = d_f[shared_offset].imag();
-        }
-    }
-    __syncthreads(); // shared arrays must be complete before any fragment load
-    //check if it is better to have one warp do all the multiplication or split between warps
-    if (threadIdx.y < 4) // each of the first four threadIdx.y rows handles one 16-column strip of the 64-wide tile
-    {
-        half tmp_real, tmp_imag;
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real;
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag;
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real;
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag;
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real;
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag;
-        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real; // half-precision accumulator for the real output
-        wmma::load_matrix_sync(a_frag_real, d_f_real, N);
-        wmma::load_matrix_sync(a_frag_imag, d_f_imag, N);
-        wmma::load_matrix_sync(b_frag_real, x_real_shared + threadIdx.y * 16, 64); // rows are 64 halves wide
-        wmma::load_matrix_sync(b_frag_imag, x_imag_shared + threadIdx.y * 16, 64);
-        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
-        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
-        for (int k = 0; k < tw_frag_real.num_elements; k++) // complex multiply of the input by the twiddles, element by element over fragment storage
-        {
-            tmp_real = __hsub(__hmul(tw_frag_real.x[k], b_frag_real.x[k]), __hmul(tw_frag_imag.x[k], b_frag_imag.x[k])); // re = tw_re * x_re - tw_im * x_im
-            tmp_imag = __hadd(__hmul(tw_frag_real.x[k], b_frag_imag.x[k]), __hmul(tw_frag_imag.x[k], b_frag_real.x[k])); // im = tw_re * x_im + tw_im * x_re
-            b_frag_real.x[k] = tmp_real;
-            b_frag_imag.x[k] = tmp_imag;
-        }
-        wmma::fill_fragment(acc_frag_real, __float2half(0.0f));
-        wmma::mma_sync(acc_frag_real, a_frag_imag, b_frag_imag, acc_frag_real); // imaginary-by-imaginary product
-        for(int k=0; k< acc_frag_real.num_elements; k++){
-            acc_frag_real.x[k] = __hneg(acc_frag_real.x[k]); // flip the sign so the next mma yields real*real - imag*imag
-        }
-        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag_real, acc_frag_real);
-        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
-    }
-    __syncthreads(); // placed outside the branch so every thread reaches it; results are read by all rows below
-#pragma unroll
-    for (int i = 0; i < n; i++)
-    {
-        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        if(out_gate != nullptr){
-            out_real[idx] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x], out_gate[idx]); // gated write: result * out_gate
-        }
-        else{
-            out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
-        }
-    }
-}
-// Host-side launcher for the fp16 butterfly iFFT kernels.
-//
-// x_real is read as a 4-D tensor (B, H, N, M). One tensor with the same shape
-// and options is allocated, filled by the kernel specialised for N, and
-// returned.
-//
-// Parameters:
-//   x_real, x_imag        real / imaginary input; storage is reinterpreted as __half2
-//   d_f                   complex matrix, reinterpreted as complex_half_t
-//   twiddle_factors_real,
-//   twiddle_factors_imag  real / imaginary twiddle factors
-//   out_gate              optional output gate; the kernel receives nullptr when absent
-//
-// Throws (TORCH_CHECK) when x_real is not a contiguous 4-D CUDA half tensor,
-// when x_imag has a different shape (the kernels index both identically),
-// when N is not one of 16, 32, 64, 128, when H is not a multiple of 16 for
-// N == 64 / N == 128 (each block of those kernels processes 16 slices), or
-// when the CUDA runtime reports an error. Previously an unsupported N only
-// printed a message and returned an uninitialised tensor.
-//
-// NOTE(review): launches pass no stream argument, so they go to the default
-// stream rather than PyTorch's current stream -- confirm this is intended.
-torch::Tensor butterfly_ifft_cuda(
-    torch::Tensor x_real,
-    torch::Tensor x_imag,
-    torch::Tensor d_f,
-    torch::Tensor twiddle_factors_real,
-    torch::Tensor twiddle_factors_imag,
-    std::optional<at::Tensor> out_gate = std::nullopt)
-{
-    TORCH_CHECK(x_real.is_cuda(), "butterfly_ifft_cuda: x_real must be a CUDA tensor");
-    TORCH_CHECK(x_real.dim() == 4, "butterfly_ifft_cuda: x_real must be 4-D (B, H, N, M), got ", x_real.dim(), " dimensions");
-    TORCH_CHECK(x_real.scalar_type() == torch::kHalf, "butterfly_ifft_cuda: x_real must be float16");
-    TORCH_CHECK(x_real.is_contiguous(), "butterfly_ifft_cuda: x_real must be contiguous");
-    TORCH_CHECK(x_imag.sizes() == x_real.sizes(), "butterfly_ifft_cuda: x_imag must have the same shape as x_real");
-
-    uint B = x_real.size(0);
-    uint H = x_real.size(1);
-    uint N = x_real.size(2);
-    uint M = x_real.size(3);
-
-    TORCH_CHECK(N == 16 || N == 32 || N == 64 || N == 128,
-                "butterfly_ifft_cuda: N = ", N, " is not implemented (supported: 16, 32, 64, 128)");
-
-    // The 64 / 128 kernels loop over 16 slices per block and the grid only
-    // has H / 16 blocks along z, so a remainder would leave part of the
-    // output unwritten and H < 16 would request an empty grid.
-    const bool heads_grouped_by_16 = (N == 64 || N == 128);
-    TORCH_CHECK(!heads_grouped_by_16 || (H >= 16 && H % 16 == 0),
-                "butterfly_ifft_cuda: H must be a positive multiple of 16 when N is 64 or 128, got ", H);
-
-    torch::Tensor out = torch::empty({B, H, N, M}, x_real.options());
-
-    const bool wide = (N == 128);
-
-    dim3 block;
-    block.x = 32;
-    block.y = wide ? 8 : 4;
-
-    dim3 grid;
-    grid.y = B;
-    grid.z = heads_grouped_by_16 ? H / 16 : H;
-
-    // Grid width: the tabulated widths give M / 128 blocks for N == 128 and
-    // M / 64 otherwise; every other M keeps the original fallback (256 / 512).
-    switch (M)
-    {
-    case 16384:
-    case 8192:
-    case 4096:
-        grid.x = M / (wide ? 128 : 64);
-        break;
-    default:
-        grid.x = wide ? 256 : 512;
-        break;
-    }
-
-    // Turns a CUDA runtime status into a C++ exception with context.
-    const auto check_cuda = [](cudaError_t status, const char *what) {
-        TORCH_CHECK(status == cudaSuccess,
-                    "butterfly_ifft_cuda: ", what, " failed: ", cudaGetErrorString(status));
-    };
-
-    // Device pointers shared by every specialisation.
-    auto *x_real_ptr = static_cast<__half2 *>(x_real.data_ptr());
-    auto *x_imag_ptr = static_cast<__half2 *>(x_imag.data_ptr());
-    auto *d_f_ptr = static_cast<complex_half_t *>(d_f.data_ptr());
-    auto *tw_real_ptr = static_cast<__half2 *>(twiddle_factors_real.data_ptr());
-    auto *tw_imag_ptr = static_cast<__half2 *>(twiddle_factors_imag.data_ptr());
-    auto *out_ptr = static_cast<__half2 *>(out.data_ptr());
-    auto *gate_ptr = out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr;
-
-    switch (N)
-    {
-    case 16:
-        butterfly_ifft_cuda_kernel_16<<<grid, block>>>(
-            x_real_ptr, x_imag_ptr, d_f_ptr, tw_real_ptr, tw_imag_ptr, out_ptr, gate_ptr, B, H, N);
-        break;
-    case 32:
-        butterfly_ifft_cuda_kernel_32<<<grid, block>>>(
-            x_real_ptr, x_imag_ptr, d_f_ptr, tw_real_ptr, tw_imag_ptr, out_ptr, gate_ptr, B, H, N);
-        break;
-    case 64:
-        // More than 48 KB of dynamic shared memory requires an explicit opt-in.
-        check_cuda(cudaFuncSetAttribute(&butterfly_ifft_cuda_kernel_64, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536),
-                   "cudaFuncSetAttribute(butterfly_ifft_cuda_kernel_64)");
-        butterfly_ifft_cuda_kernel_64<<<grid, block, 8 * N * N * sizeof(half)>>>(
-            x_real_ptr, x_imag_ptr, d_f_ptr, tw_real_ptr, tw_imag_ptr, out_ptr, gate_ptr, B, H, N);
-        break;
-    case 128:
-        check_cuda(cudaFuncSetAttribute(&butterfly_ifft_cuda_kernel_128, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536*2),
-                   "cudaFuncSetAttribute(butterfly_ifft_cuda_kernel_128)");
-        butterfly_ifft_cuda_kernel_128<<<grid, block, 65536*2>>>(
-            x_real_ptr, x_imag_ptr, d_f_ptr, tw_real_ptr, tw_imag_ptr, out_ptr, gate_ptr, B, H, N);
-        break;
-    default:
-        // Unreachable: N was validated above.
-        break;
-    }
-    // Launch-configuration errors are only visible through the runtime's
-    // last-error state.
-    check_cuda(cudaGetLastError(), "kernel launch");
-    return out;
-}

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include "shared.h"
+using namespace nvcuda;
+__global__ void butterfly_ifft_cuda_kernel_64( // N = 64 inverse butterfly stage: multiplies x by the twiddles (complex, element-wise), applies d_f with tensor-core MMAs and writes only the real part, times out_gate when given
+    const __half2 *__restrict__ x_real, // real plane of the input, addressed in __half2 units
+    const __half2 *__restrict__ x_imag, // imaginary plane of the input, same indexing as x_real
+    const complex_half_t *__restrict__ d_f, // complex matrix read with a row stride of 64 elements
+    const __half2 *__restrict__ twiddle_factors_real, // real plane of the twiddles; indexed by row/column only, so the same values are used for every blockIdx.y / blockIdx.z
+    const __half2 *__restrict__ twiddle_factors_imag, // imaginary plane of the twiddles, same indexing as twiddle_factors_real
+    __half2 *__restrict__ out_real, // output (real part only), same indexing as x_real
+    __half2 *__restrict__ out_gate, // optional element-wise multiplier for the output; nullptr disables gating
+    uint B, // not read inside the kernel body
+    uint H, // number of matrices per blockIdx.y slice; only used in the offset computation
+    int N) // transform size; the hard-coded 64s and the 4 x 4 tiling below assume N == 64
+{
+    const int offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x; // base __half2 index: blockIdx.y slice, group of 16 matrices (blockIdx.z), 32-half2 column tile (blockIdx.x), lane column
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x; // column position of this thread inside one twiddle row
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y; // rows handled per threadIdx.y; rows are interleaved with stride blockDim.y
+    extern __shared__ half x_real_shared[]; // dynamic shared memory, carved into seven N * N half regions below
+    half *x_imag_shared = &x_real_shared[N * N];
+    half *d_f_real = &x_imag_shared[N * N];
+    half *d_f_imag = &d_f_real[N * N];
+    half *twiddles_real_shared = &d_f_imag[N * N];
+    half *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    half *out_real_shared = &twiddles_imag_shared[N * N];
+    half tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[4][4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[4][4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[4];
+    // #pragma unroll
+    for (int i = 0; i < n; i++) // stage this block's twiddle tile and the d_f matrix in shared memory
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x; // start of the row in global memory; one row holds 32 * gridDim.x __half2 values
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x; // d_f is split into separate real / imag planes
+        d_f_real[shared_offset] = d_f[shared_offset].real();
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        d_f_real[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].real(); // second store covers columns 32..63 — assumes blockDim.x == 32
+        d_f_imag[shared_offset + blockDim.x] = d_f[shared_offset + blockDim.x].imag();
+    }
+    __syncthreads(); // twiddles and d_f are fully staged before any warp builds fragments from them
+    for (int i = 0; i < 4; i++) // these fragments are loaded once and reused for all 16 matrices below
+    {
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N); // col_major load from the row-major shared copy, i.e. the fragment sees the transposed tile
+            wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
+        }
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + i * N * 16 + threadIdx.y * 16, N); // each threadIdx.y owns one 16-column strip of the tile
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+    }
+    for (int t = 0; t < 16; t++) // one iteration per matrix; the block walks 16 consecutive matrices
+    {
+        for (int i = 0; i < n; i++)
+        {
+            idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x; // t advances by one whole matrix (64 rows)
+            shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+            reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+            reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        }
+        __syncthreads(); // the input tile is complete before fragments are loaded from it
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], x_real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], x_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        for (int j = 0; j < 4; j++) // element-wise complex multiply b <- twiddle * b; both operands are matrix_b / row_major fragments loaded with the same stride
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
+            {
+                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
+                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
+                b_frag_real[j].x[k] = tmp_real;
+                b_frag_imag[j].x[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < 4; i++) // first pass: acc = -(imag(d_f) * imag(b))
+        {
+            wmma::fill_fragment(acc_frag_real[i], __float2half(0.0f));
+// bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_imag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = __hneg(acc_frag_real[i].x[k]); // negate so the next pass can simply accumulate
+            }
+        }
+        for (int i = 0; i < 4; i++) // second pass: acc += real(d_f) * real(b), which yields the real part of the complex product
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_real[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::store_matrix_sync(out_real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major); // write this warp's 16-column strip of the result tile
+        }
+        __syncthreads(); // every strip of the result tile is in shared memory before the copy-out
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            if(out_gate != nullptr){ // gated path: multiply by out_gate at the same global index
+                out_real[idx] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x], out_gate[idx]);
+            }
+            else{
+                out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+            }
+        }
+        __syncthreads(); // shared buffers are rewritten by the next iteration, so wait until all threads finished reading them
+    }
+}
+__global__ void butterfly_ifft_cuda_kernel_32( // N = 32 inverse butterfly stage: multiplies x by the twiddles (complex, element-wise), applies d_f with tensor-core MMAs and writes only the real part, times out_gate when given; one matrix tile per block
+    const __half2 *__restrict__ x_real, // real plane of the input, addressed in __half2 units
+    const __half2 *__restrict__ x_imag, // imaginary plane of the input, same indexing as x_real
+    const complex_half_t *__restrict__ d_f, // complex matrix read with a row stride of 32 elements
+    const __half2 *__restrict__ twiddle_factors_real, // real plane of the twiddles; indexed by row/column only
+    const __half2 *__restrict__ twiddle_factors_imag, // imaginary plane of the twiddles, same indexing as twiddle_factors_real
+    __half2 *__restrict__ out_real, // output (real part only), same indexing as x_real
+    __half2 *__restrict__ out_gate, // optional element-wise multiplier for the output; nullptr disables gating
+    uint B, // not read inside the kernel body
+    uint H, // number of matrices per blockIdx.y slice; only used in the offset computation
+    int N) // transform size; the static buffers and the 2 x 2 tiling below assume N == 32
+{
+    const int offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x; // base __half2 index: blockIdx.y slice, single matrix (blockIdx.z), 32-half2 column tile (blockIdx.x), lane column
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x; // column position of this thread inside one twiddle row
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y; // rows handled per threadIdx.y; rows are interleaved with stride blockDim.y
+    __shared__ half x_real_shared[32 * 64]; // 32 rows x 64 halves (32 __half2) per tile
+    __shared__ half x_imag_shared[32 * 64];
+    __shared__ half d_f_real[32 * 32];
+    __shared__ half d_f_imag[32 * 32];
+    __shared__ half twiddles_real_shared[32 * 64];
+    __shared__ half twiddles_imag_shared[32 * 64];
+    __shared__ half out_real_shared[32 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++) // stage the input tile, the twiddle tile and d_f in shared memory
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x; // start of the row in global memory; one row holds 32 * gridDim.x __half2 values
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+        reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        d_f_real[shared_offset] = d_f[shared_offset].real(); // the same row * 32 + column index addresses the 32 x 32 d_f — assumes blockDim.x == 32
+        d_f_imag[shared_offset] = d_f[shared_offset].imag();
+    }
+    __syncthreads(); // all shared tiles are complete before fragments are loaded
+    if (threadIdx.y < N / 16) // only the first N / 16 values of threadIdx.y run the MMA part; each covers a 32-column half of the tile
+    {
+        half tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real[2][2];
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[2][2];
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[2][2];
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[2][2];
+        int t = threadIdx.y * 32; // column offset (in halves) of this warp's half of the tile
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N); // col_major load from the row-major shared copy, i.e. the fragment sees the transposed tile
+                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(b_frag_real[i][j], x_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N); // shared rows are 2 * N = 64 halves wide
+                wmma::load_matrix_sync(b_frag_imag[i][j], x_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
+            }
+        }
+        for (int i = 0; i < 2; i++) // element-wise complex multiply b <- twiddle * b; both operands are matrix_b / row_major fragments loaded with the same stride
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                for (int k = 0; k < tw_frag_real[i][j].num_elements; k++)
+                {
+                    tmp_real = __hsub(__hmul(tw_frag_real[i][j].x[k], b_frag_real[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_imag[i][j].x[k]));
+                    tmp_imag = __hadd(__hmul(tw_frag_real[i][j].x[k], b_frag_imag[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_real[i][j].x[k]));
+                    b_frag_real[i][j].x[k] = tmp_real;
+                    b_frag_imag[i][j].x[k] = tmp_imag;
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++) // first pass: acc = -(imag(d_f) * imag(b))
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::fill_fragment(acc_frag_real[i][j], __float2half(0.0f));
+                // bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_imag[i][k], b_frag_imag[k][j], acc_frag_real[i][j]);
+                }
+                for (int k = 0; k < acc_frag_real[i][j].num_elements; k++)
+                {
+                    acc_frag_real[i][j].x[k] = __hneg(acc_frag_real[i][j].x[k]); // negate so the next pass can simply accumulate
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++) // second pass: acc += real(d_f) * real(b), which yields the real part of the complex product
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                // ac - bd
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag_real[k][j], acc_frag_real[i][j]);
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads(); // placed outside the branch so every thread of the block reaches it; the result tile is complete afterwards
+#pragma unroll
+    for (int i = 0; i < n; i++) // all threads copy the result tile out, row-interleaved like the load
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        if(out_gate != nullptr){ // gated path: multiply by out_gate at the same global index
+            out_real[idx] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x], out_gate[idx]);
+        }
+        else{
+            out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+        }
+    }
+}
+__global__ void butterfly_ifft_cuda_kernel_128( // N = 128 inverse butterfly stage: multiplies x by the twiddles (complex, element-wise), applies d_f with tensor-core MMAs and writes only the real part, times out_gate when given; 16 consecutive matrices per block
+    const __half2 *__restrict__ x_real, // real plane of the input, addressed in __half2 units
+    const __half2 *__restrict__ x_imag, // imaginary plane of the input, same indexing as x_real
+    const complex_half_t *__restrict__ d_f, // complex matrix read with a row stride of 128 elements
+    const __half2 *__restrict__ twiddle_factors_real, // real plane of the twiddles; indexed by row/column only
+    const __half2 *__restrict__ twiddle_factors_imag, // imaginary plane of the twiddles, same indexing as twiddle_factors_real
+    __half2 *__restrict__ out_real, // output (real part only), same indexing as x_real
+    __half2 *__restrict__ out_gate, // optional element-wise multiplier for the output; nullptr disables gating
+    uint B, // not read inside the kernel body
+    uint H, // number of matrices per blockIdx.y slice; only used in the offset computation
+    int N) // only used as the leading dimension for some fragment loads / stores; the rest of the body hard-codes 128
+{
+     const int offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * 16 * 128 * 32 * 2 * gridDim.x + blockIdx.x * 64 + threadIdx.x; // base __half2 index: blockIdx.y slice, group of 16 matrices (blockIdx.z), 64-half2 column tile (blockIdx.x), lane column
+    const int tw_offset = blockIdx.x * 64 + threadIdx.x; // column position of this thread inside one twiddle row
+    int idx;
+    int shared_offset;
+    const int B_Y = 8; // hard-coded row interleave — assumes the kernel is launched with blockDim.y == 8
+    const int n = 16; // 8 * 16 = 128 rows per tile
+    extern __shared__ half real_shared[]; // dynamic shared memory, carved into four 128 * 128 half regions below
+    half *imag_shared = &real_shared[128 * 128];
+    half *real_shared_2 = &imag_shared[128 * 128]; // holds real(d_f) for the whole kernel
+    half *imag_shared_2 = &real_shared_2[128 * 128]; // holds imag(d_f) for the whole kernel
+    __half2 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag[8][8]; // single fragment set, reloaded with imag(d_f) and then real(d_f) in every iteration
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real[8];
+    for (int i = 0; i < n; i++) // copy d_f into separate real / imag planes in shared memory
+    {
+        for(int j=0; j< 4; j++){ // four stores per thread per row cover 128 columns — assumes blockDim.x == 32
+            shared_offset = (threadIdx.y + i * B_Y) * 128 + threadIdx.x + j * blockDim.x;
+            real_shared_2[shared_offset] = d_f[shared_offset].real();
+            imag_shared_2[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    __syncthreads();
+    for (int i = 0; i < n; i++) // stage this block's twiddle tile in real_shared / imag_shared
+    {
+        for(int j=0; j< 2; j++){ // two __half2 per thread per row: 64 __half2 = 128 halves
+            idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x; // one global row holds 64 * gridDim.x __half2 values
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__half2*>(real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+            reinterpret_cast<__half2*>(imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        }
+    }
+    __syncthreads(); // the twiddle tile is complete before fragments are loaded from it
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], real_shared + i * 128 * 16 + threadIdx.y * 16, 128); // each threadIdx.y owns one 16-column strip of the tile
+        wmma::load_matrix_sync(tw_frag_imag[i], imag_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    __syncthreads(); // twiddles now live in fragments; real_shared / imag_shared are overwritten with input tiles below
+    for (int t = 0; t < 16; t++) // one iteration per matrix; the block walks 16 consecutive matrices
+    {
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x; // t advances by one whole matrix (128 rows)
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                reinterpret_cast<__half2*>(real_shared)[shared_offset] = x_real[offset + idx];
+                reinterpret_cast<__half2*>(imag_shared)[shared_offset] = x_imag[offset + idx];
+            }
+        }
+        __syncthreads(); // the input tile is complete before fragments are loaded from it
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], real_shared + i * N * 16 + threadIdx.y * 16, N); // uses N as the leading dimension where the twiddle load above used the literal 128
+            wmma::load_matrix_sync(b_frag_imag[i], imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        for (int j = 0; j < 8; j++) // element-wise complex multiply b <- twiddle * b, two fragment elements at a time via __half2
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements/2; k++)
+            {
+                tmp_real = __hsub2(__hmul2(reinterpret_cast<__half2*>(tw_frag_real[j].x)[k], reinterpret_cast<__half2*>(b_frag_real[j].x)[k]),
+                 __hmul2(reinterpret_cast<__half2*>(tw_frag_imag[j].x)[k], reinterpret_cast<__half2*>(b_frag_imag[j].x)[k]));
+                tmp_imag = __hadd2(__hmul2(reinterpret_cast<__half2*>(tw_frag_real[j].x)[k], reinterpret_cast<__half2*>(b_frag_imag[j].x)[k]),
+                 __hmul2(reinterpret_cast<__half2*>(tw_frag_imag[j].x)[k], reinterpret_cast<__half2*>(b_frag_real[j].x)[k]));
+                reinterpret_cast<__half2*>(b_frag_real[j].x)[k] = tmp_real;
+                reinterpret_cast<__half2*>(b_frag_imag[j].x)[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < 8; i++){ // load imag(d_f) into a_frag for the first pass
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], imag_shared_2 + j * 128 * 16 + i * 16, 128); // col_major load from the row-major shared copy, i.e. the fragment sees the transposed tile
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++) // first pass: acc = -(imag(d_f) * imag(b))
+        {
+            wmma::fill_fragment(acc_frag_real[i], __float2half(0.0f));
+// bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = __hneg(acc_frag_real[i].x[k]); // negate so the next pass can simply accumulate
+            }
+        }
+        for (int i = 0; i < 8; i++){ // reuse a_frag for real(d_f) in the second pass
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], real_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++) // second pass: acc += real(d_f) * real(b), which yields the real part of the complex product
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::store_matrix_sync(real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major); // the result overwrites the staged real input tile; there is no separate output buffer
+        }
+        __syncthreads(); // every strip of the result tile is in shared memory before the copy-out
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                if(out_gate != nullptr){ // gated path: multiply by out_gate at the same global index
+                    out_real[offset + idx] = __hmul2(reinterpret_cast<__half2*>(real_shared)[shared_offset], out_gate[offset + idx]);
+                }
+                else{
+                    out_real[offset + idx] = reinterpret_cast<__half2*>(real_shared)[shared_offset];
+                }
+            }
+        }
+        __syncthreads(); // real_shared is refilled with the next input tile, so wait until all threads finished reading it
+    }
+}
+__global__ void butterfly_ifft_cuda_kernel_16( // N = 16 inverse butterfly stage: multiplies x by the twiddles (complex, element-wise), applies d_f with tensor-core MMAs and writes only the real part, times out_gate when given; one matrix tile per block
+    const __half2 *__restrict__ x_real, // real plane of the input, addressed in __half2 units
+    const __half2 *__restrict__ x_imag, // imaginary plane of the input, same indexing as x_real
+    const complex_half_t *__restrict__ d_f, // complex matrix read with a row stride of 16 elements
+    const __half2 *__restrict__ twiddle_factors_real, // real plane of the twiddles; indexed by row/column only
+    const __half2 *__restrict__ twiddle_factors_imag, // imaginary plane of the twiddles, same indexing as twiddle_factors_real
+    __half2 *__restrict__ out_real, // output (real part only), same indexing as x_real
+    __half2 *__restrict__ out_gate, // optional element-wise multiplier for the output; nullptr disables gating
+    uint B, // not read inside the kernel body
+    uint H, // number of matrices per blockIdx.y slice; only used in the offset computation
+    int N) // transform size; the static buffers and the single-tile MMA below assume N == 16
+{
+   const int offset = blockIdx.y * H * 16 * 32 * gridDim.x + blockIdx.z * 16 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x; // base __half2 index: blockIdx.y slice, single matrix (blockIdx.z), 32-half2 column tile (blockIdx.x), lane column
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x; // column position of this thread inside one twiddle row
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y; // rows handled per threadIdx.y; rows are interleaved with stride blockDim.y
+    __shared__ half x_real_shared[16 * 64]; // 16 rows x 64 halves (32 __half2) per tile
+    __shared__ half x_imag_shared[16 * 64];
+    __shared__ half d_f_real[16 * 16];
+    __shared__ half d_f_imag[16 * 16];
+    __shared__ half twiddles_real_shared[16 * 64];
+    __shared__ half twiddles_imag_shared[16 * 64];
+    __shared__ half out_real_shared[16 * 64];
+    // #pragma unroll
+    for (int i = 0; i < n; i++) // stage the input tile, the twiddle tile and d_f in shared memory
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x; // start of the row in global memory; one row holds 32 * gridDim.x __half2 values
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__half2 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+        reinterpret_cast<__half2 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        reinterpret_cast<__half2 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__half2 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        if(threadIdx.x  < 16 ){ // d_f has only 16 columns, so only the first 16 lanes copy it
+            shared_offset = (threadIdx.y + i * B_Y) * 16 + threadIdx.x;
+            d_f_real[shared_offset] = d_f[shared_offset].real();
+            d_f_imag[shared_offset] = d_f[shared_offset].imag();
+        }
+    }
+    __syncthreads(); // all shared tiles are complete before fragments are loaded
+    //check if it is better to have one warp do all the multiplication or split between warps
+    if (threadIdx.y < 4) // threadIdx.y 0..3 each compute one 16-column strip of the 64-column tile
+    {
+        half tmp_real, tmp_imag;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_real;
+        wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> tw_frag_imag;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_real;
+        wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag_imag;
+        wmma::fragment<wmma::accumulator, 16, 16, 16, half> acc_frag_real;
+        wmma::load_matrix_sync(a_frag_real, d_f_real, N); // col_major load from the row-major shared copy, i.e. the fragment sees the transposed matrix
+        wmma::load_matrix_sync(a_frag_imag, d_f_imag, N);
+        wmma::load_matrix_sync(b_frag_real, x_real_shared + threadIdx.y * 16, 64); // shared rows are 64 halves wide
+        wmma::load_matrix_sync(b_frag_imag, x_imag_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
+        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
+        for (int k = 0; k < tw_frag_real.num_elements; k++) // element-wise complex multiply b <- twiddle * b; both operands are matrix_b / row_major fragments loaded with the same stride
+        {
+            tmp_real = __hsub(__hmul(tw_frag_real.x[k], b_frag_real.x[k]), __hmul(tw_frag_imag.x[k], b_frag_imag.x[k]));
+            tmp_imag = __hadd(__hmul(tw_frag_real.x[k], b_frag_imag.x[k]), __hmul(tw_frag_imag.x[k], b_frag_real.x[k]));
+            b_frag_real.x[k] = tmp_real;
+            b_frag_imag.x[k] = tmp_imag;
+        }
+        wmma::fill_fragment(acc_frag_real, __float2half(0.0f));
+        wmma::mma_sync(acc_frag_real, a_frag_imag, b_frag_imag, acc_frag_real); // acc = imag(d_f) * imag(b)
+        for(int k=0; k< acc_frag_real.num_elements; k++){
+            acc_frag_real.x[k] = __hneg(acc_frag_real.x[k]); // negate so the next MMA can simply accumulate
+        }
+        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag_real, acc_frag_real); // acc += real(d_f) * real(b), which yields the real part of the complex product
+        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
+    }
+    __syncthreads(); // placed outside the branch so every thread of the block reaches it; the result tile is complete afterwards
+#pragma unroll
+    for (int i = 0; i < n; i++) // all threads copy the result tile out, row-interleaved like the load
+    {
+        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        if(out_gate != nullptr){ // gated path: multiply by out_gate at the same global index
+            out_real[idx] = __hmul2(reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x], out_gate[idx]);
+        }
+        else{
+            out_real[idx] = reinterpret_cast<__half2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x];
+        }
+    }
+}
+/*
+ * Host-side launcher for the fp16 inverse butterfly kernels in this file.
+ *
+ * x_real / x_imag are read as (B, H, N, M): size(0..3) of x_real supply the
+ * launch geometry and the shape of the returned tensor. N selects the kernel
+ * (16, 32, 64 or 128). out_gate, when present, is multiplied element-wise
+ * into the result inside the kernel.
+ *
+ * Launch geometry, as required by the kernels' own index arithmetic:
+ *   - block = (32, 4), or (32, 8) for N == 128.
+ *   - grid.x = M / 64, or M / 128 for N == 128. The kernels use gridDim.x as
+ *     the row stride, so M must be an exact multiple of that tile width.
+ *   - grid.y = B.
+ *   - grid.z = H for N == 16 / 32; H / 16 for N == 64 / 128, because those
+ *     kernels loop over 16 consecutive matrices per block.
+ *
+ * Raises (via TORCH_CHECK) on unsupported N, on M / H values the kernels
+ * cannot cover exactly, and on CUDA attribute or launch errors. An input with
+ * zero elements returns an empty tensor without launching anything.
+ *
+ * NOTE(review): the kernels are launched on the default stream rather than on
+ * PyTorch's current stream -- confirm that callers do not rely on a
+ * non-default stream.
+ */
+torch::Tensor butterfly_ifft_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> out_gate = std::nullopt)
+{
+    TORCH_CHECK(x_real.dim() == 4, "butterfly_ifft_cuda: x_real must be 4-D (B, H, N, M), got ", x_real.dim(), " dimensions");
+    const uint B = x_real.size(0);
+    const uint H = x_real.size(1);
+    const uint N = x_real.size(2);
+    const uint M = x_real.size(3);
+    torch::Tensor out = torch::empty({B, H, N, M}, x_real.options());
+    // A zero-sized grid is an invalid launch configuration; nothing to compute anyway.
+    if (out.numel() == 0)
+    {
+        return out;
+    }
+    // Previously an unsupported N only printed a message and returned uninitialised memory.
+    TORCH_CHECK(N == 16 || N == 32 || N == 64 || N == 128,
+                "butterfly_ifft_cuda: N = ", N, " is not implemented (supported: 16, 32, 64, 128)");
+    const bool is_128 = (N == 128);
+    // The N = 64 / 128 kernels process 16 consecutive matrices along H per block.
+    const bool sixteen_per_block = (N == 64 || N == 128);
+    // Columns (in halves) covered by one block along M: 32 __half2, or 64 __half2 for N == 128.
+    const uint cols_per_block = is_128 ? 128 : 64;
+    TORCH_CHECK(M % cols_per_block == 0,
+                "butterfly_ifft_cuda: M = ", M, " must be a multiple of ", cols_per_block, " for N = ", N);
+    TORCH_CHECK(!sixteen_per_block || H % 16 == 0,
+                "butterfly_ifft_cuda: H = ", H, " must be a multiple of 16 for N = ", N);
+    const dim3 block(32, is_128 ? 8 : 4);
+    // Same grid.x as the former lookup table for M = 4096 / 8192 / 16384 / 32768.
+    const dim3 grid(M / cols_per_block, B, sixteen_per_block ? H / 16 : H);
+    const __half2 *x_real_ptr = static_cast<__half2 *>(x_real.data_ptr());
+    const __half2 *x_imag_ptr = static_cast<__half2 *>(x_imag.data_ptr());
+    const complex_half_t *d_f_ptr = static_cast<complex_half_t *>(d_f.data_ptr());
+    const __half2 *tw_real_ptr = static_cast<__half2 *>(twiddle_factors_real.data_ptr());
+    const __half2 *tw_imag_ptr = static_cast<__half2 *>(twiddle_factors_imag.data_ptr());
+    __half2 *out_ptr = static_cast<__half2 *>(out.data_ptr());
+    __half2 *gate_ptr = out_gate ? static_cast<__half2 *>(out_gate.value().data_ptr()) : nullptr;
+    switch (N)
+    {
+    case 16:
+        butterfly_ifft_cuda_kernel_16<<<grid, block>>>(
+            x_real_ptr, x_imag_ptr, d_f_ptr, tw_real_ptr, tw_imag_ptr, out_ptr, gate_ptr, B, H, N);
+        break;
+    case 32:
+        butterfly_ifft_cuda_kernel_32<<<grid, block>>>(
+            x_real_ptr, x_imag_ptr, d_f_ptr, tw_real_ptr, tw_imag_ptr, out_ptr, gate_ptr, B, H, N);
+        break;
+    case 64:
+    {
+        // 8 * 64 * 64 halves = 65536 bytes; above the 48 KB default, so the kernel must opt in.
+        const size_t smem_bytes = 8 * N * N * sizeof(half);
+        const cudaError_t attr_err = cudaFuncSetAttribute(
+            &butterfly_ifft_cuda_kernel_64, cudaFuncAttributeMaxDynamicSharedMemorySize, static_cast<int>(smem_bytes));
+        TORCH_CHECK(attr_err == cudaSuccess,
+                    "butterfly_ifft_cuda: cudaFuncSetAttribute failed: ", cudaGetErrorString(attr_err));
+        butterfly_ifft_cuda_kernel_64<<<grid, block, smem_bytes>>>(
+            x_real_ptr, x_imag_ptr, d_f_ptr, tw_real_ptr, tw_imag_ptr, out_ptr, gate_ptr, B, H, N);
+        break;
+    }
+    case 128:
+    {
+        // Four 128 * 128 half buffers = 131072 bytes of dynamic shared memory.
+        const size_t smem_bytes = 65536 * 2;
+        const cudaError_t attr_err = cudaFuncSetAttribute(
+            &butterfly_ifft_cuda_kernel_128, cudaFuncAttributeMaxDynamicSharedMemorySize, static_cast<int>(smem_bytes));
+        TORCH_CHECK(attr_err == cudaSuccess,
+                    "butterfly_ifft_cuda: cudaFuncSetAttribute failed: ", cudaGetErrorString(attr_err));
+        butterfly_ifft_cuda_kernel_128<<<grid, block, smem_bytes>>>(
+            x_real_ptr, x_imag_ptr, d_f_ptr, tw_real_ptr, tw_imag_ptr, out_ptr, gate_ptr, B, H, N);
+        break;
+    }
+    default:
+        // Unreachable: N was validated above.
+        break;
+    }
+    // Kernel launches do not return a status; configuration errors surface here.
+    const cudaError_t launch_err = cudaGetLastError();
+    TORCH_CHECK(launch_err == cudaSuccess,
+                "butterfly_ifft_cuda: kernel launch failed: ", cudaGetErrorString(launch_err));
+    return out;
+}

overlay/kernels/cuda/flashfftconv/csrc/butterfly/butterfly_ifft_cuda_bf16.cu CHANGED Viewed

@@ -1,705 +1,705 @@
-// Copyright (c) 2023 Dan Fu, Hermann Kumbong
-#include <torch/extension.h>
-#include <vector>
-#include <stdio.h>
-#include <mma.h>
-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
-#include <cuda_runtime.h>
-#include "shared.h"
-using namespace nvcuda;
-__global__ void butterfly_ifft_bf16_cuda_kernel_64( // bf16 WMMA kernel for N = 64: multiplies each x tile element-wise by the twiddle factors (complex), then writes the real part of the complex matrix product of the d_f fragments with that tile to out_real
-    const __nv_bfloat162 *__restrict__ x_real, // real parts of the input, read as bf16 pairs
-    const __nv_bfloat162 *__restrict__ x_imag, // imaginary parts of the input, read as bf16 pairs
-    const __nv_bfloat162 *__restrict__ d_f_real, // real part of the matrix used as WMMA matrix_a (presumably the iDFT matrix -- TODO confirm)
-    const __nv_bfloat162 *__restrict__ d_f_imag, // imaginary part of the same matrix
-    const __nv_bfloat162 *__restrict__ twiddle_factors_real, // real part of the element-wise twiddle multipliers
-    const __nv_bfloat162 *__restrict__ twiddle_factors_imag, // imaginary part of the element-wise twiddle multipliers
-    __nv_bfloat162 *__restrict__ out_real, // output (real part only), written as bf16 pairs
-    __nv_bfloat162 *__restrict__ out_gate, // optional element-wise multiplier applied to the output; may be nullptr
-    uint B, // not referenced in this kernel body
-    uint H, // used only to compute the per-blockIdx.y stride in `offset`
-    int N) // transform size; the offsets and the 4x4 fragment loops below hard-code N == 64
-{
-    const int offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x; // element index in bf16-pair units; NOTE(review): stored in a 32-bit int, may overflow for very large inputs -- confirm
-    const int tw_offset = blockIdx.x * 32 + threadIdx.x; // twiddles are indexed by column only, independent of blockIdx.y/z
-    int idx;
-    int shared_offset;
-    const int B_Y = blockDim.y;
-    const int n = N / B_Y; // rows copied per thread-row in the cooperative load/store loops
-    extern __shared__ __nv_bfloat16 x_real_shared[]; // dynamic shared memory: six consecutive N*N bf16 regions followed by a float output region
-    __nv_bfloat16 *x_imag_shared = &x_real_shared[N * N];
-    __nv_bfloat16 *d_f_real_shared = &x_imag_shared[N * N];
-    __nv_bfloat16 *d_f_imag_shared = &d_f_real_shared[N * N];
-    __nv_bfloat16 *twiddles_real_shared = &d_f_imag_shared[N * N];
-    __nv_bfloat16 *twiddles_imag_shared = &twiddles_real_shared[N * N];
-    float *out_real_shared = reinterpret_cast<float*>(&twiddles_imag_shared[N * N]); // float staging area for the accumulator tiles
-    __nv_bfloat16 tmp_real, tmp_imag;
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[4][4];
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[4][4];
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[4];
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[4];
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real[4];
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag[4];
-    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[4];
-    // #pragma unroll
-    for (int i = 0; i < n; i++) // stage the twiddle factors and d_f into shared memory once per block
-    {
-        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
-        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
-        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
-        // #pragma unroll
-        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x; // same value as computed above; recomputed here
-        reinterpret_cast<__nv_bfloat162 *>(d_f_real_shared)[shared_offset] = d_f_real[shared_offset]; // d_f is read with the same index for every block
-        reinterpret_cast<__nv_bfloat162 *>(d_f_imag_shared)[shared_offset] = d_f_imag[shared_offset];
-    }
-    __syncthreads(); // shared twiddles/d_f must be fully written before any warp loads fragments
-    for (int i = 0; i < 4; i++) // load the loop-invariant fragments once: full d_f for every warp, plus this warp's twiddle column block
-    {
-#pragma unroll
-        for (int j = 0; j < 4; j++)
-        {
-            wmma::load_matrix_sync(a_frag_real[i][j], d_f_real_shared + j * N * 16 + i * 16, N);
-            wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag_shared + j * N * 16 + i * 16, N);
-        }
-        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + i * N * 16 + threadIdx.y * 16, N); // each warp (threadIdx.y) owns a 16-column block
-        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + i * N * 16 + threadIdx.y * 16, N);
-    }
-    for (int t = 0; t < 16; t++) // each block processes 16 consecutive slices, each 64 * 32 * gridDim.x bf16 pairs apart
-    {
-        for (int i = 0; i < n; i++) // cooperative copy of slice t of x into shared memory
-        {
-            idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
-            shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
-            reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
-            reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
-        }
-        __syncthreads(); // x tile must be complete before the fragment loads below
-        for (int i = 0; i < 4; i++)
-        {
-            wmma::load_matrix_sync(b_frag_real[i], x_real_shared + i * N * 16 + threadIdx.y * 16, N);
-            wmma::load_matrix_sync(b_frag_imag[i], x_imag_shared + i * N * 16 + threadIdx.y * 16, N);
-        }
-        for (int j = 0; j < 4; j++) // element-wise complex multiply b = twiddle * x, done in bf16 on the fragment registers
-        {
-            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
-            {
-                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
-                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
-                b_frag_real[j].x[k] = tmp_real;
-                b_frag_imag[j].x[k] = tmp_imag;
-            }
-        }
-        for (int i = 0; i < 4; i++) // first pass: accumulate imag*imag, then negate so the accumulator holds -bd
-        {
-            wmma::fill_fragment(acc_frag_real[i], 0.0f);
-// bd
-#pragma unroll
-            for (int k = 0; k < 4; k++)
-            {
-                wmma::mma_sync(acc_frag_real[i], a_frag_imag[i][k], b_frag_imag[k], acc_frag_real[i]);
-            }
-            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
-            {
-                acc_frag_real[i].x[k] = - acc_frag_real[i].x[k];
-            }
-        }
-        for (int i = 0; i < 4; i++) // second pass: add real*real on top of -bd, giving the real part ac - bd
-        {
-// ac - bd
-#pragma unroll
-            for (int k = 0; k < 4; k++)
-            {
-                wmma::mma_sync(acc_frag_real[i], a_frag_real[i][k], b_frag_real[k], acc_frag_real[i]);
-            }
-        }
-#pragma unroll
-        for (int i = 0; i < 4; i++)
-        {
-            wmma::store_matrix_sync(out_real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major); // float results go to a region separate from the x tiles
-        }
-        __syncthreads(); // all warps' tiles must be stored before the cooperative write-out
-#pragma unroll
-        for (int i = 0; i < n; i++) // convert float pairs to bf16 pairs and write slice t to global memory
-        {
-            idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
-            if(out_gate != nullptr){
-                out_real[idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]), out_gate[idx]); ;
-            }else{
-                out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
-            }
-        }
-        __syncthreads(); // shared buffers are reused by the next iteration of t
-    }
-}
-__global__ void butterfly_ifft_bf16_cuda_kernel_32( // bf16 WMMA kernel for N = 32: one 32-row tile per block; multiplies x by the twiddles element-wise (complex), then writes the real part of the complex matrix product with d_f to out_real
-    const __nv_bfloat162 *__restrict__ x_real, // real parts of the input, read as bf16 pairs
-    const __nv_bfloat162 *__restrict__ x_imag, // imaginary parts of the input, read as bf16 pairs
-    const __nv_bfloat16 *__restrict__ d_f_real, // real part of the matrix_a operand, loaded by WMMA directly from this pointer (not staged in shared memory)
-    const __nv_bfloat16 *__restrict__ d_f_imag, // imaginary part of the matrix_a operand
-    const __nv_bfloat162 *__restrict__ twiddle_factors_real, // real part of the element-wise twiddle multipliers
-    const __nv_bfloat162 *__restrict__ twiddle_factors_imag, // imaginary part of the element-wise twiddle multipliers
-    __nv_bfloat162 *__restrict__ out_real, // output (real part only), written as bf16 pairs
-    __nv_bfloat162 *__restrict__ out_gate, // optional element-wise multiplier applied to the output; may be nullptr
-    uint B, // not referenced in this kernel body
-    uint H, // used only to compute the per-blockIdx.y stride in `offset`
-    int N) // transform size; the offsets and shared array sizes hard-code N == 32
-{
-    const int offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x; // element index in bf16-pair units; NOTE(review): 32-bit int, may overflow for very large inputs -- confirm
-    const int tw_offset = blockIdx.x * 32 + threadIdx.x; // twiddles are indexed by column only
-    int idx;
-    int shared_offset;
-    const int B_Y = blockDim.y;
-    const int n = N / B_Y; // rows copied per thread-row in the cooperative load/store loops
-    __shared__ __nv_bfloat16 x_real_shared[32 * 64]; // 32 rows of 64 bf16 values (32 bf16 pairs per row)
-    __shared__ __nv_bfloat16 x_imag_shared[32 * 64];
-    __shared__ __nv_bfloat16 twiddles_real_shared[32 * 64];
-    __shared__ __nv_bfloat16 twiddles_imag_shared[32 * 64];
-    __shared__ float out_real_shared[32 * 64]; // float staging area for the accumulator tiles
-    // #pragma unroll
-    for (int i = 0; i < n; i++) // cooperative copy of the x tile and the twiddles into shared memory
-    {
-        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
-        reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
-        reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
-        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
-        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
-    }
-    __syncthreads(); // shared tiles must be complete before the fragment loads
-    if (threadIdx.y < N / 16) // only the first N/16 warps run the tensor-core math; the branch is uniform per warp and contains no block barrier
-    {
-        __nv_bfloat16 tmp_real, tmp_imag;
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[2][2];
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[2][2];
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[2][2];
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[2][2];
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real[2][2];
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag[2][2];
-        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[2][2];
-        int t = threadIdx.y * 32; // column offset of this warp's 32-column slab inside the 64-wide shared rows
-        for (int i = 0; i < 2; i++)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                wmma::load_matrix_sync(a_frag_real[i][j], d_f_real + j * N * 16 + i * 16, N); // leading dimension N
-                wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag + j * N * 16 + i * 16, N);
-                wmma::load_matrix_sync(b_frag_real[i][j], x_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N); // shared rows are 2 * N bf16 values wide
-                wmma::load_matrix_sync(b_frag_imag[i][j], x_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
-                wmma::load_matrix_sync(tw_frag_real[i][j], twiddles_real_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
-                wmma::load_matrix_sync(tw_frag_imag[i][j], twiddles_imag_shared + i * 2 * N * 16 + j * 16 + t, 2 * N);
-            }
-        }
-        for (int i = 0; i < 2; i++) // element-wise complex multiply b = twiddle * x, done in bf16 on the fragment registers
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                for (int k = 0; k < tw_frag_real[i][j].num_elements; k++)
-                {
-                    tmp_real = __hsub(__hmul(tw_frag_real[i][j].x[k], b_frag_real[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_imag[i][j].x[k]));
-                    tmp_imag = __hadd(__hmul(tw_frag_real[i][j].x[k], b_frag_imag[i][j].x[k]), __hmul(tw_frag_imag[i][j].x[k], b_frag_real[i][j].x[k]));
-                    b_frag_real[i][j].x[k] = tmp_real;
-                    b_frag_imag[i][j].x[k] = tmp_imag;
-                }
-            }
-        }
-        for (int i = 0; i < 2; i++) // first pass: accumulate imag*imag, then negate so the accumulator holds -bd
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                wmma::fill_fragment(acc_frag_real[i][j], 0.0f);
-                // bd
-                for (int k = 0; k < 2; k++)
-                {
-                    wmma::mma_sync(acc_frag_real[i][j], a_frag_imag[i][k], b_frag_imag[k][j], acc_frag_real[i][j]);
-                }
-                for (int k = 0; k < acc_frag_real[i][j].num_elements; k++)
-                {
-                    acc_frag_real[i][j].x[k] = - acc_frag_real[i][j].x[k];
-                }
-            }
-        }
-        for (int i = 0; i < 2; i++) // second pass: add real*real on top of -bd, giving the real part ac - bd
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                // ac - bd
-                for (int k = 0; k < 2; k++)
-                {
-                    wmma::mma_sync(acc_frag_real[i][j], a_frag_real[i][k], b_frag_real[k][j], acc_frag_real[i][j]);
-                }
-            }
-        }
-        for (int i = 0; i < 2; i++)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + t, acc_frag_real[i][j], 2 * N, wmma::mem_row_major);
-            }
-        }
-    }
-    __syncthreads(); // placed outside the branch so every thread in the block reaches it
-#pragma unroll
-    for (int i = 0; i < n; i++) // all threads convert float pairs to bf16 pairs and write the tile to global memory
-    {
-        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        if(out_gate != nullptr){
-            out_real[idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]), out_gate[idx]);
-        }else{
-            out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
-        }
-    }
-}
-__global__ void butterfly_ifft_bf16_cuda_kernel_128( // bf16 WMMA kernel for N = 128: multiplies each x tile element-wise by the twiddle factors (complex), then writes the real part of the complex matrix product with d_f to out_real
-    const __nv_bfloat162 *__restrict__ x_real, // real parts of the input, read as bf16 pairs
-    const __nv_bfloat162 *__restrict__ x_imag, // imaginary parts of the input, read as bf16 pairs
-    const __nv_bfloat162 *__restrict__ d_f_real, // real part of the matrix used as WMMA matrix_a (presumably the iDFT matrix -- TODO confirm)
-    const __nv_bfloat162 *__restrict__ d_f_imag, // imaginary part of the same matrix
-    const __nv_bfloat162 *__restrict__ twiddle_factors_real, // real part of the element-wise twiddle multipliers
-    const __nv_bfloat162 *__restrict__ twiddle_factors_imag, // imaginary part of the element-wise twiddle multipliers
-    __nv_bfloat162 *__restrict__ out_real, // output (real part only), written as bf16 pairs
-    __nv_bfloat162 *__restrict__ out_gate, // optional element-wise multiplier applied to the output; may be nullptr
-    uint B, // not referenced in this kernel body
-    uint H, // used only to compute the per-blockIdx.y stride in `offset`
-    int N) // transform size; the offsets and the 8-wide fragment loops below hard-code N == 128
-{
-    const int offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * 16 * 128 * 32 * 2 * gridDim.x + blockIdx.x * 64 + threadIdx.x; // element index in bf16-pair units; NOTE(review): 32-bit int, may overflow for very large inputs -- confirm
-    const int tw_offset = blockIdx.x * 64 + threadIdx.x; // twiddles are indexed by column only
-    int idx;
-    int shared_offset;
-    const int B_Y = blockDim.y;
-    const int n = N / B_Y; // rows copied per thread-row in the cooperative load/store loops
-    extern __shared__ __nv_bfloat16 real_shared[]; // dynamic shared memory: four consecutive 128*128 bf16 regions
-    __nv_bfloat16 *imag_shared = &real_shared[128 * 128];
-    __nv_bfloat16 *real_shared_2 = &imag_shared[128 * 128]; // holds d_f_real for the whole kernel
-    __nv_bfloat16 *imag_shared_2 = &real_shared_2[128 * 128]; // holds d_f_imag for the whole kernel
-    __nv_bfloat16 tmp_real, tmp_imag;
-    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag[8][8]; // single fragment set, reloaded with d_f_imag and then d_f_real inside the t loop
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[8];
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[8];
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real[8];
-    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag[8];
-    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[8];
-    for (int i = 0; i < n; i++) // stage d_f into the second pair of shared regions
-    {
-        for(int j=0; j< 2; j++){ // each thread copies two bf16 pairs per row, blockDim.x apart
-            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
-            reinterpret_cast<__nv_bfloat162*>(real_shared_2)[shared_offset] = d_f_real[shared_offset];
-            reinterpret_cast<__nv_bfloat162*>(imag_shared_2)[shared_offset] = d_f_imag[shared_offset];
-        }
-    }
-    for (int i = 0; i < n; i++) // stage the twiddles into the first pair of shared regions (later reused for x)
-    {
-        for(int j=0; j< 2; j++){
-            idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x;
-            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
-            reinterpret_cast<__nv_bfloat162*>(real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
-            reinterpret_cast<__nv_bfloat162*>(imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
-        }
-    }
-    __syncthreads(); // staged d_f and twiddles must be complete before the fragment loads
-    for (int i = 0; i < 8; i++){ // twiddle fragments are loop-invariant; each warp (threadIdx.y) owns a 16-column block
-        wmma::load_matrix_sync(tw_frag_real[i], real_shared + i * 128 * 16 + threadIdx.y * 16, 128);
-        wmma::load_matrix_sync(tw_frag_imag[i], imag_shared + i * 128 * 16 + threadIdx.y * 16, 128);
-    }
-    __syncthreads(); // twiddles must be in registers everywhere before real_shared/imag_shared are overwritten with x
-    for (int t = 0; t < 16; t++) // each block processes 16 consecutive slices, each 128 * 32 * 2 * gridDim.x bf16 pairs apart
-    {
-        for (int i = 0; i < 8; i++){ // load d_f_imag into a_frag for the first (bd) pass
-            for (int j = 0; j < 8; j++){
-                wmma::load_matrix_sync(a_frag[i][j], imag_shared_2 + j * 128 * 16 + i * 16, 128);
-            }
-        }
-        for (int i = 0; i < n; i++) // cooperative copy of slice t of x into shared memory
-        {
-            for(int j=0; j< 2; j++){
-                idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
-                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
-                reinterpret_cast<__nv_bfloat162*>(real_shared)[shared_offset] = x_real[offset + idx];
-                reinterpret_cast<__nv_bfloat162*>(imag_shared)[shared_offset] = x_imag[offset + idx];
-            }
-        }
-        __syncthreads(); // x tile must be complete before the fragment loads below
-        for (int i = 0; i < 8; i++)
-        {
-            wmma::load_matrix_sync(b_frag_real[i], real_shared + i * N * 16 + threadIdx.y * 16, N);
-            wmma::load_matrix_sync(b_frag_imag[i], imag_shared + i * N * 16 + threadIdx.y * 16, N);
-        }
-        for (int j = 0; j < 8; j++) // element-wise complex multiply b = twiddle * x, done in bf16 on the fragment registers
-        {
-            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
-            {
-                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
-                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
-                b_frag_real[j].x[k] = tmp_real;
-                b_frag_imag[j].x[k] = tmp_imag;
-            }
-        }
-        for (int i = 0; i < 8; i++) // first pass: accumulate imag*imag, then negate so the accumulator holds -bd
-        {
-            wmma::fill_fragment(acc_frag_real[i], 0.0f);
-// bd
-#pragma unroll
-            for (int k = 0; k < 8; k++)
-            {
-                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_imag[k], acc_frag_real[i]);
-            }
-            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
-            {
-                acc_frag_real[i].x[k] = - acc_frag_real[i].x[k];
-            }
-        }
-        for (int i = 0; i < 8; i++){ // reload a_frag with d_f_real for the second (ac) pass
-            for (int j = 0; j < 8; j++){
-                wmma::load_matrix_sync(a_frag[i][j], real_shared_2 + j * 128 * 16 + i * 16, 128);
-            }
-        }
-        for (int i = 0; i < 8; i++) // second pass: add real*real on top of -bd, giving the real part ac - bd
-        {
-// ac - bd
-#pragma unroll
-            for (int k = 0; k < 8; k++)
-            {
-                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_real[k], acc_frag_real[i]);
-            }
-        }
-#pragma unroll
-        for (int i = 0; i < 8; i++)
-        {
-            //wmma::store_matrix_sync(real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
-            wmma::store_matrix_sync(reinterpret_cast<float*>(real_shared) + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major); // float results alias the bytes of real_shared and imag_shared; NOTE(review): there is no block barrier between the b_frag loads above and this store, so a fast warp could overwrite x data another warp has not loaded yet -- confirm
-        }
-        __syncthreads(); // all warps' tiles must be stored before the cooperative write-out
-#pragma unroll
-        for (int i = 0; i < n; i++) // convert float pairs to bf16 pairs and write slice t to global memory
-        {
-            for(int j=0; j< 2; j++){
-                idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
-                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
-                if(out_gate != nullptr){
-                    out_real[offset + idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2*>(real_shared)[shared_offset]), out_gate[offset + idx]);
-                }else{
-                    out_real[offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(real_shared)[shared_offset]);
-                }
-            }
-        }
-        __syncthreads(); // shared buffers are refilled with x by the next iteration of t
-    }
-}
-__global__ void butterfly_ifft_bf16_cuda_kernel_16( // bf16 WMMA kernel for N = 16: one 16-row tile per block; multiplies x by the twiddles element-wise (complex), then writes the real part of the complex matrix product with d_f to out_real
-    const __nv_bfloat162 *__restrict__ x_real, // real parts of the input, read as bf16 pairs
-    const __nv_bfloat162 *__restrict__ x_imag, // imaginary parts of the input, read as bf16 pairs
-    const __nv_bfloat16 *__restrict__ d_f_real, // real part of the 16x16 matrix_a operand, loaded by WMMA directly from this pointer
-    const __nv_bfloat16 *__restrict__ d_f_imag, // imaginary part of the matrix_a operand
-    const __nv_bfloat162 *__restrict__ twiddle_factors_real, // real part of the element-wise twiddle multipliers
-    const __nv_bfloat162 *__restrict__ twiddle_factors_imag, // imaginary part of the element-wise twiddle multipliers
-    __nv_bfloat162 *__restrict__ out_real, // output (real part only), written as bf16 pairs
-    __nv_bfloat162 *__restrict__ out_gate, // optional element-wise multiplier applied to the output; may be nullptr
-    uint B, // not referenced in this kernel body
-    uint H, // used only to compute the per-blockIdx.y stride in `offset`
-    int N) // transform size; the offsets and shared array sizes hard-code N == 16
-{
-    const int offset = blockIdx.y * H * 16 * 32 * gridDim.x + blockIdx.z * 16 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x; // element index in bf16-pair units; NOTE(review): 32-bit int, may overflow for very large inputs -- confirm
-    const int tw_offset = blockIdx.x * 32 + threadIdx.x; // twiddles are indexed by column only
-    int idx;
-    int shared_offset;
-    const int B_Y = blockDim.y;
-    const int n = N / B_Y; // rows copied per thread-row in the cooperative load/store loops
-    __shared__ __nv_bfloat16 x_real_shared[16 * 64]; // 16 rows of 64 bf16 values (32 bf16 pairs per row)
-    __shared__ __nv_bfloat16 x_imag_shared[16 * 64];
-    __shared__ __nv_bfloat16 twiddles_real_shared[16 * 64];
-    __shared__ __nv_bfloat16 twiddles_imag_shared[16 * 64];
-    __shared__ float out_real_shared[16 * 64]; // float staging area for the accumulator tiles
-    // #pragma unroll
-    for (int i = 0; i < n; i++) // cooperative copy of the x tile and the twiddles into shared memory
-    {
-        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
-        reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
-        reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
-        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
-        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
-    }
-    __syncthreads(); // shared tiles must be complete before the fragment loads
-    if (threadIdx.y < 4) // one 16-column block per warp, four blocks across the 64-wide shared rows; the branch contains no block barrier
-    {
-        __nv_bfloat16 tmp_real, tmp_imag;
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real;
-        wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag;
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real;
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag;
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real;
-        wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag;
-        wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real;
-        wmma::load_matrix_sync(a_frag_real, d_f_real, N); // leading dimension N
-        wmma::load_matrix_sync(a_frag_imag, d_f_imag, N);
-        wmma::load_matrix_sync(b_frag_real, x_real_shared + threadIdx.y * 16, 64); // shared rows are 64 bf16 values wide
-        wmma::load_matrix_sync(b_frag_imag, x_imag_shared + threadIdx.y * 16, 64);
-        wmma::load_matrix_sync(tw_frag_real, twiddles_real_shared + threadIdx.y * 16, 64);
-        wmma::load_matrix_sync(tw_frag_imag, twiddles_imag_shared + threadIdx.y * 16, 64);
-        for (int k = 0; k < tw_frag_real.num_elements; k++) // element-wise complex multiply b = twiddle * x, done in bf16 on the fragment registers
-        {
-            tmp_real = __hsub(__hmul(tw_frag_real.x[k], b_frag_real.x[k]), __hmul(tw_frag_imag.x[k], b_frag_imag.x[k]));
-            tmp_imag = __hadd(__hmul(tw_frag_real.x[k], b_frag_imag.x[k]), __hmul(tw_frag_imag.x[k], b_frag_real.x[k]));
-            b_frag_real.x[k] = tmp_real;
-            b_frag_imag.x[k] = tmp_imag;
-        }
-        wmma::fill_fragment(acc_frag_real, 0.0f);
-        wmma::mma_sync(acc_frag_real, a_frag_imag, b_frag_imag, acc_frag_real); // imag * imag
-        for(int k=0; k< acc_frag_real.num_elements; k++){ // negate so the accumulator holds -(imag * imag)
-            acc_frag_real.x[k] = - acc_frag_real.x[k];
-        }
-        wmma::mma_sync(acc_frag_real, a_frag_real, b_frag_real, acc_frag_real); // add real * real, giving the real part of the complex product
-        wmma::store_matrix_sync(out_real_shared + threadIdx.y * 16, acc_frag_real, 64, wmma::mem_row_major);
-    }
-    __syncthreads(); // placed outside the branch so every thread in the block reaches it
-#pragma unroll
-    for (int i = 0; i < n; i++) // convert float pairs to bf16 pairs and write the tile to global memory
-    {
-        idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x;
-        if(out_gate != nullptr){
-            out_real[idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]), out_gate[idx]);
-        }else{
-            out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
-        }
-    }
-}
-torch::Tensor butterfly_ifft_bf16_cuda( // host launcher: picks the bf16 butterfly iFFT kernel by N = x_real.size(2), launches it, and returns a new tensor of shape {B, H, N, M}
-    torch::Tensor x_real, // 4-D input, sizes read as (B, H, N, M); data is reinterpreted as bf16 pairs without a dtype or contiguity check
-    torch::Tensor x_imag, // imaginary counterpart of x_real; assumed to match its shape -- not checked here
-    torch::Tensor d_f_real, // real part of the matrix passed to the kernels as d_f_real
-    torch::Tensor d_f_imag, // imaginary part of that matrix
-    torch::Tensor twiddle_factors_real, // real part of the twiddle factors
-    torch::Tensor twiddle_factors_imag, // imaginary part of the twiddle factors
-    std::optional<at::Tensor> out_gate = std::nullopt // optional gate; when absent the kernels receive nullptr and skip the multiply
-    )
-{
-    uint B = x_real.size(0);
-    uint H = x_real.size(1);
-    // uint m = x.size(1);
-    // const int TILE_SIZE = 16;
-    dim3 gridDim;
-    dim3 blockDim;
-    uint N = x_real.size(2);
-    uint M = x_real.size(3);
-    gridDim.y = B; // one grid row per batch element
-    blockDim.x = 32;
-    blockDim.y = 4;
-    torch::Tensor out = torch::empty({B, H, N, M}, x_real.options()); // uninitialised; only filled if one of the kernel cases below runs
-    //set blockDims
-    switch(N){
-        case 128:
-            blockDim.x = 32;
-            blockDim.y = 8; // the N = 128 kernel uses 8 warps, one per 16-column block
-            break;
-        default:
-            blockDim.x = 32;
-            blockDim.y = 4;
-            break;
-    }
-    //set gridDim.x
-    switch(N){
-        case 128: // listed cases equal M / 128
-            switch (M){
-                case 16384:
-                    gridDim.x = 128;
-                    break;
-                case 8192:
-                    gridDim.x = 64;
-                    break;
-                case 4096:
-                    gridDim.x = 32;
-                    break;
-                default: // NOTE(review): 256 matches M / 128 only for M == 32768; other M values are not validated -- confirm
-                    gridDim.x = 256;
-                    break;
-            }
-            break;
-        default: // listed cases equal M / 64
-            switch (M){
-                case 16384:
-                    gridDim.x = 256;
-                    break;
-                case 8192:
-                    gridDim.x = 128;
-                    break;
-                case 4096:
-                    gridDim.x = 64;
-                    break;
-                default: // NOTE(review): 512 matches M / 64 only for M == 32768; other M values are not validated -- confirm
-                    gridDim.x = 512;
-                    break;
-            }
-            break;
-    }
-    switch (N)
-    {
-     case 16:
-        gridDim.z = H; // one block per head
-        butterfly_ifft_bf16_cuda_kernel_16<<<gridDim, blockDim>>>( // no stream argument and no error check after the launch
-            static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
-            static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
-            static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
-            static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
-            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
-            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
-            static_cast<__nv_bfloat162 *>(out.data_ptr()),
-            out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
-            B,
-            H,
-            N);
-        break;
-    case 32:
-        gridDim.z = H; // one block per head
-        butterfly_ifft_bf16_cuda_kernel_32<<<gridDim, blockDim>>>(
-            static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
-            static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
-            static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
-            static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
-            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
-            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
-            static_cast<__nv_bfloat162 *>(out.data_ptr()),
-            out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
-            B,
-            H,
-            N);
-        break;
-    case 64:
-        gridDim.z = H / 16; // the kernel loops over 16 slices per block; NOTE(review): integer division, so H not divisible by 16 leaves trailing heads unwritten -- confirm
-        cudaFuncSetAttribute(&butterfly_ifft_bf16_cuda_kernel_64, cudaFuncAttributeMaxDynamicSharedMemorySize, 78000); // opt in to more than 48 KB of dynamic shared memory; return code is not checked
-        butterfly_ifft_bf16_cuda_kernel_64<<<gridDim, blockDim, 78000>>>( // kernel layout needs 6 * 64 * 64 bf16 + 64 * 64 float = 65536 bytes; 78000 leaves headroom
-            static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
-            static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
-            static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
-            static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
-            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
-            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
-            static_cast<__nv_bfloat162 *>(out.data_ptr()),
-            out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
-            B,
-            H,
-            N);
-        break;
-    case 128:
-        gridDim.z = H / 16; // the kernel loops over 16 slices per block; NOTE(review): integer division, so H not divisible by 16 leaves trailing heads unwritten -- confirm
-        cudaFuncSetAttribute(&butterfly_ifft_bf16_cuda_kernel_128, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2); // return code is not checked
-        butterfly_ifft_bf16_cuda_kernel_128<<<gridDim, blockDim, 65536 * 2>>>( // 131072 bytes = four 128 * 128 bf16 regions used by the kernel
-            static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
-            static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
-            static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
-            static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
-            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
-            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
-            static_cast<__nv_bfloat162 *>(out.data_ptr()),
-            out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr,
-            B,
-            H,
-            N);
-        break;
-    default:
-        printf("Not implemented\n"); // NOTE(review): unsupported N only prints; the uninitialised `out` is still returned to the caller
-    }
-    return out;
-}

+// Copyright (c) 2023 Dan Fu, Hermann Kumbong
+#include <torch/extension.h>
+#include <vector>
+#include <stdio.h>
+#include <mma.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cuda_runtime.h>
+#include "shared.h"
+using namespace nvcuda;
+__global__ void butterfly_ifft_bf16_cuda_kernel_64(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat162 *__restrict__ d_f_real,
+    const __nv_bfloat162 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 64 * 32 * gridDim.x + blockIdx.z * 16 * 64 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y;
+    extern __shared__ __nv_bfloat16 x_real_shared[];
+    __nv_bfloat16 *x_imag_shared = &x_real_shared[N * N];
+    __nv_bfloat16 *d_f_real_shared = &x_imag_shared[N * N];
+    __nv_bfloat16 *d_f_imag_shared = &d_f_real_shared[N * N];
+    __nv_bfloat16 *twiddles_real_shared = &d_f_imag_shared[N * N];
+    __nv_bfloat16 *twiddles_imag_shared = &twiddles_real_shared[N * N];
+    float *out_real_shared = reinterpret_cast<float*>(&twiddles_imag_shared[N * N]);
+    __nv_bfloat16 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_real[4][4];
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag_imag[4][4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real[4];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag[4];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[4];
+    // #pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x;
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        // #pragma unroll
+        shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(d_f_real_shared)[shared_offset] = d_f_real[shared_offset];
+        reinterpret_cast<__nv_bfloat162 *>(d_f_imag_shared)[shared_offset] = d_f_imag[shared_offset];
+    }
+    __syncthreads();
+    for (int i = 0; i < 4; i++)
+    {
+#pragma unroll
+        for (int j = 0; j < 4; j++)
+        {
+            wmma::load_matrix_sync(a_frag_real[i][j], d_f_real_shared + j * N * 16 + i * 16, N);
+            wmma::load_matrix_sync(a_frag_imag[i][j], d_f_imag_shared + j * N * 16 + i * 16, N);
+        }
+        wmma::load_matrix_sync(tw_frag_real[i], twiddles_real_shared + i * N * 16 + threadIdx.y * 16, N);
+        wmma::load_matrix_sync(tw_frag_imag[i], twiddles_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+    }
+    for (int t = 0; t < 16; t++)
+    {
+        for (int i = 0; i < n; i++)
+        {
+            idx = (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 32 + threadIdx.x;
+            reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_offset] = x_real[idx + offset];
+            reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_offset] = x_imag[idx + offset];
+        }
+        __syncthreads();
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], x_real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], x_imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        for (int j = 0; j < 4; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
+            {
+                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
+                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
+                b_frag_real[j].x[k] = tmp_real;
+                b_frag_imag[j].x[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], 0.0f);
+// bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_imag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = - acc_frag_real[i].x[k];
+            }
+        }
+        for (int i = 0; i < 4; i++)
+        {
+// ac - bd
+#pragma unroll
+            for (int k = 0; k < 4; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag_real[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+#pragma unroll
+        for (int i = 0; i < 4; i++)
+        {
+            wmma::store_matrix_sync(out_real_shared + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            idx = offset + (threadIdx.y + i * B_Y) * 32 * gridDim.x + t * 64 * 32 * gridDim.x;
+            if(out_gate != nullptr){
+                out_real[idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]), out_gate[idx]); ;
+            }else{
+                out_real[idx] = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[(threadIdx.y + i * B_Y) * 32 + threadIdx.x]);
+            }
+        }
+        __syncthreads();
+    }
+}
+// N == 32 inverse butterfly step on tensor cores.
+//
+// For one 32-row slab of the input, multiplies the complex input element-wise
+// by the complex twiddle factors, applies the complex matrix read from
+// d_f_real / d_f_imag, and writes only the real part of the product to
+// out_real (optionally multiplied by out_gate when it is non-null).
+//
+// Launch layout used by the host wrapper in this file: blockDim = (32, 4),
+// gridDim = (x, B, H). Each block covers 32 __nv_bfloat162 columns (64 bf16
+// values) of every row; the global row stride is 32 * gridDim.x
+// __nv_bfloat162 elements. Static shared memory only.
+// NOTE(review): all global indices are computed in 32-bit arithmetic.
+__global__ void butterfly_ifft_bf16_cuda_kernel_32(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+    using FragA = wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major>;
+    using FragB = wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major>;
+    using FragAcc = wmma::fragment<wmma::accumulator, 16, 16, 16, float>;
+    // Per-thread base positions (in __nv_bfloat162 units) into the data and twiddle arrays.
+    const int offset = blockIdx.y * H * 32 * 32 * gridDim.x + blockIdx.z * 32 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y; // rows handled by each threadIdx.y
+    // 32 rows x 64 bf16 columns per staging buffer.
+    __shared__ __nv_bfloat16 x_real_shared[32 * 64];
+    __shared__ __nv_bfloat16 x_imag_shared[32 * 64];
+    __shared__ __nv_bfloat16 twiddles_real_shared[32 * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[32 * 64];
+    __shared__ float out_real_shared[32 * 64];
+    // Stage the input slab and the matching twiddles into shared memory.
+    for (int i = 0; i < n; i++)
+    {
+        const int row = threadIdx.y + i * B_Y;
+        const int global_row = row * 32 * gridDim.x;
+        const int shared_idx = row * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_idx] = x_real[global_row + offset];
+        reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_idx] = x_imag[global_row + offset];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_idx] = twiddle_factors_real[tw_offset + global_row];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_idx] = twiddle_factors_imag[tw_offset + global_row];
+    }
+    __syncthreads();
+    // Only the first N / 16 warps do tensor-core work; each owns a 32-column strip.
+    if (threadIdx.y < N / 16)
+    {
+        FragA dft_re[2][2];
+        FragA dft_im[2][2];
+        FragB in_re[2][2];
+        FragB in_im[2][2];
+        FragAcc acc[2][2];
+        const int col_base = threadIdx.y * 32;
+        const int ld = 2 * N; // leading dimension of the shared tiles
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                const int tile = i * 2 * N * 16 + j * 16 + col_base;
+                FragB tw_re;
+                FragB tw_im;
+                wmma::load_matrix_sync(dft_re[i][j], d_f_real + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(dft_im[i][j], d_f_imag + j * N * 16 + i * 16, N);
+                wmma::load_matrix_sync(in_re[i][j], x_real_shared + tile, ld);
+                wmma::load_matrix_sync(in_im[i][j], x_imag_shared + tile, ld);
+                wmma::load_matrix_sync(tw_re, twiddles_real_shared + tile, ld);
+                wmma::load_matrix_sync(tw_im, twiddles_imag_shared + tile, ld);
+                // Element-wise complex product twiddle * x, kept in the input fragments.
+                for (int k = 0; k < tw_re.num_elements; k++)
+                {
+                    const __nv_bfloat16 xr = in_re[i][j].x[k];
+                    const __nv_bfloat16 xi = in_im[i][j].x[k];
+                    in_re[i][j].x[k] = __hsub(__hmul(tw_re.x[k], xr), __hmul(tw_im.x[k], xi));
+                    in_im[i][j].x[k] = __hadd(__hmul(tw_re.x[k], xi), __hmul(tw_im.x[k], xr));
+                }
+            }
+        }
+        for (int i = 0; i < 2; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                // Real part of the complex product: first accumulate bd, negate, then add ac.
+                wmma::fill_fragment(acc[i][j], 0.0f);
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc[i][j], dft_im[i][k], in_im[k][j], acc[i][j]);
+                }
+                for (int k = 0; k < acc[i][j].num_elements; k++)
+                {
+                    acc[i][j].x[k] = - acc[i][j].x[k];
+                }
+                for (int k = 0; k < 2; k++)
+                {
+                    wmma::mma_sync(acc[i][j], dft_re[i][k], in_re[k][j], acc[i][j]);
+                }
+                wmma::store_matrix_sync(out_real_shared + i * 2 * N * 16 + j * 16 + col_base, acc[i][j], ld, wmma::mem_row_major);
+            }
+        }
+    }
+    __syncthreads();
+    // Convert the float results to bf16 pairs and write them out, gated if requested.
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        const int row = threadIdx.y + i * B_Y;
+        const int out_idx = offset + row * 32 * gridDim.x;
+        const __nv_bfloat162 value = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[row * 32 + threadIdx.x]);
+        out_real[out_idx] = (out_gate != nullptr) ? __hmul2(value, out_gate[out_idx]) : value;
+    }
+}
+// N == 128 inverse butterfly step on tensor cores.
+//
+// For each of 16 consecutive N-row slabs (selected by blockIdx.z and the
+// inner t loop), multiplies the complex input element-wise by the complex
+// twiddle factors, applies the 128 x 128 complex matrix d_f, and writes only
+// the real part of the product to out_real (optionally multiplied by
+// out_gate when it is non-null).
+//
+// Launch layout used by the host wrapper in this file: blockDim = (32, 8),
+// gridDim = (x, B, H / 16), dynamic shared memory = 4 * 128 * 128 bf16
+// (131072 bytes). Each block covers 64 __nv_bfloat162 columns (128 bf16
+// values) of every row; the global row stride is 64 * gridDim.x
+// __nv_bfloat162 elements.
+// NOTE(review): all global indices are computed in 32-bit arithmetic.
+__global__ void butterfly_ifft_bf16_cuda_kernel_128(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat162 *__restrict__ d_f_real,
+    const __nv_bfloat162 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+    const int offset = blockIdx.y * H * 128 * 32 * 2 * gridDim.x + blockIdx.z * 16 * 128 * 32 * 2 * gridDim.x + blockIdx.x * 64 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 64 + threadIdx.x;
+    int idx;
+    int shared_offset;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y; // rows handled by each threadIdx.y
+    // Dynamic shared memory carved into four 128 x 128 bf16 regions:
+    //   real_shared / imag_shared     : twiddles first, then the current input slab;
+    //                                   their bytes are later reused as a 128 x 128 float output tile
+    //   real_shared_2 / imag_shared_2 : real / imaginary part of d_f (never overwritten)
+    extern __shared__ __nv_bfloat16 real_shared[];
+    __nv_bfloat16 *imag_shared = &real_shared[128 * 128];
+    __nv_bfloat16 *real_shared_2 = &imag_shared[128 * 128];
+    __nv_bfloat16 *imag_shared_2 = &real_shared_2[128 * 128];
+    __nv_bfloat16 tmp_real, tmp_imag;
+    wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major> a_frag[8][8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> tw_frag_imag[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_real[8];
+    wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major> b_frag_imag[8];
+    wmma::fragment<wmma::accumulator, 16, 16, 16, float> acc_frag_real[8];
+    // Stage d_f (real and imaginary) into shared memory; two __nv_bfloat162 per thread per row.
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 2; j++){
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162*>(real_shared_2)[shared_offset] = d_f_real[shared_offset];
+            reinterpret_cast<__nv_bfloat162*>(imag_shared_2)[shared_offset] = d_f_imag[shared_offset];
+        }
+    }
+    // Stage this block's twiddle columns.
+    for (int i = 0; i < n; i++)
+    {
+        for(int j=0; j< 2; j++){
+            idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x;
+            shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+            reinterpret_cast<__nv_bfloat162*>(real_shared)[shared_offset] = twiddle_factors_real[tw_offset + idx];
+            reinterpret_cast<__nv_bfloat162*>(imag_shared)[shared_offset] = twiddle_factors_imag[tw_offset + idx];
+        }
+    }
+    __syncthreads();
+    // Each warp keeps the twiddles for its own 16-column strip in registers for all 16 slabs.
+    for (int i = 0; i < 8; i++){
+        wmma::load_matrix_sync(tw_frag_real[i], real_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+        wmma::load_matrix_sync(tw_frag_imag[i], imag_shared + i * 128 * 16 + threadIdx.y * 16, 128);
+    }
+    // The twiddle tiles are about to be overwritten by input data.
+    __syncthreads();
+    for (int t = 0; t < 16; t++)
+    {
+        // a_frag is shared between the imaginary and real passes; start with imag(d_f).
+        for (int i = 0; i < 8; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], imag_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        // Stage slab t of the input.
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx = (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                reinterpret_cast<__nv_bfloat162*>(real_shared)[shared_offset] = x_real[offset + idx];
+                reinterpret_cast<__nv_bfloat162*>(imag_shared)[shared_offset] = x_imag[offset + idx];
+            }
+        }
+        __syncthreads();
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::load_matrix_sync(b_frag_real[i], real_shared + i * N * 16 + threadIdx.y * 16, N);
+            wmma::load_matrix_sync(b_frag_imag[i], imag_shared + i * N * 16 + threadIdx.y * 16, N);
+        }
+        // The float output tile stored below aliases the bf16 input tiles in
+        // real_shared / imag_shared, and one warp's float columns overlap other
+        // warps' bf16 columns. Every warp must therefore have its inputs in
+        // registers before any warp is allowed to store its results.
+        __syncthreads();
+        // Element-wise complex product twiddle * x, kept in the b fragments.
+        for (int j = 0; j < 8; j++)
+        {
+            for (int k = 0; k < tw_frag_real[j].num_elements; k++)
+            {
+                tmp_real = __hsub(__hmul(tw_frag_real[j].x[k], b_frag_real[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_imag[j].x[k]));
+                tmp_imag = __hadd(__hmul(tw_frag_real[j].x[k], b_frag_imag[j].x[k]), __hmul(tw_frag_imag[j].x[k], b_frag_real[j].x[k]));
+                b_frag_real[j].x[k] = tmp_real;
+                b_frag_imag[j].x[k] = tmp_imag;
+            }
+        }
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::fill_fragment(acc_frag_real[i], 0.0f);
+// bd: imag(d_f) * imag(twiddle * x), negated afterwards
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_imag[k], acc_frag_real[i]);
+            }
+            for (int k = 0; k < acc_frag_real[i].num_elements; k++)
+            {
+                acc_frag_real[i].x[k] = - acc_frag_real[i].x[k];
+            }
+        }
+        // Switch a_frag over to real(d_f) for the second pass.
+        for (int i = 0; i < 8; i++){
+            for (int j = 0; j < 8; j++){
+                wmma::load_matrix_sync(a_frag[i][j], real_shared_2 + j * 128 * 16 + i * 16, 128);
+            }
+        }
+        for (int i = 0; i < 8; i++)
+        {
+// ac - bd: add real(d_f) * real(twiddle * x) on top of -bd
+#pragma unroll
+            for (int k = 0; k < 8; k++)
+            {
+                wmma::mma_sync(acc_frag_real[i], a_frag[i][k], b_frag_real[k], acc_frag_real[i]);
+            }
+        }
+        // Store the float results over the (already consumed) bf16 input tiles.
+#pragma unroll
+        for (int i = 0; i < 8; i++)
+        {
+            wmma::store_matrix_sync(reinterpret_cast<float*>(real_shared) + i * N * 16 + threadIdx.y * 16, acc_frag_real[i], N, wmma::mem_row_major);
+        }
+        __syncthreads();
+        // Convert to bf16 pairs and write slab t out, gated if requested.
+#pragma unroll
+        for (int i = 0; i < n; i++)
+        {
+            for(int j=0; j< 2; j++){
+                idx =  (threadIdx.y + i * B_Y) * 32 * 2 * gridDim.x + j * blockDim.x + t * 128 * 32 * 2 * gridDim.x;
+                shared_offset = (threadIdx.y + i * B_Y) * 64 + threadIdx.x + j * blockDim.x;
+                if(out_gate != nullptr){
+                    out_real[offset + idx] = __hmul2(__float22bfloat162_rn(reinterpret_cast<float2*>(real_shared)[shared_offset]), out_gate[offset + idx]);
+                }else{
+                    out_real[offset + idx] = __float22bfloat162_rn(reinterpret_cast<float2*>(real_shared)[shared_offset]);
+                }
+            }
+        }
+        // The output tile must be fully read before the next slab overwrites it.
+        __syncthreads();
+    }
+}
+// N == 16 inverse butterfly step on tensor cores.
+//
+// For one 16-row slab of the input, multiplies the complex input element-wise
+// by the complex twiddle factors, applies the 16 x 16 complex matrix read from
+// d_f_real / d_f_imag, and writes only the real part of the product to
+// out_real (optionally multiplied by out_gate when it is non-null).
+//
+// Launch layout used by the host wrapper in this file: blockDim = (32, 4),
+// gridDim = (x, B, H). Each block covers 32 __nv_bfloat162 columns (64 bf16
+// values) of every row; the global row stride is 32 * gridDim.x
+// __nv_bfloat162 elements. Static shared memory only.
+// NOTE(review): all global indices are computed in 32-bit arithmetic.
+__global__ void butterfly_ifft_bf16_cuda_kernel_16(
+    const __nv_bfloat162 *__restrict__ x_real,
+    const __nv_bfloat162 *__restrict__ x_imag,
+    const __nv_bfloat16 *__restrict__ d_f_real,
+    const __nv_bfloat16 *__restrict__ d_f_imag,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_real,
+    const __nv_bfloat162 *__restrict__ twiddle_factors_imag,
+    __nv_bfloat162 *__restrict__ out_real,
+    __nv_bfloat162 *__restrict__ out_gate,
+    uint B,
+    uint H,
+    int N)
+{
+    using FragA = wmma::fragment<wmma::matrix_a, 16, 16, 16, __nv_bfloat16, wmma::col_major>;
+    using FragB = wmma::fragment<wmma::matrix_b, 16, 16, 16, __nv_bfloat16, wmma::row_major>;
+    using FragAcc = wmma::fragment<wmma::accumulator, 16, 16, 16, float>;
+    // Per-thread base positions (in __nv_bfloat162 units) into the data and twiddle arrays.
+    const int offset = blockIdx.y * H * 16 * 32 * gridDim.x + blockIdx.z * 16 * 32 * gridDim.x + blockIdx.x * 32 + threadIdx.x;
+    const int tw_offset = blockIdx.x * 32 + threadIdx.x;
+    const int B_Y = blockDim.y;
+    const int n = N / B_Y; // rows handled by each threadIdx.y
+    // 16 rows x 64 bf16 columns per staging buffer.
+    __shared__ __nv_bfloat16 x_real_shared[16 * 64];
+    __shared__ __nv_bfloat16 x_imag_shared[16 * 64];
+    __shared__ __nv_bfloat16 twiddles_real_shared[16 * 64];
+    __shared__ __nv_bfloat16 twiddles_imag_shared[16 * 64];
+    __shared__ float out_real_shared[16 * 64];
+    // Stage the input slab and the matching twiddles into shared memory.
+    for (int i = 0; i < n; i++)
+    {
+        const int row = threadIdx.y + i * B_Y;
+        const int global_row = row * 32 * gridDim.x;
+        const int shared_idx = row * 32 + threadIdx.x;
+        reinterpret_cast<__nv_bfloat162 *>(x_real_shared)[shared_idx] = x_real[global_row + offset];
+        reinterpret_cast<__nv_bfloat162 *>(x_imag_shared)[shared_idx] = x_imag[global_row + offset];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_real_shared)[shared_idx] = twiddle_factors_real[tw_offset + global_row];
+        reinterpret_cast<__nv_bfloat162 *>(twiddles_imag_shared)[shared_idx] = twiddle_factors_imag[tw_offset + global_row];
+    }
+    __syncthreads();
+    // Each of the first four warps owns one 16-column strip of the 64-column tile.
+    if (threadIdx.y < 4)
+    {
+        FragA dft_re;
+        FragA dft_im;
+        FragB tw_re;
+        FragB tw_im;
+        FragB in_re;
+        FragB in_im;
+        FragAcc acc;
+        const int col_base = threadIdx.y * 16;
+        wmma::load_matrix_sync(dft_re, d_f_real, N);
+        wmma::load_matrix_sync(dft_im, d_f_imag, N);
+        wmma::load_matrix_sync(in_re, x_real_shared + col_base, 64);
+        wmma::load_matrix_sync(in_im, x_imag_shared + col_base, 64);
+        wmma::load_matrix_sync(tw_re, twiddles_real_shared + col_base, 64);
+        wmma::load_matrix_sync(tw_im, twiddles_imag_shared + col_base, 64);
+        // Element-wise complex product twiddle * x, kept in the input fragments.
+        for (int k = 0; k < tw_re.num_elements; k++)
+        {
+            const __nv_bfloat16 xr = in_re.x[k];
+            const __nv_bfloat16 xi = in_im.x[k];
+            in_re.x[k] = __hsub(__hmul(tw_re.x[k], xr), __hmul(tw_im.x[k], xi));
+            in_im.x[k] = __hadd(__hmul(tw_re.x[k], xi), __hmul(tw_im.x[k], xr));
+        }
+        // Real part of the complex product: accumulate bd, negate, then add ac.
+        wmma::fill_fragment(acc, 0.0f);
+        wmma::mma_sync(acc, dft_im, in_im, acc);
+        for (int k = 0; k < acc.num_elements; k++)
+        {
+            acc.x[k] = - acc.x[k];
+        }
+        wmma::mma_sync(acc, dft_re, in_re, acc);
+        wmma::store_matrix_sync(out_real_shared + col_base, acc, 64, wmma::mem_row_major);
+    }
+    __syncthreads();
+    // Convert the float results to bf16 pairs and write them out, gated if requested.
+#pragma unroll
+    for (int i = 0; i < n; i++)
+    {
+        const int row = threadIdx.y + i * B_Y;
+        const int out_idx = offset + row * 32 * gridDim.x;
+        const __nv_bfloat162 value = __float22bfloat162_rn(reinterpret_cast<float2 *>(out_real_shared)[row * 32 + threadIdx.x]);
+        out_real[out_idx] = (out_gate != nullptr) ? __hmul2(value, out_gate[out_idx]) : value;
+    }
+}
+// Host launcher for the bf16 inverse butterfly kernels.
+//
+// x_real / x_imag are indexed as (B, H, N, M); the result has the same shape
+// and options as x_real. N selects the kernel (16, 32, 64 or 128). When
+// out_gate is provided, the kernels multiply the result element-wise by it.
+//
+// Requirements enforced here:
+//   * N is one of 16, 32, 64, 128 (anything else raises instead of returning
+//     an uninitialised tensor);
+//   * for N == 64 / 128, H is a multiple of 16 (those kernels process 16
+//     slabs per block and the grid uses H / 16);
+//   * M is a multiple of the bf16 columns one block covers (128 for N == 128,
+//     otherwise 64), so the grid tiles every row exactly.
+//
+// NOTE(review): dtype, device and contiguity of the inputs are not checked
+// here; presumably the binding layer validates them - confirm.
+// NOTE(review): kernels are launched without a stream argument, i.e. on the
+// default stream rather than PyTorch's current stream.
+torch::Tensor butterfly_ifft_bf16_cuda(
+    torch::Tensor x_real,
+    torch::Tensor x_imag,
+    torch::Tensor d_f_real,
+    torch::Tensor d_f_imag,
+    torch::Tensor twiddle_factors_real,
+    torch::Tensor twiddle_factors_imag,
+    std::optional<at::Tensor> out_gate = std::nullopt
+    )
+{
+    const uint B = x_real.size(0);
+    const uint H = x_real.size(1);
+    const uint N = x_real.size(2);
+    const uint M = x_real.size(3);
+    torch::Tensor out = torch::empty({B, H, N, M}, x_real.options());
+    // Nothing to compute; a zero-sized grid would be an invalid launch.
+    if (out.numel() == 0)
+    {
+        return out;
+    }
+    TORCH_CHECK(N == 16 || N == 32 || N == 64 || N == 128,
+                "butterfly_ifft_bf16_cuda: N = ", N, " is not implemented (expected 16, 32, 64 or 128)");
+    // The 64 and 128 kernels loop over 16 slabs per block.
+    const bool sixteen_heads_per_block = (N == 64 || N == 128);
+    TORCH_CHECK(!sixteen_heads_per_block || H % 16 == 0,
+                "butterfly_ifft_bf16_cuda: H = ", H, " must be a multiple of 16 when N = ", N);
+    // bf16 columns covered by one block: 64 __nv_bfloat162 for N == 128, otherwise 32.
+    const uint cols_per_block = (N == 128) ? 128 : 64;
+    TORCH_CHECK(M % cols_per_block == 0,
+                "butterfly_ifft_bf16_cuda: M = ", M, " must be a multiple of ", cols_per_block, " when N = ", N);
+    const auto check_cuda = [](cudaError_t err, const char *what) {
+        TORCH_CHECK(err == cudaSuccess, "butterfly_ifft_bf16_cuda: ", what, " failed: ", cudaGetErrorString(err));
+    };
+    dim3 block(32, (N == 128) ? 8 : 4);
+    dim3 grid(M / cols_per_block, B, sixteen_heads_per_block ? H / 16 : H);
+    __nv_bfloat162 *gate_ptr = out_gate ? static_cast<__nv_bfloat162 *>(out_gate.value().data_ptr()) : nullptr;
+    switch (N)
+    {
+    case 16:
+        butterfly_ifft_bf16_cuda_kernel_16<<<grid, block>>>(
+            static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+            static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out.data_ptr()),
+            gate_ptr,
+            B,
+            H,
+            N);
+        break;
+    case 32:
+        butterfly_ifft_bf16_cuda_kernel_32<<<grid, block>>>(
+            static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+            static_cast<__nv_bfloat16 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat16 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out.data_ptr()),
+            gate_ptr,
+            B,
+            H,
+            N);
+        break;
+    case 64:
+        // More than 48 KB of dynamic shared memory requires an explicit opt-in.
+        check_cuda(cudaFuncSetAttribute(&butterfly_ifft_bf16_cuda_kernel_64, cudaFuncAttributeMaxDynamicSharedMemorySize, 78000),
+                   "cudaFuncSetAttribute");
+        butterfly_ifft_bf16_cuda_kernel_64<<<grid, block, 78000>>>(
+            static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out.data_ptr()),
+            gate_ptr,
+            B,
+            H,
+            N);
+        break;
+    case 128:
+        // Four 128 x 128 bf16 tiles = 131072 bytes of dynamic shared memory.
+        check_cuda(cudaFuncSetAttribute(&butterfly_ifft_bf16_cuda_kernel_128, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536 * 2),
+                   "cudaFuncSetAttribute");
+        butterfly_ifft_bf16_cuda_kernel_128<<<grid, block, 65536 * 2>>>(
+            static_cast<__nv_bfloat162 *>(x_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(x_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(d_f_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(d_f_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_real.data_ptr()),
+            static_cast<__nv_bfloat162 *>(twiddle_factors_imag.data_ptr()),
+            static_cast<__nv_bfloat162 *>(out.data_ptr()),
+            gate_ptr,
+            B,
+            H,
+            N);
+        break;
+    default:
+        // Unreachable: N was validated above.
+        break;
+    }
+    // Launch-configuration errors are only reported through cudaGetLastError().
+    check_cuda(cudaGetLastError(), "kernel launch");
+    return out;
+}