Spaces:

Jackoatmon
/

feather-runtime

Runtime error

App Files Files Community

Jackoatmon committed 15 days ago

Commit

4f61010

verified ·

1 Parent(s): c2bf4b6

Update Feather a10g-large runtime image with fused SDR fallback

Browse files

Files changed (8) hide show

overlay/htm_rust/build.rs +12 -6
overlay/htm_rust/src/gpu/fused.rs +33 -50
overlay/hydra/config.py +2 -2
overlay/hydra/engram.py +104 -121
overlay/scripts/launch_feather_hf_job.py +101 -34
overlay/scripts/run_domain_expanded_pretrain.sh +5 -1
overlay/subsystems/fused_sdr_project.py +7 -3
overlay/subsystems/sdr_semantic.py +27 -5

overlay/htm_rust/build.rs CHANGED Viewed

@@ -26,8 +26,11 @@ fn main() {
         return;
     }
-    // Kernels to compile. Each .cu file → one .ptx file, embedded by name.
-    let kernels: &[&str] = &[
         "sp_overlap",
         "sp_topk",
         "sp_learn",
@@ -40,17 +43,20 @@ fn main() {
         "tm_grow",
         "tm_anomaly",
         "tm_reset",
-        "htm_fused_step",
     ];
     let kernels_dir = PathBuf::from("src/gpu/kernels");
-    for k in kernels {
         let src = kernels_dir.join(format!("{k}.cu"));
         println!("cargo:rerun-if-changed={}", src.display());
     }
-    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR"));
-    let arch = env::var("HTM_CUDA_ARCH").unwrap_or_else(|_| "sm_86".into());
     let nvcc = find_nvcc();
     println!("cargo:warning=htm_rust: nvcc = {nvcc}");

         return;
     }
+    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR"));
+    let arch = env::var("HTM_CUDA_ARCH").unwrap_or_else(|_| "sm_86".into());
+    // Base kernels — compile for any sm_80+ GPU. Each .cu file → one .ptx file.
+    let base_kernels: &[&str] = &[
         "sp_overlap",
         "sp_topk",
         "sp_learn",
         "tm_grow",
         "tm_anomaly",
         "tm_reset",
     ];
+    // htm_fused_step now compiles for ALL architectures (sm_80+).
+    // On Hopper (sm_90+): uses cluster-distributed shared memory for hot state.
+    // On Ampere (sm_86) and other pre-Hopper: uses global memory reads/writes
+    // with grid.sync() for cross-block synchronization (cooperative launch).
+    let kernels: Vec<&str> = base_kernels.iter().chain(["htm_fused_step"].iter()).copied().collect();
     let kernels_dir = PathBuf::from("src/gpu/kernels");
+    for k in &kernels {
         let src = kernels_dir.join(format!("{k}.cu"));
         println!("cargo:rerun-if-changed={}", src.display());
     }
     let nvcc = find_nvcc();
     println!("cargo:warning=htm_rust: nvcc = {nvcc}");

overlay/htm_rust/src/gpu/fused.rs CHANGED Viewed

@@ -20,15 +20,15 @@
 use std::ffi::CString;
 use std::sync::Arc;
-use cudarc::driver::{
-    result, sys, CudaDevice, CudaSlice, DevicePtr, DeviceRepr, DriverError, LaunchConfig,
-};
 use cudarc::nvrtc::Ptx;
 use super::sp_gpu::SpatialPoolerGpu;
 use super::tm_gpu::{TemporalMemoryGpu, MAX_SEGMENTS_PER_CELL, MAX_SYN_PER_SEGMENT};
-const PTX_HTM_FUSED: &str = include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/htm_fused_step.ptx"));
 /// Struct-by-value pointer pack — matches C-side `FusedPtrs`.
 ///
@@ -132,9 +132,11 @@ pub(crate) fn plan_fused_launch(
     grid_cap_override: Option<u32>,
 ) -> Result<FusedLaunchPlan, String> {
     let sm_count = sm_count.max(1);
-    // 1024 threads/block exceeds the register file on Ampere and makes the
-    // cooperative-grid residency probe lie when the launch uses a different
-    // block size. Keep the planned block size identical to the occupancy probe.
     let block_dim_x = 256u32;
     // Cluster launch path: cooperative launch is not required. Keep the probe
@@ -143,11 +145,10 @@ pub(crate) fn plan_fused_launch(
         eprintln!("[htm_rust] INFO: cooperative launch unsupported; cluster path only.");
     }
-    // Cluster constraint: grid_dim_x must equal the cluster size (16) so that
-    // each region maps to exactly one cluster. `HTM_FUSED_GRID_CAP` can lower
-    // this for debugging but should not exceed 16 for cluster correctness.
     let default_grid_cap = 16u32;
-    let grid_cap = grid_cap_override.unwrap_or(default_grid_cap).min(16);
     let resident_bound = if cooperative_grid_limit > 0 {
         cooperative_grid_limit.max(sm_count * 2)
     } else {
@@ -217,7 +218,7 @@ pub struct FusedState {
     pub cell_active_bits_b: CudaSlice<u32>,
     pub cell_winner_bits_a: CudaSlice<u32>,
     pub cell_winner_bits_b: CudaSlice<u32>,
-    pub step_scratch: CudaSlice<u32>, // length 6
     pub grid_dim_x: u32,
     pub block_dim_x: u32,
@@ -240,10 +241,7 @@ impl FusedState {
         initial_threshold: f32,
     ) -> Result<Self, DriverError> {
         let n_cells = n_columns * cells_per_column;
-        assert!(
-            n_cells % 32 == 0,
-            "n_cells must be divisible by 32 for bitsets"
-        );
         let bits_words = n_cells / 32;
         let mut inhibition_threshold = dev.alloc_zeros::<f32>(n_columns)?;
@@ -280,8 +278,7 @@ impl FusedState {
         // every launched kernel function, otherwise cuLaunchKernelEx rejects
         // the cluster dim with CUDA_ERROR_INVALID_CLUSTER_SIZE.
         unsafe {
-            let attr =
-                sys::CUfunction_attribute::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED;
             // Ignore errors: older CUDA may lack the attribute, in which case
             // only portable sizes (<= 8) work — plan_fused_launch caps at 8.
             let _ = sys::lib().cuFuncSetAttribute(function, attr, 1);
@@ -297,9 +294,9 @@ impl FusedState {
         };
         // T1: Probe Hopper cluster launch capability.
-        let max_cluster_size = match dev
-            .attribute(cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH)
-        {
             Ok(v) if v > 0 => {
                 // H200/sm_90a supports up to 16 blocks per cluster.
                 // There is no MAX_CLUSTER_SIZE attribute in CUDA 12.4; hard-code the
@@ -349,11 +346,7 @@ impl FusedState {
         Ok(Self {
             dev,
-            raw_kernel: RawFusedKernel {
-                module,
-                function,
-                function_batched,
-            },
             inhibition_threshold,
             cell_active_bits_a,
             cell_active_bits_b,
@@ -452,7 +445,7 @@ pub fn launch_fused(
         inputs: *inputs_flat.device_ptr(),
         cols_out: *cols_out.device_ptr(),
         anom_out: *anom_out.device_ptr(),
-        barrier_counters: 0u64, // ABI-compat dummy; cluster barrier replaces DLB.
         step_scratch: *fused.step_scratch.device_ptr(),
     };
@@ -500,17 +493,14 @@ pub fn launch_fused(
             }
         } else {
             // Pre-Hopper: cooperative kernel launch. The fused kernel uses
-            // cg::this_grid().sync(); normal launches poison the CUDA context
-            // with an asynchronous unspecified launch failure.
             let ret = sys::lib().cuLaunchCooperativeKernel(
                 fused.raw_kernel.function,
-                grid_x,
-                1,
-                1,
-                block_x,
-                1,
-                1,
-                0,
                 cu_stream,
                 kernel_params.as_mut_ptr(),
             );
@@ -626,7 +616,7 @@ pub(super) fn launch_fused_batched_raw(
                 inputs: inputs_per_region[i],
                 cols_out: cols_per_region[i],
                 anom_out: anom_per_region[i],
-                barrier_counters: 0u64, // ABI-compat dummy; cluster barrier replaces DLB.
                 step_scratch: *r.fused_state.step_scratch.device_ptr(),
             }
         })
@@ -646,8 +636,8 @@ pub(super) fn launch_fused_batched_raw(
         let r0 = unsafe { &*region_ptrs[0] };
         r0.fused_state.cluster_info.max_cluster_size > 0
     };
-    let grid_x =
-        plan_batched_grid_dim(grid_x, cooperative_grid_limit, b, use_cluster).map_err(|msg| {
             eprintln!("[htm_rust] FATAL: {msg}");
             DriverError(cudarc::driver::sys::CUresult::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE)
         })?;
@@ -688,19 +678,12 @@ pub(super) fn launch_fused_batched_raw(
                 return Err(DriverError(ret));
             }
         } else {
-            // Pre-Hopper: cooperative kernel launch. The fused kernel uses
-            // cg::this_grid().sync(), which is only valid under cooperative
-            // launch. A normal launch can run until the first grid.sync() and
-            // then poison the CUDA context with an unspecified launch failure.
             let ret = sys::lib().cuLaunchCooperativeKernel(
                 function_batched,
-                grid_x,
-                b as u32,
-                1,
-                block_x,
-                1,
-                1,
-                0,
                 cu_stream,
                 kernel_params.as_mut_ptr(),
             );

 use std::ffi::CString;
 use std::sync::Arc;
+use cudarc::driver::{result, sys, CudaDevice, CudaSlice, DeviceRepr, DevicePtr, DriverError,
+                      LaunchConfig};
 use cudarc::nvrtc::Ptx;
 use super::sp_gpu::SpatialPoolerGpu;
 use super::tm_gpu::{TemporalMemoryGpu, MAX_SEGMENTS_PER_CELL, MAX_SYN_PER_SEGMENT};
+const PTX_HTM_FUSED: &str =
+    include_str!(concat!(env!("HTM_GPU_PTX_DIR"), "/htm_fused_step.ptx"));
 /// Struct-by-value pointer pack — matches C-side `FusedPtrs`.
 ///
     grid_cap_override: Option<u32>,
 ) -> Result<FusedLaunchPlan, String> {
     let sm_count = sm_count.max(1);
+    // 1024 threads/block exceeds the register file on Ampere (sm_86: 65536
+    // regs/SM ÷ 1024 = 64 regs/thread; fused kernel needs ~80+). A block size
+    // of 256 leaves a budget of 65536 ÷ 256 = 256 regs/thread (more than the
+    // 255 regs/thread hardware maximum), which is ample. Compensate with more
+    // blocks via cooperative launch. On Hopper (228 KB smem, 255 regs/thread
+    // baseline), 1024 works fine, but 256 is safe everywhere.
     let block_dim_x = 256u32;
     // Cluster launch path: cooperative launch is not required. Keep the probe
         eprintln!("[htm_rust] INFO: cooperative launch unsupported; cluster path only.");
     }
+    // Tested grid_cap: 4 blocks = 30ms (too serial), 16 blocks = 10.8ms (parallel wins).
+    // Parallelism in SP overlap + TM predict stages outweighs grid.sync() cost.
     let default_grid_cap = 16u32;
+    let grid_cap = grid_cap_override.unwrap_or(default_grid_cap);
     let resident_bound = if cooperative_grid_limit > 0 {
         cooperative_grid_limit.max(sm_count * 2)
     } else {
     pub cell_active_bits_b: CudaSlice<u32>,
     pub cell_winner_bits_a: CudaSlice<u32>,
     pub cell_winner_bits_b: CudaSlice<u32>,
+    pub step_scratch: CudaSlice<u32>,       // length 6
     pub grid_dim_x: u32,
     pub block_dim_x: u32,
         initial_threshold: f32,
     ) -> Result<Self, DriverError> {
         let n_cells = n_columns * cells_per_column;
+        assert!(n_cells % 32 == 0, "n_cells must be divisible by 32 for bitsets");
         let bits_words = n_cells / 32;
         let mut inhibition_threshold = dev.alloc_zeros::<f32>(n_columns)?;
         // every launched kernel function, otherwise cuLaunchKernelEx rejects
         // the cluster dim with CUDA_ERROR_INVALID_CLUSTER_SIZE.
         unsafe {
+            let attr = sys::CUfunction_attribute::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED;
             // Ignore errors: older CUDA may lack the attribute, in which case
             // only portable sizes (<= 8) work — plan_fused_launch caps at 8.
             let _ = sys::lib().cuFuncSetAttribute(function, attr, 1);
         };
         // T1: Probe Hopper cluster launch capability.
+        let max_cluster_size = match dev.attribute(
+            cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH,
+        ) {
             Ok(v) if v > 0 => {
                 // H200/sm_90a supports up to 16 blocks per cluster.
                 // There is no MAX_CLUSTER_SIZE attribute in CUDA 12.4; hard-code the
         Ok(Self {
             dev,
+            raw_kernel: RawFusedKernel { module, function, function_batched },
             inhibition_threshold,
             cell_active_bits_a,
             cell_active_bits_b,
         inputs: *inputs_flat.device_ptr(),
         cols_out: *cols_out.device_ptr(),
         anom_out: *anom_out.device_ptr(),
+        barrier_counters: 0u64,  // ABI-compat dummy; cluster barrier replaces DLB.
         step_scratch: *fused.step_scratch.device_ptr(),
     };
             }
         } else {
             // Pre-Hopper: cooperative kernel launch. The fused kernel uses
+            // grid.sync() for cross-block synchronization, which REQUIRES
+            // cuLaunchCooperativeKernel (a normal launch runs until the first
+            // grid.sync() call and then poisons the CUDA context with an
+            // asynchronous unspecified launch failure).
             let ret = sys::lib().cuLaunchCooperativeKernel(
                 fused.raw_kernel.function,
+                grid_x, 1, 1,
+                block_x, 1, 1,
+                0,  // sharedMemBytes
                 cu_stream,
                 kernel_params.as_mut_ptr(),
             );
                 inputs: inputs_per_region[i],
                 cols_out: cols_per_region[i],
                 anom_out: anom_per_region[i],
+                barrier_counters: 0u64,  // ABI-compat dummy; cluster barrier replaces DLB.
                 step_scratch: *r.fused_state.step_scratch.device_ptr(),
             }
         })
         let r0 = unsafe { &*region_ptrs[0] };
         r0.fused_state.cluster_info.max_cluster_size > 0
     };
+    let grid_x = plan_batched_grid_dim(grid_x, cooperative_grid_limit, b, use_cluster)
+        .map_err(|msg| {
             eprintln!("[htm_rust] FATAL: {msg}");
             DriverError(cudarc::driver::sys::CUresult::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE)
         })?;
                 return Err(DriverError(ret));
             }
         } else {
+            // Pre-Hopper: cooperative kernel launch (grid.sync() requires it).
             let ret = sys::lib().cuLaunchCooperativeKernel(
                 function_batched,
+                grid_x, b as u32, 1,
+                block_x, 1, 1,
+                0,  // sharedMemBytes
                 cu_stream,
                 kernel_params.as_mut_ptr(),
             );

overlay/hydra/config.py CHANGED Viewed

@@ -110,8 +110,8 @@ class PostSemClawConfig:
     gdn_layers: tuple[int, ...] = field(default_factory=_parse_gdn_layers_env)
     # Label smoothing + Z-loss
-    label_smoothing: float = 0.0   # disabled: any smoothing hurts in 5-min budget
-    z_loss_weight: float = 1e-4
 # ---------------------------------------------------------------------------

     gdn_layers: tuple[int, ...] = field(default_factory=_parse_gdn_layers_env)
     # Label smoothing + Z-loss
+    label_smoothing: float = field(default_factory=lambda: float(os.environ.get("HYDRA_LABEL_SMOOTHING", "0.0")))
+    z_loss_weight: float = field(default_factory=lambda: float(os.environ.get("HYDRA_Z_LOSS_WEIGHT", "1e-4")))
 # ---------------------------------------------------------------------------

overlay/hydra/engram.py CHANGED Viewed

@@ -1,93 +1,23 @@
-"""GPU Engram — Sparse Modern Hopfield retrieval path.
-## What changed (scatter-gather → Hopfield matmul)
-The original forward used `self.memory[indices]` (scatter-gather), which misses
-L2 cache at n_columns > 4096 and creates a hard tps ceiling.
-The replacement uses:
-    scores   = x @ self.memory.T          # (B, T, n_columns) — coalesced matmul
-    weights  = entmax15(scores, dim=-1)   # sparse attention; 95%+ exact zeros
-    retrieved = weights @ self.memory     # (B, T, d_model)   — coalesced matmul
-Both matmuls are tile-friendly (cuBLAS GEMM), so L2 reuse is high regardless of
-n_columns. Gradient flows through both matmuls so `self.memory` learns via
-autograd in addition to (or instead of) the Hebbian EMA writes.
-## Sparsity mechanism
-alpha-entmax with alpha=1.5 (entmax15) is a sparse attention operator that maps
-logit vectors to distributions where many entries are *exactly* zero (not merely
-small). It generalises softmax (alpha=1) and argmax (alpha→∞). At n_columns=1024
-with d_model=64 a random batch typically hits ≥95% zero entries — the key
-property that keeps bandwidth proportional to *attended* columns, not all columns.
-Fallback: if `entmax` is not pip-installed, top-k softmax (k=32) is used instead.
-This is chosen at module-import time — NO runtime branching per forward call.
-## token_ids argument
-token_ids is accepted for API compatibility with the rest of the hydra stack
-(train.py, lightning_module.py call `engram(x, token_ids)`). It is NOT used in
-the retrieval path — the Hopfield path computes dense similarity over the whole
-memory bank, which subsumes any hash-based column selection. Documented here to
-prevent confusion.
-## Hebbian writes (hebbian_boost=False by default)
-With Hopfield retrieval, gradient signals reach self.memory through autograd, so
-Hebbian EMA writes are no longer critical. They are preserved as an *optional*
-boost (hebbian_boost=True) for experiments that want both signals. Default is off.
-## Checkpoint compatibility
-`self.memory` shape (n_columns, d_model) is unchanged, so existing .pt / .ckpt
-files load without modification.
-"""
 from __future__ import annotations
 import torch
 import torch.nn as nn
-# ---------------------------------------------------------------------------
-# Sparse-attention backend — chosen ONCE at import time, no runtime branching.
-# ---------------------------------------------------------------------------
-try:
-    from entmax import entmax15 as _entmax15  # type: ignore[import]
-    def _sparse_attention(scores: torch.Tensor) -> torch.Tensor:
-        """alpha-entmax (alpha=1.5): truly sparse distribution over last dim."""
-        return _entmax15(scores, dim=-1).to(dtype=scores.dtype)
-    _BACKEND = "entmax15"
-except ImportError:  # pragma: no cover — entmax always installed in CI
-    _K = 32  # top-k for fallback
-    def _sparse_attention(scores: torch.Tensor) -> torch.Tensor:  # type: ignore[misc]
-        """Top-k softmax fallback: zero outside the k highest-scoring columns."""
-        topk_vals, topk_idx = scores.topk(_K, dim=-1)
-        topk_w = torch.softmax(topk_vals, dim=-1)
-        weights = torch.zeros_like(scores)
-        weights.scatter_(-1, topk_idx, topk_w.to(dtype=weights.dtype))
-        return weights
-    _BACKEND = "topk32"
 class GPUEngram(nn.Module):
-    """GPU Engram: Sparse Modern Hopfield retrieval.
-    Args:
-        d_model:       Model dimension — must match the surrounding transformer.
-        n_columns:     Number of memory columns (key-value pairs). Safe at 32 768
-                       with the matmul path; the old scatter-gather had an L2
-                       cliff above ~4 096.
-        max_ngram:     Retained for API compatibility; unused in retrieval path.
-        hebbian_boost: If True, also run a Hebbian EMA write on the memory bank
-                       during training (old behaviour, now optional). Default False.
     """
     def __init__(
@@ -101,20 +31,15 @@ class GPUEngram(nn.Module):
         self.n_columns = n_columns
         self.max_ngram = max_ngram
         self.hebbian_boost = hebbian_boost
-        # Shape unchanged from original — existing checkpoints load cleanly.
         self.memory = nn.Parameter(torch.randn(n_columns, d_model) * 0.01)
         self.gate = nn.Linear(d_model, 1, bias=True)
-        nn.init.constant_(self.gate.bias, 0.0)  # START OPEN
-        # Retained for any external code that reads these attrs.
         self.primes = [2654435761, 2246822519, 3266489917]
         self.hebbian_lr = 0.01
-    # ------------------------------------------------------------------
-    # _hash: retained for API/checkpoint compat; unused in forward below.
-    # ------------------------------------------------------------------
     def _hash(self, token_ids: torch.Tensor) -> torch.Tensor:
-        """N-gram hash → column index (kept for backward-compat; not used in retrieval)."""
         B, T = token_ids.shape
         h = token_ids * self.primes[0]
         if T > 1:
@@ -127,44 +52,103 @@ class GPUEngram(nn.Module):
             h = h ^ (shifted2 * self.primes[2])
         return h % self.n_columns
-    # ------------------------------------------------------------------
-    # forward
-    # ------------------------------------------------------------------
-    def forward(self, x: torch.Tensor, token_ids: torch.Tensor):
-        """Hopfield retrieve + soft gate + residual.
-        Args:
-            x:         (B, T, d_model) — input activations.
-            token_ids: (B, T) — token indices. Accepted for API compatibility;
-                       NOT used in the retrieval path (see module docstring).
-        Returns:
-            (x + alpha * retrieved, hit_rate)
-            - x + alpha * retrieved: (B, T, d_model)
-            - hit_rate: scalar tensor — fraction of gate values > 0.1
-        """
-        # ---- 1. Similarity scores (coalesced GEMM) ----------------------
-        # scores[b, t, c] = dot(x[b,t], memory[c])
-        scores = x @ self.memory.T  # (B, T, n_columns)
-        # ---- 2. Sparse attention weights --------------------------------
-        # _sparse_attention is fixed at import time (entmax15 or top-k).
-        weights = _sparse_attention(scores)  # (B, T, n_columns), many exact zeros
-        # ---- 3. Retrieved vector (coalesced GEMM) -----------------------
-        retrieved = weights @ self.memory  # (B, T, d_model)
-        # ---- 4. Soft gate (unchanged) -----------------------------------
-        alpha = torch.sigmoid(self.gate(x))  # (B, T, 1)
-        # ---- 5. Optional Hebbian EMA write ------------------------------
         if self.training and self.hebbian_boost:
             with torch.no_grad():
-                # Reuse the hash-based indices for the write target (sparse update).
                 indices = self._hash(token_ids)
-                flat_idx = indices.reshape(-1)           # (B*T,)
-                flat_x = x.detach().reshape(-1, x.shape[-1])  # (B*T, d_model)
                 mem_dtype = self.memory.data.dtype
                 updates = (
                     self.hebbian_lr * flat_x
@@ -172,6 +156,5 @@ class GPUEngram(nn.Module):
                 ).to(mem_dtype)
                 self.memory.data.index_add_(0, flat_idx, updates)
-        # ---- 6. Residual + hit_rate -------------------------------------
         hit_rate = (alpha.detach() > 0.1).float().mean()
         return x + alpha * retrieved, hit_rate

+"""GPU Engram — Top-k Sparse Hopfield retrieval with optional Cantor/SDR nerve constraint."""
 from __future__ import annotations
+import os
 import torch
 import torch.nn as nn
+_ENGRAM_TOPK = int(os.environ.get("HYDRA_ENGRAM_TOPK", "64"))
 class GPUEngram(nn.Module):
+    """GPU Engram: Top-k Sparse Hopfield retrieval.
+    Default `routing_mode=flat` preserves the existing full-memory top-k path.
+    `cantor_sdr` constrains candidates to the current Cantor leaf shard and SDR
+    active offsets. `auto` only uses that local path when it is cheaper than the
+    full score matrix (`K * d_model < n_columns`).
     """
     def __init__(
         self.n_columns = n_columns
         self.max_ngram = max_ngram
         self.hebbian_boost = hebbian_boost
         self.memory = nn.Parameter(torch.randn(n_columns, d_model) * 0.01)
         self.gate = nn.Linear(d_model, 1, bias=True)
+        nn.init.constant_(self.gate.bias, 0.0)
+        self.topk_k = min(_ENGRAM_TOPK, n_columns)
         self.primes = [2654435761, 2246822519, 3266489917]
         self.hebbian_lr = 0.01
+        self.routing_mode = os.environ.get("HYDRA_ENGRAM_ROUTING", "auto").lower()
     def _hash(self, token_ids: torch.Tensor) -> torch.Tensor:
         B, T = token_ids.shape
         h = token_ids * self.primes[0]
         if T > 1:
             h = h ^ (shifted2 * self.primes[2])
         return h % self.n_columns
+    def _validate_active_indices(self, sdr_active_indices: torch.Tensor, x: torch.Tensor) -> None:  # Raise ValueError unless the tensor passes the dtype, rank and width checks below; `x` is not read in this body.
+        if not torch.is_floating_point(sdr_active_indices) and sdr_active_indices.dtype != torch.bool:  # dtype is neither floating-point nor bool: accepted
+            pass
+        else:  # floating-point or bool dtype: rejected
+            raise ValueError("Engram Cantor/SDR routing expects compact active indices, not a dense SDR mask")
+        if sdr_active_indices.dim() not in (2, 3):  # only rank-2 or rank-3 tensors are accepted
+            raise ValueError("compact active indices must have shape (B,T,K) or (B*T,K)")
+        # Dense SDR masks arrive with K ~= n_bits; compact buffers are small
+        # (retina target_active or RealityBridge l0_k). Refuse obviously dense
+        # masks so forced cantor_sdr cannot silently route 0/1 values as offsets.
+        if sdr_active_indices.shape[-1] > 1024 or sdr_active_indices.shape[-1] > self.n_columns:  # last dim wider than 1024, or wider than n_columns: rejected
+            raise ValueError("Engram Cantor/SDR routing expects compact active indices, not a dense SDR mask")
+    def _cantor_sdr_candidates(
+        self,
+        sdr_active_indices: torch.Tensor,
+        cantor_leaf_ids: torch.Tensor,
+        n_leaves: int,
+    ) -> torch.Tensor:
+        """Map SDR active offsets into each Cantor leaf's Engram column shard."""
+        self._validate_active_indices(sdr_active_indices, cantor_leaf_ids)  # cantor_leaf_ids fills the validator's `x` slot
+        if sdr_active_indices.dim() == 2:  # 2-D input: regroup rows using the two-element shape unpacked from cantor_leaf_ids
+            B, T = cantor_leaf_ids.shape
+            sdr_active_indices = sdr_active_indices.view(B, T, -1)
+        sdr = sdr_active_indices.to(device=cantor_leaf_ids.device, dtype=torch.long)  # moved to the leaf-id device and cast to int64
+        leaves = cantor_leaf_ids.to(dtype=torch.long).clamp(min=0, max=max(0, n_leaves - 1))  # int64 leaf ids clamped into [0, max(0, n_leaves - 1)]
+        cols_per_leaf = max(1, self.n_columns // max(1, n_leaves))  # integer shard width; both max(1, ...) guards keep it >= 1 even when n_leaves <= 0
+        offsets = sdr.remainder(cols_per_leaf)  # wrap every index into the shard width
+        base = leaves.unsqueeze(-1) * cols_per_leaf  # first column of each leaf's shard, with a trailing axis for broadcasting
+        return (base + offsets).clamp(max=self.n_columns - 1)  # upper clamp: no returned index exceeds n_columns - 1
+    def _flat_retrieve(self, x: torch.Tensor) -> torch.Tensor:  # Score x against every row of self.memory and return a softmax-weighted mix of the top-k rows.
+        scores = x @ self.memory.T  # dot product of x with each memory row
+        topk_vals, topk_idx = scores.topk(self.topk_k, dim=-1)  # keep the self.topk_k highest scores along the last dim
+        topk_w = torch.softmax(topk_vals, dim=-1)  # softmax over the kept scores only
+        selected_mem = self.memory[topk_idx]  # gather the memory rows at the kept indices
+        return torch.einsum('btk,btkd->btd', topk_w, selected_mem)  # weighted sum of the gathered rows over the k axis
+    def _cantor_sdr_retrieve(
+        self,
+        x: torch.Tensor,
+        sdr_active_indices: torch.Tensor,
+        cantor_leaf_ids: torch.Tensor,
+        cantor_n_leaves: int,
+    ) -> torch.Tensor:  # Retrieve from a per-position candidate set of memory rows instead of scoring the whole memory.
+        candidates = self._cantor_sdr_candidates(  # memory row indices built from the SDR indices and leaf ids
+            sdr_active_indices,
+            cantor_leaf_ids,
+            n_leaves=cantor_n_leaves,
+        )
+        cand_mem = self.memory[candidates]  # gather the candidate memory rows
+        scores = torch.einsum('btd,btkd->btk', x, cand_mem)  # dot product of x with each of its candidates
+        k = min(self.topk_k, scores.shape[-1])  # cannot keep more entries than there are candidates
+        topk_vals, local_idx = scores.topk(k, dim=-1)  # local_idx indexes into the candidate axis, not into self.memory
+        topk_w = torch.softmax(topk_vals, dim=-1)  # softmax over the kept scores only
+        global_idx = candidates.gather(-1, local_idx)  # translate candidate positions back to memory row indices
+        selected_mem = self.memory[global_idx]  # second gather: only the kept rows
+        return torch.einsum('btk,btkd->btd', topk_w, selected_mem)  # weighted sum of the kept rows over the k axis
+    def forward(
+        self,
+        x: torch.Tensor,
+        token_ids: torch.Tensor,
+        sdr_active_indices: torch.Tensor | None = None,
+        cantor_leaf_ids: torch.Tensor | None = None,
+        cantor_n_leaves: int | None = None,
+    ):
+        B, T, D = x.shape
+        mode = self.routing_mode
+        use_cantor = (
+            mode in {"cantor_sdr", "auto"}
+            and sdr_active_indices is not None
+            and cantor_leaf_ids is not None
+            and cantor_n_leaves is not None
+        )
+        if mode == "auto" and use_cantor:
+            k_active = sdr_active_indices.shape[-1]
+            # Compare actual retrieval candidates against the full-memory scan.
+            # The previous `(k_active * D) < n_columns` check mixed candidate
+            # count with feature dimension, so d256/k64 fell back to flat
+            # retrieval even though Cantor/SDR scores only 64 candidates vs
+            # 8k-16k memory columns. That kept required subsystems active but
+            # spent tens of billions of extra MACs per forward.
+            use_cantor = k_active < self.n_columns
+        if use_cantor and mode in {"cantor_sdr", "auto"}:
+            retrieved = self._cantor_sdr_retrieve(x, sdr_active_indices, cantor_leaf_ids, cantor_n_leaves)
+        else:
+            retrieved = self._flat_retrieve(x)
+        alpha = torch.sigmoid(self.gate(x))
         if self.training and self.hebbian_boost:
             with torch.no_grad():
                 indices = self._hash(token_ids)
+                flat_idx = indices.reshape(-1)
+                flat_x = x.detach().reshape(-1, D)
                 mem_dtype = self.memory.data.dtype
                 updates = (
                     self.hebbian_lr * flat_x
                 ).to(mem_dtype)
                 self.memory.data.index_add_(0, flat_idx, updates)
         hit_rate = (alpha.detach() > 0.1).float().mean()
         return x + alpha * retrieved, hit_rate

overlay/scripts/launch_feather_hf_job.py CHANGED Viewed

@@ -17,8 +17,9 @@ if str(REPO_ROOT) not in sys.path:
 from configs.harness_config import HarnessConfig
 from scripts.hf_routing import resolve_routing
-GPU_FLAVOR = os.environ.get('FEATHER_HF_FLAVOR', 'a10g-large')
-GPU_PROFILE = os.environ.get('FEATHER_GPU_PROFILE', GPU_FLAVOR)
 GPU_ARCH_BY_FLAVOR = {
     'a10g-small': ('sm_86', '8.6'),
     'a10g-large': ('sm_86', '8.6'),
@@ -32,15 +33,12 @@ GPU_ARCH_BY_FLAVOR = {
     'h200x4': ('sm_90a', '9.0'),
     'h200x8': ('sm_90a', '9.0'),
 }
-HTM_CUDA_ARCH, TORCH_CUDA_ARCH = GPU_ARCH_BY_FLAVOR.get(GPU_FLAVOR, ('sm_86', '8.6'))
 HF_NAMESPACE = os.environ.get('FEATHER_HF_NAMESPACE')
 DEFAULT_IMAGE = os.environ.get('FEATHER_HF_IMAGE', 'ghcr.io/slapglif/feather-hf-runtime:a10g-large')
 IMAGE_DIR = Path(__file__).resolve().parents[1] / 'hf_jobs' / 'feather_h200_image'
 TIMEOUT = os.environ.get('FEATHER_HF_JOB_TIMEOUT', '12h')
 SPACE_PRIVATE = os.environ.get('FEATHER_HF_SPACE_PRIVATE', '1') == '1'
 OUTPUT_PRIVATE = os.environ.get('FEATHER_HF_OUTPUT_PRIVATE', '1') == '1'
-TARGET_SHARDS = os.environ.get('HYDRA_TARGET_SHARDS', '2048')
-TIME_BUDGET = os.environ.get('HYDRA_TIME_BUDGET', '43200')
 DOWNLOAD_WORKERS = os.environ.get('HYDRA_DOWNLOAD_WORKERS', '16')
 CKPT_INTERVAL = os.environ.get('HYDRA_CKPT_INTERVAL', '1000')
 DRY_RUN = os.environ.get('FEATHER_HF_DRY_RUN', '0') == '1'
@@ -52,6 +50,10 @@ SKIP_UPLOAD = os.environ.get('FEATHER_HF_SKIP_UPLOAD', '0') == '1'
 SYNC_OVERLAY = os.environ.get('FEATHER_HF_SYNC_OVERLAY', '1') == '1'
 def should_enable_fast_start_streaming(target_shards: str, time_budget: str) -> bool:
     """Use streaming data path for short-budget launch profiles."""
     try:
@@ -62,6 +64,22 @@ def should_enable_fast_start_streaming(target_shards: str, time_budget: str) ->
     return shards > 0 and shards <= 256 and budget > 0 and budget <= 1800
 def sync_overlay_from_repo() -> None:
     """Refresh Space overlay with required project files."""
     overlay = IMAGE_DIR / 'overlay'
@@ -120,23 +138,29 @@ def sync_overlay_from_repo() -> None:
 def load_hf_token() -> str | None:
     """Load a Hugging Face token without printing or persisting secret values."""
     for env_name in ('HF_TOKEN', 'HUGGINGFACE_HUB_TOKEN'):
         token = os.environ.get(env_name)
         if token:
-            return token
     token_file = Path(os.environ.get('HF_TOKEN_PATH', Path.home() / '.cache' / 'huggingface' / 'token')).expanduser()
     try:
         token = token_file.read_text(encoding='utf-8').strip()
     except FileNotFoundError:
-        return None
     except OSError:
-        return None
-    return token or None
 def require_token() -> str:
-    token = load_hf_token()
     if not token:
         raise SystemExit(
             'HF token required: set HF_TOKEN/HUGGINGFACE_HUB_TOKEN or run `huggingface-cli login` '
@@ -192,9 +216,65 @@ def wait_for_space(api: HfApi, repo_id: str, timeout_s: int = 1800) -> None:
         time.sleep(20)
 def main() -> int:
-    token = require_token()
     routing = resolve_routing(token=token)
     api = HfApi(token=token)
     secondary_gates = HarnessConfig().to_secondary_gates()
@@ -205,6 +285,13 @@ def main() -> int:
     print(f'[launch] retina_cache_repo={routing.retina_cache_repo}', flush=True)
     print(f'[launch] target_shards={TARGET_SHARDS} time_budget={TIME_BUDGET} timeout={TIMEOUT}', flush=True)
     print(f'[launch] namespace={routing.job_namespace}', flush=True)
     print(f'[launch] flavor={GPU_FLAVOR} profile={GPU_PROFILE} htm_cuda_arch={HTM_CUDA_ARCH} torch_cuda_arch={TORCH_CUDA_ARCH}', flush=True)
     print(f'[launch] image_mode={"space" if USE_SPACE_IMAGE else "ghcr"}', flush=True)
     print(f'[launch] secondary_gates={json.dumps(secondary_gates, sort_keys=True)}', flush=True)
@@ -217,6 +304,8 @@ def main() -> int:
             print('[launch] auto-enabled HYDRA_USE_NEMOTRON=1 for short-budget fast-start profile', flush=True)
         if 'HYDRA_LOCAL_SHARDS_ONLY' not in os.environ and fast_start_streaming:
             print('[launch] auto-enabled HYDRA_LOCAL_SHARDS_ONLY=0 for Nemotron streaming fast-start profile', flush=True)
         print('[launch] dry-run mode; skipping repo creation, upload, and job submission', flush=True)
         return 0
@@ -290,29 +379,7 @@ def main() -> int:
     # keep throughput path enabled. Caller can explicitly override each key by
     # setting it in the parent environment.
     if GPU_FLAVOR.startswith('a10'):
-        _a10_defaults = {
-            'HYDRA_MUON_COMPILE': '0',
-            'HYDRA_FORCE_HTM_CPU': '1',
-            'HYDRA_INERT_MAMBA': '1',
-            'HYDRA_ALLOW_SYNTHETIC_RETINA': '1',
-            'HYDRA_FASTPATH': '1',
-        }
-        for _k, _default in _a10_defaults.items():
-            if _k in os.environ:
-                env[_k] = os.environ[_k]
-            else:
-                env.setdefault(_k, _default)
-        if env.get('HYDRA_INERT_MAMBA') == '0' and 'HYDRA_FASTPATH' not in os.environ:
-            env['HYDRA_FASTPATH'] = '0'
-        print(
-            '[launch] applied A10 env profile '
-            f"(HYDRA_MUON_COMPILE={env['HYDRA_MUON_COMPILE']}, "
-            f"HYDRA_FORCE_HTM_CPU={env['HYDRA_FORCE_HTM_CPU']}, "
-            f"HYDRA_INERT_MAMBA={env['HYDRA_INERT_MAMBA']}, "
-            f"HYDRA_ALLOW_SYNTHETIC_RETINA={env['HYDRA_ALLOW_SYNTHETIC_RETINA']}, "
-            f"HYDRA_FASTPATH={env['HYDRA_FASTPATH']})",
-            flush=True,
-        )
     # Pass through any HYDRA_* / FEATHER_* overrides from the caller's env so
     # sweep drivers can set HYDRA_N_LAYER, HYDRA_SDR_TARGET_ACTIVE,
     # HYDRA_LAYER_DIAGNOSTICS, HYDRA_METRICS_OUT, HYDRA_MID_VAL_INTERVAL, etc.

 from configs.harness_config import HarnessConfig
 from scripts.hf_routing import resolve_routing
+TARGET_SHARDS = os.environ.get('HYDRA_TARGET_SHARDS', '2048')
+TIME_BUDGET = os.environ.get('HYDRA_TIME_BUDGET', '43200')
+REQUESTED_GPU_FLAVOR = os.environ.get('FEATHER_HF_FLAVOR', 'a10g-large')
 GPU_ARCH_BY_FLAVOR = {
     'a10g-small': ('sm_86', '8.6'),
     'a10g-large': ('sm_86', '8.6'),
     'h200x4': ('sm_90a', '9.0'),
     'h200x8': ('sm_90a', '9.0'),
 }
 HF_NAMESPACE = os.environ.get('FEATHER_HF_NAMESPACE')
 DEFAULT_IMAGE = os.environ.get('FEATHER_HF_IMAGE', 'ghcr.io/slapglif/feather-hf-runtime:a10g-large')
 IMAGE_DIR = Path(__file__).resolve().parents[1] / 'hf_jobs' / 'feather_h200_image'
 TIMEOUT = os.environ.get('FEATHER_HF_JOB_TIMEOUT', '12h')
 SPACE_PRIVATE = os.environ.get('FEATHER_HF_SPACE_PRIVATE', '1') == '1'
 OUTPUT_PRIVATE = os.environ.get('FEATHER_HF_OUTPUT_PRIVATE', '1') == '1'
 DOWNLOAD_WORKERS = os.environ.get('HYDRA_DOWNLOAD_WORKERS', '16')
 CKPT_INTERVAL = os.environ.get('HYDRA_CKPT_INTERVAL', '1000')
 DRY_RUN = os.environ.get('FEATHER_HF_DRY_RUN', '0') == '1'
 SYNC_OVERLAY = os.environ.get('FEATHER_HF_SYNC_OVERLAY', '1') == '1'
+def _truthy_env(name: str) -> bool:
+    return os.environ.get(name, '0').strip().lower() in {'1', 'true', 'yes', 'on'}
 def should_enable_fast_start_streaming(target_shards: str, time_budget: str) -> bool:
     """Use streaming data path for short-budget launch profiles."""
     try:
     return shards > 0 and shards <= 256 and budget > 0 and budget <= 1800
+def resolve_effective_gpu_flavor(requested_flavor: str, target_shards: str, time_budget: str) -> str:
+    """Keep canary/non-full launches on A10 unless H200 is explicitly allowed."""
+    if (
+        requested_flavor.startswith('h200')
+        and should_enable_fast_start_streaming(target_shards, time_budget)
+        and not _truthy_env('FEATHER_HF_ALLOW_H200_CANARY')
+    ):
+        return os.environ.get('FEATHER_HF_CANARY_FLAVOR', 'a10g-large')
+    return requested_flavor
+GPU_FLAVOR = resolve_effective_gpu_flavor(REQUESTED_GPU_FLAVOR, TARGET_SHARDS, TIME_BUDGET)
+GPU_PROFILE = os.environ.get('FEATHER_GPU_PROFILE', GPU_FLAVOR)
+HTM_CUDA_ARCH, TORCH_CUDA_ARCH = GPU_ARCH_BY_FLAVOR.get(GPU_FLAVOR, ('sm_86', '8.6'))
 def sync_overlay_from_repo() -> None:
     """Refresh Space overlay with required project files."""
     overlay = IMAGE_DIR / 'overlay'
 def load_hf_token() -> str | None:
     """Load a Hugging Face token without printing or persisting secret values."""
+    token, _source = load_hf_token_with_source()
+    return token
+def load_hf_token_with_source() -> tuple[str | None, str]:
+    """Load a Hugging Face token and return a non-secret source label."""
     for env_name in ('HF_TOKEN', 'HUGGINGFACE_HUB_TOKEN'):
         token = os.environ.get(env_name)
         if token:
+            return token, 'provided'
     token_file = Path(os.environ.get('HF_TOKEN_PATH', Path.home() / '.cache' / 'huggingface' / 'token')).expanduser()
     try:
         token = token_file.read_text(encoding='utf-8').strip()
     except FileNotFoundError:
+        return None, 'missing'
     except OSError:
+        return None, 'unreadable'
+    return (token, 'token_file') if token else (None, 'empty_file')
 def require_token() -> str:
+    token, _source = load_hf_token_with_source()
     if not token:
         raise SystemExit(
             'HF token required: set HF_TOKEN/HUGGINGFACE_HUB_TOKEN or run `huggingface-cli login` '
         time.sleep(20)
+def _configure_line_buffered_output(stdout=sys.stdout, stderr=sys.stderr) -> None:
+    """Make launch progress visible immediately when stdout/stderr are pipes."""
+    for stream in (stdout, stderr):
+        reconfigure = getattr(stream, 'reconfigure', None)
+        if reconfigure is None:
+            continue
+        try:
+            reconfigure(line_buffering=True)
+        except (TypeError, ValueError):
+            # Some wrapped streams do not support reconfigure at runtime.
+            pass
+def apply_a10_env_profile(env: dict[str, str]) -> None:
+    """Apply operational A10 canary defaults unless caller supplied overrides."""
+    if not GPU_FLAVOR.startswith('a10'):
+        return
+    _a10_defaults = {
+        'HYDRA_MUON_COMPILE': '0',
+        'HYDRA_FORCE_HTM_CPU': '1',
+        'HYDRA_INERT_MAMBA': '1',
+        'HYDRA_HYENA_LAYERS': '0,1,2,3',
+        'HYDRA_DISABLE_FUSED_SDR_TRITON': '1',
+        'HYDRA_ALLOW_SYNTHETIC_RETINA': '1',
+        'HYDRA_FASTPATH': '1',
+    }
+    for _k, _default in _a10_defaults.items():
+        if _k in os.environ:
+            env[_k] = os.environ[_k]
+        else:
+            env.setdefault(_k, _default)
+    if env.get('HYDRA_INERT_MAMBA') == '0' and 'HYDRA_FASTPATH' not in os.environ:
+        env['HYDRA_FASTPATH'] = '0'
+    print(
+        '[launch] applied A10 env profile '
+        f"(HYDRA_MUON_COMPILE={env['HYDRA_MUON_COMPILE']}, "
+        f"HYDRA_FORCE_HTM_CPU={env['HYDRA_FORCE_HTM_CPU']}, "
+        f"HYDRA_INERT_MAMBA={env['HYDRA_INERT_MAMBA']}, "
+        f"HYDRA_HYENA_LAYERS={env['HYDRA_HYENA_LAYERS']}, "
+        f"HYDRA_DISABLE_FUSED_SDR_TRITON={env['HYDRA_DISABLE_FUSED_SDR_TRITON']}, "
+        f"HYDRA_ALLOW_SYNTHETIC_RETINA={env['HYDRA_ALLOW_SYNTHETIC_RETINA']}, "
+        f"HYDRA_FASTPATH={env['HYDRA_FASTPATH']})",
+        flush=True,
+    )
 def main() -> int:
+    _configure_line_buffered_output()
+    print(f'[launch] phase=start dry_run={int(DRY_RUN)} use_space_image={int(USE_SPACE_IMAGE)} skip_upload={int(SKIP_UPLOAD)} sync_overlay={int(SYNC_OVERLAY)}', flush=True)
+    token, token_source = load_hf_token_with_source()
+    if not token:
+        raise SystemExit(
+            'HF token required: set HF_TOKEN/HUGGINGFACE_HUB_TOKEN or run `huggingface-cli login` '
+            'so ~/.cache/huggingface/token exists'
+        )
+    print(f'[launch] phase=token_loaded source={token_source}', flush=True)
     routing = resolve_routing(token=token)
+    print('[launch] phase=routing_resolved', flush=True)
+    print('[launch] phase=api_init', flush=True)
     api = HfApi(token=token)
     secondary_gates = HarnessConfig().to_secondary_gates()
     print(f'[launch] retina_cache_repo={routing.retina_cache_repo}', flush=True)
     print(f'[launch] target_shards={TARGET_SHARDS} time_budget={TIME_BUDGET} timeout={TIMEOUT}', flush=True)
     print(f'[launch] namespace={routing.job_namespace}', flush=True)
+    print(f'[launch] requested_flavor={REQUESTED_GPU_FLAVOR} effective_flavor={GPU_FLAVOR}', flush=True)
+    if REQUESTED_GPU_FLAVOR != GPU_FLAVOR:
+        print(
+            '[launch] cost-aware override: requested h200 for short-budget canary/non-full run; '
+            f'using {GPU_FLAVOR} instead (set FEATHER_HF_ALLOW_H200_CANARY=1 to spend H200)',
+            flush=True,
+        )
     print(f'[launch] flavor={GPU_FLAVOR} profile={GPU_PROFILE} htm_cuda_arch={HTM_CUDA_ARCH} torch_cuda_arch={TORCH_CUDA_ARCH}', flush=True)
     print(f'[launch] image_mode={"space" if USE_SPACE_IMAGE else "ghcr"}', flush=True)
     print(f'[launch] secondary_gates={json.dumps(secondary_gates, sort_keys=True)}', flush=True)
             print('[launch] auto-enabled HYDRA_USE_NEMOTRON=1 for short-budget fast-start profile', flush=True)
         if 'HYDRA_LOCAL_SHARDS_ONLY' not in os.environ and fast_start_streaming:
             print('[launch] auto-enabled HYDRA_LOCAL_SHARDS_ONLY=0 for Nemotron streaming fast-start profile', flush=True)
+        dry_run_env: dict[str, str] = {}
+        apply_a10_env_profile(dry_run_env)
         print('[launch] dry-run mode; skipping repo creation, upload, and job submission', flush=True)
         return 0
     # keep throughput path enabled. Caller can explicitly override each key by
     # setting it in the parent environment.
     if GPU_FLAVOR.startswith('a10'):
+        apply_a10_env_profile(env)
     # Pass through any HYDRA_* / FEATHER_* overrides from the caller's env so
     # sweep drivers can set HYDRA_N_LAYER, HYDRA_SDR_TARGET_ACTIVE,
     # HYDRA_LAYER_DIAGNOSTICS, HYDRA_METRICS_OUT, HYDRA_MID_VAL_INTERVAL, etc.

overlay/scripts/run_domain_expanded_pretrain.sh CHANGED Viewed

@@ -188,7 +188,11 @@ fi
 RESUME_PATH="$(resolve_resume_path || true)"
-export LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
 export HYDRA_TIME_BUDGET="${HYDRA_TIME_BUDGET:-28800}"
 export HYDRA_TARGET_SHARDS="$TARGET_SHARDS"
 export HYDRA_DOWNLOAD_WORKERS="$DOWNLOAD_WORKERS"

 RESUME_PATH="$(resolve_resume_path || true)"
+# Only inject WSL library paths when running on WSL. Cloud containers
+# (H200/A10G HF Jobs) already have their driver paths set by entrypoint.py.
+if [[ -d /usr/lib/wsl/lib ]]; then
+  export LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+fi
 export HYDRA_TIME_BUDGET="${HYDRA_TIME_BUDGET:-28800}"
 export HYDRA_TARGET_SHARDS="$TARGET_SHARDS"
 export HYDRA_DOWNLOAD_WORKERS="$DOWNLOAD_WORKERS"

overlay/subsystems/fused_sdr_project.py CHANGED Viewed

@@ -14,6 +14,8 @@ Backward: Computes grad_weight, grad_delta_u, grad_delta_v via associativity
 VRAM: Forward only materializes out (P×D = 8MB at P=16384, D=256).
      No dense (P, N) or (P, K, D) intermediates.
 """
 import torch
 import triton
 import triton.language as tl
@@ -114,9 +116,11 @@ class FusedSDRProject(torch.autograd.Function):
         out = torch.empty(P, D, device=active.device, dtype=sdr_proj_weight.dtype)
-        if not active.is_cuda:
-            # Local CPU validation has no Triton driver. Keep the same custom
-            # autograd contract but use a deterministic gather+sum fallback.
             out = wt[active].sum(dim=1).to(dtype=sdr_proj_weight.dtype)
             ctx.save_for_backward(active, token_ids, sdr_proj_weight, delta_u, delta_v)
             return out.view(B, T, D)

 VRAM: Forward only materializes out (P×D = 8MB at P=16384, D=256).
      No dense (P, N) or (P, K, D) intermediates.
 """
+import os
 import torch
 import triton
 import triton.language as tl
         out = torch.empty(P, D, device=active.device, dtype=sdr_proj_weight.dtype)
+        if (not active.is_cuda) or os.environ.get("HYDRA_DISABLE_FUSED_SDR_TRITON", "0") == "1":
+            # Local CPU validation and A10-safe canaries may have no usable
+            # Triton driver even when torch CUDA itself is available. Keep the
+            # same custom autograd contract but use a deterministic gather+sum
+            # fallback.
             out = wt[active].sum(dim=1).to(dtype=sdr_proj_weight.dtype)
             ctx.save_for_backward(active, token_ids, sdr_proj_weight, delta_u, delta_v)
             return out.view(B, T, D)

overlay/subsystems/sdr_semantic.py CHANGED Viewed

@@ -46,10 +46,19 @@ class _SDRSTE(torch.autograd.Function):
         flat_grad = grad_out.reshape(B * T, n_bits).to(delta_v.dtype)
         flat_ids = token_ids.reshape(B * T)
         V = delta_u.shape[0]
-        per_tok = torch.zeros(V, n_bits, device=flat_grad.device, dtype=delta_v.dtype)
-        per_tok.index_add_(0, flat_ids, flat_grad)
-        grad_delta_u = per_tok @ delta_v.t()
-        grad_delta_v = delta_u.t() @ per_tok
         return None, grad_delta_u, grad_delta_v, None
@@ -240,12 +249,25 @@ class SemanticFoldingSDR(nn.Module):
         sdr_binary = sdr_binary.view(B, T, self.n_bits)
         return _SDRSTE.apply(sdr_binary, self.delta_u, self.delta_v, token_ids)
     @torch.no_grad()
     def binary_only(self, token_ids: torch.Tensor) -> torch.Tensor:
         """uint8 retina view — no STE, no autocast cost. For HTM/consumers that
         only need the binary pattern.  Reconstructs dense from CSR indices."""
         B, T = token_ids.shape
-        idx = self._retina_indices[token_ids.reshape(-1)]  # (B*T, K) int16
         sdr = torch.zeros(
             B * T, self.n_bits, dtype=torch.uint8, device=token_ids.device,
         )

         flat_grad = grad_out.reshape(B * T, n_bits).to(delta_v.dtype)
         flat_ids = token_ids.reshape(B * T)
         V = delta_u.shape[0]
+        R = delta_u.shape[1]  # delta_rank — typically 32
+        # OOM fix: old code allocated (V, n_bits) = 4GB buffer via index_add.
+        # Instead, project to rank-R space first (small), then scatter.
+        #   grad_delta_u[t, r] = sum_{pos : flat_ids[pos] = t} (flat_grad[pos] @ delta_v[r])
+        #                      = zeros(V, R).index_add_(0, flat_ids, flat_grad @ delta_v.T)
+        projected = flat_grad @ delta_v.t()  # (B*T, R) — ~1MB at B=8,T=1024,R=32
+        per_tok_u = torch.zeros(V, R, device=flat_grad.device, dtype=delta_v.dtype)
+        per_tok_u.index_add_(0, flat_ids, projected)
+        grad_delta_u = per_tok_u  # (V, R) — ~8MB at V=65536
+        #   grad_delta_v = sum_{pos} outer(delta_u[flat_ids[pos]], flat_grad[pos])
+        #                = delta_u[flat_ids].T @ flat_grad  — no (V, n_bits) intermediate buffer
+        gathered_u = delta_u[flat_ids]  # (B*T, R) — ~1MB
+        grad_delta_v = gathered_u.t() @ flat_grad  # (R, n_bits) — ~2MB
         return None, grad_delta_u, grad_delta_v, None
         sdr_binary = sdr_binary.view(B, T, self.n_bits)
         return _SDRSTE.apply(sdr_binary, self.delta_u, self.delta_v, token_ids)
+    @torch.no_grad()
+    def active_indices(self, token_ids: torch.Tensor) -> torch.Tensor:
+        """Compact int16 Reality Buffer view: (B,T,K) active retina offsets.
+        This is the production discrete bridge for Cantor/Engram routing. It
+        avoids reconstructing dense (B,T,n_bits) masks when consumers only need
+        the L0 support set.
+        """
+        if token_ids.dim() != 2:
+            raise ValueError(f"expected (B, T) token_ids, got shape {tuple(token_ids.shape)}")
+        B, T = token_ids.shape
+        return self._retina_indices[token_ids.reshape(-1)].view(B, T, self.target_active)
     @torch.no_grad()
     def binary_only(self, token_ids: torch.Tensor) -> torch.Tensor:
         """uint8 retina view — no STE, no autocast cost. For HTM/consumers that
         only need the binary pattern.  Reconstructs dense from CSR indices."""
         B, T = token_ids.shape
+        idx = self.active_indices(token_ids).reshape(B * T, self.target_active)
         sdr = torch.zeros(
             B * T, self.n_bits, dtype=torch.uint8, device=token_ids.device,
         )