Spaces:

Jackoatmon
/

feather-a10-runtime

Runtime error

App Files Files Community

Jackoatmon committed on Apr 25

Commit

1981f80

verified ·

1 Parent(s): f2e46e6

Update Feather training runtime image

Browse files

Files changed (37) hide show

overlay/configs/harness_config.py +47 -17
overlay/harness/eval_agent.py +188 -60
overlay/harness/orchestrator.py +16 -13
overlay/htm_rust/build.rs +27 -35
overlay/htm_rust/src/gpu/fused.rs +87 -93
overlay/htm_rust/src/gpu/kernels/htm_fused_step.cu +77 -77
overlay/hydra/engram.py +73 -75
overlay/hydra/eval.py +8 -1
overlay/hydra/model.py +21 -6
overlay/hydra/training.py +46 -5
overlay/prepare_nemotron.py +187 -232
overlay/pyproject.toml +1 -0
overlay/scripts/autoresearch_iter.sh +144 -0
overlay/scripts/benchmark_hyena_stack.py +50 -29
overlay/scripts/export_hpo_priors.py +74 -0
overlay/scripts/hpo_orchestrator.py +319 -0
overlay/scripts/launch_feather_hf_job.py +145 -110
overlay/scripts/long_train.sh +38 -38
overlay/scripts/optuna_hpo.py +725 -0
overlay/scripts/parse_metrics.py +24 -0
overlay/scripts/run_domain_expanded_pretrain.sh +262 -262
overlay/scripts/run_meta.sh +13 -13
overlay/scripts/run_phase1.sh +32 -32
overlay/scripts/run_phase2.sh +25 -25
overlay/scripts/run_tps_gate.sh +23 -0
overlay/scripts/setup.sh +28 -27
overlay/scripts/strip_optimizer_state.py +29 -0
overlay/scripts/sweep_depth_aggregate.py +141 -45
overlay/scripts/sweep_depth_local.sh +62 -62
overlay/scripts/train_champion_12h.sh +50 -0
overlay/scripts/train_champion_5h.sh +45 -0
overlay/scripts/train_champion_resume.sh +38 -0
overlay/scripts/train_champion_resume_clean.sh +43 -0
overlay/scripts/train_champion_v2.sh +54 -0
overlay/scripts/train_champion_warmstart.sh +47 -0
overlay/scripts/wsl_bootstrap_tps.sh +68 -0
overlay/subsystems/htm.py +43 -57

overlay/configs/harness_config.py CHANGED Viewed

@@ -1,10 +1,13 @@
-"""Harness configuration for HYDRA's self-evolving outer loop."""
-from typing import Literal
-from pydantic import BaseModel, Field
-class HarnessConfig(BaseModel):
     """Configuration for the HYDRA harness behavior."""
     # Inner loop
@@ -47,15 +50,19 @@ class HarnessConfig(BaseModel):
         default=5.0, description="Max % regression from best known val_bpb"
     )
-    # Keep/discard criteria
-    primary_metric: str = "val_bpb"
-    secondary_metrics: dict = Field(
-        default_factory=lambda: {
-            "mhc_spectral_norm": {"max": 2.0},
-            "engram_hit_rate": {"min": 0.1},
-            "hestia_quant_error": {"max": 0.05},
-        }
-    )
     # Experiment execution
     experiment_timeout: int = Field(
@@ -73,6 +80,29 @@ class HarnessConfig(BaseModel):
     gate_mhc_spectral_norm: float | None = Field(
         default=None, description="Max mhc_spectral_norm for keep (None=disabled)"
     )
-    gate_engram_hit_rate: float | None = Field(
-        default=None, description="Min engram_hit_rate for keep (None=disabled)"
-    )

+"""Harness configuration for HYDRA's self-evolving outer loop."""
+from typing import Literal
+from pydantic import BaseModel, Field
+type GateThresholds = dict[str, float]
+type GateConfig = dict[str, GateThresholds]
+class HarnessConfig(BaseModel):
     """Configuration for the HYDRA harness behavior."""
     # Inner loop
         default=5.0, description="Max % regression from best known val_bpb"
     )
+    # Keep/discard criteria
+    primary_metric: str = "val_bpb"
+    secondary_metrics: GateConfig = Field(
+        default_factory=lambda: {
+            "mhc_spectral_norm": {"max": 2.0},
+            "engram_hit_rate": {"min": 0.1},
+            "factual_english_score": {"min": 0.5},
+            "instruction_following_score": {"min": 0.5},
+            "distinct_2": {"min": 0.1},
+            "repetition_rate": {"max": 0.2},
+            "hestia_quant_error": {"max": 0.05},
+        }
+    )
     # Experiment execution
     experiment_timeout: int = Field(
     gate_mhc_spectral_norm: float | None = Field(
         default=None, description="Max mhc_spectral_norm for keep (None=disabled)"
     )
+    gate_engram_hit_rate: float | None = Field(
+        default=None, description="Min engram_hit_rate for keep (None=disabled)"
+    )
+    gate_tps_median: float | None = Field(
+        default=None,
+        description="Min steady-state tps_median for keep (None=disabled)",
+    )
+    gate_tps_p10: float | None = Field(
+        default=None,
+        description="Min steady-state tps_p10 for keep (None=disabled)",
+    )
+    def to_secondary_gates(self) -> GateConfig:
+        """Build active keep/discard gates from defaults plus gate_* overrides."""
+        gates = {metric: thresholds.copy() for metric, thresholds in self.secondary_metrics.items()}
+        if self.gate_mhc_spectral_norm is not None:
+            gates.setdefault("mhc_spectral_norm", {})["max"] = self.gate_mhc_spectral_norm
+        if self.gate_engram_hit_rate is not None:
+            gates.setdefault("engram_hit_rate", {})["min"] = self.gate_engram_hit_rate
+        if self.gate_tps_median is not None:
+            gates.setdefault("tps_median", {})["min"] = self.gate_tps_median
+        if self.gate_tps_p10 is not None:
+            gates.setdefault("tps_p10", {})["min"] = self.gate_tps_p10
+        return gates

overlay/harness/eval_agent.py CHANGED Viewed

@@ -1,10 +1,15 @@
-"""Eval agent: parse run.log and extract metrics from training runs."""
-import re
-from dataclasses import dataclass, field
 @dataclass
-class ExperimentResult:
     """Parsed result from a single experiment run.
     All float fields default to 0.0; integer fields default to 0.
@@ -23,19 +28,38 @@ class ExperimentResult:
     peak_vram_mb: float = 0.0
     mfu_percent: float = 0.0
-    # Throughput
-    total_tokens_m: float = 0.0
-    num_steps: int = 0
     # Model shape (echoed by train.py summary block)
     num_params_m: float = 0.0
     n_layer: int = 0
     d_model: int = 0
-    # Secondary health metrics
-    mhc_spectral_norm: float = 0.0
-    engram_hit_rate: float = 0.0
-    sr_bypass_rate: float = 0.0
     # Status
     crashed: bool = False
@@ -56,12 +80,48 @@ _PATTERNS: dict[str, str] = {
     "n_layer": r"^n_layer:\s+(\d+)",
     "d_model": r"^d_model:\s+(\d+)",
     "mhc_spectral_norm": r"^mhc_spectral_norm:\s+([\d.]+)",
-    "engram_hit_rate": r"^engram_hit_rate:\s+([\d.]+)",
-    "sr_bypass_rate": r"^sr_bypass_rate:\s+([\d.]+)",
-}
 # Attributes that should be parsed as int rather than float.
-_INT_ATTRS: frozenset[str] = frozenset({"num_steps", "n_layer", "d_model"})
 def parse_run_log(log_path: str) -> ExperimentResult:
@@ -84,22 +144,60 @@ def parse_run_log(log_path: str) -> ExperimentResult:
         result.error_message = f"Log file not found: {log_path}"
         return result
-    # Detect crash signals in output.
-    if "Traceback" in content or "FAIL" in content or "Error" in content:
-        result.crashed = True
-        lines = content.strip().splitlines()
-        result.error_message = "\n".join(lines[-20:])
-    for attr, pattern in _PATTERNS.items():
-        match = re.search(pattern, content, re.MULTILINE)
-        if match:
-            raw = match.group(1)
-            setattr(result, attr, int(raw) if attr in _INT_ATTRS else float(raw))
-    return result
-def check_secondary_alarms(result: ExperimentResult) -> list[str]:
     """Check secondary metrics against fixed alarm thresholds.
     Args:
@@ -118,19 +216,44 @@ def check_secondary_alarms(result: ExperimentResult) -> list[str]:
         alarms.append(
             f"engram_hit_rate={result.engram_hit_rate:.4f} < 0.1 (memory underused)"
         )
-    if 0 < result.mfu_percent < 10:
-        alarms.append(
-            f"mfu_percent={result.mfu_percent:.2f}% < 10% (GPU underutilized)"
-        )
-    return alarms
-def should_keep(
-    result: ExperimentResult,
-    best_bpb: float,
-    gates: dict | None = None,
-) -> tuple[bool, str]:
     """Decide whether to keep or discard an experiment.
     The primary criterion is strictly lower val_bpb than the current best.
@@ -154,19 +277,24 @@ def should_keep(
     if result.val_bpb >= best_bpb:
         return False, "discard"
-    # Secondary gate checks.
-    if gates:
-        gate_mhc = gates.get("mhc_spectral_norm", {}).get("max")
-        if gate_mhc is not None and result.mhc_spectral_norm > gate_mhc:
-            return (
-                False,
-                f"mhc_spectral_norm {result.mhc_spectral_norm:.4f} > gate {gate_mhc}",
-            )
-        gate_engram = gates.get("engram_hit_rate", {}).get("min")
-        if gate_engram is not None and result.engram_hit_rate < gate_engram:
-            return (
-                False,
-                f"engram_hit_rate {result.engram_hit_rate:.4f} < gate {gate_engram}",
-            )
-    return True, "keep"

+"""Eval agent: parse run.log and extract metrics from training runs."""
+import re
+import statistics
+from dataclasses import dataclass
+type GateThresholds = dict[str, float]
+type GateConfig = dict[str, GateThresholds]
 @dataclass
+class ExperimentResult:
     """Parsed result from a single experiment run.
     All float fields default to 0.0; integer fields default to 0.
     peak_vram_mb: float = 0.0
     mfu_percent: float = 0.0
+    # Throughput
+    total_tokens_m: float = 0.0
+    num_steps: int = 0
+    tps_median: float = 0.0
+    tps_p10: float = 0.0
+    tps_min: float = 0.0
+    tps_max: float = 0.0
+    tps_samples: int = 0
     # Model shape (echoed by train.py summary block)
     num_params_m: float = 0.0
     n_layer: int = 0
     d_model: int = 0
+    # Secondary health metrics
+    mhc_spectral_norm: float = 0.0
+    engram_hit_rate: float = 0.0
+    sr_bypass_rate: float = 0.0
+    # Evaluation breadth metrics
+    factual_english_score: float = 0.0
+    instruction_following_score: float = 0.0
+    distinct_1: float = 0.0
+    distinct_2: float = 0.0
+    repetition_rate: float = 0.0
+    repetition_bigram_rate: float = 0.0
+    calibration_ece: float = 0.0
+    calibration_brier: float = 0.0
+    calibration_accuracy: float = 0.0
+    calibration_tokens: int = 0
+    eval_seed: int = 0
+    eval_seed_group: str = ""
     # Status
     crashed: bool = False
     "n_layer": r"^n_layer:\s+(\d+)",
     "d_model": r"^d_model:\s+(\d+)",
     "mhc_spectral_norm": r"^mhc_spectral_norm:\s+([\d.]+)",
+    "engram_hit_rate": r"^engram_hit_rate:\s+([\d.]+)",
+    "sr_bypass_rate": r"^sr_bypass_rate:\s+([\d.]+)",
+    "factual_english_score": r"^factual_english_score:\s+([\d.]+)",
+    "instruction_following_score": r"^instruction_following_score:\s+([\d.]+)",
+    "distinct_1": r"^distinct_1:\s+([\d.]+)",
+    "distinct_2": r"^distinct_2:\s+([\d.]+)",
+    "repetition_rate": r"^repetition_rate:\s+([\d.]+)",
+    "repetition_bigram_rate": r"^repetition_bigram_rate:\s+([\d.]+)",
+    "calibration_ece": r"^calibration_ece:\s+([\d.]+)",
+    "calibration_brier": r"^calibration_brier:\s*([\d.]+)",
+    "calibration_accuracy": r"^calibration_accuracy:\s+([\d.]+)",
+    "calibration_tokens": r"^calibration_tokens:\s+(\d+)",
+    "eval_seed": r"^eval_seed:\s+(\d+)",
+    "eval_seed_group": r"^eval_seed_group:\s+(.+)",
+}
 # Attributes that should be parsed as int rather than float.
+_INT_ATTRS: frozenset[str] = frozenset(
+    {
+        "num_steps",
+        "n_layer",
+        "d_model",
+        "calibration_tokens",
+        "eval_seed",
+    }
+)
+_STR_ATTRS: frozenset[str] = frozenset({"eval_seed_group"})
+_STEP_TPS_PATTERN = re.compile(r"step=(\d+).*?\btps=(\d+)\b")
+_TPS_PATTERN = re.compile(r"\btps=(\d+)\b")
+def _percentile_linear(sorted_values: list[float], pct: float) -> float:
+    """Compute percentile via linear interpolation (0 <= pct <= 100)."""
+    if not sorted_values:
+        return 0.0
+    if len(sorted_values) == 1:
+        return sorted_values[0]
+    rank = (len(sorted_values) - 1) * (pct / 100.0)
+    lo = int(rank)
+    hi = min(lo + 1, len(sorted_values) - 1)
+    frac = rank - lo
+    return sorted_values[lo] * (1.0 - frac) + sorted_values[hi] * frac
 def parse_run_log(log_path: str) -> ExperimentResult:
         result.error_message = f"Log file not found: {log_path}"
         return result
+    # Detect crash signals in output. Keep this strict to avoid false positives
+    # from benign log lines that include "error" in a non-fatal context.
+    if (
+        "Traceback" in content
+        or "\nFAIL\n" in content
+        or "[TPS_GUARD] FAIL" in content
+        or "raise SystemExit(1)" in content
+    ):
+        result.crashed = True
+        lines = content.strip().splitlines()
+        result.error_message = "\n".join(lines[-20:])
+    for attr, pattern in _PATTERNS.items():
+        match = re.search(pattern, content, re.MULTILINE)
+        if match:
+            raw = match.group(1)
+            if attr in _INT_ATTRS:
+                setattr(result, attr, int(raw))
+            elif attr in _STR_ATTRS:
+                setattr(result, attr, raw.strip())
+            else:
+                setattr(result, attr, float(raw))
+    warmup_steps = 10
+    warmup_match = re.search(r"\[TPS_GUARD\] enabled .*?warmup_steps=(\d+)", content)
+    if warmup_match:
+        warmup_steps = int(warmup_match.group(1))
+    step_tps_samples: list[tuple[int, int]] = []
+    for m in _STEP_TPS_PATTERN.finditer(content):
+        step_tps_samples.append((int(m.group(1)), int(m.group(2))))
+    tps_values: list[float] = []
+    if step_tps_samples:
+        for step, tps in step_tps_samples:
+            if step >= warmup_steps:
+                tps_values.append(float(tps))
+        if not tps_values:
+            tps_values = [float(tps) for _, tps in step_tps_samples]
+    else:
+        tps_values = [float(m.group(1)) for m in _TPS_PATTERN.finditer(content)]
+    if tps_values:
+        sorted_tps = sorted(tps_values)
+        result.tps_samples = len(tps_values)
+        result.tps_median = float(statistics.median(tps_values))
+        result.tps_p10 = float(_percentile_linear(sorted_tps, 10.0))
+        result.tps_min = float(sorted_tps[0])
+        result.tps_max = float(sorted_tps[-1])
+    return result
+def check_secondary_alarms(result: ExperimentResult) -> list[str]:
     """Check secondary metrics against fixed alarm thresholds.
     Args:
         alarms.append(
             f"engram_hit_rate={result.engram_hit_rate:.4f} < 0.1 (memory underused)"
         )
+    if 0 < result.mfu_percent < 10:
+        alarms.append(
+            f"mfu_percent={result.mfu_percent:.2f}% < 10% (GPU underutilized)"
+        )
+    if result.calibration_ece > 0.35:
+        alarms.append(
+            f"calibration_ece={result.calibration_ece:.4f} > 0.35 (poor calibration)"
+        )
+    if result.tps_median > 0 and result.tps_median < 50000:
+        alarms.append(
+            f"tps_median={result.tps_median:.0f} < 50000 (throughput below A10 objective)"
+        )
+    return alarms
+def _check_gate(
+    result: ExperimentResult,
+    gates: GateConfig,
+    metric: str,
+) -> tuple[bool, str] | None:
+    """Evaluate a single min/max gate against an ExperimentResult metric."""
+    gate = gates.get(metric, {})
+    value = getattr(result, metric)
+    max_value = gate.get("max")
+    if max_value is not None and value > max_value:
+        return False, f"{metric} {value:.4f} > gate {max_value}"
+    min_value = gate.get("min")
+    if min_value is not None and value < min_value:
+        return False, f"{metric} {value:.4f} < gate {min_value}"
+    return None
+def should_keep(
+    result: ExperimentResult,
+    best_bpb: float,
+    gates: GateConfig | None = None,
+) -> tuple[bool, str]:
     """Decide whether to keep or discard an experiment.
     The primary criterion is strictly lower val_bpb than the current best.
     if result.val_bpb >= best_bpb:
         return False, "discard"
+    # Secondary gate checks.
+    if gates:
+        gate_metrics = (
+            "mhc_spectral_norm",
+            "engram_hit_rate",
+            "factual_english_score",
+            "instruction_following_score",
+            "distinct_1",
+            "distinct_2",
+            "repetition_rate",
+            "repetition_bigram_rate",
+            "calibration_ece",
+            "tps_median",
+            "tps_p10",
+        )
+        for metric in gate_metrics:
+            gate_result = _check_gate(result, gates, metric)
+            if gate_result is not None:
+                return gate_result
+    return True, "keep"

overlay/harness/orchestrator.py CHANGED Viewed

@@ -20,11 +20,12 @@ provides the infrastructure ("rails") that the autoresearch loop runs on.
 """
 import argparse
 import csv
-import os
-import subprocess
-import time
-from harness.eval_agent import ExperimentResult, check_secondary_alarms, parse_run_log, should_keep
 from harness.git_utils import REPO_DIR, commit_all, current_commit_short, reset_to
 from harness.health_monitor import check_health, reset_peak_stats
 from harness.meta_agent import run_meta_iteration
@@ -144,12 +145,12 @@ def run_experiment(timeout: int = 600) -> str:
 # ---------------------------------------------------------------------------
-def run_loop(
-    meta_interval: int = 20,
-    max_experiments: int | None = None,
-    experiment_timeout: int = 600,
-    secondary_gates: dict | None = None,
-) -> None:
     """Run the HYDRA autoresearch loop.
     This function runs indefinitely (or until ``max_experiments`` is reached
@@ -162,8 +163,10 @@ def run_loop(
         secondary_gates: Optional gate thresholds forwarded to
             :func:`~harness.eval_agent.should_keep`.
     """
-    init_results_tsv()
-    best_bpb = _load_best_bpb()
     experiment_num = count_experiments()
     print(

 """
 import argparse
 import csv
+import os
+import subprocess
+import time
+from configs.harness_config import HarnessConfig
+from harness.eval_agent import ExperimentResult, check_secondary_alarms, parse_run_log, should_keep
 from harness.git_utils import REPO_DIR, commit_all, current_commit_short, reset_to
 from harness.health_monitor import check_health, reset_peak_stats
 from harness.meta_agent import run_meta_iteration
 # ---------------------------------------------------------------------------
+def run_loop(
+    meta_interval: int = 20,
+    max_experiments: int | None = None,
+    experiment_timeout: int = 600,
+    secondary_gates: dict[str, dict[str, float]] | None = None,
+) -> None:
     """Run the HYDRA autoresearch loop.
     This function runs indefinitely (or until ``max_experiments`` is reached
         secondary_gates: Optional gate thresholds forwarded to
             :func:`~harness.eval_agent.should_keep`.
     """
+    init_results_tsv()
+    if secondary_gates is None:
+        secondary_gates = HarnessConfig().to_secondary_gates()
+    best_bpb = _load_best_bpb()
     experiment_num = count_experiments()
     print(

overlay/htm_rust/build.rs CHANGED Viewed

@@ -26,39 +26,37 @@ fn main() {
         return;
     }
-    // Kernels to compile. Each .cu file → one .ptx file, embedded by name.
-    // htm_fused_step currently requires Hopper-only cluster APIs (sm_90+).
-    let mut kernels: Vec<&str> = vec![
-        "sp_overlap",
-        "sp_topk",
-        "sp_learn",
-        "sp_duty",
-        "sp_boost_fused",
         "tm_predict",
         "tm_activate",
         "tm_learn",
-        "tm_punish",
-        "tm_grow",
-        "tm_anomaly",
-        "tm_reset",
-    ];
     let kernels_dir = PathBuf::from("src/gpu/kernels");
-    for k in &kernels {
-        let src = kernels_dir.join(format!("{k}.cu"));
-        println!("cargo:rerun-if-changed={}", src.display());
-    }
-    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR"));
-    let arch = env::var("HTM_CUDA_ARCH").unwrap_or_else(|_| "sm_90a".into());
-    let fused_supported = arch.starts_with("sm_90");
-    if fused_supported {
-        kernels.push("htm_fused_step");
-    }
-    println!(
-        "cargo:rustc-env=HTM_GPU_FUSED_AVAILABLE={}",
-        if fused_supported { "1" } else { "0" }
-    );
     let nvcc = find_nvcc();
     println!("cargo:warning=htm_rust: nvcc = {nvcc}");
@@ -81,7 +79,7 @@ fn main() {
     // than the nvcc toolchain. Set HTM_PTX_VERSION to e.g. "7.8" or "8.0".
     let ptx_version_override = env::var("HTM_PTX_VERSION").ok();
-    for k in kernels {
         let src = kernels_dir.join(format!("{k}.cu"));
         let ptx = out_dir.join(format!("{k}.ptx"));
         if !src.exists() {
@@ -129,13 +127,7 @@ fn main() {
             std::fs::write(&ptx, patched)
                 .unwrap_or_else(|e| panic!("write {} failed: {e}", ptx.display()));
         }
-    }
-    if !fused_supported {
-        let fused_ptx = out_dir.join("htm_fused_step.ptx");
-        std::fs::write(&fused_ptx, "// fused kernel disabled for this CUDA arch\n")
-            .unwrap_or_else(|e| panic!("write {} failed: {e}", fused_ptx.display()));
-    }
     // Export OUT_DIR for include_str! in Rust.
     println!(

         return;
     }
+    let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR"));
+    let arch = env::var("HTM_CUDA_ARCH").unwrap_or_else(|_| "sm_90a".into());
+    // Base kernels — compile for any sm_80+ GPU. Each .cu file → one .ptx file.
+    let base_kernels: &[&str] = &[
+        "sp_overlap",
+        "sp_topk",
+        "sp_learn",
+        "sp_duty",
+        "sp_boost_fused",
         "tm_predict",
         "tm_activate",
         "tm_learn",
+        "tm_punish",
+        "tm_grow",
+        "tm_anomaly",
+        "tm_reset",
+    ];
+    // htm_fused_step now compiles for ALL architectures (sm_80+).
+    // On Hopper (sm_90+): uses cluster-distributed shared memory for hot state.
+    // On Ampere (sm_86) and other pre-Hopper: uses global memory reads/writes
+    // with grid.sync() for cross-block synchronization (cooperative launch).
+    let kernels: Vec<&str> = base_kernels.iter().chain(["htm_fused_step"].iter()).copied().collect();
     let kernels_dir = PathBuf::from("src/gpu/kernels");
+    for k in &kernels {
+        let src = kernels_dir.join(format!("{k}.cu"));
+        println!("cargo:rerun-if-changed={}", src.display());
+    }
     let nvcc = find_nvcc();
     println!("cargo:warning=htm_rust: nvcc = {nvcc}");
     // than the nvcc toolchain. Set HTM_PTX_VERSION to e.g. "7.8" or "8.0".
     let ptx_version_override = env::var("HTM_PTX_VERSION").ok();
+    for k in kernels {
         let src = kernels_dir.join(format!("{k}.cu"));
         let ptx = out_dir.join(format!("{k}.ptx"));
         if !src.exists() {
             std::fs::write(&ptx, patched)
                 .unwrap_or_else(|e| panic!("write {} failed: {e}", ptx.display()));
         }
+    }
     // Export OUT_DIR for include_str! in Rust.
     println!(

overlay/htm_rust/src/gpu/fused.rs CHANGED Viewed

@@ -132,7 +132,12 @@ pub(crate) fn plan_fused_launch(
     grid_cap_override: Option<u32>,
 ) -> Result<FusedLaunchPlan, String> {
     let sm_count = sm_count.max(1);
-    let block_dim_x = 1024u32;
     // Cluster launch path: cooperative launch is not required. Keep the probe
     // result for residency estimation only.
@@ -140,11 +145,10 @@ pub(crate) fn plan_fused_launch(
         eprintln!("[htm_rust] INFO: cooperative launch unsupported; cluster path only.");
     }
-    // Cluster constraint: grid_dim_x must equal the cluster size (16) so that
-    // each region maps to exactly one cluster. `HTM_FUSED_GRID_CAP` can lower
-    // this for debugging but should not exceed 16 for cluster correctness.
     let default_grid_cap = 16u32;
-    let grid_cap = grid_cap_override.unwrap_or(default_grid_cap).min(16);
     let resident_bound = if cooperative_grid_limit > 0 {
         cooperative_grid_limit.max(sm_count * 2)
     } else {
@@ -460,15 +464,21 @@ pub fn launch_fused(
                 return Err(DriverError(ret));
             }
         } else {
-            // Fallback for devices that don't support cluster launch.
-            result::launch_kernel(
                 fused.raw_kernel.function,
-                (grid_x, 1, 1),
-                (block_x, 1, 1),
-                0,
                 cu_stream,
-                &mut kernel_params,
-            )?;
         }
     }
@@ -503,41 +513,29 @@ pub(super) fn launch_fused_batched_raw(
     assert_eq!(anom_per_region.len(), b);
     assert!(b >= 1, "need at least one region");
-    // Reset per-region step_scratch before each launch.
-    for &rp in region_ptrs.iter() {
-        let r = unsafe { &mut *rp };
-        let fused = r
-            .fused_state
-            .as_mut()
-            .expect("launch_fused_batched_raw requires fused_state");
-        let dev = r.sp_gpu.dev_ref().clone();
-        dev.memset_zeros(&mut fused.step_scratch)?;
-        fused.iter_counter = fused.iter_counter.wrapping_add(1);
-    }
     // Shared config — all regions use identical sp/tm parameters.
-    let (grid_x, block_x, function_batched, cu_stream, cu_ctx) = {
-        let r0 = unsafe { &*region_ptrs[0] };
-        let fused = r0
-            .fused_state
-            .as_ref()
-            .expect("launch_fused_batched_raw requires fused_state");
-        (
-            fused.grid_dim_x,
-            fused.block_dim_x,
-            fused.raw_kernel.function_batched,
-            *r0.sp_gpu.dev_ref().cu_stream(),
-            *r0.sp_gpu.dev_ref().cu_primary_ctx(),
-        )
-    };
-    let cfg = {
-        let r = unsafe { &*region_ptrs[0] };
-        let fused = r
-            .fused_state
-            .as_ref()
-            .expect("launch_fused_batched_raw requires fused_state");
-        FusedConfig {
             input_bits: input_bits as u32,
             n_columns: r.sp_gpu.n_columns_accessor() as u32,
             synapses_per_col: r.sp_gpu.synapses_per_col_accessor() as u32,
@@ -562,41 +560,38 @@ pub(super) fn launch_fused_batched_raw(
             initial_perm_i16: r.tm_gpu.initial_perm_i16 as i32,
             t: t as u32,
             learn: if learn { 1 } else { 0 },
-            iter_seed: fused.iter_counter,
-            cooperative_grid_sync: 1,
-        }
-    };
     // Build B FusedPtrs per-region.
-    let mut ptrs_vec: Vec<FusedPtrs> = Vec::with_capacity(b);
-    for i in 0..b {
-        let r = unsafe { &*region_ptrs[i] };
-        let fused = r
-            .fused_state
-            .as_ref()
-            .expect("launch_fused_batched_raw requires fused_state");
-        ptrs_vec.push(FusedPtrs {
-            syn_bit: *r.sp_gpu.syn_bit_accessor().device_ptr(),
-            syn_perm: *r.sp_gpu.syn_perm_accessor().device_ptr(),
-            boost: *r.sp_gpu.boost_accessor().device_ptr(),
-            active_duty: *r.sp_gpu.active_duty_accessor().device_ptr(),
-            inhibition_threshold: *fused.inhibition_threshold.device_ptr(),
-            seg_cell_id: *r.tm_gpu.seg_cell_id_accessor().device_ptr(),
-            seg_syn_count: *r.tm_gpu.seg_syn_count_accessor().device_ptr(),
-            syn_presyn: *r.tm_gpu.syn_presyn_accessor().device_ptr(),
-            tm_syn_perm: *r.tm_gpu.syn_perm_accessor().device_ptr(),
-            cell_seg_count: *r.tm_gpu.cell_seg_count_accessor().device_ptr(),
-            cell_active_a: *fused.cell_active_bits_a.device_ptr(),
-            cell_active_b: *fused.cell_active_bits_b.device_ptr(),
-            cell_winner_a: *fused.cell_winner_bits_a.device_ptr(),
-            cell_winner_b: *fused.cell_winner_bits_b.device_ptr(),
-            inputs: inputs_per_region[i],
-            cols_out: cols_per_region[i],
-            anom_out: anom_per_region[i],
-            barrier_counters: 0u64, // ABI-compat dummy; cluster barrier replaces DLB.
-            step_scratch: *fused.step_scratch.device_ptr(),
-        });
-    }
     // Upload FusedPtrs array to device (B * sizeof(FusedPtrs) bytes).
     // FusedPtrs is repr(C) + DeviceRepr so htod_sync_copy handles it.
@@ -608,14 +603,10 @@ pub(super) fn launch_fused_batched_raw(
     // Grid = (grid_x, B, 1) with cluster_dim=(16,1,1): each region (Y slice)
     // occupies exactly one cluster of 16 blocks. All 8 clusters run concurrently
     // on the H200's 132 SMs (8 × 16 = 128 blocks ≤ 132 SMs).
-    let use_cluster = {
-        let r0 = unsafe { &*region_ptrs[0] };
-        let fused = r0
-            .fused_state
-            .as_ref()
-            .expect("launch_fused_batched_raw requires fused_state");
-        fused.cluster_info.max_cluster_size > 0
-    };
     unsafe {
         result::ctx::set_current(cu_ctx)?;
@@ -653,15 +644,18 @@ pub(super) fn launch_fused_batched_raw(
                 return Err(DriverError(ret));
             }
         } else {
-            // Fallback: plain non-cooperative launch for non-Hopper devices.
-            result::launch_kernel(
                 function_batched,
-                (grid_x, b as u32, 1),
-                (block_x, 1, 1),
-                0,
                 cu_stream,
-                &mut kernel_params,
-            )?;
         }
     }

     grid_cap_override: Option<u32>,
 ) -> Result<FusedLaunchPlan, String> {
     let sm_count = sm_count.max(1);
+    // 1024 threads/block exceeds the register file on Ampere (sm_86: 65536
+    // regs/SM ÷ 1024 = 64 regs/thread; fused kernel needs ~80+). 256
+    // threads/block leaves a budget of 65536 ÷ 256 = 256 regs/thread, which
+    // is above the 255 regs/thread hardware cap and therefore ample.
+    // Compensate with more blocks via cooperative launch. On Hopper (228 KB
+    // smem, 255 regs/thread baseline), 1024 works fine, but 256 is safe
+    // everywhere.
+    let block_dim_x = 256u32;
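     // Cluster launch path: cooperative launch is not required. Keep the probe
     // result for residency estimation only.
     // NOTE(review): the pre-Hopper fallback in launch_fused now calls
     // cuLaunchCooperativeKernel, so cooperative support IS required on that
     // path — confirm this probe handling still matches.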
         eprintln!("[htm_rust] INFO: cooperative launch unsupported; cluster path only.");
     }
+    // Tested grid_cap: 4 blocks = 30ms (too serial), 16 blocks = 10.8ms (parallel wins).
+    // Parallelism in SP overlap + TM predict stages outweighs grid.sync() cost.
     let default_grid_cap = 16u32;
+    let grid_cap = grid_cap_override.unwrap_or(default_grid_cap);
     let resident_bound = if cooperative_grid_limit > 0 {
         cooperative_grid_limit.max(sm_count * 2)
     } else {
                 return Err(DriverError(ret));
             }
         } else {
+            // Pre-Hopper: cooperative kernel launch. The fused kernel uses
+            // grid.sync() for cross-block synchronization which REQUIRES
+            // cuLaunchCooperativeKernel (normal launch silently crashes on
+            // the first grid.sync() call).
+            let ret = sys::lib().cuLaunchCooperativeKernel(
                 fused.raw_kernel.function,
+                grid_x, 1, 1,
+                block_x, 1, 1,
+                0,  // sharedMemBytes
                 cu_stream,
+                kernel_params.as_mut_ptr(),
+            );
+            if ret != sys::CUresult::CUDA_SUCCESS {
+                return Err(DriverError(ret));
+            }
         }
     }
     assert_eq!(anom_per_region.len(), b);
     assert!(b >= 1, "need at least one region");
+    // Reset per-region step_scratch before each launch.
+    for &rp in region_ptrs.iter() {
+        let r = unsafe { &mut *rp };
+        let dev = r.sp_gpu.dev_ref().clone();
+        dev.memset_zeros(&mut r.fused_state.step_scratch)?;
+        r.fused_state.iter_counter = r.fused_state.iter_counter.wrapping_add(1);
+    }
     // Shared config — all regions use identical sp/tm parameters.
+    let (grid_x, block_x, function_batched, cu_stream, cu_ctx) = {
+        let r0 = unsafe { &*region_ptrs[0] };
+        (
+            r0.fused_state.grid_dim_x,
+            r0.fused_state.block_dim_x,
+            r0.fused_state.raw_kernel.function_batched,
+            *r0.sp_gpu.dev_ref().cu_stream(),
+            *r0.sp_gpu.dev_ref().cu_primary_ctx(),
+        )
+    };
+    let cfg = {
+        let r = unsafe { &*region_ptrs[0] };
+        FusedConfig {
             input_bits: input_bits as u32,
             n_columns: r.sp_gpu.n_columns_accessor() as u32,
             synapses_per_col: r.sp_gpu.synapses_per_col_accessor() as u32,
             initial_perm_i16: r.tm_gpu.initial_perm_i16 as i32,
             t: t as u32,
             learn: if learn { 1 } else { 0 },
+            iter_seed: r.fused_state.iter_counter,
+            cooperative_grid_sync: 1,
+        }
+    };
     // Build B FusedPtrs per-region.
+    let ptrs_vec: Vec<FusedPtrs> = (0..b)
+        .map(|i| {
+            let r = unsafe { &*region_ptrs[i] };
+            FusedPtrs {
+                syn_bit: *r.sp_gpu.syn_bit_accessor().device_ptr(),
+                syn_perm: *r.sp_gpu.syn_perm_accessor().device_ptr(),
+                boost: *r.sp_gpu.boost_accessor().device_ptr(),
+                active_duty: *r.sp_gpu.active_duty_accessor().device_ptr(),
+                inhibition_threshold: *r.fused_state.inhibition_threshold.device_ptr(),
+                seg_cell_id: *r.tm_gpu.seg_cell_id_accessor().device_ptr(),
+                seg_syn_count: *r.tm_gpu.seg_syn_count_accessor().device_ptr(),
+                syn_presyn: *r.tm_gpu.syn_presyn_accessor().device_ptr(),
+                tm_syn_perm: *r.tm_gpu.syn_perm_accessor().device_ptr(),
+                cell_seg_count: *r.tm_gpu.cell_seg_count_accessor().device_ptr(),
+                cell_active_a: *r.fused_state.cell_active_bits_a.device_ptr(),
+                cell_active_b: *r.fused_state.cell_active_bits_b.device_ptr(),
+                cell_winner_a: *r.fused_state.cell_winner_bits_a.device_ptr(),
+                cell_winner_b: *r.fused_state.cell_winner_bits_b.device_ptr(),
+                inputs: inputs_per_region[i],
+                cols_out: cols_per_region[i],
+                anom_out: anom_per_region[i],
+                barrier_counters: 0u64,  // ABI-compat dummy; cluster barrier replaces DLB.
+                step_scratch: *r.fused_state.step_scratch.device_ptr(),
+            }
+        })
+        .collect();
     // Upload FusedPtrs array to device (B * sizeof(FusedPtrs) bytes).
     // FusedPtrs is repr(C) + DeviceRepr so htod_sync_copy handles it.
     // Grid = (grid_x, B, 1) with cluster_dim=(16,1,1): each region (Y slice)
     // occupies exactly one cluster of 16 blocks. All 8 clusters run concurrently
     // on the H200's 132 SMs (8 × 16 = 128 blocks ≤ 132 SMs).
+    let use_cluster = {
+        let r0 = unsafe { &*region_ptrs[0] };
+        r0.fused_state.cluster_info.max_cluster_size > 0
+    };
     unsafe {
         result::ctx::set_current(cu_ctx)?;
                 return Err(DriverError(ret));
             }
         } else {
+            // Pre-Hopper: cooperative kernel launch (grid.sync() requires it).
+            let ret = sys::lib().cuLaunchCooperativeKernel(
                 function_batched,
+                grid_x, b as u32, 1,
+                block_x, 1, 1,
+                0,  // sharedMemBytes
                 cu_stream,
+                kernel_params.as_mut_ptr(),
+            );
+            if ret != sys::CUresult::CUDA_SUCCESS {
+                return Err(DriverError(ret));
+            }
         }
     }

overlay/htm_rust/src/gpu/kernels/htm_fused_step.cu CHANGED Viewed

@@ -124,13 +124,21 @@ struct FusedConfig {
 //
 // The flags / expected / phase / cooperative_grid_sync parameters are kept
 // in the signature for call-site compatibility but are unused.
-__device__ static inline void fused_grid_barrier(cg::grid_group /* grid */,
                                                  unsigned int * /* flags — unused */,
                                                  unsigned int /* expected — unused */,
                                                  unsigned int /* phase — unused */,
                                                  unsigned int /* cooperative_grid_sync — unused */) {
     auto cluster = cg::this_cluster();
     cluster.sync();
 }
 __device__ static inline unsigned int warp_sum_u32(unsigned int v) {
@@ -187,17 +195,26 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
     // DSMEM: Cluster-distributed shared memory for hot per-column
     // state (inhibition_threshold, boost, active_duty).
     //
-    // Each block in the cluster owns a contiguous slice of
-    // [my_col_start, my_col_end) columns in its own __shared__
-    // arrays. Any block can peer-read another block's slice via
-    // cluster.map_shared_rank(ptr, owner_block_rank)[offset].
     //
-    // This eliminates 2×n_cols×T GMEM reads per forward call
-    // (read + potential re-read of threshold/boost/duty per timestep).
     // =========================================================
     auto cluster = cg::this_cluster();
     const unsigned int cluster_block_rank = cluster.block_rank();  // 0..cluster_size-1
     const unsigned int cluster_sz         = cluster.num_blocks();  // == gridDim.x (≤16)
     // Partition n_cols evenly across cluster blocks.
     // Each block owns cols_per_block columns starting at my_col_start.
@@ -209,27 +226,27 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
         (my_col_start + cols_per_block < n_cols)
             ? (my_col_start + cols_per_block) : n_cols;        // clamp
     // Cluster-distributed shared memory arrays.
     // Each block holds at most COLS_PER_CLUSTER_BLOCK_MAX floats per array.
     // Peer blocks address into each other's smem via map_shared_rank.
     __shared__ float s_inhib_thr [COLS_PER_CLUSTER_BLOCK_MAX];
     __shared__ float s_boost     [COLS_PER_CLUSTER_BLOCK_MAX];
     __shared__ float s_active_duty[COLS_PER_CLUSTER_BLOCK_MAX];
-    // TMA multicast input staging tile (T9).
-    //
-    // On Hopper (sm_90a), cg::memcpy_async with cluster scope issues a single
-    // TMA DMA that multicasts the source data to all 16 SMs in the cluster
-    // simultaneously — replacing ~16 per-block GMEM reads per timestep with a
-    // single hardware DMA.  After cg::wait(cluster) every SM's s_input_tile
-    // is populated identically without any additional DRAM traffic.
-    //
-    // Fallback: when cfg.input_bits > INPUT_BITS_MAX the tile is bypassed
-    // and each thread reads directly from GMEM (original path).
     //
-    // Alignment: 16-byte aligned to satisfy TMA descriptor requirements.
     __shared__ __align__(16) unsigned char s_input_tile[INPUT_BITS_MAX];
     // Initial GMEM → smem load (reads state from previous forward call).
     // Each block loads only its own slice; tid strides across the slice.
     for (unsigned int c = my_col_start + tid; c < my_col_end; c += blockDim.x) {
@@ -242,6 +259,11 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
     // All blocks in the cluster must finish loading before any block
     // starts reading peer smem inside the T-loop.
     cluster.sync();
     const unsigned int S   = cfg.synapses_per_col;
     const unsigned int cpc = cfg.cells_per_column;
@@ -307,32 +329,19 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
         // Ordering: BARRIER 1 completes before we issue the DMA.
         // The DMA completes before Stage A reads s_input_tile.
         // =========================================================
         const bool use_input_tile = (cfg.input_bits <= INPUT_BITS_MAX);
         if (use_input_tile) {
-            // Thread-block scope async copy: each SM independently loads
-            // its own input tile from GMEM into shared memory.
-            //
-            // NOTE: CUDA 12.1's cooperative_groups::memcpy_async() rejects
-            // cluster_group at compile time (static_assert in async.h:171).
-            // True TMA multicast (single DMA for all 16 SMs in the cluster)
-            // would require raw PTX cp.async.bulk.tensor with multicast mode,
-            // which needs cuTensorMap descriptors on the host side (T11).
-            //
-            // This per-SM path still gives a meaningful win: it converts
-            // the original per-synapse scattered GMEM reads (random access
-            // pattern hitting multiple cache lines) into one sequential DMA
-            // per SM, improving L2 hit rate and hardware prefetcher
-            // effectiveness.  The cluster.sync() below ensures all SMs in
-            // the cluster have finished loading before any SM enters Stage A.
             auto tb = cg::this_thread_block();
             cg::memcpy_async(tb, s_input_tile,
                              inputs + inp_off,
                              cfg.input_bits);
             cg::wait(tb);
-            // Cluster barrier: all 16 SMs must have loaded their tile
-            // before any SM begins reading s_input_tile in Stage A.
             cluster.sync();
         }
         // =========================================================
         // STAGE A: Spatial Pooler
@@ -350,22 +359,31 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
                 float p = syn_perm[base + s];
                 // T9: read from cluster-broadcast tile when available;
                 // fall back to direct GMEM when input_bits > INPUT_BITS_MAX.
                 unsigned int inp_byte = use_input_tile
                     ? (unsigned int)s_input_tile[b]
                     : (unsigned int)inputs[inp_off + b];
                 unsigned int hit = ((inp_byte != 0u) && (p >= cfg.conn_thr)) ? 1u : 0u;
                 local += hit;
             }
             unsigned int overlap = warp_sum_u32(local);
             overlap = __shfl_sync(0xffffffffu, overlap, 0);
-            // Determine which cluster block owns column c and read
-            // boost + threshold from that block's shared memory.
             const unsigned int owner_block  = c / cols_per_block;
             const unsigned int owner_offset = c - owner_block * cols_per_block;
             float boost_val = cluster.map_shared_rank(s_boost,      owner_block)[owner_offset];
             float thr       = cluster.map_shared_rank(s_inhib_thr,  owner_block)[owner_offset];
             float boosted = (float)overlap * boost_val;
             unsigned int is_active = (boosted > thr) ? 1u : 0u;
@@ -383,9 +401,13 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
                 for (unsigned int s = lane; s < S; s += 32u) {
                     unsigned int b = syn_bit[base + s];
                     float p = syn_perm[base + s];
                     unsigned int inp_byte = use_input_tile
                         ? (unsigned int)s_input_tile[b]
                         : (unsigned int)inputs[inp_off + b];
                     if (inp_byte != 0u) {
                         p += cfg.sp_inc;
                         if (p > 1.0f) p = 1.0f;
@@ -398,15 +420,20 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
             }
             // active_duty EMA + threshold adaptation.
-            // Writes go to both peer DSMEM (hot path for next timestep)
-            // and GMEM (persistence across forward calls).
             if (lane == 0) {
                 float ad = cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset];
                 float sample = is_active ? 1.0f : 0.0f;
                 ad = (1.0f - cfg.duty_alpha) * ad + cfg.duty_alpha * sample;
                 // Writeback: peer smem (for next timestep read) + GMEM (persistence).
                 cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad;
                 active_duty[c] = ad;
                 // Threshold steers toward target sparsity.
@@ -415,50 +442,23 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
                 if (new_thr < 0.1f) new_thr = 0.1f;
                 if (new_thr > 1000.0f) new_thr = 1000.0f;
                 // Writeback: peer smem (for next timestep read) + GMEM (persistence).
                 cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] = new_thr;
                 inhibition_threshold[c] = new_thr;
             }
         }
         // ---- DSMEM WRITEBACK SYNC: peer-smem writes must be visible cluster-wide ----
         //
-        // DATA FLOW PROOF (T-loop iteration invariant):
-        //
-        // WRITE SITES (lane==0 inside Stage A per-col loop):
-        //   Line 328: cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad
-        //   Line 338: cluster.map_shared_rank(s_inhib_thr,  owner_block)[owner_offset] = new_thr
-        //
-        // READ SITES (Stage A of the NEXT timestep t+1):
-        //   Line 290: cluster.map_shared_rank(s_boost,      owner_block)[owner_offset]  (read)
-        //   Line 291: cluster.map_shared_rank(s_inhib_thr,  owner_block)[owner_offset]  (read)
-        //   Line 323: cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset]  (read)
-        //
-        // PARTITION MISMATCH (root cause of T8 staleness):
-        //   cols_per_block = ceil(n_cols / cluster_sz)   [smem partition]
-        //   col_lo/col_hi  = floor(gwarp*n_cols/n_warps) [gwarp work partition]
-        //   These are NOT identical — up to 1 column can spill across partition boundaries.
-        //   Example: n_cols=1000, cluster_sz=16 → cols_per_block=63, block 1 col_lo=62
-        //   → block 1 processes column 62 but column 62 belongs to block 0's smem slice.
-        //   → block 1 issues a PEER WRITE to block 0's s_inhib_thr / s_active_duty.
-        //
-        // RACE WITHOUT SYNC:
-        //   Blocks run Stage A concurrently. Block 1 writes block 0's smem at column 62.
-        //   Block 0 may simultaneously READ s_inhib_thr[62] for its own column 62 in
-        //   Stage A of the same timestep → concurrent peer write + local read → undefined.
-        //   Additionally, without cluster.sync() after all peer writes complete, block 0's
-        //   t+1 Stage A reads might observe t-1 values still cached in its smem.
-        //
-        // FIX: cluster.sync() here, AFTER Stage A's per-column loop, ensures:
-        //   1. All peer smem writes from this timestep are globally visible to all blocks.
-        //   2. No block can enter Stage B (or start t+1 Stage A) with stale smem values.
-        //   3. GMEM writes (lines 329, 339) are already committed to L2; __threadfence()
-        //      below ensures they are visible to all SMs before the cluster barrier.
-        //
-        // ORDERING: write → cluster.sync() here → __threadfence() → cluster.sync() in
-        //           fused_grid_barrier → next-timestep reads.  Both visibility guarantees
-        //           are now satisfied.
         cluster.sync();
         // ---- BARRIER 2: SP active_mask must be visible before TM reads ----
         // Fence: flush cols_out + active_duty + inhibition_threshold + step_scratch
@@ -660,7 +660,7 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
 }
 // Single-region kernel (legacy call site).
-__global__
 void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
     htm_fused_step_body(P, cfg);
 }
@@ -668,7 +668,7 @@ void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
 // Batched kernel: one cooperative launch for B regions. grid.y = B,
 // grid.x = per-region block count. Each block reads its region's
 // FusedPtrs from the device array via blockIdx.y.
-__global__
 void htm_fused_step_batched(const FusedPtrs* __restrict__ P_arr, FusedConfig cfg) {
     const FusedPtrs P = P_arr[blockIdx.y];
     htm_fused_step_body(P, cfg);

 //
 // The flags / expected / phase / cooperative_grid_sync parameters are kept
 // in the signature for call-site compatibility but are unused.
+__device__ static inline void fused_grid_barrier(cg::grid_group grid,
                                                  unsigned int * /* flags — unused */,
                                                  unsigned int /* expected — unused */,
                                                  unsigned int /* phase — unused */,
                                                  unsigned int /* cooperative_grid_sync — unused */) {
+#if __CUDA_ARCH__ >= 900
+    // Hopper+ : hardware cluster barrier (~10-40 ns)
     auto cluster = cg::this_cluster();
     cluster.sync();
+#else
+    // Pre-Hopper (sm_80, sm_86, sm_89): grid-level cooperative sync.
+    // Requires cooperative kernel launch. ~us-ms range, adequate for HTM
+    // workload (kernel launch frequency is low).
+    grid.sync();
+#endif
 }
 __device__ static inline unsigned int warp_sum_u32(unsigned int v) {
     // DSMEM: Cluster-distributed shared memory for hot per-column
     // state (inhibition_threshold, boost, active_duty).
     //
+    // On Hopper (sm_90+): Each block in the cluster owns a contiguous
+    // slice of columns in its own __shared__ arrays. Any block can
+    // peer-read another block's slice via cluster.map_shared_rank().
     //
+    // On Ampere (sm_86) and other pre-Hopper: No cluster support.
+    // Read/write directly from/to global memory (inhibition_threshold,
+    // boost, active_duty device pointers). Slightly higher latency but
+    // functionally correct.
     // =========================================================
+#if __CUDA_ARCH__ >= 900
+    // Hopper+ cluster path
     auto cluster = cg::this_cluster();
     const unsigned int cluster_block_rank = cluster.block_rank();  // 0..cluster_size-1
     const unsigned int cluster_sz         = cluster.num_blocks();  // == gridDim.x (≤16)
+#else
+    // Pre-Hopper: no cluster, each block is independent.
+    const unsigned int cluster_block_rank = blockIdx.x;
+    const unsigned int cluster_sz         = gridDim.x;
+#endif
     // Partition n_cols evenly across cluster blocks.
     // Each block owns cols_per_block columns starting at my_col_start.
         (my_col_start + cols_per_block < n_cols)
             ? (my_col_start + cols_per_block) : n_cols;        // clamp
+#if __CUDA_ARCH__ >= 900
     // Cluster-distributed shared memory arrays.
     // Each block holds at most COLS_PER_CLUSTER_BLOCK_MAX floats per array.
     // Peer blocks address into each other's smem via map_shared_rank.
     __shared__ float s_inhib_thr [COLS_PER_CLUSTER_BLOCK_MAX];
     __shared__ float s_boost     [COLS_PER_CLUSTER_BLOCK_MAX];
     __shared__ float s_active_duty[COLS_PER_CLUSTER_BLOCK_MAX];
+#endif
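+    // Input staging tile (T9; originally planned as TMA multicast) — HOPPER ONLY.
     //
+    // On Hopper: each block copies the input into its own s_input_tile with a
+    // thread-block-scope cg::memcpy_async, then cluster.sync() orders the load
+    // before Stage A reads the tile. This is a per-block copy, not a true
+    // cluster-wide TMA multicast: the input is still fetched once per block.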
+    // On Ampere: 32 KB smem allocation exceeds per-block budget when
+    // cooperatively launched (48 KB total, registers eat the rest). Skip the
+    // tile entirely — Stage A reads from GMEM directly (original path).
+#if __CUDA_ARCH__ >= 900
     __shared__ __align__(16) unsigned char s_input_tile[INPUT_BITS_MAX];
+#endif
+#if __CUDA_ARCH__ >= 900
     // Initial GMEM → smem load (reads state from previous forward call).
     // Each block loads only its own slice; tid strides across the slice.
     for (unsigned int c = my_col_start + tid; c < my_col_end; c += blockDim.x) {
     // All blocks in the cluster must finish loading before any block
     // starts reading peer smem inside the T-loop.
     cluster.sync();
+#else
+    // Pre-Hopper: no smem caching needed — reads go directly to GMEM.
+    // Grid sync ensures all blocks have completed Phase 0 init before T-loop.
+    grid.sync();
+#endif
     const unsigned int S   = cfg.synapses_per_col;
     const unsigned int cpc = cfg.cells_per_column;
         // Ordering: BARRIER 1 completes before we issue the DMA.
         // The DMA completes before Stage A reads s_input_tile.
         // =========================================================
+#if __CUDA_ARCH__ >= 900
         const bool use_input_tile = (cfg.input_bits <= INPUT_BITS_MAX);
         if (use_input_tile) {
             auto tb = cg::this_thread_block();
             cg::memcpy_async(tb, s_input_tile,
                              inputs + inp_off,
                              cfg.input_bits);
             cg::wait(tb);
             cluster.sync();
         }
+#else
+        const bool use_input_tile = false;
+#endif
         // =========================================================
         // STAGE A: Spatial Pooler
                 float p = syn_perm[base + s];
                 // T9: read from cluster-broadcast tile when available;
                 // fall back to direct GMEM when input_bits > INPUT_BITS_MAX.
+#if __CUDA_ARCH__ >= 900
                 unsigned int inp_byte = use_input_tile
                     ? (unsigned int)s_input_tile[b]
                     : (unsigned int)inputs[inp_off + b];
+#else
+                unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
+#endif
                 unsigned int hit = ((inp_byte != 0u) && (p >= cfg.conn_thr)) ? 1u : 0u;
                 local += hit;
             }
             unsigned int overlap = warp_sum_u32(local);
             overlap = __shfl_sync(0xffffffffu, overlap, 0);
+            // Read boost + threshold for column c.
+#if __CUDA_ARCH__ >= 900
+            // Hopper: read from cluster-distributed shared memory.
             const unsigned int owner_block  = c / cols_per_block;
             const unsigned int owner_offset = c - owner_block * cols_per_block;
             float boost_val = cluster.map_shared_rank(s_boost,      owner_block)[owner_offset];
             float thr       = cluster.map_shared_rank(s_inhib_thr,  owner_block)[owner_offset];
+#else
+            // Pre-Hopper: read directly from global memory.
+            float boost_val = boost[c];
+            float thr       = inhibition_threshold[c];
+#endif
             float boosted = (float)overlap * boost_val;
             unsigned int is_active = (boosted > thr) ? 1u : 0u;
                 for (unsigned int s = lane; s < S; s += 32u) {
                     unsigned int b = syn_bit[base + s];
                     float p = syn_perm[base + s];
+#if __CUDA_ARCH__ >= 900
                     unsigned int inp_byte = use_input_tile
                         ? (unsigned int)s_input_tile[b]
                         : (unsigned int)inputs[inp_off + b];
+#else
+                    unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
+#endif
                     if (inp_byte != 0u) {
                         p += cfg.sp_inc;
                         if (p > 1.0f) p = 1.0f;
             }
             // active_duty EMA + threshold adaptation.
+            // Writes go to both DSMEM (hot path, Hopper only) and GMEM (persistence).
             if (lane == 0) {
+#if __CUDA_ARCH__ >= 900
                 float ad = cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset];
+#else
+                float ad = active_duty[c];
+#endif
                 float sample = is_active ? 1.0f : 0.0f;
                 ad = (1.0f - cfg.duty_alpha) * ad + cfg.duty_alpha * sample;
+#if __CUDA_ARCH__ >= 900
                 // Writeback: peer smem (for next timestep read) + GMEM (persistence).
                 cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad;
+#endif
                 active_duty[c] = ad;
                 // Threshold steers toward target sparsity.
                 if (new_thr < 0.1f) new_thr = 0.1f;
                 if (new_thr > 1000.0f) new_thr = 1000.0f;
+#if __CUDA_ARCH__ >= 900
                 // Writeback: peer smem (for next timestep read) + GMEM (persistence).
                 cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] = new_thr;
+#endif
                 inhibition_threshold[c] = new_thr;
             }
         }
         // ---- DSMEM WRITEBACK SYNC: peer-smem writes must be visible cluster-wide ----
         //
+        // On Hopper: cluster.sync() ensures all peer smem writes from this
+        // timestep are visible to all blocks before Stage B / next t.
+        // On pre-Hopper: no smem peer writes occur (all state in GMEM),
+        // so no extra sync needed here — the grid barrier below suffices.
+#if __CUDA_ARCH__ >= 900
         cluster.sync();
+#endif
         // ---- BARRIER 2: SP active_mask must be visible before TM reads ----
         // Fence: flush cols_out + active_duty + inhibition_threshold + step_scratch
 }
 // Single-region kernel (legacy call site).
+__global__ __launch_bounds__(256, 2)
 void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
     htm_fused_step_body(P, cfg);
 }
 // Batched kernel: one cooperative launch for B regions. grid.y = B,
 // grid.x = per-region block count. Each block reads its region's
 // FusedPtrs from the device array via blockIdx.y.
+__global__ __launch_bounds__(256, 2)
 void htm_fused_step_batched(const FusedPtrs* __restrict__ P_arr, FusedConfig cfg) {
     const FusedPtrs P = P_arr[blockIdx.y];
     htm_fused_step_body(P, cfg);

overlay/hydra/engram.py CHANGED Viewed

@@ -1,93 +1,80 @@
-"""GPU Engram — Sparse Modern Hopfield retrieval path.
-## What changed (scatter-gather → Hopfield matmul)
 The original forward used `self.memory[indices]` (scatter-gather), which misses
 L2 cache at n_columns > 4096 and creates a hard tps ceiling.
-The replacement uses:
-    scores   = x @ self.memory.T          # (B, T, n_columns) — coalesced matmul
-    weights  = entmax15(scores, dim=-1)   # sparse attention; 95%+ exact zeros
-    retrieved = weights @ self.memory     # (B, T, d_model)   — coalesced matmul
-Both matmuls are tile-friendly (cuBLAS GEMM), so L2 reuse is high regardless of
-n_columns. Gradient flows through both matmuls so `self.memory` learns via
-autograd in addition to (or instead of) the Hebbian EMA writes.
-## Sparsity mechanism
-alpha-entmax with alpha=1.5 (entmax15) is a sparse attention operator that maps
-logit vectors to distributions where many entries are *exactly* zero (not merely
-small). It generalises softmax (alpha=1) and argmax (alpha→∞). At n_columns=1024
-with d_model=64 a random batch typically hits ≥95% zero entries — the key
-property that keeps bandwidth proportional to *attended* columns, not all columns.
-Fallback: if `entmax` is not pip-installed, top-k softmax (k=32) is used instead.
-This is chosen at module-import time — NO runtime branching per forward call.
-## token_ids argument
-token_ids is accepted for API compatibility with the rest of the hydra stack
-(train.py, lightning_module.py call `engram(x, token_ids)`). It is NOT used in
-the retrieval path — the Hopfield path computes dense similarity over the whole
-memory bank, which subsumes any hash-based column selection. Documented here to
-prevent confusion.
-## Hebbian writes (hebbian_boost=False by default)
-With Hopfield retrieval, gradient signals reach self.memory through autograd, so
-Hebbian EMA writes are no longer critical. They are preserved as an *optional*
-boost (hebbian_boost=True) for experiments that want both signals. Default is off.
 ## Checkpoint compatibility
-`self.memory` shape (n_columns, d_model) is unchanged, so existing .pt / .ckpt
-files load without modification.
 """
 from __future__ import annotations
 import torch
 import torch.nn as nn
-# ---------------------------------------------------------------------------
-# Sparse-attention backend — chosen ONCE at import time, no runtime branching.
-# ---------------------------------------------------------------------------
-try:
-    from entmax import entmax15 as _entmax15  # type: ignore[import]
-    def _sparse_attention(scores: torch.Tensor) -> torch.Tensor:
-        """alpha-entmax (alpha=1.5): truly sparse distribution over last dim."""
-        return _entmax15(scores, dim=-1)
-    _BACKEND = "entmax15"
-except ImportError:  # pragma: no cover — entmax always installed in CI
-    _K = 32  # top-k for fallback
-    def _sparse_attention(scores: torch.Tensor) -> torch.Tensor:  # type: ignore[misc]
-        """Top-k softmax fallback: zero outside the k highest-scoring columns."""
-        topk_vals, topk_idx = scores.topk(_K, dim=-1)
-        topk_w = torch.softmax(topk_vals, dim=-1).to(scores.dtype)
-        weights = torch.zeros_like(scores)
-        weights.scatter_(-1, topk_idx, topk_w)
-        return weights
-    _BACKEND = "topk32"
 class GPUEngram(nn.Module):
-    """GPU Engram: Sparse Modern Hopfield retrieval.
     Args:
         d_model:       Model dimension — must match the surrounding transformer.
-        n_columns:     Number of memory columns (key-value pairs). Safe at 32 768
-                       with the matmul path; the old scatter-gather had an L2
-                       cliff above ~4 096.
-        max_ngram:     Retained for API compatibility; unused in retrieval path.
         hebbian_boost: If True, also run a Hebbian EMA write on the memory bank
-                       during training (old behaviour, now optional). Default False.
     """
     def __init__(
@@ -105,16 +92,18 @@ class GPUEngram(nn.Module):
         self.memory = nn.Parameter(torch.randn(n_columns, d_model) * 0.01)
         self.gate = nn.Linear(d_model, 1, bias=True)
         nn.init.constant_(self.gate.bias, 0.0)  # START OPEN
         # Retained for any external code that reads these attrs.
         self.primes = [2654435761, 2246822519, 3266489917]
         self.hebbian_lr = 0.01
     # ------------------------------------------------------------------
-    # _hash: retained for API/checkpoint compat; unused in forward below.
     # ------------------------------------------------------------------
     def _hash(self, token_ids: torch.Tensor) -> torch.Tensor:
-        """N-gram hash → column index (kept for backward-compat; not used in retrieval)."""
         B, T = token_ids.shape
         h = token_ids * self.primes[0]
         if T > 1:
@@ -132,39 +121,48 @@ class GPUEngram(nn.Module):
     # ------------------------------------------------------------------
     def forward(self, x: torch.Tensor, token_ids: torch.Tensor):
-        """Hopfield retrieve + soft gate + residual.
         Args:
             x:         (B, T, d_model) — input activations.
-            token_ids: (B, T) — token indices. Accepted for API compatibility;
-                       NOT used in the retrieval path (see module docstring).
         Returns:
             (x + alpha * retrieved, hit_rate)
             - x + alpha * retrieved: (B, T, d_model)
             - hit_rate: scalar tensor — fraction of gate values > 0.1
         """
         # ---- 1. Similarity scores (coalesced GEMM) ----------------------
         # scores[b, t, c] = dot(x[b,t], memory[c])
         scores = x @ self.memory.T  # (B, T, n_columns)
-        # ---- 2. Sparse attention weights --------------------------------
-        # _sparse_attention is fixed at import time (entmax15 or top-k).
-        weights = _sparse_attention(scores)  # (B, T, n_columns), many exact zeros
-        # ---- 3. Retrieved vector (coalesced GEMM) -----------------------
-        retrieved = weights @ self.memory  # (B, T, d_model)
-        # ---- 4. Soft gate (unchanged) -----------------------------------
         alpha = torch.sigmoid(self.gate(x))  # (B, T, 1)
-        # ---- 5. Optional Hebbian EMA write ------------------------------
         if self.training and self.hebbian_boost:
             with torch.no_grad():
-                # Reuse the hash-based indices for the write target (sparse update).
                 indices = self._hash(token_ids)
-                flat_idx = indices.reshape(-1)           # (B*T,)
-                flat_x = x.detach().reshape(-1, x.shape[-1])  # (B*T, d_model)
                 mem_dtype = self.memory.data.dtype
                 updates = (
                     self.hebbian_lr * flat_x
@@ -172,6 +170,6 @@ class GPUEngram(nn.Module):
                 ).to(mem_dtype)
                 self.memory.data.index_add_(0, flat_idx, updates)
-        # ---- 6. Residual + hit_rate -------------------------------------
         hit_rate = (alpha.detach() > 0.1).float().mean()
         return x + alpha * retrieved, hit_rate

+"""GPU Engram — Top-k Sparse Hopfield retrieval, scales to n_columns >= 32768.
+## What changed (scatter-gather → top-k Hopfield)
 The original forward used `self.memory[indices]` (scatter-gather), which misses
 L2 cache at n_columns > 4096 and creates a hard tps ceiling.
+An earlier Hopfield implementation used `entmax15` for sparse attention, but
+entmax's internal `torch.sort` over the full n_columns dimension allocates
+~1 GB scratch at (B*T=8192, n_columns=32768) and OOMs on a 6 GB card.
+This module replaces the sort-based entmax with **top-k softmax**, which needs
+only O(B*T*K) extra memory beyond `scores` and O(B*T*n_columns) compute for the
+selection (the top-k is radix-selection under the hood — not a full sort).
+Sparsity is still exact: only K columns have non-zero weight per (batch, position).
+## Why this scales where entmax didn't
+- `scores = x @ memory.T` is (B, T, n_columns) — ~268M elements, i.e. ~512 MiB
+  at bf16 with B*T=8192 and n_columns=32768. This is the largest tensor.
+- `scores.topk(K)` allocates only (B, T, K) — ~2 MB at K=64. No full sort.
+- `memory[topk_idx]` gathers (B, T, K, d_model) — K times the size of the
+  retrieved tensor (≈256 MB at K=64 given the ~4 MB figure below). Gather is
+  along axis 0 of memory (the column axis); each gathered row is a contiguous
+  stride-1 run of d_model values, so it is cache-friendly.
+- `retrieved = einsum(topk_w, selected_mem)` — ~4 MB. Final reduction.
+Peak working set is dominated by `scores` plus the gathered rows (roughly
+0.8 GB with the figures above). The weights tensor is never densified (which
+would have been a second (B, T, n_columns) tensor).
+## Gradient flow
+Both the topk gather and the einsum are autograd-tracked, so `self.memory`
+receives gradient from the LM loss (which the Hebbian scatter-gather path did
+not). `topk` indices are integers and carry no gradient — gradient reaches
+`self.memory` through `topk_vals` (via `scores`) and through the gathered rows.
+## Sparsity
+Exactly K columns have non-zero weight per position. Default K=64, tunable via
+HYDRA_ENGRAM_TOPK.
+## token_ids argument
+Accepted for API compatibility with hydra/model.py; unused in retrieval. The
+optional Hebbian boost (hebbian_boost=True) uses the hash-indexed path for
+its EMA write only.
 ## Checkpoint compatibility
+`self.memory` shape (n_columns, d_model) is unchanged; existing .pt/.ckpt
+files load without migration.
 """
 from __future__ import annotations
+import os
 import torch
 import torch.nn as nn
+# Top-k width — how many memory columns get non-zero weight per position.
+# Default 64 matches the entmax sparsity fraction we observed empirically
+# (~0.2% of 32768 columns == 64). HYDRA_ENGRAM_TOPK env var overrides.
+_ENGRAM_TOPK = int(os.environ.get("HYDRA_ENGRAM_TOPK", "64"))
 class GPUEngram(nn.Module):
+    """GPU Engram: Top-k Sparse Hopfield retrieval.
     Args:
         d_model:       Model dimension — must match the surrounding transformer.
+        n_columns:     Number of memory columns (key-value pairs). Safe up to
+                       n_columns = 65536 at d_model = 384 on a 6 GB card with
+                       B*T <= 8192.
+        max_ngram:     Retained for API compatibility; unused in retrieval.
         hebbian_boost: If True, also run a Hebbian EMA write on the memory bank
+                       during training. Default False — the top-k gradient path
+                       provides learning signal without this.
     """
     def __init__(
         self.memory = nn.Parameter(torch.randn(n_columns, d_model) * 0.01)
         self.gate = nn.Linear(d_model, 1, bias=True)
         nn.init.constant_(self.gate.bias, 0.0)  # START OPEN
+        # Clamp topk K to n_columns so topk doesn't error at small engram.
+        self.topk_k = min(_ENGRAM_TOPK, n_columns)
         # Retained for any external code that reads these attrs.
         self.primes = [2654435761, 2246822519, 3266489917]
         self.hebbian_lr = 0.01
     # ------------------------------------------------------------------
+    # _hash: retained for API/checkpoint compat; unused in retrieval path.
     # ------------------------------------------------------------------
     def _hash(self, token_ids: torch.Tensor) -> torch.Tensor:
+        """N-gram hash → column index (Hebbian-write target only, not retrieval)."""
         B, T = token_ids.shape
         h = token_ids * self.primes[0]
         if T > 1:
     # ------------------------------------------------------------------
     def forward(self, x: torch.Tensor, token_ids: torch.Tensor):
+        """Top-k Hopfield retrieve + soft gate + residual.
         Args:
             x:         (B, T, d_model) — input activations.
+            token_ids: (B, T) — accepted for API compat; only used in the
+                       optional Hebbian boost path.
         Returns:
             (x + alpha * retrieved, hit_rate)
             - x + alpha * retrieved: (B, T, d_model)
             - hit_rate: scalar tensor — fraction of gate values > 0.1
         """
+        B, T, D = x.shape
         # ---- 1. Similarity scores (coalesced GEMM) ----------------------
         # scores[b, t, c] = dot(x[b,t], memory[c])
         scores = x @ self.memory.T  # (B, T, n_columns)
+        # ---- 2. Top-k sparse attention ----------------------------------
+        # topk uses radix select, not a sort — O(n_columns) memory, not O(n_columns log n_columns).
+        # Never materializes a dense (B, T, n_columns) weights tensor.
+        topk_vals, topk_idx = scores.topk(self.topk_k, dim=-1)  # (B, T, K), (B, T, K)
+        topk_w = torch.softmax(topk_vals, dim=-1)                # (B, T, K)
+        # ---- 3. Gather selected memory rows -----------------------------
+        # memory[topk_idx] is a gather along axis 0 of memory (n_columns, d_model).
+        # Output shape (B, T, K, d_model) — K is small, so gather bandwidth is
+        # O(B*T*K*d_model), independent of n_columns.
+        selected_mem = self.memory[topk_idx]  # (B, T, K, d_model)
+        # ---- 4. Weighted sum → retrieved vector -------------------------
+        retrieved = torch.einsum('btk,btkd->btd', topk_w, selected_mem)  # (B, T, d_model)
+        # ---- 5. Soft gate -----------------------------------------------
         alpha = torch.sigmoid(self.gate(x))  # (B, T, 1)
+        # ---- 6. Optional Hebbian EMA write ------------------------------
         if self.training and self.hebbian_boost:
             with torch.no_grad():
                 indices = self._hash(token_ids)
+                flat_idx = indices.reshape(-1)                # (B*T,)
+                flat_x = x.detach().reshape(-1, D)            # (B*T, d_model)
                 mem_dtype = self.memory.data.dtype
                 updates = (
                     self.hebbian_lr * flat_x
                 ).to(mem_dtype)
                 self.memory.data.index_add_(0, flat_idx, updates)
+        # ---- 7. Residual + hit_rate -------------------------------------
         hit_rate = (alpha.detach() > 0.1).float().mean()
         return x + alpha * retrieved, hit_rate

overlay/hydra/eval.py CHANGED Viewed

@@ -138,6 +138,9 @@ def _run_factual_english_gen(model, tokenizer, max_seq_len: int):
     num_samples = FACTUAL_SAMPLES
     batch = FACTUAL_BATCH
     gen_tokens = FACTUAL_GEN_TOKENS
     temps = [0.7, 0.9, 1.1]
     hits = 0
@@ -154,14 +157,18 @@ def _run_factual_english_gen(model, tokenizer, max_seq_len: int):
                 temp = temps[batch_idx % len(temps)]
                 batch_idx += 1
                 ctx = torch.tensor([ids] * b, device="cuda", dtype=torch.long)
                 for _ in range(gen_tokens):
-                    logits = model(ctx, targets=None)
                     next_logits = logits[:, -1, :] if logits.dim() == 3 else logits
                     probs = torch.softmax(next_logits.float() / temp, dim=-1)
                     next_id = torch.multinomial(probs, num_samples=1)
                     ctx = torch.cat([ctx, next_id], dim=1)
                     if ctx.size(1) >= max_seq_len:
                         break
                 # Transfer to CPU in one shot, no per-row sync
                 all_rows.extend(ctx.cpu().tolist())
                 samples_done += b

     num_samples = FACTUAL_SAMPLES
     batch = FACTUAL_BATCH
     gen_tokens = FACTUAL_GEN_TOKENS
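+    # Fast incremental decode path for recurrence-capable backbones: after the
+    # initial full-context forward, only the newest token is fed to the model.
+    # Enabled by default; set HYDRA_FACTUAL_GEN_INCREMENTAL to anything other
+    # than "1" to preserve the original full-context re-forward behavior.
+    # NOTE(review): assumes the model carries state across calls — confirm.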
+    incremental_decode = os.environ.get("HYDRA_FACTUAL_GEN_INCREMENTAL", "1") == "1"
     temps = [0.7, 0.9, 1.1]
     hits = 0
                 temp = temps[batch_idx % len(temps)]
                 batch_idx += 1
                 ctx = torch.tensor([ids] * b, device="cuda", dtype=torch.long)
+                logits = model(ctx, targets=None)
                 for _ in range(gen_tokens):
                     next_logits = logits[:, -1, :] if logits.dim() == 3 else logits
                     probs = torch.softmax(next_logits.float() / temp, dim=-1)
                     next_id = torch.multinomial(probs, num_samples=1)
                     ctx = torch.cat([ctx, next_id], dim=1)
                     if ctx.size(1) >= max_seq_len:
                         break
+                    if incremental_decode:
+                        logits = model(ctx[:, -1:], targets=None)
+                    else:
+                        logits = model(ctx, targets=None)
                 # Transfer to CPU in one shot, no per-row sync
                 all_rows.extend(ctx.cpu().tolist())
                 samples_done += b

overlay/hydra/model.py CHANGED Viewed

@@ -145,7 +145,7 @@ class PostSemClawModel(nn.Module):
                 expand=config.expand,
                 headdim=config.headdim,
                 is_mimo=False,          # SISO path uses stable mamba3_siso_combined kernel
-                chunk_size=64,          # upstream-recommended SISO chunk; 16 violated tl.dot M>=16 constraint
                 is_outproj_norm=False,
                 dtype=torch.bfloat16,
             )
@@ -173,8 +173,13 @@ class PostSemClawModel(nn.Module):
             reset_each_forward=True,
         )
-        # Gradient bridge: (n_columns + anomaly) -> d_model.
-        self.htm_proj = nn.Linear(config.htm_n_columns + 1, config.d_model, bias=False)
         # GPU Engram with Hebbian writes — runs EVERY step.
         self.engram = GPUEngram(
@@ -349,11 +354,13 @@ class PostSemClawModel(nn.Module):
                 nn.init.normal_(block.out_proj.weight, mean=0.0, std=out_std)
         nn.init.normal_(self.htm_proj.weight, mean=0.0, std=s)
         # Cast to bf16 to match Mamba3 dtype; Muon groups by shape so mixed
         # dtypes in the same shape group would break lerp_ dtype checks.
         self.wte.to(dtype=torch.bfloat16)
         self.htm_proj.to(dtype=torch.bfloat16)
         self.engram.to(dtype=torch.bfloat16)
     def set_bos_token_id(self, bos_id: int) -> None:
@@ -402,11 +409,13 @@ class PostSemClawModel(nn.Module):
         blocks = sum(p.numel() for p in self.blocks.parameters())
         sdr = sum(p.numel() for p in self.sdr_semantic.parameters())
         htm_proj = sum(p.numel() for p in self.htm_proj.parameters())
         engram = sum(p.numel() for p in self.engram.parameters())
         total = sum(p.numel() for p in self.parameters())
         return {
             'wte': wte, 'lm_head': lm_head, 'blocks': blocks,
             'sdr_semantic': sdr, 'htm_proj': htm_proj,
             'engram': engram, 'total': total,
         }
@@ -516,9 +525,13 @@ class PostSemClawModel(nn.Module):
         for shape in sorted({p.shape for p in matrix_params}):
             group_params = [p for p in matrix_params if p.shape == shape]
             param_groups.append(dict(
                 kind='muon', params=group_params, lr=matrix_lr,
-                momentum=0.95, ns_steps=5, beta2=0.95, weight_decay=weight_decay,
             ))
         optimizer = MuonAdamW(param_groups)
@@ -610,8 +623,10 @@ class PostSemClawModel(nn.Module):
         if self._htm_stop_grad:
             htm_out = htm_out.detach()
-        # Gradient bridge: HTM columns+anomaly -> d_model.
-        htm_proj_out = self.htm_proj(htm_out.to(dense_emb.dtype))
         x = dense_emb + htm_proj_out
         x = norm(x)

                 expand=config.expand,
                 headdim=config.headdim,
                 is_mimo=False,          # SISO path uses stable mamba3_siso_combined kernel
+                chunk_size=int(os.environ.get("HYDRA_MAMBA3_CHUNK", "64")),  # 64 is the validated default; 128 tripped a Triton autotune hang (>8min, no progress)
                 is_outproj_norm=False,
                 dtype=torch.bfloat16,
             )
             reset_each_forward=True,
         )
+        # Gradient bridge split:
+        #   (a) sparse HTM columns -> d_model
+        #   (b) scalar anomaly     -> d_model
+        # This avoids forcing the anomaly scalar through the same projection
+        # statistics as the high-dimensional sparse HTM column vector.
+        self.htm_proj = nn.Linear(config.htm_n_columns, config.d_model, bias=False)
+        self.htm_anom_proj = nn.Linear(1, config.d_model, bias=False)
         # GPU Engram with Hebbian writes — runs EVERY step.
         self.engram = GPUEngram(
                 nn.init.normal_(block.out_proj.weight, mean=0.0, std=out_std)
         nn.init.normal_(self.htm_proj.weight, mean=0.0, std=s)
+        nn.init.normal_(self.htm_anom_proj.weight, mean=0.0, std=s)
         # Cast to bf16 to match Mamba3 dtype; Muon groups by shape so mixed
         # dtypes in the same shape group would break lerp_ dtype checks.
         self.wte.to(dtype=torch.bfloat16)
         self.htm_proj.to(dtype=torch.bfloat16)
+        self.htm_anom_proj.to(dtype=torch.bfloat16)
         self.engram.to(dtype=torch.bfloat16)
     def set_bos_token_id(self, bos_id: int) -> None:
         blocks = sum(p.numel() for p in self.blocks.parameters())
         sdr = sum(p.numel() for p in self.sdr_semantic.parameters())
         htm_proj = sum(p.numel() for p in self.htm_proj.parameters())
+        htm_anom_proj = sum(p.numel() for p in self.htm_anom_proj.parameters())
         engram = sum(p.numel() for p in self.engram.parameters())
         total = sum(p.numel() for p in self.parameters())
         return {
             'wte': wte, 'lm_head': lm_head, 'blocks': blocks,
             'sdr_semantic': sdr, 'htm_proj': htm_proj,
+            'htm_anom_proj': htm_anom_proj,
             'engram': engram, 'total': total,
         }
         for shape in sorted({p.shape for p in matrix_params}):
             group_params = [p for p in matrix_params if p.shape == shape]
+            # ns_steps: Muon polar-express inner iterations. The paper uses 5,
+            # but this code defaults to 3, which converges on small matrices
+            # (d_model ~ 384) with ~40% lower optimizer step cost.
+            # Env-tunable via HYDRA_MUON_NS_STEPS for experimentation.
+            _ns_steps = int(os.environ.get("HYDRA_MUON_NS_STEPS", "3"))
             param_groups.append(dict(
                 kind='muon', params=group_params, lr=matrix_lr,
+                momentum=0.95, ns_steps=_ns_steps, beta2=0.95, weight_decay=weight_decay,
             ))
         optimizer = MuonAdamW(param_groups)
         if self._htm_stop_grad:
             htm_out = htm_out.detach()
+        # Gradient bridge split: columns and anomaly use separate projections.
+        htm_cols = htm_out[..., :-1].to(dense_emb.dtype)
+        htm_anom = htm_out[..., -1:].to(dense_emb.dtype)
+        htm_proj_out = self.htm_proj(htm_cols) + self.htm_anom_proj(htm_anom)
         x = dense_emb + htm_proj_out
         x = norm(x)

overlay/hydra/training.py CHANGED Viewed

@@ -779,15 +779,49 @@ def main() -> None:
     )
     # Now it's safe to eval — ckpts are on disk regardless of what happens here.
     val_bpb: float | None = None
     try:
-        torch.cuda.empty_cache()  # defrag before eval allocates logit chunks
-        print(f"[VAL] running eval on {4 * 524288} tokens...", flush=True)
         model.eval()
         _orig = _prepare_mod.EVAL_TOKENS
-        _prepare_mod.EVAL_TOKENS = 4 * 524288
         with autocast_ctx:
-            val_bpb = evaluate_bpb(model, tokenizer, DEVICE_BATCH_SIZE)
         _prepare_mod.EVAL_TOKENS = _orig
         val_ppl = 2 ** val_bpb
         print(f"[VAL] step={step} val_bpb={val_bpb:.4f} val_ppl={val_ppl:.3f}", flush=True)
@@ -795,7 +829,14 @@ def main() -> None:
         print(f"[VAL] SKIPPED (OOM): {e}", flush=True)
         torch.cuda.empty_cache()
     except Exception as e:
         print(f"[VAL] SKIPPED ({type(e).__name__}): {e}", flush=True)
     # Final ckpts with val_bpb filled in (if eval succeeded).
     save_ckpt(
@@ -843,7 +884,7 @@ def main() -> None:
     metrics = model.get_secondary_metrics()
     print("---")
-    print(f"val_bpb:          {val_bpb:.6f}")
     print(f"training_seconds: {total_training_time:.1f}")
     print(f"total_seconds:    {t_end - t_start:.1f}")
     print(f"peak_vram_mb:     {peak_vram_mb:.1f}")

     )
     # Now it's safe to eval — ckpts are on disk regardless of what happens here.
+    # HYDRA_EVAL_BATCH overrides DEVICE_BATCH_SIZE (env-tunable; default halves
+    # the training batch, floored at 1, because eval holds activations for the
+    # full sequence and does not benefit from overlap with backward).
+    # HYDRA_EVAL_TOKENS controls how many val tokens to sweep (default
+    # 2 * 524288 = 1,048,576, i.e. ~1 M; short enough for autoresearch
+    # 5-min budgets).
     val_bpb: float | None = None
+    _eval_B = int(os.environ.get("HYDRA_EVAL_BATCH", str(max(1, DEVICE_BATCH_SIZE // 2))))
+    _eval_tokens = int(os.environ.get("HYDRA_EVAL_TOKENS", str(2 * 524288)))
     try:
+        # Aggressive VRAM reclaim for 6GB cards. Peak training VRAM = 5.1GB
+        # which leaves < 1GB for the eval forward — the driver can't satisfy
+        # the allocation. Free EVERY tensor we don't strictly need:
+        #   - optimizer grads (set_to_none releases tensor)
+        #   - optimizer.state (fp32 Muon NS workspace, AdamW moments — ~size-of-params each)
+        #   - model internal caches (HTM subsample cache, SDR stash)
+        # After this, VRAM should be ~params only (bf16 ≈ 120MB at 60M params).
+        optimizer.zero_grad(set_to_none=True)
+        if hasattr(optimizer, 'state') and optimizer.state:
+            for p, st in list(optimizer.state.items()):
+                st.clear()
+            optimizer.state.clear()
+        for p in model.parameters():
+            if p.grad is not None:
+                p.grad = None
+        if hasattr(model, '_htm_cache'):
+            model._htm_cache = None
+        if hasattr(model, '_last_sdr'):
+            model._last_sdr = None
+        import gc as _gc
+        _gc.collect()
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+        try:
+            _free_mb = torch.cuda.mem_get_info()[0] / 1024 / 1024
+            print(f"[VAL] free_vram_mb={_free_mb:.0f} (cleared optimizer state)", flush=True)
+        except Exception:
+            pass
+        print(f"[VAL] running eval on {_eval_tokens} tokens at B={_eval_B}...", flush=True)
         model.eval()
         _orig = _prepare_mod.EVAL_TOKENS
+        _prepare_mod.EVAL_TOKENS = _eval_tokens
         with autocast_ctx:
+            val_bpb = evaluate_bpb(model, tokenizer, _eval_B)
         _prepare_mod.EVAL_TOKENS = _orig
         val_ppl = 2 ** val_bpb
         print(f"[VAL] step={step} val_bpb={val_bpb:.4f} val_ppl={val_ppl:.3f}", flush=True)
         print(f"[VAL] SKIPPED (OOM): {e}", flush=True)
         torch.cuda.empty_cache()
     except Exception as e:
+        import traceback as _tb
         print(f"[VAL] SKIPPED ({type(e).__name__}): {e}", flush=True)
+        _tb.print_exc()
+        try:
+            _free = torch.cuda.mem_get_info()[0] / 1024 / 1024
+            print(f"[VAL] post-crash free_vram_mb={_free:.0f}", flush=True)
+        except Exception:
+            pass
     # Final ckpts with val_bpb filled in (if eval succeeded).
     save_ckpt(
     metrics = model.get_secondary_metrics()
     print("---")
+    print(f"val_bpb:          {val_bpb:.6f}" if val_bpb is not None else "val_bpb:          SKIPPED")
     print(f"training_seconds: {total_training_time:.1f}")
     print(f"total_seconds:    {t_end - t_start:.1f}")
     print(f"peak_vram_mb:     {peak_vram_mb:.1f}")

overlay/prepare_nemotron.py CHANGED Viewed

@@ -20,15 +20,15 @@ Full blend mode (env HYDRA_USE_FULL_BLEND=1):
 """
 from __future__ import annotations
-import os
-import random
-from itertools import cycle
-from typing import Iterator
-import numpy as np
 import torch
-import prepare as _p  # reuse tokenizer, BOS, byte-length helpers
 NEMOTRON_REPO = "nvidia/Nemotron-Pretraining-Specialized-v1.1"
@@ -37,14 +37,13 @@ NEMOTRON_REPO = "nvidia/Nemotron-Pretraining-Specialized-v1.1"
 # Keys are logical dataset names used by _open_blend_stream / _open_stream.
 # ---------------------------------------------------------------------------
 FULL_BLEND_WEIGHTS: dict[str, float] = {
-    "fineweb-edu":            0.55,  # HuggingFaceFW/fineweb-edu — PRIMARY (high-quality English)
-    "wikipedia":              0.25,  # wikimedia/wikipedia — factual grounding
-    "cosmopedia":             0.15,  # HuggingFaceTB/cosmopedia — synthetic textbook
-    "fineweb":                0.05,  # HuggingFaceFW/fineweb — general web
-    # REMOVED code/math: was polluting English generation with Python syntax
-    # "stack-v2":             0.00,
-    # "nemotron-math":        0.00,
-    # "nemotron-specialized": 0.00,
 }
 # Mapping from logical blend name → (HF repo, optional config/name, text column).
@@ -66,13 +65,94 @@ PHASE1_WEIGHTS = {
     "Nemotron-Pretraining-Formal-Logic":                0.20,
     "Nemotron-Pretraining-Multiple-Choice":             0.20,
 }
-PHASE2_WEIGHTS = {
     "Nemotron-Pretraining-Multiple-Choice":             0.45,
     "Nemotron-Pretraining-Economics":                   0.20,
     "Nemotron-Pretraining-Formal-Logic":                0.15,
     "Nemotron-Pretraining-Code-Concepts":               0.10,
     "Nemotron-Pretraining-Unconditional-Algorithmic":   0.10,
-}
 def _phase_weights() -> dict[str, float]:
@@ -83,129 +163,61 @@ def _phase_weights() -> dict[str, float]:
     return PHASE2_WEIGHTS if phase == "phase2" else PHASE1_WEIGHTS
-_PREFETCH_THREAD = None
-_PREFETCH_STARTED = set()
-def _find_local_parquets(repo: str, sub_config: str | None) -> list[str]:
-    """Return LOCAL parquet paths in HF hub cache for a given repo+config.
-    If sub_config filter yields zero matches but parquet files exist in the
-    repo dir, returns all parquet files (some datasets like fineweb use a
-    builder config name that doesn't match the filesystem path).
-    """
-    import glob
-    repo_dir = "datasets--" + repo.replace("/", "--")
-    base = os.path.expanduser(f"~/.cache/huggingface/hub/{repo_dir}/snapshots")
-    if not os.path.isdir(base):
-        return []
-    all_paths = []
-    for snap in os.listdir(base):
-        all_paths.extend(glob.glob(os.path.join(base, snap, "**", "*.parquet"), recursive=True))
-    if sub_config is None:
-        return sorted(all_paths)
-    filtered = [p for p in all_paths if f"/{sub_config}/" in p]
-    # Fallback: if the config name doesn't match filesystem paths, use all parquet
-    if not filtered and all_paths:
-        return sorted(all_paths)
-    return sorted(filtered)
-def _start_background_prefetch(repo: str, sub_config: str | None):
-    """Start a daemon thread that downloads parquet shards ahead of consumption.
-    Feeds HF's local cache so streaming=True serves from disk, never network.
-    Idempotent per (repo, sub_config). Runs at throttled speed to not flood.
-    """
-    import threading
-    key = (repo, sub_config)
-    if key in _PREFETCH_STARTED:
-        return
-    _PREFETCH_STARTED.add(key)
-    def worker():
-        try:
-            from huggingface_hub import HfApi, hf_hub_download
-            os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
-            token = os.environ.get("HF_TOKEN")
-            api = HfApi(token=token)
-            files = api.list_repo_files(repo, repo_type="dataset")
-            parquet = sorted(f for f in files if f.endswith(".parquet"))
-            if sub_config is not None:
-                filtered = [f for f in parquet if f"/{sub_config}/" in f or f.startswith(f"{sub_config}/")]
-                if filtered:
-                    parquet = filtered
-            # Fetch shards one by one, skipping already-cached (hf_hub_download is idempotent)
-            for f in parquet:
-                try:
-                    hf_hub_download(repo_id=repo, filename=f, repo_type="dataset", token=token)
-                except Exception:
-                    pass  # skip unavailable shards
-        except Exception:
-            pass  # prefetch is best-effort, don't disrupt training
-    t = threading.Thread(target=worker, daemon=True, name=f"prefetch-{repo}")
-    t.start()
-def _open_stream(config: str, split: str):
     """Open a streaming iterator over one dataset config.
-    Uses HF streaming (reads local cache when shards present, network otherwise).
-    Starts a background prefetcher that downloads remaining shards in parallel.
-    """
-    from datasets import load_dataset
-    token = os.environ.get("HF_TOKEN")
-    shuffle_buf = int(os.environ.get("HYDRA_STREAM_SHUFFLE_BUFFER", "2048"))
-    if config in _BLEND_REGISTRY:
-        repo, name, _text_col = _BLEND_REGISTRY[config]
-        effective_cfg = name
         if config == "nemotron-specialized":
-            effective_cfg = "Nemotron-Pretraining-Code-Concepts"
             repo = NEMOTRON_REPO
     else:
-        repo = NEMOTRON_REPO
-        effective_cfg = config
-    # Kick off background prefetch of remaining shards for this dataset
-    if os.environ.get("HYDRA_BACKGROUND_PREFETCH", "1") == "1":
-        _start_background_prefetch(repo, effective_cfg)
-    local_only = os.environ.get("HYDRA_LOCAL_SHARDS_ONLY", "1") == "1"
-    if local_only:
-        local_paths = _find_local_parquets(repo, effective_cfg)
-        if not local_paths:
-            raise RuntimeError(
-                f"No local parquet files for {repo} (config={effective_cfg}). "
-                f"Run scripts/predownload_shards.py first, or set HYDRA_LOCAL_SHARDS_ONLY=0."
-            )
         ds = load_dataset(
-            "parquet",
-            data_files=local_paths,
             split="train",
             streaming=True,
         )
-    else:
-        kwargs: dict = dict(split="train", streaming=True, token=token)
-        if effective_cfg is not None:
-            kwargs["name"] = effective_cfg
-        ds = load_dataset(repo, **kwargs)
     ds = ds.shuffle(seed=42, buffer_size=shuffle_buf)
     return iter(ds)
-def _extract_text(row: dict) -> str:
     """Pick the right text column — datasets have different column names.
     Priority order: text, content, prompt_completion, question, body.
     For math datasets that split into problem+solution, concatenate both.
     Fallback: concatenate all string-valued fields.
     """
-    # Fast path: most datasets use "text" or "content".
-    for k in ("text", "content", "prompt_completion", "question", "body"):
-        if k in row and row[k]:
-            return row[k]
     # Math datasets may have problem + solution as separate fields.
     if "problem" in row and "solution" in row:
         p = row["problem"] or ""
@@ -221,15 +233,20 @@ def _extract_text(row: dict) -> str:
     return "\n".join(parts)
-class _WeightedStream:
     """Infinite weighted-round-robin over configs' streaming iterators."""
-    def __init__(self, weights: dict[str, float], seed: int = 0):
-        self.configs = list(weights.keys())
-        self.weights = [weights[c] for c in self.configs]
-        self.streams = {c: _open_stream(c, "train") for c in self.configs}
-        self.rng = random.Random(seed)
-        self.epoch = 1
     def _reopen(self, config: str):
         # stream exhausted — reopen (HF streaming typically infinite but restart on edge)
@@ -245,22 +262,20 @@ class _WeightedStream:
         # exist in the Nemotron configs. Controlled by HYDRA_FACTUAL_INJECT_RATE
         # (default 50 = inject one factual doc every 50 Nemotron docs = ~2%).
         inject_rate = int(os.environ.get("HYDRA_FACTUAL_INJECT_RATE", "50"))
-        if inject_rate > 0 and not hasattr(self, '_factual_docs'):
-            factual_path = os.path.join(
-                os.path.dirname(os.path.abspath(__file__)), "data", "factual", "facts.txt")
-            if os.path.exists(factual_path):
-                self._factual_docs = open(factual_path).read().strip().split('\n')
-                self._factual_idx = 0
-                self._inject_counter = 0
-            else:
-                self._factual_docs = None
-        if inject_rate > 0 and hasattr(self, '_factual_docs') and self._factual_docs:
-            self._inject_counter = getattr(self, '_inject_counter', 0) + 1
-            if self._inject_counter >= inject_rate:
-                self._inject_counter = 0
-                doc = self._factual_docs[self._factual_idx % len(self._factual_docs)]
-                self._factual_idx += 1
-                return doc, self.epoch
         config = self.rng.choices(self.configs, weights=self.weights, k=1)[0]
         try:
@@ -293,9 +308,9 @@ def _document_batches(split: str, tokenizer_batch_size: int = 128) -> Iterator[t
         stream = _WeightedStream(_phase_weights(), seed=0)
     prefetch_depth = int(os.environ.get("HYDRA_STREAM_PREFETCH", "32"))
-    q: queue.Queue = queue.Queue(maxsize=prefetch_depth)
-    sentinel_stop = object()
-    error_box: list = []
     def producer():
         try:
@@ -320,7 +335,7 @@ def _document_batches(split: str, tokenizer_batch_size: int = 128) -> Iterator[t
             if error_box:
                 raise error_box[0]
             return
-        yield item
 def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 1000):
@@ -331,47 +346,24 @@ def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 10
       stage 2: BPE tokenization → token-id lists (this function's producer thread)
       stage 3: best-fit packing → (B, T+1) tensor rows (main thread, consumes)
-    Local cache (HYDRA_TOKEN_CACHE_GB, default 2):
-      Packed (T+1) rows are written to a binary shard on first pass. Subsequent
-      launches with a non-empty cache mmap that file and cycle through it,
-      skipping the 5-min streaming cold-start entirely. Cache key includes
-      (T, vocab_size) so shape changes invalidate the cache automatically.
     """
     import queue
     import threading
     assert split in ("train", "val")
     row_capacity = T + 1
-    bos_token = tokenizer.get_bos_token_id()
-    # --- Local packed-token cache (train only; val path skips cache-write) ---
-    cache_enabled = split == "train"
-    cache_gb = float(os.environ.get("HYDRA_TOKEN_CACHE_GB", "2"))
-    cache_dir = os.path.expanduser("~/.cache/autoresearch")
-    os.makedirs(cache_dir, exist_ok=True)
-    vocab_size = tokenizer.get_vocab_size()
-    cache_path = os.path.join(cache_dir, f"packed_tokens_v1_T{T}_V{vocab_size}_{split}.bin")
-    cache_target_bytes = int(cache_gb * 1024**3)
-    dtype_np = np.int32  # vocab < 2^31
-    bytes_per_row = row_capacity * 4  # int32
-    cache_rows_target = cache_target_bytes // bytes_per_row
-    # If train cache exists and is ready, mmap and yield from it
-    if cache_enabled and os.path.exists(cache_path) and os.path.getsize(cache_path) >= cache_target_bytes // 2:
-        print(f"[token-cache] using {cache_path} ({os.path.getsize(cache_path) / 1024**3:.2f} GB)")
-        yield from _mmap_cache_loader(cache_path, B, T, row_capacity, dtype_np)
-        return  # unreachable (mmap loader is infinite), but satisfies generator protocol
-    if cache_enabled:
-        print(f"[token-cache] building {cache_path} (target {cache_gb:.1f} GB) on first pass")
     batches = _document_batches(split)
     # Stage 2: tokenization prefetch thread. Each queue element is a list of
     # token-id lists (pre-tokenized docs). HYDRA_TOKEN_PREFETCH controls depth.
     tok_prefetch = int(os.environ.get("HYDRA_TOKEN_PREFETCH", "8"))
-    tok_q: queue.Queue = queue.Queue(maxsize=tok_prefetch)
-    tok_sentinel = object()
-    tok_err_box: list = []
     def tokenizer_producer():
         try:
@@ -395,8 +387,8 @@ def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 10
             if tok_err_box:
                 raise tok_err_box[0]
             raise StopIteration
-        token_lists, epoch = item
-        doc_buffer.extend(token_lists)
     row_buffer = torch.empty((B, row_capacity), dtype=torch.long)
     cpu_buffer = torch.empty(2 * B * T, dtype=torch.long, pin_memory=True)
@@ -406,10 +398,6 @@ def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 10
     inputs = gpu_buffer[: B * T].view(B, T)
     targets = gpu_buffer[B * T :].view(B, T)
-    # Open cache file for append-on-build
-    cache_fh = open(cache_path + ".tmp", "wb") if cache_enabled else None
-    cache_rows_written = 0
     while True:
         for row_idx in range(B):
             pos = 0
@@ -437,43 +425,6 @@ def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 10
         cpu_inputs.copy_(row_buffer[:, :-1])
         cpu_targets.copy_(row_buffer[:, 1:])
         gpu_buffer.copy_(cpu_buffer, non_blocking=True)
-        # Write packed rows to cache (append) until target size reached
-        if cache_fh is not None:
-            np_rows = row_buffer.numpy().astype(np.int32, copy=False)
-            cache_fh.write(np_rows.tobytes())
-            cache_rows_written += B
-            if cache_rows_written >= cache_rows_target:
-                cache_fh.flush()
-                cache_fh.close()
-                os.replace(cache_path + ".tmp", cache_path)
-                cache_fh = None
-                print(f"[token-cache] finalized {cache_path} ({cache_rows_written} rows)")
-        yield inputs, targets, epoch
-def _mmap_cache_loader(cache_path: str, B: int, T: int, row_capacity: int, dtype_np):
-    """Read packed (T+1) rows from mmap cache, cycle forever."""
-    data = np.memmap(cache_path, dtype=dtype_np, mode="r").reshape(-1, row_capacity)
-    n_rows = data.shape[0]
-    cpu_buffer = torch.empty(2 * B * T, dtype=torch.long, pin_memory=True)
-    gpu_buffer = torch.empty(2 * B * T, dtype=torch.long, device="cuda")
-    cpu_inputs = cpu_buffer[: B * T].view(B, T)
-    cpu_targets = cpu_buffer[B * T :].view(B, T)
-    inputs = gpu_buffer[: B * T].view(B, T)
-    targets = gpu_buffer[B * T :].view(B, T)
-    idx = 0
-    epoch = 1
-    while True:
-        if idx + B > n_rows:
-            idx = 0
-            epoch += 1
-        batch = torch.from_numpy(data[idx:idx + B].astype(np.int64, copy=True))
-        idx += B
-        cpu_inputs.copy_(batch[:, :-1])
-        cpu_targets.copy_(batch[:, 1:])
-        gpu_buffer.copy_(cpu_buffer, non_blocking=True)
         yield inputs, targets, epoch
@@ -511,22 +462,24 @@ def evaluate_bpb(model, tokenizer, B: int) -> float:
     return total_nats / (math.log(2) * max(total_bytes, 1))
-def ensure_tokenizer():
     """Ensure rustbpe tokenizer exists. If absent, train on a Nemotron stream
     sample using the same rustbpe.train_from_iterator API that prepare.py uses
     (production path — don't fork tokenizer training logic).
     """
     import pickle
     import torch
-    path = os.path.join(_p.TOKENIZER_DIR, "tokenizer.pkl")
-    token_bytes_path = os.path.join(_p.TOKENIZER_DIR, "token_bytes.pt")
-    if os.path.exists(path) and os.path.exists(token_bytes_path):
-        print(f"[nemotron] tokenizer + token_bytes already trained at {_p.TOKENIZER_DIR}", flush=True)
-        return
-    os.makedirs(_p.TOKENIZER_DIR, exist_ok=True)
     print(f"[nemotron] training BPE (vocab_size={_p.VOCAB_SIZE}) on stream sample…", flush=True)
-    import rustbpe
-    import tiktoken
     # Pull a sample of docs — use full blend if active so BPE covers all 7 sources.
     n_docs = int(os.environ.get("HYDRA_BPE_TRAIN_DOCS", "20000"))
@@ -542,7 +495,8 @@ def ensure_tokenizer():
     print(f"[nemotron] collected {len(sample_texts)} sample docs; training BPE…", flush=True)
     # Train rustbpe — identical API to prepare.py's train_tokenizer().
-    tokenizer = rustbpe.Tokenizer()
     vocab_size_no_special = _p.VOCAB_SIZE - len(_p.SPECIAL_TOKENS)
     tokenizer.train_from_iterator(iter(sample_texts), vocab_size_no_special, pattern=_p.SPLIT_PATTERN)
@@ -567,6 +521,7 @@ def ensure_tokenizer():
     for token_id in range(enc.n_vocab):
         tstr = enc.decode([token_id])
         token_bytes_list.append(0 if tstr in special_set else len(tstr.encode("utf-8")))
-    token_bytes_tensor = torch.tensor(token_bytes_list, dtype=torch.int32)
-    torch.save(token_bytes_tensor, token_bytes_path)
-    print(f"[nemotron] BPE + token_bytes saved to {_p.TOKENIZER_DIR}", flush=True)

 """
 from __future__ import annotations
+import os
+import random
+import importlib
+from itertools import cycle
+from typing import Any, Iterator, cast
 import torch
+import prepare as _p  # reuse tokenizer, BOS, byte-length helpers
 NEMOTRON_REPO = "nvidia/Nemotron-Pretraining-Specialized-v1.1"
 # Keys are logical dataset names used by _open_blend_stream / _open_stream.
 # ---------------------------------------------------------------------------
 FULL_BLEND_WEIGHTS: dict[str, float] = {
+    "fineweb-edu":            0.35,  # HuggingFaceFW/fineweb-edu
+    "fineweb":                0.15,  # HuggingFaceFW/fineweb (sample-100BT)
+    "stack-v2":               0.15,  # bigcode/the-stack-v2
+    "nemotron-math":          0.10,  # nvidia/Nemotron-CC-Math-v1
+    "nemotron-specialized":   0.10,  # nvidia/Nemotron-Pretraining-Specialized-v1.1
+    "wikipedia":              0.08,  # olm/wikipedia
+    "cosmopedia":             0.07,  # HuggingFaceTB/cosmopedia
 }
 # Mapping from logical blend name → (HF repo, optional config/name, text column).
     "Nemotron-Pretraining-Formal-Logic":                0.20,
     "Nemotron-Pretraining-Multiple-Choice":             0.20,
 }
+PHASE2_WEIGHTS = {
     "Nemotron-Pretraining-Multiple-Choice":             0.45,
     "Nemotron-Pretraining-Economics":                   0.20,
     "Nemotron-Pretraining-Formal-Logic":                0.15,
     "Nemotron-Pretraining-Code-Concepts":               0.10,
     "Nemotron-Pretraining-Unconditional-Algorithmic":   0.10,
+}
+type StreamBatch = tuple[list[str], int]
+type TokenBatch = tuple[list[list[int]], int]
+def _tokenizer_cache_repo() -> str:
+    """Return the HF repo id used for the tokenizer cache, or "" if unset.
+    The first environment variable below that holds a non-empty value wins;
+    they are checked in the order written.
+    """
+    return (
+        os.environ.get("HYDRA_TOKENIZER_CACHE_REPO")
+        or os.environ.get("FEATHER_HF_OUTPUT_REPO")
+        or os.environ.get("HF_REPO_ID")
+        or os.environ.get("HYDRA_RETINA_CACHE_REPO")
+        or os.environ.get("FEATHER_HF_RETINA_CACHE_REPO")
+        or ""
+    )
+def _tokenizer_cache_prefix() -> str:
+    """Return the in-repo folder for tokenizer artifacts, keyed by _p.VOCAB_SIZE."""
+    return f"tokenizer/vocab{_p.VOCAB_SIZE}"
+def maybe_hydrate_tokenizer_cache() -> bool:
+    """Try to download tokenizer artifacts from HF cache storage."""
+    repo_id = _tokenizer_cache_repo()
+    token = os.environ.get("HF_TOKEN")
+    if not repo_id or not token:
+        return False
+    try:
+        from huggingface_hub import hf_hub_download
+    except Exception as e:  # noqa: BLE001
+        print(f"[nemotron] tokenizer cache unavailable: {type(e).__name__}: {e}", flush=True)
+        return False
+    os.makedirs(_p.TOKENIZER_DIR, exist_ok=True)
+    prefix = _tokenizer_cache_prefix()
+    try:
+        hf_hub_download(
+            repo_id=repo_id,
+            repo_type="model",
+            subfolder=prefix,
+            filename="tokenizer.pkl",
+            token=token,
+            local_dir=_p.TOKENIZER_DIR,
+        )
+        hf_hub_download(
+            repo_id=repo_id,
+            repo_type="model",
+            subfolder=prefix,
+            filename="token_bytes.pt",
+            token=token,
+            local_dir=_p.TOKENIZER_DIR,
+        )
+    except Exception as e:  # noqa: BLE001
+        print(f"[nemotron] tokenizer cache miss in {repo_id}/{prefix}: {type(e).__name__}: {e}", flush=True)
+        return False
+    print(f"[nemotron] hydrated tokenizer cache from {repo_id}/{prefix}", flush=True)
+    return True
+def upload_tokenizer_cache() -> None:
+    """Upload tokenizer artifacts for reuse by future jobs.
+    Best-effort: does nothing when no cache repo or HF_TOKEN is configured or
+    when either local artifact is missing, and any upload error is printed
+    rather than raised.
+    """
+    repo_id = _tokenizer_cache_repo()
+    token = os.environ.get("HF_TOKEN")
+    if not repo_id or not token:
+        return
+    path = os.path.join(_p.TOKENIZER_DIR, "tokenizer.pkl")
+    token_bytes_path = os.path.join(_p.TOKENIZER_DIR, "token_bytes.pt")
+    # Only upload a complete pair; a lone file would be useless to consumers.
+    if not (os.path.exists(path) and os.path.exists(token_bytes_path)):
+        return
+    try:
+        from huggingface_hub import HfApi
+        api = HfApi(token=token)
+        prefix = _tokenizer_cache_prefix()
+        api.upload_file(path_or_fileobj=path, path_in_repo=f"{prefix}/tokenizer.pkl", repo_id=repo_id, repo_type="model")
+        api.upload_file(path_or_fileobj=token_bytes_path, path_in_repo=f"{prefix}/token_bytes.pt", repo_id=repo_id, repo_type="model")
+        print(f"[nemotron] uploaded tokenizer cache to {repo_id}/{prefix}", flush=True)
+    except Exception as e:  # noqa: BLE001
+        print(f"[nemotron] tokenizer cache upload skipped: {type(e).__name__}: {e}", flush=True)
 def _phase_weights() -> dict[str, float]:
     return PHASE2_WEIGHTS if phase == "phase2" else PHASE1_WEIGHTS
+def _open_stream(config: str, split: str):
     """Open a streaming iterator over one dataset config.
+    Handles two modes:
+      1. Nemotron sub-configs (e.g. "Nemotron-Pretraining-Code-Concepts") —
+         loaded from NEMOTRON_REPO with the config name.
+      2. Full-blend logical names (e.g. "fineweb-edu", "stack-v2") —
+         looked up in _BLEND_REGISTRY for repo / sub-config / text column.
+    Yields dicts; text extraction handled downstream by _extract_text.
+    Note: the `split` argument is not read by this function; both branches
+    request split="train".
+    """
+    load_dataset = importlib.import_module("datasets").load_dataset
+    token = os.environ.get("HF_TOKEN")
+    shuffle_buf = int(os.environ.get("HYDRA_STREAM_SHUFFLE_BUFFER", "2048"))
+    if config in _BLEND_REGISTRY:
+        # _text_col is unpacked but not used in this function.
+        repo, name, _text_col = _BLEND_REGISTRY[config]
+        kwargs: dict[str, object] = dict(
+            split="train",
+            streaming=True,
+            token=token,
+        )
+        if name is not None:
+            kwargs["name"] = name
+        # nemotron-specialized has multiple sub-configs; pick the first one
+        # (diversity blend) when accessed via the full-blend path.
         if config == "nemotron-specialized":
+            kwargs["name"] = "Nemotron-Pretraining-Code-Concepts"
             repo = NEMOTRON_REPO
+        ds = load_dataset(repo, **kwargs)
     else:
+        # Legacy Nemotron sub-config path (Phase 1 / Phase 2).
         ds = load_dataset(
+            NEMOTRON_REPO,
+            config,
             split="train",
             streaming=True,
+            token=token,
         )
+    # Shuffle with a fixed seed (42); the buffer size comes from
+    # HYDRA_STREAM_SHUFFLE_BUFFER.
     ds = ds.shuffle(seed=42, buffer_size=shuffle_buf)
     return iter(ds)
+def _extract_text(row: dict[str, object]) -> str:
     """Pick the right text column — datasets have different column names.
     Priority order: text, content, prompt_completion, question, body.
     For math datasets that split into problem+solution, concatenate both.
     Fallback: concatenate all string-valued fields.
     """
+    # Fast path: most datasets use "text" or "content".
+    for k in ("text", "content", "prompt_completion", "question", "body"):
+        value = row.get(k)
+        if isinstance(value, str) and value:
+            return value
     # Math datasets may have problem + solution as separate fields.
     if "problem" in row and "solution" in row:
         p = row["problem"] or ""
     return "\n".join(parts)
+class _WeightedStream:
     """Infinite weighted-round-robin over configs' streaming iterators."""
+    def __init__(self, weights: dict[str, float], seed: int = 0):
+        self.configs = list(weights.keys())
+        self.weights = [weights[c] for c in self.configs]
+        self.streams: dict[str, Iterator[dict[str, object]]] = {
+            c: _open_stream(c, "train") for c in self.configs
+        }
+        self.rng = random.Random(seed)
+        self.epoch = 1
+        self._factual_docs: list[str] | None = None
+        self._factual_idx = 0
+        self._inject_counter = 0
     def _reopen(self, config: str):
         # stream exhausted — reopen (HF streaming typically infinite but restart on edge)
         # exist in the Nemotron configs. Controlled by HYDRA_FACTUAL_INJECT_RATE
         # (default 50 = inject one factual doc every 50 Nemotron docs = ~2%).
         inject_rate = int(os.environ.get("HYDRA_FACTUAL_INJECT_RATE", "50"))
+        if inject_rate > 0 and self._factual_docs is None:
+            factual_path = os.path.join(
+                os.path.dirname(os.path.abspath(__file__)), "data", "factual", "facts.txt")
+            if os.path.exists(factual_path):
+                self._factual_docs = open(factual_path).read().strip().split('\n')
+                self._factual_idx = 0
+                self._inject_counter = 0
+        if inject_rate > 0 and self._factual_docs:
+            self._inject_counter += 1
+            if self._inject_counter >= inject_rate:
+                self._inject_counter = 0
+                doc = self._factual_docs[self._factual_idx % len(self._factual_docs)]
+                self._factual_idx += 1
+                return doc, self.epoch
         config = self.rng.choices(self.configs, weights=self.weights, k=1)[0]
         try:
         stream = _WeightedStream(_phase_weights(), seed=0)
     prefetch_depth = int(os.environ.get("HYDRA_STREAM_PREFETCH", "32"))
+    q: queue.Queue[StreamBatch | object] = queue.Queue(maxsize=prefetch_depth)
+    sentinel_stop = object()
+    error_box: list[BaseException] = []
     def producer():
         try:
             if error_box:
                 raise error_box[0]
             return
+        yield cast(StreamBatch, item)
 def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 1000):
       stage 2: BPE tokenization → token-id lists (this function's producer thread)
       stage 3: best-fit packing → (B, T+1) tensor rows (main thread, consumes)
+    Queue depths tunable via HYDRA_STREAM_PREFETCH and HYDRA_TOKEN_PREFETCH.
+    Goal: zero tps loss from I/O or tokenizer overhead — training loop pulls
+    from an always-full queue.
     """
     import queue
     import threading
     assert split in ("train", "val")
     row_capacity = T + 1
     batches = _document_batches(split)
+    bos_token = tokenizer.get_bos_token_id()
     # Stage 2: tokenization prefetch thread. Each queue element is a list of
     # token-id lists (pre-tokenized docs). HYDRA_TOKEN_PREFETCH controls depth.
     tok_prefetch = int(os.environ.get("HYDRA_TOKEN_PREFETCH", "8"))
+    tok_q: queue.Queue[TokenBatch | object] = queue.Queue(maxsize=tok_prefetch)
+    tok_sentinel = object()
+    tok_err_box: list[BaseException] = []
     def tokenizer_producer():
         try:
             if tok_err_box:
                 raise tok_err_box[0]
             raise StopIteration
+        token_lists, epoch = cast(TokenBatch, item)
+        doc_buffer.extend(token_lists)
     row_buffer = torch.empty((B, row_capacity), dtype=torch.long)
     cpu_buffer = torch.empty(2 * B * T, dtype=torch.long, pin_memory=True)
     inputs = gpu_buffer[: B * T].view(B, T)
     targets = gpu_buffer[B * T :].view(B, T)
     while True:
         for row_idx in range(B):
             pos = 0
         cpu_inputs.copy_(row_buffer[:, :-1])
         cpu_targets.copy_(row_buffer[:, 1:])
         gpu_buffer.copy_(cpu_buffer, non_blocking=True)
         yield inputs, targets, epoch
     return total_nats / (math.log(2) * max(total_bytes, 1))
+def ensure_tokenizer():
     """Ensure rustbpe tokenizer exists. If absent, train on a Nemotron stream
     sample using the same rustbpe.train_from_iterator API that prepare.py uses
     (production path — don't fork tokenizer training logic).
     """
     import pickle
     import torch
+    path = os.path.join(_p.TOKENIZER_DIR, "tokenizer.pkl")
+    token_bytes_path = os.path.join(_p.TOKENIZER_DIR, "token_bytes.pt")
+    if os.path.exists(path) and os.path.exists(token_bytes_path):
+        print(f"[nemotron] tokenizer + token_bytes already trained at {_p.TOKENIZER_DIR}", flush=True)
+        return
+    if maybe_hydrate_tokenizer_cache() and os.path.exists(path) and os.path.exists(token_bytes_path):
+        return
+    os.makedirs(_p.TOKENIZER_DIR, exist_ok=True)
     print(f"[nemotron] training BPE (vocab_size={_p.VOCAB_SIZE}) on stream sample…", flush=True)
+    import rustbpe
+    import tiktoken
     # Pull a sample of docs — use full blend if active so BPE covers all 7 sources.
     n_docs = int(os.environ.get("HYDRA_BPE_TRAIN_DOCS", "20000"))
     print(f"[nemotron] collected {len(sample_texts)} sample docs; training BPE…", flush=True)
     # Train rustbpe — identical API to prepare.py's train_tokenizer().
+    tokenizer_cls = getattr(rustbpe, "Tokenizer")
+    tokenizer: Any = tokenizer_cls()
     vocab_size_no_special = _p.VOCAB_SIZE - len(_p.SPECIAL_TOKENS)
     tokenizer.train_from_iterator(iter(sample_texts), vocab_size_no_special, pattern=_p.SPLIT_PATTERN)
     for token_id in range(enc.n_vocab):
         tstr = enc.decode([token_id])
         token_bytes_list.append(0 if tstr in special_set else len(tstr.encode("utf-8")))
+    token_bytes_tensor = torch.tensor(token_bytes_list, dtype=torch.int32)
+    torch.save(token_bytes_tensor, token_bytes_path)
+    print(f"[nemotron] BPE + token_bytes saved to {_p.TOKENIZER_DIR}", flush=True)
+    upload_tokenizer_cache()

overlay/pyproject.toml CHANGED Viewed

@@ -7,6 +7,7 @@ requires-python = ">=3.11"
 dependencies = [
     "matplotlib>=3.10.8",
     "numpy>=2.2.6",
     "pandas>=2.3.3",
     "pyarrow>=21.0.0",
     "requests>=2.32.0",

 dependencies = [
     "matplotlib>=3.10.8",
     "numpy>=2.2.6",
+    "optuna>=4.4.0",
     "pandas>=2.3.3",
     "pyarrow>=21.0.0",
     "requests>=2.32.0",

overlay/scripts/autoresearch_iter.sh ADDED Viewed

	@@ -0,0 +1,144 @@

+#!/bin/bash
+# Autoresearch single-iteration runner — called from cron every 5 min.
+#
+# Philosophy (Apr 22 2026 rewrite): HYDRA is NOT a transformer. Semantic
+# folding (SDR retina) + HTM episodic engram + GDN memory layers provide
+# enormous latent capacity at tiny d_model. DEPTH > WIDTH. Per the user's
+# guidance, start absolute-smallest, fill VRAM with depth.
+#
+# Base config: d_model=128, n_layer=16 (~60M params). Mutations explore
+# deeper stacks, engram/GDN layout, SDR sparsity. Eval OOM fixed via
+# HYDRA_EVAL_BATCH=1 + HYDRA_CE_CHUNK=64 (was =1024 = no chunking).
+set -u
+REPO=/home/mikeb/work/feather
+RESULTS=$REPO/results.tsv
+LOG_DIR=$REPO/.omc/autoresearch_logs
+mkdir -p "$LOG_DIR"
+ITER_LOG=$LOG_DIR/iter_$(date +%Y%m%d_%H%M%S).log
+cd "$REPO"
+# Skip if training already running — check the actual python process, not shells
+# whose argv merely contains the pattern string (e.g. pgrep wait-loops).
+if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {found=1} END {exit !found}'; then
+  echo "[$(date +%H:%M:%S)] skip — training already running" >> "$LOG_DIR/skips.log"
+  exit 0
+fi
+# Skip if stop-file exists
+if [ -f "$REPO/.omc/autoresearch_STOP" ]; then
+  echo "[$(date +%H:%M:%S)] STOPPED — .omc/autoresearch_STOP exists" >> "$LOG_DIR/skips.log"
+  exit 0
+fi
+# Compute next experiment index from results.tsv
+if [ ! -f "$RESULTS" ]; then
+  printf "experiment\tcommit\tval_bpb\ttps_avg\tfactual\tstatus\tdescription\n" > "$RESULTS"
+fi
+NEXT_EXP=$(awk -F'\t' 'NR>1 && $1~/^[0-9]+$/ {if ($1+0 > max) max=$1+0} END {print max+1}' "$RESULTS")
+[ -z "$NEXT_EXP" ] && NEXT_EXP=1
+# Mutation pool — explores deep+narrow regime.
+# Base: d_model=128, n_layer=16, expand=3, d_state=64, engram=8192, B=16, seq=1024, GDN@5,11
+MUTATIONS=(
+  "baseline-deep-narrow|"
+  "n_layer=16 (shallower-control)|HYDRA_N_LAYER=16"
+  "n_layer=24 (max depth)|HYDRA_N_LAYER=24"
+  "d_model=96 (leaner)|HYDRA_D_MODEL=96"
+  "d_model=160 (slightly wider)|HYDRA_D_MODEL=160"
+  "GDN_LAYERS=0,3,6,9,12,15,18 (7 GDN)|HYDRA_GDN_LAYERS=0,3,6,9,12,15,18"
+  "GDN_LAYERS=1,3,5,7,9,11,13,15,17 (9 GDN)|HYDRA_GDN_LAYERS=1,3,5,7,9,11,13,15,17"
+  "GDN_LAYERS= (all-Mamba3 depth)|HYDRA_GDN_LAYERS="
+  "D_STATE=128 (fatter SSM state)|HYDRA_D_STATE=128"
+  "D_STATE=32 (leaner SSM state)|HYDRA_D_STATE=32"
+  "EXPAND=2 (leaner FFN)|HYDRA_EXPAND=2"
+  "EXPAND=4 (fatter FFN)|HYDRA_EXPAND=4"
+  "engram=32768 (even wider)|HYDRA_ENGRAM_N_COLUMNS=32768"
+  "engram_topk=128 (denser retrieve)|HYDRA_ENGRAM_TOPK=128"
+  "D_STATE=96 (mid SSM)|HYDRA_D_STATE=96"
+  "HTM_SUBSAMPLE=64 (2x HTM)|HYDRA_HTM_SUBSAMPLE=64"
+  "batch=16 (fill VRAM)|HYDRA_BATCH_SIZE=16"
+  "batch=4 seq=2048 (long-range)|HYDRA_BATCH_SIZE=4 HYDRA_SEQ_LEN=2048"
+  "MATRIX_LR=0.18|HYDRA_MATRIX_LR=0.18"
+  "WARMUP_RATIO=0.05|HYDRA_WARMUP_RATIO=0.05"
+  "total_batch=16384 (2x opt steps)|HYDRA_TOTAL_BATCH=16384"
+  "total_batch=8192 (4x opt steps)|HYDRA_TOTAL_BATCH=8192"
+  "HEADDIM=64 (bigger heads)|HYDRA_HEADDIM=64"
+  "engram_layer_idx=8 (mid-stack)|HYDRA_ENGRAM_LAYER_IDX=8"
+  "EXPAND=4 + n_layer=20 (fat+deep)|HYDRA_EXPAND=4 HYDRA_N_LAYER=20"
+  "B=16 + total_batch=16384|HYDRA_BATCH_SIZE=16 HYDRA_TOTAL_BATCH=16384"
+  "engram=32768 + EXPAND=4|HYDRA_ENGRAM_N_COLUMNS=32768 HYDRA_EXPAND=4"
+  "MTP_K=2 + HEADDIM=64|HYDRA_MTP_K=2 HYDRA_HEADDIM=64"
+  "label_smoothing=0.1|HYDRA_LABEL_SMOOTHING=0.1"
+  "z_loss=0.001 (10x)|HYDRA_Z_LOSS_WEIGHT=0.001"
+  "HTM_STOP_GRAD=1|HYDRA_HTM_STOP_GRAD=1"
+  "DROPOUT=0.0|HYDRA_DROPOUT=0.0"
+  "TIME=900s long-budget champion|HYDRA_TIME_BUDGET=900 HYDRA_ENGRAM_N_COLUMNS=32768 HYDRA_EXPAND=4"
+  "TIME=1200s deep n_layer=24|HYDRA_TIME_BUDGET=1200 HYDRA_N_LAYER=24"
+)
+# Index into mutation pool (wrap around for continuous search, start at exp13)
+# Bash's % keeps the sign of the dividend, so NEXT_EXP < 13 gives a negative
+# index; the clamp on the following line maps those runs to entry 0.
+MUT_IDX=$(( (NEXT_EXP - 13) % ${#MUTATIONS[@]} ))
+[ "$MUT_IDX" -lt 0 ] && MUT_IDX=0
+# Each pool entry is "description|ENV=val ..."; DESC takes the text before the
+# first '|' and EXTRA_ENV takes the rest (possibly empty).
+IFS='|' read -r DESC EXTRA_ENV <<< "${MUTATIONS[$MUT_IDX]}"
+echo "[$(date +%H:%M:%S)] Starting exp $NEXT_EXP: $DESC" >> "$ITER_LOG"
+# Launch training with mutation
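+# KEY CHANGES vs prior iter:
+#   d_model 384→128   (3x narrower)
+#   n_layer 10→16     (1.6x deeper)
+#   batch 8→16        (fill VRAM)
+#   CE_CHUNK 1024→64  (16x smaller eval logit chunks — fixes OOM)
+#   EVAL_BATCH 2→1    (halve eval memory)
+#   EVAL_TOKENS 131K  (keep, ~3-4s eval)
+# NOTE(review): the env block below currently sets HYDRA_D_MODEL=160,
+#   HYDRA_N_LAYER=20, HYDRA_BATCH_SIZE=8, HYDRA_CE_CHUNK=32 and
+#   HYDRA_EVAL_TOKENS=8192, so the figures above look stale — confirm which
+#   set is intended.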
+rm -f run.log
+env \
+  LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
+  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
+  HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
+  HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
+  HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
+  HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
+  HYDRA_TIME_BUDGET=600 \
+  HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
+  HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
+  HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
+  HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
+  HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
+  HYDRA_CKPT_INTERVAL=0 HYDRA_MID_VAL_INTERVAL=0 \
+  HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
+  HYDRA_Z_LOSS_WEIGHT=0.001 \
+  HYDRA_RESUME_CKPT=none \
+  $EXTRA_ENV \
+  ./.venv/bin/python -u train.py > run.log 2>&1
+STATUS=$?
+# Parse metrics
+METRICS=$(./.venv/bin/python scripts/parse_metrics.py run.log 2>/dev/null || echo "NA	NA	NA")
+VAL_BPB=$(echo "$METRICS" | cut -f1)
+TPS=$(echo "$METRICS" | cut -f2)
+FACTUAL=$(echo "$METRICS" | cut -f3)
+COMMIT=$(git rev-parse --short HEAD)
+# BPB can be: "NA" (parse fail), "~X.XXXX" (train_bpb fallback when eval OOMs),
+# or "X.XXXX" (real val_bpb). The ~ prefix marks the fallback.
+if [ "$STATUS" -ne 0 ]; then
+  STATUS_STR="crash"
+elif [ "$VAL_BPB" = "NA" ]; then
+  STATUS_STR="no_metrics"
+elif [[ "$VAL_BPB" == ~* ]]; then
+  STATUS_STR="train_bpb"
+else
+  STATUS_STR="ok"
+fi
+printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" "$NEXT_EXP" "$COMMIT" "$VAL_BPB" "$TPS" "$FACTUAL" "$STATUS_STR" "$DESC" >> "$RESULTS"
+echo "[$(date +%H:%M:%S)] Done exp $NEXT_EXP: bpb=$VAL_BPB tps=$TPS factual=$FACTUAL status=$STATUS_STR" >> "$ITER_LOG"
+# Auto-stop condition: great result
+if [ "$FACTUAL" != "NA" ]; then
+  HITS=$(echo "$FACTUAL" | cut -d/ -f1)
+  if [ -n "$HITS" ] && [ "$HITS" -ge 7 ] 2>/dev/null; then
+    touch "$REPO/.omc/autoresearch_STOP"
+    echo "[$(date +%H:%M:%S)] STOP: reached factual>=7/9 at exp $NEXT_EXP" >> "$ITER_LOG"
+  fi
+fi

overlay/scripts/benchmark_hyena_stack.py CHANGED Viewed

@@ -26,8 +26,11 @@ Invocation:
     # On A100/A10G (production cloud hardware), use time=900 (15 min) for
     # stable steady-state numbers.
-After each run the script prints:
-    BENCHMARK config=<name> tps_steady=<avg> bpb_at_500=<val> vram_peak=<MiB>
 Collate those lines into the matrix table manually, then pick the winner
 for the 6-hour production run (HYDRA_TIME_BUDGET=21600).
@@ -81,7 +84,7 @@ CONFIGS = {
 }
-def build_env(cfg_overrides: dict) -> dict:
     """Compose a full env dict from the inherited env + config overrides."""
     env = os.environ.copy()
     # Ensure the Hyena layer selection is always present (defaults to off).
@@ -91,7 +94,7 @@ def build_env(cfg_overrides: dict) -> dict:
     return env
-def parse_step_line(line: str) -> dict | None:
     """Parse a single step=... line into a dict of metrics, or None."""
     if not line.startswith("step="):
         return None
@@ -102,7 +105,7 @@ def parse_step_line(line: str) -> dict | None:
         return None
-def summarize(log_path: Path, warmup_steps: int = 50) -> dict:
     """Tail log_path, compute steady-state TPS / BPB@500 / VRAM peak.
     Skips the first `warmup_steps` to discard CUDA graph capture / autotune
@@ -138,20 +141,29 @@ def summarize(log_path: Path, warmup_steps: int = 50) -> dict:
     tps_sorted = sorted(tps_vals)
     tps_steady = tps_sorted[len(tps_sorted) // 2]  # median
-    return {
-        "tps_steady": tps_steady,
-        "bpb_at_500": bpb_at_500 or (bpbs[-1] if bpbs else 0.0),
-        "vram_peak": vram_peak,
-        "steps": len(tps_vals) + warmup_steps,
-    }
-def main() -> int:
-    ap = argparse.ArgumentParser()
-    ap.add_argument("--config", required=True, choices=list(CONFIGS))
-    ap.add_argument("--time", type=int, default=300, help="training seconds")
-    ap.add_argument("--log", default=None, help="output log path (default: run_bench_<cfg>.log)")
-    args = ap.parse_args()
     cfg = CONFIGS[args.config]
     log_path = Path(args.log or (REPO / f"run_bench_{args.config}.log"))
@@ -178,16 +190,25 @@ def main() -> int:
         print(f"BENCH FAIL config={args.config}", flush=True)
         return proc.returncode
-    summary = summarize(log_path)
-    print(
-        f"BENCHMARK config={args.config} "
-        f"tps_steady={summary['tps_steady']:.0f} "
-        f"bpb_at_500={summary['bpb_at_500']:.4f} "
-        f"vram_peak={summary['vram_peak']:.0f}MiB "
-        f"steps={summary['steps']}",
-        flush=True,
-    )
-    return 0
 if __name__ == "__main__":

     # On A100/A10G (production cloud hardware), use time=900 (15 min) for
     # stable steady-state numbers.
+After each run the script prints:
+    BENCHMARK config=<name> tps_steady=<avg> bpb_at_500=<val> vram_peak=<MiB>
+If `--min-tps` is set (>0), the script exits non-zero when steady-state TPS
+falls below the threshold.
 Collate those lines into the matrix table manually, then pick the winner
 for the 6-hour production run (HYDRA_TIME_BUDGET=21600).
 }
+def build_env(cfg_overrides: dict[str, str]) -> dict[str, str]:
     """Compose a full env dict from the inherited env + config overrides."""
     env = os.environ.copy()
     # Ensure the Hyena layer selection is always present (defaults to off).
     return env
+def parse_step_line(line: str) -> dict[str, float] | None:
     """Parse a single step=... line into a dict of metrics, or None."""
     if not line.startswith("step="):
         return None
         return None
+def summarize(log_path: Path, warmup_steps: int = 50) -> dict[str, float]:
     """Tail log_path, compute steady-state TPS / BPB@500 / VRAM peak.
     Skips the first `warmup_steps` to discard CUDA graph capture / autotune
     tps_sorted = sorted(tps_vals)
     tps_steady = tps_sorted[len(tps_sorted) // 2]  # median
+    return {
+        "tps_steady": tps_steady,
+        "bpb_at_500": bpb_at_500 or (bpbs[-1] if bpbs else 0.0),
+        "vram_peak": vram_peak,
+        "steps": len(tps_vals) + warmup_steps,
+    }
+def fails_tps_floor(summary: dict[str, float], min_tps: float) -> bool:
+    """Return True when the summary's steady-state TPS is below min_tps.
+    A non-positive min_tps disables the check (always False). A summary with
+    no "tps_steady" key is treated as 0.0 TPS.
+    """
+    if min_tps <= 0:
+        return False
+    tps_steady = float(summary.get("tps_steady", 0.0))
+    return tps_steady < float(min_tps)
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--config", required=True, choices=list(CONFIGS))
+    ap.add_argument("--time", type=int, default=300, help="training seconds")
+    ap.add_argument("--log", default=None, help="output log path (default: run_bench_<cfg>.log)")
+    ap.add_argument("--min-tps", type=float, default=50000.0, help="Required steady-state TPS floor (set 0 to disable)")
+    ap.add_argument("--warmup-steps", type=int, default=50, help="Number of initial steps to skip before TPS median")
+    args = ap.parse_args()
     cfg = CONFIGS[args.config]
     log_path = Path(args.log or (REPO / f"run_bench_{args.config}.log"))
         print(f"BENCH FAIL config={args.config}", flush=True)
         return proc.returncode
+    summary = summarize(log_path, warmup_steps=max(0, int(args.warmup_steps)))
+    print(
+        f"BENCHMARK config={args.config} "
+        f"tps_steady={summary['tps_steady']:.0f} "
+        f"bpb_at_500={summary['bpb_at_500']:.4f} "
+        f"vram_peak={summary['vram_peak']:.0f}MiB "
+        f"steps={summary['steps']}",
+        flush=True,
+    )
+    if fails_tps_floor(summary, args.min_tps):
+        print(
+            f"BENCH FAIL config={args.config} tps_steady={summary['tps_steady']:.0f} < min_tps={args.min_tps:.0f}",
+            flush=True,
+        )
+        return 2
+    print(f"BENCH PASS config={args.config} min_tps={args.min_tps:.0f}", flush=True)
+    return 0
 if __name__ == "__main__":

overlay/scripts/export_hpo_priors.py ADDED Viewed

	@@ -0,0 +1,74 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse
+import datetime as dt
+import json
+from pathlib import Path
+from typing import Any
+import optuna
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options for the priors export.
+    --study-name may be repeated; it defaults to an empty list here and the
+    caller substitutes a default study name when none is given.
+    """
+    parser = argparse.ArgumentParser(description="Export top Optuna trials as transfer-learning priors")
+    parser.add_argument("--study-name", action="append", default=[], help="Repeat to merge multiple studies")
+    parser.add_argument("--storage", default="sqlite:///optuna_hpo.db")
+    parser.add_argument("--top-k", type=int, default=20)
+    parser.add_argument("--out", type=Path, default=Path("docs") / "hpo_transfer_priors.json")
+    parser.add_argument("--metric", default="val_bpb")
+    return parser.parse_args()
+def _completed_trials(study: optuna.Study) -> list[optuna.trial.FrozenTrial]:
+    trials = [t for t in study.trials if t.value is not None]
+    reverse = study.direction == optuna.study.StudyDirection.MAXIMIZE
+    return sorted(trials, key=lambda t: float(t.value), reverse=reverse)
+def _serialize_trial(trial: optuna.trial.FrozenTrial) -> dict[str, Any]:
+    """Convert a trial into a plain dict (number, value, params, user_attrs)."""
+    return {
+        "trial_number": trial.number,
+        "value": float(trial.value) if trial.value is not None else None,
+        # Copy the mappings so the payload does not alias the trial's own dicts.
+        "params": dict(trial.params),
+        "user_attrs": dict(trial.user_attrs),
+    }
+def main() -> int:
+    """Export the top-k trials of each requested study to a JSON priors file.
+    Trials are taken per study (top_k each) and concatenated in study order;
+    the merged list is not re-ranked across studies. Returns 0 on success.
+    """
+    args = parse_args()
+    # Fall back to the default study name when --study-name was not given.
+    study_names = args.study_name or ["hydra_hpo"]
+    merged_trials: list[dict[str, Any]] = []
+    total_trials = 0
+    total_completed = 0
+    for study_name in study_names:
+        study = optuna.load_study(study_name=study_name, storage=args.storage)
+        ranked = _completed_trials(study)
+        # max(0, ...) turns a negative --top-k into "select nothing".
+        selected = ranked[: max(0, args.top_k)]
+        total_trials += len(study.trials)
+        total_completed += len(ranked)
+        for t in selected:
+            row = _serialize_trial(t)
+            row["study_name"] = study_name
+            merged_trials.append(row)
+    payload = {
+        "schema_version": 1,
+        "generated_at": dt.datetime.now(dt.UTC).isoformat(timespec="seconds"),
+        "study_names": study_names,
+        "metric": args.metric,
+        "n_total_trials": total_trials,
+        "n_completed_trials": total_completed,
+        "top_k_per_study": args.top_k,
+        "trials": merged_trials,
+    }
+    args.out.parent.mkdir(parents=True, exist_ok=True)
+    args.out.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+    print(f"[hpo-priors] wrote {args.out} with {len(merged_trials)} merged trials")
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

overlay/scripts/hpo_orchestrator.py ADDED Viewed

	@@ -0,0 +1,319 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse
+import json
+import os
+import subprocess
+import sys
+from pathlib import Path
+from typing import Any
+import optuna
+REPO_ROOT = Path(__file__).resolve().parents[1]
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+from scripts.hf_routing import resolve_routing
+HPO_SCRIPT = REPO_ROOT / "scripts" / "optuna_hpo.py"
+def _run_worker(args: list[str]) -> int:
+    """Run HPO_SCRIPT with `args` under the current interpreter and return its exit code.
+    The subprocess runs with REPO_ROOT as its working directory and is waited
+    on synchronously.
+    """
+    cmd = [sys.executable, str(HPO_SCRIPT), *args]
+    proc = subprocess.run(cmd, cwd=str(REPO_ROOT), text=True)
+    return proc.returncode
+def _study_stats(storage: str, study_name: str) -> dict[str, Any]:
+    try:
+        study = optuna.load_study(study_name=study_name, storage=storage)
+    except KeyError:
+        return {
+            "study_name": study_name,
+            "status": "missing",
+            "direction": None,
+            "n_trials": 0,
+            "n_completed": 0,
+            "n_pruned": 0,
+            "n_failed": 0,
+        }
+    completed = [t for t in study.trials if t.value is not None]
+    pruned = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
+    failed = [t for t in study.trials if t.state == optuna.trial.TrialState.FAIL]
+    stats: dict[str, Any] = {
+        "study_name": study.study_name,
+        "direction": str(study.direction),
+        "n_trials": len(study.trials),
+        "n_completed": len(completed),
+        "n_pruned": len(pruned),
+        "n_failed": len(failed),
+    }
+    if completed:
+        stats.update(
+            {
+                "best_value": study.best_value,
+                "best_params": study.best_params,
+                "best_trial_number": study.best_trial.number,
+                "best_trial_user_attrs": study.best_trial.user_attrs,
+            }
+        )
+    return stats
+def _phase_args(phase: str, base: argparse.Namespace) -> list[str]:
+    """Build the worker CLI argument list for one HPO phase.
+    `common` holds the options shared by both phases; the phase-specific
+    trial count, budgets, pruner and patience settings are appended from the
+    matching phase1_* / phase2_* attributes of `base`.
+    Raises ValueError when `phase` is neither "phase1" nor "phase2".
+    """
+    common = [
+        "--study-name",
+        base.study_name,
+        "--storage",
+        base.storage,
+        "--metric",
+        base.metric,
+        "--direction",
+        base.direction,
+        "--seed",
+        str(base.seed),
+        "--min-tps",
+        str(base.min_tps),
+        "--summary-out",
+        str(base.summary_out),
+        "--runner",
+        base.runner,
+        "--hf-namespace",
+        base.hf_namespace,
+        "--hf-image",
+        base.hf_image,
+        "--hf-flavor",
+        base.hf_flavor,
+        "--hf-timeout",
+        base.hf_timeout,
+        "--hf-command",
+        base.hf_command,
+        "--hf-token-env",
+        base.hf_token_env,
+        "--hf-poll-interval",
+        str(base.hf_poll_interval),
+        "--hf-launcher-script",
+        str(base.hf_launcher_script),
+        "--priors-file",
+        str(base.priors_file),
+    ]
+    # Optional value: only forwarded when set.
+    if base.hf_output_repo:
+        common.extend(["--hf-output-repo", base.hf_output_repo])
+    if base.hf_use_bash:
+        common.append("--hf-use-bash")
+    # These two booleans are always forwarded explicitly, as either the
+    # positive flag or its --no- counterpart.
+    if base.hf_stop_after_metric:
+        common.append("--hf-stop-after-metric")
+    else:
+        common.append("--no-hf-stop-after-metric")
+    if base.apply_priors:
+        common.append("--apply-priors")
+    else:
+        common.append("--no-apply-priors")
+    if phase == "phase1":
+        return [
+            *common,
+            "--trials",
+            str(base.phase1_trials),
+            "--trial-time-budget",
+            str(base.phase1_trial_time_budget),
+            "--trial-timeout",
+            str(base.phase1_trial_timeout),
+            "--n-startup-trials",
+            str(base.phase1_n_startup),
+            "--n-warmup-steps",
+            str(base.phase1_n_warmup),
+            "--patience-trials",
+            str(base.phase1_patience),
+            "--min-improvement",
+            str(base.phase1_min_improvement),
+        ]
+    if phase == "phase2":
+        return [
+            *common,
+            "--trials",
+            str(base.phase2_trials),
+            "--trial-time-budget",
+            str(base.phase2_trial_time_budget),
+            "--trial-timeout",
+            str(base.phase2_trial_timeout),
+            "--n-startup-trials",
+            str(base.phase2_n_startup),
+            "--n-warmup-steps",
+            str(base.phase2_n_warmup),
+            "--patience-trials",
+            str(base.phase2_patience),
+            "--min-improvement",
+            str(base.phase2_min_improvement),
+        ]
+    raise ValueError(f"Unknown phase: {phase}")
+def cmd_phase(args: argparse.Namespace) -> int:
+    """Run one phase through a single worker, then publish the study summary.
+
+    The summary (phase name plus study statistics) is written as JSON to
+    ``args.summary_out`` and echoed to stdout. The return value is whatever
+    ``_run_worker`` returned, so a summary is produced even for a failed run.
+    """
+    worker_rc = _run_worker(_phase_args(args.phase, args))
+    summary = {"phase": args.phase, "stats": _study_stats(args.storage, args.study_name)}
+    out_path = args.summary_out
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    # Serialize once; the same text goes to the file and to stdout.
+    report = json.dumps(summary, indent=2)
+    out_path.write_text(report, encoding="utf-8")
+    print(report)
+    return worker_rc
+def cmd_parallel(args: argparse.Namespace) -> int:
+    """Run one phase with ``args.workers`` concurrent worker processes.
+
+    Each worker is ``HPO_SCRIPT`` started with the interpreter running this
+    script and the arguments from ``_phase_args``. After all workers exit, a
+    JSON summary (phase, worker count, per-worker exit codes, study stats) is
+    written to ``args.summary_out`` and printed.
+
+    Returns:
+        0 if every worker exited with 0, 1 if any worker failed, and 2 if
+        ``args.workers`` is smaller than 1.
+
+    Raises:
+        OSError: if a worker process cannot be started; workers that were
+            already launched are terminated and reaped first.
+    """
+    # With zero workers the success check below would pass vacuously
+    # (all([]) is True), reporting success although nothing ran.
+    if args.workers < 1:
+        print(f"--workers must be >= 1 (got {args.workers})", file=sys.stderr)
+        return 2
+    worker_args = _phase_args(args.phase, args)
+    # The command line is identical for every worker; build it once.
+    cmd = [sys.executable, str(HPO_SCRIPT), *worker_args]
+    procs: list[subprocess.Popen[str]] = []
+    try:
+        for _ in range(args.workers):
+            procs.append(subprocess.Popen(cmd, cwd=str(REPO_ROOT), text=True))
+    except OSError:
+        # A failed launch must not leave earlier workers running unattended.
+        for proc in procs:
+            proc.terminate()
+        for proc in procs:
+            proc.wait()
+        raise
+    exit_codes = [p.wait() for p in procs]
+    stats = _study_stats(args.storage, args.study_name)
+    payload = {
+        "phase": args.phase,
+        "workers": args.workers,
+        "exit_codes": exit_codes,
+        "stats": stats,
+    }
+    args.summary_out.parent.mkdir(parents=True, exist_ok=True)
+    args.summary_out.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+    print(json.dumps(payload, indent=2))
+    return 0 if all(code == 0 for code in exit_codes) else 1
+def cmd_recommend(args: argparse.Namespace) -> int:
+    """Print and save a next-step recommendation based on the current study.
+
+    Three outcomes are possible: the study statistics report a ``"missing"``
+    status (create the study first), fewer than 10 trials have completed
+    (keep running phase1), or enough data exists (suggest a phase2 run). The
+    payload is written to ``args.summary_out`` and printed. Always returns 0.
+    """
+    def _publish(payload: dict) -> int:
+        # Persist the payload next to other orchestrator summaries and echo it.
+        args.summary_out.parent.mkdir(parents=True, exist_ok=True)
+        args.summary_out.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+        print(json.dumps(payload, indent=2))
+        return 0
+    stats = _study_stats(args.storage, args.study_name)
+    min_tps_floor = float(args.min_tps)
+    if stats.get("status") == "missing":
+        return _publish(
+            {
+                "stats": stats,
+                "recommendation": {
+                    "status": "create_study_first",
+                    "next_step": "Run phase1 (serial or parallel) to create and populate the study.",
+                    "example": f"python scripts/hpo_orchestrator.py parallel --phase phase1 --workers 3 --storage {args.storage} --study-name {args.study_name}",
+                },
+            }
+        )
+    enough_trials = int(stats.get("n_completed", 0)) >= 10
+    if enough_trials:
+        recommendation = {
+            "status": "ready_for_full_optimization",
+            "next_step": "Run phase2 with 3-4 parallel workers.",
+            "suggested_full_run": {
+                "trials": 60,
+                "workers": 4,
+                "trial_time_budget": 300,
+                "trial_timeout": 900,
+                "min_tps": min_tps_floor,
+                "patience_trials": 12,
+                "min_improvement": 0.0005,
+            },
+            "transfer_learning": {
+                "refresh_priors": f"python scripts/export_hpo_priors.py --storage {args.storage} --study-name {args.study_name} --top-k 20 --out docs/hpo_transfer_priors.json",
+                "notes": "Carry priors into new studies unless architecture/objective diverges significantly.",
+            },
+        }
+    else:
+        recommendation = {
+            "status": "insufficient_data",
+            "next_step": "Run phase1 with 2-4 parallel workers until >=10 completed trials.",
+            "early_stop_policy": {
+                "patience_trials": 8,
+                "min_improvement": 0.001,
+            },
+            "throughput_guard": {
+                "min_tps": min_tps_floor,
+                "note": "Trials below this TPS floor are pruned.",
+            },
+            "transfer_learning": {
+                "export_priors": f"python scripts/export_hpo_priors.py --storage {args.storage} --study-name {args.study_name} --top-k 10 --out docs/hpo_transfer_priors.json",
+                "use_priors": "Enabled by default in scripts/optuna_hpo.py (override with --no-apply-priors)",
+            },
+        }
+    return _publish({"stats": stats, "recommendation": recommendation})
+def build_parser() -> argparse.ArgumentParser:
+    """Create the CLI parser with the ``phase``, ``parallel`` and ``recommend`` subcommands.
+
+    All three subcommands accept the same study, runner and per-phase options.
+    Defaults for the HF namespace, image and output repo come from
+    ``resolve_routing``, called once with the ``HF_TOKEN`` environment variable.
+    Each subparser stores its handler in ``func``.
+    """
+    routing_defaults = resolve_routing(token=os.environ.get("HF_TOKEN"))
+    parser = argparse.ArgumentParser(description="Phase-oriented orchestration for Optuna HPO")
+    sub = parser.add_subparsers(dest="cmd", required=True)
+    # Per-phase options: (option suffix, type, phase-1 default, phase-2 default).
+    phase_knobs = (
+        ("trials", int, 30, 60),
+        ("trial-time-budget", int, 180, 300),
+        ("trial-timeout", int, 600, 900),
+        ("n-startup", int, 5, 8),
+        ("n-warmup", int, 0, 0),
+        ("patience", int, 8, 12),
+        ("min-improvement", float, 0.001, 0.0005),
+    )
+    def add_common(p: argparse.ArgumentParser) -> None:
+        opt = p.add_argument
+        # Study / objective
+        opt("--study-name", default="hydra_hpo")
+        opt("--storage", default="sqlite:///optuna_hpo.db")
+        opt("--metric", default="val_bpb")
+        opt("--direction", choices=["minimize", "maximize"], default="minimize")
+        opt("--seed", type=int, default=42)
+        opt("--min-tps", type=float, default=50000.0)
+        opt("--summary-out", type=Path, default=REPO_ROOT / ".tmp" / "optuna" / "orchestrator_summary.json")
+        # Runner / HF job routing
+        opt("--runner", choices=["local", "hf-job", "hf-launcher"], default="local")
+        opt("--hf-namespace", default=routing_defaults.job_namespace)
+        opt("--hf-image", default=f"hf.co/spaces/{routing_defaults.space_repo}")
+        opt("--hf-flavor", default="a10g-large")
+        opt("--hf-timeout", default="25m")
+        opt("--hf-command", default="/app/entrypoint.py")
+        opt("--hf-use-bash", action="store_true")
+        opt("--hf-token-env", default="HF_TOKEN")
+        opt("--hf-poll-interval", type=int, default=12)
+        opt("--hf-launcher-script", type=Path, default=REPO_ROOT / "scripts" / "launch_feather_hf_job.py")
+        opt("--hf-output-repo", default=routing_defaults.output_repo)
+        # Transfer priors and on/off switch pairs (both default to on)
+        opt("--priors-file", type=Path, default=REPO_ROOT / "docs" / "hpo_transfer_priors.json")
+        opt("--apply-priors", action="store_true", default=True)
+        opt("--no-apply-priors", action="store_false", dest="apply_priors")
+        opt("--hf-stop-after-metric", action="store_true", default=True)
+        opt("--no-hf-stop-after-metric", action="store_false", dest="hf_stop_after_metric")
+        # All phase-1 options first, then all phase-2 options.
+        for column, phase in enumerate(("phase1", "phase2"), start=2):
+            for knob in phase_knobs:
+                opt(f"--{phase}-{knob[0]}", type=knob[1], default=knob[column])
+    serial = sub.add_parser("phase", help="Run a single phase serially")
+    add_common(serial)
+    serial.add_argument("--phase", choices=["phase1", "phase2"], required=True)
+    serial.set_defaults(func=cmd_phase)
+    fanout = sub.add_parser("parallel", help="Run a phase with N parallel workers")
+    add_common(fanout)
+    fanout.add_argument("--phase", choices=["phase1", "phase2"], required=True)
+    fanout.add_argument("--workers", type=int, default=3)
+    fanout.set_defaults(func=cmd_parallel)
+    advisor = sub.add_parser("recommend", help="Recommend full-run settings from current study")
+    add_common(advisor)
+    advisor.set_defaults(func=cmd_recommend)
+    return parser
+def main() -> int:
+    """Parse the command line and dispatch to the selected subcommand handler."""
+    args = build_parser().parse_args()
+    # Each subparser stores its handler under "func" via set_defaults().
+    handler = args.func
+    return int(handler(args))
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())

overlay/scripts/launch_feather_hf_job.py CHANGED Viewed

@@ -2,37 +2,104 @@
 from __future__ import annotations
 import os
 import sys
 import time
 from pathlib import Path
 from huggingface_hub import HfApi
-from huggingface_hub._space_api import SpaceHardware
-from huggingface_hub.errors import HfHubHTTPError
-# ../../../../ from overlay/scripts/launch_feather_hf_job.py -> repository root
-REPO_ROOT = Path(__file__).resolve().parents[4]
 if str(REPO_ROOT) not in sys.path:
     sys.path.insert(0, str(REPO_ROOT))
 from scripts.hf_routing import resolve_routing
 DEFAULT_IMAGE = os.environ.get('FEATHER_HF_IMAGE', 'ghcr.io/slapglif/feather-hf-runtime:latest')
-IMAGE_DIR = REPO_ROOT / 'hf_jobs' / 'feather_h200_image'
 TIMEOUT = os.environ.get('FEATHER_HF_JOB_TIMEOUT', '12h')
-FLAVOR_RAW = os.environ.get('FEATHER_HF_FLAVOR', 'h200')
 TARGET_SHARDS = os.environ.get('HYDRA_TARGET_SHARDS', '2048')
 TIME_BUDGET = os.environ.get('HYDRA_TIME_BUDGET', '43200')
 DOWNLOAD_WORKERS = os.environ.get('HYDRA_DOWNLOAD_WORKERS', '16')
 CKPT_INTERVAL = os.environ.get('HYDRA_CKPT_INTERVAL', '1000')
 DRY_RUN = os.environ.get('FEATHER_HF_DRY_RUN', '0') == '1'
 USE_SPACE_IMAGE = os.environ.get('FEATHER_HF_USE_SPACE_IMAGE', '0') == '1'
 # When true, assume the Space image has already been built by a previous
 # invocation and skip the upload+build wait. Used by sweep drivers that fan
 # out many jobs against a single pre-uploaded image.
 SKIP_UPLOAD = os.environ.get('FEATHER_HF_SKIP_UPLOAD', '0') == '1'
-JOB_SUBMIT_RETRIES = max(1, int(os.environ.get('FEATHER_HF_JOB_SUBMIT_RETRIES', '5')))
-JOB_SUBMIT_RETRY_BASE_S = max(1.0, float(os.environ.get('FEATHER_HF_JOB_SUBMIT_RETRY_BASE_S', '3')))
 def require_token() -> str:
@@ -59,115 +126,52 @@ def wait_for_space(api: HfApi, repo_id: str, token: str, timeout_s: int = 1800)
     """
     start = time.time()
     seen_build_completion = False
     while True:
-        try:
-            runtime = api.get_space_runtime(repo_id, token=token)
-        except HfHubHTTPError as exc:
-            code = getattr(getattr(exc, 'response', None), 'status_code', None)
-            if isinstance(code, int) and code >= 500:
-                if time.time() - start > timeout_s:
-                    raise TimeoutError(
-                        f'Space {repo_id} runtime endpoint unstable for {timeout_s}s '
-                        f'(last HTTP {code})'
-                    ) from exc
-                print(f'[space] runtime endpoint HTTP {code}; retrying...', flush=True)
-                time.sleep(20)
-                continue
-            raise
         stage = getattr(runtime, 'stage', None)
-        hardware = getattr(runtime, 'hardware', None)
-        err = getattr(runtime, 'errorMessage', None) or getattr(runtime, 'error_message', None)
-        print(f'[space] stage={stage} hardware={hardware}', flush=True)
-        if stage in {'APP_STARTING', 'RUNNING', 'PAUSED', 'SLEEPING'}:
-            seen_build_completion = True
-        if stage in {'RUNNING', 'PAUSED', 'SLEEPING'}:
-            return
         # Image is built — Jobs can use it regardless of Space boot outcome.
-        if seen_build_completion and stage in {'RUNTIME_ERROR', 'APP_STARTING_ERROR'}:
-            msg = (
-                f'[space] Space boot failed with {stage} but built image is '
-                'available in the Space registry and is usable by HF Jobs.'
-            )
-            print(msg, flush=True)
-            return
         # Hard build failures — no image was produced.
         if stage in {'BUILD_ERROR', 'CONFIG_ERROR', 'NO_APP_FILE'}:
             raise RuntimeError(f'Space {repo_id} build failed: stage={stage} error={err!r}')
         if time.time() - start > timeout_s:
             raise TimeoutError(f'Space {repo_id} did not become ready in {timeout_s}s (last stage={stage})')
-        time.sleep(20)
-def submit_job_with_retry(
-    api: HfApi,
-    *,
-    image: str,
-    command: list[str],
-    env: dict[str, str],
-    secrets: dict[str, str],
-    flavor: SpaceHardware,
-    timeout: str,
-    token: str,
-    namespace: str,
-):
-    last_exc: Exception | None = None
-    for attempt in range(1, JOB_SUBMIT_RETRIES + 1):
-        try:
-            return api.run_job(
-                image=image,
-                command=command,
-                env=env,
-                secrets=secrets,
-                flavor=flavor,
-                timeout=timeout,
-                token=token,
-                namespace=namespace,
-            )
-        except HfHubHTTPError as exc:
-            last_exc = exc
-            code = getattr(getattr(exc, 'response', None), 'status_code', None)
-            if not (isinstance(code, int) and code >= 500):
-                raise
-            if attempt >= JOB_SUBMIT_RETRIES:
-                raise SystemExit(
-                    f'HF Jobs backend returned HTTP {code} after {JOB_SUBMIT_RETRIES} '
-                    'submit attempts; failing fast.'
-                ) from exc
-            wait_s = JOB_SUBMIT_RETRY_BASE_S * attempt
-            print(
-                f'[launch] HF Jobs backend returned HTTP {code}; retrying submit in '
-                f'{wait_s:.1f}s (attempt {attempt}/{JOB_SUBMIT_RETRIES})',
-                flush=True,
-            )
-            time.sleep(wait_s)
-    if last_exc is not None:
-        raise last_exc
-    raise RuntimeError('submit_job_with_retry exhausted without a result')
 def main() -> int:
     token = require_token()
     routing = resolve_routing(token=token)
     api = HfApi(token=token)
     print(f'[launch] image_dir={IMAGE_DIR}', flush=True)
     print(f'[launch] owner={routing.owner}', flush=True)
     print(f'[launch] space_repo={routing.space_repo}', flush=True)
     print(f'[launch] output_repo={routing.output_repo}', flush=True)
     print(f'[launch] retina_cache_repo={routing.retina_cache_repo}', flush=True)
     print(f'[launch] namespace={routing.job_namespace}', flush=True)
-    try:
-        flavor = SpaceHardware(FLAVOR_RAW)
-    except ValueError as exc:
-        valid = ", ".join([hw.value for hw in SpaceHardware])
-        raise SystemExit(f'Invalid FEATHER_HF_FLAVOR={FLAVOR_RAW!r}. Valid values: {valid}') from exc
-    print(f'[launch] target_shards={TARGET_SHARDS} time_budget={TIME_BUDGET} timeout={TIMEOUT} flavor={flavor.value}', flush=True)
-    print(f'[launch] image_mode={"space" if USE_SPACE_IMAGE else "ghcr"}', flush=True)
-    if not USE_SPACE_IMAGE:
-        print(f'[launch] image={DEFAULT_IMAGE}', flush=True)
     api.create_repo(repo_id=routing.space_repo, repo_type='space', space_sdk='docker', private=True, exist_ok=True, token=token)
     api.create_repo(repo_id=routing.output_repo, repo_type='model', private=True, exist_ok=True, token=token)
@@ -175,17 +179,19 @@ def main() -> int:
         print('[launch] dry-run mode; skipping upload and job submission', flush=True)
         return 0
-    image_ref = DEFAULT_IMAGE
-    if USE_SPACE_IMAGE:
-        if SKIP_UPLOAD:
-            print('[launch] FEATHER_HF_SKIP_UPLOAD=1; reusing existing Space image', flush=True)
-        else:
-            print('[launch] uploading custom Docker Space image context...', flush=True)
             api.upload_folder(
                 repo_id=routing.space_repo,
                 repo_type='space',
                 folder_path=str(IMAGE_DIR),
-                commit_message='Update Feather H200 training runtime image',
                 token=token,
             )
@@ -205,8 +211,38 @@ def main() -> int:
         'HYDRA_DOWNLOAD_WORKERS': DOWNLOAD_WORKERS,
         'HYDRA_CKPT_INTERVAL': CKPT_INTERVAL,
         'PYTHONUNBUFFERED': '1',
-        'FEATHER_RUNTIME_MODE': 'job',
-    }
     # Pass through any HYDRA_* / FEATHER_* overrides from the caller's env so
     # sweep drivers can set HYDRA_N_LAYER, HYDRA_SDR_TARGET_ACTIVE,
     # HYDRA_LAYER_DIAGNOSTICS, HYDRA_METRICS_OUT, HYDRA_MID_VAL_INTERVAL, etc.
@@ -216,17 +252,16 @@ def main() -> int:
             env[_k] = _v
     secrets = {'HF_TOKEN': token}
-    print('[launch] submitting HF Job...', flush=True)
-    job = submit_job_with_retry(
-        api,
         image=image_ref,
         command=['python', '/app/entrypoint.py'],
         env=env,
         secrets=secrets,
-        flavor=flavor,
         timeout=TIMEOUT,
-        token=token,
         namespace=routing.job_namespace,
     )
     print(f'[launch] submitted job_id={job.id} status={job.status.stage} url={job.url}', flush=True)
     return 0

 from __future__ import annotations
 import os
+import shutil
 import sys
 import time
+import json
+from typing import Any, cast
 from pathlib import Path
 from huggingface_hub import HfApi
+REPO_ROOT = Path(__file__).resolve().parents[1]
 if str(REPO_ROOT) not in sys.path:
     sys.path.insert(0, str(REPO_ROOT))
 from scripts.hf_routing import resolve_routing
+from configs.harness_config import HarnessConfig
 DEFAULT_IMAGE = os.environ.get('FEATHER_HF_IMAGE', 'ghcr.io/slapglif/feather-hf-runtime:latest')
+# Docker build context for the Space image; main() uploads this folder via
+# api.upload_folder() when FEATHER_HF_USE_SPACE_IMAGE=1 and upload is not skipped.
+IMAGE_DIR = Path(__file__).resolve().parents[1] / 'hf_jobs' / 'feather_h200_image'
+# Timeout string passed to api.run_job(timeout=...).
 TIMEOUT = os.environ.get('FEATHER_HF_JOB_TIMEOUT', '12h')
+# TARGET_SHARDS and TIME_BUDGET are kept as strings; main() passes both to
+# should_enable_fast_start_streaming() to decide on the fast-start profile.
 TARGET_SHARDS = os.environ.get('HYDRA_TARGET_SHARDS', '2048')
 TIME_BUDGET = os.environ.get('HYDRA_TIME_BUDGET', '43200')
+# Forwarded to the job environment as HYDRA_DOWNLOAD_WORKERS / HYDRA_CKPT_INTERVAL.
 DOWNLOAD_WORKERS = os.environ.get('HYDRA_DOWNLOAD_WORKERS', '16')
 CKPT_INTERVAL = os.environ.get('HYDRA_CKPT_INTERVAL', '1000')
+# Hardware flavor passed to api.run_job(flavor=...); values starting with
+# 'a10' additionally make main() apply its A10 environment profile.
+JOB_FLAVOR = os.environ.get('FEATHER_HF_FLAVOR', 'a10g-small')
+# When true, main() stops before uploading the image or submitting a job.
 DRY_RUN = os.environ.get('FEATHER_HF_DRY_RUN', '0') == '1'
+# When true, main() takes the Space-image path (upload unless skipped);
+# otherwise the job image reference is DEFAULT_IMAGE.
 USE_SPACE_IMAGE = os.environ.get('FEATHER_HF_USE_SPACE_IMAGE', '0') == '1'
 # When true, assume the Space image has already been built by a previous
 # invocation and skip the upload+build wait. Used by sweep drivers that fan
 # out many jobs against a single pre-uploaded image.
 SKIP_UPLOAD = os.environ.get('FEATHER_HF_SKIP_UPLOAD', '0') == '1'
+# When true (the default), main() calls sync_overlay_from_repo() to refresh
+# IMAGE_DIR/overlay before uploading the Space image context.
+SYNC_OVERLAY = os.environ.get('FEATHER_HF_SYNC_OVERLAY', '1') == '1'
+def should_enable_fast_start_streaming(target_shards: str, time_budget: str) -> bool:
+    """Decide whether a launch profile is small enough for the streaming data path.
+
+    Both arguments are integer strings. Returns True only when the shard
+    count lies in 1..256 and the time budget lies in 1..1800; strings that
+    do not parse as integers yield False.
+    """
+    try:
+        shard_count, budget = int(target_shards), int(time_budget)
+    except ValueError:
+        return False
+    return 0 < shard_count <= 256 and 0 < budget <= 1800
+def sync_overlay_from_repo() -> None:
+    """Rebuild ``IMAGE_DIR/overlay`` from a fixed list of repository paths.
+
+    The overlay directory is emptied first. Each entry of the include list
+    that exists under ``REPO_ROOT`` is then copied in, skipping cache, VCS and
+    build directories and ``*.pyc`` files. Finally, shell scripts under
+    ``overlay/scripts`` are normalised to LF line endings.
+    """
+    overlay = IMAGE_DIR / 'overlay'
+    overlay.mkdir(parents=True, exist_ok=True)
+    for child in overlay.iterdir():
+        # is_dir() follows symlinks, but shutil.rmtree() refuses to operate on
+        # a symlink; remove links (even to directories) with unlink() instead.
+        if child.is_dir() and not child.is_symlink():
+            shutil.rmtree(child)
+        else:
+            child.unlink()
+    include_paths = [
+        'hydra',
+        'subsystems',
+        'scripts',
+        'htm_rust',
+        'harness',
+        'configs',
+        'prepare.py',
+        'prepare_nemotron.py',
+        'train.py',
+        'pyproject.toml',
+        'uv.lock',
+    ]
+    ignore = shutil.ignore_patterns(
+        '__pycache__',
+        '.pytest_cache',
+        '.ruff_cache',
+        '.venv',
+        '.git',
+        'target',
+        '*.pyc',
+    )
+    copied: list[str] = []
+    for rel in include_paths:
+        src = REPO_ROOT / rel
+        dst = overlay / rel
+        # Entries missing from the repo are skipped rather than treated as errors.
+        if not src.exists():
+            continue
+        if src.is_dir():
+            shutil.copytree(src, dst, dirs_exist_ok=True, ignore=ignore)
+        else:
+            dst.parent.mkdir(parents=True, exist_ok=True)
+            shutil.copy2(src, dst)
+        copied.append(rel)
+    scripts_dir = overlay / 'scripts'
+    if scripts_dir.exists():
+        for sh_path in scripts_dir.rglob('*.sh'):
+            # rglob matches on name only; skip directories that end in ".sh".
+            if not sh_path.is_file():
+                continue
+            data = sh_path.read_bytes()
+            # Convert CRLF and bare CR line endings to LF.
+            normalized = data.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
+            # Leave already-clean scripts untouched instead of rewriting them.
+            if normalized != data:
+                sh_path.write_bytes(normalized)
+    print(f'[launch] overlay synced from repo ({len(copied)} paths): {copied}', flush=True)
 def require_token() -> str:
     """
     start = time.time()
     seen_build_completion = False
+    seen_building = False
     while True:
+        runtime = api.get_space_runtime(repo_id, token=token)
         stage = getattr(runtime, 'stage', None)
+        hardware = getattr(runtime, 'hardware', None)
+        err = getattr(runtime, 'errorMessage', None) or getattr(runtime, 'error_message', None)
+        print(f'[space] stage={stage} hardware={hardware}', flush=True)
+        if stage == 'BUILDING':
+            seen_building = True
+        if stage in {'APP_STARTING', 'RUNNING', 'PAUSED', 'SLEEPING'}:
+            seen_build_completion = True
+        if stage in {'RUNNING', 'PAUSED', 'SLEEPING'}:
+            return
         # Image is built — Jobs can use it regardless of Space boot outcome.
+        if (seen_build_completion or seen_building) and stage in {'RUNTIME_ERROR', 'APP_STARTING_ERROR'}:
+            print(f'[space] Space boot failed with {stage} but built image is '
+                  f'available in the Space registry and is usable by HF Jobs.',
+                  flush=True)
+            return
         # Hard build failures — no image was produced.
         if stage in {'BUILD_ERROR', 'CONFIG_ERROR', 'NO_APP_FILE'}:
             raise RuntimeError(f'Space {repo_id} build failed: stage={stage} error={err!r}')
         if time.time() - start > timeout_s:
             raise TimeoutError(f'Space {repo_id} did not become ready in {timeout_s}s (last stage={stage})')
+        time.sleep(20)
 def main() -> int:
     token = require_token()
     routing = resolve_routing(token=token)
     api = HfApi(token=token)
+    secondary_gates = HarnessConfig().to_secondary_gates()
     print(f'[launch] image_dir={IMAGE_DIR}', flush=True)
     print(f'[launch] owner={routing.owner}', flush=True)
     print(f'[launch] space_repo={routing.space_repo}', flush=True)
     print(f'[launch] output_repo={routing.output_repo}', flush=True)
     print(f'[launch] retina_cache_repo={routing.retina_cache_repo}', flush=True)
+    print(f'[launch] target_shards={TARGET_SHARDS} time_budget={TIME_BUDGET} timeout={TIMEOUT}', flush=True)
+    print(f'[launch] flavor={JOB_FLAVOR}', flush=True)
     print(f'[launch] namespace={routing.job_namespace}', flush=True)
+    print(f'[launch] image_mode={"space" if USE_SPACE_IMAGE else "ghcr"}', flush=True)
+    print(f'[launch] secondary_gates={json.dumps(secondary_gates, sort_keys=True)}', flush=True)
+    if not USE_SPACE_IMAGE:
+        print(f'[launch] image={DEFAULT_IMAGE}', flush=True)
     api.create_repo(repo_id=routing.space_repo, repo_type='space', space_sdk='docker', private=True, exist_ok=True, token=token)
     api.create_repo(repo_id=routing.output_repo, repo_type='model', private=True, exist_ok=True, token=token)
         print('[launch] dry-run mode; skipping upload and job submission', flush=True)
         return 0
+    image_ref = DEFAULT_IMAGE
+    if USE_SPACE_IMAGE:
+        if SKIP_UPLOAD:
+            print('[launch] FEATHER_HF_SKIP_UPLOAD=1; reusing existing Space image', flush=True)
+        else:
+            if SYNC_OVERLAY:
+                sync_overlay_from_repo()
+            print('[launch] uploading custom Docker Space image context...', flush=True)
             api.upload_folder(
                 repo_id=routing.space_repo,
                 repo_type='space',
                 folder_path=str(IMAGE_DIR),
+                commit_message='Update Feather training runtime image',
                 token=token,
             )
         'HYDRA_DOWNLOAD_WORKERS': DOWNLOAD_WORKERS,
         'HYDRA_CKPT_INTERVAL': CKPT_INTERVAL,
         'PYTHONUNBUFFERED': '1',
+        'FEATHER_RUNTIME_MODE': 'job',
+    }
+    if 'HYDRA_USE_NEMOTRON' not in os.environ and should_enable_fast_start_streaming(TARGET_SHARDS, TIME_BUDGET):
+        env['HYDRA_USE_NEMOTRON'] = '1'
+        print('[launch] auto-enabled HYDRA_USE_NEMOTRON=1 for short-budget fast-start profile', flush=True)
+    # A10 compatibility profile: avoid known PTX/compile runtime pitfalls and
+    # keep throughput path enabled. Caller can explicitly override each key by
+    # setting it in the parent environment.
+    if JOB_FLAVOR.startswith('a10'):
+        _a10_defaults = {
+            'HYDRA_MUON_COMPILE': '0',
+            'HYDRA_FORCE_HTM_CPU': '1',
+            'HYDRA_INERT_MAMBA': '1',
+            'HYDRA_ALLOW_SYNTHETIC_RETINA': '1',
+            'HYDRA_FASTPATH': '1',
+        }
+        for _k, _default in _a10_defaults.items():
+            if _k in os.environ:
+                env[_k] = os.environ[_k]
+            else:
+                env.setdefault(_k, _default)
+        if env.get('HYDRA_INERT_MAMBA') == '0' and 'HYDRA_FASTPATH' not in os.environ:
+            env['HYDRA_FASTPATH'] = '0'
+        print(
+            '[launch] applied A10 env profile '
+            f"(HYDRA_MUON_COMPILE={env['HYDRA_MUON_COMPILE']}, "
+            f"HYDRA_FORCE_HTM_CPU={env['HYDRA_FORCE_HTM_CPU']}, "
+            f"HYDRA_INERT_MAMBA={env['HYDRA_INERT_MAMBA']}, "
+            f"HYDRA_ALLOW_SYNTHETIC_RETINA={env['HYDRA_ALLOW_SYNTHETIC_RETINA']}, "
+            f"HYDRA_FASTPATH={env['HYDRA_FASTPATH']})",
+            flush=True,
+        )
     # Pass through any HYDRA_* / FEATHER_* overrides from the caller's env so
     # sweep drivers can set HYDRA_N_LAYER, HYDRA_SDR_TARGET_ACTIVE,
     # HYDRA_LAYER_DIAGNOSTICS, HYDRA_METRICS_OUT, HYDRA_MID_VAL_INTERVAL, etc.
             env[_k] = _v
     secrets = {'HF_TOKEN': token}
+    print(f'[launch] submitting HF Job on flavor={JOB_FLAVOR}...', flush=True)
+    job = api.run_job(
         image=image_ref,
         command=['python', '/app/entrypoint.py'],
         env=env,
         secrets=secrets,
+        flavor=cast(Any, JOB_FLAVOR),
         timeout=TIMEOUT,
         namespace=routing.job_namespace,
+        token=token,
     )
     print(f'[launch] submitted job_id={job.id} status={job.status.stage} url={job.url}', flush=True)
     return 0

overlay/scripts/long_train.sh CHANGED Viewed

@@ -1,38 +1,38 @@
-#!/usr/bin/env bash
-# Long-training run for full-architecture completion attempt.
-#
-# The 5-minute autoresearch budget is for mutation screening — it's nowhere
-# near enough compute for this small model (~6M params) to produce coherent
-# English. This script runs the SAME full-architecture train.py with an
-# extended budget so the "factual English" completion criterion can actually
-# be tested end-to-end.
-#
-# Usage:
-#   ./scripts/long_train.sh            # default 1-hour budget
-#   HYDRA_TIME_BUDGET=7200 ./scripts/long_train.sh   # 2 hours
-#   HYDRA_D_MODEL=384 HYDRA_N_LAYER=6 ./scripts/long_train.sh   # scale model
-#
-# Output: run_long_<timestamp>.log in repo root. Includes factual_english_score.
-set -euo pipefail
-cd "$(dirname "$0")/.."
-TIME_BUDGET="${HYDRA_TIME_BUDGET:-3600}"
-STAMP="$(date +%Y%m%d_%H%M%S)"
-LOG="run_long_${STAMP}.log"
-export HYDRA_TIME_BUDGET="${TIME_BUDGET}"
-echo "=== HYDRA long-training run ==="
-echo "time_budget:  ${TIME_BUDGET}s ($((TIME_BUDGET / 60))m)"
-echo "d_model:      ${HYDRA_D_MODEL:-256 (default)}"
-echo "n_layer:      ${HYDRA_N_LAYER:-4 (default)}"
-echo "d_state:      ${HYDRA_D_STATE:-64 (default)}"
-echo "log:          ${LOG}"
-echo
-.venv/bin/python train.py 2>&1 | tee "${LOG}"
-echo
-echo "=== Summary ==="
-grep -E "^val_bpb:|^factual_english_score:|^factual_english_hits:|^peak_vram_mb:|^num_steps:" "${LOG}"

+#!/usr/bin/env bash
+# Long-training run for full-architecture completion attempt.
+#
+# The 5-minute autoresearch budget is for mutation screening — it's nowhere
+# near enough compute for this small model (~6M params) to produce coherent
+# English. This script runs the SAME full-architecture train.py with an
+# extended budget so the "factual English" completion criterion can actually
+# be tested end-to-end.
+#
+# Usage:
+#   ./scripts/long_train.sh            # default 1-hour budget
+#   HYDRA_TIME_BUDGET=7200 ./scripts/long_train.sh   # 2 hours
+#   HYDRA_D_MODEL=384 HYDRA_N_LAYER=6 ./scripts/long_train.sh   # scale model
+#
+# Output: run_long_<timestamp>.log in repo root. Includes factual_english_score.
+set -euo pipefail
+# Run from the repo root (the parent of this script's directory).
+cd "$(dirname "$0")/.."
+TIME_BUDGET="${HYDRA_TIME_BUDGET:-3600}"
+STAMP="$(date +%Y%m%d_%H%M%S)"
+LOG="run_long_${STAMP}.log"
+export HYDRA_TIME_BUDGET="${TIME_BUDGET}"
+echo "=== HYDRA long-training run ==="
+echo "time_budget:  ${TIME_BUDGET}s ($((TIME_BUDGET / 60))m)"
+echo "d_model:      ${HYDRA_D_MODEL:-256 (default)}"
+echo "n_layer:      ${HYDRA_N_LAYER:-4 (default)}"
+echo "d_state:      ${HYDRA_D_STATE:-64 (default)}"
+echo "log:          ${LOG}"
+echo
+# With pipefail a failing train.py still aborts the script here, before the summary.
+.venv/bin/python train.py 2>&1 | tee "${LOG}"
+echo
+echo "=== Summary ==="
+# grep exits 1 when nothing matches; under `set -e` that would turn a finished
+# run into a script failure, so report the missing summary instead.
+grep -E "^val_bpb:|^factual_english_score:|^factual_english_hits:|^peak_vram_mb:|^num_steps:" "${LOG}" \
+  || echo "(no summary metrics found in ${LOG})"

overlay/scripts/optuna_hpo.py ADDED Viewed

	@@ -0,0 +1,725 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+import argparse
+import json
+import os
+import re
+import subprocess
+import sys
+import time
+import tempfile
+from pathlib import Path
+from typing import Any
+import optuna
+# Used by _sanitize_hf_env: accepts keys made of an uppercase letter followed
+# by uppercase letters, digits or underscores.
+_HF_ENV_KEY_RE = re.compile(r"^[A-Z][A-Z0-9_]*$")
+# Parent of the directory that contains this file.
+REPO_ROOT = Path(__file__).resolve().parents[1]
+# Make REPO_ROOT importable so the `scripts.hf_routing` import below resolves
+# when this file is executed directly.
+if str(REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(REPO_ROOT))
+from scripts.hf_routing import resolve_routing
+# Script launched (via sys.executable) for each trial of the local runner.
+TRAIN_ENTRYPOINT = REPO_ROOT / "train.py"
+# Parameter names accepted from transfer-prior files; they mirror the names
+# passed to trial.suggest_* in _trial_env.
+SEARCH_SPACE_KEYS = {
+    "d_model",
+    "n_layer",
+    "d_state",
+    "headdim",
+    "expand",
+    "seq_len",
+    "batch_size",
+    "grad_accum",
+    "matrix_lr",
+    "embed_lr",
+    "unembed_lr",
+    "engram_n_columns",
+    "sdr_target_active",
+    "hyena_layers",
+}
+def _filter_prior_params(raw: dict[str, Any]) -> dict[str, Any]:
+    """Return the entries of ``raw`` whose key is listed in SEARCH_SPACE_KEYS."""
+    kept: dict[str, Any] = {}
+    for name, value in raw.items():
+        if name in SEARCH_SPACE_KEYS:
+            kept[name] = value
+    return kept
+def _load_prior_param_sets(path: Path) -> list[dict[str, Any]]:
+    """Read prior trial parameter sets from a JSON file.
+
+    The file may hold either a list of rows or an object with a ``"trials"``
+    list. Each row is either a parameter dict itself or a dict carrying one
+    under ``"params"``. Rows are reduced to the known search-space keys and
+    rows that end up empty are dropped. A missing file yields an empty list.
+    """
+    if not path.exists():
+        return []
+    decoded = json.loads(path.read_text(encoding="utf-8"))
+    entries: Any = []
+    if isinstance(decoded, dict):
+        entries = decoded.get("trials", [])
+    elif isinstance(decoded, list):
+        entries = decoded
+    param_sets: list[dict[str, Any]] = []
+    for entry in entries:
+        candidate = entry.get("params", entry) if isinstance(entry, dict) else None
+        if isinstance(candidate, dict):
+            kept = _filter_prior_params(candidate)
+            if kept:
+                param_sets.append(kept)
+    return param_sets
+def _enqueue_transfer_priors(study: optuna.Study, priors_file: Path, apply_priors: bool) -> int:
+    """Enqueue de-duplicated prior parameter sets on ``study``.
+
+    Returns the number of trials the study actually gained, measured by
+    comparing the trial count before and after each enqueue. Returns 0
+    without reading the file when ``apply_priors`` is false.
+    """
+    loaded = _load_prior_param_sets(priors_file) if apply_priors else []
+    if not loaded:
+        return 0
+    # Key on the canonical JSON form so the first occurrence of each param set
+    # (across merged studies) wins and order is preserved.
+    unique_sets: dict[str, dict[str, Any]] = {}
+    for candidate in loaded:
+        unique_sets.setdefault(json.dumps(candidate, sort_keys=True), candidate)
+    added = 0
+    for candidate in unique_sets.values():
+        count_before = len(study.get_trials(deepcopy=False))
+        try:
+            study.enqueue_trial(candidate, user_attrs={"seed_source": "transfer_priors"}, skip_if_exists=True)
+        except TypeError:
+            # Presumably an Optuna version whose enqueue_trial has no
+            # skip_if_exists keyword; retry without it.
+            study.enqueue_trial(candidate, user_attrs={"seed_source": "transfer_priors"})
+        if len(study.get_trials(deepcopy=False)) > count_before:
+            added += 1
+    return added
+def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any] | None:
+    """Extract the last ``[METRICS_JSON]`` payload from captured stdout.
+
+    Splits ``stdout`` into lines and delegates to
+    ``_parse_metrics_from_log_lines`` so both entry points share one parser.
+    Returns the decoded payload, or ``None`` when no marker line is present
+    or the last marker line carries no valid JSON object. Empty or missing
+    stdout yields ``None``.
+    """
+    return _parse_metrics_from_log_lines((stdout or "").splitlines())
+def _parse_metrics_from_log_lines(lines: list[str]) -> dict[str, Any] | None:
+    """Decode the JSON object on the last line that contains ``[METRICS_JSON]``.
+
+    Only the final marker line is considered. Returns ``None`` when there is
+    no marker line, when that line has no ``{...}`` span after the marker, or
+    when the span is not valid JSON.
+    """
+    tagged = [entry for entry in lines if "[METRICS_JSON]" in entry]
+    if not tagged:
+        return None
+    found = re.search(r"\[METRICS_JSON\]\s*(\{.*\})", tagged[-1])
+    if found is None:
+        return None
+    try:
+        payload = json.loads(found.group(1))
+    except json.JSONDecodeError:
+        return None
+    return payload
+def _parse_last_train_bpb_from_logs(lines: list[str]) -> float | None:
+    """Return the value of the last ``bpb=<number>`` token in ``lines``.
+
+    Best-effort fallback for runs whose final eval crashed before the metrics
+    JSON was written. Returns ``None`` when no line carries such a token.
+    """
+    bpb_token = re.compile(r"\bbpb=([0-9]+(?:\.[0-9]+)?)")
+    latest: float | None = None
+    for entry in lines:
+        hit = bpb_token.search(entry)
+        if hit is not None:
+            latest = float(hit.group(1))
+    return latest
+def _trial_env(trial: optuna.Trial, args: argparse.Namespace, metrics_path: Path) -> dict[str, str]:
+    """Build the environment for one trial.
+
+    Starts from a copy of the current process environment, samples every
+    hyperparameter from ``trial`` and exposes it as a ``HYDRA_*`` variable,
+    then fills in runtime flags. Flags that the caller already exported keep
+    the caller's value.
+    """
+    env = os.environ.copy()
+    # Runtime and reporting.
+    env.update(
+        {
+            "HYDRA_METRICS_OUT": str(metrics_path),
+            "HYDRA_TIME_BUDGET": str(args.trial_time_budget),
+            "PYTHONUNBUFFERED": "1",
+        }
+    )
+    # Search space: everything is handed to the training stack through env vars.
+    d_model = trial.suggest_categorical("d_model", [64, 96, 128, 160, 192])
+    n_layer = trial.suggest_int("n_layer", 1, 4)
+    d_state = trial.suggest_categorical("d_state", [16, 32, 48])
+    headdim = trial.suggest_categorical("headdim", [8, 16, 32])
+    expand = trial.suggest_categorical("expand", [1, 2])
+    seq_len = trial.suggest_categorical("seq_len", [32, 64])
+    batch_size = trial.suggest_categorical("batch_size", [4, 8, 16])
+    grad_accum = trial.suggest_categorical("grad_accum", [8, 16, 32, 64])
+    env["HYDRA_D_MODEL"] = str(d_model)
+    env["HYDRA_N_LAYER"] = str(n_layer)
+    env["HYDRA_D_STATE"] = str(d_state)
+    env["HYDRA_HEADDIM"] = str(headdim)
+    env["HYDRA_EXPAND"] = str(expand)
+    env["HYDRA_SEQ_LEN"] = str(seq_len)
+    env["HYDRA_BATCH_SIZE"] = str(batch_size)
+    # The total batch is a multiple of batch_size * seq_len by construction.
+    env["HYDRA_TOTAL_BATCH"] = str(batch_size * seq_len * grad_accum)
+    matrix_lr = trial.suggest_float("matrix_lr", 0.005, 0.2, log=True)
+    embed_lr = trial.suggest_float("embed_lr", 0.05, 1.0, log=True)
+    unembed_lr = trial.suggest_float("unembed_lr", 0.0005, 0.02, log=True)
+    env["HYDRA_MATRIX_LR"] = str(matrix_lr)
+    env["HYDRA_EMBED_LR"] = str(embed_lr)
+    env["HYDRA_UNEMBED_LR"] = str(unembed_lr)
+    engram_columns = trial.suggest_categorical("engram_n_columns", [256, 512, 1024])
+    sdr_active = trial.suggest_categorical("sdr_target_active", [128, 256, 327, 512])
+    env["HYDRA_ENGRAM_N_COLUMNS"] = str(engram_columns)
+    env["HYDRA_SDR_TARGET_ACTIVE"] = str(sdr_active)
+    env["HYDRA_HYENA_LAYERS"] = trial.suggest_categorical("hyena_layers", ["", "0", "1", "0,1"])
+    # Keep trials alive long enough to emit metrics.
+    env["HYDRA_FAIL_LOSS_THRESHOLD"] = "1000000"
+    # Caller-overridable flags; the second group is the strict optimal-path
+    # profile (no forced fallbacks).
+    overridable_defaults = {
+        "HYDRA_USE_NEMOTRON": "1",
+        "HYDRA_LOCAL_SHARDS_ONLY": "0",
+        "HYDRA_MUON_COMPILE": "1",
+        "HYDRA_FORCE_HTM_CPU": "0",
+        "HYDRA_ALLOW_SYNTHETIC_RETINA": "0",
+        "HYDRA_INERT_MAMBA": "0",
+        "HYDRA_FASTPATH": "0",
+    }
+    for flag, fallback in overridable_defaults.items():
+        env[flag] = os.environ.get(flag, fallback)
+    return env
+def _sanitize_hf_env(env: dict[str, str]) -> dict[str, str]:
+    """Drop entries whose key does not match ``_HF_ENV_KEY_RE``.
+
+    The HF Jobs API accepts only strictly alphanumeric/underscore env keys;
+    the pattern used here additionally requires an uppercase first letter and
+    uppercase letters throughout. Kept values are coerced to ``str``.
+    """
+    return {name: str(value) for name, value in env.items() if _HF_ENV_KEY_RE.match(name)}
+def _hf_command_candidates(args: argparse.Namespace) -> list[list[str]]:
+    """Return the argv lists to try, in order, when launching an HF job.
+
+    * ``--hf-use-bash``: a single ``bash -lc <command>`` candidate.
+    * Auto fallback enabled and the command is exactly ``/app/entrypoint.py``:
+      several interpreter wrappers, ending with direct execution.
+    * Otherwise: the command tokenised with shell quoting rules, so quoted
+      arguments such as ``python -c "print(1)"`` stay intact.
+    """
+    import shlex
+    if args.hf_use_bash:
+        return [["bash", "-lc", args.hf_command]]
+    raw = args.hf_command.strip()
+    if args.hf_auto_command_fallback and raw == "/app/entrypoint.py":
+        return [
+            ["/usr/bin/python3", "/app/entrypoint.py"],
+            ["/usr/local/bin/python3", "/app/entrypoint.py"],
+            ["python3", "/app/entrypoint.py"],
+            ["python", "/app/entrypoint.py"],
+            ["/app/entrypoint.py"],
+        ]
+    try:
+        return [shlex.split(raw)]
+    except ValueError:
+        # Unbalanced quotes: fall back to plain whitespace splitting rather
+        # than failing the trial on a tokenisation error.
+        return [raw.split()]
+def _objective_local(args: argparse.Namespace):
+    """Build an Optuna objective that runs ``train.py`` as a local subprocess.
+
+    The returned callable launches one training run per trial, reads metrics
+    from the ``HYDRA_METRICS_OUT`` file (falling back to the last
+    ``[METRICS_JSON]`` line on stdout) and returns ``metrics[args.metric]``.
+    The trial is pruned, not failed, when the run times out, exits non-zero,
+    produces no metrics, lacks the requested metric, or reports a ``tps``
+    below ``args.min_tps``.
+    """
+    def objective(trial: optuna.Trial) -> float:
+        trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
+        metrics_path = trial_dir / "metrics.json"
+        env = _trial_env(trial, args, metrics_path)
+        try:
+            proc = subprocess.run(
+                [sys.executable, str(TRAIN_ENTRYPOINT)],
+                cwd=str(REPO_ROOT),
+                env=env,
+                text=True,
+                capture_output=True,
+                timeout=args.trial_timeout,
+            )
+        except subprocess.TimeoutExpired as exc:
+            # Without this the exception escapes study.optimize and aborts the
+            # whole study instead of just this trial.
+            raise optuna.TrialPruned(f"Training timed out after {args.trial_timeout}s") from exc
+        metrics: dict[str, Any] | None = None
+        if metrics_path.exists():
+            try:
+                loaded = json.loads(metrics_path.read_text(encoding="utf-8"))
+            except json.JSONDecodeError:
+                loaded = None
+            # Anything other than a JSON object cannot be queried by key below.
+            metrics = loaded if isinstance(loaded, dict) else None
+        if metrics is None:
+            metrics = _parse_metrics_from_stdout(proc.stdout)
+        if metrics is None:
+            raise optuna.TrialPruned("No metrics found (HYDRA_METRICS_OUT/[METRICS_JSON])")
+        if proc.returncode != 0:
+            raise optuna.TrialPruned(f"Training failed rc={proc.returncode}")
+        metric_key = args.metric
+        if metric_key not in metrics or metrics[metric_key] is None:
+            raise optuna.TrialPruned(f"Metric '{metric_key}' missing in metrics payload")
+        tps_val = metrics.get("tps")
+        if tps_val is not None:
+            tps_f = float(tps_val)
+            trial.set_user_attr("tps", tps_f)
+            if args.min_tps is not None and tps_f < args.min_tps:
+                raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {args.min_tps}")
+        value = float(metrics[metric_key])
+        # Keep useful context on the trial.
+        trial.set_user_attr("summary_path", metrics.get("summary_path"))
+        trial.set_user_attr("run_log_path", metrics.get("run_log_path"))
+        return value
+    return objective
+def _objective_hf_job(args: argparse.Namespace):
+    """Build an Optuna objective that runs each trial as a Hugging Face Job.
+
+    Per trial the returned callable:
+      1. submits the job, trying each command from ``_hf_command_candidates``
+         until one gets past a short bootstrap check;
+      2. polls job status and logs until a ``[METRICS_JSON]`` line appears,
+         the job reaches a terminal stage, or ``args.trial_timeout`` elapses;
+      3. cancels a still-running job when ``args.hf_stop_after_metric`` is set;
+      4. returns ``metrics[args.metric]``, optionally falling back to the last
+         logged ``bpb=`` value when ``args.allow_log_metric_fallback`` is set.
+
+    Raises:
+        RuntimeError: at construction time, when no Hugging Face token is found.
+        optuna.TrialPruned: from the objective, when no candidate command
+            launches, no metric is found, or tps is below ``args.min_tps``.
+    """
+    from huggingface_hub import HfApi
+    from huggingface_hub.utils import get_token
+    token = os.environ.get(args.hf_token_env) or get_token()
+    if not token:
+        raise RuntimeError(
+            f"No Hugging Face token found. Set {args.hf_token_env} or run huggingface-cli login."
+        )
+    api = HfApi(token=token)
+    # DELETED is included so a removed job is not polled until the timeout.
+    failed_states = {"ERROR", "FAILED", "CANCELLED", "CANCELED", "TIMEOUT", "DELETED"}
+    terminal_states = failed_states | {"COMPLETED"}
+    forwarded_prefixes = ("HYDRA_", "FEATHER_")
+    def _stage_name(raw_stage: Any) -> str:
+        # Works whether the client hands back a plain string or an enum member.
+        return str(getattr(raw_stage, "value", raw_stage))
+    def objective(trial: optuna.Trial) -> float:
+        trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
+        metrics_path = trial_dir / "metrics.json"
+        trial_env = _sanitize_hf_env(_trial_env(trial, args, metrics_path))
+        # _trial_env starts from a copy of the local environment. Forward only
+        # the training knobs: host-specific variables (PATH, HOME, ...) would
+        # override the image's own, and local credentials must not be sent as
+        # plain env. The token itself travels via `secrets` below.
+        env = {
+            key: value
+            for key, value in trial_env.items()
+            if key != args.hf_token_env
+            and (key.startswith(forwarded_prefixes) or key == "PYTHONUNBUFFERED")
+        }
+        selected_job = None
+        launch_errors: list[str] = []
+        for command in _hf_command_candidates(args):
+            try:
+                job = api.run_job(
+                    image=args.hf_image,
+                    command=command,
+                    env=env,
+                    secrets={args.hf_token_env: token},
+                    flavor=args.hf_flavor,
+                    timeout=args.hf_timeout,
+                    labels={"project": "feather", "goal": "optuna-hpo", "trial": str(trial.number)},
+                    token=token,
+                    namespace=args.hf_namespace,
+                )
+            except Exception as e:
+                launch_errors.append(f"launch:{command}: {type(e).__name__}: {e}")
+                continue
+            # Bootstrap check: reject known command/exec failures quickly.
+            bootstrap_deadline = time.time() + args.hf_bootstrap_seconds
+            bootstrap_stage = "UNKNOWN"
+            bootstrap_logs: list[str] = []
+            bootstrap_msg = ""
+            while time.time() < bootstrap_deadline:
+                info = api.inspect_job(job_id=job.id, token=token, namespace=args.hf_namespace)
+                bootstrap_stage = _stage_name(info.status.stage)
+                bootstrap_msg = str(getattr(info.status, "message", "") or "")
+                bootstrap_logs = list(
+                    api.fetch_job_logs(
+                        job_id=job.id,
+                        follow=False,
+                        token=token,
+                        namespace=args.hf_namespace,
+                    )
+                )
+                if bootstrap_stage in {"RUNNING", "COMPLETED"} or bootstrap_logs:
+                    break
+                if bootstrap_stage in failed_states:
+                    break
+                time.sleep(2)
+            detail = bootstrap_msg.lower()
+            # Only an exec-style failure with no output means "wrong command";
+            # any other outcome is handled by the main polling loop below.
+            unusable = bootstrap_stage in {"ERROR", "FAILED"} and len(bootstrap_logs) == 0 and any(
+                k in detail for k in ("executable file not found", "permission denied", "exec:")
+            )
+            if unusable:
+                launch_errors.append(f"bootstrap:{command}: {bootstrap_msg}")
+                continue
+            selected_job = job
+            break
+        if selected_job is None:
+            raise optuna.TrialPruned(f"HF job launch failed across command candidates: {launch_errors[:3]}")
+        job = selected_job
+        job_id = job.id
+        trial.set_user_attr("hf_job_id", job_id)
+        start = time.time()
+        metrics: dict[str, Any] | None = None
+        tps_seen: float | None = None
+        stage: str = "UNKNOWN"
+        log_lines: list[str] = []
+        terminal_detail: str | None = None
+        while True:
+            info = api.inspect_job(job_id=job_id, token=token, namespace=args.hf_namespace)
+            stage = _stage_name(info.status.stage)
+            # `or ""` keeps a None message from being stored as the string "None".
+            terminal_detail = str(getattr(info.status, "message", "") or "") or terminal_detail
+            log_lines = list(api.fetch_job_logs(job_id=job_id, follow=False, token=token, namespace=args.hf_namespace))
+            m = _parse_metrics_from_log_lines(log_lines)
+            if m is not None:
+                metrics = m
+                break
+            # Capture latest tps even before final metrics json
+            for line in log_lines:
+                mt = re.search(r"\btps=([0-9]+(?:\.[0-9]+)?)", line)
+                if mt:
+                    tps_seen = float(mt.group(1))
+            if stage in terminal_states:
+                break
+            if time.time() - start > args.trial_timeout:
+                break
+            time.sleep(args.hf_poll_interval)
+        # Best-effort stop to control cost
+        try:
+            info = api.inspect_job(job_id=job_id, token=token, namespace=args.hf_namespace)
+            if _stage_name(info.status.stage) not in terminal_states and args.hf_stop_after_metric:
+                api.cancel_job(job_id=job_id, token=token, namespace=args.hf_namespace)
+        except Exception as e:
+            # Still best-effort, but leave a trace: a job that could not be
+            # cancelled keeps billing.
+            print(f"[hpo] warning: could not stop HF job {job_id}: {type(e).__name__}: {e}", file=sys.stderr)
+        # Save logs for debugging
+        (trial_dir / "hf_job.log").write_text("\n".join(log_lines), encoding="utf-8")
+        trial.set_user_attr("hf_stage", stage)
+        trial.set_user_attr("hf_log_lines", len(log_lines))
+        if terminal_detail:
+            trial.set_user_attr("hf_status_message", terminal_detail)
+        if metrics is None:
+            if args.allow_log_metric_fallback and args.metric == "val_bpb":
+                fallback_bpb = _parse_last_train_bpb_from_logs(log_lines)
+                if fallback_bpb is not None:
+                    trial.set_user_attr("metric_source", "log_bpb_fallback")
+                    if tps_seen is not None:
+                        trial.set_user_attr("tps", tps_seen)
+                        if args.min_tps is not None and tps_seen < args.min_tps:
+                            raise optuna.TrialPruned(f"TPS below floor: {tps_seen} < {args.min_tps}")
+                    return float(fallback_bpb)
+            if tps_seen is not None:
+                trial.set_user_attr("tps", tps_seen)
+            detail = f"stage={stage}, logs={len(log_lines)}"
+            if terminal_detail:
+                detail = f"{detail}, message={terminal_detail}"
+            raise optuna.TrialPruned(f"No metrics found from HF job ({detail})")
+        metric_key = args.metric
+        if metric_key not in metrics or metrics[metric_key] is None:
+            raise optuna.TrialPruned(f"Metric '{metric_key}' missing in metrics payload")
+        tps_val = metrics.get("tps")
+        if tps_val is not None:
+            tps_f = float(tps_val)
+            trial.set_user_attr("tps", tps_f)
+            if args.min_tps is not None and tps_f < args.min_tps:
+                raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {args.min_tps}")
+        value = float(metrics[metric_key])
+        trial.set_user_attr("summary_path", metrics.get("summary_path"))
+        trial.set_user_attr("run_log_path", metrics.get("run_log_path"))
+        return value
+    return objective
+def _objective_hf_launcher(args: argparse.Namespace):
+    """Build an Optuna objective that submits each trial through the launcher script.
+
+    Per trial the returned callable runs ``args.hf_launcher_script`` locally
+    with the trial's environment plus ``FEATHER_HF_*`` routing variables,
+    extracts ``job_id=...`` from the launcher's stdout, then polls that job
+    exactly like the hf-job runner: until a ``[METRICS_JSON]`` line appears,
+    the job reaches a terminal stage, or ``args.trial_timeout`` elapses.
+
+    Raises:
+        RuntimeError: at construction time, when no Hugging Face token is found.
+        optuna.TrialPruned: from the objective, when the launcher fails or
+            times out, no metric is found, or tps is below ``args.min_tps``.
+    """
+    from huggingface_hub import HfApi
+    from huggingface_hub.utils import get_token
+    token = os.environ.get(args.hf_token_env) or get_token()
+    if not token:
+        raise RuntimeError(
+            f"No Hugging Face token found. Set {args.hf_token_env} or run huggingface-cli login."
+        )
+    api = HfApi(token=token)
+    # DELETED is included so a removed job is not polled until the timeout.
+    terminal_states = {"ERROR", "COMPLETED", "CANCELLED", "TIMEOUT", "FAILED", "CANCELED", "DELETED"}
+    def _stage_name(raw_stage: Any) -> str:
+        # Works whether the client hands back a plain string or an enum member.
+        return str(getattr(raw_stage, "value", raw_stage))
+    def objective(trial: optuna.Trial) -> float:
+        trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
+        metrics_path = trial_dir / "metrics.json"
+        env = _trial_env(trial, args, metrics_path)
+        env = _sanitize_hf_env(env)
+        # The launcher runs locally, so it gets the full local environment
+        # with the trial settings layered on top.
+        local_env = os.environ.copy()
+        local_env.update(env)
+        local_env[args.hf_token_env] = token
+        local_env["FEATHER_HF_NAMESPACE"] = args.hf_namespace
+        local_env["FEATHER_HF_FLAVOR"] = args.hf_flavor
+        local_env["FEATHER_HF_JOB_TIMEOUT"] = args.hf_timeout
+        local_env["FEATHER_HF_IMAGE"] = args.hf_image
+        # NOTE(review): the Space name is hard-coded here while --hf-image is
+        # derived from routing defaults; confirm the two are meant to differ.
+        local_env["FEATHER_HF_SPACE_REPO"] = f"{args.hf_namespace}/feather-h200-runtime"
+        if args.hf_output_repo:
+            local_env["FEATHER_HF_OUTPUT_REPO"] = args.hf_output_repo
+        else:
+            local_env["FEATHER_HF_OUTPUT_REPO"] = f"{args.hf_namespace}/feather-pretrain-checkpoints"
+        try:
+            proc = subprocess.run(
+                [sys.executable, str(args.hf_launcher_script)],
+                cwd=str(REPO_ROOT),
+                env=local_env,
+                text=True,
+                capture_output=True,
+                timeout=max(args.trial_timeout, 120),
+            )
+        except subprocess.TimeoutExpired as exc:
+            # Without this the exception escapes study.optimize and aborts the
+            # whole study instead of just this trial.
+            raise optuna.TrialPruned(f"HF launcher timed out after {exc.timeout}s") from exc
+        launch_stdout = proc.stdout or ""
+        launch_stderr = proc.stderr or ""
+        m = re.search(r"job_id=([a-zA-Z0-9_-]+)", launch_stdout)
+        if proc.returncode != 0 or not m:
+            raise optuna.TrialPruned(
+                f"HF launcher failed rc={proc.returncode}; stderr={launch_stderr[-400:]} stdout_tail={launch_stdout[-400:]}"
+            )
+        job_id = m.group(1)
+        trial.set_user_attr("hf_job_id", job_id)
+        start = time.time()
+        metrics: dict[str, Any] | None = None
+        tps_seen: float | None = None
+        stage: str = "UNKNOWN"
+        log_lines: list[str] = []
+        terminal_detail: str | None = None
+        while True:
+            info = api.inspect_job(job_id=job_id, token=token, namespace=args.hf_namespace)
+            stage = _stage_name(info.status.stage)
+            terminal_detail = str(getattr(info.status, "message", "") or "") or terminal_detail
+            log_lines = list(api.fetch_job_logs(job_id=job_id, follow=False, token=token, namespace=args.hf_namespace))
+            mtr = _parse_metrics_from_log_lines(log_lines)
+            if mtr is not None:
+                metrics = mtr
+                break
+            # Capture the latest tps even before the final metrics JSON.
+            for line in log_lines:
+                mt = re.search(r"\btps=([0-9]+(?:\.[0-9]+)?)", line)
+                if mt:
+                    tps_seen = float(mt.group(1))
+            if stage in terminal_states:
+                break
+            if time.time() - start > args.trial_timeout:
+                break
+            time.sleep(args.hf_poll_interval)
+        # Best-effort stop to control cost.
+        try:
+            info = api.inspect_job(job_id=job_id, token=token, namespace=args.hf_namespace)
+            if _stage_name(info.status.stage) not in terminal_states and args.hf_stop_after_metric:
+                api.cancel_job(job_id=job_id, token=token, namespace=args.hf_namespace)
+        except Exception as e:
+            # Still best-effort, but leave a trace: a job that could not be
+            # cancelled keeps billing.
+            print(f"[hpo] warning: could not stop HF job {job_id}: {type(e).__name__}: {e}", file=sys.stderr)
+        # Save logs for debugging.
+        (trial_dir / "hf_job.log").write_text("\n".join(log_lines), encoding="utf-8")
+        trial.set_user_attr("hf_stage", stage)
+        trial.set_user_attr("hf_log_lines", len(log_lines))
+        if terminal_detail:
+            trial.set_user_attr("hf_status_message", terminal_detail)
+        if metrics is None:
+            if args.allow_log_metric_fallback and args.metric == "val_bpb":
+                fallback_bpb = _parse_last_train_bpb_from_logs(log_lines)
+                if fallback_bpb is not None:
+                    trial.set_user_attr("metric_source", "log_bpb_fallback")
+                    if tps_seen is not None:
+                        trial.set_user_attr("tps", tps_seen)
+                        if args.min_tps is not None and tps_seen < args.min_tps:
+                            raise optuna.TrialPruned(f"TPS below floor: {tps_seen} < {args.min_tps}")
+                    return float(fallback_bpb)
+            if tps_seen is not None:
+                trial.set_user_attr("tps", tps_seen)
+            detail = f"stage={stage}, logs={len(log_lines)}"
+            if terminal_detail:
+                detail = f"{detail}, message={terminal_detail}"
+            raise optuna.TrialPruned(f"No metrics found from HF launcher job ({detail})")
+        metric_key = args.metric
+        if metric_key not in metrics or metrics[metric_key] is None:
+            raise optuna.TrialPruned(f"Metric '{metric_key}' missing in metrics payload")
+        tps_val = metrics.get("tps")
+        if tps_val is not None:
+            tps_f = float(tps_val)
+            trial.set_user_attr("tps", tps_f)
+            if args.min_tps is not None and tps_f < args.min_tps:
+                raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {args.min_tps}")
+        value = float(metrics[metric_key])
+        trial.set_user_attr("summary_path", metrics.get("summary_path"))
+        trial.set_user_attr("run_log_path", metrics.get("run_log_path"))
+        return value
+    return objective
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    """Parse the command line of the Optuna HPO runner.
+
+    Args:
+        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.
+
+    Returns:
+        The populated namespace. Defaults for ``--hf-namespace``,
+        ``--hf-image`` and ``--hf-output-repo`` come from ``resolve_routing``,
+        which is called on every invocation regardless of the chosen runner.
+    """
+    routing_defaults = resolve_routing(token=os.environ.get("HF_TOKEN"))
+    parser = argparse.ArgumentParser(description="Optuna HPO runner for HYDRA train.py")
+    # Study identity and objective.
+    parser.add_argument("--study-name", default="hydra_hpo", help="Optuna study name")
+    parser.add_argument("--storage", default="sqlite:///optuna_hpo.db", help="Optuna storage URL")
+    parser.add_argument("--direction", choices=["minimize", "maximize"], default="minimize")
+    parser.add_argument("--metric", default="val_bpb", help="Metric key to optimize from HYDRA metrics")
+    parser.add_argument(
+        "--min-tps",
+        type=float,
+        default=50000.0,
+        help="TPS floor; prune trials under this value (set 0 to disable)",
+    )
+    # Budgets.
+    parser.add_argument("--trials", type=int, default=20, help="Number of Optuna trials")
+    parser.add_argument("--study-timeout", type=int, default=None, help="Study timeout in seconds")
+    parser.add_argument("--trial-time-budget", type=int, default=300, help="HYDRA_TIME_BUDGET passed to each trial")
+    parser.add_argument("--trial-timeout", type=int, default=900, help="Subprocess timeout per trial in seconds")
+    # Execution backend and Hugging Face job settings.
+    parser.add_argument("--runner", choices=["local", "hf-job", "hf-launcher"], default="local", help="Trial execution backend")
+    parser.add_argument("--hf-namespace", default=routing_defaults.job_namespace, help="HF namespace for jobs")
+    parser.add_argument("--hf-image", default=f"hf.co/spaces/{routing_defaults.space_repo}", help="HF jobs image")
+    parser.add_argument("--hf-flavor", default="a10g-large", help="HF jobs hardware flavor")
+    parser.add_argument("--hf-timeout", default="25m", help="HF job timeout string")
+    parser.add_argument("--hf-command", default="/app/entrypoint.py", help="Command executed inside HF job")
+    parser.add_argument("--hf-use-bash", action="store_true", help="Run HF command via bash -lc")
+    # Boolean options that default to on come as a --flag / --no-flag pair
+    # writing to the same destination.
+    # NOTE(review): the help text below mentions a "uv" fallback, but
+    # _hf_command_candidates only tries python/python3 wrappers — confirm.
+    parser.add_argument("--hf-auto-command-fallback", action="store_true", default=True, help="Auto-wrap entrypoint command with python/python3/uv fallback")
+    parser.add_argument("--no-hf-auto-command-fallback", action="store_false", dest="hf_auto_command_fallback")
+    parser.add_argument("--hf-poll-interval", type=int, default=12, help="HF job poll interval seconds")
+    parser.add_argument("--hf-bootstrap-seconds", type=int, default=18, help="Initial seconds to validate command bootstrap")
+    parser.add_argument("--hf-token-env", default="HF_TOKEN", help="Token env key passed as HF job secret")
+    parser.add_argument("--hf-stop-after-metric", action="store_true", default=True, help="Cancel running job after metrics captured")
+    parser.add_argument("--no-hf-stop-after-metric", action="store_false", dest="hf_stop_after_metric")
+    parser.add_argument("--hf-launcher-script", type=Path, default=REPO_ROOT / "scripts" / "launch_feather_hf_job.py", help="Local launcher script for hf-launcher runner")
+    parser.add_argument("--hf-output-repo", default=routing_defaults.output_repo, help="Optional FEATHER_HF_OUTPUT_REPO override for launcher runner")
+    parser.add_argument("--allow-log-metric-fallback", action="store_true", default=False, help="When metrics JSON is missing, allow val_bpb fallback from latest logged train bpb")
+    parser.add_argument("--no-allow-log-metric-fallback", action="store_false", dest="allow_log_metric_fallback")
+    # Transfer-learning priors.
+    parser.add_argument("--priors-file", type=Path, default=REPO_ROOT / "docs" / "hpo_transfer_priors.json", help="Path to transfer-learning prior trials JSON")
+    parser.add_argument("--apply-priors", action="store_true", default=True, help="Enqueue transfer-learning prior trials before optimize")
+    parser.add_argument("--no-apply-priors", action="store_false", dest="apply_priors")
+    # Sampler, pruner and early stopping.
+    parser.add_argument("--seed", type=int, default=42, help="Seed for sampler")
+    parser.add_argument("--n-startup-trials", type=int, default=5, help="Pruner startup trials before pruning")
+    parser.add_argument("--n-warmup-steps", type=int, default=0, help="Pruner warmup steps")
+    parser.add_argument("--patience-trials", type=int, default=None, help="Stop study after this many completed trials without meaningful improvement")
+    parser.add_argument("--min-improvement", type=float, default=0.0, help="Minimum best-value improvement to reset patience")
+    # Output locations.
+    parser.add_argument("--work-dir", type=Path, default=REPO_ROOT / ".tmp" / "optuna", help="Directory for trial artifacts")
+    parser.add_argument("--summary-out", type=Path, default=REPO_ROOT / ".tmp" / "optuna" / "best_summary.json")
+    return parser.parse_args(argv)
+def main() -> int:
+    """Run the HPO study and write a JSON summary of the best trial.
+
+    Creates (or resumes) the study, enqueues transfer priors, optimises with
+    the objective for the selected runner, then writes the summary to
+    ``args.summary_out`` and prints it.
+
+    Returns:
+        0 on completion, including when no trial completed.
+    """
+    args = parse_args()
+    args.work_dir.mkdir(parents=True, exist_ok=True)
+    args.summary_out.parent.mkdir(parents=True, exist_ok=True)
+    sampler = optuna.samplers.TPESampler(seed=args.seed, multivariate=True)
+    pruner = optuna.pruners.MedianPruner(
+        n_startup_trials=args.n_startup_trials,
+        n_warmup_steps=args.n_warmup_steps,
+    )
+    study = optuna.create_study(
+        study_name=args.study_name,
+        storage=args.storage,
+        load_if_exists=True,
+        direction=args.direction,
+        sampler=sampler,
+        pruner=pruner,
+    )
+    enqueued_priors = _enqueue_transfer_priors(study, args.priors_file, args.apply_priors)
+    if enqueued_priors:
+        print(f"[hpo] enqueued {enqueued_priors} transfer priors from {args.priors_file}")
+    complete_state = optuna.trial.TrialState.COMPLETE
+    # Early-stopping bookkeeping for this invocation only.
+    state: dict[str, Any] = {
+        "best": None,
+        "best_trial_number": None,
+        "last_improve_trial_number": None,
+        "completed_since_improve": 0,
+    }
+    def _improved(new_value: float, best_value: float) -> bool:
+        if args.direction == "minimize":
+            return new_value < (best_value - args.min_improvement)
+        return new_value > (best_value + args.min_improvement)
+    def _early_stop_callback(study_obj: optuna.Study, trial: optuna.trial.FrozenTrial) -> None:
+        # Pruned trials neither improve the best value nor use up patience.
+        if trial.state != complete_state or trial.value is None:
+            return
+        if state["best"] is None or _improved(float(trial.value), float(state["best"])):
+            state["best"] = float(trial.value)
+            state["best_trial_number"] = trial.number
+            state["last_improve_trial_number"] = trial.number
+            state["completed_since_improve"] = 0
+            return
+        if args.patience_trials is None:
+            return
+        # Count completed trials, as --patience-trials documents, rather than
+        # trial-number distance (which also includes pruned trials).
+        state["completed_since_improve"] += 1
+        if state["completed_since_improve"] >= args.patience_trials:
+            study_obj.stop()
+    callbacks = [_early_stop_callback] if args.patience_trials is not None else None
+    if args.runner == "local":
+        objective_fn = _objective_local(args)
+    elif args.runner == "hf-job":
+        objective_fn = _objective_hf_job(args)
+    else:
+        objective_fn = _objective_hf_launcher(args)
+    study.optimize(
+        objective_fn,
+        n_trials=args.trials,
+        timeout=args.study_timeout,
+        callbacks=callbacks,
+    )
+    # study.best_* raise when there is no COMPLETE trial, so test the state
+    # itself instead of inferring completion from a non-None value.
+    completed = [t for t in study.trials if t.state == complete_state and t.value is not None]
+    best: dict[str, Any] = {
+        "study_name": study.study_name,
+        "direction": args.direction,
+        "metric": args.metric,
+        "best_value": None,
+        "best_params": {},
+        "best_trial_number": None,
+        "best_trial_user_attrs": {},
+        "n_trials": len(study.trials),
+        "n_completed": len(completed),
+        "patience_trials": args.patience_trials,
+        "min_improvement": args.min_improvement,
+        "enqueued_priors": enqueued_priors,
+    }
+    if completed:
+        best_trial = study.best_trial
+        best["best_value"] = study.best_value
+        best["best_params"] = study.best_params
+        best["best_trial_number"] = best_trial.number
+        best["best_trial_user_attrs"] = best_trial.user_attrs
+    else:
+        best["note"] = "No completed trials with metrics found."
+    args.summary_out.write_text(json.dumps(best, indent=2), encoding="utf-8")
+    print(json.dumps(best, indent=2))
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

overlay/scripts/parse_metrics.py ADDED Viewed

	@@ -0,0 +1,24 @@

+"""Parse train.py run.log → (bpb, tps_avg, factual).
+bpb priority order:
+  1. val_bpb from [VAL] line (cleanest signal, but OOMs on 6GB cards)
+  2. train_bpb from the LAST step= line (proxy when val fails — not held-out
+     but monotone with model capability over a 5-min budget)
+
+Usage: parse_metrics.py RUN_LOG
+Prints one tab-separated line: bpb, average tps, factual hits. A field that
+cannot be found is printed as NA; a train-bpb proxy is prefixed with '~'.
+"""
+import re, sys
+if len(sys.argv) < 2:
+    sys.exit("usage: parse_metrics.py RUN_LOG")
+# Close the handle deterministically and decode independently of the locale;
+# undecodable bytes in a log must not abort the parse.
+with open(sys.argv[1], encoding="utf-8", errors="replace") as log_file:
+    txt = log_file.read()
+val_match = re.search(r'val_bpb:\s+([\d\.]+)', txt)
+if val_match:
+    bpb = val_match.group(1)
+else:
+    step_lines = re.findall(r'^step=\d+\s+loss=[\d\.]+\s+bpb=([\d\.]+)', txt, re.M)
+    bpb = f'~{step_lines[-1]}' if step_lines else 'NA'
+# Only the integer part of each tps value is captured and averaged.
+tps_vals = [int(hit.group(1)) for hit in re.finditer(r'tps=(\d+)', txt)]
+tps_avg = f'{sum(tps_vals)/len(tps_vals):.0f}' if tps_vals else 'NA'
+factual_match = re.search(r'factual_english_hits:\s+(\d+/\d+)', txt)
+factual = factual_match.group(1) if factual_match else 'NA'
+print(f"{bpb}\t{tps_avg}\t{factual}")

overlay/scripts/run_domain_expanded_pretrain.sh CHANGED Viewed

@@ -1,262 +1,262 @@
-#!/usr/bin/env bash
-# Domain-expanded streaming pretrain launcher for Feather/HYDRA.
-#
-# Usage:
-#   ./scripts/run_domain_expanded_pretrain.sh
-#   HYDRA_TARGET_SHARDS=2048 HYDRA_TIME_BUDGET=28800 ./scripts/run_domain_expanded_pretrain.sh
-#   ./scripts/run_domain_expanded_pretrain.sh --target-shards 1024 --dry-run
-#   ./scripts/run_domain_expanded_pretrain.sh --target-shards -1 --download-workers 16
-#
-# Behavior:
-#   - counts currently cached parquet shards in ~/.cache/autoresearch/data
-#   - optionally expands shard coverage toward a target via prepare.py
-#   - skips prepare.py entirely when target coverage is already satisfied
-#   - exports WSL CUDA library paths and long-run HYDRA_* env vars
-#   - prefers an existing latest/pretrain checkpoint path if one is present
-#   - streams stdout/stderr to a stable repo log: run_domain_expanded.log
-set -euo pipefail
-REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-cd "$REPO_ROOT"
-CACHE_ROOT="${HYDRA_CACHE_ROOT:-$HOME/.cache/autoresearch}"
-DATA_DIR="${HYDRA_DATA_DIR:-$CACHE_ROOT/data}"
-CKPT_DIR="${HYDRA_CKPT_DIR:-$CACHE_ROOT/ckpts}"
-LOG_FILE="${HYDRA_DOMAIN_EXPANDED_LOG:-$REPO_ROOT/run_domain_expanded.log}"
-DEFAULT_TARGET_SHARDS="2048"
-TARGET_SHARDS="${HYDRA_TARGET_SHARDS:-$DEFAULT_TARGET_SHARDS}"
-DOWNLOAD_WORKERS="${HYDRA_DOWNLOAD_WORKERS:-8}"
-DRY_RUN=0
-SKIP_TRAIN=0
-FORCE_PREPARE=0
-NO_RESUME=0
-EXPLICIT_RESUME_PATH="${HYDRA_RESUME_PATH:-}"
-usage() {
-  sed -n '2,16p' "$0"
-  cat <<'EOF'
-Options:
-  --target-shards N       Target number of train shards to have locally (-1 = all)
-  --download-workers N    Parallel workers for prepare.py downloads
-  --resume PATH           Override auto-detected checkpoint path
-  --no-resume             Ignore existing checkpoints
-  --skip-train            Only ensure shard coverage, do not launch train.py
-  --force-prepare         Run prepare.py even if target coverage is already satisfied
-  --dry-run               Print planned actions without running prepare.py/train.py
-  -h, --help              Show this help
-EOF
-}
-while [[ $# -gt 0 ]]; do
-  case "$1" in
-    --target-shards)
-      TARGET_SHARDS="$2"
-      shift 2
-      ;;
-    --download-workers)
-      DOWNLOAD_WORKERS="$2"
-      shift 2
-      ;;
-    --resume)
-      EXPLICIT_RESUME_PATH="$2"
-      shift 2
-      ;;
-    --no-resume)
-      NO_RESUME=1
-      shift
-      ;;
-    --skip-train)
-      SKIP_TRAIN=1
-      shift
-      ;;
-    --force-prepare)
-      FORCE_PREPARE=1
-      shift
-      ;;
-    --dry-run)
-      DRY_RUN=1
-      shift
-      ;;
-    -h|--help)
-      usage
-      exit 0
-      ;;
-    *)
-      echo "Unknown option: $1" >&2
-      usage >&2
-      exit 2
-      ;;
-  esac
-done
-if ! [[ "$TARGET_SHARDS" =~ ^-?[0-9]+$ ]]; then
-  echo "Invalid --target-shards: $TARGET_SHARDS" >&2
-  exit 2
-fi
-if ! [[ "$DOWNLOAD_WORKERS" =~ ^[0-9]+$ ]] || [[ "$DOWNLOAD_WORKERS" -lt 1 ]]; then
-  echo "Invalid --download-workers: $DOWNLOAD_WORKERS" >&2
-  exit 2
-fi
-python_has_deps() {
-  local py="$1"
-  "$py" - <<'PY' >/dev/null 2>&1
-import requests, pyarrow, rustbpe, torch
-PY
-}
-if [[ -x "$REPO_ROOT/.venv/bin/python" ]] && python_has_deps "$REPO_ROOT/.venv/bin/python"; then
-  PYTHON_CMD=("$REPO_ROOT/.venv/bin/python")
-elif command -v uv >/dev/null 2>&1; then
-  PYTHON_CMD=(uv run python)
-elif command -v python3 >/dev/null 2>&1 && python_has_deps "$(command -v python3)"; then
-  PYTHON_CMD=(python3)
-else
-  echo "No usable Python interpreter found with required deps (.venv/bin/python, uv run python, or python3)." >&2
-  exit 1
-fi
-count_train_shards() {
-  if [[ ! -d "$DATA_DIR" ]]; then
-    echo 0
-    return
-  fi
-  find "$DATA_DIR" -maxdepth 1 -type f -name 'shard_*.parquet' ! -name 'shard_06542.parquet' | wc -l
-}
-count_total_shards() {
-  if [[ ! -d "$DATA_DIR" ]]; then
-    echo 0
-    return
-  fi
-  find "$DATA_DIR" -maxdepth 1 -type f -name 'shard_*.parquet' | wc -l
-}
-resolve_resume_path() {
-  if [[ "$NO_RESUME" -eq 1 ]]; then
-    return 0
-  fi
-  if [[ -n "$EXPLICIT_RESUME_PATH" ]]; then
-    local expanded
-    expanded="${EXPLICIT_RESUME_PATH/#\~/$HOME}"
-    if [[ -f "$expanded" ]]; then
-      printf '%s\n' "$expanded"
-      return 0
-    fi
-    echo "Requested resume checkpoint not found: $expanded" >&2
-    exit 1
-  fi
-  local candidates=(
-    "$CKPT_DIR/latest.pt"
-    "$CKPT_DIR/pretrain_latest.pt"
-    "$CKPT_DIR/pretrain_final.pt"
-    "$CACHE_ROOT/latest.pt"
-    "$CACHE_ROOT/pretrain_latest.pt"
-    "$CACHE_ROOT/pretrain_final.pt"
-    "$REPO_ROOT/latest.pt"
-    "$REPO_ROOT/pretrain_final.pt"
-  )
-  local candidate
-  for candidate in "${candidates[@]}"; do
-    if [[ -f "$candidate" ]]; then
-      printf '%s\n' "$candidate"
-      return 0
-    fi
-  done
-}
-CURRENT_TRAIN_SHARDS="$(count_train_shards | tr -d ' ')"
-CURRENT_TOTAL_SHARDS="$(count_total_shards | tr -d ' ')"
-HAS_VAL=0
-if [[ -f "$DATA_DIR/shard_06542.parquet" ]]; then
-  HAS_VAL=1
-fi
-PREPARE_NUM_SHARDS="$TARGET_SHARDS"
-if [[ "$TARGET_SHARDS" -eq -1 ]]; then
-  TARGET_DESC="all available train shards"
-  NEED_PREPARE=1
-elif [[ "$CURRENT_TRAIN_SHARDS" -ge "$TARGET_SHARDS" ]]; then
-  TARGET_DESC="$TARGET_SHARDS"
-  NEED_PREPARE="$FORCE_PREPARE"
-else
-  TARGET_DESC="$TARGET_SHARDS"
-  NEED_PREPARE=1
-fi
-RESUME_PATH="$(resolve_resume_path || true)"
-export LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
-export HYDRA_TIME_BUDGET="${HYDRA_TIME_BUDGET:-28800}"
-export HYDRA_TARGET_SHARDS="$TARGET_SHARDS"
-export HYDRA_DOWNLOAD_WORKERS="$DOWNLOAD_WORKERS"
-export HYDRA_DOMAIN_EXPANDED_LOG="$LOG_FILE"
-export HYDRA_CKPT_INTERVAL="${HYDRA_CKPT_INTERVAL:-2000}"
-export HYDRA_CHECKPOINT_INTERVAL="${HYDRA_CHECKPOINT_INTERVAL:-$HYDRA_CKPT_INTERVAL}"
-if [[ -n "$RESUME_PATH" ]]; then
-  export HYDRA_RESUME_PATH="$RESUME_PATH"
-  export HYDRA_RESUME_CKPT="$RESUME_PATH"
-fi
-mkdir -p "$(dirname "$LOG_FILE")"
-ts() { date '+%Y-%m-%d %H:%M:%S'; }
-log() {
-  local line="[$(ts)] $*"
-  echo "$line"
-  echo "$line" >> "$LOG_FILE"
-}
-log "=== domain-expanded pretrain launcher ==="
-log "repo_root=$REPO_ROOT"
-log "data_dir=$DATA_DIR train_shards=$CURRENT_TRAIN_SHARDS total_shards=$CURRENT_TOTAL_SHARDS has_val=$HAS_VAL"
-log "target_train_shards=$TARGET_DESC download_workers=$DOWNLOAD_WORKERS"
-log "log_file=$LOG_FILE"
-log "python=${PYTHON_CMD[*]}"
-log "HYDRA_TIME_BUDGET=$HYDRA_TIME_BUDGET"
-log "HYDRA_CKPT_INTERVAL=$HYDRA_CKPT_INTERVAL"
-if [[ -n "$RESUME_PATH" ]]; then
-  log "resume_checkpoint=$RESUME_PATH"
-else
-  log "resume_checkpoint=<none found>"
-fi
-log "note=train.py consumes HYDRA_RESUME_CKPT and HYDRA_CKPT_INTERVAL env vars; launcher exports them automatically"
-if [[ "${HYDRA_USE_NEMOTRON:-0}" == "1" ]]; then
-  # Streaming Nemotron path (Super3 recipe) pulls tokens directly from HF at
-  # train-time via prepare_nemotron.make_dataloader. The disk-shard prepare.py
-  # download phase is redundant in this mode and wastes 20-30 min of paid GPU
-  # time on shard parquet transfers we'll never read.
-  log "prepare_action=skip reason=HYDRA_USE_NEMOTRON=1 (streaming at train-time)"
-elif [[ "$NEED_PREPARE" -eq 1 ]]; then
-  PREPARE_CMD=("${PYTHON_CMD[@]}" prepare.py --num-shards "$PREPARE_NUM_SHARDS" --download-workers "$DOWNLOAD_WORKERS")
-  log "prepare_action=run command=${PREPARE_CMD[*]}"
-  if [[ "$DRY_RUN" -eq 0 ]]; then
-    "${PREPARE_CMD[@]}" 2>&1 | tee -a "$LOG_FILE"
-    CURRENT_TRAIN_SHARDS="$(count_train_shards | tr -d ' ')"
-    CURRENT_TOTAL_SHARDS="$(count_total_shards | tr -d ' ')"
-    log "post_prepare train_shards=$CURRENT_TRAIN_SHARDS total_shards=$CURRENT_TOTAL_SHARDS"
-  fi
-else
-  log "prepare_action=skip reason=target_already_satisfied"
-fi
-TRAIN_CMD=("${PYTHON_CMD[@]}" -u train.py)
-if [[ "$SKIP_TRAIN" -eq 1 ]]; then
-  log "train_action=skip reason=--skip-train"
-  exit 0
-fi
-log "train_action=launch command=${TRAIN_CMD[*]}"
-if [[ "$DRY_RUN" -eq 1 ]]; then
-  exit 0
-fi
-set +e
-"${TRAIN_CMD[@]}" 2>&1 | tee -a "$LOG_FILE"
-EXIT_CODE=${PIPESTATUS[0]}
-set -e
-log "train_exit_code=$EXIT_CODE"
-exit "$EXIT_CODE"

+#!/usr/bin/env bash
+# Domain-expanded streaming pretrain launcher for Feather/HYDRA.
+#
+# Usage:
+#   ./scripts/run_domain_expanded_pretrain.sh
+#   HYDRA_TARGET_SHARDS=2048 HYDRA_TIME_BUDGET=28800 ./scripts/run_domain_expanded_pretrain.sh
+#   ./scripts/run_domain_expanded_pretrain.sh --target-shards 1024 --dry-run
+#   ./scripts/run_domain_expanded_pretrain.sh --target-shards -1 --download-workers 16
+#
+# Behavior:
+#   - counts currently cached parquet shards in ~/.cache/autoresearch/data
+#   - optionally expands shard coverage toward a target via prepare.py
+#   - skips prepare.py entirely when target coverage is already satisfied
+#   - exports WSL CUDA library paths and long-run HYDRA_* env vars
+#   - prefers an existing latest/pretrain checkpoint path if one is present
+#   - streams stdout/stderr to a stable repo log: run_domain_expanded.log
+set -euo pipefail
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$REPO_ROOT"
+CACHE_ROOT="${HYDRA_CACHE_ROOT:-$HOME/.cache/autoresearch}"
+DATA_DIR="${HYDRA_DATA_DIR:-$CACHE_ROOT/data}"
+CKPT_DIR="${HYDRA_CKPT_DIR:-$CACHE_ROOT/ckpts}"
+LOG_FILE="${HYDRA_DOMAIN_EXPANDED_LOG:-$REPO_ROOT/run_domain_expanded.log}"
+DEFAULT_TARGET_SHARDS="2048"
+TARGET_SHARDS="${HYDRA_TARGET_SHARDS:-$DEFAULT_TARGET_SHARDS}"
+DOWNLOAD_WORKERS="${HYDRA_DOWNLOAD_WORKERS:-8}"
+DRY_RUN=0
+SKIP_TRAIN=0
+FORCE_PREPARE=0
+NO_RESUME=0
+EXPLICIT_RESUME_PATH="${HYDRA_RESUME_PATH:-}"
+usage() {
+  sed -n '2,16p' "$0"
+  cat <<'EOF'
+Options:
+  --target-shards N       Target number of train shards to have locally (-1 = all)
+  --download-workers N    Parallel workers for prepare.py downloads
+  --resume PATH           Override auto-detected checkpoint path
+  --no-resume             Ignore existing checkpoints
+  --skip-train            Only ensure shard coverage, do not launch train.py
+  --force-prepare         Run prepare.py even if target coverage is already satisfied
+  --dry-run               Print planned actions without running prepare.py/train.py
+  -h, --help              Show this help
+EOF
+}
+# Abort with a usage error when a value-taking option is the last argument.
+# Without this check `set -u` kills the script on "$2" with a bare
+# "unbound variable" message that does not name the offending option.
+#   $1 = option name, $2 = number of arguments still unparsed
+require_option_value() {
+  if [[ "$2" -lt 2 ]]; then
+    echo "Missing value for option: $1" >&2
+    usage >&2
+    exit 2
+  fi
+}
+# Command-line options override the HYDRA_* environment defaults set above.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --target-shards)
+      require_option_value "$1" "$#"
+      TARGET_SHARDS="$2"
+      shift 2
+      ;;
+    --download-workers)
+      require_option_value "$1" "$#"
+      DOWNLOAD_WORKERS="$2"
+      shift 2
+      ;;
+    --resume)
+      require_option_value "$1" "$#"
+      EXPLICIT_RESUME_PATH="$2"
+      shift 2
+      ;;
+    --no-resume)
+      NO_RESUME=1
+      shift
+      ;;
+    --skip-train)
+      SKIP_TRAIN=1
+      shift
+      ;;
+    --force-prepare)
+      FORCE_PREPARE=1
+      shift
+      ;;
+    --dry-run)
+      DRY_RUN=1
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1" >&2
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+if ! [[ "$TARGET_SHARDS" =~ ^-?[0-9]+$ ]]; then
+  echo "Invalid --target-shards: $TARGET_SHARDS" >&2
+  exit 2
+fi
+if ! [[ "$DOWNLOAD_WORKERS" =~ ^[0-9]+$ ]] || [[ "$DOWNLOAD_WORKERS" -lt 1 ]]; then
+  echo "Invalid --download-workers: $DOWNLOAD_WORKERS" >&2
+  exit 2
+fi
+python_has_deps() {
+  local py="$1"
+  "$py" - <<'PY' >/dev/null 2>&1
+import requests, pyarrow, rustbpe, torch
+PY
+}
+if [[ -x "$REPO_ROOT/.venv/bin/python" ]] && python_has_deps "$REPO_ROOT/.venv/bin/python"; then
+  PYTHON_CMD=("$REPO_ROOT/.venv/bin/python")
+elif command -v uv >/dev/null 2>&1; then
+  PYTHON_CMD=(uv run python)
+elif command -v python3 >/dev/null 2>&1 && python_has_deps "$(command -v python3)"; then
+  PYTHON_CMD=(python3)
+else
+  echo "No usable Python interpreter found with required deps (.venv/bin/python, uv run python, or python3)." >&2
+  exit 1
+fi
+count_train_shards() {
+  # Print the number of shard_*.parquet files directly under DATA_DIR,
+  # leaving out shard_06542.parquet (the file the launcher checks for HAS_VAL).
+  # Prints 0 when DATA_DIR does not exist.
+  [[ -d "$DATA_DIR" ]] || { echo 0; return; }
+  find "$DATA_DIR" -maxdepth 1 -type f -name 'shard_*.parquet' ! -name 'shard_06542.parquet' | wc -l
+}
+count_total_shards() {
+  # Print the number of shard_*.parquet files directly under DATA_DIR,
+  # with no exclusions. Prints 0 when DATA_DIR does not exist.
+  [[ -d "$DATA_DIR" ]] || { echo 0; return; }
+  find "$DATA_DIR" -maxdepth 1 -type f -name 'shard_*.parquet' | wc -l
+}
+# Print the checkpoint path to resume from on stdout, or print nothing when
+# resuming is disabled or no checkpoint exists.
+# Reads: NO_RESUME, EXPLICIT_RESUME_PATH, CKPT_DIR, CACHE_ROOT, REPO_ROOT, HOME.
+# An explicitly requested path that is not a regular file is a hard error
+# (message on stderr, exit status 1).
+resolve_resume_path() {
+  # --no-resume wins over everything else: emit nothing.
+  if [[ "$NO_RESUME" -eq 1 ]]; then
+    return 0
+  fi
+  if [[ -n "$EXPLICIT_RESUME_PATH" ]]; then
+    local expanded
+    # Expand a leading "~" by hand; a tilde arriving via a variable or a
+    # quoted argument is not expanded by the shell.
+    expanded="${EXPLICIT_RESUME_PATH/#\~/$HOME}"
+    if [[ -f "$expanded" ]]; then
+      printf '%s\n' "$expanded"
+      return 0
+    fi
+    echo "Requested resume checkpoint not found: $expanded" >&2
+    # NOTE(review): the caller invokes this function inside $(...), so this
+    # exit ends that subshell rather than the launcher directly — confirm the
+    # caller's `set -e` then aborts the script as intended.
+    exit 1
+  fi
+  # Auto-detection: the first existing file in this order is used.
+  local candidates=(
+    "$CKPT_DIR/latest.pt"
+    "$CKPT_DIR/pretrain_latest.pt"
+    "$CKPT_DIR/pretrain_final.pt"
+    "$CACHE_ROOT/latest.pt"
+    "$CACHE_ROOT/pretrain_latest.pt"
+    "$CACHE_ROOT/pretrain_final.pt"
+    "$REPO_ROOT/latest.pt"
+    "$REPO_ROOT/pretrain_final.pt"
+  )
+  local candidate
+  for candidate in "${candidates[@]}"; do
+    if [[ -f "$candidate" ]]; then
+      printf '%s\n' "$candidate"
+      return 0
+    fi
+  done
+}
+CURRENT_TRAIN_SHARDS="$(count_train_shards | tr -d ' ')"
+CURRENT_TOTAL_SHARDS="$(count_total_shards | tr -d ' ')"
+HAS_VAL=0
+if [[ -f "$DATA_DIR/shard_06542.parquet" ]]; then
+  HAS_VAL=1
+fi
+PREPARE_NUM_SHARDS="$TARGET_SHARDS"
+if [[ "$TARGET_SHARDS" -eq -1 ]]; then
+  TARGET_DESC="all available train shards"
+  NEED_PREPARE=1
+elif [[ "$CURRENT_TRAIN_SHARDS" -ge "$TARGET_SHARDS" ]]; then
+  TARGET_DESC="$TARGET_SHARDS"
+  NEED_PREPARE="$FORCE_PREPARE"
+else
+  TARGET_DESC="$TARGET_SHARDS"
+  NEED_PREPARE=1
+fi
+RESUME_PATH="$(resolve_resume_path || true)"
+export LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+export HYDRA_TIME_BUDGET="${HYDRA_TIME_BUDGET:-28800}"
+export HYDRA_TARGET_SHARDS="$TARGET_SHARDS"
+export HYDRA_DOWNLOAD_WORKERS="$DOWNLOAD_WORKERS"
+export HYDRA_DOMAIN_EXPANDED_LOG="$LOG_FILE"
+export HYDRA_CKPT_INTERVAL="${HYDRA_CKPT_INTERVAL:-2000}"
+export HYDRA_CHECKPOINT_INTERVAL="${HYDRA_CHECKPOINT_INTERVAL:-$HYDRA_CKPT_INTERVAL}"
+if [[ -n "$RESUME_PATH" ]]; then
+  export HYDRA_RESUME_PATH="$RESUME_PATH"
+  export HYDRA_RESUME_CKPT="$RESUME_PATH"
+fi
+mkdir -p "$(dirname "$LOG_FILE")"
+ts() { date '+%Y-%m-%d %H:%M:%S'; }
+log() {
+  local line="[$(ts)] $*"
+  echo "$line"
+  echo "$line" >> "$LOG_FILE"
+}
+log "=== domain-expanded pretrain launcher ==="
+log "repo_root=$REPO_ROOT"
+log "data_dir=$DATA_DIR train_shards=$CURRENT_TRAIN_SHARDS total_shards=$CURRENT_TOTAL_SHARDS has_val=$HAS_VAL"
+log "target_train_shards=$TARGET_DESC download_workers=$DOWNLOAD_WORKERS"
+log "log_file=$LOG_FILE"
+log "python=${PYTHON_CMD[*]}"
+log "HYDRA_TIME_BUDGET=$HYDRA_TIME_BUDGET"
+log "HYDRA_CKPT_INTERVAL=$HYDRA_CKPT_INTERVAL"
+if [[ -n "$RESUME_PATH" ]]; then
+  log "resume_checkpoint=$RESUME_PATH"
+else
+  log "resume_checkpoint=<none found>"
+fi
+log "note=train.py consumes HYDRA_RESUME_CKPT and HYDRA_CKPT_INTERVAL env vars; launcher exports them automatically"
+if [[ "${HYDRA_USE_NEMOTRON:-0}" == "1" ]]; then
+  # Streaming Nemotron path (Super3 recipe) pulls tokens directly from HF at
+  # train-time via prepare_nemotron.make_dataloader. The disk-shard prepare.py
+  # download phase is redundant in this mode and wastes 20-30 min of paid GPU
+  # time on shard parquet transfers we'll never read.
+  log "prepare_action=skip reason=HYDRA_USE_NEMOTRON=1 (streaming at train-time)"
+elif [[ "$NEED_PREPARE" -eq 1 ]]; then
+  PREPARE_CMD=("${PYTHON_CMD[@]}" prepare.py --num-shards "$PREPARE_NUM_SHARDS" --download-workers "$DOWNLOAD_WORKERS")
+  log "prepare_action=run command=${PREPARE_CMD[*]}"
+  if [[ "$DRY_RUN" -eq 0 ]]; then
+    "${PREPARE_CMD[@]}" 2>&1 | tee -a "$LOG_FILE"
+    CURRENT_TRAIN_SHARDS="$(count_train_shards | tr -d ' ')"
+    CURRENT_TOTAL_SHARDS="$(count_total_shards | tr -d ' ')"
+    log "post_prepare train_shards=$CURRENT_TRAIN_SHARDS total_shards=$CURRENT_TOTAL_SHARDS"
+  fi
+else
+  log "prepare_action=skip reason=target_already_satisfied"
+fi
+TRAIN_CMD=("${PYTHON_CMD[@]}" -u train.py)
+if [[ "$SKIP_TRAIN" -eq 1 ]]; then
+  log "train_action=skip reason=--skip-train"
+  exit 0
+fi
+log "train_action=launch command=${TRAIN_CMD[*]}"
+if [[ "$DRY_RUN" -eq 1 ]]; then
+  exit 0
+fi
+set +e
+"${TRAIN_CMD[@]}" 2>&1 | tee -a "$LOG_FILE"
+EXIT_CODE=${PIPESTATUS[0]}
+set -e
+log "train_exit_code=$EXIT_CODE"
+exit "$EXIT_CODE"

overlay/scripts/run_meta.sh CHANGED Viewed

@@ -1,13 +1,13 @@
-#!/usr/bin/env bash
-set -euo pipefail
-echo "=== HYDRA Meta-Agent ==="
-cd "$(dirname "$0")/.."
-echo "Running meta-agent iteration..."
-uv run python -c "
-from harness.meta_agent import run_meta_iteration
-import json
-result = run_meta_iteration()
-print(json.dumps(result, indent=2))
-"

+#!/usr/bin/env bash
+# Run one HYDRA meta-agent iteration and print its result as indented JSON.
+set -euo pipefail
+echo "=== HYDRA Meta-Agent ==="
+# Work from the parent of this script's directory so `harness` is importable.
+cd "$(dirname "$0")/.."
+echo "Running meta-agent iteration..."
+# The inline program calls harness.meta_agent.run_meta_iteration() and dumps
+# whatever it returns with json.dumps(indent=2).
+uv run python -c "
+from harness.meta_agent import run_meta_iteration
+import json
+result = run_meta_iteration()
+print(json.dumps(result, indent=2))
+"

overlay/scripts/run_phase1.sh CHANGED Viewed

@@ -1,32 +1,32 @@
-#!/usr/bin/env bash
-set -euo pipefail
-echo "=== HYDRA Phase 1: Sequential Subsystem Bring-Up ==="
-cd "$(dirname "$0")/.."
-SUBSYSTEMS=("mamba3" "mhc" "engram" "hestia" "sdr")
-for sub in "${SUBSYSTEMS[@]}"; do
-    echo ""
-    echo "--- Subsystem: ${sub} ---"
-    BRANCH="autoresearch/phase1-${sub}"
-    # Create branch if it doesn't exist
-    if ! git rev-parse --verify "${BRANCH}" &>/dev/null; then
-        git checkout -b "${BRANCH}"
-    else
-        git checkout "${BRANCH}"
-    fi
-    echo "Running: uv run subsystems/train_${sub}.py"
-    uv run "subsystems/train_${sub}.py" > "run_${sub}.log" 2>&1 || true
-    # Extract result
-    echo "Result:"
-    grep "^val_bpb:" "run_${sub}.log" || echo "  (crashed)"
-    grep "^peak_vram_mb:" "run_${sub}.log" || true
-done
-echo ""
-echo "=== Phase 1 complete ==="
-git checkout main 2>/dev/null || git checkout master

+#!/usr/bin/env bash
+# HYDRA Phase 1: train each subsystem in turn on its own
+# autoresearch/phase1-<subsystem> branch, logging to run_<subsystem>.log.
+set -euo pipefail
+echo "=== HYDRA Phase 1: Sequential Subsystem Bring-Up ==="
+# Work from the parent of this script's directory.
+cd "$(dirname "$0")/.."
+SUBSYSTEMS=("mamba3" "mhc" "engram" "hestia" "sdr")
+for sub in "${SUBSYSTEMS[@]}"; do
+    echo ""
+    echo "--- Subsystem: ${sub} ---"
+    BRANCH="autoresearch/phase1-${sub}"
+    # Create branch if it doesn't exist
+    # NOTE(review): `git checkout -b` branches from the current HEAD, so a newly
+    # created subsystem branch starts from whichever branch the previous loop
+    # iteration left checked out — confirm this stacking is intended.
+    if ! git rev-parse --verify "${BRANCH}" &>/dev/null; then
+        git checkout -b "${BRANCH}"
+    else
+        git checkout "${BRANCH}"
+    fi
+    echo "Running: uv run subsystems/train_${sub}.py"
+    # `|| true` keeps the loop going when a subsystem run fails; the failure is
+    # reported below through the missing val_bpb line.
+    uv run "subsystems/train_${sub}.py" > "run_${sub}.log" 2>&1 || true
+    # Extract result
+    echo "Result:"
+    grep "^val_bpb:" "run_${sub}.log" || echo "  (crashed)"
+    grep "^peak_vram_mb:" "run_${sub}.log" || true
+done
+echo ""
+echo "=== Phase 1 complete ==="
+# Return to the default branch, whichever of main/master exists.
+git checkout main 2>/dev/null || git checkout master

overlay/scripts/run_phase2.sh CHANGED Viewed

@@ -1,25 +1,25 @@
-#!/usr/bin/env bash
-set -euo pipefail
-echo "=== HYDRA Phase 2: Integrated Autoresearch ==="
-cd "$(dirname "$0")/.."
-TAG="${1:-$(date +%b%d | tr '[:upper:]' '[:lower:]')}"
-# Validate tag: only alphanumeric, hyphens, underscores, dots
-if [[ ! "${TAG}" =~ ^[a-zA-Z0-9._-]+$ ]]; then
-    echo "Error: invalid tag '${TAG}'. Use only alphanumeric, hyphens, underscores, dots." >&2
-    exit 1
-fi
-BRANCH="autoresearch/${TAG}"
-if ! git rev-parse --verify "${BRANCH}" &>/dev/null; then
-    git checkout -b -- "${BRANCH}"
-else
-    git checkout -- "${BRANCH}"
-fi
-echo "Branch: ${BRANCH}"
-echo "Starting orchestrator..."
-uv run -m harness.orchestrator

+#!/usr/bin/env bash
+# HYDRA Phase 2: switch to (or create) the autoresearch/<tag> branch, then
+# start the harness orchestrator.
+#
+# Usage:
+#   bash scripts/run_phase2.sh [tag]
+# The tag defaults to the lowercased output of `date +%b%d`.
+set -euo pipefail
+echo "=== HYDRA Phase 2: Integrated Autoresearch ==="
+cd "$(dirname "$0")/.."
+TAG="${1:-$(date +%b%d | tr '[:upper:]' '[:lower:]')}"
+# Validate tag: only alphanumeric, hyphens, underscores, dots
+if [[ ! "${TAG}" =~ ^[a-zA-Z0-9._-]+$ ]]; then
+    echo "Error: invalid tag '${TAG}'. Use only alphanumeric, hyphens, underscores, dots." >&2
+    exit 1
+fi
+BRANCH="autoresearch/${TAG}"
+# Do not put `--` before the branch name: `git checkout -b -- X` takes `--` as
+# the new branch name, and `git checkout -- X` treats X as a pathspec instead
+# of a branch. BRANCH always begins with "autoresearch/", so it can never be
+# mistaken for an option.
+if ! git rev-parse --verify "${BRANCH}" &>/dev/null; then
+    git checkout -b "${BRANCH}"
+else
+    git checkout "${BRANCH}"
+fi
+echo "Branch: ${BRANCH}"
+echo "Starting orchestrator..."
+uv run -m harness.orchestrator

overlay/scripts/run_tps_gate.sh ADDED Viewed

	@@ -0,0 +1,23 @@

+#!/usr/bin/env bash
+set -euo pipefail
+# Run a reproducible throughput gate.
+# Default gate: 50k TPS steady-state.
+#
+# Usage:
+#   bash scripts/run_tps_gate.sh [config] [seconds] [min_tps]
+# Example:
+#   bash scripts/run_tps_gate.sh baseline 300 50000
+#
+# Assumes benchmark_hyena_stack.py exits non-zero when throughput is below
+# --min-tps; `set -e` then stops the script before "PASS" is printed.
+# TODO confirm against the benchmark script.
+# Resolve the benchmark path from the repo root so the gate can be launched
+# from any working directory.
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
+CONFIG="${1:-baseline}"
+# Named SECONDS_BUDGET because SECONDS is a special bash variable.
+SECONDS_BUDGET="${2:-300}"
+MIN_TPS="${3:-50000}"
+echo "[tps-gate] config=$CONFIG seconds=$SECONDS_BUDGET min_tps=$MIN_TPS"
+python scripts/benchmark_hyena_stack.py \
+  --config "$CONFIG" \
+  --time "$SECONDS_BUDGET" \
+  --min-tps "$MIN_TPS"
+echo "[tps-gate] PASS"

overlay/scripts/setup.sh CHANGED Viewed

@@ -1,27 +1,28 @@
-#!/usr/bin/env bash
-set -euo pipefail
-echo "=== HYDRA Setup ==="
-echo ""
-# Check uv
-if ! command -v uv &>/dev/null; then
-    echo "Installing uv..."
-    curl -LsSf https://astral.sh/uv/install.sh | sh
-fi
-# Install Python dependencies
-echo "Installing Python dependencies..."
-cd "$(dirname "$0")/.."
-uv sync
-# Prepare data (download shards + train tokenizer)
-echo ""
-echo "Preparing data (this may take a few minutes on first run)..."
-uv run prepare.py --num-shards 10
-echo ""
-echo "=== Setup complete ==="
-echo "Run experiments with: uv run train.py"
-echo "Run orchestrator with: uv run -m harness.orchestrator"
-echo "Run Phase 1 subsystems with: bash scripts/run_phase1.sh"

+#!/usr/bin/env bash
+# One-time HYDRA setup: install uv if needed, sync Python dependencies, and
+# prepare the first 10 data shards.
+set -euo pipefail
+echo "=== HYDRA Setup ==="
+echo ""
+# Check uv
+if ! command -v uv &>/dev/null; then
+    echo "Installing uv..."
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+    # The installer cannot change the PATH of this already-running shell, so
+    # add its install locations here (older releases used ~/.cargo/bin).
+    export PATH="${XDG_BIN_HOME:-$HOME/.local/bin}:$HOME/.cargo/bin:$PATH"
+    if ! command -v uv &>/dev/null; then
+        echo "uv was installed but is not on PATH; open a new shell and re-run this script." >&2
+        exit 1
+    fi
+fi
+# Install Python dependencies
+echo "Installing Python dependencies..."
+cd "$(dirname "$0")/.."
+uv sync
+# Prepare data (download shards + train tokenizer)
+echo ""
+echo "Preparing data (this may take a few minutes on first run)..."
+uv run prepare.py --num-shards 10
+echo ""
+echo "=== Setup complete ==="
+echo "Run experiments with: uv run train.py"
+echo "Run orchestrator with: uv run -m harness.orchestrator"
+echo "Run Phase 1 subsystems with: bash scripts/run_phase1.sh"
+echo "For WSL/CUDA throughput gate: see docs/WSL_TPS_RUNBOOK.md"

overlay/scripts/strip_optimizer_state.py ADDED Viewed

	@@ -0,0 +1,29 @@

+"""Strip optimizer_state_dict from a checkpoint, keeping only model weights
+and config metadata.
+Usage:
+  python strip_optimizer_state.py SRC_CKPT DST_CKPT
+Reason: resuming training.py's standard path restores the optimizer state,
+which (in our 6GB / Muon-compile / bf16 setup) reproducibly produces a
+NaN/>100-loss on the first forward after load. Reloading model weights
+only and letting the optimizer initialize fresh sidesteps the issue.
+Output checkpoint also resets `step`, `train_seconds`, `smoothed_loss`,
+`bpt_ema` and `epoch` to zero so the LR schedule and warmup restart from
+zero — useful when we want to fine-tune the trained weights at a new
+schedule length.
+"""
+import sys
+import torch
+src, dst = sys.argv[1], sys.argv[2]
+# SECURITY: weights_only=False unpickles arbitrary Python objects, so loading
+# can execute code embedded in the file. Only run this on checkpoints you
+# produced yourself.
+ckpt = torch.load(src, map_location="cpu", weights_only=False)
+keep = {
+    # Without a "model_state_dict" key, the whole mapping is taken as the
+    # state dict.
+    "model_state_dict": ckpt.get("model_state_dict", ckpt),
+    "config": ckpt.get("config"),
+    # Reset training progress markers so LR schedule warmups cleanly.
+    "step": 0,
+    "train_seconds": 0.0,
+    "smoothed_loss": 0.0,
+    "bpt_ema": 0.0,
+    "epoch": 0,
+}
+# Explicitly do NOT copy optimizer_state_dict.
+torch.save(keep, dst)
+print(f"Stripped -> {dst} (orig {len(ckpt)} keys, kept {len(keep)})")

overlay/scripts/sweep_depth_aggregate.py CHANGED Viewed

@@ -11,16 +11,56 @@ Usage:
 """
 from __future__ import annotations
-import json
-import os
-import sys
-from pathlib import Path
-MANIFEST = Path(sys.argv[1] if len(sys.argv) > 1 else '/tmp/sweep_depth_manifest.txt')
-def fetch_metrics_from_job(job_id: str) -> dict | None:
-    """Fetch HF Job stdout and parse the [METRICS_JSON] line."""
     try:
         from huggingface_hub import HfApi  # type: ignore
     except Exception as e:
@@ -33,41 +73,73 @@ def fetch_metrics_from_job(job_id: str) -> dict | None:
         print(f'[agg] could not fetch logs for job={job_id}: {e}', file=sys.stderr)
         return None
-    last_json = None
-    for line in logs_stream:
-        # HfApi returns strings or JobLogEntry-like objects depending on version.
-        text = getattr(line, 'data', None) or str(line)
-        if '[METRICS_JSON]' in text:
-            payload = text.split('[METRICS_JSON]', 1)[1].strip()
-            try:
-                last_json = json.loads(payload)
-            except Exception:
-                # Might be truncated on a line boundary — keep looking.
-                pass
-    return last_json
-def compare(results: dict[int, dict]) -> None:
-    """Pretty-print comparison across n_layer values."""
-    if not results:
-        print('[agg] no results')
-        return
-    sorted_n = sorted(results.keys())
-    # Top-level scalars
-    print('\n=== Top-level scalars ===')
     hdr = ['metric'] + [f'L={n}' for n in sorted_n]
     print('  '.join(f'{h:>14}' for h in hdr))
-    for key in ('val_bpb', 'val_ppl', 'num_params_M', 'total_tokens_M',
-                'training_seconds', 'peak_vram_mb', 'sdr_target_active',
-                'htm_anomaly', 'engram_hit_rate', 'sdr_active_bits'):
-        row = [key] + [f'{results[n].get(key, float("nan")):.4f}' if isinstance(results[n].get(key), (int, float)) else 'n/a' for n in sorted_n]
-        print('  '.join(f'{c:>14}' for c in row))
     # Per-layer panel — one table per metric.
     print('\n=== Per-layer: delta_ratio (residual contribution) ===')
     print('  '.join(['layer'] + [f'L={n:>2}' for n in sorted_n]))
-    max_depth = max(results[n].get('n_layer', 0) for n in sorted_n)
     for li in range(max_depth):
         row = [f'L{li:02d}']
         for n in sorted_n:
@@ -104,16 +176,40 @@ def compare(results: dict[int, dict]) -> None:
     # Dead-layer detection
     print('\n=== Dead-layer detection (delta_ratio < 0.02) ===')
-    for n in sorted_n:
-        r = results[n]
-        n_layer = r.get('n_layer', 0)
         dead = []
         for li in range(n_layer):
             v = r.get(f'layer_{li}_delta_ratio')
             if isinstance(v, (int, float)) and v < 0.02:
                 dead.append(li)
-        status = 'ALL LIVE' if not dead else f'DEAD LAYERS: {dead}'
-        print(f'  n_layer={n:2d}  val_bpb={r.get("val_bpb", float("nan")):.4f}  {status}')
 def main() -> int:
@@ -134,7 +230,7 @@ def main() -> int:
         jobs[n_layer] = job_id
     print(f'[agg] reading {len(jobs)} jobs from {MANIFEST}')
-    results: dict[int, dict] = {}
     for n, jid in jobs.items():
         print(f'[agg] fetching job={jid} (n_layer={n}) ...')
         m = fetch_metrics_from_job(jid)

 """
 from __future__ import annotations
+import json
+import os
+import statistics
+import re
+import sys
+from pathlib import Path
+from configs.harness_config import HarnessConfig
+type MetricValue = float | int | str | bool | None
+type MetricsDict = dict[str, MetricValue]
+MANIFEST = Path(sys.argv[1] if len(sys.argv) > 1 else '/tmp/sweep_depth_manifest.txt')
+STEP_TPS_PATTERN = re.compile(r"step=(\d+).*?\btps=(\d+)\b")
+MIN_TPS = float(os.environ.get('SWEEP_MIN_TPS', '0'))
+def _zero_shot_score(result: MetricsDict) -> float:
+    """Composite quality score for tie-breaking among BPB-near runs.
+    The score is factual_english_score + instruction_following_score
+    + distinct_2 - repetition_rate. A component that is missing, None, or
+    not convertible to float (for example a placeholder string) counts as
+    0.0, so one malformed metric cannot abort the whole ranking.
+    """
+    def _component(key: str) -> float:
+        # `or 0.0` maps None and other falsy values to zero before conversion.
+        try:
+            return float(result.get(key, 0.0) or 0.0)
+        except (TypeError, ValueError):
+            return 0.0
+    factual = _component('factual_english_score')
+    instruction = _component('instruction_following_score')
+    distinct_2 = _component('distinct_2')
+    repetition = _component('repetition_rate')
+    return factual + instruction + distinct_2 - repetition
+def _metric_float(result: MetricsDict, key: str, default: float = 0.0) -> float:
+    """Return result[key] as a float, or default when it is missing or not an int/float.
+    bool values pass the isinstance check and are converted like ints.
+    """
+    raw = result.get(key, default)
+    if not isinstance(raw, (int, float)):
+        return default
+    return float(raw)
+def _metric_int(result: MetricsDict, key: str, default: int = 0) -> int:
+    """Return result[key] as an int, or default when it is missing or not an int.
+    Floats are not accepted, even integral ones such as 4.0.
+    """
+    raw = result.get(key, default)
+    if not isinstance(raw, int):
+        return default
+    return int(raw)
+def _percentile_linear(sorted_values: list[float], pct: float) -> float:
+    """Return the pct-th percentile by linear interpolation between neighbours.
+    The input is used as given and is not sorted here, so callers must pass
+    values already in ascending order. An empty list yields 0.0 and a single
+    value is returned unchanged.
+    """
+    count = len(sorted_values)
+    if count == 0:
+        return 0.0
+    if count == 1:
+        return sorted_values[0]
+    # Fractional index of the requested percentile within the list.
+    position = (count - 1) * (pct / 100.0)
+    lower = int(position)
+    upper = min(lower + 1, count - 1)
+    weight = position - lower
+    return sorted_values[lower] * (1.0 - weight) + sorted_values[upper] * weight
+def fetch_metrics_from_job(job_id: str) -> MetricsDict | None:
+    """Fetch HF Job stdout and parse the [METRICS_JSON] line."""
     try:
         from huggingface_hub import HfApi  # type: ignore
     except Exception as e:
         print(f'[agg] could not fetch logs for job={job_id}: {e}', file=sys.stderr)
         return None
+    last_json = None
+    tps_samples: list[tuple[int, int]] = []
+    warmup_steps = 25
+    for line in logs_stream:
+        # HfApi returns strings or JobLogEntry-like objects depending on version.
+        text = getattr(line, 'data', None) or str(line)
+        wm = re.search(r"\[TPS_GUARD\] enabled .*?warmup_steps=(\d+)", text)
+        if wm:
+            warmup_steps = int(wm.group(1))
+        sm = STEP_TPS_PATTERN.search(text)
+        if sm:
+            tps_samples.append((int(sm.group(1)), int(sm.group(2))))
+        if '[METRICS_JSON]' in text:
+            payload = text.split('[METRICS_JSON]', 1)[1].strip()
+            try:
+                last_json = json.loads(payload)
+            except Exception:
+                # Might be truncated on a line boundary — keep looking.
+                pass
+    if last_json is None:
+        return None
+    steady_tps = [float(tps) for step, tps in tps_samples if step >= warmup_steps]
+    if not steady_tps:
+        steady_tps = [float(tps) for _, tps in tps_samples]
+    if steady_tps:
+        sorted_tps = sorted(steady_tps)
+        last_json['tps_samples'] = len(steady_tps)
+        last_json['tps_median'] = float(statistics.median(steady_tps))
+        last_json['tps_p10'] = float(_percentile_linear(sorted_tps, 10.0))
+        last_json['tps_min'] = float(sorted_tps[0])
+        last_json['tps_max'] = float(sorted_tps[-1])
+        last_json['tps_warmup_steps'] = int(warmup_steps)
+    return last_json
+def compare(results: dict[int, MetricsDict]) -> None:
+    """Pretty-print comparison across n_layer values."""
+    if not results:
+        print('[agg] no results')
+        return
+    sorted_n = sorted(results.keys())
+    secondary_gates = HarnessConfig().to_secondary_gates()
+    print('\n=== Active secondary gates ===')
+    for metric, thresholds in sorted(secondary_gates.items()):
+        print(f'  {metric}: {json.dumps(thresholds, sort_keys=True)}')
+    # Top-level scalars
+    print('\n=== Top-level scalars ===')
     hdr = ['metric'] + [f'L={n}' for n in sorted_n]
     print('  '.join(f'{h:>14}' for h in hdr))
+    for key in ('val_bpb', 'val_ppl', 'num_params_M', 'total_tokens_M',
+                'training_seconds', 'peak_vram_mb', 'sdr_target_active',
+                'htm_anomaly', 'engram_hit_rate', 'sdr_active_bits',
+                'tps_median', 'tps_p10', 'tps_min', 'tps_max', 'tps_samples'):
+        row = [key] + [f'{results[n].get(key, float("nan")):.4f}' if isinstance(results[n].get(key), (int, float)) else 'n/a' for n in sorted_n]
+        print('  '.join(f'{c:>14}' for c in row))
     # Per-layer panel — one table per metric.
     print('\n=== Per-layer: delta_ratio (residual contribution) ===')
     print('  '.join(['layer'] + [f'L={n:>2}' for n in sorted_n]))
+    max_depth = max(_metric_int(results[n], 'n_layer', 0) for n in sorted_n)
     for li in range(max_depth):
         row = [f'L{li:02d}']
         for n in sorted_n:
     # Dead-layer detection
     print('\n=== Dead-layer detection (delta_ratio < 0.02) ===')
+    for n in sorted_n:
+        r = results[n]
+        n_layer = _metric_int(r, 'n_layer', 0)
         dead = []
         for li in range(n_layer):
             v = r.get(f'layer_{li}_delta_ratio')
             if isinstance(v, (int, float)) and v < 0.02:
                 dead.append(li)
+        status = 'ALL LIVE' if not dead else f'DEAD LAYERS: {dead}'
+        print(f'  n_layer={n:2d}  val_bpb={r.get("val_bpb", float("nan")):.4f}  {status}')
+    print('\n=== Throughput-constrained ranking ===')
+    ranked = sorted(
+        ((n, r) for n, r in results.items() if isinstance(r.get('val_bpb'), (int, float))),
+        key=lambda x: (
+            (MIN_TPS > 0) and (_metric_float(x[1], 'tps_median', 0.0) < MIN_TPS),
+            _metric_float(x[1], 'val_bpb', float('inf')),
+            -_zero_shot_score(x[1]),
+        ),
+    )
+    feasible_count = 0
+    for n, r in ranked:
+        tps_median = _metric_float(r, 'tps_median', 0.0)
+        feasible = (MIN_TPS <= 0) or (tps_median >= MIN_TPS)
+        zero_shot_score = _zero_shot_score(r)
+        if feasible:
+            feasible_count += 1
+        print(
+            f"  n_layer={n:2d} val_bpb={_metric_float(r, 'val_bpb', float('nan')):.4f} "
+            f"tps_median={tps_median:.0f} zero_shot_score={zero_shot_score:.4f} feasible={feasible}",
+            flush=True,
+        )
+    if MIN_TPS > 0:
+        print(f"[agg] throughput gate: tps_median >= {MIN_TPS:.0f}; feasible={feasible_count}/{len(ranked)}")
 def main() -> int:
         jobs[n_layer] = job_id
     print(f'[agg] reading {len(jobs)} jobs from {MANIFEST}')
+    results: dict[int, MetricsDict] = {}
     for n, jid in jobs.items():
         print(f'[agg] fetching job={jid} (n_layer={n}) ...')
         m = fetch_metrics_from_job(jid)

overlay/scripts/sweep_depth_local.sh CHANGED Viewed

@@ -1,62 +1,62 @@
-#!/usr/bin/env bash
-# Local sequential depth sweep on RTX 3060.
-# Uses real mamba_ssm Mamba3 (grafted from state-spaces/mamba main).
-# Config: Gen 76 local champion (d_model=96, engram=4096, target_active=327),
-# sweeping n_layer ∈ {1, 2, 3, 4}. Each run 300s (~5 min) → ~20 min total.
-set -euo pipefail
-cd "$(dirname "${BASH_SOURCE[0]}")/.."
-export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
-# WSL2: libcuda.so.1 lives at /usr/lib/wsl/lib; prepend it so cudarc finds the
-# CUDA driver library at runtime.
-export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:/usr/lib/wsl/lib:${LD_LIBRARY_PATH:-}
-export PYTORCH_ALLOC_CONF=expandable_segments:True
-# GPU HTM path: use non-fused step_many_cuda (fused megakernel is Hopper-only).
-# This drops htm_await from ~20-40s/step (CPU) to ~0ms (GPU, async).
-export HYDRA_HTM_FUSED=0
-# Architecture (Gen 76 + user audit: keep target_active=327 for gradient plasticity).
-export HYDRA_D_MODEL=96
-export HYDRA_D_STATE=16
-export HYDRA_HEADDIM=12
-export HYDRA_EXPAND=3
-export HYDRA_ENGRAM_N_COLUMNS=4096
-export HYDRA_SDR_TARGET_ACTIVE=327
-# Training knobs tuned for 6GB VRAM.
-export HYDRA_BATCH_SIZE=1
-export HYDRA_TOTAL_BATCH=32768        # 1 * 8 accum * 512 seq * 8 heads = Gen 76 config
-export HYDRA_TIME_BUDGET=300          # 5 min per run
-export HYDRA_CKPT_INTERVAL=0          # don't save ckpts during sweep
-export HYDRA_MID_VAL_INTERVAL=250
-# Full per-layer diagnostic panel.
-export HYDRA_LAYER_DIAGNOSTICS=1
-export HYDRA_LAYER_DIAG_SVD_EVERY=100
-# Use cached shards + tokenizer + retina (vocab=8192, target_active=327).
-# NOT streaming — already have 2049 shards from prior local runs.
-unset HYDRA_USE_NEMOTRON
-PY=/home/mikeb/work/feather/.venv/bin/python3
-OUT_DIR=/tmp/local_sweep
-mkdir -p "$OUT_DIR"
-for N in 1 2 3 4; do
-    echo "=========================================="
-    echo "=== n_layer=$N  $(date +%H:%M:%S) ==="
-    echo "=========================================="
-    export HYDRA_N_LAYER=$N
-    export HYDRA_METRICS_OUT="$OUT_DIR/sweep_n${N}_metrics.json"
-    LOG="$OUT_DIR/sweep_n${N}.log"
-    "$PY" -u train.py > "$LOG" 2>&1 || echo "[WARN] n_layer=$N run exited non-zero (see $LOG)"
-    echo "=== n_layer=$N done; metrics=$HYDRA_METRICS_OUT log=$LOG ==="
-    # Quick tail of the important lines
-    grep -E "val_bpb|LAYER_DIAG|METRICS_JSON" "$LOG" | tail -20 || true
-done
-echo ""
-echo "=== SWEEP COMPLETE ==="
-ls -la "$OUT_DIR"

+#!/usr/bin/env bash
+# Local sequential depth sweep on RTX 3060.
+# Uses real mamba_ssm Mamba3 (grafted from state-spaces/mamba main).
+# Config: Gen 76 local champion (d_model=96, engram=4096, target_active=327),
+# sweeping n_layer ∈ {1, 2, 3, 4}. Each run 300s (~5 min) → ~20 min total.
+#
+# Optional environment overrides (defaults reproduce the original behaviour):
+#   CUDA_HOME  CUDA toolkit root (default /usr/local/cuda)
+#   PY         python interpreter used to launch train.py
+#   OUT_DIR    directory receiving per-depth logs and metrics JSON
+set -euo pipefail
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
+export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
+# WSL2: libcuda.so.1 lives at /usr/lib/wsl/lib; prepend it so cudarc finds the
+# CUDA driver library at runtime.
+export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:/usr/lib/wsl/lib:${LD_LIBRARY_PATH:-}
+# Allocator config. Recent PyTorch reads PYTORCH_ALLOC_CONF, while older
+# releases only honour PYTORCH_CUDA_ALLOC_CONF (the name the champion scripts
+# use). Export both so expandable segments are enabled regardless of version.
+export PYTORCH_ALLOC_CONF=expandable_segments:True
+export PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_ALLOC_CONF}
+# GPU HTM path: use non-fused step_many_cuda (fused megakernel is Hopper-only).
+# This drops htm_await from ~20-40s/step (CPU) to ~0ms (GPU, async).
+export HYDRA_HTM_FUSED=0
+# Architecture (Gen 76 + user audit: keep target_active=327 for gradient plasticity).
+export HYDRA_D_MODEL=96
+export HYDRA_D_STATE=16
+export HYDRA_HEADDIM=12
+export HYDRA_EXPAND=3
+export HYDRA_ENGRAM_N_COLUMNS=4096
+export HYDRA_SDR_TARGET_ACTIVE=327
+# Training knobs tuned for 6GB VRAM.
+export HYDRA_BATCH_SIZE=1
+export HYDRA_TOTAL_BATCH=32768        # 1 * 8 accum * 512 seq * 8 heads = Gen 76 config
+export HYDRA_TIME_BUDGET=300          # 5 min per run
+export HYDRA_CKPT_INTERVAL=0          # don't save ckpts during sweep
+export HYDRA_MID_VAL_INTERVAL=250
+# Full per-layer diagnostic panel.
+export HYDRA_LAYER_DIAGNOSTICS=1
+export HYDRA_LAYER_DIAG_SVD_EVERY=100
+# Use cached shards + tokenizer + retina (vocab=8192, target_active=327).
+# NOT streaming — already have 2049 shards from prior local runs.
+unset HYDRA_USE_NEMOTRON
+PY=${PY:-/home/mikeb/work/feather/.venv/bin/python3}
+OUT_DIR=${OUT_DIR:-/tmp/local_sweep}
+mkdir -p "$OUT_DIR"
+for N in 1 2 3 4; do
+    echo "=========================================="
+    echo "=== n_layer=$N  $(date +%H:%M:%S) ==="
+    echo "=========================================="
+    export HYDRA_N_LAYER=$N
+    export HYDRA_METRICS_OUT="$OUT_DIR/sweep_n${N}_metrics.json"
+    LOG="$OUT_DIR/sweep_n${N}.log"
+    # A failing depth must not abort the sweep (set -e is active), so the
+    # failure is downgraded to a warning and the loop moves to the next depth.
+    "$PY" -u train.py > "$LOG" 2>&1 || echo "[WARN] n_layer=$N run exited non-zero (see $LOG)"
+    echo "=== n_layer=$N done; metrics=$HYDRA_METRICS_OUT log=$LOG ==="
+    # Quick tail of the important lines; `|| true` keeps pipefail from killing
+    # the sweep when grep finds no matching lines.
+    grep -E "val_bpb|LAYER_DIAG|METRICS_JSON" "$LOG" | tail -20 || true
+done
+echo ""
+echo "=== SWEEP COMPLETE ==="
+ls -la "$OUT_DIR"

overlay/scripts/train_champion_12h.sh ADDED Viewed

	@@ -0,0 +1,50 @@

+#!/bin/bash
+# 12-hour champion training run. Config matches autoresearch iter.sh base
+# after 61 mutation experiments identified the Pareto-optimal knobs.
+#
+# Champion config (train_bpb ~1.6169 at 10-min budget, 29.7k tps):
+#   d_model=160, n_layer=20, B=8, seq=1024
+#   engram=16384, z_loss=0.001, no GDN (pure Mamba3 stack)
+#   TIME_BUDGET=43200s (12 hours)
+#   CKPT_INTERVAL=500 steps
+#   NOTE(review): this header used to say "~every 15 min at ~30 steps/s", but
+#   500 steps at 30 steps/s is ~17 s — confirm the real step rate.
+#
+# Assumes .omc/autoresearch_STOP sentinel is present (cron loop disabled).
+# Output goes to run_champion_12h.log in repo root.
+#
+# Exit status: 1 when a pre-flight check fails, otherwise the exit status of
+# train.py (also printed as "exit=<code>").
+set -u
+REPO=/home/mikeb/work/feather
+# No `set -e` here: a failed cd would otherwise fall through and run the
+# rm/python commands below in the caller's directory.
+cd "$REPO" || { echo "ERROR: cannot cd to $REPO"; exit 1; }
+# Bail if autoresearch loop sentinel not set (would conflict)
+if [ ! -f "$REPO/.omc/autoresearch_STOP" ]; then
+  echo "ERROR: .omc/autoresearch_STOP not present — autoresearch cron still active."
+  echo "Run: touch $REPO/.omc/autoresearch_STOP"
+  exit 1
+fi
+# Bail if another training is running
+if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {found=1} END {exit !found}'; then
+  echo "ERROR: another python train.py is already running"
+  exit 1
+fi
+rm -f run_champion_12h.log
+env \
+  LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
+  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
+  HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
+  HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
+  HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
+  HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
+  HYDRA_TIME_BUDGET=43200 \
+  HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
+  HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
+  HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
+  HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
+  HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
+  HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
+  HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
+  HYDRA_Z_LOSS_WEIGHT=0.001 \
+  HYDRA_RESUME_CKPT=none \
+  ./.venv/bin/python -u train.py > run_champion_12h.log 2>&1
+# Capture the trainer's status before echo overwrites $?, then propagate it so
+# wrappers (cron, CI, && chains) can see a failed run.
+rc=$?
+echo "exit=$rc"
+exit "$rc"

overlay/scripts/train_champion_5h.sh ADDED Viewed

	@@ -0,0 +1,45 @@

+#!/bin/bash
+# 5-hour champion training — fresh start with properly-timed cosine schedule.
+#
+# Why not 12h: at 12h budget, the cosine LR stays near peak for the first
+# ~6h, leaving the model thrashing around bpb~1.72 (plateau observed).
+# The schedule is stretched too thin.
+#
+# Why 5h: 18000s is long enough to build capacity (~17000 steps at 30k tps)
+# while letting the cosine actually decay to zero within the window. The
+# "cooling" phase (last 20% = 1h) is where the bpb drops sharply below
+# the 10-min champion's 1.62.
+#
+# Why not resume from latest.pt: the saved ckpt triggers NaN on first
+# forward after resume (reproducible; ckpt/optimizer state incompatibility
+# not worth debugging — fresh start is faster).
+#
+# Output goes to run_champion_5h.log in the repo root.
+# Exit status: 1 when a pre-flight check fails, otherwise the exit status of
+# train.py (also printed as "exit=<code>").
+set -u
+REPO=/home/mikeb/work/feather
+# No `set -e` here: a failed cd would otherwise fall through and run the
+# rm/python commands below in the caller's directory.
+cd "$REPO" || { echo "ERROR: cannot cd to $REPO"; exit 1; }
+# Bail if another training is running
+if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {f=1} END{exit !f}'; then
+  echo "ERROR: another python train.py is running"
+  exit 1
+fi
+rm -f run_champion_5h.log
+env \
+  LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
+  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
+  HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
+  HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
+  HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
+  HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
+  HYDRA_TIME_BUDGET=18000 \
+  HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
+  HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
+  HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
+  HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
+  HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
+  HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
+  HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
+  HYDRA_Z_LOSS_WEIGHT=0.001 \
+  HYDRA_RESUME_CKPT=none \
+  ./.venv/bin/python -u train.py > run_champion_5h.log 2>&1
+# Capture the trainer's status before echo overwrites $?, then propagate it so
+# wrappers (cron, CI, && chains) can see a failed run.
+rc=$?
+echo "exit=$rc"
+exit "$rc"

overlay/scripts/train_champion_resume.sh ADDED Viewed

	@@ -0,0 +1,38 @@

+#!/bin/bash
+# Resume the original 12h run from its step-5000 checkpoint with the SAME
+# budget (43200s). This keeps the optimizer state and LR schedule identical
+# to what was running at ckpt save, so there's no mismatch between loaded
+# momentum and new lr.
+#
+# Intent: validate that the resume path itself works (vs the failed warmstart
+# attempts where budget change caused NaN on first step).
+#
+# Output goes to run_champion_resume.log in the repo root.
+# Exit status: 1 when a pre-flight check fails, otherwise the exit status of
+# train.py (also printed as "exit=<code>").
+set -u
+REPO=/home/mikeb/work/feather
+# No `set -e` here: a failed cd would otherwise fall through and run the
+# rm/python commands below in the caller's directory.
+cd "$REPO" || { echo "ERROR: cannot cd to $REPO"; exit 1; }
+# Bail if another training is running
+if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {f=1} END{exit !f}'; then
+  echo "ERROR: another python train.py is running"
+  exit 1
+fi
+# Fail fast when there is nothing to resume from, matching the checkpoint
+# guard in the weights-only resume scripts.
+CKPT=/home/mikeb/.cache/autoresearch/latest.pt
+if [ ! -f "$CKPT" ]; then
+  echo "ERROR: $CKPT missing."
+  exit 1
+fi
+rm -f run_champion_resume.log
+env \
+  LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
+  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
+  HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
+  HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
+  HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
+  HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
+  HYDRA_TIME_BUDGET=43200 \
+  HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
+  HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
+  HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
+  HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
+  HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
+  HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
+  HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
+  HYDRA_Z_LOSS_WEIGHT=0.001 \
+  HYDRA_RESUME_CKPT="$CKPT" \
+  ./.venv/bin/python -u train.py > run_champion_resume.log 2>&1
+# Capture the trainer's status before echo overwrites $?, then propagate it so
+# wrappers (cron, CI, && chains) can see a failed run.
+rc=$?
+echo "exit=$rc"
+exit "$rc"

overlay/scripts/train_champion_resume_clean.sh ADDED Viewed

	@@ -0,0 +1,43 @@

+#!/bin/bash
+# Resume training from weights-only ckpt (optimizer state stripped) to
+# avoid the reproducible NaN that plain resume triggers.
+#
+# The step/train_seconds/epoch are also reset to 0 so the LR schedule
+# warmup runs cleanly and cosine decay matches the new TIME_BUDGET.
+# Model weights carry over ~2500 steps of prior training.
+#
+# Output goes to run_champion_resume_clean.log in the repo root.
+# Exit status: 1 when a pre-flight check fails, otherwise the exit status of
+# train.py (also printed as "exit=<code>").
+set -u
+REPO=/home/mikeb/work/feather
+# No `set -e` here: a failed cd would otherwise fall through and run the
+# rm/python commands below in the caller's directory.
+cd "$REPO" || { echo "ERROR: cannot cd to $REPO"; exit 1; }
+# Bail if another training is running
+if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {f=1} END{exit !f}'; then
+  echo "ERROR: another python train.py is running"
+  exit 1
+fi
+# The weights-only checkpoint must exist before launching.
+CKPT=/home/mikeb/.cache/autoresearch/weights_only_clean.pt
+if [ ! -f "$CKPT" ]; then
+  echo "ERROR: $CKPT missing. Run scripts/strip_optimizer_state.py first."
+  exit 1
+fi
+rm -f run_champion_resume_clean.log
+env \
+  LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
+  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
+  HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
+  HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
+  HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
+  HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
+  HYDRA_TIME_BUDGET=18000 \
+  HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
+  HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
+  HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
+  HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
+  HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
+  HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
+  HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
+  HYDRA_Z_LOSS_WEIGHT=0.001 \
+  HYDRA_RESUME_CKPT="$CKPT" \
+  ./.venv/bin/python -u train.py > run_champion_resume_clean.log 2>&1
+# Capture the trainer's status before echo overwrites $?, then propagate it so
+# wrappers (cron, CI, && chains) can see a failed run.
+rc=$?
+echo "exit=$rc"
+exit "$rc"

overlay/scripts/train_champion_v2.sh ADDED Viewed

	@@ -0,0 +1,54 @@

+#!/bin/bash
+# Champion training v2 — fixes data pipeline + mode collapse.
+#
+# Diagnosis from step-3500 ckpt sampling:
+#   - Greedy decoding collapses to "a whole grains, etc." attractor
+#   - Top-p produces grammatical but factually-empty text
+#   - Token cache being built on-the-fly; blend sources were silently
+#     unavailable because HYDRA_LOCAL_SHARDS_ONLY=1 + no cached parquets
+#   - FULL_BLEND has only 4 active sources (fineweb-edu, wikipedia,
+#     cosmopedia, fineweb), all weight-0 for code/math
+#
+# Fixes:
+#   A) HYDRA_LOCAL_SHARDS_ONLY=0  → stream directly from HF Hub
+#   B) HYDRA_BACKGROUND_PREFETCH=1 → download remaining shards in BG
+#   C) HYDRA_ENTROPY_PENALTY=0.01 → break single-attractor mode collapse
+#   D) HYDRA_LABEL_SMOOTHING=0.1  → soft targets discourage peaked dist
+#   E) Resume from weights_only_clean.pt (inherit prior training)
+#
+# Output goes to run_champion_v2.log in the repo root.
+# Exit status: 1 when a pre-flight check fails, otherwise the exit status of
+# train.py (also printed as "exit=<code>").
+set -u
+REPO=/home/mikeb/work/feather
+# No `set -e` here: a failed cd would otherwise fall through and run the
+# rm/python commands below in the caller's directory.
+cd "$REPO" || { echo "ERROR: cannot cd to $REPO"; exit 1; }
+# Bail if another training is running
+if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {f=1} END{exit !f}'; then
+  echo "ERROR: another python train.py is running"
+  exit 1
+fi
+# The weights-only checkpoint must exist before launching.
+CKPT=/home/mikeb/.cache/autoresearch/weights_only_clean.pt
+if [ ! -f "$CKPT" ]; then
+  echo "ERROR: $CKPT missing."
+  exit 1
+fi
+rm -f run_champion_v2.log
+env \
+  LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
+  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
+  HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
+  HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
+  HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
+  HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
+  HYDRA_TIME_BUDGET=18000 \
+  HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
+  HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
+  HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
+  HYDRA_LOCAL_SHARDS_ONLY=0 HYDRA_BACKGROUND_PREFETCH=1 \
+  HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
+  HYDRA_ENTROPY_PENALTY=0.01 HYDRA_LABEL_SMOOTHING=0.1 \
+  HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
+  HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
+  HYDRA_Z_LOSS_WEIGHT=0.001 \
+  HYDRA_RESUME_CKPT="$CKPT" \
+  ./.venv/bin/python -u train.py > run_champion_v2.log 2>&1
+# Capture the trainer's status before echo overwrites $?, then propagate it so
+# wrappers (cron, CI, && chains) can see a failed run.
+rc=$?
+echo "exit=$rc"
+exit "$rc"

overlay/scripts/train_champion_warmstart.sh ADDED Viewed

	@@ -0,0 +1,47 @@

+#!/bin/bash
+# Warm-start from the 12h champion training's latest.pt, with a TIGHTER
+# total budget so the cosine LR decay actually kicks in.
+#
+# Problem: The plain 12h run (43200s) keeps lr near peak (1.1e-2) for the
+# first ~6h, leaving the model thrashing around its local min (bpb ~1.72
+# rolling avg from step 2700 onward). User correctly pointed out the
+# schedule shape for a long budget wastes time in exploration.
+#
+# Fix: resume the already-trained weights (step ~5000, train_seconds ~5600)
+# but run with HYDRA_TIME_BUDGET=20000 (5.5h total). The scheduler treats
+# loaded train_seconds=5600 as "already 28% through" a 20000s budget, so
+# lr decays from ~1.05e-2 now to near-zero over the next 4h — the "cooling"
+# phase that produces the stable low-bpb endpoint.
+#
+# Total additional wall-clock: ~4h. Previous checkpoints are preserved
+# (ckpt rotations keep latest.pt, latest.pt.1, etc.).
+set -u
+REPO=/home/mikeb/work/feather
+cd "$REPO"
+if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {f=1} END{exit !f}'; then
+  echo "ERROR: another python train.py is running"
+  exit 1
+fi
+rm -f run_champion_warmstart.log
+env \
+  LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
+  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
+  HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
+  HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
+  HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
+  HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
+  HYDRA_TIME_BUDGET=20000 \
+  HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
+  HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
+  HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
+  HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
+  HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
+  HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
+  HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
+  HYDRA_Z_LOSS_WEIGHT=0.001 \
+  HYDRA_RESUME_CKPT=/home/mikeb/.cache/autoresearch/latest.pt \
+  ./.venv/bin/python -u train.py > run_champion_warmstart.log 2>&1
+echo "exit=$?"

overlay/scripts/wsl_bootstrap_tps.sh ADDED Viewed

	@@ -0,0 +1,68 @@

+#!/usr/bin/env bash
+set -euo pipefail
+# Bootstrap a WSL CUDA Python env capable of running train.py TPS checks.
+# Usage:
+#   bash scripts/wsl_bootstrap_tps.sh [cuda-tag]
+# Example:
+#   bash scripts/wsl_bootstrap_tps.sh cu121
+CUDA_TAG="${1:-cu121}"
+PYTHON_BIN="${PYTHON_BIN:-python3}"
+VENV_DIR="${VENV_DIR:-.venv-wsl}"
+if ! grep -qiE "microsoft|wsl" /proc/version 2>/dev/null; then
+  echo "[bootstrap] warning: not running inside WSL; continuing anyway"
+fi
+if ! command -v nvidia-smi >/dev/null 2>&1; then
+  echo "[bootstrap] error: nvidia-smi not found. Install NVIDIA driver + WSL GPU support first."
+  exit 1
+fi
+if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
+  echo "[bootstrap] error: Python binary not found: $PYTHON_BIN"
+  exit 1
+fi
+"$PYTHON_BIN" -m venv "$VENV_DIR"
+source "$VENV_DIR/bin/activate"
+python -m pip install --upgrade pip wheel setuptools
+case "$CUDA_TAG" in
+  cu118)
+    TORCH_INDEX_URL="https://download.pytorch.org/whl/cu118"
+    ;;
+  cu121)
+    TORCH_INDEX_URL="https://download.pytorch.org/whl/cu121"
+    ;;
+  cu124)
+    TORCH_INDEX_URL="https://download.pytorch.org/whl/cu124"
+    ;;
+  *)
+    echo "[bootstrap] error: unsupported cuda tag '$CUDA_TAG' (supported: cu118, cu121, cu124)"
+    exit 1
+    ;;
+esac
+python -m pip install "torch" --index-url "$TORCH_INDEX_URL"
+python -m pip install -e ".[dev]"
+# IMPORTANT: --no-build-isolation keeps pip from pulling torch-cpu into an
+# isolated build env, which would break mamba-ssm extension builds.
+python -m pip install "causal-conv1d>=1.4.0" --no-build-isolation
+python -m pip install "mamba-ssm" --no-build-isolation
+python - <<'PY'
+import torch
+print(f"[bootstrap] torch={torch.__version__}")
+print(f"[bootstrap] torch_cuda={torch.version.cuda}")
+print(f"[bootstrap] cuda_available={torch.cuda.is_available()}")
+if not torch.cuda.is_available():
+    raise SystemExit("[bootstrap] error: CUDA not available to torch")
+import mamba_ssm  # noqa: F401
+print("[bootstrap] mamba_ssm import OK")
+PY
+echo "[bootstrap] done. Activate env with: source $VENV_DIR/bin/activate"

overlay/subsystems/htm.py CHANGED Viewed

@@ -29,40 +29,38 @@ copy is small compared to the SP/TM compute.
 from __future__ import annotations
 import time
-from concurrent.futures import ThreadPoolExecutor
 import numpy as np
 import torch
 import torch.nn as nn
-import htm_rust
-# step_many releases the GIL for the whole pass, so multiple threads can
-# truly run regions in parallel — wall-clock scales with B up to CPU cores.
-_HTM_HAS_STEP_MANY = hasattr(htm_rust.HTMRegion, "step_many")
 # GPU backend: built with `maturin develop --features gpu`. One CUDA region
 # per batch slot, persistent device state for SP synapses. Transparent
 # fallback to CPU when not available.
-_HTM_HAS_GPU = hasattr(htm_rust, "HTMRegionGpu")
 # Zero-copy CUDA path: consumes torch CUDA tensors directly via the
 # __cuda_array_interface__ protocol, skipping the sdr.cpu()/numpy round-trip
 # and the D2H of outputs. Huge win when the input SDR already lives on GPU
 # (which is the train.py hot path — retina is a device buffer).
-_HTM_HAS_CAI = _HTM_HAS_GPU and hasattr(htm_rust.HTMRegionGpu, "step_many_cuda")
 # Fused megakernel path: collapses all T timesteps + SP + TM into a single
 # CUDA launch per forward. Replaces global top-K with per-column threshold
 # inhibition (see htm_rust/docs/GPU_HTM.md §Fused Kernel).
 # Enabled by default when available; set HYDRA_HTM_FUSED=0 to opt out.
 import os as _os_fused
-_HTM_HAS_FUSED = _HTM_HAS_GPU and hasattr(htm_rust.HTMRegionGpu, "step_many_fused_cuda")
-_HTM_GPU_FUSED_RUNTIME = bool(
-    _HTM_HAS_FUSED and hasattr(htm_rust, "gpu_fused_available") and htm_rust.gpu_fused_available()
-)
-_HTM_USE_FUSED = (
-    _HTM_HAS_FUSED
-    and _HTM_GPU_FUSED_RUNTIME
-    and bool(int(_os_fused.environ.get("HYDRA_HTM_FUSED", "1")))
-)
 class HTMLayer(nn.Module):
@@ -87,11 +85,11 @@ class HTMLayer(nn.Module):
         learn: bool = True,
         reset_each_forward: bool = True,
         use_gpu: bool | None = None,
-    ) -> None:
-        super().__init__()
-        self.input_bits = input_bits
-        self.n_columns = n_columns
-        self.cells_per_column = cells_per_column
         self.learn = learn
         self.reset_each_forward = reset_each_forward
         self._seed_base = seed
@@ -101,39 +99,27 @@ class HTMLayer(nn.Module):
         # converges since the EMA accumulates over many calls. Env:
         # HYDRA_HTM_LEARN_EVERY=N (default 1 = every forward; values < 1 are clamped to 1).
         import os as _os
-        self._learn_every = max(1, int(_os.environ.get("HYDRA_HTM_LEARN_EVERY", "1")))
-        self._forward_counter = 0
-        # GPU backend gate. Default: auto-detect — use GPU when the pyo3
-        # module was built with --features gpu AND CUDA is actually usable.
-        if use_gpu is None:
-            use_gpu = _HTM_HAS_GPU and torch.cuda.is_available()
-        elif use_gpu and not _HTM_HAS_GPU:
-            raise RuntimeError(
-                "HTMLayer(use_gpu=True) but htm_rust was not built with "
-                "--features gpu. Re-run `maturin develop --features gpu`."
-            )
-        self._use_gpu = bool(use_gpu)
-        self._gpu_fallback = _os.environ.get("HYDRA_HTM_GPU_FALLBACK", "1") == "1"
-        cls = htm_rust.HTMRegionGpu if self._use_gpu else htm_rust.HTMRegion
-        self._region_cls = cls
-        try:
-            self._regions = [
-                cls(input_bits, n_columns, cells_per_column, seed + i)
-                for i in range(batch_size)
-            ]
-        except RuntimeError as e:
-            if not self._use_gpu or not self._gpu_fallback:
-                raise
-            print(
-                f"[htm] GPU region init failed ({e}); falling back to CPU HTMRegion",
-                flush=True,
-            )
-            self._use_gpu = False
-            self._region_cls = htm_rust.HTMRegion
-            self._regions = [
-                self._region_cls(input_bits, n_columns, cells_per_column, seed + i)
-                for i in range(batch_size)
-            ]
         self.register_buffer("_dummy", torch.zeros(1), persistent=False)
         import os as _os
         self._htm_pool = ThreadPoolExecutor(max_workers=min(_os.cpu_count() or 4, 16))
@@ -278,12 +264,12 @@ class HTMLayer(nn.Module):
             # grid.y = B processes all regions concurrently — ~B× speedup.
             # Falls back to sequential dispatch if the batched entry isn't
             # available (older htm_rust wheel).
-            if _HTM_USE_FUSED and hasattr(htm_rust, "step_batch_fused_cuda"):
                 # Slice self._regions to match B: _ensure_regions may have
                 # allocated more regions than the current batch size needs
                 # (e.g. factual eval uses smaller batches than training).
                 try:
-                    htm_rust.step_batch_fused_cuda(
                         self._regions[:B],
                         [sdr_u8[b].__cuda_array_interface__ for b in range(B)],
                         [cols_out[b].__cuda_array_interface__ for b in range(B)],

 from __future__ import annotations
 import time
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any
 import numpy as np
 import torch
 import torch.nn as nn
+import htm_rust
+_HTM_REGION: Any = getattr(htm_rust, "HTMRegion", None)
+_HTM_REGION_GPU: Any = getattr(htm_rust, "HTMRegionGpu", None)
+_HTM_STEP_BATCH_FUSED_CUDA: Any = getattr(htm_rust, "step_batch_fused_cuda", None)
+# step_many releases the GIL for the whole pass, so multiple threads can
+# truly run regions in parallel — wall-clock scales with B up to CPU cores.
+_HTM_HAS_STEP_MANY = hasattr(_HTM_REGION, "step_many")
 # GPU backend: built with `maturin develop --features gpu`. One CUDA region
 # per batch slot, persistent device state for SP synapses. Transparent
 # fallback to CPU when not available.
+_HTM_HAS_GPU = hasattr(htm_rust, "HTMRegionGpu")
 # Zero-copy CUDA path: consumes torch CUDA tensors directly via the
 # __cuda_array_interface__ protocol, skipping the sdr.cpu()/numpy round-trip
 # and the D2H of outputs. Huge win when the input SDR already lives on GPU
 # (which is the train.py hot path — retina is a device buffer).
+_HTM_HAS_CAI = _HTM_HAS_GPU and hasattr(_HTM_REGION_GPU, "step_many_cuda")
 # Fused megakernel path: collapses all T timesteps + SP + TM into a single
 # CUDA launch per forward. Replaces global top-K with per-column threshold
 # inhibition (see htm_rust/docs/GPU_HTM.md §Fused Kernel).
 # Enabled by default when available; set HYDRA_HTM_FUSED=0 to opt out.
 import os as _os_fused
+_HTM_HAS_FUSED = _HTM_HAS_GPU and hasattr(_HTM_REGION_GPU, "step_many_fused_cuda")
+_HTM_USE_FUSED = _HTM_HAS_FUSED and bool(int(_os_fused.environ.get("HYDRA_HTM_FUSED", "1")))
 class HTMLayer(nn.Module):
         learn: bool = True,
         reset_each_forward: bool = True,
         use_gpu: bool | None = None,
+    ) -> None:
+        super().__init__()
+        self.input_bits = input_bits
+        self.n_columns = n_columns
+        self.cells_per_column = cells_per_column
         self.learn = learn
         self.reset_each_forward = reset_each_forward
         self._seed_base = seed
         # converges since the EMA accumulates over many calls. Env:
         # HYDRA_HTM_LEARN_EVERY=N (default 1 = every forward; values < 1 are clamped to 1).
         import os as _os
+        self._learn_every = max(1, int(_os.environ.get("HYDRA_HTM_LEARN_EVERY", "1")))
+        self._forward_counter = 0
+        force_cpu = _os.environ.get("HYDRA_FORCE_HTM_CPU", "0") == "1"
+        # GPU backend gate. Default: auto-detect — use GPU when the pyo3
+        # module was built with --features gpu AND CUDA is actually usable.
+        if use_gpu is None:
+            use_gpu = (not force_cpu) and _HTM_HAS_GPU and torch.cuda.is_available()
+        elif use_gpu and not _HTM_HAS_GPU:
+            raise RuntimeError(
+                "HTMLayer(use_gpu=True) but htm_rust was not built with "
+                "--features gpu. Re-run `maturin develop --features gpu`."
+            )
+        elif use_gpu and force_cpu:
+            use_gpu = False
+        self._use_gpu = bool(use_gpu)
+        cls = _HTM_REGION_GPU if self._use_gpu else _HTM_REGION
+        self._region_cls = cls
+        self._regions = [
+            cls(input_bits, n_columns, cells_per_column, seed + i)
+            for i in range(batch_size)
+        ]
         self.register_buffer("_dummy", torch.zeros(1), persistent=False)
         import os as _os
         self._htm_pool = ThreadPoolExecutor(max_workers=min(_os.cpu_count() or 4, 16))
             # grid.y = B processes all regions concurrently — ~B× speedup.
             # Falls back to sequential dispatch if the batched entry isn't
             # available (older htm_rust wheel).
+            if _HTM_USE_FUSED and _HTM_STEP_BATCH_FUSED_CUDA is not None:
                 # Slice self._regions to match B: _ensure_regions may have
                 # allocated more regions than the current batch size needs
                 # (e.g. factual eval uses smaller batches than training).
                 try:
+                    _HTM_STEP_BATCH_FUSED_CUDA(
                         self._regions[:B],
                         [sdr_u8[b].__cuda_array_interface__ for b in range(B)],
                         [cols_out[b].__cuda_array_interface__ for b in range(B)],