Spaces:
Runtime error
Runtime error
Update Feather training runtime image
Browse files- overlay/configs/harness_config.py +47 -17
- overlay/harness/eval_agent.py +188 -60
- overlay/harness/orchestrator.py +16 -13
- overlay/htm_rust/build.rs +27 -35
- overlay/htm_rust/src/gpu/fused.rs +87 -93
- overlay/htm_rust/src/gpu/kernels/htm_fused_step.cu +77 -77
- overlay/hydra/engram.py +73 -75
- overlay/hydra/eval.py +8 -1
- overlay/hydra/model.py +21 -6
- overlay/hydra/training.py +46 -5
- overlay/prepare_nemotron.py +187 -232
- overlay/pyproject.toml +1 -0
- overlay/scripts/autoresearch_iter.sh +144 -0
- overlay/scripts/benchmark_hyena_stack.py +50 -29
- overlay/scripts/export_hpo_priors.py +74 -0
- overlay/scripts/hpo_orchestrator.py +319 -0
- overlay/scripts/launch_feather_hf_job.py +145 -110
- overlay/scripts/long_train.sh +38 -38
- overlay/scripts/optuna_hpo.py +725 -0
- overlay/scripts/parse_metrics.py +24 -0
- overlay/scripts/run_domain_expanded_pretrain.sh +262 -262
- overlay/scripts/run_meta.sh +13 -13
- overlay/scripts/run_phase1.sh +32 -32
- overlay/scripts/run_phase2.sh +25 -25
- overlay/scripts/run_tps_gate.sh +23 -0
- overlay/scripts/setup.sh +28 -27
- overlay/scripts/strip_optimizer_state.py +29 -0
- overlay/scripts/sweep_depth_aggregate.py +141 -45
- overlay/scripts/sweep_depth_local.sh +62 -62
- overlay/scripts/train_champion_12h.sh +50 -0
- overlay/scripts/train_champion_5h.sh +45 -0
- overlay/scripts/train_champion_resume.sh +38 -0
- overlay/scripts/train_champion_resume_clean.sh +43 -0
- overlay/scripts/train_champion_v2.sh +54 -0
- overlay/scripts/train_champion_warmstart.sh +47 -0
- overlay/scripts/wsl_bootstrap_tps.sh +68 -0
- overlay/subsystems/htm.py +43 -57
overlay/configs/harness_config.py
CHANGED
|
@@ -1,10 +1,13 @@
|
|
| 1 |
-
"""Harness configuration for HYDRA's self-evolving outer loop."""
|
| 2 |
-
from typing import Literal
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
-
from pydantic import BaseModel, Field
|
| 5 |
|
| 6 |
-
|
| 7 |
-
class HarnessConfig(BaseModel):
|
| 8 |
"""Configuration for the HYDRA harness behavior."""
|
| 9 |
|
| 10 |
# Inner loop
|
|
@@ -47,15 +50,19 @@ class HarnessConfig(BaseModel):
|
|
| 47 |
default=5.0, description="Max % regression from best known val_bpb"
|
| 48 |
)
|
| 49 |
|
| 50 |
-
# Keep/discard criteria
|
| 51 |
-
primary_metric: str = "val_bpb"
|
| 52 |
-
secondary_metrics:
|
| 53 |
-
default_factory=lambda: {
|
| 54 |
-
"mhc_spectral_norm": {"max": 2.0},
|
| 55 |
-
"engram_hit_rate": {"min": 0.1},
|
| 56 |
-
"
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
# Experiment execution
|
| 61 |
experiment_timeout: int = Field(
|
|
@@ -73,6 +80,29 @@ class HarnessConfig(BaseModel):
|
|
| 73 |
gate_mhc_spectral_norm: float | None = Field(
|
| 74 |
default=None, description="Max mhc_spectral_norm for keep (None=disabled)"
|
| 75 |
)
|
| 76 |
-
gate_engram_hit_rate: float | None = Field(
|
| 77 |
-
default=None, description="Min engram_hit_rate for keep (None=disabled)"
|
| 78 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Harness configuration for HYDRA's self-evolving outer loop."""
|
| 2 |
+
from typing import Literal
|
| 3 |
+
|
| 4 |
+
from pydantic import BaseModel, Field
|
| 5 |
+
|
| 6 |
+
type GateThresholds = dict[str, float]
|
| 7 |
+
type GateConfig = dict[str, GateThresholds]
|
| 8 |
|
|
|
|
| 9 |
|
| 10 |
+
class HarnessConfig(BaseModel):
|
|
|
|
| 11 |
"""Configuration for the HYDRA harness behavior."""
|
| 12 |
|
| 13 |
# Inner loop
|
|
|
|
| 50 |
default=5.0, description="Max % regression from best known val_bpb"
|
| 51 |
)
|
| 52 |
|
| 53 |
+
# Keep/discard criteria
|
| 54 |
+
primary_metric: str = "val_bpb"
|
| 55 |
+
secondary_metrics: GateConfig = Field(
|
| 56 |
+
default_factory=lambda: {
|
| 57 |
+
"mhc_spectral_norm": {"max": 2.0},
|
| 58 |
+
"engram_hit_rate": {"min": 0.1},
|
| 59 |
+
"factual_english_score": {"min": 0.5},
|
| 60 |
+
"instruction_following_score": {"min": 0.5},
|
| 61 |
+
"distinct_2": {"min": 0.1},
|
| 62 |
+
"repetition_rate": {"max": 0.2},
|
| 63 |
+
"hestia_quant_error": {"max": 0.05},
|
| 64 |
+
}
|
| 65 |
+
)
|
| 66 |
|
| 67 |
# Experiment execution
|
| 68 |
experiment_timeout: int = Field(
|
|
|
|
| 80 |
gate_mhc_spectral_norm: float | None = Field(
|
| 81 |
default=None, description="Max mhc_spectral_norm for keep (None=disabled)"
|
| 82 |
)
|
| 83 |
+
gate_engram_hit_rate: float | None = Field(
|
| 84 |
+
default=None, description="Min engram_hit_rate for keep (None=disabled)"
|
| 85 |
+
)
|
| 86 |
+
gate_tps_median: float | None = Field(
|
| 87 |
+
default=None,
|
| 88 |
+
description="Min steady-state tps_median for keep (None=disabled)",
|
| 89 |
+
)
|
| 90 |
+
gate_tps_p10: float | None = Field(
|
| 91 |
+
default=None,
|
| 92 |
+
description="Min steady-state tps_p10 for keep (None=disabled)",
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
def to_secondary_gates(self) -> GateConfig:
    """Return the effective keep/discard gates.

    Starts from a per-metric copy of ``secondary_metrics`` and then applies
    every ``gate_*`` field that is set (not ``None``) on top of it, so the
    stored defaults are never mutated.
    """
    active = {}
    for name, bounds in self.secondary_metrics.items():
        active[name] = bounds.copy()

    # (metric, bound, configured limit) — applied in this order.
    overrides = (
        ("mhc_spectral_norm", "max", self.gate_mhc_spectral_norm),
        ("engram_hit_rate", "min", self.gate_engram_hit_rate),
        ("tps_median", "min", self.gate_tps_median),
        ("tps_p10", "min", self.gate_tps_p10),
    )
    for name, bound, limit in overrides:
        if limit is None:
            # None means this override is disabled; keep the default as-is.
            continue
        active.setdefault(name, {})[bound] = limit

    return active
|
overlay/harness/eval_agent.py
CHANGED
|
@@ -1,10 +1,15 @@
|
|
| 1 |
-
"""Eval agent: parse run.log and extract metrics from training runs."""
|
| 2 |
-
import re
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
@dataclass
|
| 7 |
-
class ExperimentResult:
|
| 8 |
"""Parsed result from a single experiment run.
|
| 9 |
|
| 10 |
All float fields default to 0.0; integer fields default to 0.
|
|
@@ -23,19 +28,38 @@ class ExperimentResult:
|
|
| 23 |
peak_vram_mb: float = 0.0
|
| 24 |
mfu_percent: float = 0.0
|
| 25 |
|
| 26 |
-
# Throughput
|
| 27 |
-
total_tokens_m: float = 0.0
|
| 28 |
-
num_steps: int = 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
# Model shape (echoed by train.py summary block)
|
| 31 |
num_params_m: float = 0.0
|
| 32 |
n_layer: int = 0
|
| 33 |
d_model: int = 0
|
| 34 |
|
| 35 |
-
# Secondary health metrics
|
| 36 |
-
mhc_spectral_norm: float = 0.0
|
| 37 |
-
engram_hit_rate: float = 0.0
|
| 38 |
-
sr_bypass_rate: float = 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
# Status
|
| 41 |
crashed: bool = False
|
|
@@ -56,12 +80,48 @@ _PATTERNS: dict[str, str] = {
|
|
| 56 |
"n_layer": r"^n_layer:\s+(\d+)",
|
| 57 |
"d_model": r"^d_model:\s+(\d+)",
|
| 58 |
"mhc_spectral_norm": r"^mhc_spectral_norm:\s+([\d.]+)",
|
| 59 |
-
"engram_hit_rate": r"^engram_hit_rate:\s+([\d.]+)",
|
| 60 |
-
"sr_bypass_rate": r"^sr_bypass_rate:\s+([\d.]+)",
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
# Attributes that should be parsed as int rather than float.
|
| 64 |
-
_INT_ATTRS: frozenset[str] = frozenset(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
|
| 67 |
def parse_run_log(log_path: str) -> ExperimentResult:
|
|
@@ -84,22 +144,60 @@ def parse_run_log(log_path: str) -> ExperimentResult:
|
|
| 84 |
result.error_message = f"Log file not found: {log_path}"
|
| 85 |
return result
|
| 86 |
|
| 87 |
-
# Detect crash signals in output.
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
"""Check secondary metrics against fixed alarm thresholds.
|
| 104 |
|
| 105 |
Args:
|
|
@@ -118,19 +216,44 @@ def check_secondary_alarms(result: ExperimentResult) -> list[str]:
|
|
| 118 |
alarms.append(
|
| 119 |
f"engram_hit_rate={result.engram_hit_rate:.4f} < 0.1 (memory underused)"
|
| 120 |
)
|
| 121 |
-
if 0 < result.mfu_percent < 10:
|
| 122 |
-
alarms.append(
|
| 123 |
-
f"mfu_percent={result.mfu_percent:.2f}% < 10% (GPU underutilized)"
|
| 124 |
-
)
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
"""Decide whether to keep or discard an experiment.
|
| 135 |
|
| 136 |
The primary criterion is strictly lower val_bpb than the current best.
|
|
@@ -154,19 +277,24 @@ def should_keep(
|
|
| 154 |
if result.val_bpb >= best_bpb:
|
| 155 |
return False, "discard"
|
| 156 |
|
| 157 |
-
# Secondary gate checks.
|
| 158 |
-
if gates:
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Eval agent: parse run.log and extract metrics from training runs."""
|
| 2 |
+
import re
|
| 3 |
+
import statistics
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
type GateThresholds = dict[str, float]
|
| 8 |
+
type GateConfig = dict[str, GateThresholds]
|
| 9 |
|
| 10 |
|
| 11 |
@dataclass
|
| 12 |
+
class ExperimentResult:
|
| 13 |
"""Parsed result from a single experiment run.
|
| 14 |
|
| 15 |
All float fields default to 0.0; integer fields default to 0.
|
|
|
|
| 28 |
peak_vram_mb: float = 0.0
|
| 29 |
mfu_percent: float = 0.0
|
| 30 |
|
| 31 |
+
# Throughput
|
| 32 |
+
total_tokens_m: float = 0.0
|
| 33 |
+
num_steps: int = 0
|
| 34 |
+
tps_median: float = 0.0
|
| 35 |
+
tps_p10: float = 0.0
|
| 36 |
+
tps_min: float = 0.0
|
| 37 |
+
tps_max: float = 0.0
|
| 38 |
+
tps_samples: int = 0
|
| 39 |
|
| 40 |
# Model shape (echoed by train.py summary block)
|
| 41 |
num_params_m: float = 0.0
|
| 42 |
n_layer: int = 0
|
| 43 |
d_model: int = 0
|
| 44 |
|
| 45 |
+
# Secondary health metrics
|
| 46 |
+
mhc_spectral_norm: float = 0.0
|
| 47 |
+
engram_hit_rate: float = 0.0
|
| 48 |
+
sr_bypass_rate: float = 0.0
|
| 49 |
+
|
| 50 |
+
# Evaluation breadth metrics
|
| 51 |
+
factual_english_score: float = 0.0
|
| 52 |
+
instruction_following_score: float = 0.0
|
| 53 |
+
distinct_1: float = 0.0
|
| 54 |
+
distinct_2: float = 0.0
|
| 55 |
+
repetition_rate: float = 0.0
|
| 56 |
+
repetition_bigram_rate: float = 0.0
|
| 57 |
+
calibration_ece: float = 0.0
|
| 58 |
+
calibration_brier: float = 0.0
|
| 59 |
+
calibration_accuracy: float = 0.0
|
| 60 |
+
calibration_tokens: int = 0
|
| 61 |
+
eval_seed: int = 0
|
| 62 |
+
eval_seed_group: str = ""
|
| 63 |
|
| 64 |
# Status
|
| 65 |
crashed: bool = False
|
|
|
|
| 80 |
"n_layer": r"^n_layer:\s+(\d+)",
|
| 81 |
"d_model": r"^d_model:\s+(\d+)",
|
| 82 |
"mhc_spectral_norm": r"^mhc_spectral_norm:\s+([\d.]+)",
|
| 83 |
+
"engram_hit_rate": r"^engram_hit_rate:\s+([\d.]+)",
|
| 84 |
+
"sr_bypass_rate": r"^sr_bypass_rate:\s+([\d.]+)",
|
| 85 |
+
"factual_english_score": r"^factual_english_score:\s+([\d.]+)",
|
| 86 |
+
"instruction_following_score": r"^instruction_following_score:\s+([\d.]+)",
|
| 87 |
+
"distinct_1": r"^distinct_1:\s+([\d.]+)",
|
| 88 |
+
"distinct_2": r"^distinct_2:\s+([\d.]+)",
|
| 89 |
+
"repetition_rate": r"^repetition_rate:\s+([\d.]+)",
|
| 90 |
+
"repetition_bigram_rate": r"^repetition_bigram_rate:\s+([\d.]+)",
|
| 91 |
+
"calibration_ece": r"^calibration_ece:\s+([\d.]+)",
|
| 92 |
+
"calibration_brier": r"^calibration_brier:\s*([\d.]+)",
|
| 93 |
+
"calibration_accuracy": r"^calibration_accuracy:\s+([\d.]+)",
|
| 94 |
+
"calibration_tokens": r"^calibration_tokens:\s+(\d+)",
|
| 95 |
+
"eval_seed": r"^eval_seed:\s+(\d+)",
|
| 96 |
+
"eval_seed_group": r"^eval_seed_group:\s+(.+)",
|
| 97 |
+
}
|
| 98 |
|
| 99 |
# Attributes that should be parsed as int rather than float.
|
| 100 |
+
_INT_ATTRS: frozenset[str] = frozenset(
|
| 101 |
+
{
|
| 102 |
+
"num_steps",
|
| 103 |
+
"n_layer",
|
| 104 |
+
"d_model",
|
| 105 |
+
"calibration_tokens",
|
| 106 |
+
"eval_seed",
|
| 107 |
+
}
|
| 108 |
+
)
|
| 109 |
+
_STR_ATTRS: frozenset[str] = frozenset({"eval_seed_group"})
|
| 110 |
+
_STEP_TPS_PATTERN = re.compile(r"step=(\d+).*?\btps=(\d+)\b")
|
| 111 |
+
_TPS_PATTERN = re.compile(r"\btps=(\d+)\b")
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def _percentile_linear(sorted_values: list[float], pct: float) -> float:
    """Return the ``pct``-th percentile of ``sorted_values`` by linear interpolation.

    Args:
        sorted_values: Samples already sorted in ascending order.
        pct: Requested percentile. Values outside ``[0, 100]`` are clamped
            to that range.

    Returns:
        The interpolated percentile, ``0.0`` for an empty list, or the sole
        element for a single-sample list.
    """
    if not sorted_values:
        return 0.0
    last = len(sorted_values) - 1
    if last == 0:
        return sorted_values[0]
    # Clamp so an out-of-range pct can neither index past the end (pct > 100)
    # nor extrapolate below the minimum via a negative fraction (pct < 0).
    pct = min(max(pct, 0.0), 100.0)
    rank = last * (pct / 100.0)
    lo = int(rank)
    hi = min(lo + 1, last)
    frac = rank - lo
    return sorted_values[lo] * (1.0 - frac) + sorted_values[hi] * frac
|
| 125 |
|
| 126 |
|
| 127 |
def parse_run_log(log_path: str) -> ExperimentResult:
|
|
|
|
| 144 |
result.error_message = f"Log file not found: {log_path}"
|
| 145 |
return result
|
| 146 |
|
| 147 |
+
# Detect crash signals in output. Keep this strict to avoid false positives
|
| 148 |
+
# from benign log lines that include "error" in a non-fatal context.
|
| 149 |
+
if (
|
| 150 |
+
"Traceback" in content
|
| 151 |
+
or "\nFAIL\n" in content
|
| 152 |
+
or "[TPS_GUARD] FAIL" in content
|
| 153 |
+
or "raise SystemExit(1)" in content
|
| 154 |
+
):
|
| 155 |
+
result.crashed = True
|
| 156 |
+
lines = content.strip().splitlines()
|
| 157 |
+
result.error_message = "\n".join(lines[-20:])
|
| 158 |
+
|
| 159 |
+
for attr, pattern in _PATTERNS.items():
|
| 160 |
+
match = re.search(pattern, content, re.MULTILINE)
|
| 161 |
+
if match:
|
| 162 |
+
raw = match.group(1)
|
| 163 |
+
if attr in _INT_ATTRS:
|
| 164 |
+
setattr(result, attr, int(raw))
|
| 165 |
+
elif attr in _STR_ATTRS:
|
| 166 |
+
setattr(result, attr, raw.strip())
|
| 167 |
+
else:
|
| 168 |
+
setattr(result, attr, float(raw))
|
| 169 |
+
|
| 170 |
+
warmup_steps = 10
|
| 171 |
+
warmup_match = re.search(r"\[TPS_GUARD\] enabled .*?warmup_steps=(\d+)", content)
|
| 172 |
+
if warmup_match:
|
| 173 |
+
warmup_steps = int(warmup_match.group(1))
|
| 174 |
+
|
| 175 |
+
step_tps_samples: list[tuple[int, int]] = []
|
| 176 |
+
for m in _STEP_TPS_PATTERN.finditer(content):
|
| 177 |
+
step_tps_samples.append((int(m.group(1)), int(m.group(2))))
|
| 178 |
+
|
| 179 |
+
tps_values: list[float] = []
|
| 180 |
+
if step_tps_samples:
|
| 181 |
+
for step, tps in step_tps_samples:
|
| 182 |
+
if step >= warmup_steps:
|
| 183 |
+
tps_values.append(float(tps))
|
| 184 |
+
if not tps_values:
|
| 185 |
+
tps_values = [float(tps) for _, tps in step_tps_samples]
|
| 186 |
+
else:
|
| 187 |
+
tps_values = [float(m.group(1)) for m in _TPS_PATTERN.finditer(content)]
|
| 188 |
+
|
| 189 |
+
if tps_values:
|
| 190 |
+
sorted_tps = sorted(tps_values)
|
| 191 |
+
result.tps_samples = len(tps_values)
|
| 192 |
+
result.tps_median = float(statistics.median(tps_values))
|
| 193 |
+
result.tps_p10 = float(_percentile_linear(sorted_tps, 10.0))
|
| 194 |
+
result.tps_min = float(sorted_tps[0])
|
| 195 |
+
result.tps_max = float(sorted_tps[-1])
|
| 196 |
+
|
| 197 |
+
return result
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def check_secondary_alarms(result: ExperimentResult) -> list[str]:
|
| 201 |
"""Check secondary metrics against fixed alarm thresholds.
|
| 202 |
|
| 203 |
Args:
|
|
|
|
| 216 |
alarms.append(
|
| 217 |
f"engram_hit_rate={result.engram_hit_rate:.4f} < 0.1 (memory underused)"
|
| 218 |
)
|
| 219 |
+
if 0 < result.mfu_percent < 10:
|
| 220 |
+
alarms.append(
|
| 221 |
+
f"mfu_percent={result.mfu_percent:.2f}% < 10% (GPU underutilized)"
|
| 222 |
+
)
|
| 223 |
+
if result.calibration_ece > 0.35:
|
| 224 |
+
alarms.append(
|
| 225 |
+
f"calibration_ece={result.calibration_ece:.4f} > 0.35 (poor calibration)"
|
| 226 |
+
)
|
| 227 |
+
if result.tps_median > 0 and result.tps_median < 50000:
|
| 228 |
+
alarms.append(
|
| 229 |
+
f"tps_median={result.tps_median:.0f} < 50000 (throughput below A10 objective)"
|
| 230 |
+
)
|
| 231 |
+
|
| 232 |
+
return alarms
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def _check_gate(
    result: ExperimentResult,
    gates: GateConfig,
    metric: str,
) -> tuple[bool, str] | None:
    """Evaluate a single min/max gate against an ExperimentResult metric.

    Args:
        result: Parsed experiment result whose attribute ``metric`` is checked.
        gates: Mapping of metric name to its ``{"min": ..., "max": ...}`` bounds.
        metric: Name of the metric (and of the attribute on ``result``).

    Returns:
        ``(False, reason)`` when the metric violates its ``max`` or ``min``
        bound (``max`` is checked first). ``None`` when the gate passes, when
        no gate is configured for ``metric``, or when ``result`` has no such
        attribute.
    """
    gate = gates.get(metric)
    if not gate:
        # Nothing configured for this metric: nothing to enforce.
        return None

    value = getattr(result, metric, None)
    if value is None:
        # The gate names a metric that the result does not carry, so it
        # cannot be evaluated; skip it instead of raising AttributeError.
        return None

    max_value = gate.get("max")
    if max_value is not None and value > max_value:
        return False, f"{metric} {value:.4f} > gate {max_value}"

    min_value = gate.get("min")
    if min_value is not None and value < min_value:
        return False, f"{metric} {value:.4f} < gate {min_value}"

    return None
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def should_keep(
|
| 253 |
+
result: ExperimentResult,
|
| 254 |
+
best_bpb: float,
|
| 255 |
+
gates: GateConfig | None = None,
|
| 256 |
+
) -> tuple[bool, str]:
|
| 257 |
"""Decide whether to keep or discard an experiment.
|
| 258 |
|
| 259 |
The primary criterion is strictly lower val_bpb than the current best.
|
|
|
|
| 277 |
if result.val_bpb >= best_bpb:
|
| 278 |
return False, "discard"
|
| 279 |
|
| 280 |
+
# Secondary gate checks.
|
| 281 |
+
if gates:
|
| 282 |
+
gate_metrics = (
|
| 283 |
+
"mhc_spectral_norm",
|
| 284 |
+
"engram_hit_rate",
|
| 285 |
+
"factual_english_score",
|
| 286 |
+
"instruction_following_score",
|
| 287 |
+
"distinct_1",
|
| 288 |
+
"distinct_2",
|
| 289 |
+
"repetition_rate",
|
| 290 |
+
"repetition_bigram_rate",
|
| 291 |
+
"calibration_ece",
|
| 292 |
+
"tps_median",
|
| 293 |
+
"tps_p10",
|
| 294 |
+
)
|
| 295 |
+
for metric in gate_metrics:
|
| 296 |
+
gate_result = _check_gate(result, gates, metric)
|
| 297 |
+
if gate_result is not None:
|
| 298 |
+
return gate_result
|
| 299 |
+
|
| 300 |
+
return True, "keep"
|
overlay/harness/orchestrator.py
CHANGED
|
@@ -20,11 +20,12 @@ provides the infrastructure ("rails") that the autoresearch loop runs on.
|
|
| 20 |
"""
|
| 21 |
import argparse
|
| 22 |
import csv
|
| 23 |
-
import os
|
| 24 |
-
import subprocess
|
| 25 |
-
import time
|
| 26 |
-
|
| 27 |
-
from
|
|
|
|
| 28 |
from harness.git_utils import REPO_DIR, commit_all, current_commit_short, reset_to
|
| 29 |
from harness.health_monitor import check_health, reset_peak_stats
|
| 30 |
from harness.meta_agent import run_meta_iteration
|
|
@@ -144,12 +145,12 @@ def run_experiment(timeout: int = 600) -> str:
|
|
| 144 |
# ---------------------------------------------------------------------------
|
| 145 |
|
| 146 |
|
| 147 |
-
def run_loop(
|
| 148 |
-
meta_interval: int = 20,
|
| 149 |
-
max_experiments: int | None = None,
|
| 150 |
-
experiment_timeout: int = 600,
|
| 151 |
-
secondary_gates: dict | None = None,
|
| 152 |
-
) -> None:
|
| 153 |
"""Run the HYDRA autoresearch loop.
|
| 154 |
|
| 155 |
This function runs indefinitely (or until ``max_experiments`` is reached
|
|
@@ -162,8 +163,10 @@ def run_loop(
|
|
| 162 |
secondary_gates: Optional gate thresholds forwarded to
|
| 163 |
:func:`~harness.eval_agent.should_keep`.
|
| 164 |
"""
|
| 165 |
-
init_results_tsv()
|
| 166 |
-
|
|
|
|
|
|
|
| 167 |
experiment_num = count_experiments()
|
| 168 |
|
| 169 |
print(
|
|
|
|
| 20 |
"""
|
| 21 |
import argparse
|
| 22 |
import csv
|
| 23 |
+
import os
|
| 24 |
+
import subprocess
|
| 25 |
+
import time
|
| 26 |
+
|
| 27 |
+
from configs.harness_config import HarnessConfig
|
| 28 |
+
from harness.eval_agent import ExperimentResult, check_secondary_alarms, parse_run_log, should_keep
|
| 29 |
from harness.git_utils import REPO_DIR, commit_all, current_commit_short, reset_to
|
| 30 |
from harness.health_monitor import check_health, reset_peak_stats
|
| 31 |
from harness.meta_agent import run_meta_iteration
|
|
|
|
| 145 |
# ---------------------------------------------------------------------------
|
| 146 |
|
| 147 |
|
| 148 |
+
def run_loop(
|
| 149 |
+
meta_interval: int = 20,
|
| 150 |
+
max_experiments: int | None = None,
|
| 151 |
+
experiment_timeout: int = 600,
|
| 152 |
+
secondary_gates: dict[str, dict[str, float]] | None = None,
|
| 153 |
+
) -> None:
|
| 154 |
"""Run the HYDRA autoresearch loop.
|
| 155 |
|
| 156 |
This function runs indefinitely (or until ``max_experiments`` is reached
|
|
|
|
| 163 |
secondary_gates: Optional gate thresholds forwarded to
|
| 164 |
:func:`~harness.eval_agent.should_keep`.
|
| 165 |
"""
|
| 166 |
+
init_results_tsv()
|
| 167 |
+
if secondary_gates is None:
|
| 168 |
+
secondary_gates = HarnessConfig().to_secondary_gates()
|
| 169 |
+
best_bpb = _load_best_bpb()
|
| 170 |
experiment_num = count_experiments()
|
| 171 |
|
| 172 |
print(
|
overlay/htm_rust/build.rs
CHANGED
|
@@ -26,39 +26,37 @@ fn main() {
|
|
| 26 |
return;
|
| 27 |
}
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
"
|
| 35 |
-
"
|
| 36 |
-
"
|
|
|
|
|
|
|
| 37 |
"tm_predict",
|
| 38 |
"tm_activate",
|
| 39 |
"tm_learn",
|
| 40 |
-
"tm_punish",
|
| 41 |
-
"tm_grow",
|
| 42 |
-
"tm_anomaly",
|
| 43 |
-
"tm_reset",
|
| 44 |
-
];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
let kernels_dir = PathBuf::from("src/gpu/kernels");
|
| 47 |
-
for k in &kernels {
|
| 48 |
-
let src = kernels_dir.join(format!("{k}.cu"));
|
| 49 |
-
println!("cargo:rerun-if-changed={}", src.display());
|
| 50 |
-
}
|
| 51 |
|
| 52 |
-
let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR"));
|
| 53 |
-
let arch = env::var("HTM_CUDA_ARCH").unwrap_or_else(|_| "sm_90a".into());
|
| 54 |
-
let fused_supported = arch.starts_with("sm_90");
|
| 55 |
-
if fused_supported {
|
| 56 |
-
kernels.push("htm_fused_step");
|
| 57 |
-
}
|
| 58 |
-
println!(
|
| 59 |
-
"cargo:rustc-env=HTM_GPU_FUSED_AVAILABLE={}",
|
| 60 |
-
if fused_supported { "1" } else { "0" }
|
| 61 |
-
);
|
| 62 |
|
| 63 |
let nvcc = find_nvcc();
|
| 64 |
println!("cargo:warning=htm_rust: nvcc = {nvcc}");
|
|
@@ -81,7 +79,7 @@ fn main() {
|
|
| 81 |
// than the nvcc toolchain. Set HTM_PTX_VERSION to e.g. "7.8" or "8.0".
|
| 82 |
let ptx_version_override = env::var("HTM_PTX_VERSION").ok();
|
| 83 |
|
| 84 |
-
for k in kernels {
|
| 85 |
let src = kernels_dir.join(format!("{k}.cu"));
|
| 86 |
let ptx = out_dir.join(format!("{k}.ptx"));
|
| 87 |
if !src.exists() {
|
|
@@ -129,13 +127,7 @@ fn main() {
|
|
| 129 |
std::fs::write(&ptx, patched)
|
| 130 |
.unwrap_or_else(|e| panic!("write {} failed: {e}", ptx.display()));
|
| 131 |
}
|
| 132 |
-
}
|
| 133 |
-
|
| 134 |
-
if !fused_supported {
|
| 135 |
-
let fused_ptx = out_dir.join("htm_fused_step.ptx");
|
| 136 |
-
std::fs::write(&fused_ptx, "// fused kernel disabled for this CUDA arch\n")
|
| 137 |
-
.unwrap_or_else(|e| panic!("write {} failed: {e}", fused_ptx.display()));
|
| 138 |
-
}
|
| 139 |
|
| 140 |
// Export OUT_DIR for include_str! in Rust.
|
| 141 |
println!(
|
|
|
|
| 26 |
return;
|
| 27 |
}
|
| 28 |
|
| 29 |
+
let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR"));
|
| 30 |
+
let arch = env::var("HTM_CUDA_ARCH").unwrap_or_else(|_| "sm_90a".into());
|
| 31 |
+
|
| 32 |
+
// Base kernels — compile for any sm_80+ GPU. Each .cu file → one .ptx file.
|
| 33 |
+
let base_kernels: &[&str] = &[
|
| 34 |
+
"sp_overlap",
|
| 35 |
+
"sp_topk",
|
| 36 |
+
"sp_learn",
|
| 37 |
+
"sp_duty",
|
| 38 |
+
"sp_boost_fused",
|
| 39 |
"tm_predict",
|
| 40 |
"tm_activate",
|
| 41 |
"tm_learn",
|
| 42 |
+
"tm_punish",
|
| 43 |
+
"tm_grow",
|
| 44 |
+
"tm_anomaly",
|
| 45 |
+
"tm_reset",
|
| 46 |
+
];
|
| 47 |
+
|
| 48 |
+
// htm_fused_step now compiles for ALL architectures (sm_80+).
|
| 49 |
+
// On Hopper (sm_90+): uses cluster-distributed shared memory for hot state.
|
| 50 |
+
// On Ampere (sm_86) and other pre-Hopper: uses global memory reads/writes
|
| 51 |
+
// with grid.sync() for cross-block synchronization (cooperative launch).
|
| 52 |
+
let kernels: Vec<&str> = base_kernels.iter().chain(["htm_fused_step"].iter()).copied().collect();
|
| 53 |
|
| 54 |
let kernels_dir = PathBuf::from("src/gpu/kernels");
|
| 55 |
+
for k in &kernels {
|
| 56 |
+
let src = kernels_dir.join(format!("{k}.cu"));
|
| 57 |
+
println!("cargo:rerun-if-changed={}", src.display());
|
| 58 |
+
}
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
let nvcc = find_nvcc();
|
| 62 |
println!("cargo:warning=htm_rust: nvcc = {nvcc}");
|
|
|
|
| 79 |
// than the nvcc toolchain. Set HTM_PTX_VERSION to e.g. "7.8" or "8.0".
|
| 80 |
let ptx_version_override = env::var("HTM_PTX_VERSION").ok();
|
| 81 |
|
| 82 |
+
for k in kernels {
|
| 83 |
let src = kernels_dir.join(format!("{k}.cu"));
|
| 84 |
let ptx = out_dir.join(format!("{k}.ptx"));
|
| 85 |
if !src.exists() {
|
|
|
|
| 127 |
std::fs::write(&ptx, patched)
|
| 128 |
.unwrap_or_else(|e| panic!("write {} failed: {e}", ptx.display()));
|
| 129 |
}
|
| 130 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
// Export OUT_DIR for include_str! in Rust.
|
| 133 |
println!(
|
overlay/htm_rust/src/gpu/fused.rs
CHANGED
|
@@ -132,7 +132,12 @@ pub(crate) fn plan_fused_launch(
|
|
| 132 |
grid_cap_override: Option<u32>,
|
| 133 |
) -> Result<FusedLaunchPlan, String> {
|
| 134 |
let sm_count = sm_count.max(1);
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
// Cluster launch path: cooperative launch is not required. Keep the probe
|
| 138 |
// result for residency estimation only.
|
|
@@ -140,11 +145,10 @@ pub(crate) fn plan_fused_launch(
|
|
| 140 |
eprintln!("[htm_rust] INFO: cooperative launch unsupported; cluster path only.");
|
| 141 |
}
|
| 142 |
|
| 143 |
-
//
|
| 144 |
-
//
|
| 145 |
-
// this for debugging but should not exceed 16 for cluster correctness.
|
| 146 |
let default_grid_cap = 16u32;
|
| 147 |
-
let grid_cap = grid_cap_override.unwrap_or(default_grid_cap)
|
| 148 |
let resident_bound = if cooperative_grid_limit > 0 {
|
| 149 |
cooperative_grid_limit.max(sm_count * 2)
|
| 150 |
} else {
|
|
@@ -460,15 +464,21 @@ pub fn launch_fused(
|
|
| 460 |
return Err(DriverError(ret));
|
| 461 |
}
|
| 462 |
} else {
|
| 463 |
-
//
|
| 464 |
-
|
|
|
|
|
|
|
|
|
|
| 465 |
fused.raw_kernel.function,
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
0,
|
| 469 |
cu_stream,
|
| 470 |
-
|
| 471 |
-
)
|
|
|
|
|
|
|
|
|
|
| 472 |
}
|
| 473 |
}
|
| 474 |
|
|
@@ -503,41 +513,29 @@ pub(super) fn launch_fused_batched_raw(
|
|
| 503 |
assert_eq!(anom_per_region.len(), b);
|
| 504 |
assert!(b >= 1, "need at least one region");
|
| 505 |
|
| 506 |
-
// Reset per-region step_scratch before each launch.
|
| 507 |
-
for &rp in region_ptrs.iter() {
|
| 508 |
-
let r = unsafe { &mut *rp };
|
| 509 |
-
let
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
let dev = r.sp_gpu.dev_ref().clone();
|
| 514 |
-
dev.memset_zeros(&mut fused.step_scratch)?;
|
| 515 |
-
fused.iter_counter = fused.iter_counter.wrapping_add(1);
|
| 516 |
-
}
|
| 517 |
|
| 518 |
// Shared config — all regions use identical sp/tm parameters.
|
| 519 |
-
let (grid_x, block_x, function_batched, cu_stream, cu_ctx) = {
|
| 520 |
-
let r0 = unsafe { &*region_ptrs[0] };
|
| 521 |
-
|
| 522 |
-
.fused_state
|
| 523 |
-
.
|
| 524 |
-
.
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
let cfg = {
|
| 535 |
-
let r = unsafe { &*region_ptrs[0] };
|
| 536 |
-
let fused = r
|
| 537 |
-
.fused_state
|
| 538 |
-
.as_ref()
|
| 539 |
-
.expect("launch_fused_batched_raw requires fused_state");
|
| 540 |
-
FusedConfig {
|
| 541 |
input_bits: input_bits as u32,
|
| 542 |
n_columns: r.sp_gpu.n_columns_accessor() as u32,
|
| 543 |
synapses_per_col: r.sp_gpu.synapses_per_col_accessor() as u32,
|
|
@@ -562,41 +560,38 @@ pub(super) fn launch_fused_batched_raw(
|
|
| 562 |
initial_perm_i16: r.tm_gpu.initial_perm_i16 as i32,
|
| 563 |
t: t as u32,
|
| 564 |
learn: if learn { 1 } else { 0 },
|
| 565 |
-
iter_seed:
|
| 566 |
-
cooperative_grid_sync: 1,
|
| 567 |
-
}
|
| 568 |
-
};
|
| 569 |
|
| 570 |
// Build B FusedPtrs per-region.
|
| 571 |
-
let
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
step_scratch: *fused.step_scratch.device_ptr(),
|
| 598 |
-
});
|
| 599 |
-
}
|
| 600 |
|
| 601 |
// Upload FusedPtrs array to device (B * sizeof(FusedPtrs) bytes).
|
| 602 |
// FusedPtrs is repr(C) + DeviceRepr so htod_sync_copy handles it.
|
|
@@ -608,14 +603,10 @@ pub(super) fn launch_fused_batched_raw(
|
|
| 608 |
// Grid = (grid_x, B, 1) with cluster_dim=(16,1,1): each region (Y slice)
|
| 609 |
// occupies exactly one cluster of 16 blocks. All 8 clusters run concurrently
|
| 610 |
// on the H200's 132 SMs (8 × 16 = 128 blocks ≤ 132 SMs).
|
| 611 |
-
let use_cluster = {
|
| 612 |
-
let r0 = unsafe { &*region_ptrs[0] };
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
.as_ref()
|
| 616 |
-
.expect("launch_fused_batched_raw requires fused_state");
|
| 617 |
-
fused.cluster_info.max_cluster_size > 0
|
| 618 |
-
};
|
| 619 |
|
| 620 |
unsafe {
|
| 621 |
result::ctx::set_current(cu_ctx)?;
|
|
@@ -653,15 +644,18 @@ pub(super) fn launch_fused_batched_raw(
|
|
| 653 |
return Err(DriverError(ret));
|
| 654 |
}
|
| 655 |
} else {
|
| 656 |
-
//
|
| 657 |
-
|
| 658 |
function_batched,
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
0,
|
| 662 |
cu_stream,
|
| 663 |
-
|
| 664 |
-
)
|
|
|
|
|
|
|
|
|
|
| 665 |
}
|
| 666 |
}
|
| 667 |
|
|
|
|
| 132 |
grid_cap_override: Option<u32>,
|
| 133 |
) -> Result<FusedLaunchPlan, String> {
|
| 134 |
let sm_count = sm_count.max(1);
|
| 135 |
+
// 1024 threads/block exceeds the register file on Ampere (sm_86: 65536
|
| 136 |
+
// regs/SM ÷ 1024 = 64 regs/thread; fused kernel needs ~80+). 256 gives
|
| 137 |
+
// 256 regs/thread which is ample. Compensate with more blocks via
|
| 138 |
+
// cooperative launch. On Hopper (228 KB smem, 255 regs/thread baseline),
|
| 139 |
+
// 1024 works fine, but 256 is safe everywhere.
|
| 140 |
+
let block_dim_x = 256u32;
|
| 141 |
|
| 142 |
// Cluster launch path: cooperative launch is not required. Keep the probe
|
| 143 |
// result for residency estimation only.
|
|
|
|
| 145 |
eprintln!("[htm_rust] INFO: cooperative launch unsupported; cluster path only.");
|
| 146 |
}
|
| 147 |
|
| 148 |
+
// Tested grid_cap: 4 blocks = 30ms (too serial), 16 blocks = 10.8ms (parallel wins).
|
| 149 |
+
// Parallelism in SP overlap + TM predict stages outweighs grid.sync() cost.
|
|
|
|
| 150 |
let default_grid_cap = 16u32;
|
| 151 |
+
let grid_cap = grid_cap_override.unwrap_or(default_grid_cap);
|
| 152 |
let resident_bound = if cooperative_grid_limit > 0 {
|
| 153 |
cooperative_grid_limit.max(sm_count * 2)
|
| 154 |
} else {
|
|
|
|
| 464 |
return Err(DriverError(ret));
|
| 465 |
}
|
| 466 |
} else {
|
| 467 |
+
// Pre-Hopper: cooperative kernel launch. The fused kernel uses
|
| 468 |
+
// grid.sync() for cross-block synchronization which REQUIRES
|
| 469 |
+
// cuLaunchCooperativeKernel (normal launch silently crashes on
|
| 470 |
+
// the first grid.sync() call).
|
| 471 |
+
let ret = sys::lib().cuLaunchCooperativeKernel(
|
| 472 |
fused.raw_kernel.function,
|
| 473 |
+
grid_x, 1, 1,
|
| 474 |
+
block_x, 1, 1,
|
| 475 |
+
0, // sharedMemBytes
|
| 476 |
cu_stream,
|
| 477 |
+
kernel_params.as_mut_ptr(),
|
| 478 |
+
);
|
| 479 |
+
if ret != sys::CUresult::CUDA_SUCCESS {
|
| 480 |
+
return Err(DriverError(ret));
|
| 481 |
+
}
|
| 482 |
}
|
| 483 |
}
|
| 484 |
|
|
|
|
| 513 |
assert_eq!(anom_per_region.len(), b);
|
| 514 |
assert!(b >= 1, "need at least one region");
|
| 515 |
|
| 516 |
+
// Reset per-region step_scratch before each launch.
|
| 517 |
+
for &rp in region_ptrs.iter() {
|
| 518 |
+
let r = unsafe { &mut *rp };
|
| 519 |
+
let dev = r.sp_gpu.dev_ref().clone();
|
| 520 |
+
dev.memset_zeros(&mut r.fused_state.step_scratch)?;
|
| 521 |
+
r.fused_state.iter_counter = r.fused_state.iter_counter.wrapping_add(1);
|
| 522 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 523 |
|
| 524 |
// Shared config — all regions use identical sp/tm parameters.
|
| 525 |
+
let (grid_x, block_x, function_batched, cu_stream, cu_ctx) = {
|
| 526 |
+
let r0 = unsafe { &*region_ptrs[0] };
|
| 527 |
+
(
|
| 528 |
+
r0.fused_state.grid_dim_x,
|
| 529 |
+
r0.fused_state.block_dim_x,
|
| 530 |
+
r0.fused_state.raw_kernel.function_batched,
|
| 531 |
+
*r0.sp_gpu.dev_ref().cu_stream(),
|
| 532 |
+
*r0.sp_gpu.dev_ref().cu_primary_ctx(),
|
| 533 |
+
)
|
| 534 |
+
};
|
| 535 |
+
|
| 536 |
+
let cfg = {
|
| 537 |
+
let r = unsafe { &*region_ptrs[0] };
|
| 538 |
+
FusedConfig {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 539 |
input_bits: input_bits as u32,
|
| 540 |
n_columns: r.sp_gpu.n_columns_accessor() as u32,
|
| 541 |
synapses_per_col: r.sp_gpu.synapses_per_col_accessor() as u32,
|
|
|
|
| 560 |
initial_perm_i16: r.tm_gpu.initial_perm_i16 as i32,
|
| 561 |
t: t as u32,
|
| 562 |
learn: if learn { 1 } else { 0 },
|
| 563 |
+
iter_seed: r.fused_state.iter_counter,
|
| 564 |
+
cooperative_grid_sync: 1,
|
| 565 |
+
}
|
| 566 |
+
};
|
| 567 |
|
| 568 |
// Build B FusedPtrs per-region.
|
| 569 |
+
let ptrs_vec: Vec<FusedPtrs> = (0..b)
|
| 570 |
+
.map(|i| {
|
| 571 |
+
let r = unsafe { &*region_ptrs[i] };
|
| 572 |
+
FusedPtrs {
|
| 573 |
+
syn_bit: *r.sp_gpu.syn_bit_accessor().device_ptr(),
|
| 574 |
+
syn_perm: *r.sp_gpu.syn_perm_accessor().device_ptr(),
|
| 575 |
+
boost: *r.sp_gpu.boost_accessor().device_ptr(),
|
| 576 |
+
active_duty: *r.sp_gpu.active_duty_accessor().device_ptr(),
|
| 577 |
+
inhibition_threshold: *r.fused_state.inhibition_threshold.device_ptr(),
|
| 578 |
+
seg_cell_id: *r.tm_gpu.seg_cell_id_accessor().device_ptr(),
|
| 579 |
+
seg_syn_count: *r.tm_gpu.seg_syn_count_accessor().device_ptr(),
|
| 580 |
+
syn_presyn: *r.tm_gpu.syn_presyn_accessor().device_ptr(),
|
| 581 |
+
tm_syn_perm: *r.tm_gpu.syn_perm_accessor().device_ptr(),
|
| 582 |
+
cell_seg_count: *r.tm_gpu.cell_seg_count_accessor().device_ptr(),
|
| 583 |
+
cell_active_a: *r.fused_state.cell_active_bits_a.device_ptr(),
|
| 584 |
+
cell_active_b: *r.fused_state.cell_active_bits_b.device_ptr(),
|
| 585 |
+
cell_winner_a: *r.fused_state.cell_winner_bits_a.device_ptr(),
|
| 586 |
+
cell_winner_b: *r.fused_state.cell_winner_bits_b.device_ptr(),
|
| 587 |
+
inputs: inputs_per_region[i],
|
| 588 |
+
cols_out: cols_per_region[i],
|
| 589 |
+
anom_out: anom_per_region[i],
|
| 590 |
+
barrier_counters: 0u64, // ABI-compat dummy; cluster barrier replaces DLB.
|
| 591 |
+
step_scratch: *r.fused_state.step_scratch.device_ptr(),
|
| 592 |
+
}
|
| 593 |
+
})
|
| 594 |
+
.collect();
|
|
|
|
|
|
|
|
|
|
| 595 |
|
| 596 |
// Upload FusedPtrs array to device (B * sizeof(FusedPtrs) bytes).
|
| 597 |
// FusedPtrs is repr(C) + DeviceRepr so htod_sync_copy handles it.
|
|
|
|
| 603 |
// Grid = (grid_x, B, 1) with cluster_dim=(16,1,1): each region (Y slice)
|
| 604 |
// occupies exactly one cluster of 16 blocks. All 8 clusters run concurrently
|
| 605 |
// on the H200's 132 SMs (8 × 16 = 128 blocks ≤ 132 SMs).
|
| 606 |
+
let use_cluster = {
|
| 607 |
+
let r0 = unsafe { &*region_ptrs[0] };
|
| 608 |
+
r0.fused_state.cluster_info.max_cluster_size > 0
|
| 609 |
+
};
|
|
|
|
|
|
|
|
|
|
|
|
|
| 610 |
|
| 611 |
unsafe {
|
| 612 |
result::ctx::set_current(cu_ctx)?;
|
|
|
|
| 644 |
return Err(DriverError(ret));
|
| 645 |
}
|
| 646 |
} else {
|
| 647 |
+
// Pre-Hopper: cooperative kernel launch (grid.sync() requires it).
|
| 648 |
+
let ret = sys::lib().cuLaunchCooperativeKernel(
|
| 649 |
function_batched,
|
| 650 |
+
grid_x, b as u32, 1,
|
| 651 |
+
block_x, 1, 1,
|
| 652 |
+
0, // sharedMemBytes
|
| 653 |
cu_stream,
|
| 654 |
+
kernel_params.as_mut_ptr(),
|
| 655 |
+
);
|
| 656 |
+
if ret != sys::CUresult::CUDA_SUCCESS {
|
| 657 |
+
return Err(DriverError(ret));
|
| 658 |
+
}
|
| 659 |
}
|
| 660 |
}
|
| 661 |
|
overlay/htm_rust/src/gpu/kernels/htm_fused_step.cu
CHANGED
|
@@ -124,13 +124,21 @@ struct FusedConfig {
|
|
| 124 |
//
|
| 125 |
// The flags / expected / phase / cooperative_grid_sync parameters are kept
|
| 126 |
// in the signature for call-site compatibility but are unused.
|
| 127 |
-
__device__ static inline void fused_grid_barrier(cg::grid_group
|
| 128 |
unsigned int * /* flags — unused */,
|
| 129 |
unsigned int /* expected — unused */,
|
| 130 |
unsigned int /* phase — unused */,
|
| 131 |
unsigned int /* cooperative_grid_sync — unused */) {
|
|
|
|
|
|
|
| 132 |
auto cluster = cg::this_cluster();
|
| 133 |
cluster.sync();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
}
|
| 135 |
|
| 136 |
__device__ static inline unsigned int warp_sum_u32(unsigned int v) {
|
|
@@ -187,17 +195,26 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
|
|
| 187 |
// DSMEM: Cluster-distributed shared memory for hot per-column
|
| 188 |
// state (inhibition_threshold, boost, active_duty).
|
| 189 |
//
|
| 190 |
-
// Each block in the cluster owns a contiguous
|
| 191 |
-
//
|
| 192 |
-
//
|
| 193 |
-
// cluster.map_shared_rank(ptr, owner_block_rank)[offset].
|
| 194 |
//
|
| 195 |
-
//
|
| 196 |
-
//
|
|
|
|
|
|
|
| 197 |
// =========================================================
|
|
|
|
|
|
|
|
|
|
| 198 |
auto cluster = cg::this_cluster();
|
| 199 |
const unsigned int cluster_block_rank = cluster.block_rank(); // 0..cluster_size-1
|
| 200 |
const unsigned int cluster_sz = cluster.num_blocks(); // == gridDim.x (≤16)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
// Partition n_cols evenly across cluster blocks.
|
| 203 |
// Each block owns cols_per_block columns starting at my_col_start.
|
|
@@ -209,27 +226,27 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
|
|
| 209 |
(my_col_start + cols_per_block < n_cols)
|
| 210 |
? (my_col_start + cols_per_block) : n_cols; // clamp
|
| 211 |
|
|
|
|
| 212 |
// Cluster-distributed shared memory arrays.
|
| 213 |
// Each block holds at most COLS_PER_CLUSTER_BLOCK_MAX floats per array.
|
| 214 |
// Peer blocks address into each other's smem via map_shared_rank.
|
| 215 |
__shared__ float s_inhib_thr [COLS_PER_CLUSTER_BLOCK_MAX];
|
| 216 |
__shared__ float s_boost [COLS_PER_CLUSTER_BLOCK_MAX];
|
| 217 |
__shared__ float s_active_duty[COLS_PER_CLUSTER_BLOCK_MAX];
|
|
|
|
| 218 |
|
| 219 |
-
// TMA multicast input staging tile (T9).
|
| 220 |
-
//
|
| 221 |
-
// On Hopper (sm_90a), cg::memcpy_async with cluster scope issues a single
|
| 222 |
-
// TMA DMA that multicasts the source data to all 16 SMs in the cluster
|
| 223 |
-
// simultaneously — replacing ~16 per-block GMEM reads per timestep with a
|
| 224 |
-
// single hardware DMA. After cg::wait(cluster) every SM's s_input_tile
|
| 225 |
-
// is populated identically without any additional DRAM traffic.
|
| 226 |
-
//
|
| 227 |
-
// Fallback: when cfg.input_bits > INPUT_BITS_MAX the tile is bypassed
|
| 228 |
-
// and each thread reads directly from GMEM (original path).
|
| 229 |
//
|
| 230 |
-
//
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
__shared__ __align__(16) unsigned char s_input_tile[INPUT_BITS_MAX];
|
|
|
|
| 232 |
|
|
|
|
| 233 |
// Initial GMEM → smem load (reads state from previous forward call).
|
| 234 |
// Each block loads only its own slice; tid strides across the slice.
|
| 235 |
for (unsigned int c = my_col_start + tid; c < my_col_end; c += blockDim.x) {
|
|
@@ -242,6 +259,11 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
|
|
| 242 |
// All blocks in the cluster must finish loading before any block
|
| 243 |
// starts reading peer smem inside the T-loop.
|
| 244 |
cluster.sync();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
|
| 246 |
const unsigned int S = cfg.synapses_per_col;
|
| 247 |
const unsigned int cpc = cfg.cells_per_column;
|
|
@@ -307,32 +329,19 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
|
|
| 307 |
// Ordering: BARRIER 1 completes before we issue the DMA.
|
| 308 |
// The DMA completes before Stage A reads s_input_tile.
|
| 309 |
// =========================================================
|
|
|
|
| 310 |
const bool use_input_tile = (cfg.input_bits <= INPUT_BITS_MAX);
|
| 311 |
if (use_input_tile) {
|
| 312 |
-
// Thread-block scope async copy: each SM independently loads
|
| 313 |
-
// its own input tile from GMEM into shared memory.
|
| 314 |
-
//
|
| 315 |
-
// NOTE: CUDA 12.1's cooperative_groups::memcpy_async() rejects
|
| 316 |
-
// cluster_group at compile time (static_assert in async.h:171).
|
| 317 |
-
// True TMA multicast (single DMA for all 16 SMs in the cluster)
|
| 318 |
-
// would require raw PTX cp.async.bulk.tensor with multicast mode,
|
| 319 |
-
// which needs cuTensorMap descriptors on the host side (T11).
|
| 320 |
-
//
|
| 321 |
-
// This per-SM path still gives a meaningful win: it converts
|
| 322 |
-
// the original per-synapse scattered GMEM reads (random access
|
| 323 |
-
// pattern hitting multiple cache lines) into one sequential DMA
|
| 324 |
-
// per SM, improving L2 hit rate and hardware prefetcher
|
| 325 |
-
// effectiveness. The cluster.sync() below ensures all SMs in
|
| 326 |
-
// the cluster have finished loading before any SM enters Stage A.
|
| 327 |
auto tb = cg::this_thread_block();
|
| 328 |
cg::memcpy_async(tb, s_input_tile,
|
| 329 |
inputs + inp_off,
|
| 330 |
cfg.input_bits);
|
| 331 |
cg::wait(tb);
|
| 332 |
-
// Cluster barrier: all 16 SMs must have loaded their tile
|
| 333 |
-
// before any SM begins reading s_input_tile in Stage A.
|
| 334 |
cluster.sync();
|
| 335 |
}
|
|
|
|
|
|
|
|
|
|
| 336 |
|
| 337 |
// =========================================================
|
| 338 |
// STAGE A: Spatial Pooler
|
|
@@ -350,22 +359,31 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
|
|
| 350 |
float p = syn_perm[base + s];
|
| 351 |
// T9: read from cluster-broadcast tile when available;
|
| 352 |
// fall back to direct GMEM when input_bits > INPUT_BITS_MAX.
|
|
|
|
| 353 |
unsigned int inp_byte = use_input_tile
|
| 354 |
? (unsigned int)s_input_tile[b]
|
| 355 |
: (unsigned int)inputs[inp_off + b];
|
|
|
|
|
|
|
|
|
|
| 356 |
unsigned int hit = ((inp_byte != 0u) && (p >= cfg.conn_thr)) ? 1u : 0u;
|
| 357 |
local += hit;
|
| 358 |
}
|
| 359 |
unsigned int overlap = warp_sum_u32(local);
|
| 360 |
overlap = __shfl_sync(0xffffffffu, overlap, 0);
|
| 361 |
|
| 362 |
-
//
|
| 363 |
-
|
|
|
|
| 364 |
const unsigned int owner_block = c / cols_per_block;
|
| 365 |
const unsigned int owner_offset = c - owner_block * cols_per_block;
|
| 366 |
-
|
| 367 |
float boost_val = cluster.map_shared_rank(s_boost, owner_block)[owner_offset];
|
| 368 |
float thr = cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
|
| 370 |
float boosted = (float)overlap * boost_val;
|
| 371 |
unsigned int is_active = (boosted > thr) ? 1u : 0u;
|
|
@@ -383,9 +401,13 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
|
|
| 383 |
for (unsigned int s = lane; s < S; s += 32u) {
|
| 384 |
unsigned int b = syn_bit[base + s];
|
| 385 |
float p = syn_perm[base + s];
|
|
|
|
| 386 |
unsigned int inp_byte = use_input_tile
|
| 387 |
? (unsigned int)s_input_tile[b]
|
| 388 |
: (unsigned int)inputs[inp_off + b];
|
|
|
|
|
|
|
|
|
|
| 389 |
if (inp_byte != 0u) {
|
| 390 |
p += cfg.sp_inc;
|
| 391 |
if (p > 1.0f) p = 1.0f;
|
|
@@ -398,15 +420,20 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
|
|
| 398 |
}
|
| 399 |
|
| 400 |
// active_duty EMA + threshold adaptation.
|
| 401 |
-
// Writes go to both
|
| 402 |
-
// and GMEM (persistence across forward calls).
|
| 403 |
if (lane == 0) {
|
|
|
|
| 404 |
float ad = cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset];
|
|
|
|
|
|
|
|
|
|
| 405 |
float sample = is_active ? 1.0f : 0.0f;
|
| 406 |
ad = (1.0f - cfg.duty_alpha) * ad + cfg.duty_alpha * sample;
|
| 407 |
|
|
|
|
| 408 |
// Writeback: peer smem (for next timestep read) + GMEM (persistence).
|
| 409 |
cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad;
|
|
|
|
| 410 |
active_duty[c] = ad;
|
| 411 |
|
| 412 |
// Threshold steers toward target sparsity.
|
|
@@ -415,50 +442,23 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
|
|
| 415 |
if (new_thr < 0.1f) new_thr = 0.1f;
|
| 416 |
if (new_thr > 1000.0f) new_thr = 1000.0f;
|
| 417 |
|
|
|
|
| 418 |
// Writeback: peer smem (for next timestep read) + GMEM (persistence).
|
| 419 |
cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] = new_thr;
|
|
|
|
| 420 |
inhibition_threshold[c] = new_thr;
|
| 421 |
}
|
| 422 |
}
|
| 423 |
|
| 424 |
// ---- DSMEM WRITEBACK SYNC: peer-smem writes must be visible cluster-wide ----
|
| 425 |
//
|
| 426 |
-
//
|
| 427 |
-
//
|
| 428 |
-
//
|
| 429 |
-
//
|
| 430 |
-
|
| 431 |
-
//
|
| 432 |
-
// READ SITES (Stage A of the NEXT timestep t+1):
|
| 433 |
-
// Line 290: cluster.map_shared_rank(s_boost, owner_block)[owner_offset] (read)
|
| 434 |
-
// Line 291: cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] (read)
|
| 435 |
-
// Line 323: cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] (read)
|
| 436 |
-
//
|
| 437 |
-
// PARTITION MISMATCH (root cause of T8 staleness):
|
| 438 |
-
// cols_per_block = ceil(n_cols / cluster_sz) [smem partition]
|
| 439 |
-
// col_lo/col_hi = floor(gwarp*n_cols/n_warps) [gwarp work partition]
|
| 440 |
-
// These are NOT identical — up to 1 column can spill across partition boundaries.
|
| 441 |
-
// Example: n_cols=1000, cluster_sz=16 → cols_per_block=63, block 1 col_lo=62
|
| 442 |
-
// → block 1 processes column 62 but column 62 belongs to block 0's smem slice.
|
| 443 |
-
// → block 1 issues a PEER WRITE to block 0's s_inhib_thr / s_active_duty.
|
| 444 |
-
//
|
| 445 |
-
// RACE WITHOUT SYNC:
|
| 446 |
-
// Blocks run Stage A concurrently. Block 1 writes block 0's smem at column 62.
|
| 447 |
-
// Block 0 may simultaneously READ s_inhib_thr[62] for its own column 62 in
|
| 448 |
-
// Stage A of the same timestep → concurrent peer write + local read → undefined.
|
| 449 |
-
// Additionally, without cluster.sync() after all peer writes complete, block 0's
|
| 450 |
-
// t+1 Stage A reads might observe t-1 values still cached in its smem.
|
| 451 |
-
//
|
| 452 |
-
// FIX: cluster.sync() here, AFTER Stage A's per-column loop, ensures:
|
| 453 |
-
// 1. All peer smem writes from this timestep are globally visible to all blocks.
|
| 454 |
-
// 2. No block can enter Stage B (or start t+1 Stage A) with stale smem values.
|
| 455 |
-
// 3. GMEM writes (lines 329, 339) are already committed to L2; __threadfence()
|
| 456 |
-
// below ensures they are visible to all SMs before the cluster barrier.
|
| 457 |
-
//
|
| 458 |
-
// ORDERING: write → cluster.sync() here → __threadfence() → cluster.sync() in
|
| 459 |
-
// fused_grid_barrier → next-timestep reads. Both visibility guarantees
|
| 460 |
-
// are now satisfied.
|
| 461 |
cluster.sync();
|
|
|
|
| 462 |
|
| 463 |
// ---- BARRIER 2: SP active_mask must be visible before TM reads ----
|
| 464 |
// Fence: flush cols_out + active_duty + inhibition_threshold + step_scratch
|
|
@@ -660,7 +660,7 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
|
|
| 660 |
}
|
| 661 |
|
| 662 |
// Single-region kernel (legacy call site).
|
| 663 |
-
__global__
|
| 664 |
void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
|
| 665 |
htm_fused_step_body(P, cfg);
|
| 666 |
}
|
|
@@ -668,7 +668,7 @@ void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
|
|
| 668 |
// Batched kernel: one cooperative launch for B regions. grid.y = B,
|
| 669 |
// grid.x = per-region block count. Each block reads its region's
|
| 670 |
// FusedPtrs from the device array via blockIdx.y.
|
| 671 |
-
__global__
|
| 672 |
void htm_fused_step_batched(const FusedPtrs* __restrict__ P_arr, FusedConfig cfg) {
|
| 673 |
const FusedPtrs P = P_arr[blockIdx.y];
|
| 674 |
htm_fused_step_body(P, cfg);
|
|
|
|
| 124 |
//
|
| 125 |
// The flags / expected / phase / cooperative_grid_sync parameters are kept
|
| 126 |
// in the signature for call-site compatibility but are unused.
|
| 127 |
+
__device__ static inline void fused_grid_barrier(cg::grid_group grid,
|
| 128 |
unsigned int * /* flags — unused */,
|
| 129 |
unsigned int /* expected — unused */,
|
| 130 |
unsigned int /* phase — unused */,
|
| 131 |
unsigned int /* cooperative_grid_sync — unused */) {
|
| 132 |
+
#if __CUDA_ARCH__ >= 900
|
| 133 |
+
// Hopper+ : hardware cluster barrier (~10-40 ns)
|
| 134 |
auto cluster = cg::this_cluster();
|
| 135 |
cluster.sync();
|
| 136 |
+
#else
|
| 137 |
+
// Pre-Hopper (sm_80, sm_86, sm_89): grid-level cooperative sync.
|
| 138 |
+
// Requires cooperative kernel launch. ~us-ms range, adequate for HTM
|
| 139 |
+
// workload (kernel launch frequency is low).
|
| 140 |
+
grid.sync();
|
| 141 |
+
#endif
|
| 142 |
}
|
| 143 |
|
| 144 |
__device__ static inline unsigned int warp_sum_u32(unsigned int v) {
|
|
|
|
| 195 |
// DSMEM: Cluster-distributed shared memory for hot per-column
|
| 196 |
// state (inhibition_threshold, boost, active_duty).
|
| 197 |
//
|
| 198 |
+
// On Hopper (sm_90+): Each block in the cluster owns a contiguous
|
| 199 |
+
// slice of columns in its own __shared__ arrays. Any block can
|
| 200 |
+
// peer-read another block's slice via cluster.map_shared_rank().
|
|
|
|
| 201 |
//
|
| 202 |
+
// On Ampere (sm_86) and other pre-Hopper: No cluster support.
|
| 203 |
+
// Read/write directly from/to global memory (inhibition_threshold,
|
| 204 |
+
// boost, active_duty device pointers). Slightly higher latency but
|
| 205 |
+
// functionally correct.
|
| 206 |
// =========================================================
|
| 207 |
+
|
| 208 |
+
#if __CUDA_ARCH__ >= 900
|
| 209 |
+
// Hopper+ cluster path
|
| 210 |
auto cluster = cg::this_cluster();
|
| 211 |
const unsigned int cluster_block_rank = cluster.block_rank(); // 0..cluster_size-1
|
| 212 |
const unsigned int cluster_sz = cluster.num_blocks(); // == gridDim.x (≤16)
|
| 213 |
+
#else
|
| 214 |
+
// Pre-Hopper: no cluster, each block is independent.
|
| 215 |
+
const unsigned int cluster_block_rank = blockIdx.x;
|
| 216 |
+
const unsigned int cluster_sz = gridDim.x;
|
| 217 |
+
#endif
|
| 218 |
|
| 219 |
// Partition n_cols evenly across cluster blocks.
|
| 220 |
// Each block owns cols_per_block columns starting at my_col_start.
|
|
|
|
| 226 |
(my_col_start + cols_per_block < n_cols)
|
| 227 |
? (my_col_start + cols_per_block) : n_cols; // clamp
|
| 228 |
|
| 229 |
+
#if __CUDA_ARCH__ >= 900
|
| 230 |
// Cluster-distributed shared memory arrays.
|
| 231 |
// Each block holds at most COLS_PER_CLUSTER_BLOCK_MAX floats per array.
|
| 232 |
// Peer blocks address into each other's smem via map_shared_rank.
|
| 233 |
__shared__ float s_inhib_thr [COLS_PER_CLUSTER_BLOCK_MAX];
|
| 234 |
__shared__ float s_boost [COLS_PER_CLUSTER_BLOCK_MAX];
|
| 235 |
__shared__ float s_active_duty[COLS_PER_CLUSTER_BLOCK_MAX];
|
| 236 |
+
#endif
|
| 237 |
|
| 238 |
+
// TMA multicast input staging tile (T9) — HOPPER ONLY.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
//
|
| 240 |
+
// On Hopper: each block stages the input into its own smem tile with a
|
| 241 |
+
// thread-block-scope cg::memcpy_async (no cluster multicast); Stage A then reads smem.
|
| 242 |
+
// On Ampere: 32 KB smem allocation exceeds per-block budget when
|
| 243 |
+
// cooperatively launched (48 KB total, registers eat the rest). Skip the
|
| 244 |
+
// tile entirely — Stage A reads from GMEM directly (original path).
|
| 245 |
+
#if __CUDA_ARCH__ >= 900
|
| 246 |
__shared__ __align__(16) unsigned char s_input_tile[INPUT_BITS_MAX];
|
| 247 |
+
#endif
|
| 248 |
|
| 249 |
+
#if __CUDA_ARCH__ >= 900
|
| 250 |
// Initial GMEM → smem load (reads state from previous forward call).
|
| 251 |
// Each block loads only its own slice; tid strides across the slice.
|
| 252 |
for (unsigned int c = my_col_start + tid; c < my_col_end; c += blockDim.x) {
|
|
|
|
| 259 |
// All blocks in the cluster must finish loading before any block
|
| 260 |
// starts reading peer smem inside the T-loop.
|
| 261 |
cluster.sync();
|
| 262 |
+
#else
|
| 263 |
+
// Pre-Hopper: no smem caching needed — reads go directly to GMEM.
|
| 264 |
+
// Grid sync ensures all blocks have completed Phase 0 init before T-loop.
|
| 265 |
+
grid.sync();
|
| 266 |
+
#endif
|
| 267 |
|
| 268 |
const unsigned int S = cfg.synapses_per_col;
|
| 269 |
const unsigned int cpc = cfg.cells_per_column;
|
|
|
|
| 329 |
// Ordering: BARRIER 1 completes before we issue the DMA.
|
| 330 |
// The DMA completes before Stage A reads s_input_tile.
|
| 331 |
// =========================================================
|
| 332 |
+
#if __CUDA_ARCH__ >= 900
|
| 333 |
const bool use_input_tile = (cfg.input_bits <= INPUT_BITS_MAX);
|
| 334 |
if (use_input_tile) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
auto tb = cg::this_thread_block();
|
| 336 |
cg::memcpy_async(tb, s_input_tile,
|
| 337 |
inputs + inp_off,
|
| 338 |
cfg.input_bits);
|
| 339 |
cg::wait(tb);
|
|
|
|
|
|
|
| 340 |
cluster.sync();
|
| 341 |
}
|
| 342 |
+
#else
|
| 343 |
+
const bool use_input_tile = false;
|
| 344 |
+
#endif
|
| 345 |
|
| 346 |
// =========================================================
|
| 347 |
// STAGE A: Spatial Pooler
|
|
|
|
| 359 |
float p = syn_perm[base + s];
|
| 360 |
// T9: read from cluster-broadcast tile when available;
|
| 361 |
// fall back to direct GMEM when input_bits > INPUT_BITS_MAX.
|
| 362 |
+
#if __CUDA_ARCH__ >= 900
|
| 363 |
unsigned int inp_byte = use_input_tile
|
| 364 |
? (unsigned int)s_input_tile[b]
|
| 365 |
: (unsigned int)inputs[inp_off + b];
|
| 366 |
+
#else
|
| 367 |
+
unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
|
| 368 |
+
#endif
|
| 369 |
unsigned int hit = ((inp_byte != 0u) && (p >= cfg.conn_thr)) ? 1u : 0u;
|
| 370 |
local += hit;
|
| 371 |
}
|
| 372 |
unsigned int overlap = warp_sum_u32(local);
|
| 373 |
overlap = __shfl_sync(0xffffffffu, overlap, 0);
|
| 374 |
|
| 375 |
+
// Read boost + threshold for column c.
|
| 376 |
+
#if __CUDA_ARCH__ >= 900
|
| 377 |
+
// Hopper: read from cluster-distributed shared memory.
|
| 378 |
const unsigned int owner_block = c / cols_per_block;
|
| 379 |
const unsigned int owner_offset = c - owner_block * cols_per_block;
|
|
|
|
| 380 |
float boost_val = cluster.map_shared_rank(s_boost, owner_block)[owner_offset];
|
| 381 |
float thr = cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset];
|
| 382 |
+
#else
|
| 383 |
+
// Pre-Hopper: read directly from global memory.
|
| 384 |
+
float boost_val = boost[c];
|
| 385 |
+
float thr = inhibition_threshold[c];
|
| 386 |
+
#endif
|
| 387 |
|
| 388 |
float boosted = (float)overlap * boost_val;
|
| 389 |
unsigned int is_active = (boosted > thr) ? 1u : 0u;
|
|
|
|
| 401 |
for (unsigned int s = lane; s < S; s += 32u) {
|
| 402 |
unsigned int b = syn_bit[base + s];
|
| 403 |
float p = syn_perm[base + s];
|
| 404 |
+
#if __CUDA_ARCH__ >= 900
|
| 405 |
unsigned int inp_byte = use_input_tile
|
| 406 |
? (unsigned int)s_input_tile[b]
|
| 407 |
: (unsigned int)inputs[inp_off + b];
|
| 408 |
+
#else
|
| 409 |
+
unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
|
| 410 |
+
#endif
|
| 411 |
if (inp_byte != 0u) {
|
| 412 |
p += cfg.sp_inc;
|
| 413 |
if (p > 1.0f) p = 1.0f;
|
|
|
|
| 420 |
}
|
| 421 |
|
| 422 |
// active_duty EMA + threshold adaptation.
|
| 423 |
+
// Writes go to both DSMEM (hot path, Hopper only) and GMEM (persistence).
|
|
|
|
| 424 |
if (lane == 0) {
|
| 425 |
+
#if __CUDA_ARCH__ >= 900
|
| 426 |
float ad = cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset];
|
| 427 |
+
#else
|
| 428 |
+
float ad = active_duty[c];
|
| 429 |
+
#endif
|
| 430 |
float sample = is_active ? 1.0f : 0.0f;
|
| 431 |
ad = (1.0f - cfg.duty_alpha) * ad + cfg.duty_alpha * sample;
|
| 432 |
|
| 433 |
+
#if __CUDA_ARCH__ >= 900
|
| 434 |
// Writeback: peer smem (for next timestep read) + GMEM (persistence).
|
| 435 |
cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad;
|
| 436 |
+
#endif
|
| 437 |
active_duty[c] = ad;
|
| 438 |
|
| 439 |
// Threshold steers toward target sparsity.
|
|
|
|
| 442 |
if (new_thr < 0.1f) new_thr = 0.1f;
|
| 443 |
if (new_thr > 1000.0f) new_thr = 1000.0f;
|
| 444 |
|
| 445 |
+
#if __CUDA_ARCH__ >= 900
|
| 446 |
// Writeback: peer smem (for next timestep read) + GMEM (persistence).
|
| 447 |
cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] = new_thr;
|
| 448 |
+
#endif
|
| 449 |
inhibition_threshold[c] = new_thr;
|
| 450 |
}
|
| 451 |
}
|
| 452 |
|
| 453 |
// ---- DSMEM WRITEBACK SYNC: peer-smem writes must be visible cluster-wide ----
|
| 454 |
//
|
| 455 |
+
// On Hopper: cluster.sync() ensures all peer smem writes from this
|
| 456 |
+
// timestep are visible to all blocks before Stage B / next t.
|
| 457 |
+
// On pre-Hopper: no smem peer writes occur (all state in GMEM),
|
| 458 |
+
// so no extra sync needed here — the grid barrier below suffices.
|
| 459 |
+
#if __CUDA_ARCH__ >= 900
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 460 |
cluster.sync();
|
| 461 |
+
#endif
|
| 462 |
|
| 463 |
// ---- BARRIER 2: SP active_mask must be visible before TM reads ----
|
| 464 |
// Fence: flush cols_out + active_duty + inhibition_threshold + step_scratch
|
|
|
|
| 660 |
}
|
| 661 |
|
| 662 |
// Single-region kernel (legacy call site).
|
| 663 |
+
__global__ __launch_bounds__(256, 2)
|
| 664 |
void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
|
| 665 |
htm_fused_step_body(P, cfg);
|
| 666 |
}
|
|
|
|
| 668 |
// Batched kernel: one cooperative launch for B regions. grid.y = B,
|
| 669 |
// grid.x = per-region block count. Each block reads its region's
|
| 670 |
// FusedPtrs from the device array via blockIdx.y.
|
| 671 |
+
__global__ __launch_bounds__(256, 2)
|
| 672 |
void htm_fused_step_batched(const FusedPtrs* __restrict__ P_arr, FusedConfig cfg) {
|
| 673 |
const FusedPtrs P = P_arr[blockIdx.y];
|
| 674 |
htm_fused_step_body(P, cfg);
|
overlay/hydra/engram.py
CHANGED
|
@@ -1,93 +1,80 @@
|
|
| 1 |
-
"""GPU Engram — Sparse
|
| 2 |
|
| 3 |
-
## What changed (scatter-gather →
|
| 4 |
|
| 5 |
The original forward used `self.memory[indices]` (scatter-gather), which misses
|
| 6 |
L2 cache at n_columns > 4096 and creates a hard tps ceiling.
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
retrieved = weights @ self.memory # (B, T, d_model) — coalesced matmul
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
| 16 |
|
| 17 |
-
##
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
| 27 |
|
| 28 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
the retrieval path — the Hopfield path computes dense similarity over the whole
|
| 33 |
-
memory bank, which subsumes any hash-based column selection. Documented here to
|
| 34 |
-
prevent confusion.
|
| 35 |
|
| 36 |
-
##
|
| 37 |
|
| 38 |
-
|
| 39 |
-
Hebbian
|
| 40 |
-
|
| 41 |
|
| 42 |
## Checkpoint compatibility
|
| 43 |
|
| 44 |
-
`self.memory` shape (n_columns, d_model) is unchanged
|
| 45 |
-
files load without
|
| 46 |
"""
|
| 47 |
|
| 48 |
from __future__ import annotations
|
| 49 |
|
|
|
|
|
|
|
| 50 |
import torch
|
| 51 |
import torch.nn as nn
|
| 52 |
|
| 53 |
-
# ---------------------------------------------------------------------------
|
| 54 |
-
# Sparse-attention backend — chosen ONCE at import time, no runtime branching.
|
| 55 |
-
# ---------------------------------------------------------------------------
|
| 56 |
-
|
| 57 |
-
try:
|
| 58 |
-
from entmax import entmax15 as _entmax15 # type: ignore[import]
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
_BACKEND = "entmax15"
|
| 65 |
-
|
| 66 |
-
except ImportError: # pragma: no cover — entmax always installed in CI
|
| 67 |
-
_K = 32 # top-k for fallback
|
| 68 |
-
|
| 69 |
-
def _sparse_attention(scores: torch.Tensor) -> torch.Tensor: # type: ignore[misc]
|
| 70 |
-
"""Top-k softmax fallback: zero outside the k highest-scoring columns."""
|
| 71 |
-
topk_vals, topk_idx = scores.topk(_K, dim=-1)
|
| 72 |
-
topk_w = torch.softmax(topk_vals, dim=-1).to(scores.dtype)
|
| 73 |
-
weights = torch.zeros_like(scores)
|
| 74 |
-
weights.scatter_(-1, topk_idx, topk_w)
|
| 75 |
-
return weights
|
| 76 |
-
|
| 77 |
-
_BACKEND = "topk32"
|
| 78 |
|
| 79 |
|
| 80 |
class GPUEngram(nn.Module):
|
| 81 |
-
"""GPU Engram: Sparse
|
| 82 |
|
| 83 |
Args:
|
| 84 |
d_model: Model dimension — must match the surrounding transformer.
|
| 85 |
-
n_columns: Number of memory columns (key-value pairs). Safe
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
max_ngram: Retained for API compatibility; unused in retrieval
|
| 89 |
hebbian_boost: If True, also run a Hebbian EMA write on the memory bank
|
| 90 |
-
during training
|
|
|
|
| 91 |
"""
|
| 92 |
|
| 93 |
def __init__(
|
|
@@ -105,16 +92,18 @@ class GPUEngram(nn.Module):
|
|
| 105 |
self.memory = nn.Parameter(torch.randn(n_columns, d_model) * 0.01)
|
| 106 |
self.gate = nn.Linear(d_model, 1, bias=True)
|
| 107 |
nn.init.constant_(self.gate.bias, 0.0) # START OPEN
|
|
|
|
|
|
|
| 108 |
# Retained for any external code that reads these attrs.
|
| 109 |
self.primes = [2654435761, 2246822519, 3266489917]
|
| 110 |
self.hebbian_lr = 0.01
|
| 111 |
|
| 112 |
# ------------------------------------------------------------------
|
| 113 |
-
# _hash: retained for API/checkpoint compat; unused in
|
| 114 |
# ------------------------------------------------------------------
|
| 115 |
|
| 116 |
def _hash(self, token_ids: torch.Tensor) -> torch.Tensor:
|
| 117 |
-
"""N-gram hash → column index (
|
| 118 |
B, T = token_ids.shape
|
| 119 |
h = token_ids * self.primes[0]
|
| 120 |
if T > 1:
|
|
@@ -132,39 +121,48 @@ class GPUEngram(nn.Module):
|
|
| 132 |
# ------------------------------------------------------------------
|
| 133 |
|
| 134 |
def forward(self, x: torch.Tensor, token_ids: torch.Tensor):
|
| 135 |
-
"""Hopfield retrieve + soft gate + residual.
|
| 136 |
|
| 137 |
Args:
|
| 138 |
x: (B, T, d_model) — input activations.
|
| 139 |
-
token_ids: (B, T) —
|
| 140 |
-
|
| 141 |
|
| 142 |
Returns:
|
| 143 |
(x + alpha * retrieved, hit_rate)
|
| 144 |
- x + alpha * retrieved: (B, T, d_model)
|
| 145 |
- hit_rate: scalar tensor — fraction of gate values > 0.1
|
| 146 |
"""
|
|
|
|
|
|
|
| 147 |
# ---- 1. Similarity scores (coalesced GEMM) ----------------------
|
| 148 |
# scores[b, t, c] = dot(x[b,t], memory[c])
|
| 149 |
scores = x @ self.memory.T # (B, T, n_columns)
|
| 150 |
|
| 151 |
-
# ---- 2.
|
| 152 |
-
#
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
-
# ----
|
| 156 |
-
retrieved =
|
| 157 |
|
| 158 |
-
# ----
|
| 159 |
alpha = torch.sigmoid(self.gate(x)) # (B, T, 1)
|
| 160 |
|
| 161 |
-
# ----
|
| 162 |
if self.training and self.hebbian_boost:
|
| 163 |
with torch.no_grad():
|
| 164 |
-
# Reuse the hash-based indices for the write target (sparse update).
|
| 165 |
indices = self._hash(token_ids)
|
| 166 |
-
flat_idx = indices.reshape(-1)
|
| 167 |
-
flat_x = x.detach().reshape(-1,
|
| 168 |
mem_dtype = self.memory.data.dtype
|
| 169 |
updates = (
|
| 170 |
self.hebbian_lr * flat_x
|
|
@@ -172,6 +170,6 @@ class GPUEngram(nn.Module):
|
|
| 172 |
).to(mem_dtype)
|
| 173 |
self.memory.data.index_add_(0, flat_idx, updates)
|
| 174 |
|
| 175 |
-
# ----
|
| 176 |
hit_rate = (alpha.detach() > 0.1).float().mean()
|
| 177 |
return x + alpha * retrieved, hit_rate
|
|
|
|
| 1 |
+
"""GPU Engram — Top-k Sparse Hopfield retrieval, scales to n_columns >= 32768.
|
| 2 |
|
| 3 |
+
## What changed (scatter-gather → top-k Hopfield)
|
| 4 |
|
| 5 |
The original forward used `self.memory[indices]` (scatter-gather), which misses
|
| 6 |
L2 cache at n_columns > 4096 and creates a hard tps ceiling.
|
| 7 |
|
| 8 |
+
An earlier Hopfield implementation used `entmax15` for sparse attention, but
|
| 9 |
+
entmax's internal `torch.sort` over the full n_columns dimension allocates
|
| 10 |
+
~1 GB scratch at (B*T=8192, n_columns=32768) and OOMs on a 6 GB card.
|
|
|
|
| 11 |
|
| 12 |
+
This module replaces the sort-based entmax with **top-k softmax**, which is
|
| 13 |
+
O(B*T*K) in memory and O(B*T*K * log n_columns) in compute (the top-k is
|
| 14 |
+
radix-selection under the hood — not a full sort). Sparsity is still exact:
|
| 15 |
+
only K columns have non-zero weight per (batch, position).
|
| 16 |
|
| 17 |
+
## Why this scales where entmax didn't
|
| 18 |
|
| 19 |
+
- `scores = x @ memory.T` is (B, T, n_columns) — 268 MB at bf16 with n_columns=32768.
|
| 20 |
+
- `scores.topk(K)` allocates only (B, T, K) — ~2 MB at K=64. No full sort.
|
| 21 |
+
- `memory[topk_idx]` gathers (B, T, K, d_model) — ~32 MB at bf16. Gather is
|
| 22 |
+
along axis 0 of memory (the n_columns axis); each gathered row is a contiguous stride-1 d_model vector, cache-friendly.
|
| 23 |
+
- `retrieved = einsum(topk_w, selected_mem)` — ~4 MB. Final reduction.
|
| 24 |
|
| 25 |
+
Peak working set well under 400 MB at any reasonable n_columns + K. The weights
|
| 26 |
+
tensor is never densified (which would have been the (B, T, n_columns) killer).
|
| 27 |
|
| 28 |
+
## Gradient flow
|
| 29 |
+
|
| 30 |
+
Both the topk gather and the einsum are autograd-tracked, so `self.memory`
|
| 31 |
+
receives gradient from the LM loss (which the Hebbian scatter-gather path did
|
| 32 |
+
not). `topk` indices are integer-valued and carry no gradient — gradient flows through `topk_vals` and via the
|
| 33 |
+
selected memory rows.
|
| 34 |
+
|
| 35 |
+
## Sparsity
|
| 36 |
|
| 37 |
+
Exactly K columns have non-zero weight per position. Default K=64, tunable via
|
| 38 |
+
HYDRA_ENGRAM_TOPK.
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
+
## token_ids argument
|
| 41 |
|
| 42 |
+
Accepted for API compatibility with hydra/model.py; unused in retrieval. The
|
| 43 |
+
optional Hebbian boost (hebbian_boost=True) uses the hash-indexed path for
|
| 44 |
+
its EMA write only.
|
| 45 |
|
| 46 |
## Checkpoint compatibility
|
| 47 |
|
| 48 |
+
`self.memory` shape (n_columns, d_model) is unchanged; existing .pt/.ckpt
|
| 49 |
+
files load without migration.
|
| 50 |
"""
|
| 51 |
|
| 52 |
from __future__ import annotations
|
| 53 |
|
| 54 |
+
import os
|
| 55 |
+
|
| 56 |
import torch
|
| 57 |
import torch.nn as nn
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
+
# Top-k width — how many memory columns get non-zero weight per position.
|
| 61 |
+
# Default 64 matches the entmax sparsity fraction we observed empirically
|
| 62 |
+
# (~0.2% of 32768 columns == 64). HYDRA_ENGRAM_TOPK env var overrides.
|
| 63 |
+
_ENGRAM_TOPK = int(os.environ.get("HYDRA_ENGRAM_TOPK", "64"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
|
| 66 |
class GPUEngram(nn.Module):
|
| 67 |
+
"""GPU Engram: Top-k Sparse Hopfield retrieval.
|
| 68 |
|
| 69 |
Args:
|
| 70 |
d_model: Model dimension — must match the surrounding transformer.
|
| 71 |
+
n_columns: Number of memory columns (key-value pairs). Safe up to
|
| 72 |
+
n_columns = 65536 at d_model = 384 on a 6 GB card with
|
| 73 |
+
B*T <= 8192.
|
| 74 |
+
max_ngram: Retained for API compatibility; unused in retrieval.
|
| 75 |
hebbian_boost: If True, also run a Hebbian EMA write on the memory bank
|
| 76 |
+
during training. Default False — the top-k gradient path
|
| 77 |
+
provides learning signal without this.
|
| 78 |
"""
|
| 79 |
|
| 80 |
def __init__(
|
|
|
|
| 92 |
self.memory = nn.Parameter(torch.randn(n_columns, d_model) * 0.01)
|
| 93 |
self.gate = nn.Linear(d_model, 1, bias=True)
|
| 94 |
nn.init.constant_(self.gate.bias, 0.0) # START OPEN
|
| 95 |
+
# Clamp topk K to n_columns so topk doesn't error at small engram.
|
| 96 |
+
self.topk_k = min(_ENGRAM_TOPK, n_columns)
|
| 97 |
# Retained for any external code that reads these attrs.
|
| 98 |
self.primes = [2654435761, 2246822519, 3266489917]
|
| 99 |
self.hebbian_lr = 0.01
|
| 100 |
|
| 101 |
# ------------------------------------------------------------------
|
| 102 |
+
# _hash: retained for API/checkpoint compat; unused in retrieval path.
|
| 103 |
# ------------------------------------------------------------------
|
| 104 |
|
| 105 |
def _hash(self, token_ids: torch.Tensor) -> torch.Tensor:
|
| 106 |
+
"""N-gram hash → column index (Hebbian-write target only, not retrieval)."""
|
| 107 |
B, T = token_ids.shape
|
| 108 |
h = token_ids * self.primes[0]
|
| 109 |
if T > 1:
|
|
|
|
| 121 |
# ------------------------------------------------------------------
|
| 122 |
|
| 123 |
def forward(self, x: torch.Tensor, token_ids: torch.Tensor):
|
| 124 |
+
"""Top-k Hopfield retrieve + soft gate + residual.
|
| 125 |
|
| 126 |
Args:
|
| 127 |
x: (B, T, d_model) — input activations.
|
| 128 |
+
token_ids: (B, T) — accepted for API compat; only used in the
|
| 129 |
+
optional Hebbian boost path.
|
| 130 |
|
| 131 |
Returns:
|
| 132 |
(x + alpha * retrieved, hit_rate)
|
| 133 |
- x + alpha * retrieved: (B, T, d_model)
|
| 134 |
- hit_rate: scalar tensor — fraction of gate values > 0.1
|
| 135 |
"""
|
| 136 |
+
B, T, D = x.shape
|
| 137 |
+
|
| 138 |
# ---- 1. Similarity scores (coalesced GEMM) ----------------------
|
| 139 |
# scores[b, t, c] = dot(x[b,t], memory[c])
|
| 140 |
scores = x @ self.memory.T # (B, T, n_columns)
|
| 141 |
|
| 142 |
+
# ---- 2. Top-k sparse attention ----------------------------------
|
| 143 |
+
# topk uses radix select, not a full sort — roughly O(n_columns) work per row rather than O(n_columns log n_columns).
|
| 144 |
+
# Never materializes a dense (B, T, n_columns) weights tensor.
|
| 145 |
+
topk_vals, topk_idx = scores.topk(self.topk_k, dim=-1) # (B, T, K), (B, T, K)
|
| 146 |
+
topk_w = torch.softmax(topk_vals, dim=-1) # (B, T, K)
|
| 147 |
+
|
| 148 |
+
# ---- 3. Gather selected memory rows -----------------------------
|
| 149 |
+
# memory[topk_idx] is a gather along axis 0 of memory (n_columns, d_model).
|
| 150 |
+
# Output shape (B, T, K, d_model) — K is small, so gather bandwidth is
|
| 151 |
+
# O(B*T*K*d_model), independent of n_columns.
|
| 152 |
+
selected_mem = self.memory[topk_idx] # (B, T, K, d_model)
|
| 153 |
|
| 154 |
+
# ---- 4. Weighted sum → retrieved vector -------------------------
|
| 155 |
+
retrieved = torch.einsum('btk,btkd->btd', topk_w, selected_mem) # (B, T, d_model)
|
| 156 |
|
| 157 |
+
# ---- 5. Soft gate -----------------------------------------------
|
| 158 |
alpha = torch.sigmoid(self.gate(x)) # (B, T, 1)
|
| 159 |
|
| 160 |
+
# ---- 6. Optional Hebbian EMA write ------------------------------
|
| 161 |
if self.training and self.hebbian_boost:
|
| 162 |
with torch.no_grad():
|
|
|
|
| 163 |
indices = self._hash(token_ids)
|
| 164 |
+
flat_idx = indices.reshape(-1) # (B*T,)
|
| 165 |
+
flat_x = x.detach().reshape(-1, D) # (B*T, d_model)
|
| 166 |
mem_dtype = self.memory.data.dtype
|
| 167 |
updates = (
|
| 168 |
self.hebbian_lr * flat_x
|
|
|
|
| 170 |
).to(mem_dtype)
|
| 171 |
self.memory.data.index_add_(0, flat_idx, updates)
|
| 172 |
|
| 173 |
+
# ---- 7. Residual + hit_rate -------------------------------------
|
| 174 |
hit_rate = (alpha.detach() > 0.1).float().mean()
|
| 175 |
return x + alpha * retrieved, hit_rate
|
overlay/hydra/eval.py
CHANGED
|
@@ -138,6 +138,9 @@ def _run_factual_english_gen(model, tokenizer, max_seq_len: int):
|
|
| 138 |
num_samples = FACTUAL_SAMPLES
|
| 139 |
batch = FACTUAL_BATCH
|
| 140 |
gen_tokens = FACTUAL_GEN_TOKENS
|
|
|
|
|
|
|
|
|
|
| 141 |
temps = [0.7, 0.9, 1.1]
|
| 142 |
hits = 0
|
| 143 |
|
|
@@ -154,14 +157,18 @@ def _run_factual_english_gen(model, tokenizer, max_seq_len: int):
|
|
| 154 |
temp = temps[batch_idx % len(temps)]
|
| 155 |
batch_idx += 1
|
| 156 |
ctx = torch.tensor([ids] * b, device="cuda", dtype=torch.long)
|
|
|
|
| 157 |
for _ in range(gen_tokens):
|
| 158 |
-
logits = model(ctx, targets=None)
|
| 159 |
next_logits = logits[:, -1, :] if logits.dim() == 3 else logits
|
| 160 |
probs = torch.softmax(next_logits.float() / temp, dim=-1)
|
| 161 |
next_id = torch.multinomial(probs, num_samples=1)
|
| 162 |
ctx = torch.cat([ctx, next_id], dim=1)
|
| 163 |
if ctx.size(1) >= max_seq_len:
|
| 164 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
# Transfer to CPU in one shot, no per-row sync
|
| 166 |
all_rows.extend(ctx.cpu().tolist())
|
| 167 |
samples_done += b
|
|
|
|
| 138 |
num_samples = FACTUAL_SAMPLES
|
| 139 |
batch = FACTUAL_BATCH
|
| 140 |
gen_tokens = FACTUAL_GEN_TOKENS
|
| 141 |
+
# Optional fast incremental decode path for recurrence-capable backbones.
|
| 142 |
+
# If disabled, we preserve the original full-context re-forward behavior.
|
| 143 |
+
incremental_decode = os.environ.get("HYDRA_FACTUAL_GEN_INCREMENTAL", "1") == "1"
|
| 144 |
temps = [0.7, 0.9, 1.1]
|
| 145 |
hits = 0
|
| 146 |
|
|
|
|
| 157 |
temp = temps[batch_idx % len(temps)]
|
| 158 |
batch_idx += 1
|
| 159 |
ctx = torch.tensor([ids] * b, device="cuda", dtype=torch.long)
|
| 160 |
+
logits = model(ctx, targets=None)
|
| 161 |
for _ in range(gen_tokens):
|
|
|
|
| 162 |
next_logits = logits[:, -1, :] if logits.dim() == 3 else logits
|
| 163 |
probs = torch.softmax(next_logits.float() / temp, dim=-1)
|
| 164 |
next_id = torch.multinomial(probs, num_samples=1)
|
| 165 |
ctx = torch.cat([ctx, next_id], dim=1)
|
| 166 |
if ctx.size(1) >= max_seq_len:
|
| 167 |
break
|
| 168 |
+
if incremental_decode:
|
| 169 |
+
logits = model(ctx[:, -1:], targets=None)
|
| 170 |
+
else:
|
| 171 |
+
logits = model(ctx, targets=None)
|
| 172 |
# Transfer to CPU in one shot, no per-row sync
|
| 173 |
all_rows.extend(ctx.cpu().tolist())
|
| 174 |
samples_done += b
|
overlay/hydra/model.py
CHANGED
|
@@ -145,7 +145,7 @@ class PostSemClawModel(nn.Module):
|
|
| 145 |
expand=config.expand,
|
| 146 |
headdim=config.headdim,
|
| 147 |
is_mimo=False, # SISO path uses stable mamba3_siso_combined kernel
|
| 148 |
-
chunk_size=64,
|
| 149 |
is_outproj_norm=False,
|
| 150 |
dtype=torch.bfloat16,
|
| 151 |
)
|
|
@@ -173,8 +173,13 @@ class PostSemClawModel(nn.Module):
|
|
| 173 |
reset_each_forward=True,
|
| 174 |
)
|
| 175 |
|
| 176 |
-
# Gradient bridge
|
| 177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
|
| 179 |
# GPU Engram with Hebbian writes — runs EVERY step.
|
| 180 |
self.engram = GPUEngram(
|
|
@@ -349,11 +354,13 @@ class PostSemClawModel(nn.Module):
|
|
| 349 |
nn.init.normal_(block.out_proj.weight, mean=0.0, std=out_std)
|
| 350 |
|
| 351 |
nn.init.normal_(self.htm_proj.weight, mean=0.0, std=s)
|
|
|
|
| 352 |
|
| 353 |
# Cast to bf16 to match Mamba3 dtype; Muon groups by shape so mixed
|
| 354 |
# dtypes in the same shape group would break lerp_ dtype checks.
|
| 355 |
self.wte.to(dtype=torch.bfloat16)
|
| 356 |
self.htm_proj.to(dtype=torch.bfloat16)
|
|
|
|
| 357 |
self.engram.to(dtype=torch.bfloat16)
|
| 358 |
|
| 359 |
def set_bos_token_id(self, bos_id: int) -> None:
|
|
@@ -402,11 +409,13 @@ class PostSemClawModel(nn.Module):
|
|
| 402 |
blocks = sum(p.numel() for p in self.blocks.parameters())
|
| 403 |
sdr = sum(p.numel() for p in self.sdr_semantic.parameters())
|
| 404 |
htm_proj = sum(p.numel() for p in self.htm_proj.parameters())
|
|
|
|
| 405 |
engram = sum(p.numel() for p in self.engram.parameters())
|
| 406 |
total = sum(p.numel() for p in self.parameters())
|
| 407 |
return {
|
| 408 |
'wte': wte, 'lm_head': lm_head, 'blocks': blocks,
|
| 409 |
'sdr_semantic': sdr, 'htm_proj': htm_proj,
|
|
|
|
| 410 |
'engram': engram, 'total': total,
|
| 411 |
}
|
| 412 |
|
|
@@ -516,9 +525,13 @@ class PostSemClawModel(nn.Module):
|
|
| 516 |
|
| 517 |
for shape in sorted({p.shape for p in matrix_params}):
|
| 518 |
group_params = [p for p in matrix_params if p.shape == shape]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 519 |
param_groups.append(dict(
|
| 520 |
kind='muon', params=group_params, lr=matrix_lr,
|
| 521 |
-
momentum=0.95, ns_steps=
|
| 522 |
))
|
| 523 |
|
| 524 |
optimizer = MuonAdamW(param_groups)
|
|
@@ -610,8 +623,10 @@ class PostSemClawModel(nn.Module):
|
|
| 610 |
if self._htm_stop_grad:
|
| 611 |
htm_out = htm_out.detach()
|
| 612 |
|
| 613 |
-
# Gradient bridge:
|
| 614 |
-
|
|
|
|
|
|
|
| 615 |
x = dense_emb + htm_proj_out
|
| 616 |
x = norm(x)
|
| 617 |
|
|
|
|
| 145 |
expand=config.expand,
|
| 146 |
headdim=config.headdim,
|
| 147 |
is_mimo=False, # SISO path uses stable mamba3_siso_combined kernel
|
| 148 |
+
chunk_size=int(os.environ.get("HYDRA_MAMBA3_CHUNK", "64")), # 64 is the validated default; 128 tripped a Triton autotune hang (>8min, no progress)
|
| 149 |
is_outproj_norm=False,
|
| 150 |
dtype=torch.bfloat16,
|
| 151 |
)
|
|
|
|
| 173 |
reset_each_forward=True,
|
| 174 |
)
|
| 175 |
|
| 176 |
+
# Gradient bridge split:
|
| 177 |
+
# (a) sparse HTM columns -> d_model
|
| 178 |
+
# (b) scalar anomaly -> d_model
|
| 179 |
+
# This avoids forcing the anomaly scalar through the same projection
|
| 180 |
+
# statistics as the high-dimensional sparse HTM column vector.
|
| 181 |
+
self.htm_proj = nn.Linear(config.htm_n_columns, config.d_model, bias=False)
|
| 182 |
+
self.htm_anom_proj = nn.Linear(1, config.d_model, bias=False)
|
| 183 |
|
| 184 |
# GPU Engram with Hebbian writes — runs EVERY step.
|
| 185 |
self.engram = GPUEngram(
|
|
|
|
| 354 |
nn.init.normal_(block.out_proj.weight, mean=0.0, std=out_std)
|
| 355 |
|
| 356 |
nn.init.normal_(self.htm_proj.weight, mean=0.0, std=s)
|
| 357 |
+
nn.init.normal_(self.htm_anom_proj.weight, mean=0.0, std=s)
|
| 358 |
|
| 359 |
# Cast to bf16 to match Mamba3 dtype; Muon groups by shape so mixed
|
| 360 |
# dtypes in the same shape group would break lerp_ dtype checks.
|
| 361 |
self.wte.to(dtype=torch.bfloat16)
|
| 362 |
self.htm_proj.to(dtype=torch.bfloat16)
|
| 363 |
+
self.htm_anom_proj.to(dtype=torch.bfloat16)
|
| 364 |
self.engram.to(dtype=torch.bfloat16)
|
| 365 |
|
| 366 |
def set_bos_token_id(self, bos_id: int) -> None:
|
|
|
|
| 409 |
blocks = sum(p.numel() for p in self.blocks.parameters())
|
| 410 |
sdr = sum(p.numel() for p in self.sdr_semantic.parameters())
|
| 411 |
htm_proj = sum(p.numel() for p in self.htm_proj.parameters())
|
| 412 |
+
htm_anom_proj = sum(p.numel() for p in self.htm_anom_proj.parameters())
|
| 413 |
engram = sum(p.numel() for p in self.engram.parameters())
|
| 414 |
total = sum(p.numel() for p in self.parameters())
|
| 415 |
return {
|
| 416 |
'wte': wte, 'lm_head': lm_head, 'blocks': blocks,
|
| 417 |
'sdr_semantic': sdr, 'htm_proj': htm_proj,
|
| 418 |
+
'htm_anom_proj': htm_anom_proj,
|
| 419 |
'engram': engram, 'total': total,
|
| 420 |
}
|
| 421 |
|
|
|
|
| 525 |
|
| 526 |
for shape in sorted({p.shape for p in matrix_params}):
|
| 527 |
group_params = [p for p in matrix_params if p.shape == shape]
|
| 528 |
+
# ns_steps: Muon polar-express inner iterations. The paper's default is 5,
|
| 529 |
+
# but 3 converges on small matrices (d_model ~ 384) with ~40% lower
|
| 530 |
+
# optimizer step cost, so the default here is 3. Env-tunable (HYDRA_MUON_NS_STEPS) for experimentation.
|
| 531 |
+
_ns_steps = int(os.environ.get("HYDRA_MUON_NS_STEPS", "3"))
|
| 532 |
param_groups.append(dict(
|
| 533 |
kind='muon', params=group_params, lr=matrix_lr,
|
| 534 |
+
momentum=0.95, ns_steps=_ns_steps, beta2=0.95, weight_decay=weight_decay,
|
| 535 |
))
|
| 536 |
|
| 537 |
optimizer = MuonAdamW(param_groups)
|
|
|
|
| 623 |
if self._htm_stop_grad:
|
| 624 |
htm_out = htm_out.detach()
|
| 625 |
|
| 626 |
+
# Gradient bridge split: columns and anomaly use separate projections.
|
| 627 |
+
htm_cols = htm_out[..., :-1].to(dense_emb.dtype)
|
| 628 |
+
htm_anom = htm_out[..., -1:].to(dense_emb.dtype)
|
| 629 |
+
htm_proj_out = self.htm_proj(htm_cols) + self.htm_anom_proj(htm_anom)
|
| 630 |
x = dense_emb + htm_proj_out
|
| 631 |
x = norm(x)
|
| 632 |
|
overlay/hydra/training.py
CHANGED
|
@@ -779,15 +779,49 @@ def main() -> None:
|
|
| 779 |
)
|
| 780 |
|
| 781 |
# Now it's safe to eval — ckpts are on disk regardless of what happens here.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 782 |
val_bpb: float | None = None
|
|
|
|
|
|
|
| 783 |
try:
|
| 784 |
-
|
| 785 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 786 |
model.eval()
|
| 787 |
_orig = _prepare_mod.EVAL_TOKENS
|
| 788 |
-
_prepare_mod.EVAL_TOKENS =
|
| 789 |
with autocast_ctx:
|
| 790 |
-
val_bpb = evaluate_bpb(model, tokenizer,
|
| 791 |
_prepare_mod.EVAL_TOKENS = _orig
|
| 792 |
val_ppl = 2 ** val_bpb
|
| 793 |
print(f"[VAL] step={step} val_bpb={val_bpb:.4f} val_ppl={val_ppl:.3f}", flush=True)
|
|
@@ -795,7 +829,14 @@ def main() -> None:
|
|
| 795 |
print(f"[VAL] SKIPPED (OOM): {e}", flush=True)
|
| 796 |
torch.cuda.empty_cache()
|
| 797 |
except Exception as e:
|
|
|
|
| 798 |
print(f"[VAL] SKIPPED ({type(e).__name__}): {e}", flush=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 799 |
|
| 800 |
# Final ckpts with val_bpb filled in (if eval succeeded).
|
| 801 |
save_ckpt(
|
|
@@ -843,7 +884,7 @@ def main() -> None:
|
|
| 843 |
metrics = model.get_secondary_metrics()
|
| 844 |
|
| 845 |
print("---")
|
| 846 |
-
print(f"val_bpb: {val_bpb:.6f}")
|
| 847 |
print(f"training_seconds: {total_training_time:.1f}")
|
| 848 |
print(f"total_seconds: {t_end - t_start:.1f}")
|
| 849 |
print(f"peak_vram_mb: {peak_vram_mb:.1f}")
|
|
|
|
| 779 |
)
|
| 780 |
|
| 781 |
# Now it's safe to eval — ckpts are on disk regardless of what happens here.
|
| 782 |
+
# HYDRA_EVAL_BATCH overrides DEVICE_BATCH_SIZE (env-tunable; default halves
|
| 783 |
+
# the training batch because eval holds activations for the full sequence and
|
| 784 |
+
# does not benefit from overlap with backward). HYDRA_EVAL_TOKENS controls
|
| 785 |
+
# how many val tokens to sweep (default 2 * 524288 = 1,048,576, i.e. ~1 M; short enough for autoresearch
|
| 786 |
+
# 5-min budgets).
|
| 787 |
val_bpb: float | None = None
|
| 788 |
+
_eval_B = int(os.environ.get("HYDRA_EVAL_BATCH", str(max(1, DEVICE_BATCH_SIZE // 2))))
|
| 789 |
+
_eval_tokens = int(os.environ.get("HYDRA_EVAL_TOKENS", str(2 * 524288)))
|
| 790 |
try:
|
| 791 |
+
# Aggressive VRAM reclaim for 6GB cards. Peak training VRAM = 5.1GB
|
| 792 |
+
# which leaves < 1GB for the eval forward — the driver can't satisfy
|
| 793 |
+
# the allocation. Free EVERY tensor we don't strictly need:
|
| 794 |
+
# - optimizer grads (set_to_none releases tensor)
|
| 795 |
+
# - optimizer.state (fp32 Muon NS workspace, AdamW moments — ~size-of-params each)
|
| 796 |
+
# - model internal caches (HTM subsample cache, SDR stash)
|
| 797 |
+
# After this, VRAM should be ~params only (bf16 ≈ 120MB at 60M params).
|
| 798 |
+
optimizer.zero_grad(set_to_none=True)
|
| 799 |
+
if hasattr(optimizer, 'state') and optimizer.state:
|
| 800 |
+
for p, st in list(optimizer.state.items()):
|
| 801 |
+
st.clear()
|
| 802 |
+
optimizer.state.clear()
|
| 803 |
+
for p in model.parameters():
|
| 804 |
+
if p.grad is not None:
|
| 805 |
+
p.grad = None
|
| 806 |
+
if hasattr(model, '_htm_cache'):
|
| 807 |
+
model._htm_cache = None
|
| 808 |
+
if hasattr(model, '_last_sdr'):
|
| 809 |
+
model._last_sdr = None
|
| 810 |
+
import gc as _gc
|
| 811 |
+
_gc.collect()
|
| 812 |
+
torch.cuda.empty_cache()
|
| 813 |
+
torch.cuda.synchronize()
|
| 814 |
+
try:
|
| 815 |
+
_free_mb = torch.cuda.mem_get_info()[0] / 1024 / 1024
|
| 816 |
+
print(f"[VAL] free_vram_mb={_free_mb:.0f} (cleared optimizer state)", flush=True)
|
| 817 |
+
except Exception:
|
| 818 |
+
pass
|
| 819 |
+
print(f"[VAL] running eval on {_eval_tokens} tokens at B={_eval_B}...", flush=True)
|
| 820 |
model.eval()
|
| 821 |
_orig = _prepare_mod.EVAL_TOKENS
|
| 822 |
+
_prepare_mod.EVAL_TOKENS = _eval_tokens
|
| 823 |
with autocast_ctx:
|
| 824 |
+
val_bpb = evaluate_bpb(model, tokenizer, _eval_B)
|
| 825 |
_prepare_mod.EVAL_TOKENS = _orig
|
| 826 |
val_ppl = 2 ** val_bpb
|
| 827 |
print(f"[VAL] step={step} val_bpb={val_bpb:.4f} val_ppl={val_ppl:.3f}", flush=True)
|
|
|
|
| 829 |
print(f"[VAL] SKIPPED (OOM): {e}", flush=True)
|
| 830 |
torch.cuda.empty_cache()
|
| 831 |
except Exception as e:
|
| 832 |
+
import traceback as _tb
|
| 833 |
print(f"[VAL] SKIPPED ({type(e).__name__}): {e}", flush=True)
|
| 834 |
+
_tb.print_exc()
|
| 835 |
+
try:
|
| 836 |
+
_free = torch.cuda.mem_get_info()[0] / 1024 / 1024
|
| 837 |
+
print(f"[VAL] post-crash free_vram_mb={_free:.0f}", flush=True)
|
| 838 |
+
except Exception:
|
| 839 |
+
pass
|
| 840 |
|
| 841 |
# Final ckpts with val_bpb filled in (if eval succeeded).
|
| 842 |
save_ckpt(
|
|
|
|
| 884 |
metrics = model.get_secondary_metrics()
|
| 885 |
|
| 886 |
print("---")
|
| 887 |
+
print(f"val_bpb: {val_bpb:.6f}" if val_bpb is not None else "val_bpb: SKIPPED")
|
| 888 |
print(f"training_seconds: {total_training_time:.1f}")
|
| 889 |
print(f"total_seconds: {t_end - t_start:.1f}")
|
| 890 |
print(f"peak_vram_mb: {peak_vram_mb:.1f}")
|
overlay/prepare_nemotron.py
CHANGED
|
@@ -20,15 +20,15 @@ Full blend mode (env HYDRA_USE_FULL_BLEND=1):
|
|
| 20 |
"""
|
| 21 |
from __future__ import annotations
|
| 22 |
|
| 23 |
-
import os
|
| 24 |
-
import random
|
| 25 |
-
|
| 26 |
-
from
|
|
|
|
| 27 |
|
| 28 |
-
import numpy as np
|
| 29 |
import torch
|
| 30 |
|
| 31 |
-
import prepare as _p # reuse tokenizer, BOS, byte-length helpers
|
| 32 |
|
| 33 |
NEMOTRON_REPO = "nvidia/Nemotron-Pretraining-Specialized-v1.1"
|
| 34 |
|
|
@@ -37,14 +37,13 @@ NEMOTRON_REPO = "nvidia/Nemotron-Pretraining-Specialized-v1.1"
|
|
| 37 |
# Keys are logical dataset names used by _open_blend_stream / _open_stream.
|
| 38 |
# ---------------------------------------------------------------------------
|
| 39 |
FULL_BLEND_WEIGHTS: dict[str, float] = {
|
| 40 |
-
"fineweb-edu": 0.
|
| 41 |
-
"
|
| 42 |
-
"
|
| 43 |
-
"
|
| 44 |
-
#
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
# "nemotron-specialized": 0.00,
|
| 48 |
}
|
| 49 |
|
| 50 |
# Mapping from logical blend name → (HF repo, optional config/name, text column).
|
|
@@ -66,13 +65,94 @@ PHASE1_WEIGHTS = {
|
|
| 66 |
"Nemotron-Pretraining-Formal-Logic": 0.20,
|
| 67 |
"Nemotron-Pretraining-Multiple-Choice": 0.20,
|
| 68 |
}
|
| 69 |
-
PHASE2_WEIGHTS = {
|
| 70 |
"Nemotron-Pretraining-Multiple-Choice": 0.45,
|
| 71 |
"Nemotron-Pretraining-Economics": 0.20,
|
| 72 |
"Nemotron-Pretraining-Formal-Logic": 0.15,
|
| 73 |
"Nemotron-Pretraining-Code-Concepts": 0.10,
|
| 74 |
"Nemotron-Pretraining-Unconditional-Algorithmic": 0.10,
|
| 75 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
|
| 78 |
def _phase_weights() -> dict[str, float]:
|
|
@@ -83,129 +163,61 @@ def _phase_weights() -> dict[str, float]:
|
|
| 83 |
return PHASE2_WEIGHTS if phase == "phase2" else PHASE1_WEIGHTS
|
| 84 |
|
| 85 |
|
| 86 |
-
|
| 87 |
-
_PREFETCH_STARTED = set()
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
def _find_local_parquets(repo: str, sub_config: str | None) -> list[str]:
|
| 91 |
-
"""Return LOCAL parquet paths in HF hub cache for a given repo+config.
|
| 92 |
-
|
| 93 |
-
If sub_config filter yields zero matches but parquet files exist in the
|
| 94 |
-
repo dir, returns all parquet files (some datasets like fineweb use a
|
| 95 |
-
builder config name that doesn't match the filesystem path).
|
| 96 |
-
"""
|
| 97 |
-
import glob
|
| 98 |
-
repo_dir = "datasets--" + repo.replace("/", "--")
|
| 99 |
-
base = os.path.expanduser(f"~/.cache/huggingface/hub/{repo_dir}/snapshots")
|
| 100 |
-
if not os.path.isdir(base):
|
| 101 |
-
return []
|
| 102 |
-
all_paths = []
|
| 103 |
-
for snap in os.listdir(base):
|
| 104 |
-
all_paths.extend(glob.glob(os.path.join(base, snap, "**", "*.parquet"), recursive=True))
|
| 105 |
-
if sub_config is None:
|
| 106 |
-
return sorted(all_paths)
|
| 107 |
-
filtered = [p for p in all_paths if f"/{sub_config}/" in p]
|
| 108 |
-
# Fallback: if the config name doesn't match filesystem paths, use all parquet
|
| 109 |
-
if not filtered and all_paths:
|
| 110 |
-
return sorted(all_paths)
|
| 111 |
-
return sorted(filtered)
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
def _start_background_prefetch(repo: str, sub_config: str | None):
|
| 115 |
-
"""Start a daemon thread that downloads parquet shards ahead of consumption.
|
| 116 |
-
|
| 117 |
-
Feeds HF's local cache so streaming=True serves from disk, never network.
|
| 118 |
-
Idempotent per (repo, sub_config). Runs at throttled speed to not flood.
|
| 119 |
-
"""
|
| 120 |
-
import threading
|
| 121 |
-
key = (repo, sub_config)
|
| 122 |
-
if key in _PREFETCH_STARTED:
|
| 123 |
-
return
|
| 124 |
-
_PREFETCH_STARTED.add(key)
|
| 125 |
-
|
| 126 |
-
def worker():
|
| 127 |
-
try:
|
| 128 |
-
from huggingface_hub import HfApi, hf_hub_download
|
| 129 |
-
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
|
| 130 |
-
token = os.environ.get("HF_TOKEN")
|
| 131 |
-
api = HfApi(token=token)
|
| 132 |
-
files = api.list_repo_files(repo, repo_type="dataset")
|
| 133 |
-
parquet = sorted(f for f in files if f.endswith(".parquet"))
|
| 134 |
-
if sub_config is not None:
|
| 135 |
-
filtered = [f for f in parquet if f"/{sub_config}/" in f or f.startswith(f"{sub_config}/")]
|
| 136 |
-
if filtered:
|
| 137 |
-
parquet = filtered
|
| 138 |
-
# Fetch shards one by one, skipping already-cached (hf_hub_download is idempotent)
|
| 139 |
-
for f in parquet:
|
| 140 |
-
try:
|
| 141 |
-
hf_hub_download(repo_id=repo, filename=f, repo_type="dataset", token=token)
|
| 142 |
-
except Exception:
|
| 143 |
-
pass # skip unavailable shards
|
| 144 |
-
except Exception:
|
| 145 |
-
pass # prefetch is best-effort, don't disrupt training
|
| 146 |
-
|
| 147 |
-
t = threading.Thread(target=worker, daemon=True, name=f"prefetch-{repo}")
|
| 148 |
-
t.start()
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
def _open_stream(config: str, split: str):
|
| 152 |
"""Open a streaming iterator over one dataset config.
|
| 153 |
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
shuffle_buf = int(os.environ.get("HYDRA_STREAM_SHUFFLE_BUFFER", "2048"))
|
| 160 |
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
if config == "nemotron-specialized":
|
| 165 |
-
|
| 166 |
repo = NEMOTRON_REPO
|
|
|
|
| 167 |
else:
|
| 168 |
-
|
| 169 |
-
effective_cfg = config
|
| 170 |
-
|
| 171 |
-
# Kick off background prefetch of remaining shards for this dataset
|
| 172 |
-
if os.environ.get("HYDRA_BACKGROUND_PREFETCH", "1") == "1":
|
| 173 |
-
_start_background_prefetch(repo, effective_cfg)
|
| 174 |
-
|
| 175 |
-
local_only = os.environ.get("HYDRA_LOCAL_SHARDS_ONLY", "1") == "1"
|
| 176 |
-
if local_only:
|
| 177 |
-
local_paths = _find_local_parquets(repo, effective_cfg)
|
| 178 |
-
if not local_paths:
|
| 179 |
-
raise RuntimeError(
|
| 180 |
-
f"No local parquet files for {repo} (config={effective_cfg}). "
|
| 181 |
-
f"Run scripts/predownload_shards.py first, or set HYDRA_LOCAL_SHARDS_ONLY=0."
|
| 182 |
-
)
|
| 183 |
ds = load_dataset(
|
| 184 |
-
|
| 185 |
-
|
| 186 |
split="train",
|
| 187 |
streaming=True,
|
|
|
|
| 188 |
)
|
| 189 |
-
else:
|
| 190 |
-
kwargs: dict = dict(split="train", streaming=True, token=token)
|
| 191 |
-
if effective_cfg is not None:
|
| 192 |
-
kwargs["name"] = effective_cfg
|
| 193 |
-
ds = load_dataset(repo, **kwargs)
|
| 194 |
ds = ds.shuffle(seed=42, buffer_size=shuffle_buf)
|
| 195 |
return iter(ds)
|
| 196 |
|
| 197 |
|
| 198 |
-
def _extract_text(row: dict) -> str:
|
| 199 |
"""Pick the right text column — datasets have different column names.
|
| 200 |
|
| 201 |
Priority order: text, content, prompt_completion, question, body.
|
| 202 |
For math datasets that split into problem+solution, concatenate both.
|
| 203 |
Fallback: concatenate all string-valued fields.
|
| 204 |
"""
|
| 205 |
-
# Fast path: most datasets use "text" or "content".
|
| 206 |
-
for k in ("text", "content", "prompt_completion", "question", "body"):
|
| 207 |
-
|
| 208 |
-
|
|
|
|
| 209 |
# Math datasets may have problem + solution as separate fields.
|
| 210 |
if "problem" in row and "solution" in row:
|
| 211 |
p = row["problem"] or ""
|
|
@@ -221,15 +233,20 @@ def _extract_text(row: dict) -> str:
|
|
| 221 |
return "\n".join(parts)
|
| 222 |
|
| 223 |
|
| 224 |
-
class _WeightedStream:
|
| 225 |
"""Infinite weighted-round-robin over configs' streaming iterators."""
|
| 226 |
|
| 227 |
-
def __init__(self, weights: dict[str, float], seed: int = 0):
|
| 228 |
-
self.configs = list(weights.keys())
|
| 229 |
-
self.weights = [weights[c] for c in self.configs]
|
| 230 |
-
self.streams
|
| 231 |
-
|
| 232 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
|
| 234 |
def _reopen(self, config: str):
|
| 235 |
# stream exhausted — reopen (HF streaming typically infinite but restart on edge)
|
|
@@ -245,22 +262,20 @@ class _WeightedStream:
|
|
| 245 |
# exist in the Nemotron configs. Controlled by HYDRA_FACTUAL_INJECT_RATE
|
| 246 |
# (default 50 = inject one factual doc every 50 Nemotron docs = ~2%).
|
| 247 |
inject_rate = int(os.environ.get("HYDRA_FACTUAL_INJECT_RATE", "50"))
|
| 248 |
-
if inject_rate > 0 and
|
| 249 |
-
factual_path = os.path.join(
|
| 250 |
-
os.path.dirname(os.path.abspath(__file__)), "data", "factual", "facts.txt")
|
| 251 |
-
if os.path.exists(factual_path):
|
| 252 |
-
self._factual_docs = open(factual_path).read().strip().split('\n')
|
| 253 |
-
self._factual_idx = 0
|
| 254 |
-
self._inject_counter = 0
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
self.
|
| 261 |
-
doc
|
| 262 |
-
self._factual_idx += 1
|
| 263 |
-
return doc, self.epoch
|
| 264 |
|
| 265 |
config = self.rng.choices(self.configs, weights=self.weights, k=1)[0]
|
| 266 |
try:
|
|
@@ -293,9 +308,9 @@ def _document_batches(split: str, tokenizer_batch_size: int = 128) -> Iterator[t
|
|
| 293 |
stream = _WeightedStream(_phase_weights(), seed=0)
|
| 294 |
|
| 295 |
prefetch_depth = int(os.environ.get("HYDRA_STREAM_PREFETCH", "32"))
|
| 296 |
-
q: queue.Queue = queue.Queue(maxsize=prefetch_depth)
|
| 297 |
-
sentinel_stop = object()
|
| 298 |
-
error_box: list = []
|
| 299 |
|
| 300 |
def producer():
|
| 301 |
try:
|
|
@@ -320,7 +335,7 @@ def _document_batches(split: str, tokenizer_batch_size: int = 128) -> Iterator[t
|
|
| 320 |
if error_box:
|
| 321 |
raise error_box[0]
|
| 322 |
return
|
| 323 |
-
yield item
|
| 324 |
|
| 325 |
|
| 326 |
def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 1000):
|
|
@@ -331,47 +346,24 @@ def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 10
|
|
| 331 |
stage 2: BPE tokenization → token-id lists (this function's producer thread)
|
| 332 |
stage 3: best-fit packing → (B, T+1) tensor rows (main thread, consumes)
|
| 333 |
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
skipping the 5-min streaming cold-start entirely. Cache key includes
|
| 338 |
-
(T, vocab_size) so shape changes invalidate the cache automatically.
|
| 339 |
"""
|
| 340 |
import queue
|
| 341 |
import threading
|
| 342 |
|
| 343 |
assert split in ("train", "val")
|
| 344 |
row_capacity = T + 1
|
| 345 |
-
bos_token = tokenizer.get_bos_token_id()
|
| 346 |
-
|
| 347 |
-
# --- Local packed-token cache (train only; val path skips cache-write) ---
|
| 348 |
-
cache_enabled = split == "train"
|
| 349 |
-
cache_gb = float(os.environ.get("HYDRA_TOKEN_CACHE_GB", "2"))
|
| 350 |
-
cache_dir = os.path.expanduser("~/.cache/autoresearch")
|
| 351 |
-
os.makedirs(cache_dir, exist_ok=True)
|
| 352 |
-
vocab_size = tokenizer.get_vocab_size()
|
| 353 |
-
cache_path = os.path.join(cache_dir, f"packed_tokens_v1_T{T}_V{vocab_size}_{split}.bin")
|
| 354 |
-
cache_target_bytes = int(cache_gb * 1024**3)
|
| 355 |
-
dtype_np = np.int32 # vocab < 2^31
|
| 356 |
-
bytes_per_row = row_capacity * 4 # int32
|
| 357 |
-
cache_rows_target = cache_target_bytes // bytes_per_row
|
| 358 |
-
|
| 359 |
-
# If train cache exists and is ready, mmap and yield from it
|
| 360 |
-
if cache_enabled and os.path.exists(cache_path) and os.path.getsize(cache_path) >= cache_target_bytes // 2:
|
| 361 |
-
print(f"[token-cache] using {cache_path} ({os.path.getsize(cache_path) / 1024**3:.2f} GB)")
|
| 362 |
-
yield from _mmap_cache_loader(cache_path, B, T, row_capacity, dtype_np)
|
| 363 |
-
return # unreachable (mmap loader is infinite), but satisfies generator protocol
|
| 364 |
-
|
| 365 |
-
if cache_enabled:
|
| 366 |
-
print(f"[token-cache] building {cache_path} (target {cache_gb:.1f} GB) on first pass")
|
| 367 |
batches = _document_batches(split)
|
|
|
|
| 368 |
|
| 369 |
# Stage 2: tokenization prefetch thread. Each queue element is a list of
|
| 370 |
# token-id lists (pre-tokenized docs). HYDRA_TOKEN_PREFETCH controls depth.
|
| 371 |
tok_prefetch = int(os.environ.get("HYDRA_TOKEN_PREFETCH", "8"))
|
| 372 |
-
tok_q: queue.Queue = queue.Queue(maxsize=tok_prefetch)
|
| 373 |
-
tok_sentinel = object()
|
| 374 |
-
tok_err_box: list = []
|
| 375 |
|
| 376 |
def tokenizer_producer():
|
| 377 |
try:
|
|
@@ -395,8 +387,8 @@ def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 10
|
|
| 395 |
if tok_err_box:
|
| 396 |
raise tok_err_box[0]
|
| 397 |
raise StopIteration
|
| 398 |
-
token_lists, epoch = item
|
| 399 |
-
doc_buffer.extend(token_lists)
|
| 400 |
|
| 401 |
row_buffer = torch.empty((B, row_capacity), dtype=torch.long)
|
| 402 |
cpu_buffer = torch.empty(2 * B * T, dtype=torch.long, pin_memory=True)
|
|
@@ -406,10 +398,6 @@ def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 10
|
|
| 406 |
inputs = gpu_buffer[: B * T].view(B, T)
|
| 407 |
targets = gpu_buffer[B * T :].view(B, T)
|
| 408 |
|
| 409 |
-
# Open cache file for append-on-build
|
| 410 |
-
cache_fh = open(cache_path + ".tmp", "wb") if cache_enabled else None
|
| 411 |
-
cache_rows_written = 0
|
| 412 |
-
|
| 413 |
while True:
|
| 414 |
for row_idx in range(B):
|
| 415 |
pos = 0
|
|
@@ -437,43 +425,6 @@ def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 10
|
|
| 437 |
cpu_inputs.copy_(row_buffer[:, :-1])
|
| 438 |
cpu_targets.copy_(row_buffer[:, 1:])
|
| 439 |
gpu_buffer.copy_(cpu_buffer, non_blocking=True)
|
| 440 |
-
|
| 441 |
-
# Write packed rows to cache (append) until target size reached
|
| 442 |
-
if cache_fh is not None:
|
| 443 |
-
np_rows = row_buffer.numpy().astype(np.int32, copy=False)
|
| 444 |
-
cache_fh.write(np_rows.tobytes())
|
| 445 |
-
cache_rows_written += B
|
| 446 |
-
if cache_rows_written >= cache_rows_target:
|
| 447 |
-
cache_fh.flush()
|
| 448 |
-
cache_fh.close()
|
| 449 |
-
os.replace(cache_path + ".tmp", cache_path)
|
| 450 |
-
cache_fh = None
|
| 451 |
-
print(f"[token-cache] finalized {cache_path} ({cache_rows_written} rows)")
|
| 452 |
-
|
| 453 |
-
yield inputs, targets, epoch
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
def _mmap_cache_loader(cache_path: str, B: int, T: int, row_capacity: int, dtype_np):
|
| 457 |
-
"""Read packed (T+1) rows from mmap cache, cycle forever."""
|
| 458 |
-
data = np.memmap(cache_path, dtype=dtype_np, mode="r").reshape(-1, row_capacity)
|
| 459 |
-
n_rows = data.shape[0]
|
| 460 |
-
cpu_buffer = torch.empty(2 * B * T, dtype=torch.long, pin_memory=True)
|
| 461 |
-
gpu_buffer = torch.empty(2 * B * T, dtype=torch.long, device="cuda")
|
| 462 |
-
cpu_inputs = cpu_buffer[: B * T].view(B, T)
|
| 463 |
-
cpu_targets = cpu_buffer[B * T :].view(B, T)
|
| 464 |
-
inputs = gpu_buffer[: B * T].view(B, T)
|
| 465 |
-
targets = gpu_buffer[B * T :].view(B, T)
|
| 466 |
-
idx = 0
|
| 467 |
-
epoch = 1
|
| 468 |
-
while True:
|
| 469 |
-
if idx + B > n_rows:
|
| 470 |
-
idx = 0
|
| 471 |
-
epoch += 1
|
| 472 |
-
batch = torch.from_numpy(data[idx:idx + B].astype(np.int64, copy=True))
|
| 473 |
-
idx += B
|
| 474 |
-
cpu_inputs.copy_(batch[:, :-1])
|
| 475 |
-
cpu_targets.copy_(batch[:, 1:])
|
| 476 |
-
gpu_buffer.copy_(cpu_buffer, non_blocking=True)
|
| 477 |
yield inputs, targets, epoch
|
| 478 |
|
| 479 |
|
|
@@ -511,22 +462,24 @@ def evaluate_bpb(model, tokenizer, B: int) -> float:
|
|
| 511 |
return total_nats / (math.log(2) * max(total_bytes, 1))
|
| 512 |
|
| 513 |
|
| 514 |
-
def ensure_tokenizer():
|
| 515 |
"""Ensure rustbpe tokenizer exists. If absent, train on a Nemotron stream
|
| 516 |
sample using the same rustbpe.train_from_iterator API that prepare.py uses
|
| 517 |
(production path — don't fork tokenizer training logic).
|
| 518 |
"""
|
| 519 |
import pickle
|
| 520 |
import torch
|
| 521 |
-
path = os.path.join(_p.TOKENIZER_DIR, "tokenizer.pkl")
|
| 522 |
-
token_bytes_path = os.path.join(_p.TOKENIZER_DIR, "token_bytes.pt")
|
| 523 |
-
if os.path.exists(path) and os.path.exists(token_bytes_path):
|
| 524 |
-
print(f"[nemotron] tokenizer + token_bytes already trained at {_p.TOKENIZER_DIR}", flush=True)
|
| 525 |
-
return
|
| 526 |
-
os.
|
|
|
|
|
|
|
| 527 |
print(f"[nemotron] training BPE (vocab_size={_p.VOCAB_SIZE}) on stream sample…", flush=True)
|
| 528 |
-
import rustbpe
|
| 529 |
-
import tiktoken
|
| 530 |
|
| 531 |
# Pull a sample of docs — use full blend if active so BPE covers all 7 sources.
|
| 532 |
n_docs = int(os.environ.get("HYDRA_BPE_TRAIN_DOCS", "20000"))
|
|
@@ -542,7 +495,8 @@ def ensure_tokenizer():
|
|
| 542 |
print(f"[nemotron] collected {len(sample_texts)} sample docs; training BPE…", flush=True)
|
| 543 |
|
| 544 |
# Train rustbpe — identical API to prepare.py's train_tokenizer().
|
| 545 |
-
|
|
|
|
| 546 |
vocab_size_no_special = _p.VOCAB_SIZE - len(_p.SPECIAL_TOKENS)
|
| 547 |
tokenizer.train_from_iterator(iter(sample_texts), vocab_size_no_special, pattern=_p.SPLIT_PATTERN)
|
| 548 |
|
|
@@ -567,6 +521,7 @@ def ensure_tokenizer():
|
|
| 567 |
for token_id in range(enc.n_vocab):
|
| 568 |
tstr = enc.decode([token_id])
|
| 569 |
token_bytes_list.append(0 if tstr in special_set else len(tstr.encode("utf-8")))
|
| 570 |
-
token_bytes_tensor = torch.tensor(token_bytes_list, dtype=torch.int32)
|
| 571 |
-
torch.save(token_bytes_tensor, token_bytes_path)
|
| 572 |
-
print(f"[nemotron] BPE + token_bytes saved to {_p.TOKENIZER_DIR}", flush=True)
|
|
|
|
|
|
| 20 |
"""
|
| 21 |
from __future__ import annotations
|
| 22 |
|
| 23 |
+
import os
|
| 24 |
+
import random
|
| 25 |
+
import importlib
|
| 26 |
+
from itertools import cycle
|
| 27 |
+
from typing import Any, Iterator, cast
|
| 28 |
|
|
|
|
| 29 |
import torch
|
| 30 |
|
| 31 |
+
import prepare as _p # reuse tokenizer, BOS, byte-length helpers
|
| 32 |
|
| 33 |
NEMOTRON_REPO = "nvidia/Nemotron-Pretraining-Specialized-v1.1"
|
| 34 |
|
|
|
|
| 37 |
# Keys are logical dataset names used by _open_blend_stream / _open_stream.
|
| 38 |
# ---------------------------------------------------------------------------
|
| 39 |
FULL_BLEND_WEIGHTS: dict[str, float] = {
|
| 40 |
+
"fineweb-edu": 0.35, # HuggingFaceFW/fineweb-edu
|
| 41 |
+
"fineweb": 0.15, # HuggingFaceFW/fineweb (sample-100BT)
|
| 42 |
+
"stack-v2": 0.15, # bigcode/the-stack-v2
|
| 43 |
+
"nemotron-math": 0.10, # nvidia/Nemotron-CC-Math-v1
|
| 44 |
+
"nemotron-specialized": 0.10, # nvidia/Nemotron-Pretraining-Specialized-v1.1
|
| 45 |
+
"wikipedia": 0.08, # olm/wikipedia
|
| 46 |
+
"cosmopedia": 0.07, # HuggingFaceTB/cosmopedia
|
|
|
|
| 47 |
}
|
| 48 |
|
| 49 |
# Mapping from logical blend name → (HF repo, optional config/name, text column).
|
|
|
|
| 65 |
"Nemotron-Pretraining-Formal-Logic": 0.20,
|
| 66 |
"Nemotron-Pretraining-Multiple-Choice": 0.20,
|
| 67 |
}
|
| 68 |
+
PHASE2_WEIGHTS = {
|
| 69 |
"Nemotron-Pretraining-Multiple-Choice": 0.45,
|
| 70 |
"Nemotron-Pretraining-Economics": 0.20,
|
| 71 |
"Nemotron-Pretraining-Formal-Logic": 0.15,
|
| 72 |
"Nemotron-Pretraining-Code-Concepts": 0.10,
|
| 73 |
"Nemotron-Pretraining-Unconditional-Algorithmic": 0.10,
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
type StreamBatch = tuple[list[str], int]
|
| 77 |
+
type TokenBatch = tuple[list[list[int]], int]
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def _tokenizer_cache_repo() -> str:
|
| 81 |
+
return (
|
| 82 |
+
os.environ.get("HYDRA_TOKENIZER_CACHE_REPO")
|
| 83 |
+
or os.environ.get("FEATHER_HF_OUTPUT_REPO")
|
| 84 |
+
or os.environ.get("HF_REPO_ID")
|
| 85 |
+
or os.environ.get("HYDRA_RETINA_CACHE_REPO")
|
| 86 |
+
or os.environ.get("FEATHER_HF_RETINA_CACHE_REPO")
|
| 87 |
+
or ""
|
| 88 |
+
)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _tokenizer_cache_prefix() -> str:
    """Return the in-repo folder that holds the cached tokenizer artifacts.

    The folder name embeds ``_p.VOCAB_SIZE``, so the path differs per
    configured vocabulary size.
    """
    vocab_size = _p.VOCAB_SIZE
    return f"tokenizer/vocab{vocab_size}"
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def maybe_hydrate_tokenizer_cache() -> bool:
    """Try to download tokenizer artifacts from HF cache storage.

    Fetches ``tokenizer.pkl`` and ``token_bytes.pt`` from the cache repo
    (see ``_tokenizer_cache_repo``) under ``_tokenizer_cache_prefix()`` and
    places them directly inside ``_p.TOKENIZER_DIR``.

    Returns:
        True when both files were downloaded and placed; False when no repo
        or ``HF_TOKEN`` is configured, ``huggingface_hub`` cannot be
        imported, or any download/copy step fails. Failures are logged and
        never raised — this is a best-effort cache.
    """
    import shutil  # local import: only needed on the hydrate path

    repo_id = _tokenizer_cache_repo()
    token = os.environ.get("HF_TOKEN")
    if not repo_id or not token:
        return False

    try:
        from huggingface_hub import hf_hub_download
    except Exception as e:  # noqa: BLE001
        print(f"[nemotron] tokenizer cache unavailable: {type(e).__name__}: {e}", flush=True)
        return False

    os.makedirs(_p.TOKENIZER_DIR, exist_ok=True)
    prefix = _tokenizer_cache_prefix()
    try:
        for filename in ("tokenizer.pkl", "token_bytes.pt"):
            downloaded = hf_hub_download(
                repo_id=repo_id,
                repo_type="model",
                subfolder=prefix,
                filename=filename,
                token=token,
                local_dir=_p.TOKENIZER_DIR,
            )
            # With `local_dir`, hf_hub_download mirrors the repo layout, so
            # the file lands at TOKENIZER_DIR/<prefix>/<filename>. Callers
            # look for TOKENIZER_DIR/<filename>, so copy it to the flat
            # location; otherwise the cache hit is never observed.
            target = os.path.join(_p.TOKENIZER_DIR, filename)
            if os.path.abspath(downloaded) != os.path.abspath(target):
                shutil.copyfile(downloaded, target)
    except Exception as e:  # noqa: BLE001
        print(f"[nemotron] tokenizer cache miss in {repo_id}/{prefix}: {type(e).__name__}: {e}", flush=True)
        return False

    print(f"[nemotron] hydrated tokenizer cache from {repo_id}/{prefix}", flush=True)
    return True
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def upload_tokenizer_cache() -> None:
    """Upload tokenizer artifacts for reuse by future jobs.

    Does nothing when no cache repo or ``HF_TOKEN`` is configured, or when
    either ``tokenizer.pkl`` or ``token_bytes.pt`` is missing from
    ``_p.TOKENIZER_DIR``. Any error during import or upload is logged and
    swallowed: the upload is best-effort.
    """
    repo_id = _tokenizer_cache_repo()
    token = os.environ.get("HF_TOKEN")
    if not (repo_id and token):
        return

    path = os.path.join(_p.TOKENIZER_DIR, "tokenizer.pkl")
    token_bytes_path = os.path.join(_p.TOKENIZER_DIR, "token_bytes.pt")
    if not os.path.exists(path) or not os.path.exists(token_bytes_path):
        return

    try:
        from huggingface_hub import HfApi
        api = HfApi(token=token)
        prefix = _tokenizer_cache_prefix()
        # Same order as before: tokenizer first, then the byte-length table.
        artifacts = ((path, "tokenizer.pkl"), (token_bytes_path, "token_bytes.pt"))
        for local_file, remote_name in artifacts:
            api.upload_file(
                path_or_fileobj=local_file,
                path_in_repo=f"{prefix}/{remote_name}",
                repo_id=repo_id,
                repo_type="model",
            )
        print(f"[nemotron] uploaded tokenizer cache to {repo_id}/{prefix}", flush=True)
    except Exception as e:  # noqa: BLE001
        print(f"[nemotron] tokenizer cache upload skipped: {type(e).__name__}: {e}", flush=True)
|
| 156 |
|
| 157 |
|
| 158 |
def _phase_weights() -> dict[str, float]:
|
|
|
|
| 163 |
return PHASE2_WEIGHTS if phase == "phase2" else PHASE1_WEIGHTS
|
| 164 |
|
| 165 |
|
| 166 |
+
def _open_stream(config: str, split: str):
    """Open a shuffled streaming iterator over one dataset config.

    Two lookup modes:
      * ``config`` is a key of ``_BLEND_REGISTRY`` (full-blend logical name
        such as "fineweb-edu"): repo and optional sub-config come from the
        registry entry. "nemotron-specialized" is special-cased to the
        "Nemotron-Pretraining-Code-Concepts" sub-config of NEMOTRON_REPO.
      * otherwise ``config`` is treated as a Nemotron sub-config name and
        loaded from NEMOTRON_REPO (legacy Phase 1 / Phase 2 path).

    ``split`` is accepted for interface symmetry but is not used here:
    every dataset is opened with split="train".

    Returns an iterator of row dicts; text extraction is handled downstream
    by ``_extract_text``. The shuffle buffer size is read from
    HYDRA_STREAM_SHUFFLE_BUFFER (default 2048); the shuffle seed is fixed.
    """
    load_dataset = importlib.import_module("datasets").load_dataset
    token = os.environ.get("HF_TOKEN")
    shuffle_buf = int(os.environ.get("HYDRA_STREAM_SHUFFLE_BUFFER", "2048"))

    if config not in _BLEND_REGISTRY:
        # Legacy Nemotron sub-config path (Phase 1 / Phase 2).
        ds = load_dataset(
            NEMOTRON_REPO,
            config,
            split="train",
            streaming=True,
            token=token,
        )
    else:
        repo, name, _text_col = _BLEND_REGISTRY[config]
        kwargs: dict[str, object] = {
            "split": "train",
            "streaming": True,
            "token": token,
        }
        if name is not None:
            kwargs["name"] = name
        # nemotron-specialized has multiple sub-configs; pick the first one
        # (diversity blend) when accessed via the full-blend path.
        if config == "nemotron-specialized":
            kwargs["name"] = "Nemotron-Pretraining-Code-Concepts"
            repo = NEMOTRON_REPO
        ds = load_dataset(repo, **kwargs)

    shuffled = ds.shuffle(seed=42, buffer_size=shuffle_buf)
    return iter(shuffled)
|
| 207 |
|
| 208 |
|
| 209 |
+
def _extract_text(row: dict[str, object]) -> str:
|
| 210 |
"""Pick the right text column — datasets have different column names.
|
| 211 |
|
| 212 |
Priority order: text, content, prompt_completion, question, body.
|
| 213 |
For math datasets that split into problem+solution, concatenate both.
|
| 214 |
Fallback: concatenate all string-valued fields.
|
| 215 |
"""
|
| 216 |
+
# Fast path: most datasets use "text" or "content".
|
| 217 |
+
for k in ("text", "content", "prompt_completion", "question", "body"):
|
| 218 |
+
value = row.get(k)
|
| 219 |
+
if isinstance(value, str) and value:
|
| 220 |
+
return value
|
| 221 |
# Math datasets may have problem + solution as separate fields.
|
| 222 |
if "problem" in row and "solution" in row:
|
| 223 |
p = row["problem"] or ""
|
|
|
|
| 233 |
return "\n".join(parts)
|
| 234 |
|
| 235 |
|
| 236 |
+
class _WeightedStream:
|
| 237 |
"""Infinite weighted-round-robin over configs' streaming iterators."""
|
| 238 |
|
| 239 |
+
def __init__(self, weights: dict[str, float], seed: int = 0):
|
| 240 |
+
self.configs = list(weights.keys())
|
| 241 |
+
self.weights = [weights[c] for c in self.configs]
|
| 242 |
+
self.streams: dict[str, Iterator[dict[str, object]]] = {
|
| 243 |
+
c: _open_stream(c, "train") for c in self.configs
|
| 244 |
+
}
|
| 245 |
+
self.rng = random.Random(seed)
|
| 246 |
+
self.epoch = 1
|
| 247 |
+
self._factual_docs: list[str] | None = None
|
| 248 |
+
self._factual_idx = 0
|
| 249 |
+
self._inject_counter = 0
|
| 250 |
|
| 251 |
def _reopen(self, config: str):
|
| 252 |
# stream exhausted — reopen (HF streaming typically infinite but restart on edge)
|
|
|
|
| 262 |
# exist in the Nemotron configs. Controlled by HYDRA_FACTUAL_INJECT_RATE
|
| 263 |
# (default 50 = inject one factual doc every 50 Nemotron docs = ~2%).
|
| 264 |
inject_rate = int(os.environ.get("HYDRA_FACTUAL_INJECT_RATE", "50"))
|
| 265 |
+
if inject_rate > 0 and self._factual_docs is None:
|
| 266 |
+
factual_path = os.path.join(
|
| 267 |
+
os.path.dirname(os.path.abspath(__file__)), "data", "factual", "facts.txt")
|
| 268 |
+
if os.path.exists(factual_path):
|
| 269 |
+
self._factual_docs = open(factual_path).read().strip().split('\n')
|
| 270 |
+
self._factual_idx = 0
|
| 271 |
+
self._inject_counter = 0
|
| 272 |
+
if inject_rate > 0 and self._factual_docs:
|
| 273 |
+
self._inject_counter += 1
|
| 274 |
+
if self._inject_counter >= inject_rate:
|
| 275 |
+
self._inject_counter = 0
|
| 276 |
+
doc = self._factual_docs[self._factual_idx % len(self._factual_docs)]
|
| 277 |
+
self._factual_idx += 1
|
| 278 |
+
return doc, self.epoch
|
|
|
|
|
|
|
| 279 |
|
| 280 |
config = self.rng.choices(self.configs, weights=self.weights, k=1)[0]
|
| 281 |
try:
|
|
|
|
| 308 |
stream = _WeightedStream(_phase_weights(), seed=0)
|
| 309 |
|
| 310 |
prefetch_depth = int(os.environ.get("HYDRA_STREAM_PREFETCH", "32"))
|
| 311 |
+
q: queue.Queue[StreamBatch | object] = queue.Queue(maxsize=prefetch_depth)
|
| 312 |
+
sentinel_stop = object()
|
| 313 |
+
error_box: list[BaseException] = []
|
| 314 |
|
| 315 |
def producer():
|
| 316 |
try:
|
|
|
|
| 335 |
if error_box:
|
| 336 |
raise error_box[0]
|
| 337 |
return
|
| 338 |
+
yield cast(StreamBatch, item)
|
| 339 |
|
| 340 |
|
| 341 |
def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 1000):
|
|
|
|
| 346 |
stage 2: BPE tokenization → token-id lists (this function's producer thread)
|
| 347 |
stage 3: best-fit packing → (B, T+1) tensor rows (main thread, consumes)
|
| 348 |
|
| 349 |
+
Queue depths tunable via HYDRA_STREAM_PREFETCH and HYDRA_TOKEN_PREFETCH.
|
| 350 |
+
Goal: zero tps loss from I/O or tokenizer overhead — training loop pulls
|
| 351 |
+
from an always-full queue.
|
|
|
|
|
|
|
| 352 |
"""
|
| 353 |
import queue
|
| 354 |
import threading
|
| 355 |
|
| 356 |
assert split in ("train", "val")
|
| 357 |
row_capacity = T + 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
batches = _document_batches(split)
|
| 359 |
+
bos_token = tokenizer.get_bos_token_id()
|
| 360 |
|
| 361 |
# Stage 2: tokenization prefetch thread. Each queue element is a list of
|
| 362 |
# token-id lists (pre-tokenized docs). HYDRA_TOKEN_PREFETCH controls depth.
|
| 363 |
tok_prefetch = int(os.environ.get("HYDRA_TOKEN_PREFETCH", "8"))
|
| 364 |
+
tok_q: queue.Queue[TokenBatch | object] = queue.Queue(maxsize=tok_prefetch)
|
| 365 |
+
tok_sentinel = object()
|
| 366 |
+
tok_err_box: list[BaseException] = []
|
| 367 |
|
| 368 |
def tokenizer_producer():
|
| 369 |
try:
|
|
|
|
| 387 |
if tok_err_box:
|
| 388 |
raise tok_err_box[0]
|
| 389 |
raise StopIteration
|
| 390 |
+
token_lists, epoch = cast(TokenBatch, item)
|
| 391 |
+
doc_buffer.extend(token_lists)
|
| 392 |
|
| 393 |
row_buffer = torch.empty((B, row_capacity), dtype=torch.long)
|
| 394 |
cpu_buffer = torch.empty(2 * B * T, dtype=torch.long, pin_memory=True)
|
|
|
|
| 398 |
inputs = gpu_buffer[: B * T].view(B, T)
|
| 399 |
targets = gpu_buffer[B * T :].view(B, T)
|
| 400 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 401 |
while True:
|
| 402 |
for row_idx in range(B):
|
| 403 |
pos = 0
|
|
|
|
| 425 |
cpu_inputs.copy_(row_buffer[:, :-1])
|
| 426 |
cpu_targets.copy_(row_buffer[:, 1:])
|
| 427 |
gpu_buffer.copy_(cpu_buffer, non_blocking=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
yield inputs, targets, epoch
|
| 429 |
|
| 430 |
|
|
|
|
| 462 |
return total_nats / (math.log(2) * max(total_bytes, 1))
|
| 463 |
|
| 464 |
|
| 465 |
+
def ensure_tokenizer():
|
| 466 |
"""Ensure rustbpe tokenizer exists. If absent, train on a Nemotron stream
|
| 467 |
sample using the same rustbpe.train_from_iterator API that prepare.py uses
|
| 468 |
(production path — don't fork tokenizer training logic).
|
| 469 |
"""
|
| 470 |
import pickle
|
| 471 |
import torch
|
| 472 |
+
path = os.path.join(_p.TOKENIZER_DIR, "tokenizer.pkl")
|
| 473 |
+
token_bytes_path = os.path.join(_p.TOKENIZER_DIR, "token_bytes.pt")
|
| 474 |
+
if os.path.exists(path) and os.path.exists(token_bytes_path):
|
| 475 |
+
print(f"[nemotron] tokenizer + token_bytes already trained at {_p.TOKENIZER_DIR}", flush=True)
|
| 476 |
+
return
|
| 477 |
+
if maybe_hydrate_tokenizer_cache() and os.path.exists(path) and os.path.exists(token_bytes_path):
|
| 478 |
+
return
|
| 479 |
+
os.makedirs(_p.TOKENIZER_DIR, exist_ok=True)
|
| 480 |
print(f"[nemotron] training BPE (vocab_size={_p.VOCAB_SIZE}) on stream sample…", flush=True)
|
| 481 |
+
import rustbpe
|
| 482 |
+
import tiktoken
|
| 483 |
|
| 484 |
# Pull a sample of docs — use full blend if active so BPE covers all 7 sources.
|
| 485 |
n_docs = int(os.environ.get("HYDRA_BPE_TRAIN_DOCS", "20000"))
|
|
|
|
| 495 |
print(f"[nemotron] collected {len(sample_texts)} sample docs; training BPE…", flush=True)
|
| 496 |
|
| 497 |
# Train rustbpe — identical API to prepare.py's train_tokenizer().
|
| 498 |
+
tokenizer_cls = getattr(rustbpe, "Tokenizer")
|
| 499 |
+
tokenizer: Any = tokenizer_cls()
|
| 500 |
vocab_size_no_special = _p.VOCAB_SIZE - len(_p.SPECIAL_TOKENS)
|
| 501 |
tokenizer.train_from_iterator(iter(sample_texts), vocab_size_no_special, pattern=_p.SPLIT_PATTERN)
|
| 502 |
|
|
|
|
| 521 |
for token_id in range(enc.n_vocab):
|
| 522 |
tstr = enc.decode([token_id])
|
| 523 |
token_bytes_list.append(0 if tstr in special_set else len(tstr.encode("utf-8")))
|
| 524 |
+
token_bytes_tensor = torch.tensor(token_bytes_list, dtype=torch.int32)
|
| 525 |
+
torch.save(token_bytes_tensor, token_bytes_path)
|
| 526 |
+
print(f"[nemotron] BPE + token_bytes saved to {_p.TOKENIZER_DIR}", flush=True)
|
| 527 |
+
upload_tokenizer_cache()
|
overlay/pyproject.toml
CHANGED
|
@@ -7,6 +7,7 @@ requires-python = ">=3.11"
|
|
| 7 |
dependencies = [
|
| 8 |
"matplotlib>=3.10.8",
|
| 9 |
"numpy>=2.2.6",
|
|
|
|
| 10 |
"pandas>=2.3.3",
|
| 11 |
"pyarrow>=21.0.0",
|
| 12 |
"requests>=2.32.0",
|
|
|
|
| 7 |
dependencies = [
|
| 8 |
"matplotlib>=3.10.8",
|
| 9 |
"numpy>=2.2.6",
|
| 10 |
+
"optuna>=4.4.0",
|
| 11 |
"pandas>=2.3.3",
|
| 12 |
"pyarrow>=21.0.0",
|
| 13 |
"requests>=2.32.0",
|
overlay/scripts/autoresearch_iter.sh
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Autoresearch single-iteration runner — called from cron every 5 min.
#
# Philosophy (Apr 22 2026 rewrite): HYDRA is NOT a transformer. Semantic
# folding (SDR retina) + HTM episodic engram + GDN memory layers provide
# enormous latent capacity at tiny d_model. DEPTH > WIDTH. Per the user's
# guidance, start absolute-smallest, fill VRAM with depth.
#
# Base config: d_model=128, n_layer=16 (~60M params). Mutations explore
# deeper stacks, engram/GDN layout, SDR sparsity. Eval OOM fixed via
# HYDRA_EVAL_BATCH=1 + HYDRA_CE_CHUNK=64 (was =1024 = no chunking).
#
# NOTE(review): the values quoted in the comments of this file (base config
# above, mutation-pool "Base:" line, "KEY CHANGES" list) do not all match the
# env block passed to train.py below (e.g. D_MODEL=160, N_LAYER=20,
# BATCH_SIZE=8, CE_CHUNK=32, EVAL_TOKENS=8192). The env block is what
# actually runs — confirm which side is stale.
#
# Flow: skip if a training process or the STOP file exists -> pick the next
# experiment index from results.tsv -> pick a mutation -> run train.py ->
# parse metrics -> append one row to results.tsv -> maybe create STOP file.

set -u
REPO=/home/mikeb/work/feather
RESULTS=$REPO/results.tsv
LOG_DIR=$REPO/.omc/autoresearch_logs
mkdir -p "$LOG_DIR"
ITER_LOG=$LOG_DIR/iter_$(date +%Y%m%d_%H%M%S).log
cd "$REPO"

# Skip if training already running — check the actual python process, not shells
# whose argv merely contains the pattern string (e.g. pgrep wait-loops).
if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {found=1} END {exit !found}'; then
    echo "[$(date +%H:%M:%S)] skip — training already running" >> "$LOG_DIR/skips.log"
    exit 0
fi

# Skip if stop-file exists
if [ -f "$REPO/.omc/autoresearch_STOP" ]; then
    echo "[$(date +%H:%M:%S)] STOPPED — .omc/autoresearch_STOP exists" >> "$LOG_DIR/skips.log"
    exit 0
fi

# Compute next experiment index from results.tsv
if [ ! -f "$RESULTS" ]; then
    printf "experiment\tcommit\tval_bpb\ttps_avg\tfactual\tstatus\tdescription\n" > "$RESULTS"
fi
NEXT_EXP=$(awk -F'\t' 'NR>1 && $1~/^[0-9]+$/ {if ($1+0 > max) max=$1+0} END {print max+1}' "$RESULTS")
[ -z "$NEXT_EXP" ] && NEXT_EXP=1

# Mutation pool — explores deep+narrow regime.
# Base: d_model=128, n_layer=16, expand=3, d_state=64, engram=8192, B=16, seq=1024, GDN@5,11
# Entry format: "<description>|<space-separated VAR=value overrides>".
MUTATIONS=(
    "baseline-deep-narrow|"
    "n_layer=16 (shallower-control)|HYDRA_N_LAYER=16"
    "n_layer=24 (max depth)|HYDRA_N_LAYER=24"
    "d_model=96 (leaner)|HYDRA_D_MODEL=96"
    "d_model=160 (slightly wider)|HYDRA_D_MODEL=160"
    "GDN_LAYERS=0,3,6,9,12,15,18 (7 GDN)|HYDRA_GDN_LAYERS=0,3,6,9,12,15,18"
    "GDN_LAYERS=1,3,5,7,9,11,13,15,17 (9 GDN)|HYDRA_GDN_LAYERS=1,3,5,7,9,11,13,15,17"
    "GDN_LAYERS= (all-Mamba3 depth)|HYDRA_GDN_LAYERS="
    "D_STATE=128 (fatter SSM state)|HYDRA_D_STATE=128"
    "D_STATE=32 (leaner SSM state)|HYDRA_D_STATE=32"
    "EXPAND=2 (leaner FFN)|HYDRA_EXPAND=2"
    "EXPAND=4 (fatter FFN)|HYDRA_EXPAND=4"
    "engram=32768 (even wider)|HYDRA_ENGRAM_N_COLUMNS=32768"
    "engram_topk=128 (denser retrieve)|HYDRA_ENGRAM_TOPK=128"
    "D_STATE=96 (mid SSM)|HYDRA_D_STATE=96"
    "HTM_SUBSAMPLE=64 (2x HTM)|HYDRA_HTM_SUBSAMPLE=64"
    "batch=16 (fill VRAM)|HYDRA_BATCH_SIZE=16"
    "batch=4 seq=2048 (long-range)|HYDRA_BATCH_SIZE=4 HYDRA_SEQ_LEN=2048"
    "MATRIX_LR=0.18|HYDRA_MATRIX_LR=0.18"
    "WARMUP_RATIO=0.05|HYDRA_WARMUP_RATIO=0.05"
    "total_batch=16384 (2x opt steps)|HYDRA_TOTAL_BATCH=16384"
    "total_batch=8192 (4x opt steps)|HYDRA_TOTAL_BATCH=8192"
    "HEADDIM=64 (bigger heads)|HYDRA_HEADDIM=64"
    "engram_layer_idx=8 (mid-stack)|HYDRA_ENGRAM_LAYER_IDX=8"
    "EXPAND=4 + n_layer=20 (fat+deep)|HYDRA_EXPAND=4 HYDRA_N_LAYER=20"
    "B=16 + total_batch=16384|HYDRA_BATCH_SIZE=16 HYDRA_TOTAL_BATCH=16384"
    "engram=32768 + EXPAND=4|HYDRA_ENGRAM_N_COLUMNS=32768 HYDRA_EXPAND=4"
    "MTP_K=2 + HEADDIM=64|HYDRA_MTP_K=2 HYDRA_HEADDIM=64"
    "label_smoothing=0.1|HYDRA_LABEL_SMOOTHING=0.1"
    "z_loss=0.001 (10x)|HYDRA_Z_LOSS_WEIGHT=0.001"
    "HTM_STOP_GRAD=1|HYDRA_HTM_STOP_GRAD=1"
    "DROPOUT=0.0|HYDRA_DROPOUT=0.0"
    "TIME=900s long-budget champion|HYDRA_TIME_BUDGET=900 HYDRA_ENGRAM_N_COLUMNS=32768 HYDRA_EXPAND=4"
    "TIME=1200s deep n_layer=24|HYDRA_TIME_BUDGET=1200 HYDRA_N_LAYER=24"
)

# Index into mutation pool (wrap around for continuous search, start at exp13)
# Bash `%` keeps the sign of the dividend, so experiments < 13 clamp to 0.
MUT_IDX=$(( (NEXT_EXP - 13) % ${#MUTATIONS[@]} ))
[ "$MUT_IDX" -lt 0 ] && MUT_IDX=0

IFS='|' read -r DESC EXTRA_ENV <<< "${MUTATIONS[$MUT_IDX]}"
echo "[$(date +%H:%M:%S)] Starting exp $NEXT_EXP: $DESC" >> "$ITER_LOG"

# Launch training with mutation
# KEY CHANGES vs prior iter:
#   d_model 384→128 (3x narrower)
#   n_layer 10→16 (1.6x deeper)
#   batch 8→16 (fill VRAM)
#   CE_CHUNK 1024→64 (16x smaller eval logit chunks — fixes OOM)
#   EVAL_BATCH 2→1 (halve eval memory)
#   EVAL_TOKENS 131K (keep, ~3-4s eval)
#
# $EXTRA_ENV is intentionally unquoted: it must word-split into separate
# VAR=value arguments for `env`, and later assignments override earlier ones.
rm -f run.log
env \
    LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
    PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
    HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
    HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
    HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
    HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
    HYDRA_TIME_BUDGET=600 \
    HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
    HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
    HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
    HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
    HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
    HYDRA_CKPT_INTERVAL=0 HYDRA_MID_VAL_INTERVAL=0 \
    HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
    HYDRA_Z_LOSS_WEIGHT=0.001 \
    HYDRA_RESUME_CKPT=none \
    $EXTRA_ENV \
    ./.venv/bin/python -u train.py > run.log 2>&1
STATUS=$?

# Parse metrics
# The fallback must be TAB-separated: `cut -f` splits on tabs and passes a
# line without any tab through unchanged, so a space-separated "NA NA NA"
# would put the whole string in every field and defeat the "NA" checks below.
METRICS=$(./.venv/bin/python scripts/parse_metrics.py run.log 2>/dev/null || printf 'NA\tNA\tNA\n')
VAL_BPB=$(echo "$METRICS" | cut -f1)
TPS=$(echo "$METRICS" | cut -f2)
FACTUAL=$(echo "$METRICS" | cut -f3)
COMMIT=$(git rev-parse --short HEAD)
# BPB can be: "NA" (parse fail), "~X.XXXX" (train_bpb fallback when eval OOMs),
# or "X.XXXX" (real val_bpb). The ~ prefix marks the fallback.
if [ "$STATUS" -ne 0 ]; then
    STATUS_STR="crash"
elif [ "$VAL_BPB" = "NA" ]; then
    STATUS_STR="no_metrics"
elif [[ "$VAL_BPB" == ~* ]]; then
    STATUS_STR="train_bpb"
else
    STATUS_STR="ok"
fi
printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" "$NEXT_EXP" "$COMMIT" "$VAL_BPB" "$TPS" "$FACTUAL" "$STATUS_STR" "$DESC" >> "$RESULTS"
echo "[$(date +%H:%M:%S)] Done exp $NEXT_EXP: bpb=$VAL_BPB tps=$TPS factual=$FACTUAL status=$STATUS_STR" >> "$ITER_LOG"

# Auto-stop condition: great result
# FACTUAL is expected as "<hits>/<total>"; non-numeric hits make the -ge test
# error out (silenced) and are treated as "not reached".
if [ "$FACTUAL" != "NA" ]; then
    HITS=$(echo "$FACTUAL" | cut -d/ -f1)
    if [ -n "$HITS" ] && [ "$HITS" -ge 7 ] 2>/dev/null; then
        touch "$REPO/.omc/autoresearch_STOP"
        echo "[$(date +%H:%M:%S)] STOP: reached factual>=7/9 at exp $NEXT_EXP" >> "$ITER_LOG"
    fi
fi
|
overlay/scripts/benchmark_hyena_stack.py
CHANGED
|
@@ -26,8 +26,11 @@ Invocation:
|
|
| 26 |
# On A100/A10G (production cloud hardware), use time=900 (15 min) for
|
| 27 |
# stable steady-state numbers.
|
| 28 |
|
| 29 |
-
After each run the script prints:
|
| 30 |
-
BENCHMARK config=<name> tps_steady=<avg> bpb_at_500=<val> vram_peak=<MiB>
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
Collate those lines into the matrix table manually, then pick the winner
|
| 33 |
for the 6-hour production run (HYDRA_TIME_BUDGET=21600).
|
|
@@ -81,7 +84,7 @@ CONFIGS = {
|
|
| 81 |
}
|
| 82 |
|
| 83 |
|
| 84 |
-
def build_env(cfg_overrides: dict) -> dict:
|
| 85 |
"""Compose a full env dict from the inherited env + config overrides."""
|
| 86 |
env = os.environ.copy()
|
| 87 |
# Ensure the Hyena layer selection is always present (defaults to off).
|
|
@@ -91,7 +94,7 @@ def build_env(cfg_overrides: dict) -> dict:
|
|
| 91 |
return env
|
| 92 |
|
| 93 |
|
| 94 |
-
def parse_step_line(line: str) -> dict | None:
|
| 95 |
"""Parse a single step=... line into a dict of metrics, or None."""
|
| 96 |
if not line.startswith("step="):
|
| 97 |
return None
|
|
@@ -102,7 +105,7 @@ def parse_step_line(line: str) -> dict | None:
|
|
| 102 |
return None
|
| 103 |
|
| 104 |
|
| 105 |
-
def summarize(log_path: Path, warmup_steps: int = 50) -> dict:
|
| 106 |
"""Tail log_path, compute steady-state TPS / BPB@500 / VRAM peak.
|
| 107 |
|
| 108 |
Skips the first `warmup_steps` to discard CUDA graph capture / autotune
|
|
@@ -138,20 +141,29 @@ def summarize(log_path: Path, warmup_steps: int = 50) -> dict:
|
|
| 138 |
tps_sorted = sorted(tps_vals)
|
| 139 |
tps_steady = tps_sorted[len(tps_sorted) // 2] # median
|
| 140 |
|
| 141 |
-
return {
|
| 142 |
-
"tps_steady": tps_steady,
|
| 143 |
-
"bpb_at_500": bpb_at_500 or (bpbs[-1] if bpbs else 0.0),
|
| 144 |
-
"vram_peak": vram_peak,
|
| 145 |
-
"steps": len(tps_vals) + warmup_steps,
|
| 146 |
-
}
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
def
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
cfg = CONFIGS[args.config]
|
| 157 |
log_path = Path(args.log or (REPO / f"run_bench_{args.config}.log"))
|
|
@@ -178,16 +190,25 @@ def main() -> int:
|
|
| 178 |
print(f"BENCH FAIL config={args.config}", flush=True)
|
| 179 |
return proc.returncode
|
| 180 |
|
| 181 |
-
summary = summarize(log_path)
|
| 182 |
-
print(
|
| 183 |
-
f"BENCHMARK config={args.config} "
|
| 184 |
-
f"tps_steady={summary['tps_steady']:.0f} "
|
| 185 |
-
f"bpb_at_500={summary['bpb_at_500']:.4f} "
|
| 186 |
-
f"vram_peak={summary['vram_peak']:.0f}MiB "
|
| 187 |
-
f"steps={summary['steps']}",
|
| 188 |
-
flush=True,
|
| 189 |
-
)
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
|
| 193 |
if __name__ == "__main__":
|
|
|
|
| 26 |
# On A100/A10G (production cloud hardware), use time=900 (15 min) for
|
| 27 |
# stable steady-state numbers.
|
| 28 |
|
| 29 |
+
After each run the script prints:
|
| 30 |
+
BENCHMARK config=<name> tps_steady=<median> bpb_at_500=<val> vram_peak=<MiB> steps=<n>
|
| 31 |
+
|
| 32 |
+
If `--min-tps` is set (>0), the script exits non-zero when steady-state TPS
|
| 33 |
+
falls below the threshold.
|
| 34 |
|
| 35 |
Collate those lines into the matrix table manually, then pick the winner
|
| 36 |
for the 6-hour production run (HYDRA_TIME_BUDGET=21600).
|
|
|
|
| 84 |
}
|
| 85 |
|
| 86 |
|
| 87 |
+
def build_env(cfg_overrides: dict[str, str]) -> dict[str, str]:
|
| 88 |
"""Compose a full env dict from the inherited env + config overrides."""
|
| 89 |
env = os.environ.copy()
|
| 90 |
# Ensure the Hyena layer selection is always present (defaults to off).
|
|
|
|
| 94 |
return env
|
| 95 |
|
| 96 |
|
| 97 |
+
def parse_step_line(line: str) -> dict[str, float] | None:
|
| 98 |
"""Parse a single step=... line into a dict of metrics, or None."""
|
| 99 |
if not line.startswith("step="):
|
| 100 |
return None
|
|
|
|
| 105 |
return None
|
| 106 |
|
| 107 |
|
| 108 |
+
def summarize(log_path: Path, warmup_steps: int = 50) -> dict[str, float]:
|
| 109 |
"""Tail log_path, compute steady-state TPS / BPB@500 / VRAM peak.
|
| 110 |
|
| 111 |
Skips the first `warmup_steps` to discard CUDA graph capture / autotune
|
|
|
|
| 141 |
tps_sorted = sorted(tps_vals)
|
| 142 |
tps_steady = tps_sorted[len(tps_sorted) // 2] # median
|
| 143 |
|
| 144 |
+
return {
|
| 145 |
+
"tps_steady": tps_steady,
|
| 146 |
+
"bpb_at_500": bpb_at_500 or (bpbs[-1] if bpbs else 0.0),
|
| 147 |
+
"vram_peak": vram_peak,
|
| 148 |
+
"steps": len(tps_vals) + warmup_steps,
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def fails_tps_floor(summary: dict[str, float], min_tps: float) -> bool:
    """Report whether the steady-state TPS in ``summary`` is below ``min_tps``.

    A non-positive ``min_tps`` disables the check, so the result is always
    ``False``. A summary without a ``"tps_steady"`` entry counts as 0.0 TPS.
    """
    floor_enabled = min_tps > 0
    if floor_enabled:
        measured = float(summary.get("tps_steady", 0.0))
        return measured < float(min_tps)
    return False
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def main() -> int:
|
| 160 |
+
ap = argparse.ArgumentParser()
|
| 161 |
+
ap.add_argument("--config", required=True, choices=list(CONFIGS))
|
| 162 |
+
ap.add_argument("--time", type=int, default=300, help="training seconds")
|
| 163 |
+
ap.add_argument("--log", default=None, help="output log path (default: run_bench_<cfg>.log)")
|
| 164 |
+
ap.add_argument("--min-tps", type=float, default=50000.0, help="Required steady-state TPS floor (set 0 to disable)")
|
| 165 |
+
ap.add_argument("--warmup-steps", type=int, default=50, help="Number of initial steps to skip before TPS median")
|
| 166 |
+
args = ap.parse_args()
|
| 167 |
|
| 168 |
cfg = CONFIGS[args.config]
|
| 169 |
log_path = Path(args.log or (REPO / f"run_bench_{args.config}.log"))
|
|
|
|
| 190 |
print(f"BENCH FAIL config={args.config}", flush=True)
|
| 191 |
return proc.returncode
|
| 192 |
|
| 193 |
+
summary = summarize(log_path, warmup_steps=max(0, int(args.warmup_steps)))
|
| 194 |
+
print(
|
| 195 |
+
f"BENCHMARK config={args.config} "
|
| 196 |
+
f"tps_steady={summary['tps_steady']:.0f} "
|
| 197 |
+
f"bpb_at_500={summary['bpb_at_500']:.4f} "
|
| 198 |
+
f"vram_peak={summary['vram_peak']:.0f}MiB "
|
| 199 |
+
f"steps={summary['steps']}",
|
| 200 |
+
flush=True,
|
| 201 |
+
)
|
| 202 |
+
|
| 203 |
+
if fails_tps_floor(summary, args.min_tps):
|
| 204 |
+
print(
|
| 205 |
+
f"BENCH FAIL config={args.config} tps_steady={summary['tps_steady']:.0f} < min_tps={args.min_tps:.0f}",
|
| 206 |
+
flush=True,
|
| 207 |
+
)
|
| 208 |
+
return 2
|
| 209 |
+
|
| 210 |
+
print(f"BENCH PASS config={args.config} min_tps={args.min_tps:.0f}", flush=True)
|
| 211 |
+
return 0
|
| 212 |
|
| 213 |
|
| 214 |
if __name__ == "__main__":
|
overlay/scripts/export_hpo_priors.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import datetime as dt
|
| 6 |
+
import json
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any
|
| 9 |
+
|
| 10 |
+
import optuna
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def parse_args() -> argparse.Namespace:
    """Define the prior-export command line and parse ``sys.argv``.

    ``--study-name`` may be repeated; each occurrence is appended to a list
    that starts empty.
    """
    cli = argparse.ArgumentParser(description="Export top Optuna trials as transfer-learning priors")
    default_out = Path("docs") / "hpo_transfer_priors.json"
    cli.add_argument("--study-name", action="append", default=[], help="Repeat to merge multiple studies")
    cli.add_argument("--storage", default="sqlite:///optuna_hpo.db")
    cli.add_argument("--top-k", type=int, default=20)
    cli.add_argument("--out", type=Path, default=default_out)
    cli.add_argument("--metric", default="val_bpb")
    return cli.parse_args()
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _completed_trials(study: optuna.Study) -> list[optuna.trial.FrozenTrial]:
    """Return the study's finished trials ranked best-first.

    Only trials in state ``COMPLETE`` that carry an objective value are kept.
    Filtering on ``value is not None`` alone is not enough: Optuna can record
    the last reported intermediate value on a PRUNED trial, and such partial
    results must not be exported as priors.

    The ordering follows the study direction: descending for MAXIMIZE,
    ascending otherwise.
    """
    complete = optuna.trial.TrialState.COMPLETE
    trials = [t for t in study.trials if t.state == complete and t.value is not None]
    reverse = study.direction == optuna.study.StudyDirection.MAXIMIZE
    return sorted(trials, key=lambda t: float(t.value), reverse=reverse)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _serialize_trial(trial: optuna.trial.FrozenTrial) -> dict[str, Any]:
|
| 30 |
+
return {
|
| 31 |
+
"trial_number": trial.number,
|
| 32 |
+
"value": float(trial.value) if trial.value is not None else None,
|
| 33 |
+
"params": dict(trial.params),
|
| 34 |
+
"user_attrs": dict(trial.user_attrs),
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def main() -> int:
    """Export the top-k trials of one or more studies as a JSON priors file.

    Each study named by ``--study-name`` (default: ``hydra_hpo``) is loaded
    from ``--storage`` and ranked by ``_completed_trials``. Its first
    ``--top-k`` trials are serialized and tagged with the study name. The
    merged list and summary counters are written to ``--out``; parent
    directories are created as needed.

    Returns:
        0 on success.
    """
    args = parse_args()
    study_names = args.study_name or ["hydra_hpo"]
    merged_trials: list[dict[str, Any]] = []
    total_trials = 0
    total_completed = 0

    for study_name in study_names:
        study = optuna.load_study(study_name=study_name, storage=args.storage)
        ranked = _completed_trials(study)
        # A negative --top-k is clamped to "export nothing" rather than
        # slicing from the end of the ranking.
        selected = ranked[: max(0, args.top_k)]
        total_trials += len(study.trials)
        total_completed += len(ranked)
        for t in selected:
            row = _serialize_trial(t)
            row["study_name"] = study_name
            merged_trials.append(row)

    payload = {
        "schema_version": 1,
        # dt.timezone.utc instead of dt.UTC: the alias only exists on
        # Python 3.11+, while timezone.utc is the same object everywhere.
        "generated_at": dt.datetime.now(dt.timezone.utc).isoformat(timespec="seconds"),
        "study_names": study_names,
        "metric": args.metric,
        "n_total_trials": total_trials,
        "n_completed_trials": total_completed,
        "top_k_per_study": args.top_k,
        "trials": merged_trials,
    }

    args.out.parent.mkdir(parents=True, exist_ok=True)
    args.out.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    print(f"[hpo-priors] wrote {args.out} with {len(merged_trials)} merged trials")
    return 0
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
if __name__ == "__main__":
|
| 74 |
+
raise SystemExit(main())
|
overlay/scripts/hpo_orchestrator.py
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import subprocess
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Any
|
| 11 |
+
|
| 12 |
+
import optuna
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
REPO_ROOT = Path(__file__).resolve().parents[1]
|
| 16 |
+
if str(REPO_ROOT) not in sys.path:
|
| 17 |
+
sys.path.insert(0, str(REPO_ROOT))
|
| 18 |
+
|
| 19 |
+
from scripts.hf_routing import resolve_routing
|
| 20 |
+
|
| 21 |
+
HPO_SCRIPT = REPO_ROOT / "scripts" / "optuna_hpo.py"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _run_worker(args: list[str]) -> int:
    """Run the HPO script once with ``args`` and return its exit status.

    The script is launched with the current interpreter from the repository
    root, and the call blocks until the child process exits.
    """
    completed = subprocess.run(
        [sys.executable, str(HPO_SCRIPT), *args],
        cwd=str(REPO_ROOT),
        text=True,
    )
    return completed.returncode
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _study_stats(storage: str, study_name: str) -> dict[str, Any]:
    """Summarize an Optuna study as a JSON-friendly dict.

    Args:
        storage: Optuna storage URL.
        study_name: Name of the study to load.

    Returns:
        A dict with the study name, direction and trial counts. When the
        study cannot be loaded (``KeyError``), the dict carries
        ``"status": "missing"`` and zeroed counters. When at least one trial
        completed, the best value, params, trial number and user attrs are
        included as well.
    """
    try:
        study = optuna.load_study(study_name=study_name, storage=storage)
    except KeyError:
        return {
            "study_name": study_name,
            "status": "missing",
            "direction": None,
            "n_trials": 0,
            "n_completed": 0,
            "n_pruned": 0,
            "n_failed": 0,
        }

    # One snapshot, so the counters below are mutually consistent.
    trials = study.trials
    states = optuna.trial.TrialState
    # Count only COMPLETE trials. A PRUNED trial can also carry a value (its
    # last intermediate report), and treating those as completed made the
    # best_* lookups below raise when no trial had actually finished.
    completed = [t for t in trials if t.state == states.COMPLETE and t.value is not None]
    pruned = [t for t in trials if t.state == states.PRUNED]
    failed = [t for t in trials if t.state == states.FAIL]

    stats: dict[str, Any] = {
        "study_name": study.study_name,
        "direction": str(study.direction),
        "n_trials": len(trials),
        "n_completed": len(completed),
        "n_pruned": len(pruned),
        "n_failed": len(failed),
    }
    if completed:
        best = study.best_trial
        stats.update(
            {
                "best_value": best.value,
                "best_params": best.params,
                "best_trial_number": best.number,
                "best_trial_user_attrs": best.user_attrs,
            }
        )
    return stats
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _phase_args(phase: str, base: argparse.Namespace) -> list[str]:
|
| 68 |
+
common = [
|
| 69 |
+
"--study-name",
|
| 70 |
+
base.study_name,
|
| 71 |
+
"--storage",
|
| 72 |
+
base.storage,
|
| 73 |
+
"--metric",
|
| 74 |
+
base.metric,
|
| 75 |
+
"--direction",
|
| 76 |
+
base.direction,
|
| 77 |
+
"--seed",
|
| 78 |
+
str(base.seed),
|
| 79 |
+
"--min-tps",
|
| 80 |
+
str(base.min_tps),
|
| 81 |
+
"--summary-out",
|
| 82 |
+
str(base.summary_out),
|
| 83 |
+
"--runner",
|
| 84 |
+
base.runner,
|
| 85 |
+
"--hf-namespace",
|
| 86 |
+
base.hf_namespace,
|
| 87 |
+
"--hf-image",
|
| 88 |
+
base.hf_image,
|
| 89 |
+
"--hf-flavor",
|
| 90 |
+
base.hf_flavor,
|
| 91 |
+
"--hf-timeout",
|
| 92 |
+
base.hf_timeout,
|
| 93 |
+
"--hf-command",
|
| 94 |
+
base.hf_command,
|
| 95 |
+
"--hf-token-env",
|
| 96 |
+
base.hf_token_env,
|
| 97 |
+
"--hf-poll-interval",
|
| 98 |
+
str(base.hf_poll_interval),
|
| 99 |
+
"--hf-launcher-script",
|
| 100 |
+
str(base.hf_launcher_script),
|
| 101 |
+
"--priors-file",
|
| 102 |
+
str(base.priors_file),
|
| 103 |
+
]
|
| 104 |
+
if base.hf_output_repo:
|
| 105 |
+
common.extend(["--hf-output-repo", base.hf_output_repo])
|
| 106 |
+
if base.hf_use_bash:
|
| 107 |
+
common.append("--hf-use-bash")
|
| 108 |
+
if base.hf_stop_after_metric:
|
| 109 |
+
common.append("--hf-stop-after-metric")
|
| 110 |
+
else:
|
| 111 |
+
common.append("--no-hf-stop-after-metric")
|
| 112 |
+
if base.apply_priors:
|
| 113 |
+
common.append("--apply-priors")
|
| 114 |
+
else:
|
| 115 |
+
common.append("--no-apply-priors")
|
| 116 |
+
if phase == "phase1":
|
| 117 |
+
return [
|
| 118 |
+
*common,
|
| 119 |
+
"--trials",
|
| 120 |
+
str(base.phase1_trials),
|
| 121 |
+
"--trial-time-budget",
|
| 122 |
+
str(base.phase1_trial_time_budget),
|
| 123 |
+
"--trial-timeout",
|
| 124 |
+
str(base.phase1_trial_timeout),
|
| 125 |
+
"--n-startup-trials",
|
| 126 |
+
str(base.phase1_n_startup),
|
| 127 |
+
"--n-warmup-steps",
|
| 128 |
+
str(base.phase1_n_warmup),
|
| 129 |
+
"--patience-trials",
|
| 130 |
+
str(base.phase1_patience),
|
| 131 |
+
"--min-improvement",
|
| 132 |
+
str(base.phase1_min_improvement),
|
| 133 |
+
]
|
| 134 |
+
if phase == "phase2":
|
| 135 |
+
return [
|
| 136 |
+
*common,
|
| 137 |
+
"--trials",
|
| 138 |
+
str(base.phase2_trials),
|
| 139 |
+
"--trial-time-budget",
|
| 140 |
+
str(base.phase2_trial_time_budget),
|
| 141 |
+
"--trial-timeout",
|
| 142 |
+
str(base.phase2_trial_timeout),
|
| 143 |
+
"--n-startup-trials",
|
| 144 |
+
str(base.phase2_n_startup),
|
| 145 |
+
"--n-warmup-steps",
|
| 146 |
+
str(base.phase2_n_warmup),
|
| 147 |
+
"--patience-trials",
|
| 148 |
+
str(base.phase2_patience),
|
| 149 |
+
"--min-improvement",
|
| 150 |
+
str(base.phase2_min_improvement),
|
| 151 |
+
]
|
| 152 |
+
raise ValueError(f"Unknown phase: {phase}")
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def cmd_phase(args: argparse.Namespace) -> int:
    """Run one phase with a single worker, then record the study summary.

    The summary JSON is written to ``args.summary_out`` and echoed to stdout.
    The worker's exit status is returned unchanged.
    """
    exit_status = _run_worker(_phase_args(args.phase, args))
    report = json.dumps(
        {"phase": args.phase, "stats": _study_stats(args.storage, args.study_name)},
        indent=2,
    )
    out_path = args.summary_out
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(report, encoding="utf-8")
    print(report)
    return exit_status
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def cmd_parallel(args: argparse.Namespace) -> int:
    """Run one phase with ``args.workers`` concurrent HPO worker processes.

    All workers receive the same argument list. After every worker has
    exited, the study summary, together with the per-worker exit codes, is
    written to ``args.summary_out`` and printed.

    Returns:
        0 when every worker exited with status 0, otherwise 1.
    """
    worker_cmd = [sys.executable, str(HPO_SCRIPT), *_phase_args(args.phase, args)]
    running = [
        subprocess.Popen(worker_cmd, cwd=str(REPO_ROOT), text=True)
        for _ in range(args.workers)
    ]
    # Everything is launched before the first wait, so the workers overlap.
    exit_codes = [proc.wait() for proc in running]

    report = json.dumps(
        {
            "phase": args.phase,
            "workers": args.workers,
            "exit_codes": exit_codes,
            "stats": _study_stats(args.storage, args.study_name),
        },
        indent=2,
    )
    args.summary_out.parent.mkdir(parents=True, exist_ok=True)
    args.summary_out.write_text(report, encoding="utf-8")
    print(report)
    return 1 if any(code != 0 for code in exit_codes) else 0
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def cmd_recommend(args: argparse.Namespace) -> int:
    """Write and print a next-step recommendation based on the study's state.

    There are three outcomes: the study does not exist yet, it has fewer than
    10 completed trials, or it is ready for the full phase-2 run. The payload
    goes to ``args.summary_out`` and stdout. Always returns 0.
    """

    def _publish(payload: dict[str, Any]) -> int:
        # Persist the payload, echo it, and give back the command's exit code.
        rendered = json.dumps(payload, indent=2)
        args.summary_out.parent.mkdir(parents=True, exist_ok=True)
        args.summary_out.write_text(rendered, encoding="utf-8")
        print(rendered)
        return 0

    def _export_cmd(top_k: int) -> str:
        # Ready-to-paste command that exports priors for this study.
        return (
            f"python scripts/export_hpo_priors.py --storage {args.storage} "
            f"--study-name {args.study_name} --top-k {top_k} --out docs/hpo_transfer_priors.json"
        )

    stats = _study_stats(args.storage, args.study_name)
    min_tps_floor = float(args.min_tps)

    if stats.get("status") == "missing":
        return _publish(
            {
                "stats": stats,
                "recommendation": {
                    "status": "create_study_first",
                    "next_step": "Run phase1 (serial or parallel) to create and populate the study.",
                    "example": f"python scripts/hpo_orchestrator.py parallel --phase phase1 --workers 3 --storage {args.storage} --study-name {args.study_name}",
                },
            }
        )

    if int(stats.get("n_completed", 0)) < 10:
        recommendation = {
            "status": "insufficient_data",
            "next_step": "Run phase1 with 2-4 parallel workers until >=10 completed trials.",
            "early_stop_policy": {
                "patience_trials": 8,
                "min_improvement": 0.001,
            },
            "throughput_guard": {
                "min_tps": min_tps_floor,
                "note": "Trials below this TPS floor are pruned.",
            },
            "transfer_learning": {
                "export_priors": _export_cmd(10),
                "use_priors": "Enabled by default in scripts/optuna_hpo.py (override with --no-apply-priors)",
            },
        }
    else:
        recommendation = {
            "status": "ready_for_full_optimization",
            "next_step": "Run phase2 with 3-4 parallel workers.",
            "suggested_full_run": {
                "trials": 60,
                "workers": 4,
                "trial_time_budget": 300,
                "trial_timeout": 900,
                "min_tps": min_tps_floor,
                "patience_trials": 12,
                "min_improvement": 0.0005,
            },
            "transfer_learning": {
                "refresh_priors": _export_cmd(20),
                "notes": "Carry priors into new studies unless architecture/objective diverges significantly.",
            },
        }

    return _publish({"stats": stats, "recommendation": recommendation})
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def build_parser() -> argparse.ArgumentParser:
    """Create the orchestrator CLI with its three sub-commands.

    ``phase``, ``parallel`` and ``recommend`` share one option set. The
    Hugging Face defaults come from ``resolve_routing``, called with the
    ``HF_TOKEN`` environment variable. Each sub-parser stores its handler
    function under ``func``.
    """
    routing = resolve_routing(token=os.environ.get("HF_TOKEN"))
    root = argparse.ArgumentParser(description="Phase-oriented orchestration for Optuna HPO")
    commands = root.add_subparsers(dest="cmd", required=True)
    phase_choices = ["phase1", "phase2"]

    # (flag suffix, value type, phase-1 default, phase-2 default)
    phase_knobs = (
        ("trials", int, 30, 60),
        ("trial-time-budget", int, 180, 300),
        ("trial-timeout", int, 600, 900),
        ("n-startup", int, 5, 8),
        ("n-warmup", int, 0, 0),
        ("patience", int, 8, 12),
        ("min-improvement", float, 0.001, 0.0005),
    )

    def add_common(p: argparse.ArgumentParser) -> None:
        add = p.add_argument
        add("--study-name", default="hydra_hpo")
        add("--storage", default="sqlite:///optuna_hpo.db")
        add("--metric", default="val_bpb")
        add("--direction", choices=["minimize", "maximize"], default="minimize")
        add("--seed", type=int, default=42)
        add("--min-tps", type=float, default=50000.0)
        add("--summary-out", type=Path, default=REPO_ROOT / ".tmp" / "optuna" / "orchestrator_summary.json")
        add("--runner", choices=["local", "hf-job", "hf-launcher"], default="local")
        add("--hf-namespace", default=routing.job_namespace)
        add("--hf-image", default=f"hf.co/spaces/{routing.space_repo}")
        add("--hf-flavor", default="a10g-large")
        add("--hf-timeout", default="25m")
        add("--hf-command", default="/app/entrypoint.py")
        add("--hf-use-bash", action="store_true")
        add("--hf-token-env", default="HF_TOKEN")
        add("--hf-poll-interval", type=int, default=12)
        add("--hf-launcher-script", type=Path, default=REPO_ROOT / "scripts" / "launch_feather_hf_job.py")
        add("--hf-output-repo", default=routing.output_repo)
        add("--priors-file", type=Path, default=REPO_ROOT / "docs" / "hpo_transfer_priors.json")
        # On-by-default switches, each paired with an explicit "--no-" form.
        add("--apply-priors", action="store_true", default=True)
        add("--no-apply-priors", action="store_false", dest="apply_priors")
        add("--hf-stop-after-metric", action="store_true", default=True)
        add("--no-hf-stop-after-metric", action="store_false", dest="hf_stop_after_metric")

        # Per-phase defaults: all phase-1 options first, then all phase-2.
        for column, phase in enumerate(("phase1", "phase2"), start=2):
            for knob in phase_knobs:
                add(f"--{phase}-{knob[0]}", type=knob[1], default=knob[column])

    serial = commands.add_parser("phase", help="Run a single phase serially")
    add_common(serial)
    serial.add_argument("--phase", choices=phase_choices, required=True)
    serial.set_defaults(func=cmd_phase)

    fanout = commands.add_parser("parallel", help="Run a phase with N parallel workers")
    add_common(fanout)
    fanout.add_argument("--phase", choices=phase_choices, required=True)
    fanout.add_argument("--workers", type=int, default=3)
    fanout.set_defaults(func=cmd_parallel)

    advisor = commands.add_parser("recommend", help="Recommend full-run settings from current study")
    add_common(advisor)
    advisor.set_defaults(func=cmd_recommend)
    return root
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def main() -> int:
    """Parse the command line and run the selected sub-command's handler."""
    namespace = build_parser().parse_args()
    handler = namespace.func
    return int(handler(namespace))
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
if __name__ == "__main__":
|
| 319 |
+
raise SystemExit(main())
|
overlay/scripts/launch_feather_hf_job.py
CHANGED
|
@@ -2,37 +2,104 @@
|
|
| 2 |
from __future__ import annotations
|
| 3 |
|
| 4 |
import os
|
|
|
|
| 5 |
import sys
|
| 6 |
import time
|
|
|
|
|
|
|
| 7 |
from pathlib import Path
|
| 8 |
|
| 9 |
from huggingface_hub import HfApi
|
| 10 |
-
from huggingface_hub._space_api import SpaceHardware
|
| 11 |
-
from huggingface_hub.errors import HfHubHTTPError
|
| 12 |
|
| 13 |
-
|
| 14 |
-
REPO_ROOT = Path(__file__).resolve().parents[4]
|
| 15 |
if str(REPO_ROOT) not in sys.path:
|
| 16 |
sys.path.insert(0, str(REPO_ROOT))
|
| 17 |
|
| 18 |
from scripts.hf_routing import resolve_routing
|
|
|
|
| 19 |
|
| 20 |
DEFAULT_IMAGE = os.environ.get('FEATHER_HF_IMAGE', 'ghcr.io/slapglif/feather-hf-runtime:latest')
|
| 21 |
-
IMAGE_DIR =
|
| 22 |
TIMEOUT = os.environ.get('FEATHER_HF_JOB_TIMEOUT', '12h')
|
| 23 |
-
FLAVOR_RAW = os.environ.get('FEATHER_HF_FLAVOR', 'h200')
|
| 24 |
TARGET_SHARDS = os.environ.get('HYDRA_TARGET_SHARDS', '2048')
|
| 25 |
TIME_BUDGET = os.environ.get('HYDRA_TIME_BUDGET', '43200')
|
| 26 |
DOWNLOAD_WORKERS = os.environ.get('HYDRA_DOWNLOAD_WORKERS', '16')
|
| 27 |
CKPT_INTERVAL = os.environ.get('HYDRA_CKPT_INTERVAL', '1000')
|
|
|
|
| 28 |
DRY_RUN = os.environ.get('FEATHER_HF_DRY_RUN', '0') == '1'
|
| 29 |
USE_SPACE_IMAGE = os.environ.get('FEATHER_HF_USE_SPACE_IMAGE', '0') == '1'
|
| 30 |
# When true, assume the Space image has already been built by a previous
|
| 31 |
# invocation and skip the upload+build wait. Used by sweep drivers that fan
|
| 32 |
# out many jobs against a single pre-uploaded image.
|
| 33 |
SKIP_UPLOAD = os.environ.get('FEATHER_HF_SKIP_UPLOAD', '0') == '1'
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
|
| 38 |
def require_token() -> str:
|
|
@@ -59,115 +126,52 @@ def wait_for_space(api: HfApi, repo_id: str, token: str, timeout_s: int = 1800)
|
|
| 59 |
"""
|
| 60 |
start = time.time()
|
| 61 |
seen_build_completion = False
|
|
|
|
| 62 |
while True:
|
| 63 |
-
|
| 64 |
-
runtime = api.get_space_runtime(repo_id, token=token)
|
| 65 |
-
except HfHubHTTPError as exc:
|
| 66 |
-
code = getattr(getattr(exc, 'response', None), 'status_code', None)
|
| 67 |
-
if isinstance(code, int) and code >= 500:
|
| 68 |
-
if time.time() - start > timeout_s:
|
| 69 |
-
raise TimeoutError(
|
| 70 |
-
f'Space {repo_id} runtime endpoint unstable for {timeout_s}s '
|
| 71 |
-
f'(last HTTP {code})'
|
| 72 |
-
) from exc
|
| 73 |
-
print(f'[space] runtime endpoint HTTP {code}; retrying...', flush=True)
|
| 74 |
-
time.sleep(20)
|
| 75 |
-
continue
|
| 76 |
-
raise
|
| 77 |
stage = getattr(runtime, 'stage', None)
|
| 78 |
-
hardware = getattr(runtime, 'hardware', None)
|
| 79 |
-
err = getattr(runtime, 'errorMessage', None) or getattr(runtime, 'error_message', None)
|
| 80 |
-
print(f'[space] stage={stage} hardware={hardware}', flush=True)
|
| 81 |
-
if stage
|
| 82 |
-
|
| 83 |
-
if stage in {'RUNNING', 'PAUSED', 'SLEEPING'}:
|
| 84 |
-
|
|
|
|
|
|
|
| 85 |
# Image is built — Jobs can use it regardless of Space boot outcome.
|
| 86 |
-
if seen_build_completion and stage in {'RUNTIME_ERROR', 'APP_STARTING_ERROR'}:
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
print(msg, flush=True)
|
| 92 |
-
return
|
| 93 |
# Hard build failures — no image was produced.
|
| 94 |
if stage in {'BUILD_ERROR', 'CONFIG_ERROR', 'NO_APP_FILE'}:
|
| 95 |
raise RuntimeError(f'Space {repo_id} build failed: stage={stage} error={err!r}')
|
| 96 |
if time.time() - start > timeout_s:
|
| 97 |
raise TimeoutError(f'Space {repo_id} did not become ready in {timeout_s}s (last stage={stage})')
|
| 98 |
-
time.sleep(20)
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
def submit_job_with_retry(
|
| 102 |
-
api: HfApi,
|
| 103 |
-
*,
|
| 104 |
-
image: str,
|
| 105 |
-
command: list[str],
|
| 106 |
-
env: dict[str, str],
|
| 107 |
-
secrets: dict[str, str],
|
| 108 |
-
flavor: SpaceHardware,
|
| 109 |
-
timeout: str,
|
| 110 |
-
token: str,
|
| 111 |
-
namespace: str,
|
| 112 |
-
):
|
| 113 |
-
last_exc: Exception | None = None
|
| 114 |
-
for attempt in range(1, JOB_SUBMIT_RETRIES + 1):
|
| 115 |
-
try:
|
| 116 |
-
return api.run_job(
|
| 117 |
-
image=image,
|
| 118 |
-
command=command,
|
| 119 |
-
env=env,
|
| 120 |
-
secrets=secrets,
|
| 121 |
-
flavor=flavor,
|
| 122 |
-
timeout=timeout,
|
| 123 |
-
token=token,
|
| 124 |
-
namespace=namespace,
|
| 125 |
-
)
|
| 126 |
-
except HfHubHTTPError as exc:
|
| 127 |
-
last_exc = exc
|
| 128 |
-
code = getattr(getattr(exc, 'response', None), 'status_code', None)
|
| 129 |
-
if not (isinstance(code, int) and code >= 500):
|
| 130 |
-
raise
|
| 131 |
-
if attempt >= JOB_SUBMIT_RETRIES:
|
| 132 |
-
raise SystemExit(
|
| 133 |
-
f'HF Jobs backend returned HTTP {code} after {JOB_SUBMIT_RETRIES} '
|
| 134 |
-
'submit attempts; failing fast.'
|
| 135 |
-
) from exc
|
| 136 |
-
wait_s = JOB_SUBMIT_RETRY_BASE_S * attempt
|
| 137 |
-
print(
|
| 138 |
-
f'[launch] HF Jobs backend returned HTTP {code}; retrying submit in '
|
| 139 |
-
f'{wait_s:.1f}s (attempt {attempt}/{JOB_SUBMIT_RETRIES})',
|
| 140 |
-
flush=True,
|
| 141 |
-
)
|
| 142 |
-
time.sleep(wait_s)
|
| 143 |
-
|
| 144 |
-
if last_exc is not None:
|
| 145 |
-
raise last_exc
|
| 146 |
-
raise RuntimeError('submit_job_with_retry exhausted without a result')
|
| 147 |
|
| 148 |
|
| 149 |
def main() -> int:
|
| 150 |
token = require_token()
|
| 151 |
routing = resolve_routing(token=token)
|
| 152 |
api = HfApi(token=token)
|
|
|
|
| 153 |
|
| 154 |
print(f'[launch] image_dir={IMAGE_DIR}', flush=True)
|
| 155 |
print(f'[launch] owner={routing.owner}', flush=True)
|
| 156 |
print(f'[launch] space_repo={routing.space_repo}', flush=True)
|
| 157 |
print(f'[launch] output_repo={routing.output_repo}', flush=True)
|
| 158 |
print(f'[launch] retina_cache_repo={routing.retina_cache_repo}', flush=True)
|
|
|
|
|
|
|
| 159 |
print(f'[launch] namespace={routing.job_namespace}', flush=True)
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
print(f'[launch] target_shards={TARGET_SHARDS} time_budget={TIME_BUDGET} timeout={TIMEOUT} flavor={flavor.value}', flush=True)
|
| 167 |
-
print(f'[launch] image_mode={"space" if USE_SPACE_IMAGE else "ghcr"}', flush=True)
|
| 168 |
-
if not USE_SPACE_IMAGE:
|
| 169 |
-
print(f'[launch] image={DEFAULT_IMAGE}', flush=True)
|
| 170 |
-
|
| 171 |
api.create_repo(repo_id=routing.space_repo, repo_type='space', space_sdk='docker', private=True, exist_ok=True, token=token)
|
| 172 |
api.create_repo(repo_id=routing.output_repo, repo_type='model', private=True, exist_ok=True, token=token)
|
| 173 |
|
|
@@ -175,17 +179,19 @@ def main() -> int:
|
|
| 175 |
print('[launch] dry-run mode; skipping upload and job submission', flush=True)
|
| 176 |
return 0
|
| 177 |
|
| 178 |
-
image_ref = DEFAULT_IMAGE
|
| 179 |
-
if USE_SPACE_IMAGE:
|
| 180 |
-
if SKIP_UPLOAD:
|
| 181 |
-
print('[launch] FEATHER_HF_SKIP_UPLOAD=1; reusing existing Space image', flush=True)
|
| 182 |
-
else:
|
| 183 |
-
|
|
|
|
|
|
|
| 184 |
api.upload_folder(
|
| 185 |
repo_id=routing.space_repo,
|
| 186 |
repo_type='space',
|
| 187 |
folder_path=str(IMAGE_DIR),
|
| 188 |
-
commit_message='Update Feather
|
| 189 |
token=token,
|
| 190 |
)
|
| 191 |
|
|
@@ -205,8 +211,38 @@ def main() -> int:
|
|
| 205 |
'HYDRA_DOWNLOAD_WORKERS': DOWNLOAD_WORKERS,
|
| 206 |
'HYDRA_CKPT_INTERVAL': CKPT_INTERVAL,
|
| 207 |
'PYTHONUNBUFFERED': '1',
|
| 208 |
-
'FEATHER_RUNTIME_MODE': 'job',
|
| 209 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
# Pass through any HYDRA_* / FEATHER_* overrides from the caller's env so
|
| 211 |
# sweep drivers can set HYDRA_N_LAYER, HYDRA_SDR_TARGET_ACTIVE,
|
| 212 |
# HYDRA_LAYER_DIAGNOSTICS, HYDRA_METRICS_OUT, HYDRA_MID_VAL_INTERVAL, etc.
|
|
@@ -216,17 +252,16 @@ def main() -> int:
|
|
| 216 |
env[_k] = _v
|
| 217 |
secrets = {'HF_TOKEN': token}
|
| 218 |
|
| 219 |
-
print('[launch] submitting HF Job...', flush=True)
|
| 220 |
-
job =
|
| 221 |
-
api,
|
| 222 |
image=image_ref,
|
| 223 |
command=['python', '/app/entrypoint.py'],
|
| 224 |
env=env,
|
| 225 |
secrets=secrets,
|
| 226 |
-
flavor=
|
| 227 |
timeout=TIMEOUT,
|
| 228 |
-
token=token,
|
| 229 |
namespace=routing.job_namespace,
|
|
|
|
| 230 |
)
|
| 231 |
print(f'[launch] submitted job_id={job.id} status={job.status.stage} url={job.url}', flush=True)
|
| 232 |
return 0
|
|
|
|
| 2 |
from __future__ import annotations
|
| 3 |
|
| 4 |
import os
|
| 5 |
+
import shutil
|
| 6 |
import sys
|
| 7 |
import time
|
| 8 |
+
import json
|
| 9 |
+
from typing import Any, cast
|
| 10 |
from pathlib import Path
|
| 11 |
|
| 12 |
from huggingface_hub import HfApi
|
|
|
|
|
|
|
| 13 |
|
| 14 |
+
REPO_ROOT = Path(__file__).resolve().parents[1]
|
|
|
|
| 15 |
if str(REPO_ROOT) not in sys.path:
|
| 16 |
sys.path.insert(0, str(REPO_ROOT))
|
| 17 |
|
| 18 |
from scripts.hf_routing import resolve_routing
|
| 19 |
+
from configs.harness_config import HarnessConfig
|
| 20 |
|
| 21 |
DEFAULT_IMAGE = os.environ.get('FEATHER_HF_IMAGE', 'ghcr.io/slapglif/feather-hf-runtime:latest')
|
| 22 |
+
IMAGE_DIR = Path(__file__).resolve().parents[1] / 'hf_jobs' / 'feather_h200_image'
|
| 23 |
TIMEOUT = os.environ.get('FEATHER_HF_JOB_TIMEOUT', '12h')
|
|
|
|
| 24 |
TARGET_SHARDS = os.environ.get('HYDRA_TARGET_SHARDS', '2048')
|
| 25 |
TIME_BUDGET = os.environ.get('HYDRA_TIME_BUDGET', '43200')
|
| 26 |
DOWNLOAD_WORKERS = os.environ.get('HYDRA_DOWNLOAD_WORKERS', '16')
|
| 27 |
CKPT_INTERVAL = os.environ.get('HYDRA_CKPT_INTERVAL', '1000')
|
| 28 |
+
JOB_FLAVOR = os.environ.get('FEATHER_HF_FLAVOR', 'a10g-small')
|
| 29 |
DRY_RUN = os.environ.get('FEATHER_HF_DRY_RUN', '0') == '1'
|
| 30 |
USE_SPACE_IMAGE = os.environ.get('FEATHER_HF_USE_SPACE_IMAGE', '0') == '1'
|
| 31 |
# When true, assume the Space image has already been built by a previous
|
| 32 |
# invocation and skip the upload+build wait. Used by sweep drivers that fan
|
| 33 |
# out many jobs against a single pre-uploaded image.
|
| 34 |
SKIP_UPLOAD = os.environ.get('FEATHER_HF_SKIP_UPLOAD', '0') == '1'
|
| 35 |
+
SYNC_OVERLAY = os.environ.get('FEATHER_HF_SYNC_OVERLAY', '1') == '1'
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def should_enable_fast_start_streaming(target_shards: str, time_budget: str) -> bool:
    """Tell whether a launch profile is small enough for the streaming data path.

    Both arguments are the string forms of integers. The result is ``True``
    only when the shard count lies in ``1..256`` and the time budget lies in
    ``1..1800``. Any value that ``int()`` cannot parse yields ``False``.
    """
    try:
        shard_count, budget_value = int(target_shards), int(time_budget)
    except ValueError:
        # Unparsable input never qualifies for the fast-start profile.
        return False
    return 0 < shard_count <= 256 and 0 < budget_value <= 1800
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def sync_overlay_from_repo() -> None:
    """Rebuild ``IMAGE_DIR / 'overlay'`` from the project files listed below.

    The overlay directory is emptied first, then every listed path that
    exists under ``REPO_ROOT`` is copied in (directories recursively, minus
    cache/build artefacts). Finally, every ``*.sh`` under the copied
    ``scripts`` directory has its line endings rewritten to LF.
    """
    overlay_root = IMAGE_DIR / 'overlay'
    overlay_root.mkdir(parents=True, exist_ok=True)

    # Empty the overlay so entries no longer present in the repo do not linger.
    for stale in overlay_root.iterdir():
        if stale.is_dir():
            shutil.rmtree(stale)
        else:
            stale.unlink()

    wanted = (
        'hydra',
        'subsystems',
        'scripts',
        'htm_rust',
        'harness',
        'configs',
        'prepare.py',
        'prepare_nemotron.py',
        'train.py',
        'pyproject.toml',
        'uv.lock',
    )
    skip_artifacts = shutil.ignore_patterns(
        '__pycache__',
        '.pytest_cache',
        '.ruff_cache',
        '.venv',
        '.git',
        'target',
        '*.pyc',
    )

    synced: list[str] = []
    for rel in wanted:
        origin = REPO_ROOT / rel
        if not origin.exists():
            # Missing entries are skipped silently and not reported as synced.
            continue
        target = overlay_root / rel
        if origin.is_dir():
            shutil.copytree(origin, target, dirs_exist_ok=True, ignore=skip_artifacts)
        else:
            target.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(origin, target)
        synced.append(rel)

    # Rewrite CRLF and lone CR to LF in the copied shell scripts.
    shell_root = overlay_root / 'scripts'
    if shell_root.exists():
        for script in shell_root.rglob('*.sh'):
            raw = script.read_bytes()
            script.write_bytes(raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n'))

    print(f'[launch] overlay synced from repo ({len(synced)} paths): {synced}', flush=True)
|
| 103 |
|
| 104 |
|
| 105 |
def require_token() -> str:
|
|
|
|
| 126 |
"""
|
| 127 |
start = time.time()
|
| 128 |
seen_build_completion = False
|
| 129 |
+
seen_building = False
|
| 130 |
while True:
|
| 131 |
+
runtime = api.get_space_runtime(repo_id, token=token)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
stage = getattr(runtime, 'stage', None)
|
| 133 |
+
hardware = getattr(runtime, 'hardware', None)
|
| 134 |
+
err = getattr(runtime, 'errorMessage', None) or getattr(runtime, 'error_message', None)
|
| 135 |
+
print(f'[space] stage={stage} hardware={hardware}', flush=True)
|
| 136 |
+
if stage == 'BUILDING':
|
| 137 |
+
seen_building = True
|
| 138 |
+
if stage in {'APP_STARTING', 'RUNNING', 'PAUSED', 'SLEEPING'}:
|
| 139 |
+
seen_build_completion = True
|
| 140 |
+
if stage in {'RUNNING', 'PAUSED', 'SLEEPING'}:
|
| 141 |
+
return
|
| 142 |
# Image is built — Jobs can use it regardless of Space boot outcome.
|
| 143 |
+
if (seen_build_completion or seen_building) and stage in {'RUNTIME_ERROR', 'APP_STARTING_ERROR'}:
|
| 144 |
+
print(f'[space] Space boot failed with {stage} but built image is '
|
| 145 |
+
f'available in the Space registry and is usable by HF Jobs.',
|
| 146 |
+
flush=True)
|
| 147 |
+
return
|
|
|
|
|
|
|
| 148 |
# Hard build failures — no image was produced.
|
| 149 |
if stage in {'BUILD_ERROR', 'CONFIG_ERROR', 'NO_APP_FILE'}:
|
| 150 |
raise RuntimeError(f'Space {repo_id} build failed: stage={stage} error={err!r}')
|
| 151 |
if time.time() - start > timeout_s:
|
| 152 |
raise TimeoutError(f'Space {repo_id} did not become ready in {timeout_s}s (last stage={stage})')
|
| 153 |
+
time.sleep(20)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
|
| 156 |
def main() -> int:
|
| 157 |
token = require_token()
|
| 158 |
routing = resolve_routing(token=token)
|
| 159 |
api = HfApi(token=token)
|
| 160 |
+
secondary_gates = HarnessConfig().to_secondary_gates()
|
| 161 |
|
| 162 |
print(f'[launch] image_dir={IMAGE_DIR}', flush=True)
|
| 163 |
print(f'[launch] owner={routing.owner}', flush=True)
|
| 164 |
print(f'[launch] space_repo={routing.space_repo}', flush=True)
|
| 165 |
print(f'[launch] output_repo={routing.output_repo}', flush=True)
|
| 166 |
print(f'[launch] retina_cache_repo={routing.retina_cache_repo}', flush=True)
|
| 167 |
+
print(f'[launch] target_shards={TARGET_SHARDS} time_budget={TIME_BUDGET} timeout={TIMEOUT}', flush=True)
|
| 168 |
+
print(f'[launch] flavor={JOB_FLAVOR}', flush=True)
|
| 169 |
print(f'[launch] namespace={routing.job_namespace}', flush=True)
|
| 170 |
+
print(f'[launch] image_mode={"space" if USE_SPACE_IMAGE else "ghcr"}', flush=True)
|
| 171 |
+
print(f'[launch] secondary_gates={json.dumps(secondary_gates, sort_keys=True)}', flush=True)
|
| 172 |
+
if not USE_SPACE_IMAGE:
|
| 173 |
+
print(f'[launch] image={DEFAULT_IMAGE}', flush=True)
|
| 174 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
api.create_repo(repo_id=routing.space_repo, repo_type='space', space_sdk='docker', private=True, exist_ok=True, token=token)
|
| 176 |
api.create_repo(repo_id=routing.output_repo, repo_type='model', private=True, exist_ok=True, token=token)
|
| 177 |
|
|
|
|
| 179 |
print('[launch] dry-run mode; skipping upload and job submission', flush=True)
|
| 180 |
return 0
|
| 181 |
|
| 182 |
+
image_ref = DEFAULT_IMAGE
|
| 183 |
+
if USE_SPACE_IMAGE:
|
| 184 |
+
if SKIP_UPLOAD:
|
| 185 |
+
print('[launch] FEATHER_HF_SKIP_UPLOAD=1; reusing existing Space image', flush=True)
|
| 186 |
+
else:
|
| 187 |
+
if SYNC_OVERLAY:
|
| 188 |
+
sync_overlay_from_repo()
|
| 189 |
+
print('[launch] uploading custom Docker Space image context...', flush=True)
|
| 190 |
api.upload_folder(
|
| 191 |
repo_id=routing.space_repo,
|
| 192 |
repo_type='space',
|
| 193 |
folder_path=str(IMAGE_DIR),
|
| 194 |
+
commit_message='Update Feather training runtime image',
|
| 195 |
token=token,
|
| 196 |
)
|
| 197 |
|
|
|
|
| 211 |
'HYDRA_DOWNLOAD_WORKERS': DOWNLOAD_WORKERS,
|
| 212 |
'HYDRA_CKPT_INTERVAL': CKPT_INTERVAL,
|
| 213 |
'PYTHONUNBUFFERED': '1',
|
| 214 |
+
'FEATHER_RUNTIME_MODE': 'job',
|
| 215 |
+
}
|
| 216 |
+
if 'HYDRA_USE_NEMOTRON' not in os.environ and should_enable_fast_start_streaming(TARGET_SHARDS, TIME_BUDGET):
|
| 217 |
+
env['HYDRA_USE_NEMOTRON'] = '1'
|
| 218 |
+
print('[launch] auto-enabled HYDRA_USE_NEMOTRON=1 for short-budget fast-start profile', flush=True)
|
| 219 |
+
# A10 compatibility profile: avoid known PTX/compile runtime pitfalls and
|
| 220 |
+
# keep throughput path enabled. Caller can explicitly override each key by
|
| 221 |
+
# setting it in the parent environment.
|
| 222 |
+
if JOB_FLAVOR.startswith('a10'):
|
| 223 |
+
_a10_defaults = {
|
| 224 |
+
'HYDRA_MUON_COMPILE': '0',
|
| 225 |
+
'HYDRA_FORCE_HTM_CPU': '1',
|
| 226 |
+
'HYDRA_INERT_MAMBA': '1',
|
| 227 |
+
'HYDRA_ALLOW_SYNTHETIC_RETINA': '1',
|
| 228 |
+
'HYDRA_FASTPATH': '1',
|
| 229 |
+
}
|
| 230 |
+
for _k, _default in _a10_defaults.items():
|
| 231 |
+
if _k in os.environ:
|
| 232 |
+
env[_k] = os.environ[_k]
|
| 233 |
+
else:
|
| 234 |
+
env.setdefault(_k, _default)
|
| 235 |
+
if env.get('HYDRA_INERT_MAMBA') == '0' and 'HYDRA_FASTPATH' not in os.environ:
|
| 236 |
+
env['HYDRA_FASTPATH'] = '0'
|
| 237 |
+
print(
|
| 238 |
+
'[launch] applied A10 env profile '
|
| 239 |
+
f"(HYDRA_MUON_COMPILE={env['HYDRA_MUON_COMPILE']}, "
|
| 240 |
+
f"HYDRA_FORCE_HTM_CPU={env['HYDRA_FORCE_HTM_CPU']}, "
|
| 241 |
+
f"HYDRA_INERT_MAMBA={env['HYDRA_INERT_MAMBA']}, "
|
| 242 |
+
f"HYDRA_ALLOW_SYNTHETIC_RETINA={env['HYDRA_ALLOW_SYNTHETIC_RETINA']}, "
|
| 243 |
+
f"HYDRA_FASTPATH={env['HYDRA_FASTPATH']})",
|
| 244 |
+
flush=True,
|
| 245 |
+
)
|
| 246 |
# Pass through any HYDRA_* / FEATHER_* overrides from the caller's env so
|
| 247 |
# sweep drivers can set HYDRA_N_LAYER, HYDRA_SDR_TARGET_ACTIVE,
|
| 248 |
# HYDRA_LAYER_DIAGNOSTICS, HYDRA_METRICS_OUT, HYDRA_MID_VAL_INTERVAL, etc.
|
|
|
|
| 252 |
env[_k] = _v
|
| 253 |
secrets = {'HF_TOKEN': token}
|
| 254 |
|
| 255 |
+
print(f'[launch] submitting HF Job on flavor={JOB_FLAVOR}...', flush=True)
|
| 256 |
+
job = api.run_job(
|
|
|
|
| 257 |
image=image_ref,
|
| 258 |
command=['python', '/app/entrypoint.py'],
|
| 259 |
env=env,
|
| 260 |
secrets=secrets,
|
| 261 |
+
flavor=cast(Any, JOB_FLAVOR),
|
| 262 |
timeout=TIMEOUT,
|
|
|
|
| 263 |
namespace=routing.job_namespace,
|
| 264 |
+
token=token,
|
| 265 |
)
|
| 266 |
print(f'[launch] submitted job_id={job.id} status={job.status.stage} url={job.url}', flush=True)
|
| 267 |
return 0
|
overlay/scripts/long_train.sh
CHANGED
|
@@ -1,38 +1,38 @@
|
|
| 1 |
-
#!/usr/bin/env bash
|
| 2 |
-
# Long-training run for full-architecture completion attempt.
|
| 3 |
-
#
|
| 4 |
-
# The 5-minute autoresearch budget is for mutation screening — it's nowhere
|
| 5 |
-
# near enough compute for this small model (~6M params) to produce coherent
|
| 6 |
-
# English. This script runs the SAME full-architecture train.py with an
|
| 7 |
-
# extended budget so the "factual English" completion criterion can actually
|
| 8 |
-
# be tested end-to-end.
|
| 9 |
-
#
|
| 10 |
-
# Usage:
|
| 11 |
-
# ./scripts/long_train.sh # default 1-hour budget
|
| 12 |
-
# HYDRA_TIME_BUDGET=7200 ./scripts/long_train.sh # 2 hours
|
| 13 |
-
# HYDRA_D_MODEL=384 HYDRA_N_LAYER=6 ./scripts/long_train.sh # scale model
|
| 14 |
-
#
|
| 15 |
-
# Output: run_long_<timestamp>.log in repo root. Includes factual_english_score.
|
| 16 |
-
set -euo pipefail
|
| 17 |
-
|
| 18 |
-
cd "$(dirname "$0")/.."
|
| 19 |
-
|
| 20 |
-
TIME_BUDGET="${HYDRA_TIME_BUDGET:-3600}"
|
| 21 |
-
STAMP="$(date +%Y%m%d_%H%M%S)"
|
| 22 |
-
LOG="run_long_${STAMP}.log"
|
| 23 |
-
|
| 24 |
-
export HYDRA_TIME_BUDGET="${TIME_BUDGET}"
|
| 25 |
-
|
| 26 |
-
echo "=== HYDRA long-training run ==="
|
| 27 |
-
echo "time_budget: ${TIME_BUDGET}s ($((TIME_BUDGET / 60))m)"
|
| 28 |
-
echo "d_model: ${HYDRA_D_MODEL:-256 (default)}"
|
| 29 |
-
echo "n_layer: ${HYDRA_N_LAYER:-4 (default)}"
|
| 30 |
-
echo "d_state: ${HYDRA_D_STATE:-64 (default)}"
|
| 31 |
-
echo "log: ${LOG}"
|
| 32 |
-
echo
|
| 33 |
-
|
| 34 |
-
.venv/bin/python train.py 2>&1 | tee "${LOG}"
|
| 35 |
-
|
| 36 |
-
echo
|
| 37 |
-
echo "=== Summary ==="
|
| 38 |
-
grep -E "^val_bpb:|^factual_english_score:|^factual_english_hits:|^peak_vram_mb:|^num_steps:" "${LOG}"
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Long-training run for full-architecture completion attempt.
#
# The 5-minute autoresearch budget is for mutation screening — it's nowhere
# near enough compute for this small model (~6M params) to produce coherent
# English. This script runs the SAME full-architecture train.py with an
# extended budget so the "factual English" completion criterion can actually
# be tested end-to-end.
#
# Usage:
#   ./scripts/long_train.sh                 # default 1-hour budget
#   HYDRA_TIME_BUDGET=7200 ./scripts/long_train.sh    # 2 hours
#   HYDRA_D_MODEL=384 HYDRA_N_LAYER=6 ./scripts/long_train.sh   # scale model
#
# Output: run_long_<timestamp>.log in repo root. Includes factual_english_score.

# Abort on any command failure, unset variable, or failed pipeline stage.
set -euo pipefail

# Run from the parent of this script's directory so the relative paths below
# (.venv, train.py, the log file) resolve.
cd "$(dirname "$0")/.."

# Budget defaults to 3600 when HYDRA_TIME_BUDGET is unset or empty.
TIME_BUDGET="${HYDRA_TIME_BUDGET:-3600}"
STAMP="$(date +%Y%m%d_%H%M%S)"
LOG="run_long_${STAMP}.log"

# Export so the child python process inherits the (possibly defaulted) budget.
export HYDRA_TIME_BUDGET="${TIME_BUDGET}"

# Display only: these lines echo the caller's overrides or a "(default)" label;
# nothing here sets HYDRA_D_MODEL / HYDRA_N_LAYER / HYDRA_D_STATE.
echo "=== HYDRA long-training run ==="
echo "time_budget: ${TIME_BUDGET}s ($((TIME_BUDGET / 60))m)"
echo "d_model:     ${HYDRA_D_MODEL:-256 (default)}"
echo "n_layer:     ${HYDRA_N_LAYER:-4 (default)}"
echo "d_state:     ${HYDRA_D_STATE:-64 (default)}"
echo "log:         ${LOG}"
echo

# With pipefail, a non-zero exit from train.py fails the pipeline and, under
# `set -e`, stops the script before the summary below.
.venv/bin/python train.py 2>&1 | tee "${LOG}"

echo
echo "=== Summary ==="
# grep exits non-zero when no line matches; as the last command that becomes
# the script's exit status.
grep -E "^val_bpb:|^factual_english_score:|^factual_english_hits:|^peak_vram_mb:|^num_steps:" "${LOG}"
|
overlay/scripts/optuna_hpo.py
ADDED
|
@@ -0,0 +1,725 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
import subprocess
|
| 9 |
+
import sys
|
| 10 |
+
import time
|
| 11 |
+
import tempfile
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Any
|
| 14 |
+
|
| 15 |
+
import optuna
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
_HF_ENV_KEY_RE = re.compile(r"^[A-Z][A-Z0-9_]*$")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
REPO_ROOT = Path(__file__).resolve().parents[1]
|
| 22 |
+
if str(REPO_ROOT) not in sys.path:
|
| 23 |
+
sys.path.insert(0, str(REPO_ROOT))
|
| 24 |
+
|
| 25 |
+
from scripts.hf_routing import resolve_routing
|
| 26 |
+
|
| 27 |
+
TRAIN_ENTRYPOINT = REPO_ROOT / "train.py"
|
| 28 |
+
SEARCH_SPACE_KEYS = {
|
| 29 |
+
"d_model",
|
| 30 |
+
"n_layer",
|
| 31 |
+
"d_state",
|
| 32 |
+
"headdim",
|
| 33 |
+
"expand",
|
| 34 |
+
"seq_len",
|
| 35 |
+
"batch_size",
|
| 36 |
+
"grad_accum",
|
| 37 |
+
"matrix_lr",
|
| 38 |
+
"embed_lr",
|
| 39 |
+
"unembed_lr",
|
| 40 |
+
"engram_n_columns",
|
| 41 |
+
"sdr_target_active",
|
| 42 |
+
"hyena_layers",
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _filter_prior_params(raw: dict[str, Any]) -> dict[str, Any]:
|
| 47 |
+
return {k: v for k, v in raw.items() if k in SEARCH_SPACE_KEYS}
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _load_prior_param_sets(path: Path) -> list[dict[str, Any]]:
|
| 51 |
+
if not path.exists():
|
| 52 |
+
return []
|
| 53 |
+
|
| 54 |
+
payload = json.loads(path.read_text(encoding="utf-8"))
|
| 55 |
+
if isinstance(payload, dict):
|
| 56 |
+
rows = payload.get("trials", [])
|
| 57 |
+
elif isinstance(payload, list):
|
| 58 |
+
rows = payload
|
| 59 |
+
else:
|
| 60 |
+
rows = []
|
| 61 |
+
|
| 62 |
+
out: list[dict[str, Any]] = []
|
| 63 |
+
for item in rows:
|
| 64 |
+
if not isinstance(item, dict):
|
| 65 |
+
continue
|
| 66 |
+
params_obj = item.get("params", item)
|
| 67 |
+
if not isinstance(params_obj, dict):
|
| 68 |
+
continue
|
| 69 |
+
filtered = _filter_prior_params(params_obj)
|
| 70 |
+
if filtered:
|
| 71 |
+
out.append(filtered)
|
| 72 |
+
return out
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _enqueue_transfer_priors(study: optuna.Study, priors_file: Path, apply_priors: bool) -> int:
|
| 76 |
+
if not apply_priors:
|
| 77 |
+
return 0
|
| 78 |
+
|
| 79 |
+
priors_raw = _load_prior_param_sets(priors_file)
|
| 80 |
+
if not priors_raw:
|
| 81 |
+
return 0
|
| 82 |
+
|
| 83 |
+
# Deduplicate param sets across merged studies.
|
| 84 |
+
priors: list[dict[str, Any]] = []
|
| 85 |
+
seen: set[str] = set()
|
| 86 |
+
for params in priors_raw:
|
| 87 |
+
key = json.dumps(params, sort_keys=True)
|
| 88 |
+
if key in seen:
|
| 89 |
+
continue
|
| 90 |
+
seen.add(key)
|
| 91 |
+
priors.append(params)
|
| 92 |
+
|
| 93 |
+
enqueued = 0
|
| 94 |
+
for params in priors:
|
| 95 |
+
before = len(study.get_trials(deepcopy=False))
|
| 96 |
+
try:
|
| 97 |
+
study.enqueue_trial(params, user_attrs={"seed_source": "transfer_priors"}, skip_if_exists=True)
|
| 98 |
+
except TypeError:
|
| 99 |
+
study.enqueue_trial(params, user_attrs={"seed_source": "transfer_priors"})
|
| 100 |
+
after = len(study.get_trials(deepcopy=False))
|
| 101 |
+
if after > before:
|
| 102 |
+
enqueued += 1
|
| 103 |
+
return enqueued
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any] | None:
|
| 107 |
+
metrics_line: str | None = None
|
| 108 |
+
for line in stdout.splitlines():
|
| 109 |
+
if "[METRICS_JSON]" in line:
|
| 110 |
+
metrics_line = line
|
| 111 |
+
if not metrics_line:
|
| 112 |
+
return None
|
| 113 |
+
m = re.search(r"\[METRICS_JSON\]\s*(\{.*\})", metrics_line)
|
| 114 |
+
if not m:
|
| 115 |
+
return None
|
| 116 |
+
try:
|
| 117 |
+
return json.loads(m.group(1))
|
| 118 |
+
except json.JSONDecodeError:
|
| 119 |
+
return None
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def _parse_metrics_from_log_lines(lines: list[str]) -> dict[str, Any] | None:
|
| 123 |
+
metrics_line: str | None = None
|
| 124 |
+
for line in lines:
|
| 125 |
+
if "[METRICS_JSON]" in line:
|
| 126 |
+
metrics_line = line
|
| 127 |
+
if not metrics_line:
|
| 128 |
+
return None
|
| 129 |
+
m = re.search(r"\[METRICS_JSON\]\s*(\{.*\})", metrics_line)
|
| 130 |
+
if not m:
|
| 131 |
+
return None
|
| 132 |
+
try:
|
| 133 |
+
return json.loads(m.group(1))
|
| 134 |
+
except json.JSONDecodeError:
|
| 135 |
+
return None
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def _parse_last_train_bpb_from_logs(lines: list[str]) -> float | None:
|
| 139 |
+
"""Best-effort fallback when final eval crashes before metrics JSON write."""
|
| 140 |
+
last: float | None = None
|
| 141 |
+
for line in lines:
|
| 142 |
+
m = re.search(r"\bbpb=([0-9]+(?:\.[0-9]+)?)", line)
|
| 143 |
+
if m:
|
| 144 |
+
last = float(m.group(1))
|
| 145 |
+
return last
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def _trial_env(trial: optuna.Trial, args: argparse.Namespace, metrics_path: Path) -> dict[str, str]:
|
| 149 |
+
env = os.environ.copy()
|
| 150 |
+
|
| 151 |
+
# Runtime and reporting
|
| 152 |
+
env["HYDRA_METRICS_OUT"] = str(metrics_path)
|
| 153 |
+
env["HYDRA_TIME_BUDGET"] = str(args.trial_time_budget)
|
| 154 |
+
env["PYTHONUNBUFFERED"] = "1"
|
| 155 |
+
|
| 156 |
+
# Search space — fully env-driven to match existing training stack.
|
| 157 |
+
env["HYDRA_D_MODEL"] = str(trial.suggest_categorical("d_model", [64, 96, 128, 160, 192]))
|
| 158 |
+
env["HYDRA_N_LAYER"] = str(trial.suggest_int("n_layer", 1, 4))
|
| 159 |
+
env["HYDRA_D_STATE"] = str(trial.suggest_categorical("d_state", [16, 32, 48]))
|
| 160 |
+
env["HYDRA_HEADDIM"] = str(trial.suggest_categorical("headdim", [8, 16, 32]))
|
| 161 |
+
env["HYDRA_EXPAND"] = str(trial.suggest_categorical("expand", [1, 2]))
|
| 162 |
+
|
| 163 |
+
seq_len = trial.suggest_categorical("seq_len", [32, 64])
|
| 164 |
+
batch_size = trial.suggest_categorical("batch_size", [4, 8, 16])
|
| 165 |
+
grad_accum = trial.suggest_categorical("grad_accum", [8, 16, 32, 64])
|
| 166 |
+
# Keep TOTAL_BATCH_SIZE divisible by DEVICE_BATCH_SIZE * MAX_SEQ_LEN.
|
| 167 |
+
total_batch = batch_size * seq_len * grad_accum
|
| 168 |
+
env["HYDRA_SEQ_LEN"] = str(seq_len)
|
| 169 |
+
env["HYDRA_BATCH_SIZE"] = str(batch_size)
|
| 170 |
+
env["HYDRA_TOTAL_BATCH"] = str(total_batch)
|
| 171 |
+
|
| 172 |
+
env["HYDRA_MATRIX_LR"] = str(trial.suggest_float("matrix_lr", 0.005, 0.2, log=True))
|
| 173 |
+
env["HYDRA_EMBED_LR"] = str(trial.suggest_float("embed_lr", 0.05, 1.0, log=True))
|
| 174 |
+
env["HYDRA_UNEMBED_LR"] = str(trial.suggest_float("unembed_lr", 0.0005, 0.02, log=True))
|
| 175 |
+
|
| 176 |
+
env["HYDRA_ENGRAM_N_COLUMNS"] = str(trial.suggest_categorical("engram_n_columns", [256, 512, 1024]))
|
| 177 |
+
env["HYDRA_SDR_TARGET_ACTIVE"] = str(trial.suggest_categorical("sdr_target_active", [128, 256, 327, 512]))
|
| 178 |
+
env["HYDRA_HYENA_LAYERS"] = trial.suggest_categorical("hyena_layers", ["", "0", "1", "0,1"])
|
| 179 |
+
|
| 180 |
+
# Keep trials alive long enough to emit metrics.
|
| 181 |
+
env["HYDRA_FAIL_LOSS_THRESHOLD"] = "1000000"
|
| 182 |
+
env["HYDRA_USE_NEMOTRON"] = os.environ.get("HYDRA_USE_NEMOTRON", "1")
|
| 183 |
+
env["HYDRA_LOCAL_SHARDS_ONLY"] = os.environ.get("HYDRA_LOCAL_SHARDS_ONLY", "0")
|
| 184 |
+
# Strict optimal-path defaults (no forced fallback profile).
|
| 185 |
+
env["HYDRA_MUON_COMPILE"] = os.environ.get("HYDRA_MUON_COMPILE", "1")
|
| 186 |
+
env["HYDRA_FORCE_HTM_CPU"] = os.environ.get("HYDRA_FORCE_HTM_CPU", "0")
|
| 187 |
+
env["HYDRA_ALLOW_SYNTHETIC_RETINA"] = os.environ.get("HYDRA_ALLOW_SYNTHETIC_RETINA", "0")
|
| 188 |
+
env["HYDRA_INERT_MAMBA"] = os.environ.get("HYDRA_INERT_MAMBA", "0")
|
| 189 |
+
env["HYDRA_FASTPATH"] = os.environ.get("HYDRA_FASTPATH", "0")
|
| 190 |
+
|
| 191 |
+
return env
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def _sanitize_hf_env(env: dict[str, str]) -> dict[str, str]:
    """Drop environment entries whose key the HF Jobs API would reject.

    Only keys matching ``_HF_ENV_KEY_RE`` are kept; each kept value is
    converted with ``str()``.
    """
    return {name: str(setting) for name, setting in env.items() if _HF_ENV_KEY_RE.match(name)}
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def _hf_command_candidates(args: argparse.Namespace) -> list[list[str]]:
|
| 204 |
+
if args.hf_use_bash:
|
| 205 |
+
return [["bash", "-lc", args.hf_command]]
|
| 206 |
+
|
| 207 |
+
raw = args.hf_command.strip()
|
| 208 |
+
if args.hf_auto_command_fallback and raw == "/app/entrypoint.py":
|
| 209 |
+
candidates = [
|
| 210 |
+
["/usr/bin/python3", "/app/entrypoint.py"],
|
| 211 |
+
["/usr/local/bin/python3", "/app/entrypoint.py"],
|
| 212 |
+
["python3", "/app/entrypoint.py"],
|
| 213 |
+
["python", "/app/entrypoint.py"],
|
| 214 |
+
["/app/entrypoint.py"],
|
| 215 |
+
]
|
| 216 |
+
uniq: list[list[str]] = []
|
| 217 |
+
seen: set[tuple[str, ...]] = set()
|
| 218 |
+
for c in candidates:
|
| 219 |
+
key = tuple(c)
|
| 220 |
+
if key not in seen:
|
| 221 |
+
seen.add(key)
|
| 222 |
+
uniq.append(c)
|
| 223 |
+
return uniq
|
| 224 |
+
|
| 225 |
+
return [raw.split()]
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def _objective_local(args: argparse.Namespace):
    """Build an Optuna objective that runs one local training subprocess per trial.

    The returned callable launches ``TRAIN_ENTRYPOINT`` with the environment
    from ``_trial_env``, reads the metrics from the file named by
    ``HYDRA_METRICS_OUT`` (falling back to the ``[METRICS_JSON]`` line in
    stdout) and returns ``float(metrics[args.metric])``.

    A trial is pruned (``optuna.TrialPruned``) rather than failing the study
    when the run times out, exits non-zero, produces no metrics, lacks the
    requested metric, or reports a ``tps`` below ``args.min_tps``.
    """

    def objective(trial: optuna.Trial) -> float:
        trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
        metrics_path = trial_dir / "metrics.json"

        env = _trial_env(trial, args, metrics_path)

        try:
            proc = subprocess.run(
                [sys.executable, str(TRAIN_ENTRYPOINT)],
                cwd=str(REPO_ROOT),
                env=env,
                text=True,
                capture_output=True,
                timeout=args.trial_timeout,
            )
        except subprocess.TimeoutExpired as exc:
            # subprocess.run kills the child and raises on timeout; treat it
            # like any other failed trial instead of aborting the study.
            raise optuna.TrialPruned(f"Training timed out after {args.trial_timeout}s") from exc

        # Check the exit status first so a crash is reported as such, not as
        # "no metrics".
        if proc.returncode != 0:
            raise optuna.TrialPruned(f"Training failed rc={proc.returncode}")

        metrics: dict[str, Any] | None = None
        if metrics_path.exists():
            try:
                metrics = json.loads(metrics_path.read_text(encoding="utf-8"))
            except json.JSONDecodeError:
                metrics = None
        if metrics is None:
            metrics = _parse_metrics_from_stdout(proc.stdout)

        if metrics is None:
            raise optuna.TrialPruned("No metrics found (HYDRA_METRICS_OUT/[METRICS_JSON])")

        metric_key = args.metric
        if metric_key not in metrics or metrics[metric_key] is None:
            raise optuna.TrialPruned(f"Metric '{metric_key}' missing in metrics payload")

        tps_val = metrics.get("tps")
        if tps_val is not None:
            tps_f = float(tps_val)
            trial.set_user_attr("tps", tps_f)
            if args.min_tps is not None and tps_f < args.min_tps:
                raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {args.min_tps}")

        value = float(metrics[metric_key])

        # Keep useful context on trial
        trial.set_user_attr("summary_path", metrics.get("summary_path"))
        trial.set_user_attr("run_log_path", metrics.get("run_log_path"))

        return value

    return objective
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
def _objective_hf_job(args: argparse.Namespace):
    """Build an Optuna objective that runs every trial as a Hugging Face job.

    The returned callable launches a job with ``HfApi.run_job`` (trying each
    command produced by ``_hf_command_candidates``), polls its status and logs
    until a metrics payload appears, the job reaches a terminal stage, or
    ``args.trial_timeout`` seconds elapse, and finally returns ``args.metric``.

    Args:
        args: Parsed CLI namespace. The ``hf_*`` options, ``trial_timeout``,
            ``metric``, ``min_tps``, ``work_dir`` and
            ``allow_log_metric_fallback`` are read here.

    Returns:
        An ``objective(trial) -> float`` callable for ``study.optimize``.

    Raises:
        RuntimeError: If no Hugging Face token can be resolved.
    """
    from huggingface_hub import HfApi
    from huggingface_hub.utils import get_token

    token = os.environ.get(args.hf_token_env) or get_token()
    if not token:
        raise RuntimeError(
            f"No Hugging Face token found. Set {args.hf_token_env} or run huggingface-cli login."
        )

    api = HfApi(token=token)
    terminal_states = {"ERROR", "COMPLETED", "CANCELLED", "TIMEOUT", "FAILED", "CANCELED"}
    bootstrap_failure_states = {"ERROR", "FAILED", "CANCELLED", "CANCELED", "TIMEOUT"}
    tps_pattern = re.compile(r"\btps=([0-9]+(?:\.[0-9]+)?)")

    def _stage_name(raw_stage: Any) -> str:
        """Return the stage as a plain string such as ``"RUNNING"``.

        NOTE(review): the stage may be an Enum member rather than a str
        depending on the huggingface_hub version; ``str()`` of an Enum member
        would yield ``"Class.MEMBER"`` and never match the sets above, so the
        ``.value`` is preferred when present. Plain strings pass through.
        """
        return str(getattr(raw_stage, "value", raw_stage))

    def _status_message(status: Any) -> str:
        """Return the status message, mapping a missing/None message to ''."""
        return str(getattr(status, "message", "") or "")

    def objective(trial: optuna.Trial) -> float:
        """Run one trial as an HF job and return the optimised metric.

        Raises:
            optuna.TrialPruned: If no command candidate could be launched, no
                metrics were found, the metric key is missing, or the observed
                TPS is below ``args.min_tps``.
        """
        trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
        metrics_path = trial_dir / "metrics.json"
        env = _trial_env(trial, args, metrics_path)
        env = _sanitize_hf_env(env)

        selected_job = None
        launch_errors: list[str] = []
        for command in _hf_command_candidates(args):
            try:
                job = api.run_job(
                    image=args.hf_image,
                    command=command,
                    env=env,
                    secrets={args.hf_token_env: token},
                    flavor=args.hf_flavor,
                    timeout=args.hf_timeout,
                    labels={"project": "feather", "goal": "optuna-hpo", "trial": str(trial.number)},
                    token=token,
                    namespace=args.hf_namespace,
                )
            except Exception as e:
                # Any launch failure just moves on to the next command candidate.
                launch_errors.append(f"launch:{command}: {type(e).__name__}: {e}")
                continue

            # Bootstrap check: reject known command/exec failures quickly.
            bootstrap_deadline = time.time() + args.hf_bootstrap_seconds
            bootstrap_stage = "UNKNOWN"
            bootstrap_logs: list[str] = []
            bootstrap_msg = ""
            while time.time() < bootstrap_deadline:
                info = api.inspect_job(job_id=job.id, token=token, namespace=args.hf_namespace)
                bootstrap_stage = _stage_name(info.status.stage)
                bootstrap_msg = _status_message(info.status)
                bootstrap_logs = list(
                    api.fetch_job_logs(
                        job_id=job.id,
                        follow=False,
                        token=token,
                        namespace=args.hf_namespace,
                    )
                )
                if bootstrap_stage in {"RUNNING", "COMPLETED"} or bootstrap_logs:
                    break
                if bootstrap_stage in bootstrap_failure_states:
                    break
                time.sleep(2)

            # Only an error with no logs and an exec-style message means the
            # command itself is unusable; anything else is handled by the main
            # polling loop below.
            detail = bootstrap_msg.lower()
            unusable = (
                bootstrap_stage in {"ERROR", "FAILED"}
                and not bootstrap_logs
                and any(k in detail for k in ("executable file not found", "permission denied", "exec:"))
            )
            if unusable:
                launch_errors.append(f"bootstrap:{command}: {bootstrap_msg}")
                continue

            selected_job = job
            break

        if selected_job is None:
            raise optuna.TrialPruned(f"HF job launch failed across command candidates: {launch_errors[:3]}")

        job = selected_job
        job_id = job.id
        trial.set_user_attr("hf_job_id", job_id)

        start = time.time()
        metrics: dict[str, Any] | None = None
        tps_seen: float | None = None
        stage: str = "UNKNOWN"
        log_lines: list[str] = []
        terminal_detail: str | None = None

        while True:
            info = api.inspect_job(job_id=job_id, token=token, namespace=args.hf_namespace)
            stage = _stage_name(info.status.stage)
            # Keep the previous message when the current one is empty/None
            # (a bare str(None) would otherwise store the literal "None").
            terminal_detail = _status_message(info.status) or terminal_detail
            log_lines = list(api.fetch_job_logs(job_id=job_id, follow=False, token=token, namespace=args.hf_namespace))

            m = _parse_metrics_from_log_lines(log_lines)
            if m is not None:
                metrics = m
                break

            # Capture latest tps even before final metrics json: the last
            # matching line wins, so scan from the end and stop at the first hit.
            for line in reversed(log_lines):
                mt = tps_pattern.search(line)
                if mt:
                    tps_seen = float(mt.group(1))
                    break

            if stage in terminal_states:
                break
            if time.time() - start > args.trial_timeout:
                break
            time.sleep(args.hf_poll_interval)

        # Best-effort stop to control cost
        try:
            info = api.inspect_job(job_id=job_id, token=token, namespace=args.hf_namespace)
            if _stage_name(info.status.stage) not in terminal_states and args.hf_stop_after_metric:
                api.cancel_job(job_id=job_id, token=token, namespace=args.hf_namespace)
        except Exception as exc:
            # Never fail the trial over a cancel problem, but leave a trace so
            # a still-running (billed) job can be found afterwards.
            trial.set_user_attr("hf_cancel_error", f"{type(exc).__name__}: {exc}")

        # Save logs for debugging
        (trial_dir / "hf_job.log").write_text("\n".join(log_lines), encoding="utf-8")
        trial.set_user_attr("hf_stage", stage)
        trial.set_user_attr("hf_log_lines", len(log_lines))
        if terminal_detail:
            trial.set_user_attr("hf_status_message", terminal_detail)

        if metrics is None:
            if args.allow_log_metric_fallback and args.metric == "val_bpb":
                fallback_bpb = _parse_last_train_bpb_from_logs(log_lines)
                if fallback_bpb is not None:
                    trial.set_user_attr("metric_source", "log_bpb_fallback")
                    if tps_seen is not None:
                        trial.set_user_attr("tps", tps_seen)
                        if args.min_tps is not None and tps_seen < args.min_tps:
                            raise optuna.TrialPruned(f"TPS below floor: {tps_seen} < {args.min_tps}")
                    return float(fallback_bpb)
            if tps_seen is not None:
                trial.set_user_attr("tps", tps_seen)
            detail = f"stage={stage}, logs={len(log_lines)}"
            if terminal_detail:
                detail = f"{detail}, message={terminal_detail}"
            raise optuna.TrialPruned(f"No metrics found from HF job ({detail})")

        metric_key = args.metric
        if metric_key not in metrics or metrics[metric_key] is None:
            raise optuna.TrialPruned(f"Metric '{metric_key}' missing in metrics payload")

        tps_val = metrics.get("tps")
        if tps_val is not None:
            tps_f = float(tps_val)
            trial.set_user_attr("tps", tps_f)
            if args.min_tps is not None and tps_f < args.min_tps:
                raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {args.min_tps}")

        value = float(metrics[metric_key])
        trial.set_user_attr("summary_path", metrics.get("summary_path"))
        trial.set_user_attr("run_log_path", metrics.get("run_log_path"))
        return value

    return objective
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
def _objective_hf_launcher(args: argparse.Namespace):
    """Build an Optuna objective that launches trials via a local launcher script.

    Each trial runs ``args.hf_launcher_script`` as a subprocess with the trial
    environment plus ``FEATHER_HF_*`` routing variables, reads the job id from a
    ``job_id=<id>`` token on the launcher's stdout, then polls that job's status
    and logs until metrics appear, the job reaches a terminal stage, or
    ``args.trial_timeout`` seconds elapse.

    Args:
        args: Parsed CLI namespace. The ``hf_*`` options, ``trial_timeout``,
            ``metric``, ``min_tps``, ``work_dir`` and
            ``allow_log_metric_fallback`` are read here.

    Returns:
        An ``objective(trial) -> float`` callable for ``study.optimize``.

    Raises:
        RuntimeError: If no Hugging Face token can be resolved.
    """
    from huggingface_hub import HfApi
    from huggingface_hub.utils import get_token

    token = os.environ.get(args.hf_token_env) or get_token()
    if not token:
        raise RuntimeError(
            f"No Hugging Face token found. Set {args.hf_token_env} or run huggingface-cli login."
        )

    api = HfApi(token=token)
    terminal_states = {"ERROR", "COMPLETED", "CANCELLED", "TIMEOUT", "FAILED", "CANCELED"}
    tps_pattern = re.compile(r"\btps=([0-9]+(?:\.[0-9]+)?)")

    def _stage_name(raw_stage: Any) -> str:
        """Return the stage as a plain string such as ``"RUNNING"``.

        NOTE(review): the stage may be an Enum member rather than a str
        depending on the huggingface_hub version; ``str()`` of an Enum member
        would yield ``"Class.MEMBER"`` and never match ``terminal_states``, so
        the ``.value`` is preferred when present. Plain strings pass through.
        """
        return str(getattr(raw_stage, "value", raw_stage))

    def objective(trial: optuna.Trial) -> float:
        """Run one trial through the launcher script and return the metric.

        Raises:
            optuna.TrialPruned: If the launcher times out, exits non-zero or
                prints no job id, if no metrics were found, if the metric key
                is missing, or if the observed TPS is below ``args.min_tps``.
        """
        trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
        metrics_path = trial_dir / "metrics.json"
        env = _trial_env(trial, args, metrics_path)
        env = _sanitize_hf_env(env)

        # The launcher inherits the current environment, overlaid with the
        # trial variables and the routing settings it reads.
        local_env = os.environ.copy()
        local_env.update(env)
        local_env[args.hf_token_env] = token
        local_env["FEATHER_HF_NAMESPACE"] = args.hf_namespace
        local_env["FEATHER_HF_FLAVOR"] = args.hf_flavor
        local_env["FEATHER_HF_JOB_TIMEOUT"] = args.hf_timeout
        local_env["FEATHER_HF_IMAGE"] = args.hf_image
        local_env["FEATHER_HF_SPACE_REPO"] = f"{args.hf_namespace}/feather-h200-runtime"
        if args.hf_output_repo:
            local_env["FEATHER_HF_OUTPUT_REPO"] = args.hf_output_repo
        else:
            local_env["FEATHER_HF_OUTPUT_REPO"] = f"{args.hf_namespace}/feather-pretrain-checkpoints"

        launch_timeout = max(args.trial_timeout, 120)
        try:
            proc = subprocess.run(
                [sys.executable, str(args.hf_launcher_script)],
                cwd=str(REPO_ROOT),
                env=local_env,
                text=True,
                capture_output=True,
                timeout=launch_timeout,
            )
        except subprocess.TimeoutExpired as exc:
            # A hung launcher is a failed trial, not a reason to abort the
            # whole study with an unhandled exception.
            raise optuna.TrialPruned(f"HF launcher timed out after {launch_timeout}s") from exc

        launch_stdout = proc.stdout or ""
        launch_stderr = proc.stderr or ""
        m = re.search(r"job_id=([a-zA-Z0-9_-]+)", launch_stdout)
        if proc.returncode != 0 or not m:
            raise optuna.TrialPruned(
                f"HF launcher failed rc={proc.returncode}; stderr={launch_stderr[-400:]} stdout_tail={launch_stdout[-400:]}"
            )

        job_id = m.group(1)
        trial.set_user_attr("hf_job_id", job_id)

        start = time.time()
        metrics: dict[str, Any] | None = None
        tps_seen: float | None = None
        stage: str = "UNKNOWN"
        log_lines: list[str] = []
        terminal_detail: str | None = None

        while True:
            info = api.inspect_job(job_id=job_id, token=token, namespace=args.hf_namespace)
            stage = _stage_name(info.status.stage)
            # Keep the previous message when the current one is empty/None.
            terminal_detail = str(getattr(info.status, "message", "") or "") or terminal_detail
            log_lines = list(api.fetch_job_logs(job_id=job_id, follow=False, token=token, namespace=args.hf_namespace))

            mtr = _parse_metrics_from_log_lines(log_lines)
            if mtr is not None:
                metrics = mtr
                break

            # The last line carrying a tps value wins, so scan from the end
            # and stop at the first hit.
            for line in reversed(log_lines):
                mt = tps_pattern.search(line)
                if mt:
                    tps_seen = float(mt.group(1))
                    break

            if stage in terminal_states:
                break
            if time.time() - start > args.trial_timeout:
                break
            time.sleep(args.hf_poll_interval)

        # Best-effort stop to control cost.
        try:
            info = api.inspect_job(job_id=job_id, token=token, namespace=args.hf_namespace)
            if _stage_name(info.status.stage) not in terminal_states and args.hf_stop_after_metric:
                api.cancel_job(job_id=job_id, token=token, namespace=args.hf_namespace)
        except Exception as exc:
            # Never fail the trial over a cancel problem, but leave a trace so
            # a still-running (billed) job can be found afterwards.
            trial.set_user_attr("hf_cancel_error", f"{type(exc).__name__}: {exc}")

        # Save logs for debugging.
        (trial_dir / "hf_job.log").write_text("\n".join(log_lines), encoding="utf-8")
        trial.set_user_attr("hf_stage", stage)
        trial.set_user_attr("hf_log_lines", len(log_lines))
        if terminal_detail:
            trial.set_user_attr("hf_status_message", terminal_detail)

        if metrics is None:
            if args.allow_log_metric_fallback and args.metric == "val_bpb":
                fallback_bpb = _parse_last_train_bpb_from_logs(log_lines)
                if fallback_bpb is not None:
                    trial.set_user_attr("metric_source", "log_bpb_fallback")
                    if tps_seen is not None:
                        trial.set_user_attr("tps", tps_seen)
                        if args.min_tps is not None and tps_seen < args.min_tps:
                            raise optuna.TrialPruned(f"TPS below floor: {tps_seen} < {args.min_tps}")
                    return float(fallback_bpb)
            if tps_seen is not None:
                trial.set_user_attr("tps", tps_seen)
            detail = f"stage={stage}, logs={len(log_lines)}"
            if terminal_detail:
                detail = f"{detail}, message={terminal_detail}"
            raise optuna.TrialPruned(f"No metrics found from HF launcher job ({detail})")

        metric_key = args.metric
        if metric_key not in metrics or metrics[metric_key] is None:
            raise optuna.TrialPruned(f"Metric '{metric_key}' missing in metrics payload")

        tps_val = metrics.get("tps")
        if tps_val is not None:
            tps_f = float(tps_val)
            trial.set_user_attr("tps", tps_f)
            if args.min_tps is not None and tps_f < args.min_tps:
                raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {args.min_tps}")

        value = float(metrics[metric_key])
        trial.set_user_attr("summary_path", metrics.get("summary_path"))
        trial.set_user_attr("run_log_path", metrics.get("run_log_path"))
        return value

    return objective
|
| 569 |
+
|
| 570 |
+
|
| 571 |
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command line of the Optuna HPO runner.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The populated namespace. Defaults for ``--hf-namespace``, ``--hf-image``
        and ``--hf-output-repo`` are taken from ``resolve_routing``.
    """
    routing_defaults = resolve_routing(token=os.environ.get("HF_TOKEN"))
    optuna_tmp = REPO_ROOT / ".tmp" / "optuna"

    parser = argparse.ArgumentParser(description="Optuna HPO runner for HYDRA train.py")
    add = parser.add_argument

    def add_toggle(flag: str, *, default: bool, help_text: str) -> None:
        # Registers --<flag> (store_true) followed by --no-<flag> (store_false)
        # writing to the same destination; the first registration's default wins.
        add(f"--{flag}", action="store_true", default=default, help=help_text)
        add(f"--no-{flag}", action="store_false", dest=flag.replace("-", "_"))

    # Study / objective settings.
    add("--study-name", default="hydra_hpo", help="Optuna study name")
    add("--storage", default="sqlite:///optuna_hpo.db", help="Optuna storage URL")
    add("--direction", choices=["minimize", "maximize"], default="minimize")
    add("--metric", default="val_bpb", help="Metric key to optimize from HYDRA metrics")
    add(
        "--min-tps",
        type=float,
        default=50000.0,
        help="TPS floor; prune trials under this value (set 0 to disable)",
    )
    add("--trials", type=int, default=20, help="Number of Optuna trials")
    add("--study-timeout", type=int, default=None, help="Study timeout in seconds")
    add("--trial-time-budget", type=int, default=300, help="HYDRA_TIME_BUDGET passed to each trial")
    add("--trial-timeout", type=int, default=900, help="Subprocess timeout per trial in seconds")
    add("--runner", choices=["local", "hf-job", "hf-launcher"], default="local", help="Trial execution backend")

    # Hugging Face job settings.
    add("--hf-namespace", default=routing_defaults.job_namespace, help="HF namespace for jobs")
    add("--hf-image", default=f"hf.co/spaces/{routing_defaults.space_repo}", help="HF jobs image")
    add("--hf-flavor", default="a10g-large", help="HF jobs hardware flavor")
    add("--hf-timeout", default="25m", help="HF job timeout string")
    add("--hf-command", default="/app/entrypoint.py", help="Command executed inside HF job")
    add("--hf-use-bash", action="store_true", help="Run HF command via bash -lc")
    add_toggle(
        "hf-auto-command-fallback",
        default=True,
        help_text="Auto-wrap entrypoint command with python/python3/uv fallback",
    )
    add("--hf-poll-interval", type=int, default=12, help="HF job poll interval seconds")
    add("--hf-bootstrap-seconds", type=int, default=18, help="Initial seconds to validate command bootstrap")
    add("--hf-token-env", default="HF_TOKEN", help="Token env key passed as HF job secret")
    add_toggle(
        "hf-stop-after-metric",
        default=True,
        help_text="Cancel running job after metrics captured",
    )
    add(
        "--hf-launcher-script",
        type=Path,
        default=REPO_ROOT / "scripts" / "launch_feather_hf_job.py",
        help="Local launcher script for hf-launcher runner",
    )
    add(
        "--hf-output-repo",
        default=routing_defaults.output_repo,
        help="Optional FEATHER_HF_OUTPUT_REPO override for launcher runner",
    )

    # Metric fallback and transfer priors.
    add_toggle(
        "allow-log-metric-fallback",
        default=False,
        help_text="When metrics JSON is missing, allow val_bpb fallback from latest logged train bpb",
    )
    add(
        "--priors-file",
        type=Path,
        default=REPO_ROOT / "docs" / "hpo_transfer_priors.json",
        help="Path to transfer-learning prior trials JSON",
    )
    add_toggle(
        "apply-priors",
        default=True,
        help_text="Enqueue transfer-learning prior trials before optimize",
    )

    # Sampler, pruner and early stopping.
    add("--seed", type=int, default=42, help="Seed for sampler")
    add("--n-startup-trials", type=int, default=5, help="Pruner startup trials before pruning")
    add("--n-warmup-steps", type=int, default=0, help="Pruner warmup steps")
    add(
        "--patience-trials",
        type=int,
        default=None,
        help="Stop study after this many completed trials without meaningful improvement",
    )
    add("--min-improvement", type=float, default=0.0, help="Minimum best-value improvement to reset patience")

    # Output locations.
    add("--work-dir", type=Path, default=optuna_tmp, help="Directory for trial artifacts")
    add("--summary-out", type=Path, default=optuna_tmp / "best_summary.json")

    return parser.parse_args(argv)
|
| 617 |
+
|
| 618 |
+
|
| 619 |
+
def main() -> int:
    """Run the Optuna study described by the CLI arguments.

    Creates (or reloads) the study, optionally enqueues transfer priors, picks
    the objective for ``args.runner``, optimises, and writes a JSON summary of
    the best trial to ``args.summary_out`` (also printed to stdout).

    Returns:
        Process exit code; always ``0`` when the study finishes.
    """
    args = parse_args()
    args.work_dir.mkdir(parents=True, exist_ok=True)
    args.summary_out.parent.mkdir(parents=True, exist_ok=True)

    sampler = optuna.samplers.TPESampler(seed=args.seed, multivariate=True)
    pruner = optuna.pruners.MedianPruner(
        n_startup_trials=args.n_startup_trials,
        n_warmup_steps=args.n_warmup_steps,
    )

    study = optuna.create_study(
        study_name=args.study_name,
        storage=args.storage,
        load_if_exists=True,
        direction=args.direction,
        sampler=sampler,
        pruner=pruner,
    )

    enqueued_priors = _enqueue_transfer_priors(study, args.priors_file, args.apply_priors)
    if enqueued_priors:
        print(f"[hpo] enqueued {enqueued_priors} transfer priors from {args.priors_file}")

    # Early-stopping bookkeeping shared with the callback below.
    state: dict[str, Any] = {
        "best": None,
        "best_trial_number": None,
        "last_improve_trial_number": None,
        "stale_completed": 0,
    }

    def _improved(new_value: float, best_value: float) -> bool:
        """Return True when ``new_value`` beats ``best_value`` by more than ``--min-improvement``."""
        if args.direction == "minimize":
            return new_value < (best_value - args.min_improvement)
        return new_value > (best_value + args.min_improvement)

    def _early_stop_callback(study_obj: optuna.Study, trial: optuna.trial.FrozenTrial) -> None:
        """Stop the study after ``--patience-trials`` completed trials without improvement."""
        if trial.value is None:
            # Pruned/failed trials carry no value and must not consume patience.
            return

        if state["best"] is None or _improved(float(trial.value), float(state["best"])):
            state["best"] = float(trial.value)
            state["best_trial_number"] = trial.number
            state["last_improve_trial_number"] = trial.number
            state["stale_completed"] = 0
            return

        if args.patience_trials is None:
            return

        # Count valued, non-improving trials explicitly. Using the difference
        # of trial numbers would also count pruned trials, contradicting the
        # "completed trials" wording of --patience-trials.
        state["stale_completed"] += 1
        if state["stale_completed"] >= args.patience_trials:
            study_obj.stop()

    callbacks = [_early_stop_callback] if args.patience_trials is not None else None
    if args.runner == "local":
        objective_fn = _objective_local(args)
    elif args.runner == "hf-job":
        objective_fn = _objective_hf_job(args)
    else:
        objective_fn = _objective_hf_launcher(args)

    study.optimize(
        objective_fn,
        n_trials=args.trials,
        timeout=args.study_timeout,
        callbacks=callbacks,
    )

    # study.best_* only consider COMPLETE trials, so select on state rather
    # than on "has a value" to keep the two in agreement.
    completed = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
    if completed:
        best_trial = study.best_trial
        best = {
            "study_name": study.study_name,
            "direction": args.direction,
            "metric": args.metric,
            "best_value": study.best_value,
            "best_params": study.best_params,
            "best_trial_number": best_trial.number,
            "best_trial_user_attrs": best_trial.user_attrs,
            "n_trials": len(study.trials),
            "n_completed": len(completed),
            "patience_trials": args.patience_trials,
            "min_improvement": args.min_improvement,
            "enqueued_priors": enqueued_priors,
        }
    else:
        best = {
            "study_name": study.study_name,
            "direction": args.direction,
            "metric": args.metric,
            "best_value": None,
            "best_params": {},
            "best_trial_number": None,
            "best_trial_user_attrs": {},
            "n_trials": len(study.trials),
            "n_completed": 0,
            "enqueued_priors": enqueued_priors,
            "note": "No completed trials with metrics found.",
        }

    summary_json = json.dumps(best, indent=2)
    args.summary_out.write_text(summary_json, encoding="utf-8")
    print(summary_json)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
overlay/scripts/parse_metrics.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Parse train.py run.log → (bpb, tps_avg, factual).
|
| 2 |
+
|
| 3 |
+
bpb priority order:
|
| 4 |
+
1. val_bpb from [VAL] line (cleanest signal, but OOMs on 6GB cards)
|
| 5 |
+
2. train_bpb from the LAST step= line (proxy when val fails — not held-out
|
| 6 |
+
but monotone with model capability over a 5-min budget)
|
| 7 |
+
"""
|
| 8 |
+
import re
import sys


def parse_metrics(txt: str) -> tuple[str, str, str]:
    """Extract ``(bpb, tps_avg, factual)`` from the text of a run log.

    Args:
        txt: Full contents of the log file.

    Returns:
        A tuple of three strings:

        * ``bpb`` - the first ``val_bpb:`` value; otherwise the ``bpb=`` value
          of the last ``step=`` line prefixed with ``~``; otherwise ``'NA'``.
        * ``tps_avg`` - mean of every integer ``tps=`` value, rounded to a
          whole number, or ``'NA'`` when none are present.
        * ``factual`` - the ``factual_english_hits:`` ratio (``hits/total``)
          or ``'NA'``.
    """
    val_match = re.search(r'val_bpb:\s+([\d\.]+)', txt)
    if val_match:
        bpb = val_match.group(1)
    else:
        # No validation figure: fall back to the training bpb of the last
        # step line and mark it with "~" as a proxy value.
        step_lines = re.findall(r'^step=\d+\s+loss=[\d\.]+\s+bpb=([\d\.]+)', txt, re.M)
        bpb = f'~{step_lines[-1]}' if step_lines else 'NA'

    tps_vals = [int(hit.group(1)) for hit in re.finditer(r'tps=(\d+)', txt)]
    tps_avg = f'{sum(tps_vals)/len(tps_vals):.0f}' if tps_vals else 'NA'

    factual_match = re.search(r'factual_english_hits:\s+(\d+/\d+)', txt)
    factual = factual_match.group(1) if factual_match else 'NA'

    return bpb, tps_avg, factual


def main(argv=None) -> int:
    """Print ``bpb<TAB>tps_avg<TAB>factual`` for the log file given as first argument.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        ``0`` on success, ``2`` when no log path was supplied.
    """
    cli_args = sys.argv[1:] if argv is None else argv
    if not cli_args:
        print("usage: parse_metrics.py RUN_LOG", file=sys.stderr)
        return 2

    # Close the handle deterministically and tolerate stray bytes in the log.
    with open(cli_args[0], encoding="utf-8", errors="replace") as log_file:
        txt = log_file.read()

    bpb, tps_avg, factual = parse_metrics(txt)
    print(f"{bpb}\t{tps_avg}\t{factual}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
overlay/scripts/run_domain_expanded_pretrain.sh
CHANGED
|
@@ -1,262 +1,262 @@
|
|
| 1 |
-
#!/usr/bin/env bash
|
| 2 |
-
# Domain-expanded streaming pretrain launcher for Feather/HYDRA.
|
| 3 |
-
#
|
| 4 |
-
# Usage:
|
| 5 |
-
# ./scripts/run_domain_expanded_pretrain.sh
|
| 6 |
-
# HYDRA_TARGET_SHARDS=2048 HYDRA_TIME_BUDGET=28800 ./scripts/run_domain_expanded_pretrain.sh
|
| 7 |
-
# ./scripts/run_domain_expanded_pretrain.sh --target-shards 1024 --dry-run
|
| 8 |
-
# ./scripts/run_domain_expanded_pretrain.sh --target-shards -1 --download-workers 16
|
| 9 |
-
#
|
| 10 |
-
# Behavior:
|
| 11 |
-
# - counts currently cached parquet shards in ~/.cache/autoresearch/data
|
| 12 |
-
# - optionally expands shard coverage toward a target via prepare.py
|
| 13 |
-
# - skips prepare.py entirely when target coverage is already satisfied
|
| 14 |
-
# - exports WSL CUDA library paths and long-run HYDRA_* env vars
|
| 15 |
-
# - prefers an existing latest/pretrain checkpoint path if one is present
|
| 16 |
-
# - streams stdout/stderr to a stable repo log: run_domain_expanded.log
|
| 17 |
-
set -euo pipefail

# Resolve the repository root (parent of this script's directory) and run from it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
cd "$REPO_ROOT"

# Locations; each can be overridden through the matching HYDRA_* variable.
CACHE_ROOT="${HYDRA_CACHE_ROOT:-$HOME/.cache/autoresearch}"
DATA_DIR="${HYDRA_DATA_DIR:-$CACHE_ROOT/data}"
CKPT_DIR="${HYDRA_CKPT_DIR:-$CACHE_ROOT/ckpts}"
LOG_FILE="${HYDRA_DOMAIN_EXPANDED_LOG:-$REPO_ROOT/run_domain_expanded.log}"

# Shard coverage and download parallelism.
DEFAULT_TARGET_SHARDS="2048"
TARGET_SHARDS="${HYDRA_TARGET_SHARDS:-$DEFAULT_TARGET_SHARDS}"
DOWNLOAD_WORKERS="${HYDRA_DOWNLOAD_WORKERS:-8}"

# Checkpoint to resume from, when supplied through the environment.
EXPLICIT_RESUME_PATH="${HYDRA_RESUME_PATH:-}"

# Command-line switches, all off until the option loop turns them on.
DRY_RUN=0
SKIP_TRAIN=0
FORCE_PREPARE=0
NO_RESUME=0
|
| 34 |
-
|
| 35 |
-
# Print the help text: the header comment of this script (its lines 2-16)
# followed by the option summary.
function usage {
  sed -n '2,16p' < "$0"
  cat <<'EOF'

Options:
--target-shards N Target number of train shards to have locally (-1 = all)
--download-workers N Parallel workers for prepare.py downloads
--resume PATH Override auto-detected checkpoint path
--no-resume Ignore existing checkpoints
--skip-train Only ensure shard coverage, do not launch train.py
--force-prepare Run prepare.py even if target coverage is already satisfied
--dry-run Print planned actions without running prepare.py/train.py
-h, --help Show this help
EOF
}
|
| 50 |
-
|
| 51 |
-
# Abort with a usage error when an option that takes a value is the last word
# on the command line. Under `set -u` the bare "$2" would otherwise kill the
# script with an unhelpful "unbound variable" message.
#   $1 = option name, $2 = number of arguments still to be parsed
require_option_value() {
  if [[ "$2" -lt 2 ]]; then
    echo "Missing value for option: $1" >&2
    usage >&2
    exit 2
  fi
}

# Consume command-line options; they take precedence over the HYDRA_* defaults.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --target-shards)
      require_option_value "$1" "$#"
      TARGET_SHARDS="$2"
      shift 2
      ;;
    --download-workers)
      require_option_value "$1" "$#"
      DOWNLOAD_WORKERS="$2"
      shift 2
      ;;
    --resume)
      require_option_value "$1" "$#"
      EXPLICIT_RESUME_PATH="$2"
      shift 2
      ;;
    --no-resume)
      NO_RESUME=1
      shift
      ;;
    --skip-train)
      SKIP_TRAIN=1
      shift
      ;;
    --force-prepare)
      FORCE_PREPARE=1
      shift
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done
|
| 92 |
-
|
| 93 |
-
# The shard target is either -1 ("all") or a non-negative integer. Other
# negative numbers used to pass validation and were then treated as an
# already-satisfied target by the later -ge comparison.
if ! [[ "$TARGET_SHARDS" =~ ^(-1|[0-9]+)$ ]]; then
  echo "Invalid --target-shards: $TARGET_SHARDS" >&2
  exit 2
fi
# The worker count must be a positive integer. Matching by pattern avoids the
# arithmetic error that `-lt` raises for leading-zero values such as "08".
if ! [[ "$DOWNLOAD_WORKERS" =~ ^0*[1-9][0-9]*$ ]]; then
  echo "Invalid --download-workers: $DOWNLOAD_WORKERS" >&2
  exit 2
fi
|
| 101 |
-
|
| 102 |
-
# Succeed only when the interpreter given as $1 can import every module the
# pipeline needs; all interpreter output is discarded.
python_has_deps() {
  local interpreter="$1"
  "$interpreter" -c 'import requests, pyarrow, rustbpe, torch' >/dev/null 2>&1
}
|
| 108 |
-
|
| 109 |
-
# Choose the interpreter command, in order of preference: the repo virtualenv
# (if it has the deps), `uv run python` (if uv is installed), then python3 on
# PATH (if it has the deps).
venv_python="$REPO_ROOT/.venv/bin/python"
system_python="$(command -v python3 2>/dev/null || true)"

if [[ -x "$venv_python" ]] && python_has_deps "$venv_python"; then
  PYTHON_CMD=("$venv_python")
elif command -v uv >/dev/null 2>&1; then
  PYTHON_CMD=(uv run python)
elif [[ -n "$system_python" ]] && python_has_deps "$system_python"; then
  PYTHON_CMD=(python3)
else
  echo "No usable Python interpreter found with required deps (.venv/bin/python, uv run python, or python3)." >&2
  exit 1
fi
|
| 119 |
-
|
| 120 |
-
# Print how many shard_*.parquet files sit directly in DATA_DIR, leaving out
# shard_06542.parquet; prints 0 when the directory does not exist.
count_train_shards() {
  [[ -d "$DATA_DIR" ]] || { echo 0; return; }
  find "$DATA_DIR" -maxdepth 1 -type f -name 'shard_*.parquet' ! -name 'shard_06542.parquet' | wc -l
}
|
| 127 |
-
|
| 128 |
-
# Print how many shard_*.parquet files sit directly in DATA_DIR (no
# exclusions); prints 0 when the directory does not exist.
count_total_shards() {
  [[ -d "$DATA_DIR" ]] || { echo 0; return; }
  find "$DATA_DIR" -maxdepth 1 -type f -name 'shard_*.parquet' | wc -l
}
|
| 135 |
-
|
| 136 |
-
# Print the checkpoint to resume from, or nothing when there is none.
#   - NO_RESUME=1: prints nothing.
#   - EXPLICIT_RESUME_PATH set: a leading "~" is replaced by $HOME; the file
#     must exist, otherwise an error is written to stderr and `exit 1` runs.
#   - otherwise: the first existing file from the candidate list below.
resolve_resume_path() {
  [[ "$NO_RESUME" -eq 1 ]] && return 0

  if [[ -n "$EXPLICIT_RESUME_PATH" ]]; then
    local requested="${EXPLICIT_RESUME_PATH/#\~/$HOME}"
    if [[ ! -f "$requested" ]]; then
      echo "Requested resume checkpoint not found: $requested" >&2
      exit 1
    fi
    printf '%s\n' "$requested"
    return 0
  fi

  # Candidates are listed in priority order.
  local ckpt
  for ckpt in \
    "$CKPT_DIR/latest.pt" \
    "$CKPT_DIR/pretrain_latest.pt" \
    "$CKPT_DIR/pretrain_final.pt" \
    "$CACHE_ROOT/latest.pt" \
    "$CACHE_ROOT/pretrain_latest.pt" \
    "$CACHE_ROOT/pretrain_final.pt" \
    "$REPO_ROOT/latest.pt" \
    "$REPO_ROOT/pretrain_final.pt"
  do
    if [[ -f "$ckpt" ]]; then
      printf '%s\n' "$ckpt"
      return 0
    fi
  done
  return 0
}
|
| 169 |
-
|
| 170 |
-
# Snapshot the current shard cache (tr strips any padding wc adds to its count).
CURRENT_TRAIN_SHARDS="$(count_train_shards | tr -d ' ')"
CURRENT_TOTAL_SHARDS="$(count_total_shards | tr -d ' ')"
HAS_VAL=0
if [[ -f "$DATA_DIR/shard_06542.parquet" ]]; then
  HAS_VAL=1
fi

# Decide whether prepare.py has to run: always for "all shards" (-1), only on
# --force-prepare when the target is already met, otherwise yes.
PREPARE_NUM_SHARDS="$TARGET_SHARDS"
if [[ "$TARGET_SHARDS" -eq -1 ]]; then
  TARGET_DESC="all available train shards"
  NEED_PREPARE=1
elif [[ "$CURRENT_TRAIN_SHARDS" -ge "$TARGET_SHARDS" ]]; then
  TARGET_DESC="$TARGET_SHARDS"
  NEED_PREPARE="$FORCE_PREPARE"
else
  TARGET_DESC="$TARGET_SHARDS"
  NEED_PREPARE=1
fi

# resolve_resume_path returns 0 whenever it finds nothing and only fails (via
# `exit 1`) when an explicitly requested checkpoint is missing. No `|| true`
# here: it would swallow that failure and silently start training from scratch.
# With `set -e` the failed assignment now aborts the script.
RESUME_PATH="$(resolve_resume_path)"
|
| 190 |
-
|
| 191 |
-
# Put the WSL and CUDA library directories in front of any existing
# LD_LIBRARY_PATH.
LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# Run settings; values already present in the environment win over defaults.
HYDRA_TIME_BUDGET="${HYDRA_TIME_BUDGET:-28800}"
HYDRA_TARGET_SHARDS="$TARGET_SHARDS"
HYDRA_DOWNLOAD_WORKERS="$DOWNLOAD_WORKERS"
HYDRA_DOMAIN_EXPANDED_LOG="$LOG_FILE"
HYDRA_CKPT_INTERVAL="${HYDRA_CKPT_INTERVAL:-2000}"
HYDRA_CHECKPOINT_INTERVAL="${HYDRA_CHECKPOINT_INTERVAL:-$HYDRA_CKPT_INTERVAL}"

export LD_LIBRARY_PATH HYDRA_TIME_BUDGET HYDRA_TARGET_SHARDS HYDRA_DOWNLOAD_WORKERS
export HYDRA_DOMAIN_EXPANDED_LOG HYDRA_CKPT_INTERVAL HYDRA_CHECKPOINT_INTERVAL

# Export the resume checkpoint under both variable names, only when one was found.
if [[ -n "$RESUME_PATH" ]]; then
  HYDRA_RESUME_PATH="$RESUME_PATH"
  HYDRA_RESUME_CKPT="$RESUME_PATH"
  export HYDRA_RESUME_PATH HYDRA_RESUME_CKPT
fi

mkdir -p "$(dirname "$LOG_FILE")"
|
| 204 |
-
|
| 205 |
-
ts() { date '+%Y-%m-%d %H:%M:%S'; }
|
| 206 |
-
log() {
|
| 207 |
-
local line="[$(ts)] $*"
|
| 208 |
-
echo "$line"
|
| 209 |
-
echo "$line" >> "$LOG_FILE"
|
| 210 |
-
}
|
| 211 |
-
|
| 212 |
-
log "=== domain-expanded pretrain launcher ==="
|
| 213 |
-
log "repo_root=$REPO_ROOT"
|
| 214 |
-
log "data_dir=$DATA_DIR train_shards=$CURRENT_TRAIN_SHARDS total_shards=$CURRENT_TOTAL_SHARDS has_val=$HAS_VAL"
|
| 215 |
-
log "target_train_shards=$TARGET_DESC download_workers=$DOWNLOAD_WORKERS"
|
| 216 |
-
log "log_file=$LOG_FILE"
|
| 217 |
-
log "python=${PYTHON_CMD[*]}"
|
| 218 |
-
log "HYDRA_TIME_BUDGET=$HYDRA_TIME_BUDGET"
|
| 219 |
-
log "HYDRA_CKPT_INTERVAL=$HYDRA_CKPT_INTERVAL"
|
| 220 |
-
if [[ -n "$RESUME_PATH" ]]; then
|
| 221 |
-
log "resume_checkpoint=$RESUME_PATH"
|
| 222 |
-
else
|
| 223 |
-
log "resume_checkpoint=<none found>"
|
| 224 |
-
fi
|
| 225 |
-
log "note=train.py consumes HYDRA_RESUME_CKPT and HYDRA_CKPT_INTERVAL env vars; launcher exports them automatically"
|
| 226 |
-
|
| 227 |
-
if [[ "${HYDRA_USE_NEMOTRON:-0}" == "1" ]]; then
|
| 228 |
-
# Streaming Nemotron path (Super3 recipe) pulls tokens directly from HF at
|
| 229 |
-
# train-time via prepare_nemotron.make_dataloader. The disk-shard prepare.py
|
| 230 |
-
# download phase is redundant in this mode and wastes 20-30 min of paid GPU
|
| 231 |
-
# time on shard parquet transfers we'll never read.
|
| 232 |
-
log "prepare_action=skip reason=HYDRA_USE_NEMOTRON=1 (streaming at train-time)"
|
| 233 |
-
elif [[ "$NEED_PREPARE" -eq 1 ]]; then
|
| 234 |
-
PREPARE_CMD=("${PYTHON_CMD[@]}" prepare.py --num-shards "$PREPARE_NUM_SHARDS" --download-workers "$DOWNLOAD_WORKERS")
|
| 235 |
-
log "prepare_action=run command=${PREPARE_CMD[*]}"
|
| 236 |
-
if [[ "$DRY_RUN" -eq 0 ]]; then
|
| 237 |
-
"${PREPARE_CMD[@]}" 2>&1 | tee -a "$LOG_FILE"
|
| 238 |
-
CURRENT_TRAIN_SHARDS="$(count_train_shards | tr -d ' ')"
|
| 239 |
-
CURRENT_TOTAL_SHARDS="$(count_total_shards | tr -d ' ')"
|
| 240 |
-
log "post_prepare train_shards=$CURRENT_TRAIN_SHARDS total_shards=$CURRENT_TOTAL_SHARDS"
|
| 241 |
-
fi
|
| 242 |
-
else
|
| 243 |
-
log "prepare_action=skip reason=target_already_satisfied"
|
| 244 |
-
fi
|
| 245 |
-
|
| 246 |
-
TRAIN_CMD=("${PYTHON_CMD[@]}" -u train.py)
|
| 247 |
-
if [[ "$SKIP_TRAIN" -eq 1 ]]; then
|
| 248 |
-
log "train_action=skip reason=--skip-train"
|
| 249 |
-
exit 0
|
| 250 |
-
fi
|
| 251 |
-
|
| 252 |
-
log "train_action=launch command=${TRAIN_CMD[*]}"
|
| 253 |
-
if [[ "$DRY_RUN" -eq 1 ]]; then
|
| 254 |
-
exit 0
|
| 255 |
-
fi
|
| 256 |
-
|
| 257 |
-
set +e
|
| 258 |
-
"${TRAIN_CMD[@]}" 2>&1 | tee -a "$LOG_FILE"
|
| 259 |
-
EXIT_CODE=${PIPESTATUS[0]}
|
| 260 |
-
set -e
|
| 261 |
-
log "train_exit_code=$EXIT_CODE"
|
| 262 |
-
exit "$EXIT_CODE"
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# Domain-expanded streaming pretrain launcher for Feather/HYDRA.
|
| 3 |
+
#
|
| 4 |
+
# Usage:
|
| 5 |
+
# ./scripts/run_domain_expanded_pretrain.sh
|
| 6 |
+
# HYDRA_TARGET_SHARDS=2048 HYDRA_TIME_BUDGET=28800 ./scripts/run_domain_expanded_pretrain.sh
|
| 7 |
+
# ./scripts/run_domain_expanded_pretrain.sh --target-shards 1024 --dry-run
|
| 8 |
+
# ./scripts/run_domain_expanded_pretrain.sh --target-shards -1 --download-workers 16
|
| 9 |
+
#
|
| 10 |
+
# Behavior:
|
| 11 |
+
# - counts currently cached parquet shards in ~/.cache/autoresearch/data
|
| 12 |
+
# - optionally expands shard coverage toward a target via prepare.py
|
| 13 |
+
# - skips prepare.py entirely when target coverage is already satisfied
|
| 14 |
+
# - exports WSL CUDA library paths and long-run HYDRA_* env vars
|
| 15 |
+
# - prefers an existing latest/pretrain checkpoint path if one is present
|
| 16 |
+
# - streams stdout/stderr to a stable repo log: run_domain_expanded.log
|
| 17 |
+
set -euo pipefail
|
| 18 |
+
|
| 19 |
+
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
| 20 |
+
cd "$REPO_ROOT"
|
| 21 |
+
|
| 22 |
+
CACHE_ROOT="${HYDRA_CACHE_ROOT:-$HOME/.cache/autoresearch}"
|
| 23 |
+
DATA_DIR="${HYDRA_DATA_DIR:-$CACHE_ROOT/data}"
|
| 24 |
+
CKPT_DIR="${HYDRA_CKPT_DIR:-$CACHE_ROOT/ckpts}"
|
| 25 |
+
LOG_FILE="${HYDRA_DOMAIN_EXPANDED_LOG:-$REPO_ROOT/run_domain_expanded.log}"
|
| 26 |
+
DEFAULT_TARGET_SHARDS="2048"
|
| 27 |
+
TARGET_SHARDS="${HYDRA_TARGET_SHARDS:-$DEFAULT_TARGET_SHARDS}"
|
| 28 |
+
DOWNLOAD_WORKERS="${HYDRA_DOWNLOAD_WORKERS:-8}"
|
| 29 |
+
DRY_RUN=0
|
| 30 |
+
SKIP_TRAIN=0
|
| 31 |
+
FORCE_PREPARE=0
|
| 32 |
+
NO_RESUME=0
|
| 33 |
+
EXPLICIT_RESUME_PATH="${HYDRA_RESUME_PATH:-}"
|
| 34 |
+
|
| 35 |
+
usage() {
|
| 36 |
+
sed -n '2,16p' "$0"
|
| 37 |
+
cat <<'EOF'
|
| 38 |
+
|
| 39 |
+
Options:
|
| 40 |
+
--target-shards N Target number of train shards to have locally (-1 = all)
|
| 41 |
+
--download-workers N Parallel workers for prepare.py downloads
|
| 42 |
+
--resume PATH Override auto-detected checkpoint path
|
| 43 |
+
--no-resume Ignore existing checkpoints
|
| 44 |
+
--skip-train Only ensure shard coverage, do not launch train.py
|
| 45 |
+
--force-prepare Run prepare.py even if target coverage is already satisfied
|
| 46 |
+
--dry-run Print planned actions without running prepare.py/train.py
|
| 47 |
+
-h, --help Show this help
|
| 48 |
+
EOF
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
while [[ $# -gt 0 ]]; do
|
| 52 |
+
case "$1" in
|
| 53 |
+
--target-shards)
|
| 54 |
+
TARGET_SHARDS="$2"
|
| 55 |
+
shift 2
|
| 56 |
+
;;
|
| 57 |
+
--download-workers)
|
| 58 |
+
DOWNLOAD_WORKERS="$2"
|
| 59 |
+
shift 2
|
| 60 |
+
;;
|
| 61 |
+
--resume)
|
| 62 |
+
EXPLICIT_RESUME_PATH="$2"
|
| 63 |
+
shift 2
|
| 64 |
+
;;
|
| 65 |
+
--no-resume)
|
| 66 |
+
NO_RESUME=1
|
| 67 |
+
shift
|
| 68 |
+
;;
|
| 69 |
+
--skip-train)
|
| 70 |
+
SKIP_TRAIN=1
|
| 71 |
+
shift
|
| 72 |
+
;;
|
| 73 |
+
--force-prepare)
|
| 74 |
+
FORCE_PREPARE=1
|
| 75 |
+
shift
|
| 76 |
+
;;
|
| 77 |
+
--dry-run)
|
| 78 |
+
DRY_RUN=1
|
| 79 |
+
shift
|
| 80 |
+
;;
|
| 81 |
+
-h|--help)
|
| 82 |
+
usage
|
| 83 |
+
exit 0
|
| 84 |
+
;;
|
| 85 |
+
*)
|
| 86 |
+
echo "Unknown option: $1" >&2
|
| 87 |
+
usage >&2
|
| 88 |
+
exit 2
|
| 89 |
+
;;
|
| 90 |
+
esac
|
| 91 |
+
done
|
| 92 |
+
|
| 93 |
+
if ! [[ "$TARGET_SHARDS" =~ ^-?[0-9]+$ ]]; then
|
| 94 |
+
echo "Invalid --target-shards: $TARGET_SHARDS" >&2
|
| 95 |
+
exit 2
|
| 96 |
+
fi
|
| 97 |
+
if ! [[ "$DOWNLOAD_WORKERS" =~ ^[0-9]+$ ]] || [[ "$DOWNLOAD_WORKERS" -lt 1 ]]; then
|
| 98 |
+
echo "Invalid --download-workers: $DOWNLOAD_WORKERS" >&2
|
| 99 |
+
exit 2
|
| 100 |
+
fi
|
| 101 |
+
|
| 102 |
+
python_has_deps() {
|
| 103 |
+
local py="$1"
|
| 104 |
+
"$py" - <<'PY' >/dev/null 2>&1
|
| 105 |
+
import requests, pyarrow, rustbpe, torch
|
| 106 |
+
PY
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
if [[ -x "$REPO_ROOT/.venv/bin/python" ]] && python_has_deps "$REPO_ROOT/.venv/bin/python"; then
|
| 110 |
+
PYTHON_CMD=("$REPO_ROOT/.venv/bin/python")
|
| 111 |
+
elif command -v uv >/dev/null 2>&1; then
|
| 112 |
+
PYTHON_CMD=(uv run python)
|
| 113 |
+
elif command -v python3 >/dev/null 2>&1 && python_has_deps "$(command -v python3)"; then
|
| 114 |
+
PYTHON_CMD=(python3)
|
| 115 |
+
else
|
| 116 |
+
echo "No usable Python interpreter found with required deps (.venv/bin/python, uv run python, or python3)." >&2
|
| 117 |
+
exit 1
|
| 118 |
+
fi
|
| 119 |
+
|
| 120 |
+
count_train_shards() {
  # Print how many train shards are cached directly under $DATA_DIR.
  # shard_06542.parquet is excluded from the count (the launcher checks for
  # that file separately when computing HAS_VAL). Prints 0 when the data
  # directory does not exist. Output is raw `wc -l`, so callers strip padding.
  if [[ -d "$DATA_DIR" ]]; then
    find "$DATA_DIR" -maxdepth 1 -type f -name 'shard_*.parquet' ! -name 'shard_06542.parquet' | wc -l
  else
    echo 0
  fi
}
|
| 127 |
+
|
| 128 |
+
count_total_shards() {
  # Print how many shard_*.parquet files are cached directly under $DATA_DIR,
  # with no exclusions. Prints 0 when the data directory does not exist.
  # Output is raw `wc -l`, so callers strip padding.
  if [[ -d "$DATA_DIR" ]]; then
    find "$DATA_DIR" -maxdepth 1 -type f -name 'shard_*.parquet' | wc -l
  else
    echo 0
  fi
}
|
| 135 |
+
|
| 136 |
+
resolve_resume_path() {
|
| 137 |
+
if [[ "$NO_RESUME" -eq 1 ]]; then
|
| 138 |
+
return 0
|
| 139 |
+
fi
|
| 140 |
+
if [[ -n "$EXPLICIT_RESUME_PATH" ]]; then
|
| 141 |
+
local expanded
|
| 142 |
+
expanded="${EXPLICIT_RESUME_PATH/#\~/$HOME}"
|
| 143 |
+
if [[ -f "$expanded" ]]; then
|
| 144 |
+
printf '%s\n' "$expanded"
|
| 145 |
+
return 0
|
| 146 |
+
fi
|
| 147 |
+
echo "Requested resume checkpoint not found: $expanded" >&2
|
| 148 |
+
exit 1
|
| 149 |
+
fi
|
| 150 |
+
|
| 151 |
+
local candidates=(
|
| 152 |
+
"$CKPT_DIR/latest.pt"
|
| 153 |
+
"$CKPT_DIR/pretrain_latest.pt"
|
| 154 |
+
"$CKPT_DIR/pretrain_final.pt"
|
| 155 |
+
"$CACHE_ROOT/latest.pt"
|
| 156 |
+
"$CACHE_ROOT/pretrain_latest.pt"
|
| 157 |
+
"$CACHE_ROOT/pretrain_final.pt"
|
| 158 |
+
"$REPO_ROOT/latest.pt"
|
| 159 |
+
"$REPO_ROOT/pretrain_final.pt"
|
| 160 |
+
)
|
| 161 |
+
local candidate
|
| 162 |
+
for candidate in "${candidates[@]}"; do
|
| 163 |
+
if [[ -f "$candidate" ]]; then
|
| 164 |
+
printf '%s\n' "$candidate"
|
| 165 |
+
return 0
|
| 166 |
+
fi
|
| 167 |
+
done
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
CURRENT_TRAIN_SHARDS="$(count_train_shards | tr -d ' ')"
|
| 171 |
+
CURRENT_TOTAL_SHARDS="$(count_total_shards | tr -d ' ')"
|
| 172 |
+
HAS_VAL=0
|
| 173 |
+
if [[ -f "$DATA_DIR/shard_06542.parquet" ]]; then
|
| 174 |
+
HAS_VAL=1
|
| 175 |
+
fi
|
| 176 |
+
|
| 177 |
+
PREPARE_NUM_SHARDS="$TARGET_SHARDS"
|
| 178 |
+
if [[ "$TARGET_SHARDS" -eq -1 ]]; then
|
| 179 |
+
TARGET_DESC="all available train shards"
|
| 180 |
+
NEED_PREPARE=1
|
| 181 |
+
elif [[ "$CURRENT_TRAIN_SHARDS" -ge "$TARGET_SHARDS" ]]; then
|
| 182 |
+
TARGET_DESC="$TARGET_SHARDS"
|
| 183 |
+
NEED_PREPARE="$FORCE_PREPARE"
|
| 184 |
+
else
|
| 185 |
+
TARGET_DESC="$TARGET_SHARDS"
|
| 186 |
+
NEED_PREPARE=1
|
| 187 |
+
fi
|
| 188 |
+
|
| 189 |
+
# Do not append `|| true` here: resolve_resume_path exits non-zero only when an
# explicitly requested checkpoint (--resume / HYDRA_RESUME_PATH) is missing,
# and that must abort the launcher (via set -e) instead of silently starting
# a fresh run. Every other path through the function returns 0.
RESUME_PATH="$(resolve_resume_path)"
|
| 190 |
+
|
| 191 |
+
export LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
|
| 192 |
+
export HYDRA_TIME_BUDGET="${HYDRA_TIME_BUDGET:-28800}"
|
| 193 |
+
export HYDRA_TARGET_SHARDS="$TARGET_SHARDS"
|
| 194 |
+
export HYDRA_DOWNLOAD_WORKERS="$DOWNLOAD_WORKERS"
|
| 195 |
+
export HYDRA_DOMAIN_EXPANDED_LOG="$LOG_FILE"
|
| 196 |
+
export HYDRA_CKPT_INTERVAL="${HYDRA_CKPT_INTERVAL:-2000}"
|
| 197 |
+
export HYDRA_CHECKPOINT_INTERVAL="${HYDRA_CHECKPOINT_INTERVAL:-$HYDRA_CKPT_INTERVAL}"
|
| 198 |
+
if [[ -n "$RESUME_PATH" ]]; then
|
| 199 |
+
export HYDRA_RESUME_PATH="$RESUME_PATH"
|
| 200 |
+
export HYDRA_RESUME_CKPT="$RESUME_PATH"
|
| 201 |
+
fi
|
| 202 |
+
|
| 203 |
+
mkdir -p "$(dirname "$LOG_FILE")"
|
| 204 |
+
|
| 205 |
+
ts() { date '+%Y-%m-%d %H:%M:%S'; }
|
| 206 |
+
log() {
|
| 207 |
+
local line="[$(ts)] $*"
|
| 208 |
+
echo "$line"
|
| 209 |
+
echo "$line" >> "$LOG_FILE"
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
log "=== domain-expanded pretrain launcher ==="
|
| 213 |
+
log "repo_root=$REPO_ROOT"
|
| 214 |
+
log "data_dir=$DATA_DIR train_shards=$CURRENT_TRAIN_SHARDS total_shards=$CURRENT_TOTAL_SHARDS has_val=$HAS_VAL"
|
| 215 |
+
log "target_train_shards=$TARGET_DESC download_workers=$DOWNLOAD_WORKERS"
|
| 216 |
+
log "log_file=$LOG_FILE"
|
| 217 |
+
log "python=${PYTHON_CMD[*]}"
|
| 218 |
+
log "HYDRA_TIME_BUDGET=$HYDRA_TIME_BUDGET"
|
| 219 |
+
log "HYDRA_CKPT_INTERVAL=$HYDRA_CKPT_INTERVAL"
|
| 220 |
+
if [[ -n "$RESUME_PATH" ]]; then
|
| 221 |
+
log "resume_checkpoint=$RESUME_PATH"
|
| 222 |
+
else
|
| 223 |
+
log "resume_checkpoint=<none found>"
|
| 224 |
+
fi
|
| 225 |
+
log "note=train.py consumes HYDRA_RESUME_CKPT and HYDRA_CKPT_INTERVAL env vars; launcher exports them automatically"
|
| 226 |
+
|
| 227 |
+
if [[ "${HYDRA_USE_NEMOTRON:-0}" == "1" ]]; then
|
| 228 |
+
# Streaming Nemotron path (Super3 recipe) pulls tokens directly from HF at
|
| 229 |
+
# train-time via prepare_nemotron.make_dataloader. The disk-shard prepare.py
|
| 230 |
+
# download phase is redundant in this mode and wastes 20-30 min of paid GPU
|
| 231 |
+
# time on shard parquet transfers we'll never read.
|
| 232 |
+
log "prepare_action=skip reason=HYDRA_USE_NEMOTRON=1 (streaming at train-time)"
|
| 233 |
+
elif [[ "$NEED_PREPARE" -eq 1 ]]; then
|
| 234 |
+
PREPARE_CMD=("${PYTHON_CMD[@]}" prepare.py --num-shards "$PREPARE_NUM_SHARDS" --download-workers "$DOWNLOAD_WORKERS")
|
| 235 |
+
log "prepare_action=run command=${PREPARE_CMD[*]}"
|
| 236 |
+
if [[ "$DRY_RUN" -eq 0 ]]; then
|
| 237 |
+
"${PREPARE_CMD[@]}" 2>&1 | tee -a "$LOG_FILE"
|
| 238 |
+
CURRENT_TRAIN_SHARDS="$(count_train_shards | tr -d ' ')"
|
| 239 |
+
CURRENT_TOTAL_SHARDS="$(count_total_shards | tr -d ' ')"
|
| 240 |
+
log "post_prepare train_shards=$CURRENT_TRAIN_SHARDS total_shards=$CURRENT_TOTAL_SHARDS"
|
| 241 |
+
fi
|
| 242 |
+
else
|
| 243 |
+
log "prepare_action=skip reason=target_already_satisfied"
|
| 244 |
+
fi
|
| 245 |
+
|
| 246 |
+
TRAIN_CMD=("${PYTHON_CMD[@]}" -u train.py)
|
| 247 |
+
if [[ "$SKIP_TRAIN" -eq 1 ]]; then
|
| 248 |
+
log "train_action=skip reason=--skip-train"
|
| 249 |
+
exit 0
|
| 250 |
+
fi
|
| 251 |
+
|
| 252 |
+
log "train_action=launch command=${TRAIN_CMD[*]}"
|
| 253 |
+
if [[ "$DRY_RUN" -eq 1 ]]; then
|
| 254 |
+
exit 0
|
| 255 |
+
fi
|
| 256 |
+
|
| 257 |
+
set +e
|
| 258 |
+
"${TRAIN_CMD[@]}" 2>&1 | tee -a "$LOG_FILE"
|
| 259 |
+
EXIT_CODE=${PIPESTATUS[0]}
|
| 260 |
+
set -e
|
| 261 |
+
log "train_exit_code=$EXIT_CODE"
|
| 262 |
+
exit "$EXIT_CODE"
|
overlay/scripts/run_meta.sh
CHANGED
|
@@ -1,13 +1,13 @@
|
|
| 1 |
-
#!/usr/bin/env bash
|
| 2 |
-
set -euo pipefail
|
| 3 |
-
|
| 4 |
-
echo "=== HYDRA Meta-Agent ==="
|
| 5 |
-
cd "$(dirname "$0")/.."
|
| 6 |
-
|
| 7 |
-
echo "Running meta-agent iteration..."
|
| 8 |
-
uv run python -c "
|
| 9 |
-
from harness.meta_agent import run_meta_iteration
|
| 10 |
-
import json
|
| 11 |
-
result = run_meta_iteration()
|
| 12 |
-
print(json.dumps(result, indent=2))
|
| 13 |
-
"
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
|
| 4 |
+
echo "=== HYDRA Meta-Agent ==="
|
| 5 |
+
cd "$(dirname "$0")/.."
|
| 6 |
+
|
| 7 |
+
echo "Running meta-agent iteration..."
|
| 8 |
+
uv run python -c "
|
| 9 |
+
from harness.meta_agent import run_meta_iteration
|
| 10 |
+
import json
|
| 11 |
+
result = run_meta_iteration()
|
| 12 |
+
print(json.dumps(result, indent=2))
|
| 13 |
+
"
|
overlay/scripts/run_phase1.sh
CHANGED
|
@@ -1,32 +1,32 @@
|
|
| 1 |
-
#!/usr/bin/env bash
|
| 2 |
-
set -euo pipefail
|
| 3 |
-
|
| 4 |
-
echo "=== HYDRA Phase 1: Sequential Subsystem Bring-Up ==="
|
| 5 |
-
cd "$(dirname "$0")/.."
|
| 6 |
-
|
| 7 |
-
SUBSYSTEMS=("mamba3" "mhc" "engram" "hestia" "sdr")
|
| 8 |
-
|
| 9 |
-
for sub in "${SUBSYSTEMS[@]}"; do
|
| 10 |
-
echo ""
|
| 11 |
-
echo "--- Subsystem: ${sub} ---"
|
| 12 |
-
BRANCH="autoresearch/phase1-${sub}"
|
| 13 |
-
|
| 14 |
-
# Create branch if it doesn't exist
|
| 15 |
-
if ! git rev-parse --verify "${BRANCH}" &>/dev/null; then
|
| 16 |
-
git checkout -b "${BRANCH}"
|
| 17 |
-
else
|
| 18 |
-
git checkout "${BRANCH}"
|
| 19 |
-
fi
|
| 20 |
-
|
| 21 |
-
echo "Running: uv run subsystems/train_${sub}.py"
|
| 22 |
-
uv run "subsystems/train_${sub}.py" > "run_${sub}.log" 2>&1 || true
|
| 23 |
-
|
| 24 |
-
# Extract result
|
| 25 |
-
echo "Result:"
|
| 26 |
-
grep "^val_bpb:" "run_${sub}.log" || echo " (crashed)"
|
| 27 |
-
grep "^peak_vram_mb:" "run_${sub}.log" || true
|
| 28 |
-
done
|
| 29 |
-
|
| 30 |
-
echo ""
|
| 31 |
-
echo "=== Phase 1 complete ==="
|
| 32 |
-
git checkout main 2>/dev/null || git checkout master
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
|
| 4 |
+
echo "=== HYDRA Phase 1: Sequential Subsystem Bring-Up ==="
|
| 5 |
+
cd "$(dirname "$0")/.."
|
| 6 |
+
|
| 7 |
+
SUBSYSTEMS=("mamba3" "mhc" "engram" "hestia" "sdr")
|
| 8 |
+
|
| 9 |
+
for sub in "${SUBSYSTEMS[@]}"; do
|
| 10 |
+
echo ""
|
| 11 |
+
echo "--- Subsystem: ${sub} ---"
|
| 12 |
+
BRANCH="autoresearch/phase1-${sub}"
|
| 13 |
+
|
| 14 |
+
# Create branch if it doesn't exist
|
| 15 |
+
if ! git rev-parse --verify "${BRANCH}" &>/dev/null; then
|
| 16 |
+
git checkout -b "${BRANCH}"
|
| 17 |
+
else
|
| 18 |
+
git checkout "${BRANCH}"
|
| 19 |
+
fi
|
| 20 |
+
|
| 21 |
+
echo "Running: uv run subsystems/train_${sub}.py"
|
| 22 |
+
uv run "subsystems/train_${sub}.py" > "run_${sub}.log" 2>&1 || true
|
| 23 |
+
|
| 24 |
+
# Extract result
|
| 25 |
+
echo "Result:"
|
| 26 |
+
grep "^val_bpb:" "run_${sub}.log" || echo " (crashed)"
|
| 27 |
+
grep "^peak_vram_mb:" "run_${sub}.log" || true
|
| 28 |
+
done
|
| 29 |
+
|
| 30 |
+
echo ""
|
| 31 |
+
echo "=== Phase 1 complete ==="
|
| 32 |
+
git checkout main 2>/dev/null || git checkout master
|
overlay/scripts/run_phase2.sh
CHANGED
|
@@ -1,25 +1,25 @@
|
|
| 1 |
-
#!/usr/bin/env bash
|
| 2 |
-
set -euo pipefail
|
| 3 |
-
|
| 4 |
-
echo "=== HYDRA Phase 2: Integrated Autoresearch ==="
|
| 5 |
-
cd "$(dirname "$0")/.."
|
| 6 |
-
|
| 7 |
-
TAG="${1:-$(date +%b%d | tr '[:upper:]' '[:lower:]')}"
|
| 8 |
-
|
| 9 |
-
# Validate tag: only alphanumeric, hyphens, underscores, dots
|
| 10 |
-
if [[ ! "${TAG}" =~ ^[a-zA-Z0-9._-]+$ ]]; then
|
| 11 |
-
echo "Error: invalid tag '${TAG}'. Use only alphanumeric, hyphens, underscores, dots." >&2
|
| 12 |
-
exit 1
|
| 13 |
-
fi
|
| 14 |
-
|
| 15 |
-
BRANCH="autoresearch/${TAG}"
|
| 16 |
-
|
| 17 |
-
if ! git rev-parse --verify "${BRANCH}" &>/dev/null; then
|
| 18 |
-
git checkout -b -- "${BRANCH}"
|
| 19 |
-
else
|
| 20 |
-
git checkout -- "${BRANCH}"
|
| 21 |
-
fi
|
| 22 |
-
|
| 23 |
-
echo "Branch: ${BRANCH}"
|
| 24 |
-
echo "Starting orchestrator..."
|
| 25 |
-
uv run -m harness.orchestrator
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
|
| 4 |
+
echo "=== HYDRA Phase 2: Integrated Autoresearch ==="
|
| 5 |
+
cd "$(dirname "$0")/.."
|
| 6 |
+
|
| 7 |
+
TAG="${1:-$(date +%b%d | tr '[:upper:]' '[:lower:]')}"
|
| 8 |
+
|
| 9 |
+
# Validate tag: only alphanumeric, hyphens, underscores, dots
|
| 10 |
+
if [[ ! "${TAG}" =~ ^[a-zA-Z0-9._-]+$ ]]; then
|
| 11 |
+
echo "Error: invalid tag '${TAG}'. Use only alphanumeric, hyphens, underscores, dots." >&2
|
| 12 |
+
exit 1
|
| 13 |
+
fi
|
| 14 |
+
|
| 15 |
+
BRANCH="autoresearch/${TAG}"
|
| 16 |
+
|
| 17 |
+
# Switch to the run branch, creating it on first use.
# No `--` separator: after `-b` git takes the next argument as the new branch
# name (so `--` itself would be the name), and `git checkout -- X` treats X as
# a pathspec rather than a branch. BRANCH always starts with "autoresearch/",
# so it can never be mistaken for an option.
if ! git rev-parse --verify "${BRANCH}" &>/dev/null; then
  git checkout -b "${BRANCH}"
else
  git checkout "${BRANCH}"
fi
|
| 22 |
+
|
| 23 |
+
echo "Branch: ${BRANCH}"
|
| 24 |
+
echo "Starting orchestrator..."
|
| 25 |
+
uv run -m harness.orchestrator
|
overlay/scripts/run_tps_gate.sh
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
|
| 4 |
+
# Run a reproducible throughput gate.
|
| 5 |
+
# Default gate: 50k TPS steady-state.
|
| 6 |
+
#
|
| 7 |
+
# Usage:
|
| 8 |
+
# bash scripts/run_tps_gate.sh [config] [seconds] [min_tps]
|
| 9 |
+
# Example:
|
| 10 |
+
# bash scripts/run_tps_gate.sh baseline 300 50000
|
| 11 |
+
|
| 12 |
+
CONFIG="${1:-baseline}"
|
| 13 |
+
SECONDS_BUDGET="${2:-300}"
|
| 14 |
+
MIN_TPS="${3:-50000}"
|
| 15 |
+
|
| 16 |
+
echo "[tps-gate] config=$CONFIG seconds=$SECONDS_BUDGET min_tps=$MIN_TPS"
|
| 17 |
+
|
| 18 |
+
python scripts/benchmark_hyena_stack.py \
|
| 19 |
+
--config "$CONFIG" \
|
| 20 |
+
--time "$SECONDS_BUDGET" \
|
| 21 |
+
--min-tps "$MIN_TPS"
|
| 22 |
+
|
| 23 |
+
echo "[tps-gate] PASS"
|
overlay/scripts/setup.sh
CHANGED
|
@@ -1,27 +1,28 @@
|
|
| 1 |
-
#!/usr/bin/env bash
|
| 2 |
-
set -euo pipefail
|
| 3 |
-
|
| 4 |
-
echo "=== HYDRA Setup ==="
|
| 5 |
-
echo ""
|
| 6 |
-
|
| 7 |
-
# Check uv
|
| 8 |
-
if ! command -v uv &>/dev/null; then
|
| 9 |
-
echo "Installing uv..."
|
| 10 |
-
curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 11 |
-
fi
|
| 12 |
-
|
| 13 |
-
# Install Python dependencies
|
| 14 |
-
echo "Installing Python dependencies..."
|
| 15 |
-
cd "$(dirname "$0")/.."
|
| 16 |
-
uv sync
|
| 17 |
-
|
| 18 |
-
# Prepare data (download shards + train tokenizer)
|
| 19 |
-
echo ""
|
| 20 |
-
echo "Preparing data (this may take a few minutes on first run)..."
|
| 21 |
-
uv run prepare.py --num-shards 10
|
| 22 |
-
|
| 23 |
-
echo ""
|
| 24 |
-
echo "=== Setup complete ==="
|
| 25 |
-
echo "Run experiments with: uv run train.py"
|
| 26 |
-
echo "Run orchestrator with: uv run -m harness.orchestrator"
|
| 27 |
-
echo "Run Phase 1 subsystems with: bash scripts/run_phase1.sh"
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
|
| 4 |
+
echo "=== HYDRA Setup ==="
|
| 5 |
+
echo ""
|
| 6 |
+
|
| 7 |
+
# Check uv
|
| 8 |
+
if ! command -v uv &>/dev/null; then
|
| 9 |
+
echo "Installing uv..."
|
| 10 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 11 |
+
fi
|
| 12 |
+
|
| 13 |
+
# Install Python dependencies
|
| 14 |
+
echo "Installing Python dependencies..."
|
| 15 |
+
cd "$(dirname "$0")/.."
|
| 16 |
+
uv sync
|
| 17 |
+
|
| 18 |
+
# Prepare data (download shards + train tokenizer)
|
| 19 |
+
echo ""
|
| 20 |
+
echo "Preparing data (this may take a few minutes on first run)..."
|
| 21 |
+
uv run prepare.py --num-shards 10
|
| 22 |
+
|
| 23 |
+
echo ""
|
| 24 |
+
echo "=== Setup complete ==="
|
| 25 |
+
echo "Run experiments with: uv run train.py"
|
| 26 |
+
echo "Run orchestrator with: uv run -m harness.orchestrator"
|
| 27 |
+
echo "Run Phase 1 subsystems with: bash scripts/run_phase1.sh"
|
| 28 |
+
echo "For WSL/CUDA throughput gate: see docs/WSL_TPS_RUNBOOK.md"
|
overlay/scripts/strip_optimizer_state.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Strip optimizer_state_dict from a checkpoint, keeping only model weights
and config metadata.

Reason: resuming training.py's standard path restores the optimizer state,
which (in our 6GB / Muon-compile / bf16 setup) reproducibly produces a
NaN/>100-loss on the first forward after load. Reloading model weights
only and letting the optimizer initialize fresh sidesteps the issue.

Output checkpoint also resets `step`, `train_seconds`, `smoothed_loss`,
`bpt_ema` and `epoch` so the LR schedule and warmup restart from zero —
useful when we want to fine-tune the trained weights at a new schedule
length.

Usage:
    python strip_optimizer_state.py SRC_CKPT DST_CKPT
"""
import sys

import torch


def strip_checkpoint(ckpt):
    """Return a new checkpoint dict holding only weights, config and reset counters.

    If ``ckpt`` has no ``model_state_dict`` key, ``ckpt`` itself is used as the
    model state dict (same fallback as the original script). The optimizer
    state is deliberately not copied.
    """
    return {
        "model_state_dict": ckpt.get("model_state_dict", ckpt),
        "config": ckpt.get("config"),
        # Reset training progress markers so the LR schedule warms up cleanly.
        "step": 0,
        "train_seconds": 0.0,
        "smoothed_loss": 0.0,
        "bpt_ema": 0.0,
        "epoch": 0,
    }


def main(argv):
    """Load ``argv[1]``, strip it, and save the result to ``argv[2]``.

    Returns a process exit code: 0 on success, 2 when arguments are missing.
    Extra arguments are ignored, as before.
    """
    if len(argv) < 3:
        print(f"usage: {argv[0]} SRC_CKPT DST_CKPT", file=sys.stderr)
        return 2
    src, dst = argv[1], argv[2]
    # NOTE(security): weights_only=False unpickles arbitrary objects, so only
    # run this on checkpoints from a trusted source.
    ckpt = torch.load(src, map_location="cpu", weights_only=False)
    keep = strip_checkpoint(ckpt)
    torch.save(keep, dst)
    print(f"Stripped -> {dst} (orig {len(ckpt)} keys, kept {len(keep)})")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
|
overlay/scripts/sweep_depth_aggregate.py
CHANGED
|
@@ -11,16 +11,56 @@ Usage:
|
|
| 11 |
"""
|
| 12 |
from __future__ import annotations
|
| 13 |
|
| 14 |
-
import json
|
| 15 |
-
import os
|
| 16 |
-
import
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
try:
|
| 25 |
from huggingface_hub import HfApi # type: ignore
|
| 26 |
except Exception as e:
|
|
@@ -33,41 +73,73 @@ def fetch_metrics_from_job(job_id: str) -> dict | None:
|
|
| 33 |
print(f'[agg] could not fetch logs for job={job_id}: {e}', file=sys.stderr)
|
| 34 |
return None
|
| 35 |
|
| 36 |
-
last_json = None
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
hdr = ['metric'] + [f'L={n}' for n in sorted_n]
|
| 60 |
print(' '.join(f'{h:>14}' for h in hdr))
|
| 61 |
-
for key in ('val_bpb', 'val_ppl', 'num_params_M', 'total_tokens_M',
|
| 62 |
-
'training_seconds', 'peak_vram_mb', 'sdr_target_active',
|
| 63 |
-
'htm_anomaly', 'engram_hit_rate', 'sdr_active_bits'
|
| 64 |
-
|
| 65 |
-
|
|
|
|
| 66 |
|
| 67 |
# Per-layer panel — one table per metric.
|
| 68 |
print('\n=== Per-layer: delta_ratio (residual contribution) ===')
|
| 69 |
print(' '.join(['layer'] + [f'L={n:>2}' for n in sorted_n]))
|
| 70 |
-
max_depth = max(results[n]
|
| 71 |
for li in range(max_depth):
|
| 72 |
row = [f'L{li:02d}']
|
| 73 |
for n in sorted_n:
|
|
@@ -104,16 +176,40 @@ def compare(results: dict[int, dict]) -> None:
|
|
| 104 |
|
| 105 |
# Dead-layer detection
|
| 106 |
print('\n=== Dead-layer detection (delta_ratio < 0.02) ===')
|
| 107 |
-
for n in sorted_n:
|
| 108 |
-
r = results[n]
|
| 109 |
-
n_layer =
|
| 110 |
dead = []
|
| 111 |
for li in range(n_layer):
|
| 112 |
v = r.get(f'layer_{li}_delta_ratio')
|
| 113 |
if isinstance(v, (int, float)) and v < 0.02:
|
| 114 |
dead.append(li)
|
| 115 |
-
status = 'ALL LIVE' if not dead else f'DEAD LAYERS: {dead}'
|
| 116 |
-
print(f' n_layer={n:2d} val_bpb={r.get("val_bpb", float("nan")):.4f} {status}')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
|
| 119 |
def main() -> int:
|
|
@@ -134,7 +230,7 @@ def main() -> int:
|
|
| 134 |
jobs[n_layer] = job_id
|
| 135 |
|
| 136 |
print(f'[agg] reading {len(jobs)} jobs from {MANIFEST}')
|
| 137 |
-
results: dict[int,
|
| 138 |
for n, jid in jobs.items():
|
| 139 |
print(f'[agg] fetching job={jid} (n_layer={n}) ...')
|
| 140 |
m = fetch_metrics_from_job(jid)
|
|
|
|
| 11 |
"""
|
| 12 |
from __future__ import annotations
|
| 13 |
|
| 14 |
+
import json
|
| 15 |
+
import os
|
| 16 |
+
import statistics
|
| 17 |
+
import re
|
| 18 |
+
import sys
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
|
| 21 |
+
from configs.harness_config import HarnessConfig
|
| 22 |
+
|
| 23 |
+
type MetricValue = float | int | str | bool | None
|
| 24 |
+
type MetricsDict = dict[str, MetricValue]
|
| 25 |
+
|
| 26 |
+
MANIFEST = Path(sys.argv[1] if len(sys.argv) > 1 else '/tmp/sweep_depth_manifest.txt')
|
| 27 |
+
STEP_TPS_PATTERN = re.compile(r"step=(\d+).*?\btps=(\d+)\b")
|
| 28 |
+
MIN_TPS = float(os.environ.get('SWEEP_MIN_TPS', '0'))
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _zero_shot_score(result: MetricsDict) -> float:
|
| 32 |
+
"""Composite quality score for tie-breaking among BPB-near runs."""
|
| 33 |
+
factual = float(result.get('factual_english_score', 0.0) or 0.0)
|
| 34 |
+
instruction = float(result.get('instruction_following_score', 0.0) or 0.0)
|
| 35 |
+
distinct_2 = float(result.get('distinct_2', 0.0) or 0.0)
|
| 36 |
+
repetition = float(result.get('repetition_rate', 0.0) or 0.0)
|
| 37 |
+
return factual + instruction + distinct_2 - repetition
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _metric_float(result: MetricsDict, key: str, default: float = 0.0) -> float:
|
| 41 |
+
value = result.get(key, default)
|
| 42 |
+
return float(value) if isinstance(value, (int, float)) else default
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _metric_int(result: MetricsDict, key: str, default: int = 0) -> int:
|
| 46 |
+
value = result.get(key, default)
|
| 47 |
+
return int(value) if isinstance(value, int) else default
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _percentile_linear(sorted_values: list[float], pct: float) -> float:
|
| 51 |
+
if not sorted_values:
|
| 52 |
+
return 0.0
|
| 53 |
+
if len(sorted_values) == 1:
|
| 54 |
+
return sorted_values[0]
|
| 55 |
+
rank = (len(sorted_values) - 1) * (pct / 100.0)
|
| 56 |
+
lo = int(rank)
|
| 57 |
+
hi = min(lo + 1, len(sorted_values) - 1)
|
| 58 |
+
frac = rank - lo
|
| 59 |
+
return sorted_values[lo] * (1.0 - frac) + sorted_values[hi] * frac
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def fetch_metrics_from_job(job_id: str) -> MetricsDict | None:
|
| 63 |
+
"""Fetch HF Job stdout and parse the [METRICS_JSON] line."""
|
| 64 |
try:
|
| 65 |
from huggingface_hub import HfApi # type: ignore
|
| 66 |
except Exception as e:
|
|
|
|
| 73 |
print(f'[agg] could not fetch logs for job={job_id}: {e}', file=sys.stderr)
|
| 74 |
return None
|
| 75 |
|
| 76 |
+
last_json = None
|
| 77 |
+
tps_samples: list[tuple[int, int]] = []
|
| 78 |
+
warmup_steps = 25
|
| 79 |
+
for line in logs_stream:
|
| 80 |
+
# HfApi returns strings or JobLogEntry-like objects depending on version.
|
| 81 |
+
text = getattr(line, 'data', None) or str(line)
|
| 82 |
+
|
| 83 |
+
wm = re.search(r"\[TPS_GUARD\] enabled .*?warmup_steps=(\d+)", text)
|
| 84 |
+
if wm:
|
| 85 |
+
warmup_steps = int(wm.group(1))
|
| 86 |
+
|
| 87 |
+
sm = STEP_TPS_PATTERN.search(text)
|
| 88 |
+
if sm:
|
| 89 |
+
tps_samples.append((int(sm.group(1)), int(sm.group(2))))
|
| 90 |
+
|
| 91 |
+
if '[METRICS_JSON]' in text:
|
| 92 |
+
payload = text.split('[METRICS_JSON]', 1)[1].strip()
|
| 93 |
+
try:
|
| 94 |
+
last_json = json.loads(payload)
|
| 95 |
+
except Exception:
|
| 96 |
+
# Might be truncated on a line boundary — keep looking.
|
| 97 |
+
pass
|
| 98 |
+
if last_json is None:
|
| 99 |
+
return None
|
| 100 |
+
|
| 101 |
+
steady_tps = [float(tps) for step, tps in tps_samples if step >= warmup_steps]
|
| 102 |
+
if not steady_tps:
|
| 103 |
+
steady_tps = [float(tps) for _, tps in tps_samples]
|
| 104 |
+
if steady_tps:
|
| 105 |
+
sorted_tps = sorted(steady_tps)
|
| 106 |
+
last_json['tps_samples'] = len(steady_tps)
|
| 107 |
+
last_json['tps_median'] = float(statistics.median(steady_tps))
|
| 108 |
+
last_json['tps_p10'] = float(_percentile_linear(sorted_tps, 10.0))
|
| 109 |
+
last_json['tps_min'] = float(sorted_tps[0])
|
| 110 |
+
last_json['tps_max'] = float(sorted_tps[-1])
|
| 111 |
+
last_json['tps_warmup_steps'] = int(warmup_steps)
|
| 112 |
+
|
| 113 |
+
return last_json
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def compare(results: dict[int, MetricsDict]) -> None:
|
| 117 |
+
"""Pretty-print comparison across n_layer values."""
|
| 118 |
+
if not results:
|
| 119 |
+
print('[agg] no results')
|
| 120 |
+
return
|
| 121 |
+
sorted_n = sorted(results.keys())
|
| 122 |
+
secondary_gates = HarnessConfig().to_secondary_gates()
|
| 123 |
+
|
| 124 |
+
print('\n=== Active secondary gates ===')
|
| 125 |
+
for metric, thresholds in sorted(secondary_gates.items()):
|
| 126 |
+
print(f' {metric}: {json.dumps(thresholds, sort_keys=True)}')
|
| 127 |
+
|
| 128 |
+
# Top-level scalars
|
| 129 |
+
print('\n=== Top-level scalars ===')
|
| 130 |
hdr = ['metric'] + [f'L={n}' for n in sorted_n]
|
| 131 |
print(' '.join(f'{h:>14}' for h in hdr))
|
| 132 |
+
for key in ('val_bpb', 'val_ppl', 'num_params_M', 'total_tokens_M',
|
| 133 |
+
'training_seconds', 'peak_vram_mb', 'sdr_target_active',
|
| 134 |
+
'htm_anomaly', 'engram_hit_rate', 'sdr_active_bits',
|
| 135 |
+
'tps_median', 'tps_p10', 'tps_min', 'tps_max', 'tps_samples'):
|
| 136 |
+
row = [key] + [f'{results[n].get(key, float("nan")):.4f}' if isinstance(results[n].get(key), (int, float)) else 'n/a' for n in sorted_n]
|
| 137 |
+
print(' '.join(f'{c:>14}' for c in row))
|
| 138 |
|
| 139 |
# Per-layer panel — one table per metric.
|
| 140 |
print('\n=== Per-layer: delta_ratio (residual contribution) ===')
|
| 141 |
print(' '.join(['layer'] + [f'L={n:>2}' for n in sorted_n]))
|
| 142 |
+
max_depth = max(_metric_int(results[n], 'n_layer', 0) for n in sorted_n)
|
| 143 |
for li in range(max_depth):
|
| 144 |
row = [f'L{li:02d}']
|
| 145 |
for n in sorted_n:
|
|
|
|
| 176 |
|
| 177 |
# Dead-layer detection
|
| 178 |
print('\n=== Dead-layer detection (delta_ratio < 0.02) ===')
|
| 179 |
+
for n in sorted_n:
|
| 180 |
+
r = results[n]
|
| 181 |
+
n_layer = _metric_int(r, 'n_layer', 0)
|
| 182 |
dead = []
|
| 183 |
for li in range(n_layer):
|
| 184 |
v = r.get(f'layer_{li}_delta_ratio')
|
| 185 |
if isinstance(v, (int, float)) and v < 0.02:
|
| 186 |
dead.append(li)
|
| 187 |
+
status = 'ALL LIVE' if not dead else f'DEAD LAYERS: {dead}'
|
| 188 |
+
print(f' n_layer={n:2d} val_bpb={r.get("val_bpb", float("nan")):.4f} {status}')
|
| 189 |
+
|
| 190 |
+
print('\n=== Throughput-constrained ranking ===')
|
| 191 |
+
ranked = sorted(
|
| 192 |
+
((n, r) for n, r in results.items() if isinstance(r.get('val_bpb'), (int, float))),
|
| 193 |
+
key=lambda x: (
|
| 194 |
+
(MIN_TPS > 0) and (_metric_float(x[1], 'tps_median', 0.0) < MIN_TPS),
|
| 195 |
+
_metric_float(x[1], 'val_bpb', float('inf')),
|
| 196 |
+
-_zero_shot_score(x[1]),
|
| 197 |
+
),
|
| 198 |
+
)
|
| 199 |
+
feasible_count = 0
|
| 200 |
+
for n, r in ranked:
|
| 201 |
+
tps_median = _metric_float(r, 'tps_median', 0.0)
|
| 202 |
+
feasible = (MIN_TPS <= 0) or (tps_median >= MIN_TPS)
|
| 203 |
+
zero_shot_score = _zero_shot_score(r)
|
| 204 |
+
if feasible:
|
| 205 |
+
feasible_count += 1
|
| 206 |
+
print(
|
| 207 |
+
f" n_layer={n:2d} val_bpb={_metric_float(r, 'val_bpb', float('nan')):.4f} "
|
| 208 |
+
f"tps_median={tps_median:.0f} zero_shot_score={zero_shot_score:.4f} feasible={feasible}",
|
| 209 |
+
flush=True,
|
| 210 |
+
)
|
| 211 |
+
if MIN_TPS > 0:
|
| 212 |
+
print(f"[agg] throughput gate: tps_median >= {MIN_TPS:.0f}; feasible={feasible_count}/{len(ranked)}")
|
| 213 |
|
| 214 |
|
| 215 |
def main() -> int:
|
|
|
|
| 230 |
jobs[n_layer] = job_id
|
| 231 |
|
| 232 |
print(f'[agg] reading {len(jobs)} jobs from {MANIFEST}')
|
| 233 |
+
results: dict[int, MetricsDict] = {}
|
| 234 |
for n, jid in jobs.items():
|
| 235 |
print(f'[agg] fetching job={jid} (n_layer={n}) ...')
|
| 236 |
m = fetch_metrics_from_job(jid)
|
overlay/scripts/sweep_depth_local.sh
CHANGED
|
@@ -1,62 +1,62 @@
|
|
| 1 |
-
#!/usr/bin/env bash
|
| 2 |
-
# Local sequential depth sweep on RTX 3060.
|
| 3 |
-
# Uses real mamba_ssm Mamba3 (grafted from state-spaces/mamba main).
|
| 4 |
-
# Config: Gen 76 local champion (d_model=96, engram=4096, target_active=327),
|
| 5 |
-
# sweeping n_layer ∈ {1, 2, 3, 4}. Each run 300s (~5 min) → ~20 min total.
|
| 6 |
-
|
| 7 |
-
set -euo pipefail
|
| 8 |
-
cd "$(dirname "${BASH_SOURCE[0]}")/.."
|
| 9 |
-
|
| 10 |
-
export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
|
| 11 |
-
# WSL2: libcuda.so.1 lives at /usr/lib/wsl/lib; prepend it so cudarc finds the
|
| 12 |
-
# CUDA driver library at runtime.
|
| 13 |
-
export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:/usr/lib/wsl/lib:${LD_LIBRARY_PATH:-}
|
| 14 |
-
export PYTORCH_ALLOC_CONF=expandable_segments:True
|
| 15 |
-
|
| 16 |
-
# GPU HTM path: use non-fused step_many_cuda (fused megakernel is Hopper-only).
|
| 17 |
-
# This drops htm_await from ~20-40s/step (CPU) to ~0ms (GPU, async).
|
| 18 |
-
export HYDRA_HTM_FUSED=0
|
| 19 |
-
|
| 20 |
-
# Architecture (Gen 76 + user audit: keep target_active=327 for gradient plasticity).
|
| 21 |
-
export HYDRA_D_MODEL=96
|
| 22 |
-
export HYDRA_D_STATE=16
|
| 23 |
-
export HYDRA_HEADDIM=12
|
| 24 |
-
export HYDRA_EXPAND=3
|
| 25 |
-
export HYDRA_ENGRAM_N_COLUMNS=4096
|
| 26 |
-
export HYDRA_SDR_TARGET_ACTIVE=327
|
| 27 |
-
|
| 28 |
-
# Training knobs tuned for 6GB VRAM.
|
| 29 |
-
export HYDRA_BATCH_SIZE=1
|
| 30 |
-
export HYDRA_TOTAL_BATCH=32768 # 1 * 8 accum * 512 seq * 8 heads = Gen 76 config
|
| 31 |
-
export HYDRA_TIME_BUDGET=300 # 5 min per run
|
| 32 |
-
export HYDRA_CKPT_INTERVAL=0 # don't save ckpts during sweep
|
| 33 |
-
export HYDRA_MID_VAL_INTERVAL=250
|
| 34 |
-
|
| 35 |
-
# Full per-layer diagnostic panel.
|
| 36 |
-
export HYDRA_LAYER_DIAGNOSTICS=1
|
| 37 |
-
export HYDRA_LAYER_DIAG_SVD_EVERY=100
|
| 38 |
-
|
| 39 |
-
# Use cached shards + tokenizer + retina (vocab=8192, target_active=327).
|
| 40 |
-
# NOT streaming — already have 2049 shards from prior local runs.
|
| 41 |
-
unset HYDRA_USE_NEMOTRON
|
| 42 |
-
|
| 43 |
-
PY=/home/mikeb/work/feather/.venv/bin/python3
|
| 44 |
-
OUT_DIR=/tmp/local_sweep
|
| 45 |
-
mkdir -p "$OUT_DIR"
|
| 46 |
-
|
| 47 |
-
for N in 1 2 3 4; do
|
| 48 |
-
echo "=========================================="
|
| 49 |
-
echo "=== n_layer=$N $(date +%H:%M:%S) ==="
|
| 50 |
-
echo "=========================================="
|
| 51 |
-
export HYDRA_N_LAYER=$N
|
| 52 |
-
export HYDRA_METRICS_OUT="$OUT_DIR/sweep_n${N}_metrics.json"
|
| 53 |
-
LOG="$OUT_DIR/sweep_n${N}.log"
|
| 54 |
-
"$PY" -u train.py > "$LOG" 2>&1 || echo "[WARN] n_layer=$N run exited non-zero (see $LOG)"
|
| 55 |
-
echo "=== n_layer=$N done; metrics=$HYDRA_METRICS_OUT log=$LOG ==="
|
| 56 |
-
# Quick tail of the important lines
|
| 57 |
-
grep -E "val_bpb|LAYER_DIAG|METRICS_JSON" "$LOG" | tail -20 || true
|
| 58 |
-
done
|
| 59 |
-
|
| 60 |
-
echo ""
|
| 61 |
-
echo "=== SWEEP COMPLETE ==="
|
| 62 |
-
ls -la "$OUT_DIR"
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# Local sequential depth sweep on RTX 3060.
|
| 3 |
+
# Uses real mamba_ssm Mamba3 (grafted from state-spaces/mamba main).
|
| 4 |
+
# Config: Gen 76 local champion (d_model=96, engram=4096, target_active=327),
|
| 5 |
+
# sweeping n_layer ∈ {1, 2, 3, 4}. Each run 300s (~5 min) → ~20 min total.
|
| 6 |
+
|
| 7 |
+
set -euo pipefail
|
| 8 |
+
cd "$(dirname "${BASH_SOURCE[0]}")/.."
|
| 9 |
+
|
| 10 |
+
export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
|
| 11 |
+
# WSL2: libcuda.so.1 lives at /usr/lib/wsl/lib; prepend it so cudarc finds the
|
| 12 |
+
# CUDA driver library at runtime.
|
| 13 |
+
export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:/usr/lib/wsl/lib:${LD_LIBRARY_PATH:-}
|
| 14 |
+
export PYTORCH_ALLOC_CONF=expandable_segments:True
|
| 15 |
+
|
| 16 |
+
# GPU HTM path: use non-fused step_many_cuda (fused megakernel is Hopper-only).
|
| 17 |
+
# This drops htm_await from ~20-40s/step (CPU) to ~0ms (GPU, async).
|
| 18 |
+
export HYDRA_HTM_FUSED=0
|
| 19 |
+
|
| 20 |
+
# Architecture (Gen 76 + user audit: keep target_active=327 for gradient plasticity).
|
| 21 |
+
export HYDRA_D_MODEL=96
|
| 22 |
+
export HYDRA_D_STATE=16
|
| 23 |
+
export HYDRA_HEADDIM=12
|
| 24 |
+
export HYDRA_EXPAND=3
|
| 25 |
+
export HYDRA_ENGRAM_N_COLUMNS=4096
|
| 26 |
+
export HYDRA_SDR_TARGET_ACTIVE=327
|
| 27 |
+
|
| 28 |
+
# Training knobs tuned for 6GB VRAM.
|
| 29 |
+
export HYDRA_BATCH_SIZE=1
|
| 30 |
+
export HYDRA_TOTAL_BATCH=32768 # 1 * 8 accum * 512 seq * 8 heads = Gen 76 config
|
| 31 |
+
export HYDRA_TIME_BUDGET=300 # 5 min per run
|
| 32 |
+
export HYDRA_CKPT_INTERVAL=0 # don't save ckpts during sweep
|
| 33 |
+
export HYDRA_MID_VAL_INTERVAL=250
|
| 34 |
+
|
| 35 |
+
# Full per-layer diagnostic panel.
|
| 36 |
+
export HYDRA_LAYER_DIAGNOSTICS=1
|
| 37 |
+
export HYDRA_LAYER_DIAG_SVD_EVERY=100
|
| 38 |
+
|
| 39 |
+
# Use cached shards + tokenizer + retina (vocab=8192, target_active=327).
|
| 40 |
+
# NOT streaming — already have 2049 shards from prior local runs.
|
| 41 |
+
unset HYDRA_USE_NEMOTRON
|
| 42 |
+
|
| 43 |
+
PY=/home/mikeb/work/feather/.venv/bin/python3
|
| 44 |
+
OUT_DIR=/tmp/local_sweep
|
| 45 |
+
mkdir -p "$OUT_DIR"
|
| 46 |
+
|
| 47 |
+
for N in 1 2 3 4; do
|
| 48 |
+
echo "=========================================="
|
| 49 |
+
echo "=== n_layer=$N $(date +%H:%M:%S) ==="
|
| 50 |
+
echo "=========================================="
|
| 51 |
+
export HYDRA_N_LAYER=$N
|
| 52 |
+
export HYDRA_METRICS_OUT="$OUT_DIR/sweep_n${N}_metrics.json"
|
| 53 |
+
LOG="$OUT_DIR/sweep_n${N}.log"
|
| 54 |
+
"$PY" -u train.py > "$LOG" 2>&1 || echo "[WARN] n_layer=$N run exited non-zero (see $LOG)"
|
| 55 |
+
echo "=== n_layer=$N done; metrics=$HYDRA_METRICS_OUT log=$LOG ==="
|
| 56 |
+
# Quick tail of the important lines
|
| 57 |
+
grep -E "val_bpb|LAYER_DIAG|METRICS_JSON" "$LOG" | tail -20 || true
|
| 58 |
+
done
|
| 59 |
+
|
| 60 |
+
echo ""
|
| 61 |
+
echo "=== SWEEP COMPLETE ==="
|
| 62 |
+
ls -la "$OUT_DIR"
|
overlay/scripts/train_champion_12h.sh
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# 12-hour champion training run. Config matches the scripts/autoresearch_iter.sh base
|
| 3 |
+
# after 61 mutation experiments identified the Pareto-optimal knobs.
|
| 4 |
+
#
|
| 5 |
+
# Champion config (train_bpb ~1.6169 at 10-min budget, 29.7k tps):
|
| 6 |
+
# d_model=160, n_layer=20, B=8, seq=1024
|
| 7 |
+
# engram=16384, z_loss=0.001, no GDN (pure Mamba3 stack)
|
| 8 |
+
# TIME_BUDGET=43200s (12 hours)
|
| 9 |
+
# CKPT_INTERVAL=500 steps (~every 15 min at ~30k tokens/s)
|
| 10 |
+
#
|
| 11 |
+
# Assumes .omc/autoresearch_STOP sentinel is present (cron loop disabled).
|
| 12 |
+
# Output goes to run_champion_12h.log in repo root.
|
| 13 |
+
|
| 14 |
+
set -u
|
| 15 |
+
REPO=/home/mikeb/work/feather
|
| 16 |
+
cd "$REPO"
|
| 17 |
+
|
| 18 |
+
# Bail if autoresearch loop sentinel not set (would conflict)
|
| 19 |
+
if [ ! -f "$REPO/.omc/autoresearch_STOP" ]; then
|
| 20 |
+
echo "ERROR: .omc/autoresearch_STOP not present — autoresearch cron still active."
|
| 21 |
+
echo "Run: touch $REPO/.omc/autoresearch_STOP"
|
| 22 |
+
exit 1
|
| 23 |
+
fi
|
| 24 |
+
|
| 25 |
+
# Bail if another training is running
|
| 26 |
+
if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {found=1} END {exit !found}'; then
|
| 27 |
+
echo "ERROR: another python train.py is already running"
|
| 28 |
+
exit 1
|
| 29 |
+
fi
|
| 30 |
+
|
| 31 |
+
rm -f run_champion_12h.log
|
| 32 |
+
env \
|
| 33 |
+
LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
|
| 34 |
+
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
|
| 35 |
+
HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
|
| 36 |
+
HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
|
| 37 |
+
HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
|
| 38 |
+
HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
|
| 39 |
+
HYDRA_TIME_BUDGET=43200 \
|
| 40 |
+
HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
|
| 41 |
+
HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
|
| 42 |
+
HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
|
| 43 |
+
HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
|
| 44 |
+
HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
|
| 45 |
+
HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
|
| 46 |
+
HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
|
| 47 |
+
HYDRA_Z_LOSS_WEIGHT=0.001 \
|
| 48 |
+
HYDRA_RESUME_CKPT=none \
|
| 49 |
+
./.venv/bin/python -u train.py > run_champion_12h.log 2>&1
|
| 50 |
+
echo "exit=$?"
|
overlay/scripts/train_champion_5h.sh
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# 5-hour champion training — fresh start with properly-timed cosine schedule.
|
| 3 |
+
#
|
| 4 |
+
# Why not 12h: at 12h budget, the cosine LR stays near peak for the first
|
| 5 |
+
# ~6h, leaving the model thrashing around bpb~1.72 (plateau observed).
|
| 6 |
+
# The schedule is stretched too thin.
|
| 7 |
+
#
|
| 8 |
+
# Why 5h: 18000s is long enough to build capacity (~17000 steps at 30k tps)
|
| 9 |
+
# while letting the cosine actually decay to zero within the window. The
|
| 10 |
+
# "cooling" phase (last 20% = 1h) is where the bpb drops sharply below
|
| 11 |
+
# the 10-min champion's 1.62.
|
| 12 |
+
#
|
| 13 |
+
# Why not resume from latest.pt: the saved ckpt triggers NaN on first
|
| 14 |
+
# forward after resume (reproducible; ckpt/optimizer state incompatibility
|
| 15 |
+
# not worth debugging — fresh start is faster).
|
| 16 |
+
|
| 17 |
+
set -u
|
| 18 |
+
REPO=/home/mikeb/work/feather
|
| 19 |
+
cd "$REPO"
|
| 20 |
+
|
| 21 |
+
if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {f=1} END{exit !f}'; then
|
| 22 |
+
echo "ERROR: another python train.py is running"
|
| 23 |
+
exit 1
|
| 24 |
+
fi
|
| 25 |
+
|
| 26 |
+
rm -f run_champion_5h.log
|
| 27 |
+
env \
|
| 28 |
+
LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
|
| 29 |
+
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
|
| 30 |
+
HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
|
| 31 |
+
HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
|
| 32 |
+
HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
|
| 33 |
+
HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
|
| 34 |
+
HYDRA_TIME_BUDGET=18000 \
|
| 35 |
+
HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
|
| 36 |
+
HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
|
| 37 |
+
HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
|
| 38 |
+
HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
|
| 39 |
+
HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
|
| 40 |
+
HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
|
| 41 |
+
HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
|
| 42 |
+
HYDRA_Z_LOSS_WEIGHT=0.001 \
|
| 43 |
+
HYDRA_RESUME_CKPT=none \
|
| 44 |
+
./.venv/bin/python -u train.py > run_champion_5h.log 2>&1
|
| 45 |
+
echo "exit=$?"
|
overlay/scripts/train_champion_resume.sh
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# Resume the original 12h run from its step-5000 checkpoint with the SAME
|
| 3 |
+
# budget (43200s). This keeps the optimizer state and LR schedule identical
|
| 4 |
+
# to what was running at ckpt save, so there's no mismatch between loaded
|
| 5 |
+
# momentum and new lr.
|
| 6 |
+
#
|
| 7 |
+
# Intent: validate that the resume path itself works (vs the failed warmstart
|
| 8 |
+
# attempts where budget change caused NaN on first step).
|
| 9 |
+
|
| 10 |
+
set -u
|
| 11 |
+
REPO=/home/mikeb/work/feather
|
| 12 |
+
cd "$REPO"
|
| 13 |
+
|
| 14 |
+
if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {f=1} END{exit !f}'; then
|
| 15 |
+
echo "ERROR: another python train.py is running"
|
| 16 |
+
exit 1
|
| 17 |
+
fi
|
| 18 |
+
|
| 19 |
+
rm -f run_champion_resume.log
|
| 20 |
+
env \
|
| 21 |
+
LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
|
| 22 |
+
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
|
| 23 |
+
HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
|
| 24 |
+
HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
|
| 25 |
+
HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
|
| 26 |
+
HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
|
| 27 |
+
HYDRA_TIME_BUDGET=43200 \
|
| 28 |
+
HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
|
| 29 |
+
HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
|
| 30 |
+
HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
|
| 31 |
+
HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
|
| 32 |
+
HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
|
| 33 |
+
HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
|
| 34 |
+
HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
|
| 35 |
+
HYDRA_Z_LOSS_WEIGHT=0.001 \
|
| 36 |
+
HYDRA_RESUME_CKPT=/home/mikeb/.cache/autoresearch/latest.pt \
|
| 37 |
+
./.venv/bin/python -u train.py > run_champion_resume.log 2>&1
|
| 38 |
+
echo "exit=$?"
|
overlay/scripts/train_champion_resume_clean.sh
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# Resume training from weights-only ckpt (optimizer state stripped) to
|
| 3 |
+
# avoid the reproducible NaN that plain resume triggers.
|
| 4 |
+
#
|
| 5 |
+
# The step/train_seconds/epoch are also reset to 0 so the LR schedule
|
| 6 |
+
# warmup runs cleanly and cosine decay matches the new TIME_BUDGET.
|
| 7 |
+
# Model weights carry over ~2500 steps of prior training.
|
| 8 |
+
|
| 9 |
+
set -u
|
| 10 |
+
REPO=/home/mikeb/work/feather
|
| 11 |
+
cd "$REPO"
|
| 12 |
+
|
| 13 |
+
if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {f=1} END{exit !f}'; then
|
| 14 |
+
echo "ERROR: another python train.py is running"
|
| 15 |
+
exit 1
|
| 16 |
+
fi
|
| 17 |
+
|
| 18 |
+
CKPT=/home/mikeb/.cache/autoresearch/weights_only_clean.pt
|
| 19 |
+
if [ ! -f "$CKPT" ]; then
|
| 20 |
+
echo "ERROR: $CKPT missing. Run scripts/strip_optimizer_state.py first."
|
| 21 |
+
exit 1
|
| 22 |
+
fi
|
| 23 |
+
|
| 24 |
+
rm -f run_champion_resume_clean.log
|
| 25 |
+
env \
|
| 26 |
+
LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
|
| 27 |
+
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
|
| 28 |
+
HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
|
| 29 |
+
HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
|
| 30 |
+
HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
|
| 31 |
+
HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
|
| 32 |
+
HYDRA_TIME_BUDGET=18000 \
|
| 33 |
+
HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
|
| 34 |
+
HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
|
| 35 |
+
HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
|
| 36 |
+
HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
|
| 37 |
+
HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
|
| 38 |
+
HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
|
| 39 |
+
HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
|
| 40 |
+
HYDRA_Z_LOSS_WEIGHT=0.001 \
|
| 41 |
+
HYDRA_RESUME_CKPT="$CKPT" \
|
| 42 |
+
./.venv/bin/python -u train.py > run_champion_resume_clean.log 2>&1
|
| 43 |
+
echo "exit=$?"
|
overlay/scripts/train_champion_v2.sh
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# Champion training v2 — fixes data pipeline + mode collapse.
|
| 3 |
+
#
|
| 4 |
+
# Diagnosis from step-3500 ckpt sampling:
|
| 5 |
+
# - Greedy decoding collapses to "a whole grains, etc." attractor
|
| 6 |
+
# - Top-p produces grammatical but factually-empty text
|
| 7 |
+
# - Token cache being built on-the-fly; blend sources were silently
|
| 8 |
+
# unavailable because HYDRA_LOCAL_SHARDS_ONLY=1 + no cached parquets
|
| 9 |
+
# - FULL_BLEND has only 4 active sources (fineweb-edu, wikipedia,
|
| 10 |
+
# cosmopedia, fineweb), all weight-0 for code/math
|
| 11 |
+
#
|
| 12 |
+
# Fixes:
|
| 13 |
+
# A) HYDRA_LOCAL_SHARDS_ONLY=0 → stream directly from HF Hub
|
| 14 |
+
# B) HYDRA_BACKGROUND_PREFETCH=1 → download remaining shards in BG
|
| 15 |
+
# C) HYDRA_ENTROPY_PENALTY=0.01 → break single-attractor mode collapse
|
| 16 |
+
# D) HYDRA_LABEL_SMOOTHING=0.1 → soft targets discourage peaked dist
|
| 17 |
+
# E) Resume from weights_only_clean.pt (inherit prior training)
|
| 18 |
+
|
| 19 |
+
set -u
|
| 20 |
+
REPO=/home/mikeb/work/feather
|
| 21 |
+
cd "$REPO"
|
| 22 |
+
|
| 23 |
+
if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {f=1} END{exit !f}'; then
|
| 24 |
+
echo "ERROR: another python train.py is running"
|
| 25 |
+
exit 1
|
| 26 |
+
fi
|
| 27 |
+
|
| 28 |
+
CKPT=/home/mikeb/.cache/autoresearch/weights_only_clean.pt
|
| 29 |
+
if [ ! -f "$CKPT" ]; then
|
| 30 |
+
echo "ERROR: $CKPT missing."
|
| 31 |
+
exit 1
|
| 32 |
+
fi
|
| 33 |
+
|
| 34 |
+
rm -f run_champion_v2.log
|
| 35 |
+
env \
|
| 36 |
+
LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
|
| 37 |
+
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
|
| 38 |
+
HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
|
| 39 |
+
HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
|
| 40 |
+
HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
|
| 41 |
+
HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
|
| 42 |
+
HYDRA_TIME_BUDGET=18000 \
|
| 43 |
+
HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
|
| 44 |
+
HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
|
| 45 |
+
HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
|
| 46 |
+
HYDRA_LOCAL_SHARDS_ONLY=0 HYDRA_BACKGROUND_PREFETCH=1 \
|
| 47 |
+
HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
|
| 48 |
+
HYDRA_ENTROPY_PENALTY=0.01 HYDRA_LABEL_SMOOTHING=0.1 \
|
| 49 |
+
HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
|
| 50 |
+
HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
|
| 51 |
+
HYDRA_Z_LOSS_WEIGHT=0.001 \
|
| 52 |
+
HYDRA_RESUME_CKPT="$CKPT" \
|
| 53 |
+
./.venv/bin/python -u train.py > run_champion_v2.log 2>&1
|
| 54 |
+
echo "exit=$?"
|
overlay/scripts/train_champion_warmstart.sh
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# Warm-start from the 12h champion training's latest.pt, with a TIGHTER
|
| 3 |
+
# total budget so the cosine LR decay actually kicks in.
|
| 4 |
+
#
|
| 5 |
+
# Problem: The plain 12h run (43200s) keeps lr near peak (1.1e-2) for the
|
| 6 |
+
# first ~6h, leaving the model thrashing around its local min (bpb ~1.72
|
| 7 |
+
# rolling avg from step 2700 onward). User correctly pointed out the
|
| 8 |
+
# schedule shape for a long budget wastes time in exploration.
|
| 9 |
+
#
|
| 10 |
+
# Fix: resume the already-trained weights (step ~5000, train_seconds ~5600)
|
| 11 |
+
# but run with HYDRA_TIME_BUDGET=20000 (5.5h total). The scheduler treats
|
| 12 |
+
# loaded train_seconds=5600 as "already 28% through" a 20000s budget, so
|
| 13 |
+
# lr decays from ~1.05e-2 now to near-zero over the next 4h — the "cooling"
|
| 14 |
+
# phase that produces the stable low-bpb endpoint.
|
| 15 |
+
#
|
| 16 |
+
# Total additional wall-clock: ~4h. Previous checkpoints are preserved
|
| 17 |
+
# (ckpt rotations keep latest.pt, latest.pt.1, etc.).
|
| 18 |
+
|
| 19 |
+
set -u
|
| 20 |
+
REPO=/home/mikeb/work/feather
|
| 21 |
+
cd "$REPO"
|
| 22 |
+
|
| 23 |
+
if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {f=1} END{exit !f}'; then
|
| 24 |
+
echo "ERROR: another python train.py is running"
|
| 25 |
+
exit 1
|
| 26 |
+
fi
|
| 27 |
+
|
| 28 |
+
rm -f run_champion_warmstart.log
|
| 29 |
+
env \
|
| 30 |
+
LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
|
| 31 |
+
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
|
| 32 |
+
HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
|
| 33 |
+
HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
|
| 34 |
+
HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
|
| 35 |
+
HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
|
| 36 |
+
HYDRA_TIME_BUDGET=20000 \
|
| 37 |
+
HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
|
| 38 |
+
HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
|
| 39 |
+
HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
|
| 40 |
+
HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
|
| 41 |
+
HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
|
| 42 |
+
HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
|
| 43 |
+
HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
|
| 44 |
+
HYDRA_Z_LOSS_WEIGHT=0.001 \
|
| 45 |
+
HYDRA_RESUME_CKPT=/home/mikeb/.cache/autoresearch/latest.pt \
|
| 46 |
+
./.venv/bin/python -u train.py > run_champion_warmstart.log 2>&1
|
| 47 |
+
echo "exit=$?"
|
overlay/scripts/wsl_bootstrap_tps.sh
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
|
| 4 |
+
# Bootstrap a WSL CUDA Python env capable of running train.py TPS checks.
|
| 5 |
+
# Usage:
|
| 6 |
+
# bash scripts/wsl_bootstrap_tps.sh [cuda-tag]
|
| 7 |
+
# Example:
|
| 8 |
+
# bash scripts/wsl_bootstrap_tps.sh cu121
|
| 9 |
+
|
| 10 |
+
CUDA_TAG="${1:-cu121}"
|
| 11 |
+
PYTHON_BIN="${PYTHON_BIN:-python3}"
|
| 12 |
+
VENV_DIR="${VENV_DIR:-.venv-wsl}"
|
| 13 |
+
|
| 14 |
+
if ! grep -qiE "microsoft|wsl" /proc/version 2>/dev/null; then
|
| 15 |
+
echo "[bootstrap] warning: not running inside WSL; continuing anyway"
|
| 16 |
+
fi
|
| 17 |
+
|
| 18 |
+
if ! command -v nvidia-smi >/dev/null 2>&1; then
|
| 19 |
+
echo "[bootstrap] error: nvidia-smi not found. Install NVIDIA driver + WSL GPU support first."
|
| 20 |
+
exit 1
|
| 21 |
+
fi
|
| 22 |
+
|
| 23 |
+
if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
|
| 24 |
+
echo "[bootstrap] error: Python binary not found: $PYTHON_BIN"
|
| 25 |
+
exit 1
|
| 26 |
+
fi
|
| 27 |
+
|
| 28 |
+
"$PYTHON_BIN" -m venv "$VENV_DIR"
|
| 29 |
+
source "$VENV_DIR/bin/activate"
|
| 30 |
+
|
| 31 |
+
python -m pip install --upgrade pip wheel setuptools
|
| 32 |
+
|
| 33 |
+
case "$CUDA_TAG" in
|
| 34 |
+
cu118)
|
| 35 |
+
TORCH_INDEX_URL="https://download.pytorch.org/whl/cu118"
|
| 36 |
+
;;
|
| 37 |
+
cu121)
|
| 38 |
+
TORCH_INDEX_URL="https://download.pytorch.org/whl/cu121"
|
| 39 |
+
;;
|
| 40 |
+
cu124)
|
| 41 |
+
TORCH_INDEX_URL="https://download.pytorch.org/whl/cu124"
|
| 42 |
+
;;
|
| 43 |
+
*)
|
| 44 |
+
echo "[bootstrap] error: unsupported cuda tag '$CUDA_TAG' (supported: cu118, cu121, cu124)"
|
| 45 |
+
exit 1
|
| 46 |
+
;;
|
| 47 |
+
esac
|
| 48 |
+
|
| 49 |
+
python -m pip install "torch" --index-url "$TORCH_INDEX_URL"
|
| 50 |
+
python -m pip install -e ".[dev]"
|
| 51 |
+
|
| 52 |
+
# IMPORTANT: --no-build-isolation keeps pip from pulling torch-cpu into an
|
| 53 |
+
# isolated build env, which would break mamba-ssm extension builds.
|
| 54 |
+
python -m pip install "causal-conv1d>=1.4.0" --no-build-isolation
|
| 55 |
+
python -m pip install "mamba-ssm" --no-build-isolation
|
| 56 |
+
|
| 57 |
+
python - <<'PY'
|
| 58 |
+
import torch
|
| 59 |
+
print(f"[bootstrap] torch={torch.__version__}")
|
| 60 |
+
print(f"[bootstrap] torch_cuda={torch.version.cuda}")
|
| 61 |
+
print(f"[bootstrap] cuda_available={torch.cuda.is_available()}")
|
| 62 |
+
if not torch.cuda.is_available():
|
| 63 |
+
raise SystemExit("[bootstrap] error: CUDA not available to torch")
|
| 64 |
+
import mamba_ssm # noqa: F401
|
| 65 |
+
print("[bootstrap] mamba_ssm import OK")
|
| 66 |
+
PY
|
| 67 |
+
|
| 68 |
+
echo "[bootstrap] done. Activate env with: source $VENV_DIR/bin/activate"
|
overlay/subsystems/htm.py
CHANGED
|
@@ -29,40 +29,38 @@ copy is small compared to the SP/TM compute.
|
|
| 29 |
from __future__ import annotations
|
| 30 |
|
| 31 |
import time
|
| 32 |
-
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
| 33 |
|
| 34 |
import numpy as np
|
| 35 |
import torch
|
| 36 |
import torch.nn as nn
|
| 37 |
|
| 38 |
-
import htm_rust
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
# GPU backend: built with `maturin develop --features gpu`. One CUDA region
|
| 44 |
# per batch slot, persistent device state for SP synapses. Transparent
|
| 45 |
# fallback to CPU when not available.
|
| 46 |
-
_HTM_HAS_GPU = hasattr(htm_rust, "HTMRegionGpu")
|
| 47 |
# Zero-copy CUDA path: consumes torch CUDA tensors directly via the
|
| 48 |
# __cuda_array_interface__ protocol, skipping the sdr.cpu()/numpy round-trip
|
| 49 |
# and the D2H of outputs. Huge win when the input SDR already lives on GPU
|
| 50 |
# (which is the train.py hot path — retina is a device buffer).
|
| 51 |
-
_HTM_HAS_CAI = _HTM_HAS_GPU and hasattr(
|
| 52 |
# Fused megakernel path: collapses all T timesteps + SP + TM into a single
|
| 53 |
# CUDA launch per forward. Replaces global top-K with per-column threshold
|
| 54 |
# inhibition (see htm_rust/docs/GPU_HTM.md §Fused Kernel).
|
| 55 |
# Enabled by default when available; set HYDRA_HTM_FUSED=0 to opt out.
|
| 56 |
import os as _os_fused
|
| 57 |
-
_HTM_HAS_FUSED = _HTM_HAS_GPU and hasattr(
|
| 58 |
-
|
| 59 |
-
_HTM_HAS_FUSED and hasattr(htm_rust, "gpu_fused_available") and htm_rust.gpu_fused_available()
|
| 60 |
-
)
|
| 61 |
-
_HTM_USE_FUSED = (
|
| 62 |
-
_HTM_HAS_FUSED
|
| 63 |
-
and _HTM_GPU_FUSED_RUNTIME
|
| 64 |
-
and bool(int(_os_fused.environ.get("HYDRA_HTM_FUSED", "1")))
|
| 65 |
-
)
|
| 66 |
|
| 67 |
|
| 68 |
class HTMLayer(nn.Module):
|
|
@@ -87,11 +85,11 @@ class HTMLayer(nn.Module):
|
|
| 87 |
learn: bool = True,
|
| 88 |
reset_each_forward: bool = True,
|
| 89 |
use_gpu: bool | None = None,
|
| 90 |
-
) -> None:
|
| 91 |
-
super().__init__()
|
| 92 |
-
self.input_bits = input_bits
|
| 93 |
-
self.n_columns = n_columns
|
| 94 |
-
self.cells_per_column = cells_per_column
|
| 95 |
self.learn = learn
|
| 96 |
self.reset_each_forward = reset_each_forward
|
| 97 |
self._seed_base = seed
|
|
@@ -101,39 +99,27 @@ class HTMLayer(nn.Module):
|
|
| 101 |
# converges since the EMA accumulates over many calls. Env:
|
| 102 |
# HYDRA_HTM_LEARN_EVERY=N (default 1 = every forward, 0 = disabled).
|
| 103 |
import os as _os
|
| 104 |
-
self._learn_every = max(1, int(_os.environ.get("HYDRA_HTM_LEARN_EVERY", "1")))
|
| 105 |
-
self._forward_counter = 0
|
| 106 |
-
|
| 107 |
-
#
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
"
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
self.
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
if not self._use_gpu or not self._gpu_fallback:
|
| 126 |
-
raise
|
| 127 |
-
print(
|
| 128 |
-
f"[htm] GPU region init failed ({e}); falling back to CPU HTMRegion",
|
| 129 |
-
flush=True,
|
| 130 |
-
)
|
| 131 |
-
self._use_gpu = False
|
| 132 |
-
self._region_cls = htm_rust.HTMRegion
|
| 133 |
-
self._regions = [
|
| 134 |
-
self._region_cls(input_bits, n_columns, cells_per_column, seed + i)
|
| 135 |
-
for i in range(batch_size)
|
| 136 |
-
]
|
| 137 |
self.register_buffer("_dummy", torch.zeros(1), persistent=False)
|
| 138 |
import os as _os
|
| 139 |
self._htm_pool = ThreadPoolExecutor(max_workers=min(_os.cpu_count() or 4, 16))
|
|
@@ -278,12 +264,12 @@ class HTMLayer(nn.Module):
|
|
| 278 |
# grid.y = B processes all regions concurrently — ~B× speedup.
|
| 279 |
# Falls back to sequential dispatch if the batched entry isn't
|
| 280 |
# available (older htm_rust wheel).
|
| 281 |
-
if _HTM_USE_FUSED and
|
| 282 |
# Slice self._regions to match B: _ensure_regions may have
|
| 283 |
# allocated more regions than the current batch size needs
|
| 284 |
# (e.g. factual eval uses smaller batches than training).
|
| 285 |
try:
|
| 286 |
-
|
| 287 |
self._regions[:B],
|
| 288 |
[sdr_u8[b].__cuda_array_interface__ for b in range(B)],
|
| 289 |
[cols_out[b].__cuda_array_interface__ for b in range(B)],
|
|
|
|
| 29 |
from __future__ import annotations
|
| 30 |
|
| 31 |
import time
|
| 32 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 33 |
+
from typing import Any
|
| 34 |
|
| 35 |
import numpy as np
|
| 36 |
import torch
|
| 37 |
import torch.nn as nn
|
| 38 |
|
| 39 |
+
import htm_rust
|
| 40 |
+
|
| 41 |
+
_HTM_REGION: Any = getattr(htm_rust, "HTMRegion", None)
|
| 42 |
+
_HTM_REGION_GPU: Any = getattr(htm_rust, "HTMRegionGpu", None)
|
| 43 |
+
_HTM_STEP_BATCH_FUSED_CUDA: Any = getattr(htm_rust, "step_batch_fused_cuda", None)
|
| 44 |
+
|
| 45 |
+
# step_many releases the GIL for the whole pass, so multiple threads can
|
| 46 |
+
# truly run regions in parallel — wall-clock scales with B up to CPU cores.
|
| 47 |
+
_HTM_HAS_STEP_MANY = hasattr(_HTM_REGION, "step_many")
|
| 48 |
# GPU backend: built with `maturin develop --features gpu`. One CUDA region
|
| 49 |
# per batch slot, persistent device state for SP synapses. Transparent
|
| 50 |
# fallback to CPU when not available.
|
| 51 |
+
_HTM_HAS_GPU = hasattr(htm_rust, "HTMRegionGpu")
|
| 52 |
# Zero-copy CUDA path: consumes torch CUDA tensors directly via the
|
| 53 |
# __cuda_array_interface__ protocol, skipping the sdr.cpu()/numpy round-trip
|
| 54 |
# and the D2H of outputs. Huge win when the input SDR already lives on GPU
|
| 55 |
# (which is the train.py hot path — retina is a device buffer).
|
| 56 |
+
_HTM_HAS_CAI = _HTM_HAS_GPU and hasattr(_HTM_REGION_GPU, "step_many_cuda")
|
| 57 |
# Fused megakernel path: collapses all T timesteps + SP + TM into a single
|
| 58 |
# CUDA launch per forward. Replaces global top-K with per-column threshold
|
| 59 |
# inhibition (see htm_rust/docs/GPU_HTM.md §Fused Kernel).
|
| 60 |
# Controlled by the HYDRA_HTM_FUSED env var (default on when available; set to 0 to disable).
|
| 61 |
import os as _os_fused
|
| 62 |
+
_HTM_HAS_FUSED = _HTM_HAS_GPU and hasattr(_HTM_REGION_GPU, "step_many_fused_cuda")
|
| 63 |
+
_HTM_USE_FUSED = _HTM_HAS_FUSED and bool(int(_os_fused.environ.get("HYDRA_HTM_FUSED", "1")))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
|
| 66 |
class HTMLayer(nn.Module):
|
|
|
|
| 85 |
learn: bool = True,
|
| 86 |
reset_each_forward: bool = True,
|
| 87 |
use_gpu: bool | None = None,
|
| 88 |
+
) -> None:
|
| 89 |
+
super().__init__()
|
| 90 |
+
self.input_bits = input_bits
|
| 91 |
+
self.n_columns = n_columns
|
| 92 |
+
self.cells_per_column = cells_per_column
|
| 93 |
self.learn = learn
|
| 94 |
self.reset_each_forward = reset_each_forward
|
| 95 |
self._seed_base = seed
|
|
|
|
| 99 |
# converges since the EMA accumulates over many calls. Env:
|
| 100 |
# HYDRA_HTM_LEARN_EVERY=N (default 1 = every forward; the max(1, ...) below clamps 0 or negative values to 1).
|
| 101 |
import os as _os
|
| 102 |
+
self._learn_every = max(1, int(_os.environ.get("HYDRA_HTM_LEARN_EVERY", "1")))
|
| 103 |
+
self._forward_counter = 0
|
| 104 |
+
force_cpu = _os.environ.get("HYDRA_FORCE_HTM_CPU", "0") == "1"
|
| 105 |
+
# GPU backend gate. Default: auto-detect — use GPU when the pyo3
|
| 106 |
+
# module was built with --features gpu AND CUDA is actually usable.
|
| 107 |
+
if use_gpu is None:
|
| 108 |
+
use_gpu = (not force_cpu) and _HTM_HAS_GPU and torch.cuda.is_available()
|
| 109 |
+
elif use_gpu and not _HTM_HAS_GPU:
|
| 110 |
+
raise RuntimeError(
|
| 111 |
+
"HTMLayer(use_gpu=True) but htm_rust was not built with "
|
| 112 |
+
"--features gpu. Re-run `maturin develop --features gpu`."
|
| 113 |
+
)
|
| 114 |
+
elif use_gpu and force_cpu:
|
| 115 |
+
use_gpu = False
|
| 116 |
+
self._use_gpu = bool(use_gpu)
|
| 117 |
+
cls = _HTM_REGION_GPU if self._use_gpu else _HTM_REGION
|
| 118 |
+
self._region_cls = cls
|
| 119 |
+
self._regions = [
|
| 120 |
+
cls(input_bits, n_columns, cells_per_column, seed + i)
|
| 121 |
+
for i in range(batch_size)
|
| 122 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
self.register_buffer("_dummy", torch.zeros(1), persistent=False)
|
| 124 |
import os as _os
|
| 125 |
self._htm_pool = ThreadPoolExecutor(max_workers=min(_os.cpu_count() or 4, 16))
|
|
|
|
| 264 |
# grid.y = B processes all regions concurrently — ~B× speedup.
|
| 265 |
# Falls back to sequential dispatch if the batched entry isn't
|
| 266 |
# available (older htm_rust wheel).
|
| 267 |
+
if _HTM_USE_FUSED and _HTM_STEP_BATCH_FUSED_CUDA is not None:
|
| 268 |
# Slice self._regions to match B: _ensure_regions may have
|
| 269 |
# allocated more regions than the current batch size needs
|
| 270 |
# (e.g. factual eval uses smaller batches than training).
|
| 271 |
try:
|
| 272 |
+
_HTM_STEP_BATCH_FUSED_CUDA(
|
| 273 |
self._regions[:B],
|
| 274 |
[sdr_u8[b].__cuda_array_interface__ for b in range(B)],
|
| 275 |
[cols_out[b].__cuda_array_interface__ for b in range(B)],
|