Jackoatmon committed on
Commit
1981f80
·
verified ·
1 Parent(s): f2e46e6

Update Feather training runtime image

Browse files
Files changed (37) hide show
  1. overlay/configs/harness_config.py +47 -17
  2. overlay/harness/eval_agent.py +188 -60
  3. overlay/harness/orchestrator.py +16 -13
  4. overlay/htm_rust/build.rs +27 -35
  5. overlay/htm_rust/src/gpu/fused.rs +87 -93
  6. overlay/htm_rust/src/gpu/kernels/htm_fused_step.cu +77 -77
  7. overlay/hydra/engram.py +73 -75
  8. overlay/hydra/eval.py +8 -1
  9. overlay/hydra/model.py +21 -6
  10. overlay/hydra/training.py +46 -5
  11. overlay/prepare_nemotron.py +187 -232
  12. overlay/pyproject.toml +1 -0
  13. overlay/scripts/autoresearch_iter.sh +144 -0
  14. overlay/scripts/benchmark_hyena_stack.py +50 -29
  15. overlay/scripts/export_hpo_priors.py +74 -0
  16. overlay/scripts/hpo_orchestrator.py +319 -0
  17. overlay/scripts/launch_feather_hf_job.py +145 -110
  18. overlay/scripts/long_train.sh +38 -38
  19. overlay/scripts/optuna_hpo.py +725 -0
  20. overlay/scripts/parse_metrics.py +24 -0
  21. overlay/scripts/run_domain_expanded_pretrain.sh +262 -262
  22. overlay/scripts/run_meta.sh +13 -13
  23. overlay/scripts/run_phase1.sh +32 -32
  24. overlay/scripts/run_phase2.sh +25 -25
  25. overlay/scripts/run_tps_gate.sh +23 -0
  26. overlay/scripts/setup.sh +28 -27
  27. overlay/scripts/strip_optimizer_state.py +29 -0
  28. overlay/scripts/sweep_depth_aggregate.py +141 -45
  29. overlay/scripts/sweep_depth_local.sh +62 -62
  30. overlay/scripts/train_champion_12h.sh +50 -0
  31. overlay/scripts/train_champion_5h.sh +45 -0
  32. overlay/scripts/train_champion_resume.sh +38 -0
  33. overlay/scripts/train_champion_resume_clean.sh +43 -0
  34. overlay/scripts/train_champion_v2.sh +54 -0
  35. overlay/scripts/train_champion_warmstart.sh +47 -0
  36. overlay/scripts/wsl_bootstrap_tps.sh +68 -0
  37. overlay/subsystems/htm.py +43 -57
overlay/configs/harness_config.py CHANGED
@@ -1,10 +1,13 @@
1
- """Harness configuration for HYDRA's self-evolving outer loop."""
2
- from typing import Literal
 
 
 
 
 
3
 
4
- from pydantic import BaseModel, Field
5
 
6
-
7
- class HarnessConfig(BaseModel):
8
  """Configuration for the HYDRA harness behavior."""
9
 
10
  # Inner loop
@@ -47,15 +50,19 @@ class HarnessConfig(BaseModel):
47
  default=5.0, description="Max % regression from best known val_bpb"
48
  )
49
 
50
- # Keep/discard criteria
51
- primary_metric: str = "val_bpb"
52
- secondary_metrics: dict = Field(
53
- default_factory=lambda: {
54
- "mhc_spectral_norm": {"max": 2.0},
55
- "engram_hit_rate": {"min": 0.1},
56
- "hestia_quant_error": {"max": 0.05},
57
- }
58
- )
 
 
 
 
59
 
60
  # Experiment execution
61
  experiment_timeout: int = Field(
@@ -73,6 +80,29 @@ class HarnessConfig(BaseModel):
73
  gate_mhc_spectral_norm: float | None = Field(
74
  default=None, description="Max mhc_spectral_norm for keep (None=disabled)"
75
  )
76
- gate_engram_hit_rate: float | None = Field(
77
- default=None, description="Min engram_hit_rate for keep (None=disabled)"
78
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Harness configuration for HYDRA's self-evolving outer loop."""
2
+ from typing import Literal
3
+
4
+ from pydantic import BaseModel, Field
5
+
6
+ type GateThresholds = dict[str, float]
7
+ type GateConfig = dict[str, GateThresholds]
8
 
 
9
 
10
+ class HarnessConfig(BaseModel):
 
11
  """Configuration for the HYDRA harness behavior."""
12
 
13
  # Inner loop
 
50
  default=5.0, description="Max % regression from best known val_bpb"
51
  )
52
 
53
+ # Keep/discard criteria
54
+ primary_metric: str = "val_bpb"
55
+ secondary_metrics: GateConfig = Field(
56
+ default_factory=lambda: {
57
+ "mhc_spectral_norm": {"max": 2.0},
58
+ "engram_hit_rate": {"min": 0.1},
59
+ "factual_english_score": {"min": 0.5},
60
+ "instruction_following_score": {"min": 0.5},
61
+ "distinct_2": {"min": 0.1},
62
+ "repetition_rate": {"max": 0.2},
63
+ "hestia_quant_error": {"max": 0.05},
64
+ }
65
+ )
66
 
67
  # Experiment execution
68
  experiment_timeout: int = Field(
 
80
  gate_mhc_spectral_norm: float | None = Field(
81
  default=None, description="Max mhc_spectral_norm for keep (None=disabled)"
82
  )
83
+ gate_engram_hit_rate: float | None = Field(
84
+ default=None, description="Min engram_hit_rate for keep (None=disabled)"
85
+ )
86
+ gate_tps_median: float | None = Field(
87
+ default=None,
88
+ description="Min steady-state tps_median for keep (None=disabled)",
89
+ )
90
+ gate_tps_p10: float | None = Field(
91
+ default=None,
92
+ description="Min steady-state tps_p10 for keep (None=disabled)",
93
+ )
94
+
95
+ def to_secondary_gates(self) -> GateConfig:
96
+ """Build active keep/discard gates from defaults plus gate_* overrides."""
97
+ gates = {metric: thresholds.copy() for metric, thresholds in self.secondary_metrics.items()}
98
+
99
+ if self.gate_mhc_spectral_norm is not None:
100
+ gates.setdefault("mhc_spectral_norm", {})["max"] = self.gate_mhc_spectral_norm
101
+ if self.gate_engram_hit_rate is not None:
102
+ gates.setdefault("engram_hit_rate", {})["min"] = self.gate_engram_hit_rate
103
+ if self.gate_tps_median is not None:
104
+ gates.setdefault("tps_median", {})["min"] = self.gate_tps_median
105
+ if self.gate_tps_p10 is not None:
106
+ gates.setdefault("tps_p10", {})["min"] = self.gate_tps_p10
107
+
108
+ return gates
overlay/harness/eval_agent.py CHANGED
@@ -1,10 +1,15 @@
1
- """Eval agent: parse run.log and extract metrics from training runs."""
2
- import re
3
- from dataclasses import dataclass, field
 
 
 
 
 
4
 
5
 
6
  @dataclass
7
- class ExperimentResult:
8
  """Parsed result from a single experiment run.
9
 
10
  All float fields default to 0.0; integer fields default to 0.
@@ -23,19 +28,38 @@ class ExperimentResult:
23
  peak_vram_mb: float = 0.0
24
  mfu_percent: float = 0.0
25
 
26
- # Throughput
27
- total_tokens_m: float = 0.0
28
- num_steps: int = 0
 
 
 
 
 
29
 
30
  # Model shape (echoed by train.py summary block)
31
  num_params_m: float = 0.0
32
  n_layer: int = 0
33
  d_model: int = 0
34
 
35
- # Secondary health metrics
36
- mhc_spectral_norm: float = 0.0
37
- engram_hit_rate: float = 0.0
38
- sr_bypass_rate: float = 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  # Status
41
  crashed: bool = False
@@ -56,12 +80,48 @@ _PATTERNS: dict[str, str] = {
56
  "n_layer": r"^n_layer:\s+(\d+)",
57
  "d_model": r"^d_model:\s+(\d+)",
58
  "mhc_spectral_norm": r"^mhc_spectral_norm:\s+([\d.]+)",
59
- "engram_hit_rate": r"^engram_hit_rate:\s+([\d.]+)",
60
- "sr_bypass_rate": r"^sr_bypass_rate:\s+([\d.]+)",
61
- }
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  # Attributes that should be parsed as int rather than float.
64
- _INT_ATTRS: frozenset[str] = frozenset({"num_steps", "n_layer", "d_model"})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
 
67
  def parse_run_log(log_path: str) -> ExperimentResult:
@@ -84,22 +144,60 @@ def parse_run_log(log_path: str) -> ExperimentResult:
84
  result.error_message = f"Log file not found: {log_path}"
85
  return result
86
 
87
- # Detect crash signals in output.
88
- if "Traceback" in content or "FAIL" in content or "Error" in content:
89
- result.crashed = True
90
- lines = content.strip().splitlines()
91
- result.error_message = "\n".join(lines[-20:])
92
-
93
- for attr, pattern in _PATTERNS.items():
94
- match = re.search(pattern, content, re.MULTILINE)
95
- if match:
96
- raw = match.group(1)
97
- setattr(result, attr, int(raw) if attr in _INT_ATTRS else float(raw))
98
-
99
- return result
100
-
101
-
102
- def check_secondary_alarms(result: ExperimentResult) -> list[str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  """Check secondary metrics against fixed alarm thresholds.
104
 
105
  Args:
@@ -118,19 +216,44 @@ def check_secondary_alarms(result: ExperimentResult) -> list[str]:
118
  alarms.append(
119
  f"engram_hit_rate={result.engram_hit_rate:.4f} < 0.1 (memory underused)"
120
  )
121
- if 0 < result.mfu_percent < 10:
122
- alarms.append(
123
- f"mfu_percent={result.mfu_percent:.2f}% < 10% (GPU underutilized)"
124
- )
125
-
126
- return alarms
127
-
128
-
129
- def should_keep(
130
- result: ExperimentResult,
131
- best_bpb: float,
132
- gates: dict | None = None,
133
- ) -> tuple[bool, str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  """Decide whether to keep or discard an experiment.
135
 
136
  The primary criterion is strictly lower val_bpb than the current best.
@@ -154,19 +277,24 @@ def should_keep(
154
  if result.val_bpb >= best_bpb:
155
  return False, "discard"
156
 
157
- # Secondary gate checks.
158
- if gates:
159
- gate_mhc = gates.get("mhc_spectral_norm", {}).get("max")
160
- if gate_mhc is not None and result.mhc_spectral_norm > gate_mhc:
161
- return (
162
- False,
163
- f"mhc_spectral_norm {result.mhc_spectral_norm:.4f} > gate {gate_mhc}",
164
- )
165
- gate_engram = gates.get("engram_hit_rate", {}).get("min")
166
- if gate_engram is not None and result.engram_hit_rate < gate_engram:
167
- return (
168
- False,
169
- f"engram_hit_rate {result.engram_hit_rate:.4f} < gate {gate_engram}",
170
- )
171
-
172
- return True, "keep"
 
 
 
 
 
 
1
+ """Eval agent: parse run.log and extract metrics from training runs."""
2
+ import re
3
+ import statistics
4
+ from dataclasses import dataclass
5
+
6
+
7
+ type GateThresholds = dict[str, float]
8
+ type GateConfig = dict[str, GateThresholds]
9
 
10
 
11
  @dataclass
12
+ class ExperimentResult:
13
  """Parsed result from a single experiment run.
14
 
15
  All float fields default to 0.0; integer fields default to 0.
 
28
  peak_vram_mb: float = 0.0
29
  mfu_percent: float = 0.0
30
 
31
+ # Throughput
32
+ total_tokens_m: float = 0.0
33
+ num_steps: int = 0
34
+ tps_median: float = 0.0
35
+ tps_p10: float = 0.0
36
+ tps_min: float = 0.0
37
+ tps_max: float = 0.0
38
+ tps_samples: int = 0
39
 
40
  # Model shape (echoed by train.py summary block)
41
  num_params_m: float = 0.0
42
  n_layer: int = 0
43
  d_model: int = 0
44
 
45
+ # Secondary health metrics
46
+ mhc_spectral_norm: float = 0.0
47
+ engram_hit_rate: float = 0.0
48
+ sr_bypass_rate: float = 0.0
49
+
50
+ # Evaluation breadth metrics
51
+ factual_english_score: float = 0.0
52
+ instruction_following_score: float = 0.0
53
+ distinct_1: float = 0.0
54
+ distinct_2: float = 0.0
55
+ repetition_rate: float = 0.0
56
+ repetition_bigram_rate: float = 0.0
57
+ calibration_ece: float = 0.0
58
+ calibration_brier: float = 0.0
59
+ calibration_accuracy: float = 0.0
60
+ calibration_tokens: int = 0
61
+ eval_seed: int = 0
62
+ eval_seed_group: str = ""
63
 
64
  # Status
65
  crashed: bool = False
 
80
  "n_layer": r"^n_layer:\s+(\d+)",
81
  "d_model": r"^d_model:\s+(\d+)",
82
  "mhc_spectral_norm": r"^mhc_spectral_norm:\s+([\d.]+)",
83
+ "engram_hit_rate": r"^engram_hit_rate:\s+([\d.]+)",
84
+ "sr_bypass_rate": r"^sr_bypass_rate:\s+([\d.]+)",
85
+ "factual_english_score": r"^factual_english_score:\s+([\d.]+)",
86
+ "instruction_following_score": r"^instruction_following_score:\s+([\d.]+)",
87
+ "distinct_1": r"^distinct_1:\s+([\d.]+)",
88
+ "distinct_2": r"^distinct_2:\s+([\d.]+)",
89
+ "repetition_rate": r"^repetition_rate:\s+([\d.]+)",
90
+ "repetition_bigram_rate": r"^repetition_bigram_rate:\s+([\d.]+)",
91
+ "calibration_ece": r"^calibration_ece:\s+([\d.]+)",
92
+ "calibration_brier": r"^calibration_brier:\s*([\d.]+)",
93
+ "calibration_accuracy": r"^calibration_accuracy:\s+([\d.]+)",
94
+ "calibration_tokens": r"^calibration_tokens:\s+(\d+)",
95
+ "eval_seed": r"^eval_seed:\s+(\d+)",
96
+ "eval_seed_group": r"^eval_seed_group:\s+(.+)",
97
+ }
98
 
99
  # Attributes that should be parsed as int rather than float.
100
+ _INT_ATTRS: frozenset[str] = frozenset(
101
+ {
102
+ "num_steps",
103
+ "n_layer",
104
+ "d_model",
105
+ "calibration_tokens",
106
+ "eval_seed",
107
+ }
108
+ )
109
+ _STR_ATTRS: frozenset[str] = frozenset({"eval_seed_group"})
110
+ _STEP_TPS_PATTERN = re.compile(r"step=(\d+).*?\btps=(\d+)\b")
111
+ _TPS_PATTERN = re.compile(r"\btps=(\d+)\b")
112
+
113
+
114
+ def _percentile_linear(sorted_values: list[float], pct: float) -> float:
115
+ """Compute percentile via linear interpolation (0 <= pct <= 100)."""
116
+ if not sorted_values:
117
+ return 0.0
118
+ if len(sorted_values) == 1:
119
+ return sorted_values[0]
120
+ rank = (len(sorted_values) - 1) * (pct / 100.0)
121
+ lo = int(rank)
122
+ hi = min(lo + 1, len(sorted_values) - 1)
123
+ frac = rank - lo
124
+ return sorted_values[lo] * (1.0 - frac) + sorted_values[hi] * frac
125
 
126
 
127
  def parse_run_log(log_path: str) -> ExperimentResult:
 
144
  result.error_message = f"Log file not found: {log_path}"
145
  return result
146
 
147
+ # Detect crash signals in output. Keep this strict to avoid false positives
148
+ # from benign log lines that include "error" in a non-fatal context.
149
+ if (
150
+ "Traceback" in content
151
+ or "\nFAIL\n" in content
152
+ or "[TPS_GUARD] FAIL" in content
153
+ or "raise SystemExit(1)" in content
154
+ ):
155
+ result.crashed = True
156
+ lines = content.strip().splitlines()
157
+ result.error_message = "\n".join(lines[-20:])
158
+
159
+ for attr, pattern in _PATTERNS.items():
160
+ match = re.search(pattern, content, re.MULTILINE)
161
+ if match:
162
+ raw = match.group(1)
163
+ if attr in _INT_ATTRS:
164
+ setattr(result, attr, int(raw))
165
+ elif attr in _STR_ATTRS:
166
+ setattr(result, attr, raw.strip())
167
+ else:
168
+ setattr(result, attr, float(raw))
169
+
170
+ warmup_steps = 10
171
+ warmup_match = re.search(r"\[TPS_GUARD\] enabled .*?warmup_steps=(\d+)", content)
172
+ if warmup_match:
173
+ warmup_steps = int(warmup_match.group(1))
174
+
175
+ step_tps_samples: list[tuple[int, int]] = []
176
+ for m in _STEP_TPS_PATTERN.finditer(content):
177
+ step_tps_samples.append((int(m.group(1)), int(m.group(2))))
178
+
179
+ tps_values: list[float] = []
180
+ if step_tps_samples:
181
+ for step, tps in step_tps_samples:
182
+ if step >= warmup_steps:
183
+ tps_values.append(float(tps))
184
+ if not tps_values:
185
+ tps_values = [float(tps) for _, tps in step_tps_samples]
186
+ else:
187
+ tps_values = [float(m.group(1)) for m in _TPS_PATTERN.finditer(content)]
188
+
189
+ if tps_values:
190
+ sorted_tps = sorted(tps_values)
191
+ result.tps_samples = len(tps_values)
192
+ result.tps_median = float(statistics.median(tps_values))
193
+ result.tps_p10 = float(_percentile_linear(sorted_tps, 10.0))
194
+ result.tps_min = float(sorted_tps[0])
195
+ result.tps_max = float(sorted_tps[-1])
196
+
197
+ return result
198
+
199
+
200
+ def check_secondary_alarms(result: ExperimentResult) -> list[str]:
201
  """Check secondary metrics against fixed alarm thresholds.
202
 
203
  Args:
 
216
  alarms.append(
217
  f"engram_hit_rate={result.engram_hit_rate:.4f} < 0.1 (memory underused)"
218
  )
219
+ if 0 < result.mfu_percent < 10:
220
+ alarms.append(
221
+ f"mfu_percent={result.mfu_percent:.2f}% < 10% (GPU underutilized)"
222
+ )
223
+ if result.calibration_ece > 0.35:
224
+ alarms.append(
225
+ f"calibration_ece={result.calibration_ece:.4f} > 0.35 (poor calibration)"
226
+ )
227
+ if result.tps_median > 0 and result.tps_median < 50000:
228
+ alarms.append(
229
+ f"tps_median={result.tps_median:.0f} < 50000 (throughput below A10 objective)"
230
+ )
231
+
232
+ return alarms
233
+
234
+
235
+ def _check_gate(
236
+ result: ExperimentResult,
237
+ gates: GateConfig,
238
+ metric: str,
239
+ ) -> tuple[bool, str] | None:
240
+ """Evaluate a single min/max gate against an ExperimentResult metric."""
241
+ gate = gates.get(metric, {})
242
+ value = getattr(result, metric)
243
+ max_value = gate.get("max")
244
+ if max_value is not None and value > max_value:
245
+ return False, f"{metric} {value:.4f} > gate {max_value}"
246
+ min_value = gate.get("min")
247
+ if min_value is not None and value < min_value:
248
+ return False, f"{metric} {value:.4f} < gate {min_value}"
249
+ return None
250
+
251
+
252
+ def should_keep(
253
+ result: ExperimentResult,
254
+ best_bpb: float,
255
+ gates: GateConfig | None = None,
256
+ ) -> tuple[bool, str]:
257
  """Decide whether to keep or discard an experiment.
258
 
259
  The primary criterion is strictly lower val_bpb than the current best.
 
277
  if result.val_bpb >= best_bpb:
278
  return False, "discard"
279
 
280
+ # Secondary gate checks.
281
+ if gates:
282
+ gate_metrics = (
283
+ "mhc_spectral_norm",
284
+ "engram_hit_rate",
285
+ "factual_english_score",
286
+ "instruction_following_score",
287
+ "distinct_1",
288
+ "distinct_2",
289
+ "repetition_rate",
290
+ "repetition_bigram_rate",
291
+ "calibration_ece",
292
+ "tps_median",
293
+ "tps_p10",
294
+ )
295
+ for metric in gate_metrics:
296
+ gate_result = _check_gate(result, gates, metric)
297
+ if gate_result is not None:
298
+ return gate_result
299
+
300
+ return True, "keep"
overlay/harness/orchestrator.py CHANGED
@@ -20,11 +20,12 @@ provides the infrastructure ("rails") that the autoresearch loop runs on.
20
  """
21
  import argparse
22
  import csv
23
- import os
24
- import subprocess
25
- import time
26
-
27
- from harness.eval_agent import ExperimentResult, check_secondary_alarms, parse_run_log, should_keep
 
28
  from harness.git_utils import REPO_DIR, commit_all, current_commit_short, reset_to
29
  from harness.health_monitor import check_health, reset_peak_stats
30
  from harness.meta_agent import run_meta_iteration
@@ -144,12 +145,12 @@ def run_experiment(timeout: int = 600) -> str:
144
  # ---------------------------------------------------------------------------
145
 
146
 
147
- def run_loop(
148
- meta_interval: int = 20,
149
- max_experiments: int | None = None,
150
- experiment_timeout: int = 600,
151
- secondary_gates: dict | None = None,
152
- ) -> None:
153
  """Run the HYDRA autoresearch loop.
154
 
155
  This function runs indefinitely (or until ``max_experiments`` is reached
@@ -162,8 +163,10 @@ def run_loop(
162
  secondary_gates: Optional gate thresholds forwarded to
163
  :func:`~harness.eval_agent.should_keep`.
164
  """
165
- init_results_tsv()
166
- best_bpb = _load_best_bpb()
 
 
167
  experiment_num = count_experiments()
168
 
169
  print(
 
20
  """
21
  import argparse
22
  import csv
23
+ import os
24
+ import subprocess
25
+ import time
26
+
27
+ from configs.harness_config import HarnessConfig
28
+ from harness.eval_agent import ExperimentResult, check_secondary_alarms, parse_run_log, should_keep
29
  from harness.git_utils import REPO_DIR, commit_all, current_commit_short, reset_to
30
  from harness.health_monitor import check_health, reset_peak_stats
31
  from harness.meta_agent import run_meta_iteration
 
145
  # ---------------------------------------------------------------------------
146
 
147
 
148
+ def run_loop(
149
+ meta_interval: int = 20,
150
+ max_experiments: int | None = None,
151
+ experiment_timeout: int = 600,
152
+ secondary_gates: dict[str, dict[str, float]] | None = None,
153
+ ) -> None:
154
  """Run the HYDRA autoresearch loop.
155
 
156
  This function runs indefinitely (or until ``max_experiments`` is reached
 
163
  secondary_gates: Optional gate thresholds forwarded to
164
  :func:`~harness.eval_agent.should_keep`.
165
  """
166
+ init_results_tsv()
167
+ if secondary_gates is None:
168
+ secondary_gates = HarnessConfig().to_secondary_gates()
169
+ best_bpb = _load_best_bpb()
170
  experiment_num = count_experiments()
171
 
172
  print(
overlay/htm_rust/build.rs CHANGED
@@ -26,39 +26,37 @@ fn main() {
26
  return;
27
  }
28
 
29
- // Kernels to compile. Each .cu file → one .ptx file, embedded by name.
30
- // htm_fused_step currently requires Hopper-only cluster APIs (sm_90+).
31
- let mut kernels: Vec<&str> = vec![
32
- "sp_overlap",
33
- "sp_topk",
34
- "sp_learn",
35
- "sp_duty",
36
- "sp_boost_fused",
 
 
37
  "tm_predict",
38
  "tm_activate",
39
  "tm_learn",
40
- "tm_punish",
41
- "tm_grow",
42
- "tm_anomaly",
43
- "tm_reset",
44
- ];
 
 
 
 
 
 
45
 
46
  let kernels_dir = PathBuf::from("src/gpu/kernels");
47
- for k in &kernels {
48
- let src = kernels_dir.join(format!("{k}.cu"));
49
- println!("cargo:rerun-if-changed={}", src.display());
50
- }
51
 
52
- let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR"));
53
- let arch = env::var("HTM_CUDA_ARCH").unwrap_or_else(|_| "sm_90a".into());
54
- let fused_supported = arch.starts_with("sm_90");
55
- if fused_supported {
56
- kernels.push("htm_fused_step");
57
- }
58
- println!(
59
- "cargo:rustc-env=HTM_GPU_FUSED_AVAILABLE={}",
60
- if fused_supported { "1" } else { "0" }
61
- );
62
 
63
  let nvcc = find_nvcc();
64
  println!("cargo:warning=htm_rust: nvcc = {nvcc}");
@@ -81,7 +79,7 @@ fn main() {
81
  // than the nvcc toolchain. Set HTM_PTX_VERSION to e.g. "7.8" or "8.0".
82
  let ptx_version_override = env::var("HTM_PTX_VERSION").ok();
83
 
84
- for k in kernels {
85
  let src = kernels_dir.join(format!("{k}.cu"));
86
  let ptx = out_dir.join(format!("{k}.ptx"));
87
  if !src.exists() {
@@ -129,13 +127,7 @@ fn main() {
129
  std::fs::write(&ptx, patched)
130
  .unwrap_or_else(|e| panic!("write {} failed: {e}", ptx.display()));
131
  }
132
- }
133
-
134
- if !fused_supported {
135
- let fused_ptx = out_dir.join("htm_fused_step.ptx");
136
- std::fs::write(&fused_ptx, "// fused kernel disabled for this CUDA arch\n")
137
- .unwrap_or_else(|e| panic!("write {} failed: {e}", fused_ptx.display()));
138
- }
139
 
140
  // Export OUT_DIR for include_str! in Rust.
141
  println!(
 
26
  return;
27
  }
28
 
29
+ let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR"));
30
+ let arch = env::var("HTM_CUDA_ARCH").unwrap_or_else(|_| "sm_90a".into());
31
+
32
+ // Base kernels — compile for any sm_80+ GPU. Each .cu file → one .ptx file.
33
+ let base_kernels: &[&str] = &[
34
+ "sp_overlap",
35
+ "sp_topk",
36
+ "sp_learn",
37
+ "sp_duty",
38
+ "sp_boost_fused",
39
  "tm_predict",
40
  "tm_activate",
41
  "tm_learn",
42
+ "tm_punish",
43
+ "tm_grow",
44
+ "tm_anomaly",
45
+ "tm_reset",
46
+ ];
47
+
48
+ // htm_fused_step now compiles for ALL architectures (sm_80+).
49
+ // On Hopper (sm_90+): uses cluster-distributed shared memory for hot state.
50
+ // On Ampere (sm_86) and other pre-Hopper: uses global memory reads/writes
51
+ // with grid.sync() for cross-block synchronization (cooperative launch).
52
+ let kernels: Vec<&str> = base_kernels.iter().chain(["htm_fused_step"].iter()).copied().collect();
53
 
54
  let kernels_dir = PathBuf::from("src/gpu/kernels");
55
+ for k in &kernels {
56
+ let src = kernels_dir.join(format!("{k}.cu"));
57
+ println!("cargo:rerun-if-changed={}", src.display());
58
+ }
59
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  let nvcc = find_nvcc();
62
  println!("cargo:warning=htm_rust: nvcc = {nvcc}");
 
79
  // than the nvcc toolchain. Set HTM_PTX_VERSION to e.g. "7.8" or "8.0".
80
  let ptx_version_override = env::var("HTM_PTX_VERSION").ok();
81
 
82
+ for k in kernels {
83
  let src = kernels_dir.join(format!("{k}.cu"));
84
  let ptx = out_dir.join(format!("{k}.ptx"));
85
  if !src.exists() {
 
127
  std::fs::write(&ptx, patched)
128
  .unwrap_or_else(|e| panic!("write {} failed: {e}", ptx.display()));
129
  }
130
+ }
 
 
 
 
 
 
131
 
132
  // Export OUT_DIR for include_str! in Rust.
133
  println!(
overlay/htm_rust/src/gpu/fused.rs CHANGED
@@ -132,7 +132,12 @@ pub(crate) fn plan_fused_launch(
132
  grid_cap_override: Option<u32>,
133
  ) -> Result<FusedLaunchPlan, String> {
134
  let sm_count = sm_count.max(1);
135
- let block_dim_x = 1024u32;
 
 
 
 
 
136
 
137
  // Cluster launch path: cooperative launch is not required. Keep the probe
138
  // result for residency estimation only.
@@ -140,11 +145,10 @@ pub(crate) fn plan_fused_launch(
140
  eprintln!("[htm_rust] INFO: cooperative launch unsupported; cluster path only.");
141
  }
142
 
143
- // Cluster constraint: grid_dim_x must equal the cluster size (16) so that
144
- // each region maps to exactly one cluster. `HTM_FUSED_GRID_CAP` can lower
145
- // this for debugging but should not exceed 16 for cluster correctness.
146
  let default_grid_cap = 16u32;
147
- let grid_cap = grid_cap_override.unwrap_or(default_grid_cap).min(16);
148
  let resident_bound = if cooperative_grid_limit > 0 {
149
  cooperative_grid_limit.max(sm_count * 2)
150
  } else {
@@ -460,15 +464,21 @@ pub fn launch_fused(
460
  return Err(DriverError(ret));
461
  }
462
  } else {
463
- // Fallback for devices that don't support cluster launch.
464
- result::launch_kernel(
 
 
 
465
  fused.raw_kernel.function,
466
- (grid_x, 1, 1),
467
- (block_x, 1, 1),
468
- 0,
469
  cu_stream,
470
- &mut kernel_params,
471
- )?;
 
 
 
472
  }
473
  }
474
 
@@ -503,41 +513,29 @@ pub(super) fn launch_fused_batched_raw(
503
  assert_eq!(anom_per_region.len(), b);
504
  assert!(b >= 1, "need at least one region");
505
 
506
- // Reset per-region step_scratch before each launch.
507
- for &rp in region_ptrs.iter() {
508
- let r = unsafe { &mut *rp };
509
- let fused = r
510
- .fused_state
511
- .as_mut()
512
- .expect("launch_fused_batched_raw requires fused_state");
513
- let dev = r.sp_gpu.dev_ref().clone();
514
- dev.memset_zeros(&mut fused.step_scratch)?;
515
- fused.iter_counter = fused.iter_counter.wrapping_add(1);
516
- }
517
 
518
  // Shared config — all regions use identical sp/tm parameters.
519
- let (grid_x, block_x, function_batched, cu_stream, cu_ctx) = {
520
- let r0 = unsafe { &*region_ptrs[0] };
521
- let fused = r0
522
- .fused_state
523
- .as_ref()
524
- .expect("launch_fused_batched_raw requires fused_state");
525
- (
526
- fused.grid_dim_x,
527
- fused.block_dim_x,
528
- fused.raw_kernel.function_batched,
529
- *r0.sp_gpu.dev_ref().cu_stream(),
530
- *r0.sp_gpu.dev_ref().cu_primary_ctx(),
531
- )
532
- };
533
-
534
- let cfg = {
535
- let r = unsafe { &*region_ptrs[0] };
536
- let fused = r
537
- .fused_state
538
- .as_ref()
539
- .expect("launch_fused_batched_raw requires fused_state");
540
- FusedConfig {
541
  input_bits: input_bits as u32,
542
  n_columns: r.sp_gpu.n_columns_accessor() as u32,
543
  synapses_per_col: r.sp_gpu.synapses_per_col_accessor() as u32,
@@ -562,41 +560,38 @@ pub(super) fn launch_fused_batched_raw(
562
  initial_perm_i16: r.tm_gpu.initial_perm_i16 as i32,
563
  t: t as u32,
564
  learn: if learn { 1 } else { 0 },
565
- iter_seed: fused.iter_counter,
566
- cooperative_grid_sync: 1,
567
- }
568
- };
569
 
570
  // Build B FusedPtrs per-region.
571
- let mut ptrs_vec: Vec<FusedPtrs> = Vec::with_capacity(b);
572
- for i in 0..b {
573
- let r = unsafe { &*region_ptrs[i] };
574
- let fused = r
575
- .fused_state
576
- .as_ref()
577
- .expect("launch_fused_batched_raw requires fused_state");
578
- ptrs_vec.push(FusedPtrs {
579
- syn_bit: *r.sp_gpu.syn_bit_accessor().device_ptr(),
580
- syn_perm: *r.sp_gpu.syn_perm_accessor().device_ptr(),
581
- boost: *r.sp_gpu.boost_accessor().device_ptr(),
582
- active_duty: *r.sp_gpu.active_duty_accessor().device_ptr(),
583
- inhibition_threshold: *fused.inhibition_threshold.device_ptr(),
584
- seg_cell_id: *r.tm_gpu.seg_cell_id_accessor().device_ptr(),
585
- seg_syn_count: *r.tm_gpu.seg_syn_count_accessor().device_ptr(),
586
- syn_presyn: *r.tm_gpu.syn_presyn_accessor().device_ptr(),
587
- tm_syn_perm: *r.tm_gpu.syn_perm_accessor().device_ptr(),
588
- cell_seg_count: *r.tm_gpu.cell_seg_count_accessor().device_ptr(),
589
- cell_active_a: *fused.cell_active_bits_a.device_ptr(),
590
- cell_active_b: *fused.cell_active_bits_b.device_ptr(),
591
- cell_winner_a: *fused.cell_winner_bits_a.device_ptr(),
592
- cell_winner_b: *fused.cell_winner_bits_b.device_ptr(),
593
- inputs: inputs_per_region[i],
594
- cols_out: cols_per_region[i],
595
- anom_out: anom_per_region[i],
596
- barrier_counters: 0u64, // ABI-compat dummy; cluster barrier replaces DLB.
597
- step_scratch: *fused.step_scratch.device_ptr(),
598
- });
599
- }
600
 
601
  // Upload FusedPtrs array to device (B * sizeof(FusedPtrs) bytes).
602
  // FusedPtrs is repr(C) + DeviceRepr so htod_sync_copy handles it.
@@ -608,14 +603,10 @@ pub(super) fn launch_fused_batched_raw(
608
  // Grid = (grid_x, B, 1) with cluster_dim=(16,1,1): each region (Y slice)
609
  // occupies exactly one cluster of 16 blocks. All 8 clusters run concurrently
610
  // on the H200's 132 SMs (8 × 16 = 128 blocks ≤ 132 SMs).
611
- let use_cluster = {
612
- let r0 = unsafe { &*region_ptrs[0] };
613
- let fused = r0
614
- .fused_state
615
- .as_ref()
616
- .expect("launch_fused_batched_raw requires fused_state");
617
- fused.cluster_info.max_cluster_size > 0
618
- };
619
 
620
  unsafe {
621
  result::ctx::set_current(cu_ctx)?;
@@ -653,15 +644,18 @@ pub(super) fn launch_fused_batched_raw(
653
  return Err(DriverError(ret));
654
  }
655
  } else {
656
- // Fallback: plain non-cooperative launch for non-Hopper devices.
657
- result::launch_kernel(
658
  function_batched,
659
- (grid_x, b as u32, 1),
660
- (block_x, 1, 1),
661
- 0,
662
  cu_stream,
663
- &mut kernel_params,
664
- )?;
 
 
 
665
  }
666
  }
667
 
 
132
  grid_cap_override: Option<u32>,
133
  ) -> Result<FusedLaunchPlan, String> {
134
  let sm_count = sm_count.max(1);
135
+ // 1024 threads/block exceeds the register file on Ampere (sm_86: 65536
136
+ // regs/SM ÷ 1024 = 64 regs/thread; fused kernel needs ~80+). 256 gives
137
+ // up to the 255 regs/thread hardware cap, which is ample. Compensate with more blocks via
138
+ // cooperative launch. On Hopper (228 KB smem, 255 regs/thread baseline),
139
+ // 1024 works fine, but 256 is safe everywhere.
140
+ let block_dim_x = 256u32;
141
 
142
  // Cluster launch path: cooperative launch is not required. Keep the probe
143
  // result for residency estimation only.
 
145
  eprintln!("[htm_rust] INFO: cooperative launch unsupported; cluster path only.");
146
  }
147
 
148
+ // Tested grid_cap: 4 blocks = 30ms (too serial), 16 blocks = 10.8ms (parallel wins).
149
+ // Parallelism in SP overlap + TM predict stages outweighs grid.sync() cost.
 
150
  let default_grid_cap = 16u32;
151
+ let grid_cap = grid_cap_override.unwrap_or(default_grid_cap);
152
  let resident_bound = if cooperative_grid_limit > 0 {
153
  cooperative_grid_limit.max(sm_count * 2)
154
  } else {
 
464
  return Err(DriverError(ret));
465
  }
466
  } else {
467
+ // Pre-Hopper: cooperative kernel launch. The fused kernel uses
468
+ // grid.sync() for cross-block synchronization which REQUIRES
469
+ // cuLaunchCooperativeKernel (normal launch silently crashes on
470
+ // the first grid.sync() call).
471
+ let ret = sys::lib().cuLaunchCooperativeKernel(
472
  fused.raw_kernel.function,
473
+ grid_x, 1, 1,
474
+ block_x, 1, 1,
475
+ 0, // sharedMemBytes
476
  cu_stream,
477
+ kernel_params.as_mut_ptr(),
478
+ );
479
+ if ret != sys::CUresult::CUDA_SUCCESS {
480
+ return Err(DriverError(ret));
481
+ }
482
  }
483
  }
484
 
 
513
  assert_eq!(anom_per_region.len(), b);
514
  assert!(b >= 1, "need at least one region");
515
 
516
+ // Reset per-region step_scratch before each launch.
517
+ for &rp in region_ptrs.iter() {
518
+ let r = unsafe { &mut *rp };
519
+ let dev = r.sp_gpu.dev_ref().clone();
520
+ dev.memset_zeros(&mut r.fused_state.step_scratch)?;
521
+ r.fused_state.iter_counter = r.fused_state.iter_counter.wrapping_add(1);
522
+ }
 
 
 
 
523
 
524
  // Shared config — all regions use identical sp/tm parameters.
525
+ let (grid_x, block_x, function_batched, cu_stream, cu_ctx) = {
526
+ let r0 = unsafe { &*region_ptrs[0] };
527
+ (
528
+ r0.fused_state.grid_dim_x,
529
+ r0.fused_state.block_dim_x,
530
+ r0.fused_state.raw_kernel.function_batched,
531
+ *r0.sp_gpu.dev_ref().cu_stream(),
532
+ *r0.sp_gpu.dev_ref().cu_primary_ctx(),
533
+ )
534
+ };
535
+
536
+ let cfg = {
537
+ let r = unsafe { &*region_ptrs[0] };
538
+ FusedConfig {
 
 
 
 
 
 
 
 
539
  input_bits: input_bits as u32,
540
  n_columns: r.sp_gpu.n_columns_accessor() as u32,
541
  synapses_per_col: r.sp_gpu.synapses_per_col_accessor() as u32,
 
560
  initial_perm_i16: r.tm_gpu.initial_perm_i16 as i32,
561
  t: t as u32,
562
  learn: if learn { 1 } else { 0 },
563
+ iter_seed: r.fused_state.iter_counter,
564
+ cooperative_grid_sync: 1,
565
+ }
566
+ };
567
 
568
  // Build B FusedPtrs per-region.
569
+ let ptrs_vec: Vec<FusedPtrs> = (0..b)
570
+ .map(|i| {
571
+ let r = unsafe { &*region_ptrs[i] };
572
+ FusedPtrs {
573
+ syn_bit: *r.sp_gpu.syn_bit_accessor().device_ptr(),
574
+ syn_perm: *r.sp_gpu.syn_perm_accessor().device_ptr(),
575
+ boost: *r.sp_gpu.boost_accessor().device_ptr(),
576
+ active_duty: *r.sp_gpu.active_duty_accessor().device_ptr(),
577
+ inhibition_threshold: *r.fused_state.inhibition_threshold.device_ptr(),
578
+ seg_cell_id: *r.tm_gpu.seg_cell_id_accessor().device_ptr(),
579
+ seg_syn_count: *r.tm_gpu.seg_syn_count_accessor().device_ptr(),
580
+ syn_presyn: *r.tm_gpu.syn_presyn_accessor().device_ptr(),
581
+ tm_syn_perm: *r.tm_gpu.syn_perm_accessor().device_ptr(),
582
+ cell_seg_count: *r.tm_gpu.cell_seg_count_accessor().device_ptr(),
583
+ cell_active_a: *r.fused_state.cell_active_bits_a.device_ptr(),
584
+ cell_active_b: *r.fused_state.cell_active_bits_b.device_ptr(),
585
+ cell_winner_a: *r.fused_state.cell_winner_bits_a.device_ptr(),
586
+ cell_winner_b: *r.fused_state.cell_winner_bits_b.device_ptr(),
587
+ inputs: inputs_per_region[i],
588
+ cols_out: cols_per_region[i],
589
+ anom_out: anom_per_region[i],
590
+ barrier_counters: 0u64, // ABI-compat dummy; cluster barrier replaces DLB.
591
+ step_scratch: *r.fused_state.step_scratch.device_ptr(),
592
+ }
593
+ })
594
+ .collect();
 
 
 
595
 
596
  // Upload FusedPtrs array to device (B * sizeof(FusedPtrs) bytes).
597
  // FusedPtrs is repr(C) + DeviceRepr so htod_sync_copy handles it.
 
603
  // Grid = (grid_x, B, 1) with cluster_dim=(16,1,1): each region (Y slice)
604
  // occupies exactly one cluster of 16 blocks. All 8 clusters run concurrently
605
  // on the H200's 132 SMs (8 × 16 = 128 blocks ≤ 132 SMs).
606
+ let use_cluster = {
607
+ let r0 = unsafe { &*region_ptrs[0] };
608
+ r0.fused_state.cluster_info.max_cluster_size > 0
609
+ };
 
 
 
 
610
 
611
  unsafe {
612
  result::ctx::set_current(cu_ctx)?;
 
644
  return Err(DriverError(ret));
645
  }
646
  } else {
647
+ // Pre-Hopper: cooperative kernel launch (grid.sync() requires it).
648
+ let ret = sys::lib().cuLaunchCooperativeKernel(
649
  function_batched,
650
+ grid_x, b as u32, 1,
651
+ block_x, 1, 1,
652
+ 0, // sharedMemBytes
653
  cu_stream,
654
+ kernel_params.as_mut_ptr(),
655
+ );
656
+ if ret != sys::CUresult::CUDA_SUCCESS {
657
+ return Err(DriverError(ret));
658
+ }
659
  }
660
  }
661
 
overlay/htm_rust/src/gpu/kernels/htm_fused_step.cu CHANGED
@@ -124,13 +124,21 @@ struct FusedConfig {
124
  //
125
  // The flags / expected / phase / cooperative_grid_sync parameters are kept
126
  // in the signature for call-site compatibility but are unused.
127
- __device__ static inline void fused_grid_barrier(cg::grid_group /* grid */,
128
  unsigned int * /* flags — unused */,
129
  unsigned int /* expected — unused */,
130
  unsigned int /* phase — unused */,
131
  unsigned int /* cooperative_grid_sync — unused */) {
 
 
132
  auto cluster = cg::this_cluster();
133
  cluster.sync();
 
 
 
 
 
 
134
  }
135
 
136
  __device__ static inline unsigned int warp_sum_u32(unsigned int v) {
@@ -187,17 +195,26 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
187
  // DSMEM: Cluster-distributed shared memory for hot per-column
188
  // state (inhibition_threshold, boost, active_duty).
189
  //
190
- // Each block in the cluster owns a contiguous slice of
191
- // [my_col_start, my_col_end) columns in its own __shared__
192
- // arrays. Any block can peer-read another block's slice via
193
- // cluster.map_shared_rank(ptr, owner_block_rank)[offset].
194
  //
195
- // This eliminates 2×n_cols×T GMEM reads per forward call
196
- // (read + potential re-read of threshold/boost/duty per timestep).
 
 
197
  // =========================================================
 
 
 
198
  auto cluster = cg::this_cluster();
199
  const unsigned int cluster_block_rank = cluster.block_rank(); // 0..cluster_size-1
200
  const unsigned int cluster_sz = cluster.num_blocks(); // == gridDim.x (≤16)
 
 
 
 
 
201
 
202
  // Partition n_cols evenly across cluster blocks.
203
  // Each block owns cols_per_block columns starting at my_col_start.
@@ -209,27 +226,27 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
209
  (my_col_start + cols_per_block < n_cols)
210
  ? (my_col_start + cols_per_block) : n_cols; // clamp
211
 
 
212
  // Cluster-distributed shared memory arrays.
213
  // Each block holds at most COLS_PER_CLUSTER_BLOCK_MAX floats per array.
214
  // Peer blocks address into each other's smem via map_shared_rank.
215
  __shared__ float s_inhib_thr [COLS_PER_CLUSTER_BLOCK_MAX];
216
  __shared__ float s_boost [COLS_PER_CLUSTER_BLOCK_MAX];
217
  __shared__ float s_active_duty[COLS_PER_CLUSTER_BLOCK_MAX];
 
218
 
219
- // TMA multicast input staging tile (T9).
220
- //
221
- // On Hopper (sm_90a), cg::memcpy_async with cluster scope issues a single
222
- // TMA DMA that multicasts the source data to all 16 SMs in the cluster
223
- // simultaneously — replacing ~16 per-block GMEM reads per timestep with a
224
- // single hardware DMA. After cg::wait(cluster) every SM's s_input_tile
225
- // is populated identically without any additional DRAM traffic.
226
- //
227
- // Fallback: when cfg.input_bits > INPUT_BITS_MAX the tile is bypassed
228
- // and each thread reads directly from GMEM (original path).
229
  //
230
- // Alignment: 16-byte aligned to satisfy TMA descriptor requirements.
 
 
 
 
 
231
  __shared__ __align__(16) unsigned char s_input_tile[INPUT_BITS_MAX];
 
232
 
 
233
  // Initial GMEM → smem load (reads state from previous forward call).
234
  // Each block loads only its own slice; tid strides across the slice.
235
  for (unsigned int c = my_col_start + tid; c < my_col_end; c += blockDim.x) {
@@ -242,6 +259,11 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
242
  // All blocks in the cluster must finish loading before any block
243
  // starts reading peer smem inside the T-loop.
244
  cluster.sync();
 
 
 
 
 
245
 
246
  const unsigned int S = cfg.synapses_per_col;
247
  const unsigned int cpc = cfg.cells_per_column;
@@ -307,32 +329,19 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
307
  // Ordering: BARRIER 1 completes before we issue the DMA.
308
  // The DMA completes before Stage A reads s_input_tile.
309
  // =========================================================
 
310
  const bool use_input_tile = (cfg.input_bits <= INPUT_BITS_MAX);
311
  if (use_input_tile) {
312
- // Thread-block scope async copy: each SM independently loads
313
- // its own input tile from GMEM into shared memory.
314
- //
315
- // NOTE: CUDA 12.1's cooperative_groups::memcpy_async() rejects
316
- // cluster_group at compile time (static_assert in async.h:171).
317
- // True TMA multicast (single DMA for all 16 SMs in the cluster)
318
- // would require raw PTX cp.async.bulk.tensor with multicast mode,
319
- // which needs cuTensorMap descriptors on the host side (T11).
320
- //
321
- // This per-SM path still gives a meaningful win: it converts
322
- // the original per-synapse scattered GMEM reads (random access
323
- // pattern hitting multiple cache lines) into one sequential DMA
324
- // per SM, improving L2 hit rate and hardware prefetcher
325
- // effectiveness. The cluster.sync() below ensures all SMs in
326
- // the cluster have finished loading before any SM enters Stage A.
327
  auto tb = cg::this_thread_block();
328
  cg::memcpy_async(tb, s_input_tile,
329
  inputs + inp_off,
330
  cfg.input_bits);
331
  cg::wait(tb);
332
- // Cluster barrier: all 16 SMs must have loaded their tile
333
- // before any SM begins reading s_input_tile in Stage A.
334
  cluster.sync();
335
  }
 
 
 
336
 
337
  // =========================================================
338
  // STAGE A: Spatial Pooler
@@ -350,22 +359,31 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
350
  float p = syn_perm[base + s];
351
  // T9: read from cluster-broadcast tile when available;
352
  // fall back to direct GMEM when input_bits > INPUT_BITS_MAX.
 
353
  unsigned int inp_byte = use_input_tile
354
  ? (unsigned int)s_input_tile[b]
355
  : (unsigned int)inputs[inp_off + b];
 
 
 
356
  unsigned int hit = ((inp_byte != 0u) && (p >= cfg.conn_thr)) ? 1u : 0u;
357
  local += hit;
358
  }
359
  unsigned int overlap = warp_sum_u32(local);
360
  overlap = __shfl_sync(0xffffffffu, overlap, 0);
361
 
362
- // Determine which cluster block owns column c and read
363
- // boost + threshold from that block's shared memory.
 
364
  const unsigned int owner_block = c / cols_per_block;
365
  const unsigned int owner_offset = c - owner_block * cols_per_block;
366
-
367
  float boost_val = cluster.map_shared_rank(s_boost, owner_block)[owner_offset];
368
  float thr = cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset];
 
 
 
 
 
369
 
370
  float boosted = (float)overlap * boost_val;
371
  unsigned int is_active = (boosted > thr) ? 1u : 0u;
@@ -383,9 +401,13 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
383
  for (unsigned int s = lane; s < S; s += 32u) {
384
  unsigned int b = syn_bit[base + s];
385
  float p = syn_perm[base + s];
 
386
  unsigned int inp_byte = use_input_tile
387
  ? (unsigned int)s_input_tile[b]
388
  : (unsigned int)inputs[inp_off + b];
 
 
 
389
  if (inp_byte != 0u) {
390
  p += cfg.sp_inc;
391
  if (p > 1.0f) p = 1.0f;
@@ -398,15 +420,20 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
398
  }
399
 
400
  // active_duty EMA + threshold adaptation.
401
- // Writes go to both peer DSMEM (hot path for next timestep)
402
- // and GMEM (persistence across forward calls).
403
  if (lane == 0) {
 
404
  float ad = cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset];
 
 
 
405
  float sample = is_active ? 1.0f : 0.0f;
406
  ad = (1.0f - cfg.duty_alpha) * ad + cfg.duty_alpha * sample;
407
 
 
408
  // Writeback: peer smem (for next timestep read) + GMEM (persistence).
409
  cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad;
 
410
  active_duty[c] = ad;
411
 
412
  // Threshold steers toward target sparsity.
@@ -415,50 +442,23 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
415
  if (new_thr < 0.1f) new_thr = 0.1f;
416
  if (new_thr > 1000.0f) new_thr = 1000.0f;
417
 
 
418
  // Writeback: peer smem (for next timestep read) + GMEM (persistence).
419
  cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] = new_thr;
 
420
  inhibition_threshold[c] = new_thr;
421
  }
422
  }
423
 
424
  // ---- DSMEM WRITEBACK SYNC: peer-smem writes must be visible cluster-wide ----
425
  //
426
- // DATA FLOW PROOF (T-loop iteration invariant):
427
- //
428
- // WRITE SITES (lane==0 inside Stage A per-col loop):
429
- // Line 328: cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad
430
- // Line 338: cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] = new_thr
431
- //
432
- // READ SITES (Stage A of the NEXT timestep t+1):
433
- // Line 290: cluster.map_shared_rank(s_boost, owner_block)[owner_offset] (read)
434
- // Line 291: cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] (read)
435
- // Line 323: cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] (read)
436
- //
437
- // PARTITION MISMATCH (root cause of T8 staleness):
438
- // cols_per_block = ceil(n_cols / cluster_sz) [smem partition]
439
- // col_lo/col_hi = floor(gwarp*n_cols/n_warps) [gwarp work partition]
440
- // These are NOT identical — up to 1 column can spill across partition boundaries.
441
- // Example: n_cols=1000, cluster_sz=16 → cols_per_block=63, block 1 col_lo=62
442
- // → block 1 processes column 62 but column 62 belongs to block 0's smem slice.
443
- // → block 1 issues a PEER WRITE to block 0's s_inhib_thr / s_active_duty.
444
- //
445
- // RACE WITHOUT SYNC:
446
- // Blocks run Stage A concurrently. Block 1 writes block 0's smem at column 62.
447
- // Block 0 may simultaneously READ s_inhib_thr[62] for its own column 62 in
448
- // Stage A of the same timestep → concurrent peer write + local read → undefined.
449
- // Additionally, without cluster.sync() after all peer writes complete, block 0's
450
- // t+1 Stage A reads might observe t-1 values still cached in its smem.
451
- //
452
- // FIX: cluster.sync() here, AFTER Stage A's per-column loop, ensures:
453
- // 1. All peer smem writes from this timestep are globally visible to all blocks.
454
- // 2. No block can enter Stage B (or start t+1 Stage A) with stale smem values.
455
- // 3. GMEM writes (lines 329, 339) are already committed to L2; __threadfence()
456
- // below ensures they are visible to all SMs before the cluster barrier.
457
- //
458
- // ORDERING: write → cluster.sync() here → __threadfence() → cluster.sync() in
459
- // fused_grid_barrier → next-timestep reads. Both visibility guarantees
460
- // are now satisfied.
461
  cluster.sync();
 
462
 
463
  // ---- BARRIER 2: SP active_mask must be visible before TM reads ----
464
  // Fence: flush cols_out + active_duty + inhibition_threshold + step_scratch
@@ -660,7 +660,7 @@ void htm_fused_step_body(const FusedPtrs& P, const FusedConfig& cfg) {
660
  }
661
 
662
  // Single-region kernel (legacy call site).
663
- __global__
664
  void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
665
  htm_fused_step_body(P, cfg);
666
  }
@@ -668,7 +668,7 @@ void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
668
  // Batched kernel: one cooperative launch for B regions. grid.y = B,
669
  // grid.x = per-region block count. Each block reads its region's
670
  // FusedPtrs from the device array via blockIdx.y.
671
- __global__
672
  void htm_fused_step_batched(const FusedPtrs* __restrict__ P_arr, FusedConfig cfg) {
673
  const FusedPtrs P = P_arr[blockIdx.y];
674
  htm_fused_step_body(P, cfg);
 
124
  //
125
  // The flags / expected / phase / cooperative_grid_sync parameters are kept
126
  // in the signature for call-site compatibility but are unused.
127
+ __device__ static inline void fused_grid_barrier(cg::grid_group grid,
128
  unsigned int * /* flags — unused */,
129
  unsigned int /* expected — unused */,
130
  unsigned int /* phase — unused */,
131
  unsigned int /* cooperative_grid_sync — unused */) {
132
+ #if __CUDA_ARCH__ >= 900
133
+ // Hopper+ : hardware cluster barrier (~10-40 ns)
134
  auto cluster = cg::this_cluster();
135
  cluster.sync();
136
+ #else
137
+ // Pre-Hopper (sm_80, sm_86, sm_89): grid-level cooperative sync.
138
+ // Requires cooperative kernel launch. ~us-ms range, adequate for HTM
139
+ // workload (kernel launch frequency is low).
140
+ grid.sync();
141
+ #endif
142
  }
143
 
144
  __device__ static inline unsigned int warp_sum_u32(unsigned int v) {
 
195
  // DSMEM: Cluster-distributed shared memory for hot per-column
196
  // state (inhibition_threshold, boost, active_duty).
197
  //
198
+ // On Hopper (sm_90+): Each block in the cluster owns a contiguous
199
+ // slice of columns in its own __shared__ arrays. Any block can
200
+ // peer-read another block's slice via cluster.map_shared_rank().
 
201
  //
202
+ // On Ampere (sm_86) and other pre-Hopper: No cluster support.
203
+ // Read/write directly from/to global memory (inhibition_threshold,
204
+ // boost, active_duty device pointers). Slightly higher latency but
205
+ // functionally correct.
206
  // =========================================================
207
+
208
+ #if __CUDA_ARCH__ >= 900
209
+ // Hopper+ cluster path
210
  auto cluster = cg::this_cluster();
211
  const unsigned int cluster_block_rank = cluster.block_rank(); // 0..cluster_size-1
212
  const unsigned int cluster_sz = cluster.num_blocks(); // == gridDim.x (≤16)
213
+ #else
214
+ // Pre-Hopper: no cluster, each block is independent.
215
+ const unsigned int cluster_block_rank = blockIdx.x;
216
+ const unsigned int cluster_sz = gridDim.x;
217
+ #endif
218
 
219
  // Partition n_cols evenly across cluster blocks.
220
  // Each block owns cols_per_block columns starting at my_col_start.
 
226
  (my_col_start + cols_per_block < n_cols)
227
  ? (my_col_start + cols_per_block) : n_cols; // clamp
228
 
229
+ #if __CUDA_ARCH__ >= 900
230
  // Cluster-distributed shared memory arrays.
231
  // Each block holds at most COLS_PER_CLUSTER_BLOCK_MAX floats per array.
232
  // Peer blocks address into each other's smem via map_shared_rank.
233
  __shared__ float s_inhib_thr [COLS_PER_CLUSTER_BLOCK_MAX];
234
  __shared__ float s_boost [COLS_PER_CLUSTER_BLOCK_MAX];
235
  __shared__ float s_active_duty[COLS_PER_CLUSTER_BLOCK_MAX];
236
+ #endif
237
 
238
+ // TMA multicast input staging tile (T9) — HOPPER ONLY.
 
 
 
 
 
 
 
 
 
239
  //
240
+ // On Hopper: each block stages its own copy of the input tile using a
241
+ // thread-block-scope cg::memcpy_async, then cluster.sync() (no true multicast).
242
+ // On Ampere: 32 KB smem allocation exceeds per-block budget when
243
+ // cooperatively launched (48 KB total, registers eat the rest). Skip the
244
+ // tile entirely — Stage A reads from GMEM directly (original path).
245
+ #if __CUDA_ARCH__ >= 900
246
  __shared__ __align__(16) unsigned char s_input_tile[INPUT_BITS_MAX];
247
+ #endif
248
 
249
+ #if __CUDA_ARCH__ >= 900
250
  // Initial GMEM → smem load (reads state from previous forward call).
251
  // Each block loads only its own slice; tid strides across the slice.
252
  for (unsigned int c = my_col_start + tid; c < my_col_end; c += blockDim.x) {
 
259
  // All blocks in the cluster must finish loading before any block
260
  // starts reading peer smem inside the T-loop.
261
  cluster.sync();
262
+ #else
263
+ // Pre-Hopper: no smem caching needed — reads go directly to GMEM.
264
+ // Grid sync ensures all blocks have completed Phase 0 init before T-loop.
265
+ grid.sync();
266
+ #endif
267
 
268
  const unsigned int S = cfg.synapses_per_col;
269
  const unsigned int cpc = cfg.cells_per_column;
 
329
  // Ordering: BARRIER 1 completes before we issue the DMA.
330
  // The DMA completes before Stage A reads s_input_tile.
331
  // =========================================================
332
+ #if __CUDA_ARCH__ >= 900
333
  const bool use_input_tile = (cfg.input_bits <= INPUT_BITS_MAX);
334
  if (use_input_tile) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  auto tb = cg::this_thread_block();
336
  cg::memcpy_async(tb, s_input_tile,
337
  inputs + inp_off,
338
  cfg.input_bits);
339
  cg::wait(tb);
 
 
340
  cluster.sync();
341
  }
342
+ #else
343
+ const bool use_input_tile = false;
344
+ #endif
345
 
346
  // =========================================================
347
  // STAGE A: Spatial Pooler
 
359
  float p = syn_perm[base + s];
360
  // T9: read from cluster-broadcast tile when available;
361
  // fall back to direct GMEM when input_bits > INPUT_BITS_MAX.
362
+ #if __CUDA_ARCH__ >= 900
363
  unsigned int inp_byte = use_input_tile
364
  ? (unsigned int)s_input_tile[b]
365
  : (unsigned int)inputs[inp_off + b];
366
+ #else
367
+ unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
368
+ #endif
369
  unsigned int hit = ((inp_byte != 0u) && (p >= cfg.conn_thr)) ? 1u : 0u;
370
  local += hit;
371
  }
372
  unsigned int overlap = warp_sum_u32(local);
373
  overlap = __shfl_sync(0xffffffffu, overlap, 0);
374
 
375
+ // Read boost + threshold for column c.
376
+ #if __CUDA_ARCH__ >= 900
377
+ // Hopper: read from cluster-distributed shared memory.
378
  const unsigned int owner_block = c / cols_per_block;
379
  const unsigned int owner_offset = c - owner_block * cols_per_block;
 
380
  float boost_val = cluster.map_shared_rank(s_boost, owner_block)[owner_offset];
381
  float thr = cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset];
382
+ #else
383
+ // Pre-Hopper: read directly from global memory.
384
+ float boost_val = boost[c];
385
+ float thr = inhibition_threshold[c];
386
+ #endif
387
 
388
  float boosted = (float)overlap * boost_val;
389
  unsigned int is_active = (boosted > thr) ? 1u : 0u;
 
401
  for (unsigned int s = lane; s < S; s += 32u) {
402
  unsigned int b = syn_bit[base + s];
403
  float p = syn_perm[base + s];
404
+ #if __CUDA_ARCH__ >= 900
405
  unsigned int inp_byte = use_input_tile
406
  ? (unsigned int)s_input_tile[b]
407
  : (unsigned int)inputs[inp_off + b];
408
+ #else
409
+ unsigned int inp_byte = (unsigned int)inputs[inp_off + b];
410
+ #endif
411
  if (inp_byte != 0u) {
412
  p += cfg.sp_inc;
413
  if (p > 1.0f) p = 1.0f;
 
420
  }
421
 
422
  // active_duty EMA + threshold adaptation.
423
+ // Writes go to both DSMEM (hot path, Hopper only) and GMEM (persistence).
 
424
  if (lane == 0) {
425
+ #if __CUDA_ARCH__ >= 900
426
  float ad = cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset];
427
+ #else
428
+ float ad = active_duty[c];
429
+ #endif
430
  float sample = is_active ? 1.0f : 0.0f;
431
  ad = (1.0f - cfg.duty_alpha) * ad + cfg.duty_alpha * sample;
432
 
433
+ #if __CUDA_ARCH__ >= 900
434
  // Writeback: peer smem (for next timestep read) + GMEM (persistence).
435
  cluster.map_shared_rank(s_active_duty, owner_block)[owner_offset] = ad;
436
+ #endif
437
  active_duty[c] = ad;
438
 
439
  // Threshold steers toward target sparsity.
 
442
  if (new_thr < 0.1f) new_thr = 0.1f;
443
  if (new_thr > 1000.0f) new_thr = 1000.0f;
444
 
445
+ #if __CUDA_ARCH__ >= 900
446
  // Writeback: peer smem (for next timestep read) + GMEM (persistence).
447
  cluster.map_shared_rank(s_inhib_thr, owner_block)[owner_offset] = new_thr;
448
+ #endif
449
  inhibition_threshold[c] = new_thr;
450
  }
451
  }
452
 
453
  // ---- DSMEM WRITEBACK SYNC: peer-smem writes must be visible cluster-wide ----
454
  //
455
+ // On Hopper: cluster.sync() ensures all peer smem writes from this
456
+ // timestep are visible to all blocks before Stage B / next t.
457
+ // On pre-Hopper: no smem peer writes occur (all state in GMEM),
458
+ // so no extra sync needed here — the grid barrier below suffices.
459
+ #if __CUDA_ARCH__ >= 900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
460
  cluster.sync();
461
+ #endif
462
 
463
  // ---- BARRIER 2: SP active_mask must be visible before TM reads ----
464
  // Fence: flush cols_out + active_duty + inhibition_threshold + step_scratch
 
660
  }
661
 
662
  // Single-region kernel (legacy call site).
663
+ __global__ __launch_bounds__(256, 2)
664
  void htm_fused_step(FusedPtrs P, FusedConfig cfg) {
665
  htm_fused_step_body(P, cfg);
666
  }
 
668
  // Batched kernel: one cooperative launch for B regions. grid.y = B,
669
  // grid.x = per-region block count. Each block reads its region's
670
  // FusedPtrs from the device array via blockIdx.y.
671
+ __global__ __launch_bounds__(256, 2)
672
  void htm_fused_step_batched(const FusedPtrs* __restrict__ P_arr, FusedConfig cfg) {
673
  const FusedPtrs P = P_arr[blockIdx.y];
674
  htm_fused_step_body(P, cfg);
overlay/hydra/engram.py CHANGED
@@ -1,93 +1,80 @@
1
- """GPU Engram — Sparse Modern Hopfield retrieval path.
2
 
3
- ## What changed (scatter-gather → Hopfield matmul)
4
 
5
  The original forward used `self.memory[indices]` (scatter-gather), which misses
6
  L2 cache at n_columns > 4096 and creates a hard tps ceiling.
7
 
8
- The replacement uses:
9
- scores = x @ self.memory.T # (B, T, n_columns) coalesced matmul
10
- weights = entmax15(scores, dim=-1) # sparse attention; 95%+ exact zeros
11
- retrieved = weights @ self.memory # (B, T, d_model) — coalesced matmul
12
 
13
- Both matmuls are tile-friendly (cuBLAS GEMM), so L2 reuse is high regardless of
14
- n_columns. Gradient flows through both matmuls so `self.memory` learns via
15
- autograd in addition to (or instead of) the Hebbian EMA writes.
 
16
 
17
- ## Sparsity mechanism
18
 
19
- alpha-entmax with alpha=1.5 (entmax15) is a sparse attention operator that maps
20
- logit vectors to distributions where many entries are *exactly* zero (not merely
21
- small). It generalises softmax (alpha=1) and argmax (alpha→∞). At n_columns=1024
22
- with d_model=64 a random batch typically hits ≥95% zero entries — the key
23
- property that keeps bandwidth proportional to *attended* columns, not all columns.
24
 
25
- Fallback: if `entmax` is not pip-installed, top-k softmax (k=32) is used instead.
26
- This is chosen at module-import time — NO runtime branching per forward call.
27
 
28
- ## token_ids argument
 
 
 
 
 
 
 
29
 
30
- token_ids is accepted for API compatibility with the rest of the hydra stack
31
- (train.py, lightning_module.py call `engram(x, token_ids)`). It is NOT used in
32
- the retrieval path — the Hopfield path computes dense similarity over the whole
33
- memory bank, which subsumes any hash-based column selection. Documented here to
34
- prevent confusion.
35
 
36
- ## Hebbian writes (hebbian_boost=False by default)
37
 
38
- With Hopfield retrieval, gradient signals reach self.memory through autograd, so
39
- Hebbian EMA writes are no longer critical. They are preserved as an *optional*
40
- boost (hebbian_boost=True) for experiments that want both signals. Default is off.
41
 
42
  ## Checkpoint compatibility
43
 
44
- `self.memory` shape (n_columns, d_model) is unchanged, so existing .pt / .ckpt
45
- files load without modification.
46
  """
47
 
48
  from __future__ import annotations
49
 
 
 
50
  import torch
51
  import torch.nn as nn
52
 
53
- # ---------------------------------------------------------------------------
54
- # Sparse-attention backend — chosen ONCE at import time, no runtime branching.
55
- # ---------------------------------------------------------------------------
56
-
57
- try:
58
- from entmax import entmax15 as _entmax15 # type: ignore[import]
59
 
60
- def _sparse_attention(scores: torch.Tensor) -> torch.Tensor:
61
- """alpha-entmax (alpha=1.5): truly sparse distribution over last dim."""
62
- return _entmax15(scores, dim=-1)
63
-
64
- _BACKEND = "entmax15"
65
-
66
- except ImportError: # pragma: no cover — entmax always installed in CI
67
- _K = 32 # top-k for fallback
68
-
69
- def _sparse_attention(scores: torch.Tensor) -> torch.Tensor: # type: ignore[misc]
70
- """Top-k softmax fallback: zero outside the k highest-scoring columns."""
71
- topk_vals, topk_idx = scores.topk(_K, dim=-1)
72
- topk_w = torch.softmax(topk_vals, dim=-1).to(scores.dtype)
73
- weights = torch.zeros_like(scores)
74
- weights.scatter_(-1, topk_idx, topk_w)
75
- return weights
76
-
77
- _BACKEND = "topk32"
78
 
79
 
80
  class GPUEngram(nn.Module):
81
- """GPU Engram: Sparse Modern Hopfield retrieval.
82
 
83
  Args:
84
  d_model: Model dimension — must match the surrounding transformer.
85
- n_columns: Number of memory columns (key-value pairs). Safe at 32 768
86
- with the matmul path; the old scatter-gather had an L2
87
- cliff above ~4 096.
88
- max_ngram: Retained for API compatibility; unused in retrieval path.
89
  hebbian_boost: If True, also run a Hebbian EMA write on the memory bank
90
- during training (old behaviour, now optional). Default False.
 
91
  """
92
 
93
  def __init__(
@@ -105,16 +92,18 @@ class GPUEngram(nn.Module):
105
  self.memory = nn.Parameter(torch.randn(n_columns, d_model) * 0.01)
106
  self.gate = nn.Linear(d_model, 1, bias=True)
107
  nn.init.constant_(self.gate.bias, 0.0) # START OPEN
 
 
108
  # Retained for any external code that reads these attrs.
109
  self.primes = [2654435761, 2246822519, 3266489917]
110
  self.hebbian_lr = 0.01
111
 
112
  # ------------------------------------------------------------------
113
- # _hash: retained for API/checkpoint compat; unused in forward below.
114
  # ------------------------------------------------------------------
115
 
116
  def _hash(self, token_ids: torch.Tensor) -> torch.Tensor:
117
- """N-gram hash → column index (kept for backward-compat; not used in retrieval)."""
118
  B, T = token_ids.shape
119
  h = token_ids * self.primes[0]
120
  if T > 1:
@@ -132,39 +121,48 @@ class GPUEngram(nn.Module):
132
  # ------------------------------------------------------------------
133
 
134
  def forward(self, x: torch.Tensor, token_ids: torch.Tensor):
135
- """Hopfield retrieve + soft gate + residual.
136
 
137
  Args:
138
  x: (B, T, d_model) — input activations.
139
- token_ids: (B, T) — token indices. Accepted for API compatibility;
140
- NOT used in the retrieval path (see module docstring).
141
 
142
  Returns:
143
  (x + alpha * retrieved, hit_rate)
144
  - x + alpha * retrieved: (B, T, d_model)
145
  - hit_rate: scalar tensor — fraction of gate values > 0.1
146
  """
 
 
147
  # ---- 1. Similarity scores (coalesced GEMM) ----------------------
148
  # scores[b, t, c] = dot(x[b,t], memory[c])
149
  scores = x @ self.memory.T # (B, T, n_columns)
150
 
151
- # ---- 2. Sparse attention weights --------------------------------
152
- # _sparse_attention is fixed at import time (entmax15 or top-k).
153
- weights = _sparse_attention(scores) # (B, T, n_columns), many exact zeros
 
 
 
 
 
 
 
 
154
 
155
- # ---- 3. Retrieved vector (coalesced GEMM) -----------------------
156
- retrieved = weights @ self.memory # (B, T, d_model)
157
 
158
- # ---- 4. Soft gate (unchanged) -----------------------------------
159
  alpha = torch.sigmoid(self.gate(x)) # (B, T, 1)
160
 
161
- # ---- 5. Optional Hebbian EMA write ------------------------------
162
  if self.training and self.hebbian_boost:
163
  with torch.no_grad():
164
- # Reuse the hash-based indices for the write target (sparse update).
165
  indices = self._hash(token_ids)
166
- flat_idx = indices.reshape(-1) # (B*T,)
167
- flat_x = x.detach().reshape(-1, x.shape[-1]) # (B*T, d_model)
168
  mem_dtype = self.memory.data.dtype
169
  updates = (
170
  self.hebbian_lr * flat_x
@@ -172,6 +170,6 @@ class GPUEngram(nn.Module):
172
  ).to(mem_dtype)
173
  self.memory.data.index_add_(0, flat_idx, updates)
174
 
175
- # ---- 6. Residual + hit_rate -------------------------------------
176
  hit_rate = (alpha.detach() > 0.1).float().mean()
177
  return x + alpha * retrieved, hit_rate
 
1
+ """GPU Engram — Top-k Sparse Hopfield retrieval, scales to n_columns >= 32768.
2
 
3
+ ## What changed (scatter-gather → top-k Hopfield)
4
 
5
  The original forward used `self.memory[indices]` (scatter-gather), which misses
6
  L2 cache at n_columns > 4096 and creates a hard tps ceiling.
7
 
8
+ An earlier Hopfield implementation used `entmax15` for sparse attention, but
9
+ entmax's internal `torch.sort` over the full n_columns dimension allocates
10
+ ~1 GB scratch at (B*T=8192, n_columns=32768) and OOMs on a 6 GB card.
 
11
 
12
+ This module replaces the sort-based entmax with **top-k softmax**, which is
13
+ O(B*T*K) in memory and O(B*T*K * log n_columns) in compute (the top-k is
14
+ radix-selection under the hood — not a full sort). Sparsity is still exact:
15
+ only K columns have non-zero weight per (batch, position).
16
 
17
+ ## Why this scales where entmax didn't
18
 
19
+ - `scores = x @ memory.T` is (B, T, n_columns) — 268 MB at bf16 with n_columns=32768.
20
+ - `scores.topk(K)` allocates only (B, T, K) — ~2 MB at K=64. No full sort.
21
+ - `memory[topk_idx]` gathers (B, T, K, d_model) — ~32 MB at bf16. Gather is
22
+ along axis 0 of memory (columns); each gathered row is a contiguous stride-1 d_model vector, cache-friendly.
23
+ - `retrieved = einsum(topk_w, selected_mem)` — ~4 MB. Final reduction.
24
 
25
+ Peak working set well under 400 MB at any reasonable n_columns + K. The weights
26
+ tensor is never densified (which would have been the (B, T, n_columns) killer).
27
 
28
+ ## Gradient flow
29
+
30
+ Both the topk gather and the einsum are autograd-tracked, so `self.memory`
31
+ receives gradient from the LM loss (which the Hebbian scatter-gather path did
32
+ not). `topk` indices are detached — gradient flows through `topk_vals` via the
33
+ selected memory rows.
34
+
35
+ ## Sparsity
36
 
37
+ Exactly K columns have non-zero weight per position. Default K=64, tunable via
38
+ HYDRA_ENGRAM_TOPK.
 
 
 
39
 
40
+ ## token_ids argument
41
 
42
+ Accepted for API compatibility with hydra/model.py; unused in retrieval. The
43
+ optional Hebbian boost (hebbian_boost=True) uses the hash-indexed path for
44
+ its EMA write only.
45
 
46
  ## Checkpoint compatibility
47
 
48
+ `self.memory` shape (n_columns, d_model) is unchanged; existing .pt/.ckpt
49
+ files load without migration.
50
  """
51
 
52
  from __future__ import annotations
53
 
54
+ import os
55
+
56
  import torch
57
  import torch.nn as nn
58
 
 
 
 
 
 
 
59
 
60
+ # Top-k width — how many memory columns get non-zero weight per position.
61
+ # Default 64 matches the entmax sparsity fraction we observed empirically
62
+ # (~0.2% of 32768 columns == 64). HYDRA_ENGRAM_TOPK env var overrides.
63
+ _ENGRAM_TOPK = int(os.environ.get("HYDRA_ENGRAM_TOPK", "64"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
 
66
  class GPUEngram(nn.Module):
67
+ """GPU Engram: Top-k Sparse Hopfield retrieval.
68
 
69
  Args:
70
  d_model: Model dimension — must match the surrounding transformer.
71
+ n_columns: Number of memory columns (key-value pairs). Safe up to
72
+ n_columns = 65536 at d_model = 384 on a 6 GB card with
73
+ B*T <= 8192.
74
+ max_ngram: Retained for API compatibility; unused in retrieval.
75
  hebbian_boost: If True, also run a Hebbian EMA write on the memory bank
76
+ during training. Default False — the top-k gradient path
77
+ provides learning signal without this.
78
  """
79
 
80
  def __init__(
 
92
  self.memory = nn.Parameter(torch.randn(n_columns, d_model) * 0.01)
93
  self.gate = nn.Linear(d_model, 1, bias=True)
94
  nn.init.constant_(self.gate.bias, 0.0) # START OPEN
95
+ # Clamp topk K to n_columns so topk doesn't error at small engram.
96
+ self.topk_k = min(_ENGRAM_TOPK, n_columns)
97
  # Retained for any external code that reads these attrs.
98
  self.primes = [2654435761, 2246822519, 3266489917]
99
  self.hebbian_lr = 0.01
100
 
101
  # ------------------------------------------------------------------
102
+ # _hash: retained for API/checkpoint compat; unused in retrieval path.
103
  # ------------------------------------------------------------------
104
 
105
  def _hash(self, token_ids: torch.Tensor) -> torch.Tensor:
106
+ """N-gram hash → column index (Hebbian-write target only, not retrieval)."""
107
  B, T = token_ids.shape
108
  h = token_ids * self.primes[0]
109
  if T > 1:
 
121
  # ------------------------------------------------------------------
122
 
123
  def forward(self, x: torch.Tensor, token_ids: torch.Tensor):
124
+ """Top-k Hopfield retrieve + soft gate + residual.
125
 
126
  Args:
127
  x: (B, T, d_model) — input activations.
128
+ token_ids: (B, T) — accepted for API compat; only used in the
129
+ optional Hebbian boost path.
130
 
131
  Returns:
132
  (x + alpha * retrieved, hit_rate)
133
  - x + alpha * retrieved: (B, T, d_model)
134
  - hit_rate: scalar tensor — fraction of gate values > 0.1
135
  """
136
+ B, T, D = x.shape
137
+
138
  # ---- 1. Similarity scores (coalesced GEMM) ----------------------
139
  # scores[b, t, c] = dot(x[b,t], memory[c])
140
  scores = x @ self.memory.T # (B, T, n_columns)
141
 
142
+ # ---- 2. Top-k sparse attention ----------------------------------
143
+ # topk uses radix select, not a sort — O(n_columns) memory, not O(n_columns log n_columns).
144
+ # Never materializes a dense (B, T, n_columns) weights tensor.
145
+ topk_vals, topk_idx = scores.topk(self.topk_k, dim=-1) # (B, T, K), (B, T, K)
146
+ topk_w = torch.softmax(topk_vals, dim=-1) # (B, T, K)
147
+
148
+ # ---- 3. Gather selected memory rows -----------------------------
149
+ # memory[topk_idx] is a gather along axis 0 of memory (n_columns, d_model).
150
+ # Output shape (B, T, K, d_model) — K is small, so gather bandwidth is
151
+ # O(B*T*K*d_model), independent of n_columns.
152
+ selected_mem = self.memory[topk_idx] # (B, T, K, d_model)
153
 
154
+ # ---- 4. Weighted sum → retrieved vector -------------------------
155
+ retrieved = torch.einsum('btk,btkd->btd', topk_w, selected_mem) # (B, T, d_model)
156
 
157
+ # ---- 5. Soft gate -----------------------------------------------
158
  alpha = torch.sigmoid(self.gate(x)) # (B, T, 1)
159
 
160
+ # ---- 6. Optional Hebbian EMA write ------------------------------
161
  if self.training and self.hebbian_boost:
162
  with torch.no_grad():
 
163
  indices = self._hash(token_ids)
164
+ flat_idx = indices.reshape(-1) # (B*T,)
165
+ flat_x = x.detach().reshape(-1, D) # (B*T, d_model)
166
  mem_dtype = self.memory.data.dtype
167
  updates = (
168
  self.hebbian_lr * flat_x
 
170
  ).to(mem_dtype)
171
  self.memory.data.index_add_(0, flat_idx, updates)
172
 
173
+ # ---- 7. Residual + hit_rate -------------------------------------
174
  hit_rate = (alpha.detach() > 0.1).float().mean()
175
  return x + alpha * retrieved, hit_rate
overlay/hydra/eval.py CHANGED
@@ -138,6 +138,9 @@ def _run_factual_english_gen(model, tokenizer, max_seq_len: int):
138
  num_samples = FACTUAL_SAMPLES
139
  batch = FACTUAL_BATCH
140
  gen_tokens = FACTUAL_GEN_TOKENS
 
 
 
141
  temps = [0.7, 0.9, 1.1]
142
  hits = 0
143
 
@@ -154,14 +157,18 @@ def _run_factual_english_gen(model, tokenizer, max_seq_len: int):
154
  temp = temps[batch_idx % len(temps)]
155
  batch_idx += 1
156
  ctx = torch.tensor([ids] * b, device="cuda", dtype=torch.long)
 
157
  for _ in range(gen_tokens):
158
- logits = model(ctx, targets=None)
159
  next_logits = logits[:, -1, :] if logits.dim() == 3 else logits
160
  probs = torch.softmax(next_logits.float() / temp, dim=-1)
161
  next_id = torch.multinomial(probs, num_samples=1)
162
  ctx = torch.cat([ctx, next_id], dim=1)
163
  if ctx.size(1) >= max_seq_len:
164
  break
 
 
 
 
165
  # Transfer to CPU in one shot, no per-row sync
166
  all_rows.extend(ctx.cpu().tolist())
167
  samples_done += b
 
138
  num_samples = FACTUAL_SAMPLES
139
  batch = FACTUAL_BATCH
140
  gen_tokens = FACTUAL_GEN_TOKENS
141
+ # Optional fast incremental decode path for recurrence-capable backbones.
142
+ # If disabled, we preserve the original full-context re-forward behavior.
143
+ incremental_decode = os.environ.get("HYDRA_FACTUAL_GEN_INCREMENTAL", "1") == "1"
144
  temps = [0.7, 0.9, 1.1]
145
  hits = 0
146
 
 
157
  temp = temps[batch_idx % len(temps)]
158
  batch_idx += 1
159
  ctx = torch.tensor([ids] * b, device="cuda", dtype=torch.long)
160
+ logits = model(ctx, targets=None)
161
  for _ in range(gen_tokens):
 
162
  next_logits = logits[:, -1, :] if logits.dim() == 3 else logits
163
  probs = torch.softmax(next_logits.float() / temp, dim=-1)
164
  next_id = torch.multinomial(probs, num_samples=1)
165
  ctx = torch.cat([ctx, next_id], dim=1)
166
  if ctx.size(1) >= max_seq_len:
167
  break
168
+ if incremental_decode:
169
+ logits = model(ctx[:, -1:], targets=None)
170
+ else:
171
+ logits = model(ctx, targets=None)
172
  # Transfer to CPU in one shot, no per-row sync
173
  all_rows.extend(ctx.cpu().tolist())
174
  samples_done += b
overlay/hydra/model.py CHANGED
@@ -145,7 +145,7 @@ class PostSemClawModel(nn.Module):
145
  expand=config.expand,
146
  headdim=config.headdim,
147
  is_mimo=False, # SISO path uses stable mamba3_siso_combined kernel
148
- chunk_size=64, # upstream-recommended SISO chunk; 16 violated tl.dot M>=16 constraint
149
  is_outproj_norm=False,
150
  dtype=torch.bfloat16,
151
  )
@@ -173,8 +173,13 @@ class PostSemClawModel(nn.Module):
173
  reset_each_forward=True,
174
  )
175
 
176
- # Gradient bridge: (n_columns + anomaly) -> d_model.
177
- self.htm_proj = nn.Linear(config.htm_n_columns + 1, config.d_model, bias=False)
 
 
 
 
 
178
 
179
  # GPU Engram with Hebbian writes — runs EVERY step.
180
  self.engram = GPUEngram(
@@ -349,11 +354,13 @@ class PostSemClawModel(nn.Module):
349
  nn.init.normal_(block.out_proj.weight, mean=0.0, std=out_std)
350
 
351
  nn.init.normal_(self.htm_proj.weight, mean=0.0, std=s)
 
352
 
353
  # Cast to bf16 to match Mamba3 dtype; Muon groups by shape so mixed
354
  # dtypes in the same shape group would break lerp_ dtype checks.
355
  self.wte.to(dtype=torch.bfloat16)
356
  self.htm_proj.to(dtype=torch.bfloat16)
 
357
  self.engram.to(dtype=torch.bfloat16)
358
 
359
  def set_bos_token_id(self, bos_id: int) -> None:
@@ -402,11 +409,13 @@ class PostSemClawModel(nn.Module):
402
  blocks = sum(p.numel() for p in self.blocks.parameters())
403
  sdr = sum(p.numel() for p in self.sdr_semantic.parameters())
404
  htm_proj = sum(p.numel() for p in self.htm_proj.parameters())
 
405
  engram = sum(p.numel() for p in self.engram.parameters())
406
  total = sum(p.numel() for p in self.parameters())
407
  return {
408
  'wte': wte, 'lm_head': lm_head, 'blocks': blocks,
409
  'sdr_semantic': sdr, 'htm_proj': htm_proj,
 
410
  'engram': engram, 'total': total,
411
  }
412
 
@@ -516,9 +525,13 @@ class PostSemClawModel(nn.Module):
516
 
517
  for shape in sorted({p.shape for p in matrix_params}):
518
  group_params = [p for p in matrix_params if p.shape == shape]
 
 
 
 
519
  param_groups.append(dict(
520
  kind='muon', params=group_params, lr=matrix_lr,
521
- momentum=0.95, ns_steps=5, beta2=0.95, weight_decay=weight_decay,
522
  ))
523
 
524
  optimizer = MuonAdamW(param_groups)
@@ -610,8 +623,10 @@ class PostSemClawModel(nn.Module):
610
  if self._htm_stop_grad:
611
  htm_out = htm_out.detach()
612
 
613
- # Gradient bridge: HTM columns+anomaly -> d_model.
614
- htm_proj_out = self.htm_proj(htm_out.to(dense_emb.dtype))
 
 
615
  x = dense_emb + htm_proj_out
616
  x = norm(x)
617
 
 
145
  expand=config.expand,
146
  headdim=config.headdim,
147
  is_mimo=False, # SISO path uses stable mamba3_siso_combined kernel
148
+ chunk_size=int(os.environ.get("HYDRA_MAMBA3_CHUNK", "64")), # 64 is the validated default; 128 tripped a Triton autotune hang (>8min, no progress)
149
  is_outproj_norm=False,
150
  dtype=torch.bfloat16,
151
  )
 
173
  reset_each_forward=True,
174
  )
175
 
176
+ # Gradient bridge split:
177
+ # (a) sparse HTM columns -> d_model
178
+ # (b) scalar anomaly -> d_model
179
+ # This avoids forcing the anomaly scalar through the same projection
180
+ # statistics as the high-dimensional sparse HTM column vector.
181
+ self.htm_proj = nn.Linear(config.htm_n_columns, config.d_model, bias=False)
182
+ self.htm_anom_proj = nn.Linear(1, config.d_model, bias=False)
183
 
184
  # GPU Engram with Hebbian writes — runs EVERY step.
185
  self.engram = GPUEngram(
 
354
  nn.init.normal_(block.out_proj.weight, mean=0.0, std=out_std)
355
 
356
  nn.init.normal_(self.htm_proj.weight, mean=0.0, std=s)
357
+ nn.init.normal_(self.htm_anom_proj.weight, mean=0.0, std=s)
358
 
359
  # Cast to bf16 to match Mamba3 dtype; Muon groups by shape so mixed
360
  # dtypes in the same shape group would break lerp_ dtype checks.
361
  self.wte.to(dtype=torch.bfloat16)
362
  self.htm_proj.to(dtype=torch.bfloat16)
363
+ self.htm_anom_proj.to(dtype=torch.bfloat16)
364
  self.engram.to(dtype=torch.bfloat16)
365
 
366
  def set_bos_token_id(self, bos_id: int) -> None:
 
409
  blocks = sum(p.numel() for p in self.blocks.parameters())
410
  sdr = sum(p.numel() for p in self.sdr_semantic.parameters())
411
  htm_proj = sum(p.numel() for p in self.htm_proj.parameters())
412
+ htm_anom_proj = sum(p.numel() for p in self.htm_anom_proj.parameters())
413
  engram = sum(p.numel() for p in self.engram.parameters())
414
  total = sum(p.numel() for p in self.parameters())
415
  return {
416
  'wte': wte, 'lm_head': lm_head, 'blocks': blocks,
417
  'sdr_semantic': sdr, 'htm_proj': htm_proj,
418
+ 'htm_anom_proj': htm_anom_proj,
419
  'engram': engram, 'total': total,
420
  }
421
 
 
525
 
526
  for shape in sorted({p.shape for p in matrix_params}):
527
  group_params = [p for p in matrix_params if p.shape == shape]
528
+ # ns_steps: Muon polar-express inner iterations. Default 5 (paper),
529
+ # but 3 converges on small matrices (d_model ~ 384) with ~40% lower
530
+ # optimizer step cost. Env-tunable for experimentation.
531
+ _ns_steps = int(os.environ.get("HYDRA_MUON_NS_STEPS", "3"))
532
  param_groups.append(dict(
533
  kind='muon', params=group_params, lr=matrix_lr,
534
+ momentum=0.95, ns_steps=_ns_steps, beta2=0.95, weight_decay=weight_decay,
535
  ))
536
 
537
  optimizer = MuonAdamW(param_groups)
 
623
  if self._htm_stop_grad:
624
  htm_out = htm_out.detach()
625
 
626
+ # Gradient bridge split: columns and anomaly use separate projections.
627
+ htm_cols = htm_out[..., :-1].to(dense_emb.dtype)
628
+ htm_anom = htm_out[..., -1:].to(dense_emb.dtype)
629
+ htm_proj_out = self.htm_proj(htm_cols) + self.htm_anom_proj(htm_anom)
630
  x = dense_emb + htm_proj_out
631
  x = norm(x)
632
 
overlay/hydra/training.py CHANGED
@@ -779,15 +779,49 @@ def main() -> None:
779
  )
780
 
781
  # Now it's safe to eval — ckpts are on disk regardless of what happens here.
 
 
 
 
 
782
  val_bpb: float | None = None
 
 
783
  try:
784
- torch.cuda.empty_cache() # defrag before eval allocates logit chunks
785
- print(f"[VAL] running eval on {4 * 524288} tokens...", flush=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
786
  model.eval()
787
  _orig = _prepare_mod.EVAL_TOKENS
788
- _prepare_mod.EVAL_TOKENS = 4 * 524288
789
  with autocast_ctx:
790
- val_bpb = evaluate_bpb(model, tokenizer, DEVICE_BATCH_SIZE)
791
  _prepare_mod.EVAL_TOKENS = _orig
792
  val_ppl = 2 ** val_bpb
793
  print(f"[VAL] step={step} val_bpb={val_bpb:.4f} val_ppl={val_ppl:.3f}", flush=True)
@@ -795,7 +829,14 @@ def main() -> None:
795
  print(f"[VAL] SKIPPED (OOM): {e}", flush=True)
796
  torch.cuda.empty_cache()
797
  except Exception as e:
 
798
  print(f"[VAL] SKIPPED ({type(e).__name__}): {e}", flush=True)
 
 
 
 
 
 
799
 
800
  # Final ckpts with val_bpb filled in (if eval succeeded).
801
  save_ckpt(
@@ -843,7 +884,7 @@ def main() -> None:
843
  metrics = model.get_secondary_metrics()
844
 
845
  print("---")
846
- print(f"val_bpb: {val_bpb:.6f}")
847
  print(f"training_seconds: {total_training_time:.1f}")
848
  print(f"total_seconds: {t_end - t_start:.1f}")
849
  print(f"peak_vram_mb: {peak_vram_mb:.1f}")
 
779
  )
780
 
781
  # Now it's safe to eval — ckpts are on disk regardless of what happens here.
782
+ # HYDRA_EVAL_BATCH overrides DEVICE_BATCH_SIZE (env-tunable; default halves
783
+ # the training batch because eval holds activations for full sequence and
784
+ # does not benefit from overlap with backward). HYDRA_EVAL_TOKENS controls
785
+ # how many val tokens to sweep (default 2 * 524288 ≈ 1 M, short enough for autoresearch
786
+ # 5-min budgets).
787
  val_bpb: float | None = None
788
+ _eval_B = int(os.environ.get("HYDRA_EVAL_BATCH", str(max(1, DEVICE_BATCH_SIZE // 2))))
789
+ _eval_tokens = int(os.environ.get("HYDRA_EVAL_TOKENS", str(2 * 524288)))
790
  try:
791
+ # Aggressive VRAM reclaim for 6GB cards. Peak training VRAM = 5.1GB
792
+ # which leaves < 1GB for the eval forward, so the driver can't satisfy
793
+ # the allocation. Free EVERY tensor we don't strictly need:
794
+ # - optimizer grads (set_to_none releases tensor)
795
+ # - optimizer.state (fp32 Muon NS workspace, AdamW moments — ~size-of-params each)
796
+ # - model internal caches (HTM subsample cache, SDR stash)
797
+ # After this, VRAM should be ~params only (bf16 ≈ 120MB at 60M params).
798
+ optimizer.zero_grad(set_to_none=True)
799
+ if hasattr(optimizer, 'state') and optimizer.state:
800
+ for p, st in list(optimizer.state.items()):
801
+ st.clear()
802
+ optimizer.state.clear()
803
+ for p in model.parameters():
804
+ if p.grad is not None:
805
+ p.grad = None
806
+ if hasattr(model, '_htm_cache'):
807
+ model._htm_cache = None
808
+ if hasattr(model, '_last_sdr'):
809
+ model._last_sdr = None
810
+ import gc as _gc
811
+ _gc.collect()
812
+ torch.cuda.empty_cache()
813
+ torch.cuda.synchronize()
814
+ try:
815
+ _free_mb = torch.cuda.mem_get_info()[0] / 1024 / 1024
816
+ print(f"[VAL] free_vram_mb={_free_mb:.0f} (cleared optimizer state)", flush=True)
817
+ except Exception:
818
+ pass
819
+ print(f"[VAL] running eval on {_eval_tokens} tokens at B={_eval_B}...", flush=True)
820
  model.eval()
821
  _orig = _prepare_mod.EVAL_TOKENS
822
+ _prepare_mod.EVAL_TOKENS = _eval_tokens
823
  with autocast_ctx:
824
+ val_bpb = evaluate_bpb(model, tokenizer, _eval_B)
825
  _prepare_mod.EVAL_TOKENS = _orig
826
  val_ppl = 2 ** val_bpb
827
  print(f"[VAL] step={step} val_bpb={val_bpb:.4f} val_ppl={val_ppl:.3f}", flush=True)
 
829
  print(f"[VAL] SKIPPED (OOM): {e}", flush=True)
830
  torch.cuda.empty_cache()
831
  except Exception as e:
832
+ import traceback as _tb
833
  print(f"[VAL] SKIPPED ({type(e).__name__}): {e}", flush=True)
834
+ _tb.print_exc()
835
+ try:
836
+ _free = torch.cuda.mem_get_info()[0] / 1024 / 1024
837
+ print(f"[VAL] post-crash free_vram_mb={_free:.0f}", flush=True)
838
+ except Exception:
839
+ pass
840
 
841
  # Final ckpts with val_bpb filled in (if eval succeeded).
842
  save_ckpt(
 
884
  metrics = model.get_secondary_metrics()
885
 
886
  print("---")
887
+ print(f"val_bpb: {val_bpb:.6f}" if val_bpb is not None else "val_bpb: SKIPPED")
888
  print(f"training_seconds: {total_training_time:.1f}")
889
  print(f"total_seconds: {t_end - t_start:.1f}")
890
  print(f"peak_vram_mb: {peak_vram_mb:.1f}")
overlay/prepare_nemotron.py CHANGED
@@ -20,15 +20,15 @@ Full blend mode (env HYDRA_USE_FULL_BLEND=1):
20
  """
21
  from __future__ import annotations
22
 
23
- import os
24
- import random
25
- from itertools import cycle
26
- from typing import Iterator
 
27
 
28
- import numpy as np
29
  import torch
30
 
31
- import prepare as _p # reuse tokenizer, BOS, byte-length helpers
32
 
33
  NEMOTRON_REPO = "nvidia/Nemotron-Pretraining-Specialized-v1.1"
34
 
@@ -37,14 +37,13 @@ NEMOTRON_REPO = "nvidia/Nemotron-Pretraining-Specialized-v1.1"
37
  # Keys are logical dataset names used by _open_blend_stream / _open_stream.
38
  # ---------------------------------------------------------------------------
39
  FULL_BLEND_WEIGHTS: dict[str, float] = {
40
- "fineweb-edu": 0.55, # HuggingFaceFW/fineweb-edu — PRIMARY (high-quality English)
41
- "wikipedia": 0.25, # wikimedia/wikipedia — factual grounding
42
- "cosmopedia": 0.15, # HuggingFaceTB/cosmopedia — synthetic textbook
43
- "fineweb": 0.05, # HuggingFaceFW/fineweb — general web
44
- # REMOVED code/math: was polluting English generation with Python syntax
45
- # "stack-v2": 0.00,
46
- # "nemotron-math": 0.00,
47
- # "nemotron-specialized": 0.00,
48
  }
49
 
50
  # Mapping from logical blend name → (HF repo, optional config/name, text column).
@@ -66,13 +65,94 @@ PHASE1_WEIGHTS = {
66
  "Nemotron-Pretraining-Formal-Logic": 0.20,
67
  "Nemotron-Pretraining-Multiple-Choice": 0.20,
68
  }
69
- PHASE2_WEIGHTS = {
70
  "Nemotron-Pretraining-Multiple-Choice": 0.45,
71
  "Nemotron-Pretraining-Economics": 0.20,
72
  "Nemotron-Pretraining-Formal-Logic": 0.15,
73
  "Nemotron-Pretraining-Code-Concepts": 0.10,
74
  "Nemotron-Pretraining-Unconditional-Algorithmic": 0.10,
75
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
 
78
  def _phase_weights() -> dict[str, float]:
@@ -83,129 +163,61 @@ def _phase_weights() -> dict[str, float]:
83
  return PHASE2_WEIGHTS if phase == "phase2" else PHASE1_WEIGHTS
84
 
85
 
86
- _PREFETCH_THREAD = None
87
- _PREFETCH_STARTED = set()
88
-
89
-
90
- def _find_local_parquets(repo: str, sub_config: str | None) -> list[str]:
91
- """Return LOCAL parquet paths in HF hub cache for a given repo+config.
92
-
93
- If sub_config filter yields zero matches but parquet files exist in the
94
- repo dir, returns all parquet files (some datasets like fineweb use a
95
- builder config name that doesn't match the filesystem path).
96
- """
97
- import glob
98
- repo_dir = "datasets--" + repo.replace("/", "--")
99
- base = os.path.expanduser(f"~/.cache/huggingface/hub/{repo_dir}/snapshots")
100
- if not os.path.isdir(base):
101
- return []
102
- all_paths = []
103
- for snap in os.listdir(base):
104
- all_paths.extend(glob.glob(os.path.join(base, snap, "**", "*.parquet"), recursive=True))
105
- if sub_config is None:
106
- return sorted(all_paths)
107
- filtered = [p for p in all_paths if f"/{sub_config}/" in p]
108
- # Fallback: if the config name doesn't match filesystem paths, use all parquet
109
- if not filtered and all_paths:
110
- return sorted(all_paths)
111
- return sorted(filtered)
112
-
113
-
114
- def _start_background_prefetch(repo: str, sub_config: str | None):
115
- """Start a daemon thread that downloads parquet shards ahead of consumption.
116
-
117
- Feeds HF's local cache so streaming=True serves from disk, never network.
118
- Idempotent per (repo, sub_config). Runs at throttled speed to not flood.
119
- """
120
- import threading
121
- key = (repo, sub_config)
122
- if key in _PREFETCH_STARTED:
123
- return
124
- _PREFETCH_STARTED.add(key)
125
-
126
- def worker():
127
- try:
128
- from huggingface_hub import HfApi, hf_hub_download
129
- os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
130
- token = os.environ.get("HF_TOKEN")
131
- api = HfApi(token=token)
132
- files = api.list_repo_files(repo, repo_type="dataset")
133
- parquet = sorted(f for f in files if f.endswith(".parquet"))
134
- if sub_config is not None:
135
- filtered = [f for f in parquet if f"/{sub_config}/" in f or f.startswith(f"{sub_config}/")]
136
- if filtered:
137
- parquet = filtered
138
- # Fetch shards one by one, skipping already-cached (hf_hub_download is idempotent)
139
- for f in parquet:
140
- try:
141
- hf_hub_download(repo_id=repo, filename=f, repo_type="dataset", token=token)
142
- except Exception:
143
- pass # skip unavailable shards
144
- except Exception:
145
- pass # prefetch is best-effort, don't disrupt training
146
-
147
- t = threading.Thread(target=worker, daemon=True, name=f"prefetch-{repo}")
148
- t.start()
149
-
150
-
151
- def _open_stream(config: str, split: str):
152
  """Open a streaming iterator over one dataset config.
153
 
154
- Uses HF streaming (reads local cache when shards present, network otherwise).
155
- Starts a background prefetcher that downloads remaining shards in parallel.
156
- """
157
- from datasets import load_dataset
158
- token = os.environ.get("HF_TOKEN")
159
- shuffle_buf = int(os.environ.get("HYDRA_STREAM_SHUFFLE_BUFFER", "2048"))
160
 
161
- if config in _BLEND_REGISTRY:
162
- repo, name, _text_col = _BLEND_REGISTRY[config]
163
- effective_cfg = name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  if config == "nemotron-specialized":
165
- effective_cfg = "Nemotron-Pretraining-Code-Concepts"
166
  repo = NEMOTRON_REPO
 
167
  else:
168
- repo = NEMOTRON_REPO
169
- effective_cfg = config
170
-
171
- # Kick off background prefetch of remaining shards for this dataset
172
- if os.environ.get("HYDRA_BACKGROUND_PREFETCH", "1") == "1":
173
- _start_background_prefetch(repo, effective_cfg)
174
-
175
- local_only = os.environ.get("HYDRA_LOCAL_SHARDS_ONLY", "1") == "1"
176
- if local_only:
177
- local_paths = _find_local_parquets(repo, effective_cfg)
178
- if not local_paths:
179
- raise RuntimeError(
180
- f"No local parquet files for {repo} (config={effective_cfg}). "
181
- f"Run scripts/predownload_shards.py first, or set HYDRA_LOCAL_SHARDS_ONLY=0."
182
- )
183
  ds = load_dataset(
184
- "parquet",
185
- data_files=local_paths,
186
  split="train",
187
  streaming=True,
 
188
  )
189
- else:
190
- kwargs: dict = dict(split="train", streaming=True, token=token)
191
- if effective_cfg is not None:
192
- kwargs["name"] = effective_cfg
193
- ds = load_dataset(repo, **kwargs)
194
  ds = ds.shuffle(seed=42, buffer_size=shuffle_buf)
195
  return iter(ds)
196
 
197
 
198
- def _extract_text(row: dict) -> str:
199
  """Pick the right text column — datasets have different column names.
200
 
201
  Priority order: text, content, prompt_completion, question, body.
202
  For math datasets that split into problem+solution, concatenate both.
203
  Fallback: concatenate all string-valued fields.
204
  """
205
- # Fast path: most datasets use "text" or "content".
206
- for k in ("text", "content", "prompt_completion", "question", "body"):
207
- if k in row and row[k]:
208
- return row[k]
 
209
  # Math datasets may have problem + solution as separate fields.
210
  if "problem" in row and "solution" in row:
211
  p = row["problem"] or ""
@@ -221,15 +233,20 @@ def _extract_text(row: dict) -> str:
221
  return "\n".join(parts)
222
 
223
 
224
- class _WeightedStream:
225
  """Infinite weighted-round-robin over configs' streaming iterators."""
226
 
227
- def __init__(self, weights: dict[str, float], seed: int = 0):
228
- self.configs = list(weights.keys())
229
- self.weights = [weights[c] for c in self.configs]
230
- self.streams = {c: _open_stream(c, "train") for c in self.configs}
231
- self.rng = random.Random(seed)
232
- self.epoch = 1
 
 
 
 
 
233
 
234
  def _reopen(self, config: str):
235
  # stream exhausted — reopen (HF streaming typically infinite but restart on edge)
@@ -245,22 +262,20 @@ class _WeightedStream:
245
  # exist in the Nemotron configs. Controlled by HYDRA_FACTUAL_INJECT_RATE
246
  # (default 50 = inject one factual doc every 50 Nemotron docs = ~2%).
247
  inject_rate = int(os.environ.get("HYDRA_FACTUAL_INJECT_RATE", "50"))
248
- if inject_rate > 0 and not hasattr(self, '_factual_docs'):
249
- factual_path = os.path.join(
250
- os.path.dirname(os.path.abspath(__file__)), "data", "factual", "facts.txt")
251
- if os.path.exists(factual_path):
252
- self._factual_docs = open(factual_path).read().strip().split('\n')
253
- self._factual_idx = 0
254
- self._inject_counter = 0
255
- else:
256
- self._factual_docs = None
257
- if inject_rate > 0 and hasattr(self, '_factual_docs') and self._factual_docs:
258
- self._inject_counter = getattr(self, '_inject_counter', 0) + 1
259
- if self._inject_counter >= inject_rate:
260
- self._inject_counter = 0
261
- doc = self._factual_docs[self._factual_idx % len(self._factual_docs)]
262
- self._factual_idx += 1
263
- return doc, self.epoch
264
 
265
  config = self.rng.choices(self.configs, weights=self.weights, k=1)[0]
266
  try:
@@ -293,9 +308,9 @@ def _document_batches(split: str, tokenizer_batch_size: int = 128) -> Iterator[t
293
  stream = _WeightedStream(_phase_weights(), seed=0)
294
 
295
  prefetch_depth = int(os.environ.get("HYDRA_STREAM_PREFETCH", "32"))
296
- q: queue.Queue = queue.Queue(maxsize=prefetch_depth)
297
- sentinel_stop = object()
298
- error_box: list = []
299
 
300
  def producer():
301
  try:
@@ -320,7 +335,7 @@ def _document_batches(split: str, tokenizer_batch_size: int = 128) -> Iterator[t
320
  if error_box:
321
  raise error_box[0]
322
  return
323
- yield item
324
 
325
 
326
  def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 1000):
@@ -331,47 +346,24 @@ def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 10
331
  stage 2: BPE tokenization → token-id lists (this function's producer thread)
332
  stage 3: best-fit packing → (B, T+1) tensor rows (main thread, consumes)
333
 
334
- Local cache (HYDRA_TOKEN_CACHE_GB, default 2):
335
- Packed (T+1) rows are written to a binary shard on first pass. Subsequent
336
- launches with a non-empty cache mmap that file and cycle through it,
337
- skipping the 5-min streaming cold-start entirely. Cache key includes
338
- (T, vocab_size) so shape changes invalidate the cache automatically.
339
  """
340
  import queue
341
  import threading
342
 
343
  assert split in ("train", "val")
344
  row_capacity = T + 1
345
- bos_token = tokenizer.get_bos_token_id()
346
-
347
- # --- Local packed-token cache (train only; val path skips cache-write) ---
348
- cache_enabled = split == "train"
349
- cache_gb = float(os.environ.get("HYDRA_TOKEN_CACHE_GB", "2"))
350
- cache_dir = os.path.expanduser("~/.cache/autoresearch")
351
- os.makedirs(cache_dir, exist_ok=True)
352
- vocab_size = tokenizer.get_vocab_size()
353
- cache_path = os.path.join(cache_dir, f"packed_tokens_v1_T{T}_V{vocab_size}_{split}.bin")
354
- cache_target_bytes = int(cache_gb * 1024**3)
355
- dtype_np = np.int32 # vocab < 2^31
356
- bytes_per_row = row_capacity * 4 # int32
357
- cache_rows_target = cache_target_bytes // bytes_per_row
358
-
359
- # If train cache exists and is ready, mmap and yield from it
360
- if cache_enabled and os.path.exists(cache_path) and os.path.getsize(cache_path) >= cache_target_bytes // 2:
361
- print(f"[token-cache] using {cache_path} ({os.path.getsize(cache_path) / 1024**3:.2f} GB)")
362
- yield from _mmap_cache_loader(cache_path, B, T, row_capacity, dtype_np)
363
- return # unreachable (mmap loader is infinite), but satisfies generator protocol
364
-
365
- if cache_enabled:
366
- print(f"[token-cache] building {cache_path} (target {cache_gb:.1f} GB) on first pass")
367
  batches = _document_batches(split)
 
368
 
369
  # Stage 2: tokenization prefetch thread. Each queue element is a list of
370
  # token-id lists (pre-tokenized docs). HYDRA_TOKEN_PREFETCH controls depth.
371
  tok_prefetch = int(os.environ.get("HYDRA_TOKEN_PREFETCH", "8"))
372
- tok_q: queue.Queue = queue.Queue(maxsize=tok_prefetch)
373
- tok_sentinel = object()
374
- tok_err_box: list = []
375
 
376
  def tokenizer_producer():
377
  try:
@@ -395,8 +387,8 @@ def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 10
395
  if tok_err_box:
396
  raise tok_err_box[0]
397
  raise StopIteration
398
- token_lists, epoch = item
399
- doc_buffer.extend(token_lists)
400
 
401
  row_buffer = torch.empty((B, row_capacity), dtype=torch.long)
402
  cpu_buffer = torch.empty(2 * B * T, dtype=torch.long, pin_memory=True)
@@ -406,10 +398,6 @@ def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 10
406
  inputs = gpu_buffer[: B * T].view(B, T)
407
  targets = gpu_buffer[B * T :].view(B, T)
408
 
409
- # Open cache file for append-on-build
410
- cache_fh = open(cache_path + ".tmp", "wb") if cache_enabled else None
411
- cache_rows_written = 0
412
-
413
  while True:
414
  for row_idx in range(B):
415
  pos = 0
@@ -437,43 +425,6 @@ def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 10
437
  cpu_inputs.copy_(row_buffer[:, :-1])
438
  cpu_targets.copy_(row_buffer[:, 1:])
439
  gpu_buffer.copy_(cpu_buffer, non_blocking=True)
440
-
441
- # Write packed rows to cache (append) until target size reached
442
- if cache_fh is not None:
443
- np_rows = row_buffer.numpy().astype(np.int32, copy=False)
444
- cache_fh.write(np_rows.tobytes())
445
- cache_rows_written += B
446
- if cache_rows_written >= cache_rows_target:
447
- cache_fh.flush()
448
- cache_fh.close()
449
- os.replace(cache_path + ".tmp", cache_path)
450
- cache_fh = None
451
- print(f"[token-cache] finalized {cache_path} ({cache_rows_written} rows)")
452
-
453
- yield inputs, targets, epoch
454
-
455
-
456
- def _mmap_cache_loader(cache_path: str, B: int, T: int, row_capacity: int, dtype_np):
457
- """Read packed (T+1) rows from mmap cache, cycle forever."""
458
- data = np.memmap(cache_path, dtype=dtype_np, mode="r").reshape(-1, row_capacity)
459
- n_rows = data.shape[0]
460
- cpu_buffer = torch.empty(2 * B * T, dtype=torch.long, pin_memory=True)
461
- gpu_buffer = torch.empty(2 * B * T, dtype=torch.long, device="cuda")
462
- cpu_inputs = cpu_buffer[: B * T].view(B, T)
463
- cpu_targets = cpu_buffer[B * T :].view(B, T)
464
- inputs = gpu_buffer[: B * T].view(B, T)
465
- targets = gpu_buffer[B * T :].view(B, T)
466
- idx = 0
467
- epoch = 1
468
- while True:
469
- if idx + B > n_rows:
470
- idx = 0
471
- epoch += 1
472
- batch = torch.from_numpy(data[idx:idx + B].astype(np.int64, copy=True))
473
- idx += B
474
- cpu_inputs.copy_(batch[:, :-1])
475
- cpu_targets.copy_(batch[:, 1:])
476
- gpu_buffer.copy_(cpu_buffer, non_blocking=True)
477
  yield inputs, targets, epoch
478
 
479
 
@@ -511,22 +462,24 @@ def evaluate_bpb(model, tokenizer, B: int) -> float:
511
  return total_nats / (math.log(2) * max(total_bytes, 1))
512
 
513
 
514
- def ensure_tokenizer():
515
  """Ensure rustbpe tokenizer exists. If absent, train on a Nemotron stream
516
  sample using the same rustbpe.train_from_iterator API that prepare.py uses
517
  (production path — don't fork tokenizer training logic).
518
  """
519
  import pickle
520
  import torch
521
- path = os.path.join(_p.TOKENIZER_DIR, "tokenizer.pkl")
522
- token_bytes_path = os.path.join(_p.TOKENIZER_DIR, "token_bytes.pt")
523
- if os.path.exists(path) and os.path.exists(token_bytes_path):
524
- print(f"[nemotron] tokenizer + token_bytes already trained at {_p.TOKENIZER_DIR}", flush=True)
525
- return
526
- os.makedirs(_p.TOKENIZER_DIR, exist_ok=True)
 
 
527
  print(f"[nemotron] training BPE (vocab_size={_p.VOCAB_SIZE}) on stream sample…", flush=True)
528
- import rustbpe
529
- import tiktoken
530
 
531
  # Pull a sample of docs — use full blend if active so BPE covers all 7 sources.
532
  n_docs = int(os.environ.get("HYDRA_BPE_TRAIN_DOCS", "20000"))
@@ -542,7 +495,8 @@ def ensure_tokenizer():
542
  print(f"[nemotron] collected {len(sample_texts)} sample docs; training BPE…", flush=True)
543
 
544
  # Train rustbpe — identical API to prepare.py's train_tokenizer().
545
- tokenizer = rustbpe.Tokenizer()
 
546
  vocab_size_no_special = _p.VOCAB_SIZE - len(_p.SPECIAL_TOKENS)
547
  tokenizer.train_from_iterator(iter(sample_texts), vocab_size_no_special, pattern=_p.SPLIT_PATTERN)
548
 
@@ -567,6 +521,7 @@ def ensure_tokenizer():
567
  for token_id in range(enc.n_vocab):
568
  tstr = enc.decode([token_id])
569
  token_bytes_list.append(0 if tstr in special_set else len(tstr.encode("utf-8")))
570
- token_bytes_tensor = torch.tensor(token_bytes_list, dtype=torch.int32)
571
- torch.save(token_bytes_tensor, token_bytes_path)
572
- print(f"[nemotron] BPE + token_bytes saved to {_p.TOKENIZER_DIR}", flush=True)
 
 
20
  """
21
  from __future__ import annotations
22
 
23
+ import os
24
+ import random
25
+ import importlib
26
+ from itertools import cycle
27
+ from typing import Any, Iterator, cast
28
 
 
29
  import torch
30
 
31
+ import prepare as _p # reuse tokenizer, BOS, byte-length helpers
32
 
33
  NEMOTRON_REPO = "nvidia/Nemotron-Pretraining-Specialized-v1.1"
34
 
 
37
  # Keys are logical dataset names used by _open_blend_stream / _open_stream.
38
  # ---------------------------------------------------------------------------
39
  FULL_BLEND_WEIGHTS: dict[str, float] = {
40
+ "fineweb-edu": 0.35, # HuggingFaceFW/fineweb-edu
41
+ "fineweb": 0.15, # HuggingFaceFW/fineweb (sample-100BT)
42
+ "stack-v2": 0.15, # bigcode/the-stack-v2
43
+ "nemotron-math": 0.10, # nvidia/Nemotron-CC-Math-v1
44
+ "nemotron-specialized": 0.10, # nvidia/Nemotron-Pretraining-Specialized-v1.1
45
+ "wikipedia": 0.08, # olm/wikipedia
46
+ "cosmopedia": 0.07, # HuggingFaceTB/cosmopedia
 
47
  }
48
 
49
  # Mapping from logical blend name → (HF repo, optional config/name, text column).
 
65
  "Nemotron-Pretraining-Formal-Logic": 0.20,
66
  "Nemotron-Pretraining-Multiple-Choice": 0.20,
67
  }
68
+ PHASE2_WEIGHTS = {
69
  "Nemotron-Pretraining-Multiple-Choice": 0.45,
70
  "Nemotron-Pretraining-Economics": 0.20,
71
  "Nemotron-Pretraining-Formal-Logic": 0.15,
72
  "Nemotron-Pretraining-Code-Concepts": 0.10,
73
  "Nemotron-Pretraining-Unconditional-Algorithmic": 0.10,
74
+ }
75
+
76
# Aliases for the payloads carried by the prefetch queues.
# Plain assignments are used instead of the PEP 695 ``type X = ...`` statement:
# that statement is a SyntaxError before Python 3.12, while the project
# declares ``requires-python = ">=3.11"``.
StreamBatch = tuple[list[str], int]  # (documents, epoch)
TokenBatch = tuple[list[list[int]], int]  # (token-id lists, epoch)
78
+
79
+
80
+ def _tokenizer_cache_repo() -> str:
81
+ return (
82
+ os.environ.get("HYDRA_TOKENIZER_CACHE_REPO")
83
+ or os.environ.get("FEATHER_HF_OUTPUT_REPO")
84
+ or os.environ.get("HF_REPO_ID")
85
+ or os.environ.get("HYDRA_RETINA_CACHE_REPO")
86
+ or os.environ.get("FEATHER_HF_RETINA_CACHE_REPO")
87
+ or ""
88
+ )
89
+
90
+
91
def _tokenizer_cache_prefix() -> str:
    """Return the in-repo folder for cached tokenizer artifacts.

    The folder name embeds ``_p.VOCAB_SIZE`` so that artifacts built for
    different vocabulary sizes live under different prefixes.
    """
    return f"tokenizer/vocab{_p.VOCAB_SIZE}"
93
+
94
+
95
def maybe_hydrate_tokenizer_cache() -> bool:
    """Try to download tokenizer artifacts from HF cache storage.

    Fetches ``tokenizer.pkl`` and ``token_bytes.pt`` from
    ``<repo>/<_tokenizer_cache_prefix()>/`` and places them directly inside
    ``_p.TOKENIZER_DIR``.

    Returns:
        True when both artifacts were downloaded and placed; False when no
        cache repo / ``HF_TOKEN`` is configured, ``huggingface_hub`` cannot be
        imported, or either download fails (best-effort: failures are printed,
        never raised).
    """
    import shutil

    repo_id = _tokenizer_cache_repo()
    token = os.environ.get("HF_TOKEN")
    if not repo_id or not token:
        return False

    try:
        from huggingface_hub import hf_hub_download
    except Exception as e:  # noqa: BLE001
        print(f"[nemotron] tokenizer cache unavailable: {type(e).__name__}: {e}", flush=True)
        return False

    os.makedirs(_p.TOKENIZER_DIR, exist_ok=True)
    prefix = _tokenizer_cache_prefix()
    filenames = ("tokenizer.pkl", "token_bytes.pt")
    try:
        # Download both artifacts before touching the flat locations so that a
        # failure on the second file does not leave a half-hydrated directory.
        downloaded = [
            hf_hub_download(
                repo_id=repo_id,
                repo_type="model",
                subfolder=prefix,
                filename=filename,
                token=token,
                local_dir=_p.TOKENIZER_DIR,
            )
            for filename in filenames
        ]
        # With ``local_dir`` the repo layout is replicated, i.e. the files land
        # in TOKENIZER_DIR/<prefix>/<filename>. The rest of this module looks
        # for TOKENIZER_DIR/<filename>, so copy them to the flat location.
        for filename, src in zip(filenames, downloaded):
            dst = os.path.join(_p.TOKENIZER_DIR, filename)
            if os.path.abspath(src) != os.path.abspath(dst):
                shutil.copyfile(src, dst)
    except Exception as e:  # noqa: BLE001
        print(f"[nemotron] tokenizer cache miss in {repo_id}/{prefix}: {type(e).__name__}: {e}", flush=True)
        return False

    # NOTE(security): tokenizer.pkl is a pickle fetched from a remote repo;
    # loading it executes arbitrary code, so the cache repo must be trusted.
    print(f"[nemotron] hydrated tokenizer cache from {repo_id}/{prefix}", flush=True)
    return True
133
+
134
+
135
def upload_tokenizer_cache() -> None:
    """Push the local tokenizer artifacts to the HF cache repo for later jobs.

    Best-effort: returns silently when no cache repo or ``HF_TOKEN`` is
    configured, or when either artifact is missing locally. Any failure while
    importing ``huggingface_hub`` or uploading is printed and swallowed.
    """
    repo_id = _tokenizer_cache_repo()
    token = os.environ.get("HF_TOKEN")
    if not (repo_id and token):
        return

    artifacts = ("tokenizer.pkl", "token_bytes.pt")
    local_paths = [os.path.join(_p.TOKENIZER_DIR, name) for name in artifacts]
    if not all(os.path.exists(local_path) for local_path in local_paths):
        return

    try:
        from huggingface_hub import HfApi

        api = HfApi(token=token)
        prefix = _tokenizer_cache_prefix()
        for name, local_path in zip(artifacts, local_paths):
            api.upload_file(
                path_or_fileobj=local_path,
                path_in_repo=f"{prefix}/{name}",
                repo_id=repo_id,
                repo_type="model",
            )
        print(f"[nemotron] uploaded tokenizer cache to {repo_id}/{prefix}", flush=True)
    except Exception as e:  # noqa: BLE001
        print(f"[nemotron] tokenizer cache upload skipped: {type(e).__name__}: {e}", flush=True)
156
 
157
 
158
  def _phase_weights() -> dict[str, float]:
 
163
  return PHASE2_WEIGHTS if phase == "phase2" else PHASE1_WEIGHTS
164
 
165
 
166
def _open_stream(config: str, split: str):
    """Open a streaming iterator over one dataset config.

    Handles two modes:
      1. Nemotron sub-configs (e.g. "Nemotron-Pretraining-Code-Concepts")
         loaded from NEMOTRON_REPO with the config name.
      2. Full-blend logical names (e.g. "fineweb-edu", "stack-v2") —
         looked up in _BLEND_REGISTRY for repo / sub-config / text column.

    Yields dicts; text extraction handled downstream by _extract_text.

    NOTE(review): ``split`` is accepted but never read — both branches
    request ``split="train"``.
    """
    # ``datasets`` is imported lazily, at call time rather than module import.
    load_dataset = importlib.import_module("datasets").load_dataset
    token = os.environ.get("HF_TOKEN")
    shuffle_buf = int(os.environ.get("HYDRA_STREAM_SHUFFLE_BUFFER", "2048"))

    if config in _BLEND_REGISTRY:
        # The registry's text column is not needed here (see _extract_text).
        repo, name, _text_col = _BLEND_REGISTRY[config]
        kwargs: dict[str, object] = dict(
            split="train",
            streaming=True,
            token=token,
        )
        if name is not None:
            kwargs["name"] = name
        # nemotron-specialized has multiple sub-configs; pick the first one
        # (diversity blend) when accessed via the full-blend path.
        if config == "nemotron-specialized":
            kwargs["name"] = "Nemotron-Pretraining-Code-Concepts"
            repo = NEMOTRON_REPO
        ds = load_dataset(repo, **kwargs)
    else:
        # Legacy Nemotron sub-config path (Phase 1 / Phase 2).
        ds = load_dataset(
            NEMOTRON_REPO,
            config,
            split="train",
            streaming=True,
            token=token,
        )
    # Fixed seed: the shuffle order is the same on every open of this stream.
    ds = ds.shuffle(seed=42, buffer_size=shuffle_buf)
    return iter(ds)
207
 
208
 
209
+ def _extract_text(row: dict[str, object]) -> str:
210
  """Pick the right text column — datasets have different column names.
211
 
212
  Priority order: text, content, prompt_completion, question, body.
213
  For math datasets that split into problem+solution, concatenate both.
214
  Fallback: concatenate all string-valued fields.
215
  """
216
+ # Fast path: most datasets use "text" or "content".
217
+ for k in ("text", "content", "prompt_completion", "question", "body"):
218
+ value = row.get(k)
219
+ if isinstance(value, str) and value:
220
+ return value
221
  # Math datasets may have problem + solution as separate fields.
222
  if "problem" in row and "solution" in row:
223
  p = row["problem"] or ""
 
233
  return "\n".join(parts)
234
 
235
 
236
+ class _WeightedStream:
237
  """Infinite weighted-round-robin over configs' streaming iterators."""
238
 
239
+ def __init__(self, weights: dict[str, float], seed: int = 0):
240
+ self.configs = list(weights.keys())
241
+ self.weights = [weights[c] for c in self.configs]
242
+ self.streams: dict[str, Iterator[dict[str, object]]] = {
243
+ c: _open_stream(c, "train") for c in self.configs
244
+ }
245
+ self.rng = random.Random(seed)
246
+ self.epoch = 1
247
+ self._factual_docs: list[str] | None = None
248
+ self._factual_idx = 0
249
+ self._inject_counter = 0
250
 
251
  def _reopen(self, config: str):
252
  # stream exhausted — reopen (HF streaming typically infinite but restart on edge)
 
262
  # exist in the Nemotron configs. Controlled by HYDRA_FACTUAL_INJECT_RATE
263
  # (default 50 = inject one factual doc every 50 Nemotron docs = ~2%).
264
  inject_rate = int(os.environ.get("HYDRA_FACTUAL_INJECT_RATE", "50"))
265
+ if inject_rate > 0 and self._factual_docs is None:
266
+ factual_path = os.path.join(
267
+ os.path.dirname(os.path.abspath(__file__)), "data", "factual", "facts.txt")
268
+ if os.path.exists(factual_path):
269
+ self._factual_docs = open(factual_path).read().strip().split('\n')
270
+ self._factual_idx = 0
271
+ self._inject_counter = 0
272
+ if inject_rate > 0 and self._factual_docs:
273
+ self._inject_counter += 1
274
+ if self._inject_counter >= inject_rate:
275
+ self._inject_counter = 0
276
+ doc = self._factual_docs[self._factual_idx % len(self._factual_docs)]
277
+ self._factual_idx += 1
278
+ return doc, self.epoch
 
 
279
 
280
  config = self.rng.choices(self.configs, weights=self.weights, k=1)[0]
281
  try:
 
308
  stream = _WeightedStream(_phase_weights(), seed=0)
309
 
310
  prefetch_depth = int(os.environ.get("HYDRA_STREAM_PREFETCH", "32"))
311
+ q: queue.Queue[StreamBatch | object] = queue.Queue(maxsize=prefetch_depth)
312
+ sentinel_stop = object()
313
+ error_box: list[BaseException] = []
314
 
315
  def producer():
316
  try:
 
335
  if error_box:
336
  raise error_box[0]
337
  return
338
+ yield cast(StreamBatch, item)
339
 
340
 
341
  def make_dataloader(tokenizer, B: int, T: int, split: str, buffer_size: int = 1000):
 
346
  stage 2: BPE tokenization → token-id lists (this function's producer thread)
347
  stage 3: best-fit packing → (B, T+1) tensor rows (main thread, consumes)
348
 
349
+ Queue depths tunable via HYDRA_STREAM_PREFETCH and HYDRA_TOKEN_PREFETCH.
350
+ Goal: zero tps loss from I/O or tokenizer overhead training loop pulls
351
+ from an always-full queue.
 
 
352
  """
353
  import queue
354
  import threading
355
 
356
  assert split in ("train", "val")
357
  row_capacity = T + 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
  batches = _document_batches(split)
359
+ bos_token = tokenizer.get_bos_token_id()
360
 
361
  # Stage 2: tokenization prefetch thread. Each queue element is a list of
362
  # token-id lists (pre-tokenized docs). HYDRA_TOKEN_PREFETCH controls depth.
363
  tok_prefetch = int(os.environ.get("HYDRA_TOKEN_PREFETCH", "8"))
364
+ tok_q: queue.Queue[TokenBatch | object] = queue.Queue(maxsize=tok_prefetch)
365
+ tok_sentinel = object()
366
+ tok_err_box: list[BaseException] = []
367
 
368
  def tokenizer_producer():
369
  try:
 
387
  if tok_err_box:
388
  raise tok_err_box[0]
389
  raise StopIteration
390
+ token_lists, epoch = cast(TokenBatch, item)
391
+ doc_buffer.extend(token_lists)
392
 
393
  row_buffer = torch.empty((B, row_capacity), dtype=torch.long)
394
  cpu_buffer = torch.empty(2 * B * T, dtype=torch.long, pin_memory=True)
 
398
  inputs = gpu_buffer[: B * T].view(B, T)
399
  targets = gpu_buffer[B * T :].view(B, T)
400
 
 
 
 
 
401
  while True:
402
  for row_idx in range(B):
403
  pos = 0
 
425
  cpu_inputs.copy_(row_buffer[:, :-1])
426
  cpu_targets.copy_(row_buffer[:, 1:])
427
  gpu_buffer.copy_(cpu_buffer, non_blocking=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428
  yield inputs, targets, epoch
429
 
430
 
 
462
  return total_nats / (math.log(2) * max(total_bytes, 1))
463
 
464
 
465
+ def ensure_tokenizer():
466
  """Ensure rustbpe tokenizer exists. If absent, train on a Nemotron stream
467
  sample using the same rustbpe.train_from_iterator API that prepare.py uses
468
  (production path — don't fork tokenizer training logic).
469
  """
470
  import pickle
471
  import torch
472
+ path = os.path.join(_p.TOKENIZER_DIR, "tokenizer.pkl")
473
+ token_bytes_path = os.path.join(_p.TOKENIZER_DIR, "token_bytes.pt")
474
+ if os.path.exists(path) and os.path.exists(token_bytes_path):
475
+ print(f"[nemotron] tokenizer + token_bytes already trained at {_p.TOKENIZER_DIR}", flush=True)
476
+ return
477
+ if maybe_hydrate_tokenizer_cache() and os.path.exists(path) and os.path.exists(token_bytes_path):
478
+ return
479
+ os.makedirs(_p.TOKENIZER_DIR, exist_ok=True)
480
  print(f"[nemotron] training BPE (vocab_size={_p.VOCAB_SIZE}) on stream sample…", flush=True)
481
+ import rustbpe
482
+ import tiktoken
483
 
484
  # Pull a sample of docs — use full blend if active so BPE covers all 7 sources.
485
  n_docs = int(os.environ.get("HYDRA_BPE_TRAIN_DOCS", "20000"))
 
495
  print(f"[nemotron] collected {len(sample_texts)} sample docs; training BPE…", flush=True)
496
 
497
  # Train rustbpe — identical API to prepare.py's train_tokenizer().
498
+ tokenizer_cls = getattr(rustbpe, "Tokenizer")
499
+ tokenizer: Any = tokenizer_cls()
500
  vocab_size_no_special = _p.VOCAB_SIZE - len(_p.SPECIAL_TOKENS)
501
  tokenizer.train_from_iterator(iter(sample_texts), vocab_size_no_special, pattern=_p.SPLIT_PATTERN)
502
 
 
521
  for token_id in range(enc.n_vocab):
522
  tstr = enc.decode([token_id])
523
  token_bytes_list.append(0 if tstr in special_set else len(tstr.encode("utf-8")))
524
+ token_bytes_tensor = torch.tensor(token_bytes_list, dtype=torch.int32)
525
+ torch.save(token_bytes_tensor, token_bytes_path)
526
+ print(f"[nemotron] BPE + token_bytes saved to {_p.TOKENIZER_DIR}", flush=True)
527
+ upload_tokenizer_cache()
overlay/pyproject.toml CHANGED
@@ -7,6 +7,7 @@ requires-python = ">=3.11"
7
  dependencies = [
8
  "matplotlib>=3.10.8",
9
  "numpy>=2.2.6",
 
10
  "pandas>=2.3.3",
11
  "pyarrow>=21.0.0",
12
  "requests>=2.32.0",
 
7
  dependencies = [
8
  "matplotlib>=3.10.8",
9
  "numpy>=2.2.6",
10
+ "optuna>=4.4.0",
11
  "pandas>=2.3.3",
12
  "pyarrow>=21.0.0",
13
  "requests>=2.32.0",
overlay/scripts/autoresearch_iter.sh ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Autoresearch single-iteration runner — called from cron every 5 min.
#
# Philosophy (Apr 22 2026 rewrite): HYDRA is NOT a transformer. Semantic
# folding (SDR retina) + HTM episodic engram + GDN memory layers provide
# enormous latent capacity at tiny d_model. DEPTH > WIDTH. Per the user's
# guidance, start absolute-smallest, fill VRAM with depth.
#
# Base config: d_model=128, n_layer=16 (~60M params). Mutations explore
# deeper stacks, engram/GDN layout, SDR sparsity. Eval OOM fixed via
# HYDRA_EVAL_BATCH=1 + HYDRA_CE_CHUNK=64 (was =1024 = no chunking).
#
# NOTE(review): the base values quoted in this header and in the "KEY CHANGES"
# comment further down differ from the env actually passed to train.py
# (HYDRA_D_MODEL=160, HYDRA_N_LAYER=20, HYDRA_BATCH_SIZE=8, HYDRA_CE_CHUNK=32,
# HYDRA_EVAL_TOKENS=8192). The env block is authoritative.

set -u
REPO=/home/mikeb/work/feather
RESULTS=$REPO/results.tsv
LOG_DIR=$REPO/.omc/autoresearch_logs
mkdir -p "$LOG_DIR"
ITER_LOG=$LOG_DIR/iter_$(date +%Y%m%d_%H%M%S).log
# Everything below uses repo-relative paths (run.log, ./.venv, scripts/, git),
# so bail out instead of running from the wrong directory.
cd "$REPO" || { echo "[$(date +%H:%M:%S)] cannot cd to $REPO" >> "$LOG_DIR/skips.log"; exit 1; }

# Skip if training already running — check the actual python process, not shells
# whose argv merely contains the pattern string (e.g. pgrep wait-loops).
if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {found=1} END {exit !found}'; then
    echo "[$(date +%H:%M:%S)] skip — training already running" >> "$LOG_DIR/skips.log"
    exit 0
fi

# Skip if stop-file exists
if [ -f "$REPO/.omc/autoresearch_STOP" ]; then
    echo "[$(date +%H:%M:%S)] STOPPED — .omc/autoresearch_STOP exists" >> "$LOG_DIR/skips.log"
    exit 0
fi

# Compute next experiment index from results.tsv
if [ ! -f "$RESULTS" ]; then
    printf "experiment\tcommit\tval_bpb\ttps_avg\tfactual\tstatus\tdescription\n" > "$RESULTS"
fi
NEXT_EXP=$(awk -F'\t' 'NR>1 && $1~/^[0-9]+$/ {if ($1+0 > max) max=$1+0} END {print max+1}' "$RESULTS")
[ -z "$NEXT_EXP" ] && NEXT_EXP=1

# Mutation pool — explores deep+narrow regime. Each entry is
# "<description>|<space-separated VAR=value overrides>".
# Base: d_model=128, n_layer=16, expand=3, d_state=64, engram=8192, B=16, seq=1024, GDN@5,11
MUTATIONS=(
    "baseline-deep-narrow|"
    "n_layer=16 (shallower-control)|HYDRA_N_LAYER=16"
    "n_layer=24 (max depth)|HYDRA_N_LAYER=24"
    "d_model=96 (leaner)|HYDRA_D_MODEL=96"
    "d_model=160 (slightly wider)|HYDRA_D_MODEL=160"
    "GDN_LAYERS=0,3,6,9,12,15,18 (7 GDN)|HYDRA_GDN_LAYERS=0,3,6,9,12,15,18"
    "GDN_LAYERS=1,3,5,7,9,11,13,15,17 (9 GDN)|HYDRA_GDN_LAYERS=1,3,5,7,9,11,13,15,17"
    "GDN_LAYERS= (all-Mamba3 depth)|HYDRA_GDN_LAYERS="
    "D_STATE=128 (fatter SSM state)|HYDRA_D_STATE=128"
    "D_STATE=32 (leaner SSM state)|HYDRA_D_STATE=32"
    "EXPAND=2 (leaner FFN)|HYDRA_EXPAND=2"
    "EXPAND=4 (fatter FFN)|HYDRA_EXPAND=4"
    "engram=32768 (even wider)|HYDRA_ENGRAM_N_COLUMNS=32768"
    "engram_topk=128 (denser retrieve)|HYDRA_ENGRAM_TOPK=128"
    "D_STATE=96 (mid SSM)|HYDRA_D_STATE=96"
    "HTM_SUBSAMPLE=64 (2x HTM)|HYDRA_HTM_SUBSAMPLE=64"
    "batch=16 (fill VRAM)|HYDRA_BATCH_SIZE=16"
    "batch=4 seq=2048 (long-range)|HYDRA_BATCH_SIZE=4 HYDRA_SEQ_LEN=2048"
    "MATRIX_LR=0.18|HYDRA_MATRIX_LR=0.18"
    "WARMUP_RATIO=0.05|HYDRA_WARMUP_RATIO=0.05"
    "total_batch=16384 (2x opt steps)|HYDRA_TOTAL_BATCH=16384"
    "total_batch=8192 (4x opt steps)|HYDRA_TOTAL_BATCH=8192"
    "HEADDIM=64 (bigger heads)|HYDRA_HEADDIM=64"
    "engram_layer_idx=8 (mid-stack)|HYDRA_ENGRAM_LAYER_IDX=8"
    "EXPAND=4 + n_layer=20 (fat+deep)|HYDRA_EXPAND=4 HYDRA_N_LAYER=20"
    "B=16 + total_batch=16384|HYDRA_BATCH_SIZE=16 HYDRA_TOTAL_BATCH=16384"
    "engram=32768 + EXPAND=4|HYDRA_ENGRAM_N_COLUMNS=32768 HYDRA_EXPAND=4"
    "MTP_K=2 + HEADDIM=64|HYDRA_MTP_K=2 HYDRA_HEADDIM=64"
    "label_smoothing=0.1|HYDRA_LABEL_SMOOTHING=0.1"
    "z_loss=0.001 (10x)|HYDRA_Z_LOSS_WEIGHT=0.001"
    "HTM_STOP_GRAD=1|HYDRA_HTM_STOP_GRAD=1"
    "DROPOUT=0.0|HYDRA_DROPOUT=0.0"
    "TIME=900s long-budget champion|HYDRA_TIME_BUDGET=900 HYDRA_ENGRAM_N_COLUMNS=32768 HYDRA_EXPAND=4"
    "TIME=1200s deep n_layer=24|HYDRA_TIME_BUDGET=1200 HYDRA_N_LAYER=24"
)

# Index into mutation pool (wrap around for continuous search, start at exp13).
# Bash's % keeps the sign of the dividend, so experiments before 13 yield a
# negative index; those are clamped to the baseline entry.
MUT_IDX=$(( (NEXT_EXP - 13) % ${#MUTATIONS[@]} ))
[ "$MUT_IDX" -lt 0 ] && MUT_IDX=0

IFS='|' read -r DESC EXTRA_ENV <<< "${MUTATIONS[$MUT_IDX]}"
echo "[$(date +%H:%M:%S)] Starting exp $NEXT_EXP: $DESC" >> "$ITER_LOG"

# Launch training with mutation
# KEY CHANGES vs prior iter:
#   d_model 384→128 (3x narrower)
#   n_layer 10→16 (1.6x deeper)
#   batch 8→16 (fill VRAM)
#   CE_CHUNK 1024→64 (16x smaller eval logit chunks — fixes OOM)
#   EVAL_BATCH 2→1 (halve eval memory)
#   EVAL_TOKENS 131K (keep, ~3-4s eval)
#
# $EXTRA_ENV is intentionally unquoted: it must word-split into separate
# VAR=value arguments for env, and it comes last so it overrides the base.
rm -f run.log
env \
    LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
    PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
    HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
    HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
    HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
    HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
    HYDRA_TIME_BUDGET=600 \
    HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
    HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
    HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
    HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
    HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
    HYDRA_CKPT_INTERVAL=0 HYDRA_MID_VAL_INTERVAL=0 \
    HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
    HYDRA_Z_LOSS_WEIGHT=0.001 \
    HYDRA_RESUME_CKPT=none \
    $EXTRA_ENV \
    ./.venv/bin/python -u train.py > run.log 2>&1
STATUS=$?

# Parse metrics. The three fields are split with `cut -f`, whose default
# delimiter is TAB, so the fallback must be TAB-separated as well; a
# space-separated "NA NA NA" would land whole in every field and the
# "NA" checks below would never match.
METRICS=$(./.venv/bin/python scripts/parse_metrics.py run.log 2>/dev/null || printf 'NA\tNA\tNA\n')
VAL_BPB=$(echo "$METRICS" | cut -f1)
TPS=$(echo "$METRICS" | cut -f2)
FACTUAL=$(echo "$METRICS" | cut -f3)
# Treat empty fields (parser printed nothing) like a parse failure.
VAL_BPB=${VAL_BPB:-NA}
TPS=${TPS:-NA}
FACTUAL=${FACTUAL:-NA}
COMMIT=$(git rev-parse --short HEAD)
# BPB can be: "NA" (parse fail), "~X.XXXX" (train_bpb fallback when eval OOMs),
# or "X.XXXX" (real val_bpb). The ~ prefix marks the fallback.
if [ "$STATUS" -ne 0 ]; then
    STATUS_STR="crash"
elif [ "$VAL_BPB" = "NA" ]; then
    STATUS_STR="no_metrics"
elif [[ "$VAL_BPB" == "~"* ]]; then
    # The tilde is quoted so it is matched literally, never tilde-expanded.
    STATUS_STR="train_bpb"
else
    STATUS_STR="ok"
fi
printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\n" "$NEXT_EXP" "$COMMIT" "$VAL_BPB" "$TPS" "$FACTUAL" "$STATUS_STR" "$DESC" >> "$RESULTS"
echo "[$(date +%H:%M:%S)] Done exp $NEXT_EXP: bpb=$VAL_BPB tps=$TPS factual=$FACTUAL status=$STATUS_STR" >> "$ITER_LOG"

# Auto-stop condition: great result
if [ "$FACTUAL" != "NA" ]; then
    # FACTUAL looks like "<hits>/<total>"; a non-numeric hits value makes the
    # -ge test fail quietly (stderr suppressed) and the loop keeps running.
    HITS=$(echo "$FACTUAL" | cut -d/ -f1)
    if [ -n "$HITS" ] && [ "$HITS" -ge 7 ] 2>/dev/null; then
        touch "$REPO/.omc/autoresearch_STOP"
        echo "[$(date +%H:%M:%S)] STOP: reached factual>=7/9 at exp $NEXT_EXP" >> "$ITER_LOG"
    fi
fi
overlay/scripts/benchmark_hyena_stack.py CHANGED
@@ -26,8 +26,11 @@ Invocation:
26
  # On A100/A10G (production cloud hardware), use time=900 (15 min) for
27
  # stable steady-state numbers.
28
 
29
- After each run the script prints:
30
- BENCHMARK config=<name> tps_steady=<avg> bpb_at_500=<val> vram_peak=<MiB>
 
 
 
31
 
32
  Collate those lines into the matrix table manually, then pick the winner
33
  for the 6-hour production run (HYDRA_TIME_BUDGET=21600).
@@ -81,7 +84,7 @@ CONFIGS = {
81
  }
82
 
83
 
84
- def build_env(cfg_overrides: dict) -> dict:
85
  """Compose a full env dict from the inherited env + config overrides."""
86
  env = os.environ.copy()
87
  # Ensure the Hyena layer selection is always present (defaults to off).
@@ -91,7 +94,7 @@ def build_env(cfg_overrides: dict) -> dict:
91
  return env
92
 
93
 
94
- def parse_step_line(line: str) -> dict | None:
95
  """Parse a single step=... line into a dict of metrics, or None."""
96
  if not line.startswith("step="):
97
  return None
@@ -102,7 +105,7 @@ def parse_step_line(line: str) -> dict | None:
102
  return None
103
 
104
 
105
- def summarize(log_path: Path, warmup_steps: int = 50) -> dict:
106
  """Tail log_path, compute steady-state TPS / BPB@500 / VRAM peak.
107
 
108
  Skips the first `warmup_steps` to discard CUDA graph capture / autotune
@@ -138,20 +141,29 @@ def summarize(log_path: Path, warmup_steps: int = 50) -> dict:
138
  tps_sorted = sorted(tps_vals)
139
  tps_steady = tps_sorted[len(tps_sorted) // 2] # median
140
 
141
- return {
142
- "tps_steady": tps_steady,
143
- "bpb_at_500": bpb_at_500 or (bpbs[-1] if bpbs else 0.0),
144
- "vram_peak": vram_peak,
145
- "steps": len(tps_vals) + warmup_steps,
146
- }
147
-
148
-
149
- def main() -> int:
150
- ap = argparse.ArgumentParser()
151
- ap.add_argument("--config", required=True, choices=list(CONFIGS))
152
- ap.add_argument("--time", type=int, default=300, help="training seconds")
153
- ap.add_argument("--log", default=None, help="output log path (default: run_bench_<cfg>.log)")
154
- args = ap.parse_args()
 
 
 
 
 
 
 
 
 
155
 
156
  cfg = CONFIGS[args.config]
157
  log_path = Path(args.log or (REPO / f"run_bench_{args.config}.log"))
@@ -178,16 +190,25 @@ def main() -> int:
178
  print(f"BENCH FAIL config={args.config}", flush=True)
179
  return proc.returncode
180
 
181
- summary = summarize(log_path)
182
- print(
183
- f"BENCHMARK config={args.config} "
184
- f"tps_steady={summary['tps_steady']:.0f} "
185
- f"bpb_at_500={summary['bpb_at_500']:.4f} "
186
- f"vram_peak={summary['vram_peak']:.0f}MiB "
187
- f"steps={summary['steps']}",
188
- flush=True,
189
- )
190
- return 0
 
 
 
 
 
 
 
 
 
191
 
192
 
193
  if __name__ == "__main__":
 
26
  # On A100/A10G (production cloud hardware), use time=900 (15 min) for
27
  # stable steady-state numbers.
28
 
29
+ After each run the script prints:
30
+ BENCHMARK config=<name> tps_steady=<median> bpb_at_500=<val> vram_peak=<MiB> steps=<n>
31
+
32
+ If `--min-tps` is set (>0), the script exits non-zero when steady-state TPS
33
+ falls below the threshold.
34
 
35
  Collate those lines into the matrix table manually, then pick the winner
36
  for the 6-hour production run (HYDRA_TIME_BUDGET=21600).
 
84
  }
85
 
86
 
87
+ def build_env(cfg_overrides: dict[str, str]) -> dict[str, str]:
88
  """Compose a full env dict from the inherited env + config overrides."""
89
  env = os.environ.copy()
90
  # Ensure the Hyena layer selection is always present (defaults to off).
 
94
  return env
95
 
96
 
97
+ def parse_step_line(line: str) -> dict[str, float] | None:
98
  """Parse a single step=... line into a dict of metrics, or None."""
99
  if not line.startswith("step="):
100
  return None
 
105
  return None
106
 
107
 
108
+ def summarize(log_path: Path, warmup_steps: int = 50) -> dict[str, float]:
109
  """Tail log_path, compute steady-state TPS / BPB@500 / VRAM peak.
110
 
111
  Skips the first `warmup_steps` to discard CUDA graph capture / autotune
 
141
  tps_sorted = sorted(tps_vals)
142
  tps_steady = tps_sorted[len(tps_sorted) // 2] # median
143
 
144
+ return {
145
+ "tps_steady": tps_steady,
146
+ "bpb_at_500": bpb_at_500 or (bpbs[-1] if bpbs else 0.0),
147
+ "vram_peak": vram_peak,
148
+ "steps": len(tps_vals) + warmup_steps,
149
+ }
150
+
151
+
152
def fails_tps_floor(summary: dict[str, float], min_tps: float) -> bool:
    """Report whether the run's steady-state TPS is below the required floor.

    A ``min_tps`` of zero or less disables the gate (always False). A summary
    without a ``tps_steady`` entry is treated as 0.0 TPS.
    """
    gate_disabled = min_tps <= 0
    if gate_disabled:
        return False
    observed = summary.get("tps_steady", 0.0)
    return float(observed) < float(min_tps)
157
+
158
+
159
+ def main() -> int:
160
+ ap = argparse.ArgumentParser()
161
+ ap.add_argument("--config", required=True, choices=list(CONFIGS))
162
+ ap.add_argument("--time", type=int, default=300, help="training seconds")
163
+ ap.add_argument("--log", default=None, help="output log path (default: run_bench_<cfg>.log)")
164
+ ap.add_argument("--min-tps", type=float, default=50000.0, help="Required steady-state TPS floor (set 0 to disable)")
165
+ ap.add_argument("--warmup-steps", type=int, default=50, help="Number of initial steps to skip before TPS median")
166
+ args = ap.parse_args()
167
 
168
  cfg = CONFIGS[args.config]
169
  log_path = Path(args.log or (REPO / f"run_bench_{args.config}.log"))
 
190
  print(f"BENCH FAIL config={args.config}", flush=True)
191
  return proc.returncode
192
 
193
+ summary = summarize(log_path, warmup_steps=max(0, int(args.warmup_steps)))
194
+ print(
195
+ f"BENCHMARK config={args.config} "
196
+ f"tps_steady={summary['tps_steady']:.0f} "
197
+ f"bpb_at_500={summary['bpb_at_500']:.4f} "
198
+ f"vram_peak={summary['vram_peak']:.0f}MiB "
199
+ f"steps={summary['steps']}",
200
+ flush=True,
201
+ )
202
+
203
+ if fails_tps_floor(summary, args.min_tps):
204
+ print(
205
+ f"BENCH FAIL config={args.config} tps_steady={summary['tps_steady']:.0f} < min_tps={args.min_tps:.0f}",
206
+ flush=True,
207
+ )
208
+ return 2
209
+
210
+ print(f"BENCH PASS config={args.config} min_tps={args.min_tps:.0f}", flush=True)
211
+ return 0
212
 
213
 
214
  if __name__ == "__main__":
overlay/scripts/export_hpo_priors.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import datetime as dt
6
+ import json
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import optuna
11
+
12
+
13
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the prior-export script.

    ``--study-name`` may be repeated; when it is never given the resulting
    list is empty.
    """
    cli = argparse.ArgumentParser(description="Export top Optuna trials as transfer-learning priors")
    default_out = Path("docs") / "hpo_transfer_priors.json"
    cli.add_argument(
        "--study-name",
        action="append",
        default=[],
        help="Repeat to merge multiple studies",
    )
    cli.add_argument("--storage", default="sqlite:///optuna_hpo.db")
    cli.add_argument("--top-k", type=int, default=20)
    cli.add_argument("--out", type=Path, default=default_out)
    cli.add_argument("--metric", default="val_bpb")
    return cli.parse_args()
21
+
22
+
23
def _completed_trials(study: optuna.Study) -> list[optuna.trial.FrozenTrial]:
    """Return the study's COMPLETE trials, ranked best-first.

    Trials are selected by state rather than by "has a value": Optuna records
    the last reported intermediate value as the value of a PRUNED trial, so a
    value-only filter would rank early-stopped trials among finished ones.

    Ordering follows the study direction (descending for MAXIMIZE, ascending
    otherwise).
    """
    complete = optuna.trial.TrialState.COMPLETE
    trials = [
        t for t in study.trials
        if t.state == complete and t.value is not None
    ]
    reverse = study.direction == optuna.study.StudyDirection.MAXIMIZE
    return sorted(trials, key=lambda t: float(t.value), reverse=reverse)
27
+
28
+
29
+ def _serialize_trial(trial: optuna.trial.FrozenTrial) -> dict[str, Any]:
30
+ return {
31
+ "trial_number": trial.number,
32
+ "value": float(trial.value) if trial.value is not None else None,
33
+ "params": dict(trial.params),
34
+ "user_attrs": dict(trial.user_attrs),
35
+ }
36
+
37
+
38
def main() -> int:
    """Export the top trials of one or more studies to a JSON priors file.

    For every requested study (default: ``hydra_hpo``) the best ``--top-k``
    trials are serialized and tagged with their study name, then written
    together with summary counts to ``--out``. Returns 0.
    """
    args = parse_args()
    study_names = args.study_name if args.study_name else ["hydra_hpo"]
    limit = max(0, args.top_k)  # a negative --top-k selects nothing

    merged_trials: list[dict[str, Any]] = []
    n_total = 0
    n_completed = 0
    for name in study_names:
        study = optuna.load_study(study_name=name, storage=args.storage)
        ranked = _completed_trials(study)
        n_total += len(study.trials)
        n_completed += len(ranked)
        merged_trials.extend(
            {**_serialize_trial(trial), "study_name": name}
            for trial in ranked[:limit]
        )

    payload = {
        "schema_version": 1,
        "generated_at": dt.datetime.now(dt.UTC).isoformat(timespec="seconds"),
        "study_names": study_names,
        "metric": args.metric,
        "n_total_trials": n_total,
        "n_completed_trials": n_completed,
        "top_k_per_study": args.top_k,
        "trials": merged_trials,
    }

    out_path = args.out
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    print(f"[hpo-priors] wrote {args.out} with {len(merged_trials)} merged trials")
    return 0
71
+
72
+
73
+ if __name__ == "__main__":
74
+ raise SystemExit(main())
overlay/scripts/hpo_orchestrator.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ import os
7
+ import subprocess
8
+ import sys
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ import optuna
13
+
14
+
15
+ REPO_ROOT = Path(__file__).resolve().parents[1]
16
+ if str(REPO_ROOT) not in sys.path:
17
+ sys.path.insert(0, str(REPO_ROOT))
18
+
19
+ from scripts.hf_routing import resolve_routing
20
+
21
+ HPO_SCRIPT = REPO_ROOT / "scripts" / "optuna_hpo.py"
22
+
23
+
24
def _run_worker(args: list[str]) -> int:
    """Run the HPO worker script with ``args`` and return its exit code.

    The worker is started with the current interpreter from the repository
    root and inherits this process's stdio; the call blocks until it exits.
    """
    worker_cmd = [sys.executable, str(HPO_SCRIPT)]
    worker_cmd.extend(args)
    return subprocess.run(worker_cmd, cwd=str(REPO_ROOT), text=True).returncode
28
+
29
+
30
def _study_stats(storage: str, study_name: str) -> dict[str, Any]:
    """Summarize an Optuna study as a plain dict.

    Args:
        storage: Optuna storage URL.
        study_name: Name of the study to load.

    Returns:
        A dict with the study name, direction and trial counts. When the study
        does not exist (``load_study`` raises ``KeyError``) a placeholder with
        ``"status": "missing"`` and zero counts is returned. The ``best_*``
        keys are present only when at least one trial is COMPLETE.
    """
    try:
        study = optuna.load_study(study_name=study_name, storage=storage)
    except KeyError:
        return {
            "study_name": study_name,
            "status": "missing",
            "direction": None,
            "n_trials": 0,
            "n_completed": 0,
            "n_pruned": 0,
            "n_failed": 0,
        }

    state = optuna.trial.TrialState
    trials = study.trials
    # Count by state, not by "has a value": Optuna stores the last intermediate
    # value as the value of a PRUNED trial, which would otherwise be counted as
    # completed — and study.best_* raises ValueError when no trial is COMPLETE.
    completed = [t for t in trials if t.state == state.COMPLETE]
    pruned = [t for t in trials if t.state == state.PRUNED]
    failed = [t for t in trials if t.state == state.FAIL]

    stats: dict[str, Any] = {
        "study_name": study.study_name,
        "direction": str(study.direction),
        "n_trials": len(trials),
        "n_completed": len(completed),
        "n_pruned": len(pruned),
        "n_failed": len(failed),
    }
    if completed:
        best = study.best_trial
        stats.update(
            {
                "best_value": best.value,
                "best_params": best.params,
                "best_trial_number": best.number,
                "best_trial_user_attrs": best.user_attrs,
            }
        )
    return stats
65
+
66
+
67
+ def _phase_args(phase: str, base: argparse.Namespace) -> list[str]:
68
+ common = [
69
+ "--study-name",
70
+ base.study_name,
71
+ "--storage",
72
+ base.storage,
73
+ "--metric",
74
+ base.metric,
75
+ "--direction",
76
+ base.direction,
77
+ "--seed",
78
+ str(base.seed),
79
+ "--min-tps",
80
+ str(base.min_tps),
81
+ "--summary-out",
82
+ str(base.summary_out),
83
+ "--runner",
84
+ base.runner,
85
+ "--hf-namespace",
86
+ base.hf_namespace,
87
+ "--hf-image",
88
+ base.hf_image,
89
+ "--hf-flavor",
90
+ base.hf_flavor,
91
+ "--hf-timeout",
92
+ base.hf_timeout,
93
+ "--hf-command",
94
+ base.hf_command,
95
+ "--hf-token-env",
96
+ base.hf_token_env,
97
+ "--hf-poll-interval",
98
+ str(base.hf_poll_interval),
99
+ "--hf-launcher-script",
100
+ str(base.hf_launcher_script),
101
+ "--priors-file",
102
+ str(base.priors_file),
103
+ ]
104
+ if base.hf_output_repo:
105
+ common.extend(["--hf-output-repo", base.hf_output_repo])
106
+ if base.hf_use_bash:
107
+ common.append("--hf-use-bash")
108
+ if base.hf_stop_after_metric:
109
+ common.append("--hf-stop-after-metric")
110
+ else:
111
+ common.append("--no-hf-stop-after-metric")
112
+ if base.apply_priors:
113
+ common.append("--apply-priors")
114
+ else:
115
+ common.append("--no-apply-priors")
116
+ if phase == "phase1":
117
+ return [
118
+ *common,
119
+ "--trials",
120
+ str(base.phase1_trials),
121
+ "--trial-time-budget",
122
+ str(base.phase1_trial_time_budget),
123
+ "--trial-timeout",
124
+ str(base.phase1_trial_timeout),
125
+ "--n-startup-trials",
126
+ str(base.phase1_n_startup),
127
+ "--n-warmup-steps",
128
+ str(base.phase1_n_warmup),
129
+ "--patience-trials",
130
+ str(base.phase1_patience),
131
+ "--min-improvement",
132
+ str(base.phase1_min_improvement),
133
+ ]
134
+ if phase == "phase2":
135
+ return [
136
+ *common,
137
+ "--trials",
138
+ str(base.phase2_trials),
139
+ "--trial-time-budget",
140
+ str(base.phase2_trial_time_budget),
141
+ "--trial-timeout",
142
+ str(base.phase2_trial_timeout),
143
+ "--n-startup-trials",
144
+ str(base.phase2_n_startup),
145
+ "--n-warmup-steps",
146
+ str(base.phase2_n_warmup),
147
+ "--patience-trials",
148
+ str(base.phase2_patience),
149
+ "--min-improvement",
150
+ str(base.phase2_min_improvement),
151
+ ]
152
+ raise ValueError(f"Unknown phase: {phase}")
153
+
154
+
155
def cmd_phase(args: argparse.Namespace) -> int:
    """Run one phase with a single worker, then record the study summary.

    The summary (``{"phase": ..., "stats": ...}``) is written to
    ``args.summary_out`` and echoed to stdout regardless of the worker's
    exit status.

    Returns:
        The worker's exit code.
    """
    exit_code = _run_worker(_phase_args(args.phase, args))
    summary = json.dumps(
        {"phase": args.phase, "stats": _study_stats(args.storage, args.study_name)},
        indent=2,
    )
    out_path = args.summary_out
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(summary, encoding="utf-8")
    print(summary)
    return exit_code
162
+
163
+
164
def cmd_parallel(args: argparse.Namespace) -> int:
    """Run one phase with ``args.workers`` concurrent ``optuna_hpo.py`` workers.

    Every worker receives the same argument list (and therefore the same
    study name and storage).  After all workers exit, the study summary and
    the per-worker exit codes are written to ``args.summary_out`` and echoed
    to stdout.

    Returns:
        ``0`` when every worker exited with status 0, ``1`` when any worker
        failed, and ``2`` when ``args.workers`` is less than 1.
    """
    # With zero workers nothing would run, yet ``all([])`` is true and the
    # command would report success; reject the request instead.
    if args.workers < 1:
        print(f"--workers must be >= 1 (got {args.workers})", file=sys.stderr)
        return 2

    worker_cmd = [sys.executable, str(HPO_SCRIPT), *_phase_args(args.phase, args)]
    procs: list[subprocess.Popen[str]] = []
    try:
        for _ in range(args.workers):
            procs.append(subprocess.Popen(worker_cmd, cwd=str(REPO_ROOT), text=True))
    except OSError:
        # A later spawn failed: stop the workers that did start so they are
        # not left running unattended, then surface the original error.
        for proc in procs:
            proc.terminate()
        for proc in procs:
            proc.wait()
        raise

    exit_codes = [proc.wait() for proc in procs]
    payload = {
        "phase": args.phase,
        "workers": args.workers,
        "exit_codes": exit_codes,
        "stats": _study_stats(args.storage, args.study_name),
    }
    summary = json.dumps(payload, indent=2)
    args.summary_out.parent.mkdir(parents=True, exist_ok=True)
    args.summary_out.write_text(summary, encoding="utf-8")
    print(summary)
    return 0 if all(code == 0 for code in exit_codes) else 1
183
+
184
+
185
def cmd_recommend(args: argparse.Namespace) -> int:
    """Emit a next-step recommendation based on the study's current state.

    Three outcomes are possible:

    * the study does not exist yet -> ``create_study_first``;
    * fewer than 10 completed trials -> ``insufficient_data``;
    * otherwise -> ``ready_for_full_optimization``.

    The resulting ``{"stats": ..., "recommendation": ...}`` payload is written
    to ``args.summary_out`` and printed to stdout.

    Returns:
        Always ``0``.
    """

    def _emit(recommendation: dict[str, Any]) -> int:
        # Persist and echo the payload; shared by every outcome.
        text = json.dumps({"stats": stats, "recommendation": recommendation}, indent=2)
        args.summary_out.parent.mkdir(parents=True, exist_ok=True)
        args.summary_out.write_text(text, encoding="utf-8")
        print(text)
        return 0

    def _export_cmd(top_k: int) -> str:
        # Command line that exports the study's top-k trials as priors.
        return (
            f"python scripts/export_hpo_priors.py --storage {args.storage} "
            f"--study-name {args.study_name} --top-k {top_k} --out docs/hpo_transfer_priors.json"
        )

    stats = _study_stats(args.storage, args.study_name)
    min_tps_floor = float(args.min_tps)

    if stats.get("status") == "missing":
        return _emit(
            {
                "status": "create_study_first",
                "next_step": "Run phase1 (serial or parallel) to create and populate the study.",
                "example": (
                    "python scripts/hpo_orchestrator.py parallel --phase phase1 --workers 3 "
                    f"--storage {args.storage} --study-name {args.study_name}"
                ),
            }
        )

    if int(stats.get("n_completed", 0)) >= 10:
        return _emit(
            {
                "status": "ready_for_full_optimization",
                "next_step": "Run phase2 with 3-4 parallel workers.",
                "suggested_full_run": {
                    "trials": 60,
                    "workers": 4,
                    "trial_time_budget": 300,
                    "trial_timeout": 900,
                    "min_tps": min_tps_floor,
                    "patience_trials": 12,
                    "min_improvement": 0.0005,
                },
                "transfer_learning": {
                    "refresh_priors": _export_cmd(20),
                    "notes": "Carry priors into new studies unless architecture/objective diverges significantly.",
                },
            }
        )

    return _emit(
        {
            "status": "insufficient_data",
            "next_step": "Run phase1 with 2-4 parallel workers until >=10 completed trials.",
            "early_stop_policy": {
                "patience_trials": 8,
                "min_improvement": 0.001,
            },
            "throughput_guard": {
                "min_tps": min_tps_floor,
                "note": "Trials below this TPS floor are pruned.",
            },
            "transfer_learning": {
                "export_priors": _export_cmd(10),
                "use_priors": "Enabled by default in scripts/optuna_hpo.py (override with --no-apply-priors)",
            },
        }
    )
245
+
246
+
247
def build_parser() -> argparse.ArgumentParser:
    """Build the orchestrator CLI with ``phase``, ``parallel`` and ``recommend``.

    All three sub-commands share the same option set (study, runner, HF job
    and per-phase settings).  The HF namespace, image and output-repo defaults
    come from ``resolve_routing`` called with the ``HF_TOKEN`` environment
    variable, so routing is resolved every time the parser is built.
    """
    routing = resolve_routing(token=os.environ.get("HF_TOKEN"))

    # (option suffix, type, phase-1 default, phase-2 default)
    phase_options = (
        ("trials", int, 30, 60),
        ("trial-time-budget", int, 180, 300),
        ("trial-timeout", int, 600, 900),
        ("n-startup", int, 5, 8),
        ("n-warmup", int, 0, 0),
        ("patience", int, 8, 12),
        ("min-improvement", float, 0.001, 0.0005),
    )

    def add_common(p: argparse.ArgumentParser) -> None:
        add = p.add_argument
        add("--study-name", default="hydra_hpo")
        add("--storage", default="sqlite:///optuna_hpo.db")
        add("--metric", default="val_bpb")
        add("--direction", choices=["minimize", "maximize"], default="minimize")
        add("--seed", type=int, default=42)
        add("--min-tps", type=float, default=50000.0)
        add("--summary-out", type=Path, default=REPO_ROOT / ".tmp" / "optuna" / "orchestrator_summary.json")
        add("--runner", choices=["local", "hf-job", "hf-launcher"], default="local")
        add("--hf-namespace", default=routing.job_namespace)
        add("--hf-image", default=f"hf.co/spaces/{routing.space_repo}")
        add("--hf-flavor", default="a10g-large")
        add("--hf-timeout", default="25m")
        add("--hf-command", default="/app/entrypoint.py")
        add("--hf-use-bash", action="store_true")
        add("--hf-token-env", default="HF_TOKEN")
        add("--hf-poll-interval", type=int, default=12)
        add("--hf-launcher-script", type=Path, default=REPO_ROOT / "scripts" / "launch_feather_hf_job.py")
        add("--hf-output-repo", default=routing.output_repo)
        add("--priors-file", type=Path, default=REPO_ROOT / "docs" / "hpo_transfer_priors.json")
        # Paired on/off switches, both defaulting to "on".
        add("--apply-priors", action="store_true", default=True)
        add("--no-apply-priors", action="store_false", dest="apply_priors")
        add("--hf-stop-after-metric", action="store_true", default=True)
        add("--no-hf-stop-after-metric", action="store_false", dest="hf_stop_after_metric")

        # Per-phase settings: all phase-1 options first, then all phase-2.
        for column, phase in enumerate(("phase1", "phase2")):
            for suffix, kind, *defaults in phase_options:
                add(f"--{phase}-{suffix}", type=kind, default=defaults[column])

    root = argparse.ArgumentParser(description="Phase-oriented orchestration for Optuna HPO")
    commands = root.add_subparsers(dest="cmd", required=True)

    serial = commands.add_parser("phase", help="Run a single phase serially")
    add_common(serial)
    serial.add_argument("--phase", choices=["phase1", "phase2"], required=True)
    serial.set_defaults(func=cmd_phase)

    fan_out = commands.add_parser("parallel", help="Run a phase with N parallel workers")
    add_common(fan_out)
    fan_out.add_argument("--phase", choices=["phase1", "phase2"], required=True)
    fan_out.add_argument("--workers", type=int, default=3)
    fan_out.set_defaults(func=cmd_parallel)

    advise = commands.add_parser("recommend", help="Recommend full-run settings from current study")
    add_common(advise)
    advise.set_defaults(func=cmd_recommend)
    return root
310
+
311
+
312
def main() -> int:
    """Parse the command line and dispatch to the selected sub-command.

    Returns:
        The handler's result coerced to ``int``, for use as the exit status.
    """
    namespace = build_parser().parse_args()
    handler = namespace.func
    return int(handler(namespace))
316
+
317
+
318
+ if __name__ == "__main__":
319
+ raise SystemExit(main())
overlay/scripts/launch_feather_hf_job.py CHANGED
@@ -2,37 +2,104 @@
2
  from __future__ import annotations
3
 
4
  import os
 
5
  import sys
6
  import time
 
 
7
  from pathlib import Path
8
 
9
  from huggingface_hub import HfApi
10
- from huggingface_hub._space_api import SpaceHardware
11
- from huggingface_hub.errors import HfHubHTTPError
12
 
13
- # ../../../../ from overlay/scripts/launch_feather_hf_job.py -> repository root
14
- REPO_ROOT = Path(__file__).resolve().parents[4]
15
  if str(REPO_ROOT) not in sys.path:
16
  sys.path.insert(0, str(REPO_ROOT))
17
 
18
  from scripts.hf_routing import resolve_routing
 
19
 
20
  DEFAULT_IMAGE = os.environ.get('FEATHER_HF_IMAGE', 'ghcr.io/slapglif/feather-hf-runtime:latest')
21
- IMAGE_DIR = REPO_ROOT / 'hf_jobs' / 'feather_h200_image'
22
  TIMEOUT = os.environ.get('FEATHER_HF_JOB_TIMEOUT', '12h')
23
- FLAVOR_RAW = os.environ.get('FEATHER_HF_FLAVOR', 'h200')
24
  TARGET_SHARDS = os.environ.get('HYDRA_TARGET_SHARDS', '2048')
25
  TIME_BUDGET = os.environ.get('HYDRA_TIME_BUDGET', '43200')
26
  DOWNLOAD_WORKERS = os.environ.get('HYDRA_DOWNLOAD_WORKERS', '16')
27
  CKPT_INTERVAL = os.environ.get('HYDRA_CKPT_INTERVAL', '1000')
 
28
  DRY_RUN = os.environ.get('FEATHER_HF_DRY_RUN', '0') == '1'
29
  USE_SPACE_IMAGE = os.environ.get('FEATHER_HF_USE_SPACE_IMAGE', '0') == '1'
30
  # When true, assume the Space image has already been built by a previous
31
  # invocation and skip the upload+build wait. Used by sweep drivers that fan
32
  # out many jobs against a single pre-uploaded image.
33
  SKIP_UPLOAD = os.environ.get('FEATHER_HF_SKIP_UPLOAD', '0') == '1'
34
- JOB_SUBMIT_RETRIES = max(1, int(os.environ.get('FEATHER_HF_JOB_SUBMIT_RETRIES', '5')))
35
- JOB_SUBMIT_RETRY_BASE_S = max(1.0, float(os.environ.get('FEATHER_HF_JOB_SUBMIT_RETRY_BASE_S', '3')))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
 
38
  def require_token() -> str:
@@ -59,115 +126,52 @@ def wait_for_space(api: HfApi, repo_id: str, token: str, timeout_s: int = 1800)
59
  """
60
  start = time.time()
61
  seen_build_completion = False
 
62
  while True:
63
- try:
64
- runtime = api.get_space_runtime(repo_id, token=token)
65
- except HfHubHTTPError as exc:
66
- code = getattr(getattr(exc, 'response', None), 'status_code', None)
67
- if isinstance(code, int) and code >= 500:
68
- if time.time() - start > timeout_s:
69
- raise TimeoutError(
70
- f'Space {repo_id} runtime endpoint unstable for {timeout_s}s '
71
- f'(last HTTP {code})'
72
- ) from exc
73
- print(f'[space] runtime endpoint HTTP {code}; retrying...', flush=True)
74
- time.sleep(20)
75
- continue
76
- raise
77
  stage = getattr(runtime, 'stage', None)
78
- hardware = getattr(runtime, 'hardware', None)
79
- err = getattr(runtime, 'errorMessage', None) or getattr(runtime, 'error_message', None)
80
- print(f'[space] stage={stage} hardware={hardware}', flush=True)
81
- if stage in {'APP_STARTING', 'RUNNING', 'PAUSED', 'SLEEPING'}:
82
- seen_build_completion = True
83
- if stage in {'RUNNING', 'PAUSED', 'SLEEPING'}:
84
- return
 
 
85
  # Image is built — Jobs can use it regardless of Space boot outcome.
86
- if seen_build_completion and stage in {'RUNTIME_ERROR', 'APP_STARTING_ERROR'}:
87
- msg = (
88
- f'[space] Space boot failed with {stage} but built image is '
89
- 'available in the Space registry and is usable by HF Jobs.'
90
- )
91
- print(msg, flush=True)
92
- return
93
  # Hard build failures — no image was produced.
94
  if stage in {'BUILD_ERROR', 'CONFIG_ERROR', 'NO_APP_FILE'}:
95
  raise RuntimeError(f'Space {repo_id} build failed: stage={stage} error={err!r}')
96
  if time.time() - start > timeout_s:
97
  raise TimeoutError(f'Space {repo_id} did not become ready in {timeout_s}s (last stage={stage})')
98
- time.sleep(20)
99
-
100
-
101
- def submit_job_with_retry(
102
- api: HfApi,
103
- *,
104
- image: str,
105
- command: list[str],
106
- env: dict[str, str],
107
- secrets: dict[str, str],
108
- flavor: SpaceHardware,
109
- timeout: str,
110
- token: str,
111
- namespace: str,
112
- ):
113
- last_exc: Exception | None = None
114
- for attempt in range(1, JOB_SUBMIT_RETRIES + 1):
115
- try:
116
- return api.run_job(
117
- image=image,
118
- command=command,
119
- env=env,
120
- secrets=secrets,
121
- flavor=flavor,
122
- timeout=timeout,
123
- token=token,
124
- namespace=namespace,
125
- )
126
- except HfHubHTTPError as exc:
127
- last_exc = exc
128
- code = getattr(getattr(exc, 'response', None), 'status_code', None)
129
- if not (isinstance(code, int) and code >= 500):
130
- raise
131
- if attempt >= JOB_SUBMIT_RETRIES:
132
- raise SystemExit(
133
- f'HF Jobs backend returned HTTP {code} after {JOB_SUBMIT_RETRIES} '
134
- 'submit attempts; failing fast.'
135
- ) from exc
136
- wait_s = JOB_SUBMIT_RETRY_BASE_S * attempt
137
- print(
138
- f'[launch] HF Jobs backend returned HTTP {code}; retrying submit in '
139
- f'{wait_s:.1f}s (attempt {attempt}/{JOB_SUBMIT_RETRIES})',
140
- flush=True,
141
- )
142
- time.sleep(wait_s)
143
-
144
- if last_exc is not None:
145
- raise last_exc
146
- raise RuntimeError('submit_job_with_retry exhausted without a result')
147
 
148
 
149
  def main() -> int:
150
  token = require_token()
151
  routing = resolve_routing(token=token)
152
  api = HfApi(token=token)
 
153
 
154
  print(f'[launch] image_dir={IMAGE_DIR}', flush=True)
155
  print(f'[launch] owner={routing.owner}', flush=True)
156
  print(f'[launch] space_repo={routing.space_repo}', flush=True)
157
  print(f'[launch] output_repo={routing.output_repo}', flush=True)
158
  print(f'[launch] retina_cache_repo={routing.retina_cache_repo}', flush=True)
 
 
159
  print(f'[launch] namespace={routing.job_namespace}', flush=True)
160
- try:
161
- flavor = SpaceHardware(FLAVOR_RAW)
162
- except ValueError as exc:
163
- valid = ", ".join([hw.value for hw in SpaceHardware])
164
- raise SystemExit(f'Invalid FEATHER_HF_FLAVOR={FLAVOR_RAW!r}. Valid values: {valid}') from exc
165
-
166
- print(f'[launch] target_shards={TARGET_SHARDS} time_budget={TIME_BUDGET} timeout={TIMEOUT} flavor={flavor.value}', flush=True)
167
- print(f'[launch] image_mode={"space" if USE_SPACE_IMAGE else "ghcr"}', flush=True)
168
- if not USE_SPACE_IMAGE:
169
- print(f'[launch] image={DEFAULT_IMAGE}', flush=True)
170
-
171
  api.create_repo(repo_id=routing.space_repo, repo_type='space', space_sdk='docker', private=True, exist_ok=True, token=token)
172
  api.create_repo(repo_id=routing.output_repo, repo_type='model', private=True, exist_ok=True, token=token)
173
 
@@ -175,17 +179,19 @@ def main() -> int:
175
  print('[launch] dry-run mode; skipping upload and job submission', flush=True)
176
  return 0
177
 
178
- image_ref = DEFAULT_IMAGE
179
- if USE_SPACE_IMAGE:
180
- if SKIP_UPLOAD:
181
- print('[launch] FEATHER_HF_SKIP_UPLOAD=1; reusing existing Space image', flush=True)
182
- else:
183
- print('[launch] uploading custom Docker Space image context...', flush=True)
 
 
184
  api.upload_folder(
185
  repo_id=routing.space_repo,
186
  repo_type='space',
187
  folder_path=str(IMAGE_DIR),
188
- commit_message='Update Feather H200 training runtime image',
189
  token=token,
190
  )
191
 
@@ -205,8 +211,38 @@ def main() -> int:
205
  'HYDRA_DOWNLOAD_WORKERS': DOWNLOAD_WORKERS,
206
  'HYDRA_CKPT_INTERVAL': CKPT_INTERVAL,
207
  'PYTHONUNBUFFERED': '1',
208
- 'FEATHER_RUNTIME_MODE': 'job',
209
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  # Pass through any HYDRA_* / FEATHER_* overrides from the caller's env so
211
  # sweep drivers can set HYDRA_N_LAYER, HYDRA_SDR_TARGET_ACTIVE,
212
  # HYDRA_LAYER_DIAGNOSTICS, HYDRA_METRICS_OUT, HYDRA_MID_VAL_INTERVAL, etc.
@@ -216,17 +252,16 @@ def main() -> int:
216
  env[_k] = _v
217
  secrets = {'HF_TOKEN': token}
218
 
219
- print('[launch] submitting HF Job...', flush=True)
220
- job = submit_job_with_retry(
221
- api,
222
  image=image_ref,
223
  command=['python', '/app/entrypoint.py'],
224
  env=env,
225
  secrets=secrets,
226
- flavor=flavor,
227
  timeout=TIMEOUT,
228
- token=token,
229
  namespace=routing.job_namespace,
 
230
  )
231
  print(f'[launch] submitted job_id={job.id} status={job.status.stage} url={job.url}', flush=True)
232
  return 0
 
2
  from __future__ import annotations
3
 
4
  import os
5
+ import shutil
6
  import sys
7
  import time
8
+ import json
9
+ from typing import Any, cast
10
  from pathlib import Path
11
 
12
  from huggingface_hub import HfApi
 
 
13
 
14
+ REPO_ROOT = Path(__file__).resolve().parents[1]
 
15
  if str(REPO_ROOT) not in sys.path:
16
  sys.path.insert(0, str(REPO_ROOT))
17
 
18
  from scripts.hf_routing import resolve_routing
19
+ from configs.harness_config import HarnessConfig
20
 
21
  DEFAULT_IMAGE = os.environ.get('FEATHER_HF_IMAGE', 'ghcr.io/slapglif/feather-hf-runtime:latest')
22
+ IMAGE_DIR = Path(__file__).resolve().parents[1] / 'hf_jobs' / 'feather_h200_image'
23
  TIMEOUT = os.environ.get('FEATHER_HF_JOB_TIMEOUT', '12h')
 
24
  TARGET_SHARDS = os.environ.get('HYDRA_TARGET_SHARDS', '2048')
25
  TIME_BUDGET = os.environ.get('HYDRA_TIME_BUDGET', '43200')
26
  DOWNLOAD_WORKERS = os.environ.get('HYDRA_DOWNLOAD_WORKERS', '16')
27
  CKPT_INTERVAL = os.environ.get('HYDRA_CKPT_INTERVAL', '1000')
28
+ JOB_FLAVOR = os.environ.get('FEATHER_HF_FLAVOR', 'a10g-small')
29
  DRY_RUN = os.environ.get('FEATHER_HF_DRY_RUN', '0') == '1'
30
  USE_SPACE_IMAGE = os.environ.get('FEATHER_HF_USE_SPACE_IMAGE', '0') == '1'
31
  # When true, assume the Space image has already been built by a previous
32
  # invocation and skip the upload+build wait. Used by sweep drivers that fan
33
  # out many jobs against a single pre-uploaded image.
34
  SKIP_UPLOAD = os.environ.get('FEATHER_HF_SKIP_UPLOAD', '0') == '1'
35
+ SYNC_OVERLAY = os.environ.get('FEATHER_HF_SYNC_OVERLAY', '1') == '1'
36
+
37
+
38
def should_enable_fast_start_streaming(target_shards: str, time_budget: str) -> bool:
    """Decide whether a launch profile is small enough for the streaming data path.

    Both arguments are integer strings.  The result is ``True`` only when the
    shard count is in ``1..256`` and the time budget is in ``1..1800``; values
    that do not parse as integers yield ``False``.
    """
    try:
        n_shards, seconds = int(target_shards), int(time_budget)
    except ValueError:
        return False
    return 0 < n_shards <= 256 and 0 < seconds <= 1800
46
+
47
+
48
def sync_overlay_from_repo() -> None:
    """Rebuild ``IMAGE_DIR / 'overlay'`` from the project files under ``REPO_ROOT``.

    Steps:

    1. Create the overlay directory if needed and delete everything in it.
    2. Copy each entry of the include list that exists under ``REPO_ROOT``
       (directories recursively, skipping caches, virtualenvs, VCS data and
       build output); missing entries are skipped silently.
    3. Rewrite every ``*.sh`` file under ``overlay/scripts`` with LF line
       endings.

    A one-line summary of the copied paths is printed at the end.
    """
    overlay = IMAGE_DIR / 'overlay'
    overlay.mkdir(parents=True, exist_ok=True)

    for child in overlay.iterdir():
        # A symlink to a directory reports is_dir() == True, but
        # shutil.rmtree refuses to operate on symlinks; drop the link itself
        # (and leave its target alone) instead of aborting the whole sync.
        if child.is_dir() and not child.is_symlink():
            shutil.rmtree(child)
        else:
            child.unlink()

    include_paths = [
        'hydra',
        'subsystems',
        'scripts',
        'htm_rust',
        'harness',
        'configs',
        'prepare.py',
        'prepare_nemotron.py',
        'train.py',
        'pyproject.toml',
        'uv.lock',
    ]
    ignore = shutil.ignore_patterns(
        '__pycache__',
        '.pytest_cache',
        '.ruff_cache',
        '.venv',
        '.git',
        'target',
        '*.pyc',
    )

    copied: list[str] = []
    for rel in include_paths:
        src = REPO_ROOT / rel
        dst = overlay / rel
        if not src.exists():
            continue
        if src.is_dir():
            shutil.copytree(src, dst, dirs_exist_ok=True, ignore=ignore)
        else:
            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(src, dst)
        copied.append(rel)

    # Normalise CRLF / lone CR to LF in the copied shell scripts.
    scripts_dir = overlay / 'scripts'
    if scripts_dir.exists():
        for sh_path in scripts_dir.rglob('*.sh'):
            data = sh_path.read_bytes()
            data = data.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
            sh_path.write_bytes(data)

    print(f'[launch] overlay synced from repo ({len(copied)} paths): {copied}', flush=True)
103
 
104
 
105
  def require_token() -> str:
 
126
  """
127
  start = time.time()
128
  seen_build_completion = False
129
+ seen_building = False
130
  while True:
131
+ runtime = api.get_space_runtime(repo_id, token=token)
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  stage = getattr(runtime, 'stage', None)
133
+ hardware = getattr(runtime, 'hardware', None)
134
+ err = getattr(runtime, 'errorMessage', None) or getattr(runtime, 'error_message', None)
135
+ print(f'[space] stage={stage} hardware={hardware}', flush=True)
136
+ if stage == 'BUILDING':
137
+ seen_building = True
138
+ if stage in {'APP_STARTING', 'RUNNING', 'PAUSED', 'SLEEPING'}:
139
+ seen_build_completion = True
140
+ if stage in {'RUNNING', 'PAUSED', 'SLEEPING'}:
141
+ return
142
  # Image is built — Jobs can use it regardless of Space boot outcome.
143
+ if (seen_build_completion or seen_building) and stage in {'RUNTIME_ERROR', 'APP_STARTING_ERROR'}:
144
+ print(f'[space] Space boot failed with {stage} but built image is '
145
+ f'available in the Space registry and is usable by HF Jobs.',
146
+ flush=True)
147
+ return
 
 
148
  # Hard build failures — no image was produced.
149
  if stage in {'BUILD_ERROR', 'CONFIG_ERROR', 'NO_APP_FILE'}:
150
  raise RuntimeError(f'Space {repo_id} build failed: stage={stage} error={err!r}')
151
  if time.time() - start > timeout_s:
152
  raise TimeoutError(f'Space {repo_id} did not become ready in {timeout_s}s (last stage={stage})')
153
+ time.sleep(20)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
 
156
  def main() -> int:
157
  token = require_token()
158
  routing = resolve_routing(token=token)
159
  api = HfApi(token=token)
160
+ secondary_gates = HarnessConfig().to_secondary_gates()
161
 
162
  print(f'[launch] image_dir={IMAGE_DIR}', flush=True)
163
  print(f'[launch] owner={routing.owner}', flush=True)
164
  print(f'[launch] space_repo={routing.space_repo}', flush=True)
165
  print(f'[launch] output_repo={routing.output_repo}', flush=True)
166
  print(f'[launch] retina_cache_repo={routing.retina_cache_repo}', flush=True)
167
+ print(f'[launch] target_shards={TARGET_SHARDS} time_budget={TIME_BUDGET} timeout={TIMEOUT}', flush=True)
168
+ print(f'[launch] flavor={JOB_FLAVOR}', flush=True)
169
  print(f'[launch] namespace={routing.job_namespace}', flush=True)
170
+ print(f'[launch] image_mode={"space" if USE_SPACE_IMAGE else "ghcr"}', flush=True)
171
+ print(f'[launch] secondary_gates={json.dumps(secondary_gates, sort_keys=True)}', flush=True)
172
+ if not USE_SPACE_IMAGE:
173
+ print(f'[launch] image={DEFAULT_IMAGE}', flush=True)
174
+
 
 
 
 
 
 
175
  api.create_repo(repo_id=routing.space_repo, repo_type='space', space_sdk='docker', private=True, exist_ok=True, token=token)
176
  api.create_repo(repo_id=routing.output_repo, repo_type='model', private=True, exist_ok=True, token=token)
177
 
 
179
  print('[launch] dry-run mode; skipping upload and job submission', flush=True)
180
  return 0
181
 
182
+ image_ref = DEFAULT_IMAGE
183
+ if USE_SPACE_IMAGE:
184
+ if SKIP_UPLOAD:
185
+ print('[launch] FEATHER_HF_SKIP_UPLOAD=1; reusing existing Space image', flush=True)
186
+ else:
187
+ if SYNC_OVERLAY:
188
+ sync_overlay_from_repo()
189
+ print('[launch] uploading custom Docker Space image context...', flush=True)
190
  api.upload_folder(
191
  repo_id=routing.space_repo,
192
  repo_type='space',
193
  folder_path=str(IMAGE_DIR),
194
+ commit_message='Update Feather training runtime image',
195
  token=token,
196
  )
197
 
 
211
  'HYDRA_DOWNLOAD_WORKERS': DOWNLOAD_WORKERS,
212
  'HYDRA_CKPT_INTERVAL': CKPT_INTERVAL,
213
  'PYTHONUNBUFFERED': '1',
214
+ 'FEATHER_RUNTIME_MODE': 'job',
215
+ }
216
+ if 'HYDRA_USE_NEMOTRON' not in os.environ and should_enable_fast_start_streaming(TARGET_SHARDS, TIME_BUDGET):
217
+ env['HYDRA_USE_NEMOTRON'] = '1'
218
+ print('[launch] auto-enabled HYDRA_USE_NEMOTRON=1 for short-budget fast-start profile', flush=True)
219
+ # A10 compatibility profile: avoid known PTX/compile runtime pitfalls and
220
+ # keep throughput path enabled. Caller can explicitly override each key by
221
+ # setting it in the parent environment.
222
+ if JOB_FLAVOR.startswith('a10'):
223
+ _a10_defaults = {
224
+ 'HYDRA_MUON_COMPILE': '0',
225
+ 'HYDRA_FORCE_HTM_CPU': '1',
226
+ 'HYDRA_INERT_MAMBA': '1',
227
+ 'HYDRA_ALLOW_SYNTHETIC_RETINA': '1',
228
+ 'HYDRA_FASTPATH': '1',
229
+ }
230
+ for _k, _default in _a10_defaults.items():
231
+ if _k in os.environ:
232
+ env[_k] = os.environ[_k]
233
+ else:
234
+ env.setdefault(_k, _default)
235
+ if env.get('HYDRA_INERT_MAMBA') == '0' and 'HYDRA_FASTPATH' not in os.environ:
236
+ env['HYDRA_FASTPATH'] = '0'
237
+ print(
238
+ '[launch] applied A10 env profile '
239
+ f"(HYDRA_MUON_COMPILE={env['HYDRA_MUON_COMPILE']}, "
240
+ f"HYDRA_FORCE_HTM_CPU={env['HYDRA_FORCE_HTM_CPU']}, "
241
+ f"HYDRA_INERT_MAMBA={env['HYDRA_INERT_MAMBA']}, "
242
+ f"HYDRA_ALLOW_SYNTHETIC_RETINA={env['HYDRA_ALLOW_SYNTHETIC_RETINA']}, "
243
+ f"HYDRA_FASTPATH={env['HYDRA_FASTPATH']})",
244
+ flush=True,
245
+ )
246
  # Pass through any HYDRA_* / FEATHER_* overrides from the caller's env so
247
  # sweep drivers can set HYDRA_N_LAYER, HYDRA_SDR_TARGET_ACTIVE,
248
  # HYDRA_LAYER_DIAGNOSTICS, HYDRA_METRICS_OUT, HYDRA_MID_VAL_INTERVAL, etc.
 
252
  env[_k] = _v
253
  secrets = {'HF_TOKEN': token}
254
 
255
+ print(f'[launch] submitting HF Job on flavor={JOB_FLAVOR}...', flush=True)
256
+ job = api.run_job(
 
257
  image=image_ref,
258
  command=['python', '/app/entrypoint.py'],
259
  env=env,
260
  secrets=secrets,
261
+ flavor=cast(Any, JOB_FLAVOR),
262
  timeout=TIMEOUT,
 
263
  namespace=routing.job_namespace,
264
+ token=token,
265
  )
266
  print(f'[launch] submitted job_id={job.id} status={job.status.stage} url={job.url}', flush=True)
267
  return 0
overlay/scripts/long_train.sh CHANGED
@@ -1,38 +1,38 @@
1
- #!/usr/bin/env bash
2
- # Long-training run for full-architecture completion attempt.
3
- #
4
- # The 5-minute autoresearch budget is for mutation screening — it's nowhere
5
- # near enough compute for this small model (~6M params) to produce coherent
6
- # English. This script runs the SAME full-architecture train.py with an
7
- # extended budget so the "factual English" completion criterion can actually
8
- # be tested end-to-end.
9
- #
10
- # Usage:
11
- # ./scripts/long_train.sh # default 1-hour budget
12
- # HYDRA_TIME_BUDGET=7200 ./scripts/long_train.sh # 2 hours
13
- # HYDRA_D_MODEL=384 HYDRA_N_LAYER=6 ./scripts/long_train.sh # scale model
14
- #
15
- # Output: run_long_<timestamp>.log in repo root. Includes factual_english_score.
16
- set -euo pipefail
17
-
18
- cd "$(dirname "$0")/.."
19
-
20
- TIME_BUDGET="${HYDRA_TIME_BUDGET:-3600}"
21
- STAMP="$(date +%Y%m%d_%H%M%S)"
22
- LOG="run_long_${STAMP}.log"
23
-
24
- export HYDRA_TIME_BUDGET="${TIME_BUDGET}"
25
-
26
- echo "=== HYDRA long-training run ==="
27
- echo "time_budget: ${TIME_BUDGET}s ($((TIME_BUDGET / 60))m)"
28
- echo "d_model: ${HYDRA_D_MODEL:-256 (default)}"
29
- echo "n_layer: ${HYDRA_N_LAYER:-4 (default)}"
30
- echo "d_state: ${HYDRA_D_STATE:-64 (default)}"
31
- echo "log: ${LOG}"
32
- echo
33
-
34
- .venv/bin/python train.py 2>&1 | tee "${LOG}"
35
-
36
- echo
37
- echo "=== Summary ==="
38
- grep -E "^val_bpb:|^factual_english_score:|^factual_english_hits:|^peak_vram_mb:|^num_steps:" "${LOG}"
 
1
#!/usr/bin/env bash
# Long-training run for full-architecture completion attempt.
#
# The 5-minute autoresearch budget is for mutation screening — it's nowhere
# near enough compute for this small model (~6M params) to produce coherent
# English. This script runs the SAME full-architecture train.py with an
# extended budget so the "factual English" completion criterion can actually
# be tested end-to-end.
#
# Usage:
#   ./scripts/long_train.sh                                       # default 1-hour budget
#   HYDRA_TIME_BUDGET=7200 ./scripts/long_train.sh                # 2 hours
#   HYDRA_D_MODEL=384 HYDRA_N_LAYER=6 ./scripts/long_train.sh     # scale model
#
# Output: run_long_<timestamp>.log in repo root. Includes factual_english_score.
set -euo pipefail

# Run from the repository root regardless of the caller's working directory.
cd "$(dirname "$0")/.."

TIME_BUDGET="${HYDRA_TIME_BUDGET:-3600}"
STAMP="$(date +%Y%m%d_%H%M%S)"
LOG="run_long_${STAMP}.log"

export HYDRA_TIME_BUDGET="${TIME_BUDGET}"

echo "=== HYDRA long-training run ==="
echo "time_budget: ${TIME_BUDGET}s ($((TIME_BUDGET / 60))m)"
echo "d_model:     ${HYDRA_D_MODEL:-256 (default)}"
echo "n_layer:     ${HYDRA_N_LAYER:-4 (default)}"
echo "d_state:     ${HYDRA_D_STATE:-64 (default)}"
echo "log:         ${LOG}"
echo

# With pipefail, a failing train.py aborts the script here (before the summary).
.venv/bin/python train.py 2>&1 | tee "${LOG}"

echo
echo "=== Summary ==="
# grep exits non-zero when no line matches; under `set -e` that would make a
# finished run exit with a failure status, so report the absence instead.
if ! grep -E "^val_bpb:|^factual_english_score:|^factual_english_hits:|^peak_vram_mb:|^num_steps:" "${LOG}"; then
  echo "(no summary metrics found in ${LOG})" >&2
fi
overlay/scripts/optuna_hpo.py ADDED
@@ -0,0 +1,725 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ import os
7
+ import re
8
+ import subprocess
9
+ import sys
10
+ import time
11
+ import tempfile
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ import optuna
16
+
17
+
18
+ _HF_ENV_KEY_RE = re.compile(r"^[A-Z][A-Z0-9_]*$")
19
+
20
+
21
+ REPO_ROOT = Path(__file__).resolve().parents[1]
22
+ if str(REPO_ROOT) not in sys.path:
23
+ sys.path.insert(0, str(REPO_ROOT))
24
+
25
+ from scripts.hf_routing import resolve_routing
26
+
27
+ TRAIN_ENTRYPOINT = REPO_ROOT / "train.py"
28
+ SEARCH_SPACE_KEYS = {
29
+ "d_model",
30
+ "n_layer",
31
+ "d_state",
32
+ "headdim",
33
+ "expand",
34
+ "seq_len",
35
+ "batch_size",
36
+ "grad_accum",
37
+ "matrix_lr",
38
+ "embed_lr",
39
+ "unembed_lr",
40
+ "engram_n_columns",
41
+ "sdr_target_active",
42
+ "hyena_layers",
43
+ }
44
+
45
+
46
+ def _filter_prior_params(raw: dict[str, Any]) -> dict[str, Any]:
47
+ return {k: v for k, v in raw.items() if k in SEARCH_SPACE_KEYS}
48
+
49
+
50
+ def _load_prior_param_sets(path: Path) -> list[dict[str, Any]]:
51
+ if not path.exists():
52
+ return []
53
+
54
+ payload = json.loads(path.read_text(encoding="utf-8"))
55
+ if isinstance(payload, dict):
56
+ rows = payload.get("trials", [])
57
+ elif isinstance(payload, list):
58
+ rows = payload
59
+ else:
60
+ rows = []
61
+
62
+ out: list[dict[str, Any]] = []
63
+ for item in rows:
64
+ if not isinstance(item, dict):
65
+ continue
66
+ params_obj = item.get("params", item)
67
+ if not isinstance(params_obj, dict):
68
+ continue
69
+ filtered = _filter_prior_params(params_obj)
70
+ if filtered:
71
+ out.append(filtered)
72
+ return out
73
+
74
+
75
+ def _enqueue_transfer_priors(study: optuna.Study, priors_file: Path, apply_priors: bool) -> int:
76
+ if not apply_priors:
77
+ return 0
78
+
79
+ priors_raw = _load_prior_param_sets(priors_file)
80
+ if not priors_raw:
81
+ return 0
82
+
83
+ # Deduplicate param sets across merged studies.
84
+ priors: list[dict[str, Any]] = []
85
+ seen: set[str] = set()
86
+ for params in priors_raw:
87
+ key = json.dumps(params, sort_keys=True)
88
+ if key in seen:
89
+ continue
90
+ seen.add(key)
91
+ priors.append(params)
92
+
93
+ enqueued = 0
94
+ for params in priors:
95
+ before = len(study.get_trials(deepcopy=False))
96
+ try:
97
+ study.enqueue_trial(params, user_attrs={"seed_source": "transfer_priors"}, skip_if_exists=True)
98
+ except TypeError:
99
+ study.enqueue_trial(params, user_attrs={"seed_source": "transfer_priors"})
100
+ after = len(study.get_trials(deepcopy=False))
101
+ if after > before:
102
+ enqueued += 1
103
+ return enqueued
104
+
105
+
106
+ def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any] | None:
107
+ metrics_line: str | None = None
108
+ for line in stdout.splitlines():
109
+ if "[METRICS_JSON]" in line:
110
+ metrics_line = line
111
+ if not metrics_line:
112
+ return None
113
+ m = re.search(r"\[METRICS_JSON\]\s*(\{.*\})", metrics_line)
114
+ if not m:
115
+ return None
116
+ try:
117
+ return json.loads(m.group(1))
118
+ except json.JSONDecodeError:
119
+ return None
120
+
121
+
122
+ def _parse_metrics_from_log_lines(lines: list[str]) -> dict[str, Any] | None:
123
+ metrics_line: str | None = None
124
+ for line in lines:
125
+ if "[METRICS_JSON]" in line:
126
+ metrics_line = line
127
+ if not metrics_line:
128
+ return None
129
+ m = re.search(r"\[METRICS_JSON\]\s*(\{.*\})", metrics_line)
130
+ if not m:
131
+ return None
132
+ try:
133
+ return json.loads(m.group(1))
134
+ except json.JSONDecodeError:
135
+ return None
136
+
137
+
138
+ def _parse_last_train_bpb_from_logs(lines: list[str]) -> float | None:
139
+ """Best-effort fallback when final eval crashes before metrics JSON write."""
140
+ last: float | None = None
141
+ for line in lines:
142
+ m = re.search(r"\bbpb=([0-9]+(?:\.[0-9]+)?)", line)
143
+ if m:
144
+ last = float(m.group(1))
145
+ return last
146
+
147
+
148
+ def _trial_env(trial: optuna.Trial, args: argparse.Namespace, metrics_path: Path) -> dict[str, str]:
149
+ env = os.environ.copy()
150
+
151
+ # Runtime and reporting
152
+ env["HYDRA_METRICS_OUT"] = str(metrics_path)
153
+ env["HYDRA_TIME_BUDGET"] = str(args.trial_time_budget)
154
+ env["PYTHONUNBUFFERED"] = "1"
155
+
156
+ # Search space — fully env-driven to match existing training stack.
157
+ env["HYDRA_D_MODEL"] = str(trial.suggest_categorical("d_model", [64, 96, 128, 160, 192]))
158
+ env["HYDRA_N_LAYER"] = str(trial.suggest_int("n_layer", 1, 4))
159
+ env["HYDRA_D_STATE"] = str(trial.suggest_categorical("d_state", [16, 32, 48]))
160
+ env["HYDRA_HEADDIM"] = str(trial.suggest_categorical("headdim", [8, 16, 32]))
161
+ env["HYDRA_EXPAND"] = str(trial.suggest_categorical("expand", [1, 2]))
162
+
163
+ seq_len = trial.suggest_categorical("seq_len", [32, 64])
164
+ batch_size = trial.suggest_categorical("batch_size", [4, 8, 16])
165
+ grad_accum = trial.suggest_categorical("grad_accum", [8, 16, 32, 64])
166
+ # Keep TOTAL_BATCH_SIZE divisible by DEVICE_BATCH_SIZE * MAX_SEQ_LEN.
167
+ total_batch = batch_size * seq_len * grad_accum
168
+ env["HYDRA_SEQ_LEN"] = str(seq_len)
169
+ env["HYDRA_BATCH_SIZE"] = str(batch_size)
170
+ env["HYDRA_TOTAL_BATCH"] = str(total_batch)
171
+
172
+ env["HYDRA_MATRIX_LR"] = str(trial.suggest_float("matrix_lr", 0.005, 0.2, log=True))
173
+ env["HYDRA_EMBED_LR"] = str(trial.suggest_float("embed_lr", 0.05, 1.0, log=True))
174
+ env["HYDRA_UNEMBED_LR"] = str(trial.suggest_float("unembed_lr", 0.0005, 0.02, log=True))
175
+
176
+ env["HYDRA_ENGRAM_N_COLUMNS"] = str(trial.suggest_categorical("engram_n_columns", [256, 512, 1024]))
177
+ env["HYDRA_SDR_TARGET_ACTIVE"] = str(trial.suggest_categorical("sdr_target_active", [128, 256, 327, 512]))
178
+ env["HYDRA_HYENA_LAYERS"] = trial.suggest_categorical("hyena_layers", ["", "0", "1", "0,1"])
179
+
180
+ # Keep trials alive long enough to emit metrics.
181
+ env["HYDRA_FAIL_LOSS_THRESHOLD"] = "1000000"
182
+ env["HYDRA_USE_NEMOTRON"] = os.environ.get("HYDRA_USE_NEMOTRON", "1")
183
+ env["HYDRA_LOCAL_SHARDS_ONLY"] = os.environ.get("HYDRA_LOCAL_SHARDS_ONLY", "0")
184
+ # Strict optimal-path defaults (no forced fallback profile).
185
+ env["HYDRA_MUON_COMPILE"] = os.environ.get("HYDRA_MUON_COMPILE", "1")
186
+ env["HYDRA_FORCE_HTM_CPU"] = os.environ.get("HYDRA_FORCE_HTM_CPU", "0")
187
+ env["HYDRA_ALLOW_SYNTHETIC_RETINA"] = os.environ.get("HYDRA_ALLOW_SYNTHETIC_RETINA", "0")
188
+ env["HYDRA_INERT_MAMBA"] = os.environ.get("HYDRA_INERT_MAMBA", "0")
189
+ env["HYDRA_FASTPATH"] = os.environ.get("HYDRA_FASTPATH", "0")
190
+
191
+ return env
192
+
193
+
194
+ def _sanitize_hf_env(env: dict[str, str]) -> dict[str, str]:
195
+ """HF Jobs API accepts only strictly alnum/underscore env keys."""
196
+ sanitized: dict[str, str] = {}
197
+ for key, value in env.items():
198
+ if _HF_ENV_KEY_RE.match(key):
199
+ sanitized[key] = str(value)
200
+ return sanitized
201
+
202
+
203
+ def _hf_command_candidates(args: argparse.Namespace) -> list[list[str]]:
204
+ if args.hf_use_bash:
205
+ return [["bash", "-lc", args.hf_command]]
206
+
207
+ raw = args.hf_command.strip()
208
+ if args.hf_auto_command_fallback and raw == "/app/entrypoint.py":
209
+ candidates = [
210
+ ["/usr/bin/python3", "/app/entrypoint.py"],
211
+ ["/usr/local/bin/python3", "/app/entrypoint.py"],
212
+ ["python3", "/app/entrypoint.py"],
213
+ ["python", "/app/entrypoint.py"],
214
+ ["/app/entrypoint.py"],
215
+ ]
216
+ uniq: list[list[str]] = []
217
+ seen: set[tuple[str, ...]] = set()
218
+ for c in candidates:
219
+ key = tuple(c)
220
+ if key not in seen:
221
+ seen.add(key)
222
+ uniq.append(c)
223
+ return uniq
224
+
225
+ return [raw.split()]
226
+
227
+
228
+ def _objective_local(args: argparse.Namespace):
229
+ def objective(trial: optuna.Trial) -> float:
230
+ trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
231
+ metrics_path = trial_dir / "metrics.json"
232
+
233
+ env = _trial_env(trial, args, metrics_path)
234
+
235
+ proc = subprocess.run(
236
+ [sys.executable, str(TRAIN_ENTRYPOINT)],
237
+ cwd=str(REPO_ROOT),
238
+ env=env,
239
+ text=True,
240
+ capture_output=True,
241
+ timeout=args.trial_timeout,
242
+ )
243
+
244
+ metrics: dict[str, Any] | None = None
245
+ if metrics_path.exists():
246
+ try:
247
+ metrics = json.loads(metrics_path.read_text(encoding="utf-8"))
248
+ except json.JSONDecodeError:
249
+ metrics = None
250
+ if metrics is None:
251
+ metrics = _parse_metrics_from_stdout(proc.stdout)
252
+
253
+ if metrics is None:
254
+ raise optuna.TrialPruned("No metrics found (HYDRA_METRICS_OUT/[METRICS_JSON])")
255
+
256
+ if proc.returncode != 0:
257
+ raise optuna.TrialPruned(f"Training failed rc={proc.returncode}")
258
+
259
+ metric_key = args.metric
260
+ if metric_key not in metrics or metrics[metric_key] is None:
261
+ raise optuna.TrialPruned(f"Metric '{metric_key}' missing in metrics payload")
262
+
263
+ tps_val = metrics.get("tps")
264
+ if tps_val is not None:
265
+ tps_f = float(tps_val)
266
+ trial.set_user_attr("tps", tps_f)
267
+ if args.min_tps is not None and tps_f < args.min_tps:
268
+ raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {args.min_tps}")
269
+
270
+ value = float(metrics[metric_key])
271
+
272
+ # Keep useful context on trial
273
+ trial.set_user_attr("summary_path", metrics.get("summary_path"))
274
+ trial.set_user_attr("run_log_path", metrics.get("run_log_path"))
275
+
276
+ return value
277
+
278
+ return objective
279
+
280
+
281
+ def _objective_hf_job(args: argparse.Namespace):
282
+ from huggingface_hub import HfApi
283
+ from huggingface_hub.utils import get_token
284
+
285
+ token = os.environ.get(args.hf_token_env) or get_token()
286
+ if not token:
287
+ raise RuntimeError(
288
+ f"No Hugging Face token found. Set {args.hf_token_env} or run huggingface-cli login."
289
+ )
290
+
291
+ api = HfApi(token=token)
292
+ terminal_states = {"ERROR", "COMPLETED", "CANCELLED", "TIMEOUT", "FAILED", "CANCELED"}
293
+
294
+ def objective(trial: optuna.Trial) -> float:
295
+ trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
296
+ metrics_path = trial_dir / "metrics.json"
297
+ env = _trial_env(trial, args, metrics_path)
298
+ env = _sanitize_hf_env(env)
299
+
300
+ selected_job = None
301
+ launch_errors: list[str] = []
302
+ for command in _hf_command_candidates(args):
303
+ try:
304
+ job = api.run_job(
305
+ image=args.hf_image,
306
+ command=command,
307
+ env=env,
308
+ secrets={args.hf_token_env: token},
309
+ flavor=args.hf_flavor,
310
+ timeout=args.hf_timeout,
311
+ labels={"project": "feather", "goal": "optuna-hpo", "trial": str(trial.number)},
312
+ token=token,
313
+ namespace=args.hf_namespace,
314
+ )
315
+ except Exception as e:
316
+ launch_errors.append(f"launch:{command}: {type(e).__name__}: {e}")
317
+ continue
318
+
319
+ # Bootstrap check: reject known command/exec failures quickly.
320
+ bootstrap_deadline = time.time() + args.hf_bootstrap_seconds
321
+ bootstrap_stage = "UNKNOWN"
322
+ bootstrap_logs: list[str] = []
323
+ bootstrap_msg = ""
324
+ while time.time() < bootstrap_deadline:
325
+ info = api.inspect_job(job_id=job.id, token=token, namespace=args.hf_namespace)
326
+ bootstrap_stage = str(info.status.stage)
327
+ bootstrap_msg = str(getattr(info.status, "message", "") or "")
328
+ bootstrap_logs = list(
329
+ api.fetch_job_logs(
330
+ job_id=job.id,
331
+ follow=False,
332
+ token=token,
333
+ namespace=args.hf_namespace,
334
+ )
335
+ )
336
+ if bootstrap_stage in {"RUNNING", "COMPLETED"} or bootstrap_logs:
337
+ break
338
+ if bootstrap_stage in {"ERROR", "FAILED", "CANCELLED", "CANCELED", "TIMEOUT"}:
339
+ break
340
+ time.sleep(2)
341
+
342
+ detail = bootstrap_msg.lower()
343
+ unusable = bootstrap_stage in {"ERROR", "FAILED"} and len(bootstrap_logs) == 0 and any(
344
+ k in detail for k in ("executable file not found", "permission denied", "exec:")
345
+ )
346
+ if unusable:
347
+ launch_errors.append(f"bootstrap:{command}: {bootstrap_msg}")
348
+ continue
349
+
350
+ selected_job = job
351
+ break
352
+
353
+ if selected_job is None:
354
+ raise optuna.TrialPruned(f"HF job launch failed across command candidates: {launch_errors[:3]}")
355
+
356
+ job = selected_job
357
+ job_id = job.id
358
+ trial.set_user_attr("hf_job_id", job_id)
359
+
360
+ start = time.time()
361
+ metrics: dict[str, Any] | None = None
362
+ tps_seen: float | None = None
363
+ stage: str = "UNKNOWN"
364
+ log_lines: list[str] = []
365
+ terminal_detail: str | None = None
366
+
367
+ while True:
368
+ info = api.inspect_job(job_id=job_id, token=token, namespace=args.hf_namespace)
369
+ stage = str(info.status.stage)
370
+ terminal_detail = str(getattr(info.status, "message", "")) or terminal_detail
371
+ log_lines = list(api.fetch_job_logs(job_id=job_id, follow=False, token=token, namespace=args.hf_namespace))
372
+
373
+ m = _parse_metrics_from_log_lines(log_lines)
374
+ if m is not None:
375
+ metrics = m
376
+ break
377
+
378
+ # Capture latest tps even before final metrics json
379
+ for line in log_lines:
380
+ mt = re.search(r"\btps=([0-9]+(?:\.[0-9]+)?)", line)
381
+ if mt:
382
+ tps_seen = float(mt.group(1))
383
+
384
+ if stage in terminal_states:
385
+ break
386
+ if time.time() - start > args.trial_timeout:
387
+ break
388
+ time.sleep(args.hf_poll_interval)
389
+
390
+ # Best-effort stop to control cost
391
+ try:
392
+ info = api.inspect_job(job_id=job_id, token=token, namespace=args.hf_namespace)
393
+ if info.status.stage not in terminal_states and args.hf_stop_after_metric:
394
+ api.cancel_job(job_id=job_id, token=token, namespace=args.hf_namespace)
395
+ except Exception:
396
+ pass
397
+
398
+ # Save logs for debugging
399
+ (trial_dir / "hf_job.log").write_text("\n".join(log_lines), encoding="utf-8")
400
+ trial.set_user_attr("hf_stage", stage)
401
+ trial.set_user_attr("hf_log_lines", len(log_lines))
402
+ if terminal_detail:
403
+ trial.set_user_attr("hf_status_message", terminal_detail)
404
+
405
+ if metrics is None:
406
+ if args.allow_log_metric_fallback and args.metric == "val_bpb":
407
+ fallback_bpb = _parse_last_train_bpb_from_logs(log_lines)
408
+ if fallback_bpb is not None:
409
+ trial.set_user_attr("metric_source", "log_bpb_fallback")
410
+ if tps_seen is not None:
411
+ trial.set_user_attr("tps", tps_seen)
412
+ if args.min_tps is not None and tps_seen < args.min_tps:
413
+ raise optuna.TrialPruned(f"TPS below floor: {tps_seen} < {args.min_tps}")
414
+ return float(fallback_bpb)
415
+ if tps_seen is not None:
416
+ trial.set_user_attr("tps", tps_seen)
417
+ detail = f"stage={stage}, logs={len(log_lines)}"
418
+ if terminal_detail:
419
+ detail = f"{detail}, message={terminal_detail}"
420
+ raise optuna.TrialPruned(f"No metrics found from HF job ({detail})")
421
+
422
+ metric_key = args.metric
423
+ if metric_key not in metrics or metrics[metric_key] is None:
424
+ raise optuna.TrialPruned(f"Metric '{metric_key}' missing in metrics payload")
425
+
426
+ tps_val = metrics.get("tps")
427
+ if tps_val is not None:
428
+ tps_f = float(tps_val)
429
+ trial.set_user_attr("tps", tps_f)
430
+ if args.min_tps is not None and tps_f < args.min_tps:
431
+ raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {args.min_tps}")
432
+
433
+ value = float(metrics[metric_key])
434
+ trial.set_user_attr("summary_path", metrics.get("summary_path"))
435
+ trial.set_user_attr("run_log_path", metrics.get("run_log_path"))
436
+ return value
437
+
438
+ return objective
439
+
440
+
441
+ def _objective_hf_launcher(args: argparse.Namespace):
442
+ from huggingface_hub import HfApi
443
+ from huggingface_hub.utils import get_token
444
+
445
+ token = os.environ.get(args.hf_token_env) or get_token()
446
+ if not token:
447
+ raise RuntimeError(
448
+ f"No Hugging Face token found. Set {args.hf_token_env} or run huggingface-cli login."
449
+ )
450
+
451
+ api = HfApi(token=token)
452
+ terminal_states = {"ERROR", "COMPLETED", "CANCELLED", "TIMEOUT", "FAILED", "CANCELED"}
453
+
454
+ def objective(trial: optuna.Trial) -> float:
455
+ trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
456
+ metrics_path = trial_dir / "metrics.json"
457
+ env = _trial_env(trial, args, metrics_path)
458
+ env = _sanitize_hf_env(env)
459
+
460
+ local_env = os.environ.copy()
461
+ local_env.update(env)
462
+ local_env[args.hf_token_env] = token
463
+ local_env["FEATHER_HF_NAMESPACE"] = args.hf_namespace
464
+ local_env["FEATHER_HF_FLAVOR"] = args.hf_flavor
465
+ local_env["FEATHER_HF_JOB_TIMEOUT"] = args.hf_timeout
466
+ local_env["FEATHER_HF_IMAGE"] = args.hf_image
467
+ local_env["FEATHER_HF_SPACE_REPO"] = f"{args.hf_namespace}/feather-h200-runtime"
468
+ if args.hf_output_repo:
469
+ local_env["FEATHER_HF_OUTPUT_REPO"] = args.hf_output_repo
470
+ else:
471
+ local_env["FEATHER_HF_OUTPUT_REPO"] = f"{args.hf_namespace}/feather-pretrain-checkpoints"
472
+
473
+ proc = subprocess.run(
474
+ [sys.executable, str(args.hf_launcher_script)],
475
+ cwd=str(REPO_ROOT),
476
+ env=local_env,
477
+ text=True,
478
+ capture_output=True,
479
+ timeout=max(args.trial_timeout, 120),
480
+ )
481
+
482
+ launch_stdout = proc.stdout or ""
483
+ launch_stderr = proc.stderr or ""
484
+ m = re.search(r"job_id=([a-zA-Z0-9_-]+)", launch_stdout)
485
+ if proc.returncode != 0 or not m:
486
+ raise optuna.TrialPruned(
487
+ f"HF launcher failed rc={proc.returncode}; stderr={launch_stderr[-400:]} stdout_tail={launch_stdout[-400:]}"
488
+ )
489
+
490
+ job_id = m.group(1)
491
+ trial.set_user_attr("hf_job_id", job_id)
492
+
493
+ start = time.time()
494
+ metrics: dict[str, Any] | None = None
495
+ tps_seen: float | None = None
496
+ stage: str = "UNKNOWN"
497
+ log_lines: list[str] = []
498
+ terminal_detail: str | None = None
499
+
500
+ while True:
501
+ info = api.inspect_job(job_id=job_id, token=token, namespace=args.hf_namespace)
502
+ stage = str(info.status.stage)
503
+ terminal_detail = str(getattr(info.status, "message", "") or "") or terminal_detail
504
+ log_lines = list(api.fetch_job_logs(job_id=job_id, follow=False, token=token, namespace=args.hf_namespace))
505
+
506
+ mtr = _parse_metrics_from_log_lines(log_lines)
507
+ if mtr is not None:
508
+ metrics = mtr
509
+ break
510
+
511
+ for line in log_lines:
512
+ mt = re.search(r"\btps=([0-9]+(?:\.[0-9]+)?)", line)
513
+ if mt:
514
+ tps_seen = float(mt.group(1))
515
+
516
+ if stage in terminal_states:
517
+ break
518
+ if time.time() - start > args.trial_timeout:
519
+ break
520
+ time.sleep(args.hf_poll_interval)
521
+
522
+ try:
523
+ info = api.inspect_job(job_id=job_id, token=token, namespace=args.hf_namespace)
524
+ if info.status.stage not in terminal_states and args.hf_stop_after_metric:
525
+ api.cancel_job(job_id=job_id, token=token, namespace=args.hf_namespace)
526
+ except Exception:
527
+ pass
528
+
529
+ (trial_dir / "hf_job.log").write_text("\n".join(log_lines), encoding="utf-8")
530
+ trial.set_user_attr("hf_stage", stage)
531
+ trial.set_user_attr("hf_log_lines", len(log_lines))
532
+ if terminal_detail:
533
+ trial.set_user_attr("hf_status_message", terminal_detail)
534
+
535
+ if metrics is None:
536
+ if args.allow_log_metric_fallback and args.metric == "val_bpb":
537
+ fallback_bpb = _parse_last_train_bpb_from_logs(log_lines)
538
+ if fallback_bpb is not None:
539
+ trial.set_user_attr("metric_source", "log_bpb_fallback")
540
+ if tps_seen is not None:
541
+ trial.set_user_attr("tps", tps_seen)
542
+ if args.min_tps is not None and tps_seen < args.min_tps:
543
+ raise optuna.TrialPruned(f"TPS below floor: {tps_seen} < {args.min_tps}")
544
+ return float(fallback_bpb)
545
+ if tps_seen is not None:
546
+ trial.set_user_attr("tps", tps_seen)
547
+ detail = f"stage={stage}, logs={len(log_lines)}"
548
+ if terminal_detail:
549
+ detail = f"{detail}, message={terminal_detail}"
550
+ raise optuna.TrialPruned(f"No metrics found from HF launcher job ({detail})")
551
+
552
+ metric_key = args.metric
553
+ if metric_key not in metrics or metrics[metric_key] is None:
554
+ raise optuna.TrialPruned(f"Metric '{metric_key}' missing in metrics payload")
555
+
556
+ tps_val = metrics.get("tps")
557
+ if tps_val is not None:
558
+ tps_f = float(tps_val)
559
+ trial.set_user_attr("tps", tps_f)
560
+ if args.min_tps is not None and tps_f < args.min_tps:
561
+ raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {args.min_tps}")
562
+
563
+ value = float(metrics[metric_key])
564
+ trial.set_user_attr("summary_path", metrics.get("summary_path"))
565
+ trial.set_user_attr("run_log_path", metrics.get("run_log_path"))
566
+ return value
567
+
568
+ return objective
569
+
570
+
571
+ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
572
+ routing_defaults = resolve_routing(token=os.environ.get("HF_TOKEN"))
573
+ parser = argparse.ArgumentParser(description="Optuna HPO runner for HYDRA train.py")
574
+ parser.add_argument("--study-name", default="hydra_hpo", help="Optuna study name")
575
+ parser.add_argument("--storage", default="sqlite:///optuna_hpo.db", help="Optuna storage URL")
576
+ parser.add_argument("--direction", choices=["minimize", "maximize"], default="minimize")
577
+ parser.add_argument("--metric", default="val_bpb", help="Metric key to optimize from HYDRA metrics")
578
+ parser.add_argument(
579
+ "--min-tps",
580
+ type=float,
581
+ default=50000.0,
582
+ help="TPS floor; prune trials under this value (set 0 to disable)",
583
+ )
584
+ parser.add_argument("--trials", type=int, default=20, help="Number of Optuna trials")
585
+ parser.add_argument("--study-timeout", type=int, default=None, help="Study timeout in seconds")
586
+ parser.add_argument("--trial-time-budget", type=int, default=300, help="HYDRA_TIME_BUDGET passed to each trial")
587
+ parser.add_argument("--trial-timeout", type=int, default=900, help="Subprocess timeout per trial in seconds")
588
+ parser.add_argument("--runner", choices=["local", "hf-job", "hf-launcher"], default="local", help="Trial execution backend")
589
+ parser.add_argument("--hf-namespace", default=routing_defaults.job_namespace, help="HF namespace for jobs")
590
+ parser.add_argument("--hf-image", default=f"hf.co/spaces/{routing_defaults.space_repo}", help="HF jobs image")
591
+ parser.add_argument("--hf-flavor", default="a10g-large", help="HF jobs hardware flavor")
592
+ parser.add_argument("--hf-timeout", default="25m", help="HF job timeout string")
593
+ parser.add_argument("--hf-command", default="/app/entrypoint.py", help="Command executed inside HF job")
594
+ parser.add_argument("--hf-use-bash", action="store_true", help="Run HF command via bash -lc")
595
+ parser.add_argument("--hf-auto-command-fallback", action="store_true", default=True, help="Auto-wrap entrypoint command with python/python3/uv fallback")
596
+ parser.add_argument("--no-hf-auto-command-fallback", action="store_false", dest="hf_auto_command_fallback")
597
+ parser.add_argument("--hf-poll-interval", type=int, default=12, help="HF job poll interval seconds")
598
+ parser.add_argument("--hf-bootstrap-seconds", type=int, default=18, help="Initial seconds to validate command bootstrap")
599
+ parser.add_argument("--hf-token-env", default="HF_TOKEN", help="Token env key passed as HF job secret")
600
+ parser.add_argument("--hf-stop-after-metric", action="store_true", default=True, help="Cancel running job after metrics captured")
601
+ parser.add_argument("--no-hf-stop-after-metric", action="store_false", dest="hf_stop_after_metric")
602
+ parser.add_argument("--hf-launcher-script", type=Path, default=REPO_ROOT / "scripts" / "launch_feather_hf_job.py", help="Local launcher script for hf-launcher runner")
603
+ parser.add_argument("--hf-output-repo", default=routing_defaults.output_repo, help="Optional FEATHER_HF_OUTPUT_REPO override for launcher runner")
604
+ parser.add_argument("--allow-log-metric-fallback", action="store_true", default=False, help="When metrics JSON is missing, allow val_bpb fallback from latest logged train bpb")
605
+ parser.add_argument("--no-allow-log-metric-fallback", action="store_false", dest="allow_log_metric_fallback")
606
+ parser.add_argument("--priors-file", type=Path, default=REPO_ROOT / "docs" / "hpo_transfer_priors.json", help="Path to transfer-learning prior trials JSON")
607
+ parser.add_argument("--apply-priors", action="store_true", default=True, help="Enqueue transfer-learning prior trials before optimize")
608
+ parser.add_argument("--no-apply-priors", action="store_false", dest="apply_priors")
609
+ parser.add_argument("--seed", type=int, default=42, help="Seed for sampler")
610
+ parser.add_argument("--n-startup-trials", type=int, default=5, help="Pruner startup trials before pruning")
611
+ parser.add_argument("--n-warmup-steps", type=int, default=0, help="Pruner warmup steps")
612
+ parser.add_argument("--patience-trials", type=int, default=None, help="Stop study after this many completed trials without meaningful improvement")
613
+ parser.add_argument("--min-improvement", type=float, default=0.0, help="Minimum best-value improvement to reset patience")
614
+ parser.add_argument("--work-dir", type=Path, default=REPO_ROOT / ".tmp" / "optuna", help="Directory for trial artifacts")
615
+ parser.add_argument("--summary-out", type=Path, default=REPO_ROOT / ".tmp" / "optuna" / "best_summary.json")
616
+ return parser.parse_args(argv)
617
+
618
+
619
+ def main() -> int:
620
+ args = parse_args()
621
+ args.work_dir.mkdir(parents=True, exist_ok=True)
622
+ args.summary_out.parent.mkdir(parents=True, exist_ok=True)
623
+
624
+ sampler = optuna.samplers.TPESampler(seed=args.seed, multivariate=True)
625
+ pruner = optuna.pruners.MedianPruner(
626
+ n_startup_trials=args.n_startup_trials,
627
+ n_warmup_steps=args.n_warmup_steps,
628
+ )
629
+
630
+ study = optuna.create_study(
631
+ study_name=args.study_name,
632
+ storage=args.storage,
633
+ load_if_exists=True,
634
+ direction=args.direction,
635
+ sampler=sampler,
636
+ pruner=pruner,
637
+ )
638
+
639
+ enqueued_priors = _enqueue_transfer_priors(study, args.priors_file, args.apply_priors)
640
+ if enqueued_priors:
641
+ print(f"[hpo] enqueued {enqueued_priors} transfer priors from {args.priors_file}")
642
+
643
+ state: dict[str, Any] = {
644
+ "best": None,
645
+ "best_trial_number": None,
646
+ "last_improve_trial_number": None,
647
+ }
648
+
649
+ def _improved(new_value: float, best_value: float) -> bool:
650
+ if args.direction == "minimize":
651
+ return new_value < (best_value - args.min_improvement)
652
+ return new_value > (best_value + args.min_improvement)
653
+
654
+ def _early_stop_callback(study_obj: optuna.Study, trial: optuna.trial.FrozenTrial) -> None:
655
+ if trial.value is None:
656
+ return
657
+
658
+ if state["best"] is None or _improved(float(trial.value), float(state["best"])):
659
+ state["best"] = float(trial.value)
660
+ state["best_trial_number"] = trial.number
661
+ state["last_improve_trial_number"] = trial.number
662
+ return
663
+
664
+ if args.patience_trials is None:
665
+ return
666
+
667
+ if state["last_improve_trial_number"] is None:
668
+ return
669
+
670
+ since = trial.number - int(state["last_improve_trial_number"])
671
+ if since >= args.patience_trials:
672
+ study_obj.stop()
673
+
674
+ callbacks = [_early_stop_callback] if args.patience_trials is not None else None
675
+ if args.runner == "local":
676
+ objective_fn = _objective_local(args)
677
+ elif args.runner == "hf-job":
678
+ objective_fn = _objective_hf_job(args)
679
+ else:
680
+ objective_fn = _objective_hf_launcher(args)
681
+
682
+ study.optimize(
683
+ objective_fn,
684
+ n_trials=args.trials,
685
+ timeout=args.study_timeout,
686
+ callbacks=callbacks,
687
+ )
688
+
689
+ completed = [t for t in study.trials if t.value is not None]
690
+ if completed:
691
+ best = {
692
+ "study_name": study.study_name,
693
+ "direction": args.direction,
694
+ "metric": args.metric,
695
+ "best_value": study.best_value,
696
+ "best_params": study.best_params,
697
+ "best_trial_number": study.best_trial.number,
698
+ "best_trial_user_attrs": study.best_trial.user_attrs,
699
+ "n_trials": len(study.trials),
700
+ "n_completed": len(completed),
701
+ "patience_trials": args.patience_trials,
702
+ "min_improvement": args.min_improvement,
703
+ "enqueued_priors": enqueued_priors,
704
+ }
705
+ else:
706
+ best = {
707
+ "study_name": study.study_name,
708
+ "direction": args.direction,
709
+ "metric": args.metric,
710
+ "best_value": None,
711
+ "best_params": {},
712
+ "best_trial_number": None,
713
+ "best_trial_user_attrs": {},
714
+ "n_trials": len(study.trials),
715
+ "n_completed": 0,
716
+ "enqueued_priors": enqueued_priors,
717
+ "note": "No completed trials with metrics found.",
718
+ }
719
+ args.summary_out.write_text(json.dumps(best, indent=2), encoding="utf-8")
720
+ print(json.dumps(best, indent=2))
721
+ return 0
722
+
723
+
724
+ if __name__ == "__main__":
725
+ raise SystemExit(main())
overlay/scripts/parse_metrics.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Parse train.py run.log → (bpb, tps_avg, factual).
2
+
3
+ bpb priority order:
4
+ 1. val_bpb from [VAL] line (cleanest signal, but OOMs on 6GB cards)
5
+ 2. train_bpb from the LAST step= line (proxy when val fails — not held-out
6
+ but monotone with model capability over a 5-min budget)
7
+ """
8
import re
import sys


def parse_run_log(txt):
    """Extract ``(bpb, tps_avg, factual)`` strings from the text of a run log.

    * ``bpb``: the first ``val_bpb:`` value if present, otherwise the ``bpb``
      of the last ``step=... loss=... bpb=...`` line prefixed with ``~``,
      otherwise ``'NA'``.
    * ``tps_avg``: mean of all integer ``tps=`` values, rounded to a whole
      number, or ``'NA'`` when there are none.
    * ``factual``: the ``N/M`` value of ``factual_english_hits:`` or ``'NA'``.
    """
    m = re.search(r'val_bpb:\s+([\d\.]+)', txt)
    if m:
        bpb = m.group(1)
    else:
        step_lines = re.findall(r'^step=\d+\s+loss=[\d\.]+\s+bpb=([\d\.]+)', txt, re.M)
        bpb = f'~{step_lines[-1]}' if step_lines else 'NA'

    tps_vals = [int(v) for v in re.findall(r'tps=(\d+)', txt)]
    tps_avg = f'{sum(tps_vals)/len(tps_vals):.0f}' if tps_vals else 'NA'

    m = re.search(r'factual_english_hits:\s+(\d+/\d+)', txt)
    factual = m.group(1) if m else 'NA'

    return bpb, tps_avg, factual


def main(argv=None):
    """Read the log named by the first argument and print the tab-separated fields."""
    argv = sys.argv[1:] if argv is None else argv
    if not argv:
        print("usage: parse_metrics.py RUN_LOG", file=sys.stderr)
        return 2
    # Close the file deterministically; tolerate stray bytes in training logs.
    with open(argv[0], encoding="utf-8", errors="replace") as fh:
        txt = fh.read()
    bpb, tps_avg, factual = parse_run_log(txt)
    print(f"{bpb}\t{tps_avg}\t{factual}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
overlay/scripts/run_domain_expanded_pretrain.sh CHANGED
@@ -1,262 +1,262 @@
1
- #!/usr/bin/env bash
2
- # Domain-expanded streaming pretrain launcher for Feather/HYDRA.
3
- #
4
- # Usage:
5
- # ./scripts/run_domain_expanded_pretrain.sh
6
- # HYDRA_TARGET_SHARDS=2048 HYDRA_TIME_BUDGET=28800 ./scripts/run_domain_expanded_pretrain.sh
7
- # ./scripts/run_domain_expanded_pretrain.sh --target-shards 1024 --dry-run
8
- # ./scripts/run_domain_expanded_pretrain.sh --target-shards -1 --download-workers 16
9
- #
10
- # Behavior:
11
- # - counts currently cached parquet shards in ~/.cache/autoresearch/data
12
- # - optionally expands shard coverage toward a target via prepare.py
13
- # - skips prepare.py entirely when target coverage is already satisfied
14
- # - exports WSL CUDA library paths and long-run HYDRA_* env vars
15
- # - prefers an existing latest/pretrain checkpoint path if one is present
16
- # - streams stdout/stderr to a stable repo log: run_domain_expanded.log
17
- set -euo pipefail
18
-
19
- REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
20
- cd "$REPO_ROOT"
21
-
22
- CACHE_ROOT="${HYDRA_CACHE_ROOT:-$HOME/.cache/autoresearch}"
23
- DATA_DIR="${HYDRA_DATA_DIR:-$CACHE_ROOT/data}"
24
- CKPT_DIR="${HYDRA_CKPT_DIR:-$CACHE_ROOT/ckpts}"
25
- LOG_FILE="${HYDRA_DOMAIN_EXPANDED_LOG:-$REPO_ROOT/run_domain_expanded.log}"
26
- DEFAULT_TARGET_SHARDS="2048"
27
- TARGET_SHARDS="${HYDRA_TARGET_SHARDS:-$DEFAULT_TARGET_SHARDS}"
28
- DOWNLOAD_WORKERS="${HYDRA_DOWNLOAD_WORKERS:-8}"
29
- DRY_RUN=0
30
- SKIP_TRAIN=0
31
- FORCE_PREPARE=0
32
- NO_RESUME=0
33
- EXPLICIT_RESUME_PATH="${HYDRA_RESUME_PATH:-}"
34
-
35
- usage() {
36
- sed -n '2,16p' "$0"
37
- cat <<'EOF'
38
-
39
- Options:
40
- --target-shards N Target number of train shards to have locally (-1 = all)
41
- --download-workers N Parallel workers for prepare.py downloads
42
- --resume PATH Override auto-detected checkpoint path
43
- --no-resume Ignore existing checkpoints
44
- --skip-train Only ensure shard coverage, do not launch train.py
45
- --force-prepare Run prepare.py even if target coverage is already satisfied
46
- --dry-run Print planned actions without running prepare.py/train.py
47
- -h, --help Show this help
48
- EOF
49
- }
50
-
51
- while [[ $# -gt 0 ]]; do
52
- case "$1" in
53
- --target-shards)
54
- TARGET_SHARDS="$2"
55
- shift 2
56
- ;;
57
- --download-workers)
58
- DOWNLOAD_WORKERS="$2"
59
- shift 2
60
- ;;
61
- --resume)
62
- EXPLICIT_RESUME_PATH="$2"
63
- shift 2
64
- ;;
65
- --no-resume)
66
- NO_RESUME=1
67
- shift
68
- ;;
69
- --skip-train)
70
- SKIP_TRAIN=1
71
- shift
72
- ;;
73
- --force-prepare)
74
- FORCE_PREPARE=1
75
- shift
76
- ;;
77
- --dry-run)
78
- DRY_RUN=1
79
- shift
80
- ;;
81
- -h|--help)
82
- usage
83
- exit 0
84
- ;;
85
- *)
86
- echo "Unknown option: $1" >&2
87
- usage >&2
88
- exit 2
89
- ;;
90
- esac
91
- done
92
-
93
- if ! [[ "$TARGET_SHARDS" =~ ^-?[0-9]+$ ]]; then
94
- echo "Invalid --target-shards: $TARGET_SHARDS" >&2
95
- exit 2
96
- fi
97
- if ! [[ "$DOWNLOAD_WORKERS" =~ ^[0-9]+$ ]] || [[ "$DOWNLOAD_WORKERS" -lt 1 ]]; then
98
- echo "Invalid --download-workers: $DOWNLOAD_WORKERS" >&2
99
- exit 2
100
- fi
101
-
102
- python_has_deps() {
103
- local py="$1"
104
- "$py" - <<'PY' >/dev/null 2>&1
105
- import requests, pyarrow, rustbpe, torch
106
- PY
107
- }
108
-
109
- if [[ -x "$REPO_ROOT/.venv/bin/python" ]] && python_has_deps "$REPO_ROOT/.venv/bin/python"; then
110
- PYTHON_CMD=("$REPO_ROOT/.venv/bin/python")
111
- elif command -v uv >/dev/null 2>&1; then
112
- PYTHON_CMD=(uv run python)
113
- elif command -v python3 >/dev/null 2>&1 && python_has_deps "$(command -v python3)"; then
114
- PYTHON_CMD=(python3)
115
- else
116
- echo "No usable Python interpreter found with required deps (.venv/bin/python, uv run python, or python3)." >&2
117
- exit 1
118
- fi
119
-
120
- count_train_shards() {
121
- if [[ ! -d "$DATA_DIR" ]]; then
122
- echo 0
123
- return
124
- fi
125
- find "$DATA_DIR" -maxdepth 1 -type f -name 'shard_*.parquet' ! -name 'shard_06542.parquet' | wc -l
126
- }
127
-
128
- count_total_shards() {
129
- if [[ ! -d "$DATA_DIR" ]]; then
130
- echo 0
131
- return
132
- fi
133
- find "$DATA_DIR" -maxdepth 1 -type f -name 'shard_*.parquet' | wc -l
134
- }
135
-
136
- resolve_resume_path() {
137
- if [[ "$NO_RESUME" -eq 1 ]]; then
138
- return 0
139
- fi
140
- if [[ -n "$EXPLICIT_RESUME_PATH" ]]; then
141
- local expanded
142
- expanded="${EXPLICIT_RESUME_PATH/#\~/$HOME}"
143
- if [[ -f "$expanded" ]]; then
144
- printf '%s\n' "$expanded"
145
- return 0
146
- fi
147
- echo "Requested resume checkpoint not found: $expanded" >&2
148
- exit 1
149
- fi
150
-
151
- local candidates=(
152
- "$CKPT_DIR/latest.pt"
153
- "$CKPT_DIR/pretrain_latest.pt"
154
- "$CKPT_DIR/pretrain_final.pt"
155
- "$CACHE_ROOT/latest.pt"
156
- "$CACHE_ROOT/pretrain_latest.pt"
157
- "$CACHE_ROOT/pretrain_final.pt"
158
- "$REPO_ROOT/latest.pt"
159
- "$REPO_ROOT/pretrain_final.pt"
160
- )
161
- local candidate
162
- for candidate in "${candidates[@]}"; do
163
- if [[ -f "$candidate" ]]; then
164
- printf '%s\n' "$candidate"
165
- return 0
166
- fi
167
- done
168
- }
169
-
170
- CURRENT_TRAIN_SHARDS="$(count_train_shards | tr -d ' ')"
171
- CURRENT_TOTAL_SHARDS="$(count_total_shards | tr -d ' ')"
172
- HAS_VAL=0
173
- if [[ -f "$DATA_DIR/shard_06542.parquet" ]]; then
174
- HAS_VAL=1
175
- fi
176
-
177
- PREPARE_NUM_SHARDS="$TARGET_SHARDS"
178
- if [[ "$TARGET_SHARDS" -eq -1 ]]; then
179
- TARGET_DESC="all available train shards"
180
- NEED_PREPARE=1
181
- elif [[ "$CURRENT_TRAIN_SHARDS" -ge "$TARGET_SHARDS" ]]; then
182
- TARGET_DESC="$TARGET_SHARDS"
183
- NEED_PREPARE="$FORCE_PREPARE"
184
- else
185
- TARGET_DESC="$TARGET_SHARDS"
186
- NEED_PREPARE=1
187
- fi
188
-
189
- RESUME_PATH="$(resolve_resume_path || true)"
190
-
191
- export LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
192
- export HYDRA_TIME_BUDGET="${HYDRA_TIME_BUDGET:-28800}"
193
- export HYDRA_TARGET_SHARDS="$TARGET_SHARDS"
194
- export HYDRA_DOWNLOAD_WORKERS="$DOWNLOAD_WORKERS"
195
- export HYDRA_DOMAIN_EXPANDED_LOG="$LOG_FILE"
196
- export HYDRA_CKPT_INTERVAL="${HYDRA_CKPT_INTERVAL:-2000}"
197
- export HYDRA_CHECKPOINT_INTERVAL="${HYDRA_CHECKPOINT_INTERVAL:-$HYDRA_CKPT_INTERVAL}"
198
- if [[ -n "$RESUME_PATH" ]]; then
199
- export HYDRA_RESUME_PATH="$RESUME_PATH"
200
- export HYDRA_RESUME_CKPT="$RESUME_PATH"
201
- fi
202
-
203
- mkdir -p "$(dirname "$LOG_FILE")"
204
-
205
- ts() { date '+%Y-%m-%d %H:%M:%S'; }
206
- log() {
207
- local line="[$(ts)] $*"
208
- echo "$line"
209
- echo "$line" >> "$LOG_FILE"
210
- }
211
-
212
- log "=== domain-expanded pretrain launcher ==="
213
- log "repo_root=$REPO_ROOT"
214
- log "data_dir=$DATA_DIR train_shards=$CURRENT_TRAIN_SHARDS total_shards=$CURRENT_TOTAL_SHARDS has_val=$HAS_VAL"
215
- log "target_train_shards=$TARGET_DESC download_workers=$DOWNLOAD_WORKERS"
216
- log "log_file=$LOG_FILE"
217
- log "python=${PYTHON_CMD[*]}"
218
- log "HYDRA_TIME_BUDGET=$HYDRA_TIME_BUDGET"
219
- log "HYDRA_CKPT_INTERVAL=$HYDRA_CKPT_INTERVAL"
220
- if [[ -n "$RESUME_PATH" ]]; then
221
- log "resume_checkpoint=$RESUME_PATH"
222
- else
223
- log "resume_checkpoint=<none found>"
224
- fi
225
- log "note=train.py consumes HYDRA_RESUME_CKPT and HYDRA_CKPT_INTERVAL env vars; launcher exports them automatically"
226
-
227
- if [[ "${HYDRA_USE_NEMOTRON:-0}" == "1" ]]; then
228
- # Streaming Nemotron path (Super3 recipe) pulls tokens directly from HF at
229
- # train-time via prepare_nemotron.make_dataloader. The disk-shard prepare.py
230
- # download phase is redundant in this mode and wastes 20-30 min of paid GPU
231
- # time on shard parquet transfers we'll never read.
232
- log "prepare_action=skip reason=HYDRA_USE_NEMOTRON=1 (streaming at train-time)"
233
- elif [[ "$NEED_PREPARE" -eq 1 ]]; then
234
- PREPARE_CMD=("${PYTHON_CMD[@]}" prepare.py --num-shards "$PREPARE_NUM_SHARDS" --download-workers "$DOWNLOAD_WORKERS")
235
- log "prepare_action=run command=${PREPARE_CMD[*]}"
236
- if [[ "$DRY_RUN" -eq 0 ]]; then
237
- "${PREPARE_CMD[@]}" 2>&1 | tee -a "$LOG_FILE"
238
- CURRENT_TRAIN_SHARDS="$(count_train_shards | tr -d ' ')"
239
- CURRENT_TOTAL_SHARDS="$(count_total_shards | tr -d ' ')"
240
- log "post_prepare train_shards=$CURRENT_TRAIN_SHARDS total_shards=$CURRENT_TOTAL_SHARDS"
241
- fi
242
- else
243
- log "prepare_action=skip reason=target_already_satisfied"
244
- fi
245
-
246
- TRAIN_CMD=("${PYTHON_CMD[@]}" -u train.py)
247
- if [[ "$SKIP_TRAIN" -eq 1 ]]; then
248
- log "train_action=skip reason=--skip-train"
249
- exit 0
250
- fi
251
-
252
- log "train_action=launch command=${TRAIN_CMD[*]}"
253
- if [[ "$DRY_RUN" -eq 1 ]]; then
254
- exit 0
255
- fi
256
-
257
- set +e
258
- "${TRAIN_CMD[@]}" 2>&1 | tee -a "$LOG_FILE"
259
- EXIT_CODE=${PIPESTATUS[0]}
260
- set -e
261
- log "train_exit_code=$EXIT_CODE"
262
- exit "$EXIT_CODE"
 
1
+ #!/usr/bin/env bash
2
+ # Domain-expanded streaming pretrain launcher for Feather/HYDRA.
3
+ #
4
+ # Usage:
5
+ # ./scripts/run_domain_expanded_pretrain.sh
6
+ # HYDRA_TARGET_SHARDS=2048 HYDRA_TIME_BUDGET=28800 ./scripts/run_domain_expanded_pretrain.sh
7
+ # ./scripts/run_domain_expanded_pretrain.sh --target-shards 1024 --dry-run
8
+ # ./scripts/run_domain_expanded_pretrain.sh --target-shards -1 --download-workers 16
9
+ #
10
+ # Behavior:
11
+ # - counts currently cached parquet shards in ~/.cache/autoresearch/data
12
+ # - optionally expands shard coverage toward a target via prepare.py
13
+ # - skips prepare.py entirely when target coverage is already satisfied
14
+ # - exports WSL CUDA library paths and long-run HYDRA_* env vars
15
+ # - prefers an existing latest/pretrain checkpoint path if one is present
16
+ # - streams stdout/stderr to a stable repo log: run_domain_expanded.log
17
+ set -euo pipefail
18
+
19
+ REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
20
+ cd "$REPO_ROOT"
21
+
22
+ CACHE_ROOT="${HYDRA_CACHE_ROOT:-$HOME/.cache/autoresearch}"
23
+ DATA_DIR="${HYDRA_DATA_DIR:-$CACHE_ROOT/data}"
24
+ CKPT_DIR="${HYDRA_CKPT_DIR:-$CACHE_ROOT/ckpts}"
25
+ LOG_FILE="${HYDRA_DOMAIN_EXPANDED_LOG:-$REPO_ROOT/run_domain_expanded.log}"
26
+ DEFAULT_TARGET_SHARDS="2048"
27
+ TARGET_SHARDS="${HYDRA_TARGET_SHARDS:-$DEFAULT_TARGET_SHARDS}"
28
+ DOWNLOAD_WORKERS="${HYDRA_DOWNLOAD_WORKERS:-8}"
29
+ DRY_RUN=0
30
+ SKIP_TRAIN=0
31
+ FORCE_PREPARE=0
32
+ NO_RESUME=0
33
+ EXPLICIT_RESUME_PATH="${HYDRA_RESUME_PATH:-}"
34
+
35
+ usage() {
36
+ sed -n '2,16p' "$0"
37
+ cat <<'EOF'
38
+
39
+ Options:
40
+ --target-shards N Target number of train shards to have locally (-1 = all)
41
+ --download-workers N Parallel workers for prepare.py downloads
42
+ --resume PATH Override auto-detected checkpoint path
43
+ --no-resume Ignore existing checkpoints
44
+ --skip-train Only ensure shard coverage, do not launch train.py
45
+ --force-prepare Run prepare.py even if target coverage is already satisfied
46
+ --dry-run Print planned actions without running prepare.py/train.py
47
+ -h, --help Show this help
48
+ EOF
49
+ }
50
+
51
+ while [[ $# -gt 0 ]]; do
52
+ case "$1" in
53
+ --target-shards)
54
+ TARGET_SHARDS="$2"
55
+ shift 2
56
+ ;;
57
+ --download-workers)
58
+ DOWNLOAD_WORKERS="$2"
59
+ shift 2
60
+ ;;
61
+ --resume)
62
+ EXPLICIT_RESUME_PATH="$2"
63
+ shift 2
64
+ ;;
65
+ --no-resume)
66
+ NO_RESUME=1
67
+ shift
68
+ ;;
69
+ --skip-train)
70
+ SKIP_TRAIN=1
71
+ shift
72
+ ;;
73
+ --force-prepare)
74
+ FORCE_PREPARE=1
75
+ shift
76
+ ;;
77
+ --dry-run)
78
+ DRY_RUN=1
79
+ shift
80
+ ;;
81
+ -h|--help)
82
+ usage
83
+ exit 0
84
+ ;;
85
+ *)
86
+ echo "Unknown option: $1" >&2
87
+ usage >&2
88
+ exit 2
89
+ ;;
90
+ esac
91
+ done
92
+
93
+ if ! [[ "$TARGET_SHARDS" =~ ^-?[0-9]+$ ]]; then
94
+ echo "Invalid --target-shards: $TARGET_SHARDS" >&2
95
+ exit 2
96
+ fi
97
+ if ! [[ "$DOWNLOAD_WORKERS" =~ ^[0-9]+$ ]] || [[ "$DOWNLOAD_WORKERS" -lt 1 ]]; then
98
+ echo "Invalid --download-workers: $DOWNLOAD_WORKERS" >&2
99
+ exit 2
100
+ fi
101
+
102
+ python_has_deps() {
103
+ local py="$1"
104
+ "$py" - <<'PY' >/dev/null 2>&1
105
+ import requests, pyarrow, rustbpe, torch
106
+ PY
107
+ }
108
+
109
+ if [[ -x "$REPO_ROOT/.venv/bin/python" ]] && python_has_deps "$REPO_ROOT/.venv/bin/python"; then
110
+ PYTHON_CMD=("$REPO_ROOT/.venv/bin/python")
111
+ elif command -v uv >/dev/null 2>&1; then
112
+ PYTHON_CMD=(uv run python)
113
+ elif command -v python3 >/dev/null 2>&1 && python_has_deps "$(command -v python3)"; then
114
+ PYTHON_CMD=(python3)
115
+ else
116
+ echo "No usable Python interpreter found with required deps (.venv/bin/python, uv run python, or python3)." >&2
117
+ exit 1
118
+ fi
119
+
120
+ count_train_shards() {
121
+ if [[ ! -d "$DATA_DIR" ]]; then
122
+ echo 0
123
+ return
124
+ fi
125
+ find "$DATA_DIR" -maxdepth 1 -type f -name 'shard_*.parquet' ! -name 'shard_06542.parquet' | wc -l
126
+ }
127
+
128
+ count_total_shards() {
129
+ if [[ ! -d "$DATA_DIR" ]]; then
130
+ echo 0
131
+ return
132
+ fi
133
+ find "$DATA_DIR" -maxdepth 1 -type f -name 'shard_*.parquet' | wc -l
134
+ }
135
+
136
+ resolve_resume_path() {
137
+ if [[ "$NO_RESUME" -eq 1 ]]; then
138
+ return 0
139
+ fi
140
+ if [[ -n "$EXPLICIT_RESUME_PATH" ]]; then
141
+ local expanded
142
+ expanded="${EXPLICIT_RESUME_PATH/#\~/$HOME}"
143
+ if [[ -f "$expanded" ]]; then
144
+ printf '%s\n' "$expanded"
145
+ return 0
146
+ fi
147
+ echo "Requested resume checkpoint not found: $expanded" >&2
148
+ exit 1
149
+ fi
150
+
151
+ local candidates=(
152
+ "$CKPT_DIR/latest.pt"
153
+ "$CKPT_DIR/pretrain_latest.pt"
154
+ "$CKPT_DIR/pretrain_final.pt"
155
+ "$CACHE_ROOT/latest.pt"
156
+ "$CACHE_ROOT/pretrain_latest.pt"
157
+ "$CACHE_ROOT/pretrain_final.pt"
158
+ "$REPO_ROOT/latest.pt"
159
+ "$REPO_ROOT/pretrain_final.pt"
160
+ )
161
+ local candidate
162
+ for candidate in "${candidates[@]}"; do
163
+ if [[ -f "$candidate" ]]; then
164
+ printf '%s\n' "$candidate"
165
+ return 0
166
+ fi
167
+ done
168
+ }
169
+
170
+ CURRENT_TRAIN_SHARDS="$(count_train_shards | tr -d ' ')"
171
+ CURRENT_TOTAL_SHARDS="$(count_total_shards | tr -d ' ')"
172
+ HAS_VAL=0
173
+ if [[ -f "$DATA_DIR/shard_06542.parquet" ]]; then
174
+ HAS_VAL=1
175
+ fi
176
+
177
+ PREPARE_NUM_SHARDS="$TARGET_SHARDS"
178
+ if [[ "$TARGET_SHARDS" -eq -1 ]]; then
179
+ TARGET_DESC="all available train shards"
180
+ NEED_PREPARE=1
181
+ elif [[ "$CURRENT_TRAIN_SHARDS" -ge "$TARGET_SHARDS" ]]; then
182
+ TARGET_DESC="$TARGET_SHARDS"
183
+ NEED_PREPARE="$FORCE_PREPARE"
184
+ else
185
+ TARGET_DESC="$TARGET_SHARDS"
186
+ NEED_PREPARE=1
187
+ fi
188
+
189
+ RESUME_PATH="$(resolve_resume_path || true)"
190
+
191
+ export LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
192
+ export HYDRA_TIME_BUDGET="${HYDRA_TIME_BUDGET:-28800}"
193
+ export HYDRA_TARGET_SHARDS="$TARGET_SHARDS"
194
+ export HYDRA_DOWNLOAD_WORKERS="$DOWNLOAD_WORKERS"
195
+ export HYDRA_DOMAIN_EXPANDED_LOG="$LOG_FILE"
196
+ export HYDRA_CKPT_INTERVAL="${HYDRA_CKPT_INTERVAL:-2000}"
197
+ export HYDRA_CHECKPOINT_INTERVAL="${HYDRA_CHECKPOINT_INTERVAL:-$HYDRA_CKPT_INTERVAL}"
198
+ if [[ -n "$RESUME_PATH" ]]; then
199
+ export HYDRA_RESUME_PATH="$RESUME_PATH"
200
+ export HYDRA_RESUME_CKPT="$RESUME_PATH"
201
+ fi
202
+
203
+ mkdir -p "$(dirname "$LOG_FILE")"
204
+
205
+ ts() { date '+%Y-%m-%d %H:%M:%S'; }
206
+ log() {
207
+ local line="[$(ts)] $*"
208
+ echo "$line"
209
+ echo "$line" >> "$LOG_FILE"
210
+ }
211
+
212
+ log "=== domain-expanded pretrain launcher ==="
213
+ log "repo_root=$REPO_ROOT"
214
+ log "data_dir=$DATA_DIR train_shards=$CURRENT_TRAIN_SHARDS total_shards=$CURRENT_TOTAL_SHARDS has_val=$HAS_VAL"
215
+ log "target_train_shards=$TARGET_DESC download_workers=$DOWNLOAD_WORKERS"
216
+ log "log_file=$LOG_FILE"
217
+ log "python=${PYTHON_CMD[*]}"
218
+ log "HYDRA_TIME_BUDGET=$HYDRA_TIME_BUDGET"
219
+ log "HYDRA_CKPT_INTERVAL=$HYDRA_CKPT_INTERVAL"
220
+ if [[ -n "$RESUME_PATH" ]]; then
221
+ log "resume_checkpoint=$RESUME_PATH"
222
+ else
223
+ log "resume_checkpoint=<none found>"
224
+ fi
225
+ log "note=train.py consumes HYDRA_RESUME_CKPT and HYDRA_CKPT_INTERVAL env vars; launcher exports them automatically"
226
+
227
+ if [[ "${HYDRA_USE_NEMOTRON:-0}" == "1" ]]; then
228
+ # Streaming Nemotron path (Super3 recipe) pulls tokens directly from HF at
229
+ # train-time via prepare_nemotron.make_dataloader. The disk-shard prepare.py
230
+ # download phase is redundant in this mode and wastes 20-30 min of paid GPU
231
+ # time on shard parquet transfers we'll never read.
232
+ log "prepare_action=skip reason=HYDRA_USE_NEMOTRON=1 (streaming at train-time)"
233
+ elif [[ "$NEED_PREPARE" -eq 1 ]]; then
234
+ PREPARE_CMD=("${PYTHON_CMD[@]}" prepare.py --num-shards "$PREPARE_NUM_SHARDS" --download-workers "$DOWNLOAD_WORKERS")
235
+ log "prepare_action=run command=${PREPARE_CMD[*]}"
236
+ if [[ "$DRY_RUN" -eq 0 ]]; then
237
+ "${PREPARE_CMD[@]}" 2>&1 | tee -a "$LOG_FILE"
238
+ CURRENT_TRAIN_SHARDS="$(count_train_shards | tr -d ' ')"
239
+ CURRENT_TOTAL_SHARDS="$(count_total_shards | tr -d ' ')"
240
+ log "post_prepare train_shards=$CURRENT_TRAIN_SHARDS total_shards=$CURRENT_TOTAL_SHARDS"
241
+ fi
242
+ else
243
+ log "prepare_action=skip reason=target_already_satisfied"
244
+ fi
245
+
246
+ TRAIN_CMD=("${PYTHON_CMD[@]}" -u train.py)
247
+ if [[ "$SKIP_TRAIN" -eq 1 ]]; then
248
+ log "train_action=skip reason=--skip-train"
249
+ exit 0
250
+ fi
251
+
252
+ log "train_action=launch command=${TRAIN_CMD[*]}"
253
+ if [[ "$DRY_RUN" -eq 1 ]]; then
254
+ exit 0
255
+ fi
256
+
257
+ set +e
258
+ "${TRAIN_CMD[@]}" 2>&1 | tee -a "$LOG_FILE"
259
+ EXIT_CODE=${PIPESTATUS[0]}
260
+ set -e
261
+ log "train_exit_code=$EXIT_CODE"
262
+ exit "$EXIT_CODE"
overlay/scripts/run_meta.sh CHANGED
@@ -1,13 +1,13 @@
1
- #!/usr/bin/env bash
2
- set -euo pipefail
3
-
4
- echo "=== HYDRA Meta-Agent ==="
5
- cd "$(dirname "$0")/.."
6
-
7
- echo "Running meta-agent iteration..."
8
- uv run python -c "
9
- from harness.meta_agent import run_meta_iteration
10
- import json
11
- result = run_meta_iteration()
12
- print(json.dumps(result, indent=2))
13
- "
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ echo "=== HYDRA Meta-Agent ==="
5
+ cd "$(dirname "$0")/.."
6
+
7
+ echo "Running meta-agent iteration..."
8
+ uv run python -c "
9
+ from harness.meta_agent import run_meta_iteration
10
+ import json
11
+ result = run_meta_iteration()
12
+ print(json.dumps(result, indent=2))
13
+ "
overlay/scripts/run_phase1.sh CHANGED
@@ -1,32 +1,32 @@
1
- #!/usr/bin/env bash
2
- set -euo pipefail
3
-
4
- echo "=== HYDRA Phase 1: Sequential Subsystem Bring-Up ==="
5
- cd "$(dirname "$0")/.."
6
-
7
- SUBSYSTEMS=("mamba3" "mhc" "engram" "hestia" "sdr")
8
-
9
- for sub in "${SUBSYSTEMS[@]}"; do
10
- echo ""
11
- echo "--- Subsystem: ${sub} ---"
12
- BRANCH="autoresearch/phase1-${sub}"
13
-
14
- # Create branch if it doesn't exist
15
- if ! git rev-parse --verify "${BRANCH}" &>/dev/null; then
16
- git checkout -b "${BRANCH}"
17
- else
18
- git checkout "${BRANCH}"
19
- fi
20
-
21
- echo "Running: uv run subsystems/train_${sub}.py"
22
- uv run "subsystems/train_${sub}.py" > "run_${sub}.log" 2>&1 || true
23
-
24
- # Extract result
25
- echo "Result:"
26
- grep "^val_bpb:" "run_${sub}.log" || echo " (crashed)"
27
- grep "^peak_vram_mb:" "run_${sub}.log" || true
28
- done
29
-
30
- echo ""
31
- echo "=== Phase 1 complete ==="
32
- git checkout main 2>/dev/null || git checkout master
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ echo "=== HYDRA Phase 1: Sequential Subsystem Bring-Up ==="
5
+ cd "$(dirname "$0")/.."
6
+
7
+ SUBSYSTEMS=("mamba3" "mhc" "engram" "hestia" "sdr")
8
+
9
+ for sub in "${SUBSYSTEMS[@]}"; do
10
+ echo ""
11
+ echo "--- Subsystem: ${sub} ---"
12
+ BRANCH="autoresearch/phase1-${sub}"
13
+
14
+ # Create branch if it doesn't exist
15
+ if ! git rev-parse --verify "${BRANCH}" &>/dev/null; then
16
+ git checkout -b "${BRANCH}"
17
+ else
18
+ git checkout "${BRANCH}"
19
+ fi
20
+
21
+ echo "Running: uv run subsystems/train_${sub}.py"
22
+ uv run "subsystems/train_${sub}.py" > "run_${sub}.log" 2>&1 || true
23
+
24
+ # Extract result
25
+ echo "Result:"
26
+ grep "^val_bpb:" "run_${sub}.log" || echo " (crashed)"
27
+ grep "^peak_vram_mb:" "run_${sub}.log" || true
28
+ done
29
+
30
+ echo ""
31
+ echo "=== Phase 1 complete ==="
32
+ git checkout main 2>/dev/null || git checkout master
overlay/scripts/run_phase2.sh CHANGED
@@ -1,25 +1,25 @@
1
- #!/usr/bin/env bash
2
- set -euo pipefail
3
-
4
- echo "=== HYDRA Phase 2: Integrated Autoresearch ==="
5
- cd "$(dirname "$0")/.."
6
-
7
- TAG="${1:-$(date +%b%d | tr '[:upper:]' '[:lower:]')}"
8
-
9
- # Validate tag: only alphanumeric, hyphens, underscores, dots
10
- if [[ ! "${TAG}" =~ ^[a-zA-Z0-9._-]+$ ]]; then
11
- echo "Error: invalid tag '${TAG}'. Use only alphanumeric, hyphens, underscores, dots." >&2
12
- exit 1
13
- fi
14
-
15
- BRANCH="autoresearch/${TAG}"
16
-
17
- if ! git rev-parse --verify "${BRANCH}" &>/dev/null; then
18
- git checkout -b -- "${BRANCH}"
19
- else
20
- git checkout -- "${BRANCH}"
21
- fi
22
-
23
- echo "Branch: ${BRANCH}"
24
- echo "Starting orchestrator..."
25
- uv run -m harness.orchestrator
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ echo "=== HYDRA Phase 2: Integrated Autoresearch ==="
5
+ cd "$(dirname "$0")/.."
6
+
7
+ TAG="${1:-$(date +%b%d | tr '[:upper:]' '[:lower:]')}"
8
+
9
+ # Validate tag: only alphanumeric, hyphens, underscores, dots
10
+ if [[ ! "${TAG}" =~ ^[a-zA-Z0-9._-]+$ ]]; then
11
+ echo "Error: invalid tag '${TAG}'. Use only alphanumeric, hyphens, underscores, dots." >&2
12
+ exit 1
13
+ fi
14
+
15
+ BRANCH="autoresearch/${TAG}"
16
+
17
+ if ! git rev-parse --verify "${BRANCH}" &>/dev/null; then
18
+ git checkout -b -- "${BRANCH}"
19
+ else
20
+ git checkout -- "${BRANCH}"
21
+ fi
22
+
23
+ echo "Branch: ${BRANCH}"
24
+ echo "Starting orchestrator..."
25
+ uv run -m harness.orchestrator
overlay/scripts/run_tps_gate.sh ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ # Run a reproducible throughput gate.
5
+ # Default gate: 50k TPS steady-state.
6
+ #
7
+ # Usage:
8
+ # bash scripts/run_tps_gate.sh [config] [seconds] [min_tps]
9
+ # Example:
10
+ # bash scripts/run_tps_gate.sh baseline 300 50000
11
+
12
+ CONFIG="${1:-baseline}"
13
+ SECONDS_BUDGET="${2:-300}"
14
+ MIN_TPS="${3:-50000}"
15
+
16
+ echo "[tps-gate] config=$CONFIG seconds=$SECONDS_BUDGET min_tps=$MIN_TPS"
17
+
18
+ python scripts/benchmark_hyena_stack.py \
19
+ --config "$CONFIG" \
20
+ --time "$SECONDS_BUDGET" \
21
+ --min-tps "$MIN_TPS"
22
+
23
+ echo "[tps-gate] PASS"
overlay/scripts/setup.sh CHANGED
@@ -1,27 +1,28 @@
1
- #!/usr/bin/env bash
2
- set -euo pipefail
3
-
4
- echo "=== HYDRA Setup ==="
5
- echo ""
6
-
7
- # Check uv
8
- if ! command -v uv &>/dev/null; then
9
- echo "Installing uv..."
10
- curl -LsSf https://astral.sh/uv/install.sh | sh
11
- fi
12
-
13
- # Install Python dependencies
14
- echo "Installing Python dependencies..."
15
- cd "$(dirname "$0")/.."
16
- uv sync
17
-
18
- # Prepare data (download shards + train tokenizer)
19
- echo ""
20
- echo "Preparing data (this may take a few minutes on first run)..."
21
- uv run prepare.py --num-shards 10
22
-
23
- echo ""
24
- echo "=== Setup complete ==="
25
- echo "Run experiments with: uv run train.py"
26
- echo "Run orchestrator with: uv run -m harness.orchestrator"
27
- echo "Run Phase 1 subsystems with: bash scripts/run_phase1.sh"
 
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ echo "=== HYDRA Setup ==="
5
+ echo ""
6
+
7
+ # Check uv
8
+ if ! command -v uv &>/dev/null; then
9
+ echo "Installing uv..."
10
+ curl -LsSf https://astral.sh/uv/install.sh | sh
11
+ fi
12
+
13
+ # Install Python dependencies
14
+ echo "Installing Python dependencies..."
15
+ cd "$(dirname "$0")/.."
16
+ uv sync
17
+
18
+ # Prepare data (download shards + train tokenizer)
19
+ echo ""
20
+ echo "Preparing data (this may take a few minutes on first run)..."
21
+ uv run prepare.py --num-shards 10
22
+
23
+ echo ""
24
+ echo "=== Setup complete ==="
25
+ echo "Run experiments with: uv run train.py"
26
+ echo "Run orchestrator with: uv run -m harness.orchestrator"
27
+ echo "Run Phase 1 subsystems with: bash scripts/run_phase1.sh"
28
+ echo "For WSL/CUDA throughput gate: see docs/WSL_TPS_RUNBOOK.md"
overlay/scripts/strip_optimizer_state.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Strip optimizer_state_dict from a checkpoint, keeping only model weights
2
+ and config metadata.
3
+
4
+ Reason: resuming via training.py's standard path restores the optimizer state,
5
+ which (in our 6GB / Muon-compile / bf16 setup) reproducibly produces a
6
+ NaN/>100-loss on the first forward after load. Reloading model weights
7
+ only and letting the optimizer initialize fresh sidesteps the issue.
8
+
9
+ Output checkpoint also resets `step`, `train_seconds`, `smoothed_loss`, `bpt_ema`, `epoch` so the LR
10
+ schedule and warmup restart from zero — useful when we want to fine-tune
11
+ the trained weights at a new schedule length.
12
+ """
13
+ import sys, torch
14
+
15
+ src, dst = sys.argv[1], sys.argv[2]
16
+ ckpt = torch.load(src, map_location="cpu", weights_only=False)
17
+ keep = {
18
+ "model_state_dict": ckpt.get("model_state_dict", ckpt),
19
+ "config": ckpt.get("config"),
20
+ # Reset training progress markers so the LR schedule warms up cleanly.
21
+ "step": 0,
22
+ "train_seconds": 0.0,
23
+ "smoothed_loss": 0.0,
24
+ "bpt_ema": 0.0,
25
+ "epoch": 0,
26
+ }
27
+ # Explicitly do NOT copy optimizer_state_dict.
28
+ torch.save(keep, dst)
29
+ print(f"Stripped -> {dst} (orig {sum(1 for _ in ckpt)} keys, kept {len(keep)})")
overlay/scripts/sweep_depth_aggregate.py CHANGED
@@ -11,16 +11,56 @@ Usage:
11
  """
12
  from __future__ import annotations
13
 
14
- import json
15
- import os
16
- import sys
17
- from pathlib import Path
18
-
19
- MANIFEST = Path(sys.argv[1] if len(sys.argv) > 1 else '/tmp/sweep_depth_manifest.txt')
20
-
21
-
22
- def fetch_metrics_from_job(job_id: str) -> dict | None:
23
- """Fetch HF Job stdout and parse the [METRICS_JSON] line."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  try:
25
  from huggingface_hub import HfApi # type: ignore
26
  except Exception as e:
@@ -33,41 +73,73 @@ def fetch_metrics_from_job(job_id: str) -> dict | None:
33
  print(f'[agg] could not fetch logs for job={job_id}: {e}', file=sys.stderr)
34
  return None
35
 
36
- last_json = None
37
- for line in logs_stream:
38
- # HfApi returns strings or JobLogEntry-like objects depending on version.
39
- text = getattr(line, 'data', None) or str(line)
40
- if '[METRICS_JSON]' in text:
41
- payload = text.split('[METRICS_JSON]', 1)[1].strip()
42
- try:
43
- last_json = json.loads(payload)
44
- except Exception:
45
- # Might be truncated on a line boundary — keep looking.
46
- pass
47
- return last_json
48
-
49
-
50
- def compare(results: dict[int, dict]) -> None:
51
- """Pretty-print comparison across n_layer values."""
52
- if not results:
53
- print('[agg] no results')
54
- return
55
- sorted_n = sorted(results.keys())
56
-
57
- # Top-level scalars
58
- print('\n=== Top-level scalars ===')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  hdr = ['metric'] + [f'L={n}' for n in sorted_n]
60
  print(' '.join(f'{h:>14}' for h in hdr))
61
- for key in ('val_bpb', 'val_ppl', 'num_params_M', 'total_tokens_M',
62
- 'training_seconds', 'peak_vram_mb', 'sdr_target_active',
63
- 'htm_anomaly', 'engram_hit_rate', 'sdr_active_bits'):
64
- row = [key] + [f'{results[n].get(key, float("nan")):.4f}' if isinstance(results[n].get(key), (int, float)) else 'n/a' for n in sorted_n]
65
- print(' '.join(f'{c:>14}' for c in row))
 
66
 
67
  # Per-layer panel — one table per metric.
68
  print('\n=== Per-layer: delta_ratio (residual contribution) ===')
69
  print(' '.join(['layer'] + [f'L={n:>2}' for n in sorted_n]))
70
- max_depth = max(results[n].get('n_layer', 0) for n in sorted_n)
71
  for li in range(max_depth):
72
  row = [f'L{li:02d}']
73
  for n in sorted_n:
@@ -104,16 +176,40 @@ def compare(results: dict[int, dict]) -> None:
104
 
105
  # Dead-layer detection
106
  print('\n=== Dead-layer detection (delta_ratio < 0.02) ===')
107
- for n in sorted_n:
108
- r = results[n]
109
- n_layer = r.get('n_layer', 0)
110
  dead = []
111
  for li in range(n_layer):
112
  v = r.get(f'layer_{li}_delta_ratio')
113
  if isinstance(v, (int, float)) and v < 0.02:
114
  dead.append(li)
115
- status = 'ALL LIVE' if not dead else f'DEAD LAYERS: {dead}'
116
- print(f' n_layer={n:2d} val_bpb={r.get("val_bpb", float("nan")):.4f} {status}')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
 
119
  def main() -> int:
@@ -134,7 +230,7 @@ def main() -> int:
134
  jobs[n_layer] = job_id
135
 
136
  print(f'[agg] reading {len(jobs)} jobs from {MANIFEST}')
137
- results: dict[int, dict] = {}
138
  for n, jid in jobs.items():
139
  print(f'[agg] fetching job={jid} (n_layer={n}) ...')
140
  m = fetch_metrics_from_job(jid)
 
11
  """
12
  from __future__ import annotations
13
 
14
+ import json
15
+ import os
16
+ import statistics
17
+ import re
18
+ import sys
19
+ from pathlib import Path
20
+
21
+ from configs.harness_config import HarnessConfig
22
+
23
+ type MetricValue = float | int | str | bool | None
24
+ type MetricsDict = dict[str, MetricValue]
25
+
26
+ MANIFEST = Path(sys.argv[1] if len(sys.argv) > 1 else '/tmp/sweep_depth_manifest.txt')
27
+ STEP_TPS_PATTERN = re.compile(r"step=(\d+).*?\btps=(\d+)\b")
28
+ MIN_TPS = float(os.environ.get('SWEEP_MIN_TPS', '0'))
29
+
30
+
31
+ def _zero_shot_score(result: MetricsDict) -> float:
32
+ """Composite quality score for tie-breaking among BPB-near runs."""
33
+ factual = float(result.get('factual_english_score', 0.0) or 0.0)
34
+ instruction = float(result.get('instruction_following_score', 0.0) or 0.0)
35
+ distinct_2 = float(result.get('distinct_2', 0.0) or 0.0)
36
+ repetition = float(result.get('repetition_rate', 0.0) or 0.0)
37
+ return factual + instruction + distinct_2 - repetition
38
+
39
+
40
+ def _metric_float(result: MetricsDict, key: str, default: float = 0.0) -> float:
41
+ value = result.get(key, default)
42
+ return float(value) if isinstance(value, (int, float)) else default
43
+
44
+
45
+ def _metric_int(result: MetricsDict, key: str, default: int = 0) -> int:
46
+ value = result.get(key, default)
47
+ return int(value) if isinstance(value, int) else default
48
+
49
+
50
+ def _percentile_linear(sorted_values: list[float], pct: float) -> float:
51
+ if not sorted_values:
52
+ return 0.0
53
+ if len(sorted_values) == 1:
54
+ return sorted_values[0]
55
+ rank = (len(sorted_values) - 1) * (pct / 100.0)
56
+ lo = int(rank)
57
+ hi = min(lo + 1, len(sorted_values) - 1)
58
+ frac = rank - lo
59
+ return sorted_values[lo] * (1.0 - frac) + sorted_values[hi] * frac
60
+
61
+
62
+ def fetch_metrics_from_job(job_id: str) -> MetricsDict | None:
63
+ """Fetch HF Job stdout and parse the [METRICS_JSON] line."""
64
  try:
65
  from huggingface_hub import HfApi # type: ignore
66
  except Exception as e:
 
73
  print(f'[agg] could not fetch logs for job={job_id}: {e}', file=sys.stderr)
74
  return None
75
 
76
+ last_json = None
77
+ tps_samples: list[tuple[int, int]] = []
78
+ warmup_steps = 25
79
+ for line in logs_stream:
80
+ # HfApi returns strings or JobLogEntry-like objects depending on version.
81
+ text = getattr(line, 'data', None) or str(line)
82
+
83
+ wm = re.search(r"\[TPS_GUARD\] enabled .*?warmup_steps=(\d+)", text)
84
+ if wm:
85
+ warmup_steps = int(wm.group(1))
86
+
87
+ sm = STEP_TPS_PATTERN.search(text)
88
+ if sm:
89
+ tps_samples.append((int(sm.group(1)), int(sm.group(2))))
90
+
91
+ if '[METRICS_JSON]' in text:
92
+ payload = text.split('[METRICS_JSON]', 1)[1].strip()
93
+ try:
94
+ last_json = json.loads(payload)
95
+ except Exception:
96
+ # Might be truncated on a line boundary — keep looking.
97
+ pass
98
+ if last_json is None:
99
+ return None
100
+
101
+ steady_tps = [float(tps) for step, tps in tps_samples if step >= warmup_steps]
102
+ if not steady_tps:
103
+ steady_tps = [float(tps) for _, tps in tps_samples]
104
+ if steady_tps:
105
+ sorted_tps = sorted(steady_tps)
106
+ last_json['tps_samples'] = len(steady_tps)
107
+ last_json['tps_median'] = float(statistics.median(steady_tps))
108
+ last_json['tps_p10'] = float(_percentile_linear(sorted_tps, 10.0))
109
+ last_json['tps_min'] = float(sorted_tps[0])
110
+ last_json['tps_max'] = float(sorted_tps[-1])
111
+ last_json['tps_warmup_steps'] = int(warmup_steps)
112
+
113
+ return last_json
114
+
115
+
116
+ def compare(results: dict[int, MetricsDict]) -> None:
117
+ """Pretty-print comparison across n_layer values."""
118
+ if not results:
119
+ print('[agg] no results')
120
+ return
121
+ sorted_n = sorted(results.keys())
122
+ secondary_gates = HarnessConfig().to_secondary_gates()
123
+
124
+ print('\n=== Active secondary gates ===')
125
+ for metric, thresholds in sorted(secondary_gates.items()):
126
+ print(f' {metric}: {json.dumps(thresholds, sort_keys=True)}')
127
+
128
+ # Top-level scalars
129
+ print('\n=== Top-level scalars ===')
130
  hdr = ['metric'] + [f'L={n}' for n in sorted_n]
131
  print(' '.join(f'{h:>14}' for h in hdr))
132
+ for key in ('val_bpb', 'val_ppl', 'num_params_M', 'total_tokens_M',
133
+ 'training_seconds', 'peak_vram_mb', 'sdr_target_active',
134
+ 'htm_anomaly', 'engram_hit_rate', 'sdr_active_bits',
135
+ 'tps_median', 'tps_p10', 'tps_min', 'tps_max', 'tps_samples'):
136
+ row = [key] + [f'{results[n].get(key, float("nan")):.4f}' if isinstance(results[n].get(key), (int, float)) else 'n/a' for n in sorted_n]
137
+ print(' '.join(f'{c:>14}' for c in row))
138
 
139
  # Per-layer panel — one table per metric.
140
  print('\n=== Per-layer: delta_ratio (residual contribution) ===')
141
  print(' '.join(['layer'] + [f'L={n:>2}' for n in sorted_n]))
142
+ max_depth = max(_metric_int(results[n], 'n_layer', 0) for n in sorted_n)
143
  for li in range(max_depth):
144
  row = [f'L{li:02d}']
145
  for n in sorted_n:
 
176
 
177
  # Dead-layer detection
178
  print('\n=== Dead-layer detection (delta_ratio < 0.02) ===')
179
+ for n in sorted_n:
180
+ r = results[n]
181
+ n_layer = _metric_int(r, 'n_layer', 0)
182
  dead = []
183
  for li in range(n_layer):
184
  v = r.get(f'layer_{li}_delta_ratio')
185
  if isinstance(v, (int, float)) and v < 0.02:
186
  dead.append(li)
187
+ status = 'ALL LIVE' if not dead else f'DEAD LAYERS: {dead}'
188
+ print(f' n_layer={n:2d} val_bpb={r.get("val_bpb", float("nan")):.4f} {status}')
189
+
190
+ print('\n=== Throughput-constrained ranking ===')
191
+ ranked = sorted(
192
+ ((n, r) for n, r in results.items() if isinstance(r.get('val_bpb'), (int, float))),
193
+ key=lambda x: (
194
+ (MIN_TPS > 0) and (_metric_float(x[1], 'tps_median', 0.0) < MIN_TPS),
195
+ _metric_float(x[1], 'val_bpb', float('inf')),
196
+ -_zero_shot_score(x[1]),
197
+ ),
198
+ )
199
+ feasible_count = 0
200
+ for n, r in ranked:
201
+ tps_median = _metric_float(r, 'tps_median', 0.0)
202
+ feasible = (MIN_TPS <= 0) or (tps_median >= MIN_TPS)
203
+ zero_shot_score = _zero_shot_score(r)
204
+ if feasible:
205
+ feasible_count += 1
206
+ print(
207
+ f" n_layer={n:2d} val_bpb={_metric_float(r, 'val_bpb', float('nan')):.4f} "
208
+ f"tps_median={tps_median:.0f} zero_shot_score={zero_shot_score:.4f} feasible={feasible}",
209
+ flush=True,
210
+ )
211
+ if MIN_TPS > 0:
212
+ print(f"[agg] throughput gate: tps_median >= {MIN_TPS:.0f}; feasible={feasible_count}/{len(ranked)}")
213
 
214
 
215
  def main() -> int:
 
230
  jobs[n_layer] = job_id
231
 
232
  print(f'[agg] reading {len(jobs)} jobs from {MANIFEST}')
233
+ results: dict[int, MetricsDict] = {}
234
  for n, jid in jobs.items():
235
  print(f'[agg] fetching job={jid} (n_layer={n}) ...')
236
  m = fetch_metrics_from_job(jid)
overlay/scripts/sweep_depth_local.sh CHANGED
@@ -1,62 +1,62 @@
1
- #!/usr/bin/env bash
2
- # Local sequential depth sweep on RTX 3060.
3
- # Uses real mamba_ssm Mamba3 (grafted from state-spaces/mamba main).
4
- # Config: Gen 76 local champion (d_model=96, engram=4096, target_active=327),
5
- # sweeping n_layer ∈ {1, 2, 3, 4}. Each run 300s (~5 min) → ~20 min total.
6
-
7
- set -euo pipefail
8
- cd "$(dirname "${BASH_SOURCE[0]}")/.."
9
-
10
- export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
11
- # WSL2: libcuda.so.1 lives at /usr/lib/wsl/lib; prepend it so cudarc finds the
12
- # CUDA driver library at runtime.
13
- export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:/usr/lib/wsl/lib:${LD_LIBRARY_PATH:-}
14
- export PYTORCH_ALLOC_CONF=expandable_segments:True
15
-
16
- # GPU HTM path: use non-fused step_many_cuda (fused megakernel is Hopper-only).
17
- # This drops htm_await from ~20-40s/step (CPU) to ~0ms (GPU, async).
18
- export HYDRA_HTM_FUSED=0
19
-
20
- # Architecture (Gen 76 + user audit: keep target_active=327 for gradient plasticity).
21
- export HYDRA_D_MODEL=96
22
- export HYDRA_D_STATE=16
23
- export HYDRA_HEADDIM=12
24
- export HYDRA_EXPAND=3
25
- export HYDRA_ENGRAM_N_COLUMNS=4096
26
- export HYDRA_SDR_TARGET_ACTIVE=327
27
-
28
- # Training knobs tuned for 6GB VRAM.
29
- export HYDRA_BATCH_SIZE=1
30
- export HYDRA_TOTAL_BATCH=32768 # 1 * 8 accum * 512 seq * 8 heads = Gen 76 config
31
- export HYDRA_TIME_BUDGET=300 # 5 min per run
32
- export HYDRA_CKPT_INTERVAL=0 # don't save ckpts during sweep
33
- export HYDRA_MID_VAL_INTERVAL=250
34
-
35
- # Full per-layer diagnostic panel.
36
- export HYDRA_LAYER_DIAGNOSTICS=1
37
- export HYDRA_LAYER_DIAG_SVD_EVERY=100
38
-
39
- # Use cached shards + tokenizer + retina (vocab=8192, target_active=327).
40
- # NOT streaming — already have 2049 shards from prior local runs.
41
- unset HYDRA_USE_NEMOTRON
42
-
43
- PY=/home/mikeb/work/feather/.venv/bin/python3
44
- OUT_DIR=/tmp/local_sweep
45
- mkdir -p "$OUT_DIR"
46
-
47
- for N in 1 2 3 4; do
48
- echo "=========================================="
49
- echo "=== n_layer=$N $(date +%H:%M:%S) ==="
50
- echo "=========================================="
51
- export HYDRA_N_LAYER=$N
52
- export HYDRA_METRICS_OUT="$OUT_DIR/sweep_n${N}_metrics.json"
53
- LOG="$OUT_DIR/sweep_n${N}.log"
54
- "$PY" -u train.py > "$LOG" 2>&1 || echo "[WARN] n_layer=$N run exited non-zero (see $LOG)"
55
- echo "=== n_layer=$N done; metrics=$HYDRA_METRICS_OUT log=$LOG ==="
56
- # Quick tail of the important lines
57
- grep -E "val_bpb|LAYER_DIAG|METRICS_JSON" "$LOG" | tail -20 || true
58
- done
59
-
60
- echo ""
61
- echo "=== SWEEP COMPLETE ==="
62
- ls -la "$OUT_DIR"
 
1
#!/usr/bin/env bash
# Local sequential depth sweep on RTX 3060.
# Uses real mamba_ssm Mamba3 (grafted from state-spaces/mamba main).
# Config: Gen 76 local champion (d_model=96, engram=4096, target_active=327),
# sweeping n_layer ∈ {1, 2, 3, 4}. Each run 300s (~5 min) → ~20 min total.
#
# Optional environment overrides:
#   SWEEP_PY       Python interpreter to run train.py with
#                  (default: /home/mikeb/work/feather/.venv/bin/python3)
#   SWEEP_OUT_DIR  Directory for per-depth logs and metrics
#                  (default: /tmp/local_sweep)

set -euo pipefail
cd "$(dirname "${BASH_SOURCE[0]}")/.."

export CUDA_HOME=${CUDA_HOME:-/usr/local/cuda}
# WSL2: libcuda.so.1 lives at /usr/lib/wsl/lib; put it ahead of any inherited
# LD_LIBRARY_PATH so cudarc finds the CUDA driver library at runtime.
export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:/usr/lib/wsl/lib:${LD_LIBRARY_PATH:-}
# NOTE(review): the sibling train_champion_*.sh scripts set
# PYTORCH_CUDA_ALLOC_CONF instead; confirm which name the installed PyTorch reads.
export PYTORCH_ALLOC_CONF=expandable_segments:True

# GPU HTM path: use non-fused step_many_cuda (fused megakernel is Hopper-only).
# This drops htm_await from ~20-40s/step (CPU) to ~0ms (GPU, async).
export HYDRA_HTM_FUSED=0

# Architecture (Gen 76 + user audit: keep target_active=327 for gradient plasticity).
export HYDRA_D_MODEL=96
export HYDRA_D_STATE=16
export HYDRA_HEADDIM=12
export HYDRA_EXPAND=3
export HYDRA_ENGRAM_N_COLUMNS=4096
export HYDRA_SDR_TARGET_ACTIVE=327

# Training knobs tuned for 6GB VRAM.
export HYDRA_BATCH_SIZE=1
export HYDRA_TOTAL_BATCH=32768   # 1 * 8 accum * 512 seq * 8 heads = Gen 76 config
export HYDRA_TIME_BUDGET=300     # 5 min per run
export HYDRA_CKPT_INTERVAL=0     # don't save ckpts during sweep
export HYDRA_MID_VAL_INTERVAL=250

# Full per-layer diagnostic panel.
export HYDRA_LAYER_DIAGNOSTICS=1
export HYDRA_LAYER_DIAG_SVD_EVERY=100

# Use cached shards + tokenizer + retina (vocab=8192, target_active=327).
# NOT streaming — already have 2049 shards from prior local runs.
unset HYDRA_USE_NEMOTRON

PY=${SWEEP_PY:-/home/mikeb/work/feather/.venv/bin/python3}
OUT_DIR=${SWEEP_OUT_DIR:-/tmp/local_sweep}

# Fail fast: the per-run `|| echo WARN` below tolerates training failures, so
# a missing interpreter would otherwise "complete" the sweep with no results.
if [ ! -x "$PY" ]; then
  echo "[ERROR] python interpreter not found or not executable: $PY" >&2
  exit 1
fi

mkdir -p "$OUT_DIR"

for N in 1 2 3 4; do
  echo "=========================================="
  echo "=== n_layer=$N  $(date +%H:%M:%S) ==="
  echo "=========================================="
  export HYDRA_N_LAYER=$N
  export HYDRA_METRICS_OUT="$OUT_DIR/sweep_n${N}_metrics.json"
  LOG="$OUT_DIR/sweep_n${N}.log"
  # A failing depth must not abort the remaining depths under `set -e`.
  "$PY" -u train.py > "$LOG" 2>&1 || echo "[WARN] n_layer=$N run exited non-zero (see $LOG)"
  echo "=== n_layer=$N done; metrics=$HYDRA_METRICS_OUT log=$LOG ==="
  # Quick tail of the important lines (grep exits 1 when nothing matches).
  grep -E "val_bpb|LAYER_DIAG|METRICS_JSON" "$LOG" | tail -20 || true
done

echo ""
echo "=== SWEEP COMPLETE ==="
ls -la "$OUT_DIR"
overlay/scripts/train_champion_12h.sh ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# 12-hour champion training run. Config matches autoresearch iter.sh base
# after 61 mutation experiments identified the Pareto-optimal knobs.
#
# Champion config (train_bpb ~1.6169 at 10-min budget, 29.7k tps):
#   d_model=160, n_layer=20, B=8, seq=1024
#   engram=16384, z_loss=0.001, no GDN (pure Mamba3 stack)
#   TIME_BUDGET=43200s (12 hours)
#   CKPT_INTERVAL=500 steps
#   NOTE(review): this comment previously read "~every 15 min at ~30 steps/s";
#   500 steps at 30 steps/s would be ~17 s, so the rate was presumably meant
#   as ~30k tps — confirm the real checkpoint cadence.
#
# Assumes .omc/autoresearch_STOP sentinel is present (cron loop disabled).
# Output goes to run_champion_12h.log in repo root.
#
# Exit status: 1 when a precondition fails, otherwise train.py's exit status.

set -u
REPO=/home/mikeb/work/feather
# Everything below uses repo-relative paths (log file, .venv, train.py), so
# never continue from the wrong directory.
cd "$REPO" || { echo "ERROR: cannot cd to $REPO"; exit 1; }

# Bail if autoresearch loop sentinel not set (would conflict)
if [ ! -f "$REPO/.omc/autoresearch_STOP" ]; then
  echo "ERROR: .omc/autoresearch_STOP not present — autoresearch cron still active."
  echo "Run: touch $REPO/.omc/autoresearch_STOP"
  exit 1
fi

# Bail if another training is running
if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {found=1} END {exit !found}'; then
  echo "ERROR: another python train.py is already running"
  exit 1
fi

rm -f run_champion_12h.log
env \
  LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
  HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
  HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
  HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
  HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
  HYDRA_TIME_BUDGET=43200 \
  HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
  HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
  HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
  HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
  HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
  HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
  HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
  HYDRA_Z_LOSS_WEIGHT=0.001 \
  HYDRA_RESUME_CKPT=none \
  ./.venv/bin/python -u train.py > run_champion_12h.log 2>&1
# Capture the status first: `echo` would otherwise reset $? and the script
# would always report success to its caller.
rc=$?
echo "exit=$rc"
exit "$rc"
overlay/scripts/train_champion_5h.sh ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# 5-hour champion training — fresh start with properly-timed cosine schedule.
#
# Why not 12h: at 12h budget, the cosine LR stays near peak for the first
# ~6h, leaving the model thrashing around bpb~1.72 (plateau observed).
# The schedule is stretched too thin.
#
# Why 5h: 18000s is long enough to build capacity (~17000 steps at 30k tps)
# while letting the cosine actually decay to zero within the window. The
# "cooling" phase (last 20% = 1h) is where the bpb drops sharply below
# the 10-min champion's 1.62.
#
# Why not resume from latest.pt: the saved ckpt triggers NaN on first
# forward after resume (reproducible; ckpt/optimizer state incompatibility
# not worth debugging — fresh start is faster).
#
# Output goes to run_champion_5h.log in repo root.
# Exit status: 1 when a precondition fails, otherwise train.py's exit status.

set -u
REPO=/home/mikeb/work/feather
# Everything below uses repo-relative paths (log file, .venv, train.py), so
# never continue from the wrong directory.
cd "$REPO" || { echo "ERROR: cannot cd to $REPO"; exit 1; }

# Bail if another training is running
if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {f=1} END{exit !f}'; then
  echo "ERROR: another python train.py is running"
  exit 1
fi

rm -f run_champion_5h.log
env \
  LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
  HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
  HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
  HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
  HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
  HYDRA_TIME_BUDGET=18000 \
  HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
  HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
  HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
  HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
  HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
  HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
  HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
  HYDRA_Z_LOSS_WEIGHT=0.001 \
  HYDRA_RESUME_CKPT=none \
  ./.venv/bin/python -u train.py > run_champion_5h.log 2>&1
# Capture the status first: `echo` would otherwise reset $? and the script
# would always report success to its caller.
rc=$?
echo "exit=$rc"
exit "$rc"
overlay/scripts/train_champion_resume.sh ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Resume the original 12h run from its step-5000 checkpoint with the SAME
# budget (43200s). This keeps the optimizer state and LR schedule identical
# to what was running at ckpt save, so there's no mismatch between loaded
# momentum and new lr.
#
# Intent: validate that the resume path itself works (vs the failed warmstart
# attempts where budget change caused NaN on first step).
#
# Output goes to run_champion_resume.log in repo root.
# Exit status: 1 when a precondition fails, otherwise train.py's exit status.

set -u
REPO=/home/mikeb/work/feather
# Everything below uses repo-relative paths (log file, .venv, train.py), so
# never continue from the wrong directory.
cd "$REPO" || { echo "ERROR: cannot cd to $REPO"; exit 1; }

# Bail if another training is running
if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {f=1} END{exit !f}'; then
  echo "ERROR: another python train.py is running"
  exit 1
fi

# This run exists to exercise the resume path, so a missing checkpoint is a
# hard error rather than something to discover in the training log.
CKPT=/home/mikeb/.cache/autoresearch/latest.pt
if [ ! -f "$CKPT" ]; then
  echo "ERROR: $CKPT missing."
  exit 1
fi

rm -f run_champion_resume.log
env \
  LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
  HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
  HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
  HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
  HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
  HYDRA_TIME_BUDGET=43200 \
  HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
  HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
  HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
  HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
  HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
  HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
  HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
  HYDRA_Z_LOSS_WEIGHT=0.001 \
  HYDRA_RESUME_CKPT="$CKPT" \
  ./.venv/bin/python -u train.py > run_champion_resume.log 2>&1
# Capture the status first: `echo` would otherwise reset $? and the script
# would always report success to its caller.
rc=$?
echo "exit=$rc"
exit "$rc"
overlay/scripts/train_champion_resume_clean.sh ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Resume training from weights-only ckpt (optimizer state stripped) to
# avoid the reproducible NaN that plain resume triggers.
#
# The step/train_seconds/epoch are also reset to 0 so the LR schedule
# warmup runs cleanly and cosine decay matches the new TIME_BUDGET.
# Model weights carry over ~2500 steps of prior training.
#
# Output goes to run_champion_resume_clean.log in repo root.
# Exit status: 1 when a precondition fails, otherwise train.py's exit status.

set -u
REPO=/home/mikeb/work/feather
# Everything below uses repo-relative paths (log file, .venv, train.py), so
# never continue from the wrong directory.
cd "$REPO" || { echo "ERROR: cannot cd to $REPO"; exit 1; }

# Bail if another training is running
if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {f=1} END{exit !f}'; then
  echo "ERROR: another python train.py is running"
  exit 1
fi

# The stripped checkpoint is produced by a separate step; refuse to start
# without it.
CKPT=/home/mikeb/.cache/autoresearch/weights_only_clean.pt
if [ ! -f "$CKPT" ]; then
  echo "ERROR: $CKPT missing. Run scripts/strip_optimizer_state.py first."
  exit 1
fi

rm -f run_champion_resume_clean.log
env \
  LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
  HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
  HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
  HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
  HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
  HYDRA_TIME_BUDGET=18000 \
  HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
  HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
  HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
  HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
  HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
  HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
  HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
  HYDRA_Z_LOSS_WEIGHT=0.001 \
  HYDRA_RESUME_CKPT="$CKPT" \
  ./.venv/bin/python -u train.py > run_champion_resume_clean.log 2>&1
# Capture the status first: `echo` would otherwise reset $? and the script
# would always report success to its caller.
rc=$?
echo "exit=$rc"
exit "$rc"
overlay/scripts/train_champion_v2.sh ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Champion training v2 — fixes data pipeline + mode collapse.
#
# Diagnosis from step-3500 ckpt sampling:
#   - Greedy decoding collapses to "a whole grains, etc." attractor
#   - Top-p produces grammatical but factually-empty text
#   - Token cache being built on-the-fly; blend sources were silently
#     unavailable because HYDRA_LOCAL_SHARDS_ONLY=1 + no cached parquets
#   - FULL_BLEND has only 4 active sources (fineweb-edu, wikipedia,
#     cosmopedia, fineweb), all weight-0 for code/math
#
# Fixes:
#   A) HYDRA_LOCAL_SHARDS_ONLY=0    → stream directly from HF Hub
#   B) HYDRA_BACKGROUND_PREFETCH=1  → download remaining shards in BG
#   C) HYDRA_ENTROPY_PENALTY=0.01   → break single-attractor mode collapse
#   D) HYDRA_LABEL_SMOOTHING=0.1    → soft targets discourage peaked dist
#   E) Resume from weights_only_clean.pt (inherit prior training)
#
# Output goes to run_champion_v2.log in repo root.
# Exit status: 1 when a precondition fails, otherwise train.py's exit status.

set -u
REPO=/home/mikeb/work/feather
# Everything below uses repo-relative paths (log file, .venv, train.py), so
# never continue from the wrong directory.
cd "$REPO" || { echo "ERROR: cannot cd to $REPO"; exit 1; }

# Bail if another training is running
if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {f=1} END{exit !f}'; then
  echo "ERROR: another python train.py is running"
  exit 1
fi

# Fix E depends on the weights-only checkpoint; refuse to start without it.
CKPT=/home/mikeb/.cache/autoresearch/weights_only_clean.pt
if [ ! -f "$CKPT" ]; then
  echo "ERROR: $CKPT missing."
  exit 1
fi

rm -f run_champion_v2.log
env \
  LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
  HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
  HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
  HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
  HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
  HYDRA_TIME_BUDGET=18000 \
  HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
  HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
  HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
  HYDRA_LOCAL_SHARDS_ONLY=0 HYDRA_BACKGROUND_PREFETCH=1 \
  HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
  HYDRA_ENTROPY_PENALTY=0.01 HYDRA_LABEL_SMOOTHING=0.1 \
  HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
  HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
  HYDRA_Z_LOSS_WEIGHT=0.001 \
  HYDRA_RESUME_CKPT="$CKPT" \
  ./.venv/bin/python -u train.py > run_champion_v2.log 2>&1
# Capture the status first: `echo` would otherwise reset $? and the script
# would always report success to its caller.
rc=$?
echo "exit=$rc"
exit "$rc"
overlay/scripts/train_champion_warmstart.sh ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Warm-start from the 12h champion training's latest.pt, with a TIGHTER
# total budget so the cosine LR decay actually kicks in.
#
# Problem: The plain 12h run (43200s) keeps lr near peak (1.1e-2) for the
# first ~6h, leaving the model thrashing around its local min (bpb ~1.72
# rolling avg from step 2700 onward). User correctly pointed out the
# schedule shape for a long budget wastes time in exploration.
#
# Fix: resume the already-trained weights (step ~5000, train_seconds ~5600)
# but run with HYDRA_TIME_BUDGET=20000 (5.5h total). The scheduler treats
# loaded train_seconds=5600 as "already 28% through" a 20000s budget, so
# lr decays from ~1.05e-2 now to near-zero over the next 4h — the "cooling"
# phase that produces the stable low-bpb endpoint.
#
# Total additional wall-clock: ~4h. Previous checkpoints are preserved
# (ckpt rotations keep latest.pt, latest.pt.1, etc.).
#
# Output goes to run_champion_warmstart.log in repo root.
# Exit status: 1 when a precondition fails, otherwise train.py's exit status.

set -u
REPO=/home/mikeb/work/feather
# Everything below uses repo-relative paths (log file, .venv, train.py), so
# never continue from the wrong directory.
cd "$REPO" || { echo "ERROR: cannot cd to $REPO"; exit 1; }

# Bail if another training is running
if ps -eo comm,args | awk 'NR>1 && $1 ~ /^python/ && $0 ~ /train\.py/ {f=1} END{exit !f}'; then
  echo "ERROR: another python train.py is running"
  exit 1
fi

# A warm start without the checkpoint is meaningless; fail before touching
# the previous log.
CKPT=/home/mikeb/.cache/autoresearch/latest.pt
if [ ! -f "$CKPT" ]; then
  echo "ERROR: $CKPT missing."
  exit 1
fi

rm -f run_champion_warmstart.log
env \
  LD_LIBRARY_PATH=/usr/lib/wsl/lib:/usr/local/cuda/lib64 \
  PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
  HYDRA_USE_NEMOTRON=1 HYDRA_USE_FULL_BLEND=1 \
  HYDRA_SAMPLED_SOFTMAX=1024 HYDRA_SOFTCAP_CLAMP=1 \
  HYDRA_SEQ_LEN=1024 HYDRA_HTM_SUBSAMPLE=128 HYDRA_HEADDIM=32 HYDRA_EXPAND=3 \
  HYDRA_BATCH_SIZE=8 HYDRA_D_MODEL=160 HYDRA_N_LAYER=20 HYDRA_D_STATE=64 \
  HYDRA_TIME_BUDGET=20000 \
  HYDRA_ENGRAM_N_COLUMNS=16384 HYDRA_ENGRAM_TOPK=64 \
  HYDRA_GDN_LAYERS= HYDRA_MTP_K=1 HYDRA_USE_MDLM=0 \
  HYDRA_MUON_COMPILE=0 HYDRA_MUON_NS_STEPS=3 \
  HYDRA_LOCAL_SHARDS_ONLY=1 HYDRA_BACKGROUND_PREFETCH=0 \
  HYDRA_STREAM_PREFETCH=256 HYDRA_TOKEN_PREFETCH=32 \
  HYDRA_CKPT_INTERVAL=500 HYDRA_MID_VAL_INTERVAL=0 \
  HYDRA_EVAL_BATCH=1 HYDRA_EVAL_TOKENS=8192 HYDRA_CE_CHUNK=32 \
  HYDRA_Z_LOSS_WEIGHT=0.001 \
  HYDRA_RESUME_CKPT="$CKPT" \
  ./.venv/bin/python -u train.py > run_champion_warmstart.log 2>&1
# Capture the status first: `echo` would otherwise reset $? and the script
# would always report success to its caller.
rc=$?
echo "exit=$rc"
exit "$rc"
overlay/scripts/wsl_bootstrap_tps.sh ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
set -euo pipefail

# Bootstrap a WSL CUDA Python env capable of running train.py TPS checks.
# Usage:
#   bash scripts/wsl_bootstrap_tps.sh [cuda-tag]
# Example:
#   bash scripts/wsl_bootstrap_tps.sh cu121
#
# Optional environment overrides:
#   PYTHON_BIN  interpreter used to create the venv (default: python3)
#   VENV_DIR    venv location (default: .venv-wsl)

CUDA_TAG="${1:-cu121}"
PYTHON_BIN="${PYTHON_BIN:-python3}"
VENV_DIR="${VENV_DIR:-.venv-wsl}"

# Validate the CUDA tag before any side effects, so a typo does not leave a
# half-built venv behind.
case "$CUDA_TAG" in
  cu118|cu121|cu124)
    TORCH_INDEX_URL="https://download.pytorch.org/whl/$CUDA_TAG"
    ;;
  *)
    echo "[bootstrap] error: unsupported cuda tag '$CUDA_TAG' (supported: cu118, cu121, cu124)"
    exit 1
    ;;
esac

if ! grep -qiE "microsoft|wsl" /proc/version 2>/dev/null; then
  echo "[bootstrap] warning: not running inside WSL; continuing anyway"
fi

if ! command -v nvidia-smi >/dev/null 2>&1; then
  echo "[bootstrap] error: nvidia-smi not found. Install NVIDIA driver + WSL GPU support first."
  exit 1
fi

if ! command -v "$PYTHON_BIN" >/dev/null 2>&1; then
  echo "[bootstrap] error: Python binary not found: $PYTHON_BIN"
  exit 1
fi

"$PYTHON_BIN" -m venv "$VENV_DIR"
source "$VENV_DIR/bin/activate"

python -m pip install --upgrade pip wheel setuptools

# Install torch from the CUDA-specific index first so the later extension
# builds compile against this CUDA build of torch.
python -m pip install "torch" --index-url "$TORCH_INDEX_URL"
python -m pip install -e ".[dev]"

# IMPORTANT: --no-build-isolation keeps pip from pulling torch-cpu into an
# isolated build env, which would break mamba-ssm extension builds.
python -m pip install "causal-conv1d>=1.4.0" --no-build-isolation
python -m pip install "mamba-ssm" --no-build-isolation

# Smoke test: torch must see CUDA and mamba_ssm must import.
python - <<'PY'
import torch
print(f"[bootstrap] torch={torch.__version__}")
print(f"[bootstrap] torch_cuda={torch.version.cuda}")
print(f"[bootstrap] cuda_available={torch.cuda.is_available()}")
if not torch.cuda.is_available():
    raise SystemExit("[bootstrap] error: CUDA not available to torch")
import mamba_ssm  # noqa: F401
print("[bootstrap] mamba_ssm import OK")
PY

echo "[bootstrap] done. Activate env with: source $VENV_DIR/bin/activate"
overlay/subsystems/htm.py CHANGED
@@ -29,40 +29,38 @@ copy is small compared to the SP/TM compute.
29
  from __future__ import annotations
30
 
31
  import time
32
- from concurrent.futures import ThreadPoolExecutor
 
33
 
34
  import numpy as np
35
  import torch
36
  import torch.nn as nn
37
 
38
- import htm_rust
39
-
40
- # step_many releases the GIL for the whole pass, so multiple threads can
41
- # truly run regions in parallel — wall-clock scales with B up to CPU cores.
42
- _HTM_HAS_STEP_MANY = hasattr(htm_rust.HTMRegion, "step_many")
 
 
 
 
43
  # GPU backend: built with `maturin develop --features gpu`. One CUDA region
44
  # per batch slot, persistent device state for SP synapses. Transparent
45
  # fallback to CPU when not available.
46
- _HTM_HAS_GPU = hasattr(htm_rust, "HTMRegionGpu")
47
  # Zero-copy CUDA path: consumes torch CUDA tensors directly via the
48
  # __cuda_array_interface__ protocol, skipping the sdr.cpu()/numpy round-trip
49
  # and the D2H of outputs. Huge win when the input SDR already lives on GPU
50
  # (which is the train.py hot path — retina is a device buffer).
51
- _HTM_HAS_CAI = _HTM_HAS_GPU and hasattr(htm_rust.HTMRegionGpu, "step_many_cuda")
52
  # Fused megakernel path: collapses all T timesteps + SP + TM into a single
53
  # CUDA launch per forward. Replaces global top-K with per-column threshold
54
  # inhibition (see htm_rust/docs/GPU_HTM.md §Fused Kernel).
55
  # Opt-in via env var (default on when available).
56
  import os as _os_fused
57
- _HTM_HAS_FUSED = _HTM_HAS_GPU and hasattr(htm_rust.HTMRegionGpu, "step_many_fused_cuda")
58
- _HTM_GPU_FUSED_RUNTIME = bool(
59
- _HTM_HAS_FUSED and hasattr(htm_rust, "gpu_fused_available") and htm_rust.gpu_fused_available()
60
- )
61
- _HTM_USE_FUSED = (
62
- _HTM_HAS_FUSED
63
- and _HTM_GPU_FUSED_RUNTIME
64
- and bool(int(_os_fused.environ.get("HYDRA_HTM_FUSED", "1")))
65
- )
66
 
67
 
68
  class HTMLayer(nn.Module):
@@ -87,11 +85,11 @@ class HTMLayer(nn.Module):
87
  learn: bool = True,
88
  reset_each_forward: bool = True,
89
  use_gpu: bool | None = None,
90
- ) -> None:
91
- super().__init__()
92
- self.input_bits = input_bits
93
- self.n_columns = n_columns
94
- self.cells_per_column = cells_per_column
95
  self.learn = learn
96
  self.reset_each_forward = reset_each_forward
97
  self._seed_base = seed
@@ -101,39 +99,27 @@ class HTMLayer(nn.Module):
101
  # converges since the EMA accumulates over many calls. Env:
102
  # HYDRA_HTM_LEARN_EVERY=N (default 1 = every forward, 0 = disabled).
103
  import os as _os
104
- self._learn_every = max(1, int(_os.environ.get("HYDRA_HTM_LEARN_EVERY", "1")))
105
- self._forward_counter = 0
106
- # GPU backend gate. Default: auto-detect — use GPU when the pyo3
107
- # module was built with --features gpu AND CUDA is actually usable.
108
- if use_gpu is None:
109
- use_gpu = _HTM_HAS_GPU and torch.cuda.is_available()
110
- elif use_gpu and not _HTM_HAS_GPU:
111
- raise RuntimeError(
112
- "HTMLayer(use_gpu=True) but htm_rust was not built with "
113
- "--features gpu. Re-run `maturin develop --features gpu`."
114
- )
115
- self._use_gpu = bool(use_gpu)
116
- self._gpu_fallback = _os.environ.get("HYDRA_HTM_GPU_FALLBACK", "1") == "1"
117
- cls = htm_rust.HTMRegionGpu if self._use_gpu else htm_rust.HTMRegion
118
- self._region_cls = cls
119
- try:
120
- self._regions = [
121
- cls(input_bits, n_columns, cells_per_column, seed + i)
122
- for i in range(batch_size)
123
- ]
124
- except RuntimeError as e:
125
- if not self._use_gpu or not self._gpu_fallback:
126
- raise
127
- print(
128
- f"[htm] GPU region init failed ({e}); falling back to CPU HTMRegion",
129
- flush=True,
130
- )
131
- self._use_gpu = False
132
- self._region_cls = htm_rust.HTMRegion
133
- self._regions = [
134
- self._region_cls(input_bits, n_columns, cells_per_column, seed + i)
135
- for i in range(batch_size)
136
- ]
137
  self.register_buffer("_dummy", torch.zeros(1), persistent=False)
138
  import os as _os
139
  self._htm_pool = ThreadPoolExecutor(max_workers=min(_os.cpu_count() or 4, 16))
@@ -278,12 +264,12 @@ class HTMLayer(nn.Module):
278
  # grid.y = B processes all regions concurrently — ~B× speedup.
279
  # Falls back to sequential dispatch if the batched entry isn't
280
  # available (older htm_rust wheel).
281
- if _HTM_USE_FUSED and hasattr(htm_rust, "step_batch_fused_cuda"):
282
  # Slice self._regions to match B: _ensure_regions may have
283
  # allocated more regions than the current batch size needs
284
  # (e.g. factual eval uses smaller batches than training).
285
  try:
286
- htm_rust.step_batch_fused_cuda(
287
  self._regions[:B],
288
  [sdr_u8[b].__cuda_array_interface__ for b in range(B)],
289
  [cols_out[b].__cuda_array_interface__ for b in range(B)],
 
29
  from __future__ import annotations
30
 
31
  import time
32
+ from concurrent.futures import ThreadPoolExecutor
33
+ from typing import Any
34
 
35
  import numpy as np
36
  import torch
37
  import torch.nn as nn
38
 
39
+ import htm_rust
40
+
41
+ _HTM_REGION: Any = getattr(htm_rust, "HTMRegion", None)
42
+ _HTM_REGION_GPU: Any = getattr(htm_rust, "HTMRegionGpu", None)
43
+ _HTM_STEP_BATCH_FUSED_CUDA: Any = getattr(htm_rust, "step_batch_fused_cuda", None)
44
+
45
+ # step_many releases the GIL for the whole pass, so multiple threads can
46
+ # truly run regions in parallel — wall-clock scales with B up to CPU cores.
47
+ _HTM_HAS_STEP_MANY = hasattr(_HTM_REGION, "step_many")
48
  # GPU backend: built with `maturin develop --features gpu`. One CUDA region
49
  # per batch slot, persistent device state for SP synapses. Transparent
50
  # fallback to CPU when not available.
51
+ _HTM_HAS_GPU = hasattr(htm_rust, "HTMRegionGpu")
52
  # Zero-copy CUDA path: consumes torch CUDA tensors directly via the
53
  # __cuda_array_interface__ protocol, skipping the sdr.cpu()/numpy round-trip
54
  # and the D2H of outputs. Huge win when the input SDR already lives on GPU
55
  # (which is the train.py hot path — retina is a device buffer).
56
+ _HTM_HAS_CAI = _HTM_HAS_GPU and hasattr(_HTM_REGION_GPU, "step_many_cuda")
57
  # Fused megakernel path: collapses all T timesteps + SP + TM into a single
58
  # CUDA launch per forward. Replaces global top-K with per-column threshold
59
  # inhibition (see htm_rust/docs/GPU_HTM.md §Fused Kernel).
60
  # Opt-in via env var (default on when available).
61
  import os as _os_fused
62
+ _HTM_HAS_FUSED = _HTM_HAS_GPU and hasattr(_HTM_REGION_GPU, "step_many_fused_cuda")
63
+ _HTM_USE_FUSED = _HTM_HAS_FUSED and bool(int(_os_fused.environ.get("HYDRA_HTM_FUSED", "1")))
 
 
 
 
 
 
 
64
 
65
 
66
  class HTMLayer(nn.Module):
 
85
  learn: bool = True,
86
  reset_each_forward: bool = True,
87
  use_gpu: bool | None = None,
88
+ ) -> None:
89
+ super().__init__()
90
+ self.input_bits = input_bits
91
+ self.n_columns = n_columns
92
+ self.cells_per_column = cells_per_column
93
  self.learn = learn
94
  self.reset_each_forward = reset_each_forward
95
  self._seed_base = seed
 
99
  # converges since the EMA accumulates over many calls. Env:
100
  # HYDRA_HTM_LEARN_EVERY=N (default 1 = every forward; values below 1 are clamped to 1 by the max(1, ...) below).
101
  import os as _os
102
+ self._learn_every = max(1, int(_os.environ.get("HYDRA_HTM_LEARN_EVERY", "1")))
103
+ self._forward_counter = 0
104
+ force_cpu = _os.environ.get("HYDRA_FORCE_HTM_CPU", "0") == "1"
105
+ # GPU backend gate. Default: auto-detect — use GPU when the pyo3
106
+ # module was built with --features gpu AND CUDA is actually usable.
107
+ if use_gpu is None:
108
+ use_gpu = (not force_cpu) and _HTM_HAS_GPU and torch.cuda.is_available()
109
+ elif use_gpu and not _HTM_HAS_GPU:
110
+ raise RuntimeError(
111
+ "HTMLayer(use_gpu=True) but htm_rust was not built with "
112
+ "--features gpu. Re-run `maturin develop --features gpu`."
113
+ )
114
+ elif use_gpu and force_cpu:
115
+ use_gpu = False
116
+ self._use_gpu = bool(use_gpu)
117
+ cls = _HTM_REGION_GPU if self._use_gpu else _HTM_REGION
118
+ self._region_cls = cls
119
+ self._regions = [
120
+ cls(input_bits, n_columns, cells_per_column, seed + i)
121
+ for i in range(batch_size)
122
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
123
  self.register_buffer("_dummy", torch.zeros(1), persistent=False)
124
  import os as _os
125
  self._htm_pool = ThreadPoolExecutor(max_workers=min(_os.cpu_count() or 4, 16))
 
264
  # grid.y = B processes all regions concurrently — ~B× speedup.
265
  # Falls back to sequential dispatch if the batched entry isn't
266
  # available (older htm_rust wheel).
267
+ if _HTM_USE_FUSED and _HTM_STEP_BATCH_FUSED_CUDA is not None:
268
  # Slice self._regions to match B: _ensure_regions may have
269
  # allocated more regions than the current batch size needs
270
  # (e.g. factual eval uses smaller batches than training).
271
  try:
272
+ _HTM_STEP_BATCH_FUSED_CUDA(
273
  self._regions[:B],
274
  [sdr_u8[b].__cuda_array_interface__ for b in range(B)],
275
  [cols_out[b].__cuda_array_interface__ for b in range(B)],