Jackoatmon committed on
Commit
dc229d4
·
verified ·
1 Parent(s): b60e1d9

Update benchmark runtime image

Browse files
__pycache__/entrypoint.cpython-312.pyc CHANGED
Binary files a/__pycache__/entrypoint.cpython-312.pyc and b/__pycache__/entrypoint.cpython-312.pyc differ
 
entrypoint.py CHANGED
@@ -110,7 +110,7 @@ def _start_health_server() -> HTTPServer:
110
  return server
111
 
112
 
113
- def upload_artifact(api: HfApi, path: Path, dest: str) -> None:
114
  if not path.exists():
115
  print(f'[upload] skip missing {path}', flush=True)
116
  return
@@ -120,7 +120,20 @@ def upload_artifact(api: HfApi, path: Path, dest: str) -> None:
120
  repo_id=OUTPUT_REPO,
121
  repo_type='model',
122
  )
123
- print(f'[upload] uploaded {path} -> {OUTPUT_REPO}/{dest}', flush=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
 
126
  def _wait_for_cuda_ready(timeout_s: int = 120) -> None:
@@ -158,7 +171,7 @@ def _wait_for_cuda_ready(timeout_s: int = 120) -> None:
158
  print(f'[job] CUDA still not ready after {timeout_s}s — continuing anyway (training will likely fail)', flush=True)
159
 
160
 
161
- def run_job_mode() -> int:
162
  os.chdir(REPO_ROOT)
163
  os.environ.setdefault('HYDRA_TIME_BUDGET', '43200')
164
  os.environ.setdefault('HYDRA_TARGET_SHARDS', '2048')
@@ -203,7 +216,31 @@ def run_job_mode() -> int:
203
  else:
204
  print('[upload] HF_TOKEN not set; skipping artifact upload', flush=True)
205
 
206
- return proc.returncode
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
 
209
  def run_space_mode() -> int:
@@ -217,10 +254,12 @@ def run_space_mode() -> int:
217
  server.server_close()
218
 
219
 
220
- def main() -> int:
221
- if RUNTIME_MODE == 'job':
222
- return run_job_mode()
223
- return run_space_mode()
 
 
224
 
225
 
226
  if __name__ == '__main__':
 
110
  return server
111
 
112
 
113
+ def upload_artifact(api: HfApi, path: Path, dest: str) -> None:
114
  if not path.exists():
115
  print(f'[upload] skip missing {path}', flush=True)
116
  return
 
120
  repo_id=OUTPUT_REPO,
121
  repo_type='model',
122
  )
123
+ print(f'[upload] uploaded {path} -> {OUTPUT_REPO}/{dest}', flush=True)
124
+
125
+
126
def build_benchmark_mode_command() -> list[str]:
    """Assemble the argv list used to launch ``scripts/benchmark_runner.py``.

    The benchmark name, variant and seed are read from the environment
    (``HYDRA_BENCHMARK_NAME``, ``HYDRA_BENCHMARK_VARIANT``, ``HYDRA_SEED``)
    with the defaults shown below. The result and ledger JSON files are
    written directly under ``REPO_ROOT``.
    """
    env = os.environ
    runner_script = REPO_ROOT / 'scripts' / 'benchmark_runner.py'
    result_path = REPO_ROOT / 'benchmark_result.json'
    ledger_path = REPO_ROOT / 'benchmark_ledger.json'

    command = ['python', str(runner_script)]
    command += ['--benchmark', env.get('HYDRA_BENCHMARK_NAME', 'GSM8K')]
    command += ['--generator-mode', 'hydra']
    command += ['--variant', env.get('HYDRA_BENCHMARK_VARIANT', 'hydra_full')]
    command += ['--seed', env.get('HYDRA_SEED', '42')]
    command += ['--out', str(result_path)]
    command += ['--ledger', str(ledger_path)]
    return command
137
 
138
 
139
  def _wait_for_cuda_ready(timeout_s: int = 120) -> None:
 
171
  print(f'[job] CUDA still not ready after {timeout_s}s — continuing anyway (training will likely fail)', flush=True)
172
 
173
 
174
+ def run_job_mode() -> int:
175
  os.chdir(REPO_ROOT)
176
  os.environ.setdefault('HYDRA_TIME_BUDGET', '43200')
177
  os.environ.setdefault('HYDRA_TARGET_SHARDS', '2048')
 
216
  else:
217
  print('[upload] HF_TOKEN not set; skipping artifact upload', flush=True)
218
 
219
+ return proc.returncode
220
+
221
+
222
def run_benchmark_mode() -> int:
    """Run the benchmark runner subprocess and upload its JSON outputs.

    The working directory is switched to ``REPO_ROOT`` before the command
    from ``build_benchmark_mode_command()`` is executed. When ``TOKEN`` is
    set, the result and ledger files are uploaded under ``jobs/<JOB_ID>/`` in
    ``OUTPUT_REPO``. Upload problems are logged as warnings and never change
    the return value.

    Returns:
        The exit status of the benchmark subprocess.
    """
    os.chdir(REPO_ROOT)
    cmd = build_benchmark_mode_command()
    print(f'[benchmark] command={cmd}', flush=True)
    # check=False: a failing benchmark must still reach the upload step below.
    proc = subprocess.run(cmd, check=False)

    if not TOKEN:
        print('[upload] HF_TOKEN not set; skipping benchmark artifact upload', flush=True)
        return proc.returncode

    api = HfApi(token=TOKEN)
    try:
        api.create_repo(repo_id=OUTPUT_REPO, repo_type='model', private=True, exist_ok=True)
    except Exception as e:
        # Best effort: the uploads below are still attempted.
        print(f'[upload] create_repo warning: {type(e).__name__}: {e}', flush=True)

    prefix = f'jobs/{JOB_ID}'
    # Each artifact gets its own handler so that a failure on one file does
    # not prevent the other from being uploaded.
    for filename in ('benchmark_result.json', 'benchmark_ledger.json'):
        try:
            upload_artifact(api, REPO_ROOT / filename, f'{prefix}/{filename}')
        except Exception as e:
            print(f'[upload] upload warning: {type(e).__name__}: {e}', flush=True)

    return proc.returncode
244
 
245
 
246
  def run_space_mode() -> int:
 
254
  server.server_close()
255
 
256
 
257
def main() -> int:
    """Run the handler selected by ``RUNTIME_MODE`` and return its exit code.

    ``'job'`` and ``'benchmark'`` map to their dedicated runners; any other
    value falls through to space mode.
    """
    mode_runners = {
        'job': run_job_mode,
        'benchmark': run_benchmark_mode,
    }
    runner = mode_runners.get(RUNTIME_MODE, run_space_mode)
    return runner()
263
 
264
 
265
  if __name__ == '__main__':
overlay/hydra/model.py CHANGED
@@ -32,11 +32,28 @@ from __future__ import annotations
32
 
33
  import os
34
 
35
- import torch
36
- import torch.nn as nn
37
- import torch.nn.functional as F
38
-
39
- from mamba_ssm import Mamba3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
 
42
  def _ensure_triton_cuda_backend_registered() -> None:
@@ -81,9 +98,30 @@ from hydra.hyena_block import HyenaBlock
81
  from hydra.optimizer import MuonAdamW
82
 
83
 
84
- def norm(x: torch.Tensor) -> torch.Tensor:
85
- """RMSNorm over the last dim — stateless, autocast-friendly."""
86
- return F.rms_norm(x, (x.size(-1),))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
 
89
  class PostSemClawModel(nn.Module):
@@ -103,6 +141,7 @@ class PostSemClawModel(nn.Module):
103
  _ensure_triton_cuda_backend_registered()
104
  self.config = config
105
  self._throughput_mode = os.environ.get("HYDRA_THROUGHPUT_MODE", "0") == "1"
 
106
 
107
  # Token embedding
108
  self.wte = nn.Embedding(config.vocab_size, config.d_model)
@@ -124,23 +163,31 @@ class PostSemClawModel(nn.Module):
124
  print(f"[WARN] layers in both hyena_layers and gdn_layers; using Hyena: {sorted(_both)}", flush=True)
125
  _gdn_layer_set -= _hyena_layer_set
126
 
127
- if _gdn_layer_set:
128
- from hydra.gdn_block import GDNBlock # requires `fla` package
129
-
130
- def _build_block(i: int) -> nn.Module:
131
- if i in _hyena_layer_set:
132
- return HyenaBlock(
 
 
 
 
 
 
 
133
  d_model=config.d_model,
134
  seq_len=config.sequence_len,
135
  order=int(os.environ.get("HYDRA_HYENA_ORDER", "2")),
136
  filter_order=int(os.environ.get("HYDRA_HYENA_FILTER_DIM", "64")),
137
  )
138
- if i in _gdn_layer_set:
139
- return GDNBlock(
140
- d_model=config.d_model,
141
- n_heads=config.n_heads,
142
- )
143
- return Mamba3(
 
144
  d_model=config.d_model,
145
  d_state=config.d_state,
146
  expand=config.expand,
 
32
 
33
  import os
34
 
35
+ import torch
36
+ import torch.nn as nn
37
+ import torch.nn.functional as F
38
+
39
+ try:
40
+ from mamba_ssm import Mamba3
41
+ except Exception: # pragma: no cover - depends on optional runtime install
42
+ Mamba3 = None # type: ignore[assignment]
43
+
44
+
45
def _get_mamba3_cls():
    """Return the ``Mamba3`` class, importing it on demand if necessary.

    If the module-level ``Mamba3`` global is still ``None``, the import from
    ``mamba_ssm`` is retried here and the global is updated on success.

    Raises:
        ImportError: if ``mamba_ssm`` cannot be imported; the underlying
            exception is chained as the cause.
    """
    global Mamba3
    if Mamba3 is not None:
        return Mamba3
    try:
        from mamba_ssm import Mamba3 as _loaded_cls  # type: ignore
    except Exception as exc:  # pragma: no cover - environment dependent
        raise ImportError(
            "mamba_ssm is required for Mamba-based HYDRA blocks. "
            "Install mamba-ssm or use HYDRA_BASELINE_ARCH=transformer."
        ) from exc
    Mamba3 = _loaded_cls  # type: ignore[assignment]
    return Mamba3
57
 
58
 
59
  def _ensure_triton_cuda_backend_registered() -> None:
 
98
  from hydra.optimizer import MuonAdamW
99
 
100
 
101
+ def norm(x: torch.Tensor) -> torch.Tensor:
102
+ """RMSNorm over the last dim — stateless, autocast-friendly."""
103
+ return F.rms_norm(x, (x.size(-1),))
104
+
105
+
106
class TransformerBaselineBlock(nn.Module):
    """Transformer-style delta block for matched baseline experiments.

    This block returns a transformed delta tensor rather than owning the outer
    residual connection, because ManifoldHyperConnection already handles stream
    mixing and residual injection around the block function.

    Self-attention is causally masked: position ``t`` can only attend to
    positions ``<= t``. Without the mask every position would see the whole
    sequence, including the tokens it is later asked to predict.

    Args:
        d_model: Width of the input and output features.
        n_heads: Number of attention heads.
        expand: Multiplier for the hidden width of the feed-forward layers.
        dropout: Dropout probability used inside attention and on the output.
    """

    def __init__(self, d_model: int, n_heads: int, expand: int, dropout: float) -> None:
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, batch_first=True)
        self.ff_in = nn.Linear(d_model, expand * d_model, bias=False)
        self.ff_out = nn.Linear(expand * d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``dropout(attn(x) + ff(attn(x)))`` with causal attention.

        ``x`` is batch-first (``batch_first=True`` above); the sequence axis
        is therefore the second-to-last dimension.
        """
        seq_len = x.size(-2)
        # Boolean mask: True marks (query, key) pairs that must NOT attend,
        # i.e. every key strictly to the right of the query position.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1,
        )
        attn_out, _ = self.self_attn(x, x, x, attn_mask=causal_mask, need_weights=False)
        ff = self.ff_out(F.gelu(self.ff_in(attn_out)))
        return self.dropout(attn_out + ff)
125
 
126
 
127
  class PostSemClawModel(nn.Module):
 
141
  _ensure_triton_cuda_backend_registered()
142
  self.config = config
143
  self._throughput_mode = os.environ.get("HYDRA_THROUGHPUT_MODE", "0") == "1"
144
+ self._baseline_arch = os.environ.get("HYDRA_BASELINE_ARCH", "mamba3").strip().lower()
145
 
146
  # Token embedding
147
  self.wte = nn.Embedding(config.vocab_size, config.d_model)
 
163
  print(f"[WARN] layers in both hyena_layers and gdn_layers; using Hyena: {sorted(_both)}", flush=True)
164
  _gdn_layer_set -= _hyena_layer_set
165
 
166
+ if _gdn_layer_set:
167
+ from hydra.gdn_block import GDNBlock # requires `fla` package
168
+
169
+ def _build_block(i: int) -> nn.Module:
170
+ if self._baseline_arch == "transformer":
171
+ return TransformerBaselineBlock(
172
+ d_model=config.d_model,
173
+ n_heads=config.n_heads,
174
+ expand=config.expand,
175
+ dropout=float(os.environ.get("HYDRA_DROPOUT", "0.2")),
176
+ )
177
+ if i in _hyena_layer_set:
178
+ return HyenaBlock(
179
  d_model=config.d_model,
180
  seq_len=config.sequence_len,
181
  order=int(os.environ.get("HYDRA_HYENA_ORDER", "2")),
182
  filter_order=int(os.environ.get("HYDRA_HYENA_FILTER_DIM", "64")),
183
  )
184
+ if i in _gdn_layer_set:
185
+ return GDNBlock(
186
+ d_model=config.d_model,
187
+ n_heads=config.n_heads,
188
+ )
189
+ mamba3_cls = _get_mamba3_cls()
190
+ return mamba3_cls(
191
  d_model=config.d_model,
192
  d_state=config.d_state,
193
  expand=config.expand,
overlay/hydra/training.py CHANGED
@@ -4,17 +4,20 @@ Extracted from the monolithic train.py (W1 modularization). Semantics
4
  preserved. Public entrypoint: `main()`.
5
  """
6
 
7
- from __future__ import annotations
8
-
9
- import gc
10
- import json
11
- import math
12
- import os
13
- import sys
14
- import threading
15
- import time
16
- from dataclasses import asdict
17
- from pathlib import Path
 
 
 
18
 
19
  import torch
20
 
@@ -130,7 +133,7 @@ def _ckpt_snapshot_state_dicts(
130
  return msd, osd
131
 
132
 
133
- def save_ckpt(
134
  model: PostSemClawModel,
135
  optimizer: torch.optim.Optimizer,
136
  config: PostSemClawConfig,
@@ -211,11 +214,233 @@ def save_ckpt(
211
  target=_write, daemon=True, name=f"ckpt-save-{step}"
212
  )
213
  _CKPT_WORKER_THREAD.start()
214
- except Exception as e:
215
- print(f"[ckpt] SNAPSHOT FAILED {path}: {type(e).__name__}: {e}", flush=True)
216
-
217
-
218
- def config_from_dict(cfg_dict: dict) -> PostSemClawConfig:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  """Reconstruct a PostSemClawConfig from a checkpoint's asdict() payload.
220
 
221
  Newly-added fields (e.g. `hyena_layers`) are defaulted when absent in
@@ -275,14 +500,14 @@ def _try_load_ckpt(path: Path, model, optimizer, device):
275
  return step, total_training_time, smooth_train_loss, bpt_ema, epoch
276
 
277
 
278
- def maybe_resume_ckpt(
279
- model: PostSemClawModel,
280
- optimizer: torch.optim.Optimizer,
281
- device: torch.device,
282
- ) -> tuple[int, float, float, float, int]:
283
- if not RESUME_CKPT or RESUME_CKPT.lower() == "none":
284
- print("[ckpt] resume disabled; starting fresh", flush=True)
285
- return 0, 0.0, 0.0, 0.0, 0
286
 
287
  resume_path = Path(os.path.expanduser(RESUME_CKPT))
288
  # Try the primary path, then rotated backups. This is crucial because a
@@ -296,17 +521,18 @@ def maybe_resume_ckpt(
296
  if not cand.exists():
297
  continue
298
  try:
299
- result = _try_load_ckpt(cand, model, optimizer, device)
300
- if result is not None:
301
- if cand != resume_path:
302
- print(f"[ckpt] fell back to rotation {cand.name}", flush=True)
303
- return result
 
304
  except Exception as e:
305
  print(f"[ckpt] {cand.name} load failed: {type(e).__name__}: {e}", flush=True)
306
  continue
307
 
308
- print(f"[ckpt] no usable checkpoint in {resume_path} + rotations; starting fresh", flush=True)
309
- return 0, 0.0, 0.0, 0.0, 0
310
 
311
 
312
  # ---------------------------------------------------------------------------
@@ -388,9 +614,18 @@ def main() -> None:
388
  weight_decay=WEIGHT_DECAY,
389
  )
390
 
391
- step, total_training_time, smooth_train_loss, bpt_ema, resume_epoch = maybe_resume_ckpt(
392
- model, optimizer, device,
393
- )
 
 
 
 
 
 
 
 
 
394
 
395
  # Learnability #4: inform the model of the BOS token id so it can mask
396
  # doc-separator positions in packed sequences. Always set (the mask only
@@ -785,10 +1020,22 @@ def main() -> None:
785
  # does not benefit from overlap with backward). HYDRA_EVAL_TOKENS controls
786
  # how many val tokens to sweep (default 2 M, short enough for autoresearch
787
  # 5-min budgets).
788
- val_bpb: float | None = None
789
- _eval_B = int(os.environ.get("HYDRA_EVAL_BATCH", str(max(1, DEVICE_BATCH_SIZE // 2))))
790
- _eval_tokens = int(os.environ.get("HYDRA_EVAL_TOKENS", str(2 * 524288)))
791
- try:
 
 
 
 
 
 
 
 
 
 
 
 
792
  # Aggressive VRAM reclaim for 6GB cards. Peak training VRAM = 5.1GB
793
  # which leaves < 1GB for the eval forward — the driver can't satisfy
794
  # the allocation. Free EVERY tensor we don't strictly need:
@@ -810,34 +1057,70 @@ def main() -> None:
810
  model._last_sdr = None
811
  import gc as _gc
812
  _gc.collect()
813
- torch.cuda.empty_cache()
814
- torch.cuda.synchronize()
815
- try:
816
- _free_mb = torch.cuda.mem_get_info()[0] / 1024 / 1024
817
- print(f"[VAL] free_vram_mb={_free_mb:.0f} (cleared optimizer state)", flush=True)
818
- except Exception:
819
- pass
820
- print(f"[VAL] running eval on {_eval_tokens} tokens at B={_eval_B}...", flush=True)
821
- model.eval()
822
- _orig = _prepare_mod.EVAL_TOKENS
823
- _prepare_mod.EVAL_TOKENS = _eval_tokens
824
- with autocast_ctx:
825
- val_bpb = evaluate_bpb(model, tokenizer, _eval_B)
826
- _prepare_mod.EVAL_TOKENS = _orig
827
- val_ppl = 2 ** val_bpb
828
- print(f"[VAL] step={step} val_bpb={val_bpb:.4f} val_ppl={val_ppl:.3f}", flush=True)
829
- except torch.cuda.OutOfMemoryError as e:
830
- print(f"[VAL] SKIPPED (OOM): {e}", flush=True)
831
- torch.cuda.empty_cache()
832
- except Exception as e:
833
- import traceback as _tb
834
- print(f"[VAL] SKIPPED ({type(e).__name__}): {e}", flush=True)
835
- _tb.print_exc()
836
- try:
837
- _free = torch.cuda.mem_get_info()[0] / 1024 / 1024
838
- print(f"[VAL] post-crash free_vram_mb={_free:.0f}", flush=True)
839
- except Exception:
840
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
841
 
842
  # Final ckpts with val_bpb filled in (if eval succeeded).
843
  save_ckpt(
@@ -881,8 +1164,13 @@ def main() -> None:
881
  / total_training_time / GPU_BF16_PEAK_FLOPS
882
  if total_training_time > 0 else 0
883
  )
884
- peak_vram_mb = torch.cuda.max_memory_allocated() / 1024 / 1024
885
- metrics = model.get_secondary_metrics()
 
 
 
 
 
886
 
887
  print("---")
888
  print(f"val_bpb: {val_bpb:.6f}" if val_bpb is not None else "val_bpb: SKIPPED")
@@ -918,22 +1206,28 @@ def main() -> None:
918
  # Emit full metrics dictionary as JSON for sweep aggregation. Path from
919
  # HYDRA_METRICS_OUT env var; default=/tmp/hydra_run_metrics.json. Always
920
  # written (even without diagnostics) so the aggregator can compare runs.
921
- _metrics_out = os.environ.get("HYDRA_METRICS_OUT", "/tmp/hydra_run_metrics.json")
922
- try:
923
- _dump = dict(metrics)
924
- _dump.update({
925
- 'val_bpb': float(val_bpb),
926
- 'val_ppl': float(val_ppl),
927
- 'n_layer': int(N_LAYER),
928
- 'd_model': int(D_MODEL),
929
- 'num_params_M': float(num_params / 1e6),
930
- 'num_steps': int(step),
931
- 'total_tokens_M': float(total_tokens / 1e6),
932
- 'peak_vram_mb': float(peak_vram_mb),
933
- 'training_seconds': float(total_training_time),
934
- 'sdr_target_active': int(os.environ.get("HYDRA_SDR_TARGET_ACTIVE", "327")),
935
- })
936
- Path(_metrics_out).parent.mkdir(parents=True, exist_ok=True)
 
 
 
 
 
 
937
  with open(_metrics_out, 'w') as _f:
938
  json.dump(_dump, _f, indent=2, sort_keys=True)
939
  print(f"[METRICS] wrote {_metrics_out}", flush=True)
 
4
  preserved. Public entrypoint: `main()`.
5
  """
6
 
7
+ from __future__ import annotations
8
+
9
+ import gc
10
+ import hashlib
11
+ import json
12
+ import math
13
+ import os
14
+ import sys
15
+ import threading
16
+ import time
17
+ from collections.abc import Mapping
18
+ from dataclasses import asdict
19
+ from pathlib import Path
20
+ from typing import Any
21
 
22
  import torch
23
 
 
133
  return msd, osd
134
 
135
 
136
+ def save_ckpt(
137
  model: PostSemClawModel,
138
  optimizer: torch.optim.Optimizer,
139
  config: PostSemClawConfig,
 
214
  target=_write, daemon=True, name=f"ckpt-save-{step}"
215
  )
216
  _CKPT_WORKER_THREAD.start()
217
+ except Exception as e:
218
+ print(f"[ckpt] SNAPSHOT FAILED {path}: {type(e).__name__}: {e}", flush=True)
219
+
220
+
221
def _env_flag_enabled(env: Mapping[str, str], key: str) -> bool:
    """Interpret ``env[key]`` as a boolean switch.

    A missing or falsy value counts as ``"0"``. After trimming and
    lower-casing, the spellings ``""``, ``"0"``, ``"false"``, ``"no"`` and
    ``"off"`` are disabled; anything else is enabled.
    """
    raw = env.get(key, "0") or "0"
    normalized = str(raw).strip().lower()
    disabled_spellings = ("", "0", "false", "no", "off")
    return not (normalized in disabled_spellings)
224
+
225
+
226
def _env_int(env: Mapping[str, str], key: str, default: int) -> int:
    """Read ``env[key]`` as an integer, falling back to ``default``.

    The fallback is used when the key is absent, when its value is falsy
    (for example an empty string), or when the text does not parse as an int.
    """
    fallback_text = str(default)
    raw = env.get(key, fallback_text) or fallback_text
    try:
        parsed = int(str(raw))
    except ValueError:
        return default
    return parsed
231
+
232
+
233
def architecture_compliance_payload(env: Mapping[str, str]) -> dict[str, bool | int | str]:
    """Summarize which architecture-altering switches are set in ``env``.

    ``full_arch_compliant`` is True only when none of the five boolean
    switches is enabled and ``HYDRA_HYENA_LAYERS`` is blank. The raw switch
    values, the hyena layer string and the two subsample integers are
    reported alongside it.

    NOTE(review): the subsample values and ``HYDRA_BASELINE_ARCH`` do not
    take part in the compliance verdict — confirm that is intended.
    """
    flag_env_keys = {
        'throughput_mode': "HYDRA_THROUGHPUT_MODE",
        'fastpath': "HYDRA_FASTPATH",
        'force_htm_cpu': "HYDRA_FORCE_HTM_CPU",
        'inert_mamba': "HYDRA_INERT_MAMBA",
        'synthetic_retina': "HYDRA_ALLOW_SYNTHETIC_RETINA",
    }
    flags = {
        name: _env_flag_enabled(env, env_key)
        for name, env_key in flag_env_keys.items()
    }
    hyena_layers = str(env.get("HYDRA_HYENA_LAYERS", "") or "")
    deviates = any(flags.values()) or bool(hyena_layers.strip())

    payload: dict[str, bool | int | str] = {'full_arch_compliant': not deviates}
    payload.update(flags)
    payload['hyena_layers'] = hyena_layers
    payload['engram_subsample'] = _env_int(env, "HYDRA_ENGRAM_SUBSAMPLE", 1)
    payload['htm_subsample'] = _env_int(env, "HYDRA_HTM_SUBSAMPLE", 1)
    return payload
261
+
262
+
263
def eval_attempt_batches(*, requested_batch: int, min_batch: int) -> list[int]:
    """Return the descending ladder of batch sizes to try for evaluation.

    The ladder starts at ``requested_batch`` and halves (integer division)
    until it reaches ``min_batch``, which is always the last entry. Both
    inputs are clamped to at least 1.

    The floor is additionally clamped to the requested batch, so the ladder
    never contains a batch larger than the one the caller asked for. A floor
    above the requested size would otherwise produce a single attempt at that
    larger size.

    Args:
        requested_batch: Preferred (largest) batch size.
        min_batch: Smallest batch size worth attempting.

    Returns:
        Strictly decreasing batch sizes, e.g. ``[8, 4, 2, 1]``.
    """
    requested = max(1, int(requested_batch))
    minimum = min(requested, max(1, int(min_batch)))

    batches: list[int] = []
    current = requested
    # current >= 2 inside the loop, so halving always makes progress.
    while current > minimum:
        batches.append(current)
        current = max(minimum, current // 2)
    batches.append(minimum)
    return batches
280
+
281
+
282
def build_eval_plan(*, eval_tokens: int, requested_batch: int, max_seq_len: int, chunk_tokens: int, min_batch: int) -> dict[str, Any]:
    """Describe how the final evaluation will be chunked and retried.

    The effective chunk size is the larger of ``chunk_tokens`` and one full
    batch (``requested_batch * max_seq_len``). It is clamped to at least 1 so
    that zero or negative settings cannot raise ``ZeroDivisionError`` while
    computing the chunk count.

    Returns:
        A dict with the keys ``eval_tokens``, ``eval_requested_batch``,
        ``eval_chunk_tokens``, ``eval_chunk_count``, ``eval_attempt_batches``
        and ``eval_min_batch``.
    """
    tokens_per_batch = int(requested_batch) * int(max_seq_len)
    effective_chunk_tokens = max(1, int(chunk_tokens), tokens_per_batch)
    chunk_count = max(1, math.ceil(int(eval_tokens) / effective_chunk_tokens))
    return {
        'eval_tokens': int(eval_tokens),
        'eval_requested_batch': int(requested_batch),
        'eval_chunk_tokens': int(effective_chunk_tokens),
        'eval_chunk_count': int(chunk_count),
        'eval_attempt_batches': eval_attempt_batches(requested_batch=requested_batch, min_batch=min_batch),
        'eval_min_batch': int(max(1, min_batch)),
    }
293
+
294
+
295
def _fingerprint_descriptor(descriptor: Mapping[str, Any]) -> str:
    """Return a 12-hex-character SHA-1 fingerprint of ``descriptor``.

    The mapping is serialized as compact JSON with sorted keys, so the result
    does not depend on key insertion order.
    """
    canonical = json.dumps(dict(descriptor), separators=(",", ":"), sort_keys=True)
    digest = hashlib.sha1(canonical.encode("utf-8"))
    return digest.hexdigest()[:12]
298
+
299
+
300
def dataset_domain_payload(*, env: Mapping[str, str], prepare_module: Any, nemotron_module: Any | None) -> dict[str, Any]:
    """Describe and fingerprint the training and validation data domains.

    Two backends are recognised:

    * ``nemotron_stream`` — chosen when ``HYDRA_USE_NEMOTRON`` is enabled and
      ``nemotron_module`` is not None. Blend weights are read from that
      module's ``FULL_BLEND_WEIGHTS`` / ``PHASE1_WEIGHTS`` / ``PHASE2_WEIGHTS``
      attributes.
    * ``climbmix_parquet`` — otherwise. Shards come from
      ``prepare_module.list_parquet_files()`` and are split into train and
      validation by the ``VAL_FILENAME`` suffix.

    ``train_val_domain_match`` compares only the fields both descriptors
    share (backend, phase, weights, vocab size). The full descriptors always
    differ in their key sets, so comparing their fingerprints can never report
    a match.

    Returns:
        A dict holding the backend name, both descriptors, their 12-character
        fingerprints and the ``train_val_domain_match`` flag.
    """
    use_nemotron = _env_flag_enabled(env, "HYDRA_USE_NEMOTRON")
    vocab_size = int(getattr(prepare_module, "VOCAB_SIZE", 0))

    if use_nemotron and nemotron_module is not None:
        use_full_blend = _env_flag_enabled(env, "HYDRA_USE_FULL_BLEND")
        phase = str(env.get("HYDRA_NEMOTRON_PHASE", "phase1") or "phase1").strip().lower()
        if use_full_blend:
            train_weights = dict(getattr(nemotron_module, "FULL_BLEND_WEIGHTS", {}))
            val_weights = dict(train_weights)
        else:
            weights_attr = "PHASE2_WEIGHTS" if phase == "phase2" else "PHASE1_WEIGHTS"
            train_weights = dict(getattr(nemotron_module, weights_attr, {}))
            val_weights = {"Nemotron-Pretraining-Multiple-Choice": 1.0}
        train_descriptor = {
            "backend": "nemotron_stream",
            "phase": "full_blend" if use_full_blend else phase,
            "weights": train_weights,
            "factual_inject_rate": _env_int(env, "HYDRA_FACTUAL_INJECT_RATE", 50),
            "vocab_size": vocab_size,
        }
        val_descriptor = {
            "backend": "nemotron_stream",
            "phase": "full_blend" if use_full_blend else "val_multiple_choice",
            "weights": val_weights,
            "vocab_size": vocab_size,
        }
        data_backend = "nemotron_stream"
    else:
        all_files = [str(path) for path in getattr(prepare_module, "list_parquet_files", lambda: [])()]
        val_filename = str(getattr(prepare_module, "VAL_FILENAME", ""))
        # Every string ends with "", so an empty VAL_FILENAME must not be
        # allowed to classify all shards as validation data.
        val_files = [path for path in all_files if val_filename and path.endswith(val_filename)]
        train_files = [path for path in all_files if not (val_filename and path.endswith(val_filename))]
        train_descriptor = {
            "backend": "climbmix_parquet",
            "train_shard_count": len(train_files),
            "train_shard_examples": sorted(Path(path).name for path in train_files[:3]),
            "vocab_size": vocab_size,
        }
        val_descriptor = {
            "backend": "climbmix_parquet",
            "val_filename": val_filename,
            "val_shard_count": len(val_files),
            "vocab_size": vocab_size,
        }
        data_backend = "climbmix_parquet"

    train_fingerprint = _fingerprint_descriptor(train_descriptor)
    val_fingerprint = _fingerprint_descriptor(val_descriptor)

    # Compare only the fields that identify the data domain; split-specific
    # bookkeeping (shard counts, inject rate, file names) is ignored.
    domain_keys = ("backend", "phase", "weights", "vocab_size")
    train_identity = {key: train_descriptor.get(key) for key in domain_keys}
    val_identity = {key: val_descriptor.get(key) for key in domain_keys}

    return {
        "data_backend": data_backend,
        "train_domain_descriptor": train_descriptor,
        "val_domain_descriptor": val_descriptor,
        "train_domain_fingerprint": train_fingerprint,
        "val_domain_fingerprint": val_fingerprint,
        "train_val_domain_match": train_identity == val_identity,
    }
358
+
359
+
360
def build_lineage_payload(
    *,
    env: Mapping[str, str],
    seed: int,
    resume_requested: bool,
    resume_requested_path: str | None,
    resume_loaded_path: str | None,
    resume_step: int,
    resume_epoch: int,
) -> dict[str, Any]:
    """Record how this run started: fresh, resumed, or warm-started.

    A resume counts as applied only when a checkpoint path was actually
    loaded and the restored step is positive. ``lineage_mode`` is then
    ``"warmstart_resume"`` if ``HYDRA_WARMSTART`` is enabled and ``"resume"``
    otherwise; without an applied resume it is ``"fresh"``.
    """
    warmstart = _env_flag_enabled(env, "HYDRA_WARMSTART")
    restored_step = int(resume_step)
    resume_applied = resume_loaded_path is not None and restored_step > 0

    if not resume_applied:
        lineage_mode = "fresh"
    elif warmstart:
        lineage_mode = "warmstart_resume"
    else:
        lineage_mode = "resume"

    payload: dict[str, Any] = {}
    payload["seed"] = int(seed)
    payload["warmstart"] = warmstart
    payload["resume_requested"] = bool(resume_requested)
    payload["resume_applied"] = resume_applied
    payload["resume_requested_path"] = resume_requested_path
    payload["resume_loaded_path"] = resume_loaded_path
    payload["resume_step"] = restored_step
    payload["resume_epoch"] = int(resume_epoch)
    payload["lineage_mode"] = lineage_mode
    return payload
389
+
390
+
391
def build_final_metrics_payload(
    *,
    secondary_metrics: dict[str, Any],
    val_bpb: float | None,
    val_ppl: float | None,
    eval_status: str,
    eval_error: str | None,
    n_layer: int,
    d_model: int,
    num_params: int,
    step: int,
    total_tokens: int,
    peak_vram_mb: float,
    total_training_time: float,
    sdr_target_active: int,
    architecture_env: Mapping[str, str] | None = None,
    eval_diagnostics: Mapping[str, Any] | None = None,
    domain_fingerprints: Mapping[str, Any] | None = None,
    lineage_payload: Mapping[str, Any] | None = None,
) -> dict[str, Any]:
    """Build final run metrics without conflating skipped eval and validation.

    This helper deliberately preserves ``val_bpb=None`` when final eval did not
    complete. HPO can then prune or explicitly label a fallback instead of
    accidentally treating live training BPB as validation BPB.

    The payload starts as a copy of ``secondary_metrics`` and is then updated
    in order with the core run fields, the architecture-compliance fields,
    and the optional ``eval_diagnostics``, ``domain_fingerprints`` and
    ``lineage_payload`` mappings. Later updates overwrite earlier keys of the
    same name.

    ``architecture_env`` falls back to ``os.environ`` only when it is None.
    An explicitly empty mapping is honoured as "no switches set" and does not
    pull in the real process environment.
    """
    payload = dict(secondary_metrics)
    payload.update({
        'eval_status': eval_status,
        'eval_error': eval_error,
        'objective_source': 'final_val' if val_bpb is not None else 'missing_final_val',
        'val_bpb': float(val_bpb) if val_bpb is not None else None,
        'val_ppl': float(val_ppl) if val_ppl is not None else None,
        'n_layer': int(n_layer),
        'd_model': int(d_model),
        'num_params_M': float(num_params / 1e6),
        'num_steps': int(step),
        'total_tokens_M': float(total_tokens / 1e6),
        'peak_vram_mb': float(peak_vram_mb),
        'training_seconds': float(total_training_time),
        'sdr_target_active': int(sdr_target_active),
    })
    # `is None` rather than truthiness: {} is a valid, deliberate argument.
    env_for_compliance = dict(os.environ) if architecture_env is None else architecture_env
    payload.update(architecture_compliance_payload(env_for_compliance))
    for extra in (eval_diagnostics, domain_fingerprints, lineage_payload):
        if extra:
            payload.update(dict(extra))
    return payload
441
+
442
+
443
+ def config_from_dict(cfg_dict: dict) -> PostSemClawConfig:
444
  """Reconstruct a PostSemClawConfig from a checkpoint's asdict() payload.
445
 
446
  Newly-added fields (e.g. `hyena_layers`) are defaulted when absent in
 
500
  return step, total_training_time, smooth_train_loss, bpt_ema, epoch
501
 
502
 
503
+ def maybe_resume_ckpt(
504
+ model: PostSemClawModel,
505
+ optimizer: torch.optim.Optimizer,
506
+ device: torch.device,
507
+ ) -> tuple[int, float, float, float, int, str | None]:
508
+ if not RESUME_CKPT or RESUME_CKPT.lower() == "none":
509
+ print("[ckpt] resume disabled; starting fresh", flush=True)
510
+ return 0, 0.0, 0.0, 0.0, 0, None
511
 
512
  resume_path = Path(os.path.expanduser(RESUME_CKPT))
513
  # Try the primary path, then rotated backups. This is crucial because a
 
521
  if not cand.exists():
522
  continue
523
  try:
524
+ result = _try_load_ckpt(cand, model, optimizer, device)
525
+ if result is not None:
526
+ if cand != resume_path:
527
+ print(f"[ckpt] fell back to rotation {cand.name}", flush=True)
528
+ step, total_training_time, smooth_train_loss, bpt_ema, epoch = result
529
+ return step, total_training_time, smooth_train_loss, bpt_ema, epoch, str(cand)
530
  except Exception as e:
531
  print(f"[ckpt] {cand.name} load failed: {type(e).__name__}: {e}", flush=True)
532
  continue
533
 
534
+ print(f"[ckpt] no usable checkpoint in {resume_path} + rotations; starting fresh", flush=True)
535
+ return 0, 0.0, 0.0, 0.0, 0, None
536
 
537
 
538
  # ---------------------------------------------------------------------------
 
614
  weight_decay=WEIGHT_DECAY,
615
  )
616
 
617
+ step, total_training_time, smooth_train_loss, bpt_ema, resume_epoch, resume_loaded_path = maybe_resume_ckpt(
618
+ model, optimizer, device,
619
+ )
620
+ lineage_payload = build_lineage_payload(
621
+ env=dict(os.environ),
622
+ seed=SEED,
623
+ resume_requested=bool(RESUME_CKPT and RESUME_CKPT.lower() != "none"),
624
+ resume_requested_path=RESUME_CKPT if RESUME_CKPT and RESUME_CKPT.lower() != "none" else None,
625
+ resume_loaded_path=resume_loaded_path,
626
+ resume_step=step,
627
+ resume_epoch=resume_epoch,
628
+ )
629
 
630
  # Learnability #4: inform the model of the BOS token id so it can mask
631
  # doc-separator positions in packed sequences. Always set (the mask only
 
1020
  # does not benefit from overlap with backward). HYDRA_EVAL_TOKENS controls
1021
  # how many val tokens to sweep (default 2 M, short enough for autoresearch
1022
  # 5-min budgets).
1023
+ val_bpb: float | None = None
1024
+ val_ppl: float | None = None
1025
+ eval_status = "not_started"
1026
+ eval_error: str | None = None
1027
+ _eval_B = int(os.environ.get("HYDRA_EVAL_BATCH", str(max(1, DEVICE_BATCH_SIZE // 2))))
1028
+ _eval_tokens = int(os.environ.get("HYDRA_EVAL_TOKENS", str(2 * 524288)))
1029
+ _eval_chunk_tokens = int(os.environ.get("HYDRA_EVAL_CHUNK_TOKENS", str(_eval_tokens)))
1030
+ _eval_min_batch = int(os.environ.get("HYDRA_EVAL_MIN_BATCH", "1"))
1031
+ eval_diagnostics = build_eval_plan(
1032
+ eval_tokens=_eval_tokens,
1033
+ requested_batch=_eval_B,
1034
+ max_seq_len=MAX_SEQ_LEN,
1035
+ chunk_tokens=_eval_chunk_tokens,
1036
+ min_batch=_eval_min_batch,
1037
+ )
1038
+ try:
1039
  # Aggressive VRAM reclaim for 6GB cards. Peak training VRAM = 5.1GB
1040
  # which leaves < 1GB for the eval forward — the driver can't satisfy
1041
  # the allocation. Free EVERY tensor we don't strictly need:
 
1057
  model._last_sdr = None
1058
  import gc as _gc
1059
  _gc.collect()
1060
+ torch.cuda.empty_cache()
1061
+ torch.cuda.synchronize()
1062
+ try:
1063
+ _free_mb = torch.cuda.mem_get_info()[0] / 1024 / 1024
1064
+ eval_diagnostics["eval_free_vram_before_mb"] = float(_free_mb)
1065
+ print(f"[VAL] free_vram_mb={_free_mb:.0f} (cleared optimizer state)", flush=True)
1066
+ except Exception:
1067
+ pass
1068
+ print(
1069
+ f"[VAL] running eval on {_eval_tokens} tokens at B={_eval_B} "
1070
+ f"chunk_tokens={eval_diagnostics['eval_chunk_tokens']} attempts={eval_diagnostics['eval_attempt_batches']}...",
1071
+ flush=True,
1072
+ )
1073
+ model.eval()
1074
+ _orig = _prepare_mod.EVAL_TOKENS
1075
+ _orig_chunk = getattr(_prepare_mod, "EVAL_CHUNK_TOKENS", _eval_tokens)
1076
+ _prepare_mod.EVAL_TOKENS = _eval_tokens
1077
+ _prepare_mod.EVAL_CHUNK_TOKENS = int(eval_diagnostics["eval_chunk_tokens"])
1078
+ _successful_batch: int | None = None
1079
+ _attempts: list[int] = []
1080
+ try:
1081
+ for _attempt_batch in eval_diagnostics["eval_attempt_batches"]:
1082
+ _attempts.append(int(_attempt_batch))
1083
+ eval_diagnostics["eval_attempted_batch"] = int(_attempt_batch)
1084
+ try:
1085
+ with autocast_ctx:
1086
+ val_bpb = evaluate_bpb(model, tokenizer, int(_attempt_batch))
1087
+ _successful_batch = int(_attempt_batch)
1088
+ break
1089
+ except torch.cuda.OutOfMemoryError as _attempt_oom:
1090
+ eval_error = str(_attempt_oom)
1091
+ eval_status = "oom"
1092
+ torch.cuda.empty_cache()
1093
+ if int(_attempt_batch) == eval_diagnostics["eval_attempt_batches"][-1]:
1094
+ raise
1095
+ finally:
1096
+ _prepare_mod.EVAL_TOKENS = _orig
1097
+ _prepare_mod.EVAL_CHUNK_TOKENS = _orig_chunk
1098
+ eval_diagnostics["eval_attempt_batches"] = _attempts
1099
+ eval_diagnostics["eval_effective_batch"] = _successful_batch
1100
+ val_ppl = 2 ** val_bpb
1101
+ eval_status = "completed"
1102
+ print(f"[VAL] step={step} val_bpb={val_bpb:.4f} val_ppl={val_ppl:.3f}", flush=True)
1103
+ except torch.cuda.OutOfMemoryError as e:
1104
+ eval_status = "oom"
1105
+ eval_error = str(e)
1106
+ print(f"[VAL] SKIPPED (OOM): {e}", flush=True)
1107
+ torch.cuda.empty_cache()
1108
+ try:
1109
+ eval_diagnostics["eval_free_vram_after_mb"] = float(torch.cuda.mem_get_info()[0] / 1024 / 1024)
1110
+ except Exception:
1111
+ pass
1112
+ except Exception as e:
1113
+ import traceback as _tb
1114
+ eval_status = type(e).__name__
1115
+ eval_error = str(e)
1116
+ print(f"[VAL] SKIPPED ({type(e).__name__}): {e}", flush=True)
1117
+ _tb.print_exc()
1118
+ try:
1119
+ _free = torch.cuda.mem_get_info()[0] / 1024 / 1024
1120
+ eval_diagnostics["eval_free_vram_after_mb"] = float(_free)
1121
+ print(f"[VAL] post-crash free_vram_mb={_free:.0f}", flush=True)
1122
+ except Exception:
1123
+ pass
1124
 
1125
  # Final ckpts with val_bpb filled in (if eval succeeded).
1126
  save_ckpt(
 
1164
  / total_training_time / GPU_BF16_PEAK_FLOPS
1165
  if total_training_time > 0 else 0
1166
  )
1167
+ peak_vram_mb = torch.cuda.max_memory_allocated() / 1024 / 1024
1168
+ metrics = model.get_secondary_metrics()
1169
+ domain_fingerprints = dataset_domain_payload(
1170
+ env=dict(os.environ),
1171
+ prepare_module=_prepare_mod,
1172
+ nemotron_module=globals().get("_p_nemo"),
1173
+ )
1174
 
1175
  print("---")
1176
  print(f"val_bpb: {val_bpb:.6f}" if val_bpb is not None else "val_bpb: SKIPPED")
 
1206
  # Emit full metrics dictionary as JSON for sweep aggregation. Path from
1207
  # HYDRA_METRICS_OUT env var; default=/tmp/hydra_run_metrics.json. Always
1208
  # written (even without diagnostics) so the aggregator can compare runs.
1209
+ _metrics_out = os.environ.get("HYDRA_METRICS_OUT", "/tmp/hydra_run_metrics.json")
1210
+ try:
1211
+ _dump = build_final_metrics_payload(
1212
+ secondary_metrics=metrics,
1213
+ val_bpb=val_bpb,
1214
+ val_ppl=val_ppl,
1215
+ eval_status=eval_status,
1216
+ eval_error=eval_error,
1217
+ n_layer=N_LAYER,
1218
+ d_model=D_MODEL,
1219
+ num_params=num_params,
1220
+ step=step,
1221
+ total_tokens=total_tokens,
1222
+ peak_vram_mb=peak_vram_mb,
1223
+ total_training_time=total_training_time,
1224
+ sdr_target_active=int(os.environ.get("HYDRA_SDR_TARGET_ACTIVE", "327")),
1225
+ architecture_env=dict(os.environ),
1226
+ eval_diagnostics=eval_diagnostics,
1227
+ domain_fingerprints=domain_fingerprints,
1228
+ lineage_payload=lineage_payload,
1229
+ )
1230
+ Path(_metrics_out).parent.mkdir(parents=True, exist_ok=True)
1231
  with open(_metrics_out, 'w') as _f:
1232
  json.dump(_dump, _f, indent=2, sort_keys=True)
1233
  print(f"[METRICS] wrote {_metrics_out}", flush=True)
overlay/prepare.py CHANGED
@@ -13,9 +13,10 @@ import os
13
  import sys
14
  import time
15
  import math
16
- import argparse
17
- import pickle
18
- from multiprocessing import Pool
 
19
 
20
  import requests
21
  import pyarrow.parquet as pq
@@ -29,7 +30,8 @@ import torch
29
 
30
  MAX_SEQ_LEN = int(os.environ.get("HYDRA_SEQ_LEN", "512")) # context length
31
  TIME_BUDGET = 300 # training time budget in seconds (5 minutes)
32
- EVAL_TOKENS = 40 * 524288 # number of tokens for val eval
 
33
 
34
  # ---------------------------------------------------------------------------
35
  # Configuration
@@ -158,7 +160,8 @@ def train_tokenizer():
158
  print("Tokenizer: training BPE tokenizer...")
159
  t0 = time.time()
160
 
161
- tokenizer = rustbpe.Tokenizer()
 
162
  vocab_size_no_special = VOCAB_SIZE - len(SPECIAL_TOKENS)
163
  tokenizer.train_from_iterator(text_iterator(), vocab_size_no_special, pattern=SPLIT_PATTERN)
164
 
@@ -225,9 +228,10 @@ class Tokenizer:
225
  def get_bos_token_id(self):
226
  return self.bos_token_id
227
 
228
- def encode(self, text, prepend=None, num_threads=8):
229
- if prepend is not None:
230
- prepend_id = prepend if isinstance(prepend, int) else self.enc.encode_single_token(prepend)
 
231
  if isinstance(text, str):
232
  ids = self.enc.encode_ordinary(text)
233
  if prepend is not None:
@@ -245,7 +249,7 @@ class Tokenizer:
245
  return self.enc.decode(ids)
246
 
247
 
248
- _TOKEN_BYTES_CACHE: dict = {}
249
 
250
  def get_token_bytes(device="cpu"):
251
  key = str(device)
@@ -341,12 +345,30 @@ def make_dataloader(tokenizer, B, T, split, buffer_size=1000):
341
  gpu_buffer.copy_(cpu_buffer, non_blocking=True)
342
  yield inputs, targets, epoch
343
 
344
- # ---------------------------------------------------------------------------
345
- # Evaluation (DO NOT CHANGE — this is the fixed metric)
346
- # ---------------------------------------------------------------------------
347
-
348
- @torch.no_grad()
349
- def evaluate_bpb(model, tokenizer, batch_size):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  """
351
  Bits per byte (BPB): vocab size-independent evaluation metric.
352
  Sums per-token cross-entropy (in nats), sums target byte lengths,
@@ -357,31 +379,35 @@ def evaluate_bpb(model, tokenizer, batch_size):
357
  Perf: accumulates on GPU (single sync at end), prefetches next batch
358
  while current forward runs.
359
  """
360
- token_bytes = get_token_bytes(device="cuda")
361
- val_loader = make_dataloader(tokenizer, batch_size, MAX_SEQ_LEN, "val")
362
- steps = EVAL_TOKENS // (batch_size * MAX_SEQ_LEN)
 
363
 
364
  # GPU-resident accumulators — avoid per-batch .item() sync
365
  total_nats_t = torch.zeros(1, device="cuda", dtype=torch.float64)
366
  total_bytes_t = torch.zeros(1, device="cuda", dtype=torch.int64)
367
 
368
  # Prefetch first batch
369
- next_batch = next(val_loader)
370
- for _ in range(steps):
371
- x, y, _epoch = next_batch
372
- # Prefetch NEXT batch while GPU computes current forward
373
- next_batch = next(val_loader)
374
- loss_flat = model(x, y, reduction='none').view(-1)
375
- y_flat = y.view(-1)
376
- nbytes = token_bytes[y_flat]
377
- mask = nbytes > 0
378
- total_nats_t += (loss_flat * mask).sum()
379
- total_bytes_t += nbytes.sum()
380
-
381
- # Single GPU→CPU sync at end
382
- total_nats = total_nats_t.item()
383
- total_bytes = total_bytes_t.item()
384
- return total_nats / (math.log(2) * total_bytes)
 
 
 
385
 
386
  # ---------------------------------------------------------------------------
387
  # Main
 
13
  import sys
14
  import time
15
  import math
16
+ import argparse
17
+ import pickle
18
+ from multiprocessing import Pool
19
+ from typing import Any
20
 
21
  import requests
22
  import pyarrow.parquet as pq
 
30
 
31
  MAX_SEQ_LEN = int(os.environ.get("HYDRA_SEQ_LEN", "512")) # context length
32
  TIME_BUDGET = 300 # training time budget in seconds (5 minutes)
33
+ EVAL_TOKENS = 40 * 524288 # number of tokens for val eval
34
+ EVAL_CHUNK_TOKENS = int(os.environ.get("HYDRA_EVAL_CHUNK_TOKENS", str(EVAL_TOKENS)))
35
 
36
  # ---------------------------------------------------------------------------
37
  # Configuration
 
160
  print("Tokenizer: training BPE tokenizer...")
161
  t0 = time.time()
162
 
163
+ tokenizer_cls = getattr(rustbpe, "Tokenizer")
164
+ tokenizer: Any = tokenizer_cls()
165
  vocab_size_no_special = VOCAB_SIZE - len(SPECIAL_TOKENS)
166
  tokenizer.train_from_iterator(text_iterator(), vocab_size_no_special, pattern=SPLIT_PATTERN)
167
 
 
228
  def get_bos_token_id(self):
229
  return self.bos_token_id
230
 
231
+ def encode(self, text, prepend=None, num_threads=8):
232
+ prepend_id = None
233
+ if prepend is not None:
234
+ prepend_id = prepend if isinstance(prepend, int) else self.enc.encode_single_token(prepend)
235
  if isinstance(text, str):
236
  ids = self.enc.encode_ordinary(text)
237
  if prepend is not None:
 
249
  return self.enc.decode(ids)
250
 
251
 
252
+ _TOKEN_BYTES_CACHE: dict[str, torch.Tensor] = {}
253
 
254
  def get_token_bytes(device="cpu"):
255
  key = str(device)
 
345
  gpu_buffer.copy_(cpu_buffer, non_blocking=True)
346
  yield inputs, targets, epoch
347
 
348
+ # ---------------------------------------------------------------------------
349
+ # Evaluation (DO NOT CHANGE — this is the fixed metric)
350
+ # ---------------------------------------------------------------------------
351
+
352
def compute_bpb_from_totals(total_nats: torch.Tensor, total_bytes: torch.Tensor) -> torch.Tensor:
    """Turn accumulated loss (nats) and byte counts into bits-per-byte.

    Args:
        total_nats: Summed per-token cross-entropy, in nats.
        total_bytes: Summed byte length of the scored target tokens.

    Returns:
        ``total_nats / (ln(2) * total_bytes)`` computed in float64.

    Raises:
        ValueError: If ``total_bytes`` is zero or negative.
    """
    # .item() reads the byte total back to the host before dividing, so a zero
    # denominator is reported as an error instead of producing inf/nan.
    if int(total_bytes.item()) <= 0:
        raise ValueError("BPB normalization requires at least one non-special token")
    return total_nats.to(dtype=torch.float64) / (math.log(2) * total_bytes.to(dtype=torch.float64))
356
+
357
+
358
def compute_bpb_from_losses(loss_flat: torch.Tensor, nbytes: torch.Tensor) -> torch.Tensor:
    """Convert per-token losses and token byte lengths into bits-per-byte.

    Tokens with zero byte length (special tokens) are excluded from both the
    numerator and denominator so BPB remains comparable across tokenizer
    special-token conventions.

    Args:
        loss_flat: Per-token losses; multiplied elementwise by the byte mask,
            so it is assumed to line up with ``nbytes`` (TODO confirm shapes
            with callers).
        nbytes: Byte length of each target token.

    Returns:
        The bits-per-byte tensor produced by ``compute_bpb_from_totals``.

    Raises:
        ValueError: Propagated from ``compute_bpb_from_totals`` when no token
            has a positive byte length.
    """
    # Boolean mask selecting tokens that contribute bytes.
    mask = nbytes > 0
    # Masked-out losses are zeroed rather than dropped; the sum runs in float64.
    total_nats = (loss_flat * mask).sum(dtype=torch.float64)
    total_bytes = nbytes[mask].sum(dtype=torch.int64)
    return compute_bpb_from_totals(total_nats, total_bytes)
369
+
370
+ @torch.no_grad()
371
+ def evaluate_bpb(model, tokenizer, batch_size):
372
  """
373
  Bits per byte (BPB): vocab size-independent evaluation metric.
374
  Sums per-token cross-entropy (in nats), sums target byte lengths,
 
379
  Perf: accumulates on GPU (single sync at end), prefetches next batch
380
  while current forward runs.
381
  """
382
+ token_bytes = get_token_bytes(device="cuda")
383
+ val_loader = make_dataloader(tokenizer, batch_size, MAX_SEQ_LEN, "val")
384
+ steps = EVAL_TOKENS // (batch_size * MAX_SEQ_LEN)
385
+ chunk_steps = max(1, EVAL_CHUNK_TOKENS // (batch_size * MAX_SEQ_LEN))
386
 
387
  # GPU-resident accumulators — avoid per-batch .item() sync
388
  total_nats_t = torch.zeros(1, device="cuda", dtype=torch.float64)
389
  total_bytes_t = torch.zeros(1, device="cuda", dtype=torch.int64)
390
 
391
  # Prefetch first batch
392
+ next_batch = next(val_loader)
393
+ steps_done = 0
394
+ while steps_done < steps:
395
+ this_chunk = min(chunk_steps, steps - steps_done)
396
+ for _ in range(this_chunk):
397
+ x, y, _epoch = next_batch
398
+ # Prefetch NEXT batch while GPU computes current forward
399
+ next_batch = next(val_loader)
400
+ loss_flat = model(x, y, reduction='none').view(-1)
401
+ y_flat = y.view(-1)
402
+ nbytes = token_bytes[y_flat]
403
+ total_nats_t += (loss_flat * (nbytes > 0)).sum(dtype=torch.float64)
404
+ total_bytes_t += nbytes[nbytes > 0].sum(dtype=torch.int64)
405
+ steps_done += this_chunk
406
+ if steps_done < steps:
407
+ torch.cuda.empty_cache()
408
+
409
+ # Single GPU→CPU sync at end
410
+ return float(compute_bpb_from_totals(total_nats_t, total_bytes_t).item())
411
 
412
  # ---------------------------------------------------------------------------
413
  # Main
overlay/scripts/audit_overlay_sync.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ from pathlib import Path
7
+
8
+
9
+ DEFAULT_INCLUDE_PATHS = [
10
+ "hydra",
11
+ "subsystems",
12
+ "scripts",
13
+ "htm_rust",
14
+ "harness",
15
+ "configs",
16
+ "prepare.py",
17
+ "prepare_nemotron.py",
18
+ "train.py",
19
+ "pyproject.toml",
20
+ "uv.lock",
21
+ ]
22
+
23
+
24
def _iter_files(path: Path) -> list[Path]:
    """Return the regular files at or below ``path`` in sorted order.

    A missing path yields an empty list, a plain file yields a one-element
    list, and a directory is walked recursively.
    """
    if not path.exists():
        return []
    if path.is_file():
        return [path]
    return sorted(p for p in path.rglob("*") if p.is_file())


def classify_overlay_pairs(*, repo_root: Path, include_paths: list[str]) -> dict[str, list[str]]:
    """Compare root paths against their mirrored overlay copies.

    The overlay is expected at ``repo_root/hf_jobs/feather_h200_image/overlay``.
    Each entry of ``include_paths`` is either a single file or a directory that
    is compared file by file.

    Args:
        repo_root: Repository root holding the source-of-truth files.
        include_paths: Paths relative to ``repo_root`` to audit.

    Returns:
        A dict of sorted, POSIX-style relative paths under four keys:
        ``identical`` (same bytes on both sides), ``root_ahead`` (present on
        both sides but different), ``overlay_only`` (present only in the
        overlay) and ``missing_overlay`` (present only in the root).
    """
    overlay_root = repo_root / "hf_jobs" / "feather_h200_image" / "overlay"
    identical: list[str] = []
    root_ahead: list[str] = []
    overlay_only: list[str] = []
    missing_overlay: list[str] = []

    for rel in include_paths:
        root_path = repo_root / rel
        overlay_path = overlay_root / rel

        if root_path.is_file() or overlay_path.is_file():
            rel_name = rel.replace("\\", "/")
            if root_path.exists() and overlay_path.exists():
                # A file mirrored by a directory (or vice versa) cannot be
                # byte-compared; report it as a difference instead of letting
                # read_bytes() fail on the directory.
                both_files = root_path.is_file() and overlay_path.is_file()
                if both_files and root_path.read_bytes() == overlay_path.read_bytes():
                    identical.append(rel_name)
                else:
                    root_ahead.append(rel_name)
            elif root_path.exists():
                missing_overlay.append(rel_name)
            else:
                # At least one side is a file here, so it must be the overlay.
                overlay_only.append(rel_name)
            continue

        # Directory comparison: index both trees by path relative to `rel`.
        root_files = (
            {p.relative_to(root_path).as_posix(): p for p in _iter_files(root_path)}
            if root_path.is_dir()
            else {}
        )
        overlay_files = (
            {p.relative_to(overlay_path).as_posix(): p for p in _iter_files(overlay_path)}
            if overlay_path.is_dir()
            else {}
        )

        for subrel, root_file in root_files.items():
            rel_name = f"{rel}/{subrel}".replace("\\", "/")
            overlay_file = overlay_files.get(subrel)
            if overlay_file is None:
                missing_overlay.append(rel_name)
            elif root_file.read_bytes() == overlay_file.read_bytes():
                identical.append(rel_name)
            else:
                root_ahead.append(rel_name)

        for subrel in overlay_files:
            if subrel not in root_files:
                overlay_only.append(f"{rel}/{subrel}".replace("\\", "/"))

    for bucket in (identical, root_ahead, overlay_only, missing_overlay):
        bucket.sort()

    return {
        "identical": identical,
        "root_ahead": root_ahead,
        "overlay_only": overlay_only,
        "missing_overlay": missing_overlay,
    }
82
+
83
+
84
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the audit CLI options.

    ``--repo-root`` defaults to the directory one level above this script's
    folder; ``--include-path`` may be repeated and collects into a list
    (empty when the flag is never given).
    """
    default_root = Path(__file__).resolve().parents[1]
    cli = argparse.ArgumentParser(
        description="Audit mirrored H200 overlay files against root source-of-truth paths"
    )
    cli.add_argument("--repo-root", type=Path, default=default_root)
    cli.add_argument("--include-path", action="append", default=[])
    return cli.parse_args(argv)
89
+
90
+
91
def main(argv: list[str] | None = None) -> int:
    """Run the overlay audit and print the classification as sorted JSON.

    Falls back to ``DEFAULT_INCLUDE_PATHS`` when no ``--include-path`` was
    supplied. Always returns exit status 0.
    """
    options = parse_args(argv)
    selected = options.include_path if options.include_path else DEFAULT_INCLUDE_PATHS
    report = classify_overlay_pairs(repo_root=options.repo_root, include_paths=selected)
    print(json.dumps(report, indent=2, sort_keys=True))
    return 0
97
+
98
+
99
+ if __name__ == "__main__":
100
+ raise SystemExit(main())
overlay/scripts/benchmark_assets.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ import shutil
6
+ from pathlib import Path
7
+
8
+ from scripts.benchmark_checkpoint import checkpoint_candidates
9
+
10
+ try:
11
+ from huggingface_hub import HfApi
12
+ except Exception: # pragma: no cover - optional import for offline test envs
13
+ HfApi = None
14
+
15
+
16
def _download_file(*, repo_id: str, filename: str, local_dir: str, token: str | None, subfolder: str | None = None) -> Path:
    """Download one file from a Hub model repo and return its local path.

    The file is fetched with ``hf_hub_download`` into ``local_dir`` (symlinks
    disabled), optionally from ``subfolder`` inside the repo.
    """
    # Imported lazily so the module can be loaded without huggingface_hub.
    from huggingface_hub import hf_hub_download

    request = {
        "repo_id": repo_id,
        "repo_type": "model",
        "filename": filename,
        "subfolder": subfolder,
        "token": token,
        "local_dir": local_dir,
        "local_dir_use_symlinks": False,
    }
    return Path(hf_hub_download(**request))
29
+
30
+
31
def resolve_tokenizer_cache_repo(*, output_repo: str, retina_cache_repo: str) -> str:
    """Pick the repo that hosts the cached tokenizer.

    The first non-empty environment variable wins, checked in this order:
    ``HYDRA_TOKENIZER_CACHE_REPO``, ``FEATHER_HF_OUTPUT_REPO``, ``HF_REPO_ID``,
    ``HYDRA_RETINA_CACHE_REPO``, ``FEATHER_HF_RETINA_CACHE_REPO``. Otherwise
    ``output_repo`` is used, falling back to ``retina_cache_repo``.
    """
    env_priority = (
        "HYDRA_TOKENIZER_CACHE_REPO",
        "FEATHER_HF_OUTPUT_REPO",
        "HF_REPO_ID",
        "HYDRA_RETINA_CACHE_REPO",
        "FEATHER_HF_RETINA_CACHE_REPO",
    )
    for variable in env_priority:
        configured = os.environ.get(variable)
        if configured:
            return configured
    return output_repo or retina_cache_repo
41
+
42
+
43
def tokenizer_cache_prefix() -> str:
    """Return the repo subfolder for the tokenizer of the configured vocab size.

    The size comes from ``HYDRA_VOCAB_SIZE`` (default ``65536``) and is parsed
    as an integer, so a non-numeric value raises ``ValueError``.
    """
    raw_size = os.environ.get("HYDRA_VOCAB_SIZE", "65536")
    return "tokenizer/vocab" + str(int(raw_size))
46
+
47
+
48
def choose_remote_checkpoint_path(files: list[str]) -> str | None:
    """Select the best checkpoint path from a repo file listing.

    Only files inside a folder are considered (the name must be preceded by
    ``/``). Preference order is ``pretrain_final.pt``, then ``best_bpb.pt``,
    then ``latest.pt``; within one kind the lexicographically greatest path is
    returned. Returns ``None`` when nothing matches.
    """
    for suffix in ("/pretrain_final.pt", "/best_bpb.pt", "/latest.pt"):
        matches = [entry for entry in files if entry.endswith(suffix)]
        if matches:
            return max(matches)
    return None
65
+
66
+
67
def hydrate_benchmark_assets(*, cache_dir: Path, output_repo: str, tokenizer_repo: str, token: str | None) -> dict[str, str]:
    """Make sure a checkpoint and the tokenizer files exist under ``cache_dir``.

    Args:
        cache_dir: Local directory that holds (or receives) the assets.
        output_repo: Hub model repo searched for checkpoints.
        tokenizer_repo: Fallback repo for tokenizer files when no overriding
            environment variable is set (see ``resolve_tokenizer_cache_repo``).
        token: Hub access token, or ``None``.

    Returns:
        ``{"checkpoint_path": ..., "tokenizer_dir": ...}`` as strings.

    Raises:
        FileNotFoundError: If no checkpoint is available locally or remotely.
        Exception: Tokenizer download errors are not caught and propagate.
    """
    cache_dir.mkdir(parents=True, exist_ok=True)
    tok_dir = cache_dir / "tokenizer"
    tok_dir.mkdir(parents=True, exist_ok=True)
    # NOTE(review): tokenizer_repo is passed for both fallback arguments, so the
    # `output_repo` parameter of this function never influences the tokenizer repo.
    tok_repo = resolve_tokenizer_cache_repo(output_repo=tokenizer_repo, retina_cache_repo=tokenizer_repo)
    tok_prefix = tokenizer_cache_prefix()

    # Pass 1: walk the candidate names in priority order. A candidate that is
    # missing locally is downloaded from the repo root before the next
    # candidate is even checked on disk.
    ckpt_path = None
    for candidate in checkpoint_candidates(cache_dir):
        if candidate.exists():
            ckpt_path = candidate
            break
        try:
            ckpt_path = _download_file(repo_id=output_repo, filename=candidate.name, local_dir=str(cache_dir), token=token)
            break
        except Exception:
            # Best effort: any download failure just moves on to the next name.
            continue
    # Pass 2: nothing at the repo root, so list the repo and pick a checkpoint
    # stored inside a subfolder.
    if ckpt_path is None:
        try:
            if HfApi is None:
                raise RuntimeError("huggingface_hub unavailable")
            files = HfApi(token=token).list_repo_files(repo_id=output_repo, repo_type="model", token=token)
            remote_path = choose_remote_checkpoint_path(files)
            if remote_path is not None:
                parent, filename = remote_path.rsplit("/", 1)
                downloaded_path = _download_file(
                    repo_id=output_repo,
                    filename=filename,
                    local_dir=str(cache_dir),
                    token=token,
                    subfolder=parent,
                )
                # Copy the file to cache_dir/<filename> so later runs find it
                # through the local candidate check.
                canonical_path = cache_dir / filename
                if downloaded_path != canonical_path:
                    canonical_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(downloaded_path, canonical_path)
                ckpt_path = canonical_path
        except Exception:
            # Every failure here (including the RuntimeError above) is swallowed;
            # the FileNotFoundError below is the only signal the caller gets.
            pass
    if ckpt_path is None:
        raise FileNotFoundError(f"No benchmark checkpoint found in cache or repo {output_repo}")

    # Tokenizer files are fetched from <tok_repo>/<tok_prefix>/ and copied to
    # the flat tok_dir layout when the download landed somewhere else.
    tok_path = tok_dir / "tokenizer.pkl"
    if not tok_path.exists():
        downloaded_tok = _download_file(repo_id=tok_repo, filename="tokenizer.pkl", local_dir=str(tok_dir), token=token, subfolder=tok_prefix)
        if downloaded_tok != tok_path:
            shutil.copy2(downloaded_tok, tok_path)

    token_bytes_path = tok_dir / "token_bytes.pt"
    if not token_bytes_path.exists():
        downloaded_token_bytes = _download_file(repo_id=tok_repo, filename="token_bytes.pt", local_dir=str(tok_dir), token=token, subfolder=tok_prefix)
        if downloaded_token_bytes != token_bytes_path:
            shutil.copy2(downloaded_token_bytes, token_bytes_path)

    return {
        "checkpoint_path": str(ckpt_path),
        "tokenizer_dir": str(tok_dir),
    }
overlay/scripts/benchmark_checkpoint.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+
6
+ from scripts.hf_routing import resolve_routing
7
+
8
+
9
def checkpoint_candidates(cache_dir: Path) -> list[Path]:
    """List the checkpoint paths to probe in ``cache_dir``, highest priority first."""
    priority = ("best_bpb.pt", "pretrain_final.pt", "latest.pt")
    return [cache_dir / filename for filename in priority]
15
+
16
+
17
def choose_checkpoint_candidate(cache_dir: Path) -> Path | None:
    """Return the first candidate checkpoint that exists on disk, else ``None``."""
    existing = (path for path in checkpoint_candidates(cache_dir) if path.exists())
    return next(existing, None)
22
+
23
+
24
def resolve_checkpoint_source(*, cache_dir: Path, output_repo: str | None) -> dict[str, str]:
    """Describe where the benchmark checkpoint should come from.

    Returns ``{"mode": "local", "path": ...}`` when a cached checkpoint exists.
    Otherwise returns ``{"mode": "remote", "repo_id": ...}`` using
    ``output_repo`` if given, else the output repo from ``resolve_routing``.
    """
    cached = choose_checkpoint_candidate(cache_dir)
    if cached is not None:
        return {"mode": "local", "path": str(cached)}
    repo_id = output_repo if output_repo else resolve_routing(token=None).output_repo
    return {"mode": "remote", "repo_id": repo_id}
32
+
33
+
34
def _download_checkpoint_file(*, repo_id: str, filename: str, local_dir: str, token: str | None) -> str:
    """Fetch ``filename`` from a Hub model repo into ``local_dir``.

    Returns whatever ``hf_hub_download`` returns (the local file path).
    """
    # Imported lazily so the module can be loaded without huggingface_hub.
    from huggingface_hub import hf_hub_download

    request = {
        "repo_id": repo_id,
        "repo_type": "model",
        "filename": filename,
        "token": token,
        "local_dir": local_dir,
        "local_dir_use_symlinks": False,
    }
    return hf_hub_download(**request)
45
+
46
+
47
def hydrate_checkpoint(*, cache_dir: Path, output_repo: str | None, token: str | None) -> Path | None:
    """Return a usable checkpoint path, downloading one if none is cached.

    Args:
        cache_dir: Directory probed for cached checkpoints and used as the
            download target.
        output_repo: Hub repo to download from; when falsy the repo is taken
            from ``resolve_checkpoint_source``.
        token: Hub access token, or ``None``.

    Returns:
        The path of the cached or freshly downloaded checkpoint, or ``None``
        when every download attempt failed.
    """
    local = choose_checkpoint_candidate(cache_dir)
    if local is not None:
        return local
    source = resolve_checkpoint_source(cache_dir=cache_dir, output_repo=output_repo)
    if source["mode"] == "local":
        # resolve_checkpoint_source re-probes the cache; a checkpoint that
        # appeared since the first lookup must be returned, not dropped.
        return Path(source["path"])
    if source["mode"] != "remote":
        return None
    cache_dir.mkdir(parents=True, exist_ok=True)
    for filename in ("best_bpb.pt", "pretrain_final.pt", "latest.pt"):
        try:
            path = Path(
                _download_checkpoint_file(
                    repo_id=source["repo_id"],
                    filename=filename,
                    local_dir=str(cache_dir),
                    token=token,
                )
            )
            if path.exists():
                return path
        except Exception:
            # Best effort: a missing file or network error moves on to the
            # next checkpoint name.
            continue
    return None
overlay/scripts/benchmark_checkpoint_report.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import json
5
+
6
+
7
def build_checkpoint_report(files: list[str]) -> dict[str, object]:
    """Summarise which jobs have checkpoints in a repo file listing.

    Only paths shaped like ``jobs/<job_id>/.../<checkpoint>.pt`` are counted,
    where the file name is ``pretrain_final.pt``, ``best_bpb.pt`` or
    ``latest.pt``. For each job the preferred path follows that same order,
    taking the first listed match. Candidates are sorted by job id, descending.
    """
    preference = ("pretrain_final.pt", "best_bpb.pt", "latest.pt")

    # Group the accepted paths by job id, keeping listing order.
    grouped: dict[str, list[str]] = {}
    for entry in files:
        segments = entry.split("/")
        if len(segments) < 3 or segments[0] != "jobs":
            continue
        if segments[-1] not in preference:
            continue
        grouped.setdefault(segments[1], []).append(entry)

    def _preferred(paths: list[str]) -> str | None:
        # First path (in listing order) matching the highest-priority suffix.
        for suffix in preference:
            for candidate in paths:
                if candidate.endswith(suffix):
                    return candidate
        return None

    candidates = [
        {
            "job_id": job_id,
            "preferred_path": _preferred(paths),
            "available_paths": sorted(paths),
        }
        for job_id, paths in grouped.items()
    ]
    candidates.sort(key=lambda row: row["job_id"], reverse=True)
    return {"n_candidates": len(candidates), "candidates": candidates}
42
+
43
+
44
def main() -> int:
    """Print the checkpoint report for an empty file listing and return 0."""
    report = build_checkpoint_report([])
    print(json.dumps(report, indent=2, sort_keys=True))
    return 0
47
+
48
+
49
+ if __name__ == "__main__":
50
+ raise SystemExit(main())
overlay/scripts/benchmark_contract.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+
9
def _require_path(payload: dict[str, Any], path: str) -> None:
    """Raise ``ValueError`` unless the dotted ``path`` resolves through nested dicts."""
    current: Any = payload
    for part in path.split('.'):
        if not isinstance(current, dict) or part not in current:
            raise ValueError(f"missing required field: {path}")
        current = current[part]


def validate_benchmark_contract(payload: dict[str, Any]) -> None:
    """Validate the structure of a benchmark contract.

    Checks that every required field exists, that each benchmark section is a
    mapping with ``name``, ``primary_metric`` and ``decode``, that ``seeds`` is
    a list of at least three values, and that both required variants are
    mappings with status ``runnable_now``.

    Raises:
        ValueError: On the first violated rule. Malformed sections or variants
            (for example a string or ``None``) are reported as ``ValueError``
            as well, never as ``TypeError``/``AttributeError``.
    """
    for field in [
        "cycle_id",
        "hardware_class",
        "seeds",
        "budget_modes",
        "coding_benchmarks.fast_iteration",
        "coding_benchmarks.milestone",
        "reasoning_benchmarks.fast_iteration",
        "reasoning_benchmarks.milestone",
        "variants.hydra_full",
        "variants.baseline_mamba_matched",
    ]:
        _require_path(payload, field)

    section_keys = ("name", "primary_metric", "decode")
    for family in ("coding_benchmarks", "reasoning_benchmarks"):
        for slot in ("fast_iteration", "milestone"):
            section = payload[family][slot]
            # A string section would satisfy `in` through substring matching,
            # so the mapping type is checked explicitly.
            if not isinstance(section, dict) or any(key not in section for key in section_keys):
                raise ValueError("benchmark sections require name, primary_metric, and decode")

    if not isinstance(payload["seeds"], list) or len(payload["seeds"]) < 3:
        raise ValueError("seeds must contain at least three values")

    for variant in ("hydra_full", "baseline_mamba_matched"):
        entry = payload["variants"][variant]
        status = entry.get("status") if isinstance(entry, dict) else None
        if status != "runnable_now":
            raise ValueError(f"{variant} must be runnable_now")
49
+
50
+
51
def load_benchmark_contract(path: Path) -> dict[str, Any]:
    """Read a UTF-8 JSON contract from ``path``, validate it and return it.

    Raises:
        ValueError: If the top-level JSON value is not an object, or if
            ``validate_benchmark_contract`` rejects it.
    """
    raw_text = path.read_text(encoding="utf-8")
    contract = json.loads(raw_text)
    if isinstance(contract, dict):
        validate_benchmark_contract(contract)
        return contract
    raise ValueError("benchmark contract must be a JSON object")
57
+
58
+
59
def main() -> int:
    """Load the cycle-1 execution freeze contract and print it as sorted JSON."""
    contract = load_benchmark_contract(Path("artifacts/cycle_1_execution_freeze.json"))
    print(json.dumps(contract, indent=2, sort_keys=True))
    return 0
64
+
65
+
66
+ if __name__ == "__main__":
67
+ raise SystemExit(main())
overlay/scripts/benchmark_datasets.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+
6
+
7
+ CANONICAL_SUBSETS = {
8
+ "MBPP": Path("data/benchmarks/mbpp.cycle1.jsonl"),
9
+ "GSM8K": Path("data/benchmarks/gsm8k.cycle1.jsonl"),
10
+ }
11
+
12
+
13
def resolve_benchmark_dataset(benchmark_name: str, explicit_path: Path | None) -> Path:
    """Return the dataset file for a benchmark.

    An ``explicit_path`` is returned untouched. Otherwise the canonical subset
    registered for ``benchmark_name`` is resolved against the current working
    directory.

    Raises:
        ValueError: If no explicit path is given and the benchmark has no
            canonical subset.
    """
    if explicit_path is not None:
        return explicit_path
    relative = CANONICAL_SUBSETS.get(benchmark_name)
    if relative is None:
        raise ValueError(f"Unsupported benchmark dataset: {benchmark_name}")
    return Path.cwd() / relative
overlay/scripts/benchmark_preflight.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+
6
+ from scripts.bootstrap_benchmark_env import build_bootstrap_report
7
+ from scripts.benchmark_checkpoint import choose_checkpoint_candidate
8
+
9
+
10
def build_readiness_report(*, cache_dir: Path, hf_token_present: bool, dependencies_present: bool = True, missing_dependencies: list[str] | None = None, output_repo: str | None = None, tokenizer_repo: str | None = None) -> dict[str, object]:
    """Report whether the local environment can run the Hydra benchmarks.

    The run is considered ready when a cached checkpoint exists, both tokenizer
    files (``tokenizer.pkl`` and ``token_bytes.pt``) exist under
    ``cache_dir/tokenizer``, and ``dependencies_present`` is true. Hydration is
    flagged possible when a token and both repo ids are supplied. Install
    hints are taken from ``build_bootstrap_report``.
    """
    checkpoint = choose_checkpoint_candidate(cache_dir)
    has_checkpoint = checkpoint is not None

    tokenizer_dir = cache_dir / "tokenizer"
    tokenizer_ready = all(
        (tokenizer_dir / filename).exists()
        for filename in ("tokenizer.pkl", "token_bytes.pt")
    )

    runtime = build_bootstrap_report(missing_dependencies=list(missing_dependencies or []))

    report: dict[str, object] = {}
    report["cache_dir"] = str(cache_dir)
    report["checkpoint_present"] = has_checkpoint
    report["checkpoint_path"] = str(checkpoint) if has_checkpoint else None
    report["tokenizer_ready"] = tokenizer_ready
    report["hf_token_present"] = hf_token_present
    report["dependencies_present"] = dependencies_present
    report["missing_dependencies"] = list(missing_dependencies or [])
    report["install_hint"] = runtime["install_hint"]
    report["install_command"] = runtime["install_command"]
    report["install_blockers"] = runtime["install_blockers"]
    report["output_repo"] = output_repo
    report["tokenizer_repo"] = tokenizer_repo
    report["hydration_possible"] = bool(hf_token_present and output_repo and tokenizer_repo)
    report["ready_for_hydra_benchmarks"] = has_checkpoint and tokenizer_ready and dependencies_present
    return report
overlay/scripts/benchmark_runner.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ import re
7
+ import sys
8
+ from pathlib import Path
9
+ from typing import Any, Callable
10
+
11
+ REPO_ROOT = Path(__file__).resolve().parents[1]
12
+ if str(REPO_ROOT) not in sys.path:
13
+ sys.path.insert(0, str(REPO_ROOT))
14
+
15
+ LEDGER_TEMPLATE_PATH = REPO_ROOT / "artifacts" / "benchmark_ledger.template.json"
16
+
17
+ from scripts.hydra_generation import build_hydra_generator
18
+ from scripts.benchmark_datasets import resolve_benchmark_dataset as resolve_canonical_dataset
19
+ from scripts.benchmark_suite import build_prompt, validate_sample
20
+
21
+
22
def load_jsonl_samples(path: Path) -> list[dict[str, Any]]:
    """Parse a UTF-8 JSONL file into a list, skipping blank lines."""
    content = path.read_text(encoding="utf-8")
    return [json.loads(record) for record in content.splitlines() if record.strip()]
28
+
29
+
30
def _score_mbpp(samples: list[dict[str, Any]], generate_fn: Callable[[str], str]) -> float:
    """Compute pass@1 on MBPP samples.

    For each sample the generated code is executed, followed by every entry of
    ``sample["tests"]`` in the same namespace. A sample passes only if nothing
    raises; a failing sample counts as zero instead of aborting the run.

    Args:
        samples: Parsed MBPP records.
        generate_fn: Maps a prompt to generated source code.

    Returns:
        Fraction of passing samples, or ``0.0`` for an empty list.

    Raises:
        ValueError: From ``validate_sample`` when a record is malformed.
            Errors raised by ``generate_fn`` itself also propagate.
    """
    passed = 0
    for sample in samples:
        validate_sample("MBPP", sample)
        code = generate_fn(build_prompt("MBPP", sample))
        namespace: dict[str, Any] = {}
        try:
            # NOTE(security): this executes model-generated code in-process with
            # no sandbox; only run it in an isolated, disposable environment.
            exec(code, namespace, namespace)
            for test in sample["tests"]:
                exec(test, namespace, namespace)
        except (Exception, SystemExit):
            # Syntax errors, failed asserts, runtime errors and exit() calls in
            # the generated program all mean "this sample failed".
            continue
        passed += 1
    return passed / len(samples) if samples else 0.0
41
+
42
+
43
+ def _extract_last_number(text: str) -> str | None:
44
+ matches = re.findall(r"-?\d+(?:\.\d+)?", text)
45
+ return matches[-1] if matches else None
46
+
47
+
48
def _score_gsm8k(samples: list[dict[str, Any]], generate_fn: Callable[[str], str]) -> float:
    """Compute exact-match accuracy on GSM8K samples.

    The last number in the generated text is compared, as a string, with
    ``str(sample["answer"])``. Returns ``0.0`` for an empty list.
    """
    if not samples:
        return 0.0

    correct = 0
    for record in samples:
        validate_sample("GSM8K", record)
        completion = generate_fn(build_prompt("GSM8K", record))
        predicted = _extract_last_number(completion)
        if predicted is None:
            continue
        if predicted == str(record["answer"]):
            correct += 1
    return correct / len(samples)
57
+
58
+
59
def _score_humaneval(samples: list[dict[str, Any]], generate_fn: Callable[[str], str]) -> float:
    """Compute pass@1 on HumanEval samples.

    For each sample the generated code is executed, followed by
    ``sample["test"]`` in the same namespace. A sample passes only if nothing
    raises; a failing sample counts as zero instead of aborting the run.

    Args:
        samples: Parsed HumanEval records.
        generate_fn: Maps a prompt to generated source code.

    Returns:
        Fraction of passing samples, or ``0.0`` for an empty list.

    Raises:
        ValueError: From ``validate_sample`` when a record is malformed.
            Errors raised by ``generate_fn`` itself also propagate.
    """
    passed = 0
    for sample in samples:
        validate_sample("HumanEval", sample)
        code = generate_fn(build_prompt("HumanEval", sample))
        namespace: dict[str, Any] = {}
        try:
            # NOTE(security): this executes model-generated code in-process with
            # no sandbox; only run it in an isolated, disposable environment.
            exec(code, namespace, namespace)
            exec(sample["test"], namespace, namespace)
        except (Exception, SystemExit):
            # Any error or exit() from the generated program or its test means
            # "this sample failed".
            continue
        passed += 1
    return passed / len(samples) if samples else 0.0
69
+
70
+
71
def _score_arc(samples: list[dict[str, Any]], generate_fn: Callable[[str], str]) -> float:
    """Compute accuracy on ARC-Challenge samples.

    The stripped generation must equal ``str(sample["answer"])`` exactly.
    Returns ``0.0`` for an empty list.
    """
    if not samples:
        return 0.0

    correct = 0
    for record in samples:
        validate_sample("ARC-Challenge", record)
        completion = generate_fn(build_prompt("ARC-Challenge", record))
        if completion.strip() == str(record["answer"]):
            correct += 1
    return correct / len(samples)
79
+
80
+
81
def run_benchmark(benchmark_name: str, path: Path, generate_fn: Callable[[str], str]) -> dict[str, Any]:
    """Score ``generate_fn`` on the JSONL samples at ``path``.

    Returns a dict with ``benchmark``, ``primary_metric``, ``score`` and
    ``n_samples``. The samples are loaded before the benchmark name is
    checked, so a missing file is reported first.

    Raises:
        ValueError: If ``benchmark_name`` is not a runnable benchmark.
    """
    samples = load_jsonl_samples(path)

    # benchmark name -> (primary metric label, scoring function)
    scorers = {
        "MBPP": ("pass_at_1", _score_mbpp),
        "GSM8K": ("exact_match", _score_gsm8k),
        "HumanEval": ("pass_at_1", _score_humaneval),
        "ARC-Challenge": ("accuracy", _score_arc),
    }
    if benchmark_name not in scorers:
        raise ValueError(f"Unsupported runnable benchmark: {benchmark_name}")

    metric, scorer = scorers[benchmark_name]
    return {
        "benchmark": benchmark_name,
        "primary_metric": metric,
        "score": scorer(samples, generate_fn),
        "n_samples": len(samples),
    }
112
+
113
+
114
def write_benchmark_result(path: Path, payload: dict[str, Any]) -> None:
    """Write ``payload`` to ``path`` as indented, key-sorted UTF-8 JSON.

    Missing parent directories are created first.
    """
    destination_dir = path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2, sort_keys=True)
    path.write_text(serialized, encoding="utf-8")
117
+
118
+
119
def append_benchmark_run_record(
    ledger_path: Path,
    result: dict[str, Any],
    *,
    benchmark_name: str,
    variant: str,
    seed: int,
    samples_path: Path,
) -> None:
    """Append one run record to the JSON benchmark ledger.

    A missing ledger is seeded from ``LEDGER_TEMPLATE_PATH``. If the ledger
    holds only the template's ``example-run-0001`` placeholder, that record is
    removed before the new one is added.

    Args:
        ledger_path: Ledger file to update (created when absent).
        result: Benchmark result; must contain ``score`` and may contain
            ``run_id``.
        benchmark_name: Decides whether the score is recorded as the coding or
            the reasoning score.
        variant: Model variant label stored in the record.
        seed: Seed stored in the record and used in the default run id.
        samples_path: Sample file path stored under ``artifacts``.
    """
    if not ledger_path.exists():
        ledger_path.parent.mkdir(parents=True, exist_ok=True)
        ledger_path.write_text(LEDGER_TEMPLATE_PATH.read_text(encoding="utf-8"), encoding="utf-8")
    payload = json.loads(ledger_path.read_text(encoding="utf-8"))
    run_records = payload.setdefault("run_records", [])
    if len(run_records) == 1 and run_records[0].get("run_id") == "example-run-0001":
        run_records.clear()

    cycle = payload.get("benchmark_cycle", {})
    # An empty or null budget_modes must not crash a finished run with
    # IndexError/TypeError; record "no budget mode" instead.
    budget_modes = cycle.get("budget_modes") or [None]
    score = result["score"]
    run_records.append(
        {
            "run_id": result.get("run_id", f"{benchmark_name.lower()}-{seed}"),
            "commit": "HEAD",
            "model_family": "hydra",
            "variant": variant,
            "seed": seed,
            "hardware": {
                "hardware_class": cycle.get("hardware_class", "unknown"),
            },
            "budget": {
                "budget_mode": budget_modes[0],
            },
            "capability": {
                "coding_score": score if benchmark_name in {"MBPP", "HumanEval"} else None,
                "reasoning_score": score if benchmark_name in {"GSM8K", "ARC-Challenge"} else None,
            },
            "artifacts": {
                "samples_path": str(samples_path),
            },
        }
    )
    ledger_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
158
+
159
+
160
def resolve_samples_path(benchmark_name: str, samples: Path | None, suite_path: Path) -> Path:
    """Find the JSONL sample file for a benchmark.

    Resolution order: the explicit ``samples`` path; a ``sample_path`` declared
    for the benchmark in the suite JSON (coding before reasoning,
    ``fast_iteration`` before ``milestone``); the canonical dataset location.

    Raises:
        ValueError: If none of the above yields a path.
    """
    if samples is not None:
        return samples

    suite = json.loads(suite_path.read_text(encoding="utf-8"))
    # Lazy on purpose: later sections are only touched if earlier ones miss.
    entries = (
        suite[family].get(slot)
        for family in ("coding_benchmarks", "reasoning_benchmarks")
        if family in suite
        for slot in ("fast_iteration", "milestone")
    )
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        if entry.get("name") == benchmark_name and "sample_path" in entry:
            return Path(entry["sample_path"])

    try:
        return resolve_canonical_dataset(benchmark_name, None)
    except ValueError:
        raise ValueError(f"No sample path found for benchmark: {benchmark_name}")
175
+
176
+
177
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the benchmark runner's command-line options.

    ``--benchmark`` is required; every other option has a default (``None``
    for the optional paths and ``--device``).
    """
    cli = argparse.ArgumentParser(description="Run a local benchmark against JSONL samples")
    # Declared in the order they should appear in --help.
    option_table: list[tuple[str, dict[str, Any]]] = [
        ("--benchmark", {"required": True, "choices": ["MBPP", "GSM8K", "HumanEval", "ARC-Challenge"]}),
        ("--samples", {"type": Path}),
        ("--suite", {"type": Path, "default": REPO_ROOT / "artifacts" / "benchmark_suite.cycle1.json"}),
        ("--out", {"type": Path}),
        ("--ledger", {"type": Path}),
        ("--variant", {"default": "hydra_full"}),
        ("--seed", {"type": int, "default": 42}),
        ("--generator-mode", {"choices": ["stub", "hydra"], "default": "stub"}),
        ("--checkpoint", {"type": Path}),
        ("--device", {}),
        ("--max-new-tokens", {"type": int, "default": 256}),
        ("--temperature", {"type": float, "default": 0.2}),
        ("--top-p", {"type": float, "default": 0.95}),
    ]
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    return cli.parse_args(argv)
193
+
194
+
195
def main(argv: list[str] | None = None) -> int:
    """Run one benchmark from the command line.

    Builds the generator (the Hydra model, or a stub that echoes the prompt),
    scores the benchmark, optionally writes the result to ``--out`` and, on
    success only, appends a record to ``--ledger``. The result is always
    printed as JSON.

    Returns:
        ``0`` on success, ``1`` when generator construction or scoring raised.
    """
    args = parse_args(argv)
    sample_path = resolve_samples_path(args.benchmark, args.samples, args.suite)

    def _failure(failure_type: str, exc: BaseException) -> dict[str, Any]:
        # Shape of the result emitted when the run could not be scored.
        return {
            "benchmark": args.benchmark,
            "status": "failed",
            "failure_type": failure_type,
            "error": str(exc),
            "n_samples": 0,
        }

    def _echo(prompt: str) -> str:
        # Stub generator: returns the prompt unchanged.
        return prompt

    exit_code = 1
    try:
        if args.generator_mode == "hydra":
            generator = build_hydra_generator(
                checkpoint_path=args.checkpoint,
                device=args.device,
                max_new_tokens=args.max_new_tokens,
                temperature=args.temperature,
                top_p=args.top_p,
            )
        else:
            generator = _echo
        result = run_benchmark(args.benchmark, sample_path, generator)
        exit_code = 0
    except FileNotFoundError as exc:
        result = _failure("missing_checkpoint", exc)
    except Exception as exc:  # noqa: BLE001
        result = _failure(type(exc).__name__, exc)

    if args.out is not None:
        write_benchmark_result(args.out, result)
    if args.ledger is not None and exit_code == 0:
        append_benchmark_run_record(
            args.ledger,
            result,
            benchmark_name=args.benchmark,
            variant=args.variant,
            seed=args.seed,
            samples_path=sample_path,
        )
    print(json.dumps(result, indent=2, sort_keys=True))
    return exit_code
245
+
246
+
247
+ if __name__ == "__main__":
248
+ raise SystemExit(main())
overlay/scripts/benchmark_suite.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+
10
@dataclass(frozen=True)
class BenchmarkSpec:
    """Immutable description of one supported benchmark."""

    # Benchmark identifier.
    name: str
    # Benchmark category label.
    family: str
    # Keys that every sample dict for this benchmark must contain.
    required_fields: tuple[str, ...]
15
+
16
+
17
# Every benchmark this module can validate and render, keyed by its name.
REGISTRY: dict[str, BenchmarkSpec] = {
    spec.name: spec
    for spec in (
        BenchmarkSpec("MBPP", "coding", ("task_id", "prompt", "tests")),
        BenchmarkSpec("HumanEval", "coding", ("task_id", "prompt", "test")),
        BenchmarkSpec("GSM8K", "reasoning", ("question", "answer")),
        BenchmarkSpec("ARC-Challenge", "reasoning", ("question", "choices", "answer")),
    )
}
23
+
24
+
25
def validate_sample(benchmark_name: str, sample: dict[str, Any]) -> None:
    """Check that ``sample`` contains every field the benchmark requires.

    Raises:
        KeyError: ``benchmark_name`` is not a key of ``REGISTRY``.
        ValueError: a required field is absent; the first missing field (in
            declaration order) is named in the message.
    """
    required = REGISTRY[benchmark_name].required_fields
    absent = [field for field in required if field not in sample]
    if absent:
        field = absent[0]
        raise ValueError(f"{benchmark_name} sample missing required field: {field}")
30
+
31
+
32
def build_prompt(benchmark_name: str, sample: dict[str, Any]) -> str:
    """Render the prompt text for one benchmark sample.

    Args:
        benchmark_name: One of the names registered in ``REGISTRY``.
        sample: Sample dict; must contain the benchmark's required fields.

    Returns:
        The prompt string to feed to the generator.

    Raises:
        ValueError: the benchmark is unknown, has no prompt template, or the
            sample is missing a required field.
    """
    # Reject unknown names up front. validate_sample indexes REGISTRY directly,
    # so without this guard an unknown name surfaced as a bare KeyError and the
    # "Unknown benchmark" ValueError at the bottom could never be reached.
    if benchmark_name not in REGISTRY:
        raise ValueError(f"Unknown benchmark: {benchmark_name}")
    validate_sample(benchmark_name, sample)

    if benchmark_name == "MBPP":
        tests = sample["tests"]
        if isinstance(tests, str):
            # A single pre-rendered test string must not be split into characters.
            rendered_tests = tests
        else:
            rendered_tests = "\n".join(str(t) for t in tests)
        return (
            "Write a Python function that solves the task below.\n\n"
            f"Task:\n{sample['prompt']}\n\n"
            f"Tests:\n{rendered_tests}\n"
        )
    if benchmark_name == "HumanEval":
        return (
            "Complete the following Python function exactly as specified.\n\n"
            f"Prompt:\n{sample['prompt']}\n\n"
            f"Reference test:\n{sample['test']}\n"
        )
    if benchmark_name == "GSM8K":
        return f"Solve the following math word problem. Return only the final answer.\n\nQuestion: {sample['question']}\n"
    if benchmark_name == "ARC-Challenge":
        choices = sample["choices"]
        rendered_choices = "\n".join(f"- {choice}" for choice in choices)
        return (
            "Answer the following multiple-choice science question. Return only the correct option text or label.\n\n"
            f"Question: {sample['question']}\nChoices:\n{rendered_choices}\n"
        )
    # Registered benchmark without a prompt template.
    raise ValueError(f"Unknown benchmark: {benchmark_name}")
58
+
59
+
60
def load_cycle_benchmark_suite(path: Path) -> dict[str, dict[str, BenchmarkSpec]]:
    """Load a cycle suite JSON file and resolve each slot to its ``BenchmarkSpec``.

    The file must contain the sections ``coding_benchmarks`` and
    ``reasoning_benchmarks``, each with ``fast_iteration`` and ``milestone``
    slots whose ``name`` is a key of ``REGISTRY``.

    Raises:
        ValueError: a section or slot is missing, or a benchmark name is not
            registered.
    """
    section_names = ("coding_benchmarks", "reasoning_benchmarks")
    slot_names = ("fast_iteration", "milestone")

    payload = json.loads(path.read_text(encoding="utf-8"))
    suite: dict[str, dict[str, BenchmarkSpec]] = {section: {} for section in section_names}
    for section in section_names:
        if section not in payload:
            raise ValueError(f"missing benchmark section: {section}")
        slots = payload[section]
        for slot in slot_names:
            if slot not in slots:
                raise ValueError(f"missing benchmark slot: {section}.{slot}")
            name = slots[slot]["name"]
            if name not in REGISTRY:
                raise ValueError(f"unsupported benchmark: {name}")
            suite[section][slot] = REGISTRY[name]
    return suite
74
+
75
+
76
def main() -> int:
    """Load the cycle-1 suite file and print the benchmark name chosen for each slot."""
    suite = load_cycle_benchmark_suite(Path("artifacts/benchmark_suite.cycle1.json"))
    summary = {}
    for section_name, slots in suite.items():
        summary[section_name] = {slot: spec.name for slot, spec in slots.items()}
    print(json.dumps(summary, indent=2))
    return 0
81
+
82
+
83
+ if __name__ == "__main__":
84
+ raise SystemExit(main())
overlay/scripts/bootstrap_benchmark_env.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import shutil
6
+
7
+ import torch
8
+
9
+
10
# Import name -> pip distribution name.
PACKAGE_MAP = dict(
    mamba_ssm="mamba-ssm",
    transformers="transformers",
)
14
+
15
+
16
def build_install_command(*, missing_dependencies: list[str]) -> list[str]:
    """Return the pip argv that would install the missing dependencies.

    Module names are translated through ``PACKAGE_MAP``; names without a
    mapping are passed to pip unchanged. An empty input yields an empty list.
    """
    packages = []
    for module_name in missing_dependencies:
        packages.append(PACKAGE_MAP.get(module_name, module_name))
    if not packages:
        return []
    return ["python", "-m", "pip", "install", *packages]
19
+
20
+
21
def diagnose_install_blockers(
    *,
    missing_dependencies: list[str],
    torch_version: str,
    cuda_available: bool,
    nvcc_present: bool,
) -> list[str]:
    """List likely reasons a ``mamba_ssm`` install would fail in this environment.

    Returns an empty list when ``mamba_ssm`` is not among the missing
    dependencies. Otherwise reports a CPU-only torch runtime (``+cpu`` in the
    version string or CUDA unavailable) and/or a missing ``nvcc``.
    """
    if "mamba_ssm" not in missing_dependencies:
        return []

    cpu_only_runtime = "+cpu" in torch_version or not cuda_available
    checks = (
        (cpu_only_runtime, "mamba_ssm install likely blocked by CPU-only torch runtime"),
        (not nvcc_present, "mamba_ssm install likely blocked because nvcc is unavailable"),
    )
    return [message for triggered, message in checks if triggered]
35
+
36
+
37
def build_bootstrap_report(*, missing_dependencies: list[str]) -> dict[str, object]:
    """Assemble a readiness report for the benchmark environment.

    The report states whether anything is missing, a human-readable install
    hint, the pip command to run, and likely install blockers derived from the
    local torch build, CUDA availability and the presence of ``nvcc`` on PATH.
    """
    ready = len(missing_dependencies) == 0
    if ready:
        install_hint = ""
    else:
        package_names = ", ".join(PACKAGE_MAP.get(name, name) for name in missing_dependencies)
        install_hint = f"Install missing benchmark dependencies: {package_names}"

    install_blockers = diagnose_install_blockers(
        missing_dependencies=missing_dependencies,
        torch_version=getattr(torch, "__version__", "unknown"),
        cuda_available=torch.cuda.is_available(),
        nvcc_present=shutil.which("nvcc") is not None,
    )
    install_command = build_install_command(missing_dependencies=missing_dependencies)
    return {
        "ready": ready,
        "missing_dependencies": list(missing_dependencies),
        "install_hint": install_hint,
        "install_command": install_command,
        "install_blockers": install_blockers,
    }
54
+
55
+
56
def main() -> int:
    """Print the bootstrap report as sorted, indented JSON and return 0."""
    # NOTE(review): the missing list is hard-coded to mamba_ssm rather than
    # detected from the environment — confirm this is intentional.
    report = build_bootstrap_report(missing_dependencies=["mamba_ssm"])
    rendered = json.dumps(report, indent=2, sort_keys=True)
    print(rendered)
    return 0
60
+
61
+
62
+ if __name__ == "__main__":
63
+ raise SystemExit(main())
overlay/scripts/bootstrap_benchmark_runtime.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import importlib.util
5
+ import json
6
+
7
+
8
# Import name -> pip distribution name for the modules the benchmark runtime needs.
PACKAGE_MAP = dict(
    mamba_ssm="mamba-ssm",
    transformers="transformers",
)
12
+
13
+
14
def detect_missing_modules(required: list[str] | None = None) -> list[str]:
    """Return the subset of ``required`` modules that cannot be located.

    Args:
        required: Import names to probe. ``None`` or an empty list probes every
            key of ``PACKAGE_MAP``.

    Returns:
        The names with no import spec, in input order.
    """
    names = required or list(PACKAGE_MAP)
    missing: list[str] = []
    for name in names:
        try:
            spec = importlib.util.find_spec(name)
        except (ImportError, ValueError):
            # find_spec imports the parent of a dotted name and raises
            # ModuleNotFoundError when that parent is absent; it raises
            # ValueError when an already-imported module has __spec__ = None.
            # Either way the module is not usable, so report it as missing
            # instead of crashing the probe.
            spec = None
        if spec is None:
            missing.append(name)
    return missing
17
+
18
+
19
def build_install_command(*, missing_modules: list[str]) -> list[str]:
    """Return the pip argv that installs the known missing modules.

    Modules without an entry in ``PACKAGE_MAP`` are ignored. When nothing
    installable remains, an empty list is returned.
    """
    packages = []
    for module_name in missing_modules:
        if module_name in PACKAGE_MAP:
            packages.append(PACKAGE_MAP[module_name])
    return ["python", "-m", "pip", "install", *packages] if packages else []
24
+
25
+
26
def build_runtime_report(*, missing_modules: list[str]) -> dict[str, object]:
    """Summarize runtime readiness for the given list of missing modules.

    The report contains a ``ready`` flag, a copy of the missing module names,
    the pip package for each known module, and the install command.
    """
    known_packages = {}
    for module_name in missing_modules:
        if module_name in PACKAGE_MAP:
            known_packages[module_name] = PACKAGE_MAP[module_name]

    report: dict[str, object] = {}
    report["ready"] = len(missing_modules) == 0
    report["missing_modules"] = list(missing_modules)
    report["packages"] = known_packages
    report["install_command"] = build_install_command(missing_modules=missing_modules)
    return report
33
+
34
+
35
def main() -> int:
    """Detect missing runtime modules and print the readiness report as JSON."""
    report = build_runtime_report(missing_modules=detect_missing_modules())
    print(json.dumps(report, indent=2, sort_keys=True))
    return 0
39
+
40
+
41
+ if __name__ == "__main__":
42
+ raise SystemExit(main())
overlay/scripts/cycle_executor.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import importlib.util
6
+ import importlib
7
+ import json
8
+ import os
9
+ import subprocess
10
+ import sys
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ from scripts.benchmark_preflight import build_readiness_report
15
+ from scripts.hf_routing import resolve_routing
16
+
17
+
18
# Directory one level above the directory that contains this script.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Frozen cycle-1 execution config consumed by load_cycle_freeze.
FREEZE_PATH = REPO_ROOT.joinpath("artifacts", "cycle_1_execution_freeze.json")
# Benchmark runner script launched as a subprocess for every run.
RUNNER_PATH = REPO_ROOT.joinpath("scripts", "benchmark_runner.py")
21
+
22
+
23
def active_hf_token() -> str | None:
    """Return the Hugging Face token to use, or ``None`` when none is available.

    A non-empty ``HF_TOKEN`` environment variable wins. Otherwise the token
    known to ``huggingface_hub`` is used; any failure while importing or
    querying the hub library is treated as "no token".
    """
    from_env = os.environ.get("HF_TOKEN")
    if from_env:
        return from_env
    try:
        import huggingface_hub.utils as hub_utils

        return hub_utils.get_token()
    except Exception:
        # Best effort: a missing or broken hub install just means no token.
        return None
32
+
33
+
34
def missing_benchmark_dependencies() -> list[str]:
    """Return which of ``mamba_ssm`` and ``transformers`` cannot be imported.

    A module counts as present when ``find_spec`` locates it. When no spec is
    found (or the lookup raises ``ImportError``/``ValueError``), a real import
    is attempted as a second opinion; only if that also fails is the module
    reported as missing.
    """

    def is_available(module_name: str) -> bool:
        try:
            spec = importlib.util.find_spec(module_name)
        except (ImportError, ValueError):
            spec = None
        if spec is not None:
            return True
        try:
            importlib.import_module(module_name)
        except Exception:
            return False
        return True

    return [name for name in ("mamba_ssm", "transformers") if not is_available(name)]
48
+
49
+
50
def load_cycle_freeze(path: Path) -> dict[str, Any]:
    """Read the frozen cycle configuration from a UTF-8 JSON file."""
    raw_text = path.read_text(encoding="utf-8")
    freeze = json.loads(raw_text)
    return freeze
52
+
53
+
54
def load_cycle_benchmarks(path: Path) -> list[str]:
    """Collect the benchmark names listed in a cycle suite JSON file.

    Sections (``coding_benchmarks``, ``reasoning_benchmarks``) and slots
    (``fast_iteration``, ``milestone``) are visited in that order. Missing
    sections or slots, and entries without a truthy ``name``, are skipped.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    names: list[str] = []
    for section in ("coding_benchmarks", "reasoning_benchmarks"):
        slots = payload.get(section, {})
        for slot in ("fast_iteration", "milestone"):
            entry = slots.get(slot)
            if not isinstance(entry, dict):
                continue
            if entry.get("name"):
                names.append(str(entry["name"]))
    return names
63
+
64
+
65
def build_preflight_report(
    *,
    cache_dir: Path,
    output_repo: str | None = None,
    tokenizer_repo: str | None = None,
) -> dict[str, object]:
    """Build the benchmark readiness report for the current environment.

    Args:
        cache_dir: Local cache directory passed through to the readiness check.
        output_repo: Optional output repository id, passed through unchanged.
        tokenizer_repo: Optional tokenizer repository id, passed through unchanged.

    Returns:
        Whatever ``build_readiness_report`` returns for the gathered facts.
    """
    token_present = bool(active_hf_token())
    # Probe dependencies once so that `dependencies_present` and
    # `missing_dependencies` are always derived from the same snapshot (the
    # probe may attempt real imports, so running it twice is also wasteful).
    missing = missing_benchmark_dependencies()
    return build_readiness_report(
        cache_dir=cache_dir,
        hf_token_present=token_present,
        dependencies_present=not missing,
        missing_dependencies=missing,
        output_repo=output_repo,
        tokenizer_repo=tokenizer_repo,
    )
79
+
80
+
81
def write_preflight_report(path: Path, payload: dict[str, object]) -> None:
    """Write the preflight report to ``path`` as sorted, indented UTF-8 JSON.

    Parent directories are created when they do not exist yet.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(payload, indent=2, sort_keys=True)
    path.write_text(rendered, encoding="utf-8")
84
+
85
+
86
def write_cycle_summary(path: Path, payload: list[dict[str, Any]]) -> None:
    """Write the cycle summary entries to ``path`` as sorted, indented UTF-8 JSON.

    Parent directories are created when they do not exist yet.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(payload, indent=2, sort_keys=True)
    path.write_text(rendered, encoding="utf-8")
89
+
90
+
91
def build_remote_checkpoint_report(output_repo: str, token: str | None) -> dict[str, Any]:
    """List the files of the output model repo and build a checkpoint report from them.

    Both imports are deferred to call time, so neither ``huggingface_hub`` nor
    the report module is needed unless this function is actually invoked.
    """
    from huggingface_hub import HfApi

    from scripts.benchmark_checkpoint_report import build_checkpoint_report

    api = HfApi(token=token)
    repo_files = api.list_repo_files(repo_id=output_repo, repo_type="model", token=token)
    return build_checkpoint_report(repo_files)
98
+
99
+
100
def ensure_benchmark_assets(
    *,
    cache_dir: Path,
    output_repo: str,
    tokenizer_repo: str,
    token: str | None,
    hydrate: bool,
) -> dict[str, str] | None:
    """Hydrate benchmark assets into ``cache_dir`` when requested.

    Returns ``None`` without doing anything when ``hydrate`` is false;
    otherwise returns the result of ``hydrate_benchmark_assets``.
    """
    if hydrate:
        # Deferred import: only needed when hydration is actually requested.
        from scripts.benchmark_assets import hydrate_benchmark_assets

        return hydrate_benchmark_assets(
            cache_dir=cache_dir,
            output_repo=output_repo,
            tokenizer_repo=tokenizer_repo,
            token=token,
        )
    return None
118
+
119
+
120
def build_benchmark_command(
    freeze: dict[str, Any],
    *,
    benchmark: str,
    variant: str,
    seed: int,
    out_dir: Path,
) -> tuple[list[str], dict[str, str]]:
    """Build the runner argv and environment for one (benchmark, variant, seed) run.

    The environment is the current process environment overlaid with the
    variant's ``env`` mapping from the freeze file, plus ``HYDRA_SEED``.
    ``out_dir`` is created as a side effect.

    Raises:
        KeyError: ``variant`` is not present in ``freeze["variants"]``.
    """
    overrides = freeze["variants"][variant].get("env", {})
    env = {
        **os.environ,
        **{str(key): str(value) for key, value in overrides.items()},
        "HYDRA_SEED": str(seed),
    }

    out_dir.mkdir(parents=True, exist_ok=True)
    result_file = out_dir / f"{benchmark.lower()}_{variant}_seed{seed}.json"
    ledger_file = out_dir / "benchmark_ledger.json"

    options = (
        ("--benchmark", benchmark),
        ("--generator-mode", "hydra"),
        ("--out", str(result_file)),
        ("--ledger", str(ledger_file)),
        ("--variant", variant),
        ("--seed", str(seed)),
    )
    cmd = [sys.executable, str(RUNNER_PATH)]
    for flag, value in options:
        cmd += [flag, value]
    return cmd, env
153
+
154
+
155
def build_cycle_plan(freeze: dict[str, Any], *, benchmark: str, out_dir: Path) -> list[dict[str, Any]]:
    """Expand one benchmark into a run plan over all runnable variants and seeds.

    A variant is runnable when its config is a dict whose ``status`` equals
    ``"runnable_now"``. Entries are ordered variant-major, seed-minor, and each
    carries the command and environment produced by ``build_benchmark_command``.
    """
    variants = freeze.get("variants", {})
    runnable = [
        variant_name
        for variant_name, config in variants.items()
        if isinstance(config, dict) and config.get("status") == "runnable_now"
    ]
    seeds = [int(raw_seed) for raw_seed in freeze.get("seeds", [])]

    def plan_entry(variant: str, seed: int) -> dict[str, Any]:
        cmd, env = build_benchmark_command(
            freeze,
            benchmark=benchmark,
            variant=variant,
            seed=seed,
            out_dir=out_dir,
        )
        return {
            "benchmark": benchmark,
            "variant": variant,
            "seed": seed,
            "command": cmd,
            "env": env,
        }

    return [plan_entry(variant, seed) for variant in runnable for seed in seeds]
179
+
180
+
181
def execute_cycle_plan(plan: list[dict[str, Any]], *, repo_root: Path) -> list[dict[str, Any]]:
    """Run every plan entry sequentially and record each process return code.

    Each command runs with ``repo_root`` as working directory and the entry's
    own environment. A non-zero exit does not stop the remaining runs.
    """

    def run_entry(entry: dict[str, Any]) -> dict[str, Any]:
        completed = subprocess.run(entry["command"], cwd=str(repo_root), env=entry["env"])
        outcome = {key: entry[key] for key in ("benchmark", "variant", "seed")}
        outcome["returncode"] = completed.returncode
        return outcome

    return [run_entry(entry) for entry in plan]
194
+
195
+
196
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the cycle executor command line.

    ``--benchmark``, ``--variant`` and ``--seed`` are always required; every
    other option is optional.
    """
    artifacts_dir = REPO_ROOT / "artifacts"
    parser = argparse.ArgumentParser(description="Execute a frozen Cycle 1 benchmark run")

    # Input files.
    parser.add_argument("--freeze", type=Path, default=FREEZE_PATH)
    parser.add_argument("--suite", type=Path, default=artifacts_dir / "benchmark_suite.cycle1.json")

    # Identity of the run.
    parser.add_argument("--benchmark", required=True)
    parser.add_argument("--variant", required=True)
    parser.add_argument("--seed", type=int, required=True)

    # Output locations.
    parser.add_argument("--out-dir", type=Path, default=artifacts_dir / "runs")
    for path_flag in ("--preflight-out", "--summary-out"):
        parser.add_argument(path_flag, type=Path)

    # Behaviour switches.
    for switch in ("--hydrate-assets", "--all-runnable", "--all-benchmarks", "--require-ready"):
        parser.add_argument(switch, action="store_true")

    # Repository overrides.
    for repo_flag in ("--output-repo", "--tokenizer-repo"):
        parser.add_argument(repo_flag)

    return parser.parse_args(argv)
213
+
214
+
215
def main(argv: list[str] | None = None) -> int:
    """Execute a frozen cycle run: optional hydration and preflight, then benchmarks.

    Returns:
        ``3`` when asset hydration raised ``FileNotFoundError``; ``2`` when
        ``--require-ready`` is set and the preflight report is not ready;
        otherwise ``0``/``1`` for ``--all-runnable`` (all runs succeeded or
        not), or the runner's own return code for a single run.
    """
    args = parse_args(argv)
    cache_dir = Path(os.path.expanduser("~/.cache/autoresearch"))
    token = active_hf_token()
    routing = resolve_routing(token=token)
    output_repo = args.output_repo or routing.output_repo
    # NOTE(review): the tokenizer repo falls back to routing.output_repo, not a
    # tokenizer-specific routing field — confirm this is intended.
    tokenizer_repo = args.tokenizer_repo or routing.output_repo

    def checkpoint_candidates() -> dict[str, Any] | None:
        # Best effort: the remote listing only enriches the "blocked" summary.
        try:
            return build_remote_checkpoint_report(output_repo, token)
        except Exception:
            return None

    def preflight() -> dict[str, object]:
        return build_preflight_report(
            cache_dir=cache_dir,
            output_repo=output_repo,
            tokenizer_repo=tokenizer_repo,
        )

    def summarize(entries: list[dict[str, Any]]) -> None:
        if args.summary_out is not None:
            write_cycle_summary(args.summary_out, entries)

    if args.hydrate_assets:
        try:
            ensure_benchmark_assets(
                cache_dir=cache_dir,
                output_repo=output_repo,
                tokenizer_repo=tokenizer_repo,
                token=token,
                hydrate=True,
            )
        except FileNotFoundError as exc:
            summarize([{
                "status": "blocked",
                "reason": "asset_hydration_failed",
                "error": str(exc),
                "checkpoint_candidates": checkpoint_candidates(),
            }])
            return 3

    report = None
    if args.preflight_out is not None:
        report = preflight()
        write_preflight_report(args.preflight_out, report)

    if args.require_ready:
        if report is None:
            report = preflight()
        if not bool(report.get("ready_for_hydra_benchmarks")):
            summarize([{
                "status": "blocked",
                "reason": "preflight_not_ready",
                "preflight": report,
                "checkpoint_candidates": checkpoint_candidates(),
            }])
            return 2

    freeze = load_cycle_freeze(args.freeze)

    if args.all_runnable:
        # --all-benchmarks only widens the benchmark set in --all-runnable mode.
        if args.all_benchmarks:
            benchmarks = load_cycle_benchmarks(args.suite)
        else:
            benchmarks = [args.benchmark]
        plan = []
        for benchmark in benchmarks:
            plan.extend(build_cycle_plan(freeze, benchmark=benchmark, out_dir=args.out_dir))
        results = execute_cycle_plan(plan, repo_root=REPO_ROOT)
        summarize(results)
        every_run_ok = all(item["returncode"] == 0 for item in results)
        return 0 if every_run_ok else 1

    cmd, env = build_benchmark_command(
        freeze,
        benchmark=args.benchmark,
        variant=args.variant,
        seed=args.seed,
        out_dir=args.out_dir,
    )
    proc = subprocess.run(cmd, cwd=str(REPO_ROOT), env=env)
    summarize([{
        "benchmark": args.benchmark,
        "variant": args.variant,
        "seed": args.seed,
        "returncode": proc.returncode,
    }])
    return proc.returncode
309
+
310
+
311
+ if __name__ == "__main__":
312
+ raise SystemExit(main())
overlay/scripts/export_hpo_priors.py CHANGED
@@ -9,6 +9,8 @@ from typing import Any
9
 
10
  import optuna
11
 
 
 
12
 
13
  def parse_args() -> argparse.Namespace:
14
  parser = argparse.ArgumentParser(description="Export top Optuna trials as transfer-learning priors")
@@ -35,38 +37,56 @@ def _serialize_trial(trial: optuna.trial.FrozenTrial) -> dict[str, Any]:
35
  }
36
 
37
 
38
- def main() -> int:
39
- args = parse_args()
40
- study_names = args.study_name or ["hydra_hpo"]
41
- merged_trials: list[dict[str, Any]] = []
42
- total_trials = 0
43
- total_completed = 0
44
-
45
- for study_name in study_names:
46
- study = optuna.load_study(study_name=study_name, storage=args.storage)
47
- ranked = _completed_trials(study)
48
- selected = ranked[: max(0, args.top_k)]
49
- total_trials += len(study.trials)
50
- total_completed += len(ranked)
51
- for t in selected:
52
- row = _serialize_trial(t)
53
- row["study_name"] = study_name
54
- merged_trials.append(row)
55
-
56
- payload = {
57
- "schema_version": 1,
 
 
 
 
 
 
58
  "generated_at": dt.datetime.now(dt.UTC).isoformat(timespec="seconds"),
59
  "study_names": study_names,
60
- "metric": args.metric,
61
- "n_total_trials": total_trials,
62
- "n_completed_trials": total_completed,
63
- "top_k_per_study": args.top_k,
64
- "trials": merged_trials,
 
 
 
65
  }
66
 
 
 
 
 
 
 
67
  args.out.parent.mkdir(parents=True, exist_ok=True)
68
  args.out.write_text(json.dumps(payload, indent=2), encoding="utf-8")
69
- print(f"[hpo-priors] wrote {args.out} with {len(merged_trials)} merged trials")
 
 
 
70
  return 0
71
 
72
 
 
9
 
10
  import optuna
11
 
12
+ from scripts.hpo_leaderboard import build_leaderboard
13
+
14
 
15
  def parse_args() -> argparse.Namespace:
16
  parser = argparse.ArgumentParser(description="Export top Optuna trials as transfer-learning priors")
 
37
  }
38
 
39
 
40
def collect_prior_trials(*, storage: str, study_names: list[str], top_k: int, metric: str) -> dict[str, Any]:
    """Build the priors payload from the clean HPO leaderboard.

    The first ``top_k`` clean trials (in leaderboard order) are exported;
    a negative ``top_k`` exports none. Every contaminated trial is listed
    separately together with its contamination reason.
    """
    leaderboard = build_leaderboard(storage=storage, study_names=study_names, metric=metric)
    exported_keys = ("study_name", "trial_number", "value", "params", "user_attrs")
    quarantine_keys = (*exported_keys, "contamination_reason")

    limit = max(0, top_k)
    trials = [
        {key: row[key] for key in exported_keys}
        for row in leaderboard["clean_trials"][:limit]
    ]
    quarantined = [
        {key: row[key] for key in quarantine_keys}
        for row in leaderboard["contaminated_trials"]
    ]

    studies = leaderboard["studies"]
    total_trials = sum(int(study["n_trials"]) for study in studies)
    completed_trials = sum(int(study["n_completed"]) for study in studies)
    return {
        "schema_version": 2,
        "generated_at": dt.datetime.now(dt.UTC).isoformat(timespec="seconds"),
        "study_names": study_names,
        "metric": metric,
        "n_total_trials": total_trials,
        "n_completed_trials": completed_trials,
        "n_exported_trials": len(trials),
        "n_quarantined_trials": len(quarantined),
        "top_k": top_k,
        "trials": trials,
        "quarantined_trials": quarantined,
    }
77
 
78
+
79
def main() -> int:
    """Export clean top trials as priors JSON and print a one-line summary."""
    args = parse_args()
    # Without any --study-name the default study "hydra_hpo" is used.
    payload = collect_prior_trials(
        storage=args.storage,
        study_names=args.study_name or ["hydra_hpo"],
        top_k=args.top_k,
        metric=args.metric,
    )

    destination = args.out
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(payload, indent=2), encoding="utf-8")

    exported = payload["n_exported_trials"]
    quarantined = payload["n_quarantined_trials"]
    print(f"[hpo-priors] wrote {args.out} with {exported} clean trials ({quarantined} quarantined)")
    return 0
91
 
92
 
overlay/scripts/hpo_component_report.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import datetime as dt
6
+ import json
7
+ import math
8
+ from collections import defaultdict
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from scripts.hpo_leaderboard import build_leaderboard
13
+
14
+
15
+ _COMPONENT_KEYS = [
16
+ "engram_subsample",
17
+ "htm_subsample",
18
+ "htm_learn_every",
19
+ "engram_n_columns",
20
+ "engram_layer_idx",
21
+ "sdr_target_active",
22
+ "mamba3_chunk",
23
+ "dropout",
24
+ "hyena_layers",
25
+ ]
26
+
27
+
28
+ def _recover_params(row: dict[str, Any]) -> dict[str, Any]:
29
+ params = dict(row.get("params") or {})
30
+ attrs = row.get("user_attrs") or {}
31
+ for key, value in attrs.items():
32
+ if key.startswith("param_"):
33
+ params.setdefault(key.removeprefix("param_"), value)
34
+ return params
35
+
36
+
37
+ def _pearson(xs: list[float], ys: list[float]) -> float | None:
38
+ if len(xs) < 2 or len(xs) != len(ys):
39
+ return None
40
+ mean_x = sum(xs) / len(xs)
41
+ mean_y = sum(ys) / len(ys)
42
+ cov = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys))
43
+ var_x = sum((x - mean_x) ** 2 for x in xs)
44
+ var_y = sum((y - mean_y) ** 2 for y in ys)
45
+ if var_x <= 0 or var_y <= 0:
46
+ return None
47
+ return cov / math.sqrt(var_x * var_y)
48
+
49
+
50
def _ablation_sort_key(row: dict[str, Any]) -> tuple[int, float, str]:
    """Order ablation rows: numbers numerically first, everything else by text."""
    value = row["value"]
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return (0, float(value), "")
    return (1, 0.0, str(value))


def build_component_report(*, storage: str, study_names: list[str], metric: str = "val_bpb") -> dict[str, Any]:
    """Build per-component ablation tables and correlations from clean HPO trials.

    For every key in ``_COMPONENT_KEYS`` the clean trials are grouped by the
    parameter's value (mean metric / mean tps per group), and for numeric
    values the Pearson correlation with the metric and with tps is computed.

    Args:
        storage: Optuna storage URL handed to ``build_leaderboard``.
        study_names: Studies to merge.
        metric: Name of the objective metric.

    Returns:
        A JSON-serialisable report dict. ``n_points`` counts the samples behind
        ``pearson_with_metric``; ``n_points_tps`` counts those behind
        ``pearson_with_tps``.
    """
    leaderboard = build_leaderboard(storage=storage, study_names=study_names, metric=metric)
    clean_trials = leaderboard["clean_trials"]
    # Recover params once per trial rather than once per (component, trial) pair.
    trials_with_params = [(row, _recover_params(row)) for row in clean_trials]

    ablations: dict[str, list[dict[str, Any]]] = {}
    numeric_correlations: list[dict[str, Any]] = []

    for key in _COMPONENT_KEYS:
        grouped: dict[str, list[dict[str, Any]]] = defaultdict(list)
        metric_x: list[float] = []
        metric_y: list[float] = []
        tps_x: list[float] = []
        tps_y: list[float] = []
        for row, params in trials_with_params:
            if key not in params:
                continue
            value = params[key]
            score = float(row["value"])
            tps = row.get("tps")
            grouped[str(value)].append({"value": value, "metric": score, "tps": tps})
            if not isinstance(value, (int, float)):
                continue
            # The metric correlation must not depend on tps being recorded;
            # previously trials without tps were dropped from both correlations.
            metric_x.append(float(value))
            metric_y.append(score)
            if isinstance(tps, (int, float)):
                tps_x.append(float(value))
                tps_y.append(float(tps))

        rows: list[dict[str, Any]] = []
        for grouped_rows in grouped.values():
            metric_vals = [r["metric"] for r in grouped_rows]
            tps_vals = [float(r["tps"]) for r in grouped_rows if isinstance(r["tps"], (int, float))]
            rows.append({
                "value": grouped_rows[0]["value"],
                "n_trials": len(grouped_rows),
                "mean_metric": sum(metric_vals) / len(metric_vals),
                "mean_tps": (sum(tps_vals) / len(tps_vals)) if tps_vals else None,
            })
        if rows:
            # Numeric values sort numerically (2 before 10), not as strings.
            rows.sort(key=_ablation_sort_key)
            ablations[key] = rows

        pearson_metric = _pearson(metric_x, metric_y)
        pearson_tps = _pearson(tps_x, tps_y)
        if pearson_metric is not None or pearson_tps is not None:
            numeric_correlations.append({
                "param": key,
                "pearson_with_metric": pearson_metric,
                "pearson_with_tps": pearson_tps,
                "n_points": len(metric_x),
                "n_points_tps": len(tps_x),
            })

    numeric_correlations.sort(key=lambda row: row["param"])
    return {
        "schema_version": 1,
        "generated_at": dt.datetime.now(dt.UTC).isoformat(timespec="seconds"),
        "metric": metric,
        "study_names": study_names,
        "n_clean_trials": len(clean_trials),
        "component_ablations": ablations,
        "numeric_correlations": numeric_correlations,
    }
108
+
109
+
110
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the component report command line.

    ``--study-name`` may be repeated; each occurrence is appended to a list.
    """
    default_out = Path(".tmp") / "optuna" / "component_report.json"
    option_table = (
        ("--storage", {"default": "sqlite:///optuna_hpo.db"}),
        ("--study-name", {"action": "append", "default": []}),
        ("--metric", {"default": "val_bpb"}),
        ("--out", {"type": Path, "default": default_out}),
    )
    parser = argparse.ArgumentParser(description="Build component ablation and correlation report from clean HPO trials")
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser.parse_args(argv)
117
+
118
+
119
def main(argv: list[str] | None = None) -> int:
    """Build the component report, write it to ``--out`` and echo it to stdout."""
    args = parse_args(argv)
    # Without any --study-name the default study "hydra_hpo" is used.
    payload = build_component_report(
        storage=args.storage,
        study_names=args.study_name or ["hydra_hpo"],
        metric=args.metric,
    )
    args.out.parent.mkdir(parents=True, exist_ok=True)
    args.out.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
    print(json.dumps(payload, indent=2, sort_keys=True))
    return 0
127
+
128
+
129
+ if __name__ == "__main__":
130
+ raise SystemExit(main())
overlay/scripts/hpo_leaderboard.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import datetime as dt
6
+ import json
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import optuna
11
+
12
+
13
def _trial_direction(study: optuna.Study) -> str:
    """Return ``"maximize"`` for a maximizing study and ``"minimize"`` otherwise."""
    if study.direction == optuna.study.StudyDirection.MAXIMIZE:
        return "maximize"
    return "minimize"
15
+
16
+
17
+ def _contamination_reason(trial: optuna.trial.FrozenTrial, metric: str) -> str | None:
18
+ if trial.value is None:
19
+ return "missing_value"
20
+ attrs = trial.user_attrs
21
+ source = attrs.get("objective_source")
22
+ eval_status = attrs.get("eval_status")
23
+ objective_metric = attrs.get("objective_metric")
24
+
25
+ if source in {"train_log_fallback", "missing_metric", "missing_metrics", "missing_final_val"}:
26
+ return f"objective_source={source}"
27
+ if eval_status not in {None, "completed"}:
28
+ return f"eval_status={eval_status}"
29
+ if objective_metric not in {None, metric}:
30
+ return f"objective_metric={objective_metric}"
31
+ return None
32
+
33
+
34
def _serialize_trial(study_name: str, trial: optuna.trial.FrozenTrial, metric: str) -> dict[str, Any]:
    """Flatten a trial into a JSON-friendly leaderboard row.

    Trials without an ``objective_source`` user attr are labelled
    ``"legacy_completed_value"``. A ``contamination_reason`` key is added only
    when the trial is contaminated.
    """
    user_attrs = dict(trial.user_attrs)
    value = None if trial.value is None else float(trial.value)
    row = {
        "study_name": study_name,
        "trial_number": trial.number,
        "value": value,
        "metric": metric,
        "objective_source": user_attrs.get("objective_source") or "legacy_completed_value",
        "objective_metric": user_attrs.get("objective_metric", metric),
        "eval_status": user_attrs.get("eval_status"),
        "hf_job_id": user_attrs.get("hf_job_id"),
        "tps": user_attrs.get("tps"),
        "params": dict(trial.params),
        "user_attrs": user_attrs,
    }
    if (reason := _contamination_reason(trial, metric)) is not None:
        row["contamination_reason"] = reason
    return row
54
+
55
+
56
+ def _is_pareto_dominated(candidate: dict[str, Any], peers: list[dict[str, Any]]) -> bool:
57
+ candidate_value = float(candidate["value"])
58
+ candidate_tps = float(candidate["tps"])
59
+ for peer in peers:
60
+ if peer is candidate or peer.get("tps") is None:
61
+ continue
62
+ peer_value = float(peer["value"])
63
+ peer_tps = float(peer["tps"])
64
+ no_worse = peer_value <= candidate_value and peer_tps >= candidate_tps
65
+ strictly_better = peer_value < candidate_value or peer_tps > candidate_tps
66
+ if no_worse and strictly_better:
67
+ return True
68
+ return False
69
+
70
+
71
+ def _annotate_pareto(clean_trials: list[dict[str, Any]]) -> list[dict[str, Any]]:
72
+ pareto_trials: list[dict[str, Any]] = []
73
+ comparable = [row for row in clean_trials if row.get("tps") is not None]
74
+ for row in clean_trials:
75
+ if row.get("tps") is None:
76
+ row["pareto_frontier"] = False
77
+ row["pareto_dominated"] = None
78
+ row["pareto_reason"] = "missing_tps"
79
+ continue
80
+ dominated = _is_pareto_dominated(row, comparable)
81
+ row["pareto_frontier"] = not dominated
82
+ row["pareto_dominated"] = dominated
83
+ row["pareto_reason"] = "frontier" if not dominated else "dominated"
84
+ if not dominated:
85
+ pareto_trials.append(row)
86
+ pareto_trials.sort(key=lambda row: (float(row["value"]), -float(row["tps"])))
87
+ return pareto_trials
88
+
89
+
90
+ def build_leaderboard(*, storage: str, study_names: list[str], metric: str = "val_bpb") -> dict[str, Any]:
91
+ clean_trials: list[dict[str, Any]] = []
92
+ contaminated_trials: list[dict[str, Any]] = []
93
+ study_summaries: list[dict[str, Any]] = []
94
+ direction = "minimize"
95
+
96
+ for study_name in study_names:
97
+ study = optuna.load_study(study_name=study_name, storage=storage)
98
+ direction = _trial_direction(study)
99
+ completed = [t for t in study.trials if t.value is not None]
100
+ study_summaries.append({
101
+ "study_name": study_name,
102
+ "direction": direction,
103
+ "n_trials": len(study.trials),
104
+ "n_completed": len(completed),
105
+ })
106
+ for trial in completed:
107
+ row = _serialize_trial(study_name, trial, metric)
108
+ if "contamination_reason" in row:
109
+ contaminated_trials.append(row)
110
+ else:
111
+ clean_trials.append(row)
112
+
113
+ reverse = direction == "maximize"
114
+ clean_trials.sort(key=lambda row: float(row["value"]), reverse=reverse)
115
+ contaminated_trials.sort(key=lambda row: float(row["value"]), reverse=reverse)
116
+ pareto_trials = _annotate_pareto(clean_trials)
117
+
118
+ return {
119
+ "schema_version": 1,
120
+ "generated_at": dt.datetime.now(dt.UTC).isoformat(timespec="seconds"),
121
+ "metric": metric,
122
+ "direction": direction,
123
+ "study_names": study_names,
124
+ "studies": study_summaries,
125
+ "n_clean_trials": len(clean_trials),
126
+ "n_contaminated_trials": len(contaminated_trials),
127
+ "pareto_metric_x": metric,
128
+ "pareto_metric_y": "tps",
129
+ "n_pareto_trials": len(pareto_trials),
130
+ "clean_trials": clean_trials,
131
+ "contaminated_trials": contaminated_trials,
132
+ "pareto_trials": pareto_trials,
133
+ }
134
+
135
+
136
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the leaderboard command line.

    ``--study-name`` may be repeated; each occurrence is appended to a list.
    """
    default_out = Path(".tmp") / "optuna" / "leaderboard.json"
    option_table = (
        ("--storage", {"default": "sqlite:///optuna_hpo.db"}),
        ("--study-name", {"action": "append", "default": [], "help": "Repeat to merge multiple studies"}),
        ("--metric", {"default": "val_bpb"}),
        ("--out", {"type": Path, "default": default_out}),
    )
    parser = argparse.ArgumentParser(description="Build a clean Optuna HPO leaderboard")
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser.parse_args(argv)
143
+
144
+
145
def main(argv: list[str] | None = None) -> int:
    """Build the leaderboard, write it to ``--out`` and echo it to stdout."""
    args = parse_args(argv)
    # Without any --study-name the default study "hydra_hpo" is used.
    payload = build_leaderboard(
        storage=args.storage,
        study_names=args.study_name or ["hydra_hpo"],
        metric=args.metric,
    )
    args.out.parent.mkdir(parents=True, exist_ok=True)
    args.out.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
    print(json.dumps(payload, indent=2, sort_keys=True))
    return 0
153
+
154
+
155
+ if __name__ == "__main__":
156
+ raise SystemExit(main())
overlay/scripts/hpo_retest.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import datetime as dt
6
+ import json
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import optuna
11
+
12
+ from scripts.hpo_leaderboard import build_leaderboard
13
+
14
+
15
# Maps a hyperparameter name to the environment variable that carries it.
# Each variable is the upper-cased parameter name behind a ``HYDRA_`` prefix,
# so the table is derived from the ordered list of supported parameters.
_PARAM_TO_ENV = {
    name: f"HYDRA_{name.upper()}"
    for name in (
        "d_model",
        "n_layer",
        "d_state",
        "headdim",
        "expand",
        "seq_len",
        "batch_size",
        "matrix_lr",
        "embed_lr",
        "unembed_lr",
        "engram_n_columns",
        "engram_layer_idx",
        "sdr_target_active",
        "htm_learn_every",
        "htm_subsample",
        "engram_subsample",
        "mamba3_chunk",
        "dropout",
    )
}
35
+
36
# Baseline environment copied into every retest candidate by _candidate_env
# before the per-trial parameter overrides are applied. All values are strings.
_DEFAULT_ENV = dict(
    HYDRA_USE_NEMOTRON="1",
    HYDRA_LOCAL_SHARDS_ONLY="0",
    HYDRA_THROUGHPUT_MODE="0",
    HYDRA_FASTPATH="0",
    HYDRA_FORCE_HTM_CPU="0",
    HYDRA_INERT_MAMBA="0",
    HYDRA_ALLOW_SYNTHETIC_RETINA="0",
    HYDRA_HTM_FUSED="1",
    HYDRA_HYENA_LAYERS="",
    HYDRA_CKPT_INTERVAL="0",
    HYDRA_ENGRAM_SUBSAMPLE="1",
    HYDRA_HTM_SUBSAMPLE="2",
    HYDRA_HTM_LEARN_EVERY="8",
)
51
+
52
+
53
def _recover_params(row: dict[str, Any]) -> dict[str, Any]:
    """Collect a trial row's hyperparameters, back-filling from user attrs.

    Values under ``row["params"]`` take precedence. Any ``user_attrs`` entry
    whose key starts with ``param_`` supplies a parameter (prefix stripped)
    only when ``params`` does not already define it.

    Args:
        row: Leaderboard row; ``params`` and ``user_attrs`` may be missing
            or ``None``.

    Returns:
        A new dict of parameter name to value; ``row`` is not modified.
    """
    prefix = "param_"
    merged = dict(row.get("params") or {})
    extras = row.get("user_attrs") or {}
    for attr_name in extras:
        if not attr_name.startswith(prefix):
            continue
        param_name = attr_name.removeprefix(prefix)
        if param_name not in merged:
            merged[param_name] = extras[attr_name]
    return merged
60
+
61
+
62
def _candidate_env(params: dict[str, Any], *, eval_tokens: int, eval_batch: int, time_budget: int) -> dict[str, str]:
    """Build the environment-variable mapping for one retest candidate.

    The mapping starts from ``_DEFAULT_ENV``, adds the evaluation and time
    budget settings, then overlays every parameter that has an entry in
    ``_PARAM_TO_ENV`` (parameters without an entry are ignored). When both a
    batch size and a sequence length end up in the mapping,
    ``HYDRA_TOTAL_BATCH`` is set to batch size * sequence length * gradient
    accumulation, with ``grad_accum`` defaulting to 16.

    Args:
        params: Hyperparameters of the candidate trial.
        eval_tokens: Value stored under ``HYDRA_EVAL_TOKENS``.
        eval_batch: Value stored under ``HYDRA_EVAL_BATCH``.
        time_budget: Value stored under ``HYDRA_TIME_BUDGET``.

    Returns:
        A new dict whose values are all strings.
    """
    overrides = {
        _PARAM_TO_ENV[name]: str(value)
        for name, value in params.items()
        if name in _PARAM_TO_ENV
    }
    env = {
        **_DEFAULT_ENV,
        "HYDRA_EVAL_TOKENS": str(eval_tokens),
        "HYDRA_EVAL_BATCH": str(eval_batch),
        "HYDRA_TIME_BUDGET": str(time_budget),
        **overrides,
    }

    has_batch_shape = "HYDRA_BATCH_SIZE" in env and "HYDRA_SEQ_LEN" in env
    if not has_batch_shape:
        return env

    accumulation = int(params.get("grad_accum", 16))
    per_step = int(env["HYDRA_BATCH_SIZE"]) * int(env["HYDRA_SEQ_LEN"])
    env["HYDRA_TOTAL_BATCH"] = str(per_step * accumulation)
    return env
75
+
76
+
77
def build_retest_plan(
    *,
    storage: str,
    study_names: list[str],
    top_k: int,
    metric: str = "val_bpb",
    eval_tokens: int = 16384,
    eval_batch: int = 2,
    time_budget: int = 420,
) -> dict[str, Any]:
    """Select the best historical trials and describe how to re-run each one.

    Contaminated and clean leaderboard rows are pooled, ordered by ``value``
    (descending when the leaderboard direction is ``maximize``, ascending
    otherwise) and the first ``top_k`` are turned into candidates carrying
    their recovered parameters and a ready-to-use environment mapping.

    Args:
        storage: Optuna storage URL forwarded to ``build_leaderboard``.
        study_names: Studies to merge.
        top_k: Number of candidates to keep; negative values behave like 0.
        metric: Metric name forwarded to ``build_leaderboard``.
        eval_tokens: Evaluation token count written into each candidate env.
        eval_batch: Evaluation batch size written into each candidate env.
        time_budget: Time budget written into each candidate env.

    Returns:
        JSON-serializable plan with the request settings, ``n_candidates``
        and the ``candidates`` list.
    """
    board = build_leaderboard(storage=storage, study_names=study_names, metric=metric)
    pooled = [*board["contaminated_trials"], *board["clean_trials"]]
    descending = board["direction"] == "maximize"
    ranked = sorted(pooled, key=lambda entry: float(entry["value"]), reverse=descending)

    def _describe(entry: dict[str, Any]) -> dict[str, Any]:
        # One plan entry: provenance of the source trial plus the retest env.
        recovered = _recover_params(entry)
        retest_env = _candidate_env(
            recovered,
            eval_tokens=eval_tokens,
            eval_batch=eval_batch,
            time_budget=time_budget,
        )
        why = entry.get("contamination_reason") or "canonical_truth_eval_retest"
        return {
            "study_name": entry["study_name"],
            "trial_number": entry["trial_number"],
            "source_value": entry["value"],
            "source_objective": entry["objective_source"],
            "source_job_id": entry.get("hf_job_id"),
            "needs_retest_reason": why,
            "params": recovered,
            "env": retest_env,
        }

    limit = max(0, top_k)
    candidates = [_describe(entry) for entry in ranked[:limit]]

    return {
        "schema_version": 1,
        "generated_at": dt.datetime.now(dt.UTC).isoformat(timespec="seconds"),
        "metric": metric,
        "study_names": study_names,
        "eval_tokens": eval_tokens,
        "eval_batch": eval_batch,
        "time_budget": time_budget,
        "n_candidates": len(candidates),
        "candidates": candidates,
    }
117
+
118
+
119
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the retest planner.

    Args:
        argv: Arguments to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Namespace exposing ``storage``, ``study_name`` (list), ``metric``,
        the integer options ``top_k``, ``eval_tokens``, ``eval_batch`` and
        ``time_budget``, and ``out`` (a ``Path``).
    """
    cli = argparse.ArgumentParser(description="Plan canonical-eval retests for historical HPO configs")
    cli.add_argument("--storage", default="sqlite:///optuna_hpo.db")
    cli.add_argument("--study-name", action="append", default=[])
    cli.add_argument("--metric", default="val_bpb")

    # The integer knobs share one shape, so they are registered from a table.
    integer_options = (
        ("--top-k", 10),
        ("--eval-tokens", 16384),
        ("--eval-batch", 2),
        ("--time-budget", 420),
    )
    for flag, fallback in integer_options:
        cli.add_argument(flag, type=int, default=fallback)

    cli.add_argument("--out", type=Path, default=Path(".tmp") / "optuna" / "retest_plan.json")
    return cli.parse_args(argv)
130
+
131
+
132
def main(argv: list[str] | None = None) -> int:
    """Build the retest plan, write it to ``--out`` and echo it to stdout.

    When no ``--study-name`` is supplied the single study ``hydra_hpo`` is
    used. The plan is serialized as indented JSON with sorted keys.

    Args:
        argv: Command-line arguments forwarded to ``parse_args``.

    Returns:
        Always ``0``.
    """
    options = parse_args(argv)
    names = options.study_name if options.study_name else ["hydra_hpo"]
    plan = build_retest_plan(
        storage=options.storage,
        study_names=names,
        top_k=options.top_k,
        metric=options.metric,
        eval_tokens=options.eval_tokens,
        eval_batch=options.eval_batch,
        time_budget=options.time_budget,
    )

    destination = options.out
    destination.parent.mkdir(parents=True, exist_ok=True)
    # Serialize once; the file and stdout receive the same text.
    rendered = json.dumps(plan, indent=2, sort_keys=True)
    destination.write_text(rendered, encoding="utf-8")
    print(rendered)
    return 0
148
+
149
+
150
+ if __name__ == "__main__":
151
+ raise SystemExit(main())
overlay/scripts/hydra_generation.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ from pathlib import Path
6
+ from typing import Callable
7
+
8
+ import torch
9
+
10
+ from scripts.benchmark_checkpoint import hydrate_checkpoint
11
+ from scripts.hf_routing import resolve_routing
12
+
13
+
14
def default_checkpoint_path() -> Path:
    """Return the fallback checkpoint location, ``~/.cache/autoresearch/latest.pt``.

    The leading ``~`` is expanded to the current user's home directory; the
    file is not checked for existence.
    """
    unexpanded = "~/.cache/autoresearch/latest.pt"
    expanded = os.path.expanduser(unexpanded)
    return Path(expanded)
16
+
17
+
18
def checkpoint_candidates(*, cache_dir: Path | None = None) -> list[Path]:
    """List the checkpoint files to probe, in priority order.

    Args:
        cache_dir: Directory holding the checkpoints. When falsy, the
            expanded ``~/.cache/autoresearch`` directory is used instead.

    Returns:
        Paths for ``best_bpb.pt``, ``pretrain_final.pt`` and ``latest.pt``,
        in that order, under the chosen directory. Existence is not checked.
    """
    root = cache_dir or Path(os.path.expanduser("~/.cache/autoresearch"))
    filenames = ("best_bpb.pt", "pretrain_final.pt", "latest.pt")
    return [root / filename for filename in filenames]
25
+
26
+
27
def resolve_checkpoint_path(explicit_path: Path | None, *, cache_dir: Path | None = None) -> Path:
    """Choose which checkpoint file to load.

    Args:
        explicit_path: Caller-supplied path. When not ``None`` it is returned
            unchanged, without checking that it exists.
        cache_dir: Directory forwarded to ``checkpoint_candidates``.

    Returns:
        ``explicit_path`` if given; otherwise the first candidate that exists
        on disk; otherwise ``default_checkpoint_path()``.
    """
    if explicit_path is not None:
        return explicit_path

    existing = (path for path in checkpoint_candidates(cache_dir=cache_dir) if path.exists())
    first_hit = next(existing, None)
    return default_checkpoint_path() if first_hit is None else first_hit
34
+
35
+
36
def validate_checkpoint_compatibility(
    *,
    baseline_arch: str,
    missing_keys: list[str],
    unexpected_keys: list[str],
    total_model_keys: int,
) -> None:
    """Reject a checkpoint whose state dict does not fit the built model.

    Two rules apply, in order:

    * With ``baseline_arch == "transformer"`` any missing or unexpected key
      is an error.
    * Otherwise, when ``total_model_keys`` is positive, the number of
      mismatched keys may not exceed ``max(8, total_model_keys // 2)``.

    Args:
        baseline_arch: Name of the requested baseline architecture.
        missing_keys: Model keys the checkpoint did not provide.
        unexpected_keys: Checkpoint keys the model does not have.
        total_model_keys: Number of entries in the model's state dict.

    Raises:
        RuntimeError: If either rule is violated.
    """
    any_mismatch = bool(missing_keys or unexpected_keys)
    if any_mismatch and baseline_arch == "transformer":
        raise RuntimeError(
            "checkpoint incompatible with transformer baseline architecture; "
            "use a transformer-trained checkpoint or keep HYDRA_BASELINE_ARCH=mamba3"
        )

    mismatched = len(missing_keys) + len(unexpected_keys)
    if total_model_keys <= 0:
        return
    tolerated = max(8, total_model_keys // 2)
    if mismatched > tolerated:
        raise RuntimeError("checkpoint incompatible with requested model architecture")
51
+
52
+
53
def generate_from_callable(
    generator: Callable[[str], str] | Callable[..., str],
    prompt: str,
    *,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
) -> str:
    """Run ``generator`` on ``prompt`` and return its output as stripped text.

    Args:
        generator: Callable invoked as ``generator(prompt, max_new_tokens=...,
            temperature=..., top_p=...)``.
        prompt: Text passed as the single positional argument.
        max_new_tokens: Forwarded to ``generator`` as a keyword.
        temperature: Forwarded to ``generator`` as a keyword.
        top_p: Forwarded to ``generator`` as a keyword.

    Returns:
        ``str()`` of the generator's result with surrounding whitespace
        removed.
    """
    sampling_options = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_p": top_p,
    }
    raw_output = generator(prompt, **sampling_options)
    return str(raw_output).strip()
68
+
69
+
70
def load_hydra_causal_lm(checkpoint_path: Path | None = None, device: str | None = None):
    """Load a Hydra checkpoint and wrap the model for Hugging Face ``generate``.

    Args:
        checkpoint_path: Explicit checkpoint file; ``None`` lets
            ``resolve_checkpoint_path`` pick one.
        device: Target device string. When falsy, ``"cuda"`` is used if
            ``torch.cuda.is_available()`` and ``"cpu"`` otherwise.

    Returns:
        The tuple ``(tokenizer, model, bos, resolved_device,
        GenerationConfig)``, where ``model`` is the generation wrapper in
        eval mode and ``GenerationConfig`` is the transformers class itself.

    Raises:
        FileNotFoundError: If no checkpoint exists at the resolved path, even
            after the hydration attempt.
        RuntimeError: Propagated from ``validate_checkpoint_compatibility``
            when the state dict does not fit the model.
    """
    ckpt_path = resolve_checkpoint_path(checkpoint_path)
    if not ckpt_path.exists():
        # Nothing on disk at the resolved path: ask hydrate_checkpoint for one.
        # NOTE(review): presumably this fetches from the routed output repo --
        # confirm against scripts.benchmark_checkpoint.
        hydrated = hydrate_checkpoint(
            cache_dir=ckpt_path.parent,
            output_repo=resolve_routing(token=os.environ.get("HF_TOKEN")).output_repo,
            token=os.environ.get("HF_TOKEN"),
        )
        if hydrated is not None:
            ckpt_path = hydrated
    if not ckpt_path.exists():
        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")

    # Function-scope imports: these packages are only needed once a model is
    # actually being loaded.
    from transformers import GenerationConfig, GenerationMixin, PretrainedConfig, PreTrainedModel
    from transformers.modeling_outputs import CausalLMOutputWithPast

    from hydra.config import PostSemClawConfig
    from hydra.model import PostSemClawModel
    from prepare import Tokenizer

    resolved_device = device or ("cuda" if torch.cuda.is_available() else "cpu")

    class _HydraGenConfig(PretrainedConfig):
        # Minimal HF config that carries only the vocabulary size.
        model_type = "hydra"

        def __init__(self, vocab_size: int = 65536, **kw):
            super().__init__(**kw)
            self.vocab_size = vocab_size

    class HydraForCausalLM(PreTrainedModel, GenerationMixin):
        # Adapter exposing the inner Hydra model through the HF generation API.
        config_class = _HydraGenConfig

        def __init__(self, gen_config, inner_model):
            super().__init__(gen_config)
            self.inner = inner_model
            self.config.vocab_size = gen_config.vocab_size

        def forward(self, input_ids, attention_mask=None, **kw):
            # attention_mask and extra kwargs are accepted but ignored; only
            # input_ids reach the inner model.
            logits = self.inner(input_ids)
            return CausalLMOutputWithPast(loss=None, logits=logits, past_key_values=None)

        def prepare_inputs_for_generation(self, input_ids, **kw):
            # Only the token ids are forwarded each step; no cache state is
            # passed along.
            return {"input_ids": input_ids}

        def get_input_embeddings(self):
            return self.inner.wte

        def can_generate(self) -> bool:
            return True

        @property
        def _supports_cache_class(self):
            return False

    tokenizer = Tokenizer.from_directory()
    vocab_size = tokenizer.get_vocab_size()
    bos = tokenizer.get_bos_token_id()
    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # objects; only load checkpoints from trusted sources.
    ckpt = torch.load(str(ckpt_path), map_location="cpu", weights_only=False)
    cfg = PostSemClawConfig(**ckpt["config"])
    # Construct the module on the meta device, then allocate uninitialized
    # storage on the target device; load_state_dict below fills the values.
    with torch.device("meta"):
        inner = PostSemClawModel(cfg)
    inner.to_empty(device=resolved_device)
    # NOTE(review): with strict=False, tensors absent from the checkpoint keep
    # their uninitialized storage; the compatibility check that follows only
    # rejects sufficiently large mismatches.
    missing, unexpected = inner.load_state_dict(ckpt["model_state_dict"], strict=False)
    validate_checkpoint_compatibility(
        baseline_arch=os.environ.get("HYDRA_BASELINE_ARCH", "mamba3").strip().lower(),
        missing_keys=list(missing),
        unexpected_keys=list(unexpected),
        total_model_keys=len(inner.state_dict()),
    )
    inner.eval()

    gen_cfg = _HydraGenConfig(vocab_size=vocab_size)
    # The BOS id doubles as the EOS and PAD id.
    gen_cfg.bos_token_id = bos
    gen_cfg.eos_token_id = bos
    gen_cfg.pad_token_id = bos
    model = HydraForCausalLM(gen_cfg, inner).to(resolved_device)
    model.eval()
    return tokenizer, model, bos, resolved_device, GenerationConfig
148
+
149
+
150
def build_hydra_generator(
    *,
    checkpoint_path: Path | None = None,
    device: str | None = None,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
):
    """Load the model once and return a ``prompt -> text`` generation function.

    Args:
        checkpoint_path: Forwarded to ``load_hydra_causal_lm``.
        device: Forwarded to ``load_hydra_causal_lm``.
        max_new_tokens: Generation length limit used for every call.
        temperature: Sampling temperature; sampling is enabled only when it
            is greater than ``0.0``.
        top_p: Nucleus-sampling threshold passed to the generation config.

    Returns:
        A function taking a prompt string and returning the decoded output
        sequence produced by ``model.generate``.
    """
    tok, lm, bos_id, run_device, generation_config_cls = load_hydra_causal_lm(
        checkpoint_path=checkpoint_path,
        device=device,
    )

    def _generate(prompt: str) -> str:
        prompt_ids = torch.tensor([tok.encode(prompt)], dtype=torch.long, device=run_device)
        settings = generation_config_cls(
            max_new_tokens=max_new_tokens,
            use_cache=False,
            do_sample=temperature > 0.0,
            temperature=temperature,
            top_p=top_p,
            bos_token_id=bos_id,
            eos_token_id=bos_id,
            pad_token_id=bos_id,
        )
        with torch.no_grad():
            if str(run_device).startswith("cuda"):
                # bfloat16 autocast is applied only on CUDA devices.
                with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
                    generated = lm.generate(prompt_ids, generation_config=settings)
            else:
                generated = lm.generate(prompt_ids, generation_config=settings)
        return tok.decode(generated[0].tolist())

    return _generate
overlay/scripts/launch_benchmark_hf_job.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ import os
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ REPO_ROOT = Path(__file__).resolve().parents[1]
11
+ if str(REPO_ROOT) not in sys.path:
12
+ sys.path.insert(0, str(REPO_ROOT))
13
+
14
+ from huggingface_hub import HfApi
15
+ from huggingface_hub.utils import get_token
16
+
17
+ from scripts.hf_routing import resolve_routing
18
+ from scripts.launch_feather_hf_job import IMAGE_DIR, sync_overlay_from_repo, wait_for_space
19
+
20
+
21
def build_benchmark_job_env(
    *,
    benchmark: str,
    variant: str,
    seed: int,
    output_repo: str,
    tokenizer_repo: str,
) -> dict[str, str]:
    """Assemble the environment variables for a remote benchmark job.

    The explicit settings below always win. Every ``HYDRA_``-prefixed
    variable of the current process that is not already set here is copied
    through unchanged.

    Args:
        benchmark: Stored under ``HYDRA_BENCHMARK_NAME``.
        variant: Stored under ``HYDRA_BENCHMARK_VARIANT``.
        seed: Stored, as a string, under ``HYDRA_SEED``.
        output_repo: Stored under ``FEATHER_HF_OUTPUT_REPO``.
        tokenizer_repo: Stored under ``HYDRA_TOKENIZER_CACHE_REPO``.

    Returns:
        A new dict of environment variable names to string values.
    """
    job_env = {
        "FEATHER_HF_OUTPUT_REPO": output_repo,
        "FEATHER_RUNTIME_MODE": "benchmark",
        "HYDRA_TOKENIZER_CACHE_REPO": tokenizer_repo,
        "HYDRA_BENCHMARK_NAME": benchmark,
        "HYDRA_BENCHMARK_VARIANT": variant,
        "HYDRA_SEED": str(seed),
        "PYTHONUNBUFFERED": "1",
    }
    inherited = {
        name: value
        for name, value in os.environ.items()
        if name.startswith("HYDRA_") and name not in job_env
    }
    job_env.update(inherited)
    return job_env
42
+
43
+
44
def build_benchmark_job_command(*, benchmark: str, variant: str, seed: int) -> list[str]:
    """Return the command line executed inside the benchmark job container.

    The keyword arguments are accepted but not used by the body; the command
    is always the interpreter followed by ``/app/entrypoint.py``.

    Returns:
        A new two-element argument list.
    """
    interpreter = "python"
    entrypoint = "/app/entrypoint.py"
    return [interpreter, entrypoint]
49
+
50
+
51
def submit_benchmark_job(
    *,
    api,
    image: str,
    command: list[str],
    env: dict[str, str],
    token: str,
    namespace: str,
    flavor: str,
    timeout: str,
) -> dict[str, str]:
    """Submit the job through ``api.run_job`` and summarize the result.

    The token is passed twice: as the ``token`` argument of the call and as
    the job's ``HF_TOKEN`` secret.

    Args:
        api: Object exposing ``run_job`` (an ``HfApi`` in ``main``).
        image: Container image reference for the job.
        command: Command line to run in the container.
        env: Environment variables for the job.
        token: Hugging Face token.
        namespace: Namespace the job is submitted under.
        flavor: Hardware flavor name.
        timeout: Job timeout, passed through as given.

    Returns:
        Dict with ``job_id``, ``job_url`` and ``job_stage`` (the string form
        of ``job.status.stage``).
    """
    request = dict(
        image=image,
        command=command,
        env=env,
        secrets={"HF_TOKEN": token},
        flavor=flavor,
        timeout=timeout,
        namespace=namespace,
        token=token,
    )
    submitted = api.run_job(**request)
    stage_text = str(submitted.status.stage)
    return {
        "job_id": submitted.id,
        "job_url": submitted.url,
        "job_stage": stage_text,
    }
77
+
78
+
79
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the benchmark job launcher.

    Repository, image and namespace defaults come from ``resolve_routing``,
    which is called with the ``HF_TOKEN`` environment variable (possibly
    ``None``) before any argument is parsed.

    Args:
        argv: Arguments to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Namespace with the required ``benchmark``, ``variant`` and ``seed``,
        the routing-derived options, ``flavor``, ``timeout``, the optional
        ``summary_out`` path and the boolean switches ``dry_run``,
        ``refresh_image`` and ``sync_overlay``.
    """
    routing = resolve_routing(token=os.environ.get("HF_TOKEN"))
    cli = argparse.ArgumentParser(description="Prepare or submit a remote HF benchmark job")

    for required_flag in ("--benchmark", "--variant"):
        cli.add_argument(required_flag, required=True)
    cli.add_argument("--seed", type=int, required=True)

    string_options = (
        ("--output-repo", routing.output_repo),
        ("--tokenizer-repo", routing.output_repo),
        ("--image", f"hf.co/spaces/{routing.space_repo}"),
        ("--namespace", routing.job_namespace),
        ("--flavor", "a10g-small"),
        ("--timeout", "30m"),
    )
    for flag, fallback in string_options:
        cli.add_argument(flag, default=fallback)

    cli.add_argument("--summary-out", type=Path)
    for switch in ("--dry-run", "--refresh-image", "--sync-overlay"):
        cli.add_argument(switch, action="store_true")

    return cli.parse_args(argv)
96
+
97
+
98
def main(argv: list[str] | None = None) -> int:
    """Prepare a benchmark job and, unless ``--dry-run`` is set, submit it.

    Steps:

    1. Build the job environment, command and a summary payload.
    2. Without ``--dry-run``: obtain a token (``HF_TOKEN`` or the cached
       login), optionally refresh the Space image (``--refresh-image``,
       with ``--sync-overlay`` honored only inside that step), submit the
       job and merge its id/url/stage into the payload.
    3. Write the payload to ``--summary-out`` when given and print it.

    Args:
        argv: Command-line arguments forwarded to ``parse_args``.

    Returns:
        Always ``0``.

    Raises:
        SystemExit: If a submission is requested but no token is available.
    """
    opts = parse_args(argv)
    job_env = build_benchmark_job_env(
        benchmark=opts.benchmark,
        variant=opts.variant,
        seed=opts.seed,
        output_repo=opts.output_repo,
        tokenizer_repo=opts.tokenizer_repo,
    )
    job_command = build_benchmark_job_command(
        benchmark=opts.benchmark,
        variant=opts.variant,
        seed=opts.seed,
    )
    summary = {
        "benchmark": opts.benchmark,
        "variant": opts.variant,
        "seed": opts.seed,
        "output_repo": opts.output_repo,
        "tokenizer_repo": opts.tokenizer_repo,
        "image": opts.image,
        "namespace": opts.namespace,
        "command": job_command,
        "env": job_env,
        "dry_run": opts.dry_run,
    }

    if not opts.dry_run:
        hf_token = os.environ.get("HF_TOKEN") or get_token()
        if not hf_token:
            raise SystemExit("HF_TOKEN must be set or cached via huggingface-cli login")
        hub = HfApi(token=hf_token)

        if opts.refresh_image:
            # The Space repo id is the image reference without its registry prefix.
            space_repo = opts.image.removeprefix("hf.co/spaces/")
            if opts.sync_overlay:
                sync_overlay_from_repo()
            hub.upload_folder(
                repo_id=space_repo,
                repo_type="space",
                folder_path=str(IMAGE_DIR),
                commit_message="Update benchmark runtime image",
                token=hf_token,
            )
            wait_for_space(hub, space_repo, token=hf_token)

        submission = submit_benchmark_job(
            api=hub,
            image=opts.image,
            command=job_command,
            env=job_env,
            token=hf_token,
            namespace=opts.namespace,
            flavor=opts.flavor,
            timeout=opts.timeout,
        )
        summary.update(submission)

    summary_path = opts.summary_out
    if summary_path is not None:
        summary_path.parent.mkdir(parents=True, exist_ok=True)
    # Serialize once; the optional file and stdout receive the same text.
    rendered = json.dumps(summary, indent=2, sort_keys=True)
    if summary_path is not None:
        summary_path.write_text(rendered, encoding="utf-8")
    print(rendered)
    return 0
154
+
155
+
156
+ if __name__ == "__main__":
157
+ raise SystemExit(main())
overlay/scripts/optuna_hpo.py CHANGED
@@ -108,6 +108,28 @@ def _enqueue_transfer_priors(study: optuna.Study, priors_file: Path, apply_prior
108
  if after > before:
109
  enqueued += 1
110
  return enqueued
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
 
113
  def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any] | None:
@@ -143,15 +165,99 @@ def _parse_metrics_from_log_lines(lines: list[str]) -> dict[str, Any] | None:
143
 
144
 
145
  def _parse_last_train_bpb_from_logs(lines: list[str]) -> float | None:
146
- """Best-effort fallback when final eval crashes before metrics JSON write."""
147
- last: float | None = None
148
- for line in lines:
149
- m = re.search(r"\bbpb=([0-9]+(?:\.[0-9]+)?)", line)
150
  if m:
151
- last = float(m.group(1))
152
  return last
153
 
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  def _fetch_job_logs_safe(
156
  api,
157
  *,
@@ -180,12 +286,20 @@ def _fetch_job_logs_safe(
180
  if last_exc is not None:
181
  raise last_exc
182
  return []
 
 
 
 
 
 
 
183
 
184
 
185
  def _trial_env(trial: optuna.Trial, args: argparse.Namespace, metrics_path: Path) -> dict[str, str]:
186
  env = os.environ.copy()
187
  full_arch_hpo = env.get("HYDRA_HPO_FULL_ARCH", "0") == "1"
188
  speed_arch_hpo = full_arch_hpo and env.get("HYDRA_HPO_SPEED_ARCH", "0") == "1"
 
189
 
190
  # Runtime and reporting
191
  env["HYDRA_METRICS_OUT"] = str(metrics_path)
@@ -203,6 +317,12 @@ def _trial_env(trial: optuna.Trial, args: argparse.Namespace, metrics_path: Path
203
  env["HYDRA_D_STATE"] = str(trial.suggest_categorical("d_state", [16, 32]))
204
  env["HYDRA_HEADDIM"] = str(trial.suggest_categorical("headdim", [16, 32]))
205
  env["HYDRA_EXPAND"] = str(trial.suggest_categorical("expand", [1, 2]))
 
 
 
 
 
 
206
  else:
207
  env["HYDRA_D_MODEL"] = str(trial.suggest_categorical("d_model", [64, 96, 128, 160, 192]))
208
  env["HYDRA_N_LAYER"] = str(trial.suggest_int("n_layer", 1, 4))
@@ -214,6 +334,10 @@ def _trial_env(trial: optuna.Trial, args: argparse.Namespace, metrics_path: Path
214
  seq_len = trial.suggest_categorical("seq_len", [64, 128])
215
  batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
216
  grad_accum = trial.suggest_categorical("grad_accum", [4, 8, 16])
 
 
 
 
217
  else:
218
  seq_len = trial.suggest_categorical("seq_len", [32, 64])
219
  batch_size = trial.suggest_categorical("batch_size", [4, 8] if full_arch_hpo else [4, 8, 16])
@@ -224,22 +348,41 @@ def _trial_env(trial: optuna.Trial, args: argparse.Namespace, metrics_path: Path
224
  env["HYDRA_BATCH_SIZE"] = str(batch_size)
225
  env["HYDRA_TOTAL_BATCH"] = str(total_batch)
226
 
227
- env["HYDRA_MATRIX_LR"] = str(trial.suggest_float("matrix_lr", 0.005, 0.2, log=True))
228
- env["HYDRA_EMBED_LR"] = str(trial.suggest_float("embed_lr", 0.05, 1.0, log=True))
229
- env["HYDRA_UNEMBED_LR"] = str(trial.suggest_float("unembed_lr", 0.0005, 0.02, log=True))
 
 
 
 
 
230
 
231
  if full_arch_hpo:
232
  env["HYDRA_HYENA_LAYERS"] = ""
233
  env["HYDRA_ENGRAM_N_COLUMNS"] = str(
234
- trial.suggest_categorical("engram_n_columns", [512, 1024] if speed_arch_hpo else [512, 1024, 2048])
 
 
 
235
  )
236
  env["HYDRA_ENGRAM_LAYER_IDX"] = str(trial.suggest_int("engram_layer_idx", 0, max(0, int(env["HYDRA_N_LAYER"]) - 1)))
237
- env["HYDRA_SDR_TARGET_ACTIVE"] = str(trial.suggest_categorical("sdr_target_active", [164, 327] if speed_arch_hpo else [164, 327, 512]))
238
- env["HYDRA_HTM_LEARN_EVERY"] = str(trial.suggest_categorical("htm_learn_every", [8, 16] if speed_arch_hpo else [4, 8, 16]))
239
- env["HYDRA_HTM_SUBSAMPLE"] = str(trial.suggest_categorical("htm_subsample", [4, 8, 16] if speed_arch_hpo else [1, 2, 4, 8]))
240
- env["HYDRA_ENGRAM_SUBSAMPLE"] = str(trial.suggest_categorical("engram_subsample", [1, 2, 4] if speed_arch_hpo else [1]))
 
 
 
 
 
 
 
 
 
 
 
241
  env["HYDRA_MAMBA3_CHUNK"] = str(trial.suggest_categorical("mamba3_chunk", [32, 64]))
242
- env["HYDRA_DROPOUT"] = str(trial.suggest_categorical("dropout", [0.0, 0.1] if speed_arch_hpo else [0.0, 0.1, 0.2]))
243
  else:
244
  env["HYDRA_HYENA_LAYERS"] = trial.suggest_categorical("hyena_layers", ["", "0", "1", "0,1"])
245
 
@@ -299,8 +442,10 @@ def _space_repo_from_hf_image(image: str, namespace: str) -> str:
299
  return os.environ.get("FEATHER_HF_SPACE_REPO", f"{namespace}/feather-a10-runtime")
300
 
301
 
302
- def _objective_local(args: argparse.Namespace):
303
- def objective(trial: optuna.Trial) -> float:
 
 
304
  trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
305
  metrics_path = trial_dir / "metrics.json"
306
 
@@ -315,44 +460,67 @@ def _objective_local(args: argparse.Namespace):
315
  timeout=args.trial_timeout,
316
  )
317
 
318
- metrics: dict[str, Any] | None = None
319
  if metrics_path.exists():
320
  try:
321
  metrics = json.loads(metrics_path.read_text(encoding="utf-8"))
322
  except json.JSONDecodeError:
323
  metrics = None
324
- if metrics is None:
325
- metrics = _parse_metrics_from_stdout(proc.stdout)
326
-
327
- if metrics is None:
328
- raise optuna.TrialPruned("No metrics found (HYDRA_METRICS_OUT/[METRICS_JSON])")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
329
 
330
  if proc.returncode != 0:
331
  raise optuna.TrialPruned(f"Training failed rc={proc.returncode}")
332
 
333
- metric_key = args.metric
334
- if metric_key not in metrics or metrics[metric_key] is None:
335
- raise optuna.TrialPruned(f"Metric '{metric_key}' missing in metrics payload")
336
 
337
  tps_val = metrics.get("tps")
338
  if tps_val is not None:
339
  tps_f = float(tps_val)
340
  trial.set_user_attr("tps", tps_f)
341
- if args.min_tps is not None and tps_f < args.min_tps:
342
- raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {args.min_tps}")
343
-
344
- value = float(metrics[metric_key])
345
-
346
- # Keep useful context on trial
347
- trial.set_user_attr("summary_path", metrics.get("summary_path"))
348
- trial.set_user_attr("run_log_path", metrics.get("run_log_path"))
 
 
 
349
 
350
- return value
 
 
 
 
351
 
352
  return objective
353
 
354
 
355
- def _objective_hf_job(args: argparse.Namespace):
356
  from huggingface_hub import HfApi
357
  from huggingface_hub.utils import get_token
358
 
@@ -362,8 +530,9 @@ def _objective_hf_job(args: argparse.Namespace):
362
  f"No Hugging Face token found. Set {args.hf_token_env} or run huggingface-cli login."
363
  )
364
 
365
- api = HfApi(token=token)
366
- terminal_states = {"ERROR", "COMPLETED", "CANCELLED", "TIMEOUT", "FAILED", "CANCELED"}
 
367
 
368
  def objective(trial: optuna.Trial) -> float:
369
  trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
@@ -474,50 +643,66 @@ def _objective_hf_job(args: argparse.Namespace):
474
  except Exception:
475
  pass
476
 
477
- # Save logs for debugging
478
- (trial_dir / "hf_job.log").write_text("\n".join(log_lines), encoding="utf-8")
479
- trial.set_user_attr("hf_stage", stage)
480
- trial.set_user_attr("hf_log_lines", len(log_lines))
 
 
 
 
 
481
  if terminal_detail:
482
  trial.set_user_attr("hf_status_message", terminal_detail)
483
 
484
- if metrics is None:
485
- if args.allow_log_metric_fallback and args.metric == "val_bpb":
486
- fallback_bpb = _parse_last_train_bpb_from_logs(log_lines)
487
- if fallback_bpb is not None:
488
- trial.set_user_attr("metric_source", "log_bpb_fallback")
489
- if tps_seen is not None:
490
- trial.set_user_attr("tps", tps_seen)
491
- if args.min_tps is not None and tps_seen < args.min_tps:
492
- raise optuna.TrialPruned(f"TPS below floor: {tps_seen} < {args.min_tps}")
493
- return float(fallback_bpb)
494
- if tps_seen is not None:
495
- trial.set_user_attr("tps", tps_seen)
496
- detail = f"stage={stage}, logs={len(log_lines)}"
497
- if terminal_detail:
498
- detail = f"{detail}, message={terminal_detail}"
 
 
 
 
 
 
499
  raise optuna.TrialPruned(f"No metrics found from HF job ({detail})")
500
 
501
- metric_key = args.metric
502
- if metric_key not in metrics or metrics[metric_key] is None:
503
- raise optuna.TrialPruned(f"Metric '{metric_key}' missing in metrics payload")
504
 
505
  tps_val = metrics.get("tps")
506
  if tps_val is not None:
507
  tps_f = float(tps_val)
508
  trial.set_user_attr("tps", tps_f)
509
- if args.min_tps is not None and tps_f < args.min_tps:
510
- raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {args.min_tps}")
511
-
512
- value = float(metrics[metric_key])
513
- trial.set_user_attr("summary_path", metrics.get("summary_path"))
514
- trial.set_user_attr("run_log_path", metrics.get("run_log_path"))
515
- return value
 
 
 
 
 
 
 
516
 
517
  return objective
518
 
519
 
520
- def _objective_hf_launcher(args: argparse.Namespace):
521
  from huggingface_hub import HfApi
522
  from huggingface_hub.utils import get_token
523
 
@@ -527,8 +712,9 @@ def _objective_hf_launcher(args: argparse.Namespace):
527
  f"No Hugging Face token found. Set {args.hf_token_env} or run huggingface-cli login."
528
  )
529
 
530
- api = HfApi(token=token)
531
- terminal_states = {"ERROR", "COMPLETED", "CANCELLED", "TIMEOUT", "FAILED", "CANCELED"}
 
532
 
533
  def objective(trial: optuna.Trial) -> float:
534
  trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
@@ -610,44 +796,61 @@ def _objective_hf_launcher(args: argparse.Namespace):
610
  except Exception:
611
  pass
612
 
613
- (trial_dir / "hf_job.log").write_text("\n".join(log_lines), encoding="utf-8")
614
- trial.set_user_attr("hf_stage", stage)
615
- trial.set_user_attr("hf_log_lines", len(log_lines))
 
 
 
 
 
 
616
  if terminal_detail:
617
  trial.set_user_attr("hf_status_message", terminal_detail)
618
 
619
- if metrics is None:
620
- if args.allow_log_metric_fallback and args.metric == "val_bpb":
621
- fallback_bpb = _parse_last_train_bpb_from_logs(log_lines)
622
- if fallback_bpb is not None:
623
- trial.set_user_attr("metric_source", "log_bpb_fallback")
624
- if tps_seen is not None:
625
- trial.set_user_attr("tps", tps_seen)
626
- if args.min_tps is not None and tps_seen < args.min_tps:
627
- raise optuna.TrialPruned(f"TPS below floor: {tps_seen} < {args.min_tps}")
628
- return float(fallback_bpb)
629
- if tps_seen is not None:
630
- trial.set_user_attr("tps", tps_seen)
631
- detail = f"stage={stage}, logs={len(log_lines)}"
632
- if terminal_detail:
633
- detail = f"{detail}, message={terminal_detail}"
 
 
 
 
 
 
634
  raise optuna.TrialPruned(f"No metrics found from HF launcher job ({detail})")
635
 
636
- metric_key = args.metric
637
- if metric_key not in metrics or metrics[metric_key] is None:
638
- raise optuna.TrialPruned(f"Metric '{metric_key}' missing in metrics payload")
639
 
640
  tps_val = metrics.get("tps")
641
  if tps_val is not None:
642
  tps_f = float(tps_val)
643
  trial.set_user_attr("tps", tps_f)
644
- if args.min_tps is not None and tps_f < args.min_tps:
645
- raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {args.min_tps}")
646
-
647
- value = float(metrics[metric_key])
648
- trial.set_user_attr("summary_path", metrics.get("summary_path"))
649
- trial.set_user_attr("run_log_path", metrics.get("run_log_path"))
650
- return value
 
 
 
 
 
 
 
651
 
652
  return objective
653
 
@@ -690,6 +893,8 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
690
  parser.add_argument("--priors-file", type=Path, default=REPO_ROOT / "docs" / "hpo_transfer_priors.json", help="Path to transfer-learning prior trials JSON")
691
  parser.add_argument("--apply-priors", action="store_true", default=True, help="Enqueue transfer-learning prior trials before optimize")
692
  parser.add_argument("--no-apply-priors", action="store_false", dest="apply_priors")
 
 
693
  parser.add_argument("--seed", type=int, default=42, help="Seed for sampler")
694
  parser.add_argument("--n-startup-trials", type=int, default=5, help="Pruner startup trials before pruning")
695
  parser.add_argument("--n-warmup-steps", type=int, default=0, help="Pruner warmup steps")
@@ -720,6 +925,10 @@ def main() -> int:
720
  pruner=pruner,
721
  )
722
 
 
 
 
 
723
  enqueued_priors = _enqueue_transfer_priors(study, args.priors_file, args.apply_priors)
724
  if enqueued_priors:
725
  print(f"[hpo] enqueued {enqueued_priors} transfer priors from {args.priors_file}")
@@ -784,6 +993,8 @@ def main() -> int:
784
  "n_completed": len(completed),
785
  "patience_trials": args.patience_trials,
786
  "min_improvement": args.min_improvement,
 
 
787
  "enqueued_priors": enqueued_priors,
788
  }
789
  else:
@@ -793,10 +1004,12 @@ def main() -> int:
793
  "metric": args.metric,
794
  "best_value": None,
795
  "best_params": {},
796
- "best_trial_number": None,
797
  "best_trial_user_attrs": {},
798
  "n_trials": len(study.trials),
799
  "n_completed": 0,
 
 
800
  "enqueued_priors": enqueued_priors,
801
  "note": "No completed trials with metrics found.",
802
  }
 
108
  if after > before:
109
  enqueued += 1
110
  return enqueued
111
+
112
+
113
def _enqueue_quality_anchors(study: optuna.Study, priors_file: Path, quality_mode_local: bool, top_k: int) -> int:
    """Enqueue the leading prior parameter sets as "quality anchor" trials.

    Nothing is enqueued (and 0 is returned) unless ``quality_mode_local`` is
    truthy and ``top_k`` is positive. Otherwise the first ``top_k`` entries
    returned by ``_load_prior_param_sets(priors_file)`` are passed to
    ``study.enqueue_trial`` tagged with ``seed_source="quality_anchor"``.

    Returns:
        The number of enqueue calls after which the study's trial count grew.
    """
    anchors_requested = quality_mode_local and top_k > 0
    if not anchors_requested:
        return 0

    added = 0
    for param_set in _load_prior_param_sets(priors_file)[:top_k]:
        count_before = len(study.get_trials(deepcopy=False))
        try:
            study.enqueue_trial(
                param_set,
                user_attrs={"seed_source": "quality_anchor"},
                skip_if_exists=True,
            )
        except TypeError:
            # Retry without ``skip_if_exists``; presumably this covers Optuna
            # releases that do not accept that keyword - TODO confirm.
            study.enqueue_trial(param_set, user_attrs={"seed_source": "quality_anchor"})
        count_after = len(study.get_trials(deepcopy=False))
        # Only count calls that actually produced a new trial entry.
        if count_after > count_before:
            added += 1
    return added
133
 
134
 
135
  def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any] | None:
 
165
 
166
 
167
def _parse_last_train_bpb_from_logs(lines: list[str]) -> float | None:
    """Return the most recent ``bpb=<number>`` value found in training logs.

    Best-effort fallback when final eval crashes before metrics JSON write.

    Only the last matching line matters, so the log is scanned from the end
    and the search stops at the first hit instead of walking every line.
    Within a line the first ``bpb=`` occurrence is used. The ``\\b`` anchor
    means keys such as ``val_bpb=`` do not match, because ``_`` is a word
    character.

    Args:
        lines: Log lines in chronological order (must support ``reversed``).

    Returns:
        The parsed value as a float, or ``None`` when no line contains a
        plain decimal ``bpb=`` value.
    """
    for line in reversed(lines):
        m = re.search(r"\bbpb=([0-9]+(?:\.[0-9]+)?)", line)
        if m:
            return float(m.group(1))
    return None
175
 
176
 
177
def _persist_trial_artifacts(
    *,
    trial_dir: Path,
    metrics: dict[str, Any] | None,
    log_lines: list[str] | None,
    log_name: str,
    metadata: dict[str, Any],
) -> dict[str, str | None]:
    """Write a trial's metrics, log and manifest files into ``trial_dir``.

    ``trial_dir`` is created if needed. ``metrics.json`` is written only when
    ``metrics`` is not None, and ``log_name`` only when ``log_lines`` is not
    None (lines joined with newlines). ``trial_artifacts.json`` is always
    written and holds ``metadata`` plus the two artifact paths.

    Returns:
        A mapping with ``metrics_path``, ``log_path`` and ``manifest_path``;
        the first two are None when the corresponding file was not written.
    """
    trial_dir.mkdir(parents=True, exist_ok=True)
    metrics_file = trial_dir / "metrics.json"
    log_file = trial_dir / log_name
    manifest_file = trial_dir / "trial_artifacts.json"

    have_metrics = metrics is not None
    have_log = log_lines is not None

    if have_metrics:
        metrics_file.write_text(json.dumps(metrics, indent=2, sort_keys=True), encoding="utf-8")
    if have_log:
        log_file.write_text("\n".join(log_lines), encoding="utf-8")

    # A path is only recorded for a file that was actually written.
    written = {
        "metrics_path": str(metrics_file) if have_metrics else None,
        "log_path": str(log_file) if have_log else None,
    }
    manifest_file.write_text(
        json.dumps({**metadata, **written}, indent=2, sort_keys=True),
        encoding="utf-8",
    )
    return {**written, "manifest_path": str(manifest_file)}
206
+
207
+
208
def _resolve_objective_metric(
    trial: optuna.Trial,
    *,
    metric_key: str,
    metrics: dict[str, Any] | None,
    allow_log_metric_fallback: bool,
    fallback_bpb: float | None,
    tps_seen: float | None,
) -> float:
    """Resolve the objective value while labeling where it came from.

    Validation metrics and live training-log fallbacks are intentionally
    different sources. Keeping that distinction in trial attrs prevents a
    skipped/OOM eval from being mistaken for a real validation result.

    Args:
        trial: Trial whose user attrs receive the provenance labels.
        metric_key: Key of the objective inside ``metrics``.
        metrics: Parsed metrics payload, or None when none was produced.
        allow_log_metric_fallback: Permit using ``fallback_bpb`` when
            ``metrics`` is None and ``metric_key`` is ``"val_bpb"``.
        fallback_bpb: Last bpb value scraped from training logs, if any.
        tps_seen: Throughput observed in logs; recorded only on the
            log-fallback path.

    Returns:
        The objective value as a float.

    Raises:
        optuna.TrialPruned: If there is no metrics payload and no usable
            fallback, or the payload lacks a non-None ``metric_key`` value.
    """
    record = trial.set_user_attr

    if metrics is None:
        log_fallback_ok = (
            allow_log_metric_fallback
            and metric_key == "val_bpb"
            and fallback_bpb is not None
        )
        if not log_fallback_ok:
            record("objective_source", "missing_metrics")
            raise optuna.TrialPruned("No metrics payload found")

        # The objective is a training-log number, so label it as such.
        record("objective_source", "train_log_fallback")
        record("objective_metric", "train_bpb")
        record("eval_status", "missing_metrics")
        record("train_bpb_fallback", float(fallback_bpb))
        if tps_seen is not None:
            record("tps", float(tps_seen))
        return float(fallback_bpb)

    # Without an explicit status, infer one from the presence of val_bpb.
    inferred_status = "completed" if metrics.get("val_bpb") is not None else "unknown"
    record("eval_status", str(metrics.get("eval_status", inferred_status)))

    if fallback_bpb is not None:
        record("train_bpb_fallback", float(fallback_bpb))

    raw_value = metrics.get(metric_key)
    if raw_value is None:
        record("objective_source", "missing_metric")
        record("objective_metric", metric_key)
        raise optuna.TrialPruned(f"Metric '{metric_key}' missing in metrics payload")

    value = float(raw_value)
    record("objective_metric", metric_key)
    if metric_key != "val_bpb":
        record("objective_source", "metrics_json")
        return value
    record("objective_source", "final_val")
    record("final_val_bpb", value)
    return value
259
+
260
+
261
  def _fetch_job_logs_safe(
262
  api,
263
  *,
 
286
  if last_exc is not None:
287
  raise last_exc
288
  return []
289
+
290
+
291
def _effective_min_tps(args: argparse.Namespace) -> float | None:
    """Return the throughput floor that trials should be checked against.

    When ``args.quality_mode_local`` is truthy and ``args.min_tps`` equals
    50000.0, the floor is replaced by 0.0. In every other case
    ``args.min_tps`` is returned unchanged (including None).
    """
    configured_floor = args.min_tps
    quality_mode = getattr(args, "quality_mode_local", False)
    # NOTE(review): 50000.0 presumably mirrors the --min-tps default, so an
    # explicitly requested 50000 is also relaxed - confirm against parse_args.
    return 0.0 if (quality_mode and configured_floor == 50000.0) else configured_floor
296
 
297
 
298
  def _trial_env(trial: optuna.Trial, args: argparse.Namespace, metrics_path: Path) -> dict[str, str]:
299
  env = os.environ.copy()
300
  full_arch_hpo = env.get("HYDRA_HPO_FULL_ARCH", "0") == "1"
301
  speed_arch_hpo = full_arch_hpo and env.get("HYDRA_HPO_SPEED_ARCH", "0") == "1"
302
+ quality_mode_local = bool(getattr(args, "quality_mode_local", False))
303
 
304
  # Runtime and reporting
305
  env["HYDRA_METRICS_OUT"] = str(metrics_path)
 
317
  env["HYDRA_D_STATE"] = str(trial.suggest_categorical("d_state", [16, 32]))
318
  env["HYDRA_HEADDIM"] = str(trial.suggest_categorical("headdim", [16, 32]))
319
  env["HYDRA_EXPAND"] = str(trial.suggest_categorical("expand", [1, 2]))
320
+ elif quality_mode_local and full_arch_hpo:
321
+ env["HYDRA_D_MODEL"] = str(trial.suggest_categorical("d_model", [64, 96, 128]))
322
+ env["HYDRA_N_LAYER"] = str(trial.suggest_int("n_layer", 2, 3))
323
+ env["HYDRA_D_STATE"] = str(trial.suggest_categorical("d_state", [16, 32]))
324
+ env["HYDRA_HEADDIM"] = str(trial.suggest_categorical("headdim", [16, 32]))
325
+ env["HYDRA_EXPAND"] = str(trial.suggest_categorical("expand", [1, 2]))
326
  else:
327
  env["HYDRA_D_MODEL"] = str(trial.suggest_categorical("d_model", [64, 96, 128, 160, 192]))
328
  env["HYDRA_N_LAYER"] = str(trial.suggest_int("n_layer", 1, 4))
 
334
  seq_len = trial.suggest_categorical("seq_len", [64, 128])
335
  batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
336
  grad_accum = trial.suggest_categorical("grad_accum", [4, 8, 16])
337
+ elif quality_mode_local and full_arch_hpo:
338
+ seq_len = trial.suggest_categorical("seq_len", [64])
339
+ batch_size = trial.suggest_categorical("batch_size", [4, 8])
340
+ grad_accum = trial.suggest_categorical("grad_accum", [4, 8, 16])
341
  else:
342
  seq_len = trial.suggest_categorical("seq_len", [32, 64])
343
  batch_size = trial.suggest_categorical("batch_size", [4, 8] if full_arch_hpo else [4, 8, 16])
 
348
  env["HYDRA_BATCH_SIZE"] = str(batch_size)
349
  env["HYDRA_TOTAL_BATCH"] = str(total_batch)
350
 
351
+ if quality_mode_local and full_arch_hpo:
352
+ env["HYDRA_MATRIX_LR"] = str(trial.suggest_float("matrix_lr", 0.008, 0.03, log=True))
353
+ env["HYDRA_EMBED_LR"] = str(trial.suggest_float("embed_lr", 0.15, 0.6, log=True))
354
+ env["HYDRA_UNEMBED_LR"] = str(trial.suggest_float("unembed_lr", 0.001, 0.01, log=True))
355
+ else:
356
+ env["HYDRA_MATRIX_LR"] = str(trial.suggest_float("matrix_lr", 0.005, 0.2, log=True))
357
+ env["HYDRA_EMBED_LR"] = str(trial.suggest_float("embed_lr", 0.05, 1.0, log=True))
358
+ env["HYDRA_UNEMBED_LR"] = str(trial.suggest_float("unembed_lr", 0.0005, 0.02, log=True))
359
 
360
  if full_arch_hpo:
361
  env["HYDRA_HYENA_LAYERS"] = ""
362
  env["HYDRA_ENGRAM_N_COLUMNS"] = str(
363
+ trial.suggest_categorical(
364
+ "engram_n_columns",
365
+ [512, 1024] if (speed_arch_hpo or quality_mode_local) else [512, 1024, 2048],
366
+ )
367
  )
368
  env["HYDRA_ENGRAM_LAYER_IDX"] = str(trial.suggest_int("engram_layer_idx", 0, max(0, int(env["HYDRA_N_LAYER"]) - 1)))
369
+ env["HYDRA_SDR_TARGET_ACTIVE"] = str(
370
+ trial.suggest_categorical(
371
+ "sdr_target_active",
372
+ [327] if quality_mode_local else ([164, 327] if speed_arch_hpo else [164, 327, 512]),
373
+ )
374
+ )
375
+ env["HYDRA_HTM_LEARN_EVERY"] = str(
376
+ trial.suggest_categorical("htm_learn_every", [8, 16] if (speed_arch_hpo or quality_mode_local) else [4, 8, 16])
377
+ )
378
+ env["HYDRA_HTM_SUBSAMPLE"] = str(
379
+ trial.suggest_categorical("htm_subsample", [1, 2] if quality_mode_local else ([4, 8, 16] if speed_arch_hpo else [1, 2, 4, 8]))
380
+ )
381
+ env["HYDRA_ENGRAM_SUBSAMPLE"] = str(
382
+ trial.suggest_categorical("engram_subsample", [1, 2] if quality_mode_local else ([1, 2, 4] if speed_arch_hpo else [1]))
383
+ )
384
  env["HYDRA_MAMBA3_CHUNK"] = str(trial.suggest_categorical("mamba3_chunk", [32, 64]))
385
+ env["HYDRA_DROPOUT"] = str(trial.suggest_categorical("dropout", [0.0, 0.1] if (speed_arch_hpo or quality_mode_local) else [0.0, 0.1, 0.2]))
386
  else:
387
  env["HYDRA_HYENA_LAYERS"] = trial.suggest_categorical("hyena_layers", ["", "0", "1", "0,1"])
388
 
 
442
  return os.environ.get("FEATHER_HF_SPACE_REPO", f"{namespace}/feather-a10-runtime")
443
 
444
 
445
+ def _objective_local(args: argparse.Namespace):
446
+ effective_min_tps = _effective_min_tps(args)
447
+
448
+ def objective(trial: optuna.Trial) -> float:
449
  trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
450
  metrics_path = trial_dir / "metrics.json"
451
 
 
460
  timeout=args.trial_timeout,
461
  )
462
 
463
+ metrics: dict[str, Any] | None = None
464
  if metrics_path.exists():
465
  try:
466
  metrics = json.loads(metrics_path.read_text(encoding="utf-8"))
467
  except json.JSONDecodeError:
468
  metrics = None
469
+ if metrics is None:
470
+ metrics = _parse_metrics_from_stdout(proc.stdout)
471
+
472
+ artifact_paths = _persist_trial_artifacts(
473
+ trial_dir=trial_dir,
474
+ metrics=metrics,
475
+ log_lines=(proc.stdout or "").splitlines(),
476
+ log_name="train_stdout.log",
477
+ metadata={"runner": "local", "returncode": proc.returncode},
478
+ )
479
+ (trial_dir / "train_stderr.log").write_text(proc.stderr or "", encoding="utf-8")
480
+
481
+ fallback_bpb = _parse_last_train_bpb_from_logs(proc.stdout.splitlines())
482
+ if metrics is None:
483
+ _resolve_objective_metric(
484
+ trial,
485
+ metric_key=args.metric,
486
+ metrics=None,
487
+ allow_log_metric_fallback=args.allow_log_metric_fallback,
488
+ fallback_bpb=fallback_bpb,
489
+ tps_seen=None,
490
+ )
491
+ raise optuna.TrialPruned("No metrics found (HYDRA_METRICS_OUT/[METRICS_JSON])")
492
 
493
  if proc.returncode != 0:
494
  raise optuna.TrialPruned(f"Training failed rc={proc.returncode}")
495
 
496
+ metric_key = args.metric
 
 
497
 
498
  tps_val = metrics.get("tps")
499
  if tps_val is not None:
500
  tps_f = float(tps_val)
501
  trial.set_user_attr("tps", tps_f)
502
+ if effective_min_tps is not None and tps_f < effective_min_tps:
503
+ raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {effective_min_tps}")
504
+
505
+ value = _resolve_objective_metric(
506
+ trial,
507
+ metric_key=metric_key,
508
+ metrics=metrics,
509
+ allow_log_metric_fallback=args.allow_log_metric_fallback,
510
+ fallback_bpb=fallback_bpb,
511
+ tps_seen=None,
512
+ )
513
 
514
+ # Keep useful context on trial
515
+ trial.set_user_attr("summary_path", metrics.get("summary_path") or artifact_paths["manifest_path"])
516
+ trial.set_user_attr("run_log_path", metrics.get("run_log_path") or artifact_paths["log_path"])
517
+
518
+ return value
519
 
520
  return objective
521
 
522
 
523
+ def _objective_hf_job(args: argparse.Namespace):
524
  from huggingface_hub import HfApi
525
  from huggingface_hub.utils import get_token
526
 
 
530
  f"No Hugging Face token found. Set {args.hf_token_env} or run huggingface-cli login."
531
  )
532
 
533
+ api = HfApi(token=token)
534
+ terminal_states = {"ERROR", "COMPLETED", "CANCELLED", "TIMEOUT", "FAILED", "CANCELED"}
535
+ effective_min_tps = _effective_min_tps(args)
536
 
537
  def objective(trial: optuna.Trial) -> float:
538
  trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
 
643
  except Exception:
644
  pass
645
 
646
+ artifact_paths = _persist_trial_artifacts(
647
+ trial_dir=trial_dir,
648
+ metrics=metrics,
649
+ log_lines=log_lines,
650
+ log_name="hf_job.log",
651
+ metadata={"runner": "hf-job", "hf_job_id": job_id, "hf_stage": stage},
652
+ )
653
+ trial.set_user_attr("hf_stage", stage)
654
+ trial.set_user_attr("hf_log_lines", len(log_lines))
655
  if terminal_detail:
656
  trial.set_user_attr("hf_status_message", terminal_detail)
657
 
658
+ fallback_bpb = _parse_last_train_bpb_from_logs(log_lines)
659
+ if metrics is None:
660
+ try:
661
+ value = _resolve_objective_metric(
662
+ trial,
663
+ metric_key=args.metric,
664
+ metrics=None,
665
+ allow_log_metric_fallback=args.allow_log_metric_fallback,
666
+ fallback_bpb=fallback_bpb,
667
+ tps_seen=tps_seen,
668
+ )
669
+ if tps_seen is not None and effective_min_tps is not None and tps_seen < effective_min_tps:
670
+ raise optuna.TrialPruned(f"TPS below floor: {tps_seen} < {effective_min_tps}")
671
+ return value
672
+ except optuna.TrialPruned:
673
+ pass
674
+ if tps_seen is not None:
675
+ trial.set_user_attr("tps", tps_seen)
676
+ detail = f"stage={stage}, logs={len(log_lines)}"
677
+ if terminal_detail:
678
+ detail = f"{detail}, message={terminal_detail}"
679
  raise optuna.TrialPruned(f"No metrics found from HF job ({detail})")
680
 
681
+ metric_key = args.metric
 
 
682
 
683
  tps_val = metrics.get("tps")
684
  if tps_val is not None:
685
  tps_f = float(tps_val)
686
  trial.set_user_attr("tps", tps_f)
687
+ if effective_min_tps is not None and tps_f < effective_min_tps:
688
+ raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {effective_min_tps}")
689
+
690
+ value = _resolve_objective_metric(
691
+ trial,
692
+ metric_key=metric_key,
693
+ metrics=metrics,
694
+ allow_log_metric_fallback=args.allow_log_metric_fallback,
695
+ fallback_bpb=fallback_bpb,
696
+ tps_seen=tps_seen,
697
+ )
698
+ trial.set_user_attr("summary_path", metrics.get("summary_path") or artifact_paths["manifest_path"])
699
+ trial.set_user_attr("run_log_path", metrics.get("run_log_path") or artifact_paths["log_path"])
700
+ return value
701
 
702
  return objective
703
 
704
 
705
+ def _objective_hf_launcher(args: argparse.Namespace):
706
  from huggingface_hub import HfApi
707
  from huggingface_hub.utils import get_token
708
 
 
712
  f"No Hugging Face token found. Set {args.hf_token_env} or run huggingface-cli login."
713
  )
714
 
715
+ api = HfApi(token=token)
716
+ terminal_states = {"ERROR", "COMPLETED", "CANCELLED", "TIMEOUT", "FAILED", "CANCELED"}
717
+ effective_min_tps = _effective_min_tps(args)
718
 
719
  def objective(trial: optuna.Trial) -> float:
720
  trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
 
796
  except Exception:
797
  pass
798
 
799
+ artifact_paths = _persist_trial_artifacts(
800
+ trial_dir=trial_dir,
801
+ metrics=metrics,
802
+ log_lines=log_lines,
803
+ log_name="hf_job.log",
804
+ metadata={"runner": "hf-launcher", "hf_job_id": job_id, "hf_stage": stage},
805
+ )
806
+ trial.set_user_attr("hf_stage", stage)
807
+ trial.set_user_attr("hf_log_lines", len(log_lines))
808
  if terminal_detail:
809
  trial.set_user_attr("hf_status_message", terminal_detail)
810
 
811
+ fallback_bpb = _parse_last_train_bpb_from_logs(log_lines)
812
+ if metrics is None:
813
+ try:
814
+ value = _resolve_objective_metric(
815
+ trial,
816
+ metric_key=args.metric,
817
+ metrics=None,
818
+ allow_log_metric_fallback=args.allow_log_metric_fallback,
819
+ fallback_bpb=fallback_bpb,
820
+ tps_seen=tps_seen,
821
+ )
822
+ if tps_seen is not None and effective_min_tps is not None and tps_seen < effective_min_tps:
823
+ raise optuna.TrialPruned(f"TPS below floor: {tps_seen} < {effective_min_tps}")
824
+ return value
825
+ except optuna.TrialPruned:
826
+ pass
827
+ if tps_seen is not None:
828
+ trial.set_user_attr("tps", tps_seen)
829
+ detail = f"stage={stage}, logs={len(log_lines)}"
830
+ if terminal_detail:
831
+ detail = f"{detail}, message={terminal_detail}"
832
  raise optuna.TrialPruned(f"No metrics found from HF launcher job ({detail})")
833
 
834
+ metric_key = args.metric
 
 
835
 
836
  tps_val = metrics.get("tps")
837
  if tps_val is not None:
838
  tps_f = float(tps_val)
839
  trial.set_user_attr("tps", tps_f)
840
+ if effective_min_tps is not None and tps_f < effective_min_tps:
841
+ raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {effective_min_tps}")
842
+
843
+ value = _resolve_objective_metric(
844
+ trial,
845
+ metric_key=metric_key,
846
+ metrics=metrics,
847
+ allow_log_metric_fallback=args.allow_log_metric_fallback,
848
+ fallback_bpb=fallback_bpb,
849
+ tps_seen=tps_seen,
850
+ )
851
+ trial.set_user_attr("summary_path", metrics.get("summary_path") or artifact_paths["manifest_path"])
852
+ trial.set_user_attr("run_log_path", metrics.get("run_log_path") or artifact_paths["log_path"])
853
+ return value
854
 
855
  return objective
856
 
 
893
  parser.add_argument("--priors-file", type=Path, default=REPO_ROOT / "docs" / "hpo_transfer_priors.json", help="Path to transfer-learning prior trials JSON")
894
  parser.add_argument("--apply-priors", action="store_true", default=True, help="Enqueue transfer-learning prior trials before optimize")
895
  parser.add_argument("--no-apply-priors", action="store_false", dest="apply_priors")
896
+ parser.add_argument("--quality-mode-local", action="store_true", default=False, help="Narrow local full-architecture search around the proven quality-winning region")
897
+ parser.add_argument("--quality-anchor-top-k", type=int, default=3, help="Number of top clean priors to enqueue as deterministic local quality anchors")
898
  parser.add_argument("--seed", type=int, default=42, help="Seed for sampler")
899
  parser.add_argument("--n-startup-trials", type=int, default=5, help="Pruner startup trials before pruning")
900
  parser.add_argument("--n-warmup-steps", type=int, default=0, help="Pruner warmup steps")
 
925
  pruner=pruner,
926
  )
927
 
928
+ enqueued_quality_anchors = _enqueue_quality_anchors(study, args.priors_file, args.quality_mode_local, args.quality_anchor_top_k)
929
+ if enqueued_quality_anchors:
930
+ print(f"[hpo] enqueued {enqueued_quality_anchors} local quality anchors from {args.priors_file}")
931
+
932
  enqueued_priors = _enqueue_transfer_priors(study, args.priors_file, args.apply_priors)
933
  if enqueued_priors:
934
  print(f"[hpo] enqueued {enqueued_priors} transfer priors from {args.priors_file}")
 
993
  "n_completed": len(completed),
994
  "patience_trials": args.patience_trials,
995
  "min_improvement": args.min_improvement,
996
+ "quality_mode_local": args.quality_mode_local,
997
+ "enqueued_quality_anchors": enqueued_quality_anchors,
998
  "enqueued_priors": enqueued_priors,
999
  }
1000
  else:
 
1004
  "metric": args.metric,
1005
  "best_value": None,
1006
  "best_params": {},
1007
+ "best_trial_number": None,
1008
  "best_trial_user_attrs": {},
1009
  "n_trials": len(study.trials),
1010
  "n_completed": 0,
1011
+ "quality_mode_local": args.quality_mode_local,
1012
+ "enqueued_quality_anchors": enqueued_quality_anchors,
1013
  "enqueued_priors": enqueued_priors,
1014
  "note": "No completed trials with metrics found.",
1015
  }
overlay/scripts/run_cycle1a.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ REPO_ROOT = Path(__file__).resolve().parents[1]
9
+ if str(REPO_ROOT) not in sys.path:
10
+ sys.path.insert(0, str(REPO_ROOT))
11
+
12
+ from scripts import cycle_executor
13
+
14
+
15
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options for the local Cycle 1a benchmark runner.

    Args:
        argv: Argument list to parse; None is passed through to argparse.

    Returns:
        The populated namespace.
    """
    artifacts_dir = REPO_ROOT / "artifacts"
    parser = argparse.ArgumentParser(description="Run the full local Cycle 1a benchmark suite")

    # Output locations, each defaulting to a path under <repo>/artifacts.
    for flag, default_name in (
        ("--out-dir", "cycle1a_runs"),
        ("--preflight-out", "cycle1a_preflight.json"),
        ("--summary-out", "cycle1a_summary.json"),
    ):
        parser.add_argument(flag, type=Path, default=artifacts_dir / default_name)

    # Boolean switches.
    for switch in ("--hydrate-assets", "--require-ready"):
        parser.add_argument(switch, action="store_true")

    # Optional string overrides with no default.
    for option in ("--output-repo", "--tokenizer-repo"):
        parser.add_argument(option)

    return parser.parse_args(argv)
25
+
26
+
27
def main(argv: list[str] | None = None) -> int:
    """Translate this script's options into a ``cycle_executor.main`` call.

    The benchmark, variant and seed arguments are fixed here, together with
    ``--all-runnable`` and ``--all-benchmarks``; optional flags are forwarded
    only when set on the command line.

    Returns:
        Whatever ``cycle_executor.main`` returns.
    """
    args = parse_args(argv)

    forwarded = [
        "--benchmark", "GSM8K",
        "--variant", "hydra_full",
        "--seed", "42",
        "--out-dir", str(args.out_dir),
        "--preflight-out", str(args.preflight_out),
        "--summary-out", str(args.summary_out),
        "--all-runnable",
        "--all-benchmarks",
    ]
    if args.hydrate_assets:
        forwarded.append("--hydrate-assets")
    if args.require_ready:
        forwarded.append("--require-ready")
    if args.output_repo:
        forwarded.extend(["--output-repo", args.output_repo])
    if args.tokenizer_repo:
        forwarded.extend(["--tokenizer-repo", args.tokenizer_repo])

    return cycle_executor.main(forwarded)
43
+
44
+
45
+ if __name__ == "__main__":
46
+ raise SystemExit(main())
overlay/scripts/sweep_depth_aggregate.py CHANGED
@@ -26,6 +26,8 @@ type MetricsDict = dict[str, MetricValue]
26
  MANIFEST = Path(sys.argv[1] if len(sys.argv) > 1 else '/tmp/sweep_depth_manifest.txt')
27
  STEP_TPS_PATTERN = re.compile(r"step=(\d+).*?\btps=(\d+)\b")
28
  MIN_TPS = float(os.environ.get('SWEEP_MIN_TPS', '0'))
 
 
29
 
30
 
31
  def _zero_shot_score(result: MetricsDict) -> float:
@@ -47,6 +49,25 @@ def _metric_int(result: MetricsDict, key: str, default: int = 0) -> int:
47
  return int(value) if isinstance(value, int) else default
48
 
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def _percentile_linear(sorted_values: list[float], pct: float) -> float:
51
  if not sorted_values:
52
  return 0.0
@@ -210,6 +231,28 @@ def compare(results: dict[int, MetricsDict]) -> None:
210
  )
211
  if MIN_TPS > 0:
212
  print(f"[agg] throughput gate: tps_median >= {MIN_TPS:.0f}; feasible={feasible_count}/{len(ranked)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
 
214
 
215
  def main() -> int:
 
26
  MANIFEST = Path(sys.argv[1] if len(sys.argv) > 1 else '/tmp/sweep_depth_manifest.txt')
27
  STEP_TPS_PATTERN = re.compile(r"step=(\d+).*?\btps=(\d+)\b")
28
  MIN_TPS = float(os.environ.get('SWEEP_MIN_TPS', '0'))
29
+ TARGET_TOKENS_M = float(os.environ.get('SWEEP_TARGET_TOKENS_M', '0'))
30
+ TARGET_SECONDS = float(os.environ.get('SWEEP_TARGET_SECONDS', '0'))
31
 
32
 
33
  def _zero_shot_score(result: MetricsDict) -> float:
 
49
  return int(value) if isinstance(value, int) else default
50
 
51
 
52
def _fixed_budget_ranking(results: dict[int, MetricsDict], *, metric_key: str, target: float) -> list[tuple[int, MetricsDict, float]]:
    """Rank sweep rows by how close their ``metric_key`` value is to ``target``.

    Rows whose ``metric_key`` value is not an int or float are left out.
    Ties on the gap are broken by lower ``val_bpb``, then higher zero-shot
    score, then higher ``tps_median``.

    Returns:
        ``(n_layer, row, gap)`` tuples, best match first, where ``gap`` is
        the absolute difference between the row's value and ``target``.
    """

    def _sort_key(entry):
        _, row, gap = entry
        return (
            gap,
            _metric_float(row, 'val_bpb', float('inf')),
            -_zero_shot_score(row),
            -_metric_float(row, 'tps_median', 0.0),
        )

    candidates = [
        (n_layer, row, abs(float(row.get(metric_key)) - target))
        for n_layer, row in results.items()
        if isinstance(row.get(metric_key), (int, float))
    ]
    return sorted(candidates, key=_sort_key)
69
+
70
+
71
  def _percentile_linear(sorted_values: list[float], pct: float) -> float:
72
  if not sorted_values:
73
  return 0.0
 
231
  )
232
  if MIN_TPS > 0:
233
  print(f"[agg] throughput gate: tps_median >= {MIN_TPS:.0f}; feasible={feasible_count}/{len(ranked)}")
234
+
235
+ if TARGET_TOKENS_M > 0:
236
+ print('\n=== Fixed-token champion comparison ===')
237
+ print(f' target_tokens_M={TARGET_TOKENS_M:.4f}')
238
+ for n, r, gap in _fixed_budget_ranking(results, metric_key='total_tokens_M', target=TARGET_TOKENS_M):
239
+ print(
240
+ f" n_layer={n:2d} val_bpb={_metric_float(r, 'val_bpb', float('nan')):.4f} "
241
+ f"total_tokens_M={_metric_float(r, 'total_tokens_M', float('nan')):.4f} "
242
+ f"token_gap_M={gap:.4f} tps_median={_metric_float(r, 'tps_median', 0.0):.0f}",
243
+ flush=True,
244
+ )
245
+
246
+ if TARGET_SECONDS > 0:
247
+ print('\n=== Fixed-time champion comparison ===')
248
+ print(f' target_seconds={TARGET_SECONDS:.1f}')
249
+ for n, r, gap in _fixed_budget_ranking(results, metric_key='training_seconds', target=TARGET_SECONDS):
250
+ print(
251
+ f" n_layer={n:2d} val_bpb={_metric_float(r, 'val_bpb', float('nan')):.4f} "
252
+ f"training_seconds={_metric_float(r, 'training_seconds', float('nan')):.1f} "
253
+ f"time_gap_s={gap:.1f} tps_median={_metric_float(r, 'tps_median', 0.0):.0f}",
254
+ flush=True,
255
+ )
256
 
257
 
258
  def main() -> int:
overlay/scripts/watch_benchmark_hf_job.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import json
6
+ from pathlib import Path
7
+
8
+
9
def parse_benchmark_result_from_logs(lines: list[str]):
    """Return the last benchmark result object found in ``lines``.

    Lines are examined from the end. A line qualifies when, after stripping
    surrounding whitespace, it starts with ``{``, parses as JSON, and decodes
    to a dict containing a ``"benchmark"`` key.

    Returns:
        The decoded dict of the last qualifying line, or None if there is none.
    """
    for raw_line in reversed(lines):
        candidate = raw_line.strip()
        if candidate.startswith("{"):
            try:
                decoded = json.loads(candidate)
            except json.JSONDecodeError:
                # Not valid JSON after all; keep looking at earlier lines.
                decoded = None
            if isinstance(decoded, dict) and "benchmark" in decoded:
                return decoded
    return None
21
+
22
+
23
def write_watch_summary(path: Path, payload: dict[str, object]) -> None:
    """Write ``payload`` to ``path`` as indented, key-sorted UTF-8 JSON.

    Missing parent directories of ``path`` are created first.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2, sort_keys=True)
    path.write_text(serialized, encoding="utf-8")
26
+
27
+
28
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options for the benchmark job watcher.

    ``--job-id`` is required, ``--namespace`` defaults to ``"jackoatmon"``,
    and ``--summary-out`` is an optional path.

    Args:
        argv: Argument list to parse; None is passed through to argparse.
    """
    cli = argparse.ArgumentParser(description="Watch or snapshot a remote benchmark job")
    cli.add_argument("--job-id", required=True)
    cli.add_argument("--namespace", default="jackoatmon")
    cli.add_argument("--summary-out", type=Path)
    parsed = cli.parse_args(argv)
    return parsed