Spaces:
Runtime error
Runtime error
Update benchmark runtime image
Browse files- __pycache__/entrypoint.cpython-312.pyc +0 -0
- entrypoint.py +47 -8
- overlay/hydra/model.py +67 -20
- overlay/hydra/training.py +379 -85
- overlay/prepare.py +60 -34
- overlay/scripts/audit_overlay_sync.py +100 -0
- overlay/scripts/benchmark_assets.py +124 -0
- overlay/scripts/benchmark_checkpoint.py +69 -0
- overlay/scripts/benchmark_checkpoint_report.py +50 -0
- overlay/scripts/benchmark_contract.py +67 -0
- overlay/scripts/benchmark_datasets.py +18 -0
- overlay/scripts/benchmark_preflight.py +31 -0
- overlay/scripts/benchmark_runner.py +248 -0
- overlay/scripts/benchmark_suite.py +84 -0
- overlay/scripts/bootstrap_benchmark_env.py +63 -0
- overlay/scripts/bootstrap_benchmark_runtime.py +42 -0
- overlay/scripts/cycle_executor.py +312 -0
- overlay/scripts/export_hpo_priors.py +46 -26
- overlay/scripts/hpo_component_report.py +130 -0
- overlay/scripts/hpo_leaderboard.py +156 -0
- overlay/scripts/hpo_retest.py +151 -0
- overlay/scripts/hydra_generation.py +180 -0
- overlay/scripts/launch_benchmark_hf_job.py +157 -0
- overlay/scripts/optuna_hpo.py +311 -98
- overlay/scripts/run_cycle1a.py +46 -0
- overlay/scripts/sweep_depth_aggregate.py +43 -0
- overlay/scripts/watch_benchmark_hf_job.py +33 -0
__pycache__/entrypoint.cpython-312.pyc
CHANGED
|
Binary files a/__pycache__/entrypoint.cpython-312.pyc and b/__pycache__/entrypoint.cpython-312.pyc differ
|
|
|
entrypoint.py
CHANGED
|
@@ -110,7 +110,7 @@ def _start_health_server() -> HTTPServer:
|
|
| 110 |
return server
|
| 111 |
|
| 112 |
|
| 113 |
-
def upload_artifact(api: HfApi, path: Path, dest: str) -> None:
|
| 114 |
if not path.exists():
|
| 115 |
print(f'[upload] skip missing {path}', flush=True)
|
| 116 |
return
|
|
@@ -120,7 +120,20 @@ def upload_artifact(api: HfApi, path: Path, dest: str) -> None:
|
|
| 120 |
repo_id=OUTPUT_REPO,
|
| 121 |
repo_type='model',
|
| 122 |
)
|
| 123 |
-
print(f'[upload] uploaded {path} -> {OUTPUT_REPO}/{dest}', flush=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
|
| 126 |
def _wait_for_cuda_ready(timeout_s: int = 120) -> None:
|
|
@@ -158,7 +171,7 @@ def _wait_for_cuda_ready(timeout_s: int = 120) -> None:
|
|
| 158 |
print(f'[job] CUDA still not ready after {timeout_s}s — continuing anyway (training will likely fail)', flush=True)
|
| 159 |
|
| 160 |
|
| 161 |
-
def run_job_mode() -> int:
|
| 162 |
os.chdir(REPO_ROOT)
|
| 163 |
os.environ.setdefault('HYDRA_TIME_BUDGET', '43200')
|
| 164 |
os.environ.setdefault('HYDRA_TARGET_SHARDS', '2048')
|
|
@@ -203,7 +216,31 @@ def run_job_mode() -> int:
|
|
| 203 |
else:
|
| 204 |
print('[upload] HF_TOKEN not set; skipping artifact upload', flush=True)
|
| 205 |
|
| 206 |
-
return proc.returncode
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
|
| 208 |
|
| 209 |
def run_space_mode() -> int:
|
|
@@ -217,10 +254,12 @@ def run_space_mode() -> int:
|
|
| 217 |
server.server_close()
|
| 218 |
|
| 219 |
|
| 220 |
-
def main() -> int:
|
| 221 |
-
if RUNTIME_MODE == 'job':
|
| 222 |
-
return run_job_mode()
|
| 223 |
-
|
|
|
|
|
|
|
| 224 |
|
| 225 |
|
| 226 |
if __name__ == '__main__':
|
|
|
|
| 110 |
return server
|
| 111 |
|
| 112 |
|
| 113 |
+
def upload_artifact(api: HfApi, path: Path, dest: str) -> None:
|
| 114 |
if not path.exists():
|
| 115 |
print(f'[upload] skip missing {path}', flush=True)
|
| 116 |
return
|
|
|
|
| 120 |
repo_id=OUTPUT_REPO,
|
| 121 |
repo_type='model',
|
| 122 |
)
|
| 123 |
+
print(f'[upload] uploaded {path} -> {OUTPUT_REPO}/{dest}', flush=True)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def build_benchmark_mode_command() -> list[str]:
    """Assemble the argv list that launches ``scripts/benchmark_runner.py``.

    The benchmark name, variant and seed are taken from the
    ``HYDRA_BENCHMARK_NAME``, ``HYDRA_BENCHMARK_VARIANT`` and ``HYDRA_SEED``
    environment variables, defaulting to ``GSM8K``, ``hydra_full`` and ``42``.
    The result and ledger paths both live directly under ``REPO_ROOT``.

    Returns:
        The command as a list of strings, suitable for ``subprocess.run``.
    """
    env = os.environ
    runner_script = REPO_ROOT / 'scripts' / 'benchmark_runner.py'
    # Insertion order of this dict is the order of the flags on the command line.
    options = {
        '--benchmark': env.get('HYDRA_BENCHMARK_NAME', 'GSM8K'),
        '--generator-mode': 'hydra',
        '--variant': env.get('HYDRA_BENCHMARK_VARIANT', 'hydra_full'),
        '--seed': env.get('HYDRA_SEED', '42'),
        '--out': str(REPO_ROOT / 'benchmark_result.json'),
        '--ledger': str(REPO_ROOT / 'benchmark_ledger.json'),
    }
    argv = ['python', str(runner_script)]
    for flag, value in options.items():
        argv.extend((flag, value))
    return argv
|
| 137 |
|
| 138 |
|
| 139 |
def _wait_for_cuda_ready(timeout_s: int = 120) -> None:
|
|
|
|
| 171 |
print(f'[job] CUDA still not ready after {timeout_s}s — continuing anyway (training will likely fail)', flush=True)
|
| 172 |
|
| 173 |
|
| 174 |
+
def run_job_mode() -> int:
|
| 175 |
os.chdir(REPO_ROOT)
|
| 176 |
os.environ.setdefault('HYDRA_TIME_BUDGET', '43200')
|
| 177 |
os.environ.setdefault('HYDRA_TARGET_SHARDS', '2048')
|
|
|
|
| 216 |
else:
|
| 217 |
print('[upload] HF_TOKEN not set; skipping artifact upload', flush=True)
|
| 218 |
|
| 219 |
+
return proc.returncode
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def run_benchmark_mode() -> int:
    """Run the benchmark runner as a subprocess and upload its artifacts.

    Changes the working directory to ``REPO_ROOT``, runs the command built by
    ``build_benchmark_mode_command()`` and, when ``TOKEN`` is set, uploads
    ``benchmark_result.json`` and ``benchmark_ledger.json`` to ``OUTPUT_REPO``
    under ``jobs/{JOB_ID}/``.

    Uploading is best-effort: repo-creation and upload failures are logged and
    never change the return value. Each artifact is uploaded in its own
    ``try`` so that a failure on one file does not skip the other.

    Returns:
        The benchmark subprocess's exit code.
    """
    os.chdir(REPO_ROOT)
    cmd = build_benchmark_mode_command()
    print(f'[benchmark] command={cmd}', flush=True)
    # check=False: a failing benchmark must still reach the upload step below.
    proc = subprocess.run(cmd, check=False)

    if not TOKEN:
        print('[upload] HF_TOKEN not set; skipping benchmark artifact upload', flush=True)
        return proc.returncode

    api = HfApi(token=TOKEN)
    try:
        api.create_repo(repo_id=OUTPUT_REPO, repo_type='model', private=True, exist_ok=True)
    except Exception as e:
        print(f'[upload] create_repo warning: {type(e).__name__}: {e}', flush=True)

    prefix = f'jobs/{JOB_ID}'
    for filename in ('benchmark_result.json', 'benchmark_ledger.json'):
        try:
            upload_artifact(api, REPO_ROOT / filename, f'{prefix}/{filename}')
        except Exception as e:
            print(f'[upload] upload warning: {type(e).__name__}: {e}', flush=True)

    return proc.returncode
|
| 244 |
|
| 245 |
|
| 246 |
def run_space_mode() -> int:
|
|
|
|
| 254 |
server.server_close()
|
| 255 |
|
| 256 |
|
| 257 |
+
def main() -> int:
    """Run the entrypoint selected by ``RUNTIME_MODE`` and return its exit code.

    ``'job'`` selects ``run_job_mode`` and ``'benchmark'`` selects
    ``run_benchmark_mode``; any other value falls through to
    ``run_space_mode``.
    """
    runners = {
        'job': run_job_mode,
        'benchmark': run_benchmark_mode,
    }
    selected = runners.get(RUNTIME_MODE, run_space_mode)
    return selected()
|
| 263 |
|
| 264 |
|
| 265 |
if __name__ == '__main__':
|
overlay/hydra/model.py
CHANGED
|
@@ -32,11 +32,28 @@ from __future__ import annotations
|
|
| 32 |
|
| 33 |
import os
|
| 34 |
|
| 35 |
-
import torch
|
| 36 |
-
import torch.nn as nn
|
| 37 |
-
import torch.nn.functional as F
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
|
| 42 |
def _ensure_triton_cuda_backend_registered() -> None:
|
|
@@ -81,9 +98,30 @@ from hydra.hyena_block import HyenaBlock
|
|
| 81 |
from hydra.optimizer import MuonAdamW
|
| 82 |
|
| 83 |
|
| 84 |
-
def norm(x: torch.Tensor) -> torch.Tensor:
|
| 85 |
-
"""RMSNorm over the last dim — stateless, autocast-friendly."""
|
| 86 |
-
return F.rms_norm(x, (x.size(-1),))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
|
| 89 |
class PostSemClawModel(nn.Module):
|
|
@@ -103,6 +141,7 @@ class PostSemClawModel(nn.Module):
|
|
| 103 |
_ensure_triton_cuda_backend_registered()
|
| 104 |
self.config = config
|
| 105 |
self._throughput_mode = os.environ.get("HYDRA_THROUGHPUT_MODE", "0") == "1"
|
|
|
|
| 106 |
|
| 107 |
# Token embedding
|
| 108 |
self.wte = nn.Embedding(config.vocab_size, config.d_model)
|
|
@@ -124,23 +163,31 @@ class PostSemClawModel(nn.Module):
|
|
| 124 |
print(f"[WARN] layers in both hyena_layers and gdn_layers; using Hyena: {sorted(_both)}", flush=True)
|
| 125 |
_gdn_layer_set -= _hyena_layer_set
|
| 126 |
|
| 127 |
-
if _gdn_layer_set:
|
| 128 |
-
from hydra.gdn_block import GDNBlock # requires `fla` package
|
| 129 |
-
|
| 130 |
-
def _build_block(i: int) -> nn.Module:
|
| 131 |
-
if
|
| 132 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
d_model=config.d_model,
|
| 134 |
seq_len=config.sequence_len,
|
| 135 |
order=int(os.environ.get("HYDRA_HYENA_ORDER", "2")),
|
| 136 |
filter_order=int(os.environ.get("HYDRA_HYENA_FILTER_DIM", "64")),
|
| 137 |
)
|
| 138 |
-
if i in _gdn_layer_set:
|
| 139 |
-
return GDNBlock(
|
| 140 |
-
d_model=config.d_model,
|
| 141 |
-
n_heads=config.n_heads,
|
| 142 |
-
)
|
| 143 |
-
|
|
|
|
| 144 |
d_model=config.d_model,
|
| 145 |
d_state=config.d_state,
|
| 146 |
expand=config.expand,
|
|
|
|
| 32 |
|
| 33 |
import os
|
| 34 |
|
| 35 |
+
import torch
|
| 36 |
+
import torch.nn as nn
|
| 37 |
+
import torch.nn.functional as F
|
| 38 |
+
|
| 39 |
+
try:
|
| 40 |
+
from mamba_ssm import Mamba3
|
| 41 |
+
except Exception: # pragma: no cover - depends on optional runtime install
|
| 42 |
+
Mamba3 = None # type: ignore[assignment]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _get_mamba3_cls():
    """Return the ``Mamba3`` class, importing ``mamba_ssm`` on first need.

    The module-level ``Mamba3`` global is reused when it is already set.
    Otherwise the import is attempted here and the global is updated on
    success.

    Raises:
        ImportError: if ``mamba_ssm`` cannot be imported; the original
            exception is chained as the cause.
    """
    global Mamba3
    if Mamba3 is not None:
        return Mamba3
    try:
        from mamba_ssm import Mamba3 as _Mamba3  # type: ignore
    except Exception as exc:  # pragma: no cover - environment dependent
        raise ImportError(
            "mamba_ssm is required for Mamba-based HYDRA blocks. "
            "Install mamba-ssm or use HYDRA_BASELINE_ARCH=transformer."
        ) from exc
    Mamba3 = _Mamba3  # type: ignore[assignment]
    return Mamba3
|
| 57 |
|
| 58 |
|
| 59 |
def _ensure_triton_cuda_backend_registered() -> None:
|
|
|
|
| 98 |
from hydra.optimizer import MuonAdamW
|
| 99 |
|
| 100 |
|
| 101 |
+
def norm(x: torch.Tensor) -> torch.Tensor:
|
| 102 |
+
"""RMSNorm over the last dim — stateless, autocast-friendly."""
|
| 103 |
+
return F.rms_norm(x, (x.size(-1),))
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class TransformerBaselineBlock(nn.Module):
    """Transformer-style delta block for matched baseline experiments.

    This block returns a transformed delta tensor rather than owning the outer
    residual connection, because ManifoldHyperConnection already handles stream
    mixing and residual injection around the block function.

    Self-attention is causally masked: position ``t`` only attends to
    positions ``<= t``. Without the mask every position would see future
    tokens, which leaks the targets of a next-token objective and makes the
    baseline's loss incomparable with the causal blocks it is matched against.
    """

    def __init__(self, d_model: int, n_heads: int, expand: int, dropout: float) -> None:
        """Build the attention and feed-forward sub-layers.

        Args:
            d_model: Width of the input and output features.
            n_heads: Number of attention heads.
            expand: Feed-forward hidden width as a multiple of ``d_model``.
            dropout: Dropout probability, used inside attention and on the output.
        """
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, batch_first=True)
        self.ff_in = nn.Linear(d_model, expand * d_model, bias=False)
        self.ff_out = nn.Linear(expand * d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the block's delta for ``x`` (batch-first, sequence on dim -2)."""
        seq_len = x.size(-2)
        # Boolean mask, True = "may not attend": the strict upper triangle
        # blocks every key position that lies after the query position.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, device=x.device), diagonal=1
        ).bool()
        attn_out, _ = self.self_attn(x, x, x, attn_mask=causal_mask, need_weights=False)
        ff = self.ff_out(F.gelu(self.ff_in(attn_out)))
        return self.dropout(attn_out + ff)
|
| 125 |
|
| 126 |
|
| 127 |
class PostSemClawModel(nn.Module):
|
|
|
|
| 141 |
_ensure_triton_cuda_backend_registered()
|
| 142 |
self.config = config
|
| 143 |
self._throughput_mode = os.environ.get("HYDRA_THROUGHPUT_MODE", "0") == "1"
|
| 144 |
+
self._baseline_arch = os.environ.get("HYDRA_BASELINE_ARCH", "mamba3").strip().lower()
|
| 145 |
|
| 146 |
# Token embedding
|
| 147 |
self.wte = nn.Embedding(config.vocab_size, config.d_model)
|
|
|
|
| 163 |
print(f"[WARN] layers in both hyena_layers and gdn_layers; using Hyena: {sorted(_both)}", flush=True)
|
| 164 |
_gdn_layer_set -= _hyena_layer_set
|
| 165 |
|
| 166 |
+
if _gdn_layer_set:
|
| 167 |
+
from hydra.gdn_block import GDNBlock # requires `fla` package
|
| 168 |
+
|
| 169 |
+
def _build_block(i: int) -> nn.Module:
|
| 170 |
+
if self._baseline_arch == "transformer":
|
| 171 |
+
return TransformerBaselineBlock(
|
| 172 |
+
d_model=config.d_model,
|
| 173 |
+
n_heads=config.n_heads,
|
| 174 |
+
expand=config.expand,
|
| 175 |
+
dropout=float(os.environ.get("HYDRA_DROPOUT", "0.2")),
|
| 176 |
+
)
|
| 177 |
+
if i in _hyena_layer_set:
|
| 178 |
+
return HyenaBlock(
|
| 179 |
d_model=config.d_model,
|
| 180 |
seq_len=config.sequence_len,
|
| 181 |
order=int(os.environ.get("HYDRA_HYENA_ORDER", "2")),
|
| 182 |
filter_order=int(os.environ.get("HYDRA_HYENA_FILTER_DIM", "64")),
|
| 183 |
)
|
| 184 |
+
if i in _gdn_layer_set:
|
| 185 |
+
return GDNBlock(
|
| 186 |
+
d_model=config.d_model,
|
| 187 |
+
n_heads=config.n_heads,
|
| 188 |
+
)
|
| 189 |
+
mamba3_cls = _get_mamba3_cls()
|
| 190 |
+
return mamba3_cls(
|
| 191 |
d_model=config.d_model,
|
| 192 |
d_state=config.d_state,
|
| 193 |
expand=config.expand,
|
overlay/hydra/training.py
CHANGED
|
@@ -4,17 +4,20 @@ Extracted from the monolithic train.py (W1 modularization). Semantics
|
|
| 4 |
preserved. Public entrypoint: `main()`.
|
| 5 |
"""
|
| 6 |
|
| 7 |
-
from __future__ import annotations
|
| 8 |
-
|
| 9 |
-
import gc
|
| 10 |
-
import
|
| 11 |
-
import
|
| 12 |
-
import
|
| 13 |
-
import
|
| 14 |
-
import
|
| 15 |
-
import
|
| 16 |
-
|
| 17 |
-
from
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
import torch
|
| 20 |
|
|
@@ -130,7 +133,7 @@ def _ckpt_snapshot_state_dicts(
|
|
| 130 |
return msd, osd
|
| 131 |
|
| 132 |
|
| 133 |
-
def save_ckpt(
|
| 134 |
model: PostSemClawModel,
|
| 135 |
optimizer: torch.optim.Optimizer,
|
| 136 |
config: PostSemClawConfig,
|
|
@@ -211,11 +214,233 @@ def save_ckpt(
|
|
| 211 |
target=_write, daemon=True, name=f"ckpt-save-{step}"
|
| 212 |
)
|
| 213 |
_CKPT_WORKER_THREAD.start()
|
| 214 |
-
except Exception as e:
|
| 215 |
-
print(f"[ckpt] SNAPSHOT FAILED {path}: {type(e).__name__}: {e}", flush=True)
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
"""Reconstruct a PostSemClawConfig from a checkpoint's asdict() payload.
|
| 220 |
|
| 221 |
Newly-added fields (e.g. `hyena_layers`) are defaulted when absent in
|
|
@@ -275,14 +500,14 @@ def _try_load_ckpt(path: Path, model, optimizer, device):
|
|
| 275 |
return step, total_training_time, smooth_train_loss, bpt_ema, epoch
|
| 276 |
|
| 277 |
|
| 278 |
-
def maybe_resume_ckpt(
|
| 279 |
-
model: PostSemClawModel,
|
| 280 |
-
optimizer: torch.optim.Optimizer,
|
| 281 |
-
device: torch.device,
|
| 282 |
-
) -> tuple[int, float, float, float, int]:
|
| 283 |
-
if not RESUME_CKPT or RESUME_CKPT.lower() == "none":
|
| 284 |
-
print("[ckpt] resume disabled; starting fresh", flush=True)
|
| 285 |
-
return 0, 0.0, 0.0, 0.0, 0
|
| 286 |
|
| 287 |
resume_path = Path(os.path.expanduser(RESUME_CKPT))
|
| 288 |
# Try the primary path, then rotated backups. This is crucial because a
|
|
@@ -296,17 +521,18 @@ def maybe_resume_ckpt(
|
|
| 296 |
if not cand.exists():
|
| 297 |
continue
|
| 298 |
try:
|
| 299 |
-
result = _try_load_ckpt(cand, model, optimizer, device)
|
| 300 |
-
if result is not None:
|
| 301 |
-
if cand != resume_path:
|
| 302 |
-
print(f"[ckpt] fell back to rotation {cand.name}", flush=True)
|
| 303 |
-
|
|
|
|
| 304 |
except Exception as e:
|
| 305 |
print(f"[ckpt] {cand.name} load failed: {type(e).__name__}: {e}", flush=True)
|
| 306 |
continue
|
| 307 |
|
| 308 |
-
print(f"[ckpt] no usable checkpoint in {resume_path} + rotations; starting fresh", flush=True)
|
| 309 |
-
return 0, 0.0, 0.0, 0.0, 0
|
| 310 |
|
| 311 |
|
| 312 |
# ---------------------------------------------------------------------------
|
|
@@ -388,9 +614,18 @@ def main() -> None:
|
|
| 388 |
weight_decay=WEIGHT_DECAY,
|
| 389 |
)
|
| 390 |
|
| 391 |
-
step, total_training_time, smooth_train_loss, bpt_ema, resume_epoch = maybe_resume_ckpt(
|
| 392 |
-
model, optimizer, device,
|
| 393 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 394 |
|
| 395 |
# Learnability #4: inform the model of the BOS token id so it can mask
|
| 396 |
# doc-separator positions in packed sequences. Always set (the mask only
|
|
@@ -785,10 +1020,22 @@ def main() -> None:
|
|
| 785 |
# does not benefit from overlap with backward). HYDRA_EVAL_TOKENS controls
|
| 786 |
# how many val tokens to sweep (default 2 M, short enough for autoresearch
|
| 787 |
# 5-min budgets).
|
| 788 |
-
val_bpb: float | None = None
|
| 789 |
-
|
| 790 |
-
|
| 791 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 792 |
# Aggressive VRAM reclaim for 6GB cards. Peak training VRAM = 5.1GB
|
| 793 |
# which leaves < 1GB for the eval forward — the driver can't satisfy
|
| 794 |
# the allocation. Free EVERY tensor we don't strictly need:
|
|
@@ -810,34 +1057,70 @@ def main() -> None:
|
|
| 810 |
model._last_sdr = None
|
| 811 |
import gc as _gc
|
| 812 |
_gc.collect()
|
| 813 |
-
torch.cuda.empty_cache()
|
| 814 |
-
torch.cuda.synchronize()
|
| 815 |
-
try:
|
| 816 |
-
_free_mb = torch.cuda.mem_get_info()[0] / 1024 / 1024
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 841 |
|
| 842 |
# Final ckpts with val_bpb filled in (if eval succeeded).
|
| 843 |
save_ckpt(
|
|
@@ -881,8 +1164,13 @@ def main() -> None:
|
|
| 881 |
/ total_training_time / GPU_BF16_PEAK_FLOPS
|
| 882 |
if total_training_time > 0 else 0
|
| 883 |
)
|
| 884 |
-
peak_vram_mb = torch.cuda.max_memory_allocated() / 1024 / 1024
|
| 885 |
-
metrics = model.get_secondary_metrics()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 886 |
|
| 887 |
print("---")
|
| 888 |
print(f"val_bpb: {val_bpb:.6f}" if val_bpb is not None else "val_bpb: SKIPPED")
|
|
@@ -918,22 +1206,28 @@ def main() -> None:
|
|
| 918 |
# Emit full metrics dictionary as JSON for sweep aggregation. Path from
|
| 919 |
# HYDRA_METRICS_OUT env var; default=/tmp/hydra_run_metrics.json. Always
|
| 920 |
# written (even without diagnostics) so the aggregator can compare runs.
|
| 921 |
-
_metrics_out = os.environ.get("HYDRA_METRICS_OUT", "/tmp/hydra_run_metrics.json")
|
| 922 |
-
try:
|
| 923 |
-
_dump =
|
| 924 |
-
|
| 925 |
-
|
| 926 |
-
|
| 927 |
-
|
| 928 |
-
|
| 929 |
-
|
| 930 |
-
|
| 931 |
-
|
| 932 |
-
|
| 933 |
-
|
| 934 |
-
|
| 935 |
-
|
| 936 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 937 |
with open(_metrics_out, 'w') as _f:
|
| 938 |
json.dump(_dump, _f, indent=2, sort_keys=True)
|
| 939 |
print(f"[METRICS] wrote {_metrics_out}", flush=True)
|
|
|
|
| 4 |
preserved. Public entrypoint: `main()`.
|
| 5 |
"""
|
| 6 |
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import gc
|
| 10 |
+
import hashlib
|
| 11 |
+
import json
|
| 12 |
+
import math
|
| 13 |
+
import os
|
| 14 |
+
import sys
|
| 15 |
+
import threading
|
| 16 |
+
import time
|
| 17 |
+
from collections.abc import Mapping
|
| 18 |
+
from dataclasses import asdict
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from typing import Any
|
| 21 |
|
| 22 |
import torch
|
| 23 |
|
|
|
|
| 133 |
return msd, osd
|
| 134 |
|
| 135 |
|
| 136 |
+
def save_ckpt(
|
| 137 |
model: PostSemClawModel,
|
| 138 |
optimizer: torch.optim.Optimizer,
|
| 139 |
config: PostSemClawConfig,
|
|
|
|
| 214 |
target=_write, daemon=True, name=f"ckpt-save-{step}"
|
| 215 |
)
|
| 216 |
_CKPT_WORKER_THREAD.start()
|
| 217 |
+
except Exception as e:
|
| 218 |
+
print(f"[ckpt] SNAPSHOT FAILED {path}: {type(e).__name__}: {e}", flush=True)
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def _env_flag_enabled(env: Mapping[str, str], key: str) -> bool:
    """Interpret ``env[key]`` as a boolean flag.

    A missing or falsy value counts as ``"0"``. After stripping and
    lower-casing, the flag is off for ``""``, ``"0"``, ``"false"``, ``"no"``
    and ``"off"``, and on for anything else.
    """
    raw = env.get(key, "0")
    normalized = str(raw if raw else "0").strip().lower()
    disabled_spellings = ("", "0", "false", "no", "off")
    return normalized not in disabled_spellings
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def _env_int(env: Mapping[str, str], key: str, default: int) -> int:
    """Read ``env[key]`` as an integer, returning ``default`` when unusable.

    A missing or falsy value is replaced by ``str(default)`` before parsing;
    a value that ``int()`` rejects with ``ValueError`` yields ``default``.
    """
    fallback = str(default)
    raw = env.get(key, fallback) or fallback
    try:
        parsed = int(str(raw))
    except ValueError:
        return default
    return parsed
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def architecture_compliance_payload(env: Mapping[str, str]) -> dict[str, bool | int | str]:
    """Summarize which architecture-altering switches are set in ``env``.

    ``full_arch_compliant`` is True only when none of the boolean switches is
    enabled and ``HYDRA_HYENA_LAYERS`` is blank. The two subsample values are
    reported but do not influence ``full_arch_compliant``.

    Returns:
        A dict with ``full_arch_compliant``, one entry per boolean switch,
        the raw ``hyena_layers`` string and the two subsample integers.
    """
    # Payload key -> environment variable; dict order is the payload order.
    switch_vars = {
        'throughput_mode': "HYDRA_THROUGHPUT_MODE",
        'fastpath': "HYDRA_FASTPATH",
        'force_htm_cpu': "HYDRA_FORCE_HTM_CPU",
        'inert_mamba': "HYDRA_INERT_MAMBA",
        'synthetic_retina': "HYDRA_ALLOW_SYNTHETIC_RETINA",
    }
    switches = {name: _env_flag_enabled(env, var) for name, var in switch_vars.items()}
    hyena_layers = str(env.get("HYDRA_HYENA_LAYERS", "") or "")
    engram_subsample = _env_int(env, "HYDRA_ENGRAM_SUBSAMPLE", 1)
    htm_subsample = _env_int(env, "HYDRA_HTM_SUBSAMPLE", 1)

    deviates = any(switches.values()) or bool(hyena_layers.strip())
    return {
        'full_arch_compliant': not deviates,
        **switches,
        'hyena_layers': hyena_layers,
        'engram_subsample': engram_subsample,
        'htm_subsample': htm_subsample,
    }
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def eval_attempt_batches(*, requested_batch: int, min_batch: int) -> list[int]:
    """List the eval batch sizes to try, from largest to smallest.

    Both arguments are clamped to at least 1. The list starts at the requested
    size and halves (integer division) until it reaches the minimum, which is
    always the last entry. If the requested size is below the minimum, the
    result is just ``[minimum]``.
    """
    start = max(1, int(requested_batch))
    floor = max(1, int(min_batch))
    if start < floor:
        return [floor]

    sizes = [start]
    while sizes[-1] > floor:
        # Halve, but never step below the floor so it is always the final entry.
        sizes.append(max(floor, sizes[-1] // 2))
    return sizes
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
def build_eval_plan(*, eval_tokens: int, requested_batch: int, max_seq_len: int, chunk_tokens: int, min_batch: int) -> dict[str, Any]:
    """Describe how the final evaluation sweep will be chunked and batched.

    The effective chunk size is the larger of ``chunk_tokens`` and one full
    batch (``requested_batch * max_seq_len``), clamped to at least 1 so that
    zero or negative inputs cannot cause a division by zero or a negative
    chunk size in the report.

    Returns:
        A dict with ``eval_tokens``, ``eval_requested_batch``,
        ``eval_chunk_tokens``, ``eval_chunk_count`` (at least 1),
        ``eval_attempt_batches`` (from ``eval_attempt_batches()``) and
        ``eval_min_batch`` (at least 1).
    """
    effective_chunk_tokens = max(
        1,
        int(chunk_tokens),
        int(requested_batch) * int(max_seq_len),
    )
    chunk_count = max(1, math.ceil(int(eval_tokens) / effective_chunk_tokens))
    return {
        'eval_tokens': int(eval_tokens),
        'eval_requested_batch': int(requested_batch),
        'eval_chunk_tokens': int(effective_chunk_tokens),
        'eval_chunk_count': int(chunk_count),
        'eval_attempt_batches': eval_attempt_batches(requested_batch=requested_batch, min_batch=min_batch),
        'eval_min_batch': int(max(1, min_batch)),
    }
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
def _fingerprint_descriptor(descriptor: Mapping[str, Any]) -> str:
    """Return a short, stable fingerprint of a JSON-serializable descriptor.

    The descriptor is serialized as compact JSON with sorted keys, so key
    order does not affect the result, and the first 12 hex characters of its
    SHA-1 digest are returned.

    Raises:
        TypeError: if the descriptor contains a value ``json.dumps`` cannot
            serialize.
    """
    payload = json.dumps(dict(descriptor), sort_keys=True, separators=(",", ":"))
    # The digest is a content fingerprint, not a security primitive. Marking it
    # so lets it run on restricted (e.g. FIPS) builds; the output is unchanged.
    digest = hashlib.sha1(payload.encode("utf-8"), usedforsecurity=False)
    return digest.hexdigest()[:12]
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def dataset_domain_payload(*, env: Mapping[str, str], prepare_module: Any, nemotron_module: Any | None) -> dict[str, Any]:
    """Describe and fingerprint the training and validation data domains.

    When ``HYDRA_USE_NEMOTRON`` is enabled and ``nemotron_module`` is given,
    the descriptors are built from that module's blend weights. Otherwise they
    are built from the parquet shards listed by ``prepare_module``.

    Args:
        env: Environment mapping the ``HYDRA_*`` switches are read from.
        prepare_module: Object providing ``VOCAB_SIZE`` and, for the parquet
            backend, ``list_parquet_files`` and ``VAL_FILENAME``. Missing
            attributes fall back to ``0``, no files and ``""``.
        nemotron_module: Object providing ``FULL_BLEND_WEIGHTS``,
            ``PHASE1_WEIGHTS`` and ``PHASE2_WEIGHTS``, or ``None``.

    Returns:
        A dict with ``data_backend``, both descriptors, both fingerprints and
        ``train_val_domain_match``.
    """
    use_nemotron = _env_flag_enabled(env, "HYDRA_USE_NEMOTRON")
    vocab_size = int(getattr(prepare_module, "VOCAB_SIZE", 0))

    if use_nemotron and nemotron_module is not None:
        use_full_blend = _env_flag_enabled(env, "HYDRA_USE_FULL_BLEND")
        phase = str(env.get("HYDRA_NEMOTRON_PHASE", "phase1") or "phase1").strip().lower()
        if use_full_blend:
            train_weights = dict(getattr(nemotron_module, "FULL_BLEND_WEIGHTS", {}))
            val_weights = dict(train_weights)
        else:
            train_weights = dict(
                getattr(nemotron_module, "PHASE2_WEIGHTS", {}) if phase == "phase2" else getattr(nemotron_module, "PHASE1_WEIGHTS", {})
            )
            val_weights = {"Nemotron-Pretraining-Multiple-Choice": 1.0}
        train_descriptor = {
            "backend": "nemotron_stream",
            "phase": "full_blend" if use_full_blend else phase,
            "weights": train_weights,
            "factual_inject_rate": _env_int(env, "HYDRA_FACTUAL_INJECT_RATE", 50),
            "vocab_size": vocab_size,
        }
        val_descriptor = {
            "backend": "nemotron_stream",
            "phase": "full_blend" if use_full_blend else "val_multiple_choice",
            "weights": val_weights,
            "vocab_size": vocab_size,
        }
        data_backend = "nemotron_stream"
    else:
        all_files = [str(path) for path in getattr(prepare_module, "list_parquet_files", lambda: [])()]
        val_filename = str(getattr(prepare_module, "VAL_FILENAME", ""))

        def _is_val_shard(path: str) -> bool:
            # str.endswith("") is True for every path, so a missing or empty
            # VAL_FILENAME must not classify every shard as validation.
            return bool(val_filename) and path.endswith(val_filename)

        train_files = [path for path in all_files if not _is_val_shard(path)]
        val_files = [path for path in all_files if _is_val_shard(path)]
        train_descriptor = {
            "backend": "climbmix_parquet",
            "train_shard_count": len(train_files),
            "train_shard_examples": sorted(Path(path).name for path in train_files[:3]),
            "vocab_size": vocab_size,
        }
        val_descriptor = {
            "backend": "climbmix_parquet",
            "val_filename": val_filename,
            "val_shard_count": len(val_files),
            "vocab_size": vocab_size,
        }
        data_backend = "climbmix_parquet"

    train_fingerprint = _fingerprint_descriptor(train_descriptor)
    val_fingerprint = _fingerprint_descriptor(val_descriptor)
    # NOTE(review): in both branches the train and val descriptors have
    # different key sets, so the fingerprints cannot be equal and
    # ``train_val_domain_match`` is always False as written. Confirm whether
    # the match should be derived from the shared fields instead.
    return {
        "data_backend": data_backend,
        "train_domain_descriptor": train_descriptor,
        "val_domain_descriptor": val_descriptor,
        "train_domain_fingerprint": train_fingerprint,
        "val_domain_fingerprint": val_fingerprint,
        "train_val_domain_match": train_fingerprint == val_fingerprint,
    }
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
def build_lineage_payload(
    *,
    env: Mapping[str, str],
    seed: int,
    resume_requested: bool,
    resume_requested_path: str | None,
    resume_loaded_path: str | None,
    resume_step: int,
    resume_epoch: int,
) -> dict[str, Any]:
    """Record how this run relates to earlier checkpoints.

    A resume counts as applied only when a checkpoint path was actually loaded
    and the restored step is greater than zero. ``lineage_mode`` is then
    ``"warmstart_resume"`` if ``HYDRA_WARMSTART`` is enabled in ``env``,
    otherwise ``"resume"``; with no applied resume it is ``"fresh"``.

    Returns:
        A dict with the seed, the warmstart flag, the resume request and
        outcome fields, and ``lineage_mode``.
    """
    warmstart = _env_flag_enabled(env, "HYDRA_WARMSTART")
    resume_applied = resume_loaded_path is not None and int(resume_step) > 0

    if not resume_applied:
        lineage_mode = "fresh"
    elif warmstart:
        lineage_mode = "warmstart_resume"
    else:
        lineage_mode = "resume"

    payload: dict[str, Any] = {}
    payload["seed"] = int(seed)
    payload["warmstart"] = warmstart
    payload["resume_requested"] = bool(resume_requested)
    payload["resume_applied"] = resume_applied
    payload["resume_requested_path"] = resume_requested_path
    payload["resume_loaded_path"] = resume_loaded_path
    payload["resume_step"] = int(resume_step)
    payload["resume_epoch"] = int(resume_epoch)
    payload["lineage_mode"] = lineage_mode
    return payload
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
def build_final_metrics_payload(
    *,
    secondary_metrics: dict[str, Any],
    val_bpb: float | None,
    val_ppl: float | None,
    eval_status: str,
    eval_error: str | None,
    n_layer: int,
    d_model: int,
    num_params: int,
    step: int,
    total_tokens: int,
    peak_vram_mb: float,
    total_training_time: float,
    sdr_target_active: int,
    architecture_env: Mapping[str, str] | None = None,
    eval_diagnostics: Mapping[str, Any] | None = None,
    domain_fingerprints: Mapping[str, Any] | None = None,
    lineage_payload: Mapping[str, Any] | None = None,
) -> dict[str, Any]:
    """Build final run metrics without conflating skipped eval and validation.

    This helper deliberately preserves ``val_bpb=None`` when final eval did not
    complete. HPO can then prune or explicitly label a fallback instead of
    accidentally treating live training BPB as validation BPB.

    The payload starts as a copy of ``secondary_metrics`` and is then updated,
    in order, with the core run fields, the architecture-compliance fields and
    the optional ``eval_diagnostics``, ``domain_fingerprints`` and
    ``lineage_payload`` mappings. Later updates overwrite earlier keys.

    Args:
        architecture_env: Environment mapping for the compliance fields.
            ``None`` means the current process environment. An explicitly
            passed mapping is used as given, even when it is empty.

    Returns:
        The merged metrics dict; the input mappings are not mutated.
    """
    payload = dict(secondary_metrics)
    payload.update({
        'eval_status': eval_status,
        'eval_error': eval_error,
        'objective_source': 'final_val' if val_bpb is not None else 'missing_final_val',
        'val_bpb': float(val_bpb) if val_bpb is not None else None,
        'val_ppl': float(val_ppl) if val_ppl is not None else None,
        'n_layer': int(n_layer),
        'd_model': int(d_model),
        'num_params_M': float(num_params / 1e6),
        'num_steps': int(step),
        'total_tokens_M': float(total_tokens / 1e6),
        'peak_vram_mb': float(peak_vram_mb),
        'training_seconds': float(total_training_time),
        'sdr_target_active': int(sdr_target_active),
    })
    # Test for None rather than truthiness: an empty mapping is a valid
    # "no switches set" environment and must not be replaced by os.environ.
    if architecture_env is None:
        architecture_env = dict(os.environ)
    payload.update(architecture_compliance_payload(architecture_env))
    if eval_diagnostics:
        payload.update(dict(eval_diagnostics))
    if domain_fingerprints:
        payload.update(dict(domain_fingerprints))
    if lineage_payload:
        payload.update(dict(lineage_payload))
    return payload
|
| 441 |
+
|
| 442 |
+
|
| 443 |
+
def config_from_dict(cfg_dict: dict) -> PostSemClawConfig:
|
| 444 |
"""Reconstruct a PostSemClawConfig from a checkpoint's asdict() payload.
|
| 445 |
|
| 446 |
Newly-added fields (e.g. `hyena_layers`) are defaulted when absent in
|
|
|
|
| 500 |
return step, total_training_time, smooth_train_loss, bpt_ema, epoch
|
| 501 |
|
| 502 |
|
| 503 |
+
def maybe_resume_ckpt(
|
| 504 |
+
model: PostSemClawModel,
|
| 505 |
+
optimizer: torch.optim.Optimizer,
|
| 506 |
+
device: torch.device,
|
| 507 |
+
) -> tuple[int, float, float, float, int, str | None]:
|
| 508 |
+
if not RESUME_CKPT or RESUME_CKPT.lower() == "none":
|
| 509 |
+
print("[ckpt] resume disabled; starting fresh", flush=True)
|
| 510 |
+
return 0, 0.0, 0.0, 0.0, 0, None
|
| 511 |
|
| 512 |
resume_path = Path(os.path.expanduser(RESUME_CKPT))
|
| 513 |
# Try the primary path, then rotated backups. This is crucial because a
|
|
|
|
| 521 |
if not cand.exists():
|
| 522 |
continue
|
| 523 |
try:
|
| 524 |
+
result = _try_load_ckpt(cand, model, optimizer, device)
|
| 525 |
+
if result is not None:
|
| 526 |
+
if cand != resume_path:
|
| 527 |
+
print(f"[ckpt] fell back to rotation {cand.name}", flush=True)
|
| 528 |
+
step, total_training_time, smooth_train_loss, bpt_ema, epoch = result
|
| 529 |
+
return step, total_training_time, smooth_train_loss, bpt_ema, epoch, str(cand)
|
| 530 |
except Exception as e:
|
| 531 |
print(f"[ckpt] {cand.name} load failed: {type(e).__name__}: {e}", flush=True)
|
| 532 |
continue
|
| 533 |
|
| 534 |
+
print(f"[ckpt] no usable checkpoint in {resume_path} + rotations; starting fresh", flush=True)
|
| 535 |
+
return 0, 0.0, 0.0, 0.0, 0, None
|
| 536 |
|
| 537 |
|
| 538 |
# ---------------------------------------------------------------------------
|
|
|
|
| 614 |
weight_decay=WEIGHT_DECAY,
|
| 615 |
)
|
| 616 |
|
| 617 |
+
step, total_training_time, smooth_train_loss, bpt_ema, resume_epoch, resume_loaded_path = maybe_resume_ckpt(
|
| 618 |
+
model, optimizer, device,
|
| 619 |
+
)
|
| 620 |
+
lineage_payload = build_lineage_payload(
|
| 621 |
+
env=dict(os.environ),
|
| 622 |
+
seed=SEED,
|
| 623 |
+
resume_requested=bool(RESUME_CKPT and RESUME_CKPT.lower() != "none"),
|
| 624 |
+
resume_requested_path=RESUME_CKPT if RESUME_CKPT and RESUME_CKPT.lower() != "none" else None,
|
| 625 |
+
resume_loaded_path=resume_loaded_path,
|
| 626 |
+
resume_step=step,
|
| 627 |
+
resume_epoch=resume_epoch,
|
| 628 |
+
)
|
| 629 |
|
| 630 |
# Learnability #4: inform the model of the BOS token id so it can mask
|
| 631 |
# doc-separator positions in packed sequences. Always set (the mask only
|
|
|
|
| 1020 |
# does not benefit from overlap with backward). HYDRA_EVAL_TOKENS controls
|
| 1021 |
# how many val tokens to sweep (default 2 M, short enough for autoresearch
|
| 1022 |
# 5-min budgets).
|
| 1023 |
+
val_bpb: float | None = None
|
| 1024 |
+
val_ppl: float | None = None
|
| 1025 |
+
eval_status = "not_started"
|
| 1026 |
+
eval_error: str | None = None
|
| 1027 |
+
_eval_B = int(os.environ.get("HYDRA_EVAL_BATCH", str(max(1, DEVICE_BATCH_SIZE // 2))))
|
| 1028 |
+
_eval_tokens = int(os.environ.get("HYDRA_EVAL_TOKENS", str(2 * 524288)))
|
| 1029 |
+
_eval_chunk_tokens = int(os.environ.get("HYDRA_EVAL_CHUNK_TOKENS", str(_eval_tokens)))
|
| 1030 |
+
_eval_min_batch = int(os.environ.get("HYDRA_EVAL_MIN_BATCH", "1"))
|
| 1031 |
+
eval_diagnostics = build_eval_plan(
|
| 1032 |
+
eval_tokens=_eval_tokens,
|
| 1033 |
+
requested_batch=_eval_B,
|
| 1034 |
+
max_seq_len=MAX_SEQ_LEN,
|
| 1035 |
+
chunk_tokens=_eval_chunk_tokens,
|
| 1036 |
+
min_batch=_eval_min_batch,
|
| 1037 |
+
)
|
| 1038 |
+
try:
|
| 1039 |
# Aggressive VRAM reclaim for 6GB cards. Peak training VRAM = 5.1GB
|
| 1040 |
# which leaves < 1GB for the eval forward — the driver can't satisfy
|
| 1041 |
# the allocation. Free EVERY tensor we don't strictly need:
|
|
|
|
| 1057 |
model._last_sdr = None
|
| 1058 |
import gc as _gc
|
| 1059 |
_gc.collect()
|
| 1060 |
+
torch.cuda.empty_cache()
|
| 1061 |
+
torch.cuda.synchronize()
|
| 1062 |
+
try:
|
| 1063 |
+
_free_mb = torch.cuda.mem_get_info()[0] / 1024 / 1024
|
| 1064 |
+
eval_diagnostics["eval_free_vram_before_mb"] = float(_free_mb)
|
| 1065 |
+
print(f"[VAL] free_vram_mb={_free_mb:.0f} (cleared optimizer state)", flush=True)
|
| 1066 |
+
except Exception:
|
| 1067 |
+
pass
|
| 1068 |
+
print(
|
| 1069 |
+
f"[VAL] running eval on {_eval_tokens} tokens at B={_eval_B} "
|
| 1070 |
+
f"chunk_tokens={eval_diagnostics['eval_chunk_tokens']} attempts={eval_diagnostics['eval_attempt_batches']}...",
|
| 1071 |
+
flush=True,
|
| 1072 |
+
)
|
| 1073 |
+
model.eval()
|
| 1074 |
+
_orig = _prepare_mod.EVAL_TOKENS
|
| 1075 |
+
_orig_chunk = getattr(_prepare_mod, "EVAL_CHUNK_TOKENS", _eval_tokens)
|
| 1076 |
+
_prepare_mod.EVAL_TOKENS = _eval_tokens
|
| 1077 |
+
_prepare_mod.EVAL_CHUNK_TOKENS = int(eval_diagnostics["eval_chunk_tokens"])
|
| 1078 |
+
_successful_batch: int | None = None
|
| 1079 |
+
_attempts: list[int] = []
|
| 1080 |
+
try:
|
| 1081 |
+
for _attempt_batch in eval_diagnostics["eval_attempt_batches"]:
|
| 1082 |
+
_attempts.append(int(_attempt_batch))
|
| 1083 |
+
eval_diagnostics["eval_attempted_batch"] = int(_attempt_batch)
|
| 1084 |
+
try:
|
| 1085 |
+
with autocast_ctx:
|
| 1086 |
+
val_bpb = evaluate_bpb(model, tokenizer, int(_attempt_batch))
|
| 1087 |
+
_successful_batch = int(_attempt_batch)
|
| 1088 |
+
break
|
| 1089 |
+
except torch.cuda.OutOfMemoryError as _attempt_oom:
|
| 1090 |
+
eval_error = str(_attempt_oom)
|
| 1091 |
+
eval_status = "oom"
|
| 1092 |
+
torch.cuda.empty_cache()
|
| 1093 |
+
if int(_attempt_batch) == eval_diagnostics["eval_attempt_batches"][-1]:
|
| 1094 |
+
raise
|
| 1095 |
+
finally:
|
| 1096 |
+
_prepare_mod.EVAL_TOKENS = _orig
|
| 1097 |
+
_prepare_mod.EVAL_CHUNK_TOKENS = _orig_chunk
|
| 1098 |
+
eval_diagnostics["eval_attempt_batches"] = _attempts
|
| 1099 |
+
eval_diagnostics["eval_effective_batch"] = _successful_batch
|
| 1100 |
+
val_ppl = 2 ** val_bpb
|
| 1101 |
+
eval_status = "completed"
|
| 1102 |
+
print(f"[VAL] step={step} val_bpb={val_bpb:.4f} val_ppl={val_ppl:.3f}", flush=True)
|
| 1103 |
+
except torch.cuda.OutOfMemoryError as e:
|
| 1104 |
+
eval_status = "oom"
|
| 1105 |
+
eval_error = str(e)
|
| 1106 |
+
print(f"[VAL] SKIPPED (OOM): {e}", flush=True)
|
| 1107 |
+
torch.cuda.empty_cache()
|
| 1108 |
+
try:
|
| 1109 |
+
eval_diagnostics["eval_free_vram_after_mb"] = float(torch.cuda.mem_get_info()[0] / 1024 / 1024)
|
| 1110 |
+
except Exception:
|
| 1111 |
+
pass
|
| 1112 |
+
except Exception as e:
|
| 1113 |
+
import traceback as _tb
|
| 1114 |
+
eval_status = type(e).__name__
|
| 1115 |
+
eval_error = str(e)
|
| 1116 |
+
print(f"[VAL] SKIPPED ({type(e).__name__}): {e}", flush=True)
|
| 1117 |
+
_tb.print_exc()
|
| 1118 |
+
try:
|
| 1119 |
+
_free = torch.cuda.mem_get_info()[0] / 1024 / 1024
|
| 1120 |
+
eval_diagnostics["eval_free_vram_after_mb"] = float(_free)
|
| 1121 |
+
print(f"[VAL] post-crash free_vram_mb={_free:.0f}", flush=True)
|
| 1122 |
+
except Exception:
|
| 1123 |
+
pass
|
| 1124 |
|
| 1125 |
# Final ckpts with val_bpb filled in (if eval succeeded).
|
| 1126 |
save_ckpt(
|
|
|
|
| 1164 |
/ total_training_time / GPU_BF16_PEAK_FLOPS
|
| 1165 |
if total_training_time > 0 else 0
|
| 1166 |
)
|
| 1167 |
+
peak_vram_mb = torch.cuda.max_memory_allocated() / 1024 / 1024
|
| 1168 |
+
metrics = model.get_secondary_metrics()
|
| 1169 |
+
domain_fingerprints = dataset_domain_payload(
|
| 1170 |
+
env=dict(os.environ),
|
| 1171 |
+
prepare_module=_prepare_mod,
|
| 1172 |
+
nemotron_module=globals().get("_p_nemo"),
|
| 1173 |
+
)
|
| 1174 |
|
| 1175 |
print("---")
|
| 1176 |
print(f"val_bpb: {val_bpb:.6f}" if val_bpb is not None else "val_bpb: SKIPPED")
|
|
|
|
| 1206 |
# Emit full metrics dictionary as JSON for sweep aggregation. Path from
|
| 1207 |
# HYDRA_METRICS_OUT env var; default=/tmp/hydra_run_metrics.json. Always
|
| 1208 |
# written (even without diagnostics) so the aggregator can compare runs.
|
| 1209 |
+
_metrics_out = os.environ.get("HYDRA_METRICS_OUT", "/tmp/hydra_run_metrics.json")
|
| 1210 |
+
try:
|
| 1211 |
+
_dump = build_final_metrics_payload(
|
| 1212 |
+
secondary_metrics=metrics,
|
| 1213 |
+
val_bpb=val_bpb,
|
| 1214 |
+
val_ppl=val_ppl,
|
| 1215 |
+
eval_status=eval_status,
|
| 1216 |
+
eval_error=eval_error,
|
| 1217 |
+
n_layer=N_LAYER,
|
| 1218 |
+
d_model=D_MODEL,
|
| 1219 |
+
num_params=num_params,
|
| 1220 |
+
step=step,
|
| 1221 |
+
total_tokens=total_tokens,
|
| 1222 |
+
peak_vram_mb=peak_vram_mb,
|
| 1223 |
+
total_training_time=total_training_time,
|
| 1224 |
+
sdr_target_active=int(os.environ.get("HYDRA_SDR_TARGET_ACTIVE", "327")),
|
| 1225 |
+
architecture_env=dict(os.environ),
|
| 1226 |
+
eval_diagnostics=eval_diagnostics,
|
| 1227 |
+
domain_fingerprints=domain_fingerprints,
|
| 1228 |
+
lineage_payload=lineage_payload,
|
| 1229 |
+
)
|
| 1230 |
+
Path(_metrics_out).parent.mkdir(parents=True, exist_ok=True)
|
| 1231 |
with open(_metrics_out, 'w') as _f:
|
| 1232 |
json.dump(_dump, _f, indent=2, sort_keys=True)
|
| 1233 |
print(f"[METRICS] wrote {_metrics_out}", flush=True)
|
overlay/prepare.py
CHANGED
|
@@ -13,9 +13,10 @@ import os
|
|
| 13 |
import sys
|
| 14 |
import time
|
| 15 |
import math
|
| 16 |
-
import argparse
|
| 17 |
-
import pickle
|
| 18 |
-
from multiprocessing import Pool
|
|
|
|
| 19 |
|
| 20 |
import requests
|
| 21 |
import pyarrow.parquet as pq
|
|
@@ -29,7 +30,8 @@ import torch
|
|
| 29 |
|
| 30 |
MAX_SEQ_LEN = int(os.environ.get("HYDRA_SEQ_LEN", "512")) # context length
|
| 31 |
TIME_BUDGET = 300 # training time budget in seconds (5 minutes)
|
| 32 |
-
EVAL_TOKENS = 40 * 524288 # number of tokens for val eval
|
|
|
|
| 33 |
|
| 34 |
# ---------------------------------------------------------------------------
|
| 35 |
# Configuration
|
|
@@ -158,7 +160,8 @@ def train_tokenizer():
|
|
| 158 |
print("Tokenizer: training BPE tokenizer...")
|
| 159 |
t0 = time.time()
|
| 160 |
|
| 161 |
-
|
|
|
|
| 162 |
vocab_size_no_special = VOCAB_SIZE - len(SPECIAL_TOKENS)
|
| 163 |
tokenizer.train_from_iterator(text_iterator(), vocab_size_no_special, pattern=SPLIT_PATTERN)
|
| 164 |
|
|
@@ -225,9 +228,10 @@ class Tokenizer:
|
|
| 225 |
def get_bos_token_id(self):
|
| 226 |
return self.bos_token_id
|
| 227 |
|
| 228 |
-
def encode(self, text, prepend=None, num_threads=8):
|
| 229 |
-
|
| 230 |
-
|
|
|
|
| 231 |
if isinstance(text, str):
|
| 232 |
ids = self.enc.encode_ordinary(text)
|
| 233 |
if prepend is not None:
|
|
@@ -245,7 +249,7 @@ class Tokenizer:
|
|
| 245 |
return self.enc.decode(ids)
|
| 246 |
|
| 247 |
|
| 248 |
-
_TOKEN_BYTES_CACHE: dict = {}
|
| 249 |
|
| 250 |
def get_token_bytes(device="cpu"):
|
| 251 |
key = str(device)
|
|
@@ -341,12 +345,30 @@ def make_dataloader(tokenizer, B, T, split, buffer_size=1000):
|
|
| 341 |
gpu_buffer.copy_(cpu_buffer, non_blocking=True)
|
| 342 |
yield inputs, targets, epoch
|
| 343 |
|
| 344 |
-
# ---------------------------------------------------------------------------
|
| 345 |
-
# Evaluation (DO NOT CHANGE — this is the fixed metric)
|
| 346 |
-
# ---------------------------------------------------------------------------
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
"""
|
| 351 |
Bits per byte (BPB): vocab size-independent evaluation metric.
|
| 352 |
Sums per-token cross-entropy (in nats), sums target byte lengths,
|
|
@@ -357,31 +379,35 @@ def evaluate_bpb(model, tokenizer, batch_size):
|
|
| 357 |
Perf: accumulates on GPU (single sync at end), prefetches next batch
|
| 358 |
while current forward runs.
|
| 359 |
"""
|
| 360 |
-
token_bytes = get_token_bytes(device="cuda")
|
| 361 |
-
val_loader = make_dataloader(tokenizer, batch_size, MAX_SEQ_LEN, "val")
|
| 362 |
-
steps = EVAL_TOKENS // (batch_size * MAX_SEQ_LEN)
|
|
|
|
| 363 |
|
| 364 |
# GPU-resident accumulators — avoid per-batch .item() sync
|
| 365 |
total_nats_t = torch.zeros(1, device="cuda", dtype=torch.float64)
|
| 366 |
total_bytes_t = torch.zeros(1, device="cuda", dtype=torch.int64)
|
| 367 |
|
| 368 |
# Prefetch first batch
|
| 369 |
-
next_batch = next(val_loader)
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
|
|
|
|
|
|
|
|
|
| 385 |
|
| 386 |
# ---------------------------------------------------------------------------
|
| 387 |
# Main
|
|
|
|
| 13 |
import sys
|
| 14 |
import time
|
| 15 |
import math
|
| 16 |
+
import argparse
|
| 17 |
+
import pickle
|
| 18 |
+
from multiprocessing import Pool
|
| 19 |
+
from typing import Any
|
| 20 |
|
| 21 |
import requests
|
| 22 |
import pyarrow.parquet as pq
|
|
|
|
| 30 |
|
| 31 |
MAX_SEQ_LEN = int(os.environ.get("HYDRA_SEQ_LEN", "512")) # context length
|
| 32 |
TIME_BUDGET = 300 # training time budget in seconds (5 minutes)
|
| 33 |
+
EVAL_TOKENS = 40 * 524288 # number of tokens for val eval
|
| 34 |
+
EVAL_CHUNK_TOKENS = int(os.environ.get("HYDRA_EVAL_CHUNK_TOKENS", str(EVAL_TOKENS)))
|
| 35 |
|
| 36 |
# ---------------------------------------------------------------------------
|
| 37 |
# Configuration
|
|
|
|
| 160 |
print("Tokenizer: training BPE tokenizer...")
|
| 161 |
t0 = time.time()
|
| 162 |
|
| 163 |
+
tokenizer_cls = getattr(rustbpe, "Tokenizer")
|
| 164 |
+
tokenizer: Any = tokenizer_cls()
|
| 165 |
vocab_size_no_special = VOCAB_SIZE - len(SPECIAL_TOKENS)
|
| 166 |
tokenizer.train_from_iterator(text_iterator(), vocab_size_no_special, pattern=SPLIT_PATTERN)
|
| 167 |
|
|
|
|
| 228 |
def get_bos_token_id(self):
|
| 229 |
return self.bos_token_id
|
| 230 |
|
| 231 |
+
def encode(self, text, prepend=None, num_threads=8):
|
| 232 |
+
prepend_id = None
|
| 233 |
+
if prepend is not None:
|
| 234 |
+
prepend_id = prepend if isinstance(prepend, int) else self.enc.encode_single_token(prepend)
|
| 235 |
if isinstance(text, str):
|
| 236 |
ids = self.enc.encode_ordinary(text)
|
| 237 |
if prepend is not None:
|
|
|
|
| 249 |
return self.enc.decode(ids)
|
| 250 |
|
| 251 |
|
| 252 |
+
_TOKEN_BYTES_CACHE: dict[str, torch.Tensor] = {}
|
| 253 |
|
| 254 |
def get_token_bytes(device="cpu"):
|
| 255 |
key = str(device)
|
|
|
|
| 345 |
gpu_buffer.copy_(cpu_buffer, non_blocking=True)
|
| 346 |
yield inputs, targets, epoch
|
| 347 |
|
| 348 |
+
# ---------------------------------------------------------------------------
|
| 349 |
+
# Evaluation (DO NOT CHANGE — this is the fixed metric)
|
| 350 |
+
# ---------------------------------------------------------------------------
|
| 351 |
+
|
| 352 |
+
def compute_bpb_from_totals(total_nats: torch.Tensor, total_bytes: torch.Tensor) -> torch.Tensor:
    """Normalize a summed loss (nats) by a summed byte count into bits-per-byte.

    Args:
        total_nats: Sum of per-token cross-entropy losses, in nats.
        total_bytes: Sum of the byte lengths of the tokens that were counted.

    Returns:
        ``total_nats / (ln(2) * total_bytes)`` as a float64 tensor.

    Raises:
        ValueError: If ``total_bytes`` is not strictly positive, i.e. no token
            with a non-zero byte length contributed to the totals.
    """
    if int(total_bytes.item()) <= 0:
        raise ValueError("BPB normalization requires at least one non-special token")
    return total_nats.to(dtype=torch.float64) / (math.log(2) * total_bytes.to(dtype=torch.float64))


def compute_bpb_from_losses(loss_flat: torch.Tensor, nbytes: torch.Tensor) -> torch.Tensor:
    """Convert per-token losses and token byte lengths into bits-per-byte.

    Tokens with zero byte length (special tokens) are excluded from both the
    numerator and denominator so BPB remains comparable across tokenizer
    special-token conventions.

    Args:
        loss_flat: Per-token losses in nats.
        nbytes: Byte length of each target token, aligned with ``loss_flat``.

    Returns:
        Bits-per-byte as a float64 tensor.

    Raises:
        ValueError: If no token has a positive byte length.
    """
    mask = nbytes > 0
    # Zero out excluded positions with masked_fill rather than multiplying by
    # the mask: ``inf * 0`` and ``nan * 0`` are NaN, so a non-finite loss on a
    # special token would otherwise leak into the total even though that token
    # is meant to be ignored. For finite losses the result is unchanged.
    total_nats = loss_flat.masked_fill(~mask, 0.0).sum(dtype=torch.float64)
    total_bytes = nbytes[mask].sum(dtype=torch.int64)
    return compute_bpb_from_totals(total_nats, total_bytes)
|
| 369 |
+
|
| 370 |
+
@torch.no_grad()
|
| 371 |
+
def evaluate_bpb(model, tokenizer, batch_size):
|
| 372 |
"""
|
| 373 |
Bits per byte (BPB): vocab size-independent evaluation metric.
|
| 374 |
Sums per-token cross-entropy (in nats), sums target byte lengths,
|
|
|
|
| 379 |
Perf: accumulates on GPU (single sync at end), prefetches next batch
|
| 380 |
while current forward runs.
|
| 381 |
"""
|
| 382 |
+
token_bytes = get_token_bytes(device="cuda")
|
| 383 |
+
val_loader = make_dataloader(tokenizer, batch_size, MAX_SEQ_LEN, "val")
|
| 384 |
+
steps = EVAL_TOKENS // (batch_size * MAX_SEQ_LEN)
|
| 385 |
+
chunk_steps = max(1, EVAL_CHUNK_TOKENS // (batch_size * MAX_SEQ_LEN))
|
| 386 |
|
| 387 |
# GPU-resident accumulators — avoid per-batch .item() sync
|
| 388 |
total_nats_t = torch.zeros(1, device="cuda", dtype=torch.float64)
|
| 389 |
total_bytes_t = torch.zeros(1, device="cuda", dtype=torch.int64)
|
| 390 |
|
| 391 |
# Prefetch first batch
|
| 392 |
+
next_batch = next(val_loader)
|
| 393 |
+
steps_done = 0
|
| 394 |
+
while steps_done < steps:
|
| 395 |
+
this_chunk = min(chunk_steps, steps - steps_done)
|
| 396 |
+
for _ in range(this_chunk):
|
| 397 |
+
x, y, _epoch = next_batch
|
| 398 |
+
# Prefetch NEXT batch while GPU computes current forward
|
| 399 |
+
next_batch = next(val_loader)
|
| 400 |
+
loss_flat = model(x, y, reduction='none').view(-1)
|
| 401 |
+
y_flat = y.view(-1)
|
| 402 |
+
nbytes = token_bytes[y_flat]
|
| 403 |
+
total_nats_t += (loss_flat * (nbytes > 0)).sum(dtype=torch.float64)
|
| 404 |
+
total_bytes_t += nbytes[nbytes > 0].sum(dtype=torch.int64)
|
| 405 |
+
steps_done += this_chunk
|
| 406 |
+
if steps_done < steps:
|
| 407 |
+
torch.cuda.empty_cache()
|
| 408 |
+
|
| 409 |
+
# Single GPU→CPU sync at end
|
| 410 |
+
return float(compute_bpb_from_totals(total_nats_t, total_bytes_t).item())
|
| 411 |
|
| 412 |
# ---------------------------------------------------------------------------
|
| 413 |
# Main
|
overlay/scripts/audit_overlay_sync.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Repo-relative paths audited when no --include-path option is supplied.
# Entries without a file extension are presumably directories (compared file
# by file); the rest are single files -- TODO confirm against the repo layout.
DEFAULT_INCLUDE_PATHS = [
    "hydra",
    "subsystems",
    "scripts",
    "htm_rust",
    "harness",
    "configs",
    "prepare.py",
    "prepare_nemotron.py",
    "train.py",
    "pyproject.toml",
    "uv.lock",
]
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _iter_files(path: Path) -> list[Path]:
    """Return every regular file at or below ``path`` in sorted order.

    A missing path yields an empty list and a path that is itself a file
    yields a one-element list.
    """
    if not path.exists():
        return []
    if path.is_file():
        return [path]
    return sorted(p for p in path.rglob("*") if p.is_file())


def classify_overlay_pairs(*, repo_root: Path, include_paths: list[str]) -> dict[str, list[str]]:
    """Compare root paths against their copies under the overlay directory.

    Each entry of ``include_paths`` is resolved both under ``repo_root`` and
    under ``repo_root/hf_jobs/feather_h200_image/overlay``. Files are compared
    byte for byte; directories are walked recursively and compared per file.

    Args:
        repo_root: Directory holding the source-of-truth files.
        include_paths: Repo-relative files or directories to audit.

    Returns:
        A dict with four sorted lists of POSIX-style relative names:
        ``identical`` (same bytes on both sides), ``root_ahead`` (present on
        both sides but different), ``overlay_only`` (only in the overlay) and
        ``missing_overlay`` (only in the root).
    """
    overlay_root = repo_root / "hf_jobs" / "feather_h200_image" / "overlay"
    identical: list[str] = []
    root_ahead: list[str] = []
    overlay_only: list[str] = []
    missing_overlay: list[str] = []

    for rel in include_paths:
        root_path = repo_root / rel
        overlay_path = overlay_root / rel

        if root_path.is_file() or overlay_path.is_file():
            rel_name = rel.replace("\\", "/")
            if root_path.is_file() and overlay_path.is_file():
                same = root_path.read_bytes() == overlay_path.read_bytes()
                (identical if same else root_ahead).append(rel_name)
            elif root_path.exists() and overlay_path.exists():
                # One side is a file and the other a directory. Reading bytes
                # from the directory would raise, so report the pair as
                # differing instead of aborting the whole audit.
                root_ahead.append(rel_name)
            elif root_path.exists():
                missing_overlay.append(rel_name)
            else:
                overlay_only.append(rel_name)
            continue

        # Directory (or absent) on both sides: index files by their path
        # relative to the compared directory.
        root_files = (
            {p.relative_to(root_path).as_posix(): p for p in _iter_files(root_path)}
            if root_path.is_dir()
            else {}
        )
        overlay_files = (
            {p.relative_to(overlay_path).as_posix(): p for p in _iter_files(overlay_path)}
            if overlay_path.is_dir()
            else {}
        )

        for subrel, root_file in root_files.items():
            rel_name = f"{rel}/{subrel}".replace("\\", "/")
            overlay_file = overlay_files.get(subrel)
            if overlay_file is None:
                missing_overlay.append(rel_name)
            elif root_file.read_bytes() == overlay_file.read_bytes():
                identical.append(rel_name)
            else:
                root_ahead.append(rel_name)

        for subrel in overlay_files:
            if subrel not in root_files:
                overlay_only.append(f"{rel}/{subrel}".replace("\\", "/"))

    for bucket in (identical, root_ahead, overlay_only, missing_overlay):
        bucket.sort()

    return {
        "identical": identical,
        "root_ahead": root_ahead,
        "overlay_only": overlay_only,
        "missing_overlay": missing_overlay,
    }
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the overlay-audit command line.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Namespace with ``repo_root`` (a ``Path``, defaulting to the directory
        one level above this file's directory) and ``include_path`` (list of
        the repeated ``--include-path`` values, empty when none were given).
    """
    default_root = Path(__file__).resolve().parents[1]
    cli = argparse.ArgumentParser(
        description="Audit mirrored H200 overlay files against root source-of-truth paths",
    )
    cli.add_argument("--repo-root", type=Path, default=default_root)
    cli.add_argument("--include-path", action="append", default=[])
    return cli.parse_args(argv)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def main(argv: list[str] | None = None) -> int:
    """Run the overlay audit and print the classification as JSON.

    Falls back to ``DEFAULT_INCLUDE_PATHS`` when no ``--include-path`` option
    was supplied. Always returns exit status 0.
    """
    options = parse_args(argv)
    selected = options.include_path if options.include_path else DEFAULT_INCLUDE_PATHS
    report = classify_overlay_pairs(repo_root=options.repo_root, include_paths=selected)
    print(json.dumps(report, indent=2, sort_keys=True))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
overlay/scripts/benchmark_assets.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
import shutil
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
from scripts.benchmark_checkpoint import checkpoint_candidates
|
| 9 |
+
|
| 10 |
+
try:
|
| 11 |
+
from huggingface_hub import HfApi
|
| 12 |
+
except Exception: # pragma: no cover - optional import for offline test envs
|
| 13 |
+
HfApi = None
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _download_file(*, repo_id: str, filename: str, local_dir: str, token: str | None, subfolder: str | None = None) -> Path:
    """Download one file from a Hugging Face Hub model repo into ``local_dir``.

    Args:
        repo_id: Hub repository to read from (always treated as a model repo).
        filename: Name of the file inside the repo (or inside ``subfolder``).
        local_dir: Directory the file is materialized under.
        token: Access token forwarded to the Hub client, or ``None``.
        subfolder: Optional folder inside the repo that contains ``filename``.

    Returns:
        Local path of the downloaded file as reported by ``hf_hub_download``.
    """
    # Imported lazily so this module can be loaded without huggingface_hub.
    from huggingface_hub import hf_hub_download

    request = {
        "repo_id": repo_id,
        "repo_type": "model",
        "filename": filename,
        "subfolder": subfolder,
        "token": token,
        "local_dir": local_dir,
        "local_dir_use_symlinks": False,
    }
    return Path(hf_hub_download(**request))
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def resolve_tokenizer_cache_repo(*, output_repo: str, retina_cache_repo: str) -> str:
    """Pick the repo that holds the cached tokenizer files.

    The first non-empty value wins, checked in this order: the environment
    variables ``HYDRA_TOKENIZER_CACHE_REPO``, ``FEATHER_HF_OUTPUT_REPO``,
    ``HF_REPO_ID``, ``HYDRA_RETINA_CACHE_REPO``,
    ``FEATHER_HF_RETINA_CACHE_REPO``, then ``output_repo``, then
    ``retina_cache_repo``. If every value is empty, ``retina_cache_repo`` is
    returned as-is.
    """
    env_precedence = (
        "HYDRA_TOKENIZER_CACHE_REPO",
        "FEATHER_HF_OUTPUT_REPO",
        "HF_REPO_ID",
        "HYDRA_RETINA_CACHE_REPO",
        "FEATHER_HF_RETINA_CACHE_REPO",
    )
    for variable in env_precedence:
        configured = os.environ.get(variable)
        if configured:
            return configured
    return output_repo or retina_cache_repo
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def tokenizer_cache_prefix() -> str:
    """Return the repo subfolder for the tokenizer, keyed by vocabulary size.

    The size is read from ``HYDRA_VOCAB_SIZE`` (default ``65536``) and must
    parse as an integer; otherwise ``int()`` raises ``ValueError``.
    """
    raw_size = os.environ.get("HYDRA_VOCAB_SIZE", "65536")
    return "tokenizer/vocab" + str(int(raw_size))
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def choose_remote_checkpoint_path(files: list[str]) -> str | None:
    """Select the checkpoint to download from a repo file listing.

    Only paths inside a folder are considered (the name must be preceded by
    ``/``). ``pretrain_final.pt`` is preferred over ``best_bpb.pt``, which is
    preferred over ``latest.pt``; among paths of the winning kind the
    lexicographically greatest one is returned.

    Returns:
        The chosen repo path, or ``None`` when no checkpoint file is listed.
    """
    for suffix in ("/pretrain_final.pt", "/best_bpb.pt", "/latest.pt"):
        matches = [path for path in files if path.endswith(suffix)]
        if matches:
            return max(matches)
    return None
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def hydrate_benchmark_assets(*, cache_dir: Path, output_repo: str, tokenizer_repo: str, token: str | None) -> dict[str, str]:
    """Ensure a checkpoint and the tokenizer files are present under ``cache_dir``.

    Checkpoint resolution walks ``checkpoint_candidates(cache_dir)`` in order;
    for each candidate an existing local file is used, otherwise a download of
    the same file name from the root of ``output_repo`` is attempted. If that
    yields nothing, the repo file listing is searched with
    ``choose_remote_checkpoint_path`` and the match is downloaded and copied
    to ``cache_dir/<filename>``.

    ``tokenizer.pkl`` and ``token_bytes.pt`` are downloaded into
    ``cache_dir/tokenizer`` when they are not already there.

    Args:
        cache_dir: Local directory used as the asset cache (created if needed).
        output_repo: Hub model repo searched for checkpoints.
        tokenizer_repo: Fallback tokenizer repo; environment variables read by
            ``resolve_tokenizer_cache_repo`` take precedence over it.
        token: Hub access token, or ``None``.

    Returns:
        ``{"checkpoint_path": ..., "tokenizer_dir": ...}`` as strings.

    Raises:
        FileNotFoundError: No checkpoint could be found locally or remotely.
            The most recent download/listing failure, if any, is attached as
            ``__cause__`` so auth or network problems are not hidden.
    """
    cache_dir.mkdir(parents=True, exist_ok=True)
    tok_dir = cache_dir / "tokenizer"
    tok_dir.mkdir(parents=True, exist_ok=True)
    tok_repo = resolve_tokenizer_cache_repo(output_repo=tokenizer_repo, retina_cache_repo=tokenizer_repo)
    tok_prefix = tokenizer_cache_prefix()

    # Lookups stay best-effort, but remember why the latest one failed so the
    # final error can report more than "not found".
    last_error: Exception | None = None

    ckpt_path = None
    for candidate in checkpoint_candidates(cache_dir):
        if candidate.exists():
            ckpt_path = candidate
            break
        try:
            ckpt_path = _download_file(repo_id=output_repo, filename=candidate.name, local_dir=str(cache_dir), token=token)
            break
        except Exception as exc:
            last_error = exc
            continue
    if ckpt_path is None:
        try:
            if HfApi is None:
                raise RuntimeError("huggingface_hub unavailable")
            files = HfApi(token=token).list_repo_files(repo_id=output_repo, repo_type="model", token=token)
            remote_path = choose_remote_checkpoint_path(files)
            if remote_path is not None:
                parent, filename = remote_path.rsplit("/", 1)
                downloaded_path = _download_file(
                    repo_id=output_repo,
                    filename=filename,
                    local_dir=str(cache_dir),
                    token=token,
                    subfolder=parent,
                )
                # The download lands under cache_dir/<parent>/; keep a copy at
                # the flat location the local candidate lookup expects.
                canonical_path = cache_dir / filename
                if downloaded_path != canonical_path:
                    canonical_path.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(downloaded_path, canonical_path)
                ckpt_path = canonical_path
        except Exception as exc:
            last_error = exc
    if ckpt_path is None:
        raise FileNotFoundError(f"No benchmark checkpoint found in cache or repo {output_repo}") from last_error

    # Tokenizer assets: errors here propagate, since there is no fallback.
    for asset_name in ("tokenizer.pkl", "token_bytes.pt"):
        asset_path = tok_dir / asset_name
        if asset_path.exists():
            continue
        downloaded_asset = _download_file(repo_id=tok_repo, filename=asset_name, local_dir=str(tok_dir), token=token, subfolder=tok_prefix)
        if downloaded_asset != asset_path:
            shutil.copy2(downloaded_asset, asset_path)

    return {
        "checkpoint_path": str(ckpt_path),
        "tokenizer_dir": str(tok_dir),
    }
|
overlay/scripts/benchmark_checkpoint.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from scripts.hf_routing import resolve_routing
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def checkpoint_candidates(cache_dir: Path) -> list[Path]:
    """List the checkpoint paths to look for in ``cache_dir``, best first.

    The order is ``best_bpb.pt``, ``pretrain_final.pt``, ``latest.pt``; the
    paths are returned whether or not the files exist.
    """
    names_by_priority = ("best_bpb.pt", "pretrain_final.pt", "latest.pt")
    return [cache_dir / name for name in names_by_priority]
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def choose_checkpoint_candidate(cache_dir: Path) -> Path | None:
    """Return the highest-priority checkpoint that exists in ``cache_dir``.

    Priority follows ``checkpoint_candidates``; ``None`` means none of the
    candidate files is present.
    """
    existing = (path for path in checkpoint_candidates(cache_dir) if path.exists())
    return next(existing, None)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def resolve_checkpoint_source(*, cache_dir: Path, output_repo: str | None) -> dict[str, str]:
    """Describe where the benchmark checkpoint should come from.

    Returns:
        ``{"mode": "local", "path": ...}`` when a candidate file already
        exists in ``cache_dir``; otherwise ``{"mode": "remote", "repo_id": ...}``
        using ``output_repo`` if it is non-empty, else the ``output_repo``
        attribute of ``resolve_routing(token=None)``.
    """
    cached = choose_checkpoint_candidate(cache_dir)
    if cached is not None:
        return {"mode": "local", "path": str(cached)}
    # Only consult routing when the caller did not name a repo explicitly.
    repo_id = output_repo if output_repo else resolve_routing(token=None).output_repo
    return {"mode": "remote", "repo_id": repo_id}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _download_checkpoint_file(*, repo_id: str, filename: str, local_dir: str, token: str | None) -> str:
    """Download ``filename`` from a Hub model repo into ``local_dir``.

    Returns:
        Whatever ``hf_hub_download`` returns for the downloaded file (its
        local path).
    """
    # Imported lazily so this module can be loaded without huggingface_hub.
    from huggingface_hub import hf_hub_download

    request = {
        "repo_id": repo_id,
        "repo_type": "model",
        "filename": filename,
        "token": token,
        "local_dir": local_dir,
        "local_dir_use_symlinks": False,
    }
    return hf_hub_download(**request)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def hydrate_checkpoint(*, cache_dir: Path, output_repo: str | None, token: str | None) -> Path | None:
    """Return a usable checkpoint path, downloading one if none is cached.

    A checkpoint already present in ``cache_dir`` is returned immediately.
    Otherwise ``best_bpb.pt``, ``pretrain_final.pt`` and ``latest.pt`` are
    tried in that order against the resolved remote repo; the first download
    that succeeds and exists on disk wins.

    Returns:
        Path to the checkpoint, or ``None`` when the source is not remote or
        every download attempt failed (failures are swallowed by design).
    """
    cached = choose_checkpoint_candidate(cache_dir)
    if cached is not None:
        return cached

    source = resolve_checkpoint_source(cache_dir=cache_dir, output_repo=output_repo)
    if source["mode"] != "remote":
        return None

    cache_dir.mkdir(parents=True, exist_ok=True)
    for filename in ("best_bpb.pt", "pretrain_final.pt", "latest.pt"):
        try:
            downloaded = _download_checkpoint_file(
                repo_id=source["repo_id"],
                filename=filename,
                local_dir=str(cache_dir),
                token=token,
            )
            fetched = Path(downloaded)
            present = fetched.exists()
        except Exception:
            # Best effort: a missing file or transport error just moves on to
            # the next candidate name.
            continue
        if present:
            return fetched
    return None
|
overlay/scripts/benchmark_checkpoint_report.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import json
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def build_checkpoint_report(files: list[str]) -> dict[str, object]:
    """Group checkpoint files by job and pick a preferred checkpoint per job.

    Only paths shaped like ``jobs/<job_id>/.../<checkpoint>`` are considered,
    where the final component is one of the known checkpoint filenames.
    Candidates are returned sorted by ``job_id`` in descending order.
    """
    checkpoint_names = {"best_bpb.pt", "pretrain_final.pt", "latest.pt"}
    preference = ("pretrain_final.pt", "best_bpb.pt", "latest.pt")

    grouped: dict[str, list[str]] = {}
    for file_path in files:
        segments = file_path.split("/")
        if len(segments) >= 3 and segments[0] == "jobs" and segments[-1] in checkpoint_names:
            grouped.setdefault(segments[1], []).append(file_path)

    def pick_preferred(job_paths: list[str]) -> str | None:
        # First path matching the highest-priority suffix, else None.
        return next(
            (p for suffix in preference for p in job_paths if p.endswith(suffix)),
            None,
        )

    candidates = [
        {
            "job_id": job_id,
            "preferred_path": pick_preferred(job_paths),
            "available_paths": sorted(job_paths),
        }
        for job_id, job_paths in grouped.items()
    ]
    candidates.sort(key=lambda entry: entry["job_id"], reverse=True)
    return {"n_candidates": len(candidates), "candidates": candidates}
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def main() -> int:
    """Print the checkpoint report for an empty file listing as JSON."""
    report = build_checkpoint_report([])
    print(json.dumps(report, indent=2, sort_keys=True))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
overlay/scripts/benchmark_contract.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import json
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def _require_path(payload: dict[str, Any], path: str) -> None:
    """Raise ``ValueError`` unless dotted ``path`` resolves through nested dicts."""
    current: Any = payload
    for part in path.split('.'):
        if not isinstance(current, dict) or part not in current:
            raise ValueError(f"missing required field: {path}")
        current = current[part]


def validate_benchmark_contract(payload: dict[str, Any]) -> None:
    """Validate the structure of a benchmark contract payload.

    Checks that all required dotted fields exist, that each benchmark section
    is a mapping with ``name``, ``primary_metric`` and ``decode``, that
    ``seeds`` is a list of at least three values, and that both required
    variants are mappings whose ``status`` is ``runnable_now``.

    Raises:
        ValueError: on any structural violation.
    """
    for field in [
        "cycle_id",
        "hardware_class",
        "seeds",
        "budget_modes",
        "coding_benchmarks.fast_iteration",
        "coding_benchmarks.milestone",
        "reasoning_benchmarks.fast_iteration",
        "reasoning_benchmarks.milestone",
        "variants.hydra_full",
        "variants.baseline_mamba_matched",
    ]:
        _require_path(payload, field)

    section_keys = ("name", "primary_metric", "decode")
    for section in [
        payload["coding_benchmarks"]["fast_iteration"],
        payload["coding_benchmarks"]["milestone"],
        payload["reasoning_benchmarks"]["fast_iteration"],
        payload["reasoning_benchmarks"]["milestone"],
    ]:
        # Require a real mapping: on a string, `in` is a substring test and
        # would accept e.g. "name primary_metric decode"; other types raise
        # TypeError instead of the documented ValueError.
        if not isinstance(section, dict) or any(key not in section for key in section_keys):
            raise ValueError("benchmark sections require name, primary_metric, and decode")

    if not isinstance(payload["seeds"], list) or len(payload["seeds"]) < 3:
        raise ValueError("seeds must contain at least three values")

    for variant in ("hydra_full", "baseline_mamba_matched"):
        entry = payload["variants"][variant]
        # A non-dict variant has no .get(); report it as a contract error.
        if not isinstance(entry, dict) or entry.get("status") != "runnable_now":
            raise ValueError(f"{variant} must be runnable_now")
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def load_benchmark_contract(path: Path) -> dict[str, Any]:
    """Read a benchmark contract JSON file, validate it, and return it.

    Raises:
        ValueError: if the top-level JSON value is not an object, or if
            contract validation fails.
    """
    raw_text = path.read_text(encoding="utf-8")
    contract = json.loads(raw_text)
    if isinstance(contract, dict):
        validate_benchmark_contract(contract)
        return contract
    raise ValueError("benchmark contract must be a JSON object")
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def main() -> int:
    """Load the cycle-1 execution freeze contract and print it as JSON."""
    contract_path = Path("artifacts/cycle_1_execution_freeze.json")
    contract = load_benchmark_contract(contract_path)
    print(json.dumps(contract, indent=2, sort_keys=True))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
overlay/scripts/benchmark_datasets.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# Repository-relative locations of the canonical benchmark sample files.
CANONICAL_SUBSETS = {
    "MBPP": Path("data/benchmarks/mbpp.cycle1.jsonl"),
    "GSM8K": Path("data/benchmarks/gsm8k.cycle1.jsonl"),
}


def resolve_benchmark_dataset(benchmark_name: str, explicit_path: Path | None) -> Path:
    """Return the dataset path for ``benchmark_name``.

    An explicit path always wins. Otherwise the canonical subset path is
    resolved against the current working directory.

    Raises:
        ValueError: if no explicit path is given and the benchmark has no
            canonical subset.
    """
    if explicit_path is not None:
        return explicit_path
    relative = CANONICAL_SUBSETS.get(benchmark_name)
    if relative is None:
        raise ValueError(f"Unsupported benchmark dataset: {benchmark_name}")
    return Path.cwd() / relative
|
overlay/scripts/benchmark_preflight.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from scripts.bootstrap_benchmark_env import build_bootstrap_report
|
| 7 |
+
from scripts.benchmark_checkpoint import choose_checkpoint_candidate
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def build_readiness_report(
    *,
    cache_dir: Path,
    hf_token_present: bool,
    dependencies_present: bool = True,
    missing_dependencies: list[str] | None = None,
    output_repo: str | None = None,
    tokenizer_repo: str | None = None,
) -> dict[str, object]:
    """Assemble a readiness report for running Hydra benchmarks.

    Combines the local checkpoint lookup, the presence of the tokenizer files
    under ``cache_dir / "tokenizer"``, and the bootstrap report for the given
    missing dependencies into one dictionary.
    """
    checkpoint = choose_checkpoint_candidate(cache_dir)
    checkpoint_present = checkpoint is not None

    tokenizer_dir = cache_dir / "tokenizer"
    tokenizer_ready = all(
        (tokenizer_dir / file_name).exists()
        for file_name in ("tokenizer.pkl", "token_bytes.pt")
    )

    runtime = build_bootstrap_report(missing_dependencies=list(missing_dependencies or []))

    report: dict[str, object] = {}
    report["cache_dir"] = str(cache_dir)
    report["checkpoint_present"] = checkpoint_present
    report["checkpoint_path"] = None if checkpoint is None else str(checkpoint)
    report["tokenizer_ready"] = tokenizer_ready
    report["hf_token_present"] = hf_token_present
    report["dependencies_present"] = dependencies_present
    report["missing_dependencies"] = list(missing_dependencies or [])
    report["install_hint"] = runtime["install_hint"]
    report["install_command"] = runtime["install_command"]
    report["install_blockers"] = runtime["install_blockers"]
    report["output_repo"] = output_repo
    report["tokenizer_repo"] = tokenizer_repo
    report["hydration_possible"] = bool(hf_token_present and output_repo and tokenizer_repo)
    report["ready_for_hydra_benchmarks"] = checkpoint_present and tokenizer_ready and dependencies_present
    return report
|
overlay/scripts/benchmark_runner.py
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
import re
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Any, Callable
|
| 10 |
+
|
| 11 |
+
REPO_ROOT = Path(__file__).resolve().parents[1]
|
| 12 |
+
if str(REPO_ROOT) not in sys.path:
|
| 13 |
+
sys.path.insert(0, str(REPO_ROOT))
|
| 14 |
+
|
| 15 |
+
LEDGER_TEMPLATE_PATH = REPO_ROOT / "artifacts" / "benchmark_ledger.template.json"
|
| 16 |
+
|
| 17 |
+
from scripts.hydra_generation import build_hydra_generator
|
| 18 |
+
from scripts.benchmark_datasets import resolve_benchmark_dataset as resolve_canonical_dataset
|
| 19 |
+
from scripts.benchmark_suite import build_prompt, validate_sample
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def load_jsonl_samples(path: Path) -> list[dict[str, Any]]:
    """Parse a UTF-8 JSONL file into a list, skipping blank lines."""
    content = path.read_text(encoding="utf-8")
    return [json.loads(raw_line) for raw_line in content.splitlines() if raw_line.strip()]
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _score_mbpp(samples: list[dict[str, Any]], generate_fn: Callable[[str], str]) -> float:
|
| 31 |
+
passed = 0
|
| 32 |
+
for sample in samples:
|
| 33 |
+
validate_sample("MBPP", sample)
|
| 34 |
+
code = generate_fn(build_prompt("MBPP", sample))
|
| 35 |
+
namespace: dict[str, Any] = {}
|
| 36 |
+
exec(code, namespace, namespace)
|
| 37 |
+
for test in sample["tests"]:
|
| 38 |
+
exec(test, namespace, namespace)
|
| 39 |
+
passed += 1
|
| 40 |
+
return passed / len(samples) if samples else 0.0
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _extract_last_number(text: str) -> str | None:
|
| 44 |
+
matches = re.findall(r"-?\d+(?:\.\d+)?", text)
|
| 45 |
+
return matches[-1] if matches else None
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def _score_gsm8k(samples: list[dict[str, Any]], generate_fn: Callable[[str], str]) -> float:
|
| 49 |
+
passed = 0
|
| 50 |
+
for sample in samples:
|
| 51 |
+
validate_sample("GSM8K", sample)
|
| 52 |
+
output = generate_fn(build_prompt("GSM8K", sample))
|
| 53 |
+
pred = _extract_last_number(output)
|
| 54 |
+
if pred is not None and pred == str(sample["answer"]):
|
| 55 |
+
passed += 1
|
| 56 |
+
return passed / len(samples) if samples else 0.0
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def _score_humaneval(samples: list[dict[str, Any]], generate_fn: Callable[[str], str]) -> float:
|
| 60 |
+
passed = 0
|
| 61 |
+
for sample in samples:
|
| 62 |
+
validate_sample("HumanEval", sample)
|
| 63 |
+
code = generate_fn(build_prompt("HumanEval", sample))
|
| 64 |
+
namespace: dict[str, Any] = {}
|
| 65 |
+
exec(code, namespace, namespace)
|
| 66 |
+
exec(sample["test"], namespace, namespace)
|
| 67 |
+
passed += 1
|
| 68 |
+
return passed / len(samples) if samples else 0.0
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def _score_arc(samples: list[dict[str, Any]], generate_fn: Callable[[str], str]) -> float:
|
| 72 |
+
passed = 0
|
| 73 |
+
for sample in samples:
|
| 74 |
+
validate_sample("ARC-Challenge", sample)
|
| 75 |
+
output = generate_fn(build_prompt("ARC-Challenge", sample)).strip()
|
| 76 |
+
if output == str(sample["answer"]):
|
| 77 |
+
passed += 1
|
| 78 |
+
return passed / len(samples) if samples else 0.0
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def run_benchmark(benchmark_name: str, path: Path, generate_fn: Callable[[str], str]) -> dict[str, Any]:
    """Load samples from ``path`` and score them for ``benchmark_name``.

    Returns a dict with the benchmark name, its primary metric, the score and
    the number of samples.

    Raises:
        ValueError: if ``benchmark_name`` has no runnable scorer.
    """
    # Samples are loaded first, so a missing file surfaces before the name check.
    samples = load_jsonl_samples(path)
    scorers = {
        "MBPP": ("pass_at_1", _score_mbpp),
        "GSM8K": ("exact_match", _score_gsm8k),
        "HumanEval": ("pass_at_1", _score_humaneval),
        "ARC-Challenge": ("accuracy", _score_arc),
    }
    if benchmark_name not in scorers:
        raise ValueError(f"Unsupported runnable benchmark: {benchmark_name}")
    metric, scorer = scorers[benchmark_name]
    return {
        "benchmark": benchmark_name,
        "primary_metric": metric,
        "score": scorer(samples, generate_fn),
        "n_samples": len(samples),
    }
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def write_benchmark_result(path: Path, payload: dict[str, Any]) -> None:
    """Write ``payload`` to ``path`` as sorted, indented JSON, creating parent dirs."""
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2, sort_keys=True)
    path.write_text(serialized, encoding="utf-8")
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def append_benchmark_run_record(
    ledger_path: Path,
    result: dict[str, Any],
    *,
    benchmark_name: str,
    variant: str,
    seed: int,
    samples_path: Path,
) -> None:
    """Append one run record for ``result`` to the JSON ledger at ``ledger_path``.

    A missing ledger is first created from the ledger template. If the ledger
    holds only the template's placeholder record (``example-run-0001``), that
    placeholder is dropped before appending. Hardware class and budget mode
    are read from the ledger's ``benchmark_cycle`` section; a missing, null or
    empty ``budget_modes`` yields a budget mode of ``None`` instead of raising.
    """
    if not ledger_path.exists():
        ledger_path.parent.mkdir(parents=True, exist_ok=True)
        ledger_path.write_text(LEDGER_TEMPLATE_PATH.read_text(encoding="utf-8"), encoding="utf-8")
    payload = json.loads(ledger_path.read_text(encoding="utf-8"))
    run_records = payload.setdefault("run_records", [])
    if len(run_records) == 1 and run_records[0].get("run_id") == "example-run-0001":
        run_records.clear()

    # Tolerate a missing or null benchmark_cycle section.
    cycle = payload.get("benchmark_cycle")
    if not isinstance(cycle, dict):
        cycle = {}
    # An empty budget_modes list used to raise IndexError on [0].
    budget_modes = cycle.get("budget_modes")
    budget_mode = budget_modes[0] if isinstance(budget_modes, list) and budget_modes else None

    is_coding = benchmark_name in {"MBPP", "HumanEval"}
    is_reasoning = benchmark_name in {"GSM8K", "ARC-Challenge"}
    run_records.append(
        {
            "run_id": result.get("run_id", f"{benchmark_name.lower()}-{seed}"),
            "commit": "HEAD",
            "model_family": "hydra",
            "variant": variant,
            "seed": seed,
            "hardware": {
                "hardware_class": cycle.get("hardware_class", "unknown"),
            },
            "budget": {
                "budget_mode": budget_mode,
            },
            "capability": {
                "coding_score": result["score"] if is_coding else None,
                "reasoning_score": result["score"] if is_reasoning else None,
            },
            "artifacts": {
                "samples_path": str(samples_path),
            },
        }
    )
    ledger_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def resolve_samples_path(benchmark_name: str, samples: Path | None, suite_path: Path) -> Path:
    """Return the JSONL sample path to use for ``benchmark_name``.

    Resolution order: the explicit ``samples`` path, then a ``sample_path``
    declared for the benchmark in the suite file, then the canonical dataset
    location. Malformed (non-dict) suite sections are skipped.

    Raises:
        ValueError: if no path can be resolved; the canonical lookup error is
            chained as the cause.
    """
    if samples is not None:
        return samples
    payload = json.loads(suite_path.read_text(encoding="utf-8"))
    sections = payload if isinstance(payload, dict) else {}
    for section in ("coding_benchmarks", "reasoning_benchmarks"):
        slots = sections.get(section)
        # A list or string here has no .get(); skip it instead of crashing.
        if not isinstance(slots, dict):
            continue
        for slot in ("fast_iteration", "milestone"):
            entry = slots.get(slot)
            if isinstance(entry, dict) and entry.get("name") == benchmark_name and "sample_path" in entry:
                return Path(entry["sample_path"])
    try:
        return resolve_canonical_dataset(benchmark_name, None)
    except ValueError as exc:
        raise ValueError(f"No sample path found for benchmark: {benchmark_name}") from exc
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the benchmark runner's command-line options."""
    parser = argparse.ArgumentParser(description="Run a local benchmark against JSONL samples")
    default_suite = REPO_ROOT / "artifacts" / "benchmark_suite.cycle1.json"
    # (flag, add_argument keyword options), registered in declaration order.
    option_table = (
        ("--benchmark", {"required": True, "choices": ["MBPP", "GSM8K", "HumanEval", "ARC-Challenge"]}),
        ("--samples", {"type": Path}),
        ("--suite", {"type": Path, "default": default_suite}),
        ("--out", {"type": Path}),
        ("--ledger", {"type": Path}),
        ("--variant", {"default": "hydra_full"}),
        ("--seed", {"type": int, "default": 42}),
        ("--generator-mode", {"choices": ["stub", "hydra"], "default": "stub"}),
        ("--checkpoint", {"type": Path}),
        ("--device", {}),
        ("--max-new-tokens", {"type": int, "default": 256}),
        ("--temperature", {"type": float, "default": 0.2}),
        ("--top-p", {"type": float, "default": 0.95}),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser.parse_args(argv)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def main(argv: list[str] | None = None) -> int:
    """Run one benchmark from the command line and report the result.

    The result is printed as JSON, optionally written to ``--out``, and on
    success optionally appended to the ``--ledger`` file. Returns ``0`` on
    success and ``1`` when the run fails.
    """
    args = parse_args(argv)
    sample_path = resolve_samples_path(args.benchmark, args.samples, args.suite)

    def failure(kind: str, error: BaseException) -> dict[str, Any]:
        # Shared shape for every failed run.
        return {
            "benchmark": args.benchmark,
            "status": "failed",
            "failure_type": kind,
            "error": str(error),
            "n_samples": 0,
        }

    try:
        if args.generator_mode == "hydra":
            generator = build_hydra_generator(
                checkpoint_path=args.checkpoint,
                device=args.device,
                max_new_tokens=args.max_new_tokens,
                temperature=args.temperature,
                top_p=args.top_p,
            )
        else:
            # Stub mode simply echoes the prompt back.
            def generator(prompt: str) -> str:
                return prompt

        result = run_benchmark(args.benchmark, sample_path, generator)
        exit_code = 0
    except FileNotFoundError as exc:
        result, exit_code = failure("missing_checkpoint", exc), 1
    except Exception as exc:  # noqa: BLE001
        result, exit_code = failure(type(exc).__name__, exc), 1

    if args.out is not None:
        write_benchmark_result(args.out, result)
    succeeded = exit_code == 0
    if args.ledger is not None and succeeded:
        append_benchmark_run_record(
            args.ledger,
            result,
            benchmark_name=args.benchmark,
            variant=args.variant,
            seed=args.seed,
            samples_path=sample_path,
        )
    print(json.dumps(result, indent=2, sort_keys=True))
    return exit_code


if __name__ == "__main__":
    raise SystemExit(main())
|
overlay/scripts/benchmark_suite.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import json
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass(frozen=True)
class BenchmarkSpec:
    """Immutable description of a supported benchmark."""

    # Benchmark name; also used as the REGISTRY key.
    name: str
    # Benchmark family, "coding" or "reasoning" in the registry.
    family: str
    # Keys every sample of this benchmark must contain.
    required_fields: tuple[str, ...]


# All supported benchmarks, keyed by name, in declaration order.
REGISTRY: dict[str, BenchmarkSpec] = {
    spec.name: spec
    for spec in (
        BenchmarkSpec(name="MBPP", family="coding", required_fields=("task_id", "prompt", "tests")),
        BenchmarkSpec(name="HumanEval", family="coding", required_fields=("task_id", "prompt", "test")),
        BenchmarkSpec(name="GSM8K", family="reasoning", required_fields=("question", "answer")),
        BenchmarkSpec(
            name="ARC-Challenge",
            family="reasoning",
            required_fields=("question", "choices", "answer"),
        ),
    )
}
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def validate_sample(benchmark_name: str, sample: dict[str, Any]) -> None:
    """Check that ``sample`` has every field required by ``benchmark_name``.

    Raises:
        KeyError: if ``benchmark_name`` is not in ``REGISTRY``.
        ValueError: naming the first required field missing from ``sample``.
    """
    required_fields = REGISTRY[benchmark_name].required_fields
    first_missing = next((name for name in required_fields if name not in sample), None)
    if first_missing is not None:
        raise ValueError(f"{benchmark_name} sample missing required field: {first_missing}")
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def build_prompt(benchmark_name: str, sample: dict[str, Any]) -> str:
    """Render the prompt text for one validated sample of ``benchmark_name``.

    Raises:
        ValueError: if the sample lacks a required field, or if the benchmark
            has no prompt template.
    """
    validate_sample(benchmark_name, sample)

    if benchmark_name == "MBPP":
        test_block = "\n".join(map(str, sample["tests"]))
        header = "Write a Python function that solves the task below.\n\n"
        return header + f"Task:\n{sample['prompt']}\n\n" + f"Tests:\n{test_block}\n"

    if benchmark_name == "HumanEval":
        header = "Complete the following Python function exactly as specified.\n\n"
        return header + f"Prompt:\n{sample['prompt']}\n\n" + f"Reference test:\n{sample['test']}\n"

    if benchmark_name == "GSM8K":
        question = sample["question"]
        return f"Solve the following math word problem. Return only the final answer.\n\nQuestion: {question}\n"

    if benchmark_name == "ARC-Challenge":
        bullet_lines = [f"- {choice}" for choice in sample["choices"]]
        choice_block = "\n".join(bullet_lines)
        header = (
            "Answer the following multiple-choice science question. "
            "Return only the correct option text or label.\n\n"
        )
        return header + f"Question: {sample['question']}\nChoices:\n{choice_block}\n"

    raise ValueError(f"Unknown benchmark: {benchmark_name}")
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def load_cycle_benchmark_suite(path: Path) -> dict[str, dict[str, BenchmarkSpec]]:
    """Load a cycle suite file and map every section/slot to its ``BenchmarkSpec``.

    Raises:
        ValueError: if a section or slot is missing, or if a named benchmark
            is not in ``REGISTRY``.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    suite: dict[str, dict[str, BenchmarkSpec]] = {
        "coding_benchmarks": {},
        "reasoning_benchmarks": {},
    }
    for section, resolved_slots in suite.items():
        if section not in payload:
            raise ValueError(f"missing benchmark section: {section}")
        declared = payload[section]
        for slot in ("fast_iteration", "milestone"):
            if slot not in declared:
                raise ValueError(f"missing benchmark slot: {section}.{slot}")
            benchmark = declared[slot]["name"]
            if benchmark not in REGISTRY:
                raise ValueError(f"unsupported benchmark: {benchmark}")
            resolved_slots[slot] = REGISTRY[benchmark]
    return suite
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def main() -> int:
    """Load the cycle-1 benchmark suite and print the benchmark name per slot."""
    suite_path = Path("artifacts/benchmark_suite.cycle1.json")
    suite = load_cycle_benchmark_suite(suite_path)
    names_by_section = {
        section: {slot: spec.name for slot, spec in slots.items()}
        for section, slots in suite.items()
    }
    print(json.dumps(names_by_section, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
overlay/scripts/bootstrap_benchmark_env.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import json
|
| 5 |
+
import shutil
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# Import name -> pip distribution name.
PACKAGE_MAP = {
    "mamba_ssm": "mamba-ssm",
    "transformers": "transformers",
}


def build_install_command(*, missing_dependencies: list[str]) -> list[str]:
    """Return the pip install argv for the missing dependencies.

    Names without a ``PACKAGE_MAP`` entry are passed through unchanged.
    Returns an empty list when nothing is missing.
    """
    pip_names = []
    for module_name in missing_dependencies:
        pip_names.append(PACKAGE_MAP.get(module_name, module_name))
    if pip_names:
        return ["python", "-m", "pip", "install", *pip_names]
    return []
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def diagnose_install_blockers(
    *,
    missing_dependencies: list[str],
    torch_version: str,
    cuda_available: bool,
    nvcc_present: bool,
) -> list[str]:
    """List likely reasons a missing ``mamba_ssm`` cannot be installed.

    Returns an empty list when ``mamba_ssm`` is not among the missing
    dependencies or no blocker is detected.
    """
    if "mamba_ssm" not in missing_dependencies:
        return []

    cpu_only_runtime = "+cpu" in torch_version or not cuda_available
    checks = (
        (cpu_only_runtime, "mamba_ssm install likely blocked by CPU-only torch runtime"),
        (not nvcc_present, "mamba_ssm install likely blocked because nvcc is unavailable"),
    )
    return [message for blocked, message in checks if blocked]
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def build_bootstrap_report(*, missing_dependencies: list[str]) -> dict[str, object]:
    """Summarise how to install the missing benchmark dependencies.

    The report holds a readiness flag, the missing names, a human-readable
    hint, the pip command, and any install blockers diagnosed from the local
    torch/CUDA/nvcc state.
    """
    ready = len(missing_dependencies) == 0
    pip_names = [PACKAGE_MAP.get(module_name, module_name) for module_name in missing_dependencies]
    if ready:
        install_hint = ""
    else:
        install_hint = f"Install missing benchmark dependencies: {', '.join(pip_names)}"

    torch_version = getattr(torch, "__version__", "unknown")
    has_cuda = torch.cuda.is_available()
    has_nvcc = shutil.which("nvcc") is not None
    blockers = diagnose_install_blockers(
        missing_dependencies=missing_dependencies,
        torch_version=torch_version,
        cuda_available=has_cuda,
        nvcc_present=has_nvcc,
    )

    report: dict[str, object] = {
        "ready": ready,
        "missing_dependencies": list(missing_dependencies),
        "install_hint": install_hint,
    }
    report["install_command"] = build_install_command(missing_dependencies=missing_dependencies)
    report["install_blockers"] = blockers
    return report
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def main() -> int:
    """Print the bootstrap report for a missing ``mamba_ssm`` as JSON."""
    bootstrap = build_bootstrap_report(missing_dependencies=["mamba_ssm"])
    rendered = json.dumps(bootstrap, indent=2, sort_keys=True)
    print(rendered)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
overlay/scripts/bootstrap_benchmark_runtime.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import importlib.util
|
| 5 |
+
import json
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Import name -> pip distribution name.
PACKAGE_MAP = {
    "mamba_ssm": "mamba-ssm",
    "transformers": "transformers",
}


def detect_missing_modules(required: list[str] | None = None) -> list[str]:
    """Return the names in ``required`` that cannot be located for import.

    With ``required=None`` the keys of ``PACKAGE_MAP`` are checked. An
    explicit empty list means "nothing required" and yields ``[]``.

    ``importlib.util.find_spec`` raises for some inputs, e.g. a dotted name
    whose parent package is missing (``ModuleNotFoundError``) or a loaded
    module with ``__spec__`` set to ``None`` (``ValueError``). Those cases are
    reported as missing instead of propagating.
    """
    names = list(PACKAGE_MAP) if required is None else list(required)
    missing: list[str] = []
    for name in names:
        try:
            spec = importlib.util.find_spec(name)
        except (ImportError, ValueError):
            spec = None
        if spec is None:
            missing.append(name)
    return missing
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def build_install_command(*, missing_modules: list[str]) -> list[str]:
    """Return the pip install argv for the missing modules known to ``PACKAGE_MAP``.

    Modules without a ``PACKAGE_MAP`` entry are ignored. Returns an empty list
    when nothing installable is missing.
    """
    pip_names = []
    for module_name in missing_modules:
        if module_name in PACKAGE_MAP:
            pip_names.append(PACKAGE_MAP[module_name])
    return ["python", "-m", "pip", "install", *pip_names] if pip_names else []
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def build_runtime_report(*, missing_modules: list[str]) -> dict[str, object]:
    """Summarise runtime readiness for the given missing modules.

    The report holds a readiness flag, the missing module names, the pip
    package for each missing module known to ``PACKAGE_MAP``, and the install
    command.
    """
    known_packages = {}
    for module_name in missing_modules:
        if module_name in PACKAGE_MAP:
            known_packages[module_name] = PACKAGE_MAP[module_name]

    report: dict[str, object] = {}
    report["ready"] = len(missing_modules) == 0
    report["missing_modules"] = list(missing_modules)
    report["packages"] = known_packages
    report["install_command"] = build_install_command(missing_modules=missing_modules)
    return report
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def main() -> int:
    """Detect missing runtime modules and print the runtime report as JSON."""
    runtime_report = build_runtime_report(missing_modules=detect_missing_modules())
    print(json.dumps(runtime_report, indent=2, sort_keys=True))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
overlay/scripts/cycle_executor.py
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import importlib.util
|
| 6 |
+
import importlib
|
| 7 |
+
import json
|
| 8 |
+
import os
|
| 9 |
+
import subprocess
|
| 10 |
+
import sys
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Any
|
| 13 |
+
|
| 14 |
+
from scripts.benchmark_preflight import build_readiness_report
|
| 15 |
+
from scripts.hf_routing import resolve_routing
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
REPO_ROOT = Path(__file__).resolve().parents[1]
|
| 19 |
+
FREEZE_PATH = REPO_ROOT / "artifacts" / "cycle_1_execution_freeze.json"
|
| 20 |
+
RUNNER_PATH = REPO_ROOT / "scripts" / "benchmark_runner.py"
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def active_hf_token() -> str | None:
    """Return the Hugging Face token in effect, or ``None`` if there is none.

    A non-empty ``HF_TOKEN`` environment variable wins. Otherwise
    ``huggingface_hub.utils.get_token`` is used, and any failure there
    (including the library being unavailable) yields ``None``.
    """
    env_token = os.environ.get("HF_TOKEN")
    if not env_token:
        try:
            from huggingface_hub.utils import get_token

            return get_token()
        except Exception:
            return None
    return env_token
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def missing_benchmark_dependencies() -> list[str]:
    """Return the names of required benchmark packages that cannot be loaded.

    Each package is first looked up with ``importlib.util.find_spec``; when
    that finds nothing (or raises), a real import is attempted and the name
    is reported only if that import fails too.
    """
    unavailable: list[str] = []
    for module_name in ("mamba_ssm", "transformers"):
        try:
            located = importlib.util.find_spec(module_name) is not None
        except (ImportError, ValueError):
            located = False
        if located:
            continue
        # No spec found: fall back to an actual import attempt.
        try:
            importlib.import_module(module_name)
        except Exception:
            unavailable.append(module_name)
    return unavailable
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def load_cycle_freeze(path: Path) -> dict[str, Any]:
    """Read the UTF-8 JSON file at *path* and return the decoded payload."""
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def load_cycle_benchmarks(path: Path) -> list[str]:
    """Return the benchmark names listed in the suite JSON file at *path*.

    The ``coding_benchmarks`` and ``reasoning_benchmarks`` sections are
    scanned, in that order, for their ``fast_iteration`` and ``milestone``
    slots. A slot contributes its ``name`` when it is a dict with a truthy
    name. Sections that are missing, null or not a dict are skipped rather
    than raising.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    names: list[str] = []
    for section in ("coding_benchmarks", "reasoning_benchmarks"):
        slots = payload.get(section)
        # `"section": null` in the file must not crash the slot lookup below.
        if not isinstance(slots, dict):
            continue
        for slot in ("fast_iteration", "milestone"):
            entry = slots.get(slot)
            if isinstance(entry, dict) and entry.get("name"):
                names.append(str(entry["name"]))
    return names
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def build_preflight_report(
    *,
    cache_dir: Path,
    output_repo: str | None = None,
    tokenizer_repo: str | None = None,
) -> dict[str, object]:
    """Collect local readiness facts and pass them to ``build_readiness_report``.

    The dependency probe runs once so that ``dependencies_present`` and
    ``missing_dependencies`` always describe the same snapshot.

    Returns:
        Whatever ``build_readiness_report`` returns for these inputs.
    """
    missing = missing_benchmark_dependencies()
    return build_readiness_report(
        cache_dir=cache_dir,
        hf_token_present=bool(active_hf_token()),
        dependencies_present=not missing,
        missing_dependencies=missing,
        output_repo=output_repo,
        tokenizer_repo=tokenizer_repo,
    )
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def write_preflight_report(path: Path, payload: dict[str, object]) -> None:
    """Write *payload* to *path* as indented, key-sorted UTF-8 JSON.

    Missing parent directories are created first.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(payload, indent=2, sort_keys=True)
    path.write_text(rendered, encoding="utf-8")
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def write_cycle_summary(path: Path, payload: list[dict[str, Any]]) -> None:
    """Write the list of run records to *path* as indented, key-sorted UTF-8 JSON.

    Missing parent directories are created first.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(payload, indent=2, sort_keys=True)
    path.write_text(rendered, encoding="utf-8")
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def build_remote_checkpoint_report(output_repo: str, token: str | None) -> dict[str, Any]:
    """List the files of the model repo *output_repo* and summarise them.

    The file listing comes from ``HfApi.list_repo_files`` and is passed
    unchanged to ``build_checkpoint_report``. Both imports are local, so
    they are only needed when this function is called.
    """
    from huggingface_hub import HfApi

    from scripts.benchmark_checkpoint_report import build_checkpoint_report

    api = HfApi(token=token)
    repo_files = api.list_repo_files(repo_id=output_repo, repo_type="model", token=token)
    return build_checkpoint_report(repo_files)
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def ensure_benchmark_assets(
|
| 101 |
+
*,
|
| 102 |
+
cache_dir: Path,
|
| 103 |
+
output_repo: str,
|
| 104 |
+
tokenizer_repo: str,
|
| 105 |
+
token: str | None,
|
| 106 |
+
hydrate: bool,
|
| 107 |
+
) -> dict[str, str] | None:
|
| 108 |
+
if not hydrate:
|
| 109 |
+
return None
|
| 110 |
+
from scripts.benchmark_assets import hydrate_benchmark_assets
|
| 111 |
+
|
| 112 |
+
return hydrate_benchmark_assets(
|
| 113 |
+
cache_dir=cache_dir,
|
| 114 |
+
output_repo=output_repo,
|
| 115 |
+
tokenizer_repo=tokenizer_repo,
|
| 116 |
+
token=token,
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def build_benchmark_command(
    freeze: dict[str, Any],
    *,
    benchmark: str,
    variant: str,
    seed: int,
    out_dir: Path,
) -> tuple[list[str], dict[str, str]]:
    """Build the runner command line and environment for one benchmark run.

    The environment is a copy of the current process environment, overlaid
    with the variant's ``env`` mapping from *freeze* (keys and values
    stringified) and ``HYDRA_SEED``. *out_dir* is created if needed; the
    result file name is derived from benchmark, variant and seed.

    Raises:
        KeyError: if *variant* is not present in ``freeze["variants"]``.
    """
    overrides = freeze["variants"][variant].get("env", {})
    env = os.environ.copy()
    for name, value in overrides.items():
        env[str(name)] = str(value)
    env["HYDRA_SEED"] = str(seed)

    out_dir.mkdir(parents=True, exist_ok=True)
    result_path = out_dir / f"{benchmark.lower()}_{variant}_seed{seed}.json"
    ledger_path = out_dir / "benchmark_ledger.json"

    cmd = [sys.executable, str(RUNNER_PATH)]
    cmd += ["--benchmark", benchmark]
    cmd += ["--generator-mode", "hydra"]
    cmd += ["--out", str(result_path)]
    cmd += ["--ledger", str(ledger_path)]
    cmd += ["--variant", variant]
    cmd += ["--seed", str(seed)]
    return cmd, env
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def build_cycle_plan(freeze: dict[str, Any], *, benchmark: str, out_dir: Path) -> list[dict[str, Any]]:
    """Expand *freeze* into one plan entry per runnable variant and seed.

    A variant is runnable when its config is a dict whose ``status`` equals
    ``"runnable_now"``. Entries are ordered variant-major, seed-minor, and
    each carries the command and environment from ``build_benchmark_command``.
    """
    variants = freeze.get("variants", {})
    runnable = [
        variant_name
        for variant_name, variant_cfg in variants.items()
        if isinstance(variant_cfg, dict) and variant_cfg.get("status") == "runnable_now"
    ]
    seed_values = [int(raw_seed) for raw_seed in freeze.get("seeds", [])]

    entries: list[dict[str, Any]] = []
    for variant_name in runnable:
        for seed_value in seed_values:
            command, environment = build_benchmark_command(
                freeze,
                benchmark=benchmark,
                variant=variant_name,
                seed=seed_value,
                out_dir=out_dir,
            )
            entry = {
                "benchmark": benchmark,
                "variant": variant_name,
                "seed": seed_value,
                "command": command,
                "env": environment,
            }
            entries.append(entry)
    return entries
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def execute_cycle_plan(plan: list[dict[str, Any]], *, repo_root: Path) -> list[dict[str, Any]]:
    """Run every plan entry sequentially and record its exit status.

    Each entry's ``command`` runs with *repo_root* as working directory and
    the entry's ``env`` as environment. A non-zero exit does not stop the
    remaining entries.

    Returns:
        One record per entry with ``benchmark``, ``variant``, ``seed`` and
        ``returncode``.
    """
    outcomes: list[dict[str, Any]] = []
    workdir = str(repo_root)
    for step in plan:
        completed = subprocess.run(step["command"], cwd=workdir, env=step["env"])
        record = {field: step[field] for field in ("benchmark", "variant", "seed")}
        record["returncode"] = completed.returncode
        outcomes.append(record)
    return outcomes
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the cycle-executor command line.

    ``--benchmark``, ``--variant`` and ``--seed`` are required for every
    invocation, including those that pass ``--all-runnable``.
    """
    artifacts_dir = REPO_ROOT / "artifacts"
    parser = argparse.ArgumentParser(description="Execute a frozen Cycle 1 benchmark run")

    # Input files.
    parser.add_argument("--freeze", type=Path, default=FREEZE_PATH)
    parser.add_argument("--suite", type=Path, default=artifacts_dir / "benchmark_suite.cycle1.json")

    # Identity of the single run.
    parser.add_argument("--benchmark", required=True)
    parser.add_argument("--variant", required=True)
    parser.add_argument("--seed", type=int, required=True)

    # Output locations.
    parser.add_argument("--out-dir", type=Path, default=artifacts_dir / "runs")
    parser.add_argument("--preflight-out", type=Path)
    parser.add_argument("--summary-out", type=Path)

    # Boolean switches.
    for switch in ("--hydrate-assets", "--all-runnable", "--all-benchmarks", "--require-ready"):
        parser.add_argument(switch, action="store_true")

    # Optional repository overrides.
    parser.add_argument("--output-repo")
    parser.add_argument("--tokenizer-repo")
    return parser.parse_args(argv)
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
def _write_blocked_summary(
    summary_out: Path | None,
    *,
    reason: str,
    output_repo: str,
    token: str | None,
    **details: Any,
) -> None:
    """Write a single ``blocked`` record to *summary_out*, if one was requested.

    The record carries *reason*, the extra *details*, and a best-effort
    remote checkpoint report (None when that lookup fails). Nothing is
    queried or written when *summary_out* is None.
    """
    if summary_out is None:
        return
    try:
        checkpoint_report = build_remote_checkpoint_report(output_repo, token)
    except Exception:
        # Diagnostic only: a failed lookup must not mask the original block reason.
        checkpoint_report = None
    write_cycle_summary(
        summary_out,
        [{
            "status": "blocked",
            "reason": reason,
            **details,
            "checkpoint_candidates": checkpoint_report,
        }],
    )


def main(argv: list[str] | None = None) -> int:
    """Run one frozen benchmark, or every runnable variant/seed combination.

    Returns:
        3 when asset hydration raised FileNotFoundError, 2 when
        ``--require-ready`` is set and preflight is not ready, 0/1 for
        ``--all-runnable`` (0 only if every run exited 0), otherwise the
        runner's own return code.
    """
    args = parse_args(argv)
    cache_dir = Path(os.path.expanduser("~/.cache/autoresearch"))
    report = None
    token = active_hf_token()
    routing = resolve_routing(token=token)
    output_repo = args.output_repo or routing.output_repo
    # NOTE(review): the tokenizer fallback is also routing.output_repo — confirm this is intended.
    tokenizer_repo = args.tokenizer_repo or routing.output_repo

    if args.hydrate_assets:
        try:
            ensure_benchmark_assets(
                cache_dir=cache_dir,
                output_repo=output_repo,
                tokenizer_repo=tokenizer_repo,
                token=token,
                hydrate=True,
            )
        except FileNotFoundError as exc:
            _write_blocked_summary(
                args.summary_out,
                reason="asset_hydration_failed",
                output_repo=output_repo,
                token=token,
                error=str(exc),
            )
            return 3

    if args.preflight_out is not None:
        report = build_preflight_report(
            cache_dir=cache_dir,
            output_repo=output_repo,
            tokenizer_repo=tokenizer_repo,
        )
        write_preflight_report(args.preflight_out, report)

    if args.require_ready:
        # Reuse the report built for --preflight-out when there is one.
        if report is None:
            report = build_preflight_report(
                cache_dir=cache_dir,
                output_repo=output_repo,
                tokenizer_repo=tokenizer_repo,
            )
        if not bool(report.get("ready_for_hydra_benchmarks")):
            _write_blocked_summary(
                args.summary_out,
                reason="preflight_not_ready",
                output_repo=output_repo,
                token=token,
                preflight=report,
            )
            return 2

    freeze = load_cycle_freeze(args.freeze)

    if args.all_runnable:
        # --all-benchmarks only takes effect together with --all-runnable.
        benchmarks = load_cycle_benchmarks(args.suite) if args.all_benchmarks else [args.benchmark]
        plan: list[dict[str, Any]] = []
        for benchmark in benchmarks:
            plan.extend(build_cycle_plan(freeze, benchmark=benchmark, out_dir=args.out_dir))
        results = execute_cycle_plan(plan, repo_root=REPO_ROOT)
        if args.summary_out is not None:
            write_cycle_summary(args.summary_out, results)
        return 0 if all(item["returncode"] == 0 for item in results) else 1

    cmd, env = build_benchmark_command(
        freeze,
        benchmark=args.benchmark,
        variant=args.variant,
        seed=args.seed,
        out_dir=args.out_dir,
    )
    proc = subprocess.run(cmd, cwd=str(REPO_ROOT), env=env)
    if args.summary_out is not None:
        write_cycle_summary(
            args.summary_out,
            [{
                "benchmark": args.benchmark,
                "variant": args.variant,
                "seed": args.seed,
                "returncode": proc.returncode,
            }],
        )
    return proc.returncode
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
if __name__ == "__main__":
    # Use main()'s integer result as the process exit status.
    sys.exit(main())
|
overlay/scripts/export_hpo_priors.py
CHANGED
|
@@ -9,6 +9,8 @@ from typing import Any
|
|
| 9 |
|
| 10 |
import optuna
|
| 11 |
|
|
|
|
|
|
|
| 12 |
|
| 13 |
def parse_args() -> argparse.Namespace:
|
| 14 |
parser = argparse.ArgumentParser(description="Export top Optuna trials as transfer-learning priors")
|
|
@@ -35,38 +37,56 @@ def _serialize_trial(trial: optuna.trial.FrozenTrial) -> dict[str, Any]:
|
|
| 35 |
}
|
| 36 |
|
| 37 |
|
| 38 |
-
def
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
row["study_name"]
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
"generated_at": dt.datetime.now(dt.UTC).isoformat(timespec="seconds"),
|
| 59 |
"study_names": study_names,
|
| 60 |
-
"metric":
|
| 61 |
-
"n_total_trials":
|
| 62 |
-
"n_completed_trials":
|
| 63 |
-
"
|
| 64 |
-
"
|
|
|
|
|
|
|
|
|
|
| 65 |
}
|
| 66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
args.out.parent.mkdir(parents=True, exist_ok=True)
|
| 68 |
args.out.write_text(json.dumps(payload, indent=2), encoding="utf-8")
|
| 69 |
-
print(
|
|
|
|
|
|
|
|
|
|
| 70 |
return 0
|
| 71 |
|
| 72 |
|
|
|
|
| 9 |
|
| 10 |
import optuna
|
| 11 |
|
| 12 |
+
from scripts.hpo_leaderboard import build_leaderboard
|
| 13 |
+
|
| 14 |
|
| 15 |
def parse_args() -> argparse.Namespace:
|
| 16 |
parser = argparse.ArgumentParser(description="Export top Optuna trials as transfer-learning priors")
|
|
|
|
| 37 |
}
|
| 38 |
|
| 39 |
|
| 40 |
+
def collect_prior_trials(*, storage: str, study_names: list[str], top_k: int, metric: str) -> dict[str, Any]:
    """Build the priors payload from the leaderboard of the given studies.

    The first *top_k* clean trials (in leaderboard order) are exported;
    every contaminated trial is listed separately under
    ``quarantined_trials`` together with its contamination reason. A
    negative *top_k* exports nothing.
    """
    board = build_leaderboard(storage=storage, study_names=study_names, metric=metric)
    shared_fields = ("study_name", "trial_number", "value", "params", "user_attrs")
    quarantine_fields = (*shared_fields, "contamination_reason")

    exported = [
        {field: row[field] for field in shared_fields}
        for row in board["clean_trials"][: max(0, top_k)]
    ]
    quarantined = [
        {field: row[field] for field in quarantine_fields}
        for row in board["contaminated_trials"]
    ]
    studies = board["studies"]

    return {
        "schema_version": 2,
        "generated_at": dt.datetime.now(dt.UTC).isoformat(timespec="seconds"),
        "study_names": study_names,
        "metric": metric,
        "n_total_trials": sum(int(study["n_trials"]) for study in studies),
        "n_completed_trials": sum(int(study["n_completed"]) for study in studies),
        "n_exported_trials": len(exported),
        "n_quarantined_trials": len(quarantined),
        "top_k": top_k,
        "trials": exported,
        "quarantined_trials": quarantined,
    }
|
| 77 |
|
| 78 |
+
|
| 79 |
+
def main() -> int:
    """Export the priors payload to ``args.out`` and print a one-line summary.

    Falls back to the study name ``hydra_hpo`` when none is given.
    """
    args = parse_args()
    study_names = args.study_name or ["hydra_hpo"]
    payload = collect_prior_trials(
        storage=args.storage,
        study_names=study_names,
        top_k=args.top_k,
        metric=args.metric,
    )

    destination = args.out
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(payload, indent=2), encoding="utf-8")

    message = (
        f"[hpo-priors] wrote {args.out} with {payload['n_exported_trials']} clean trials "
        f"({payload['n_quarantined_trials']} quarantined)"
    )
    print(message)
    return 0
|
| 91 |
|
| 92 |
|
overlay/scripts/hpo_component_report.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import datetime as dt
|
| 6 |
+
import json
|
| 7 |
+
import math
|
| 8 |
+
from collections import defaultdict
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Any
|
| 11 |
+
|
| 12 |
+
from scripts.hpo_leaderboard import build_leaderboard
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Hyperparameter names for which the component report builds ablation groups
# and, where values are numeric, correlations.
_COMPONENT_KEYS = [
    "engram_subsample",
    "htm_subsample",
    "htm_learn_every",
    "engram_n_columns",
    "engram_layer_idx",
    "sdr_target_active",
    "mamba3_chunk",
    "dropout",
    "hyena_layers",
]
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _recover_params(row: dict[str, Any]) -> dict[str, Any]:
    """Return the trial's params, filled in from ``param_*`` user attributes.

    Values already present in ``row["params"]`` take precedence; a
    ``param_<name>`` attribute only supplies ``<name>`` when it is missing.
    """
    prefix = "param_"
    recovered = dict(row.get("params") or {})
    for attr_name, attr_value in (row.get("user_attrs") or {}).items():
        if not attr_name.startswith(prefix):
            continue
        recovered.setdefault(attr_name.removeprefix(prefix), attr_value)
    return recovered
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _pearson(xs: list[float], ys: list[float]) -> float | None:
|
| 38 |
+
if len(xs) < 2 or len(xs) != len(ys):
|
| 39 |
+
return None
|
| 40 |
+
mean_x = sum(xs) / len(xs)
|
| 41 |
+
mean_y = sum(ys) / len(ys)
|
| 42 |
+
cov = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys))
|
| 43 |
+
var_x = sum((x - mean_x) ** 2 for x in xs)
|
| 44 |
+
var_y = sum((y - mean_y) ** 2 for y in ys)
|
| 45 |
+
if var_x <= 0 or var_y <= 0:
|
| 46 |
+
return None
|
| 47 |
+
return cov / math.sqrt(var_x * var_y)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def build_component_report(*, storage: str, study_names: list[str], metric: str = "val_bpb") -> dict[str, Any]:
    """Summarise how each component hyperparameter relates to the metric and tps.

    For every key in ``_COMPONENT_KEYS`` the clean leaderboard trials are
    grouped by that parameter's value (``component_ablations``). Where both
    the value and the trial's ``tps`` are numeric, Pearson correlations with
    the metric and with tps are computed (``numeric_correlations``).

    Ablation rows are ordered numerically for numeric values, with
    non-numeric values after them in string order.
    """
    leaderboard = build_leaderboard(storage=storage, study_names=study_names, metric=metric)
    clean_trials = leaderboard["clean_trials"]

    # Recover params once per trial rather than once per (trial, key) pair.
    recovered = [(row, _recover_params(row)) for row in clean_trials]

    def _value_order(entry: dict[str, Any]) -> tuple[int, float, str]:
        # Numeric values sort by magnitude; a plain str() sort puts 16 before 2.
        value = entry["value"]
        if isinstance(value, (int, float)):
            return (0, float(value), "")
        return (1, 0.0, str(value))

    ablations: dict[str, list[dict[str, Any]]] = {}
    numeric_correlations: list[dict[str, Any]] = []

    for key in _COMPONENT_KEYS:
        grouped: dict[str, list[dict[str, Any]]] = defaultdict(list)
        numeric_x: list[float] = []
        metric_y: list[float] = []
        tps_y: list[float] = []
        for row, params in recovered:
            if key not in params:
                continue
            value = params[key]
            grouped[str(value)].append({"value": value, "metric": float(row["value"]), "tps": row.get("tps")})
            # Both correlations use the same paired sample: numeric value AND numeric tps.
            if isinstance(value, (int, float)) and isinstance(row.get("tps"), (int, float)):
                numeric_x.append(float(value))
                metric_y.append(float(row["value"]))
                tps_y.append(float(row["tps"]))

        rows: list[dict[str, Any]] = []
        for grouped_rows in grouped.values():
            metric_vals = [r["metric"] for r in grouped_rows]
            tps_vals = [float(r["tps"]) for r in grouped_rows if isinstance(r["tps"], (int, float))]
            rows.append({
                "value": grouped_rows[0]["value"],
                "n_trials": len(grouped_rows),
                "mean_metric": sum(metric_vals) / len(metric_vals),
                "mean_tps": (sum(tps_vals) / len(tps_vals)) if tps_vals else None,
            })
        if rows:
            rows.sort(key=_value_order)
            ablations[key] = rows

        pearson_metric = _pearson(numeric_x, metric_y)
        pearson_tps = _pearson(numeric_x, tps_y)
        if pearson_metric is not None or pearson_tps is not None:
            numeric_correlations.append({
                "param": key,
                "pearson_with_metric": pearson_metric,
                "pearson_with_tps": pearson_tps,
                "n_points": len(numeric_x),
            })

    numeric_correlations.sort(key=lambda row: row["param"])
    return {
        "schema_version": 1,
        "generated_at": dt.datetime.now(dt.UTC).isoformat(timespec="seconds"),
        "metric": metric,
        "study_names": study_names,
        "n_clean_trials": len(clean_trials),
        "component_ablations": ablations,
        "numeric_correlations": numeric_correlations,
    }
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
| 111 |
+
parser = argparse.ArgumentParser(description="Build component ablation and correlation report from clean HPO trials")
|
| 112 |
+
parser.add_argument("--storage", default="sqlite:///optuna_hpo.db")
|
| 113 |
+
parser.add_argument("--study-name", action="append", default=[])
|
| 114 |
+
parser.add_argument("--metric", default="val_bpb")
|
| 115 |
+
parser.add_argument("--out", type=Path, default=Path(".tmp") / "optuna" / "component_report.json")
|
| 116 |
+
return parser.parse_args(argv)
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def main(argv: list[str] | None = None) -> int:
    """Build the component report, write it to ``--out`` and echo it to stdout.

    Falls back to the study name ``hydra_hpo`` when none is given.
    """
    args = parse_args(argv)
    study_names = args.study_name or ["hydra_hpo"]
    payload = build_component_report(storage=args.storage, study_names=study_names, metric=args.metric)

    destination = args.out
    destination.parent.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(payload, indent=2, sort_keys=True)
    destination.write_text(rendered, encoding="utf-8")
    print(rendered)
    return 0
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
if __name__ == "__main__":
    # Use main()'s integer result as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
overlay/scripts/hpo_leaderboard.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import datetime as dt
|
| 6 |
+
import json
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any
|
| 9 |
+
|
| 10 |
+
import optuna
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _trial_direction(study: optuna.Study) -> str:
    """Return ``"maximize"`` or ``"minimize"`` for the study's direction."""
    if study.direction == optuna.study.StudyDirection.MAXIMIZE:
        return "maximize"
    return "minimize"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _contamination_reason(trial: optuna.trial.FrozenTrial, metric: str) -> str | None:
    """Return why *trial* must not be ranked as clean, or None when it is clean.

    Checks run in this order: missing value, a fallback/missing objective
    source, an eval status other than ``completed``, and an objective
    metric that differs from *metric*. Absent attributes count as clean.
    """
    if trial.value is None:
        return "missing_value"

    user_attrs = trial.user_attrs
    objective_source = user_attrs.get("objective_source")
    status = user_attrs.get("eval_status")
    recorded_metric = user_attrs.get("objective_metric")

    untrusted_sources = {"train_log_fallback", "missing_metric", "missing_metrics", "missing_final_val"}
    acceptable_statuses = {None, "completed"}
    acceptable_metrics = {None, metric}

    if objective_source in untrusted_sources:
        return f"objective_source={objective_source}"
    if status not in acceptable_statuses:
        return f"eval_status={status}"
    if recorded_metric not in acceptable_metrics:
        return f"objective_metric={recorded_metric}"
    return None
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _serialize_trial(study_name: str, trial: optuna.trial.FrozenTrial, metric: str) -> dict[str, Any]:
    """Flatten *trial* into a leaderboard row.

    The row gains a ``contamination_reason`` key only when
    ``_contamination_reason`` reports one. A missing objective source is
    recorded as ``legacy_completed_value``.
    """
    user_attrs = dict(trial.user_attrs)
    raw_value = trial.value
    serialized: dict[str, Any] = {
        "study_name": study_name,
        "trial_number": trial.number,
        "value": None if raw_value is None else float(raw_value),
        "metric": metric,
        "objective_source": user_attrs.get("objective_source") or "legacy_completed_value",
        "objective_metric": user_attrs.get("objective_metric", metric),
        "eval_status": user_attrs.get("eval_status"),
        "hf_job_id": user_attrs.get("hf_job_id"),
        "tps": user_attrs.get("tps"),
        "params": dict(trial.params),
        "user_attrs": user_attrs,
    }
    contamination = _contamination_reason(trial, metric)
    if contamination is not None:
        serialized["contamination_reason"] = contamination
    return serialized
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _is_pareto_dominated(
    candidate: dict[str, Any],
    peers: list[dict[str, Any]],
    direction: str = "minimize",
) -> bool:
    """Return True when some peer is at least as good on both axes and better on one.

    The two axes are the objective ``value`` (lower is better unless
    *direction* is ``"maximize"``) and ``tps`` (higher is always better).
    Peers without ``tps`` and the candidate itself are ignored.
    """
    maximize = direction == "maximize"
    candidate_value = float(candidate["value"])
    candidate_tps = float(candidate["tps"])
    for peer in peers:
        if peer is candidate or peer.get("tps") is None:
            continue
        peer_value = float(peer["value"])
        peer_tps = float(peer["tps"])
        if maximize:
            value_no_worse = peer_value >= candidate_value
            value_better = peer_value > candidate_value
        else:
            value_no_worse = peer_value <= candidate_value
            value_better = peer_value < candidate_value
        no_worse = value_no_worse and peer_tps >= candidate_tps
        strictly_better = value_better or peer_tps > candidate_tps
        if no_worse and strictly_better:
            return True
    return False


def _annotate_pareto(clean_trials: list[dict[str, Any]], direction: str = "minimize") -> list[dict[str, Any]]:
    """Tag every clean trial with Pareto fields and return the frontier.

    Rows are mutated in place with ``pareto_frontier``, ``pareto_dominated``
    and ``pareto_reason``. Rows without ``tps`` cannot be compared and are
    marked ``missing_tps``. The returned frontier is ordered best objective
    value first (respecting *direction*), then highest tps.
    """
    pareto_trials: list[dict[str, Any]] = []
    comparable = [row for row in clean_trials if row.get("tps") is not None]
    for row in clean_trials:
        if row.get("tps") is None:
            row["pareto_frontier"] = False
            row["pareto_dominated"] = None
            row["pareto_reason"] = "missing_tps"
            continue
        dominated = _is_pareto_dominated(row, comparable, direction)
        row["pareto_frontier"] = not dominated
        row["pareto_dominated"] = dominated
        row["pareto_reason"] = "frontier" if not dominated else "dominated"
        if not dominated:
            pareto_trials.append(row)
    # Negating the value puts the best (largest) value first for maximised metrics.
    sign = -1.0 if direction == "maximize" else 1.0
    pareto_trials.sort(key=lambda row: (sign * float(row["value"]), -float(row["tps"])))
    return pareto_trials


def build_leaderboard(*, storage: str, study_names: list[str], metric: str = "val_bpb") -> dict[str, Any]:
    """Merge the given Optuna studies into one leaderboard payload.

    Trials with a value are split into clean and contaminated lists (see
    ``_contamination_reason``), each ranked best-first for the study
    direction. Clean trials are also annotated with Pareto information over
    (metric, tps) using that same direction.
    """
    clean_trials: list[dict[str, Any]] = []
    contaminated_trials: list[dict[str, Any]] = []
    study_summaries: list[dict[str, Any]] = []
    direction = "minimize"

    for study_name in study_names:
        study = optuna.load_study(study_name=study_name, storage=storage)
        # NOTE: when merged studies disagree, the last study's direction wins.
        direction = _trial_direction(study)
        completed = [t for t in study.trials if t.value is not None]
        study_summaries.append({
            "study_name": study_name,
            "direction": direction,
            "n_trials": len(study.trials),
            "n_completed": len(completed),
        })
        for trial in completed:
            row = _serialize_trial(study_name, trial, metric)
            if "contamination_reason" in row:
                contaminated_trials.append(row)
            else:
                clean_trials.append(row)

    reverse = direction == "maximize"
    clean_trials.sort(key=lambda row: float(row["value"]), reverse=reverse)
    contaminated_trials.sort(key=lambda row: float(row["value"]), reverse=reverse)
    pareto_trials = _annotate_pareto(clean_trials, direction)

    return {
        "schema_version": 1,
        "generated_at": dt.datetime.now(dt.UTC).isoformat(timespec="seconds"),
        "metric": metric,
        "direction": direction,
        "study_names": study_names,
        "studies": study_summaries,
        "n_clean_trials": len(clean_trials),
        "n_contaminated_trials": len(contaminated_trials),
        "pareto_metric_x": metric,
        "pareto_metric_y": "tps",
        "n_pareto_trials": len(pareto_trials),
        "clean_trials": clean_trials,
        "contaminated_trials": contaminated_trials,
        "pareto_trials": pareto_trials,
    }
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
| 137 |
+
parser = argparse.ArgumentParser(description="Build a clean Optuna HPO leaderboard")
|
| 138 |
+
parser.add_argument("--storage", default="sqlite:///optuna_hpo.db")
|
| 139 |
+
parser.add_argument("--study-name", action="append", default=[], help="Repeat to merge multiple studies")
|
| 140 |
+
parser.add_argument("--metric", default="val_bpb")
|
| 141 |
+
parser.add_argument("--out", type=Path, default=Path(".tmp") / "optuna" / "leaderboard.json")
|
| 142 |
+
return parser.parse_args(argv)
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def main(argv: list[str] | None = None) -> int:
    """Build the leaderboard, write it to ``--out`` and echo it to stdout.

    Falls back to the study name ``hydra_hpo`` when none is given.
    """
    args = parse_args(argv)
    study_names = args.study_name or ["hydra_hpo"]
    payload = build_leaderboard(storage=args.storage, study_names=study_names, metric=args.metric)

    destination = args.out
    destination.parent.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(payload, indent=2, sort_keys=True)
    destination.write_text(rendered, encoding="utf-8")
    print(rendered)
    return 0
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
if __name__ == "__main__":
    # Use main()'s integer result as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
overlay/scripts/hpo_retest.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import datetime as dt
|
| 6 |
+
import json
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any
|
| 9 |
+
|
| 10 |
+
import optuna
|
| 11 |
+
|
| 12 |
+
from scripts.hpo_leaderboard import build_leaderboard
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Maps each tunable parameter name to its environment variable. Every
# variable is "HYDRA_" followed by the upper-cased parameter name.
_PARAM_TO_ENV = {
    param_name: f"HYDRA_{param_name.upper()}"
    for param_name in (
        "d_model",
        "n_layer",
        "d_state",
        "headdim",
        "expand",
        "seq_len",
        "batch_size",
        "matrix_lr",
        "embed_lr",
        "unembed_lr",
        "engram_n_columns",
        "engram_layer_idx",
        "sdr_target_active",
        "htm_learn_every",
        "htm_subsample",
        "engram_subsample",
        "mamba3_chunk",
        "dropout",
    )
}
|
| 35 |
+
|
| 36 |
+
# Baseline environment for every retest candidate; per-trial parameters are
# layered on top of a copy of this mapping.
_DEFAULT_ENV = dict(
    HYDRA_USE_NEMOTRON="1",
    HYDRA_LOCAL_SHARDS_ONLY="0",
    HYDRA_THROUGHPUT_MODE="0",
    HYDRA_FASTPATH="0",
    HYDRA_FORCE_HTM_CPU="0",
    HYDRA_INERT_MAMBA="0",
    HYDRA_ALLOW_SYNTHETIC_RETINA="0",
    HYDRA_HTM_FUSED="1",
    HYDRA_HYENA_LAYERS="",
    HYDRA_CKPT_INTERVAL="0",
    HYDRA_ENGRAM_SUBSAMPLE="1",
    HYDRA_HTM_SUBSAMPLE="2",
    HYDRA_HTM_LEARN_EVERY="8",
)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def _recover_params(row: dict[str, Any]) -> dict[str, Any]:
|
| 54 |
+
params = dict(row.get("params") or {})
|
| 55 |
+
attrs = row.get("user_attrs") or {}
|
| 56 |
+
for key, value in attrs.items():
|
| 57 |
+
if key.startswith("param_"):
|
| 58 |
+
params.setdefault(key.removeprefix("param_"), value)
|
| 59 |
+
return params
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _candidate_env(params: dict[str, Any], *, eval_tokens: int, eval_batch: int, time_budget: int) -> dict[str, str]:
    """Build the HYDRA_* environment for re-running one candidate.

    Layers, in order: ``_DEFAULT_ENV``, the evaluation settings, then every
    entry of ``params`` that has a mapping in ``_PARAM_TO_ENV``. When both
    batch size and sequence length end up in the environment,
    ``HYDRA_TOTAL_BATCH`` is derived as batch * seq_len * grad_accum
    (``grad_accum`` is read from ``params``, defaulting to 16).

    Args:
        params: Recovered trial parameters.
        eval_tokens: Value for ``HYDRA_EVAL_TOKENS``.
        eval_batch: Value for ``HYDRA_EVAL_BATCH``.
        time_budget: Value for ``HYDRA_TIME_BUDGET``.

    Returns:
        A fresh ``str -> str`` environment mapping.
    """
    env = {
        **_DEFAULT_ENV,
        "HYDRA_EVAL_TOKENS": str(eval_tokens),
        "HYDRA_EVAL_BATCH": str(eval_batch),
        "HYDRA_TIME_BUDGET": str(time_budget),
    }
    # Parameters without an env mapping are dropped on purpose.
    env.update(
        {_PARAM_TO_ENV[name]: str(setting) for name, setting in params.items() if name in _PARAM_TO_ENV}
    )

    batch_size = env.get("HYDRA_BATCH_SIZE")
    seq_len = env.get("HYDRA_SEQ_LEN")
    if batch_size is not None and seq_len is not None:
        accum_steps = int(params.get("grad_accum", 16))
        env["HYDRA_TOTAL_BATCH"] = str(int(batch_size) * int(seq_len) * accum_steps)
    return env
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def build_retest_plan(
    *,
    storage: str,
    study_names: list[str],
    top_k: int,
    metric: str = "val_bpb",
    eval_tokens: int = 16384,
    eval_batch: int = 2,
    time_budget: int = 420,
) -> dict[str, Any]:
    """Select the best historical trials and describe how to re-run them.

    Contaminated and clean leaderboard rows are pooled, ranked by their
    recorded value (descending when the leaderboard direction is
    ``"maximize"``, ascending otherwise) and the first ``top_k`` are turned
    into retest candidates.

    Args:
        storage: Optuna storage URL passed to ``build_leaderboard``.
        study_names: Studies to pool.
        top_k: Number of candidates to keep; values <= 0 yield none.
        metric: Metric name passed to ``build_leaderboard``.
        eval_tokens: Evaluation token count written into each candidate env.
        eval_batch: Evaluation batch size written into each candidate env.
        time_budget: Time budget written into each candidate env.

    Returns:
        A JSON-serialisable plan with metadata and a ``candidates`` list.
    """
    board = build_leaderboard(storage=storage, study_names=study_names, metric=metric)
    pooled = [*board["contaminated_trials"], *board["clean_trials"]]
    best_first = sorted(
        pooled,
        key=lambda entry: float(entry["value"]),
        reverse=board["direction"] == "maximize",
    )

    def _as_candidate(entry: dict[str, Any]) -> dict[str, Any]:
        recovered = _recover_params(entry)
        run_env = _candidate_env(
            recovered,
            eval_tokens=eval_tokens,
            eval_batch=eval_batch,
            time_budget=time_budget,
        )
        # Rows without a contamination reason are retested for canonical eval.
        why = entry.get("contamination_reason") or "canonical_truth_eval_retest"
        return {
            "study_name": entry["study_name"],
            "trial_number": entry["trial_number"],
            "source_value": entry["value"],
            "source_objective": entry["objective_source"],
            "source_job_id": entry.get("hf_job_id"),
            "needs_retest_reason": why,
            "params": recovered,
            "env": run_env,
        }

    keep = max(0, top_k)
    candidates = [_as_candidate(entry) for entry in best_first[:keep]]
    return {
        "schema_version": 1,
        "generated_at": dt.datetime.now(dt.UTC).isoformat(timespec="seconds"),
        "metric": metric,
        "study_names": study_names,
        "eval_tokens": eval_tokens,
        "eval_batch": eval_batch,
        "time_budget": time_budget,
        "n_candidates": len(candidates),
        "candidates": candidates,
    }
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options for the retest planner.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Namespace with ``storage``, ``study_name`` (list), ``metric``,
        ``top_k``, ``eval_tokens``, ``eval_batch``, ``time_budget`` and ``out``.
    """
    parser = argparse.ArgumentParser(description="Plan canonical-eval retests for historical HPO configs")
    add = parser.add_argument
    add("--storage", default="sqlite:///optuna_hpo.db")
    # Repeatable flag: each occurrence appends one study name.
    add("--study-name", action="append", default=[])
    add("--metric", default="val_bpb")
    integer_options = (
        ("--top-k", 10),
        ("--eval-tokens", 16384),
        ("--eval-batch", 2),
        ("--time-budget", 420),
    )
    for flag, fallback in integer_options:
        add(flag, type=int, default=fallback)
    add("--out", type=Path, default=Path(".tmp") / "optuna" / "retest_plan.json")
    return parser.parse_args(argv)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: build the retest plan, write it to disk and print it.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Process exit code (always 0 on success).
    """
    args = parse_args(argv)
    plan = build_retest_plan(
        storage=args.storage,
        # Fall back to the default study when no --study-name was given.
        study_names=args.study_name or ["hydra_hpo"],
        top_k=args.top_k,
        metric=args.metric,
        eval_tokens=args.eval_tokens,
        eval_batch=args.eval_batch,
        time_budget=args.time_budget,
    )
    destination = args.out
    destination.parent.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(plan, indent=2, sort_keys=True)
    destination.write_text(rendered, encoding="utf-8")
    print(rendered)
    return 0
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
if __name__ == "__main__":
|
| 151 |
+
raise SystemExit(main())
|
overlay/scripts/hydra_generation.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Callable
|
| 7 |
+
|
| 8 |
+
import torch
|
| 9 |
+
|
| 10 |
+
from scripts.benchmark_checkpoint import hydrate_checkpoint
|
| 11 |
+
from scripts.hf_routing import resolve_routing
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def default_checkpoint_path() -> Path:
    """Return the fallback checkpoint location under the user's cache dir."""
    expanded = os.path.expanduser("~/.cache/autoresearch/latest.pt")
    return Path(expanded)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def checkpoint_candidates(*, cache_dir: Path | None = None) -> list[Path]:
    """List checkpoint files to probe, in priority order.

    Args:
        cache_dir: Directory holding the checkpoints; defaults to
            ``~/.cache/autoresearch`` when not given.

    Returns:
        Paths for ``best_bpb.pt``, ``pretrain_final.pt`` and ``latest.pt``
        (existence is not checked here).
    """
    root = cache_dir if cache_dir else Path(os.path.expanduser("~/.cache/autoresearch"))
    preferred_order = ("best_bpb.pt", "pretrain_final.pt", "latest.pt")
    return [root / filename for filename in preferred_order]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def resolve_checkpoint_path(explicit_path: Path | None, *, cache_dir: Path | None = None) -> Path:
    """Pick the checkpoint file to load.

    Args:
        explicit_path: Caller-supplied path; returned as-is (without an
            existence check) when not ``None``.
        cache_dir: Directory to search when no explicit path is given.

    Returns:
        The explicit path, else the first existing candidate from
        ``checkpoint_candidates``, else ``default_checkpoint_path()``.
    """
    if explicit_path is not None:
        return explicit_path
    first_existing = next(
        (path for path in checkpoint_candidates(cache_dir=cache_dir) if path.exists()),
        None,
    )
    return first_existing if first_existing is not None else default_checkpoint_path()
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def validate_checkpoint_compatibility(
    *,
    baseline_arch: str,
    missing_keys: list[str],
    unexpected_keys: list[str],
    total_model_keys: int,
) -> None:
    """Reject a checkpoint whose state dict does not fit the model.

    Args:
        baseline_arch: Requested architecture name; ``"transformer"``
            tolerates no key mismatch at all.
        missing_keys: Model keys absent from the checkpoint.
        unexpected_keys: Checkpoint keys the model does not have.
        total_model_keys: Number of keys in the model's state dict; when it
            is not positive the proportional check is skipped.

    Raises:
        RuntimeError: On any mismatch for the transformer baseline, or when
            the mismatch count exceeds ``max(8, total_model_keys // 2)``.
    """
    any_mismatch = bool(missing_keys or unexpected_keys)
    if any_mismatch and baseline_arch == "transformer":
        raise RuntimeError(
            "checkpoint incompatible with transformer baseline architecture; "
            "use a transformer-trained checkpoint or keep HYDRA_BASELINE_ARCH=mamba3"
        )

    n_mismatched = len(missing_keys) + len(unexpected_keys)
    if total_model_keys <= 0:
        return
    # A handful of stray keys is tolerated; more than half (min 8) is not.
    allowed = max(8, total_model_keys // 2)
    if n_mismatched > allowed:
        raise RuntimeError("checkpoint incompatible with requested model architecture")
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def generate_from_callable(
    generator: Callable[[str], str] | Callable[..., str],
    prompt: str,
    *,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
) -> str:
    """Run ``generator`` on ``prompt`` and return its stripped text output.

    Args:
        generator: Callable invoked as ``generator(prompt, max_new_tokens=...,
            temperature=..., top_p=...)``.
        prompt: Text handed to the generator as its only positional argument.
        max_new_tokens: Forwarded to the generator.
        temperature: Forwarded to the generator.
        top_p: Forwarded to the generator.

    Returns:
        ``str()`` of the generator's result with surrounding whitespace removed.
    """
    sampling_options = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_p": top_p,
    }
    raw_output = generator(prompt, **sampling_options)
    return str(raw_output).strip()
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def load_hydra_causal_lm(checkpoint_path: Path | None = None, device: str | None = None):
    """Load a Hydra checkpoint and wrap it for ``transformers`` generation.

    Steps: resolve the checkpoint path (trying ``hydrate_checkpoint`` when the
    file is not present locally), build the inner model from the config stored
    in the checkpoint, load its weights non-strictly, validate the key
    mismatch, and wrap the result in a minimal ``PreTrainedModel`` subclass.

    Args:
        checkpoint_path: Explicit checkpoint file; ``None`` lets
            ``resolve_checkpoint_path`` choose one.
        device: Target device string; ``None`` selects ``"cuda"`` when
            available, otherwise ``"cpu"``.

    Returns:
        Tuple ``(tokenizer, model, bos, resolved_device, GenerationConfig)``;
        the last element is the ``transformers.GenerationConfig`` class itself.

    Raises:
        FileNotFoundError: If no checkpoint file exists after hydration.
        RuntimeError: From ``validate_checkpoint_compatibility`` when the
            checkpoint does not fit the model.
    """
    ckpt_path = resolve_checkpoint_path(checkpoint_path)
    if not ckpt_path.exists():
        # Not on disk: ask hydrate_checkpoint for it (presumably fetched from
        # the routed output repo — confirm against scripts.benchmark_checkpoint).
        hydrated = hydrate_checkpoint(
            cache_dir=ckpt_path.parent,
            output_repo=resolve_routing(token=os.environ.get("HF_TOKEN")).output_repo,
            token=os.environ.get("HF_TOKEN"),
        )
        if hydrated is not None:
            ckpt_path = hydrated
    if not ckpt_path.exists():
        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")

    # Heavy / project imports are done at call time rather than module import.
    from transformers import GenerationConfig, GenerationMixin, PretrainedConfig, PreTrainedModel
    from transformers.modeling_outputs import CausalLMOutputWithPast

    from hydra.config import PostSemClawConfig
    from hydra.model import PostSemClawModel
    from prepare import Tokenizer

    resolved_device = device or ("cuda" if torch.cuda.is_available() else "cpu")

    class _HydraGenConfig(PretrainedConfig):
        # Minimal config: only vocab_size is stored on top of the base class.
        model_type = "hydra"

        def __init__(self, vocab_size: int = 65536, **kw):
            super().__init__(**kw)
            self.vocab_size = vocab_size

    class HydraForCausalLM(PreTrainedModel, GenerationMixin):
        # Thin adapter exposing the inner model through the generate() API.
        config_class = _HydraGenConfig

        def __init__(self, gen_config, inner_model):
            super().__init__(gen_config)
            self.inner = inner_model
            self.config.vocab_size = gen_config.vocab_size

        def forward(self, input_ids, attention_mask=None, **kw):
            # attention_mask and extra kwargs are accepted but not used;
            # no loss and no key/value cache are returned.
            logits = self.inner(input_ids)
            return CausalLMOutputWithPast(loss=None, logits=logits, past_key_values=None)

        def prepare_inputs_for_generation(self, input_ids, **kw):
            # Everything except input_ids is dropped before forward().
            return {"input_ids": input_ids}

        def get_input_embeddings(self):
            return self.inner.wte

        def can_generate(self) -> bool:
            return True

        @property
        def _supports_cache_class(self):
            # NOTE(review): private transformers hook; presumably disables
            # Cache-class usage — confirm against the installed version.
            return False

    tokenizer = Tokenizer.from_directory()
    vocab_size = tokenizer.get_vocab_size()
    bos = tokenizer.get_bos_token_id()
    # NOTE(review): weights_only=False unpickles arbitrary Python objects; the
    # file may have been downloaded above, so only use trusted checkpoints.
    ckpt = torch.load(str(ckpt_path), map_location="cpu", weights_only=False)
    cfg = PostSemClawConfig(**ckpt["config"])
    # Build on the meta device (no real storage), then allocate on the target.
    with torch.device("meta"):
        inner = PostSemClawModel(cfg)
    inner.to_empty(device=resolved_device)
    # NOTE(review): to_empty leaves storage uninitialized, and strict=False
    # means keys missing from the checkpoint are never filled in; the check
    # below still tolerates a limited number of such mismatches — confirm
    # that is acceptable.
    missing, unexpected = inner.load_state_dict(ckpt["model_state_dict"], strict=False)
    validate_checkpoint_compatibility(
        baseline_arch=os.environ.get("HYDRA_BASELINE_ARCH", "mamba3").strip().lower(),
        missing_keys=list(missing),
        unexpected_keys=list(unexpected),
        total_model_keys=len(inner.state_dict()),
    )
    inner.eval()

    gen_cfg = _HydraGenConfig(vocab_size=vocab_size)
    # The BOS id doubles as EOS and PAD.
    gen_cfg.bos_token_id = bos
    gen_cfg.eos_token_id = bos
    gen_cfg.pad_token_id = bos
    model = HydraForCausalLM(gen_cfg, inner).to(resolved_device)
    model.eval()
    return tokenizer, model, bos, resolved_device, GenerationConfig
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def build_hydra_generator(
    *,
    checkpoint_path: Path | None = None,
    device: str | None = None,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
):
    """Load the Hydra model once and return a ``prompt -> text`` function.

    Args:
        checkpoint_path: Forwarded to ``load_hydra_causal_lm``.
        device: Forwarded to ``load_hydra_causal_lm``.
        max_new_tokens: Generation length limit used for every call.
        temperature: Sampling temperature; sampling is enabled only when it
            is greater than 0.
        top_p: Nucleus-sampling threshold.

    Returns:
        A callable taking a prompt string and returning the decoded token
        sequence produced by ``model.generate``.
    """
    tokenizer, model, bos, resolved_device, GenerationConfig = load_hydra_causal_lm(
        checkpoint_path=checkpoint_path,
        device=device,
    )

    def _generate(prompt: str) -> str:
        prompt_ids = torch.tensor([tokenizer.encode(prompt)], dtype=torch.long, device=resolved_device)
        sampling = GenerationConfig(
            max_new_tokens=max_new_tokens,
            use_cache=False,
            do_sample=temperature > 0.0,
            temperature=temperature,
            top_p=top_p,
            bos_token_id=bos,
            eos_token_id=bos,
            pad_token_id=bos,
        )
        with torch.no_grad():
            if str(resolved_device).startswith("cuda"):
                # bfloat16 autocast is applied on CUDA only.
                with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
                    generated = model.generate(prompt_ids, generation_config=sampling)
            else:
                generated = model.generate(prompt_ids, generation_config=sampling)
        return tokenizer.decode(generated[0].tolist())

    return _generate
|
overlay/scripts/launch_benchmark_hf_job.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# Directory one level above this script's folder; put it on sys.path so the
# `scripts.*` imports below resolve when this file is run directly.
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
|
| 13 |
+
|
| 14 |
+
from huggingface_hub import HfApi
|
| 15 |
+
from huggingface_hub.utils import get_token
|
| 16 |
+
|
| 17 |
+
from scripts.hf_routing import resolve_routing
|
| 18 |
+
from scripts.launch_feather_hf_job import IMAGE_DIR, sync_overlay_from_repo, wait_for_space
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def build_benchmark_job_env(
    *,
    benchmark: str,
    variant: str,
    seed: int,
    output_repo: str,
    tokenizer_repo: str,
) -> dict[str, str]:
    """Assemble the environment passed to a remote benchmark job.

    The explicit settings below always win; any other ``HYDRA_*`` variable
    found in the current process environment is forwarded unchanged.

    Args:
        benchmark: Stored as ``HYDRA_BENCHMARK_NAME``.
        variant: Stored as ``HYDRA_BENCHMARK_VARIANT``.
        seed: Stored (stringified) as ``HYDRA_SEED``.
        output_repo: Stored as ``FEATHER_HF_OUTPUT_REPO``.
        tokenizer_repo: Stored as ``HYDRA_TOKENIZER_CACHE_REPO``.

    Returns:
        A new ``str -> str`` mapping.
    """
    job_env: dict[str, str] = {
        "FEATHER_HF_OUTPUT_REPO": output_repo,
        "FEATHER_RUNTIME_MODE": "benchmark",
        "HYDRA_TOKENIZER_CACHE_REPO": tokenizer_repo,
        "HYDRA_BENCHMARK_NAME": benchmark,
        "HYDRA_BENCHMARK_VARIANT": variant,
        "HYDRA_SEED": str(seed),
        "PYTHONUNBUFFERED": "1",
    }
    # Inherit caller-side HYDRA_* overrides without clobbering explicit keys.
    inherited = {
        name: setting
        for name, setting in os.environ.items()
        if name.startswith("HYDRA_") and name not in job_env
    }
    job_env.update(inherited)
    return job_env
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def build_benchmark_job_command(*, benchmark: str, variant: str, seed: int) -> list[str]:
    """Return the container command for a benchmark job.

    The command is fixed; ``benchmark``, ``variant`` and ``seed`` are accepted
    but do not appear in it.
    """
    interpreter = "python"
    entry_script = "/app/entrypoint.py"
    return [interpreter, entry_script]
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def submit_benchmark_job(
    *,
    api,
    image: str,
    command: list[str],
    env: dict[str, str],
    token: str,
    namespace: str,
    flavor: str,
    timeout: str,
) -> dict[str, str]:
    """Submit the job through ``api.run_job`` and summarise the handle.

    Args:
        api: Object exposing ``run_job(...)`` (an ``HfApi`` in ``main``).
        image: Image reference to run.
        command: Command list executed in the container.
        env: Plain environment variables for the job.
        token: Hugging Face token; used for the call and also passed to the
            job as the ``HF_TOKEN`` secret.
        namespace: Namespace the job is created under.
        flavor: Hardware flavor name.
        timeout: Job timeout string.

    Returns:
        Dict with ``job_id``, ``job_url`` and ``job_stage`` (stringified).
    """
    launch_options = {
        "image": image,
        "command": command,
        "env": env,
        "secrets": {"HF_TOKEN": token},
        "flavor": flavor,
        "timeout": timeout,
        "namespace": namespace,
        "token": token,
    }
    handle = api.run_job(**launch_options)
    return {
        "job_id": handle.id,
        "job_url": handle.url,
        "job_stage": str(handle.status.stage),
    }
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options for the benchmark job launcher.

    Repository, image and namespace defaults come from ``resolve_routing``,
    which is called with the ``HF_TOKEN`` environment variable.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The parsed namespace.
    """
    routing = resolve_routing(token=os.environ.get("HF_TOKEN"))
    parser = argparse.ArgumentParser(description="Prepare or submit a remote HF benchmark job")
    add = parser.add_argument

    # Identity of the run (all required).
    add("--benchmark", required=True)
    add("--variant", required=True)
    add("--seed", type=int, required=True)

    # Routing; NOTE(review): the tokenizer repo defaults to the output repo.
    add("--output-repo", default=routing.output_repo)
    add("--tokenizer-repo", default=routing.output_repo)
    add("--image", default=f"hf.co/spaces/{routing.space_repo}")
    add("--namespace", default=routing.job_namespace)

    # Job sizing and reporting.
    add("--flavor", default="a10g-small")
    add("--timeout", default="30m")
    add("--summary-out", type=Path)

    for switch in ("--dry-run", "--refresh-image", "--sync-overlay"):
        add(switch, action="store_true")
    return parser.parse_args(argv)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: describe, and unless ``--dry-run`` submit, a benchmark job.

    With ``--refresh-image`` the image folder is uploaded to the Space first
    (optionally after ``sync_overlay_from_repo()`` when ``--sync-overlay`` is
    also set) and the Space is waited on before the job is submitted. The
    resulting summary is printed as JSON and, if ``--summary-out`` is given,
    also written to that file.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Process exit code (0 on success).

    Raises:
        SystemExit: When a real submission is requested but no token is found.
    """
    args = parse_args(argv)
    run_identity = {"benchmark": args.benchmark, "variant": args.variant, "seed": args.seed}
    env = build_benchmark_job_env(
        **run_identity,
        output_repo=args.output_repo,
        tokenizer_repo=args.tokenizer_repo,
    )
    command = build_benchmark_job_command(**run_identity)
    summary = {
        **run_identity,
        "output_repo": args.output_repo,
        "tokenizer_repo": args.tokenizer_repo,
        "image": args.image,
        "namespace": args.namespace,
        "command": command,
        "env": env,
        "dry_run": args.dry_run,
    }

    if not args.dry_run:
        token = os.environ.get("HF_TOKEN") or get_token()
        if not token:
            raise SystemExit("HF_TOKEN must be set or cached via huggingface-cli login")
        api = HfApi(token=token)
        if args.refresh_image:
            # The Space repo id is the image reference minus its hf.co prefix.
            space_repo = args.image.removeprefix("hf.co/spaces/")
            if args.sync_overlay:
                sync_overlay_from_repo()
            api.upload_folder(
                repo_id=space_repo,
                repo_type="space",
                folder_path=str(IMAGE_DIR),
                commit_message="Update benchmark runtime image",
                token=token,
            )
            wait_for_space(api, space_repo, token=token)
        job_info = submit_benchmark_job(
            api=api,
            image=args.image,
            command=command,
            env=env,
            token=token,
            namespace=args.namespace,
            flavor=args.flavor,
            timeout=args.timeout,
        )
        summary.update(job_info)

    summary_path = args.summary_out
    if summary_path is not None:
        summary_path.parent.mkdir(parents=True, exist_ok=True)
        summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True), encoding="utf-8")
    print(json.dumps(summary, indent=2, sort_keys=True))
    return 0
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
if __name__ == "__main__":
|
| 157 |
+
raise SystemExit(main())
|
overlay/scripts/optuna_hpo.py
CHANGED
|
@@ -108,6 +108,28 @@ def _enqueue_transfer_priors(study: optuna.Study, priors_file: Path, apply_prior
|
|
| 108 |
if after > before:
|
| 109 |
enqueued += 1
|
| 110 |
return enqueued
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
|
| 113 |
def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any] | None:
|
|
@@ -143,15 +165,99 @@ def _parse_metrics_from_log_lines(lines: list[str]) -> dict[str, Any] | None:
|
|
| 143 |
|
| 144 |
|
| 145 |
def _parse_last_train_bpb_from_logs(lines: list[str]) -> float | None:
|
| 146 |
-
"""Best-effort fallback when final eval crashes before metrics JSON write."""
|
| 147 |
-
last: float | None = None
|
| 148 |
-
for line in lines:
|
| 149 |
-
m = re.search(r"\bbpb=([0-9]+(?:\.[0-9]+)?)", line)
|
| 150 |
if m:
|
| 151 |
-
last = float(m.group(1))
|
| 152 |
return last
|
| 153 |
|
| 154 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
def _fetch_job_logs_safe(
|
| 156 |
api,
|
| 157 |
*,
|
|
@@ -180,12 +286,20 @@ def _fetch_job_logs_safe(
|
|
| 180 |
if last_exc is not None:
|
| 181 |
raise last_exc
|
| 182 |
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
|
| 184 |
|
| 185 |
def _trial_env(trial: optuna.Trial, args: argparse.Namespace, metrics_path: Path) -> dict[str, str]:
|
| 186 |
env = os.environ.copy()
|
| 187 |
full_arch_hpo = env.get("HYDRA_HPO_FULL_ARCH", "0") == "1"
|
| 188 |
speed_arch_hpo = full_arch_hpo and env.get("HYDRA_HPO_SPEED_ARCH", "0") == "1"
|
|
|
|
| 189 |
|
| 190 |
# Runtime and reporting
|
| 191 |
env["HYDRA_METRICS_OUT"] = str(metrics_path)
|
|
@@ -203,6 +317,12 @@ def _trial_env(trial: optuna.Trial, args: argparse.Namespace, metrics_path: Path
|
|
| 203 |
env["HYDRA_D_STATE"] = str(trial.suggest_categorical("d_state", [16, 32]))
|
| 204 |
env["HYDRA_HEADDIM"] = str(trial.suggest_categorical("headdim", [16, 32]))
|
| 205 |
env["HYDRA_EXPAND"] = str(trial.suggest_categorical("expand", [1, 2]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
else:
|
| 207 |
env["HYDRA_D_MODEL"] = str(trial.suggest_categorical("d_model", [64, 96, 128, 160, 192]))
|
| 208 |
env["HYDRA_N_LAYER"] = str(trial.suggest_int("n_layer", 1, 4))
|
|
@@ -214,6 +334,10 @@ def _trial_env(trial: optuna.Trial, args: argparse.Namespace, metrics_path: Path
|
|
| 214 |
seq_len = trial.suggest_categorical("seq_len", [64, 128])
|
| 215 |
batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
|
| 216 |
grad_accum = trial.suggest_categorical("grad_accum", [4, 8, 16])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
else:
|
| 218 |
seq_len = trial.suggest_categorical("seq_len", [32, 64])
|
| 219 |
batch_size = trial.suggest_categorical("batch_size", [4, 8] if full_arch_hpo else [4, 8, 16])
|
|
@@ -224,22 +348,41 @@ def _trial_env(trial: optuna.Trial, args: argparse.Namespace, metrics_path: Path
|
|
| 224 |
env["HYDRA_BATCH_SIZE"] = str(batch_size)
|
| 225 |
env["HYDRA_TOTAL_BATCH"] = str(total_batch)
|
| 226 |
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
if full_arch_hpo:
|
| 232 |
env["HYDRA_HYENA_LAYERS"] = ""
|
| 233 |
env["HYDRA_ENGRAM_N_COLUMNS"] = str(
|
| 234 |
-
trial.suggest_categorical(
|
|
|
|
|
|
|
|
|
|
| 235 |
)
|
| 236 |
env["HYDRA_ENGRAM_LAYER_IDX"] = str(trial.suggest_int("engram_layer_idx", 0, max(0, int(env["HYDRA_N_LAYER"]) - 1)))
|
| 237 |
-
env["HYDRA_SDR_TARGET_ACTIVE"] = str(
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
env["HYDRA_MAMBA3_CHUNK"] = str(trial.suggest_categorical("mamba3_chunk", [32, 64]))
|
| 242 |
-
env["HYDRA_DROPOUT"] = str(trial.suggest_categorical("dropout", [0.0, 0.1] if speed_arch_hpo else [0.0, 0.1, 0.2]))
|
| 243 |
else:
|
| 244 |
env["HYDRA_HYENA_LAYERS"] = trial.suggest_categorical("hyena_layers", ["", "0", "1", "0,1"])
|
| 245 |
|
|
@@ -299,8 +442,10 @@ def _space_repo_from_hf_image(image: str, namespace: str) -> str:
|
|
| 299 |
return os.environ.get("FEATHER_HF_SPACE_REPO", f"{namespace}/feather-a10-runtime")
|
| 300 |
|
| 301 |
|
| 302 |
-
def _objective_local(args: argparse.Namespace):
|
| 303 |
-
|
|
|
|
|
|
|
| 304 |
trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
|
| 305 |
metrics_path = trial_dir / "metrics.json"
|
| 306 |
|
|
@@ -315,44 +460,67 @@ def _objective_local(args: argparse.Namespace):
|
|
| 315 |
timeout=args.trial_timeout,
|
| 316 |
)
|
| 317 |
|
| 318 |
-
metrics: dict[str, Any] | None = None
|
| 319 |
if metrics_path.exists():
|
| 320 |
try:
|
| 321 |
metrics = json.loads(metrics_path.read_text(encoding="utf-8"))
|
| 322 |
except json.JSONDecodeError:
|
| 323 |
metrics = None
|
| 324 |
-
if metrics is None:
|
| 325 |
-
metrics = _parse_metrics_from_stdout(proc.stdout)
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
|
| 330 |
if proc.returncode != 0:
|
| 331 |
raise optuna.TrialPruned(f"Training failed rc={proc.returncode}")
|
| 332 |
|
| 333 |
-
metric_key = args.metric
|
| 334 |
-
if metric_key not in metrics or metrics[metric_key] is None:
|
| 335 |
-
raise optuna.TrialPruned(f"Metric '{metric_key}' missing in metrics payload")
|
| 336 |
|
| 337 |
tps_val = metrics.get("tps")
|
| 338 |
if tps_val is not None:
|
| 339 |
tps_f = float(tps_val)
|
| 340 |
trial.set_user_attr("tps", tps_f)
|
| 341 |
-
if
|
| 342 |
-
raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {
|
| 343 |
-
|
| 344 |
-
value =
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
|
|
|
|
|
|
|
|
|
| 349 |
|
| 350 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
|
| 352 |
return objective
|
| 353 |
|
| 354 |
|
| 355 |
-
def _objective_hf_job(args: argparse.Namespace):
|
| 356 |
from huggingface_hub import HfApi
|
| 357 |
from huggingface_hub.utils import get_token
|
| 358 |
|
|
@@ -362,8 +530,9 @@ def _objective_hf_job(args: argparse.Namespace):
|
|
| 362 |
f"No Hugging Face token found. Set {args.hf_token_env} or run huggingface-cli login."
|
| 363 |
)
|
| 364 |
|
| 365 |
-
api = HfApi(token=token)
|
| 366 |
-
terminal_states = {"ERROR", "COMPLETED", "CANCELLED", "TIMEOUT", "FAILED", "CANCELED"}
|
|
|
|
| 367 |
|
| 368 |
def objective(trial: optuna.Trial) -> float:
|
| 369 |
trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
|
|
@@ -474,50 +643,66 @@ def _objective_hf_job(args: argparse.Namespace):
|
|
| 474 |
except Exception:
|
| 475 |
pass
|
| 476 |
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 481 |
if terminal_detail:
|
| 482 |
trial.set_user_attr("hf_status_message", terminal_detail)
|
| 483 |
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
trial
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 499 |
raise optuna.TrialPruned(f"No metrics found from HF job ({detail})")
|
| 500 |
|
| 501 |
-
metric_key = args.metric
|
| 502 |
-
if metric_key not in metrics or metrics[metric_key] is None:
|
| 503 |
-
raise optuna.TrialPruned(f"Metric '{metric_key}' missing in metrics payload")
|
| 504 |
|
| 505 |
tps_val = metrics.get("tps")
|
| 506 |
if tps_val is not None:
|
| 507 |
tps_f = float(tps_val)
|
| 508 |
trial.set_user_attr("tps", tps_f)
|
| 509 |
-
if
|
| 510 |
-
raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {
|
| 511 |
-
|
| 512 |
-
value =
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
|
| 517 |
return objective
|
| 518 |
|
| 519 |
|
| 520 |
-
def _objective_hf_launcher(args: argparse.Namespace):
|
| 521 |
from huggingface_hub import HfApi
|
| 522 |
from huggingface_hub.utils import get_token
|
| 523 |
|
|
@@ -527,8 +712,9 @@ def _objective_hf_launcher(args: argparse.Namespace):
|
|
| 527 |
f"No Hugging Face token found. Set {args.hf_token_env} or run huggingface-cli login."
|
| 528 |
)
|
| 529 |
|
| 530 |
-
api = HfApi(token=token)
|
| 531 |
-
terminal_states = {"ERROR", "COMPLETED", "CANCELLED", "TIMEOUT", "FAILED", "CANCELED"}
|
|
|
|
| 532 |
|
| 533 |
def objective(trial: optuna.Trial) -> float:
|
| 534 |
trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
|
|
@@ -610,44 +796,61 @@ def _objective_hf_launcher(args: argparse.Namespace):
|
|
| 610 |
except Exception:
|
| 611 |
pass
|
| 612 |
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 616 |
if terminal_detail:
|
| 617 |
trial.set_user_attr("hf_status_message", terminal_detail)
|
| 618 |
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
trial
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 634 |
raise optuna.TrialPruned(f"No metrics found from HF launcher job ({detail})")
|
| 635 |
|
| 636 |
-
metric_key = args.metric
|
| 637 |
-
if metric_key not in metrics or metrics[metric_key] is None:
|
| 638 |
-
raise optuna.TrialPruned(f"Metric '{metric_key}' missing in metrics payload")
|
| 639 |
|
| 640 |
tps_val = metrics.get("tps")
|
| 641 |
if tps_val is not None:
|
| 642 |
tps_f = float(tps_val)
|
| 643 |
trial.set_user_attr("tps", tps_f)
|
| 644 |
-
if
|
| 645 |
-
raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {
|
| 646 |
-
|
| 647 |
-
value =
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 651 |
|
| 652 |
return objective
|
| 653 |
|
|
@@ -690,6 +893,8 @@ def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
|
| 690 |
parser.add_argument("--priors-file", type=Path, default=REPO_ROOT / "docs" / "hpo_transfer_priors.json", help="Path to transfer-learning prior trials JSON")
|
| 691 |
parser.add_argument("--apply-priors", action="store_true", default=True, help="Enqueue transfer-learning prior trials before optimize")
|
| 692 |
parser.add_argument("--no-apply-priors", action="store_false", dest="apply_priors")
|
|
|
|
|
|
|
| 693 |
parser.add_argument("--seed", type=int, default=42, help="Seed for sampler")
|
| 694 |
parser.add_argument("--n-startup-trials", type=int, default=5, help="Pruner startup trials before pruning")
|
| 695 |
parser.add_argument("--n-warmup-steps", type=int, default=0, help="Pruner warmup steps")
|
|
@@ -720,6 +925,10 @@ def main() -> int:
|
|
| 720 |
pruner=pruner,
|
| 721 |
)
|
| 722 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 723 |
enqueued_priors = _enqueue_transfer_priors(study, args.priors_file, args.apply_priors)
|
| 724 |
if enqueued_priors:
|
| 725 |
print(f"[hpo] enqueued {enqueued_priors} transfer priors from {args.priors_file}")
|
|
@@ -784,6 +993,8 @@ def main() -> int:
|
|
| 784 |
"n_completed": len(completed),
|
| 785 |
"patience_trials": args.patience_trials,
|
| 786 |
"min_improvement": args.min_improvement,
|
|
|
|
|
|
|
| 787 |
"enqueued_priors": enqueued_priors,
|
| 788 |
}
|
| 789 |
else:
|
|
@@ -793,10 +1004,12 @@ def main() -> int:
|
|
| 793 |
"metric": args.metric,
|
| 794 |
"best_value": None,
|
| 795 |
"best_params": {},
|
| 796 |
-
"best_trial_number": None,
|
| 797 |
"best_trial_user_attrs": {},
|
| 798 |
"n_trials": len(study.trials),
|
| 799 |
"n_completed": 0,
|
|
|
|
|
|
|
| 800 |
"enqueued_priors": enqueued_priors,
|
| 801 |
"note": "No completed trials with metrics found.",
|
| 802 |
}
|
|
|
|
| 108 |
if after > before:
|
| 109 |
enqueued += 1
|
| 110 |
return enqueued
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def _enqueue_quality_anchors(study: optuna.Study, priors_file: Path, quality_mode_local: bool, top_k: int) -> int:
|
| 114 |
+
if not quality_mode_local or top_k <= 0:
|
| 115 |
+
return 0
|
| 116 |
+
|
| 117 |
+
priors = _load_prior_param_sets(priors_file)[:top_k]
|
| 118 |
+
enqueued = 0
|
| 119 |
+
for params in priors:
|
| 120 |
+
before = len(study.get_trials(deepcopy=False))
|
| 121 |
+
try:
|
| 122 |
+
study.enqueue_trial(
|
| 123 |
+
params,
|
| 124 |
+
user_attrs={"seed_source": "quality_anchor"},
|
| 125 |
+
skip_if_exists=True,
|
| 126 |
+
)
|
| 127 |
+
except TypeError:
|
| 128 |
+
study.enqueue_trial(params, user_attrs={"seed_source": "quality_anchor"})
|
| 129 |
+
after = len(study.get_trials(deepcopy=False))
|
| 130 |
+
if after > before:
|
| 131 |
+
enqueued += 1
|
| 132 |
+
return enqueued
|
| 133 |
|
| 134 |
|
| 135 |
def _parse_metrics_from_stdout(stdout: str) -> dict[str, Any] | None:
|
|
|
|
| 165 |
|
| 166 |
|
| 167 |
def _parse_last_train_bpb_from_logs(lines: list[str]) -> float | None:
|
| 168 |
+
"""Best-effort fallback when final eval crashes before metrics JSON write."""
|
| 169 |
+
last: float | None = None
|
| 170 |
+
for line in lines:
|
| 171 |
+
m = re.search(r"\bbpb=([0-9]+(?:\.[0-9]+)?)", line)
|
| 172 |
if m:
|
| 173 |
+
last = float(m.group(1))
|
| 174 |
return last
|
| 175 |
|
| 176 |
|
| 177 |
+
def _persist_trial_artifacts(
|
| 178 |
+
*,
|
| 179 |
+
trial_dir: Path,
|
| 180 |
+
metrics: dict[str, Any] | None,
|
| 181 |
+
log_lines: list[str] | None,
|
| 182 |
+
log_name: str,
|
| 183 |
+
metadata: dict[str, Any],
|
| 184 |
+
) -> dict[str, str | None]:
|
| 185 |
+
trial_dir.mkdir(parents=True, exist_ok=True)
|
| 186 |
+
metrics_path = trial_dir / "metrics.json"
|
| 187 |
+
log_path = trial_dir / log_name
|
| 188 |
+
manifest_path = trial_dir / "trial_artifacts.json"
|
| 189 |
+
|
| 190 |
+
if metrics is not None:
|
| 191 |
+
metrics_path.write_text(json.dumps(metrics, indent=2, sort_keys=True), encoding="utf-8")
|
| 192 |
+
if log_lines is not None:
|
| 193 |
+
log_path.write_text("\n".join(log_lines), encoding="utf-8")
|
| 194 |
+
|
| 195 |
+
manifest = {
|
| 196 |
+
**metadata,
|
| 197 |
+
"metrics_path": str(metrics_path) if metrics is not None else None,
|
| 198 |
+
"log_path": str(log_path) if log_lines is not None else None,
|
| 199 |
+
}
|
| 200 |
+
manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True), encoding="utf-8")
|
| 201 |
+
return {
|
| 202 |
+
"metrics_path": str(metrics_path) if metrics is not None else None,
|
| 203 |
+
"log_path": str(log_path) if log_lines is not None else None,
|
| 204 |
+
"manifest_path": str(manifest_path),
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def _resolve_objective_metric(
|
| 209 |
+
trial: optuna.Trial,
|
| 210 |
+
*,
|
| 211 |
+
metric_key: str,
|
| 212 |
+
metrics: dict[str, Any] | None,
|
| 213 |
+
allow_log_metric_fallback: bool,
|
| 214 |
+
fallback_bpb: float | None,
|
| 215 |
+
tps_seen: float | None,
|
| 216 |
+
) -> float:
|
| 217 |
+
"""Resolve the objective value while labeling where it came from.
|
| 218 |
+
|
| 219 |
+
Validation metrics and live training-log fallbacks are intentionally
|
| 220 |
+
different sources. Keeping that distinction in trial attrs prevents a
|
| 221 |
+
skipped/OOM eval from being mistaken for a real validation result.
|
| 222 |
+
"""
|
| 223 |
+
if metrics is None:
|
| 224 |
+
if allow_log_metric_fallback and metric_key == "val_bpb" and fallback_bpb is not None:
|
| 225 |
+
trial.set_user_attr("objective_source", "train_log_fallback")
|
| 226 |
+
trial.set_user_attr("objective_metric", "train_bpb")
|
| 227 |
+
trial.set_user_attr("eval_status", "missing_metrics")
|
| 228 |
+
trial.set_user_attr("train_bpb_fallback", float(fallback_bpb))
|
| 229 |
+
if tps_seen is not None:
|
| 230 |
+
trial.set_user_attr("tps", float(tps_seen))
|
| 231 |
+
return float(fallback_bpb)
|
| 232 |
+
trial.set_user_attr("objective_source", "missing_metrics")
|
| 233 |
+
raise optuna.TrialPruned("No metrics payload found")
|
| 234 |
+
|
| 235 |
+
eval_status = str(
|
| 236 |
+
metrics.get(
|
| 237 |
+
"eval_status",
|
| 238 |
+
"completed" if metrics.get("val_bpb") is not None else "unknown",
|
| 239 |
+
)
|
| 240 |
+
)
|
| 241 |
+
trial.set_user_attr("eval_status", eval_status)
|
| 242 |
+
|
| 243 |
+
if fallback_bpb is not None:
|
| 244 |
+
trial.set_user_attr("train_bpb_fallback", float(fallback_bpb))
|
| 245 |
+
|
| 246 |
+
if metric_key not in metrics or metrics[metric_key] is None:
|
| 247 |
+
trial.set_user_attr("objective_source", "missing_metric")
|
| 248 |
+
trial.set_user_attr("objective_metric", metric_key)
|
| 249 |
+
raise optuna.TrialPruned(f"Metric '{metric_key}' missing in metrics payload")
|
| 250 |
+
|
| 251 |
+
value = float(metrics[metric_key])
|
| 252 |
+
trial.set_user_attr("objective_metric", metric_key)
|
| 253 |
+
if metric_key == "val_bpb":
|
| 254 |
+
trial.set_user_attr("objective_source", "final_val")
|
| 255 |
+
trial.set_user_attr("final_val_bpb", value)
|
| 256 |
+
else:
|
| 257 |
+
trial.set_user_attr("objective_source", "metrics_json")
|
| 258 |
+
return value
|
| 259 |
+
|
| 260 |
+
|
| 261 |
def _fetch_job_logs_safe(
|
| 262 |
api,
|
| 263 |
*,
|
|
|
|
| 286 |
if last_exc is not None:
|
| 287 |
raise last_exc
|
| 288 |
return []
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
def _effective_min_tps(args: argparse.Namespace) -> float | None:
|
| 292 |
+
min_tps = args.min_tps
|
| 293 |
+
if getattr(args, "quality_mode_local", False) and min_tps == 50000.0:
|
| 294 |
+
return 0.0
|
| 295 |
+
return min_tps
|
| 296 |
|
| 297 |
|
| 298 |
def _trial_env(trial: optuna.Trial, args: argparse.Namespace, metrics_path: Path) -> dict[str, str]:
|
| 299 |
env = os.environ.copy()
|
| 300 |
full_arch_hpo = env.get("HYDRA_HPO_FULL_ARCH", "0") == "1"
|
| 301 |
speed_arch_hpo = full_arch_hpo and env.get("HYDRA_HPO_SPEED_ARCH", "0") == "1"
|
| 302 |
+
quality_mode_local = bool(getattr(args, "quality_mode_local", False))
|
| 303 |
|
| 304 |
# Runtime and reporting
|
| 305 |
env["HYDRA_METRICS_OUT"] = str(metrics_path)
|
|
|
|
| 317 |
env["HYDRA_D_STATE"] = str(trial.suggest_categorical("d_state", [16, 32]))
|
| 318 |
env["HYDRA_HEADDIM"] = str(trial.suggest_categorical("headdim", [16, 32]))
|
| 319 |
env["HYDRA_EXPAND"] = str(trial.suggest_categorical("expand", [1, 2]))
|
| 320 |
+
elif quality_mode_local and full_arch_hpo:
|
| 321 |
+
env["HYDRA_D_MODEL"] = str(trial.suggest_categorical("d_model", [64, 96, 128]))
|
| 322 |
+
env["HYDRA_N_LAYER"] = str(trial.suggest_int("n_layer", 2, 3))
|
| 323 |
+
env["HYDRA_D_STATE"] = str(trial.suggest_categorical("d_state", [16, 32]))
|
| 324 |
+
env["HYDRA_HEADDIM"] = str(trial.suggest_categorical("headdim", [16, 32]))
|
| 325 |
+
env["HYDRA_EXPAND"] = str(trial.suggest_categorical("expand", [1, 2]))
|
| 326 |
else:
|
| 327 |
env["HYDRA_D_MODEL"] = str(trial.suggest_categorical("d_model", [64, 96, 128, 160, 192]))
|
| 328 |
env["HYDRA_N_LAYER"] = str(trial.suggest_int("n_layer", 1, 4))
|
|
|
|
| 334 |
seq_len = trial.suggest_categorical("seq_len", [64, 128])
|
| 335 |
batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
|
| 336 |
grad_accum = trial.suggest_categorical("grad_accum", [4, 8, 16])
|
| 337 |
+
elif quality_mode_local and full_arch_hpo:
|
| 338 |
+
seq_len = trial.suggest_categorical("seq_len", [64])
|
| 339 |
+
batch_size = trial.suggest_categorical("batch_size", [4, 8])
|
| 340 |
+
grad_accum = trial.suggest_categorical("grad_accum", [4, 8, 16])
|
| 341 |
else:
|
| 342 |
seq_len = trial.suggest_categorical("seq_len", [32, 64])
|
| 343 |
batch_size = trial.suggest_categorical("batch_size", [4, 8] if full_arch_hpo else [4, 8, 16])
|
|
|
|
| 348 |
env["HYDRA_BATCH_SIZE"] = str(batch_size)
|
| 349 |
env["HYDRA_TOTAL_BATCH"] = str(total_batch)
|
| 350 |
|
| 351 |
+
if quality_mode_local and full_arch_hpo:
|
| 352 |
+
env["HYDRA_MATRIX_LR"] = str(trial.suggest_float("matrix_lr", 0.008, 0.03, log=True))
|
| 353 |
+
env["HYDRA_EMBED_LR"] = str(trial.suggest_float("embed_lr", 0.15, 0.6, log=True))
|
| 354 |
+
env["HYDRA_UNEMBED_LR"] = str(trial.suggest_float("unembed_lr", 0.001, 0.01, log=True))
|
| 355 |
+
else:
|
| 356 |
+
env["HYDRA_MATRIX_LR"] = str(trial.suggest_float("matrix_lr", 0.005, 0.2, log=True))
|
| 357 |
+
env["HYDRA_EMBED_LR"] = str(trial.suggest_float("embed_lr", 0.05, 1.0, log=True))
|
| 358 |
+
env["HYDRA_UNEMBED_LR"] = str(trial.suggest_float("unembed_lr", 0.0005, 0.02, log=True))
|
| 359 |
|
| 360 |
if full_arch_hpo:
|
| 361 |
env["HYDRA_HYENA_LAYERS"] = ""
|
| 362 |
env["HYDRA_ENGRAM_N_COLUMNS"] = str(
|
| 363 |
+
trial.suggest_categorical(
|
| 364 |
+
"engram_n_columns",
|
| 365 |
+
[512, 1024] if (speed_arch_hpo or quality_mode_local) else [512, 1024, 2048],
|
| 366 |
+
)
|
| 367 |
)
|
| 368 |
env["HYDRA_ENGRAM_LAYER_IDX"] = str(trial.suggest_int("engram_layer_idx", 0, max(0, int(env["HYDRA_N_LAYER"]) - 1)))
|
| 369 |
+
env["HYDRA_SDR_TARGET_ACTIVE"] = str(
|
| 370 |
+
trial.suggest_categorical(
|
| 371 |
+
"sdr_target_active",
|
| 372 |
+
[327] if quality_mode_local else ([164, 327] if speed_arch_hpo else [164, 327, 512]),
|
| 373 |
+
)
|
| 374 |
+
)
|
| 375 |
+
env["HYDRA_HTM_LEARN_EVERY"] = str(
|
| 376 |
+
trial.suggest_categorical("htm_learn_every", [8, 16] if (speed_arch_hpo or quality_mode_local) else [4, 8, 16])
|
| 377 |
+
)
|
| 378 |
+
env["HYDRA_HTM_SUBSAMPLE"] = str(
|
| 379 |
+
trial.suggest_categorical("htm_subsample", [1, 2] if quality_mode_local else ([4, 8, 16] if speed_arch_hpo else [1, 2, 4, 8]))
|
| 380 |
+
)
|
| 381 |
+
env["HYDRA_ENGRAM_SUBSAMPLE"] = str(
|
| 382 |
+
trial.suggest_categorical("engram_subsample", [1, 2] if quality_mode_local else ([1, 2, 4] if speed_arch_hpo else [1]))
|
| 383 |
+
)
|
| 384 |
env["HYDRA_MAMBA3_CHUNK"] = str(trial.suggest_categorical("mamba3_chunk", [32, 64]))
|
| 385 |
+
env["HYDRA_DROPOUT"] = str(trial.suggest_categorical("dropout", [0.0, 0.1] if (speed_arch_hpo or quality_mode_local) else [0.0, 0.1, 0.2]))
|
| 386 |
else:
|
| 387 |
env["HYDRA_HYENA_LAYERS"] = trial.suggest_categorical("hyena_layers", ["", "0", "1", "0,1"])
|
| 388 |
|
|
|
|
| 442 |
return os.environ.get("FEATHER_HF_SPACE_REPO", f"{namespace}/feather-a10-runtime")
|
| 443 |
|
| 444 |
|
| 445 |
+
def _objective_local(args: argparse.Namespace):
|
| 446 |
+
effective_min_tps = _effective_min_tps(args)
|
| 447 |
+
|
| 448 |
+
def objective(trial: optuna.Trial) -> float:
|
| 449 |
trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
|
| 450 |
metrics_path = trial_dir / "metrics.json"
|
| 451 |
|
|
|
|
| 460 |
timeout=args.trial_timeout,
|
| 461 |
)
|
| 462 |
|
| 463 |
+
metrics: dict[str, Any] | None = None
|
| 464 |
if metrics_path.exists():
|
| 465 |
try:
|
| 466 |
metrics = json.loads(metrics_path.read_text(encoding="utf-8"))
|
| 467 |
except json.JSONDecodeError:
|
| 468 |
metrics = None
|
| 469 |
+
if metrics is None:
|
| 470 |
+
metrics = _parse_metrics_from_stdout(proc.stdout)
|
| 471 |
+
|
| 472 |
+
artifact_paths = _persist_trial_artifacts(
|
| 473 |
+
trial_dir=trial_dir,
|
| 474 |
+
metrics=metrics,
|
| 475 |
+
log_lines=(proc.stdout or "").splitlines(),
|
| 476 |
+
log_name="train_stdout.log",
|
| 477 |
+
metadata={"runner": "local", "returncode": proc.returncode},
|
| 478 |
+
)
|
| 479 |
+
(trial_dir / "train_stderr.log").write_text(proc.stderr or "", encoding="utf-8")
|
| 480 |
+
|
| 481 |
+
fallback_bpb = _parse_last_train_bpb_from_logs(proc.stdout.splitlines())
|
| 482 |
+
if metrics is None:
|
| 483 |
+
_resolve_objective_metric(
|
| 484 |
+
trial,
|
| 485 |
+
metric_key=args.metric,
|
| 486 |
+
metrics=None,
|
| 487 |
+
allow_log_metric_fallback=args.allow_log_metric_fallback,
|
| 488 |
+
fallback_bpb=fallback_bpb,
|
| 489 |
+
tps_seen=None,
|
| 490 |
+
)
|
| 491 |
+
raise optuna.TrialPruned("No metrics found (HYDRA_METRICS_OUT/[METRICS_JSON])")
|
| 492 |
|
| 493 |
if proc.returncode != 0:
|
| 494 |
raise optuna.TrialPruned(f"Training failed rc={proc.returncode}")
|
| 495 |
|
| 496 |
+
metric_key = args.metric
|
|
|
|
|
|
|
| 497 |
|
| 498 |
tps_val = metrics.get("tps")
|
| 499 |
if tps_val is not None:
|
| 500 |
tps_f = float(tps_val)
|
| 501 |
trial.set_user_attr("tps", tps_f)
|
| 502 |
+
if effective_min_tps is not None and tps_f < effective_min_tps:
|
| 503 |
+
raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {effective_min_tps}")
|
| 504 |
+
|
| 505 |
+
value = _resolve_objective_metric(
|
| 506 |
+
trial,
|
| 507 |
+
metric_key=metric_key,
|
| 508 |
+
metrics=metrics,
|
| 509 |
+
allow_log_metric_fallback=args.allow_log_metric_fallback,
|
| 510 |
+
fallback_bpb=fallback_bpb,
|
| 511 |
+
tps_seen=None,
|
| 512 |
+
)
|
| 513 |
|
| 514 |
+
# Keep useful context on trial
|
| 515 |
+
trial.set_user_attr("summary_path", metrics.get("summary_path") or artifact_paths["manifest_path"])
|
| 516 |
+
trial.set_user_attr("run_log_path", metrics.get("run_log_path") or artifact_paths["log_path"])
|
| 517 |
+
|
| 518 |
+
return value
|
| 519 |
|
| 520 |
return objective
|
| 521 |
|
| 522 |
|
| 523 |
+
def _objective_hf_job(args: argparse.Namespace):
|
| 524 |
from huggingface_hub import HfApi
|
| 525 |
from huggingface_hub.utils import get_token
|
| 526 |
|
|
|
|
| 530 |
f"No Hugging Face token found. Set {args.hf_token_env} or run huggingface-cli login."
|
| 531 |
)
|
| 532 |
|
| 533 |
+
api = HfApi(token=token)
|
| 534 |
+
terminal_states = {"ERROR", "COMPLETED", "CANCELLED", "TIMEOUT", "FAILED", "CANCELED"}
|
| 535 |
+
effective_min_tps = _effective_min_tps(args)
|
| 536 |
|
| 537 |
def objective(trial: optuna.Trial) -> float:
|
| 538 |
trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
|
|
|
|
| 643 |
except Exception:
|
| 644 |
pass
|
| 645 |
|
| 646 |
+
artifact_paths = _persist_trial_artifacts(
|
| 647 |
+
trial_dir=trial_dir,
|
| 648 |
+
metrics=metrics,
|
| 649 |
+
log_lines=log_lines,
|
| 650 |
+
log_name="hf_job.log",
|
| 651 |
+
metadata={"runner": "hf-job", "hf_job_id": job_id, "hf_stage": stage},
|
| 652 |
+
)
|
| 653 |
+
trial.set_user_attr("hf_stage", stage)
|
| 654 |
+
trial.set_user_attr("hf_log_lines", len(log_lines))
|
| 655 |
if terminal_detail:
|
| 656 |
trial.set_user_attr("hf_status_message", terminal_detail)
|
| 657 |
|
| 658 |
+
fallback_bpb = _parse_last_train_bpb_from_logs(log_lines)
|
| 659 |
+
if metrics is None:
|
| 660 |
+
try:
|
| 661 |
+
value = _resolve_objective_metric(
|
| 662 |
+
trial,
|
| 663 |
+
metric_key=args.metric,
|
| 664 |
+
metrics=None,
|
| 665 |
+
allow_log_metric_fallback=args.allow_log_metric_fallback,
|
| 666 |
+
fallback_bpb=fallback_bpb,
|
| 667 |
+
tps_seen=tps_seen,
|
| 668 |
+
)
|
| 669 |
+
if tps_seen is not None and effective_min_tps is not None and tps_seen < effective_min_tps:
|
| 670 |
+
raise optuna.TrialPruned(f"TPS below floor: {tps_seen} < {effective_min_tps}")
|
| 671 |
+
return value
|
| 672 |
+
except optuna.TrialPruned:
|
| 673 |
+
pass
|
| 674 |
+
if tps_seen is not None:
|
| 675 |
+
trial.set_user_attr("tps", tps_seen)
|
| 676 |
+
detail = f"stage={stage}, logs={len(log_lines)}"
|
| 677 |
+
if terminal_detail:
|
| 678 |
+
detail = f"{detail}, message={terminal_detail}"
|
| 679 |
raise optuna.TrialPruned(f"No metrics found from HF job ({detail})")
|
| 680 |
|
| 681 |
+
metric_key = args.metric
|
|
|
|
|
|
|
| 682 |
|
| 683 |
tps_val = metrics.get("tps")
|
| 684 |
if tps_val is not None:
|
| 685 |
tps_f = float(tps_val)
|
| 686 |
trial.set_user_attr("tps", tps_f)
|
| 687 |
+
if effective_min_tps is not None and tps_f < effective_min_tps:
|
| 688 |
+
raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {effective_min_tps}")
|
| 689 |
+
|
| 690 |
+
value = _resolve_objective_metric(
|
| 691 |
+
trial,
|
| 692 |
+
metric_key=metric_key,
|
| 693 |
+
metrics=metrics,
|
| 694 |
+
allow_log_metric_fallback=args.allow_log_metric_fallback,
|
| 695 |
+
fallback_bpb=fallback_bpb,
|
| 696 |
+
tps_seen=tps_seen,
|
| 697 |
+
)
|
| 698 |
+
trial.set_user_attr("summary_path", metrics.get("summary_path") or artifact_paths["manifest_path"])
|
| 699 |
+
trial.set_user_attr("run_log_path", metrics.get("run_log_path") or artifact_paths["log_path"])
|
| 700 |
+
return value
|
| 701 |
|
| 702 |
return objective
|
| 703 |
|
| 704 |
|
| 705 |
+
def _objective_hf_launcher(args: argparse.Namespace):
|
| 706 |
from huggingface_hub import HfApi
|
| 707 |
from huggingface_hub.utils import get_token
|
| 708 |
|
|
|
|
| 712 |
f"No Hugging Face token found. Set {args.hf_token_env} or run huggingface-cli login."
|
| 713 |
)
|
| 714 |
|
| 715 |
+
api = HfApi(token=token)
|
| 716 |
+
terminal_states = {"ERROR", "COMPLETED", "CANCELLED", "TIMEOUT", "FAILED", "CANCELED"}
|
| 717 |
+
effective_min_tps = _effective_min_tps(args)
|
| 718 |
|
| 719 |
def objective(trial: optuna.Trial) -> float:
|
| 720 |
trial_dir = Path(tempfile.mkdtemp(prefix=f"optuna_trial_{trial.number}_", dir=str(args.work_dir)))
|
|
|
|
| 796 |
except Exception:
|
| 797 |
pass
|
| 798 |
|
| 799 |
+
artifact_paths = _persist_trial_artifacts(
|
| 800 |
+
trial_dir=trial_dir,
|
| 801 |
+
metrics=metrics,
|
| 802 |
+
log_lines=log_lines,
|
| 803 |
+
log_name="hf_job.log",
|
| 804 |
+
metadata={"runner": "hf-launcher", "hf_job_id": job_id, "hf_stage": stage},
|
| 805 |
+
)
|
| 806 |
+
trial.set_user_attr("hf_stage", stage)
|
| 807 |
+
trial.set_user_attr("hf_log_lines", len(log_lines))
|
| 808 |
if terminal_detail:
|
| 809 |
trial.set_user_attr("hf_status_message", terminal_detail)
|
| 810 |
|
| 811 |
+
fallback_bpb = _parse_last_train_bpb_from_logs(log_lines)
|
| 812 |
+
if metrics is None:
|
| 813 |
+
try:
|
| 814 |
+
value = _resolve_objective_metric(
|
| 815 |
+
trial,
|
| 816 |
+
metric_key=args.metric,
|
| 817 |
+
metrics=None,
|
| 818 |
+
allow_log_metric_fallback=args.allow_log_metric_fallback,
|
| 819 |
+
fallback_bpb=fallback_bpb,
|
| 820 |
+
tps_seen=tps_seen,
|
| 821 |
+
)
|
| 822 |
+
if tps_seen is not None and effective_min_tps is not None and tps_seen < effective_min_tps:
|
| 823 |
+
raise optuna.TrialPruned(f"TPS below floor: {tps_seen} < {effective_min_tps}")
|
| 824 |
+
return value
|
| 825 |
+
except optuna.TrialPruned:
|
| 826 |
+
pass
|
| 827 |
+
if tps_seen is not None:
|
| 828 |
+
trial.set_user_attr("tps", tps_seen)
|
| 829 |
+
detail = f"stage={stage}, logs={len(log_lines)}"
|
| 830 |
+
if terminal_detail:
|
| 831 |
+
detail = f"{detail}, message={terminal_detail}"
|
| 832 |
raise optuna.TrialPruned(f"No metrics found from HF launcher job ({detail})")
|
| 833 |
|
| 834 |
+
metric_key = args.metric
|
|
|
|
|
|
|
| 835 |
|
| 836 |
tps_val = metrics.get("tps")
|
| 837 |
if tps_val is not None:
|
| 838 |
tps_f = float(tps_val)
|
| 839 |
trial.set_user_attr("tps", tps_f)
|
| 840 |
+
if effective_min_tps is not None and tps_f < effective_min_tps:
|
| 841 |
+
raise optuna.TrialPruned(f"TPS below floor: {tps_f} < {effective_min_tps}")
|
| 842 |
+
|
| 843 |
+
value = _resolve_objective_metric(
|
| 844 |
+
trial,
|
| 845 |
+
metric_key=metric_key,
|
| 846 |
+
metrics=metrics,
|
| 847 |
+
allow_log_metric_fallback=args.allow_log_metric_fallback,
|
| 848 |
+
fallback_bpb=fallback_bpb,
|
| 849 |
+
tps_seen=tps_seen,
|
| 850 |
+
)
|
| 851 |
+
trial.set_user_attr("summary_path", metrics.get("summary_path") or artifact_paths["manifest_path"])
|
| 852 |
+
trial.set_user_attr("run_log_path", metrics.get("run_log_path") or artifact_paths["log_path"])
|
| 853 |
+
return value
|
| 854 |
|
| 855 |
return objective
|
| 856 |
|
|
|
|
| 893 |
parser.add_argument("--priors-file", type=Path, default=REPO_ROOT / "docs" / "hpo_transfer_priors.json", help="Path to transfer-learning prior trials JSON")
|
| 894 |
parser.add_argument("--apply-priors", action="store_true", default=True, help="Enqueue transfer-learning prior trials before optimize")
|
| 895 |
parser.add_argument("--no-apply-priors", action="store_false", dest="apply_priors")
|
| 896 |
+
parser.add_argument("--quality-mode-local", action="store_true", default=False, help="Narrow local full-architecture search around the proven quality-winning region")
|
| 897 |
+
parser.add_argument("--quality-anchor-top-k", type=int, default=3, help="Number of top clean priors to enqueue as deterministic local quality anchors")
|
| 898 |
parser.add_argument("--seed", type=int, default=42, help="Seed for sampler")
|
| 899 |
parser.add_argument("--n-startup-trials", type=int, default=5, help="Pruner startup trials before pruning")
|
| 900 |
parser.add_argument("--n-warmup-steps", type=int, default=0, help="Pruner warmup steps")
|
|
|
|
| 925 |
pruner=pruner,
|
| 926 |
)
|
| 927 |
|
| 928 |
+
enqueued_quality_anchors = _enqueue_quality_anchors(study, args.priors_file, args.quality_mode_local, args.quality_anchor_top_k)
|
| 929 |
+
if enqueued_quality_anchors:
|
| 930 |
+
print(f"[hpo] enqueued {enqueued_quality_anchors} local quality anchors from {args.priors_file}")
|
| 931 |
+
|
| 932 |
enqueued_priors = _enqueue_transfer_priors(study, args.priors_file, args.apply_priors)
|
| 933 |
if enqueued_priors:
|
| 934 |
print(f"[hpo] enqueued {enqueued_priors} transfer priors from {args.priors_file}")
|
|
|
|
| 993 |
"n_completed": len(completed),
|
| 994 |
"patience_trials": args.patience_trials,
|
| 995 |
"min_improvement": args.min_improvement,
|
| 996 |
+
"quality_mode_local": args.quality_mode_local,
|
| 997 |
+
"enqueued_quality_anchors": enqueued_quality_anchors,
|
| 998 |
"enqueued_priors": enqueued_priors,
|
| 999 |
}
|
| 1000 |
else:
|
|
|
|
| 1004 |
"metric": args.metric,
|
| 1005 |
"best_value": None,
|
| 1006 |
"best_params": {},
|
| 1007 |
+
"best_trial_number": None,
|
| 1008 |
"best_trial_user_attrs": {},
|
| 1009 |
"n_trials": len(study.trials),
|
| 1010 |
"n_completed": 0,
|
| 1011 |
+
"quality_mode_local": args.quality_mode_local,
|
| 1012 |
+
"enqueued_quality_anchors": enqueued_quality_anchors,
|
| 1013 |
"enqueued_priors": enqueued_priors,
|
| 1014 |
"note": "No completed trials with metrics found.",
|
| 1015 |
}
|
overlay/scripts/run_cycle1a.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import sys
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
REPO_ROOT = Path(__file__).resolve().parents[1]
|
| 9 |
+
if str(REPO_ROOT) not in sys.path:
|
| 10 |
+
sys.path.insert(0, str(REPO_ROOT))
|
| 11 |
+
|
| 12 |
+
from scripts import cycle_executor
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the Cycle 1a runner.

    Output locations default to paths under ``REPO_ROOT / "artifacts"``.
    ``argv`` is passed straight to ``ArgumentParser.parse_args``.
    """
    artifacts_dir = REPO_ROOT / "artifacts"
    parser = argparse.ArgumentParser(description="Run the full local Cycle 1a benchmark suite")

    # Path-valued outputs, each defaulting to a location under artifacts/.
    path_options = (
        ("--out-dir", "cycle1a_runs"),
        ("--preflight-out", "cycle1a_preflight.json"),
        ("--summary-out", "cycle1a_summary.json"),
    )
    for flag, leaf in path_options:
        parser.add_argument(flag, type=Path, default=artifacts_dir / leaf)

    # Boolean switches.
    for flag in ("--hydrate-assets", "--require-ready"):
        parser.add_argument(flag, action="store_true")

    # Optional string overrides with no default.
    for flag in ("--output-repo", "--tokenizer-repo"):
        parser.add_argument(flag)

    return parser.parse_args(argv)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def main(argv: list[str] | None = None) -> int:
    """Translate this script's options into a ``cycle_executor.main`` call.

    The benchmark, variant and seed arguments are fixed, and
    ``--all-runnable`` / ``--all-benchmarks`` are always passed. Optional
    flags are forwarded only when set on this script's command line.

    Returns:
        Whatever ``cycle_executor.main`` returns.
    """
    args = parse_args(argv)

    executor_argv = [
        "--benchmark", "GSM8K",
        "--variant", "hydra_full",
        "--seed", "42",
        "--out-dir", str(args.out_dir),
        "--preflight-out", str(args.preflight_out),
        "--summary-out", str(args.summary_out),
        "--all-runnable",
        "--all-benchmarks",
    ]
    if args.hydrate_assets:
        executor_argv.append("--hydrate-assets")
    if args.require_ready:
        executor_argv.append("--require-ready")
    if args.output_repo:
        executor_argv += ["--output-repo", args.output_repo]
    if args.tokenizer_repo:
        executor_argv += ["--tokenizer-repo", args.tokenizer_repo]

    return cycle_executor.main(executor_argv)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
if __name__ == "__main__":
|
| 46 |
+
raise SystemExit(main())
|
overlay/scripts/sweep_depth_aggregate.py
CHANGED
|
@@ -26,6 +26,8 @@ type MetricsDict = dict[str, MetricValue]
|
|
| 26 |
MANIFEST = Path(sys.argv[1] if len(sys.argv) > 1 else '/tmp/sweep_depth_manifest.txt')
|
| 27 |
STEP_TPS_PATTERN = re.compile(r"step=(\d+).*?\btps=(\d+)\b")
|
| 28 |
MIN_TPS = float(os.environ.get('SWEEP_MIN_TPS', '0'))
|
|
|
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
def _zero_shot_score(result: MetricsDict) -> float:
|
|
@@ -47,6 +49,25 @@ def _metric_int(result: MetricsDict, key: str, default: int = 0) -> int:
|
|
| 47 |
return int(value) if isinstance(value, int) else default
|
| 48 |
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
def _percentile_linear(sorted_values: list[float], pct: float) -> float:
|
| 51 |
if not sorted_values:
|
| 52 |
return 0.0
|
|
@@ -210,6 +231,28 @@ def compare(results: dict[int, MetricsDict]) -> None:
|
|
| 210 |
)
|
| 211 |
if MIN_TPS > 0:
|
| 212 |
print(f"[agg] throughput gate: tps_median >= {MIN_TPS:.0f}; feasible={feasible_count}/{len(ranked)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
|
| 215 |
def main() -> int:
|
|
|
|
| 26 |
MANIFEST = Path(sys.argv[1] if len(sys.argv) > 1 else '/tmp/sweep_depth_manifest.txt')
|
| 27 |
STEP_TPS_PATTERN = re.compile(r"step=(\d+).*?\btps=(\d+)\b")
|
| 28 |
MIN_TPS = float(os.environ.get('SWEEP_MIN_TPS', '0'))
|
| 29 |
+
TARGET_TOKENS_M = float(os.environ.get('SWEEP_TARGET_TOKENS_M', '0'))
|
| 30 |
+
TARGET_SECONDS = float(os.environ.get('SWEEP_TARGET_SECONDS', '0'))
|
| 31 |
|
| 32 |
|
| 33 |
def _zero_shot_score(result: MetricsDict) -> float:
|
|
|
|
| 49 |
return int(value) if isinstance(value, int) else default
|
| 50 |
|
| 51 |
|
| 52 |
+
def _fixed_budget_ranking(results: dict[int, MetricsDict], *, metric_key: str, target: float) -> list[tuple[int, MetricsDict, float]]:
    """Rank runs by how close their ``metric_key`` value is to ``target``.

    Rows whose ``metric_key`` value is not an int or float are left out.

    Returns:
        ``(n_layer, row, gap)`` tuples, where ``gap`` is the absolute distance
        to ``target``. They are sorted by smallest gap, then lowest
        ``val_bpb``, then highest zero-shot score, then highest ``tps_median``.
    """

    def _sort_key(entry: tuple[int, MetricsDict, float]) -> tuple[float, float, float, float]:
        _, metrics, distance = entry
        return (
            distance,
            _metric_float(metrics, 'val_bpb', float('inf')),
            -_zero_shot_score(metrics),
            -_metric_float(metrics, 'tps_median', 0.0),
        )

    candidates = [
        (depth, metrics, abs(float(budget) - target))
        for depth, metrics in results.items()
        if isinstance(budget := metrics.get(metric_key), (int, float))
    ]
    return sorted(candidates, key=_sort_key)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
def _percentile_linear(sorted_values: list[float], pct: float) -> float:
|
| 72 |
if not sorted_values:
|
| 73 |
return 0.0
|
|
|
|
| 231 |
)
|
| 232 |
if MIN_TPS > 0:
|
| 233 |
print(f"[agg] throughput gate: tps_median >= {MIN_TPS:.0f}; feasible={feasible_count}/{len(ranked)}")
|
| 234 |
+
|
| 235 |
+
if TARGET_TOKENS_M > 0:
|
| 236 |
+
print('\n=== Fixed-token champion comparison ===')
|
| 237 |
+
print(f' target_tokens_M={TARGET_TOKENS_M:.4f}')
|
| 238 |
+
for n, r, gap in _fixed_budget_ranking(results, metric_key='total_tokens_M', target=TARGET_TOKENS_M):
|
| 239 |
+
print(
|
| 240 |
+
f" n_layer={n:2d} val_bpb={_metric_float(r, 'val_bpb', float('nan')):.4f} "
|
| 241 |
+
f"total_tokens_M={_metric_float(r, 'total_tokens_M', float('nan')):.4f} "
|
| 242 |
+
f"token_gap_M={gap:.4f} tps_median={_metric_float(r, 'tps_median', 0.0):.0f}",
|
| 243 |
+
flush=True,
|
| 244 |
+
)
|
| 245 |
+
|
| 246 |
+
if TARGET_SECONDS > 0:
|
| 247 |
+
print('\n=== Fixed-time champion comparison ===')
|
| 248 |
+
print(f' target_seconds={TARGET_SECONDS:.1f}')
|
| 249 |
+
for n, r, gap in _fixed_budget_ranking(results, metric_key='training_seconds', target=TARGET_SECONDS):
|
| 250 |
+
print(
|
| 251 |
+
f" n_layer={n:2d} val_bpb={_metric_float(r, 'val_bpb', float('nan')):.4f} "
|
| 252 |
+
f"training_seconds={_metric_float(r, 'training_seconds', float('nan')):.1f} "
|
| 253 |
+
f"time_gap_s={gap:.1f} tps_median={_metric_float(r, 'tps_median', 0.0):.0f}",
|
| 254 |
+
flush=True,
|
| 255 |
+
)
|
| 256 |
|
| 257 |
|
| 258 |
def main() -> int:
|
overlay/scripts/watch_benchmark_hf_job.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def parse_benchmark_result_from_logs(lines: list[str]):
    """Return the last JSON object in ``lines`` that has a ``"benchmark"`` key.

    Lines are scanned from the end. A line qualifies only if, once stripped,
    it starts with ``{``, decodes as JSON, and yields a dict containing
    ``"benchmark"``. Returns ``None`` when no line qualifies.
    """
    for raw in lines[::-1]:
        candidate = raw.strip()
        if candidate.startswith("{"):
            try:
                decoded = json.loads(candidate)
            except json.JSONDecodeError:
                # Not valid JSON despite the leading brace; keep scanning.
                decoded = None
            if isinstance(decoded, dict) and "benchmark" in decoded:
                return decoded
    return None
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def write_watch_summary(path: Path, payload: dict[str, object]) -> None:
    """Write ``payload`` to ``path`` as indented, key-sorted UTF-8 JSON.

    Missing parent directories of ``path`` are created first.
    """
    destination_dir = path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2, sort_keys=True)
    path.write_text(serialized, encoding="utf-8")
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the benchmark job watcher.

    ``--job-id`` is mandatory, ``--namespace`` defaults to ``"jackoatmon"``,
    and ``--summary-out`` is an optional path. ``argv`` is passed straight to
    ``ArgumentParser.parse_args``.
    """
    cli = argparse.ArgumentParser(description="Watch or snapshot a remote benchmark job")
    option_specs = (
        ("--job-id", {"required": True}),
        ("--namespace", {"default": "jackoatmon"}),
        ("--summary-out", {"type": Path}),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args(argv)
|