# Source: blt-reasoner-pilot1/code/scripts/hf_upload_pilot.py
# Commit 9477b5c (verified), LauraGG: "BLT-Reasoner pilot 1: ckpts + code + logs + ablations"
"""Upload BLT-Reasoner pilot artifacts to a public HF repo.

Token is read from stdin so it never appears in command-line arguments,
process listings, or shell history on the box. Run as:

    cat token.txt | python3 -m experiments.blt_reasoner.scripts.hf_upload_pilot \
        --repo LauraGG/blt-reasoner-pilot1 \
        --pilot_dir /home/ubuntu/work/blt_pilot1 \
        --code_dir /home/ubuntu/experiments/blt_reasoner

Pass --private to create the repo as private instead of public.

Uploads (each in its own folder inside the repo):

    ckpts/ckpt-step{2000,4000,6000,8000,...} — all saved local ckpts
    code/ — full blt_reasoner source tree
    logs/run.log, logs/metrics.jsonl, logs/auto_eval.log, logs/interim_*.log,
        logs/run_attempt*.log — a fixed list of known log names; missing ones are skipped
    ablations/*.json — interim ablation results
    README.md — auto-generated state summary
"""
from __future__ import annotations
import argparse
import json
import os
import shutil
import sys
from pathlib import Path
def _numbered_ckpts(pilot_dir: Path) -> list:
    """Return ``(step, path)`` pairs for each ``ckpt-step<N>`` directory, ordered by step.

    Directories whose suffix is not an integer (e.g. ``ckpt-step_final``) are
    skipped instead of raising, so one stray folder cannot abort README generation.
    """
    prefix = "ckpt-step"
    found = []
    for path in pilot_dir.glob(prefix + "*"):
        if not path.is_dir():
            continue
        try:
            step = int(path.name[len(prefix):])
        except ValueError:
            continue
        found.append((step, path))
    # Name is only a tie-breaker for equal steps (e.g. "ckpt-step2000" vs "ckpt-step02000").
    found.sort(key=lambda pair: (pair[0], pair[1].name))
    return found


def _k_train_for_step(step: int) -> int:
    """Return the K_train value reported for a checkpoint saved at *step* (hard-coded thresholds)."""
    if step < 4000:
        return 4
    if step < 8000:
        return 8
    return 16


def _ablation_row(ckpt_name: str, json_path: Path) -> str:
    """Render one ablation JSON file as a Markdown table row.

    Unreadable or malformed files yield a "(parse error: ...)" row rather than
    raising, so the rest of the README is still produced.
    """
    nan = float("nan")
    try:
        d = json.loads(json_path.read_text(encoding="utf-8"))
        r = d.get("results", {})
        cells = [
            ckpt_name,
            str(d.get("K", "?")),
            str(d.get("n", "?")),
            f"{r.get('normal', {}).get('acc', nan):.3f}",
            f"{r.get('random', {}).get('acc', nan):.3f}",
            f"{r.get('zero', {}).get('acc', nan):.3f}",
            f"{d.get('delta_normal_minus_random', nan):+.3f}",
            f"{d.get('delta_normal_minus_zero', nan):+.3f}",
        ]
    except (OSError, ValueError, TypeError, AttributeError) as e:
        # OSError: unreadable file; ValueError: bad JSON / bad encoding / bad format code;
        # TypeError / AttributeError: JSON with an unexpected shape (non-dict, null fields).
        return f"| {ckpt_name} | (parse error: {e}) |"
    return "| " + " | ".join(cells) + " |"


def build_readme(pilot_dir: Path, code_dir: Path, repo: str) -> str:
    """Build the Markdown README summarising the pilot artifacts.

    Args:
        pilot_dir: Directory holding the ``ckpt-step<N>`` checkpoint folders; each
            may contain ``ablation_*.json`` result files.
        code_dir: Accepted for interface compatibility; not read by this function.
        repo: Hub repo id (``user/name``) interpolated into the resume instructions.

    Returns:
        The README text. Checkpoints and ablation rows are both listed in
        ascending numeric step order.
    """
    ckpts = _numbered_ckpts(pilot_dir)

    lines = []
    lines.append("# BLT-Reasoner Pilot 1 — checkpoints + code\n")
    lines.append(
        "Compute-constrained latent reasoning pilot on Qwen2.5-1.5B-Instruct + GSM8K. "
        "Continuous M-step latent loop + strict y→only-z bottleneck + InfoNCE z↔y "
        "identifiability loss. See `code/README.md` for architecture details and "
        "`HANDOFF_DACOT_PROPOSAL_2026-05-16.md` (in the main repo) for full motivation.\n"
    )

    # Inventory
    lines.append("## Checkpoints (LoRA adapter + projector + InfoNCE head)\n")
    lines.append("Each ckpt is ~25 MB — only the trained adapter/projector/head; "
                 "the base Qwen2.5-1.5B-Instruct is loaded fresh from HF on resume.\n")
    lines.append("| step | K_train | files |")
    lines.append("|---|---|---|")
    for step, c in ckpts:
        k = _k_train_for_step(step)
        lines.append(f"| {step} | {k} | `ckpts/{c.name}/model/`, `projector.pt`, `head.pt` |")
    lines.append("")

    # Ablations: walk checkpoints in numeric order so the table matches the
    # inventory above (a plain lexicographic sort puts step10000 before step2000).
    abls = [
        (c.name, f)
        for _, c in ckpts
        for f in sorted(c.glob("ablation_*.json"))
    ]
    if abls:
        lines.append("## Pre-registered z-ablation results\n")
        lines.append(
            "Pre-registered success criterion: `Δ_random ≥ 15 pp AND Δ_zero ≥ 25 pp` "
            "on GSM8K-test. Below are the interim results captured during training.\n"
        )
        lines.append("| ckpt | K_eval | n | acc(normal) | acc(random) | acc(zero) | Δ_random | Δ_zero |")
        lines.append("|---|---|---|---|---|---|---|---|")
        for cname, fpath in abls:
            lines.append(_ablation_row(cname, fpath))
        lines.append("")

    # Resume instructions
    lines.append("## Resume training on a fresh instance\n")
    lines.append("```bash\n"
                 "git clone <main-repo-with-experiments/blt_reasoner> # or pull the code/ subdir here\n"
                 "pip install transformers peft bitsandbytes datasets safetensors huggingface_hub\n"
                 "python3 -m experiments.blt_reasoner.train \\\n"
                 " --config experiments/blt_reasoner/configs/pilot_qwen15b_gsm8k.json \\\n"
                 f" --resume_from {repo}:ckpts/ckpt-step6000\n"
                 "```\n"
                 "Notes:\n"
                 "- The `--resume_from` flag (in `train.py`) accepts either a local ckpt path or "
                 f"a `{repo}:ckpts/ckpt-stepN` HF-Hub reference.\n"
                 "- **Optimizer state is not preserved** across resume. Expect a short loss spike "
                 "(~100–300 steps) while Adam moments re-stabilize. The latent geometry (LoRA "
                 "weights, projector, head) survives intact.\n"
                 "- The base model `Qwen/Qwen2.5-1.5B-Instruct` is fetched automatically.\n"
                 )
    lines.append("## Logs and intermediate artifacts\n"
                 "- `logs/run.log` — full training log\n"
                 "- `logs/metrics.jsonl` — per-step loss/metric breakdown\n"
                 "- `logs/auto_eval.log` — poller daemon log (auto-eval on train exit)\n"
                 "- `logs/interim_*.log` — interim ablation logs\n"
                 "- `code/` — full `experiments/blt_reasoner/` source tree at upload time\n")
    return "\n".join(lines)
def _stage_artifacts(pilot: Path, code: Path, stage: Path) -> None:
    """Copy checkpoints, code, logs and ablation JSONs from the pilot run into *stage*.

    *stage* must already exist and be empty. Missing optional inputs (code dir,
    individual log files) are skipped.
    """
    ckpt_dirs = [c for c in sorted(pilot.glob("ckpt-step*")) if c.is_dir()]

    # Ckpts
    (stage / "ckpts").mkdir()
    for c in ckpt_dirs:
        shutil.copytree(c, stage / "ckpts" / c.name)
        print(f"[upload] staged {c.name}", flush=True)

    # Code
    if code.is_dir():
        shutil.copytree(code, stage / "code",
                        ignore=shutil.ignore_patterns("__pycache__", "*.pyc"))
        print("[upload] staged code dir", flush=True)
    else:
        print(f"[upload] code dir {code} not found; skipping", file=sys.stderr)

    # Logs
    (stage / "logs").mkdir()
    for name in ("run.log", "metrics.jsonl", "auto_eval.log",
                 "interim_ablation.log",
                 "interim_ablation_K4.log",
                 "interim_ablation_K8.log",
                 "interim_ablation_K16_step8000.log",
                 "run_attempt1_oom.log",
                 "run_attempt2.log"):
        p = pilot / name
        if p.exists():
            shutil.copy(p, stage / "logs" / name)
            print(f"[upload] staged log {name}", flush=True)

    # Also stash all ablation_*.json under ablations/ at the top of the staged tree,
    # alongside their per-ckpt copies (which are in ckpts/ckpt-stepN/ already).
    (stage / "ablations").mkdir()
    for c in ckpt_dirs:
        for f in sorted(c.glob("ablation_*.json")):
            shutil.copy(f, stage / "ablations" / f"{c.name}__{f.name}")


def main():
    """Stage the pilot artifacts in a private temp dir and push them to the Hub in one commit.

    Reads the HF token from stdin. Exits with status 2, before any network call,
    if stdin does not hold an ``hf_`` token or if ``--pilot_dir`` is not a directory.
    """
    # Function-scope import: only this entry point needs it.
    import tempfile

    parser = argparse.ArgumentParser()
    parser.add_argument("--repo", required=True, help="e.g., LauraGG/blt-reasoner-pilot1")
    parser.add_argument("--pilot_dir", required=True, help="e.g., /home/ubuntu/work/blt_pilot1")
    parser.add_argument("--code_dir", required=True, help="e.g., /home/ubuntu/experiments/blt_reasoner")
    parser.add_argument("--private", action="store_true")
    args = parser.parse_args()

    token = sys.stdin.read().strip()
    if not token.startswith("hf_"):
        print("[upload] stdin did not contain an hf_ token; aborting", file=sys.stderr)
        sys.exit(2)

    pilot = Path(args.pilot_dir)
    code = Path(args.code_dir)
    # Fail fast on a mistyped path instead of creating a repo that holds only a README.
    if not pilot.is_dir():
        print(f"[upload] pilot dir {pilot} is not a directory; aborting", file=sys.stderr)
        sys.exit(2)

    from huggingface_hub import HfApi
    api = HfApi(token=token)
    print(f"[upload] creating repo {args.repo} (private={args.private})", flush=True)
    api.create_repo(repo_id=args.repo, repo_type="model", private=args.private, exist_ok=True)

    # Stage layout in a tmp dir, then upload as a single folder commit.
    # TemporaryDirectory gives a unique, owner-only path (no clash between concurrent
    # runs, no predictable /tmp name) and removes the staged copies even on failure.
    with tempfile.TemporaryDirectory(prefix="blt_upload_stage_") as tmp:
        stage = Path(tmp)
        _stage_artifacts(pilot, code, stage)

        # README — contains non-ASCII characters, so pin the encoding explicitly.
        readme = build_readme(pilot, code, args.repo)
        (stage / "README.md").write_text(readme, encoding="utf-8")
        print(f"[upload] staged README.md ({len(readme)} chars)", flush=True)

        # Final size
        total_bytes = sum(p.stat().st_size for p in stage.rglob("*") if p.is_file())
        print(f"[upload] total staged size = {total_bytes/1e6:.1f} MB", flush=True)

        print(f"[upload] pushing to {args.repo} ...", flush=True)
        api.upload_folder(
            folder_path=str(stage),
            repo_id=args.repo,
            repo_type="model",
            commit_message="BLT-Reasoner pilot 1: ckpts + code + logs + ablations",
        )
    print(f"[upload] DONE — https://huggingface.co/{args.repo}", flush=True)
# Script entry point: run the upload only when executed directly
# (e.g. via `python3 -m ...`), never as a side effect of importing this module.
if __name__ == "__main__":
    main()