blt-reasoner-pilot1 / code /scripts /hf_upload_pilot.py

BLT-Reasoner pilot 1: ckpts + code + logs + ablations

9477b5c verified 11 days ago

8.64 kB

	"""Upload BLT-Reasoner pilot artifacts to a public HF repo.

	Token is read from stdin so it never appears in command-line arguments,
	process listings, or shell history on the box. Run as:

	cat token.txt | python3 -m experiments.blt_reasoner.scripts.hf_upload_pilot \
	--repo LauraGG/blt-reasoner-pilot1 \
	--pilot_dir /home/ubuntu/work/blt_pilot1 \
	--code_dir /home/ubuntu/experiments/blt_reasoner

	Uploads (each in its own folder inside the repo):
	ckpts/ckpt-step{2000,4000,6000,8000,...} — all saved local ckpts
	code/ — full blt_reasoner source tree
	logs/run.log, logs/metrics.jsonl, logs/auto_eval.log, logs/interim_*.log
	ablations/*.json — interim ablation results
	README.md — auto-generated state summary
	"""
	from __future__ import annotations

	import argparse
	import json
	import os
	import shutil
	import sys
	from pathlib import Path


	def _list_ckpts(pilot_dir: Path) -> list:
	    """Return ``(step, path)`` pairs for the checkpoint directories, ordered by step.

	    Only directories named ``ckpt-step<N>`` with an integer ``N`` are returned.
	    Anything else matching ``ckpt-step*`` (plain files, ``ckpt-step2000_backup``,
	    ...) is skipped so one stray entry cannot abort README generation.
	    """
	    prefix = "ckpt-step"
	    found = []
	    for path in pilot_dir.glob(f"{prefix}*"):
	        if not path.is_dir():
	            continue
	        try:
	            step = int(path.name[len(prefix):])
	        except ValueError:
	            continue
	        found.append((step, path))
	    found.sort(key=lambda item: (item[0], item[1].name))
	    return found


	def _k_train_for_step(step: int) -> int:
	    """Return the K_train value reported in the README table for a given step."""
	    if step < 4000:
	        return 4
	    if step < 8000:
	        return 8
	    return 16


	def build_readme(pilot_dir: Path, code_dir: Path, repo: str) -> str:
	    """Build the Markdown text of the README.md uploaded with the pilot artifacts.

	    Args:
	        pilot_dir: Directory scanned for ``ckpt-step<N>`` checkpoint folders and
	            the ``ablation_*.json`` files inside them.
	        code_dir: Accepted for interface compatibility; not read here.
	        repo: Hub repo id, interpolated into the resume instructions.

	    Returns:
	        The README as one string: intro, checkpoint table, ablation table (only
	        when at least one ablation file exists), resume instructions and a
	        description of the uploaded logs.

	    An ablation file that cannot be read, parsed or formatted yields a
	    "(parse error: ...)" row instead of raising.
	    """
	    lines = []
	    lines.append("# BLT-Reasoner Pilot 1 — checkpoints + code\n")
	    lines.append(
	        "Compute-constrained latent reasoning pilot on Qwen2.5-1.5B-Instruct + GSM8K. "
	        "Continuous M-step latent loop + strict y→only-z bottleneck + InfoNCE z↔y "
	        "identifiability loss. See `code/README.md` for architecture details and "
	        "`HANDOFF_DACOT_PROPOSAL_2026-05-16.md` (in the main repo) for full motivation.\n"
	    )

	    # Inventory
	    ckpts = _list_ckpts(pilot_dir)
	    lines.append("## Checkpoints (LoRA adapter + projector + InfoNCE head)\n")
	    lines.append("Each ckpt is ~25 MB — only the trained adapter/projector/head; "
	                 "the base Qwen2.5-1.5B-Instruct is loaded fresh from HF on resume.\n")
	    lines.append("| step | K_train | files |")
	    lines.append("|---|---|---|")
	    for step, ckpt in ckpts:
	        k = _k_train_for_step(step)
	        lines.append(
	            f"| {step} | {k} | `ckpts/{ckpt.name}/model/`, `projector.pt`, `head.pt` |"
	        )
	    lines.append("")

	    # Ablations
	    abls = [
	        (step, ckpt.name, fpath)
	        for step, ckpt in ckpts
	        for fpath in ckpt.glob("ablation_*.json")
	    ]
	    # Order rows by numeric step (then file name); sorting on the checkpoint
	    # name as a string would put ckpt-step10000 before ckpt-step2000.
	    abls.sort(key=lambda item: (item[0], item[2].name))
	    if abls:
	        lines.append("## Pre-registered z-ablation results\n")
	        lines.append(
	            "Pre-registered success criterion: `Δ_random ≥ 15 pp AND Δ_zero ≥ 25 pp` "
	            "on GSM8K-test. Below are the interim results captured during training.\n"
	        )
	        lines.append("| ckpt | K_eval | n | acc(normal) | acc(random) | acc(zero) | Δ_random | Δ_zero |")
	        lines.append("|---|---|---|---|---|---|---|---|")
	        nan = float("nan")
	        for _step, cname, fpath in abls:
	            try:
	                d = json.loads(fpath.read_text(encoding="utf-8"))
	                r = d.get("results", {})
	                row = [
	                    cname,
	                    str(d.get("K", "?")),
	                    str(d.get("n", "?")),
	                    f"{r.get('normal', {}).get('acc', nan):.3f}",
	                    f"{r.get('random', {}).get('acc', nan):.3f}",
	                    f"{r.get('zero', {}).get('acc', nan):.3f}",
	                    f"{d.get('delta_normal_minus_random', nan):+.3f}",
	                    f"{d.get('delta_normal_minus_zero', nan):+.3f}",
	                ]
	                lines.append("| " + " | ".join(row) + " |")
	            except (OSError, ValueError, TypeError, AttributeError) as e:
	                # Best effort: unreadable file, invalid JSON, or unexpected
	                # value types must not block the upload.
	                lines.append(f"| {cname} | (parse error: {e}) |")
	        lines.append("")

	    # Resume instructions
	    lines.append("## Resume training on a fresh instance\n")
	    lines.append(
	        "```bash\n"
	        "git clone <main-repo-with-experiments/blt_reasoner> # or pull the code/ subdir here\n"
	        "pip install transformers peft bitsandbytes datasets safetensors huggingface_hub\n"
	        "python3 -m experiments.blt_reasoner.train \\\n"
	        " --config experiments/blt_reasoner/configs/pilot_qwen15b_gsm8k.json \\\n"
	        f" --resume_from {repo}:ckpts/ckpt-step6000\n"
	        "```\n"
	        "Notes:\n"
	        "- The `--resume_from` flag (in `train.py`) accepts either a local ckpt path or "
	        f"a `{repo}:ckpts/ckpt-stepN` HF-Hub reference.\n"
	        "- Optimizer state is not preserved across resume. Expect a short loss spike "
	        "(~100–300 steps) while Adam moments re-stabilize. The latent geometry (LoRA "
	        "weights, projector, head) survives intact.\n"
	        "- The base model `Qwen/Qwen2.5-1.5B-Instruct` is fetched automatically.\n"
	    )
	    lines.append(
	        "## Logs and intermediate artifacts\n"
	        "- `logs/run.log` — full training log\n"
	        "- `logs/metrics.jsonl` — per-step loss/metric breakdown\n"
	        "- `logs/auto_eval.log` — poller daemon log (auto-eval on train exit)\n"
	        "- `logs/interim_*.log` — interim ablation logs\n"
	        "- `code/` — full `experiments/blt_reasoner/` source tree at upload time\n"
	    )
	    return "\n".join(lines)


	def _stage_artifacts(pilot: Path, code: Path, stage: Path, repo: str) -> None:
	    """Copy everything to upload into ``stage`` using the repo's folder layout.

	    Creates ``ckpts/``, ``code/`` (only if ``code`` exists), ``logs/``,
	    ``ablations/`` and ``README.md`` under ``stage``, which must already exist
	    and be empty.
	    """
	    ckpt_dirs = sorted(c for c in pilot.glob("ckpt-step*") if c.is_dir())

	    # Ckpts
	    (stage / "ckpts").mkdir()
	    for c in ckpt_dirs:
	        shutil.copytree(c, stage / "ckpts" / c.name)
	        print(f"[upload] staged {c.name}", flush=True)

	    # Code
	    if code.exists():
	        shutil.copytree(code, stage / "code",
	                        ignore=shutil.ignore_patterns("__pycache__", "*.pyc"))
	        print("[upload] staged code dir", flush=True)
	    else:
	        print(f"[upload] code_dir {code} not found; skipping code/", file=sys.stderr)

	    # Logs: the well-known names plus any other interim_*.log, so the staged
	    # set matches the documented `logs/interim_*.log` contents.
	    (stage / "logs").mkdir()
	    known_logs = (
	        "run.log", "metrics.jsonl", "auto_eval.log",
	        "interim_ablation.log",
	        "interim_ablation_K4.log",
	        "interim_ablation_K8.log",
	        "interim_ablation_K16_step8000.log",
	        "run_attempt1_oom.log",
	        "run_attempt2.log",
	    )
	    interim_logs = sorted(p.name for p in pilot.glob("interim_*.log"))
	    # dict.fromkeys de-duplicates while keeping first-seen order.
	    for name in dict.fromkeys((*known_logs, *interim_logs)):
	        src = pilot / name
	        if src.is_file():
	            shutil.copy(src, stage / "logs" / name)
	            print(f"[upload] staged log {name}", flush=True)

	    # Also stash all ablation_*.json under ablations/ at the top of the staged tree,
	    # alongside their per-ckpt copies (which are in ckpts/ckpt-stepN/ already).
	    (stage / "ablations").mkdir()
	    for c in ckpt_dirs:
	        for f in sorted(c.glob("ablation_*.json")):
	            shutil.copy(f, stage / "ablations" / f"{c.name}__{f.name}")

	    # README contains non-ASCII characters, so the encoding is pinned rather
	    # than left to the locale default.
	    readme = build_readme(pilot, code, repo)
	    (stage / "README.md").write_text(readme, encoding="utf-8")
	    print(f"[upload] staged README.md ({len(readme)} chars)", flush=True)


	def main():
	    """Stage the pilot artifacts and push them to a Hub model repo in one commit.

	    Reads the token from stdin and exits with status 2 if it does not start
	    with ``hf_`` or if ``--pilot_dir`` is not a directory. The repo is created
	    if missing (public unless ``--private`` is given).
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--repo", required=True, help="e.g., LauraGG/blt-reasoner-pilot1")
	    parser.add_argument("--pilot_dir", required=True, help="e.g., /home/ubuntu/work/blt_pilot1")
	    parser.add_argument("--code_dir", required=True, help="e.g., /home/ubuntu/experiments/blt_reasoner")
	    parser.add_argument("--private", action="store_true")
	    args = parser.parse_args()

	    token = sys.stdin.read().strip()
	    if not token.startswith("hf_"):
	        print("[upload] stdin did not contain an hf_ token; aborting", file=sys.stderr)
	        sys.exit(2)

	    # Validate local inputs before touching the Hub, so a mistyped path does
	    # not leave behind a freshly created repo.
	    pilot = Path(args.pilot_dir)
	    code = Path(args.code_dir)
	    if not pilot.is_dir():
	        print(f"[upload] pilot_dir {pilot} is not a directory; aborting", file=sys.stderr)
	        sys.exit(2)

	    import tempfile

	    from huggingface_hub import HfApi
	    api = HfApi(token=token)

	    print(f"[upload] creating repo {args.repo} (private={args.private})", flush=True)
	    api.create_repo(repo_id=args.repo, repo_type="model", private=args.private, exist_ok=True)

	    # Stage layout in a private temp dir, then upload as a single folder commit.
	    # The context manager removes the staged copy even if staging or upload fails.
	    with tempfile.TemporaryDirectory(prefix="blt_upload_stage_") as tmp:
	        stage = Path(tmp)
	        _stage_artifacts(pilot, code, stage, args.repo)

	        # Final size
	        total_bytes = sum(p.stat().st_size for p in stage.rglob("*") if p.is_file())
	        print(f"[upload] total staged size = {total_bytes/1e6:.1f} MB", flush=True)

	        print(f"[upload] pushing to {args.repo} ...", flush=True)
	        api.upload_folder(
	            folder_path=str(stage),
	            repo_id=args.repo,
	            repo_type="model",
	            commit_message="BLT-Reasoner pilot 1: ckpts + code + logs + ablations",
	        )

	    print(f"[upload] DONE — https://huggingface.co/{args.repo}", flush=True)


	# Run the upload only when executed as a script (or via `python -m`), not on import.
	if __name__ == "__main__":
	    main()