# File size: 8,642 Bytes

# 9477b5c

"""Upload BLT-Reasoner pilot artifacts to a public HF repo.

Token is read from stdin so it never appears in command-line arguments,
process listings, or shell history on the box. Run as:

    cat token.txt | python3 -m experiments.blt_reasoner.scripts.hf_upload_pilot \
        --repo LauraGG/blt-reasoner-pilot1 \
        --pilot_dir /home/ubuntu/work/blt_pilot1 \
        --code_dir /home/ubuntu/experiments/blt_reasoner

Uploads (each in its own folder inside the repo):
    ckpts/ckpt-step{2000,4000,6000,8000,...}    — all saved local ckpts
    code/                                        — full blt_reasoner source tree
    logs/run.log, logs/metrics.jsonl, logs/auto_eval.log, logs/interim_*.log
    ablations/*.json                             — interim ablation results
    README.md                                    — auto-generated state summary
"""
from __future__ import annotations

import argparse
import json
import os
import shutil
import sys
from pathlib import Path


def build_readme(pilot_dir: Path, code_dir: Path, repo: str) -> str:
    """Render the Markdown README summarising the pilot artifacts.

    Scans ``pilot_dir`` for ``ckpt-step<N>`` directories and emits, in order:
    a short project blurb, a checkpoint inventory table, a table built from
    the ``ablation_*.json`` files found inside those checkpoints (only when
    at least one exists), resume instructions, and a list of log artifacts.

    Directories matching ``ckpt-step*`` whose suffix is not an integer
    (e.g. ``ckpt-step_final``) are skipped rather than aborting the run.

    Args:
        pilot_dir: Directory containing the ``ckpt-step<N>`` folders.
        code_dir: Source tree being uploaded. Not read by this function;
            kept so the call signature stays stable.
        repo: Repo id interpolated into the ``--resume_from`` examples.

    Returns:
        The README contents as one newline-joined string.
    """
    lines = []
    lines.append("# BLT-Reasoner Pilot 1 — checkpoints + code\n")
    lines.append(
        "Compute-constrained latent reasoning pilot on Qwen2.5-1.5B-Instruct + GSM8K. "
        "Continuous M-step latent loop + strict y→only-z bottleneck + InfoNCE z↔y "
        "identifiability loss. See `code/README.md` for architecture details and "
        "`HANDOFF_DACOT_PROPOSAL_2026-05-16.md` (in the main repo) for full motivation.\n"
    )

    # Inventory: collect (step, path) pairs, ignoring entries whose suffix
    # cannot be parsed as an int so one stray directory cannot crash the upload.
    ckpts = []
    for p in pilot_dir.glob("ckpt-step*"):
        if not p.is_dir():
            continue
        try:
            step = int(p.name.replace("ckpt-step", ""))
        except ValueError:
            continue
        ckpts.append((step, p))
    # Numeric order (2000 < 10000); the name is a tiebreaker for determinism.
    ckpts.sort(key=lambda pair: (pair[0], pair[1].name))

    lines.append("## Checkpoints (LoRA adapter + projector + InfoNCE head)\n")
    lines.append("Each ckpt is ~25 MB — only the trained adapter/projector/head; "
                 "the base Qwen2.5-1.5B-Instruct is loaded fresh from HF on resume.\n")
    lines.append("| step | K_train | files |")
    lines.append("|---|---|---|")
    for s, c in ckpts:
        # Hard-coded step -> K_train thresholds.
        # NOTE(review): presumably mirrors the training curriculum — confirm.
        if s < 4000:
            k = 4
        elif s < 8000:
            k = 8
        else:
            k = 16
        lines.append(f"| {s} | {k} | `ckpts/{c.name}/model/`, `projector.pt`, `head.pt` |")
    lines.append("")

    # Ablations: walk checkpoints in the same numeric order as the table above
    # so both tables agree; files within a checkpoint are sorted by name.
    abls = [
        (c.name, f)
        for _, c in ckpts
        for f in sorted(c.glob("ablation_*.json"))
    ]
    if abls:
        lines.append("## Pre-registered z-ablation results\n")
        lines.append(
            "Pre-registered success criterion: `Δ_random ≥ 15 pp AND Δ_zero ≥ 25 pp` "
            "on GSM8K-test. Below are the interim results captured during training.\n"
        )
        lines.append("| ckpt | K_eval | n | acc(normal) | acc(random) | acc(zero) | Δ_random | Δ_zero |")
        lines.append("|---|---|---|---|---|---|---|---|")
        nan = float("nan")
        for cname, fpath in abls:
            try:
                d = json.loads(fpath.read_text(encoding="utf-8"))
                r = d.get("results", {})
                row = [
                    cname,
                    str(d.get("K", "?")),
                    str(d.get("n", "?")),
                    f"{r.get('normal', {}).get('acc', nan):.3f}",
                    f"{r.get('random', {}).get('acc', nan):.3f}",
                    f"{r.get('zero', {}).get('acc', nan):.3f}",
                    f"{d.get('delta_normal_minus_random', nan):+.3f}",
                    f"{d.get('delta_normal_minus_zero', nan):+.3f}",
                ]
            except Exception as e:
                # Deliberately broad: the README is best-effort, so one bad
                # JSON file is reported in its row instead of aborting.
                # Collapse whitespace and escape pipes so the message cannot
                # break the table, then pad to the full eight columns.
                msg = " ".join(str(e).split()).replace("|", "\\|")
                row = [cname, f"(parse error: {msg})"] + [""] * 6
            lines.append("| " + " | ".join(row) + " |")
        lines.append("")

    # Resume instructions
    lines.append("## Resume training on a fresh instance\n")
    lines.append("```bash\n"
                 "git clone <main-repo-with-experiments/blt_reasoner>  # or pull the code/ subdir here\n"
                 "pip install transformers peft bitsandbytes datasets safetensors huggingface_hub\n"
                 "python3 -m experiments.blt_reasoner.train \\\n"
                 "    --config experiments/blt_reasoner/configs/pilot_qwen15b_gsm8k.json \\\n"
                 f"    --resume_from {repo}:ckpts/ckpt-step6000\n"
                 "```\n"
                 "Notes:\n"
                 "- The `--resume_from` flag (in `train.py`) accepts either a local ckpt path or "
                 f"a `{repo}:ckpts/ckpt-stepN` HF-Hub reference.\n"
                 "- **Optimizer state is not preserved** across resume. Expect a short loss spike "
                 "(~100–300 steps) while Adam moments re-stabilize. The latent geometry (LoRA "
                 "weights, projector, head) survives intact.\n"
                 "- The base model `Qwen/Qwen2.5-1.5B-Instruct` is fetched automatically.\n"
                 )
    lines.append("## Logs and intermediate artifacts\n"
                 "- `logs/run.log`         — full training log\n"
                 "- `logs/metrics.jsonl`   — per-step loss/metric breakdown\n"
                 "- `logs/auto_eval.log`   — poller daemon log (auto-eval on train exit)\n"
                 "- `logs/interim_*.log`   — interim ablation logs\n"
                 "- `code/`                — full `experiments/blt_reasoner/` source tree at upload time\n")
    return "\n".join(lines)


def main():
    """Stage the pilot artifacts and push them to the HF Hub as one commit.

    Steps:
      1. Parse ``--repo``, ``--pilot_dir``, ``--code_dir`` and ``--private``.
      2. Read the token from stdin; exit with status 2 if it does not start
         with ``hf_``.
      3. Create the repo (no error if it already exists).
      4. Stage checkpoints, code, logs, ablation JSONs and a generated
         README inside a fresh temporary directory.
      5. Upload the staged tree with a single ``upload_folder`` call.

    The staging directory is unique per run and is removed on exit, whether
    the upload succeeds or raises.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--repo", required=True, help="e.g., LauraGG/blt-reasoner-pilot1")
    parser.add_argument("--pilot_dir", required=True, help="e.g., /home/ubuntu/work/blt_pilot1")
    parser.add_argument("--code_dir", required=True, help="e.g., /home/ubuntu/experiments/blt_reasoner")
    parser.add_argument("--private", action="store_true")
    args = parser.parse_args()

    token = sys.stdin.read().strip()
    if not token.startswith("hf_"):
        print("[upload] stdin did not contain an hf_ token; aborting", file=sys.stderr)
        sys.exit(2)

    # Imported after the token check so a bad invocation fails fast.
    import tempfile

    from huggingface_hub import HfApi
    api = HfApi(token=token)

    print(f"[upload] creating repo {args.repo} (private={args.private})", flush=True)
    api.create_repo(repo_id=args.repo, repo_type="model", private=args.private, exist_ok=True)

    pilot = Path(args.pilot_dir)
    code = Path(args.code_dir)
    ckpt_dirs = sorted(p for p in pilot.glob("ckpt-step*") if p.is_dir())

    # Stage the layout in a private temp dir, then upload as a single folder
    # commit. A unique directory avoids clobbering a concurrent run, and the
    # context manager removes the staged copies even if the upload fails.
    with tempfile.TemporaryDirectory(prefix="blt_upload_stage_") as tmp:
        stage = Path(tmp)

        # Ckpts
        (stage / "ckpts").mkdir()
        for c in ckpt_dirs:
            shutil.copytree(c, stage / "ckpts" / c.name)
            print(f"[upload] staged {c.name}", flush=True)

        # Code
        if code.exists():
            shutil.copytree(code, stage / "code",
                            ignore=shutil.ignore_patterns("__pycache__", "*.pyc"))
            print("[upload] staged code dir", flush=True)
        else:
            print(f"[upload] code dir {code} not found; skipping", file=sys.stderr)

        # Logs: the well-known names first, then any other interim_*.log in
        # the pilot dir. dict.fromkeys de-duplicates while keeping order.
        (stage / "logs").mkdir()
        fixed_logs = ("run.log", "metrics.jsonl", "auto_eval.log",
                      "interim_ablation.log",
                      "interim_ablation_K4.log",
                      "interim_ablation_K8.log",
                      "interim_ablation_K16_step8000.log",
                      "run_attempt1_oom.log",
                      "run_attempt2.log")
        interim_logs = sorted(p.name for p in pilot.glob("interim_*.log"))
        for name in dict.fromkeys((*fixed_logs, *interim_logs)):
            p = pilot / name
            if p.is_file():
                shutil.copy(p, stage / "logs" / name)
                print(f"[upload] staged log {name}", flush=True)

        # Also stash all ablation_*.json under ablations/ at the top of the
        # staged tree, alongside their per-ckpt copies (which are already in
        # ckpts/ckpt-stepN/). The ckpt name prefix keeps file names unique.
        (stage / "ablations").mkdir()
        for c in ckpt_dirs:
            for f in c.glob("ablation_*.json"):
                shutil.copy(f, stage / "ablations" / f"{c.name}__{f.name}")

        # README: contains non-ASCII characters, so pin UTF-8 instead of
        # relying on the locale's default encoding.
        readme = build_readme(pilot, code, args.repo)
        (stage / "README.md").write_text(readme, encoding="utf-8")
        print(f"[upload] staged README.md ({len(readme)} chars)", flush=True)

        # Final size
        total_bytes = sum(p.stat().st_size for p in stage.rglob("*") if p.is_file())
        print(f"[upload] total staged size = {total_bytes/1e6:.1f} MB", flush=True)

        print(f"[upload] pushing to {args.repo} ...", flush=True)
        api.upload_folder(
            folder_path=str(stage),
            repo_id=args.repo,
            repo_type="model",
            commit_message="BLT-Reasoner pilot 1: ckpts + code + logs + ablations",
        )
        print(f"[upload] DONE — https://huggingface.co/{args.repo}", flush=True)


# Entry guard: run the upload only when this module is executed as a script
# (e.g. via ``python3 -m ...``), not when it is imported.
if __name__ == "__main__":
    main()