#!/usr/bin/env python3 """ One-command Kaggle experiment runner for AutoDataLab++ Qwen2.5-1.5B. It runs, sequentially: 1. SFT 2. DPO 3. SFT -> DPO 4. GRPO+RLVR continuation from SFT->DPO 5. GRPO continuation from SFT->DPO 6. PPO continuation from SFT->DPO Every experiment evaluates both: - non-RAG mode - RAG mode It writes one leaderboard: /kaggle/working/cos_1p5b_all_runs/leaderboard.md /kaggle/working/cos_1p5b_all_runs/leaderboard.csv /kaggle/working/cos_1p5b_all_runs/leaderboard.json Recommended Kaggle cell: !python3 training/kaggle_run_all_1p5b_experiments.py --quick If the best RL method beats SFT->DPO, keep it. If not, keep SFT->DPO and report RL attempts as ablations. """ from __future__ import annotations import argparse import csv import json import os import subprocess import sys from pathlib import Path from typing import Any REPO = Path(__file__).resolve().parents[2] def run(cmd: list[str], cwd: Path) -> None: print("\n" + "=" * 100, flush=True) print("[cmd] " + " ".join(str(x) for x in cmd), flush=True) print("=" * 100, flush=True) subprocess.run(cmd, cwd=str(cwd), check=True) def evidence_score(rows: list[dict[str, Any]]) -> float: if not rows: return -999.0 vals = [] for row in rows: required = set(row.get("required_experts") or []) routed = set(row.get("model_routed_required") or []) coverage = len(required & routed) / max(len(required), 1) no_fallback = 1.0 if not row.get("needed_fallback") else 0.0 policy_reward = float(row.get("policy_reward") or 0.0) terminal = float(row.get("terminal_score") or 0.0) vals.append(coverage + no_fallback + 0.1 * policy_reward + 0.05 * terminal) return round(sum(vals) / len(vals), 4) def collect_rows(root: Path, run_type: str, run_name: str, adapter: Path, eval_dir: Path) -> dict[str, Any]: evidence_path = eval_dir / "evidence.json" if not evidence_path.is_file(): return { "run_type": run_type, "run_name": run_name, "adapter": str(adapter), "eval_dir": str(eval_dir), "score": -999.0, "ok": False, "rows": [], } rows = json.loads(evidence_path.read_text(encoding="utf-8")) all_no_fallback = all(not r.get("needed_fallback") for r in rows) all_full_coverage = all( set(r.get("required_experts") or []).issubset(set(r.get("model_routed_required") or [])) for r in rows ) return { "run_type": run_type, "run_name": run_name, "adapter": str(adapter), "eval_dir": str(eval_dir), "score": evidence_score(rows), "ok": bool(all_no_fallback and all_full_coverage), "rows": rows, } def write_leaderboard(out_root: Path, summaries: list[dict[str, Any]]) -> None: summaries = sorted(summaries, key=lambda x: x["score"], reverse=True) out_root.mkdir(parents=True, exist_ok=True) (out_root / "leaderboard.json").write_text(json.dumps(summaries, indent=2, default=str), encoding="utf-8") with (out_root / "leaderboard.csv").open("w", newline="", encoding="utf-8") as f: writer = csv.DictWriter( f, fieldnames=["rank", "run_type", "run_name", "score", "ok", "adapter", "eval_dir"], ) writer.writeheader() for i, row in enumerate(summaries, start=1): writer.writerow( { "rank": i, "run_type": row["run_type"], "run_name": row["run_name"], "score": row["score"], "ok": row["ok"], "adapter": row["adapter"], "eval_dir": row["eval_dir"], } ) md = [ "# AutoDataLab++ 1.5B Training Leaderboard", "", "| Rank | Method | Run | Score | Full routing / no fallback | Adapter |", "|---:|---|---|---:|---:|---|", ] for i, row in enumerate(summaries, start=1): md.append( f"| {i} | {row['run_type']} | `{row['run_name']}` | {row['score']} | " f"{row['ok']} | `{row['adapter']}` |" ) if summaries: best = summaries[0] md += [ "", "## Selected Best", "", f"- **Method:** {best['run_type']}", f"- **Run:** `{best['run_name']}`", f"- **Score:** {best['score']}", f"- **Adapter:** `{best['adapter']}`", f"- **Eval:** `{best['eval_dir']}`", ] (out_root / "leaderboard.md").write_text("\n".join(md), encoding="utf-8") def main() -> int: ap = argparse.ArgumentParser() ap.add_argument("--out-root", type=Path, default=Path("/kaggle/working/cos_1p5b_all_runs") if Path("/kaggle/working").is_dir() else Path("cos_1p5b_all_runs")) ap.add_argument("--base-out-root", type=Path, default=Path("/kaggle/working/cos_1p5b_runs") if Path("/kaggle/working").is_dir() else Path("cos_1p5b_runs")) ap.add_argument("--rl-out-root", type=Path, default=Path("/kaggle/working/cos_1p5b_rl_runs") if Path("/kaggle/working").is_dir() else Path("cos_1p5b_rl_runs")) ap.add_argument("--hf-token", default=os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or "") ap.add_argument("--model-id", default="Qwen/Qwen2.5-1.5B-Instruct") ap.add_argument("--eval-tasks", default="expert_brief,risk_brief,crisis_brief") ap.add_argument("--eval-rag-modes", default="false,true") ap.add_argument("--quick", action="store_true", help="short but useful hackathon run") ap.add_argument("--skip-sft", action="store_true") ap.add_argument("--skip-dpo", action="store_true") ap.add_argument("--skip-rl", action="store_true") args = ap.parse_args() py = sys.executable common = [ "--model-id", args.model_id, "--eval-tasks", args.eval_tasks, "--eval-rag-modes", args.eval_rag_modes, "--hf-token", args.hf_token, ] base_common = common + ["--out-root", str(args.base_out_root)] rl_common = common + ["--out-root", str(args.rl_out_root)] if args.quick: sft_epochs = "2" dpo_epochs = "1" rl_epochs = "1" max_train_examples = "0" max_train_states = "80" else: sft_epochs = "3" dpo_epochs = "2" rl_epochs = "2" max_train_examples = "0" max_train_states = "0" summaries: list[dict[str, Any]] = [] if not args.skip_sft: run( [ py, "training/scripts/kaggle_train_1p5b_methods.py", "--method", "sft", "--epochs", sft_epochs, "--run-name", "qwen15b_sft_all", "--max-train-examples", max_train_examples, *base_common, ], REPO, ) summaries.append( collect_rows( args.out_root, "sft", "qwen15b_sft_all", args.base_out_root / "qwen15b_sft_all" / "adapter", args.base_out_root / "qwen15b_sft_all" / "eval", ) ) if not args.skip_dpo: run( [ py, "training/scripts/kaggle_train_1p5b_methods.py", "--method", "dpo", "--epochs", dpo_epochs, "--run-name", "qwen15b_dpo_all", "--max-train-examples", max_train_examples, *base_common, ], REPO, ) summaries.append( collect_rows( args.out_root, "dpo", "qwen15b_dpo_all", args.base_out_root / "qwen15b_dpo_all" / "adapter", args.base_out_root / "qwen15b_dpo_all" / "eval", ) ) run( [ py, "training/scripts/kaggle_train_1p5b_methods.py", "--method", "sft_then_dpo", "--sft-epochs", sft_epochs, "--dpo-epochs", dpo_epochs, "--run-name", "qwen15b_sft_then_dpo_all", "--max-train-examples", max_train_examples, *base_common, ], REPO, ) sftdpo_adapter = args.base_out_root / "qwen15b_sft_then_dpo_all" / "adapter" summaries.append( collect_rows( args.out_root, "sft_then_dpo", "qwen15b_sft_then_dpo_all", sftdpo_adapter, args.base_out_root / "qwen15b_sft_then_dpo_all" / "eval", ) ) else: sftdpo_adapter = args.base_out_root / "qwen15b_sft_then_dpo_all" / "adapter" if not args.skip_rl: for method, lr, anchor in [ ("grpo_rlvr", "5e-6", "0.3"), ("grpo", "3e-6", "0.35"), ("ppo", "3e-6", "0.35"), ]: run_name = f"qwen15b_{method}_safe_all" run( [ py, "training/scripts/kaggle_rl_1p5b_methods.py", "--method", method, "--init-adapter", str(sftdpo_adapter), "--epochs", rl_epochs, "--lr", lr, "--sft-anchor", anchor, "--max-train-states", max_train_states, "--run-name", run_name, *rl_common, ], REPO, ) summaries.append( collect_rows( args.out_root, method, run_name, args.rl_out_root / run_name / "adapter", args.rl_out_root / run_name / "eval", ) ) write_leaderboard(args.out_root, summaries) print("\n" + "=" * 100, flush=True) print(f"[leaderboard] {args.out_root / 'leaderboard.md'}", flush=True) print(f"[leaderboard] {args.out_root / 'leaderboard.csv'}", flush=True) print(f"[leaderboard] {args.out_root / 'leaderboard.json'}", flush=True) print("=" * 100, flush=True) return 0 if __name__ == "__main__": raise SystemExit(main())