Spaces:
Sleeping
Sleeping
| """Phase-gate evaluation for the GRPO policy. | |
| Runs the trained model against held-out (task, seed) pairs, prints aggregate | |
| metrics, and emits a JSONL log compatible with `eval-models/results/*.jsonl`. | |
| Sanity-gate thresholds (configurable): | |
| phase1 (smoke): format_ok_rate ≥ 0.50 | |
| phase2 (signal): mean_grader strictly higher than phase0 baseline | |
| phase3 (full): mean_grader ≥ best teacher baseline | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import os | |
| import sys | |
| from pathlib import Path | |
| _HERE = Path(__file__).resolve().parent | |
| _PARENT = _HERE.parent | |
| sys.path.insert(0, str(_PARENT)) | |
| from training.rollout import HFPolicy, rollout_batch # noqa: E402 | |
| def main(): | |
| p = argparse.ArgumentParser() | |
| p.add_argument("--model", required=True, help="HF model path or hub id (trained checkpoint)") | |
| p.add_argument("--base-url", default=os.getenv("API_BASE_URL_ENV", "http://localhost:8000")) | |
| p.add_argument("--platform", default="LinkedIn", help="Held-out platform (default LinkedIn)") | |
| p.add_argument("--tasks", nargs="+", default=["easy", "medium", "hard"]) | |
| p.add_argument("--seeds", nargs="+", type=int, default=[100, 101, 102, 103, 104]) | |
| p.add_argument("--out", default=str(_HERE / "runs" / "eval.jsonl")) | |
| p.add_argument("--gate", choices=["phase1", "phase2", "phase3"], default=None) | |
| p.add_argument("--baseline-grader", type=float, default=None, | |
| help="Mean grader from phase0 baseline, required when --gate=phase2") | |
| args = p.parse_args() | |
| policy = HFPolicy(args.model, temperature=0.0) | |
| rows = rollout_batch( | |
| policy.generate, | |
| base_url=args.base_url, platform=args.platform, | |
| tasks=args.tasks, seeds=args.seeds, | |
| model_tag=f"eval:{args.model}", | |
| ) | |
| if not rows: | |
| raise SystemExit("No rows produced — env unreachable?") | |
| Path(args.out).parent.mkdir(parents=True, exist_ok=True) | |
| with open(args.out, "w") as f: | |
| for r in rows: | |
| f.write(json.dumps(r) + "\n") | |
| n = len(rows) | |
| fmt = sum(r["format_ok"] for r in rows) / n | |
| grader = sum(r["r_grader"] for r in rows) / n | |
| total = sum(r["r_total"] for r in rows) / n | |
| print(f"\n=== eval @ {args.platform} (n={n} decisions) ===") | |
| print(f" format_ok_rate: {fmt:.3f}") | |
| print(f" mean_grader: {grader:.3f}") | |
| print(f" mean_r_total: {total:.3f}") | |
| print(f" log: {args.out}") | |
| if args.gate == "phase1": | |
| ok = fmt >= 0.50 | |
| print(f" gate phase1: format_ok_rate >= 0.50 → {'PASS' if ok else 'FAIL'}") | |
| sys.exit(0 if ok else 1) | |
| if args.gate == "phase2": | |
| if args.baseline_grader is None: | |
| raise SystemExit("--baseline-grader required for phase2 gate") | |
| ok = grader > args.baseline_grader | |
| print(f" gate phase2: grader {grader:.3f} > baseline {args.baseline_grader:.3f} → {'PASS' if ok else 'FAIL'}") | |
| sys.exit(0 if ok else 1) | |
| if args.gate == "phase3": | |
| ok = grader >= 0.50 # tune to teacher's mean | |
| print(f" gate phase3: grader >= 0.50 → {'PASS' if ok else 'FAIL'}") | |
| sys.exit(0 if ok else 1) | |
| if __name__ == "__main__": | |
| main() | |