| """Run baseline inference matrix (random + Open LLM) and summarize variance. | |
| Usage examples: | |
| python scripts/run_baseline_matrix.py --random-runs 1 --llm-runs 0 | |
| python scripts/run_baseline_matrix.py --random-runs 1 --llm-runs 3 --output-json baseline_report.json | |
| Environment variables: | |
| API_BASE_URL, MODEL_NAME | |
| OPENAI_API_KEY or HF_TOKEN (required when --llm-runs > 0) | |
| """ | |

from __future__ import annotations

import argparse
import json
import os
import re
import statistics
import subprocess
import sys
import time
from dataclasses import asdict, dataclass
from pathlib import Path
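
# Markers emitted by inference.py on stdout, one [START]/[END] pair per task
# episode. Based on the regexes below, the lines are expected to look roughly
# like this (values illustrative, not real output):
#   [START] task=task-001 env=browser model=baseline-model
#   [END] success=true steps=12 score=0.75 rewards=[...]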
START_RE = re.compile(r"^\[START\]\s+task=(\S+)\s+env=(\S+)\s+model=(\S+)$")
END_RE = re.compile(
    r"^\[END\]\s+success=(true|false)\s+steps=(\d+)\s+score=([0-9]*\.?[0-9]+)\s+rewards=(.*)$"
)


@dataclass
class TaskEpisode:
    task_id: str
    success: bool
    steps: int
    score: float


@dataclass
class RunResult:
    lane: str
    run_index: int
    runtime_seconds: float
    tasks: list[TaskEpisode]
    return_code: int
    stderr: str


def _project_root() -> Path:
    return Path(__file__).resolve().parents[1]


def _required_var(name: str) -> str:
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(f"Missing required environment variable: {name}")
    return value
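

# Pairs each [START] line with the following [END] line; an [END] without a
# preceding [START] is still recorded, under a synthetic "unknown-<n>" task id.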
def _extract_task_episodes(stdout: str) -> list[TaskEpisode]:
    episodes: list[TaskEpisode] = []
    current_task: str | None = None
    for line in stdout.splitlines():
        start_match = START_RE.match(line)
        if start_match:
            current_task = start_match.group(1)
            continue
        end_match = END_RE.match(line)
        if end_match:
            task_id = current_task or f"unknown-{len(episodes) + 1}"
            episodes.append(
                TaskEpisode(
                    task_id=task_id,
                    success=end_match.group(1) == "true",
                    steps=int(end_match.group(2)),
                    score=float(end_match.group(3)),
                )
            )
            current_task = None
    return episodes
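

# Lane behaviour: "random" forces USE_RANDOM=true and injects a dummy token
# (presumably no real API call is made in that mode); any other lane keeps
# USE_RANDOM=false and requires a real OPENAI_API_KEY or HF_TOKEN. Note that
# subprocess.TimeoutExpired from the timeout is not caught here and will
# propagate to the caller.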
def _run_inference(lane: str, run_index: int, timeout_seconds: int) -> RunResult:
    env = os.environ.copy()
    env.setdefault("API_BASE_URL", "https://api.openai.com/v1")
    env.setdefault("MODEL_NAME", "baseline-model")
    if lane == "random":
        env["USE_RANDOM"] = "true"
        env.setdefault("OPENAI_API_KEY", "dummy-token")
    else:
        env["USE_RANDOM"] = "false"
        if not (env.get("OPENAI_API_KEY") or env.get("HF_TOKEN")):
            raise RuntimeError(
                "OPENAI_API_KEY or HF_TOKEN is required for Open LLM runs"
            )
    cmd = [sys.executable, "inference.py"]
    started = time.monotonic()
    proc = subprocess.run(
        cmd,
        cwd=str(_project_root()),
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
        env=env,
        timeout=timeout_seconds,
    )
    runtime = time.monotonic() - started
    tasks = _extract_task_episodes(proc.stdout)
    return RunResult(
        lane=lane,
        run_index=run_index,
        runtime_seconds=runtime,
        tasks=tasks,
        return_code=proc.returncode,
        stderr=proc.stderr.strip(),
    )
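

# Aggregates scores per task across runs: mean, population standard deviation
# (0.0 when there is only a single sample), min, and max.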
def _summarize(runs: list[RunResult]) -> dict[str, dict[str, float]]:
    by_task: dict[str, list[float]] = {}
    for run in runs:
        for ep in run.tasks:
            by_task.setdefault(ep.task_id, []).append(ep.score)
    summary: dict[str, dict[str, float]] = {}
    for task_id, scores in sorted(by_task.items()):
        mean_score = statistics.mean(scores)
        stdev_score = statistics.pstdev(scores) if len(scores) > 1 else 0.0
        summary[task_id] = {
            "runs": float(len(scores)),
            "mean": round(mean_score, 6),
            "std": round(stdev_score, 6),
            "min": round(min(scores), 6),
            "max": round(max(scores), 6),
        }
    return summary


def _print_summary(title: str, runs: list[RunResult]) -> None:
    print(f"\n=== {title} ===")
    if not runs:
        print("No runs executed")
        return
    summary = _summarize(runs)
    for task_id, metrics in summary.items():
        print(
            f"{task_id:16s} runs={int(metrics['runs'])} "
            f"mean={metrics['mean']:.3f} std={metrics['std']:.3f} "
            f"min={metrics['min']:.3f} max={metrics['max']:.3f}"
        )
    total_runtime = sum(r.runtime_seconds for r in runs)
    failures = [r for r in runs if r.return_code != 0]
    print(f"total_runtime_seconds={total_runtime:.2f}")
    print(f"failed_runs={len(failures)}")
def _to_jsonable(runs: list[RunResult]) -> list[dict]:
    serialized: list[dict] = []
    for run in runs:
        entry = asdict(run)
        entry["tasks"] = [asdict(t) for t in run.tasks]
        serialized.append(entry)
    return serialized
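

# CLI entry point. When --output-json is given, the written report contains
# "api_base_url", "model_name", "random_summary", "llm_summary" (per-task
# statistics), and "runs" (the serialized per-run results).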
def main() -> int:
    parser = argparse.ArgumentParser(description="Run baseline matrix for inference.py")
    parser.add_argument("--random-runs", type=int, default=1)
    parser.add_argument("--llm-runs", type=int, default=3)
    parser.add_argument("--timeout-seconds", type=int, default=1200)
    parser.add_argument("--output-json", type=str, default="")
    args = parser.parse_args()

    os.environ.setdefault("API_BASE_URL", "https://api.openai.com/v1")
    os.environ.setdefault("MODEL_NAME", "nvidia/Nemotron-3-Super-49B-v1")
    _required_var("API_BASE_URL")
    _required_var("MODEL_NAME")

    random_runs: list[RunResult] = []
    llm_runs: list[RunResult] = []
    try:
        for idx in range(1, args.random_runs + 1):
            print(f"Running random baseline {idx}/{args.random_runs}...")
            random_runs.append(_run_inference("random", idx, args.timeout_seconds))
        for idx in range(1, args.llm_runs + 1):
            print(f"Running Open LLM baseline {idx}/{args.llm_runs}...")
            llm_runs.append(_run_inference("llm", idx, args.timeout_seconds))
    except RuntimeError as exc:
        print(f"ERROR: {exc}")
        return 1

    _print_summary("Random Baseline", random_runs)
    _print_summary("Open LLM Baseline", llm_runs)

    all_runs = random_runs + llm_runs
    if args.output_json:
        report = {
            "api_base_url": os.environ.get("API_BASE_URL", ""),
            "model_name": os.environ.get("MODEL_NAME", ""),
            "random_summary": _summarize(random_runs),
            "llm_summary": _summarize(llm_runs),
            "runs": _to_jsonable(all_runs),
        }
        out_path = Path(args.output_json)
        out_path.write_text(json.dumps(report, indent=2), encoding="utf-8")
        print(f"Wrote report to {out_path}")

    failures = [r for r in all_runs if r.return_code != 0]
    if failures:
        print("\nOne or more runs failed:")
        for run in failures:
            print(f"- lane={run.lane} run={run.run_index} rc={run.return_code}")
            if run.stderr:
                print(run.stderr)
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())