"""Run baseline inference matrix (random + Open LLM) and summarize variance.
Usage examples:
python scripts/run_baseline_matrix.py --random-runs 1 --llm-runs 0
python scripts/run_baseline_matrix.py --random-runs 1 --llm-runs 3 --output-json baseline_report.json
Environment variables:
API_BASE_URL, MODEL_NAME
OPENAI_API_KEY or HF_TOKEN (required when --llm-runs > 0)
"""
from __future__ import annotations
import argparse
import json
import os
import re
import statistics
import subprocess
import sys
import time
from dataclasses import asdict, dataclass
from pathlib import Path
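
# Log lines expected from inference.py, as implied by the patterns below:
#   [START] task=<task_id> env=<env> model=<model>
#   [END] success=<true|false> steps=<int> score=<float> rewards=<...>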
START_RE = re.compile(r"^\[START\]\s+task=(\S+)\s+env=(\S+)\s+model=(\S+)$")
END_RE = re.compile(
r"^\[END\]\s+success=(true|false)\s+steps=(\d+)\s+score=([0-9]*\.?[0-9]+)\s+rewards=(.*)$"
)
@dataclass
class TaskEpisode:
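    """One task episode parsed from a [START]/[END] log pair."""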
task_id: str
success: bool
steps: int
score: float
@dataclass
class RunResult:
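    """Outcome of a single inference.py invocation for one lane."""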
lane: str
run_index: int
runtime_seconds: float
tasks: list[TaskEpisode]
return_code: int
stderr: str
def _project_root() -> Path:
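    """Return the repository root (the parent of this scripts/ directory)."""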
return Path(__file__).resolve().parents[1]
def _required_var(name: str) -> str:
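    """Return the value of an environment variable, raising if it is unset or empty."""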
value = os.environ.get(name)
if not value:
raise RuntimeError(f"Missing required environment variable: {name}")
return value
def _extract_task_episodes(stdout: str) -> list[TaskEpisode]:
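    """Pair [START]/[END] lines from inference.py stdout into TaskEpisode records."""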
episodes: list[TaskEpisode] = []
current_task: str | None = None
for line in stdout.splitlines():
start_match = START_RE.match(line)
if start_match:
current_task = start_match.group(1)
continue
end_match = END_RE.match(line)
if end_match:
task_id = current_task or f"unknown-{len(episodes) + 1}"
episodes.append(
TaskEpisode(
task_id=task_id,
success=end_match.group(1) == "true",
steps=int(end_match.group(2)),
score=float(end_match.group(3)),
)
)
current_task = None
return episodes
def _run_inference(lane: str, run_index: int, timeout_seconds: int) -> RunResult:
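    """Run inference.py once for the given lane ("random" or "llm") and parse its stdout."""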
env = os.environ.copy()
env.setdefault("API_BASE_URL", "https://api.openai.com/v1")
env.setdefault("MODEL_NAME", "baseline-model")
if lane == "random":
env["USE_RANDOM"] = "true"
env.setdefault("OPENAI_API_KEY", "dummy-token")
else:
env["USE_RANDOM"] = "false"
if not (env.get("OPENAI_API_KEY") or env.get("HF_TOKEN")):
raise RuntimeError(
"OPENAI_API_KEY or HF_TOKEN is required for Open LLM runs"
)
cmd = [sys.executable, "inference.py"]
started = time.monotonic()
proc = subprocess.run(
cmd,
cwd=str(_project_root()),
capture_output=True,
text=True,
encoding="utf-8",
errors="replace",
env=env,
timeout=timeout_seconds,
)
runtime = time.monotonic() - started
tasks = _extract_task_episodes(proc.stdout)
return RunResult(
lane=lane,
run_index=run_index,
runtime_seconds=runtime,
tasks=tasks,
return_code=proc.returncode,
stderr=proc.stderr.strip(),
)
def _summarize(runs: list[RunResult]) -> dict[str, dict[str, float]]:
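    """Aggregate per-task scores across runs into mean, population std, min, and max."""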
by_task: dict[str, list[float]] = {}
for run in runs:
for ep in run.tasks:
by_task.setdefault(ep.task_id, []).append(ep.score)
summary: dict[str, dict[str, float]] = {}
for task_id, scores in sorted(by_task.items()):
mean_score = statistics.mean(scores)
stdev_score = statistics.pstdev(scores) if len(scores) > 1 else 0.0
summary[task_id] = {
"runs": float(len(scores)),
"mean": round(mean_score, 6),
"std": round(stdev_score, 6),
"min": round(min(scores), 6),
"max": round(max(scores), 6),
}
return summary
def _print_summary(title: str, runs: list[RunResult]) -> None:
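    """Print per-task score statistics plus total runtime and failed-run count for a lane."""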
print(f"\n=== {title} ===")
if not runs:
print("No runs executed")
return
summary = _summarize(runs)
for task_id, metrics in summary.items():
print(
f"{task_id:16s} runs={int(metrics['runs'])} "
f"mean={metrics['mean']:.3f} std={metrics['std']:.3f} "
f"min={metrics['min']:.3f} max={metrics['max']:.3f}"
)
total_runtime = sum(r.runtime_seconds for r in runs)
failures = [r for r in runs if r.return_code != 0]
print(f"total_runtime_seconds={total_runtime:.2f}")
print(f"failed_runs={len(failures)}")
def _to_jsonable(runs: list[RunResult]) -> list[dict]:
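    """Convert RunResult dataclasses into plain dicts suitable for json.dumps."""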
serialized: list[dict] = []
for run in runs:
        # asdict() recursively serializes the nested TaskEpisode dataclasses as well.
        entry = asdict(run)
serialized.append(entry)
return serialized
def main() -> int:
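    """Parse CLI arguments, execute the requested baseline lanes, and summarize the results."""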
parser = argparse.ArgumentParser(description="Run baseline matrix for inference.py")
parser.add_argument("--random-runs", type=int, default=1)
parser.add_argument("--llm-runs", type=int, default=3)
parser.add_argument("--timeout-seconds", type=int, default=1200)
parser.add_argument("--output-json", type=str, default="")
args = parser.parse_args()
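    # setdefault() keeps any explicit values; the checks below still reject empty overrides.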
os.environ.setdefault("API_BASE_URL", "https://api.openai.com/v1")
os.environ.setdefault("MODEL_NAME", "nvidia/Nemotron-3-Super-49B-v1")
_required_var("API_BASE_URL")
_required_var("MODEL_NAME")
random_runs: list[RunResult] = []
llm_runs: list[RunResult] = []
try:
for idx in range(1, args.random_runs + 1):
print(f"Running random baseline {idx}/{args.random_runs}...")
random_runs.append(_run_inference("random", idx, args.timeout_seconds))
for idx in range(1, args.llm_runs + 1):
print(f"Running Open LLM baseline {idx}/{args.llm_runs}...")
llm_runs.append(_run_inference("llm", idx, args.timeout_seconds))
    except (RuntimeError, subprocess.TimeoutExpired) as exc:
print(f"ERROR: {exc}")
return 1
_print_summary("Random Baseline", random_runs)
_print_summary("Open LLM Baseline", llm_runs)
all_runs = random_runs + llm_runs
if args.output_json:
report = {
"api_base_url": os.environ.get("API_BASE_URL", ""),
"model_name": os.environ.get("MODEL_NAME", ""),
"random_summary": _summarize(random_runs),
"llm_summary": _summarize(llm_runs),
"runs": _to_jsonable(all_runs),
}
out_path = Path(args.output_json)
out_path.write_text(json.dumps(report, indent=2), encoding="utf-8")
print(f"Wrote report to {out_path}")
failures = [r for r in all_runs if r.return_code != 0]
if failures:
print("\nOne or more runs failed:")
for run in failures:
print(f"- lane={run.lane} run={run.run_index} rc={run.return_code}")
if run.stderr:
print(run.stderr)
return 1
return 0
if __name__ == "__main__":
raise SystemExit(main())