Spaces:

Madhav189
/

SystemTruth

Running

File size: 3,731 Bytes

2733f3f

"""Run the scripted-optimal baseline across all 12 templates × 5 procgen variants
and print a summary table. This is the smoke-check that the env is healthy and
the baseline ceiling is preserved.

Usage:
    python scripts/eval_baseline.py
    python scripts/eval_baseline.py --templates-only
    python scripts/eval_baseline.py --episodes-per-scenario 3
    python scripts/eval_baseline.py --output eval/results/baseline.jsonl
"""

from __future__ import annotations

import argparse
import json
from pathlib import Path
from statistics import mean

from unified_incident_env.models import UnifiedIncidentAction
from unified_incident_env.server.challenge import (
    SCENARIOS,
    list_baselines,
)
from unified_incident_env.server.environment import UnifiedIncidentEnvironment


def run_one(scenario_id: str) -> dict:
    """Replay the first listed baseline for ``scenario_id`` and summarise it.

    A fresh ``UnifiedIncidentEnvironment`` is reset to the scenario, then the
    actions of the first baseline returned by ``list_baselines`` are stepped
    through in order, stopping early once an observation reports ``done``.

    Returns:
        A dict built from the last observation and the scenario's
        ``SCENARIOS`` entry, with keys ``scenario_id``, ``template_id``
        (falls back to the scenario id), ``is_procgen`` (falls back to
        ``False``), ``final_score``, ``incident_resolved``, ``tick_count``
        and ``breakdown``.
    """
    environment = UnifiedIncidentEnvironment()
    observation = environment.reset(scenario_id=scenario_id)

    script = list_baselines(scenario_id=scenario_id).baselines[0]
    for scripted_step in script.actions:
        observation = environment.step(scripted_step.action)
        if observation.done:
            # The episode is over; remaining scripted actions are not played.
            break

    scenario = SCENARIOS[scenario_id]
    summary = {
        "scenario_id": scenario_id,
        "template_id": scenario.get("template_id", scenario_id),
        "is_procgen": scenario.get("is_procgen", False),
        "final_score": float(observation.final_score),
        "incident_resolved": bool(observation.incident_resolved),
        "tick_count": int(observation.tick_count),
        "breakdown": dict(observation.score_breakdown),
    }
    return summary


def main() -> None:
    """Run the scripted baseline over the selected scenarios and report scores.

    Command-line flags:
        --templates-only: keep only scenarios whose ``is_procgen`` entry is
            falsy.
        --episodes-per-scenario N: run every selected scenario N times
            (N must be at least 1).
        --output PATH: additionally write one JSON object per result row
            to PATH (parent directories are created as needed).

    Prints a per-run table, a per-template mean/min/max table, and an overall
    summary, plus a warning when the overall mean score is above 0.80.

    Raises:
        SystemExit: with status 2 (via ``parser.error``) when
            ``--episodes-per-scenario`` is below 1, or with a message when
            the selection contains no scenarios.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--templates-only", action="store_true",
                        help="Run only the 12 base templates, skip procgen variants.")
    parser.add_argument("--episodes-per-scenario", type=int, default=1,
                        help="Number of times to run each scenario (deterministic, so default 1).")
    parser.add_argument("--output", type=str, default=None,
                        help="Optional JSONL output path.")
    args = parser.parse_args()

    # Zero or negative repetitions would leave ``results`` empty and make the
    # ``mean`` calls below raise StatisticsError; reject them up front.
    if args.episodes_per_scenario < 1:
        parser.error("--episodes-per-scenario must be at least 1")

    if args.templates_only:
        scenario_ids = sorted(sid for sid, sc in SCENARIOS.items() if not sc.get("is_procgen"))
    else:
        scenario_ids = sorted(SCENARIOS.keys())

    # Nothing to run means nothing to average; fail with a readable message
    # instead of a StatisticsError traceback further down.
    if not scenario_ids:
        raise SystemExit("No scenarios to evaluate.")

    results = [
        run_one(sid)
        for sid in scenario_ids
        for _ in range(args.episodes_per_scenario)
    ]

    # Per-run table.
    print(f"\n{'scenario':<40} {'score':>7}  {'resolved':>9}  {'ticks':>5}")
    print("-" * 70)
    for r in results:
        flag = "OK" if r["incident_resolved"] else "X"
        print(f"{r['scenario_id']:<40} {r['final_score']:>7.3f}  {flag:>9}  {r['tick_count']:>5}")

    # Per-template aggregate table, grouped on each row's ``template_id``.
    print()
    by_template: dict[str, list[float]] = {}
    for r in results:
        by_template.setdefault(r["template_id"], []).append(r["final_score"])
    print(f"{'template':<40} {'mean':>7}  {'min':>7}  {'max':>7}  {'n':>3}")
    print("-" * 70)
    for tid, scores in sorted(by_template.items()):
        print(f"{tid:<40} {mean(scores):>7.3f}  {min(scores):>7.3f}  {max(scores):>7.3f}  {len(scores):>3}")

    overall_mean = mean(r["final_score"] for r in results)
    # ``incident_resolved`` is a bool, so summing counts the resolved runs.
    overall_resolved = sum(r["incident_resolved"] for r in results)
    print(f"\nOverall: mean={overall_mean:.3f}, resolved={overall_resolved}/{len(results)}")
    if overall_mean > 0.80:
        print("WARNING: scripted baseline ceiling exceeded 0.80 — see docs/REWARD_DESIGN.md §4")

    if args.output:
        out = Path(args.output)
        out.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding keeps the JSONL file independent of the locale.
        with out.open("w", encoding="utf-8") as f:
            f.writelines(json.dumps(r) + "\n" for r in results)
        print(f"\nWrote {len(results)} rows -> {out}")


# Run the evaluation only when this file is executed as a script; importing
# the module must not trigger a run.
if __name__ == "__main__":
    main()