# Source: OpenRA-Bench / scripts / triage_phase4.py (uploaded by yxc20098)
# Commit 3f122f2: "Phase 5 final: 74-cell triage strengthens F1 to ≥30pp scaling delta"
# File size: 4.87 kB
#!/usr/bin/env python3
"""Phase 4 collection triage: scan completed JSONLs and produce a
per-(model × pack × level) outcome table + per-loss command histogram
so failures can be classified into engine / scenario / model.
Usage:
python3 scripts/triage_phase4.py [<root>] ...
If no roots, defaults to data/runs/paper-v1*.
"""
from __future__ import annotations
import argparse
import json
import sys
from collections import Counter, defaultdict
from pathlib import Path
def _command_kind(cmd) -> str:
if isinstance(cmd, dict):
return cmd.get("kind") or "?"
if isinstance(cmd, str) and "::" in cmd:
return cmd.split("::", 1)[1].split(" ", 1)[0]
return "?"
def triage_cell(jsonl: Path) -> dict | None:
"""Return a dict per cell: model, pack, level, seed, outcome, n_turns,
cmd_kinds (Counter). None if the JSONL has no terminal line."""
try:
lines = jsonl.read_text().splitlines()
except OSError:
return None
if not lines:
return None
try:
last = json.loads(lines[-1])
except json.JSONDecodeError:
return None
term = last.get("terminal") or {}
if "outcome" not in term:
return None
# JSONL stem: <pack>__<level>__seed<N>__<fog>
parts = jsonl.stem.split("__")
pack, level, seed_part, fog = parts[0], parts[1], parts[2], parts[3]
# Parent dir: <ts>__<model_safe>
model = jsonl.parent.name.split("__", 1)[1] if "__" in jsonl.parent.name else jsonl.parent.name
cmd_kinds: Counter = Counter()
for ln in lines:
try:
d = json.loads(ln)
except json.JSONDecodeError:
continue
for c in d.get("commands_issued") or []:
cmd_kinds[_command_kind(c)] += 1
return {
"model": model,
"pack": pack,
"level": level,
"seed": int(seed_part.replace("seed", "")),
"outcome": term["outcome"],
"n_turns": int(term.get("turns") or len(lines)),
"cmd_kinds": cmd_kinds,
}
def main(argv: list[str]) -> int:
    """Print the triage report for one or more run directories.

    Args:
        argv: Full argument vector, program name first (``sys.argv`` style).
            Remaining items are run directories; when none are given the
            ``data/runs/paper-v1*`` entries two levels above this file are
            used.

    Returns:
        ``0`` after printing a report, ``1`` when no completed cell was found.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("roots", nargs="*", default=[],
                    help="Run dirs (default: data/runs/paper-v1*)")
    args = ap.parse_args(argv[1:])

    explicit_roots = bool(args.roots)
    if explicit_roots:
        roots = [Path(r) for r in args.roots]
    else:
        repo = Path(__file__).resolve().parent.parent
        roots = sorted((repo / "data" / "runs").glob("paper-v1*"))

    records = []
    for root in roots:
        if not root.is_dir():
            if explicit_roots:
                # A mistyped path should be visible, not silently ignored.
                print(f"warning: skipping {root} (not a directory)",
                      file=sys.stderr)
            continue
        # rglob yields in filesystem order; sort so the report is reproducible.
        for jsonl in sorted(root.rglob("*.jsonl")):
            r = triage_cell(jsonl)
            if r is not None:
                records.append(r)
    if not records:
        print("(no completed cells found)")
        return 1

    # Outcome matrix: model × (pack, level) → Counter of outcomes
    matrix: dict = defaultdict(lambda: defaultdict(Counter))
    losses = []
    for r in records:
        matrix[r["model"]][(r["pack"], r["level"])][r["outcome"]] += 1
        if r["outcome"] != "win":
            losses.append(r)

    print(f"=== Phase 4 triage on {len(records)} completed cells ===\n")
    print(f"Models in scope: {sorted(matrix.keys())}\n")
    for model, packs in sorted(matrix.items()):
        print(f"--- {model} ---")
        print(f"{'pack':30s} {'level':6s} {'W':>2s} {'L':>2s} {'D':>2s}")
        for (pack, level), counts in sorted(packs.items()):
            print(f" {pack[:28]:28s} {level:6s} "
                  f"{counts.get('win',0):>2d} {counts.get('loss',0):>2d} {counts.get('draw',0):>2d}")
        print()

    print(f"=== {len(losses)} loss/draw cells with command histogram ===")
    print(f"{'model':30s} {'pack':30s} {'lvl':6s} {'sd':>2s} {'out':5s} {'turns':>5s} top cmds")
    for r in losses:
        top = " ".join(f"{k}:{n}" for k, n in r["cmd_kinds"].most_common(3))
        print(f" {r['model'][:30]:30s} {r['pack'][:28]:28s} {r['level']:6s} "
              f"{r['seed']:>2d} {r['outcome']:5s} {r['n_turns']:>5d} {top}")

    # Classification per loss
    print("\n=== Failure classification ===")
    true_losses = [r for r in losses if r["outcome"] == "loss"]
    passive_verbs = {"MoveUnits", "Observe", "Stop"}
    # F1: the model only ever issued passive verbs (or nothing at all).
    f1 = sum(1 for r in true_losses
             if set(r["cmd_kinds"].keys()) <= passive_verbs)
    # F2: superweapon was fired on the nuke-strike pack but the cell was lost.
    f2 = sum(1 for r in true_losses
             if "FireSuperweapon" in r["cmd_kinds"]
             and r["pack"] == "spec-nuke-strike")
    other = len(true_losses) - f1 - f2
    print(f" F1 (passive/walk-only, no special verb): {f1} losses")
    print(f" F2 (superweapon mis-aim): {f2} losses")
    print(f" Other (verb invoked, still lost): {other} losses")
    print(f" Draws (need replay): "
          f"{sum(1 for r in losses if r['outcome']=='draw')}")

    # Non-win outcomes that are neither "loss" nor "draw" would otherwise
    # vanish from the summary; surface them only when they exist.
    unclassified = Counter(r["outcome"] for r in losses
                           if r["outcome"] not in ("loss", "draw"))
    if unclassified:
        detail = " ".join(f"{k}:{n}" for k, n in unclassified.most_common())
        print(f" Other outcomes (unclassified): {detail}")
    return 0
if __name__ == "__main__":
    # Script entry point: run main() on the raw argv and exit with its status.
    raise SystemExit(main(sys.argv))