#!/usr/bin/env python3 """Phase 4 collection triage: scan completed JSONLs and produce a per-(model × pack × level) outcome table + per-loss command histogram so failures can be classified into engine / scenario / model. Usage: python3 scripts/triage_phase4.py [] ... If no roots, defaults to data/runs/paper-v1*. """ from __future__ import annotations import argparse import json import sys from collections import Counter, defaultdict from pathlib import Path def _command_kind(cmd) -> str: if isinstance(cmd, dict): return cmd.get("kind") or "?" if isinstance(cmd, str) and "::" in cmd: return cmd.split("::", 1)[1].split(" ", 1)[0] return "?" def triage_cell(jsonl: Path) -> dict | None: """Return a dict per cell: model, pack, level, seed, outcome, n_turns, cmd_kinds (Counter). None if the JSONL has no terminal line.""" try: lines = jsonl.read_text().splitlines() except OSError: return None if not lines: return None try: last = json.loads(lines[-1]) except json.JSONDecodeError: return None term = last.get("terminal") or {} if "outcome" not in term: return None # JSONL stem: ____seed__ parts = jsonl.stem.split("__") pack, level, seed_part, fog = parts[0], parts[1], parts[2], parts[3] # Parent dir: __ model = jsonl.parent.name.split("__", 1)[1] if "__" in jsonl.parent.name else jsonl.parent.name cmd_kinds: Counter = Counter() for ln in lines: try: d = json.loads(ln) except json.JSONDecodeError: continue for c in d.get("commands_issued") or []: cmd_kinds[_command_kind(c)] += 1 return { "model": model, "pack": pack, "level": level, "seed": int(seed_part.replace("seed", "")), "outcome": term["outcome"], "n_turns": int(term.get("turns") or len(lines)), "cmd_kinds": cmd_kinds, } def main(argv: list[str]) -> int: ap = argparse.ArgumentParser(description=__doc__) ap.add_argument("roots", nargs="*", default=[], help="Run dirs (default: data/runs/paper-v1*)") args = ap.parse_args(argv[1:]) repo = Path(__file__).resolve().parent.parent roots = [Path(r) for r in args.roots] if args.roots else \ sorted((repo / "data" / "runs").glob("paper-v1*")) records = [] for root in roots: if not root.is_dir(): continue for jsonl in root.rglob("*.jsonl"): r = triage_cell(jsonl) if r is not None: records.append(r) if not records: print("(no completed cells found)") return 1 # Outcome matrix: model × (pack, level) → Counter of outcomes matrix: dict = defaultdict(lambda: defaultdict(Counter)) losses = [] for r in records: matrix[r["model"]][(r["pack"], r["level"])][r["outcome"]] += 1 if r["outcome"] != "win": losses.append(r) print(f"=== Phase 4 triage on {len(records)} completed cells ===\n") print(f"Models in scope: {sorted(matrix.keys())}\n") for model, packs in sorted(matrix.items()): print(f"--- {model} ---") print(f"{'pack':30s} {'level':6s} {'W':>2s} {'L':>2s} {'D':>2s}") for (pack, level), counts in sorted(packs.items()): print(f" {pack[:28]:28s} {level:6s} " f"{counts.get('win',0):>2d} {counts.get('loss',0):>2d} {counts.get('draw',0):>2d}") print() print(f"=== {len(losses)} loss/draw cells with command histogram ===") print(f"{'model':30s} {'pack':30s} {'lvl':6s} {'sd':>2s} {'out':5s} {'turns':>5s} top cmds") for r in losses: top = " ".join(f"{k}:{n}" for k, n in r["cmd_kinds"].most_common(3)) print(f" {r['model'][:30]:30s} {r['pack'][:28]:28s} {r['level']:6s} " f"{r['seed']:>2d} {r['outcome']:5s} {r['n_turns']:>5d} {top}") # Classification per loss print(f"\n=== Failure classification ===") f1 = sum(1 for r in losses if r["outcome"] == "loss" and (set(r["cmd_kinds"].keys()) <= {"MoveUnits", "Observe", "Stop"})) f2 = sum(1 for r in losses if r["outcome"] == "loss" and "FireSuperweapon" in r["cmd_kinds"] and r["pack"] == "spec-nuke-strike") other = len([r for r in losses if r["outcome"] == "loss"]) - f1 - f2 print(f" F1 (passive/walk-only, no special verb): {f1} losses") print(f" F2 (superweapon mis-aim): {f2} losses") print(f" Other (verb invoked, still lost): {other} losses") print(f" Draws (need replay): " f"{sum(1 for r in losses if r['outcome']=='draw')}") return 0 if __name__ == "__main__": sys.exit(main(sys.argv))