#!/usr/bin/env python3
"""Phase 4 collection triage: scan completed JSONLs and produce a
per-(model × pack × level) outcome table + per-loss command histogram
so failures can be classified into engine / scenario / model.
Usage:
python3 scripts/triage_phase4.py [<root>] ...
If no roots, defaults to data/runs/paper-v1*.
"""
from __future__ import annotations
import argparse
import json
import sys
from collections import Counter, defaultdict
from pathlib import Path
def _command_kind(cmd) -> str:
    """Return the verb name of a single issued command.

    A dict command reports its ``"kind"`` entry. A string command containing
    ``::`` reports the token that sits between the first ``::`` and the next
    space. Every other input, and a dict whose ``"kind"`` is missing or falsy,
    is reported as ``"?"``.
    """
    unknown = "?"
    if isinstance(cmd, dict):
        kind = cmd.get("kind")
        return kind if kind else unknown
    if not isinstance(cmd, str) or "::" not in cmd:
        return unknown
    # Drop everything up to the first "::", then keep the leading token.
    _, _, tail = cmd.partition("::")
    verb, _, _ = tail.partition(" ")
    return verb
def triage_cell(jsonl: Path) -> dict | None:
    """Summarise one completed cell from its JSONL log.

    The file is expected to be named ``<pack>__<level>__seed<N>__<fog>.jsonl``
    and to sit in a directory named ``<ts>__<model_safe>`` (a parent name
    without ``__`` is used verbatim as the model). Its last non-blank line
    must be a JSON object whose ``terminal`` object carries ``outcome``.

    Args:
        jsonl: Path to the per-cell JSONL log.

    Returns:
        A dict with keys ``model``, ``pack``, ``level``, ``seed``,
        ``outcome``, ``n_turns`` and ``cmd_kinds`` (a ``Counter`` of command
        kinds over every line's ``commands_issued``). ``None`` when the file
        cannot be used: unreadable or not UTF-8, empty, no terminal record on
        the last line, or a file name that does not follow the naming scheme
        above (so stray ``*.jsonl`` files do not abort a scan).
    """
    try:
        text = jsonl.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return None

    # Ignore blank lines so a trailing empty line cannot hide the terminal
    # record or inflate the line-count fallback for n_turns.
    lines = [ln for ln in text.splitlines() if ln.strip()]
    if not lines:
        return None

    try:
        last = json.loads(lines[-1])
    except json.JSONDecodeError:
        return None
    if not isinstance(last, dict):
        return None
    term = last.get("terminal") or {}
    if not isinstance(term, dict) or "outcome" not in term:
        return None

    # JSONL stem: <pack>__<level>__seed<N>__<fog>
    parts = jsonl.stem.split("__")
    if len(parts) < 4:
        return None
    pack, level, seed_part = parts[0], parts[1], parts[2]
    try:
        seed = int(seed_part.replace("seed", ""))
    except ValueError:
        return None

    # Parent dir: <ts>__<model_safe>
    parent_name = jsonl.parent.name
    model = parent_name.split("__", 1)[1] if "__" in parent_name else parent_name

    cmd_kinds: Counter = Counter()
    for ln in lines:
        try:
            d = json.loads(ln)
        except json.JSONDecodeError:
            continue  # tolerate a corrupt line; the rest still counts
        if not isinstance(d, dict):
            continue
        for c in d.get("commands_issued") or []:
            cmd_kinds[_command_kind(c)] += 1

    return {
        "model": model,
        "pack": pack,
        "level": level,
        "seed": seed,
        "outcome": term["outcome"],
        # Fall back to the number of log lines when "turns" is absent or 0.
        "n_turns": int(term.get("turns") or len(lines)),
        "cmd_kinds": cmd_kinds,
    }
def _collect_records(roots) -> list:
    """Triage every ``*.jsonl`` found under *roots* and keep the usable cells."""
    records = []
    for root in roots:
        if not root.is_dir():
            # Report it: a mistyped root would otherwise look like "no data".
            print(f"warning: skipping {root}: not a directory", file=sys.stderr)
            continue
        # rglob yields in arbitrary order; sort so the report is reproducible.
        for jsonl in sorted(root.rglob("*.jsonl")):
            r = triage_cell(jsonl)
            if r is not None:
                records.append(r)
    return records


def _print_outcome_matrix(records) -> None:
    """Print, per model, the win/loss/draw counts of each (pack, level)."""
    # Outcome matrix: model × (pack, level) → Counter of outcomes
    matrix: dict = defaultdict(lambda: defaultdict(Counter))
    for r in records:
        matrix[r["model"]][(r["pack"], r["level"])][r["outcome"]] += 1

    print(f"Models in scope: {sorted(matrix.keys())}\n")
    for model, packs in sorted(matrix.items()):
        print(f"--- {model} ---")
        print(f"{'pack':30s} {'level':6s} {'W':>2s} {'L':>2s} {'D':>2s}")
        for (pack, level), counts in sorted(packs.items()):
            print(f" {pack[:28]:28s} {level:6s} "
                  f"{counts.get('win',0):>2d} {counts.get('loss',0):>2d} {counts.get('draw',0):>2d}")
        print()


def _print_loss_table(losses) -> None:
    """Print one row per non-win cell with its three most common commands."""
    print(f"=== {len(losses)} loss/draw cells with command histogram ===")
    print(f"{'model':30s} {'pack':30s} {'lvl':6s} {'sd':>2s} {'out':5s} {'turns':>5s} top cmds")
    for r in losses:
        top = " ".join(f"{k}:{n}" for k, n in r["cmd_kinds"].most_common(3))
        print(f" {r['model'][:30]:30s} {r['pack'][:28]:28s} {r['level']:6s} "
              f"{r['seed']:>2d} {r['outcome']:5s} {r['n_turns']:>5d} {top}")


def _print_failure_classification(losses) -> None:
    """Print how many losses fall into F1, F2 and Other, plus the draw count."""
    # Classification per loss
    print("\n=== Failure classification ===")
    passive_verbs = {"MoveUnits", "Observe", "Stop"}
    lost = [r for r in losses if r["outcome"] == "loss"]
    # F1: nothing beyond the passive verbs was ever issued (includes no commands).
    f1 = sum(1 for r in lost if set(r["cmd_kinds"]) <= passive_verbs)
    # F2: the superweapon was fired on the nuke-strike pack and the cell was still lost.
    f2 = sum(1 for r in lost
             if "FireSuperweapon" in r["cmd_kinds"]
             and r["pack"] == "spec-nuke-strike")
    other = len(lost) - f1 - f2
    draws = sum(1 for r in losses if r["outcome"] == "draw")
    # NOTE(review): outcomes other than "loss"/"draw" appear in the loss table
    # but are not counted by any line below.
    print(f" F1 (passive/walk-only, no special verb): {f1} losses")
    print(f" F2 (superweapon mis-aim): {f2} losses")
    print(f" Other (verb invoked, still lost): {other} losses")
    print(f" Draws (need replay): "
          f"{draws}")


def main(argv: list[str]) -> int:
    """Scan run directories and print the Phase 4 triage report.

    Args:
        argv: Full argument vector including the program name at index 0.
            Remaining items are run directories; when none are given, every
            ``data/runs/paper-v1*`` entry beside this script's parent
            directory is used.

    Returns:
        0 after printing the report, 1 when no completed cell was found.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("roots", nargs="*", default=[],
                    help="Run dirs (default: data/runs/paper-v1*)")
    args = ap.parse_args(argv[1:])

    if args.roots:
        roots = [Path(r) for r in args.roots]
    else:
        repo = Path(__file__).resolve().parent.parent
        roots = sorted((repo / "data" / "runs").glob("paper-v1*"))

    records = _collect_records(roots)
    if not records:
        print("(no completed cells found)")
        return 1

    # Anything that is not a win goes to the loss table.
    losses = [r for r in records if r["outcome"] != "win"]

    print(f"=== Phase 4 triage on {len(records)} completed cells ===\n")
    _print_outcome_matrix(records)
    _print_loss_table(losses)
    _print_failure_classification(losses)
    return 0
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    raise SystemExit(main(sys.argv))
|