File size: 4,873 Bytes
3f122f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python3
"""Phase 4 collection triage: scan completed JSONLs and produce a
per-(model × pack × level) outcome table + per-loss command histogram
so failures can be classified into engine / scenario / model.

Usage:
    python3 scripts/triage_phase4.py [<root>] ...

If no roots, defaults to data/runs/paper-v1*.
"""
from __future__ import annotations

import argparse
import json
import sys
from collections import Counter, defaultdict
from pathlib import Path


def _command_kind(cmd) -> str:
    if isinstance(cmd, dict):
        return cmd.get("kind") or "?"
    if isinstance(cmd, str) and "::" in cmd:
        return cmd.split("::", 1)[1].split(" ", 1)[0]
    return "?"


def _parse_record(line: str) -> dict | None:
    """Decode one JSONL line; return None unless it holds a JSON object."""
    try:
        record = json.loads(line)
    except json.JSONDecodeError:
        return None
    return record if isinstance(record, dict) else None


def triage_cell(jsonl: Path) -> dict | None:
    """Summarise one completed cell from its JSONL log.

    Args:
        jsonl: Path to a log whose stem follows
            ``<pack>__<level>__seed<N>__<fog>`` and whose parent directory
            is named ``<ts>__<model_safe>``.

    Returns:
        A dict with keys ``model``, ``pack``, ``level``, ``seed``,
        ``outcome``, ``n_turns`` and ``cmd_kinds`` (a Counter of command
        kinds over every record's ``commands_issued``), or None when the
        file cannot be read, its last non-blank line is not a JSON object
        carrying ``terminal.outcome``, or its name does not follow the
        stem convention above.
    """
    try:
        text = jsonl.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return None
    # Blank lines carry no record; dropping them keeps a trailing empty line
    # from hiding the terminal record and from inflating the turn fallback.
    lines = [ln for ln in text.splitlines() if ln.strip()]
    if not lines:
        return None

    records = [_parse_record(ln) for ln in lines]
    last = records[-1]
    if last is None:
        return None
    term = last.get("terminal")
    if not isinstance(term, dict) or "outcome" not in term:
        return None

    # JSONL stem: <pack>__<level>__seed<N>__<fog>.  The caller globs every
    # *.jsonl under a run root, so a file that does not follow the
    # convention is skipped instead of aborting the whole report.
    parts = jsonl.stem.split("__")
    if len(parts) < 4:
        return None
    pack, level, seed_part = parts[:3]
    try:
        seed = int(seed_part.replace("seed", ""))
    except ValueError:
        return None

    # Parent dir: <ts>__<model_safe>; fall back to the whole name.
    parent_name = jsonl.parent.name
    _, sep, model_safe = parent_name.partition("__")
    model = model_safe if sep else parent_name

    cmd_kinds: Counter = Counter()
    for record in records:
        if record is None:
            continue
        commands = record.get("commands_issued")
        if not isinstance(commands, list):
            continue
        cmd_kinds.update(_command_kind(c) for c in commands)

    # Prefer the recorded turn count; otherwise use the number of log lines.
    try:
        n_turns = int(term.get("turns") or len(lines))
    except (TypeError, ValueError):
        n_turns = len(lines)

    return {
        "model": model,
        "pack": pack,
        "level": level,
        "seed": seed,
        "outcome": term["outcome"],
        "n_turns": n_turns,
        "cmd_kinds": cmd_kinds,
    }


def main(argv: list[str]) -> int:
    """Scan run directories and print the Phase 4 triage report.

    Args:
        argv: Full argument vector with the program name at index 0; the
            remaining entries are run directories.  With no directories
            given, every ``data/runs/paper-v1*`` entry under the repository
            root (the parent of this script's directory) is scanned.

    Returns:
        0 after printing the report, or 1 when no completed cell was found.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("roots", nargs="*", default=[],
                    help="Run dirs (default: data/runs/paper-v1*)")
    args = ap.parse_args(argv[1:])
    repo = Path(__file__).resolve().parent.parent
    if args.roots:
        roots = [Path(r) for r in args.roots]
    else:
        roots = sorted((repo / "data" / "runs").glob("paper-v1*"))

    records = []
    for root in roots:
        if not root.is_dir():
            continue
        # Directory traversal order is filesystem-dependent; sorting keeps
        # the loss listing reproducible, matching the sorted default roots.
        for jsonl in sorted(root.rglob("*.jsonl")):
            r = triage_cell(jsonl)
            if r is not None:
                records.append(r)
    if not records:
        print("(no completed cells found)")
        return 1

    # Outcome matrix: model × (pack, level) → Counter of outcomes
    matrix: dict = defaultdict(lambda: defaultdict(Counter))
    losses = []
    for r in records:
        matrix[r["model"]][(r["pack"], r["level"])][r["outcome"]] += 1
        if r["outcome"] != "win":
            losses.append(r)

    print(f"=== Phase 4 triage on {len(records)} completed cells ===\n")
    print(f"Models in scope: {sorted(matrix.keys())}\n")
    for model, packs in sorted(matrix.items()):
        print(f"--- {model} ---")
        print(f"{'pack':30s} {'level':6s} {'W':>2s} {'L':>2s} {'D':>2s}")
        for (pack, level), counts in sorted(packs.items()):
            print(f"  {pack[:28]:28s} {level:6s} "
                  f"{counts.get('win',0):>2d} {counts.get('loss',0):>2d} {counts.get('draw',0):>2d}")
        print()

    print(f"=== {len(losses)} loss/draw cells with command histogram ===")
    print(f"{'model':30s} {'pack':30s} {'lvl':6s} {'sd':>2s} {'out':5s} {'turns':>5s}  top cmds")
    for r in losses:
        top = " ".join(f"{k}:{n}" for k, n in r["cmd_kinds"].most_common(3))
        print(f"  {r['model'][:30]:30s} {r['pack'][:28]:28s} {r['level']:6s} "
              f"{r['seed']:>2d} {r['outcome']:5s} {r['n_turns']:>5d}  {top}")

    # Classification per loss.  F1 and F2 cannot overlap: F1 allows only the
    # passive verbs, F2 requires FireSuperweapon.
    print("\n=== Failure classification ===")
    passive_verbs = {"MoveUnits", "Observe", "Stop"}
    lost = [r for r in losses if r["outcome"] == "loss"]
    f1 = sum(1 for r in lost if set(r["cmd_kinds"]) <= passive_verbs)
    f2 = sum(1 for r in lost
             if "FireSuperweapon" in r["cmd_kinds"]
             and r["pack"] == "spec-nuke-strike")
    other = len(lost) - f1 - f2
    draws = sum(1 for r in losses if r["outcome"] == "draw")
    print(f"  F1 (passive/walk-only, no special verb): {f1} losses")
    print(f"  F2 (superweapon mis-aim):                {f2} losses")
    print(f"  Other (verb invoked, still lost):        {other} losses")
    print(f"  Draws (need replay):                     "
          f"{draws}")
    # Non-win cells whose outcome is neither "loss" nor "draw" are listed
    # above but fall in none of the buckets; report them so the summary
    # accounts for every listed cell.
    unclassified = Counter(r["outcome"] for r in losses
                           if r["outcome"] not in ("loss", "draw"))
    if unclassified:
        detail = " ".join(f"{k}:{n}" for k, n in unclassified.most_common())
        print(f"  {'Other outcomes (unclassified):':40s} {detail}")
    return 0


# Script entry point: when executed directly (not imported), run main with
# the full argument vector and use its return value as the exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))