her / tools /phase2_gate.py
geekwrestler's picture
Squash history (purge pre-scrub demo session blobs)
5f43c7d
#!/usr/bin/env python3
"""Phase 2 gate — falsification test for the provenance engine.
Loads the fixture through the loader seam, runs the deterministic engine, and
asserts the verified Phase-2 oracle. NO model. Never edit the oracle to match the
engine — fix the engine. Run:
python3 tools/phase2_gate.py
Oracle (verified ground truth):
heavy set == {9, 10, 13}
heaviest turn == 9, with 71 tools
real retry loops == 0
re-read flag == turn 12, basename 'apply.js', count 4
indirect/total ratio in [0.78, 0.86] (~82% agent-driven)
Plus: print 5 indirect edges on turn 9, each with flowValue + sourceTool.
"""
from __future__ import annotations
import sys
from pathlib import Path
REPO = Path(__file__).resolve().parent.parent
if str(REPO) not in sys.path:
sys.path.insert(0, str(REPO))
from engine.core.analyze import analyze_path # noqa: E402
from engine.core.heavy import heaviest_turn # noqa: E402
from engine.core.loops import detect_all_loops # noqa: E402
from engine.core.rereads import detect_all_rereads # noqa: E402
# Session fixture that main() hands to analyze_path(); the gate fails early if it is missing.
FIXTURE = REPO / "fixtures" / "sample-session.jsonl"
# Inclusive lower/upper bounds main() accepts for the indirect/total ratio check.
RATIO_LO, RATIO_HI = 0.78, 0.86
def _line(label: str, got, want, ok: bool) -> bool:
flag = "OK " if ok else "DIFF"
print(f" [{flag}] {label:<26} got={got!r:>20} want={want!r}")
return ok
def main() -> int:
    """Run the Phase-2 gate against the fixture and print a PASS/FAIL report.

    Loads ``FIXTURE`` through ``analyze_path``, prints one row per oracle check
    via ``_line``, prints up to five distinct indirect edges of turn 9 as a
    spotcheck, and finishes with the overall verdict.

    Returns:
        Exit status: ``0`` when every check passed; ``1`` when the fixture is
        missing or any check failed.
    """
    import re  # function-scope import: only the turn-9 spotcheck needs it

    if not FIXTURE.exists():
        print(f"FAIL — fixture missing: {FIXTURE}")
        return 1

    result = analyze_path(str(FIXTURE))
    turns = result["turns"]
    session = result["session"]

    print("Her · हेर — Phase 2 gate (provenance engine)")
    print("=" * 64)
    print(f"fixture : {FIXTURE}")
    print(f"session : cwd={session.get('cwd')!r}")
    print("-" * 64)

    checks: list[bool] = []

    # 1) heavy set == {9, 10, 13}
    heavy_set = sorted(t.i for t in turns if t.heavy)
    checks.append(_line("heavy set", heavy_set, [9, 10, 13], heavy_set == [9, 10, 13]))

    # 2) heaviest == turn 9 with 71 tools
    h = heaviest_turn(turns)
    checks.append(_line("heaviest turn index", h.i, 9, h.i == 9))
    checks.append(_line("heaviest turn #tools", len(h.tools), 71, len(h.tools) == 71))

    # 3) real loops == 0 (exact-cmd + >=1 error)
    loops_by_turn = detect_all_loops(turns)
    real_loops = sum(len(tl.loops) for tl in loops_by_turn.values())
    checks.append(_line("real retry loops", real_loops, 0, real_loops == 0))

    # 4) re-read flag == turn 12 / apply.js / count 4
    rr_by_turn = detect_all_rereads(turns)
    rr_flag = sorted(
        (ti, rr.file, rr.count)
        for ti, rrs in rr_by_turn.items()
        for rr in rrs
    )
    want_rr = [(12, "apply.js", 4)]
    checks.append(_line("re-read flag", rr_flag, want_rr, rr_flag == want_rr))

    # 5) indirect/total ratio in [RATIO_LO, RATIO_HI]
    total_tools = sum(len(t.tools) for t in turns)
    total_indirect = sum(t.indirect for t in turns)
    # Guard the division so a fixture with no tool calls fails the range check
    # instead of raising ZeroDivisionError.
    ratio = total_indirect / total_tools if total_tools else 0.0
    ratio_ok = RATIO_LO <= ratio <= RATIO_HI
    checks.append(
        _line(
            "indirect/total ratio",
            f"{ratio:.4f} ({total_indirect}/{total_tools})",
            f"[{RATIO_LO}, {RATIO_HI}]",
            ratio_ok,
        )
    )

    # --- spotcheck: 5 indirect edges on turn 9, each with flowValue + sourceTool
    print("-" * 64)
    print("turn-9 indirect value-flow edges (spotcheck, 5 shown):")
    # Find the turn by its own index (the same `.i` that checks 1 and 2 rely on)
    # rather than by list position, so a short or differently-numbered turn list
    # produces a failed check instead of an IndexError or the wrong turn.
    t9 = next((t for t in turns if t.i == 9), None)
    if t9 is None:
        print(" (turn 9 not found in analyzed turns)")
        t9_indirect = []
    else:
        t9_indirect = [
            tc for tc in t9.tools if tc.provenance == "indirect" and tc.flowValue
        ]

    uuid_re = re.compile(
        r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.I
    )

    def _rank(tc) -> int:
        """Display priority of an edge: UUID (0) > URL (1) > filename (2) > other (3)."""
        v = tc.flowValue or ""
        if uuid_re.match(v):
            return 0
        if v.startswith("http"):
            return 1
        if re.search(r"\.[a-z]{2,4}$", v):
            return 2
        return 3

    # Dedup by (flowValue, sourceTool) so the edges shown are DISTINCT. The dict
    # keeps the first edge per key in rank order (sorted() is stable).
    distinct: dict = {}
    for tc in sorted(t9_indirect, key=_rank):
        distinct.setdefault((tc.flowValue, tc.sourceTool), tc)
    shown = list(distinct.values())[:5]

    cwd = session.get("cwd") or ""
    for tc in shown:
        value = tc.flowValue or ""
        cites_cwd = bool(cwd) and cwd in value
        distinctive = bool(uuid_re.match(value)) or ("/" in value) or len(value) >= 12
        print(
            f" {tc.name:<8} flowValue={tc.flowValue!r:<46} "
            f"sourceTool={tc.sourceTool!r:<10} distinctive={distinctive} cites_cwd={cites_cwd}"
        )

    # NOTE(review): this counts every indirect edge with a flowValue, not the
    # distinct ones displayed above — confirm that is the intended oracle.
    checks.append(
        _line("turn-9 indirect edges (>=5)", len(t9_indirect), ">=5", len(t9_indirect) >= 5)
    )

    print("-" * 64)
    ok = all(checks)
    print("GATE:", "PASS" if ok else "FAIL")
    return 0 if ok else 1
# Script entry point: run the gate and use its return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())