#!/usr/bin/env python3
# Page-scrape header removed: "Spaces: Running on Zero", file size 4,986 bytes, 5f43c7d, line-number gutter 1-149.
"""Phase 2 gate — falsification test for the provenance engine.
Loads the fixture through the loader seam, runs the deterministic engine, and
asserts the verified Phase-2 oracle. NO model. Never edit the oracle to match the
engine — fix the engine. Run:

    python3 tools/phase2_gate.py

Oracle (verified ground truth):
    heavy set == {9, 10, 13}
    heaviest turn == 9, with 71 tools
    real retry loops == 0
    re-read flag == turn 12, basename 'apply.js', count 4
    indirect/total ratio in [0.78, 0.86] (~82% agent-driven)

Plus: print 5 indirect edges on turn 9, each with flowValue + sourceTool.
"""
from __future__ import annotations
import sys
from pathlib import Path
# Repository root: the directory two levels above this file. It is put at the
# front of sys.path (once) so the project-local imports below resolve when the
# script is run directly.
REPO = Path(__file__).resolve().parent.parent
if sys.path.count(str(REPO)) == 0:
    sys.path[:0] = [str(REPO)]
from engine.core.analyze import analyze_path # noqa: E402
from engine.core.heavy import heaviest_turn # noqa: E402
from engine.core.loops import detect_all_loops # noqa: E402
from engine.core.rereads import detect_all_rereads # noqa: E402
# Fixture file the gate analyzes, located under the repository root.
FIXTURE = REPO.joinpath("fixtures", "sample-session.jsonl")
# Inclusive lower/upper bounds accepted for the indirect/total tool ratio.
RATIO_LO = 0.78
RATIO_HI = 0.86
def _line(label: str, got, want, ok: bool) -> bool:
    """Print one aligned result row for a gate check and return ``ok``.

    Args:
        label: Short name of the check; left-aligned in a 26-character column.
        got: Observed value; printed via ``repr``, right-aligned to 20 columns.
        want: Expected value; printed via ``repr``.
        ok: Whether the check passed.

    Returns:
        ``ok`` unchanged, so the call can be appended directly to a results list.
    """
    flag = "OK" if ok else "DIFF"
    # Pad the status tag to a fixed width so the label/got/want columns line
    # up between passing and failing rows.
    print(f" [{flag:<4}] {label:<26} got={got!r:>20} want={want!r}")
    return ok
def main() -> int:
    """Run the Phase 2 gate against the fixture and print the verdict.

    Analyzes ``FIXTURE`` with ``analyze_path``, prints one row per oracle check
    (heavy set, heaviest turn, retry loops, re-read flag, indirect ratio), then
    a spotcheck of up to five distinct indirect edges on turn 9, and finally
    the overall ``GATE: PASS`` / ``GATE: FAIL`` line.

    Returns:
        0 when every check passes; 1 when the fixture file is missing or any
        check differs from the oracle.
    """
    import re as _re

    if not FIXTURE.exists():
        print(f"FAIL — fixture missing: {FIXTURE}")
        return 1

    result = analyze_path(str(FIXTURE))
    turns = result["turns"]
    session = result["session"]

    print("Her · हेर — Phase 2 gate (provenance engine)")
    print("=" * 64)
    print(f"fixture : {FIXTURE}")
    print(f"session : cwd={session.get('cwd')!r}")
    print("-" * 64)

    checks: list[bool] = []

    # 1) heavy set == {9, 10, 13}
    heavy_set = sorted(t.i for t in turns if t.heavy)
    checks.append(_line("heavy set", heavy_set, [9, 10, 13], heavy_set == [9, 10, 13]))

    # 2) heaviest == turn 9 with 71 tools
    # NOTE(review): assumes heaviest_turn() always returns a turn object for
    # this fixture — confirm its behavior on an empty turn list.
    h = heaviest_turn(turns)
    checks.append(_line("heaviest turn index", h.i, 9, h.i == 9))
    checks.append(_line("heaviest turn #tools", len(h.tools), 71, len(h.tools) == 71))

    # 3) real loops == 0 (exact-cmd + >=1 error)
    loops_by_turn = detect_all_loops(turns)
    real_loops = sum(len(tl.loops) for tl in loops_by_turn.values())
    checks.append(_line("real retry loops", real_loops, 0, real_loops == 0))

    # 4) re-read flag == turn 12 / apply.js / count 4
    rr_by_turn = detect_all_rereads(turns)
    rr_flag = sorted(
        (ti, rr.file, rr.count)
        for ti, rrs in rr_by_turn.items()
        for rr in rrs
    )
    want_rr = [(12, "apply.js", 4)]
    checks.append(_line("re-read flag", rr_flag, want_rr, rr_flag == want_rr))

    # 5) indirect/total ratio in [0.78, 0.86]
    total_tools = sum(len(t.tools) for t in turns)
    total_indirect = sum(t.indirect for t in turns)
    # Guard the division so a session with no tool calls yields 0.0.
    ratio = total_indirect / total_tools if total_tools else 0.0
    ratio_ok = RATIO_LO <= ratio <= RATIO_HI
    checks.append(
        _line(
            "indirect/total ratio",
            f"{ratio:.4f} ({total_indirect}/{total_tools})",
            f"[{RATIO_LO}, {RATIO_HI}]",
            ratio_ok,
        )
    )

    # --- spotcheck: 5 indirect edges on turn 9, each with flowValue + sourceTool
    print("-" * 64)
    print("turn-9 indirect value-flow edges (spotcheck, 5 shown):")
    # Locate turn 9 by its ``i`` attribute, the same identity checks 1 and 2
    # use, rather than by list position. A session without that turn then
    # fails the check below instead of raising IndexError.
    t9 = next((t for t in turns if t.i == 9), None)
    t9_tools = t9.tools if t9 is not None else []
    t9_indirect = [
        tc for tc in t9_tools if tc.provenance == "indirect" and tc.flowValue
    ]

    # prefer the most distinctive evidence for display: UUID > URL > filename > other
    _UUID = _re.compile(
        r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", _re.I
    )

    def _rank(tc):
        """Return the display priority of a tool call's flowValue (lower first)."""
        v = tc.flowValue or ""
        if _UUID.match(v):
            return 0
        if v.startswith("http"):
            return 1
        if _re.search(r"\.[a-z]{2,4}$", v):
            return 2
        return 3

    # dedup by (flowValue, sourceTool) so 5 DISTINCT edges are shown; sorted()
    # is stable, so edges of equal rank keep their original order.
    seen: set = set()
    deduped = []
    for tc in sorted(t9_indirect, key=_rank):
        key = (tc.flowValue, tc.sourceTool)
        if key in seen:
            continue
        seen.add(key)
        deduped.append(tc)
    shown = deduped[:5]

    cwd = session.get("cwd") or ""
    for tc in shown:
        value = tc.flowValue or ""
        cites_cwd = bool(cwd) and cwd in value
        distinctive = bool(_UUID.match(value)) or ("/" in value) or len(value) >= 12
        print(
            f" {tc.name:<8} flowValue={tc.flowValue!r:<46} "
            f"sourceTool={tc.sourceTool!r:<10} distinctive={distinctive} cites_cwd={cites_cwd}"
        )

    # The pass condition counts all indirect edges on the turn, before dedup.
    checks.append(
        _line("turn-9 indirect edges (>=5)", len(t9_indirect), ">=5", len(t9_indirect) >= 5)
    )

    print("-" * 64)
    ok = all(checks)
    print("GATE:", "PASS" if ok else "FAIL")
    return 0 if ok else 1
# Script entry point: the process exit status is whatever main() returns.
if __name__ == "__main__":
    raise SystemExit(main())
|