Spaces:
Running on Zero
Running on Zero
#!/usr/bin/env python3
"""Phase 2 gate — falsification test for the provenance engine.

Loads the fixture through the loader seam, runs the deterministic engine, and
asserts the verified Phase-2 oracle. NO model. Never edit the oracle to match the
engine — fix the engine. Run:

    python3 tools/phase2_gate.py

Oracle (verified ground truth):
    heavy set            == {9, 10, 13}
    heaviest turn        == 9, with 71 tools
    real retry loops     == 0
    re-read flag         == turn 12, basename 'apply.js', count 4
    indirect/total ratio in [0.78, 0.86] (~82% agent-driven)

Plus: print 5 indirect edges on turn 9, each with flowValue + sourceTool.
"""
| from __future__ import annotations | |
| import sys | |
| from pathlib import Path | |
# Repository root: the directory one level above the folder holding this script.
REPO = Path(__file__).resolve().parent.parent
# Put the repo root at the front of the module search path, but only if it is
# not already listed, so the project imports below resolve when run as a script.
if sys.path.count(str(REPO)) == 0:
    sys.path.insert(0, str(REPO))
| from engine.core.analyze import analyze_path # noqa: E402 | |
| from engine.core.heavy import heaviest_turn # noqa: E402 | |
| from engine.core.loops import detect_all_loops # noqa: E402 | |
| from engine.core.rereads import detect_all_rereads # noqa: E402 | |
# Session fixture analysed by the gate.
FIXTURE = REPO.joinpath("fixtures", "sample-session.jsonl")
# Inclusive lower/upper bounds accepted for the indirect/total tool-call ratio.
RATIO_LO = 0.78
RATIO_HI = 0.86
def _line(label: str, got, want, ok: bool) -> bool:
    """Print one aligned gate-check row and hand back its verdict.

    Args:
        label: Short name of the check; left-aligned in 26 columns.
        got: Observed value; printed via ``repr`` right-aligned in 20 columns.
        want: Expected value; printed via ``repr``.
        ok: Whether the check passed.

    Returns:
        ``ok`` unchanged, so the call can be appended directly to a results list.
    """
    if ok:
        status = "OK "
    else:
        status = "DIFF"
    row = f" [{status}] {label:<26} got={got!r:>20} want={want!r}"
    print(row)
    return ok
def main() -> int:
    """Run the Phase-2 gate against the fixture and print a PASS/FAIL report.

    Analyses ``FIXTURE`` with ``analyze_path``, prints one row per oracle check
    (heavy set, heaviest turn, retry loops, re-read flag, indirect/total ratio),
    prints up to five distinct indirect edges of turn 9 as a spotcheck, and
    ends with a ``GATE:`` verdict line.

    Returns:
        0 when every check passed; 1 otherwise, including when the fixture
        file does not exist.
    """
    import re as _re  # only the turn-9 spotcheck needs regexes

    if not FIXTURE.exists():
        print(f"FAIL — fixture missing: {FIXTURE}")
        return 1

    result = analyze_path(str(FIXTURE))
    turns = result["turns"]
    session = result["session"]

    print("Her · हेर — Phase 2 gate (provenance engine)")
    print("=" * 64)
    print(f"fixture : {FIXTURE}")
    print(f"session : cwd={session.get('cwd')!r}")
    print("-" * 64)

    checks: list[bool] = []

    # 1) heavy set == {9, 10, 13}
    heavy_set = sorted(t.i for t in turns if t.heavy)
    checks.append(_line("heavy set", heavy_set, [9, 10, 13], heavy_set == [9, 10, 13]))

    # 2) heaviest == turn 9 with 71 tools
    h = heaviest_turn(turns)
    checks.append(_line("heaviest turn index", h.i, 9, h.i == 9))
    checks.append(_line("heaviest turn #tools", len(h.tools), 71, len(h.tools) == 71))

    # 3) real loops == 0 (exact-cmd + >=1 error)
    loops_by_turn = detect_all_loops(turns)
    real_loops = sum(len(tl.loops) for tl in loops_by_turn.values())
    checks.append(_line("real retry loops", real_loops, 0, real_loops == 0))

    # 4) re-read flag == turn 12 / apply.js / count 4
    rr_by_turn = detect_all_rereads(turns)
    rr_flag = sorted(
        (ti, rr.file, rr.count)
        for ti, rrs in rr_by_turn.items()
        for rr in rrs
    )
    want_rr = [(12, "apply.js", 4)]
    checks.append(_line("re-read flag", rr_flag, want_rr, rr_flag == want_rr))

    # 5) indirect/total ratio in [0.78, 0.86]
    total_tools = sum(len(t.tools) for t in turns)
    total_indirect = sum(t.indirect for t in turns)
    ratio = total_indirect / total_tools if total_tools else 0.0
    ratio_ok = RATIO_LO <= ratio <= RATIO_HI
    checks.append(
        _line(
            "indirect/total ratio",
            f"{ratio:.4f} ({total_indirect}/{total_tools})",
            f"[{RATIO_LO}, {RATIO_HI}]",
            ratio_ok,
        )
    )

    # --- spotcheck: 5 indirect edges on turn 9, each with flowValue + sourceTool
    print("-" * 64)
    print("turn-9 indirect value-flow edges (spotcheck, 5 shown):")
    # Select the turn by its own index, like every check above, rather than by
    # list position: a short session then fails the gate instead of raising
    # IndexError, and the right turn is used even if indices and positions differ.
    t9 = next((t for t in turns if t.i == 9), None)
    if t9 is None:
        print(" (no turn with index 9 in this session)")
        t9_indirect = []
    else:
        t9_indirect = [
            tc for tc in t9.tools if tc.provenance == "indirect" and tc.flowValue
        ]

    # prefer the most distinctive evidence for display: UUID > URL > filename > other
    _UUID = _re.compile(
        r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", _re.I
    )

    def _rank(tc):
        """Sort key for display order: lower means more distinctive."""
        v = tc.flowValue or ""
        if _UUID.match(v):
            return 0
        if v.startswith("http"):
            return 1
        if _re.search(r"\.[a-z]{2,4}$", v):
            return 2
        return 3

    # dedup by (flowValue, sourceTool) so 5 DISTINCT edges are shown
    seen: set = set()
    deduped = []
    for tc in sorted(t9_indirect, key=_rank):
        key = (tc.flowValue, tc.sourceTool)
        if key in seen:
            continue
        seen.add(key)
        deduped.append(tc)

    cwd = session.get("cwd") or ""
    for tc in deduped[:5]:
        value = tc.flowValue or ""
        cites_cwd = bool(cwd) and cwd in value
        distinctive = bool(_UUID.match(value)) or ("/" in value) or len(value) >= 12
        print(
            f" {tc.name:<8} flowValue={tc.flowValue!r:<46} "
            f"sourceTool={tc.sourceTool!r:<10} distinctive={distinctive} cites_cwd={cites_cwd}"
        )

    # NOTE(review): this counts every indirect edge carrying a flowValue, not
    # the deduplicated list printed above; kept as-is so the oracle is unchanged.
    checks.append(
        _line("turn-9 indirect edges (>=5)", len(t9_indirect), ">=5", len(t9_indirect) >= 5)
    )

    print("-" * 64)
    ok = all(checks)
    print("GATE:", "PASS" if ok else "FAIL")
    return 0 if ok else 1
# Script entry point: the gate's return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())