Spaces:

build-small-hackathon
/

her

Running on Zero

App Files Files Community

her / tools /phase2_gate.py

geekwrestler

Squash history (purge pre-scrub demo session blobs)

5f43c7d 4 days ago

raw

history blame contribute delete

4.99 kB

	#!/usr/bin/env python3
	"""Phase 2 gate — falsification test for the provenance engine.

	Loads the fixture through the loader seam, runs the deterministic engine, and
	asserts the verified Phase-2 oracle. NO model. Never edit the oracle to match the
	engine — fix the engine. Run:

	python3 tools/phase2_gate.py

	Oracle (verified ground truth):
	heavy set == {9, 10, 13}
	heaviest turn == 9, with 71 tools
	real retry loops == 0
	re-read flag == turn 12, basename 'apply.js', count 4
	indirect/total ratio in [0.78, 0.86] (~82% agent-driven)
	Plus: print 5 indirect edges on turn 9, each with flowValue + sourceTool.
	"""
	from __future__ import annotations

	import sys
	from pathlib import Path

	# Repository root: this script lives in <repo>/tools/, so the root is two
	# levels above the resolved file path.
	REPO = Path(__file__).resolve().parent.parent
	# Put the root on sys.path (once) so the `engine.*` imports below resolve
	# when the script is run directly as `python3 tools/phase2_gate.py`.
	if str(REPO) not in sys.path:
	    sys.path.insert(0, str(REPO))

	from engine.core.analyze import analyze_path # noqa: E402
	from engine.core.heavy import heaviest_turn # noqa: E402
	from engine.core.loops import detect_all_loops # noqa: E402
	from engine.core.rereads import detect_all_rereads # noqa: E402

	# Session fixture the gate analyses, located under <repo>/fixtures/.
	FIXTURE = REPO.joinpath("fixtures", "sample-session.jsonl")

	# Inclusive bounds the indirect/total tool-call ratio must fall within.
	RATIO_LO = 0.78
	RATIO_HI = 0.86


	def _line(label: str, got, want, ok: bool) -> bool:
	    """Print one result row for a gate check and pass ``ok`` straight through.

	    Args:
	        label: Short name of the check; left-aligned in a 26-character column.
	        got: Value the engine produced; printed via ``repr``, right-aligned to
	            20 characters.
	        want: Expected (oracle) value; printed via ``repr``.
	        ok: Whether the check passed.

	    Returns:
	        ``ok`` unchanged, so the call can be appended directly to a list of
	        check results.
	    """
	    # Pad the status to the width of "DIFF" so passing and failing rows keep
	    # their label/got/want columns aligned.
	    flag = "OK" if ok else "DIFF"
	    print(f" [{flag:<4}] {label:<26} got={got!r:>20} want={want!r}")
	    return ok


	def main() -> int:
	    """Run every Phase-2 oracle check against the fixture and print a report.

	    Analyses ``FIXTURE`` with ``analyze_path``, compares the result with the
	    hard-coded oracle values (heavy set, heaviest turn, retry loops, re-read
	    flag, indirect/total ratio), prints up to five distinct indirect edges of
	    ``turns[9]`` as a spot check, and finishes with a ``GATE: PASS`` or
	    ``GATE: FAIL`` line.

	    Returns:
	        0 when every check passed; 1 when any check failed or the fixture
	        file does not exist.
	    """
	    import re as _re

	    if not FIXTURE.exists():
	        print(f"FAIL — fixture missing: {FIXTURE}")
	        return 1

	    result = analyze_path(str(FIXTURE))
	    turns = result["turns"]
	    session = result["session"]

	    print("Her · हेर — Phase 2 gate (provenance engine)")
	    print("=" * 64)
	    print(f"fixture : {FIXTURE}")
	    print(f"session : cwd={session.get('cwd')!r}")
	    print("-" * 64)

	    checks: list[bool] = []

	    # 1) heavy set == {9, 10, 13}
	    heavy_set = sorted(t.i for t in turns if t.heavy)
	    checks.append(_line("heavy set", heavy_set, [9, 10, 13], heavy_set == [9, 10, 13]))

	    # 2) heaviest == turn 9 with 71 tools
	    h = heaviest_turn(turns)
	    checks.append(_line("heaviest turn index", h.i, 9, h.i == 9))
	    checks.append(_line("heaviest turn #tools", len(h.tools), 71, len(h.tools) == 71))

	    # 3) real loops == 0 (exact-cmd + >=1 error)
	    loops_by_turn = detect_all_loops(turns)
	    real_loops = sum(len(tl.loops) for tl in loops_by_turn.values())
	    checks.append(_line("real retry loops", real_loops, 0, real_loops == 0))

	    # 4) re-read flag == turn 12 / apply.js / count 4
	    rr_by_turn = detect_all_rereads(turns)
	    rr_flag = sorted(
	        (ti, rr.file, rr.count)
	        for ti, rrs in rr_by_turn.items()
	        for rr in rrs
	    )
	    want_rr = [(12, "apply.js", 4)]
	    checks.append(_line("re-read flag", rr_flag, want_rr, rr_flag == want_rr))

	    # 5) indirect/total ratio in [0.78, 0.86]
	    total_tools = sum(len(t.tools) for t in turns)
	    total_indirect = sum(t.indirect for t in turns)
	    # Guard the division so a fixture with no tool calls reports 0.0 (a DIFF)
	    # instead of raising ZeroDivisionError.
	    ratio = total_indirect / total_tools if total_tools else 0.0
	    ratio_ok = RATIO_LO <= ratio <= RATIO_HI
	    checks.append(
	        _line(
	            "indirect/total ratio",
	            f"{ratio:.4f} ({total_indirect}/{total_tools})",
	            f"[{RATIO_LO}, {RATIO_HI}]",
	            ratio_ok,
	        )
	    )

	    # --- spotcheck: 5 indirect edges on turn 9, each with flowValue + sourceTool
	    print("-" * 64)
	    print("turn-9 indirect value-flow edges (spotcheck, 5 shown):")
	    # A fixture with fewer than 10 turns must produce a DIFF row and
	    # "GATE: FAIL", not an IndexError traceback, so treat a missing turn as
	    # having no edges.
	    try:
	        t9 = turns[9]
	    except IndexError:
	        t9 = None
	        print(" (turn 9 not present in fixture)")
	    t9_indirect = (
	        []
	        if t9 is None
	        else [tc for tc in t9.tools if tc.provenance == "indirect" and tc.flowValue]
	    )

	    # prefer the most distinctive evidence for display: UUID > URL > filename > other
	    _UUID = _re.compile(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", _re.I)

	    def _rank(tc):
	        """Return the display priority of an edge; lower values sort first."""
	        v = tc.flowValue or ""
	        if _UUID.match(v):
	            return 0
	        if v.startswith("http"):
	            return 1
	        if _re.search(r"\.[a-z]{2,4}$", v):
	            return 2
	        return 3

	    # dedup by (flowValue, sourceTool) so 5 DISTINCT edges are shown; sorted()
	    # is stable, so edges of equal rank keep their original order.
	    seen: set = set()
	    deduped = []
	    for tc in sorted(t9_indirect, key=_rank):
	        key = (tc.flowValue, tc.sourceTool)
	        if key in seen:
	            continue
	        seen.add(key)
	        deduped.append(tc)
	    shown = deduped[:5]
	    cwd = session.get("cwd") or ""
	    for tc in shown:
	        value = tc.flowValue or ""
	        cites_cwd = bool(cwd) and cwd in value
	        # "distinctive" is display-only: a UUID, anything containing a slash,
	        # or any value at least 12 characters long.
	        distinctive = bool(_UUID.match(value)) or ("/" in value) or len(value) >= 12
	        print(
	            f" {tc.name:<8} flowValue={tc.flowValue!r:<46} "
	            f"sourceTool={tc.sourceTool!r:<10} distinctive={distinctive} cites_cwd={cites_cwd}"
	        )
	    # NOTE(review): this counts every indirect edge with a flowValue, not the
	    # deduplicated ones printed above — confirm that is the intended oracle.
	    checks.append(
	        _line("turn-9 indirect edges (>=5)", len(t9_indirect), ">=5", len(t9_indirect) >= 5)
	    )

	    print("-" * 64)
	    ok = all(checks)
	    print("GATE:", "PASS" if ok else "FAIL")
	    return 0 if ok else 1


	# Script entry point: run the gate and use its result as the process exit
	# status (0 = PASS, 1 = FAIL).
	if __name__ == "__main__":
	    sys.exit(main())