Spaces:

build-small-hackathon
/

her

Running on Zero

File size: 4,986 Bytes

5f43c7d

#!/usr/bin/env python3
"""Phase 2 gate — falsification test for the provenance engine.

Loads the fixture through the loader seam, runs the deterministic engine, and
asserts the verified Phase-2 oracle. NO model. Never edit the oracle to match the
engine — fix the engine. Run:

  python3 tools/phase2_gate.py

Oracle (verified ground truth):
  heavy set            == {9, 10, 13}
  heaviest turn        == 9, with 71 tools
  real retry loops     == 0
  re-read flag         == turn 12, basename 'apply.js', count 4
  indirect/total ratio in [0.78, 0.86]  (~82% agent-driven)
Plus: print 5 indirect edges on turn 9, each with flowValue + sourceTool.
"""
from __future__ import annotations

import sys
from pathlib import Path

# Repository root, taken as the directory two levels above this script, is put
# at the front of the import path (once) so the ``engine`` package imported
# below can be found when the script is run directly.
REPO = Path(__file__).resolve().parent.parent
if not any(entry == str(REPO) for entry in sys.path):
    sys.path.insert(0, str(REPO))

from engine.core.analyze import analyze_path  # noqa: E402
from engine.core.heavy import heaviest_turn  # noqa: E402
from engine.core.loops import detect_all_loops  # noqa: E402
from engine.core.rereads import detect_all_rereads  # noqa: E402

# Fixture file the gate analyzes, located under the repository root.
FIXTURE = REPO.joinpath("fixtures", "sample-session.jsonl")

# Inclusive lower/upper bounds accepted for the indirect/total tool ratio.
RATIO_LO = 0.78
RATIO_HI = 0.86


def _line(label: str, got, want, ok: bool) -> bool:
    """Print one aligned report row for a check and return ``ok`` unchanged.

    The row shows an ``OK``/``DIFF`` marker, the check label, the ``repr`` of
    the observed value and the ``repr`` of the expected value.
    """
    if ok:
        flag = "OK  "
    else:
        flag = "DIFF"
    print(f"  [{flag}] {label:<26} got={got!r:>20}  want={want!r}")
    return ok


def main() -> int:
    """Run the Phase 2 gate against the fixture and print a per-check report.

    Analyzes ``FIXTURE`` with ``analyze_path`` and compares the result with the
    oracle values: heavy set, heaviest turn (index and tool count), number of
    retry loops, the re-read flag, and the indirect/total tool ratio. It then
    prints up to five distinct indirect edges of turn 9 as a spotcheck and
    requires that turn 9 has at least five indirect edges carrying a flow value.

    Returns:
        0 when every check passes; 1 when any check differs or when the
        fixture file does not exist.
    """
    import re  # function-scope import: only the spotcheck ranking needs it

    if not FIXTURE.exists():
        print(f"FAIL — fixture missing: {FIXTURE}")
        return 1

    result = analyze_path(str(FIXTURE))
    turns = result["turns"]
    session = result["session"]

    print("Her · हेर — Phase 2 gate (provenance engine)")
    print("=" * 64)
    print(f"fixture : {FIXTURE}")
    print(f"session : cwd={session.get('cwd')!r}")
    print("-" * 64)

    checks: list[bool] = []

    # 1) heavy set == {9, 10, 13}
    heavy_set = sorted(t.i for t in turns if t.heavy)
    checks.append(_line("heavy set", heavy_set, [9, 10, 13], heavy_set == [9, 10, 13]))

    # 2) heaviest == turn 9 with 71 tools
    # An empty turn list (or a None result) is reported as DIFF rows rather
    # than crashing the gate with an AttributeError.
    h = heaviest_turn(turns) if turns else None
    h_index = None if h is None else h.i
    h_tool_count = 0 if h is None else len(h.tools)
    checks.append(_line("heaviest turn index", h_index, 9, h_index == 9))
    checks.append(_line("heaviest turn #tools", h_tool_count, 71, h_tool_count == 71))

    # 3) real loops == 0  (exact-cmd + >=1 error)
    loops_by_turn = detect_all_loops(turns)
    real_loops = sum(len(tl.loops) for tl in loops_by_turn.values())
    checks.append(_line("real retry loops", real_loops, 0, real_loops == 0))

    # 4) re-read flag == turn 12 / apply.js / count 4
    rr_by_turn = detect_all_rereads(turns)
    rr_flag = sorted(
        (ti, rr.file, rr.count)
        for ti, rrs in rr_by_turn.items()
        for rr in rrs
    )
    want_rr = [(12, "apply.js", 4)]
    checks.append(_line("re-read flag", rr_flag, want_rr, rr_flag == want_rr))

    # 5) indirect/total ratio in [0.78, 0.86]
    total_tools = sum(len(t.tools) for t in turns)
    total_indirect = sum(t.indirect for t in turns)
    ratio = total_indirect / total_tools if total_tools else 0.0
    ratio_ok = RATIO_LO <= ratio <= RATIO_HI
    checks.append(
        _line(
            "indirect/total ratio",
            f"{ratio:.4f} ({total_indirect}/{total_tools})",
            f"[{RATIO_LO}, {RATIO_HI}]",
            ratio_ok,
        )
    )

    # --- spotcheck: 5 indirect edges on turn 9, each with flowValue + sourceTool
    # Turn 9 is located by its ``i`` attribute, the same identity the checks
    # above use, instead of by list position. A missing turn 9 then becomes a
    # failed check rather than an IndexError.
    t9 = next((t for t in turns if t.i == 9), None)
    t9_tools = [] if t9 is None else t9.tools
    t9_indirect = [tc for tc in t9_tools if tc.provenance == "indirect" and tc.flowValue]

    uuid_re = re.compile(
        r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.I
    )
    filename_re = re.compile(r"\.[a-z]{2,4}$")

    def _rank(tc):
        """Display priority of an edge: UUID > URL > filename > other."""
        value = tc.flowValue or ""
        if uuid_re.match(value):
            return 0
        # Require a real scheme prefix so names such as "httpd.conf" are
        # ranked as filenames, not URLs.
        if value.startswith(("http://", "https://")):
            return 1
        if filename_re.search(value):
            return 2
        return 3

    # dedup by (flowValue, sourceTool) so only DISTINCT edges are shown;
    # sorted() is stable, so equally ranked edges keep their original order.
    seen: set = set()
    deduped = []
    for tc in sorted(t9_indirect, key=_rank):
        key = (tc.flowValue, tc.sourceTool)
        if key in seen:
            continue
        seen.add(key)
        deduped.append(tc)
    shown = deduped[:5]

    print("-" * 64)
    # Report how many edges are actually listed (5 whenever enough exist).
    print(f"turn-9 indirect value-flow edges (spotcheck, {len(shown)} shown):")
    if t9 is None:
        print("    (turn 9 not found in analyzed turns)")
    cwd = session.get("cwd") or ""
    for tc in shown:
        value = tc.flowValue  # non-empty: guaranteed by the t9_indirect filter
        cites_cwd = bool(cwd) and cwd in value
        distinctive = bool(uuid_re.match(value)) or ("/" in value) or len(value) >= 12
        print(
            f"    {tc.name:<8} flowValue={tc.flowValue!r:<46} "
            f"sourceTool={tc.sourceTool!r:<10} distinctive={distinctive} cites_cwd={cites_cwd}"
        )
    checks.append(
        _line("turn-9 indirect edges (>=5)", len(t9_indirect), ">=5", len(t9_indirect) >= 5)
    )

    print("-" * 64)
    ok = all(checks)
    print("GATE:", "PASS" if ok else "FAIL")
    return 0 if ok else 1


# Script entry point: the gate's return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())