"""guidance.py — Guides (per-turn advice) + Findings (graph-level claims).

NON-NEGOTIABLE #6: the advisor stays SILENT unless a NAMED, FIXABLE pattern fires.
Cost alone is never advice. A turn gets a Guide ONLY when:
  - a re-read pattern fired → kind 'reread'
  - a real retry loop fired → kind 'loop'
  - heavy AND over the absolute cost floor, with NO churn → kind 'big'
    ("expensive because big, not wasteful; checkpoint to cap, but don't
    restart and lose context")
Otherwise `guide` stays None. "Expensive but clean" is a valid, important output —
and a cheap turn (a query/reply, a failed round-trip) gets NO 'big' card at all:
the absolute-cost gate stops the relative top-N rank from faking expense (#6).

NON-NEGOTIABLE #4/#7: Findings separate PROVEN from HYPOTHESIS. A value-flow edge
(a distinctive value reappears verbatim) is `proven: true`. Temporal proximity is a
hypothesis (`proven: false`). The guide/finding text SUGGESTS, never asserts a fix.

Pure code, NO model. (The narrator turns these into prose later; it never invents
new findings.)
"""

from __future__ import annotations

from dataclasses import asdict
from typing import Any, Optional

from engine.core.best_practices import practice_for, practices_for
from engine.core.loops import TurnLoops
from engine.core.rereads import Reread

# Per-turn guide kinds that map to a fixable best practice. 'big' is EXCLUDED on
# purpose: it fires only on heavy-with-no-churn ("expensive because big, not
# wasteful"), so surfacing it as something to "do better" would manufacture advice
# on a clean turn (build rule #6).
_GUIDE_TO_SIGNAL = {"loop": "loop", "reread": "reread"}


# --------------------------------------------------------------------------- #
# Guides (attached to a turn ONLY when a pattern fires)
# --------------------------------------------------------------------------- #
def build_guide(
    turn,
    rereads: Optional[list[Reread]],
    loops: Optional[TurnLoops],
) -> Optional[dict[str, str]]:
    """Return a Guide dict for a turn, or None (silence) if no pattern fires.

    Priority when several could apply: a real loop (correctness) > re-read
    (wasted work) > big (informational). At most one guide per turn.

    Args:
        turn: the turn being advised; only ``heavy`` and the optional
            ``overBudget`` attribute are read here.
        rereads: re-read records for this turn, or None/empty when none fired.
            Only the first record's ``file`` and ``count`` are used.
        loops: loop record for this turn (its ``loops`` list is read), or None.

    Returns:
        ``{"kind", "head", "body"}`` with kind 'loop', 'reread' or 'big', or
        None when nothing fired.
    """
    # 1) real retry loop — the strongest, most actionable signal
    if loops and loops.loops:
        # headline the loop with the highest repeat count
        lp = max(loops.loops, key=lambda x: x.count)
        head = f"Retry loop: same command ran {lp.count}x, {lp.errored} errored"
        body = (
            "The exact same command was re-run after it errored. Worth checking "
            "whether the command needs a fix (quoting, path, missing arg) rather "
            "than another retry."
        )
        return {"kind": "loop", "head": head, "body": body}

    # 2) re-read — the same file opened >= 3x in one turn
    if rereads:
        rr = rereads[0]
        head = f"Re-read: {rr.file} opened {rr.count}x in this turn"
        body = (
            f"{rr.file} was read {rr.count} times here. If the content is stable, "
            "reading it once and keeping it in context would avoid the repeat token "
            "cost — worth a look."
        )
        return {"kind": "reread", "head": head, "body": body}

    # 3) heavy with no churn → 'big' (expensive because big, not wasteful).
    # Reaching this point already means "no churn": a loop or a re-read would
    # have returned above, so no separate churn check is needed.
    # Gate on the ABSOLUTE cost floor (overBudget), not merely the relative top-N
    # rank: in a tiny/cheap session every turn is "top-N", so rank alone would
    # paste "expensive because big" onto a turn that cost ~nothing — a single
    # query/reply, a failed auth round-trip. That is exactly the "cost alone is
    # never advice" rule (#6): calling a cheap turn expensive is simply false.
    if turn.heavy and getattr(turn, "overBudget", False):
        head = "Expensive because big, not wasteful"
        body = (
            "This is one of the heaviest turns by cache-read, but there's no loop or "
            "re-read churn driving it — the cost is the size of the work, not waste. "
            "If you want to cap spend, checkpoint here; don't restart the session and "
            "lose the accumulated context."
        )
        return {"kind": "big", "head": head, "body": body}

    return None


# --------------------------------------------------------------------------- #
# Findings (graph-level claims; proven vs hypothesis ALWAYS separated)
# --------------------------------------------------------------------------- #
def build_findings(
    turns,
    rereads_by_turn: dict[int, list[Reread]],
    loops_by_turn: dict[int, TurnLoops],
    heavy_indices: list[int],
    tool_clusters: Optional[list[dict[str, Any]]] = None,
) -> list[dict[str, Any]]:
    """Build Finding objects from the deterministic signal results.

    Finding = { id, kind, severity, nodes:[id], edges:[id], proven:bool, text }

    Every finding emitted here is an OBSERVED fact and therefore carries
    ``proven: True``: value-flow edges (a value reappeared verbatim), loop and
    re-read counts, the cache-read ranking, and tool-cluster call/error counts.
    No hypothesis-grade (``proven: False``) finding is produced by this
    function. The finding text only SUGGESTS a remedy.

    Args:
        turns: sequence of turns; ``tools`` is read on every turn, and
            ``i``/``tokens.cacheRead`` where a finding needs them.
        rereads_by_turn: re-read records keyed by turn id.
        loops_by_turn: loop records keyed by turn id.
        heavy_indices: turns ranked heavy by cache-read. Each entry is resolved
            by ``Turn.i`` first (the key used for the two dicts above) and
            falls back to list position when no turn carries that ``i``.
        tool_clusters: optional tool-cluster dicts; ``binary``, ``calls`` and
            ``errored`` are required, the other keys are optional.

    Returns:
        Findings in emission order, with ids ``F1``, ``F2``, ...
    """
    findings: list[dict[str, Any]] = []

    def _add(kind: str, severity: str, text: str, nodes=()) -> None:
        # Ids are sequential in emission order; every finding owns fresh lists.
        findings.append(
            {
                "id": f"F{len(findings) + 1}",
                "kind": kind,
                "severity": severity,
                "nodes": list(nodes),
                "edges": [],
                "proven": True,  # everything emitted here is observed/counted
                "text": text,
            }
        )

    # --- PROVEN: value-flow edges (a distinctive value reappeared verbatim) --- #
    for t in turns:
        for tc in t.tools:
            if tc.provenance == "indirect" and tc.flowValue:
                _add(
                    "value_flow",
                    "info",
                    f"Turn {t.i}: {tc.name} used '{tc.flowValue}', which first "
                    f"appeared in an earlier {tc.sourceTool} result — a proven "
                    f"value-flow (agent-driven, not from the human's prompt).",
                    nodes=[tc.id] if tc.id else [],
                )

    # --- NAMED patterns: loops, re-reads, heavy (counted, hence proven) ----- #
    for ti, tl in sorted(loops_by_turn.items()):
        for lp in tl.loops:
            # exact-cmd repeat + error is observed, not inferred
            _add(
                "loop",
                "warn",
                f"Turn {ti}: the same command ran {lp.count}x with "
                f"{lp.errored} errored — looks like a retry loop worth checking.",
            )

    for ti, rrs in sorted(rereads_by_turn.items()):
        for rr in rrs:
            # the >=3 reads are counted, not inferred
            _add(
                "reread",
                "warn",
                f"Turn {ti}: {rr.file} was read {rr.count}x — a re-read pattern; "
                "caching it in context could avoid the repeat cost.",
            )

    # `ti` is used below as a key into the *_by_turn dicts and as the turn label,
    # i.e. as a Turn.i value — so resolve the turn the same way instead of
    # trusting that list position always equals Turn.i.
    turn_by_i = {t.i: t for t in turns} if heavy_indices else {}
    for ti in heavy_indices:
        t = turn_by_i[ti] if ti in turn_by_i else turns[ti]
        churn = bool(rereads_by_turn.get(ti)) or bool(
            loops_by_turn.get(ti) and loops_by_turn[ti].loops
        )
        tail = (
            "; see the loop/re-read finding on this turn."
            if churn
            else "; no loop/re-read churn — expensive because big, not wasteful."
        )
        # cacheRead ranking is computed, not inferred
        _add(
            "heavy",
            "info",
            f"Turn {ti} is among the top-3 by cache-read "
            f"({t.tokens.cacheRead:,} tokens)" + tail,
        )

    # --- NAMED pattern: tool_cluster (CLI flailing, no skill) --------------- #
    # The call/error counts are OBSERVED (proven, like a loop); the fix is a cited
    # SUGGESTION (build rule #7). The Anthropic citation rides on the finding text.
    for c in tool_clusters or []:
        turns_str = ", ".join(f"turn {i}" for i in c.get("turns") or [])
        # omit the "across ..." clause entirely when no turns are recorded
        where = f"across {turns_str} " if turns_str else ""
        observed = (
            f"Ran `{c['binary']}` {c['calls']}x ({c['errored']} errored) "
            f"{where}with no skill loaded for it"
        )
        fix = c.get("fix")
        if fix:
            practice = c.get("practice") or "Use CLI tools / Create skills"
            source = c.get("source")
            # only append the source when there is one (no dangling dash)
            citation = f"{practice} — {source}" if source else practice
            text = f"{observed}. {fix} (Best practice: {citation})"
        else:
            # knowledge file absent — state the observation, suggest plainly, no citation
            text = (
                f"{observed} — worth giving the agent that context up front "
                f"(a project skill or a service CLI) so it doesn't rediscover it by trial."
            )
        _add("tool_cluster", "warn", text, nodes=c.get("toolIds") or [])

    return findings


# --------------------------------------------------------------------------- #
# Recommendations (session-level "what could have been better")
# --------------------------------------------------------------------------- #
def build_recommendations(
    turns,
    tool_clusters: Optional[list[dict[str, Any]]] = None,
    read_bursts: Optional[list[dict[str, Any]]] = None,
    unverified: Optional[dict[str, Any]] = None,
    near_repeats: Optional[list[dict[str, Any]]] = None,
    unloaded_mcp: Optional[dict[str, Any]] = None,
    npx_unpinned: Optional[dict[str, Any]] = None,
) -> list[dict[str, Any]]:
    """Abstract the fired, FIXABLE signals into a session-level list.

    One item per fired signal (per attribution), each naming the turn(s) it
    touched.

    Item = { kind, turns:[i], headline, advice, practice, source, attribution }.

    Pure code: it reads the guides already attached to turns + the deterministic
    signal results, and pulls the fix text from the knowledge files
    (best_practices). Two attributions surface side by side:
      * "Anthropic"             — the cited fix from ``practice_for``.
      * "Generally recommended" — the community practices from ``practices_for``.
    For every fired signal that ALSO has community practices, an extra rec is
    appended carrying the same turn ids, so e.g. a re-read shows the Anthropic
    rec AND the generally-recommended one.

    Which signals fire is NOT decided here — this only enriches the
    recommendation list. Empty list => silence is the honest result
    ("expensive but clean"). A knowledge-file lookup happens only for a signal
    that actually fired, and every rec owns its own ``turns`` list (the
    caller's lists are copied, never shared).

    Args:
        turns: turns whose optional ``guide`` dict (and ``i``) are read.
        tool_clusters: cluster dicts (``binary``, ``calls``, ``errored``
            required; ``turns``/``fix``/``practice``/``source`` optional).
        read_bursts: dicts with ``turn`` and ``files``.
        unverified: dict with ``edits`` and optional ``turns``.
        near_repeats: dicts with ``turn`` and ``count``.
        unloaded_mcp: dict with optional ``turns`` and ``probes``.
        npx_unpinned: dict with optional ``turns``.

    Returns:
        The recommendation items: cited recs first, community supplements last.
    """
    recs: list[dict[str, Any]] = []
    # (kind, turns) for every signal that actually fired — drives the community
    # supplements appended at the end (same kind, same turns, distinct attribution).
    fired: list[tuple[str, list[int]]] = []

    # per-turn guides (loop / reread) — 'big'/heavy intentionally excluded
    for t in turns:
        g = getattr(t, "guide", None)
        if not g:
            continue
        sk = _GUIDE_TO_SIGNAL.get(g.get("kind"))
        if not sk:
            continue
        bp = practice_for(sk)
        recs.append(
            {
                "kind": sk,
                "turns": [t.i],
                "headline": g.get("head", ""),
                # fall back to the guide's own body when no practice is on file
                "advice": (bp.get("fix") if bp else g.get("body", "")),
                "practice": bp.get("practice") if bp else None,
                "source": bp.get("source") if bp else None,
                "attribution": "Anthropic",
            }
        )
        fired.append((sk, [t.i]))

    # tool clusters (CLI flailing, no skill) — already carry the cited fix
    if tool_clusters:
        bp_tc = practice_for("tool_cluster")
        for c in tool_clusters:
            cluster_turns = list(c.get("turns") or [])
            recs.append(
                {
                    "kind": "tool_cluster",
                    "turns": list(cluster_turns),
                    "headline": (
                        f"Flailed on `{c['binary']}` — {c['calls']}x "
                        f"({c['errored']} errored), no skill loaded"
                    ),
                    "advice": c.get("fix") or (bp_tc.get("fix") if bp_tc else ""),
                    "practice": c.get("practice")
                    or (bp_tc.get("practice") if bp_tc else None),
                    "source": c.get("source")
                    or (bp_tc.get("source") if bp_tc else None),
                    "attribution": "Anthropic",
                }
            )
            fired.append(("tool_cluster", cluster_turns))

    # read-bursts (many files in one turn -> use a subagent)
    if read_bursts:
        bp_rb = practice_for("read_burst")
        for rb in read_bursts:
            recs.append(_named_rec(
                "read_burst",
                [rb["turn"]],
                f"Read {rb['files']} different files in one turn",
                bp_rb,
            ))
            fired.append(("read_burst", [rb["turn"]]))

    # unverified edits (session-level: edits, no test/build/lint anywhere)
    if unverified:
        bp_uv = practice_for("unverified_edit")
        uv_turns = list(unverified.get("turns") or [])
        recs.append(_named_rec(
            "unverified_edit",
            list(uv_turns),
            f"{unverified['edits']} edits, but no test/build/lint ran in the session",
            bp_uv,
        ))
        fired.append(("unverified_edit", uv_turns))

    # near-identical command repeats (circling -> redirect)
    if near_repeats:
        bp_nr = practice_for("near_repeat")
        for nr in near_repeats:
            recs.append(_named_rec(
                "near_repeat",
                [nr["turn"]],
                f"Re-ran a near-identical command {nr['count']}x in one turn",
                bp_nr,
            ))
            fired.append(("near_repeat", [nr["turn"]]))

    # unloaded MCP probed manually (can't load mid-session -> restart, don't probe)
    if unloaded_mcp:
        bp_um = practice_for("unloaded_mcp")
        um_turns = list(unloaded_mcp.get("turns") or [])
        recs.append(_named_rec(
            "unloaded_mcp",
            list(um_turns),
            "Probed an MCP server that isn't loaded in this session "
            f"({unloaded_mcp.get('probes', 0)} manual reach attempts) — it can't appear mid-session",
            bp_um,
        ))
        fired.append(("unloaded_mcp", um_turns))

    # unpinned package runners (npx pkg with no @version) — there is NO Anthropic
    # practice for this kind, so it surfaces only as a generally-recommended rec
    # below; we just record it as fired (turns named for the cards).
    if npx_unpinned:
        fired.append(("npx_unpinned", list(npx_unpinned.get("turns") or [])))

    # --- community supplements: for each fired kind that has 'Generally
    # recommended' practices, append one rec per practice with the SAME turns.
    # Pure enrichment — it never adds a new kind that didn't already fire. --- #
    for kind, turns_ in fired:
        for cp in _community_for(kind):
            recs.append(
                {
                    "kind": kind,
                    "turns": list(turns_),
                    "headline": cp.get("practice", ""),
                    "advice": cp.get("fix", ""),
                    "practice": cp.get("practice"),
                    "source": cp.get("source"),
                    "attribution": "Generally recommended",
                }
            )

    return recs


def _community_for(kind: str) -> list[dict[str, Any]]:
    """The 'Generally recommended' (non-Anthropic) practices for a fired kind.

    Filters ``best_practices.practices_for(kind)`` down to the entries whose
    ``attribution`` is exactly "Generally recommended"; a falsy result from
    ``practices_for`` yields an empty list.
    """
    return [
        p
        for p in (practices_for(kind) or ())
        if p.get("attribution") == "Generally recommended"
    ]


def _named_rec(
    kind: str, turns_: list[int], headline: str, bp: Optional[dict]
) -> dict[str, Any]:
    """Assemble a recommendation item from a cited best practice (or bare).

    With no practice on file (``bp`` falsy) the advice is an empty string and
    ``practice``/``source`` are None. The attribution is always "Anthropic".
    """
    return {
        "kind": kind,
        "turns": turns_,
        "headline": headline,
        "advice": bp.get("fix") if bp else "",
        "practice": bp.get("practice") if bp else None,
        "source": bp.get("source") if bp else None,
        "attribution": "Anthropic",
    }


# --------------------------------------------------------------------------- #
# orchestration helper: attach guides in place
# --------------------------------------------------------------------------- #
def attach_guides(
    turns,
    rereads_by_turn: dict[int, list[Reread]],
    loops_by_turn: dict[int, TurnLoops],
) -> None:
    """Set Turn.guide (or leave None) for every turn. In place.

    Each turn's re-read and loop records are looked up by ``Turn.i``; a turn
    absent from a dict is passed to ``build_guide`` as None for that signal.
    """
    for t in turns:
        t.guide = build_guide(
            t, rereads_by_turn.get(t.i), loops_by_turn.get(t.i)
        )