"""clusters.py — tool_cluster signal: flailing on an external CLI with no skill. NON-NEGOTIABLE #1/#6: pure code, NO model; the advisor stays SILENT unless a NAMED, FIXABLE pattern fires. This detects exactly one: A run of Bash calls that share an external binary (the command's first real token, e.g. `railway`, `gh`, `docker`) where the agent FLAILED — it errored repeatedly — and NO skill was loaded in the session to give it that context. That maps to Anthropic's "Use CLI tools / Create skills" guidance: a skill or a service CLI would have handed the agent the context it instead burned tokens rediscovering. The fix text is attached from the cited knowledge file (best_practices.practice_for) — this module never invents advice. NOT a domain dictionary. The build owner rejected a service→skill mapping (the maintenance treadmill). What we keep instead is the inverse: `_UBIQUITOUS`, a small, stable, STRUCTURAL denylist of commands the agent universally knows — shell builtins, coreutils, core dev runtimes/VCS — for which a skill would never help. Adding a new service (vercel, supabase, fly…) needs ZERO maintenance here; it simply isn't on the denylist, so it can fire. The error signal is the proof, not a curated allow-list. PER-BINARY skill coverage: a cluster is suppressed only when a skill whose name matches THAT binary was loaded (e.g. a `railway` skill mutes the railway cluster). A skill used elsewhere in the session no longer mutes an unrelated CLI that flailed — that session-wide suppression was hiding real findings (e.g. railway 17x/2-errored in a session that happened to use some other skill). """ from __future__ import annotations import re from typing import Any from engine.core.best_practices import practice_for # Tunable in one place. Conservative on purpose: a teaching tool shipped to # beginners should under-fire rather than nag. Railway-style flailing (many calls, # several errors) clears these comfortably; a one-off 3-call hiccup does not. 
MIN_CALLS = 4
MIN_ERRORS = 2

# One command line is cut into segments wherever one of these appears: && || | ;
_SEG_RE = re.compile(r"&&|\|\|?|;")

# A leading `VAR=value` environment assignment; such tokens are stepped over
# while looking for the binary.
_ASSIGN_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*=")

# What a genuine command name looks like. Flags such as `-c`, redirects
# (`<` / `>`), `$(...)` and similar never match.
_NAME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9_.+-]*$")

# Wrapper words that sit in front of the real binary.
_WRAPPERS = {
    "builtin",
    "command",
    "do",
    "env",
    "exec",
    "nohup",
    "sudo",
    "then",
    "time",
    "xargs",
}

# Segment heads that mean "keep looking in the next segment": navigation and
# no-op commands first, then shell control keywords (never an external binary).
_NAV_HEADS = {
    "",
    ".",
    "cd",
    "export",
    "popd",
    "pushd",
    "set",
    "source",
    "unset",
} | {
    "break",
    "case",
    "continue",
    "done",
    "elif",
    "else",
    "esac",
    "fi",
    "for",
    "function",
    "if",
    "in",
    "return",
    "select",
    "until",
    "while",
}

# Commands every agent already knows, so a skill would never help with them.
# STRUCTURAL, not a domain map: shell builtins + coreutils + core dev
# runtimes/VCS + generic net fetchers. Stable for years; new SERVICES never
# need adding here.
_UBIQUITOUS = {
    # shell builtins / control
    "echo", "printf", "true", "false", "test", "read", "wait", "trap", "eval",
    "alias", "type", "which", "kill", "jobs", "bg", "fg", "umask",
    # coreutils / text
    # NOTE(review): "t3" is not a standard coreutil — confirm it is intentional.
    "cat", "ls", "cp", "mv", "rm", "mkdir", "rmdir", "touch", "ln", "chmod",
    "chown", "pwd", "grep", "egrep", "fgrep", "sed", "awk", "cut", "tr", "sort",
    "uniq", "head", "tail", "wc", "find", "tee", "tar", "gzip", "gunzip", "zip",
    "unzip", "diff", "comm", "basename", "dirname", "sleep", "date", "ps", "top",
    "df", "du", "stat", "file", "seq", "yes", "less", "more", "t3", "open",
    # core dev runtimes / package managers / vcs
    "python", "python3", "pip", "pip3", "node", "npm", "npx", "yarn", "pnpm",
    "deno", "bun", "git", "make", "cmake", "go", "cargo", "rustc", "java",
    "javac", "mvn", "gradle", "ruby", "gem", "bundle", "bash", "sh", "zsh", "fish",
    # generic network tools (too generic to map to one skill)
    "curl", "wget", "jq", "ssh", "scp", "rsync", "nc", "ping", "dig", "host",
}

# Cuts a lowercased skill/binary name into alphanumeric words, e.g.
# "plugin:railway-deploy" -> plugin, railway, deploy.
_WORD_SPLIT_RE = re.compile(r"[^a-z0-9]+")


def _binary(cmd: str) -> str:
    """Return the external binary a Bash command invokes (basename), or ''.

    Only the first line is inspected, so heredoc bodies and multiline scripts
    never become "binaries". Within that line each segment (split on
    ``&&``, ``||``, ``|``, ``;``) is examined in order: leading env
    assignments and wrapper words (``sudo``, ``env``, ...) are stepped over,
    and a segment whose head is pure navigation / a shell keyword, or does not
    look like a command name (flags, redirects, ``$(...)``), is skipped in
    favour of the next one (``cd foo && railway up`` -> ``railway``).

    Structural tokenization only — NOT a domain dictionary.
    """
    first_line = cmd.strip().split("\n", 1)[0]
    for seg in _SEG_RE.split(first_line):
        toks = seg.strip().split()
        i = 0
        while i < len(toks) and (_ASSIGN_RE.match(toks[i]) or toks[i] in _WRAPPERS):
            i += 1
        if i >= len(toks):
            continue  # segment was nothing but assignments / wrappers
        head = toks[i].rsplit("/", 1)[-1]  # /usr/bin/railway -> railway
        if head in _NAV_HEADS or not _NAME_RE.match(head):
            continue
        return head
    return ""


def _name_words(name: str) -> set[str]:
    """Return the set of alphanumeric words in *name* (lowercased)."""
    return {w for w in _WORD_SPLIT_RE.split(name.lower()) if w}


def _skill_covers(binary: str, skill_names: set[str]) -> bool:
    """Tell whether any loaded skill's name matches *binary* on whole words.

    A skill covers a binary when every word of one name appears among the
    words of the other: ``railway`` is covered by ``plugin:railway-deploy``
    and ``docker-compose`` by ``docker``. Matching on whole words (instead of
    raw substrings) keeps a short binary such as ``gh`` from being muted by an
    unrelated skill like ``lighthouse`` that merely contains those letters.
    """
    binary_words = _name_words(binary)
    if not binary_words:
        return False
    for skill in skill_names:
        skill_words = _name_words(skill)
        # An all-punctuation skill name has no words and must not match.
        if skill_words and (skill_words <= binary_words or binary_words <= skill_words):
            return True
    return False


def _tool_input(tc: Any) -> dict[str, Any]:
    """Return the tool call's ``input`` when it is a dict, else an empty dict."""
    inp = getattr(tc, "input", None)
    return inp if isinstance(inp, dict) else {}


def detect_tool_clusters(turns) -> list[dict[str, Any]]:
    """Return the FLAGGED tool-clusters for a session (silence -> empty list).

    Each entry:
      { binary, calls, errored, turns:[i], toolIds:[id], skillLoaded,
        practice, fix, section, source }
    (the last four only when ``practice_for("tool_cluster")`` returns a
    truthy mapping).

    A cluster fires when a non-ubiquitous binary was invoked through Bash at
    least ``MIN_CALLS`` times, at least ``MIN_ERRORS`` of those calls errored,
    and no ``Skill`` call in the session names a skill matching that binary.
    Clean or universally-known binaries are never surfaced.

    Args:
        turns: iterable of turn objects exposing ``i`` and ``tools``; each
            tool call is read for ``name``, ``input``, ``errored`` and ``id``.
            It is walked exactly once, so a one-shot iterable works.

    Returns:
        Flagged clusters, most-flailing first (errors, then calls, then name).
    """
    skill_names: set[str] = set()
    agg: dict[str, dict[str, Any]] = {}

    # Single pass: skills and Bash calls are gathered together because
    # coverage is only decided afterwards, once every skill name is known.
    for t in turns:
        for tc in t.tools:
            kind = getattr(tc, "name", "")
            if kind == "Skill":
                inp = _tool_input(tc)
                nm = str(inp.get("skill") or inp.get("command") or "").strip().lower()
                if nm:
                    skill_names.add(nm)
            elif kind == "Bash":
                cmd = str(_tool_input(tc).get("command", "") or "")
                b = _binary(cmd)
                if not b or b in _UBIQUITOUS:
                    continue
                row = agg.setdefault(
                    b, {"calls": 0, "errored": 0, "turns": set(), "toolIds": []}
                )
                row["calls"] += 1
                if getattr(tc, "errored", False):
                    row["errored"] += 1
                row["turns"].add(t.i)
                tool_id = getattr(tc, "id", None)
                if tool_id:
                    row["toolIds"].append(tool_id)

    bp = practice_for("tool_cluster")
    clusters: list[dict[str, Any]] = []
    for b, row in agg.items():
        if row["calls"] < MIN_CALLS or row["errored"] < MIN_ERRORS:
            continue
        if _skill_covers(b, skill_names):
            continue  # per-binary mute: a matching skill was loaded
        c: dict[str, Any] = {
            "binary": b,
            "calls": row["calls"],
            "errored": row["errored"],
            "turns": sorted(row["turns"]),
            "toolIds": row["toolIds"],
            "skillLoaded": False,  # a matching skill would have suppressed this cluster
        }
        if bp:
            c.update(
                {
                    "practice": bp.get("practice"),
                    "fix": bp.get("fix"),
                    "section": bp.get("section"),
                    "source": bp.get("source"),
                }
            )
        clusters.append(c)

    # Most-flailing first: by errors, then calls, then name (deterministic).
    clusters.sort(key=lambda c: (-c["errored"], -c["calls"], c["binary"]))
    return clusters