Spaces:

build-small-hackathon
/

her

Running on Zero

File size: 8,039 Bytes

5f43c7d

"""clusters.py — tool_cluster signal: flailing on an external CLI with no skill.

NON-NEGOTIABLE #1/#6: pure code, NO model; the advisor stays SILENT unless a
NAMED, FIXABLE pattern fires. This detects exactly one:

  A run of Bash calls that share an external binary (the command's first real
  token, e.g. `railway`, `gh`, `docker`) where the agent FLAILED — it errored
  repeatedly — and NO skill was loaded in the session to give it that context.

That maps to Anthropic's "Use CLI tools / Create skills" guidance: a skill or a
service CLI would have handed the agent the context it instead burned tokens
rediscovering. The fix text is attached from the cited knowledge file
(best_practices.practice_for) — this module never invents advice.

NOT a domain dictionary. The build owner rejected a service→skill mapping (the
maintenance treadmill). What we keep instead is the inverse: `_UBIQUITOUS`, a
small, stable, STRUCTURAL denylist of commands the agent universally knows —
shell builtins, coreutils, core dev runtimes/VCS — for which a skill would never
help. Adding a new service (vercel, supabase, fly…) needs ZERO maintenance here;
it simply isn't on the denylist, so it can fire. The error signal is the proof,
not a curated allow-list.

PER-BINARY skill coverage: a cluster is suppressed only when a skill whose name
matches THAT binary was loaded (e.g. a `railway` skill mutes the railway cluster).
A skill used elsewhere in the session no longer mutes an unrelated CLI that flailed
— that session-wide suppression was hiding real findings (e.g. railway 17x/2-errored
in a session that happened to use some other skill).
"""
from __future__ import annotations

import re
from typing import Any

from engine.core.best_practices import practice_for

# Tunable in one place. Conservative on purpose: a teaching tool shipped to
# beginners should under-fire rather than nag. Railway-style flailing (many calls,
# several errors) clears these comfortably; a one-off 3-call hiccup does not.
# Both thresholds must be met for a binary's cluster to be reported:
#   MIN_CALLS  — minimum number of Bash calls attributed to the binary.
#   MIN_ERRORS — minimum number of those calls that errored.
MIN_CALLS = 4
MIN_ERRORS = 2

# Command-segment splitters within a single line: &&  ||  |  ;
# Plain regex split with no quote awareness: a separator inside a quoted string
# is split on as well. A single `&` is not a separator.
_SEG_RE = re.compile(r"&&|\|\|?|;")
# Leading `VAR=value` environment assignments to skip when finding the binary.
_ASSIGN_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*=")
# A real command name (rejects flags like `-c`, redirects `<`/`>`, `$(...)`, etc.)
# Must start with an ASCII letter, so names beginning with a digit, `_` or `[`
# are rejected too.
_NAME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9_.+-]*$")
# Command wrappers that precede the real binary.
# `_binary` skips these token by token; a wrapper's own options are NOT skipped,
# so `sudo -u deploy railway up` stops at `-u`, fails _NAME_RE, and that segment
# yields no binary.
_WRAPPERS = {"sudo", "env", "command", "time", "nohup", "exec", "builtin", "then", "do", "xargs"}
# Segment heads that are navigation/no-op/shell-keywords — look at the next segment.
# The "" entry covers a token ending in "/" (its basename is empty).
# NOTE(review): `if`/`elif`/`else`/`while`/`until` are followed by a command in
# the SAME segment, yet `_binary` drops the whole segment when the head is in this
# set (so `if railway status; then …` never attributes the call to `railway`),
# whereas `then`/`do` live in _WRAPPERS and are skipped per token. Confirm whether
# that asymmetry is intended.
_NAV_HEADS = {
    "cd", "pushd", "popd", "export", "set", "source", ".", "unset", "",
    # shell control keywords — never an external binary
    "if", "elif", "else", "fi", "for", "while", "until", "done", "case", "esac",
    "function", "select", "return", "in", "continue", "break",
}

# Universally-known commands a skill would never help with. STRUCTURAL, not a
# domain map: shell builtins + coreutils + core dev runtimes/VCS + generic net
# fetchers. Stable for years; new SERVICES never need adding here.
# Membership is tested case-sensitively against the basename `_binary` returns.
_UBIQUITOUS = {
    # shell builtins / control
    "echo", "printf", "true", "false", "test", "read", "wait", "trap", "eval",
    "alias", "type", "which", "kill", "jobs", "bg", "fg", "umask",
    # coreutils / text
    "cat", "ls", "cp", "mv", "rm", "mkdir", "rmdir", "touch", "ln", "chmod",
    "chown", "pwd", "grep", "egrep", "fgrep", "sed", "awk", "cut", "tr", "sort",
    "uniq", "head", "tail", "wc", "find", "tee", "tar", "gzip", "gunzip", "zip",
    "unzip", "diff", "comm", "basename", "dirname", "sleep", "date", "ps", "top",
    # NOTE(review): "t3" below is not a standard command name — possibly a typo;
    # confirm what was intended.
    "df", "du", "stat", "file", "seq", "yes", "less", "more", "t3", "open",
    # core dev runtimes / package managers / vcs
    "python", "python3", "pip", "pip3", "node", "npm", "npx", "yarn", "pnpm",
    "deno", "bun", "git", "make", "cmake", "go", "cargo", "rustc", "java",
    "javac", "mvn", "gradle", "ruby", "gem", "bundle", "bash", "sh", "zsh", "fish",
    # generic network tools (too generic to map to one skill)
    "curl", "wget", "jq", "ssh", "scp", "rsync", "nc", "ping", "dig", "host",
}


def _binary(cmd: str) -> str:
    """Resolve a Bash command string to the basename of the binary it runs.

    Only the first line of the (stripped) command is examined, so heredoc bodies
    and multi-line scripts can never be mistaken for a binary. That line is cut
    into segments at `&&`, `||`, `|` and `;`; within each segment, leading
    `VAR=value` assignments and wrapper words (`sudo`, `env`, …) are passed over
    and the first remaining token is the candidate. Any directory prefix is
    removed (`/usr/bin/railway` -> `railway`).

    A candidate is accepted only when it is not a navigation/keyword head and it
    looks like a command name; otherwise the search moves on to the next segment
    (`cd foo && railway up` -> `railway`). Purely structural tokenization — there
    is no per-service knowledge here.

    Returns:
        The binary's basename, or '' when no segment yields an acceptable name.
    """
    opening_line = cmd.strip().partition("\n")[0]
    for segment in _SEG_RE.split(opening_line):
        # First word that is neither an env assignment nor a wrapper; None when
        # the segment is empty or made up solely of such prefixes.
        candidate = next(
            (
                word
                for word in segment.split()
                if not _ASSIGN_RE.match(word) and word not in _WRAPPERS
            ),
            None,
        )
        if candidate is None:
            continue
        name = candidate.rpartition("/")[2]  # keep only what follows the last '/'
        if name not in _NAV_HEADS and _NAME_RE.match(name):
            return name
    return ""


def detect_tool_clusters(turns) -> list[dict[str, Any]]:
    """Return the FLAGGED tool-clusters for a session (silence -> empty list).

    A cluster is the set of Bash calls in the session whose command resolves (via
    `_binary`) to the same binary, excluding binaries listed in `_UBIQUITOUS`. A
    cluster is flagged when it has at least MIN_CALLS calls, at least MIN_ERRORS
    errored calls, and no loaded skill is named for that binary.

    Skill coverage is matched on NAME BOUNDARIES, not raw substrings: the binary
    must appear in the skill name (or the skill name in the binary) as a whole run
    delimited by non-alphanumeric characters or the string edges. So a
    `railway-deploy` or `plugin:railway` skill mutes `railway`, and a `docker`
    skill mutes `docker-compose`, but a `lighthouse` skill does not mute `gh`.
    Plain substring matching silently hid short-named CLIs (`gh`, `op`, `fly`, …)
    behind unrelated skills.

    Args:
        turns: iterable of turn objects. Each turn is read for `.tools` (its tool
            calls) and `.i` (its index); each tool call is read for `name`,
            `input`, `errored` and `id`.

    Returns:
        One dict per flagged cluster, ordered by errors (desc), calls (desc),
        then binary name:
        { binary, calls, errored, turns:[i], toolIds:[id], skillLoaded,
          practice, fix, section, source }  — the last four are present only when
        `practice_for("tool_cluster")` returns a truthy mapping. Clean or
        universally-known binaries are never surfaced.
    """

    def _delimited_in(needle: str, haystack: str) -> bool:
        # True when `needle` occurs in `haystack` with no letter/digit directly
        # before or after it. Both arguments are expected lowercased.
        pattern = rf"(?<![a-z0-9]){re.escape(needle)}(?![a-z0-9])"
        return re.search(pattern, haystack) is not None

    # Skill identifiers used anywhere in the session (lowercased). A cluster is
    # muted only if a skill name matches its binary (per-binary, not session-wide).
    skill_names: set[str] = set()
    agg: dict[str, dict[str, Any]] = {}
    for t in turns:
        for tc in t.tools:
            tool = getattr(tc, "name", "")
            if tool not in ("Skill", "Bash"):
                continue
            # Read every tool-call field defensively, the same way `name` and
            # `errored` are read; a non-dict input is treated as empty.
            raw_input = getattr(tc, "input", None)
            inp = raw_input if isinstance(raw_input, dict) else {}

            if tool == "Skill":
                nm = str(inp.get("skill") or inp.get("command") or "").strip().lower()
                if nm:
                    skill_names.add(nm)
                continue

            b = _binary(str(inp.get("command", "") or ""))
            if not b or b in _UBIQUITOUS:
                continue
            row = agg.setdefault(
                b, {"calls": 0, "errored": 0, "turns": set(), "toolIds": []}
            )
            row["calls"] += 1
            if getattr(tc, "errored", False):
                row["errored"] += 1
            row["turns"].add(t.i)
            tool_id = getattr(tc, "id", None)
            if tool_id:
                row["toolIds"].append(tool_id)

    def _covered(binary: str) -> bool:
        bl = binary.lower()
        return any(
            _delimited_in(bl, sn) or _delimited_in(sn, bl) for sn in skill_names
        )

    bp = practice_for("tool_cluster")
    clusters: list[dict[str, Any]] = []
    for b, row in agg.items():
        if row["calls"] < MIN_CALLS or row["errored"] < MIN_ERRORS or _covered(b):
            continue
        c: dict[str, Any] = {
            "binary": b,
            "calls": row["calls"],
            "errored": row["errored"],
            "turns": sorted(row["turns"]),
            "toolIds": row["toolIds"],
            "skillLoaded": False,  # a matching skill would have suppressed this cluster
        }
        if bp:
            # Advice text comes only from the knowledge lookup; nothing is
            # invented here when it is unavailable.
            c.update(
                {
                    "practice": bp.get("practice"),
                    "fix": bp.get("fix"),
                    "section": bp.get("section"),
                    "source": bp.get("source"),
                }
            )
        clusters.append(c)

    # Most-flailing first: by errors, then calls, then name (deterministic).
    clusters.sort(key=lambda c: (-c["errored"], -c["calls"], c["binary"]))
    return clusters