Spaces:

build-small-hackathon
/

her

Running on Zero

App Files Files Community

her / engine /core /clusters.py

geekwrestler

Squash history (purge pre-scrub demo session blobs)

5f43c7d 4 days ago

raw

history blame contribute delete

8.04 kB

	"""clusters.py — tool_cluster signal: flailing on an external CLI with no skill.

	NON-NEGOTIABLE #1/#6: pure code, NO model; the advisor stays SILENT unless a
	NAMED, FIXABLE pattern fires. This detects exactly one:

	A run of Bash calls that share an external binary (the command's first real
	token, e.g. `railway`, `gh`, `docker`) where the agent FLAILED — it errored
	repeatedly — and NO skill was loaded in the session to give it that context.

	That maps to Anthropic's "Use CLI tools / Create skills" guidance: a skill or a
	service CLI would have handed the agent the context it instead burned tokens
	rediscovering. The fix text is attached from the cited knowledge file
	(best_practices.practice_for) — this module never invents advice.

	NOT a domain dictionary. The build owner rejected a service→skill mapping (the
	maintenance treadmill). What we keep instead is the inverse: `_UBIQUITOUS`, a
	small, stable, STRUCTURAL denylist of commands the agent universally knows —
	shell builtins, coreutils, core dev runtimes/VCS — for which a skill would never
	help. Adding a new service (vercel, supabase, fly…) needs ZERO maintenance here;
	it simply isn't on the denylist, so it can fire. The error signal is the proof,
	not a curated allow-list.

	PER-BINARY skill coverage: a cluster is suppressed only when a skill whose name
	matches THAT binary was loaded (e.g. a `railway` skill mutes the railway cluster).
	A skill used elsewhere in the session no longer mutes an unrelated CLI that flailed
	— that session-wide suppression was hiding real findings (e.g. railway 17x/2-errored
	in a session that happened to use some other skill).
	"""
	from __future__ import annotations

	import re
	from typing import Any

	from engine.core.best_practices import practice_for

	# Tunable in one place. Conservative on purpose: a teaching tool shipped to
	# beginners should under-fire rather than nag. Railway-style flailing (many calls,
	# several errors) clears these comfortably; a one-off 3-call hiccup does not.
	MIN_CALLS = 4   # minimum Bash calls sharing one binary before a cluster can fire
	MIN_ERRORS = 2  # minimum errored calls among them

	# Command-segment splitters within a single line: && || | ;
	# `\|\|?` covers both `||` and a single `|`. The pattern has no capture groups,
	# so `_SEG_RE.split()` yields only the segments, never the separators.
	_SEG_RE = re.compile(r"&&|\|\|?|;")
	# Leading `VAR=value` environment assignments to skip when finding the binary.
	_ASSIGN_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*=")
	# A real command name (rejects flags like `-c`, redirects `<`/`>`, `$(...)`, etc.)
	_NAME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9_.+-]*$")
	# Command wrappers that precede the real binary.
	_WRAPPERS = {
	    "sudo", "env", "command", "time", "nohup", "exec", "builtin", "then", "do",
	    "xargs",
	}
	# Segment heads that are navigation/no-op/shell-keywords — look at the next segment.
	_NAV_HEADS = {
	    "cd", "pushd", "popd", "export", "set", "source", ".", "unset", "",
	    # shell control keywords — never an external binary
	    "if", "elif", "else", "fi", "for", "while", "until", "done", "case", "esac",
	    "function", "select", "return", "in", "continue", "break",
	}

	# Universally-known commands a skill would never help with. STRUCTURAL, not a
	# domain map: shell builtins + coreutils + core dev runtimes/VCS + generic net
	# fetchers. Stable for years; new SERVICES never need adding here.
	_UBIQUITOUS = {
	    # shell builtins / control
	    "echo", "printf", "true", "false", "test", "read", "wait", "trap", "eval",
	    "alias", "type", "which", "kill", "jobs", "bg", "fg", "umask",
	    # coreutils / text
	    "cat", "ls", "cp", "mv", "rm", "mkdir", "rmdir", "touch", "ln", "chmod",
	    "chown", "pwd", "grep", "egrep", "fgrep", "sed", "awk", "cut", "tr", "sort",
	    "uniq", "head", "tail", "wc", "find", "tee", "tar", "gzip", "gunzip", "zip",
	    "unzip", "diff", "comm", "basename", "dirname", "sleep", "date", "ps", "top",
	    # NOTE(review): "t3" is not a utility I recognise — confirm it is intentional
	    # and not a typo for another command.
	    "df", "du", "stat", "file", "seq", "yes", "less", "more", "t3", "open",
	    # core dev runtimes / package managers / vcs
	    "python", "python3", "pip", "pip3", "node", "npm", "npx", "yarn", "pnpm",
	    "deno", "bun", "git", "make", "cmake", "go", "cargo", "rustc", "java",
	    "javac", "mvn", "gradle", "ruby", "gem", "bundle", "bash", "sh", "zsh", "fish",
	    # generic network tools (too generic to map to one skill)
	    "curl", "wget", "jq", "ssh", "scp", "rsync", "nc", "ping", "dig", "host",
	}


	def _binary(cmd: str) -> str:
	    """Return the basename of the program a Bash command invokes, or ''.

	    Only the first line of ``cmd`` is examined, so heredoc bodies and the rest
	    of a multiline script are never considered. That line is cut into segments
	    with ``_SEG_RE`` and the segments are tried left to right:

	    * leading ``VAR=value`` assignments and wrapper words (``_WRAPPERS``, e.g.
	      ``sudo``, ``env``) are skipped;
	    * the next token is reduced to its basename (``/usr/bin/railway`` ->
	      ``railway``);
	    * if that basename is a navigation/keyword head (``_NAV_HEADS``) or does
	      not look like a command name (``_NAME_RE`` — this rejects flags,
	      redirects, ``$(...)``), the segment is abandoned and the next is tried.

	    The first basename that survives is returned; if no segment yields one the
	    result is the empty string. This is structural tokenization only, not a
	    lookup in any domain dictionary.
	    """
	    leading_line = cmd.strip().split("\n", 1)[0]
	    for segment in _SEG_RE.split(leading_line):
	        words = segment.strip().split()
	        # First token that is neither an env assignment nor a wrapper word.
	        first_real = next(
	            (w for w in words if not (_ASSIGN_RE.match(w) or w in _WRAPPERS)),
	            None,
	        )
	        if first_real is None:
	            # Segment was empty or consisted solely of assignments/wrappers.
	            continue
	        candidate = first_real.rsplit("/", 1)[-1]  # strip any directory part
	        if candidate not in _NAV_HEADS and _NAME_RE.match(candidate):
	            return candidate
	    return ""


	def detect_tool_clusters(turns) -> list[dict[str, Any]]:
	    """Collect the tool-clusters that fired for a session (none -> empty list).

	    ``turns`` is iterated twice; each turn must expose ``tools`` (an iterable of
	    tool calls) and ``i``. Tool calls are read through ``name``, ``input``,
	    ``errored`` and ``id``.

	    A cluster is the set of ``Bash`` calls whose command resolves (via
	    ``_binary``) to the same binary, ignoring binaries in ``_UBIQUITOUS``. It is
	    reported only when it has at least ``MIN_CALLS`` calls and at least
	    ``MIN_ERRORS`` errored calls, and no ``Skill`` call in the session names a
	    skill that contains, or is contained in, the binary name (case-insensitive).

	    Each returned dict has ``binary``, ``calls``, ``errored``, ``turns`` (sorted
	    turn indices), ``toolIds`` and ``skillLoaded`` (always ``False``), plus
	    ``practice``, ``fix``, ``section`` and ``source`` when
	    ``practice_for("tool_cluster")`` returns something truthy. Results are
	    ordered by errors descending, then calls descending, then binary name.
	    """

	    def _payload(call) -> dict:
	        # Non-dict inputs are treated as carrying no fields at all.
	        return call.input if isinstance(call.input, dict) else {}

	    # Pass 1: every skill identifier used anywhere in the session, lowercased.
	    loaded_skills: set[str] = set()
	    for turn in turns:
	        for call in turn.tools:
	            if getattr(call, "name", "") != "Skill":
	                continue
	            fields = _payload(call)
	            label = str(fields.get("skill") or fields.get("command") or "")
	            label = label.strip().lower()
	            if label:
	                loaded_skills.add(label)

	    def _has_matching_skill(binary: str) -> bool:
	        # Per-binary muting: substring match in either direction.
	        needle = binary.lower()
	        for skill in loaded_skills:
	            if needle in skill or skill in needle:
	                return True
	        return False

	    # Pass 2: tally Bash calls per external binary.
	    tallies: dict[str, dict[str, Any]] = {}
	    for turn in turns:
	        for call in turn.tools:
	            if getattr(call, "name", "") != "Bash":
	                continue
	            fields = _payload(call)
	            name = _binary(str(fields.get("command", "") or ""))
	            if not name or name in _UBIQUITOUS:
	                continue
	            if name not in tallies:
	                tallies[name] = {
	                    "calls": 0,
	                    "errored": 0,
	                    "turns": set(),
	                    "toolIds": [],
	                }
	            tally = tallies[name]
	            tally["calls"] += 1
	            if getattr(call, "errored", False):
	                tally["errored"] += 1
	            tally["turns"].add(turn.i)
	            if call.id:
	                tally["toolIds"].append(call.id)

	    # Advice is looked up once and attached verbatim to every fired cluster.
	    advice = practice_for("tool_cluster")
	    flagged: list[dict[str, Any]] = []
	    for name, tally in tallies.items():
	        fired = tally["calls"] >= MIN_CALLS and tally["errored"] >= MIN_ERRORS
	        if not fired or _has_matching_skill(name):
	            continue
	        entry: dict[str, Any] = {
	            "binary": name,
	            "calls": tally["calls"],
	            "errored": tally["errored"],
	            "turns": sorted(tally["turns"]),
	            "toolIds": tally["toolIds"],
	            # a matching skill would have suppressed this cluster
	            "skillLoaded": False,
	        }
	        if advice:
	            for key in ("practice", "fix", "section", "source"):
	                entry[key] = advice.get(key)
	        flagged.append(entry)

	    # Most-flailing first: by errors, then calls, then name (deterministic).
	    return sorted(
	        flagged, key=lambda e: (-e["errored"], -e["calls"], e["binary"])
	    )