#!/usr/bin/env python3
"""Build narrator/knowledge/binaries.bundled.json — a LOCAL base registry of top
dev tools, so most binaries Her extracts resolve with NO network and NO model.

Three public sources, merged (intra-bundle: first source wins; the whole bundle
is still the LOWEST layer — learned + curated always override it):

  * Homebrew — full formula + cask API dump (system CLIs). Bulk download.
  * npm      — registry search filtered to CLI tools (keywords:cli/command-line);
               returns name+description+homepage in bulk (the npx-run universe).
  * PyPI     — top-N most-downloaded packages (hugovk list) + each one's summary
               /homepage fetched in parallel (python tools: ruff, black, pytest…).

Anything not bundled is still resolved at runtime by the enricher (Nemotron →
public registries) and persisted to the learned registry, so coverage keeps
growing.

Run:  python3 scripts/build_binaries_db.py
Env:  HER_NPM_MAX (2000) · HER_PYPI_MAX (3000) · HER_BREW_DIR (read cached
      brew_formula.json/brew_cask.json from here instead of fetching).
"""
from __future__ import annotations

import concurrent.futures
import json
import os
import urllib.request

_HERE = os.path.dirname(os.path.abspath(__file__))
_KNOW = os.path.join(os.path.dirname(_HERE), "narrator", "knowledge")
OUT = os.path.join(_KNOW, "binaries.bundled.json")
_UA = {"User-Agent": "her-db-build/1.0", "Accept": "application/json"}
NPM_MAX = int(os.environ.get("HER_NPM_MAX", "2000"))
PYPI_MAX = int(os.environ.get("HER_PYPI_MAX", "3000"))

# Page size used for every npm search request.
_NPM_PAGE = 250


def _fetch(url: str, timeout: float = 90.0):
    """GET *url* with the build User-Agent and return the decoded JSON body.

    Network and JSON decoding errors propagate to the caller.
    """
    req = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(req, timeout=timeout) as r:
        return json.loads(r.read().decode("utf-8"))


def _add(out: dict, name: str, product: str, blurb: str, homepage: str,
         source: str) -> None:
    """Register *name* in *out* unless it is already present (first wins).

    Entries without a name, or whose blurb is empty after stripping, are
    skipped: an empty-blurb entry would otherwise occupy the slot and block a
    later source that has a real description. The stored blurb is truncated
    to 140 characters.
    """
    blurb = (blurb or "").strip()
    if not name or not blurb:
        return
    out.setdefault(name, {
        "product": (product or name),
        "blurb": blurb[:140],
        "homepage": homepage or "",
        "source": source,
    })


def homebrew(out: dict) -> None:
    """Add Homebrew formulae, then casks, that carry a description to *out*.

    When HER_BREW_DIR is set and contains the cached dump, that file is read
    instead of downloading from formulae.brew.sh.
    """
    cache = os.environ.get("HER_BREW_DIR")

    def load(name, url):
        # Prefer the cached dump; fall back to the network when it is absent.
        if cache:
            path = os.path.join(cache, name)
            if os.path.exists(path):
                # Explicit UTF-8 so the read does not depend on the locale.
                with open(path, encoding="utf-8") as fh:
                    return json.load(fh)
        return _fetch(url)

    print("homebrew: formulae …", flush=True)
    for f in load("brew_formula.json", "https://formulae.brew.sh/api/formula.json"):
        if f.get("desc"):
            _add(out, f.get("name"), f.get("name"), f.get("desc"),
                 f.get("homepage") or "", "homebrew")

    print("homebrew: casks …", flush=True)
    for c in load("brew_cask.json", "https://formulae.brew.sh/api/cask.json"):
        if not c.get("desc"):
            continue
        token = c.get("token")
        nm = c.get("name")
        # "name" is handled as either a list of display names or a single
        # value; the token is used when it is missing or empty.
        if not nm:
            prod = token
        elif isinstance(nm, list):
            prod = nm[0]
        else:
            prod = nm
        _add(out, token, prod, c.get("desc"), c.get("homepage") or "",
             "homebrew-cask")


def npm(out: dict) -> None:
    """Add npm packages found by CLI-related keyword searches to *out*.

    Each keyword query is paged through independently, 250 results at a time,
    for offsets below NPM_MAX. Paging for a keyword stops on a request error
    (reported, not raised), an empty page, or a short page.
    """
    print(f"npm: search keywords:cli (up to {NPM_MAX}) …", flush=True)
    got = 0
    for kw in ("keywords:cli", "keywords:command-line", "keywords:cli-tool"):
        for frm in range(0, NPM_MAX, _NPM_PAGE):
            url = (f"https://registry.npmjs.org/-/v1/search"
                   f"?text={kw}&size={_NPM_PAGE}&from={frm}")
            try:
                body = _fetch(url, timeout=30)
            except Exception as e:
                # Best effort: give up on this keyword, but say why.
                print(f"npm: {kw} from={frm} failed: {e}", flush=True)
                break
            objs = body.get("objects") or []
            if not objs:
                break
            for o in objs:
                p = o.get("package") or {}
                if not p.get("description"):
                    continue
                hp = ((p.get("links") or {}).get("homepage")) or ""
                before = len(out)
                _add(out, p.get("name"), p.get("name"), p.get("description"),
                     hp, "npm")
                got += len(out) - before
            if len(objs) < _NPM_PAGE:
                break
    print(f"npm: +{got} new", flush=True)


def _pypi_one(name: str):
    """Fetch PyPI metadata for *name*.

    Returns ``(name, summary, homepage)``, or ``None`` when the request fails.
    The homepage falls back to the first ``project_urls`` entry whose label
    contains "home" or "source".
    """
    try:
        info = (_fetch(f"https://pypi.org/pypi/{name}/json", timeout=15) or {}).get("info") or {}
    except Exception:
        # Deliberately best effort: one failed package must not abort the run.
        return None
    hp = info.get("home_page") or ""
    if not hp:
        for k, v in (info.get("project_urls") or {}).items():
            if isinstance(v, str) and v and ("home" in k.lower() or "source" in k.lower()):
                hp = v
                break
    return (name, info.get("summary") or "", hp)


def pypi(out: dict) -> None:
    """Add the top PYPI_MAX PyPI packages that have a summary to *out*.

    The package list comes from the hugovk top-packages dump; metadata for
    each package is fetched in parallel. A failed list fetch is reported and
    the function returns without adding anything.
    """
    print(f"pypi: top {PYPI_MAX} (hugovk) + parallel metadata …", flush=True)
    try:
        rows = _fetch("https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json").get("rows", [])
    except Exception as e:
        print("pypi: top-list fetch failed:", e, flush=True)
        return
    names = [r.get("project") for r in rows[:PYPI_MAX] if r.get("project")]
    got = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as ex:
        # ex.map yields results in input order, so insertion order is stable.
        for res in ex.map(_pypi_one, names):
            if res and res[1]:
                before = len(out)
                _add(out, res[0], res[0], res[1], res[2], "pypi")
                got += len(out) - before
    print(f"pypi: +{got} new", flush=True)


def main() -> int:
    """Collect all three sources and write the bundled registry to OUT.

    Sources are merged in the order Homebrew, npm, PyPI; the first source to
    register a name keeps it. Returns 0 on success.
    """
    out: dict = {}
    homebrew(out)
    npm(out)
    pypi(out)
    doc = {
        "_provenance": {
            "note": "BUNDLED local base registry of top dev tools (Homebrew + npm CLI "
                    "search + top PyPI). scripts/build_binaries_db.py. Merged UNDER "
                    "learned + curated (curated always wins). No trace content.",
            "sources": ["formulae.brew.sh", "registry.npmjs.org", "pypi.org"],
        },
        "binaries": out,
    }
    # Create the target directory so a long fetch is not lost to a missing path.
    os.makedirs(os.path.dirname(OUT), exist_ok=True)
    with open(OUT, "w", encoding="utf-8") as fh:
        json.dump(doc, fh, ensure_ascii=False)
        fh.write("\n")
    print(f"wrote {len(out)} entries -> {OUT} ({os.path.getsize(OUT)//1024} KB)", flush=True)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())