her / scripts /build_binaries_db.py
geekwrestler's picture
Squash history (purge pre-scrub demo session blobs)
5f43c7d
#!/usr/bin/env python3
"""Build narrator/knowledge/binaries.bundled.json — a LOCAL base registry of top dev
tools, so most binaries Her extracts resolve with NO network and NO model.
Three public sources, merged (intra-bundle: first source wins; the whole bundle is
still the LOWEST layer — learned + curated always override it):
* Homebrew — full formula + cask API dump (system CLIs). Bulk download.
* npm — registry search filtered to CLI tools (keywords:cli/command-line/cli-tool);
returns name+description+homepage in bulk (the npx-run universe).
* PyPI — top-N most-downloaded packages (hugovk list) + each one's summary
/homepage fetched in parallel (python tools: ruff, black, pytest…).
Anything not bundled is still resolved at runtime by the enricher (Nemotron → public
registries) and persisted to the learned registry, so coverage keeps growing.
Run: python3 scripts/build_binaries_db.py
Env: HER_NPM_MAX (2000; search-offset cap, applied per keyword query) ·
HER_PYPI_MAX (3000) · HER_BREW_DIR (read cached
brew_formula.json/brew_cask.json from here instead of fetching).
"""
from __future__ import annotations
import concurrent.futures
import json
import os
import urllib.request
# Directory that contains this script.
_HERE = os.path.dirname(os.path.abspath(__file__))
# <parent of the script directory>/narrator/knowledge — where the bundle lives.
_KNOW = os.path.join(os.path.dirname(_HERE), "narrator", "knowledge")
# Path of the JSON file written by main().
OUT = os.path.join(_KNOW, "binaries.bundled.json")
# Headers attached to every request issued through _fetch().
_UA = {"User-Agent": "her-db-build/1.0", "Accept": "application/json"}
# Exclusive upper bound for the npm search `from` offset (env HER_NPM_MAX).
NPM_MAX = int(os.environ.get("HER_NPM_MAX", "2000"))
# How many rows of the top-PyPI list are looked up (env HER_PYPI_MAX).
PYPI_MAX = int(os.environ.get("HER_PYPI_MAX", "3000"))
def _fetch(url: str, timeout: float = 90.0):
    """GET *url* with the build's request headers and return the parsed JSON.

    The body is read in full, decoded as UTF-8 and handed to ``json.loads``.
    Network, decoding and JSON errors are not caught here; they propagate to
    the caller.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        payload = response.read()
    return json.loads(payload.decode("utf-8"))
def _add(out: dict, name: str, product: str, blurb: str, homepage: str, source: str):
    """Record *name* in *out* unless it is empty or already registered.

    The first writer wins: an existing entry for *name* is never modified.
    The stored blurb is stripped of surrounding whitespace and cut to 140
    characters; a missing product falls back to *name* and a missing homepage
    to the empty string.
    """
    if not name:
        return
    summary = blurb.strip() if blurb else ""
    entry = {
        "product": product if product else name,
        "blurb": summary[:140],
        "homepage": homepage if homepage else "",
        "source": source,
    }
    if name not in out:
        out[name] = entry
def homebrew(out: dict):
    """Add every Homebrew formula and cask that has a description to *out*.

    Each API dump is read from ``$HER_BREW_DIR/<file>`` when that variable is
    set and the file exists; otherwise it is fetched from formulae.brew.sh.
    Formulae are keyed by ``name`` (source ``"homebrew"``). Casks are keyed by
    ``token`` (source ``"homebrew-cask"``) and use the first element of the
    cask's ``name`` list as the product, falling back to the token.
    """
    cache = os.environ.get("HER_BREW_DIR")

    def load(name, url):
        """Return the parsed dump from the cache directory, else fetch *url*."""
        path = os.path.join(cache, name) if cache else None
        if path and os.path.exists(path):
            # Close the handle deterministically and decode as UTF-8 (the same
            # decoding _fetch applies) instead of the platform locale encoding.
            with open(path, encoding="utf-8") as fh:
                return json.load(fh)
        return _fetch(url)

    print("homebrew: formulae …", flush=True)
    for formula in load("brew_formula.json", "https://formulae.brew.sh/api/formula.json"):
        if formula.get("desc"):
            _add(
                out,
                formula.get("name"),
                formula.get("name"),
                formula.get("desc"),
                formula.get("homepage") or "",
                "homebrew",
            )

    print("homebrew: casks …", flush=True)
    for cask in load("brew_cask.json", "https://formulae.brew.sh/api/cask.json"):
        if not cask.get("desc"):
            continue
        names = cask.get("name")
        if not names:
            product = cask.get("token")
        elif isinstance(names, list):
            product = names[0]
        else:
            # A non-list, non-empty value is used as the product as-is.
            product = names
        _add(out, cask.get("token"), product, cask.get("desc"), cask.get("homepage") or "", "homebrew-cask")
def npm(out: dict):
    """Add npm packages found by the CLI keyword searches to *out*.

    The registry search is run once per keyword query, 250 results per page,
    with offsets below ``NPM_MAX``. Packages without a description are
    skipped. Paging for a keyword stops on an empty or short page, or when a
    page request fails; a failure is reported on stdout and the remaining
    keywords are still processed (best-effort).
    """
    page_size = 250
    print(f"npm: search keywords:cli (up to {NPM_MAX}) …", flush=True)
    got = 0
    for kw in ("keywords:cli", "keywords:command-line", "keywords:cli-tool"):
        for frm in range(0, NPM_MAX, page_size):
            url = f"https://registry.npmjs.org/-/v1/search?text={kw}&size={page_size}&from={frm}"
            try:
                body = _fetch(url, timeout=30)
            except Exception as e:
                # Keep the build going, but make the gap visible: a silently
                # dropped page looks the same as a complete result set.
                print(f"npm: {kw} from={frm} fetch failed:", e, flush=True)
                break
            objs = body.get("objects") or []
            if not objs:
                break
            for o in objs:
                p = o.get("package") or {}
                if not p.get("description"):
                    continue
                hp = (p.get("links") or {}).get("homepage") or ""
                before = len(out)
                _add(out, p.get("name"), p.get("name"), p.get("description"), hp, "npm")
                # _add is a no-op for names already present, so count by size.
                got += len(out) - before
            if len(objs) < page_size:
                # Short page: the registry has no further results for this query.
                break
    print(f"npm: +{got} new", flush=True)
def _pypi_one(name: str):
    """Fetch one project's PyPI metadata and return ``(name, summary, homepage)``.

    Returns ``None`` when the metadata request fails. The homepage is the
    ``home_page`` field when set; otherwise the first ``project_urls`` entry
    whose label contains "home" or "source" (case-insensitive) and whose value
    is a non-empty string; otherwise the empty string.
    """
    try:
        doc = _fetch(f"https://pypi.org/pypi/{name}/json", timeout=15) or {}
        info = doc.get("info") or {}
    except Exception:
        return None

    homepage = info.get("home_page") or ""
    if not homepage:
        links = info.get("project_urls") or {}
        homepage = next(
            (
                target
                for label, target in links.items()
                if isinstance(target, str)
                and target
                and ("home" in label.lower() or "source" in label.lower())
            ),
            "",
        )
    return (name, info.get("summary") or "", homepage)
def pypi(out: dict):
    """Add the most-downloaded PyPI projects that have a summary to *out*.

    Takes the first ``PYPI_MAX`` rows of the hugovk top-packages list and looks
    each project up through ``_pypi_one`` on a 16-worker thread pool. If the
    top list cannot be fetched, the error is printed and nothing is added.
    """
    print(f"pypi: top {PYPI_MAX} (hugovk) + parallel metadata …", flush=True)
    try:
        rows = _fetch("https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json").get("rows", [])
    except Exception as e:
        print("pypi: top-list fetch failed:", e, flush=True)
        return

    names = []
    for row in rows[:PYPI_MAX]:
        project = row.get("project")
        if project:
            names.append(project)

    added = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as pool:
        # map() yields results in input order, so earlier (more downloaded)
        # projects are registered first.
        for result in pool.map(_pypi_one, names):
            if not result:
                continue
            project, summary, homepage = result
            if not summary:
                continue
            size_before = len(out)
            _add(out, project, project, summary, homepage, "pypi")
            added += len(out) - size_before
    print(f"pypi: +{added} new", flush=True)
def main() -> int:
    """Collect all sources into one registry and write it to ``OUT``.

    Sources run in the order Homebrew, npm, PyPI; because entries are only
    added for names not yet present, earlier sources take precedence. Returns
    0 so the value can be used as the process exit status.
    """
    registry: dict = {}
    for collect in (homebrew, npm, pypi):
        collect(registry)

    provenance = {
        "note": "BUNDLED local base registry of top dev tools (Homebrew + npm CLI "
                "search + top PyPI). scripts/build_binaries_db.py. Merged UNDER "
                "learned + curated (curated always wins). No trace content.",
        "sources": ["formulae.brew.sh", "registry.npmjs.org", "pypi.org"],
    }
    doc = {"_provenance": provenance, "binaries": registry}

    with open(OUT, "w", encoding="utf-8") as fh:
        json.dump(doc, fh, ensure_ascii=False)
        fh.write("\n")

    size_kb = os.path.getsize(OUT) // 1024
    print(f"wrote {len(registry)} entries -> {OUT} ({size_kb} KB)", flush=True)
    return 0


# Run the build when executed as a script; main()'s result is the exit status.
if __name__ == "__main__":
    raise SystemExit(main())