Spaces:
Running on Zero
Running on Zero
| #!/usr/bin/env python3 | |
| """Build narrator/knowledge/binaries.bundled.json — a LOCAL base registry of top dev | |
| tools, so most binaries Her extracts resolve with NO network and NO model. | |
| Three public sources, merged (intra-bundle: first source wins; the whole bundle is | |
| still the LOWEST layer — learned + curated always override it): | |
| * Homebrew — full formula + cask API dump (system CLIs). Bulk download. | |
| * npm — registry search filtered to CLI tools (keywords:cli/command-line); | |
| returns name+description+homepage in bulk (the npx-run universe). | |
| * PyPI — top-N most-downloaded packages (hugovk list) + each one's summary | |
| /homepage fetched in parallel (python tools: ruff, black, pytest…). | |
| Anything not bundled is still resolved at runtime by the enricher (Nemotron → public | |
| registries) and persisted to the learned registry, so coverage keeps growing. | |
| Run: python3 scripts/build_binaries_db.py | |
| Env: HER_NPM_MAX (2000) · HER_PYPI_MAX (3000) · HER_BREW_DIR (read cached | |
| brew_formula.json/brew_cask.json from here instead of fetching). | |
| """ | |
| from __future__ import annotations | |
| import concurrent.futures | |
| import json | |
| import os | |
| import urllib.request | |
# Directory that contains this script.
_HERE = os.path.dirname(os.path.abspath(__file__))
# <parent of the script directory>/narrator/knowledge
_KNOW = os.path.join(os.path.dirname(_HERE), "narrator", "knowledge")
# Path of the bundled registry file that main() writes.
OUT = os.path.join(_KNOW, "binaries.bundled.json")
# HTTP headers attached to every request made by _fetch().
_UA = {"User-Agent": "her-db-build/1.0", "Accept": "application/json"}
# Limits read from the environment (defaults 2000 / 3000). int() raises
# ValueError at import time if a variable is set to a non-integer value.
NPM_MAX = int(os.environ.get("HER_NPM_MAX", "2000"))
PYPI_MAX = int(os.environ.get("HER_PYPI_MAX", "3000"))
def _fetch(url: str, timeout: float = 90.0):
    """GET *url* and return the response body parsed as JSON.

    The request is sent with the module-level ``_UA`` headers and the body
    is decoded as UTF-8 before parsing. Errors raised by ``urllib`` or by
    the JSON decoder propagate to the caller.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        payload = response.read()
    return json.loads(payload.decode("utf-8"))
| def _add(out: dict, name: str, product: str, blurb: str, homepage: str, source: str): | |
| if not name: | |
| return | |
| blurb = (blurb or "").strip() | |
| out.setdefault(name, { | |
| "product": (product or name), "blurb": blurb[:140], | |
| "homepage": homepage or "", "source": source, | |
| }) | |
def homebrew(out: dict):
    """Add every described Homebrew formula and cask to *out*.

    Each dump is read from the directory named by ``HER_BREW_DIR`` when that
    variable is set and the file exists there; otherwise it is downloaded
    from formulae.brew.sh. Entries without a ``desc`` are skipped. Formulae
    are keyed by ``name`` and casks by ``token``; registration goes through
    ``_add``.
    """
    cache = os.environ.get("HER_BREW_DIR")

    def load(name, url):
        """Return the cached dump *name* if available, else fetch *url*."""
        if cache:
            path = os.path.join(cache, name)
            if os.path.exists(path):
                # Close the handle deterministically and read as UTF-8 rather
                # than the platform's default encoding.
                with open(path, encoding="utf-8") as fh:
                    return json.load(fh)
        return _fetch(url)

    print("homebrew: formulae …", flush=True)
    for formula in load("brew_formula.json", "https://formulae.brew.sh/api/formula.json"):
        if formula.get("desc"):
            _add(
                out,
                formula.get("name"),
                formula.get("name"),
                formula.get("desc"),
                formula.get("homepage") or "",
                "homebrew",
            )

    print("homebrew: casks …", flush=True)
    for cask in load("brew_cask.json", "https://formulae.brew.sh/api/cask.json"):
        if not cask.get("desc"):
            continue
        # A cask's "name" may be a list of display names; use the first one,
        # and fall back to the token when no name is given.
        names = cask.get("name")
        if not names:
            product = cask.get("token")
        elif isinstance(names, list):
            product = names[0]
        else:
            product = names
        _add(
            out,
            cask.get("token"),
            product,
            cask.get("desc"),
            cask.get("homepage") or "",
            "homebrew-cask",
        )
def npm(out: dict):
    """Add npm packages found by CLI-related keyword searches to *out*.

    For each of three keyword queries the registry search endpoint is paged
    in steps of 250, starting at offsets below ``NPM_MAX``. Packages without
    a description are skipped. Paging for a keyword stops on a fetch error,
    an empty page, or a short page. The number of newly added entries is
    printed at the end.
    """
    print(f"npm: search keywords:cli (up to {NPM_MAX}) …", flush=True)
    page_size = 250
    size_before = len(out)
    for kw in ("keywords:cli", "keywords:command-line", "keywords:cli-tool"):
        for frm in range(0, NPM_MAX, page_size):
            url = f"https://registry.npmjs.org/-/v1/search?text={kw}&size={page_size}&from={frm}"
            try:
                body = _fetch(url, timeout=30)
            except Exception as e:
                # Best-effort: give up on this keyword, but report the
                # failure so a truncated bundle is not mistaken for a full one.
                print(f"npm: {kw} from={frm} fetch failed: {e}", flush=True)
                break
            objs = body.get("objects") or []
            if not objs:
                break
            for o in objs:
                p = o.get("package") or {}
                if not p.get("description"):
                    continue
                hp = (p.get("links") or {}).get("homepage") or ""
                _add(out, p.get("name"), p.get("name"), p.get("description"), hp, "npm")
            # A short page means the result set is exhausted.
            if len(objs) < page_size:
                break
    print(f"npm: +{len(out) - size_before} new", flush=True)
def _pypi_one(name: str):
    """Fetch PyPI metadata for *name*.

    Returns ``(name, summary, homepage)``, or ``None`` when the lookup
    fails. The homepage is ``info["home_page"]`` when set; otherwise the
    first non-empty string in ``info["project_urls"]`` whose label contains
    "home" or "source" (case-insensitive); otherwise ``""``.
    """
    url = f"https://pypi.org/pypi/{name}/json"
    try:
        info = (_fetch(url, timeout=15) or {}).get("info") or {}
    except Exception:
        # Best-effort per package: a failed lookup just yields no entry.
        return None

    homepage = info.get("home_page") or ""
    if not homepage:
        project_urls = info.get("project_urls") or {}
        homepage = next(
            (
                link
                for label, link in project_urls.items()
                if isinstance(link, str)
                and link
                and ("home" in label.lower() or "source" in label.lower())
            ),
            "",
        )
    summary = info.get("summary") or ""
    return (name, summary, homepage)
def pypi(out: dict):
    """Add the most-downloaded PyPI projects that have a summary to *out*.

    The project list comes from the hugovk top-packages dump, truncated to
    ``PYPI_MAX`` rows. Metadata for each project is fetched by ``_pypi_one``
    on a 16-worker thread pool. If the list itself cannot be fetched, the
    error is printed and nothing is added.
    """
    print(f"pypi: top {PYPI_MAX} (hugovk) + parallel metadata …", flush=True)
    top_list_url = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json"
    try:
        rows = _fetch(top_list_url).get("rows", [])
    except Exception as e:
        print("pypi: top-list fetch failed:", e, flush=True)
        return

    names = []
    for row in rows[:PYPI_MAX]:
        project = row.get("project")
        if project:
            names.append(project)

    size_before = len(out)
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as pool:
        # map() yields results in input order; only this thread mutates out.
        for result in pool.map(_pypi_one, names):
            if not result or not result[1]:
                continue
            pkg, summary, homepage = result
            _add(out, pkg, pkg, summary, homepage, "pypi")
    print(f"pypi: +{len(out) - size_before} new", flush=True)
def main() -> int:
    """Build the bundled registry and write it to ``OUT``.

    Sources are collected in the order Homebrew, npm, PyPI. Because ``_add``
    never replaces an existing key, an earlier source wins on a name clash.
    The result is written as one line of JSON with a ``_provenance`` header
    and a ``binaries`` mapping.

    Returns:
        0 on success, for use as the process exit status.
    """
    # Create the target directory before any network work, so a missing or
    # unwritable path fails here instead of after everything was fetched.
    os.makedirs(os.path.dirname(OUT), exist_ok=True)

    out: dict = {}
    homebrew(out)
    npm(out)
    pypi(out)

    doc = {
        "_provenance": {
            "note": "BUNDLED local base registry of top dev tools (Homebrew + npm CLI "
                    "search + top PyPI). scripts/build_binaries_db.py. Merged UNDER "
                    "learned + curated (curated always wins). No trace content.",
            "sources": ["formulae.brew.sh", "registry.npmjs.org", "pypi.org"],
        },
        "binaries": out,
    }
    with open(OUT, "w", encoding="utf-8") as fh:
        json.dump(doc, fh, ensure_ascii=False)
        fh.write("\n")
    print(f"wrote {len(out)} entries -> {OUT} ({os.path.getsize(OUT)//1024} KB)", flush=True)
    return 0
# Script entry point: run the build and exit with main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main())