Spaces:
Running on Zero
Running on Zero
File size: 5,915 Bytes
5f43c7d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 | #!/usr/bin/env python3
"""Build narrator/knowledge/binaries.bundled.json — a LOCAL base registry of top dev
tools, so most binaries Her extracts resolve with NO network and NO model.
Three public sources, merged (intra-bundle: first source wins; the whole bundle is
still the LOWEST layer — learned + curated always override it):
* Homebrew — full formula + cask API dump (system CLIs). Bulk download.
* npm — registry search filtered to CLI tools (keywords:cli/command-line);
returns name+description+homepage in bulk (the npx-run universe).
* PyPI — top-N most-downloaded packages (hugovk list) + each one's summary
/homepage fetched in parallel (python tools: ruff, black, pytest…).
Anything not bundled is still resolved at runtime by the enricher (Nemotron → public
registries) and persisted to the learned registry, so coverage keeps growing.
Run: python3 scripts/build_binaries_db.py
Env: HER_NPM_MAX (2000) · HER_PYPI_MAX (3000) · HER_BREW_DIR (read cached
brew_formula.json/brew_cask.json from here instead of fetching).
"""
from __future__ import annotations
import concurrent.futures
import json
import os
import urllib.request
# Absolute path of the directory that contains this script.
_HERE = os.path.dirname(os.path.abspath(__file__))
# <parent of the script directory>/narrator/knowledge — the directory the registry is written to.
_KNOW = os.path.join(os.path.dirname(_HERE), "narrator", "knowledge")
# Full path of the bundled registry file that main() writes.
OUT = os.path.join(_KNOW, "binaries.bundled.json")
# HTTP headers attached to every request issued by _fetch().
_UA = {"User-Agent": "her-db-build/1.0", "Accept": "application/json"}
# Paging limit for the npm search (env HER_NPM_MAX, default 2000).
NPM_MAX = int(os.environ.get("HER_NPM_MAX", "2000"))
# How many rows of the top-PyPI list are looked up (env HER_PYPI_MAX, default 3000).
PYPI_MAX = int(os.environ.get("HER_PYPI_MAX", "3000"))
def _fetch(url: str, timeout: float = 90.0):
    """GET *url* and return its body parsed as JSON.

    The request carries the module-level ``_UA`` headers and the response
    bytes are decoded as UTF-8 before parsing. Nothing is caught here:
    errors raised by ``urllib`` or ``json`` propagate to the caller.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        payload = response.read()
    text = payload.decode("utf-8")
    return json.loads(text)
def _add(out: dict, name: str, product: str, blurb: str, homepage: str, source: str):
    """Record one tool in *out* unless *name* is already registered.

    The first writer wins: an existing entry for *name* is never touched.
    A falsy *name* is ignored. *product* falls back to *name*, *homepage*
    falls back to ``""``, and *blurb* is stripped and cut to 140 characters.
    """
    if not name:
        return
    summary = (blurb or "").strip()
    if name in out:
        # Earlier sources take precedence over later ones.
        return
    out[name] = {
        "product": product if product else name,
        "blurb": summary[:140],
        "homepage": homepage if homepage else "",
        "source": source,
    }
def homebrew(out: dict):
    """Add every described Homebrew formula and cask to *out*.

    Each dump is read from ``$HER_BREW_DIR/<file>`` when that environment
    variable is set and the file exists; otherwise it is downloaded from
    formulae.brew.sh. Entries without a ``desc`` are skipped. Formulae are
    keyed by ``name`` (source ``"homebrew"``), casks by ``token`` (source
    ``"homebrew-cask"``).
    """
    cache = os.environ.get("HER_BREW_DIR")

    def load(name, url):
        """Return the parsed dump: cached file *name* if present, else *url*."""
        path = os.path.join(cache, name) if cache else None
        if path and os.path.exists(path):
            # Close the handle deterministically and decode as UTF-8, the same
            # encoding _fetch() uses for the downloaded copy, instead of
            # depending on the platform's locale encoding.
            with open(path, encoding="utf-8") as fh:
                return json.load(fh)
        return _fetch(url)

    print("homebrew: formulae …", flush=True)
    for f in load("brew_formula.json", "https://formulae.brew.sh/api/formula.json"):
        if f.get("desc"):
            _add(out, f.get("name"), f.get("name"), f.get("desc"), f.get("homepage") or "", "homebrew")

    print("homebrew: casks …", flush=True)
    for c in load("brew_cask.json", "https://formulae.brew.sh/api/cask.json"):
        if not c.get("desc"):
            continue
        nm = c.get("name")
        # "name" may be a list (its first element is used) or a scalar; when it
        # is missing or empty the cask token doubles as the product name.
        if isinstance(nm, list) and nm:
            prod = nm[0]
        elif nm:
            prod = nm
        else:
            prod = c.get("token")
        _add(out, c.get("token"), prod, c.get("desc"), c.get("homepage") or "", "homebrew-cask")
def npm(out: dict):
    """Add npm packages found by CLI-related keyword searches to *out*.

    Three keyword queries are paged through the registry search endpoint,
    each one up to ``NPM_MAX`` results. Packages without a description are
    skipped; the rest go through ``_add`` with source ``"npm"``, so names
    already present in *out* are left untouched. A failed request is
    reported and ends paging for that keyword only.
    """
    print(f"npm: search keywords:cli (up to {NPM_MAX}) …", flush=True)
    page = 250  # page size requested from the search endpoint
    got = 0
    for kw in ("keywords:cli", "keywords:command-line", "keywords:cli-tool"):
        for frm in range(0, NPM_MAX, page):
            # Shrink the final page so a limit that is not a multiple of the
            # page size is not overshot.
            size = min(page, NPM_MAX - frm)
            try:
                body = _fetch(
                    f"https://registry.npmjs.org/-/v1/search?text={kw}&size={size}&from={frm}",
                    timeout=30,
                )
            except Exception as exc:
                # Best effort: keep what was collected, but say why we stopped.
                print(f"npm: search failed ({kw}, from={frm}):", exc, flush=True)
                break
            objs = body.get("objects") or []
            if not objs:
                break
            for o in objs:
                p = o.get("package") or {}
                if not p.get("description"):
                    continue
                hp = ((p.get("links") or {}).get("homepage")) or ""
                before = len(out)
                _add(out, p.get("name"), p.get("name"), p.get("description"), hp, "npm")
                got += len(out) - before
            if len(objs) < size:
                # Short page: the registry has no further results for this query.
                break
    print(f"npm: +{got} new", flush=True)
def _pypi_one(name: str):
    """Look up one PyPI project and return ``(name, summary, homepage)``.

    Returns ``None`` when the metadata request or its unpacking fails.
    The homepage is ``info.home_page`` when set; otherwise the first
    ``project_urls`` value whose label contains "home" or "source"
    (case-insensitive), or ``""`` when there is none.
    """
    try:
        payload = _fetch(f"https://pypi.org/pypi/{name}/json", timeout=15) or {}
        meta = payload.get("info") or {}
    except Exception:
        return None

    homepage = meta.get("home_page") or ""
    if not homepage:
        links = meta.get("project_urls") or {}
        homepage = next(
            (
                url
                for label, url in links.items()
                if isinstance(url, str)
                and url
                and ("home" in label.lower() or "source" in label.lower())
            ),
            "",
        )
    return (name, meta.get("summary") or "", homepage)
def pypi(out: dict):
    """Add the most-downloaded PyPI projects that have a summary to *out*.

    Takes the first ``PYPI_MAX`` rows of the hugovk top-packages list,
    resolves each project's metadata through ``_pypi_one`` on a 16-thread
    pool, and registers the results with source ``"pypi"``. If the list
    itself cannot be fetched, the failure is printed and nothing is added.
    """
    print(f"pypi: top {PYPI_MAX} (hugovk) + parallel metadata …", flush=True)
    top_list_url = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json"
    try:
        rows = _fetch(top_list_url).get("rows", [])
    except Exception as exc:
        print("pypi: top-list fetch failed:", exc, flush=True)
        return

    candidates = (row.get("project") for row in rows[:PYPI_MAX])
    names = [project for project in candidates if project]

    size_before = len(out)
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as pool:
        # map() yields in input order; only this thread writes to `out`.
        for result in pool.map(_pypi_one, names):
            if not result:
                continue
            project, summary, homepage = result
            if not summary:
                continue
            _add(out, project, project, summary, homepage, "pypi")
    got = len(out) - size_before
    print(f"pypi: +{got} new", flush=True)
def main() -> int:
    """Build the bundled registry from all three sources and write it to ``OUT``.

    Sources are merged in the order Homebrew, npm, PyPI; because entries are
    only added when their name is new, earlier sources win on conflicts.
    The result is written as a single-line JSON document followed by a
    newline.

    Returns:
        ``0`` on success.
    """
    # Create the target directory before the slow network phase, so a missing
    # or unwritable location fails immediately rather than after the fetches.
    os.makedirs(os.path.dirname(OUT), exist_ok=True)

    out: dict = {}
    homebrew(out)
    npm(out)
    pypi(out)
    doc = {
        "_provenance": {
            "note": "BUNDLED local base registry of top dev tools (Homebrew + npm CLI "
                    "search + top PyPI). scripts/build_binaries_db.py. Merged UNDER "
                    "learned + curated (curated always wins). No trace content.",
            "sources": ["formulae.brew.sh", "registry.npmjs.org", "pypi.org"],
        },
        "binaries": out,
    }
    with open(OUT, "w", encoding="utf-8") as fh:
        json.dump(doc, fh, ensure_ascii=False)
        fh.write("\n")
    print(f"wrote {len(out)} entries -> {OUT} ({os.path.getsize(OUT)//1024} KB)", flush=True)
    return 0
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|