Spaces:

build-small-hackathon
/

her

Running on Zero

App Files Files Community

her / scripts /build_binaries_db.py

geekwrestler

Squash history (purge pre-scrub demo session blobs)

5f43c7d 3 days ago

raw

history blame contribute delete

5.92 kB

	#!/usr/bin/env python3
	"""Build narrator/knowledge/binaries.bundled.json — a LOCAL base registry of top dev
	tools, so most binaries Her extracts resolve with NO network and NO model.

	Three public sources, merged (intra-bundle: first source wins; the whole bundle is
	still the LOWEST layer — learned + curated always override it):

	* Homebrew — full formula + cask API dump (system CLIs). Bulk download.
	* npm — registry search filtered to CLI tools (keywords:cli/command-line/cli-tool);
	returns name+description+homepage in bulk (the npx-run universe).
	* PyPI — top-N most-downloaded packages (hugovk list) + each one's summary
	/homepage fetched in parallel (python tools: ruff, black, pytest…).

	Anything not bundled is still resolved at runtime by the enricher (Nemotron → public
	registries) and persisted to the learned registry, so coverage keeps growing.

	Run: python3 scripts/build_binaries_db.py
	Env: HER_NPM_MAX (2000, applied per npm keyword search) · HER_PYPI_MAX (3000) ·
	HER_BREW_DIR (read cached brew_formula.json/brew_cask.json from here instead of fetching).
	"""
	from __future__ import annotations

	import concurrent.futures
	import json
	import os
	import urllib.request

	# The bundle is written next to the other knowledge files, located relative to
	# this script: <parent of this script's directory>/narrator/knowledge/.
	_HERE = os.path.dirname(os.path.abspath(__file__))
	_KNOW = os.path.join(os.path.dirname(_HERE), "narrator", "knowledge")
	OUT = os.path.join(_KNOW, "binaries.bundled.json")

	# Headers sent with every registry request made through _fetch().
	_UA = {
	    "User-Agent": "her-db-build/1.0",
	    "Accept": "application/json",
	}

	# Size limits, overridable through the environment (values are parsed as ints).
	NPM_MAX = int(os.getenv("HER_NPM_MAX", "2000"))
	PYPI_MAX = int(os.getenv("HER_PYPI_MAX", "3000"))


	def _fetch(url: str, timeout: float = 90.0):
	    """Fetch *url* with the build's request headers and return the parsed JSON.

	    The response body is decoded as UTF-8 before parsing. Any error raised by
	    urllib or the JSON parser propagates to the caller.
	    """
	    request = urllib.request.Request(url, headers=_UA)
	    with urllib.request.urlopen(request, timeout=timeout) as response:
	        payload = response.read()
	    return json.loads(payload.decode("utf-8"))


	def _add(out: dict, name: str, product: str, blurb: str, homepage: str, source: str):
	if not name:
	return
	blurb = (blurb or "").strip()
	out.setdefault(name, {
	"product": (product or name), "blurb": blurb[:140],
	"homepage": homepage or "", "source": source,
	})


	def homebrew(out: dict):
	    """Add Homebrew formulae and casks that have a description to *out*.

	    Formulae are keyed by their ``name``; casks are keyed by their ``token``
	    and use the first listed display name (or the token when none is given)
	    as the product. When HER_BREW_DIR is set and holds the matching
	    ``brew_formula.json`` / ``brew_cask.json`` file, that file is read instead
	    of downloading the API dump.
	    """
	    cache = os.environ.get("HER_BREW_DIR")

	    def load(name, url):
	        # Prefer the cached dump; close the file deterministically and read
	        # it as UTF-8 regardless of the platform's default encoding.
	        if cache:
	            path = os.path.join(cache, name)
	            if os.path.exists(path):
	                with open(path, encoding="utf-8") as fh:
	                    return json.load(fh)
	        return _fetch(url)

	    print("homebrew: formulae …", flush=True)
	    for f in load("brew_formula.json", "https://formulae.brew.sh/api/formula.json"):
	        if f.get("desc"):
	            _add(out, f.get("name"), f.get("name"), f.get("desc"), f.get("homepage") or "", "homebrew")

	    print("homebrew: casks …", flush=True)
	    for c in load("brew_cask.json", "https://formulae.brew.sh/api/cask.json"):
	        if not c.get("desc"):
	            continue
	        nm = c.get("name")
	        if not nm:
	            prod = c.get("token")
	        elif isinstance(nm, list):
	            # A cask's "name" may be a list of display names; use the first.
	            prod = nm[0]
	        else:
	            prod = nm
	        _add(out, c.get("token"), prod, c.get("desc"), c.get("homepage") or "", "homebrew-cask")


	def npm(out: dict):
	    """Add npm packages found by CLI-related keyword searches to *out*.

	    Each keyword query is paged 250 results at a time, up to NPM_MAX offsets
	    per keyword. Packages without a description are skipped. Paging for a
	    keyword stops on the first empty or short page, or on a failed request;
	    a failure is reported and the next keyword is still tried.
	    """
	    print(f"npm: search keywords:cli (up to {NPM_MAX}) …", flush=True)
	    page = 250  # page size requested from the search endpoint
	    got = 0
	    for kw in ("keywords:cli", "keywords:command-line", "keywords:cli-tool"):
	        for frm in range(0, NPM_MAX, page):
	            try:
	                body = _fetch(f"https://registry.npmjs.org/-/v1/search?text={kw}&size={page}&from={frm}", timeout=30)
	            except Exception as e:
	                # Best effort: give up on this keyword, but say so instead of
	                # silently producing a smaller bundle.
	                print(f"npm: search {kw} from={frm} failed:", e, flush=True)
	                break
	            objs = body.get("objects") or []
	            if not objs:
	                break
	            for o in objs:
	                p = o.get("package") or {}
	                if not p.get("description"):
	                    continue
	                hp = ((p.get("links") or {}).get("homepage")) or ""
	                before = len(out)
	                _add(out, p.get("name"), p.get("name"), p.get("description"), hp, "npm")
	                # Count only names that were not already in the bundle.
	                got += len(out) - before
	            if len(objs) < page:
	                # A short page means the result set is exhausted.
	                break
	    print(f"npm: +{got} new", flush=True)


	def _pypi_one(name: str):
	    """Look up one PyPI project and return ``(name, summary, homepage)``.

	    Returns None when the metadata request fails. The homepage is the
	    ``home_page`` field when set; otherwise the first non-empty string in
	    ``project_urls`` whose label contains "home" or "source"; otherwise "".
	    """
	    try:
	        response = _fetch(f"https://pypi.org/pypi/{name}/json", timeout=15) or {}
	        info = response.get("info") or {}
	    except Exception:
	        return None

	    homepage = info.get("home_page") or ""
	    if not homepage:
	        project_urls = info.get("project_urls") or {}
	        homepage = next(
	            (
	                link
	                for label, link in project_urls.items()
	                if isinstance(link, str)
	                and link
	                and ("home" in label.lower() or "source" in label.lower())
	            ),
	            "",
	        )
	    summary = info.get("summary") or ""
	    return (name, summary, homepage)


	def pypi(out: dict):
	    """Add the top PYPI_MAX most-downloaded PyPI projects to *out*.

	    Project names come from the hugovk top-packages list; each project's
	    summary and homepage are then fetched on a 16-worker thread pool. Projects
	    whose lookup failed or that have no summary are skipped. If the top list
	    itself cannot be fetched, the failure is printed and nothing is added.
	    """
	    print(f"pypi: top {PYPI_MAX} (hugovk) + parallel metadata …", flush=True)
	    top_list_url = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json"
	    try:
	        rows = _fetch(top_list_url).get("rows", [])
	    except Exception as e:
	        print("pypi: top-list fetch failed:", e, flush=True)
	        return

	    names = []
	    for row in rows[:PYPI_MAX]:
	        project = row.get("project")
	        if project:
	            names.append(project)

	    # Only this thread mutates `out`, so the number of new entries is simply
	    # the size difference across the whole merge.
	    size_before = len(out)
	    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as pool:
	        for result in pool.map(_pypi_one, names):
	            if not result:
	                continue
	            project, summary, homepage = result
	            if summary:
	                _add(out, project, project, summary, homepage, "pypi")
	    got = len(out) - size_before
	    print(f"pypi: +{got} new", flush=True)


	def main() -> int:
	    """Collect all sources into one registry, write it to OUT, and return 0.

	    Sources run in the order homebrew, npm, pypi; because entries are only
	    added when their name is not yet present, earlier sources win on
	    conflicts. The document is written as a single line of UTF-8 JSON followed
	    by a newline.
	    """
	    registry: dict = {}
	    for collect in (homebrew, npm, pypi):
	        collect(registry)

	    provenance = {
	        "note": (
	            "BUNDLED local base registry of top dev tools (Homebrew + npm CLI "
	            "search + top PyPI). scripts/build_binaries_db.py. Merged UNDER "
	            "learned + curated (curated always wins). No trace content."
	        ),
	        "sources": ["formulae.brew.sh", "registry.npmjs.org", "pypi.org"],
	    }
	    doc = {"_provenance": provenance, "binaries": registry}

	    with open(OUT, "w", encoding="utf-8") as fh:
	        json.dump(doc, fh, ensure_ascii=False)
	        fh.write("\n")

	    size_kb = os.path.getsize(OUT) // 1024
	    print(f"wrote {len(registry)} entries -> {OUT} ({size_kb} KB)", flush=True)
	    return 0


	if __name__ == "__main__":
	    # Use main()'s return value as the process exit status.
	    status = main()
	    raise SystemExit(status)