File size: 5,915 Bytes
5f43c7d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env python3
"""Build narrator/knowledge/binaries.bundled.json — a LOCAL base registry of top dev
tools, so most binaries Her extracts resolve with NO network and NO model.

Three public sources, merged (intra-bundle: first source wins; the whole bundle is
still the LOWEST layer — learned + curated always override it):

  * Homebrew  — full formula + cask API dump (system CLIs). Bulk download.
  * npm       — registry search filtered to CLI tools (keywords:cli, command-line,
                cli-tool); returns name+description+homepage in bulk (the npx-run
                universe).
  * PyPI      — top-N most-downloaded packages (hugovk list) + each one's summary
                /homepage fetched in parallel (python tools: ruff, black, pytest…).

Anything not bundled is still resolved at runtime by the enricher (Nemotron → public
registries) and persisted to the learned registry, so coverage keeps growing.

Run:  python3 scripts/build_binaries_db.py
Env:  HER_NPM_MAX (2000, applied to each npm keyword search) · HER_PYPI_MAX (3000)
      · HER_BREW_DIR (read cached brew_formula.json/brew_cask.json from here
      instead of fetching).
"""
from __future__ import annotations

import concurrent.futures
import json
import os
import urllib.request

# Directory containing this script; the output location is derived from it.
_HERE = os.path.dirname(os.path.abspath(__file__))
# <parent of the script directory>/narrator/knowledge
_KNOW = os.path.join(os.path.dirname(_HERE), "narrator", "knowledge")
# Destination file for the generated bundle.
OUT = os.path.join(_KNOW, "binaries.bundled.json")
# Headers attached to every request issued through _fetch().
_UA = {
    "User-Agent": "her-db-build/1.0",
    "Accept": "application/json",
}
# Size caps for the npm and PyPI sources; each can be overridden via the environment.
NPM_MAX = int(os.getenv("HER_NPM_MAX", "2000"))
PYPI_MAX = int(os.getenv("HER_PYPI_MAX", "3000"))


def _fetch(url: str, timeout: float = 90.0):
    """Request *url* with the ``_UA`` headers and return the parsed JSON body.

    The response bytes are decoded as UTF-8 before parsing. Errors raised by the
    request or by JSON decoding are not handled here and propagate to the caller.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        payload = response.read()
        return json.loads(payload.decode("utf-8"))


def _add(out: dict, name: str, product: str, blurb: str, homepage: str, source: str):
    """Register *name* in *out* unless it is empty or already present.

    The stored record has the keys ``product`` (falls back to *name*), ``blurb``
    (stripped and cut to 140 characters), ``homepage`` (falls back to ``""``)
    and ``source``. An existing entry is never overwritten.
    """
    if not name:
        return
    # Normalise the description before the membership check, as the original did.
    text = blurb.strip() if blurb else ""
    if name in out:
        return
    record = {}
    record["product"] = product if product else name
    record["blurb"] = text[:140]
    record["homepage"] = homepage if homepage else ""
    record["source"] = source
    out[name] = record


def homebrew(out: dict):
    """Add Homebrew formulae and casks that carry a description to *out*.

    Formulae are keyed by their ``name``; casks are keyed by their ``token`` and
    use the first entry of their ``name`` list (or the token) as product name.
    Entries without ``desc`` are skipped. When ``HER_BREW_DIR`` is set and holds
    ``brew_formula.json`` / ``brew_cask.json``, those files are read instead of
    downloading the dumps from formulae.brew.sh.

    Download or parse errors are not caught here and propagate to the caller.
    """
    cache = os.environ.get("HER_BREW_DIR")

    def load(name, url):
        # Prefer a cached dump when one exists; otherwise fetch it.
        if cache:
            path = os.path.join(cache, name)
            if os.path.exists(path):
                # Close the handle deterministically and read as UTF-8 rather
                # than relying on the platform's default text encoding.
                with open(path, encoding="utf-8") as fh:
                    return json.load(fh)
        return _fetch(url)

    print("homebrew: formulae …", flush=True)
    for f in load("brew_formula.json", "https://formulae.brew.sh/api/formula.json"):
        if f.get("desc"):
            _add(out, f.get("name"), f.get("name"), f.get("desc"), f.get("homepage") or "", "homebrew")

    print("homebrew: casks …", flush=True)
    for c in load("brew_cask.json", "https://formulae.brew.sh/api/cask.json"):
        if not c.get("desc"):
            continue
        nm = c.get("name")
        if isinstance(nm, list) and nm:
            prod = nm[0]
        else:
            # A non-list truthy name is used as-is; anything empty falls back to the token.
            prod = nm or c.get("token")
        _add(out, c.get("token"), prod, c.get("desc"), c.get("homepage") or "", "homebrew-cask")


def npm(out: dict):
    """Add npm packages found by the CLI keyword searches to *out*.

    Each keyword query is paged through separately, starting at offset 0 and
    stopping once ``NPM_MAX`` results have been requested, a page comes back
    empty or short, or a request fails. Packages without a description are
    skipped. The number of newly added entries is printed at the end.
    """
    page_size = 250  # results requested per search page
    print(f"npm: search keywords:cli (up to {NPM_MAX}) …", flush=True)
    got = 0
    for kw in ("keywords:cli", "keywords:command-line", "keywords:cli-tool"):
        for frm in range(0, NPM_MAX, page_size):
            # Clamp the final page so a cap that is not a multiple of the page
            # size is not overshot.
            size = min(page_size, NPM_MAX - frm)
            url = f"https://registry.npmjs.org/-/v1/search?text={kw}&size={size}&from={frm}"
            try:
                body = _fetch(url, timeout=30)
            except Exception as e:
                # Best-effort source: keep what was gathered so far, but report
                # the failure instead of dropping it silently.
                print(f"npm: search {kw} from={frm} failed:", e, flush=True)
                break
            objs = body.get("objects") or []
            if not objs:
                break
            for o in objs:
                p = o.get("package") or {}
                if not p.get("description"):
                    continue
                hp = ((p.get("links") or {}).get("homepage")) or ""
                # _add only inserts unseen names, so the size delta counts new entries.
                before = len(out)
                _add(out, p.get("name"), p.get("name"), p.get("description"), hp, "npm")
                got += len(out) - before
            if len(objs) < size:
                # A short page means the registry has no further results.
                break
    print(f"npm: +{got} new", flush=True)


def _pypi_one(name: str):
    """Look up *name* on PyPI and return ``(name, summary, homepage)``.

    Returns ``None`` when the metadata request (or reading its ``info`` entry)
    raises. The homepage is ``info["home_page"]`` when set; otherwise the first
    non-empty string in ``project_urls`` whose label contains "home" or
    "source" (case-insensitive), or ``""`` when there is none.
    """
    try:
        payload = _fetch(f"https://pypi.org/pypi/{name}/json", timeout=15)
        info = (payload or {}).get("info") or {}
    except Exception:
        return None

    homepage = info.get("home_page") or ""
    if not homepage:
        for label, link in (info.get("project_urls") or {}).items():
            if not (isinstance(link, str) and link):
                continue
            lowered = label.lower()
            if "home" in lowered or "source" in lowered:
                homepage = link
                break

    summary = info.get("summary") or ""
    return (name, summary, homepage)


def pypi(out: dict):
    """Add the top ``PYPI_MAX`` PyPI projects that have a summary to *out*.

    The project names come from the hugovk top-packages list; if that download
    fails the error is printed and nothing is added. Per-project metadata is
    fetched through a 16-worker thread pool, and results are consumed in list
    order. The number of newly added entries is printed at the end.
    """
    print(f"pypi: top {PYPI_MAX} (hugovk) + parallel metadata …", flush=True)
    try:
        rows = _fetch("https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json").get("rows", [])
    except Exception as e:
        print("pypi: top-list fetch failed:", e, flush=True)
        return

    names = []
    for row in rows[:PYPI_MAX]:
        project = row.get("project")
        if project:
            names.append(project)

    added = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as pool:
        for result in pool.map(_pypi_one, names):
            # Skip failed lookups (None) and projects without a summary.
            if not result:
                continue
            project, summary, homepage = result
            if not summary:
                continue
            size_before = len(out)
            _add(out, project, project, summary, homepage, "pypi")
            added += len(out) - size_before
    print(f"pypi: +{added} new", flush=True)


def main() -> int:
    """Gather all sources into one registry, write it to ``OUT`` and return 0.

    Sources run in the order Homebrew, npm, PyPI. Because entries are only added
    for names not yet present, an earlier source wins over a later one.
    """
    registry: dict = {}
    for collect in (homebrew, npm, pypi):
        collect(registry)

    note = (
        "BUNDLED local base registry of top dev tools (Homebrew + npm CLI "
        "search + top PyPI). scripts/build_binaries_db.py. Merged UNDER "
        "learned + curated (curated always wins). No trace content."
    )
    provenance = {
        "note": note,
        "sources": ["formulae.brew.sh", "registry.npmjs.org", "pypi.org"],
    }
    document = {"_provenance": provenance, "binaries": registry}

    with open(OUT, "w", encoding="utf-8") as sink:
        json.dump(document, sink, ensure_ascii=False)
        sink.write("\n")

    size_kb = os.path.getsize(OUT) // 1024
    print(f"wrote {len(registry)} entries -> {OUT} ({size_kb} KB)", flush=True)
    return 0


if __name__ == "__main__":
    # Run the build and hand main()'s return value to the interpreter as exit status.
    status = main()
    raise SystemExit(status)