#!/usr/bin/env python3
"""Build narrator/knowledge/binaries.bundled.json — a LOCAL base registry of top dev
tools, so most binaries Her extracts resolve with NO network and NO model.

Three public sources, merged (intra-bundle: first source wins; the whole bundle is
still the LOWEST layer — learned + curated always override it):

  * Homebrew  — full formula + cask API dump (system CLIs). Bulk download.
  * npm       — registry search filtered to CLI tools (keywords:cli/command-line/
                cli-tool); returns name+description+homepage in bulk (the npx-run
                universe).
  * PyPI      — top-N most-downloaded packages (hugovk list) + each one's summary
                /homepage fetched in parallel (python tools: ruff, black, pytest…).

Anything not bundled is still resolved at runtime by the enricher (Nemotron → public
registries) and persisted to the learned registry, so coverage keeps growing.

Run:  python3 scripts/build_binaries_db.py
Env:  HER_NPM_MAX (2000) · HER_PYPI_MAX (3000) · HER_BREW_DIR (read cached
      brew_formula.json/brew_cask.json from here instead of fetching).
"""
from __future__ import annotations

import concurrent.futures
import json
import os
import urllib.request

# Filesystem layout: this script's directory, the sibling narrator/knowledge
# directory one level up, and the bundle file written there.
_HERE = os.path.split(os.path.abspath(__file__))[0]
_KNOW = os.path.join(os.path.split(_HERE)[0], "narrator", "knowledge")
OUT = os.path.join(_KNOW, "binaries.bundled.json")

# Headers sent with every HTTP request made by _fetch().
_UA = {
    "User-Agent": "her-db-build/1.0",
    "Accept": "application/json",
}

# Size limits, overridable through the environment (values must parse as int).
NPM_MAX = int(os.getenv("HER_NPM_MAX", "2000"))
PYPI_MAX = int(os.getenv("HER_PYPI_MAX", "3000"))


def _fetch(url: str, timeout: float = 90.0):
    """Request *url* with the module's ``_UA`` headers and return the parsed JSON.

    The response body is decoded as UTF-8 before parsing. Errors raised by
    ``urlopen``, the decode step or ``json.loads`` propagate to the caller.
    """
    request = urllib.request.Request(url, headers=_UA)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()
    text = raw.decode("utf-8")
    return json.loads(text)


def _add(out: dict, name: str, product: str, blurb: str, homepage: str, source: str):
    if not name:
        return
    blurb = (blurb or "").strip()
    out.setdefault(name, {
        "product": (product or name), "blurb": blurb[:140],
        "homepage": homepage or "", "source": source,
    })


def homebrew(out: dict):
    """Add every Homebrew formula and cask that has a description to *out*.

    Each dump is read from ``$HER_BREW_DIR/<file>`` when that environment
    variable is set and the file exists; otherwise it is downloaded from
    formulae.brew.sh. Formulae are keyed by ``name``; casks are keyed by
    ``token``, with the product label taken from the first element of the
    ``name`` list (or ``name`` itself if it is not a list, or ``token`` when
    ``name`` is empty).
    """
    cache = os.environ.get("HER_BREW_DIR")

    def load(name, url):
        # Prefer a cached copy of the dump; fall back to the network.
        path = os.path.join(cache, name) if cache else None
        if path and os.path.exists(path):
            # Context manager closes the handle deterministically; explicit
            # UTF-8 avoids depending on the platform's locale encoding.
            with open(path, encoding="utf-8") as fh:
                return json.load(fh)
        return _fetch(url)

    print("homebrew: formulae …", flush=True)
    for f in load("brew_formula.json", "https://formulae.brew.sh/api/formula.json"):
        if f.get("desc"):
            _add(out, f.get("name"), f.get("name"), f.get("desc"), f.get("homepage") or "", "homebrew")

    print("homebrew: casks …", flush=True)
    for c in load("brew_cask.json", "https://formulae.brew.sh/api/cask.json"):
        if not c.get("desc"):
            continue
        nm = c.get("name")
        if not nm:
            prod = c.get("token")
        elif isinstance(nm, list):
            # A truthy list is non-empty, so indexing is safe here.
            prod = nm[0]
        else:
            prod = nm
        _add(out, c.get("token"), prod, c.get("desc"), c.get("homepage") or "", "homebrew-cask")


def npm(out: dict):
    """Add npm packages found by CLI-related keyword searches to *out*.

    Three keyword queries are paged through the registry search endpoint, up
    to 250 results per page and at most ``NPM_MAX`` results per keyword.
    Packages without a description are skipped. Paging for a keyword stops on
    a fetch error (which is reported, not raised), on an empty page, or on a
    short page. The number of entries newly added to *out* is printed at the
    end.
    """
    print(f"npm: search keywords:cli (up to {NPM_MAX}) …", flush=True)
    got = 0
    page = 250
    for kw in ("keywords:cli", "keywords:command-line", "keywords:cli-tool"):
        for frm in range(0, NPM_MAX, page):
            # Clamp the final page so the per-keyword cap is honoured even
            # when NPM_MAX is not a multiple of the page size.
            size = min(page, NPM_MAX - frm)
            url = f"https://registry.npmjs.org/-/v1/search?text={kw}&size={size}&from={frm}"
            try:
                body = _fetch(url, timeout=30)
            except Exception as e:
                # Best-effort source: report the failure and move on to the
                # next keyword instead of silently dropping the remaining pages.
                print(f"npm: search failed for {kw} (from={frm}):", e, flush=True)
                break
            objs = body.get("objects") or []
            if not objs:
                break
            for o in objs:
                p = o.get("package") or {}
                if not p.get("description"):
                    continue
                hp = ((p.get("links") or {}).get("homepage")) or ""
                before = len(out)
                _add(out, p.get("name"), p.get("name"), p.get("description"), hp, "npm")
                # _add never overwrites, so the size delta counts only new names.
                got += len(out) - before
            if len(objs) < size:
                # Short page: the registry has no more results for this keyword.
                break
    print(f"npm: +{got} new", flush=True)


def _pypi_one(name: str):
    """Look up one package on PyPI and return ``(name, summary, homepage)``.

    Returns ``None`` when the metadata request (or extracting its ``info``
    section) raises. The homepage is ``info["home_page"]`` when set, otherwise
    the first non-empty string in ``project_urls`` whose label contains
    "home" or "source" (case-insensitive), otherwise an empty string.
    """
    try:
        payload = _fetch(f"https://pypi.org/pypi/{name}/json", timeout=15) or {}
        info = payload.get("info") or {}
    except Exception:
        return None

    homepage = info.get("home_page") or ""
    if not homepage:
        urls = info.get("project_urls") or {}
        homepage = next(
            (
                link
                for label, link in urls.items()
                if isinstance(link, str)
                and link
                and ("home" in label.lower() or "source" in label.lower())
            ),
            "",
        )
    return (name, info.get("summary") or "", homepage)


def pypi(out: dict):
    """Add the top ``PYPI_MAX`` most-downloaded PyPI packages to *out*.

    The package list comes from the hugovk top-packages JSON. If that fetch
    fails, the error is printed and nothing is added. Per-package metadata is
    fetched with a 16-worker thread pool; packages with no metadata or no
    summary are skipped. The number of entries newly added to *out* is
    printed at the end.
    """
    print(f"pypi: top {PYPI_MAX} (hugovk) + parallel metadata …", flush=True)
    try:
        listing = _fetch("https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json")
        rows = listing.get("rows", [])
    except Exception as e:
        print("pypi: top-list fetch failed:", e, flush=True)
        return

    top_rows = rows[:PYPI_MAX]
    names = [row.get("project") for row in top_rows if row.get("project")]

    added = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as pool:
        # map() yields results in input order, so insertion order follows rank.
        for result in pool.map(_pypi_one, names):
            if not result:
                continue
            if not result[1]:
                continue
            size_before = len(out)
            _add(out, result[0], result[0], result[1], result[2], "pypi")
            added += len(out) - size_before
    print(f"pypi: +{added} new", flush=True)


def main() -> int:
    """Collect all three sources, write the bundle to ``OUT`` and return 0.

    Sources run in the order Homebrew, npm, PyPI; because entries are never
    overwritten once added, an earlier source wins on a name clash.
    """
    binaries: dict = {}
    for collect in (homebrew, npm, pypi):
        collect(binaries)

    provenance = {
        "note": "BUNDLED local base registry of top dev tools (Homebrew + npm CLI "
                "search + top PyPI). scripts/build_binaries_db.py. Merged UNDER "
                "learned + curated (curated always wins). No trace content.",
        "sources": ["formulae.brew.sh", "registry.npmjs.org", "pypi.org"],
    }
    doc = {"_provenance": provenance, "binaries": binaries}

    with open(OUT, "w", encoding="utf-8") as fh:
        json.dump(doc, fh, ensure_ascii=False)
        fh.write("\n")

    size_kb = os.path.getsize(OUT) // 1024
    print(f"wrote {len(binaries)} entries -> {OUT} ({size_kb} KB)", flush=True)
    return 0


# Script entry point: run the build and exit with main()'s return value as the
# process status.
if __name__ == "__main__":
    raise SystemExit(main())