"""binaries_db.py — load the binary registry that names + describes a binary.

Three JSON files, merged into one
`{binary: {product, blurb, homepage, logo, security, source, updated}}` lookup:

  * `narrator/knowledge/binaries.json` — CURATED, hand-editable, ships in the
    repo. The well-known set (railway, aws, docker, remotion, …). Logos and
    `security` notes live here. Code NEVER rewrites this file.
  * `narrator/knowledge/binaries.learned.json` — MACHINE-written by the
    background enricher only (so the curated file is never clobbered). New
    binaries the product discovers in the wild land here, flagged with their
    source.
  * `narrator/knowledge/binaries.bundled.json` — BUNDLED, an optional local
    base layer of top CLI tools shipped with the app.

Precedence, lowest first: bundled < learned < curated. Curated wins on conflict
(a human override always beats a learned guess).

This is the ONE place that reads the registry — like `best_practices.py` for
signals.json. Static reference data, NOT a model and NOT a finding source: it
only supplies the display metadata for a binary the deterministic extractor
already found.

Robust to a missing/corrupt file — returns {} so callers fall back to the bare
name rather than crashing (Non-negotiable: the engine never breaks on bad
reference data).

`security` is the owner's "security definition" field: a short note when a
binary is sensitive (e.g. "Deploys to production / handles cloud credentials"),
surfaced as a small warning badge in the UI.
"""
from __future__ import annotations

import json
import os
from typing import Any, Optional

_HERE = os.path.dirname(os.path.abspath(__file__))
_REPO = os.path.dirname(os.path.dirname(_HERE))
_KNOW = os.path.join(_REPO, "narrator", "knowledge")

CURATED_PATH = os.path.join(_KNOW, "binaries.json")

# BUNDLED: a large LOCAL base layer (top CLI tools from Homebrew/npm/PyPI) shipped with
# the app, so most binaries resolve with NO network and NO model. Optional — {} if absent.
BUNDLED_PATH = os.path.join(_KNOW, "binaries.bundled.json")

# LEARNED: machine-written by the background enricher. Redirectable via HER_LEARNED_PATH
# so the hosted Space persists it on the bucket (/data/_registry/...) — accumulating the
# long tail across restarts so later users get better enrichment. Default = repo path.
LEARNED_PATH = os.environ.get("HER_LEARNED_PATH") or os.path.join(_KNOW, "binaries.learned.json")

# path -> (mtime_ns, parsed dict); a changed mtime means the file was edited or
# replaced, so the stale entry is ignored and the file is parsed again.
_CACHE: dict[str, tuple[int, dict[str, Any]]] = {}


def _read(path: str) -> dict[str, Any]:
    """Parse one registry file into {binary: meta}. {} on missing/corrupt.

    Accepts either a flat object ({binary: {...}}) or {"binaries": {...}} /
    {"binaries": [ {binary|name, ...} ]} so a hand-edited file can use whichever
    shape reads cleanest.

    Results are memoised per path and reused while the file's mtime is
    unchanged. Failed reads are not cached, so a repaired file is picked up on
    the next call.
    """
    try:
        mtime = os.stat(path).st_mtime_ns
    except OSError:
        return {}

    cached = _CACHE.get(path)
    if cached and cached[0] == mtime:
        return cached[1]

    try:
        with open(path, encoding="utf-8") as f:
            raw = json.load(f)
    except (OSError, ValueError, RecursionError):
        # ValueError covers malformed JSON and undecodable bytes; RecursionError
        # covers pathologically nested input. All of them mean "bad reference
        # data", which must never crash the caller.
        return {}

    out: dict[str, Any] = {}
    body = raw.get("binaries", raw) if isinstance(raw, dict) else raw
    if isinstance(body, dict):
        for k, v in body.items():
            if k.startswith("_"):  # allow _provenance / _note metadata keys
                continue
            if isinstance(v, dict):
                out[k] = v
    elif isinstance(body, list):
        for v in body:
            if isinstance(v, dict):
                key = v.get("binary") or v.get("name")
                if key:
                    out[str(key)] = v

    _CACHE[path] = (mtime, out)
    return out


def load_registry() -> dict[str, dict[str, Any]]:
    """Merged registry, lowest precedence first: bundled (local top-tools base)
    < learned (enricher's long tail) < curated (human overrides always win).

    The merge is per binary, not per field: a higher layer's entry replaces the
    lower layer's entry wholesale. The outer dict is fresh on every call, but
    the per-binary metadata dicts are the cached objects, so treat them as
    read-only.
    """
    merged: dict[str, dict[str, Any]] = {}
    merged.update(_read(BUNDLED_PATH))
    merged.update(_read(LEARNED_PATH))
    merged.update(_read(CURATED_PATH))
    return merged


def known_names() -> set[str]:
    """Every binary key the registry can name (for the enricher's 'is this new?')."""
    return set(load_registry().keys())


def lookup(name: str) -> Optional[dict[str, Any]]:
    """Metadata for one binary (exact, then lowercased), or None.

    A non-string `name` yields None instead of raising. An exact-match entry
    whose metadata is empty is falsy, so it falls through to the lowercased
    lookup.
    """
    if not isinstance(name, str):
        return None
    reg = load_registry()
    return reg.get(name) or reg.get(name.lower())