Spaces:
Running on Zero
Running on Zero
| """binaries_db.py — load the binary registry that names + describes a binary. | |
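| Up to three JSON files, merged into one `{binary: {product, blurb, homepage, logo, | |
| security, source, updated}}` lookup — the two described below, plus an optional | |
| bundled base layer (`binaries.bundled.json`) that has the lowest precedence: | |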
| * `narrator/knowledge/binaries.json` — CURATED, hand-editable, ships in | |
| the repo. The well-known set (railway, aws, docker, remotion, …). Logos and | |
| `security` notes live here. Code NEVER rewrites this file. | |
| * `narrator/knowledge/binaries.learned.json` — MACHINE-written by the background | |
| enricher only (so the curated file is never clobbered). New binaries the | |
| product discovers in the wild land here, flagged with their source. | |
| Curated wins on conflict (a human override always beats a learned guess). This is | |
| the ONE place that reads the registry — like `best_practices.py` for signals.json. | |
| Static reference data, NOT a model and NOT a finding source: it only supplies the | |
| display metadata for a binary the deterministic extractor already found. Robust to | |
| a missing/corrupt file — returns {} so callers fall back to the bare name rather | |
| than crashing (Non-negotiable: the engine never breaks on bad reference data). | |
| `security` is the owner's "security definition" field: a short note when a binary | |
| is sensitive (e.g. "Deploys to production / handles cloud credentials"), surfaced | |
| as a small warning badge in the UI. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| from typing import Any, Optional | |
# Filesystem anchors: the directory holding this module, the directory two
# levels above it (treated as the repo root), and the knowledge directory
# under that root where the registry files live.
_HERE = os.path.dirname(os.path.abspath(__file__))
_REPO = os.path.dirname(os.path.dirname(_HERE))
_KNOW = os.path.join(_REPO, "narrator", "knowledge")

# CURATED: the hand-edited registry.
CURATED_PATH = os.path.join(_KNOW, "binaries.json")

# BUNDLED: a large local base layer (top CLI tools from Homebrew/npm/PyPI)
# shipped with the app, so most binaries resolve with no network and no model.
# Optional -- reading it yields {} when the file is absent.
BUNDLED_PATH = os.path.join(_KNOW, "binaries.bundled.json")

# LEARNED: machine-written by the background enricher. The HER_LEARNED_PATH
# environment variable redirects it (e.g. so the hosted Space can persist it on
# the bucket under /data/_registry/... and accumulate the long tail across
# restarts). An unset or empty variable falls back to the in-repo path.
_learned_override = os.environ.get("HER_LEARNED_PATH")
LEARNED_PATH = (
    _learned_override
    if _learned_override
    else os.path.join(_KNOW, "binaries.learned.json")
)

# Parse cache: path -> (mtime_ns, parsed dict). An entry is reused only while
# the file's mtime_ns is unchanged, so editing/replacing a file busts the cache.
_CACHE: dict[str, tuple[int, dict[str, Any]]] = {}
def _read(path: str) -> dict[str, Any]:
    """Load one registry file as a ``{binary: meta}`` mapping.

    Several on-disk layouts are understood, so a hand-edited file can use
    whichever reads cleanest:

    * a flat object: ``{binary: {...}}``
    * a wrapped object: ``{"binaries": {binary: {...}}}``
    * a wrapped list: ``{"binaries": [{"binary" | "name": ..., ...}]}``
      (a bare top-level list is processed the same way)

    In the object layouts, keys starting with ``_`` and values that are not
    objects are skipped. In the list layout, items that are not objects or
    that have neither a ``binary`` nor a ``name`` value are skipped.

    Results are memoised in ``_CACHE`` per path and reused while the file's
    ``st_mtime_ns`` is unchanged. A file that cannot be stat'ed, opened or
    parsed yields ``{}`` and leaves the cache untouched.
    """
    try:
        stamp = os.stat(path).st_mtime_ns
    except OSError:
        # Missing/unreadable file: callers fall back to the bare binary name.
        return {}

    hit = _CACHE.get(path)
    if hit and hit[0] == stamp:
        return hit[1]

    try:
        with open(path, encoding="utf-8") as handle:
            payload = json.load(handle)
    except (OSError, ValueError):
        # Corrupt JSON or a bad encoding surfaces as ValueError; treat as empty.
        return {}

    # Unwrap the optional {"binaries": ...} envelope; a flat object is its own body.
    if isinstance(payload, dict):
        entries = payload.get("binaries", payload)
    else:
        entries = payload

    parsed: dict[str, Any] = {}
    if isinstance(entries, dict):
        parsed = {
            name: meta
            for name, meta in entries.items()
            # Leading-underscore keys (_provenance, _note, ...) are metadata.
            if not name.startswith("_") and isinstance(meta, dict)
        }
    elif isinstance(entries, list):
        for meta in entries:
            if not isinstance(meta, dict):
                continue
            ident = meta.get("binary") or meta.get("name")
            if ident:
                parsed[str(ident)] = meta

    _CACHE[path] = (stamp, parsed)
    return parsed
def load_registry() -> dict[str, dict[str, Any]]:
    """Return the merged registry as a fresh ``{binary: meta}`` dict.

    Layers are applied lowest precedence first, so a later layer replaces an
    earlier layer's whole entry for the same binary key:
    bundled (local top-tools base) < learned (enricher's long tail) <
    curated (human overrides always win).
    """
    layers = (BUNDLED_PATH, LEARNED_PATH, CURATED_PATH)
    combined: dict[str, dict[str, Any]] = {}
    for layer_path in layers:
        combined.update(_read(layer_path))
    return combined
def known_names() -> set[str]:
    """Return the set of every binary key present in the merged registry.

    Intended for the enricher's "is this new?" check.
    """
    registry = load_registry()
    return set(registry)
def lookup(name: str) -> Optional[dict[str, Any]]:
    """Return the metadata for one binary, or ``None`` when it is unknown.

    The exact ``name`` is tried first; if that yields nothing (or an empty
    entry), the lowercased name is tried. Registry keys themselves are not
    case-folded, so only the query is normalised.
    """
    registry = load_registry()
    exact = registry.get(name)
    if exact:
        return exact
    return registry.get(name.lower())