"""Inference backend registry — one façade over every place a model can run. The engine started with a single inference backend: the OpenAI-compatible vLLM endpoints the project serves on Modal (``modal/catalogue.py``). This module adds a *second* backend — Hugging Face's serverless Inference Providers (``hf_catalogue.py``) — and gives both a single, uniform read surface so the router, the config loader, and the Lab UI never special-case which backend a model lives on. A model is named by a **backend-qualified key**: ``":"``, e.g. ``"hf:Qwen/Qwen2.5-7B-Instruct"``. A *bare* key with no recognised prefix means the Modal backend — so every existing key, manifest ``model_endpoint``, and ``config/models.yaml`` ``endpoint:`` keeps working untouched (Modal is the default backend). The router resolves a qualified key to the right backend's binding; offline it folds into the deterministic stub like any profile, so demos stay reproducible. Each backend exposes the same three primitives (``entries`` / ``binding_for`` / ``default_key_for_profile``); this layer dispatches on the prefix and adds a ``backend`` tag to every entry. Adding a third backend later = add one entry to ``_BACKENDS`` — nothing above this module changes. """ from __future__ import annotations import os from dataclasses import dataclass from src.models import hf_catalogue, local_catalogue, modal_catalogue # Separator between a backend prefix and the backend-local key. A raw HF repo id can # contain ``/`` but never a leading ``:`` prefix, so a single split is safe. SEP = ":" @dataclass(frozen=True) class Backend: """One inference backend and the description the UI shows for it.""" key: str # stable handle / key prefix, e.g. "modal" label: str # short display name, e.g. "Modal" blurb: str # one-line UX description catalogue: object # the backing catalogue module (modal_catalogue / hf_catalogue) # Modal first — it is the default backend a bare key resolves to. 
_BACKENDS: dict[str, Backend] = {
    # Insertion order doubles as display order (see ``backends()``).
    "modal": Backend(
        key="modal",
        label="Modal",
        blurb="self-hosted vLLM endpoints you deploy (full control, GPU-backed)",
        catalogue=modal_catalogue,
    ),
    "hf": Backend(
        key="hf",
        label="Hugging Face",
        blurb="serverless Inference Providers — many small models, just a token",
        catalogue=hf_catalogue,
    ),
    "local": Backend(
        key="local",
        label="Local GPU",
        blurb="transformers in-process on this Space's own GPU — ZeroGPU or dedicated",
        catalogue=local_catalogue,
    ),
}

# Backend that a bare (unprefixed) key resolves to.
DEFAULT_BACKEND = "modal"


# ── key (de)composition ──────────────────────────────────────────────────────────


def backends() -> list[Backend]:
    """Every registered backend, in display order (Modal first)."""
    return list(_BACKENDS.values())


def split_key(key: str) -> tuple[str, str]:
    """Split a (possibly qualified) key into ``(backend, raw_key)``.

    ``"hf:org/model"`` → ``("hf", "org/model")``; a bare key (no recognised
    prefix, e.g. ``"gemma-4-12b"``) → ``("modal", "gemma-4-12b")`` for backward
    compatibility. A key whose text before the first separator is not a
    registered backend is treated as bare and returned whole.
    """
    prefix, sep, rest = key.partition(SEP)
    # ``sep`` is empty when the key has no separator at all; requiring it keeps
    # a bare key that merely equals a backend name (e.g. "hf") on the default backend.
    if sep and prefix in _BACKENDS:
        return prefix, rest
    return DEFAULT_BACKEND, key


def qualify(backend: str, raw_key: str) -> str:
    """Build a backend-qualified key that ``split_key`` maps back to its inputs.

    Default-backend (Modal) keys stay *bare* — the historical form, so existing
    config/tests are unaffected — and every other backend is prefixed. The one
    exception: a default-backend key that itself begins with a registered
    ``"<backend>:"`` prefix is prefixed explicitly, because left bare it would be
    routed to that other backend by ``split_key``.
    """
    if backend == DEFAULT_BACKEND:
        # Bare form is only safe when split_key would hand the key back unchanged.
        if split_key(raw_key) == (DEFAULT_BACKEND, raw_key):
            return raw_key
    return f"{backend}{SEP}{raw_key}"


def _catalogue(backend: str) -> object | None:
    """The catalogue module registered for *backend*, or None if it is unknown."""
    registered = _BACKENDS.get(backend)
    return registered.catalogue if registered else None


def _tagged(entry: dict, backend: str) -> dict:
    """Copy *entry*, adding ``backend`` and rewriting ``key`` to its qualified form."""
    return {**entry, "backend": backend, "key": qualify(backend, entry["key"])}


# ── unified read surface ───────────────────────────────────────────────────────────


def entries(backend: str | None = None) -> list[dict]:
    """Catalogue entries, each tagged with its ``backend`` and a qualified ``key``.

    With *backend* set, only that backend's models; otherwise every backend's,
    in backend order. Each dict is a copy of the backend's own entry plus
    ``backend``, with ``key`` rewritten to the qualified form the rest of the
    engine passes around. An unregistered *backend* yields an empty list.
    """
    names = list(_BACKENDS) if backend is None else [backend]
    out: list[dict] = []
    for name in names:
        cat = _catalogue(name)
        if cat is None:
            continue
        out.extend(_tagged(entry, name) for entry in cat.entries())
    return out


def entry_by_key(key: str) -> dict | None:
    """The entry for a (qualified or bare) *key*, tagged with its backend, or None."""
    backend, raw = split_key(key)
    cat = _catalogue(backend)
    if cat is None:
        return None
    try:
        entry = cat.entry_by_key(raw)
    except Exception:  # pragma: no cover - defensive: a broken catalogue → no entry
        # Deliberate best-effort: a failing lookup is reported as "no such entry".
        return None
    if entry is None:
        return None
    return _tagged(entry, backend)


def binding_for(key: str, env: dict[str, str] | None = None) -> dict:
    """Resolve a (qualified or bare) *key* to ``{model, base_url, api_key}``.

    Dispatches to the owning backend's ``binding_for``, passing *env* through
    unchanged. Raises ``KeyError`` for an unknown backend; an unknown key is
    expected to raise ``KeyError`` from the catalogue itself — a profile pointing
    at a non-existent model is a config error worth surfacing (mirrors
    ``modal_catalogue.binding_for``).
    """
    backend, raw = split_key(key)
    cat = _catalogue(backend)
    if cat is None:
        # split_key only returns registered prefixes or DEFAULT_BACKEND, so this
        # fires only if the default backend itself is missing from the registry.
        raise KeyError(f"unknown inference backend {backend!r} for key {key!r}; known: {sorted(_BACKENDS)}")
    return cat.binding_for(raw, env=env)


def default_key_for_profile(profile: str, backend: str = DEFAULT_BACKEND) -> str | None:
    """The qualified key of *backend*'s default model for *profile*, or None.

    None is returned both for an unregistered *backend* and when the catalogue
    reports no (or an empty) default for the profile.
    """
    cat = _catalogue(backend)
    if cat is None:
        return None
    raw = cat.default_key_for_profile(profile)
    return qualify(backend, raw) if raw else None


# ── credentials / status (per backend) ─────────────────────────────────────────────


def backend_label(backend: str) -> str:
    """The display label for *backend* (its key as a fallback)."""
    registered = _BACKENDS.get(backend)
    return registered.label if registered else backend


def backend_available(backend: str, env: dict[str, str] | None = None) -> bool:
    """True when *backend* has enough configuration to make a live call.

    Each backend's catalogue owns its own credential check (``has_credentials``),
    so this dispatches without naming any backend — adding a backend to
    ``_BACKENDS`` makes it work here for free. Single source of truth for "can
    this backend go live", shared by the UI chip and the composed-run offline
    decision. With *env* omitted the process environment (``os.environ``) is
    consulted; an unregistered *backend* is never available.
    """
    registered = _BACKENDS.get(backend)
    if registered is None:
        return False
    source = os.environ if env is None else env
    return bool(registered.catalogue.has_credentials(source))


def configured_backends(env: dict[str, str] | None = None) -> list[str]:
    """The keys of every backend that is currently live-capable (in display order)."""
    return [b.key for b in backends() if backend_available(b.key, env=env)]