Spaces:
Running on Zero
Running on Zero
File size: 7,679 Bytes
45917c7 c6cdf25 45917c7 c6cdf25 0f11b49 45917c7 1f581d5 45917c7 1f581d5 45917c7 1f581d5 45917c7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 | """Inference backend registry — one façade over every place a model can run.
The engine started with a single inference backend: the OpenAI-compatible vLLM
endpoints the project serves on Modal (``modal/catalogue.py``). This module adds a
*second* backend — Hugging Face's serverless Inference Providers
(``hf_catalogue.py``) — and gives both a single, uniform read surface so the router,
the config loader, and the Lab UI never special-case which backend a model lives on.
A model is named by a **backend-qualified key**: ``"<backend>:<raw_key>"``, e.g.
``"hf:Qwen/Qwen2.5-7B-Instruct"``. A *bare* key with no recognised prefix means the
Modal backend — so every existing key, manifest ``model_endpoint``, and
``config/models.yaml`` ``endpoint:`` keeps working untouched (Modal is the default
backend). The router resolves a qualified key to the right backend's binding; offline
it folds into the deterministic stub like any profile, so demos stay reproducible.
Each backend exposes the same three primitives (``entries`` / ``binding_for`` /
``default_key_for_profile``); this layer dispatches on the prefix and adds a
``backend`` tag to every entry. Adding another backend later = add one entry to
``_BACKENDS`` — nothing above this module changes.
"""
from __future__ import annotations
import os
from dataclasses import dataclass
from src.models import hf_catalogue, local_catalogue, modal_catalogue
# Separator between a backend prefix and the backend-local key. A raw HF repo id can
# contain ``/`` but never a leading ``<backend>:`` prefix, so a single split is safe.
# ``split_key`` cuts at the first occurrence only and ``qualify`` joins with it, so
# any later ``:`` stays part of the raw key.
SEP = ":"
@dataclass(frozen=True)
class Backend:
    """Immutable record describing a single inference backend.

    Bundles the registry handle, the human-facing text shown for the backend, and
    the catalogue object that calls for this backend are dispatched to.
    """

    # Stable handle; also used as the key prefix, e.g. "modal".
    key: str
    # Short display name, e.g. "Modal".
    label: str
    # One-line UX description.
    blurb: str
    # The backing catalogue module (modal_catalogue / hf_catalogue).
    catalogue: object
# Modal first — it is the default backend a bare key resolves to. Insertion order
# matters: ``backends()`` returns the values in this order as the display order.
_BACKENDS: dict[str, Backend] = {
    "modal": Backend(
        key="modal",
        label="Modal",
        blurb="self-hosted vLLM endpoints you deploy (full control, GPU-backed)",
        catalogue=modal_catalogue,
    ),
    "hf": Backend(
        key="hf",
        label="Hugging Face",
        # The dash is a real em dash; the mis-encoded form would reach the UI as garbage.
        blurb="serverless Inference Providers — many small models, just a token",
        catalogue=hf_catalogue,
    ),
    "local": Backend(
        key="local",
        label="Local GPU",
        blurb="transformers in-process on this Space's own GPU — ZeroGPU or dedicated",
        catalogue=local_catalogue,
    ),
}
# Backend assumed for any key that carries no recognised ``<backend>:`` prefix.
DEFAULT_BACKEND = "modal"
# ── key (de)composition ──────────────────────────────────────────────────────────
def backends() -> list[Backend]:
    """Return all registered backends as a fresh list.

    The order is the insertion order of ``_BACKENDS`` (Modal first), which is also
    the display order.
    """
    return [*_BACKENDS.values()]
def split_key(key: str) -> tuple[str, str]:
    """Decompose a (possibly qualified) *key* into ``(backend, raw_key)``.

    When the text before the first ``SEP`` names a registered backend, the key is
    cut there: ``"hf:org/model"`` → ``("hf", "org/model")``. Every other key — no
    separator at all, or an unrecognised prefix — is returned whole under
    ``DEFAULT_BACKEND``, e.g. ``"gemma-4-12b"`` → ``("modal", "gemma-4-12b")``, so
    bare historical keys keep working.
    """
    # No separator: this can only be a bare key.
    if SEP not in key:
        return DEFAULT_BACKEND, key
    # Cut at the first separator only; later ones belong to the raw key.
    head, tail = key.split(SEP, 1)
    return (head, tail) if head in _BACKENDS else (DEFAULT_BACKEND, key)
def qualify(backend: str, raw_key: str) -> str:
    """Compose the backend-qualified form of *raw_key*.

    Keys of the default backend are returned unchanged — the bare, historical form,
    so existing config and tests are unaffected. Keys of any other backend come back
    as ``"<backend><SEP><raw_key>"``.
    """
    is_default = backend == DEFAULT_BACKEND
    return raw_key if is_default else f"{backend}{SEP}{raw_key}"
def _catalogue(backend: str):
    """Return the catalogue registered for *backend*, or None if it is not registered."""
    registered = _BACKENDS.get(backend)
    if not registered:
        return None
    return registered.catalogue
# ── unified read surface ─────────────────────────────────────────────────────────
def entries(backend: str | None = None) -> list[dict]:
    """List catalogue entries, each tagged with its backend and a qualified key.

    Args:
        backend: restrict the listing to this backend; ``None`` (the default) lists
            every registered backend, in registry order. A name that is not
            registered contributes no entries.

    Returns:
        Shallow copies of the backends' own entries, each with ``backend`` set to
        the owning backend's name and ``key`` rewritten to the qualified form
        produced by ``qualify``.
    """
    names = list(_BACKENDS) if backend is None else [backend]
    collected: list[dict] = []
    for name in names:
        source = _catalogue(name)
        if source is None:
            continue
        for original in source.entries():
            # Copy first so the catalogue's own entry is never mutated.
            copy = dict(original)
            copy.update(backend=name, key=qualify(name, original["key"]))
            collected.append(copy)
    return collected
def entry_by_key(key: str) -> dict | None:
    """Look up the entry for a (qualified or bare) *key*.

    Returns a shallow copy of the owning catalogue's entry with ``backend`` set and
    ``key`` rewritten to its qualified form. Returns None when the backend has no
    catalogue, when the catalogue has no such entry, or when the catalogue lookup
    itself raises.
    """
    backend, raw = split_key(key)
    source = _catalogue(backend)
    found = None
    if source is not None:
        try:
            found = source.entry_by_key(raw)
        except Exception:  # pragma: no cover - defensive: a broken catalogue → no entry
            found = None
    if found is None:
        return None
    # Copy first so the catalogue's own entry is never mutated.
    result = dict(found)
    result.update(backend=backend, key=qualify(backend, found["key"]))
    return result
def binding_for(key: str, env: dict[str, str] | None = None) -> dict:
    """Resolve a (qualified or bare) *key* to its backend binding.

    The key is split into backend and raw key, and the call is forwarded to the
    owning catalogue's ``binding_for(raw, env=env)``, whose result is returned
    unchanged (documented upstream as ``{model, base_url, api_key}``).

    Raises:
        KeyError: when the resolved backend has no catalogue. An unknown raw key is
            left to the catalogue's own ``binding_for`` — presumably also a
            ``KeyError``, mirroring ``modal_catalogue.binding_for``; confirm there.
    """
    backend, raw = split_key(key)
    source = _catalogue(backend)
    if source is not None:
        return source.binding_for(raw, env=env)
    # A profile pointing at a non-existent backend is a config error worth surfacing.
    raise KeyError(f"unknown inference backend {backend!r} for key {key!r}; known: {sorted(_BACKENDS)}")
def default_key_for_profile(profile: str, backend: str = DEFAULT_BACKEND) -> str | None:
    """Return the qualified key of *backend*'s default model for *profile*.

    Returns None when *backend* is not registered or its catalogue reports no
    (or an empty) default for the profile.
    """
    source = _catalogue(backend)
    raw = source.default_key_for_profile(profile) if source is not None else None
    if not raw:
        return None
    return qualify(backend, raw)
# ── credentials / status (per backend) ───────────────────────────────────────────
def backend_label(backend: str) -> str:
    """Return the display label of *backend*, or *backend* itself if unregistered."""
    registered = _BACKENDS.get(backend)
    if registered:
        return registered.label
    return backend
def backend_available(backend: str, env: dict[str, str] | None = None) -> bool:
    """Report whether *backend* has the credentials its catalogue asks for.

    The check is delegated to the backend catalogue's ``has_credentials``, so no
    backend is named here and a new entry in ``_BACKENDS`` is covered automatically.

    Args:
        backend: registry name of the backend; an unregistered name yields False.
        env: mapping to read configuration from; ``os.environ`` when None.
    """
    registered = _BACKENDS.get(backend)
    if not registered:
        return False
    source = env if env is not None else os.environ
    return bool(registered.catalogue.has_credentials(source))
def configured_backends(env: dict[str, str] | None = None) -> list[str]:
    """Return the keys of the backends ``backend_available`` accepts, in display order."""
    live: list[str] = []
    for candidate in backends():
        if backend_available(candidate.key, env=env):
            live.append(candidate.key)
    return live
|