"""Model router — per-agent small-model selection by logical profile. Each agent declares a *profile* in its manifest (``tiny`` | ``fast`` | ``balanced`` | ``strong``). The router resolves that profile to a concrete model name, endpoint, and decoding config, and hands back a ready provider. This is the single place per-agent model selection happens, so: - swapping a profile to a different small model is a one-line config change; - a scenario can mix a ``tiny`` worker with a ``strong`` judge for free; - the rest of the engine never names a model. The live path is the only product path: profiles resolve to concrete models and the router calls them over the gateway. The ``offline`` flag is a *test seam* — when set it serves a :class:`DeterministicTinyModel` for every profile so the suite runs with zero inference and full reproducibility (the deterministic "mock data"). Production never sets it; ``Registry.build_router`` requires live credentials and refuses to construct a stub router. See ADR-0010. On the live path the concrete transport is the :class:`LiteLLMProvider` gateway (ADR-0015): profiles point at the OpenAI-compatible Modal/vLLM endpoints in ``modal/`` and the gateway reports real per-call cost into the Governor. The routing abstraction here is unchanged — only how a model is *called* moved. """ from __future__ import annotations from dataclasses import dataclass, field from src import observability as obs from src.core.manifest import ModelProfile, resolve_model from src.models.provider import DeterministicTinyModel, ModelProvider # Decoding defaults per profile. Smaller models stay cooler and shorter; the # balanced/strong tiers are reasoning models (they think before answering, and the # thinking counts against max_tokens), so they get real room or they truncate # mid-thought and emit an empty answer. Override per-profile via config/models.yaml. 
_PROFILE_DECODING: dict[str, dict[str, float | int]] = {
    "tiny": {"temperature": 0.7, "max_tokens": 192},
    "fast": {"temperature": 0.9, "max_tokens": 320},
    "balanced": {"temperature": 0.8, "max_tokens": 768},
    "strong": {"temperature": 0.6, "max_tokens": 1024},
}


@dataclass
class ProfileSpec:
    """Concrete binding for one logical profile.

    ``kind`` selects the transport: ``"litellm"`` (the default) calls the model over an
    OpenAI-compatible HTTP endpoint; ``"local"`` runs a ``transformers`` model in-process
    on the host GPU (the ``local`` backend — no ``base_url``/``api_key``). The router sets
    it from the resolved backend; everything else about a spec is shared.

    ``api_key`` is a credential: it is kept out of the generated ``repr`` so that printing
    or logging a spec (or a :class:`ModelRouter` holding specs) never exposes it. It still
    takes part in equality and is passed to the provider unchanged.
    """

    model: str
    base_url: str | None = None
    # repr=False: never let the secret reach logs, tracebacks or debug output.
    api_key: str | None = field(default=None, repr=False)
    temperature: float = 0.8
    max_tokens: int = 256
    kind: str = "litellm"


@dataclass
class ModelRouter:
    """Resolves a logical model profile to a concrete provider, with caching.

    Construct with explicit ``specs`` (e.g. from a validated config) or call
    :meth:`from_env` to derive them from ``MODEL_TINY`` / ``MODEL_FAST`` /
    ``MODEL_BALANCED`` / ``MODEL_STRONG`` (falling back to the catalogue default for
    each tier — see :func:`~src.core.manifest.resolve_model`).
    """

    specs: dict[str, ProfileSpec] = field(default_factory=dict)
    # Test seam only: serve the deterministic stub for every profile. Production
    # never sets this (Registry.build_router requires live credentials).
    offline: bool = False
    # Providers already built, keyed by the route key they were requested under.
    _cache: dict[str, ModelProvider] = field(default_factory=dict, init=False, repr=False)

    # ── resolution ──────────────────────────────────────────────────────────
    def for_profile(self, profile: str) -> ModelProvider:
        """Return (and cache) the provider bound to *profile*."""
        if profile not in self._cache:
            self._cache[profile] = self._build(profile)
        return self._cache[profile]

    def complete(self, role: str, prompt: str, profile: ModelProfile = "fast") -> str:
        """Convenience: route by profile and complete in one call."""
        return self.for_profile(profile).complete(role, prompt)

    def describe(self) -> dict[str, str]:
        """Human-readable profile → model map for the UI/stats panel."""
        if self.offline:
            return {p: f"stub:{p} (deterministic)" for p in _PROFILE_DECODING}
        return {p: self._spec_for(p).model for p in _PROFILE_DECODING}

    def model_for(self, key: str) -> str:
        """Concrete model name a *route key* resolves to.

        The key is a profile tier or a catalogue endpoint key — the same key
        :attr:`BaseAgent._route_key` hands :meth:`for_profile`. Unlike :meth:`describe`
        (the four tiers only) this also resolves an explicit ``model_endpoint`` key, so a
        profile- or endpoint-bound winner can be attributed a real model name in the trace
        and the Hall of Fame. Offline returns the deterministic stub label so the demo
        still records a concrete, reproducible winning model.
        """
        if self.offline:
            return f"stub:{key} (deterministic)"
        return self._spec_for(key).model

    # ── internals ───────────────────────────────────────────────────────────
    def _build(self, profile: str) -> ModelProvider:
        """Construct the provider for *profile* (uncached; see :meth:`for_profile`).

        Offline returns the deterministic stub. Otherwise the spec from
        :meth:`_spec_for` decides the transport: ``kind == "local"`` yields the
        in-process transformers provider, anything else the LiteLLM provider.
        Every resolution is logged through ``obs.log`` without the api key.
        """
        if self.offline:
            obs.log("router.resolve", profile=profile, mode="offline", model=f"stub:{profile}")
            return DeterministicTinyModel(variant=f"stub:{profile}")
        spec = self._spec_for(profile)
        # Local in-process backend (ADR-0033): a transformers model on the host GPU, not an
        # HTTP endpoint — dispatch to the in-process provider before reaching for LiteLLM.
        if spec.kind == "local":
            from src.models.local_provider import LocalTransformersProvider

            obs.log("router.resolve", profile=profile, mode="live", model=spec.model, api_base="local")
            return LocalTransformersProvider(
                model=spec.model,
                temperature=spec.temperature,
                max_tokens=spec.max_tokens,
            )
        # Live HTTP transport is the LiteLLM gateway (ADR-0015). Lazy-import keeps the
        # offline path free of the dependency.
        from src.models.litellm_provider import LiteLLMProvider

        # Resolution is logged WITHOUT the api key — only the model + endpoint.
        obs.log(
            "router.resolve",
            profile=profile,
            mode="live",
            model=spec.model,
            api_base=spec.base_url or "",
        )
        return LiteLLMProvider(
            model=spec.model,
            api_base=spec.base_url,
            api_key=spec.api_key,
            temperature=spec.temperature,
            max_tokens=spec.max_tokens,
        )

    def _spec_for(self, profile: str) -> ProfileSpec:
        """Return the :class:`ProfileSpec` for a tier name or catalogue key.

        Lookup order: an explicit entry in ``self.specs``; then, for a key that is not
        one of the four tiers, the catalogue binding from :meth:`_catalogue_spec`;
        finally a spec built from ``resolve_model`` plus the tier's decoding defaults
        (an unknown non-tier key is treated as ``"fast"``).
        """
        if profile in self.specs:
            return self.specs[profile]
        # A key that is not one of the four tiers names a *specific* catalogue model
        # (an agent's ``model_endpoint``): resolve it to that model's live binding so
        # a cast can pin concrete Modal models (ADR-0022). Only reached on the live
        # path — offline, ``_build`` serves the stub before this runs.
        if profile not in _PROFILE_DECODING:
            spec = self._catalogue_spec(profile)
            if spec is not None:
                return spec
            # Unknown non-tier key with no catalogue match → degrade to the fast tier
            # rather than crash ``resolve_model`` on a key it does not recognise.
            profile = "fast"
        decoding = _PROFILE_DECODING[profile]
        return ProfileSpec(
            model=resolve_model(profile),  # type: ignore[arg-type]
            temperature=float(decoding["temperature"]),
            max_tokens=int(decoding["max_tokens"]),
        )

    def _catalogue_spec(self, key: str) -> ProfileSpec | None:
        """Build a :class:`ProfileSpec` from a catalogue endpoint *key*, or None.

        The key may name a model on *either* inference backend — a bare Modal endpoint
        slug (``gemma-4-12b``) or a backend-qualified key (``hf:Qwen/Qwen2.5-7B-Instruct``).
        The model string / endpoint URL / api key come from that backend's catalogue + env
        (``inference.binding_for``); decoding inherits the model's tier default (an unbound
        specialist model → the balanced tier). Returns None when the key is in no
        catalogue, so the caller can fall back gracefully (ADR-0015 / ADR-0024).
        """
        try:
            from src.models import inference

            entry = inference.entry_by_key(key)
            if entry is None:
                return None
            binding = inference.binding_for(key)
        except Exception:  # pragma: no cover - defensive: catalogue unavailable
            return None
        decoding = _PROFILE_DECODING.get(
            entry.get("profile") or "balanced", _PROFILE_DECODING["balanced"]
        )
        # The local backend runs in-process (no endpoint) — tag the spec so _build picks the
        # in-process provider; every other backend resolves to the HTTP (LiteLLM) transport.
        kind = "local" if entry.get("backend") == "local" else "litellm"
        return ProfileSpec(
            model=binding["model"],
            base_url=binding["base_url"] or None,
            api_key=binding["api_key"] or None,
            temperature=float(decoding["temperature"]),
            max_tokens=int(decoding["max_tokens"]),
            kind=kind,
        )

    # ── factory ─────────────────────────────────────────────────────────────
    @classmethod
    def from_env(cls) -> "ModelRouter":
        """Build a live router from environment configuration.

        Each profile resolves to its concrete model via ``resolve_model`` plus the
        per-profile decoding defaults (``MODEL_TINY`` / … overrides honoured). This
        always builds the live path; the deterministic stub is reached only by
        constructing ``ModelRouter(offline=True)`` explicitly (the test seam).
        """
        return cls(offline=False)