"""Model router — per-agent small-model selection by logical profile.

Each agent declares a *profile* in its manifest (``tiny`` | ``fast`` |
``balanced`` | ``strong``).  The router resolves that profile to a concrete
model name, endpoint, and decoding config, and hands back a ready provider.
This is the single place per-agent model selection happens, so:

  - swapping a profile to a different small model is a one-line config change;
  - a scenario can mix a ``tiny`` worker with a ``strong`` judge for free;
  - the rest of the engine never names a model.

The live path is the only product path: profiles resolve to concrete models and
the router calls them over the gateway.  The ``offline`` flag is a *test seam* —
when set it serves a :class:`DeterministicTinyModel` for every profile so the
suite runs with zero inference and full reproducibility (the deterministic
"mock data").  Production never sets it; ``Registry.build_router`` requires live
credentials and refuses to construct a stub router.  See ADR-0010.

On the live path the concrete transport is the :class:`LiteLLMProvider` gateway
(ADR-0015): profiles point at the OpenAI-compatible Modal/vLLM endpoints in
``modal/`` and the gateway reports real per-call cost into the Governor.  The
routing abstraction here is unchanged — only how a model is *called* moved.
"""

from __future__ import annotations

from dataclasses import dataclass, field

from src import observability as obs
from src.core.manifest import ModelProfile, resolve_model
from src.models.provider import DeterministicTinyModel, ModelProvider

# Per-profile decoding defaults, keyed by tier name; each value carries the sampling
# ``temperature`` and the ``max_tokens`` budget for that tier.  The balanced/strong
# tiers are reasoning models: their thinking is charged against ``max_tokens``, so a
# tight budget makes them truncate mid-thought and return an empty answer — hence the
# larger allowances there.  Override per-profile via config/models.yaml.
_PROFILE_DECODING: dict[str, dict[str, float | int]] = {
    tier: {"temperature": temperature, "max_tokens": token_budget}
    for tier, temperature, token_budget in (
        # (tier, temperature, max_tokens)
        ("tiny", 0.7, 192),
        ("fast", 0.9, 320),
        ("balanced", 0.8, 768),
        ("strong", 0.6, 1024),
    )
}


@dataclass
class ProfileSpec:
    """Concrete binding for one logical profile.

    ``kind`` selects the transport: ``"litellm"`` (the default) calls the model over an
    OpenAI-compatible HTTP endpoint; ``"local"`` runs a ``transformers`` model in-process
    on the host GPU (the ``local`` backend — no ``base_url``/``api_key``). The router sets
    it from the resolved backend; everything else about a spec is shared.

    ``api_key`` is a credential, so it is left out of the generated ``repr``: printing
    or logging a spec (or any object that embeds one) never exposes the secret.  It
    still takes part in equality and keeps its position in the constructor.
    """

    # Concrete model identifier handed to the provider.
    model: str
    # Endpoint URL for the HTTP transport; None when no endpoint applies.
    base_url: str | None = None
    # Credential for the endpoint.  repr=False keeps it out of logs and tracebacks.
    api_key: str | None = field(default=None, repr=False)
    # Sampling temperature passed to the provider.
    temperature: float = 0.8
    # Generation budget passed to the provider.
    max_tokens: int = 256
    # Transport selector: "litellm" (HTTP) or "local" (in-process).
    kind: str = "litellm"


@dataclass
class ModelRouter:
    """Resolves a logical model profile to a concrete provider, with caching.

    Construct with explicit ``specs`` (e.g. from a validated config) or call
    :meth:`from_env` to derive them from ``MODEL_TINY`` / ``MODEL_FAST`` /
    ``MODEL_BALANCED`` / ``MODEL_STRONG`` (falling back to the catalogue default
    for each tier — see :func:`~src.core.manifest.resolve_model`).

    Resolution order for a route key (see :meth:`_spec_for`): an explicit entry in
    ``specs`` wins; a key that is not one of the four tiers is looked up in the model
    catalogue; a key that is still unresolved degrades to the ``fast`` tier.
    """

    # Explicit per-key bindings; these take precedence over every derived default.
    specs: dict[str, ProfileSpec] = field(default_factory=dict)
    # Test seam only: serve the deterministic stub for every profile. Production
    # never sets this (Registry.build_router requires live credentials).
    offline: bool = False
    # Built providers keyed by the route key they were requested under.
    _cache: dict[str, ModelProvider] = field(default_factory=dict, init=False, repr=False)

    # ── resolution ──────────────────────────────────────────────────────────

    def for_profile(self, profile: str) -> ModelProvider:
        """Return (and cache) the provider bound to *profile*.

        The provider is built on first request and the same instance is returned
        for every later request with the same key.
        """
        if profile not in self._cache:
            self._cache[profile] = self._build(profile)
        return self._cache[profile]

    def complete(self, role: str, prompt: str, profile: ModelProfile = "fast") -> str:
        """Convenience: route by profile and complete in one call."""
        return self.for_profile(profile).complete(role, prompt)

    def describe(self) -> dict[str, str]:
        """Human-readable profile → model map for the UI/stats panel.

        Covers the four tiers only; offline, every tier maps to its stub label.
        """
        if self.offline:
            return {p: f"stub:{p} (deterministic)" for p in _PROFILE_DECODING}
        return {p: self._spec_for(p).model for p in _PROFILE_DECODING}

    def model_for(self, key: str) -> str:
        """Concrete model name a *route key* resolves to (a profile tier or a catalogue
        endpoint key — the same key :attr:`BaseAgent._route_key` hands :meth:`for_profile`).

        Unlike :meth:`describe` (the four tiers only) this also resolves an explicit
        ``model_endpoint`` key, so a profile- or endpoint-bound winner can be attributed a
        real model name in the trace and the Hall of Fame.  Offline returns the deterministic
        stub label so the demo still records a concrete, reproducible winning model."""
        if self.offline:
            return f"stub:{key} (deterministic)"
        return self._spec_for(key).model

    # ── internals ───────────────────────────────────────────────────────────

    def _build(self, profile: str) -> ModelProvider:
        """Construct the provider for *profile* (uncached; see :meth:`for_profile`).

        Offline always yields the deterministic stub.  Live, the resolved spec's
        ``kind`` picks the in-process provider (``"local"``) or the LiteLLM gateway
        (anything else).  Each resolution is logged via ``obs.log``.
        """
        if self.offline:
            obs.log("router.resolve", profile=profile, mode="offline", model=f"stub:{profile}")
            return DeterministicTinyModel(variant=f"stub:{profile}")

        spec = self._spec_for(profile)
        # Local in-process backend (ADR-0033): a transformers model on the host GPU, not an
        # HTTP endpoint — dispatch to the in-process provider before reaching for LiteLLM.
        if spec.kind == "local":
            from src.models.local_provider import LocalTransformersProvider

            obs.log("router.resolve", profile=profile, mode="live", model=spec.model, api_base="local")
            return LocalTransformersProvider(
                model=spec.model,
                temperature=spec.temperature,
                max_tokens=spec.max_tokens,
            )

        # Live HTTP transport is the LiteLLM gateway (ADR-0015).  Lazy-import keeps the
        # offline path free of the dependency.
        from src.models.litellm_provider import LiteLLMProvider

        # Resolution is logged WITHOUT the api key — only the model + endpoint.
        obs.log("router.resolve", profile=profile, mode="live", model=spec.model, api_base=spec.base_url or "")
        return LiteLLMProvider(
            model=spec.model,
            api_base=spec.base_url,
            api_key=spec.api_key,
            temperature=spec.temperature,
            max_tokens=spec.max_tokens,
        )

    def _spec_for(self, profile: str) -> ProfileSpec:
        """Resolve a route key to its :class:`ProfileSpec`.

        Order: an explicit ``specs`` entry; then, for a non-tier key, the catalogue
        binding; then the ``fast`` tier as the fallback for an unknown key; finally
        the tier's default model plus its decoding defaults.
        """
        if profile in self.specs:
            return self.specs[profile]
        # A key that is not one of the four tiers names a *specific* catalogue model
        # (an agent's ``model_endpoint``): resolve it to that model's live binding so
        # a cast can pin concrete Modal models (ADR-0022).  Only reached on the live
        # path — offline, ``_build`` serves the stub before this runs.
        if profile not in _PROFILE_DECODING:
            spec = self._catalogue_spec(profile)
            if spec is not None:
                return spec
            # Unknown non-tier key with no catalogue match → degrade to the fast tier
            # rather than crash ``resolve_model`` on a key it does not recognise.
            profile = "fast"
            # The fast tier may itself be explicitly bound.  Honour that binding so the
            # fallback lands on the same model/endpoint/credentials as a direct request
            # for ``fast`` instead of silently rebuilding the catalogue default.
            if profile in self.specs:
                return self.specs[profile]
        decoding = _PROFILE_DECODING[profile]
        return ProfileSpec(
            model=resolve_model(profile),  # type: ignore[arg-type]
            temperature=float(decoding["temperature"]),
            max_tokens=int(decoding["max_tokens"]),
        )

    def _catalogue_spec(self, key: str) -> ProfileSpec | None:
        """Build a :class:`ProfileSpec` from a catalogue endpoint *key*, or None.

        The key may name a model on *either* inference backend — a bare Modal endpoint
        slug (``gemma-4-12b``) or a backend-qualified key (``hf:Qwen/Qwen2.5-7B-Instruct``).
        The model string / endpoint URL / api key come from that backend's catalogue +
        env (``inference.binding_for``); decoding inherits the model's tier default (an
        unbound specialist model → the balanced tier).  Returns None when the key is in
        no catalogue, so the caller can fall back gracefully (ADR-0015 / ADR-0024).
        Any error raised while importing or querying the catalogue is also reported
        as None — this lookup is best-effort by design."""
        try:
            from src.models import inference

            entry = inference.entry_by_key(key)
            if entry is None:
                return None
            binding = inference.binding_for(key)
        except Exception:  # pragma: no cover - defensive: catalogue unavailable
            return None
        # A missing/empty or unrecognised tier on the entry falls back to balanced.
        decoding = _PROFILE_DECODING.get(entry.get("profile") or "balanced", _PROFILE_DECODING["balanced"])
        # The local backend runs in-process (no endpoint) — tag the spec so _build picks the
        # in-process provider; every other backend resolves to the HTTP (LiteLLM) transport.
        kind = "local" if entry.get("backend") == "local" else "litellm"
        return ProfileSpec(
            model=binding["model"],
            # Falsy values (e.g. an empty string) are normalised to None.
            base_url=binding["base_url"] or None,
            api_key=binding["api_key"] or None,
            temperature=float(decoding["temperature"]),
            max_tokens=int(decoding["max_tokens"]),
            kind=kind,
        )

    # ── factory ─────────────────────────────────────────────────────────────

    @classmethod
    def from_env(cls) -> "ModelRouter":
        """Build a live router from environment configuration.

        Each profile resolves to its concrete model via ``resolve_model`` plus the
        per-profile decoding defaults (``MODEL_TINY`` / … overrides honoured).  This
        always builds the live path; the deterministic stub is reached only by
        constructing ``ModelRouter(offline=True)`` explicitly (the test seam).
        """
        return cls(offline=False)