Spaces:
Running
Running
| """LiteLLM kwargs resolution for the model ids this agent accepts. | |
| Kept separate from ``agent_loop`` so tools (research, context compaction, etc.) | |
| can import it without pulling in the whole agent loop / tool router and | |
| creating circular imports. | |
| """ | |
| import os | |
| from agent.core.hf_tokens import resolve_hf_router_token | |
| from agent.core.local_models import ( | |
| LOCAL_MODEL_API_KEY_DEFAULT, | |
| LOCAL_MODEL_API_KEY_ENV, | |
| LOCAL_MODEL_BASE_URL_ENV, | |
| is_reserved_local_model_id, | |
| local_model_name, | |
| local_model_provider, | |
| ) | |
| from agent.core.model_ids import ( | |
| HF_ROUTER_BASE_URL, | |
| strip_huggingface_model_prefix, | |
| ) | |
def _resolve_hf_router_token(session_hf_token: str | None = None) -> str | None:
    """Delegate to ``resolve_hf_router_token`` and return its result.

    Private alias kept for backward compatibility: tests and older imports
    still reference this underscore-prefixed name.
    """
    token = resolve_hf_router_token(session_hf_token)
    return token
| # Effort levels accepted on the wire. | |
| # HF Router exposes reasoning controls through the OpenAI-compatible | |
| # ``extra_body`` field. The probe cascade walks down when a provider rejects | |
| # an accepted-looking value, so this stays intentionally small and generic. | |
| _HF_EFFORTS = {"low", "medium", "high"} | |
| def _hf_router_effort_level(reasoning_effort: str) -> str: | |
| level = "low" if reasoning_effort == "minimal" else reasoning_effort | |
| return level | |
class UnsupportedEffortError(ValueError):
    """Signal that an effort level is not valid for the provider's API surface.

    The error is raised synchronously, before any network call is made, so
    the probe cascade can skip levels the provider can't accept (for example
    ``max`` on HF router).
    """
| def _local_api_base(base_url: str) -> str: | |
| base = base_url.strip().rstrip("/") | |
| if base.endswith("/v1"): | |
| return base | |
| return f"{base}/v1" | |
def _resolve_local_model_params(
    model_name: str,
    reasoning_effort: str | None = None,
    strict: bool = False,
) -> dict:
    """Build LiteLLM kwargs for a local OpenAI-compatible endpoint.

    Args:
        model_name: Provider-prefixed local model id; it is resolved through
            ``local_model_name`` and ``local_model_provider``.
        reasoning_effort: Requested effort level. It is never forwarded to a
            local endpoint: the returned dict has no effort entry.
        strict: When true, a non-empty ``reasoning_effort`` raises instead of
            being silently ignored.

    Returns:
        A dict with ``model`` (``openai/<local name>``), ``api_base`` (the
        configured base URL normalized to end in ``/v1``) and ``api_key``.

    Raises:
        UnsupportedEffortError: ``strict`` is true and ``reasoning_effort``
            is non-empty.
        ValueError: ``model_name`` does not resolve to a local model name or
            to a known local provider.
    """
    if reasoning_effort and strict:
        raise UnsupportedEffortError(
            "Local OpenAI-compatible endpoints don't accept reasoning_effort"
        )

    local_name = local_model_name(model_name)
    if local_name is None:
        raise ValueError(f"Unsupported local model id: {model_name}")

    provider = local_model_provider(model_name)
    if provider is None:
        # Explicit raise rather than ``assert``: assertions are stripped under
        # ``python -O``, which would turn this into an opaque TypeError when
        # ``provider`` is subscripted below.
        raise ValueError(f"Unsupported local model id: {model_name}")

    # First non-empty value wins: provider-specific env var, then the generic
    # env var, then the built-in default. Empty env values fall through
    # because of the ``or`` chain.
    raw_base = (
        os.environ.get(provider["base_url_env"])
        or os.environ.get(LOCAL_MODEL_BASE_URL_ENV)
        or provider["base_url_default"]
    )
    api_key = (
        os.environ.get(provider["api_key_env"])
        or os.environ.get(LOCAL_MODEL_API_KEY_ENV)
        or LOCAL_MODEL_API_KEY_DEFAULT
    )
    return {
        "model": f"openai/{local_name}",
        "api_base": _local_api_base(raw_base),
        "api_key": api_key,
    }
def _resolve_llm_params(
    model_name: str,
    session_hf_token: str | None = None,
    reasoning_effort: str | None = None,
    strict: bool = False,
) -> dict:
    """
    Resolve a model id into the kwargs LiteLLM needs to call it.

    • ``ollama/<model>``, ``vllm/<model>``, ``lm_studio/<model>``, and
      ``llamacpp/<model>`` are local OpenAI-compatible endpoints. The prefix
      picks a configurable localhost base URL and the suffix is passed to
      LiteLLM as ``openai/<model>``. ``reasoning_effort`` is not sent to
      these endpoints.

    • Every other id is handled as an HF Router id and goes to the
      auto-routing OpenAI-compatible endpoint at
      ``https://router.huggingface.co/v1``. The id may be bare or carry an
      HF routing suffix (``:fastest`` / ``:cheapest`` / ``:<provider>``).
      A leading ``huggingface/`` is stripped. ``reasoning_effort`` travels
      in ``extra_body``, with "minimal" normalized to "low".

    With ``strict=True`` an effort outside the provider's accepted set
    raises ``UnsupportedEffortError`` rather than being dropped silently.
    The probe cascade relies on that to walk down (``max`` → ``xhigh`` →
    ``high`` …) without making an API call. Regular runtime callers keep
    ``strict=False`` so a stale cached effort can't crash a turn — it is
    simply not sent.

    Token precedence for HF-router calls (first non-empty wins):
      1. session.hf_token — the user's own token (CLI / OAuth / cache file).
      2. huggingface_hub cache — ``HF_TOKEN`` / ``HUGGING_FACE_HUB_TOKEN`` /
         local ``hf auth login`` cache.
    """
    model_id = strip_huggingface_model_prefix(model_name) or model_name

    # Reserved local ids are rejected before any provider lookup.
    if is_reserved_local_model_id(model_id):
        raise ValueError(f"Unsupported local model id: {model_id}")

    # A recognized local provider prefix means the local resolver takes over.
    if local_model_provider(model_id) is not None:
        return _resolve_local_model_params(model_id, reasoning_effort, strict)

    # Everything else goes through HF Router.
    token = _resolve_hf_router_token(session_hf_token)
    params = {
        "model": f"openai/{model_id}",
        "api_base": HF_ROUTER_BASE_URL,
        "api_key": token,
    }

    if not reasoning_effort:
        return params

    level = _hf_router_effort_level(reasoning_effort)
    if level in _HF_EFFORTS:
        params["extra_body"] = {"reasoning_effort": level}
    elif strict:
        raise UnsupportedEffortError(
            f"HF Router doesn't accept effort={level!r}"
        )
    # Non-strict with an unsupported level: the effort is dropped, not sent.
    return params