"""LiteLLM kwargs resolution for the model ids this agent accepts.
Kept separate from ``agent_loop`` so tools (research, context compaction, etc.)
can import it without pulling in the whole agent loop / tool router and
creating circular imports.
"""
import os
from agent.core.hf_tokens import resolve_hf_router_token
from agent.core.local_models import (
LOCAL_MODEL_API_KEY_DEFAULT,
LOCAL_MODEL_API_KEY_ENV,
LOCAL_MODEL_BASE_URL_ENV,
is_reserved_local_model_id,
local_model_name,
local_model_provider,
)
from agent.core.model_ids import (
HF_ROUTER_BASE_URL,
strip_huggingface_model_prefix,
)
def _resolve_hf_router_token(session_hf_token: str | None = None) -> str | None:
    """Private alias of ``resolve_hf_router_token``.

    Retained for backward compatibility: tests and older imports still refer
    to the underscore-prefixed name. The argument is forwarded unchanged and
    the helper's result is returned as-is.
    """
    token = resolve_hf_router_token(session_hf_token)
    return token
# Effort levels accepted on the wire.
# HF Router exposes reasoning controls through the OpenAI-compatible
# ``extra_body`` field. The probe cascade walks down when a provider rejects
# an accepted-looking value, so this stays intentionally small and generic.
_HF_EFFORTS = {"low", "medium", "high"}
def _hf_router_effort_level(reasoning_effort: str) -> str:
level = "low" if reasoning_effort == "minimal" else reasoning_effort
return level
class UnsupportedEffortError(ValueError):
    """Signal that an effort level is not valid for the provider's API surface.

    It is raised synchronously, before any network call is made, which lets
    the probe cascade skip levels the provider can't accept (for example
    ``max`` on HF router).
    """
def _local_api_base(base_url: str) -> str:
base = base_url.strip().rstrip("/")
if base.endswith("/v1"):
return base
return f"{base}/v1"
def _resolve_local_model_params(
    model_name: str,
    reasoning_effort: str | None = None,
    strict: bool = False,
) -> dict:
    """Build LiteLLM kwargs for a local OpenAI-compatible endpoint.

    Args:
        model_name: Local model id. The value returned by
            ``local_model_name`` for it is sent as ``openai/<name>``.
        reasoning_effort: Requested effort. It is never forwarded to local
            endpoints.
        strict: When true, a non-empty ``reasoning_effort`` raises instead of
            being silently dropped.

    Returns:
        A dict with ``model``, ``api_base`` and ``api_key`` keys.

    Raises:
        UnsupportedEffortError: ``strict`` is set and ``reasoning_effort`` is
            non-empty.
        ValueError: ``local_model_name`` or ``local_model_provider`` returns
            ``None`` for ``model_name``.
    """
    if reasoning_effort and strict:
        raise UnsupportedEffortError(
            "Local OpenAI-compatible endpoints don't accept reasoning_effort"
        )

    local_name = local_model_name(model_name)
    if local_name is None:
        raise ValueError(f"Unsupported local model id: {model_name}")

    provider = local_model_provider(model_name)
    if provider is None:
        # Explicit raise instead of ``assert`` so the check survives
        # ``python -O`` and fails with the same error as the lookup above.
        raise ValueError(f"Unsupported local model id: {model_name}")

    # Precedence: provider-specific env var, then the shared env var, then
    # the built-in default. Empty env values fall through to the next source.
    raw_base = (
        os.environ.get(provider["base_url_env"])
        or os.environ.get(LOCAL_MODEL_BASE_URL_ENV)
        or provider["base_url_default"]
    )
    api_key = (
        os.environ.get(provider["api_key_env"])
        or os.environ.get(LOCAL_MODEL_API_KEY_ENV)
        or LOCAL_MODEL_API_KEY_DEFAULT
    )
    return {
        "model": f"openai/{local_name}",
        "api_base": _local_api_base(raw_base),
        "api_key": api_key,
    }
def _resolve_llm_params(
    model_name: str,
    session_hf_token: str | None = None,
    reasoning_effort: str | None = None,
    strict: bool = False,
) -> dict:
    """Return the LiteLLM kwargs to use for ``model_name``.

    Two families of model id are handled:

    - ``ollama/<model>``, ``vllm/<model>``, ``lm_studio/<model>`` and
      ``llamacpp/<model>`` are local OpenAI-compatible endpoints. The prefix
      picks a configurable localhost base URL and the suffix is passed to
      LiteLLM as ``openai/<model>``. ``reasoning_effort`` is not sent to
      these endpoints.
    - Every other id is treated as an HF Router id and goes to the
      auto-routing OpenAI-compatible endpoint at
      ``https://router.huggingface.co/v1``. The id may be bare or carry an HF
      routing suffix (``:fastest`` / ``:cheapest`` / ``:<provider>``); a
      leading ``huggingface/`` is stripped. ``reasoning_effort`` travels in
      ``extra_body``, with "minimal" normalized to "low".

    With ``strict=True`` an effort outside the provider's accepted set raises
    ``UnsupportedEffortError`` rather than being dropped silently. The probe
    cascade relies on this to walk down (``max`` → ``xhigh`` → ``high`` …)
    without making an API call. Regular runtime callers keep
    ``strict=False`` so a stale cached effort can't crash a turn — it simply
    isn't sent.

    Token precedence for HF-router calls (first non-empty wins):

    1. session.hf_token — the user's own token (CLI / OAuth / cache file).
    2. huggingface_hub cache — ``HF_TOKEN`` / ``HUGGING_FACE_HUB_TOKEN`` /
       local ``hf auth login`` cache.
    """
    normalized_model = strip_huggingface_model_prefix(model_name) or model_name

    # Reserved local ids are rejected before any provider lookup.
    if is_reserved_local_model_id(normalized_model):
        raise ValueError(f"Unsupported local model id: {normalized_model}")

    # Ids with a known local provider are resolved by the local helper.
    if local_model_provider(normalized_model) is not None:
        return _resolve_local_model_params(normalized_model, reasoning_effort, strict)

    # Everything else is routed through HF Router.
    hf_model = normalized_model
    llm_kwargs = {
        "model": f"openai/{hf_model}",
        "api_base": HF_ROUTER_BASE_URL,
        "api_key": _resolve_hf_router_token(session_hf_token),
    }
    if not reasoning_effort:
        return llm_kwargs

    hf_level = _hf_router_effort_level(reasoning_effort)
    if hf_level in _HF_EFFORTS:
        llm_kwargs["extra_body"] = {"reasoning_effort": hf_level}
    elif strict:
        raise UnsupportedEffortError(
            f"HF Router doesn't accept effort={hf_level!r}"
        )
    # Non-strict with an unaccepted level: the effort is left out entirely.
    return llm_kwargs
|