Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| """LiteLLM kwargs resolution for the model ids this agent accepts. | |
| Kept separate from ``agent_loop`` so tools (research, context compaction, etc.) | |
| can import it without pulling in the whole agent loop / tool router and | |
| creating circular imports. | |
| """ | |
| import os | |
| from agent.core.hf_tokens import ( | |
| get_hf_bill_to, | |
| resolve_hf_router_token, | |
| resolve_hf_token, | |
| ) | |
| from agent.core.local_models import ( | |
| LOCAL_MODEL_API_KEY_DEFAULT, | |
| LOCAL_MODEL_API_KEY_ENV, | |
| LOCAL_MODEL_BASE_URL_ENV, | |
| is_reserved_local_model_id, | |
| local_model_name, | |
| local_model_provider, | |
| ) | |
def _resolve_hf_router_token(session_hf_token: str | None = None) -> str | None:
    """Private alias of ``resolve_hf_router_token``.

    Kept so tests and older imports that reference the underscore-prefixed
    name keep working; it simply forwards ``session_hf_token`` and hands back
    whatever the public helper returns.
    """
    router_token = resolve_hf_router_token(session_hf_token)
    return router_token
def _patch_litellm_effort_validation() -> None:
    """Widen LiteLLM's Opus-4.6-only gate so newer Opus releases pass it.

    Background (LiteLLM 1.83, ``litellm/llms/anthropic/chat/transformation.py``):
    the Anthropic adapter only allows ``effort="max"`` when
    ``AnthropicConfig._is_opus_4_6_model`` returns True, and that predicate
    only recognises the ``opus-4-6`` / ``opus_4_6`` spellings. Opus 4.7
    therefore gets rejected pre-flight with "effort='max' is only supported by
    Claude Opus 4.6" even though the API accepts it.

    Rather than maintaining a parallel capability table, this replaces the
    predicate with one that also recognises the 4.7 and 4.8 spellings and
    leaves final validation to the Anthropic API: a rejected effort level
    comes back as a 400 and the probe cascade walks down.

    The patch is best-effort and idempotent:

    * if LiteLLM (or the expected class / attribute) is missing, nothing
      happens;
    * the replacement carries an ``_hf_agent_patched`` marker, so a second
      call is a no-op.

    NOTE(review): only the model predicate is replaced here; any separate
    effort-value validation inside LiteLLM is left untouched - confirm that
    is sufficient for ``xhigh``.

    Intended to be removed once LiteLLM ships 1.83.8-stable (PR #25867,
    "Litellm day 0 opus 4.7 support").
    """
    try:
        from litellm.llms.anthropic.chat import transformation as anthropic_transformation
    except Exception:
        # LiteLLM absent or broken at import time: nothing to patch.
        return

    config_cls = getattr(anthropic_transformation, "AnthropicConfig", None)
    if config_cls is None:
        return

    current_check = getattr(config_cls, "_is_opus_4_6_model", None)
    if current_check is None:
        return
    if getattr(current_check, "_hf_agent_patched", False):
        # Already widened by an earlier call.
        return

    # Accepted spellings per minor version: "opus-4-N", "opus-4.N",
    # "opus_4_N" and "opus_4.N" for N in 6, 7, 8.
    opus_markers = tuple(
        f"opus{sep}4{joint}{minor}"
        for minor in "678"
        for sep in "-_"
        for joint in (sep, ".")
    )

    def _widened(model: str) -> bool:
        # Case-insensitive substring match against every accepted spelling.
        lowered = model.lower()
        for marker in opus_markers:
            if marker in lowered:
                return True
        return False

    _widened._hf_agent_patched = True  # type: ignore[attr-defined]
    config_cls._is_opus_4_6_model = staticmethod(_widened)


# Apply the patch once, at import time.
_patch_litellm_effort_validation()
# Effort levels each provider accepts on the wire:
#   HF router default -> low | medium | high            (extra_body.reasoning_effort)
#   Anthropic (4.6+)  -> low | medium | high | xhigh | max  (output_config.effort)
#   OpenAI direct     -> minimal | low | medium | high | xhigh  (top-level reasoning_effort)
# Premium overflow billed to the user via the HF router keeps the subsidized
# endpoint's provider-native set, so overflowing never silently lowers or
# drops the effort.
#
# Only the *shape* is validated here; the probe cascade walks down when a
# provider rejects a level. There is deliberately no per-model capability
# table.
_HF_EFFORTS = {"low", "medium", "high"}
_ANTHROPIC_EFFORTS = _HF_EFFORTS | {"xhigh", "max"}
_OPENAI_EFFORTS = _HF_EFFORTS | {"minimal", "xhigh"}
# While a user is inside their daily premium allowance, production keeps
# using the historical subsidized endpoints. After a session overflows to
# user billing, the same logical model is reached through the HF Inference
# Router/FAL with the user's OAuth token. Each pair below is
# (subsidized id, user-billed HF router id).
_USER_BILLED_HF_ROUTER_MODEL_BY_SUBSIDIZED_MODEL = dict(
    (
        (
            "bedrock/us.anthropic.claude-opus-4-6-v1",
            "huggingface/anthropic/claude-opus-4.6:fal-ai",
        ),
        (
            "bedrock/us.anthropic.claude-opus-4-8",
            "huggingface/anthropic/claude-opus-4.8:fal-ai",
        ),
        (
            "bedrock/us.anthropic.claude-sonnet-4-6",
            "huggingface/anthropic/claude-sonnet-4-6:fal-ai",
        ),
        ("openai/gpt-5.5", "huggingface/openai/gpt-5.5:fal-ai"),
    )
)

# Reverse lookup: user-billed HF router id -> subsidized id.
_SUBSIDIZED_MODEL_BY_USER_BILLED_HF_ROUTER_MODEL = dict(
    zip(
        _USER_BILLED_HF_ROUTER_MODEL_BY_SUBSIDIZED_MODEL.values(),
        _USER_BILLED_HF_ROUTER_MODEL_BY_SUBSIDIZED_MODEL.keys(),
    )
)
def _hf_router_effort_spec(
    hf_model: str, bill_user: bool, reasoning_effort: str
) -> tuple[str, set[str], str]:
    """Pick the effort value, accepted set and owner label for an HF-router call.

    Ordinary HF-router models follow the router's low/medium/high contract.
    Premium overflow (``bill_user`` truthy) is the exception: the subsidized
    endpoint determined the effort cached on the session, so the user-billed
    call keeps that provider's native effort set instead of silently losing
    ``max`` / ``xhigh``.

    Args:
        hf_model: Router model id with any ``huggingface/`` prefix already
            removed; only its ``anthropic/`` / ``openai/`` prefix is inspected.
        bill_user: Whether this call is billed to the user's own account.
        reasoning_effort: Requested effort level.

    Returns:
        ``(effort, accepted_efforts, owner)`` where ``owner`` is the provider
        name used in error messages. ``"minimal"`` is mapped to ``"low"``
        everywhere except the user-billed OpenAI case, which passes the
        requested value through unchanged.
    """
    normalized = "low" if reasoning_effort == "minimal" else reasoning_effort
    if bill_user:
        if hf_model.startswith("anthropic/"):
            return normalized, _ANTHROPIC_EFFORTS, "Anthropic"
        if hf_model.startswith("openai/"):
            # OpenAI accepts "minimal" natively, so it is not rewritten.
            return reasoning_effort, _OPENAI_EFFORTS, "OpenAI"
    return normalized, _HF_EFFORTS, "HF router"
class UnsupportedEffortError(ValueError):
    """Signal that a requested effort level is outside a provider's accepted set.

    Raised synchronously, before any network request is made, so the probe
    cascade can skip levels a provider cannot take (for example ``max`` on
    the HF router) without spending an API call.
    """
def _local_api_base(base_url: str) -> str:
    """Normalize a local server URL so it ends in exactly one ``/v1``.

    Surrounding whitespace and trailing slashes are removed first; ``/v1`` is
    appended only when the cleaned URL does not already end with it.
    """
    trimmed = base_url.strip().rstrip("/")
    suffix = "" if trimmed.endswith("/v1") else "/v1"
    return trimmed + suffix
def _resolve_local_model_params(
    model_name: str,
    reasoning_effort: str | None = None,
    strict: bool = False,
) -> dict:
    """Build LiteLLM kwargs for a local OpenAI-compatible endpoint.

    Args:
        model_name: Prefixed local model id; the prefix selects the provider
            and the remainder is the model name sent to the server.
        reasoning_effort: Requested effort level. Local endpoints never
            receive it: it is dropped, or rejected when ``strict`` is set.
        strict: When True, a non-empty ``reasoning_effort`` raises instead of
            being dropped.

    Returns:
        A dict with ``model`` (``openai/<name>``), ``api_base`` (normalized to
        end in ``/v1``) and ``api_key``.

    Raises:
        UnsupportedEffortError: ``strict`` is set and an effort was requested.
        ValueError: ``model_name`` does not resolve to a local model name or
            to a known local provider.
    """
    if reasoning_effort and strict:
        raise UnsupportedEffortError(
            "Local OpenAI-compatible endpoints don't accept reasoning_effort"
        )

    local_name = local_model_name(model_name)
    if local_name is None:
        raise ValueError(f"Unsupported local model id: {model_name}")

    provider = local_model_provider(model_name)
    if provider is None:
        # Explicit check instead of ``assert``: assertions are stripped under
        # ``python -O``, which would turn this into an opaque TypeError on
        # ``None[...]`` below.
        raise ValueError(f"Unsupported local model id: {model_name}")

    # Precedence: provider-specific env var, then the shared env var, then
    # the built-in default. Empty env values fall through to the next source.
    raw_base = (
        os.environ.get(provider["base_url_env"])
        or os.environ.get(LOCAL_MODEL_BASE_URL_ENV)
        or provider["base_url_default"]
    )
    api_key = (
        os.environ.get(provider["api_key_env"])
        or os.environ.get(LOCAL_MODEL_API_KEY_ENV)
        or LOCAL_MODEL_API_KEY_DEFAULT
    )

    return {
        "model": f"openai/{local_name}",
        "api_base": _local_api_base(raw_base),
        "api_key": api_key,
    }
def _resolve_llm_params(
    model_name: str,
    session_hf_token: str | None = None,
    reasoning_effort: str | None = None,
    strict: bool = False,
    bill_to_user: bool = False,
) -> dict:
    """Translate an agent model id into the kwargs handed to LiteLLM.

    Routing is decided by the shape of ``model_name``, checked in this order:

    * Premium remap. The production premium ids stay on the old subsidized
      endpoints (Bedrock Claude, direct OpenAI GPT-5.5). With
      ``bill_to_user=True`` - meant to be passed only after the daily
      subsidized allowance is spent - such an id is swapped for its HF
      Router/FAL counterpart; with ``bill_to_user=False`` a user-billed router
      id is swapped back to the subsidized one. Resolution then restarts with
      the swapped id.
    * ``anthropic/<model>`` - native thinking config. LiteLLM's own
      ``reasoning_effort`` -> ``thinking`` mapping is bypassed (it lags new
      Claude releases such as 4.7 and sends the wrong API shape). Instead
      ``thinking={"type": "adaptive"}`` and ``output_config={"effort": ...}``
      are passed as top-level kwargs, which LiteLLM's Anthropic adapter
      forwards into the request body (``extra_body`` does NOT work: Anthropic
      rejects it as "Extra inputs are not permitted"). Older extended-thinking
      models that only accept ``thinking.type.enabled`` reject this shape; the
      probe cascade catches that and falls back to no thinking.
    * ``bedrock/<model>`` - only the model id is returned; AWS credentials
      come from the standard environment variables and no effort is sent.
    * ``openai/<model>`` - ``reasoning_effort`` is forwarded as a top-level
      kwarg (GPT-5 / o-series); LiteLLM uses the user's ``OPENAI_API_KEY``.
    * ``ollama/<model>``, ``vllm/<model>``, ``lm_studio/<model>`` and
      ``llamacpp/<model>`` - local OpenAI-compatible endpoints. The prefix
      selects a configurable localhost base URL and the suffix is sent to
      LiteLLM as ``openai/<model>``. They never receive ``reasoning_effort``.
    * Anything else - a HuggingFace router id, served from the
      OpenAI-compatible endpoint ``https://router.huggingface.co/v1``. The id
      may be bare or carry a routing suffix (``:fastest`` / ``:cheapest`` /
      ``:<provider>``); a leading ``huggingface/`` is stripped.
      ``reasoning_effort`` travels in ``extra_body`` because LiteLLM's OpenAI
      adapter refuses it as a top-level kwarg for non-OpenAI models, and
      ``"minimal"`` is normalized to ``"low"``.

    Token precedence for subsidized HF-router calls (first non-empty wins):

    1. ``INFERENCE_TOKEN`` env - shared key on the hosted Space (inference is
       free for users, billed to the Space owner via ``X-HF-Bill-To``).
    2. ``session.hf_token`` - the user's own token (CLI / OAuth / cache file).
    3. huggingface_hub cache - ``HF_TOKEN`` / ``HUGGING_FACE_HUB_TOKEN`` /
       local ``hf auth login`` cache.

    User-billed router calls use the caller's own token instead (skipping
    ``INFERENCE_TOKEN``) and omit ``X-HF-Bill-To``.

    Args:
        model_name: Model id accepted by the agent.
        session_hf_token: The session's HF token, if any.
        reasoning_effort: Requested effort level; falsy means "send none".
        strict: When True, an effort outside the provider's accepted set
            raises instead of being dropped. The probe cascade uses this to
            walk down (``max`` -> ``xhigh`` -> ``high`` ...) without an API
            call. Regular runtime callers leave it False so a stale cached
            effort cannot crash a turn - it simply is not sent.
        bill_to_user: Whether premium usage should be billed to the user.

    Returns:
        The kwargs dict for the LiteLLM call.

    Raises:
        UnsupportedEffortError: ``strict`` is set and the effort is not
            accepted by the selected provider.
        ValueError: ``model_name`` is a reserved or unsupported local id.
    """
    # --- Premium remap between subsidized and user-billed ids -------------
    if bill_to_user:
        overflow_id = _USER_BILLED_HF_ROUTER_MODEL_BY_SUBSIDIZED_MODEL.get(model_name)
        if overflow_id:
            return _resolve_llm_params(
                overflow_id,
                session_hf_token,
                reasoning_effort=reasoning_effort,
                strict=strict,
                bill_to_user=True,
            )
    else:
        subsidized_id = _SUBSIDIZED_MODEL_BY_USER_BILLED_HF_ROUTER_MODEL.get(model_name)
        if subsidized_id:
            return _resolve_llm_params(
                subsidized_id,
                session_hf_token,
                reasoning_effort=reasoning_effort,
                strict=strict,
                bill_to_user=False,
            )

    # --- Direct Anthropic -------------------------------------------------
    if model_name.startswith("anthropic/"):
        anthropic_kwargs: dict = {"model": model_name}
        if not reasoning_effort:
            return anthropic_kwargs
        effort = "low" if reasoning_effort == "minimal" else reasoning_effort
        if effort in _ANTHROPIC_EFFORTS:
            # Adaptive thinking plus output_config.effort is the stable
            # Anthropic shape for Claude 4.6 / 4.7. Both go top-level so
            # LiteLLM forwards them into the request body; ``extra_body``
            # is rejected by Anthropic ("Extra inputs are not permitted").
            anthropic_kwargs["thinking"] = {"type": "adaptive"}
            anthropic_kwargs["output_config"] = {"effort": effort}
        elif strict:
            raise UnsupportedEffortError(
                f"Anthropic doesn't accept effort={effort!r}"
            )
        return anthropic_kwargs

    # --- Bedrock ----------------------------------------------------------
    if model_name.startswith("bedrock/"):
        # LiteLLM sends ``bedrock/...`` through the Converse adapter, which
        # reads AWS credentials from the usual env vars
        # (``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` / ``AWS_REGION``).
        # The Anthropic thinking/effort shape is not forwarded through
        # Converse the same way, so no effort is attached for now.
        return {"model": model_name}

    # --- Direct OpenAI ----------------------------------------------------
    if model_name.startswith("openai/"):
        openai_kwargs: dict = {"model": model_name}
        if reasoning_effort:
            if reasoning_effort in _OPENAI_EFFORTS:
                openai_kwargs["reasoning_effort"] = reasoning_effort
            elif strict:
                raise UnsupportedEffortError(
                    f"OpenAI doesn't accept effort={reasoning_effort!r}"
                )
        return openai_kwargs

    # --- Local OpenAI-compatible servers ----------------------------------
    if is_reserved_local_model_id(model_name):
        raise ValueError(f"Unsupported local model id: {model_name}")
    local_provider = local_model_provider(model_name)
    if local_provider is not None:
        return _resolve_local_model_params(model_name, reasoning_effort, strict)

    # --- HuggingFace router (everything else) -----------------------------
    router_model = model_name.removeprefix("huggingface/")

    # Premium router models (Anthropic Claude, OpenAI GPT) can be billed to
    # the user's own HF account instead of the Space. When ``bill_to_user``
    # is set (the backend turns it on once a user is past the subsidized
    # daily allowance), the caller's session token is used - never
    # INFERENCE_TOKEN - and X-HF-Bill-To is omitted so the spend lands on
    # their wallet. Within the allowance, or for any free model, the
    # subsidized path is kept.
    charge_caller = bill_to_user and router_model.startswith(("anthropic/", "openai/"))
    if charge_caller:
        router_token = resolve_hf_token(session_hf_token, include_cached=False)
    else:
        router_token = _resolve_hf_router_token(session_hf_token)

    router_kwargs: dict = {
        "model": f"openai/{router_model}",
        "api_base": "https://router.huggingface.co/v1",
        "api_key": router_token,
    }

    if not charge_caller:
        bill_to = get_hf_bill_to()
        if bill_to:
            router_kwargs["extra_headers"] = {"X-HF-Bill-To": bill_to}

    if not reasoning_effort:
        return router_kwargs

    router_effort, allowed_efforts, effort_owner = _hf_router_effort_spec(
        router_model, charge_caller, reasoning_effort
    )
    if router_effort in allowed_efforts:
        router_kwargs["extra_body"] = {"reasoning_effort": router_effort}
    elif strict:
        raise UnsupportedEffortError(
            f"{effort_owner} doesn't accept effort={router_effort!r}"
        )
    return router_kwargs