"""LiteLLM kwargs resolution for the model ids this agent accepts.

Kept separate from ``agent_loop`` so tools (research, context compaction,
etc.) can import it without pulling in the whole agent loop / tool router
and creating circular imports.
"""

import os

from agent.core.hf_tokens import (
    get_hf_bill_to,
    resolve_hf_router_token,
    resolve_hf_token,
)
from agent.core.local_models import (
    LOCAL_MODEL_API_KEY_DEFAULT,
    LOCAL_MODEL_API_KEY_ENV,
    LOCAL_MODEL_BASE_URL_ENV,
    is_reserved_local_model_id,
    local_model_name,
    local_model_provider,
)


def _resolve_hf_router_token(session_hf_token: str | None = None) -> str | None:
    """Private alias of ``resolve_hf_router_token``.

    Retained so tests and older imports that reference the underscore name
    keep working; it forwards its single argument unchanged.
    """
    return resolve_hf_router_token(session_hf_token)


def _patch_litellm_effort_validation() -> None:
    """Widen LiteLLM 1.83's Opus-4.6-only effort gate to newer Opus families.

    Background: at ``litellm/llms/anthropic/chat/transformation.py:~1443``
    the Anthropic adapter validates
    ``output_config.effort ∈ {high, medium, low, max}`` and gates ``max``
    behind an ``_is_opus_4_6_model`` check that only matches the substring
    ``opus-4-6`` / ``opus_4_6``. Consequences:

    * ``xhigh`` — valid on Anthropic's real API for Claude 4.7 — is rejected
      pre-flight with "Invalid effort value: xhigh".
    * ``max`` on Opus 4.7 is rejected with "effort='max' is only supported
      by Claude Opus 4.6", even though Opus 4.7 accepts it in practice.

    Rather than maintain a parallel model table, the Anthropic API itself is
    treated as the validator: ``AnthropicConfig._is_opus_4_6_model`` is
    replaced with a predicate that also matches the 4.7 and 4.8 spellings.
    If Anthropic rejects an effort level, the caller sees a 400 and the
    cascade walks down.

    NOTE(review): this function only swaps the ``_is_opus_4_6_model``
    predicate; it does not itself modify the accepted-effort set check
    described above — confirm whether that part is handled elsewhere.

    The patch is a silent no-op when LiteLLM (or the attribute) is missing,
    and it is applied at most once: the replacement carries an
    ``_hf_agent_patched`` marker that is checked before patching.

    Removable once litellm ships 1.83.8-stable (which merges PR #25867,
    "Litellm day 0 opus 4.7 support") — see commit 0868a82 on their main
    branch. Until then, this one-time patch is the escape hatch.
    """
    try:
        from litellm.llms.anthropic.chat import transformation as _transformation
    except Exception:
        # Best effort: without an importable adapter there is nothing to patch.
        return

    config_cls = getattr(_transformation, "AnthropicConfig", None)
    if config_cls is None:
        return

    current = getattr(config_cls, "_is_opus_4_6_model", None)
    if current is None or getattr(current, "_hf_agent_patched", False):
        # Either the hook is gone upstream or we already replaced it.
        return

    # Original 4.6 match plus the newer Opus spellings. The predicate only
    # has to return True for families where "max" / "xhigh" are acceptable
    # at the API; the cascade handles the case when they're not.
    markers = (
        "opus-4-6",
        "opus_4_6",
        "opus-4.6",
        "opus_4.6",
        "opus-4-7",
        "opus_4_7",
        "opus-4.7",
        "opus_4.7",
        "opus-4-8",
        "opus_4_8",
        "opus-4.8",
        "opus_4.8",
    )

    def _widened(model: str) -> bool:
        lowered = model.lower()
        for marker in markers:
            if marker in lowered:
                return True
        return False

    _widened._hf_agent_patched = True  # type: ignore[attr-defined]
    config_cls._is_opus_4_6_model = staticmethod(_widened)


# Applied once at import time so every later LiteLLM call sees the widened gate.
_patch_litellm_effort_validation()


# Effort levels accepted on the wire.
#   Anthropic (4.6+):   low | medium | high | xhigh | max  (output_config.effort)
#   OpenAI direct:      minimal | low | medium | high | xhigh  (reasoning_effort top-level)
#   HF router default:  low | medium | high  (extra_body.reasoning_effort)
# HF router premium user-billed overflow keeps the subsidized endpoint's
# provider-native effort set so overflow cannot silently lower or drop effort.
#
# We validate *shape* here and let the probe cascade walk down on rejection;
# we deliberately do NOT maintain a per-model capability table.
_ANTHROPIC_EFFORTS = {
    "low",
    "medium",
    "high",
    "xhigh",
    "max",
}
_OPENAI_EFFORTS = {
    "minimal",
    "low",
    "medium",
    "high",
    "xhigh",
}
_HF_EFFORTS = {
    "low",
    "medium",
    "high",
}

# Production still uses the historical subsidized endpoints while users are
# within their daily premium allowance. Once a session overflows to user
# billing, route the same logical model through the HF Inference Router/FAL
# with the user's OAuth token.
_USER_BILLED_HF_ROUTER_MODEL_BY_SUBSIDIZED_MODEL = { "bedrock/us.anthropic.claude-opus-4-6-v1": ( "huggingface/anthropic/claude-opus-4.6:fal-ai" ), "bedrock/us.anthropic.claude-opus-4-8": ( "huggingface/anthropic/claude-opus-4.8:fal-ai" ), "bedrock/us.anthropic.claude-sonnet-4-6": ( "huggingface/anthropic/claude-sonnet-4-6:fal-ai" ), "openai/gpt-5.5": "huggingface/openai/gpt-5.5:fal-ai", } _SUBSIDIZED_MODEL_BY_USER_BILLED_HF_ROUTER_MODEL = { user_billed: subsidized for subsidized, user_billed in _USER_BILLED_HF_ROUTER_MODEL_BY_SUBSIDIZED_MODEL.items() } def _hf_router_effort_spec( hf_model: str, bill_user: bool, reasoning_effort: str ) -> tuple[str, set[str], str]: """Return the effort value and accepted set for an HF-router call. Generic HF-router models use the router's low/medium/high contract. Premium overflow is different: the subsidized endpoint is the source of truth for the effective effort we cached on the session, so the user-billed FAL call must keep the same provider-native effort instead of silently dropping ``max``/``xhigh``. """ if bill_user and hf_model.startswith("anthropic/"): level = "low" if reasoning_effort == "minimal" else reasoning_effort return level, _ANTHROPIC_EFFORTS, "Anthropic" if bill_user and hf_model.startswith("openai/"): return reasoning_effort, _OPENAI_EFFORTS, "OpenAI" level = "low" if reasoning_effort == "minimal" else reasoning_effort return level, _HF_EFFORTS, "HF router" class UnsupportedEffortError(ValueError): """The requested effort isn't valid for this provider's API surface. Raised synchronously before any network call so the probe cascade can skip levels the provider can't accept (e.g. ``max`` on HF router). 
""" def _local_api_base(base_url: str) -> str: base = base_url.strip().rstrip("/") if base.endswith("/v1"): return base return f"{base}/v1" def _resolve_local_model_params( model_name: str, reasoning_effort: str | None = None, strict: bool = False, ) -> dict: if reasoning_effort and strict: raise UnsupportedEffortError( "Local OpenAI-compatible endpoints don't accept reasoning_effort" ) local_name = local_model_name(model_name) if local_name is None: raise ValueError(f"Unsupported local model id: {model_name}") provider = local_model_provider(model_name) assert provider is not None raw_base = ( os.environ.get(provider["base_url_env"]) or os.environ.get(LOCAL_MODEL_BASE_URL_ENV) or provider["base_url_default"] ) api_key = ( os.environ.get(provider["api_key_env"]) or os.environ.get(LOCAL_MODEL_API_KEY_ENV) or LOCAL_MODEL_API_KEY_DEFAULT ) return { "model": f"openai/{local_name}", "api_base": _local_api_base(raw_base), "api_key": api_key, } def _resolve_llm_params( model_name: str, session_hf_token: str | None = None, reasoning_effort: str | None = None, strict: bool = False, bill_to_user: bool = False, ) -> dict: """ Build LiteLLM kwargs for a given model id. • ``anthropic/`` — native thinking config. We bypass LiteLLM's ``reasoning_effort`` → ``thinking`` mapping (which lags new Claude releases like 4.7 and sends the wrong API shape). Instead we pass both ``thinking={"type": "adaptive"}`` and ``output_config= {"effort": }`` as top-level kwargs — LiteLLM's Anthropic adapter forwards unknown top-level kwargs into the request body verbatim (confirmed by live probe; ``extra_body`` does NOT work here because Anthropic's API rejects it as "Extra inputs are not permitted"). This is the stable API for 4.6 and 4.7. Older extended-thinking models that only accept ``thinking.type.enabled`` will reject this; the probe's cascade catches that and falls back to no thinking. • ``openai/`` — ``reasoning_effort`` forwarded as a top-level kwarg (GPT-5 / o-series). 
LiteLLM uses the user's ``OPENAI_API_KEY``. • ``ollama/``, ``vllm/``, ``lm_studio/``, and ``llamacpp/`` — local OpenAI-compatible endpoints. The id prefix selects a configurable localhost base URL, and the model suffix is sent to LiteLLM as ``openai/``. These endpoints don't receive ``reasoning_effort``. • Anything else is treated as a HuggingFace router id. We hit the auto-routing OpenAI-compatible endpoint at ``https://router.huggingface.co/v1``. The id can be bare or carry an HF routing suffix (``:fastest`` / ``:cheapest`` / ``:``). A leading ``huggingface/`` is stripped. ``reasoning_effort`` is forwarded via ``extra_body`` (LiteLLM's OpenAI adapter refuses it as a top-level kwarg for non-OpenAI models). "minimal" normalizes to "low". ``strict=True`` raises ``UnsupportedEffortError`` when the requested effort isn't in the provider's accepted set, instead of silently dropping it. The probe cascade uses strict mode so it can walk down (``max`` → ``xhigh`` → ``high`` …) without making an API call. Regular runtime callers leave ``strict=False``, so a stale cached effort can't crash a turn — it just doesn't get sent. Token precedence for HF-router calls (first non-empty wins): 1. INFERENCE_TOKEN env — shared key on the hosted Space (inference is free for users, billed to the Space owner via ``X-HF-Bill-To``). 2. session.hf_token — the user's own token (CLI / OAuth / cache file). 3. huggingface_hub cache — ``HF_TOKEN`` / ``HUGGING_FACE_HUB_TOKEN`` / local ``hf auth login`` cache. The production premium ids intentionally remain the old subsidized endpoints (Bedrock Claude, direct OpenAI GPT-5.5). Pass ``bill_to_user=True`` only after the daily subsidized allowance is spent; those ids then map to the HF Router/FAL ids and use the caller's own token (skipping ``INFERENCE_TOKEN`` and omitting ``X-HF-Bill-To``). 
""" if bill_to_user and ( user_billed_model := _USER_BILLED_HF_ROUTER_MODEL_BY_SUBSIDIZED_MODEL.get( model_name ) ): return _resolve_llm_params( user_billed_model, session_hf_token, reasoning_effort=reasoning_effort, strict=strict, bill_to_user=True, ) if not bill_to_user and ( subsidized_model := _SUBSIDIZED_MODEL_BY_USER_BILLED_HF_ROUTER_MODEL.get( model_name ) ): return _resolve_llm_params( subsidized_model, session_hf_token, reasoning_effort=reasoning_effort, strict=strict, bill_to_user=False, ) if model_name.startswith("anthropic/"): params: dict = {"model": model_name} if reasoning_effort: level = reasoning_effort if level == "minimal": level = "low" if level not in _ANTHROPIC_EFFORTS: if strict: raise UnsupportedEffortError( f"Anthropic doesn't accept effort={level!r}" ) else: # Adaptive thinking + output_config.effort is the stable # Anthropic API for Claude 4.6 / 4.7. Both kwargs are # passed top-level: LiteLLM forwards unknown params into # the request body for Anthropic, so ``output_config`` # reaches the API. ``extra_body`` does NOT work here — # Anthropic rejects it as "Extra inputs are not # permitted". params["thinking"] = {"type": "adaptive"} params["output_config"] = {"effort": level} return params if model_name.startswith("bedrock/"): # LiteLLM routes ``bedrock/...`` through the Converse adapter, which # picks up AWS credentials from the standard env vars # (``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` / ``AWS_REGION``). # The Anthropic thinking/effort shape is not forwarded through Converse # the same way, so we leave it off for now. 
return {"model": model_name} if model_name.startswith("openai/"): params = {"model": model_name} if reasoning_effort: if reasoning_effort not in _OPENAI_EFFORTS: if strict: raise UnsupportedEffortError( f"OpenAI doesn't accept effort={reasoning_effort!r}" ) else: params["reasoning_effort"] = reasoning_effort return params if is_reserved_local_model_id(model_name): raise ValueError(f"Unsupported local model id: {model_name}") if local_model_provider(model_name) is not None: return _resolve_local_model_params(model_name, reasoning_effort, strict) hf_model = model_name.removeprefix("huggingface/") # Premium models routed through the HF router (Anthropic Claude, OpenAI GPT) # can be billed to the *user's* own HF account instead of the Space: when # ``bill_to_user`` is set (the backend flips it on once a user is past their # subsidized daily allowance) use the caller's session token (never # INFERENCE_TOKEN) and omit X-HF-Bill-To so the spend lands on their wallet. # Otherwise — within the allowance, or for any free model — keep the # subsidized path. bill_user = bill_to_user and hf_model.startswith(("anthropic/", "openai/")) api_key = ( resolve_hf_token(session_hf_token, include_cached=False) if bill_user else _resolve_hf_router_token(session_hf_token) ) params = { "model": f"openai/{hf_model}", "api_base": "https://router.huggingface.co/v1", "api_key": api_key, } if not bill_user and (bill_to := get_hf_bill_to()): params["extra_headers"] = {"X-HF-Bill-To": bill_to} if reasoning_effort: hf_level, accepted_efforts, effort_owner = _hf_router_effort_spec( hf_model, bill_user, reasoning_effort ) if hf_level not in accepted_efforts: if strict: raise UnsupportedEffortError( f"{effort_owner} doesn't accept effort={hf_level!r}" ) else: params["extra_body"] = {"reasoning_effort": hf_level} return params