Spaces:
Paused
Paused
| """API error classification for smart failover and recovery. | |
| Provides a structured taxonomy of API errors and a priority-ordered | |
| classification pipeline that determines the correct recovery action | |
| (retry, rotate credential, fallback to another provider, compress | |
| context, or abort). | |
| Replaces scattered inline string-matching with a centralized classifier | |
| that the main retry loop in run_agent.py consults for every API failure. | |
| """ | |
| from __future__ import annotations | |
| import enum | |
| import logging | |
| import re | |
| from dataclasses import dataclass, field | |
| from typing import Any, Dict, Optional | |
| logger = logging.getLogger(__name__) | |
| # ── Error taxonomy ────────────────────────────────────────────────────── | |
class FailoverReason(enum.Enum):
    """Root cause assigned to a failed API call.

    The retry loop chooses its recovery strategy from this value. Every
    member's value is the member name as a plain string.
    """

    # -- Credentials --------------------------------------------------------
    # 401/403 that may clear after a refresh or a credential rotation.
    auth = "auth"
    # Authentication still failing after a refresh: abort.
    auth_permanent = "auth_permanent"

    # -- Billing / quota ----------------------------------------------------
    # 402 or confirmed credit exhaustion: rotate immediately.
    billing = "billing"
    # 429 or quota-based throttling: back off, then rotate.
    rate_limit = "rate_limit"

    # -- Server side --------------------------------------------------------
    # 503/529: provider overloaded, back off.
    overloaded = "overloaded"
    # 500/502: internal server error, retry.
    server_error = "server_error"

    # -- Transport ----------------------------------------------------------
    # Connection or read timeout: rebuild the client and retry.
    timeout = "timeout"

    # -- Context / payload --------------------------------------------------
    # Context too large: compress rather than fail over.
    context_overflow = "context_overflow"
    # 413: compress the payload.
    payload_too_large = "payload_too_large"

    # -- Model --------------------------------------------------------------
    # 404 or invalid model: fall back to a different model.
    model_not_found = "model_not_found"

    # -- Request format -----------------------------------------------------
    # 400 bad request: abort, or strip and retry.
    format_error = "format_error"

    # -- Provider specific --------------------------------------------------
    # Anthropic thinking-block signature invalid.
    thinking_signature = "thinking_signature"
    # Anthropic "extra usage" tier gate.
    long_context_tier = "long_context_tier"

    # -- Catch-all ----------------------------------------------------------
    # Unclassifiable: retry with backoff.
    unknown = "unknown"
| # ── Classification result ─────────────────────────────────────────────── | |
@dataclass
class ClassifiedError:
    """Structured classification of an API error with recovery hints.

    Instances are built with keyword arguments and rely on
    ``field(default_factory=dict)``, so the class has to be a dataclass:
    without the decorator there is no generated ``__init__`` and
    ``error_context`` would be a raw ``Field`` object rather than a fresh
    dict per instance.
    """

    # Why the call failed; drives the recovery strategy.
    reason: FailoverReason
    # HTTP status code when one could be extracted, otherwise None.
    status_code: Optional[int] = None
    # Provider name and model slug supplied by the caller of the classifier.
    provider: Optional[str] = None
    model: Optional[str] = None
    # Human-readable error message.
    message: str = ""
    # Free-form extra details; a new dict is created for every instance.
    error_context: Dict[str, Any] = field(default_factory=dict)

    # Recovery action hints — the retry loop checks these instead of
    # re-classifying the error itself.
    retryable: bool = True
    should_compress: bool = False
    should_rotate_credential: bool = False
    should_fallback: bool = False

    # NOTE(review): kept as a plain method because that is how it appears in
    # the visible source; confirm that callers invoke it as ``is_auth()``.
    def is_auth(self) -> bool:
        """Return True when the reason is ``auth`` or ``auth_permanent``."""
        return self.reason in (FailoverReason.auth, FailoverReason.auth_permanent)
| # ── Provider-specific patterns ────────────────────────────────────────── | |
# All tables below hold lower-case substrings. The classifier lower-cases the
# combined error text before testing membership with ``in``, so every entry
# must stay lower-case to be matchable.

# Patterns that indicate billing exhaustion (not transient rate limit)
_BILLING_PATTERNS = [
    "insufficient credits",
    "insufficient_quota",
    "credit balance",
    "credits have been exhausted",
    "top up your credits",
    "payment required",
    "billing hard limit",
    "exceeded your current quota",
    "account is deactivated",
    "plan does not include",
]

# Patterns that indicate rate limiting (transient, will resolve)
_RATE_LIMIT_PATTERNS = [
    "rate limit",
    "rate_limit",
    "too many requests",
    "throttled",
    "requests per minute",
    "tokens per minute",
    "requests per day",
    "try again in",
    "please retry after",
    "resource_exhausted",
    "rate increased too quickly",  # Alibaba/DashScope throttling
]

# Usage-limit patterns that need disambiguation (could be billing OR rate_limit).
# Consulted by the 402 handler and by the message-only classifier, each of
# which pairs a hit here with _USAGE_LIMIT_TRANSIENT_SIGNALS to decide.
_USAGE_LIMIT_PATTERNS = [
    "usage limit",
    "quota",
    "limit exceeded",
    "key limit exceeded",
]

# Patterns confirming usage limit is transient (not billing)
_USAGE_LIMIT_TRANSIENT_SIGNALS = [
    "try again",
    "retry",
    "resets at",
    "reset in",
    "wait",
    "requests remaining",
    "periodic",
    "window",
]

# Payload-too-large patterns detected from message text (no status_code attr).
# Proxies and some backends embed the HTTP status in the error message.
_PAYLOAD_TOO_LARGE_PATTERNS = [
    "request entity too large",
    "payload too large",
    "error code: 413",
]

# Context overflow patterns
_CONTEXT_OVERFLOW_PATTERNS = [
    "context length",
    "context size",
    "maximum context",
    "token limit",
    "too many tokens",
    "reduce the length",
    "exceeds the limit",
    "context window",
    "prompt is too long",
    "prompt exceeds max length",
    "max_tokens",
    "maximum number of tokens",
    # Chinese error messages (some providers return these)
    "超过最大长度",  # "exceeds maximum length"
    "上下文长度",  # "context length"
]

# Model not found patterns
_MODEL_NOT_FOUND_PATTERNS = [
    "is not a valid model",
    "invalid model",
    "model not found",
    "model_not_found",
    "does not exist",
    "no such model",
    "unknown model",
    "unsupported model",
]

# Auth patterns (non-status-code signals)
_AUTH_PATTERNS = [
    "invalid api key",
    "invalid_api_key",
    "authentication",
    "unauthorized",
    "forbidden",
    "invalid token",
    "token expired",
    "token revoked",
    "access denied",
]

# Anthropic thinking block signature patterns.
# NOTE(review): not referenced by the code visible in this file — the
# classifier tests the literals "signature" and "thinking" directly. Confirm
# whether another module imports this before removing it.
_THINKING_SIG_PATTERNS = [
    "signature",  # Combined with "thinking" check
]

# Transport error type names, compared against ``type(error).__name__``.
_TRANSPORT_ERROR_TYPES = frozenset({
    "ReadTimeout", "ConnectTimeout", "PoolTimeout",
    "ConnectError", "RemoteProtocolError",
    "ConnectionError", "ConnectionResetError",
    "ConnectionAbortedError", "BrokenPipeError",
    "TimeoutError", "ReadError",
    "ServerDisconnectedError",
    # OpenAI SDK errors (not subclasses of Python builtins)
    "APIConnectionError",
    "APITimeoutError",
})

# Server disconnect patterns (no status code, but transport-level)
_SERVER_DISCONNECT_PATTERNS = [
    "server disconnected",
    "peer closed connection",
    "connection reset by peer",
    "connection was closed",
    "network connection lost",
    "unexpected eof",
    "incomplete chunked read",
]
| # ── Classification pipeline ───────────────────────────────────────────── | |
def _lowered_text(value: Any) -> str:
    """Return *value* as lower-cased text; falsy values become ``""``.

    Error bodies are not guaranteed to carry a string ``message`` (some APIs
    send a list or an object). Non-string values are stringified so they can
    still be pattern-matched instead of crashing on ``.lower()``.
    """
    if not value:
        return ""
    if not isinstance(value, str):
        value = str(value)
    return value.lower()


def _build_match_text(error: Exception, body: Any) -> str:
    """Assemble the lower-cased text that the pattern checks run against.

    Three sources are combined, each appended only when it is not already
    contained in the earlier ones:

    * ``str(error)`` — may omit the body message (e.g. the OpenAI SDK's
      ``APIStatusError.__str__`` returns the first arg, not the body).
    * the body message, from ``body["error"]["message"]`` or, failing that,
      the flat ``body["message"]``.
    * the message inside ``body["error"]["metadata"]["raw"]`` — OpenRouter
      wraps upstream provider errors as
      ``{"error": {"message": "Provider returned error", "metadata":
      {"raw": "<actual error JSON>"}}}`` and the real message (e.g.
      "context length exceeded") is only in the inner JSON.
    """
    # ``json`` is not among the imports visible at the top of this file, so
    # it is imported locally.
    import json

    raw_msg = str(error).lower()
    body_msg = ""
    metadata_msg = ""

    if isinstance(body, dict):
        err_obj = body.get("error", {})
        if isinstance(err_obj, dict):
            body_msg = _lowered_text(err_obj.get("message"))
            # Parse metadata.raw for wrapped provider errors.
            metadata = err_obj.get("metadata", {})
            if isinstance(metadata, dict):
                raw_json = metadata.get("raw") or ""
                if isinstance(raw_json, str) and raw_json.strip():
                    try:
                        inner = json.loads(raw_json)
                    except (ValueError, TypeError):
                        # Not JSON: nothing more to extract (best effort).
                        inner = None
                    if isinstance(inner, dict):
                        inner_err = inner.get("error", {})
                        if isinstance(inner_err, dict):
                            metadata_msg = _lowered_text(inner_err.get("message"))
        if not body_msg:
            body_msg = _lowered_text(body.get("message"))

    parts = [raw_msg]
    if body_msg and body_msg not in raw_msg:
        parts.append(body_msg)
    if metadata_msg and metadata_msg not in raw_msg and metadata_msg not in body_msg:
        parts.append(metadata_msg)
    return " ".join(parts)


def classify_api_error(
    error: Exception,
    *,
    provider: str = "",
    model: str = "",
    approx_tokens: int = 0,
    context_length: int = 200000,
    num_messages: int = 0,
) -> ClassifiedError:
    """Classify an API error into a structured recovery recommendation.

    Priority-ordered pipeline (first match wins):
      1. Special-case provider-specific patterns (thinking sigs, tier gates)
      2. HTTP status code + message-aware refinement
      3. Error code classification (from body)
      4. Message pattern matching (billing vs rate_limit vs context vs auth)
      5. Server disconnect without a status code: large session → context
         overflow, otherwise timeout
      6. Transport error heuristics
      7. Fallback: unknown (retryable with backoff)

    Args:
        error: The exception from the API call.
        provider: Current provider name (e.g. "openrouter", "anthropic").
        model: Current model slug.
        approx_tokens: Approximate token count of the current context.
        context_length: Maximum context length for the current model.
        num_messages: Number of messages in the session; used together with
            ``approx_tokens`` to decide whether the session counts as large.

    Returns:
        ClassifiedError with reason and recovery action hints.
    """
    status_code = _extract_status_code(error)
    error_type = type(error).__name__
    body = _extract_error_body(error)
    error_code = _extract_error_code(body)

    # Comprehensive lower-cased text (exception string + body message +
    # wrapped upstream message) so that patterns such as "try again" are
    # found even when they only appear in the structured body.
    error_msg = _build_match_text(error, body)

    provider_lower = (provider or "").strip().lower()
    model_lower = (model or "").strip().lower()

    def _result(reason: FailoverReason, **overrides) -> ClassifiedError:
        """Build a ClassifiedError pre-filled with this call's context."""
        defaults = {
            "reason": reason,
            "status_code": status_code,
            "provider": provider,
            "model": model,
            "message": _extract_message(error, body),
        }
        defaults.update(overrides)
        return ClassifiedError(**defaults)

    # ── 1. Provider-specific patterns (highest priority) ────────────
    # Anthropic thinking block signature invalid (400).
    # Don't gate on provider — OpenRouter proxies Anthropic errors, so the
    # provider may be "openrouter" even though the error is Anthropic-specific.
    # The message pattern ("signature" + "thinking") is unique enough.
    if (
        status_code == 400
        and "signature" in error_msg
        and "thinking" in error_msg
    ):
        return _result(
            FailoverReason.thinking_signature,
            retryable=True,
            should_compress=False,
        )

    # Anthropic long-context tier gate (429 "extra usage" + "long context")
    if (
        status_code == 429
        and "extra usage" in error_msg
        and "long context" in error_msg
    ):
        return _result(
            FailoverReason.long_context_tier,
            retryable=True,
            should_compress=True,
        )

    # ── 2. HTTP status code classification ──────────────────────────
    if status_code is not None:
        classified = _classify_by_status(
            status_code, error_msg, error_code, body,
            provider=provider_lower, model=model_lower,
            approx_tokens=approx_tokens, context_length=context_length,
            num_messages=num_messages,
            result_fn=_result,
        )
        if classified is not None:
            return classified

    # ── 3. Error code classification ────────────────────────────────
    if error_code:
        classified = _classify_by_error_code(error_code, error_msg, _result)
        if classified is not None:
            return classified

    # ── 4. Message pattern matching (no status code) ────────────────
    classified = _classify_by_message(
        error_msg, error_type,
        approx_tokens=approx_tokens,
        context_length=context_length,
        result_fn=_result,
    )
    if classified is not None:
        return classified

    # ── 5. Server disconnect + large session → context overflow ─────
    # Must come BEFORE generic transport error catch — a disconnect on
    # a large session is more likely context overflow than a transient
    # transport hiccup. Without this ordering, RemoteProtocolError
    # always maps to timeout regardless of session size.
    is_disconnect = any(p in error_msg for p in _SERVER_DISCONNECT_PATTERNS)
    if is_disconnect and not status_code:
        is_large = (
            approx_tokens > context_length * 0.6
            or approx_tokens > 120000
            or num_messages > 200
        )
        if is_large:
            return _result(
                FailoverReason.context_overflow,
                retryable=True,
                should_compress=True,
            )
        return _result(FailoverReason.timeout, retryable=True)

    # ── 6. Transport / timeout heuristics ───────────────────────────
    if error_type in _TRANSPORT_ERROR_TYPES or isinstance(
        error, (TimeoutError, ConnectionError, OSError)
    ):
        return _result(FailoverReason.timeout, retryable=True)

    # ── 7. Fallback: unknown ────────────────────────────────────────
    return _result(FailoverReason.unknown, retryable=True)
| # ── Status code classification ────────────────────────────────────────── | |
def _classify_by_status(
    status_code: int,
    error_msg: str,
    error_code: str,
    body: dict,
    *,
    provider: str,
    model: str,
    approx_tokens: int,
    context_length: int,
    num_messages: int = 0,
    result_fn,
) -> Optional[ClassifiedError]:
    """Map an HTTP status code to a classification, refining by message.

    Returns None when the status lies outside 400-599 and has no dedicated
    rule, so the caller can continue with the next pipeline stage.
    """
    # Statuses whose outcome depends on the message text are delegated.
    if status_code == 402:
        return _classify_402(error_msg, result_fn)

    if status_code == 400:
        return _classify_400(
            error_msg, error_code, body,
            provider=provider, model=model,
            approx_tokens=approx_tokens,
            context_length=context_length,
            num_messages=num_messages,
            result_fn=result_fn,
        )

    if status_code == 403:
        # OpenRouter 403 "key limit exceeded" is actually billing.
        spend_capped = (
            "key limit exceeded" in error_msg or "spending limit" in error_msg
        )
        if spend_capped:
            return result_fn(
                FailoverReason.billing,
                retryable=False,
                should_rotate_credential=True,
                should_fallback=True,
            )
        return result_fn(
            FailoverReason.auth,
            retryable=False,
            should_fallback=True,
        )

    # Statuses with a single, message-independent outcome.
    fixed_outcomes = {
        # 401 is not retryable on its own — credential pool rotation and
        # provider-specific refresh (Codex, Anthropic, Nous) run before the
        # retryability check in run_agent.py. If those succeed, the loop
        # `continue`s. If they fail, retryable=False ensures we hit the
        # client-error abort path (which tries fallback first).
        401: (
            FailoverReason.auth,
            {
                "retryable": False,
                "should_rotate_credential": True,
                "should_fallback": True,
            },
        ),
        # A 404 gets the same result whether or not the message names a
        # model: a generic 404 could be the model or the endpoint.
        404: (
            FailoverReason.model_not_found,
            {"retryable": False, "should_fallback": True},
        ),
        413: (
            FailoverReason.payload_too_large,
            {"retryable": True, "should_compress": True},
        ),
        # The long-context tier gate was handled by the caller already; a
        # 429 reaching this point is a normal rate limit.
        429: (
            FailoverReason.rate_limit,
            {
                "retryable": True,
                "should_rotate_credential": True,
                "should_fallback": True,
            },
        ),
        500: (FailoverReason.server_error, {"retryable": True}),
        502: (FailoverReason.server_error, {"retryable": True}),
        503: (FailoverReason.overloaded, {"retryable": True}),
        529: (FailoverReason.overloaded, {"retryable": True}),
    }
    outcome = fixed_outcomes.get(status_code)
    if outcome is not None:
        reason, hints = outcome
        return result_fn(reason, **hints)

    # Remaining 4xx — non-retryable.
    if 400 <= status_code < 500:
        return result_fn(
            FailoverReason.format_error,
            retryable=False,
            should_fallback=True,
        )

    # Remaining 5xx — retryable.
    if 500 <= status_code < 600:
        return result_fn(FailoverReason.server_error, retryable=True)

    return None
def _classify_402(error_msg: str, result_fn) -> ClassifiedError:
    """Decide whether a 402 is real billing exhaustion or a resettable quota.

    The key insight from OpenClaw: some 402s are transient rate limits
    disguised as payment errors. "Usage limit, try again in 5 minutes"
    is NOT a billing problem — it's a periodic quota that resets.

    A message that contains both a usage-limit phrase and a transient signal
    is reported as ``rate_limit`` (retryable); anything else is reported as
    ``billing`` (not retryable). Both outcomes request credential rotation
    and fallback.
    """

    def _mentions(needles) -> bool:
        return any(needle in error_msg for needle in needles)

    transient_quota = _mentions(_USAGE_LIMIT_PATTERNS) and _mentions(
        _USAGE_LIMIT_TRANSIENT_SIGNALS
    )
    if transient_quota:
        reason = FailoverReason.rate_limit
    else:
        reason = FailoverReason.billing

    return result_fn(
        reason,
        retryable=transient_quota,
        should_rotate_credential=True,
        should_fallback=True,
    )
def _classify_400(
    error_msg: str,
    error_code: str,
    body: dict,
    *,
    provider: str,
    model: str,
    approx_tokens: int,
    context_length: int,
    num_messages: int = 0,
    result_fn,
) -> ClassifiedError:
    """Classify 400 Bad Request — context overflow, format error, or generic.

    Checks, in order: context-overflow phrases, model-not-found phrases,
    rate-limit phrases, billing phrases, then the "generic message on a large
    session" heuristic. Anything left is a non-retryable format error.

    Args:
        error_msg: Lower-cased combined error text used for pattern matching.
        error_code: Error code from the body (not consulted here).
        body: Structured error body; only its message is inspected.
        provider: Provider name (not consulted here).
        model: Model slug (not consulted here).
        approx_tokens: Approximate token count of the current context.
        context_length: Maximum context length for the current model.
        num_messages: Number of messages in the session.
        result_fn: Factory called as ``result_fn(reason, **hints)``.

    Returns:
        The ClassifiedError produced by ``result_fn``.
    """

    def _matches(patterns) -> bool:
        return any(p in error_msg for p in patterns)

    def _message_text(value) -> str:
        # Bodies may carry a non-string message (e.g. a list of validation
        # errors). Stringify instead of crashing on ``.strip()``.
        if not value:
            return ""
        if not isinstance(value, str):
            value = str(value)
        return value.strip().lower()

    # Context overflow from 400
    if _matches(_CONTEXT_OVERFLOW_PATTERNS):
        return result_fn(
            FailoverReason.context_overflow,
            retryable=True,
            should_compress=True,
        )

    # Some providers return model-not-found as 400 instead of 404 (e.g. OpenRouter).
    if _matches(_MODEL_NOT_FOUND_PATTERNS):
        return result_fn(
            FailoverReason.model_not_found,
            retryable=False,
            should_fallback=True,
        )

    # Some providers return rate limit / billing errors as 400 instead of 429/402.
    # Check these patterns before falling through to format_error.
    if _matches(_RATE_LIMIT_PATTERNS):
        return result_fn(
            FailoverReason.rate_limit,
            retryable=True,
            should_rotate_credential=True,
            should_fallback=True,
        )
    if _matches(_BILLING_PATTERNS):
        return result_fn(
            FailoverReason.billing,
            retryable=False,
            should_rotate_credential=True,
            should_fallback=True,
        )

    # Generic 400 + large session → probable context overflow.
    # Anthropic sometimes returns a bare "Error" message when context is too large.
    err_body_msg = ""
    if isinstance(body, dict):
        err_obj = body.get("error", {})
        if isinstance(err_obj, dict):
            err_body_msg = _message_text(err_obj.get("message"))
        # Responses API (and some providers) use flat body: {"message": "..."}
        if not err_body_msg:
            err_body_msg = _message_text(body.get("message"))

    # Under 30 characters counts as uninformative; this already covers the
    # bare "error" and empty-string cases.
    is_generic = len(err_body_msg) < 30
    is_large = (
        approx_tokens > context_length * 0.4
        or approx_tokens > 80000
        or num_messages > 80
    )
    if is_generic and is_large:
        return result_fn(
            FailoverReason.context_overflow,
            retryable=True,
            should_compress=True,
        )

    # Non-retryable format error
    return result_fn(
        FailoverReason.format_error,
        retryable=False,
        should_fallback=True,
    )
| # ── Error code classification ─────────────────────────────────────────── | |
def _classify_by_error_code(
    error_code: str, error_msg: str, result_fn,
) -> Optional[ClassifiedError]:
    """Classify by structured error codes from the response body.

    Args:
        error_code: Code string taken from the body; compared
            case-insensitively.
        error_msg: Combined error text (not consulted here; kept for a
            uniform classifier signature).
        result_fn: Factory called as ``result_fn(reason, **hints)``.

    Returns:
        A ClassifiedError for a recognised code, otherwise None.
    """
    code_lower = error_code.lower()

    if code_lower in ("resource_exhausted", "throttled", "rate_limit_exceeded"):
        # should_fallback=True matches every other rate_limit result in this
        # module (429, transient 402, 400 and message-pattern paths), so the
        # recovery hints do not depend on which stage detected the throttle.
        return result_fn(
            FailoverReason.rate_limit,
            retryable=True,
            should_rotate_credential=True,
            should_fallback=True,
        )

    if code_lower in ("insufficient_quota", "billing_not_active", "payment_required"):
        return result_fn(
            FailoverReason.billing,
            retryable=False,
            should_rotate_credential=True,
            should_fallback=True,
        )

    if code_lower in ("model_not_found", "model_not_available", "invalid_model"):
        return result_fn(
            FailoverReason.model_not_found,
            retryable=False,
            should_fallback=True,
        )

    if code_lower in ("context_length_exceeded", "max_tokens_exceeded"):
        return result_fn(
            FailoverReason.context_overflow,
            retryable=True,
            should_compress=True,
        )

    return None
| # ── Message pattern classification ────────────────────────────────────── | |
def _classify_by_message(
    error_msg: str,
    error_type: str,
    *,
    approx_tokens: int,
    context_length: int,
    result_fn,
) -> Optional[ClassifiedError]:
    """Classify purely from message text, for errors lacking a status code.

    Pattern families are tried in a fixed order and the first hit wins:
    payload too large, usage limit, billing, rate limit, context overflow,
    auth, model not found. Returns None when nothing matches.
    """

    def _hit(patterns) -> bool:
        return any(fragment in error_msg for fragment in patterns)

    rotate_and_fallback = {"should_rotate_credential": True, "should_fallback": True}
    compress_and_retry = {"retryable": True, "should_compress": True}

    # Payload-too-large text (the status may only appear inside the message).
    if _hit(_PAYLOAD_TOO_LARGE_PATTERNS):
        return result_fn(FailoverReason.payload_too_large, **compress_and_retry)

    # Usage-limit text needs the same disambiguation as a 402: some providers
    # surface "usage limit" errors without an HTTP status code. A transient
    # signal ("try again", "resets at", …) means a periodic quota rather
    # than billing exhaustion.
    if _hit(_USAGE_LIMIT_PATTERNS):
        if _hit(_USAGE_LIMIT_TRANSIENT_SIGNALS):
            return result_fn(
                FailoverReason.rate_limit, retryable=True, **rotate_and_fallback
            )
        return result_fn(
            FailoverReason.billing, retryable=False, **rotate_and_fallback
        )

    if _hit(_BILLING_PATTERNS):
        return result_fn(
            FailoverReason.billing, retryable=False, **rotate_and_fallback
        )

    if _hit(_RATE_LIMIT_PATTERNS):
        return result_fn(
            FailoverReason.rate_limit, retryable=True, **rotate_and_fallback
        )

    if _hit(_CONTEXT_OVERFLOW_PATTERNS):
        return result_fn(FailoverReason.context_overflow, **compress_and_retry)

    # Auth errors should NOT be retried directly — the credential is invalid
    # and retrying with the same key will always fail. retryable=False makes
    # the caller trigger credential rotation or provider fallback rather
    # than an immediate retry loop.
    if _hit(_AUTH_PATTERNS):
        return result_fn(
            FailoverReason.auth, retryable=False, **rotate_and_fallback
        )

    if _hit(_MODEL_NOT_FOUND_PATTERNS):
        return result_fn(
            FailoverReason.model_not_found,
            retryable=False,
            should_fallback=True,
        )

    return None
| # ── Helpers ───────────────────────────────────────────────────────────── | |
| def _extract_status_code(error: Exception) -> Optional[int]: | |
| """Walk the error and its cause chain to find an HTTP status code.""" | |
| current = error | |
| for _ in range(5): # Max depth to prevent infinite loops | |
| code = getattr(current, "status_code", None) | |
| if isinstance(code, int): | |
| return code | |
| # Some SDKs use .status instead of .status_code | |
| code = getattr(current, "status", None) | |
| if isinstance(code, int) and 100 <= code < 600: | |
| return code | |
| # Walk cause chain | |
| cause = getattr(current, "__cause__", None) or getattr(current, "__context__", None) | |
| if cause is None or cause is current: | |
| break | |
| current = cause | |
| return None | |
| def _extract_error_body(error: Exception) -> dict: | |
| """Extract the structured error body from an SDK exception.""" | |
| body = getattr(error, "body", None) | |
| if isinstance(body, dict): | |
| return body | |
| # Some errors have .response.json() | |
| response = getattr(error, "response", None) | |
| if response is not None: | |
| try: | |
| json_body = response.json() | |
| if isinstance(json_body, dict): | |
| return json_body | |
| except Exception: | |
| pass | |
| return {} | |
| def _extract_error_code(body: dict) -> str: | |
| """Extract an error code string from the response body.""" | |
| if not body: | |
| return "" | |
| error_obj = body.get("error", {}) | |
| if isinstance(error_obj, dict): | |
| code = error_obj.get("code") or error_obj.get("type") or "" | |
| if isinstance(code, str) and code.strip(): | |
| return code.strip() | |
| # Top-level code | |
| code = body.get("code") or body.get("error_code") or "" | |
| if isinstance(code, (str, int)): | |
| return str(code).strip() | |
| return "" | |
| def _extract_message(error: Exception, body: dict) -> str: | |
| """Extract the most informative error message.""" | |
| # Try structured body first | |
| if body: | |
| error_obj = body.get("error", {}) | |
| if isinstance(error_obj, dict): | |
| msg = error_obj.get("message", "") | |
| if isinstance(msg, str) and msg.strip(): | |
| return msg.strip()[:500] | |
| msg = body.get("message", "") | |
| if isinstance(msg, str) and msg.strip(): | |
| return msg.strip()[:500] | |
| # Fallback to str(error) | |
| return str(error)[:500] | |