Spaces:

abc1181
/

HermesAgent

Paused

App Files Files Community

HermesAgent / agent /error_classifier.py

cjovs

Upload 1680 files

b5b9c2e verified about 1 month ago

raw

history blame contribute delete

28.2 kB

	"""API error classification for smart failover and recovery.

	Provides a structured taxonomy of API errors and a priority-ordered
	classification pipeline that determines the correct recovery action
	(retry, rotate credential, fallback to another provider, compress
	context, or abort).

	Replaces scattered inline string-matching with a centralized classifier
	that the main retry loop in run_agent.py consults for every API failure.
	"""

	from __future__ import annotations

	import enum
	import logging
	import re
	from dataclasses import dataclass, field
	from typing import Any, Dict, Optional

	# Module-level logger named after this module.
	# NOTE(review): nothing in the visible portion of this file logs through it.
	logger = logging.getLogger(__name__)


	# ── Error taxonomy ──────────────────────────────────────────────────────

	class FailoverReason(enum.Enum):
	    """Taxonomy of API-call failure causes.

	    Each member names one cause; the classifier attaches the matching
	    recovery hints to the ``ClassifiedError`` it returns. Every member's
	    value is the same string as its name.
	    """

	    # --- Authentication / authorization ---
	    # Transient auth failure (401/403): refresh or rotate the credential.
	    auth = "auth"
	    # Auth still failing after a refresh: abort.
	    auth_permanent = "auth_permanent"

	    # --- Billing / quota ---
	    # 402 or confirmed credit exhaustion: rotate immediately.
	    billing = "billing"
	    # 429 or quota-based throttling: back off, then rotate.
	    rate_limit = "rate_limit"

	    # --- Server-side ---
	    # 503/529, provider overloaded: back off.
	    overloaded = "overloaded"
	    # 500/502, internal server error: retry.
	    server_error = "server_error"

	    # --- Transport ---
	    # Connection/read timeout: rebuild the client and retry.
	    timeout = "timeout"

	    # --- Context / payload ---
	    # Context too large: compress instead of failing over.
	    context_overflow = "context_overflow"
	    # 413: compress the payload.
	    payload_too_large = "payload_too_large"

	    # --- Model ---
	    # 404 or invalid model: fall back to a different model.
	    model_not_found = "model_not_found"

	    # --- Request format ---
	    # 400 bad request: abort, or strip and retry.
	    format_error = "format_error"

	    # --- Provider-specific ---
	    # Anthropic thinking-block signature invalid.
	    thinking_signature = "thinking_signature"
	    # Anthropic "extra usage" long-context tier gate.
	    long_context_tier = "long_context_tier"

	    # --- Catch-all ---
	    # Unclassifiable: retry with backoff.
	    unknown = "unknown"


	# ── Classification result ───────────────────────────────────────────────

	@dataclass
	class ClassifiedError:
	    """Outcome of classifying one API failure, plus recovery hints.

	    The boolean hint fields exist so the retry loop can act on the
	    classification directly instead of re-classifying the error itself.
	    """

	    # Failure category (the only required field).
	    reason: FailoverReason
	    # HTTP status code, when one could be extracted from the exception.
	    status_code: Optional[int] = None
	    # Provider and model the failing call was made with.
	    provider: Optional[str] = None
	    model: Optional[str] = None
	    # Human-readable error text.
	    message: str = ""
	    # Free-form extra context; every instance gets its own empty dict.
	    error_context: Dict[str, Any] = field(default_factory=dict)

	    # Recovery action hints.
	    retryable: bool = True
	    should_compress: bool = False
	    should_rotate_credential: bool = False
	    should_fallback: bool = False

	    @property
	    def is_auth(self) -> bool:
	        """True when the reason is either of the two auth categories."""
	        auth_reasons = (FailoverReason.auth, FailoverReason.auth_permanent)
	        return self.reason in auth_reasons



	# ── Provider-specific patterns ──────────────────────────────────────────

	# Patterns that indicate billing exhaustion (not transient rate limit)
	_BILLING_PATTERNS = [
	"insufficient credits",
	"insufficient_quota",
	"credit balance",
	"credits have been exhausted",
	"top up your credits",
	"payment required",
	"billing hard limit",
	"exceeded your current quota",
	"account is deactivated",
	"plan does not include",
	]

	# Patterns that indicate rate limiting (transient, will resolve)
	_RATE_LIMIT_PATTERNS = [
	"rate limit",
	"rate_limit",
	"too many requests",
	"throttled",
	"requests per minute",
	"tokens per minute",
	"requests per day",
	"try again in",
	"please retry after",
	"resource_exhausted",
	"rate increased too quickly", # Alibaba/DashScope throttling
	]

	# Usage-limit patterns that need disambiguation (could be billing OR rate_limit)
	_USAGE_LIMIT_PATTERNS = [
	"usage limit",
	"quota",
	"limit exceeded",
	"key limit exceeded",
	]

	# Patterns confirming usage limit is transient (not billing)
	_USAGE_LIMIT_TRANSIENT_SIGNALS = [
	"try again",
	"retry",
	"resets at",
	"reset in",
	"wait",
	"requests remaining",
	"periodic",
	"window",
	]

	# Payload-too-large patterns detected from message text (no status_code attr).
	# Proxies and some backends embed the HTTP status in the error message.
	_PAYLOAD_TOO_LARGE_PATTERNS = [
	"request entity too large",
	"payload too large",
	"error code: 413",
	]

	# Context overflow patterns
	_CONTEXT_OVERFLOW_PATTERNS = [
	"context length",
	"context size",
	"maximum context",
	"token limit",
	"too many tokens",
	"reduce the length",
	"exceeds the limit",
	"context window",
	"prompt is too long",
	"prompt exceeds max length",
	"max_tokens",
	"maximum number of tokens",
	# Chinese error messages (some providers return these)
	"超过最大长度",
	"上下文长度",
	]

	# Model not found patterns
	_MODEL_NOT_FOUND_PATTERNS = [
	"is not a valid model",
	"invalid model",
	"model not found",
	"model_not_found",
	"does not exist",
	"no such model",
	"unknown model",
	"unsupported model",
	]

	# Auth patterns (non-status-code signals)
	_AUTH_PATTERNS = [
	"invalid api key",
	"invalid_api_key",
	"authentication",
	"unauthorized",
	"forbidden",
	"invalid token",
	"token expired",
	"token revoked",
	"access denied",
	]

	# Anthropic thinking block signature patterns
	_THINKING_SIG_PATTERNS = [
	"signature", # Combined with "thinking" check
	]

	# Transport error type names
	_TRANSPORT_ERROR_TYPES = frozenset({
	"ReadTimeout", "ConnectTimeout", "PoolTimeout",
	"ConnectError", "RemoteProtocolError",
	"ConnectionError", "ConnectionResetError",
	"ConnectionAbortedError", "BrokenPipeError",
	"TimeoutError", "ReadError",
	"ServerDisconnectedError",
	# OpenAI SDK errors (not subclasses of Python builtins)
	"APIConnectionError",
	"APITimeoutError",
	})

	# Server disconnect patterns (no status code, but transport-level)
	_SERVER_DISCONNECT_PATTERNS = [
	"server disconnected",
	"peer closed connection",
	"connection reset by peer",
	"connection was closed",
	"network connection lost",
	"unexpected eof",
	"incomplete chunked read",
	]


	# ── Classification pipeline ─────────────────────────────────────────────

	def _lowercase_if_str(value: Any) -> str:
	    """Return ``value.lower()`` for strings and ``""`` for anything else.

	    Providers occasionally send a non-string ``message`` (dict, list,
	    number). Those are ignored rather than allowed to raise while an
	    error is being classified.
	    """
	    return value.lower() if isinstance(value, str) else ""


	def _collect_match_text(error: Exception, body: Any) -> str:
	    """Build the lower-cased text that message patterns are matched against.

	    Sources, in order, each appended only when not already contained in
	    the earlier text:

	    1. ``str(error)``.
	    2. The body message: ``body["error"]["message"]``, or the flat
	       ``body["message"]`` when the nested one is empty.
	    3. ``body["error"]["metadata"]["raw"]`` parsed as JSON, taking its
	       ``error.message``.

	    Args:
	        error: The exception from the API call.
	        body: The structured error body; anything other than a dict
	            contributes nothing.

	    Returns:
	        The sources joined with single spaces.
	    """
	    import json

	    raw_msg = str(error).lower()
	    body_msg = ""
	    metadata_msg = ""

	    if isinstance(body, dict):
	        err_obj = body.get("error", {})
	        if isinstance(err_obj, dict):
	            body_msg = _lowercase_if_str(err_obj.get("message"))
	            # OpenRouter wraps upstream provider errors inside
	            # {"error": {"message": "Provider returned error",
	            #            "metadata": {"raw": "<actual error JSON>"}}}
	            # and the real message (e.g. "context length exceeded") is
	            # only in the inner JSON.
	            metadata = err_obj.get("metadata", {})
	            if isinstance(metadata, dict):
	                raw_json = metadata.get("raw") or ""
	                if isinstance(raw_json, str) and raw_json.strip():
	                    try:
	                        inner = json.loads(raw_json)
	                    except (json.JSONDecodeError, TypeError):
	                        # Not JSON: the wrapped payload is best-effort only.
	                        inner = None
	                    if isinstance(inner, dict):
	                        inner_err = inner.get("error", {})
	                        if isinstance(inner_err, dict):
	                            metadata_msg = _lowercase_if_str(inner_err.get("message"))
	        if not body_msg:
	            # Flat body shape: {"message": "..."}.
	            body_msg = _lowercase_if_str(body.get("message"))

	    parts = [raw_msg]
	    if body_msg and body_msg not in raw_msg:
	        parts.append(body_msg)
	    if metadata_msg and metadata_msg not in raw_msg and metadata_msg not in body_msg:
	        parts.append(metadata_msg)
	    return " ".join(parts)


	def classify_api_error(
	    error: Exception,
	    *,
	    provider: str = "",
	    model: str = "",
	    approx_tokens: int = 0,
	    context_length: int = 200000,
	    num_messages: int = 0,
	) -> ClassifiedError:
	    """Classify an API error into a structured recovery recommendation.

	    Priority-ordered pipeline (the first stage that matches wins):
	    1. Special-case provider-specific patterns (thinking sigs, tier gates)
	    2. HTTP status code + message-aware refinement
	    3. Error code classification (from body)
	    4. Message pattern matching (billing vs rate_limit vs context vs auth)
	    5. Server disconnect + large session → context overflow
	    6. Transport error heuristics
	    7. Fallback: unknown (retryable with backoff)

	    Args:
	        error: The exception from the API call.
	        provider: Current provider name (e.g. "openrouter", "anthropic").
	        model: Current model slug.
	        approx_tokens: Approximate token count of the current context.
	        context_length: Maximum context length for the current model.
	        num_messages: Number of messages in the session; a large count is
	            one of the signals that a session is "large".

	    Returns:
	        ClassifiedError with reason and recovery action hints.
	    """
	    status_code = _extract_status_code(error)
	    error_type = type(error).__name__
	    body = _extract_error_body(error)
	    error_code = _extract_error_code(body)

	    # str(error) alone may not include the body message (e.g. OpenAI SDK's
	    # APIStatusError.__str__ returns the first arg, not the body), so the
	    # body and wrapped-provider messages are folded in as well. That way
	    # patterns like "try again" in 402 disambiguation are detected even
	    # when they only appear in the structured body.
	    error_msg = _collect_match_text(error, body)
	    provider_lower = (provider or "").strip().lower()
	    model_lower = (model or "").strip().lower()

	    def _result(reason: FailoverReason, **overrides) -> ClassifiedError:
	        # Common fields first; per-call recovery hints override/extend them.
	        defaults = {
	            "reason": reason,
	            "status_code": status_code,
	            "provider": provider,
	            "model": model,
	            "message": _extract_message(error, body),
	        }
	        defaults.update(overrides)
	        return ClassifiedError(**defaults)

	    # ── 1. Provider-specific patterns (highest priority) ────────────

	    # Anthropic thinking block signature invalid (400).
	    # Don't gate on provider — OpenRouter proxies Anthropic errors, so the
	    # provider may be "openrouter" even though the error is Anthropic-specific.
	    # The message pattern ("signature" + "thinking") is unique enough.
	    if (
	        status_code == 400
	        and "signature" in error_msg
	        and "thinking" in error_msg
	    ):
	        return _result(
	            FailoverReason.thinking_signature,
	            retryable=True,
	            should_compress=False,
	        )

	    # Anthropic long-context tier gate (429 "extra usage" + "long context")
	    if (
	        status_code == 429
	        and "extra usage" in error_msg
	        and "long context" in error_msg
	    ):
	        return _result(
	            FailoverReason.long_context_tier,
	            retryable=True,
	            should_compress=True,
	        )

	    # ── 2. HTTP status code classification ──────────────────────────

	    if status_code is not None:
	        classified = _classify_by_status(
	            status_code, error_msg, error_code, body,
	            provider=provider_lower, model=model_lower,
	            approx_tokens=approx_tokens, context_length=context_length,
	            num_messages=num_messages,
	            result_fn=_result,
	        )
	        if classified is not None:
	            return classified

	    # ── 3. Error code classification ────────────────────────────────

	    if error_code:
	        classified = _classify_by_error_code(error_code, error_msg, _result)
	        if classified is not None:
	            return classified

	    # ── 4. Message pattern matching (no status code) ────────────────

	    classified = _classify_by_message(
	        error_msg, error_type,
	        approx_tokens=approx_tokens,
	        context_length=context_length,
	        result_fn=_result,
	    )
	    if classified is not None:
	        return classified

	    # ── 5. Server disconnect + large session → context overflow ─────
	    # Must come BEFORE generic transport error catch — a disconnect on
	    # a large session is more likely context overflow than a transient
	    # transport hiccup. Without this ordering, RemoteProtocolError
	    # always maps to timeout regardless of session size.

	    is_disconnect = any(p in error_msg for p in _SERVER_DISCONNECT_PATTERNS)
	    if is_disconnect and not status_code:
	        is_large = (
	            approx_tokens > context_length * 0.6
	            or approx_tokens > 120000
	            or num_messages > 200
	        )
	        if is_large:
	            return _result(
	                FailoverReason.context_overflow,
	                retryable=True,
	                should_compress=True,
	            )
	        return _result(FailoverReason.timeout, retryable=True)

	    # ── 6. Transport / timeout heuristics ───────────────────────────

	    if error_type in _TRANSPORT_ERROR_TYPES or isinstance(
	        error, (TimeoutError, ConnectionError, OSError)
	    ):
	        return _result(FailoverReason.timeout, retryable=True)

	    # ── 7. Fallback: unknown ────────────────────────────────────────

	    return _result(FailoverReason.unknown, retryable=True)


	# ── Status code classification ──────────────────────────────────────────

	def _classify_by_status(
	status_code: int,
	error_msg: str,
	error_code: str,
	body: dict,
	*,
	provider: str,
	model: str,
	approx_tokens: int,
	context_length: int,
	num_messages: int = 0,
	result_fn,
	) -> Optional[ClassifiedError]:
	"""Classify based on HTTP status code with message-aware refinement."""

	if status_code == 401:
	# Not retryable on its own — credential pool rotation and
	# provider-specific refresh (Codex, Anthropic, Nous) run before
	# the retryability check in run_agent.py. If those succeed, the
	# loop `continue`s. If they fail, retryable=False ensures we
	# hit the client-error abort path (which tries fallback first).
	return result_fn(
	FailoverReason.auth,
	retryable=False,
	should_rotate_credential=True,
	should_fallback=True,
	)

	if status_code == 403:
	# OpenRouter 403 "key limit exceeded" is actually billing
	if "key limit exceeded" in error_msg or "spending limit" in error_msg:
	return result_fn(
	FailoverReason.billing,
	retryable=False,
	should_rotate_credential=True,
	should_fallback=True,
	)
	return result_fn(
	FailoverReason.auth,
	retryable=False,
	should_fallback=True,
	)

	if status_code == 402:
	return _classify_402(error_msg, result_fn)

	if status_code == 404:
	if any(p in error_msg for p in _MODEL_NOT_FOUND_PATTERNS):
	return result_fn(
	FailoverReason.model_not_found,
	retryable=False,
	should_fallback=True,
	)
	# Generic 404 — could be model or endpoint
	return result_fn(
	FailoverReason.model_not_found,
	retryable=False,
	should_fallback=True,
	)

	if status_code == 413:
	return result_fn(
	FailoverReason.payload_too_large,
	retryable=True,
	should_compress=True,
	)

	if status_code == 429:
	# Already checked long_context_tier above; this is a normal rate limit
	return result_fn(
	FailoverReason.rate_limit,
	retryable=True,
	should_rotate_credential=True,
	should_fallback=True,
	)

	if status_code == 400:
	return _classify_400(
	error_msg, error_code, body,
	provider=provider, model=model,
	approx_tokens=approx_tokens,
	context_length=context_length,
	num_messages=num_messages,
	result_fn=result_fn,
	)

	if status_code in (500, 502):
	return result_fn(FailoverReason.server_error, retryable=True)

	if status_code in (503, 529):
	return result_fn(FailoverReason.overloaded, retryable=True)

	# Other 4xx — non-retryable
	if 400 <= status_code < 500:
	return result_fn(
	FailoverReason.format_error,
	retryable=False,
	should_fallback=True,
	)

	# Other 5xx — retryable
	if 500 <= status_code < 600:
	return result_fn(FailoverReason.server_error, retryable=True)

	return None


	def _classify_402(error_msg: str, result_fn) -> ClassifiedError:
	    """Decide whether a 402 is real billing exhaustion or a transient quota.

	    The key insight from OpenClaw: some 402s are transient rate limits
	    disguised as payment errors. "Usage limit, try again in 5 minutes"
	    is a periodic quota that resets, not a billing problem.

	    Args:
	        error_msg: Lower-cased combined error text.
	        result_fn: Callable ``(reason, **hints)`` that builds the result.

	    Returns:
	        A ``rate_limit`` result (retryable) when the text contains both a
	        usage-limit pattern and a transient signal; otherwise a
	        ``billing`` result (not retryable). Both request credential
	        rotation and fallback.
	    """

	    def mentions(patterns) -> bool:
	        return any(fragment in error_msg for fragment in patterns)

	    transient_quota = mentions(_USAGE_LIMIT_PATTERNS) and mentions(
	        _USAGE_LIMIT_TRANSIENT_SIGNALS
	    )

	    if transient_quota:
	        reason, can_retry = FailoverReason.rate_limit, True
	    else:
	        reason, can_retry = FailoverReason.billing, False

	    return result_fn(
	        reason,
	        retryable=can_retry,
	        should_rotate_credential=True,
	        should_fallback=True,
	    )


	def _classify_400(
	    error_msg: str,
	    error_code: str,
	    body: dict,
	    *,
	    provider: str,
	    model: str,
	    approx_tokens: int,
	    context_length: int,
	    num_messages: int = 0,
	    result_fn,
	) -> ClassifiedError:
	    """Classify 400 Bad Request — context overflow, format error, or generic.

	    Checks run in this order: context overflow, model-not-found, rate
	    limit and billing patterns in ``error_msg``; then a short or empty
	    body message on a large session (probable context overflow); and
	    finally a non-retryable format error.

	    Args:
	        error_msg: Lower-cased combined error text.
	        error_code: Structured error code (currently unused here).
	        body: Structured error body; only string ``message`` values are
	            inspected, anything else counts as "no message".
	        provider: Provider name (currently unused here).
	        model: Model slug (currently unused here).
	        approx_tokens: Approximate token count of the current context.
	        context_length: Maximum context length for the current model.
	        num_messages: Number of messages in the session.
	        result_fn: Callable ``(reason, **hints)`` that builds the result.

	    Returns:
	        The classification; this function always returns a result.
	    """

	    def mentions(patterns) -> bool:
	        return any(p in error_msg for p in patterns)

	    # Context overflow from 400
	    if mentions(_CONTEXT_OVERFLOW_PATTERNS):
	        return result_fn(
	            FailoverReason.context_overflow,
	            retryable=True,
	            should_compress=True,
	        )

	    # Some providers return model-not-found as 400 instead of 404 (e.g. OpenRouter).
	    if mentions(_MODEL_NOT_FOUND_PATTERNS):
	        return result_fn(
	            FailoverReason.model_not_found,
	            retryable=False,
	            should_fallback=True,
	        )

	    # Some providers return rate limit / billing errors as 400 instead of 429/402.
	    # Check these patterns before falling through to format_error.
	    if mentions(_RATE_LIMIT_PATTERNS):
	        return result_fn(
	            FailoverReason.rate_limit,
	            retryable=True,
	            should_rotate_credential=True,
	            should_fallback=True,
	        )
	    if mentions(_BILLING_PATTERNS):
	        return result_fn(
	            FailoverReason.billing,
	            retryable=False,
	            should_rotate_credential=True,
	            should_fallback=True,
	        )

	    # Generic 400 + large session → probable context overflow.
	    # Anthropic sometimes returns a bare "Error" message when context is too large.
	    err_body_msg = ""
	    if isinstance(body, dict):
	        err_obj = body.get("error", {})
	        if isinstance(err_obj, dict):
	            nested_msg = err_obj.get("message")
	            # Non-string messages are ignored instead of raising
	            # AttributeError on .strip().
	            if isinstance(nested_msg, str):
	                err_body_msg = nested_msg.strip().lower()
	        # Responses API (and some providers) use flat body: {"message": "..."}
	        if not err_body_msg:
	            flat_msg = body.get("message")
	            if isinstance(flat_msg, str):
	                err_body_msg = flat_msg.strip().lower()

	    # Anything under 30 characters counts as generic; that already covers
	    # the bare "error" and empty-message cases.
	    is_generic = len(err_body_msg) < 30
	    is_large = (
	        approx_tokens > context_length * 0.4
	        or approx_tokens > 80000
	        or num_messages > 80
	    )

	    if is_generic and is_large:
	        return result_fn(
	            FailoverReason.context_overflow,
	            retryable=True,
	            should_compress=True,
	        )

	    # Non-retryable format error
	    return result_fn(
	        FailoverReason.format_error,
	        retryable=False,
	        should_fallback=True,
	    )


	# ── Error code classification ───────────────────────────────────────────

	def _classify_by_error_code(
	error_code: str, error_msg: str, result_fn,
	) -> Optional[ClassifiedError]:
	"""Classify by structured error codes from the response body."""
	code_lower = error_code.lower()

	if code_lower in ("resource_exhausted", "throttled", "rate_limit_exceeded"):
	return result_fn(
	FailoverReason.rate_limit,
	retryable=True,
	should_rotate_credential=True,
	)

	if code_lower in ("insufficient_quota", "billing_not_active", "payment_required"):
	return result_fn(
	FailoverReason.billing,
	retryable=False,
	should_rotate_credential=True,
	should_fallback=True,
	)

	if code_lower in ("model_not_found", "model_not_available", "invalid_model"):
	return result_fn(
	FailoverReason.model_not_found,
	retryable=False,
	should_fallback=True,
	)

	if code_lower in ("context_length_exceeded", "max_tokens_exceeded"):
	return result_fn(
	FailoverReason.context_overflow,
	retryable=True,
	should_compress=True,
	)

	return None


	# ── Message pattern classification ──────────────────────────────────────

	def _classify_by_message(
	    error_msg: str,
	    error_type: str,
	    *,
	    approx_tokens: int,
	    context_length: int,
	    result_fn,
	) -> Optional[ClassifiedError]:
	    """Classify from message text alone, for errors with no usable status code.

	    Pattern groups are tried in a fixed priority order: payload too
	    large, ambiguous usage limit, billing, rate limit, context overflow,
	    auth, model not found. The first group with a substring hit decides.

	    Args:
	        error_msg: Lower-cased combined error text.
	        error_type: Exception class name (not consulted by this function).
	        approx_tokens: Approximate token count (not consulted here).
	        context_length: Maximum context length (not consulted here).
	        result_fn: Callable ``(reason, **hints)`` that builds the result.

	    Returns:
	        The classification, or ``None`` when no pattern group matches.
	    """

	    def mentions(patterns) -> bool:
	        return any(fragment in error_msg for fragment in patterns)

	    rotate_then_fallback = {
	        "should_rotate_credential": True,
	        "should_fallback": True,
	    }

	    # Payload-too-large wording embedded in the message text.
	    if mentions(_PAYLOAD_TOO_LARGE_PATTERNS):
	        return result_fn(
	            FailoverReason.payload_too_large,
	            retryable=True,
	            should_compress=True,
	        )

	    # Usage-limit wording gets the same disambiguation as a 402: with a
	    # transient signal ("try again", "resets at", …) it is a periodic
	    # quota, without one it is treated as billing exhaustion.
	    if mentions(_USAGE_LIMIT_PATTERNS):
	        if mentions(_USAGE_LIMIT_TRANSIENT_SIGNALS):
	            return result_fn(
	                FailoverReason.rate_limit, retryable=True, **rotate_then_fallback
	            )
	        return result_fn(
	            FailoverReason.billing, retryable=False, **rotate_then_fallback
	        )

	    # Remaining groups, highest priority first. Auth is not retryable: the
	    # credential is invalid, so retrying with the same key always fails and
	    # the caller should rotate the credential or fall back instead.
	    ordered_rules = (
	        (
	            _BILLING_PATTERNS,
	            FailoverReason.billing,
	            {"retryable": False, **rotate_then_fallback},
	        ),
	        (
	            _RATE_LIMIT_PATTERNS,
	            FailoverReason.rate_limit,
	            {"retryable": True, **rotate_then_fallback},
	        ),
	        (
	            _CONTEXT_OVERFLOW_PATTERNS,
	            FailoverReason.context_overflow,
	            {"retryable": True, "should_compress": True},
	        ),
	        (
	            _AUTH_PATTERNS,
	            FailoverReason.auth,
	            {"retryable": False, **rotate_then_fallback},
	        ),
	        (
	            _MODEL_NOT_FOUND_PATTERNS,
	            FailoverReason.model_not_found,
	            {"retryable": False, "should_fallback": True},
	        ),
	    )
	    for patterns, reason, hints in ordered_rules:
	        if mentions(patterns):
	            return result_fn(reason, **hints)

	    return None


	# ── Helpers ─────────────────────────────────────────────────────────────

	def _extract_status_code(error: Exception) -> Optional[int]:
	"""Walk the error and its cause chain to find an HTTP status code."""
	current = error
	for _ in range(5): # Max depth to prevent infinite loops
	code = getattr(current, "status_code", None)
	if isinstance(code, int):
	return code
	# Some SDKs use .status instead of .status_code
	code = getattr(current, "status", None)
	if isinstance(code, int) and 100 <= code < 600:
	return code
	# Walk cause chain
	cause = getattr(current, "__cause__", None) or getattr(current, "__context__", None)
	if cause is None or cause is current:
	break
	current = cause
	return None


	def _extract_error_body(error: Exception) -> dict:
	"""Extract the structured error body from an SDK exception."""
	body = getattr(error, "body", None)
	if isinstance(body, dict):
	return body
	# Some errors have .response.json()
	response = getattr(error, "response", None)
	if response is not None:
	try:
	json_body = response.json()
	if isinstance(json_body, dict):
	return json_body
	except Exception:
	pass
	return {}


	def _extract_error_code(body: dict) -> str:
	"""Extract an error code string from the response body."""
	if not body:
	return ""
	error_obj = body.get("error", {})
	if isinstance(error_obj, dict):
	code = error_obj.get("code") or error_obj.get("type") or ""
	if isinstance(code, str) and code.strip():
	return code.strip()
	# Top-level code
	code = body.get("code") or body.get("error_code") or ""
	if isinstance(code, (str, int)):
	return str(code).strip()
	return ""


	def _extract_message(error: Exception, body: dict) -> str:
	"""Extract the most informative error message."""
	# Try structured body first
	if body:
	error_obj = body.get("error", {})
	if isinstance(error_obj, dict):
	msg = error_obj.get("message", "")
	if isinstance(msg, str) and msg.strip():
	return msg.strip()[:500]
	msg = body.get("message", "")
	if isinstance(msg, str) and msg.strip():
	return msg.strip()[:500]
	# Fallback to str(error)
	return str(error)[:500]