Mirrowel commited on
Commit
71586c6
·
unverified ·
2 Parent(s): d2adf05a1cc875

Merge pull request #14 from Mirrowel/Error-handling-consistency

Browse files

add structured error accumulator and consistent error handling/reporting

src/rotator_library/client.py CHANGED
@@ -25,6 +25,10 @@ from .error_handler import (
25
  classify_error,
26
  AllProviders,
27
  NoAvailableKeysError,
 
 
 
 
28
  )
29
  from .providers import PROVIDER_PLUGINS
30
  from .providers.openai_compatible_provider import OpenAICompatibleProvider
@@ -67,7 +71,7 @@ class RotatingClient:
67
  ):
68
  """
69
  Initialize the RotatingClient with intelligent credential rotation.
70
-
71
  Args:
72
  api_keys: Dictionary mapping provider names to lists of API keys
73
  oauth_credentials: Dictionary mapping provider names to OAuth credential paths
@@ -136,8 +140,7 @@ class RotatingClient:
136
  self.global_timeout = global_timeout
137
  self.abort_on_callback_error = abort_on_callback_error
138
  self.usage_manager = UsageManager(
139
- file_path=usage_file_path,
140
- rotation_tolerance=rotation_tolerance
141
  )
142
  self._model_list_cache = {}
143
  self._provider_plugins = PROVIDER_PLUGINS
@@ -156,7 +159,9 @@ class RotatingClient:
156
  # Validate all values are >= 1
157
  for provider, max_val in self.max_concurrent_requests_per_key.items():
158
  if max_val < 1:
159
- lib_logger.warning(f"Invalid max_concurrent for '{provider}': {max_val}. Setting to 1.")
 
 
160
  self.max_concurrent_requests_per_key[provider] = 1
161
 
162
  def _is_model_ignored(self, provider: str, model_id: str) -> bool:
@@ -364,7 +369,9 @@ class RotatingClient:
364
 
365
  return kwargs
366
 
367
- def _apply_default_safety_settings(self, litellm_kwargs: Dict[str, Any], provider: str):
 
 
368
  """
369
  Ensure default Gemini safety settings are present when calling the Gemini provider.
370
  This will not override any explicit settings provided by the request. It accepts
@@ -393,22 +400,33 @@ class RotatingClient:
393
  ]
394
 
395
  # If generic form is present, ensure missing generic keys are filled in
396
- if "safety_settings" in litellm_kwargs and isinstance(litellm_kwargs["safety_settings"], dict):
 
 
397
  for k, v in default_generic.items():
398
  if k not in litellm_kwargs["safety_settings"]:
399
  litellm_kwargs["safety_settings"][k] = v
400
  return
401
 
402
  # If Gemini form is present, ensure missing gemini categories are appended
403
- if "safetySettings" in litellm_kwargs and isinstance(litellm_kwargs["safetySettings"], list):
404
- present = {item.get("category") for item in litellm_kwargs["safetySettings"] if isinstance(item, dict)}
 
 
 
 
 
 
405
  for d in default_gemini:
406
  if d["category"] not in present:
407
  litellm_kwargs["safetySettings"].append(d)
408
  return
409
 
410
  # Neither present: set generic defaults so provider conversion will translate them
411
- if "safety_settings" not in litellm_kwargs and "safetySettings" not in litellm_kwargs:
 
 
 
412
  litellm_kwargs["safety_settings"] = default_generic.copy()
413
 
414
  def get_oauth_credentials(self) -> Dict[str, List[str]]:
@@ -426,10 +444,10 @@ class RotatingClient:
426
  """
427
  Lazily initializes and returns a provider instance.
428
  Only initializes providers that have configured credentials.
429
-
430
  Args:
431
  provider_name: The name of the provider to get an instance for.
432
-
433
  Returns:
434
  Provider instance if credentials exist, None otherwise.
435
  """
@@ -439,7 +457,7 @@ class RotatingClient:
439
  f"Skipping provider '{provider_name}' initialization: no credentials configured"
440
  )
441
  return None
442
-
443
  if provider_name not in self._provider_instances:
444
  if provider_name in self._provider_plugins:
445
  self._provider_instances[provider_name] = self._provider_plugins[
@@ -461,46 +479,47 @@ class RotatingClient:
461
  def _resolve_model_id(self, model: str, provider: str) -> str:
462
  """
463
  Resolves the actual model ID to send to the provider.
464
-
465
  For custom models with name/ID mappings, returns the ID.
466
  Otherwise, returns the model name unchanged.
467
-
468
  Args:
469
  model: Full model string with provider (e.g., "iflow/DS-v3.2")
470
  provider: Provider name (e.g., "iflow")
471
-
472
  Returns:
473
  Full model string with ID (e.g., "iflow/deepseek-v3.2")
474
  """
475
  # Extract model name from "provider/model_name" format
476
- model_name = model.split('/')[-1] if '/' in model else model
477
-
478
  # Try to get provider instance to check for model definitions
479
  provider_plugin = self._get_provider_instance(provider)
480
-
481
  # Check if provider has model definitions
482
- if provider_plugin and hasattr(provider_plugin, 'model_definitions'):
483
- model_id = provider_plugin.model_definitions.get_model_id(provider, model_name)
 
 
484
  if model_id and model_id != model_name:
485
  # Return with provider prefix
486
  return f"{provider}/{model_id}"
487
-
488
  # Fallback: use client's own model definitions
489
  model_id = self.model_definitions.get_model_id(provider, model_name)
490
  if model_id and model_id != model_name:
491
  return f"{provider}/{model_id}"
492
-
493
  # No conversion needed, return original
494
  return model
495
 
496
-
497
  async def _safe_streaming_wrapper(
498
  self, stream: Any, key: str, model: str, request: Optional[Any] = None
499
  ) -> AsyncGenerator[Any, None]:
500
  """
501
  A hybrid wrapper for streaming that buffers fragmented JSON, handles client disconnections gracefully,
502
  and distinguishes between content and streamed errors.
503
-
504
  FINISH_REASON HANDLING:
505
  Providers just translate chunks - this wrapper handles ALL finish_reason logic:
506
  1. Strip finish_reason from intermediate chunks (litellm defaults to "stop")
@@ -537,7 +556,7 @@ class RotatingClient:
537
  chunk_dict = chunk.model_dump()
538
  else:
539
  chunk_dict = chunk
540
-
541
  # === FINISH_REASON LOGIC ===
542
  # Providers send raw chunks without finish_reason logic.
543
  # This wrapper determines finish_reason based on accumulated state.
@@ -545,19 +564,19 @@ class RotatingClient:
545
  choice = chunk_dict["choices"][0]
546
  delta = choice.get("delta", {})
547
  usage = chunk_dict.get("usage", {})
548
-
549
  # Track tool_calls across ALL chunks - if we ever see one, finish_reason must be tool_calls
550
  if delta.get("tool_calls"):
551
  has_tool_calls = True
552
  accumulated_finish_reason = "tool_calls"
553
-
554
  # Detect final chunk: has usage with completion_tokens > 0
555
  has_completion_tokens = (
556
- usage and
557
- isinstance(usage, dict) and
558
- usage.get("completion_tokens", 0) > 0
559
  )
560
-
561
  if has_completion_tokens:
562
  # FINAL CHUNK: Determine correct finish_reason
563
  if has_tool_calls:
@@ -573,7 +592,7 @@ class RotatingClient:
573
  # INTERMEDIATE CHUNK: Never emit finish_reason
574
  # (litellm.ModelResponse defaults to "stop" which is wrong)
575
  choice["finish_reason"] = None
576
-
577
  yield f"data: {json.dumps(chunk_dict)}\n\n"
578
 
579
  if hasattr(chunk, "usage") and chunk.usage:
@@ -722,12 +741,13 @@ class RotatingClient:
722
  # multiple keys have the same usage stats.
723
  credentials_for_provider = list(self.all_credentials[provider])
724
  random.shuffle(credentials_for_provider)
725
-
726
  # Filter out credentials that are unavailable (queued for re-auth)
727
  provider_plugin = self._get_provider_instance(provider)
728
- if provider_plugin and hasattr(provider_plugin, 'is_credential_available'):
729
  available_creds = [
730
- cred for cred in credentials_for_provider
 
731
  if provider_plugin.is_credential_available(cred)
732
  ]
733
  if available_creds:
@@ -740,7 +760,7 @@ class RotatingClient:
740
  kwargs = self._convert_model_params(**kwargs)
741
 
742
  # The main rotation loop. It continues as long as there are untried credentials and the global deadline has not been exceeded.
743
-
744
  # Resolve model ID early, before any credential operations
745
  # This ensures consistent model ID usage for acquisition, release, and tracking
746
  resolved_model = self._resolve_model_id(model, provider)
@@ -748,10 +768,10 @@ class RotatingClient:
748
  lib_logger.info(f"Resolved model '{model}' to '{resolved_model}'")
749
  model = resolved_model
750
  kwargs["model"] = model # Ensure kwargs has the resolved model for litellm
751
-
752
  # [NEW] Filter by model tier requirement and build priority map
753
  credential_priorities = None
754
- if provider_plugin and hasattr(provider_plugin, 'get_model_tier_requirement'):
755
  required_tier = provider_plugin.get_model_tier_requirement(model)
756
  if required_tier is not None:
757
  # Filter OUT only credentials we KNOW are too low priority
@@ -759,9 +779,9 @@ class RotatingClient:
759
  incompatible_creds = []
760
  compatible_creds = []
761
  unknown_creds = []
762
-
763
  for cred in credentials_for_provider:
764
- if hasattr(provider_plugin, 'get_credential_priority'):
765
  priority = provider_plugin.get_credential_priority(cred)
766
  if priority is None:
767
  # Unknown priority - keep it, will be discovered on first use
@@ -775,7 +795,7 @@ class RotatingClient:
775
  else:
776
  # Provider doesn't support priorities - keep all
777
  unknown_creds.append(cred)
778
-
779
  # If we have any known-compatible or unknown credentials, use them
780
  tier_compatible_creds = compatible_creds + unknown_creds
781
  if tier_compatible_creds:
@@ -802,20 +822,25 @@ class RotatingClient:
802
  f"but all {len(incompatible_creds)} known credentials have priority > {required_tier}. "
803
  f"Request will likely fail."
804
  )
805
-
806
  # Build priority map for usage_manager
807
- if provider_plugin and hasattr(provider_plugin, 'get_credential_priority'):
808
  credential_priorities = {}
809
  for cred in credentials_for_provider:
810
  priority = provider_plugin.get_credential_priority(cred)
811
  if priority is not None:
812
  credential_priorities[cred] = priority
813
-
814
  if credential_priorities:
815
  lib_logger.debug(
816
- f"Credential priorities for {provider}: {', '.join(f'P{p}={len([c for c in credentials_for_provider if credential_priorities.get(c)==p])}' for p in sorted(set(credential_priorities.values())))}"
817
  )
818
 
 
 
 
 
 
819
  while (
820
  len(tried_creds) < len(credentials_for_provider) and time.time() < deadline
821
  ):
@@ -852,9 +877,11 @@ class RotatingClient:
852
  )
853
  max_concurrent = self.max_concurrent_requests_per_key.get(provider, 1)
854
  current_cred = await self.usage_manager.acquire_key(
855
- available_keys=creds_to_try, model=model, deadline=deadline,
 
 
856
  max_concurrent=max_concurrent,
857
- credential_priorities=credential_priorities
858
  )
859
  key_acquired = True
860
  tried_creds.add(current_cred)
@@ -937,10 +964,14 @@ class RotatingClient:
937
  if provider_instance:
938
  # Ensure default Gemini safety settings are present (without overriding request)
939
  try:
940
- self._apply_default_safety_settings(litellm_kwargs, provider)
 
 
941
  except Exception:
942
  # If anything goes wrong here, avoid breaking the request flow.
943
- lib_logger.debug("Could not apply default safety settings; continuing.")
 
 
944
 
945
  if "safety_settings" in litellm_kwargs:
946
  converted_settings = (
@@ -1023,8 +1054,14 @@ class RotatingClient:
1023
 
1024
  # Extract a clean error message for the user-facing log
1025
  error_message = str(e).split("\n")[0]
 
 
 
 
 
 
1026
  lib_logger.info(
1027
- f"Key ...{current_cred[-6:]} hit rate limit for model {model}. Reason: '{error_message}'. Rotating key."
1028
  )
1029
 
1030
  if classified_error.status_code == 429:
@@ -1032,16 +1069,10 @@ class RotatingClient:
1032
  await self.cooldown_manager.start_cooldown(
1033
  provider, cooldown_duration
1034
  )
1035
- lib_logger.warning(
1036
- f"IP-based rate limit detected for {provider}. Starting a {cooldown_duration}-second global cooldown."
1037
- )
1038
 
1039
  await self.usage_manager.record_failure(
1040
  current_cred, model, classified_error
1041
  )
1042
- lib_logger.warning(
1043
- f"Key ...{current_cred[-6:]} encountered a rate limit. Trying next key."
1044
- )
1045
  break # Move to the next key
1046
 
1047
  except (
@@ -1060,39 +1091,115 @@ class RotatingClient:
1060
  else {},
1061
  )
1062
  classified_error = classify_error(e)
 
 
1063
  # Provider-level error: don't increment consecutive failures
1064
  await self.usage_manager.record_failure(
1065
- current_cred, model, classified_error,
1066
- increment_consecutive_failures=False
 
 
1067
  )
1068
 
1069
  if attempt >= self.max_retries - 1:
1070
- error_message = str(e).split("\n")[0]
 
 
 
1071
  lib_logger.warning(
1072
- f"Key ...{current_cred[-6:]} failed after max retries for model {model} due to a server error. Reason: '{error_message}'. Rotating key."
1073
  )
1074
  break # Move to the next key
1075
 
1076
  # For temporary errors, wait before retrying with the same key.
1077
  wait_time = classified_error.retry_after or (
1078
- 1 * (2**attempt)
1079
  ) + random.uniform(0, 1)
1080
  remaining_budget = deadline - time.time()
1081
 
1082
  # If the required wait time exceeds the budget, don't wait; rotate to the next key immediately.
1083
  if wait_time > remaining_budget:
 
 
 
1084
  lib_logger.warning(
1085
- f"Required retry wait time ({wait_time:.2f}s) exceeds remaining budget ({remaining_budget:.2f}s). Rotating key early."
1086
  )
1087
  break
1088
 
1089
- error_message = str(e).split("\n")[0]
1090
  lib_logger.warning(
1091
- f"Key ...{current_cred[-6:]} encountered a server error for model {model}. Reason: '{error_message}'. Retrying in {wait_time:.2f}s."
1092
  )
1093
  await asyncio.sleep(wait_time)
1094
  continue # Retry with the same key
1095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1096
  except Exception as e:
1097
  last_exception = e
1098
  log_failure(
@@ -1107,32 +1214,40 @@ class RotatingClient:
1107
 
1108
  if request and await request.is_disconnected():
1109
  lib_logger.warning(
1110
- f"Client disconnected. Aborting retries for credential ...{current_cred[-6:]}."
1111
  )
1112
  raise last_exception
1113
 
1114
  classified_error = classify_error(e)
1115
  error_message = str(e).split("\n")[0]
 
1116
  lib_logger.warning(
1117
- f"Key ...{current_cred[-6:]} failed with {classified_error.error_type} (Status: {classified_error.status_code}). Error: {error_message}. Rotating key."
1118
  )
1119
- if classified_error.status_code == 429:
 
 
 
 
 
 
1120
  cooldown_duration = classified_error.retry_after or 60
1121
  await self.cooldown_manager.start_cooldown(
1122
  provider, cooldown_duration
1123
  )
1124
- lib_logger.warning(
1125
- f"IP-based rate limit detected for {provider} from generic exception. Starting a {cooldown_duration}-second global cooldown."
1126
- )
1127
 
1128
- if classified_error.error_type in [
1129
- "invalid_request",
1130
- "context_window_exceeded",
1131
- "authentication",
1132
- ]:
1133
- # For these errors, we should not retry with other keys.
1134
  raise last_exception
1135
 
 
 
 
 
 
1136
  await self.usage_manager.record_failure(
1137
  current_cred, model, classified_error
1138
  )
@@ -1141,14 +1256,22 @@ class RotatingClient:
1141
  if key_acquired and current_cred:
1142
  await self.usage_manager.release_key(current_cred, model)
1143
 
1144
- if last_exception:
1145
- # Log the final error but do not raise it, as per the new requirement.
1146
- # The client should not see intermittent failures.
1147
- lib_logger.error(
1148
- f"Request failed after trying all keys or exceeding global timeout. Last error: {last_exception}"
1149
- )
 
 
 
 
1150
 
1151
- # Return None to indicate failure without propagating a disruptive exception.
 
 
 
 
1152
  return None
1153
 
1154
  async def _streaming_acompletion_with_retry(
@@ -1164,12 +1287,13 @@ class RotatingClient:
1164
  # Create a mutable copy of the keys and shuffle it.
1165
  credentials_for_provider = list(self.all_credentials[provider])
1166
  random.shuffle(credentials_for_provider)
1167
-
1168
  # Filter out credentials that are unavailable (queued for re-auth)
1169
  provider_plugin = self._get_provider_instance(provider)
1170
- if provider_plugin and hasattr(provider_plugin, 'is_credential_available'):
1171
  available_creds = [
1172
- cred for cred in credentials_for_provider
 
1173
  if provider_plugin.is_credential_available(cred)
1174
  ]
1175
  if available_creds:
@@ -1191,10 +1315,10 @@ class RotatingClient:
1191
  lib_logger.info(f"Resolved model '{model}' to '{resolved_model}'")
1192
  model = resolved_model
1193
  kwargs["model"] = model # Ensure kwargs has the resolved model for litellm
1194
-
1195
  # [NEW] Filter by model tier requirement and build priority map
1196
  credential_priorities = None
1197
- if provider_plugin and hasattr(provider_plugin, 'get_model_tier_requirement'):
1198
  required_tier = provider_plugin.get_model_tier_requirement(model)
1199
  if required_tier is not None:
1200
  # Filter OUT only credentials we KNOW are too low priority
@@ -1202,9 +1326,9 @@ class RotatingClient:
1202
  incompatible_creds = []
1203
  compatible_creds = []
1204
  unknown_creds = []
1205
-
1206
  for cred in credentials_for_provider:
1207
- if hasattr(provider_plugin, 'get_credential_priority'):
1208
  priority = provider_plugin.get_credential_priority(cred)
1209
  if priority is None:
1210
  # Unknown priority - keep it, will be discovered on first use
@@ -1218,7 +1342,7 @@ class RotatingClient:
1218
  else:
1219
  # Provider doesn't support priorities - keep all
1220
  unknown_creds.append(cred)
1221
-
1222
  # If we have any known-compatible or unknown credentials, use them
1223
  tier_compatible_creds = compatible_creds + unknown_creds
1224
  if tier_compatible_creds:
@@ -1245,20 +1369,25 @@ class RotatingClient:
1245
  f"but all {len(incompatible_creds)} known credentials have priority > {required_tier}. "
1246
  f"Request will likely fail."
1247
  )
1248
-
1249
  # Build priority map for usage_manager
1250
- if provider_plugin and hasattr(provider_plugin, 'get_credential_priority'):
1251
  credential_priorities = {}
1252
  for cred in credentials_for_provider:
1253
  priority = provider_plugin.get_credential_priority(cred)
1254
  if priority is not None:
1255
  credential_priorities[cred] = priority
1256
-
1257
  if credential_priorities:
1258
  lib_logger.debug(
1259
- f"Credential priorities for {provider}: {', '.join(f'P{p}={len([c for c in credentials_for_provider if credential_priorities.get(c)==p])}' for p in sorted(set(credential_priorities.values())))}"
1260
  )
1261
 
 
 
 
 
 
1262
  try:
1263
  while (
1264
  len(tried_creds) < len(credentials_for_provider)
@@ -1294,11 +1423,15 @@ class RotatingClient:
1294
  lib_logger.info(
1295
  f"Acquiring credential for model {model}. Tried credentials: {len(tried_creds)}/{len(credentials_for_provider)}"
1296
  )
1297
- max_concurrent = self.max_concurrent_requests_per_key.get(provider, 1)
 
 
1298
  current_cred = await self.usage_manager.acquire_key(
1299
- available_keys=creds_to_try, model=model, deadline=deadline,
 
 
1300
  max_concurrent=max_concurrent,
1301
- credential_priorities=credential_priorities
1302
  )
1303
  key_acquired = True
1304
  tried_creds.add(current_cred)
@@ -1402,21 +1535,51 @@ class RotatingClient:
1402
  litellm.RateLimitError,
1403
  httpx.HTTPStatusError,
1404
  ) as e:
1405
- if (
1406
- isinstance(e, httpx.HTTPStatusError)
1407
- and e.response.status_code != 429
1408
- ):
1409
- raise e
1410
-
1411
  last_exception = e
1412
  # If the exception is our custom wrapper, unwrap the original error
1413
  original_exc = getattr(e, "data", e)
1414
  classified_error = classify_error(original_exc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1415
  await self.usage_manager.record_failure(
1416
  current_cred, model, classified_error
1417
  )
1418
  lib_logger.warning(
1419
- f"Credential ...{current_cred[-6:]} encountered a recoverable error ({classified_error.error_type}) during custom provider stream. Rotating key."
1420
  )
1421
  break
1422
 
@@ -1436,31 +1599,40 @@ class RotatingClient:
1436
  else {},
1437
  )
1438
  classified_error = classify_error(e)
 
 
1439
  # Provider-level error: don't increment consecutive failures
1440
  await self.usage_manager.record_failure(
1441
- current_cred, model, classified_error,
1442
- increment_consecutive_failures=False
 
 
1443
  )
1444
 
1445
  if attempt >= self.max_retries - 1:
 
 
 
1446
  lib_logger.warning(
1447
- f"Credential ...{current_cred[-6:]} failed after max retries for model {model} due to a server error. Rotating key."
1448
  )
1449
  break
1450
 
1451
  wait_time = classified_error.retry_after or (
1452
- 1 * (2**attempt)
1453
  ) + random.uniform(0, 1)
1454
  remaining_budget = deadline - time.time()
1455
  if wait_time > remaining_budget:
 
 
 
1456
  lib_logger.warning(
1457
- f"Required retry wait time ({wait_time:.2f}s) exceeds remaining budget ({remaining_budget:.2f}s). Rotating key early."
1458
  )
1459
  break
1460
 
1461
- error_message = str(e).split("\n")[0]
1462
  lib_logger.warning(
1463
- f"Credential ...{current_cred[-6:]} encountered a server error for model {model}. Reason: '{error_message}'. Retrying in {wait_time:.2f}s."
1464
  )
1465
  await asyncio.sleep(wait_time)
1466
  continue
@@ -1477,15 +1649,24 @@ class RotatingClient:
1477
  else {},
1478
  )
1479
  classified_error = classify_error(e)
 
 
 
 
 
 
 
1480
  lib_logger.warning(
1481
- f"Credential ...{current_cred[-6:]} failed with {classified_error.error_type} (Status: {classified_error.status_code}). Error: {str(e)}. Rotating key."
1482
  )
1483
- if classified_error.error_type in [
1484
- "invalid_request",
1485
- "context_window_exceeded",
1486
- "authentication",
1487
- ]:
 
1488
  raise last_exception
 
1489
  await self.usage_manager.record_failure(
1490
  current_cred, model, classified_error
1491
  )
@@ -1507,9 +1688,13 @@ class RotatingClient:
1507
  if provider_instance:
1508
  # Ensure default Gemini safety settings are present (without overriding request)
1509
  try:
1510
- self._apply_default_safety_settings(litellm_kwargs, provider)
 
 
1511
  except Exception:
1512
- lib_logger.debug("Could not apply default safety settings for streaming path; continuing.")
 
 
1513
 
1514
  if "safety_settings" in litellm_kwargs:
1515
  converted_settings = (
@@ -1590,7 +1775,11 @@ class RotatingClient:
1590
  yield chunk
1591
  return
1592
 
1593
- except (StreamedAPIError, litellm.RateLimitError) as e:
 
 
 
 
1594
  last_exception = e
1595
 
1596
  # This is the final, robust handler for streamed errors.
@@ -1600,24 +1789,26 @@ class RotatingClient:
1600
  original_exc = getattr(e, "data", e)
1601
  classified_error = classify_error(original_exc)
1602
 
 
 
 
 
 
 
 
1603
  try:
1604
  # The full error JSON is in the string representation of the exception.
1605
  json_str_match = re.search(
1606
  r"(\{.*\})", str(original_exc), re.DOTALL
1607
  )
1608
  if json_str_match:
1609
- # The string may contain byte-escaped characters (e.g., \\n).
1610
  cleaned_str = codecs.decode(
1611
  json_str_match.group(1), "unicode_escape"
1612
  )
1613
  error_payload = json.loads(cleaned_str)
1614
  except (json.JSONDecodeError, TypeError):
1615
- lib_logger.warning(
1616
- "Could not parse JSON details from streamed error exception."
1617
- )
1618
  error_payload = {}
1619
 
1620
- # Now, log the failure with the extracted raw response.
1621
  log_failure(
1622
  api_key=current_cred,
1623
  model=model,
@@ -1631,9 +1822,13 @@ class RotatingClient:
1631
 
1632
  error_details = error_payload.get("error", {})
1633
  error_status = error_details.get("status", "")
1634
- # Fallback to the full string if parsing fails.
1635
  error_message_text = error_details.get(
1636
- "message", str(original_exc)
 
 
 
 
 
1637
  )
1638
 
1639
  if (
@@ -1641,9 +1836,6 @@ class RotatingClient:
1641
  or "resource_exhausted" in error_status.lower()
1642
  ):
1643
  consecutive_quota_failures += 1
1644
- lib_logger.warning(
1645
- f"Credential ...{current_cred[-6:]} hit a quota limit. This is consecutive failure #{consecutive_quota_failures} for this request."
1646
- )
1647
 
1648
  quota_value = "N/A"
1649
  quota_id = "N/A"
@@ -1670,48 +1862,39 @@ class RotatingClient:
1670
  )
1671
 
1672
  if consecutive_quota_failures >= 3:
1673
- console_log_message = (
1674
- f"Terminating stream for credential ...{current_cred[-6:]} due to 3rd consecutive quota error. "
1675
- f"This is now considered a fatal input data error. ID: {quota_id}, Limit: {quota_value}."
1676
- )
1677
  client_error_message = (
1678
- "FATAL: Request failed after 3 consecutive quota errors, "
1679
- "indicating the input data is too large for the model's per-request limit. "
1680
- f"Last Error Message: '{error_message_text}'. Limit: {quota_value} (Quota ID: {quota_id})."
 
 
1681
  )
1682
- lib_logger.error(console_log_message)
1683
-
1684
  yield f"data: {json.dumps({'error': {'message': client_error_message, 'type': 'proxy_fatal_quota_error'}})}\n\n"
1685
  yield "data: [DONE]\n\n"
1686
  return
1687
-
1688
  else:
1689
- # [MODIFIED] Do not yield to the client. Just log and break to rotate the key.
1690
  lib_logger.warning(
1691
- f"Quota error on credential ...{current_cred[-6:]} (failure {consecutive_quota_failures}/3). Rotating key silently."
1692
  )
1693
  break
1694
 
1695
  else:
1696
  consecutive_quota_failures = 0
1697
- # [MODIFIED] Do not yield to the client. Just log and break to rotate the key.
1698
  lib_logger.warning(
1699
- f"Credential ...{current_cred[-6:]} encountered a recoverable error ({classified_error.error_type}) during stream. Rotating key silently."
1700
  )
1701
 
1702
- if (
1703
- classified_error.error_type == "rate_limit"
1704
- and classified_error.status_code == 429
1705
- ):
1706
  cooldown_duration = (
1707
  classified_error.retry_after or 60
1708
  )
1709
  await self.cooldown_manager.start_cooldown(
1710
  provider, cooldown_duration
1711
  )
1712
- lib_logger.warning(
1713
- f"IP-based rate limit detected for {provider}. Starting a {cooldown_duration}-second global cooldown."
1714
- )
1715
 
1716
  await self.usage_manager.record_failure(
1717
  current_cred, model, classified_error
@@ -1735,10 +1918,19 @@ class RotatingClient:
1735
  else {},
1736
  )
1737
  classified_error = classify_error(e)
 
 
 
 
 
 
 
1738
  # Provider-level error: don't increment consecutive failures
1739
  await self.usage_manager.record_failure(
1740
- current_cred, model, classified_error,
1741
- increment_consecutive_failures=False
 
 
1742
  )
1743
 
1744
  if attempt >= self.max_retries - 1:
@@ -1749,7 +1941,7 @@ class RotatingClient:
1749
  break
1750
 
1751
  wait_time = classified_error.retry_after or (
1752
- 1 * (2**attempt)
1753
  ) + random.uniform(0, 1)
1754
  remaining_budget = deadline - time.time()
1755
  if wait_time > remaining_budget:
@@ -1758,9 +1950,8 @@ class RotatingClient:
1758
  )
1759
  break
1760
 
1761
- error_message = str(e).split("\n")[0]
1762
  lib_logger.warning(
1763
- f"Credential ...{current_cred[-6:]} encountered a server error for model {model}. Reason: '{error_message}'. Retrying in {wait_time:.2f}s."
1764
  )
1765
  await asyncio.sleep(wait_time)
1766
  continue
@@ -1778,49 +1969,76 @@ class RotatingClient:
1778
  else {},
1779
  )
1780
  classified_error = classify_error(e)
 
 
 
 
 
 
1781
 
1782
  lib_logger.warning(
1783
- f"Credential ...{current_cred[-6:]} failed with {classified_error.error_type} (Status: {classified_error.status_code}). Error: {str(e)}. Rotating key."
1784
  )
1785
 
1786
- if classified_error.status_code == 429:
 
 
 
 
 
1787
  cooldown_duration = classified_error.retry_after or 60
1788
  await self.cooldown_manager.start_cooldown(
1789
  provider, cooldown_duration
1790
  )
1791
  lib_logger.warning(
1792
- f"IP-based rate limit detected for {provider} from generic stream exception. Starting a {cooldown_duration}-second global cooldown."
1793
  )
1794
 
1795
- if classified_error.error_type in [
1796
- "invalid_request",
1797
- "context_window_exceeded",
1798
- "authentication",
1799
- ]:
 
1800
  raise last_exception
1801
 
1802
- # [MODIFIED] Do not yield to the client here.
1803
  await self.usage_manager.record_failure(
1804
  current_cred, model, classified_error
1805
  )
 
 
 
1806
  break
1807
 
1808
  finally:
1809
  if key_acquired and current_cred:
1810
  await self.usage_manager.release_key(current_cred, model)
1811
 
1812
- final_error_message = "Failed to complete the streaming request: No available API keys after rotation or global timeout exceeded."
1813
- if last_exception:
1814
- final_error_message = f"Failed to complete the streaming request. Last error: {str(last_exception)}"
1815
- lib_logger.error(
1816
- f"Streaming request failed after trying all keys. Last error: {last_exception}"
1817
- )
 
 
 
 
1818
  else:
 
 
 
 
 
 
 
 
 
 
 
1819
  lib_logger.error(final_error_message)
1820
 
1821
- error_data = {
1822
- "error": {"message": final_error_message, "type": "proxy_error"}
1823
- }
1824
  yield f"data: {json.dumps(error_data)}\n\n"
1825
  yield "data: [DONE]\n\n"
1826
 
@@ -1868,11 +2086,13 @@ class RotatingClient:
1868
  # Handle iflow provider: remove stream_options to avoid HTTP 406
1869
  model = kwargs.get("model", "")
1870
  provider = model.split("/")[0] if "/" in model else ""
1871
-
1872
  if provider == "iflow" and "stream_options" in kwargs:
1873
- lib_logger.debug("Removing stream_options for iflow provider to avoid HTTP 406")
 
 
1874
  kwargs.pop("stream_options", None)
1875
-
1876
  if kwargs.get("stream"):
1877
  # Only add stream_options for providers that support it (excluding iflow)
1878
  if provider != "iflow":
@@ -1880,7 +2100,7 @@ class RotatingClient:
1880
  kwargs["stream_options"] = {}
1881
  if "include_usage" not in kwargs["stream_options"]:
1882
  kwargs["stream_options"]["include_usage"] = True
1883
-
1884
  return self._streaming_acompletion_with_retry(
1885
  request=request, pre_request_callback=pre_request_callback, **kwargs
1886
  )
 
25
  classify_error,
26
  AllProviders,
27
  NoAvailableKeysError,
28
+ should_rotate_on_error,
29
+ should_retry_same_key,
30
+ RequestErrorAccumulator,
31
+ mask_credential,
32
  )
33
  from .providers import PROVIDER_PLUGINS
34
  from .providers.openai_compatible_provider import OpenAICompatibleProvider
 
71
  ):
72
  """
73
  Initialize the RotatingClient with intelligent credential rotation.
74
+
75
  Args:
76
  api_keys: Dictionary mapping provider names to lists of API keys
77
  oauth_credentials: Dictionary mapping provider names to OAuth credential paths
 
140
  self.global_timeout = global_timeout
141
  self.abort_on_callback_error = abort_on_callback_error
142
  self.usage_manager = UsageManager(
143
+ file_path=usage_file_path, rotation_tolerance=rotation_tolerance
 
144
  )
145
  self._model_list_cache = {}
146
  self._provider_plugins = PROVIDER_PLUGINS
 
159
  # Validate all values are >= 1
160
  for provider, max_val in self.max_concurrent_requests_per_key.items():
161
  if max_val < 1:
162
+ lib_logger.warning(
163
+ f"Invalid max_concurrent for '{provider}': {max_val}. Setting to 1."
164
+ )
165
  self.max_concurrent_requests_per_key[provider] = 1
166
 
167
  def _is_model_ignored(self, provider: str, model_id: str) -> bool:
 
369
 
370
  return kwargs
371
 
372
+ def _apply_default_safety_settings(
373
+ self, litellm_kwargs: Dict[str, Any], provider: str
374
+ ):
375
  """
376
  Ensure default Gemini safety settings are present when calling the Gemini provider.
377
  This will not override any explicit settings provided by the request. It accepts
 
400
  ]
401
 
402
  # If generic form is present, ensure missing generic keys are filled in
403
+ if "safety_settings" in litellm_kwargs and isinstance(
404
+ litellm_kwargs["safety_settings"], dict
405
+ ):
406
  for k, v in default_generic.items():
407
  if k not in litellm_kwargs["safety_settings"]:
408
  litellm_kwargs["safety_settings"][k] = v
409
  return
410
 
411
  # If Gemini form is present, ensure missing gemini categories are appended
412
+ if "safetySettings" in litellm_kwargs and isinstance(
413
+ litellm_kwargs["safetySettings"], list
414
+ ):
415
+ present = {
416
+ item.get("category")
417
+ for item in litellm_kwargs["safetySettings"]
418
+ if isinstance(item, dict)
419
+ }
420
  for d in default_gemini:
421
  if d["category"] not in present:
422
  litellm_kwargs["safetySettings"].append(d)
423
  return
424
 
425
  # Neither present: set generic defaults so provider conversion will translate them
426
+ if (
427
+ "safety_settings" not in litellm_kwargs
428
+ and "safetySettings" not in litellm_kwargs
429
+ ):
430
  litellm_kwargs["safety_settings"] = default_generic.copy()
431
 
432
  def get_oauth_credentials(self) -> Dict[str, List[str]]:
 
444
  """
445
  Lazily initializes and returns a provider instance.
446
  Only initializes providers that have configured credentials.
447
+
448
  Args:
449
  provider_name: The name of the provider to get an instance for.
450
+
451
  Returns:
452
  Provider instance if credentials exist, None otherwise.
453
  """
 
457
  f"Skipping provider '{provider_name}' initialization: no credentials configured"
458
  )
459
  return None
460
+
461
  if provider_name not in self._provider_instances:
462
  if provider_name in self._provider_plugins:
463
  self._provider_instances[provider_name] = self._provider_plugins[
 
479
  def _resolve_model_id(self, model: str, provider: str) -> str:
480
  """
481
  Resolves the actual model ID to send to the provider.
482
+
483
  For custom models with name/ID mappings, returns the ID.
484
  Otherwise, returns the model name unchanged.
485
+
486
  Args:
487
  model: Full model string with provider (e.g., "iflow/DS-v3.2")
488
  provider: Provider name (e.g., "iflow")
489
+
490
  Returns:
491
  Full model string with ID (e.g., "iflow/deepseek-v3.2")
492
  """
493
  # Extract model name from "provider/model_name" format
494
+ model_name = model.split("/")[-1] if "/" in model else model
495
+
496
  # Try to get provider instance to check for model definitions
497
  provider_plugin = self._get_provider_instance(provider)
498
+
499
  # Check if provider has model definitions
500
+ if provider_plugin and hasattr(provider_plugin, "model_definitions"):
501
+ model_id = provider_plugin.model_definitions.get_model_id(
502
+ provider, model_name
503
+ )
504
  if model_id and model_id != model_name:
505
  # Return with provider prefix
506
  return f"{provider}/{model_id}"
507
+
508
  # Fallback: use client's own model definitions
509
  model_id = self.model_definitions.get_model_id(provider, model_name)
510
  if model_id and model_id != model_name:
511
  return f"{provider}/{model_id}"
512
+
513
  # No conversion needed, return original
514
  return model
515
 
 
516
  async def _safe_streaming_wrapper(
517
  self, stream: Any, key: str, model: str, request: Optional[Any] = None
518
  ) -> AsyncGenerator[Any, None]:
519
  """
520
  A hybrid wrapper for streaming that buffers fragmented JSON, handles client disconnections gracefully,
521
  and distinguishes between content and streamed errors.
522
+
523
  FINISH_REASON HANDLING:
524
  Providers just translate chunks - this wrapper handles ALL finish_reason logic:
525
  1. Strip finish_reason from intermediate chunks (litellm defaults to "stop")
 
556
  chunk_dict = chunk.model_dump()
557
  else:
558
  chunk_dict = chunk
559
+
560
  # === FINISH_REASON LOGIC ===
561
  # Providers send raw chunks without finish_reason logic.
562
  # This wrapper determines finish_reason based on accumulated state.
 
564
  choice = chunk_dict["choices"][0]
565
  delta = choice.get("delta", {})
566
  usage = chunk_dict.get("usage", {})
567
+
568
  # Track tool_calls across ALL chunks - if we ever see one, finish_reason must be tool_calls
569
  if delta.get("tool_calls"):
570
  has_tool_calls = True
571
  accumulated_finish_reason = "tool_calls"
572
+
573
  # Detect final chunk: has usage with completion_tokens > 0
574
  has_completion_tokens = (
575
+ usage
576
+ and isinstance(usage, dict)
577
+ and usage.get("completion_tokens", 0) > 0
578
  )
579
+
580
  if has_completion_tokens:
581
  # FINAL CHUNK: Determine correct finish_reason
582
  if has_tool_calls:
 
592
  # INTERMEDIATE CHUNK: Never emit finish_reason
593
  # (litellm.ModelResponse defaults to "stop" which is wrong)
594
  choice["finish_reason"] = None
595
+
596
  yield f"data: {json.dumps(chunk_dict)}\n\n"
597
 
598
  if hasattr(chunk, "usage") and chunk.usage:
 
741
  # multiple keys have the same usage stats.
742
  credentials_for_provider = list(self.all_credentials[provider])
743
  random.shuffle(credentials_for_provider)
744
+
745
  # Filter out credentials that are unavailable (queued for re-auth)
746
  provider_plugin = self._get_provider_instance(provider)
747
+ if provider_plugin and hasattr(provider_plugin, "is_credential_available"):
748
  available_creds = [
749
+ cred
750
+ for cred in credentials_for_provider
751
  if provider_plugin.is_credential_available(cred)
752
  ]
753
  if available_creds:
 
760
  kwargs = self._convert_model_params(**kwargs)
761
 
762
  # The main rotation loop. It continues as long as there are untried credentials and the global deadline has not been exceeded.
763
+
764
  # Resolve model ID early, before any credential operations
765
  # This ensures consistent model ID usage for acquisition, release, and tracking
766
  resolved_model = self._resolve_model_id(model, provider)
 
768
  lib_logger.info(f"Resolved model '{model}' to '{resolved_model}'")
769
  model = resolved_model
770
  kwargs["model"] = model # Ensure kwargs has the resolved model for litellm
771
+
772
  # [NEW] Filter by model tier requirement and build priority map
773
  credential_priorities = None
774
+ if provider_plugin and hasattr(provider_plugin, "get_model_tier_requirement"):
775
  required_tier = provider_plugin.get_model_tier_requirement(model)
776
  if required_tier is not None:
777
  # Filter OUT only credentials we KNOW are too low priority
 
779
  incompatible_creds = []
780
  compatible_creds = []
781
  unknown_creds = []
782
+
783
  for cred in credentials_for_provider:
784
+ if hasattr(provider_plugin, "get_credential_priority"):
785
  priority = provider_plugin.get_credential_priority(cred)
786
  if priority is None:
787
  # Unknown priority - keep it, will be discovered on first use
 
795
  else:
796
  # Provider doesn't support priorities - keep all
797
  unknown_creds.append(cred)
798
+
799
  # If we have any known-compatible or unknown credentials, use them
800
  tier_compatible_creds = compatible_creds + unknown_creds
801
  if tier_compatible_creds:
 
822
  f"but all {len(incompatible_creds)} known credentials have priority > {required_tier}. "
823
  f"Request will likely fail."
824
  )
825
+
826
  # Build priority map for usage_manager
827
+ if provider_plugin and hasattr(provider_plugin, "get_credential_priority"):
828
  credential_priorities = {}
829
  for cred in credentials_for_provider:
830
  priority = provider_plugin.get_credential_priority(cred)
831
  if priority is not None:
832
  credential_priorities[cred] = priority
833
+
834
  if credential_priorities:
835
  lib_logger.debug(
836
+ f"Credential priorities for {provider}: {', '.join(f'P{p}={len([c for c in credentials_for_provider if credential_priorities.get(c) == p])}' for p in sorted(set(credential_priorities.values())))}"
837
  )
838
 
839
+ # Initialize error accumulator for tracking errors across credential rotation
840
+ error_accumulator = RequestErrorAccumulator()
841
+ error_accumulator.model = model
842
+ error_accumulator.provider = provider
843
+
844
  while (
845
  len(tried_creds) < len(credentials_for_provider) and time.time() < deadline
846
  ):
 
877
  )
878
  max_concurrent = self.max_concurrent_requests_per_key.get(provider, 1)
879
  current_cred = await self.usage_manager.acquire_key(
880
+ available_keys=creds_to_try,
881
+ model=model,
882
+ deadline=deadline,
883
  max_concurrent=max_concurrent,
884
+ credential_priorities=credential_priorities,
885
  )
886
  key_acquired = True
887
  tried_creds.add(current_cred)
 
964
  if provider_instance:
965
  # Ensure default Gemini safety settings are present (without overriding request)
966
  try:
967
+ self._apply_default_safety_settings(
968
+ litellm_kwargs, provider
969
+ )
970
  except Exception:
971
  # If anything goes wrong here, avoid breaking the request flow.
972
+ lib_logger.debug(
973
+ "Could not apply default safety settings; continuing."
974
+ )
975
 
976
  if "safety_settings" in litellm_kwargs:
977
  converted_settings = (
 
1054
 
1055
  # Extract a clean error message for the user-facing log
1056
  error_message = str(e).split("\n")[0]
1057
+
1058
+ # Record in accumulator for client reporting
1059
+ error_accumulator.record_error(
1060
+ current_cred, classified_error, error_message
1061
+ )
1062
+
1063
  lib_logger.info(
1064
+ f"Key {mask_credential(current_cred)} hit rate limit for {model}. Rotating key."
1065
  )
1066
 
1067
  if classified_error.status_code == 429:
 
1069
  await self.cooldown_manager.start_cooldown(
1070
  provider, cooldown_duration
1071
  )
 
 
 
1072
 
1073
  await self.usage_manager.record_failure(
1074
  current_cred, model, classified_error
1075
  )
 
 
 
1076
  break # Move to the next key
1077
 
1078
  except (
 
1091
  else {},
1092
  )
1093
  classified_error = classify_error(e)
1094
+ error_message = str(e).split("\n")[0]
1095
+
1096
  # Provider-level error: don't increment consecutive failures
1097
  await self.usage_manager.record_failure(
1098
+ current_cred,
1099
+ model,
1100
+ classified_error,
1101
+ increment_consecutive_failures=False,
1102
  )
1103
 
1104
  if attempt >= self.max_retries - 1:
1105
+ # Record in accumulator only on final failure for this key
1106
+ error_accumulator.record_error(
1107
+ current_cred, classified_error, error_message
1108
+ )
1109
  lib_logger.warning(
1110
+ f"Key {mask_credential(current_cred)} failed after max retries due to server error. Rotating."
1111
  )
1112
  break # Move to the next key
1113
 
1114
  # For temporary errors, wait before retrying with the same key.
1115
  wait_time = classified_error.retry_after or (
1116
+ 2**attempt
1117
  ) + random.uniform(0, 1)
1118
  remaining_budget = deadline - time.time()
1119
 
1120
  # If the required wait time exceeds the budget, don't wait; rotate to the next key immediately.
1121
  if wait_time > remaining_budget:
1122
+ error_accumulator.record_error(
1123
+ current_cred, classified_error, error_message
1124
+ )
1125
  lib_logger.warning(
1126
+ f"Retry wait ({wait_time:.2f}s) exceeds budget ({remaining_budget:.2f}s). Rotating key."
1127
  )
1128
  break
1129
 
 
1130
  lib_logger.warning(
1131
+ f"Key {mask_credential(current_cred)} server error. Retrying in {wait_time:.2f}s."
1132
  )
1133
  await asyncio.sleep(wait_time)
1134
  continue # Retry with the same key
1135
 
1136
+ except httpx.HTTPStatusError as e:
1137
+ # Handle HTTP errors from httpx (e.g., from custom providers like Antigravity)
1138
+ last_exception = e
1139
+ log_failure(
1140
+ api_key=current_cred,
1141
+ model=model,
1142
+ attempt=attempt + 1,
1143
+ error=e,
1144
+ request_headers=dict(request.headers)
1145
+ if request
1146
+ else {},
1147
+ )
1148
+
1149
+ classified_error = classify_error(e)
1150
+ error_message = str(e).split("\n")[0]
1151
+
1152
+ lib_logger.warning(
1153
+ f"Key {mask_credential(current_cred)} HTTP {e.response.status_code} ({classified_error.error_type})."
1154
+ )
1155
+
1156
+ # Check if this error should trigger rotation
1157
+ if not should_rotate_on_error(classified_error):
1158
+ lib_logger.error(
1159
+ f"Non-recoverable error ({classified_error.error_type}). Failing request."
1160
+ )
1161
+ raise last_exception
1162
+
1163
+ # Record in accumulator after confirming it's a rotatable error
1164
+ error_accumulator.record_error(
1165
+ current_cred, classified_error, error_message
1166
+ )
1167
+
1168
+ # Handle rate limits with cooldown
1169
+ if classified_error.error_type in [
1170
+ "rate_limit",
1171
+ "quota_exceeded",
1172
+ ]:
1173
+ cooldown_duration = classified_error.retry_after or 60
1174
+ await self.cooldown_manager.start_cooldown(
1175
+ provider, cooldown_duration
1176
+ )
1177
+
1178
+ # Check if we should retry same key (server errors with retries left)
1179
+ if (
1180
+ should_retry_same_key(classified_error)
1181
+ and attempt < self.max_retries - 1
1182
+ ):
1183
+ wait_time = classified_error.retry_after or (
1184
+ 2**attempt
1185
+ ) + random.uniform(0, 1)
1186
+ remaining_budget = deadline - time.time()
1187
+ if wait_time <= remaining_budget:
1188
+ lib_logger.warning(
1189
+ f"Server error, retrying same key in {wait_time:.2f}s."
1190
+ )
1191
+ await asyncio.sleep(wait_time)
1192
+ continue
1193
+
1194
+ # Record failure and rotate to next key
1195
+ await self.usage_manager.record_failure(
1196
+ current_cred, model, classified_error
1197
+ )
1198
+ lib_logger.info(
1199
+ f"Rotating to next key after {classified_error.error_type} error."
1200
+ )
1201
+ break
1202
+
1203
  except Exception as e:
1204
  last_exception = e
1205
  log_failure(
 
1214
 
1215
  if request and await request.is_disconnected():
1216
  lib_logger.warning(
1217
+ f"Client disconnected. Aborting retries for {mask_credential(current_cred)}."
1218
  )
1219
  raise last_exception
1220
 
1221
  classified_error = classify_error(e)
1222
  error_message = str(e).split("\n")[0]
1223
+
1224
  lib_logger.warning(
1225
+ f"Key {mask_credential(current_cred)} {classified_error.error_type} (HTTP {classified_error.status_code})."
1226
  )
1227
+
1228
+ # Handle rate limits with cooldown
1229
+ if (
1230
+ classified_error.status_code == 429
1231
+ or classified_error.error_type
1232
+ in ["rate_limit", "quota_exceeded"]
1233
+ ):
1234
  cooldown_duration = classified_error.retry_after or 60
1235
  await self.cooldown_manager.start_cooldown(
1236
  provider, cooldown_duration
1237
  )
 
 
 
1238
 
1239
+ # Check if this error should trigger rotation
1240
+ if not should_rotate_on_error(classified_error):
1241
+ lib_logger.error(
1242
+ f"Non-recoverable error ({classified_error.error_type}). Failing request."
1243
+ )
 
1244
  raise last_exception
1245
 
1246
+ # Record in accumulator after confirming it's a rotatable error
1247
+ error_accumulator.record_error(
1248
+ current_cred, classified_error, error_message
1249
+ )
1250
+
1251
  await self.usage_manager.record_failure(
1252
  current_cred, model, classified_error
1253
  )
 
1256
  if key_acquired and current_cred:
1257
  await self.usage_manager.release_key(current_cred, model)
1258
 
1259
+ # Check if we exhausted all credentials or timed out
1260
+ if time.time() >= deadline:
1261
+ error_accumulator.timeout_occurred = True
1262
+
1263
+ if error_accumulator.has_errors():
1264
+ # Log concise summary for server logs
1265
+ lib_logger.error(error_accumulator.build_log_message())
1266
+
1267
+ # Return the structured error response for the client
1268
+ return error_accumulator.build_client_error_response()
1269
 
1270
+ # Return None to indicate failure without error details (shouldn't normally happen)
1271
+ lib_logger.warning(
1272
+ "Unexpected state: request failed with no recorded errors. "
1273
+ "This may indicate a logic error in error tracking."
1274
+ )
1275
  return None
1276
 
1277
  async def _streaming_acompletion_with_retry(
 
1287
  # Create a mutable copy of the keys and shuffle it.
1288
  credentials_for_provider = list(self.all_credentials[provider])
1289
  random.shuffle(credentials_for_provider)
1290
+
1291
  # Filter out credentials that are unavailable (queued for re-auth)
1292
  provider_plugin = self._get_provider_instance(provider)
1293
+ if provider_plugin and hasattr(provider_plugin, "is_credential_available"):
1294
  available_creds = [
1295
+ cred
1296
+ for cred in credentials_for_provider
1297
  if provider_plugin.is_credential_available(cred)
1298
  ]
1299
  if available_creds:
 
1315
  lib_logger.info(f"Resolved model '{model}' to '{resolved_model}'")
1316
  model = resolved_model
1317
  kwargs["model"] = model # Ensure kwargs has the resolved model for litellm
1318
+
1319
  # [NEW] Filter by model tier requirement and build priority map
1320
  credential_priorities = None
1321
+ if provider_plugin and hasattr(provider_plugin, "get_model_tier_requirement"):
1322
  required_tier = provider_plugin.get_model_tier_requirement(model)
1323
  if required_tier is not None:
1324
  # Filter OUT only credentials we KNOW are too low priority
 
1326
  incompatible_creds = []
1327
  compatible_creds = []
1328
  unknown_creds = []
1329
+
1330
  for cred in credentials_for_provider:
1331
+ if hasattr(provider_plugin, "get_credential_priority"):
1332
  priority = provider_plugin.get_credential_priority(cred)
1333
  if priority is None:
1334
  # Unknown priority - keep it, will be discovered on first use
 
1342
  else:
1343
  # Provider doesn't support priorities - keep all
1344
  unknown_creds.append(cred)
1345
+
1346
  # If we have any known-compatible or unknown credentials, use them
1347
  tier_compatible_creds = compatible_creds + unknown_creds
1348
  if tier_compatible_creds:
 
1369
  f"but all {len(incompatible_creds)} known credentials have priority > {required_tier}. "
1370
  f"Request will likely fail."
1371
  )
1372
+
1373
  # Build priority map for usage_manager
1374
+ if provider_plugin and hasattr(provider_plugin, "get_credential_priority"):
1375
  credential_priorities = {}
1376
  for cred in credentials_for_provider:
1377
  priority = provider_plugin.get_credential_priority(cred)
1378
  if priority is not None:
1379
  credential_priorities[cred] = priority
1380
+
1381
  if credential_priorities:
1382
  lib_logger.debug(
1383
+ f"Credential priorities for {provider}: {', '.join(f'P{p}={len([c for c in credentials_for_provider if credential_priorities.get(c) == p])}' for p in sorted(set(credential_priorities.values())))}"
1384
  )
1385
 
1386
+ # Initialize error accumulator for tracking errors across credential rotation
1387
+ error_accumulator = RequestErrorAccumulator()
1388
+ error_accumulator.model = model
1389
+ error_accumulator.provider = provider
1390
+
1391
  try:
1392
  while (
1393
  len(tried_creds) < len(credentials_for_provider)
 
1423
  lib_logger.info(
1424
  f"Acquiring credential for model {model}. Tried credentials: {len(tried_creds)}/{len(credentials_for_provider)}"
1425
  )
1426
+ max_concurrent = self.max_concurrent_requests_per_key.get(
1427
+ provider, 1
1428
+ )
1429
  current_cred = await self.usage_manager.acquire_key(
1430
+ available_keys=creds_to_try,
1431
+ model=model,
1432
+ deadline=deadline,
1433
  max_concurrent=max_concurrent,
1434
+ credential_priorities=credential_priorities,
1435
  )
1436
  key_acquired = True
1437
  tried_creds.add(current_cred)
 
1535
  litellm.RateLimitError,
1536
  httpx.HTTPStatusError,
1537
  ) as e:
 
 
 
 
 
 
1538
  last_exception = e
1539
  # If the exception is our custom wrapper, unwrap the original error
1540
  original_exc = getattr(e, "data", e)
1541
  classified_error = classify_error(original_exc)
1542
+ error_message = str(original_exc).split("\n")[0]
1543
+
1544
+ log_failure(
1545
+ api_key=current_cred,
1546
+ model=model,
1547
+ attempt=attempt + 1,
1548
+ error=e,
1549
+ request_headers=dict(request.headers)
1550
+ if request
1551
+ else {},
1552
+ )
1553
+
1554
+ # Record in accumulator for client reporting
1555
+ error_accumulator.record_error(
1556
+ current_cred, classified_error, error_message
1557
+ )
1558
+
1559
+ # Check if this error should trigger rotation
1560
+ if not should_rotate_on_error(classified_error):
1561
+ lib_logger.error(
1562
+ f"Non-recoverable error ({classified_error.error_type}) during custom stream. Failing."
1563
+ )
1564
+ raise last_exception
1565
+
1566
+ # Handle rate limits with cooldown
1567
+ if classified_error.error_type in [
1568
+ "rate_limit",
1569
+ "quota_exceeded",
1570
+ ]:
1571
+ cooldown_duration = (
1572
+ classified_error.retry_after or 60
1573
+ )
1574
+ await self.cooldown_manager.start_cooldown(
1575
+ provider, cooldown_duration
1576
+ )
1577
+
1578
  await self.usage_manager.record_failure(
1579
  current_cred, model, classified_error
1580
  )
1581
  lib_logger.warning(
1582
+ f"Cred {mask_credential(current_cred)} {classified_error.error_type} (HTTP {classified_error.status_code}). Rotating."
1583
  )
1584
  break
1585
 
 
1599
  else {},
1600
  )
1601
  classified_error = classify_error(e)
1602
+ error_message = str(e).split("\n")[0]
1603
+
1604
  # Provider-level error: don't increment consecutive failures
1605
  await self.usage_manager.record_failure(
1606
+ current_cred,
1607
+ model,
1608
+ classified_error,
1609
+ increment_consecutive_failures=False,
1610
  )
1611
 
1612
  if attempt >= self.max_retries - 1:
1613
+ error_accumulator.record_error(
1614
+ current_cred, classified_error, error_message
1615
+ )
1616
  lib_logger.warning(
1617
+ f"Cred {mask_credential(current_cred)} failed after max retries. Rotating."
1618
  )
1619
  break
1620
 
1621
  wait_time = classified_error.retry_after or (
1622
+ 2**attempt
1623
  ) + random.uniform(0, 1)
1624
  remaining_budget = deadline - time.time()
1625
  if wait_time > remaining_budget:
1626
+ error_accumulator.record_error(
1627
+ current_cred, classified_error, error_message
1628
+ )
1629
  lib_logger.warning(
1630
+ f"Retry wait ({wait_time:.2f}s) exceeds budget. Rotating."
1631
  )
1632
  break
1633
 
 
1634
  lib_logger.warning(
1635
+ f"Cred {mask_credential(current_cred)} server error. Retrying in {wait_time:.2f}s."
1636
  )
1637
  await asyncio.sleep(wait_time)
1638
  continue
 
1649
  else {},
1650
  )
1651
  classified_error = classify_error(e)
1652
+ error_message = str(e).split("\n")[0]
1653
+
1654
+ # Record in accumulator
1655
+ error_accumulator.record_error(
1656
+ current_cred, classified_error, error_message
1657
+ )
1658
+
1659
  lib_logger.warning(
1660
+ f"Cred {mask_credential(current_cred)} {classified_error.error_type} (HTTP {classified_error.status_code})."
1661
  )
1662
+
1663
+ # Check if this error should trigger rotation
1664
+ if not should_rotate_on_error(classified_error):
1665
+ lib_logger.error(
1666
+ f"Non-recoverable error ({classified_error.error_type}). Failing."
1667
+ )
1668
  raise last_exception
1669
+
1670
  await self.usage_manager.record_failure(
1671
  current_cred, model, classified_error
1672
  )
 
1688
  if provider_instance:
1689
  # Ensure default Gemini safety settings are present (without overriding request)
1690
  try:
1691
+ self._apply_default_safety_settings(
1692
+ litellm_kwargs, provider
1693
+ )
1694
  except Exception:
1695
+ lib_logger.debug(
1696
+ "Could not apply default safety settings for streaming path; continuing."
1697
+ )
1698
 
1699
  if "safety_settings" in litellm_kwargs:
1700
  converted_settings = (
 
1775
  yield chunk
1776
  return
1777
 
1778
+ except (
1779
+ StreamedAPIError,
1780
+ litellm.RateLimitError,
1781
+ httpx.HTTPStatusError,
1782
+ ) as e:
1783
  last_exception = e
1784
 
1785
  # This is the final, robust handler for streamed errors.
 
1789
  original_exc = getattr(e, "data", e)
1790
  classified_error = classify_error(original_exc)
1791
 
1792
+ # Check if this error should trigger rotation
1793
+ if not should_rotate_on_error(classified_error):
1794
+ lib_logger.error(
1795
+ f"Non-recoverable error ({classified_error.error_type}) during litellm stream. Failing."
1796
+ )
1797
+ raise last_exception
1798
+
1799
  try:
1800
  # The full error JSON is in the string representation of the exception.
1801
  json_str_match = re.search(
1802
  r"(\{.*\})", str(original_exc), re.DOTALL
1803
  )
1804
  if json_str_match:
 
1805
  cleaned_str = codecs.decode(
1806
  json_str_match.group(1), "unicode_escape"
1807
  )
1808
  error_payload = json.loads(cleaned_str)
1809
  except (json.JSONDecodeError, TypeError):
 
 
 
1810
  error_payload = {}
1811
 
 
1812
  log_failure(
1813
  api_key=current_cred,
1814
  model=model,
 
1822
 
1823
  error_details = error_payload.get("error", {})
1824
  error_status = error_details.get("status", "")
 
1825
  error_message_text = error_details.get(
1826
+ "message", str(original_exc).split("\n")[0]
1827
+ )
1828
+
1829
+ # Record in accumulator for client reporting
1830
+ error_accumulator.record_error(
1831
+ current_cred, classified_error, error_message_text
1832
  )
1833
 
1834
  if (
 
1836
  or "resource_exhausted" in error_status.lower()
1837
  ):
1838
  consecutive_quota_failures += 1
 
 
 
1839
 
1840
  quota_value = "N/A"
1841
  quota_id = "N/A"
 
1862
  )
1863
 
1864
  if consecutive_quota_failures >= 3:
1865
+ # Fatal: likely input data too large
 
 
 
1866
  client_error_message = (
1867
+ f"Request failed after 3 consecutive quota errors (input may be too large). "
1868
+ f"Limit: {quota_value} (Quota ID: {quota_id})"
1869
+ )
1870
+ lib_logger.error(
1871
+ f"Fatal quota error for {mask_credential(current_cred)}. ID: {quota_id}, Limit: {quota_value}"
1872
  )
 
 
1873
  yield f"data: {json.dumps({'error': {'message': client_error_message, 'type': 'proxy_fatal_quota_error'}})}\n\n"
1874
  yield "data: [DONE]\n\n"
1875
  return
 
1876
  else:
 
1877
  lib_logger.warning(
1878
+ f"Cred {mask_credential(current_cred)} quota error ({consecutive_quota_failures}/3). Rotating."
1879
  )
1880
  break
1881
 
1882
  else:
1883
  consecutive_quota_failures = 0
 
1884
  lib_logger.warning(
1885
+ f"Cred {mask_credential(current_cred)} {classified_error.error_type}. Rotating."
1886
  )
1887
 
1888
+ if classified_error.error_type in [
1889
+ "rate_limit",
1890
+ "quota_exceeded",
1891
+ ]:
1892
  cooldown_duration = (
1893
  classified_error.retry_after or 60
1894
  )
1895
  await self.cooldown_manager.start_cooldown(
1896
  provider, cooldown_duration
1897
  )
 
 
 
1898
 
1899
  await self.usage_manager.record_failure(
1900
  current_cred, model, classified_error
 
1918
  else {},
1919
  )
1920
  classified_error = classify_error(e)
1921
+ error_message_text = str(e).split("\n")[0]
1922
+
1923
+ # Record error in accumulator (server errors are transient, not abnormal)
1924
+ error_accumulator.record_error(
1925
+ current_cred, classified_error, error_message_text
1926
+ )
1927
+
1928
  # Provider-level error: don't increment consecutive failures
1929
  await self.usage_manager.record_failure(
1930
+ current_cred,
1931
+ model,
1932
+ classified_error,
1933
+ increment_consecutive_failures=False,
1934
  )
1935
 
1936
  if attempt >= self.max_retries - 1:
 
1941
  break
1942
 
1943
  wait_time = classified_error.retry_after or (
1944
+ 2**attempt
1945
  ) + random.uniform(0, 1)
1946
  remaining_budget = deadline - time.time()
1947
  if wait_time > remaining_budget:
 
1950
  )
1951
  break
1952
 
 
1953
  lib_logger.warning(
1954
+ f"Credential ...{current_cred[-6:]} encountered a server error for model {model}. Reason: '{error_message_text}'. Retrying in {wait_time:.2f}s."
1955
  )
1956
  await asyncio.sleep(wait_time)
1957
  continue
 
1969
  else {},
1970
  )
1971
  classified_error = classify_error(e)
1972
+ error_message_text = str(e).split("\n")[0]
1973
+
1974
+ # Record error in accumulator
1975
+ error_accumulator.record_error(
1976
+ current_cred, classified_error, error_message_text
1977
+ )
1978
 
1979
  lib_logger.warning(
1980
+ f"Credential ...{current_cred[-6:]} failed with {classified_error.error_type} (Status: {classified_error.status_code}). Error: {error_message_text}."
1981
  )
1982
 
1983
+ # Handle rate limits with cooldown
1984
+ if (
1985
+ classified_error.status_code == 429
1986
+ or classified_error.error_type
1987
+ in ["rate_limit", "quota_exceeded"]
1988
+ ):
1989
  cooldown_duration = classified_error.retry_after or 60
1990
  await self.cooldown_manager.start_cooldown(
1991
  provider, cooldown_duration
1992
  )
1993
  lib_logger.warning(
1994
+ f"Rate limit detected for {provider}. Starting {cooldown_duration}s cooldown."
1995
  )
1996
 
1997
+ # Check if this error should trigger rotation
1998
+ if not should_rotate_on_error(classified_error):
1999
+ # Non-rotatable errors - fail immediately
2000
+ lib_logger.error(
2001
+ f"Non-recoverable error ({classified_error.error_type}). Failing request."
2002
+ )
2003
  raise last_exception
2004
 
2005
+ # Record failure and rotate to next key
2006
  await self.usage_manager.record_failure(
2007
  current_cred, model, classified_error
2008
  )
2009
+ lib_logger.info(
2010
+ f"Rotating to next key after {classified_error.error_type} error."
2011
+ )
2012
  break
2013
 
2014
  finally:
2015
  if key_acquired and current_cred:
2016
  await self.usage_manager.release_key(current_cred, model)
2017
 
2018
+ # Build detailed error response using error accumulator
2019
+ error_accumulator.timeout_occurred = time.time() >= deadline
2020
+
2021
+ if error_accumulator.has_errors():
2022
+ # Log concise summary for server logs
2023
+ lib_logger.error(error_accumulator.build_log_message())
2024
+
2025
+ # Build structured error response for client
2026
+ error_response = error_accumulator.build_client_error_response()
2027
+ error_data = error_response
2028
  else:
2029
+ # Fallback if no errors were recorded (shouldn't happen)
2030
+ final_error_message = (
2031
+ "Request failed: No available API keys after rotation or timeout."
2032
+ )
2033
+ if last_exception:
2034
+ final_error_message = (
2035
+ f"Request failed. Last error: {str(last_exception)}"
2036
+ )
2037
+ error_data = {
2038
+ "error": {"message": final_error_message, "type": "proxy_error"}
2039
+ }
2040
  lib_logger.error(final_error_message)
2041
 
 
 
 
2042
  yield f"data: {json.dumps(error_data)}\n\n"
2043
  yield "data: [DONE]\n\n"
2044
 
 
2086
  # Handle iflow provider: remove stream_options to avoid HTTP 406
2087
  model = kwargs.get("model", "")
2088
  provider = model.split("/")[0] if "/" in model else ""
2089
+
2090
  if provider == "iflow" and "stream_options" in kwargs:
2091
+ lib_logger.debug(
2092
+ "Removing stream_options for iflow provider to avoid HTTP 406"
2093
+ )
2094
  kwargs.pop("stream_options", None)
2095
+
2096
  if kwargs.get("stream"):
2097
  # Only add stream_options for providers that support it (excluding iflow)
2098
  if provider != "iflow":
 
2100
  kwargs["stream_options"] = {}
2101
  if "include_usage" not in kwargs["stream_options"]:
2102
  kwargs["stream_options"]["include_usage"] = True
2103
+
2104
  return self._streaming_acompletion_with_retry(
2105
  request=request, pre_request_callback=pre_request_callback, **kwargs
2106
  )
src/rotator_library/error_handler.py CHANGED
@@ -1,5 +1,6 @@
1
  import re
2
  import json
 
3
  from typing import Optional, Dict, Any
4
  import httpx
5
 
@@ -20,20 +21,20 @@ from litellm.exceptions import (
20
  def extract_retry_after_from_body(error_body: Optional[str]) -> Optional[int]:
21
  """
22
  Extract the retry-after time from an API error response body.
23
-
24
  Handles various error formats including:
25
  - Gemini CLI: "Your quota will reset after 39s."
26
  - Generic: "quota will reset after 120s", "retry after 60s"
27
-
28
  Args:
29
  error_body: The raw error response body
30
-
31
  Returns:
32
  The retry time in seconds, or None if not found
33
  """
34
  if not error_body:
35
  return None
36
-
37
  # Pattern to match various "reset after Xs" or "retry after Xs" formats
38
  patterns = [
39
  r"quota will reset after\s*(\d+)s",
@@ -41,7 +42,7 @@ def extract_retry_after_from_body(error_body: Optional[str]) -> Optional[int]:
41
  r"retry after\s*(\d+)s",
42
  r"try again in\s*(\d+)\s*seconds?",
43
  ]
44
-
45
  for pattern in patterns:
46
  match = re.search(pattern, error_body, re.IGNORECASE)
47
  if match:
@@ -49,7 +50,7 @@ def extract_retry_after_from_body(error_body: Optional[str]) -> Optional[int]:
49
  return int(match.group(1))
50
  except (ValueError, IndexError):
51
  continue
52
-
53
  return None
54
 
55
 
@@ -65,6 +66,227 @@ class PreRequestCallbackError(Exception):
65
  pass
66
 
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  class ClassifiedError:
69
  """A structured representation of a classified error."""
70
 
@@ -94,7 +316,7 @@ def get_retry_after(error: Exception) -> Optional[int]:
94
  if isinstance(error, httpx.HTTPStatusError):
95
  headers = error.response.headers
96
  # Check standard Retry-After header (case-insensitive)
97
- retry_header = headers.get('retry-after') or headers.get('Retry-After')
98
  if retry_header:
99
  try:
100
  return int(retry_header) # Assumes seconds format
@@ -102,10 +324,13 @@ def get_retry_after(error: Exception) -> Optional[int]:
102
  pass # Might be HTTP date format, skip for now
103
 
104
  # Check X-RateLimit-Reset header (Unix timestamp)
105
- reset_header = headers.get('x-ratelimit-reset') or headers.get('X-RateLimit-Reset')
 
 
106
  if reset_header:
107
  try:
108
  import time
 
109
  reset_timestamp = int(reset_header)
110
  current_time = int(time.time())
111
  wait_seconds = reset_timestamp - current_time
@@ -155,16 +380,16 @@ def get_retry_after(error: Exception) -> Optional[int]:
155
  continue
156
 
157
  # 3. Handle duration formats like "60s", "2m", "1h"
158
- duration_match = re.search(r'(\d+)\s*([smh])', error_str)
159
  if duration_match:
160
  try:
161
  value = int(duration_match.group(1))
162
  unit = duration_match.group(2)
163
- if unit == 's':
164
  return value
165
- elif unit == 'm':
166
  return value * 60
167
- elif unit == 'h':
168
  return value * 3600
169
  except (ValueError, IndexError):
170
  pass
@@ -179,15 +404,15 @@ def get_retry_after(error: Exception) -> Optional[int]:
179
  if value.isdigit():
180
  return int(value)
181
  # Handle "60s", "2m" format in attribute
182
- duration_match = re.search(r'(\d+)\s*([smh])', value.lower())
183
  if duration_match:
184
  val = int(duration_match.group(1))
185
  unit = duration_match.group(2)
186
- if unit == 's':
187
  return val
188
- elif unit == 'm':
189
  return val * 60
190
- elif unit == 'h':
191
  return val * 3600
192
 
193
  return None
@@ -197,25 +422,89 @@ def classify_error(e: Exception) -> ClassifiedError:
197
  """
198
  Classifies an exception into a structured ClassifiedError object.
199
  Now handles both litellm and httpx exceptions.
 
 
 
 
 
 
 
 
 
 
 
200
  """
201
  status_code = getattr(e, "status_code", None)
 
202
  if isinstance(e, httpx.HTTPStatusError): # [NEW] Handle httpx errors first
203
  status_code = e.response.status_code
 
 
 
 
 
 
 
204
  if status_code == 401:
205
  return ClassifiedError(
206
  error_type="authentication",
207
  original_exception=e,
208
  status_code=status_code,
209
  )
 
 
 
 
 
 
 
 
210
  if status_code == 429:
211
  retry_after = get_retry_after(e)
 
 
 
 
 
 
 
 
212
  return ClassifiedError(
213
  error_type="rate_limit",
214
  original_exception=e,
215
  status_code=status_code,
216
  retry_after=retry_after,
217
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  if 400 <= status_code < 500:
 
219
  return ClassifiedError(
220
  error_type="invalid_request",
221
  original_exception=e,
@@ -313,6 +602,52 @@ def is_unrecoverable_error(e: Exception) -> bool:
313
  return isinstance(e, (InvalidRequestError, AuthenticationError, BadRequestError))
314
 
315
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  class AllProviders:
317
  """
318
  A class to handle provider-specific settings, such as custom API bases.
 
1
  import re
2
  import json
3
+ import os
4
  from typing import Optional, Dict, Any
5
  import httpx
6
 
 
21
  def extract_retry_after_from_body(error_body: Optional[str]) -> Optional[int]:
22
  """
23
  Extract the retry-after time from an API error response body.
24
+
25
  Handles various error formats including:
26
  - Gemini CLI: "Your quota will reset after 39s."
27
  - Generic: "quota will reset after 120s", "retry after 60s"
28
+
29
  Args:
30
  error_body: The raw error response body
31
+
32
  Returns:
33
  The retry time in seconds, or None if not found
34
  """
35
  if not error_body:
36
  return None
37
+
38
  # Pattern to match various "reset after Xs" or "retry after Xs" formats
39
  patterns = [
40
  r"quota will reset after\s*(\d+)s",
 
42
  r"retry after\s*(\d+)s",
43
  r"try again in\s*(\d+)\s*seconds?",
44
  ]
45
+
46
  for pattern in patterns:
47
  match = re.search(pattern, error_body, re.IGNORECASE)
48
  if match:
 
50
  return int(match.group(1))
51
  except (ValueError, IndexError):
52
  continue
53
+
54
  return None
55
 
56
 
 
66
  pass
67
 
68
 
69
# =============================================================================
# ERROR TRACKING FOR CLIENT REPORTING
# =============================================================================

# Error types that signal a credential-level problem (revoked key, denied
# access, internal callback failure). These are always itemized for the client.
ABNORMAL_ERROR_TYPES = frozenset(
    (
        "forbidden",  # 403 - credential access issue
        "authentication",  # 401 - credential invalid/revoked
        "pre_request_callback_error",  # Internal proxy error
    )
)

# Error types considered routine while rotating credentials; they are only
# summarized for the client once every credential has failed.
NORMAL_ERROR_TYPES = frozenset(
    (
        "rate_limit",  # 429 - expected during high load
        "quota_exceeded",  # Expected when quota runs out
        "server_error",  # 5xx - transient provider issues
        "api_connection",  # Network issues - transient
    )
)


def is_abnormal_error(classified_error: "ClassifiedError") -> bool:
    """
    Return True when the classified error indicates a credential problem that
    deserves operator attention (401 invalid/revoked, 403 access denied,
    internal callback failure), as opposed to routine failures such as rate
    limits or transient 5xx responses.
    """
    error_kind = classified_error.error_type
    return error_kind in ABNORMAL_ERROR_TYPES
106
+
107
+
108
def mask_credential(credential: str) -> str:
    """
    Return a redacted form of a credential that is safe to log or show.

    - OAuth credentials stored as files on disk are reduced to the bare
      filename (e.g. "antigravity_oauth_1.json").
    - API keys keep only their last 6 characters (e.g. "...xyz123"); keys of
      6 characters or fewer are fully hidden as "***".
    """
    # A credential that resolves to an existing file is an OAuth file path.
    if os.path.isfile(credential):
        return os.path.basename(credential)
    # Plain API key: reveal a short suffix, or nothing for very short keys.
    return f"...{credential[-6:]}" if len(credential) > 6 else "***"
121
+
122
+
123
class RequestErrorAccumulator:
    """
    Collects every error seen while rotating credentials for a single request.

    When the rotation cycle ends without success, the accumulated records are
    turned into a structured client-facing error payload and a compact server
    log line. Credential-level problems ("abnormal") are itemized, while
    routine failures ("normal", e.g. rate limits) are only summarized.
    """

    def __init__(self):
        # Itemized records for credential issues (401/403/...) - always shown.
        self.abnormal_errors: list = []
        # Records for expected failures (429/5xx/...) - summarized only.
        self.normal_errors: list = []
        # Unique credentials attempted during this request.
        self._tried_credentials: set = set()
        self.timeout_occurred: bool = False
        self.model: str = ""
        self.provider: str = ""

    def record_error(
        self, credential: str, classified_error: "ClassifiedError", error_message: str
    ):
        """Record an error for a credential."""
        self._tried_credentials.add(credential)
        record = {
            "credential": mask_credential(credential),
            "error_type": classified_error.error_type,
            "status_code": classified_error.status_code,
            "message": self._truncate_message(error_message, 150),
        }
        # Route the record into the itemized or the summarized bucket.
        if is_abnormal_error(classified_error):
            self.abnormal_errors.append(record)
        else:
            self.normal_errors.append(record)

    @property
    def total_credentials_tried(self) -> int:
        """Number of distinct credentials attempted so far."""
        return len(self._tried_credentials)

    def _truncate_message(self, message: str, max_length: int = 150) -> str:
        """Reduce an error message to a single readable line of bounded length."""
        head = message.partition("\n")[0]
        return head if len(head) <= max_length else head[:max_length] + "..."

    def has_errors(self) -> bool:
        """True when at least one error of any kind was recorded."""
        return bool(self.abnormal_errors) or bool(self.normal_errors)

    def has_abnormal_errors(self) -> bool:
        """True when at least one credential-level error was recorded."""
        return bool(self.abnormal_errors)

    def get_normal_error_summary(self) -> str:
        """Summarize normal errors by type, e.g. "3 rate_limit, 1 server_error"."""
        if not self.normal_errors:
            return ""
        # Tally occurrences per error type, preserving first-seen order.
        tallies: dict = {}
        for record in self.normal_errors:
            kind = record["error_type"]
            tallies[kind] = tallies.get(kind, 0) + 1
        return ", ".join(f"{count} {kind}" for kind, count in tallies.items())

    def build_client_error_response(self) -> dict:
        """
        Build the structured, JSON-serializable error payload returned to the
        client when the request ultimately fails.
        """
        tried = self.total_credentials_tried
        # Primary failure reason: global deadline vs. exhausted rotation.
        if self.timeout_occurred:
            error_type = "proxy_timeout"
            message_parts = [
                f"Request timed out after trying {tried} credential(s)"
            ]
        else:
            error_type = "proxy_all_credentials_exhausted"
            message_parts = [
                f"All {tried} credential(s) exhausted for {self.provider}"
            ]

        # Itemize credential issues - these need human attention.
        if self.abnormal_errors:
            message_parts.append("\n\nCredential issues (require attention):")
            for record in self.abnormal_errors:
                if record["status_code"] is not None:
                    status = f"HTTP {record['status_code']}"
                else:
                    status = record["error_type"]
                message_parts.append(
                    f"\n • {record['credential']}: {status} - {record['message']}"
                )

        normal_summary = self.get_normal_error_summary()
        if normal_summary:
            if self.abnormal_errors:
                message_parts.append(
                    f"\n\nAdditionally: {normal_summary} (expected during normal operation)"
                )
            else:
                message_parts.append(f"\n\nAll failures were: {normal_summary}")
                message_parts.append(
                    "\nThis is normal during high load - retry later or add more credentials."
                )

        details = {
            "model": self.model,
            "provider": self.provider,
            "credentials_tried": tried,
            "timeout": self.timeout_occurred,
        }
        # Only abnormal errors are included verbatim; normal ones stay summarized.
        if self.abnormal_errors:
            details["abnormal_errors"] = self.abnormal_errors
        if normal_summary:
            details["normal_error_summary"] = normal_summary

        return {
            "error": {
                "message": "".join(message_parts),
                "type": error_type,
                "details": details,
            }
        }

    def build_log_message(self) -> str:
        """
        Produce a compact one-line summary suitable for server-side logs,
        much shorter than the client-facing message.
        """
        if self.timeout_occurred:
            segments = [
                f"TIMEOUT: {self.total_credentials_tried} creds tried for {self.model}"
            ]
        else:
            segments = [
                f"ALL CREDS EXHAUSTED: {self.total_credentials_tried} tried for {self.model}"
            ]

        if self.abnormal_errors:
            issue_list = ", ".join(
                f"{e['credential']}={e['status_code'] or e['error_type']}"
                for e in self.abnormal_errors
            )
            segments.append(f"ISSUES: {issue_list}")

        normal_summary = self.get_normal_error_summary()
        if normal_summary:
            segments.append(f"Normal: {normal_summary}")

        return " | ".join(segments)
288
+
289
+
290
  class ClassifiedError:
291
  """A structured representation of a classified error."""
292
 
 
316
  if isinstance(error, httpx.HTTPStatusError):
317
  headers = error.response.headers
318
  # Check standard Retry-After header (case-insensitive)
319
+ retry_header = headers.get("retry-after") or headers.get("Retry-After")
320
  if retry_header:
321
  try:
322
  return int(retry_header) # Assumes seconds format
 
324
  pass # Might be HTTP date format, skip for now
325
 
326
  # Check X-RateLimit-Reset header (Unix timestamp)
327
+ reset_header = headers.get("x-ratelimit-reset") or headers.get(
328
+ "X-RateLimit-Reset"
329
+ )
330
  if reset_header:
331
  try:
332
  import time
333
+
334
  reset_timestamp = int(reset_header)
335
  current_time = int(time.time())
336
  wait_seconds = reset_timestamp - current_time
 
380
  continue
381
 
382
  # 3. Handle duration formats like "60s", "2m", "1h"
383
+ duration_match = re.search(r"(\d+)\s*([smh])", error_str)
384
  if duration_match:
385
  try:
386
  value = int(duration_match.group(1))
387
  unit = duration_match.group(2)
388
+ if unit == "s":
389
  return value
390
+ elif unit == "m":
391
  return value * 60
392
+ elif unit == "h":
393
  return value * 3600
394
  except (ValueError, IndexError):
395
  pass
 
404
  if value.isdigit():
405
  return int(value)
406
  # Handle "60s", "2m" format in attribute
407
+ duration_match = re.search(r"(\d+)\s*([smh])", value.lower())
408
  if duration_match:
409
  val = int(duration_match.group(1))
410
  unit = duration_match.group(2)
411
+ if unit == "s":
412
  return val
413
+ elif unit == "m":
414
  return val * 60
415
+ elif unit == "h":
416
  return val * 3600
417
 
418
  return None
 
422
  """
423
  Classifies an exception into a structured ClassifiedError object.
424
  Now handles both litellm and httpx exceptions.
425
+
426
+ Error types and their typical handling:
427
+ - rate_limit (429): Rotate key, may retry with backoff
428
+ - server_error (5xx): Retry with backoff, then rotate
429
+ - forbidden (403): Rotate key immediately (access denied for this credential)
430
+ - authentication (401): Rotate key, trigger re-auth if OAuth
431
+ - quota_exceeded: Rotate key (credential quota exhausted)
432
+ - invalid_request (400): Don't retry - client error in request
433
+ - context_window_exceeded: Don't retry - request too large
434
+ - api_connection: Retry with backoff, then rotate
435
+ - unknown: Rotate key (safer to try another)
436
  """
437
  status_code = getattr(e, "status_code", None)
438
+
439
  if isinstance(e, httpx.HTTPStatusError): # [NEW] Handle httpx errors first
440
  status_code = e.response.status_code
441
+
442
+ # Try to get error body for better classification
443
+ try:
444
+ error_body = e.response.text.lower() if hasattr(e.response, "text") else ""
445
+ except Exception:
446
+ error_body = ""
447
+
448
  if status_code == 401:
449
  return ClassifiedError(
450
  error_type="authentication",
451
  original_exception=e,
452
  status_code=status_code,
453
  )
454
+ if status_code == 403:
455
+ # 403 Forbidden - credential doesn't have access, should rotate
456
+ # Could be: IP restriction, account disabled, permission denied, etc.
457
+ return ClassifiedError(
458
+ error_type="forbidden",
459
+ original_exception=e,
460
+ status_code=status_code,
461
+ )
462
  if status_code == 429:
463
  retry_after = get_retry_after(e)
464
+ # Check if this is a quota error vs rate limit
465
+ if "quota" in error_body or "resource_exhausted" in error_body:
466
+ return ClassifiedError(
467
+ error_type="quota_exceeded",
468
+ original_exception=e,
469
+ status_code=status_code,
470
+ retry_after=retry_after,
471
+ )
472
  return ClassifiedError(
473
  error_type="rate_limit",
474
  original_exception=e,
475
  status_code=status_code,
476
  retry_after=retry_after,
477
  )
478
+ if status_code == 400:
479
+ # Check for context window / token limit errors with more specific patterns
480
+ if any(
481
+ pattern in error_body
482
+ for pattern in [
483
+ "context_length",
484
+ "max_tokens",
485
+ "token limit",
486
+ "context window",
487
+ "too many tokens",
488
+ "too long",
489
+ ]
490
+ ):
491
+ return ClassifiedError(
492
+ error_type="context_window_exceeded",
493
+ original_exception=e,
494
+ status_code=status_code,
495
+ )
496
+ return ClassifiedError(
497
+ error_type="invalid_request",
498
+ original_exception=e,
499
+ status_code=status_code,
500
+ )
501
+ return ClassifiedError(
502
+ error_type="invalid_request",
503
+ original_exception=e,
504
+ status_code=status_code,
505
+ )
506
  if 400 <= status_code < 500:
507
+ # Other 4xx errors - generally client errors
508
  return ClassifiedError(
509
  error_type="invalid_request",
510
  original_exception=e,
 
602
  return isinstance(e, (InvalidRequestError, AuthenticationError, BadRequestError))
603
 
604
 
605
def should_rotate_on_error(classified_error: ClassifiedError) -> bool:
    """
    Decide whether an error warrants moving on to the next key.

    Rotation makes sense for anything that might succeed with a different
    credential: rate_limit, quota_exceeded, forbidden, authentication,
    server_error, api_connection, and unknown errors.

    Rotation is pointless - and the request should fail immediately - when
    the request itself is at fault or the proxy's own hook failed:
    invalid_request, context_window_exceeded, pre_request_callback_error.

    Returns:
        True if should rotate to next key, False if should fail immediately
    """
    return classified_error.error_type not in {
        "invalid_request",
        "context_window_exceeded",
        "pre_request_callback_error",
    }
632
+
633
+
634
def should_retry_same_key(classified_error: ClassifiedError) -> bool:
    """
    Decide whether an error is worth retrying on the same key (with backoff).

    Only transient conditions qualify: provider-side server errors and
    network/connection failures. Everything else should rotate immediately.

    Returns:
        True if should retry same key, False if should rotate immediately
    """
    return classified_error.error_type in {"server_error", "api_connection"}
649
+
650
+
651
  class AllProviders:
652
  """
653
  A class to handle provider-specific settings, such as custom API bases.
src/rotator_library/failure_logger.py CHANGED
@@ -4,6 +4,7 @@ from logging.handlers import RotatingFileHandler
4
  import os
5
  from datetime import datetime
6
 
 
7
  def setup_failure_logger():
8
  """Sets up a dedicated JSON logger for writing detailed failure logs to a file."""
9
  log_dir = "logs"
@@ -12,15 +13,15 @@ def setup_failure_logger():
12
 
13
  # Create a logger specifically for failures.
14
  # This logger will NOT propagate to the root logger.
15
- logger = logging.getLogger('failure_logger')
16
  logger.setLevel(logging.INFO)
17
  logger.propagate = False
18
 
19
  # Use a rotating file handler
20
  handler = RotatingFileHandler(
21
- os.path.join(log_dir, 'failures.log'),
22
- maxBytes=5*1024*1024, # 5 MB
23
- backupCount=2
24
  )
25
 
26
  # Custom JSON formatter for structured logs
@@ -30,45 +31,124 @@ def setup_failure_logger():
30
  return json.dumps(record.msg)
31
 
32
  handler.setFormatter(JsonFormatter())
33
-
34
  # Add handler only if it hasn't been added before
35
  if not logger.handlers:
36
  logger.addHandler(handler)
37
 
38
  return logger
39
 
 
40
  # Initialize the dedicated logger for detailed failure logs
41
  failure_logger = setup_failure_logger()
42
 
43
  # Get the main library logger for concise, propagated messages
44
- main_lib_logger = logging.getLogger('rotator_library')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- def log_failure(api_key: str, model: str, attempt: int, error: Exception, request_headers: dict, raw_response_text: str = None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  """
48
  Logs a detailed failure message to a file and a concise summary to the main logger.
 
 
 
 
 
 
 
 
49
  """
50
  # 1. Log the full, detailed error to the dedicated failures.log file
51
  # Prioritize the explicitly passed raw response text, as it may contain
52
  # reassembled data from a stream that is not available on the exception object.
53
  raw_response = raw_response_text
54
- if not raw_response and hasattr(error, 'response') and hasattr(error.response, 'text'):
55
- raw_response = error.response.text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  detailed_log_data = {
58
  "timestamp": datetime.utcnow().isoformat(),
59
- "api_key_ending": api_key[-4:],
60
  "model": model,
61
  "attempt_number": attempt,
62
  "error_type": type(error).__name__,
63
- "error_message": str(error),
64
- "raw_response": raw_response,
 
 
65
  "request_headers": request_headers,
 
66
  }
67
  failure_logger.error(detailed_log_data)
68
 
69
  # 2. Log a concise summary to the main library logger, which will propagate
70
  summary_message = (
71
- f"API call failed for model {model} with key ...{api_key[-4:]}. "
72
  f"Error: {type(error).__name__}. See failures.log for details."
73
  )
74
  main_lib_logger.error(summary_message)
 
4
  import os
5
  from datetime import datetime
6
 
7
+
8
  def setup_failure_logger():
9
  """Sets up a dedicated JSON logger for writing detailed failure logs to a file."""
10
  log_dir = "logs"
 
13
 
14
  # Create a logger specifically for failures.
15
  # This logger will NOT propagate to the root logger.
16
+ logger = logging.getLogger("failure_logger")
17
  logger.setLevel(logging.INFO)
18
  logger.propagate = False
19
 
20
  # Use a rotating file handler
21
  handler = RotatingFileHandler(
22
+ os.path.join(log_dir, "failures.log"),
23
+ maxBytes=5 * 1024 * 1024, # 5 MB
24
+ backupCount=2,
25
  )
26
 
27
  # Custom JSON formatter for structured logs
 
31
  return json.dumps(record.msg)
32
 
33
  handler.setFormatter(JsonFormatter())
34
+
35
  # Add handler only if it hasn't been added before
36
  if not logger.handlers:
37
  logger.addHandler(handler)
38
 
39
  return logger
40
 
41
+
42
  # Initialize the dedicated logger for detailed failure logs
43
  failure_logger = setup_failure_logger()
44
 
45
  # Get the main library logger for concise, propagated messages
46
+ main_lib_logger = logging.getLogger("rotator_library")
47
+
48
+
49
+ def _extract_response_body(error: Exception) -> str:
50
+ """
51
+ Extract the full response body from various error types.
52
+
53
+ Handles:
54
+ - httpx.HTTPStatusError: response.text or response.content
55
+ - litellm exceptions: various response attributes
56
+ - Other exceptions: str(error)
57
+ """
58
+ # Try to get response body from httpx errors
59
+ if hasattr(error, "response") and error.response is not None:
60
+ response = error.response
61
+ # Try .text first (decoded)
62
+ if hasattr(response, "text") and response.text:
63
+ return response.text
64
+ # Try .content (bytes)
65
+ if hasattr(response, "content") and response.content:
66
+ try:
67
+ return response.content.decode("utf-8", errors="replace")
68
+ except Exception:
69
+ return str(response.content)
70
 
71
+ # Check for litellm's body attribute
72
+ if hasattr(error, "body") and error.body:
73
+ return str(error.body)
74
+
75
+ # Check for message attribute that might contain response
76
+ if hasattr(error, "message") and error.message:
77
+ return str(error.message)
78
+
79
+ return None
80
+
81
+
82
+ def log_failure(
83
+ api_key: str,
84
+ model: str,
85
+ attempt: int,
86
+ error: Exception,
87
+ request_headers: dict,
88
+ raw_response_text: str = None,
89
+ ):
90
  """
91
  Logs a detailed failure message to a file and a concise summary to the main logger.
92
+
93
+ Args:
94
+ api_key: The API key or credential path that was used
95
+ model: The model that was requested
96
+ attempt: The attempt number (1-based)
97
+ error: The exception that occurred
98
+ request_headers: Headers from the original request
99
+ raw_response_text: Optional pre-extracted response body (e.g., from streaming)
100
  """
101
  # 1. Log the full, detailed error to the dedicated failures.log file
102
  # Prioritize the explicitly passed raw response text, as it may contain
103
  # reassembled data from a stream that is not available on the exception object.
104
  raw_response = raw_response_text
105
+ if not raw_response:
106
+ raw_response = _extract_response_body(error)
107
+
108
+ # Get full error message (not truncated)
109
+ full_error_message = str(error)
110
+
111
+ # Also capture any nested/wrapped exception info
112
+ error_chain = []
113
+ visited = set() # Track visited exceptions to detect circular references
114
+ current_error = error
115
+ while current_error:
116
+ # Check for circular references
117
+ error_id = id(current_error)
118
+ if error_id in visited:
119
+ break
120
+ visited.add(error_id)
121
+
122
+ error_chain.append(
123
+ {
124
+ "type": type(current_error).__name__,
125
+ "message": str(current_error)[:2000], # Limit per-error message size
126
+ }
127
+ )
128
+ current_error = getattr(current_error, "__cause__", None) or getattr(
129
+ current_error, "__context__", None
130
+ )
131
+ if len(error_chain) > 5: # Prevent excessive chain length
132
+ break
133
 
134
  detailed_log_data = {
135
  "timestamp": datetime.utcnow().isoformat(),
136
+ "api_key_ending": api_key[-4:] if len(api_key) >= 4 else "****",
137
  "model": model,
138
  "attempt_number": attempt,
139
  "error_type": type(error).__name__,
140
+ "error_message": full_error_message[:5000], # Limit total size
141
+ "raw_response": raw_response[:10000]
142
+ if raw_response
143
+ else None, # Limit response size
144
  "request_headers": request_headers,
145
+ "error_chain": error_chain if len(error_chain) > 1 else None,
146
  }
147
  failure_logger.error(detailed_log_data)
148
 
149
  # 2. Log a concise summary to the main library logger, which will propagate
150
  summary_message = (
151
+ f"API call failed for model {model} with key ...{api_key[-4:] if len(api_key) >= 4 else '****'}. "
152
  f"Error: {type(error).__name__}. See failures.log for details."
153
  )
154
  main_lib_logger.error(summary_message)