mirrobot-agent[bot] committed on
Commit
a1cc875
·
1 Parent(s): 7cb148b

fix: improve error handling implementation based on code review

Browse files

- Fix credential counting to track unique credentials (RequestErrorAccumulator)
- Move import os to module level in mask_credential function
- Fix status code check to use explicit 'is not None' comparison
- Improve context window error detection with more specific patterns
- Correct comment about server error classification
- Remove redundant '1 *' in exponential backoff calculations
- Add warning log for unreachable None return path
- Remove redundant error_accumulator.model/provider assignments
- Remove access to private _content attribute in failure_logger
- Add circular reference detection in error chain loop
- Reorder error recording to occur after should_rotate_on_error check

These changes address issues identified in both mirrobot-agent and
GitHub Copilot code reviews.

src/rotator_library/client.py CHANGED
@@ -71,7 +71,7 @@ class RotatingClient:
71
  ):
72
  """
73
  Initialize the RotatingClient with intelligent credential rotation.
74
-
75
  Args:
76
  api_keys: Dictionary mapping provider names to lists of API keys
77
  oauth_credentials: Dictionary mapping provider names to OAuth credential paths
@@ -140,8 +140,7 @@ class RotatingClient:
140
  self.global_timeout = global_timeout
141
  self.abort_on_callback_error = abort_on_callback_error
142
  self.usage_manager = UsageManager(
143
- file_path=usage_file_path,
144
- rotation_tolerance=rotation_tolerance
145
  )
146
  self._model_list_cache = {}
147
  self._provider_plugins = PROVIDER_PLUGINS
@@ -160,7 +159,9 @@ class RotatingClient:
160
  # Validate all values are >= 1
161
  for provider, max_val in self.max_concurrent_requests_per_key.items():
162
  if max_val < 1:
163
- lib_logger.warning(f"Invalid max_concurrent for '{provider}': {max_val}. Setting to 1.")
 
 
164
  self.max_concurrent_requests_per_key[provider] = 1
165
 
166
  def _is_model_ignored(self, provider: str, model_id: str) -> bool:
@@ -368,7 +369,9 @@ class RotatingClient:
368
 
369
  return kwargs
370
 
371
- def _apply_default_safety_settings(self, litellm_kwargs: Dict[str, Any], provider: str):
 
 
372
  """
373
  Ensure default Gemini safety settings are present when calling the Gemini provider.
374
  This will not override any explicit settings provided by the request. It accepts
@@ -397,22 +400,33 @@ class RotatingClient:
397
  ]
398
 
399
  # If generic form is present, ensure missing generic keys are filled in
400
- if "safety_settings" in litellm_kwargs and isinstance(litellm_kwargs["safety_settings"], dict):
 
 
401
  for k, v in default_generic.items():
402
  if k not in litellm_kwargs["safety_settings"]:
403
  litellm_kwargs["safety_settings"][k] = v
404
  return
405
 
406
  # If Gemini form is present, ensure missing gemini categories are appended
407
- if "safetySettings" in litellm_kwargs and isinstance(litellm_kwargs["safetySettings"], list):
408
- present = {item.get("category") for item in litellm_kwargs["safetySettings"] if isinstance(item, dict)}
 
 
 
 
 
 
409
  for d in default_gemini:
410
  if d["category"] not in present:
411
  litellm_kwargs["safetySettings"].append(d)
412
  return
413
 
414
  # Neither present: set generic defaults so provider conversion will translate them
415
- if "safety_settings" not in litellm_kwargs and "safetySettings" not in litellm_kwargs:
 
 
 
416
  litellm_kwargs["safety_settings"] = default_generic.copy()
417
 
418
  def get_oauth_credentials(self) -> Dict[str, List[str]]:
@@ -430,10 +444,10 @@ class RotatingClient:
430
  """
431
  Lazily initializes and returns a provider instance.
432
  Only initializes providers that have configured credentials.
433
-
434
  Args:
435
  provider_name: The name of the provider to get an instance for.
436
-
437
  Returns:
438
  Provider instance if credentials exist, None otherwise.
439
  """
@@ -443,7 +457,7 @@ class RotatingClient:
443
  f"Skipping provider '{provider_name}' initialization: no credentials configured"
444
  )
445
  return None
446
-
447
  if provider_name not in self._provider_instances:
448
  if provider_name in self._provider_plugins:
449
  self._provider_instances[provider_name] = self._provider_plugins[
@@ -465,46 +479,47 @@ class RotatingClient:
465
  def _resolve_model_id(self, model: str, provider: str) -> str:
466
  """
467
  Resolves the actual model ID to send to the provider.
468
-
469
  For custom models with name/ID mappings, returns the ID.
470
  Otherwise, returns the model name unchanged.
471
-
472
  Args:
473
  model: Full model string with provider (e.g., "iflow/DS-v3.2")
474
  provider: Provider name (e.g., "iflow")
475
-
476
  Returns:
477
  Full model string with ID (e.g., "iflow/deepseek-v3.2")
478
  """
479
  # Extract model name from "provider/model_name" format
480
- model_name = model.split('/')[-1] if '/' in model else model
481
-
482
  # Try to get provider instance to check for model definitions
483
  provider_plugin = self._get_provider_instance(provider)
484
-
485
  # Check if provider has model definitions
486
- if provider_plugin and hasattr(provider_plugin, 'model_definitions'):
487
- model_id = provider_plugin.model_definitions.get_model_id(provider, model_name)
 
 
488
  if model_id and model_id != model_name:
489
  # Return with provider prefix
490
  return f"{provider}/{model_id}"
491
-
492
  # Fallback: use client's own model definitions
493
  model_id = self.model_definitions.get_model_id(provider, model_name)
494
  if model_id and model_id != model_name:
495
  return f"{provider}/{model_id}"
496
-
497
  # No conversion needed, return original
498
  return model
499
 
500
-
501
  async def _safe_streaming_wrapper(
502
  self, stream: Any, key: str, model: str, request: Optional[Any] = None
503
  ) -> AsyncGenerator[Any, None]:
504
  """
505
  A hybrid wrapper for streaming that buffers fragmented JSON, handles client disconnections gracefully,
506
  and distinguishes between content and streamed errors.
507
-
508
  FINISH_REASON HANDLING:
509
  Providers just translate chunks - this wrapper handles ALL finish_reason logic:
510
  1. Strip finish_reason from intermediate chunks (litellm defaults to "stop")
@@ -541,7 +556,7 @@ class RotatingClient:
541
  chunk_dict = chunk.model_dump()
542
  else:
543
  chunk_dict = chunk
544
-
545
  # === FINISH_REASON LOGIC ===
546
  # Providers send raw chunks without finish_reason logic.
547
  # This wrapper determines finish_reason based on accumulated state.
@@ -549,19 +564,19 @@ class RotatingClient:
549
  choice = chunk_dict["choices"][0]
550
  delta = choice.get("delta", {})
551
  usage = chunk_dict.get("usage", {})
552
-
553
  # Track tool_calls across ALL chunks - if we ever see one, finish_reason must be tool_calls
554
  if delta.get("tool_calls"):
555
  has_tool_calls = True
556
  accumulated_finish_reason = "tool_calls"
557
-
558
  # Detect final chunk: has usage with completion_tokens > 0
559
  has_completion_tokens = (
560
- usage and
561
- isinstance(usage, dict) and
562
- usage.get("completion_tokens", 0) > 0
563
  )
564
-
565
  if has_completion_tokens:
566
  # FINAL CHUNK: Determine correct finish_reason
567
  if has_tool_calls:
@@ -577,7 +592,7 @@ class RotatingClient:
577
  # INTERMEDIATE CHUNK: Never emit finish_reason
578
  # (litellm.ModelResponse defaults to "stop" which is wrong)
579
  choice["finish_reason"] = None
580
-
581
  yield f"data: {json.dumps(chunk_dict)}\n\n"
582
 
583
  if hasattr(chunk, "usage") and chunk.usage:
@@ -726,12 +741,13 @@ class RotatingClient:
726
  # multiple keys have the same usage stats.
727
  credentials_for_provider = list(self.all_credentials[provider])
728
  random.shuffle(credentials_for_provider)
729
-
730
  # Filter out credentials that are unavailable (queued for re-auth)
731
  provider_plugin = self._get_provider_instance(provider)
732
- if provider_plugin and hasattr(provider_plugin, 'is_credential_available'):
733
  available_creds = [
734
- cred for cred in credentials_for_provider
 
735
  if provider_plugin.is_credential_available(cred)
736
  ]
737
  if available_creds:
@@ -744,7 +760,7 @@ class RotatingClient:
744
  kwargs = self._convert_model_params(**kwargs)
745
 
746
  # The main rotation loop. It continues as long as there are untried credentials and the global deadline has not been exceeded.
747
-
748
  # Resolve model ID early, before any credential operations
749
  # This ensures consistent model ID usage for acquisition, release, and tracking
750
  resolved_model = self._resolve_model_id(model, provider)
@@ -752,10 +768,10 @@ class RotatingClient:
752
  lib_logger.info(f"Resolved model '{model}' to '{resolved_model}'")
753
  model = resolved_model
754
  kwargs["model"] = model # Ensure kwargs has the resolved model for litellm
755
-
756
  # [NEW] Filter by model tier requirement and build priority map
757
  credential_priorities = None
758
- if provider_plugin and hasattr(provider_plugin, 'get_model_tier_requirement'):
759
  required_tier = provider_plugin.get_model_tier_requirement(model)
760
  if required_tier is not None:
761
  # Filter OUT only credentials we KNOW are too low priority
@@ -763,9 +779,9 @@ class RotatingClient:
763
  incompatible_creds = []
764
  compatible_creds = []
765
  unknown_creds = []
766
-
767
  for cred in credentials_for_provider:
768
- if hasattr(provider_plugin, 'get_credential_priority'):
769
  priority = provider_plugin.get_credential_priority(cred)
770
  if priority is None:
771
  # Unknown priority - keep it, will be discovered on first use
@@ -779,7 +795,7 @@ class RotatingClient:
779
  else:
780
  # Provider doesn't support priorities - keep all
781
  unknown_creds.append(cred)
782
-
783
  # If we have any known-compatible or unknown credentials, use them
784
  tier_compatible_creds = compatible_creds + unknown_creds
785
  if tier_compatible_creds:
@@ -806,18 +822,18 @@ class RotatingClient:
806
  f"but all {len(incompatible_creds)} known credentials have priority > {required_tier}. "
807
  f"Request will likely fail."
808
  )
809
-
810
  # Build priority map for usage_manager
811
- if provider_plugin and hasattr(provider_plugin, 'get_credential_priority'):
812
  credential_priorities = {}
813
  for cred in credentials_for_provider:
814
  priority = provider_plugin.get_credential_priority(cred)
815
  if priority is not None:
816
  credential_priorities[cred] = priority
817
-
818
  if credential_priorities:
819
  lib_logger.debug(
820
- f"Credential priorities for {provider}: {', '.join(f'P{p}={len([c for c in credentials_for_provider if credential_priorities.get(c)==p])}' for p in sorted(set(credential_priorities.values())))}"
821
  )
822
 
823
  # Initialize error accumulator for tracking errors across credential rotation
@@ -861,9 +877,11 @@ class RotatingClient:
861
  )
862
  max_concurrent = self.max_concurrent_requests_per_key.get(provider, 1)
863
  current_cred = await self.usage_manager.acquire_key(
864
- available_keys=creds_to_try, model=model, deadline=deadline,
 
 
865
  max_concurrent=max_concurrent,
866
- credential_priorities=credential_priorities
867
  )
868
  key_acquired = True
869
  tried_creds.add(current_cred)
@@ -946,10 +964,14 @@ class RotatingClient:
946
  if provider_instance:
947
  # Ensure default Gemini safety settings are present (without overriding request)
948
  try:
949
- self._apply_default_safety_settings(litellm_kwargs, provider)
 
 
950
  except Exception:
951
  # If anything goes wrong here, avoid breaking the request flow.
952
- lib_logger.debug("Could not apply default safety settings; continuing.")
 
 
953
 
954
  if "safety_settings" in litellm_kwargs:
955
  converted_settings = (
@@ -1032,9 +1054,11 @@ class RotatingClient:
1032
 
1033
  # Extract a clean error message for the user-facing log
1034
  error_message = str(e).split("\n")[0]
1035
-
1036
  # Record in accumulator for client reporting
1037
- error_accumulator.record_error(current_cred, classified_error, error_message)
 
 
1038
 
1039
  lib_logger.info(
1040
  f"Key {mask_credential(current_cred)} hit rate limit for {model}. Rotating key."
@@ -1068,16 +1092,20 @@ class RotatingClient:
1068
  )
1069
  classified_error = classify_error(e)
1070
  error_message = str(e).split("\n")[0]
1071
-
1072
  # Provider-level error: don't increment consecutive failures
1073
  await self.usage_manager.record_failure(
1074
- current_cred, model, classified_error,
1075
- increment_consecutive_failures=False
 
 
1076
  )
1077
 
1078
  if attempt >= self.max_retries - 1:
1079
  # Record in accumulator only on final failure for this key
1080
- error_accumulator.record_error(current_cred, classified_error, error_message)
 
 
1081
  lib_logger.warning(
1082
  f"Key {mask_credential(current_cred)} failed after max retries due to server error. Rotating."
1083
  )
@@ -1085,13 +1113,15 @@ class RotatingClient:
1085
 
1086
  # For temporary errors, wait before retrying with the same key.
1087
  wait_time = classified_error.retry_after or (
1088
- 1 * (2**attempt)
1089
  ) + random.uniform(0, 1)
1090
  remaining_budget = deadline - time.time()
1091
 
1092
  # If the required wait time exceeds the budget, don't wait; rotate to the next key immediately.
1093
  if wait_time > remaining_budget:
1094
- error_accumulator.record_error(current_cred, classified_error, error_message)
 
 
1095
  lib_logger.warning(
1096
  f"Retry wait ({wait_time:.2f}s) exceeds budget ({remaining_budget:.2f}s). Rotating key."
1097
  )
@@ -1115,34 +1145,44 @@ class RotatingClient:
1115
  if request
1116
  else {},
1117
  )
1118
-
1119
  classified_error = classify_error(e)
1120
  error_message = str(e).split("\n")[0]
1121
-
1122
- # Record in accumulator for client reporting
1123
- error_accumulator.record_error(current_cred, classified_error, error_message)
1124
-
1125
  lib_logger.warning(
1126
  f"Key {mask_credential(current_cred)} HTTP {e.response.status_code} ({classified_error.error_type})."
1127
  )
1128
-
1129
  # Check if this error should trigger rotation
1130
  if not should_rotate_on_error(classified_error):
1131
  lib_logger.error(
1132
  f"Non-recoverable error ({classified_error.error_type}). Failing request."
1133
  )
1134
  raise last_exception
1135
-
 
 
 
 
 
1136
  # Handle rate limits with cooldown
1137
- if classified_error.error_type in ["rate_limit", "quota_exceeded"]:
 
 
 
1138
  cooldown_duration = classified_error.retry_after or 60
1139
  await self.cooldown_manager.start_cooldown(
1140
  provider, cooldown_duration
1141
  )
1142
-
1143
  # Check if we should retry same key (server errors with retries left)
1144
- if should_retry_same_key(classified_error) and attempt < self.max_retries - 1:
1145
- wait_time = classified_error.retry_after or (1 * (2**attempt)) + random.uniform(0, 1)
 
 
 
 
 
1146
  remaining_budget = deadline - time.time()
1147
  if wait_time <= remaining_budget:
1148
  lib_logger.warning(
@@ -1150,12 +1190,14 @@ class RotatingClient:
1150
  )
1151
  await asyncio.sleep(wait_time)
1152
  continue
1153
-
1154
  # Record failure and rotate to next key
1155
  await self.usage_manager.record_failure(
1156
  current_cred, model, classified_error
1157
  )
1158
- lib_logger.info(f"Rotating to next key after {classified_error.error_type} error.")
 
 
1159
  break
1160
 
1161
  except Exception as e:
@@ -1178,16 +1220,17 @@ class RotatingClient:
1178
 
1179
  classified_error = classify_error(e)
1180
  error_message = str(e).split("\n")[0]
1181
-
1182
- # Record in accumulator for client reporting
1183
- error_accumulator.record_error(current_cred, classified_error, error_message)
1184
-
1185
  lib_logger.warning(
1186
  f"Key {mask_credential(current_cred)} {classified_error.error_type} (HTTP {classified_error.status_code})."
1187
  )
1188
-
1189
  # Handle rate limits with cooldown
1190
- if classified_error.status_code == 429 or classified_error.error_type in ["rate_limit", "quota_exceeded"]:
 
 
 
 
1191
  cooldown_duration = classified_error.retry_after or 60
1192
  await self.cooldown_manager.start_cooldown(
1193
  provider, cooldown_duration
@@ -1200,6 +1243,11 @@ class RotatingClient:
1200
  )
1201
  raise last_exception
1202
 
 
 
 
 
 
1203
  await self.usage_manager.record_failure(
1204
  current_cred, model, classified_error
1205
  )
@@ -1211,15 +1259,19 @@ class RotatingClient:
1211
  # Check if we exhausted all credentials or timed out
1212
  if time.time() >= deadline:
1213
  error_accumulator.timeout_occurred = True
1214
-
1215
  if error_accumulator.has_errors():
1216
  # Log concise summary for server logs
1217
  lib_logger.error(error_accumulator.build_log_message())
1218
-
1219
  # Return the structured error response for the client
1220
  return error_accumulator.build_client_error_response()
1221
 
1222
  # Return None to indicate failure without error details (shouldn't normally happen)
 
 
 
 
1223
  return None
1224
 
1225
  async def _streaming_acompletion_with_retry(
@@ -1235,12 +1287,13 @@ class RotatingClient:
1235
  # Create a mutable copy of the keys and shuffle it.
1236
  credentials_for_provider = list(self.all_credentials[provider])
1237
  random.shuffle(credentials_for_provider)
1238
-
1239
  # Filter out credentials that are unavailable (queued for re-auth)
1240
  provider_plugin = self._get_provider_instance(provider)
1241
- if provider_plugin and hasattr(provider_plugin, 'is_credential_available'):
1242
  available_creds = [
1243
- cred for cred in credentials_for_provider
 
1244
  if provider_plugin.is_credential_available(cred)
1245
  ]
1246
  if available_creds:
@@ -1262,10 +1315,10 @@ class RotatingClient:
1262
  lib_logger.info(f"Resolved model '{model}' to '{resolved_model}'")
1263
  model = resolved_model
1264
  kwargs["model"] = model # Ensure kwargs has the resolved model for litellm
1265
-
1266
  # [NEW] Filter by model tier requirement and build priority map
1267
  credential_priorities = None
1268
- if provider_plugin and hasattr(provider_plugin, 'get_model_tier_requirement'):
1269
  required_tier = provider_plugin.get_model_tier_requirement(model)
1270
  if required_tier is not None:
1271
  # Filter OUT only credentials we KNOW are too low priority
@@ -1273,9 +1326,9 @@ class RotatingClient:
1273
  incompatible_creds = []
1274
  compatible_creds = []
1275
  unknown_creds = []
1276
-
1277
  for cred in credentials_for_provider:
1278
- if hasattr(provider_plugin, 'get_credential_priority'):
1279
  priority = provider_plugin.get_credential_priority(cred)
1280
  if priority is None:
1281
  # Unknown priority - keep it, will be discovered on first use
@@ -1289,7 +1342,7 @@ class RotatingClient:
1289
  else:
1290
  # Provider doesn't support priorities - keep all
1291
  unknown_creds.append(cred)
1292
-
1293
  # If we have any known-compatible or unknown credentials, use them
1294
  tier_compatible_creds = compatible_creds + unknown_creds
1295
  if tier_compatible_creds:
@@ -1316,18 +1369,18 @@ class RotatingClient:
1316
  f"but all {len(incompatible_creds)} known credentials have priority > {required_tier}. "
1317
  f"Request will likely fail."
1318
  )
1319
-
1320
  # Build priority map for usage_manager
1321
- if provider_plugin and hasattr(provider_plugin, 'get_credential_priority'):
1322
  credential_priorities = {}
1323
  for cred in credentials_for_provider:
1324
  priority = provider_plugin.get_credential_priority(cred)
1325
  if priority is not None:
1326
  credential_priorities[cred] = priority
1327
-
1328
  if credential_priorities:
1329
  lib_logger.debug(
1330
- f"Credential priorities for {provider}: {', '.join(f'P{p}={len([c for c in credentials_for_provider if credential_priorities.get(c)==p])}' for p in sorted(set(credential_priorities.values())))}"
1331
  )
1332
 
1333
  # Initialize error accumulator for tracking errors across credential rotation
@@ -1370,11 +1423,15 @@ class RotatingClient:
1370
  lib_logger.info(
1371
  f"Acquiring credential for model {model}. Tried credentials: {len(tried_creds)}/{len(credentials_for_provider)}"
1372
  )
1373
- max_concurrent = self.max_concurrent_requests_per_key.get(provider, 1)
 
 
1374
  current_cred = await self.usage_manager.acquire_key(
1375
- available_keys=creds_to_try, model=model, deadline=deadline,
 
 
1376
  max_concurrent=max_concurrent,
1377
- credential_priorities=credential_priorities
1378
  )
1379
  key_acquired = True
1380
  tried_creds.add(current_cred)
@@ -1483,7 +1540,7 @@ class RotatingClient:
1483
  original_exc = getattr(e, "data", e)
1484
  classified_error = classify_error(original_exc)
1485
  error_message = str(original_exc).split("\n")[0]
1486
-
1487
  log_failure(
1488
  api_key=current_cred,
1489
  model=model,
@@ -1493,24 +1550,31 @@ class RotatingClient:
1493
  if request
1494
  else {},
1495
  )
1496
-
1497
  # Record in accumulator for client reporting
1498
- error_accumulator.record_error(current_cred, classified_error, error_message)
1499
-
 
 
1500
  # Check if this error should trigger rotation
1501
  if not should_rotate_on_error(classified_error):
1502
  lib_logger.error(
1503
  f"Non-recoverable error ({classified_error.error_type}) during custom stream. Failing."
1504
  )
1505
  raise last_exception
1506
-
1507
  # Handle rate limits with cooldown
1508
- if classified_error.error_type in ["rate_limit", "quota_exceeded"]:
1509
- cooldown_duration = classified_error.retry_after or 60
 
 
 
 
 
1510
  await self.cooldown_manager.start_cooldown(
1511
  provider, cooldown_duration
1512
  )
1513
-
1514
  await self.usage_manager.record_failure(
1515
  current_cred, model, classified_error
1516
  )
@@ -1536,26 +1600,32 @@ class RotatingClient:
1536
  )
1537
  classified_error = classify_error(e)
1538
  error_message = str(e).split("\n")[0]
1539
-
1540
  # Provider-level error: don't increment consecutive failures
1541
  await self.usage_manager.record_failure(
1542
- current_cred, model, classified_error,
1543
- increment_consecutive_failures=False
 
 
1544
  )
1545
 
1546
  if attempt >= self.max_retries - 1:
1547
- error_accumulator.record_error(current_cred, classified_error, error_message)
 
 
1548
  lib_logger.warning(
1549
  f"Cred {mask_credential(current_cred)} failed after max retries. Rotating."
1550
  )
1551
  break
1552
 
1553
  wait_time = classified_error.retry_after or (
1554
- 1 * (2**attempt)
1555
  ) + random.uniform(0, 1)
1556
  remaining_budget = deadline - time.time()
1557
  if wait_time > remaining_budget:
1558
- error_accumulator.record_error(current_cred, classified_error, error_message)
 
 
1559
  lib_logger.warning(
1560
  f"Retry wait ({wait_time:.2f}s) exceeds budget. Rotating."
1561
  )
@@ -1580,21 +1650,23 @@ class RotatingClient:
1580
  )
1581
  classified_error = classify_error(e)
1582
  error_message = str(e).split("\n")[0]
1583
-
1584
  # Record in accumulator
1585
- error_accumulator.record_error(current_cred, classified_error, error_message)
1586
-
 
 
1587
  lib_logger.warning(
1588
  f"Cred {mask_credential(current_cred)} {classified_error.error_type} (HTTP {classified_error.status_code})."
1589
  )
1590
-
1591
  # Check if this error should trigger rotation
1592
  if not should_rotate_on_error(classified_error):
1593
  lib_logger.error(
1594
  f"Non-recoverable error ({classified_error.error_type}). Failing."
1595
  )
1596
  raise last_exception
1597
-
1598
  await self.usage_manager.record_failure(
1599
  current_cred, model, classified_error
1600
  )
@@ -1616,9 +1688,13 @@ class RotatingClient:
1616
  if provider_instance:
1617
  # Ensure default Gemini safety settings are present (without overriding request)
1618
  try:
1619
- self._apply_default_safety_settings(litellm_kwargs, provider)
 
 
1620
  except Exception:
1621
- lib_logger.debug("Could not apply default safety settings for streaming path; continuing.")
 
 
1622
 
1623
  if "safety_settings" in litellm_kwargs:
1624
  converted_settings = (
@@ -1699,7 +1775,11 @@ class RotatingClient:
1699
  yield chunk
1700
  return
1701
 
1702
- except (StreamedAPIError, litellm.RateLimitError, httpx.HTTPStatusError) as e:
 
 
 
 
1703
  last_exception = e
1704
 
1705
  # This is the final, robust handler for streamed errors.
@@ -1708,7 +1788,7 @@ class RotatingClient:
1708
  # The actual exception might be wrapped in our StreamedAPIError.
1709
  original_exc = getattr(e, "data", e)
1710
  classified_error = classify_error(original_exc)
1711
-
1712
  # Check if this error should trigger rotation
1713
  if not should_rotate_on_error(classified_error):
1714
  lib_logger.error(
@@ -1745,16 +1825,18 @@ class RotatingClient:
1745
  error_message_text = error_details.get(
1746
  "message", str(original_exc).split("\n")[0]
1747
  )
1748
-
1749
  # Record in accumulator for client reporting
1750
- error_accumulator.record_error(current_cred, classified_error, error_message_text)
 
 
1751
 
1752
  if (
1753
  "quota" in error_message_text.lower()
1754
  or "resource_exhausted" in error_status.lower()
1755
  ):
1756
  consecutive_quota_failures += 1
1757
-
1758
  quota_value = "N/A"
1759
  quota_id = "N/A"
1760
  if "details" in error_details and isinstance(
@@ -1764,10 +1846,15 @@ class RotatingClient:
1764
  if isinstance(detail.get("violations"), list):
1765
  for violation in detail["violations"]:
1766
  if "quotaValue" in violation:
1767
- quota_value = violation["quotaValue"]
 
 
1768
  if "quotaId" in violation:
1769
  quota_id = violation["quotaId"]
1770
- if quota_value != "N/A" and quota_id != "N/A":
 
 
 
1771
  break
1772
 
1773
  await self.usage_manager.record_failure(
@@ -1798,8 +1885,13 @@ class RotatingClient:
1798
  f"Cred {mask_credential(current_cred)} {classified_error.error_type}. Rotating."
1799
  )
1800
 
1801
- if classified_error.error_type in ["rate_limit", "quota_exceeded"]:
1802
- cooldown_duration = classified_error.retry_after or 60
 
 
 
 
 
1803
  await self.cooldown_manager.start_cooldown(
1804
  provider, cooldown_duration
1805
  )
@@ -1827,14 +1919,18 @@ class RotatingClient:
1827
  )
1828
  classified_error = classify_error(e)
1829
  error_message_text = str(e).split("\n")[0]
1830
-
1831
- # Record error in accumulator (server errors are abnormal)
1832
- error_accumulator.record_error(current_cred, classified_error, error_message_text)
1833
-
 
 
1834
  # Provider-level error: don't increment consecutive failures
1835
  await self.usage_manager.record_failure(
1836
- current_cred, model, classified_error,
1837
- increment_consecutive_failures=False
 
 
1838
  )
1839
 
1840
  if attempt >= self.max_retries - 1:
@@ -1845,7 +1941,7 @@ class RotatingClient:
1845
  break
1846
 
1847
  wait_time = classified_error.retry_after or (
1848
- 1 * (2**attempt)
1849
  ) + random.uniform(0, 1)
1850
  remaining_budget = deadline - time.time()
1851
  if wait_time > remaining_budget:
@@ -1874,16 +1970,22 @@ class RotatingClient:
1874
  )
1875
  classified_error = classify_error(e)
1876
  error_message_text = str(e).split("\n")[0]
1877
-
1878
  # Record error in accumulator
1879
- error_accumulator.record_error(current_cred, classified_error, error_message_text)
 
 
1880
 
1881
  lib_logger.warning(
1882
  f"Credential ...{current_cred[-6:]} failed with {classified_error.error_type} (Status: {classified_error.status_code}). Error: {error_message_text}."
1883
  )
1884
 
1885
  # Handle rate limits with cooldown
1886
- if classified_error.status_code == 429 or classified_error.error_type in ["rate_limit", "quota_exceeded"]:
 
 
 
 
1887
  cooldown_duration = classified_error.retry_after or 60
1888
  await self.cooldown_manager.start_cooldown(
1889
  provider, cooldown_duration
@@ -1904,7 +2006,9 @@ class RotatingClient:
1904
  await self.usage_manager.record_failure(
1905
  current_cred, model, classified_error
1906
  )
1907
- lib_logger.info(f"Rotating to next key after {classified_error.error_type} error.")
 
 
1908
  break
1909
 
1910
  finally:
@@ -1913,26 +2017,28 @@ class RotatingClient:
1913
 
1914
  # Build detailed error response using error accumulator
1915
  error_accumulator.timeout_occurred = time.time() >= deadline
1916
- error_accumulator.model = model
1917
- error_accumulator.provider = provider
1918
-
1919
  if error_accumulator.has_errors():
1920
  # Log concise summary for server logs
1921
  lib_logger.error(error_accumulator.build_log_message())
1922
-
1923
  # Build structured error response for client
1924
  error_response = error_accumulator.build_client_error_response()
1925
  error_data = error_response
1926
  else:
1927
  # Fallback if no errors were recorded (shouldn't happen)
1928
- final_error_message = "Request failed: No available API keys after rotation or timeout."
 
 
1929
  if last_exception:
1930
- final_error_message = f"Request failed. Last error: {str(last_exception)}"
 
 
1931
  error_data = {
1932
  "error": {"message": final_error_message, "type": "proxy_error"}
1933
  }
1934
  lib_logger.error(final_error_message)
1935
-
1936
  yield f"data: {json.dumps(error_data)}\n\n"
1937
  yield "data: [DONE]\n\n"
1938
 
@@ -1980,11 +2086,13 @@ class RotatingClient:
1980
  # Handle iflow provider: remove stream_options to avoid HTTP 406
1981
  model = kwargs.get("model", "")
1982
  provider = model.split("/")[0] if "/" in model else ""
1983
-
1984
  if provider == "iflow" and "stream_options" in kwargs:
1985
- lib_logger.debug("Removing stream_options for iflow provider to avoid HTTP 406")
 
 
1986
  kwargs.pop("stream_options", None)
1987
-
1988
  if kwargs.get("stream"):
1989
  # Only add stream_options for providers that support it (excluding iflow)
1990
  if provider != "iflow":
@@ -1992,7 +2100,7 @@ class RotatingClient:
1992
  kwargs["stream_options"] = {}
1993
  if "include_usage" not in kwargs["stream_options"]:
1994
  kwargs["stream_options"]["include_usage"] = True
1995
-
1996
  return self._streaming_acompletion_with_retry(
1997
  request=request, pre_request_callback=pre_request_callback, **kwargs
1998
  )
 
71
  ):
72
  """
73
  Initialize the RotatingClient with intelligent credential rotation.
74
+
75
  Args:
76
  api_keys: Dictionary mapping provider names to lists of API keys
77
  oauth_credentials: Dictionary mapping provider names to OAuth credential paths
 
140
  self.global_timeout = global_timeout
141
  self.abort_on_callback_error = abort_on_callback_error
142
  self.usage_manager = UsageManager(
143
+ file_path=usage_file_path, rotation_tolerance=rotation_tolerance
 
144
  )
145
  self._model_list_cache = {}
146
  self._provider_plugins = PROVIDER_PLUGINS
 
159
  # Validate all values are >= 1
160
  for provider, max_val in self.max_concurrent_requests_per_key.items():
161
  if max_val < 1:
162
+ lib_logger.warning(
163
+ f"Invalid max_concurrent for '{provider}': {max_val}. Setting to 1."
164
+ )
165
  self.max_concurrent_requests_per_key[provider] = 1
166
 
167
  def _is_model_ignored(self, provider: str, model_id: str) -> bool:
 
369
 
370
  return kwargs
371
 
372
+ def _apply_default_safety_settings(
373
+ self, litellm_kwargs: Dict[str, Any], provider: str
374
+ ):
375
  """
376
  Ensure default Gemini safety settings are present when calling the Gemini provider.
377
  This will not override any explicit settings provided by the request. It accepts
 
400
  ]
401
 
402
  # If generic form is present, ensure missing generic keys are filled in
403
+ if "safety_settings" in litellm_kwargs and isinstance(
404
+ litellm_kwargs["safety_settings"], dict
405
+ ):
406
  for k, v in default_generic.items():
407
  if k not in litellm_kwargs["safety_settings"]:
408
  litellm_kwargs["safety_settings"][k] = v
409
  return
410
 
411
  # If Gemini form is present, ensure missing gemini categories are appended
412
+ if "safetySettings" in litellm_kwargs and isinstance(
413
+ litellm_kwargs["safetySettings"], list
414
+ ):
415
+ present = {
416
+ item.get("category")
417
+ for item in litellm_kwargs["safetySettings"]
418
+ if isinstance(item, dict)
419
+ }
420
  for d in default_gemini:
421
  if d["category"] not in present:
422
  litellm_kwargs["safetySettings"].append(d)
423
  return
424
 
425
  # Neither present: set generic defaults so provider conversion will translate them
426
+ if (
427
+ "safety_settings" not in litellm_kwargs
428
+ and "safetySettings" not in litellm_kwargs
429
+ ):
430
  litellm_kwargs["safety_settings"] = default_generic.copy()
431
 
432
  def get_oauth_credentials(self) -> Dict[str, List[str]]:
 
444
  """
445
  Lazily initializes and returns a provider instance.
446
  Only initializes providers that have configured credentials.
447
+
448
  Args:
449
  provider_name: The name of the provider to get an instance for.
450
+
451
  Returns:
452
  Provider instance if credentials exist, None otherwise.
453
  """
 
457
  f"Skipping provider '{provider_name}' initialization: no credentials configured"
458
  )
459
  return None
460
+
461
  if provider_name not in self._provider_instances:
462
  if provider_name in self._provider_plugins:
463
  self._provider_instances[provider_name] = self._provider_plugins[
 
479
  def _resolve_model_id(self, model: str, provider: str) -> str:
480
  """
481
  Resolves the actual model ID to send to the provider.
482
+
483
  For custom models with name/ID mappings, returns the ID.
484
  Otherwise, returns the model name unchanged.
485
+
486
  Args:
487
  model: Full model string with provider (e.g., "iflow/DS-v3.2")
488
  provider: Provider name (e.g., "iflow")
489
+
490
  Returns:
491
  Full model string with ID (e.g., "iflow/deepseek-v3.2")
492
  """
493
  # Extract model name from "provider/model_name" format
494
+ model_name = model.split("/")[-1] if "/" in model else model
495
+
496
  # Try to get provider instance to check for model definitions
497
  provider_plugin = self._get_provider_instance(provider)
498
+
499
  # Check if provider has model definitions
500
+ if provider_plugin and hasattr(provider_plugin, "model_definitions"):
501
+ model_id = provider_plugin.model_definitions.get_model_id(
502
+ provider, model_name
503
+ )
504
  if model_id and model_id != model_name:
505
  # Return with provider prefix
506
  return f"{provider}/{model_id}"
507
+
508
  # Fallback: use client's own model definitions
509
  model_id = self.model_definitions.get_model_id(provider, model_name)
510
  if model_id and model_id != model_name:
511
  return f"{provider}/{model_id}"
512
+
513
  # No conversion needed, return original
514
  return model
515
 
 
516
  async def _safe_streaming_wrapper(
517
  self, stream: Any, key: str, model: str, request: Optional[Any] = None
518
  ) -> AsyncGenerator[Any, None]:
519
  """
520
  A hybrid wrapper for streaming that buffers fragmented JSON, handles client disconnections gracefully,
521
  and distinguishes between content and streamed errors.
522
+
523
  FINISH_REASON HANDLING:
524
  Providers just translate chunks - this wrapper handles ALL finish_reason logic:
525
  1. Strip finish_reason from intermediate chunks (litellm defaults to "stop")
 
556
  chunk_dict = chunk.model_dump()
557
  else:
558
  chunk_dict = chunk
559
+
560
  # === FINISH_REASON LOGIC ===
561
  # Providers send raw chunks without finish_reason logic.
562
  # This wrapper determines finish_reason based on accumulated state.
 
564
  choice = chunk_dict["choices"][0]
565
  delta = choice.get("delta", {})
566
  usage = chunk_dict.get("usage", {})
567
+
568
  # Track tool_calls across ALL chunks - if we ever see one, finish_reason must be tool_calls
569
  if delta.get("tool_calls"):
570
  has_tool_calls = True
571
  accumulated_finish_reason = "tool_calls"
572
+
573
  # Detect final chunk: has usage with completion_tokens > 0
574
  has_completion_tokens = (
575
+ usage
576
+ and isinstance(usage, dict)
577
+ and usage.get("completion_tokens", 0) > 0
578
  )
579
+
580
  if has_completion_tokens:
581
  # FINAL CHUNK: Determine correct finish_reason
582
  if has_tool_calls:
 
592
  # INTERMEDIATE CHUNK: Never emit finish_reason
593
  # (litellm.ModelResponse defaults to "stop" which is wrong)
594
  choice["finish_reason"] = None
595
+
596
  yield f"data: {json.dumps(chunk_dict)}\n\n"
597
 
598
  if hasattr(chunk, "usage") and chunk.usage:
 
741
  # multiple keys have the same usage stats.
742
  credentials_for_provider = list(self.all_credentials[provider])
743
  random.shuffle(credentials_for_provider)
744
+
745
  # Filter out credentials that are unavailable (queued for re-auth)
746
  provider_plugin = self._get_provider_instance(provider)
747
+ if provider_plugin and hasattr(provider_plugin, "is_credential_available"):
748
  available_creds = [
749
+ cred
750
+ for cred in credentials_for_provider
751
  if provider_plugin.is_credential_available(cred)
752
  ]
753
  if available_creds:
 
760
  kwargs = self._convert_model_params(**kwargs)
761
 
762
  # The main rotation loop. It continues as long as there are untried credentials and the global deadline has not been exceeded.
763
+
764
  # Resolve model ID early, before any credential operations
765
  # This ensures consistent model ID usage for acquisition, release, and tracking
766
  resolved_model = self._resolve_model_id(model, provider)
 
768
  lib_logger.info(f"Resolved model '{model}' to '{resolved_model}'")
769
  model = resolved_model
770
  kwargs["model"] = model # Ensure kwargs has the resolved model for litellm
771
+
772
  # [NEW] Filter by model tier requirement and build priority map
773
  credential_priorities = None
774
+ if provider_plugin and hasattr(provider_plugin, "get_model_tier_requirement"):
775
  required_tier = provider_plugin.get_model_tier_requirement(model)
776
  if required_tier is not None:
777
  # Filter OUT only credentials we KNOW are too low priority
 
779
  incompatible_creds = []
780
  compatible_creds = []
781
  unknown_creds = []
782
+
783
  for cred in credentials_for_provider:
784
+ if hasattr(provider_plugin, "get_credential_priority"):
785
  priority = provider_plugin.get_credential_priority(cred)
786
  if priority is None:
787
  # Unknown priority - keep it, will be discovered on first use
 
795
  else:
796
  # Provider doesn't support priorities - keep all
797
  unknown_creds.append(cred)
798
+
799
  # If we have any known-compatible or unknown credentials, use them
800
  tier_compatible_creds = compatible_creds + unknown_creds
801
  if tier_compatible_creds:
 
822
  f"but all {len(incompatible_creds)} known credentials have priority > {required_tier}. "
823
  f"Request will likely fail."
824
  )
825
+
826
  # Build priority map for usage_manager
827
+ if provider_plugin and hasattr(provider_plugin, "get_credential_priority"):
828
  credential_priorities = {}
829
  for cred in credentials_for_provider:
830
  priority = provider_plugin.get_credential_priority(cred)
831
  if priority is not None:
832
  credential_priorities[cred] = priority
833
+
834
  if credential_priorities:
835
  lib_logger.debug(
836
+ f"Credential priorities for {provider}: {', '.join(f'P{p}={len([c for c in credentials_for_provider if credential_priorities.get(c) == p])}' for p in sorted(set(credential_priorities.values())))}"
837
  )
838
 
839
  # Initialize error accumulator for tracking errors across credential rotation
 
877
  )
878
  max_concurrent = self.max_concurrent_requests_per_key.get(provider, 1)
879
  current_cred = await self.usage_manager.acquire_key(
880
+ available_keys=creds_to_try,
881
+ model=model,
882
+ deadline=deadline,
883
  max_concurrent=max_concurrent,
884
+ credential_priorities=credential_priorities,
885
  )
886
  key_acquired = True
887
  tried_creds.add(current_cred)
 
964
  if provider_instance:
965
  # Ensure default Gemini safety settings are present (without overriding request)
966
  try:
967
+ self._apply_default_safety_settings(
968
+ litellm_kwargs, provider
969
+ )
970
  except Exception:
971
  # If anything goes wrong here, avoid breaking the request flow.
972
+ lib_logger.debug(
973
+ "Could not apply default safety settings; continuing."
974
+ )
975
 
976
  if "safety_settings" in litellm_kwargs:
977
  converted_settings = (
 
1054
 
1055
  # Extract a clean error message for the user-facing log
1056
  error_message = str(e).split("\n")[0]
1057
+
1058
  # Record in accumulator for client reporting
1059
+ error_accumulator.record_error(
1060
+ current_cred, classified_error, error_message
1061
+ )
1062
 
1063
  lib_logger.info(
1064
  f"Key {mask_credential(current_cred)} hit rate limit for {model}. Rotating key."
 
1092
  )
1093
  classified_error = classify_error(e)
1094
  error_message = str(e).split("\n")[0]
1095
+
1096
  # Provider-level error: don't increment consecutive failures
1097
  await self.usage_manager.record_failure(
1098
+ current_cred,
1099
+ model,
1100
+ classified_error,
1101
+ increment_consecutive_failures=False,
1102
  )
1103
 
1104
  if attempt >= self.max_retries - 1:
1105
  # Record in accumulator only on final failure for this key
1106
+ error_accumulator.record_error(
1107
+ current_cred, classified_error, error_message
1108
+ )
1109
  lib_logger.warning(
1110
  f"Key {mask_credential(current_cred)} failed after max retries due to server error. Rotating."
1111
  )
 
1113
 
1114
  # For temporary errors, wait before retrying with the same key.
1115
  wait_time = classified_error.retry_after or (
1116
+ 2**attempt
1117
  ) + random.uniform(0, 1)
1118
  remaining_budget = deadline - time.time()
1119
 
1120
  # If the required wait time exceeds the budget, don't wait; rotate to the next key immediately.
1121
  if wait_time > remaining_budget:
1122
+ error_accumulator.record_error(
1123
+ current_cred, classified_error, error_message
1124
+ )
1125
  lib_logger.warning(
1126
  f"Retry wait ({wait_time:.2f}s) exceeds budget ({remaining_budget:.2f}s). Rotating key."
1127
  )
 
1145
  if request
1146
  else {},
1147
  )
1148
+
1149
  classified_error = classify_error(e)
1150
  error_message = str(e).split("\n")[0]
1151
+
 
 
 
1152
  lib_logger.warning(
1153
  f"Key {mask_credential(current_cred)} HTTP {e.response.status_code} ({classified_error.error_type})."
1154
  )
1155
+
1156
  # Check if this error should trigger rotation
1157
  if not should_rotate_on_error(classified_error):
1158
  lib_logger.error(
1159
  f"Non-recoverable error ({classified_error.error_type}). Failing request."
1160
  )
1161
  raise last_exception
1162
+
1163
+ # Record in accumulator after confirming it's a rotatable error
1164
+ error_accumulator.record_error(
1165
+ current_cred, classified_error, error_message
1166
+ )
1167
+
1168
  # Handle rate limits with cooldown
1169
+ if classified_error.error_type in [
1170
+ "rate_limit",
1171
+ "quota_exceeded",
1172
+ ]:
1173
  cooldown_duration = classified_error.retry_after or 60
1174
  await self.cooldown_manager.start_cooldown(
1175
  provider, cooldown_duration
1176
  )
1177
+
1178
  # Check if we should retry same key (server errors with retries left)
1179
+ if (
1180
+ should_retry_same_key(classified_error)
1181
+ and attempt < self.max_retries - 1
1182
+ ):
1183
+ wait_time = classified_error.retry_after or (
1184
+ 2**attempt
1185
+ ) + random.uniform(0, 1)
1186
  remaining_budget = deadline - time.time()
1187
  if wait_time <= remaining_budget:
1188
  lib_logger.warning(
 
1190
  )
1191
  await asyncio.sleep(wait_time)
1192
  continue
1193
+
1194
  # Record failure and rotate to next key
1195
  await self.usage_manager.record_failure(
1196
  current_cred, model, classified_error
1197
  )
1198
+ lib_logger.info(
1199
+ f"Rotating to next key after {classified_error.error_type} error."
1200
+ )
1201
  break
1202
 
1203
  except Exception as e:
 
1220
 
1221
  classified_error = classify_error(e)
1222
  error_message = str(e).split("\n")[0]
1223
+
 
 
 
1224
  lib_logger.warning(
1225
  f"Key {mask_credential(current_cred)} {classified_error.error_type} (HTTP {classified_error.status_code})."
1226
  )
1227
+
1228
  # Handle rate limits with cooldown
1229
+ if (
1230
+ classified_error.status_code == 429
1231
+ or classified_error.error_type
1232
+ in ["rate_limit", "quota_exceeded"]
1233
+ ):
1234
  cooldown_duration = classified_error.retry_after or 60
1235
  await self.cooldown_manager.start_cooldown(
1236
  provider, cooldown_duration
 
1243
  )
1244
  raise last_exception
1245
 
1246
+ # Record in accumulator after confirming it's a rotatable error
1247
+ error_accumulator.record_error(
1248
+ current_cred, classified_error, error_message
1249
+ )
1250
+
1251
  await self.usage_manager.record_failure(
1252
  current_cred, model, classified_error
1253
  )
 
1259
  # Check if we exhausted all credentials or timed out
1260
  if time.time() >= deadline:
1261
  error_accumulator.timeout_occurred = True
1262
+
1263
  if error_accumulator.has_errors():
1264
  # Log concise summary for server logs
1265
  lib_logger.error(error_accumulator.build_log_message())
1266
+
1267
  # Return the structured error response for the client
1268
  return error_accumulator.build_client_error_response()
1269
 
1270
  # Return None to indicate failure without error details (shouldn't normally happen)
1271
+ lib_logger.warning(
1272
+ "Unexpected state: request failed with no recorded errors. "
1273
+ "This may indicate a logic error in error tracking."
1274
+ )
1275
  return None
1276
 
1277
  async def _streaming_acompletion_with_retry(
 
1287
  # Create a mutable copy of the keys and shuffle it.
1288
  credentials_for_provider = list(self.all_credentials[provider])
1289
  random.shuffle(credentials_for_provider)
1290
+
1291
  # Filter out credentials that are unavailable (queued for re-auth)
1292
  provider_plugin = self._get_provider_instance(provider)
1293
+ if provider_plugin and hasattr(provider_plugin, "is_credential_available"):
1294
  available_creds = [
1295
+ cred
1296
+ for cred in credentials_for_provider
1297
  if provider_plugin.is_credential_available(cred)
1298
  ]
1299
  if available_creds:
 
1315
  lib_logger.info(f"Resolved model '{model}' to '{resolved_model}'")
1316
  model = resolved_model
1317
  kwargs["model"] = model # Ensure kwargs has the resolved model for litellm
1318
+
1319
  # [NEW] Filter by model tier requirement and build priority map
1320
  credential_priorities = None
1321
+ if provider_plugin and hasattr(provider_plugin, "get_model_tier_requirement"):
1322
  required_tier = provider_plugin.get_model_tier_requirement(model)
1323
  if required_tier is not None:
1324
  # Filter OUT only credentials we KNOW are too low priority
 
1326
  incompatible_creds = []
1327
  compatible_creds = []
1328
  unknown_creds = []
1329
+
1330
  for cred in credentials_for_provider:
1331
+ if hasattr(provider_plugin, "get_credential_priority"):
1332
  priority = provider_plugin.get_credential_priority(cred)
1333
  if priority is None:
1334
  # Unknown priority - keep it, will be discovered on first use
 
1342
  else:
1343
  # Provider doesn't support priorities - keep all
1344
  unknown_creds.append(cred)
1345
+
1346
  # If we have any known-compatible or unknown credentials, use them
1347
  tier_compatible_creds = compatible_creds + unknown_creds
1348
  if tier_compatible_creds:
 
1369
  f"but all {len(incompatible_creds)} known credentials have priority > {required_tier}. "
1370
  f"Request will likely fail."
1371
  )
1372
+
1373
  # Build priority map for usage_manager
1374
+ if provider_plugin and hasattr(provider_plugin, "get_credential_priority"):
1375
  credential_priorities = {}
1376
  for cred in credentials_for_provider:
1377
  priority = provider_plugin.get_credential_priority(cred)
1378
  if priority is not None:
1379
  credential_priorities[cred] = priority
1380
+
1381
  if credential_priorities:
1382
  lib_logger.debug(
1383
+ f"Credential priorities for {provider}: {', '.join(f'P{p}={len([c for c in credentials_for_provider if credential_priorities.get(c) == p])}' for p in sorted(set(credential_priorities.values())))}"
1384
  )
1385
 
1386
  # Initialize error accumulator for tracking errors across credential rotation
 
1423
  lib_logger.info(
1424
  f"Acquiring credential for model {model}. Tried credentials: {len(tried_creds)}/{len(credentials_for_provider)}"
1425
  )
1426
+ max_concurrent = self.max_concurrent_requests_per_key.get(
1427
+ provider, 1
1428
+ )
1429
  current_cred = await self.usage_manager.acquire_key(
1430
+ available_keys=creds_to_try,
1431
+ model=model,
1432
+ deadline=deadline,
1433
  max_concurrent=max_concurrent,
1434
+ credential_priorities=credential_priorities,
1435
  )
1436
  key_acquired = True
1437
  tried_creds.add(current_cred)
 
1540
  original_exc = getattr(e, "data", e)
1541
  classified_error = classify_error(original_exc)
1542
  error_message = str(original_exc).split("\n")[0]
1543
+
1544
  log_failure(
1545
  api_key=current_cred,
1546
  model=model,
 
1550
  if request
1551
  else {},
1552
  )
1553
+
1554
  # Record in accumulator for client reporting
1555
+ error_accumulator.record_error(
1556
+ current_cred, classified_error, error_message
1557
+ )
1558
+
1559
  # Check if this error should trigger rotation
1560
  if not should_rotate_on_error(classified_error):
1561
  lib_logger.error(
1562
  f"Non-recoverable error ({classified_error.error_type}) during custom stream. Failing."
1563
  )
1564
  raise last_exception
1565
+
1566
  # Handle rate limits with cooldown
1567
+ if classified_error.error_type in [
1568
+ "rate_limit",
1569
+ "quota_exceeded",
1570
+ ]:
1571
+ cooldown_duration = (
1572
+ classified_error.retry_after or 60
1573
+ )
1574
  await self.cooldown_manager.start_cooldown(
1575
  provider, cooldown_duration
1576
  )
1577
+
1578
  await self.usage_manager.record_failure(
1579
  current_cred, model, classified_error
1580
  )
 
1600
  )
1601
  classified_error = classify_error(e)
1602
  error_message = str(e).split("\n")[0]
1603
+
1604
  # Provider-level error: don't increment consecutive failures
1605
  await self.usage_manager.record_failure(
1606
+ current_cred,
1607
+ model,
1608
+ classified_error,
1609
+ increment_consecutive_failures=False,
1610
  )
1611
 
1612
  if attempt >= self.max_retries - 1:
1613
+ error_accumulator.record_error(
1614
+ current_cred, classified_error, error_message
1615
+ )
1616
  lib_logger.warning(
1617
  f"Cred {mask_credential(current_cred)} failed after max retries. Rotating."
1618
  )
1619
  break
1620
 
1621
  wait_time = classified_error.retry_after or (
1622
+ 2**attempt
1623
  ) + random.uniform(0, 1)
1624
  remaining_budget = deadline - time.time()
1625
  if wait_time > remaining_budget:
1626
+ error_accumulator.record_error(
1627
+ current_cred, classified_error, error_message
1628
+ )
1629
  lib_logger.warning(
1630
  f"Retry wait ({wait_time:.2f}s) exceeds budget. Rotating."
1631
  )
 
1650
  )
1651
  classified_error = classify_error(e)
1652
  error_message = str(e).split("\n")[0]
1653
+
1654
  # Record in accumulator
1655
+ error_accumulator.record_error(
1656
+ current_cred, classified_error, error_message
1657
+ )
1658
+
1659
  lib_logger.warning(
1660
  f"Cred {mask_credential(current_cred)} {classified_error.error_type} (HTTP {classified_error.status_code})."
1661
  )
1662
+
1663
  # Check if this error should trigger rotation
1664
  if not should_rotate_on_error(classified_error):
1665
  lib_logger.error(
1666
  f"Non-recoverable error ({classified_error.error_type}). Failing."
1667
  )
1668
  raise last_exception
1669
+
1670
  await self.usage_manager.record_failure(
1671
  current_cred, model, classified_error
1672
  )
 
1688
  if provider_instance:
1689
  # Ensure default Gemini safety settings are present (without overriding request)
1690
  try:
1691
+ self._apply_default_safety_settings(
1692
+ litellm_kwargs, provider
1693
+ )
1694
  except Exception:
1695
+ lib_logger.debug(
1696
+ "Could not apply default safety settings for streaming path; continuing."
1697
+ )
1698
 
1699
  if "safety_settings" in litellm_kwargs:
1700
  converted_settings = (
 
1775
  yield chunk
1776
  return
1777
 
1778
+ except (
1779
+ StreamedAPIError,
1780
+ litellm.RateLimitError,
1781
+ httpx.HTTPStatusError,
1782
+ ) as e:
1783
  last_exception = e
1784
 
1785
  # This is the final, robust handler for streamed errors.
 
1788
  # The actual exception might be wrapped in our StreamedAPIError.
1789
  original_exc = getattr(e, "data", e)
1790
  classified_error = classify_error(original_exc)
1791
+
1792
  # Check if this error should trigger rotation
1793
  if not should_rotate_on_error(classified_error):
1794
  lib_logger.error(
 
1825
  error_message_text = error_details.get(
1826
  "message", str(original_exc).split("\n")[0]
1827
  )
1828
+
1829
  # Record in accumulator for client reporting
1830
+ error_accumulator.record_error(
1831
+ current_cred, classified_error, error_message_text
1832
+ )
1833
 
1834
  if (
1835
  "quota" in error_message_text.lower()
1836
  or "resource_exhausted" in error_status.lower()
1837
  ):
1838
  consecutive_quota_failures += 1
1839
+
1840
  quota_value = "N/A"
1841
  quota_id = "N/A"
1842
  if "details" in error_details and isinstance(
 
1846
  if isinstance(detail.get("violations"), list):
1847
  for violation in detail["violations"]:
1848
  if "quotaValue" in violation:
1849
+ quota_value = violation[
1850
+ "quotaValue"
1851
+ ]
1852
  if "quotaId" in violation:
1853
  quota_id = violation["quotaId"]
1854
+ if (
1855
+ quota_value != "N/A"
1856
+ and quota_id != "N/A"
1857
+ ):
1858
  break
1859
 
1860
  await self.usage_manager.record_failure(
 
1885
  f"Cred {mask_credential(current_cred)} {classified_error.error_type}. Rotating."
1886
  )
1887
 
1888
+ if classified_error.error_type in [
1889
+ "rate_limit",
1890
+ "quota_exceeded",
1891
+ ]:
1892
+ cooldown_duration = (
1893
+ classified_error.retry_after or 60
1894
+ )
1895
  await self.cooldown_manager.start_cooldown(
1896
  provider, cooldown_duration
1897
  )
 
1919
  )
1920
  classified_error = classify_error(e)
1921
  error_message_text = str(e).split("\n")[0]
1922
+
1923
+ # Record error in accumulator (server errors are transient, not abnormal)
1924
+ error_accumulator.record_error(
1925
+ current_cred, classified_error, error_message_text
1926
+ )
1927
+
1928
  # Provider-level error: don't increment consecutive failures
1929
  await self.usage_manager.record_failure(
1930
+ current_cred,
1931
+ model,
1932
+ classified_error,
1933
+ increment_consecutive_failures=False,
1934
  )
1935
 
1936
  if attempt >= self.max_retries - 1:
 
1941
  break
1942
 
1943
  wait_time = classified_error.retry_after or (
1944
+ 2**attempt
1945
  ) + random.uniform(0, 1)
1946
  remaining_budget = deadline - time.time()
1947
  if wait_time > remaining_budget:
 
1970
  )
1971
  classified_error = classify_error(e)
1972
  error_message_text = str(e).split("\n")[0]
1973
+
1974
  # Record error in accumulator
1975
+ error_accumulator.record_error(
1976
+ current_cred, classified_error, error_message_text
1977
+ )
1978
 
1979
  lib_logger.warning(
1980
  f"Credential ...{current_cred[-6:]} failed with {classified_error.error_type} (Status: {classified_error.status_code}). Error: {error_message_text}."
1981
  )
1982
 
1983
  # Handle rate limits with cooldown
1984
+ if (
1985
+ classified_error.status_code == 429
1986
+ or classified_error.error_type
1987
+ in ["rate_limit", "quota_exceeded"]
1988
+ ):
1989
  cooldown_duration = classified_error.retry_after or 60
1990
  await self.cooldown_manager.start_cooldown(
1991
  provider, cooldown_duration
 
2006
  await self.usage_manager.record_failure(
2007
  current_cred, model, classified_error
2008
  )
2009
+ lib_logger.info(
2010
+ f"Rotating to next key after {classified_error.error_type} error."
2011
+ )
2012
  break
2013
 
2014
  finally:
 
2017
 
2018
  # Build detailed error response using error accumulator
2019
  error_accumulator.timeout_occurred = time.time() >= deadline
2020
+
 
 
2021
  if error_accumulator.has_errors():
2022
  # Log concise summary for server logs
2023
  lib_logger.error(error_accumulator.build_log_message())
2024
+
2025
  # Build structured error response for client
2026
  error_response = error_accumulator.build_client_error_response()
2027
  error_data = error_response
2028
  else:
2029
  # Fallback if no errors were recorded (shouldn't happen)
2030
+ final_error_message = (
2031
+ "Request failed: No available API keys after rotation or timeout."
2032
+ )
2033
  if last_exception:
2034
+ final_error_message = (
2035
+ f"Request failed. Last error: {str(last_exception)}"
2036
+ )
2037
  error_data = {
2038
  "error": {"message": final_error_message, "type": "proxy_error"}
2039
  }
2040
  lib_logger.error(final_error_message)
2041
+
2042
  yield f"data: {json.dumps(error_data)}\n\n"
2043
  yield "data: [DONE]\n\n"
2044
 
 
2086
  # Handle iflow provider: remove stream_options to avoid HTTP 406
2087
  model = kwargs.get("model", "")
2088
  provider = model.split("/")[0] if "/" in model else ""
2089
+
2090
  if provider == "iflow" and "stream_options" in kwargs:
2091
+ lib_logger.debug(
2092
+ "Removing stream_options for iflow provider to avoid HTTP 406"
2093
+ )
2094
  kwargs.pop("stream_options", None)
2095
+
2096
  if kwargs.get("stream"):
2097
  # Only add stream_options for providers that support it (excluding iflow)
2098
  if provider != "iflow":
 
2100
  kwargs["stream_options"] = {}
2101
  if "include_usage" not in kwargs["stream_options"]:
2102
  kwargs["stream_options"]["include_usage"] = True
2103
+
2104
  return self._streaming_acompletion_with_retry(
2105
  request=request, pre_request_callback=pre_request_callback, **kwargs
2106
  )
src/rotator_library/error_handler.py CHANGED
@@ -1,5 +1,6 @@
1
  import re
2
  import json
 
3
  from typing import Optional, Dict, Any
4
  import httpx
5
 
@@ -20,20 +21,20 @@ from litellm.exceptions import (
20
  def extract_retry_after_from_body(error_body: Optional[str]) -> Optional[int]:
21
  """
22
  Extract the retry-after time from an API error response body.
23
-
24
  Handles various error formats including:
25
  - Gemini CLI: "Your quota will reset after 39s."
26
  - Generic: "quota will reset after 120s", "retry after 60s"
27
-
28
  Args:
29
  error_body: The raw error response body
30
-
31
  Returns:
32
  The retry time in seconds, or None if not found
33
  """
34
  if not error_body:
35
  return None
36
-
37
  # Pattern to match various "reset after Xs" or "retry after Xs" formats
38
  patterns = [
39
  r"quota will reset after\s*(\d+)s",
@@ -41,7 +42,7 @@ def extract_retry_after_from_body(error_body: Optional[str]) -> Optional[int]:
41
  r"retry after\s*(\d+)s",
42
  r"try again in\s*(\d+)\s*seconds?",
43
  ]
44
-
45
  for pattern in patterns:
46
  match = re.search(pattern, error_body, re.IGNORECASE)
47
  if match:
@@ -49,7 +50,7 @@ def extract_retry_after_from_body(error_body: Optional[str]) -> Optional[int]:
49
  return int(match.group(1))
50
  except (ValueError, IndexError):
51
  continue
52
-
53
  return None
54
 
55
 
@@ -70,29 +71,33 @@ class PreRequestCallbackError(Exception):
70
  # =============================================================================
71
 
72
  # Abnormal errors that require attention and should always be reported to client
73
- ABNORMAL_ERROR_TYPES = frozenset({
74
- "forbidden", # 403 - credential access issue
75
- "authentication", # 401 - credential invalid/revoked
76
- "pre_request_callback_error", # Internal proxy error
77
- })
 
 
78
 
79
  # Normal/expected errors during operation - only report if ALL credentials fail
80
- NORMAL_ERROR_TYPES = frozenset({
81
- "rate_limit", # 429 - expected during high load
82
- "quota_exceeded", # Expected when quota runs out
83
- "server_error", # 5xx - transient provider issues
84
- "api_connection", # Network issues - transient
85
- })
 
 
86
 
87
 
88
  def is_abnormal_error(classified_error: "ClassifiedError") -> bool:
89
  """
90
  Check if an error is abnormal and should be reported to the client.
91
-
92
  Abnormal errors indicate credential issues that need attention:
93
  - 403 Forbidden: Credential doesn't have access
94
  - 401 Unauthorized: Credential is invalid/revoked
95
-
96
  Normal errors are expected during operation:
97
  - 429 Rate limit: Expected during high load
98
  - 5xx Server errors: Transient provider issues
@@ -103,11 +108,10 @@ def is_abnormal_error(classified_error: "ClassifiedError") -> bool:
103
  def mask_credential(credential: str) -> str:
104
  """
105
  Mask a credential for safe display in logs and error messages.
106
-
107
  - For API keys: shows last 6 characters (e.g., "...xyz123")
108
  - For OAuth file paths: shows just the filename (e.g., "antigravity_oauth_1.json")
109
  """
110
- import os
111
  if os.path.isfile(credential):
112
  return os.path.basename(credential)
113
  elif len(credential) > 6:
@@ -119,77 +123,79 @@ def mask_credential(credential: str) -> str:
119
  class RequestErrorAccumulator:
120
  """
121
  Tracks errors encountered during a request's credential rotation cycle.
122
-
123
  Used to build informative error messages for clients when all credentials
124
  are exhausted. Distinguishes between abnormal errors (that need attention)
125
  and normal errors (expected during operation).
126
  """
127
-
128
  def __init__(self):
129
  self.abnormal_errors: list = [] # 403, 401 - always report details
130
- self.normal_errors: list = [] # 429, 5xx - summarize only
131
- self.total_credentials_tried: int = 0
132
  self.timeout_occurred: bool = False
133
  self.model: str = ""
134
  self.provider: str = ""
135
-
136
  def record_error(
137
- self,
138
- credential: str,
139
- classified_error: "ClassifiedError",
140
- error_message: str
141
  ):
142
  """Record an error for a credential."""
143
- self.total_credentials_tried += 1
144
  masked_cred = mask_credential(credential)
145
-
146
  error_record = {
147
  "credential": masked_cred,
148
  "error_type": classified_error.error_type,
149
  "status_code": classified_error.status_code,
150
- "message": self._truncate_message(error_message, 150)
151
  }
152
-
153
  if is_abnormal_error(classified_error):
154
  self.abnormal_errors.append(error_record)
155
  else:
156
  self.normal_errors.append(error_record)
157
-
 
 
 
 
 
158
  def _truncate_message(self, message: str, max_length: int = 150) -> str:
159
  """Truncate error message for readability."""
160
  # Take first line and truncate
161
- first_line = message.split('\n')[0]
162
  if len(first_line) > max_length:
163
  return first_line[:max_length] + "..."
164
  return first_line
165
-
166
  def has_errors(self) -> bool:
167
  """Check if any errors were recorded."""
168
  return bool(self.abnormal_errors or self.normal_errors)
169
-
170
  def has_abnormal_errors(self) -> bool:
171
  """Check if any abnormal errors were recorded."""
172
  return bool(self.abnormal_errors)
173
-
174
  def get_normal_error_summary(self) -> str:
175
  """Get a summary of normal errors (not individual details)."""
176
  if not self.normal_errors:
177
  return ""
178
-
179
  # Count by type
180
  counts = {}
181
  for err in self.normal_errors:
182
  err_type = err["error_type"]
183
  counts[err_type] = counts.get(err_type, 0) + 1
184
-
185
  # Build summary like "3 rate_limit, 1 server_error"
186
  parts = [f"{count} {err_type}" for err_type, count in counts.items()]
187
  return ", ".join(parts)
188
-
189
  def build_client_error_response(self) -> dict:
190
  """
191
  Build a structured error response for the client.
192
-
193
  Returns a dict suitable for JSON serialization in the error response.
194
  """
195
  # Determine the primary failure reason
@@ -199,24 +205,34 @@ class RequestErrorAccumulator:
199
  else:
200
  error_type = "proxy_all_credentials_exhausted"
201
  base_message = f"All {self.total_credentials_tried} credential(s) exhausted for {self.provider}"
202
-
203
  # Build human-readable message
204
  message_parts = [base_message]
205
-
206
  if self.abnormal_errors:
207
  message_parts.append("\n\nCredential issues (require attention):")
208
  for err in self.abnormal_errors:
209
- status = f"HTTP {err['status_code']}" if err['status_code'] else err['error_type']
210
- message_parts.append(f"\n • {err['credential']}: {status} - {err['message']}")
211
-
 
 
 
 
 
 
212
  normal_summary = self.get_normal_error_summary()
213
  if normal_summary:
214
  if self.abnormal_errors:
215
- message_parts.append(f"\n\nAdditionally: {normal_summary} (expected during normal operation)")
 
 
216
  else:
217
  message_parts.append(f"\n\nAll failures were: {normal_summary}")
218
- message_parts.append("\nThis is normal during high load - retry later or add more credentials.")
219
-
 
 
220
  response = {
221
  "error": {
222
  "message": "".join(message_parts),
@@ -226,44 +242,48 @@ class RequestErrorAccumulator:
226
  "provider": self.provider,
227
  "credentials_tried": self.total_credentials_tried,
228
  "timeout": self.timeout_occurred,
229
- }
230
  }
231
  }
232
-
233
  # Only include abnormal errors in details (they need attention)
234
  if self.abnormal_errors:
235
  response["error"]["details"]["abnormal_errors"] = self.abnormal_errors
236
-
237
  # Include summary of normal errors
238
  if normal_summary:
239
  response["error"]["details"]["normal_error_summary"] = normal_summary
240
-
241
  return response
242
-
243
  def build_log_message(self) -> str:
244
  """
245
  Build a concise log message for server-side logging.
246
-
247
  Shorter than client message, suitable for terminal display.
248
  """
249
  parts = []
250
-
251
  if self.timeout_occurred:
252
- parts.append(f"TIMEOUT: {self.total_credentials_tried} creds tried for {self.model}")
 
 
253
  else:
254
- parts.append(f"ALL CREDS EXHAUSTED: {self.total_credentials_tried} tried for {self.model}")
255
-
 
 
256
  if self.abnormal_errors:
257
  abnormal_summary = ", ".join(
258
  f"{e['credential']}={e['status_code'] or e['error_type']}"
259
  for e in self.abnormal_errors
260
  )
261
  parts.append(f"ISSUES: {abnormal_summary}")
262
-
263
  normal_summary = self.get_normal_error_summary()
264
  if normal_summary:
265
  parts.append(f"Normal: {normal_summary}")
266
-
267
  return " | ".join(parts)
268
 
269
 
@@ -296,7 +316,7 @@ def get_retry_after(error: Exception) -> Optional[int]:
296
  if isinstance(error, httpx.HTTPStatusError):
297
  headers = error.response.headers
298
  # Check standard Retry-After header (case-insensitive)
299
- retry_header = headers.get('retry-after') or headers.get('Retry-After')
300
  if retry_header:
301
  try:
302
  return int(retry_header) # Assumes seconds format
@@ -304,10 +324,13 @@ def get_retry_after(error: Exception) -> Optional[int]:
304
  pass # Might be HTTP date format, skip for now
305
 
306
  # Check X-RateLimit-Reset header (Unix timestamp)
307
- reset_header = headers.get('x-ratelimit-reset') or headers.get('X-RateLimit-Reset')
 
 
308
  if reset_header:
309
  try:
310
  import time
 
311
  reset_timestamp = int(reset_header)
312
  current_time = int(time.time())
313
  wait_seconds = reset_timestamp - current_time
@@ -357,16 +380,16 @@ def get_retry_after(error: Exception) -> Optional[int]:
357
  continue
358
 
359
  # 3. Handle duration formats like "60s", "2m", "1h"
360
- duration_match = re.search(r'(\d+)\s*([smh])', error_str)
361
  if duration_match:
362
  try:
363
  value = int(duration_match.group(1))
364
  unit = duration_match.group(2)
365
- if unit == 's':
366
  return value
367
- elif unit == 'm':
368
  return value * 60
369
- elif unit == 'h':
370
  return value * 3600
371
  except (ValueError, IndexError):
372
  pass
@@ -381,15 +404,15 @@ def get_retry_after(error: Exception) -> Optional[int]:
381
  if value.isdigit():
382
  return int(value)
383
  # Handle "60s", "2m" format in attribute
384
- duration_match = re.search(r'(\d+)\s*([smh])', value.lower())
385
  if duration_match:
386
  val = int(duration_match.group(1))
387
  unit = duration_match.group(2)
388
- if unit == 's':
389
  return val
390
- elif unit == 'm':
391
  return val * 60
392
- elif unit == 'h':
393
  return val * 3600
394
 
395
  return None
@@ -399,7 +422,7 @@ def classify_error(e: Exception) -> ClassifiedError:
399
  """
400
  Classifies an exception into a structured ClassifiedError object.
401
  Now handles both litellm and httpx exceptions.
402
-
403
  Error types and their typical handling:
404
  - rate_limit (429): Rotate key, may retry with backoff
405
  - server_error (5xx): Retry with backoff, then rotate
@@ -412,16 +435,16 @@ def classify_error(e: Exception) -> ClassifiedError:
412
  - unknown: Rotate key (safer to try another)
413
  """
414
  status_code = getattr(e, "status_code", None)
415
-
416
  if isinstance(e, httpx.HTTPStatusError): # [NEW] Handle httpx errors first
417
  status_code = e.response.status_code
418
-
419
  # Try to get error body for better classification
420
  try:
421
- error_body = e.response.text.lower() if hasattr(e.response, 'text') else ""
422
  except Exception:
423
  error_body = ""
424
-
425
  if status_code == 401:
426
  return ClassifiedError(
427
  error_type="authentication",
@@ -453,8 +476,18 @@ def classify_error(e: Exception) -> ClassifiedError:
453
  retry_after=retry_after,
454
  )
455
  if status_code == 400:
456
- # Check for context window / token limit errors
457
- if "context" in error_body or "token" in error_body or "too long" in error_body:
 
 
 
 
 
 
 
 
 
 
458
  return ClassifiedError(
459
  error_type="context_window_exceeded",
460
  original_exception=e,
@@ -465,6 +498,11 @@ def classify_error(e: Exception) -> ClassifiedError:
465
  original_exception=e,
466
  status_code=status_code,
467
  )
 
 
 
 
 
468
  if 400 <= status_code < 500:
469
  # Other 4xx errors - generally client errors
470
  return ClassifiedError(
@@ -567,7 +605,7 @@ def is_unrecoverable_error(e: Exception) -> bool:
567
  def should_rotate_on_error(classified_error: ClassifiedError) -> bool:
568
  """
569
  Determines if an error should trigger key rotation.
570
-
571
  Errors that SHOULD rotate (try another key):
572
  - rate_limit: Current key is throttled
573
  - quota_exceeded: Current key/account exhausted
@@ -576,12 +614,12 @@ def should_rotate_on_error(classified_error: ClassifiedError) -> bool:
576
  - server_error: Provider having issues (might work with different endpoint/key)
577
  - api_connection: Network issues (might be transient)
578
  - unknown: Safer to try another key
579
-
580
  Errors that should NOT rotate (fail immediately):
581
  - invalid_request: Client error in request payload (won't help to retry)
582
  - context_window_exceeded: Request too large (won't help to retry)
583
  - pre_request_callback_error: Internal proxy error
584
-
585
  Returns:
586
  True if should rotate to next key, False if should fail immediately
587
  """
@@ -596,10 +634,10 @@ def should_rotate_on_error(classified_error: ClassifiedError) -> bool:
596
  def should_retry_same_key(classified_error: ClassifiedError) -> bool:
597
  """
598
  Determines if an error should retry with the same key (with backoff).
599
-
600
  Only server errors and connection issues should retry the same key,
601
  as these are often transient.
602
-
603
  Returns:
604
  True if should retry same key, False if should rotate immediately
605
  """
 
1
  import re
2
  import json
3
+ import os
4
  from typing import Optional, Dict, Any
5
  import httpx
6
 
 
21
  def extract_retry_after_from_body(error_body: Optional[str]) -> Optional[int]:
22
  """
23
  Extract the retry-after time from an API error response body.
24
+
25
  Handles various error formats including:
26
  - Gemini CLI: "Your quota will reset after 39s."
27
  - Generic: "quota will reset after 120s", "retry after 60s"
28
+
29
  Args:
30
  error_body: The raw error response body
31
+
32
  Returns:
33
  The retry time in seconds, or None if not found
34
  """
35
  if not error_body:
36
  return None
37
+
38
  # Pattern to match various "reset after Xs" or "retry after Xs" formats
39
  patterns = [
40
  r"quota will reset after\s*(\d+)s",
 
42
  r"retry after\s*(\d+)s",
43
  r"try again in\s*(\d+)\s*seconds?",
44
  ]
45
+
46
  for pattern in patterns:
47
  match = re.search(pattern, error_body, re.IGNORECASE)
48
  if match:
 
50
  return int(match.group(1))
51
  except (ValueError, IndexError):
52
  continue
53
+
54
  return None
55
 
56
 
 
71
  # =============================================================================
72
 
73
  # Abnormal errors that require attention and should always be reported to client
74
+ ABNORMAL_ERROR_TYPES = frozenset(
75
+ {
76
+ "forbidden", # 403 - credential access issue
77
+ "authentication", # 401 - credential invalid/revoked
78
+ "pre_request_callback_error", # Internal proxy error
79
+ }
80
+ )
81
 
82
  # Normal/expected errors during operation - only report if ALL credentials fail
83
+ NORMAL_ERROR_TYPES = frozenset(
84
+ {
85
+ "rate_limit", # 429 - expected during high load
86
+ "quota_exceeded", # Expected when quota runs out
87
+ "server_error", # 5xx - transient provider issues
88
+ "api_connection", # Network issues - transient
89
+ }
90
+ )
91
 
92
 
93
  def is_abnormal_error(classified_error: "ClassifiedError") -> bool:
94
  """
95
  Check if an error is abnormal and should be reported to the client.
96
+
97
  Abnormal errors indicate credential issues that need attention:
98
  - 403 Forbidden: Credential doesn't have access
99
  - 401 Unauthorized: Credential is invalid/revoked
100
+
101
  Normal errors are expected during operation:
102
  - 429 Rate limit: Expected during high load
103
  - 5xx Server errors: Transient provider issues
 
108
  def mask_credential(credential: str) -> str:
109
  """
110
  Mask a credential for safe display in logs and error messages.
111
+
112
  - For API keys: shows last 6 characters (e.g., "...xyz123")
113
  - For OAuth file paths: shows just the filename (e.g., "antigravity_oauth_1.json")
114
  """
 
115
  if os.path.isfile(credential):
116
  return os.path.basename(credential)
117
  elif len(credential) > 6:
 
123
  class RequestErrorAccumulator:
124
  """
125
  Tracks errors encountered during a request's credential rotation cycle.
126
+
127
  Used to build informative error messages for clients when all credentials
128
  are exhausted. Distinguishes between abnormal errors (that need attention)
129
  and normal errors (expected during operation).
130
  """
131
+
132
  def __init__(self):
133
  self.abnormal_errors: list = [] # 403, 401 - always report details
134
+ self.normal_errors: list = [] # 429, 5xx - summarize only
135
+ self._tried_credentials: set = set() # Track unique credentials
136
  self.timeout_occurred: bool = False
137
  self.model: str = ""
138
  self.provider: str = ""
139
+
140
  def record_error(
141
+ self, credential: str, classified_error: "ClassifiedError", error_message: str
 
 
 
142
  ):
143
  """Record an error for a credential."""
144
+ self._tried_credentials.add(credential)
145
  masked_cred = mask_credential(credential)
146
+
147
  error_record = {
148
  "credential": masked_cred,
149
  "error_type": classified_error.error_type,
150
  "status_code": classified_error.status_code,
151
+ "message": self._truncate_message(error_message, 150),
152
  }
153
+
154
  if is_abnormal_error(classified_error):
155
  self.abnormal_errors.append(error_record)
156
  else:
157
  self.normal_errors.append(error_record)
158
+
159
+ @property
160
+ def total_credentials_tried(self) -> int:
161
+ """Return the number of unique credentials tried."""
162
+ return len(self._tried_credentials)
163
+
164
  def _truncate_message(self, message: str, max_length: int = 150) -> str:
165
  """Truncate error message for readability."""
166
  # Take first line and truncate
167
+ first_line = message.split("\n")[0]
168
  if len(first_line) > max_length:
169
  return first_line[:max_length] + "..."
170
  return first_line
171
+
172
  def has_errors(self) -> bool:
173
  """Check if any errors were recorded."""
174
  return bool(self.abnormal_errors or self.normal_errors)
175
+
176
  def has_abnormal_errors(self) -> bool:
177
  """Check if any abnormal errors were recorded."""
178
  return bool(self.abnormal_errors)
179
+
180
  def get_normal_error_summary(self) -> str:
181
  """Get a summary of normal errors (not individual details)."""
182
  if not self.normal_errors:
183
  return ""
184
+
185
  # Count by type
186
  counts = {}
187
  for err in self.normal_errors:
188
  err_type = err["error_type"]
189
  counts[err_type] = counts.get(err_type, 0) + 1
190
+
191
  # Build summary like "3 rate_limit, 1 server_error"
192
  parts = [f"{count} {err_type}" for err_type, count in counts.items()]
193
  return ", ".join(parts)
194
+
195
  def build_client_error_response(self) -> dict:
196
  """
197
  Build a structured error response for the client.
198
+
199
  Returns a dict suitable for JSON serialization in the error response.
200
  """
201
  # Determine the primary failure reason
 
205
  else:
206
  error_type = "proxy_all_credentials_exhausted"
207
  base_message = f"All {self.total_credentials_tried} credential(s) exhausted for {self.provider}"
208
+
209
  # Build human-readable message
210
  message_parts = [base_message]
211
+
212
  if self.abnormal_errors:
213
  message_parts.append("\n\nCredential issues (require attention):")
214
  for err in self.abnormal_errors:
215
+ status = (
216
+ f"HTTP {err['status_code']}"
217
+ if err["status_code"] is not None
218
+ else err["error_type"]
219
+ )
220
+ message_parts.append(
221
+ f"\n • {err['credential']}: {status} - {err['message']}"
222
+ )
223
+
224
  normal_summary = self.get_normal_error_summary()
225
  if normal_summary:
226
  if self.abnormal_errors:
227
+ message_parts.append(
228
+ f"\n\nAdditionally: {normal_summary} (expected during normal operation)"
229
+ )
230
  else:
231
  message_parts.append(f"\n\nAll failures were: {normal_summary}")
232
+ message_parts.append(
233
+ "\nThis is normal during high load - retry later or add more credentials."
234
+ )
235
+
236
  response = {
237
  "error": {
238
  "message": "".join(message_parts),
 
242
  "provider": self.provider,
243
  "credentials_tried": self.total_credentials_tried,
244
  "timeout": self.timeout_occurred,
245
+ },
246
  }
247
  }
248
+
249
  # Only include abnormal errors in details (they need attention)
250
  if self.abnormal_errors:
251
  response["error"]["details"]["abnormal_errors"] = self.abnormal_errors
252
+
253
  # Include summary of normal errors
254
  if normal_summary:
255
  response["error"]["details"]["normal_error_summary"] = normal_summary
256
+
257
  return response
258
+
259
  def build_log_message(self) -> str:
260
  """
261
  Build a concise log message for server-side logging.
262
+
263
  Shorter than client message, suitable for terminal display.
264
  """
265
  parts = []
266
+
267
  if self.timeout_occurred:
268
+ parts.append(
269
+ f"TIMEOUT: {self.total_credentials_tried} creds tried for {self.model}"
270
+ )
271
  else:
272
+ parts.append(
273
+ f"ALL CREDS EXHAUSTED: {self.total_credentials_tried} tried for {self.model}"
274
+ )
275
+
276
  if self.abnormal_errors:
277
  abnormal_summary = ", ".join(
278
  f"{e['credential']}={e['status_code'] or e['error_type']}"
279
  for e in self.abnormal_errors
280
  )
281
  parts.append(f"ISSUES: {abnormal_summary}")
282
+
283
  normal_summary = self.get_normal_error_summary()
284
  if normal_summary:
285
  parts.append(f"Normal: {normal_summary}")
286
+
287
  return " | ".join(parts)
288
 
289
 
 
316
  if isinstance(error, httpx.HTTPStatusError):
317
  headers = error.response.headers
318
  # Check standard Retry-After header (case-insensitive)
319
+ retry_header = headers.get("retry-after") or headers.get("Retry-After")
320
  if retry_header:
321
  try:
322
  return int(retry_header) # Assumes seconds format
 
324
  pass # Might be HTTP date format, skip for now
325
 
326
  # Check X-RateLimit-Reset header (Unix timestamp)
327
+ reset_header = headers.get("x-ratelimit-reset") or headers.get(
328
+ "X-RateLimit-Reset"
329
+ )
330
  if reset_header:
331
  try:
332
  import time
333
+
334
  reset_timestamp = int(reset_header)
335
  current_time = int(time.time())
336
  wait_seconds = reset_timestamp - current_time
 
380
  continue
381
 
382
  # 3. Handle duration formats like "60s", "2m", "1h"
383
+ duration_match = re.search(r"(\d+)\s*([smh])", error_str)
384
  if duration_match:
385
  try:
386
  value = int(duration_match.group(1))
387
  unit = duration_match.group(2)
388
+ if unit == "s":
389
  return value
390
+ elif unit == "m":
391
  return value * 60
392
+ elif unit == "h":
393
  return value * 3600
394
  except (ValueError, IndexError):
395
  pass
 
404
  if value.isdigit():
405
  return int(value)
406
  # Handle "60s", "2m" format in attribute
407
+ duration_match = re.search(r"(\d+)\s*([smh])", value.lower())
408
  if duration_match:
409
  val = int(duration_match.group(1))
410
  unit = duration_match.group(2)
411
+ if unit == "s":
412
  return val
413
+ elif unit == "m":
414
  return val * 60
415
+ elif unit == "h":
416
  return val * 3600
417
 
418
  return None
 
422
  """
423
  Classifies an exception into a structured ClassifiedError object.
424
  Now handles both litellm and httpx exceptions.
425
+
426
  Error types and their typical handling:
427
  - rate_limit (429): Rotate key, may retry with backoff
428
  - server_error (5xx): Retry with backoff, then rotate
 
435
  - unknown: Rotate key (safer to try another)
436
  """
437
  status_code = getattr(e, "status_code", None)
438
+
439
  if isinstance(e, httpx.HTTPStatusError): # [NEW] Handle httpx errors first
440
  status_code = e.response.status_code
441
+
442
  # Try to get error body for better classification
443
  try:
444
+ error_body = e.response.text.lower() if hasattr(e.response, "text") else ""
445
  except Exception:
446
  error_body = ""
447
+
448
  if status_code == 401:
449
  return ClassifiedError(
450
  error_type="authentication",
 
476
  retry_after=retry_after,
477
  )
478
  if status_code == 400:
479
+ # Check for context window / token limit errors with more specific patterns
480
+ if any(
481
+ pattern in error_body
482
+ for pattern in [
483
+ "context_length",
484
+ "max_tokens",
485
+ "token limit",
486
+ "context window",
487
+ "too many tokens",
488
+ "too long",
489
+ ]
490
+ ):
491
  return ClassifiedError(
492
  error_type="context_window_exceeded",
493
  original_exception=e,
 
498
  original_exception=e,
499
  status_code=status_code,
500
  )
501
+ return ClassifiedError(
502
+ error_type="invalid_request",
503
+ original_exception=e,
504
+ status_code=status_code,
505
+ )
506
  if 400 <= status_code < 500:
507
  # Other 4xx errors - generally client errors
508
  return ClassifiedError(
 
605
  def should_rotate_on_error(classified_error: ClassifiedError) -> bool:
606
  """
607
  Determines if an error should trigger key rotation.
608
+
609
  Errors that SHOULD rotate (try another key):
610
  - rate_limit: Current key is throttled
611
  - quota_exceeded: Current key/account exhausted
 
614
  - server_error: Provider having issues (might work with different endpoint/key)
615
  - api_connection: Network issues (might be transient)
616
  - unknown: Safer to try another key
617
+
618
  Errors that should NOT rotate (fail immediately):
619
  - invalid_request: Client error in request payload (won't help to retry)
620
  - context_window_exceeded: Request too large (won't help to retry)
621
  - pre_request_callback_error: Internal proxy error
622
+
623
  Returns:
624
  True if should rotate to next key, False if should fail immediately
625
  """
 
634
  def should_retry_same_key(classified_error: ClassifiedError) -> bool:
635
  """
636
  Determines if an error should retry with the same key (with backoff).
637
+
638
  Only server errors and connection issues should retry the same key,
639
  as these are often transient.
640
+
641
  Returns:
642
  True if should retry same key, False if should rotate immediately
643
  """
src/rotator_library/failure_logger.py CHANGED
@@ -4,6 +4,7 @@ from logging.handlers import RotatingFileHandler
4
  import os
5
  from datetime import datetime
6
 
 
7
  def setup_failure_logger():
8
  """Sets up a dedicated JSON logger for writing detailed failure logs to a file."""
9
  log_dir = "logs"
@@ -12,15 +13,15 @@ def setup_failure_logger():
12
 
13
  # Create a logger specifically for failures.
14
  # This logger will NOT propagate to the root logger.
15
- logger = logging.getLogger('failure_logger')
16
  logger.setLevel(logging.INFO)
17
  logger.propagate = False
18
 
19
  # Use a rotating file handler
20
  handler = RotatingFileHandler(
21
- os.path.join(log_dir, 'failures.log'),
22
- maxBytes=5*1024*1024, # 5 MB
23
- backupCount=2
24
  )
25
 
26
  # Custom JSON formatter for structured logs
@@ -30,62 +31,65 @@ def setup_failure_logger():
30
  return json.dumps(record.msg)
31
 
32
  handler.setFormatter(JsonFormatter())
33
-
34
  # Add handler only if it hasn't been added before
35
  if not logger.handlers:
36
  logger.addHandler(handler)
37
 
38
  return logger
39
 
 
40
  # Initialize the dedicated logger for detailed failure logs
41
  failure_logger = setup_failure_logger()
42
 
43
  # Get the main library logger for concise, propagated messages
44
- main_lib_logger = logging.getLogger('rotator_library')
 
45
 
46
  def _extract_response_body(error: Exception) -> str:
47
  """
48
  Extract the full response body from various error types.
49
-
50
  Handles:
51
  - httpx.HTTPStatusError: response.text or response.content
52
  - litellm exceptions: various response attributes
53
  - Other exceptions: str(error)
54
  """
55
  # Try to get response body from httpx errors
56
- if hasattr(error, 'response') and error.response is not None:
57
  response = error.response
58
  # Try .text first (decoded)
59
- if hasattr(response, 'text') and response.text:
60
  return response.text
61
  # Try .content (bytes)
62
- if hasattr(response, 'content') and response.content:
63
  try:
64
- return response.content.decode('utf-8', errors='replace')
65
  except Exception:
66
  return str(response.content)
67
- # Try reading response if it's a streaming response that was read
68
- if hasattr(response, '_content') and response._content:
69
- try:
70
- return response._content.decode('utf-8', errors='replace')
71
- except Exception:
72
- return str(response._content)
73
-
74
  # Check for litellm's body attribute
75
- if hasattr(error, 'body') and error.body:
76
  return str(error.body)
77
-
78
  # Check for message attribute that might contain response
79
- if hasattr(error, 'message') and error.message:
80
  return str(error.message)
81
-
82
  return None
83
 
84
 
85
- def log_failure(api_key: str, model: str, attempt: int, error: Exception, request_headers: dict, raw_response_text: str = None):
 
 
 
 
 
 
 
86
  """
87
  Logs a detailed failure message to a file and a concise summary to the main logger.
88
-
89
  Args:
90
  api_key: The API key or credential path that was used
91
  model: The model that was requested
@@ -103,19 +107,30 @@ def log_failure(api_key: str, model: str, attempt: int, error: Exception, reques
103
 
104
  # Get full error message (not truncated)
105
  full_error_message = str(error)
106
-
107
  # Also capture any nested/wrapped exception info
108
  error_chain = []
 
109
  current_error = error
110
  while current_error:
111
- error_chain.append({
112
- "type": type(current_error).__name__,
113
- "message": str(current_error)[:2000] # Limit per-error message size
114
- })
115
- current_error = getattr(current_error, '__cause__', None) or getattr(current_error, '__context__', None)
116
- if len(error_chain) > 5: # Prevent infinite loops
117
  break
118
-
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  detailed_log_data = {
120
  "timestamp": datetime.utcnow().isoformat(),
121
  "api_key_ending": api_key[-4:] if len(api_key) >= 4 else "****",
@@ -123,7 +138,9 @@ def log_failure(api_key: str, model: str, attempt: int, error: Exception, reques
123
  "attempt_number": attempt,
124
  "error_type": type(error).__name__,
125
  "error_message": full_error_message[:5000], # Limit total size
126
- "raw_response": raw_response[:10000] if raw_response else None, # Limit response size
 
 
127
  "request_headers": request_headers,
128
  "error_chain": error_chain if len(error_chain) > 1 else None,
129
  }
 
4
  import os
5
  from datetime import datetime
6
 
7
+
8
  def setup_failure_logger():
9
  """Sets up a dedicated JSON logger for writing detailed failure logs to a file."""
10
  log_dir = "logs"
 
13
 
14
  # Create a logger specifically for failures.
15
  # This logger will NOT propagate to the root logger.
16
+ logger = logging.getLogger("failure_logger")
17
  logger.setLevel(logging.INFO)
18
  logger.propagate = False
19
 
20
  # Use a rotating file handler
21
  handler = RotatingFileHandler(
22
+ os.path.join(log_dir, "failures.log"),
23
+ maxBytes=5 * 1024 * 1024, # 5 MB
24
+ backupCount=2,
25
  )
26
 
27
  # Custom JSON formatter for structured logs
 
31
  return json.dumps(record.msg)
32
 
33
  handler.setFormatter(JsonFormatter())
34
+
35
  # Add handler only if it hasn't been added before
36
  if not logger.handlers:
37
  logger.addHandler(handler)
38
 
39
  return logger
40
 
41
+
42
  # Initialize the dedicated logger for detailed failure logs
43
  failure_logger = setup_failure_logger()
44
 
45
  # Get the main library logger for concise, propagated messages
46
+ main_lib_logger = logging.getLogger("rotator_library")
47
+
48
 
49
  def _extract_response_body(error: Exception) -> str:
50
  """
51
  Extract the full response body from various error types.
52
+
53
  Handles:
54
  - httpx.HTTPStatusError: response.text or response.content
55
  - litellm exceptions: various response attributes
56
  - Other exceptions: str(error)
57
  """
58
  # Try to get response body from httpx errors
59
+ if hasattr(error, "response") and error.response is not None:
60
  response = error.response
61
  # Try .text first (decoded)
62
+ if hasattr(response, "text") and response.text:
63
  return response.text
64
  # Try .content (bytes)
65
+ if hasattr(response, "content") and response.content:
66
  try:
67
+ return response.content.decode("utf-8", errors="replace")
68
  except Exception:
69
  return str(response.content)
70
+
 
 
 
 
 
 
71
  # Check for litellm's body attribute
72
+ if hasattr(error, "body") and error.body:
73
  return str(error.body)
74
+
75
  # Check for message attribute that might contain response
76
+ if hasattr(error, "message") and error.message:
77
  return str(error.message)
78
+
79
  return None
80
 
81
 
82
+ def log_failure(
83
+ api_key: str,
84
+ model: str,
85
+ attempt: int,
86
+ error: Exception,
87
+ request_headers: dict,
88
+ raw_response_text: str = None,
89
+ ):
90
  """
91
  Logs a detailed failure message to a file and a concise summary to the main logger.
92
+
93
  Args:
94
  api_key: The API key or credential path that was used
95
  model: The model that was requested
 
107
 
108
  # Get full error message (not truncated)
109
  full_error_message = str(error)
110
+
111
  # Also capture any nested/wrapped exception info
112
  error_chain = []
113
+ visited = set() # Track visited exceptions to detect circular references
114
  current_error = error
115
  while current_error:
116
+ # Check for circular references
117
+ error_id = id(current_error)
118
+ if error_id in visited:
 
 
 
119
  break
120
+ visited.add(error_id)
121
+
122
+ error_chain.append(
123
+ {
124
+ "type": type(current_error).__name__,
125
+ "message": str(current_error)[:2000], # Limit per-error message size
126
+ }
127
+ )
128
+ current_error = getattr(current_error, "__cause__", None) or getattr(
129
+ current_error, "__context__", None
130
+ )
131
+ if len(error_chain) > 5: # Prevent excessive chain length
132
+ break
133
+
134
  detailed_log_data = {
135
  "timestamp": datetime.utcnow().isoformat(),
136
  "api_key_ending": api_key[-4:] if len(api_key) >= 4 else "****",
 
138
  "attempt_number": attempt,
139
  "error_type": type(error).__name__,
140
  "error_message": full_error_message[:5000], # Limit total size
141
+ "raw_response": raw_response[:10000]
142
+ if raw_response
143
+ else None, # Limit response size
144
  "request_headers": request_headers,
145
  "error_chain": error_chain if len(error_chain) > 1 else None,
146
  }