Spaces:

elmerzole
/

llm-api-proxy

Paused

Mirrowel committed on Jul 11, 2025

Commit

856f4f3

1 Parent(s): aaab6f8

feat(client): enhance streaming error handling and log clarity

- Introduce a mechanism to track consecutive quota failures within the streaming client. This allows for distinguishing between transient quota hits and persistent, input-related quota exhaustion.
- Terminate the stream with a `proxy_fatal_quota_error` message to the client only after 3 consecutive quota failures, signaling a persistent issue (e.g., input data too large).
- Refactor key rotation behavior to be largely silent for transient or recoverable errors (including single quota errors or server errors). Intermediate error messages are no longer sent to the client on each key switch, improving client experience by reducing noise.
- Integrate `colorlog` into the proxy application's console handler for improved readability and visual distinction of log levels.
- Update `rotating-api-key-client` version to `0.8`, reflecting these significant updates.

BREAKING CHANGE: The streaming client's error handling and client-facing error messages have changed. Intermediate `proxy_key_rotation_error` messages previously yielded during key rotation for transient failures are no longer sent. The `proxy_quota_error` type has been replaced by `proxy_fatal_quota_error`, which is now only emitted after 3 consecutive quota failures instead of immediately. Clients consuming the streaming API should adapt to these changes in error message types and timing.

Files changed (4) hide show

requirements.txt +2 -0
src/proxy_app/main.py +14 -3
src/rotator_library/client.py +45 -56
src/rotator_library/pyproject.toml +1 -1

requirements.txt CHANGED Viewed

@@ -14,3 +14,5 @@ litellm
 filelock
 httpx
 aiofiles

 filelock
 httpx
 aiofiles
+colorlog

src/proxy_app/main.py CHANGED Viewed

@@ -8,6 +8,7 @@ from fastapi.responses import StreamingResponse
 from fastapi.security import APIKeyHeader
 from dotenv import load_dotenv
 import logging
 from pathlib import Path
 import sys
 import json
@@ -60,10 +61,20 @@ class RotatorDebugFilter(logging.Filter):
         return record.levelno == logging.DEBUG and record.name.startswith('rotator_library')
 debug_file_handler.addFilter(RotatorDebugFilter())
-# Configure a console handler for concise, high-level info
-console_handler = logging.StreamHandler(sys.stdout)
 console_handler.setLevel(logging.INFO)
-console_handler.setFormatter(logging.Formatter('%(message)s'))
 # Add a filter to prevent any LiteLLM logs from cluttering the console
 class NoLiteLLMLogFilter(logging.Filter):

 from fastapi.security import APIKeyHeader
 from dotenv import load_dotenv
 import logging
+import colorlog
 from pathlib import Path
 import sys
 import json
         return record.levelno == logging.DEBUG and record.name.startswith('rotator_library')
 debug_file_handler.addFilter(RotatorDebugFilter())
+# Configure a console handler with color
+console_handler = colorlog.StreamHandler(sys.stdout)
 console_handler.setLevel(logging.INFO)
+formatter = colorlog.ColoredFormatter(
+    '%(log_color)s%(message)s',
+    log_colors={
+        'DEBUG':    'cyan',
+        'INFO':     'green',
+        'WARNING':  'yellow',
+        'ERROR':    'red',
+        'CRITICAL': 'red,bg_white',
+    }
+)
+console_handler.setFormatter(formatter)
 # Add a filter to prevent any LiteLLM logs from cluttering the console
 class NoLiteLLMLogFilter(logging.Filter):

src/rotator_library/client.py CHANGED Viewed

@@ -454,6 +454,9 @@ class RotatingClient:
         tried_keys = set()
         last_exception = None
         kwargs = self._convert_model_params(**kwargs)
         try:
             while len(tried_keys) < len(keys_for_provider) and time.time() < deadline:
                 current_key = None
@@ -545,7 +548,9 @@ class RotatingClient:
                             error_message_text = error_details.get("message", str(original_exc))
                             if "quota" in error_message_text.lower() or "resource_exhausted" in error_status.lower():
-                                # This is a fatal quota error. Terminate the stream with a clear message.
                                 quota_value = "N/A"
                                 quota_id = "N/A"
                                 if "details" in error_details and isinstance(error_details.get("details"), list):
@@ -559,58 +564,52 @@ class RotatingClient:
                                                 if quota_value != "N/A" and quota_id != "N/A":
                                                     break
-                                # 1. Detailed message for the end client
-                                client_error_message = (
-                                    f"FATAL: You have exceeded your API quota. "
-                                    f"Message: '{error_message_text}'. "
-                                    f"Limit: {quota_value} (Quota ID: {quota_id})."
-                                )
-                                # 2. Concise message for the console log
-                                console_log_message = (
-                                    f"Terminating stream for key ...{current_key[-4:]} due to fatal quota error. "
-                                    f"ID: {quota_id}, Limit: {quota_value}."
-                                )
-                                lib_logger.warning(console_log_message)
-                                # 3. Yield the detailed message to the client and terminate
-                                yield f"data: {json.dumps({'error': {'message': client_error_message, 'type': 'proxy_quota_error'}})}\n\n"
-                                yield "data: [DONE]\n\n"
-                                return # Exit the generator completely.
-                            # --- NON-QUOTA ERROR: Fallback to key rotation ---
-                            rotation_error_message = f"Provider API key failed with {classified_error.error_type}. Rotating to a new key."
-                            yield f"data: {json.dumps({'error': {'message': rotation_error_message, 'type': 'proxy_key_rotation_error', 'code': classified_error.status_code}})}\n\n"
-                            lib_logger.warning(f"Key ...{current_key[-4:]} encountered a recoverable error during stream for model {model}. Rotating key.")
-                            # Only apply global cooldown for non-quota 429s.
-                            if classified_error.error_type == 'rate_limit' and classified_error.status_code == 429:
-                                cooldown_duration = classified_error.retry_after or 60
-                                await self.cooldown_manager.start_cooldown(provider, cooldown_duration)
-                                lib_logger.warning(f"IP-based rate limit detected for {provider}. Starting a {cooldown_duration}-second global cooldown.")
-                            await self.usage_manager.record_failure(current_key, model, classified_error)
-                            break # Break to try the next key
                         except (APIConnectionError, litellm.InternalServerError, litellm.ServiceUnavailableError) as e:
                             last_exception = e
                             log_failure(api_key=current_key, model=model, attempt=attempt + 1, error=e, request_headers=dict(request.headers) if request else {})
                             classified_error = classify_error(e)
                             await self.usage_manager.record_failure(current_key, model, classified_error)
                             if attempt >= self.max_retries - 1:
-                                lib_logger.warning(f"Key ...{current_key[-4:]} failed after max retries for model {model} due to a server error. Rotating key.")
-                                # Inform the client about the temporary failure before rotating.
-                                error_message = f"Key ...{current_key[-4:]} failed after multiple retries. Rotating to a new key."
-                                error_data = {
-                                    "error": {
-                                        "message": error_message,
-                                        "type": "proxy_key_rotation_error",
-                                        "code": classified_error.status_code
-                                    }
-                                }
-                                yield f"data: {json.dumps(error_data)}\n\n"
                                 break
                             wait_time = classified_error.retry_after or (1 * (2 ** attempt)) + random.uniform(0, 1)
@@ -625,22 +624,11 @@ class RotatingClient:
                             continue
                         except Exception as e:
                             last_exception = e
                             log_failure(api_key=current_key, model=model, attempt=attempt + 1, error=e, request_headers=dict(request.headers) if request else {})
                             classified_error = classify_error(e)
-                            # For most exceptions, we notify the client and rotate the key.
-                            if classified_error.error_type not in ['invalid_request', 'context_window_exceeded', 'authentication']:
-                                error_message = f"An unexpected error occurred with key ...{current_key[-4:]}. Rotating to a new key."
-                                error_data = {
-                                    "error": {
-                                        "message": error_message,
-                                        "type": "proxy_key_rotation_error",
-                                        "code": classified_error.status_code
-                                    }
-                                }
-                                yield f"data: {json.dumps(error_data)}\n\n"
                             lib_logger.warning(f"Key ...{current_key[-4:]} failed with {classified_error.error_type} (Status: {classified_error.status_code}). Error: {str(e)}. Rotating key.")
                             if classified_error.status_code == 429:
@@ -651,6 +639,7 @@ class RotatingClient:
                             if classified_error.error_type in ['invalid_request', 'context_window_exceeded', 'authentication']:
                                 raise last_exception
                             await self.usage_manager.record_failure(current_key, model, classified_error)
                             break
@@ -761,4 +750,4 @@ class RotatingClient:
             for provider, models in all_provider_models.items():
                 for model in models:
                     flat_models.append(f"{provider}/{model}")
-            return flat_models

         tried_keys = set()
         last_exception = None
         kwargs = self._convert_model_params(**kwargs)
+        consecutive_quota_failures = 0
         try:
             while len(tried_keys) < len(keys_for_provider) and time.time() < deadline:
                 current_key = None
                             error_message_text = error_details.get("message", str(original_exc))
                             if "quota" in error_message_text.lower() or "resource_exhausted" in error_status.lower():
+                                consecutive_quota_failures += 1
+                                lib_logger.warning(f"Key ...{current_key[-4:]} hit a quota limit. This is consecutive failure #{consecutive_quota_failures} for this request.")
                                 quota_value = "N/A"
                                 quota_id = "N/A"
                                 if "details" in error_details and isinstance(error_details.get("details"), list):
                                                 if quota_value != "N/A" and quota_id != "N/A":
                                                     break
+                                await self.usage_manager.record_failure(current_key, model, classified_error)
+                                if consecutive_quota_failures >= 3:
+                                    console_log_message = (
+                                        f"Terminating stream for key ...{current_key[-4:]} due to 3rd consecutive quota error. "
+                                        f"This is now considered a fatal input data error. ID: {quota_id}, Limit: {quota_value}."
+                                    )
+                                    client_error_message = (
+                                        "FATAL: Request failed after 3 consecutive quota errors, "
+                                        "indicating the input data is too large for the model's per-request limit. "
+                                        f"Last Error Message: '{error_message_text}'. Limit: {quota_value} (Quota ID: {quota_id})."
+                                    )
+                                    lib_logger.error(console_log_message)
+                                    yield f"data: {json.dumps({'error': {'message': client_error_message, 'type': 'proxy_fatal_quota_error'}})}\n\n"
+                                    yield "data: [DONE]\n\n"
+                                    return
+                                else:
+                                    # [MODIFIED] Do not yield to the client. Just log and break to rotate the key.
+                                    lib_logger.warning(f"Quota error on key ...{current_key[-4:]} (failure {consecutive_quota_failures}/3). Rotating key silently.")
+                                    break
+                            else:
+                                consecutive_quota_failures = 0
+                                # [MODIFIED] Do not yield to the client. Just log and break to rotate the key.
+                                lib_logger.warning(f"Key ...{current_key[-4:]} encountered a recoverable error ({classified_error.error_type}) during stream. Rotating key silently.")
+                                if classified_error.error_type == 'rate_limit' and classified_error.status_code == 429:
+                                    cooldown_duration = classified_error.retry_after or 60
+                                    await self.cooldown_manager.start_cooldown(provider, cooldown_duration)
+                                    lib_logger.warning(f"IP-based rate limit detected for {provider}. Starting a {cooldown_duration}-second global cooldown.")
+                                await self.usage_manager.record_failure(current_key, model, classified_error)
+                                break
                         except (APIConnectionError, litellm.InternalServerError, litellm.ServiceUnavailableError) as e:
+                            consecutive_quota_failures = 0
                             last_exception = e
                             log_failure(api_key=current_key, model=model, attempt=attempt + 1, error=e, request_headers=dict(request.headers) if request else {})
                             classified_error = classify_error(e)
                             await self.usage_manager.record_failure(current_key, model, classified_error)
                             if attempt >= self.max_retries - 1:
+                                lib_logger.warning(f"Key ...{current_key[-4:]} failed after max retries for model {model} due to a server error. Rotating key silently.")
+                                # [MODIFIED] Do not yield to the client here.
                                 break
                             wait_time = classified_error.retry_after or (1 * (2 ** attempt)) + random.uniform(0, 1)
                             continue
                         except Exception as e:
+                            consecutive_quota_failures = 0
                             last_exception = e
                             log_failure(api_key=current_key, model=model, attempt=attempt + 1, error=e, request_headers=dict(request.headers) if request else {})
                             classified_error = classify_error(e)
                             lib_logger.warning(f"Key ...{current_key[-4:]} failed with {classified_error.error_type} (Status: {classified_error.status_code}). Error: {str(e)}. Rotating key.")
                             if classified_error.status_code == 429:
                             if classified_error.error_type in ['invalid_request', 'context_window_exceeded', 'authentication']:
                                 raise last_exception
+                            # [MODIFIED] Do not yield to the client here.
                             await self.usage_manager.record_failure(current_key, model, classified_error)
                             break
             for provider, models in all_provider_models.items():
                 for model in models:
                     flat_models.append(f"{provider}/{model}")
+            return flat_models

src/rotator_library/pyproject.toml CHANGED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "rotating-api-key-client"
-version = "0.6.7"
 authors = [
     { name="Mirrowel", email="nuh@uh.com" },
 ]

 [project]
 name = "rotating-api-key-client"
+version = "0.8"
 authors = [
     { name="Mirrowel", email="nuh@uh.com" },
 ]