Spaces:
Paused
Paused
Mirrowel committed on
Commit ·
3f958d9
1
Parent(s): b5b51f2
Refactor: Streaming response handling and key management.
Browse files
Adjust `chat_completions` in `proxy_app` to correctly differentiate and handle streaming vs. non-streaming responses from the client. This ensures the generator is passed directly for streaming and the awaited result for non-streaming.
In `rotator_library`, remove the immediate `release_key` call after a successful `aembedding` operation, indicating a change in how API keys are managed post-use. Add explicit handling for `asyncio.CancelledError` during retries.
- src/proxy_app/main.py +3 -5
- src/rotator_library/client.py +15 -25
src/proxy_app/main.py
CHANGED
|
@@ -243,16 +243,14 @@ async def chat_completions(
|
|
| 243 |
request_data = await request.json()
|
| 244 |
is_streaming = request_data.get("stream", False)
|
| 245 |
|
| 246 |
-
response = await client.acompletion(request=request, **request_data)
|
| 247 |
-
|
| 248 |
if is_streaming:
|
| 249 |
-
|
| 250 |
return StreamingResponse(
|
| 251 |
-
streaming_response_wrapper(request, request_data,
|
| 252 |
media_type="text/event-stream"
|
| 253 |
)
|
| 254 |
else:
|
| 255 |
-
|
| 256 |
if ENABLE_REQUEST_LOGGING:
|
| 257 |
log_request_response(
|
| 258 |
request_data=request_data,
|
|
|
|
| 243 |
request_data = await request.json()
|
| 244 |
is_streaming = request_data.get("stream", False)
|
| 245 |
|
|
|
|
|
|
|
| 246 |
if is_streaming:
|
| 247 |
+
response_generator = client.acompletion(request=request, **request_data)
|
| 248 |
return StreamingResponse(
|
| 249 |
+
streaming_response_wrapper(request, request_data, response_generator),
|
| 250 |
media_type="text/event-stream"
|
| 251 |
)
|
| 252 |
else:
|
| 253 |
+
response = await client.acompletion(request=request, **request_data)
|
| 254 |
if ENABLE_REQUEST_LOGGING:
|
| 255 |
log_request_response(
|
| 256 |
request_data=request_data,
|
src/rotator_library/client.py
CHANGED
|
@@ -197,9 +197,6 @@ class RotatingClient:
|
|
| 197 |
raise Exception("Failed to complete the request: No available API keys or all keys failed.")
|
| 198 |
|
| 199 |
async def aembedding(self, request: Optional[Any] = None, **kwargs) -> Any:
|
| 200 |
-
"""
|
| 201 |
-
Performs an embedding call with smart key rotation and retry logic.
|
| 202 |
-
"""
|
| 203 |
kwargs = self._convert_model_params(**kwargs)
|
| 204 |
model = kwargs.get("model")
|
| 205 |
if not model:
|
|
@@ -221,10 +218,7 @@ class RotatingClient:
|
|
| 221 |
if not keys_to_try:
|
| 222 |
break
|
| 223 |
|
| 224 |
-
current_key = await self.usage_manager.acquire_key(
|
| 225 |
-
available_keys=keys_to_try,
|
| 226 |
-
model=model
|
| 227 |
-
)
|
| 228 |
key_acquired = True
|
| 229 |
tried_keys.add(current_key)
|
| 230 |
|
|
@@ -234,45 +228,41 @@ class RotatingClient:
|
|
| 234 |
for attempt in range(self.max_retries):
|
| 235 |
try:
|
| 236 |
lib_logger.info(f"Attempting embedding call with key ...{current_key[-4:]} (Attempt {attempt + 1}/{self.max_retries})")
|
| 237 |
-
|
| 238 |
response = await litellm.aembedding(api_key=current_key, **litellm_kwargs)
|
| 239 |
|
| 240 |
await self.usage_manager.record_success(current_key, model, response)
|
| 241 |
-
await self.usage_manager.release_key(current_key, model)
|
| 242 |
-
key_acquired = False
|
| 243 |
return response
|
| 244 |
|
| 245 |
except Exception as e:
|
| 246 |
last_exception = e
|
|
|
|
|
|
|
| 247 |
log_failure(api_key=current_key, model=model, attempt=attempt + 1, error=e, request_data=kwargs)
|
| 248 |
-
|
| 249 |
classified_error = classify_error(e)
|
| 250 |
|
| 251 |
if classified_error.error_type in ['invalid_request', 'context_window_exceeded']:
|
| 252 |
-
lib_logger.error(f"Unrecoverable error '{classified_error.error_type}' with key ...{current_key[-4:]}. Failing request.")
|
| 253 |
raise last_exception
|
| 254 |
-
|
| 255 |
if request and await request.is_disconnected():
|
| 256 |
lib_logger.warning(f"Client disconnected during embedding. Aborting retries for key ...{current_key[-4:]}.")
|
| 257 |
raise last_exception
|
| 258 |
|
| 259 |
if classified_error.error_type in ['server_error', 'api_connection']:
|
| 260 |
await self.usage_manager.record_failure(current_key, model, classified_error)
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
lib_logger.warning(f"Key ...{current_key[-4:]} failed on final retry for {classified_error.error_type}. Trying next key.")
|
| 264 |
-
break
|
| 265 |
-
|
| 266 |
-
base_wait = 5 if classified_error.error_type == 'api_connection' else 1
|
| 267 |
-
wait_time = classified_error.retry_after or (base_wait * (2 ** attempt)) + random.uniform(0, 1)
|
| 268 |
-
|
| 269 |
-
lib_logger.warning(f"Key ...{current_key[-4:]} encountered a {classified_error.error_type}. Retrying in {wait_time:.2f} seconds...")
|
| 270 |
await asyncio.sleep(wait_time)
|
| 271 |
continue
|
| 272 |
-
|
| 273 |
await self.usage_manager.record_failure(current_key, model, classified_error)
|
| 274 |
-
lib_logger.warning(f"Key ...{current_key[-4:]} encountered '{classified_error.error_type}'. Trying next key.")
|
| 275 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
finally:
|
| 277 |
if key_acquired and current_key:
|
| 278 |
await self.usage_manager.release_key(current_key, model)
|
|
@@ -280,7 +270,7 @@ class RotatingClient:
|
|
| 280 |
if last_exception:
|
| 281 |
raise last_exception
|
| 282 |
|
| 283 |
-
raise Exception("Failed to complete the request: No available API keys or all keys failed.")
|
| 284 |
|
| 285 |
def token_count(self, **kwargs) -> int:
|
| 286 |
"""Calculates the number of tokens for a given text or list of messages."""
|
|
|
|
| 197 |
raise Exception("Failed to complete the request: No available API keys or all keys failed.")
|
| 198 |
|
| 199 |
async def aembedding(self, request: Optional[Any] = None, **kwargs) -> Any:
|
|
|
|
|
|
|
|
|
|
| 200 |
kwargs = self._convert_model_params(**kwargs)
|
| 201 |
model = kwargs.get("model")
|
| 202 |
if not model:
|
|
|
|
| 218 |
if not keys_to_try:
|
| 219 |
break
|
| 220 |
|
| 221 |
+
current_key = await self.usage_manager.acquire_key(available_keys=keys_to_try, model=model)
|
|
|
|
|
|
|
|
|
|
| 222 |
key_acquired = True
|
| 223 |
tried_keys.add(current_key)
|
| 224 |
|
|
|
|
| 228 |
for attempt in range(self.max_retries):
|
| 229 |
try:
|
| 230 |
lib_logger.info(f"Attempting embedding call with key ...{current_key[-4:]} (Attempt {attempt + 1}/{self.max_retries})")
|
|
|
|
| 231 |
response = await litellm.aembedding(api_key=current_key, **litellm_kwargs)
|
| 232 |
|
| 233 |
await self.usage_manager.record_success(current_key, model, response)
|
|
|
|
|
|
|
| 234 |
return response
|
| 235 |
|
| 236 |
except Exception as e:
|
| 237 |
last_exception = e
|
| 238 |
+
if isinstance(e, asyncio.CancelledError): raise e
|
| 239 |
+
|
| 240 |
log_failure(api_key=current_key, model=model, attempt=attempt + 1, error=e, request_data=kwargs)
|
|
|
|
| 241 |
classified_error = classify_error(e)
|
| 242 |
|
| 243 |
if classified_error.error_type in ['invalid_request', 'context_window_exceeded']:
|
|
|
|
| 244 |
raise last_exception
|
| 245 |
+
|
| 246 |
if request and await request.is_disconnected():
|
| 247 |
lib_logger.warning(f"Client disconnected during embedding. Aborting retries for key ...{current_key[-4:]}.")
|
| 248 |
raise last_exception
|
| 249 |
|
| 250 |
if classified_error.error_type in ['server_error', 'api_connection']:
|
| 251 |
await self.usage_manager.record_failure(current_key, model, classified_error)
|
| 252 |
+
if attempt >= self.max_retries - 1: break
|
| 253 |
+
wait_time = classified_error.retry_after or (1 * (2 ** attempt)) + random.uniform(0, 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
await asyncio.sleep(wait_time)
|
| 255 |
continue
|
| 256 |
+
|
| 257 |
await self.usage_manager.record_failure(current_key, model, classified_error)
|
|
|
|
| 258 |
break
|
| 259 |
+
|
| 260 |
+
except (litellm.InvalidRequestError, litellm.ContextWindowExceededError, asyncio.CancelledError) as e:
|
| 261 |
+
raise e
|
| 262 |
+
except Exception as e:
|
| 263 |
+
last_exception = e
|
| 264 |
+
lib_logger.error(f"An unexpected error occurred with key ...{current_key[-4:] if current_key else 'N/A'}: {e}")
|
| 265 |
+
continue
|
| 266 |
finally:
|
| 267 |
if key_acquired and current_key:
|
| 268 |
await self.usage_manager.release_key(current_key, model)
|
|
|
|
| 270 |
if last_exception:
|
| 271 |
raise last_exception
|
| 272 |
|
| 273 |
+
raise Exception("Failed to complete the request: No available API keys or all keys failed.")
|
| 274 |
|
| 275 |
def token_count(self, **kwargs) -> int:
|
| 276 |
"""Calculates the number of tokens for a given text or list of messages."""
|