Spaces:

elmerzole
/

llm-api-proxy

Paused

Mirrowel committed on Jul 7, 2025

Commit

1337d48

1 Parent(s): 304ae92

feat(embeddings): Implement request batching for performance

This commit introduces an asynchronous batching mechanism for the `/v1/embeddings` endpoint to improve performance and efficiency.

A new `EmbeddingBatcher` class manages a queue of incoming requests. A background worker gathers these requests into batches, limited by size or a timeout, and sends them as a single API call to the provider. This significantly reduces the number of network requests under high concurrency.

Key changes:
- Add `EmbeddingBatcher` class in a new `src/proxy_app/batch_manager.py` module.
- Integrate the batcher into the FastAPI app lifecycle.
- Refactor the `/v1/embeddings` endpoint to use the new batcher.
- Update documentation to note that this feature is a work in progress.

Files changed (4) hide show

DOCUMENTATION.md +1 -1
README.md +1 -1
src/proxy_app/batch_manager.py +81 -0
src/proxy_app/main.py +24 -9

DOCUMENTATION.md CHANGED Viewed

@@ -142,7 +142,7 @@ The application uses FastAPI's `lifespan` context manager to manage the `Rotatin
 #### Endpoints
 *   `POST /v1/chat/completions`: The main endpoint for chat requests.
-*   `POST /v1/embeddings`: The endpoint for creating embeddings.
 *   `GET /v1/models`: Returns a list of all available models from configured providers.
 *   `GET /v1/providers`: Returns a list of all configured providers.
 *   `POST /v1/token-count`: Calculates the token count for a given message payload.

 #### Endpoints
 *   `POST /v1/chat/completions`: The main endpoint for chat requests.
+*   `POST /v1/embeddings`: The endpoint for creating embeddings (work in progress; not fully functional yet).
 *   `GET /v1/models`: Returns a list of all available models from configured providers.
 *   `GET /v1/providers`: Returns a list of all configured providers.
 *   `POST /v1/token-count`: Calculates the token count for a given message payload.

README.md CHANGED Viewed

@@ -208,7 +208,7 @@ curl -X POST http://127.0.0.1:8000/v1/chat/completions \
 ### Available API Endpoints
 -   `POST /v1/chat/completions`: The main endpoint for making chat requests.
--   `POST /v1/embeddings`: The endpoint for creating embeddings.
 -   `GET /v1/models`: Returns a list of all available models from your configured providers.
 -   `GET /v1/providers`: Returns a list of all configured providers.
 -   `POST /v1/token-count`: Calculates the token count for a given message payload.

 ### Available API Endpoints
 -   `POST /v1/chat/completions`: The main endpoint for making chat requests.
+-   `POST /v1/embeddings`: The endpoint for creating embeddings (work in progress; not fully functional yet).
 -   `GET /v1/models`: Returns a list of all available models from your configured providers.
 -   `GET /v1/providers`: Returns a list of all configured providers.
 -   `POST /v1/token-count`: Calculates the token count for a given message payload.

src/proxy_app/batch_manager.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import asyncio
+from typing import List, Dict, Any, Tuple
+import time
+from rotator_library import RotatingClient
class EmbeddingBatcher:
    """Coalesce concurrent single-input embedding requests into batched calls.

    Callers submit one request each through ``add_request`` and await their
    own result. A background worker collects queued requests until either
    ``batch_size`` requests are gathered or ``timeout`` seconds have passed
    since the first one arrived, then issues one ``client.aembedding`` call
    per group of compatible requests and hands every caller its own slice of
    the response.

    Must be constructed while an event loop is running, because the worker
    task is created in ``__init__``.

    Args:
        client: Object exposing ``async aembedding(**kwargs)``.
        batch_size: Maximum number of requests gathered into one batch.
        timeout: Seconds to keep gathering after the first request arrives.
    """

    # Optional request fields forwarded to the provider. They apply to a whole
    # provider call, so only requests that agree on all of them (and on the
    # model) may share a call.
    _PASSTHROUGH_KEYS = ("input_type", "dimensions", "user")

    def __init__(self, client: "RotatingClient", batch_size: int = 64, timeout: float = 0.1):
        self.client = client
        self.batch_size = batch_size
        self.timeout = timeout
        self.queue = asyncio.Queue()
        self.worker_task = asyncio.create_task(self._batch_worker())

    async def add_request(self, request_data: Dict[str, Any]) -> Any:
        """Queue one embedding request and wait for its individual result.

        Args:
            request_data: Request fields; ``"model"`` and ``"input"`` are
                required. ``"input"`` must be a single string or a
                one-element list.

        Returns:
            A dict with ``object``, ``model``, ``data`` (one item) and
            ``usage``. ``usage`` is the provider's figure for the whole
            batched call, not for this request alone.

        Raises:
            RuntimeError: If the batcher is not running, or is stopped
                before the request completes.
            Exception: Whatever validation or the provider call raised for
                this request.
        """
        # Without a live worker nothing would ever resolve the future.
        if self.worker_task.done():
            raise RuntimeError("EmbeddingBatcher is not running")
        future = asyncio.get_running_loop().create_future()
        await self.queue.put((request_data, future))
        return await future

    async def _batch_worker(self):
        """Run forever: gather a batch, dispatch it, resolve the callers."""
        while True:
            batch, futures = await self._gather_batch()
            try:
                for group in self._group_requests(batch, futures):
                    await self._process_group(group)
            finally:
                # Reached with unresolved futures only when the worker is
                # being cancelled or crashed; never leave a caller hanging.
                self._fail_pending(
                    futures,
                    RuntimeError("EmbeddingBatcher stopped before the request completed"),
                )

    def _group_requests(
        self, batch: List[Dict[str, Any]], futures: List[asyncio.Future]
    ) -> List[List[Tuple[Dict[str, Any], Any, asyncio.Future]]]:
        """Split a gathered batch into groups that can share one provider call.

        Requests whose caller already gave up are dropped; malformed requests
        fail individually instead of poisoning the whole batch.
        """
        groups: Dict[Tuple[Any, ...], List[Tuple[Dict[str, Any], Any, asyncio.Future]]] = {}
        for request, future in zip(batch, futures):
            if future.done():
                # The caller was cancelled while the request sat in the queue.
                continue
            try:
                key = self._group_key(request)
                single_input = self._single_input(request)
            except Exception as exc:  # one bad request must not kill the worker
                future.set_exception(exc)
                continue
            groups.setdefault(key, []).append((request, single_input, future))
        return list(groups.values())

    @classmethod
    def _group_key(cls, request: Dict[str, Any]) -> Tuple[Any, ...]:
        """Return a hashable key identifying requests that may be batched together."""
        passthrough = tuple(
            repr(request[key]) if key in request else None
            for key in cls._PASSTHROUGH_KEYS
        )
        return (repr(request["model"]),) + passthrough

    @staticmethod
    def _single_input(request: Dict[str, Any]) -> Any:
        """Return the one input carried by *request*.

        A plain string is used as-is (indexing it would embed only its first
        character); a one-element list is unwrapped.
        """
        value = request["input"]
        if isinstance(value, str):
            return value
        if isinstance(value, (list, tuple)) and len(value) == 1:
            return value[0]
        raise ValueError("EmbeddingBatcher expects exactly one input per request")

    async def _process_group(
        self, group: List[Tuple[Dict[str, Any], Any, asyncio.Future]]
    ) -> None:
        """Send one provider call for *group* and distribute the results."""
        first_request = group[0][0]
        batched_request = {
            "model": first_request["model"],
            "input": [single_input for _, single_input, _ in group],
        }
        # Every member of the group agrees on these fields (see _group_key).
        for key in self._PASSTHROUGH_KEYS:
            if key in first_request:
                batched_request[key] = first_request[key]

        group_futures = [future for _, _, future in group]
        try:
            response = await self.client.aembedding(**batched_request)
            # Build every per-request result before resolving any future, so
            # a short or malformed response fails the group as a whole.
            results = [
                {
                    "object": response.object,
                    "model": response.model,
                    "data": [response.data[index]],
                    "usage": response.usage,  # Usage is for the whole batch
                }
                for index, _ in enumerate(group)
            ]
        except asyncio.CancelledError:
            raise
        except Exception as exc:
            self._fail_pending(group_futures, exc)
            return

        for future, result in zip(group_futures, results):
            if not future.done():
                future.set_result(result)

    @staticmethod
    def _fail_pending(futures: List[asyncio.Future], exc: BaseException) -> None:
        """Set *exc* on every future that has not been resolved or cancelled."""
        for future in futures:
            if not future.done():
                future.set_exception(exc)

    async def _gather_batch(self) -> Tuple[List[Dict[str, Any]], List[asyncio.Future]]:
        """Wait for the next request, then gather more until size or deadline.

        Blocks without polling while the queue is idle, so the returned batch
        always contains at least one request.
        """
        request, future = await self.queue.get()
        batch = [request]
        futures = [future]
        # Monotonic clock: the deadline must not move with wall-clock changes.
        deadline = time.monotonic() + self.timeout
        try:
            while len(batch) < self.batch_size:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    break
                try:
                    request, future = await asyncio.wait_for(
                        self.queue.get(), timeout=remaining
                    )
                except asyncio.TimeoutError:
                    break
                batch.append(request)
                futures.append(future)
        except asyncio.CancelledError:
            # Cancelled mid-gather: these requests have left the queue, so
            # nobody else can resolve them.
            self._fail_pending(
                futures,
                RuntimeError("EmbeddingBatcher stopped before the request completed"),
            )
            raise
        return batch, futures

    async def stop(self):
        """Cancel the worker and fail every request still waiting in the queue."""
        self.worker_task.cancel()
        try:
            await self.worker_task
        except asyncio.CancelledError:
            pass
        finally:
            stopped = RuntimeError("EmbeddingBatcher has been stopped")
            while True:
                try:
                    _, future = self.queue.get_nowait()
                except asyncio.QueueEmpty:
                    break
                if not future.done():
                    future.set_exception(stopped)

src/proxy_app/main.py CHANGED Viewed

@@ -36,6 +36,7 @@ sys.path.append(str(Path(__file__).resolve().parent.parent))
 from rotator_library import RotatingClient, PROVIDER_PLUGINS
 from proxy_app.request_logger import log_request_response
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -67,11 +68,15 @@ if not api_keys:
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Manage the RotatingClient's lifecycle with the app's lifespan."""
-    app.state.rotating_client = RotatingClient(api_keys=api_keys)
-    print("RotatingClient initialized.")
     yield
-    await app.state.rotating_client.close()
-    print("RotatingClient closed.")
 # --- FastAPI App Setup ---
 app = FastAPI(lifespan=lifespan)
@@ -81,6 +86,10 @@ def get_rotating_client(request: Request) -> RotatingClient:
     """Dependency to get the rotating client instance from the app state."""
     return request.app.state.rotating_client
 async def verify_api_key(auth: str = Depends(api_key_header)):
     """Dependency to verify the proxy API key."""
     if not auth or auth != f"Bearer {PROXY_API_KEY}":
@@ -267,21 +276,27 @@ async def chat_completions(
 async def embeddings(
     request: Request,
     body: EmbeddingRequest,
-    client: RotatingClient = Depends(get_rotating_client),
     _ = Depends(verify_api_key)
 ):
     """
     OpenAI-compatible endpoint for creating embeddings.
     """
     try:
         request_data = body.model_dump(exclude_none=True)
-        # Ensure input is always a list for the client
-        if isinstance(request_data.get("input"), str):
-            request_data["input"] = [request_data["input"]]
-        response = await client.aembedding(**request_data)
         if ENABLE_REQUEST_LOGGING:
             response_summary = {
                 "model": response.model,

 from rotator_library import RotatingClient, PROVIDER_PLUGINS
 from proxy_app.request_logger import log_request_response
+from proxy_app.batch_manager import EmbeddingBatcher
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Manage the RotatingClient's lifecycle with the app's lifespan."""
+    client = RotatingClient(api_keys=api_keys)
+    batcher = EmbeddingBatcher(client=client)
+    app.state.rotating_client = client
+    app.state.embedding_batcher = batcher
+    print("RotatingClient and EmbeddingBatcher initialized.")
     yield
+    await batcher.stop()
+    await client.close()
+    print("RotatingClient and EmbeddingBatcher closed.")
 # --- FastAPI App Setup ---
 app = FastAPI(lifespan=lifespan)
     """Dependency to get the rotating client instance from the app state."""
     return request.app.state.rotating_client
def get_embedding_batcher(request: Request) -> EmbeddingBatcher:
    """Dependency that returns the embedding batcher stored on the app state."""
    app_state = request.app.state
    return app_state.embedding_batcher
 async def verify_api_key(auth: str = Depends(api_key_header)):
     """Dependency to verify the proxy API key."""
     if not auth or auth != f"Bearer {PROXY_API_KEY}":
 async def embeddings(
     request: Request,
     body: EmbeddingRequest,
+    batcher: EmbeddingBatcher = Depends(get_embedding_batcher),
     _ = Depends(verify_api_key)
 ):
     """
     OpenAI-compatible endpoint for creating embeddings.
+    This endpoint uses a batching manager to group requests for efficiency.
     """
     try:
         request_data = body.model_dump(exclude_none=True)
+        # The batcher expects a single string input per request
+        if isinstance(request_data.get("input"), list):
+            if len(request_data["input"]) > 1:
+                raise HTTPException(status_code=400, detail="Batching multiple inputs in a single request is not supported when using the server-side batcher. Please send one input string per request.")
+            request_data["input"] = request_data["input"][0]
+        response_data = await batcher.add_request(request_data)
+        # The batcher returns a dict, not a Pydantic model, so we construct it
+        response = litellm.EmbeddingResponse(**response_data)
         if ENABLE_REQUEST_LOGGING:
             response_summary = {
                 "model": response.model,