Spaces:
Paused
Paused
Mirrowel committed on
Commit ·
79e83ae
1
Parent(s): 3962356
feat(core): enable configuration of maximum concurrent requests per key
Browse files
This introduces functionality to allow multiple concurrent requests to utilize the same API key, which is necessary when a provider's capacity allows for parallel usage (e.g., modern OpenAI tiers).
The `UsageManager` is updated to track concurrent request counts per model per key, moving from a simple busy/idle state to a counter.
- New environment variables (`MAX_CONCURRENT_REQUESTS_PER_KEY_<PROVIDER>`) define the maximum concurrency limit for keys of a specific provider.
- The default limit is 1, maintaining the previous behavior (no concurrency).
- Updates provider endpoint resolution to support loading custom API bases via environment variables (e.g., `CUSTOM_API_BASE`) if the provider is not hardcoded.
- .env.example +15 -0
- src/proxy_app/main.py +17 -1
- src/proxy_app/provider_urls.py +9 -1
- src/rotator_library/client.py +15 -2
- src/rotator_library/usage_manager.py +19 -10
.env.example
CHANGED
|
@@ -139,6 +139,21 @@ IGNORE_MODELS_OPENAI=""
|
|
| 139 |
WHITELIST_MODELS_GEMINI=""
|
| 140 |
WHITELIST_MODELS_OPENAI=""
|
| 141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
# ------------------------------------------------------------------------------
|
| 144 |
# | [ADVANCED] Proxy Configuration |
|
|
|
|
| 139 |
WHITELIST_MODELS_GEMINI=""
|
| 140 |
WHITELIST_MODELS_OPENAI=""
|
| 141 |
|
| 142 |
+
# --- Maximum Concurrent Requests Per Key ---
|
| 143 |
+
# Controls how many concurrent requests for the SAME model can use the SAME key.
|
| 144 |
+
# This is useful for providers that can handle concurrent requests without rate limiting.
|
| 145 |
+
# Default is 1 (no concurrency, current behavior).
|
| 146 |
+
#
|
| 147 |
+
# Format: MAX_CONCURRENT_REQUESTS_PER_KEY_<PROVIDER_NAME>=<number>
|
| 148 |
+
#
|
| 149 |
+
# Example:
|
| 150 |
+
# MAX_CONCURRENT_REQUESTS_PER_KEY_OPENAI=3 # Allow 3 concurrent requests per OpenAI key
|
| 151 |
+
# MAX_CONCURRENT_REQUESTS_PER_KEY_GEMINI=1 # Allow only 1 request per Gemini key (default)
|
| 152 |
+
#
|
| 153 |
+
MAX_CONCURRENT_REQUESTS_PER_KEY_OPENAI=1
|
| 154 |
+
MAX_CONCURRENT_REQUESTS_PER_KEY_GEMINI=1
|
| 155 |
+
MAX_CONCURRENT_REQUESTS_PER_KEY_ANTHROPIC=1
|
| 156 |
+
|
| 157 |
|
| 158 |
# ------------------------------------------------------------------------------
|
| 159 |
# | [ADVANCED] Proxy Configuration |
|
src/proxy_app/main.py
CHANGED
|
@@ -163,6 +163,21 @@ for key, value in os.environ.items():
|
|
| 163 |
whitelist_models[provider] = models_to_whitelist
|
| 164 |
logging.debug(f"Loaded whitelist for provider '{provider}': {models_to_whitelist}")
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
# --- Lifespan Management ---
|
| 167 |
@asynccontextmanager
|
| 168 |
async def lifespan(app: FastAPI):
|
|
@@ -282,7 +297,8 @@ async def lifespan(app: FastAPI):
|
|
| 282 |
litellm_provider_params=litellm_provider_params,
|
| 283 |
ignore_models=ignore_models,
|
| 284 |
whitelist_models=whitelist_models,
|
| 285 |
-
enable_request_logging=ENABLE_REQUEST_LOGGING
|
|
|
|
| 286 |
)
|
| 287 |
client.background_refresher.start() # Start the background task
|
| 288 |
app.state.rotating_client = client
|
|
|
|
| 163 |
whitelist_models[provider] = models_to_whitelist
|
| 164 |
logging.debug(f"Loaded whitelist for provider '{provider}': {models_to_whitelist}")
|
| 165 |
|
| 166 |
+
# Load max concurrent requests per key from environment variables
|
| 167 |
+
max_concurrent_requests_per_key = {}
|
| 168 |
+
for key, value in os.environ.items():
|
| 169 |
+
if key.startswith("MAX_CONCURRENT_REQUESTS_PER_KEY_"):
|
| 170 |
+
provider = key.replace("MAX_CONCURRENT_REQUESTS_PER_KEY_", "").lower()
|
| 171 |
+
try:
|
| 172 |
+
max_concurrent = int(value)
|
| 173 |
+
if max_concurrent < 1:
|
| 174 |
+
logging.warning(f"Invalid max_concurrent value for provider '{provider}': {value}. Must be >= 1. Using default (1).")
|
| 175 |
+
max_concurrent = 1
|
| 176 |
+
max_concurrent_requests_per_key[provider] = max_concurrent
|
| 177 |
+
logging.debug(f"Loaded max concurrent requests for provider '{provider}': {max_concurrent}")
|
| 178 |
+
except ValueError:
|
| 179 |
+
logging.warning(f"Invalid max_concurrent value for provider '{provider}': {value}. Using default (1).")
|
| 180 |
+
|
| 181 |
# --- Lifespan Management ---
|
| 182 |
@asynccontextmanager
|
| 183 |
async def lifespan(app: FastAPI):
|
|
|
|
| 297 |
litellm_provider_params=litellm_provider_params,
|
| 298 |
ignore_models=ignore_models,
|
| 299 |
whitelist_models=whitelist_models,
|
| 300 |
+
enable_request_logging=ENABLE_REQUEST_LOGGING,
|
| 301 |
+
max_concurrent_requests_per_key=max_concurrent_requests_per_key
|
| 302 |
)
|
| 303 |
client.background_refresher.start() # Start the background task
|
| 304 |
app.state.rotating_client = client
|
src/proxy_app/provider_urls.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
from typing import Optional
|
| 2 |
|
| 3 |
# A comprehensive map of provider names to their base URLs.
|
|
@@ -31,10 +32,17 @@ PROVIDER_URL_MAP = {
|
|
| 31 |
def get_provider_endpoint(provider: str, model_name: str, incoming_path: str) -> Optional[str]:
|
| 32 |
"""
|
| 33 |
Constructs the full provider endpoint URL based on the provider and incoming request path.
|
|
|
|
| 34 |
"""
|
|
|
|
| 35 |
base_url = PROVIDER_URL_MAP.get(provider)
|
|
|
|
|
|
|
| 36 |
if not base_url:
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
# Determine the specific action from the incoming path (e.g., 'chat/completions')
|
| 40 |
action = incoming_path.split('/v1/', 1)[-1] if '/v1/' in incoming_path else incoming_path
|
|
|
|
| 1 |
+
import os
|
| 2 |
from typing import Optional
|
| 3 |
|
| 4 |
# A comprehensive map of provider names to their base URLs.
|
|
|
|
| 32 |
def get_provider_endpoint(provider: str, model_name: str, incoming_path: str) -> Optional[str]:
|
| 33 |
"""
|
| 34 |
Constructs the full provider endpoint URL based on the provider and incoming request path.
|
| 35 |
+
Supports both hardcoded providers and custom OpenAI-compatible providers via environment variables.
|
| 36 |
"""
|
| 37 |
+
# First, check the hardcoded map
|
| 38 |
base_url = PROVIDER_URL_MAP.get(provider)
|
| 39 |
+
|
| 40 |
+
# If not found, check for custom provider via environment variable
|
| 41 |
if not base_url:
|
| 42 |
+
api_base_env = f"{provider.upper()}_API_BASE"
|
| 43 |
+
base_url = os.getenv(api_base_env)
|
| 44 |
+
if not base_url:
|
| 45 |
+
return None
|
| 46 |
|
| 47 |
# Determine the specific action from the incoming path (e.g., 'chat/completions')
|
| 48 |
action = incoming_path.split('/v1/', 1)[-1] if '/v1/' in incoming_path else incoming_path
|
src/rotator_library/client.py
CHANGED
|
@@ -61,6 +61,7 @@ class RotatingClient:
|
|
| 61 |
ignore_models: Optional[Dict[str, List[str]]] = None,
|
| 62 |
whitelist_models: Optional[Dict[str, List[str]]] = None,
|
| 63 |
enable_request_logging: bool = False,
|
|
|
|
| 64 |
):
|
| 65 |
os.environ["LITELLM_LOG"] = "ERROR"
|
| 66 |
litellm.set_verbose = False
|
|
@@ -118,6 +119,14 @@ class RotatingClient:
|
|
| 118 |
self.whitelist_models = whitelist_models or {}
|
| 119 |
self.enable_request_logging = enable_request_logging
|
| 120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
def _is_model_ignored(self, provider: str, model_id: str) -> bool:
|
| 122 |
"""
|
| 123 |
Checks if a model should be ignored based on the ignore list.
|
|
@@ -576,8 +585,10 @@ class RotatingClient:
|
|
| 576 |
lib_logger.info(
|
| 577 |
f"Acquiring key for model {model}. Tried keys: {len(tried_creds)}/{len(credentials_for_provider)}"
|
| 578 |
)
|
|
|
|
| 579 |
current_cred = await self.usage_manager.acquire_key(
|
| 580 |
-
available_keys=creds_to_try, model=model, deadline=deadline
|
|
|
|
| 581 |
)
|
| 582 |
key_acquired = True
|
| 583 |
tried_creds.add(current_cred)
|
|
@@ -918,8 +929,10 @@ class RotatingClient:
|
|
| 918 |
lib_logger.info(
|
| 919 |
f"Acquiring credential for model {model}. Tried credentials: {len(tried_creds)}/{len(credentials_for_provider)}"
|
| 920 |
)
|
|
|
|
| 921 |
current_cred = await self.usage_manager.acquire_key(
|
| 922 |
-
available_keys=creds_to_try, model=model, deadline=deadline
|
|
|
|
| 923 |
)
|
| 924 |
key_acquired = True
|
| 925 |
tried_creds.add(current_cred)
|
|
|
|
| 61 |
ignore_models: Optional[Dict[str, List[str]]] = None,
|
| 62 |
whitelist_models: Optional[Dict[str, List[str]]] = None,
|
| 63 |
enable_request_logging: bool = False,
|
| 64 |
+
max_concurrent_requests_per_key: Optional[Dict[str, int]] = None,
|
| 65 |
):
|
| 66 |
os.environ["LITELLM_LOG"] = "ERROR"
|
| 67 |
litellm.set_verbose = False
|
|
|
|
| 119 |
self.whitelist_models = whitelist_models or {}
|
| 120 |
self.enable_request_logging = enable_request_logging
|
| 121 |
|
| 122 |
+
# Store and validate max concurrent requests per key
|
| 123 |
+
self.max_concurrent_requests_per_key = max_concurrent_requests_per_key or {}
|
| 124 |
+
# Validate all values are >= 1
|
| 125 |
+
for provider, max_val in self.max_concurrent_requests_per_key.items():
|
| 126 |
+
if max_val < 1:
|
| 127 |
+
lib_logger.warning(f"Invalid max_concurrent for '{provider}': {max_val}. Setting to 1.")
|
| 128 |
+
self.max_concurrent_requests_per_key[provider] = 1
|
| 129 |
+
|
| 130 |
def _is_model_ignored(self, provider: str, model_id: str) -> bool:
|
| 131 |
"""
|
| 132 |
Checks if a model should be ignored based on the ignore list.
|
|
|
|
| 585 |
lib_logger.info(
|
| 586 |
f"Acquiring key for model {model}. Tried keys: {len(tried_creds)}/{len(credentials_for_provider)}"
|
| 587 |
)
|
| 588 |
+
max_concurrent = self.max_concurrent_requests_per_key.get(provider, 1)
|
| 589 |
current_cred = await self.usage_manager.acquire_key(
|
| 590 |
+
available_keys=creds_to_try, model=model, deadline=deadline,
|
| 591 |
+
max_concurrent=max_concurrent
|
| 592 |
)
|
| 593 |
key_acquired = True
|
| 594 |
tried_creds.add(current_cred)
|
|
|
|
| 929 |
lib_logger.info(
|
| 930 |
f"Acquiring credential for model {model}. Tried credentials: {len(tried_creds)}/{len(credentials_for_provider)}"
|
| 931 |
)
|
| 932 |
+
max_concurrent = self.max_concurrent_requests_per_key.get(provider, 1)
|
| 933 |
current_cred = await self.usage_manager.acquire_key(
|
| 934 |
+
available_keys=creds_to_try, model=model, deadline=deadline,
|
| 935 |
+
max_concurrent=max_concurrent
|
| 936 |
)
|
| 937 |
key_acquired = True
|
| 938 |
tried_creds.add(current_cred)
|
src/rotator_library/usage_manager.py
CHANGED
|
@@ -157,11 +157,12 @@ class UsageManager:
|
|
| 157 |
self.key_states[key] = {
|
| 158 |
"lock": asyncio.Lock(),
|
| 159 |
"condition": asyncio.Condition(),
|
| 160 |
-
"models_in_use":
|
| 161 |
}
|
| 162 |
|
| 163 |
async def acquire_key(
|
| 164 |
-
self, available_keys: List[str], model: str, deadline: float
|
|
|
|
| 165 |
) -> str:
|
| 166 |
"""
|
| 167 |
Acquires the best available key using a tiered, model-aware locking strategy,
|
|
@@ -198,8 +199,8 @@ class UsageManager:
|
|
| 198 |
# Tier 1: Completely idle keys (preferred).
|
| 199 |
if not key_state["models_in_use"]:
|
| 200 |
tier1_keys.append((key, usage_count))
|
| 201 |
-
# Tier 2: Keys
|
| 202 |
-
elif
|
| 203 |
tier2_keys.append((key, usage_count))
|
| 204 |
|
| 205 |
tier1_keys.sort(key=lambda x: x[1])
|
|
@@ -210,7 +211,7 @@ class UsageManager:
|
|
| 210 |
state = self.key_states[key]
|
| 211 |
async with state["lock"]:
|
| 212 |
if not state["models_in_use"]:
|
| 213 |
-
state["models_in_use"]
|
| 214 |
lib_logger.info(
|
| 215 |
f"Acquired Tier 1 key ...{key[-6:]} for model {model}"
|
| 216 |
)
|
|
@@ -220,10 +221,12 @@ class UsageManager:
|
|
| 220 |
for key, _ in tier2_keys:
|
| 221 |
state = self.key_states[key]
|
| 222 |
async with state["lock"]:
|
| 223 |
-
|
| 224 |
-
|
|
|
|
| 225 |
lib_logger.info(
|
| 226 |
-
f"Acquired Tier 2 key ...{key[-6:]} for model {model}"
|
|
|
|
| 227 |
)
|
| 228 |
return key
|
| 229 |
|
|
@@ -271,8 +274,14 @@ class UsageManager:
|
|
| 271 |
state = self.key_states[key]
|
| 272 |
async with state["lock"]:
|
| 273 |
if model in state["models_in_use"]:
|
| 274 |
-
state["models_in_use"]
|
| 275 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
else:
|
| 277 |
lib_logger.warning(
|
| 278 |
f"Attempted to release credential ...{key[-6:]} for model {model}, but it was not in use."
|
|
|
|
| 157 |
self.key_states[key] = {
|
| 158 |
"lock": asyncio.Lock(),
|
| 159 |
"condition": asyncio.Condition(),
|
| 160 |
+
"models_in_use": {}, # Dict[model_name, concurrent_count]
|
| 161 |
}
|
| 162 |
|
| 163 |
async def acquire_key(
|
| 164 |
+
self, available_keys: List[str], model: str, deadline: float,
|
| 165 |
+
max_concurrent: int = 1
|
| 166 |
) -> str:
|
| 167 |
"""
|
| 168 |
Acquires the best available key using a tiered, model-aware locking strategy,
|
|
|
|
| 199 |
# Tier 1: Completely idle keys (preferred).
|
| 200 |
if not key_state["models_in_use"]:
|
| 201 |
tier1_keys.append((key, usage_count))
|
| 202 |
+
# Tier 2: Keys that can accept more concurrent requests for this model.
|
| 203 |
+
elif key_state["models_in_use"].get(model, 0) < max_concurrent:
|
| 204 |
tier2_keys.append((key, usage_count))
|
| 205 |
|
| 206 |
tier1_keys.sort(key=lambda x: x[1])
|
|
|
|
| 211 |
state = self.key_states[key]
|
| 212 |
async with state["lock"]:
|
| 213 |
if not state["models_in_use"]:
|
| 214 |
+
state["models_in_use"][model] = 1
|
| 215 |
lib_logger.info(
|
| 216 |
f"Acquired Tier 1 key ...{key[-6:]} for model {model}"
|
| 217 |
)
|
|
|
|
| 221 |
for key, _ in tier2_keys:
|
| 222 |
state = self.key_states[key]
|
| 223 |
async with state["lock"]:
|
| 224 |
+
current_count = state["models_in_use"].get(model, 0)
|
| 225 |
+
if current_count < max_concurrent:
|
| 226 |
+
state["models_in_use"][model] = current_count + 1
|
| 227 |
lib_logger.info(
|
| 228 |
+
f"Acquired Tier 2 key ...{key[-6:]} for model {model} "
|
| 229 |
+
f"(concurrent: {state['models_in_use'][model]}/{max_concurrent})"
|
| 230 |
)
|
| 231 |
return key
|
| 232 |
|
|
|
|
| 274 |
state = self.key_states[key]
|
| 275 |
async with state["lock"]:
|
| 276 |
if model in state["models_in_use"]:
|
| 277 |
+
state["models_in_use"][model] -= 1
|
| 278 |
+
remaining = state["models_in_use"][model]
|
| 279 |
+
if remaining <= 0:
|
| 280 |
+
del state["models_in_use"][model] # Clean up when count reaches 0
|
| 281 |
+
lib_logger.info(
|
| 282 |
+
f"Released credential ...{key[-6:]} from model {model} "
|
| 283 |
+
f"(remaining concurrent: {max(0, remaining)})"
|
| 284 |
+
)
|
| 285 |
else:
|
| 286 |
lib_logger.warning(
|
| 287 |
f"Attempted to release credential ...{key[-6:]} for model {model}, but it was not in use."
|