Spaces:

elmerzole
/

llm-api-proxy

Paused

Mirrowel committed on Nov 27, 2025

Commit

d4593e5

1 Parent(s): 62e7cf3

fix(gemini): 🐛 consolidate parallel tool responses and improve rate limit handling

This commit addresses multiple issues with Gemini API providers related to parallel function calling and rate limit error handling:

**Tool Response Consolidation:**
- Parallel function responses are now consolidated into a single user message as required by Gemini API specification
- Previously, consecutive tool responses were sent as separate messages, causing API errors
- Implemented pending tool parts accumulation pattern in both GeminiCliProvider and AntigravityProvider
- Tool responses are flushed when a non-tool message is encountered or at the end of message processing

**Thought Signature Handling:**
- Fixed parallel function call signature behavior to match Gemini 3 API requirements
- Only the first parallel function call in a message receives a thoughtSignature field
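- Subsequent parallel calls no longer receive the placeholder bypass signature (they carry a thoughtSignature only when one was stored for that specific call), which prevents API validation errors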
- Removed `first_sig_seen` tracking flags since signatures are now stored per tool call

**Rate Limit Error Handling:**
- Added `extract_retry_after_from_body()` function to parse retry-after times from various API error formats
- Improved Gemini CLI rate limit error messages with extracted retry-after information
- Enhanced error logging to capture and display response bodies before raising HTTPStatusError
- Reduced log noise by using debug level for rate limit rotation events instead of info/warning
- Better error context propagation for 429 responses

**Code Quality:**
- Removed unused `first_sig_seen` tracking variables
- Improved inline documentation explaining Gemini API parallel function call requirements
- Consistent role mapping (tool -> user) across message transformation logic

Files changed (3) hide show

src/rotator_library/error_handler.py +38 -0
src/rotator_library/providers/antigravity_provider.py +43 -14
src/rotator_library/providers/gemini_cli_provider.py +62 -17

src/rotator_library/error_handler.py CHANGED Viewed

@@ -17,6 +17,42 @@ from litellm.exceptions import (
 )
 class NoAvailableKeysError(Exception):
     """Raised when no API keys are available for a request after waiting."""
@@ -106,6 +142,8 @@ def get_retry_after(error: Exception) -> Optional[int]:
         r"wait for\s*(\d+)\s*seconds?",
         r'"retryDelay":\s*"(\d+)s"',
         r"x-ratelimit-reset:?\s*(\d+)",
     ]
     for pattern in patterns:

 )
+def extract_retry_after_from_body(error_body: Optional[str]) -> Optional[int]:
+    """
+    Extract the retry-after time from an API error response body.
+    The body is searched case-insensitively against a fixed list of
+    patterns, tried in order; the first pattern that matches decides the
+    result. Each pattern captures a run of digits that is directly followed
+    by "s" (or by "second"/"seconds" for the "try again in" form), so only
+    whole-second values written that way are recognized.
+    Handles various error formats including:
+    - Gemini CLI: "Your quota will reset after 39s."
+    - Generic: "quota will reset after 120s", "retry after 60s"
+    - Generic: "try again in 30 seconds"
+    Args:
+        error_body: The raw error response body. None or an empty string
+            yields None without any pattern being tried.
+    Returns:
+        The retry time in seconds, or None if not found
+    """
+    # Falsy input (None or "") has nothing to search.
+    if not error_body:
+        return None
+    # Pattern to match various "reset after Xs" or "retry after Xs" formats
+    # NOTE: any text matched by the first pattern is also matched by the
+    # second, more general "reset after" pattern.
+    patterns = [
+        r"quota will reset after\s*(\d+)s",
+        r"reset after\s*(\d+)s",
+        r"retry after\s*(\d+)s",
+        r"try again in\s*(\d+)\s*seconds?",
+    ]
+    for pattern in patterns:
+        # re.search finds the pattern anywhere in the body, not only at the start.
+        match = re.search(pattern, error_body, re.IGNORECASE)
+        if match:
+            try:
+                return int(match.group(1))
+            except (ValueError, IndexError):
+                # Conversion/group lookup failed: fall through to the next pattern.
+                continue
+    return None
 class NoAvailableKeysError(Exception):
     """Raised when no API keys are available for a request after waiting."""
         r"wait for\s*(\d+)\s*seconds?",
         r'"retryDelay":\s*"(\d+)s"',
         r"x-ratelimit-reset:?\s*(\d+)",
+        r"quota will reset after\s*(\d+)s",  # Gemini CLI rate limit format
+        r"reset after\s*(\d+)s",  # Generic reset after format
     ]
     for pattern in patterns:

src/rotator_library/providers/antigravity_provider.py CHANGED Viewed

@@ -605,23 +605,38 @@ class AntigravityProvider(AntigravityAuthBase, ProviderInterface):
                         tool_id_to_name[tc_id] = tc_name
                         #lib_logger.debug(f"[ID Mapping] Registered tool_call: id={tc_id}, name={tc_name}")
-        # Convert each message
         for msg in messages:
             role = msg.get("role")
             content = msg.get("content")
             parts = []
             if role == "user":
                 parts = self._transform_user_message(content)
             elif role == "assistant":
                 parts = self._transform_assistant_message(msg, model, tool_id_to_name)
             elif role == "tool":
-                parts = self._transform_tool_message(msg, model, tool_id_to_name)
             if parts:
-                gemini_role = "model" if role == "assistant" else "user" if role == "tool" else "user"
                 gemini_contents.append({"role": gemini_role, "parts": parts})
         return system_instruction, gemini_contents
     def _parse_content_parts(
@@ -687,6 +702,9 @@ class AntigravityProvider(AntigravityAuthBase, ProviderInterface):
             parts.append({"text": content})
         # Add tool calls
         for tc in tool_calls:
             if tc.get("type") != "function":
                 continue
@@ -717,6 +735,8 @@ class AntigravityProvider(AntigravityAuthBase, ProviderInterface):
             }
             # Add thoughtSignature for Gemini 3
             if self._is_gemini_3(model):
                 sig = tc.get("thought_signature")
                 if not sig and tool_id and self._enable_signature_cache:
@@ -724,9 +744,13 @@ class AntigravityProvider(AntigravityAuthBase, ProviderInterface):
                 if sig:
                     func_part["thoughtSignature"] = sig
-                else:
                     func_part["thoughtSignature"] = "skip_thought_signature_validator"
-                    lib_logger.warning(f"Missing thoughtSignature for {tool_id}, using bypass")
             parts.append(func_part)
@@ -1146,13 +1170,20 @@ class AntigravityProvider(AntigravityAuthBase, ProviderInterface):
                 del thinking_config["thinkingLevel"]
                 thinking_config["thinkingBudget"] = -1
-        # Add thoughtSignature to function calls for Gemini 3
         if internal_model.startswith("gemini-3-"):
             for content in antigravity_payload["request"].get("contents", []):
                 if content.get("role") == "model":
                     for part in content.get("parts", []):
-                        if "functionCall" in part and "thoughtSignature" not in part:
-                            part["thoughtSignature"] = "skip_thought_signature_validator"
         # Claude-specific tool schema transformation
         if internal_model.startswith("claude-sonnet-"):
@@ -1203,7 +1234,6 @@ class AntigravityProvider(AntigravityAuthBase, ProviderInterface):
         text_content = ""
         reasoning_content = ""
         tool_calls = []
-        first_sig_seen = False
         # Use accumulator's tool_idx if available, otherwise use local counter
         tool_idx = accumulator.get("tool_idx", 0) if accumulator else 0
@@ -1235,8 +1265,8 @@ class AntigravityProvider(AntigravityAuthBase, ProviderInterface):
             if has_func:
                 tool_call = self._extract_tool_call(part, model, tool_idx, accumulator)
-                if has_sig and not first_sig_seen:
-                    first_sig_seen = True
                     self._handle_tool_signature(tool_call, part["thoughtSignature"])
                 tool_calls.append(tool_call)
@@ -1298,7 +1328,6 @@ class AntigravityProvider(AntigravityAuthBase, ProviderInterface):
         reasoning_content = ""
         tool_calls = []
         thought_sig = ""
-        first_sig_seen = False
         for part in content_parts:
             has_func = "functionCall" in part
@@ -1321,8 +1350,8 @@ class AntigravityProvider(AntigravityAuthBase, ProviderInterface):
             if has_func:
                 tool_call = self._extract_tool_call(part, model, len(tool_calls))
-                if has_sig and not first_sig_seen:
-                    first_sig_seen = True
                     self._handle_tool_signature(tool_call, part["thoughtSignature"])
                 tool_calls.append(tool_call)

                         tool_id_to_name[tc_id] = tc_name
                         #lib_logger.debug(f"[ID Mapping] Registered tool_call: id={tc_id}, name={tc_name}")
+        # Convert each message, consolidating consecutive tool responses
+        # Per Gemini docs: parallel function responses must be in a single user message
+        pending_tool_parts = []
         for msg in messages:
             role = msg.get("role")
             content = msg.get("content")
             parts = []
+            # Flush pending tool parts before non-tool message
+            if pending_tool_parts and role != "tool":
+                gemini_contents.append({"role": "user", "parts": pending_tool_parts})
+                pending_tool_parts = []
             if role == "user":
                 parts = self._transform_user_message(content)
             elif role == "assistant":
                 parts = self._transform_assistant_message(msg, model, tool_id_to_name)
             elif role == "tool":
+                tool_parts = self._transform_tool_message(msg, model, tool_id_to_name)
+                # Accumulate tool responses instead of adding individually
+                pending_tool_parts.extend(tool_parts)
+                continue
             if parts:
+                gemini_role = "model" if role == "assistant" else "user"
                 gemini_contents.append({"role": gemini_role, "parts": parts})
+        # Flush any remaining tool parts
+        if pending_tool_parts:
+            gemini_contents.append({"role": "user", "parts": pending_tool_parts})
         return system_instruction, gemini_contents
     def _parse_content_parts(
             parts.append({"text": content})
         # Add tool calls
+        # Track if we've seen the first function call in this message
+        # Per Gemini docs: Only the FIRST parallel function call gets a signature
+        first_func_in_msg = True
         for tc in tool_calls:
             if tc.get("type") != "function":
                 continue
             }
             # Add thoughtSignature for Gemini 3
+            # Per Gemini docs: Only the FIRST parallel function call gets a signature.
+            # Subsequent parallel calls should NOT have a thoughtSignature field.
             if self._is_gemini_3(model):
                 sig = tc.get("thought_signature")
                 if not sig and tool_id and self._enable_signature_cache:
                 if sig:
                     func_part["thoughtSignature"] = sig
+                elif first_func_in_msg:
+                    # Only add bypass to the first function call if no sig available
                     func_part["thoughtSignature"] = "skip_thought_signature_validator"
+                    lib_logger.warning(f"Missing thoughtSignature for first func call {tool_id}, using bypass")
+                # Subsequent parallel calls: no signature field at all
+                first_func_in_msg = False
             parts.append(func_part)
                 del thinking_config["thinkingLevel"]
                 thinking_config["thinkingBudget"] = -1
+        # Ensure first function call in each model message has a thoughtSignature for Gemini 3
+        # Per Gemini docs: Only the FIRST parallel function call gets a signature
         if internal_model.startswith("gemini-3-"):
             for content in antigravity_payload["request"].get("contents", []):
                 if content.get("role") == "model":
+                    first_func_seen = False
                     for part in content.get("parts", []):
+                        if "functionCall" in part:
+                            if not first_func_seen:
+                                # First function call in this message - needs a signature
+                                if "thoughtSignature" not in part:
+                                    part["thoughtSignature"] = "skip_thought_signature_validator"
+                                first_func_seen = True
+                            # Subsequent parallel calls: leave as-is (no signature)
         # Claude-specific tool schema transformation
         if internal_model.startswith("claude-sonnet-"):
         text_content = ""
         reasoning_content = ""
         tool_calls = []
         # Use accumulator's tool_idx if available, otherwise use local counter
         tool_idx = accumulator.get("tool_idx", 0) if accumulator else 0
             if has_func:
                 tool_call = self._extract_tool_call(part, model, tool_idx, accumulator)
+                # Store signature for each tool call (needed for parallel tool calls)
+                if has_sig:
                     self._handle_tool_signature(tool_call, part["thoughtSignature"])
                 tool_calls.append(tool_call)
         reasoning_content = ""
         tool_calls = []
         thought_sig = ""
         for part in content_parts:
             has_func = "functionCall" in part
             if has_func:
                 tool_call = self._extract_tool_call(part, model, len(tool_calls))
+                # Store signature for each tool call (needed for parallel tool calls)
+                if has_sig:
                     self._handle_tool_signature(tool_call, part["thoughtSignature"])
                 tool_calls.append(tool_call)

src/rotator_library/providers/gemini_cli_provider.py CHANGED Viewed

@@ -13,6 +13,7 @@ from .provider_cache import ProviderCache
 from ..model_definitions import ModelDefinitions
 import litellm
 from litellm.exceptions import RateLimitError
 import os
 from pathlib import Path
 import uuid
@@ -125,6 +126,7 @@ def _env_int(key: str, default: int) -> int:
     """Get integer from environment variable."""
     return int(os.getenv(key, str(default)))
 class GeminiCliProvider(GeminiAuthBase, ProviderInterface):
     skip_cost_calculation = True
@@ -684,11 +686,21 @@ class GeminiCliProvider(GeminiAuthBase, ProviderInterface):
                     if tool_call.get("type") == "function":
                         tool_call_id_to_name[tool_call["id"]] = tool_call["function"]["name"]
         for msg in messages:
             role = msg.get("role")
             content = msg.get("content")
             parts = []
-            gemini_role = "model" if role == "assistant" else "tool" if role == "tool" else "user"
             if role == "user":
                 if isinstance(content, str):
@@ -725,6 +737,9 @@ class GeminiCliProvider(GeminiAuthBase, ProviderInterface):
                 if isinstance(content, str):
                     parts.append({"text": content})
                 if msg.get("tool_calls"):
                     for tool_call in msg["tool_calls"]:
                         if tool_call.get("type") == "function":
                             try:
@@ -748,6 +763,8 @@ class GeminiCliProvider(GeminiAuthBase, ProviderInterface):
                             }
                             # Add thoughtSignature for Gemini 3
                             if is_gemini_3:
                                 sig = tool_call.get("thought_signature")
                                 if not sig and tool_id and self._enable_signature_cache:
@@ -755,9 +772,13 @@ class GeminiCliProvider(GeminiAuthBase, ProviderInterface):
                                 if sig:
                                     func_part["thoughtSignature"] = sig
-                                else:
                                     func_part["thoughtSignature"] = "skip_thought_signature_validator"
-                                    lib_logger.warning(f"Missing thoughtSignature for {tool_id}, using bypass")
                             parts.append(func_part)
@@ -771,17 +792,24 @@ class GeminiCliProvider(GeminiAuthBase, ProviderInterface):
                     # Wrap the tool response in a 'result' object
                     response_content = {"result": content}
-                    parts.append({
                         "functionResponse": {
                             "name": function_name,
                             "response": response_content,
                             "id": tool_call_id
                         }
                     })
             if parts:
                 gemini_contents.append({"role": gemini_role, "parts": parts})
         if not gemini_contents or gemini_contents[0]['role'] != 'user':
             gemini_contents.insert(0, {"role": "user", "parts": [{"text": ""}]})
@@ -866,7 +894,6 @@ class GeminiCliProvider(GeminiAuthBase, ProviderInterface):
         candidate = candidates[0]
         parts = candidate.get('content', {}).get('parts', [])
         is_gemini_3 = self._is_gemini_3(model_id)
-        first_sig_seen = False
         for part in parts:
             delta = {}
@@ -905,8 +932,8 @@ class GeminiCliProvider(GeminiAuthBase, ProviderInterface):
                 }
                 # Handle thoughtSignature for Gemini 3
-                if is_gemini_3 and has_sig and not first_sig_seen:
-                    first_sig_seen = True
                     sig = part['thoughtSignature']
                     if self._enable_signature_cache:
@@ -1369,6 +1396,15 @@ class GeminiCliProvider(GeminiAuthBase, ProviderInterface):
                 })
                 try:
                     async with client.stream("POST", url, headers=final_headers, json=request_payload, params={"alt": "sse"}, timeout=600) as response:
                         # This will raise an HTTPStatusError for 4xx/5xx responses
                         response.raise_for_status()
@@ -1405,16 +1441,24 @@ class GeminiCliProvider(GeminiAuthBase, ProviderInterface):
                             error_body = e.response.text
                         except Exception:
                             pass
-                    log_line = f"Stream handler HTTPStatusError: {str(e)}"
                     if error_body:
-                        log_line = f"{log_line} | response_body={error_body}"
-                    file_logger.log_error(log_line)
                     if e.response.status_code == 429:
-                        # Pass the raw response object to the exception. Do not read the
-                        # response body here as it will close the stream and cause a
-                        # 'StreamClosed' error in the client's stream reader.
                         raise RateLimitError(
-                            message=f"Gemini CLI rate limit exceeded: {e.request.url}",
                             llm_provider="gemini_cli",
                             model=model,
                             response=e.response
@@ -1451,7 +1495,8 @@ class GeminiCliProvider(GeminiAuthBase, ProviderInterface):
         for idx, attempt_model in enumerate(fallback_models):
             is_fallback = idx > 0
             if is_fallback:
-                lib_logger.info(f"Gemini CLI rate limited, retrying with fallback model: {attempt_model}")
             elif has_fallbacks:
                 lib_logger.debug(f"Attempting primary model: {attempt_model} (with {len(fallback_models)-1} fallback(s) available)")
             else:
@@ -1473,8 +1518,8 @@ class GeminiCliProvider(GeminiAuthBase, ProviderInterface):
                 if idx + 1 < len(fallback_models):
                     lib_logger.debug(f"Rate limit hit on {attempt_model}, trying next fallback...")
                     continue
-                # If this was the last fallback option, raise the error
-                lib_logger.error(f"Rate limit hit on all fallback models (tried {len(fallback_models)} models)")
                 raise
         # Should not reach here, but raise last error if we do

 from ..model_definitions import ModelDefinitions
 import litellm
 from litellm.exceptions import RateLimitError
+from ..error_handler import extract_retry_after_from_body
 import os
 from pathlib import Path
 import uuid
     """Get integer from environment variable."""
     return int(os.getenv(key, str(default)))
 class GeminiCliProvider(GeminiAuthBase, ProviderInterface):
     skip_cost_calculation = True
                     if tool_call.get("type") == "function":
                         tool_call_id_to_name[tool_call["id"]] = tool_call["function"]["name"]
+        # Process messages and consolidate consecutive tool responses
+        # Per Gemini docs: parallel function responses must be in a single user message,
+        # not interleaved as separate messages
+        pending_tool_parts = []  # Accumulate tool responses
         for msg in messages:
             role = msg.get("role")
             content = msg.get("content")
             parts = []
+            gemini_role = "model" if role == "assistant" else "user"  # tool -> user in Gemini
+            # If we have pending tool parts and hit a non-tool message, flush them first
+            if pending_tool_parts and role != "tool":
+                gemini_contents.append({"role": "user", "parts": pending_tool_parts})
+                pending_tool_parts = []
             if role == "user":
                 if isinstance(content, str):
                 if isinstance(content, str):
                     parts.append({"text": content})
                 if msg.get("tool_calls"):
+                    # Track if we've seen the first function call in this message
+                    # Per Gemini docs: Only the FIRST parallel function call gets a signature
+                    first_func_in_msg = True
                     for tool_call in msg["tool_calls"]:
                         if tool_call.get("type") == "function":
                             try:
                             }
                             # Add thoughtSignature for Gemini 3
+                            # Per Gemini docs: Only the FIRST parallel function call gets a signature.
+                            # Subsequent parallel calls should NOT have a thoughtSignature field.
                             if is_gemini_3:
                                 sig = tool_call.get("thought_signature")
                                 if not sig and tool_id and self._enable_signature_cache:
                                 if sig:
                                     func_part["thoughtSignature"] = sig
+                                elif first_func_in_msg:
+                                    # Only add bypass to the first function call if no sig available
                                     func_part["thoughtSignature"] = "skip_thought_signature_validator"
+                                    lib_logger.warning(f"Missing thoughtSignature for first func call {tool_id}, using bypass")
+                                # Subsequent parallel calls: no signature field at all
+                                first_func_in_msg = False
                             parts.append(func_part)
                     # Wrap the tool response in a 'result' object
                     response_content = {"result": content}
+                    # Accumulate tool responses - they'll be combined into one user message
+                    pending_tool_parts.append({
                         "functionResponse": {
                             "name": function_name,
                             "response": response_content,
                             "id": tool_call_id
                         }
                     })
+                # Don't add parts here - tool responses are handled via pending_tool_parts
+                continue
             if parts:
                 gemini_contents.append({"role": gemini_role, "parts": parts})
+        # Flush any remaining tool parts at end of messages
+        if pending_tool_parts:
+            gemini_contents.append({"role": "user", "parts": pending_tool_parts})
         if not gemini_contents or gemini_contents[0]['role'] != 'user':
             gemini_contents.insert(0, {"role": "user", "parts": [{"text": ""}]})
         candidate = candidates[0]
         parts = candidate.get('content', {}).get('parts', [])
         is_gemini_3 = self._is_gemini_3(model_id)
         for part in parts:
             delta = {}
                 }
                 # Handle thoughtSignature for Gemini 3
+                # Store signature for each tool call (needed for parallel tool calls)
+                if is_gemini_3 and has_sig:
                     sig = part['thoughtSignature']
                     if self._enable_signature_cache:
                 })
                 try:
                     async with client.stream("POST", url, headers=final_headers, json=request_payload, params={"alt": "sse"}, timeout=600) as response:
+                        # Read and log error body before raise_for_status for better debugging
+                        if response.status_code >= 400:
+                            try:
+                                error_body = await response.aread()
+                                lib_logger.error(f"Gemini CLI API error {response.status_code}: {error_body.decode()}")
+                                file_logger.log_error(f"API error {response.status_code}: {error_body.decode()}")
+                            except Exception:
+                                pass
                         # This will raise an HTTPStatusError for 4xx/5xx responses
                         response.raise_for_status()
                             error_body = e.response.text
                         except Exception:
                             pass
+                    # Only log to file logger (for detailed logging)
                     if error_body:
+                        file_logger.log_error(f"HTTPStatusError {e.response.status_code}: {error_body}")
+                    else:
+                        file_logger.log_error(f"HTTPStatusError {e.response.status_code}: {str(e)}")
                     if e.response.status_code == 429:
+                        # Extract retry-after time from the error body
+                        retry_after = extract_retry_after_from_body(error_body)
+                        retry_info = f" (retry after {retry_after}s)" if retry_after else ""
+                        error_msg = f"Gemini CLI rate limit exceeded{retry_info}"
+                        if error_body:
+                            error_msg = f"{error_msg} | {error_body}"
+                        # Only log at debug level - rotation happens silently
+                        lib_logger.debug(f"Gemini CLI 429 rate limit: retry_after={retry_after}s")
                         raise RateLimitError(
+                            message=error_msg,
                             llm_provider="gemini_cli",
                             model=model,
                             response=e.response
         for idx, attempt_model in enumerate(fallback_models):
             is_fallback = idx > 0
             if is_fallback:
+                # Silent rotation - only log at debug level
+                lib_logger.debug(f"Rate limited on previous model, trying fallback: {attempt_model}")
             elif has_fallbacks:
                 lib_logger.debug(f"Attempting primary model: {attempt_model} (with {len(fallback_models)-1} fallback(s) available)")
             else:
                 if idx + 1 < len(fallback_models):
                     lib_logger.debug(f"Rate limit hit on {attempt_model}, trying next fallback...")
                     continue
+                # If this was the last fallback option, log error and raise
+                lib_logger.warning(f"Rate limit exhausted on all fallback models (tried {len(fallback_models)} models)")
                 raise
         # Should not reach here, but raise last error if we do