Mirrowel committed on
Commit
9c5dbdf
·
1 Parent(s): a9943e6

fix(antigravity): 🐛 propagate usage stats in streaming auto-recovery

Browse files

The streaming wrapper relies on `completion_tokens > 0` in the usage dictionary to correctly detect final chunks and terminate the stream. Previously, auto-recovery responses lacked this data, causing streams to hang.

- Add `_build_malformed_fallback_chunk` to generate compliant streaming error responses when retries are exhausted.
- Extract and propagate `last_usage` from the response accumulator during malformed function call exceptions.
- Ensure fixed tool call chunks and fallback chunks include valid usage data (forcing `completion_tokens` to 1 if necessary) to properly signal stream completion.

src/rotator_library/providers/antigravity_provider.py CHANGED
@@ -3153,6 +3153,55 @@ Analyze what you did wrong, correct it, and retry the function call. Output ONLY
3153
  }
3154
  )
3155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3156
  def _build_fixed_tool_call_response(
3157
  self,
3158
  model: str,
@@ -3216,6 +3265,7 @@ Analyze what you did wrong, correct it, and retry the function call. Output ONLY
3216
  parsed_call: Dict[str, Any],
3217
  error_info: Dict[str, Any],
3218
  response_id: Optional[str] = None,
 
3219
  ) -> Optional[litellm.ModelResponse]:
3220
  """
3221
  Build a streaming chunk with the auto-fixed tool call.
@@ -3227,6 +3277,8 @@ Analyze what you did wrong, correct it, and retry the function call. Output ONLY
3227
 
3228
  Args:
3229
  response_id: Optional original response ID to maintain stream continuity
 
 
3230
 
3231
  Returns None if the JSON couldn't be fixed.
3232
  """
@@ -3245,6 +3297,16 @@ Analyze what you did wrong, correct it, and retry the function call. Output ONLY
3245
  # Use original response ID if provided, otherwise generate new one
3246
  chunk_id = response_id or f"chatcmpl-{uuid.uuid4().hex[:24]}"
3247
 
 
 
 
 
 
 
 
 
 
 
3248
  return litellm.ModelResponse(
3249
  **{
3250
  "id": chunk_id,
@@ -3272,6 +3334,7 @@ Analyze what you did wrong, correct it, and retry the function call. Output ONLY
3272
  "finish_reason": "tool_calls",
3273
  }
3274
  ],
 
3275
  }
3276
  )
3277
 
@@ -4641,6 +4704,14 @@ Analyze what you did wrong, correct it, and retry the function call. Output ONLY
4641
  # Handle MALFORMED_FUNCTION_CALL - try auto-fix first
4642
  parsed = self._parse_malformed_call_message(e.finish_message, model)
4643
 
 
 
 
 
 
 
 
 
4644
  if parsed:
4645
  # Try to auto-fix the malformed JSON
4646
  error_info = self._analyze_json_error(parsed["raw_args"])
@@ -4660,15 +4731,13 @@ Analyze what you did wrong, correct it, and retry the function call. Output ONLY
4660
  error_info["fixed_json"],
4661
  )
4662
 
4663
- # Extract response_id from accumulator in exception
4664
- response_id = None
4665
- if e.raw_response and isinstance(e.raw_response, dict):
4666
- acc = e.raw_response.get("accumulator", {})
4667
- response_id = acc.get("response_id")
4668
-
4669
- # Use chunk format for streaming with original response ID
4670
  fixed_chunk = self._build_fixed_tool_call_chunk(
4671
- model, parsed, error_info, response_id=response_id
 
 
 
 
4672
  )
4673
  if fixed_chunk:
4674
  yield fixed_chunk
@@ -4730,8 +4799,11 @@ Analyze what you did wrong, correct it, and retry the function call. Output ONLY
4730
  f"[Antigravity] MALFORMED_FUNCTION_CALL could not be auto-fixed "
4731
  f"for {model} (streaming): {e.finish_message[:100]}..."
4732
  )
4733
- fallback = self._build_malformed_fallback_response(
4734
- model, e.finish_message
 
 
 
4735
  )
4736
  yield fallback
4737
  return
 
3153
  }
3154
  )
3155
 
3156
+ def _build_malformed_fallback_chunk(
3157
+ self,
3158
+ model: str,
3159
+ error_details: str,
3160
+ response_id: Optional[str] = None,
3161
+ usage: Optional[Dict[str, Any]] = None,
3162
+ ) -> litellm.ModelResponse:
3163
+ """
3164
+ Build streaming chunk error response when malformed call retries are exhausted.
3165
+
3166
+ Uses streaming format (delta instead of message) for consistency with streaming responses.
3167
+ Includes usage with completion_tokens > 0 so client.py recognizes it as a final chunk.
3168
+ """
3169
+ chunk_id = response_id or f"chatcmpl-{uuid.uuid4().hex[:24]}"
3170
+
3171
+ # Ensure usage has completion_tokens > 0 for client to recognize as final chunk
3172
+ if not usage or usage.get("completion_tokens", 0) <= 0:
3173
+ prompt_tokens = usage.get("prompt_tokens", 0) if usage else 0
3174
+ usage = {
3175
+ "prompt_tokens": prompt_tokens,
3176
+ "completion_tokens": 1,
3177
+ "total_tokens": prompt_tokens + 1,
3178
+ }
3179
+
3180
+ return litellm.ModelResponse(
3181
+ **{
3182
+ "id": chunk_id,
3183
+ "object": "chat.completion.chunk",
3184
+ "created": int(time.time()),
3185
+ "model": model,
3186
+ "choices": [
3187
+ {
3188
+ "index": 0,
3189
+ "delta": {
3190
+ "role": "assistant",
3191
+ "content": (
3192
+ "[TOOL CALL ERROR] I attempted to call a function but "
3193
+ "repeatedly produced malformed syntax. This may be a model issue.\n\n"
3194
+ f"Last error: {error_details}\n\n"
3195
+ "Please try rephrasing your request or try a different approach."
3196
+ ),
3197
+ },
3198
+ "finish_reason": "stop",
3199
+ }
3200
+ ],
3201
+ "usage": usage,
3202
+ }
3203
+ )
3204
+
3205
  def _build_fixed_tool_call_response(
3206
  self,
3207
  model: str,
 
3265
  parsed_call: Dict[str, Any],
3266
  error_info: Dict[str, Any],
3267
  response_id: Optional[str] = None,
3268
+ usage: Optional[Dict[str, Any]] = None,
3269
  ) -> Optional[litellm.ModelResponse]:
3270
  """
3271
  Build a streaming chunk with the auto-fixed tool call.
 
3277
 
3278
  Args:
3279
  response_id: Optional original response ID to maintain stream continuity
3280
+ usage: Optional usage from previous chunks. Must include completion_tokens > 0
3281
+ for client to recognize this as a final chunk.
3282
 
3283
  Returns None if the JSON couldn't be fixed.
3284
  """
 
3297
  # Use original response ID if provided, otherwise generate new one
3298
  chunk_id = response_id or f"chatcmpl-{uuid.uuid4().hex[:24]}"
3299
 
3300
+ # Ensure usage has completion_tokens > 0 for client to recognize as final chunk
3301
+ # Client.py's _safe_streaming_wrapper uses completion_tokens > 0 to detect final chunks
3302
+ if not usage or usage.get("completion_tokens", 0) <= 0:
3303
+ prompt_tokens = usage.get("prompt_tokens", 0) if usage else 0
3304
+ usage = {
3305
+ "prompt_tokens": prompt_tokens,
3306
+ "completion_tokens": 1, # Minimum to signal final chunk
3307
+ "total_tokens": prompt_tokens + 1,
3308
+ }
3309
+
3310
  return litellm.ModelResponse(
3311
  **{
3312
  "id": chunk_id,
 
3334
  "finish_reason": "tool_calls",
3335
  }
3336
  ],
3337
+ "usage": usage,
3338
  }
3339
  )
3340
 
 
4704
  # Handle MALFORMED_FUNCTION_CALL - try auto-fix first
4705
  parsed = self._parse_malformed_call_message(e.finish_message, model)
4706
 
4707
+ # Extract response_id and last_usage from accumulator for all paths
4708
+ response_id = None
4709
+ last_usage = None
4710
+ if e.raw_response and isinstance(e.raw_response, dict):
4711
+ acc = e.raw_response.get("accumulator", {})
4712
+ response_id = acc.get("response_id")
4713
+ last_usage = acc.get("last_usage")
4714
+
4715
  if parsed:
4716
  # Try to auto-fix the malformed JSON
4717
  error_info = self._analyze_json_error(parsed["raw_args"])
 
4731
  error_info["fixed_json"],
4732
  )
4733
 
4734
+ # Use chunk format for streaming with original response ID and usage
 
 
 
 
 
 
4735
  fixed_chunk = self._build_fixed_tool_call_chunk(
4736
+ model,
4737
+ parsed,
4738
+ error_info,
4739
+ response_id=response_id,
4740
+ usage=last_usage,
4741
  )
4742
  if fixed_chunk:
4743
  yield fixed_chunk
 
4799
  f"[Antigravity] MALFORMED_FUNCTION_CALL could not be auto-fixed "
4800
  f"for {model} (streaming): {e.finish_message[:100]}..."
4801
  )
4802
+ fallback = self._build_malformed_fallback_chunk(
4803
+ model,
4804
+ e.finish_message,
4805
+ response_id=response_id,
4806
+ usage=last_usage,
4807
  )
4808
  yield fallback
4809
  return