Spaces:
Paused
Paused
Mirrowel committed on
Commit Β·
c590f47
1
Parent(s): 664a3d8
fix(providers/iflow): stabilize iFlow streaming behavior and model handling
Browse files
iFlow integration produced sporadic HTTP 406s and missed finalization info for streamed responses. This change:
- Stop transmitting unsupported stream_options to iFlow (apply stream metadata only for other providers) to avoid 406 responses
- Normalize model identifiers by removing any leading provider tag so iFlow receives the raw model name
- Rework SSE parsing to accept both `data:` and `data: ` prefixes and to handle final chunks that contain both choices and usage by emitting the content first, then the usage
- Add explicit branches for usage-only and content-only chunks to ensure consistent output shapes
src/rotator_library/client.py
CHANGED
|
@@ -1552,11 +1552,22 @@ class RotatingClient:
|
|
| 1552 |
Returns:
|
| 1553 |
The completion response object, or an async generator for streaming responses, or None if all retries fail.
|
| 1554 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1555 |
if kwargs.get("stream"):
|
| 1556 |
-
|
| 1557 |
-
|
| 1558 |
-
|
| 1559 |
-
|
|
|
|
|
|
|
|
|
|
| 1560 |
return self._streaming_acompletion_with_retry(
|
| 1561 |
request=request, pre_request_callback=pre_request_callback, **kwargs
|
| 1562 |
)
|
|
|
|
| 1552 |
Returns:
|
| 1553 |
The completion response object, or an async generator for streaming responses, or None if all retries fail.
|
| 1554 |
"""
|
| 1555 |
+
# Handle iflow provider: remove stream_options to avoid HTTP 406
|
| 1556 |
+
model = kwargs.get("model", "")
|
| 1557 |
+
provider = model.split("/")[0] if "/" in model else ""
|
| 1558 |
+
|
| 1559 |
+
if provider == "iflow" and "stream_options" in kwargs:
|
| 1560 |
+
lib_logger.debug("Removing stream_options for iflow provider to avoid HTTP 406")
|
| 1561 |
+
kwargs.pop("stream_options", None)
|
| 1562 |
+
|
| 1563 |
if kwargs.get("stream"):
|
| 1564 |
+
# Only add stream_options for providers that support it (excluding iflow)
|
| 1565 |
+
if provider != "iflow":
|
| 1566 |
+
if "stream_options" not in kwargs:
|
| 1567 |
+
kwargs["stream_options"] = {}
|
| 1568 |
+
if "include_usage" not in kwargs["stream_options"]:
|
| 1569 |
+
kwargs["stream_options"]["include_usage"] = True
|
| 1570 |
+
|
| 1571 |
return self._streaming_acompletion_with_retry(
|
| 1572 |
request=request, pre_request_callback=pre_request_callback, **kwargs
|
| 1573 |
)
|
src/rotator_library/providers/iflow_provider.py
CHANGED
|
@@ -282,12 +282,29 @@ class IFlowProvider(IFlowAuthBase, ProviderInterface):
|
|
| 282 |
"""
|
| 283 |
Converts a raw iFlow SSE chunk to an OpenAI-compatible chunk.
|
| 284 |
Since iFlow is OpenAI-compatible, minimal conversion is needed.
|
|
|
|
|
|
|
|
|
|
| 285 |
"""
|
| 286 |
if not isinstance(chunk, dict):
|
| 287 |
return
|
| 288 |
|
| 289 |
-
#
|
| 290 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
yield {
|
| 292 |
"choices": [], "model": model_id, "object": "chat.completion.chunk",
|
| 293 |
"id": chunk.get("id", f"chatcmpl-iflow-{time.time()}"),
|
|
@@ -300,19 +317,30 @@ class IFlowProvider(IFlowAuthBase, ProviderInterface):
|
|
| 300 |
}
|
| 301 |
return
|
| 302 |
|
| 303 |
-
# Handle
|
| 304 |
-
|
| 305 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
return
|
| 307 |
|
| 308 |
-
#
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
|
|
|
|
|
|
| 316 |
|
| 317 |
def _stream_to_completion_response(self, chunks: List[litellm.ModelResponse]) -> litellm.ModelResponse:
|
| 318 |
"""
|
|
@@ -429,8 +457,12 @@ class IFlowProvider(IFlowAuthBase, ProviderInterface):
|
|
| 429 |
# CRITICAL: get_api_details returns api_key, NOT access_token
|
| 430 |
api_base, api_key = await self.get_api_details(credential_path)
|
| 431 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
# Build clean payload with only supported parameters
|
| 433 |
-
payload = self._build_request_payload(**
|
| 434 |
|
| 435 |
headers = {
|
| 436 |
"Authorization": f"Bearer {api_key}", # Uses api_key from user info
|
|
@@ -487,9 +519,16 @@ class IFlowProvider(IFlowAuthBase, ProviderInterface):
|
|
| 487 |
# Process successful streaming response
|
| 488 |
async for line in response.aiter_lines():
|
| 489 |
file_logger.log_response_chunk(line)
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 493 |
break
|
| 494 |
try:
|
| 495 |
chunk = json.loads(data_str)
|
|
|
|
| 282 |
"""
|
| 283 |
Converts a raw iFlow SSE chunk to an OpenAI-compatible chunk.
|
| 284 |
Since iFlow is OpenAI-compatible, minimal conversion is needed.
|
| 285 |
+
|
| 286 |
+
CRITICAL FIX: Handle chunks with BOTH usage and choices (final chunk)
|
| 287 |
+
without early return to ensure finish_reason is properly processed.
|
| 288 |
"""
|
| 289 |
if not isinstance(chunk, dict):
|
| 290 |
return
|
| 291 |
|
| 292 |
+
# Get choices and usage data
|
| 293 |
+
choices = chunk.get("choices", [])
|
| 294 |
+
usage_data = chunk.get("usage")
|
| 295 |
+
|
| 296 |
+
# Handle chunks with BOTH choices and usage (typical for final chunk)
|
| 297 |
+
# CRITICAL: Process choices FIRST to capture finish_reason, then yield usage
|
| 298 |
+
if choices and usage_data:
|
| 299 |
+
# Yield the choice chunk first (contains finish_reason)
|
| 300 |
+
yield {
|
| 301 |
+
"choices": choices,
|
| 302 |
+
"model": model_id,
|
| 303 |
+
"object": "chat.completion.chunk",
|
| 304 |
+
"id": chunk.get("id", f"chatcmpl-iflow-{time.time()}"),
|
| 305 |
+
"created": chunk.get("created", int(time.time()))
|
| 306 |
+
}
|
| 307 |
+
# Then yield the usage chunk
|
| 308 |
yield {
|
| 309 |
"choices": [], "model": model_id, "object": "chat.completion.chunk",
|
| 310 |
"id": chunk.get("id", f"chatcmpl-iflow-{time.time()}"),
|
|
|
|
| 317 |
}
|
| 318 |
return
|
| 319 |
|
| 320 |
+
# Handle usage-only chunks
|
| 321 |
+
if usage_data:
|
| 322 |
+
yield {
|
| 323 |
+
"choices": [], "model": model_id, "object": "chat.completion.chunk",
|
| 324 |
+
"id": chunk.get("id", f"chatcmpl-iflow-{time.time()}"),
|
| 325 |
+
"created": chunk.get("created", int(time.time())),
|
| 326 |
+
"usage": {
|
| 327 |
+
"prompt_tokens": usage_data.get("prompt_tokens", 0),
|
| 328 |
+
"completion_tokens": usage_data.get("completion_tokens", 0),
|
| 329 |
+
"total_tokens": usage_data.get("total_tokens", 0),
|
| 330 |
+
}
|
| 331 |
+
}
|
| 332 |
return
|
| 333 |
|
| 334 |
+
# Handle content-only chunks
|
| 335 |
+
if choices:
|
| 336 |
+
# iFlow returns OpenAI-compatible format, so we can mostly pass through
|
| 337 |
+
yield {
|
| 338 |
+
"choices": choices,
|
| 339 |
+
"model": model_id,
|
| 340 |
+
"object": "chat.completion.chunk",
|
| 341 |
+
"id": chunk.get("id", f"chatcmpl-iflow-{time.time()}"),
|
| 342 |
+
"created": chunk.get("created", int(time.time()))
|
| 343 |
+
}
|
| 344 |
|
| 345 |
def _stream_to_completion_response(self, chunks: List[litellm.ModelResponse]) -> litellm.ModelResponse:
|
| 346 |
"""
|
|
|
|
| 457 |
# CRITICAL: get_api_details returns api_key, NOT access_token
|
| 458 |
api_base, api_key = await self.get_api_details(credential_path)
|
| 459 |
|
| 460 |
+
# Strip provider prefix from model name (e.g., "iflow/Qwen3-Coder-Plus" -> "Qwen3-Coder-Plus")
|
| 461 |
+
model_name = model.split('/')[-1]
|
| 462 |
+
kwargs_with_stripped_model = {**kwargs, 'model': model_name}
|
| 463 |
+
|
| 464 |
# Build clean payload with only supported parameters
|
| 465 |
+
payload = self._build_request_payload(**kwargs_with_stripped_model)
|
| 466 |
|
| 467 |
headers = {
|
| 468 |
"Authorization": f"Bearer {api_key}", # Uses api_key from user info
|
|
|
|
| 519 |
# Process successful streaming response
|
| 520 |
async for line in response.aiter_lines():
|
| 521 |
file_logger.log_response_chunk(line)
|
| 522 |
+
|
| 523 |
+
# CRITICAL FIX: Handle both "data:" (no space) and "data: " (with space)
|
| 524 |
+
if line.startswith('data:'):
|
| 525 |
+
# Extract data after "data:" prefix, handling both formats
|
| 526 |
+
if line.startswith('data: '):
|
| 527 |
+
data_str = line[6:] # Skip "data: "
|
| 528 |
+
else:
|
| 529 |
+
data_str = line[5:] # Skip "data:"
|
| 530 |
+
|
| 531 |
+
if data_str.strip() == "[DONE]":
|
| 532 |
break
|
| 533 |
try:
|
| 534 |
chunk = json.loads(data_str)
|