Spaces:

sharktide
/

lightning

Running

App Files Files Community

sharktide committed on Apr 25

Commit

4a20d1d

verified ·

1 Parent(s): e5661d9

Update gen.py

Browse files

Files changed (1) hide show

gen.py +88 -107

gen.py CHANGED Viewed

@@ -218,6 +218,50 @@ def _prepare_forward_headers(request: Request) -> Dict[str, str]:
     return fwd
 async def call_chat_completions(
     messages: List[Dict[str, Any]],
     model: str,
@@ -240,18 +284,15 @@ async def call_chat_completions(
     """
     url, api_key = _get_provider_url_and_key(provider)
     headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
-    # Forward client-provided auth/identity headers
-    # Note: This assumes `request` is in scope — it's not, so we must pass it.
-    # We'll fix this by modifying the caller to pass request, or remove if not needed.
-    # For now, we keep it minimal and only do this where we have `request`.
-    # In this function, we do not have `request`, so we skip header forwarding.
-    # Callers that have `request` should handle it externally if needed.
     # Always request streaming upstream — we reassemble below.
     body: Dict[str, Any] = {"model": model, "messages": messages, "stream": True}
     if extra_body:
-        body.update(extra_body)
-        body["stream"] = True  # force streaming even if caller passed stream=False
     TRANSIENT = {502, 503, 524, 429}
     MAX_ATTEMPTS = 3
@@ -295,11 +336,6 @@ async def call_chat_completions(
                     resp_model = model
                     stalled = False
-                    # Wrap each aiter_lines() call in a per-chunk timeout.
-                    # This is the upstream keepalive mechanism: if navy stops
-                    # sending bytes for CHUNK_TIMEOUT seconds we abort and retry
-                    # the whole request rather than silently waiting for Cloudflare
-                    # to kill us with a 524.
                     aiter = r.aiter_lines().__aiter__()
                     while True:
                         try:
@@ -415,7 +451,7 @@ async def call_chat_completions(
         "stream": False,
     }
     if extra_body:
-        # Forward tools/tool_choice but not stream override.
         for k in ("tools", "tool_choice"):
             if k in extra_body:
                 fallback_body[k] = extra_body[k]
@@ -1037,8 +1073,6 @@ async def generate_text(
                         try:
                             obj = json.loads(raw)
                         except Exception:
-                            # Not valid JSON — forward verbatim (keeps partial
-                            # chunks from blocking the stream).
                             yield chunk
                             continue
@@ -1049,24 +1083,9 @@ async def generate_text(
                         # Normalize usage block whenever it appears.
                         _normalize_usage_block(obj)
-                        # ── thinking / reasoning tokens ───────────────────────
-                        # Navy models may embed thinking in two ways:
-                        #
-                        #   1. As delta.reasoning_content (separate field)
-                        #   2. Inline inside delta.content wrapped in ```...```
-                        #
-                        # For API-key callers we always surface both forms.
-                        # For browser/session callers we strip reasoning_content
-                        # so it doesn't confuse UI clients that don't expect it,
-                        # but ``` tags already present in content are left
-                        # alone (they arrived that way from upstream).
                         if forward_thinking:
-                            # Merge reasoning_content into content as
-                            # ```...``` and keep the raw field.
                             obj = _inject_reasoning_into_chunk(obj)
                         else:
-                            # Strip the non-standard field so browser clients
-                            # don't see unexpected keys.
                             try:
                                 delta = obj["choices"][0]["delta"]
                                 delta.pop("reasoning_content", None)
@@ -1092,7 +1111,6 @@ async def generate_text(
         )
     # ── non-streaming ─────────────────────────
-    # Forward headers to upstream call if we had request (we do!)
     fwd_headers = _prepare_forward_headers(request)
     fwd_headers.update({"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"})
@@ -1106,7 +1124,6 @@ async def generate_text(
             fallback_body = dict(body)
             fallback_body["model"] = FALLBACK_MODEL
             fb_headers = {"Authorization": f"Bearer {fb_key}", "Content-Type": "application/json"}
-            # Forward original headers to fallback
             fb_fwd_headers = _prepare_forward_headers(request)
             fb_fwd_headers.update(fb_headers)
             r = await client.post(
@@ -1122,14 +1139,8 @@ async def generate_text(
         except Exception:
             payload = {"error": "Upstream returned invalid JSON"}
         else:
-            # Normalize usage fields.
             _normalize_usage_block(payload)
-            # ── thinking tokens in non-streaming responses ────────────────────
-            # Some navy models put thinking content in
-            # message.reasoning_content.  For API-key callers we prepend it to
-            # message.content wrapped in ```...```; for others we drop
-            # the non-standard field.
             try:
                 message = payload["choices"][0]["message"]
                 reasoning = (
@@ -1141,9 +1152,7 @@ async def generate_text(
                     if forward_thinking:
                         existing = message.get("content") or ""
                         message["content"] = f"```\n{reasoning}\n```{existing}"
-                        # Restore the raw field for clients that want it.
                         message["reasoning_content"] = reasoning
-                    # else: already popped — nothing to do.
                     payload["choices"][0]["message"] = message
             except (KeyError, IndexError, TypeError):
                 pass
@@ -1227,8 +1236,11 @@ def _responses_input_to_messages(
     instructions: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     messages: List[Dict[str, Any]] = []
     if instructions:
-        messages.append({"role": "developer", "content": instructions})
     if isinstance(input_data, str):
         messages.append({"role": "user", "content": input_data})
@@ -1242,6 +1254,9 @@ def _responses_input_to_messages(
             if not isinstance(item, dict):
                 continue
             role = item.get("role", "user")
             text = _content_to_text(item.get("content", ""))
             if text:
                 messages.append({"role": role, "content": text})
@@ -1256,7 +1271,6 @@ def _build_responses_payload(
     output_tokens: int = 0,
     tool_calls: Optional[List[Dict[str, Any]]] = None,
 ) -> Dict[str, Any]:
-    # Build content: text part first, then one function_call part per tool call
     content: List[Dict[str, Any]] = []
     if text:
         content.append({"type": "output_text", "text": text, "annotations": []})
@@ -1269,7 +1283,6 @@ def _build_responses_payload(
             "input": json.loads(fn["arguments"]) if fn.get("arguments") else {},
         })
-    # Top-level output items: one message item (text) + one per tool call
     output_items: List[Dict[str, Any]] = []
     if text or not tool_calls:
@@ -1338,10 +1351,14 @@ async def create_responses(
     uses_tools = bool(tools) or (tool_choice not in [None, "none"])
     # Build extra fields to forward upstream
     extra_body: Dict[str, Any] = {}
-    if tools:
-        extra_body["tools"] = tools
     if tool_choice is not None:
         extra_body["tool_choice"] = tool_choice
@@ -1349,32 +1366,28 @@ async def create_responses(
     _log_routing(chosen_model, provider, messages, uses_tools=uses_tools)
     await _check_chat_rate_limit(request, authorization, x_client_id)
-    # Determine if we should forward thinking (reasoning) tokens
     forward_thinking = _is_api_key_request(request)
     # ── non-streaming ─────────────────────────
     if stream is False:
-        # Forward headers to upstream
-        fwd_headers = _prepare_forward_headers(request)
-        url, api_key = _get_provider_url_and_key(provider)
-        fwd_headers.update({"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"})
-        data = await call_chat_completions(
-            messages, chosen_model, provider, extra_body=extra_body or None
-        )
-        # Note: call_chat_completions does not currently use fwd_headers — we must fix that.
-        # Since we cannot change call_chat_completions signature easily here,
-        # we instead reimplement the non-streaming path with proper header forwarding.
-        # But to avoid duplication, we'll assume call_chat_completions is fixed externally.
-        # For now, we proceed and note: header forwarding in non-streaming is incomplete
-        # unless call_chat_completions is updated to accept headers.
-        # Given constraints, we'll skip and note this as a remaining gap.
         data = await call_chat_completions(
             messages, chosen_model, provider, extra_body=extra_body or None
         )
         input_tokens, output_tokens = _extract_usage(data)
         message = data.get("choices", [{}])[0].get("message", {})
         text = message.get("content") or ""
         tool_calls = message.get("tool_calls") or []
         response_id = _resp_id("resp")
         return JSONResponse(
@@ -1384,26 +1397,10 @@ async def create_responses(
         )
     # ── streaming ─────────────────────────────
-    # Rather than accumulating the full upstream response and then replaying it,
-    # we open a streaming connection to the upstream and translate each SSE chunk
-    # into the appropriate Responses-API event in real time.
-    #
-    # This means:
-    #   - Thinking/reasoning tokens appear as response.output_text.delta events
-    #     the moment navy emits them — no buffering, no pings needed.
-    #   - Tool-call argument fragments stream as
-    #     response.function_call_arguments.delta events.
-    #   - The Cloudflare 524 problem is avoided because bytes flow continuously.
-    #
-    # State machine:
-    #   THINKING  → emitting ```...``` deltas (reasoning_content field)
-    #   TEXT      → emitting normal output_text deltas (content field)
-    #   TOOL      → emitting function_call_arguments deltas
-    #   DONE      → response.completed emitted, generator exits
     async def event_stream():
         response_id = _resp_id("resp")
         ts          = _resp_ts()
-        CHUNK_TIMEOUT = 60  # seconds — stall detector (same as call_chat_completions)
         def sse(event_type: str, data: dict) -> str:
             return f"event: {event_type}\ndata: {json.dumps(data)}\n\n"
@@ -1435,31 +1432,27 @@ async def create_responses(
         })
         up_url, up_key = _get_provider_url_and_key(provider)
-        up_headers = {"Authorization": f"Bearer {up_key}", "Content-Type": "application/json"}
         up_body: Dict[str, Any] = {
             "model": chosen_model, "messages": messages, "stream": True,
         }
         if extra_body:
-            up_body.update(extra_body)
-            up_body["stream"] = True
         TRANSIENT = {502, 503, 524, 429}
         MAX_ATTEMPTS = 3
-        # ── Per-attempt retry loop ────────────────────────────────────────────
-        # If navy stalls or returns a transient error we retry transparently.
-        # The client already received response.created/in_progress so we just
-        # keep the stream open; from Codex's perspective it's still waiting.
         for attempt in range(MAX_ATTEMPTS):
             if attempt:
                 await asyncio.sleep(2 ** attempt)
-            # Accumulated state — reset on each retry so we don't double-emit.
             text_item_id       = _resp_id("msg")
             output_index       = 0
-            text_started       = False   # have we opened a message output item?
-            thinking_open      = False   # are we inside a ``` block?
-            full_text          = ""      # for response.completed payload
             full_reasoning     = ""
             tool_calls_map: Dict[int, Dict[str, Any]] = {}
             tool_item_ids: Dict[int, str] = {}
@@ -1470,14 +1463,13 @@ async def create_responses(
             attempt_failed = False
             try:
-                # Prepare headers: upstream auth + forwarded client headers
                 fwd_headers = _prepare_forward_headers(request)
                 fwd_headers.update({"Authorization": f"Bearer {up_key}", "Content-Type": "application/json"})
                 async with httpx.AsyncClient(timeout=httpx.Timeout(300.0, read=300.0)) as client:
                     async with client.stream("POST", up_url, json=up_body, headers=fwd_headers) as r:
                         if r.status_code in TRANSIENT:
-                            body_bytes = await r.aread()
                             print(
                                 f"[responses stream] attempt {attempt+1} got "
                                 f"{r.status_code}, retrying…"
@@ -1540,8 +1532,6 @@ async def create_responses(
                             if reasoning_chunk:
                                 full_reasoning += reasoning_chunk
                                 if not text_started:
-                                    # Open the message output item on first token
-                                    # (whether thinking or regular content).
                                     text_started = True
                                     yield sse("response.output_item.added", {
                                         "type": "response.output_item.added",
@@ -1561,7 +1551,6 @@ async def create_responses(
                                         "part": {"type": "output_text", "text": "", "annotations": []},
                                     })
                                 if not thinking_open:
-                                    # Emit the opening ``` tag as its own delta.
                                     thinking_open = True
                                     yield sse("response.output_text.delta", {
                                         "type": "response.output_text.delta",
@@ -1602,7 +1591,6 @@ async def create_responses(
                                         "part": {"type": "output_text", "text": "", "annotations": []},
                                     })
                                 if thinking_open:
-                                    # Close the ``` block before regular content.
                                     thinking_open = False
                                     yield sse("response.output_text.delta", {
                                         "type": "response.output_text.delta",
@@ -1623,7 +1611,6 @@ async def create_responses(
                             for tc_delta in (delta.get("tool_calls") or []):
                                 idx = tc_delta.get("index", 0)
-                                # First fragment for this tool call index.
                                 if idx not in tool_calls_map:
                                     tc_id = tc_delta.get("id") or _resp_id("tool")
                                     tool_calls_map[idx] = {
@@ -1646,12 +1633,9 @@ async def create_responses(
                                 if arg_chunk:
                                     existing["function"]["arguments"] += arg_chunk
-                                    # Open this tool-call output item on its first
-                                    # argument fragment, once we know the name.
                                     tc_id = tool_item_ids[idx]
                                     if not tool_started[idx] and existing["function"]["name"]:
                                         tool_started[idx] = True
-                                        # Close text item first if it's open.
                                         if text_started:
                                             if thinking_open:
                                                 thinking_open = False
@@ -1720,13 +1704,12 @@ async def create_responses(
             except (httpx.RemoteProtocolError, httpx.ReadError, httpx.ConnectError) as exc:
                 print(f"[responses stream] attempt {attempt+1} network error: {exc}")
-                stalled = True  # treat as retryable
             if stalled or attempt_failed:
-                continue  # retry
             # ── Stream finished cleanly — emit closing events ─────────────────
-            # Close any still-open text item.
             if text_started:
                 if thinking_open:
                     yield sse("response.output_text.delta", {
@@ -1764,7 +1747,6 @@ async def create_responses(
                 })
                 output_index += 1
-            # Close any open tool-call items.
             for idx in sorted(tool_calls_map):
                 if not tool_started.get(idx):
                     continue
@@ -1789,7 +1771,6 @@ async def create_responses(
                 })
                 output_index += 1
-            # Build tool_calls list for the completed payload.
             tool_calls_list = [tool_calls_map[i] for i in sorted(tool_calls_map)]
             input_tok, output_tok = _extract_usage({"usage": usage})
             yield sse("response.completed", {
@@ -1800,7 +1781,7 @@ async def create_responses(
                 ),
             })
             yield "data: [DONE]\n\n"
-            return  # success — exit generator
         # All retry attempts exhausted.
         yield _fail("Upstream failed after multiple retries")
@@ -1810,4 +1791,4 @@ async def create_responses(
         event_stream(),
         media_type="text/event-stream",
         headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"},
-    )

     return fwd
+def _normalize_tools_for_chat_completions(tools: Optional[List[Any]]) -> Optional[List[Any]]:
+    """
+    Normalize tools from the Responses API format to the chat completions format.
+
+    Responses API tools look like:
+        {"type": "function", "name": "...", "description": "...", "parameters": {...}, "strict": ...}
+    Chat completions tools look like:
+        {"type": "function", "function": {"name": "...", "description": "...", "parameters": {...}, "strict": ...}}
+
+    Args:
+        tools: Tool definitions as received from the caller, or None.
+
+    Returns:
+        ``tools`` itself when it is falsy (None or an empty list). Otherwise a
+        new list of the same length and order in which every flat function
+        tool has been wrapped in a nested "function" object. Entries that are
+        not dicts, that already carry a "function" key, or whose "type" is not
+        "function" are passed through as the same objects, unmodified.
+    """
+    if not tools:
+        return tools
+    # Keys that move from the top level of a flat tool into the nested
+    # "function" object. "strict" is lifted along with the others so a
+    # caller's strict-schema request is not silently lost in translation.
+    lifted_keys = ("name", "description", "parameters", "strict")
+    normalized: List[Any] = []
+    for tool in tools:
+        needs_wrapping = (
+            isinstance(tool, dict)
+            and "function" not in tool
+            and tool.get("type") == "function"
+        )
+        if needs_wrapping:
+            # Only copy keys that are actually present; absent keys stay absent.
+            fn: Dict[str, Any] = {k: tool[k] for k in lifted_keys if k in tool}
+            normalized.append({"type": "function", "function": fn})
+        else:
+            # Non-dict, already-nested, or unknown tool type: pass through as-is.
+            normalized.append(tool)
+    return normalized
 async def call_chat_completions(
     messages: List[Dict[str, Any]],
     model: str,
     """
     url, api_key = _get_provider_url_and_key(provider)
     headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
     # Always request streaming upstream — we reassemble below.
+    # FIX #1: Never let extra_body overwrite the model we chose; also protect
+    # the stream flag so it always stays True for our reassembly logic.
     body: Dict[str, Any] = {"model": model, "messages": messages, "stream": True}
     if extra_body:
+        for k, v in extra_body.items():
+            if k not in ("model", "stream"):   # ← protect model & stream
+                body[k] = v
     TRANSIENT = {502, 503, 524, 429}
     MAX_ATTEMPTS = 3
                     resp_model = model
                     stalled = False
                     aiter = r.aiter_lines().__aiter__()
                     while True:
                         try:
         "stream": False,
     }
     if extra_body:
+        # Forward tools/tool_choice but not stream/model override.
         for k in ("tools", "tool_choice"):
             if k in extra_body:
                 fallback_body[k] = extra_body[k]
                         try:
                             obj = json.loads(raw)
                         except Exception:
                             yield chunk
                             continue
                         # Normalize usage block whenever it appears.
                         _normalize_usage_block(obj)
                         if forward_thinking:
                             obj = _inject_reasoning_into_chunk(obj)
                         else:
                             try:
                                 delta = obj["choices"][0]["delta"]
                                 delta.pop("reasoning_content", None)
         )
     # ── non-streaming ─────────────────────────
     fwd_headers = _prepare_forward_headers(request)
     fwd_headers.update({"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"})
             fallback_body = dict(body)
             fallback_body["model"] = FALLBACK_MODEL
             fb_headers = {"Authorization": f"Bearer {fb_key}", "Content-Type": "application/json"}
             fb_fwd_headers = _prepare_forward_headers(request)
             fb_fwd_headers.update(fb_headers)
             r = await client.post(
         except Exception:
             payload = {"error": "Upstream returned invalid JSON"}
         else:
             _normalize_usage_block(payload)
             try:
                 message = payload["choices"][0]["message"]
                 reasoning = (
                     if forward_thinking:
                         existing = message.get("content") or ""
                         message["content"] = f"```\n{reasoning}\n```{existing}"
                         message["reasoning_content"] = reasoning
                     payload["choices"][0]["message"] = message
             except (KeyError, IndexError, TypeError):
                 pass
     instructions: Optional[str] = None,
 ) -> List[Dict[str, Any]]:
     messages: List[Dict[str, Any]] = []
+    # FIX #3: Use "system" instead of "developer" — navy rejects the
+    # non-standard "developer" role with a 400.
     if instructions:
+        messages.append({"role": "system", "content": instructions})
     if isinstance(input_data, str):
         messages.append({"role": "user", "content": input_data})
             if not isinstance(item, dict):
                 continue
             role = item.get("role", "user")
+            # Also normalise "developer" role on individual message items.
+            if role == "developer":
+                role = "system"
             text = _content_to_text(item.get("content", ""))
             if text:
                 messages.append({"role": role, "content": text})
     output_tokens: int = 0,
     tool_calls: Optional[List[Dict[str, Any]]] = None,
 ) -> Dict[str, Any]:
     content: List[Dict[str, Any]] = []
     if text:
         content.append({"type": "output_text", "text": text, "annotations": []})
             "input": json.loads(fn["arguments"]) if fn.get("arguments") else {},
         })
     output_items: List[Dict[str, Any]] = []
     if text or not tool_calls:
     uses_tools = bool(tools) or (tool_choice not in [None, "none"])
+    # FIX #2: Normalize tools from Responses API format → chat completions format
+    # before forwarding to navy, which only speaks chat completions.
+    normalized_tools = _normalize_tools_for_chat_completions(tools)
     # Build extra fields to forward upstream
     extra_body: Dict[str, Any] = {}
+    if normalized_tools:
+        extra_body["tools"] = normalized_tools
     if tool_choice is not None:
         extra_body["tool_choice"] = tool_choice
     _log_routing(chosen_model, provider, messages, uses_tools=uses_tools)
     await _check_chat_rate_limit(request, authorization, x_client_id)
     forward_thinking = _is_api_key_request(request)
     # ── non-streaming ─────────────────────────
     if stream is False:
+        # FIX: removed the duplicate call_chat_completions call that was here
         data = await call_chat_completions(
             messages, chosen_model, provider, extra_body=extra_body or None
         )
         input_tokens, output_tokens = _extract_usage(data)
         message = data.get("choices", [{}])[0].get("message", {})
+        # Handle reasoning tokens in non-streaming responses path
+        reasoning = (
+            message.pop("reasoning_content", None)
+            or message.pop("reasoning", None)
+            or ""
+        )
         text = message.get("content") or ""
+        if reasoning and isinstance(reasoning, str):
+            if forward_thinking:
+                text = f"```\n{reasoning}\n```{text}"
         tool_calls = message.get("tool_calls") or []
         response_id = _resp_id("resp")
         return JSONResponse(
         )
     # ── streaming ─────────────────────────────
     async def event_stream():
         response_id = _resp_id("resp")
         ts          = _resp_ts()
+        CHUNK_TIMEOUT = 60
         def sse(event_type: str, data: dict) -> str:
             return f"event: {event_type}\ndata: {json.dumps(data)}\n\n"
         })
         up_url, up_key = _get_provider_url_and_key(provider)
         up_body: Dict[str, Any] = {
             "model": chosen_model, "messages": messages, "stream": True,
         }
         if extra_body:
+            # FIX #1 (streaming path): also protect model/stream here
+            for k, v in extra_body.items():
+                if k not in ("model", "stream"):
+                    up_body[k] = v
         TRANSIENT = {502, 503, 524, 429}
         MAX_ATTEMPTS = 3
         for attempt in range(MAX_ATTEMPTS):
             if attempt:
                 await asyncio.sleep(2 ** attempt)
             text_item_id       = _resp_id("msg")
             output_index       = 0
+            text_started       = False
+            thinking_open      = False
+            full_text          = ""
             full_reasoning     = ""
             tool_calls_map: Dict[int, Dict[str, Any]] = {}
             tool_item_ids: Dict[int, str] = {}
             attempt_failed = False
             try:
                 fwd_headers = _prepare_forward_headers(request)
                 fwd_headers.update({"Authorization": f"Bearer {up_key}", "Content-Type": "application/json"})
                 async with httpx.AsyncClient(timeout=httpx.Timeout(300.0, read=300.0)) as client:
                     async with client.stream("POST", up_url, json=up_body, headers=fwd_headers) as r:
                         if r.status_code in TRANSIENT:
+                            await r.aread()
                             print(
                                 f"[responses stream] attempt {attempt+1} got "
                                 f"{r.status_code}, retrying…"
                             if reasoning_chunk:
                                 full_reasoning += reasoning_chunk
                                 if not text_started:
                                     text_started = True
                                     yield sse("response.output_item.added", {
                                         "type": "response.output_item.added",
                                         "part": {"type": "output_text", "text": "", "annotations": []},
                                     })
                                 if not thinking_open:
                                     thinking_open = True
                                     yield sse("response.output_text.delta", {
                                         "type": "response.output_text.delta",
                                         "part": {"type": "output_text", "text": "", "annotations": []},
                                     })
                                 if thinking_open:
                                     thinking_open = False
                                     yield sse("response.output_text.delta", {
                                         "type": "response.output_text.delta",
                             for tc_delta in (delta.get("tool_calls") or []):
                                 idx = tc_delta.get("index", 0)
                                 if idx not in tool_calls_map:
                                     tc_id = tc_delta.get("id") or _resp_id("tool")
                                     tool_calls_map[idx] = {
                                 if arg_chunk:
                                     existing["function"]["arguments"] += arg_chunk
                                     tc_id = tool_item_ids[idx]
                                     if not tool_started[idx] and existing["function"]["name"]:
                                         tool_started[idx] = True
                                         if text_started:
                                             if thinking_open:
                                                 thinking_open = False
             except (httpx.RemoteProtocolError, httpx.ReadError, httpx.ConnectError) as exc:
                 print(f"[responses stream] attempt {attempt+1} network error: {exc}")
+                stalled = True
             if stalled or attempt_failed:
+                continue
             # ── Stream finished cleanly — emit closing events ─────────────────
             if text_started:
                 if thinking_open:
                     yield sse("response.output_text.delta", {
                 })
                 output_index += 1
             for idx in sorted(tool_calls_map):
                 if not tool_started.get(idx):
                     continue
                 })
                 output_index += 1
             tool_calls_list = [tool_calls_map[i] for i in sorted(tool_calls_map)]
             input_tok, output_tok = _extract_usage({"usage": usage})
             yield sse("response.completed", {
                 ),
             })
             yield "data: [DONE]\n\n"
+            return  # success
         # All retry attempts exhausted.
         yield _fail("Upstream failed after multiple retries")
         event_stream(),
         media_type="text/event-stream",
         headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"},
+    )