Files changed (1)
  1. gen.py +378 -47
gen.py CHANGED
@@ -66,6 +66,10 @@ MODEL_MAP = {
  FALLBACK_MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
  FALLBACK_PROVIDER = "groq"

+ # Header that API-key authenticated clients send so we know to stream
+ # thinking tokens back to them.
+ API_KEY_HEADER = "x-api-key"
+

  # ──────────────────────────────────────────────
  # CENTRAL ROUTING LOGIC
@@ -208,24 +212,204 @@ async def call_chat_completions(
  extra_body: Optional[Dict[str, Any]] = None,
  ) -> Dict[str, Any]:
  """
- Non-streaming chat-completions call.
-
- Returns the full upstream JSON payload.
- Raises HTTPException on upstream errors.
+ Resilient chat-completions call designed to survive Cloudflare 524 timeouts.
+
+ Strategy:
+ 1. Ask the upstream for a *streaming* response so bytes trickle in
+ continuously, preventing Cloudflare's idle timeout from firing.
+ 2. Each chunk from aiter_lines() has its own deadline (CHUNK_TIMEOUT).
+ If navy goes completely silent mid-stream (common during long tool-call
+ generation) we detect the stall quickly and retry rather than waiting
+ for Cloudflare to 524 us.
+ 3. Retry up to MAX_ATTEMPTS times on transient errors or stalls,
+ with exponential back-off between attempts.
+ 4. On exhausted retries fall through to the Groq fallback.
  """
  url, api_key = _get_provider_url_and_key(provider)
  headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
- body = {"model": model, "messages": messages, "stream": False}
+
+ # Always request streaming upstream — we reassemble below.
+ body: Dict[str, Any] = {"model": model, "messages": messages, "stream": True}
  if extra_body:
  body.update(extra_body)
+ body["stream"] = True # force streaming even if caller passed stream=False

- async with httpx.AsyncClient(timeout=None) as client:
- r = await client.post(url, json=body, headers=headers)
+ TRANSIENT = {502, 503, 524, 429}
+ MAX_ATTEMPTS = 3
+ # How long to wait for the *next chunk* before declaring a stall.
+ # Must be comfortably below Cloudflare's ~100 s idle-connection limit.
+ CHUNK_TIMEOUT = 60 # seconds
+
+ last_exc: Optional[Exception] = None
+
+ for attempt in range(MAX_ATTEMPTS):
+ if attempt:
+ await asyncio.sleep(2 ** attempt) # 2 s, 4 s
+
+ try:
+ async with httpx.AsyncClient(timeout=httpx.Timeout(300.0, read=300.0)) as client:
+ async with client.stream("POST", url, json=body, headers=headers) as r:
+ # Transient upstream error — retry.
+ if r.status_code in TRANSIENT:
+ body_bytes = await r.aread()
+ last_exc = HTTPException(
+ status_code=r.status_code,
+ detail=body_bytes.decode("utf-8", errors="replace")[:500],
+ )
+ print(f"[call_chat_completions] attempt {attempt+1} got {r.status_code}, retrying…")
+ continue
+
+ if r.status_code != 200:
+ body_bytes = await r.aread()
+ raise HTTPException(
+ status_code=r.status_code,
+ detail=body_bytes.decode("utf-8", errors="replace")[:1000],
+ )
+
+ # ── Reassemble streaming SSE into a single response object ──
+ accumulated_content = ""
+ accumulated_reasoning = ""
+ tool_calls_map: Dict[int, Dict[str, Any]] = {}
+ usage: Dict[str, Any] = {}
+ finish_reason: Optional[str] = None
+ resp_id = ""
+ resp_model = model
+ stalled = False
+
+ # Wrap each aiter_lines() call in a per-chunk timeout.
+ # This is the upstream keepalive mechanism: if navy stops
+ # sending bytes for CHUNK_TIMEOUT seconds we abort and retry
+ # the whole request rather than silently waiting for Cloudflare
+ # to kill us with a 524.
+ aiter = r.aiter_lines().__aiter__()
+ while True:
+ try:
+ line = await asyncio.wait_for(
+ aiter.__anext__(), timeout=CHUNK_TIMEOUT
+ )
+ except asyncio.TimeoutError:
+ print(
+ f"[call_chat_completions] attempt {attempt+1} "
+ f"stalled >{CHUNK_TIMEOUT}s waiting for next chunk — retrying"
+ )
+ stalled = True
+ break
+ except StopAsyncIteration:
+ break
+
+ if not line or not line.startswith("data:"):
+ continue
+ raw = line[5:].strip()
+ if raw == "[DONE]":
+ break
+ try:
+ obj = json.loads(raw)
+ except Exception:
+ continue

- if r.status_code != 200:
- raise HTTPException(status_code=r.status_code, detail=r.text[:1000])
+ if not isinstance(obj, dict):
+ continue
+
+ resp_id = resp_id or obj.get("id", "")
+ resp_model = obj.get("model", resp_model)
+
+ if "usage" in obj and obj["usage"]:
+ usage = obj["usage"]
+
+ choices = obj.get("choices") or []
+ if not choices:
+ continue

- return r.json()
+ choice = choices[0]
+ finish_reason = choice.get("finish_reason") or finish_reason
+ delta = choice.get("delta") or {}
+
+ # Accumulate text content.
+ dc = delta.get("content")
+ if dc:
+ accumulated_content += dc
+
+ # Accumulate reasoning / thinking tokens.
+ dr = delta.get("reasoning_content") or delta.get("reasoning")
+ if dr:
+ accumulated_reasoning += dr
+
+ # Accumulate tool-call argument chunks (streamed as fragments).
+ for tc_delta in (delta.get("tool_calls") or []):
+ idx = tc_delta.get("index", 0)
+ if idx not in tool_calls_map:
+ tool_calls_map[idx] = {
+ "id": tc_delta.get("id", ""),
+ "type": tc_delta.get("type", "function"),
+ "function": {"name": "", "arguments": ""},
+ }
+ existing = tool_calls_map[idx]
+ if tc_delta.get("id"):
+ existing["id"] = tc_delta["id"]
+ fn_delta = tc_delta.get("function") or {}
+ if fn_delta.get("name"):
+ existing["function"]["name"] += fn_delta["name"]
+ if fn_delta.get("arguments"):
+ existing["function"]["arguments"] += fn_delta["arguments"]
+
+ if stalled:
+ last_exc = Exception(f"navy stalled >{CHUNK_TIMEOUT}s between chunks")
+ continue # → next retry attempt
+
+ # Reassemble into a standard non-streaming response shape.
+ tool_calls_list = [tool_calls_map[i] for i in sorted(tool_calls_map)]
+
+ message: Dict[str, Any] = {"role": "assistant", "content": accumulated_content}
+ if accumulated_reasoning:
+ message["reasoning_content"] = accumulated_reasoning
+ if tool_calls_list:
+ message["tool_calls"] = tool_calls_list
+
+ return {
+ "id": resp_id,
+ "object": "chat.completion",
+ "model": resp_model,
+ "choices": [
+ {
+ "index": 0,
+ "message": message,
+ "finish_reason": finish_reason or "stop",
+ }
+ ],
+ "usage": usage,
+ }
+
+ except HTTPException:
+ raise
+ except (httpx.RemoteProtocolError, httpx.ReadError, httpx.ConnectError) as exc:
+ last_exc = exc
+ print(f"[call_chat_completions] attempt {attempt+1} network error: {exc}, retrying…")
+ continue
+
+ # All attempts exhausted — fall back to Groq.
+ print(f"[call_chat_completions] all attempts failed ({last_exc}), falling back to Groq")
+ fb_url, fb_key = _get_provider_url_and_key(FALLBACK_PROVIDER)
+ fb_headers = {"Authorization": f"Bearer {fb_key}", "Content-Type": "application/json"}
+ fallback_body = {
+ "model": FALLBACK_MODEL,
+ "messages": messages,
+ "stream": False,
+ }
+ if extra_body:
+ # Forward tools/tool_choice but not stream override.
+ for k in ("tools", "tool_choice"):
+ if k in extra_body:
+ fallback_body[k] = extra_body[k]
+
+ async with httpx.AsyncClient(timeout=httpx.Timeout(120.0)) as client:
+ fb_r = await client.post(fb_url, json=fallback_body, headers=fb_headers)
+
+ if fb_r.status_code != 200:
+ raise HTTPException(
+ status_code=fb_r.status_code,
+ detail=f"Primary and fallback both failed. Fallback: {fb_r.text[:500]}",
+ )
+ return fb_r.json()


  def _extract_text_from_response(data: Dict[str, Any]) -> str:
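Note: the per-chunk deadline is the heart of the stall detection above. A minimal standalone sketch of the same idiom, with a plain async generator standing in for r.aiter_lines() (illustrative only, not part of this diff):

    import asyncio

    async def slow_source():
        for i in range(3):
            await asyncio.sleep(0.1 if i < 2 else 5)  # final chunk "stalls"
            yield f"chunk-{i}"

    async def read_with_deadline(source, chunk_timeout=1.0):
        it = source.__aiter__()
        received = []
        while True:
            try:
                item = await asyncio.wait_for(it.__anext__(), timeout=chunk_timeout)
            except asyncio.TimeoutError:
                return received, True   # stalled: caller retries the whole request
            except StopAsyncIteration:
                return received, False  # stream finished normally
            received.append(item)

    print(asyncio.run(read_with_deadline(slow_source())))
    # (['chunk-0', 'chunk-1'], True)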
@@ -253,6 +437,65 @@ def is_cinematic_image_prompt(prompt: str) -> bool:
  return False


+ def _is_api_key_request(request: Request) -> bool:
+ """
+ Return True when the caller authenticated with an API key rather than a
+ session cookie / browser auth. We use this to decide whether to forward
+ think-tag / reasoning_content tokens to the client.
+ """
+ return bool(
+ request.headers.get(API_KEY_HEADER)
+ or request.headers.get("authorization", "").lower().startswith("bearer ")
+ )
+
+
+ def _inject_reasoning_into_chunk(obj: Dict[str, Any]) -> Dict[str, Any]:
+ """
+ Some navy models return thinking tokens in a non-standard
+ ``reasoning_content`` field inside each delta. When that field is
+ present we wrap it in <think>…</think> and prepend it to the regular
+ ``content`` delta so that every SSE-speaking client sees a single,
+ unified text stream.
+
+ The original ``reasoning_content`` field is preserved so clients that
+ know about it can still use it directly.
+ """
+ try:
+ delta = obj["choices"][0]["delta"]
+ except (KeyError, IndexError, TypeError):
+ return obj
+
+ reasoning = delta.get("reasoning_content") or delta.get("reasoning") or ""
+ content = delta.get("content") or ""
+
+ if reasoning and isinstance(reasoning, str):
+ # Wrap in <think> tags and prepend to the visible content delta.
+ wrapped = f"<think>{reasoning}</think>"
+ delta["content"] = wrapped + content
+ # Keep the raw field so native clients can parse it too.
+ delta["reasoning_content"] = reasoning
+ obj["choices"][0]["delta"] = delta
+
+ return obj
+
+
+ def _normalize_usage_block(obj: Dict[str, Any]) -> Dict[str, Any]:
+ """Rewrite the usage block to a canonical shape (in-place, returns obj)."""
+ if "usage" not in obj or not isinstance(obj.get("usage"), dict):
+ return obj
+ u = obj["usage"]
+ input_tok = u.get("prompt_tokens") or u.get("input_tokens", 0)
+ output_tok = u.get("completion_tokens") or u.get("output_tokens", 0)
+ obj["usage"] = {
+ "prompt_tokens": input_tok,
+ "completion_tokens": output_tok,
+ "total_tokens": input_tok + output_tok,
+ "input_tokens": input_tok,
+ "output_tokens": output_tok,
+ }
+ return obj
+
+
  # ──────────────────────────────────────────────
  # IMAGE GENERATION
  # ──────────────────────────────────────────────
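Note: a quick illustration of what _inject_reasoning_into_chunk is expected to do to a single streaming delta, using the field names from the diff (values made up):

    chunk = {
        "choices": [
            {"delta": {"reasoning_content": "check the units first",
                       "content": "About 42 km."}}
        ]
    }
    delta = chunk["choices"][0]["delta"]
    reasoning = delta.get("reasoning_content") or ""
    if reasoning:
        # Prepend the wrapped reasoning; the raw field stays for native clients.
        delta["content"] = f"<think>{reasoning}</think>" + (delta.get("content") or "")
    print(delta["content"])
    # <think>check the units first</think>About 42 km.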
@@ -682,6 +925,10 @@ async def generate_text(

  await _check_chat_rate_limit(request, authorization, x_client_id)

+ # Determine whether the caller is an API-key client that should receive
+ # raw thinking tokens.
+ forward_thinking = _is_api_key_request(request)
+
  body["model"] = chosen_model
  stream = body.get("stream", False)

@@ -744,39 +991,79 @@ async def generate_text(
  sent_metadata = False
  async with httpx.AsyncClient(timeout=None) as client:
  async for chunk in stream_primary(client):
+ # ── emit router metadata once as the very first SSE frame ──
  if not sent_metadata:
- meta = {"router_metadata": {"model_name": MODEL_MAP.get(chosen_model, chosen_model)}}
+ meta = {
+ "router_metadata": {
+ "model_name": MODEL_MAP.get(chosen_model, chosen_model)
+ }
+ }
  yield f"data: {json.dumps(meta)}\n\n"
  sent_metadata = True

- # Intercept the final non-[DONE] data chunk and normalize
- # the usage block so callers always see consistent field names.
- if chunk.startswith("data:") and "[DONE]" not in chunk:
+ # ── pass [DONE] straight through ──────────────────────────
+ if "data: [DONE]" in chunk:
+ yield chunk
+ continue
+
+ # ── process data: … lines ─────────────────────────────────
+ if chunk.startswith("data:"):
  raw = chunk[5:].strip()
  try:
  obj = json.loads(raw)
- if isinstance(obj, dict) and "usage" in obj and isinstance(obj["usage"], dict):
- u = obj["usage"]
- input_tok = u.get("prompt_tokens") or u.get("input_tokens", 0)
- output_tok = u.get("completion_tokens") or u.get("output_tokens", 0)
- obj["usage"] = {
- "prompt_tokens": input_tok,
- "completion_tokens": output_tok,
- "total_tokens": input_tok + output_tok,
- "input_tokens": input_tok,
- "output_tokens": output_tok,
- }
- yield f"data: {json.dumps(obj)}\n\n"
- continue
  except Exception:
- pass
+ # Not valid JSON — forward verbatim (keeps partial
+ # chunks from blocking the stream).
+ yield chunk
+ continue
+
+ if not isinstance(obj, dict):
+ yield chunk
+ continue
+
+ # Normalize usage block whenever it appears.
+ _normalize_usage_block(obj)
+
+ # ── thinking / reasoning tokens ───────────────────────
+ # Navy models may embed thinking in two ways:
+ #
+ # 1. As delta.reasoning_content (separate field)
+ # 2. Inline inside delta.content wrapped in <think>…</think>
+ #
+ # For API-key callers we always surface both forms.
+ # For browser/session callers we strip reasoning_content
+ # so it doesn't confuse UI clients that don't expect it,
+ # but <think> tags already present in content are left
+ # alone (they arrived that way from upstream).
+ if forward_thinking:
+ # Merge reasoning_content into content as
+ # <think>…</think> and keep the raw field.
+ obj = _inject_reasoning_into_chunk(obj)
+ else:
+ # Strip the non-standard field so browser clients
+ # don't see unexpected keys.
+ try:
+ delta = obj["choices"][0]["delta"]
+ delta.pop("reasoning_content", None)
+ delta.pop("reasoning", None)
+ obj["choices"][0]["delta"] = delta
+ except (KeyError, IndexError, TypeError):
+ pass
+
+ yield f"data: {json.dumps(obj)}\n\n"
+ continue

+ # ── any other line (comments, keep-alives, …) ─────────────
  yield chunk

  return StreamingResponse(
  event_generator(),
  media_type="text/event-stream",
- headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"},
+ headers={
+ "Cache-Control": "no-cache",
+ "Connection": "keep-alive",
+ "X-Accel-Buffering": "no",
+ },
  )

  # ── non-streaming ─────────────────────────
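Note: a sketch of what an API-key client sees on the wire after these changes: a router_metadata frame first, then normalized data: chunks, then [DONE]. The URL, model value and key below are placeholders, not taken from this PR:

    import json
    import httpx

    payload = {"model": "auto", "stream": True,
               "messages": [{"role": "user", "content": "hi"}]}
    headers = {"x-api-key": "sk-example"}  # API-key auth, so forward_thinking is True

    with httpx.stream("POST", "http://localhost:8000/v1/chat/completions",
                      json=payload, headers=headers, timeout=None) as r:
        for line in r.iter_lines():
            if not line.startswith("data:"):
                continue                   # skips ": ping" comment frames
            raw = line[5:].strip()
            if raw == "[DONE]":
                break
            obj = json.loads(raw)
            if "router_metadata" in obj:
                print("routed to:", obj["router_metadata"]["model_name"])
                continue
            for choice in obj.get("choices") or []:
                print((choice.get("delta") or {}).get("content") or "", end="")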
@@ -789,7 +1076,11 @@ async def generate_text(
  fb_url, fb_key = _get_provider_url_and_key(FALLBACK_PROVIDER)
  fallback_body = dict(body)
  fallback_body["model"] = FALLBACK_MODEL
- r = await client.post(fb_url, json=fallback_body, headers={"Authorization": f"Bearer {fb_key}"})
+ r = await client.post(
+ fb_url,
+ json=fallback_body,
+ headers={"Authorization": f"Bearer {fb_key}"},
+ )

  content_type = (r.headers.get("content-type") or "").lower()
  if "application/json" in content_type:
@@ -798,22 +1089,35 @@ async def generate_text(
  except Exception:
  payload = {"error": "Upstream returned invalid JSON"}
  else:
- # Normalize usage: upstream may use prompt_tokens/completion_tokens
- # (OpenAI/Groq style) — rewrite to a consistent shape and add
- # router_metadata so callers always see the same fields.
- if "usage" in payload and isinstance(payload["usage"], dict):
- u = payload["usage"]
- input_tok = u.get("prompt_tokens") or u.get("input_tokens", 0)
- output_tok = u.get("completion_tokens") or u.get("output_tokens", 0)
- payload["usage"] = {
- "prompt_tokens": input_tok,
- "completion_tokens": output_tok,
- "total_tokens": input_tok + output_tok,
- # also include the OpenAI Responses-API names for clients that expect them
- "input_tokens": input_tok,
- "output_tokens": output_tok,
- }
- payload.setdefault("router_metadata", {})["model_name"] = MODEL_MAP.get(chosen_model, chosen_model)
+ # Normalize usage fields.
+ _normalize_usage_block(payload)
+
+ # ── thinking tokens in non-streaming responses ────────────────────
+ # Some navy models put thinking content in
+ # message.reasoning_content. For API-key callers we prepend it to
+ # message.content wrapped in <think>…</think>; for others we drop
+ # the non-standard field.
+ try:
+ message = payload["choices"][0]["message"]
+ reasoning = (
+ message.pop("reasoning_content", None)
+ or message.pop("reasoning", None)
+ or ""
+ )
+ if reasoning and isinstance(reasoning, str):
+ if forward_thinking:
+ existing = message.get("content") or ""
+ message["content"] = f"<think>{reasoning}</think>{existing}"
+ # Restore the raw field for clients that want it.
+ message["reasoning_content"] = reasoning
+ # else: already popped — nothing to do.
+ payload["choices"][0]["message"] = message
+ except (KeyError, IndexError, TypeError):
+ pass
+
+ payload.setdefault("router_metadata", {})["model_name"] = MODEL_MAP.get(
+ chosen_model, chosen_model
+ )
  else:
  payload = {
  "error": "Upstream returned non-JSON response",
@@ -1063,8 +1367,24 @@ async def create_responses(
  },
  })

+ # ── Run _generate() in the background, pinging every 15 s ──────────────
+ # Without keepalive bytes, Cloudflare (524) and Codex both drop the
+ # connection while the model is thinking or accumulating tool arguments.
+ # SSE comment lines (": ping") are invisible to application code but
+ # reset every proxy's idle-timeout counter.
+ PING_INTERVAL = 15 # seconds
+ gen_task: asyncio.Task = asyncio.ensure_future(_generate())
+
+ while not gen_task.done():
+ try:
+ await asyncio.wait_for(asyncio.shield(gen_task), timeout=PING_INTERVAL)
+ except asyncio.TimeoutError:
+ yield ": ping\n\n"
+ except Exception:
+ break # real error — handled below
+
  try:
- text, tool_calls, input_tokens, output_tokens = await _generate()
+ text, tool_calls, input_tokens, output_tokens = gen_task.result()
  except HTTPException as exc:
  yield sse("response.failed", {
  "type": "response.failed",
@@ -1076,6 +1396,17 @@ async def create_responses(
  })
  yield "data: [DONE]\n\n"
  return
+ except Exception as exc:
+ yield sse("response.failed", {
+ "type": "response.failed",
+ "response": {
+ "id": response_id, "object": "response",
+ "created_at": ts, "status": "failed", "model": chosen_model,
+ "error": {"code": "upstream_error", "message": str(exc)},
+ },
+ })
+ yield "data: [DONE]\n\n"
+ return

  output_index = 0
 