Spaces:

sharktide
/

lightning

Running

App Files Files Community

sharktide committed on Apr 24

Commit

9f437ef

verified ·

1 Parent(s): 1f8b39e

Update gen.py

Browse files

Files changed (1) hide show

gen.py +308 -0

gen.py CHANGED Viewed

@@ -942,3 +942,311 @@ def return_models_openai():
         }
       ]
     }

         }
       ]
     }
+import time
+import uuid
+# -----------------------------
+# RESPONSES API  (OpenAI-compatible)
+# -----------------------------
def _normalize_responses_input(input_field) -> list[dict]:
    """
    Coerce the Responses API `input` field into a chat-completions messages[] list.

    Accepted shapes:
      • None                         → []
      • str                          → [{"role":"user","content":"..."}]
      • dict (a single input item)   → handled as a one-element list
      • list of input items          → each item translated as described below

    Item translation:
      • bare str                        → user message
      • {"type":"function_call_output"} → role "tool" message keyed by call_id
      • {"type":"function_call"}        → tool_calls entry on an assistant message
      • any other dict                  → message built from `role` (default
                                          "user") and `content` (default "")

    Content-part translation (only when `content` is a list):
      • text / input_text / output_text → {"type":"text","text":...}
      • image_url                       → forwarded in chat shape
      • input_image                     → {"type":"image_url","image_url":{"url":...}}
      • input_audio                     → dropped (not supported downstream)
      • bare str                        → text part
      • anything else                   → forwarded unchanged

    The input objects are never mutated; every message dict is freshly built.

    Raises:
        TypeError: if an input item is neither a dict nor a str.
    """
    if input_field is None:
        return []
    if isinstance(input_field, str):
        return [{"role": "user", "content": input_field}]
    if isinstance(input_field, dict):
        input_field = [input_field]

    text_types = ("text", "input_text", "output_text")
    messages: list[dict] = []
    for item in input_field:
        if isinstance(item, str):
            messages.append({"role": "user", "content": item})
            continue
        if not isinstance(item, dict):
            raise TypeError(
                "`input` items must be objects or strings, "
                f"got {type(item).__name__}"
            )

        item_type = item.get("type")

        # Tool result from a previous turn → chat "tool" message.
        if item_type == "function_call_output":
            messages.append({
                "role": "tool",
                "tool_call_id": item.get("call_id", ""),
                "content": item.get("output", ""),
            })
            continue

        # Replayed model tool call → assistant message with tool_calls.
        if item_type == "function_call":
            tool_call = {
                "id": item.get("call_id") or item.get("id", ""),
                "type": "function",
                "function": {
                    "name": item.get("name", ""),
                    "arguments": item.get("arguments", ""),
                },
            }
            previous = messages[-1] if messages else None
            if previous is not None and previous.get("role") == "assistant":
                # Chat format keeps one assistant turn's text and all of its
                # tool calls on a single message, so attach to that turn.
                previous.setdefault("tool_calls", []).append(tool_call)
            else:
                messages.append(
                    {"role": "assistant", "content": None, "tool_calls": [tool_call]}
                )
            continue

        role = item.get("role", "user")
        content = item.get("content", "")
        if not isinstance(content, list):
            # Plain string (or other scalar) content is passed through.
            messages.append({"role": role, "content": content})
            continue

        parts = []
        for part in content:
            if isinstance(part, str):
                parts.append({"type": "text", "text": part})
                continue
            if not isinstance(part, dict):
                # Forward as-is so nothing is silently dropped.
                parts.append(part)
                continue

            ptype = part.get("type", "")
            if ptype in text_types:
                parts.append({"type": "text", "text": part.get("text", "")})
            elif ptype == "image_url":
                parts.append(
                    {"type": "image_url", "image_url": part.get("image_url", {})}
                )
            elif ptype == "input_image":
                image = part.get("image_url")
                if isinstance(image, str):
                    # Responses carries the URL as a bare string; chat wants
                    # an object with a "url" key.
                    image = {"url": image}
                    if "detail" in part:
                        image["detail"] = part["detail"]
                if isinstance(image, dict):
                    parts.append({"type": "image_url", "image_url": image})
                else:
                    # e.g. file_id-only images have no chat equivalent here.
                    parts.append(part)
            elif ptype == "input_audio":
                # Not supported downstream — skip gracefully.
                continue
            else:
                # Forward unknown parts as-is so nothing is silently dropped.
                parts.append(part)
        messages.append({"role": role, "content": parts})
    return messages
def _wrap_responses_output(chat_payload: dict, model_name: str) -> dict:
    """
    Wrap a standard chat-completions JSON response into the Responses API shape.

    Each choice contributes:
      • one "message" output item when it carries text, or when it has no
        tool calls at all (so a plain reply always yields exactly one item);
      • one "function_call" output item per tool call.

    Args:
        chat_payload: Decoded chat-completions response body. Missing or null
            `choices`, `message`, `tool_calls`, `function` and `usage` fields
            are treated as empty.
        model_name: Value reported in the response's `model` field.

    Returns:
        A Responses-style dict with a fresh `resp_…` id, the current Unix
        timestamp in `created_at`, the translated `output` list and a `usage`
        block mapped from prompt/completion/total token counts (0 if absent).
    """
    output = []
    for choice in chat_payload.get("choices") or []:
        msg = choice.get("message") or {}
        content_text = msg.get("content") or ""
        tool_calls = msg.get("tool_calls") or []

        # Keep assistant text even when tool calls are present; only omit the
        # message item when there is nothing textual to report alongside them.
        if content_text or not tool_calls:
            output.append({
                "type": "message",
                "id": f"msg_{uuid.uuid4().hex[:12]}",
                "role": msg.get("role", "assistant"),
                "content": [
                    {"type": "output_text", "text": content_text, "annotations": []}
                ],
                "status": "completed",
            })

        for tc in tool_calls:
            fn = tc.get("function") or {}
            # Generate the fallback once so `id` and `call_id` always agree.
            call_id = tc.get("id") or f"call_{uuid.uuid4().hex[:8]}"
            output.append({
                "type": "function_call",
                "id": call_id,
                "call_id": call_id,
                "name": fn.get("name", ""),
                "arguments": fn.get("arguments", ""),
                "status": "completed",
            })

    # `usage` may be absent or explicitly null in upstream payloads.
    usage = chat_payload.get("usage") or {}
    return {
        "id": f"resp_{uuid.uuid4().hex[:24]}",
        "object": "response",
        "created_at": int(time.time()),
        "model": model_name,
        "output": output,
        "usage": {
            "input_tokens": usage.get("prompt_tokens") or 0,
            "output_tokens": usage.get("completion_tokens") or 0,
            "total_tokens": usage.get("total_tokens") or 0,
        },
        "status": "completed",
        "error": None,
    }
def _wrap_responses_stream_chunk(
    line: str, response_id: str, model_name: str, sent_created: bool
) -> tuple[str, bool]:
    """
    Translate a single SSE line from chat-completions streaming format
    into Responses API streaming events.

    Handling by line kind:
      • not starting with "data:"      → forwarded with a trailing newline
      • "data: [DONE]"                 → one `response.completed` event
      • payload that is not valid JSON, or not a JSON object
                                       → forwarded with a trailing newline
      • object with "router_metadata"  → re-serialised and passed through
      • chat-completions chunk         → `response.created` (first chunk only),
        then per choice: `response.output_text.delta` for text,
        `response.function_call_arguments.delta` per tool-call fragment, and
        `response.output_text.done` when `finish_reason` is set

    Args:
        line: One SSE line without its line terminator.
        response_id: Id reported in the created/completed events; its last
            12 characters also form the text item id.
        model_name: Model label reported in the created/completed events.
        sent_created: Whether `response.created` has already been emitted.

    Returns:
        (translated_text, sent_created). `translated_text` may be empty when
        the chunk carries nothing to report.
    """
    if not line.startswith("data:"):
        return line + "\n", sent_created

    raw = line[5:].strip()
    if raw == "[DONE]":
        done_event = json.dumps({
            "type": "response.completed",
            "response": {
                "id": response_id,
                "object": "response",
                "model": model_name,
                "status": "completed",
                "output": [],
                "usage": None,
            },
        })
        return f"data: {done_event}\n\n", sent_created

    try:
        chunk = json.loads(raw)
    except json.JSONDecodeError:
        # Forward as-is — could be our router_metadata injection
        return line + "\n", sent_created
    if not isinstance(chunk, dict):
        # Valid JSON but not an object (number, string, list): nothing to
        # translate, and dict-style access below would raise.
        return line + "\n", sent_created

    # router_metadata forwarded from generate_text — pass through unchanged
    if "router_metadata" in chunk:
        return f"data: {json.dumps(chunk)}\n\n", sent_created

    out_lines = []
    if not sent_created:
        created_event = {
            "type": "response.created",
            "response": {
                "id": response_id,
                "object": "response",
                "model": model_name,
                "status": "in_progress",
                "output": [],
            },
        }
        out_lines.append(f"data: {json.dumps(created_event)}\n\n")
        sent_created = True

    # Every text event of this response shares one item id.
    text_item_id = f"msg_{response_id[-12:]}"

    # `choices`, `delta` and `function` may be null in upstream chunks.
    for choice in chunk.get("choices") or []:
        delta = choice.get("delta") or {}

        # Text delta
        text_delta = delta.get("content")
        if text_delta:
            delta_event = {
                "type": "response.output_text.delta",
                "item_id": text_item_id,
                "output_index": 0,
                "content_index": 0,
                "delta": text_delta,
            }
            out_lines.append(f"data: {json.dumps(delta_event)}\n\n")

        # Tool call delta
        for tc in delta.get("tool_calls") or []:
            fn = tc.get("function") or {}
            # A fragment may carry no id (absent or null); report an empty
            # string rather than null so the event fields stay strings.
            call_id = tc.get("id") or ""
            tc_event = {
                "type": "response.function_call_arguments.delta",
                "item_id": call_id,
                "output_index": tc.get("index") or 0,
                "call_id": call_id,
                "delta": fn.get("arguments") or "",
            }
            out_lines.append(f"data: {json.dumps(tc_event)}\n\n")

        if choice.get("finish_reason"):
            done_text_event = {
                "type": "response.output_text.done",
                "item_id": text_item_id,
                "output_index": 0,
                "content_index": 0,
                "text": "",  # full text not echoed here; client accumulates deltas
            }
            out_lines.append(f"data: {json.dumps(done_text_event)}\n\n")

    return "".join(out_lines), sent_created
@router.post("/responses")
async def create_response(
    request: Request,
    authorization: Optional[str] = Header(None),
    x_client_id: Optional[str] = Header(None),
):
    """
    OpenAI Responses API-compatible endpoint.

    Accepts the Responses API request shape, normalises it into the chat
    completions format, routes it through the existing generate_text logic,
    and wraps the result back into the Responses API shape.

    Supported fields:
      • input       (str | list)  — required
      • model       (str)         — accepted but ignored (router decides)
      • stream      (bool)        — optional, default False
      • tools       (list)        — optional, forwarded as-is
      • tool_choice (str|dict)    — optional, forwarded as-is
      • temperature (float)       — optional, forwarded
      • top_p       (float)       — optional, forwarded
      • max_output_tokens (int)   — mapped to max_tokens

    Raises:
        HTTPException(400): the body is not a JSON object, or `input` is
            missing, empty (other than the empty string) or malformed.
        HTTPException(502): the delegated non-streaming response body is not
            a JSON object.
    """
    import codecs

    from starlette.requests import Request as StarletteRequest

    # --- Validate the incoming request ---
    try:
        body = await request.json()
    except ValueError:
        # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        raise HTTPException(400, "Request body must be valid JSON") from None
    if not isinstance(body, dict):
        raise HTTPException(400, "Request body must be a JSON object")

    input_field = body.get("input")
    # An empty string is deliberately accepted; None / [] / {} / 0 are not.
    if not input_field and input_field != "":
        raise HTTPException(400, "`input` is required")
    if not isinstance(input_field, (str, list)):
        raise HTTPException(
            400, "`input` must be a string or a list of message objects"
        )

    # --- Normalise into chat-completions shape ---
    try:
        messages = _normalize_responses_input(input_field)
    except (AttributeError, TypeError):
        # Malformed items (e.g. numbers where objects are expected) are a
        # client error, not a server fault.
        raise HTTPException(
            400, "`input` must be a string or a list of message objects"
        ) from None

    chat_body: dict = {"messages": messages}
    # Forward compatible fields
    for field in ("tools", "tool_choice", "temperature", "top_p", "stream"):
        if field in body:
            chat_body[field] = body[field]
    if "max_output_tokens" in body:
        chat_body["max_tokens"] = body["max_output_tokens"]

    # generate_text reads its payload from the request it is given, so build
    # a synthetic request that replays the translated body with a matching
    # content-length header.
    raw_body = json.dumps(chat_body).encode()
    scope = dict(request.scope)
    scope["path"] = "/gen/chat/completions"
    scope["headers"] = [
        (k, v) for k, v in request.scope["headers"]
        if k.lower() != b"content-length"
    ] + [(b"content-length", str(len(raw_body)).encode())]

    async def new_receive():
        return {"type": "http.request", "body": raw_body, "more_body": False}

    sub_request = StarletteRequest(scope, new_receive)

    # --- Delegate to generate_text ---
    response = await generate_text(sub_request, authorization, x_client_id)

    # --- Streaming path ---
    # Only taken when the delegate really returned a streaming response; a
    # plain response (e.g. an upstream error) has no body_iterator and is
    # handled by the non-streaming path below.
    if chat_body.get("stream") and hasattr(response, "body_iterator"):
        response_id = f"resp_{uuid.uuid4().hex[:24]}"
        model_label = MODEL_MAP.get("lightning", "lightning")

        async def responses_stream():
            sent_created = False
            # Chunk boundaries are arbitrary: decode incrementally so split
            # multi-byte characters survive, and hold back the trailing
            # partial line until its newline arrives.
            decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
            pending = ""
            async for chunk in response.body_iterator:
                if not isinstance(chunk, str):
                    chunk = decoder.decode(bytes(chunk))
                pending += chunk
                *complete_lines, pending = pending.split("\n")
                for line in complete_lines:
                    translated, sent_created = _wrap_responses_stream_chunk(
                        line.rstrip("\r"), response_id, model_label, sent_created
                    )
                    if translated:
                        yield translated
            # Flush whatever is left once the upstream iterator is exhausted.
            pending += decoder.decode(b"", final=True)
            if pending:
                translated, sent_created = _wrap_responses_stream_chunk(
                    pending.rstrip("\r"), response_id, model_label, sent_created
                )
                if translated:
                    yield translated

        return StreamingResponse(
            responses_stream(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no",
            },
        )

    # --- Non-streaming path ---
    if hasattr(response, "body"):
        raw = response.body
    else:
        collected = []
        async for chunk in response.body_iterator:
            collected.append(chunk if isinstance(chunk, bytes) else chunk.encode())
        raw = b"".join(collected)
    try:
        chat_payload = json.loads(raw)
    except json.JSONDecodeError:
        raise HTTPException(502, "Upstream returned unparseable JSON")

    # Pass upstream errors through untouched.
    if response.status_code >= 400:
        return JSONResponse(status_code=response.status_code, content=chat_payload)
    if not isinstance(chat_payload, dict):
        raise HTTPException(502, "Upstream returned an unexpected JSON payload")

    model_label = MODEL_MAP.get(
        chat_payload.get("model", ""),
        chat_payload.get("model", "lightning"),
    )
    return JSONResponse(
        status_code=200,
        content=_wrap_responses_output(chat_payload, model_label),
    )