Spaces:

rkihacker
/

R2OAI

Paused

App Files Files Community

rkihacker committed on Oct 21

Commit

7b0c05f

verified ·

1 Parent(s): 0f4286e

Update main.py

Browse files

Files changed (1) hide show

main.py +111 -99

main.py CHANGED Viewed

@@ -1,14 +1,13 @@
 import os
 import httpx
 import json
 import time
 from fastapi import FastAPI, HTTPException
-from fastapi.responses import JSONResponse
 from pydantic import BaseModel, Field
 from typing import List, Dict, Any, Optional, Union, Literal
 from dotenv import load_dotenv
-from sse_starlette.sse import EventSourceResponse
 # Load environment variables
 load_dotenv()
@@ -17,23 +16,36 @@ if not REPLICATE_API_TOKEN:
     raise ValueError("REPLICATE_API_TOKEN environment variable not set.")
 # FastAPI Init
-app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="9.0.0 (Definitive Streaming Fix)")
 # --- Pydantic Models ---
 class ModelCard(BaseModel):
-    id: str; object: str = "model"; created: int = Field(default_factory=lambda: int(time.time())); owned_by: str = "replicate"
 class ModelList(BaseModel):
-    object: str = "list"; data: List[ModelCard] = []
 class ChatMessage(BaseModel):
-    role: Literal["system", "user", "assistant", "tool"]; content: Union[str, List[Dict[str, Any]]]
 class OpenAIChatCompletionRequest(BaseModel):
-    model: str; messages: List[ChatMessage]; temperature: Optional[float] = 0.7; top_p: Optional[float] = 1.0; max_tokens: Optional[int] = None; stream: Optional[bool] = False
 # --- Supported Models ---
 SUPPORTED_MODELS = {
     "llama3-8b-instruct": "meta/meta-llama-3-8b-instruct",
-    "claude-4.5-haiku": "anthropic/claude-4.5-haiku",
-    "claude-4.5-sonnet": "anthropic/claude-4.5-sonnet",
     "llava-13b": "yorickvp/llava-13b:e272157381e2a3bf12df3a8edd1f38d1dbd736bbb7437277c8b34175f8fce358"
 }
@@ -80,140 +92,137 @@ def prepare_replicate_input(request: OpenAIChatCompletionRequest) -> Dict[str, A
     return payload
-async def stream_replicate_sse(replicate_model_id: str, input_payload: dict):
-    """Handles the full streaming lifecycle with correct whitespace preservation."""
     url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
     async with httpx.AsyncClient(timeout=60.0) as client:
         try:
             response = await client.post(url, headers=headers, json={"input": input_payload, "stream": True})
             response.raise_for_status()
             prediction = response.json()
             stream_url = prediction.get("urls", {}).get("stream")
-            prediction_id = prediction.get("id", "stream-unknown")
             if not stream_url:
-                yield f"data: {json.dumps({'error': {'message': 'Model did not return a stream URL.'}})}\n\n"
                 return
         except httpx.HTTPStatusError as e:
             error_details = e.response.text
             try:
                 error_json = e.response.json()
                 error_details = error_json.get("detail", error_details)
             except json.JSONDecodeError: pass
-            yield f"data: {json.dumps({'error': {'message': f'Upstream Error: {error_details}', 'type': 'replicate_error'}})}\n\n"
             return
         try:
             async with client.stream("GET", stream_url, headers={"Accept": "text/event-stream"}, timeout=None) as sse:
                 current_event = None
                 async for line in sse.aiter_lines():
-                    if not line: # Skip empty lines
                         continue
                     if line.startswith("event:"):
                         current_event = line[len("event:"):].strip()
                     elif line.startswith("data:"):
-                        # FIXED: Preserve all whitespace including leading/trailing spaces
-                        raw_data = line[5:]  # Remove "data:" prefix
-                        # Handle empty data lines (preserve them)
-                        if not raw_data:
-                            continue
-                        # Remove only the optional single space after data: if present
-                        # This is per SSE spec and preserves actual content spaces
-                        if raw_data.startswith(" "):
-                            data_content = raw_data[1:]  # Remove the first space only
-                        else:
-                            data_content = raw_data
                         if current_event == "output":
-                            if not data_content:
                                 continue
                             content_token = ""
                             try:
-                                # Handle JSON-encoded strings properly (including spaces)
-                                content_token = json.loads(data_content)
                             except (json.JSONDecodeError, TypeError):
-                                # Handle plain text tokens (preserve as-is)
-                                content_token = data_content
-                            # Create chunk with exact format you specified
                             chunk = {
                                 "choices": [{
                                     "delta": {"content": content_token},
                                     "finish_reason": None,
-                                    "index": 0,
                                     "logprobs": None,
                                     "native_finish_reason": None
-                                }],
-                                "created": int(time.time()),
-                                "id": f"gen-{int(time.time())}-{prediction_id[-12:]}",  # Format like your example
-                                "model": replicate_model_id,
-                                "object": "chat.completion.chunk",
-                                "provider": "Anthropic" if "anthropic" in replicate_model_id else "Replicate"
                             }
                             yield f"data: {json.dumps(chunk)}\n\n"
                         elif current_event == "done":
-                            # Send usage chunk before done
-                            usage_chunk = {
-                                "choices": [{
-                                    "delta": {},
-                                    "finish_reason": None,
-                                    "index": 0,
-                                    "logprobs": None,
-                                    "native_finish_reason": None
-                                }],
-                                "created": int(time.time()),
-                                "id": f"gen-{int(time.time())}-{prediction_id[-12:]}",
-                                "model": replicate_model_id,
-                                "object": "chat.completion.chunk",
-                                "provider": "Anthropic" if "anthropic" in replicate_model_id else "Replicate",
-                                "usage": {
-                                    "cache_discount": 0,
-                                    "completion_tokens": 0,
-                                    "completion_tokens_details": {"image_tokens": 0, "reasoning_tokens": 0},
-                                    "cost": 0,
-                                    "cost_details": {
-                                        "upstream_inference_completions_cost": 0,
-                                        "upstream_inference_cost": None,
-                                        "upstream_inference_prompt_cost": 0
-                                    },
-                                    "input_tokens": 0,
-                                    "is_byok": False,
-                                    "prompt_tokens": 0,
-                                    "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0},
-                                    "total_tokens": 0
-                                }
-                            }
-                            yield f"data: {json.dumps(usage_chunk)}\n\n"
-                            # Send final chunk with stop reason
-                            final_chunk = {
-                                "choices": [{
-                                    "delta": {},
-                                    "finish_reason": "stop",
-                                    "index": 0,
-                                    "logprobs": None,
-                                    "native_finish_reason": "end_turn"
-                                }],
-                                "created": int(time.time()),
-                                "id": f"gen-{int(time.time())}-{prediction_id[-12:]}",
-                                "model": replicate_model_id,
-                                "object": "chat.completion.chunk",
-                                "provider": "Anthropic" if "anthropic" in replicate_model_id else "Replicate"
-                            }
-                            yield f"data: {json.dumps(final_chunk)}\n\n"
                             break
         except httpx.ReadTimeout:
-            yield f"data: {json.dumps({'error': {'message': 'Stream timed out.', 'type': 'timeout_error'}})}\n\n"
             return
-    # Send [DONE] event
     yield "data: [DONE]\n\n"
 # --- Endpoints ---
 @app.get("/v1/models")
 async def list_models():
@@ -224,13 +233,16 @@ async def create_chat_completion(request: OpenAIChatCompletionRequest):
     if request.model not in SUPPORTED_MODELS:
         raise HTTPException(status_code=404, detail=f"Model not found. Available models: {list(SUPPORTED_MODELS.keys())}")
     replicate_input = prepare_replicate_input(request)
     if request.stream:
-        return EventSourceResponse(stream_replicate_sse(SUPPORTED_MODELS[request.model], replicate_input), media_type="text/event-stream")
     # Non-streaming fallback
-    url = f"https://api.replicate.com/v1/models/{SUPPORTED_MODELS[request.model]}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json", "Prefer": "wait=120"}
     async with httpx.AsyncClient() as client:
         try:
@@ -244,4 +256,4 @@ async def create_chat_completion(request: OpenAIChatCompletionRequest):
                 "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
             }
         except httpx.HTTPStatusError as e:
-            raise HTTPException(status_code=e.response.status_code, detail=f"Error from Replicate API: {e.response.text}")

 import os
 import httpx
 import json
 import time
 from fastapi import FastAPI, HTTPException
+from fastapi.responses import Response
 from pydantic import BaseModel, Field
 from typing import List, Dict, Any, Optional, Union, Literal
 from dotenv import load_dotenv
+import asyncio
 # Load environment variables
 load_dotenv()
     raise ValueError("REPLICATE_API_TOKEN environment variable not set.")
 # FastAPI Init
+app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="10.0.0 (Enhanced Chunk Formatting)")
 # --- Pydantic Models ---
 class ModelCard(BaseModel):
+    id: str
+    object: str = "model"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    owned_by: str = "replicate"
 class ModelList(BaseModel):
+    object: str = "list"
+    data: List[ModelCard] = []
 class ChatMessage(BaseModel):
+    role: Literal["system", "user", "assistant", "tool"]
+    content: Union[str, List[Dict[str, Any]]]
 class OpenAIChatCompletionRequest(BaseModel):
+    model: str
+    messages: List[ChatMessage]
+    temperature: Optional[float] = 0.7
+    top_p: Optional[float] = 1.0
+    max_tokens: Optional[int] = None
+    stream: Optional[bool] = False
 # --- Supported Models ---
 SUPPORTED_MODELS = {
     "llama3-8b-instruct": "meta/meta-llama-3-8b-instruct",
+    "claude-4.5-haiku": "anthropic/claude-4.5-haiku", # Note: Name changed for clarity
+    "claude-4.5-sonnet": "anthropic/claude-4.5-sonnet", # Note: Name changed for clarity
     "llava-13b": "yorickvp/llava-13b:e272157381e2a3bf12df3a8edd1f38d1dbd736bbb7437277c8b34175f8fce358"
 }
     return payload
def get_provider(replicate_model_id: str) -> str:
    """Map a Replicate model identifier to a provider label.

    Identifiers that begin with ``meta/`` or ``anthropic/`` map to "Meta" and
    "Anthropic" respectively. Any other identifier containing "llava" maps to
    "Llava"; everything else is labelled "Replicate".
    """
    owner, slash, _ = replicate_model_id.partition("/")
    # Only treat the head as an owner when a "/" actually follows it, so a bare
    # "meta" (no slash) is not mistaken for the "meta/" prefix.
    if slash:
        label = {"meta": "Meta", "anthropic": "Anthropic"}.get(owner)
        if label is not None:
            return label
    return "Llava" if "llava" in replicate_model_id else "Replicate"
def _sse_error_frame(message: str, error_type: Optional[str] = None) -> str:
    """Format an error payload as a single SSE ``data:`` frame."""
    error: Dict[str, Any] = {"message": message}
    if error_type is not None:
        error["type"] = error_type
    return f"data: {json.dumps({'error': error})}\n\n"


def _sse_chunk_frame(
    prediction_id: str,
    model: str,
    provider: str,
    delta: Dict[str, Any],
    finish_reason: Optional[str] = None,
    native_finish_reason: Optional[str] = None,
) -> str:
    """Format one ``chat.completion.chunk`` object as an SSE ``data:`` frame."""
    chunk = {
        "id": prediction_id,
        "object": "chat.completion.chunk",
        "created": int(time.time()),
        "model": model,
        "provider": provider,
        "choices": [{
            "index": 0,
            "delta": delta,
            "finish_reason": finish_reason,
            "logprobs": None,
            "native_finish_reason": native_finish_reason,
        }],
    }
    return f"data: {json.dumps(chunk)}\n\n"


def _decode_sse_token(line: str) -> Optional[str]:
    """Extract the text token carried by one SSE ``data:`` line.

    Returns ``None`` when the line carries no payload at all.
    """
    raw = line[len("data:"):]
    # The SSE framing allows exactly ONE optional space after the colon. Any
    # further leading spaces belong to the token itself (e.g. " world"), so
    # strip a single space only -- never lstrip().
    payload = raw[1:] if raw.startswith(" ") else raw
    if not payload:
        return None
    try:
        decoded = json.loads(payload)
    except (json.JSONDecodeError, TypeError):
        # Plain-text token: forward it verbatim, whitespace included.
        return payload
    # Only a JSON *string* is treated as an encoded token. Text such as "1" or
    # "true" also parses as JSON, but must stay a string in delta.content.
    return decoded if isinstance(decoded, str) else payload


async def stream_replicate_sse(replicate_model_id: str, requested_model_name: str, input_payload: dict):
    """Create a streaming Replicate prediction and re-emit it as chat-completion chunks.

    Args:
        replicate_model_id: Replicate model path used to build the predictions URL.
        requested_model_name: Model name echoed back in every chunk's ``model`` field.
        input_payload: Value sent as the ``input`` of the prediction request.

    Yields:
        Pre-formatted SSE frames (``data: ...`` followed by a blank line): one
        chunk per ``output`` data line, then a final chunk with
        ``finish_reason="stop"`` and a ``data: [DONE]`` terminator. If the
        prediction cannot be created, no stream URL is returned, or the stream
        read times out, a single error frame is yielded and the generator ends
        without the final chunk or terminator.
    """
    url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
    headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
    provider = get_provider(replicate_model_id)

    async with httpx.AsyncClient(timeout=60.0) as client:
        # 1. Create the prediction and obtain the stream URL.
        try:
            response = await client.post(url, headers=headers, json={"input": input_payload, "stream": True})
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            error_details = e.response.text
            try:
                error_json = e.response.json()
            except ValueError:
                # Body is not JSON; fall back to the raw response text.
                error_json = None
            # Guard against non-object JSON bodies (e.g. a list), which have no .get().
            if isinstance(error_json, dict):
                error_details = error_json.get("detail", error_details)
            yield _sse_error_frame(f"Upstream Error: {error_details}", "replicate_error")
            return

        prediction = response.json()
        # "urls" may be present but null; treat that the same as missing.
        stream_url = (prediction.get("urls") or {}).get("stream")
        prediction_id = prediction.get("id", f"stream-{int(time.time())}")
        if not stream_url:
            yield _sse_error_frame("Model did not return a stream URL.")
            return

        # 2. Relay the upstream SSE stream as formatted chunks.
        try:
            # NOTE(review): timeout=None should disable httpx timeouts for this
            # request, so the ReadTimeout handler below is only a safety net.
            async with client.stream("GET", stream_url, headers={"Accept": "text/event-stream"}, timeout=None) as sse:
                current_event = None
                async for line in sse.aiter_lines():
                    if not line:
                        continue
                    if line.startswith("event:"):
                        current_event = line[len("event:"):].strip()
                        continue
                    if not line.startswith("data:"):
                        continue
                    if current_event == "done":
                        break
                    if current_event != "output":
                        continue
                    content_token = _decode_sse_token(line)
                    if content_token is None:
                        continue
                    yield _sse_chunk_frame(
                        prediction_id, requested_model_name, provider, {"content": content_token}
                    )
        except httpx.ReadTimeout:
            yield _sse_error_frame("Stream timed out.", "timeout_error")
            return

    # 3. Close the completion: final chunk with the stop reason, then [DONE].
    yield _sse_chunk_frame(
        prediction_id,
        requested_model_name,
        provider,
        {},
        finish_reason="stop",
        native_finish_reason="end_turn",
    )
    yield "data: [DONE]\n\n"
# A minimal SSE response helper, used instead of sse-starlette's EventSourceResponse.
async def create_sse_response(generator):
    """Wrap an async generator of SSE frames in a streaming HTTP response.

    Args:
        generator: Async iterable yielding already-formatted SSE frames
            (strings ending in a blank line).

    Returns:
        A ``StreamingResponse`` with the ``text/event-stream`` media type that
        sends each frame as it is produced.
    """
    # A plain ``Response`` renders its content eagerly and cannot consume an
    # async generator; ``StreamingResponse`` iterates it chunk by chunk.
    # Imported locally so this helper does not rely on the file's import block.
    from fastapi.responses import StreamingResponse

    headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
    }
    return StreamingResponse(generator, media_type="text/event-stream", headers=headers)
 # --- Endpoints ---
 @app.get("/v1/models")
 async def list_models():
     if request.model not in SUPPORTED_MODELS:
         raise HTTPException(status_code=404, detail=f"Model not found. Available models: {list(SUPPORTED_MODELS.keys())}")
+    replicate_model_id = SUPPORTED_MODELS[request.model]
     replicate_input = prepare_replicate_input(request)
     if request.stream:
+        # Use the custom generator with the detailed chunk format
+        generator = stream_replicate_sse(replicate_model_id, request.model, replicate_input)
+        return await create_sse_response(generator)
     # Non-streaming fallback
+    url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json", "Prefer": "wait=120"}
     async with httpx.AsyncClient() as client:
         try:
                 "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
             }
         except httpx.HTTPStatusError as e:
+            raise HTTPException(status_code=e.response.status_code, detail=f"Error from Replicate API: {e.response.text}")