Update main.py
main.py
CHANGED
@@ -16,7 +16,7 @@ if not REPLICATE_API_TOKEN:
     raise ValueError("REPLICATE_API_TOKEN environment variable not set.")
 
 # FastAPI Init
-app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="4.
+app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="4.2.0 (Prompt Format Fixed)")
 
 # --- Pydantic Models ---
 class ModelCard(BaseModel):
@@ -29,47 +29,52 @@ class OpenAIChatCompletionRequest(BaseModel):
     model: str; messages: List[ChatMessage]; temperature: Optional[float] = 0.7; top_p: Optional[float] = 1.0; max_tokens: Optional[int] = None; stream: Optional[bool] = False
 
 # --- Supported Models ---
-# Maps OpenAI-friendly names to Replicate model paths
 SUPPORTED_MODELS = {
     "llama3-8b-instruct": "meta/meta-llama-3-8b-instruct",
     "claude-4.5-haiku": "anthropic/claude-4.5-haiku"
-    # You can add more models here
 }
 
 # --- Core Logic ---
 def prepare_replicate_input(request: OpenAIChatCompletionRequest) -> Dict[str, Any]:
     """
-    Formats the input for Replicate API
+    Formats the input for Replicate API. This function now correctly builds a
+    single prompt string from the message history, which is required by
+    Replicate's endpoints for models like Claude and Llama 3.
     """
     payload = {}
 
-    # ---
-
-    # the 'messages' array directly, just like OpenAI.
-    # We no longer need to flatten the conversation into a single prompt string.
-
-    # Extract system prompt if it exists, as some models take it as a separate parameter.
-    messages_for_payload = []
+    # --- PROMPT FORMAT FIX START ---
+    prompt_parts = []
     system_prompt = None
+
     for msg in request.messages:
         if msg.role == "system":
-            #
+            # Extract system prompt, as it's a separate parameter for many models
            system_prompt = str(msg.content)
-
-            #
-
-
-
-
+        elif msg.role == "user":
+            # Format user messages
+            content = msg.content
+            if isinstance(content, list):  # Handle potential future vision models
+                text_parts = [item.get("text", "") for item in content if item.get("type") == "text"]
+                content = " ".join(text_parts)
+            prompt_parts.append(f"User: {content}")
+        elif msg.role == "assistant":
+            # Format assistant messages
+            prompt_parts.append(f"Assistant: {msg.content}")
+
+    # Add the final "Assistant:" turn to prompt the model for a response.
+    # This is a standard convention for many chat models when using a single prompt string.
+    prompt_parts.append("Assistant:")
+
+    # The main input is a single 'prompt' string with turns separated by newlines.
+    payload["prompt"] = "\n\n".join(prompt_parts)
 
-    # Add system_prompt to the payload if it was found.
     if system_prompt:
         payload["system_prompt"] = system_prompt
 
-    # ---
+    # --- PROMPT FORMAT FIX END ---
 
     # Map common OpenAI parameters to Replicate equivalents
-    # Note: Replicate's parameter for max tokens is often 'max_new_tokens'
     if request.max_tokens: payload["max_new_tokens"] = request.max_tokens
     if request.temperature: payload["temperature"] = request.temperature
     if request.top_p: payload["top_p"] = request.top_p
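For a concrete picture of what the new flattening produces, here is a minimal standalone sketch of the same convention, using plain dicts in place of the ChatMessage Pydantic model:

# Sketch of the prompt-flattening convention introduced above.
messages = [
    {"role": "system", "content": "You are terse."},
    {"role": "user", "content": "Hi!"},
    {"role": "assistant", "content": "Hello."},
    {"role": "user", "content": "What is 2 + 2?"},
]

prompt_parts = []
system_prompt = None
for msg in messages:
    if msg["role"] == "system":
        system_prompt = str(msg["content"])  # sent separately as payload["system_prompt"]
    elif msg["role"] == "user":
        prompt_parts.append(f"User: {msg['content']}")
    elif msg["role"] == "assistant":
        prompt_parts.append(f"Assistant: {msg['content']}")
prompt_parts.append("Assistant:")  # trailing turn cues the model to respond

print("\n\n".join(prompt_parts))
# User: Hi!
#
# Assistant: Hello.
#
# User: What is 2 + 2?
#
# Assistant: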
@@ -78,13 +83,11 @@ def prepare_replicate_input(request: OpenAIChatCompletionRequest) -> Dict[str, Any]:
 
 async def stream_replicate_sse(replicate_model_id: str, input_payload: dict):
     """Handles the full streaming lifecycle using standard Replicate endpoints."""
-    # 1. Start Prediction
     url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
 
     async with httpx.AsyncClient(timeout=60.0) as client:
         try:
-            # Request a streaming prediction
             response = await client.post(url, headers=headers, json={"input": input_payload, "stream": True})
             response.raise_for_status()
             prediction = response.json()
@@ -98,15 +101,13 @@ async def stream_replicate_sse(replicate_model_id: str, input_payload: dict):
         except httpx.HTTPStatusError as e:
             error_details = e.response.text
             try:
-                # Try to parse the error for a cleaner message
                 error_json = e.response.json()
                 error_details = error_json.get("detail", error_details)
             except json.JSONDecodeError:
-                pass
+                pass
             yield json.dumps({"error": {"message": f"Upstream Error: {error_details}", "type": "replicate_error"}})
             return
 
-        # 2. Connect to the provided Stream URL and process Server-Sent Events (SSE)
         try:
             async with client.stream("GET", stream_url, headers={"Accept": "text/event-stream"}, timeout=None) as sse:
                 current_event = None
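The loop in the next hunk pairs each `event:` line with the `data:` line that follows it. A toy offline illustration of that pairing, using a canned transcript rather than a live Replicate stream:

# Canned SSE transcript illustrating the event/data pairing parsed below.
sample = [
    "event: output",
    "data: Hel",
    "",
    "event: output",
    "data: lo!",
    "",
    "event: done",
    "data: {}",
]

current_event = None
for line in sample:
    if line.startswith("event:"):
        current_event = line[len("event:"):].strip()
    elif line.startswith("data:"):
        data = line[len("data:"):].strip()
        if current_event == "output" and data:
            print(data, end="")  # prints "Hello!"
        elif current_event == "done":
            break
print()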
@@ -117,9 +118,7 @@
                     data = line[len("data:"):].strip()
 
                     if current_event == "output":
-
-                        # We don't need to parse it as JSON.
-                        if data: # Ensure we don't send empty chunks
+                        if data:
                             chunk = {
                                 "id": prediction_id, "object": "chat.completion.chunk", "created": int(time.time()), "model": replicate_model_id,
                                 "choices": [{"index": 0, "delta": {"content": data}, "finish_reason": None}]
@@ -127,21 +126,16 @@
                             yield json.dumps(chunk)
 
                     elif current_event == "done":
-                        # The 'done' event signals the end of the stream.
                         break
         except httpx.ReadTimeout:
-            # Handle cases where the stream times out
             yield json.dumps({"error": {"message": "Stream timed out.", "type": "timeout_error"}})
             return
 
-
-        # 3. Send the final termination chunk in OpenAI format
         final_chunk = {
             "id": prediction_id, "object": "chat.completion.chunk", "created": int(time.time()), "model": replicate_model_id,
             "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]
         }
         yield json.dumps(final_chunk)
-        # Some clients (like curl) expect a final "[DONE]" message to close the connection.
         yield "[DONE]"
 
 # --- Endpoints ---
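To smoke-test the streaming path end to end, a client along these lines works. The base URL and the /v1/chat/completions route are assumptions for illustration; the route decorator lives outside this diff:

import json
import httpx

payload = {
    "model": "llama3-8b-instruct",
    "messages": [{"role": "user", "content": "Say hi"}],
    "stream": True,
}

# Assumed local dev server; adjust host/port/route to match your deployment.
with httpx.stream("POST", "http://localhost:8000/v1/chat/completions",
                  json=payload, timeout=None) as r:
    for line in r.iter_lines():
        if not line.startswith("data:"):
            continue
        data = line[len("data:"):].strip()
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        # Error payloads carry no "choices", so iterate defensively.
        for choice in chunk.get("choices", []):
            print(choice["delta"].get("content", ""), end="", flush=True)
print()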
@@ -160,34 +154,21 @@ async def create_chat_completion(request: OpenAIChatCompletionRequest):
     replicate_input = prepare_replicate_input(request)
 
     if request.stream:
-        # Return a streaming response
         return EventSourceResponse(stream_replicate_sse(replicate_id, replicate_input), media_type="text/event-stream")
 
     # Non-streaming fallback
     url = f"https://api.replicate.com/v1/models/{replicate_id}/predictions"
-    headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json", "Prefer": "wait=120"}
+    headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json", "Prefer": "wait=120"}
     async with httpx.AsyncClient() as client:
         try:
             resp = await client.post(url, headers=headers, json={"input": replicate_input}, timeout=130.0)
             resp.raise_for_status()
             pred = resp.json()
-            # The output of chat models is typically a list of strings (tokens)
             output = "".join(pred.get("output", []))
             return {
-                "id": pred.get("id"),
-                "object": "chat.completion",
-                "created": int(time.time()),
-                "model": request.model,
-                "choices": [{
-                    "index": 0,
-                    "message": {"role": "assistant", "content": output},
-                    "finish_reason": "stop"
-                }],
-                "usage": { # Placeholder usage object
-                    "prompt_tokens": 0,
-                    "completion_tokens": 0,
-                    "total_tokens": 0
-                }
+                "id": pred.get("id"), "object": "chat.completion", "created": int(time.time()), "model": request.model,
+                "choices": [{"index": 0, "message": {"role": "assistant", "content": output}, "finish_reason": "stop"}],
+                "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
             }
         except httpx.HTTPStatusError as e:
             raise HTTPException(status_code=e.response.status_code, detail=f"Error from Replicate API: {e.response.text}")
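And the non-streaming path, under the same assumptions (local server, OpenAI-style route). Note that the handler above returns a placeholder usage object with zero token counts:

import httpx

resp = httpx.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "claude-4.5-haiku",
        "messages": [{"role": "user", "content": "One-word greeting"}],
    },
    timeout=150.0,  # the handler itself waits up to ~130s on Replicate
)
resp.raise_for_status()
body = resp.json()
print(body["choices"][0]["message"]["content"])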