Update main.py
main.py CHANGED
@@ -20,7 +20,7 @@ if not REPLICATE_API_TOKEN:
 # --- FastAPI App Initialization ---
 app = FastAPI(
     title="Replicate to OpenAI Compatibility Layer",
-    version="…
+    version="4.0.0 (Stable & Correct)",
 )

 # --- Pydantic Models ---
@@ -36,61 +36,48 @@ class ChatMessage(BaseModel):
 class OpenAIChatCompletionRequest(BaseModel):
     model: str; messages: List[ChatMessage]; temperature: Optional[float] = 0.7; top_p: Optional[float] = 1.0; max_tokens: Optional[int] = None; stream: Optional[bool] = False

-# --- Model Mapping …
+# --- Model Mapping (Simplified for direct endpoint usage) ---
 SUPPORTED_MODELS = {
     "llama3-8b-instruct": {
         "id": "meta/meta-llama-3-8b-instruct",
-        "version": "02741d1be9a932e6566058d4c92ab80332f143003b5a874f63c9b743e4f3583c",
         "input_type": "messages"
     },
     "claude-4.5-haiku": {
         "id": "anthropic/claude-4.5-haiku",
-        "version": "311c5ff9b9f71c9ebd401b34a41ce604a8b735def3a4aad56f671302b5c56784",
         "input_type": "prompt"
     }
 }

 # --- Helper Functions ---
-…
-…
-    """Builds the complete request body, including the crucial version hash."""
+def prepare_replicate_input(request: OpenAIChatCompletionRequest, model_details: dict) -> Dict[str, Any]:
+    """Prepares the 'input' dictionary for Replicate, handling model-specific formats."""
     input_payload = {}

-    # Handle model-specific input format (prompt vs messages)
     if model_details["input_type"] == "prompt":
         prompt_parts = []
         system_prompt = None
         for msg in request.messages:
-            if msg.role == "system":
-                …
-            elif msg.role == "…
-                …
-            elif msg.role == "assistant":
-                prompt_parts.append(f"Assistant: {msg.content}")
-        prompt_parts.append("Assistant:") # Cue the model to respond
+            if msg.role == "system": system_prompt = str(msg.content)
+            elif msg.role == "user": prompt_parts.append(f"User: {msg.content}")
+            elif msg.role == "assistant": prompt_parts.append(f"Assistant: {msg.content}")
+        prompt_parts.append("Assistant:")
         input_payload["prompt"] = "\n".join(prompt_parts)
         if system_prompt: input_payload["system_prompt"] = system_prompt
     else: # "messages"
         input_payload["messages"] = [msg.dict() for msg in request.messages]

-    # Add common parameters
     if request.max_tokens is not None: input_payload["max_new_tokens"] = request.max_tokens
     if request.temperature is not None: input_payload["temperature"] = request.temperature
     if request.top_p is not None: input_payload["top_p"] = request.top_p
-
-    return {
-        "version": model_details["version"],
-        "input": input_payload
-    }
+    return input_payload

-async def stream_replicate_native_sse(model_id: str, …
-    """Connects to Replicate's native SSE stream …
-
-    url = "https://api.replicate.com/v1/predictions"
+async def stream_replicate_native_sse(model_id: str, input_payload: dict):
+    """Connects to Replicate's native SSE stream using the model-specific endpoint."""
+    url = f"https://api.replicate.com/v1/models/{model_id}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}

-    # …
-    request_body …
+    # The request body is now simple and correct
+    request_body = {"input": input_payload, "stream": True}

     async with httpx.AsyncClient(timeout=300) as client:
         prediction = None
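For reference, a minimal sketch of the two input shapes prepare_replicate_input produces, with plain dicts standing in for the Pydantic ChatMessage model (the example conversation is hypothetical):

    # Plain-dict stand-ins for ChatMessage; the framing mirrors the diff above.
    messages = [
        {"role": "system", "content": "You are terse."},
        {"role": "user", "content": "Hi"},
    ]

    # "prompt" models (input_type == "prompt"), e.g. claude-4.5-haiku:
    prompt_style = {
        "prompt": "User: Hi\nAssistant:",   # flattened turns plus the response cue
        "system_prompt": "You are terse.",  # system message pulled out separately
    }

    # "messages" models (input_type == "messages"), e.g. llama3-8b-instruct:
    messages_style = {"messages": messages}  # passed through unchanged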
@@ -153,18 +140,18 @@ async def create_chat_completion(request: OpenAIChatCompletionRequest):
         raise HTTPException(status_code=404, detail=f"Model not found. Supported models: {list(SUPPORTED_MODELS.keys())}")

     model_details = SUPPORTED_MODELS[model_key]
-    …
+    replicate_input = prepare_replicate_input(request, model_details)

     if request.stream:
-        return EventSourceResponse(stream_replicate_native_sse(model_details["id"], …
+        return EventSourceResponse(stream_replicate_native_sse(model_details["id"], replicate_input))

-    # Synchronous …
-    url = "https://api.replicate.com/v1/predictions"
+    # Synchronous Request
+    url = f"https://api.replicate.com/v1/models/{model_details['id']}/predictions"
     headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json", "Prefer": "wait=120"}

     async with httpx.AsyncClient(timeout=150) as client:
         try:
-            response = await client.post(url, headers=headers, json=…
+            response = await client.post(url, headers=headers, json={"input": replicate_input})
             response.raise_for_status()
             prediction = response.json()
             output = "".join(prediction.get("output", []))
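A hypothetical smoke test for the synchronous path. It assumes the handler above is mounted at POST /v1/chat/completions and that the app is served locally on port 8000; neither the route decorator nor the server command appears in this diff.

    import httpx

    resp = httpx.post(
        "http://localhost:8000/v1/chat/completions",  # assumed route
        json={
            "model": "llama3-8b-instruct",  # a key from SUPPORTED_MODELS
            "messages": [{"role": "user", "content": "Hello"}],
            "max_tokens": 64,
        },
        timeout=150.0,
    )
    resp.raise_for_status()
    print(resp.json())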
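And a hypothetical client for the streaming path, under the same route and port assumptions. EventSourceResponse serves text/event-stream, so the client reads the data: lines off the wire:

    import httpx

    with httpx.stream(
        "POST",
        "http://localhost:8000/v1/chat/completions",  # assumed route
        json={
            "model": "claude-4.5-haiku",
            "messages": [{"role": "user", "content": "Hello"}],
            "stream": True,
        },
        timeout=None,  # streaming predictions can run long
    ) as resp:
        for line in resp.iter_lines():
            if line.startswith("data:"):
                print(line[len("data:"):].strip())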