# R2OAI / main.py
import os
import httpx
import json
import time
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
from typing import List, Dict, Any, Optional, Union, Literal
from dotenv import load_dotenv
import asyncio
# Load environment variables
load_dotenv()
REPLICATE_API_TOKEN = os.getenv("REPLICATE_API_TOKEN")
if not REPLICATE_API_TOKEN:
raise ValueError("REPLICATE_API_TOKEN environment variable not set.")
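# A .env file in the working directory can supply the token; an illustrative
# entry (the value below is a placeholder, not a real token):
#   REPLICATE_API_TOKEN=r8_xxxxxxxxxxxxxxxx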
# FastAPI Init
app = FastAPI(title="Replicate to OpenAI Compatibility Layer", version="10.0.0 (Enhanced Chunk Formatting)")
# --- Pydantic Models ---
class ModelCard(BaseModel):
id: str
object: str = "model"
created: int = Field(default_factory=lambda: int(time.time()))
owned_by: str = "replicate"
class ModelList(BaseModel):
object: str = "list"
data: List[ModelCard] = []
class ChatMessage(BaseModel):
role: Literal["system", "user", "assistant", "tool"]
content: Union[str, List[Dict[str, Any]]]
class OpenAIChatCompletionRequest(BaseModel):
model: str
messages: List[ChatMessage]
temperature: Optional[float] = 0.7
top_p: Optional[float] = 1.0
max_tokens: Optional[int] = None
stream: Optional[bool] = False
# --- Supported Models ---
SUPPORTED_MODELS = {
"llama3-8b-instruct": "meta/meta-llama-3-8b-instruct",
"claude-4.5-haiku": "anthropic/claude-4.5-haiku", # Note: Name changed for clarity
"claude-4.5-sonnet": "anthropic/claude-4.5-sonnet", # Note: Name changed for clarity
"llava-13b": "yorickvp/llava-13b:e272157381e2a3bf12df3a8edd1f38d1dbd736bbb7437277c8b34175f8fce358"
}
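# Clients select a model by its alias key; an illustrative request body for the
# /v1/chat/completions endpoint defined below:
#   {"model": "llama3-8b-instruct", "messages": [{"role": "user", "content": "Hi"}]}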
# --- Core Logic ---
def prepare_replicate_input(request: OpenAIChatCompletionRequest) -> Dict[str, Any]:
"""
Formats the input for Replicate's API, flattening the message history into a
single 'prompt' string and handling images separately.
"""
payload = {}
prompt_parts = []
system_prompt = None
image_input = None
for msg in request.messages:
if msg.role == "system":
system_prompt = str(msg.content)
elif msg.role == "assistant":
prompt_parts.append(f"Assistant: {msg.content}")
elif msg.role == "user":
user_text_content = ""
if isinstance(msg.content, list):
for item in msg.content:
if item.get("type") == "text":
user_text_content += item.get("text", "")
elif item.get("type") == "image_url":
image_url_data = item.get("image_url", {})
image_input = image_url_data.get("url")
else:
user_text_content = str(msg.content)
prompt_parts.append(f"User: {user_text_content}")
prompt_parts.append("Assistant:")
payload["prompt"] = "\n\n".join(prompt_parts)
if system_prompt:
payload["system_prompt"] = system_prompt
if image_input:
payload["image"] = image_input
    # Explicit None checks so falsy-but-valid values (e.g. temperature=0.0) still pass through.
    if request.max_tokens is not None: payload["max_new_tokens"] = request.max_tokens
    if request.temperature is not None: payload["temperature"] = request.temperature
    if request.top_p is not None: payload["top_p"] = request.top_p
return payload
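# Worked example of the flattening above (derived from this function's logic):
#   messages = [{"role": "system", "content": "Be concise"},
#               {"role": "user", "content": "What is 2+2?"}]
# yields
#   payload["system_prompt"] = "Be concise"
#   payload["prompt"] = "User: What is 2+2?\n\nAssistant:"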
def get_provider(replicate_model_id: str) -> str:
"""Infers the provider from the Replicate model ID."""
if replicate_model_id.startswith("meta/"):
return "Meta"
if replicate_model_id.startswith("anthropic/"):
return "Anthropic"
if "llava" in replicate_model_id:
return "Llava"
return "Replicate"
async def stream_replicate_sse(replicate_model_id: str, requested_model_name: str, input_payload: dict):
"""
Handles the full streaming lifecycle with corrected whitespace preservation
and the new, detailed chunk format.
"""
url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json"}
# Identify provider for the response chunks
provider = get_provider(replicate_model_id)
async with httpx.AsyncClient(timeout=60.0) as client:
# 1. Create the prediction and get the stream URL
try:
response = await client.post(url, headers=headers, json={"input": input_payload, "stream": True})
response.raise_for_status()
prediction = response.json()
stream_url = prediction.get("urls", {}).get("stream")
prediction_id = prediction.get("id", f"stream-{int(time.time())}")
if not stream_url:
error_chunk = { "error": {"message": "Model did not return a stream URL."} }
yield f"data: {json.dumps(error_chunk)}\n\n"
return
except httpx.HTTPStatusError as e:
error_details = e.response.text
try:
error_json = e.response.json()
error_details = error_json.get("detail", error_details)
except json.JSONDecodeError: pass
error_chunk = {"error": {"message": f"Upstream Error: {error_details}", "type": "replicate_error"}}
yield f"data: {json.dumps(error_chunk)}\n\n"
return
# 2. Connect to the SSE stream and yield formatted chunks
try:
async with client.stream("GET", stream_url, headers={"Accept": "text/event-stream"}, timeout=None) as sse:
current_event = None
async for line in sse.aiter_lines():
if not line:
continue
if line.startswith("event:"):
current_event = line[len("event:"):].strip()
elif line.startswith("data:"):
# Get the raw payload after "data:"
raw_payload = line[len("data:"):]
                        # The SSE spec allows at most one optional leading space after the colon.
                        # Strip only that single space so genuine leading whitespace in tokens survives.
                        payload = raw_payload[1:] if raw_payload.startswith(" ") else raw_payload
if current_event == "output":
if not payload:
continue
content_token = ""
                            try:
                                # Tokens may arrive as JSON-encoded strings (e.g. "\" Hello\"");
                                # decoding them preserves all whitespace, including single spaces.
                                decoded = json.loads(payload)
                                content_token = decoded if isinstance(decoded, str) else payload
                            except (json.JSONDecodeError, TypeError):
                                # Fallback for plain-text tokens if Replicate changes the format.
                                content_token = payload
# Build the new, detailed chunk structure
chunk = {
"id": prediction_id,
"object": "chat.completion.chunk",
"created": int(time.time()),
"model": requested_model_name,
"provider": provider,
"choices": [{
"index": 0,
"delta": {"content": content_token},
"finish_reason": None,
"logprobs": None,
"native_finish_reason": None
}]
}
yield f"data: {json.dumps(chunk)}\n\n"
elif current_event == "done":
break
except httpx.ReadTimeout:
error_chunk = {"error": {"message": "Stream timed out.", "type": "timeout_error"}}
yield f"data: {json.dumps(error_chunk)}\n\n"
return
# 3. Send the final chunk with finish_reason
final_chunk = {
"id": prediction_id,
"object": "chat.completion.chunk",
"created": int(time.time()),
"model": requested_model_name,
"provider": provider,
"choices": [{
"index": 0,
"delta": {},
"finish_reason": "stop",
"logprobs": None,
"native_finish_reason": "end_turn"
}]
}
yield f"data: {json.dumps(final_chunk)}\n\n"
yield "data: [DONE]\n\n"
# A simple SSE response helper for when sse-starlette is not preferred
async def create_sse_response(generator):
    headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
    }
    async def stream():
        async for chunk in generator:
            yield chunk
            await asyncio.sleep(0)  # Yield control to the event loop between chunks
    # StreamingResponse can consume an async generator; a plain Response cannot.
    return StreamingResponse(stream(), media_type="text/event-stream", headers=headers)
# --- Endpoints ---
@app.get("/v1/models")
async def list_models():
return ModelList(data=[ModelCard(id=k) for k in SUPPORTED_MODELS.keys()])
@app.post("/v1/chat/completions")
async def create_chat_completion(request: OpenAIChatCompletionRequest):
if request.model not in SUPPORTED_MODELS:
raise HTTPException(status_code=404, detail=f"Model not found. Available models: {list(SUPPORTED_MODELS.keys())}")
replicate_model_id = SUPPORTED_MODELS[request.model]
replicate_input = prepare_replicate_input(request)
if request.stream:
# Use the custom generator with the detailed chunk format
generator = stream_replicate_sse(replicate_model_id, request.model, replicate_input)
return await create_sse_response(generator)
# Non-streaming fallback
url = f"https://api.replicate.com/v1/models/{replicate_model_id}/predictions"
headers = {"Authorization": f"Bearer {REPLICATE_API_TOKEN}", "Content-Type": "application/json", "Prefer": "wait=120"}
async with httpx.AsyncClient() as client:
try:
resp = await client.post(url, headers=headers, json={"input": replicate_input}, timeout=130.0)
resp.raise_for_status()
pred = resp.json()
output = "".join(pred.get("output", []))
return {
"id": pred.get("id"), "object": "chat.completion", "created": int(time.time()), "model": request.model,
"choices": [{"index": 0, "message": {"role": "assistant", "content": output}, "finish_reason": "stop"}],
"usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
}
except httpx.HTTPStatusError as e:
raise HTTPException(status_code=e.response.status_code, detail=f"Error from Replicate API: {e.response.text}")
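# Local development entry point: a minimal sketch assuming uvicorn is installed
# (hosted deployments typically launch the server externally instead).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)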