# testr / app.py
# ngandugilbert — Update app.py (2d8de14 verified)
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List, Literal, Optional
from huggingface_hub import InferenceClient
from fastapi.responses import JSONResponse
import uuid
import time
import uvicorn
# Module-level singletons shared by all request handlers:
# the FastAPI application and a Hugging Face Inference API client
# pinned to the zephyr-7b-beta chat model.
app = FastAPI()
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
# OpenAI-compatible request message
class Message(BaseModel):
    """A single chat message in the OpenAI chat-completions schema."""

    role: Literal["system", "user", "assistant"]  # who produced the message
    content: str  # the message text
# OpenAI-compatible request body
class ChatCompletionRequest(BaseModel):
    """Request body mirroring OpenAI's POST /v1/chat/completions payload."""

    model: str = "zephyr-7b-beta"  # advisory only: the backend model is fixed at import time
    messages: List[Message]  # conversation history, oldest first
    temperature: Optional[float] = 0.7  # sampling temperature
    top_p: Optional[float] = 0.95  # nucleus-sampling cutoff
    max_tokens: Optional[int] = 512  # cap on generated tokens
    stream: Optional[bool] = False  # NOTE(review): accepted for schema compatibility but not honored by the handler
# OpenAI-compatible response message
class Choice(BaseModel):
    """One completion candidate inside the response's ``choices`` list."""

    index: int  # position within the choices list
    message: Message  # the generated assistant message
    finish_reason: Optional[str] = "stop"  # always left at "stop"; never taken from the backend response
# OpenAI-compatible full response
class ChatCompletionResponse(BaseModel):
    """Top-level response mirroring OpenAI's ``chat.completion`` object."""

    id: str  # "chatcmpl-<hex>" identifier minted per request
    object: str = "chat.completion"  # fixed OpenAI object tag
    created: int  # Unix timestamp, seconds
    model: str  # echoed back from the request
    choices: List[Choice]  # completion candidates (always one here)
@app.get("/")
async def home():
    """Liveness probe: confirms the service is up and responding."""
    status_message = "Hello, I am running"
    return status_message
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
    """Proxy an OpenAI-style chat completion request to the HF Inference API.

    Converts the incoming OpenAI-schema messages into the plain-dict form the
    ``huggingface_hub`` client expects, streams the generation, and wraps the
    accumulated text in an OpenAI-compatible ``chat.completion`` response.

    Returns:
        JSONResponse serializing a ChatCompletionResponse with one choice.
    """
    # Build HuggingFace-style message list from the validated pydantic models.
    messages = [{"role": m.role, "content": m.content} for m in request.messages]

    # BUG FIX: the original passed stream=False while the loop below reads
    # chunk.choices[0].delta — delta chunks only exist on a *streamed*
    # response (the non-stream return is a single object whose choices carry
    # .message, not .delta). Stream and accumulate the deltas instead.
    response_text = ""
    for chunk in client.chat_completion(
        messages,
        max_tokens=request.max_tokens,
        temperature=request.temperature,
        top_p=request.top_p,
        stream=True,
    ):
        delta = chunk.choices[0].delta.content
        if delta:  # the final streamed chunk may carry content=None
            response_text += delta

    # Build OpenAI-style response envelope around the generated text.
    chat_response = ChatCompletionResponse(
        id=f"chatcmpl-{uuid.uuid4().hex}",
        created=int(time.time()),
        model=request.model,
        choices=[
            Choice(
                index=0,
                message=Message(role="assistant", content=response_text),
            )
        ],
    )
    # NOTE(review): .dict() is the pydantic v1 spelling; if this Space runs
    # pydantic v2 it still works but is deprecated in favor of .model_dump().
    return JSONResponse(content=chat_response.dict())