# modell / app.py
# Hugging Face Space file header (scrape residue): ngandugilbert — "Update app.py", commit 1dfb74e (verified)
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List, Literal, Optional
from huggingface_hub import InferenceClient
from fastapi.responses import JSONResponse
import uuid
import time
import uvicorn
# FastAPI application instance serving the OpenAI-compatible API.
app = FastAPI()
# Hugging Face Inference API client, pinned to the Zephyr-7B chat model.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
# OpenAI-compatible request message
class Message(BaseModel):
    """A single chat message, mirroring the OpenAI chat message schema."""
    # Sender of the message; restricted to the three OpenAI roles.
    role: Literal["system", "user", "assistant"]
    # Plain-text content of the message.
    content: str
# OpenAI-compatible request body
class ChatCompletionRequest(BaseModel):
    """Request body for /v1/chat/completions, OpenAI-compatible."""
    # Model name echoed back in the response; the backend model is fixed.
    model: str = "zephyr-7b-beta"
    # Conversation history, oldest first.
    messages: List[Message]
    # Sampling temperature forwarded to the inference client.
    temperature: Optional[float] = 0.7
    # Nucleus-sampling cutoff forwarded to the inference client.
    top_p: Optional[float] = 0.95
    # Maximum number of tokens to generate.
    max_tokens: Optional[int] = 512
    # Accepted for API compatibility; the endpoint always returns a full response.
    stream: Optional[bool] = False
# OpenAI-compatible response message
class Choice(BaseModel):
    """One completion choice, mirroring the OpenAI response schema."""
    # Position of this choice in the choices list (always 0 here).
    index: int
    # The assistant message produced by the model.
    message: Message
    # Why generation stopped; defaults to "stop".
    finish_reason: Optional[str] = "stop"
# OpenAI-compatible full response
class ChatCompletionResponse(BaseModel):
    """Full OpenAI-compatible chat completion response."""
    # Unique identifier, e.g. "chatcmpl-<hex>".
    id: str
    # OpenAI object tag; fixed for this endpoint.
    object: str = "chat.completion"
    # Unix timestamp (seconds) of response creation.
    created: int
    # Model name echoed from the request.
    model: str
    # Generated completion choices (a single choice here).
    choices: List[Choice]
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
    """OpenAI-compatible chat completion endpoint.

    Forwards the request's messages to the Hugging Face Inference API and
    wraps the generated reply in an OpenAI-style ChatCompletionResponse.

    NOTE: streaming is not implemented — `request.stream` is ignored and a
    full (non-streaming) completion is always returned.
    """
    # Convert the validated pydantic messages into plain dicts for the client.
    messages = [{"role": m.role, "content": m.content} for m in request.messages]

    # BUG FIX: with stream=False, chat_completion() returns a single
    # ChatCompletionOutput, not an iterable of stream chunks. The original
    # code iterated it and read `chunk.choices[0].delta.content`, which is a
    # stream-only attribute (and `.content` may be None, breaking `+=`).
    completion = client.chat_completion(
        messages,
        max_tokens=request.max_tokens,
        temperature=request.temperature,
        top_p=request.top_p,
        stream=False,
    )
    # Non-streaming responses carry the full text in message.content.
    response_text = completion.choices[0].message.content or ""
    # Mirror the upstream finish reason when the client reports one.
    finish_reason = getattr(completion.choices[0], "finish_reason", None) or "stop"

    # Build the OpenAI-style response envelope.
    chat_response = ChatCompletionResponse(
        id=f"chatcmpl-{uuid.uuid4().hex}",
        created=int(time.time()),
        model=request.model,
        choices=[
            Choice(
                index=0,
                message=Message(role="assistant", content=response_text),
                finish_reason=finish_reason,
            )
        ],
    )
    # .dict() works on both pydantic v1 and v2 (deprecated-but-present in v2).
    return JSONResponse(content=chat_response.dict())
# Run this file directly
# Entry point when executed directly (not via an external ASGI server).
if __name__ == "__main__":
    # reload=True restarts the server on code changes — development use only.
    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)