from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from typing import List, Literal, Optional
from huggingface_hub import InferenceClient
import uuid
import time
import uvicorn

app = FastAPI()
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")


# OpenAI-compatible request message
class Message(BaseModel):
    role: Literal["system", "user", "assistant"]
    content: str


# OpenAI-compatible request body
class ChatCompletionRequest(BaseModel):
    model: str = "zephyr-7b-beta"
    messages: List[Message]
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.95
    max_tokens: Optional[int] = 512
    stream: Optional[bool] = False  # accepted, but this endpoint always replies non-streaming


# OpenAI-compatible response choice
class Choice(BaseModel):
    index: int
    message: Message
    finish_reason: Optional[str] = "stop"


# OpenAI-compatible full response
class ChatCompletionResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[Choice]


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
    # Build a plain dict message list for the Hugging Face client
    messages = [{"role": m.role, "content": m.content} for m in request.messages]

    # Generate the chat completion. With stream=False, chat_completion returns
    # a single ChatCompletionOutput rather than an iterator of delta chunks,
    # so we read the full message directly instead of accumulating deltas.
    response = client.chat_completion(
        messages,
        max_tokens=request.max_tokens,
        temperature=request.temperature,
        top_p=request.top_p,
        stream=False,
    )
    response_text = response.choices[0].message.content

    # Build an OpenAI-style response
    chat_response = ChatCompletionResponse(
        id=f"chatcmpl-{uuid.uuid4().hex}",
        created=int(time.time()),
        model=request.model,
        choices=[
            Choice(
                index=0,
                message=Message(role="assistant", content=response_text),
            )
        ],
    )
    # model_dump() is Pydantic v2; on Pydantic v1 use .dict() instead
    return JSONResponse(content=chat_response.model_dump())


# Run this file directly
if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)
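
# client_example.py -- a minimal sketch (separate, hypothetically named file)
# of calling the server above with the official `openai` SDK. It assumes
# `pip install openai` and that the server is running on localhost:8000;
# the api_key is a dummy value, since the server performs no auth check.
#
# from openai import OpenAI
#
# oai = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
# resp = oai.chat.completions.create(
#     model="zephyr-7b-beta",
#     messages=[{"role": "user", "content": "Say hello in one sentence."}],
# )
# print(resp.choices[0].message.content)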