Spaces: this Hugging Face Space currently reports a "Runtime error" status.
| from fastapi import FastAPI | |
| from pydantic import BaseModel | |
| from typing import List, Literal, Optional | |
| from huggingface_hub import InferenceClient | |
| from fastapi.responses import JSONResponse | |
| import uuid | |
| import time | |
| import uvicorn | |
# FastAPI application exposing an OpenAI-compatible chat-completions API.
app = FastAPI()

# Client for the hosted Zephyr-7B chat model on the Hugging Face Inference API.
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
# OpenAI-compatible request message
class Message(BaseModel):
    """A single chat message in the OpenAI chat-completions wire format."""

    role: Literal["system", "user", "assistant"]  # who produced the message
    content: str  # the message text
# OpenAI-compatible request body
class ChatCompletionRequest(BaseModel):
    """OpenAI-compatible request body for a chat-completion call."""

    model: str = "zephyr-7b-beta"       # model name; echoed back in the response
    messages: List[Message]             # conversation history, oldest first
    temperature: Optional[float] = 0.7  # sampling temperature
    top_p: Optional[float] = 0.95       # nucleus-sampling cutoff
    max_tokens: Optional[int] = 512     # completion length cap
    # NOTE(review): `stream` is accepted for API compatibility but the handler
    # always calls the backend with stream=False — confirm this is intended.
    stream: Optional[bool] = False
# OpenAI-compatible response message
class Choice(BaseModel):
    """One completion choice inside an OpenAI-style response."""

    index: int                            # position of this choice in the list
    message: Message                      # the assistant's reply
    finish_reason: Optional[str] = "stop" # why generation ended (always "stop" here)
# OpenAI-compatible full response
class ChatCompletionResponse(BaseModel):
    """OpenAI-compatible top-level chat-completion response envelope."""

    id: str                               # unique completion id ("chatcmpl-...")
    object: str = "chat.completion"       # fixed OpenAI object tag
    created: int                          # unix timestamp of creation
    model: str                            # model name echoed from the request
    choices: List[Choice]                 # generated completions
@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest) -> JSONResponse:
    """Handle an OpenAI-compatible chat-completion request.

    Forwards the message list to the Hugging Face Inference API and wraps the
    model's reply in an OpenAI-style ``chat.completion`` response body.

    NOTE(review): the original had no route decorator, so this handler was
    never registered with the app; ``@app.post("/v1/chat/completions")`` is
    added here — confirm the path matches whatever clients call this Space.

    Parameters:
        request: Parsed OpenAI-style request body (messages + sampling knobs).
            ``request.stream`` is accepted but ignored: the backend is always
            called non-streaming.

    Returns:
        JSONResponse containing the serialized ChatCompletionResponse.
    """
    # Build the plain-dict message list the HF client expects.
    messages = [{"role": m.role, "content": m.content} for m in request.messages]

    # BUG FIX: the original passed stream=False yet iterated the result as a
    # stream of delta chunks (``chunk.choices[0].delta.content``). With
    # stream=False the client returns a single completed response object, not
    # an iterable of chunks, so that loop raised at runtime. Read the final
    # message content directly instead.
    response = client.chat_completion(
        messages,
        max_tokens=request.max_tokens,
        temperature=request.temperature,
        top_p=request.top_p,
        stream=False,
    )
    # ``content`` may be None for an empty completion; normalize to "".
    response_text = response.choices[0].message.content or ""

    # Build the OpenAI-style response envelope.
    chat_response = ChatCompletionResponse(
        id=f"chatcmpl-{uuid.uuid4().hex}",
        created=int(time.time()),
        model=request.model,
        choices=[
            Choice(
                index=0,
                message=Message(role="assistant", content=response_text),
            )
        ],
    )
    return JSONResponse(content=chat_response.dict())
# Run this file directly (e.g. ``python app.py``).
if __name__ == "__main__":
    # NOTE(review): the "app:app" import string assumes this file is named
    # app.py — confirm; reload=True requires the import-string form rather
    # than passing the app object.
    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)