# NOTE: lines above this file in the original page were HuggingFace Spaces UI
# residue (runtime-error banner, file size, commit hashes, line-number gutter),
# not part of the program; removed so the module is valid Python.
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List, Literal, Optional
from huggingface_hub import InferenceClient
from fastapi.responses import JSONResponse
import uuid
import time
import uvicorn
app = FastAPI()
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
# OpenAI-compatible request message
class Message(BaseModel):
role: Literal["system", "user", "assistant"]
content: str
# OpenAI-compatible request body
class ChatCompletionRequest(BaseModel):
model: str = "zephyr-7b-beta"
messages: List[Message]
temperature: Optional[float] = 0.7
top_p: Optional[float] = 0.95
max_tokens: Optional[int] = 512
stream: Optional[bool] = False
# OpenAI-compatible response message
class Choice(BaseModel):
index: int
message: Message
finish_reason: Optional[str] = "stop"
# OpenAI-compatible full response
class ChatCompletionResponse(BaseModel):
id: str
object: str = "chat.completion"
created: int
model: str
choices: List[Choice]
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
# Build HuggingFace-style message list
messages = [{"role": m.role, "content": m.content} for m in request.messages]
# Generate chat completion
response_text = ""
for chunk in client.chat_completion(
messages,
max_tokens=request.max_tokens,
temperature=request.temperature,
top_p=request.top_p,
stream=False,
):
response_text += chunk.choices[0].delta.content
# Build OpenAI-style response
chat_response = ChatCompletionResponse(
id=f"chatcmpl-{uuid.uuid4().hex}",
created=int(time.time()),
model=request.model,
choices=[
Choice(
index=0,
message=Message(role="assistant", content=response_text),
)
]
)
return JSONResponse(content=chat_response.dict())
# Run this file directly
if __name__ == "__main__":
uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)