from fastapi import FastAPI
from pydantic import BaseModel
from typing import List, Literal, Optional
from huggingface_hub import InferenceClient
import uuid
import time
import uvicorn

# FastAPI app exposing an OpenAI-compatible /v1/chat/completions endpoint
app = FastAPI()
# Hugging Face Inference client for the hosted zephyr-7b-beta model
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

# OpenAI-compatible request message
class Message(BaseModel):
    role: Literal["system", "user", "assistant"]
    content: str

# OpenAI-compatible request body
class ChatCompletionRequest(BaseModel):
    model: str = "zephyr-7b-beta"
    messages: List[Message]
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.95
    max_tokens: Optional[int] = 512
    stream: Optional[bool] = False
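
# Example request body accepted by /v1/chat/completions (shape only; values
# other than the field names are illustrative, not defined in this file):
#
#   {
#     "model": "zephyr-7b-beta",
#     "messages": [{"role": "user", "content": "Hello!"}],
#     "temperature": 0.7,
#     "max_tokens": 512
#   }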

# OpenAI-compatible response choice
class Choice(BaseModel):
    index: int
    message: Message
    finish_reason: Optional[str] = "stop"

# OpenAI-compatible full response
class ChatCompletionResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[Choice]

@app.get("/")
async def home():
    return "Hello, I am running"
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
    # Build HuggingFace-style message list
    messages = [{"role": m.role, "content": m.content} for m in request.messages]

    # Stream the completion from the Inference API and accumulate the text.
    # (The request's `stream` flag is accepted for compatibility, but the
    # response is always returned in full.)
    response_text = ""
    for chunk in client.chat_completion(
        messages,
        max_tokens=request.max_tokens,
        temperature=request.temperature,
        top_p=request.top_p,
        stream=True,
    ):
        delta = chunk.choices[0].delta.content
        if delta:
            response_text += delta

    # Build OpenAI-style response
    chat_response = ChatCompletionResponse(
        id=f"chatcmpl-{uuid.uuid4().hex}",
        created=int(time.time()),
        model=request.model,
        choices=[
            Choice(
                index=0,
                message=Message(role="assistant", content=response_text),
            )
        ]
    )
    # FastAPI serializes the Pydantic model via response_model
    return chat_response
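
# Local entrypoint: a minimal sketch assuming port 7860 (the usual Hugging
# Face Spaces port); adjust host/port to match your deployment.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)

# Example client usage (not part of the server): because the endpoint mirrors
# the OpenAI chat-completions schema, the official `openai` client can call it
# by pointing base_url at this server. The URL and api_key value below are
# placeholders, not values defined anywhere in this file.
#
#   from openai import OpenAI
#   oai_client = OpenAI(base_url="http://localhost:7860/v1", api_key="unused")
#   resp = oai_client.chat.completions.create(
#       model="zephyr-7b-beta",
#       messages=[{"role": "user", "content": "Hello!"}],
#   )
#   print(resp.choices[0].message.content)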