from fastapi import FastAPI
from pydantic import BaseModel
from typing import List, Literal, Optional
from huggingface_hub import InferenceClient
import uuid
import time
import uvicorn

app = FastAPI()
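# Hugging Face Inference client for the zephyr-7b-beta chat model
# (assumes the model is reachable via the hosted Inference API)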
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")

# OpenAI-compatible request message
class Message(BaseModel):
    role: Literal["system", "user", "assistant"]
    content: str

# OpenAI-compatible request body
class ChatCompletionRequest(BaseModel):
    model: str = "zephyr-7b-beta"
    messages: List[Message]
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.95
    max_tokens: Optional[int] = 512
    stream: Optional[bool] = False
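
# Illustrative request body this schema accepts (values are examples only):
#   {"model": "zephyr-7b-beta",
#    "messages": [{"role": "user", "content": "Hi"}],
#    "temperature": 0.5, "max_tokens": 128}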

# OpenAI-compatible response choice
class Choice(BaseModel):
    index: int
    message: Message
    finish_reason: Optional[str] = "stop"

# OpenAI-compatible full response
class ChatCompletionResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[Choice]

@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
    # Build HuggingFace-style message list
    messages = [{"role": m.role, "content": m.content} for m in request.messages]

    # Generate the chat completion. With stream=False the client returns a
    # single ChatCompletionOutput, not an iterator of delta chunks, so read
    # the full message directly. (The request's `stream` flag is accepted
    # for OpenAI compatibility, but streaming is not implemented here.)
    response = client.chat_completion(
        messages,
        max_tokens=request.max_tokens,
        temperature=request.temperature,
        top_p=request.top_p,
        stream=False,
    )
    response_text = response.choices[0].message.content

    # Build OpenAI-style response
    chat_response = ChatCompletionResponse(
        id=f"chatcmpl-{uuid.uuid4().hex}",
        created=int(time.time()),
        model=request.model,
        choices=[
            Choice(
                index=0,
                message=Message(role="assistant", content=response_text),
            )
        ]
    )
    # FastAPI serializes the Pydantic model via the declared response_model
    return chat_response

# Run the server directly (assumes this file is saved as app.py, matching "app:app")
if __name__ == "__main__":
    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)
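
# Example usage (a sketch, not part of the app): once the server is running,
# any OpenAI-compatible client can talk to it. The base_url and dummy api_key
# below are assumptions for a local run; adjust host/port to your deployment.
#
#   from openai import OpenAI
#
#   oai = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
#   resp = oai.chat.completions.create(
#       model="zephyr-7b-beta",
#       messages=[{"role": "user", "content": "Say hello in one sentence."}],
#   )
#   print(resp.choices[0].message.content)
#
# Or with curl:
#
#   curl -s http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello!"}]}'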