# Spaces: Sleeping
import uuid
from typing import List

from fastapi import FastAPI
from llama_cpp import Llama
from pydantic import BaseModel
# ASGI application object for this service.
app = FastAPI()

# The model is constructed once at import time and shared by every request.
# n_ctx is the context window in tokens; n_threads is the CPU thread count
# handed to llama.cpp. The path is relative to the working directory.
llm = Llama(model_path="phi-2.Q4_K_M.gguf", n_ctx=2048, n_threads=2)
class Message(BaseModel):
    """One turn of a chat conversation as supplied by the client."""

    # Speaker label; it is written verbatim in front of the content when the
    # prompt is assembled ("<role>: <content>").
    role: str
    # Text of the turn.
    content: str
class ChatRequest(BaseModel):
    """Request body for a chat completion."""

    # Required model identifier supplied by the client.
    model: str
    # Conversation turns; required, no default.
    messages: List[Message]
    # Sampling temperature; 0.7 when the client omits it.
    temperature: float = 0.7
    # Upper bound on the number of generated tokens; 256 when omitted.
    max_tokens: int = 256
# NOTE(review): the path follows the OpenAI chat-completions convention that
# the response shape mirrors -- confirm it matches what clients call.
@app.post("/v1/chat/completions")
async def chat_completions(req: ChatRequest):
    """Generate one assistant reply for a chat request.

    The conversation is flattened into a plain ``role: content`` transcript
    ending in an open ``assistant:`` turn, completed by the module-level
    ``llm``, and wrapped in a ``chat.completion``-shaped dict.

    Args:
        req: Request body. ``messages``, ``max_tokens`` and ``temperature``
            drive generation; ``model`` is only echoed back.

    Returns:
        A dict with ``id``, ``object``, ``choices`` (a single assistant
        message) and ``model``, plus ``created`` and ``usage`` when the
        backend result contains them.
    """
    transcript = "\n".join(f"{m.role}: {m.content}" for m in req.messages)
    prompt = f"{transcript}\nassistant:"

    # NOTE: this is a synchronous call inside a coroutine, so the event loop
    # is blocked while tokens are generated. Moving it to a worker thread
    # would first require deciding how concurrent access to `llm` is handled.
    output = llm(
        prompt,
        max_tokens=req.max_tokens,
        temperature=req.temperature,
        # Stop before the model starts writing the next turn on its own.
        stop=["user:", "assistant:"],
    )
    choice = output["choices"][0]

    response = {
        # A fresh id per response instead of one constant shared by all.
        "id": f"chatcmpl-{uuid.uuid4().hex}",
        "object": "chat.completion",
        "choices": [{
            "index": 0,
            # The prompt ends right after "assistant:", so the raw completion
            # normally begins with whitespace; trim it from the reply.
            "message": {"role": "assistant", "content": choice["text"].strip()},
            # Report why generation actually ended (e.g. "length" when the
            # token limit was hit); fall back to "stop" if it is absent.
            "finish_reason": choice.get("finish_reason") or "stop",
        }],
        "model": req.model,
    }
    # Forward backend bookkeeping fields when they are available.
    for key in ("created", "usage"):
        if key in output:
            response[key] = output[key]
    return response