```python
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

app = FastAPI()

# Load the GGUF model once at startup so every request reuses the same instance.
llm = Llama(
    model_path="/models/model.gguf",
    n_ctx=2048,
    n_threads=2,
    n_batch=128,
    verbose=False,
)

class Message(BaseModel):
    role: str
    content: str

class ChatRequest(BaseModel):
    messages: list[Message]

@app.post("/chat")  # decorator restored; "/chat" is an assumed path
def chat(req: ChatRequest):
    # Flatten the chat history into a plain-text prompt for the model.
    prompt = ""
    for m in req.messages:
        prompt += f"{m.role.upper()}: {m.content}\n"
    prompt += "ASSISTANT:"
    output = llm(
        prompt,
        max_tokens=256,
        temperature=0.7,
        top_p=0.9,
        stop=["USER:"],  # stop before the model invents the next user turn
    )
    return {
        "model": "tinyllama-1.1b-chat-q4_k_m",
        "text": output["choices"][0]["text"].strip(),
        "tokens": output["usage"]["total_tokens"],
    }
```
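
For reference, a minimal client call might look like the sketch below. The host, port, and server invocation are assumptions (e.g. running locally via `uvicorn app:app`), not part of the snippet above.

```python
# Hypothetical client for the /chat endpoint; localhost:8000 is the
# default uvicorn address, adjust to wherever the server is deployed.
import requests

resp = requests.post(
    "http://localhost:8000/chat",
    json={"messages": [{"role": "user", "content": "What is a GGUF file?"}]},
    timeout=120,  # CPU inference can be slow; allow a generous timeout
)
resp.raise_for_status()
print(resp.json()["text"])
```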