convo-model / app.py
Adi362's picture
Create app.py
f97ce08 verified
raw
history blame
821 Bytes
import os

from fastapi import FastAPI
from llama_cpp import Llama
from pydantic import BaseModel
# FastAPI application serving a single llama.cpp chat-completion endpoint.
app = FastAPI()

# Load the GGUF model once at startup; the path can be overridden via the
# MODEL_PATH environment variable (defaults to the original baked-in location).
llm = Llama(
    model_path=os.getenv("MODEL_PATH", "/models/model.gguf"),
    n_ctx=2048,      # context window size in tokens
    n_threads=2,     # CPU threads used for inference
    n_batch=128,     # prompt-evaluation batch size
    verbose=False,   # suppress llama.cpp's per-call logging
)
class Message(BaseModel):
    """One chat turn: who spoke (``role``) and what they said (``content``)."""

    # e.g. "user" / "assistant" / "system"; uppercased when rendered into the prompt
    role: str
    content: str
class ChatRequest(BaseModel):
    """Request body for POST /v1/chat: the full conversation history, oldest first."""

    messages: list[Message]
@app.post("/v1/chat")
def chat(req: ChatRequest):
    """Run one chat completion over the supplied message history.

    Renders the history as ``ROLE: content`` lines ending with an
    ``ASSISTANT:`` cue, samples up to 256 tokens from the model, and
    returns the generated text plus total token usage.
    """
    prompt = _build_prompt(req.messages)
    output = llm(
        prompt,
        max_tokens=256,
        temperature=0.7,
        top_p=0.9,
        # Stop before the model invents the next turn of the dialogue;
        # "SYSTEM:" is included since roles other than "user" may appear.
        stop=["USER:", "SYSTEM:"],
    )
    return {
        "model": "tinyllama-1.1b-chat-q4_k_m",
        "text": output["choices"][0]["text"].strip(),
        "tokens": output["usage"]["total_tokens"],
    }


def _build_prompt(messages) -> str:
    """Flatten chat history into a plain-text transcript ending with 'ASSISTANT:'."""
    # str.join over a comprehension instead of quadratic += concatenation.
    turns = [f"{m.role.upper()}: {m.content}\n" for m in messages]
    return "".join(turns) + "ASSISTANT:"