gguf / app.py
triflix's picture
Update app.py
6d112d0 verified
raw
history blame contribute delete
663 Bytes
import subprocess
from fastapi import FastAPI
from pydantic import BaseModel
import json
import os
# ASGI application instance served by uvicorn/hypercorn in the container.
app = FastAPI()

# Quantized GGUF model baked into the image (Qwen2.5 0.5B instruct, Q4_K_M).
MODEL_PATH = "/app/model/qwen2.5-0.5b-instruct-q4_k_m.gguf"
# llama.cpp one-shot CLI binary; CLI mode → fastest for small CPUs
# (no server warm-up, process exits after each generation).
LLAMA = "/app/llama.cpp/llama-cli"
class Query(BaseModel):
    """Request body for POST /generate."""
    # Prompt text passed verbatim to llama-cli via -p.
    prompt: str
    # Generation cap, forwarded as --n-predict; defaults to 128 tokens.
    max_tokens: int = 128
@app.post("/generate")
def generate_text(data: Query):
    """Generate text by shelling out to the llama.cpp CLI.

    Runs one `llama-cli` process per request with the configured model,
    the caller's prompt, and a fixed temperature of 0.2.

    Returns:
        {"output": <generated text>} on success, or
        {"output": "", "error": <stderr tail>} when llama-cli exits non-zero.
    """
    cmd = [
        LLAMA,
        "-m", MODEL_PATH,
        "-p", data.prompt,
        "--n-predict", str(data.max_tokens),
        "--temp", "0.2",
    ]
    # argv list (shell=False): the prompt is passed as a single argument,
    # so no shell-injection risk from untrusted input.
    out = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if out.returncode != 0:
        # Previously a failed run (missing model, bad args, OOM) silently
        # returned {"output": ""} and discarded stderr. Surface the failure
        # instead; keep only the tail of stderr to bound the response size.
        return {"output": "", "error": out.stderr.strip()[-2000:]}
    return {"output": out.stdout.strip()}