import subprocess

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()

# Quantized GGUF model and the llama.cpp CLI binary baked into the image.
MODEL_PATH = "/app/model/qwen2.5-0.5b-instruct-q4_k_m.gguf"
LLAMA = "/app/llama.cpp/llama-cli"

class Query(BaseModel):
    prompt: str
    max_tokens: int = 128

@app.post("/generate")
def generate_text(data: Query):
    # Build the llama.cpp CLI invocation; arguments are passed as a list,
    # so the prompt is never interpreted by a shell.
    cmd = [
        LLAMA,
        "-m", MODEL_PATH,
        "-p", data.prompt,
        "--n-predict", str(data.max_tokens),
        "--temp", "0.2",
    ]

    out = subprocess.run(cmd, capture_output=True, text=True)
    if out.returncode != 0:
        # Surface llama.cpp failures instead of silently returning empty text.
        raise HTTPException(status_code=500, detail=out.stderr.strip())
    return {"output": out.stdout.strip()}

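# Illustrative launch/usage sketch, not part of the original service code.
# It assumes uvicorn is available in the environment; the port number and the
# example request below are placeholder values.
if __name__ == "__main__":
    import uvicorn

    # Serve the app directly when this module is executed as a script.
    uvicorn.run(app, host="0.0.0.0", port=8000)

# Example request once the server is up (illustrative values):
#   curl -X POST http://localhost:8000/generate \
#     -H "Content-Type: application/json" \
#     -d '{"prompt": "Write a haiku about containers.", "max_tokens": 64}'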
|