Update app.py
app.py CHANGED
@@ -1,36 +1,29 @@
+import subprocess
 from fastapi import FastAPI
-from pydantic import BaseModel
-
+from pydantic import BaseModel
+import json
 import os
 
-
+app = FastAPI()
 
-
-
-if os.environ.get("CUDA_VISIBLE_DEVICES") not in [None, "", "None"]:
-    GPU_LAYERS = 20  # enables GPU acceleration for Qwen 0.5B
+MODEL_PATH = "/app/model/qwen2.5-0.5b-instruct-q4_k_m.gguf"
+LLAMA = "/app/llama.cpp/llama-cli"  # CLI mode → fastest for small CPUs
 
-llm = Llama(
-    model_path=MODEL_PATH,
-    n_ctx=2048,
-    n_threads=2,
-    n_gpu_layers=GPU_LAYERS,
-    n_batch=64,
-    verbose=False
-)
-
-api = FastAPI()
 
 class Query(BaseModel):
-    prompt: str
+    prompt: str
+    max_tokens: int = 128
+
+
+@app.post("/generate")
+def generate_text(data: Query):
+    cmd = [
+        LLAMA,
+        "-m", MODEL_PATH,
+        "-p", data.prompt,
+        "--n-predict", str(data.max_tokens),
+        "--temp", "0.2"
+    ]
 
-
-
-    output = llm(
-        q.prompt,
-        max_tokens=256,
-        temperature=0.2,
-        top_p=0.9,
-        stop=["</s>"]
-    )
-    return {"reply": output["choices"][0]["text"].strip()}
+    out = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    return {"output": out.stdout.strip()}
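A minimal smoke test against the new /generate route, assuming the app is reachable on port 7860 (URL, port, and the example prompt are placeholders for your deployment):

import requests  # third-party client, pip install requests

resp = requests.post(
    "http://localhost:7860/generate",
    json={"prompt": "Say hello in five words.", "max_tokens": 32},
    timeout=180,  # CPU-only generation with the 0.5B model can be slow
)
resp.raise_for_status()
print(resp.json()["output"])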