triflix committed on
Commit
6d112d0
·
verified ·
1 Parent(s): 0c2178f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -28
app.py CHANGED
@@ -1,36 +1,29 @@
 
1
  from fastapi import FastAPI
2
- from pydantic import BaseModel, Field
3
- from llama_cpp import Llama
4
  import os
5
 
6
- MODEL_PATH = "model/model.gguf"
7
 
8
- # GPU auto-detection: use cuBLAS if available, fallback to CPU
9
- GPU_LAYERS = 0
10
- if os.environ.get("CUDA_VISIBLE_DEVICES") not in [None, "", "None"]:
11
- GPU_LAYERS = 20 # enables GPU acceleration for Qwen 0.5B
12
 
13
- llm = Llama(
14
- model_path=MODEL_PATH,
15
- n_ctx=2048,
16
- n_threads=2,
17
- n_gpu_layers=GPU_LAYERS,
18
- n_batch=64,
19
- verbose=False
20
- )
21
-
22
- api = FastAPI()
23
 
24
  class Query(BaseModel):
25
- prompt: str = Field(..., min_length=1)
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
- @api.post("/generate")
28
- def generate(q: Query):
29
- output = llm(
30
- q.prompt,
31
- max_tokens=256,
32
- temperature=0.2,
33
- top_p=0.9,
34
- stop=["</s>"]
35
- )
36
- return {"reply": output["choices"][0]["text"].strip()}
 
1
import json
import os
import subprocess

from fastapi import FastAPI
from pydantic import BaseModel, Field
6
 
7
+ app = FastAPI()
8
 
9
+ MODEL_PATH = "/app/model/qwen2.5-0.5b-instruct-q4_k_m.gguf"
10
+ LLAMA = "/app/llama.cpp/llama-cli" # CLI mode → fastest for small CPUs
 
 
11
 
 
 
 
 
 
 
 
 
 
 
12
 
13
class Query(BaseModel):
    """Request payload for POST /generate."""

    # Text forwarded verbatim to llama-cli via -p. min_length rejects empty
    # prompts, which would otherwise spawn a model invocation for nothing
    # (the previous revision of this file enforced the same constraint).
    prompt: str = Field(..., min_length=1)
    # Token budget passed as --n-predict. Bounded so a single request cannot
    # monopolize the small-CPU deployment this service targets; the default
    # of 128 is unchanged for existing callers.
    max_tokens: int = Field(128, ge=1, le=2048)
16
+
17
+
18
@app.post("/generate")
def generate_text(data: Query):
    """Run llama-cli once for the given prompt and return its stdout.

    Spawns one fresh process per request. The command is an argument list
    (shell=False), so the user-supplied prompt cannot be used for shell
    injection.

    Returns:
        {"output": <generated text>} on success;
        {"output": "", "error": <diagnostic>} on timeout or non-zero exit.
    """
    cmd = [
        LLAMA,
        "-m", MODEL_PATH,
        "-p", data.prompt,
        "--n-predict", str(data.max_tokens),
        "--temp", "0.2",
    ]

    # Bound the subprocess: without a timeout a wedged llama-cli would hang
    # this worker forever. 300 s is generous for a 0.5B model on a small CPU.
    try:
        out = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=300,
        )
    except subprocess.TimeoutExpired:
        return {"output": "", "error": "generation timed out"}

    # Surface failures instead of silently returning an empty string;
    # stderr carries llama.cpp's diagnostics (bad model path, OOM, ...).
    if out.returncode != 0:
        return {"output": "", "error": out.stderr.strip()}

    return {"output": out.stdout.strip()}