pakito312 committed on
Commit
44ab248
·
1 Parent(s): 1805c7e
Files changed (1) hide show
  1. api.py +73 -22
api.py CHANGED
@@ -1,8 +1,9 @@
1
- from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  from llama_cpp import Llama
4
  import os
5
  import subprocess
 
6
 
7
  MODEL_PATH = "/data/phi-3.gguf"
8
  MODEL_URL = (
@@ -11,44 +12,94 @@ MODEL_URL = (
11
  "phi-3-mini-4k-instruct.Q4_K_M.gguf"
12
  )
13
 
14
- app = FastAPI(title="llama.cpp Code API")
15
 
16
- def download_model():
17
- if not os.path.exists(MODEL_PATH):
18
- os.makedirs("/data", exist_ok=True)
19
- subprocess.run([
20
- "curl", "-L", "-o", MODEL_PATH, MODEL_URL
21
- ], check=True)
22
 
23
- download_model()
24
 
25
- llm = Llama(
26
- model_path=MODEL_PATH,
27
- n_ctx=4096,
28
- n_threads=2, # HF CPU safe
29
- n_batch=256,
30
- n_gpu_layers=0,
31
- verbose=False,
32
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  class GenerateRequest(BaseModel):
35
  prompt: str
36
  max_tokens: int = 512
37
  temperature: float = 0.1
 
 
38
 
39
  @app.get("/")
40
  def root():
41
- return {"message": "llama.cpp Phi-3 API ready"}
 
 
 
 
 
42
 
43
  @app.post("/generate")
44
  def generate(req: GenerateRequest):
45
- output = llm(
46
- f"<|user|>\n{req.prompt}\n<|assistant|>",
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  max_tokens=req.max_tokens,
48
  temperature=req.temperature,
49
- stop=["<|user|>"]
 
 
50
  )
51
- return {"response": output["choices"][0]["text"].strip()}
 
 
 
 
52
  if __name__ == "__main__":
53
  import uvicorn
54
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ from fastapi import FastAPI, HTTPException
2
  from pydantic import BaseModel
3
  from llama_cpp import Llama
4
  import os
5
  import subprocess
6
+ import threading
7
 
8
  MODEL_PATH = "/data/phi-3.gguf"
9
  MODEL_URL = (
 
12
  "phi-3-mini-4k-instruct.Q4_K_M.gguf"
13
  )
14
 
15
# FastAPI application instance; the title appears in the auto-generated docs.
app = FastAPI(title="llama.cpp Phi-3 API")

# Lazily-initialized process-wide Llama model (populated by get_llm) and the
# lock that serializes its one-time download/load.
llm = None
lock = threading.Lock()
 
 
 
 
19
 
 
20
 
21
def ensure_model():
    """Download the GGUF model to MODEL_PATH if it is missing or truncated.

    A 100 MB size floor is used as a cheap sanity check that the download
    completed (the real model file is far larger).

    Raises:
        RuntimeError: if the download fails or the resulting file is too
            small to be a complete model.
    """
    # Fast path: a previously downloaded, plausibly complete file.
    if os.path.exists(MODEL_PATH) and os.path.getsize(MODEL_PATH) > 100_000_000:
        return

    os.makedirs("/data", exist_ok=True)

    result = subprocess.run(
        ["curl", "-L", "--fail", "--retry", "3", "-o", MODEL_PATH, MODEL_URL],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )

    if result.returncode != 0 or not os.path.exists(MODEL_PATH):
        # Remove any partial download so the next attempt starts clean
        # instead of leaving a corrupt file on disk.
        if os.path.exists(MODEL_PATH):
            os.remove(MODEL_PATH)
        raise RuntimeError(f"Model download failed: {result.stderr}")

    if os.path.getsize(MODEL_PATH) < 100_000_000:
        os.remove(MODEL_PATH)
        raise RuntimeError("Downloaded model file is corrupted or incomplete")
39
+
40
+
41
def get_llm():
    """Return the process-wide Llama instance, loading it on first use.

    The module-level lock serializes the one-time download and model load
    so concurrent first requests do not race.
    """
    global llm
    with lock:
        # Already loaded by an earlier request — reuse it.
        if llm is not None:
            return llm
        ensure_model()
        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=4096,
            n_threads=2,  # HF Space CPU safe
            n_batch=256,
            n_gpu_layers=0,
            use_mmap=True,
            use_mlock=False,
            verbose=False,
        )
        return llm
57
+
58
 
59
class GenerateRequest(BaseModel):
    """Request body for POST /generate."""
    prompt: str  # user instruction to send to the model
    max_tokens: int = 512  # cap on generated tokens
    temperature: float = 0.1  # sampling temperature (low = deterministic)
    top_p: float = 0.9  # nucleus-sampling cutoff
64
+
65
 
66
@app.get("/")
def root():
    """Health check: report whether the model is loaded and present on disk."""
    status = {"status": "ok"}
    status["model_loaded"] = llm is not None
    status["model_file_exists"] = os.path.exists(MODEL_PATH)
    return status
73
+
74
 
75
@app.post("/generate")
def generate(req: GenerateRequest):
    """Generate a completion for the given prompt.

    Args:
        req: the prompt plus sampling parameters.

    Returns:
        JSON object with the assistant's text under "response".

    Raises:
        HTTPException: 500 if the model cannot be downloaded or loaded.
    """
    try:
        model = get_llm()
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

    # Phi-3 chat-template markers; the stop sequences keep the model from
    # hallucinating further conversation turns.
    prompt = (
        "<|system|>\n"
        "You are an expert software engineer.\n"
        "<|user|>\n"
        f"{req.prompt}\n"
        "<|assistant|>\n"
    )

    # llama.cpp contexts are not safe for concurrent use, and FastAPI runs
    # sync endpoints in a thread pool — serialize inference on the shared
    # lock (the same one that guards the model load).
    with lock:
        output = model(
            prompt,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
            top_p=req.top_p,
            stop=["<|user|>", "<|system|>"],
            echo=False,
        )

    return {
        "response": output["choices"][0]["text"].strip()
    }
102
+
103
# Local/dev entry point; HF Spaces serves the app on port 7860.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)