pakito312 committed on
Commit 1805c7e · 1 Parent(s): 27d8bb7

Files changed (2)
  1. Dockerfile +2 -6
  2. api.py +22 -7
Dockerfile CHANGED
@@ -1,6 +1,7 @@
 FROM python:3.10-slim
 
 ENV PYTHONUNBUFFERED=1
+ENV HF_HOME=/data
 ENV LLAMA_CPP_VERBOSE=0
 
 WORKDIR /app
@@ -8,18 +9,13 @@ WORKDIR /app
 RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
+    libstdc++6 \
     && rm -rf /var/lib/apt/lists/*
 
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Download the GGUF model (HF cache)
-RUN mkdir -p /models && \
-    curl -L -o /models/phi-3.gguf \
-    https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct.Q4_K_M.gguf
-
 COPY api.py .
 
 EXPOSE 7860
-
 CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]
api.py CHANGED
@@ -1,14 +1,32 @@
 from fastapi import FastAPI
 from pydantic import BaseModel
 from llama_cpp import Llama
+import os
+import subprocess
+
+MODEL_PATH = "/data/phi-3.gguf"
+MODEL_URL = (
+    "https://huggingface.co/TheBloke/"
+    "Phi-3-mini-4k-instruct-GGUF/resolve/main/"
+    "phi-3-mini-4k-instruct.Q4_K_M.gguf"
+)
 
 app = FastAPI(title="llama.cpp Code API")
 
+def download_model():
+    if not os.path.exists(MODEL_PATH):
+        os.makedirs("/data", exist_ok=True)
+        subprocess.run([
+            "curl", "-L", "-o", MODEL_PATH, MODEL_URL
+        ], check=True)
+
+download_model()
+
 llm = Llama(
-    model_path="/models/phi-3.gguf",
+    model_path=MODEL_PATH,
     n_ctx=4096,
-    n_threads=4,
-    n_batch=512,
+    n_threads=2,  # HF CPU safe
+    n_batch=256,
     n_gpu_layers=0,
     verbose=False,
 )
@@ -30,10 +48,7 @@ def generate(req: GenerateRequest):
         temperature=req.temperature,
         stop=["<|user|>"]
     )
-
-    return {
-        "response": output["choices"][0]["text"].strip()
-    }
+    return {"response": output["choices"][0]["text"].strip()}
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)
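Two notes on this hunk: download_model() runs at import time, so the container's first boot blocks until the file is in /data, and the new n_threads=2 / n_batch=256 values match the commit's own "HF CPU safe" comment (presumably sized for a small shared-CPU Space). For quick verification once the Space is up, a minimal client sketch against POST /generate; the temperature field and the "response" key are visible in the diff above, while the prompt field is an assumption, since GenerateRequest's full definition sits outside these hunks:

# Sketch only: exercising the /generate endpoint from outside.
# "temperature" and the "response" key appear in the diff; the
# "prompt" field and the example values are assumptions.
import requests

resp = requests.post(
    "http://localhost:7860/generate",
    json={"prompt": "Write FizzBuzz in Python.", "temperature": 0.2},
    timeout=300,  # CPU inference with 2 threads can be slow
)
resp.raise_for_status()
print(resp.json()["response"])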