pakito312 committed on
Commit
27d8bb7
·
1 Parent(s): e031762
Files changed (3) hide show
  1. Dockerfile +8 -10
  2. api.py +17 -38
  3. requirements.txt +3 -6
Dockerfile CHANGED
@@ -1,27 +1,25 @@
1
  FROM python:3.10-slim
2
 
3
- # Variables d'environnement (HF + perf)
4
- ENV PYTHONDONTWRITEBYTECODE=1
5
  ENV PYTHONUNBUFFERED=1
6
- ENV TRANSFORMERS_CACHE=/data/hf_cache
7
- ENV HF_HOME=/data/hf_cache
8
 
9
  WORKDIR /app
10
 
11
- # Dépendances système minimales
12
  RUN apt-get update && apt-get install -y \
13
- git \
 
14
  && rm -rf /var/lib/apt/lists/*
15
 
16
- # Installer les dépendances Python
17
  COPY requirements.txt .
18
  RUN pip install --no-cache-dir -r requirements.txt
19
 
20
- # Copier l'application
 
 
 
 
21
  COPY api.py .
22
 
23
- # Exposer le port HF Space
24
  EXPOSE 7860
25
 
26
- # Lancer FastAPI
27
  CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]
 
FROM python:3.10-slim

ENV PYTHONUNBUFFERED=1
ENV LLAMA_CPP_VERBOSE=0

WORKDIR /app

# build-essential: needed to compile llama-cpp-python from source;
# curl: used below to fetch the GGUF model weights.
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download the quantized GGUF model at build time so the image is
# self-contained. `-f` (--fail) makes curl exit non-zero on HTTP 4xx/5xx
# instead of silently saving an error page as the "model" file, which
# would only surface later as a cryptic load failure at startup.
RUN mkdir -p /models && \
    curl -fL -o /models/phi-3.gguf \
    https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct.Q4_K_M.gguf

COPY api.py .

# HF Spaces expect the app to listen on 7860.
EXPOSE 7860

CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]
api.py CHANGED
@@ -1,59 +1,38 @@
1
  from fastapi import FastAPI
2
  from pydantic import BaseModel
3
- import torch
4
- from transformers import AutoTokenizer, AutoModelForCausalLM
5
 
6
- app = FastAPI(title="Phi-3 Code API")
7
 
8
- MODEL_ID = "microsoft/phi-3-mini-4k-instruct"
9
-
10
- # Chargement du tokenizer et du modèle
11
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
12
-
13
- model = AutoModelForCausalLM.from_pretrained(
14
- MODEL_ID,
15
- device_map="auto",
16
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
17
- low_cpu_mem_usage=True
18
  )
19
 
20
  class GenerateRequest(BaseModel):
21
  prompt: str
22
  max_tokens: int = 512
23
- temperature: float = 0.2
24
 
25
  @app.get("/")
26
  def root():
27
- return {"message": "Phi-3-mini Code API is running"}
28
 
29
  @app.post("/generate")
30
  def generate(req: GenerateRequest):
31
- prompt = req.prompt.strip()
32
-
33
- inputs = tokenizer(
34
- prompt,
35
- return_tensors="pt",
36
- truncation=True,
37
- max_length=4096
38
- ).to(model.device)
39
-
40
- with torch.no_grad():
41
- output = model.generate(
42
- **inputs,
43
- max_new_tokens=req.max_tokens,
44
- temperature=req.temperature,
45
- do_sample=False,
46
- pad_token_id=tokenizer.eos_token_id
47
- )
48
-
49
- result = tokenizer.decode(
50
- output[0],
51
- skip_special_tokens=True
52
  )
53
 
54
  return {
55
- "model": MODEL_ID,
56
- "response": result
57
  }
58
  if __name__ == "__main__":
59
  import uvicorn
 
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

# Path where the Dockerfile bakes the quantized GGUF weights into the image.
MODEL_PATH = "/models/phi-3.gguf"

app = FastAPI(title="llama.cpp Code API")

# Load the model once at process startup; every request reuses this instance.
# CPU-only configuration (n_gpu_layers=0) — presumably sized for a small
# Space instance; confirm thread count against the actual CPU allocation.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,
    n_batch=512,
    n_threads=4,
    n_gpu_layers=0,
    verbose=False,
)
class GenerateRequest(BaseModel):
    """Request payload for POST /generate."""

    # Raw user prompt; the endpoint wraps it in the chat template.
    prompt: str
    # Upper bound on tokens generated for one request.
    max_tokens: int = 512
    # Low default temperature → near-deterministic output.
    temperature: float = 0.1
20
 
21
@app.get("/")
def root():
    """Liveness check: confirms the API process is up and the model loaded."""
    return dict(message="llama.cpp Phi-3 API ready")
24
 
25
@app.post("/generate")
def generate(req: GenerateRequest):
    """Run one completion on the locally loaded GGUF model.

    Wraps the user prompt in the Phi-3 chat template. The original code
    omitted the ``<|end|>`` turn terminator after the user message, which
    deviates from the model's documented prompt format and degrades
    instruction following; it is added here, and ``<|end|>`` is also
    included in the stop list so generation halts cleanly at the end of
    the assistant turn.
    """
    output = llm(
        f"<|user|>\n{req.prompt}<|end|>\n<|assistant|>",
        max_tokens=req.max_tokens,
        temperature=req.temperature,
        stop=["<|end|>", "<|user|>"],
    )
    # llama-cpp-python returns an OpenAI-style completion dict.
    return {
        "response": output["choices"][0]["text"].strip()
    }
37
  if __name__ == "__main__":
38
  import uvicorn
requirements.txt CHANGED
@@ -1,7 +1,4 @@
1
- fastapi==0.110.0
2
- uvicorn==0.27.1
3
- torch>=2.1.0
4
- transformers>=4.39.0
5
- accelerate>=0.27.0
6
- sentencepiece
7
  pydantic
 
1
+ fastapi
2
+ uvicorn
3
+ llama-cpp-python==0.2.77
 
 
 
4
  pydantic