pakito312 commited on
Commit
ca1c16e
·
1 Parent(s): 77da021
Files changed (3) hide show
  1. Dockerfile +25 -11
  2. api.py +343 -74
  3. download_model.py +51 -0
Dockerfile CHANGED
@@ -1,21 +1,35 @@
1
  FROM python:3.10-slim
2
 
3
- ENV PYTHONUNBUFFERED=1
4
- ENV HF_HOME=/data
5
- ENV LLAMA_CPP_VERBOSE=0
6
-
7
- WORKDIR /app
8
-
9
  RUN apt-get update && apt-get install -y \
10
  build-essential \
 
 
11
  curl \
12
- libstdc++6 \
13
  && rm -rf /var/lib/apt/lists/*
14
 
15
- COPY requirements.txt .
16
- RUN pip install --no-cache-dir -r requirements.txt
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- COPY api.py .
 
 
 
 
 
19
 
20
  EXPOSE 7860
21
- CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
1
FROM python:3.10-slim

# Install system build dependencies (cmake/git needed to build llama-cpp-python)
RUN apt-get update && apt-get install -y \
    build-essential \
    cmake \
    git \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Install llama-cpp-python (with server extras) and the API dependencies
RUN pip install --no-cache-dir \
    llama-cpp-python[server] \
    fastapi \
    uvicorn \
    pydantic \
    requests \
    huggingface-hub

# Create a non-root user (required by Hugging Face Spaces)
RUN useradd -m -u 1000 user
USER user
WORKDIR /home/user

# Copy the application
COPY --chown=user:user api.py .
COPY --chown=user:user download_model.py .

# Optionally pre-download the GGUF model at build time
# RUN python download_model.py

EXPOSE 7860

# Start the server.
# BUG FIX: the module copied above is api.py, so the ASGI path is "api:app";
# the previous "app:app" made the container fail at startup (no app.py exists).
CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]
api.py CHANGED
@@ -1,102 +1,371 @@
1
- from fastapi import FastAPI, HTTPException
2
- from pydantic import BaseModel
3
- from llama_cpp import Llama
 
4
  import os
5
- import subprocess
6
- import threading
7
-
8
- MODEL_PATH = "/data/phi-3.gguf"
9
- MODEL_URL = (
10
- "https://huggingface.co/TheBloke/"
11
- "Phi-3-mini-4k-instruct-GGUF/resolve/main/"
12
- "phi-3-mini-4k-instruct.Q4_K_M.gguf"
13
- )
14
 
15
- app = FastAPI(title="llama.cpp Phi-3 API")
 
 
 
 
 
 
16
 
17
- llm = None
18
- lock = threading.Lock()
 
 
 
 
 
19
 
 
 
 
20
 
21
- def ensure_model():
22
- if os.path.exists(MODEL_PATH) and os.path.getsize(MODEL_PATH) > 100_000_000:
23
- return
 
 
 
 
24
 
25
- os.makedirs("/data", exist_ok=True)
 
 
26
 
27
- result = subprocess.run(
28
- ["curl", "-L", "--fail", "--retry", "3", "-o", MODEL_PATH, MODEL_URL],
29
- stdout=subprocess.PIPE,
30
- stderr=subprocess.PIPE,
31
- text=True,
32
- )
33
 
34
- if result.returncode != 0 or not os.path.exists(MODEL_PATH):
35
- raise RuntimeError(f"Model download failed: {result.stderr}")
36
-
37
- if os.path.getsize(MODEL_PATH) < 100_000_000:
38
- raise RuntimeError("Downloaded model file is corrupted or incomplete")
39
-
40
-
41
- def get_llm():
42
- global llm
43
- with lock:
44
- if llm is None:
45
- ensure_model()
46
- llm = Llama(
47
- model_path=MODEL_PATH,
48
- n_ctx=4096,
49
- n_threads=2, # HF Space CPU safe
50
- n_batch=256,
51
- n_gpu_layers=0,
52
- use_mmap=True,
53
- use_mlock=False,
54
- verbose=False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  )
56
- return llm
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
 
 
58
 
59
- class GenerateRequest(BaseModel):
60
- prompt: str
61
- max_tokens: int = 512
62
- temperature: float = 0.1
63
- top_p: float = 0.9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
 
66
  @app.get("/")
67
- def root():
68
  return {
69
- "status": "ok",
70
- "model_loaded": llm is not None,
71
- "model_file_exists": os.path.exists(MODEL_PATH),
 
 
 
 
 
 
 
 
 
72
  }
73
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  @app.post("/generate")
76
- def generate(req: GenerateRequest):
 
 
 
 
77
  try:
78
- model = get_llm()
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  except Exception as e:
80
  raise HTTPException(status_code=500, detail=str(e))
81
 
82
- prompt = (
83
- "<|system|>\n"
84
- "You are an expert software engineer.\n"
85
- "<|user|>\n"
86
- f"{req.prompt}\n"
87
- "<|assistant|>\n"
88
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- output = model(
91
- prompt,
92
- max_tokens=req.max_tokens,
93
- temperature=req.temperature,
94
- top_p=req.top_p,
95
- stop=["<|user|>", "<|system|>"],
96
- echo=False,
97
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  return {
100
- "response": output["choices"][0]["text"].strip()
 
 
101
  }
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
FastAPI service for DeepSeek-Coder backed by llama_cpp.
Fast startup, low memory footprint.
"""
import os
import time
import asyncio
from typing import Optional, List
from contextlib import asynccontextmanager

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from huggingface_hub import hf_hub_download

# Import the llama_cpp core bindings.
# BUG FIX: the original also imported llama_cpp.server.app (create_app,
# Settings) inside this try-block. Those names are never used, and when the
# [server] extras were missing the ImportError wrongly disabled Llama even
# though the core bindings were installed. Import only what is used.
try:
    from llama_cpp import Llama
except ImportError:
    # Fallback when llama_cpp_python is not installed at all;
    # load_model() will then fail with a clear error at first use.
    Llama = None

# ========== CONFIGURATION ==========
MODEL_REPO = "bartowski/DeepSeek-Coder-1.3B-Instruct-GGUF"
MODEL_FILES = [
    "DeepSeek-Coder-1.3B-Instruct-Q4_K_M.gguf",  # ~900MB - good quality/size tradeoff
    "DeepSeek-Coder-1.3B-Instruct-Q4_0.gguf",    # ~900MB
    "DeepSeek-Coder-1.3B-Instruct-Q2_K.gguf",    # ~500MB - lighter fallback
]

# Local directory for downloaded models
MODEL_DIR = "./models"
os.makedirs(MODEL_DIR, exist_ok=True)
35
 
36
# ========== DATA MODELS ==========
class GenerateRequest(BaseModel):
    """Request body for POST /generate (raw completion)."""
    prompt: str = Field(..., min_length=1, max_length=2000)
    temperature: float = Field(0.2, ge=0.1, le=1.0)
    max_tokens: int = Field(256, ge=1, le=1024)
    top_p: float = Field(0.95, ge=0.1, le=1.0)
    stream: bool = False  # reserved; streaming is not implemented yet

class ChatMessage(BaseModel):
    """A single chat turn (role + content)."""
    # BUG FIX: pydantic v2 renamed Field(regex=...) to Field(pattern=...).
    # The Dockerfile installs unpinned (v2) pydantic, where regex= raises
    # a PydanticUserError at import time.
    role: str = Field(..., pattern="^(user|assistant|system)$")
    content: str

class ChatRequest(BaseModel):
    """Request body for POST /chat."""
    messages: List[ChatMessage]
    temperature: float = Field(0.2, ge=0.1, le=1.0)
    max_tokens: int = Field(256, ge=1, le=1024)
    stream: bool = False  # reserved; streaming is not implemented yet
 
53
 
54
# ========== MODEL MANAGEMENT ==========
class ModelManager:
    """Lazily downloads and loads a GGUF model, then runs inference via llama_cpp.

    Attributes:
        llm: loaded ``Llama`` instance, or None until load_model() succeeds.
        model_path: local path of the GGUF file once found/downloaded.
        loading: True while load_model() is in progress.
    """

    def __init__(self):
        self.llm = None
        self.model_path = None
        self.loading = False

    def find_or_download_model(self):
        """Return a local GGUF model path, downloading one if necessary.

        Tries each candidate in MODEL_FILES in order: first any file already
        on disk, then the first one that downloads successfully.

        Raises:
            RuntimeError: if no candidate could be obtained.
        """
        # Reuse an already-downloaded model if present.
        for model_file in MODEL_FILES:
            local_path = os.path.join(MODEL_DIR, model_file)
            if os.path.exists(local_path):
                print(f"✅ Modèle trouvé: {local_path}")
                return local_path

        # Otherwise download the first candidate that works.
        print("📥 Aucun modèle local, téléchargement...")
        for model_file in MODEL_FILES:
            try:
                print(f" Essai: {model_file}")
                # NOTE: local_dir_use_symlinks / resume_download are deprecated
                # in recent huggingface_hub; local_dir alone yields real files
                # and downloads resume by default.
                local_path = hf_hub_download(
                    repo_id=MODEL_REPO,
                    filename=model_file,
                    local_dir=MODEL_DIR,
                )
                print(f"✅ Téléchargé: {model_file}")
                return local_path
            except Exception as e:
                # Best-effort: fall through to the next (smaller) candidate.
                print(f" ❌ {model_file}: {str(e)[:100]}")
                continue

        # BUG FIX: raise a specific type instead of bare Exception so callers
        # can distinguish this failure; RuntimeError is still an Exception,
        # so existing broad handlers keep working.
        raise RuntimeError("❌ Aucun modèle disponible")

    def load_model(self):
        """Load the GGUF model with llama_cpp (idempotent; returns the Llama)."""
        if self.llm is not None:
            return self.llm

        print("🔧 Chargement du modèle...")
        self.loading = True

        try:
            self.model_path = self.find_or_download_model()

            # Tuned for a Hugging Face CPU Space (~16GB RAM).
            n_gpu_layers = -1  # offload all layers if a GPU build is present
            n_threads = 4      # CPU threads
            n_ctx = 2048       # small context to keep RAM usage down

            print(f"🔄 Chargement depuis: {self.model_path}")
            print(f"⚙️ Configuration: GPU layers={n_gpu_layers}, Threads={n_threads}, Context={n_ctx}")

            self.llm = Llama(
                model_path=self.model_path,
                n_ctx=n_ctx,
                n_threads=n_threads,
                n_gpu_layers=n_gpu_layers,
                verbose=False
            )

            print("✅ Modèle chargé avec succès!")
            return self.llm

        except Exception as e:
            print(f"❌ Erreur chargement modèle: {e}")
            raise
        finally:
            # Always clear the flag, whatever the outcome (original reset it
            # in two separate places).
            self.loading = False

    def generate(self, prompt: str, temperature: float = 0.2, max_tokens: int = 256, top_p: float = 0.95):
        """Run a raw completion and return the generated text.

        Raises:
            HTTPException(500): on any llama_cpp failure.
        """
        if self.llm is None:
            self.load_model()

        try:
            output = self.llm(
                prompt=prompt,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=top_p,
                # NOTE(review): stopping at "```" truncates fenced code blocks
                # from a code model — confirm this is intended.
                stop=["</s>", "```"],
                echo=False
            )
            return output["choices"][0]["text"]
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Generation error: {str(e)}")

    def chat(self, messages: List[dict], temperature: float = 0.2, max_tokens: int = 256):
        """Run a chat-style completion over role/content message dicts.

        Raises:
            HTTPException(500): on any llama_cpp failure.
        """
        if self.llm is None:
            self.load_model()

        # Flatten the conversation into the DeepSeek-Coder prompt format.
        formatted_prompt = self.format_chat_prompt(messages)

        try:
            output = self.llm(
                prompt=formatted_prompt,
                temperature=temperature,
                max_tokens=max_tokens,
                stop=["</s>", "```"],
                echo=False
            )
            return output["choices"][0]["text"]
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Chat error: {str(e)}")

    def format_chat_prompt(self, messages: List[dict]) -> str:
        """Serialize messages into the <|role|> ... <|end|> prompt format,
        ending with an open assistant turn for the model to complete."""
        prompt = ""
        for msg in messages:
            role = msg["role"]
            content = msg["content"]

            if role == "system":
                prompt += f"<|system|>\n{content}\n<|end|>\n"
            elif role == "user":
                prompt += f"<|user|>\n{content}\n<|end|>\n"
            elif role == "assistant":
                prompt += f"<|assistant|>\n{content}\n<|end|>\n"

        prompt += "<|assistant|>\n"
        return prompt
186
 
187
# ========== APPLICATION LIFECYCLE ==========
model_manager = ModelManager()

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage app startup/shutdown: start model loading in the background."""
    print("🚀 Démarrage de l'API llama_cpp...")

    async def load_model_async():
        try:
            # BUG FIX: load_model() blocks for a long time (GB-scale download
            # + model load). Calling it directly in this coroutine froze the
            # event loop, so every request stalled during startup. Running it
            # in a worker thread keeps the server responsive.
            await asyncio.to_thread(model_manager.load_model)
        except Exception as e:
            print(f"⚠️ Erreur chargement modèle: {e}")

    # Fire-and-forget: do not block startup on the model.
    asyncio.create_task(load_model_async())

    yield

    # Shutdown cleanup (llama_cpp releases its resources on GC).
    if model_manager.llm:
        print("🧹 Nettoyage...")
 
212
# ========== FASTAPI APPLICATION ==========
_API_META = {
    "title": "🚀 DeepSeek-Coder 1.3B API (llama_cpp)",
    "description": "API ultra-rapide avec llama_cpp_python",
    "version": "2.0.0",
    "docs_url": "/docs",
    "redoc_url": "/redoc",
}
app = FastAPI(lifespan=lifespan, **_API_META)

# Wide-open CORS: this API is meant to be callable from any web page.
_ANY = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=_ANY,
    allow_credentials=True,
    allow_methods=_ANY,
    allow_headers=_ANY,
)
230
 
231
# ========== API ROUTES ==========
@app.get("/")
async def root():
    """Landing endpoint: service metadata and an endpoint directory."""
    model_ready = model_manager.llm is not None
    endpoints = {
        "generate": "POST /generate",
        "chat": "POST /chat",
        "health": "GET /health",
        "models": "GET /models",
    }
    return {
        "message": "🚀 DeepSeek-Coder 1.3B API",
        "backend": "llama_cpp_python",
        "status": "ready" if model_ready else "loading",
        "model_size": "1.3B",
        "format": "GGUF (4-bit quantized)",
        "endpoints": endpoints,
        "performance": "~5-10 tokens/sec sur CPU",
    }
248
 
249
@app.get("/health")
async def health():
    """Liveness/readiness probe reporting the model-loading state."""
    report = {
        "status": "healthy",
        "model_loaded": model_manager.llm is not None,
        "model_loading": model_manager.loading,
        "model_path": model_manager.model_path,
        "timestamp": time.time(),
    }
    return report
259
 
260
@app.post("/generate")
async def generate(request: GenerateRequest):
    """Generate a raw completion for the given prompt.

    Returns:
        503 while the model is still loading, 500 on generation errors,
        otherwise the generated text plus metadata.
    """
    if model_manager.loading:
        raise HTTPException(status_code=503, detail="Model is still loading...")

    try:
        response = model_manager.generate(
            prompt=request.prompt,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            top_p=request.top_p
        )

        return {
            "response": response,
            "model": "deepseek-coder-1.3b",
            # NOTE: whitespace word count, not a true token count.
            "tokens_generated": len(response.split()),
            "backend": "llama_cpp"
        }

    except HTTPException:
        # BUG FIX: the model layer already raises HTTPException(500, ...);
        # the broad handler below re-wrapped it as "500: 500: ...". Let it
        # propagate unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
283
 
284
@app.post("/chat")
async def chat(request: ChatRequest):
    """Chat-style completion over a list of role/content messages.

    Returns:
        503 while the model is still loading, 500 on generation errors,
        otherwise the assistant's reply plus metadata.
    """
    if model_manager.loading:
        raise HTTPException(status_code=503, detail="Model is still loading...")

    try:
        # Plain dicts for the model layer ({"role": ..., "content": ...}).
        # (pydantic v2 deprecates .dict() in favour of .model_dump(); .dict()
        # still works on both major versions, so it is kept for compatibility.)
        messages = [msg.dict() for msg in request.messages]

        response = model_manager.chat(
            messages=messages,
            temperature=request.temperature,
            max_tokens=request.max_tokens
        )

        return {
            "response": response,
            "model": "deepseek-coder-1.3b-instruct",
            "backend": "llama_cpp"
        }

    except HTTPException:
        # BUG FIX: let model-layer HTTPExceptions through unchanged instead of
        # double-wrapping them in a generic 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
308
 
309
@app.get("/models")
async def list_models():
    """List the locally known model file and whether it is loaded."""
    models = []
    path = model_manager.model_path
    if path:
        size_mb = os.path.getsize(path) / 1024 / 1024 if os.path.exists(path) else 0
        models.append({
            "name": "deepseek-coder-1.3b",
            "path": path,
            "size_mb": size_mb,
            "loaded": model_manager.llm is not None,
        })

    return {"models": models}
+
323
@app.get("/demo")
async def demo():
    """Return ready-to-run curl examples for the main endpoints."""
    generate_example = {
        "endpoint": "POST /generate",
        "curl": 'curl -X POST https://your-api.space/generate -H "Content-Type: application/json" -d \'{"prompt": "def fibonacci(n):", "temperature": 0.2}\'',
    }
    chat_example = {
        "endpoint": "POST /chat",
        "curl": 'curl -X POST https://your-api.space/chat -H "Content-Type: application/json" -d \'{"messages": [{"role": "user", "content": "Write Python code for binary search"}], "temperature": 0.2}\'',
    }
    return {"examples": [generate_example, chat_example]}
337
 
338
# ========== OLLAMA COMPATIBILITY ==========
@app.post("/api/generate")
async def ollama_generate(request: dict):
    """Ollama-compatible generation endpoint (subset of the Ollama API)."""
    # BUG FIX: mirror the 503 guard used by /generate and /chat so clients
    # get an explicit "loading" error instead of a blocking first call.
    if model_manager.loading:
        raise HTTPException(status_code=503, detail="Model is still loading...")

    prompt = request.get("prompt", "")
    model = request.get("model", "deepseek-coder-1.3b")

    response = model_manager.generate(
        prompt=prompt,
        temperature=request.get("temperature", 0.2),
        max_tokens=request.get("max_tokens", 256)
    )

    return {
        "model": model,
        "response": response,
        "done": True
    }
356
 
357
# ========== ENTRY POINT ==========
if __name__ == "__main__":
    import uvicorn

    # Eagerly load the model; if that fails, fall back to lazy loading
    # on the first request.
    try:
        model_manager.load_model()
    except Exception as e:
        print(f"⚠️ Note: {e}")
        print("🔄 Le modèle se chargera à la première requête")

    port = int(os.getenv("PORT", 7860))
    print(f"🌐 API démarrée sur http://0.0.0.0:{port}")
    uvicorn.run(app, host="0.0.0.0", port=port)
download_model.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Download the DeepSeek-Coder model in GGUF format.
"""
from huggingface_hub import hf_hub_download
import os

# Configuration
MODEL_REPO = "bartowski/DeepSeek-Coder-1.3B-Instruct-GGUF"
MODEL_FILE = "DeepSeek-Coder-1.3B-Instruct-Q4_K_M.gguf"
LOCAL_PATH = "./models"

def download_model():
    """Download the GGUF model, falling back to a smaller quantization.

    Returns:
        The local path of the downloaded file, or None if every attempt failed.
    """
    os.makedirs(LOCAL_PATH, exist_ok=True)

    print(f"📥 Téléchargement de {MODEL_FILE}...")

    try:
        # NOTE: local_dir_use_symlinks / resume_download are deprecated in
        # recent huggingface_hub; local_dir alone is sufficient and resume
        # is the default behavior.
        model_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILE,
            local_dir=LOCAL_PATH,
        )

        print(f"✅ Modèle téléchargé: {model_path}")
        print(f"📊 Taille: {os.path.getsize(model_path) / 1024 / 1024:.2f} MB")

        return model_path

    except Exception as e:
        print(f"❌ Erreur: {e}")

        # Fallback: smaller Q2_K quantization from a mirror repo.
        print("🔄 Téléchargement d'un modèle plus petit...")
        try:
            model_path = hf_hub_download(
                repo_id="TheBloke/DeepSeek-Coder-1.3B-Instruct-GGUF",
                filename="deepseek-coder-1.3b-instruct.Q2_K.gguf",
                local_dir=LOCAL_PATH,
            )
            print(f"✅ Modèle de secours téléchargé")
            return model_path
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; catch Exception only.
            print("❌ Impossible de télécharger aucun modèle")
            return None

if __name__ == "__main__":
    download_model()