pakito312 commited on
Commit
ca1c16e
·
1 Parent(s): 77da021
Files changed (3) hide show
  1. Dockerfile +25 -11
  2. api.py +343 -74
  3. download_model.py +51 -0
Dockerfile CHANGED
@@ -1,21 +1,35 @@
1
  FROM python:3.10-slim
2
 
3
- ENV PYTHONUNBUFFERED=1
4
- ENV HF_HOME=/data
5
- ENV LLAMA_CPP_VERBOSE=0
6
-
7
- WORKDIR /app
8
-
9
  RUN apt-get update && apt-get install -y \
10
  build-essential \
 
 
11
  curl \
12
- libstdc++6 \
13
  && rm -rf /var/lib/apt/lists/*
14
 
15
- COPY requirements.txt .
16
- RUN pip install --no-cache-dir -r requirements.txt
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- COPY api.py .
 
 
 
 
 
19
 
20
  EXPOSE 7860
21
- CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
1
FROM python:3.10-slim

# Install system build dependencies (cmake/git needed to build llama-cpp-python)
RUN apt-get update && apt-get install -y \
    build-essential \
    cmake \
    git \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Install llama-cpp-python (with server extras) and the API dependencies
RUN pip install --no-cache-dir \
    llama-cpp-python[server] \
    fastapi \
    uvicorn \
    pydantic \
    requests \
    huggingface-hub

# Create a non-root user (required by Hugging Face Spaces)
RUN useradd -m -u 1000 user
USER user
WORKDIR /home/user

# Copy the application
COPY --chown=user:user api.py .
COPY --chown=user:user download_model.py .

# Optionally pre-download the GGUF model at build time
# RUN python download_model.py

EXPOSE 7860

# Start the server.
# BUG FIX: the module copied above is api.py, so the ASGI path is "api:app";
# the previous "app:app" made the container fail at startup (no app.py exists).
CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860"]
api.py CHANGED
@@ -1,102 +1,371 @@
1
- from fastapi import FastAPI, HTTPException
2
- from pydantic import BaseModel
3
- from llama_cpp import Llama
 
4
  import os
5
- import subprocess
6
- import threading
7
-
8
- MODEL_PATH = "/data/phi-3.gguf"
9
- MODEL_URL = (
10
- "https://huggingface.co/TheBloke/"
11
- "Phi-3-mini-4k-instruct-GGUF/resolve/main/"
12
- "phi-3-mini-4k-instruct.Q4_K_M.gguf"
13
- )
14
 
15
- app = FastAPI(title="llama.cpp Phi-3 API")
 
 
 
 
 
 
16
 
17
- llm = None
18
- lock = threading.Lock()
 
 
 
 
 
19
 
 
 
 
20
 
21
- def ensure_model():
22
- if os.path.exists(MODEL_PATH) and os.path.getsize(MODEL_PATH) > 100_000_000:
23
- return
 
 
 
 
24
 
25
- os.makedirs("/data", exist_ok=True)
 
 
26
 
27
- result = subprocess.run(
28
- ["curl", "-L", "--fail", "--retry", "3", "-o", MODEL_PATH, MODEL_URL],
29
- stdout=subprocess.PIPE,
30
- stderr=subprocess.PIPE,
31
- text=True,
32
- )
33
 
34
- if result.returncode != 0 or not os.path.exists(MODEL_PATH):
35
- raise RuntimeError(f"Model download failed: {result.stderr}")
36
-
37
- if os.path.getsize(MODEL_PATH) < 100_000_000:
38
- raise RuntimeError("Downloaded model file is corrupted or incomplete")
39
-
40
-
41
- def get_llm():
42
- global llm
43
- with lock:
44
- if llm is None:
45
- ensure_model()
46
- llm = Llama(
47
- model_path=MODEL_PATH,
48
- n_ctx=4096,
49
- n_threads=2, # HF Space CPU safe
50
- n_batch=256,
51
- n_gpu_layers=0,
52
- use_mmap=True,
53
- use_mlock=False,
54
- verbose=False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  )
56
- return llm
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
 
 
58
 
59
- class GenerateRequest(BaseModel):
60
- prompt: str
61
- max_tokens: int = 512
62
- temperature: float = 0.1
63
- top_p: float = 0.9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
 
66
  @app.get("/")
67
- def root():
68
  return {
69
- "status": "ok",
70
- "model_loaded": llm is not None,
71
- "model_file_exists": os.path.exists(MODEL_PATH),
 
 
 
 
 
 
 
 
 
72
  }
73
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  @app.post("/generate")
76
- def generate(req: GenerateRequest):
 
 
 
 
77
  try:
78
- model = get_llm()
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  except Exception as e:
80
  raise HTTPException(status_code=500, detail=str(e))
81
 
82
- prompt = (
83
- "<|system|>\n"
84
- "You are an expert software engineer.\n"
85
- "<|user|>\n"
86
- f"{req.prompt}\n"
87
- "<|assistant|>\n"
88
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- output = model(
91
- prompt,
92
- max_tokens=req.max_tokens,
93
- temperature=req.temperature,
94
- top_p=req.top_p,
95
- stop=["<|user|>", "<|system|>"],
96
- echo=False,
97
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  return {
100
- "response": output["choices"][0]["text"].strip()
 
 
101
  }
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
FastAPI service for DeepSeek-Coder backed by llama_cpp.
Fast startup, low memory footprint.
"""
import os
import time
import asyncio
from typing import Optional, List
from contextlib import asynccontextmanager

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from huggingface_hub import hf_hub_download

# Import the llama_cpp core bindings.
# BUG FIX: the original also imported llama_cpp.server.app (create_app,
# Settings) inside this try-block. Those names are never used, and when the
# [server] extras were missing the ImportError wrongly disabled Llama even
# though the core bindings were installed. Import only what is used.
try:
    from llama_cpp import Llama
except ImportError:
    # Fallback when llama_cpp_python is not installed at all;
    # load_model() will then fail with a clear error at first use.
    Llama = None

# ========== CONFIGURATION ==========
MODEL_REPO = "bartowski/DeepSeek-Coder-1.3B-Instruct-GGUF"
MODEL_FILES = [
    "DeepSeek-Coder-1.3B-Instruct-Q4_K_M.gguf",  # ~900MB - good quality/size tradeoff
    "DeepSeek-Coder-1.3B-Instruct-Q4_0.gguf",    # ~900MB
    "DeepSeek-Coder-1.3B-Instruct-Q2_K.gguf",    # ~500MB - lighter fallback
]

# Local directory for downloaded models
MODEL_DIR = "./models"
os.makedirs(MODEL_DIR, exist_ok=True)
35
 
36
# ========== DATA MODELS ==========
class GenerateRequest(BaseModel):
    """Request body for POST /generate (raw completion)."""
    prompt: str = Field(..., min_length=1, max_length=2000)
    temperature: float = Field(0.2, ge=0.1, le=1.0)
    max_tokens: int = Field(256, ge=1, le=1024)
    top_p: float = Field(0.95, ge=0.1, le=1.0)
    stream: bool = False  # reserved; streaming is not implemented yet

class ChatMessage(BaseModel):
    """A single chat turn (role + content)."""
    # BUG FIX: pydantic v2 renamed Field(regex=...) to Field(pattern=...).
    # The Dockerfile installs unpinned (v2) pydantic, where regex= raises
    # a PydanticUserError at import time.
    role: str = Field(..., pattern="^(user|assistant|system)$")
    content: str

class ChatRequest(BaseModel):
    """Request body for POST /chat."""
    messages: List[ChatMessage]
    temperature: float = Field(0.2, ge=0.1, le=1.0)
    max_tokens: int = Field(256, ge=1, le=1024)
    stream: bool = False  # reserved; streaming is not implemented yet
 
53
 
54
# ========== MODEL MANAGEMENT ==========
class ModelManager:
    """Lazily downloads and loads a GGUF model, then runs inference via llama_cpp.

    Attributes:
        llm: loaded ``Llama`` instance, or None until load_model() succeeds.
        model_path: local path of the GGUF file once found/downloaded.
        loading: True while load_model() is in progress.
    """

    def __init__(self):
        self.llm = None
        self.model_path = None
        self.loading = False

    def find_or_download_model(self):
        """Return a local GGUF model path, downloading one if necessary.

        Tries each candidate in MODEL_FILES in order: first any file already
        on disk, then the first one that downloads successfully.

        Raises:
            RuntimeError: if no candidate could be obtained.
        """
        # Reuse an already-downloaded model if present.
        for model_file in MODEL_FILES:
            local_path = os.path.join(MODEL_DIR, model_file)
            if os.path.exists(local_path):
                print(f"✅ Modèle trouvé: {local_path}")
                return local_path

        # Otherwise download the first candidate that works.
        print("📥 Aucun modèle local, téléchargement...")
        for model_file in MODEL_FILES:
            try:
                print(f" Essai: {model_file}")
                # NOTE: local_dir_use_symlinks / resume_download are deprecated
                # in recent huggingface_hub; local_dir alone yields real files
                # and downloads resume by default.
                local_path = hf_hub_download(
                    repo_id=MODEL_REPO,
                    filename=model_file,
                    local_dir=MODEL_DIR,
                )
                print(f"✅ Téléchargé: {model_file}")
                return local_path
            except Exception as e:
                # Best-effort: fall through to the next (smaller) candidate.
                print(f" ❌ {model_file}: {str(e)[:100]}")
                continue

        # BUG FIX: raise a specific type instead of bare Exception so callers
        # can distinguish this failure; RuntimeError is still an Exception,
        # so existing broad handlers keep working.
        raise RuntimeError("❌ Aucun modèle disponible")

    def load_model(self):
        """Load the GGUF model with llama_cpp (idempotent; returns the Llama)."""
        if self.llm is not None:
            return self.llm

        print("🔧 Chargement du modèle...")
        self.loading = True

        try:
            self.model_path = self.find_or_download_model()

            # Tuned for a Hugging Face CPU Space (~16GB RAM).
            n_gpu_layers = -1  # offload all layers if a GPU build is present
            n_threads = 4      # CPU threads
            n_ctx = 2048       # small context to keep RAM usage down

            print(f"🔄 Chargement depuis: {self.model_path}")
            print(f"⚙️ Configuration: GPU layers={n_gpu_layers}, Threads={n_threads}, Context={n_ctx}")

            self.llm = Llama(
                model_path=self.model_path,
                n_ctx=n_ctx,
                n_threads=n_threads,
                n_gpu_layers=n_gpu_layers,
                verbose=False
            )

            print("✅ Modèle chargé avec succès!")
            return self.llm

        except Exception as e:
            print(f"❌ Erreur chargement modèle: {e}")
            raise
        finally:
            # Always clear the flag, whatever the outcome (original reset it
            # in two separate places).
            self.loading = False

    def generate(self, prompt: str, temperature: float = 0.2, max_tokens: int = 256, top_p: float = 0.95):
        """Run a raw completion and return the generated text.

        Raises:
            HTTPException(500): on any llama_cpp failure.
        """
        if self.llm is None:
            self.load_model()

        try:
            output = self.llm(
                prompt=prompt,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=top_p,
                # NOTE(review): stopping at "```" truncates fenced code blocks
                # from a code model — confirm this is intended.
                stop=["</s>", "```"],
                echo=False
            )
            return output["choices"][0]["text"]
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Generation error: {str(e)}")

    def chat(self, messages: List[dict], temperature: float = 0.2, max_tokens: int = 256):
        """Run a chat-style completion over role/content message dicts.

        Raises:
            HTTPException(500): on any llama_cpp failure.
        """
        if self.llm is None:
            self.load_model()

        # Flatten the conversation into the DeepSeek-Coder prompt format.
        formatted_prompt = self.format_chat_prompt(messages)

        try:
            output = self.llm(
                prompt=formatted_prompt,
                temperature=temperature,
                max_tokens=max_tokens,
                stop=["</s>", "```"],
                echo=False
            )
            return output["choices"][0]["text"]
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Chat error: {str(e)}")

    def format_chat_prompt(self, messages: List[dict]) -> str:
        """Serialize messages into the <|role|> ... <|end|> prompt format,
        ending with an open assistant turn for the model to complete."""
        prompt = ""
        for msg in messages:
            role = msg["role"]
            content = msg["content"]

            if role == "system":
                prompt += f"<|system|>\n{content}\n<|end|>\n"
            elif role == "user":
                prompt += f"<|user|>\n{content}\n<|end|>\n"
            elif role == "assistant":
                prompt += f"<|assistant|>\n{content}\n<|end|>\n"

        prompt += "<|assistant|>\n"
        return prompt
186
 
187
# ========== APPLICATION LIFECYCLE ==========
model_manager = ModelManager()

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage app startup/shutdown: start model loading in the background."""
    print("🚀 Démarrage de l'API llama_cpp...")

    async def load_model_async():
        try:
            # BUG FIX: load_model() blocks for a long time (GB-scale download
            # + model load). Calling it directly in this coroutine froze the
            # event loop, so every request stalled during startup. Running it
            # in a worker thread keeps the server responsive.
            await asyncio.to_thread(model_manager.load_model)
        except Exception as e:
            print(f"⚠️ Erreur chargement modèle: {e}")

    # Fire-and-forget: do not block startup on the model.
    asyncio.create_task(load_model_async())

    yield

    # Shutdown cleanup (llama_cpp releases its resources on GC).
    if model_manager.llm:
        print("🧹 Nettoyage...")
 
212
# ========== FASTAPI APPLICATION ==========
_API_META = {
    "title": "🚀 DeepSeek-Coder 1.3B API (llama_cpp)",
    "description": "API ultra-rapide avec llama_cpp_python",
    "version": "2.0.0",
    "docs_url": "/docs",
    "redoc_url": "/redoc",
}
app = FastAPI(lifespan=lifespan, **_API_META)

# Wide-open CORS: this API is meant to be callable from any web page.
_ANY = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=_ANY,
    allow_credentials=True,
    allow_methods=_ANY,
    allow_headers=_ANY,
)
230
 
231
# ========== API ROUTES ==========
@app.get("/")
async def root():
    """Landing endpoint: service metadata and an endpoint directory."""
    model_ready = model_manager.llm is not None
    endpoints = {
        "generate": "POST /generate",
        "chat": "POST /chat",
        "health": "GET /health",
        "models": "GET /models",
    }
    return {
        "message": "🚀 DeepSeek-Coder 1.3B API",
        "backend": "llama_cpp_python",
        "status": "ready" if model_ready else "loading",
        "model_size": "1.3B",
        "format": "GGUF (4-bit quantized)",
        "endpoints": endpoints,
        "performance": "~5-10 tokens/sec sur CPU",
    }
248
 
249
@app.get("/health")
async def health():
    """Liveness/readiness probe reporting the model-loading state."""
    report = {
        "status": "healthy",
        "model_loaded": model_manager.llm is not None,
        "model_loading": model_manager.loading,
        "model_path": model_manager.model_path,
        "timestamp": time.time(),
    }
    return report
259
 
260
@app.post("/generate")
async def generate(request: GenerateRequest):
    """Generate a raw completion for the given prompt.

    Returns:
        503 while the model is still loading, 500 on generation errors,
        otherwise the generated text plus metadata.
    """
    if model_manager.loading:
        raise HTTPException(status_code=503, detail="Model is still loading...")

    try:
        response = model_manager.generate(
            prompt=request.prompt,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            top_p=request.top_p
        )

        return {
            "response": response,
            "model": "deepseek-coder-1.3b",
            # NOTE: whitespace word count, not a true token count.
            "tokens_generated": len(response.split()),
            "backend": "llama_cpp"
        }

    except HTTPException:
        # BUG FIX: the model layer already raises HTTPException(500, ...);
        # the broad handler below re-wrapped it as "500: 500: ...". Let it
        # propagate unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
283
 
284
@app.post("/chat")
async def chat(request: ChatRequest):
    """Chat-style completion over a list of role/content messages.

    Returns:
        503 while the model is still loading, 500 on generation errors,
        otherwise the assistant's reply plus metadata.
    """
    if model_manager.loading:
        raise HTTPException(status_code=503, detail="Model is still loading...")

    try:
        # Plain dicts for the model layer ({"role": ..., "content": ...}).
        # (pydantic v2 deprecates .dict() in favour of .model_dump(); .dict()
        # still works on both major versions, so it is kept for compatibility.)
        messages = [msg.dict() for msg in request.messages]

        response = model_manager.chat(
            messages=messages,
            temperature=request.temperature,
            max_tokens=request.max_tokens
        )

        return {
            "response": response,
            "model": "deepseek-coder-1.3b-instruct",
            "backend": "llama_cpp"
        }

    except HTTPException:
        # BUG FIX: let model-layer HTTPExceptions through unchanged instead of
        # double-wrapping them in a generic 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
308
 
309
@app.get("/models")
async def list_models():
    """List the locally known model file and whether it is loaded."""
    models = []
    path = model_manager.model_path
    if path:
        size_mb = os.path.getsize(path) / 1024 / 1024 if os.path.exists(path) else 0
        models.append({
            "name": "deepseek-coder-1.3b",
            "path": path,
            "size_mb": size_mb,
            "loaded": model_manager.llm is not None,
        })

    return {"models": models}
+
323
@app.get("/demo")
async def demo():
    """Return ready-to-run curl examples for the main endpoints."""
    generate_example = {
        "endpoint": "POST /generate",
        "curl": 'curl -X POST https://your-api.space/generate -H "Content-Type: application/json" -d \'{"prompt": "def fibonacci(n):", "temperature": 0.2}\'',
    }
    chat_example = {
        "endpoint": "POST /chat",
        "curl": 'curl -X POST https://your-api.space/chat -H "Content-Type: application/json" -d \'{"messages": [{"role": "user", "content": "Write Python code for binary search"}], "temperature": 0.2}\'',
    }
    return {"examples": [generate_example, chat_example]}
337
 
338
# ========== OLLAMA COMPATIBILITY ==========
@app.post("/api/generate")
async def ollama_generate(request: dict):
    """Ollama-compatible generation endpoint (subset of the Ollama API)."""
    # BUG FIX: mirror the 503 guard used by /generate and /chat so clients
    # get an explicit "loading" error instead of a blocking first call.
    if model_manager.loading:
        raise HTTPException(status_code=503, detail="Model is still loading...")

    prompt = request.get("prompt", "")
    model = request.get("model", "deepseek-coder-1.3b")

    response = model_manager.generate(
        prompt=prompt,
        temperature=request.get("temperature", 0.2),
        max_tokens=request.get("max_tokens", 256)
    )

    return {
        "model": model,
        "response": response,
        "done": True
    }
356
 
357
# ========== ENTRY POINT ==========
if __name__ == "__main__":
    import uvicorn

    # Eagerly load the model; if that fails, fall back to lazy loading
    # on the first request.
    try:
        model_manager.load_model()
    except Exception as e:
        print(f"⚠️ Note: {e}")
        print("🔄 Le modèle se chargera à la première requête")

    port = int(os.getenv("PORT", 7860))
    print(f"🌐 API démarrée sur http://0.0.0.0:{port}")
    uvicorn.run(app, host="0.0.0.0", port=port)
download_model.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Download the DeepSeek-Coder model in GGUF format.
"""
from huggingface_hub import hf_hub_download
import os

# Configuration
MODEL_REPO = "bartowski/DeepSeek-Coder-1.3B-Instruct-GGUF"
MODEL_FILE = "DeepSeek-Coder-1.3B-Instruct-Q4_K_M.gguf"
LOCAL_PATH = "./models"

def download_model():
    """Download the GGUF model, falling back to a smaller quantization.

    Returns:
        The local path of the downloaded file, or None if every attempt failed.
    """
    os.makedirs(LOCAL_PATH, exist_ok=True)

    print(f"📥 Téléchargement de {MODEL_FILE}...")

    try:
        # NOTE: local_dir_use_symlinks / resume_download are deprecated in
        # recent huggingface_hub; local_dir alone is sufficient and resume
        # is the default behavior.
        model_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILE,
            local_dir=LOCAL_PATH,
        )

        print(f"✅ Modèle téléchargé: {model_path}")
        print(f"📊 Taille: {os.path.getsize(model_path) / 1024 / 1024:.2f} MB")

        return model_path

    except Exception as e:
        print(f"❌ Erreur: {e}")

        # Fallback: smaller Q2_K quantization from a mirror repo.
        print("🔄 Téléchargement d'un modèle plus petit...")
        try:
            model_path = hf_hub_download(
                repo_id="TheBloke/DeepSeek-Coder-1.3B-Instruct-GGUF",
                filename="deepseek-coder-1.3b-instruct.Q2_K.gguf",
                local_dir=LOCAL_PATH,
            )
            print(f"✅ Modèle de secours téléchargé")
            return model_path
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; catch Exception only.
            print("❌ Impossible de télécharger aucun modèle")
            return None

if __name__ == "__main__":
    download_model()