# NOTE: "Spaces: Sleeping" status banner — non-code residue from the
# Hugging Face Space page scrape; converted to a comment so the file parses.
| """ | |
| API FastAPI pour DeepSeek-Coder avec llama_cpp | |
| Démarrage rapide, faible mémoire | |
| """ | |
import asyncio
import os
import time
from contextlib import asynccontextmanager
from typing import List, Literal, Optional

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from huggingface_hub import hf_hub_download
from pydantic import BaseModel, Field

# Optional dependency: llama_cpp_python may be absent in some environments.
try:
    from llama_cpp import Llama
    from llama_cpp.server.app import create_app, Settings
except ImportError:
    # Fallback if llama_cpp_python is not installed
    Llama = None
# ========== CONFIGURATION ==========
# Hugging Face repo holding the quantized GGUF builds of the model.
MODEL_REPO = "bartowski/DeepSeek-Coder-1.3B-Instruct-GGUF"
# Candidate files, tried in order of preference (size/quality trade-off).
MODEL_FILES = [
    "DeepSeek-Coder-1.3B-Instruct-Q4_K_M.gguf",  # ~900MB - good compromise
    "DeepSeek-Coder-1.3B-Instruct-Q4_0.gguf",    # ~900MB
    "DeepSeek-Coder-1.3B-Instruct-Q2_K.gguf",    # ~500MB - lighter
]
# Local directory for downloaded models; created eagerly at import time.
MODEL_DIR = "./models"
os.makedirs(MODEL_DIR, exist_ok=True)
# ========== DATA MODELS ==========
class GenerateRequest(BaseModel):
    """Request payload for POST /generate (plain prompt completion)."""

    prompt: str = Field(..., min_length=1, max_length=2000)
    temperature: float = Field(0.2, ge=0.1, le=1.0)  # low default suits code generation
    max_tokens: int = Field(256, ge=1, le=1024)
    top_p: float = Field(0.95, ge=0.1, le=1.0)
    stream: bool = False  # accepted but not honored by the current handlers
class ChatMessage(BaseModel):
    """One chat turn: a constrained role plus free-form content."""

    # Literal keeps validation identical across pydantic v1 and v2:
    # Field(regex=...) is v1-only and was removed in v2 (renamed pattern=).
    role: Literal["user", "assistant", "system"]
    content: str
class ChatRequest(BaseModel):
    """Request payload for POST /chat (multi-turn conversation)."""

    messages: List[ChatMessage]
    temperature: float = Field(0.2, ge=0.1, le=1.0)
    max_tokens: int = Field(256, ge=1, le=1024)
    stream: bool = False  # accepted but not honored by the current handlers
# ========== MODEL MANAGEMENT ==========
class ModelManager:
    """Lazily downloads a GGUF model and runs generation through llama_cpp."""

    def __init__(self):
        self.llm = None          # Llama instance once loaded
        self.model_path = None   # path of the GGUF file in use
        self.loading = False     # True while load_model() is in progress

    def find_or_download_model(self):
        """Return a local GGUF path, downloading the first working candidate if needed.

        Raises:
            RuntimeError: if no candidate file exists locally or can be downloaded.
        """
        # Reuse an already-downloaded model if one is present.
        for model_file in MODEL_FILES:
            local_path = os.path.join(MODEL_DIR, model_file)
            if os.path.exists(local_path):
                print(f"✅ Modèle trouvé: {local_path}")
                return local_path
        # Nothing local: try each candidate in preference order.
        print("📥 Aucun modèle local, téléchargement...")
        for model_file in MODEL_FILES:
            try:
                print(f" Essai: {model_file}")
                local_path = hf_hub_download(
                    repo_id=MODEL_REPO,
                    filename=model_file,
                    local_dir=MODEL_DIR,
                    # NOTE(review): both kwargs below are deprecated in recent
                    # huggingface_hub releases (symlinks kwarg is ignored and
                    # resuming is the default) — kept for older versions; confirm
                    # the pinned huggingface_hub version before removing.
                    local_dir_use_symlinks=False,
                    resume_download=True,
                )
                print(f"✅ Téléchargé: {model_file}")
                return local_path
            except Exception as e:
                print(f" ❌ {model_file}: {str(e)[:100]}")
                continue
        # RuntimeError is an Exception subclass, so existing `except Exception`
        # callers keep working.
        raise RuntimeError("❌ Aucun modèle disponible")

    def load_model(self):
        """Load the GGUF model with llama_cpp (idempotent); return the Llama instance."""
        if self.llm is not None:
            return self.llm
        print("🔧 Chargement du modèle...")
        self.loading = True
        try:
            self.model_path = self.find_or_download_model()
            # Settings tuned for a Hugging Face Space (~16GB RAM):
            n_gpu_layers = -1  # offload all layers to GPU when one is available
            n_threads = 4      # CPU threads
            n_ctx = 2048       # limited context window to save RAM
            print(f"🔄 Chargement depuis: {self.model_path}")
            print(f"⚙️ Configuration: GPU layers={n_gpu_layers}, Threads={n_threads}, Context={n_ctx}")
            self.llm = Llama(
                model_path=self.model_path,
                n_ctx=n_ctx,
                n_threads=n_threads,
                n_gpu_layers=n_gpu_layers,
                verbose=False,
            )
            print("✅ Modèle chargé avec succès!")
            return self.llm
        except Exception as e:
            print(f"❌ Erreur chargement modèle: {e}")
            raise
        finally:
            # Clear the flag on every exit path (success or failure).
            self.loading = False

    def generate(self, prompt: str, temperature: float = 0.2, max_tokens: int = 256, top_p: float = 0.95):
        """Run plain completion and return the generated text.

        Raises:
            HTTPException(500): wrapping any llama_cpp failure.
        """
        if self.llm is None:
            self.load_model()
        try:
            output = self.llm(
                prompt=prompt,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=top_p,
                # NOTE(review): stopping on "```" truncates fenced code blocks —
                # confirm this is intended for a code-generation model.
                stop=["</s>", "```"],
                echo=False,
            )
            return output["choices"][0]["text"]
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Generation error: {str(e)}")

    def chat(self, messages: List[dict], temperature: float = 0.2, max_tokens: int = 256):
        """Run chat completion over role/content dicts and return the reply text.

        Raises:
            HTTPException(500): wrapping any llama_cpp failure.
        """
        if self.llm is None:
            self.load_model()
        # Flatten the conversation into a single prompt string.
        formatted_prompt = self.format_chat_prompt(messages)
        try:
            output = self.llm(
                prompt=formatted_prompt,
                temperature=temperature,
                max_tokens=max_tokens,
                stop=["</s>", "```"],
                echo=False,
            )
            return output["choices"][0]["text"]
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Chat error: {str(e)}")

    def format_chat_prompt(self, messages: List[dict]) -> str:
        """Serialize messages into the <|role|> ... <|end|> prompt format.

        Ends with an open "<|assistant|>" tag so the model continues as assistant.
        NOTE(review): this ChatML-like template may not match DeepSeek-Coder's
        official instruct template — verify against the model card.
        """
        prompt = ""
        for msg in messages:
            role = msg["role"]
            content = msg["content"]
            if role == "system":
                prompt += f"<|system|>\n{content}\n<|end|>\n"
            elif role == "user":
                prompt += f"<|user|>\n{content}\n<|end|>\n"
            elif role == "assistant":
                prompt += f"<|assistant|>\n{content}\n<|end|>\n"
        prompt += "<|assistant|>\n"
        return prompt
# ========== APPLICATION LIFECYCLE ==========
model_manager = ModelManager()


@asynccontextmanager  # required: FastAPI's lifespan= expects an async context manager
async def lifespan(app: FastAPI):
    """Application lifespan: start loading the model in the background on startup."""
    print("🚀 Démarrage de l'API llama_cpp...")

    async def load_model_async():
        try:
            # Run the blocking llama_cpp load in a worker thread so the event
            # loop stays responsive during startup (to_thread needs Python 3.9+).
            await asyncio.to_thread(model_manager.load_model)
        except Exception as e:
            print(f"⚠️ Erreur chargement modèle: {e}")

    # Kick off loading without blocking app startup.
    asyncio.create_task(load_model_async())
    yield
    # Shutdown cleanup (nothing to release explicitly).
    if model_manager.llm:
        print("🧹 Nettoyage...")
# ========== FASTAPI APPLICATION ==========
app = FastAPI(
    title="🚀 DeepSeek-Coder 1.3B API (llama_cpp)",
    description="API ultra-rapide avec llama_cpp_python",
    version="2.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan
)
# CORS: wide open (any origin/method/header) for a public demo API.
# NOTE(review): allow_credentials=True combined with "*" origins is rejected by
# browsers per the CORS spec — confirm whether credentials are actually needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# ========== API ROUTES ==========
# Decorator restored: the handler was defined but never registered with the app.
@app.get("/")
async def root():
    """Landing route: service metadata and the endpoint overview."""
    return {
        "message": "🚀 DeepSeek-Coder 1.3B API",
        "backend": "llama_cpp_python",
        # "ready" only once the model object exists; "loading" otherwise.
        "status": "ready" if model_manager.llm else "loading",
        "model_size": "1.3B",
        "format": "GGUF (4-bit quantized)",
        "endpoints": {
            "generate": "POST /generate",
            "chat": "POST /chat",
            "health": "GET /health",
            "models": "GET /models"
        },
        "performance": "~5-10 tokens/sec sur CPU"
    }
# Decorator restored; path taken from root()'s endpoint map ("GET /health").
@app.get("/health")
async def health():
    """Health check: report model load/loading state and a timestamp."""
    return {
        "status": "healthy",
        "model_loaded": model_manager.llm is not None,
        "model_loading": model_manager.loading,
        "model_path": model_manager.model_path,
        "timestamp": time.time()
    }
# Decorator restored; path taken from root()'s endpoint map ("POST /generate").
@app.post("/generate")
async def generate(request: GenerateRequest):
    """Generate code from a raw prompt.

    Returns 503 while the model is still loading; 500 on generation failure.
    NOTE(review): model_manager.generate is synchronous, so this async handler
    blocks the event loop during generation — consider a threadpool offload.
    """
    if model_manager.loading:
        raise HTTPException(status_code=503, detail="Model is still loading...")
    try:
        response = model_manager.generate(
            prompt=request.prompt,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            top_p=request.top_p
        )
        return {
            "response": response,
            "model": "deepseek-coder-1.3b",
            # Whitespace split: an approximation, not the model's token count.
            "tokens_generated": len(response.split()),
            "backend": "llama_cpp"
        }
    except HTTPException:
        # Preserve status codes raised by the model layer instead of re-wrapping.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# Decorator restored; path taken from root()'s endpoint map ("POST /chat").
@app.post("/chat")
async def chat(request: ChatRequest):
    """Conversational chat over a list of role/content messages.

    Returns 503 while the model is still loading; 500 on generation failure.
    """
    if model_manager.loading:
        raise HTTPException(status_code=503, detail="Model is still loading...")
    try:
        # Convert pydantic models to plain dicts for the model layer.
        # NOTE(review): .dict() is the pydantic v1 API (model_dump() in v2) —
        # confirm the installed pydantic major version.
        messages = [msg.dict() for msg in request.messages]
        response = model_manager.chat(
            messages=messages,
            temperature=request.temperature,
            max_tokens=request.max_tokens
        )
        return {
            "response": response,
            "model": "deepseek-coder-1.3b-instruct",
            "backend": "llama_cpp"
        }
    except HTTPException:
        # Preserve status codes raised by the model layer instead of re-wrapping.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# Decorator restored; path taken from root()'s endpoint map ("GET /models").
@app.get("/models")
async def list_models():
    """List the locally known model (empty until a model path is resolved)."""
    models = []
    if model_manager.model_path:
        models.append({
            "name": "deepseek-coder-1.3b",
            "path": model_manager.model_path,
            "size_mb": os.path.getsize(model_manager.model_path) / 1024 / 1024 if os.path.exists(model_manager.model_path) else 0,
            "loaded": model_manager.llm is not None
        })
    return {"models": models}
# Decorator restored. NOTE(review): path inferred from the function name — this
# route is not listed in root()'s endpoint map; confirm the intended path.
@app.get("/demo")
async def demo():
    """Quick demonstration: ready-to-run curl examples for the main endpoints."""
    examples = [
        {
            "endpoint": "POST /generate",
            "curl": 'curl -X POST https://your-api.space/generate -H "Content-Type: application/json" -d \'{"prompt": "def fibonacci(n):", "temperature": 0.2}\''
        },
        {
            "endpoint": "POST /chat",
            "curl": 'curl -X POST https://your-api.space/chat -H "Content-Type: application/json" -d \'{"messages": [{"role": "user", "content": "Write Python code for binary search"}], "temperature": 0.2}\''
        }
    ]
    return {"examples": examples}
# ========== OLLAMA COMPATIBILITY ==========
# Decorator restored. NOTE(review): /api/generate matches Ollama's documented
# generate endpoint — confirm clients expect this exact path.
@app.post("/api/generate")
async def ollama_generate(request: dict):
    """Ollama-compatible generation endpoint (subset of the Ollama response shape)."""
    prompt = request.get("prompt", "")
    model = request.get("model", "deepseek-coder-1.3b")
    # NOTE(review): unlike /generate there is no 503 guard here — if the model
    # is not loaded yet, this call loads it synchronously on the event loop.
    response = model_manager.generate(
        prompt=prompt,
        temperature=request.get("temperature", 0.2),
        max_tokens=request.get("max_tokens", 256)
    )
    return {
        "model": model,
        "response": response,
        "done": True
    }
# ========== STARTUP ==========
if __name__ == "__main__":
    import uvicorn
    # Try to load the model eagerly; fall back to lazy load on first request.
    try:
        model_manager.load_model()
    except Exception as e:
        print(f"⚠️ Note: {e}")
        print("🔄 Le modèle se chargera à la première requête")
    # Start the server; 7860 is the default port expected by HF Spaces.
    port = int(os.getenv("PORT", 7860))
    print(f"🌐 API démarrée sur http://0.0.0.0:{port}")
    uvicorn.run(app, host="0.0.0.0", port=port)