Spaces:
Sleeping
Sleeping
| """ | |
| API FastAPI pour DeepSeek-Coder avec llama_cpp | |
| Utilise des modèles publics accessibles | |
| """ | |
| import os | |
| import time | |
| import asyncio | |
| from typing import Optional, List | |
| from contextlib import asynccontextmanager | |
| from fastapi import FastAPI, HTTPException | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from pydantic import BaseModel, Field | |
| # Import llama_cpp | |
| try: | |
| from llama_cpp import Llama | |
| except ImportError: | |
| Llama = None | |
| try: | |
| from huggingface_hub import hf_hub_download | |
| HF_AVAILABLE = True | |
| except ImportError: | |
| HF_AVAILABLE = False | |
# ========== CONFIGURATION ==========
# Public, token-free repositories tried in order; within each repo the
# quantization files are tried from best quality (Q4_K_M) to smallest (Q2_K).
MODEL_CONFIGS = [
    {
        "repo": "TheBloke/DeepSeek-Coder-1.3B-Instruct-GGUF",
        "files": [
            "deepseek-coder-1.3b-instruct.Q4_K_M.gguf",
            "deepseek-coder-1.3b-instruct.Q4_0.gguf",
            "deepseek-coder-1.3b-instruct.Q2_K.gguf"
        ]
    },
    {
        "repo": "mradermacher/DeepSeek-Coder-1.3B-Instruct-GGUF",
        "files": [
            "DeepSeek-Coder-1.3B-Instruct.Q4_K_M.gguf",
            "DeepSeek-Coder-1.3B-Instruct.Q2_K.gguf"
        ]
    }
]
# Smaller backup models used only when every primary download fails.
FALLBACK_MODELS = [
    {
        "repo": "TheBloke/CodeLlama-7B-Instruct-GGUF",
        "files": ["codellama-7b-instruct.Q2_K.gguf"]  # ~2.7GB
    },
    {
        "repo": "TheBloke/tinycoder-1.1B-GGUF",
        "files": ["tinycoder-1.1b.Q2_K.gguf"]  # ~500MB
    }
]
# Local cache directory for downloaded GGUF files; created at import time.
MODEL_DIR = "./models"
os.makedirs(MODEL_DIR, exist_ok=True)
# ========== DATA MODELS ==========
class GenerateRequest(BaseModel):
    """Request body for the raw code-generation endpoint."""
    prompt: str = Field(..., min_length=1, max_length=2000)  # text to complete
    temperature: float = Field(0.2, ge=0.1, le=1.0)  # sampling temperature
    max_tokens: int = Field(256, ge=1, le=1024)  # cap on generated tokens
    top_p: float = Field(0.95, ge=0.1, le=1.0)  # nucleus-sampling threshold
class ChatMessage(BaseModel):
    """One conversation turn; role is validated against the three allowed values."""
    role: str = Field(..., pattern="^(user|assistant|system)$")
    content: str
class ChatRequest(BaseModel):
    """Request body for the chat endpoint: full message history plus sampling knobs."""
    messages: List[ChatMessage]
    temperature: float = Field(0.2, ge=0.1, le=1.0)  # sampling temperature
    max_tokens: int = Field(256, ge=1, le=1024)  # cap on generated tokens
# ========== OPENAI / OPENROUTER SCHEMAS ==========
class OpenAIMessage(BaseModel):
    """OpenAI-style chat message (role deliberately unconstrained, unlike ChatMessage)."""
    role: str
    content: str
class ChatCompletionRequest(BaseModel):
    """Body of an OpenAI-compatible chat-completion request."""
    model: str  # echoed back in the response; does not select a backend model
    messages: List[OpenAIMessage]
    temperature: Optional[float] = 0.2
    max_tokens: Optional[int] = 256
    top_p: Optional[float] = 0.95  # accepted but not forwarded by the chat handler in this file
    stream: Optional[bool] = False  # accepted for client compatibility; no handler here streams
class CompletionRequest(BaseModel):
    """Body of an OpenAI-compatible text-completion request."""
    model: str  # echoed back in the response; does not select a backend model
    prompt: str
    temperature: Optional[float] = 0.2
    max_tokens: Optional[int] = 256
    top_p: Optional[float] = 0.95
# ========== MODEL MANAGEMENT ==========
class ModelManager:
    """Owns the llama_cpp model: download, lazy load, and text generation.

    Every failure path falls back to DummyLLM (defined later in this module)
    so the HTTP API keeps answering even when no real model is available.
    """

    def __init__(self):
        self.llm = None            # Llama or DummyLLM instance once loaded
        self.model_path = None     # local path of the GGUF file, if any
        self.loading = False       # True while load_model() is in progress
        self.model_loaded = False  # True once self.llm is usable (real or dummy)

    def download_model(self, repo_id: str, filename: str) -> str:
        """Download one GGUF file from the Hugging Face Hub.

        Returns the local file path; re-raises any download error so the
        caller can move on to the next candidate file.
        """
        # Fix: this log previously printed a literal placeholder instead of
        # the file actually being downloaded.
        print(f"📥 Téléchargement: {filename} depuis {repo_id}")
        try:
            model_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=MODEL_DIR,
                # NOTE(review): the two kwargs below are deprecated no-ops in
                # recent huggingface_hub releases; kept for older versions.
                local_dir_use_symlinks=False,
                resume_download=True,
                token=None  # public repos need no auth token
            )
            print(f"✅ Téléchargé: {model_path}")
            return model_path
        except Exception as e:
            print(f"❌ Erreur: {str(e)[:200]}")
            raise

    def find_or_download_model(self) -> str:
        """Return a usable GGUF path: reuse a cached file, else download one.

        Tries MODEL_CONFIGS first, then FALLBACK_MODELS; if everything fails,
        writes a placeholder file so load_model() can enter test mode.
        """
        if not HF_AVAILABLE:
            raise Exception("huggingface-hub non disponible")
        # Reuse any .gguf already present in the cache directory.
        for root, dirs, files in os.walk(MODEL_DIR):
            for file in files:
                if file.endswith('.gguf'):
                    path = os.path.join(root, file)
                    print(f"✅ Modèle existant trouvé: {path}")
                    return path
        # Try the primary repositories.
        print("🔍 Recherche d'un modèle accessible...")
        for config in MODEL_CONFIGS:
            repo = config["repo"]
            for filename in config["files"]:
                try:
                    return self.download_model(repo, filename)
                except Exception:  # fix: was a bare except (caught SystemExit too)
                    continue
        # Try the smaller fallback repositories.
        print("🔄 Essai des modèles de secours...")
        for config in FALLBACK_MODELS:
            repo = config["repo"]
            for filename in config["files"]:
                try:
                    return self.download_model(repo, filename)
                except Exception:  # fix: was a bare except
                    continue
        # Last resort: placeholder file that load_model() recognizes by name.
        print("⚠️ Création d'un modèle factice pour test...")
        dummy_path = os.path.join(MODEL_DIR, "dummy.gguf")
        with open(dummy_path, 'w') as f:
            f.write("DUMMY MODEL FOR TESTING")
        return dummy_path

    def load_model(self):
        """Load the model once (idempotent).

        Raises only when llama_cpp itself is missing; any other failure
        silently falls back to DummyLLM so the API stays usable.
        """
        if self.llm is not None:
            return self.llm
        if Llama is None:
            raise Exception("llama_cpp n'est pas installé")
        print("🔧 Chargement du modèle...")
        self.loading = True
        try:
            self.model_path = self.find_or_download_model()
            # Placeholder file => run in test mode with the dummy model.
            if "dummy" in self.model_path:
                print("⚠️ Utilisation du modèle factice - l'API fonctionnera en mode test")
                self.llm = DummyLLM()
                self.model_loaded = True
                return self.llm
            # Conservative settings for a small CPU-only host.
            n_gpu_layers = 0  # no GPU on the free tier
            n_threads = 2
            n_ctx = 2048
            print(f"🔄 Chargement: {os.path.basename(self.model_path)}")
            self.llm = Llama(
                model_path=self.model_path,
                n_ctx=n_ctx,
                n_threads=n_threads,
                n_gpu_layers=n_gpu_layers,
                verbose=False
            )
            print("✅ Modèle chargé!")
            self.model_loaded = True
            return self.llm
        except Exception as e:
            print(f"❌ Erreur: {e}")
            print("🔄 Utilisation du mode fallback...")
            self.llm = DummyLLM()
            self.model_loaded = True
            return self.llm
        finally:
            # Hardening: the flag is now cleared on every exit path instead of
            # being reset individually before each return.
            self.loading = False

    def generate(
        self,
        prompt: str,
        temperature: float = 0.2,
        max_tokens: int = 256,
        top_p: float = 0.95
    ):
        """Complete `prompt`; returns generated text, or a textual fallback on error."""
        if self.llm is None:
            self.load_model()
        if isinstance(self.llm, DummyLLM):
            return self.llm.generate(prompt, temperature, max_tokens, top_p)
        try:
            response = self.llm(
                prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                echo=False  # do not repeat the prompt in the output
            )
            return response["choices"][0]["text"]
        except Exception as e:
            return (
                "# Fallback response\n\n"
                f"Prompt: {prompt}\n\n"
                f"Error: {str(e)[:200]}"
            )

    def chat(
        self,
        messages: List[dict],
        temperature: float = 0.2,
        max_tokens: int = 256
    ):
        """Answer a conversation using the DeepSeek instruct prompt template.

        NOTE(review): only 'system' and 'user' messages are folded into the
        prompt; 'assistant' turns are dropped, so multi-turn history is lost.
        Preserved as-is to keep the existing prompt format unchanged.
        """
        if self.llm is None:
            self.load_model()
        if isinstance(self.llm, DummyLLM):
            return self.llm.chat(messages, temperature, max_tokens)
        # ---- DeepSeek instruct prompt ----
        system_prompt = ""
        user_prompt = ""
        for msg in messages:
            if msg["role"] == "system":
                system_prompt += msg["content"] + "\n"
            elif msg["role"] == "user":
                user_prompt += msg["content"] + "\n"
        prompt = f"""### Instruction:
{system_prompt}{user_prompt}
### Response:
"""
        response = self.llm(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,  # fixed value: per-request top_p is not plumbed through here
            echo=False
        )
        return response["choices"][0]["text"].strip()
# ========== DUMMY MODEL FOR TESTING ==========
class DummyLLM:
    """Stand-in model that keeps the API responsive while no real model is loaded."""

    def generate(self, prompt: str, temperature: float = 0.2, max_tokens: int = 256, top_p: float = 0.95):
        """Return a canned, language-aware reply after a short simulated delay."""
        time.sleep(0.1)  # mimic a little inference latency
        lowered = prompt.lower()
        if "python" in lowered:
            return f"""# Code Python généré (mode test)
# Prompt: {prompt[:50]}...
def example_function():
    \"\"\"Exemple de fonction Python\"\"\"
    print("Hello from DeepSeek-Coder (Test Mode)")
    return 42
# Note: L'API fonctionne mais utilise un modèle factice.
# Le vrai modèle sera téléchargé automatiquement à la prochaine requête."""
        if "javascript" in lowered or "js" in lowered:
            return f"""// Code JavaScript généré (mode test)
// Prompt: {prompt[:50]}...
function exampleFunction() {{
    console.log("Hello from DeepSeek-Coder (Test Mode)");
    return 42;
}}
// Note: Mode test - le vrai modèle se télécharge en arrière-plan."""
        # Generic answer for any other prompt.
        return f"""# Réponse générée (mode test)
Prompt: {prompt}
Voici un exemple de code:
```python
def process_input(text):
    \"\"\"Traiter l'entrée utilisateur\"\"\"
    return f"Processed: {{text}}"
# L'API est opérationnelle en mode test.
# Le modèle DeepSeek-Coder se télécharge en arrière-plan."""

    def chat(self, messages: List[dict], temperature: float = 0.2, max_tokens: int = 256):
        """Pick one of a few canned chat replies at random."""
        import random
        latest = messages[-1]["content"] if messages else "Hello"
        canned = [
            f"Bonjour! Je suis DeepSeek-Coder en mode test. Vous avez dit: '{latest[:50]}...'",
            f"Je peux vous aider avec du code. En mode test, voici un exemple:\n\n```python\nprint('Hello World')\n```",
            f"Le modèle réel est en cours de téléchargement. En attendant, voici une réponse de test."
        ]
        return random.choice(canned)
# ========== APPLICATION ==========
model_manager = ModelManager()  # single shared manager for the whole app


# Fix: asynccontextmanager is imported at the top of this file but was never
# applied; FastAPI's `lifespan=` parameter requires an async context manager,
# not a bare async generator function.
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan: start loading the model in the background, then serve."""
    print("🚀 Démarrage API...")

    async def load_async():
        try:
            # Fix: run the blocking loader in a worker thread so startup and
            # early requests are not stalled by the download (the original
            # called the blocking load_model() directly on the event loop).
            await asyncio.to_thread(model_manager.load_model)
        except Exception as e:
            print(f"⚠️ Note: {e}")

    asyncio.create_task(load_async())
    yield
    print("🧹 Arrêt...")
# Application instance; Swagger UI at /docs, ReDoc disabled.
app = FastAPI(
    title="🚀 DeepSeek-Coder API",
    description="API avec fallback automatique",
    version="1.0.0",
    docs_url="/docs",
    redoc_url=None,
    lifespan=lifespan
)
# Wide-open CORS (any origin/method/header) — suitable for a public demo;
# tighten before production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# ========== ROUTES ==========
# NOTE(review): route decorators were missing from the scraped source; this
# path is inferred from the endpoint map the handler itself returns — confirm.
@app.get("/")
async def root():
    """Service banner: load status, backend mode (real vs dummy) and endpoint map."""
    return {
        "message": "🚀 DeepSeek-Coder 1.3B API",
        "status": "ready" if model_manager.model_loaded else "loading",
        "mode": "dummy" if isinstance(model_manager.llm, DummyLLM) else "real",
        "endpoints": {
            "generate": "POST /generate",
            "chat": "POST /chat",
            "health": "GET /health",
            "test": "GET /test"
        }
    }
# NOTE(review): decorator was missing; path taken from the endpoint list in
# root() ("GET /health") — confirm against the original deployment.
@app.get("/health")
async def health():
    """Health probe: reports load state, backend type and current server time."""
    return {
        "status": "healthy",
        "model_loaded": model_manager.model_loaded,
        "model_type": "dummy" if isinstance(model_manager.llm, DummyLLM) else "real",
        "timestamp": time.time()
    }
# NOTE(review): decorator was missing; path taken from the endpoint list in
# root() ("POST /generate") — confirm against the original deployment.
@app.post("/generate")
async def generate(request: GenerateRequest):
    """Generate code from a raw prompt.

    Never raises: on failure it returns a textual fallback payload instead
    of an HTTP error, so clients always get a 200 with a response field.
    """
    try:
        response = model_manager.generate(
            prompt=request.prompt,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            top_p=request.top_p
        )
        return {
            "response": response,
            "model": "deepseek-coder-1.3b",
            "mode": "dummy" if isinstance(model_manager.llm, DummyLLM) else "real",
            "backend": "llama_cpp"
        }
    except Exception as e:
        # Best-effort degradation: surface a truncated error in the body.
        return {
            "response": f"# Fallback response\n\nPrompt: {request.prompt}\n\nError: {str(e)[:100]}",
            "model": "fallback",
            "error": "generation_failed"
        }
# NOTE(review): decorator was missing; path taken from the endpoint list in
# root() ("POST /chat") — confirm against the original deployment.
@app.post("/chat")
async def chat(request: ChatRequest):
    """Chat with the model; returns a plain-text reply plus metadata."""
    try:
        # .dict() is deprecated under Pydantic v2 (model_dump()); kept for v1 compat.
        messages = [msg.dict() for msg in request.messages]
        response = model_manager.chat(
            messages=messages,
            temperature=request.temperature,
            max_tokens=request.max_tokens
        )
        return {
            "response": response,
            "model": "deepseek-coder-1.3b-instruct",
            "mode": "dummy" if isinstance(model_manager.llm, DummyLLM) else "real"
        }
    except Exception as e:
        # Best-effort degradation: truncated error in the body, no HTTP error.
        return {
            "response": f"Chat error: {str(e)[:100]}",
            "model": "fallback"
        }
# NOTE(review): decorator was missing; path taken from the endpoint list in
# root() ("GET /test") — confirm against the original deployment.
@app.get("/test")
async def test():
    """Simple liveness check with usage hints for the main endpoints."""
    return {
        "test": "success",
        "message": "API is running",
        "try_endpoints": [
            "POST /generate with JSON: {'prompt': 'def hello():', 'temperature': 0.2}",
            "POST /chat with JSON: {'messages': [{'role': 'user', 'content': 'Hello'}]}"
        ]
    }
# NOTE(review): decorator was missing and this endpoint is not listed in
# root(); the path is inferred from the function name only — confirm.
@app.get("/download-status")
async def download_status():
    """Report whether a model file exists on disk and its size in MB."""
    if model_manager.model_path and os.path.exists(model_manager.model_path):
        size = os.path.getsize(model_manager.model_path)
        return {
            "downloaded": True,
            "size_mb": round(size / 1024 / 1024, 2),
            "path": model_manager.model_path
        }
    return {"downloaded": False, "message": "No model downloaded yet"}
# NOTE(review): decorator was missing; path follows the OpenAI API convention
# this handler mimics (/v1/chat/completions) — confirm.
@app.post("/v1/chat/completions")
async def openai_chat(request: ChatCompletionRequest):
    """OpenAI-compatible chat-completion endpoint.

    Returns a minimal chat.completion object; raises HTTP 500 on failure
    (unlike /chat, which degrades to a fallback body).
    """
    try:
        # .dict() is deprecated under Pydantic v2 (model_dump()); kept for v1 compat.
        messages = [m.dict() for m in request.messages]
        response_text = model_manager.chat(
            messages=messages,
            temperature=request.temperature,
            max_tokens=request.max_tokens
        )
        return {
            "id": f"chatcmpl-{int(time.time())}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": request.model,  # echoed back; backend model is fixed
            "choices": [
                {
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": response_text
                    },
                    "finish_reason": "stop"
                }
            ],
            # Token accounting is not implemented; zeros keep clients parsing.
            "usage": {
                "prompt_tokens": 0,
                "completion_tokens": 0,
                "total_tokens": 0
            }
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# NOTE(review): decorator was missing; path follows the OpenAI API convention
# this handler mimics (/v1/completions) — confirm.
@app.post("/v1/completions")
async def openai_completion(request: CompletionRequest):
    """OpenAI-compatible text-completion endpoint; raises HTTP 500 on failure."""
    try:
        response_text = model_manager.generate(
            prompt=request.prompt,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            top_p=request.top_p
        )
        return {
            "id": f"cmpl-{int(time.time())}",
            "object": "text_completion",
            "created": int(time.time()),
            "model": request.model,  # echoed back; backend model is fixed
            "choices": [
                {
                    "text": response_text,
                    "index": 0,
                    "finish_reason": "stop"
                }
            ]
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
if __name__ == "__main__":
    import uvicorn
    # Hugging Face Spaces injects PORT; default to the Spaces standard 7860.
    port = int(os.getenv("PORT", 7860))
    print(f"🌐 Serveur démarré sur le port {port}")
    uvicorn.run(app, host="0.0.0.0", port=port)