# allma / api.py — pakito312, "update", commit 0aec217
# (GitHub page header pasted into the file; kept as a comment so the module parses.)
"""
FastAPI API for DeepSeek-Coder backed by llama_cpp.
Uses publicly accessible models.
"""
import os
import time
import asyncio
from typing import Optional, List
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
# Import llama_cpp
try:
from llama_cpp import Llama
except ImportError:
Llama = None
try:
from huggingface_hub import hf_hub_download
HF_AVAILABLE = True
except ImportError:
HF_AVAILABLE = False
# ========== CONFIGURATION ==========
# Public, token-free Hugging Face repositories, tried in order of preference.
# Each entry lists quantization variants from largest/best to smallest.
MODEL_CONFIGS = [
    {
        "repo": "TheBloke/DeepSeek-Coder-1.3B-Instruct-GGUF",
        "files": [
            "deepseek-coder-1.3b-instruct.Q4_K_M.gguf",
            "deepseek-coder-1.3b-instruct.Q4_0.gguf",
            "deepseek-coder-1.3b-instruct.Q2_K.gguf"
        ]
    },
    {
        "repo": "mradermacher/DeepSeek-Coder-1.3B-Instruct-GGUF",
        "files": [
            "DeepSeek-Coder-1.3B-Instruct.Q4_K_M.gguf",
            "DeepSeek-Coder-1.3B-Instruct.Q2_K.gguf"
        ]
    }
]
# Smaller backup models, tried only if every primary candidate fails.
FALLBACK_MODELS = [
    {
        "repo": "TheBloke/CodeLlama-7B-Instruct-GGUF",
        "files": ["codellama-7b-instruct.Q2_K.gguf"]  # ~2.7GB
    },
    {
        "repo": "TheBloke/tinycoder-1.1B-GGUF",
        "files": ["tinycoder-1.1b.Q2_K.gguf"]  # ~500MB
    }
]
# Local cache directory for downloaded GGUF files; created eagerly at import.
MODEL_DIR = "./models"
os.makedirs(MODEL_DIR, exist_ok=True)
# ========== DATA MODELS ==========
class GenerateRequest(BaseModel):
    """Body of POST /generate: a raw prompt plus sampling parameters."""
    prompt: str = Field(..., min_length=1, max_length=2000)  # required, 1-2000 chars
    temperature: float = Field(0.2, ge=0.1, le=1.0)  # sampling temperature
    max_tokens: int = Field(256, ge=1, le=1024)  # generation length cap
    top_p: float = Field(0.95, ge=0.1, le=1.0)  # nucleus-sampling cutoff
class ChatMessage(BaseModel):
    """One chat turn; role is validated against the three allowed values."""
    role: str = Field(..., pattern="^(user|assistant|system)$")
    content: str
class ChatRequest(BaseModel):
    """Body of POST /chat: conversation history plus sampling parameters."""
    messages: List[ChatMessage]
    temperature: float = Field(0.2, ge=0.1, le=1.0)
    max_tokens: int = Field(256, ge=1, le=1024)
# ========== OPENAI / OPENROUTER SCHEMAS ==========
class OpenAIMessage(BaseModel):
    """Chat message in OpenAI wire format (role not constrained here)."""
    role: str
    content: str
class ChatCompletionRequest(BaseModel):
    """OpenAI-compatible /v1/chat/completions request body."""
    model: str  # echoed back in the response; not used to select a model
    messages: List[OpenAIMessage]
    temperature: Optional[float] = 0.2
    max_tokens: Optional[int] = 256
    top_p: Optional[float] = 0.95
    stream: Optional[bool] = False  # NOTE(review): accepted but streaming is not implemented
class CompletionRequest(BaseModel):
    """OpenAI-compatible /v1/completions request body."""
    model: str  # echoed back in the response; not used to select a model
    prompt: str
    temperature: Optional[float] = 0.2
    max_tokens: Optional[int] = 256
    top_p: Optional[float] = 0.95
# ========== MODEL MANAGEMENT ==========
class ModelManager:
    """Manage the llama_cpp model lifecycle: locate/download, load, infer.

    Every failure path degrades to a DummyLLM instance so the HTTP API
    keeps responding (in "test mode") instead of crashing.
    """

    def __init__(self):
        self.llm = None            # Llama or DummyLLM instance once loaded
        self.model_path = None     # local path of the GGUF file in use
        self.loading = False       # True while load_model() is in progress
        self.model_loaded = False  # True once self.llm is usable

    def download_model(self, repo_id: str, filename: str) -> str:
        """Download one GGUF file from a public Hugging Face repo.

        Returns the local file path; re-raises on failure so the caller
        can move on to the next candidate file/repo.
        """
        # Bug fix: this log line printed a literal placeholder instead of
        # the file actually being fetched.
        print(f"📥 Téléchargement: {filename} depuis {repo_id}")
        try:
            model_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=MODEL_DIR,
                local_dir_use_symlinks=False,
                resume_download=True,
                token=None  # public repos require no auth token
            )
            print(f"✅ Téléchargé: {model_path}")
            return model_path
        except Exception as e:
            print(f"❌ Erreur: {str(e)[:200]}")
            raise

    def find_or_download_model(self):
        """Return a usable GGUF path: reuse one on disk, else download one.

        Tries MODEL_CONFIGS first, then FALLBACK_MODELS; if everything
        fails, writes a placeholder "dummy" file that load_model()
        recognizes and maps to DummyLLM.

        Raises:
            Exception: if huggingface-hub is not importable at all.
        """
        if not HF_AVAILABLE:
            raise Exception("huggingface-hub non disponible")
        # Reuse any .gguf file already present under MODEL_DIR.
        for root, dirs, files in os.walk(MODEL_DIR):
            for file in files:
                if file.endswith('.gguf'):
                    path = os.path.join(root, file)
                    print(f"✅ Modèle existant trouvé: {path}")
                    return path
        # Try the primary repositories.
        print("🔍 Recherche d'un modèle accessible...")
        for config in MODEL_CONFIGS:
            repo = config["repo"]
            for filename in config["files"]:
                try:
                    return self.download_model(repo, filename)
                except Exception:
                    # Bug fix: was a bare `except:` that also swallowed
                    # KeyboardInterrupt/SystemExit.
                    continue
        # Try the smaller fallback repositories.
        print("🔄 Essai des modèles de secours...")
        for config in FALLBACK_MODELS:
            repo = config["repo"]
            for filename in config["files"]:
                try:
                    return self.download_model(repo, filename)
                except Exception:  # bug fix: was a bare except
                    continue
        # Everything failed: create a marker file that triggers test mode.
        print("⚠️ Création d'un modèle factice pour test...")
        dummy_path = os.path.join(MODEL_DIR, "dummy.gguf")
        with open(dummy_path, 'w') as f:
            f.write("DUMMY MODEL FOR TESTING")
        return dummy_path

    def load_model(self):
        """Load the model once and cache it on self.llm.

        Raises only when llama_cpp itself is missing; any other failure
        (download, file, load error) falls back to DummyLLM.
        """
        if self.llm is not None:
            return self.llm
        if Llama is None:
            raise Exception("llama_cpp n'est pas installé")
        print("🔧 Chargement du modèle...")
        self.loading = True
        try:
            self.model_path = self.find_or_download_model()
            # Marker file produced by find_or_download_model() → test mode.
            if "dummy" in self.model_path:
                print("⚠️ Utilisation du modèle factice - l'API fonctionnera en mode test")
                self.llm = DummyLLM()
                self.model_loaded = True
                self.loading = False
                return self.llm
            # Conservative settings for free, CPU-only hosting.
            n_gpu_layers = 0  # no GPU on the free Hugging Face tier
            n_threads = 2
            n_ctx = 2048
            print(f"🔄 Chargement: {os.path.basename(self.model_path)}")
            self.llm = Llama(
                model_path=self.model_path,
                n_ctx=n_ctx,
                n_threads=n_threads,
                n_gpu_layers=n_gpu_layers,
                verbose=False
            )
            print("✅ Modèle chargé!")
            self.model_loaded = True
            self.loading = False
            return self.llm
        except Exception as e:
            # Degrade to test mode rather than crashing the server.
            print(f"❌ Erreur: {e}")
            print("🔄 Utilisation du mode fallback...")
            self.llm = DummyLLM()
            self.model_loaded = True
            self.loading = False
            return self.llm

    def generate(
        self,
        prompt: str,
        temperature: float = 0.2,
        max_tokens: int = 256,
        top_p: float = 0.95
    ):
        """Complete *prompt* with the loaded model (loading it on demand).

        Returns the generated text; on inference error, returns a
        fallback string embedding the prompt and a truncated error.
        """
        if self.llm is None:
            self.load_model()
        if isinstance(self.llm, DummyLLM):
            return self.llm.generate(prompt, temperature, max_tokens, top_p)
        try:
            response = self.llm(
                prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                echo=False
            )
            return response["choices"][0]["text"]
        except Exception as e:
            return (
                "# Fallback response\n\n"
                f"Prompt: {prompt}\n\n"
                f"Error: {str(e)[:200]}"
            )

    def chat(
        self,
        messages: List[dict],
        temperature: float = 0.2,
        max_tokens: int = 256
    ):
        """Answer a chat conversation via the DeepSeek instruct template.

        NOTE: only "system" and "user" turns feed the prompt; assistant
        history is dropped, so this is effectively single-turn.
        """
        if self.llm is None:
            self.load_model()
        if isinstance(self.llm, DummyLLM):
            return self.llm.chat(messages, temperature, max_tokens)
        # Flatten the history into the DeepSeek-Coder instruct prompt.
        system_prompt = ""
        user_prompt = ""
        for msg in messages:
            if msg["role"] == "system":
                system_prompt += msg["content"] + "\n"
            elif msg["role"] == "user":
                user_prompt += msg["content"] + "\n"
        prompt = f"""### Instruction:
{system_prompt}{user_prompt}
### Response:
"""
        response = self.llm(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            echo=False
        )
        return response["choices"][0]["text"].strip()
# ========== DUMMY MODEL FOR TESTING ==========
class DummyLLM:
    """Stand-in model so the API stays usable when the real one fails."""

    def generate(self, prompt: str, temperature: float = 0.2, max_tokens: int = 256, top_p: float = 0.95):
        """Return a canned, language-flavoured completion for *prompt*."""
        # Simulate a short inference delay.
        time.sleep(0.1)
        lowered = prompt.lower()
        if "python" in lowered:
            return f"""# Code Python généré (mode test)
# Prompt: {prompt[:50]}...
def example_function():
    \"\"\"Exemple de fonction Python\"\"\"
    print("Hello from DeepSeek-Coder (Test Mode)")
    return 42
# Note: L'API fonctionne mais utilise un modèle factice.
# Le vrai modèle sera téléchargé automatiquement à la prochaine requête."""
        if "javascript" in lowered or "js" in lowered:
            return f"""// Code JavaScript généré (mode test)
// Prompt: {prompt[:50]}...
function exampleFunction() {{
    console.log("Hello from DeepSeek-Coder (Test Mode)");
    return 42;
}}
// Note: Mode test - le vrai modèle se télécharge en arrière-plan."""
        return f"""# Réponse générée (mode test)
Prompt: {prompt}
Voici un exemple de code:
```python
def process_input(text):
    \"\"\"Traiter l'entrée utilisateur\"\"\"
    return f"Processed: {{text}}"
# L'API est opérationnelle en mode test.
# Le modèle DeepSeek-Coder se télécharge en arrière-plan."""

    def chat(self, messages: List[dict], temperature: float = 0.2, max_tokens: int = 256):
        """Return one of three canned chat replies, chosen at random."""
        import random

        last_message = messages[-1]["content"] if messages else "Hello"
        responses = [
            f"Bonjour! Je suis DeepSeek-Coder en mode test. Vous avez dit: '{last_message[:50]}...'",
            f"Je peux vous aider avec du code. En mode test, voici un exemple:\n\n```python\nprint('Hello World')\n```",
            f"Le modèle réel est en cours de téléchargement. En attendant, voici une réponse de test."
        ]
        return random.choice(responses)
# ========== APPLICATION ==========
# Process-wide singleton; the model itself is loaded lazily (startup task
# or first request), not at import time.
model_manager = ModelManager()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifecycle: start loading the model in the background.

    The model download/load is blocking, so it is pushed to the default
    thread-pool executor instead of running inside the event loop.
    """
    print("🚀 Démarrage API...")

    async def load_async():
        try:
            # Bug fix: load_model() blocks (network download + model load);
            # calling it directly in the coroutine froze the event loop
            # during startup. Run it in a worker thread instead.
            await asyncio.get_running_loop().run_in_executor(
                None, model_manager.load_model
            )
        except Exception as e:
            print(f"⚠️ Note: {e}")

    # Bug fix: keep a reference so the task cannot be garbage-collected
    # before it finishes (asyncio only holds weak references to tasks).
    load_task = asyncio.create_task(load_async())
    yield
    print("🧹 Arrêt...")
# FastAPI application instance; interactive docs served at /docs only.
app = FastAPI(
    title="🚀 DeepSeek-Coder API",
    description="API avec fallback automatique",
    version="1.0.0",
    docs_url="/docs",
    redoc_url=None,  # ReDoc disabled; Swagger UI is sufficient
    lifespan=lifespan
)
# CORS wide open so any browser front-end can call the API.
# NOTE(review): "*" origins are acceptable for a demo; tighten in production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# ========== ROUTES ==========
@app.get("/")
async def root():
    """Landing endpoint: report load status and advertise the routes."""
    running_dummy = isinstance(model_manager.llm, DummyLLM)
    endpoints = {
        "generate": "POST /generate",
        "chat": "POST /chat",
        "health": "GET /health",
        "test": "GET /test"
    }
    return {
        "message": "🚀 DeepSeek-Coder 1.3B API",
        "status": "ready" if model_manager.model_loaded else "loading",
        "mode": "dummy" if running_dummy else "real",
        "endpoints": endpoints
    }
@app.get("/health")
async def health():
    """Liveness probe: model state plus the current server time."""
    llm_kind = "dummy" if isinstance(model_manager.llm, DummyLLM) else "real"
    return {
        "status": "healthy",
        "model_loaded": model_manager.model_loaded,
        "model_type": llm_kind,
        "timestamp": time.time()
    }
@app.post("/generate")
async def generate(request: GenerateRequest):
    """Generate code for a prompt; degrades to a fallback payload on error."""
    try:
        text = model_manager.generate(
            prompt=request.prompt,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            top_p=request.top_p
        )
    except Exception as e:
        # Last-resort path: still answer HTTP 200, but flag the failure.
        return {
            "response": f"# Fallback response\n\nPrompt: {request.prompt}\n\nError: {str(e)[:100]}",
            "model": "fallback",
            "error": "generation_failed"
        }
    return {
        "response": text,
        "model": "deepseek-coder-1.3b",
        "mode": "dummy" if isinstance(model_manager.llm, DummyLLM) else "real",
        "backend": "llama_cpp"
    }
@app.post("/chat")
async def chat(request: ChatRequest):
    """Chat endpoint; degrades to an error payload instead of raising."""
    try:
        history = [msg.dict() for msg in request.messages]
        answer = model_manager.chat(
            messages=history,
            temperature=request.temperature,
            max_tokens=request.max_tokens
        )
    except Exception as e:
        return {
            "response": f"Chat error: {str(e)[:100]}",
            "model": "fallback"
        }
    return {
        "response": answer,
        "model": "deepseek-coder-1.3b-instruct",
        "mode": "dummy" if isinstance(model_manager.llm, DummyLLM) else "real"
    }
@app.get("/test")
async def test():
    """Smoke-test endpoint returning sample request shapes."""
    examples = [
        "POST /generate with JSON: {'prompt': 'def hello():', 'temperature': 0.2}",
        "POST /chat with JSON: {'messages': [{'role': 'user', 'content': 'Hello'}]}"
    ]
    return {
        "test": "success",
        "message": "API is running",
        "try_endpoints": examples
    }
@app.get("/download_status")
async def download_status():
    """Report whether a model file has been fetched, and its size."""
    path = model_manager.model_path
    if not (path and os.path.exists(path)):
        return {"downloaded": False, "message": "No model downloaded yet"}
    size_mb = round(os.path.getsize(path) / 1024 / 1024, 2)
    return {
        "downloaded": True,
        "size_mb": size_mb,
        "path": path
    }
@app.post("/v1/chat/completions")
async def openai_chat(request: ChatCompletionRequest):
    """OpenAI-compatible chat completion endpoint (non-streaming)."""
    try:
        history = [m.dict() for m in request.messages]
        answer = model_manager.chat(
            messages=history,
            temperature=request.temperature,
            max_tokens=request.max_tokens
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    now = int(time.time())
    choice = {
        "index": 0,
        "message": {
            "role": "assistant",
            "content": answer
        },
        "finish_reason": "stop"
    }
    return {
        "id": f"chatcmpl-{now}",
        "object": "chat.completion",
        "created": now,
        "model": request.model,
        "choices": [choice],
        # Token accounting is not implemented; zeros keep clients happy.
        "usage": {
            "prompt_tokens": 0,
            "completion_tokens": 0,
            "total_tokens": 0
        }
    }
@app.post("/v1/completions")
async def openai_completion(request: CompletionRequest):
    """OpenAI-compatible plain text completion endpoint."""
    try:
        completion = model_manager.generate(
            prompt=request.prompt,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            top_p=request.top_p
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    now = int(time.time())
    return {
        "id": f"cmpl-{now}",
        "object": "text_completion",
        "created": now,
        "model": request.model,
        "choices": [
            {
                "text": completion,
                "index": 0,
                "finish_reason": "stop"
            }
        ]
    }
if __name__ == "__main__":
    import uvicorn

    # Hugging Face Spaces exposes the port via $PORT; default to 7860.
    serve_port = int(os.getenv("PORT", 7860))
    print(f"🌐 Serveur démarré sur le port {serve_port}")
    uvicorn.run(app, host="0.0.0.0", port=serve_port)