"""
API FastAPI pour DeepSeek-Coder avec llama_cpp
Démarrage rapide, faible mémoire
"""
import os
import time
import asyncio
from typing import List
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from huggingface_hub import hf_hub_download
# Import llama_cpp
try:
    from llama_cpp import Llama
except ImportError:
    # Fallback if llama_cpp_python is not installed
    Llama = None
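# Note: Llama comes from the llama-cpp-python package. A typical install for
# this API (assumption: default CPU-only wheel) looks like:
#   pip install llama-cpp-python fastapi uvicorn huggingface_hub pydantic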

# ========== CONFIGURATION ==========
MODEL_REPO = "bartowski/DeepSeek-Coder-1.3B-Instruct-GGUF"
MODEL_FILES = [
    "DeepSeek-Coder-1.3B-Instruct-Q4_K_M.gguf",  # ~900 MB - good trade-off
    "DeepSeek-Coder-1.3B-Instruct-Q4_0.gguf",    # ~900 MB
    "DeepSeek-Coder-1.3B-Instruct-Q2_K.gguf",    # ~500 MB - lightest
]
# Local directory for the model
MODEL_DIR = "./models"
os.makedirs(MODEL_DIR, exist_ok=True)

# ========== DATA MODELS ==========
class GenerateRequest(BaseModel):
    prompt: str = Field(..., min_length=1, max_length=2000)
    temperature: float = Field(0.2, ge=0.1, le=1.0)
    max_tokens: int = Field(256, ge=1, le=1024)
    top_p: float = Field(0.95, ge=0.1, le=1.0)
    stream: bool = False

class ChatMessage(BaseModel):
    # pattern= is the Pydantic v2 keyword (Pydantic v1 used regex=)
    role: str = Field(..., pattern="^(user|assistant|system)$")
    content: str

class ChatRequest(BaseModel):
    messages: List[ChatMessage]
    temperature: float = Field(0.2, ge=0.1, le=1.0)
    max_tokens: int = Field(256, ge=1, le=1024)
    stream: bool = False
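# Example request bodies these models accept (illustrative values only):
#   POST /generate -> {"prompt": "def fibonacci(n):", "temperature": 0.2, "max_tokens": 256}
#   POST /chat     -> {"messages": [{"role": "user", "content": "Write a binary search"}]}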

# ========== MODEL MANAGEMENT ==========
class ModelManager:
    def __init__(self):
        self.llm = None
        self.model_path = None
        self.loading = False

    def find_or_download_model(self):
        """Find or download the GGUF model"""
        # Check whether a model already exists locally
        for model_file in MODEL_FILES:
            local_path = os.path.join(MODEL_DIR, model_file)
            if os.path.exists(local_path):
                print(f"✅ Model found: {local_path}")
                return local_path
        # Download the first model that is available
        print("📥 No local model, downloading...")
        for model_file in MODEL_FILES:
            try:
                print(f"  Trying: {model_file}")
                local_path = hf_hub_download(
                    repo_id=MODEL_REPO,
                    filename=model_file,
                    local_dir=MODEL_DIR,
                )
                print(f"✅ Downloaded: {model_file}")
                return local_path
            except Exception as e:
                print(f"  ❌ {model_file}: {str(e)[:100]}")
                continue
        raise Exception("❌ No model available")
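    # The same file can also be fetched manually with the huggingface_hub CLI,
    # e.g. (sketch, using the first filename from MODEL_FILES):
    #   huggingface-cli download bartowski/DeepSeek-Coder-1.3B-Instruct-GGUF \
    #       DeepSeek-Coder-1.3B-Instruct-Q4_K_M.gguf --local-dir ./models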

    def load_model(self):
        """Load the model with llama_cpp"""
        if self.llm is not None:
            return self.llm
        if Llama is None:
            raise RuntimeError("llama_cpp_python is not installed")
        print("🔧 Loading model...")
        self.loading = True
        try:
            # Locate the model
            self.model_path = self.find_or_download_model()
            # Configure the model (tuned for a Hugging Face Space with 16 GB RAM)
            n_gpu_layers = -1  # offload all layers to GPU when one is available
            n_threads = 4      # 4 CPU threads
            n_ctx = 2048       # limited context window to save RAM
            print(f"🔄 Loading from: {self.model_path}")
            print(f"⚙️ Config: GPU layers={n_gpu_layers}, Threads={n_threads}, Context={n_ctx}")
            # Load the model
            self.llm = Llama(
                model_path=self.model_path,
                n_ctx=n_ctx,
                n_threads=n_threads,
                n_gpu_layers=n_gpu_layers,
                verbose=False
            )
            print("✅ Model loaded successfully!")
            self.loading = False
            return self.llm
        except Exception as e:
            self.loading = False
            print(f"❌ Model loading error: {e}")
            raise

    def generate(self, prompt: str, temperature: float = 0.2, max_tokens: int = 256, top_p: float = 0.95):
        """Generate text"""
        if self.llm is None:
            self.load_model()
        try:
            output = self.llm(
                prompt=prompt,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=top_p,
                stop=["</s>", "```"],
                echo=False
            )
            return output["choices"][0]["text"]
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Generation error: {str(e)}")

    def chat(self, messages: List[dict], temperature: float = 0.2, max_tokens: int = 256):
        """Conversational chat"""
        if self.llm is None:
            self.load_model()
        # Format the messages for llama_cpp
        formatted_prompt = self.format_chat_prompt(messages)
        try:
            output = self.llm(
                prompt=formatted_prompt,
                temperature=temperature,
                max_tokens=max_tokens,
                stop=["</s>", "```"],
                echo=False
            )
            return output["choices"][0]["text"]
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"Chat error: {str(e)}")

    def format_chat_prompt(self, messages: List[dict]) -> str:
        """Format the messages for DeepSeek-Coder"""
        # Note: this is a generic chat-marker template; DeepSeek-Coder's
        # published instruct template uses "### Instruction:" / "### Response:"
        prompt = ""
        for msg in messages:
            role = msg["role"]
            content = msg["content"]
            if role == "system":
                prompt += f"<|system|>\n{content}\n<|end|>\n"
            elif role == "user":
                prompt += f"<|user|>\n{content}\n<|end|>\n"
            elif role == "assistant":
                prompt += f"<|assistant|>\n{content}\n<|end|>\n"
        # Append the generation cue once, after all messages
        prompt += "<|assistant|>\n"
        return prompt
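    # For example, messages = [{"role": "user", "content": "Hello"}] renders as:
    #   <|user|>
    #   Hello
    #   <|end|>
    #   <|assistant|>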

# ========== APPLICATION LIFECYCLE ==========
model_manager = ModelManager()

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage the app's lifecycle"""
    # Startup
    print("🚀 Starting the llama_cpp API...")
    # Load the model in the background
    async def load_model_async():
        try:
            # Run the blocking load in a worker thread so the event loop stays free
            await asyncio.to_thread(model_manager.load_model)
        except Exception as e:
            print(f"⚠️ Model loading error: {e}")
    # Kick off loading without blocking startup
    asyncio.create_task(load_model_async())
    yield
    # Cleanup (if needed)
    if model_manager.llm:
        print("🧹 Cleaning up...")

# ========== FASTAPI APPLICATION ==========
app = FastAPI(
    title="🚀 DeepSeek-Coder 1.3B API (llama_cpp)",
    description="Ultra-fast API with llama_cpp_python",
    version="2.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan
)

# CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ========== API ROUTES ==========
@app.get("/")
async def root():
    return {
        "message": "🚀 DeepSeek-Coder 1.3B API",
        "backend": "llama_cpp_python",
        "status": "ready" if model_manager.llm else "loading",
        "model_size": "1.3B",
        "format": "GGUF (4-bit quantized)",
        "endpoints": {
            "generate": "POST /generate",
            "chat": "POST /chat",
            "health": "GET /health",
            "models": "GET /models"
        },
        "performance": "~5-10 tokens/sec on CPU"
    }
@app.get("/health")
async def health():
"""Vérifier la santé"""
return {
"status": "healthy",
"model_loaded": model_manager.llm is not None,
"model_loading": model_manager.loading,
"model_path": model_manager.model_path,
"timestamp": time.time()
}
@app.post("/generate")
async def generate(request: GenerateRequest):
"""Générer du code"""
if model_manager.loading:
raise HTTPException(status_code=503, detail="Model is still loading...")
try:
response = model_manager.generate(
prompt=request.prompt,
temperature=request.temperature,
max_tokens=request.max_tokens,
top_p=request.top_p
)
return {
"response": response,
"model": "deepseek-coder-1.3b",
"tokens_generated": len(response.split()),
"backend": "llama_cpp"
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.post("/chat")
async def chat(request: ChatRequest):
"""Chat conversationnel"""
if model_manager.loading:
raise HTTPException(status_code=503, detail="Model is still loading...")
try:
# Convertir les messages
messages = [msg.dict() for msg in request.messages]
response = model_manager.chat(
messages=messages,
temperature=request.temperature,
max_tokens=request.max_tokens
)
return {
"response": response,
"model": "deepseek-coder-1.3b-instruct",
"backend": "llama_cpp"
}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.get("/models")
async def list_models():
"""Lister les modèles disponibles"""
models = []
if model_manager.model_path:
models.append({
"name": "deepseek-coder-1.3b",
"path": model_manager.model_path,
"size_mb": os.path.getsize(model_manager.model_path) / 1024 / 1024 if os.path.exists(model_manager.model_path) else 0,
"loaded": model_manager.llm is not None
})
return {"models": models}
@app.get("/demo")
async def demo():
"""Démonstration rapide"""
examples = [
{
"endpoint": "POST /generate",
"curl": 'curl -X POST https://your-api.space/generate -H "Content-Type: application/json" -d \'{"prompt": "def fibonacci(n):", "temperature": 0.2}\''
},
{
"endpoint": "POST /chat",
"curl": 'curl -X POST https://your-api.space/chat -H "Content-Type: application/json" -d \'{"messages": [{"role": "user", "content": "Write Python code for binary search"}], "temperature": 0.2}\''
}
]
return {"examples": examples}

# ========== OLLAMA COMPATIBILITY ==========
@app.post("/api/generate")
async def ollama_generate(request: dict):
    """Ollama-compatible endpoint"""
    prompt = request.get("prompt", "")
    model = request.get("model", "deepseek-coder-1.3b")
    response = model_manager.generate(
        prompt=prompt,
        temperature=request.get("temperature", 0.2),
        max_tokens=request.get("max_tokens", 256)
    )
    return {
        "model": model,
        "response": response,
        "done": True
    }
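# Example payload this endpoint expects (mirrors the keys read above,
# illustrative values only):
#   {"model": "deepseek-coder-1.3b", "prompt": "def quicksort(arr):", "temperature": 0.2}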

# ========== STARTUP ==========
if __name__ == "__main__":
    import uvicorn
    # Load the model at startup (optional)
    try:
        model_manager.load_model()
    except Exception as e:
        print(f"⚠️ Note: {e}")
        print("🔄 The model will load on the first request")
    # Start the server
    port = int(os.getenv("PORT", 7860))
    print(f"🌐 API listening on http://0.0.0.0:{port}")
    uvicorn.run(app, host="0.0.0.0", port=port)
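
# --- Minimal client sketch (assumptions: server running on localhost:7860,
# --- third-party `requests` package installed) ---
#   import requests
#   r = requests.post(
#       "http://localhost:7860/generate",
#       json={"prompt": "def fibonacci(n):", "max_tokens": 128},
#       timeout=120,  # generous timeout: first call may trigger the model load
#   )
#   print(r.json()["response"])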