#!/usr/bin/env python3
"""
DeepCoder Model API Server

Serves the DeepCoder-14B model via FastAPI.
"""
import os
import logging
from typing import Optional, List, Dict, Any

import uvicorn
import torch
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration
MODEL_NAME = os.getenv("MODEL_NAME", "ai/deepcoder-preview")
MODEL_VARIANT = os.getenv("MODEL_VARIANT", "14B-Q4_K_M")
CACHE_DIR = os.getenv("HUGGINGFACE_HUB_CACHE", "/app/cache")
MAX_TOKENS = 131072  # 131,072-token context window
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
app = FastAPI(
    title="DeepCoder API",
    description="AI Code Generation Model API",
    version="1.0.0",
)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global model state
tokenizer = None
model = None
model_loaded = False
class CodeRequest(BaseModel):
    prompt: str = Field(..., description="Code generation prompt")
    # Must be strictly positive: transformers rejects temperature=0.0 when do_sample=True
    temperature: float = Field(0.6, gt=0.0, le=2.0, description="Sampling temperature")
    top_p: float = Field(0.95, ge=0.0, le=1.0, description="Top-p (nucleus) sampling")
    max_tokens: int = Field(2048, ge=1, le=8192, description="Maximum tokens to generate")
    stop_sequences: Optional[List[str]] = Field(None, description="Stop sequences")


class CodeResponse(BaseModel):
    generated_code: str
    model_info: Dict[str, Any]
    generation_params: Dict[str, Any]
async def load_model():
    """Load the DeepCoder model and tokenizer."""
    global tokenizer, model, model_loaded
    if model_loaded:
        return
    try:
        logger.info(f"Loading model: {MODEL_NAME}")

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            cache_dir=CACHE_DIR,
            trust_remote_code=True,
        )

        # 4-bit quantization (bitsandbytes) is only available on CUDA;
        # the bare load_in_4bit kwarg is deprecated in favor of BitsAndBytesConfig
        quantization_config = None
        if "Q4" in MODEL_VARIANT and DEVICE == "cuda":
            quantization_config = BitsAndBytesConfig(load_in_4bit=True)

        # Load model with settings appropriate for the quantized variant
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            cache_dir=CACHE_DIR,
            trust_remote_code=True,
            torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
            device_map="auto" if DEVICE == "cuda" else None,
            quantization_config=quantization_config,
        )

        if DEVICE == "cpu":
            model = model.to(DEVICE)

        model_loaded = True
        logger.info(f"Model loaded successfully on {DEVICE}")
    except Exception as e:
        logger.error(f"Error loading model: {e}")
        raise
@app.on_event("startup")
async def startup_event():
    """Load the model when the server starts."""
    await load_model()
@app.get("/")
async def root():
    return {
        "message": "DeepCoder API",
        "model": MODEL_NAME,
        "variant": MODEL_VARIANT,
        "status": "ready" if model_loaded else "loading",
    }
@app.get("/health")
async def health_check():
    return {
        "status": "healthy" if model_loaded else "loading",
        "model_loaded": model_loaded,
        "device": DEVICE,
        "gpu_available": torch.cuda.is_available(),
    }
@app.get("/model-info")
async def model_info():
    """Get model information."""
    if not model_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded yet")
    return {
        "model_name": MODEL_NAME,
        "variant": MODEL_VARIANT,
        "max_context_length": MAX_TOKENS,
        "device": DEVICE,
        "model_size": "14B parameters",
        "quantization": "Q4_K_M" if "Q4" in MODEL_VARIANT else "None",
        "benchmarks": {
            "LiveCodeBench_v5_Pass@1": "60.6%",
            "Codeforces_Elo": 1936,
            "Codeforces_Percentile": "95.3",
            "HumanEval+_Accuracy": "92.6%",
        },
    }
@app.post("/generate")
async def generate_code(request: CodeRequest):
    """Generate code using the DeepCoder model."""
    if not model_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded yet")
    try:
        # Tokenize input, leaving room in the context window for the output
        inputs = tokenizer(
            request.prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_TOKENS - request.max_tokens,
        )
        if DEVICE == "cuda":
            inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

        # Generation parameters
        generation_kwargs = {
            "max_new_tokens": request.max_tokens,
            "temperature": request.temperature,
            "top_p": request.top_p,
            "do_sample": True,
            "pad_token_id": tokenizer.eos_token_id,
        }
        if request.stop_sequences:
            # generate() takes custom stop strings via `stop_strings` and needs
            # the tokenizer to locate them (transformers >= 4.39)
            generation_kwargs["stop_strings"] = request.stop_sequences
            generation_kwargs["tokenizer"] = tokenizer

        # Generate
        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_kwargs)

        # Decode only the newly generated tokens, skipping the prompt
        generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
        generated_code = tokenizer.decode(generated_tokens, skip_special_tokens=True)

        return CodeResponse(
            generated_code=generated_code,
            model_info={
                "model_name": MODEL_NAME,
                "variant": MODEL_VARIANT,
                "device": DEVICE,
            },
            generation_params={
                "temperature": request.temperature,
                "top_p": request.top_p,
                "max_tokens": request.max_tokens,
            },
        )
    except Exception as e:
        logger.error(f"Generation error: {e}")
        raise HTTPException(status_code=500, detail=f"Generation failed: {e}")
@app.post("/chat")
async def chat_completion(request: CodeRequest):
    """Chat-style completion for code assistance."""
    # Prepend a system context for better code generation
    system_prompt = (
        "You are DeepCoder, an expert AI programming assistant. Generate "
        "high-quality, well-commented code that follows best practices."
    )
    full_prompt = f"{system_prompt}\n\nUser: {request.prompt}\n\nAssistant:"

    # Re-issue the request with the system prompt prepended
    modified_request = CodeRequest(
        prompt=full_prompt,
        temperature=request.temperature,
        top_p=request.top_p,
        max_tokens=request.max_tokens,
        stop_sequences=request.stop_sequences,
    )
    return await generate_code(modified_request)
if __name__ == "__main__":
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=8000,
        reload=False,
        log_level="info",
    )
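
For a quick smoke test once the server is up, a minimal client call might look like the sketch below. It assumes the server is reachable at http://localhost:8000, uses the /generate route defined above, and requires the requests package; the script name and prompt are illustrative only.

# client_example.py -- hypothetical smoke-test script, not part of the server
import requests

resp = requests.post(
    "http://localhost:8000/generate",
    json={
        "prompt": "Write a Python function that reverses a string.",
        "temperature": 0.6,
        "top_p": 0.95,
        "max_tokens": 256,
    },
    timeout=600,  # the first call can be slow while the model warms up
)
resp.raise_for_status()
print(resp.json()["generated_code"])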