likhonsheikh commited on
Commit
aba8087
Β·
verified Β·
1 Parent(s): 1dfc148

Upload 10 files

Browse files
Files changed (10) hide show
  1. Dockerfile +46 -0
  2. README.md +44 -10
  3. app.py +226 -0
  4. deploy-hf.sh +108 -0
  5. docker-compose.yml +51 -0
  6. download_model.py +51 -0
  7. nginx.conf +52 -0
  8. requirements.txt +14 -0
  9. setup-files.py +662 -0
  10. setup.sh +81 -0
Dockerfile ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Container image for the DeepCoder FastAPI service (see app.py).
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# System packages: curl is required by the HEALTHCHECK below; wget/git are
# used when fetching model repositories.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    wget \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy the dependency manifest first so this (slow) layer is cached until
# requirements.txt changes. requirements.txt already pins torch,
# transformers, accelerate, bitsandbytes and huggingface_hub, so one
# install replaces the previous duplicated ad-hoc `pip install` layer.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Create directories for model and cache
RUN mkdir -p /app/models /app/cache

# Runtime configuration; all overridable at `docker run` time.
ENV MODEL_NAME="ai/deepcoder-preview"
ENV MODEL_VARIANT="14B-Q4_K_M"
ENV HUGGINGFACE_HUB_CACHE="/app/cache"
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
# releases; kept for backward compatibility alongside HUGGINGFACE_HUB_CACHE.
ENV TRANSFORMERS_CACHE="/app/cache"

# Copy application files
COPY app.py .
COPY download_model.py .

# Expose port for API
EXPOSE 8000

# Mark the container unhealthy when the API stops answering /health.
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the application
CMD ["python", "app.py"]
README.md CHANGED
@@ -1,10 +1,44 @@
1
- ---
2
- title: Model
3
- emoji: 😻
4
- colorFrom: yellow
5
- colorTo: purple
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DeepCoder Docker Deployment
2
+
3
+ Complete Docker setup for deploying the DeepCoder-14B AI code generation model.
4
+
5
## Quick Start

1. **Setup and Deploy:**

```bash
chmod +x setup.sh
./setup.sh
```

2. **Test the API:**

```bash
curl -X POST http://localhost:8000/generate \
  -H 'Content-Type: application/json' \
  -d '{"prompt": "def fibonacci(n):", "max_tokens": 200}'
```
19
+
20
+ ## Deployment Options
21
+
22
+ ### Local Docker
23
+ - Run `./setup.sh` for automatic setup
24
+ - Supports both GPU and CPU deployment
25
+ - Includes Nginx reverse proxy with rate limiting
26
+
27
+ ### Hugging Face Spaces
28
+ - Run `./deploy-hf.sh [space-name] [username]`
29
+ - Requires `HF_TOKEN` environment variable
30
+ - Automatically configures for HF Spaces (port 7860)
31
+
32
+ ## API Endpoints
33
+
34
+ - `POST /generate` - Generate code from prompts
35
+ - `POST /chat` - Chat-style code assistance
36
+ - `GET /model/info` - Model benchmarks and info
37
+ - `GET /health` - Health check
38
+
39
+ ## Requirements
40
+
41
+ - Docker & Docker Compose
42
+ - 16GB+ RAM (32GB recommended)
43
+ - NVIDIA GPU with 8GB+ VRAM (optional, falls back to CPU)
44
+ - 50GB+ disk space for model cache
app.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3
"""
DeepCoder Model API Server

Serves the DeepCoder-14B model via FastAPI.

Endpoints:
    GET  /            -- service banner and load status
    GET  /health      -- liveness/readiness probe (used by Docker HEALTHCHECK)
    GET  /model/info  -- static model metadata and published benchmarks
    POST /generate    -- raw prompt completion
    POST /chat        -- completion wrapped in a chat-style system prompt
"""

import asyncio
import logging
import os
from typing import Any, Dict, Optional

import torch
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from transformers import AutoTokenizer, AutoModelForCausalLM

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration -- all overridable through the environment (see Dockerfile).
MODEL_NAME = os.getenv("MODEL_NAME", "ai/deepcoder-preview")
MODEL_VARIANT = os.getenv("MODEL_VARIANT", "14B-Q4_K_M")
CACHE_DIR = os.getenv("HUGGINGFACE_HUB_CACHE", "/app/cache")
MAX_TOKENS = 131072  # 131K context length
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

app = FastAPI(
    title="DeepCoder API",
    description="AI Code Generation Model API",
    version="1.0.0"
)

# CORS is wide open on purpose: the nginx reverse proxy in front of this
# service (see nginx.conf) is responsible for access control/rate limiting.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global model state, populated exactly once by load_model() at startup.
tokenizer = None
model = None
model_loaded = False


class CodeRequest(BaseModel):
    """Request body shared by /generate and /chat."""

    prompt: str = Field(..., description="Code generation prompt")
    temperature: float = Field(0.6, ge=0.0, le=2.0, description="Sampling temperature")
    top_p: float = Field(0.95, ge=0.0, le=1.0, description="Top-p sampling")
    max_tokens: int = Field(2048, ge=1, le=8192, description="Maximum tokens to generate")
    stop_sequences: Optional[list] = Field(None, description="Stop sequences")


class CodeResponse(BaseModel):
    """Response body for /generate (and /chat, which delegates to it)."""

    generated_code: str
    model_info: Dict[str, Any]
    generation_params: Dict[str, Any]


async def load_model():
    """Load the DeepCoder model and tokenizer into the module globals."""
    global tokenizer, model, model_loaded

    if model_loaded:
        return

    try:
        logger.info(f"Loading model: {MODEL_NAME}")

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            cache_dir=CACHE_DIR,
            trust_remote_code=True
        )

        # 4-bit quantization requires bitsandbytes, which is CUDA-only, so
        # it is requested only for Q4 variants actually running on a GPU
        # (previously it was also passed on CPU, where loading would fail).
        model_kwargs = {
            "cache_dir": CACHE_DIR,
            "trust_remote_code": True,
            "torch_dtype": torch.float16 if DEVICE == "cuda" else torch.float32,
            "device_map": "auto" if DEVICE == "cuda" else None,
        }
        if "Q4" in MODEL_VARIANT and DEVICE == "cuda":
            model_kwargs["load_in_4bit"] = True

        model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_kwargs)

        if DEVICE == "cpu" and hasattr(model, 'to'):
            model = model.to(DEVICE)

        model_loaded = True
        logger.info(f"Model loaded successfully on {DEVICE}")

    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        raise


@app.on_event("startup")
async def startup_event():
    """Load model on startup so the first request does not pay the cost."""
    await load_model()


@app.get("/")
async def root():
    """Service banner with the current load status."""
    return {
        "message": "DeepCoder API",
        "model": MODEL_NAME,
        "variant": MODEL_VARIANT,
        "status": "ready" if model_loaded else "loading"
    }


@app.get("/health")
async def health_check():
    """Health probe consumed by the Docker HEALTHCHECK and nginx."""
    return {
        "status": "healthy" if model_loaded else "loading",
        "model_loaded": model_loaded,
        "device": DEVICE,
        "gpu_available": torch.cuda.is_available()
    }


@app.get("/model/info")
async def model_info():
    """Get model information"""
    if not model_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    return {
        "model_name": MODEL_NAME,
        "variant": MODEL_VARIANT,
        "max_context_length": MAX_TOKENS,
        "device": DEVICE,
        "model_size": "14B parameters",
        "quantization": "Q4_K_M" if "Q4" in MODEL_VARIANT else "None",
        "benchmarks": {
            "LiveCodeBench_v5_Pass@1": "60.6%",
            "Codeforces_Elo": 1936,
            "Codeforces_Percentile": "95.3",
            "HumanEval+_Accuracy": "92.6%"
        }
    }


def _apply_stop_sequences(text: str, stop_sequences: Optional[list]) -> str:
    """Truncate *text* at the earliest occurrence of any stop sequence.

    transformers' ``model.generate`` has no ``stop_sequences`` keyword --
    forwarding one makes its kwarg validation raise, so stopping is
    emulated by cutting the decoded output instead.
    """
    if not stop_sequences:
        return text
    cut = len(text)
    for stop in stop_sequences:
        idx = text.find(stop)
        if idx != -1:
            cut = min(cut, idx)
    return text[:cut]


def _generate_sync(request: CodeRequest) -> str:
    """Blocking tokenization + generation; run off the event loop."""
    # Reserve room for the completion inside the model's context window.
    inputs = tokenizer(
        request.prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_TOKENS - request.max_tokens
    )

    if DEVICE == "cuda":
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

    generation_kwargs = {
        "max_new_tokens": request.max_tokens,
        "temperature": request.temperature,
        "top_p": request.top_p,
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
    }

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Strip the prompt tokens and decode only the completion.
    generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    generated_code = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return _apply_stop_sequences(generated_code, request.stop_sequences)


@app.post("/generate", response_model=CodeResponse)
async def generate_code(request: CodeRequest):
    """Generate code using the DeepCoder model"""
    if not model_loaded:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    try:
        # Generation is compute-bound and blocking; run it in a worker
        # thread so the event loop keeps serving /health meanwhile.
        generated_code = await asyncio.to_thread(_generate_sync, request)

        return CodeResponse(
            generated_code=generated_code,
            model_info={
                "model_name": MODEL_NAME,
                "variant": MODEL_VARIANT,
                "device": DEVICE
            },
            generation_params={
                "temperature": request.temperature,
                "top_p": request.top_p,
                "max_tokens": request.max_tokens
            }
        )

    except Exception as e:
        logger.error(f"Generation error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")


@app.post("/chat")
async def chat_completion(request: CodeRequest):
    """Chat-style completion for code assistance"""
    # Add system context for better code generation
    system_prompt = """You are DeepCoder, an expert AI programming assistant. Generate high-quality, well-commented code that follows best practices."""

    full_prompt = f"{system_prompt}\n\nUser: {request.prompt}\n\nAssistant:"

    # Create modified request with system prompt
    modified_request = CodeRequest(
        prompt=full_prompt,
        temperature=request.temperature,
        top_p=request.top_p,
        max_tokens=request.max_tokens,
        stop_sequences=request.stop_sequences
    )

    return await generate_code(modified_request)


if __name__ == "__main__":
    # NOTE: deploy-hf.sh rewrites `port=8000` to 7860 for HF Spaces; keep
    # this literal intact.
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=8000,
        reload=False,
        log_level="info"
    )
deploy-hf.sh ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# Deploy the DeepCoder API to a Hugging Face Space.
#
# Usage: ./deploy-hf.sh [space-name] [hf-username]
# Requires: git + git-lfs, and an HF_TOKEN environment variable holding a
# token with write access to the target Space.

set -e

echo "🤗 Deploying to Hugging Face Spaces"
echo "===================================="

# Check if git is configured
if ! git config user.email > /dev/null; then
    echo "⚠️ Please configure git:"
    echo "git config --global user.email 'your-email@example.com'"
    echo "git config --global user.name 'Your Name'"
    exit 1
fi

# Check if HF_TOKEN is set
if [ -z "$HF_TOKEN" ]; then
    echo "⚠️ Please set your Hugging Face token:"
    echo "export HF_TOKEN=your_hf_token_here"
    exit 1
fi

SPACE_NAME=${1:-"deepcoder-api"}
HF_USERNAME=${2:-$(whoami)}

echo "Creating Space: $HF_USERNAME/$SPACE_NAME"

# Space README with HF Spaces front-matter. The heredoc delimiter is
# deliberately unquoted; backticks are escaped so they survive as literal
# markdown instead of triggering command substitution.
cat > README.md << EOF
---
title: DeepCoder API
emoji: 🚀
colorFrom: blue
colorTo: green
sdk: docker
pinned: false
license: mit
---

# DeepCoder API

High-performance code generation API powered by DeepCoder-14B model.

## Features
- 🎯 60.6% pass rate on LiveCodeBench v5
- 🏆 1936 Elo rating on Codeforces (95.3 percentile)
- 📝 92.6% accuracy on HumanEval+
- ⚡ 131K token context length
- 🔧 Optimized Q4_K_M quantization

## API Endpoints
- \`POST /generate\` - Generate code from prompts
- \`POST /chat\` - Chat-style code assistance
- \`GET /model/info\` - Model information
- \`GET /health\` - Health check

## Usage
\`\`\`bash
curl -X POST /generate \\
  -H 'Content-Type: application/json' \\
  -d '{"prompt": "def fibonacci(n):", "max_tokens": 200}'
\`\`\`
EOF

# Create Dockerfile for HF Spaces
cat > Dockerfile.hf << EOF
FROM python:3.11-slim

WORKDIR /app

RUN apt-get update && apt-get install -y curl git && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 7860

CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
EOF

# HF Spaces builds the file named `Dockerfile`, so install the Spaces
# variant in its place (previously Dockerfile.hf was generated but never
# used by the Space).
cp Dockerfile.hf Dockerfile

# Update app.py for HF Spaces (port 7860)
sed 's/port=8000/port=7860/g' app.py > app_hf.py
mv app_hf.py app.py

# Initialize git repo if not exists
if [ ! -d .git ]; then
    git init
    git lfs install
fi

# Track large model files with git LFS (idempotent across re-runs: skip
# patterns that are already present instead of appending duplicates).
for pattern in "*.bin" "*.safetensors"; do
    line="$pattern filter=lfs diff=lfs merge=lfs -text"
    grep -qxF "$line" .gitattributes 2>/dev/null || echo "$line" >> .gitattributes
done

# Push over HTTPS with the token embedded for non-interactive auth; a bare
# https remote would prompt for credentials and fail in CI.
REMOTE_URL="https://user:${HF_TOKEN}@huggingface.co/spaces/${HF_USERNAME}/${SPACE_NAME}"
if git remote get-url origin > /dev/null 2>&1; then
    git remote set-url origin "$REMOTE_URL"
else
    git remote add origin "$REMOTE_URL"
fi

# Commit and push
git add .
git commit -m "Initial DeepCoder API deployment" || true
git push -u origin main

echo "✅ Deployed to: https://huggingface.co/spaces/$HF_USERNAME/$SPACE_NAME"
docker-compose.yml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Compose stack: the DeepCoder FastAPI service plus an nginx reverse proxy
# that fronts it with rate limiting (see nginx.conf).
version: '3.8'

services:
  deepcoder-api:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: deepcoder-model
    ports:
      - "8000:8000"
    environment:
      # Mirrors the defaults baked into the Dockerfile; override here.
      - MODEL_NAME=ai/deepcoder-preview
      - MODEL_VARIANT=14B-Q4_K_M
      - HUGGINGFACE_HUB_CACHE=/app/cache
      - CUDA_VISIBLE_DEVICES=0
    volumes:
      # Host bind mounts so model weights/cache survive container rebuilds.
      - ./models:/app/models
      - ./cache:/app/cache
      - ./logs:/app/logs
    restart: unless-stopped
    deploy:
      resources:
        reservations:
          devices:
            # Requires the NVIDIA container toolkit; remove for CPU-only hosts.
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s  # model loading dominates startup time

  nginx:
    image: nginx:alpine
    container_name: deepcoder-nginx
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf
      - ./ssl:/etc/nginx/ssl
    depends_on:
      - deepcoder-api
    restart: unless-stopped

# NOTE(review): the top-level named volumes (models/cache/logs) previously
# declared here were never referenced -- every service uses host bind
# mounts -- so the dead `volumes:` section has been removed.
download_model.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3
"""
Download script for DeepCoder model

Downloads and caches the model so container startup does not have to, then
verifies the download by loading the tokenizer from the cache.
"""

import logging
import os
import sys

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Both values mirror the defaults baked into the Dockerfile.
MODEL_NAME = os.getenv("MODEL_NAME", "ai/deepcoder-preview")
CACHE_DIR = os.getenv("HUGGINGFACE_HUB_CACHE", "/app/cache")


def download_model() -> bool:
    """Download the model and tokenizer.

    Returns:
        True on success, False on any download/verification error.
    """
    try:
        logger.info(f"Downloading model: {MODEL_NAME}")

        # Download all model files into the shared cache. Interrupted
        # downloads resume by default in current huggingface_hub, and the
        # deprecated `resume_download` flag has been dropped.
        snapshot_download(
            repo_id=MODEL_NAME,
            cache_dir=CACHE_DIR,
        )

        # Verify by loading tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            cache_dir=CACHE_DIR,
            trust_remote_code=True
        )

        logger.info("Model downloaded successfully")
        logger.info(f"Vocab size: {tokenizer.vocab_size}")
        logger.info(f"Cache directory: {CACHE_DIR}")

        return True

    except Exception as e:
        logger.error(f"Error downloading model: {str(e)}")
        return False


if __name__ == "__main__":
    if not download_model():
        sys.exit(1)
    logger.info("Download complete!")
nginx.conf ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
events {
    worker_connections 1024;
}

http {
    # Upstream: the FastAPI container defined in docker-compose.yml.
    upstream deepcoder_backend {
        server deepcoder-api:8000;
    }

    # Rate limiting: at most 10 requests per minute per client IP.
    limit_req_zone $binary_remote_addr zone=api:10m rate=10r/m;

    server {
        listen 80;
        server_name localhost;

        # Security headers. `always` attaches them to error responses
        # (4xx/5xx) as well, not only to successful ones.
        add_header X-Frame-Options DENY always;
        add_header X-Content-Type-Options nosniff always;
        add_header X-XSS-Protection "1; mode=block" always;

        # Increase client max body size for large code submissions
        client_max_body_size 10M;

        # Generation can take minutes; keep proxy timeouts generous.
        proxy_connect_timeout 60s;
        proxy_send_timeout 300s;
        proxy_read_timeout 300s;

        # Default: proxy everything to the API.
        location / {
            proxy_pass http://deepcoder_backend;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
        }

        # The expensive endpoint gets the rate limit (burst of 5, no delay).
        location /generate {
            limit_req zone=api burst=5 nodelay;
            proxy_pass http://deepcoder_backend;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
        }

        # Health checks are frequent and uninteresting; keep them out of logs.
        location /health {
            proxy_pass http://deepcoder_backend;
            access_log off;
        }
    }
}
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.104.1
2
+ uvicorn[standard]==0.24.0
3
+ torch>=2.0.0
4
+ transformers>=4.35.0
5
+ accelerate>=0.24.0
6
+ bitsandbytes>=0.41.0
7
+ huggingface_hub>=0.19.0
8
+ pydantic>=2.5.0
9
+ python-multipart==0.0.6
10
+ jinja2>=3.1.0
11
+ aiofiles>=23.0.0
12
+ nvidia-ml-py3>=7.352.0
13
+ psutil>=5.9.0
14
+ requests>=2.31.0
setup-files.py ADDED
@@ -0,0 +1,662 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ I'll help you create Docker files and set up hosting for the DeepCoder model. Let me create the necessary files for both Docker setup and potential Hugging Face deployment.
2
+
3
+ # Dockerfile for DeepCoder AI Model
4
+ FROM python:3.11-slim
5
+
6
+ # Set working directory
7
+ WORKDIR /app
8
+
9
+ # Install system dependencies
10
+ RUN apt-get update && apt-get install -y \
11
+ curl \
12
+ wget \
13
+ git \
14
+ && rm -rf /var/lib/apt/lists/*
15
+
16
+ # Install Docker Model Runner (assuming it's a Python package or CLI tool)
17
+ RUN pip install --no-cache-dir \
18
+ torch \
19
+ transformers \
20
+ accelerate \
21
+ bitsandbytes \
22
+ huggingface_hub
23
+
24
+ # Create directories for model and cache
25
+ RUN mkdir -p /app/models /app/cache
26
+
27
+ # Set environment variables
28
+ ENV MODEL_NAME="ai/deepcoder-preview"
29
+ ENV MODEL_VARIANT="14B-Q4_K_M"
30
+ ENV HUGGINGFACE_HUB_CACHE="/app/cache"
31
+ ENV TRANSFORMERS_CACHE="/app/cache"
32
+
33
+ # Copy application files
34
+ COPY requirements.txt .
35
+ COPY app.py .
36
+ COPY download_model.py .
37
+
38
+ # Install Python dependencies
39
+ RUN pip install --no-cache-dir -r requirements.txt
40
+
41
+ # Download model during build (optional - can be done at runtime)
42
+ # RUN python download_model.py
43
+
44
+ # Expose port for API
45
+ EXPOSE 8000
46
+
47
+ # Health check
48
+ HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
49
+ CMD curl -f http://localhost:8000/health || exit 1
50
+
51
+ # Run the application
52
+ CMD ["python", "app.py"]
53
+
54
+ version: '3.8'
55
+
56
+ services:
57
+ deepcoder-api:
58
+ build:
59
+ context: .
60
+ dockerfile: Dockerfile
61
+ container_name: deepcoder-model
62
+ ports:
63
+ - "8000:8000"
64
+ environment:
65
+ - MODEL_NAME=ai/deepcoder-preview
66
+ - MODEL_VARIANT=14B-Q4_K_M
67
+ - HUGGINGFACE_HUB_CACHE=/app/cache
68
+ - CUDA_VISIBLE_DEVICES=0
69
+ volumes:
70
+ - ./models:/app/models
71
+ - ./cache:/app/cache
72
+ - ./logs:/app/logs
73
+ restart: unless-stopped
74
+ deploy:
75
+ resources:
76
+ reservations:
77
+ devices:
78
+ - driver: nvidia
79
+ count: 1
80
+ capabilities: [gpu]
81
+ healthcheck:
82
+ test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
83
+ interval: 30s
84
+ timeout: 10s
85
+ retries: 3
86
+ start_period: 40s
87
+
88
+ nginx:
89
+ image: nginx:alpine
90
+ container_name: deepcoder-nginx
91
+ ports:
92
+ - "80:80"
93
+ - "443:443"
94
+ volumes:
95
+ - ./nginx.conf:/etc/nginx/nginx.conf
96
+ - ./ssl:/etc/nginx/ssl
97
+ depends_on:
98
+ - deepcoder-api
99
+ restart: unless-stopped
100
+
101
+ volumes:
102
+ models:
103
+ cache:
104
+ logs:
105
+
106
+ #!/usr/bin/env python3
107
+ """
108
+ DeepCoder Model API Server
109
+ Serves the DeepCoder-14B model via FastAPI
110
+ """
111
+
112
+ import os
113
+ import asyncio
114
+ import logging
115
+ from typing import Optional, Dict, Any
116
+ import uvicorn
117
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
118
+ from fastapi.middleware.cors import CORSMiddleware
119
+ from pydantic import BaseModel, Field
120
+ import torch
121
+ from transformers import AutoTokenizer, AutoModelForCausalLM
122
+ from huggingface_hub import hf_hub_download
123
+ import json
124
+
125
+ # Configure logging
126
+ logging.basicConfig(level=logging.INFO)
127
+ logger = logging.getLogger(__name__)
128
+
129
+ # Configuration
130
+ MODEL_NAME = os.getenv("MODEL_NAME", "ai/deepcoder-preview")
131
+ MODEL_VARIANT = os.getenv("MODEL_VARIANT", "14B-Q4_K_M")
132
+ CACHE_DIR = os.getenv("HUGGINGFACE_HUB_CACHE", "/app/cache")
133
+ MAX_TOKENS = 131072 # 131K context length
134
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
135
+
136
+ app = FastAPI(
137
+ title="DeepCoder API",
138
+ description="AI Code Generation Model API",
139
+ version="1.0.0"
140
+ )
141
+
142
+ # CORS middleware
143
+ app.add_middleware(
144
+ CORSMiddleware,
145
+ allow_origins=["*"],
146
+ allow_credentials=True,
147
+ allow_methods=["*"],
148
+ allow_headers=["*"],
149
+ )
150
+
151
+ # Global model variables
152
+ tokenizer = None
153
+ model = None
154
+ model_loaded = False
155
+
156
+ class CodeRequest(BaseModel):
157
+ prompt: str = Field(..., description="Code generation prompt")
158
+ temperature: float = Field(0.6, ge=0.0, le=2.0, description="Sampling temperature")
159
+ top_p: float = Field(0.95, ge=0.0, le=1.0, description="Top-p sampling")
160
+ max_tokens: int = Field(2048, ge=1, le=8192, description="Maximum tokens to generate")
161
+ stop_sequences: Optional[list] = Field(None, description="Stop sequences")
162
+
163
+ class CodeResponse(BaseModel):
164
+ generated_code: str
165
+ model_info: Dict[str, Any]
166
+ generation_params: Dict[str, Any]
167
+
168
+ async def load_model():
169
+ """Load the DeepCoder model and tokenizer"""
170
+ global tokenizer, model, model_loaded
171
+
172
+ if model_loaded:
173
+ return
174
+
175
+ try:
176
+ logger.info(f"Loading model: {MODEL_NAME}")
177
+
178
+ # Load tokenizer
179
+ tokenizer = AutoTokenizer.from_pretrained(
180
+ MODEL_NAME,
181
+ cache_dir=CACHE_DIR,
182
+ trust_remote_code=True
183
+ )
184
+
185
+ # Load model with appropriate settings for the quantized version
186
+ model = AutoModelForCausalLM.from_pretrained(
187
+ MODEL_NAME,
188
+ cache_dir=CACHE_DIR,
189
+ trust_remote_code=True,
190
+ torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
191
+ device_map="auto" if DEVICE == "cuda" else None,
192
+ load_in_4bit=True if "Q4" in MODEL_VARIANT else False,
193
+ )
194
+
195
+ if DEVICE == "cpu" and hasattr(model, 'to'):
196
+ model = model.to(DEVICE)
197
+
198
+ model_loaded = True
199
+ logger.info(f"Model loaded successfully on {DEVICE}")
200
+
201
+ except Exception as e:
202
+ logger.error(f"Error loading model: {str(e)}")
203
+ raise
204
+
205
+ @app.on_event("startup")
206
+ async def startup_event():
207
+ """Load model on startup"""
208
+ await load_model()
209
+
210
+ @app.get("/")
211
+ async def root():
212
+ return {
213
+ "message": "DeepCoder API",
214
+ "model": MODEL_NAME,
215
+ "variant": MODEL_VARIANT,
216
+ "status": "ready" if model_loaded else "loading"
217
+ }
218
+
219
+ @app.get("/health")
220
+ async def health_check():
221
+ return {
222
+ "status": "healthy" if model_loaded else "loading",
223
+ "model_loaded": model_loaded,
224
+ "device": DEVICE,
225
+ "gpu_available": torch.cuda.is_available()
226
+ }
227
+
228
+ @app.get("/model/info")
229
+ async def model_info():
230
+ """Get model information"""
231
+ if not model_loaded:
232
+ raise HTTPException(status_code=503, detail="Model not loaded yet")
233
+
234
+ return {
235
+ "model_name": MODEL_NAME,
236
+ "variant": MODEL_VARIANT,
237
+ "max_context_length": MAX_TOKENS,
238
+ "device": DEVICE,
239
+ "model_size": "14B parameters",
240
+ "quantization": "Q4_K_M" if "Q4" in MODEL_VARIANT else "None",
241
+ "benchmarks": {
242
+ "LiveCodeBench_v5_Pass@1": "60.6%",
243
+ "Codeforces_Elo": 1936,
244
+ "Codeforces_Percentile": "95.3",
245
+ "HumanEval+_Accuracy": "92.6%"
246
+ }
247
+ }
248
+
249
+ @app.post("/generate", response_model=CodeResponse)
250
+ async def generate_code(request: CodeRequest):
251
+ """Generate code using the DeepCoder model"""
252
+ if not model_loaded:
253
+ raise HTTPException(status_code=503, detail="Model not loaded yet")
254
+
255
+ try:
256
+ # Tokenize input
257
+ inputs = tokenizer(
258
+ request.prompt,
259
+ return_tensors="pt",
260
+ truncation=True,
261
+ max_length=MAX_TOKENS - request.max_tokens
262
+ )
263
+
264
+ if DEVICE == "cuda":
265
+ inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
266
+
267
+ # Generation parameters
268
+ generation_kwargs = {
269
+ "max_new_tokens": request.max_tokens,
270
+ "temperature": request.temperature,
271
+ "top_p": request.top_p,
272
+ "do_sample": True,
273
+ "pad_token_id": tokenizer.eos_token_id,
274
+ }
275
+
276
+ if request.stop_sequences:
277
+ generation_kwargs["stop_sequences"] = request.stop_sequences
278
+
279
+ # Generate
280
+ with torch.no_grad():
281
+ outputs = model.generate(**inputs, **generation_kwargs)
282
+
283
+ # Decode output
284
+ generated_tokens = outputs[0][inputs["input_ids"].shape[1]:]
285
+ generated_code = tokenizer.decode(generated_tokens, skip_special_tokens=True)
286
+
287
+ return CodeResponse(
288
+ generated_code=generated_code,
289
+ model_info={
290
+ "model_name": MODEL_NAME,
291
+ "variant": MODEL_VARIANT,
292
+ "device": DEVICE
293
+ },
294
+ generation_params={
295
+ "temperature": request.temperature,
296
+ "top_p": request.top_p,
297
+ "max_tokens": request.max_tokens
298
+ }
299
+ )
300
+
301
+ except Exception as e:
302
+ logger.error(f"Generation error: {str(e)}")
303
+ raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
304
+
305
+ @app.post("/chat")
306
+ async def chat_completion(request: CodeRequest):
307
+ """Chat-style completion for code assistance"""
308
+ # Add system context for better code generation
309
+ system_prompt = """You are DeepCoder, an expert AI programming assistant. Generate high-quality, well-commented code that follows best practices."""
310
+
311
+ full_prompt = f"{system_prompt}\n\nUser: {request.prompt}\n\nAssistant:"
312
+
313
+ # Create modified request with system prompt
314
+ modified_request = CodeRequest(
315
+ prompt=full_prompt,
316
+ temperature=request.temperature,
317
+ top_p=request.top_p,
318
+ max_tokens=request.max_tokens,
319
+ stop_sequences=request.stop_sequences
320
+ )
321
+
322
+ return await generate_code(modified_request)
323
+
324
+ if __name__ == "__main__":
325
+ uvicorn.run(
326
+ "app:app",
327
+ host="0.0.0.0",
328
+ port=8000,
329
+ reload=False,
330
+ log_level="info"
331
+ )
332
+
333
+ fastapi==0.104.1
334
+ uvicorn[standard]==0.24.0
335
+ torch>=2.0.0
336
+ transformers>=4.35.0
337
+ accelerate>=0.24.0
338
+ bitsandbytes>=0.41.0
339
+ huggingface_hub>=0.19.0
340
+ pydantic>=2.5.0
341
+ python-multipart==0.0.6
342
+ jinja2>=3.1.0
343
+ aiofiles>=23.0.0
344
+ nvidia-ml-py3>=7.352.0
345
+ psutil>=5.9.0
346
+ requests>=2.31.0
347
+
348
+ #!/usr/bin/env python3
349
+ """
350
+ Download script for DeepCoder model
351
+ Downloads and caches the model for faster container startup
352
+ """
353
+
354
+ import os
355
+ import logging
356
+ from transformers import AutoTokenizer, AutoModelForCausalLM
357
+ from huggingface_hub import snapshot_download
358
+
359
+ logging.basicConfig(level=logging.INFO)
360
+ logger = logging.getLogger(__name__)
361
+
362
+ MODEL_NAME = os.getenv("MODEL_NAME", "ai/deepcoder-preview")
363
+ CACHE_DIR = os.getenv("HUGGINGFACE_HUB_CACHE", "/app/cache")
364
+
365
+ def download_model():
366
+ """Download the model and tokenizer"""
367
+ try:
368
+ logger.info(f"Downloading model: {MODEL_NAME}")
369
+
370
+ # Download model files
371
+ snapshot_download(
372
+ repo_id=MODEL_NAME,
373
+ cache_dir=CACHE_DIR,
374
+ resume_download=True
375
+ )
376
+
377
+ # Verify by loading tokenizer
378
+ tokenizer = AutoTokenizer.from_pretrained(
379
+ MODEL_NAME,
380
+ cache_dir=CACHE_DIR,
381
+ trust_remote_code=True
382
+ )
383
+
384
+ logger.info("Model downloaded successfully")
385
+ logger.info(f"Vocab size: {tokenizer.vocab_size}")
386
+ logger.info(f"Cache directory: {CACHE_DIR}")
387
+
388
+ return True
389
+
390
+ except Exception as e:
391
+ logger.error(f"Error downloading model: {str(e)}")
392
+ return False
393
+
394
+ if __name__ == "__main__":
395
+ success = download_model()
396
+ if not success:
397
+ exit(1)
398
+ logger.info("Download complete!")
399
+
400
# Nginx reverse proxy in front of the DeepCoder FastAPI backend.

events {
    worker_connections 1024;
}

http {
    # Single upstream: the API container on the compose network.
    upstream deepcoder_backend {
        server deepcoder-api:8000;
    }

    # Rate limiting: 10 requests/minute per client IP, 10 MB shared zone.
    limit_req_zone $binary_remote_addr zone=api:10m rate=10r/m;

    server {
        listen 80;
        server_name localhost;

        # Security headers
        add_header X-Frame-Options DENY;
        add_header X-Content-Type-Options nosniff;
        add_header X-XSS-Protection "1; mode=block";

        # Increase client max body size for large code submissions
        client_max_body_size 10M;

        # Timeouts for long-running generation requests
        proxy_connect_timeout 60s;
        proxy_send_timeout 300s;
        proxy_read_timeout 300s;

        # Default: proxy everything to the backend unmodified.
        location / {
            proxy_pass http://deepcoder_backend;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
        }

        # Generation endpoint is the expensive one: apply the rate limit
        # (bursts of up to 5 requests served without delay).
        location /generate {
            limit_req zone=api burst=5 nodelay;
            proxy_pass http://deepcoder_backend;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
        }

        # Health checks are frequent; keep them out of the access log.
        location /health {
            proxy_pass http://deepcoder_backend;
            access_log off;
        }
    }
}
453
#!/bin/bash
# setup.sh - Setup script for DeepCoder deployment
#
# Creates runtime directories, pulls the model, detects GPU support,
# builds/starts the Docker Compose stack, and waits for the API to
# become healthy (failing loudly if it never does).

set -e

echo "πŸš€ DeepCoder Model Setup"
echo "========================"

# Create necessary directories
echo "πŸ“ Creating directories..."
mkdir -p models cache logs ssl

# Set permissions (ssl kept private to the owner)
chmod 755 models cache logs
chmod 700 ssl

# Pull the DeepCoder model using Docker Model Runner
echo "πŸ“¦ Pulling DeepCoder model..."
if command -v docker &> /dev/null; then
    # Assumes the Docker Model Runner plugin is available.
    docker model pull ai/deepcoder-preview
else
    echo "⚠️ Docker not found. Please install Docker first."
    exit 1
fi

# Check for GPU support
echo "πŸ” Checking GPU support..."
if command -v nvidia-smi &> /dev/null; then
    echo "βœ… NVIDIA GPU detected:"
    nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader

    # Verify Docker can reach the GPU. The bare "nvidia/cuda:11.8-base"
    # tag does not exist on Docker Hub; CUDA tags are fully qualified,
    # so the original probe always failed and forced CPU mode.
    if docker run --rm --gpus all nvidia/cuda:11.8.0-base-ubuntu22.04 nvidia-smi &> /dev/null; then
        echo "βœ… Docker GPU support verified"
        export GPU_SUPPORT=true
    else
        echo "⚠️ Docker GPU support not available"
        export GPU_SUPPORT=false
    fi
else
    echo "⚠️ No GPU detected. Running on CPU."
    export GPU_SUPPORT=false
fi

# Build and start containers
echo "πŸ—οΈ Building Docker containers..."
docker-compose build

echo "πŸš€ Starting services..."
if [ "$GPU_SUPPORT" = true ]; then
    docker-compose up -d
else
    # Comment out GPU requirements for CPU-only deployment
    sed 's/devices:/# devices:/g' docker-compose.yml | \
    sed 's/- driver: nvidia/# - driver: nvidia/g' | \
    sed 's/count: 1/# count: 1/g' | \
    sed 's/capabilities: \[gpu\]/# capabilities: [gpu]/g' > docker-compose-cpu.yml
    docker-compose -f docker-compose-cpu.yml up -d
fi

# Wait for services to be ready
echo "⏳ Waiting for services to start..."
sleep 30

# Health check: poll until the API answers or attempts are exhausted.
echo "πŸ₯ Performing health check..."
HEALTHY=false
for i in {1..10}; do
    if curl -f http://localhost:8000/health > /dev/null 2>&1; then
        echo "βœ… DeepCoder API is healthy!"
        HEALTHY=true
        break
    else
        echo "⏳ Waiting for API to be ready... (attempt $i/10)"
        sleep 10
    fi
done

# Fail loudly if the API never came up instead of reporting success.
if [ "$HEALTHY" != true ]; then
    echo "❌ DeepCoder API did not become healthy. Check: docker-compose logs"
    exit 1
fi

# Show status
echo "πŸ“Š Service Status:"
docker-compose ps

echo ""
echo "πŸŽ‰ DeepCoder setup complete!"
echo "API endpoint: http://localhost:8000"
echo "Health check: http://localhost:8000/health"
echo "Model info: http://localhost:8000/model/info"
echo ""
echo "To test the API:"
echo "curl -X POST http://localhost:8000/generate \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"prompt\": \"def fibonacci(n):\", \"max_tokens\": 200}'"
###########################################
# deploy-hf.sh - Hugging Face Spaces deployment
###########################################

# Generate the deploy-hf.sh helper script. The heredoc delimiter is
# quoted ('EOL'), so nothing below is expanded at generation time —
# the body is written to disk verbatim and must not be edited here.
cat > deploy-hf.sh << 'EOL'
#!/bin/bash
# Deploy to Hugging Face Spaces

set -e

echo "πŸ€— Deploying to Hugging Face Spaces"
echo "===================================="

# Check if git is configured
if ! git config user.email > /dev/null; then
    echo "⚠️ Please configure git:"
    echo "git config --global user.email 'your-email@example.com'"
    echo "git config --global user.name 'Your Name'"
    exit 1
fi

# Check if HF_TOKEN is set
if [ -z "$HF_TOKEN" ]; then
    echo "⚠️ Please set your Hugging Face token:"
    echo "export HF_TOKEN=your_hf_token_here"
    exit 1
fi

SPACE_NAME=${1:-"deepcoder-api"}
HF_USERNAME=${2:-$(whoami)}

echo "Creating Space: $HF_USERNAME/$SPACE_NAME"

# Create Hugging Face Space files
cat > README.md << EOF
---
title: DeepCoder API
emoji: πŸš€
colorFrom: blue
colorTo: green
sdk: docker
pinned: false
license: mit
---

# DeepCoder API

High-performance code generation API powered by DeepCoder-14B model.

## Features
- 🎯 60.6% pass rate on LiveCodeBench v5
- πŸ† 1936 Elo rating on Codeforces (95.3 percentile)
- πŸ“ 92.6% accuracy on HumanEval+
- ⚑ 131K token context length
- πŸ”§ Optimized Q4_K_M quantization

## API Endpoints
- \`POST /generate\` - Generate code from prompts
- \`POST /chat\` - Chat-style code assistance
- \`GET /model/info\` - Model information
- \`GET /health\` - Health check

## Usage
\`\`\`bash
curl -X POST /generate \\
  -H 'Content-Type: application/json' \\
  -d '{"prompt": "def fibonacci(n):", "max_tokens": 200}'
\`\`\`
EOF

# Create Dockerfile for HF Spaces
cat > Dockerfile.hf << EOF
FROM python:3.11-slim

WORKDIR /app

RUN apt-get update && apt-get install -y curl git && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 7860

CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
EOF

# Update app.py for HF Spaces (port 7860)
sed 's/port=8000/port=7860/g' app.py > app_hf.py
mv app_hf.py app.py

# Initialize git repo if not exists
if [ ! -d .git ]; then
    git init
    git lfs install
fi

# Track large model files with git LFS
echo "*.bin filter=lfs diff=lfs merge=lfs -text" >> .gitattributes
echo "*.safetensors filter=lfs diff=lfs merge=lfs -text" >> .gitattributes

# Add remote if not exists
if ! git remote get-url origin > /dev/null 2>&1; then
    git remote add origin https://huggingface.co/spaces/$HF_USERNAME/$SPACE_NAME
fi

# Commit and push
git add .
git commit -m "Initial DeepCoder API deployment" || true
git push -u origin main

echo "βœ… Deployed to: https://huggingface.co/spaces/$HF_USERNAME/$SPACE_NAME"
EOL

# Make the generated helper executable.
chmod +x deploy-hf.sh

echo "πŸ“ Additional deployment script created: deploy-hf.sh"
setup.sh ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# setup.sh - Setup script for DeepCoder deployment
#
# Creates runtime directories, detects GPU support, builds/starts the
# Docker Compose stack, and waits for the API to become healthy
# (failing loudly if it never does).

set -e

echo "πŸš€ DeepCoder Model Setup"
echo "========================"

# Create necessary directories
echo "πŸ“ Creating directories..."
mkdir -p models cache logs ssl

# Set permissions (ssl kept private to the owner)
chmod 755 models cache logs
chmod 700 ssl

# Check for GPU support
echo "πŸ” Checking GPU support..."
if command -v nvidia-smi &> /dev/null; then
    echo "βœ… NVIDIA GPU detected:"
    nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader

    # Verify Docker can reach the GPU. The bare "nvidia/cuda:11.8-base"
    # tag does not exist on Docker Hub; CUDA tags are fully qualified,
    # so the original probe always failed and forced CPU mode.
    if docker run --rm --gpus all nvidia/cuda:11.8.0-base-ubuntu22.04 nvidia-smi &> /dev/null; then
        echo "βœ… Docker GPU support verified"
        export GPU_SUPPORT=true
    else
        echo "⚠️ Docker GPU support not available"
        export GPU_SUPPORT=false
    fi
else
    echo "⚠️ No GPU detected. Running on CPU."
    export GPU_SUPPORT=false
fi

# Build and start containers
echo "πŸ—οΈ Building Docker containers..."
docker-compose build

echo "πŸš€ Starting services..."
if [ "$GPU_SUPPORT" = true ]; then
    docker-compose up -d
else
    # Comment out GPU requirements for CPU-only deployment
    sed 's/devices:/# devices:/g' docker-compose.yml | \
    sed 's/- driver: nvidia/# - driver: nvidia/g' | \
    sed 's/count: 1/# count: 1/g' | \
    sed 's/capabilities: \[gpu\]/# capabilities: [gpu]/g' > docker-compose-cpu.yml
    docker-compose -f docker-compose-cpu.yml up -d
fi

# Wait for services to be ready
echo "⏳ Waiting for services to start..."
sleep 30

# Health check: poll until the API answers or attempts are exhausted.
echo "πŸ₯ Performing health check..."
HEALTHY=false
for i in {1..10}; do
    if curl -f http://localhost:8000/health > /dev/null 2>&1; then
        echo "βœ… DeepCoder API is healthy!"
        HEALTHY=true
        break
    else
        echo "⏳ Waiting for API to be ready... (attempt $i/10)"
        sleep 10
    fi
done

# Fail loudly if the API never came up instead of reporting success.
if [ "$HEALTHY" != true ]; then
    echo "❌ DeepCoder API did not become healthy. Check: docker-compose logs"
    exit 1
fi

# Show status
echo "πŸ“Š Service Status:"
docker-compose ps

echo ""
echo "πŸŽ‰ DeepCoder setup complete!"
echo "API endpoint: http://localhost:8000"
echo "Health check: http://localhost:8000/health"
echo "Model info: http://localhost:8000/model/info"
echo ""
echo "To test the API:"
echo "curl -X POST http://localhost:8000/generate \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"prompt\": \"def fibonacci(n):\", \"max_tokens\": 200}'"