# Dragon-3B on HuggingFace Spaces
# Optimized for T4/L4 GPU with flash-linear-attention
#
# NOTE(review): the -devel CUDA image is presumably required so the packages
# in requirements/optimization.txt can be compiled at build time (they are
# installed with --no-build-isolation next to ninja) -- confirm before
# switching to a smaller -runtime base.
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04

# Build-time only: stops apt from prompting during `RUN` steps. Declared as
# ARG rather than ENV so it is not baked into the running container's
# environment.
ARG DEBIAN_FRONTEND=noninteractive

# Runtime environment:
#   PYTHONUNBUFFERED - flush Python stdout/stderr immediately so logs show up live
#   HF_HOME          - cache directory for Hugging Face libraries
#   PORT             - port number, kept in sync with EXPOSE/HEALTHCHECK below
ENV PYTHONUNBUFFERED=1 \
    HF_HOME=/data/cache \
    PORT=7860

# Install system dependencies.
# `update`, `install` and the apt list cleanup share one layer so the package
# index is never stale and never shipped in the image.
# NOTE(review): recommended packages are intentionally left enabled; on Ubuntu
# python3-pip is believed to pull in compiler/header packages via Recommends
# that the source builds below may rely on. Verify before adding
# --no-install-recommends.
RUN apt-get update && apt-get install -y \
        git \
        python3-pip \
        python3.10 \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy only the dependency manifests first. The application source is copied
# after all `pip install` steps so that editing code under ./app does not
# invalidate the (slow) dependency layers.
# NOTE(review): assumes the requirements files do not reference paths under
# ./app (e.g. `-e ./app`) -- confirm.
COPY ./requirements ./requirements

# Install Python dependencies
# Upgrade pip and install build tools first
RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel

# Install base dependencies
RUN pip3 install --no-cache-dir -r requirements/base.txt

# Install build dependencies for flash-linear-attention.
# They must already be present because the next step disables build isolation.
RUN pip3 install --no-cache-dir packaging ninja

# Install optimizations (this will take longer but gives 3-4x speedup)
# Comment out this line for faster builds (at cost of performance)
RUN pip3 install --no-cache-dir -r requirements/optimization.txt --no-build-isolation

# Copy application code last: it changes most often.
COPY ./app ./app

# Expose port (documentation only; does not publish the port)
EXPOSE 7860

# Health check.
# Uses only the Python standard library, so it does not depend on `requests`
# being installed. urlopen() raises on connection failure, on timeout and on
# HTTP error statuses (4xx/5xx), which makes python3 exit non-zero and marks
# the probe as failed.
# NOTE(review): --start-period=5s may be too short if the model is loaded at
# startup; consider raising it once the real startup time is known.
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD python3 -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health', timeout=10)"

# NOTE(review): no USER is set, so the image defaults to root. Before adding a
# non-root user, confirm ownership/write access for /app and HF_HOME
# (/data/cache).

# Run application (exec form, so the Python process receives signals directly)
CMD ["python3", "-m", "app.main"]