# Source: dragon-3b-inference / Dockerfile
# Last commit e947af3 by jeanbaptdzd:
#   "Fix: Add build dependencies for flash-linear-attention"
# Dragon-3B on HuggingFace Spaces
# Optimized for T4/L4 GPU with flash-linear-attention
#
# Base image: CUDA 12.1.0 "devel" variant on Ubuntu 22.04, pinned to an exact tag.
# NOTE(review): the -devel flavour is presumably required so the CUDA compiler
# and headers are available when the optimization requirements are built from
# source later in this file - confirm before switching to a -runtime image.
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04
# Build-time only: stops apt/debconf from prompting during RUN steps.
# Declared as ARG (not ENV) so it is visible to every RUN instruction in this
# stage but is NOT baked into the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive

# Runtime environment:
#   PYTHONUNBUFFERED=1   - Python writes stdout/stderr unbuffered, so logs show
#                          up immediately in the container log stream.
#   HF_HOME=/data/cache  - root directory for the Hugging Face cache.
#   PORT=7860            - same value as the EXPOSE'd port.
#                          NOTE(review): presumably read by app.main - confirm.
ENV PYTHONUNBUFFERED=1 \
    HF_HOME=/data/cache \
    PORT=7860
# Install system dependencies.
# --no-install-recommends keeps the layer lean. The packages that python3-pip
# and git would otherwise drag in as "recommended" and that the later source
# builds may rely on are therefore listed explicitly:
#   build-essential - C/C++ toolchain for compiling Python extensions
#   python3-dev     - Python headers (Python.h) for compiled extensions
#   ca-certificates - TLS trust store for HTTPS downloads by pip and git
# update + install + list cleanup stay in ONE layer so no stale apt index is
# cached and the package lists never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        git \
        python3-dev \
        python3-pip \
        python3.10 \
    && rm -rf /var/lib/apt/lists/*
# Set working directory (created automatically if it does not exist)
WORKDIR /app

# Copy ONLY the requirement files first. The dependency layers below are then
# rebuilt only when a requirements file changes, not on every edit to the
# application source - important because the optimization build is slow.
COPY ./requirements ./requirements

# Install Python dependencies
# Upgrade pip and install build tools first
RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel

# Install base dependencies
RUN pip3 install --no-cache-dir -r requirements/base.txt

# Install build dependencies for flash-linear-attention.
# They must already be present because the next step disables build isolation.
RUN pip3 install --no-cache-dir packaging ninja

# Install optimizations (this will take longer but gives 3-4x speedup)
# Comment out this line for faster builds (at cost of performance)
# --no-build-isolation makes the build use the packages installed above
# instead of a fresh, isolated build environment.
RUN pip3 install --no-cache-dir -r requirements/optimization.txt --no-build-isolation

# Copy application code last: it changes most often, and placing it after the
# dependency installs keeps those layers cached across code-only changes.
COPY ./app ./app
# Expose port (documentation only; does not publish the port by itself)
EXPOSE 7860

# Health check: probe the /health endpoint on the exposed port.
# Uses only the Python standard library, so it does not depend on `requests`
# being installed. urllib's urlopen raises on 4xx/5xx responses (unlike
# requests.get, which returns normally), so an erroring endpoint makes the
# probe exit non-zero and the container is reported unhealthy.
# The 25s request timeout sits below Docker's 30s --timeout so the probe fails
# on its own rather than being killed.
# NOTE(review): --start-period=5s may be short if model loading is slow - confirm.
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD ["python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:7860/health', timeout=25)"]
# Run application: start the `app.main` module with python3.
# Exec (JSON-array) form is used, so python3 is launched directly without a
# wrapping shell. The module is resolved from the working directory /app,
# where the ./app package was copied.
CMD ["python3", "-m", "app.main"]