Spaces:

jeanbaptdzd
/

dragon-3b-inference

Paused

Fix: Add build dependencies for flash-linear-attention

e947af3 6 months ago

1.32 kB

	# Dragon-3B on HuggingFace Spaces
	# Optimized for T4/L4 GPU with flash-linear-attention

	# Base image: CUDA 12.1.0 "devel" variant on Ubuntu 22.04, pinned by tag.
	# NOTE(review): the devel variant is presumably chosen so the CUDA toolchain
	# is available when requirements/optimization.txt is built - confirm.
	FROM nvidia/cuda:12.1.0-devel-ubuntu22.04

	# Build-time only: stops apt-get from prompting during the image build
	# without leaving the setting in the environment of the running container.
	ARG DEBIAN_FRONTEND=noninteractive

	# Set runtime environment variables
	#   PYTHONUNBUFFERED=1  - Python writes stdout/stderr unbuffered, so log
	#                         lines show up immediately.
	#   HF_HOME=/data/cache - root directory for Hugging Face caches.
	#   PORT=7860           - same port as the EXPOSE and HEALTHCHECK
	#                         instructions in this file.
	#                         NOTE(review): presumably read by app.main - confirm.
	ENV PYTHONUNBUFFERED=1 \
	HF_HOME=/data/cache \
	PORT=7860

	# Install system dependencies
	# --no-install-recommends keeps optional packages out of the image. The
	# compiler toolchain, Python headers and CA certificates are therefore
	# listed explicitly (they were previously pulled in only as recommendations
	# of python3-pip / git) so that source builds and HTTPS git fetches from the
	# requirements files keep working.
	# The apt package lists are removed in the same layer to keep it small.
	RUN apt-get update && apt-get install -y --no-install-recommends \
	build-essential \
	ca-certificates \
	git \
	python3-dev \
	python3-pip \
	python3.10 \
	&& rm -rf /var/lib/apt/lists/*

	# Set working directory
	# Relative paths used by the COPY and pip instructions below, and the
	# `python3 -m app.main` start command, are resolved against this directory.
	WORKDIR /app

	# Copy only the dependency manifests first, so the (slow) dependency layers
	# below stay cached when nothing but application code changes.
	COPY ./requirements ./requirements

	# Install Python dependencies
	# Upgrade pip and install build tools first
	RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel

	# Install base dependencies
	RUN pip3 install --no-cache-dir -r requirements/base.txt

	# Install build dependencies for flash-linear-attention
	RUN pip3 install --no-cache-dir packaging ninja

	# Install optimizations (this will take longer but gives 3-4x speedup)
	# Comment out the RUN instruction below for faster builds (at cost of performance)
	# --no-build-isolation builds against the packages already installed above
	# instead of a fresh isolated build environment, so the build requirements
	# must be present beforehand.
	RUN pip3 install --no-cache-dir -r requirements/optimization.txt --no-build-isolation

	# Copy application code last: it changes most often, and placing it after
	# the dependency installs means editing it does not invalidate those layers.
	COPY ./app ./app

	# Expose port
	# Documents the listening port (EXPOSE does not publish it by itself); the
	# value matches PORT in the ENV instruction and the health check URL.
	EXPOSE 7860

	# Health check
	# Probes the /health endpoint with the standard library only, so the check
	# does not depend on any third-party package being installed.
	# urlopen raises on connection failures and on HTTP error statuses (4xx/5xx),
	# which makes python3 exit non-zero and marks the container unhealthy; a
	# plain requests.get() would have exited 0 even on an error response.
	# The explicit timeout keeps a hung request from running into the
	# HEALTHCHECK timeout itself.
	HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
	CMD python3 -c "import urllib.request; urllib.request.urlopen('http://localhost:7860/health', timeout=10)"

	# Run application
	# Exec-form CMD: python3 is started directly (no wrapping shell) and runs
	# the app.main module.
	CMD ["python3", "-m", "app.main"]