trainer / Dockerfile.runpod

Update Dockerfile with cache optimization

e334188 verified 4 days ago

4.84 kB

	# =============================================================================
	# AI-Toolkit Trainer - RunPod Serverless Worker (CACHE OPTIMIZED)
	# =============================================================================
	# CACHE OPTIMIZATION: Layers ordered roughly from LEAST to MOST frequently changed
	#
	# Layer Order (as built below; a change in one layer rebuilds every layer after it):
	# 1. Base image + environment [RARELY CHANGE]
	# 2. System dependencies [RARELY CHANGE]
	# 3. PyTorch check (from base image) [RARELY CHANGE]
	# 4. AI-Toolkit requirements [OCCASIONALLY CHANGE]
	# 5. RunPod + HF deps [RARELY CHANGE]
	# 6. AI-Toolkit code [OCCASIONALLY CHANGE]
	# 7. Directory setup [RARELY CHANGE]
	# 8. rp_handler.py [FREQUENTLY CHANGE]
	#
	# Build:
	# docker buildx build --platform linux/amd64 -f Dockerfile.runpod \
	# -t aloukikaditya/trainer:latest --push .
	#
	# Build with cache:
	# DOCKER_BUILDKIT=1 docker build -f Dockerfile.runpod -t aio-trainer .
	# =============================================================================

	# Base image is overridable at build time: --build-arg BASE_IMAGE=<image:tag>
	ARG BASE_IMAGE=runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04
	FROM ${BASE_IMAGE}

	# -----------------------------------------------------------------------------
	# [LAYER 1] Environment Configuration - RARELY CHANGES
	# -----------------------------------------------------------------------------
	# Build-time only: stops apt/debconf from prompting during RUN steps. Declared
	# as a stage-scoped ARG (visible to every RUN below) instead of ENV so the
	# setting is not persisted into the environment of the running container.
	ARG DEBIAN_FRONTEND=noninteractive

	# Runtime environment:
	# - PYTHONUNBUFFERED: Python writes stdout/stderr without buffering.
	# - HF_HOME / HUGGINGFACE_HUB_CACHE / TRANSFORMERS_CACHE: all Hugging Face
	#   caches are placed under /runpod-volume/huggingface-cache.
	#   NOTE(review): recent transformers releases deprecate TRANSFORMERS_CACHE in
	#   favour of HF_HOME; kept here for compatibility - confirm against the
	#   version pinned in ai-toolkit/requirements.txt.
	# - HF_HUB_ENABLE_HF_TRANSFER: presumably requires the hf_transfer package
	#   (installed by a later pip step) - verify against huggingface_hub docs.
	# - TORCH_CUDA_ARCH_LIST: CUDA compute capabilities 7.5 through 9.0.
	ENV PYTHONUNBUFFERED=1 \
	    HF_HUB_ENABLE_HF_TRANSFER=1 \
	    HF_HOME=/runpod-volume/huggingface-cache \
	    HUGGINGFACE_HUB_CACHE=/runpod-volume/huggingface-cache/hub \
	    TRANSFORMERS_CACHE=/runpod-volume/huggingface-cache/hub \
	    NO_ALBUMENTATIONS_UPDATE=1 \
	    DISABLE_TELEMETRY=YES \
	    TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.9 9.0"

	# All relative paths in later instructions (and in CMD) resolve against /app.
	WORKDIR /app

	# -----------------------------------------------------------------------------
	# [LAYER 2] System Dependencies - RARELY CHANGES
	# -----------------------------------------------------------------------------
	# /var/cache/apt and /var/lib/apt are BuildKit cache mounts: their contents
	# live on the build host and are never written into the image layer, so no
	# `apt-get clean` / `rm -rf /var/lib/apt/lists/*` is needed - running those
	# would only empty the cache and force a full re-download on the next build.
	#
	# Debian/Ubuntu base images ship /etc/apt/apt.conf.d/docker-clean, which
	# deletes downloaded .deb files after every install. It is removed and
	# Keep-Downloaded-Packages is enabled so the cache mount actually retains
	# the archives between builds.
	RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
	    --mount=type=cache,target=/var/lib/apt,sharing=locked \
	    rm -f /etc/apt/apt.conf.d/docker-clean \
	    && echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache \
	    && apt-get update \
	    && apt-get install -y --no-install-recommends \
	        aria2 \
	        curl \
	        ffmpeg \
	        git \
	        git-lfs \
	        libgl1-mesa-glx \
	        libglib2.0-0 \
	        wget

	# -----------------------------------------------------------------------------
	# [LAYER 3] PyTorch (use base image PyTorch or install specific version)
	# -----------------------------------------------------------------------------
	# Nothing is installed in this step. It imports the torch package already
	# present in the image and prints its version together with the CUDA version
	# it reports; the build stops here if `import torch` raises.
	RUN python -c "import torch; print(f'PyTorch {torch.__version__}, CUDA {torch.version.cuda}')"

	# -----------------------------------------------------------------------------
	# [LAYER 4] AI-Toolkit Requirements - OCCASIONALLY CHANGES
	# -----------------------------------------------------------------------------
	# Only the requirements file is copied at this point, so edits to the rest of
	# the ai-toolkit tree do not invalidate the dependency layer below.
	COPY ai-toolkit/requirements.txt /app/ai-toolkit/requirements.txt

	# pip's download cache is a BuildKit cache mount: it is reused across builds
	# and is not stored in the image layer.
	RUN --mount=type=cache,target=/root/.cache/pip \
	    pip install --upgrade pip \
	    && pip install -r /app/ai-toolkit/requirements.txt

	# -----------------------------------------------------------------------------
	# [LAYER 5] RunPod + HuggingFace Dependencies - RARELY CHANGES
	# -----------------------------------------------------------------------------
	# Worker-side packages installed on top of the ai-toolkit requirements.
	# Versions are not pinned here, so a rebuild of this layer picks up whatever
	# pip resolves at that time.
	RUN --mount=type=cache,target=/root/.cache/pip \
	    pip install \
	        runpod \
	        hf_transfer \
	        huggingface_hub

	# -----------------------------------------------------------------------------
	# [LAYER 6] AI-Toolkit Code - OCCASIONALLY CHANGES
	# -----------------------------------------------------------------------------
	# Copies the whole ai-toolkit directory from the build context to
	# /app/ai-toolkit (the requirements file copied earlier is overwritten in place).
	COPY ai-toolkit/ /app/ai-toolkit/

	# Smoke check only: starts the interpreter, prepends /app/ai-toolkit to
	# sys.path and prints a message. No ai-toolkit module is imported, so this
	# step cannot detect a broken or incomplete copy of the code.
	# NOTE(review): consider importing a real ai-toolkit module here so the build
	# fails when the copied tree is unusable.
	RUN python -c "import sys; sys.path.insert(0, '/app/ai-toolkit'); print('AI-Toolkit ready')"

	# -----------------------------------------------------------------------------
	# [LAYER 7] Directory Setup - RARELY CHANGES
	# -----------------------------------------------------------------------------
	# Pre-create the dataset and output directories under /workspace and the
	# Hugging Face hub cache directory under /runpod-volume; -p also creates the
	# missing parents and is a no-op for directories that already exist.
	RUN mkdir -p /workspace/dataset /workspace/output /runpod-volume/huggingface-cache/hub

	# -----------------------------------------------------------------------------
	# [LAYER 8] Handler Code - FREQUENTLY CHANGES
	# -----------------------------------------------------------------------------
	# Copied last: a change to rp_handler.py rebuilds only this layer and the
	# import check below, leaving every earlier layer cached.
	COPY rp_handler.py /app/rp_handler.py

	# Build-time import check. `python -c` puts the current directory (WORKDIR
	# /app) on sys.path, which is how rp_handler is found. The import executes
	# the module's top-level code and requires it to define both `handler` and
	# `MODEL_PRESETS`; the build fails otherwise. The preset keys are printed to
	# the build log.
	RUN python -c "from rp_handler import handler, MODEL_PRESETS; print(f'Handler ready: {list(MODEL_PRESETS.keys())}')"

	# -----------------------------------------------------------------------------
	# Runtime Configuration
	# -----------------------------------------------------------------------------
	# EXPOSE is documentation only; it does not publish the port.
	# NOTE(review): nothing in this Dockerfile shows what listens on 8000 -
	# confirm that rp_handler.py actually serves on this port.
	EXPOSE 8000

	# Exec-form CMD: python runs directly as the container's main process. The
	# relative script path resolves against WORKDIR (/app), where rp_handler.py
	# was copied. -u requests unbuffered stdout/stderr.
	CMD ["python", "-u", "rp_handler.py"]