# trainer / Dockerfile.runpod
# Last commit by Aloukik21: "Update Dockerfile with cache optimization" (e334188, verified)
# =============================================================================
# AI-Toolkit Trainer - RunPod Serverless Worker (CACHE OPTIMIZED)
# =============================================================================
# CACHE OPTIMIZATION: Layers ordered from LEAST to MOST frequently changed
#
# Layer Order (as built below, top to bottom):
# 1. Base image + environment config [RARELY CHANGE]
# 2. System dependencies (apt) [RARELY CHANGE]
# 3. PyTorch check (from base image) [RARELY CHANGE]
# 4. AI-Toolkit requirements [OCCASIONALLY CHANGE]
# 5. RunPod + HF deps [RARELY CHANGE]
# 6. AI-Toolkit code [OCCASIONALLY CHANGE]
# 7. Directory setup [RARELY CHANGE]
# 8. rp_handler.py [FREQUENTLY CHANGE]
#
# Build:
# docker buildx build --platform linux/amd64 -f Dockerfile.runpod \
# -t aloukikaditya/trainer:latest --push .
#
# Build with cache:
# DOCKER_BUILDKIT=1 docker build -f Dockerfile.runpod -t aio-trainer .
# =============================================================================
# Base image is overridable at build time: --build-arg BASE_IMAGE=<image:tag>
ARG BASE_IMAGE=runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04
FROM ${BASE_IMAGE}
# -----------------------------------------------------------------------------
# [LAYER 1] Environment Configuration - RARELY CHANGES
# -----------------------------------------------------------------------------
# Build-time only: stops apt/debconf from prompting during `docker build`
# without leaking DEBIAN_FRONTEND into the environment of the running worker.
ARG DEBIAN_FRONTEND=noninteractive
# Runtime environment.
# - All Hugging Face caches point under /runpod-volume (presumably the RunPod
#   network-volume mount point; confirm against the endpoint configuration).
# - TRANSFORMERS_CACHE is the legacy name for the hub cache; it is kept so
#   older transformers releases resolve the same directory as HF_HOME.
# - HF_HUB_ENABLE_HF_TRANSFER=1 relies on the hf_transfer package installed
#   in a later layer.
ENV PYTHONUNBUFFERED=1 \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    HF_HOME=/runpod-volume/huggingface-cache \
    HUGGINGFACE_HUB_CACHE=/runpod-volume/huggingface-cache/hub \
    TRANSFORMERS_CACHE=/runpod-volume/huggingface-cache/hub \
    NO_ALBUMENTATIONS_UPDATE=1 \
    DISABLE_TELEMETRY=YES \
    TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.9 9.0"
# All later relative paths (COPY targets, CMD script) resolve against /app.
WORKDIR /app
# -----------------------------------------------------------------------------
# [LAYER 2] System Dependencies - RARELY CHANGES
# -----------------------------------------------------------------------------
# /var/cache/apt and /var/lib/apt are BuildKit cache mounts: their contents
# live on the build host and never end up in an image layer, so no
# `apt-get clean` / `rm -rf /var/lib/apt/lists/*` is needed here. Running
# those would only empty the shared cache and defeat its purpose.
#
# Debian/Ubuntu-based images normally ship /etc/apt/apt.conf.d/docker-clean,
# which deletes downloaded .deb files after every install. Remove it (no-op
# if absent) and tell apt to keep downloads so the cache mount is reused.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    rm -f /etc/apt/apt.conf.d/docker-clean \
    && echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' \
        > /etc/apt/apt.conf.d/keep-cache \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        aria2 \
        curl \
        ffmpeg \
        git \
        git-lfs \
        libgl1-mesa-glx \
        libglib2.0-0 \
        wget
# -----------------------------------------------------------------------------
# [LAYER 3] PyTorch (use base image PyTorch or install specific version)
# -----------------------------------------------------------------------------
# Nothing is installed here: the build simply imports torch from the base
# image and prints its version and CUDA build, failing early if the import
# does not work.
RUN python -c \
    "import torch; print(f'PyTorch {torch.__version__}, CUDA {torch.version.cuda}')"
# -----------------------------------------------------------------------------
# [LAYER 4] AI-Toolkit Requirements - OCCASIONALLY CHANGES
# -----------------------------------------------------------------------------
# Only the requirements file is copied at this point, so this layer is rebuilt
# when requirements.txt changes rather than on every source edit.
COPY ai-toolkit/requirements.txt /app/ai-toolkit/requirements.txt
# pip's download cache is a BuildKit cache mount (kept on the build host,
# not stored in the image).
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install --upgrade pip \
    && pip install -r /app/ai-toolkit/requirements.txt
# -----------------------------------------------------------------------------
# [LAYER 5] RunPod + HuggingFace Dependencies - RARELY CHANGES
# -----------------------------------------------------------------------------
# Worker-side packages installed on top of the AI-Toolkit requirements.
# Versions are not pinned, so pip resolves them at build time.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install \
        runpod \
        hf_transfer \
        huggingface_hub
# -----------------------------------------------------------------------------
# [LAYER 6] AI-Toolkit Code - OCCASIONALLY CHANGES
# -----------------------------------------------------------------------------
# Copy the full ai-toolkit tree from the build context into /app/ai-toolkit
# (requirements.txt was already copied earlier and is overwritten in place).
COPY ai-toolkit/ /app/ai-toolkit/
# Smoke step only: prepends /app/ai-toolkit to sys.path inside this one
# interpreter and prints a message. It does not import any ai-toolkit module,
# so it will not catch a missing or broken checkout.
# NOTE(review): consider importing a real ai-toolkit module here - confirm
# which module is safe to import at build time.
RUN python -c "import sys; sys.path.insert(0, '/app/ai-toolkit'); print('AI-Toolkit ready')"
# -----------------------------------------------------------------------------
# [LAYER 7] Directory Setup - RARELY CHANGES
# -----------------------------------------------------------------------------
# Pre-create the dataset/output working directories and the Hugging Face hub
# cache path that the ENV block points at.
RUN mkdir -p /workspace/dataset /workspace/output /runpod-volume/huggingface-cache/hub
# -----------------------------------------------------------------------------
# [LAYER 8] Handler Code - FREQUENTLY CHANGES
# -----------------------------------------------------------------------------
# Kept as the final COPY so that editing the handler does not invalidate the
# cached ai-toolkit layers above.
COPY rp_handler.py /app/rp_handler.py
# Build-time import check: fails the build if rp_handler cannot be imported
# or does not expose `handler` and `MODEL_PRESETS`; prints the preset keys.
RUN python -c \
    "from rp_handler import handler, MODEL_PRESETS; print(f'Handler ready: {list(MODEL_PRESETS.keys())}')"
# -----------------------------------------------------------------------------
# Runtime Configuration
# -----------------------------------------------------------------------------
# EXPOSE is documentation only; it does not publish the port.
# NOTE(review): whether rp_handler.py actually listens on 8000 is not visible
# from this file - confirm.
EXPOSE 8000
# Exec (JSON-array) form, so python runs directly without a wrapping shell.
# The script path is relative to WORKDIR (/app). `-u` requests unbuffered
# output, which PYTHONUNBUFFERED=1 in the ENV block also requests.
CMD ["python", "-u", "rp_handler.py"]