Aloukik21
/

trainer

@@ -1,45 +1,102 @@
-FROM nvidia/cuda:12.4.1-devel-ubuntu22.04
-ENV DEBIAN_FRONTEND=noninteractive
-ENV TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.9 9.0"
-ENV PYTHONUNBUFFERED=1
-# HuggingFace settings
-ENV HF_HUB_ENABLE_HF_TRANSFER=1
-ENV HF_HOME=/runpod-volume/huggingface-cache
-ENV HUGGINGFACE_HUB_CACHE=/runpod-volume/huggingface-cache/hub
-ENV TRANSFORMERS_CACHE=/runpod-volume/huggingface-cache/hub
-# Disable telemetry
-ENV NO_ALBUMENTATIONS_UPDATE=1
-ENV DISABLE_TELEMETRY=YES
-# Install system deps
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    git git-lfs curl wget python3.10 python3.10-dev python3-pip \
-    ffmpeg libgl1-mesa-glx libglib2.0-0 aria2 \
-    && rm -rf /var/lib/apt/lists/*
-RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1
-RUN pip install --upgrade pip
-# Install PyTorch
-RUN pip install --no-cache-dir torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 \
-    --index-url https://download.pytorch.org/whl/cu124
-# Install RunPod and HF transfer
-RUN pip install --no-cache-dir runpod hf_transfer huggingface_hub
-# Copy project
-WORKDIR /app
-COPY ai-toolkit /app/ai-toolkit
 COPY rp_handler.py /app/rp_handler.py
-# Install ai-toolkit requirements
-RUN pip install --no-cache-dir -r /app/ai-toolkit/requirements.txt
-# Create workspace directories
-RUN mkdir -p /workspace/dataset /workspace/output
-WORKDIR /app
 CMD ["python", "-u", "rp_handler.py"]

+# =============================================================================
+# AI-Toolkit Trainer - RunPod Serverless Worker (CACHE OPTIMIZED)
+# =============================================================================
+# CACHE OPTIMIZATION: Layers ordered from LEAST to MOST frequently changed
+#
+# Layer Order (top = rarely changes, bottom = frequently changes):
+#   1. Base image + system deps     [RARELY CHANGE]
+#   2. PyTorch + CUDA               [RARELY CHANGE]
+#   3. AI-Toolkit requirements      [OCCASIONALLY CHANGE]
+#   4. RunPod + HF deps             [RARELY CHANGE]
+#   5. AI-Toolkit code              [OCCASIONALLY CHANGE]
+#   6. Directory setup              [RARELY CHANGE]
+#   7. rp_handler.py                [FREQUENTLY CHANGE]
+#
+# Build:
+#   docker buildx build --platform linux/amd64 -f Dockerfile.runpod \
+#     -t aloukikaditya/trainer:latest --push .
+#
+# Build with cache:
+#   DOCKER_BUILDKIT=1 docker build -f Dockerfile.runpod -t aio-trainer .
+# =============================================================================
+# Base image is overridable at build time (--build-arg BASE_IMAGE=...).
+# An ARG declared before FROM is only visible to FROM lines; it must be
+# redeclared inside the stage if a later instruction needs its value.
+# The default tag names PyTorch 2.4.0 / Python 3.11 / CUDA 12.4.1 on Ubuntu 22.04.
+ARG BASE_IMAGE=runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04
+FROM ${BASE_IMAGE}
+# -----------------------------------------------------------------------------
+# [LAYER 1] Environment Configuration - RARELY CHANGES
+# -----------------------------------------------------------------------------
+# Build-time only: stops apt/debconf from prompting during the RUN steps below
+# without persisting the setting into the environment of running containers.
+ARG DEBIAN_FRONTEND=noninteractive
+# Runtime environment:
+#   PYTHONUNBUFFERED                     - unbuffered Python stdout/stderr
+#   HF_HUB_ENABLE_HF_TRANSFER            - opt in to hf_transfer downloads
+#   HF_HOME / HUGGINGFACE_HUB_CACHE /
+#   TRANSFORMERS_CACHE                   - keep all HuggingFace caches under
+#                                          /runpod-volume/huggingface-cache
+#   NO_ALBUMENTATIONS_UPDATE /
+#   DISABLE_TELEMETRY                    - disable update checks / telemetry
+#   TORCH_CUDA_ARCH_LIST                 - CUDA architectures targeted when
+#                                          extensions are compiled
+ENV PYTHONUNBUFFERED=1 \
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    HF_HOME=/runpod-volume/huggingface-cache \
+    HUGGINGFACE_HUB_CACHE=/runpod-volume/huggingface-cache/hub \
+    TRANSFORMERS_CACHE=/runpod-volume/huggingface-cache/hub \
+    NO_ALBUMENTATIONS_UPDATE=1 \
+    DISABLE_TELEMETRY=YES \
+    TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.9 9.0"
+# All later relative paths (including the CMD script) resolve against /app.
+WORKDIR /app
+# -----------------------------------------------------------------------------
+# [LAYER 2] System Dependencies - RARELY CHANGES
+# -----------------------------------------------------------------------------
+# The two BuildKit cache mounts keep apt's package lists and downloaded .debs
+# on the build host, never in the image layer, so no in-layer cleanup is
+# needed. Running `apt-get clean` or `rm -rf /var/lib/apt/lists/*` here would
+# only empty the shared cache and force a full re-download on the next build.
+# Ubuntu-based images typically ship an apt "docker-clean" hook that deletes
+# downloaded packages after each install; remove it (rm -f is a no-op if it is
+# absent) and tell apt to keep downloads so the cache mount is actually used.
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    rm -f /etc/apt/apt.conf.d/docker-clean && \
+    echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' \
+        > /etc/apt/apt.conf.d/keep-cache && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends \
+        aria2 \
+        curl \
+        ffmpeg \
+        git \
+        git-lfs \
+        libgl1-mesa-glx \
+        libglib2.0-0 \
+        wget
+# -----------------------------------------------------------------------------
+# [LAYER 3] PyTorch (use base image PyTorch or install specific version)
+# -----------------------------------------------------------------------------
+# No PyTorch install step: the build relies on the copy shipped in the base image.
+# Importing torch here fails the build early if it is missing or broken, and
+# records the PyTorch and CUDA versions in the build log.
+RUN python -c "import torch; print(f'PyTorch {torch.__version__}, CUDA {torch.version.cuda}')"
+# -----------------------------------------------------------------------------
+# [LAYER 4] AI-Toolkit Requirements - OCCASIONALLY CHANGES
+# -----------------------------------------------------------------------------
+# Copy only requirements.txt first so this expensive install layer is reused
+# until that one file changes, regardless of edits to the rest of ai-toolkit.
+# The pip cache mount keeps downloaded wheels on the build host (not in the
+# image), so --no-cache-dir is intentionally not passed.
+# NOTE(review): if requirements.txt lists torch, this step may replace the
+# base image's PyTorch build - confirm against the requirements file.
+COPY ai-toolkit/requirements.txt /app/ai-toolkit/requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install --upgrade pip && \
+    pip install -r /app/ai-toolkit/requirements.txt
+# -----------------------------------------------------------------------------
+# [LAYER 5] RunPod + HuggingFace Dependencies - RARELY CHANGES
+# -----------------------------------------------------------------------------
+# Worker SDK plus HuggingFace download helpers; hf_transfer is the package
+# that HF_HUB_ENABLE_HF_TRANSFER=1 switches on.
+# NOTE(review): no version specifiers are given, so an uncached rebuild may
+# pull newer releases - consider pinning for reproducible images.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install runpod hf_transfer huggingface_hub
+# -----------------------------------------------------------------------------
+# [LAYER 6] AI-Toolkit Code - OCCASIONALLY CHANGES
+# -----------------------------------------------------------------------------
+# Copy the full ai-toolkit tree (requirements.txt was already copied above for
+# caching; it is overwritten here with the same file).
+COPY ai-toolkit/ /app/ai-toolkit/
+# Smoke step only: it prepends /app/ai-toolkit to sys.path and prints a
+# message. It does not import any ai-toolkit module, so it cannot detect a
+# missing or broken checkout.
+RUN python -c "import sys; sys.path.insert(0, '/app/ai-toolkit'); print('AI-Toolkit ready')"
+# -----------------------------------------------------------------------------
+# [LAYER 7] Directory Setup - RARELY CHANGES
+# -----------------------------------------------------------------------------
+# Pre-create the working directories and the HuggingFace hub cache path that
+# HUGGINGFACE_HUB_CACHE / TRANSFORMERS_CACHE point at, so they exist in the image.
+# NOTE(review): presumably a volume is mounted over /runpod-volume at runtime,
+# which would hide the directory created here - confirm.
+RUN mkdir -p \
+    /workspace/dataset \
+    /workspace/output \
+    /runpod-volume/huggingface-cache/hub
+# -----------------------------------------------------------------------------
+# [LAYER 8] Handler Code - FREQUENTLY CHANGES
+# -----------------------------------------------------------------------------
+# Copied last: rp_handler.py changes most often, so editing it only rebuilds
+# this layer and the import check below, not the dependency or ai-toolkit layers.
 COPY rp_handler.py /app/rp_handler.py
+# Build-time import check: executes rp_handler's module-level code and fails
+# the build if `handler` or `MODEL_PRESETS` cannot be imported; prints the
+# preset names to the build log.
+# NOTE(review): this requires rp_handler to import cleanly without a GPU and
+# without starting the worker at import time - confirm.
+RUN python -c "from rp_handler import handler, MODEL_PRESETS; print(f'Handler ready: {list(MODEL_PRESETS.keys())}')"
+# -----------------------------------------------------------------------------
+# Runtime Configuration
+# -----------------------------------------------------------------------------
+# EXPOSE is documentation only; it does not publish the port.
+# NOTE(review): nothing in this file shows a server listening on 8000 -
+# confirm that rp_handler.py actually serves on it.
+EXPOSE 8000
+# Exec form so Python runs as PID 1 and receives stop signals directly.
+# The relative script path resolves against WORKDIR /app; -u duplicates
+# PYTHONUNBUFFERED=1 and is harmless.
 CMD ["python", "-u", "rp_handler.py"]