File size: 4,843 Bytes
e334188
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f322631
e334188
 
f322631
e334188
 
 
 
 
 
 
 
 
 
 
 
f322631
e334188
f322631
e334188
 
 
 
 
 
 
 
f322631
e334188
 
 
 
 
f322631
e334188
 
 
 
 
f322631
e334188
 
 
f322631
e334188
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f322631
 
e334188
 
f322631
e334188
 
 
 
f322631
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# =============================================================================
# AI-Toolkit Trainer - RunPod Serverless Worker (CACHE OPTIMIZED)
# =============================================================================
# CACHE OPTIMIZATION: Layers ordered from LEAST to MOST frequently changed
#
# Layer order as built below (top = built first):
#   1. Environment configuration    [RARELY CHANGES]
#   2. System dependencies (apt)    [RARELY CHANGES]
#   3. PyTorch check (base image)   [RARELY CHANGES]
#   4. AI-Toolkit requirements      [OCCASIONALLY CHANGES]
#   5. RunPod + HF deps             [RARELY CHANGES]
#   6. AI-Toolkit code              [OCCASIONALLY CHANGES]
#   7. Directory setup              [RARELY CHANGES]
#   8. rp_handler.py                [FREQUENTLY CHANGES]
#
# Build:
#   docker buildx build --platform linux/amd64 -f Dockerfile.runpod \
#     -t aloukikaditya/trainer:latest --push .
#
# Build with cache:
#   DOCKER_BUILDKIT=1 docker build -f Dockerfile.runpod -t aio-trainer .
# =============================================================================

# Base image for the single build stage. Override it at build time with
#   --build-arg BASE_IMAGE=<image:tag>
# An ARG declared before FROM is only visible to FROM lines.
ARG BASE_IMAGE=runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04
FROM $BASE_IMAGE

# -----------------------------------------------------------------------------
# [LAYER 1] Environment Configuration - RARELY CHANGES
# -----------------------------------------------------------------------------
# DEBIAN_FRONTEND only matters while packages are being installed. Declaring it
# as a stage-level ARG exposes it to every RUN step of the build without
# persisting it into the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive

# Runtime environment of the worker:
#   - PYTHONUNBUFFERED: unbuffered stdout/stderr.
#   - HF_HOME / HUGGINGFACE_HUB_CACHE / TRANSFORMERS_CACHE: all three point at
#     the same tree under /runpod-volume/huggingface-cache
#     (presumably the RunPod volume mount point; confirm against the endpoint
#     configuration).
#   - HF_HUB_ENABLE_HF_TRANSFER: the hf_transfer package is installed by a later
#     pip step in this file.
#   - TORCH_CUDA_ARCH_LIST: CUDA compute capabilities 7.5 through 9.0.
ENV PYTHONUNBUFFERED=1 \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    HF_HOME=/runpod-volume/huggingface-cache \
    HUGGINGFACE_HUB_CACHE=/runpod-volume/huggingface-cache/hub \
    TRANSFORMERS_CACHE=/runpod-volume/huggingface-cache/hub \
    NO_ALBUMENTATIONS_UPDATE=1 \
    DISABLE_TELEMETRY=YES \
    TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.9 9.0"

# Relative paths in later instructions (including the final CMD) resolve
# against this directory.
WORKDIR /app

# -----------------------------------------------------------------------------
# [LAYER 2] System Dependencies - RARELY CHANGES
# -----------------------------------------------------------------------------
# Install OS packages.
#
# /var/cache/apt and /var/lib/apt are BuildKit cache mounts: downloaded .debs
# and package lists stay on the build host and never enter the image layer.
# Running `apt-get clean` or `rm -rf /var/lib/apt/lists/*` here would therefore
# not shrink the image; it would only empty the cache and force a full
# re-download on the next build, so no such cleanup is done.
#
# Debian/Ubuntu base images usually ship /etc/apt/apt.conf.d/docker-clean, which
# deletes downloaded packages after each install. It is removed, and apt is told
# to keep downloads, so the cache mount is actually populated.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    rm -f /etc/apt/apt.conf.d/docker-clean \
    && echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' \
        > /etc/apt/apt.conf.d/keep-cache \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        aria2 \
        curl \
        ffmpeg \
        git \
        git-lfs \
        libgl1-mesa-glx \
        libglib2.0-0 \
        wget

# -----------------------------------------------------------------------------
# [LAYER 3] PyTorch (use base image PyTorch or install specific version)
# -----------------------------------------------------------------------------
# No PyTorch install happens here: the build relies on the copy shipped in the
# base image. This step fails the build if `import torch` does not work and
# logs the PyTorch and CUDA versions it finds.
RUN ["python", "-c", "import torch; print(f'PyTorch {torch.__version__}, CUDA {torch.version.cuda}')"]

# -----------------------------------------------------------------------------
# [LAYER 4] AI-Toolkit Requirements - OCCASIONALLY CHANGES
# -----------------------------------------------------------------------------
# Bring in just the requirements manifest at this point, so the dependency
# layer below stays cached until requirements.txt itself changes.
COPY ai-toolkit/requirements.txt /app/ai-toolkit/requirements.txt

# pip's cache directory is a BuildKit cache mount, so it is reused across
# builds and is not stored in the image.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -U pip \
    && pip install --requirement /app/ai-toolkit/requirements.txt

# -----------------------------------------------------------------------------
# [LAYER 5] RunPod + HuggingFace Dependencies - RARELY CHANGES
# -----------------------------------------------------------------------------
# Worker-side packages installed on top of the ai-toolkit requirements.
# No versions are pinned here, so pip picks whatever satisfies the packages
# that are already installed.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install \
        hf_transfer \
        huggingface_hub \
        runpod

# -----------------------------------------------------------------------------
# [LAYER 6] AI-Toolkit Code - OCCASIONALLY CHANGES
# -----------------------------------------------------------------------------
# Full ai-toolkit tree. Dependencies were installed earlier, so a source-only
# change rebuilds from this layer onward.
COPY ai-toolkit/ /app/ai-toolkit/

# Build-log marker only: this prepends /app/ai-toolkit to sys.path inside a
# throwaway interpreter and prints a message. It imports no ai-toolkit module,
# so it cannot detect a broken or incomplete checkout.
RUN ["python", "-c", "import sys; sys.path.insert(0, '/app/ai-toolkit'); print('AI-Toolkit ready')"]

# -----------------------------------------------------------------------------
# [LAYER 7] Directory Setup - RARELY CHANGES
# -----------------------------------------------------------------------------
# Create the dataset and output working directories, plus the hub cache path
# that HUGGINGFACE_HUB_CACHE and TRANSFORMERS_CACHE point at, so all three
# exist in the image.
RUN mkdir --parents /workspace/dataset /workspace/output \
    /runpod-volume/huggingface-cache/hub

# -----------------------------------------------------------------------------
# [LAYER 8] Handler Code - FREQUENTLY CHANGES
# -----------------------------------------------------------------------------
# Copied last: editing the handler rebuilds only this layer and the check
# below, while the dependency and ai-toolkit layers stay cached.
COPY rp_handler.py /app/rp_handler.py

# Build-time smoke test, run from WORKDIR /app: the build fails if rp_handler
# cannot be imported or does not expose `handler` and `MODEL_PRESETS`.
RUN ["python", "-c", "from rp_handler import handler, MODEL_PRESETS; print(f'Handler ready: {list(MODEL_PRESETS.keys())}')"]

# -----------------------------------------------------------------------------
# Runtime Configuration
# -----------------------------------------------------------------------------
# EXPOSE is metadata only; it does not publish the port.
# NOTE(review): nothing visible in this file binds port 8000; confirm that
# rp_handler.py actually listens on it.
EXPOSE 8000

# Exec (JSON-array) form: python runs as PID 1 and receives stop signals
# directly. The relative script path resolves against WORKDIR /app.
# `-u` requests unbuffered output, which PYTHONUNBUFFERED=1 also enables.
CMD ["python", "-u", "rp_handler.py"]