Georg committed on
Commit
3968781
·
1 Parent(s): bbc3fdc

Optimized Docker build to fix OOM errors

Browse files
Files changed (4) hide show
  1. Dockerfile +22 -7
  2. Dockerfile.base +36 -40
  3. deploy.sh +62 -10
  4. requirements.txt +5 -22
Dockerfile CHANGED
@@ -1,21 +1,36 @@
1
- # Start from base image (build locally, push to DockerHub)
2
- # To build base: docker build -f Dockerfile.base -t gpue/foundationpose-base:latest .
3
- # To push base: docker push gpue/foundationpose-base:latest
4
  FROM gpue/foundationpose-base:latest
5
 
6
- # FoundationPose configuration - always use real model
7
  ENV FOUNDATIONPOSE_MODEL_REPO=gpue/foundationpose-weights
8
  ENV USE_REAL_MODEL=true
9
 
10
- # Build FoundationPose C++ extensions (requires GPU present)
 
 
 
 
 
 
 
 
11
  WORKDIR /app/FoundationPose
12
  RUN cd bundlesdf/mycuda && pip install . --no-build-isolation
13
  RUN cd mycpp && python setup.py build_ext --inplace
14
 
15
- # Download model weights from HuggingFace
 
 
 
 
 
 
16
  WORKDIR /app
17
  RUN python3 -c "from huggingface_hub import snapshot_download; \
18
  snapshot_download(repo_id='gpue/foundationpose-weights', local_dir='weights', repo_type='model')"
19
 
20
- # Run the application
 
 
21
  CMD ["python3", "app.py"]
 
1
+ # Final stage Dockerfile - optimized for HuggingFace
2
+ # Uses runtime base image (not devel) with minimal dependencies
 
3
  FROM gpue/foundationpose-base:latest
4
 
5
+ # FoundationPose configuration
6
  ENV FOUNDATIONPOSE_MODEL_REPO=gpue/foundationpose-weights
7
  ENV USE_REAL_MODEL=true
8
 
9
+ # Install build tools temporarily (will be removed after compilation)
10
+ RUN apt-get update && apt-get install -y --no-install-recommends \
11
+ cmake \
12
+ build-essential \
13
+ ninja-build \
14
+ libeigen3-dev \
15
+ && rm -rf /var/lib/apt/lists/*
16
+
17
+ # Build FoundationPose C++ extensions (requires GPU)
18
  WORKDIR /app/FoundationPose
19
  RUN cd bundlesdf/mycuda && pip install . --no-build-isolation
20
  RUN cd mycpp && python setup.py build_ext --inplace
21
 
22
+ # Remove build tools to save space
23
+ RUN apt-get purge -y cmake build-essential ninja-build libeigen3-dev && \
24
+ apt-get autoremove -y && \
25
+ apt-get clean && \
26
+ rm -rf /var/lib/apt/lists/*
27
+
28
+ # Download model weights
29
  WORKDIR /app
30
  RUN python3 -c "from huggingface_hub import snapshot_download; \
31
  snapshot_download(repo_id='gpue/foundationpose-weights', local_dir='weights', repo_type='model')"
32
 
33
+ # Clean pip cache
34
+ RUN pip cache purge
35
+
36
  CMD ["python3", "app.py"]
Dockerfile.base CHANGED
@@ -1,65 +1,61 @@
1
- FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
 
2
 
3
- # Set environment variables
4
  ENV DEBIAN_FRONTEND=noninteractive
5
  ENV CUDA_HOME=/usr/local/cuda
6
  ENV PATH=${CUDA_HOME}/bin:${PATH}
7
  ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
8
 
9
- # CUDA architecture list for building extensions without GPU present
10
- # Covers most modern GPUs: Turing (75), Ampere (80,86), Ada (89), Hopper (90)
11
- ENV TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.9;9.0"
12
 
13
- # Install system dependencies
14
- RUN apt-get update && apt-get install -y \
15
- git \
16
- wget \
17
- cmake \
18
- build-essential \
 
19
  python3.10 \
20
- python3.10-dev \
21
  python3-pip \
22
- libgl1-mesa-glx \
 
23
  libglib2.0-0 \
24
- libsm6 \
25
- libxext6 \
26
- libxrender-dev \
27
  libgomp1 \
28
- libeigen3-dev \
29
- ninja-build \
30
- && rm -rf /var/lib/apt/lists/*
31
 
32
- # Set python3.10 as default
33
- RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
34
- RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1
35
 
36
  # Upgrade pip
37
- RUN python3 -m pip install --upgrade pip
38
 
39
- # Set working directory
40
  WORKDIR /app
41
 
42
- # Copy and install Python dependencies
43
- COPY requirements.txt .
44
- RUN pip install --no-cache-dir --upgrade setuptools wheel
45
- RUN pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cu118
46
- RUN pip install --no-cache-dir -r requirements.txt
47
-
48
- # Clone FoundationPose repository (but don't build extensions yet)
49
- RUN git clone https://github.com/NVlabs/FoundationPose.git /app/FoundationPose
50
 
51
- # Patch mycuda setup.py to use C++17 (preparation for GPU build)
52
- WORKDIR /app/FoundationPose
53
- RUN cd bundlesdf/mycuda && sed -i 's/-std=c++14/-std=c++17/g' setup.py
54
-
55
- # Reset workdir
56
- WORKDIR /app
 
 
 
 
 
 
 
 
57
 
58
  # Copy application files
59
  COPY app.py client.py estimator.py ./
60
 
61
- # Create weights directory (weights will be downloaded in final image)
62
  RUN mkdir -p weights
63
 
64
- # Expose Gradio port
65
  EXPOSE 7860
 
1
+ # Minimal base image - optimized for size and build speed
2
+ FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
3
 
 
4
  ENV DEBIAN_FRONTEND=noninteractive
5
  ENV CUDA_HOME=/usr/local/cuda
6
  ENV PATH=${CUDA_HOME}/bin:${PATH}
7
  ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
8
 
9
+ # Only build for T4 (7.5) and A100 (8.0) - HuggingFace hardware
10
+ ENV TORCH_CUDA_ARCH_LIST="7.5;8.0"
 
11
 
12
+ # Install minimal runtime dependencies
13
+ # Remove problematic CUDA repo and install packages
14
+ RUN rm -f /etc/apt/sources.list.d/cuda*.list /etc/apt/sources.list.d/*.list && \
15
+ apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
16
+ ca-certificates \
17
+ && apt-get clean && rm -rf /var/lib/apt/lists/* && \
18
+ apt-get update && apt-get install -y --no-install-recommends \
19
  python3.10 \
 
20
  python3-pip \
21
+ git \
22
+ libgl1 \
23
  libglib2.0-0 \
 
 
 
24
  libgomp1 \
25
+ && rm -rf /var/lib/apt/lists/* \
26
+ && apt-get clean
 
27
 
28
+ # Set python as default
29
+ RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \
30
+ update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1
31
 
32
  # Upgrade pip
33
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
34
 
 
35
  WORKDIR /app
36
 
37
+ # Install PyTorch (smallest CUDA 11.8 build)
38
+ RUN pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118
 
 
 
 
 
 
39
 
40
+ # Copy requirements.txt and install only the essential packages (specifiers are quoted so the shell does not parse ">" as an output redirect)
41
+ COPY requirements.txt .
42
+ RUN pip install --no-cache-dir \
43
+ "gradio>=4.0.0" \
44
+ "numpy>=1.24.0" \
45
+ "opencv-python-headless>=4.8.0" \
46
+ "Pillow>=10.0.0" \
47
+ "huggingface-hub>=0.20.0" \
48
+ && pip cache purge
49
+
50
+ # Clone FoundationPose (but don't build yet - that's done in final stage)
51
+ RUN git clone --depth 1 https://github.com/NVlabs/FoundationPose.git /app/FoundationPose && \
52
+ cd /app/FoundationPose/bundlesdf/mycuda && \
53
+ sed -i 's/-std=c++14/-std=c++17/g' setup.py
54
 
55
  # Copy application files
56
  COPY app.py client.py estimator.py ./
57
 
58
+ # Create weights directory
59
  RUN mkdir -p weights
60
 
 
61
  EXPOSE 7860
deploy.sh CHANGED
@@ -1,5 +1,5 @@
1
  #!/bin/bash
2
- # Two-stage deployment script for FoundationPose
3
 
4
  set -e
5
 
@@ -10,12 +10,12 @@ HF_SPACE="gpue/foundationpose"
10
  HF_TOKEN_FILE="../training/.env.local"
11
 
12
  echo "==================================="
13
- echo "FoundationPose Two-Stage Deployment"
14
  echo "==================================="
15
  echo ""
16
 
17
- # Stage 1: Build and push base image (local, no GPU needed)
18
- echo "Stage 1: Building base image locally (no GPU required)"
19
  echo "Platform: ${PLATFORM}"
20
  echo "Image: ${IMAGE_NAME}:${TAG}"
21
  echo ""
@@ -29,11 +29,18 @@ fi
29
  echo "✓ DockerHub authentication verified"
30
  echo ""
31
 
 
32
  docker build --platform ${PLATFORM} -f Dockerfile.base -t ${IMAGE_NAME}:${TAG} .
33
 
34
  echo ""
35
  echo "✓ Base image built successfully"
36
  echo ""
 
 
 
 
 
 
37
  echo "Pushing to DockerHub..."
38
  docker push ${IMAGE_NAME}:${TAG}
39
 
@@ -57,8 +64,8 @@ fi
57
  # Check if there are changes to commit
58
  if [[ -n $(git status -s) ]]; then
59
  echo "Committing changes..."
60
- git add Dockerfile Dockerfile.base BUILD.md build_base.sh deploy.sh
61
- git commit -m "Two-stage Docker build: base image + GPU compilation"
62
  echo "✓ Changes committed"
63
  else
64
  echo "No changes to commit"
@@ -73,10 +80,12 @@ echo ""
73
  echo "✓ Pushed to HuggingFace"
74
  echo ""
75
  echo "HuggingFace will now:"
76
- echo " 1. Pull the base image from DockerHub (${IMAGE_NAME}:${TAG})"
77
- echo " 2. Build C++ extensions with GPU present"
78
- echo " 3. Download model weights"
79
- echo " 4. Start the Gradio app"
 
 
80
  echo ""
81
 
82
  # Follow build logs
@@ -95,6 +104,49 @@ if [ -f "${HF_TOKEN_FILE}" ]; then
95
  # Parse JSON and extract data field
96
  echo "$line" | grep -o '"data":"[^"]*"' | sed 's/"data":"//;s/"$//' | sed 's/\\n/\n/g'
97
  done
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  else
99
  echo "Warning: HF_TOKEN not found in ${HF_TOKEN_FILE}"
100
  echo "To follow logs manually:"
 
1
  #!/bin/bash
2
+ # FoundationPose deployment script (optimized for HuggingFace)
3
 
4
  set -e
5
 
 
10
  HF_TOKEN_FILE="../training/.env.local"
11
 
12
  echo "==================================="
13
+ echo "FoundationPose Deployment"
14
  echo "==================================="
15
  echo ""
16
 
17
+ # Stage 1: Build and push base image
18
+ echo "Stage 1: Building base image"
19
  echo "Platform: ${PLATFORM}"
20
  echo "Image: ${IMAGE_NAME}:${TAG}"
21
  echo ""
 
29
  echo "✓ DockerHub authentication verified"
30
  echo ""
31
 
32
+ echo "Building base image..."
33
  docker build --platform ${PLATFORM} -f Dockerfile.base -t ${IMAGE_NAME}:${TAG} .
34
 
35
  echo ""
36
  echo "✓ Base image built successfully"
37
  echo ""
38
+
39
+ # Show image size
40
+ IMAGE_SIZE=$(docker images ${IMAGE_NAME}:${TAG} --format "{{.Size}}")
41
+ echo "Image size: ${IMAGE_SIZE}"
42
+ echo ""
43
+
44
  echo "Pushing to DockerHub..."
45
  docker push ${IMAGE_NAME}:${TAG}
46
 
 
64
  # Check if there are changes to commit
65
  if [[ -n $(git status -s) ]]; then
66
  echo "Committing changes..."
67
+ git add Dockerfile Dockerfile.base requirements.txt deploy.sh app.py client.py estimator.py
68
+ git commit -m "Optimized Docker build to fix OOM errors"
69
  echo "✓ Changes committed"
70
  else
71
  echo "No changes to commit"
 
80
  echo "✓ Pushed to HuggingFace"
81
  echo ""
82
  echo "HuggingFace will now:"
83
+ echo " 1. Pull base image from DockerHub (${IMAGE_NAME}:${TAG})"
84
+ echo " 2. Install build tools temporarily"
85
+ echo " 3. Build C++ extensions with GPU"
86
+ echo " 4. Remove build tools"
87
+ echo " 5. Download model weights (246MB)"
88
+ echo " 6. Start the Gradio app"
89
  echo ""
90
 
91
  # Follow build logs
 
104
  # Parse JSON and extract data field
105
  echo "$line" | grep -o '"data":"[^"]*"' | sed 's/"data":"//;s/"$//' | sed 's/\\n/\n/g'
106
  done
107
+
108
+ echo ""
109
+ echo "===================================="
110
+ echo "Build Status Check"
111
+ echo "===================================="
112
+ echo ""
113
+
114
+ # Wait a moment for status to update
115
+ sleep 2
116
+
117
+ # Check final build status
118
+ STATUS_JSON=$(curl -s -H "Authorization: Bearer ${HF_TOKEN}" \
119
+ "https://huggingface.co/api/spaces/${HF_SPACE}")
120
+
121
+ STAGE=$(echo "$STATUS_JSON" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('runtime', {}).get('stage', 'UNKNOWN'))" 2>/dev/null)
122
+ ERROR_MSG=$(echo "$STATUS_JSON" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('runtime', {}).get('errorMessage', ''))" 2>/dev/null)
123
+
124
+ echo "Final Status: ${STAGE}"
125
+
126
+ if [ "${STAGE}" = "RUNNING" ]; then
127
+ echo "✓ Deployment successful!"
128
+ echo ""
129
+ echo "Space URL: https://${HF_SPACE/\//-}.hf.space"
130
+ echo "API URL: https://${HF_SPACE/\//-}.hf.space/gradio_api/info"
131
+ echo ""
132
+ echo "Test with: cd ../training && make test-perception-api"
133
+ elif [ "${STAGE}" = "BUILD_ERROR" ]; then
134
+ echo "✗ Build failed!"
135
+ if [ -n "${ERROR_MSG}" ]; then
136
+ echo "Error: ${ERROR_MSG}"
137
+ fi
138
+ echo ""
139
+ echo "If still getting OOM errors, consider:"
140
+ echo " - Moving weights to runtime download (not build time)"
141
+ echo " - Requesting larger build instance from HuggingFace"
142
+ echo " - Using only CUDA arch 7.5 (T4 only)"
143
+ exit 1
144
+ else
145
+ echo "Status: ${STAGE}"
146
+ if [ -n "${ERROR_MSG}" ]; then
147
+ echo "Message: ${ERROR_MSG}"
148
+ fi
149
+ fi
150
  else
151
  echo "Warning: HF_TOKEN not found in ${HF_TOKEN_FILE}"
152
  echo "To follow logs manually:"
requirements.txt CHANGED
@@ -1,26 +1,9 @@
1
- # Core dependencies
2
  gradio>=4.0.0
3
  numpy>=1.24.0
4
- opencv-python>=4.8.0
5
  Pillow>=10.0.0
 
6
 
7
- # FastAPI for REST API endpoints
8
- fastapi>=0.109.0
9
- uvicorn>=0.27.0
10
- pydantic>=2.0.0
11
-
12
- # Hugging Face
13
- huggingface_hub>=0.20.0
14
-
15
- # Deep learning
16
- torch>=2.0.0
17
- torchvision>=0.15.0
18
-
19
- # 3D vision dependencies
20
- trimesh>=4.0.0
21
- pyrender>=0.1.45
22
- scikit-image>=0.21.0
23
-
24
- # FoundationPose specific (will need to install from source)
25
- # The actual FoundationPose repo needs to be cloned and installed
26
- # git+https://github.com/NVlabs/FoundationPose.git
 
1
+ # Minimal requirements - only what's needed for runtime
2
  gradio>=4.0.0
3
  numpy>=1.24.0
4
+ opencv-python-headless>=4.8.0 # Headless version saves ~400MB
5
  Pillow>=10.0.0
6
+ huggingface-hub>=0.20.0
7
 
8
+ # Note: torch and torchvision are installed separately with CUDA support
9
+ # Note: FoundationPose C++ extensions are compiled during the final Docker image build, not installed from this file