Spaces:
Sleeping
Sleeping
Georg committed on
Commit ·
3968781
1
Parent(s): bbc3fdc
Optimized Docker build to fix OOM errors
Browse files- Dockerfile +22 -7
- Dockerfile.base +36 -40
- deploy.sh +62 -10
- requirements.txt +5 -22
Dockerfile
CHANGED
|
@@ -1,21 +1,36 @@
|
|
| 1 |
-
#
|
| 2 |
-
#
|
| 3 |
-
# To push base: docker push gpue/foundationpose-base:latest
|
| 4 |
FROM gpue/foundationpose-base:latest
|
| 5 |
|
| 6 |
-
# FoundationPose configuration
|
| 7 |
ENV FOUNDATIONPOSE_MODEL_REPO=gpue/foundationpose-weights
|
| 8 |
ENV USE_REAL_MODEL=true
|
| 9 |
|
| 10 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
WORKDIR /app/FoundationPose
|
| 12 |
RUN cd bundlesdf/mycuda && pip install . --no-build-isolation
|
| 13 |
RUN cd mycpp && python setup.py build_ext --inplace
|
| 14 |
|
| 15 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
WORKDIR /app
|
| 17 |
RUN python3 -c "from huggingface_hub import snapshot_download; \
|
| 18 |
snapshot_download(repo_id='gpue/foundationpose-weights', local_dir='weights', repo_type='model')"
|
| 19 |
|
| 20 |
-
#
|
|
|
|
|
|
|
| 21 |
CMD ["python3", "app.py"]
|
|
|
|
| 1 |
+
# Final stage Dockerfile - optimized for HuggingFace
|
| 2 |
+
# Uses runtime base image (not devel) with minimal dependencies
|
|
|
|
| 3 |
FROM gpue/foundationpose-base:latest
|
| 4 |
|
| 5 |
+
# FoundationPose configuration
|
| 6 |
ENV FOUNDATIONPOSE_MODEL_REPO=gpue/foundationpose-weights
|
| 7 |
ENV USE_REAL_MODEL=true
|
| 8 |
|
| 9 |
+
# Install build tools temporarily (will be removed after compilation)
|
| 10 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 11 |
+
cmake \
|
| 12 |
+
build-essential \
|
| 13 |
+
ninja-build \
|
| 14 |
+
libeigen3-dev \
|
| 15 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 16 |
+
|
| 17 |
+
# Build FoundationPose C++ extensions (requires GPU)
|
| 18 |
WORKDIR /app/FoundationPose
|
| 19 |
RUN cd bundlesdf/mycuda && pip install . --no-build-isolation
|
| 20 |
RUN cd mycpp && python setup.py build_ext --inplace
|
| 21 |
|
| 22 |
+
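# Remove build tools after compilation (note: this runs in a separate layer, so the layer that installed them still counts toward image size)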
|
| 23 |
+
RUN apt-get purge -y cmake build-essential ninja-build libeigen3-dev && \
|
| 24 |
+
apt-get autoremove -y && \
|
| 25 |
+
apt-get clean && \
|
| 26 |
+
rm -rf /var/lib/apt/lists/*
|
| 27 |
+
|
| 28 |
+
# Download model weights
|
| 29 |
WORKDIR /app
|
| 30 |
RUN python3 -c "from huggingface_hub import snapshot_download; \
|
| 31 |
snapshot_download(repo_id='gpue/foundationpose-weights', local_dir='weights', repo_type='model')"
|
| 32 |
|
| 33 |
+
# Clean pip cache
|
| 34 |
+
RUN pip cache purge
|
| 35 |
+
|
| 36 |
CMD ["python3", "app.py"]
|
Dockerfile.base
CHANGED
|
@@ -1,65 +1,61 @@
|
|
| 1 |
-
|
|
|
|
| 2 |
|
| 3 |
-
# Set environment variables
|
| 4 |
ENV DEBIAN_FRONTEND=noninteractive
|
| 5 |
ENV CUDA_HOME=/usr/local/cuda
|
| 6 |
ENV PATH=${CUDA_HOME}/bin:${PATH}
|
| 7 |
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
|
| 8 |
|
| 9 |
-
#
|
| 10 |
-
|
| 11 |
-
ENV TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.9;9.0"
|
| 12 |
|
| 13 |
-
# Install
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
| 19 |
python3.10 \
|
| 20 |
-
python3.10-dev \
|
| 21 |
python3-pip \
|
| 22 |
-
|
|
|
|
| 23 |
libglib2.0-0 \
|
| 24 |
-
libsm6 \
|
| 25 |
-
libxext6 \
|
| 26 |
-
libxrender-dev \
|
| 27 |
libgomp1 \
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 31 |
|
| 32 |
-
# Set
|
| 33 |
-
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
|
| 34 |
-
|
| 35 |
|
| 36 |
# Upgrade pip
|
| 37 |
-
RUN python3 -m pip install --upgrade pip
|
| 38 |
|
| 39 |
-
# Set working directory
|
| 40 |
WORKDIR /app
|
| 41 |
|
| 42 |
-
#
|
| 43 |
-
|
| 44 |
-
RUN pip install --no-cache-dir --upgrade setuptools wheel
|
| 45 |
-
RUN pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cu118
|
| 46 |
-
RUN pip install --no-cache-dir -r requirements.txt
|
| 47 |
-
|
| 48 |
-
# Clone FoundationPose repository (but don't build extensions yet)
|
| 49 |
-
RUN git clone https://github.com/NVlabs/FoundationPose.git /app/FoundationPose
|
| 50 |
|
| 51 |
-
#
|
| 52 |
-
|
| 53 |
-
RUN
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
# Copy application files
|
| 59 |
COPY app.py client.py estimator.py ./
|
| 60 |
|
| 61 |
-
# Create weights directory
|
| 62 |
RUN mkdir -p weights
|
| 63 |
|
| 64 |
-
# Expose Gradio port
|
| 65 |
EXPOSE 7860
|
|
|
|
| 1 |
+
# Minimal base image - optimized for size and build speed
|
| 2 |
+
FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
|
| 3 |
|
|
|
|
| 4 |
ENV DEBIAN_FRONTEND=noninteractive
|
| 5 |
ENV CUDA_HOME=/usr/local/cuda
|
| 6 |
ENV PATH=${CUDA_HOME}/bin:${PATH}
|
| 7 |
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
|
| 8 |
|
| 9 |
+
# Only build for T4 (7.5) and A100 (8.0) - HuggingFace hardware
|
| 10 |
+
ENV TORCH_CUDA_ARCH_LIST="7.5;8.0"
|
|
|
|
| 11 |
|
| 12 |
+
# Install minimal runtime dependencies
|
| 13 |
+
# Remove problematic CUDA repo and install packages
|
| 14 |
+
RUN rm -f /etc/apt/sources.list.d/cuda*.list /etc/apt/sources.list.d/*.list && \
|
| 15 |
+
apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
|
| 16 |
+
ca-certificates \
|
| 17 |
+
&& apt-get clean && rm -rf /var/lib/apt/lists/* && \
|
| 18 |
+
apt-get update && apt-get install -y --no-install-recommends \
|
| 19 |
python3.10 \
|
|
|
|
| 20 |
python3-pip \
|
| 21 |
+
git \
|
| 22 |
+
libgl1 \
|
| 23 |
libglib2.0-0 \
|
|
|
|
|
|
|
|
|
|
| 24 |
libgomp1 \
|
| 25 |
+
&& rm -rf /var/lib/apt/lists/* \
|
| 26 |
+
&& apt-get clean
|
|
|
|
| 27 |
|
| 28 |
+
# Set python as default
|
| 29 |
+
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \
|
| 30 |
+
update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1
|
| 31 |
|
| 32 |
# Upgrade pip
|
| 33 |
+
RUN python3 -m pip install --no-cache-dir --upgrade pip
|
| 34 |
|
|
|
|
| 35 |
WORKDIR /app
|
| 36 |
|
| 37 |
+
# Install PyTorch (smallest CUDA 11.8 build)
|
| 38 |
+
RUN pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
+
# Copy requirements.txt into the image; the essential packages are installed explicitly below (requirements.txt itself is not passed to pip)
|
| 41 |
+
COPY requirements.txt .
|
| 42 |
+
# Install the runtime Python packages. Each requirement is quoted because RUN uses /bin/sh -c: an unquoted `>=` is parsed as an output redirection, which drops the version constraint and creates stray files such as `=4.0.0`.
RUN pip install --no-cache-dir \
|
| 43 |
+
"gradio>=4.0.0" \
|
| 44 |
+
"numpy>=1.24.0" \
|
| 45 |
+
"opencv-python-headless>=4.8.0" \
|
| 46 |
+
"Pillow>=10.0.0" \
|
| 47 |
+
"huggingface-hub>=0.20.0" \
|
| 48 |
+
&& pip cache purge
|
| 49 |
+
|
| 50 |
+
# Clone FoundationPose (but don't build yet - that's done in final stage)
|
| 51 |
+
RUN git clone --depth 1 https://github.com/NVlabs/FoundationPose.git /app/FoundationPose && \
|
| 52 |
+
cd /app/FoundationPose/bundlesdf/mycuda && \
|
| 53 |
+
sed -i 's/-std=c++14/-std=c++17/g' setup.py
|
| 54 |
|
| 55 |
# Copy application files
|
| 56 |
COPY app.py client.py estimator.py ./
|
| 57 |
|
| 58 |
+
# Create weights directory
|
| 59 |
RUN mkdir -p weights
|
| 60 |
|
|
|
|
| 61 |
EXPOSE 7860
|
deploy.sh
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
#!/bin/bash
|
| 2 |
-
#
|
| 3 |
|
| 4 |
set -e
|
| 5 |
|
|
@@ -10,12 +10,12 @@ HF_SPACE="gpue/foundationpose"
|
|
| 10 |
HF_TOKEN_FILE="../training/.env.local"
|
| 11 |
|
| 12 |
echo "==================================="
|
| 13 |
-
echo "FoundationPose
|
| 14 |
echo "==================================="
|
| 15 |
echo ""
|
| 16 |
|
| 17 |
-
# Stage 1: Build and push base image
|
| 18 |
-
echo "Stage 1: Building base image
|
| 19 |
echo "Platform: ${PLATFORM}"
|
| 20 |
echo "Image: ${IMAGE_NAME}:${TAG}"
|
| 21 |
echo ""
|
|
@@ -29,11 +29,18 @@ fi
|
|
| 29 |
echo "✓ DockerHub authentication verified"
|
| 30 |
echo ""
|
| 31 |
|
|
|
|
| 32 |
docker build --platform ${PLATFORM} -f Dockerfile.base -t ${IMAGE_NAME}:${TAG} .
|
| 33 |
|
| 34 |
echo ""
|
| 35 |
echo "✓ Base image built successfully"
|
| 36 |
echo ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
echo "Pushing to DockerHub..."
|
| 38 |
docker push ${IMAGE_NAME}:${TAG}
|
| 39 |
|
|
@@ -57,8 +64,8 @@ fi
|
|
| 57 |
# Check if there are changes to commit
|
| 58 |
if [[ -n $(git status -s) ]]; then
|
| 59 |
echo "Committing changes..."
|
| 60 |
-
git add Dockerfile Dockerfile.base
|
| 61 |
-
git commit -m "
|
| 62 |
echo "✓ Changes committed"
|
| 63 |
else
|
| 64 |
echo "No changes to commit"
|
|
@@ -73,10 +80,12 @@ echo ""
|
|
| 73 |
echo "✓ Pushed to HuggingFace"
|
| 74 |
echo ""
|
| 75 |
echo "HuggingFace will now:"
|
| 76 |
-
echo " 1. Pull
|
| 77 |
-
echo " 2.
|
| 78 |
-
echo " 3.
|
| 79 |
-
echo " 4.
|
|
|
|
|
|
|
| 80 |
echo ""
|
| 81 |
|
| 82 |
# Follow build logs
|
|
@@ -95,6 +104,49 @@ if [ -f "${HF_TOKEN_FILE}" ]; then
|
|
| 95 |
# Parse JSON and extract data field
|
| 96 |
echo "$line" | grep -o '"data":"[^"]*"' | sed 's/"data":"//;s/"$//' | sed 's/\\n/\n/g'
|
| 97 |
done
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
else
|
| 99 |
echo "Warning: HF_TOKEN not found in ${HF_TOKEN_FILE}"
|
| 100 |
echo "To follow logs manually:"
|
|
|
|
| 1 |
#!/bin/bash
|
| 2 |
+
# FoundationPose deployment script (optimized for HuggingFace)
|
| 3 |
|
| 4 |
set -e
|
| 5 |
|
|
|
|
| 10 |
HF_TOKEN_FILE="../training/.env.local"
|
| 11 |
|
| 12 |
echo "==================================="
|
| 13 |
+
echo "FoundationPose Deployment"
|
| 14 |
echo "==================================="
|
| 15 |
echo ""
|
| 16 |
|
| 17 |
+
# Stage 1: Build and push base image
|
| 18 |
+
echo "Stage 1: Building base image"
|
| 19 |
echo "Platform: ${PLATFORM}"
|
| 20 |
echo "Image: ${IMAGE_NAME}:${TAG}"
|
| 21 |
echo ""
|
|
|
|
| 29 |
echo "✓ DockerHub authentication verified"
|
| 30 |
echo ""
|
| 31 |
|
| 32 |
+
echo "Building base image..."
|
| 33 |
docker build --platform ${PLATFORM} -f Dockerfile.base -t ${IMAGE_NAME}:${TAG} .
|
| 34 |
|
| 35 |
echo ""
|
| 36 |
echo "✓ Base image built successfully"
|
| 37 |
echo ""
|
| 38 |
+
|
| 39 |
+
# Show image size
|
| 40 |
+
IMAGE_SIZE=$(docker images ${IMAGE_NAME}:${TAG} --format "{{.Size}}")
|
| 41 |
+
echo "Image size: ${IMAGE_SIZE}"
|
| 42 |
+
echo ""
|
| 43 |
+
|
| 44 |
echo "Pushing to DockerHub..."
|
| 45 |
docker push ${IMAGE_NAME}:${TAG}
|
| 46 |
|
|
|
|
| 64 |
# Check if there are changes to commit
|
| 65 |
if [[ -n $(git status -s) ]]; then
|
| 66 |
echo "Committing changes..."
|
| 67 |
+
git add Dockerfile Dockerfile.base requirements.txt deploy.sh app.py client.py estimator.py
|
| 68 |
+
git commit -m "Optimized Docker build to fix OOM errors"
|
| 69 |
echo "✓ Changes committed"
|
| 70 |
else
|
| 71 |
echo "No changes to commit"
|
|
|
|
| 80 |
echo "✓ Pushed to HuggingFace"
|
| 81 |
echo ""
|
| 82 |
echo "HuggingFace will now:"
|
| 83 |
+
echo " 1. Pull base image from DockerHub (${IMAGE_NAME}:${TAG})"
|
| 84 |
+
echo " 2. Install build tools temporarily"
|
| 85 |
+
echo " 3. Build C++ extensions with GPU"
|
| 86 |
+
echo " 4. Remove build tools"
|
| 87 |
+
echo " 5. Download model weights (246MB)"
|
| 88 |
+
echo " 6. Start the Gradio app"
|
| 89 |
echo ""
|
| 90 |
|
| 91 |
# Follow build logs
|
|
|
|
| 104 |
# Parse JSON and extract data field
|
| 105 |
echo "$line" | grep -o '"data":"[^"]*"' | sed 's/"data":"//;s/"$//' | sed 's/\\n/\n/g'
|
| 106 |
done
|
| 107 |
+
|
| 108 |
+
echo ""
|
| 109 |
+
echo "===================================="
|
| 110 |
+
echo "Build Status Check"
|
| 111 |
+
echo "===================================="
|
| 112 |
+
echo ""
|
| 113 |
+
|
| 114 |
+
# Wait a moment for status to update
|
| 115 |
+
sleep 2
|
| 116 |
+
|
| 117 |
+
# Check final build status
|
| 118 |
+
STATUS_JSON=$(curl -s -H "Authorization: Bearer ${HF_TOKEN}" \
|
| 119 |
+
"https://huggingface.co/api/spaces/${HF_SPACE}")
|
| 120 |
+
|
| 121 |
+
# Extract runtime.stage and runtime.errorMessage from the Space API response. The `||` fallbacks keep `set -e` from aborting the script silently when the response is not valid JSON or has no usable `runtime` object.
STAGE=$(echo "$STATUS_JSON" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('runtime', {}).get('stage', 'UNKNOWN'))" 2>/dev/null) || STAGE="UNKNOWN"
|
| 122 |
+
ERROR_MSG=$(echo "$STATUS_JSON" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('runtime', {}).get('errorMessage', ''))" 2>/dev/null) || ERROR_MSG=""
|
| 123 |
+
|
| 124 |
+
echo "Final Status: ${STAGE}"
|
| 125 |
+
|
| 126 |
+
if [ "${STAGE}" = "RUNNING" ]; then
|
| 127 |
+
echo "✓ Deployment successful!"
|
| 128 |
+
echo ""
|
| 129 |
+
echo "Space URL: https://${HF_SPACE/\//-}.hf.space"
|
| 130 |
+
echo "API URL: https://${HF_SPACE/\//-}.hf.space/gradio_api/info"
|
| 131 |
+
echo ""
|
| 132 |
+
echo "Test with: cd ../training && make test-perception-api"
|
| 133 |
+
elif [ "${STAGE}" = "BUILD_ERROR" ]; then
|
| 134 |
+
echo "✗ Build failed!"
|
| 135 |
+
if [ -n "${ERROR_MSG}" ]; then
|
| 136 |
+
echo "Error: ${ERROR_MSG}"
|
| 137 |
+
fi
|
| 138 |
+
echo ""
|
| 139 |
+
echo "If still getting OOM errors, consider:"
|
| 140 |
+
echo " - Moving weights to runtime download (not build time)"
|
| 141 |
+
echo " - Requesting larger build instance from HuggingFace"
|
| 142 |
+
echo " - Using only CUDA arch 7.5 (T4 only)"
|
| 143 |
+
exit 1
|
| 144 |
+
else
|
| 145 |
+
echo "Status: ${STAGE}"
|
| 146 |
+
if [ -n "${ERROR_MSG}" ]; then
|
| 147 |
+
echo "Message: ${ERROR_MSG}"
|
| 148 |
+
fi
|
| 149 |
+
fi
|
| 150 |
else
|
| 151 |
echo "Warning: HF_TOKEN not found in ${HF_TOKEN_FILE}"
|
| 152 |
echo "To follow logs manually:"
|
requirements.txt
CHANGED
|
@@ -1,26 +1,9 @@
|
|
| 1 |
-
#
|
| 2 |
gradio>=4.0.0
|
| 3 |
numpy>=1.24.0
|
| 4 |
-
opencv-python>=4.8.0
|
| 5 |
Pillow>=10.0.0
|
|
|
|
| 6 |
|
| 7 |
-
#
|
| 8 |
-
|
| 9 |
-
uvicorn>=0.27.0
|
| 10 |
-
pydantic>=2.0.0
|
| 11 |
-
|
| 12 |
-
# Hugging Face
|
| 13 |
-
huggingface_hub>=0.20.0
|
| 14 |
-
|
| 15 |
-
# Deep learning
|
| 16 |
-
torch>=2.0.0
|
| 17 |
-
torchvision>=0.15.0
|
| 18 |
-
|
| 19 |
-
# 3D vision dependencies
|
| 20 |
-
trimesh>=4.0.0
|
| 21 |
-
pyrender>=0.1.45
|
| 22 |
-
scikit-image>=0.21.0
|
| 23 |
-
|
| 24 |
-
# FoundationPose specific (will need to install from source)
|
| 25 |
-
# The actual FoundationPose repo needs to be cloned and installed
|
| 26 |
-
# git+https://github.com/NVlabs/FoundationPose.git
|
|
|
|
| 1 |
+
# Minimal requirements - only what's needed for runtime
|
| 2 |
gradio>=4.0.0
|
| 3 |
numpy>=1.24.0
|
| 4 |
+
opencv-python-headless>=4.8.0 # Headless version saves ~400MB
|
| 5 |
Pillow>=10.0.0
|
| 6 |
+
huggingface-hub>=0.20.0
|
| 7 |
|
| 8 |
+
# Note: torch and torchvision are installed separately with CUDA support
|
| 9 |
+
# Note: FoundationPose C++ extensions are built during the Docker image build (see Dockerfile), not installed from this file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|