Spaces:

gpue
/

foundationpose

Sleeping

App Files Files Community

Georg committed on Jan 29

Commit

3968781

1 Parent(s): bbc3fdc

Optimized Docker build to fix OOM errors

Browse files

Files changed (4) hide show

Dockerfile +22 -7
Dockerfile.base +36 -40
deploy.sh +62 -10
requirements.txt +5 -22

Dockerfile CHANGED Viewed

@@ -1,21 +1,36 @@
-# Start from base image (build locally, push to DockerHub)
-# To build base: docker build -f Dockerfile.base -t gpue/foundationpose-base:latest .
-# To push base: docker push gpue/foundationpose-base:latest
 FROM gpue/foundationpose-base:latest
-# FoundationPose configuration - always use real model
 ENV FOUNDATIONPOSE_MODEL_REPO=gpue/foundationpose-weights
 ENV USE_REAL_MODEL=true
-# Build FoundationPose C++ extensions (requires GPU present)
 WORKDIR /app/FoundationPose
 RUN cd bundlesdf/mycuda && pip install . --no-build-isolation
 RUN cd mycpp && python setup.py build_ext --inplace
-# Download model weights from HuggingFace
 WORKDIR /app
 RUN python3 -c "from huggingface_hub import snapshot_download; \
 snapshot_download(repo_id='gpue/foundationpose-weights', local_dir='weights', repo_type='model')"
-# Run the application
 CMD ["python3", "app.py"]

+# Final stage Dockerfile - optimized for HuggingFace
+# Uses runtime base image (not devel) with minimal dependencies
 FROM gpue/foundationpose-base:latest
+# FoundationPose configuration
 ENV FOUNDATIONPOSE_MODEL_REPO=gpue/foundationpose-weights
 ENV USE_REAL_MODEL=true
+# Build FoundationPose C++ extensions (requires GPU).
+# The build tools are installed, used and purged inside ONE RUN instruction:
+# files deleted by a later instruction still occupy space in the earlier
+# layer, so a separate "apt-get purge" step would not shrink the image.
+# pip runs with --no-cache-dir, so no follow-up "pip cache purge" is needed.
+# NOTE(review): the base image is the CUDA *runtime* variant and installs no
+# Python development headers - confirm nvcc and Python.h are available here.
 WORKDIR /app/FoundationPose
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        cmake \
+        build-essential \
+        ninja-build \
+        libeigen3-dev \
+    && (cd bundlesdf/mycuda && pip install --no-cache-dir . --no-build-isolation) \
+    && (cd mycpp && python setup.py build_ext --inplace) \
+    && apt-get purge -y cmake build-essential ninja-build libeigen3-dev \
+    && apt-get autoremove -y \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+# Download model weights
 WORKDIR /app
 RUN python3 -c "from huggingface_hub import snapshot_download; \
 snapshot_download(repo_id='gpue/foundationpose-weights', local_dir='weights', repo_type='model')"
 CMD ["python3", "app.py"]

Dockerfile.base CHANGED Viewed

@@ -1,65 +1,61 @@
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
-# Set environment variables
 ENV DEBIAN_FRONTEND=noninteractive
 ENV CUDA_HOME=/usr/local/cuda
 ENV PATH=${CUDA_HOME}/bin:${PATH}
 ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
-# CUDA architecture list for building extensions without GPU present
-# Covers most modern GPUs: Turing (75), Ampere (80,86), Ada (89), Hopper (90)
-ENV TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.9;9.0"
-# Install system dependencies
-RUN apt-get update && apt-get install -y \
-    git \
-    wget \
-    cmake \
-    build-essential \
     python3.10 \
-    python3.10-dev \
     python3-pip \
-    libgl1-mesa-glx \
     libglib2.0-0 \
-    libsm6 \
-    libxext6 \
-    libxrender-dev \
     libgomp1 \
-    libeigen3-dev \
-    ninja-build \
-    && rm -rf /var/lib/apt/lists/*
-# Set python3.10 as default
-RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
-RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1
 # Upgrade pip
-RUN python3 -m pip install --upgrade pip
-# Set working directory
 WORKDIR /app
-# Copy and install Python dependencies
-COPY requirements.txt .
-RUN pip install --no-cache-dir --upgrade setuptools wheel
-RUN pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cu118
-RUN pip install --no-cache-dir -r requirements.txt
-# Clone FoundationPose repository (but don't build extensions yet)
-RUN git clone https://github.com/NVlabs/FoundationPose.git /app/FoundationPose
-# Patch mycuda setup.py to use C++17 (preparation for GPU build)
-WORKDIR /app/FoundationPose
-RUN cd bundlesdf/mycuda && sed -i 's/-std=c++14/-std=c++17/g' setup.py
-# Reset workdir
-WORKDIR /app
 # Copy application files
 COPY app.py client.py estimator.py ./
-# Create weights directory (weights will be downloaded in final image)
 RUN mkdir -p weights
-# Expose Gradio port
 EXPOSE 7860

+# Minimal base image - optimized for size and build speed
+FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
 ENV DEBIAN_FRONTEND=noninteractive
 ENV CUDA_HOME=/usr/local/cuda
 ENV PATH=${CUDA_HOME}/bin:${PATH}
 ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+# Only build for T4 (7.5) and A100 (8.0) - HuggingFace hardware
+ENV TORCH_CUDA_ARCH_LIST="7.5;8.0"
+# Install minimal runtime dependencies.
+# The extra repo lists under sources.list.d (the problematic CUDA repo) are
+# removed first; "*.list" already matches "cuda*.list". Package signature
+# verification is left enabled (no --allow-unauthenticated), and everything
+# is installed with a single update/install/cleanup pass in one layer.
+RUN rm -f /etc/apt/sources.list.d/*.list && \
+    apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
     python3.10 \
     python3-pip \
+    git \
+    libgl1 \
     libglib2.0-0 \
     libgomp1 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+# Set python as default
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \
+    update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1
 # Upgrade pip
+RUN python3 -m pip install --no-cache-dir --upgrade pip
 WORKDIR /app
+# Install PyTorch (smallest CUDA 11.8 build)
+RUN pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118
+# Copy and install only essential requirements.
+# The packages are installed from requirements.txt so the version constraints
+# are actually applied. Written unquoted on the command line (gradio>=4.0.0),
+# the shell treats ">" as an output redirection: the constraint is dropped and
+# a stray file named "=4.0.0" is created instead.
+# --no-cache-dir keeps the pip cache out of the layer, so no purge is needed.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Clone FoundationPose (but don't build yet - that's done in final stage)
+# and patch the mycuda build to compile as C++17 instead of C++14.
+RUN git clone --depth 1 https://github.com/NVlabs/FoundationPose.git /app/FoundationPose && \
+    cd /app/FoundationPose/bundlesdf/mycuda && \
+    sed -i 's/-std=c++14/-std=c++17/g' setup.py
 # Copy application files
 COPY app.py client.py estimator.py ./
+# Create weights directory
 RUN mkdir -p weights
 EXPOSE 7860

deploy.sh CHANGED Viewed

@@ -1,5 +1,5 @@
 #!/bin/bash
-# Two-stage deployment script for FoundationPose
 set -e
@@ -10,12 +10,12 @@ HF_SPACE="gpue/foundationpose"
 HF_TOKEN_FILE="../training/.env.local"
 echo "==================================="
-echo "FoundationPose Two-Stage Deployment"
 echo "==================================="
 echo ""
-# Stage 1: Build and push base image (local, no GPU needed)
-echo "Stage 1: Building base image locally (no GPU required)"
 echo "Platform: ${PLATFORM}"
 echo "Image: ${IMAGE_NAME}:${TAG}"
 echo ""
@@ -29,11 +29,18 @@ fi
 echo "✓ DockerHub authentication verified"
 echo ""
 docker build --platform ${PLATFORM} -f Dockerfile.base -t ${IMAGE_NAME}:${TAG} .
 echo ""
 echo "✓ Base image built successfully"
 echo ""
 echo "Pushing to DockerHub..."
 docker push ${IMAGE_NAME}:${TAG}
@@ -57,8 +64,8 @@ fi
 # Check if there are changes to commit
 if [[ -n $(git status -s) ]]; then
     echo "Committing changes..."
-    git add Dockerfile Dockerfile.base BUILD.md build_base.sh deploy.sh
-    git commit -m "Two-stage Docker build: base image + GPU compilation"
     echo "✓ Changes committed"
 else
     echo "No changes to commit"
@@ -73,10 +80,12 @@ echo ""
 echo "✓ Pushed to HuggingFace"
 echo ""
 echo "HuggingFace will now:"
-echo "  1. Pull the base image from DockerHub (${IMAGE_NAME}:${TAG})"
-echo "  2. Build C++ extensions with GPU present"
-echo "  3. Download model weights"
-echo "  4. Start the Gradio app"
 echo ""
 # Follow build logs
@@ -95,6 +104,49 @@ if [ -f "${HF_TOKEN_FILE}" ]; then
                 # Parse JSON and extract data field
                 echo "$line" | grep -o '"data":"[^"]*"' | sed 's/"data":"//;s/"$//' | sed 's/\\n/\n/g'
             done
     else
         echo "Warning: HF_TOKEN not found in ${HF_TOKEN_FILE}"
         echo "To follow logs manually:"

 #!/bin/bash
+# FoundationPose deployment script (optimized for HuggingFace)
 set -e
 HF_TOKEN_FILE="../training/.env.local"
 echo "==================================="
+echo "FoundationPose Deployment"
 echo "==================================="
 echo ""
+# Stage 1: Build and push base image
+echo "Stage 1: Building base image"
 echo "Platform: ${PLATFORM}"
 echo "Image: ${IMAGE_NAME}:${TAG}"
 echo ""
 echo "✓ DockerHub authentication verified"
 echo ""
+echo "Building base image..."
 docker build --platform ${PLATFORM} -f Dockerfile.base -t ${IMAGE_NAME}:${TAG} .
 echo ""
 echo "✓ Base image built successfully"
 echo ""
+# Show image size
+IMAGE_SIZE=$(docker images ${IMAGE_NAME}:${TAG} --format "{{.Size}}")
+echo "Image size: ${IMAGE_SIZE}"
+echo ""
 echo "Pushing to DockerHub..."
 docker push ${IMAGE_NAME}:${TAG}
 # Check if there are changes to commit
 if [[ -n $(git status -s) ]]; then
     echo "Committing changes..."
+    git add Dockerfile Dockerfile.base requirements.txt deploy.sh app.py client.py estimator.py
+    git commit -m "Optimized Docker build to fix OOM errors"
     echo "✓ Changes committed"
 else
     echo "No changes to commit"
 echo "✓ Pushed to HuggingFace"
 echo ""
 echo "HuggingFace will now:"
+echo "  1. Pull base image from DockerHub (${IMAGE_NAME}:${TAG})"
+echo "  2. Install build tools temporarily"
+echo "  3. Build C++ extensions with GPU"
+echo "  4. Remove build tools"
+echo "  5. Download model weights (246MB)"
+echo "  6. Start the Gradio app"
 echo ""
 # Follow build logs
                 # Parse JSON and extract data field
                 echo "$line" | grep -o '"data":"[^"]*"' | sed 's/"data":"//;s/"$//' | sed 's/\\n/\n/g'
             done
+        echo ""
+        echo "===================================="
+        echo "Build Status Check"
+        echo "===================================="
+        echo ""
+        # Wait a moment for status to update
+        sleep 2
+        # Check final build status.
+        # The script runs under `set -e`, so a failing curl or a JSON parse
+        # error inside $(...) would abort it silently (stderr is discarded).
+        # Each lookup therefore falls back to a default value instead.
+        STATUS_JSON=$(curl -s -H "Authorization: Bearer ${HF_TOKEN}" \
+            "https://huggingface.co/api/spaces/${HF_SPACE}") || STATUS_JSON=""
+        STAGE=$(echo "$STATUS_JSON" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('runtime', {}).get('stage', 'UNKNOWN'))" 2>/dev/null) || STAGE="UNKNOWN"
+        ERROR_MSG=$(echo "$STATUS_JSON" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('runtime', {}).get('errorMessage', ''))" 2>/dev/null) || ERROR_MSG=""
+        echo "Final Status: ${STAGE}"
+        if [ "${STAGE}" = "RUNNING" ]; then
+            echo "✓ Deployment successful!"
+            echo ""
+            # The Space hostname is HF_SPACE with its first "/" replaced by "-"
+            echo "Space URL: https://${HF_SPACE/\//-}.hf.space"
+            echo "API URL: https://${HF_SPACE/\//-}.hf.space/gradio_api/info"
+            echo ""
+            echo "Test with: cd ../training && make test-perception-api"
+        elif [ "${STAGE}" = "BUILD_ERROR" ]; then
+            echo "✗ Build failed!"
+            if [ -n "${ERROR_MSG}" ]; then
+                echo "Error: ${ERROR_MSG}"
+            fi
+            echo ""
+            echo "If still getting OOM errors, consider:"
+            echo "  - Moving weights to runtime download (not build time)"
+            echo "  - Requesting larger build instance from HuggingFace"
+            echo "  - Using only CUDA arch 7.5 (T4 only)"
+            exit 1
+        else
+            # Any other stage (including the UNKNOWN fallback) is reported
+            # without failing the script.
+            echo "Status: ${STAGE}"
+            if [ -n "${ERROR_MSG}" ]; then
+                echo "Message: ${ERROR_MSG}"
+            fi
+        fi
     else
         echo "Warning: HF_TOKEN not found in ${HF_TOKEN_FILE}"
         echo "To follow logs manually:"

requirements.txt CHANGED Viewed

@@ -1,26 +1,9 @@
-# Core dependencies
 gradio>=4.0.0
 numpy>=1.24.0
-opencv-python>=4.8.0
 Pillow>=10.0.0
-# FastAPI for REST API endpoints
-fastapi>=0.109.0
-uvicorn>=0.27.0
-pydantic>=2.0.0
-# Hugging Face
-huggingface_hub>=0.20.0
-# Deep learning
-torch>=2.0.0
-torchvision>=0.15.0
-# 3D vision dependencies
-trimesh>=4.0.0
-pyrender>=0.1.45
-scikit-image>=0.21.0
-# FoundationPose specific (will need to install from source)
-# The actual FoundationPose repo needs to be cloned and installed
-# git+https://github.com/NVlabs/FoundationPose.git

+# Minimal requirements - only what's needed for runtime
 gradio>=4.0.0
 numpy>=1.24.0
+opencv-python-headless>=4.8.0  # Headless version saves ~400MB
 Pillow>=10.0.0
+huggingface-hub>=0.20.0
+# Note: torch and torchvision are installed separately with CUDA support
+# Note: FoundationPose C++ extensions are compiled during the final Docker image build, not installed from here