Georg commited on
Commit
c58f0bb
·
1 Parent(s): a0f9c96

Update base image build and deps

Browse files
Files changed (4) hide show
  1. Dockerfile +19 -8
  2. Dockerfile.base +127 -48
  3. deploy.sh +122 -96
  4. scripts/run_hf_image_job.py +185 -0
Dockerfile CHANGED
@@ -1,24 +1,37 @@
1
  # Final stage Dockerfile - optimized for HuggingFace
2
- # Uses devel base image (includes CUDA compiler tools)
3
- FROM gpue/foundationpose-base:latest
4
 
5
  # FoundationPose configuration
6
  ENV FOUNDATIONPOSE_MODEL_REPO=gpue/foundationpose-weights
7
  ENV USE_REAL_MODEL=true
8
 
9
- # Ensure NumPy 1.x for CUDA extension compatibility
10
  RUN pip install --no-cache-dir "numpy<2" transformers==4.41.2 \
11
  && pip install --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu118_pyt210/download.html
12
 
13
  # Set MAX_JOBS=1 BEFORE any CUDA compilation to limit memory usage
14
  ENV MAX_JOBS=1
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  # Install nvdiffrast (CUDA rasterizer) - needs GPU, build here
17
  RUN git clone --depth 1 https://github.com/NVlabs/nvdiffrast.git /tmp/nvdiffrast \
18
  && cd /tmp/nvdiffrast \
19
  && python3 setup.py build_ext --inplace
20
  RUN python3 -c "import shutil, sysconfig, glob; from pathlib import Path; site=Path(sysconfig.get_paths()['purelib']); src=Path('/tmp/nvdiffrast/nvdiffrast'); dst=site/'nvdiffrast'; shutil.rmtree(dst, ignore_errors=True); shutil.copytree(src, dst); so_files=(glob.glob('/tmp/nvdiffrast/_nvdiffrast_c*.so') + glob.glob('/tmp/nvdiffrast/nvdiffrast/_nvdiffrast_c*.so') + glob.glob('/tmp/nvdiffrast/build/lib.*/*_nvdiffrast_c*.so')); [shutil.copy2(p, site) for p in so_files]"
21
- RUN python3 -c "import sysconfig; from pathlib import Path; site=Path(sysconfig.get_paths()['purelib']); dist=site/'nvdiffrast-0.0.0.dist-info'; dist.mkdir(exist_ok=True); (dist/'METADATA').write_text('Metadata-Version: 2.1\\nName: nvdiffrast\\nVersion: 0.0.0\\n'); (dist/'WHEEL').write_text('Wheel-Version: 1.0\\nGenerator: manual\\nRoot-Is-Purelib: false\\nTag: py3-none-any\\n'); (dist/'top_level.txt').write_text('nvdiffrast\\n'); (dist/'RECORD').write_text('')"
22
  RUN python3 -c "import nvdiffrast.torch"
23
  RUN rm -rf /tmp/nvdiffrast
24
 
@@ -26,10 +39,8 @@ RUN rm -rf /tmp/nvdiffrast
26
  WORKDIR /app/FoundationPose
27
  RUN cd bundlesdf/mycuda && pip install . --no-build-isolation
28
 
29
- # Note: mycpp build, weights download, and build deps are already in base image
30
  WORKDIR /app
31
-
32
- # Copy application files (placed here so changes don't require base image rebuild)
33
- COPY app.py client.py estimator.py masks.py ./
34
 
35
  CMD ["python3", "app.py"]
 
1
  # Final stage Dockerfile - optimized for HuggingFace
2
+ FROM gpue/foundationpose-base-l2:latest
 
3
 
4
  # FoundationPose configuration
5
  ENV FOUNDATIONPOSE_MODEL_REPO=gpue/foundationpose-weights
6
  ENV USE_REAL_MODEL=true
7
 
8
+ # Ensure NumPy 1.x for CUDA extension compatibility and install SAM/pytorch3d
9
  RUN pip install --no-cache-dir "numpy<2" transformers==4.41.2 \
10
  && pip install --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu118_pyt210/download.html
11
 
12
  # Set MAX_JOBS=1 BEFORE any CUDA compilation to limit memory usage
13
  ENV MAX_JOBS=1
14
 
15
+ # Clone FoundationPose source
16
+ RUN git clone --depth 1 https://github.com/NVlabs/FoundationPose.git /app/FoundationPose \
17
+ && cd /app/FoundationPose/bundlesdf/mycuda \
18
+ && sed -i 's/-std=c++14/-std=c++17/g' setup.py
19
+
20
+ # Build CPU-only C++ code
21
+ WORKDIR /app/FoundationPose
22
+ RUN cd mycpp && mkdir -p build && cd build && cmake .. && make
23
+
24
+ # Download model weights (246MB)
25
+ WORKDIR /app
26
+ COPY download_weights.py ./download_weights.py
27
+ RUN python3 download_weights.py
28
+
29
  # Install nvdiffrast (CUDA rasterizer) - needs GPU, build here
30
  RUN git clone --depth 1 https://github.com/NVlabs/nvdiffrast.git /tmp/nvdiffrast \
31
  && cd /tmp/nvdiffrast \
32
  && python3 setup.py build_ext --inplace
33
  RUN python3 -c "import shutil, sysconfig, glob; from pathlib import Path; site=Path(sysconfig.get_paths()['purelib']); src=Path('/tmp/nvdiffrast/nvdiffrast'); dst=site/'nvdiffrast'; shutil.rmtree(dst, ignore_errors=True); shutil.copytree(src, dst); so_files=(glob.glob('/tmp/nvdiffrast/_nvdiffrast_c*.so') + glob.glob('/tmp/nvdiffrast/nvdiffrast/_nvdiffrast_c*.so') + glob.glob('/tmp/nvdiffrast/build/lib.*/*_nvdiffrast_c*.so')); [shutil.copy2(p, site) for p in so_files]"
34
+ RUN python3 -c "import sysconfig; from pathlib import Path; site=Path(sysconfig.get_paths()['purelib']); dist=site/'nvdiffrast-0.0.0.dist-info'; dist.mkdir(exist_ok=True); (dist/'METADATA').write_text('Metadata-Version: 2.1\nName: nvdiffrast\nVersion: 0.0.0\n'); (dist/'WHEEL').write_text('Wheel-Version: 1.0\nGenerator: manual\nRoot-Is-Purelib: false\nTag: py3-none-any\n'); (dist/'top_level.txt').write_text('nvdiffrast\n'); (dist/'RECORD').write_text('')"
35
  RUN python3 -c "import nvdiffrast.torch"
36
  RUN rm -rf /tmp/nvdiffrast
37
 
 
39
  WORKDIR /app/FoundationPose
40
  RUN cd bundlesdf/mycuda && pip install . --no-build-isolation
41
 
42
+ # Copy application files
43
  WORKDIR /app
44
+ COPY app.py client.py estimator.py masks.py ./
 
 
45
 
46
  CMD ["python3", "app.py"]
Dockerfile.base CHANGED
@@ -1,5 +1,108 @@
1
- # Base image with CUDA compiler tools (needed for C++ extensions)
2
- FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  ENV DEBIAN_FRONTEND=noninteractive
5
  ENV CUDA_HOME=/usr/local/cuda
@@ -9,8 +112,7 @@ ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
9
  # Only build for T4 (7.5) - reduces compilation memory by 50%
10
  ENV TORCH_CUDA_ARCH_LIST="7.5"
11
 
12
- # Install minimal runtime dependencies
13
- # Remove problematic CUDA repo and install packages
14
  RUN rm -f /etc/apt/sources.list.d/cuda*.list /etc/apt/sources.list.d/*.list && \
15
  apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
16
  ca-certificates \
@@ -22,6 +124,16 @@ RUN rm -f /etc/apt/sources.list.d/cuda*.list /etc/apt/sources.list.d/*.list && \
22
  libgl1 \
23
  libglib2.0-0 \
24
  libgomp1 \
 
 
 
 
 
 
 
 
 
 
25
  && rm -rf /var/lib/apt/lists/* \
26
  && apt-get clean
27
 
@@ -32,23 +144,22 @@ RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
32
  # Upgrade pip
33
  RUN python3 -m pip install --no-cache-dir --upgrade pip
34
 
35
- WORKDIR /app
 
 
36
 
37
- # Install PyTorch (smallest CUDA 11.8 build)
38
- RUN pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118
39
 
40
- # Install only essential requirements
41
- # Pin NumPy to 1.x for CUDA extension compatibility
42
  RUN pip install --no-cache-dir \
43
- "numpy<2" \
44
- gradio>=4.0.0 \
45
- opencv-python-headless>=4.8.0 \
46
- Pillow>=10.0.0 \
47
- huggingface-hub>=0.20.0 \
48
  && pip cache purge
49
 
50
- # Install build dependencies (keep them for faster HuggingFace builds)
51
- # Install BEFORE nvdiffrast because it needs python3.10-dev
52
  RUN apt-get update && apt-get install -y --no-install-recommends \
53
  cmake \
54
  build-essential \
@@ -60,38 +171,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
60
  pybind11-dev \
61
  && rm -rf /var/lib/apt/lists/*
62
 
63
- # Install FoundationPose dependencies
64
- RUN pip install --no-cache-dir \
65
- trimesh==4.2.2 \
66
- scipy==1.12.0 \
67
- scikit-image==0.22.0 \
68
- kornia==0.7.2 \
69
- einops==0.7.0 \
70
- timm==0.9.16 \
71
- transformations==2024.6.1 \
72
- pyyaml==6.0.1 \
73
- joblib==1.4.0 \
74
- psutil==6.1.1 \
75
- open3d==0.18.0 \
76
- && pip cache purge
77
-
78
- # Note: nvdiffrast will be built in final Dockerfile on HuggingFace (needs GPU)
79
-
80
- # Clone FoundationPose
81
- RUN git clone --depth 1 https://github.com/NVlabs/FoundationPose.git /app/FoundationPose && \
82
- cd /app/FoundationPose/bundlesdf/mycuda && \
83
- sed -i 's/-std=c++14/-std=c++17/g' setup.py
84
-
85
- # Build mycpp (non-GPU C++ code - can be built without GPU)
86
- WORKDIR /app/FoundationPose
87
- RUN cd mycpp && mkdir -p build && cd build && cmake .. && make
88
-
89
- # Download model weights (246MB)
90
  WORKDIR /app
91
- RUN python3 -c "from huggingface_hub import snapshot_download; \
92
- snapshot_download(repo_id='gpue/foundationpose-weights', local_dir='weights', repo_type='model')"
93
-
94
- # Note: Application files (app.py, client.py, estimator.py) are copied in main Dockerfile
95
- # This allows updates without rebuilding the entire base image
96
 
97
  EXPOSE 7860
 
1
+ # Base image with FoundationPose dependencies split into CPU (L1) and GPU (L2)
2
+
3
+ # Stage 1: CPU-only base with Python deps
4
+ FROM ubuntu:22.04 AS foundationpose-base-l1
5
+
6
+ ENV DEBIAN_FRONTEND=noninteractive
7
+
8
+ # Install system deps needed to build/run python packages
9
+ RUN apt-get update && apt-get install -y --no-install-recommends \
10
+ ca-certificates \
11
+ git \
12
+ python3.10 \
13
+ python3-pip \
14
+ build-essential \
15
+ cmake \
16
+ ninja-build \
17
+ libeigen3-dev \
18
+ python3.10-dev \
19
+ libboost-system-dev \
20
+ libboost-program-options-dev \
21
+ pybind11-dev \
22
+ libgl1 \
23
+ libglib2.0-0 \
24
+ libgomp1 \
25
+ libsm6 \
26
+ libxext6 \
27
+ libxrender1 \
28
+ libxkbcommon0 \
29
+ libx11-6 \
30
+ libxrandr2 \
31
+ libxi6 \
32
+ libxinerama1 \
33
+ libxcursor1 \
34
+ libspatialindex-dev \
35
+ && rm -rf /var/lib/apt/lists/*
36
+
37
+ # Set python as default
38
+ RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \
39
+ update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1
40
+
41
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
42
+
43
+ # Core python deps (CPU-safe)
44
+ # Keep NumPy <2 for extension compatibility
45
+ RUN pip install --no-cache-dir \
46
+ "numpy<2" \
47
+ "Pillow>=10.0.0" \
48
+ "gradio>=4.0.0" \
49
+ "huggingface-hub>=0.20.0" \
50
+ scipy==1.12.0 \
51
+ scikit-image==0.22.0 \
52
+ scikit-learn==1.4.1.post1 \
53
+ kornia==0.7.2 \
54
+ einops==0.7.0 \
55
+ timm==0.9.16 \
56
+ pyyaml==6.0.1 \
57
+ ruamel.yaml==0.18.6 \
58
+ omegaconf==2.3.0 \
59
+ h5py==3.10.0 \
60
+ numba==0.59.1 \
61
+ imageio==2.34.0 \
62
+ joblib==1.3.2 \
63
+ psutil==6.1.1 \
64
+ albumentations==1.4.2 \
65
+ imgaug==0.4.0 \
66
+ seaborn==0.13.2 \
67
+ plotly==5.20.0 \
68
+ bokeh==3.4.0 \
69
+ colorama==0.4.6 \
70
+ GPUtil==1.4.0 \
71
+ simplejson==3.19.2 \
72
+ openpyxl==3.1.2 \
73
+ xlsxwriter==3.2.0 \
74
+ nodejs==0.1.1 \
75
+ jupyterlab==4.1.5 \
76
+ ipywidgets==8.1.2 \
77
+ py-spy==0.3.14 \
78
+ videoio==0.2.8 \
79
+ pypng==0.20220715.0 \
80
+ roma==1.4.4 \
81
+ transformations==2024.6.1 \
82
+ meshcat==0.3.2 \
83
+ webdataset==0.2.86 \
84
+ wandb==0.16.5 \
85
+ g4f==0.2.7.1 \
86
+ objaverse==0.1.7 \
87
+ opencv-python==4.9.0.80 \
88
+ opencv-contrib-python==4.9.0.80 \
89
+ open3d==0.18.0 \
90
+ pyglet==1.5.28 \
91
+ pysdf==0.1.9 \
92
+ trimesh==4.2.2 \
93
+ xatlas==0.0.9 \
94
+ rtree==1.2.0 \
95
+ pyrender==0.1.45 \
96
+ "pyOpenGL>=3.1.0" \
97
+ "pyOpenGL_accelerate>=3.1.0" \
98
+ pybullet==3.2.6 \
99
+ pycocotools==2.0.7 \
100
+ Panda3D==1.10.14 \
101
+ pin==2.7.0 \
102
+ && pip cache purge
103
+
104
+ # Stage 2: GPU-enabled base
105
+ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS foundationpose-base-l2
106
 
107
  ENV DEBIAN_FRONTEND=noninteractive
108
  ENV CUDA_HOME=/usr/local/cuda
 
112
  # Only build for T4 (7.5) - reduces compilation memory by 50%
113
  ENV TORCH_CUDA_ARCH_LIST="7.5"
114
 
115
+ # Install system deps
 
116
  RUN rm -f /etc/apt/sources.list.d/cuda*.list /etc/apt/sources.list.d/*.list && \
117
  apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
118
  ca-certificates \
 
124
  libgl1 \
125
  libglib2.0-0 \
126
  libgomp1 \
127
+ libsm6 \
128
+ libxext6 \
129
+ libxrender1 \
130
+ libxkbcommon0 \
131
+ libx11-6 \
132
+ libxrandr2 \
133
+ libxi6 \
134
+ libxinerama1 \
135
+ libxcursor1 \
136
+ libspatialindex-dev \
137
  && rm -rf /var/lib/apt/lists/* \
138
  && apt-get clean
139
 
 
144
  # Upgrade pip
145
  RUN python3 -m pip install --no-cache-dir --upgrade pip
146
 
147
+ # Copy CPU-only python deps from L1
148
+ COPY --from=foundationpose-base-l1 /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
149
+ COPY --from=foundationpose-base-l1 /usr/local/bin /usr/local/bin
150
 
151
+ # Install PyTorch (CUDA 11.8)
152
+ RUN pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118
153
 
154
+ # GPU/torch-dependent deps
 
155
  RUN pip install --no-cache-dir \
156
+ fvcore==0.1.5.post20221221 \
157
+ torchnet==0.0.4 \
158
+ ultralytics==8.0.120 \
159
+ warp-lang==1.0.2 \
 
160
  && pip cache purge
161
 
162
+ # Build deps required for CUDA extensions
 
163
  RUN apt-get update && apt-get install -y --no-install-recommends \
164
  cmake \
165
  build-essential \
 
171
  pybind11-dev \
172
  && rm -rf /var/lib/apt/lists/*
173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  WORKDIR /app
 
 
 
 
 
175
 
176
  EXPOSE 7860
deploy.sh CHANGED
@@ -3,58 +3,63 @@
3
 
4
  set -e
5
 
6
- IMAGE_NAME="gpue/foundationpose-base"
7
  TAG="latest"
8
  PLATFORM="linux/amd64"
9
  HF_SPACE="gpue/foundationpose"
10
- HF_TOKEN_FILE="../training/.env.local"
11
 
12
  echo "==================================="
13
  echo "FoundationPose Deployment"
14
  echo "==================================="
15
  echo ""
16
 
17
- # Stage 1: Build and push base image
18
- echo "Stage 1: Building base image"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  echo "Platform: ${PLATFORM}"
20
  echo "Image: ${IMAGE_NAME}:${TAG}"
21
  echo ""
22
 
23
- # Check Docker login (prefer token if provided)
24
- if [ -n "${DOCKER_HF_PAT}" ]; then
25
- DOCKER_USER="${DOCKER_HF_USER:-gpue}"
26
- echo "${DOCKER_HF_PAT}" | docker login -u "${DOCKER_USER}" --password-stdin
27
- echo " DockerHub authentication verified (token)"
 
 
 
 
 
 
28
  else
29
- if [ ! -f ~/.docker/config.json ] || ! grep -q "index.docker.io" ~/.docker/config.json 2>/dev/null; then
30
- echo "Error: Not logged in to DockerHub"
31
- echo "Please run: docker login or set DOCKER_HF_PAT"
32
- exit 1
 
 
 
 
33
  fi
34
- echo "✓ DockerHub authentication verified"
35
  fi
36
- echo ""
37
-
38
- echo "Building base image..."
39
- docker build --platform ${PLATFORM} -f Dockerfile.base -t ${IMAGE_NAME}:${TAG} .
40
-
41
- echo ""
42
- echo "✓ Base image built successfully"
43
- echo ""
44
-
45
- # Show image size
46
- IMAGE_SIZE=$(docker images ${IMAGE_NAME}:${TAG} --format "{{.Size}}")
47
- echo "Image size: ${IMAGE_SIZE}"
48
- echo ""
49
-
50
- echo "Pushing to DockerHub..."
51
- docker push ${IMAGE_NAME}:${TAG}
52
 
53
  echo ""
54
- echo "✓ Base image pushed to DockerHub: ${IMAGE_NAME}:${TAG}"
55
- echo ""
56
-
57
- # Stage 2: Deploy to HuggingFace
58
  echo "Stage 2: Deploying to HuggingFace Space"
59
  echo ""
60
 
@@ -62,7 +67,7 @@ echo ""
62
  if [ ! -d .git ]; then
63
  echo "Initializing git repository..."
64
  git init
65
- git remote add origin https://huggingface.co/spaces/${HF_SPACE}
66
  echo "✓ Git repository initialized"
67
  echo ""
68
  fi
@@ -70,8 +75,8 @@ fi
70
  # Check if there are changes to commit
71
  if [[ -n $(git status -s) ]]; then
72
  echo "Committing changes..."
73
- git add Dockerfile Dockerfile.base requirements.txt deploy.sh app.py client.py estimator.py masks.py
74
- git commit -m "Optimized Docker build to fix OOM errors"
75
  echo "✓ Changes committed"
76
  else
77
  echo "No changes to commit"
@@ -80,18 +85,16 @@ fi
80
  # Push to HuggingFace
81
  echo ""
82
  echo "Pushing to HuggingFace Space: ${HF_SPACE}"
83
- git push https://huggingface.co/spaces/${HF_SPACE} main --force
84
 
85
  echo ""
86
  echo "✓ Pushed to HuggingFace"
87
  echo ""
88
  echo "HuggingFace will now:"
89
  echo " 1. Pull base image from DockerHub (${IMAGE_NAME}:${TAG})"
90
- echo " 2. Install build tools temporarily"
91
- echo " 3. Build C++ extensions with GPU"
92
- echo " 4. Remove build tools"
93
- echo " 5. Download model weights (246MB)"
94
- echo " 6. Start the Gradio app"
95
  echo ""
96
 
97
  # Follow build logs
@@ -99,67 +102,90 @@ echo "Following build logs..."
99
  echo "Press Ctrl+C to stop watching"
100
  echo ""
101
 
102
- # Load HF token
103
- if [ -f "${HF_TOKEN_FILE}" ]; then
104
- HF_TOKEN=$(grep "^HUGGINGFACE_TOKEN=" "${HF_TOKEN_FILE}" | cut -d'=' -f2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
- if [ -n "${HF_TOKEN}" ]; then
107
- curl -N -H "Authorization: Bearer ${HF_TOKEN}" \
108
- "https://huggingface.co/api/spaces/${HF_SPACE}/logs/build" 2>/dev/null | \
109
- while IFS= read -r line; do
110
- # Parse JSON and extract data field
111
- echo "$line" | grep -o '"data":"[^"]*"' | sed 's/"data":"//;s/"$//' | sed 's/\\n/\n/g'
112
- done
113
 
 
 
 
 
114
  echo ""
115
- echo "===================================="
116
- echo "Build Status Check"
117
- echo "===================================="
118
  echo ""
119
-
120
- # Wait a moment for status to update
121
- sleep 2
122
-
123
- # Check final build status
124
- STATUS_JSON=$(curl -s -H "Authorization: Bearer ${HF_TOKEN}" \
125
- "https://huggingface.co/api/spaces/${HF_SPACE}")
126
-
127
- STAGE=$(echo "$STATUS_JSON" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('runtime', {}).get('stage', 'UNKNOWN'))" 2>/dev/null)
128
- ERROR_MSG=$(echo "$STATUS_JSON" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('runtime', {}).get('errorMessage', ''))" 2>/dev/null)
129
-
130
- echo "Final Status: ${STAGE}"
131
-
132
- if [ "${STAGE}" = "RUNNING" ]; then
133
- echo "✓ Deployment successful!"
134
- echo ""
135
- echo "Space URL: https://${HF_SPACE/\//-}.hf.space"
136
- echo "API URL: https://${HF_SPACE/\//-}.hf.space/gradio_api/info"
137
- echo ""
138
- echo "Test with: cd ../training && make test-perception-api"
139
- elif [ "${STAGE}" = "BUILD_ERROR" ]; then
140
- echo "✗ Build failed!"
141
- if [ -n "${ERROR_MSG}" ]; then
142
- echo "Error: ${ERROR_MSG}"
143
- fi
144
- echo ""
145
- echo "If still getting OOM errors, consider:"
146
- echo " - Moving weights to runtime download (not build time)"
147
- echo " - Requesting larger build instance from HuggingFace"
148
- echo " - Using only CUDA arch 7.5 (T4 only)"
149
- exit 1
150
- else
151
- echo "Status: ${STAGE}"
152
- if [ -n "${ERROR_MSG}" ]; then
153
- echo "Message: ${ERROR_MSG}"
154
- fi
155
  fi
 
 
 
 
 
 
156
  else
157
- echo "Warning: HF_TOKEN not found in ${HF_TOKEN_FILE}"
158
- echo "To follow logs manually:"
159
- echo " curl -N -H \"Authorization: Bearer \$HF_TOKEN\" \"https://huggingface.co/api/spaces/${HF_SPACE}/logs/build\""
 
160
  fi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  else
162
- echo "Warning: ${HF_TOKEN_FILE} not found"
163
  echo "To follow logs manually:"
164
  echo " curl -N -H \"Authorization: Bearer \$HF_TOKEN\" \"https://huggingface.co/api/spaces/${HF_SPACE}/logs/build\""
165
  fi
 
3
 
4
  set -e
5
 
6
+ IMAGE_NAME="gpue/foundationpose-base-l2"
7
  TAG="latest"
8
  PLATFORM="linux/amd64"
9
  HF_SPACE="gpue/foundationpose"
10
+ ENV_FILE=".env"
11
 
12
  echo "==================================="
13
  echo "FoundationPose Deployment"
14
  echo "==================================="
15
  echo ""
16
 
17
+ # Load tokens from .env
18
+ if [ -f "${ENV_FILE}" ]; then
19
+ set -a
20
+ # shellcheck disable=SC1090
21
+ source "${ENV_FILE}"
22
+ set +a
23
+ else
24
+ echo "Warning: ${ENV_FILE} not found"
25
+ fi
26
+
27
+ # Ensure hf CLI is available for job logs
28
+ if ! command -v hf >/dev/null 2>&1; then
29
+ echo "Installing huggingface_hub CLI (hf)..."
30
+ python3 -m pip install --user --quiet huggingface_hub
31
+ export PATH="$HOME/.local/bin:$PATH"
32
+ fi
33
+
34
+ echo "Stage 1: Building base image via HF Job"
35
  echo "Platform: ${PLATFORM}"
36
  echo "Image: ${IMAGE_NAME}:${TAG}"
37
  echo ""
38
 
39
+ JOB_OUTPUT=$(python3 scripts/run_hf_image_job.py \
40
+ --image-name "${IMAGE_NAME}" \
41
+ --tag "${TAG}" \
42
+ --platform "${PLATFORM}" \
43
+ --dockerfile "Dockerfile.base" \
44
+ --target "foundationpose-base-l2" \
45
+ --git-repo "https://huggingface.co/spaces/${HF_SPACE}" 2>&1 | tee /tmp/hf_image_job.log)
46
+
47
+ JOB_ID=$(echo "${JOB_OUTPUT}" | awk '/Job ID:/ {print $3}')
48
+ if [ -z "${JOB_ID}" ]; then
49
+ echo "Warning: Could not parse HF job id. See /tmp/hf_image_job.log"
50
  else
51
+ echo "Following job logs for 1 minute..."
52
+ if command -v hf >/dev/null 2>&1; then
53
+ (timeout 60 hf jobs logs "${JOB_ID}") || true
54
+ echo ""
55
+ echo "Job status:"
56
+ hf jobs status "${JOB_ID}" || true
57
+ else
58
+ echo "hf CLI not available; job logs skipped"
59
  fi
 
60
  fi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  echo ""
 
 
 
 
63
  echo "Stage 2: Deploying to HuggingFace Space"
64
  echo ""
65
 
 
67
  if [ ! -d .git ]; then
68
  echo "Initializing git repository..."
69
  git init
70
+ git remote add origin "https://huggingface.co/spaces/${HF_SPACE}"
71
  echo "✓ Git repository initialized"
72
  echo ""
73
  fi
 
75
  # Check if there are changes to commit
76
  if [[ -n $(git status -s) ]]; then
77
  echo "Committing changes..."
78
+ git add Dockerfile Dockerfile.base requirements.txt deploy.sh app.py client.py estimator.py masks.py scripts/run_hf_image_job.py download_weights.py
79
+ git commit -m "Update base image build and deps"
80
  echo "✓ Changes committed"
81
  else
82
  echo "No changes to commit"
 
85
  # Push to HuggingFace
86
  echo ""
87
  echo "Pushing to HuggingFace Space: ${HF_SPACE}"
88
+ git push "https://huggingface.co/spaces/${HF_SPACE}" main --force
89
 
90
  echo ""
91
  echo "✓ Pushed to HuggingFace"
92
  echo ""
93
  echo "HuggingFace will now:"
94
  echo " 1. Pull base image from DockerHub (${IMAGE_NAME}:${TAG})"
95
+ echo " 2. Build CUDA extensions"
96
+ echo " 3. Download model weights"
97
+ echo " 4. Start the Gradio app"
 
 
98
  echo ""
99
 
100
  # Follow build logs
 
102
  echo "Press Ctrl+C to stop watching"
103
  echo ""
104
 
105
+ export HF_TOKEN="${HUGGINGFACE_TOKEN:-${HF_TOKEN:-}}"
106
+
107
+ if [ -n "${HF_TOKEN}" ]; then
108
+ curl -N -H "Authorization: Bearer ${HF_TOKEN}" \
109
+ "https://huggingface.co/api/spaces/${HF_SPACE}/logs/build" 2>/dev/null | \
110
+ while IFS= read -r line; do
111
+ echo "$line" | grep -o '"data":"[^"]*"' | sed 's/"data":"//;s/"$//' | sed 's/\\n/\n/g'
112
+ done
113
+
114
+ echo ""
115
+ echo "===================================="
116
+ echo "Build Status Check"
117
+ echo "===================================="
118
+ echo ""
119
+
120
+ # Wait a moment for status to update
121
+ sleep 2
122
+
123
+ # Check final build status
124
+ STATUS_JSON=$(curl -s -H "Authorization: Bearer ${HF_TOKEN}" \
125
+ "https://huggingface.co/api/spaces/${HF_SPACE}")
126
 
127
+ STAGE=$(echo "$STATUS_JSON" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('runtime', {}).get('stage', 'UNKNOWN'))" 2>/dev/null)
128
+ ERROR_MSG=$(echo "$STATUS_JSON" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('runtime', {}).get('errorMessage', ''))" 2>/dev/null)
 
 
 
 
 
129
 
130
+ echo "Final Status: ${STAGE}"
131
+
132
+ if [ "${STAGE}" = "RUNNING" ]; then
133
+ echo "✓ Deployment successful!"
134
  echo ""
135
+ echo "Space URL: https://${HF_SPACE/\//-}.hf.space"
136
+ echo "API URL: https://${HF_SPACE/\//-}.hf.space/gradio_api/info"
 
137
  echo ""
138
+ echo "Test with: cd ../training && make test-perception-api"
139
+ elif [ "${STAGE}" = "BUILD_ERROR" ]; then
140
+ echo "✗ Build failed!"
141
+ if [ -n "${ERROR_MSG}" ]; then
142
+ echo "Error: ${ERROR_MSG}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  fi
144
+ echo ""
145
+ echo "If still getting OOM errors, consider:"
146
+ echo " - Moving weights to runtime download (not build time)"
147
+ echo " - Requesting larger build instance from HuggingFace"
148
+ echo " - Using only CUDA arch 7.5 (T4 only)"
149
+ exit 1
150
  else
151
+ echo "Status: ${STAGE}"
152
+ if [ -n "${ERROR_MSG}" ]; then
153
+ echo "Message: ${ERROR_MSG}"
154
+ fi
155
  fi
156
+
157
+ echo ""
158
+ echo "Following application logs for 1 minute..."
159
+ export LOG_URL="https://huggingface.co/api/spaces/${HF_SPACE}/logs"
160
+ python3 - <<'PY'
161
+ import os
162
+ import subprocess
163
+ import sys
164
+ import time
165
+
166
+ log_url = os.environ.get("LOG_URL")
167
+ token = os.environ.get("HF_TOKEN")
168
+ if not log_url or not token:
169
+ print("Skipping app logs: missing LOG_URL or HF_TOKEN")
170
+ raise SystemExit(0)
171
+
172
+ proc = subprocess.Popen(
173
+ ["curl", "-N", "-H", f"Authorization: Bearer {token}", log_url],
174
+ stdout=sys.stdout,
175
+ stderr=subprocess.DEVNULL,
176
+ )
177
+ try:
178
+ time.sleep(60)
179
+ finally:
180
+ proc.terminate()
181
+ try:
182
+ proc.wait(timeout=5)
183
+ except Exception:
184
+ proc.kill()
185
+ PY
186
+
187
  else
188
+ echo "Warning: HF token not available; cannot follow logs"
189
  echo "To follow logs manually:"
190
  echo " curl -N -H \"Authorization: Bearer \$HF_TOKEN\" \"https://huggingface.co/api/spaces/${HF_SPACE}/logs/build\""
191
  fi
scripts/run_hf_image_job.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Submit a HuggingFace Job that builds the FoundationPose base image and pushes it to Docker Hub.
4
+ """
5
+
6
+ import argparse
7
+ import os
8
+ import sys
9
+
10
+ from huggingface_hub import run_job
11
+
12
+
13
+ def main() -> None:
14
+ parser = argparse.ArgumentParser(
15
+ description="Build and push the FoundationPose base image via HuggingFace Jobs"
16
+ )
17
+ parser.add_argument(
18
+ "--image-name",
19
+ default="gpue/foundationpose-base-l2",
20
+ help="Docker Hub image name (default: gpue/foundationpose-base-l2)",
21
+ )
22
+ parser.add_argument(
23
+ "--tag",
24
+ default="latest",
25
+ help="Docker image tag (default: latest)",
26
+ )
27
+ parser.add_argument(
28
+ "--platform",
29
+ default="linux/amd64",
30
+ help="Target platform for docker build (default: linux/amd64)",
31
+ )
32
+ parser.add_argument(
33
+ "--dockerfile",
34
+ default="Dockerfile.base",
35
+ help="Dockerfile path inside repo (default: Dockerfile.base)",
36
+ )
37
+ parser.add_argument(
38
+ "--context",
39
+ default=".",
40
+ help="Docker build context path inside repo (default: .)",
41
+ )
42
+ parser.add_argument(
43
+ "--target",
44
+ default="foundationpose-base-l2",
45
+ help="Docker build target (default: foundationpose-base-l2)",
46
+ )
47
+ parser.add_argument(
48
+ "--git-repo",
49
+ default="https://huggingface.co/spaces/gpue/foundationpose",
50
+ help="Git repo to clone for build context (default: HF space repo)",
51
+ )
52
+ parser.add_argument(
53
+ "--flavor",
54
+ default="l40s",
55
+ help="HF Jobs hardware flavor (default: l40s)",
56
+ )
57
+ parser.add_argument(
58
+ "--timeout",
59
+ default="2h",
60
+ help="Job timeout (default: 2h)",
61
+ )
62
+ parser.add_argument("--namespace", help="Organization namespace (optional)")
63
+ parser.add_argument(
64
+ "--hf-token",
65
+ help="HuggingFace token (default: from HF_TOKEN or HUGGINGFACE_TOKEN env)",
66
+ )
67
+ parser.add_argument(
68
+ "--docker-user",
69
+ default=os.getenv("DOCKER_HF_USER", "gpue"),
70
+ help="Docker Hub username (default: DOCKER_HF_USER or gpue)",
71
+ )
72
+ parser.add_argument(
73
+ "--docker-token",
74
+ help="Docker Hub token (default: from DOCKER_HF_PAT env)",
75
+ )
76
+
77
+ args = parser.parse_args()
78
+
79
+ hf_token = args.hf_token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
80
+ docker_token = args.docker_token or os.getenv("DOCKER_HF_PAT")
81
+
82
+ if not hf_token:
83
+ print("Error: missing HF token (set HF_TOKEN or HUGGINGFACE_TOKEN)")
84
+ sys.exit(1)
85
+ if not docker_token:
86
+ print("Error: missing Docker token (set DOCKER_HF_PAT or --docker-token)")
87
+ sys.exit(1)
88
+
89
+ env = {
90
+ "IMAGE_NAME": args.image_name,
91
+ "IMAGE_TAG": args.tag,
92
+ "PLATFORM": args.platform,
93
+ "DOCKERFILE": args.dockerfile,
94
+ "CONTEXT": args.context,
95
+ "TARGET": args.target,
96
+ "GIT_REPO": args.git_repo,
97
+ "DOCKER_USER": args.docker_user,
98
+ }
99
+ secrets = {
100
+ "HF_TOKEN": hf_token,
101
+ "DOCKER_TOKEN": docker_token,
102
+ }
103
+
104
+ command = [
105
+ "sh",
106
+ "-c",
107
+ r"""
108
+ set -euo pipefail
109
+
110
+ echo "Installing git and certificates..."
111
+ apk add --no-cache git ca-certificates curl >/dev/null
112
+
113
+ # Start Docker daemon (DinD image)
114
+ echo "Starting Docker daemon..."
115
+ dockerd-entrypoint.sh > /tmp/dockerd.log 2>&1 &
116
+
117
+ # Wait for Docker
118
+ for i in $(seq 1 30); do
119
+ if docker info >/dev/null 2>&1; then
120
+ break
121
+ fi
122
+ sleep 1
123
+ if [ "$i" -eq 30 ]; then
124
+ echo "Docker did not start in time. Logs:" >&2
125
+ tail -n 200 /tmp/dockerd.log >&2 || true
126
+ exit 1
127
+ fi
128
+ done
129
+
130
+ echo "Cloning build context..."
131
+ if [ -n "${HF_TOKEN:-}" ]; then
132
+ AUTH_REPO=$(echo "$GIT_REPO" | sed -e "s#https://#https://user:${HF_TOKEN}@#")
133
+ git clone --depth 1 "$AUTH_REPO" /work/repo
134
+ else
135
+ git clone --depth 1 "$GIT_REPO" /work/repo
136
+ fi
137
+
138
+ cd /work/repo
139
+
140
+ echo "Logging in to Docker Hub..."
141
+ echo "$DOCKER_TOKEN" | docker login -u "$DOCKER_USER" --password-stdin
142
+
143
+ IMAGE_REF="$IMAGE_NAME:$IMAGE_TAG"
144
+
145
+ echo "Building image $IMAGE_REF (target: $TARGET)..."
146
+ docker build --platform "$PLATFORM" -f "$DOCKERFILE" --target "$TARGET" -t "$IMAGE_REF" "$CONTEXT"
147
+
148
+ echo "Pushing image $IMAGE_REF..."
149
+ docker push "$IMAGE_REF"
150
+
151
+ echo "✓ Image pushed successfully"
152
+ """,
153
+ ]
154
+
155
+ print("Submitting HF job for image build...")
156
+ print(f" Image: {args.image_name}:{args.tag}")
157
+ print(f" Target: {args.target}")
158
+ print(f" Repo: {args.git_repo}")
159
+ print(f" Dockerfile: {args.dockerfile}")
160
+ print(f" Flavor: {args.flavor}")
161
+ print(f" Timeout: {args.timeout}")
162
+ print()
163
+
164
+ job_info = run_job(
165
+ image="docker:24.0.7-dind",
166
+ command=command,
167
+ env=env,
168
+ secrets=secrets,
169
+ flavor=args.flavor,
170
+ timeout=args.timeout,
171
+ namespace=args.namespace,
172
+ )
173
+
174
+ print("✓ Job submitted")
175
+ print(f" Job ID: {job_info.id}")
176
+ print(f" Job URL: {job_info.url}")
177
+ print()
178
+ print("Monitor logs:")
179
+ print(f" hf jobs logs {job_info.id}")
180
+ print("Check status:")
181
+ print(f" hf jobs status {job_info.id}")
182
+
183
+
184
+ if __name__ == "__main__":
185
+ main()