Georg commited on
Commit
c58f0bb
·
1 Parent(s): a0f9c96

Update base image build and deps

Browse files
Files changed (4) hide show
  1. Dockerfile +19 -8
  2. Dockerfile.base +127 -48
  3. deploy.sh +122 -96
  4. scripts/run_hf_image_job.py +185 -0
Dockerfile CHANGED
@@ -1,24 +1,37 @@
1
  # Final stage Dockerfile - optimized for HuggingFace
2
- # Uses devel base image (includes CUDA compiler tools)
3
- FROM gpue/foundationpose-base:latest
4
 
5
  # FoundationPose configuration
6
  ENV FOUNDATIONPOSE_MODEL_REPO=gpue/foundationpose-weights
7
  ENV USE_REAL_MODEL=true
8
 
9
- # Ensure NumPy 1.x for CUDA extension compatibility
10
  RUN pip install --no-cache-dir "numpy<2" transformers==4.41.2 \
11
  && pip install --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu118_pyt210/download.html
12
 
13
  # Set MAX_JOBS=1 BEFORE any CUDA compilation to limit memory usage
14
  ENV MAX_JOBS=1
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  # Install nvdiffrast (CUDA rasterizer) - needs GPU, build here
17
  RUN git clone --depth 1 https://github.com/NVlabs/nvdiffrast.git /tmp/nvdiffrast \
18
  && cd /tmp/nvdiffrast \
19
  && python3 setup.py build_ext --inplace
20
  RUN python3 -c "import shutil, sysconfig, glob; from pathlib import Path; site=Path(sysconfig.get_paths()['purelib']); src=Path('/tmp/nvdiffrast/nvdiffrast'); dst=site/'nvdiffrast'; shutil.rmtree(dst, ignore_errors=True); shutil.copytree(src, dst); so_files=(glob.glob('/tmp/nvdiffrast/_nvdiffrast_c*.so') + glob.glob('/tmp/nvdiffrast/nvdiffrast/_nvdiffrast_c*.so') + glob.glob('/tmp/nvdiffrast/build/lib.*/*_nvdiffrast_c*.so')); [shutil.copy2(p, site) for p in so_files]"
21
- RUN python3 -c "import sysconfig; from pathlib import Path; site=Path(sysconfig.get_paths()['purelib']); dist=site/'nvdiffrast-0.0.0.dist-info'; dist.mkdir(exist_ok=True); (dist/'METADATA').write_text('Metadata-Version: 2.1\\nName: nvdiffrast\\nVersion: 0.0.0\\n'); (dist/'WHEEL').write_text('Wheel-Version: 1.0\\nGenerator: manual\\nRoot-Is-Purelib: false\\nTag: py3-none-any\\n'); (dist/'top_level.txt').write_text('nvdiffrast\\n'); (dist/'RECORD').write_text('')"
22
  RUN python3 -c "import nvdiffrast.torch"
23
  RUN rm -rf /tmp/nvdiffrast
24
 
@@ -26,10 +39,8 @@ RUN rm -rf /tmp/nvdiffrast
26
  WORKDIR /app/FoundationPose
27
  RUN cd bundlesdf/mycuda && pip install . --no-build-isolation
28
 
29
- # Note: mycpp build, weights download, and build deps are already in base image
30
  WORKDIR /app
31
-
32
- # Copy application files (placed here so changes don't require base image rebuild)
33
- COPY app.py client.py estimator.py masks.py ./
34
 
35
  CMD ["python3", "app.py"]
 
1
  # Final stage Dockerfile - optimized for HuggingFace
2
+ FROM gpue/foundationpose-base-l2:latest
 
3
 
4
  # FoundationPose configuration
5
  ENV FOUNDATIONPOSE_MODEL_REPO=gpue/foundationpose-weights
6
  ENV USE_REAL_MODEL=true
7
 
8
+ # Ensure NumPy 1.x for CUDA extension compatibility and install SAM/pytorch3d
9
  RUN pip install --no-cache-dir "numpy<2" transformers==4.41.2 \
10
  && pip install --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu118_pyt210/download.html
11
 
12
  # Set MAX_JOBS=1 BEFORE any CUDA compilation to limit memory usage
13
  ENV MAX_JOBS=1
14
 
15
+ # Clone FoundationPose source
16
+ RUN git clone --depth 1 https://github.com/NVlabs/FoundationPose.git /app/FoundationPose \
17
+ && cd /app/FoundationPose/bundlesdf/mycuda \
18
+ && sed -i 's/-std=c++14/-std=c++17/g' setup.py
19
+
20
+ # Build CPU-only C++ code
21
+ WORKDIR /app/FoundationPose
22
+ RUN cd mycpp && mkdir -p build && cd build && cmake .. && make
23
+
24
+ # Download model weights (246MB)
25
+ WORKDIR /app
26
+ COPY download_weights.py ./download_weights.py
27
+ RUN python3 download_weights.py
28
+
29
  # Install nvdiffrast (CUDA rasterizer) - needs GPU, build here
30
  RUN git clone --depth 1 https://github.com/NVlabs/nvdiffrast.git /tmp/nvdiffrast \
31
  && cd /tmp/nvdiffrast \
32
  && python3 setup.py build_ext --inplace
33
  RUN python3 -c "import shutil, sysconfig, glob; from pathlib import Path; site=Path(sysconfig.get_paths()['purelib']); src=Path('/tmp/nvdiffrast/nvdiffrast'); dst=site/'nvdiffrast'; shutil.rmtree(dst, ignore_errors=True); shutil.copytree(src, dst); so_files=(glob.glob('/tmp/nvdiffrast/_nvdiffrast_c*.so') + glob.glob('/tmp/nvdiffrast/nvdiffrast/_nvdiffrast_c*.so') + glob.glob('/tmp/nvdiffrast/build/lib.*/*_nvdiffrast_c*.so')); [shutil.copy2(p, site) for p in so_files]"
34
+ RUN python3 -c "import sysconfig; from pathlib import Path; site=Path(sysconfig.get_paths()['purelib']); dist=site/'nvdiffrast-0.0.0.dist-info'; dist.mkdir(exist_ok=True); (dist/'METADATA').write_text('Metadata-Version: 2.1\nName: nvdiffrast\nVersion: 0.0.0\n'); (dist/'WHEEL').write_text('Wheel-Version: 1.0\nGenerator: manual\nRoot-Is-Purelib: false\nTag: py3-none-any\n'); (dist/'top_level.txt').write_text('nvdiffrast\n'); (dist/'RECORD').write_text('')"
35
  RUN python3 -c "import nvdiffrast.torch"
36
  RUN rm -rf /tmp/nvdiffrast
37
 
 
39
  WORKDIR /app/FoundationPose
40
  RUN cd bundlesdf/mycuda && pip install . --no-build-isolation
41
 
42
+ # Copy application files
43
  WORKDIR /app
44
+ COPY app.py client.py estimator.py masks.py ./
 
 
45
 
46
  CMD ["python3", "app.py"]
Dockerfile.base CHANGED
@@ -1,5 +1,108 @@
1
- # Base image with CUDA compiler tools (needed for C++ extensions)
2
- FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  ENV DEBIAN_FRONTEND=noninteractive
5
  ENV CUDA_HOME=/usr/local/cuda
@@ -9,8 +112,7 @@ ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
9
  # Only build for T4 (7.5) - reduces compilation memory by 50%
10
  ENV TORCH_CUDA_ARCH_LIST="7.5"
11
 
12
- # Install minimal runtime dependencies
13
- # Remove problematic CUDA repo and install packages
14
  RUN rm -f /etc/apt/sources.list.d/cuda*.list /etc/apt/sources.list.d/*.list && \
15
  apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
16
  ca-certificates \
@@ -22,6 +124,16 @@ RUN rm -f /etc/apt/sources.list.d/cuda*.list /etc/apt/sources.list.d/*.list && \
22
  libgl1 \
23
  libglib2.0-0 \
24
  libgomp1 \
 
 
 
 
 
 
 
 
 
 
25
  && rm -rf /var/lib/apt/lists/* \
26
  && apt-get clean
27
 
@@ -32,23 +144,22 @@ RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
32
  # Upgrade pip
33
  RUN python3 -m pip install --no-cache-dir --upgrade pip
34
 
35
- WORKDIR /app
 
 
36
 
37
- # Install PyTorch (smallest CUDA 11.8 build)
38
- RUN pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118
39
 
40
- # Install only essential requirements
41
- # Pin NumPy to 1.x for CUDA extension compatibility
42
  RUN pip install --no-cache-dir \
43
- "numpy<2" \
44
- gradio>=4.0.0 \
45
- opencv-python-headless>=4.8.0 \
46
- Pillow>=10.0.0 \
47
- huggingface-hub>=0.20.0 \
48
  && pip cache purge
49
 
50
- # Install build dependencies (keep them for faster HuggingFace builds)
51
- # Install BEFORE nvdiffrast because it needs python3.10-dev
52
  RUN apt-get update && apt-get install -y --no-install-recommends \
53
  cmake \
54
  build-essential \
@@ -60,38 +171,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
60
  pybind11-dev \
61
  && rm -rf /var/lib/apt/lists/*
62
 
63
- # Install FoundationPose dependencies
64
- RUN pip install --no-cache-dir \
65
- trimesh==4.2.2 \
66
- scipy==1.12.0 \
67
- scikit-image==0.22.0 \
68
- kornia==0.7.2 \
69
- einops==0.7.0 \
70
- timm==0.9.16 \
71
- transformations==2024.6.1 \
72
- pyyaml==6.0.1 \
73
- joblib==1.4.0 \
74
- psutil==6.1.1 \
75
- open3d==0.18.0 \
76
- && pip cache purge
77
-
78
- # Note: nvdiffrast will be built in final Dockerfile on HuggingFace (needs GPU)
79
-
80
- # Clone FoundationPose
81
- RUN git clone --depth 1 https://github.com/NVlabs/FoundationPose.git /app/FoundationPose && \
82
- cd /app/FoundationPose/bundlesdf/mycuda && \
83
- sed -i 's/-std=c++14/-std=c++17/g' setup.py
84
-
85
- # Build mycpp (non-GPU C++ code - can be built without GPU)
86
- WORKDIR /app/FoundationPose
87
- RUN cd mycpp && mkdir -p build && cd build && cmake .. && make
88
-
89
- # Download model weights (246MB)
90
  WORKDIR /app
91
- RUN python3 -c "from huggingface_hub import snapshot_download; \
92
- snapshot_download(repo_id='gpue/foundationpose-weights', local_dir='weights', repo_type='model')"
93
-
94
- # Note: Application files (app.py, client.py, estimator.py) are copied in main Dockerfile
95
- # This allows updates without rebuilding the entire base image
96
 
97
  EXPOSE 7860
 
1
+ # Base image with FoundationPose dependencies split into CPU (L1) and GPU (L2)
2
+
3
+ # Stage 1: CPU-only base with Python deps
4
+ FROM ubuntu:22.04 AS foundationpose-base-l1
5
+
6
+ ENV DEBIAN_FRONTEND=noninteractive
7
+
8
+ # Install system deps needed to build/run python packages
9
+ RUN apt-get update && apt-get install -y --no-install-recommends \
10
+ ca-certificates \
11
+ git \
12
+ python3.10 \
13
+ python3-pip \
14
+ build-essential \
15
+ cmake \
16
+ ninja-build \
17
+ libeigen3-dev \
18
+ python3.10-dev \
19
+ libboost-system-dev \
20
+ libboost-program-options-dev \
21
+ pybind11-dev \
22
+ libgl1 \
23
+ libglib2.0-0 \
24
+ libgomp1 \
25
+ libsm6 \
26
+ libxext6 \
27
+ libxrender1 \
28
+ libxkbcommon0 \
29
+ libx11-6 \
30
+ libxrandr2 \
31
+ libxi6 \
32
+ libxinerama1 \
33
+ libxcursor1 \
34
+ libspatialindex-dev \
35
+ && rm -rf /var/lib/apt/lists/*
36
+
37
+ # Set python as default
38
+ RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \
39
+ update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1
40
+
41
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
42
+
43
+ # Core python deps (CPU-safe)
44
+ # Keep NumPy <2 for extension compatibility
45
+ RUN pip install --no-cache-dir \
46
+ "numpy<2" \
47
+ "Pillow>=10.0.0" \
48
+ "gradio>=4.0.0" \
49
+ "huggingface-hub>=0.20.0" \
50
+ scipy==1.12.0 \
51
+ scikit-image==0.22.0 \
52
+ scikit-learn==1.4.1.post1 \
53
+ kornia==0.7.2 \
54
+ einops==0.7.0 \
55
+ timm==0.9.16 \
56
+ pyyaml==6.0.1 \
57
+ ruamel.yaml==0.18.6 \
58
+ omegaconf==2.3.0 \
59
+ h5py==3.10.0 \
60
+ numba==0.59.1 \
61
+ imageio==2.34.0 \
62
+ joblib==1.3.2 \
63
+ psutil==6.1.1 \
64
+ albumentations==1.4.2 \
65
+ imgaug==0.4.0 \
66
+ seaborn==0.13.2 \
67
+ plotly==5.20.0 \
68
+ bokeh==3.4.0 \
69
+ colorama==0.4.6 \
70
+ GPUtil==1.4.0 \
71
+ simplejson==3.19.2 \
72
+ openpyxl==3.1.2 \
73
+ xlsxwriter==3.2.0 \
74
+ nodejs==0.1.1 \
75
+ jupyterlab==4.1.5 \
76
+ ipywidgets==8.1.2 \
77
+ py-spy==0.3.14 \
78
+ videoio==0.2.8 \
79
+ pypng==0.20220715.0 \
80
+ roma==1.4.4 \
81
+ transformations==2024.6.1 \
82
+ meshcat==0.3.2 \
83
+ webdataset==0.2.86 \
84
+ wandb==0.16.5 \
85
+ g4f==0.2.7.1 \
86
+ objaverse==0.1.7 \
87
+ opencv-python==4.9.0.80 \
88
+ opencv-contrib-python==4.9.0.80 \
89
+ open3d==0.18.0 \
90
+ pyglet==1.5.28 \
91
+ pysdf==0.1.9 \
92
+ trimesh==4.2.2 \
93
+ xatlas==0.0.9 \
94
+ rtree==1.2.0 \
95
+ pyrender==0.1.45 \
96
+ "pyOpenGL>=3.1.0" \
97
+ "pyOpenGL_accelerate>=3.1.0" \
98
+ pybullet==3.2.6 \
99
+ pycocotools==2.0.7 \
100
+ Panda3D==1.10.14 \
101
+ pin==2.7.0 \
102
+ && pip cache purge
103
+
104
+ # Stage 2: GPU-enabled base
105
+ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS foundationpose-base-l2
106
 
107
  ENV DEBIAN_FRONTEND=noninteractive
108
  ENV CUDA_HOME=/usr/local/cuda
 
112
  # Only build for T4 (7.5) - reduces compilation memory by 50%
113
  ENV TORCH_CUDA_ARCH_LIST="7.5"
114
 
115
+ # Install system deps
 
116
  RUN rm -f /etc/apt/sources.list.d/cuda*.list /etc/apt/sources.list.d/*.list && \
117
  apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
118
  ca-certificates \
 
124
  libgl1 \
125
  libglib2.0-0 \
126
  libgomp1 \
127
+ libsm6 \
128
+ libxext6 \
129
+ libxrender1 \
130
+ libxkbcommon0 \
131
+ libx11-6 \
132
+ libxrandr2 \
133
+ libxi6 \
134
+ libxinerama1 \
135
+ libxcursor1 \
136
+ libspatialindex-dev \
137
  && rm -rf /var/lib/apt/lists/* \
138
  && apt-get clean
139
 
 
144
  # Upgrade pip
145
  RUN python3 -m pip install --no-cache-dir --upgrade pip
146
 
147
+ # Copy CPU-only python deps from L1
148
+ COPY --from=foundationpose-base-l1 /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
149
+ COPY --from=foundationpose-base-l1 /usr/local/bin /usr/local/bin
150
 
151
+ # Install PyTorch (CUDA 11.8)
152
+ RUN pip install --no-cache-dir torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118
153
 
154
+ # GPU/torch-dependent deps
 
155
  RUN pip install --no-cache-dir \
156
+ fvcore==0.1.5.post20221221 \
157
+ torchnet==0.0.4 \
158
+ ultralytics==8.0.120 \
159
+ warp-lang==1.0.2 \
 
160
  && pip cache purge
161
 
162
+ # Build deps required for CUDA extensions
 
163
  RUN apt-get update && apt-get install -y --no-install-recommends \
164
  cmake \
165
  build-essential \
 
171
  pybind11-dev \
172
  && rm -rf /var/lib/apt/lists/*
173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  WORKDIR /app
 
 
 
 
 
175
 
176
  EXPOSE 7860
deploy.sh CHANGED
@@ -3,58 +3,63 @@
3
 
4
  set -e
5
 
6
- IMAGE_NAME="gpue/foundationpose-base"
7
  TAG="latest"
8
  PLATFORM="linux/amd64"
9
  HF_SPACE="gpue/foundationpose"
10
- HF_TOKEN_FILE="../training/.env.local"
11
 
12
  echo "==================================="
13
  echo "FoundationPose Deployment"
14
  echo "==================================="
15
  echo ""
16
 
17
- # Stage 1: Build and push base image
18
- echo "Stage 1: Building base image"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  echo "Platform: ${PLATFORM}"
20
  echo "Image: ${IMAGE_NAME}:${TAG}"
21
  echo ""
22
 
23
- # Check Docker login (prefer token if provided)
24
- if [ -n "${DOCKER_HF_PAT}" ]; then
25
- DOCKER_USER="${DOCKER_HF_USER:-gpue}"
26
- echo "${DOCKER_HF_PAT}" | docker login -u "${DOCKER_USER}" --password-stdin
27
- echo " DockerHub authentication verified (token)"
 
 
 
 
 
 
28
  else
29
- if [ ! -f ~/.docker/config.json ] || ! grep -q "index.docker.io" ~/.docker/config.json 2>/dev/null; then
30
- echo "Error: Not logged in to DockerHub"
31
- echo "Please run: docker login or set DOCKER_HF_PAT"
32
- exit 1
 
 
 
 
33
  fi
34
- echo "✓ DockerHub authentication verified"
35
  fi
36
- echo ""
37
-
38
- echo "Building base image..."
39
- docker build --platform ${PLATFORM} -f Dockerfile.base -t ${IMAGE_NAME}:${TAG} .
40
-
41
- echo ""
42
- echo "✓ Base image built successfully"
43
- echo ""
44
-
45
- # Show image size
46
- IMAGE_SIZE=$(docker images ${IMAGE_NAME}:${TAG} --format "{{.Size}}")
47
- echo "Image size: ${IMAGE_SIZE}"
48
- echo ""
49
-
50
- echo "Pushing to DockerHub..."
51
- docker push ${IMAGE_NAME}:${TAG}
52
 
53
  echo ""
54
- echo "✓ Base image pushed to DockerHub: ${IMAGE_NAME}:${TAG}"
55
- echo ""
56
-
57
- # Stage 2: Deploy to HuggingFace
58
  echo "Stage 2: Deploying to HuggingFace Space"
59
  echo ""
60
 
@@ -62,7 +67,7 @@ echo ""
62
  if [ ! -d .git ]; then
63
  echo "Initializing git repository..."
64
  git init
65
- git remote add origin https://huggingface.co/spaces/${HF_SPACE}
66
  echo "✓ Git repository initialized"
67
  echo ""
68
  fi
@@ -70,8 +75,8 @@ fi
70
  # Check if there are changes to commit
71
  if [[ -n $(git status -s) ]]; then
72
  echo "Committing changes..."
73
- git add Dockerfile Dockerfile.base requirements.txt deploy.sh app.py client.py estimator.py masks.py
74
- git commit -m "Optimized Docker build to fix OOM errors"
75
  echo "✓ Changes committed"
76
  else
77
  echo "No changes to commit"
@@ -80,18 +85,16 @@ fi
80
  # Push to HuggingFace
81
  echo ""
82
  echo "Pushing to HuggingFace Space: ${HF_SPACE}"
83
- git push https://huggingface.co/spaces/${HF_SPACE} main --force
84
 
85
  echo ""
86
  echo "✓ Pushed to HuggingFace"
87
  echo ""
88
  echo "HuggingFace will now:"
89
  echo " 1. Pull base image from DockerHub (${IMAGE_NAME}:${TAG})"
90
- echo " 2. Install build tools temporarily"
91
- echo " 3. Build C++ extensions with GPU"
92
- echo " 4. Remove build tools"
93
- echo " 5. Download model weights (246MB)"
94
- echo " 6. Start the Gradio app"
95
  echo ""
96
 
97
  # Follow build logs
@@ -99,67 +102,90 @@ echo "Following build logs..."
99
  echo "Press Ctrl+C to stop watching"
100
  echo ""
101
 
102
- # Load HF token
103
- if [ -f "${HF_TOKEN_FILE}" ]; then
104
- HF_TOKEN=$(grep "^HUGGINGFACE_TOKEN=" "${HF_TOKEN_FILE}" | cut -d'=' -f2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
- if [ -n "${HF_TOKEN}" ]; then
107
- curl -N -H "Authorization: Bearer ${HF_TOKEN}" \
108
- "https://huggingface.co/api/spaces/${HF_SPACE}/logs/build" 2>/dev/null | \
109
- while IFS= read -r line; do
110
- # Parse JSON and extract data field
111
- echo "$line" | grep -o '"data":"[^"]*"' | sed 's/"data":"//;s/"$//' | sed 's/\\n/\n/g'
112
- done
113
 
 
 
 
 
114
  echo ""
115
- echo "===================================="
116
- echo "Build Status Check"
117
- echo "===================================="
118
  echo ""
119
-
120
- # Wait a moment for status to update
121
- sleep 2
122
-
123
- # Check final build status
124
- STATUS_JSON=$(curl -s -H "Authorization: Bearer ${HF_TOKEN}" \
125
- "https://huggingface.co/api/spaces/${HF_SPACE}")
126
-
127
- STAGE=$(echo "$STATUS_JSON" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('runtime', {}).get('stage', 'UNKNOWN'))" 2>/dev/null)
128
- ERROR_MSG=$(echo "$STATUS_JSON" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('runtime', {}).get('errorMessage', ''))" 2>/dev/null)
129
-
130
- echo "Final Status: ${STAGE}"
131
-
132
- if [ "${STAGE}" = "RUNNING" ]; then
133
- echo "✓ Deployment successful!"
134
- echo ""
135
- echo "Space URL: https://${HF_SPACE/\//-}.hf.space"
136
- echo "API URL: https://${HF_SPACE/\//-}.hf.space/gradio_api/info"
137
- echo ""
138
- echo "Test with: cd ../training && make test-perception-api"
139
- elif [ "${STAGE}" = "BUILD_ERROR" ]; then
140
- echo "✗ Build failed!"
141
- if [ -n "${ERROR_MSG}" ]; then
142
- echo "Error: ${ERROR_MSG}"
143
- fi
144
- echo ""
145
- echo "If still getting OOM errors, consider:"
146
- echo " - Moving weights to runtime download (not build time)"
147
- echo " - Requesting larger build instance from HuggingFace"
148
- echo " - Using only CUDA arch 7.5 (T4 only)"
149
- exit 1
150
- else
151
- echo "Status: ${STAGE}"
152
- if [ -n "${ERROR_MSG}" ]; then
153
- echo "Message: ${ERROR_MSG}"
154
- fi
155
  fi
 
 
 
 
 
 
156
  else
157
- echo "Warning: HF_TOKEN not found in ${HF_TOKEN_FILE}"
158
- echo "To follow logs manually:"
159
- echo " curl -N -H \"Authorization: Bearer \$HF_TOKEN\" \"https://huggingface.co/api/spaces/${HF_SPACE}/logs/build\""
 
160
  fi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  else
162
- echo "Warning: ${HF_TOKEN_FILE} not found"
163
  echo "To follow logs manually:"
164
  echo " curl -N -H \"Authorization: Bearer \$HF_TOKEN\" \"https://huggingface.co/api/spaces/${HF_SPACE}/logs/build\""
165
  fi
 
3
 
4
  set -e
5
 
6
+ IMAGE_NAME="gpue/foundationpose-base-l2"
7
  TAG="latest"
8
  PLATFORM="linux/amd64"
9
  HF_SPACE="gpue/foundationpose"
10
+ ENV_FILE=".env"
11
 
12
  echo "==================================="
13
  echo "FoundationPose Deployment"
14
  echo "==================================="
15
  echo ""
16
 
17
+ # Load tokens from .env
18
+ if [ -f "${ENV_FILE}" ]; then
19
+ set -a
20
+ # shellcheck disable=SC1090
21
+ source "${ENV_FILE}"
22
+ set +a
23
+ else
24
+ echo "Warning: ${ENV_FILE} not found"
25
+ fi
26
+
27
+ # Ensure hf CLI is available for job logs
28
+ if ! command -v hf >/dev/null 2>&1; then
29
+ echo "Installing huggingface_hub CLI (hf)..."
30
+ python3 -m pip install --user --quiet huggingface_hub
31
+ export PATH="$HOME/.local/bin:$PATH"
32
+ fi
33
+
34
+ echo "Stage 1: Building base image via HF Job"
35
  echo "Platform: ${PLATFORM}"
36
  echo "Image: ${IMAGE_NAME}:${TAG}"
37
  echo ""
38
 
39
+ JOB_OUTPUT=$(python3 scripts/run_hf_image_job.py \
40
+ --image-name "${IMAGE_NAME}" \
41
+ --tag "${TAG}" \
42
+ --platform "${PLATFORM}" \
43
+ --dockerfile "Dockerfile.base" \
44
+ --target "foundationpose-base-l2" \
45
+ --git-repo "https://huggingface.co/spaces/${HF_SPACE}" 2>&1 | tee /tmp/hf_image_job.log)
46
+
47
+ JOB_ID=$(echo "${JOB_OUTPUT}" | awk '/Job ID:/ {print $3}')
48
+ if [ -z "${JOB_ID}" ]; then
49
+ echo "Warning: Could not parse HF job id. See /tmp/hf_image_job.log"
50
  else
51
+ echo "Following job logs for 1 minute..."
52
+ if command -v hf >/dev/null 2>&1; then
53
+ (timeout 60 hf jobs logs "${JOB_ID}") || true
54
+ echo ""
55
+ echo "Job status:"
56
+ hf jobs status "${JOB_ID}" || true
57
+ else
58
+ echo "hf CLI not available; job logs skipped"
59
  fi
 
60
  fi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  echo ""
 
 
 
 
63
  echo "Stage 2: Deploying to HuggingFace Space"
64
  echo ""
65
 
 
67
  if [ ! -d .git ]; then
68
  echo "Initializing git repository..."
69
  git init
70
+ git remote add origin "https://huggingface.co/spaces/${HF_SPACE}"
71
  echo "✓ Git repository initialized"
72
  echo ""
73
  fi
 
75
  # Check if there are changes to commit
76
  if [[ -n $(git status -s) ]]; then
77
  echo "Committing changes..."
78
+ git add Dockerfile Dockerfile.base requirements.txt deploy.sh app.py client.py estimator.py masks.py scripts/run_hf_image_job.py download_weights.py
79
+ git commit -m "Update base image build and deps"
80
  echo "✓ Changes committed"
81
  else
82
  echo "No changes to commit"
 
85
  # Push to HuggingFace
86
  echo ""
87
  echo "Pushing to HuggingFace Space: ${HF_SPACE}"
88
+ git push "https://huggingface.co/spaces/${HF_SPACE}" main --force
89
 
90
  echo ""
91
  echo "✓ Pushed to HuggingFace"
92
  echo ""
93
  echo "HuggingFace will now:"
94
  echo " 1. Pull base image from DockerHub (${IMAGE_NAME}:${TAG})"
95
+ echo " 2. Build CUDA extensions"
96
+ echo " 3. Download model weights"
97
+ echo " 4. Start the Gradio app"
 
 
98
  echo ""
99
 
100
  # Follow build logs
 
102
  echo "Press Ctrl+C to stop watching"
103
  echo ""
104
 
105
+ export HF_TOKEN="${HUGGINGFACE_TOKEN:-${HF_TOKEN:-}}"
106
+
107
+ if [ -n "${HF_TOKEN}" ]; then
108
+ curl -N -H "Authorization: Bearer ${HF_TOKEN}" \
109
+ "https://huggingface.co/api/spaces/${HF_SPACE}/logs/build" 2>/dev/null | \
110
+ while IFS= read -r line; do
111
+ echo "$line" | grep -o '"data":"[^"]*"' | sed 's/"data":"//;s/"$//' | sed 's/\\n/\n/g'
112
+ done
113
+
114
+ echo ""
115
+ echo "===================================="
116
+ echo "Build Status Check"
117
+ echo "===================================="
118
+ echo ""
119
+
120
+ # Wait a moment for status to update
121
+ sleep 2
122
+
123
+ # Check final build status
124
+ STATUS_JSON=$(curl -s -H "Authorization: Bearer ${HF_TOKEN}" \
125
+ "https://huggingface.co/api/spaces/${HF_SPACE}")
126
 
127
+ STAGE=$(echo "$STATUS_JSON" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('runtime', {}).get('stage', 'UNKNOWN'))" 2>/dev/null)
128
+ ERROR_MSG=$(echo "$STATUS_JSON" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('runtime', {}).get('errorMessage', ''))" 2>/dev/null)
 
 
 
 
 
129
 
130
+ echo "Final Status: ${STAGE}"
131
+
132
+ if [ "${STAGE}" = "RUNNING" ]; then
133
+ echo "✓ Deployment successful!"
134
  echo ""
135
+ echo "Space URL: https://${HF_SPACE/\//-}.hf.space"
136
+ echo "API URL: https://${HF_SPACE/\//-}.hf.space/gradio_api/info"
 
137
  echo ""
138
+ echo "Test with: cd ../training && make test-perception-api"
139
+ elif [ "${STAGE}" = "BUILD_ERROR" ]; then
140
+ echo "✗ Build failed!"
141
+ if [ -n "${ERROR_MSG}" ]; then
142
+ echo "Error: ${ERROR_MSG}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  fi
144
+ echo ""
145
+ echo "If still getting OOM errors, consider:"
146
+ echo " - Moving weights to runtime download (not build time)"
147
+ echo " - Requesting larger build instance from HuggingFace"
148
+ echo " - Using only CUDA arch 7.5 (T4 only)"
149
+ exit 1
150
  else
151
+ echo "Status: ${STAGE}"
152
+ if [ -n "${ERROR_MSG}" ]; then
153
+ echo "Message: ${ERROR_MSG}"
154
+ fi
155
  fi
156
+
157
+ echo ""
158
+ echo "Following application logs for 1 minute..."
159
+ export LOG_URL="https://huggingface.co/api/spaces/${HF_SPACE}/logs"
160
+ python3 - <<'PY'
161
+ import os
162
+ import subprocess
163
+ import sys
164
+ import time
165
+
166
+ log_url = os.environ.get("LOG_URL")
167
+ token = os.environ.get("HF_TOKEN")
168
+ if not log_url or not token:
169
+ print("Skipping app logs: missing LOG_URL or HF_TOKEN")
170
+ raise SystemExit(0)
171
+
172
+ proc = subprocess.Popen(
173
+ ["curl", "-N", "-H", f"Authorization: Bearer {token}", log_url],
174
+ stdout=sys.stdout,
175
+ stderr=subprocess.DEVNULL,
176
+ )
177
+ try:
178
+ time.sleep(60)
179
+ finally:
180
+ proc.terminate()
181
+ try:
182
+ proc.wait(timeout=5)
183
+ except Exception:
184
+ proc.kill()
185
+ PY
186
+
187
  else
188
+ echo "Warning: HF token not available; cannot follow logs"
189
  echo "To follow logs manually:"
190
  echo " curl -N -H \"Authorization: Bearer \$HF_TOKEN\" \"https://huggingface.co/api/spaces/${HF_SPACE}/logs/build\""
191
  fi
scripts/run_hf_image_job.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Submit a HuggingFace Job that builds the FoundationPose base image and pushes it to Docker Hub.
4
+ """
5
+
6
+ import argparse
7
+ import os
8
+ import sys
9
+
10
+ from huggingface_hub import run_job
11
+
12
+
13
+ def main() -> None:
14
+ parser = argparse.ArgumentParser(
15
+ description="Build and push the FoundationPose base image via HuggingFace Jobs"
16
+ )
17
+ parser.add_argument(
18
+ "--image-name",
19
+ default="gpue/foundationpose-base-l2",
20
+ help="Docker Hub image name (default: gpue/foundationpose-base-l2)",
21
+ )
22
+ parser.add_argument(
23
+ "--tag",
24
+ default="latest",
25
+ help="Docker image tag (default: latest)",
26
+ )
27
+ parser.add_argument(
28
+ "--platform",
29
+ default="linux/amd64",
30
+ help="Target platform for docker build (default: linux/amd64)",
31
+ )
32
+ parser.add_argument(
33
+ "--dockerfile",
34
+ default="Dockerfile.base",
35
+ help="Dockerfile path inside repo (default: Dockerfile.base)",
36
+ )
37
+ parser.add_argument(
38
+ "--context",
39
+ default=".",
40
+ help="Docker build context path inside repo (default: .)",
41
+ )
42
+ parser.add_argument(
43
+ "--target",
44
+ default="foundationpose-base-l2",
45
+ help="Docker build target (default: foundationpose-base-l2)",
46
+ )
47
+ parser.add_argument(
48
+ "--git-repo",
49
+ default="https://huggingface.co/spaces/gpue/foundationpose",
50
+ help="Git repo to clone for build context (default: HF space repo)",
51
+ )
52
+ parser.add_argument(
53
+ "--flavor",
54
+ default="l40s",
55
+ help="HF Jobs hardware flavor (default: l40s)",
56
+ )
57
+ parser.add_argument(
58
+ "--timeout",
59
+ default="2h",
60
+ help="Job timeout (default: 2h)",
61
+ )
62
+ parser.add_argument("--namespace", help="Organization namespace (optional)")
63
+ parser.add_argument(
64
+ "--hf-token",
65
+ help="HuggingFace token (default: from HF_TOKEN or HUGGINGFACE_TOKEN env)",
66
+ )
67
+ parser.add_argument(
68
+ "--docker-user",
69
+ default=os.getenv("DOCKER_HF_USER", "gpue"),
70
+ help="Docker Hub username (default: DOCKER_HF_USER or gpue)",
71
+ )
72
+ parser.add_argument(
73
+ "--docker-token",
74
+ help="Docker Hub token (default: from DOCKER_HF_PAT env)",
75
+ )
76
+
77
+ args = parser.parse_args()
78
+
79
+ hf_token = args.hf_token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
80
+ docker_token = args.docker_token or os.getenv("DOCKER_HF_PAT")
81
+
82
+ if not hf_token:
83
+ print("Error: missing HF token (set HF_TOKEN or HUGGINGFACE_TOKEN)")
84
+ sys.exit(1)
85
+ if not docker_token:
86
+ print("Error: missing Docker token (set DOCKER_HF_PAT or --docker-token)")
87
+ sys.exit(1)
88
+
89
+ env = {
90
+ "IMAGE_NAME": args.image_name,
91
+ "IMAGE_TAG": args.tag,
92
+ "PLATFORM": args.platform,
93
+ "DOCKERFILE": args.dockerfile,
94
+ "CONTEXT": args.context,
95
+ "TARGET": args.target,
96
+ "GIT_REPO": args.git_repo,
97
+ "DOCKER_USER": args.docker_user,
98
+ }
99
+ secrets = {
100
+ "HF_TOKEN": hf_token,
101
+ "DOCKER_TOKEN": docker_token,
102
+ }
103
+
104
+ command = [
105
+ "sh",
106
+ "-c",
107
+ r"""
108
+ set -euo pipefail
109
+
110
+ echo "Installing git and certificates..."
111
+ apk add --no-cache git ca-certificates curl >/dev/null
112
+
113
+ # Start Docker daemon (DinD image)
114
+ echo "Starting Docker daemon..."
115
+ dockerd-entrypoint.sh > /tmp/dockerd.log 2>&1 &
116
+
117
+ # Wait for Docker
118
+ for i in $(seq 1 30); do
119
+ if docker info >/dev/null 2>&1; then
120
+ break
121
+ fi
122
+ sleep 1
123
+ if [ "$i" -eq 30 ]; then
124
+ echo "Docker did not start in time. Logs:" >&2
125
+ tail -n 200 /tmp/dockerd.log >&2 || true
126
+ exit 1
127
+ fi
128
+ done
129
+
130
+ echo "Cloning build context..."
131
+ if [ -n "${HF_TOKEN:-}" ]; then
132
+ AUTH_REPO=$(echo "$GIT_REPO" | sed -e "s#https://#https://user:${HF_TOKEN}@#")
133
+ git clone --depth 1 "$AUTH_REPO" /work/repo
134
+ else
135
+ git clone --depth 1 "$GIT_REPO" /work/repo
136
+ fi
137
+
138
+ cd /work/repo
139
+
140
+ echo "Logging in to Docker Hub..."
141
+ echo "$DOCKER_TOKEN" | docker login -u "$DOCKER_USER" --password-stdin
142
+
143
+ IMAGE_REF="$IMAGE_NAME:$IMAGE_TAG"
144
+
145
+ echo "Building image $IMAGE_REF (target: $TARGET)..."
146
+ docker build --platform "$PLATFORM" -f "$DOCKERFILE" --target "$TARGET" -t "$IMAGE_REF" "$CONTEXT"
147
+
148
+ echo "Pushing image $IMAGE_REF..."
149
+ docker push "$IMAGE_REF"
150
+
151
+ echo "✓ Image pushed successfully"
152
+ """,
153
+ ]
154
+
155
+ print("Submitting HF job for image build...")
156
+ print(f" Image: {args.image_name}:{args.tag}")
157
+ print(f" Target: {args.target}")
158
+ print(f" Repo: {args.git_repo}")
159
+ print(f" Dockerfile: {args.dockerfile}")
160
+ print(f" Flavor: {args.flavor}")
161
+ print(f" Timeout: {args.timeout}")
162
+ print()
163
+
164
+ job_info = run_job(
165
+ image="docker:24.0.7-dind",
166
+ command=command,
167
+ env=env,
168
+ secrets=secrets,
169
+ flavor=args.flavor,
170
+ timeout=args.timeout,
171
+ namespace=args.namespace,
172
+ )
173
+
174
+ print("✓ Job submitted")
175
+ print(f" Job ID: {job_info.id}")
176
+ print(f" Job URL: {job_info.url}")
177
+ print()
178
+ print("Monitor logs:")
179
+ print(f" hf jobs logs {job_info.id}")
180
+ print("Check status:")
181
+ print(f" hf jobs status {job_info.id}")
182
+
183
+
184
+ if __name__ == "__main__":
185
+ main()