# foundationpose / deploy.sh
# (scraped page header — commit "Prepare job build context", dde1d40, author Georg;
#  kept as comments so these lines are not executed as commands)
#!/bin/bash
# FoundationPose deployment script (optimized for HuggingFace)
set -e
if [ -z "${BASH_VERSION:-}" ]; then
exec /bin/bash "$0" "$@"
fi
IMAGE_NAME_L1="gpue/foundationpose-base-l1"
IMAGE_NAME_L2="gpue/foundationpose-base-l2"
TAG="latest"
PLATFORM="linux/amd64"
HF_SPACE="gpue/foundationpose"
ENV_FILE=".env"
exec > >(tee -a deploy.logs) 2>&1
# Banner.
printf '%s\n' "===================================" "FoundationPose Deployment" "===================================" ""
# Load tokens (e.g. HF_TOKEN / HUGGINGFACE_TOKEN) from the .env file; `set -a`
# exports every variable the file defines.
if [[ -f "${ENV_FILE}" ]]; then
  set -a
  # shellcheck disable=SC1090
  . "${ENV_FILE}"
  set +a
else
  printf 'Warning: %s not found\n' "${ENV_FILE}"
fi
# Bootstrap a local throwaway virtualenv providing huggingface_hub and the
# `hf` CLI used to launch and monitor the image-build jobs.
VENV_DIR=".deploy-venv"
PY_BIN="${VENV_DIR}/bin/python3"
HF_BIN="${VENV_DIR}/bin/hf"
[ -x "${PY_BIN}" ] || {
  echo "Creating deploy venv at ${VENV_DIR}..."
  python3 -m venv "${VENV_DIR}"
}
"${PY_BIN}" -c "import huggingface_hub" >/dev/null 2>&1 || {
  echo "Installing huggingface_hub in deploy venv..."
  "${PY_BIN}" -m pip install --quiet huggingface_hub
}
# Hash helper for build gating: prints one SHA-256 digest covering the names
# and contents of every file passed as an argument.  Stage builds are skipped
# when the digest matches the one recorded for the previous successful build.
hash_files() {
"${PY_BIN}" - <<'PY' "$@"
import hashlib
import sys
from pathlib import Path

hasher = hashlib.sha256()
for path in (Path(p) for p in sys.argv[1:]):
    # Feed "name NUL content NUL" per file so the (path, content) boundaries
    # are unambiguous — a real NUL byte cannot occur in either component.
    # NOTE: this previously wrote b"\\0" (a literal backslash + '0'), which
    # was not the intended NUL separator.
    hasher.update(path.as_posix().encode("utf-8"))
    hasher.update(b"\0")
    hasher.update(path.read_bytes())
    hasher.update(b"\0")
print(hasher.hexdigest())
PY
}
# Build gating: compare the current input digests against the ones recorded
# after the last successful builds and skip a stage when nothing changed.
mkdir -p .deploy
L1_INPUTS=(Dockerfile.base)
L2_INPUTS=(Dockerfile.base download_weights.py)
L1_HASH="$(hash_files "${L1_INPUTS[@]}")"
L2_HASH="$(hash_files "${L2_INPUTS[@]}")"
LAST_L1_HASH_FILE=".deploy/last_l1.sha"
LAST_L2_HASH_FILE=".deploy/last_l2.sha"
SKIP_L1=0
SKIP_L2=0
# True when the recorded digest file exists and matches the current digest.
digest_unchanged() {
  [ -f "$1" ] && [ "$(cat "$1")" = "$2" ]
}
if digest_unchanged "${LAST_L1_HASH_FILE}" "${L1_HASH}"; then
  SKIP_L1=1
  echo "L1 inputs unchanged; skipping L1 image job."
fi
if digest_unchanged "${LAST_L2_HASH_FILE}" "${L2_HASH}"; then
  SKIP_L2=1
  echo "L2 inputs unchanged; skipping L2 image job."
fi
# Initialize git repo if needed (for job context).
if [[ ! -d .git ]]; then
  echo "Initializing git repository..."
  git init
  git remote add origin "https://huggingface.co/spaces/${HF_SPACE}"
  echo "✓ Git repository initialized"
  echo ""
fi
# Commit local changes before launching the job so it builds the right ref.
if [[ -n "$(git status -s)" ]]; then
  echo "Committing changes for job context..."
  JOB_CONTEXT_FILES=(
    Dockerfile Dockerfile.base requirements.txt deploy.sh app.py client.py
    estimator.py masks.py scripts/run_hf_image_job.py download_weights.py
  )
  git add "${JOB_CONTEXT_FILES[@]}"
  if ! git diff --cached --quiet; then
    git commit -m "Prepare job build context"
    echo "✓ Job context committed"
  else
    echo "No staged changes for job context"
  fi
fi
JOB_REF=""
if [[ "${SKIP_L1}" -eq 0 || "${SKIP_L2}" -eq 0 ]]; then
  # Temporary ref that the image-build job checks out.
  JOB_REF="job-build-$(date +%Y%m%d-%H%M%S)"
  echo "Pushing job ref: ${JOB_REF}"
  git push "https://huggingface.co/spaces/${HF_SPACE}" "HEAD:${JOB_REF}" --force
  echo "✓ Job ref pushed"
  echo ""
fi
# Stage 1: build the L1 base image remotely via a HuggingFace Job, then poll
# the job until it reaches a terminal stage.  Skipped entirely when the L1
# input digest matched the last recorded successful build.
if [ "${SKIP_L1}" -eq 0 ]; then
  echo "Stage 1: Building L1 base image via HF Job"
  echo "Platform: ${PLATFORM}"
  echo "Image: ${IMAGE_NAME_L1}:${TAG}"
  echo ""
  # Launch the image-build job; capture stdout+stderr so the job id can be
  # parsed out, and mirror everything to a log file for debugging.
  JOB_OUTPUT=$("${PY_BIN}" scripts/run_hf_image_job.py \
    --image-name "${IMAGE_NAME_L1}" \
    --tag "${TAG}" \
    --platform "${PLATFORM}" \
    --dockerfile "Dockerfile.base" \
    --target "foundationpose-base-l1" \
    --flavor "l40sx1" \
    --git-repo "https://huggingface.co/spaces/${HF_SPACE}" \
    --git-ref "${JOB_REF}" 2>&1 | tee /tmp/hf_image_job.log)
  # The launcher is expected to print a line like "Job ID: <id>".
  JOB_ID=$(echo "${JOB_OUTPUT}" | awk '/Job ID:/ {print $3}')
  if [ -z "${JOB_ID}" ]; then
    echo "Warning: Could not parse HF job id. See /tmp/hf_image_job.log"
  else
    echo "Following job logs until completion..."
    if [ -x "${HF_BIN}" ]; then
      # Stream `hf jobs logs` in the background while polling `hf jobs
      # inspect` every 15s; stop streaming once a terminal stage is seen.
      HF_BIN_PATH="${HF_BIN}" JOB_ID="${JOB_ID}" "${PY_BIN}" - <<'PY'
import json
import os
import subprocess
import sys
import time

hf = os.environ["HF_BIN_PATH"]
job_id = os.environ["JOB_ID"]
# Log streamer writes straight through to this script's stdout/stderr.
log_proc = subprocess.Popen([hf, "jobs", "logs", job_id], stdout=sys.stdout, stderr=sys.stderr)
try:
    while True:
        inspect = subprocess.run([hf, "jobs", "inspect", job_id], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True)
        if inspect.returncode == 0:
            try:
                data = json.loads(inspect.stdout)[0]
                stage = (data.get("status") or {}).get("stage", "UNKNOWN")
            except Exception:
                stage = "UNKNOWN"
        else:
            stage = "UNKNOWN"
        # Terminal stages; spellings vary across API versions, so accept all.
        if stage in {"SUCCESS","SUCCEEDED","COMPLETED","DONE","FAILED","ERROR","CANCELED","CANCELLED"}:
            break
        time.sleep(15)
finally:
    log_proc.terminate()
    try:
        log_proc.wait(timeout=5)
    except Exception:
        log_proc.kill()
PY
      echo ""
      echo "Job status:"
      "${HF_BIN}" jobs inspect "${JOB_ID}" || true
    else
      echo "hf CLI not available; job logs skipped"
    fi
  fi
  # Second poll loop: wait (up to 40 x 30s = 20 min) for a terminal stage and
  # record the input digest only after a confirmed success, so a failed build
  # is retried on the next run.
  if [ -n "${JOB_ID}" ] && [ -x "${HF_BIN}" ]; then
    echo ""
    echo "Waiting for L1 image build job to complete..."
    for i in $(seq 1 40); do
      JOB_STAGE=$("${HF_BIN}" jobs inspect "${JOB_ID}" | python3 -c "import sys, json; data=json.load(sys.stdin)[0]; print(data.get('status', {}).get('stage', 'UNKNOWN'))" 2>/dev/null || echo "UNKNOWN")
      echo " Job stage: ${JOB_STAGE}"
      if [[ "${JOB_STAGE}" =~ ^(SUCCESS|SUCCEEDED|COMPLETED|DONE)$ ]]; then
        echo "✓ L1 image build job completed"
        echo "${L1_HASH}" > "${LAST_L1_HASH_FILE}"
        break
      elif [[ "${JOB_STAGE}" =~ ^(FAILED|ERROR|CANCELED|CANCELLED)$ ]]; then
        echo "✗ Image build job failed: ${JOB_STAGE}"
        exit 1
      else
        sleep 30
      fi
    done
  fi
fi
echo ""
# Stage 2: build the L2 base image via a HuggingFace Job, mirroring the
# Stage 1 flow.  Skipped when the L2 input digest matched the last build.
if [ "${SKIP_L2}" -eq 0 ]; then
  echo "Stage 2: Building L2 base image via HF Job"
  echo ""
  # Launch the image-build job; mirror launcher output to a log file so the
  # job id can be recovered if parsing fails.
  JOB_OUTPUT_L2=$("${PY_BIN}" scripts/run_hf_image_job.py \
    --image-name "${IMAGE_NAME_L2}" \
    --tag "${TAG}" \
    --platform "${PLATFORM}" \
    --dockerfile "Dockerfile.base" \
    --target "foundationpose-base-l2" \
    --flavor "l40sx1" \
    --git-repo "https://huggingface.co/spaces/${HF_SPACE}" \
    --git-ref "${JOB_REF}" 2>&1 | tee /tmp/hf_image_job_l2.log)
  # The launcher is expected to print a line like "Job ID: <id>".
  JOB_ID_L2=$(echo "${JOB_OUTPUT_L2}" | awk '/Job ID:/ {print $3}')
  if [ -z "${JOB_ID_L2}" ]; then
    echo "Warning: Could not parse HF job id for L2. See /tmp/hf_image_job_l2.log"
  else
    echo "Following L2 job logs until completion..."
    if [ -x "${HF_BIN}" ]; then
      # Stream job logs in the background while polling the job stage every
      # 15s; stop streaming once a terminal stage is reported.
      HF_BIN_PATH="${HF_BIN}" JOB_ID="${JOB_ID_L2}" "${PY_BIN}" - <<'PY'
import json
import os
import subprocess
import sys
import time

hf = os.environ["HF_BIN_PATH"]
job_id = os.environ["JOB_ID"]
# Log streamer writes straight through to this script's stdout/stderr.
log_proc = subprocess.Popen([hf, "jobs", "logs", job_id], stdout=sys.stdout, stderr=sys.stderr)
try:
    while True:
        inspect = subprocess.run([hf, "jobs", "inspect", job_id], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True)
        if inspect.returncode == 0:
            try:
                data = json.loads(inspect.stdout)[0]
                stage = (data.get("status") or {}).get("stage", "UNKNOWN")
            except Exception:
                stage = "UNKNOWN"
        else:
            stage = "UNKNOWN"
        # Terminal stages; spellings vary across API versions, so accept all.
        if stage in {"SUCCESS","SUCCEEDED","COMPLETED","DONE","FAILED","ERROR","CANCELED","CANCELLED"}:
            break
        time.sleep(15)
finally:
    log_proc.terminate()
    try:
        log_proc.wait(timeout=5)
    except Exception:
        log_proc.kill()
PY
      echo ""
      echo "L2 job status:"
      "${HF_BIN}" jobs inspect "${JOB_ID_L2}" || true
    else
      echo "hf CLI not available; job logs skipped"
    fi
  fi
  # Poll (up to 60 x 30s = 30 min) for a terminal stage; the input digest is
  # recorded only after a confirmed success so failed builds retry next run.
  if [ -n "${JOB_ID_L2}" ] && [ -x "${HF_BIN}" ]; then
    echo ""
    echo "Waiting for L2 image build job to complete..."
    for i in $(seq 1 60); do
      JOB_STAGE=$("${HF_BIN}" jobs inspect "${JOB_ID_L2}" | python3 -c "import sys, json; data=json.load(sys.stdin)[0]; print(data.get('status', {}).get('stage', 'UNKNOWN'))" 2>/dev/null || echo "UNKNOWN")
      echo " Job stage: ${JOB_STAGE}"
      if [[ "${JOB_STAGE}" =~ ^(SUCCESS|SUCCEEDED|COMPLETED|DONE)$ ]]; then
        echo "✓ L2 image build job completed"
        echo "${L2_HASH}" > "${LAST_L2_HASH_FILE}"
        break
      elif [[ "${JOB_STAGE}" =~ ^(FAILED|ERROR|CANCELED|CANCELLED)$ ]]; then
        echo "✗ L2 image build job failed: ${JOB_STAGE}"
        exit 1
      else
        sleep 30
      fi
    done
  fi
fi
printf '%s\n' "" "Stage 3: Deploying to HuggingFace Space" ""
# Make sure a git repository exists before committing and pushing the Space.
if [[ ! -d .git ]]; then
  echo "Initializing git repository..."
  git init
  git remote add origin "https://huggingface.co/spaces/${HF_SPACE}"
  echo "✓ Git repository initialized"
  echo ""
fi
# Bump the build number so the Space always sees a content change and
# rebuilds even when nothing else differs.
BUILDNUM_FILE="buildnum.txt"
# Pass the file name via the environment instead of duplicating it inside the
# Python snippet (BUILDNUM_FILE was previously declared but never used).
# Fall back to the system python3 when the deploy venv is unavailable.
BUILDNUM_FILE="${BUILDNUM_FILE}" "${PY_BIN:-python3}" - <<'PY'
import os
from pathlib import Path

path = Path(os.environ.get("BUILDNUM_FILE", "buildnum.txt"))
try:
    current = int(path.read_text().strip())
except Exception:
    # Missing or malformed counter file: restart from zero.
    current = 0
path.write_text(f"{current + 1}\n")
print(f"Updated {path.name} -> {current + 1}")
PY
# Commit any working-tree changes, then force-push the Space repository.
if [[ -n $(git status -s) ]]; then
  echo "Committing changes..."
  # NOTE(review): `git add .` also stages local deploy artifacts created by
  # this script (.deploy/, .deploy-venv/, deploy.logs) unless .gitignore
  # excludes them — verify the repo's .gitignore covers these paths.
  git add .
  git commit -m "Update base image build and deps"
  echo "✓ Changes committed"
else
  echo "No changes to commit"
fi
# Push to HuggingFace
echo ""
echo "Pushing to HuggingFace Space: ${HF_SPACE}"
# Push the current HEAD to the remote `main` branch explicitly.  A bare
# `main` refspec fails when the local branch has a different name (e.g.
# `master` after the fresh `git init` above); HEAD:<dst> matches the
# HEAD:${JOB_REF} pattern used for the job ref push earlier in this script.
git push "https://huggingface.co/spaces/${HF_SPACE}" HEAD:main --force
echo ""
echo "✓ Pushed to HuggingFace"
echo ""
echo "HuggingFace will now:"
echo " 1. Pull base image from DockerHub (${IMAGE_NAME_L2}:${TAG})"
echo " 2. Start the Gradio app"
echo ""
# Follow build logs
# Streaming the build log requires an API token: prefer HUGGINGFACE_TOKEN,
# fall back to an already-exported HF_TOKEN, else empty (warning branch).
echo "Following build logs..."
echo "Press Ctrl+C to stop watching"
echo ""
HF_TOKEN="${HUGGINGFACE_TOKEN:-${HF_TOKEN:-}}"
export HF_TOKEN
if [ -n "${HF_TOKEN}" ]; then
  # Stream the Space build log; each event line carries a JSON "data" field
  # holding the log text.  The grep/sed chain extracts that field and turns
  # escaped \n sequences back into real newlines.
  curl -N -H "Authorization: Bearer ${HF_TOKEN}" \
    "https://huggingface.co/api/spaces/${HF_SPACE}/logs/build" 2>/dev/null | \
    while IFS= read -r line; do
      echo "$line" | grep -o '"data":"[^"]*"' | sed 's/"data":"//;s/"$//' | sed 's/\\n/\n/g'
    done
  echo ""
  echo "===================================="
  echo "Build Status Check"
  echo "===================================="
  echo ""
  # Wait a moment for status to update
  sleep 2
  # Check final build status via the Spaces API; stage/errorMessage are read
  # from the "runtime" object of the response.
  STATUS_JSON=$(curl -s -H "Authorization: Bearer ${HF_TOKEN}" \
    "https://huggingface.co/api/spaces/${HF_SPACE}")
  STAGE=$(echo "$STATUS_JSON" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('runtime', {}).get('stage', 'UNKNOWN'))" 2>/dev/null)
  ERROR_MSG=$(echo "$STATUS_JSON" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data.get('runtime', {}).get('errorMessage', ''))" 2>/dev/null)
  echo "Final Status: ${STAGE}"
  if [ "${STAGE}" = "RUNNING" ]; then
    echo "✓ Deployment successful!"
    echo ""
    # ${HF_SPACE/\//-} rewrites "owner/name" into the "owner-name" subdomain.
    echo "Space URL: https://${HF_SPACE/\//-}.hf.space"
    echo "API URL: https://${HF_SPACE/\//-}.hf.space/gradio_api/info"
    echo ""
    echo "Test with: cd ../training && make test-perception-api"
  elif [ "${STAGE}" = "BUILD_ERROR" ]; then
    echo "✗ Build failed!"
    if [ -n "${ERROR_MSG}" ]; then
      echo "Error: ${ERROR_MSG}"
    fi
    echo ""
    echo "If still getting OOM errors, consider:"
    echo " - Moving weights to runtime download (not build time)"
    echo " - Requesting larger build instance from HuggingFace"
    echo " - Using only CUDA arch 7.5 (T4 only)"
    exit 1
  else
    # Any other stage (e.g. still building) is reported without failing.
    echo "Status: ${STAGE}"
    if [ -n "${ERROR_MSG}" ]; then
      echo "Message: ${ERROR_MSG}"
    fi
  fi
  echo ""
  echo "Following application logs for 1 minute..."
  LOG_URL="https://huggingface.co/api/spaces/${HF_SPACE}/logs/run"
  export LOG_URL
  # Tail the application log via curl for 60s, then terminate the stream.
  python3 - <<'PY'
import os
import subprocess
import sys
import time

log_url = os.environ.get("LOG_URL")
token = os.environ.get("HF_TOKEN")
if not log_url or not token:
    print("Skipping app logs: missing LOG_URL or HF_TOKEN")
    raise SystemExit(0)
proc = subprocess.Popen(
    ["curl", "-N", "-H", f"Authorization: Bearer {token}", log_url],
    stdout=sys.stdout,
    stderr=subprocess.DEVNULL,
)
try:
    time.sleep(60)
finally:
    proc.terminate()
    try:
        proc.wait(timeout=5)
    except Exception:
        proc.kill()
PY
else
  echo "Warning: HF token not available; cannot follow logs"
  echo "To follow logs manually:"
  echo " curl -N -H \"Authorization: Bearer \$HF_TOKEN\" \"https://huggingface.co/api/spaces/${HF_SPACE}/logs/build\""
fi