#!/bin/bash
# FoundationPose deployment script (optimized for HuggingFace)
#
# Flow:
#   1. Build the L1 base image via an HF Job (skipped when its inputs are unchanged).
#   2. Build the L2 base image via an HF Job (skipped when its inputs are unchanged).
#   3. Bump the build number, commit, and force-push to the HuggingFace Space.
#
# Variables from .env (when present) are exported into the environment.
# All stdout/stderr is appended to deploy.logs.
#
# NOTE: `pipefail` is intentionally left off. The job-launch pipelines end in
# `tee`, so a failing launcher yields an empty job id and takes the
# "Could not parse HF job id" warning path instead of aborting the deploy.
set -e

# Re-exec under bash when started by a different shell (e.g. `sh deploy.sh`).
if [ -z "${BASH_VERSION:-}" ]; then
  exec /bin/bash "$0" "$@"
fi

IMAGE_NAME_L1="gpue/foundationpose-base-l1"
IMAGE_NAME_L2="gpue/foundationpose-base-l2"
TAG="latest"
PLATFORM="linux/amd64"
HF_SPACE="gpue/foundationpose"
ENV_FILE=".env"

# Mirror everything printed from here on into deploy.logs.
exec > >(tee -a deploy.logs) 2>&1

echo "==================================="
echo "FoundationPose Deployment"
echo "==================================="
echo ""

# Load tokens from .env; `set -a` exports every variable the file assigns.
if [ -f "${ENV_FILE}" ]; then
  set -a
  # shellcheck disable=SC1090
  source "${ENV_FILE}"
  set +a
else
  echo "Warning: ${ENV_FILE} not found"
fi

# Ensure huggingface_hub (and hf CLI) are available via local venv
VENV_DIR=".deploy-venv"
PY_BIN="${VENV_DIR}/bin/python3"
HF_BIN="${VENV_DIR}/bin/hf"

if [ ! -x "${PY_BIN}" ]; then
  echo "Creating deploy venv at ${VENV_DIR}..."
  python3 -m venv "${VENV_DIR}"
fi

if ! "${PY_BIN}" -c "import huggingface_hub" >/dev/null 2>&1; then
  echo "Installing huggingface_hub in deploy venv..."
  "${PY_BIN}" -m pip install --quiet huggingface_hub
fi

#######################################
# Hash helper for build gating.
# Globals:   PY_BIN (read)
# Arguments: one or more file paths, hashed in the order given
# Outputs:   SHA-256 hex digest over every path name and its contents
# Returns:   non-zero when a file cannot be read
#######################################
hash_files() {
  # The heredoc delimiter is quoted, so Python receives b"\\0" verbatim: a
  # backslash followed by "0" (two bytes), not a NUL byte. It is kept as-is so
  # digests already stored under .deploy/ continue to match.
  "${PY_BIN}" - "$@" <<'PY'
import hashlib
import sys
from pathlib import Path

paths = [Path(p) for p in sys.argv[1:]]
hasher = hashlib.sha256()
for path in paths:
    hasher.update(path.as_posix().encode("utf-8"))
    hasher.update(b"\\0")
    hasher.update(path.read_bytes())
    hasher.update(b"\\0")
print(hasher.hexdigest())
PY
}

#######################################
# Create a git repository pointing at the Space when none exists yet.
# Globals:   HF_SPACE (read)
# Outputs:   progress messages on stdout
#######################################
ensure_git_repo() {
  if [ ! -d .git ]; then
    echo "Initializing git repository..."
    git init
    git remote add origin "https://huggingface.co/spaces/${HF_SPACE}"
    echo "✓ Git repository initialized"
    echo ""
  fi
}

#######################################
# Launch an image-build HF Job and print its job id.
# Globals:   PY_BIN, TAG, PLATFORM, HF_SPACE, JOB_REF (read)
# Arguments: $1 - image name
#            $2 - Dockerfile.base build target
#            $3 - file that receives the launcher's combined output
# Outputs:   the third field of the first "Job ID:" line; nothing if absent
#######################################
launch_image_job() {
  local image_name=$1 target=$2 log_file=$3
  local output
  # The pipeline's status is tee's, so a launcher failure surfaces only as a
  # missing job id (handled by the caller).
  output=$("${PY_BIN}" scripts/run_hf_image_job.py \
    --image-name "${image_name}" \
    --tag "${TAG}" \
    --platform "${PLATFORM}" \
    --dockerfile "Dockerfile.base" \
    --target "${target}" \
    --flavor "l40sx1" \
    --git-repo "https://huggingface.co/spaces/${HF_SPACE}" \
    --git-ref "${JOB_REF}" 2>&1 | tee "${log_file}")
  # Stop at the first match so a repeated "Job ID:" line cannot produce a
  # multi-line id.
  awk '/Job ID:/ {print $3; exit}' <<<"${output}"
}

#######################################
# Stream a job's logs until `hf jobs inspect` reports a terminal stage.
# Globals:   HF_BIN, PY_BIN (read)
# Arguments: $1 - job id
# Outputs:   the job's log stream on stdout/stderr
# Note:      polls every 15 seconds and has no upper bound on its own.
#######################################
follow_job_logs() {
  local job_id=$1
  HF_BIN_PATH="${HF_BIN}" JOB_ID="${job_id}" "${PY_BIN}" - <<'PY'
import json
import os
import subprocess
import sys
import time

hf = os.environ["HF_BIN_PATH"]
job_id = os.environ["JOB_ID"]
log_proc = subprocess.Popen([hf, "jobs", "logs", job_id], stdout=sys.stdout, stderr=sys.stderr)
try:
    while True:
        inspect = subprocess.run([hf, "jobs", "inspect", job_id], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True)
        if inspect.returncode == 0:
            try:
                data = json.loads(inspect.stdout)[0]
                stage = (data.get("status") or {}).get("stage", "UNKNOWN")
            except Exception:
                stage = "UNKNOWN"
        else:
            stage = "UNKNOWN"
        if stage in {"SUCCESS","SUCCEEDED","COMPLETED","DONE","FAILED","ERROR","CANCELED","CANCELLED"}:
            break
        time.sleep(15)
finally:
    log_proc.terminate()
    try:
        log_proc.wait(timeout=5)
    except Exception:
        log_proc.kill()
PY
}

#######################################
# Print the current stage of a job, or UNKNOWN when it cannot be determined.
# Globals:   HF_BIN, PY_BIN (read)
# Arguments: $1 - job id
# Outputs:   stage name on stdout
# Returns:   0 always
#######################################
job_stage() {
  local job_id=$1
  "${HF_BIN}" jobs inspect "${job_id}" \
    | "${PY_BIN}" -c "import sys, json; data=json.load(sys.stdin)[0]; print((data.get('status') or {}).get('stage', 'UNKNOWN'))" 2>/dev/null \
    || echo "UNKNOWN"
}

#######################################
# Poll a job every 30 seconds until it reaches a terminal stage.
# Globals:   JOB_STAGE (written: last stage observed)
# Arguments: $1 - job id
#            $2 - maximum number of polls
# Outputs:   one " Job stage: ..." line per poll
# Returns:   0 on success, 1 on failure/cancellation, 2 when polls run out
#######################################
wait_for_job() {
  local job_id=$1 max_polls=$2
  local attempt
  for ((attempt = 1; attempt <= max_polls; attempt++)); do
    JOB_STAGE=$(job_stage "${job_id}")
    echo " Job stage: ${JOB_STAGE}"
    case "${JOB_STAGE}" in
      SUCCESS | SUCCEEDED | COMPLETED | DONE) return 0 ;;
      FAILED | ERROR | CANCELED | CANCELLED) return 1 ;;
    esac
    # No point sleeping after the final poll.
    if ((attempt < max_polls)); then
      sleep 30
    fi
  done
  return 2
}

mkdir -p .deploy

# Files whose contents decide whether each image must be rebuilt.
L1_INPUTS=(Dockerfile.base)
L2_INPUTS=(Dockerfile.base download_weights.py)
L1_HASH=$(hash_files "${L1_INPUTS[@]}")
L2_HASH=$(hash_files "${L2_INPUTS[@]}")
LAST_L1_HASH_FILE=".deploy/last_l1.sha"
LAST_L2_HASH_FILE=".deploy/last_l2.sha"

SKIP_L1=0
SKIP_L2=0
if [ -f "${LAST_L1_HASH_FILE}" ] && [ "$(cat "${LAST_L1_HASH_FILE}")" = "${L1_HASH}" ]; then
  SKIP_L1=1
  echo "L1 inputs unchanged; skipping L1 image job."
fi
if [ -f "${LAST_L2_HASH_FILE}" ] && [ "$(cat "${LAST_L2_HASH_FILE}")" = "${L2_HASH}" ]; then
  SKIP_L2=1
  echo "L2 inputs unchanged; skipping L2 image job."
fi

# Initialize git repo if needed (for job context)
ensure_git_repo

# Commit local changes before job so the job can build the right ref
JOB_CONTEXT_FILES=(
  Dockerfile
  Dockerfile.base
  requirements.txt
  deploy.sh
  app.py
  client.py
  estimator.py
  masks.py
  scripts/run_hf_image_job.py
  download_weights.py
)
if [[ -n $(git status -s) ]]; then
  echo "Committing changes for job context..."
  git add "${JOB_CONTEXT_FILES[@]}"
  if git diff --cached --quiet; then
    echo "No staged changes for job context"
  else
    git commit -m "Prepare job build context"
    echo "✓ Job context committed"
  fi
fi

JOB_REF=""
if [ "${SKIP_L1}" -eq 0 ] || [ "${SKIP_L2}" -eq 0 ]; then
  # Push a temporary ref for the job to build from
  JOB_REF="job-build-$(date +%Y%m%d-%H%M%S)"
  echo "Pushing job ref: ${JOB_REF}"
  git push "https://huggingface.co/spaces/${HF_SPACE}" "HEAD:${JOB_REF}" --force
  echo "✓ Job ref pushed"
  echo ""
fi

if [ "${SKIP_L1}" -eq 0 ]; then
  echo "Stage 1: Building L1 base image via HF Job"
  echo "Platform: ${PLATFORM}"
  echo "Image: ${IMAGE_NAME_L1}:${TAG}"
  echo ""

  JOB_ID=$(launch_image_job "${IMAGE_NAME_L1}" "foundationpose-base-l1" /tmp/hf_image_job.log)

  if [ -z "${JOB_ID}" ]; then
    echo "Warning: Could not parse HF job id. See /tmp/hf_image_job.log"
  else
    echo "Following job logs until completion..."
    if [ -x "${HF_BIN}" ]; then
      follow_job_logs "${JOB_ID}"
      echo ""
      echo "Job status:"
      "${HF_BIN}" jobs inspect "${JOB_ID}" || true
    else
      echo "hf CLI not available; job logs skipped"
    fi
  fi

  if [ -n "${JOB_ID}" ] && [ -x "${HF_BIN}" ]; then
    echo ""
    echo "Waiting for L1 image build job to complete..."
    poll_rc=0
    wait_for_job "${JOB_ID}" 40 || poll_rc=$?
    case "${poll_rc}" in
      0)
        echo "✓ L1 image build job completed"
        # Record the hash only after a confirmed success so a failed or
        # unfinished build is retried on the next run.
        echo "${L1_HASH}" > "${LAST_L1_HASH_FILE}"
        ;;
      1)
        echo "✗ Image build job failed: ${JOB_STAGE}"
        exit 1
        ;;
      *)
        echo "Warning: L1 image build job did not reach a terminal stage (last: ${JOB_STAGE}); hash not recorded"
        ;;
    esac
  fi
fi

echo ""

if [ "${SKIP_L2}" -eq 0 ]; then
  echo "Stage 2: Building L2 base image via HF Job"
  echo ""

  JOB_ID_L2=$(launch_image_job "${IMAGE_NAME_L2}" "foundationpose-base-l2" /tmp/hf_image_job_l2.log)

  if [ -z "${JOB_ID_L2}" ]; then
    echo "Warning: Could not parse HF job id for L2. See /tmp/hf_image_job_l2.log"
  else
    echo "Following L2 job logs until completion..."
    if [ -x "${HF_BIN}" ]; then
      follow_job_logs "${JOB_ID_L2}"
      echo ""
      echo "L2 job status:"
      "${HF_BIN}" jobs inspect "${JOB_ID_L2}" || true
    else
      echo "hf CLI not available; job logs skipped"
    fi
  fi

  if [ -n "${JOB_ID_L2}" ] && [ -x "${HF_BIN}" ]; then
    echo ""
    echo "Waiting for L2 image build job to complete..."
    poll_rc=0
    wait_for_job "${JOB_ID_L2}" 60 || poll_rc=$?
    case "${poll_rc}" in
      0)
        echo "✓ L2 image build job completed"
        echo "${L2_HASH}" > "${LAST_L2_HASH_FILE}"
        ;;
      1)
        echo "✗ L2 image build job failed: ${JOB_STAGE}"
        exit 1
        ;;
      *)
        echo "Warning: L2 image build job did not reach a terminal stage (last: ${JOB_STAGE}); hash not recorded"
        ;;
    esac
  fi
fi

echo ""
echo "Stage 3: Deploying to HuggingFace Space"
echo ""

# Initialize git repo if needed
ensure_git_repo

# Bump build number to force Space rebuild
BUILDNUM_FILE="buildnum.txt"
BUILDNUM_FILE="${BUILDNUM_FILE}" "${PY_BIN}" - <<'PY'
import os
from pathlib import Path

path = Path(os.environ["BUILDNUM_FILE"])
try:
    current = int(path.read_text().strip())
except Exception:
    # Missing or non-numeric file: restart the counter.
    current = 0
path.write_text(f"{current + 1}\n")
print(f"Updated {path} -> {current + 1}")
PY

# Check if there are changes to commit
if [[ -n $(git status -s) ]]; then
  echo "Committing changes..."
  git add .
  git commit -m "Update base image build and deps"
  echo "✓ Changes committed"
else
  echo "No changes to commit"
fi

# Push to HuggingFace
echo ""
echo "Pushing to HuggingFace Space: ${HF_SPACE}"
git push "https://huggingface.co/spaces/${HF_SPACE}" main --force
echo ""
echo "✓ Pushed to HuggingFace"
echo ""
echo "HuggingFace will now:"
echo " 1. Pull base image from DockerHub (${IMAGE_NAME_L2}:${TAG})"
echo " 2. Start the Gradio app"
echo ""

# Follow build logs
echo "Following build logs..."
echo "Press Ctrl+C to stop watching"
echo ""

# Prefer HUGGINGFACE_TOKEN; otherwise keep whatever HF_TOKEN already holds.
HF_TOKEN="${HUGGINGFACE_TOKEN:-${HF_TOKEN:-}}"
export HF_TOKEN

#######################################
# Print one field of the "runtime" object of a Space-info JSON document.
# Arguments: $1 - field name inside "runtime"
#            $2 - fallback printed when the field is missing or null, when
#                 "runtime" is missing or null, or when stdin is not valid JSON
# Inputs:    JSON document on stdin
# Outputs:   the field value, or the fallback, on stdout
# Returns:   0 always, so callers running under `set -e` are never aborted
#######################################
space_runtime_field() {
  local field=$1 fallback=${2-}
  python3 -c '
import json
import sys

field, fallback = sys.argv[1], sys.argv[2]
try:
    runtime = json.load(sys.stdin).get("runtime") or {}
    value = runtime.get(field, fallback)
except Exception:
    value = fallback
print(fallback if value is None else value)
' "${field}" "${fallback}" 2>/dev/null || printf '%s\n' "${fallback}"
}

if [ -n "${HF_TOKEN}" ]; then
  # Stream the build log, keeping only the payload of each "data":"..." field
  # and turning escaped \n sequences into real line breaks.
  curl -N -H "Authorization: Bearer ${HF_TOKEN}" \
    "https://huggingface.co/api/spaces/${HF_SPACE}/logs/build" 2>/dev/null \
    | while IFS= read -r line; do
      printf '%s\n' "${line}" \
        | grep -o '"data":"[^"]*"' \
        | sed 's/"data":"//;s/"$//' \
        | sed 's/\\n/\n/g'
    done

  echo ""
  echo "===================================="
  echo "Build Status Check"
  echo "===================================="
  echo ""

  # Wait a moment for status to update
  sleep 2

  # Check final build status. The script runs with `set -e`, so a failed
  # request or an unparseable response must degrade to UNKNOWN rather than
  # terminate the script without any message.
  STATUS_JSON=$(curl -s -H "Authorization: Bearer ${HF_TOKEN}" \
    "https://huggingface.co/api/spaces/${HF_SPACE}") || STATUS_JSON=""
  STAGE=$(printf '%s' "${STATUS_JSON}" | space_runtime_field stage UNKNOWN)
  ERROR_MSG=$(printf '%s' "${STATUS_JSON}" | space_runtime_field errorMessage "")

  echo "Final Status: ${STAGE}"

  if [ "${STAGE}" = "RUNNING" ]; then
    echo "✓ Deployment successful!"
    echo ""
    # ${HF_SPACE/\//-} replaces the first "/" in the Space id with "-".
    echo "Space URL: https://${HF_SPACE/\//-}.hf.space"
    echo "API URL: https://${HF_SPACE/\//-}.hf.space/gradio_api/info"
    echo ""
    echo "Test with: cd ../training && make test-perception-api"
  elif [ "${STAGE}" = "BUILD_ERROR" ]; then
    echo "✗ Build failed!"
    if [ -n "${ERROR_MSG}" ]; then
      echo "Error: ${ERROR_MSG}"
    fi
    echo ""
    echo "If still getting OOM errors, consider:"
    echo " - Moving weights to runtime download (not build time)"
    echo " - Requesting larger build instance from HuggingFace"
    echo " - Using only CUDA arch 7.5 (T4 only)"
    exit 1
  else
    echo "Status: ${STAGE}"
    if [ -n "${ERROR_MSG}" ]; then
      echo "Message: ${ERROR_MSG}"
    fi
  fi

  echo ""
  echo "Following application logs for 1 minute..."
  LOG_URL="https://huggingface.co/api/spaces/${HF_SPACE}/logs/run"
  export LOG_URL
  # Tail the runtime log for 60 seconds, then stop the curl child process.
  python3 - <<'PY'
import os
import subprocess
import sys
import time

log_url = os.environ.get("LOG_URL")
token = os.environ.get("HF_TOKEN")
if not log_url or not token:
    print("Skipping app logs: missing LOG_URL or HF_TOKEN")
    raise SystemExit(0)

proc = subprocess.Popen(
    ["curl", "-N", "-H", f"Authorization: Bearer {token}", log_url],
    stdout=sys.stdout,
    stderr=subprocess.DEVNULL,
)
try:
    time.sleep(60)
finally:
    proc.terminate()
    try:
        proc.wait(timeout=5)
    except Exception:
        proc.kill()
PY
else
  echo "Warning: HF token not available; cannot follow logs"
  echo "To follow logs manually:"
  echo " curl -N -H \"Authorization: Bearer \$HF_TOKEN\" \"https://huggingface.co/api/spaces/${HF_SPACE}/logs/build\""
fi