#!/usr/bin/env bash
# Entry point for the Hugging Face Space container.
#
# Pre-warms the checkpoint download from HF Hub before starting gunicorn so
# that worker processes never block on the network on first request. The
# download is idempotent — on warm restarts (with persistent storage) every
# file is already present and snapshot_download is a no-op.
#
# Environment (all optional):
#   HF_CHECKPOINTS_REPO  Hub repo to download    (default: Bani57/checkpoints)
#   CHECKPOINTS_ROOT     local download directory (default: /app/research)
#   PORT                 port gunicorn binds to   (default: 7860)
#   HF_TOKEN             Hub access token; removed from the environment
#                        when it is set but empty

set -euo pipefail

# Print one progress line to stdout, tagged with the script name so it is
# easy to tell apart from Python / gunicorn output in the container log.
log() {
  printf '[entrypoint] %s\n' "$*"
}

# `export VAR="${VAR:-default}"` and not bash's `: "${VAR:=default}"` —
# the assignment-only form does not export to child processes (the
# Python heredoc below reads via os.environ and would KeyError).
export HF_CHECKPOINTS_REPO="${HF_CHECKPOINTS_REPO:-Bani57/checkpoints}"
export CHECKPOINTS_ROOT="${CHECKPOINTS_ROOT:-/app/research}"
export PORT="${PORT:-7860}"

# An empty HF_TOKEN tricks huggingface_hub into emitting a malformed
# 'Bearer ' auth header (httpx rejects it). For public repos no token is
# needed; drop the variable entirely if it's set but empty.
if [[ -z "${HF_TOKEN:-}" ]]; then
  unset HF_TOKEN
fi

# `--` keeps a CHECKPOINTS_ROOT that starts with '-' from being read as an
# option by mkdir.
mkdir -p -- "${CHECKPOINTS_ROOT}"

log "Pre-warming checkpoints from ${HF_CHECKPOINTS_REPO} -> ${CHECKPOINTS_ROOT}"

# The heredoc delimiter is quoted ('PY') so the shell performs no expansion
# inside it; the Python code reads its configuration from the exported
# environment instead. Under `set -e`, a failing download aborts the script
# here and gunicorn is never started.
python - <<'PY'
import os

from huggingface_hub import snapshot_download

token = os.environ.get("HF_TOKEN") or None
snapshot_download(
    repo_id=os.environ["HF_CHECKPOINTS_REPO"],
    repo_type="model",
    local_dir=os.environ["CHECKPOINTS_ROOT"],
    max_workers=4,
    token=token,
)
print("[entrypoint] checkpoints ready")
PY

log "starting gunicorn on 0.0.0.0:${PORT}"

# Single worker, multiple threads, preloaded app:
#   * --preload runs Django setup (and ModelRegistry.initialize → COINs
#     Loaders + sample subgraphs) ONCE in the gunicorn master before
#     forking. Without it, every worker boot goes through the same
#     10–15-minute graph-metric computation and gunicorn's silent-time
#     timeout (which fires during boot) kills the worker mid-init.
#   * Single worker because the ModelRegistry holds multi-GB of state and
#     ModelRegistry._inference_lock serializes inference globally anyway —
#     a second worker would duplicate the memory without adding throughput.
#   * Long --timeout protects the first inference request after a cold
#     start, when lazy model loading + diffusion sampling can take minutes
#     on free-tier CPU.
#
# `exec` replaces this shell with gunicorn rather than leaving the shell as
# its parent process.
exec gunicorn research_api.wsgi:application \
  --bind "0.0.0.0:${PORT}" \
  --workers 1 \
  --threads 4 \
  --preload \
  --timeout 1800 \
  --graceful-timeout 30 \
  --access-logfile - \
  --error-logfile -