#!/usr/bin/env bash
# Entry point for the Hugging Face Space container.
#
# Pre-warms the checkpoint download from HF Hub before starting gunicorn so
# that worker processes never block on the network on first request. The
# download is idempotent: on warm restarts (with persistent storage) every
# file is already present and snapshot_download is a no-op.
#
# Environment (all optional):
#   HF_CHECKPOINTS_REPO  Hub repo to download    (default: Bani57/checkpoints)
#   CHECKPOINTS_ROOT     local download target   (default: /app/research)
#   PORT                 port gunicorn binds to  (default: 7860)
#   HF_TOKEN             Hub access token; dropped when set but empty
#
# Requires bash: `set -o pipefail` and `[[ ... ]]` are not available in
# every /bin/sh.

set -euo pipefail

# Write one "[entrypoint] "-prefixed line to stdout. printf is used instead
# of echo so interpolated values can never be mistaken for echo options.
log() {
  printf '[entrypoint] %s\n' "$*"
}

# `export VAR="${VAR:-default}"` and not bash's `: "${VAR:=default}"`:
# the assignment-only form does not export to child processes (the
# Python heredoc below reads via os.environ and would KeyError).
export HF_CHECKPOINTS_REPO="${HF_CHECKPOINTS_REPO:-Bani57/checkpoints}"
export CHECKPOINTS_ROOT="${CHECKPOINTS_ROOT:-/app/research}"
export PORT="${PORT:-7860}"

# An empty HF_TOKEN tricks huggingface_hub into emitting a malformed
# 'Bearer ' auth header (httpx rejects it). For public repos no token is
# needed; drop the variable entirely if it's set but empty.
if [[ -z "${HF_TOKEN:-}" ]]; then
  unset HF_TOKEN
fi

# `--` keeps a CHECKPOINTS_ROOT that starts with '-' from being parsed as
# an option.
mkdir -p -- "${CHECKPOINTS_ROOT}"

log "Pre-warming checkpoints from ${HF_CHECKPOINTS_REPO} -> ${CHECKPOINTS_ROOT}"

# Quoted delimiter ('PY'): the shell performs no expansion inside the
# heredoc, so the Python code receives its inputs only through the exported
# environment. A non-zero exit from python aborts the script (set -e)
# before gunicorn is started.
python - <<'PY'
import os

from huggingface_hub import snapshot_download

token = os.environ.get("HF_TOKEN") or None
snapshot_download(
    repo_id=os.environ["HF_CHECKPOINTS_REPO"],
    repo_type="model",
    local_dir=os.environ["CHECKPOINTS_ROOT"],
    max_workers=4,
    token=token,
)
print("[entrypoint] checkpoints ready")
PY

log "starting gunicorn on 0.0.0.0:${PORT}"

# Single worker, multiple threads, preloaded app:
#   * --preload runs Django setup (and ModelRegistry.initialize: COINs
#     Loaders + sample subgraphs) ONCE in the gunicorn master before
#     forking. Without it, every worker boot goes through the same
#     10-15-minute graph-metric computation and gunicorn's silent-time
#     timeout (which fires during boot) kills the worker mid-init.
#   * Single worker because the ModelRegistry holds multi-GB of state and
#     ModelRegistry._inference_lock serializes inference globally anyway;
#     a second worker would duplicate the memory without adding throughput.
#   * Long --timeout protects the first inference request after a cold
#     start, when lazy model loading + diffusion sampling can take minutes
#     on free-tier CPU.
#
# `exec` replaces this shell with gunicorn, so no wrapper shell remains
# between the container runtime and the server process.
exec gunicorn research_api.wsgi:application \
  --bind "0.0.0.0:${PORT}" \
  --workers 1 \
  --threads 4 \
  --preload \
  --timeout 1800 \
  --graceful-timeout 30 \
  --access-logfile - \
  --error-logfile -