website / entrypoint.sh
Andrej Janchevski
fix(deploy): export entrypoint defaults so the python heredoc sees them
f74f48e
#!/usr/bin/env bash
# Entry point for the Hugging Face Space container.
#
# Pre-warms the checkpoint download from HF Hub before starting gunicorn so
# that worker processes never block on the network on first request. The
# download is idempotent — on warm restarts (with persistent storage) every
# file is already present and snapshot_download is a no-op.
# Strict mode: stop on command failure, on use of an unset variable, and on
# a failing stage anywhere in a pipeline.
set -euo pipefail

# Runtime configuration. Each variable keeps the value already present in
# the environment (when non-empty) and otherwise falls back to the default
# given here. The explicit export is the important part: bash's
# assignment-only form `: "${VAR:=default}"` does not export to child
# processes, and the Python heredoc below reads these via os.environ and
# would KeyError.
HF_CHECKPOINTS_REPO="${HF_CHECKPOINTS_REPO:-Bani57/checkpoints}"
CHECKPOINTS_ROOT="${CHECKPOINTS_ROOT:-/app/research}"
PORT="${PORT:-7860}"
export HF_CHECKPOINTS_REPO CHECKPOINTS_ROOT PORT
# huggingface_hub turns an empty HF_TOKEN into a malformed 'Bearer ' auth
# header, which httpx rejects. Public repos need no token at all, so a
# set-but-empty variable is removed from the environment entirely; a
# non-empty token is left untouched.
[[ -n "${HF_TOKEN:-}" ]] || unset HF_TOKEN
# Create the download target (no error if it already exists), then fetch the
# checkpoint snapshot before any server process starts. Under `set -e` a
# failing download aborts the entrypoint here.
mkdir -p "$CHECKPOINTS_ROOT"
printf '%s\n' "[entrypoint] Pre-warming checkpoints from ${HF_CHECKPOINTS_REPO} -> ${CHECKPOINTS_ROOT}"

# The heredoc delimiter is quoted, so the shell performs no expansion inside
# it; the Python code picks its settings up from the exported environment.
python - <<'PY'
import os

from huggingface_hub import snapshot_download

# Required settings: a missing variable raises KeyError before any download.
repo = os.environ["HF_CHECKPOINTS_REPO"]
target_dir = os.environ["CHECKPOINTS_ROOT"]
# An absent or empty token both collapse to None.
auth_token = os.environ.get("HF_TOKEN") or None

snapshot_download(
    repo_id=repo,
    repo_type="model",
    local_dir=target_dir,
    max_workers=4,
    token=auth_token,
)
print("[entrypoint] checkpoints ready")
PY
printf '%s\n' "[entrypoint] starting gunicorn on 0.0.0.0:${PORT}"

# Rationale for the gunicorn settings (one worker, several threads,
# preloaded app):
#   * --preload: Django setup (and ModelRegistry.initialize → COINs Loaders
#     + sample subgraphs) runs ONCE in the gunicorn master before forking.
#     Without it, every worker boot repeats the same 10–15-minute
#     graph-metric computation, and gunicorn's silent-time timeout (which
#     fires during boot) kills the worker mid-init.
#   * --workers 1: the ModelRegistry holds multi-GB of state and
#     ModelRegistry._inference_lock serializes inference globally anyway —
#     a second worker would duplicate the memory without adding throughput.
#   * --timeout 1800: protects the first inference request after a cold
#     start, when lazy model loading + diffusion sampling can take minutes
#     on free-tier CPU.
#   * --access-logfile - / --error-logfile -: "-" is gunicorn's notation for
#     logging to the standard streams instead of to files.
gunicorn_cmd=(
  gunicorn research_api.wsgi:application
  --bind "0.0.0.0:${PORT}"
  --workers 1
  --threads 4
  --preload
  --timeout 1800
  --graceful-timeout 30
  --access-logfile -
  --error-logfile -
)

# exec replaces this shell, so gunicorn becomes the process that receives
# signals sent to the entrypoint.
exec "${gunicorn_cmd[@]}"