Spaces:

Bani57
/

website

Sleeping

File size: 2,547 Bytes

#!/usr/bin/env bash
# Container entry point for the Hugging Face Space.
#
# Checkpoints are fetched from the HF Hub up front, before gunicorn is
# launched, so worker processes never have to wait on the network while
# serving their first request.  Re-running the download is safe: on a warm
# restart with persistent storage the files are already present and
# snapshot_download has nothing left to fetch.
#
# Strict mode: abort on any failing command, on use of an unset variable,
# and on a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Fill in defaults for anything the environment left unset or empty, then
# export explicitly.  `: "${VAR:=default}"` on its own only assigns a shell
# variable; without the `export` the value would not reach child processes,
# and the Python heredoc further down (which reads os.environ[...]) would
# raise KeyError.
: "${HF_CHECKPOINTS_REPO:=Bani57/checkpoints}"
: "${CHECKPOINTS_ROOT:=/app/research}"
: "${PORT:=7860}"
export HF_CHECKPOINTS_REPO CHECKPOINTS_ROOT PORT

# HF_TOKEN that is set but empty is worse than no token at all:
# huggingface_hub ends up sending a malformed 'Bearer ' auth header, which
# httpx rejects.  Public repos need no token, so keep the variable only when
# it actually carries a value and remove it from the environment otherwise.
[[ -n "${HF_TOKEN-}" ]] || unset HF_TOKEN

# Make sure the download target exists, then pull the checkpoint snapshot
# with huggingface_hub.  The heredoc delimiter is quoted so the shell does no
# expansion inside it; the Python code gets its settings from the exported
# environment instead.
mkdir -p "$CHECKPOINTS_ROOT"

printf '%s\n' "[entrypoint] Pre-warming checkpoints from ${HF_CHECKPOINTS_REPO} -> ${CHECKPOINTS_ROOT}"
python - <<'PYTHON'
import os

from huggingface_hub import snapshot_download

env = os.environ
download_kwargs = dict(
    repo_id=env["HF_CHECKPOINTS_REPO"],
    repo_type="model",
    local_dir=env["CHECKPOINTS_ROOT"],
    max_workers=4,
    # A missing or empty HF_TOKEN is passed as None rather than "".
    token=env.get("HF_TOKEN") or None,
)
snapshot_download(**download_kwargs)
print("[entrypoint] checkpoints ready")
PYTHON

printf '%s\n' "[entrypoint] starting gunicorn on 0.0.0.0:${PORT}"

# gunicorn is launched with one worker, several threads and a preloaded app.
# The arguments are collected in an array so each flag can carry its own
# rationale; `exec` then replaces this shell with the gunicorn process.
gunicorn_args=(
    research_api.wsgi:application
    --bind "0.0.0.0:${PORT}"

    # One worker only: the ModelRegistry holds multi-GB of state and
    # ModelRegistry._inference_lock serializes inference globally anyway, so
    # a second worker would duplicate the memory without adding throughput.
    --workers 1
    --threads 4

    # Preloading runs Django setup (and ModelRegistry.initialize → COINs
    # Loaders + sample subgraphs) ONCE in the gunicorn master before forking.
    # Without it, every worker boot repeats the same 10–15-minute
    # graph-metric computation, and gunicorn's silent-time timeout (which
    # fires during boot) kills the worker mid-init.
    --preload

    # The long timeout protects the first inference request after a cold
    # start, when lazy model loading + diffusion sampling can take minutes on
    # free-tier CPU.
    --timeout 1800
    --graceful-timeout 30

    # "-" as the log file sends both access and error logs to the
    # container's standard streams.
    --access-logfile -
    --error-logfile -
)
exec gunicorn "${gunicorn_args[@]}"