Spaces:

Bani57
/

website

Sleeping

website / entrypoint.sh

Andrej Janchevski

fix(deploy): export entrypoint defaults so the python heredoc sees them

f74f48e 16 days ago

2.55 kB

	#!/usr/bin/env bash
	# Entry point for the Hugging Face Space container.
	#
	# Pre-warms the checkpoint download from HF Hub before starting gunicorn so
	# that worker processes never block on the network on first request. The
	# download is idempotent — on warm restarts (with persistent storage) every
	# file is already present and snapshot_download is a no-op.
	# Strict mode: abort on command failure, on use of an unset variable, and
	# when any stage of a pipeline fails.
	set -o errexit -o nounset -o pipefail

	# Fill in defaults for anything the container environment did not provide.
	# Values supplied by the environment (non-empty) always win.
	HF_CHECKPOINTS_REPO="${HF_CHECKPOINTS_REPO:-Bani57/checkpoints}"
	CHECKPOINTS_ROOT="${CHECKPOINTS_ROOT:-/app/research}"
	PORT="${PORT:-7860}"

	# The explicit export is required: an assignment on its own (including the
	# `: "${VAR:=default}"` idiom) leaves a freshly created variable local to
	# this shell, and the Python heredoc below looks these names up through
	# os.environ, where a missing key raises KeyError.
	export HF_CHECKPOINTS_REPO CHECKPOINTS_ROOT PORT

	# Keep HF_TOKEN only when it actually carries a value. When it is set but
	# empty, huggingface_hub ends up sending a malformed 'Bearer ' auth header,
	# which httpx rejects. Public repos need no token at all, so in that case
	# the variable is removed from the environment entirely.
	[[ -n "${HF_TOKEN:-}" ]] || unset HF_TOKEN

	# Make sure the download target exists before Python writes into it.
	# `--` keeps a root path that starts with '-' from being read as an option.
	mkdir -p -- "${CHECKPOINTS_ROOT}"

	printf '%s\n' "[entrypoint] Pre-warming checkpoints from ${HF_CHECKPOINTS_REPO} -> ${CHECKPOINTS_ROOT}"

	# Download the checkpoint snapshot with an inline Python program fed on stdin.
	#  * `<<-` (rather than `<<`) strips leading TABs from the body and from the
	#    terminator, so the heredoc closes correctly and Python receives
	#    column-0 source even though this script is tab-indented.
	#  * The quoted delimiter ('PY') disables shell expansion inside the body;
	#    Python reads its inputs from the exported environment instead.
	#  * Indentation inside the Python code uses spaces, which `<<-` preserves.
	python - <<-'PY'
	import os
	from huggingface_hub import snapshot_download

	# Treat a missing or empty HF_TOKEN the same way: pass None.
	token = os.environ.get("HF_TOKEN") or None
	snapshot_download(
	    repo_id=os.environ["HF_CHECKPOINTS_REPO"],
	    repo_type="model",
	    local_dir=os.environ["CHECKPOINTS_ROOT"],
	    max_workers=4,
	    token=token,
	)
	print("[entrypoint] checkpoints ready")
	PY

	echo "[entrypoint] starting gunicorn on 0.0.0.0:${PORT}"

	# Server layout: one worker, several threads, application preloaded.
	# The reasoning behind each choice is noted next to the flag it concerns.
	gunicorn_args=(
	  --bind "0.0.0.0:${PORT}"

	  # One worker only: the ModelRegistry holds multi-GB of state and
	  # ModelRegistry._inference_lock serializes inference globally anyway, so
	  # a second worker would duplicate the memory without adding throughput.
	  --workers 1
	  --threads 4

	  # Run Django setup (and ModelRegistry.initialize → COINs Loaders + sample
	  # subgraphs) ONCE in the gunicorn master before forking. Without this,
	  # every worker boot repeats the same 10–15-minute graph-metric
	  # computation, and gunicorn's silent-time timeout (which fires during
	  # boot) kills the worker mid-init.
	  --preload

	  # Generous request timeout: the first inference after a cold start can
	  # take minutes on free-tier CPU (lazy model loading + diffusion sampling).
	  --timeout 1800
	  --graceful-timeout 30

	  # Send both access and error logs to the container's standard streams.
	  --access-logfile -
	  --error-logfile -
	)

	# exec replaces this shell with gunicorn instead of running it as a child.
	exec gunicorn research_api.wsgi:application "${gunicorn_args[@]}"