File size: 2,547 Bytes
b385912
 
 
 
 
 
 
 
 
f74f48e
 
 
 
 
 
b385912
5375c2e
 
 
 
 
 
 
b385912
 
 
 
 
 
 
5375c2e
b385912
 
 
 
 
5375c2e
b385912
 
 
 
 
5375c2e
 
 
 
 
 
 
 
 
 
 
 
b385912
 
5375c2e
 
 
 
 
b385912
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env bash
# Entry point for the Hugging Face Space container.
#
# Pre-warms the checkpoint download from HF Hub before starting gunicorn so
# that worker processes never block on the network on first request.  The
# download is idempotent — on warm restarts (with persistent storage) every
# file is already present and snapshot_download is a no-op.
# Strict mode: abort on any failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Fill in defaults for anything the caller did not provide (unset or empty),
# then export all three.  The export is required: the Python heredoc further
# down reads these through os.environ, and a plain shell assignment (for
# example `: "${VAR:=default}"` on its own) is not visible to child
# processes, so the lookup would raise KeyError.
HF_CHECKPOINTS_REPO="${HF_CHECKPOINTS_REPO:-Bani57/checkpoints}"
CHECKPOINTS_ROOT="${CHECKPOINTS_ROOT:-/app/research}"
PORT="${PORT:-7860}"
export HF_CHECKPOINTS_REPO CHECKPOINTS_ROOT PORT

# A set-but-empty HF_TOKEN makes huggingface_hub send a malformed 'Bearer '
# auth header, which httpx rejects.  Public repos need no token at all, so
# keep the variable only when it actually holds a value.
[[ -n "${HF_TOKEN:-}" ]] || unset HF_TOKEN

# Make sure the download target exists before handing it to huggingface_hub.
mkdir -p "$CHECKPOINTS_ROOT"

printf '%s\n' "[entrypoint] Pre-warming checkpoints from ${HF_CHECKPOINTS_REPO} -> ${CHECKPOINTS_ROOT}"

# Quoted heredoc delimiter: the shell performs no expansion inside the Python
# source, so all configuration is read from the exported environment instead.
python - <<'PY'
import os

from huggingface_hub import snapshot_download

# A missing or empty HF_TOKEN both collapse to None.
hub_token = os.environ.get("HF_TOKEN") or None
repo_name = os.environ["HF_CHECKPOINTS_REPO"]
target_dir = os.environ["CHECKPOINTS_ROOT"]

snapshot_download(
    repo_id=repo_name,
    repo_type="model",
    local_dir=target_dir,
    max_workers=4,
    token=hub_token,
)
print("[entrypoint] checkpoints ready")
PY

printf '%s\n' "[entrypoint] starting gunicorn on 0.0.0.0:${PORT}"

# Server layout: one worker, several threads, application preloaded.
gunicorn_args=(
    research_api.wsgi:application
    --bind "0.0.0.0:${PORT}"

    # One worker only: the ModelRegistry holds multi-GB of state and
    # ModelRegistry._inference_lock serializes inference globally anyway —
    # a second worker would duplicate the memory without adding throughput.
    --workers 1
    --threads 4

    # Run Django setup (and ModelRegistry.initialize -> COINs Loaders +
    # sample subgraphs) ONCE in the gunicorn master before forking.  Without
    # this, every worker boot repeats the same 10–15-minute graph-metric
    # computation, and gunicorn's silent-time timeout (which fires during
    # boot) kills the worker mid-init.
    --preload

    # Long timeout to protect the first inference request after a cold
    # start, when lazy model loading + diffusion sampling can take minutes
    # on free-tier CPU.
    --timeout 1800
    --graceful-timeout 30

    # Send both access and error logs to the container's standard streams.
    --access-logfile -
    --error-logfile -
)

# Replace the shell with gunicorn so it receives signals directly.
exec gunicorn "${gunicorn_args[@]}"