#!/usr/bin/env bash
# Container entrypoint: configure the Hugging Face cache and runtime env,
# then exec gunicorn as PID 1 so signals (SIGTERM etc.) reach it directly.
set -euo pipefail

# Ensure HF cache lives on a writable path (persistent if your platform supports volumes).
# TRANSFORMERS_CACHE is kept alongside HF_HOME for older transformers versions.
export HF_HOME="${HF_HOME:-/root/.cache/huggingface}"
export TRANSFORMERS_CACHE="${TRANSFORMERS_CACHE:-$HF_HOME/transformers}"

export RUNNING_GUNICORN=1
export PYTHONUNBUFFERED=1          # flush logs immediately (no stdout buffering)
export TOKENIZERS_PARALLELISM=false
export WEB_CONCURRENCY=1           # Keep single worker so you don't load models multiple times

# Start gunicorn (threaded so /process stays responsive).
# --timeout 0 disables the worker timeout: long model loads/inference won't get the worker killed.
exec gunicorn --bind "0.0.0.0:${PORT:-8080}" \
  --workers "${WEB_CONCURRENCY}" \
  --threads 4 \
  --timeout 0 \
  app:app