#!/bin/bash
# ls-onboarding / entrypoint.sh — Timothy S. Phan
# Label Studio with S3 backup and signup disabled (commit 4fac651)
# =============================================================================
# entrypoint.sh – Label Studio wrapper for Hugging Face Spaces
#
# What this script does:
# 1. On startup : restores SQLite DB + exports from S3
# 2. After restore: verifies DB integrity before starting Label Studio
# 3. Starts Label Studio in the background
# 4. Every 60 s : checkpoints WAL then syncs /data β†’ S3
# 5. On SIGTERM : does one final sync before the pod dies
#
# Required HF Space secrets:
# BACKUP_S3_BUCKET – bucket name (no s3:// prefix)
# LABEL_STUDIO_USERNAME – admin username / email
# LABEL_STUDIO_PASSWORD – admin password
#
# Optional HF Space secrets (with defaults shown):
# BACKUP_S3_PREFIX – labelstudio
# AWS_DEFAULT_REGION – us-east-1
# AWS_ENDPOINT_URL – https://s3.amazonaws.com (change for R2/MinIO)
# SYNC_INTERVAL_SECONDS – 60
# =============================================================================
set -euo pipefail
# ------------- configuration (all from HF secrets / env) -------------------
BUCKET="${BACKUP_S3_BUCKET:?'BACKUP_S3_BUCKET secret is required'}"
PREFIX="${BACKUP_S3_PREFIX:-labelstudio}"
REGION="${AWS_DEFAULT_REGION:-us-east-1}"
ENDPOINT="${AWS_ENDPOINT_URL:-https://s3.amazonaws.com}"
SYNC_INTERVAL="${SYNC_INTERVAL_SECONDS:-60}"
DATA_DIR=/data
# Label Studio uses label_studio.sqlite3 (not .db) β€” confirmed from live logs
DB_FILE="$DATA_DIR/label_studio.sqlite3"
S3_PATH="s3://${BUCKET}/${PREFIX}"
export AWS_DEFAULT_REGION="$REGION"
# ------------- helper: WAL checkpoint + S3 upload --------------------------
sync_to_s3() {
local reason="${1:-periodic}"
echo "[sync] Starting $reason sync to $S3_PATH/ ..."
# Checkpoint the WAL so S3 always receives a single self-consistent file.
# TRUNCATE: folds all WAL frames into the main DB file and zeros the WAL.
# This briefly blocks writers (not readers) β€” safe for ≀10 concurrent users.
if [ -f "$DB_FILE" ]; then
sqlite3 "$DB_FILE" "PRAGMA wal_checkpoint(TRUNCATE);" 2>/dev/null || \
echo "[sync] WAL checkpoint skipped (DB may be locked) – proceeding anyway"
fi
aws s3 sync "$DATA_DIR/" "$S3_PATH/" \
--endpoint-url "$ENDPOINT" \
--no-progress \
&& echo "[sync] $reason sync complete" \
|| echo "[sync] WARNING: $reason sync failed – will retry next cycle"
}
# ------------- graceful shutdown -------------------------------------------
shutdown_handler() {
echo ""
echo "[shutdown] SIGTERM received – uploading final snapshot before exit ..."
sync_to_s3 "shutdown"
echo "[shutdown] Stopping Label Studio (PID $LS_PID) ..."
kill "$LS_PID" 2>/dev/null || true
wait "$LS_PID" 2>/dev/null || true
echo "[shutdown] Done. Goodbye."
}
# ------------- 1. restore from S3 -----------------------------------------
echo "============================================================"
echo " Label Studio | HF Spaces + S3 backup | $(date -u)"
echo " Bucket : $S3_PATH/"
echo " Data dir: $DATA_DIR"
echo " Sync : every ${SYNC_INTERVAL}s"
echo "============================================================"
mkdir -p "$DATA_DIR"
echo "[restore] Syncing data DOWN from $S3_PATH/ ..."
# Use || true so a fresh bucket (nothing to restore) doesn't abort the script
aws s3 sync "$S3_PATH/" "$DATA_DIR/" \
--endpoint-url "$ENDPOINT" \
--no-progress \
&& echo "[restore] Restore complete" \
|| echo "[restore] Restore returned non-zero (may be first run) – continuing"
# ------------- 2. DB integrity check before starting LS -------------------
if [ -f "$DB_FILE" ]; then
echo "[integrity] Checking DB integrity ..."
INTEGRITY=$(sqlite3 "$DB_FILE" "PRAGMA integrity_check;" 2>&1 | head -1)
if [ "$INTEGRITY" != "ok" ]; then
echo "[integrity] ERROR: DB integrity check failed: $INTEGRITY"
echo "[integrity] The restored database is corrupt."
echo "[integrity] β†’ Check your S3 bucket for a healthy backup."
echo "[integrity] β†’ If you have bucket versioning enabled, restore an earlier version."
exit 1
fi
echo "[integrity] DB is healthy: $INTEGRITY"
else
echo "[integrity] No existing DB found – Label Studio will create a fresh one."
fi
# ------------- 3. ensure admin user exists (idempotent) -------------------
echo "[user] Ensuring admin user exists ..."
label-studio user create \
--username "${LABEL_STUDIO_USERNAME:?'LABEL_STUDIO_USERNAME secret is required'}" \
--password "${LABEL_STUDIO_PASSWORD:?'LABEL_STUDIO_PASSWORD secret is required'}" \
--email "${LABEL_STUDIO_USERNAME}@localhost" \
--preserve-case 2>/dev/null || true
# ------------- 4. start Label Studio in the background --------------------
# Use $SPACE_HOST exactly as the official HF Dockerfile does β€” this is what
# makes HF's reverse proxy route traffic correctly to the container.
echo "[start] Starting Label Studio ..."
label-studio --host="$SPACE_HOST" &
LS_PID=$!
echo "[start] Label Studio PID: $LS_PID"
# Register shutdown handler NOW (after we have LS_PID)
trap shutdown_handler EXIT INT TERM
# ------------- 5. periodic sync loop (runs forever) -----------------------
echo "[loop] Entering sync loop (every ${SYNC_INTERVAL}s) ..."
while true; do
sleep "$SYNC_INTERVAL"
# Verify LS is still alive before syncing
if ! kill -0 "$LS_PID" 2>/dev/null; then
echo "[loop] Label Studio process has exited unexpectedly – stopping container"
exit 1
fi
sync_to_s3 "periodic"
done