#!/bin/bash
# =============================================================================
# entrypoint.sh – Label Studio wrapper for Hugging Face Spaces
#
# What this script does:
#   1. On startup  : restores SQLite DB + exports from S3
#   2. After restore: verifies DB integrity before starting Label Studio
#   3. Starts Label Studio in the background
#   4. Every 60 s  : checkpoints WAL then syncs /data → S3
#   5. On SIGTERM  : does one final sync before the pod dies
#
# Required HF Space secrets:
#   BACKUP_S3_BUCKET        – bucket name (no s3:// prefix)
#   LABEL_STUDIO_USERNAME   – admin username / email
#   LABEL_STUDIO_PASSWORD   – admin password
#
# Optional HF Space secrets (with defaults shown):
#   BACKUP_S3_PREFIX        – labelstudio
#   AWS_DEFAULT_REGION      – us-east-1
#   AWS_ENDPOINT_URL        – https://s3.amazonaws.com (change for R2/MinIO)
#   SYNC_INTERVAL_SECONDS   – 60
# =============================================================================

set -euo pipefail

# ------------- configuration (all from HF secrets / env) -------------------
BUCKET="${BACKUP_S3_BUCKET:?BACKUP_S3_BUCKET secret is required}"
PREFIX="${BACKUP_S3_PREFIX:-labelstudio}"
REGION="${AWS_DEFAULT_REGION:-us-east-1}"
ENDPOINT="${AWS_ENDPOINT_URL:-https://s3.amazonaws.com}"
SYNC_INTERVAL="${SYNC_INTERVAL_SECONDS:-60}"
readonly BUCKET PREFIX REGION ENDPOINT SYNC_INTERVAL

readonly DATA_DIR=/data
# Label Studio uses label_studio.sqlite3 (not .db) — confirmed from live logs
readonly DB_FILE="$DATA_DIR/label_studio.sqlite3"
readonly S3_PATH="s3://${BUCKET}/${PREFIX}"

export AWS_DEFAULT_REGION="$REGION"

# ------------- helper: WAL checkpoint + S3 upload --------------------------
# Arguments: $1 - label for log lines ("periodic" / "shutdown" / …)
# Outputs:   progress messages to stdout
# Returns:   always 0 — a failed upload is logged, never fatal, so one bad
#            cycle does not kill the container under `set -e`.
sync_to_s3() {
  local reason="${1:-periodic}"
  echo "[sync] Starting $reason sync to $S3_PATH/ ..."

  # Checkpoint the WAL so S3 always receives a single self-consistent file.
  # TRUNCATE: folds all WAL frames into the main DB file and zeros the WAL.
  # This briefly blocks writers (not readers) — safe for ≤10 concurrent users.
  if [ -f "$DB_FILE" ]; then
    sqlite3 "$DB_FILE" "PRAGMA wal_checkpoint(TRUNCATE);" 2>/dev/null || \
      echo "[sync] WAL checkpoint skipped (DB may be locked) – proceeding anyway"
  fi

  aws s3 sync "$DATA_DIR/" "$S3_PATH/" \
      --endpoint-url "$ENDPOINT" \
      --no-progress \
    && echo "[sync] $reason sync complete" \
    || echo "[sync] WARNING: $reason sync failed – will retry next cycle"
}

# ------------- graceful shutdown --------------------------------------------
# Final snapshot + LS teardown. Runs exactly once: the EXIT trap covers
# normal exits, and the signal trap below disarms the EXIT trap before
# exiting so the handler can never fire twice (the original trapped
# EXIT INT TERM with one handler, which double-ran the final sync on SIGTERM).
shutdown_handler() {
  echo ""
  echo "[shutdown] SIGTERM received – uploading final snapshot before exit ..."
  sync_to_s3 "shutdown"
  echo "[shutdown] Stopping Label Studio (PID $LS_PID) ..."
  kill "$LS_PID" 2>/dev/null || true
  wait "$LS_PID" 2>/dev/null || true
  echo "[shutdown] Done. Goodbye."
}

# ------------- 1. restore from S3 ------------------------------------------
echo "============================================================"
echo " Label Studio | HF Spaces + S3 backup | $(date -u)"
echo " Bucket  : $S3_PATH/"
echo " Data dir: $DATA_DIR"
echo " Sync    : every ${SYNC_INTERVAL}s"
echo "============================================================"

mkdir -p "$DATA_DIR"

echo "[restore] Syncing data DOWN from $S3_PATH/ ..."
# A fresh bucket (nothing to restore) must not abort the script, so the
# non-zero branch is handled explicitly instead of letting `set -e` fire.
aws s3 sync "$S3_PATH/" "$DATA_DIR/" \
    --endpoint-url "$ENDPOINT" \
    --no-progress \
  && echo "[restore] Restore complete" \
  || echo "[restore] Restore returned non-zero (may be first run) – continuing"

# ------------- 2. DB integrity check before starting LS --------------------
if [ -f "$DB_FILE" ]; then
  echo "[integrity] Checking DB integrity ..."
  # integrity_check prints "ok" as its only row when healthy; on corruption it
  # emits one line per problem, so head -1 is enough to decide.
  INTEGRITY=$(sqlite3 "$DB_FILE" "PRAGMA integrity_check;" 2>&1 | head -1)
  if [ "$INTEGRITY" != "ok" ]; then
    echo "[integrity] ERROR: DB integrity check failed: $INTEGRITY"
    echo "[integrity] The restored database is corrupt."
    echo "[integrity] → Check your S3 bucket for a healthy backup."
    echo "[integrity] → If you have bucket versioning enabled, restore an earlier version."
    exit 1
  fi
  echo "[integrity] DB is healthy: $INTEGRITY"
else
  echo "[integrity] No existing DB found – Label Studio will create a fresh one."
fi

# ------------- 3. ensure admin user exists (idempotent) --------------------
echo "[user] Ensuring admin user exists ..."
# `|| true`: the command fails harmlessly when the user already exists.
# NOTE(review): the derived "<username>@localhost" email is odd if the
# username is itself an email address — confirm against LS CLI docs.
label-studio user create \
  --username "${LABEL_STUDIO_USERNAME:?LABEL_STUDIO_USERNAME secret is required}" \
  --password "${LABEL_STUDIO_PASSWORD:?LABEL_STUDIO_PASSWORD secret is required}" \
  --email "${LABEL_STUDIO_USERNAME}@localhost" \
  --preserve-case 2>/dev/null || true

# ------------- 4. start Label Studio in the background ---------------------
# Use $SPACE_HOST exactly as the official HF Dockerfile does — this is what
# makes HF's reverse proxy route traffic correctly to the container.
# Default to 0.0.0.0 so `set -u` doesn't kill the script when run outside
# HF Spaces (where SPACE_HOST is not set).
echo "[start] Starting Label Studio ..."
label-studio --host="${SPACE_HOST:-0.0.0.0}" &
LS_PID=$!
echo "[start] Label Studio PID: $LS_PID"

# Register shutdown handlers NOW (after we have LS_PID).
# EXIT covers normal exit paths; INT/TERM snapshot, disarm EXIT, then leave.
trap shutdown_handler EXIT
trap 'shutdown_handler; trap - EXIT; exit 0' INT TERM

# ------------- 5. periodic sync loop (runs forever) ------------------------
echo "[loop] Entering sync loop (every ${SYNC_INTERVAL}s) ..."
while true; do
  # Background the sleep so a SIGTERM interrupts `wait` immediately instead
  # of being deferred for up to SYNC_INTERVAL (bash runs traps only after a
  # foreground child exits) — the HF grace period is short.
  sleep "$SYNC_INTERVAL" &
  wait $! || true

  # Verify LS is still alive before syncing
  if ! kill -0 "$LS_PID" 2>/dev/null; then
    echo "[loop] Label Studio process has exited unexpectedly – stopping container"
    exit 1
  fi

  sync_to_s3 "periodic"
done