Spaces:
Runtime error
Runtime error
# entrypoint.sh — Hindsight wrapper with pg_dump backup/restore
#
# Flow:
#   1. Start Hindsight (/app/start-all.sh) → initializes PG + API
#   2. Wait for healthy (PG + API ready)
#   3. Restore from HF Dataset (pg_restore into running PG)
#   4. Trap SIGTERM → pg_dump → upload → shutdown
#   5. Periodic backup loop (pg_dump every BACKUP_INTERVAL_SECONDS)
#   6. If Hindsight dies → emergency pg_dump → exit 1
set -euo pipefail

# Print $1 if it is a non-negative integer; otherwise warn on stderr and
# print the fallback $2. $3 is the setting's name, used only in the warning.
_uint_or_default() {
  local value="$1" fallback="$2" name="$3"
  if [[ "$value" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$value"
  else
    printf '[ENTRYPOINT] WARNING: %s=%q is not a non-negative integer; using %s\n' \
      "$name" "$value" "$fallback" >&2
    printf '%s\n' "$fallback"
  fi
}

# Seconds between scheduled backups (default 6 hours). Validated up front:
# a non-numeric value would make the integer comparison in the backup loop
# error out on every cycle, which silently disables scheduled backups.
BACKUP_INTERVAL="$(_uint_or_default "${BACKUP_INTERVAL_SECONDS:-21600}" 21600 BACKUP_INTERVAL_SECONDS)"

# TCP port used for the health checks and the port-release wait.
API_PORT="${HINDSIGHT_API_PORT:-7860}"
# Write one line to stdout: the [ENTRYPOINT] tag, the current UTC time in
# ISO-8601 form, then all arguments joined into a single string.
log() {
  printf '[ENTRYPOINT] %s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*"
}
# ============================================================
# kill_hindsight — Kill the entire process tree, wait for port release
# ============================================================

# Succeed if `ss` or `netstat` lists a TCP listener on port $1.
# The listings are captured first and matched inside the shell: piping into
# `grep -q` under `set -o pipefail` can turn a match into a failure when grep
# exits early and the producer is killed by SIGPIPE, which would make a busy
# port look released.
_port_listening() {
  local port="$1" listing
  listing="$(
    ss -tln 2>/dev/null || true
    netstat -tln 2>/dev/null || true
  )"
  [[ "$listing" == *":${port} "* ]]
}

#######################################
# Stop Hindsight and wait for the API port to be released.
# Globals:   API_PORT (read)
# Arguments: $1 - PID of the Hindsight launcher process
# Returns:   0 always; every step is best-effort.
#######################################
kill_hindsight() {
  local pid="$1"
  local attempts=0

  log "Stopping Hindsight (PID $pid) and all children..."

  # 1. Ask the children to stop first (uvicorn workers, pg0, etc.).
  # NOTE(review): pkill -P matches direct children only; deeper descendants
  # are not signalled here — confirm that is sufficient for start-all.sh.
  pkill -TERM -P "$pid" 2>/dev/null || true

  # 2. Then the parent itself.
  kill -TERM "$pid" 2>/dev/null || true
  sleep 3

  # 3. Force-kill any survivors and reap the parent.
  pkill -9 -P "$pid" 2>/dev/null || true
  kill -9 "$pid" 2>/dev/null || true
  wait "$pid" 2>/dev/null || true

  # 4. Wait for the port to be released (up to 30 checks, one second apart).
  while [ "$attempts" -lt 30 ]; do
    if ! _port_listening "$API_PORT"; then
      log "Port ${API_PORT} released"
      return 0
    fi
    attempts=$((attempts + 1))
    sleep 1
  done

  # 5. Last resort: kill anything still holding the port.
  if command -v fuser >/dev/null 2>&1; then
    fuser -k "${API_PORT}/tcp" 2>/dev/null || true
    sleep 2
  fi
  log "Port cleanup complete (waited ${attempts}s)"
}
# ============================================================
# STEP 1: Start Hindsight (starts embedded PG + API)
# ============================================================
log "Starting Hindsight (/app/start-all.sh)..."
# Launched as a background job so this script keeps running; $! is the PID
# of that job and is kept for later liveness checks and shutdown.
/app/start-all.sh &
HINDSIGHT_PID="$!"
log "Hindsight started (PID ${HINDSIGHT_PID})"
# ============================================================
# STEP 2: SIGTERM handler — pg_dump before shutdown
# ============================================================

#######################################
# Signal handler for SIGTERM/SIGINT: take a final backup, stop Hindsight,
# then exit 0. The backup is attempted only when HF_TOKEN is set, matching
# the scheduled and crash backups; a failed backup is logged and does not
# block shutdown.
# Globals: HF_TOKEN (read), HINDSIGHT_PID (read)
#######################################
cleanup() {
  if [[ -n "${HF_TOKEN:-}" ]]; then
    log "SIGTERM received — performing final backup..."
    python3 /opt/backup/backup.py shutdown || log "Final backup failed (non-fatal)"
  else
    log "SIGTERM received — HF_TOKEN not set, skipping final backup"
  fi
  kill_hindsight "$HINDSIGHT_PID"
  log "Shutdown complete"
  exit 0
}
trap cleanup SIGTERM SIGINT
# ============================================================
# STEP 3: Wait for Hindsight to be healthy
# ============================================================

#######################################
# Poll the API /health endpoint until it answers or the attempts run out.
# Globals:   API_PORT (read), HINDSIGHT_PID (read)
# Arguments: $1 - message logged once the endpoint answers
#            $2 - message logged if the Hindsight process is gone
# Returns:   0 once healthy; 1 if still unhealthy after 60 polls spaced
#            5 seconds apart. Exits the script with status 1 if the
#            Hindsight process no longer exists.
#######################################
wait_for_healthy() {
  local healthy_msg="$1" died_msg="$2"
  local attempt
  for ((attempt = 1; attempt <= 60; attempt++)); do
    if curl -sf "http://localhost:${API_PORT}/health" >/dev/null 2>&1; then
      log "$healthy_msg"
      return 0
    fi
    if ! kill -0 "$HINDSIGHT_PID" 2>/dev/null; then
      log "$died_msg"
      exit 1
    fi
    sleep 5
  done
  return 1
}

log "Waiting for Hindsight to become healthy..."
# A timeout is reported instead of passing silently; startup still continues
# as long as the process is alive.
wait_for_healthy "Hindsight is healthy" "Hindsight died during startup" ||
  log "WARNING: Hindsight not healthy after 60 checks — continuing anyway"

# ============================================================
# STEP 4: Restore from backup (PG is now running)
# After pg_restore, we must restart Hindsight so the API
# reconnects to PG and sees the restored data.
# Exit codes from restore.py:
#   0 = data restored successfully
#   2 = no backup found (skip restart)
#   1 = error
# ============================================================
if [[ -n "${SKIP_RESTORE:-}" ]]; then
  log "SKIP_RESTORE set — skipping restore (fresh start for clean seeding)"
elif [[ -n "${HF_TOKEN:-}" ]]; then
  log "Attempting restore from HF Dataset..."
  # Capture the status without tripping `set -e`.
  restore_exit=0
  python3 /opt/backup/restore.py || restore_exit=$?
  case "$restore_exit" in
    0)
      log "Restore succeeded — restarting Hindsight to load restored data..."
      kill_hindsight "$HINDSIGHT_PID"
      /app/start-all.sh &
      HINDSIGHT_PID=$!
      log "Hindsight restarted (PID $HINDSIGHT_PID)"
      # Wait minimum 10s before health check — prevents hitting stale server
      sleep 10
      wait_for_healthy "Hindsight is healthy after restore" \
        "Hindsight died after restore restart" ||
        log "WARNING: Hindsight not healthy after restore restart — continuing anyway"
      ;;
    2)
      log "No backup found — continuing with fresh database"
      ;;
    *)
      log "Restore failed — continuing with fresh database"
      ;;
  esac
else
  log "HF_TOKEN not set — skipping restore"
fi
# ============================================================
# STEP 5: Periodic backup loop
# ============================================================
# Runs while the Hindsight process exists. Wakes about once a minute and
# runs a scheduled backup once BACKUP_INTERVAL seconds have passed since the
# previous attempt. The timer is reset after every attempt, successful or not.
last_backup=$(date +%s)
while kill -0 "$HINDSIGHT_PID" 2>/dev/null; do
  # Sleep as a background job and wait on it: `wait` returns as soon as a
  # trapped signal arrives, so SIGTERM is handled without finishing the sleep.
  sleep 60 &
  wait "$!" || true
  now=$(date +%s)
  elapsed=$((now - last_backup))
  if [ "$elapsed" -ge "$BACKUP_INTERVAL" ]; then
    if [ -n "${HF_TOKEN:-}" ]; then
      log "Scheduled backup (elapsed: ${elapsed}s)..."
      python3 /opt/backup/backup.py scheduled ||
        log "Backup failed — will retry next cycle"
    fi
    last_backup=$(date +%s)
  fi
done
# ============================================================
# STEP 6: Hindsight died unexpectedly
# ============================================================
# Control reaches this point once the backup loop's liveness check on
# HINDSIGHT_PID fails. Try one last backup when HF_TOKEN is configured,
# then exit with a failure status.
log "Hindsight process ended unexpectedly"
if [[ -n "${HF_TOKEN:-}" ]]; then
  log "Performing emergency backup..."
  if ! python3 /opt/backup/backup.py crash; then
    log "Emergency backup failed"
  fi
fi
exit 1