#!/bin/bash
# entrypoint.sh — Hindsight wrapper with pg_dump backup/restore
#
# Flow:
#   1. Start Hindsight (/app/start-all.sh) — initializes PG + API
#   2. Trap SIGTERM/SIGINT → pg_dump → upload → shutdown
#   3. Wait for healthy (PG + API ready); stop and exit 1 on timeout
#   4. Restore from HF Dataset (pg_restore into running PG), then restart
#   5. Periodic backup loop (pg_dump every BACKUP_INTERVAL_SECONDS)
#   6. If Hindsight dies → emergency pg_dump → exit 1
#
# Environment:
#   BACKUP_INTERVAL_SECONDS  seconds between scheduled backups (default 21600)
#   HINDSIGHT_API_PORT       port probed for /health (default 7860)
#   HF_TOKEN                 when non-empty, enables restore and the
#                            scheduled/crash backups
#   SKIP_RESTORE             when non-empty, skips the restore step

set -euo pipefail

BACKUP_INTERVAL="${BACKUP_INTERVAL_SECONDS:-21600}" # default 6 hours
API_PORT="${HINDSIGHT_API_PORT:-7860}"

# Health probing: 60 attempts, 5 seconds apart (about 5 minutes in total).
readonly HEALTH_MAX_ATTEMPTS=60
readonly HEALTH_POLL_SECONDS=5

# ============================================================
# log — Print a UTC-timestamped, prefixed message to stdout
# Arguments: $* - message text
# ============================================================
log() {
  printf '[ENTRYPOINT] %s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*"
}

# ============================================================
# port_in_use — Report whether something listens on API_PORT
# Globals:   API_PORT (read)
# Returns:   0 if ss or netstat lists a listener on the port, 1 otherwise
#            (including when neither tool is available)
# ============================================================
port_in_use() {
  local listing

  # Capture the listing before matching. With `set -o pipefail`, a
  # `tool | grep -q` pipeline can report failure when grep exits on the
  # first match and the producer receives SIGPIPE, which would be misread
  # as "port free".
  listing="$(ss -tlnp 2>/dev/null || true)"
  if grep -q ":${API_PORT} " <<<"$listing"; then
    return 0
  fi

  listing="$(netstat -tlnp 2>/dev/null || true)"
  grep -q ":${API_PORT} " <<<"$listing"
}

# ============================================================
# kill_hindsight — Stop Hindsight and wait for the API port to be released
# Globals:   API_PORT (read)
# Arguments: $1 - PID of the /app/start-all.sh process
# Returns:   0 always (every step is best-effort)
# Note: pkill -P only signals *direct* children of $1. Deeper descendants
# are only caught by the port-based fuser fallback in step 5.
# ============================================================
kill_hindsight() {
  local pid="$1"
  log "Stopping Hindsight (PID $pid) and all children..."

  # 1. Ask the direct children to terminate first (uvicorn workers, pg0, etc.)
  pkill -TERM -P "$pid" 2>/dev/null || true
  # 2. Then the parent
  kill -TERM "$pid" 2>/dev/null || true
  sleep 3

  # 3. Force-kill any survivors and reap the parent
  pkill -9 -P "$pid" 2>/dev/null || true
  kill -9 "$pid" 2>/dev/null || true
  wait "$pid" 2>/dev/null || true

  # 4. Wait for the port to be released (up to 30s)
  local attempts=0
  while [ "$attempts" -lt 30 ]; do
    if ! port_in_use; then
      log "Port ${API_PORT} released"
      return 0
    fi
    attempts=$((attempts + 1))
    sleep 1
  done

  # 5. Last resort: kill anything still holding the port
  if command -v fuser >/dev/null 2>&1; then
    fuser -k "${API_PORT}/tcp" 2>/dev/null || true
    sleep 2
  fi
  log "Port cleanup complete (waited ${attempts}s)"
}

# ============================================================
# wait_for_healthy — Poll the /health endpoint until it answers
# Globals:   API_PORT, HINDSIGHT_PID, HEALTH_MAX_ATTEMPTS,
#            HEALTH_POLL_SECONDS (all read)
# Arguments: $1 - message logged once the API is healthy
#            $2 - message logged if the Hindsight process has exited
# Returns:   0 healthy, 1 process died, 2 timed out while still running
# ============================================================
wait_for_healthy() {
  local healthy_msg="$1"
  local died_msg="$2"
  local attempt

  for ((attempt = 1; attempt <= HEALTH_MAX_ATTEMPTS; attempt++)); do
    if curl -sf "http://localhost:${API_PORT}/health" >/dev/null 2>&1; then
      log "$healthy_msg"
      return 0
    fi
    if ! kill -0 "$HINDSIGHT_PID" 2>/dev/null; then
      log "$died_msg"
      return 1
    fi
    sleep "$HEALTH_POLL_SECONDS"
  done

  log "Hindsight did not become healthy within $((HEALTH_MAX_ATTEMPTS * HEALTH_POLL_SECONDS))s — giving up"
  return 2
}

# ============================================================
# require_healthy — wait_for_healthy, but abort the script on failure
# Arguments: same as wait_for_healthy
# Exits 1 if Hindsight died or never became healthy. On timeout the
# still-running process is stopped first so it is not left orphaned.
# ============================================================
require_healthy() {
  local status=0
  # `|| status=$?` keeps `set -e` from aborting before the status is inspected.
  wait_for_healthy "$1" "$2" || status=$?
  case "$status" in
    0) return 0 ;;
    2) kill_hindsight "$HINDSIGHT_PID" ;;
  esac
  exit 1
}

# ============================================================
# cleanup — SIGTERM/SIGINT handler: final backup, then stop Hindsight
# Globals:   HINDSIGHT_PID (read)
# NOTE(review): unlike the scheduled and crash backups, this runs
# backup.py regardless of HF_TOKEN, and it can fire before the restore
# step has finished — confirm backup.py handles both cases safely.
# ============================================================
cleanup() {
  log "SIGTERM received — performing final backup..."
  python3 /opt/backup/backup.py shutdown || log "Final backup failed (non-fatal)"
  kill_hindsight "$HINDSIGHT_PID"
  log "Shutdown complete"
  exit 0
}

# ============================================================
# STEP 1: Start Hindsight (starts embedded PG + API)
# ============================================================
log "Starting Hindsight (/app/start-all.sh)..."
/app/start-all.sh &
HINDSIGHT_PID=$!
log "Hindsight started (PID $HINDSIGHT_PID)"

# ============================================================
# STEP 2: SIGTERM handler — pg_dump before shutdown
# ============================================================
trap cleanup SIGTERM SIGINT

# ============================================================
# STEP 3: Wait for Hindsight to be healthy
# ============================================================
log "Waiting for Hindsight to become healthy..."
require_healthy "Hindsight is healthy" "Hindsight died during startup"

# ============================================================
# STEP 4: Restore from backup (PG is now running)
# After pg_restore, we must restart Hindsight so the API
# reconnects to PG and sees the restored data.
# Exit codes from restore.py:
#   0 = data restored successfully
#   2 = no backup found (skip restart)
#   1 = error
# ============================================================
if [[ -n "${SKIP_RESTORE:-}" ]]; then
  log "SKIP_RESTORE set — skipping restore (fresh start for clean seeding)"
elif [[ -n "${HF_TOKEN:-}" ]]; then
  log "Attempting restore from HF Dataset..."
  restore_exit=0
  python3 /opt/backup/restore.py || restore_exit=$?

  case "$restore_exit" in
    0)
      log "Restore succeeded — restarting Hindsight to load restored data..."
      kill_hindsight "$HINDSIGHT_PID"
      /app/start-all.sh &
      HINDSIGHT_PID=$!
      log "Hindsight restarted (PID $HINDSIGHT_PID)"
      # Wait minimum 10s before health check — prevents hitting stale server
      sleep 10
      require_healthy "Hindsight is healthy after restore" \
        "Hindsight died after restore restart"
      ;;
    2)
      log "No backup found — continuing with fresh database"
      ;;
    *)
      log "Restore failed — continuing with fresh database"
      ;;
  esac
else
  log "HF_TOKEN not set — skipping restore"
fi

# ============================================================
# STEP 5: Periodic backup loop
# ============================================================
last_backup=$(date +%s)
while kill -0 "$HINDSIGHT_PID" 2>/dev/null; do
  # Sleep in the background and `wait` on it so a trapped signal can
  # interrupt the wait instead of being deferred until sleep finishes.
  sleep 60 &
  wait "$!" || true

  now=$(date +%s)
  elapsed=$((now - last_backup))
  if [ "$elapsed" -ge "$BACKUP_INTERVAL" ]; then
    if [[ -n "${HF_TOKEN:-}" ]]; then
      log "Scheduled backup (elapsed: ${elapsed}s)..."
      python3 /opt/backup/backup.py scheduled \
        || log "Backup failed — will retry next cycle"
    fi
    # Reset the timer even after a failed or skipped backup, so the next
    # attempt happens one full interval later.
    last_backup=$(date +%s)
  fi
done

# ============================================================
# STEP 6: Hindsight died unexpectedly
# ============================================================
log "Hindsight process ended unexpectedly"
if [[ -n "${HF_TOKEN:-}" ]]; then
  log "Performing emergency backup..."
  python3 /opt/backup/backup.py crash || log "Emergency backup failed"
fi
exit 1