atum-hindsight / scripts /entrypoint.sh
Arnwald84's picture
fix: robust process kill and backup guard to prevent death spiral
04263be verified
#!/bin/bash
# entrypoint.sh — Hindsight wrapper with pg_dump backup/restore
#
# Flow:
# 1. Start Hindsight (/app/start-all.sh) — initializes PG + API
# 2. Wait for healthy (PG + API ready)
# 3. Restore from HF Dataset (pg_restore into running PG)
# 4. Trap SIGTERM → pg_dump → upload → shutdown
# 5. Periodic backup loop (pg_dump every BACKUP_INTERVAL_SECONDS)
# 6. If Hindsight dies → emergency pg_dump → exit 1
set -euo pipefail

# Seconds between scheduled backups; override with BACKUP_INTERVAL_SECONDS.
BACKUP_INTERVAL="${BACKUP_INTERVAL_SECONDS:-21600}" # default 6 hours
if ! [[ "$BACKUP_INTERVAL" =~ ^[0-9]+$ ]]; then
  # A non-numeric value would make the numeric `-ge` comparison in the
  # backup loop error out on every pass, so no scheduled backup would ever
  # run. Fall back to the default instead of failing silently.
  printf '[ENTRYPOINT] Invalid BACKUP_INTERVAL_SECONDS=%s; using 21600\n' \
    "$BACKUP_INTERVAL" >&2
  BACKUP_INTERVAL=21600
fi

# Port used for the /health probe and the port-release wait; override with
# HINDSIGHT_API_PORT.
API_PORT="${HINDSIGHT_API_PORT:-7860}"
# log — print all arguments as one line, prefixed with the [ENTRYPOINT] tag
# and the current UTC time in ISO-8601 form.
log() {
  printf '[ENTRYPOINT] %s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*"
}
# ============================================================
# kill_hindsight — stop Hindsight's whole process tree, then wait for the
# API port to be released
#
# Arguments:
#   $1 - PID of the Hindsight launcher process
#   $2 - (optional) seconds to wait between SIGTERM and SIGKILL; default 3
# Globals:
#   API_PORT (read) - TCP port that should be free before returning
# Returns:
#   0 once the port is free or the wait has been exhausted. Failures to
#   signal individual processes are ignored: they may already have exited.
# ============================================================

# Print every descendant PID of $1 (children, grandchildren, ...), one per line.
_hindsight_descendants() {
  local parent="$1"
  local child
  # Word-splitting of pgrep's output is intended: one PID per word.
  for child in $(pgrep -P "$parent" 2>/dev/null || true); do
    printf '%s\n' "$child"
    _hindsight_descendants "$child"
  done
}

# Succeed if ss (or, failing that, netstat) lists a TCP listener on port $1.
# If neither tool can be run, the port is reported as not busy.
_hindsight_port_busy() {
  local port="$1"
  local listing
  # Capture first, match second: under `set -o pipefail`, piping straight
  # into `grep -q` can report failure even on a match, because grep exits
  # early and the producer may then die from SIGPIPE.
  listing="$(ss -tln 2>/dev/null)" \
    || listing="$(netstat -tln 2>/dev/null)" \
    || return 1
  grep -q ":${port} " <<<"$listing"
}

kill_hindsight() {
  local pid="$1"
  local grace="${2:-3}"
  local tree member
  local attempts=0

  log "Stopping Hindsight (PID $pid) and all children..."

  # Snapshot the tree BEFORE signalling. `-P` only matches direct children,
  # and once an intermediate process exits its children are re-parented and
  # can no longer be found through the original PID.
  tree="$(_hindsight_descendants "$pid")"

  # 1. Ask all descendants to terminate (uvicorn workers, pg0, etc.)
  for member in $tree; do
    kill -TERM "$member" 2>/dev/null || true
  done

  # 2. Ask the parent to terminate
  kill -TERM "$pid" 2>/dev/null || true
  sleep "$grace"

  # 3. Force-kill any survivors, including processes spawned during the
  #    grace period
  for member in $tree $(_hindsight_descendants "$pid"); do
    kill -KILL "$member" 2>/dev/null || true
  done
  kill -KILL "$pid" 2>/dev/null || true
  wait "$pid" 2>/dev/null || true

  # 4. Wait for port to be released (up to 30s)
  while [ "$attempts" -lt 30 ]; do
    if ! _hindsight_port_busy "$API_PORT"; then
      log "Port ${API_PORT} released"
      return 0
    fi
    attempts=$((attempts + 1))
    sleep 1
  done

  # 5. Last resort: kill anything still on the port
  if command -v fuser >/dev/null 2>&1; then
    fuser -k "${API_PORT}/tcp" 2>/dev/null || true
    sleep 2
  fi
  log "Port cleanup complete (waited ${attempts}s)"
}
# ============================================================
# STEP 1: Start Hindsight (starts embedded PG + API)
# ============================================================
# Launch the upstream start script in the background so this wrapper keeps
# control and can run the health wait, restore and backup steps below.
log "Starting Hindsight (/app/start-all.sh)..."
/app/start-all.sh &
# $! must be read immediately after the background launch: it holds the PID
# of the most recent background job. HINDSIGHT_PID is used later for
# liveness checks (kill -0) and for shutdown.
HINDSIGHT_PID=$!
log "Hindsight started (PID $HINDSIGHT_PID)"
# ============================================================
# STEP 2: Shutdown handler — pg_dump before shutdown
# ============================================================
# cleanup — run a final backup, stop Hindsight, and exit 0.
#
# Arguments:
#   $1 - (optional) name of the signal being handled; only used in the log
#        line. Defaults to SIGTERM so a bare `cleanup` call keeps working.
# Globals:
#   HINDSIGHT_PID (read) - process handed to kill_hindsight
# Never returns: always ends the script with exit status 0.
cleanup() {
  local signal_name="${1:-SIGTERM}"

  log "${signal_name} received — performing final backup..."
  # A failed backup must not block shutdown, so it is only logged.
  if ! python3 /opt/backup/backup.py shutdown; then
    log "Final backup failed (non-fatal)"
  fi
  kill_hindsight "$HINDSIGHT_PID"
  log "Shutdown complete"
  exit 0
}
# One trap per signal so the log names the signal that actually arrived.
trap 'cleanup SIGTERM' SIGTERM
trap 'cleanup SIGINT' SIGINT
# ============================================================
# STEP 3: Wait for Hindsight to be healthy
# ============================================================
# Probe /health up to 60 times, pausing 5 s between attempts. Exits 1 if
# the Hindsight process disappears while waiting. If every attempt fails,
# a warning is logged and startup continues, so a slow cold start does not
# turn into a restart loop.
log "Waiting for Hindsight to become healthy..."
healthy=0
for ((i = 1; i <= 60; i++)); do
  # --max-time bounds each probe: a server that accepts the connection but
  # never answers would otherwise hang curl forever, bypassing the attempt
  # limit and deferring the shutdown trap until curl returns.
  if curl -sf --max-time 5 "http://localhost:${API_PORT}/health" >/dev/null 2>&1; then
    log "Hindsight is healthy"
    healthy=1
    break
  fi
  if ! kill -0 "$HINDSIGHT_PID" 2>/dev/null; then
    log "Hindsight died during startup"
    exit 1
  fi
  sleep 5
done
if [ "$healthy" -ne 1 ]; then
  log "WARNING: Hindsight did not report healthy after 60 attempts; continuing anyway"
fi
# ============================================================
# STEP 4: Restore from backup (PG is now running)
# After pg_restore, we must restart Hindsight so the API
# reconnects to PG and sees the restored data.
# Exit codes from restore.py:
# 0 = data restored successfully
# 2 = no backup found (skip restart)
# 1 = error
# Any other non-zero code is treated like an error.
# ============================================================
if [ -n "${SKIP_RESTORE:-}" ]; then
  log "SKIP_RESTORE set — skipping restore (fresh start for clean seeding)"
elif [ -n "${HF_TOKEN:-}" ]; then
  log "Attempting restore from HF Dataset..."
  # Capture the status without tripping `set -e`.
  restore_exit=0
  python3 /opt/backup/restore.py || restore_exit=$?

  if [ "$restore_exit" -eq 0 ]; then
    log "Restore succeeded — restarting Hindsight to load restored data..."
    kill_hindsight "$HINDSIGHT_PID"
    /app/start-all.sh &
    HINDSIGHT_PID=$!
    log "Hindsight restarted (PID $HINDSIGHT_PID)"

    # Wait minimum 10s before health check — prevents hitting stale server
    sleep 10
    healthy_after_restore=0
    for ((i = 1; i <= 60; i++)); do
      # --max-time bounds each probe so a stalled connection cannot hang
      # the loop or delay the shutdown trap indefinitely.
      if curl -sf --max-time 5 "http://localhost:${API_PORT}/health" >/dev/null 2>&1; then
        log "Hindsight is healthy after restore"
        healthy_after_restore=1
        break
      fi
      if ! kill -0 "$HINDSIGHT_PID" 2>/dev/null; then
        log "Hindsight died after restore restart"
        exit 1
      fi
      sleep 5
    done
    if [ "$healthy_after_restore" -ne 1 ]; then
      log "WARNING: Hindsight did not report healthy after restore (60 attempts); continuing anyway"
    fi
  elif [ "$restore_exit" -eq 2 ]; then
    log "No backup found — continuing with fresh database"
  else
    # NOTE(review): after a failed restore the periodic backup loop may
    # later upload this fresh database — confirm backup.py guards against
    # overwriting a good backup.
    log "Restore failed — continuing with fresh database"
  fi
else
  log "HF_TOKEN not set — skipping restore"
fi
# ============================================================
# STEP 5: Periodic backup loop
# ============================================================
# Runs for as long as the Hindsight process exists. Wakes once a minute and
# triggers a scheduled backup whenever BACKUP_INTERVAL seconds have passed
# since the previous attempt. Backups are skipped when HF_TOKEN is unset.
last_backup=$(date +%s)
while kill -0 "$HINDSIGHT_PID" 2>/dev/null; do
  # Sleep in the background and `wait` on it: unlike a foreground sleep,
  # `wait` returns as soon as a trapped signal arrives, so the shutdown
  # handler runs promptly.
  sleep 60 &
  wait "$!" || true # Allow SIGTERM to interrupt sleep

  now=$(date +%s)
  elapsed=$((now - last_backup))
  if [ "$elapsed" -ge "$BACKUP_INTERVAL" ]; then
    if [ -n "${HF_TOKEN:-}" ]; then
      log "Scheduled backup (elapsed: ${elapsed}s)..."
      python3 /opt/backup/backup.py scheduled \
        || log "Backup failed — will retry next cycle"
    fi
    # The timer is reset whether or not the backup ran or succeeded, so the
    # next attempt happens one full interval from now.
    last_backup=$(date +%s)
  fi
done
# ============================================================
# STEP 6: Hindsight died unexpectedly
# ============================================================
# Reached only when the liveness loop ends, i.e. the Hindsight PID no
# longer exists. Attempt one last backup when credentials are available,
# then exit with status 1 in every case.
log "Hindsight process ended unexpectedly"
if [ -z "${HF_TOKEN:-}" ]; then
  exit 1
fi
log "Performing emergency backup..."
if ! python3 /opt/backup/backup.py crash; then
  log "Emergency backup failed"
fi
exit 1