#!/bin/bash
# entrypoint.sh — Hindsight wrapper with pg_dump backup/restore
#
# Flow:
# 1. Start Hindsight (/app/start-all.sh) — initializes PG + API
# 2. Wait for healthy (PG + API ready)
# 3. Restore from HF Dataset (pg_restore into running PG)
# 4. Trap SIGTERM → pg_dump → upload → shutdown
# 5. Periodic backup loop (pg_dump every BACKUP_INTERVAL_SECONDS)
# 6. If Hindsight dies → emergency pg_dump → exit 1
# Strict mode: abort on a failing command (errexit), on expansion of an unset
# variable (nounset), and when any stage of a pipeline fails (pipefail).
set -o errexit -o nounset -o pipefail

# Port used by the health probe and by the port-release check.
# HINDSIGHT_API_PORT overrides the default of 7860.
API_PORT="${HINDSIGHT_API_PORT:-7860}"

# Seconds between scheduled backups.
# BACKUP_INTERVAL_SECONDS overrides the default of 21600 (6 hours).
BACKUP_INTERVAL="${BACKUP_INTERVAL_SECONDS:-21600}"
# log MESSAGE...
# Write one line to stdout: the "[ENTRYPOINT]" tag, the current UTC time
# (YYYY-MM-DDTHH:MM:SSZ), then all arguments joined into a single string.
log() {
  printf '[ENTRYPOINT] %s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*"
}
# ============================================================
# kill_hindsight — Kill the entire process tree, wait for port release
# ============================================================

# _hindsight_descendants PID
# Print every descendant of PID (children, grandchildren, ...), one PID per
# line, each parent before its own children. Prints nothing when PID has no
# children or `pgrep` is unavailable.
_hindsight_descendants() {
  local parent="$1" child
  while IFS= read -r child; do
    [ -n "$child" ] || continue
    printf '%s\n' "$child"
    _hindsight_descendants "$child"
  done < <(pgrep -P "$parent" 2>/dev/null)
}

# _port_is_listening PORT
# Return 0 when `ss` or `netstat` lists a TCP listener on PORT, 1 otherwise
# (including when neither tool is usable).
_port_is_listening() {
  local port="$1" tool listing
  for tool in ss netstat; do
    # Capture the listing first instead of piping straight into `grep -q`:
    # under `set -o pipefail`, grep exiting at the first match can kill the
    # producer with SIGPIPE, which turns a successful match into a failed
    # pipeline and would make an occupied port look free.
    listing=$("$tool" -tlnp 2>/dev/null) || continue
    if grep -q ":${port} " <<<"$listing"; then
      return 0
    fi
  done
  return 1
}

# kill_hindsight PID
# Stop the Hindsight launcher PID together with all of its descendants, then
# wait for API_PORT to stop listening.
# Globals:   API_PORT (read)
# Arguments: $1 - PID of the background start-all.sh process
# Returns:   0 always; every kill is best-effort (already-dead PIDs ignored)
kill_hindsight() {
  local pid="$1"
  local -a targets=()
  local target
  local attempts=0

  log "Stopping Hindsight (PID $pid) and all children..."

  # Snapshot the whole tree BEFORE sending any signal. Matching by parent PID
  # only finds direct children, and finds nothing at all once the parent has
  # exited and its children have been reparented — so grandchildren and
  # survivors of the TERM pass would otherwise never be force-killed.
  mapfile -t targets < <(_hindsight_descendants "$pid")
  targets+=("$pid")

  # 1-2. Ask every descendant, then the launcher itself, to terminate.
  for target in "${targets[@]}"; do
    kill -TERM "$target" 2>/dev/null || true
  done
  sleep 3

  # 3. Force-kill whatever is still alive from the snapshot, then reap the
  #    launcher (it is a child of this shell).
  for target in "${targets[@]}"; do
    kill -9 "$target" 2>/dev/null || true
  done
  wait "$pid" 2>/dev/null || true

  # 4. Wait for the port to be released (up to 30s).
  while [ "$attempts" -lt 30 ]; do
    if ! _port_is_listening "$API_PORT"; then
      log "Port ${API_PORT} released"
      return 0
    fi
    attempts=$((attempts + 1))
    sleep 1
  done

  # 5. Last resort: kill anything still bound to the port.
  if command -v fuser >/dev/null 2>&1; then
    fuser -k "${API_PORT}/tcp" 2>/dev/null || true
    sleep 2
  fi
  log "Port cleanup complete (waited ${attempts}s)"
}
# ============================================================
# STEP 1: Launch Hindsight in the background
# (start-all.sh brings up the embedded PG and the API)
# ============================================================
log "Starting Hindsight (/app/start-all.sh)..."

# Run the launcher as a background job and remember its PID; the later steps
# use HINDSIGHT_PID for liveness checks and for shutdown.
/app/start-all.sh & HINDSIGHT_PID=$!

log "Hindsight started (PID ${HINDSIGHT_PID})"
# ============================================================
# STEP 2: SIGTERM/SIGINT handler — pg_dump before shutdown
# ============================================================

# cleanup
# Signal handler: take a final backup, stop Hindsight, exit 0.
# Globals: HINDSIGHT_PID (read), HF_TOKEN (read)
# The first log line says "SIGTERM" even when the handler was triggered by
# SIGINT, because both signals share this handler.
# NOTE(review): the handler is installed before the restore step, so a signal
# during startup backs up whatever the database holds at that moment —
# confirm backup.py cannot overwrite a good remote backup with an empty one.
cleanup() {
  log "SIGTERM received β performing final backup..."
  # Only attempt the backup when HF_TOKEN is set, matching the scheduled and
  # crash backups; without it the attempt just eats into the shutdown grace
  # period before failing.
  if [ -n "${HF_TOKEN:-}" ]; then
    python3 /opt/backup/backup.py shutdown || log "Final backup failed (non-fatal)"
  else
    log "HF_TOKEN not set — skipping final backup"
  fi
  kill_hindsight "$HINDSIGHT_PID"
  log "Shutdown complete"
  exit 0
}
trap cleanup SIGTERM SIGINT
# ============================================================
# STEP 3: Wait for Hindsight to be healthy
# ============================================================

# Number of health probes (about 5 s apart) before giving up.
# Override with HEALTH_CHECK_ATTEMPTS on hosts that start slowly.
HEALTH_CHECK_ATTEMPTS="${HEALTH_CHECK_ATTEMPTS:-60}"

# wait_for_healthy HEALTHY_MSG DIED_MSG
# Poll http://localhost:$API_PORT/health until it answers successfully.
# Globals:   API_PORT, HINDSIGHT_PID, HEALTH_CHECK_ATTEMPTS (all read)
# Arguments: $1 - message logged once the probe succeeds
#            $2 - message logged if the Hindsight process disappears
# Returns:   0 healthy, 1 process died, 2 still unhealthy after all attempts
wait_for_healthy() {
  local healthy_msg="$1" died_msg="$2"
  local attempt=1
  while [ "$attempt" -le "$HEALTH_CHECK_ATTEMPTS" ]; do
    # --max-time bounds each probe so a hung connection cannot stall the loop.
    if curl -sf --max-time 5 "http://localhost:${API_PORT}/health" >/dev/null 2>&1; then
      log "$healthy_msg"
      return 0
    fi
    if ! kill -0 "$HINDSIGHT_PID" 2>/dev/null; then
      log "$died_msg"
      return 1
    fi
    # Sleep in the background and wait on it so a trapped SIGTERM/SIGINT is
    # handled immediately instead of after the sleep finishes.
    sleep 5 &
    wait "$!" || true
    attempt=$((attempt + 1))
  done
  log "Hindsight not healthy after ${HEALTH_CHECK_ATTEMPTS} checks — giving up"
  return 2
}

# require_healthy HEALTHY_MSG DIED_MSG
# Like wait_for_healthy, but terminates the script with status 1 when the
# service died or never became healthy. On a timeout the still-running
# process tree is stopped first. Previously a timeout fell through silently
# and the script carried on against a service that was never ready.
require_healthy() {
  local rc=0
  wait_for_healthy "$@" || rc=$?
  case "$rc" in
    0) return 0 ;;
    2) kill_hindsight "$HINDSIGHT_PID" ;;
  esac
  exit 1
}

log "Waiting for Hindsight to become healthy..."
require_healthy "Hindsight is healthy" "Hindsight died during startup"

# ============================================================
# STEP 4: Restore from backup (PG is now running)
# After pg_restore, we must restart Hindsight so the API
# reconnects to PG and sees the restored data.
# Exit codes from restore.py:
# 0 = data restored successfully
# 2 = no backup found (skip restart)
# 1 = error
# ============================================================
if [ -n "${SKIP_RESTORE:-}" ]; then
  log "SKIP_RESTORE set β skipping restore (fresh start for clean seeding)"
elif [ -n "${HF_TOKEN:-}" ]; then
  log "Attempting restore from HF Dataset..."
  # Capture the status without tripping `set -e`.
  restore_exit=0
  python3 /opt/backup/restore.py || restore_exit=$?
  if [ "$restore_exit" -eq 0 ]; then
    log "Restore succeeded β restarting Hindsight to load restored data..."
    kill_hindsight "$HINDSIGHT_PID"
    /app/start-all.sh &
    HINDSIGHT_PID=$!
    log "Hindsight restarted (PID $HINDSIGHT_PID)"
    # Wait minimum 10s before health check — prevents hitting stale server
    sleep 10
    require_healthy "Hindsight is healthy after restore" "Hindsight died after restore restart"
  elif [ "$restore_exit" -eq 2 ]; then
    log "No backup found β continuing with fresh database"
  else
    log "Restore failed β continuing with fresh database"
  fi
else
  log "HF_TOKEN not set β skipping restore"
fi
# ============================================================
# STEP 5: Periodic backup loop
# ============================================================
# Wake up once a minute for as long as the Hindsight PID is alive. Whenever
# BACKUP_INTERVAL seconds have passed since the last mark, run a scheduled
# backup (only if HF_TOKEN is set) and reset the mark — the mark is reset
# whether or not the backup ran or succeeded.
last_backup=$(date +%s)
while true; do
  kill -0 "$HINDSIGHT_PID" 2>/dev/null || break

  # Background sleep + wait, so a trapped signal interrupts the pause.
  sleep 60 &
  wait $! || true # Allow SIGTERM to interrupt sleep

  current_ts=$(date +%s)
  since_last=$((current_ts - last_backup))
  [ "$since_last" -ge "$BACKUP_INTERVAL" ] || continue

  if [ -n "${HF_TOKEN:-}" ]; then
    log "Scheduled backup (elapsed: ${since_last}s)..."
    if ! python3 /opt/backup/backup.py scheduled; then
      log "Backup failed β will retry next cycle"
    fi
  fi
  last_backup=$(date +%s)
done
# ============================================================
# STEP 6: Hindsight died unexpectedly
# ============================================================
# Reached only after the backup loop found the Hindsight PID gone.
# Try one last backup (when HF_TOKEN is set), then exit non-zero to report
# the failure. A failed emergency backup is logged but does not change the
# exit status.
log "Hindsight process ended unexpectedly"
if [ -n "${HF_TOKEN:-}" ]; then
  log "Performing emergency backup..."
  python3 /opt/backup/backup.py crash || log "Emergency backup failed"
fi
exit 1