Spaces:

Arnwald84
/

atum-hindsight

Runtime error

App Files Files Community

atum-hindsight / scripts /entrypoint.sh

Arnwald84

fix: robust process kill and backup guard to prevent death spiral

04263be verified 3 months ago

raw

history blame contribute delete

5.88 kB

	#!/bin/bash
	# entrypoint.sh — Hindsight wrapper with pg_dump backup/restore
	#
	# Flow (in execution order):
	# 1. Start Hindsight (/app/start-all.sh) — initializes PG + API
	# 2. Trap SIGTERM/SIGINT → pg_dump → upload → shutdown
	# 3. Wait for healthy (PG + API ready)
	# 4. Restore from HF Dataset (pg_restore into running PG), then restart Hindsight
	# 5. Periodic backup loop (pg_dump every BACKUP_INTERVAL_SECONDS)
	# 6. If Hindsight dies → emergency pg_dump → exit 1
	# Strict mode: abort on an unhandled command failure, on expansion of an
	# unset variable, and when any stage of a pipeline fails.
	set -o errexit -o nounset -o pipefail

	# Seconds between scheduled backups; taken from BACKUP_INTERVAL_SECONDS,
	# falling back to 21600 (6 hours) when that is unset or empty.
	BACKUP_INTERVAL=${BACKUP_INTERVAL_SECONDS:-21600}
	# TCP port used for the health checks and the port-release wait; taken from
	# HINDSIGHT_API_PORT, falling back to 7860 when that is unset or empty.
	API_PORT=${HINDSIGHT_API_PORT:-7860}

	# log MESSAGE... — print the arguments (joined into one string) on stdout as
	# a single line prefixed with the [ENTRYPOINT] tag and the current UTC time.
	log() {
	  printf '[ENTRYPOINT] %s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*"
	}

	# ============================================================
	# kill_hindsight — Kill the entire process tree, wait for port release
	# ============================================================
	# _hindsight_port_in_use PORT
	# Returns 0 when `ss` or `netstat` lists a listening TCP socket whose line
	# contains ":PORT ", and 1 otherwise (including when neither tool produces
	# output). The listing is captured before it is searched so that `grep -q`
	# exiting early cannot make the producer die of SIGPIPE and, under
	# `set -o pipefail`, turn a successful match into a failed pipeline.
	_hindsight_port_in_use() {
	  local port="$1" tool listing
	  for tool in ss netstat; do
	    listing="$("$tool" -tlnp 2>/dev/null || true)"
	    if grep -q ":${port} " <<<"$listing"; then
	      return 0
	    fi
	  done
	  return 1
	}

	# kill_hindsight PID
	# Stops the Hindsight launcher process PID and its direct children, then
	# waits for API_PORT to stop being listened on.
	#   Globals:   API_PORT (read); calls log
	#   Arguments: $1 - PID of the background /app/start-all.sh job
	#   Returns:   0 in all cases; every kill step is best-effort
	# Note: `pkill -P` only matches processes whose parent is PID, so deeper
	# descendants are not signalled directly; the final `fuser -k` step is what
	# catches whatever still holds the port.
	kill_hindsight() {
	  local pid="$1"
	  log "Stopping Hindsight (PID $pid) and all children..."

	  # 1. Ask the children (uvicorn workers, pg0, etc.) and then the parent to
	  #    terminate. Errors are ignored on purpose: the processes may already
	  #    be gone.
	  pkill -TERM -P "$pid" 2>/dev/null || true
	  kill -TERM "$pid" 2>/dev/null || true
	  sleep 3

	  # 2. Force-kill any survivors and reap the parent if it is our child.
	  pkill -9 -P "$pid" 2>/dev/null || true
	  kill -9 "$pid" 2>/dev/null || true
	  wait "$pid" 2>/dev/null || true

	  # 3. Wait for the port to be released (one check per second, up to 30).
	  local attempts=0
	  while ((attempts < 30)); do
	    if ! _hindsight_port_in_use "$API_PORT"; then
	      log "Port ${API_PORT} released"
	      return 0
	    fi
	    attempts=$((attempts + 1))
	    sleep 1
	  done

	  # 4. Last resort: kill anything still bound to the port, if fuser exists.
	  if command -v fuser >/dev/null 2>&1; then
	    fuser -k "${API_PORT}/tcp" 2>/dev/null || true
	    sleep 2
	  fi
	  log "Port cleanup complete (waited ${attempts}s)"
	}

	# ============================================================
	# STEP 1: Start Hindsight (starts embedded PG + API)
	# ============================================================
	# Launch the upstream start script as a background job and remember its PID
	# in HINDSIGHT_PID; the signal handler, the health wait and the backup loop
	# all refer to the process through that variable.
	log "Starting Hindsight (/app/start-all.sh)..."
	"/app/start-all.sh" &
	HINDSIGHT_PID="$!"
	log "Hindsight started (PID ${HINDSIGHT_PID})"

	# ============================================================
	# STEP 2: SIGTERM handler — pg_dump before shutdown
	# ============================================================
	# cleanup — handler for SIGTERM and SIGINT.
	# Runs a final backup (`backup.py shutdown`), stops Hindsight through
	# kill_hindsight, and exits the script with status 0.
	#   Globals: HINDSIGHT_PID (read); calls log and kill_hindsight
	#   Returns: never returns; always exits 0
	cleanup() {
	  log "SIGTERM received — performing final backup..."
	  # A failed final backup is only logged so that shutdown still proceeds.
	  python3 /opt/backup/backup.py shutdown || log "Final backup failed (non-fatal)"
	  kill_hindsight "$HINDSIGHT_PID"
	  log "Shutdown complete"
	  exit 0
	}
	trap cleanup SIGTERM SIGINT

	# ============================================================
	# STEP 3: Wait for Hindsight to be healthy
	# ============================================================
	# Poll the /health endpoint up to 60 times, 5 s apart, until it answers
	# successfully. Exits 1 as soon as the Hindsight process is found dead.
	log "Waiting for Hindsight to become healthy..."
	startup_healthy=0
	for ((attempt = 1; attempt <= 60; attempt++)); do
	  # --max-time bounds each probe so a connection that is accepted but never
	  # answered cannot stall the loop and skip the liveness check below.
	  if curl -sf --max-time 10 "http://localhost:${API_PORT}/health" >/dev/null 2>&1; then
	    log "Hindsight is healthy"
	    startup_healthy=1
	    break
	  fi
	  if ! kill -0 "$HINDSIGHT_PID" 2>/dev/null; then
	    log "Hindsight died during startup"
	    exit 1
	  fi
	  sleep 5
	done
	# Running out of attempts used to fall through silently; the script still
	# continues, but the condition is now visible in the logs.
	if [ "$startup_healthy" -ne 1 ]; then
	  log "Hindsight did not report healthy after 60 checks — continuing anyway"
	fi

	# ============================================================
	# STEP 4: Restore from backup (PG is now running)
	# After pg_restore, we must restart Hindsight so the API
	# reconnects to PG and sees the restored data.
	# Exit codes from restore.py:
	# 0 = data restored successfully
	# 2 = no backup found (skip restart)
	# 1 = error
	# ============================================================
	# Restore step. Skipped entirely when SKIP_RESTORE is non-empty, and skipped
	# when HF_TOKEN is empty. Otherwise restore.py is run and its exit status
	# decides what happens next: 0 restarts Hindsight, 2 and anything else
	# continue with the database as it is.
	if [ -n "${SKIP_RESTORE:-}" ]; then
	  log "SKIP_RESTORE set — skipping restore (fresh start for clean seeding)"
	elif [ -n "${HF_TOKEN:-}" ]; then
	  log "Attempting restore from HF Dataset..."
	  # Capture the status without tripping `set -e` on a non-zero exit.
	  restore_exit=0
	  python3 /opt/backup/restore.py || restore_exit=$?

	  if [ "$restore_exit" -eq 0 ]; then
	    log "Restore succeeded — restarting Hindsight to load restored data..."
	    kill_hindsight "$HINDSIGHT_PID"

	    /app/start-all.sh &
	    HINDSIGHT_PID=$!
	    log "Hindsight restarted (PID $HINDSIGHT_PID)"

	    # Wait minimum 10s before health check — prevents hitting stale server
	    sleep 10

	    restart_healthy=0
	    for ((attempt = 1; attempt <= 60; attempt++)); do
	      # --max-time bounds each probe so a hung connection cannot stall the
	      # loop and skip the liveness check below.
	      if curl -sf --max-time 10 "http://localhost:${API_PORT}/health" >/dev/null 2>&1; then
	        log "Hindsight is healthy after restore"
	        restart_healthy=1
	        break
	      fi
	      if ! kill -0 "$HINDSIGHT_PID" 2>/dev/null; then
	        log "Hindsight died after restore restart"
	        exit 1
	      fi
	      sleep 5
	    done
	    # Make an exhausted wait visible instead of falling through silently.
	    if [ "$restart_healthy" -ne 1 ]; then
	      log "Hindsight did not report healthy after 60 checks following restore — continuing anyway"
	    fi
	  elif [ "$restore_exit" -eq 2 ]; then
	    log "No backup found — continuing with fresh database"
	  else
	    log "Restore failed — continuing with fresh database"
	  fi
	else
	  log "HF_TOKEN not set — skipping restore"
	fi

	# ============================================================
	# STEP 5: Periodic backup loop
	# ============================================================
	# Main loop: runs for as long as the Hindsight process exists. Wakes up
	# roughly every 60 s and, once BACKUP_INTERVAL seconds have passed since the
	# last attempt, runs a scheduled backup (only when HF_TOKEN is non-empty).
	last_backup=$(date +%s)
	while kill -0 "$HINDSIGHT_PID" 2>/dev/null; do
	  # Sleep in the background and `wait` on it so that a trapped signal can
	  # interrupt the wait; `|| true` keeps `set -e` from ending the script when
	  # the wait returns non-zero.
	  sleep 60 &
	  wait "$!" || true # Allow SIGTERM to interrupt sleep
	  now=$(date +%s)
	  elapsed=$((now - last_backup))

	  if [ "$elapsed" -ge "$BACKUP_INTERVAL" ]; then
	    if [ -n "${HF_TOKEN:-}" ]; then
	      log "Scheduled backup (elapsed: ${elapsed}s)..."
	      python3 /opt/backup/backup.py scheduled ||
	        log "Backup failed — will retry next cycle"
	    fi
	    # The timer restarts whether or not the backup ran or succeeded, so the
	    # next attempt happens one full interval from now.
	    last_backup=$(date +%s)
	  fi
	done

	# ============================================================
	# STEP 6: Hindsight died unexpectedly
	# ============================================================
	# Reached only when the backup loop ends, i.e. `kill -0` no longer finds the
	# Hindsight process. Attempts one last backup when HF_TOKEN is non-empty and
	# then exits with status 1.
	log "Hindsight process ended unexpectedly"
	if [ -n "${HF_TOKEN:-}" ]; then
	  log "Performing emergency backup..."
	  # A failed emergency backup is only logged; the exit status stays 1.
	  python3 /opt/backup/backup.py crash || log "Emergency backup failed"
	fi
	exit 1