#!/bin/bash
# entrypoint.sh — Hindsight wrapper with pg_dump backup/restore
#
# Flow:
#   1. Start Hindsight (/app/start-all.sh) — initializes PG + API
#   2. Trap SIGTERM/SIGINT → pg_dump → upload → shutdown
#   3. Wait for healthy (PG + API ready)
#   4. Restore from HF Dataset (pg_restore into running PG), then restart
#      Hindsight so the API sees the restored data
#   5. Periodic backup loop (pg_dump every BACKUP_INTERVAL_SECONDS)
#   6. If Hindsight dies → emergency pg_dump → exit 1
set -euo pipefail

# uint_or_default VALUE DEFAULT NAME
#   Prints VALUE when it is a non-negative integer, and DEFAULT when VALUE is
#   empty. Any other VALUE also yields DEFAULT, with a warning that names the
#   offending setting. The warning goes to stderr because stdout carries the
#   result.
uint_or_default() {
    local value="$1" fallback="$2" name="$3"
    if [[ -z "$value" ]]; then
        printf '%s\n' "$fallback"
    elif [[ "$value" =~ ^[0-9]+$ ]]; then
        printf '%s\n' "$value"
    else
        echo "[ENTRYPOINT] WARNING: ${name}='${value}' is not a non-negative integer; using ${fallback}" >&2
        printf '%s\n' "$fallback"
    fi
}

# Seconds between scheduled backups. The backup loop compares this with an
# integer test, which fails on a non-numeric value and would keep scheduled
# backups from ever running, so a malformed setting falls back to the default.
BACKUP_INTERVAL="$(uint_or_default "${BACKUP_INTERVAL_SECONDS:-}" 21600 BACKUP_INTERVAL_SECONDS)"  # default 6 hours
# Port used for the health checks and for the port-release wait.
API_PORT="${HINDSIGHT_API_PORT:-7860}"

# log MESSAGE...
#   Writes one line to stdout: the [ENTRYPOINT] tag, the current UTC time
#   (YYYY-MM-DDTHH:MM:SSZ), then all arguments joined into a single string.
log() {
    printf '[ENTRYPOINT] %s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*"
}

# ============================================================
# kill_hindsight — Stop Hindsight and its direct children, wait for port release
# ============================================================
# _port_is_listening PORT
#   Succeeds when `ss` or `netstat` lists a TCP listener on PORT. A tool that
#   is missing or fails counts as "not listening", so when neither tool is
#   usable the port is reported as free.
_port_is_listening() {
    local port="$1" tool listing
    for tool in ss netstat; do
        # Capture the full listing before matching. Piping into `grep -q`
        # under `set -o pipefail` lets grep's early exit SIGPIPE the producer,
        # which turns a positive match into a failed pipeline and makes a
        # busy port look free.
        if listing="$("$tool" -tlnp 2>/dev/null)" \
            && [[ "$listing" == *":${port} "* ]]; then
            return 0
        fi
    done
    return 1
}

# kill_hindsight PID
#   Sends TERM, then KILL, to PID and its direct children, reaps PID, and
#   waits up to 30s for API_PORT to stop listening. If the port is still busy
#   after that, `fuser -k` is used on it when available.
#   Globals:   API_PORT (read)
#   Arguments: $1 - PID of the process to stop
#   Returns:   0 once the port is free; otherwise the status of the final log
kill_hindsight() {
    local pid="$1"
    local -a children=()
    log "Stopping Hindsight (PID $pid) and all children..."

    # Snapshot the direct children before signalling anything. If the parent
    # exits on TERM, its children are re-parented and `pkill -P` can no longer
    # find them for the force-kill stage.
    mapfile -t children < <(pgrep -P "$pid" 2>/dev/null) || true

    # 1. Kill all children first (uvicorn workers, pg0, etc.)
    pkill -TERM -P "$pid" 2>/dev/null || true
    # 2. Kill the parent
    kill -TERM "$pid" 2>/dev/null || true
    sleep 3

    # 3. Force-kill any survivors: children still attached to the parent,
    #    the children recorded above, then the parent itself.
    pkill -9 -P "$pid" 2>/dev/null || true
    if [ "${#children[@]}" -gt 0 ]; then
        kill -9 "${children[@]}" 2>/dev/null || true
    fi
    kill -9 "$pid" 2>/dev/null || true
    wait "$pid" 2>/dev/null || true

    # 4. Wait for port to be released (up to 30s)
    local attempts=0
    while [ "$attempts" -lt 30 ]; do
        if ! _port_is_listening "$API_PORT"; then
            log "Port ${API_PORT} released"
            return 0
        fi
        attempts=$((attempts + 1))
        sleep 1
    done

    # 5. Last resort: kill anything still on the port
    if command -v fuser >/dev/null 2>&1; then
        fuser -k "${API_PORT}/tcp" 2>/dev/null || true
        sleep 2
    fi
    log "Port cleanup complete (waited ${attempts}s)"
}

# ============================================================
# STEP 1: Start Hindsight (starts embedded PG + API)
# ============================================================
# Launch the start script as a background job and remember its PID; the rest
# of this script monitors and signals Hindsight through HINDSIGHT_PID.
log "Starting Hindsight (/app/start-all.sh)..."
/app/start-all.sh &
HINDSIGHT_PID="$!"
log "Hindsight started (PID ${HINDSIGHT_PID})"

# ============================================================
# STEP 2: SIGTERM handler — pg_dump before shutdown
# ============================================================
# cleanup — SIGTERM/SIGINT handler.
#   Takes a final backup, stops Hindsight and exits 0. The backup is only
#   attempted when HF_TOKEN is set, the same condition the scheduled and
#   crash backups use. A failed backup is logged and does not block shutdown.
#   Globals: HF_TOKEN (read), HINDSIGHT_PID (read)
cleanup() {
    if [ -n "${HF_TOKEN:-}" ]; then
        log "SIGTERM received — performing final backup..."
        python3 /opt/backup/backup.py shutdown || log "Final backup failed (non-fatal)"
    else
        log "SIGTERM received — HF_TOKEN not set, skipping final backup"
    fi
    kill_hindsight "$HINDSIGHT_PID"
    log "Shutdown complete"
    exit 0
}
trap cleanup SIGTERM SIGINT

# ============================================================
# STEP 3: Wait for Hindsight to be healthy
# ============================================================
# Poll the health endpoint up to 60 times, 5s apart. Exit immediately if the
# Hindsight process disappears while we wait.
log "Waiting for Hindsight to become healthy..."
startup_healthy=0
for ((attempt = 1; attempt <= 60; attempt++)); do
    if curl -sf "http://localhost:${API_PORT}/health" > /dev/null 2>&1; then
        log "Hindsight is healthy"
        startup_healthy=1
        break
    fi
    if ! kill -0 "$HINDSIGHT_PID" 2>/dev/null; then
        log "Hindsight died during startup"
        exit 1
    fi
    sleep 5
done
# Running out of attempts used to be indistinguishable from success in the
# logs. The script still continues (the process is alive), but says so.
if [ "$startup_healthy" -ne 1 ]; then
    log "WARNING: Hindsight still not healthy after 60 checks — continuing anyway"
fi

# ============================================================
# STEP 4: Restore from backup (PG is now running)
# After pg_restore, we must restart Hindsight so the API
# reconnects to PG and sees the restored data.
# Exit codes from restore.py:
#   0 = data restored successfully
#   2 = no backup found (skip restart)
#   1 = error
# ============================================================
# Restore is skipped when SKIP_RESTORE is set or HF_TOKEN is missing.
# Otherwise restore.py runs against the live database and its exit code
# decides whether Hindsight needs a restart.
if [ -n "${SKIP_RESTORE:-}" ]; then
    log "SKIP_RESTORE set — skipping restore (fresh start for clean seeding)"
elif [ -z "${HF_TOKEN:-}" ]; then
    log "HF_TOKEN not set — skipping restore"
else
    log "Attempting restore from HF Dataset..."
    # Capture the status without tripping `set -e`.
    restore_exit=0
    python3 /opt/backup/restore.py || restore_exit=$?

    case "$restore_exit" in
        0)
            log "Restore succeeded — restarting Hindsight to load restored data..."
            kill_hindsight "$HINDSIGHT_PID"

            /app/start-all.sh &
            HINDSIGHT_PID=$!
            log "Hindsight restarted (PID $HINDSIGHT_PID)"

            # Wait minimum 10s before health check — prevents hitting stale server
            sleep 10

            restart_healthy=0
            for ((attempt = 1; attempt <= 60; attempt++)); do
                if curl -sf "http://localhost:${API_PORT}/health" > /dev/null 2>&1; then
                    log "Hindsight is healthy after restore"
                    restart_healthy=1
                    break
                fi
                if ! kill -0 "$HINDSIGHT_PID" 2>/dev/null; then
                    log "Hindsight died after restore restart"
                    exit 1
                fi
                sleep 5
            done
            # Make an exhausted wait visible instead of falling through
            # silently; the script continues because the process is alive.
            if [ "$restart_healthy" -ne 1 ]; then
                log "WARNING: Hindsight still not healthy after 60 checks following restore — continuing anyway"
            fi
            ;;
        2)
            log "No backup found — continuing with fresh database"
            ;;
        *)
            # NOTE(review): after a failed restore the periodic loop still
            # uploads backups of the fresh database — confirm backup.py
            # cannot overwrite the last good backup in that situation.
            log "Restore failed — continuing with fresh database"
            ;;
    esac
fi

# ============================================================
# STEP 5: Periodic backup loop
# ============================================================
# Runs for as long as the Hindsight process exists. Wakes once a minute and,
# when BACKUP_INTERVAL seconds have passed since the last cycle, runs a
# scheduled backup (only if HF_TOKEN is set) and restarts the interval,
# whether or not the backup succeeded.
last_backup="$(date +%s)"
while kill -0 "$HINDSIGHT_PID" 2>/dev/null; do
    # Sleep in the background and wait on it, so a trapped signal can
    # interrupt the wait instead of being held until the sleep finishes.
    sleep 60 &
    wait "$!" || true

    current_time="$(date +%s)"
    elapsed=$((current_time - last_backup))

    # Interval not reached yet: back to sleep.
    [ "$elapsed" -ge "$BACKUP_INTERVAL" ] || continue

    if [ -n "${HF_TOKEN:-}" ]; then
        log "Scheduled backup (elapsed: ${elapsed}s)..."
        if ! python3 /opt/backup/backup.py scheduled; then
            log "Backup failed — will retry next cycle"
        fi
    fi
    last_backup="$(date +%s)"
done

# ============================================================
# STEP 6: Hindsight died unexpectedly
# ============================================================
# Reached only when the monitoring loop saw the Hindsight process vanish.
# Without HF_TOKEN there is nothing to back up to, so exit straight away.
log "Hindsight process ended unexpectedly"
if [[ -z "${HF_TOKEN:-}" ]]; then
    exit 1
fi

# Best-effort emergency backup; the exit status is 1 either way.
log "Performing emergency backup..."
if ! python3 /opt/backup/backup.py crash; then
    log "Emergency backup failed"
fi
exit 1