File size: 5,878 Bytes
7a352c6
 
 
 
 
 
 
 
 
 
 
 
 
04263be
7a352c6
 
 
04263be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a352c6
 
 
 
 
 
 
 
 
 
 
 
 
 
04263be
7a352c6
 
 
 
 
 
 
 
 
 
04263be
7a352c6
 
 
 
 
 
 
 
 
 
 
 
cded1f5
 
 
 
 
 
7a352c6
b8236dd
 
 
7a352c6
c1ce02e
 
cded1f5
 
 
04263be
cded1f5
 
 
 
 
04263be
 
 
cded1f5
04263be
cded1f5
 
 
 
 
 
 
 
 
 
 
 
 
 
7a352c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/bin/bash
# entrypoint.sh — Hindsight wrapper with pg_dump backup/restore
#
# Flow:
#   1. Start Hindsight (/app/start-all.sh) — initializes PG + API
#   2. Wait for healthy (PG + API ready)
#   3. Restore from HF Dataset (pg_restore into running PG)
#   4. Trap SIGTERM → pg_dump → upload → shutdown
#   5. Periodic backup loop (pg_dump every BACKUP_INTERVAL_SECONDS)
#   6. If Hindsight dies → emergency pg_dump → exit 1
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Seconds between scheduled backups; 21600 (6 hours) when
# BACKUP_INTERVAL_SECONDS is unset or empty.
BACKUP_INTERVAL=${BACKUP_INTERVAL_SECONDS:-21600}
# Port probed for health and watched for release on shutdown; 7860 when
# HINDSIGHT_API_PORT is unset or empty.
API_PORT=${HINDSIGHT_API_PORT:-7860}

# log MESSAGE... — write one line to stdout consisting of the [ENTRYPOINT]
# tag, the current UTC timestamp (YYYY-MM-DDTHH:MM:SSZ) and the arguments
# joined by spaces.
log() {
    printf '[ENTRYPOINT] %s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*"
}

# ============================================================
# kill_hindsight — Kill the entire process tree, wait for port release
# ============================================================
# Print every descendant PID of $1 (children, grandchildren, ...), one per line.
# Prints nothing if pgrep is unavailable or the process has no children.
_hindsight_descendants() {
    local parent="$1" child
    while IFS= read -r child; do
        [ -n "$child" ] || continue
        printf '%s\n' "$child"
        _hindsight_descendants "$child"
    done < <(pgrep -P "$parent" 2>/dev/null || true)
}

# Succeed if PID $1 can still be signalled and is not a zombie.
# If ps cannot report the state, any signallable PID counts as running.
_hindsight_running() {
    local state
    kill -0 "$1" 2>/dev/null || return 1
    state=$(ps -o stat= -p "$1" 2>/dev/null || true)
    [[ "$state" != *Z* ]]
}

# Succeed if ss (or, failing that, netstat) lists a TCP listener on API_PORT.
# The listing is captured first and searched via a here-string: with
# `set -o pipefail`, `ss | grep -q` can report failure when grep exits early
# and the producer receives SIGPIPE, which would look like "port released".
_hindsight_port_busy() {
    local listeners
    listeners=$(ss -tlnp 2>/dev/null || netstat -tlnp 2>/dev/null || true)
    grep -q ":${API_PORT} " <<<"$listeners"
}

# kill_hindsight PID [GRACE_SECONDS]
#
# Stop the process PID together with all of its descendants, then wait for
# API_PORT to stop listening.
#
# Arguments:
#   $1 - PID of the Hindsight launcher (a background job of this shell)
#   $2 - optional grace period in seconds between SIGTERM and SIGKILL
#        (default 3)
# Globals:   API_PORT (read)
# Returns:   0 in all cases; every step is best-effort.
kill_hindsight() {
    local pid="$1"
    local grace="${2:-3}"
    local -a tree=()
    local p ticks still_running

    log "Stopping Hindsight (PID $pid) and all children..."

    # 1. Snapshot the whole tree BEFORE signalling anything. `pkill -P` only
    #    matches direct children, and once a parent exits its children are
    #    re-parented and can no longer be found through it.
    mapfile -t tree < <(_hindsight_descendants "$pid")
    tree+=("$pid")

    # 2. Ask everything to terminate (descendants first, parent last).
    kill -TERM "${tree[@]}" 2>/dev/null || true

    # 3. Wait up to the grace period, returning early once all have exited.
    #    Falls back to whole-second sleeps where fractional sleep is rejected.
    ticks=$((grace * 5))
    while [ "$ticks" -gt 0 ]; do
        still_running=0
        for p in "${tree[@]}"; do
            if _hindsight_running "$p"; then
                still_running=1
                break
            fi
        done
        [ "$still_running" -eq 1 ] || break
        sleep 0.2 2>/dev/null || sleep 1
        ticks=$((ticks - 1))
    done

    # 4. Force-kill only the survivors, then reap the launcher job.
    for p in "${tree[@]}"; do
        if _hindsight_running "$p"; then
            kill -9 "$p" 2>/dev/null || true
        fi
    done
    wait "$pid" 2>/dev/null || true

    # 5. Wait for the port to be released (up to 30s).
    local attempts=0
    while [ "$attempts" -lt 30 ]; do
        if ! _hindsight_port_busy; then
            log "Port ${API_PORT} released"
            return 0
        fi
        attempts=$((attempts + 1))
        sleep 1
    done

    # 6. Last resort: kill anything still on the port.
    if command -v fuser >/dev/null 2>&1; then
        fuser -k "${API_PORT}/tcp" 2>/dev/null || true
        sleep 2
    fi
    log "Port cleanup complete (waited ${attempts}s)"
}

# ============================================================
# STEP 1: Start Hindsight (starts embedded PG + API)
# ============================================================
# Launch the Hindsight start script as a background job and keep its PID in
# HINDSIGHT_PID for the liveness checks and shutdown handling further down.
log "Starting Hindsight (/app/start-all.sh)..."
/app/start-all.sh &
HINDSIGHT_PID="$!"
log "Hindsight started (PID ${HINDSIGHT_PID})"

# ============================================================
# STEP 2: SIGTERM handler — pg_dump before shutdown
# ============================================================
# cleanup [SIGNAL_NAME]
#
# Signal handler: run a final backup, stop Hindsight, and exit 0.
#
# Arguments:
#   $1 - optional name of the signal being handled, used only in the log
#        line (default SIGTERM)
# Globals:   HF_TOKEN (read), HINDSIGHT_PID (read)
#
# The backup is attempted only when HF_TOKEN is set, matching the scheduled
# and crash backups elsewhere in this script. A failed backup is logged and
# does not prevent shutdown.
cleanup() {
    local signal_name="${1:-SIGTERM}"
    log "${signal_name} received — performing final backup..."
    if [ -n "${HF_TOKEN:-}" ]; then
        python3 /opt/backup/backup.py shutdown || log "Final backup failed (non-fatal)"
    else
        log "HF_TOKEN not set — skipping final backup"
    fi
    kill_hindsight "$HINDSIGHT_PID"
    log "Shutdown complete"
    exit 0
}
# Register one trap per signal so the log line names the signal that arrived.
trap 'cleanup SIGTERM' SIGTERM
trap 'cleanup SIGINT' SIGINT

# ============================================================
# STEP 3: Wait for Hindsight to be healthy
# ============================================================
# Poll the /health endpoint up to 60 times, 5 seconds apart. Abort with exit
# status 1 if the Hindsight process disappears while waiting. If the endpoint
# never answers, log a warning and carry on rather than falling through
# silently.
log "Waiting for Hindsight to become healthy..."
startup_healthy=0
for ((attempt = 1; attempt <= 60; attempt++)); do
    # --max-time bounds each probe so a hung connection cannot stall the
    # loop (and with it the delivery of the shutdown trap).
    if curl -sf --max-time 10 "http://localhost:${API_PORT}/health" > /dev/null 2>&1; then
        log "Hindsight is healthy"
        startup_healthy=1
        break
    fi
    if ! kill -0 "$HINDSIGHT_PID" 2>/dev/null; then
        log "Hindsight died during startup"
        exit 1
    fi
    sleep 5
done
if [ "$startup_healthy" -ne 1 ]; then
    log "WARNING: Hindsight did not become healthy after 60 checks; continuing anyway"
fi

# ============================================================
# STEP 4: Restore from backup (PG is now running)
# After pg_restore, we must restart Hindsight so the API
# reconnects to PG and sees the restored data.
# Exit codes from restore.py:
#   0 = data restored successfully
#   2 = no backup found (skip restart)
#   1 = error
# ============================================================
# Decide whether to restore:
#   - SKIP_RESTORE set  -> never restore
#   - HF_TOKEN set      -> run restore.py and act on its exit status
#   - otherwise         -> nothing to restore from
if [ -n "${SKIP_RESTORE:-}" ]; then
    log "SKIP_RESTORE set — skipping restore (fresh start for clean seeding)"
elif [ -n "${HF_TOKEN:-}" ]; then
    log "Attempting restore from HF Dataset..."
    # Capture the status without tripping `set -e`.
    restore_exit=0
    python3 /opt/backup/restore.py || restore_exit=$?

    if [ "$restore_exit" -eq 0 ]; then
        log "Restore succeeded — restarting Hindsight to load restored data..."
        kill_hindsight "$HINDSIGHT_PID"

        /app/start-all.sh &
        HINDSIGHT_PID=$!
        log "Hindsight restarted (PID $HINDSIGHT_PID)"

        # Wait minimum 10s before health check — prevents hitting stale server
        sleep 10

        # Same polling scheme as the initial startup wait: 60 probes, 5s
        # apart, each bounded by --max-time so a hung connection cannot
        # stall the loop. A timeout is reported instead of being ignored.
        restart_healthy=0
        for ((attempt = 1; attempt <= 60; attempt++)); do
            if curl -sf --max-time 10 "http://localhost:${API_PORT}/health" > /dev/null 2>&1; then
                log "Hindsight is healthy after restore"
                restart_healthy=1
                break
            fi
            if ! kill -0 "$HINDSIGHT_PID" 2>/dev/null; then
                log "Hindsight died after restore restart"
                exit 1
            fi
            sleep 5
        done
        if [ "$restart_healthy" -ne 1 ]; then
            log "WARNING: Hindsight did not become healthy after restore (60 checks); continuing anyway"
        fi
    elif [ "$restore_exit" -eq 2 ]; then
        log "No backup found — continuing with fresh database"
    else
        log "Restore failed — continuing with fresh database"
    fi
else
    log "HF_TOKEN not set — skipping restore"
fi

# ============================================================
# STEP 5: Periodic backup loop
# ============================================================
# A non-numeric interval would make the `-ge` test below fail on every pass,
# so scheduled backups would silently never run. Fall back to the default.
if ! [[ "$BACKUP_INTERVAL" =~ ^[0-9]+$ ]]; then
    log "Invalid BACKUP_INTERVAL_SECONDS '${BACKUP_INTERVAL}' — falling back to 21600"
    BACKUP_INTERVAL=21600
fi

# Wake up once a minute while Hindsight is alive; run a scheduled backup each
# time BACKUP_INTERVAL seconds have passed since the previous attempt.
last_backup=$(date +%s)
while kill -0 "$HINDSIGHT_PID" 2>/dev/null; do
    # Sleep in the background and `wait` on it so a trapped signal can
    # interrupt the pause instead of being deferred until sleep returns.
    sleep 60 &
    wait "$!" || true  # Allow SIGTERM to interrupt sleep
    now=$(date +%s)
    elapsed=$((now - last_backup))

    if [ "$elapsed" -ge "$BACKUP_INTERVAL" ]; then
        if [ -n "${HF_TOKEN:-}" ]; then
            log "Scheduled backup (elapsed: ${elapsed}s)..."
            python3 /opt/backup/backup.py scheduled || \
                log "Backup failed — will retry next cycle"
        fi
        # The timer restarts whether or not the backup succeeded, so a
        # failed backup is retried after another full interval.
        last_backup=$(date +%s)
    fi
done

# ============================================================
# STEP 6: Hindsight died unexpectedly
# ============================================================
# Reached only when the liveness loop above ends, i.e. the Hindsight PID is
# gone. Attempt a last backup when HF_TOKEN is available, then exit 1.
log "Hindsight process ended unexpectedly"
if [ -z "${HF_TOKEN:-}" ]; then
    exit 1
fi
log "Performing emergency backup..."
if ! python3 /opt/backup/backup.py crash; then
    log "Emergency backup failed"
fi
exit 1