File size: 3,963 Bytes
7a352c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cded1f5
 
 
 
 
 
 
7a352c6
 
cded1f5
7a352c6
 
 
 
 
cded1f5
7a352c6
 
 
 
 
 
 
 
cded1f5
7a352c6
 
 
cded1f5
7a352c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cded1f5
7a352c6
 
 
 
 
cded1f5
 
7a352c6
 
 
 
cded1f5
 
7a352c6
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#!/usr/bin/env python3
"""
Hindsight Restore — Download pg_dump from HF Dataset and restore into running PG.

Called AFTER Hindsight starts (PostgreSQL must be running).
Uses pg_restore --clean to replace the fresh empty database with backup data.

Usage (called by entrypoint.sh):
    python3 /opt/backup/restore.py

Env vars:
    HF_TOKEN          — HuggingFace token (read access)
    HF_BACKUP_REPO    — Dataset repo (default: Arnwald84/atum-hindsight-backup)
"""

import glob
import os
import subprocess
import sys
from pathlib import Path

# Hugging Face access: the token (empty string when unset) and the dataset
# repo that holds the pg_dump snapshot.
HF_TOKEN = os.getenv("HF_TOKEN", "")
HF_REPO = os.getenv("HF_BACKUP_REPO", "Arnwald84/atum-hindsight-backup")

# Connection settings handed to pg_restore.
PG_USER = "hindsight"
PG_PASSWORD = "hindsight"
PG_DATABASE = "hindsight"
PG_PORT = "5432"


def log(msg: str) -> None:
    """Print *msg* to stdout with the ``[RESTORE]`` prefix, flushing at once."""
    line = "[RESTORE] {}".format(msg)
    print(line, flush=True)


def _version_sort_key(path: str) -> tuple:
    """Build a numeric-aware sort key from the version directory of *path*.

    *path* is expected to look like ``<root>/<version>/bin/<name>``; the
    ``<version>`` component is split into digit and non-digit runs so that
    ``16.10.0`` orders after ``16.9.0`` and ``9.6`` orders before ``16.4``.
    """
    import re  # local import: only needed here, keeps the file's import block untouched

    version = os.path.basename(os.path.dirname(os.path.dirname(path)))
    components = [
        # Uniform 3-tuples keep numeric and textual runs mutually comparable.
        (1, int(token), "") if token.isdecimal() else (0, 0, token)
        for token in re.findall(r"\d+|\D+", version)
    ]
    # The path itself is the final tie-breaker so the choice is deterministic.
    return (components, path)


def find_pg_bin(name: str) -> str:
    """Find a PostgreSQL binary in the pg0 installation.

    Searches ``~/.pg0/installation/*/bin/<name>`` and, when several
    installations match, returns the one from the highest version directory.
    Versions are compared numerically, not lexicographically, so ``16.10``
    is preferred over ``16.9`` and ``16.x`` over ``9.x``.

    Args:
        name: File name of the binary to locate (e.g. ``"pg_restore"``).

    Returns:
        Path to the matching binary.

    Raises:
        FileNotFoundError: If no installation contains the binary.
    """
    pattern = os.path.expanduser(f"~/.pg0/installation/*/bin/{name}")
    matches = glob.glob(pattern)
    if not matches:
        raise FileNotFoundError(f"{name} not found in ~/.pg0/installation/")
    return max(matches, key=_version_sort_key)


# Exit codes returned by main() and passed to sys.exit() by the script entry point.
EXIT_RESTORED = 0   # Data was restored — caller should restart Hindsight
EXIT_ERROR = 1       # Restore failed (repo unreachable or pg_restore reported errors)
EXIT_NO_BACKUP = 2   # Nothing to restore (no token, no client library, or no dump) — skip restart


def main() -> int:
    """Download the latest pg_dump from the HF dataset and restore it.

    Steps: verify that a token and the ``huggingface_hub`` client are
    available, check that ``snapshots/latest.pgdump`` exists in ``HF_REPO``,
    download it, then run ``pg_restore`` against the local database.

    Returns:
        ``EXIT_RESTORED`` (0) when the dump was restored,
        ``EXIT_ERROR`` (1) when the repo could not be listed or pg_restore
        failed, and ``EXIT_NO_BACKUP`` (2) when there is nothing to restore
        (no token, client library missing, or no dump in the repo).

    Raises:
        Exception: Download failures and a missing ``pg_restore`` binary
            (``FileNotFoundError``) propagate to the caller.
    """
    if not HF_TOKEN:
        log("HF_TOKEN not set — skipping restore")
        return EXIT_NO_BACKUP

    try:
        from huggingface_hub import HfApi, hf_hub_download
    except ImportError:
        log("huggingface_hub not installed — skipping restore")
        return EXIT_NO_BACKUP

    api = HfApi(token=HF_TOKEN)

    # Check if backup exists
    try:
        files = list(api.list_repo_files(repo_id=HF_REPO, repo_type="dataset"))
    except Exception as e:
        log(f"Cannot access repo {HF_REPO}: {e}")
        return EXIT_ERROR

    if "snapshots/latest.pgdump" not in files:
        log("No pg_dump backup found in HF Dataset — starting fresh")
        return EXIT_NO_BACKUP

    log(f"Downloading latest backup from {HF_REPO}...")

    local_path = hf_hub_download(
        repo_id=HF_REPO,
        filename="snapshots/latest.pgdump",
        repo_type="dataset",
        token=HF_TOKEN,
        cache_dir="/tmp/hf_cache",
    )

    size_kb = Path(local_path).stat().st_size / 1024
    log(f"Downloaded: {size_kb:.0f} KB")

    # Restore using pg_restore; the password is supplied via the environment
    # so it never appears on the command line.
    pg_restore = find_pg_bin("pg_restore")
    env = os.environ.copy()
    env["PGPASSWORD"] = PG_PASSWORD

    log("Restoring database...")
    result = subprocess.run(
        [
            pg_restore,
            "-U", PG_USER,
            "-d", PG_DATABASE,
            "-p", PG_PORT,
            "--clean",
            "--if-exists",
            "--no-owner",
            "--no-acl",
            "--single-transaction",
            local_path,
        ],
        capture_output=True,
        text=True,
        env=env,
    )

    if result.returncode != 0:
        # --single-transaction implies --exit-on-error, so a non-zero exit
        # means the transaction was rolled back and NOTHING was restored.
        # It also covers connection/authentication failures, which
        # pg_restore reports as "error:"/"FATAL:" and which a filter on
        # upper-case "ERROR" lines would miss.
        details = [
            line.strip() for line in result.stderr.splitlines() if line.strip()
        ]
        summary = "; ".join(details[:5]) or f"exit code {result.returncode}"
        log(f"pg_restore had errors: {summary}")
        return EXIT_ERROR

    log("pg_restore completed successfully")
    log("Restore complete — Hindsight should be restarted to load restored data")
    return EXIT_RESTORED


if __name__ == "__main__":
    # Convert any unexpected exception into a logged failure with exit status 1;
    # otherwise exit with whatever status main() reported.
    try:
        exit_status = main()
    except Exception as exc:
        log(f"FAILED: {exc}")
        exit_status = 1
    sys.exit(exit_status)