File size: 4,652 Bytes
7a352c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
daf0f64
 
7a352c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
daf0f64
 
 
 
 
 
7a352c6
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python3
"""
Hindsight Backup — pg_dump to HF Dataset.

Uses pg_dump for a consistent snapshot of the embedded PostgreSQL database
while Hindsight is running. Safe for periodic and shutdown backups.

Usage (called by entrypoint.sh):
    python3 /opt/backup/backup.py [reason]

Env vars:
    HF_TOKEN          — HuggingFace write token
    HF_BACKUP_REPO    — Dataset repo (default: Arnwald84/atum-hindsight-backup)
"""

import glob
import os
import subprocess
import sys
import tempfile
from datetime import datetime, timezone
from pathlib import Path

# HuggingFace write token; when empty, main() skips the backup entirely.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
# Target dataset repo for snapshots (created on demand, private).
HF_REPO = os.environ.get("HF_BACKUP_REPO", "Arnwald84/atum-hindsight-backup")
# Number of timestamped history snapshots retained after rotation.
MAX_HISTORY = 10
MIN_DUMP_SIZE_KB = 200  # Refuse to upload if dump is smaller (likely empty/corrupt)

# Credentials for the embedded PostgreSQL instance.
# NOTE(review): password is hardcoded — presumably acceptable because the DB
# is local/embedded and not network-exposed; confirm against deployment setup.
PG_USER = "hindsight"
PG_PASSWORD = "hindsight"
PG_DATABASE = "hindsight"
PG_PORT = "5432"

# Backup reason label (e.g. "periodic", "shutdown"); passed by entrypoint.sh,
# defaults to "manual" when invoked without arguments.
REASON = sys.argv[1] if len(sys.argv) > 1 else "manual"


def log(msg: str) -> None:
    """Write a tagged backup log line to stdout and flush immediately."""
    sys.stdout.write(f"[BACKUP] {msg}\n")
    sys.stdout.flush()


def find_pg_bin(name: str) -> str:
    """Locate a PostgreSQL binary inside the pg0 installation tree.

    Globs ~/.pg0/installation/*/bin/<name> and returns the lexically
    greatest match, which corresponds to the newest installed version.

    Raises:
        FileNotFoundError: if no installation provides the binary.
    """
    candidates = sorted(
        glob.glob(os.path.expanduser(f"~/.pg0/installation/*/bin/{name}"))
    )
    if not candidates:
        raise FileNotFoundError(f"{name} not found in ~/.pg0/installation/")
    # Lexical max picks the latest version directory.
    return candidates[-1]


def create_dump() -> str:
    """Dump the embedded database with pg_dump in custom format.

    Custom format (-Fc) yields a consistent snapshot even while the
    server is accepting writes, so this is safe to run live.

    Returns:
        Path of the dump file written under the system temp directory.

    Raises:
        RuntimeError: if pg_dump exits with a nonzero status.
        FileNotFoundError: if no pg_dump binary can be located.
    """
    pg_dump_bin = find_pg_bin("pg_dump")
    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out_path = os.path.join(tempfile.gettempdir(), f"hindsight-{stamp}.pgdump")

    # Pass the password via the environment; pg_dump reads PGPASSWORD.
    dump_env = dict(os.environ, PGPASSWORD=PG_PASSWORD)

    cmd = [
        pg_dump_bin,
        "-U", PG_USER,
        "-d", PG_DATABASE,
        "-p", PG_PORT,
        "-Fc",
        "--no-owner",
        "--no-acl",
        "-f", out_path,
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True, env=dump_env)
    if proc.returncode != 0:
        raise RuntimeError(f"pg_dump failed: {proc.stderr}")

    return out_path


def upload_to_hf(dump_file: str) -> None:
    """Push the dump to the HF dataset as latest + history, then rotate.

    Uploads the same file twice — once overwriting snapshots/latest.pgdump
    and once as a timestamped history entry — and finally deletes history
    snapshots beyond MAX_HISTORY.
    """
    from huggingface_hub import HfApi

    api = HfApi(token=HF_TOKEN)
    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")

    # Ensure dataset repo exists (private, idempotent)
    api.create_repo(repo_id=HF_REPO, repo_type="dataset", exist_ok=True, private=True)

    # (repo path, commit message, success log line) for both uploads.
    targets = [
        (
            "snapshots/latest.pgdump",
            f"Backup {stamp} ({REASON})",
            f"Uploaded snapshots/latest.pgdump to {HF_REPO}",
        ),
        (
            f"snapshots/history/{stamp}.pgdump",
            f"History snapshot {stamp}",
            f"Uploaded snapshots/history/{stamp}.pgdump",
        ),
    ]
    for repo_path, commit_msg, done_msg in targets:
        api.upload_file(
            path_or_fileobj=dump_file,
            path_in_repo=repo_path,
            repo_id=HF_REPO,
            repo_type="dataset",
            commit_message=commit_msg,
        )
        log(done_msg)

    # Rotate: keep only the MAX_HISTORY newest history snapshots.
    history = [
        name
        for name in api.list_repo_files(repo_id=HF_REPO, repo_type="dataset")
        if name.startswith("snapshots/history/") and name.endswith(".pgdump")
    ]
    history.sort(reverse=True)  # ISO-style timestamps sort lexically: newest first
    for stale in history[MAX_HISTORY:]:
        api.delete_file(
            path_in_repo=stale,
            repo_id=HF_REPO,
            repo_type="dataset",
            commit_message=f"Rotate old snapshot {stale}",
        )
        log(f"Deleted old snapshot: {stale}")


def main() -> None:
    """Run one backup cycle: dump, sanity-check size, upload, clean up."""
    if not HF_TOKEN:
        log("HF_TOKEN not set — skipping backup")
        return

    log(f"Starting backup (reason: {REASON})...")

    dump_path = Path(create_dump())
    size_kb = dump_path.stat().st_size / 1024
    log(f"pg_dump created: {size_kb:.0f} KB")

    # Guard: an undersized dump usually means an empty or truncated
    # database — keep the existing good backup rather than clobber it.
    if size_kb < MIN_DUMP_SIZE_KB:
        log(f"SKIPPED: dump too small ({size_kb:.0f} KB < {MIN_DUMP_SIZE_KB} KB) — likely empty database, refusing to overwrite good backup")
        dump_path.unlink(missing_ok=True)
        return

    # Always remove the local temp dump, even if the upload fails.
    try:
        upload_to_hf(str(dump_path))
    finally:
        dump_path.unlink(missing_ok=True)

    log("Backup complete")


if __name__ == "__main__":
    try:
        main()
    # Broad catch is intentional at this top-level boundary: any failure is
    # logged and converted into a nonzero exit code for entrypoint.sh.
    except Exception as e:
        log(f"FAILED: {e}")
        sys.exit(1)