#!/usr/bin/env python3
"""
Hindsight Backup — pg_dump to HF Dataset.

Uses pg_dump for a consistent snapshot of the embedded PostgreSQL database
while Hindsight is running. Safe for periodic and shutdown backups.

Usage (called by entrypoint.sh):
    python3 /opt/backup/backup.py [reason]

Env vars:
    HF_TOKEN       — HuggingFace write token
    HF_BACKUP_REPO — Dataset repo (default: Arnwald84/atum-hindsight-backup)
"""

import glob
import os
import subprocess
import sys
import tempfile
from datetime import datetime, timezone
from pathlib import Path

HF_TOKEN = os.environ.get("HF_TOKEN", "")
HF_REPO = os.environ.get("HF_BACKUP_REPO", "Arnwald84/atum-hindsight-backup")
MAX_HISTORY = 10
MIN_DUMP_SIZE_KB = 200  # Refuse to upload if dump is smaller (likely empty/corrupt)

PG_USER = "hindsight"
PG_PASSWORD = "hindsight"
PG_DATABASE = "hindsight"
PG_PORT = "5432"

REASON = sys.argv[1] if len(sys.argv) > 1 else "manual"


def log(msg: str) -> None:
    print(f"[BACKUP] {msg}", flush=True)


def find_pg_bin(name: str) -> str:
    """Find a PostgreSQL binary in the pg0 installation."""
    pattern = os.path.expanduser(f"~/.pg0/installation/*/bin/{name}")
    matches = sorted(glob.glob(pattern))
    if matches:
        return matches[-1]  # latest version
    raise FileNotFoundError(f"{name} not found in ~/.pg0/installation/")


def create_dump() -> str:
    """Create a pg_dump in custom format (consistent while PG is running)."""
    pg_dump = find_pg_bin("pg_dump")
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    dump_file = os.path.join(tempfile.gettempdir(), f"hindsight-{timestamp}.pgdump")

    env = os.environ.copy()
    env["PGPASSWORD"] = PG_PASSWORD

    result = subprocess.run(
        [
            pg_dump,
            "-U", PG_USER,
            "-d", PG_DATABASE,
            "-p", PG_PORT,
            "-Fc",
            "--no-owner",
            "--no-acl",
            "-f", dump_file,
        ],
        capture_output=True,
        text=True,
        env=env,
    )
    if result.returncode != 0:
        raise RuntimeError(f"pg_dump failed: {result.stderr}")
    return dump_file


def upload_to_hf(dump_file: str) -> None:
    """Upload pg_dump to HF Dataset as latest + timestamped history."""
    from huggingface_hub import HfApi

    api = HfApi(token=HF_TOKEN)
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")

    # Ensure dataset repo exists (private, idempotent)
    api.create_repo(repo_id=HF_REPO, repo_type="dataset", exist_ok=True, private=True)

    # Upload as latest (overwrite)
    api.upload_file(
        path_or_fileobj=dump_file,
        path_in_repo="snapshots/latest.pgdump",
        repo_id=HF_REPO,
        repo_type="dataset",
        commit_message=f"Backup {timestamp} ({REASON})",
    )
    log(f"Uploaded snapshots/latest.pgdump to {HF_REPO}")

    # Upload timestamped copy for history
    api.upload_file(
        path_or_fileobj=dump_file,
        path_in_repo=f"snapshots/history/{timestamp}.pgdump",
        repo_id=HF_REPO,
        repo_type="dataset",
        commit_message=f"History snapshot {timestamp}",
    )
    log(f"Uploaded snapshots/history/{timestamp}.pgdump")

    # Rotate: keep only the last N history snapshots
    all_files = list(api.list_repo_files(repo_id=HF_REPO, repo_type="dataset"))
    history_files = sorted(
        [f for f in all_files if f.startswith("snapshots/history/") and f.endswith(".pgdump")],
        reverse=True,
    )
    for old_file in history_files[MAX_HISTORY:]:
        api.delete_file(
            path_in_repo=old_file,
            repo_id=HF_REPO,
            repo_type="dataset",
            commit_message=f"Rotate old snapshot {old_file}",
        )
        log(f"Deleted old snapshot: {old_file}")
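
# ---------------------------------------------------------------------------
# Restore sketch. Not wired into this script's flow; a minimal example of how
# a snapshot produced above could be loaded back. Assumptions: pg_restore
# ships beside pg_dump in the same pg0 installation, and dropping existing
# objects via --clean/--if-exists is acceptable for this database. The
# function name restore_from_hf is illustrative, not part of the original.
# ---------------------------------------------------------------------------
def restore_from_hf(filename: str = "snapshots/latest.pgdump") -> None:
    """Download a snapshot from the HF Dataset and pg_restore it (sketch)."""
    from huggingface_hub import hf_hub_download

    # Fetch the dump into the local HF cache; returns the cached file path
    dump_file = hf_hub_download(
        repo_id=HF_REPO,
        repo_type="dataset",
        filename=filename,
        token=HF_TOKEN,
    )

    env = os.environ.copy()
    env["PGPASSWORD"] = PG_PASSWORD

    # --no-owner mirrors the flags used by create_dump above
    result = subprocess.run(
        [
            find_pg_bin("pg_restore"),
            "-U", PG_USER,
            "-d", PG_DATABASE,
            "-p", PG_PORT,
            "--clean",
            "--if-exists",
            "--no-owner",
            dump_file,
        ],
        capture_output=True,
        text=True,
        env=env,
    )
    if result.returncode != 0:
        raise RuntimeError(f"pg_restore failed: {result.stderr}")
    log(f"Restored {filename} into {PG_DATABASE}")
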

def main() -> None:
    if not HF_TOKEN:
        log("HF_TOKEN not set — skipping backup")
        return

    log(f"Starting backup (reason: {REASON})...")
    dump_file = create_dump()
    size_kb = Path(dump_file).stat().st_size / 1024
    log(f"pg_dump created: {size_kb:.0f} KB")

    # Guard: refuse to overwrite good backups with empty/corrupt dumps
    if size_kb < MIN_DUMP_SIZE_KB:
        log(
            f"SKIPPED: dump too small ({size_kb:.0f} KB < {MIN_DUMP_SIZE_KB} KB) "
            "— likely empty database, refusing to overwrite good backup"
        )
        Path(dump_file).unlink(missing_ok=True)
        return

    try:
        upload_to_hf(dump_file)
    finally:
        Path(dump_file).unlink(missing_ok=True)

    log("Backup complete")


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        log(f"FAILED: {e}")
        sys.exit(1)
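
# Example invocations (a sketch; the actual scheduling lives in entrypoint.sh,
# which is outside this file):
#   python3 /opt/backup/backup.py periodic   # e.g. from a timer loop
#   python3 /opt/backup/backup.py shutdown   # e.g. from a container stop hook
#   python3 /opt/backup/backup.py            # reason defaults to "manual"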