#!/usr/bin/env python3
"""
Hindsight Restore — Download pg_dump from HF Dataset and restore into running PG.

Called AFTER Hindsight starts (PostgreSQL must be running).
Uses pg_restore --clean to replace the fresh empty database with backup data.

Usage (called by entrypoint.sh):
    python3 /opt/backup/restore.py

Env vars:
    HF_TOKEN        — HuggingFace token (read access)
    HF_BACKUP_REPO  — Dataset repo (default: Arnwald84/atum-hindsight-backup)

Exit codes:
    0 — data was restored; the caller should restart Hindsight
    1 — restore failed
    2 — no backup found (or restore skipped); no restart needed
"""

import glob
import os
import subprocess
import sys
from pathlib import Path

HF_TOKEN = os.environ.get("HF_TOKEN", "")
HF_REPO = os.environ.get("HF_BACKUP_REPO", "Arnwald84/atum-hindsight-backup")

PG_USER = "hindsight"
PG_PASSWORD = "hindsight"
PG_DATABASE = "hindsight"
PG_PORT = "5432"

# Path of the dump inside the HF dataset repository.
_BACKUP_FILE = "snapshots/latest.pgdump"


def log(msg: str) -> None:
    """Print *msg* to stdout with the ``[RESTORE]`` prefix, flushing immediately."""
    print(f"[RESTORE] {msg}", flush=True)


def find_pg_bin(name: str) -> str:
    """Find a PostgreSQL binary in the pg0 installation.

    Searches ``~/.pg0/installation/*/bin/<name>`` and returns the
    lexicographically last match.

    Raises:
        FileNotFoundError: if no installation provides *name*.
    """
    pattern = os.path.expanduser(f"~/.pg0/installation/*/bin/{name}")
    matches = sorted(glob.glob(pattern))
    if matches:
        return matches[-1]
    raise FileNotFoundError(f"{name} not found in ~/.pg0/installation/")


EXIT_RESTORED = 0   # Data was restored — caller should restart Hindsight
EXIT_ERROR = 1      # Restore failed
EXIT_NO_BACKUP = 2  # No backup found — skip restart


def _restore_failure(returncode: int, stderr: str, limit: int = 5) -> str:
    """Describe a failed pg_restore run; return ``""`` when it succeeded.

    pg_restore is invoked with ``--single-transaction``, which implies
    ``--exit-on-error``: a non-zero exit status therefore always means the
    transaction was rolled back and nothing was restored. No stderr line is
    treated as "harmless" — that includes connection/authentication failures
    (reported as ``error:`` / ``FATAL:``) and "does not exist" errors.

    Args:
        returncode: exit status of pg_restore.
        stderr: captured standard error of pg_restore.
        limit: maximum number of stderr lines included in the summary.
    """
    if returncode == 0:
        return ""
    lines = [line.strip() for line in stderr.splitlines() if line.strip()]
    if not lines:
        return f"exit code {returncode} (no diagnostic output)"
    # Prefer the lines that actually name the problem; otherwise show the tail,
    # since the fatal message is the last thing pg_restore prints.
    flagged = [
        line for line in lines
        if "error" in line.lower() or "fatal" in line.lower()
    ]
    chosen = flagged[:limit] if flagged else lines[-limit:]
    return "; ".join(chosen)


def _run_pg_restore(dump_path: str) -> "subprocess.CompletedProcess[str]":
    """Run pg_restore on *dump_path* against the local database and return the result."""
    pg_restore = find_pg_bin("pg_restore")
    env = os.environ.copy()
    env["PGPASSWORD"] = PG_PASSWORD  # avoid an interactive password prompt
    return subprocess.run(
        [
            pg_restore,
            "-U", PG_USER,
            "-d", PG_DATABASE,
            "-p", PG_PORT,
            "--clean",
            "--if-exists",
            "--no-owner",
            "--no-acl",
            "--single-transaction",
            dump_path,
        ],
        capture_output=True,
        text=True,
        env=env,
    )


def main() -> int:
    """Download the latest dump and restore it.

    Returns exit code: 0=restored, 1=error, 2=no backup.

    Unexpected exceptions (download failure, missing pg_restore binary, ...)
    propagate to the caller; the ``__main__`` guard maps them to exit code 1.
    """
    if not HF_TOKEN:
        log("HF_TOKEN not set — skipping restore")
        return EXIT_NO_BACKUP

    # Imported lazily so a missing dependency degrades to "no backup"
    # instead of crashing at import time.
    try:
        from huggingface_hub import HfApi, hf_hub_download
    except ImportError:
        log("huggingface_hub not installed — skipping restore")
        return EXIT_NO_BACKUP

    api = HfApi(token=HF_TOKEN)

    # Check if backup exists
    try:
        files = list(api.list_repo_files(repo_id=HF_REPO, repo_type="dataset"))
    except Exception as e:
        log(f"Cannot access repo {HF_REPO}: {e}")
        return EXIT_ERROR

    if _BACKUP_FILE not in files:
        log("No pg_dump backup found in HF Dataset — starting fresh")
        return EXIT_NO_BACKUP

    log(f"Downloading latest backup from {HF_REPO}...")
    local_path = hf_hub_download(
        repo_id=HF_REPO,
        filename=_BACKUP_FILE,
        repo_type="dataset",
        token=HF_TOKEN,
        cache_dir="/tmp/hf_cache",
    )

    size_kb = Path(local_path).stat().st_size / 1024
    log(f"Downloaded: {size_kb:.0f} KB")

    # Restore using pg_restore
    log("Restoring database...")
    result = _run_pg_restore(local_path)

    failure = _restore_failure(result.returncode, result.stderr)
    if failure:
        # Non-zero exit under --single-transaction means a full rollback:
        # reporting success here would restart Hindsight on an empty database.
        log(f"pg_restore had errors: {failure}")
        return EXIT_ERROR

    log("pg_restore completed successfully")
    log("Restore complete — Hindsight should be restarted to load restored data")
    return EXIT_RESTORED


if __name__ == "__main__":
    try:
        code = main()
        sys.exit(code)
    except Exception as e:
        # Top-level boundary: report anything unexpected as a failed restore.
        log(f"FAILED: {e}")
        sys.exit(EXIT_ERROR)