""" db_sync.py — Persists the Label Studio SQLite DB across HF Space rebuilds by syncing to/from the private TrustLLMeu/saga-db-backup dataset repo. Usage: python3 db_sync.py restore # pull DB from HF repo → /data/ls/ python3 db_sync.py backup # push /data/ls/label_studio.sqlite3 → HF repo python3 db_sync.py watch # backup every INTERVAL seconds (run in background) """ import os import shutil import sys import time HF_TOKEN = os.environ.get("HF_TOKEN", "") BACKUP_REPO = "TrustLLMeu/saga-db-backup" DB_PATH = "/data/ls/label_studio.sqlite3" REMOTE_FILE = "label_studio.sqlite3" INTERVAL = 300 # backup every 5 minutes def _api(): from huggingface_hub import HfApi if not HF_TOKEN: raise RuntimeError("HF_TOKEN env var not set") return HfApi(token=HF_TOKEN) def restore(): """Download DB from HF backup repo if it exists. Returns True if restored.""" try: api = _api() # Check if backup file exists in repo files = api.list_repo_files(BACKUP_REPO, repo_type="dataset") if REMOTE_FILE not in list(files): print(f"[db_sync] No backup found in {BACKUP_REPO} — fresh start.", flush=True) return False print(f"[db_sync] Restoring DB from {BACKUP_REPO}...", flush=True) os.makedirs(os.path.dirname(DB_PATH), exist_ok=True) # Download to a temp file first, then atomically replace tmp = DB_PATH + ".restore_tmp" path = api.hf_hub_download( repo_id=BACKUP_REPO, filename=REMOTE_FILE, repo_type="dataset", local_dir=os.path.dirname(tmp), local_dir_use_symlinks=False, ) shutil.move(path, DB_PATH) size = os.path.getsize(DB_PATH) print(f"[db_sync] Restored DB ({size:,} bytes).", flush=True) return True except Exception as e: print(f"[db_sync] Restore failed: {e}", flush=True) return False def backup(): """Upload current DB to HF backup repo.""" if not os.path.exists(DB_PATH): print(f"[db_sync] No DB at {DB_PATH} — skipping backup.", flush=True) return False try: api = _api() size = os.path.getsize(DB_PATH) print(f"[db_sync] Backing up DB ({size:,} bytes) → {BACKUP_REPO}...", flush=True) api.upload_file( path_or_fileobj=DB_PATH, path_in_repo=REMOTE_FILE, repo_id=BACKUP_REPO, repo_type="dataset", commit_message="Auto-backup from HF Space", ) print(f"[db_sync] Backup complete.", flush=True) return True except Exception as e: print(f"[db_sync] Backup failed: {e}", flush=True) return False def watch(): """Run backup every INTERVAL seconds.""" print(f"[db_sync] Watch mode: backing up every {INTERVAL}s.", flush=True) while True: time.sleep(INTERVAL) backup() if __name__ == "__main__": cmd = sys.argv[1] if len(sys.argv) > 1 else "backup" if cmd == "restore": ok = restore() sys.exit(0 if ok else 1) elif cmd == "backup": ok = backup() sys.exit(0 if ok else 1) elif cmd == "watch": watch() else: print(f"Usage: db_sync.py restore|backup|watch", flush=True) sys.exit(1)