saga-annotation / db_sync.py
Hodfa71's picture
Add DB backup/restore (HF dataset repo) — survives Docker rebuilds
a5018da verified
"""
db_sync.py — Persists the Label Studio SQLite DB across HF Space rebuilds
by syncing to/from the private TrustLLMeu/saga-db-backup dataset repo.
Usage:
python3 db_sync.py restore # pull DB from HF repo → /data/ls/
python3 db_sync.py backup # push /data/ls/label_studio.sqlite3 → HF repo
python3 db_sync.py watch # backup every INTERVAL seconds (run in background)
"""
import os
import shutil
import sys
import time
# Hugging Face access token, read once at import time; empty string when unset.
HF_TOKEN = os.getenv("HF_TOKEN", "")
# Dataset repo that stores the backup copy of the database.
BACKUP_REPO = "TrustLLMeu/saga-db-backup"
# Local path of the Label Studio SQLite database.
DB_PATH = "/data/ls/label_studio.sqlite3"
# Name of the database file inside the backup repo.
REMOTE_FILE = "label_studio.sqlite3"
# Seconds between backups in watch mode (5 minutes).
INTERVAL = 5 * 60
def _api():
    """Build an authenticated ``HfApi`` client from ``HF_TOKEN``.

    ``huggingface_hub`` is imported inside the function rather than at module
    level, so the import only happens when a client is actually requested.

    Returns:
        An ``HfApi`` instance constructed with ``token=HF_TOKEN``.

    Raises:
        RuntimeError: If ``HF_TOKEN`` is empty.
    """
    from huggingface_hub import HfApi

    if HF_TOKEN:
        return HfApi(token=HF_TOKEN)
    raise RuntimeError("HF_TOKEN env var not set")
def restore():
    """Download the DB from the HF backup repo, if a backup exists there.

    The file is first downloaded into a scratch directory created next to
    ``DB_PATH`` and only then moved over ``DB_PATH`` with ``os.replace``.
    Because the scratch directory lives in the same directory as the
    database, the final step is a same-filesystem rename, so a failed or
    partial download never overwrites an existing database. The scratch
    directory (including anything else the download leaves in it) is removed
    afterwards.

    Returns:
        True if the database was restored. False if the repo holds no backup
        or if any step failed; failures are logged, never raised.
    """
    import tempfile

    try:
        api = _api()
        # Check if backup file exists in repo
        if REMOTE_FILE not in api.list_repo_files(BACKUP_REPO, repo_type="dataset"):
            print(f"[db_sync] No backup found in {BACKUP_REPO} β€” fresh start.", flush=True)
            return False
        print(f"[db_sync] Restoring DB from {BACKUP_REPO}...", flush=True)
        db_dir = os.path.dirname(DB_PATH)
        os.makedirs(db_dir, exist_ok=True)
        # Download into a sibling scratch dir, NOT into db_dir itself:
        # with local_dir=db_dir the download target would be DB_PATH, i.e.
        # the live file would be overwritten while the transfer is running.
        with tempfile.TemporaryDirectory(prefix=".restore_tmp_", dir=db_dir) as scratch:
            downloaded = api.hf_hub_download(
                repo_id=BACKUP_REPO,
                filename=REMOTE_FILE,
                repo_type="dataset",
                local_dir=scratch,
                local_dir_use_symlinks=False,
            )
            # Swap the finished download into place in a single rename.
            os.replace(downloaded, DB_PATH)
        size = os.path.getsize(DB_PATH)
        print(f"[db_sync] Restored DB ({size:,} bytes).", flush=True)
        return True
    except Exception as e:
        # Best-effort by design: the caller only needs to know whether a DB
        # was restored, so report the problem and signal failure.
        print(f"[db_sync] Restore failed: {e}", flush=True)
        return False
def backup():
    """Upload a consistent snapshot of the current DB to the HF backup repo.

    The database file is not uploaded directly: another process may be
    writing to it while the upload runs, which could produce a torn copy.
    Instead a snapshot is taken with SQLite's online backup API
    (``sqlite3.Connection.backup``) into a temporary directory, and that
    snapshot is uploaded as ``REMOTE_FILE``. The temporary directory is
    removed once the upload finishes or fails.

    Returns:
        True if the snapshot was uploaded. False if ``DB_PATH`` does not
        exist or if any step failed; failures are logged, never raised.
    """
    import sqlite3
    import tempfile

    if not os.path.exists(DB_PATH):
        print(f"[db_sync] No DB at {DB_PATH} β€” skipping backup.", flush=True)
        return False
    try:
        api = _api()
        with tempfile.TemporaryDirectory(prefix="db_sync_") as scratch:
            snapshot = os.path.join(scratch, REMOTE_FILE)
            source = sqlite3.connect(DB_PATH)
            try:
                target = sqlite3.connect(snapshot)
                try:
                    # Copies every page under SQLite's own locking, so the
                    # snapshot reflects one committed state of the database.
                    source.backup(target)
                finally:
                    target.close()
            finally:
                source.close()
            size = os.path.getsize(snapshot)
            print(f"[db_sync] Backing up DB ({size:,} bytes) β†’ {BACKUP_REPO}...", flush=True)
            api.upload_file(
                path_or_fileobj=snapshot,
                path_in_repo=REMOTE_FILE,
                repo_id=BACKUP_REPO,
                repo_type="dataset",
                commit_message="Auto-backup from HF Space",
            )
        print(f"[db_sync] Backup complete.", flush=True)
        return True
    except Exception as e:
        # Best-effort by design: a failed backup must not take down the
        # caller (notably the watch loop), so log it and report failure.
        print(f"[db_sync] Backup failed: {e}", flush=True)
        return False
def watch():
    """Back up the database forever, once every ``INTERVAL`` seconds.

    Prints a one-line banner, then loops without end: sleep ``INTERVAL``
    seconds, call ``backup()``, repeat. The function never returns.
    """
    banner = f"[db_sync] Watch mode: backing up every {INTERVAL}s."
    print(banner, flush=True)
    while True:
        # Sleep comes before the backup, so the first backup happens one
        # full interval after watch mode starts.
        time.sleep(INTERVAL)
        backup()
if __name__ == "__main__":
    # Command-line entry point. With no argument the command defaults to
    # "backup". restore/backup exit 0 on success and 1 on failure; watch
    # loops forever; anything else prints the usage line and exits 1.
    cli_args = sys.argv[1:]
    cmd = cli_args[0] if cli_args else "backup"
    if cmd == "watch":
        watch()
    elif cmd in ("restore", "backup"):
        action = restore if cmd == "restore" else backup
        succeeded = action()
        sys.exit(0 if succeeded else 1)
    else:
        print(f"Usage: db_sync.py restore|backup|watch", flush=True)
        sys.exit(1)