""" sync.py — Backup and restore Open WebUI data to/from HuggingFace Dataset. Usage: python3 sync.py backup /app/backend/data python3 sync.py restore /app/backend/data """ import os import sys import shutil import tarfile import tempfile from pathlib import Path from datetime import datetime try: from huggingface_hub import HfApi, hf_hub_download, upload_file, create_repo from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError except ImportError: print("[SYNC] huggingface_hub not installed. Run: pip install huggingface_hub") sys.exit(1) # ── Config ──────────────────────────────────────────────────────────────────── HF_TOKEN = os.environ.get("HF_TOKEN", "") HF_USERNAME = os.environ.get("HF_USERNAME", "") # auto-detected if blank DATASET_REPO = os.environ.get("OWUI_DATASET_REPO", "") # override if needed BACKUP_FILENAME = "open-webui-data.tar.gz" # Files/dirs to EXCLUDE from backup (large or ephemeral) EXCLUDE_PATTERNS = { "__pycache__", "*.pyc", "node_modules", ".git", "uploads", # exclude uploads dir if large; remove this to include } def get_repo_id(api: HfApi) -> str: """Determine the dataset repo ID to use.""" if DATASET_REPO: return DATASET_REPO if not HF_USERNAME: try: user = api.whoami(token=HF_TOKEN) username = user["name"] except Exception as e: print(f"[SYNC] Could not determine HF username: {e}") sys.exit(1) else: username = HF_USERNAME space_name = os.environ.get("SPACE_ID", "").split("/")[-1] or "open-webui" return f"{username}/{space_name}-data" def ensure_repo(api: HfApi, repo_id: str): """Create the dataset repo if it doesn't exist.""" try: api.repo_info(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN) print(f"[SYNC] Dataset repo exists: {repo_id}") except RepositoryNotFoundError: print(f"[SYNC] Creating private dataset repo: {repo_id}") create_repo( repo_id=repo_id, repo_type="dataset", private=True, token=HF_TOKEN, ) print(f"[SYNC] ✅ Created: {repo_id}") def should_exclude(path: Path, base: Path) -> bool: """Return True if this path should be excluded.""" rel = str(path.relative_to(base)) for pat in EXCLUDE_PATTERNS: if pat.startswith("*"): if path.name.endswith(pat[1:]): return True else: # Check every path component, not just the full relative string if pat in path.parts: return True return False def backup(data_dir: str): """Create a tarball of data_dir and upload to HF Dataset.""" if not HF_TOKEN: print("[SYNC] HF_TOKEN not set — skipping backup.") return data_path = Path(data_dir) if not data_path.exists(): print(f"[SYNC] Data dir does not exist yet: {data_dir}") return api = HfApi() repo_id = get_repo_id(api) ensure_repo(api, repo_id) print(f"[SYNC] Creating backup tarball from: {data_dir}") with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp: tmp_path = tmp.name try: with tarfile.open(tmp_path, "w:gz") as tar: for item in data_path.rglob("*"): if item.is_file() and not should_exclude(item, data_path): arcname = item.relative_to(data_path.parent) tar.add(item, arcname=str(arcname)) size_mb = os.path.getsize(tmp_path) / (1024 * 1024) print(f"[SYNC] Tarball size: {size_mb:.1f} MB") print(f"[SYNC] Uploading to {repo_id}/{BACKUP_FILENAME}...") api.upload_file( path_or_fileobj=tmp_path, path_in_repo=BACKUP_FILENAME, repo_id=repo_id, repo_type="dataset", token=HF_TOKEN, commit_message=f"Auto-backup {datetime.utcnow().strftime('%Y-%m-%d %H:%M')} UTC", ) print(f"[SYNC] ✅ Backup complete → {repo_id}/{BACKUP_FILENAME}") finally: os.unlink(tmp_path) def restore(data_dir: str): """Download backup tarball from HF Dataset and extract to data_dir.""" if not HF_TOKEN: print("[SYNC] HF_TOKEN not set — skipping restore.") return api = HfApi() repo_id = get_repo_id(api) print(f"[SYNC] Looking for backup in: {repo_id}/{BACKUP_FILENAME}") try: local_path = hf_hub_download( repo_id=repo_id, filename=BACKUP_FILENAME, repo_type="dataset", token=HF_TOKEN, local_dir=tempfile.gettempdir(), ) except (RepositoryNotFoundError, EntryNotFoundError): print("[SYNC] No backup found — starting with fresh data.") return except Exception as e: print(f"[SYNC] Could not download backup: {e}") return print(f"[SYNC] Downloaded backup: {local_path}") Path(data_dir).parent.mkdir(parents=True, exist_ok=True) print(f"[SYNC] Extracting to: {Path(data_dir).parent}") with tarfile.open(local_path, "r:gz") as tar: # filter='data' prevents path traversal attacks (Python 3.12+ recommended) try: tar.extractall(path=str(Path(data_dir).parent), filter='data') except TypeError: # Older Python versions don't support filter parameter tar.extractall(path=str(Path(data_dir).parent)) print(f"[SYNC] ✅ Restore complete → {data_dir}") # ── Main ────────────────────────────────────────────────────────────────────── if __name__ == "__main__": if len(sys.argv) < 3: print(f"Usage: python3 {sys.argv[0]} ") sys.exit(1) action = sys.argv[1].lower() data_dir = sys.argv[2] if action == "backup": backup(data_dir) elif action == "restore": restore(data_dir) else: print(f"Unknown action: {action}. Use 'backup' or 'restore'.") sys.exit(1)