Spaces:
Sleeping
Sleeping
"""
sync.py — Backup and restore Open WebUI data to/from a HuggingFace Dataset.

Usage:
    python3 sync.py backup /app/backend/data
    python3 sync.py restore /app/backend/data
"""
import os
import shutil
import sys
import tarfile
import tempfile
from datetime import datetime, timezone
from pathlib import Path

try:
    from huggingface_hub import HfApi, hf_hub_download, upload_file, create_repo
    from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError
except ImportError:
    print("[SYNC] huggingface_hub not installed. Run: pip install huggingface_hub")
    sys.exit(1)
# ── Config ────────────────────────────────────────────────────────────────────
# Credentials and target repo are read from the environment; "" means unset.
HF_TOKEN = os.getenv("HF_TOKEN", "")
HF_USERNAME = os.getenv("HF_USERNAME", "")  # looked up via the API when blank
DATASET_REPO = os.getenv("OWUI_DATASET_REPO", "")  # explicit repo id override
BACKUP_FILENAME = "open-webui-data.tar.gz"

# Names left out of the backup (bulky or regenerable content).
# A leading "*" marks a file-name suffix pattern; any other entry is compared
# against whole path components.
EXCLUDE_PATTERNS = {
    ".git",
    "node_modules",
    "__pycache__",
    "*.pyc",
    "uploads",  # remove this entry to include uploaded files in the backup
}
def get_repo_id(api: HfApi) -> str:
    """Resolve the id of the dataset repo that stores the backup.

    Precedence:
      1. ``DATASET_REPO`` (the ``OWUI_DATASET_REPO`` override), returned as-is.
      2. ``<username>/<space-name>-data``, where the username is
         ``HF_USERNAME`` or, when that is blank, the ``name`` reported by
         ``api.whoami`` for ``HF_TOKEN``. The space name is the last
         "/"-separated segment of the ``SPACE_ID`` environment variable,
         falling back to "open-webui".

    Exits the process with status 1 when the username lookup fails.
    """
    if DATASET_REPO:
        return DATASET_REPO

    owner = HF_USERNAME
    if not owner:
        try:
            owner = api.whoami(token=HF_TOKEN)["name"]
        except Exception as e:
            print(f"[SYNC] Could not determine HF username: {e}")
            sys.exit(1)

    space_id = os.environ.get("SPACE_ID", "")
    space_name = space_id.rsplit("/", 1)[-1] or "open-webui"
    return f"{owner}/{space_name}-data"
def ensure_repo(api: HfApi, repo_id: str):
    """Make sure the dataset repo ``repo_id`` exists.

    Probes the repo with ``api.repo_info``; when that raises
    ``RepositoryNotFoundError`` a private dataset repo is created. Any other
    error from the probe or the creation propagates to the caller.
    """
    try:
        api.repo_info(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)
    except RepositoryNotFoundError:
        print(f"[SYNC] Creating private dataset repo: {repo_id}")
        create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            private=True,
            token=HF_TOKEN,
        )
        print(f"[SYNC] β Created: {repo_id}")
    else:
        print(f"[SYNC] Dataset repo exists: {repo_id}")
def should_exclude(path: Path, base: Path, patterns=None) -> bool:
    """Return True if ``path`` should be left out of the backup.

    Args:
        path: File being considered; must be located under ``base``.
        base: Root of the data directory being backed up.
        patterns: Optional iterable of pattern strings to use instead of the
            module-level ``EXCLUDE_PATTERNS``. A pattern starting with "*" is
            a suffix match on the file name (e.g. "*.pyc"); any other pattern
            must equal one whole path component below ``base``.

    Returns:
        True when any pattern matches, otherwise False.

    Raises:
        ValueError: If ``path`` is not inside ``base``.
    """
    if patterns is None:
        patterns = EXCLUDE_PATTERNS

    # Only components *below* base are compared. Matching against the absolute
    # path would exclude every file whenever the data directory itself lives
    # under a folder named like a pattern (e.g. /srv/uploads/data).
    rel_parts = path.relative_to(base).parts

    for pat in patterns:
        if pat.startswith("*"):
            if path.name.endswith(pat[1:]):
                return True
        elif pat in rel_parts:
            return True
    return False
def backup(data_dir: str):
    """Archive ``data_dir`` as a gzip tarball and upload it to the HF dataset repo.

    Logs and returns without doing anything when ``HF_TOKEN`` is unset or
    ``data_dir`` does not exist. Regular files under ``data_dir`` that
    ``should_exclude`` does not reject are added with names relative to the
    parent of ``data_dir``, so the archive keeps the top-level directory name.
    The tarball is uploaded as ``BACKUP_FILENAME`` to the repo chosen by
    ``get_repo_id`` (created first if missing).

    Errors raised while archiving or uploading propagate to the caller; the
    temporary tarball is removed either way.
    """
    if not HF_TOKEN:
        print("[SYNC] HF_TOKEN not set β skipping backup.")
        return

    data_path = Path(data_dir)
    if not data_path.exists():
        print(f"[SYNC] Data dir does not exist yet: {data_dir}")
        return

    api = HfApi()
    repo_id = get_repo_id(api)
    ensure_repo(api, repo_id)

    print(f"[SYNC] Creating backup tarball from: {data_dir}")
    # Only reserve a temp path here; tarfile reopens the file by name below.
    with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        archive_root = data_path.parent
        with tarfile.open(tmp_path, "w:gz") as tar:
            for item in data_path.rglob("*"):
                if item.is_file() and not should_exclude(item, data_path):
                    tar.add(item, arcname=str(item.relative_to(archive_root)))

        size_mb = os.path.getsize(tmp_path) / (1024 * 1024)
        print(f"[SYNC] Tarball size: {size_mb:.1f} MB")
        print(f"[SYNC] Uploading to {repo_id}/{BACKUP_FILENAME}...")

        # datetime.utcnow() is deprecated since Python 3.12; an aware UTC
        # timestamp renders identically with this format string.
        stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M")
        api.upload_file(
            path_or_fileobj=tmp_path,
            path_in_repo=BACKUP_FILENAME,
            repo_id=repo_id,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=f"Auto-backup {stamp} UTC",
        )
        print(f"[SYNC] β Backup complete β {repo_id}/{BACKUP_FILENAME}")
    finally:
        os.unlink(tmp_path)
def _extract_archive(tar: tarfile.TarFile, dest: Path) -> None:
    """Extract ``tar`` into ``dest`` without letting members escape ``dest``."""
    # Feature-detect extraction filters instead of catching TypeError: a
    # TypeError raised mid-extraction must not trigger an unfiltered retry.
    if hasattr(tarfile, "data_filter"):
        tar.extractall(path=str(dest), filter="data")
        return

    # Interpreter predates extraction filters: vet member paths and link
    # targets by hand before extracting anything.
    root = dest.resolve()
    for member in tar.getmembers():
        member_path = root / member.name
        candidates = [member_path]
        if member.issym():
            # Symlink targets are relative to the directory holding the link.
            candidates.append(member_path.parent / member.linkname)
        elif member.islnk():
            # Hard-link targets are relative to the archive root.
            candidates.append(root / member.linkname)
        for candidate in candidates:
            resolved = candidate.resolve()
            if resolved != root and root not in resolved.parents:
                raise tarfile.ExtractError(
                    f"unsafe path in backup archive: {member.name}"
                )
    tar.extractall(path=str(dest))


def restore(data_dir: str):
    """Download the backup tarball from the HF dataset repo and unpack it.

    Logs and returns without doing anything when ``HF_TOKEN`` is unset, when
    the repo or the backup file does not exist, or when the download fails for
    any other reason. The archive is extracted into the *parent* of
    ``data_dir`` (created if needed), matching how ``backup`` names its
    members.

    Raises:
        tarfile.TarError: If the archive is unreadable or contains a member
            that would be written outside the extraction directory.
    """
    if not HF_TOKEN:
        print("[SYNC] HF_TOKEN not set β skipping restore.")
        return

    api = HfApi()
    repo_id = get_repo_id(api)
    print(f"[SYNC] Looking for backup in: {repo_id}/{BACKUP_FILENAME}")

    try:
        local_path = hf_hub_download(
            repo_id=repo_id,
            filename=BACKUP_FILENAME,
            repo_type="dataset",
            token=HF_TOKEN,
            local_dir=tempfile.gettempdir(),
        )
    except (RepositoryNotFoundError, EntryNotFoundError):
        print("[SYNC] No backup found β starting with fresh data.")
        return
    except Exception as e:
        # Best effort: a failed download must not block startup.
        print(f"[SYNC] Could not download backup: {e}")
        return

    print(f"[SYNC] Downloaded backup: {local_path}")

    target_root = Path(data_dir).parent
    target_root.mkdir(parents=True, exist_ok=True)
    print(f"[SYNC] Extracting to: {target_root}")
    with tarfile.open(local_path, "r:gz") as tar:
        _extract_archive(tar, target_root)
    print(f"[SYNC] β Restore complete β {data_dir}")
# ── Main ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Needs at least: <script> <action> <data_dir>; extra arguments are ignored.
    if len(sys.argv) < 3:
        print(f"Usage: python3 {sys.argv[0]} <backup|restore> <data_dir>")
        sys.exit(1)

    action = sys.argv[1].lower()
    data_dir = sys.argv[2]

    # Map each supported action name to the function that implements it.
    handlers = {"backup": backup, "restore": restore}
    if action not in handlers:
        print(f"Unknown action: {action}. Use 'backup' or 'restore'.")
        sys.exit(1)
    handlers[action](data_dir)