#!/usr/bin/env python3
"""Sync the Open WebUI data directory with a Hugging Face dataset repository.

Usage: sync_storage.py [download|upload]

Configuration is read from the environment:

- ``HF_STORAGE_REPO``: dataset repository id (default
  ``nxdev-org/open-webui-storage``).
- ``HF_TOKEN``: access token; without it both commands are skipped.
- ``DATA_DIR``: local data directory (default ``/tmp/open-webui-data``).
"""
import json
import os
import shutil
import sys
import tarfile
import tempfile
from pathlib import Path

from huggingface_hub import HfApi, create_repo

# Name of the single archive stored in the dataset repository.
ARCHIVE_NAME = "data.tar.gz"
REPO_TYPE = "dataset"

# Entries of the data directory that are never archived: the write-probe file
# created by download_data() and the placeholder that keeps the directory in git.
_SKIPPED_NAMES = frozenset({"test_write", ".gitkeep"})

README_CONTENT = """# Open WebUI Storage

This dataset stores persistent data for Open WebUI deployment.

## Contents

- `data.tar.gz`: Compressed archive containing all Open WebUI data including:
  - User configurations
  - Chat histories
  - Uploaded files
  - Database files

This repository is automatically managed by the Open WebUI sync system.
"""


class HFStorageSync:
    """Download/upload a data directory as ``data.tar.gz`` in a HF dataset repo.

    All public methods are best-effort: failures are reported on stdout and
    never raised, so a failed sync does not abort the calling startup or
    shutdown script.
    """

    def __init__(self, repo_id, token=None, data_dir="/tmp/open-webui-data"):
        """Store the configuration and build the Hugging Face API client.

        Args:
            repo_id: Dataset repository id, e.g. ``"org/name"``.
            token: Hugging Face access token; falsy values mean "no token".
            data_dir: Local directory that is archived / restored.
        """
        self.repo_id = repo_id
        self.data_dir = Path(data_dir)
        self.token = token
        # An empty token is passed as None, which is HfApi's own default.
        self.api = HfApi(token=token or None)

    def ensure_repo_exists(self):
        """Create the repository if it doesn't exist.

        Returns:
            True when the repository exists or was created, False when no
            token is configured or creation failed.
        """
        if not self.token:
            print("No token provided, cannot create repository")
            return False

        try:
            self.api.repo_info(repo_id=self.repo_id, repo_type=REPO_TYPE)
        except Exception:
            # The lookup may fail for reasons other than a missing repository;
            # create_repo(exist_ok=True) below is harmless if it already exists.
            print(f"Repository {self.repo_id} not found, attempting to create...")
        else:
            print(f"Repository {self.repo_id} exists")
            return True

        try:
            create_repo(
                repo_id=self.repo_id,
                repo_type=REPO_TYPE,
                token=self.token,
                private=True,  # Make it private by default
                exist_ok=True,
            )
        except Exception as create_error:
            print(f"Failed to create repository: {create_error}")
            return False
        print(f"Created repository {self.repo_id}")

        # The README is cosmetic: the repository already exists at this point,
        # so a failed README upload must not block the data sync.
        try:
            self.api.upload_file(
                # Uploading bytes avoids a temporary file that could be leaked.
                path_or_fileobj=README_CONTENT.encode("utf-8"),
                path_in_repo="README.md",
                repo_id=self.repo_id,
                repo_type=REPO_TYPE,
                commit_message="Initial repository setup",
                token=self.token,
            )
        except Exception as readme_error:
            print(f"Warning: could not upload initial README: {readme_error}")
        return True

    def download_data(self):
        """Download ``data.tar.gz`` from the repo and extract it into data_dir.

        Does nothing (apart from printing the reason) when the data directory
        is not writable, no token is configured, the repository cannot be
        accessed or created, or the archive does not exist yet.
        """
        try:
            print("Downloading data from Hugging Face...")

            self.data_dir.mkdir(parents=True, exist_ok=True)
            if not self._is_data_dir_writable():
                return

            if not self.token:
                print("No HF_TOKEN provided, skipping download")
                return

            if not self.ensure_repo_exists():
                print("Could not access or create repository")
                return

            try:
                archive_path = self.api.hf_hub_download(
                    repo_id=self.repo_id,
                    filename=ARCHIVE_NAME,
                    repo_type=REPO_TYPE,
                    token=self.token,
                )
            except Exception as e:
                print(f"No existing data found (this is normal for first run): {e}")
                return

            # Extraction happens outside the handler above so that a corrupt
            # archive is reported as an error, not as "no existing data".
            with tarfile.open(archive_path, "r:gz") as tar:
                self._extract(tar)
            print(f"Data extracted to {self.data_dir}")
        except Exception as e:
            print(f"Error during download: {e}")

    def upload_data(self):
        """Compress the contents of data_dir and upload them as ``data.tar.gz``.

        Does nothing (apart from printing the reason) when no token is
        configured, there is nothing to archive, or the repository cannot be
        accessed or created.
        """
        try:
            if not self.token:
                print("No HF_TOKEN provided, skipping upload")
                return

            print("Uploading data to Hugging Face...")

            # Decide on the filtered list: a directory holding only skipped
            # files must not overwrite the remote backup with an empty archive.
            items = self._items_to_archive()
            if not items:
                print("No data to upload")
                return

            if not self.ensure_repo_exists():
                print("Could not access or create repository")
                return

            # The temporary directory is removed even when archiving or
            # uploading raises, so no archive is left behind on failure.
            with tempfile.TemporaryDirectory() as tmp_dir:
                archive_path = Path(tmp_dir) / ARCHIVE_NAME
                with tarfile.open(archive_path, "w:gz") as tar:
                    for item in items:
                        tar.add(item, arcname=item.name)

                self.api.upload_file(
                    path_or_fileobj=str(archive_path),
                    path_in_repo=ARCHIVE_NAME,
                    repo_id=self.repo_id,
                    repo_type=REPO_TYPE,
                    commit_message="Update Open WebUI data",
                    token=self.token,
                )

            print("Data uploaded successfully")
        except Exception as e:
            print(f"Error uploading data: {e}")

    def _is_data_dir_writable(self):
        """Return True if a probe file can be created and removed in data_dir."""
        probe = self.data_dir / "test_write"
        try:
            probe.touch()
            probe.unlink()
        except Exception as e:
            print(f"Warning: Data directory may not be writable: {e}")
            return False
        print(f"Data directory {self.data_dir} is writable")
        return True

    def _items_to_archive(self):
        """Return the top-level entries of data_dir that belong in the archive."""
        if not self.data_dir.is_dir():
            return []
        return sorted(
            entry
            for entry in self.data_dir.iterdir()
            if entry.name not in _SKIPPED_NAMES
        )

    def _extract(self, tar):
        """Extract an open TarFile into data_dir, refusing unsafe members."""
        if hasattr(tarfile, "data_filter"):
            # The "data" filter rejects absolute paths and members or links
            # that would land outside the destination directory.
            tar.extractall(self.data_dir, filter="data")
        else:
            # NOTE: this interpreter predates extraction filters, so the
            # archive content is trusted as-is here.
            tar.extractall(self.data_dir)


def main():
    """Run the ``download`` or ``upload`` command named on the command line."""
    repo_id = os.getenv("HF_STORAGE_REPO", "nxdev-org/open-webui-storage")
    token = os.getenv("HF_TOKEN")
    data_dir = os.getenv("DATA_DIR", "/tmp/open-webui-data")

    sync = HFStorageSync(repo_id, token, data_dir)
    commands = {
        "download": sync.download_data,
        "upload": sync.upload_data,
    }

    command = commands.get(sys.argv[1]) if len(sys.argv) > 1 else None
    if command is None:
        print("Usage: sync_storage.py [download|upload]")
        return
    command()


if __name__ == "__main__":
    main()