|
|
|
|
|
import os |
|
|
import shutil |
|
|
import json |
|
|
from pathlib import Path |
|
|
from huggingface_hub import HfApi, create_repo |
|
|
import tarfile |
|
|
import tempfile |
|
|
|
|
|
class HFStorageSync:
    """Sync a local data directory with a Hugging Face dataset repository.

    The whole directory is stored remotely as a single gzip-compressed tar
    archive (``data.tar.gz``). ``download_data`` restores that archive into
    ``data_dir``; ``upload_data`` re-creates it from ``data_dir`` and pushes
    it back. Every public method reports problems with ``print`` instead of
    raising, so a failed sync never aborts the calling process.
    """

    # Name of the archive inside the dataset repository.
    _ARCHIVE_NAME = "data.tar.gz"

    # Probe/placeholder files that must never be treated as user data.
    _EXCLUDED_NAMES = frozenset({"test_write", ".gitkeep"})

    # README pushed once, right after the repository has been created.
    _README_CONTENT = (
        "# Open WebUI Storage\n"
        "\n"
        "This dataset stores persistent data for Open WebUI deployment.\n"
        "\n"
        "## Contents\n"
        "\n"
        "- `data.tar.gz`: Compressed archive containing all Open WebUI data including:\n"
        "  - User configurations\n"
        "  - Chat histories\n"
        "  - Uploaded files\n"
        "  - Database files\n"
        "\n"
        "This repository is automatically managed by the Open WebUI sync system.\n"
    )

    def __init__(self, repo_id, token=None, data_dir="/tmp/open-webui-data"):
        """Store the sync settings and build the Hugging Face API client.

        Args:
            repo_id: Dataset repository identifier, e.g. ``"user/name"``.
            token: Hugging Face access token. Without one, every sync
                operation is skipped.
            data_dir: Local directory that is archived and restored.
        """
        self.repo_id = repo_id
        self.data_dir = Path(data_dir)
        self.token = token
        self.api = HfApi(token=token) if token else HfApi()

    def ensure_repo_exists(self):
        """Make sure the dataset repository exists, creating it if needed.

        Returns:
            bool: True when the repository is reachable or was created,
            False when no token is configured or creation failed.
        """
        if not self.token:
            print("No token provided, cannot create repository")
            return False

        try:
            self.api.repo_info(repo_id=self.repo_id, repo_type="dataset")
        except Exception:
            # Any lookup failure is treated as "missing"; create_repo below
            # is called with exist_ok=True, so retrying creation is harmless.
            print(f"Repository {self.repo_id} not found, attempting to create...")
        else:
            print(f"Repository {self.repo_id} exists")
            return True

        try:
            create_repo(
                repo_id=self.repo_id,
                repo_type="dataset",
                token=self.token,
                private=True,
                exist_ok=True,
            )
        except Exception as create_error:
            print(f"Failed to create repository: {create_error}")
            return False
        print(f"Created repository {self.repo_id}")

        # The README is cosmetic: the repository is usable without it, so a
        # failure here must not be reported as a failed creation. Passing the
        # content as bytes avoids a temporary file that could be leaked.
        try:
            self.api.upload_file(
                path_or_fileobj=self._README_CONTENT.encode("utf-8"),
                path_in_repo="README.md",
                repo_id=self.repo_id,
                repo_type="dataset",
                commit_message="Initial repository setup",
                token=self.token,
            )
        except Exception as readme_error:
            print(f"Warning: could not upload README: {readme_error}")
        return True

    def _extract_archive(self, archive_path):
        """Extract a ``.tar.gz`` archive into ``data_dir`` after validating it.

        Every member is checked before anything is written, so a rejected
        archive leaves the data directory untouched.

        Args:
            archive_path: Path of the gzip-compressed tar archive.

        Raises:
            ValueError: If a member (or a link target) would land outside
                ``data_dir``, or if a member is a device/FIFO node.
            tarfile.TarError, OSError: If the archive cannot be read or
                written out.
        """
        root = self.data_dir.resolve()

        def is_inside(candidate):
            return candidate == root or root in candidate.parents

        with tarfile.open(archive_path, "r:gz") as tar:
            for member in tar.getmembers():
                # An absolute member name replaces ``root`` in the join and is
                # therefore rejected by the containment check as well.
                destination = (root / member.name).resolve()
                if not is_inside(destination):
                    raise ValueError(
                        f"Unsafe path in archive: {member.name!r}"
                    )
                if member.isdev():
                    raise ValueError(
                        f"Unsupported special file in archive: {member.name!r}"
                    )
                if member.issym():
                    # Symlink targets are relative to the link's directory.
                    link_target = (
                        (root / member.name).parent / member.linkname
                    ).resolve()
                elif member.islnk():
                    # Hard-link targets are relative to the archive root.
                    link_target = (root / member.linkname).resolve()
                else:
                    continue
                if not is_inside(link_target):
                    raise ValueError(
                        f"Unsafe link in archive: {member.name!r} -> "
                        f"{member.linkname!r}"
                    )

            if hasattr(tarfile, "data_filter"):
                # Use the standard extraction filter where the running
                # Python provides it, as a second line of defence.
                tar.extractall(root, filter="data")
            else:
                tar.extractall(root)

    def _items_to_archive(self):
        """Return the top-level entries of ``data_dir`` that hold real data.

        Returns:
            list[Path]: Sorted entries, excluding the probe/placeholder files;
            empty when the directory is missing or holds nothing else.
        """
        if not self.data_dir.is_dir():
            return []
        return sorted(
            entry
            for entry in self.data_dir.iterdir()
            if entry.name not in self._EXCLUDED_NAMES
        )

    def download_data(self):
        """Download ``data.tar.gz`` from the repository and unpack it locally.

        Creates ``data_dir`` if necessary. Skips the download when the
        directory is not writable, when no token is configured, or when the
        repository cannot be accessed. Returns None in every case.
        """
        try:
            print("Downloading data from Hugging Face...")

            self.data_dir.mkdir(parents=True, exist_ok=True)

            # Probe writability up front so a read-only volume is reported
            # clearly instead of failing half-way through extraction.
            probe = self.data_dir / "test_write"
            try:
                probe.touch()
                probe.unlink()
            except OSError as e:
                print(f"Warning: Data directory may not be writable: {e}")
                return
            print(f"Data directory {self.data_dir} is writable")

            if not self.token:
                print("No HF_TOKEN provided, skipping download")
                return

            if not self.ensure_repo_exists():
                print("Could not access or create repository")
                return

            try:
                file_path = self.api.hf_hub_download(
                    repo_id=self.repo_id,
                    filename=self._ARCHIVE_NAME,
                    repo_type="dataset",
                    token=self.token,
                )
            except Exception as e:
                print(f"No existing data found (this is normal for first run): {e}")
                return

            # Extraction problems are reported separately: a corrupt or
            # unsafe archive is not the same thing as "no data yet".
            try:
                self._extract_archive(file_path)
            except Exception as e:
                print(f"Error extracting data: {e}")
                return

            print(f"Data extracted to {self.data_dir}")

        except Exception as e:
            print(f"Error during download: {e}")

    def upload_data(self):
        """Archive ``data_dir`` and upload it as ``data.tar.gz``.

        Nothing is uploaded when no token is configured, when the directory
        holds no real data (probe/placeholder files do not count, so an empty
        archive never overwrites an existing backup), or when the repository
        cannot be accessed. Returns None in every case.
        """
        try:
            if not self.token:
                print("No HF_TOKEN provided, skipping upload")
                return

            print("Uploading data to Hugging Face...")

            items = self._items_to_archive()
            if not items:
                print("No data to upload")
                return

            if not self.ensure_repo_exists():
                print("Could not access or create repository")
                return

            # mkstemp + explicit close: the archive is written through its
            # path, and the finally clause removes it even if the upload fails.
            handle, archive_path = tempfile.mkstemp(suffix=".tar.gz")
            os.close(handle)
            try:
                with tarfile.open(archive_path, "w:gz") as tar:
                    for item in items:
                        tar.add(item, arcname=item.name)

                self.api.upload_file(
                    path_or_fileobj=archive_path,
                    path_in_repo=self._ARCHIVE_NAME,
                    repo_id=self.repo_id,
                    repo_type="dataset",
                    commit_message="Update Open WebUI data",
                    token=self.token,
                )
            finally:
                os.unlink(archive_path)

            print("Data uploaded successfully")

        except Exception as e:
            print(f"Error uploading data: {e}")
|
|
|
|
|
def main():
    """Command-line entry point: run ``download`` or ``upload`` once.

    Settings come from the environment: ``HF_STORAGE_REPO`` (repository id),
    ``HF_TOKEN`` (access token) and ``DATA_DIR`` (local data directory).
    The first command-line argument selects the action; a missing or
    unknown argument prints the usage line.
    """
    import sys

    sync = HFStorageSync(
        os.getenv("HF_STORAGE_REPO", "nxdev-org/open-webui-storage"),
        os.getenv("HF_TOKEN"),
        os.getenv("DATA_DIR", "/tmp/open-webui-data"),
    )

    # Map each supported command to the bound method that implements it.
    actions = {
        "download": sync.download_data,
        "upload": sync.upload_data,
    }

    command = sys.argv[1] if len(sys.argv) > 1 else None
    action = actions.get(command)
    if action is None:
        print("Usage: sync_storage.py [download|upload]")
        return
    action()
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|