# Page-header residue captured along with the file (not part of the script):
#   readeck / sync_storage.py
#   wwforonce's picture
#   add dataset
#   15d23da
#!/usr/bin/env python3
import os
import shutil
import json
from pathlib import Path
from huggingface_hub import HfApi, create_repo
import tarfile
import tempfile
class HFStorageSync:
    """Mirror a local data directory to and from a Hugging Face dataset repo.

    The directory is stored in the repository as one gzip-compressed tar
    archive named ``data.tar.gz``. All public methods are best-effort: they
    report problems with ``print`` instead of raising.
    """

    # Name of the archive inside the dataset repository.
    _ARCHIVE_NAME = "data.tar.gz"

    # Local marker files that must never end up in the archive.
    _SKIPPED_NAMES = ("test_write", ".gitkeep")

    # README uploaded once, right after the repository has been created.
    _README_CONTENT = (
        "# Open WebUI Storage\n"
        "This dataset stores persistent data for Open WebUI deployment.\n"
        "## Contents\n"
        "- `data.tar.gz`: Compressed archive containing all Open WebUI data including:\n"
        "- User configurations\n"
        "- Chat histories\n"
        "- Uploaded files\n"
        "- Database files\n"
        "This repository is automatically managed by the Open WebUI sync system.\n"
    )

    def __init__(self, repo_id, token=None, data_dir="/tmp/open-webui-data"):
        """Store the sync settings and build the Hub API client.

        Args:
            repo_id: Identifier of the dataset repository.
            token: Hugging Face access token; without one, every sync
                operation is skipped.
            data_dir: Local directory that is archived / restored.
        """
        self.repo_id = repo_id
        self.data_dir = Path(data_dir)
        self.token = token
        # Initialize API with token directly
        self.api = HfApi(token=token) if token else HfApi()

    def ensure_repo_exists(self):
        """Create the dataset repository if it cannot be found.

        Returns:
            True when the repository is reachable or was created (including
            its initial README upload), False when no token is configured or
            creation failed.
        """
        if not self.token:
            print("No token provided, cannot create repository")
            return False

        try:
            self.api.repo_info(repo_id=self.repo_id, repo_type="dataset")
        except Exception:
            # Any lookup failure is treated as "missing"; creation below uses
            # exist_ok=True, so an already existing repository is harmless.
            print(f"Repository {self.repo_id} not found, attempting to create...")
        else:
            print(f"Repository {self.repo_id} exists")
            return True

        try:
            create_repo(
                repo_id=self.repo_id,
                repo_type="dataset",
                token=self.token,
                private=True,  # Make it private by default
                exist_ok=True,
            )
            print(f"Created repository {self.repo_id}")
            self._upload_readme()
            return True
        except Exception as create_error:
            print(f"Failed to create repository: {create_error}")
            return False

    def _upload_readme(self):
        """Upload the initial README; the temporary copy is always removed."""
        with tempfile.TemporaryDirectory() as workdir:
            readme_path = os.path.join(workdir, "README.md")
            Path(readme_path).write_text(self._README_CONTENT, encoding="utf-8")
            self.api.upload_file(
                path_or_fileobj=readme_path,
                path_in_repo="README.md",
                repo_id=self.repo_id,
                repo_type="dataset",
                commit_message="Initial repository setup",
                token=self.token,
            )

    def _data_dir_is_writable(self):
        """Probe the data directory by creating and deleting a marker file."""
        probe = self.data_dir / "test_write"
        try:
            probe.touch()
            probe.unlink()
        except Exception as e:
            print(f"Warning: Data directory may not be writable: {e}")
            return False
        print(f"Data directory {self.data_dir} is writable")
        return True

    def _extract_archive(self, archive_path):
        """Extract the archive into the data directory, refusing unsafe members."""
        with tarfile.open(archive_path, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # The "data" filter rejects members that would be written
                # outside the destination (absolute paths, "..", bad links).
                tar.extractall(self.data_dir, filter="data")
                return
            # Interpreters without extraction filters: best-effort name check
            # before anything is written.
            root = self.data_dir.resolve()
            for member in tar.getmembers():
                target = (root / member.name).resolve()
                if target != root and root not in target.parents:
                    raise tarfile.TarError(
                        f"Unsafe path in archive: {member.name}"
                    )
            tar.extractall(self.data_dir)

    def _build_archive(self, archive_path, items):
        """Write the given top-level entries into a gzip-compressed tar file."""
        with tarfile.open(archive_path, "w:gz") as tar:
            for item in items:
                tar.add(item, arcname=item.name)

    def download_data(self):
        """Download ``data.tar.gz`` from the repository and extract it.

        The data directory is created if needed. The download is skipped when
        the directory is not writable, no token is configured, or the
        repository is unavailable. A failed download of the archive is
        reported as "no existing data"; a failed extraction is reported as a
        download error. Nothing is raised and nothing is returned.
        """
        try:
            print("Downloading data from Hugging Face...")
            # Ensure data directory exists and is writable
            self.data_dir.mkdir(parents=True, exist_ok=True)
            if not self._data_dir_is_writable():
                return
            if not self.token:
                print("No HF_TOKEN provided, skipping download")
                return
            # Ensure repository exists
            if not self.ensure_repo_exists():
                print("Could not access or create repository")
                return
            # Try to download the data archive
            try:
                file_path = self.api.hf_hub_download(
                    repo_id=self.repo_id,
                    filename=self._ARCHIVE_NAME,
                    repo_type="dataset",
                    token=self.token,
                )
            except Exception as e:
                print(f"No existing data found (this is normal for first run): {e}")
                return
            # Extraction problems propagate to the outer handler so they are
            # not mistaken for a missing archive.
            self._extract_archive(file_path)
            print(f"Data extracted to {self.data_dir}")
        except Exception as e:
            print(f"Error during download: {e}")

    def upload_data(self):
        """Archive the data directory and upload it as ``data.tar.gz``.

        The upload is skipped when no token is configured, when the directory
        is missing or holds nothing besides the marker files, or when the
        repository is unavailable. The temporary archive is removed whether
        or not the upload succeeds. Nothing is raised and nothing is returned.
        """
        try:
            if not self.token:
                print("No HF_TOKEN provided, skipping upload")
                return
            print("Uploading data to Hugging Face...")
            items = []
            if self.data_dir.exists():
                # Skip test files; they carry no user data.
                items = [
                    item
                    for item in self.data_dir.iterdir()
                    if item.name not in self._SKIPPED_NAMES
                ]
            if not items:
                # Avoid replacing the remote archive with an empty one.
                print("No data to upload")
                return
            # Ensure repository exists
            if not self.ensure_repo_exists():
                print("Could not access or create repository")
                return
            # The temporary directory (and the archive in it) is deleted on
            # exit from the block, even when the upload raises.
            with tempfile.TemporaryDirectory() as workdir:
                archive_path = os.path.join(workdir, self._ARCHIVE_NAME)
                self._build_archive(archive_path, items)
                # Upload to HF
                self.api.upload_file(
                    path_or_fileobj=archive_path,
                    path_in_repo=self._ARCHIVE_NAME,
                    repo_id=self.repo_id,
                    repo_type="dataset",
                    commit_message="Update Open WebUI data",
                    token=self.token,
                )
            print("Data uploaded successfully")
        except Exception as e:
            print(f"Error uploading data: {e}")
def main():
    """Run the sync action named by the first command-line argument.

    Settings come from the environment: ``HF_STORAGE_REPO`` (repository id),
    ``HF_TOKEN`` (access token) and ``DATA_DIR`` (local data directory).
    ``download`` and ``upload`` are the recognised actions; anything else,
    or no argument at all, prints the usage line.
    """
    import sys

    sync = HFStorageSync(
        os.getenv("HF_STORAGE_REPO", "nxdev-org/open-webui-storage"),
        os.getenv("HF_TOKEN"),
        os.getenv("DATA_DIR", "/tmp/open-webui-data"),
    )

    # Map each supported command to the method that implements it.
    actions = {
        "download": sync.download_data,
        "upload": sync.upload_data,
    }
    command = sys.argv[1] if len(sys.argv) > 1 else None
    action = actions.get(command)
    if action is None:
        print("Usage: sync_storage.py [download|upload]")
        return
    action()


if __name__ == "__main__":
    main()