File size: 6,637 Bytes
15d23da |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
#!/usr/bin/env python3
import os
import shutil
import json
from pathlib import Path
from huggingface_hub import HfApi, create_repo
import tarfile
import tempfile
class HFStorageSync:
    """Sync a local data directory with a Hugging Face dataset repository.

    The directory is stored in the repository as a single gzip-compressed
    tar archive named ``data.tar.gz``. All public methods report problems
    by printing a message instead of raising.
    """

    # Name of the archive inside the dataset repository.
    _ARCHIVE_NAME = "data.tar.gz"

    # Directory entries that are never put into the archive.
    _SKIP_NAMES = ("test_write", ".gitkeep")

    # README uploaded once, right after the repository has been created.
    _README_CONTENT = (
        "# Open WebUI Storage\n"
        "This dataset stores persistent data for Open WebUI deployment.\n"
        "## Contents\n"
        "- `data.tar.gz`: Compressed archive containing all Open WebUI data including:\n"
        "- User configurations\n"
        "- Chat histories\n"
        "- Uploaded files\n"
        "- Database files\n"
        "This repository is automatically managed by the Open WebUI sync system.\n"
    )

    def __init__(self, repo_id, token=None, data_dir="/tmp/open-webui-data"):
        """Store the sync settings and build the Hub API client.

        Args:
            repo_id: Dataset repository id, e.g. ``"owner/name"``.
            token: Hugging Face access token; without it every sync
                operation is skipped.
            data_dir: Local directory that is archived / restored.
        """
        self.repo_id = repo_id
        self.data_dir = Path(data_dir)
        self.token = token
        # Initialize API with token directly
        self.api = HfApi(token=token) if token else HfApi()

    @staticmethod
    def _remove_file(path):
        """Delete *path*, tolerating a file that is already gone."""
        try:
            os.unlink(path)
        except FileNotFoundError:
            pass
        except OSError as e:
            print(f"Warning: could not remove temporary file {path}: {e}")

    @staticmethod
    def _is_within(base, candidate):
        """Return True when *candidate* is *base* or lies below it."""
        try:
            candidate.relative_to(base)
        except ValueError:
            return False
        return True

    def _safe_extract(self, archive_path):
        """Extract a ``.tar.gz`` archive into ``self.data_dir``.

        Every member is validated before anything is written, so an archive
        with an unsafe entry leaves the data directory untouched.

        Raises:
            ValueError: if a member path, or the target of a symbolic or
                hard link, resolves outside the data directory.
            tarfile.TarError, OSError: if the archive cannot be read or
                extracted.
        """
        dest = self.data_dir.resolve()
        with tarfile.open(archive_path, "r:gz") as tar:
            members = tar.getmembers()
            for member in members:
                # An absolute member name replaces ``dest`` in the join and
                # is therefore rejected by the containment check as well.
                target = (dest / member.name).resolve()
                if not self._is_within(dest, target):
                    raise ValueError(f"Unsafe path in archive: {member.name}")
                if member.issym():
                    # Symlink targets are relative to the link's directory.
                    link_target = (target.parent / member.linkname).resolve()
                elif member.islnk():
                    # Hard-link targets are relative to the archive root.
                    link_target = (dest / member.linkname).resolve()
                else:
                    continue
                if not self._is_within(dest, link_target):
                    raise ValueError(
                        f"Unsafe link in archive: {member.name} -> {member.linkname}"
                    )
            if hasattr(tarfile, "data_filter"):
                # Interpreters with extraction filters add further checks
                # (device files, unsafe permission bits, ...).
                tar.extractall(dest, members=members, filter="data")
            else:
                tar.extractall(dest, members=members)

    def _upload_readme(self):
        """Upload the initial README; failures are reported but not fatal."""
        tmp = tempfile.NamedTemporaryFile(
            mode="w", suffix=".md", delete=False, encoding="utf-8"
        )
        try:
            # Close the handle before uploading so the content is flushed
            # and the file can be reopened on every platform.
            with tmp:
                tmp.write(self._README_CONTENT)
            self.api.upload_file(
                path_or_fileobj=tmp.name,
                path_in_repo="README.md",
                repo_id=self.repo_id,
                repo_type="dataset",
                commit_message="Initial repository setup",
                token=self.token,
            )
        except Exception as e:
            print(f"Warning: could not upload initial README: {e}")
        finally:
            self._remove_file(tmp.name)

    def ensure_repo_exists(self):
        """Create repository if it doesn't exist.

        Returns:
            True when the repository is reachable or was created, False
            when no token is configured or creation failed.
        """
        if not self.token:
            print("No token provided, cannot create repository")
            return False

        try:
            # Check if repo exists
            self.api.repo_info(repo_id=self.repo_id, repo_type="dataset")
        except Exception:
            # Any lookup failure is treated as "missing"; ``exist_ok=True``
            # below keeps the creation attempt harmless if the repo exists.
            print(f"Repository {self.repo_id} not found, attempting to create...")
        else:
            print(f"Repository {self.repo_id} exists")
            return True

        try:
            create_repo(
                repo_id=self.repo_id,
                repo_type="dataset",
                token=self.token,
                private=True,  # Make it private by default
                exist_ok=True,
            )
        except Exception as create_error:
            print(f"Failed to create repository: {create_error}")
            return False

        print(f"Created repository {self.repo_id}")
        # The repository is usable even if the README upload fails.
        self._upload_readme()
        return True

    def download_data(self):
        """Download and extract data from HF dataset repo.

        Does nothing beyond creating the data directory when the directory
        is not writable, no token is set, or the repository is unavailable.
        """
        try:
            print("Downloading data from Hugging Face...")

            # Ensure data directory exists and is writable
            self.data_dir.mkdir(parents=True, exist_ok=True)
            probe = self.data_dir / "test_write"
            try:
                probe.touch()
                probe.unlink()
            except Exception as e:
                print(f"Warning: Data directory may not be writable: {e}")
                return
            print(f"Data directory {self.data_dir} is writable")

            if not self.token:
                print("No HF_TOKEN provided, skipping download")
                return

            # Ensure repository exists
            if not self.ensure_repo_exists():
                print("Could not access or create repository")
                return

            # Try to download the data archive
            try:
                archive_path = self.api.hf_hub_download(
                    repo_id=self.repo_id,
                    filename=self._ARCHIVE_NAME,
                    repo_type="dataset",
                    token=self.token,
                )
            except Exception as e:
                print(f"No existing data found (this is normal for first run): {e}")
                return

            # Report extraction problems separately so a corrupt or unsafe
            # archive is not mistaken for "no backup yet".
            try:
                self._safe_extract(archive_path)
            except Exception as e:
                print(f"Failed to extract downloaded archive: {e}")
                return
            print(f"Data extracted to {self.data_dir}")
        except Exception as e:
            print(f"Error during download: {e}")

    def upload_data(self):
        """Compress and upload data to HF dataset repo.

        Nothing is uploaded when no token is set, when the data directory
        is missing, or when it holds nothing but skipped placeholder files.
        """
        try:
            if not self.token:
                print("No HF_TOKEN provided, skipping upload")
                return

            print("Uploading data to Hugging Face...")

            # Skip test files; uploading an empty archive would overwrite
            # the existing backup with nothing.
            items = []
            if self.data_dir.exists():
                items = [
                    entry
                    for entry in self.data_dir.iterdir()
                    if entry.name not in self._SKIP_NAMES
                ]
            if not items:
                print("No data to upload")
                return

            # Ensure repository exists
            if not self.ensure_repo_exists():
                print("Could not access or create repository")
                return

            # Create temporary archive; close the descriptor right away so
            # tarfile is the only writer of the file.
            fd, archive_path = tempfile.mkstemp(suffix=".tar.gz")
            os.close(fd)
            try:
                with tarfile.open(archive_path, "w:gz") as tar:
                    for entry in items:
                        tar.add(entry, arcname=entry.name)

                # Upload to HF
                self.api.upload_file(
                    path_or_fileobj=archive_path,
                    path_in_repo=self._ARCHIVE_NAME,
                    repo_id=self.repo_id,
                    repo_type="dataset",
                    commit_message="Update Open WebUI data",
                    token=self.token,
                )
            finally:
                # Clean up even when archiving or uploading failed
                self._remove_file(archive_path)

            print("Data uploaded successfully")
        except Exception as e:
            print(f"Error uploading data: {e}")
def main():
    """Run the sync command given as the first command-line argument.

    Settings come from the environment: ``HF_STORAGE_REPO`` (repository
    id), ``HF_TOKEN`` (access token) and ``DATA_DIR`` (local directory).
    ``download`` restores the data directory and ``upload`` archives it;
    a missing or unknown command prints a usage line.
    """
    import sys

    storage = HFStorageSync(
        os.getenv("HF_STORAGE_REPO", "nxdev-org/open-webui-storage"),
        os.getenv("HF_TOKEN"),
        os.getenv("DATA_DIR", "/tmp/open-webui-data"),
    )

    # A missing argument is handled exactly like an unknown command.
    command = sys.argv[1] if len(sys.argv) > 1 else None
    if command == "download":
        storage.download_data()
    elif command == "upload":
        storage.upload_data()
    else:
        print("Usage: sync_storage.py [download|upload]")


if __name__ == "__main__":
    main()
|