Spaces:
Sleeping
Sleeping
File size: 6,361 Bytes
3d54ee6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 | """
sync.py — Backup and restore Open WebUI data to/from a HuggingFace Dataset.
Usage:
python3 sync.py backup /app/backend/data
python3 sync.py restore /app/backend/data
"""
import os
import shutil
import sys
import tarfile
import tempfile
from datetime import datetime, timezone
from pathlib import Path
try:
from huggingface_hub import HfApi, hf_hub_download, upload_file, create_repo
from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError
except ImportError:
print("[SYNC] huggingface_hub not installed. Run: pip install huggingface_hub")
sys.exit(1)
# ── Config ────────────────────────────────────────────────────────────────────
# Credentials and repo selection are read from the environment once, at import.
HF_TOKEN = os.getenv("HF_TOKEN", "")
HF_USERNAME = os.getenv("HF_USERNAME", "")  # looked up via the API when blank
DATASET_REPO = os.getenv("OWUI_DATASET_REPO", "")  # full repo id; wins if set

# Name of the single archive file kept in the dataset repo.
BACKUP_FILENAME = "open-webui-data.tar.gz"

# Names that are left out of the backup (large or ephemeral content).
# Entries starting with "*" are file-name suffix patterns; all others are
# matched against whole path components.
EXCLUDE_PATTERNS = {
    ".git",
    "node_modules",
    "__pycache__",
    "*.pyc",
    "uploads",  # drop this entry to include the uploads dir in backups
}
def get_repo_id(api: HfApi) -> str:
    """Work out which dataset repo holds the backup.

    Resolution order:
      1. ``DATASET_REPO`` when it is non-empty (returned verbatim).
      2. ``<owner>/<space>-data`` where ``owner`` is ``HF_USERNAME`` or, when
         that is blank, the ``name`` field returned by ``api.whoami`` and
         ``space`` is the last ``/``-separated segment of the ``SPACE_ID``
         environment variable (``"open-webui"`` when that is unset or empty).

    Exits the process with status 1 if the username lookup fails.
    """
    if DATASET_REPO:
        return DATASET_REPO

    owner = HF_USERNAME
    if not owner:
        try:
            owner = api.whoami(token=HF_TOKEN)["name"]
        except Exception as err:
            print(f"[SYNC] Could not determine HF username: {err}")
            sys.exit(1)

    space_id = os.environ.get("SPACE_ID", "")
    suffix = space_id.split("/")[-1] or "open-webui"
    return f"{owner}/{suffix}-data"
def ensure_repo(api: HfApi, repo_id: str) -> None:
    """Make sure the dataset repo ``repo_id`` exists, creating it if needed.

    Args:
        api: Client used to query the repo.
        repo_id: Full dataset repo id to check or create.

    A missing repo is created as a *private* dataset repo. Any error other
    than ``RepositoryNotFoundError`` from the lookup propagates to the caller.
    """
    try:
        api.repo_info(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)
    except RepositoryNotFoundError:
        print(f"[SYNC] Creating private dataset repo: {repo_id}")
        create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            private=True,
            token=HF_TOKEN,
        )
        # The status message must be a single-line literal; it was previously
        # split across two lines by a mis-decoded check-mark character.
        print(f"[SYNC] ✅ Created: {repo_id}")
    else:
        print(f"[SYNC] Dataset repo exists: {repo_id}")
def should_exclude(path: Path, base: Path, patterns=None) -> bool:
    """Return True if ``path`` should be left out of the backup.

    Args:
        path: File being considered; must be located under ``base``.
        base: Root of the directory tree being backed up.
        patterns: Optional collection of exclusion patterns. Defaults to the
            module-level ``EXCLUDE_PATTERNS`` when omitted.

    Matching rules:
        * A pattern starting with ``*`` is a suffix match on the file name
          (``"*.pyc"`` matches ``foo.pyc``).
        * Any other pattern must equal one whole path component *below*
          ``base``.

    Raises:
        ValueError: If ``path`` is not inside ``base``.
    """
    if patterns is None:
        patterns = EXCLUDE_PATTERNS

    # Only components relative to ``base`` are considered. Matching against
    # the absolute path would exclude every file whenever the data dir itself
    # lives under a directory whose name is in the pattern set.
    rel_parts = path.relative_to(base).parts

    for pat in patterns:
        if pat.startswith("*"):
            if path.name.endswith(pat[1:]):
                return True
        elif pat in rel_parts:
            return True
    return False
def backup(data_dir: str) -> None:
    """Create a gzipped tarball of ``data_dir`` and upload it to the HF dataset.

    Args:
        data_dir: Directory whose files are archived.

    Behaviour:
        * Does nothing (with a log line) when ``HF_TOKEN`` is unset or
          ``data_dir`` does not exist.
        * Only regular files that ``should_exclude`` does not reject are
          archived. Member names are stored relative to the *parent* of
          ``data_dir``, so the archive has one top-level directory named
          after the data dir.
        * If no file ends up in the archive, the upload is skipped so an empty
          tarball never replaces a previously uploaded backup.
        * The temporary tarball is always removed, even when the upload fails.

    Errors from archiving or uploading propagate to the caller.
    """
    if not HF_TOKEN:
        print("[SYNC] HF_TOKEN not set — skipping backup.")
        return

    data_path = Path(data_dir)
    if not data_path.exists():
        print(f"[SYNC] Data dir does not exist yet: {data_dir}")
        return

    api = HfApi()
    repo_id = get_repo_id(api)
    ensure_repo(api, repo_id)

    print(f"[SYNC] Creating backup tarball from: {data_dir}")
    # The handle is closed right away; only the reserved path is kept so that
    # tarfile can reopen it by name.
    with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        archived = 0
        with tarfile.open(tmp_path, "w:gz") as tar:
            for item in data_path.rglob("*"):
                if item.is_file() and not should_exclude(item, data_path):
                    arcname = item.relative_to(data_path.parent)
                    tar.add(item, arcname=str(arcname))
                    archived += 1

        if not archived:
            # Uploading an empty archive would overwrite the last good backup.
            print("[SYNC] No files to back up — skipping upload.")
            return

        size_mb = os.path.getsize(tmp_path) / (1024 * 1024)
        print(f"[SYNC] Tarball size: {size_mb:.1f} MB")

        # Timezone-aware replacement for the deprecated datetime.utcnow();
        # the formatted text is identical.
        stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M")

        print(f"[SYNC] Uploading to {repo_id}/{BACKUP_FILENAME}...")
        api.upload_file(
            path_or_fileobj=tmp_path,
            path_in_repo=BACKUP_FILENAME,
            repo_id=repo_id,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=f"Auto-backup {stamp} UTC",
        )
        print(f"[SYNC] ✅ Backup complete → {repo_id}/{BACKUP_FILENAME}")
    finally:
        os.unlink(tmp_path)
def restore(data_dir: str) -> None:
    """Download the backup tarball from the HF dataset and unpack it.

    Args:
        data_dir: Directory the data should end up in. The archive is
            extracted into the *parent* of this path, which is created if
            missing.

    Behaviour:
        * Does nothing (with a log line) when ``HF_TOKEN`` is unset, when the
          repo or backup file does not exist, or when the download fails.
        * The tarball is downloaded into a private temporary directory that is
          removed once extraction finishes or fails.
        * When the running Python provides tarfile extraction filters, the
          ``"data"`` filter is used to reject unsafe members.

    Errors raised while opening or extracting the archive propagate to the
    caller.
    """
    if not HF_TOKEN:
        print("[SYNC] HF_TOKEN not set — skipping restore.")
        return

    api = HfApi()
    repo_id = get_repo_id(api)
    extract_root = Path(data_dir).parent

    print(f"[SYNC] Looking for backup in: {repo_id}/{BACKUP_FILENAME}")
    # A dedicated temp dir keeps the downloaded tarball (and any bookkeeping
    # files the hub client writes next to it) from lingering in the shared
    # temp directory after the restore.
    with tempfile.TemporaryDirectory(prefix="owui-restore-") as download_dir:
        try:
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=BACKUP_FILENAME,
                repo_type="dataset",
                token=HF_TOKEN,
                local_dir=download_dir,
            )
        except (RepositoryNotFoundError, EntryNotFoundError):
            print("[SYNC] No backup found — starting with fresh data.")
            return
        except Exception as e:
            # Best effort: a failed download must not stop the caller.
            print(f"[SYNC] Could not download backup: {e}")
            return

        print(f"[SYNC] Downloaded backup: {local_path}")
        extract_root.mkdir(parents=True, exist_ok=True)
        print(f"[SYNC] Extracting to: {extract_root}")

        with tarfile.open(local_path, "r:gz") as tar:
            # Feature-detect instead of catching TypeError: a TypeError raised
            # part-way through a filtered extraction must not silently trigger
            # a second, unfiltered extraction.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(path=str(extract_root), filter="data")
            else:
                # NOTE(review): interpreters without extraction filters fall
                # back to unfiltered extraction, which offers no protection
                # against path traversal in a tampered archive.
                tar.extractall(path=str(extract_root))

    print(f"[SYNC] ✅ Restore complete → {data_dir}")
# ── Main ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Expected invocation: sync.py <backup|restore> <data_dir>
    if len(sys.argv) < 3:
        print(f"Usage: python3 {sys.argv[0]} <backup|restore> <data_dir>")
        sys.exit(1)

    action, data_dir = sys.argv[1].lower(), sys.argv[2]

    # Map each supported action name to the function that implements it.
    handler = {"backup": backup, "restore": restore}.get(action)
    if handler is None:
        print(f"Unknown action: {action}. Use 'backup' or 'restore'.")
        sys.exit(1)

    handler(data_dir)
|