dfdg / sync.py
anuragwank23's picture
Upload 5 files
3d54ee6 verified
"""
sync.py — Backup and restore Open WebUI data to/from a HuggingFace Dataset repo.
Usage:
python3 sync.py backup /app/backend/data
python3 sync.py restore /app/backend/data
"""
import os
import sys
import shutil
import tarfile
import tempfile
from pathlib import Path
from datetime import datetime
try:
from huggingface_hub import HfApi, hf_hub_download, upload_file, create_repo
from huggingface_hub.utils import RepositoryNotFoundError, EntryNotFoundError
except ImportError:
print("[SYNC] huggingface_hub not installed. Run: pip install huggingface_hub")
sys.exit(1)
# ── Config ────────────────────────────────────────────────────────────────────
# Credentials and repo selection are read from the environment once, at import.
HF_TOKEN = os.getenv("HF_TOKEN", "")
HF_USERNAME = os.getenv("HF_USERNAME", "")  # auto-detected if blank
DATASET_REPO = os.getenv("OWUI_DATASET_REPO", "")  # override if needed

# Name under which the archive is stored inside the dataset repo.
BACKUP_FILENAME = "open-webui-data.tar.gz"

# Entries left out of the backup because they are large or ephemeral.
# A pattern beginning with "*" is a file-name suffix match; any other entry
# is compared against individual path components.
EXCLUDE_PATTERNS = set(
    (
        "__pycache__",
        "*.pyc",
        "node_modules",
        ".git",
        "uploads",  # exclude uploads dir if large; remove this to include
    )
)
def get_repo_id(api: HfApi) -> str:
    """Work out which dataset repo holds the backup.

    Resolution order:
      1. ``DATASET_REPO`` when it is non-empty (returned verbatim).
      2. ``<username>/<space>-data``, where the username is ``HF_USERNAME`` or,
         when that is blank, the ``"name"`` field returned by ``api.whoami``,
         and ``<space>`` is the last ``/``-separated piece of the ``SPACE_ID``
         environment variable (``"open-webui"`` when that piece is empty).

    Exits the process with status 1 if the username lookup fails.
    """
    if DATASET_REPO:
        return DATASET_REPO

    if HF_USERNAME:
        owner = HF_USERNAME
    else:
        try:
            owner = api.whoami(token=HF_TOKEN)["name"]
        except Exception as exc:
            print(f"[SYNC] Could not determine HF username: {exc}")
            sys.exit(1)

    space_id = os.environ.get("SPACE_ID", "")
    # Keep only the part after the final "/" (the whole value if there is none).
    suffix = space_id.rsplit("/", 1)[-1]
    if not suffix:
        suffix = "open-webui"
    return f"{owner}/{suffix}-data"
def ensure_repo(api: HfApi, repo_id: str):
    """Make sure the dataset repo ``repo_id`` exists, creating it if needed.

    The repo is probed with ``api.repo_info``; when that raises
    ``RepositoryNotFoundError`` a private dataset repo is created instead.
    Any other error from the probe or the creation call propagates.
    """
    # Keyword arguments shared by the existence probe and the creation call.
    target = {"repo_id": repo_id, "repo_type": "dataset", "token": HF_TOKEN}
    try:
        api.repo_info(**target)
    except RepositoryNotFoundError:
        print(f"[SYNC] Creating private dataset repo: {repo_id}")
        create_repo(private=True, **target)
        print(f"[SYNC] βœ… Created: {repo_id}")
    else:
        print(f"[SYNC] Dataset repo exists: {repo_id}")
def should_exclude(path: Path, base: Path, patterns=None) -> bool:
    """Return True if ``path`` should be left out of the backup.

    Only the portion of ``path`` below ``base`` is examined for directory-name
    patterns, so components above the data directory (for example a data dir
    that itself lives under a folder called ``uploads``) never cause the whole
    tree to be excluded.

    Args:
        path: File or directory being considered; must be located under ``base``.
        base: Root of the tree being backed up.
        patterns: Optional iterable of pattern strings to use instead of the
            module-level ``EXCLUDE_PATTERNS``. A pattern starting with ``*`` is
            a suffix match on the final path component; any other pattern must
            equal one of the path components below ``base``.

    Returns:
        True when any pattern matches, False otherwise.

    Raises:
        ValueError: If ``path`` is not located under ``base``.
    """
    if patterns is None:
        patterns = EXCLUDE_PATTERNS

    # Components relative to the backup root; the absolute prefix is irrelevant.
    rel_parts = path.relative_to(base).parts

    for pat in patterns:
        if pat.startswith("*"):
            # "*.ext" style: compare against the file name's suffix.
            if path.name.endswith(pat[1:]):
                return True
        elif pat in rel_parts:
            return True
    return False
def backup(data_dir: str):
    """Create a gzipped tarball of ``data_dir`` and upload it to the HF dataset.

    Files are stored in the archive relative to the *parent* of ``data_dir``,
    so the archive's top-level entry is the data directory's own name.
    Only regular files that ``should_exclude`` does not reject are archived.

    The function returns without uploading when ``HF_TOKEN`` is empty, when
    ``data_dir`` does not exist, or when no files were archived (so an empty
    directory can never overwrite an existing remote backup with an empty
    archive).

    Args:
        data_dir: Path of the directory to back up.

    Raises:
        Exception: Errors from archiving or from the upload call propagate to
            the caller; the temporary archive is removed in every case.
    """
    # Local import: the file header only imports ``datetime`` from this module.
    from datetime import timezone

    if not HF_TOKEN:
        print("[SYNC] HF_TOKEN not set β€” skipping backup.")
        return

    data_path = Path(data_dir)
    if not data_path.exists():
        print(f"[SYNC] Data dir does not exist yet: {data_dir}")
        return

    api = HfApi()
    repo_id = get_repo_id(api)
    ensure_repo(api, repo_id)

    print(f"[SYNC] Creating backup tarball from: {data_dir}")
    # Reserve a temp file name; tarfile reopens it by path below.
    with tempfile.NamedTemporaryFile(suffix=".tar.gz", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        archive_root = data_path.parent
        archived = 0
        with tarfile.open(tmp_path, "w:gz") as tar:
            for item in data_path.rglob("*"):
                if item.is_file() and not should_exclude(item, data_path):
                    tar.add(item, arcname=str(item.relative_to(archive_root)))
                    archived += 1

        if archived == 0:
            # Uploading would replace any existing backup with an empty one.
            print(f"[SYNC] No files to back up in: {data_dir} - skipping upload.")
            return

        size_mb = os.path.getsize(tmp_path) / (1024 * 1024)
        print(f"[SYNC] Tarball size: {size_mb:.1f} MB")
        print(f"[SYNC] Uploading to {repo_id}/{BACKUP_FILENAME}...")

        # Timezone-aware replacement for the deprecated datetime.utcnow();
        # the formatted text is unchanged.
        stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M")
        api.upload_file(
            path_or_fileobj=tmp_path,
            path_in_repo=BACKUP_FILENAME,
            repo_id=repo_id,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=f"Auto-backup {stamp} UTC",
        )
        print(f"[SYNC] βœ… Backup complete β†’ {repo_id}/{BACKUP_FILENAME}")
    finally:
        os.unlink(tmp_path)
def _is_within(root: str, target: str) -> bool:
    """Return True if ``target`` is ``root`` itself or lies beneath it."""
    try:
        return os.path.commonpath([root, target]) == root
    except ValueError:
        # Raised e.g. for paths on different drives; those are never nested.
        return False


def _safe_members(tar, dest: str):
    """Yield members of ``tar`` that cannot write or link outside ``dest``."""
    root = os.path.realpath(dest)
    for member in tar.getmembers():
        target = os.path.realpath(os.path.join(root, member.name))
        if not _is_within(root, target):
            print(f"[SYNC] Skipping unsafe archive member: {member.name}")
            continue
        if member.isdev():
            # Device nodes and FIFOs have no place in a data backup.
            print(f"[SYNC] Skipping special file in archive: {member.name}")
            continue
        if member.issym():
            # Symlink targets are relative to the link's own directory.
            link_target = os.path.realpath(
                os.path.join(os.path.dirname(target), member.linkname)
            )
        elif member.islnk():
            # Hard-link targets are relative to the archive root.
            link_target = os.path.realpath(os.path.join(root, member.linkname))
        else:
            link_target = None
        if link_target is not None and not _is_within(root, link_target):
            print(f"[SYNC] Skipping link pointing outside target: {member.name}")
            continue
        yield member


def restore(data_dir: str):
    """Download the backup tarball from the HF dataset and extract it.

    The archive is extracted into the *parent* of ``data_dir``; this relies on
    the archive's top-level entry being the data directory's name, which is how
    ``backup`` lays it out.

    The function returns without extracting when ``HF_TOKEN`` is empty, when
    the repo or the backup file does not exist, or when the download fails for
    any other reason (the error is printed).

    Args:
        data_dir: Path of the directory the backup should be restored to.

    Raises:
        Exception: Errors raised while opening or extracting the archive
            propagate to the caller.
    """
    if not HF_TOKEN:
        print("[SYNC] HF_TOKEN not set β€” skipping restore.")
        return

    api = HfApi()
    repo_id = get_repo_id(api)
    print(f"[SYNC] Looking for backup in: {repo_id}/{BACKUP_FILENAME}")

    try:
        local_path = hf_hub_download(
            repo_id=repo_id,
            filename=BACKUP_FILENAME,
            repo_type="dataset",
            token=HF_TOKEN,
            local_dir=tempfile.gettempdir(),
        )
    except (RepositoryNotFoundError, EntryNotFoundError):
        print("[SYNC] No backup found β€” starting with fresh data.")
        return
    except Exception as e:
        # Best effort: a failed download must not prevent a fresh start.
        print(f"[SYNC] Could not download backup: {e}")
        return

    print(f"[SYNC] Downloaded backup: {local_path}")

    dest = Path(data_dir).parent
    dest.mkdir(parents=True, exist_ok=True)
    print(f"[SYNC] Extracting to: {dest}")

    with tarfile.open(local_path, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are available: let the stdlib reject path
            # traversal, unsafe links and special files.
            tar.extractall(path=str(dest), filter="data")
        else:
            # Older Python without extraction filters: vet members ourselves
            # instead of extracting the archive unchecked.
            tar.extractall(
                path=str(dest),
                members=list(_safe_members(tar, str(dest))),
            )

    print(f"[SYNC] βœ… Restore complete β†’ {data_dir}")
# ── Main ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Command line: <script> <backup|restore> <data_dir>
    if len(sys.argv) < 3:
        print(f"Usage: python3 {sys.argv[0]} <backup|restore> <data_dir>")
        sys.exit(1)

    action, data_dir = sys.argv[1].lower(), sys.argv[2]

    # Map each supported action name to the function implementing it.
    handlers = {"backup": backup, "restore": restore}
    handler = handlers.get(action)
    if handler is None:
        print(f"Unknown action: {action}. Use 'backup' or 'restore'.")
        sys.exit(1)

    handler(data_dir)