"""Backup Aether data dir to HF Dataset as latest-backup.tar.gz.

Tar-gzips the AETHER_BACKUP_DIR directory (archived under its own base name)
into a temp file and uploads it to the Dataset repo, replacing
latest-backup.tar.gz. A timestamped copy is also kept under history/; only
the most recent AETHER_BACKUP_KEEP_COUNT copies (default 30) are retained and
older ones are pruned.
"""
| import os |
| import sys |
| import tarfile |
| import tempfile |
| from datetime import datetime, timezone |
| from pathlib import Path |
|
|
| from huggingface_hub import HfApi |
| from huggingface_hub.utils import RepositoryNotFoundError |
|
|
# --- Configuration (all taken from the environment) -------------------------
# HF_TOKEN and AETHER_BACKUP_REPO are mandatory: a missing one raises KeyError
# immediately so a misconfigured job fails loudly instead of uploading nowhere.
token = os.environ["HF_TOKEN"]
repo_id = os.environ["AETHER_BACKUP_REPO"]
src_dir = Path(os.environ.get("AETHER_BACKUP_DIR", "/opt/aether/data"))
keep_history = int(os.environ.get("AETHER_BACKUP_KEEP_COUNT", "30"))
filename_latest = os.environ.get("AETHER_BACKUP_LATEST_NAME", "latest-backup.tar.gz")


# --- Source sanity checks ---------------------------------------------------
# A path that exists but is not a directory is a configuration error: calling
# iterdir() on it would raise NotADirectoryError, so report it explicitly and
# exit non-zero rather than dying with a traceback.
if src_dir.exists() and not src_dir.is_dir():
    print(f"aether-backup: src {src_dir} is not a directory", file=sys.stderr)
    sys.exit(1)

# Nothing to back up (directory missing or empty) is not an error: exit 0 so
# schedulers do not flag the run as failed.
if not src_dir.exists() or not any(src_dir.iterdir()):
    print(f"aether-backup: src {src_dir} empty, skip", file=sys.stderr)
    sys.exit(0)
|
|
# --- Build and upload the archive -------------------------------------------
api = HfApi(token=token)
# UTC stamp in a fixed-width format: lexicographic order of the history file
# names equals chronological order, which the pruning step below relies on.
ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
tmpdir = Path(tempfile.mkdtemp(prefix="aether-backup-"))
archive_path = tmpdir / f"aether-backup-{ts}.tar.gz"
hist_path = f"history/aether-backup-{ts}.tar.gz"

try:
    # The directory is stored under its own base name inside the archive.
    with tarfile.open(archive_path, "w:gz") as tar:
        tar.add(src_dir, arcname=src_dir.name)

    print(f"aether-backup: created archive {archive_path} ({archive_path.stat().st_size} bytes)", file=sys.stderr)

    # Overwrite the rolling "latest" file first, then add the timestamped copy.
    api.upload_file(
        path_or_fileobj=str(archive_path),
        path_in_repo=filename_latest,
        repo_id=repo_id,
        repo_type="dataset",
        commit_message=f"backup {ts}",
    )
    api.upload_file(
        path_or_fileobj=str(archive_path),
        path_in_repo=hist_path,
        repo_id=repo_id,
        repo_type="dataset",
        commit_message=f"backup history {ts}",
    )
    print(f"aether-backup: uploaded {filename_latest} and {hist_path}", file=sys.stderr)
finally:
    # Always remove the temp archive, including when archiving or uploading
    # raised; otherwise every failed run would leave a full copy of the data
    # behind in the temp directory. Cleanup itself stays best-effort.
    try:
        archive_path.unlink(missing_ok=True)
        tmpdir.rmdir()
    except OSError as exc:
        print(f"aether-backup: temp cleanup of {tmpdir} failed: {exc}", file=sys.stderr)


# --- Prune old history entries ----------------------------------------------
# A non-positive keep count disables pruning. Slicing with `[:-keep_history]`
# would select nothing for 0 and would delete the oldest |n| backups on every
# run for a negative value, so the count is handled explicitly instead.
if keep_history > 0:
    try:
        files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    except RepositoryNotFoundError as exc:
        print(f"aether-backup: prune skipped, repo {repo_id} not found: {exc}", file=sys.stderr)
        files = []

    history_files = sorted(f for f in files if f.startswith("history/aether-backup-"))
    excess = len(history_files) - keep_history
    # Oldest entries come first after sorting; drop only the surplus ones.
    for old in history_files[:max(excess, 0)]:
        try:
            api.delete_file(
                path_in_repo=old,
                repo_id=repo_id,
                repo_type="dataset",
                commit_message=f"prune {old}",
            )
        except Exception as exc:
            # Pruning is best-effort: one failed delete must not stop the rest.
            print(f"aether-backup: prune {old} failed: {exc}", file=sys.stderr)
        else:
            print(f"aether-backup: pruned {old}", file=sys.stderr)
else:
    print(f"aether-backup: keep count {keep_history} <= 0, pruning disabled", file=sys.stderr)
|
|