| import argparse |
| import os |
| import subprocess |
| import time |
| import shutil |
| from huggingface_hub import snapshot_download, HfApi |
|
|
|
|
| |
| |
| |
| |
# Patterns excluded from every sync, even when the optional excludes are
# switched off with SYNC_DISABLE_EXCLUDES=1.
FORCED_EXCLUDES = [".cache"]

# Optional rsync exclude patterns; applied unless SYNC_DISABLE_EXCLUDES=1.
DEFAULT_EXCLUDES = [
    "node_modules",
    "__pycache__",
    ".local/share/Trash",
]
|
|
|
|
def run(cmd):
    """Execute *cmd* and wait for it to finish.

    Args:
        cmd: Command handed straight to ``subprocess`` (callers in this
            file pass an argument list).

    Raises:
        subprocess.CalledProcessError: If the command exits with a
            non-zero status.
    """
    subprocess.run(cmd, check=True)
|
|
|
|
def capture(cmd):
    """Execute *cmd* and return its output as text.

    stderr is merged into stdout, so the returned string contains both
    streams.

    Args:
        cmd: Command handed straight to ``subprocess``.

    Returns:
        The decoded combined stdout/stderr of the command.

    Raises:
        subprocess.CalledProcessError: If the command exits with a
            non-zero status; the captured text is available on the
            exception's ``output`` attribute.
    """
    completed = subprocess.run(
        cmd,
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    return completed.stdout
|
|
|
|
def parse_excludes():
    """Build the ordered, de-duplicated list of rsync exclude patterns.

    Patterns are collected in this order:
      - DEFAULT_EXCLUDES (optional)
      - SYNC_EXCLUDES env var: comma-separated patterns (optional)
      - FORCED_EXCLUDES: always enforced (currently ".cache")

    Setting SYNC_DISABLE_EXCLUDES=1 skips the optional sources, but
    FORCED_EXCLUDES is still applied because the Hub rejects ".cache".

    Returns:
        A list of pattern strings; when a pattern appears more than once
        only its first occurrence is kept.
    """
    optional_enabled = os.environ.get("SYNC_DISABLE_EXCLUDES") != "1"
    user_patterns = os.environ.get("SYNC_EXCLUDES", "").strip()

    patterns = []
    if optional_enabled:
        patterns += DEFAULT_EXCLUDES
        # An empty SYNC_EXCLUDES splits into [""], which the filter drops.
        stripped = (piece.strip() for piece in user_patterns.split(","))
        patterns += [piece for piece in stripped if piece]

    # Forced patterns go last and are added unconditionally.
    patterns += FORCED_EXCLUDES

    # dict.fromkeys keeps first-seen order while discarding repeats.
    return list(dict.fromkeys(patterns))
|
|
|
|
def rsync(src: str, dst: str, delete: bool):
    """Mirror the contents of *src* into *dst* with ``rsync -a``.

    Args:
        src: Source directory.
        dst: Destination directory.
        delete: When true, pass ``--delete`` so files missing from *src*
            are removed from *dst*.

    Raises:
        subprocess.CalledProcessError: If rsync exits with a non-zero
            status (propagated from ``run``).
    """
    exclude_args = [
        arg
        for pattern in parse_excludes()
        for arg in ("--exclude", pattern)
    ]
    delete_args = ["--delete"] if delete else []

    # Normalise both paths to end in exactly one "/": a trailing slash on
    # the source makes rsync copy the directory's contents, not the
    # directory itself.
    source_dir = f"{src.rstrip('/')}/"
    target_dir = f"{dst.rstrip('/')}/"

    run(["rsync", "-a", *delete_args, *exclude_args, source_dir, target_dir])
|
|
|
|
def rsync_has_changes(src: str, dst: str, delete: bool) -> bool:
    """Report whether syncing *src* into *dst* would change anything.

    Runs rsync with ``--dry-run --itemize-changes`` using the same
    excludes as a real sync; callers use the answer to skip empty commits.

    Args:
        src: Source directory.
        dst: Destination directory.
        delete: When true, include ``--delete`` in the dry run.

    Returns:
        True if the dry run printed at least one non-blank line, or if
        the dry run itself failed; False otherwise.
    """
    probe = ["rsync", "-a", "--dry-run", "--itemize-changes"]
    if delete:
        probe.append("--delete")
    for pattern in parse_excludes():
        probe.extend(("--exclude", pattern))
    probe.append(src.rstrip("/") + "/")
    probe.append(dst.rstrip("/") + "/")

    try:
        report = capture(probe)
    except subprocess.CalledProcessError:
        # The dry run failed: answer "changed" so the caller goes ahead
        # with the real sync instead of silently skipping it.
        return True

    # Any non-blank output line counts as a pending change.
    for line in report.splitlines():
        if line.strip():
            return True
    return False
|
|
|
|
def pull(repo: str, dst: str):
    """Download a snapshot of the dataset repo *repo* into *dst*.

    Args:
        repo: Hub repo id of the dataset.
        dst: Local directory to download into; created if missing.

    The access token is read from the HF_TOKEN environment variable
    (``None`` is passed when it is unset).
    """
    os.makedirs(dst, exist_ok=True)

    download_options = {
        "repo_id": repo,
        "repo_type": "dataset",
        "local_dir": dst,
        # NOTE(review): this flag is reported as deprecated in recent
        # huggingface_hub releases — confirm against the pinned version.
        "local_dir_use_symlinks": False,
        "token": os.environ.get("HF_TOKEN"),
    }
    snapshot_download(**download_options)
|
|
|
|
def rsync_in(src: str, dst: str):
    """Copy the downloaded dataset tree into the home directory.

    Direction: dataset -> home. Deletion is deliberately off so that
    directories pre-installed in the image (such as .npm-global) are not
    wiped when they are absent from the dataset.

    Args:
        src: Directory holding the dataset copy of home.
        dst: Home directory to populate.
    """
    rsync(src=src, dst=dst, delete=False)
|
|
|
|
def rsync_out(home: str, persist_home: str):
    """Mirror the current home directory into the dataset snapshot folder.

    Direction: home -> dataset snapshot folder. Deletion is on so the
    snapshot stays consistent with the current home. ".cache" is always
    excluded by the shared exclude list because the Hub rejects it.

    Args:
        home: Home directory to read from.
        persist_home: Snapshot folder that receives the copy.
    """
    rsync(src=home, dst=persist_home, delete=True)
|
|
|
|
def sanitize_forbidden(persist: str):
    """Delete forbidden folders from the persist tree before an upload.

    Currently this removes only ``<persist>/home/.cache``.

    Args:
        persist: Root of the local folder that mirrors the dataset repo.
    """
    home_dir = os.path.join(persist, "home")
    cache_dir = os.path.join(home_dir, ".cache")
    # ignore_errors=True: a missing folder, or a failure while deleting,
    # is silently tolerated.
    shutil.rmtree(cache_dir, ignore_errors=True)
|
|
|
|
def push_repo(repo: str, persist: str):
    """Upload the persist folder to the dataset repo as one commit.

    Forbidden folders are removed from disk first, and the same paths are
    also listed in ``ignore_patterns`` for the upload.

    Args:
        repo: Hub repo id of the dataset.
        persist: Local folder whose contents are uploaded to the repo root.

    The access token is read from the HF_TOKEN environment variable.
    """
    sanitize_forbidden(persist)

    hub = HfApi(token=os.environ.get("HF_TOKEN"))

    timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
    skipped_paths = ["home/.cache/**", ".cache/**"]

    hub.upload_folder(
        repo_id=repo,
        repo_type="dataset",
        folder_path=persist,
        path_in_repo="",
        commit_message=f"sync home: {timestamp}",
        ignore_patterns=skipped_paths,
    )
|
|
|
|
def push(repo: str, home: str, persist: str):
    """Sync home into ``<persist>/home`` and upload persist to the Hub.

    A dry-run check runs first; when it reports no differences, nothing
    is copied or uploaded and a notice is printed instead.

    Args:
        repo: Hub repo id of the dataset.
        home: Home directory to back up.
        persist: Local folder that mirrors the dataset repo.
    """
    staging_home = os.path.join(persist, "home")
    os.makedirs(staging_home, exist_ok=True)

    if rsync_has_changes(home, staging_home, delete=True):
        rsync_out(home, staging_home)
        push_repo(repo, persist)
    else:
        print("No files have been modified since last commit. Skipping to prevent empty commit.")
|
|
|
|
def daemon(repo: str, home: str, persist: str, interval: int):
    """Call ``push`` forever, pausing *interval* seconds between attempts.

    A failing push is reported on stdout and does not stop the loop.
    This function never returns.

    Args:
        repo: Hub repo id of the dataset.
        home: Home directory to back up.
        persist: Local folder that mirrors the dataset repo.
        interval: Seconds to sleep after each attempt.
    """
    while True:
        try:
            push(repo, home, persist)
            # Printed whenever push() returns without raising.
            status = f"[sync] pushed OK. next in {interval}s"
            print(status)
        except Exception as err:
            # Loop boundary: report the failure and retry next cycle.
            print(f"[sync] push failed: {err}")
        time.sleep(interval)
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: one sub-command per operation.
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    pull_cmd = commands.add_parser("pull")
    for flag in ("--repo", "--dst"):
        pull_cmd.add_argument(flag, required=True)

    rsync_in_cmd = commands.add_parser("rsync_in")
    for flag in ("--src", "--dst"):
        rsync_in_cmd.add_argument(flag, required=True)

    # "push" and "daemon" take the same three required options.
    push_cmd = commands.add_parser("push")
    daemon_cmd = commands.add_parser("daemon")
    for sub_cmd in (push_cmd, daemon_cmd):
        for flag in ("--repo", "--home", "--persist"):
            sub_cmd.add_argument(flag, required=True)
    daemon_cmd.add_argument("--interval", type=int, default=300)

    opts = parser.parse_args()

    # The sub-parser is required, so opts.cmd is always one of these keys.
    handlers = {
        "pull": lambda: pull(opts.repo, opts.dst),
        "rsync_in": lambda: rsync_in(opts.src, opts.dst),
        "push": lambda: push(opts.repo, opts.home, opts.persist),
        "daemon": lambda: daemon(opts.repo, opts.home, opts.persist, opts.interval),
    }
    handlers[opts.cmd]()