"""Persist a home directory to a Hugging Face dataset repo.

Subcommands:
    pull      Download the dataset repo snapshot into a local folder.
    rsync_in  Copy a folder (e.g. the pulled snapshot's home) into home.
    push      rsync home into <persist>/home, then upload <persist> to the Hub.
    daemon    Run ``push`` forever, sleeping ``--interval`` seconds between runs.

Environment variables read by this module:
    HF_TOKEN               Token passed to the Hugging Face Hub client.
    SYNC_EXCLUDES          Comma-separated extra rsync exclude patterns.
    SYNC_DISABLE_EXCLUDES  Set to "1" to drop DEFAULT_EXCLUDES.
"""
import argparse
import os
import shutil
import subprocess
import time

from huggingface_hub import HfApi, snapshot_download

# Hugging Face Hub commit validation forbids pushing files under certain
# folder names, including ".cache". If we try to upload home/.cache/** we get:
#   "Invalid path_in_repo ... cannot update files under a '.cache/' folder".
# This is enforced by server-side / client-side validation (FORBIDDEN_FOLDERS).
# Reference:
# https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/_commit_api.py
FORCED_EXCLUDES = [".cache"]

# Optional default excludes to keep the repo size reasonable.
# NOTE: Do NOT exclude code-server extensions/User if you want them persisted.
DEFAULT_EXCLUDES = [
    # huge and usually not worth versioning
    "node_modules",
    "__pycache__",
    ".local/share/Trash",
    # Optional caches (left disabled so they are persisted; uncomment to
    # exclude them):
    # ".npm/_cacache",  # many users exclude this; you may keep it if you want
    # ".local/share/code-server/Cache",
    # ".local/share/code-server/CachedData",
    # ".local/share/code-server/GPUCache",
    # ".local/share/code-server/logs",
]

# Persist folders whose local snapshot may be ahead of the Hub because an
# earlier sync/upload did not complete. In-process only (not saved to disk).
_PENDING_UPLOADS = set()


def run(cmd):
    """Run *cmd* (an argv list); raise CalledProcessError on non-zero exit."""
    subprocess.check_call(cmd)


def capture(cmd):
    """Run *cmd* and return its combined stdout+stderr as text.

    Raises subprocess.CalledProcessError on non-zero exit.
    """
    return subprocess.check_output(cmd, text=True, stderr=subprocess.STDOUT)


def parse_excludes():
    """Return the ordered, de-duplicated list of rsync exclude patterns.

    Patterns come from, in this order:
      - DEFAULT_EXCLUDES (optional; skipped when SYNC_DISABLE_EXCLUDES=1)
      - SYNC_EXCLUDES env var: comma-separated patterns (always applied,
        even when SYNC_DISABLE_EXCLUDES=1)
      - FORCED_EXCLUDES: always enforced (currently ".cache"), because the
        Hub rejects ".cache" paths. Reference:
        https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/_commit_api.py
    """
    disable = os.environ.get("SYNC_DISABLE_EXCLUDES") == "1"
    extra_raw = os.environ.get("SYNC_EXCLUDES", "").strip()

    excludes = []
    if not disable:
        excludes.extend(DEFAULT_EXCLUDES)
    # Blank entries ("a,,b" or trailing commas) are dropped.
    excludes.extend(p.strip() for p in extra_raw.split(",") if p.strip())
    # Always enforce the forbidden-folder excludes.
    excludes.extend(FORCED_EXCLUDES)

    # dict keys keep first-seen order, so this de-dups while preserving order.
    return list(dict.fromkeys(excludes))


def _rsync_cmd(src: str, dst: str, delete: bool, dry_run: bool = False):
    """Build the rsync argv shared by the real sync and the dry-run check."""
    cmd = ["rsync", "-a"]
    if dry_run:
        cmd += ["--dry-run", "--itemize-changes"]
    if delete:
        cmd.append("--delete")
    for pat in parse_excludes():
        cmd += ["--exclude", pat]
    # Trailing slashes make rsync copy the *contents* of src into dst.
    cmd += [src.rstrip("/") + "/", dst.rstrip("/") + "/"]
    return cmd


def rsync(src: str, dst: str, delete: bool):
    """Mirror the contents of *src* into *dst* with ``rsync -a``.

    Args:
        src: Source directory.
        dst: Destination directory.
        delete: When true, pass ``--delete`` to rsync.

    Raises:
        subprocess.CalledProcessError: rsync exited with a non-zero status.
    """
    run(_rsync_cmd(src, dst, delete))


def rsync_has_changes(src: str, dst: str, delete: bool) -> bool:
    """Detect whether an rsync would change anything (to skip empty commits).

    Runs the same command as :func:`rsync` with ``--dry-run
    --itemize-changes`` and returns True if it prints any non-blank line.
    If the dry-run itself exits non-zero, returns True (conservative).
    """
    try:
        out = capture(_rsync_cmd(src, dst, delete, dry_run=True))
    except subprocess.CalledProcessError:
        # If the dry-run fails, be conservative and say "has changes".
        return True
    # Any non-blank output line counts as a change; ignore empty output.
    return any(line.strip() for line in out.splitlines())


def pull(repo: str, dst: str):
    """Download the dataset repo snapshot for *repo* into *dst*.

    *dst* is created if it does not exist. The token is read from HF_TOKEN.
    """
    os.makedirs(dst, exist_ok=True)
    # snapshot_download uses a local cache; its location is controlled by the
    # HF_HOME / HF_HUB_CACHE env vars. Reference:
    # https://huggingface.co/docs/huggingface_hub/guides/manage-cache
    snapshot_download(
        repo_id=repo,
        repo_type="dataset",
        local_dir=dst,
        local_dir_use_symlinks=False,  # kept for compatibility with older versions; ignored in newer versions
        token=os.environ.get("HF_TOKEN"),
    )


def rsync_in(src: str, dst: str):
    """Sync dataset -> home.

    Does NOT delete by default, to avoid wiping image-preinstalled dirs such
    as .npm-global.
    """
    rsync(src, dst, delete=False)


def rsync_out(home: str, persist_home: str):
    """Sync home -> dataset snapshot folder.

    Uses delete=True to keep dataset/home consistent with the current home,
    but ".cache" is always excluded (the Hub rejects it). Reference:
    https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/_commit_api.py
    """
    rsync(home, persist_home, delete=True)


def sanitize_forbidden(persist: str):
    """Remove forbidden folders from persist/home before upload.

    Currently removes ``<persist>/home/.cache``. Errors (including the folder
    not existing) are ignored.
    """
    forbidden_path = os.path.join(persist, "home", ".cache")
    shutil.rmtree(forbidden_path, ignore_errors=True)


def push_repo(repo: str, persist: str):
    """Upload the *persist* folder back to the dataset repo *repo*.

    The token is read from HF_TOKEN. Exceptions raised by the Hub client
    propagate to the caller.
    """
    sanitize_forbidden(persist)
    api = HfApi(token=os.environ.get("HF_TOKEN"))
    # ignore_patterns provides another safety layer so that even if something
    # slipped in, it won't be included in the commit operation.
    api.upload_folder(
        repo_id=repo,
        repo_type="dataset",
        folder_path=persist,
        path_in_repo="",
        commit_message=f"sync home: {time.strftime('%Y-%m-%d %H:%M:%S')}",
        ignore_patterns=[
            "home/.cache/**",
            ".cache/**",
        ],
    )


def push(repo: str, home: str, persist: str):
    """Sync home -> persist/home via rsync, then upload persist to the Hub.

    The upload is skipped when the dry-run reports no changes, unless an
    earlier push for the same *persist* folder failed in this process: in
    that case persist/home may already match home while the Hub is still
    behind, so the upload is retried regardless of the dry-run result.

    Raises:
        Whatever rsync (subprocess.CalledProcessError) or the Hub upload
        raises; the folder stays marked as pending so the next call retries.
    """
    persist_home = os.path.join(persist, "home")
    os.makedirs(persist_home, exist_ok=True)
    pending_key = os.path.abspath(persist)

    # If nothing changed and nothing is waiting to be uploaded, skip the
    # commit to avoid empty commits.
    changed = rsync_has_changes(home, persist_home, delete=True)
    if not changed and pending_key not in _PENDING_UPLOADS:
        print("No files have been modified since last commit. Skipping to prevent empty commit.")
        return

    # Mark before touching the snapshot: if either step below raises, the
    # local snapshot may be ahead of the Hub and must be uploaded next time.
    _PENDING_UPLOADS.add(pending_key)
    rsync_out(home, persist_home)
    push_repo(repo, persist)
    _PENDING_UPLOADS.discard(pending_key)


def daemon(repo: str, home: str, persist: str, interval: int):
    """Call :func:`push` forever, sleeping *interval* seconds between runs.

    Any ``Exception`` from a push is printed and the loop continues, so a
    transient failure does not stop the daemon.
    """
    while True:
        try:
            push(repo, home, persist)
            print(f"[sync] pushed OK. next in {interval}s")
        except Exception as e:
            print(f"[sync] push failed: {e}")
        time.sleep(interval)


def _build_parser():
    """Build the argparse parser for the pull/rsync_in/push/daemon commands."""
    ap = argparse.ArgumentParser()
    sub = ap.add_subparsers(dest="cmd", required=True)

    p_pull = sub.add_parser("pull")
    p_pull.add_argument("--repo", required=True)
    p_pull.add_argument("--dst", required=True)

    p_in = sub.add_parser("rsync_in")
    p_in.add_argument("--src", required=True)
    p_in.add_argument("--dst", required=True)

    p_push = sub.add_parser("push")
    p_push.add_argument("--repo", required=True)
    p_push.add_argument("--home", required=True)
    p_push.add_argument("--persist", required=True)

    p_daemon = sub.add_parser("daemon")
    p_daemon.add_argument("--repo", required=True)
    p_daemon.add_argument("--home", required=True)
    p_daemon.add_argument("--persist", required=True)
    p_daemon.add_argument("--interval", type=int, default=300)

    return ap


def main(argv=None):
    """Parse *argv* (defaults to sys.argv[1:]) and run the chosen subcommand."""
    args = _build_parser().parse_args(argv)

    if args.cmd == "pull":
        pull(args.repo, args.dst)
    elif args.cmd == "rsync_in":
        rsync_in(args.src, args.dst)
    elif args.cmd == "push":
        push(args.repo, args.home, args.persist)
    elif args.cmd == "daemon":
        daemon(args.repo, args.home, args.persist, args.interval)


if __name__ == "__main__":
    main()