# sync_home.py
# (File-viewer page residue kept for provenance: "Code / sync_home.py",
#  "gallyg's picture", "Upload 11 files", "632b0a7 verified".)
import argparse
import os
import subprocess
import time
import shutil
from huggingface_hub import snapshot_download, HfApi
# Hugging Face Hub commit validation forbids pushing files under certain folder names,
# including ".cache". If we try to upload home/.cache/** we will get:
#   "Invalid path_in_repo ... cannot update files under a '.cache/' folder".
# This is enforced by server-side / client-side validation (FORBIDDEN_FOLDERS); see
# https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/_commit_api.py
#
# Patterns listed here are always passed to rsync as --exclude, regardless of
# the SYNC_DISABLE_EXCLUDES / SYNC_EXCLUDES environment variables.
FORCED_EXCLUDES = [".cache"]
# Optional default excludes to keep repo size reasonable.
# These are skipped when SYNC_DISABLE_EXCLUDES=1 (see parse_excludes).
# NOTE: Do NOT exclude code-server extensions/User if you want them persisted.
DEFAULT_EXCLUDES = [
    # huge and usually not worth versioning
    "node_modules",
    "__pycache__",
    ".local/share/Trash",
    # Optional caches, disabled by default: uncomment an entry to stop
    # persisting that directory.
    # ".npm/_cacache", # many users exclude this; you may keep it if you want
    # ".local/share/code-server/Cache",
    # ".local/share/code-server/CachedData",
    # ".local/share/code-server/GPUCache",
    # ".local/share/code-server/logs",
]
def run(cmd):
    """
    Execute `cmd` (an argv list) and wait for it to finish.
    Raises subprocess.CalledProcessError when the command exits non-zero.
    """
    # check=True gives the same "raise on failure" contract as check_call.
    subprocess.run(cmd, check=True)
def capture(cmd):
    """
    Execute `cmd` (an argv list) and return everything it printed as text.
    stderr is merged into stdout, so the returned string contains both streams.
    Raises subprocess.CalledProcessError when the command exits non-zero.
    """
    completed = subprocess.run(
        cmd,
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    return completed.stdout
def parse_excludes():
    """
    Build the ordered, de-duplicated list of rsync exclude patterns.

    Patterns are collected in this order:
    - DEFAULT_EXCLUDES (optional; skipped when SYNC_DISABLE_EXCLUDES=1)
    - SYNC_EXCLUDES env var: comma-separated patterns, blanks ignored
    - FORCED_EXCLUDES: always enforced (currently ".cache"), because the Hub
      rejects uploads under ".cache" even when the defaults are disabled.
      See https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/_commit_api.py
    """
    candidates = []
    if os.environ.get("SYNC_DISABLE_EXCLUDES") != "1":
        candidates += DEFAULT_EXCLUDES

    # An unset/blank SYNC_EXCLUDES splits into [""], which the filter drops.
    user_patterns = os.environ.get("SYNC_EXCLUDES", "").strip()
    candidates += [piece.strip() for piece in user_patterns.split(",") if piece.strip()]

    # Forbidden folders are appended unconditionally.
    candidates += FORCED_EXCLUDES

    # dict keys keep first-seen order, which removes duplicates without reordering.
    return list(dict.fromkeys(candidates))
def rsync(src: str, dst: str, delete: bool):
    """
    Mirror the contents of `src` into `dst` with `rsync -a`.

    `delete` adds --delete to the rsync command line.
    Every pattern from parse_excludes() is passed as an --exclude option.
    """
    flags = ["-a", "--delete"] if delete else ["-a"]
    exclude_args = [arg for pattern in parse_excludes() for arg in ("--exclude", pattern)]
    # Both paths are normalized to end in exactly one "/" so rsync copies the
    # directory's contents rather than the directory itself.
    source_dir = src.rstrip("/") + "/"
    target_dir = dst.rstrip("/") + "/"
    run(["rsync", *flags, *exclude_args, source_dir, target_dir])
def rsync_has_changes(src: str, dst: str, delete: bool) -> bool:
    """
    Report whether running rsync from `src` to `dst` would change anything
    (used to skip empty commits).

    Runs rsync with --dry-run --itemize-changes and the same delete/exclude
    options as a real sync. Returns True when the dry run prints any
    non-blank output, and also when the dry run itself fails.
    """
    flags = ["-a", "--dry-run", "--itemize-changes"]
    if delete:
        flags.append("--delete")
    exclude_args = [arg for pattern in parse_excludes() for arg in ("--exclude", pattern)]
    command = ["rsync", *flags, *exclude_args, src.rstrip("/") + "/", dst.rstrip("/") + "/"]

    try:
        report = capture(command)
    except subprocess.CalledProcessError:
        # if dry-run fails, be conservative and say "has changes"
        return True

    # rsync prints one line per changed item; whitespace-only output means no changes
    return bool(report.strip())
def pull(repo: str, dst: str):
    """
    Download a snapshot of the dataset repo `repo` into the directory `dst`.

    `dst` is created if it does not exist. The access token is read from the
    HF_TOKEN environment variable (None when unset).
    """
    os.makedirs(dst, exist_ok=True)
    # snapshot_download uses a local cache; its location is controlled by the
    # HF_HOME/HF_HUB_CACHE env vars.
    # See https://huggingface.co/docs/huggingface_hub/guides/manage-cache
    download_options = {
        "repo_id": repo,
        "repo_type": "dataset",
        "local_dir": dst,
        # kept for compatibility with older versions; ignored in newer versions
        "local_dir_use_symlinks": False,
        "token": os.environ.get("HF_TOKEN"),
    }
    snapshot_download(**download_options)
def rsync_in(src: str, dst: str):
    """
    Restore direction: dataset snapshot (`src`) -> home (`dst`).

    Runs without --delete so that files present only in home (for example
    image-preinstalled dirs such as .npm-global) are not wiped.
    """
    rsync(src=src, dst=dst, delete=False)
def rsync_out(home: str, persist_home: str):
    """
    Backup direction: home -> dataset snapshot folder (`persist_home`).

    Runs with --delete so the snapshot folder stays consistent with the
    current home. ".cache" is still excluded via rsync()'s exclude list,
    because the Hub rejects it.
    See https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/_commit_api.py
    """
    rsync(src=home, dst=persist_home, delete=True)
def sanitize_forbidden(persist: str):
    """
    Delete folders the Hub refuses to accept from the upload tree.

    Currently this is only <persist>/home/.cache. A missing folder, or any
    error while deleting, is silently ignored.
    """
    cache_dir = os.path.join(persist, "home", ".cache")
    shutil.rmtree(cache_dir, ignore_errors=True)
def push_repo(repo: str, persist: str):
    """
    Upload the whole `persist` folder to the root of the dataset repo `repo`.

    Forbidden folders are removed from `persist` first. The access token is
    read from the HF_TOKEN environment variable, and the commit message
    carries the local time of the upload.
    """
    sanitize_forbidden(persist)

    hub = HfApi(token=os.environ.get("HF_TOKEN"))
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S')

    # ignore_patterns provides another safety layer so that even if something
    # slipped in, it won't be included in the commit operation.
    skipped = [
        "home/.cache/**",
        ".cache/**",
    ]

    hub.upload_folder(
        repo_id=repo,
        repo_type="dataset",
        folder_path=persist,
        path_in_repo="",
        commit_message=f"sync home: {timestamp}",
        ignore_patterns=skipped,
    )
# (repo, absolute persist path) pairs whose latest sync attempt has not finished
# successfully. Once rsync_out has run, persist/home already mirrors home, so the
# dry-run comparison alone cannot tell that the Hub is still missing the data;
# this set is what forces a retry after a failed upload (within this process).
_PENDING_UPLOADS = set()


def push(repo: str, home: str, persist: str):
    """
    Sync home -> persist/home via rsync, then upload persist to the Hub.

    The upload is skipped when the rsync dry run reports no differences,
    unless a previous attempt for the same repo/persist pair failed in this
    process; in that case the sync and upload are retried even though the
    local snapshot already looks up to date.

    Exceptions from rsync or the upload propagate to the caller; the target
    stays marked as pending so the next call retries it.
    """
    persist_home = os.path.join(persist, "home")
    os.makedirs(persist_home, exist_ok=True)

    target = (repo, os.path.abspath(persist))
    retry_pending = target in _PENDING_UPLOADS

    # If nothing changed (and nothing is waiting to be retried), skip the
    # commit to avoid empty commits.
    if not retry_pending and not rsync_has_changes(home, persist_home, delete=True):
        print("No files have been modified since last commit. Skipping to prevent empty commit.")
        return

    # Mark before touching the snapshot: if rsync_out or push_repo raises, the
    # mark survives and the next call retries instead of reporting "no changes".
    _PENDING_UPLOADS.add(target)
    rsync_out(home, persist_home)
    push_repo(repo, persist)
    _PENDING_UPLOADS.discard(target)
def daemon(repo: str, home: str, persist: str, interval: int):
    """
    Run push() forever, sleeping `interval` seconds between attempts.

    Any Exception raised by a push is printed and the loop keeps going.
    The "pushed OK" line is printed whenever push() returns without raising,
    which includes the case where push() skipped the upload.
    """

    def sync_once():
        # One attempt; failures are reported but never propagate to the loop.
        try:
            push(repo, home, persist)
            print(f"[sync] pushed OK. next in {interval}s")
        except Exception as err:
            print(f"[sync] push failed: {err}")

    while True:
        sync_once()
        time.sleep(interval)
if __name__ == "__main__":
    # Command-line entry point. Sub-commands:
    #   pull      --repo --dst                         download the dataset snapshot
    #   rsync_in  --src --dst                          copy snapshot contents into home
    #   push      --repo --home --persist              one-shot sync + upload
    #   daemon    --repo --home --persist [--interval] periodic sync + upload
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    def add_command(name, *required_flags):
        # Register a sub-command whose listed flags are all mandatory.
        command = commands.add_parser(name)
        for flag in required_flags:
            command.add_argument(flag, required=True)
        return command

    add_command("pull", "--repo", "--dst")
    add_command("rsync_in", "--src", "--dst")
    add_command("push", "--repo", "--home", "--persist")
    daemon_command = add_command("daemon", "--repo", "--home", "--persist")
    daemon_command.add_argument("--interval", type=int, default=300)

    args = parser.parse_args()

    # argparse only accepts the four registered names, so the lookup always hits.
    handlers = {
        "pull": lambda: pull(args.repo, args.dst),
        "rsync_in": lambda: rsync_in(args.src, args.dst),
        "push": lambda: push(args.repo, args.home, args.persist),
        "daemon": lambda: daemon(args.repo, args.home, args.persist, args.interval),
    }
    handlers[args.cmd]()