File size: 7,305 Bytes
632b0a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
import argparse
import os
import subprocess
import time
import shutil
from huggingface_hub import snapshot_download, HfApi


# The Hugging Face Hub refuses commits that touch files under certain folder names,
# ".cache" among them. Trying to upload home/.cache/** fails with:
#   "Invalid path_in_repo ... cannot update files under a '.cache/' folder"
# The check is part of huggingface_hub's commit validation (FORBIDDEN_FOLDERS), see
# https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/_commit_api.py
# These patterns are therefore always excluded, whatever the environment says.
FORCED_EXCLUDES = [".cache"]

# Optional default excludes to keep repo size reasonable.
# They are skipped when SYNC_DISABLE_EXCLUDES=1 (see parse_excludes).
# NOTE: Do NOT exclude code-server extensions/User if you want them persisted.
DEFAULT_EXCLUDES = [
    # huge and usually not worth versioning
    "node_modules",
    "__pycache__",
    ".local/share/Trash",

    # Optional caches: currently synced. Uncomment an entry to stop syncing it.
    # ".npm/_cacache",  # many users exclude this; you may keep it if you want
    # ".local/share/code-server/Cache",
    # ".local/share/code-server/CachedData",
    # ".local/share/code-server/GPUCache",
    # ".local/share/code-server/logs",
]


def run(cmd):
    """
    Execute ``cmd`` and wait for it to finish; output is not captured.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero status.
    """
    subprocess.run(cmd, check=True)


def capture(cmd):
    """
    Execute ``cmd`` and return everything it printed as text.

    stderr is folded into stdout, so the returned string contains both streams.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero status.
    """
    completed = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        check=True,
    )
    return completed.stdout


def parse_excludes():
    """
    Build the ordered list of rsync exclude patterns.

    Patterns are collected in this order:
      - DEFAULT_EXCLUDES (optional)
      - SYNC_EXCLUDES env var: comma-separated patterns, blanks dropped
      - FORCED_EXCLUDES: always appended

    With SYNC_DISABLE_EXCLUDES=1 the first two sources are skipped; FORCED_EXCLUDES
    still apply. Duplicates are removed, keeping the first occurrence.
    """
    patterns = []

    if os.environ.get("SYNC_DISABLE_EXCLUDES") != "1":
        patterns += DEFAULT_EXCLUDES
        user_patterns = os.environ.get("SYNC_EXCLUDES", "")
        patterns += [item.strip() for item in user_patterns.split(",") if item.strip()]

    # Forbidden folders are excluded no matter what the environment says.
    patterns += FORCED_EXCLUDES

    # dict keys keep insertion order, which gives an order-preserving de-dup.
    return list(dict.fromkeys(patterns))


def rsync(src: str, dst: str, delete: bool):
    """
    Copy the contents of ``src`` into ``dst`` with ``rsync -a``.

    Both paths are normalized to end in exactly one "/" so rsync works on the
    contents of the directories. Every pattern returned by parse_excludes() is
    passed as ``--exclude``.

    Args:
        src: Source directory; must be non-empty.
        dst: Destination directory; must be non-empty.
        delete: When True, ``--delete`` is added to the rsync command.

    Raises:
        ValueError: If ``src`` or ``dst`` is empty.
        subprocess.CalledProcessError: If rsync exits with a non-zero status.
    """
    # "".rstrip("/") + "/" evaluates to "/", so an empty path (for example an unset
    # shell variable on the command line) would silently become the filesystem
    # root. Refuse it instead of syncing "/".
    if not src or not dst:
        raise ValueError(
            f"rsync: src and dst must be non-empty paths (src={src!r}, dst={dst!r})"
        )

    cmd = ["rsync", "-a"]

    if delete:
        cmd.append("--delete")

    for pat in parse_excludes():
        cmd += ["--exclude", pat]

    cmd += [src.rstrip("/") + "/", dst.rstrip("/") + "/"]
    run(cmd)


def rsync_has_changes(src: str, dst: str, delete: bool) -> bool:
    """
    Detect whether an rsync would change anything (to skip empty commits).

    Runs rsync with ``--dry-run --itemize-changes`` using the same excludes and
    trailing-slash normalization as rsync(), and reports a change as soon as the
    command prints any non-blank line.

    Args:
        src: Source directory; must be non-empty.
        dst: Destination directory; must be non-empty.
        delete: When True, ``--delete`` is added to the dry run.

    Returns:
        True if the dry run printed anything, or if the dry run itself exited
        with a non-zero status; False if it printed nothing.

    Raises:
        ValueError: If ``src`` or ``dst`` is empty.
    """
    # "".rstrip("/") + "/" evaluates to "/"; never compare against the filesystem
    # root just because a path argument was empty.
    if not src or not dst:
        raise ValueError(
            f"rsync_has_changes: src and dst must be non-empty paths "
            f"(src={src!r}, dst={dst!r})"
        )

    cmd = ["rsync", "-a", "--dry-run", "--itemize-changes"]
    if delete:
        cmd.append("--delete")
    for pat in parse_excludes():
        cmd += ["--exclude", pat]
    cmd += [src.rstrip("/") + "/", dst.rstrip("/") + "/"]

    try:
        out = capture(cmd)
    except subprocess.CalledProcessError:
        # if dry-run fails, be conservative and say "has changes"
        return True

    # rsync prints one line per changed item; ignore empty output
    return any(line.strip() for line in out.splitlines())


def pull(repo: str, dst: str):
    """
    Fetch a snapshot of the dataset repo ``repo`` into the directory ``dst``.

    ``dst`` is created first if it does not exist. The access token is read from
    the HF_TOKEN environment variable (None when unset).
    """
    os.makedirs(dst, exist_ok=True)

    download_options = dict(
        repo_id=repo,
        repo_type="dataset",
        local_dir=dst,
        # kept for compatibility with older versions; ignored in newer versions
        local_dir_use_symlinks=False,
        token=os.environ.get("HF_TOKEN"),
    )

    # snapshot_download uses a local cache; per the huggingface_hub cache guide its
    # location is controlled by the HF_HOME / HF_HUB_CACHE env vars:
    # https://huggingface.co/docs/huggingface_hub/guides/manage-cache
    snapshot_download(**download_options)


def rsync_in(src: str, dst: str):
    """
    Restore direction: dataset snapshot -> home.

    Runs without --delete so that directories already present in the
    destination (e.g. image-preinstalled ones such as .npm-global) are not wiped.
    """
    rsync(src=src, dst=dst, delete=False)


def rsync_out(home: str, persist_home: str):
    """
    Backup direction: home -> dataset snapshot folder.

    Runs with --delete so the snapshot folder tracks the current home. The
    excludes applied by rsync() still hold, so ".cache" (which the Hub rejects)
    is never copied.
    """
    rsync(src=home, dst=persist_home, delete=True)


def sanitize_forbidden(persist: str):
    """
    Remove forbidden ".cache" entries anywhere under persist/home before upload.

    Not only persist/home/.cache is removed: every directory, file or symlink
    named ".cache" at any depth below persist/home goes too, because a nested
    ".cache" folder is just as much "under a '.cache/' folder" as a top-level one.
    Nothing outside persist/home is touched (in particular persist/.cache stays).

    Removal is best-effort: entries that cannot be deleted are skipped silently,
    and a missing persist/home is not an error.

    Args:
        persist: Root of the local persist folder that will be uploaded.
    """
    home_root = os.path.join(persist, "home")

    # os.walk yields nothing for a missing root, so no existence check is needed.
    for parent, dirnames, filenames in os.walk(home_root):
        if ".cache" not in dirnames and ".cache" not in filenames:
            continue

        target = os.path.join(parent, ".cache")
        if ".cache" in dirnames:
            # Prune in place so the walk does not descend into what is being deleted.
            dirnames.remove(".cache")

        if os.path.isdir(target) and not os.path.islink(target):
            shutil.rmtree(target, ignore_errors=True)
        else:
            # Plain file or symlink: remove the entry itself, never a link target.
            try:
                os.remove(target)
            except OSError:
                pass  # best-effort, same policy as ignore_errors=True above


def push_repo(repo: str, persist: str):
    """
    Upload the persist folder back to the dataset repo.

    The folder is sanitized first (sanitize_forbidden), then uploaded to the root
    of the repo with a timestamped commit message. The access token is read from
    the HF_TOKEN environment variable (None when unset).

    Args:
        repo: Dataset repo id on the Hub.
        persist: Local folder whose contents are uploaded.
    """
    sanitize_forbidden(persist)

    api = HfApi(token=os.environ.get("HF_TOKEN"))

    # ignore_patterns provides another safety layer so that even if something slipped in,
    # it won't be included in the commit operation. The patterns cover ".cache" at the
    # repo root and at any depth, both as a folder and as a bare entry.
    # NOTE(review): assumes upload_folder matches these fnmatch-style against
    # repo-relative paths; confirm against the installed huggingface_hub version.
    api.upload_folder(
        repo_id=repo,
        repo_type="dataset",
        folder_path=persist,
        path_in_repo="",
        commit_message=f"sync home: {time.strftime('%Y-%m-%d %H:%M:%S')}",
        ignore_patterns=[
            "home/.cache/**",
            ".cache/**",
            ".cache",
            "**/.cache",
            "**/.cache/**",
        ],
    )


def push(repo: str, home: str, persist: str):
    """
    home -> persist/home via rsync, then upload persist to Hub.

    The upload is skipped only when the rsync dry run reports no differences AND
    no earlier upload is still outstanding. An outstanding upload is tracked with
    a marker file that is created before syncing and removed after the upload
    returns successfully, so a failed upload is retried on the next call.

    Args:
        repo: Dataset repo id on the Hub.
        home: Home directory to back up.
        persist: Local persist folder; the home copy lives in persist/home.
    """
    persist_home = os.path.join(persist, "home")
    os.makedirs(persist_home, exist_ok=True)

    # Marker meaning "persist/home may hold changes the Hub has not received yet".
    # It sits under persist/.cache, which push_repo excludes from every upload.
    state_dir = os.path.join(persist, ".cache")
    pending_marker = os.path.join(state_dir, "sync-upload-pending")
    upload_pending = os.path.exists(pending_marker)

    # Without the marker a failed upload would be lost: the rsync below has already
    # made persist/home match home, so the next dry run would report "no changes".
    if not upload_pending and not rsync_has_changes(home, persist_home, delete=True):
        print("No files have been modified since last commit. Skipping to prevent empty commit.")
        return

    os.makedirs(state_dir, exist_ok=True)
    with open(pending_marker, "w", encoding="utf-8"):
        pass  # an empty file is enough; only its existence matters

    rsync_out(home, persist_home)
    push_repo(repo, persist)

    # Reached only when the upload did not raise.
    try:
        os.remove(pending_marker)
    except FileNotFoundError:
        pass


def daemon(repo: str, home: str, persist: str, interval: int):
    """
    Call push() in an endless loop, sleeping ``interval`` seconds after each attempt.

    Any Exception raised during an attempt is printed and the loop carries on, so
    a single failed sync does not stop the daemon. This function never returns.
    """
    success_line = f"[sync] pushed OK. next in {interval}s"
    while True:
        try:
            push(repo, home, persist)
            print(success_line)
        except Exception as failure:
            print(f"[sync] push failed: {failure}")
        time.sleep(interval)


if __name__ == "__main__":
    # Command-line entry point: one sub-command per operation.
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    # Sub-command name -> its required string options, in declaration order.
    required_options = {
        "pull": ("--repo", "--dst"),
        "rsync_in": ("--src", "--dst"),
        "push": ("--repo", "--home", "--persist"),
        "daemon": ("--repo", "--home", "--persist"),
    }
    subparsers = {}
    for command, flags in required_options.items():
        subparsers[command] = commands.add_parser(command)
        for flag in flags:
            subparsers[command].add_argument(flag, required=True)

    # Seconds between sync attempts in daemon mode.
    subparsers["daemon"].add_argument("--interval", type=int, default=300)

    opts = parser.parse_args()

    # Dispatch table; lambdas defer attribute access to the chosen command only.
    handlers = {
        "pull": lambda: pull(opts.repo, opts.dst),
        "rsync_in": lambda: rsync_in(opts.src, opts.dst),
        "push": lambda: push(opts.repo, opts.home, opts.persist),
        "daemon": lambda: daemon(opts.repo, opts.home, opts.persist, opts.interval),
    }
    handler = handlers.get(opts.cmd)
    if handler is not None:
        handler()