asons committed on
Commit
04c5f6e
·
1 Parent(s): 3ee93dd

Add Dockerfile, entrypoint, sync_home, nginx template from hubs

Browse files
Files changed (4) hide show
  1. Dockerfile +59 -0
  2. entrypoint.sh +292 -0
  3. nginx.conf.template +27 -0
  4. sync_home.py +229 -0
Dockerfile ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM codercom/code-server:latest

USER root

# Build-time only: keeps apt/dpkg from stopping the build on interactive
# prompts. ARG values apply to the RUN steps below and are not exported
# into the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive

# ---- Base tooling, nginx (public reverse proxy) and apache2-utils (htpasswd) ----
RUN apt-get update && apt-get install -y \
      git curl wget ca-certificates unzip jq \
      zsh openssh-client rsync \
      nginx apache2-utils \
      build-essential \
    && rm -rf /var/lib/apt/lists/*

# ---- Python & venv ----
RUN apt-get update && apt-get install -y \
      python3 python3-venv python3-pip \
    && rm -rf /var/lib/apt/lists/*

# ---- Create the virtual environment ----
RUN python3 -m venv /opt/venv

# Put the venv first on PATH so `python` / `pip` resolve to it without an
# explicit "activate" step.
ENV PATH="/opt/venv/bin:$PATH"
ENV VIRTUAL_ENV=/opt/venv

# ---- Upgrade pip and install dependencies ----
# The final `python -c` is a build-time smoke test that the import works.
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir "huggingface_hub==0.26.*" \
    && python -c "import huggingface_hub; print('huggingface_hub=', huggingface_hub.__version__)"

# Install Node.js 22.x from NodeSource (replaces the older node shipped by apt)
RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates curl gnupg \
    && curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \
    && apt-get install -y --no-install-recommends nodejs \
    && rm -rf /var/lib/apt/lists/*

# Keep npm's cache and global install prefix under coder's HOME
# (HOME is what the entrypoint later syncs to the dataset).
ENV NPM_CONFIG_CACHE=/home/coder/.npm \
    NPM_CONFIG_PREFIX=/home/coder/.npm-global \
    PATH=/home/coder/.npm-global/bin:$PATH

# Make sure the directories exist and are owned by coder
RUN mkdir -p /home/coder/.npm /home/coder/.npm-global \
    && chown -R coder:coder /home/coder/.npm /home/coder/.npm-global

# Install the global CLIs as the coder user so the files are owned by coder
USER coder
RUN npm i -g @cometix/codex @anthropic-ai/claude-code

# Back to root: entrypoint.sh chowns files, writes under /etc and /usr/local/bin,
# starts nginx, and only then drops to coder (via su) for code-server.
USER root

COPY entrypoint.sh /entrypoint.sh
COPY sync_home.py /sync_home.py
COPY nginx.conf.template /etc/nginx/templates/nginx.conf.template
RUN chmod +x /entrypoint.sh

WORKDIR /home/coder/workspace

# Port nginx listens on (see nginx.conf.template)
EXPOSE 7860
ENTRYPOINT ["/entrypoint.sh"]
entrypoint.sh ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ ############################################
5
+ # Required env vars (Space 设置):
6
+ # Secrets:
7
+ # HF_TOKEN
8
+ # CODE_SERVER_PASSWORD
9
+ # BASIC_AUTH_PASSWORD
10
+ # Variables:
11
+ # CONFIG_DATASET e.g. "yourname/ubuntu"
12
+ # BASIC_AUTH_USER e.g. "gally"
13
+ # SYNC_INTERVAL_SECONDS e.g. "300"
14
+ #
15
+ # Optional:
16
+ # RESET_CLI_CONFIG=1 # if dataset has no .claude/.codex, remove local remnants to force fresh bootstrap
17
+ ############################################
18
+
19
+ : "${CONFIG_DATASET:?CONFIG_DATASET is required, e.g. yourname/ubuntu}"
20
+ : "${HF_TOKEN:?HF_TOKEN secret is required}"
21
+ : "${CODE_SERVER_PASSWORD:?CODE_SERVER_PASSWORD secret is required}"
22
+ : "${BASIC_AUTH_USER:?BASIC_AUTH_USER variable is required}"
23
+ : "${BASIC_AUTH_PASSWORD:?BASIC_AUTH_PASSWORD secret is required}"
24
+ : "${SYNC_INTERVAL_SECONDS:=300}"
25
+
26
+ # Use venv python (has huggingface_hub installed)
27
+ PY="/opt/venv/bin/python"
28
+
29
+ # Canonical HOME
30
+ export HOME="/home/coder"
31
+
32
+ # npm globals (claude/codex)
33
+ export NPM_CONFIG_CACHE="${NPM_CONFIG_CACHE:-$HOME/.npm}"
34
+ export NPM_CONFIG_PREFIX="${NPM_CONFIG_PREFIX:-$HOME/.npm-global}"
35
+ export PATH="$HOME/.npm-global/bin:$PATH"
36
+ mkdir -p "$NPM_CONFIG_CACHE" "$NPM_CONFIG_PREFIX"
37
+ chown -R coder:coder "$HOME" || true
38
+
39
+ # Make interactive shells stable
40
+ cat >/etc/profile.d/00-dev-env.sh <<'EOF'
41
+ export HOME=/home/coder
42
+ export NPM_CONFIG_PREFIX="$HOME/.npm-global"
43
+ export PATH="$HOME/.npm-global/bin:$PATH"
44
+ EOF
45
+ chmod +x /etc/profile.d/00-dev-env.sh
46
+
47
+ # Bootstrap .bashrc with proper prompt and PATH
48
+ if ! grep -q 'npm-global/bin' /home/coder/.bashrc 2>/dev/null; then
49
+ cat >>/home/coder/.bashrc <<'EOF'
50
+
51
+ export HOME=/home/coder
52
+ export NPM_CONFIG_PREFIX="$HOME/.npm-global"
53
+ case ":$PATH:" in
54
+ *":$HOME/.npm-global/bin:"*) ;;
55
+ *) export PATH="$HOME/.npm-global/bin:$PATH" ;;
56
+ esac
57
+ export PS1='$ '
58
+ EOF
59
+ fi
60
+ chown coder:coder /home/coder/.bashrc 2>/dev/null || true
61
+
62
+ # ---- HF auth/cache MUST stay OUT of $HOME (you sync entire HOME) ----
63
+ # huggingface_hub supports HF_HOME/HF_HUB_CACHE env vars. [8](https://github.com/q09sssisiwjb/Use-vscode-chrome-terminal)
64
+ export HF_TOKEN="${HF_TOKEN}"
65
+ export HF_HOME="/tmp/hf_home"
66
+ export HF_TOKEN_PATH="/tmp/hf_home/token"
67
+ export HF_HUB_CACHE="/tmp/hf_home/hub"
68
+ export HF_ASSETS_CACHE="/tmp/hf_home/assets"
69
+
70
+ rm -rf "${HF_HOME}" 2>/dev/null || true
71
+ mkdir -p "${HF_HUB_CACHE}" "${HF_ASSETS_CACHE}"
72
+ chmod -R 777 "${HF_HOME}" 2>/dev/null || true
73
+
74
+ echo "[debug] Using PY=${PY}"
75
+ "${PY}" -c "import huggingface_hub; print('[debug] huggingface_hub=', huggingface_hub.__version__)"
76
+
77
# ---- Pull dataset snapshot ----
# PERSIST is the local working copy of the dataset repo; its home/ subfolder
# mirrors /home/coder.
PERSIST="/persist_repo"
mkdir -p "${PERSIST}"

echo "[boot] Pull dataset snapshot -> ${PERSIST}"
"${PY}" /sync_home.py pull --repo "${CONFIG_DATASET}" --dst "${PERSIST}"

# ---- Restore dataset home -> /home/coder (NO delete) ----
# Keeps image-preinstalled dirs from being wiped.
if [ -d "${PERSIST}/home" ]; then
  echo "[boot] Restore ${PERSIST}/home -> ${HOME} (NO delete)"
  "${PY}" /sync_home.py rsync_in --src "${PERSIST}/home" --dst "${HOME}"
else
  echo "[boot] Dataset has no 'home/' yet. Initializing..."
  mkdir -p "${PERSIST}/home"
fi

# Fix ownership after restore
chown -R coder:coder "${HOME}" || true

# ---- Optional: clean bootstrap of .claude/.codex if dataset lacks them ----
# Use if you want to ensure no local remnants remain when dataset does not have these dirs.
if [ "${RESET_CLI_CONFIG:-0}" = "1" ]; then
  if [ ! -d "${PERSIST}/home/.claude" ]; then rm -rf /home/coder/.claude 2>/dev/null || true; fi
  if [ ! -d "${PERSIST}/home/.codex" ]; then rm -rf /home/coder/.codex 2>/dev/null || true; fi
  if [ ! -f "${PERSIST}/home/.claude.json" ]; then rm -f /home/coder/.claude.json 2>/dev/null || true; fi
fi

# ---- Restore .claude/.codex only if present in dataset snapshot ----
# Unlike the general restore above, these two use --delete: the dataset copy
# is authoritative and local files missing from it are removed.
if [ -d "${PERSIST}/home/.claude" ]; then
  echo "[fix] Restore ~/.claude from dataset (authoritative)"
  mkdir -p /home/coder/.claude
  rsync -a --delete "${PERSIST}/home/.claude/" "/home/coder/.claude/"
  chown -R coder:coder /home/coder/.claude || true
else
  echo "[fix] Dataset has no .claude -> skip restore"
fi

if [ -d "${PERSIST}/home/.codex" ]; then
  echo "[fix] Restore ~/.codex from dataset (authoritative)"
  mkdir -p /home/coder/.codex
  rsync -a --delete "${PERSIST}/home/.codex/" "/home/coder/.codex/"
  chown -R coder:coder /home/coder/.codex || true
else
  echo "[fix] Dataset has no .codex -> skip restore"
fi
123
+
124
# ---- Claude onboarding bypass flag (user-scope file) ----
# Many guides suggest setting hasCompletedOnboarding=true as a TOP-LEVEL field
# in ~/.claude.json. This helps avoid onboarding/login/connectivity blockers.
# It does not replace API auth.

#######################################
# Ensure a JSON file carries top-level "hasCompletedOnboarding": true.
# An existing JSON object is MERGED (all other keys are kept) instead of being
# replaced, so state restored from the dataset is not thrown away.
# Falls back to writing a minimal file when the file is missing, empty, not a
# JSON object, or when jq is unavailable and the flag is absent.
# Arguments: $1 - path of the JSON file
# Outputs:   a notice on stderr when an unparsable file is replaced
# Returns:   0 on success, non-zero if the file cannot be written
#######################################
ensure_claude_onboarding_flag() {
  local target=$1
  local tmp

  if [[ -s "${target}" ]]; then
    if command -v jq >/dev/null 2>&1; then
      # Already set at top level: leave the file untouched.
      if jq -e '.hasCompletedOnboarding == true' "${target}" >/dev/null 2>&1; then
        return 0
      fi
      # Write to a temp file next to the target, then rename over it, so a
      # failed jq run never leaves a truncated config behind.
      tmp="$(mktemp "${target}.XXXXXX")" || return 1
      if jq '.hasCompletedOnboarding = true' "${target}" >"${tmp}" 2>/dev/null; then
        mv -f "${tmp}" "${target}"
        return 0
      fi
      rm -f "${tmp}"
      echo "[fix] ${target} is not a JSON object -> rewriting minimal file" >&2
    elif grep -q '"hasCompletedOnboarding"[[:space:]]*:[[:space:]]*true' "${target}"; then
      # No jq available: keep the file when a textual check finds the flag.
      return 0
    fi
  fi

  cat >"${target}" <<'EOF'
{ "hasCompletedOnboarding": true }
EOF
}

ensure_claude_onboarding_flag /home/coder/.claude.json
chown coder:coder /home/coder/.claude.json || true
140
+
141
# ---- Bootstrap minimal skeleton configs if absent ----
# Claude user settings live in ~/.claude/settings.json.
# Existing files are never overwritten; only missing ones are created.
mkdir -p /home/coder/.claude
if [ ! -f /home/coder/.claude/settings.json ]; then
cat >/home/coder/.claude/settings.json <<'EOF'
{
  "$schema": "https://json-schema.org/claude-code-settings.json",
  "env": {
    "ANTHROPIC_BASE_URL": "",
    "ANTHROPIC_AUTH_TOKEN": "",
    "ANTHROPIC_MODEL": ""
  },
  "permissions": {
    "allow": [],
    "deny": []
  }
}
EOF
fi
if [ ! -f /home/coder/.claude/CLAUDE.md ]; then
cat >/home/coder/.claude/CLAUDE.md <<'EOF'
# Claude Code global instructions (portable)
EOF
fi
chown -R coder:coder /home/coder/.claude || true

# Codex user config lives in ~/.codex/config.toml.
mkdir -p /home/coder/.codex
if [ ! -f /home/coder/.codex/config.toml ]; then
cat >/home/coder/.codex/config.toml <<'EOF'
# Codex user config (portable baseline)
# User-level configuration lives in ~/.codex/config.toml. [5](https://stackoverflow.com/questions/66496890/vs-code-nopermissions-filesystemerror-error-eacces-permission-denied)[6](https://hugging-face.cn/docs/hub/spaces-storage)
model_provider = "openai"
# model = "gpt-5.2"
# approval_policy = "on-request"
# sandbox_mode = "workspace-write"
EOF
fi
chown -R coder:coder /home/coder/.codex || true

# ---- Root fallback symlinks (so even processes with HOME=/root use coder configs) ----
# Remove any real directories first so the links below replace them.
rm -rf /root/.claude /root/.codex 2>/dev/null || true
ln -sfn /home/coder/.claude /root/.claude
ln -sfn /home/coder/.codex /root/.codex
ln -sfn /home/coder/.claude.json /root/.claude.json

# ---- Fix codex vendor binary exec bit (Codex spawns this binary) ----
# NOTE(review): the path is specific to the x86_64 musl build; on another
# architecture the file test below is simply false and nothing happens.
CODEX_BIN="/home/coder/.npm-global/lib/node_modules/@cometix/codex/vendor/x86_64-unknown-linux-musl/codex/codex"
if [ -f "$CODEX_BIN" ]; then
  echo "[fix] chmod +x codex vendor binary: $CODEX_BIN"
  chmod 755 "$CODEX_BIN" || true
  chmod 755 "$(dirname "$CODEX_BIN")" 2>/dev/null || true
fi

# ---- Install wrappers so claude/codex always runnable (exec-bit loss safe) ----
# The wrappers invoke node on the JS entry file directly, so they do not
# depend on the entry file itself being executable.
CLAUDE_JS="/home/coder/.npm-global/lib/node_modules/@anthropic-ai/claude-code/cli.js"
CODEX_JS="/home/coder/.npm-global/lib/node_modules/@cometix/codex/bin/codex.js"

# Unquoted EOF: ${CLAUDE_JS} is expanded now; \$@ is escaped so the generated
# wrapper forwards its own arguments at run time.
cat >/usr/local/bin/claude <<EOF
#!/usr/bin/env bash
exec /usr/bin/node "${CLAUDE_JS}" "\$@"
EOF
chmod 755 /usr/local/bin/claude

cat >/usr/local/bin/codex <<EOF
#!/usr/bin/env bash
exec /usr/bin/node "${CODEX_JS}" "\$@"
EOF
chmod 755 /usr/local/bin/codex
210
+
211
# ---- Nginx basic auth ----
# Feed the password on stdin (-i) instead of the command line (-b): command-line
# arguments are visible to other processes via `ps`, stdin is not. printf is a
# bash builtin, so the secret never appears in any argv. -c (re)creates the file.
printf '%s\n' "${BASIC_AUTH_PASSWORD}" \
  | htpasswd -ic /etc/nginx/.htpasswd "${BASIC_AUTH_USER}"
# Activate the nginx config shipped with the image.
cp /etc/nginx/templates/nginx.conf.template /etc/nginx/nginx.conf
214
+
215
# ---- code-server dirs ----
USER_DATA_DIR="/home/coder/.local/share/code-server"
EXT_DIR="/home/coder/.local/share/code-server/extensions"
mkdir -p "${USER_DATA_DIR}/User" "${EXT_DIR}"
chown -R coder:coder "${USER_DATA_DIR}" "${EXT_DIR}" || true

# ---- VS Code user settings: create baseline ONCE, never overwrite user changes ----
SETTINGS_JSON="${USER_DATA_DIR}/User/settings.json"
mkdir -p "${USER_DATA_DIR}/User"
chown -R coder:coder "${USER_DATA_DIR}/User" || true

if [ ! -f "${SETTINGS_JSON}" ]; then
# Quoted 'EOF': ${env:PATH} is VS Code's own variable syntax and must reach
# the file literally, not be expanded by bash.
cat > "${SETTINGS_JSON}" <<'EOF'
{
  "terminal.integrated.defaultProfile.linux": "bash",
  "terminal.integrated.profiles.linux": {
    "bash": { "path": "/bin/bash" }
  },
  "terminal.integrated.cwd": "/home/coder/workspace",
  "terminal.integrated.env.linux": {
    "HOME": "/home/coder",
    "NPM_CONFIG_PREFIX": "/home/coder/.npm-global",
    "PATH": "/home/coder/.npm-global/bin:${env:PATH}"
  },
  "window.restoreWindows": "none"
}
EOF
  chown coder:coder "${SETTINGS_JSON}" || true
else
  echo "[boot] VS Code settings.json exists -> keep user customizations (no overwrite)"
fi

# ---- Start code-server (ignore last opened to avoid /root watcher EACCES) ----
# PASSWORD is presumably what code-server's `--auth password` mode reads — confirm
# against the code-server docs.
export PASSWORD="${CODE_SERVER_PASSWORD}"
echo "[boot] Start code-server with explicit user-data-dir/extensions-dir"
mkdir -p /home/coder/workspace
chown coder:coder /home/coder/workspace

# Run as the unprivileged coder user. `su -p` keeps the current environment
# (including PASSWORD exported above). code-server binds to loopback only;
# nginx is the public listener in front of it.
su -p coder -c "export HOME=/home/coder; export PATH=/home/coder/.npm-global/bin:\$PATH; \
  /usr/bin/code-server \
  --bind-addr 127.0.0.1:8080 \
  --auth password \
  --ignore-last-opened \
  --user-data-dir /home/coder/.local/share/code-server \
  --extensions-dir /home/coder/.local/share/code-server/extensions \
  /home/coder/workspace" &
CODE_PID=$!

# ---- Start nginx (public 7860) ----
# "daemon off;" keeps nginx attached to this shell's job so its PID can be waited on.
nginx -g "daemon off;" &
NGINX_PID=$!

# ---- Sync daemon (home -> dataset) ----
# Pushes HOME to the dataset every SYNC_INTERVAL_SECONDS seconds.
"${PY}" /sync_home.py daemon \
  --repo "${CONFIG_DATASET}" \
  --home "${HOME}" \
  --persist "${PERSIST}" \
  --interval "${SYNC_INTERVAL_SECONDS}" &
SYNC_PID=$!
275
+
276
#######################################
# Push HOME to the dataset one last time before the container stops.
# Best effort: a failed push must not block shutdown, but it is reported
# instead of being silently discarded.
# Globals:   PY, CONFIG_DATASET, HOME, PERSIST (read)
# Outputs:   progress on stdout; a warning on stderr if the push fails
# Returns:   0 always
#######################################
final_sync() {
  echo "[sync] Final sync..."
  if ! "${PY}" /sync_home.py push --repo "${CONFIG_DATASET}" --home "${HOME}" --persist "${PERSIST}"; then
    echo "[sync] WARNING: final sync failed; changes since the last successful push may be lost" >&2
  fi
}
280
+
281
#######################################
# Signal handler: persist HOME one last time, then stop all services.
# The periodic sync daemon is stopped and reaped BEFORE the final sync so two
# sync processes never rsync into / upload from the same persist dir at once.
# Globals:   SYNC_PID, CODE_PID, NGINX_PID (read)
# Outputs:   a log line on stdout (plus whatever final_sync prints)
# Returns:   does not return; exits the script with status 0
#######################################
shutdown() {
  echo "[signal] termination received"
  kill "${SYNC_PID}" 2>/dev/null || true
  # Reap the daemon so it is really gone before the final sync starts.
  wait "${SYNC_PID}" 2>/dev/null || true
  final_sync
  kill "${CODE_PID}" 2>/dev/null || true
  kill "${NGINX_PID}" 2>/dev/null || true
  exit 0
}
289
+
290
+ trap shutdown SIGTERM SIGINT
291
+
292
+ wait "${NGINX_PID}"
nginx.conf.template ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Reverse proxy in front of code-server: listens on 7860, enforces HTTP basic
# auth, and forwards everything to the code-server instance on loopback.
worker_processes 1;

events { worker_connections 1024; }

http {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  sendfile on;

  server {
    # Public port (matches EXPOSE in the Dockerfile)
    listen 7860;

    # Credentials file is generated by entrypoint.sh with htpasswd at startup
    auth_basic "Restricted";
    auth_basic_user_file /etc/nginx/.htpasswd;

    location / {
      # code-server is started with --bind-addr 127.0.0.1:8080
      proxy_pass http://127.0.0.1:8080;
      proxy_http_version 1.1;

      proxy_set_header Host $host;
      # Upgrade/Connection headers let WebSocket connections pass through the proxy
      proxy_set_header Upgrade $http_upgrade;
      proxy_set_header Connection "upgrade";
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header X-Forwarded-Proto $scheme;
    }
  }
}
sync_home.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import subprocess
4
+ import time
5
+ import shutil
6
+ from huggingface_hub import snapshot_download, HfApi
7
+
8
+
9
+ # Hugging Face Hub commit validation forbids pushing files under certain folder names,
10
+ # including ".cache". If we try to upload home/.cache/** we will get:
11
+ # "Invalid path_in_repo ... cannot update files under a '.cache/' folder".
12
+ # This is enforced server-side / client-side validation (FORBIDDEN_FOLDERS). [1](https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/_commit_api.py)
13
+ FORCED_EXCLUDES = [".cache"]
14
+
15
+ # Optional default excludes to keep repo size reasonable.
16
+ # NOTE: Do NOT exclude code-server extensions/User if you want them persisted.
17
+ DEFAULT_EXCLUDES = [
18
+ # huge and usually not worth versioning
19
+ "node_modules",
20
+ "__pycache__",
21
+ ".local/share/Trash",
22
+
23
+ # optional caches (keep if you want full persistence; remove from here if desired)
24
+ # ".npm/_cacache", # many users exclude this; you may keep it if you want
25
+ # ".local/share/code-server/Cache",
26
+ # ".local/share/code-server/CachedData",
27
+ # ".local/share/code-server/GPUCache",
28
+ # ".local/share/code-server/logs",
29
+ ]
30
+
31
+
32
def run(cmd):
    """Execute ``cmd`` (an argv list) and raise CalledProcessError on a non-zero exit."""
    subprocess.run(cmd, check=True)
34
+
35
+
36
def capture(cmd):
    """Execute ``cmd`` and return its output as text, with stderr folded into stdout.

    Raises CalledProcessError when the command exits non-zero.
    """
    finished = subprocess.run(
        cmd,
        check=True,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    return finished.stdout
38
+
39
+
40
def parse_excludes():
    """
    Build the ordered, de-duplicated list of rsync exclude patterns.

    Sources, in order:
      - DEFAULT_EXCLUDES (optional)
      - SYNC_EXCLUDES env var: comma-separated patterns
      - FORCED_EXCLUDES: always enforced (currently ".cache")

    SYNC_DISABLE_EXCLUDES=1 turns the optional sources off; FORCED_EXCLUDES
    are still applied because the Hub rejects ".cache".
    """
    patterns = []

    # NOTE(review): the source this was recovered from had lost its indentation.
    # SYNC_EXCLUDES is treated here as being switched off together with the
    # defaults by SYNC_DISABLE_EXCLUDES=1 — confirm against the original file.
    if os.environ.get("SYNC_DISABLE_EXCLUDES") != "1":
        patterns += DEFAULT_EXCLUDES
        user_patterns = os.environ.get("SYNC_EXCLUDES", "").strip()
        if user_patterns:
            patterns += [tok.strip() for tok in user_patterns.split(",") if tok.strip()]

    # Forbidden folders are excluded unconditionally.
    patterns += FORCED_EXCLUDES

    # dict keys keep first-seen order, which drops duplicates without reordering.
    return list(dict.fromkeys(patterns))
69
+
70
+
71
def _rsync_cmd(src: str, dst: str, delete: bool, extra_flags=()):
    """Assemble the rsync argv shared by the real run and the dry-run check.

    Both callers must use identical options, otherwise the dry-run would report
    changes for a different transfer than the one actually performed.
    """
    cmd = ["rsync", "-a", *extra_flags]
    if delete:
        cmd.append("--delete")
    for pat in parse_excludes():
        cmd += ["--exclude", pat]
    # Trailing slashes: copy the CONTENTS of src into dst, not src itself.
    cmd += [src.rstrip("/") + "/", dst.rstrip("/") + "/"]
    return cmd


def rsync(src: str, dst: str, delete: bool):
    """
    Mirror the contents of ``src`` into ``dst`` with ``rsync -a``.

    delete: when True, pass --delete so files missing from src are removed
            from dst.
    Patterns from parse_excludes() are always excluded.
    Raises subprocess.CalledProcessError if rsync exits non-zero.
    """
    run(_rsync_cmd(src, dst, delete))


def rsync_has_changes(src: str, dst: str, delete: bool) -> bool:
    """
    Detect whether an rsync would change anything (to skip empty commits).

    Runs the same transfer as rsync() in --dry-run mode with --itemize-changes
    and reports True when it produces any output.
    """
    cmd = _rsync_cmd(src, dst, delete, extra_flags=("--dry-run", "--itemize-changes"))

    try:
        out = capture(cmd)
    except subprocess.CalledProcessError:
        # if dry-run fails, be conservative and say "has changes"
        return True

    # rsync prints one line per changed item; ignore empty output
    return any(line.strip() for line in out.splitlines())
105
+
106
+
107
def pull(repo: str, dst: str):
    """
    Fetch a snapshot of the dataset repo ``repo`` into the directory ``dst``.

    ``dst`` is created if needed. Authentication uses the HF_TOKEN environment
    variable (None when it is unset).
    """
    hub_token = os.environ.get("HF_TOKEN")
    os.makedirs(dst, exist_ok=True)

    # snapshot_download uses a local cache; its location is controlled by
    # the HF_HOME/HF_HUB_CACHE env vars.
    snapshot_download(repo_id=repo, repo_type="dataset", local_dir=dst, token=hub_token)
120
+
121
+
122
def rsync_in(src: str, dst: str):
    """
    Restore direction: dataset snapshot -> home.

    Never deletes anything in ``dst``, so directories that exist only in the
    image (such as .npm-global) are not wiped.
    """
    rsync(src=src, dst=dst, delete=False)
128
+
129
+
130
def rsync_out(home: str, persist_home: str):
    """
    Backup direction: home -> dataset snapshot folder.

    Deletion is enabled so the snapshot's home mirrors the current home; the
    forced ".cache" exclude still applies because the Hub rejects that folder.
    """
    rsync(src=home, dst=persist_home, delete=True)
137
+
138
+
139
def sanitize_forbidden(persist: str):
    """
    Remove forbidden folders from persist/home before upload.

    Currently this means every directory named ".cache" at ANY depth below
    persist/home, matching the rsync exclude, which is not anchored to the top
    level either. Only persist/home is touched; a ".cache" directly under
    ``persist`` is left alone. Missing directories and removal errors are
    ignored (best effort).
    """
    home_root = os.path.join(persist, "home")
    for current, dirnames, _filenames in os.walk(home_root):
        if ".cache" not in dirnames:
            continue
        target = os.path.join(current, ".cache")
        if os.path.islink(target):
            # rmtree refuses symlinks; drop the link itself, never its target.
            try:
                os.unlink(target)
            except OSError:
                pass
        else:
            shutil.rmtree(target, ignore_errors=True)
        # Prune so os.walk does not try to descend into what was just removed.
        dirnames.remove(".cache")
146
+
147
+
148
def push_repo(repo: str, persist: str):
    """
    Upload the whole ``persist`` folder to the root of dataset repo ``repo``.

    Forbidden folders are removed from persist/home first. Authentication uses
    the HF_TOKEN environment variable.
    """
    sanitize_forbidden(persist)

    hub = HfApi(token=os.environ.get("HF_TOKEN"))
    stamp = time.strftime('%Y-%m-%d %H:%M:%S')

    # Second safety layer: even if a .cache folder slipped through, these
    # patterns keep it out of the commit.
    upload_args = dict(
        repo_id=repo,
        repo_type="dataset",
        folder_path=persist,
        path_in_repo="",
        commit_message=f"sync home: {stamp}",
        ignore_patterns=[
            "home/.cache/**",
            ".cache/**",
        ],
    )
    hub.upload_folder(**upload_args)
169
+
170
+
171
def push(repo: str, home: str, persist: str):
    """
    home -> persist/home via rsync, then upload persist to the Hub.

    The upload is skipped when home and persist/home are already identical,
    unless a previous upload is still pending. Without that exception a failed
    upload would never be retried: the rsync step has already copied the
    changes, so the next dry-run reports "no changes".

    Raises whatever rsync (CalledProcessError) or the Hub upload raise.
    """
    persist_home = os.path.join(persist, "home")
    os.makedirs(persist_home, exist_ok=True)

    # The marker sits NEXT TO the persist dir, not inside it, so it is never
    # uploaded. As a file it is also seen by a separate `push` process.
    pending_marker = os.path.abspath(persist) + ".upload_pending"

    changed = rsync_has_changes(home, persist_home, delete=True)

    # If nothing changed and nothing is waiting to be uploaded, skip the commit
    if not changed and not os.path.exists(pending_marker):
        print(
            "No files have been modified since last commit. Skipping to prevent empty commit.",
            flush=True,
        )
        return

    if changed:
        rsync_out(home, persist_home)

    try:
        # Record "upload owed" before attempting it.
        with open(pending_marker, "w"):
            pass
    except OSError:
        # Best effort: without the marker we only lose the retry guarantee.
        pass

    push_repo(repo, persist)

    # Reached only when the upload succeeded.
    try:
        os.remove(pending_marker)
    except OSError:
        pass
185
+
186
+
187
def daemon(repo: str, home: str, persist: str, interval: int):
    """
    Run push() forever, sleeping ``interval`` seconds between attempts.

    A failing push is logged and does not stop the loop. Output is flushed on
    every message: when stdout is a pipe rather than a terminal, Python
    block-buffers it, so unflushed status lines would reach the container logs
    long after the event.
    """
    while True:
        try:
            push(repo, home, persist)
            print(f"[sync] pushed OK. next in {interval}s", flush=True)
        except Exception as e:
            print(f"[sync] push failed: {e}", flush=True)
        time.sleep(interval)
195
+
196
+
197
if __name__ == "__main__":
    # CLI: one subcommand per operation; every string option is mandatory.
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    # subcommand -> its required options, in declaration order
    required_opts = {
        "pull": ("--repo", "--dst"),
        "rsync_in": ("--src", "--dst"),
        "push": ("--repo", "--home", "--persist"),
        "daemon": ("--repo", "--home", "--persist"),
    }
    subparsers = {}
    for cmd_name, opts in required_opts.items():
        sp = commands.add_parser(cmd_name)
        for opt in opts:
            sp.add_argument(opt, required=True)
        subparsers[cmd_name] = sp
    # Only the daemon takes an interval (seconds between pushes).
    subparsers["daemon"].add_argument("--interval", type=int, default=300)

    ns = parser.parse_args()

    # Lambdas defer attribute access: each namespace only has the options of
    # the subcommand that was actually selected.
    handlers = {
        "pull": lambda: pull(ns.repo, ns.dst),
        "rsync_in": lambda: rsync_in(ns.src, ns.dst),
        "push": lambda: push(ns.repo, ns.home, ns.persist),
        "daemon": lambda: daemon(ns.repo, ns.home, ns.persist, ns.interval),
    }
    handlers[ns.cmd]()