tao-shen Claude Opus 4.6 committed on
Commit
20de61f
·
1 Parent(s): acafb8d

feat: full-disk persistence — tar entire / filesystem every 60s

Browse files

Sync everything on disk except virtual filesystems (/proc, /sys, /dev),
our persist path (/data), and temporary directories (/tmp, /run).
Sync interval changed from 120s to 60s; first sync now runs after 30s (was 60s).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. entrypoint.py +24 -33
entrypoint.py CHANGED
@@ -24,7 +24,7 @@ from datetime import datetime, timezone
24
  PERSIST_PATH = os.environ.get("PERSIST_PATH", "/data")
25
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
26
  HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "")
27
- SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL", "120"))
28
  SSH_PORT = os.environ.get("SSH_PORT", "2222")
29
  TTYD_PORT = os.environ.get("TTYD_PORT", "7681")
30
  LOGFILE = "/var/log/huggingrun.log"
@@ -32,36 +32,28 @@ STATE_FILE = os.path.join(PERSIST_PATH, "state.tar.zst")
32
  PKG_FILE = os.path.join(PERSIST_PATH, "user-packages.list")
33
  BASE_PKG_FILE = "/etc/base-packages.list"
34
 
35
- # Full filesystem persistence — these dirs are synced to/from the dataset.
36
- # tar -p preserves all permissions, so no corruption.
37
- PERSIST_DIRS = [
38
- "home",
39
- "root",
40
- "etc",
41
- "opt",
42
- "var",
43
- "usr/local",
44
- ]
45
-
46
- # Exclude from tar: Docker-managed, virtual, transient, or regenerable files
47
  TAR_EXCLUDES = [
 
 
 
 
 
 
 
 
 
48
  # Docker-managed (overwritten each container start)
49
  "etc/hostname",
50
  "etc/hosts",
51
  "etc/resolv.conf",
52
  "etc/mtab",
53
  # Transient runtime
54
- "var/run",
55
  "var/lock",
56
- "var/cache/apt",
57
- "var/lib/apt/lists",
58
- "var/log", # logs are ephemeral
59
- # Sockets, pids, caches
60
  "*.sock",
61
  "*.pid",
62
- "__pycache__",
63
- "*.pyc",
64
- ".cache",
65
  ]
66
 
67
 
@@ -237,23 +229,22 @@ def save_and_upload():
237
  except Exception:
238
  pass
239
 
240
- # Create tar.zst of all persist dirs
241
  t0 = time.time()
242
- dirs_to_persist = [d for d in PERSIST_DIRS if os.path.isdir(f"/{d}")]
243
- if not dirs_to_persist:
244
- log(" nothing to persist")
245
- return
246
-
247
  excludes = " ".join(f"--exclude='{e}'" for e in TAR_EXCLUDES)
248
- cmd = f"tar --zstd {excludes} -cpf {STATE_FILE} -C / {' '.join(dirs_to_persist)}"
249
  rc, out = run(cmd)
250
  elapsed = time.time() - t0
251
  if rc != 0:
252
- log(f" tar failed ({elapsed:.1f}s)")
253
- return
 
 
 
 
254
 
255
  size_mb = os.path.getsize(STATE_FILE) / 1024 / 1024
256
- log(f" tar: {size_mb:.1f}MB ({elapsed:.1f}s) [{', '.join(dirs_to_persist)}]")
257
 
258
  # Upload both files
259
  api = HfApi(token=HF_TOKEN)
@@ -292,8 +283,8 @@ def save_and_upload():
292
 
293
  # ── Sync Thread ───────────────────────────────────────────────────────
294
  def sync_loop():
295
- log("sync thread: waiting 60s before first sync")
296
- time.sleep(60)
297
  cycle = 0
298
  while True:
299
  cycle += 1
 
24
  PERSIST_PATH = os.environ.get("PERSIST_PATH", "/data")
25
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
26
  HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "")
27
+ SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL", "60"))
28
  SSH_PORT = os.environ.get("SSH_PORT", "2222")
29
  TTYD_PORT = os.environ.get("TTYD_PORT", "7681")
30
  LOGFILE = "/var/log/huggingrun.log"
 
32
  PKG_FILE = os.path.join(PERSIST_PATH, "user-packages.list")
33
  BASE_PKG_FILE = "/etc/base-packages.list"
34
 
35
+ # Full-disk persistence: tar everything from / except virtual/transient dirs.
36
+ # Excludes only things that CANNOT or SHOULD NOT be persisted.
 
 
 
 
 
 
 
 
 
 
37
  TAR_EXCLUDES = [
38
+ # Virtual filesystems (kernel-provided, not real files)
39
+ "proc",
40
+ "sys",
41
+ "dev",
42
+ # Our own persist path (avoid recursive tar of state.tar.zst)
43
+ "data",
44
+ # Temporary
45
+ "tmp",
46
+ "run",
47
  # Docker-managed (overwritten each container start)
48
  "etc/hostname",
49
  "etc/hosts",
50
  "etc/resolv.conf",
51
  "etc/mtab",
52
  # Transient runtime
 
53
  "var/lock",
54
+ # Sockets, pids
 
 
 
55
  "*.sock",
56
  "*.pid",
 
 
 
57
  ]
58
 
59
 
 
229
  except Exception:
230
  pass
231
 
232
+ # Create tar.zst of entire filesystem (full-disk persistence)
233
  t0 = time.time()
 
 
 
 
 
234
  excludes = " ".join(f"--exclude='{e}'" for e in TAR_EXCLUDES)
235
+ cmd = f"tar --zstd {excludes} -cpf {STATE_FILE} -C / ."
236
  rc, out = run(cmd)
237
  elapsed = time.time() - t0
238
  if rc != 0:
239
+ # tar returns 1 for "file changed as we read it" — that's OK
240
+ if rc == 1:
241
+ log(f" tar: warnings (files changed during archive) ({elapsed:.1f}s)")
242
+ else:
243
+ log(f" tar failed rc={rc} ({elapsed:.1f}s)")
244
+ return
245
 
246
  size_mb = os.path.getsize(STATE_FILE) / 1024 / 1024
247
+ log(f" tar: {size_mb:.1f}MB ({elapsed:.1f}s) [full disk]")
248
 
249
  # Upload both files
250
  api = HfApi(token=HF_TOKEN)
 
283
 
284
  # ── Sync Thread ───────────────────────────────────────────────────────
285
  def sync_loop():
286
+ log("sync thread: waiting 30s before first sync")
287
+ time.sleep(30)
288
  cycle = 0
289
  while True:
290
  cycle += 1