Spaces:
Sleeping
Sleeping
feat: full-disk persistence — tar entire / filesystem every 60s
Browse files
Sync everything on disk except virtual filesystems (/proc, /sys, /dev),
our persist path (/data), and temp dirs (/tmp, /run).
Sync interval changed from 120s to 60s; the first sync now runs after 30s.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- entrypoint.py +24 -33
entrypoint.py
CHANGED
|
@@ -24,7 +24,7 @@ from datetime import datetime, timezone
|
|
| 24 |
PERSIST_PATH = os.environ.get("PERSIST_PATH", "/data")
|
| 25 |
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 26 |
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "")
|
| 27 |
-
SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL", "
|
| 28 |
SSH_PORT = os.environ.get("SSH_PORT", "2222")
|
| 29 |
TTYD_PORT = os.environ.get("TTYD_PORT", "7681")
|
| 30 |
LOGFILE = "/var/log/huggingrun.log"
|
|
@@ -32,36 +32,28 @@ STATE_FILE = os.path.join(PERSIST_PATH, "state.tar.zst")
|
|
| 32 |
PKG_FILE = os.path.join(PERSIST_PATH, "user-packages.list")
|
| 33 |
BASE_PKG_FILE = "/etc/base-packages.list"
|
| 34 |
|
| 35 |
-
# Full
|
| 36 |
-
#
|
| 37 |
-
PERSIST_DIRS = [
|
| 38 |
-
"home",
|
| 39 |
-
"root",
|
| 40 |
-
"etc",
|
| 41 |
-
"opt",
|
| 42 |
-
"var",
|
| 43 |
-
"usr/local",
|
| 44 |
-
]
|
| 45 |
-
|
| 46 |
-
# Exclude from tar: Docker-managed, virtual, transient, or regenerable files
|
| 47 |
TAR_EXCLUDES = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
# Docker-managed (overwritten each container start)
|
| 49 |
"etc/hostname",
|
| 50 |
"etc/hosts",
|
| 51 |
"etc/resolv.conf",
|
| 52 |
"etc/mtab",
|
| 53 |
# Transient runtime
|
| 54 |
-
"var/run",
|
| 55 |
"var/lock",
|
| 56 |
-
|
| 57 |
-
"var/lib/apt/lists",
|
| 58 |
-
"var/log", # logs are ephemeral
|
| 59 |
-
# Sockets, pids, caches
|
| 60 |
"*.sock",
|
| 61 |
"*.pid",
|
| 62 |
-
"__pycache__",
|
| 63 |
-
"*.pyc",
|
| 64 |
-
".cache",
|
| 65 |
]
|
| 66 |
|
| 67 |
|
|
@@ -237,23 +229,22 @@ def save_and_upload():
|
|
| 237 |
except Exception:
|
| 238 |
pass
|
| 239 |
|
| 240 |
-
# Create tar.zst of
|
| 241 |
t0 = time.time()
|
| 242 |
-
dirs_to_persist = [d for d in PERSIST_DIRS if os.path.isdir(f"/{d}")]
|
| 243 |
-
if not dirs_to_persist:
|
| 244 |
-
log(" nothing to persist")
|
| 245 |
-
return
|
| 246 |
-
|
| 247 |
excludes = " ".join(f"--exclude='{e}'" for e in TAR_EXCLUDES)
|
| 248 |
-
cmd = f"tar --zstd {excludes} -cpf {STATE_FILE} -C /
|
| 249 |
rc, out = run(cmd)
|
| 250 |
elapsed = time.time() - t0
|
| 251 |
if rc != 0:
|
| 252 |
-
|
| 253 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
|
| 255 |
size_mb = os.path.getsize(STATE_FILE) / 1024 / 1024
|
| 256 |
-
log(f" tar: {size_mb:.1f}MB ({elapsed:.1f}s) [
|
| 257 |
|
| 258 |
# Upload both files
|
| 259 |
api = HfApi(token=HF_TOKEN)
|
|
@@ -292,8 +283,8 @@ def save_and_upload():
|
|
| 292 |
|
| 293 |
# ── Sync Thread ───────────────────────────────────────────────────────
|
| 294 |
def sync_loop():
|
| 295 |
-
log("sync thread: waiting
|
| 296 |
-
time.sleep(
|
| 297 |
cycle = 0
|
| 298 |
while True:
|
| 299 |
cycle += 1
|
|
|
|
| 24 |
PERSIST_PATH = os.environ.get("PERSIST_PATH", "/data")
|
| 25 |
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 26 |
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "")
|
| 27 |
+
SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL", "60"))
|
| 28 |
SSH_PORT = os.environ.get("SSH_PORT", "2222")
|
| 29 |
TTYD_PORT = os.environ.get("TTYD_PORT", "7681")
|
| 30 |
LOGFILE = "/var/log/huggingrun.log"
|
|
|
|
| 32 |
PKG_FILE = os.path.join(PERSIST_PATH, "user-packages.list")
|
| 33 |
BASE_PKG_FILE = "/etc/base-packages.list"
|
| 34 |
|
| 35 |
+
# Full-disk persistence: tar everything from / except virtual/transient dirs.
|
| 36 |
+
# Excludes only things that CANNOT or SHOULD NOT be persisted.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
TAR_EXCLUDES = [
|
| 38 |
+
# Virtual filesystems (kernel-provided, not real files)
|
| 39 |
+
"proc",
|
| 40 |
+
"sys",
|
| 41 |
+
"dev",
|
| 42 |
+
# Our own persist path (avoid recursive tar of state.tar.zst)
|
| 43 |
+
"data",
|
| 44 |
+
# Temporary
|
| 45 |
+
"tmp",
|
| 46 |
+
"run",
|
| 47 |
# Docker-managed (overwritten each container start)
|
| 48 |
"etc/hostname",
|
| 49 |
"etc/hosts",
|
| 50 |
"etc/resolv.conf",
|
| 51 |
"etc/mtab",
|
| 52 |
# Transient runtime
|
|
|
|
| 53 |
"var/lock",
|
| 54 |
+
# Sockets, pids
|
|
|
|
|
|
|
|
|
|
| 55 |
"*.sock",
|
| 56 |
"*.pid",
|
|
|
|
|
|
|
|
|
|
| 57 |
]
|
| 58 |
|
| 59 |
|
|
|
|
| 229 |
except Exception:
|
| 230 |
pass
|
| 231 |
|
| 232 |
+
# Create tar.zst of entire filesystem (full-disk persistence)
|
| 233 |
t0 = time.time()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
excludes = " ".join(f"--exclude='{e}'" for e in TAR_EXCLUDES)
|
| 235 |
+
cmd = f"tar --zstd {excludes} -cpf {STATE_FILE} -C / ."
|
| 236 |
rc, out = run(cmd)
|
| 237 |
elapsed = time.time() - t0
|
| 238 |
if rc != 0:
|
| 239 |
+
# tar returns 1 for "file changed as we read it" — that's OK
|
| 240 |
+
if rc == 1:
|
| 241 |
+
log(f" tar: warnings (files changed during archive) ({elapsed:.1f}s)")
|
| 242 |
+
else:
|
| 243 |
+
log(f" tar failed rc={rc} ({elapsed:.1f}s)")
|
| 244 |
+
return
|
| 245 |
|
| 246 |
size_mb = os.path.getsize(STATE_FILE) / 1024 / 1024
|
| 247 |
+
log(f" tar: {size_mb:.1f}MB ({elapsed:.1f}s) [full disk]")
|
| 248 |
|
| 249 |
# Upload both files
|
| 250 |
api = HfApi(token=HF_TOKEN)
|
|
|
|
| 283 |
|
| 284 |
# ── Sync Thread ───────────────────────────────────────────────────────
|
| 285 |
def sync_loop():
|
| 286 |
+
log("sync thread: waiting 30s before first sync")
|
| 287 |
+
time.sleep(30)
|
| 288 |
cycle = 0
|
| 289 |
while True:
|
| 290 |
cycle += 1
|