HuggingClaw / jupyter-devdata-sync.py
Anurag
Prune DevData traversal excludes
ff724b2
Raw
History Blame Contribute Delete
18.1 kB
#!/usr/bin/env python3
from __future__ import annotations
import os, shutil, socket, sys, tempfile, time
from pathlib import Path
# --- Environment-driven configuration (all values are read once at import) ---
# Hugging Face access token. enabled() returns False when this is empty.
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
# Namespace that owns the dataset repo; falls back to SPACE_AUTHOR_NAME.
# When both are empty, repo_id() asks the API (whoami) instead.
HF_USERNAME = os.environ.get("HF_USERNAME", "").strip() or os.environ.get("SPACE_AUTHOR_NAME", "").strip()
# Name of the dataset repo that DevData snapshots are uploaded to.
DATASET_NAME = os.environ.get("DEVDATA_DATASET_NAME", "").strip() or "huggingclaw-devdata"
# Name of the backup dataset. In this file it is only compared against
# DATASET_NAME: enabled() refuses to sync when the two names are equal.
BACKUP_DATASET_NAME = os.environ.get("BACKUP_DATASET_NAME", "").strip() or os.environ.get("BACKUP_DATASET", "").strip() or "huggingclaw-backup"
# Directory tree that is snapshotted for upload and restored into.
JUPYTER_ROOT = Path(os.environ.get("JUPYTER_ROOT_DIR", "/home/node")).resolve()
# Seconds slept between sync_loop() iterations. A non-integer value raises
# ValueError here, at import time.
INTERVAL = int((os.environ.get("DEVDATA_SYNC_INTERVAL", "").strip() or "180"))
# BUG FIX #5: Respect max file size so giant files don't stall uploads.
# Matches the 50 MB ceiling in openclaw-sync.py; override with DEVDATA_MAX_FILE_BYTES.
# snapshot() leaves out files larger than this and reports them as protected
# paths so that pruning does not delete their remote copies.
MAX_FILE_SIZE_BYTES = int(
(os.environ.get("DEVDATA_MAX_FILE_BYTES", "").strip() or str(50 * 1024 * 1024))
)
# Max stale files to delete per commit. Mirrors openclaw-sync.py behaviour.
# Override via DEVDATA_PRUNE_BATCH_SIZE.
PRUNE_BATCH_SIZE = int((os.environ.get("DEVDATA_PRUNE_BATCH_SIZE", "").strip() or "50"))
# Reserved filenames previously generated by DevData. Never copy or restore
# them as user content, and let prune delete any old remote marker files.
# should_skip() compares these against the final path component only.
RESERVED_SYNC_FILENAMES = {".huggingclaw-empty-dir"}
def is_true(value):
    """Interpret *value* as a boolean flag.

    The value is converted with ``str()``, stripped and lower-cased; it counts
    as true only when the result is one of "1", "true", "yes" or "on".
    """
    normalized = str(value).strip().lower()
    return normalized in ("1", "true", "yes", "on")
# Master on/off switch taken from the DEVDATA environment variable; treated as
# "on" when the variable is unset. enabled() combines it with further checks.
ENABLE = is_true(os.environ.get("DEVDATA", "on"))
def classify_error(exc: Exception) -> str:
    """Map an exception to a coarse category label used in log messages.

    Categories are checked in this order and the first match wins:
    "filesystem-permission", "network-provider", "safety-scan", otherwise
    "general". Apart from the PermissionError type check, classification is
    based on case-insensitive substrings of ``str(exc)``.
    """
    text = str(exc).lower()

    if isinstance(exc, PermissionError) or "permission denied" in text:
        return "filesystem-permission"

    network_markers = (
        "connection error",
        "fetch failed",
        "timeout",
        "temporarily unavailable",
        "network",
    )
    for marker in network_markers:
        if marker in text:
            return "network-provider"

    for marker in ("unsafe", "malware", "security"):
        if marker in text:
            return "safety-scan"

    return "general"
# Directory names that are always unsafe/noisy to persist. Keep this list to
# single path components; path-specific exclusions live in SKIP_PATH_PREFIXES.
# should_skip() rejects a path when ANY of its components equals one of these
# names exactly (case-sensitive), at any depth.
EXCLUDE = {
".cache",
"node_modules",
".npm",
".yarn",
"Trash", # covers .local/share/Trash and any other component named exactly "Trash"
".ipynb_checkpoints",
".openclaw",
"site-packages",
"__pycache__",
}
# Path prefixes, relative to JUPYTER_ROOT, that should not be synced. This keeps
# package/runtime trees out of the dataset without blocking real JupyterLab
# settings under .local/share/jupyter/lab/user-settings.
# Each entry is a tuple of leading path components; should_skip() skips a
# relative path whose first components equal one of these tuples.
ROOT_SKIP_PATH_PREFIXES = {
("app",),
("HuggingClaw",),
("HuggingClaw-Workspace",),
("browser-deps",),
}
# Further skipped prefixes; should_skip() matches them the same way as
# ROOT_SKIP_PATH_PREFIXES (against the leading components of the relative path).
SKIP_PATH_PREFIXES = {
(".jupyter", "runtime"),
(".local", "bin"),
(".local", "lib"),
("lib",),
}
# Everything at or below this prefix is skipped by should_skip() unless it also
# falls under one of JUPYTER_DATA_ALLOW_PREFIXES.
JUPYTER_DATA_DIR_PREFIX = (".local", "share", "jupyter")
# Sub-trees of the Jupyter data dir that ARE synced. iter_sync_tree() still
# descends through their ancestor directories in order to reach them.
JUPYTER_DATA_ALLOW_PREFIXES = {
(".local", "share", "jupyter", "lab", "user-settings"),
(".local", "share", "jupyter", "lab", "workspaces"),
}
def enabled():
    """Decide whether DevData sync should run at all.

    Sync runs only when ENABLE is set, dev mode is on, HF_TOKEN is non-empty
    and DATASET_NAME differs from BACKUP_DATASET_NAME. Dev mode is taken from
    HUGGINGCLAW_JUPYTER_ENABLED when that variable is non-blank, otherwise
    from DEV_MODE. When only the dataset-name check fails, an explanatory
    message is printed before returning False.
    """
    override = os.environ.get("HUGGINGCLAW_JUPYTER_ENABLED", "")
    dev_flag = override if override.strip() else os.environ.get("DEV_MODE", "")
    dev = is_true(dev_flag)

    has_token = bool(HF_TOKEN)
    separate_dataset = DATASET_NAME != BACKUP_DATASET_NAME

    # Everything else is in place, so the shared dataset name is the only blocker.
    if ENABLE and dev and has_token and not separate_dataset:
        print("DevData sync disabled: DEVDATA_DATASET_NAME must be separate from BACKUP_DATASET_NAME.")

    return all((ENABLE, dev, has_token, separate_dataset))
def validate_jupyter_paths() -> None:
    """Warn when the Jupyter directories cannot be created or written to.

    For JUPYTER_ROOT, /home/node/.jupyter and /home/node/.local/share/jupyter
    the directory is created if missing and a small probe file is written and
    removed again. Failures are reported with print() and never raised.
    """
    # JupyterLab theme/settings live under ~/.jupyter and ~/.local/share/jupyter.
    # If these are not writable, settings can appear to "reset" every restart.
    candidates = [
        JUPYTER_ROOT,
        Path("/home/node/.jupyter"),
        Path("/home/node/.local/share/jupyter"),
    ]
    for required in candidates:
        marker = required / ".devdata-write-check"
        try:
            required.mkdir(parents=True, exist_ok=True)
            marker.write_text("ok", encoding="utf-8")
            marker.unlink(missing_ok=True)
        except Exception as exc:
            print(
                f"DevData warning [{classify_error(exc)}]: {required} is not writable; "
                f"Jupyter settings may not persist ({exc})"
            )
def repo_id(api) -> str:
    """Return the dataset repo id as ``"<namespace>/<DATASET_NAME>"``.

    The namespace is HF_USERNAME when set; otherwise it is taken from the
    "name" (or, failing that, "user") entry of ``api.whoami()``.

    Raises:
        RuntimeError: if no namespace can be determined.
    """
    namespace = HF_USERNAME
    if not namespace:
        identity = api.whoami()
        namespace = identity.get("name") or identity.get("user") or ""

    if namespace:
        return f"{namespace}/{DATASET_NAME}"
    raise RuntimeError("Cannot resolve HF namespace for devdata sync")
# Filename patterns that must never be synced to a public/private HF Dataset.
# These are matched against the *name* of each path component (not the full path),
# so ".env" matches /home/node/.env and /home/node/subdir/.env alike.
# Names are lower-cased before matching (see _name_is_secret), so every pattern
# here must be written in lower case. Because directory components are checked
# too, a directory whose name matches hides everything beneath it.
import fnmatch as _fnmatch
SECRET_FILENAME_PATTERNS = {
".env", # dotenv files — almost always contain API keys
".env.*", # .env.local, .env.production, etc.
"id_rsa",
"id_dsa",
"id_ecdsa",
"id_ed25519",
"authorized_keys",
"known_hosts",
"secret",
"secrets",
"secret.*",
"*.secret",
"*_secret",
"*_secret.*",
"*-secret",
"*-secret.*",
"token",
"token.*",
"*.token",
"*_token",
"*_token.*",
"*-token",
"*-token.*",
"api_token",
"access_token",
"refresh_token",
"credentials", # common credential file names
"credentials.*",
"auth.json",
"auth.yaml",
"auth.yml",
"auth.toml",
"auth.ini",
"*.pem", # TLS/SSH private keys
"*.key", # generic key files
"*.p12", # PKCS#12 bundles
"*.pfx",
".netrc", # stores plaintext passwords
".htpasswd",
}
def _name_is_secret(name: str) -> bool:
    """Tell whether a single path component looks like a secret.

    *name* is lower-cased and tested against every glob in
    SECRET_FILENAME_PATTERNS; the first hit returns True.
    """
    candidate = name.lower()
    for pattern in SECRET_FILENAME_PATTERNS:
        if _fnmatch.fnmatch(candidate, pattern):
            return True
    return False
def _matches_prefix(parts: tuple[str, ...], prefix: tuple[str, ...]) -> bool:
return len(parts) >= len(prefix) and parts[:len(prefix)] == prefix
def _is_prefix_of(parts: tuple[str, ...], full_path: tuple[str, ...]) -> bool:
return len(parts) <= len(full_path) and full_path[:len(parts)] == parts
def should_skip(p: Path):
    """Return True when the relative path *p* must not be synced or restored.

    A path is skipped when any of the following holds:
      * its final component is a reserved sync marker filename;
      * any component is listed in EXCLUDE;
      * it starts with a ROOT_SKIP_PATH_PREFIXES or SKIP_PATH_PREFIXES entry;
      * it lies under JUPYTER_DATA_DIR_PREFIX but not under one of the
        JUPYTER_DATA_ALLOW_PREFIXES;
      * any component matches a secret filename pattern.
    """
    # Reserved sync helper files are not user data. Old datasets may still have
    # these markers, but fresh snapshots no longer create them.
    if p.name in RESERVED_SYNC_FILENAMES:
        return True

    components = p.parts

    # Hard-coded noisy/unsafe directory names, at any depth.
    if not EXCLUDE.isdisjoint(components):
        return True

    # Package/runtime trees identified by their leading path components.
    for blocked in (*ROOT_SKIP_PATH_PREFIXES, *SKIP_PATH_PREFIXES):
        if _matches_prefix(components, blocked):
            return True

    # Inside the Jupyter data dir only the allow-listed sub-trees are kept.
    if _matches_prefix(components, JUPYTER_DATA_DIR_PREFIX):
        allow_listed = any(
            _matches_prefix(components, allowed) for allowed in JUPYTER_DATA_ALLOW_PREFIXES
        )
        if not allow_listed:
            return True

    # Finally, skip anything with a component that looks like a secret file/dir.
    for component in components:
        if _name_is_secret(component):
            return True
    return False
def iter_sync_tree(root: Path):
    """Walk *root* and yield every directory and file that should be synced.

    Excluded directories are removed from the walk, so their contents are never
    visited. Symlinks (to files or directories) are never yielded. For each
    visited directory the kept sub-directories are yielded first, then the kept
    files, both in sorted name order. Nothing is yielded if *root* is missing.
    """
    if not root.exists():
        return

    for current, subdirs, files in os.walk(root):
        here = Path(current)
        try:
            here_rel = here.relative_to(root)
        except ValueError:
            here_rel = Path()

        survivors: list[str] = []
        for name in sorted(subdirs):
            if (here / name).is_symlink():
                continue
            rel_path = here_rel / name
            if should_skip(rel_path):
                # should_skip() is true for the parents of the allow-listed
                # Jupyter settings trees (e.g. .local/share/jupyter), yet the
                # walk has to pass through them to reach lab/user-settings and
                # lab/workspaces, so ancestors of an allowed prefix are kept.
                leads_to_allowed = any(
                    _is_prefix_of(rel_path.parts, allowed)
                    for allowed in JUPYTER_DATA_ALLOW_PREFIXES
                )
                if not leads_to_allowed:
                    continue
            survivors.append(name)

        # In-place assignment is what tells os.walk not to descend further.
        subdirs[:] = survivors
        for name in survivors:
            yield here / name

        for name in sorted(files):
            candidate = here / name
            if candidate.is_symlink():
                continue
            if should_skip(here_rel / name):
                continue
            yield candidate
def snapshot(src: Path, dst: Path) -> tuple[bool, set[str]]:
    """Copy the syncable part of *src* into the staging directory *dst*.

    Args:
        src: Root of the live tree to snapshot.
        dst: Existing directory that receives the copy.

    Returns:
        A ``(had_copy_failures, protected_paths)`` pair. ``had_copy_failures``
        is True when at least one file could not be stat'ed or copied.
        ``protected_paths`` holds the POSIX-style relative paths of files that
        were left out of the snapshot on purpose (larger than
        MAX_FILE_SIZE_BYTES) or by failure; callers pass it to the prune step
        so the remote copies of those files are not deleted.
    """
    had_copy_failures = False
    protected_large_files: set[str] = set()

    for p in iter_sync_tree(src):
        rel = p.relative_to(src)
        target = dst / rel

        if p.is_dir():
            # Keep parent directories for files copied later in this snapshot.
            # Empty folders are intentionally not represented in the git-backed
            # HF Dataset; once a folder contains syncable files, those files
            # carry the folder path naturally.
            target.mkdir(parents=True, exist_ok=True)
            continue
        if not p.is_file():
            # Vanished since the walk, or not a regular file: nothing to copy.
            continue

        rel_posix = rel.as_posix()

        # BUG FIX #5: Skip files that exceed the size limit.
        try:
            too_large = p.stat().st_size > MAX_FILE_SIZE_BYTES
        except OSError:
            had_copy_failures = True
            protected_large_files.add(rel_posix)
            continue
        if too_large:
            protected_large_files.add(rel_posix)
            continue

        target.parent.mkdir(parents=True, exist_ok=True)
        try:
            shutil.copy2(p, target)
        except OSError:
            had_copy_failures = True
            protected_large_files.add(rel_posix)
            # A copy that fails part-way can leave a truncated file behind.
            # Remove it so the upload does not overwrite the good remote copy
            # that the "protected" marking is meant to preserve.
            try:
                target.unlink(missing_ok=True)
            except OSError:
                # Best effort only; the whole staging dir is discarded later.
                pass

    return had_copy_failures, protected_large_files
def is_jupyter_running(port: int = 8888) -> bool:
    """Return True if something accepts TCP connections on 127.0.0.1:*port*.

    The probe opens a connection with a 2-second timeout and closes it again
    straight away; any OSError (refused, timed out, ...) is reported as False.
    It exists as a safety probe for telling whether JupyterLab is already
    live, because overwriting files under JUPYTER_ROOT while JupyterLab is
    running was observed to break it (BUG FIX #2 / #3 notes).
    """
    address = ("127.0.0.1", port)
    try:
        connection = socket.create_connection(address, timeout=2)
        connection.close()
    except OSError:
        return False
    return True
def restore_once(api, rid: str):
    """Download dataset *rid* and copy its contents into JUPYTER_ROOT.

    The dataset is fetched into a temporary directory, which is always removed
    afterwards. ``.gitattributes`` and every path rejected by should_skip() are
    left out. Files that cannot be written are reported and skipped; a missing
    repository or any other error is printed rather than raised.

    Args:
        api: Accepted for signature compatibility; not used by this function.
        rid: Dataset repo id, e.g. "namespace/name".
    """
    from huggingface_hub import snapshot_download
    from huggingface_hub.errors import RepositoryNotFoundError

    staging = Path(tempfile.mkdtemp(prefix="devdata-restore-"))
    try:
        snapshot_download(
            repo_id=rid,
            repo_type="dataset",
            local_dir=str(staging),
            local_dir_use_symlinks=False,
            token=HF_TOKEN,
        )
        for entry in staging.rglob("*"):
            relative = entry.relative_to(staging)
            if str(relative) == ".gitattributes" or should_skip(relative):
                continue

            destination = JUPYTER_ROOT / relative
            if entry.is_dir():
                destination.mkdir(parents=True, exist_ok=True)
                continue
            if not entry.is_file():
                continue

            destination.parent.mkdir(parents=True, exist_ok=True)
            try:
                shutil.copy2(entry, destination)
            except OSError as exc:
                # One unwritable file must not abort the rest of the restore.
                print(f"DevData restore skip [{classify_error(exc)}] (cannot write {destination}): {exc}")
        print(f"DevData restored from {rid}")
    except RepositoryNotFoundError:
        print(f"DevData dataset not found yet: {rid}")
    except Exception as exc:
        print(f"DevData restore warning [{classify_error(exc)}]: {exc}")
    finally:
        shutil.rmtree(staging, ignore_errors=True)
def prune_remote_deleted_files(
    api,
    rid: str,
    snapshot_dir: Path,
    skip_prefixes: set[str] | None = None,
    protected_paths: set[str] | None = None,
) -> None:
    """Delete from the HF dataset any files the user deleted locally.

    Without this, deleted files re-appear on the next Space restart because
    restore_once() copies everything in the dataset back to disk.

    A remote file is considered stale when it is absent from *snapshot_dir*,
    is not ``.gitattributes``, is not listed in *protected_paths* and does not
    equal or live under any of *skip_prefixes*. Stale files are removed with
    ``api.create_commit`` using CommitOperationDelete operations, in commits of
    at most PRUNE_BATCH_SIZE deletions each (a configured size below 1 is
    treated as 1).

    NOTE(review): an empty *snapshot_dir* makes every unprotected remote file
    stale, so a missing or unreadable source tree prunes the whole dataset.

    Args:
        api: Client providing ``list_repo_files`` and ``create_commit``.
        rid: Dataset repo id.
        snapshot_dir: Directory holding the snapshot that was just uploaded.
        skip_prefixes: Remote path prefixes that must never be pruned.
        protected_paths: Exact remote paths that must never be pruned.

    Errors are printed as warnings and never raised.
    """
    try:
        # Imported here, like restore_once() does, so the function does not
        # depend on names that only the __main__ block binds.
        from huggingface_hub import CommitOperationDelete

        skip_prefixes = skip_prefixes or set()
        protected_paths = protected_paths or set()
        # Guard against a zero/negative setting: zero would divide by zero and
        # a negative size would delete nothing while still reporting success.
        batch_size = max(1, PRUNE_BATCH_SIZE)

        local_files = {
            p.relative_to(snapshot_dir).as_posix()
            for p in snapshot_dir.rglob("*")
            if p.is_file()
        }
        remote_files = list(api.list_repo_files(repo_id=rid, repo_type="dataset"))
        stale = [
            f for f in remote_files
            if f not in local_files
            and f != ".gitattributes"
            and f not in protected_paths
            and not any(f == prefix or f.startswith(prefix + "/") for prefix in skip_prefixes)
        ]
        if not stale:
            return

        total = len(stale)
        batches = [stale[start:start + batch_size] for start in range(0, total, batch_size)]
        num_batches = len(batches)
        ts = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

        for batch_number, batch in enumerate(batches, start=1):
            batch_label = f" (batch {batch_number}/{num_batches})" if num_batches > 1 else ""
            operations = [CommitOperationDelete(path_in_repo=p) for p in batch]
            api.create_commit(
                repo_id=rid,
                repo_type="dataset",
                operations=operations,
                commit_message=f"DevData prune {len(batch)} deleted file(s) {ts}{batch_label}",
            )

        print(f"DevData pruned {total} deleted file(s) from {rid}"
              + (f" in {num_batches} batches" if num_batches > 1 else ""))
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData prune warning [{kind}]: {exc}")
def sync_loop(api, rid: str):
    """Run forever: snapshot JUPYTER_ROOT, upload it, prune, then sleep.

    Each iteration stages a snapshot in a fresh temporary directory, uploads
    it to dataset *rid*, prunes remote files that no longer exist locally and
    removes the temporary directory. Any error inside an iteration is printed
    as a warning; the loop then sleeps INTERVAL seconds and tries again.
    """
    while True:
        workdir = Path(tempfile.mkdtemp(prefix="devdata-snap-"))
        try:
            copy_failed, keep_remote = snapshot(JUPYTER_ROOT, workdir)

            stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
            upload_folder(
                folder_path=str(workdir),
                repo_id=rid,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message=f"DevData sync {stamp}",
            )
            print(f"DevData synced to {rid}")

            # BUG FIX #6: Prune files deleted locally so they don't reappear on restore.
            guarded_prefixes: set[str] = set()
            if copy_failed:
                # Snapshot copy races can produce a partial view; avoid pruning
                # runtime-heavy Jupyter paths in that case.
                guarded_prefixes |= {"runtime", ".local/share/jupyter/runtime"}
                print("DevData snapshot had copy failures; pruning stale files with runtime-path safeguards.")

            prune_remote_deleted_files(
                api,
                rid,
                workdir,
                skip_prefixes=guarded_prefixes,
                protected_paths=keep_remote,
            )
        except Exception as exc:
            print(f"DevData sync warning [{classify_error(exc)}]: {exc}")
        finally:
            shutil.rmtree(workdir, ignore_errors=True)

        time.sleep(INTERVAL)
if __name__ == "__main__":
    # Entry point. Two modes:
    #   --restore : restore the dataset into JUPYTER_ROOT once, then exit 0.
    #   (default) : run the background upload/prune loop forever.
    if not enabled():
        print("DevData sync disabled.")
        raise SystemExit(0)

    # Imported only once sync is known to be enabled. Keep every name here:
    # helper functions in this file may resolve names such as upload_folder
    # as module globals bound by these imports.
    from huggingface_hub import CommitOperationDelete, HfApi, upload_folder, snapshot_download
    from huggingface_hub.errors import RepositoryNotFoundError

    api = HfApi(token=HF_TOKEN)
    rid = repo_id(api)

    # Create the (private) dataset repo on first use.
    try:
        api.repo_info(repo_id=rid, repo_type="dataset")
    except RepositoryNotFoundError:
        api.create_repo(repo_id=rid, repo_type="dataset", private=True)

    # -- BUG FIX #3: Restore must happen BEFORE JupyterLab starts --------------
    # The original code always called restore_once() here, but (per the
    # original notes) start.sh starts JupyterLab long before the gateway is
    # ready and this script is launched. That made restore_once() always run
    # while JupyterLab was live, which overwrote its runtime/ sockets and
    # settings -> JupyterLab died.
    #
    # Fix: start.sh is expected to call `python3 jupyter-devdata-sync.py --restore`
    # BEFORE starting JupyterLab. That --restore invocation does the restore
    # and exits. The background invocation (no --restore flag) skips straight
    # to sync_loop so it never restores files while JupyterLab is running.
    #
    # NOTE(review): the --restore branch does not itself probe for a running
    # JupyterLab; is_jupyter_running() is only used below to pick a log line.
    # Skipping the restore would let the following sync prune the remote
    # dataset down to whatever is on disk, so no abort is performed here.
    validate_jupyter_paths()

    if "--restore" in sys.argv:
        # Synchronous restore mode: called by start.sh before JupyterLab.
        restore_once(api, rid)
        raise SystemExit(0)

    # Normal background sync mode: no restore; go straight to the upload loop.
    if is_jupyter_running():
        print("DevData: background sync started (JupyterLab is live, restore already done by --restore).")
    else:
        # Fallback: JupyterLab not detected. Should not normally happen
        # because start.sh calls --restore before starting JupyterLab and then
        # waits for the gateway before launching this background process.
        # Log a warning and proceed to sync; do NOT restore to avoid racing
        # with a JupyterLab that may be in the middle of starting up.
        print("DevData: WARNING — JupyterLab not detected on port 8888. Skipping restore to be safe; starting sync loop.")
    sync_loop(api, rid)