HuggingClaw

Building

HuggingClaw / jupyter-devdata-sync.py

Anurag

Prune DevData traversal excludes

ff724b2 about 1 month ago

18.1 kB

	#!/usr/bin/env python3
	from __future__ import annotations

	import os, shutil, socket, sys, tempfile, time
	from pathlib import Path

	def env_int(name: str, default: int, minimum: int = 1) -> int:
	    """Read an integer setting from the environment, tolerating bad values.

	    A malformed value must not crash the sync daemon at import time, and a
	    zero/negative value must not reach code that sleeps or divides by it.

	    Args:
	        name: Environment variable to read.
	        default: Value used when the variable is unset, blank, or not an integer.
	        minimum: Smallest accepted value; anything lower is raised to it.

	    Returns:
	        The parsed integer, ``default`` when the variable is missing or
	        invalid, or ``minimum`` when the parsed value is below it.
	    """
	    raw = os.environ.get(name, "").strip()
	    if not raw:
	        return default
	    try:
	        value = int(raw)
	    except ValueError:
	        print(f"DevData warning [config]: {name}={raw!r} is not an integer; using {default}")
	        return default
	    if value < minimum:
	        print(f"DevData warning [config]: {name}={value} is below {minimum}; using {minimum}")
	        return minimum
	    return value


	# Credentials and dataset naming. Blank values fall through to the next
	# candidate variable and finally to the built-in default name.
	HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
	HF_USERNAME = os.environ.get("HF_USERNAME", "").strip() or os.environ.get("SPACE_AUTHOR_NAME", "").strip()
	DATASET_NAME = os.environ.get("DEVDATA_DATASET_NAME", "").strip() or "huggingclaw-devdata"
	BACKUP_DATASET_NAME = os.environ.get("BACKUP_DATASET_NAME", "").strip() or os.environ.get("BACKUP_DATASET", "").strip() or "huggingclaw-backup"
	# Directory tree that is snapshotted to (and restored from) the dataset.
	JUPYTER_ROOT = Path(os.environ.get("JUPYTER_ROOT_DIR", "/home/node")).resolve()
	# Seconds to wait between sync cycles; at least 1 so the loop never spins.
	INTERVAL = env_int("DEVDATA_SYNC_INTERVAL", 180)
	# BUG FIX #5: Respect max file size so giant files don't stall uploads.
	# Matches the 50 MB ceiling in openclaw-sync.py; override with DEVDATA_MAX_FILE_BYTES.
	MAX_FILE_SIZE_BYTES = env_int("DEVDATA_MAX_FILE_BYTES", 50 * 1024 * 1024, minimum=0)
	# Max stale files to delete per commit. Mirrors openclaw-sync.py behaviour.
	# Override via DEVDATA_PRUNE_BATCH_SIZE.
	PRUNE_BATCH_SIZE = env_int("DEVDATA_PRUNE_BATCH_SIZE", 50)
	# Reserved filenames previously generated by DevData. Never copy or restore
	# them as user content, and let prune delete any old remote marker files.
	RESERVED_SYNC_FILENAMES = {".huggingclaw-empty-dir"}

	def is_true(value):
	    """Return True when ``value`` spells an affirmative flag.

	    The value is stringified, stripped and lower-cased before it is compared
	    against "1", "true", "yes" and "on"; everything else is False.
	    """
	    normalized = str(value).strip().lower()
	    return normalized in ("1", "true", "yes", "on")


	# Master switch for DevData; treated as on when DEVDATA is unset.
	ENABLE = is_true(os.environ.get("DEVDATA", "on"))


	def classify_error(exc: Exception) -> str:
	    """Bucket an exception into a short label used in log messages.

	    Args:
	        exc: The exception to classify.

	    Returns:
	        "filesystem-permission", "network-provider", "safety-scan" or
	        "general". Checks run in that order, so the first match wins.
	    """
	    msg = str(exc).lower()
	    if isinstance(exc, PermissionError) or "permission denied" in msg:
	        return "filesystem-permission"
	    # Match on the exception type as well as the text: a socket timeout
	    # stringifies as "timed out", which the "timeout" substring alone misses.
	    network_hints = (
	        "connection error",
	        "fetch failed",
	        "timeout",
	        "timed out",
	        "temporarily unavailable",
	        "network",
	    )
	    if isinstance(exc, (ConnectionError, TimeoutError)) or any(k in msg for k in network_hints):
	        return "network-provider"
	    if "unsafe" in msg or "malware" in msg or "security" in msg:
	        return "safety-scan"
	    return "general"

	# Single path-component names that are never persisted, wherever they occur
	# in the tree. Location-specific rules belong in the prefix sets below.
	EXCLUDE = {
	    "__pycache__",
	    ".cache",
	    ".ipynb_checkpoints",
	    ".npm",
	    ".openclaw",
	    ".yarn",
	    "node_modules",
	    "site-packages",
	    "Trash",  # covers .local/share/Trash and any nested trash folder
	}

	# Top-level entries under JUPYTER_ROOT that are skipped entirely.
	ROOT_SKIP_PATH_PREFIXES = {
	    ("HuggingClaw",),
	    ("HuggingClaw-Workspace",),
	    ("app",),
	    ("browser-deps",),
	}

	# Further path prefixes, relative to JUPYTER_ROOT, that are not synced. They
	# keep package/runtime trees out of the dataset.
	SKIP_PATH_PREFIXES = {
	    (".jupyter", "runtime"),
	    (".local", "bin"),
	    (".local", "lib"),
	    ("lib",),
	}

	# The Jupyter data directory is skipped as a whole, except for the two
	# subtrees listed in JUPYTER_DATA_ALLOW_PREFIXES (JupyterLab user settings
	# and workspaces).
	JUPYTER_DATA_DIR_PREFIX = (".local", "share", "jupyter")
	JUPYTER_DATA_ALLOW_PREFIXES = {
	    (".local", "share", "jupyter", "lab", "workspaces"),
	    (".local", "share", "jupyter", "lab", "user-settings"),
	}


	def enabled():
	    """Decide whether DevData sync should run in this process.

	    Sync needs the DEVDATA switch on, dev mode on, an HF token, and a devdata
	    dataset name different from the backup dataset name. Dev mode comes from
	    HUGGINGCLAW_JUPYTER_ENABLED when that variable is non-blank, otherwise
	    from DEV_MODE.

	    Returns:
	        True when every prerequisite holds, else False. When only the dataset
	        names collide, a message explaining that is printed first.
	    """
	    override = os.environ.get("HUGGINGCLAW_JUPYTER_ENABLED", "")
	    dev_flag = override if override.strip() else os.environ.get("DEV_MODE", "")
	    if not (ENABLE and is_true(dev_flag) and bool(HF_TOKEN)):
	        return False
	    if DATASET_NAME == BACKUP_DATASET_NAME:
	        print("DevData sync disabled: DEVDATA_DATASET_NAME must be separate from BACKUP_DATASET_NAME.")
	        return False
	    return True

	def validate_jupyter_paths() -> None:
	    """Warn about Jupyter directories that cannot be written to.

	    Each directory is created if missing and probed by writing and removing
	    a small marker file. A failure is only logged, never raised: when these
	    locations are read-only, JupyterLab settings can appear to reset on every
	    restart.
	    """
	    candidates = (
	        JUPYTER_ROOT,
	        Path("/home/node/.jupyter"),
	        Path("/home/node/.local/share/jupyter"),
	    )
	    for directory in candidates:
	        try:
	            directory.mkdir(parents=True, exist_ok=True)
	            marker = directory / ".devdata-write-check"
	            marker.write_text("ok", encoding="utf-8")
	            marker.unlink(missing_ok=True)
	        except Exception as err:
	            label = classify_error(err)
	            print(f"DevData warning [{label}]: {directory} is not writable; Jupyter settings may not persist ({err})")

	def repo_id(api) -> str:
	    """Build the ``namespace/dataset`` id of the DevData dataset.

	    Args:
	        api: Object exposing ``whoami()``; only consulted when HF_USERNAME is
	            empty.

	    Returns:
	        ``"<namespace>/<DATASET_NAME>"``.

	    Raises:
	        RuntimeError: If no namespace can be determined.
	    """
	    if HF_USERNAME:
	        return f"{HF_USERNAME}/{DATASET_NAME}"
	    profile = api.whoami()
	    namespace = profile.get("name") or profile.get("user") or ""
	    if not namespace:
	        raise RuntimeError("Cannot resolve HF namespace for devdata sync")
	    return f"{namespace}/{DATASET_NAME}"

	# Filename patterns that must never be synced to a public/private HF Dataset.
	# These are matched against the name of each path component (not the full path),
	# so ".env" matches /home/node/.env and /home/node/subdir/.env alike.
	import fnmatch as _fnmatch

	# fnmatch-style patterns, all lower-case, tested against each individual path
	# component. The "*_secret.*" / "*-secret.*" / "*_token.*" / "*-token.*"
	# entries need their wildcards: without them they only match a file named
	# literally "_secret." and names like "client_secret.json" slip through.
	SECRET_FILENAME_PATTERNS = {
	    ".env",  # dotenv files — almost always contain API keys
	    ".env.*",  # .env.local, .env.production, etc.
	    "id_rsa",
	    "id_dsa",
	    "id_ecdsa",
	    "id_ed25519",
	    "authorized_keys",
	    "known_hosts",
	    "secret",
	    "secrets",
	    "secret.*",
	    "*.secret",
	    "*_secret",
	    "*_secret.*",
	    "*-secret",
	    "*-secret.*",
	    "token",
	    "token.*",
	    "*.token",
	    "*_token",
	    "*_token.*",
	    "*-token",
	    "*-token.*",
	    "api_token",
	    "access_token",
	    "refresh_token",
	    "credentials",  # common credential file names
	    "credentials.*",
	    "auth.json",
	    "auth.yaml",
	    "auth.yml",
	    "auth.toml",
	    "auth.ini",
	    "*.pem",  # TLS/SSH private keys
	    "*.key",  # generic key files
	    "*.p12",  # PKCS#12 bundles
	    "*.pfx",
	    ".netrc",  # stores plaintext passwords
	    ".htpasswd",
	}


	def _name_is_secret(name: str) -> bool:
	    """Tell whether a single path component looks like a secret.

	    The name is lower-cased and tested against every pattern in
	    SECRET_FILENAME_PATTERNS; the first hit returns True.
	    """
	    candidate = name.lower()
	    for pattern in SECRET_FILENAME_PATTERNS:
	        if _fnmatch.fnmatch(candidate, pattern):
	            return True
	    return False


	def _matches_prefix(parts: tuple[str, ...], prefix: tuple[str, ...]) -> bool:
	    """Return True when ``parts`` starts with every component of ``prefix``."""
	    width = len(prefix)
	    if width > len(parts):
	        return False
	    return parts[:width] == prefix


	def _is_prefix_of(parts: tuple[str, ...], full_path: tuple[str, ...]) -> bool:
	    """Return True when ``parts`` is a leading slice of ``full_path``.

	    This is the mirror image of ``_matches_prefix``: here the first argument
	    is the shorter (ancestor) path.
	    """
	    depth = len(parts)
	    if depth > len(full_path):
	        return False
	    return full_path[:depth] == parts


	def should_skip(p: Path):
	    """Decide whether a path relative to the sync root must be left out.

	    A path is skipped when any of the following holds:
	      * its name is a reserved DevData marker file;
	      * any component is listed in EXCLUDE;
	      * it sits under a ROOT_SKIP_PATH_PREFIXES or SKIP_PATH_PREFIXES entry;
	      * it sits under the Jupyter data dir but outside the allowed subtrees;
	      * any component matches a secret filename pattern.

	    Returns:
	        True to skip the path, False to keep it.
	    """
	    # Marker files written by older versions are not user data.
	    if p.name in RESERVED_SYNC_FILENAMES:
	        return True

	    components = p.parts
	    if not EXCLUDE.isdisjoint(components):
	        return True

	    for root_prefix in ROOT_SKIP_PATH_PREFIXES:
	        if _matches_prefix(components, root_prefix):
	            return True

	    if _matches_prefix(components, JUPYTER_DATA_DIR_PREFIX):
	        in_allowed_subtree = any(
	            _matches_prefix(components, allowed) for allowed in JUPYTER_DATA_ALLOW_PREFIXES
	        )
	        if not in_allowed_subtree:
	            return True

	    for skip_prefix in SKIP_PATH_PREFIXES:
	        if _matches_prefix(components, skip_prefix):
	            return True

	    # Finally, reject anything whose file or directory name looks like a secret.
	    return any(_name_is_secret(component) for component in components)


	def iter_sync_tree(root: Path):
	    """Walk ``root`` and yield the directories and files that should be synced.

	    Excluded directories are removed from the walk so their contents are
	    never visited. Symlinks (files and directories) are never yielded. For
	    each visited directory the kept sub-directories are yielded first, in
	    sorted order, followed by the kept files, also sorted. Nothing is yielded
	    when ``root`` does not exist.
	    """
	    if not root.exists():
	        return

	    for current, subdirs, files in os.walk(root):
	        here = Path(current)
	        try:
	            here_rel = here.relative_to(root)
	        except ValueError:
	            here_rel = Path()

	        descend: list[str] = []
	        for name in sorted(subdirs):
	            if (here / name).is_symlink():
	                continue
	            rel = here_rel / name
	            if should_skip(rel):
	                # should_skip() rejects the parents of the allowed Jupyter
	                # settings paths by design; keep walking through them so
	                # lab/user-settings and lab/workspaces stay reachable.
	                leads_to_allowed = any(
	                    _is_prefix_of(rel.parts, allowed) for allowed in JUPYTER_DATA_ALLOW_PREFIXES
	                )
	                if not leads_to_allowed:
	                    continue
	            descend.append(name)
	        # In-place assignment tells os.walk which sub-directories to enter.
	        subdirs[:] = descend

	        yield from (here / name for name in descend)

	        for name in sorted(files):
	            entry = here / name
	            if entry.is_symlink() or should_skip(here_rel / name):
	                continue
	            yield entry


	def snapshot(src: Path, dst: Path) -> tuple[bool, set[str]]:
	    """Copy the syncable part of ``src`` into the staging directory ``dst``.

	    Args:
	        src: Root of the live tree to snapshot.
	        dst: Staging directory that receives the copy.

	    Returns:
	        A pair ``(had_copy_failures, protected)``. ``had_copy_failures`` is
	        True when a file could not be stat'ed or copied. ``protected`` holds
	        the POSIX-style relative paths of files that were left out of the
	        snapshot because they exceed MAX_FILE_SIZE_BYTES or failed to copy.
	    """
	    copy_failed = False
	    left_out: set[str] = set()
	    for entry in iter_sync_tree(src):
	        relative = entry.relative_to(src)
	        destination = dst / relative

	        if entry.is_dir():
	            # Pre-create parents for files copied later. Empty folders are
	            # intentionally not represented in the git-backed HF Dataset.
	            destination.mkdir(parents=True, exist_ok=True)
	            continue
	        if not entry.is_file():
	            continue

	        key = relative.as_posix()
	        try:
	            too_large = entry.stat().st_size > MAX_FILE_SIZE_BYTES
	        except OSError:
	            copy_failed = True
	            left_out.add(key)
	            continue
	        if too_large:
	            # BUG FIX #5: oversized files are skipped, not uploaded.
	            left_out.add(key)
	            continue

	        destination.parent.mkdir(parents=True, exist_ok=True)
	        try:
	            shutil.copy2(entry, destination)
	        except OSError:
	            copy_failed = True
	            left_out.add(key)
	    return copy_failed, left_out

	def is_jupyter_running(port: int = 8888) -> bool:
	    """Probe whether something accepts TCP connections on 127.0.0.1:``port``.

	    BUG FIX #2 (safety net): this probe exists so a restore can be kept from
	    overwriting files under JUPYTER_ROOT while JupyterLab is live, which
	    corrupts its state. The --restore / sync separation from BUG FIX #3 is
	    the primary guard; this check is the backstop.

	    Args:
	        port: Local TCP port to test; 8888 by default.

	    Returns:
	        True if a connection is established within two seconds, False if the
	        attempt fails with OSError.
	    """
	    try:
	        probe = socket.create_connection(("127.0.0.1", port), timeout=2)
	    except OSError:
	        return False
	    probe.close()
	    return True

	def restore_once(api, rid: str):
	    """Download the DevData dataset and copy its files into JUPYTER_ROOT.

	    The dataset is fetched into a temporary directory, then every entry that
	    passes ``should_skip`` (and is not ``.gitattributes``) is copied over the
	    live tree. All failures are logged and swallowed; the temporary directory
	    is always removed.

	    Args:
	        api: Unused here; kept so the call signature stays unchanged.
	        rid: ``namespace/name`` id of the dataset to restore from.
	    """
	    # BUG FIX #2 (safety net): never overwrite files while JupyterLab is live.
	    # Check before downloading anything so a skipped restore costs nothing.
	    if is_jupyter_running():
	        print("DevData restore skipped: JupyterLab is already running; not overwriting live files.")
	        return

	    from huggingface_hub import snapshot_download
	    from huggingface_hub.errors import RepositoryNotFoundError

	    tmp = Path(tempfile.mkdtemp(prefix="devdata-restore-"))
	    try:
	        # NOTE(review): local_dir_use_symlinks is deprecated in recent
	        # huggingface_hub releases — confirm the installed version accepts it.
	        snapshot_download(
	            repo_id=rid,
	            repo_type="dataset",
	            local_dir=str(tmp),
	            local_dir_use_symlinks=False,
	            token=HF_TOKEN,
	        )
	        for p in tmp.rglob("*"):
	            rel = p.relative_to(tmp)
	            if str(rel) == ".gitattributes":
	                continue
	            # Apply the upload-side filter, so excluded or secret-looking
	            # paths already in the dataset are not written back to disk.
	            if should_skip(rel):
	                continue
	            target = JUPYTER_ROOT / rel
	            if p.is_dir():
	                target.mkdir(parents=True, exist_ok=True)
	            elif p.is_file():
	                target.parent.mkdir(parents=True, exist_ok=True)
	                try:
	                    shutil.copy2(p, target)
	                except OSError as exc:
	                    # One unwritable file must not abort the whole restore.
	                    kind = classify_error(exc)
	                    print(f"DevData restore skip [{kind}] (cannot write {target}): {exc}")
	        print(f"DevData restored from {rid}")
	    except RepositoryNotFoundError:
	        print(f"DevData dataset not found yet: {rid}")
	    except Exception as exc:
	        kind = classify_error(exc)
	        print(f"DevData restore warning [{kind}]: {exc}")
	    finally:
	        shutil.rmtree(tmp, ignore_errors=True)

	def prune_remote_deleted_files(
	    api,
	    rid: str,
	    snapshot_dir: Path,
	    skip_prefixes: set[str] | None = None,
	    protected_paths: set[str] | None = None,
	) -> None:
	    """Delete from the HF dataset any files the user deleted locally.

	    Without this, deleted files re-appear on the next Space restart because
	    restore_once() copies everything in the dataset back to disk.

	    Deletions are sent with ``api.create_commit`` and ``CommitOperationDelete``
	    in chunks of PRUNE_BATCH_SIZE, one commit per chunk. Any failure is
	    logged and swallowed so pruning never breaks the sync loop.

	    Args:
	        api: Client exposing ``list_repo_files`` and ``create_commit``.
	        rid: ``namespace/name`` id of the dataset.
	        snapshot_dir: Local snapshot; a remote file is stale when it has no
	            counterpart here.
	        skip_prefixes: Remote path prefixes that must not be pruned.
	        protected_paths: Exact remote paths that must not be pruned (for
	            example files left out of the snapshot on purpose).
	    """
	    try:
	        # Imported here rather than relied on as a global: the module-level
	        # import only exists when the file runs as __main__.
	        from huggingface_hub import CommitOperationDelete

	        skip_prefixes = skip_prefixes or set()
	        protected_paths = protected_paths or set()
	        local_files = {
	            p.relative_to(snapshot_dir).as_posix()
	            for p in snapshot_dir.rglob("*")
	            if p.is_file()
	        }
	        remote_files = list(api.list_repo_files(repo_id=rid, repo_type="dataset"))
	        stale = [
	            f for f in remote_files
	            if f not in local_files
	            and f != ".gitattributes"
	            and f not in protected_paths
	            and not any(f == prefix or f.startswith(prefix + "/") for prefix in skip_prefixes)
	        ]
	        if not stale:
	            return

	        # A misconfigured batch size of zero or less would divide by zero.
	        batch_size = max(1, PRUNE_BATCH_SIZE)
	        total = len(stale)
	        num_batches = (total + batch_size - 1) // batch_size
	        ts = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
	        for batch_idx, start in enumerate(range(0, total, batch_size)):
	            batch = stale[start:start + batch_size]
	            batch_label = f" (batch {batch_idx + 1}/{num_batches})" if num_batches > 1 else ""
	            operations = [CommitOperationDelete(path_in_repo=p) for p in batch]
	            api.create_commit(
	                repo_id=rid,
	                repo_type="dataset",
	                operations=operations,
	                commit_message=f"DevData prune {len(batch)} deleted file(s) {ts}{batch_label}",
	            )
	        print(f"DevData pruned {total} deleted file(s) from {rid}"
	              + (f" in {num_batches} batches" if num_batches > 1 else ""))
	    except Exception as exc:
	        kind = classify_error(exc)
	        print(f"DevData prune warning [{kind}]: {exc}")

	def _sync_once(api, rid: str, tmp: Path) -> None:
	    """Run one snapshot → upload → prune cycle using ``tmp`` as staging dir."""
	    # If the root is missing, the snapshot is empty and pruning would delete
	    # every file in the dataset. Skip the cycle and leave remote data alone.
	    if not JUPYTER_ROOT.is_dir():
	        print(f"DevData sync skipped: {JUPYTER_ROOT} is not a directory; leaving remote data untouched.")
	        return

	    # Imported here rather than relied on as a global: the module-level
	    # import only exists when the file runs as __main__.
	    from huggingface_hub import upload_folder

	    had_copy_failures, protected_large_files = snapshot(JUPYTER_ROOT, tmp)
	    upload_folder(
	        folder_path=str(tmp),
	        repo_id=rid,
	        repo_type="dataset",
	        token=HF_TOKEN,
	        commit_message=f"DevData sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
	    )
	    print(f"DevData synced to {rid}")

	    # BUG FIX #6: Prune files deleted locally so they don't reappear on restore.
	    skip_prune_prefixes: set[str] = set()
	    if had_copy_failures:
	        # Snapshot copy races can produce a partial view; avoid pruning
	        # runtime-heavy Jupyter paths in that case.
	        skip_prune_prefixes.update({"runtime", ".local/share/jupyter/runtime"})
	        print("DevData snapshot had copy failures; pruning stale files with runtime-path safeguards.")
	    prune_remote_deleted_files(
	        api,
	        rid,
	        tmp,
	        skip_prefixes=skip_prune_prefixes,
	        protected_paths=protected_large_files,
	    )


	def sync_loop(api, rid: str):
	    """Upload a snapshot of JUPYTER_ROOT to the dataset forever.

	    Each iteration stages a snapshot in a fresh temporary directory, uploads
	    it, prunes remote files that no longer exist locally, removes the staging
	    directory and sleeps for INTERVAL seconds. Errors inside an iteration are
	    logged and the loop continues. This function does not return.

	    Args:
	        api: Hub client passed through to the prune step.
	        rid: ``namespace/name`` id of the dataset.
	    """
	    while True:
	        tmp = Path(tempfile.mkdtemp(prefix="devdata-snap-"))
	        try:
	            _sync_once(api, rid, tmp)
	        except Exception as exc:
	            kind = classify_error(exc)
	            print(f"DevData sync warning [{kind}]: {exc}")
	        finally:
	            shutil.rmtree(tmp, ignore_errors=True)
	        # Clamp the pause: time.sleep() rejects negative values and a zero
	        # interval would hammer the Hub API.
	        time.sleep(max(1, INTERVAL))


	if __name__ == "__main__":
	    if not enabled():
	        print("DevData sync disabled.")
	        sys.exit(0)

	    # Imported only once sync is known to be enabled. These names become
	    # module globals, which other functions in this file rely on.
	    from huggingface_hub import CommitOperationDelete, HfApi, upload_folder, snapshot_download
	    from huggingface_hub.errors import RepositoryNotFoundError

	    api = HfApi(token=HF_TOKEN)
	    rid = repo_id(api)

	    # Create the private dataset on first use.
	    try:
	        api.repo_info(repo_id=rid, repo_type="dataset")
	    except RepositoryNotFoundError:
	        api.create_repo(repo_id=rid, repo_type="dataset", private=True)

	    # Both modes start by checking that the Jupyter directories are writable.
	    validate_jupyter_paths()

	    # ── BUG FIX #3: Restore must happen BEFORE JupyterLab starts ──────────
	    # start.sh runs `python3 jupyter-devdata-sync.py --restore` before it
	    # launches JupyterLab; that invocation restores once and exits. Restoring
	    # while JupyterLab is live overwrote its runtime/ sockets and settings
	    # and made it die, so the background invocation below never restores.
	    if "--restore" in sys.argv:
	        restore_once(api, rid)
	        sys.exit(0)

	    # Background sync mode. The port probe only selects the log line; no
	    # restore happens here either way, to avoid racing a JupyterLab that may
	    # still be starting up.
	    if not is_jupyter_running():
	        print("DevData: WARNING — JupyterLab not detected on port 8888. Skipping restore to be safe; starting sync loop.")
	    else:
	        print("DevData: background sync started (JupyterLab is live, restore already done by --restore).")

	    sync_loop(api, rid)