#!/usr/bin/env python3
"""DevData sync: persist files under JUPYTER_ROOT to a Hugging Face dataset.

Two modes, selected on the command line:

* ``--restore``: download the dataset once, copy it over JUPYTER_ROOT, exit.
* no flag: loop forever, snapshotting JUPYTER_ROOT into a temp dir, uploading
  it, and pruning remote files that no longer exist locally.

All configuration is read from environment variables at import time.
"""
from __future__ import annotations

import fnmatch as _fnmatch
import os
import shutil
import socket
import sys
import tempfile
import time
from collections.abc import Iterator
from pathlib import Path


def _env_int(name: str, default: int, minimum: int | None = None) -> int:
    """Read integer env var *name*, tolerating missing or malformed values.

    Returns *default* when the variable is unset, blank, or not an integer,
    and clamps the result to *minimum* when one is given. A bad value only
    produces a warning so a typo in the Space settings cannot crash startup.
    """
    raw = os.environ.get(name, "").strip()
    if not raw:
        return default
    try:
        value = int(raw)
    except ValueError:
        print(f"DevData warning [general]: ignoring invalid {name}={raw!r}; using {default}")
        return default
    if minimum is not None and value < minimum:
        print(f"DevData warning [general]: {name}={value} is below {minimum}; using {minimum}")
        return minimum
    return value


HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
HF_USERNAME = (
    os.environ.get("HF_USERNAME", "").strip()
    or os.environ.get("SPACE_AUTHOR_NAME", "").strip()
)
DATASET_NAME = os.environ.get("DEVDATA_DATASET_NAME", "").strip() or "huggingclaw-devdata"
BACKUP_DATASET_NAME = (
    os.environ.get("BACKUP_DATASET_NAME", "").strip()
    or os.environ.get("BACKUP_DATASET", "").strip()
    or "huggingclaw-backup"
)
JUPYTER_ROOT = Path(os.environ.get("JUPYTER_ROOT_DIR", "/home/node")).resolve()

# Seconds to sleep between sync passes. Clamped to >= 1 because time.sleep()
# raises on negative values and 0 would turn the loop into a busy upload loop.
INTERVAL = _env_int("DEVDATA_SYNC_INTERVAL", 180, minimum=1)

# BUG FIX #5: Respect max file size so giant files don't stall uploads.
# Matches the 50 MB ceiling in openclaw-sync.py; override with DEVDATA_MAX_FILE_BYTES.
MAX_FILE_SIZE_BYTES = _env_int("DEVDATA_MAX_FILE_BYTES", 50 * 1024 * 1024, minimum=0)

# Max stale files to delete per commit. Mirrors openclaw-sync.py behaviour.
# Override via DEVDATA_PRUNE_BATCH_SIZE. Clamped to >= 1: a batch size of 0
# would divide by zero when the number of prune batches is computed.
PRUNE_BATCH_SIZE = _env_int("DEVDATA_PRUNE_BATCH_SIZE", 50, minimum=1)

# Reserved filenames previously generated by DevData. Never copy or restore
# them as user content, and let prune delete any old remote marker files.
RESERVED_SYNC_FILENAMES = {".huggingclaw-empty-dir"}


def is_true(value) -> bool:
    """Return True when *value* spells an affirmative flag (1/true/yes/on)."""
    return str(value).strip().lower() in {"1", "true", "yes", "on"}


ENABLE = is_true(os.environ.get("DEVDATA", "on"))

# Lower-cased message fragments that mark an error as network related.
_NETWORK_ERROR_MARKERS = (
    "connection error",
    "fetch failed",
    "timeout",
    "timed out",
    "temporarily unavailable",
    "network",
)


def classify_error(exc: Exception) -> str:
    """Map *exc* to a coarse label used only to tag log lines.

    Returns one of "filesystem-permission", "network-provider", "safety-scan"
    or "general". Checks run in that order, so the first match wins.
    """
    msg = str(exc).lower()
    if isinstance(exc, PermissionError) or "permission denied" in msg:
        return "filesystem-permission"
    # Builtin timeout/connection exceptions often carry messages such as
    # "timed out" or "Connection reset by peer" that no substring below
    # would catch, so match on the exception type as well.
    if isinstance(exc, (TimeoutError, ConnectionError)) or any(
        marker in msg for marker in _NETWORK_ERROR_MARKERS
    ):
        return "network-provider"
    if "unsafe" in msg or "malware" in msg or "security" in msg:
        return "safety-scan"
    return "general"


# Directory names that are always unsafe/noisy to persist. Keep this list to
# single path components; path-specific exclusions live in SKIP_PATH_PREFIXES.
EXCLUDE = {
    ".cache",
    "node_modules",
    ".npm",
    ".yarn",
    "Trash",  # covers .local/share/Trash and any nested trash folder
    ".ipynb_checkpoints",
    ".openclaw",
    "site-packages",
    "__pycache__",
}

# Path prefixes, relative to JUPYTER_ROOT, that should not be synced. This keeps
# package/runtime trees out of the dataset without blocking real JupyterLab
# settings under .local/share/jupyter/lab/user-settings.
ROOT_SKIP_PATH_PREFIXES = {
    ("app",),
    ("HuggingClaw",),
    ("HuggingClaw-Workspace",),
    ("browser-deps",),
}
SKIP_PATH_PREFIXES = {
    (".jupyter", "runtime"),
    (".local", "bin"),
    (".local", "lib"),
    ("lib",),
}
JUPYTER_DATA_DIR_PREFIX = (".local", "share", "jupyter")
JUPYTER_DATA_ALLOW_PREFIXES = {
    (".local", "share", "jupyter", "lab", "user-settings"),
    (".local", "share", "jupyter", "lab", "workspaces"),
}


def enabled() -> bool:
    """Return True when DevData sync should run in this process.

    Requires DEVDATA to be on, dev mode to be on (HUGGINGCLAW_JUPYTER_ENABLED
    overrides DEV_MODE when it is non-blank), an HF token, and a DevData
    dataset name that differs from the backup dataset name.
    """
    jupyter_override = os.environ.get("HUGGINGCLAW_JUPYTER_ENABLED", "")
    if jupyter_override.strip():
        dev = is_true(jupyter_override)
    else:
        dev = is_true(os.environ.get("DEV_MODE", ""))
    separate_dataset = DATASET_NAME != BACKUP_DATASET_NAME
    if ENABLE and dev and HF_TOKEN and not separate_dataset:
        print("DevData sync disabled: DEVDATA_DATASET_NAME must be separate from BACKUP_DATASET_NAME.")
    return ENABLE and dev and bool(HF_TOKEN) and separate_dataset


def validate_jupyter_paths() -> None:
    """Warn (never raise) when a Jupyter settings directory is not writable."""
    # JupyterLab theme/settings live under ~/.jupyter and ~/.local/share/jupyter.
    # If these are not writable, settings can appear to "reset" every restart.
    for required in (JUPYTER_ROOT, Path("/home/node/.jupyter"), Path("/home/node/.local/share/jupyter")):
        try:
            required.mkdir(parents=True, exist_ok=True)
            probe = required / ".devdata-write-check"
            probe.write_text("ok", encoding="utf-8")
            probe.unlink(missing_ok=True)
        except Exception as exc:
            kind = classify_error(exc)
            print(f"DevData warning [{kind}]: {required} is not writable; Jupyter settings may not persist ({exc})")


def repo_id(api) -> str:
    """Return "<namespace>/<DATASET_NAME>" for the DevData dataset.

    The namespace is HF_USERNAME when set, otherwise it is looked up through
    ``api.whoami()``.

    Raises:
        RuntimeError: if no namespace can be determined.
    """
    ns = HF_USERNAME
    if not ns:
        who = api.whoami()
        ns = who.get("name") or who.get("user") or ""
    if not ns:
        raise RuntimeError("Cannot resolve HF namespace for devdata sync")
    return f"{ns}/{DATASET_NAME}"


# Filename patterns that must never be synced to a public/private HF Dataset.
# These are matched against the *name* of each path component (not the full path),
# so ".env" matches /home/node/.env and /home/node/subdir/.env alike.
SECRET_FILENAME_PATTERNS = {
    ".env",  # dotenv files — almost always contain API keys
    ".env.*",  # .env.local, .env.production, etc.
    "id_rsa",
    "id_dsa",
    "id_ecdsa",
    "id_ed25519",
    "authorized_keys",
    "known_hosts",
    "secret",
    "secrets",
    "secret.*",
    "*.secret",
    "*_secret",
    "*_secret.*",
    "*-secret",
    "*-secret.*",
    "token",
    "token.*",
    "*.token",
    "*_token",
    "*_token.*",
    "*-token",
    "*-token.*",
    "api_token",
    "access_token",
    "refresh_token",
    "credentials",  # common credential file names
    "credentials.*",
    "auth.json",
    "auth.yaml",
    "auth.yml",
    "auth.toml",
    "auth.ini",
    "*.pem",  # TLS/SSH private keys
    "*.key",  # generic key files
    "*.p12",  # PKCS#12 bundles
    "*.pfx",
    ".netrc",  # stores plaintext passwords
    ".htpasswd",
}


def _name_is_secret(name: str) -> bool:
    """Return True if *name* matches any secret-exclusion pattern."""
    name_lower = name.lower()
    # The name is lower-cased here and every pattern is lower-case, so the
    # case-sensitive matcher gives a case-insensitive result on any platform.
    return any(_fnmatch.fnmatchcase(name_lower, pat) for pat in SECRET_FILENAME_PATTERNS)


def _matches_prefix(parts: tuple[str, ...], prefix: tuple[str, ...]) -> bool:
    """Return True if *parts* starts with (or equals) *prefix*."""
    return len(parts) >= len(prefix) and parts[:len(prefix)] == prefix


def _is_prefix_of(parts: tuple[str, ...], full_path: tuple[str, ...]) -> bool:
    """Return True if *parts* is a leading segment of (or equals) *full_path*."""
    return len(parts) <= len(full_path) and full_path[:len(parts)] == parts


def should_skip(p: Path) -> bool:
    """Return True if relative path *p* must not be synced or restored.

    *p* is expected to be relative to the sync root; the prefix rules compare
    its leading components.
    """
    # Reserved sync helper files are not user data. Old datasets may still have
    # these markers, but fresh snapshots no longer create them.
    if p.name in RESERVED_SYNC_FILENAMES:
        return True
    # Skip directories/files in the hard-coded exclude set.
    parts = p.parts
    if any(x in parts for x in EXCLUDE):
        return True
    if any(_matches_prefix(parts, prefix) for prefix in ROOT_SKIP_PATH_PREFIXES):
        return True
    # Everything under the Jupyter data dir is skipped except the allow-listed
    # settings/workspaces subtrees.
    if _matches_prefix(parts, JUPYTER_DATA_DIR_PREFIX) and not any(
        _matches_prefix(parts, prefix) for prefix in JUPYTER_DATA_ALLOW_PREFIXES
    ):
        return True
    if any(_matches_prefix(parts, prefix) for prefix in SKIP_PATH_PREFIXES):
        return True
    # Skip any component whose name looks like a secret file/dir.
    return any(_name_is_secret(part) for part in parts)


def iter_sync_tree(root: Path) -> Iterator[Path]:
    """Yield syncable DevData paths without descending into excluded trees."""
    if not root.exists():
        return
    for dirpath, dirnames, filenames in os.walk(root):
        dir_path = Path(dirpath)
        try:
            dir_rel = dir_path.relative_to(root)
        except ValueError:
            dir_rel = Path()
        kept_dirnames: list[str] = []
        for dirname in sorted(dirnames):
            rel = dir_rel / dirname
            child = dir_path / dirname
            rel_parts = rel.parts
            # Do not prune ancestors of explicitly allowed Jupyter settings
            # paths. should_skip(.local/share/jupyter) is true by design for
            # files under that tree, but we must still descend through the
            # parent dirs to reach lab/user-settings and lab/workspaces.
            allowed_ancestor = any(
                _is_prefix_of(rel_parts, prefix) for prefix in JUPYTER_DATA_ALLOW_PREFIXES
            )
            if child.is_symlink() or (should_skip(rel) and not allowed_ancestor):
                continue
            kept_dirnames.append(dirname)
        # In-place assignment is what tells os.walk not to descend further.
        dirnames[:] = kept_dirnames
        for dirname in kept_dirnames:
            yield dir_path / dirname
        for filename in sorted(filenames):
            rel = dir_rel / filename
            child = dir_path / filename
            if child.is_symlink() or should_skip(rel):
                continue
            yield child


def snapshot(src: Path, dst: Path, *, max_file_bytes: int | None = None) -> tuple[bool, set[str]]:
    """Copy the syncable part of *src* into *dst*.

    Args:
        src: Root directory to snapshot.
        dst: Existing directory that receives the copy.
        max_file_bytes: Per-file size ceiling; defaults to MAX_FILE_SIZE_BYTES.

    Returns:
        ``(had_copy_failures, protected)``. ``had_copy_failures`` is True when
        any stat/copy raised OSError. ``protected`` holds the POSIX-style
        relative paths of files that were NOT copied, either because they
        exceed the size limit or because they failed. Callers pass it to the
        prune step so those files are not deleted from the remote.
    """
    limit = MAX_FILE_SIZE_BYTES if max_file_bytes is None else max_file_bytes
    had_copy_failures = False
    protected_large_files: set[str] = set()
    for p in iter_sync_tree(src):
        rel = p.relative_to(src)
        target = dst / rel
        if p.is_dir():
            # Keep parent directories for files copied later in this snapshot.
            # Empty folders are intentionally not represented in the git-backed
            # HF Dataset; once a folder contains syncable files, those files
            # carry the folder path naturally.
            target.mkdir(parents=True, exist_ok=True)
        elif p.is_file():
            # BUG FIX #5: Skip files that exceed the size limit.
            try:
                if p.stat().st_size > limit:
                    protected_large_files.add(rel.as_posix())
                    continue
            except OSError:
                had_copy_failures = True
                protected_large_files.add(rel.as_posix())
                continue
            target.parent.mkdir(parents=True, exist_ok=True)
            try:
                shutil.copy2(p, target)
            except OSError:
                had_copy_failures = True
                protected_large_files.add(rel.as_posix())
    return had_copy_failures, protected_large_files


def is_jupyter_running(port: int = 8888) -> bool:
    """Return True if JupyterLab is already listening on *port*.

    BUG FIX #2 (safety net): restore_once() must never run while JupyterLab is
    active. Overwriting files under JUPYTER_ROOT (runtime/ sockets, lab/
    settings, kernel connection files) while JupyterLab is live corrupts its
    state and causes it to exit within seconds. The primary guard is the
    --restore / sync separation introduced in BUG FIX #3, but this TCP probe
    stays as a hard backstop for any future code path that might call
    restore_once() unexpectedly.
    """
    try:
        with socket.create_connection(("127.0.0.1", port), timeout=2):
            return True
    except OSError:
        return False


def restore_once(api, rid: str) -> None:
    """Download dataset *rid* and copy its syncable files over JUPYTER_ROOT.

    Best effort: a missing dataset, download errors and per-file write errors
    are printed as warnings and never raised. *api* is accepted for signature
    symmetry with the other entry points; the download authenticates with
    HF_TOKEN directly.
    """
    from huggingface_hub import snapshot_download
    from huggingface_hub.errors import RepositoryNotFoundError

    tmp = Path(tempfile.mkdtemp(prefix="devdata-restore-"))
    try:
        snapshot_download(
            repo_id=rid,
            repo_type="dataset",
            local_dir=str(tmp),
            local_dir_use_symlinks=False,
            token=HF_TOKEN,
        )
        for p in tmp.rglob("*"):
            rel = p.relative_to(tmp)
            if str(rel) == ".gitattributes":
                continue
            # Same filter as the upload side, so secrets/markers that still
            # exist in an old dataset are never written back to disk.
            if should_skip(rel):
                continue
            target = JUPYTER_ROOT / rel
            if p.is_dir():
                target.mkdir(parents=True, exist_ok=True)
            elif p.is_file():
                target.parent.mkdir(parents=True, exist_ok=True)
                try:
                    shutil.copy2(p, target)
                except OSError as exc:
                    kind = classify_error(exc)
                    print(f"DevData restore skip [{kind}] (cannot write {target}): {exc}")
        print(f"DevData restored from {rid}")
    except RepositoryNotFoundError:
        print(f"DevData dataset not found yet: {rid}")
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData restore warning [{kind}]: {exc}")
    finally:
        shutil.rmtree(tmp, ignore_errors=True)


def prune_remote_deleted_files(
    api,
    rid: str,
    snapshot_dir: Path,
    skip_prefixes: set[str] | None = None,
    protected_paths: set[str] | None = None,
) -> None:
    """Delete from the HF dataset any files the user deleted locally.

    Without this, deleted files re-appear on the next Space restart because
    restore_once() copies everything in the dataset back to disk.

    Uses create_commit directly with CommitOperationDelete to avoid the extra
    list_repo_files call inside the SDK's delete_files wrapper, and batches
    deletions into PRUNE_BATCH_SIZE chunks to avoid hitting the HF API
    payload limit when many files are pruned at once.

    Args:
        api: HfApi-like client providing list_repo_files and create_commit.
        rid: Dataset repo id.
        snapshot_dir: Local snapshot that was just uploaded; any remote file
            missing from it is considered stale.
        skip_prefixes: Remote path prefixes that must never be pruned.
        protected_paths: Exact remote paths that must never be pruned.

    Never raises; failures are printed as a warning.
    """
    try:
        # Imported here (like restore_once does) so the function does not
        # depend on names that only exist when the file runs as a script.
        from huggingface_hub import CommitOperationDelete

        skip_prefixes = skip_prefixes or set()
        protected_paths = protected_paths or set()
        local_files = {
            p.relative_to(snapshot_dir).as_posix()
            for p in snapshot_dir.rglob("*")
            if p.is_file()
        }
        remote_files = list(api.list_repo_files(repo_id=rid, repo_type="dataset"))
        stale = [
            f
            for f in remote_files
            if f not in local_files
            and f != ".gitattributes"
            and f not in protected_paths
            and not any(f == prefix or f.startswith(prefix + "/") for prefix in skip_prefixes)
        ]
        if not stale:
            return
        # Guard against a non-positive batch size (would divide by zero).
        batch_size = max(1, PRUNE_BATCH_SIZE)
        total = len(stale)
        num_batches = (total + batch_size - 1) // batch_size
        ts = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        for batch_idx in range(num_batches):
            batch = stale[batch_idx * batch_size:(batch_idx + 1) * batch_size]
            batch_label = f" (batch {batch_idx + 1}/{num_batches})" if num_batches > 1 else ""
            operations = [CommitOperationDelete(path_in_repo=p) for p in batch]
            api.create_commit(
                repo_id=rid,
                repo_type="dataset",
                operations=operations,
                commit_message=f"DevData prune {len(batch)} deleted file(s) {ts}{batch_label}",
            )
        print(
            f"DevData pruned {total} deleted file(s) from {rid}"
            + (f" in {num_batches} batches" if num_batches > 1 else "")
        )
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData prune warning [{kind}]: {exc}")


def sync_loop(api, rid: str) -> None:
    """Snapshot, upload and prune forever, sleeping INTERVAL seconds per pass.

    Errors inside a pass are printed and the loop continues; the temporary
    snapshot directory is always removed.
    """
    # Imported here (like restore_once does) so the loop does not depend on
    # names that only exist when the file runs as a script.
    from huggingface_hub import upload_folder

    while True:
        tmp = Path(tempfile.mkdtemp(prefix="devdata-snap-"))
        try:
            had_copy_failures, protected_large_files = snapshot(JUPYTER_ROOT, tmp)
            upload_folder(
                folder_path=str(tmp),
                repo_id=rid,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message=f"DevData sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
            )
            print(f"DevData synced to {rid}")
            # BUG FIX #6: Prune files deleted locally so they don't reappear on restore.
            skip_prune_prefixes: set[str] = set()
            if had_copy_failures:
                # Snapshot copy races can produce a partial view; avoid pruning
                # runtime-heavy Jupyter paths in that case.
                skip_prune_prefixes.update({"runtime", ".local/share/jupyter/runtime"})
                print("DevData snapshot had copy failures; pruning stale files with runtime-path safeguards.")
            prune_remote_deleted_files(
                api,
                rid,
                tmp,
                skip_prefixes=skip_prune_prefixes,
                protected_paths=protected_large_files,
            )
        except Exception as exc:
            kind = classify_error(exc)
            print(f"DevData sync warning [{kind}]: {exc}")
        finally:
            shutil.rmtree(tmp, ignore_errors=True)
        time.sleep(INTERVAL)


def main() -> None:
    """Script entry point: restore once (``--restore``) or run the sync loop."""
    if not enabled():
        print("DevData sync disabled.")
        raise SystemExit(0)

    from huggingface_hub import HfApi
    from huggingface_hub.errors import RepositoryNotFoundError

    api = HfApi(token=HF_TOKEN)
    rid = repo_id(api)
    try:
        api.repo_info(repo_id=rid, repo_type="dataset")
    except RepositoryNotFoundError:
        api.create_repo(repo_id=rid, repo_type="dataset", private=True)

    # ── BUG FIX #3: Restore must happen BEFORE JupyterLab starts ──────────
    # The original code always called restore_once() here, but start.sh starts
    # JupyterLab long before the gateway is ready and this script is launched.
    # That made restore_once() ALWAYS run while JupyterLab was live, which
    # overwrote its runtime/ sockets and settings → JupyterLab died.
    #
    # Fix: start.sh now calls `python3 jupyter-devdata-sync.py --restore`
    # BEFORE starting JupyterLab. That --restore invocation does the restore
    # and exits. This background invocation (no --restore flag) skips straight
    # to sync_loop so it never touches files while JupyterLab is running.
    #
    # NOTE(review): BUG FIX #2 describes is_jupyter_running() as a safety net
    # that aborts a restore when JupyterLab is already live, but the --restore
    # branch below does not consult the probe; only the background branch uses
    # it, and only to pick a log message. Left unchanged on purpose: skipping
    # the restore and then syncing would prune remote files that were never
    # restored. Confirm the intended policy before wiring the probe in.
    if "--restore" in sys.argv:
        # Synchronous restore mode — called by start.sh before JupyterLab.
        validate_jupyter_paths()
        restore_once(api, rid)
        raise SystemExit(0)

    # Normal background sync mode — no restore; go straight to upload loop.
    validate_jupyter_paths()
    if is_jupyter_running():
        print("DevData: background sync started (JupyterLab is live, restore already done by --restore).")
    else:
        # Fallback: JupyterLab not detected. Should not normally happen
        # because start.sh calls --restore before starting JupyterLab and then
        # waits for the gateway before launching this background process.
        # Log a warning and proceed to sync; do NOT restore to avoid racing
        # with a JupyterLab that may be in the middle of starting up.
        print("DevData: WARNING — JupyterLab not detected on port 8888. Skipping restore to be safe; starting sync loop.")
    sync_loop(api, rid)


if __name__ == "__main__":
    main()