#!/usr/bin/env python3
# DevData sync: module header.
#
# Reads all configuration from environment variables once, at import time,
# and defines the small helpers (is_true, classify_error) used by the rest
# of the script.
from __future__ import annotations

import os
import shutil
import sys
import tempfile
import time
from pathlib import Path


def env_int(name: str, default: int, *, minimum: int = 1) -> int:
    """Read an integer setting from the environment.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset, blank, not an
            integer, or below *minimum*.
        minimum: Smallest accepted value (inclusive).

    Returns:
        The parsed value, or *default* when the variable is unusable.  A
        warning is printed when a non-blank value is rejected, so a typo in
        the configuration degrades to the default instead of crashing the
        script at import time.
    """
    raw = os.environ.get(name, "").strip()
    if not raw:
        return default
    try:
        value = int(raw)
    except ValueError:
        print(f"DevData warning [config]: {name}={raw!r} is not an integer; using {default}")
        return default
    if value < minimum:
        print(f"DevData warning [config]: {name}={value} is below {minimum}; using {default}")
        return default
    return value


HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
HF_USERNAME = (
    os.environ.get("HF_USERNAME", "").strip()
    or os.environ.get("SPACE_AUTHOR_NAME", "").strip()
)
DATASET_NAME = os.environ.get("DEVDATA_DATASET_NAME", "").strip() or "openclaw-hf-devdata"
BACKUP_DATASET_NAME = (
    os.environ.get("BACKUP_DATASET_NAME", "").strip()
    or os.environ.get("BACKUP_DATASET", "").strip()
    or "openclaw-hf-backup"
)
# A blank DEVDATA_HOME_DIR falls back to the default like every other setting;
# Path("") would otherwise resolve to the current working directory.
HOME_DIR = Path(os.environ.get("DEVDATA_HOME_DIR", "").strip() or "/home/node").resolve()
# Seconds between sync passes.  Must be >= 1: time.sleep() rejects negative
# values and 0 would turn the sync loop into a busy loop.
INTERVAL = env_int("DEVDATA_SYNC_INTERVAL", 180, minimum=1)

# BUG FIX #5: Respect max file size so giant files don't stall uploads.
# Matches the 50 MB ceiling in openclaw-sync.py; override with DEVDATA_MAX_FILE_BYTES.
MAX_FILE_SIZE_BYTES = env_int("DEVDATA_MAX_FILE_BYTES", 50 * 1024 * 1024, minimum=0)

_TRUTHY = frozenset({"1", "true", "yes", "on"})
_NETWORK_HINTS = (
    "connection error",
    "fetch failed",
    "timeout",
    "timed out",
    "temporarily unavailable",
    "network",
)
_SAFETY_HINTS = ("unsafe", "malware", "security")


def is_true(value: object) -> bool:
    """Return True when *value* spells an enabled flag (1/true/yes/on, any case)."""
    return str(value).strip().lower() in _TRUTHY


# Master switch; sync is on unless DEVDATA is set to a non-truthy value.
ENABLE = is_true(os.environ.get("DEVDATA", "on"))


def classify_error(exc: Exception) -> str:
    """Map an exception to a coarse label used as a tag in log messages.

    Returns one of "filesystem-permission", "network-provider",
    "safety-scan" or "general".  The decision is based on the exception type
    where one is informative, otherwise on keywords in its message.
    """
    msg = str(exc).lower()
    if isinstance(exc, PermissionError) or "permission denied" in msg:
        return "filesystem-permission"
    if isinstance(exc, (ConnectionError, TimeoutError)) or any(
        hint in msg for hint in _NETWORK_HINTS
    ):
        return "network-provider"
    if any(hint in msg for hint in _SAFETY_HINTS):
        return "safety-scan"
    return "general"


# BUG FIX #4: ".local/share/Trash" in the original EXCLUDE set was a
# multi-component path string that was never matched because parts-based
# lookup compares individual directory names. Added "Trash" as a standalone
# component so any path with a "Trash" segment (e.g. .local/share/Trash/*)
# is correctly skipped during snapshot and restore.
EXCLUDE = {
    ".cache",
    "node_modules",
    ".npm",
    ".yarn",
    # BUG FIX #4: covers .local/share/Trash (was ".local/share/Trash" — never matched)
    "Trash",
    ".openclaw",
    "app",
    "browser-deps",
    # Exclude Python/system package directories — these contain thousands of files
    # (e.g. .local/lib/python3.11/site-packages/) and must not be synced to the
    # HF Dataset. Syncing them causes 10,000+ file fetches on every restore.
    ".local",
    "lib",
    "site-packages",
    "__pycache__",
}


def enabled() -> bool:
    """Return True when DevData sync should run in this process.

    Sync requires the DEVDATA master switch, a dev/terminal flag
    (OPENCLAW_HF_TERMINAL_ENABLED overrides DEV_MODE when it is non-blank),
    an HF token, and a dataset name that differs from the backup dataset.
    """
    override = os.environ.get("OPENCLAW_HF_TERMINAL_ENABLED", "")
    if override.strip():
        dev = is_true(override)
    else:
        dev = is_true(os.environ.get("DEV_MODE", ""))
    if not (ENABLE and dev and HF_TOKEN):
        return False
    if DATASET_NAME == BACKUP_DATASET_NAME:
        # Only reported when everything else is in place, so the message
        # points at the one setting that is blocking sync.
        print("DevData sync disabled: DEVDATA_DATASET_NAME must be separate from BACKUP_DATASET_NAME.")
        return False
    return True


def validate_home_writable() -> None:
    """Probe HOME_DIR for write access and print a warning if it fails.

    Never raises: the check is advisory and the caller proceeds regardless.
    """
    try:
        HOME_DIR.mkdir(parents=True, exist_ok=True)
        probe = HOME_DIR / ".devdata-write-check"
        probe.write_text("ok", encoding="utf-8")
        probe.unlink(missing_ok=True)
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData warning [{kind}]: {HOME_DIR} is not writable; sync will likely fail ({exc})")


def repo_id(api) -> str:
    """Return the "<namespace>/<dataset>" id of the DevData dataset.

    The namespace is HF_USERNAME when configured; otherwise it is looked up
    through ``api.whoami()``.

    Raises:
        RuntimeError: if no namespace can be determined.
    """
    ns = HF_USERNAME
    if not ns:
        who = api.whoami()
        ns = who.get("name") or who.get("user") or ""
    if not ns:
        raise RuntimeError("Cannot resolve HF namespace for devdata sync")
    return f"{ns}/{DATASET_NAME}"


# Filename patterns that must never be synced to a public/private HF Dataset.
# These are matched against the *name* of each path component (not the full path),
# so ".env" matches /home/node/.env and /home/node/subdir/.env alike.
import fnmatch as _fnmatch

SECRET_FILENAME_PATTERNS = {
    ".env",  # dotenv files — almost always contain API keys
    ".env.*",  # .env.local, .env.production, etc.
    "id_rsa",
    "id_dsa",
    "id_ecdsa",
    "id_ed25519",
    "authorized_keys",
    "known_hosts",
    "secret",
    "secrets",
    "secret.*",
    "*.secret",
    "*_secret",
    "*_secret.*",
    "*-secret",
    "*-secret.*",
    "token",
    "token.*",
    "*.token",
    "*_token",
    "*_token.*",
    "*-token",
    "*-token.*",
    "api_token",
    "access_token",
    "refresh_token",
    "credentials",  # common credential file names
    "credentials.*",
    "auth.json",
    "auth.yaml",
    "auth.yml",
    "auth.toml",
    "auth.ini",
    "*.pem",  # TLS/SSH private keys
    "*.key",  # generic key files
    "*.p12",  # PKCS#12 bundles
    "*.pfx",
    ".netrc",  # stores plaintext passwords
    ".htpasswd",
    ".git-credentials",  # git credential-store file: plaintext tokens/passwords
    ".pypirc",  # package-index upload credentials
}


def _name_is_secret(name: str) -> bool:
    """Return True if *name* matches any secret-exclusion pattern."""
    # The name is lowercased here and every pattern is lowercase, so the
    # case-sensitive matcher gives a case-insensitive result on any platform.
    name_lower = name.lower()
    return any(_fnmatch.fnmatchcase(name_lower, pat) for pat in SECRET_FILENAME_PATTERNS)


def should_skip(p: Path) -> bool:
    """Return True if relative path *p* must not be synced or restored.

    A path is skipped when any of its components is in EXCLUDE or has a name
    matching SECRET_FILENAME_PATTERNS.
    """
    parts = p.parts
    if not EXCLUDE.isdisjoint(parts):
        return True
    return any(_name_is_secret(part) for part in parts)


def snapshot(src: Path, dst: Path):
    """Copy the syncable part of the tree under *src* into *dst*.

    Skipped: anything ``should_skip`` rejects, symlinks, non-regular files,
    and files larger than MAX_FILE_SIZE_BYTES (BUG FIX #5).  Files that
    cannot be read are silently left out (best effort).

    Excluded and symlinked directories are pruned during the walk, so large
    trees such as node_modules are never traversed.
    """
    for root, dirnames, filenames in os.walk(src):
        root_path = Path(root)
        rel_root = root_path.relative_to(src)

        # Every ancestor was already vetted when its parent was walked, so
        # only the entry's own name needs checking here.
        kept = [
            name
            for name in dirnames
            if not should_skip(Path(name)) and not (root_path / name).is_symlink()
        ]
        dirnames[:] = kept  # in-place edit tells os.walk not to descend
        for name in kept:
            (dst / rel_root / name).mkdir(parents=True, exist_ok=True)

        for name in filenames:
            if should_skip(Path(name)):
                continue
            source = root_path / name
            if source.is_symlink() or not source.is_file():
                continue
            try:
                if source.stat().st_size > MAX_FILE_SIZE_BYTES:
                    continue
            except OSError:
                continue
            target = dst / rel_root / name
            target.parent.mkdir(parents=True, exist_ok=True)
            try:
                shutil.copy2(source, target)
            except OSError:
                # Best effort: an unreadable or vanished file is left out of
                # this snapshot rather than aborting the whole sync pass.
                pass


def restore_once(api, rid: str):
    """Download dataset *rid* and copy its files into HOME_DIR.

    Paths rejected by ``should_skip`` and the top-level ``.gitattributes``
    are not restored.  All failures are reported with a printed warning and
    never raised; a missing dataset is reported as "not found yet".
    *api* is accepted for signature symmetry with the other entry points and
    is not used here.
    """
    from huggingface_hub import snapshot_download
    from huggingface_hub.errors import RepositoryNotFoundError

    tmp = Path(tempfile.mkdtemp(prefix="devdata-restore-"))
    try:
        # NOTE(review): local_dir_use_symlinks is believed to be deprecated in
        # recent huggingface_hub releases — confirm against the pinned version.
        snapshot_download(
            repo_id=rid,
            repo_type="dataset",
            local_dir=str(tmp),
            local_dir_use_symlinks=False,
            token=HF_TOKEN,
        )
        for p in tmp.rglob("*"):
            rel = p.relative_to(tmp)
            if should_skip(rel):
                continue
            if str(rel) == ".gitattributes":
                continue
            target = HOME_DIR / rel
            if p.is_dir():
                target.mkdir(parents=True, exist_ok=True)
            elif p.is_file():
                target.parent.mkdir(parents=True, exist_ok=True)
                try:
                    shutil.copy2(p, target)
                except OSError as exc:
                    kind = classify_error(exc)
                    print(f"DevData restore skip [{kind}] (cannot write {target}): {exc}")
        print(f"DevData restored from {rid}")
    except RepositoryNotFoundError:
        print(f"DevData dataset not found yet: {rid}")
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData restore warning [{kind}]: {exc}")
    finally:
        shutil.rmtree(tmp, ignore_errors=True)


def prune_remote_deleted_files(api, rid: str, snapshot_dir: Path) -> None:
    """BUG FIX #6: Delete from the HF dataset any files the user deleted locally.

    Without this, deleted files re-appear on the next Space restart because
    restore_once() copies everything in the dataset back to disk.
    Mirrors the prune_remote_deleted_files() logic in openclaw-sync.py.

    Any remote file (other than ``.gitattributes``) that is absent from
    *snapshot_dir* is deleted in a single commit.  Failures are printed, not
    raised.  An empty snapshot never triggers a prune, so a home directory
    that could not be read cannot wipe the whole dataset.
    """
    import glob

    try:
        local_files = {
            p.relative_to(snapshot_dir).as_posix()
            for p in snapshot_dir.rglob("*")
            if p.is_file()
        }
        if not local_files:
            print(f"DevData prune skipped: local snapshot is empty, leaving {rid} untouched")
            return
        stale = [
            name
            for name in api.list_repo_files(repo_id=rid, repo_type="dataset")
            if name != ".gitattributes" and name not in local_files
        ]
        if not stale:
            return
        stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        api.delete_files(
            # delete_patterns are wildcard patterns, not literal paths;
            # escaping keeps names containing "[", "*" or "?" from matching
            # the wrong files (or nothing at all).
            delete_patterns=[glob.escape(name) for name in stale],
            repo_id=rid,
            repo_type="dataset",
            commit_message=f"DevData prune {len(stale)} deleted file(s) {stamp}",
        )
        print(f"DevData pruned {len(stale)} deleted file(s) from {rid}")
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData prune warning [{kind}]: {exc}")


def sync_loop(api, rid: str):
    """Snapshot HOME_DIR and upload it to *rid* every INTERVAL seconds, forever.

    Each pass uses a fresh temporary directory that is always removed.  Any
    error in a pass is printed and the loop continues with the next pass.
    """
    while True:
        tmp = Path(tempfile.mkdtemp(prefix="devdata-snap-"))
        try:
            snapshot(HOME_DIR, tmp)
            # Use the client method rather than the module-level function:
            # the bare name upload_folder was only bound by an import inside
            # the __main__ guard.
            api.upload_folder(
                folder_path=str(tmp),
                repo_id=rid,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message=f"DevData sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
                ignore_patterns=[".git/*", ".git"],
            )
            print(f"DevData synced to {rid}")
            # BUG FIX #6: Prune files deleted locally so they don't reappear on restore.
            prune_remote_deleted_files(api, rid, tmp)
        except Exception as exc:
            kind = classify_error(exc)
            print(f"DevData sync warning [{kind}]: {exc}")
        finally:
            shutil.rmtree(tmp, ignore_errors=True)
        time.sleep(INTERVAL)


def main() -> None:
    """Script entry point: restore once with ``--restore``, otherwise sync forever."""
    if not enabled():
        print("DevData sync disabled.")
        raise SystemExit(0)

    from huggingface_hub import HfApi
    from huggingface_hub.errors import RepositoryNotFoundError

    api = HfApi(token=HF_TOKEN)
    rid = repo_id(api)
    try:
        api.repo_info(repo_id=rid, repo_type="dataset")
    except RepositoryNotFoundError:
        # exist_ok: the --restore and background invocations may both reach
        # this point for a brand-new dataset.
        api.create_repo(repo_id=rid, repo_type="dataset", private=True, exist_ok=True)

    # start.sh calls `python3 terminal-devdata-sync.py --restore` once, early in
    # boot, before health-server.js starts. That invocation restores files and
    # exits. This background invocation (no --restore flag) skips straight to
    # sync_loop — the terminal's shell sessions are spawned per-connection and
    # have no on-disk runtime state for a restore to race with.
    validate_home_writable()
    if "--restore" in sys.argv:
        restore_once(api, rid)
        raise SystemExit(0)
    sync_loop(api, rid)


if __name__ == "__main__":
    main()