| |
| from __future__ import annotations |
|
|
| import os, shutil, socket, sys, tempfile, time |
| from pathlib import Path |
|
|
def _env_first(*names: str, default: str = "") -> str:
    """Return the first non-blank environment value among *names*.

    Each variable is read and stripped in order; the first one that is not
    empty after stripping wins. If none is set, *default* is returned.
    """
    for name in names:
        value = os.environ.get(name, "").strip()
        if value:
            return value
    return default


def _env_int(name: str, default: int, minimum: int = 0) -> int:
    """Read an integer setting from the environment without crashing on bad input.

    Returns *default* when the variable is unset, blank, not an integer, or
    smaller than *minimum*. A warning is printed in the latter two cases so a
    typo in the configuration is visible instead of killing the process.
    """
    raw = os.environ.get(name, "").strip()
    if not raw:
        return default
    try:
        value = int(raw)
    except ValueError:
        print(f"DevData warning: {name}={raw!r} is not an integer; using {default}")
        return default
    if value < minimum:
        print(f"DevData warning: {name}={value} is below the minimum {minimum}; using {default}")
        return default
    return value


# Hugging Face credentials / namespace. HF_USERNAME falls back to SPACE_AUTHOR_NAME.
HF_TOKEN = _env_first("HF_TOKEN")
HF_USERNAME = _env_first("HF_USERNAME", "SPACE_AUTHOR_NAME")

# Dataset that receives the DevData snapshots, and the backup dataset name it
# is compared against in enabled().
DATASET_NAME = _env_first("DEVDATA_DATASET_NAME", default="huggingclaw-devdata")
BACKUP_DATASET_NAME = _env_first("BACKUP_DATASET_NAME", "BACKUP_DATASET", default="huggingclaw-backup")

# Directory that is snapshotted and restored. A blank JUPYTER_ROOT_DIR now
# falls back to the default like every other setting, instead of silently
# resolving to the current working directory.
JUPYTER_ROOT = Path(_env_first("JUPYTER_ROOT_DIR", default="/home/node")).resolve()

# Seconds to sleep between sync cycles. Must be >= 1: a negative value would
# make time.sleep() raise, and 0 would make the loop spin.
INTERVAL = _env_int("DEVDATA_SYNC_INTERVAL", 180, minimum=1)

# Files larger than this many bytes are left out of snapshots (default 50 MiB).
MAX_FILE_SIZE_BYTES = _env_int("DEVDATA_MAX_FILE_BYTES", 50 * 1024 * 1024, minimum=0)
|
|
def is_true(value):
    """Interpret *value* as a boolean flag.

    The value is converted with str(), stripped and lower-cased; it counts as
    true only when the result is one of "1", "true", "yes" or "on".
    """
    truthy_words = {"1", "true", "yes", "on"}
    normalized = str(value).strip().lower()
    return normalized in truthy_words
|
|
# Master switch read from the DEVDATA environment variable; an unset variable
# counts as "on".
ENABLE = is_true(os.getenv("DEVDATA", "on"))
|
|
|
|
def classify_error(exc: Exception) -> str:
    """Return a coarse label for *exc*, used to tag warning messages.

    Labels, checked in this order:
      * "filesystem-permission" - a PermissionError, or a message containing
        "permission denied";
      * "network-provider" - a ConnectionError / TimeoutError, or a message
        containing one of the network markers below;
      * "safety-scan" - a message mentioning "unsafe", "malware" or "security";
      * "general" - anything else.

    Matching on the message is case-insensitive.
    """
    msg = str(exc).lower()
    if isinstance(exc, PermissionError) or "permission denied" in msg:
        return "filesystem-permission"
    # Built-in network exceptions are recognised by type, the same way
    # PermissionError is. "timed out" is the usual socket/HTTP timeout wording,
    # which the plain "timeout" marker does not match.
    network_markers = (
        "connection error",
        "fetch failed",
        "timeout",
        "timed out",
        "temporarily unavailable",
        "network",
    )
    if isinstance(exc, (ConnectionError, TimeoutError)) or any(k in msg for k in network_markers):
        return "network-provider"
    if any(k in msg for k in ("unsafe", "malware", "security")):
        return "safety-scan"
    return "general"
|
|
| |
| |
| |
| |
| |
# Path component names that are never synced. should_skip() rejects any
# relative path that contains one of these names as a component, at any depth.
EXCLUDE = {
    # hidden directories
    ".cache",
    ".ipynb_checkpoints",
    ".local",
    ".npm",
    ".openclaw",
    ".yarn",
    # project / application directories
    "HuggingClaw",
    "HuggingClaw-Workspace",
    "app",
    "browser-deps",
    # remaining names
    "Trash",
    "__pycache__",
    "lib",
    "node_modules",
    "site-packages",
}
|
|
|
|
def enabled():
    """Decide whether DevData sync should run in this process.

    Sync runs only when all of the following hold:
      * the DEVDATA master switch (ENABLE) is on;
      * dev mode is on - HUGGINGCLAW_JUPYTER_ENABLED decides when it is set to
        a non-blank value, otherwise DEV_MODE is consulted;
      * an HF token is configured;
      * the DevData dataset name differs from the backup dataset name.

    When only the last condition fails, an explanatory message is printed.
    """
    override = os.environ.get("HUGGINGCLAW_JUPYTER_ENABLED", "")
    dev_flag = override if override.strip() else os.environ.get("DEV_MODE", "")
    dev = is_true(dev_flag)

    if not (ENABLE and dev and HF_TOKEN):
        return False
    if DATASET_NAME == BACKUP_DATASET_NAME:
        print("DevData sync disabled: DEVDATA_DATASET_NAME must be separate from BACKUP_DATASET_NAME.")
        return False
    return True
|
|
def validate_jupyter_paths() -> None:
    """Check that the Jupyter directories exist and are writable.

    For JUPYTER_ROOT and the two fixed Jupyter config/data directories, the
    directory is created if missing and a small probe file is written and
    removed. Any failure is reported as a warning; nothing is raised.
    """
    candidates = (
        JUPYTER_ROOT,
        Path("/home/node/.jupyter"),
        Path("/home/node/.local/share/jupyter"),
    )
    for directory in candidates:
        marker = directory / ".devdata-write-check"
        try:
            directory.mkdir(parents=True, exist_ok=True)
            marker.write_text("ok", encoding="utf-8")
            marker.unlink(missing_ok=True)
        except Exception as exc:
            label = classify_error(exc)
            print(f"DevData warning [{label}]: {directory} is not writable; Jupyter settings may not persist ({exc})")
|
|
def repo_id(api) -> str:
    """Build the "<namespace>/<dataset>" id of the DevData dataset.

    The namespace is HF_USERNAME when configured; otherwise it is taken from
    the "name" (or, failing that, "user") field of api.whoami().

    Raises:
        RuntimeError: if no namespace can be determined.
    """
    namespace = HF_USERNAME
    if not namespace:
        identity = api.whoami()
        namespace = identity.get("name") or identity.get("user") or ""
        if not namespace:
            raise RuntimeError("Cannot resolve HF namespace for devdata sync")
    return f"{namespace}/{DATASET_NAME}"
|
|
| |
| |
| |
| import fnmatch as _fnmatch |
|
|
# fnmatch-style patterns for path component names that must never be synced.
# _name_is_secret() lower-cases the name before matching, so every pattern
# here is written in lower case.
SECRET_FILENAME_PATTERNS = {
    # dotenv files
    ".env", ".env.*",
    # SSH-related file names
    "id_rsa", "id_dsa", "id_ecdsa", "id_ed25519",
    "authorized_keys", "known_hosts",
    # anything named like a secret
    "secret", "secrets", "secret.*", "*.secret",
    "*_secret", "*_secret.*", "*-secret", "*-secret.*",
    # anything named like a token
    "token", "token.*", "*.token",
    "*_token", "*_token.*", "*-token", "*-token.*",
    "api_token", "access_token", "refresh_token",
    # credential / auth files
    "credentials", "credentials.*",
    "auth.json", "auth.yaml", "auth.yml", "auth.toml", "auth.ini",
    # key and certificate containers
    "*.pem", "*.key", "*.p12", "*.pfx",
    # other credential dotfiles
    ".netrc", ".htpasswd",
}
|
|
|
|
def _name_is_secret(name: str) -> bool:
    """Tell whether *name* matches one of SECRET_FILENAME_PATTERNS.

    The name is lower-cased first, so matching is case-insensitive.
    """
    lowered = name.lower()
    for pattern in SECRET_FILENAME_PATTERNS:
        if _fnmatch.fnmatch(lowered, pattern):
            return True
    return False
|
|
|
|
def should_skip(p: Path):
    """Return True when the relative path *p* must not be synced or restored.

    A path is skipped if any of its components is listed in EXCLUDE, or if any
    component (directory or file name) looks like a secret according to
    _name_is_secret().
    """
    components = p.parts
    # Shares at least one component with the exclusion set.
    if not EXCLUDE.isdisjoint(components):
        return True
    for component in components:
        if _name_is_secret(component):
            return True
    return False
|
|
def snapshot(src: Path, dst: Path):
    """Copy the syncable contents of *src* into the staging directory *dst*.

    Walks *src* recursively and mirrors it under *dst*, leaving out:
      * anything rejected by should_skip() (excluded or secret-looking names);
      * symlinks;
      * files larger than MAX_FILE_SIZE_BYTES;
      * files that cannot be stat'ed.

    Copying stays best-effort: a file that cannot be copied is skipped rather
    than aborting the snapshot. Such failures are now counted and reported in
    one summary line instead of being swallowed. This matters because
    sync_loop() prunes remote files that are missing from the snapshot, so a
    silently dropped file would otherwise disappear remotely with no trace in
    the log.
    """
    failed = 0
    first_failure = None
    for p in src.rglob("*"):
        rel = p.relative_to(src)
        if should_skip(rel):
            continue
        if p.is_symlink():
            continue
        target = dst / rel
        if p.is_dir():
            target.mkdir(parents=True, exist_ok=True)
        elif p.is_file():
            try:
                if p.stat().st_size > MAX_FILE_SIZE_BYTES:
                    continue
            except OSError:
                # Typically the file vanished between listing and stat.
                continue
            # A failure to create the staging directory is not a per-file
            # problem, so it is deliberately left to propagate to the caller.
            target.parent.mkdir(parents=True, exist_ok=True)
            try:
                shutil.copy2(p, target)
            except OSError as exc:
                failed += 1
                if first_failure is None:
                    first_failure = (p, exc)
    if first_failure is not None:
        bad_path, exc = first_failure
        kind = classify_error(exc)
        print(
            f"DevData snapshot warning [{kind}]: skipped {failed} file(s) that could not be copied "
            f"(first: {bad_path}: {exc})"
        )
|
|
def is_jupyter_running(port: int = 8888) -> bool:
    """Report whether something accepts TCP connections on 127.0.0.1:*port*.

    A successful connection (2 second timeout) is taken to mean JupyterLab is
    already listening; any OSError while connecting means it is not.

    Per the original author's notes ("BUG FIX #2" / "BUG FIX #3"), this probe
    is a safety net around restore_once(): overwriting files under
    JUPYTER_ROOT (runtime sockets, lab settings, kernel connection files)
    while JupyterLab is live is reported to corrupt its state and make it
    exit. The primary protection is keeping the --restore run separate from
    the sync run; this check is the backstop.
    """
    address = ("127.0.0.1", port)
    try:
        with socket.create_connection(address, timeout=2):
            pass
    except OSError:
        return False
    return True
|
|
def restore_once(api, rid: str):
    """Download dataset *rid* and copy its contents into JUPYTER_ROOT.

    The dataset is downloaded into a temporary directory, then every entry
    that passes should_skip() (and is not ".gitattributes") is copied to the
    same relative location under JUPYTER_ROOT. The temporary directory is
    always removed afterwards.

    Failures never propagate: a missing dataset, a download error, or any
    other exception is printed as a message. Each entry is restored
    independently, so an unwritable directory or file is reported and skipped
    without aborting the rest of the restore.

    This function does not itself check whether JupyterLab is running.

    Args:
        api: accepted for signature parity with the other helpers; not used
            here (the download uses HF_TOKEN directly).
        rid: dataset repo id, "<namespace>/<name>".
    """
    from huggingface_hub import snapshot_download
    from huggingface_hub.errors import RepositoryNotFoundError

    tmp = Path(tempfile.mkdtemp(prefix="devdata-restore-"))
    try:
        snapshot_download(repo_id=rid, repo_type="dataset", local_dir=str(tmp), local_dir_use_symlinks=False, token=HF_TOKEN)
        for p in tmp.rglob("*"):
            rel = p.relative_to(tmp)
            if should_skip(rel):
                continue
            if str(rel) == ".gitattributes":
                continue
            target = JUPYTER_ROOT / rel
            # The directory creation is inside the try as well: previously a
            # single mkdir failure escaped to the outer handler and abandoned
            # every remaining file.
            try:
                if p.is_dir():
                    target.mkdir(parents=True, exist_ok=True)
                elif p.is_file():
                    target.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(p, target)
            except OSError as exc:
                kind = classify_error(exc)
                print(f"DevData restore skip [{kind}] (cannot write {target}): {exc}")
        print(f"DevData restored from {rid}")
    except RepositoryNotFoundError:
        print(f"DevData dataset not found yet: {rid}")
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData restore warning [{kind}]: {exc}")
    finally:
        shutil.rmtree(tmp, ignore_errors=True)
|
|
def prune_remote_deleted_files(api, rid: str, snapshot_dir: Path) -> None:
    """Delete from dataset *rid* the files that are absent from *snapshot_dir*.

    Original rationale ("BUG FIX #6"): without pruning, files the user deleted
    locally re-appear on the next restart because restore_once() copies the
    whole dataset back to disk. The original note says this mirrors the
    function of the same name in openclaw-sync.py (not verifiable from here).

    ".gitattributes" is never pruned. Any failure is printed as a warning and
    not raised.

    Args:
        api: object providing list_repo_files() and delete_files()
            (an HfApi instance in this script).
        rid: dataset repo id, "<namespace>/<name>".
        snapshot_dir: directory holding the snapshot that was just uploaded.
    """
    import glob  # function-scope import: only this function needs it

    try:
        local_files = {
            p.relative_to(snapshot_dir).as_posix()
            for p in snapshot_dir.rglob("*")
            if p.is_file()
        }
        remote_files = list(api.list_repo_files(repo_id=rid, repo_type="dataset"))
        stale = [f for f in remote_files if f not in local_files and f != ".gitattributes"]
        if stale:
            api.delete_files(
                # delete_patterns are shell-style wildcards, not literal paths.
                # Escape each name so a file such as "data[1].csv" or "a*"
                # deletes exactly itself, rather than matching nothing or
                # matching unrelated files.
                delete_patterns=[glob.escape(f) for f in stale],
                repo_id=rid,
                repo_type="dataset",
                commit_message=f"DevData prune {len(stale)} deleted file(s) {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
            )
            print(f"DevData pruned {len(stale)} deleted file(s) from {rid}")
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData prune warning [{kind}]: {exc}")
|
|
def sync_loop(api, rid: str):
    """Upload a snapshot of JUPYTER_ROOT to dataset *rid* forever.

    Each cycle:
      1. stages a filtered copy of JUPYTER_ROOT in a temporary directory;
      2. uploads that directory to the dataset;
      3. prunes remote files that are missing from the snapshot (only reached
         when the upload succeeded);
      4. removes the temporary directory and sleeps INTERVAL seconds.

    Errors inside a cycle are printed as warnings and the loop continues.
    This function does not return.

    Args:
        api: HfApi-like client, passed on to prune_remote_deleted_files().
        rid: dataset repo id, "<namespace>/<name>".
    """
    # Imported here, like restore_once() does, instead of relying on the
    # import under the __main__ guard. Otherwise calling this function from an
    # importing module fails with NameError on every cycle.
    from huggingface_hub import upload_folder

    while True:
        tmp = Path(tempfile.mkdtemp(prefix="devdata-snap-"))
        try:
            snapshot(JUPYTER_ROOT, tmp)
            upload_folder(
                folder_path=str(tmp),
                repo_id=rid,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message=f"DevData sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
                ignore_patterns=[".git/*", ".git"],
            )
            print(f"DevData synced to {rid}")

            prune_remote_deleted_files(api, rid, tmp)
        except Exception as exc:
            kind = classify_error(exc)
            print(f"DevData sync warning [{kind}]: {exc}")
        finally:
            shutil.rmtree(tmp, ignore_errors=True)
        time.sleep(INTERVAL)
|
|
|
|
if __name__ == "__main__":
    # Script entry point. Two modes:
    #   * with "--restore": restore the dataset into JUPYTER_ROOT once, then exit;
    #   * otherwise: run the background sync loop, which never restores.
    if not enabled():
        print("DevData sync disabled.")
        raise SystemExit(0)

    # Deferred until sync is known to be enabled, so a disabled run never
    # imports huggingface_hub. These imports deliberately stay at module scope
    # (not inside a main() function): they bind module-level names that code
    # elsewhere in this file may resolve as globals.
    from huggingface_hub import HfApi, upload_folder, snapshot_download
    from huggingface_hub.errors import RepositoryNotFoundError

    api = HfApi(token=HF_TOKEN)
    rid = repo_id(api)
    # Create the dataset as a private repo the first time it is needed.
    try:
        api.repo_info(repo_id=rid, repo_type="dataset")
    except RepositoryNotFoundError:
        api.create_repo(repo_id=rid, repo_type="dataset", private=True)

    if "--restore" in sys.argv:
        # One-shot restore mode: check the target directories, restore, exit.
        validate_jupyter_paths()
        restore_once(api, rid)
        raise SystemExit(0)

    # Sync mode. No restore happens here in either branch; the probe only
    # selects which status message is logged before the loop starts.
    validate_jupyter_paths()
    if is_jupyter_running():
        print("DevData: background sync started (JupyterLab is live, restore already done by --restore).")
    else:
        # The separator in this message was a mis-encoded em dash; restored.
        print("DevData: WARNING — JupyterLab not detected on port 8888. Skipping restore to be safe; starting sync loop.")

    sync_loop(api, rid)
|
|