HF-OpenClaw / terminal-devdata-sync.py
Claude
OpenClaw HF Space: gateway, dashboard, Telegram webhook/polling, Cloudflare proxy + keep-alive, HF backup
5f1df14 unverified
Raw
History Blame Contribute Delete
10.5 kB
#!/usr/bin/env python3
from __future__ import annotations
import os, shutil, sys, tempfile, time
from pathlib import Path
def env_int(name: str, default: int, minimum: int | None = None) -> int:
    """Read an integer setting from the environment without crashing.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset, blank, or not a
            valid integer.
        minimum: Optional lower bound; smaller values are raised to it.

    Returns:
        The parsed (and possibly clamped) integer.
    """
    raw = os.environ.get(name, "").strip()
    if not raw:
        return default
    try:
        value = int(raw)
    except ValueError:
        # A typo in the Space settings should not take the sync process down.
        print(f"DevData warning: ignoring invalid {name}={raw!r}; using {default}")
        return default
    if minimum is not None and value < minimum:
        print(f"DevData warning: {name}={value} is below {minimum}; using {minimum}")
        return minimum
    return value


# Credentials and target repositories. Blank values fall back to the defaults.
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
HF_USERNAME = os.environ.get("HF_USERNAME", "").strip() or os.environ.get("SPACE_AUTHOR_NAME", "").strip()
DATASET_NAME = os.environ.get("DEVDATA_DATASET_NAME", "").strip() or "openclaw-hf-devdata"
BACKUP_DATASET_NAME = os.environ.get("BACKUP_DATASET_NAME", "").strip() or os.environ.get("BACKUP_DATASET", "").strip() or "openclaw-hf-backup"
# A blank DEVDATA_HOME_DIR must fall back to the default: Path("") would
# otherwise resolve to the current working directory.
HOME_DIR = Path(os.environ.get("DEVDATA_HOME_DIR", "").strip() or "/home/node").resolve()
# Seconds between sync cycles; kept at >= 1 because the value is passed to
# time.sleep(), which rejects negative numbers.
INTERVAL = env_int("DEVDATA_SYNC_INTERVAL", 180, minimum=1)
# Files larger than this are left out of snapshots so giant files don't stall
# uploads. Defaults to 50 MB; override with DEVDATA_MAX_FILE_BYTES.
MAX_FILE_SIZE_BYTES = env_int("DEVDATA_MAX_FILE_BYTES", 50 * 1024 * 1024)
def is_true(value):
    """Interpret *value* as a boolean flag.

    The value is converted to text, trimmed and lower-cased; only "1",
    "true", "yes" and "on" count as true.
    """
    normalized = str(value).strip().lower()
    return normalized in ("1", "true", "yes", "on")


# Master switch for dev-data sync; treated as "on" when DEVDATA is unset.
ENABLE = is_true(os.getenv("DEVDATA", "on"))
def classify_error(exc: Exception) -> str:
    """Map an exception to a coarse category label for log messages.

    Args:
        exc: The exception to classify.

    Returns:
        One of "filesystem-permission", "network-provider", "safety-scan"
        or "general". Checks run in that order, so the first match wins.
    """
    msg = str(exc).lower()
    if isinstance(exc, PermissionError) or "permission denied" in msg:
        return "filesystem-permission"
    # Built-in timeout/connection errors usually read "timed out" or
    # "connection refused", which the plain "timeout" keyword never matches,
    # so the exception type is checked alongside the message text.
    network_markers = (
        "connection error",
        "connection refused",
        "connection reset",
        "fetch failed",
        "timeout",
        "timed out",
        "temporarily unavailable",
        "network",
    )
    if isinstance(exc, (TimeoutError, ConnectionError)) or any(
        marker in msg for marker in network_markers
    ):
        return "network-provider"
    if any(marker in msg for marker in ("unsafe", "malware", "security")):
        return "safety-scan"
    return "general"
# Path components that are never synced or restored. should_skip() tests each
# entry against the individual components of a relative path, so every entry
# must be a single directory/file name: a multi-component string such as
# ".local/share/Trash" can never match. An entry excludes a path at any depth,
# e.g. "lib" skips both lib/ and project/lib/.
EXCLUDE = {
    ".cache",
    "node_modules",
    ".npm",
    ".yarn",
    "Trash",  # any path with a "Trash" segment, e.g. .local/share/Trash/*
    ".openclaw",
    "app",
    "browser-deps",
    # Exclude Python/system package directories — these contain thousands of files
    # (e.g. .local/lib/python3.11/site-packages/) and must not be synced to the
    # HF Dataset. Syncing them causes 10,000+ file fetches on every restore.
    ".local",
    "lib",
    "site-packages",
    "__pycache__",
}
def enabled():
    """Decide whether dev-data sync should run in this process.

    Sync runs only when the DEVDATA switch is on, dev mode is active, an HF
    token is present, and the dev-data dataset differs from the backup
    dataset. OPENCLAW_HF_TERMINAL_ENABLED, when non-blank, takes precedence
    over DEV_MODE for the dev-mode decision.
    """
    override = os.environ.get("OPENCLAW_HF_TERMINAL_ENABLED", "")
    dev_flag = override if override.strip() else os.environ.get("DEV_MODE", "")
    if not (ENABLE and is_true(dev_flag) and HF_TOKEN):
        return False
    if DATASET_NAME == BACKUP_DATASET_NAME:
        # Everything else is in place; explain why sync is still off.
        print("DevData sync disabled: DEVDATA_DATASET_NAME must be separate from BACKUP_DATASET_NAME.")
        return False
    return True
def validate_home_writable() -> None:
    """Probe HOME_DIR for write access and print a warning on failure.

    Creates the directory if needed, then writes and removes a small marker
    file. Any failure is reported, never raised.
    """
    marker = HOME_DIR / ".devdata-write-check"
    try:
        HOME_DIR.mkdir(parents=True, exist_ok=True)
        with marker.open("w", encoding="utf-8") as handle:
            handle.write("ok")
        marker.unlink(missing_ok=True)
    except Exception as exc:
        category = classify_error(exc)
        print(f"DevData warning [{category}]: {HOME_DIR} is not writable; sync will likely fail ({exc})")
def repo_id(api) -> str:
    """Build the "<namespace>/<dataset>" id of the dev-data dataset.

    The namespace is HF_USERNAME when set; otherwise it is taken from the
    "name" field of api.whoami(), falling back to its "user" field.

    Raises:
        RuntimeError: If no namespace can be determined.
    """
    if HF_USERNAME:
        return f"{HF_USERNAME}/{DATASET_NAME}"
    profile = api.whoami()
    for field in ("name", "user"):
        candidate = profile.get(field)
        if candidate:
            return f"{candidate}/{DATASET_NAME}"
    raise RuntimeError("Cannot resolve HF namespace for devdata sync")
# Filename patterns that must never be synced to a public/private HF Dataset.
# These are matched against the *name* of each path component (not the full path),
# so ".env" matches /home/node/.env and /home/node/subdir/.env alike.
# Names are lower-cased before matching, so keep every pattern lower-case.
import fnmatch as _fnmatch
SECRET_FILENAME_PATTERNS = {
    ".env",  # dotenv files — almost always contain API keys
    ".env.*",  # .env.local, .env.production, etc.
    "id_rsa",
    "id_dsa",
    "id_ecdsa",
    "id_ed25519",
    "authorized_keys",
    "known_hosts",
    "secret",
    "secrets",
    "secret.*",
    "*.secret",
    "*_secret",
    "*_secret.*",
    "*-secret",
    "*-secret.*",
    "token",
    "token.*",
    "*.token",
    "*_token",
    "*_token.*",
    "*-token",
    "*-token.*",
    "api_token",
    "access_token",
    "refresh_token",
    "credentials",  # common credential file names
    "credentials.*",
    "auth.json",
    "auth.yaml",
    "auth.yml",
    "auth.toml",
    "auth.ini",
    "*.pem",  # TLS/SSH private keys
    "*.key",  # generic key files
    "*.p12",  # PKCS#12 bundles
    "*.pfx",
    ".netrc",  # stores plaintext passwords
    ".htpasswd",
    # Well-known plaintext credential stores that none of the generic
    # patterns above catch.
    ".git-credentials",  # written by git's "store" credential helper
    ".pypirc",  # package-index upload credentials
    ".pgpass",  # PostgreSQL password file
}
def _name_is_secret(name: str) -> bool:
    """Tell whether a single path component looks like a secret.

    The name is lower-cased and tested against every pattern in
    SECRET_FILENAME_PATTERNS; the first match is enough.
    """
    lowered = name.lower()
    for pattern in SECRET_FILENAME_PATTERNS:
        if _fnmatch.fnmatch(lowered, pattern):
            return True
    return False
def should_skip(p: Path):
    """Return True when *p* must be left out of snapshot and restore.

    A path is skipped if any of its components is listed in EXCLUDE, or if
    any component's name matches a secret-filename pattern.
    """
    components = p.parts
    # Hard-coded exclusions: any shared element between the two collections.
    if not EXCLUDE.isdisjoint(components):
        return True
    # Secret-looking names, checked component by component.
    for component in components:
        if _name_is_secret(component):
            return True
    return False
def snapshot(src: Path, dst: Path):
    """Copy the syncable part of *src* into *dst*, preserving relative paths.

    Skipped: anything should_skip() rejects, symlinks, non-regular files,
    and files larger than MAX_FILE_SIZE_BYTES. Files that cannot be read or
    copied are left out and reported in one summary line.

    Args:
        src: Directory tree to snapshot.
        dst: Existing directory that receives the copy.
    """
    unreadable = 0
    for dirpath, dirnames, filenames in os.walk(src):
        current = Path(dirpath)
        rel_dir = current.relative_to(src)

        # Prune in place so os.walk never descends into excluded or
        # symlinked directories (node_modules, .cache, ...). Every ancestor
        # has already passed should_skip(), so only the new component needs
        # checking.
        dirnames[:] = [
            name
            for name in dirnames
            if not should_skip(Path(name)) and not (current / name).is_symlink()
        ]
        for name in dirnames:
            (dst / rel_dir / name).mkdir(parents=True, exist_ok=True)

        for name in filenames:
            if should_skip(Path(name)):
                continue
            source = current / name
            if source.is_symlink() or not source.is_file():
                continue
            try:
                if source.stat().st_size > MAX_FILE_SIZE_BYTES:
                    continue  # too large to upload
            except OSError:
                unreadable += 1
                continue
            target = dst / rel_dir / name
            try:
                target.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(source, target)
            except OSError:
                # Best effort: one unreadable file must not abort the snapshot.
                unreadable += 1

    if unreadable:
        print(f"DevData snapshot warning: skipped {unreadable} unreadable file(s)")
def restore_once(api, rid: str):
    """Download dataset *rid* and copy its files into HOME_DIR.

    The dataset is fetched into a temporary directory, then copied entry by
    entry. Paths rejected by should_skip() and the top-level .gitattributes
    are ignored. Failures are printed, never raised; the temporary directory
    is always removed.

    Args:
        api: Unused; accepted for signature parity with the other sync helpers.
        rid: Dataset repository id in "<namespace>/<name>" form.
    """
    from huggingface_hub import snapshot_download
    from huggingface_hub.errors import RepositoryNotFoundError

    tmp = Path(tempfile.mkdtemp(prefix="devdata-restore-"))
    try:
        snapshot_download(repo_id=rid, repo_type="dataset", local_dir=str(tmp), local_dir_use_symlinks=False, token=HF_TOKEN)
        for p in tmp.rglob("*"):
            rel = p.relative_to(tmp)
            if should_skip(rel) or str(rel) == ".gitattributes":
                continue
            target = HOME_DIR / rel
            # Directory creation is inside the per-entry try: a path that
            # cannot be created (permissions, a file where a directory is
            # expected) skips that entry instead of aborting the restore.
            try:
                if p.is_dir():
                    target.mkdir(parents=True, exist_ok=True)
                elif p.is_file():
                    target.parent.mkdir(parents=True, exist_ok=True)
                    shutil.copy2(p, target)
            except OSError as exc:
                kind = classify_error(exc)
                print(f"DevData restore skip [{kind}] (cannot write {target}): {exc}")
        print(f"DevData restored from {rid}")
    except RepositoryNotFoundError:
        print(f"DevData dataset not found yet: {rid}")
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData restore warning [{kind}]: {exc}")
    finally:
        shutil.rmtree(tmp, ignore_errors=True)
def prune_remote_deleted_files(api, rid: str, snapshot_dir: Path) -> None:
    """Delete dataset files that are absent from the local snapshot.

    Without this, files deleted locally would come back on the next restore,
    because restore_once() copies everything in the dataset to disk.
    Failures are printed, never raised.

    Args:
        api: Client exposing list_repo_files() and delete_files().
        rid: Dataset repository id.
        snapshot_dir: Directory holding the snapshot that was just uploaded.
    """
    import glob  # local import: only needed to escape wildcard characters

    try:
        local_files = {
            p.relative_to(snapshot_dir).as_posix()
            for p in snapshot_dir.rglob("*")
            if p.is_file()
        }
        stale = [
            f
            for f in api.list_repo_files(repo_id=rid, repo_type="dataset")
            if f != ".gitattributes" and f not in local_files
        ]
        if not stale:
            return
        if not local_files:
            # An empty snapshot more likely means the home directory was
            # unreadable than that every file was deleted; never wipe the
            # whole dataset on that basis.
            print(f"DevData prune skipped: snapshot is empty; refusing to delete all {len(stale)} file(s) from {rid}")
            return
        api.delete_files(
            # delete_patterns are shell-style wildcards, so literal paths are
            # escaped: unescaped, "a[1].txt" would match "a1.txt", not itself.
            delete_patterns=[glob.escape(f) for f in stale],
            repo_id=rid,
            repo_type="dataset",
            commit_message=f"DevData prune {len(stale)} deleted file(s) {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
        )
        print(f"DevData pruned {len(stale)} deleted file(s) from {rid}")
    except Exception as exc:
        kind = classify_error(exc)
        print(f"DevData prune warning [{kind}]: {exc}")
def sync_loop(api, rid: str):
    """Snapshot HOME_DIR and upload it to dataset *rid*, forever.

    Each cycle snapshots into a fresh temporary directory, uploads it,
    prunes remote files that no longer exist locally, removes the temporary
    directory, and sleeps. Failures are printed and the loop continues.
    This function does not return.

    Args:
        api: Hub client used for the upload and the prune.
        rid: Dataset repository id.
    """
    # A zero interval would busy-loop and a negative one makes time.sleep() raise.
    pause = max(1, INTERVAL)
    while True:
        tmp = None
        try:
            # Created inside the try so a full temp disk is reported like any
            # other sync failure instead of ending the loop.
            tmp = Path(tempfile.mkdtemp(prefix="devdata-snap-"))
            snapshot(HOME_DIR, tmp)
            # Use the client method rather than a module-level upload_folder,
            # which only exists when this file is run as a script.
            api.upload_folder(
                folder_path=str(tmp),
                repo_id=rid,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message=f"DevData sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
                ignore_patterns=[".git/*", ".git"],
            )
            print(f"DevData synced to {rid}")
            # Prune only after a successful upload so files deleted locally
            # don't reappear on restore.
            prune_remote_deleted_files(api, rid, tmp)
        except Exception as exc:
            kind = classify_error(exc)
            print(f"DevData sync warning [{kind}]: {exc}")
        finally:
            if tmp is not None:
                shutil.rmtree(tmp, ignore_errors=True)
        time.sleep(pause)
if __name__ == "__main__":
    if not enabled():
        print("DevData sync disabled.")
        raise SystemExit(0)
    # Imported only once sync is known to be enabled. These stay at module
    # scope (not inside a function) so the names are bound as module globals.
    from huggingface_hub import HfApi, upload_folder, snapshot_download
    from huggingface_hub.errors import RepositoryNotFoundError
    api = HfApi(token=HF_TOKEN)
    rid = repo_id(api)
    try:
        api.repo_info(repo_id=rid, repo_type="dataset")
    except RepositoryNotFoundError:
        # exist_ok closes the gap between the check above and the creation:
        # another process may have created the dataset in the meantime.
        api.create_repo(repo_id=rid, repo_type="dataset", private=True, exist_ok=True)
    # Both modes warn up front if the home directory is not writable.
    validate_home_writable()
    # start.sh calls `python3 terminal-devdata-sync.py --restore` once, early in
    # boot, before health-server.js starts. That invocation restores files and
    # exits. This background invocation (no --restore flag) skips straight to
    # sync_loop — the terminal's shell sessions are spawned per-connection and
    # have no on-disk runtime state for a restore to race with.
    if "--restore" in sys.argv:
        restore_once(api, rid)
        raise SystemExit(0)
    sync_loop(api, rid)