#!/usr/bin/env python3
"""
OpenClaw HF Spaces Persistence — Full Directory Sync
=====================================================

Simplified persistence: upload/download the entire ~/.openclaw directory
as-is to/from a Hugging Face Dataset repo.

- Startup:  snapshot_download   → ~/.openclaw
- Periodic: upload_folder       → dataset .openclaw/
- Shutdown: final upload_folder → dataset .openclaw/

Environment variables read by this script:

- HF_TOKEN               Hugging Face token; persistence is disabled if unset.
- OPENCLAW_DATASET_REPO  Dataset repo id (default "tao-shen/openclaw").
- SYNC_INTERVAL          Seconds between periodic uploads (default 120).
- OPENROUTER_API_KEY     API key written into the OpenRouter provider config.
- TELEGRAM_BOT_TOKEN / TELEGRAM_BOT_NAME / TELEGRAM_ALLOW_USER
"""

import os
import sys
import time
import threading
import subprocess
import signal
import json
import shutil
import tempfile
import traceback
import re
from pathlib import Path
from datetime import datetime

# Set timeout BEFORE importing huggingface_hub
os.environ.setdefault("HF_HUB_DOWNLOAD_TIMEOUT", "300")
os.environ.setdefault("HF_HUB_UPLOAD_TIMEOUT", "600")

from huggingface_hub import HfApi, snapshot_download


# ── Logging helper ──────────────────────────────────────────────────────────

class TeeLogger:
    """Duplicate output to stream and file.

    Everything written is forwarded to the wrapped ``stream`` and appended to
    ``filename``. Attributes this class does not define (``isatty``,
    ``encoding``, ``buffer``, ...) are looked up on the wrapped stream, so the
    object can stand in for ``sys.stdout`` / ``sys.stderr``.
    """

    def __init__(self, filename, stream):
        self.stream = stream
        self.file = open(filename, "a", encoding="utf-8")

    def write(self, message):
        """Write ``message`` to both sinks, flush, and return its length."""
        self.stream.write(message)
        self.file.write(message)
        self.flush()
        return len(message)

    def flush(self):
        self.stream.flush()
        self.file.flush()

    def fileno(self):
        return self.stream.fileno()

    def __getattr__(self, name):
        # Only reached when normal lookup fails. Guard "stream" itself so a
        # half-initialised instance raises AttributeError instead of recursing.
        if name == "stream":
            raise AttributeError(name)
        return getattr(self.stream, name)


# ── Configuration ───────────────────────────────────────────────────────────

def _env_int(name, default, minimum=1):
    """Read an integer environment variable, falling back on bad input.

    Returns ``default`` when the variable is unset or not an integer, and
    never returns less than ``minimum``.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        value = int(raw)
    except ValueError:
        print(f"[SYNC] WARNING: {name}={raw!r} is not an integer; using {default}")
        return default
    return max(value, minimum)


HF_REPO_ID = os.environ.get("OPENCLAW_DATASET_REPO", "tao-shen/openclaw")
HF_TOKEN = os.environ.get("HF_TOKEN")
OPENCLAW_HOME = Path.home() / ".openclaw"
APP_DIR = Path("/app/openclaw")

# Use ".openclaw" - directly read/write the .openclaw folder in dataset
DATASET_PATH = ".openclaw"

TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
TELEGRAM_BOT_NAME = os.environ.get("TELEGRAM_BOT_NAME", "opentauronbot")
TELEGRAM_ALLOW_USER = os.environ.get("TELEGRAM_ALLOW_USER", "taoshen1")

# Secrets come from the environment and are never embedded in the source.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "").strip()

# A non-positive interval would make the background loop upload continuously,
# so the value is clamped to at least one second.
SYNC_INTERVAL = _env_int("SYNC_INTERVAL", 120, minimum=1)

# Setup logging
log_dir = OPENCLAW_HOME / "workspace"
log_dir.mkdir(parents=True, exist_ok=True)
sys.stdout = TeeLogger(log_dir / "sync.log", sys.stdout)
sys.stderr = sys.stdout


# ── Sync Manager ────────────────────────────────────────────────────────────

class OpenClawFullSync:
    """Upload/download the entire ~/.openclaw directory to HF Dataset."""

    def __init__(self):
        if not HF_TOKEN:
            print("[SYNC] WARNING: HF_TOKEN not set. Persistence disabled.")
            self.enabled = False
            return

        self.enabled = True
        self.api = HfApi(token=HF_TOKEN)
        self._ensure_repo()

    # ── Repo management ────────────────────────────────────────────────

    def _ensure_repo(self):
        """Make sure the dataset repo exists, creating it (private) if needed."""
        try:
            self.api.repo_info(repo_id=HF_REPO_ID, repo_type="dataset")
            print(f"[SYNC] Dataset repo found: {HF_REPO_ID}")
        except Exception:
            print(f"[SYNC] Creating dataset repo: {HF_REPO_ID}")
            # exist_ok: repo_info may have failed for a reason other than
            # "not found"; do not crash if the repo is already there.
            self.api.create_repo(
                repo_id=HF_REPO_ID,
                repo_type="dataset",
                private=True,
                exist_ok=True,
            )

    # ── Restore (startup) ─────────────────────────────────────────────

    def load_from_repo(self):
        """Download from dataset → ~/.openclaw"""
        if not self.enabled:
            return

        print(f"[SYNC] ▶ Restoring ~/.openclaw from dataset {HF_REPO_ID} ...")
        OPENCLAW_HOME.mkdir(parents=True, exist_ok=True)

        try:
            files = self.api.list_repo_files(repo_id=HF_REPO_ID, repo_type="dataset")
            openclaw_files = [f for f in files if f.startswith(f"{DATASET_PATH}/")]
            if not openclaw_files:
                print(f"[SYNC] No {DATASET_PATH}/ folder in dataset. Starting fresh.")
                self._ensure_default_config()
                self._ensure_telegram_credentials()
                return

            print(f"[SYNC] Found {len(openclaw_files)} files under {DATASET_PATH}/ in dataset")

            with tempfile.TemporaryDirectory() as tmpdir:
                snapshot_download(
                    repo_id=HF_REPO_ID,
                    repo_type="dataset",
                    allow_patterns=f"{DATASET_PATH}/**",
                    local_dir=tmpdir,
                    token=HF_TOKEN,
                )
                downloaded_root = Path(tmpdir) / DATASET_PATH
                if downloaded_root.exists():
                    # Copy file by file so existing local files that are not
                    # in the dataset are left untouched.
                    for item in downloaded_root.rglob("*"):
                        if item.is_file():
                            rel = item.relative_to(downloaded_root)
                            dest = OPENCLAW_HOME / rel
                            dest.parent.mkdir(parents=True, exist_ok=True)
                            shutil.copy2(str(item), str(dest))
                    print("[SYNC] ✓ Restore completed.")
                else:
                    print("[SYNC] Downloaded snapshot but dir not found. Starting fresh.")

        except Exception as e:
            # Best effort: a failed restore must not prevent the app from starting.
            print(f"[SYNC] ✗ Restore failed: {e}")
            traceback.print_exc()

        # Patch config & telegram after restore
        self._patch_config()
        self._ensure_telegram_credentials()
        self._debug_list_files()

    # ── Save (periodic + shutdown) ─────────────────────────────────────

    def save_to_repo(self):
        """Upload the ~/.openclaw directory → dataset.

        Log files (``*.log``, ``*.log.*``) are excluded from the upload. Any
        failure is logged and swallowed so a failed sync never takes down the
        caller.
        """
        if not self.enabled:
            return
        if not OPENCLAW_HOME.exists():
            print("[SYNC] ~/.openclaw does not exist, nothing to save.")
            return

        print(f"[SYNC] ▶ Uploading ~/.openclaw → dataset {HF_REPO_ID}/{DATASET_PATH}/ ...")

        try:
            # Log what will be uploaded
            total_size = 0
            file_count = 0
            for root, _dirs, names in os.walk(OPENCLAW_HOME):
                for fn in names:
                    fp = os.path.join(root, fn)
                    try:
                        sz = os.path.getsize(fp)
                    except OSError:
                        # The application writes into this tree concurrently;
                        # a file that vanished (or a broken symlink) must not
                        # abort the whole sync.
                        continue
                    total_size += sz
                    file_count += 1
                    rel = os.path.relpath(fp, OPENCLAW_HOME)
                    print(f"[SYNC] uploading: {rel} ({sz} bytes)")
            print(f"[SYNC] Uploading: {file_count} files, {total_size} bytes total")

            if file_count == 0:
                print("[SYNC] Nothing to upload.")
                return

            # Upload directory, excluding large log files that trigger LFS errors
            self.api.upload_folder(
                folder_path=str(OPENCLAW_HOME),
                path_in_repo=DATASET_PATH,
                repo_id=HF_REPO_ID,
                repo_type="dataset",
                token=HF_TOKEN,
                commit_message=f"Sync .openclaw — {datetime.now().isoformat()}",
                ignore_patterns=["*.log", "*.log.*"],  # Exclude logs (sync.log is 60MB+, triggers LFS errors)
            )
            print(f"[SYNC] ✓ Upload completed at {datetime.now().isoformat()}")

            # Verify
            try:
                files = self.api.list_repo_files(repo_id=HF_REPO_ID, repo_type="dataset")
                oc_files = [f for f in files if f.startswith(f"{DATASET_PATH}/")]
                print(f"[SYNC] Dataset now has {len(oc_files)} files under {DATASET_PATH}/")
                for f in oc_files[:30]:
                    print(f"[SYNC] {f}")
                if len(oc_files) > 30:
                    print(f"[SYNC] ... and {len(oc_files) - 30} more")
            except Exception as e:
                # Verification is informational only; report and carry on.
                print(f"[SYNC] Could not verify upload: {e}")

        except Exception as e:
            print(f"[SYNC] ✗ Upload failed: {e}")
            traceback.print_exc()

    # ── Config helpers ─────────────────────────────────────────────────

    def _ensure_default_config(self):
        """Create openclaw.json if it does not exist yet.

        Uses ``openclaw.json.default`` next to this script when present,
        otherwise writes a minimal built-in configuration.
        """
        config_path = OPENCLAW_HOME / "openclaw.json"
        if config_path.exists():
            return

        default_src = Path(__file__).parent / "openclaw.json.default"
        if default_src.exists():
            shutil.copy2(str(default_src), str(config_path))
            print("[SYNC] Created openclaw.json from default template")
        else:
            with open(config_path, "w", encoding="utf-8") as f:
                json.dump({
                    "gateway": {
                        "mode": "local",
                        "bind": "lan",
                        "port": 7860,
                        "trustedProxies": ["0.0.0.0/0"],
                        "controlUi": {
                            "allowInsecureAuth": True,
                            "allowedOrigins": [
                                "https://tao-shen-openclaw-ai.hf.space",
                                "https://huggingface.co"
                            ]
                        }
                    },
                    "session": {"scope": "global"},
                    "models": {"mode": "merge", "providers": {}},
                    "agents": {"defaults": {"workspace": "~/.openclaw/workspace"}}
                }, f)
            print("[SYNC] Created minimal openclaw.json")

    def _patch_config(self):
        """Ensure critical settings after restore.

        Loads openclaw.json (backing it up and starting from an empty config
        if it is unreadable or not a JSON object), forces the gateway, session,
        OpenRouter provider and Telegram plugin settings, and writes it back.
        Failures while patching are logged, not raised.
        """
        config_path = OPENCLAW_HOME / "openclaw.json"
        if not config_path.exists():
            self._ensure_default_config()
            return

        print("[SYNC] Patching configuration...")
        try:
            with open(config_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            if not isinstance(data, dict):
                # Valid JSON but not an object: unusable as a config.
                raise ValueError(
                    f"top-level value is {type(data).__name__}, expected object"
                )
            print("[SYNC] Config parsed OK.")
        except (OSError, ValueError) as e:
            # Config is corrupt — back up and start fresh
            # (json.JSONDecodeError and UnicodeDecodeError are ValueErrors).
            print(f"[SYNC] Config JSON is corrupt: {e}")
            backup = config_path.with_suffix(f".corrupt_{int(time.time())}")
            try:
                shutil.copy2(config_path, backup)
                print(f"[SYNC] Backed up corrupt config to {backup.name}")
            except OSError as backup_err:
                print(f"[SYNC] Could not back up corrupt config: {backup_err}")
            data = {}
            print("[SYNC] Starting from clean config.")

        try:
            # Remove /dev/null from plugins.locations
            if "plugins" in data and isinstance(data.get("plugins"), dict):
                locs = data["plugins"].get("locations", [])
                if isinstance(locs, list) and "/dev/null" in locs:
                    data["plugins"]["locations"] = [l for l in locs if l != "/dev/null"]

            # Force full gateway config for HF Spaces
            # Note: Dockerfile injects "openclaw-space-default" token into Control UI,
            # so we MUST set it here to match what the browser sends.
            data["gateway"] = {
                "mode": "local",
                "bind": "lan",
                "port": 7860,
                "auth": {"token": "openclaw-space-default"},
                "trustedProxies": ["0.0.0.0/0"],
                "controlUi": {
                    "allowInsecureAuth": True,
                    "allowedOrigins": [
                        "https://tao-shen-openclaw-ai.hf.space",
                        "https://huggingface.co"
                    ]
                }
            }
            print("[SYNC] Set gateway config (auth=default, trustedProxies=all)")

            # Ensure agents defaults
            data.setdefault("agents", {}).setdefault("defaults", {}).setdefault("model", {})
            data.setdefault("session", {})["scope"] = "global"

            # Force OpenRouter provider
            providers = data.setdefault("models", {}).setdefault("providers", {})

            # The API key is taken from the environment; when that is unset,
            # keep whatever key the restored config already carries so existing
            # deployments keep working.
            existing = providers.get("openrouter")
            existing_key = existing.get("apiKey", "") if isinstance(existing, dict) else ""
            api_key = OPENROUTER_API_KEY or existing_key
            if not api_key:
                print("[SYNC] WARNING: OPENROUTER_API_KEY not set and no key in config; "
                      "OpenRouter requests will fail until one is provided.")

            providers["openrouter"] = {
                "baseUrl": "https://openrouter.ai/api/v1",
                "apiKey": api_key,
                "api": "openai-completions",
                "models": [
                    {"id": "stepfun/step-3.5-flash:free", "name": "Step-3.5-Flash (Free)"},
                    {"id": "deepseek/deepseek-chat:free", "name": "DeepSeek V3 (Free)"}
                ]
            }
            # Remove old gemini provider if present
            providers.pop("gemini", None)
            data["agents"]["defaults"]["model"]["primary"] = "openrouter/stepfun/step-3.5-flash:free"

            # Telegram plugin
            data.setdefault("plugins", {}).setdefault("entries", {})
            if "telegram" not in data["plugins"]["entries"]:
                data["plugins"]["entries"]["telegram"] = {"enabled": True}
            elif isinstance(data["plugins"]["entries"]["telegram"], dict):
                data["plugins"]["entries"]["telegram"]["enabled"] = True

            with open(config_path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
            print("[SYNC] Config patched and saved.")

            # Verify write
            with open(config_path, "r", encoding="utf-8") as f:
                verify_data = json.load(f)
            gw = verify_data.get("gateway", {})
            provider_names = list(verify_data.get("models", {}).get("providers", {}).keys())
            primary = verify_data.get("agents", {}).get("defaults", {}).get("model", {}).get("primary")
            print(f"[SYNC] VERIFY: gateway.port={gw.get('port')}, providers={provider_names}, primary={primary}")

        except Exception as e:
            print(f"[SYNC] Failed to patch config: {e}")
            traceback.print_exc()

    def _ensure_telegram_credentials(self):
        """Configure Telegram bot token and allowed users."""
        creds_dir = OPENCLAW_HOME / "credentials"
        creds_dir.mkdir(parents=True, exist_ok=True)

        if TELEGRAM_BOT_TOKEN:
            bot_file = creds_dir / "telegram-bot-token.json"
            with open(bot_file, "w") as f:
                json.dump({"token": TELEGRAM_BOT_TOKEN, "bot": TELEGRAM_BOT_NAME}, f, indent=2)
            print(f"[SYNC] Telegram bot configured: {TELEGRAM_BOT_NAME}")

        allow_file = creds_dir / "telegram-allowFrom.json"
        if not allow_file.exists():
            with open(allow_file, "w") as f:
                json.dump([TELEGRAM_ALLOW_USER], f, indent=2)
            print(f"[SYNC] Created telegram-allowFrom.json for {TELEGRAM_ALLOW_USER}")
        else:
            try:
                with open(allow_file, "r") as f:
                    data = json.load(f)
                if not isinstance(data, list):
                    data = [TELEGRAM_ALLOW_USER]
                elif TELEGRAM_ALLOW_USER not in data:
                    data.append(TELEGRAM_ALLOW_USER)
                with open(allow_file, "w") as f:
                    json.dump(data, f, indent=2)
            except (OSError, ValueError):
                # Unreadable or corrupt allow-list: reset it to the configured user.
                with open(allow_file, "w") as f:
                    json.dump([TELEGRAM_ALLOW_USER], f, indent=2)

    def _debug_list_files(self):
        """Print the local ~/.openclaw tree, truncated after about 50 entries."""
        print(f"[SYNC] Local ~/.openclaw tree:")
        try:
            count = 0
            for root, dirs, files in os.walk(OPENCLAW_HOME):
                # Prune bulky directories in place so os.walk does not descend into them.
                dirs[:] = [d for d in dirs if d not in {".cache", "node_modules", "__pycache__"}]
                for name in sorted(files):
                    rel = os.path.relpath(os.path.join(root, name), OPENCLAW_HOME)
                    print(f"[SYNC] {rel}")
                    count += 1
                    if count > 50:
                        print("[SYNC] ... (truncated)")
                        return
        except Exception as e:
            print(f"[SYNC] listing failed: {e}")

    # ── Background sync loop ──────────────────────────────────────────

    def background_sync_loop(self, stop_event):
        """Call save_to_repo every SYNC_INTERVAL seconds until ``stop_event`` is set."""
        print(f"[SYNC] Background sync started (interval={SYNC_INTERVAL}s)")
        while not stop_event.is_set():
            # wait() returns True as soon as the event is set, which ends the
            # loop promptly instead of sleeping out the full interval.
            if stop_event.wait(timeout=SYNC_INTERVAL):
                break
            print(f"[SYNC] ── Periodic sync triggered at {datetime.now().isoformat()} ──")
            self.save_to_repo()

    # ── Application runner ─────────────────────────────────────────────

    def run_openclaw(self):
        """Start the OpenClaw gateway and return its ``subprocess.Popen``.

        The node process is started directly (no shell pipeline), so the
        returned object's exit code and ``terminate()`` refer to node itself.
        Its combined stdout/stderr is copied to the console and appended to
        ``workspace/startup.log`` by a background thread.

        Raises:
            OSError: if the executable or working directory cannot be used.
        """
        log_file = OPENCLAW_HOME / "workspace" / "startup.log"
        log_file.parent.mkdir(parents=True, exist_ok=True)

        cmd = ["node", "dist/entry.js", "gateway"]
        print(f"[SYNC] Launching: {' '.join(cmd)} (output appended to {log_file})")
        process = subprocess.Popen(
            cmd,
            cwd=str(APP_DIR),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
        self._output_thread = threading.Thread(
            target=self._pump_output,
            args=(process.stdout, log_file),
            daemon=True,
        )
        self._output_thread.start()
        return process

    def wait_for_output(self, timeout=5):
        """Wait up to ``timeout`` seconds for the output pump to drain."""
        thread = getattr(self, "_output_thread", None)
        if thread is not None:
            thread.join(timeout=timeout)

    @staticmethod
    def _pump_output(pipe, log_file):
        """Copy bytes from ``pipe`` to the console and to ``log_file``.

        A failing sink is dropped rather than stopping the pump, so the child
        never blocks on a full pipe because logging broke.
        """
        # Write to the interpreter's original stdout, not the TeeLogger, so
        # application output is not duplicated into sync.log.
        console = getattr(sys.__stdout__, "buffer", None)
        try:
            log = open(log_file, "ab")
        except OSError as e:
            print(f"[SYNC] Cannot open {log_file}: {e}")
            log = None

        try:
            while True:
                chunk = pipe.read1(65536)
                if not chunk:
                    break
                if console is not None:
                    try:
                        console.write(chunk)
                        console.flush()
                    except (OSError, ValueError):
                        console = None
                if log is not None:
                    try:
                        log.write(chunk)
                        log.flush()
                    except OSError as e:
                        print(f"[SYNC] Writing {log_file} failed: {e}")
                        try:
                            log.close()
                        except OSError:
                            pass
                        log = None
        finally:
            pipe.close()
            if log is not None:
                log.close()


# ── Main ────────────────────────────────────────────────────────────────────

def main():
    """Restore state, run OpenClaw with periodic sync, and sync once more on exit."""
    sync = OpenClawFullSync()

    # 1. Restore
    sync.load_from_repo()

    # 2. Background sync
    stop_event = threading.Event()
    t = threading.Thread(target=sync.background_sync_loop, args=(stop_event,), daemon=True)
    t.start()

    # 3. Start application
    try:
        process = sync.run_openclaw()
    except OSError as e:
        print(f"[SYNC] Failed to launch OpenClaw: {e}")
        stop_event.set()
        sys.exit(127)

    # Signal handler
    def handle_signal(sig, frame):
        print(f"\n[SYNC] Signal {sig} received. Shutting down...")
        stop_event.set()
        process.terminate()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            process.kill()
            process.wait()  # reap the killed child
        sync.wait_for_output()
        print("[SYNC] Final sync...")
        sync.save_to_repo()
        sys.exit(0)

    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    # Wait
    exit_code = process.wait()
    sync.wait_for_output()
    print(f"[SYNC] OpenClaw exited with code {exit_code}")

    stop_event.set()
    print("[SYNC] Final sync...")
    sync.save_to_repo()
    sys.exit(exit_code)


if __name__ == "__main__":
    main()