"""Back up and restore a local data directory via a Hugging Face dataset repo.

Usage: python sync_data.py [download|upload_loop]

* ``download``    -- fetch ``ARCHIVE_NAME`` from the dataset repo (if present)
                     and unpack it into ``DATA_DIR``.
* ``upload_loop`` -- every ``SYNC_INTERVAL`` seconds, pack ``DATA_DIR`` into a
                     gzip tarball and upload it; one more upload is attempted
                     when SIGTERM or SIGINT is received.

Configuration is read from the environment variables ``DATA_DIR``,
``DATASET_REPO_ID``, ``SYNC_INTERVAL`` and ``HF_TOKEN``.
"""

import os
import shutil
import signal
import sys
import tarfile
import time
from datetime import datetime
from pathlib import Path

from huggingface_hub import HfApi, hf_hub_download, list_repo_files

DATA_DIR = Path(os.environ.get("DATA_DIR", "/root/.openclaw"))
DATASET_REPO_ID = os.environ.get("DATASET_REPO_ID", "your-username/your-dataset")
SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL", 300))
HF_TOKEN = os.environ.get("HF_TOKEN")
ARCHIVE_NAME = "openclaw_backup.tar.gz"
# Top-level entries of DATA_DIR that are never included in the backup.
EXCLUDE_FILES = {"openclaw.json"}

api = HfApi(token=HF_TOKEN)
# Loop flag for upload_loop(); cleared by signal_handler().
running = True


def _log(message: str) -> None:
    """Print *message* prefixed with the current local timestamp."""
    print(f"[{datetime.now()}] {message}")


def _remove_temp_file(path) -> None:
    """Delete a temporary file, tolerating it being absent already."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as exc:
        # Cleanup is best-effort: report it, but never fail the sync over it.
        _log(f"Could not remove temporary file {path}: {exc}")


def signal_handler(signum, frame):
    """Handle SIGTERM/SIGINT: attempt one last upload, then exit with code 0.

    Args:
        signum: Number of the received signal.
        frame: Interrupted stack frame (unused).
    """
    global running
    _log(f"Received signal {signum}, uploading before exit...")
    # Clear the flag first so the loop cannot start another cycle even if the
    # final upload is slow.
    running = False
    upload_to_dataset()
    sys.exit(0)


def download_from_dataset() -> None:
    """Restore ``DATA_DIR`` from the backup archive in the dataset repo.

    Creates ``DATA_DIR`` if needed. When the repo has no ``ARCHIVE_NAME`` the
    directory is left as is. Download and extraction errors are logged, not
    raised, so the caller can continue with whatever data is present.
    """
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    archive_path = None
    try:
        files = list_repo_files(
            repo_id=DATASET_REPO_ID, repo_type="dataset", token=HF_TOKEN
        )
        if ARCHIVE_NAME not in files:
            _log("No backup found, starting fresh")
            return

        _log(f"Downloading {ARCHIVE_NAME}...")
        archive_path = hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename=ARCHIVE_NAME,
            repo_type="dataset",
            local_dir="/tmp",
            token=HF_TOKEN,
        )
        # NOTE(review): the archive is extracted without an extraction filter;
        # this is only safe as long as the dataset repo content is trusted.
        shutil.unpack_archive(archive_path, DATA_DIR)
        _log(f"Data restored to {DATA_DIR}")
    except Exception as e:
        _log(f"Download failed: {e}")
    finally:
        # Remove the downloaded archive on failure too, not just on success.
        if archive_path is not None:
            _remove_temp_file(archive_path)


def upload_to_dataset() -> None:
    """Pack ``DATA_DIR`` into a gzip tarball and upload it to the dataset repo.

    Top-level entries named in ``EXCLUDE_FILES`` are skipped. All failures
    (including an unreadable or missing ``DATA_DIR``) are logged rather than
    raised, so neither the periodic loop nor the signal handler is aborted.
    """
    try:
        files_to_backup = [
            entry for entry in DATA_DIR.iterdir() if entry.name not in EXCLUDE_FILES
        ]
    except OSError as e:
        # Previously this escaped the function and crashed the sync loop.
        _log(f"Upload failed: {e}")
        return

    if not files_to_backup:
        _log("No files to upload")
        return

    archive_path = Path("/tmp") / ARCHIVE_NAME
    try:
        with tarfile.open(archive_path, "w:gz") as tar:
            for file_path in files_to_backup:
                # Store entries relative to DATA_DIR so they restore in place.
                tar.add(file_path, arcname=file_path.name)
        _log(f"Uploading {ARCHIVE_NAME}...")
        api.upload_file(
            path_or_fileobj=str(archive_path),
            path_in_repo=ARCHIVE_NAME,
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
        )
        _log("Upload completed")
    except Exception as e:
        _log(f"Upload failed: {e}")
    finally:
        # Never leave the (possibly large) tarball behind in /tmp.
        _remove_temp_file(archive_path)


def upload_loop() -> None:
    """Install the signal handlers and upload every ``SYNC_INTERVAL`` seconds.

    The loop sleeps first, so the first upload happens one interval after
    start. It ends when ``signal_handler`` exits the process.
    """
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)
    while running:
        time.sleep(SYNC_INTERVAL)
        upload_to_dataset()


def main() -> int:
    """Dispatch on the command-line sub-command and return the exit code."""
    if len(sys.argv) < 2:
        print("Usage: python sync_data.py [download|upload_loop]")
        return 1

    cmd = sys.argv[1]
    print(f"Dataset: {DATASET_REPO_ID}")
    print(f"Data dir: {DATA_DIR}")

    if cmd == "download":
        download_from_dataset()
    elif cmd == "upload_loop":
        print(f"Sync interval: {SYNC_INTERVAL}s")
        upload_loop()
    else:
        print(f"Unknown command: {cmd}")
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())