claw / sync_data.py
1u's picture
Upload sync_data.py
f34c40b verified
import os
import sys
import shutil
import tarfile
import time
import signal
from pathlib import Path
from datetime import datetime
from huggingface_hub import HfApi, hf_hub_download, list_repo_files
# --- Configuration (read once from the environment at import time) ---------

# Local directory that is restored from, and archived into, the backup.
DATA_DIR = Path(os.getenv("DATA_DIR", "/root/.openclaw"))
# Dataset repository on the Hub that stores the backup archive.
DATASET_REPO_ID = os.getenv("DATASET_REPO_ID", "your-username/your-dataset")
# Seconds slept between two uploads in upload_loop().
SYNC_INTERVAL = int(os.getenv("SYNC_INTERVAL", "300"))
# Access token handed to every Hub call; None when the variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")

# File name of the archive, both in the repo and under /tmp.
ARCHIVE_NAME = "openclaw_backup.tar.gz"
# Top-level entries of DATA_DIR that are left out of the archive.
EXCLUDE_FILES = {"openclaw.json"}

# Shared Hub client used for uploads.
api = HfApi(token=HF_TOKEN)
# Loop flag read by upload_loop() and cleared by signal_handler().
running = True
def signal_handler(signum, frame):
    """Handle a termination signal: upload once more, then exit with status 0.

    Args:
        signum: Number of the signal that was delivered (echoed in the log).
        frame: Interrupted stack frame supplied by the signal machinery; unused.

    Raises:
        SystemExit: Always, with exit code 0, after the final upload attempt.
    """
    global running
    received_at = datetime.now()
    print(f"[{received_at}] Received signal {signum}, uploading before exit...")
    # Last chance to persist the data directory before the process goes away.
    upload_to_dataset()
    running = False
    raise SystemExit(0)
def download_from_dataset():
    """Restore DATA_DIR from the backup archive stored in the dataset repo.

    Creates DATA_DIR if needed, checks whether ARCHIVE_NAME exists in
    DATASET_REPO_ID, and if so downloads it into /tmp and unpacks it into
    DATA_DIR. When the archive is not present nothing is restored.

    Any failure while talking to the Hub or extracting is reported on stdout
    and swallowed, so the caller can continue with whatever is in DATA_DIR.
    """
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    try:
        files = list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset", token=HF_TOKEN)
        if ARCHIVE_NAME not in files:
            print(f"[{datetime.now()}] No backup found, starting fresh")
            return

        print(f"[{datetime.now()}] Downloading {ARCHIVE_NAME}...")
        archive_path = hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename=ARCHIVE_NAME,
            repo_type="dataset",
            local_dir="/tmp",
            token=HF_TOKEN,
        )
        try:
            shutil.unpack_archive(archive_path, DATA_DIR)
        finally:
            # Drop the downloaded copy even when extraction fails, so a
            # broken archive does not linger in /tmp.
            os.remove(archive_path)
        print(f"[{datetime.now()}] Data restored to {DATA_DIR}")
    except Exception as e:
        # Best-effort restore: report and carry on with the current DATA_DIR.
        print(f"[{datetime.now()}] Download failed: {e}")
def upload_to_dataset():
    """Archive DATA_DIR and upload it to the dataset repo as ARCHIVE_NAME.

    Every top-level entry of DATA_DIR whose name is not in EXCLUDE_FILES is
    added to a gzip-compressed tar under /tmp, which is then uploaded to
    DATASET_REPO_ID. If there is nothing to back up, no archive is created.

    Failures (including a missing or unreadable DATA_DIR) are reported on
    stdout and swallowed so a periodic caller keeps running. The temporary
    archive is removed whether or not the upload succeeded.
    """
    archive_path = Path("/tmp") / ARCHIVE_NAME
    try:
        # Listing happens inside the try: a missing/unreadable DATA_DIR is
        # reported like any other failure instead of propagating to callers.
        files_to_backup = [f for f in DATA_DIR.iterdir() if f.name not in EXCLUDE_FILES]
        if not files_to_backup:
            print(f"[{datetime.now()}] No files to upload")
            return

        with tarfile.open(archive_path, "w:gz") as tar:
            for file_path in files_to_backup:
                # Store entries relative to DATA_DIR so they unpack in place.
                tar.add(file_path, arcname=file_path.name)

        print(f"[{datetime.now()}] Uploading {ARCHIVE_NAME}...")
        api.upload_file(
            path_or_fileobj=str(archive_path),
            path_in_repo=ARCHIVE_NAME,
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
        )
        print(f"[{datetime.now()}] Upload completed")
    except Exception as e:
        print(f"[{datetime.now()}] Upload failed: {e}")
    finally:
        # Always clean up the temporary archive, also after a failed
        # tar/upload step; it may not exist if we returned early.
        try:
            archive_path.unlink()
        except FileNotFoundError:
            pass
        except OSError as e:
            print(f"[{datetime.now()}] Could not remove {archive_path}: {e}")
def upload_loop():
    """Upload the data directory every SYNC_INTERVAL seconds until stopped.

    Registers signal_handler for SIGTERM and SIGINT, then repeatedly sleeps
    for SYNC_INTERVAL seconds and uploads. The first upload therefore happens
    only after one full interval. The loop ends once ``running`` is false.
    """
    for signo in (signal.SIGTERM, signal.SIGINT):
        signal.signal(signo, signal_handler)

    while True:
        if not running:
            break
        time.sleep(SYNC_INTERVAL)
        upload_to_dataset()
if __name__ == "__main__":
    # Command-line entry point: expects exactly one sub-command,
    # either "download" (one-shot restore) or "upload_loop" (periodic backup).
    if not sys.argv[1:]:
        print("Usage: python sync_data.py [download|upload_loop]")
        sys.exit(1)

    cmd = sys.argv[1]

    # The effective configuration is echoed before the command is validated.
    print(f"Dataset: {DATASET_REPO_ID}")
    print(f"Data dir: {DATA_DIR}")

    if cmd not in ("download", "upload_loop"):
        print(f"Unknown command: {cmd}")
        sys.exit(1)

    if cmd == "download":
        download_from_dataset()
    else:
        print(f"Sync interval: {SYNC_INTERVAL}s")
        upload_loop()