# openclaw-Adam / scripts / save_to_dataset.py
# feat: Migrate dataset backups to gzipped tar archives and ensure
# backward compatibility during restore and rotation. (commit 972704a)
import os
import shutil
import sys
import tarfile
import tempfile
import time
from datetime import datetime

from huggingface_hub import HfApi
def _wa_creds_ok(state_dir: str) -> bool:
    """Sanity-check WhatsApp credentials before backing up.

    Returns False when the credentials directory exists but holds fewer
    than 2 files — the minimum is creds.json plus key material, so fewer
    indicates an empty/broken state that should not overwrite a good
    backup. A missing directory is allowed (user may not have logged in).
    """
    wa_creds_dir = os.path.join(state_dir, "credentials", "whatsapp", "default")
    if not os.path.isdir(wa_creds_dir):
        return True
    file_count = len(
        [f for f in os.listdir(wa_creds_dir) if os.path.isfile(os.path.join(wa_creds_dir, f))]
    )
    if file_count < 2:
        # Lowered from 10 to 2 to be less aggressive but still catch
        # empty/broken states.
        print(
            f"[save_to_dataset] Skip: WhatsApp credentials incomplete ({file_count} files).",
            file=sys.stderr,
        )
        return False
    return True


def _sync_system_logs(state_dir: str) -> None:
    """Best-effort copy of /home/node/logs into the state dir.

    Replaces any previous snapshot so the system logs ride along with
    the backup. All failures are non-fatal: a missing log tree must not
    block the state backup itself.
    """
    try:
        sys_log_path = "/home/node/logs"
        backup_log_path = os.path.join(state_dir, "logs/sys_logs")
        if os.path.exists(sys_log_path):
            if os.path.exists(backup_log_path):
                shutil.rmtree(backup_log_path)
            # ignore_dangling_symlinks: log dirs can contain stale links.
            shutil.copytree(sys_log_path, backup_log_path, ignore_dangling_symlinks=True)
            print(f"[save_to_dataset] Synced logs from {sys_log_path} to {backup_log_path}")
    except Exception as e:
        print(f"[save_to_dataset] Warning: Failed to sync logs: {e}")


def _rotate_backups(api, repo_id: str, token: str, keep: int = 5) -> None:
    """Delete all but the newest `keep` backups in the dataset repo.

    Matches both .tar and .tar.gz names for backward compatibility
    during the archive-format transition. Failures are non-fatal — the
    fresh backup was already uploaded by the caller.
    """
    try:
        files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        backups = sorted(
            f
            for f in files
            if f.startswith("state/backup-") and f.endswith((".tar", ".tar.gz"))
        )
        if len(backups) > keep:
            to_delete = backups[:-keep]
            print(f"[save_to_dataset] Rotating backups, deleting: {to_delete}")
            for old_backup in to_delete:
                api.delete_file(
                    path_in_repo=old_backup,
                    repo_id=repo_id,
                    repo_type="dataset",
                    token=token,
                )
    except Exception as e:
        print(f"[save_to_dataset] Rotation failed (non-fatal): {e}", file=sys.stderr)


def main() -> None:
    """
    Backs up ~/.openclaw to Hugging Face Dataset with rolling history.
    Keeps the last 5 backups to prevent data loss from corruption.
    Env vars:
    - HF_TOKEN
    - OPENCLAW_DATASET_REPO
    """
    repo_id = os.environ.get("OPENCLAW_DATASET_REPO")
    token = os.environ.get("HF_TOKEN")
    state_dir = os.path.expanduser("~/.openclaw")
    if not repo_id or not token:
        print("[save_to_dataset] Missing configuration.", file=sys.stderr)
        return
    if not os.path.isdir(state_dir):
        print("[save_to_dataset] No state to save.", file=sys.stderr)
        return

    # 1. Validation: ensure we have valid credentials before backing up,
    # so a corrupt state never overwrites a good backup.
    if not _wa_creds_ok(state_dir):
        return

    api = HfApi(token=token)

    # Sync system logs to state dir for persistence (best-effort).
    _sync_system_logs(state_dir)

    # Informational check only — a missing file just means the user may
    # still need to log in; the backup proceeds either way.
    creds_path = os.path.join(state_dir, "credentials/whatsapp/default/auth_info_multi.json")
    if os.path.exists(creds_path):
        print(f"[save_to_dataset] ✅ WhatsApp credentials found at {creds_path}")
    else:
        print(f"[save_to_dataset] ⚠️ WhatsApp credentials NOT found (user might need to login)")

    # Timestamped filename gives the repo a rolling backup history.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"state/backup-{timestamp}.tar.gz"

    with tempfile.TemporaryDirectory() as tmpdir:
        tar_path = os.path.join(tmpdir, "openclaw.tar.gz")
        try:
            with tarfile.open(tar_path, "w:gz") as tf:

                def exclude_filter(info: tarfile.TarInfo) -> tarfile.TarInfo | None:
                    # Drop transient lock files; everything else is archived.
                    if info.name.endswith(".lock"):
                        return None
                    return info

                tf.add(state_dir, arcname=".", filter=exclude_filter)
        except Exception as e:
            print(f"[save_to_dataset] Failed to compress: {e}", file=sys.stderr)
            return

        # Bug fix: this log line previously printed a literal "(unknown)"
        # instead of the archive path being uploaded.
        print(f"[save_to_dataset] Uploading backup: {filename}")
        try:
            # The upload must stay inside the TemporaryDirectory context:
            # tar_path is deleted the moment the context manager exits.
            api.upload_file(
                path_or_fileobj=tar_path,
                path_in_repo=filename,
                repo_id=repo_id,
                repo_type="dataset",
            )
        except Exception as e:
            print(f"[save_to_dataset] Upload failed: {e}", file=sys.stderr)
            return

    # 2. Rotation: delete old backups, keep last 5.
    _rotate_backups(api, repo_id, token)