clawdbot / sync_files.py
acpr123's picture
Upload sync_files.py with huggingface_hub
546d015 verified
#!/usr/bin/env python3
"""
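File sync script: uploads the files under /data/workspace to a Hugging Face Dataset
so that files generated by ClawDBot are stored persistently.
Usage:
1. Create a dataset: huggingface-cli repo create clawdbot-data --type dataset
2. Set the environment variables: HF_DATASET_REPO=acpr123/clawdbot-data, plus HF_TOKEN (or HUGGINGFACE_API_KEY) - without a token the sync is skipped
3. Add a cron job in entrypoint.sh that runs this script periodically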
"""
import os
import time
from pathlib import Path
from huggingface_hub import HfApi, CommitOperationAdd
def sync_workspace_to_hf():
    """Upload every file under /data/workspace to a Hugging Face dataset repo.

    Configuration is read from the environment:

    * ``HF_TOKEN`` (falling back to ``HUGGINGFACE_API_KEY``): access token.
      When neither is set (or both are empty) the sync is skipped.
    * ``HF_DATASET_REPO``: target dataset repo id; defaults to
      ``acpr123/clawdbot-data``.

    All regular files found recursively under the workspace are sent in one
    ``create_commit`` call, each stored under its path relative to the
    workspace root. Skipped runs (no token, missing directory, empty
    workspace) and upload failures are reported on stdout; exceptions raised
    by the upload are caught and printed rather than propagated.

    Returns:
        None.
    """
    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
    dataset_repo = os.environ.get("HF_DATASET_REPO", "acpr123/clawdbot-data")
    workspace_dir = Path("/data/workspace")

    if not hf_token:
        print("⚠️ 未配置 HF_TOKEN,跳过文件同步")
        return

    if not workspace_dir.exists():
        print(f"⚠️ Workspace 目录不存在: {workspace_dir}")
        return

    # Build one "add" operation per regular file (directories are skipped).
    operations = [
        CommitOperationAdd(
            # Repo paths are forward-slash separated. str() would produce
            # backslashes on Windows; as_posix() is identical to str() on
            # POSIX systems and correct everywhere.
            path_in_repo=file_path.relative_to(workspace_dir).as_posix(),
            path_or_fileobj=str(file_path),
        )
        for file_path in workspace_dir.rglob("*")
        if file_path.is_file()
    ]

    if not operations:
        print("📁 Workspace 为空,无需同步")
        return

    # Best-effort upload: this runs unattended (e.g. from cron), so a failed
    # sync is reported and swallowed instead of crashing the caller.
    try:
        api = HfApi()
        api.create_commit(
            repo_id=dataset_repo,
            repo_type="dataset",
            operations=operations,
            commit_message=f"Sync workspace files at {time.strftime('%Y-%m-%d %H:%M:%S')}",
            token=hf_token,
        )
        print(f"✅ 已同步 {len(operations)} 个文件到 {dataset_repo}")
    except Exception as e:
        print(f"❌ 同步失败: {e}")
if __name__ == "__main__":
    # Script entry point: announce the run on stdout, then perform a single
    # sync pass. Scheduling (e.g. a cron job) is left to the caller.
    print("🔄 开始同步文件到 Hugging Face Dataset...")
    sync_workspace_to_hf()