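"""
File sync script: uploads the files under /data/workspace to a Hugging Face Dataset.
Used to persist the files generated by ClawDBot.

Usage:
1. Create a dataset: huggingface-cli repo create clawdbot-data --type dataset
2. Set the environment variable: HF_DATASET_REPO=acpr123/clawdbot-data
   (HF_TOKEN or HUGGINGFACE_API_KEY must also be set, otherwise the sync is skipped)
3. Add a cron job to entrypoint.sh that runs this script periodically
"""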
import os
import time
from pathlib import Path

from huggingface_hub import CommitOperationAdd, HfApi


def sync_workspace_to_hf() -> None:
    """Upload every file under /data/workspace to a Hugging Face dataset repo.

    Configuration is read from the environment:

    * ``HF_TOKEN`` (falling back to ``HUGGINGFACE_API_KEY``): access token.
      When neither is set the sync is skipped.
    * ``HF_DATASET_REPO``: target dataset repo id; defaults to
      ``Wenluo/paperclip``.

    All regular files found recursively are pushed in a single commit, each
    stored under its path relative to the workspace root. The sync is
    best-effort: a file that cannot be read is skipped with a warning, and a
    failure to create the commit is printed rather than raised.

    Returns:
        None. Progress and problems are reported on stdout.
    """
    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
    dataset_repo = os.environ.get("HF_DATASET_REPO", "Wenluo/paperclip")
    workspace_dir = Path("/data/workspace")

    if not hf_token:
        print("⚠️ 未配置 HF_TOKEN,跳过文件同步")
        return

    # is_dir() rather than exists(): a regular file at this path is not a
    # usable workspace either.
    if not workspace_dir.is_dir():
        print(f"⚠️ Workspace 目录不存在: {workspace_dir}")
        return

    operations = []
    for file_path in workspace_dir.rglob("*"):
        try:
            if not file_path.is_file():
                continue
            operations.append(
                CommitOperationAdd(
                    # Repo paths always use forward slashes, whatever the
                    # local OS separator is.
                    path_in_repo=file_path.relative_to(workspace_dir).as_posix(),
                    path_or_fileobj=str(file_path),
                )
            )
        except (OSError, ValueError) as e:
            # The workspace can change while it is being walked, and building
            # the operation may inspect the file on disk (assumption based on
            # huggingface_hub validating local paths — TODO confirm). Skip the
            # one bad file instead of aborting the whole sync.
            print(f"⚠️ 跳过无法读取的文件 {file_path}: {e}")

    if not operations:
        print("📁 Workspace 为空,无需同步")
        return

    try:
        api = HfApi()
        api.create_commit(
            repo_id=dataset_repo,
            repo_type="dataset",
            operations=operations,
            commit_message=f"Sync workspace files at {time.strftime('%Y-%m-%d %H:%M:%S')}",
            token=hf_token,
        )
    except Exception as e:
        # Top-level boundary of a periodic job: report and carry on so a
        # transient Hub/network error does not produce a traceback.
        print(f"❌ 同步失败: {e}")
    else:
        print(f"✅ 已同步 {len(operations)} 个文件到 {dataset_repo}")


if __name__ == "__main__":
    # Script entry point: announce the run, then perform a single sync pass.
    print("🔄 开始同步文件到 Hugging Face Dataset...")
    sync_workspace_to_hf()