| | |
"""
File sync script: upload the files under /data/workspace to a Hugging Face Dataset.
Used to persist the files generated by ClawDBot.

Usage:
1. Create a dataset: huggingface-cli repo create clawdbot-data --type dataset
2. Set the environment variable: HF_DATASET_REPO=acpr123/clawdbot-data
   (HF_TOKEN or HUGGINGFACE_API_KEY must also be set, otherwise the sync is skipped)
3. Add a cron job to entrypoint.sh that runs this script periodically
"""
| | import os |
| | import time |
| | from pathlib import Path |
| | from huggingface_hub import HfApi, CommitOperationAdd |
| |
|
def sync_workspace_to_hf(workspace_dir="/data/workspace"):
    """Upload every file under the workspace directory to a Hugging Face dataset.

    Configuration is read from the environment:

    * ``HF_TOKEN`` (or ``HUGGINGFACE_API_KEY`` as a fallback): access token.
      When neither is set the sync is skipped.
    * ``HF_DATASET_REPO``: target dataset repo id. Falls back to
      ``acpr123/clawdbot-data`` when unset or empty.

    All files are sent in a single commit. This is a best-effort operation:
    upload failures are reported on stdout and never raised to the caller.

    Args:
        workspace_dir: Directory whose files are uploaded (searched
            recursively). Defaults to ``/data/workspace``.

    Returns:
        None.
    """
    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
    # `or` instead of a .get() default so that an empty HF_DATASET_REPO also
    # falls back to the default repo rather than producing an empty repo id.
    dataset_repo = os.environ.get("HF_DATASET_REPO") or "acpr123/clawdbot-data"
    workspace_dir = Path(workspace_dir)

    if not hf_token:
        print("⚠️ 未配置 HF_TOKEN,跳过文件同步")
        return

    # is_dir() rather than exists(): a regular file at this path is not a
    # usable workspace either.
    if not workspace_dir.is_dir():
        print(f"⚠️ Workspace 目录不存在: {workspace_dir}")
        return

    # Sorted so the operation list (and thus the commit) is deterministic.
    # as_posix() keeps repo paths forward-slashed regardless of the host OS.
    operations = [
        CommitOperationAdd(
            path_in_repo=file_path.relative_to(workspace_dir).as_posix(),
            path_or_fileobj=str(file_path),
        )
        for file_path in sorted(workspace_dir.rglob("*"))
        if file_path.is_file()
    ]

    if not operations:
        print("📁 Workspace 为空,无需同步")
        return

    try:
        api = HfApi()
        api.create_commit(
            repo_id=dataset_repo,
            repo_type="dataset",
            operations=operations,
            commit_message=f"Sync workspace files at {time.strftime('%Y-%m-%d %H:%M:%S')}",
            token=hf_token,
        )
    except Exception as e:
        # Deliberately broad: a failed sync must not crash the periodic job.
        print(f"❌ 同步失败: {e}")
    else:
        print(f"✅ 已同步 {len(operations)} 个文件到 {dataset_repo}")
| |
|
if __name__ == "__main__":
    # Script entry point: announce the run, then perform one sync pass.
    print("🔄 开始同步文件到 Hugging Face Dataset...")
    sync_workspace_to_hf()
| |
|