#!/usr/bin/env python3
"""
文件同步脚本：将 /data/workspace 的文件同步到 Hugging Face Dataset
用于持久化存储 ClawDBot 生成的文件

使用方法：
1. 创建一个 Dataset: huggingface-cli repo create clawdbot-data --type dataset
2. 设置环境变量: HF_DATASET_REPO=acpr123/clawdbot-data
3. 在 entrypoint.sh 中添加定期运行此脚本的 cron job
"""
import os
import time
from pathlib import Path
from huggingface_hub import HfApi, CommitOperationAdd

def sync_workspace_to_hf(
    workspace_dir: "os.PathLike | str | None" = None,
    dataset_repo: "str | None" = None,
) -> None:
    """Upload every file under the workspace directory to a Hugging Face dataset.

    All collected files are pushed in a single commit. The sync is best-effort:
    missing configuration, a missing or empty workspace, files that cannot be
    staged, and a failed commit are reported on stdout instead of being raised.

    Args:
        workspace_dir: Directory whose contents are synced. Defaults to
            ``/data/workspace``.
        dataset_repo: Target dataset repo id. Defaults to the
            ``HF_DATASET_REPO`` environment variable, falling back to
            ``acpr123/clawdbot-data``.

    The access token is read from ``HF_TOKEN`` or, failing that,
    ``HUGGINGFACE_API_KEY``; the sync is skipped when neither is set.
    """
    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
    if dataset_repo is None:
        dataset_repo = os.environ.get("HF_DATASET_REPO", "acpr123/clawdbot-data")
    workspace_dir = Path("/data/workspace" if workspace_dir is None else workspace_dir)

    if not hf_token:
        print("⚠️  未配置 HF_TOKEN，跳过文件同步")
        return

    # is_dir() rather than exists(): a regular file at this path is not a
    # workspace that can be walked.
    if not workspace_dir.is_dir():
        print(f"⚠️  Workspace 目录不存在: {workspace_dir}")
        return

    # Stage one "add" operation per regular file found under the workspace.
    operations = []
    for file_path in workspace_dir.rglob("*"):
        if not file_path.is_file():
            continue
        relative_path = file_path.relative_to(workspace_dir)

        # Git metadata of checkouts inside the workspace is not user content,
        # and the Hub does not accept files under a ".git" folder.
        if ".git" in relative_path.parts:
            continue

        try:
            operations.append(
                CommitOperationAdd(
                    # Repo paths always use "/" regardless of the host OS.
                    path_in_repo=relative_path.as_posix(),
                    path_or_fileobj=str(file_path),
                )
            )
        except (ValueError, OSError) as e:
            # NOTE(review): CommitOperationAdd appears to validate its
            # arguments on construction (reserved folders, file vanished since
            # listing, unreadable file). Skip that one file instead of
            # aborting the whole sync.
            print(f"⚠️  跳过无法上传的文件 {relative_path}: {e}")

    if not operations:
        print("📁 Workspace 为空，无需同步")
        return

    # Top-level boundary of a periodically-run job: report any upload failure
    # and return normally so the next scheduled run can retry.
    try:
        api = HfApi()
        api.create_commit(
            repo_id=dataset_repo,
            repo_type="dataset",
            operations=operations,
            commit_message=f"Sync workspace files at {time.strftime('%Y-%m-%d %H:%M:%S')}",
            token=hf_token,
        )
        print(f"✅ 已同步 {len(operations)} 个文件到 {dataset_repo}")
    except Exception as e:
        print(f"❌ 同步失败: {e}")

# Script entry point: when executed directly (not imported), print a start
# banner and run a single sync pass with the default settings.
if __name__ == "__main__":
    print("🔄 开始同步文件到 Hugging Face Dataset...")
    sync_workspace_to_hf()