paperclip

Runtime error

App Files Files Community

paperclip / sync_files.py

Wenluo

Update sync_files.py

1fe0252 verified about 1 month ago

raw

history blame contribute delete

2.03 kB

	#!/usr/bin/env python3
	"""
	文件同步脚本：将 /data/workspace 的文件同步到 Hugging Face Dataset
	用于持久化存储 ClawDBot 生成的文件

	使用方法：
	1. 创建一个 Dataset: huggingface-cli repo create clawdbot-data --type dataset
	2. 设置环境变量: HF_DATASET_REPO=acpr123/clawdbot-data
	3. 在 entrypoint.sh 中添加定期运行此脚本的 cron job
	"""
	import os
	import time
	from pathlib import Path
	from huggingface_hub import HfApi, CommitOperationAdd

	def sync_workspace_to_hf():
	    """Upload every file under /data/workspace to a Hugging Face dataset repo.

	    Configuration is read from the environment:

	    * ``HF_TOKEN`` (falling back to ``HUGGINGFACE_API_KEY``): access token.
	      When neither is set, a notice is printed and nothing is uploaded.
	    * ``HF_DATASET_REPO``: target dataset repo id. Falls back to
	      ``Wenluo/paperclip`` when unset or empty.

	    All regular files found recursively under the workspace directory are
	    added in a single commit. Exceptions raised while creating the commit
	    are caught and printed instead of propagated.

	    Returns:
	        None.
	    """
	    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_API_KEY")
	    # Use `or` rather than a .get() default so that an empty HF_DATASET_REPO
	    # also falls back, mirroring how the token lookup treats empty values.
	    dataset_repo = os.environ.get("HF_DATASET_REPO") or "Wenluo/paperclip"
	    workspace_dir = Path("/data/workspace")

	    if not hf_token:
	        print("⚠️ 未配置 HF_TOKEN，跳过文件同步")
	        return

	    if not workspace_dir.exists():
	        print(f"⚠️ Workspace 目录不存在: {workspace_dir}")
	        return

	    # One add-operation per regular file; directories are skipped.
	    operations = [
	        CommitOperationAdd(
	            # Repo paths use "/" separators regardless of the host OS.
	            path_in_repo=file_path.relative_to(workspace_dir).as_posix(),
	            path_or_fileobj=str(file_path),
	        )
	        for file_path in workspace_dir.rglob("*")
	        if file_path.is_file()
	    ]

	    if not operations:
	        print("📁 Workspace 为空，无需同步")
	        return

	    try:
	        api = HfApi()
	        api.create_commit(
	            repo_id=dataset_repo,
	            repo_type="dataset",
	            operations=operations,
	            commit_message=f"Sync workspace files at {time.strftime('%Y-%m-%d %H:%M:%S')}",
	            token=hf_token,
	        )
	        print(f"✅ 已同步 {len(operations)} 个文件到 {dataset_repo}")
	    except Exception as e:
	        # Best-effort sync: report the failure but do not crash the caller.
	        print(f"❌ 同步失败: {e}")

	if __name__ == "__main__":
	    # Script entry point: announce the run, then perform a single sync pass.
	    print("🔄 开始同步文件到 Hugging Face Dataset...")
	    sync_workspace_to_hf()