#!/bin/sh
set -e
# Required environment variables
if [ -z "$HF_TOKEN" ] || [ -z "$DATASET_ID" ]; then
    echo "Please set the HF_TOKEN and DATASET_ID environment variables"
    exit 1
fi
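
# Example (placeholder values — substitute your own token and dataset repo):
#   export HF_TOKEN=hf_xxxxxxxxxxxxxxxx
#   export DATASET_ID=username/my-backup-dataset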
# Activate the Python virtual environment (adjust the path to match your setup)
. "$HOME/venv/bin/activate"
STORAGE_PATH="$HOME/app/data"
FLAG_FILE="$HOME/.hf_backup_first_done"
# Generate the hf_sync.py helper script
cat > hf_sync.py << 'EOL'
from huggingface_hub import HfApi
import sys, os, tarfile, tempfile
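
# Usage (as invoked by the shell wrapper below):
#   python hf_sync.py upload       <token> <repo_id> <file_path> <file_name>
#   python hf_sync.py download     <token> <repo_id> [extract_path]
#   python hf_sync.py super_squash <token> <repo_id>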
def manage_backups(api, repo_id, max_files=50):
    """Keep at most max_files backup archives in the dataset repo, deleting the oldest."""
    files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    backup_files = [f for f in files if f.startswith('backup_') and f.endswith('.tar.gz')]
    backup_files.sort()
    if len(backup_files) >= max_files:
        for file_to_delete in backup_files[:(len(backup_files) - max_files + 1)]:
            try:
                api.delete_file(path_in_repo=file_to_delete, repo_id=repo_id, repo_type="dataset")
                print(f'Deleted old backup: {file_to_delete}')
            except Exception as e:
                print(f'Error deleting {file_to_delete}: {str(e)}')
def upload_backup(file_path, file_name, token, repo_id):
    """Upload one backup archive, then prune old backups."""
    api = HfApi(token=token)
    try:
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=file_name,
            repo_id=repo_id,
            repo_type="dataset"
        )
        print(f"Successfully uploaded {file_name}")
        manage_backups(api, repo_id)
    except Exception as e:
        print(f"Error uploading file: {str(e)}")
def download_latest_backup(token, repo_id, extract_path):
    """Download the most recent backup archive and extract it to extract_path."""
    try:
        api = HfApi(token=token)
        files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        backup_files = [f for f in files if f.startswith('backup_') and f.endswith('.tar.gz')]
        if not backup_files:
            print("No backup files found")
            return
        # Timestamped names sort lexicographically, so the last one is the newest
        latest_backup = sorted(backup_files)[-1]
        with tempfile.TemporaryDirectory() as temp_dir:
            filepath = api.hf_hub_download(
                repo_id=repo_id,
                filename=latest_backup,
                repo_type="dataset",
                local_dir=temp_dir
            )
            if filepath and os.path.exists(filepath):
                with tarfile.open(filepath, 'r:gz') as tar:
                    tar.extractall(extract_path)
                print(f"Successfully restored backup: {latest_backup}")
    except Exception as e:
        print(f"Error downloading backup: {str(e)}")
def super_squash_history(token, repo_id):
    """Squash the dataset repo's commit history into a single commit."""
    try:
        api = HfApi(token=token)
        api.super_squash_history(repo_id=repo_id, repo_type="dataset")
        print("History squash complete.")
    except Exception as e:
        print(f"Error squashing history: {str(e)}")
if __name__ == "__main__":
    action = sys.argv[1]
    token = sys.argv[2]
    repo_id = sys.argv[3]
    if action == "upload":
        file_path = sys.argv[4]
        file_name = sys.argv[5]
        upload_backup(file_path, file_name, token, repo_id)
    elif action == "download":
        extract_path = sys.argv[4] if len(sys.argv) > 4 else '.'
        download_latest_backup(token, repo_id, extract_path)
    elif action == "super_squash":
        super_squash_history(token, repo_id)
EOL
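
# Shell helpers: package the data directory into a timestamped tar.gz and
# hand it to hf_sync.py for upload (with pruning) or restore.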
upload_backup() {
    if [ -d "${STORAGE_PATH}" ]; then
        echo "Uploading backup data..."
        timestamp=$(date +%Y%m%d_%H%M%S)
        backup_file="backup_${timestamp}.tar.gz"
        tar -czf "/tmp/${backup_file}" -C "$(dirname "${STORAGE_PATH}")" "$(basename "${STORAGE_PATH}")"
        python hf_sync.py upload "${HF_TOKEN}" "${DATASET_ID}" "/tmp/${backup_file}" "${backup_file}"
        rm -f "/tmp/${backup_file}"
    else
        echo "Data directory does not exist; cannot upload backup"
    fi
}
download_restore() {
    echo "Downloading the latest backup from HuggingFace and restoring it..."
    # The archive contains the top-level data directory, so extracting
    # under $HOME/app recreates $HOME/app/data
    python hf_sync.py download "${HF_TOKEN}" "${DATASET_ID}" "$HOME/app"
}
if [ ! -f "$FLAG_FILE" ]; then
    echo "First run: uploading initial backup..."
    upload_backup
    touch "$FLAG_FILE"
    echo "Initial backup complete; exiting."
    exit 0
fi
sync_data() {
    while true; do
        echo "Sync process started at $(date)"
        upload_backup
        download_restore
        # Squash the dataset's commit history at most once every 7 days
        SQUASH_FLAG_FILE="/tmp/last_squash_time"
        NOW=$(date +%s)
        SEVEN_DAYS=$((7*24*60*60))
        if [ ! -f "$SQUASH_FLAG_FILE" ]; then
            echo "$NOW" > "$SQUASH_FLAG_FILE"
            echo "First history squash..."
            python hf_sync.py super_squash "${HF_TOKEN}" "${DATASET_ID}"
        else
            LAST=$(cat "$SQUASH_FLAG_FILE")
            DIFF=$((NOW - LAST))
            if [ "$DIFF" -ge "$SEVEN_DAYS" ]; then
                echo "$NOW" > "$SQUASH_FLAG_FILE"
                echo "More than 7 days since last squash; squashing commit history..."
                python hf_sync.py super_squash "${HF_TOKEN}" "${DATASET_ID}"
            else
                echo "Less than 7 days since last squash; skipping."
            fi
        fi
        SYNC_INTERVAL=${SYNC_INTERVAL:-7200}
        echo "Next sync in ${SYNC_INTERVAL} seconds..."
        sleep "$SYNC_INTERVAL"
    done
}
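# Run the sync loop in the background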
sync_data &