Update sync_data.sh

sync_data.sh  (+38 -51)  CHANGED
@@ -92,15 +92,10 @@ python3 -c "
 from huggingface_hub import HfApi
 try:
     api = HfApi(token='$HF_TOKEN')
-
-    # Create a local test file
     with open('$TEMP_DIR/test_file', 'w') as f:
         f.write('test')
-
-    # Upload the test file
     test_file_name = '$TEST_FILE_NAME'
     print(f'Uploading test file: {test_file_name}')
-
     api.upload_file(
         path_or_fileobj='$TEMP_DIR/test_file',
         path_in_repo=test_file_name,
@@ -108,8 +103,6 @@ try:
         repo_type='dataset'
     )
     print('Test file uploaded to the Dataset successfully')
-
-    # Delete the uploaded test file
     print('Deleting test file...')
     api.delete_file(
         path_in_repo=test_file_name,
@@ -117,7 +110,6 @@ try:
         repo_type='dataset'
     )
     print('Test file deleted successfully')
-
 except Exception as e:
     print(f'Dataset permission test failed: {str(e)}')
     exit(1)
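The hunks above only trim blank lines and comments around the write-permission probe, which uploads a throwaway file to the dataset repo and deletes it again. A minimal standalone sketch of that probe (the token source, repo ID, and probe file name are placeholders, not values from this script):

# Sketch: verify write access to a dataset repo by uploading and deleting a tiny file.
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
repo_id = "user/my-dataset"  # placeholder

probe = "/tmp/permission_probe.txt"
with open(probe, "w") as f:
    f.write("test")

api.upload_file(path_or_fileobj=probe, path_in_repo="permission_probe.txt",
                repo_id=repo_id, repo_type="dataset")
api.delete_file(path_in_repo="permission_probe.txt", repo_id=repo_id, repo_type="dataset")
print("Write permission confirmed")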
@@ -135,14 +127,11 @@ rm -f "$TEMP_DIR/test_file"
 upload_backup() {
     file_path="$1"
     file_name="$2"
-
     if [ ! -f "$file_path" ]; then
         log_error "Backup file does not exist: $file_path"
         return 1
     fi
-
     log_info "Starting backup upload: $file_name ($(du -h $file_path | cut -f1))"
-
     python3 -c "
 from huggingface_hub import HfApi
 import sys
@@ -153,7 +142,6 @@ def manage_backups(api, repo_id, max_files=10):
     files = api.list_repo_files(repo_id=repo_id, repo_type='dataset')
     backup_files = [f for f in files if f.startswith('sillytavern_backup_') and f.endswith('.tar.gz')]
     backup_files.sort()
-
     if len(backup_files) >= max_files:
         files_to_delete = backup_files[:(len(backup_files) - max_files + 1)]
         for file_to_delete in files_to_delete:
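For context, `manage_backups` (only partially visible in this hunk) implements a simple retention policy: keep at most `max_files` timestamped archives in the dataset repo and delete the oldest ones first. A standalone sketch of that policy, with `repo_id` as a placeholder:

# Sketch: retention policy for timestamped backup archives in a dataset repo.
from huggingface_hub import HfApi

def manage_backups(api: HfApi, repo_id: str, max_files: int = 10) -> None:
    files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    backups = sorted(f for f in files
                     if f.startswith("sillytavern_backup_") and f.endswith(".tar.gz"))
    # Timestamped names sort chronologically, so the oldest archives come first.
    excess = len(backups) - max_files + 1
    for old in backups[:max(excess, 0)]:
        api.delete_file(path_in_repo=old, repo_id=repo_id, repo_type="dataset")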
@@ -168,33 +156,24 @@ token='$HF_TOKEN'
 repo_id='$DATASET_ID'
 try:
     api = HfApi(token=token)
-
-    # Check the file size
     file_size = os.path.getsize('$file_path')
     print(f'Backup file size: {file_size / (1024*1024):.2f} MB')
-    # Confirm the Dataset exists
     try:
         dataset_info = api.dataset_info(repo_id=repo_id)
         print(f'Dataset info: {dataset_info.id}')
     except Exception as e:
         print(f'Failed to get Dataset info: {str(e)}')
-
     start_time = time.time()
     print(f'Upload started at: {start_time}')
-
-    # Upload the file
     api.upload_file(
         path_or_fileobj='$file_path',
         path_in_repo='$file_name',
         repo_id=repo_id,
         repo_type='dataset'
     )
-
     end_time = time.time()
     print(f'Upload finished in {end_time - start_time:.2f} seconds')
     print(f'Successfully uploaded $file_name')
-
-    # Manage existing backups
     manage_backups(api, repo_id)
 except Exception as e:
     print(f'Error while uploading file: {str(e)}')
@@ -212,7 +191,6 @@ except Exception as e:
 # Download the latest backup
 download_latest_backup() {
     log_info "Starting download of the latest backup..."
-
     python3 -c "
 from huggingface_hub import HfApi
 import sys
@@ -223,33 +201,23 @@ import time
 try:
     api = HfApi(token='$HF_TOKEN')
     print('API instance created')
-
-    # List repository files
     try:
         files = api.list_repo_files(repo_id='$DATASET_ID', repo_type='dataset')
         print(f'Number of files in repo: {len(files)}')
     except Exception as e:
         print(f'Failed to list repo files: {str(e)}')
         sys.exit(1)
-
     backup_files = [f for f in files if f.startswith('sillytavern_backup_') and f.endswith('.tar.gz')]
     print(f'Number of backup files found: {len(backup_files)}')
-
     if not backup_files:
         print('No backup files found')
         sys.exit(0)
-
-    # Sort by name (which is effectively sorting by timestamp)
     latest_backup = sorted(backup_files)[-1]
     print(f'Latest backup file: {latest_backup}')
-
     with tempfile.TemporaryDirectory() as temp_dir:
         print(f'Created temporary directory: {temp_dir}')
-
         start_time = time.time()
         print(f'Download started at: {start_time}')
-
-        # Download the file
         try:
             filepath = api.hf_hub_download(
                 repo_id='$DATASET_ID',
@@ -261,18 +229,11 @@ try:
         except Exception as e:
             print(f'Failed to download file: {str(e)}')
             sys.exit(1)
-
         end_time = time.time()
         print(f'Download finished in {end_time - start_time:.2f} seconds')
-
         if filepath and os.path.exists(filepath):
-            # Make sure the target directory exists
             os.makedirs('$DATA_DIR', exist_ok=True)
-
-            # Check the file permissions
             print(f'File permissions: {oct(os.stat(filepath).st_mode)[-3:]}')
-
-            # Extract the archive
             try:
                 with tarfile.open(filepath, 'r:gz') as tar:
                     print('Starting extraction...')
@@ -281,7 +242,6 @@ try:
             except Exception as e:
                 print(f'Failed to extract file: {str(e)}')
                 sys.exit(1)
-
             print(f'Successfully restored backup from {latest_backup}')
         else:
             print('Downloaded file path is invalid')
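The download hunks above make up the restore path: list the repo, pick the newest `sillytavern_backup_*.tar.gz`, download it, and unpack it into the data directory. Condensed into a standalone sketch (the repo ID and target directory are placeholders):

# Sketch: restore the newest backup archive from a dataset repo into a local directory.
import os
import tarfile
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
repo_id = "user/my-dataset"        # placeholder
target_dir = "/data/sillytavern"   # placeholder

files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
backups = sorted(f for f in files
                 if f.startswith("sillytavern_backup_") and f.endswith(".tar.gz"))
if backups:
    latest = backups[-1]
    path = api.hf_hub_download(repo_id=repo_id, repo_type="dataset", filename=latest)
    os.makedirs(target_dir, exist_ok=True)
    with tarfile.open(path, "r:gz") as tar:
        tar.extractall(target_dir)
    print(f"Restored {latest} into {target_dir}")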
@@ -306,35 +266,63 @@ download_latest_backup
 # Sync function
 sync_data() {
     log_info "Data sync service started"
-
     while true; do
         log_info "Starting sync run at: $(date)"
-
         if [ -d "$DATA_DIR" ]; then
             timestamp=$(date +%Y%m%d_%H%M%S)
             backup_file="sillytavern_backup_${timestamp}.tar.gz"
             backup_path="${TEMP_DIR}/${backup_file}"
-
             log_info "Creating backup file: $backup_path"
-
-            # Check the contents of the data directory
             file_count=$(find "$DATA_DIR" -type f | wc -l)
             log_info "Number of files in data directory: $file_count"
-
             if [ "$file_count" -eq 0 ]; then
                 log_info "Data directory is empty, skipping backup"
             else
-                # Compress the data directory
                 tar -czf "$backup_path" -C "$DATA_DIR" .
                 if [ $? -ne 0 ]; then
                     log_error "Failed to create archive"
                 else
                     log_info "Archive created successfully: $(du -h $backup_path | cut -f1)"
-
                     # Upload the backup
                     log_info "Uploading backup to HuggingFace..."
                     upload_backup "$backup_path" "$backup_file"
-
+
+                    # Squash commit history once every 7 days
+                    SQUASH_FLAG_FILE="/tmp/last_squash_time"
+                    NOW=$(date +%s)
+                    SEVEN_DAYS=$((7*24*60*60))
+                    if [ ! -f "$SQUASH_FLAG_FILE" ]; then
+                        echo $NOW > "$SQUASH_FLAG_FILE"
+                        log_info "Squashing commit history for the first time..."
+                        python3 -c "
+from huggingface_hub import HfApi
+try:
+    api = HfApi(token='$HF_TOKEN')
+    api.super_squash_history(repo_id='$DATASET_ID', repo_type='dataset')
+    print('History squash completed.')
+except Exception as e:
+    print(f'Error while squashing history: {str(e)}')
+"
+                    else
+                        LAST=$(cat "$SQUASH_FLAG_FILE")
+                        DIFF=$((NOW - LAST))
+                        if [ $DIFF -ge $SEVEN_DAYS ]; then
+                            echo $NOW > "$SQUASH_FLAG_FILE"
+                            log_info "More than 7 days since the last squash, squashing commit history..."
+                            python3 -c "
+from huggingface_hub import HfApi
+try:
+    api = HfApi(token='$HF_TOKEN')
+    api.super_squash_history(repo_id='$DATASET_ID', repo_type='dataset')
+    print('History squash completed.')
+except Exception as e:
+    print(f'Error while squashing history: {str(e)}')
+"
+                        else
+                            log_info "Less than 7 days since the last squash, skipping history squash this time."
+                        fi
+                    fi
+
                     # Delete the temporary backup file
                     rm -f "$backup_path"
                     log_info "Temporary backup file deleted"
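The main addition in this commit is the history squash: every backup upload creates a new commit in the dataset repo, so the script now calls `HfApi.super_squash_history` at most once every 7 days (gated by a flag file in /tmp) to collapse the accumulated commits into one. The call itself, stripped of the shell wrapper and with the token source and repo ID as placeholders:

# Sketch: collapse a dataset repo's commit history into a single commit.
# super_squash_history requires a reasonably recent huggingface_hub release.
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
api.super_squash_history(repo_id="user/my-dataset", repo_type="dataset")
print("History squash completed.")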
@@ -345,7 +333,6 @@ sync_data() {
             mkdir -p "$DATA_DIR"
             chmod -R 777 "$DATA_DIR"
         fi
-
         # Set the sync interval
         SYNC_INTERVAL=${SYNC_INTERVAL:-3600}
         log_info "Next sync in ${SYNC_INTERVAL} seconds..."
@@ -354,4 +341,4 @@ sync_data() {
 }
 
 # Start the sync process
-sync_data
+sync_data