HuggingFace0920 committed on
Commit
b200139
·
verified ·
1 Parent(s): 44eee85

Update sync_data.sh

Browse files
Files changed (1) hide show
  1. sync_data.sh +38 -51
sync_data.sh CHANGED
@@ -92,15 +92,10 @@ python3 -c "
92
  from huggingface_hub import HfApi
93
  try:
94
  api = HfApi(token='$HF_TOKEN')
95
-
96
- # 创建本地测试文件
97
  with open('$TEMP_DIR/test_file', 'w') as f:
98
  f.write('test')
99
-
100
- # 上传测试文件
101
  test_file_name = '$TEST_FILE_NAME'
102
  print(f'正在上传测试文件: {test_file_name}')
103
-
104
  api.upload_file(
105
  path_or_fileobj='$TEMP_DIR/test_file',
106
  path_in_repo=test_file_name,
@@ -108,8 +103,6 @@ try:
108
  repo_type='dataset'
109
  )
110
  print('成功上传测试文件到Dataset')
111
-
112
- # 删除已上传的测试文件
113
  print('正在删除测试文件...')
114
  api.delete_file(
115
  path_in_repo=test_file_name,
@@ -117,7 +110,6 @@ try:
117
  repo_type='dataset'
118
  )
119
  print('已成功删除测试文件')
120
-
121
  except Exception as e:
122
  print(f'Dataset权限测试失败: {str(e)}')
123
  exit(1)
@@ -135,14 +127,11 @@ rm -f "$TEMP_DIR/test_file"
135
  upload_backup() {
136
  file_path="$1"
137
  file_name="$2"
138
-
139
  if [ ! -f "$file_path" ]; then
140
  log_error "备份文件不存在: $file_path"
141
  return 1
142
  fi
143
-
144
  log_info "开始上传备份: $file_name ($(du -h $file_path | cut -f1))"
145
-
146
  python3 -c "
147
  from huggingface_hub import HfApi
148
  import sys
@@ -153,7 +142,6 @@ def manage_backups(api, repo_id, max_files=10):
153
  files = api.list_repo_files(repo_id=repo_id, repo_type='dataset')
154
  backup_files = [f for f in files if f.startswith('sillytavern_backup_') and f.endswith('.tar.gz')]
155
  backup_files.sort()
156
-
157
  if len(backup_files) >= max_files:
158
  files_to_delete = backup_files[:(len(backup_files) - max_files + 1)]
159
  for file_to_delete in files_to_delete:
@@ -168,33 +156,24 @@ token='$HF_TOKEN'
168
  repo_id='$DATASET_ID'
169
  try:
170
  api = HfApi(token=token)
171
-
172
- # 检查文件大小
173
  file_size = os.path.getsize('$file_path')
174
  print(f'备份文件大小: {file_size / (1024*1024):.2f} MB')
175
- # 确认Dataset存在
176
  try:
177
  dataset_info = api.dataset_info(repo_id=repo_id)
178
  print(f'Dataset信息: {dataset_info.id}')
179
  except Exception as e:
180
  print(f'获取Dataset信息失败: {str(e)}')
181
-
182
  start_time = time.time()
183
  print(f'开始上传: {start_time}')
184
-
185
- # 上传文件
186
  api.upload_file(
187
  path_or_fileobj='$file_path',
188
  path_in_repo='$file_name',
189
  repo_id=repo_id,
190
  repo_type='dataset'
191
  )
192
-
193
  end_time = time.time()
194
  print(f'上传完成,耗时: {end_time - start_time:.2f} 秒')
195
  print(f'成功上传 $file_name')
196
-
197
- # 管理备份
198
  manage_backups(api, repo_id)
199
  except Exception as e:
200
  print(f'上传文件时出错: {str(e)}')
@@ -212,7 +191,6 @@ except Exception as e:
212
  # 下载最新备份
213
  download_latest_backup() {
214
  log_info "开始下载最新备份..."
215
-
216
  python3 -c "
217
  from huggingface_hub import HfApi
218
  import sys
@@ -223,33 +201,23 @@ import time
223
  try:
224
  api = HfApi(token='$HF_TOKEN')
225
  print('已创建API实例')
226
-
227
- # 列出仓库文件
228
  try:
229
  files = api.list_repo_files(repo_id='$DATASET_ID', repo_type='dataset')
230
  print(f'仓库文件数量: {len(files)}')
231
  except Exception as e:
232
  print(f'列出仓库文件失败: {str(e)}')
233
  sys.exit(1)
234
-
235
  backup_files = [f for f in files if f.startswith('sillytavern_backup_') and f.endswith('.tar.gz')]
236
  print(f'找到备份文件数量: {len(backup_files)}')
237
-
238
  if not backup_files:
239
  print('未找到备份文件')
240
  sys.exit(0)
241
-
242
- # 按名称排序(实际上是按时间戳排序)
243
  latest_backup = sorted(backup_files)[-1]
244
  print(f'最新备份文件: {latest_backup}')
245
-
246
  with tempfile.TemporaryDirectory() as temp_dir:
247
  print(f'创建临时目录: {temp_dir}')
248
-
249
  start_time = time.time()
250
  print(f'开始下载: {start_time}')
251
-
252
- # 下载文件
253
  try:
254
  filepath = api.hf_hub_download(
255
  repo_id='$DATASET_ID',
@@ -261,18 +229,11 @@ try:
261
  except Exception as e:
262
  print(f'下载文件失败: {str(e)}')
263
  sys.exit(1)
264
-
265
  end_time = time.time()
266
  print(f'下载完成,耗时: {end_time - start_time:.2f} 秒')
267
-
268
  if filepath and os.path.exists(filepath):
269
- # 确保目标目录存在
270
  os.makedirs('$DATA_DIR', exist_ok=True)
271
-
272
- # 检查文件权限
273
  print(f'文件权限: {oct(os.stat(filepath).st_mode)[-3:]}')
274
-
275
- # 解压文件
276
  try:
277
  with tarfile.open(filepath, 'r:gz') as tar:
278
  print('开始解压文件...')
@@ -281,7 +242,6 @@ try:
281
  except Exception as e:
282
  print(f'解压文件失败: {str(e)}')
283
  sys.exit(1)
284
-
285
  print(f'成功从 {latest_backup} 恢复备份')
286
  else:
287
  print('下载的文件路径无效')
@@ -306,35 +266,63 @@ download_latest_backup
306
  # 同步函数
307
  sync_data() {
308
  log_info "数据同步服务已启动"
309
-
310
  while true; do
311
  log_info "开始同步进程,时间: $(date)"
312
-
313
  if [ -d "$DATA_DIR" ]; then
314
  timestamp=$(date +%Y%m%d_%H%M%S)
315
  backup_file="sillytavern_backup_${timestamp}.tar.gz"
316
  backup_path="${TEMP_DIR}/${backup_file}"
317
-
318
  log_info "创建备份文件: $backup_path"
319
-
320
- # 检查数据目录内容
321
  file_count=$(find "$DATA_DIR" -type f | wc -l)
322
  log_info "数据目录文件数量: $file_count"
323
-
324
  if [ "$file_count" -eq 0 ]; then
325
  log_info "数据目录为空,跳过备份"
326
  else
327
- # 压缩数据目录
328
  tar -czf "$backup_path" -C "$DATA_DIR" .
329
  if [ $? -ne 0 ]; then
330
  log_error "创建压缩文件失败"
331
  else
332
  log_info "压缩文件创建成功: $(du -h $backup_path | cut -f1)"
333
-
334
  # 上传备份
335
  log_info "正在上传备份到HuggingFace..."
336
  upload_backup "$backup_path" "$backup_file"
337
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  # 删除临时备份文件
339
  rm -f "$backup_path"
340
  log_info "已删除临时备份文件"
@@ -345,7 +333,6 @@ sync_data() {
345
  mkdir -p "$DATA_DIR"
346
  chmod -R 777 "$DATA_DIR"
347
  fi
348
-
349
  # 设置同步间隔
350
  SYNC_INTERVAL=${SYNC_INTERVAL:-3600}
351
  log_info "下次同步将在 ${SYNC_INTERVAL} 秒后进行..."
@@ -354,4 +341,4 @@ sync_data() {
354
  }
355
 
356
  # 启动同步进程
357
- sync_data
 
92
  from huggingface_hub import HfApi
93
  try:
94
  api = HfApi(token='$HF_TOKEN')
 
 
95
  with open('$TEMP_DIR/test_file', 'w') as f:
96
  f.write('test')
 
 
97
  test_file_name = '$TEST_FILE_NAME'
98
  print(f'正在上传测试文件: {test_file_name}')
 
99
  api.upload_file(
100
  path_or_fileobj='$TEMP_DIR/test_file',
101
  path_in_repo=test_file_name,
 
103
  repo_type='dataset'
104
  )
105
  print('成功上传测试文件到Dataset')
 
 
106
  print('正在删除测试文件...')
107
  api.delete_file(
108
  path_in_repo=test_file_name,
 
110
  repo_type='dataset'
111
  )
112
  print('已成功删除测试文件')
 
113
  except Exception as e:
114
  print(f'Dataset权限测试失败: {str(e)}')
115
  exit(1)
 
127
  upload_backup() {
128
  file_path="$1"
129
  file_name="$2"
 
130
  if [ ! -f "$file_path" ]; then
131
  log_error "备份文件不存在: $file_path"
132
  return 1
133
  fi
 
134
  log_info "开始上传备份: $file_name ($(du -h $file_path | cut -f1))"
 
135
  python3 -c "
136
  from huggingface_hub import HfApi
137
  import sys
 
142
  files = api.list_repo_files(repo_id=repo_id, repo_type='dataset')
143
  backup_files = [f for f in files if f.startswith('sillytavern_backup_') and f.endswith('.tar.gz')]
144
  backup_files.sort()
 
145
  if len(backup_files) >= max_files:
146
  files_to_delete = backup_files[:(len(backup_files) - max_files + 1)]
147
  for file_to_delete in files_to_delete:
 
156
  repo_id='$DATASET_ID'
157
  try:
158
  api = HfApi(token=token)
 
 
159
  file_size = os.path.getsize('$file_path')
160
  print(f'备份文件大小: {file_size / (1024*1024):.2f} MB')
 
161
  try:
162
  dataset_info = api.dataset_info(repo_id=repo_id)
163
  print(f'Dataset信息: {dataset_info.id}')
164
  except Exception as e:
165
  print(f'获取Dataset信息失败: {str(e)}')
 
166
  start_time = time.time()
167
  print(f'开始上传: {start_time}')
 
 
168
  api.upload_file(
169
  path_or_fileobj='$file_path',
170
  path_in_repo='$file_name',
171
  repo_id=repo_id,
172
  repo_type='dataset'
173
  )
 
174
  end_time = time.time()
175
  print(f'上传完成,耗时: {end_time - start_time:.2f} 秒')
176
  print(f'成功上传 $file_name')
 
 
177
  manage_backups(api, repo_id)
178
  except Exception as e:
179
  print(f'上传文件时出错: {str(e)}')
 
191
  # 下载最新备份
192
  download_latest_backup() {
193
  log_info "开始下载最新备份..."
 
194
  python3 -c "
195
  from huggingface_hub import HfApi
196
  import sys
 
201
  try:
202
  api = HfApi(token='$HF_TOKEN')
203
  print('已创建API实例')
 
 
204
  try:
205
  files = api.list_repo_files(repo_id='$DATASET_ID', repo_type='dataset')
206
  print(f'仓库文件数量: {len(files)}')
207
  except Exception as e:
208
  print(f'列出仓库文件失败: {str(e)}')
209
  sys.exit(1)
 
210
  backup_files = [f for f in files if f.startswith('sillytavern_backup_') and f.endswith('.tar.gz')]
211
  print(f'找到备份文件数量: {len(backup_files)}')
 
212
  if not backup_files:
213
  print('未找到备份文件')
214
  sys.exit(0)
 
 
215
  latest_backup = sorted(backup_files)[-1]
216
  print(f'最新备份文件: {latest_backup}')
 
217
  with tempfile.TemporaryDirectory() as temp_dir:
218
  print(f'创建临时目录: {temp_dir}')
 
219
  start_time = time.time()
220
  print(f'开始下载: {start_time}')
 
 
221
  try:
222
  filepath = api.hf_hub_download(
223
  repo_id='$DATASET_ID',
 
229
  except Exception as e:
230
  print(f'下载文件失败: {str(e)}')
231
  sys.exit(1)
 
232
  end_time = time.time()
233
  print(f'下载完成,耗时: {end_time - start_time:.2f} 秒')
 
234
  if filepath and os.path.exists(filepath):
 
235
  os.makedirs('$DATA_DIR', exist_ok=True)
 
 
236
  print(f'文件权限: {oct(os.stat(filepath).st_mode)[-3:]}')
 
 
237
  try:
238
  with tarfile.open(filepath, 'r:gz') as tar:
239
  print('开始解压文件...')
 
242
  except Exception as e:
243
  print(f'解压文件失败: {str(e)}')
244
  sys.exit(1)
 
245
  print(f'成功从 {latest_backup} 恢复备份')
246
  else:
247
  print('下载的文件路径无效')
 
266
  # 同步函数
267
  sync_data() {
268
  log_info "数据同步服务已启动"
 
269
  while true; do
270
  log_info "开始同步进程,时间: $(date)"
 
271
  if [ -d "$DATA_DIR" ]; then
272
  timestamp=$(date +%Y%m%d_%H%M%S)
273
  backup_file="sillytavern_backup_${timestamp}.tar.gz"
274
  backup_path="${TEMP_DIR}/${backup_file}"
 
275
  log_info "创建备份文件: $backup_path"
 
 
276
  file_count=$(find "$DATA_DIR" -type f | wc -l)
277
  log_info "数据目录文件数量: $file_count"
 
278
  if [ "$file_count" -eq 0 ]; then
279
  log_info "数据目录为空,跳过备份"
280
  else
 
281
  tar -czf "$backup_path" -C "$DATA_DIR" .
282
  if [ $? -ne 0 ]; then
283
  log_error "创建压缩文件失败"
284
  else
285
  log_info "压缩文件创建成功: $(du -h $backup_path | cut -f1)"
 
286
  # 上传备份
287
  log_info "正在上传备份到HuggingFace..."
288
  upload_backup "$backup_path" "$backup_file"
289
+
290
+ # 每7天合并一次历史提交
291
+ SQUASH_FLAG_FILE="/tmp/last_squash_time"
292
+ NOW=$(date +%s)
293
+ SEVEN_DAYS=$((7*24*60*60))
294
+ if [ ! -f "$SQUASH_FLAG_FILE" ]; then
295
+ echo $NOW > "$SQUASH_FLAG_FILE"
296
+ log_info "首次合并历史提交..."
297
+ python3 -c "
298
+ from huggingface_hub import HfApi
299
+ try:
300
+ api = HfApi(token='$HF_TOKEN')
301
+ api.super_squash_history(repo_id='$DATASET_ID', repo_type='dataset')
302
+ print('历史合并完成。')
303
+ except Exception as e:
304
+ print(f'合并历史出错: {str(e)}')
305
+ "
306
+ else
307
+ LAST=$(cat "$SQUASH_FLAG_FILE")
308
+ DIFF=$((NOW - LAST))
309
+ if [ $DIFF -ge $SEVEN_DAYS ]; then
310
+ echo $NOW > "$SQUASH_FLAG_FILE"
311
+ log_info "距离上次合并已超过7天,正在合并历史提交..."
312
+ python3 -c "
313
+ from huggingface_hub import HfApi
314
+ try:
315
+ api = HfApi(token='$HF_TOKEN')
316
+ api.super_squash_history(repo_id='$DATASET_ID', repo_type='dataset')
317
+ print('历史合并完成。')
318
+ except Exception as e:
319
+ print(f'合并历史出错: {str(e)}')
320
+ "
321
+ else
322
+ log_info "距离上次合并未满7天,本次跳过合并历史提交。"
323
+ fi
324
+ fi
325
+
326
  # 删除临时备份文件
327
  rm -f "$backup_path"
328
  log_info "已删除临时备份文件"
 
333
  mkdir -p "$DATA_DIR"
334
  chmod -R 777 "$DATA_DIR"
335
  fi
 
336
  # 设置同步间隔
337
  SYNC_INTERVAL=${SYNC_INTERVAL:-3600}
338
  log_info "下次同步将在 ${SYNC_INTERVAL} 秒后进行..."
 
341
  }
342
 
343
  # 启动同步进程
344
+ sync_data