mk99z committed on
Commit
e0d3c07
·
verified ·
1 Parent(s): 2c0d66f

Create sync_data.sh

Browse files
Files changed (1) hide show
  1. sync_data.sh +108 -0
sync_data.sh ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash

# Back up /ql/data to a HuggingFace dataset repo and restore on startup.
# Required env vars: HF_TOKEN (API token), DATASET_ID (dataset repo id).
# Optional: SYNC_INTERVAL (seconds between backups, default 7200).

# Degrade gracefully when credentials are not configured: the container
# can still run, just without backup/restore functionality.
if [[ -z "$HF_TOKEN" ]] || [[ -z "$DATASET_ID" ]]; then
  echo "Starting without backup functionality - missing HF_TOKEN or DATASET_ID"
  exit 0
fi

# Activate the Python virtual environment that provides huggingface_hub.
# Fail fast with a clear message if the venv is missing, instead of letting
# every later python3 call die with a cryptic import error.
if [[ -f /opt/venv/bin/activate ]]; then
  # shellcheck disable=SC1091 — path exists only inside the container image
  source /opt/venv/bin/activate
else
  echo "Error: /opt/venv/bin/activate not found" >&2
  exit 1
fi
# Upload one backup archive to the HuggingFace dataset repo.
# Arguments:
#   $1 - local path of the archive to upload
#   $2 - destination filename inside the dataset repo
# Values are handed to Python through the environment and a quoted heredoc
# instead of string interpolation, so a path containing quotes, $ or
# backslashes can neither break the Python syntax nor inject code, and the
# token never appears inside generated source text.
upload_backup() {
  local file_path="$1"
  local file_name="$2"

  UPLOAD_FILE_PATH="$file_path" UPLOAD_FILE_NAME="$file_name" \
  python3 - <<'PYEOF'
import os
from huggingface_hub import HfApi

file_path = os.environ["UPLOAD_FILE_PATH"]
file_name = os.environ["UPLOAD_FILE_NAME"]
api = HfApi(token=os.environ["HF_TOKEN"])
try:
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=file_name,
        repo_id=os.environ["DATASET_ID"],
        repo_type='dataset',
    )
    print(f'Successfully uploaded {file_name}')
except Exception as e:
    # Best-effort: report and continue; the sync loop will retry later.
    print(f'Error uploading file: {str(e)}')
PYEOF
}
# Download the newest qinglong backup archive from the dataset repo and
# unpack it into /ql/data. Errors are non-fatal (message only), so a fresh
# deployment with no backups yet starts cleanly. Credentials are read from
# the environment inside Python rather than interpolated into the source,
# avoiding quoting/injection problems.
download_latest_backup() {
  python3 - <<'PYEOF'
import os
import sys
import tarfile
import tempfile
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])
repo_id = os.environ["DATASET_ID"]
try:
    files = api.list_repo_files(repo_id=repo_id, repo_type='dataset')
    backup_files = [f for f in files
                    if f.startswith('qinglong_backup_') and f.endswith('.tar.gz')]

    if not backup_files:
        print('No backup files found')
        sys.exit()

    # Filenames embed a %Y%m%d_%H%M%S timestamp, so lexicographic order
    # matches chronological order and the last element is the newest.
    latest_backup = sorted(backup_files)[-1]

    with tempfile.TemporaryDirectory() as temp_dir:
        filepath = api.hf_hub_download(
            repo_id=repo_id,
            filename=latest_backup,
            repo_type='dataset',
            local_dir=temp_dir,
        )

        if filepath and os.path.exists(filepath):
            with tarfile.open(filepath, 'r:gz') as tar:
                # NOTE(review): extractall on a remotely fetched archive is
                # vulnerable to path traversal on Python < 3.12; consider
                # tar.extractall('/ql/data', filter='data') when available.
                tar.extractall('/ql/data')
            print(f'Successfully restored backup from {latest_backup}')

except Exception as e:
    # Best-effort restore: report and let the service start empty.
    print(f'Error downloading backup: {str(e)}')
PYEOF
}
# On first start, restore the most recent backup (if any) before the
# periodic sync loop begins producing new ones.
printf '%s\n' "Downloading latest backup from HuggingFace..."
download_latest_backup
# Periodically archive /ql/data and push it to the dataset repo.
# Loops forever in the foreground; interval is SYNC_INTERVAL seconds
# (default 7200 = 2 hours).
sync_data() {
  # Resolve the interval once; a child process's environment cannot
  # change mid-run, so re-reading it each iteration was redundant.
  local interval="${SYNC_INTERVAL:-7200}"
  local timestamp backup_file

  while true; do
    echo "Starting sync process at $(date)"

    if [ -d /ql/data ]; then
      timestamp=$(date +%Y%m%d_%H%M%S)
      backup_file="qinglong_backup_${timestamp}.tar.gz"

      # Compress the data directory; only upload when tar succeeds so a
      # truncated/corrupt archive is never pushed to the repo.
      if tar -czf "/tmp/${backup_file}" -C /ql/data .; then
        echo "Uploading backup to HuggingFace..."
        upload_backup "/tmp/${backup_file}" "${backup_file}"
      else
        echo "Error: failed to create ${backup_file}, skipping upload" >&2
      fi

      rm -f "/tmp/${backup_file}"
    else
      echo "Data directory does not exist yet, waiting for next sync..."
    fi

    echo "Next sync in ${interval} seconds..."
    sleep "${interval}"
  done
}

# Start the sync loop (blocks forever).
sync_data