wangyikunkun committed
Commit e96f980 · verified · 1 Parent(s): 318c6a2

Create sync_data.sh

Files changed (1)
  1. sync_data.sh +196 -0
sync_data.sh ADDED
@@ -0,0 +1,196 @@
+ #!/bin/bash
+
+ # Check the backup-related environment variables
+ if [ -z "$HF_TOKEN" ] || [ -z "$DATASET_ID" ]; then
+     echo "⚠️ HF_TOKEN or DATASET_ID is missing; backup is disabled"
+     exec java ${JVM_OPTS} -jar /opt/halo/halo.jar
+     exit 1  # only reached if exec fails; report failure instead of success
+ fi
+
+ # Set defaults
+ DATASET_N=${DATASET_N:-10}            # keep the newest 10 backups by default
+ SYNC_INTERVAL=${SYNC_INTERVAL:-36000} # default sync interval: 36000 s (10 hours)
+ BACKUP_DIR="$HOME/.halo2"
+ BACKUP_PREFIX="halo_backup_"
+ BACKUP_EXT=".tar.gz"
+ HF_BRANCH="main"
+
+ # Print a timestamped message to the console
+ print_message() {
+     echo "$(date '+%Y-%m-%d %H:%M:%S') - $1"
+ }
+
+ # Create the Hugging Face dataset if it does not exist
+ create_dataset() {
+     print_message "Checking whether Hugging Face dataset '${DATASET_ID}' exists..."
+     python3 <<EOF
+ import os
+ from huggingface_hub import HfApi
+ try:
+     api = HfApi(token=os.getenv("HF_TOKEN"))
+     repo_id = os.getenv("DATASET_ID")
+     # List every dataset owned by the current user
+     user_datasets = [d.id for d in api.list_datasets(author=repo_id.split('/')[0])]
+     if repo_id not in user_datasets:
+         api.create_repo(repo_id=repo_id, repo_type="dataset", private=True)
+         print(f"✅ Dataset '{repo_id}' did not exist; created it (private).")
+     else:
+         print(f"✅ Dataset '{repo_id}' already exists.")
+ except Exception as e:
+     print(f"⚠️ Dataset check/creation failed: {str(e)}")
+ EOF
+ }
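+ # Note (editor's aside, not in the original commit): newer huggingface_hub
+ # releases also expose api.repo_exists(repo_id, repo_type="dataset"), which
+ # could replace the list_datasets scan above; the scan only works while
+ # DATASET_ID lives under the token owner's namespace.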
+
+ # Create the ${HF_BRANCH} branch if it does not exist
+ create_branch() {
+     print_message "Checking branch '${HF_BRANCH}' of the Hugging Face dataset..."
+     python3 <<EOF
+ import os
+ from huggingface_hub import HfApi
+ try:
+     api = HfApi(token=os.getenv("HF_TOKEN"))
+     repo_id = os.getenv("DATASET_ID")
+     hf_branch = "${HF_BRANCH}"
+     # List every branch of the dataset repo
+     branches = api.list_repo_refs(repo_id, repo_type="dataset").branches
+     branch_names = [b.name for b in branches]
+     if hf_branch not in branch_names:
+         api.create_branch(repo_id=repo_id, branch=hf_branch, repo_type="dataset")
+         print(f"✅ Branch '{hf_branch}' did not exist; created it.")
+     else:
+         print(f"✅ Branch '{hf_branch}' already exists.")
+ except Exception as e:
+     print(f"⚠️ Branch check/creation failed: {str(e)}")
+ EOF
+ }
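+ # Note (editor's aside, not in the original commit): newly created Hub repos
+ # already have a "main" branch, so with the default HF_BRANCH this guard is a
+ # no-op; it only creates a branch when HF_BRANCH is overridden.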
+
+ # Download the latest backup
+ download_data() {
+     print_message "Downloading the latest backup..."
+     python3 <<EOF
+ import os
+ import tarfile
+ import tempfile
+ from huggingface_hub import HfApi
+ def download_and_extract(api, repo_id, branch):
+     """Download the newest backup archive and extract it."""
+     files = api.list_repo_files(repo_id=repo_id, repo_type='dataset', revision=branch)
+     backup_files = sorted(f for f in files if f.startswith('${BACKUP_PREFIX}') and f.endswith('${BACKUP_EXT}'))
+
+     if not backup_files:
+         print("⚠️ No backup files found")
+         return False
+
+     latest_backup = backup_files[-1]
+
+     # Download into a temporary directory
+     with tempfile.TemporaryDirectory() as temp_dir:
+         filepath = api.hf_hub_download(
+             repo_id=repo_id,
+             filename=latest_backup,
+             repo_type='dataset',
+             local_dir=temp_dir,
+             revision=branch
+         )
+
+         # Extract straight into BACKUP_DIR
+         with tarfile.open(filepath, 'r:gz') as tar:
+             tar.extractall("${BACKUP_DIR}")
+
+     print("✅ Restored the latest backup into ${BACKUP_DIR}")
+     return True
+
+ if __name__ == "__main__":
+     api = HfApi(token=os.getenv("HF_TOKEN"))
+     restored = download_and_extract(api, os.getenv("DATASET_ID"), "${HF_BRANCH}")
+     print("RESTORED=1" if restored else "RESTORED=0")
+ EOF
+ }
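+ # Note (editor's aside, not in the original commit): picking backup_files[-1]
+ # as the newest archive relies on the %Y%m%d_%H%M%S timestamp in the filename,
+ # which makes lexicographic order match chronological order.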
+
+ # Back up data to Hugging Face
+ backup_data() {
+     local timestamp=$(date +%Y%m%d_%H%M%S)
+     local backup_file="${BACKUP_PREFIX}${timestamp}${BACKUP_EXT}"
+
+     # Create a temporary directory via Python
+     local temp_dir=$(python3 -c "import tempfile; print(tempfile.mkdtemp())")
+     local backup_path="${temp_dir}/${backup_file}"
+
+     print_message "Starting backup: ${backup_file} (temp dir: ${temp_dir})"
+
+     # Create the backup archive
+     tar -czf "$backup_path" -C "$BACKUP_DIR" .
+
+     # Upload via Python, then prune old backups
+     python3 <<EOF
+ import os
+ from huggingface_hub import HfApi
+ try:
+     api = HfApi(token=os.getenv("HF_TOKEN"))
+     repo_id = os.getenv("DATASET_ID")
+     backup_file = "${backup_path}"
+     dataset_n = ${DATASET_N}
+     hf_branch = "${HF_BRANCH}"
+     # Upload the new backup to the target branch
+     api.upload_file(
+         path_or_fileobj=backup_file,
+         path_in_repo=os.path.basename(backup_file),
+         repo_id=repo_id,
+         repo_type="dataset",
+         revision=hf_branch
+     )
+     print(f"✅ Backup uploaded (branch: {hf_branch}): {os.path.basename(backup_file)}")
+     # List all backup files
+     files = api.list_repo_files(repo_id, repo_type="dataset", revision=hf_branch)
+     backup_files = sorted(f for f in files if f.startswith("${BACKUP_PREFIX}") and f.endswith("${BACKUP_EXT}"))
+     # Delete old backups, keeping only the newest dataset_n
+     if len(backup_files) > dataset_n:
+         to_delete = backup_files[:-dataset_n]
+         for old_file in to_delete:
+             api.delete_file(path_in_repo=old_file, repo_id=repo_id, repo_type="dataset", revision=hf_branch)
+             print(f"🗑️ Deleted expired backup (branch: {hf_branch}): {old_file}")
+ except Exception as e:
+     print(f"⚠️ Backup failed: {str(e)}")
+ EOF
+
+     # Clean up the local temporary archive and directory
+     rm -f "$backup_path"
+     rm -rf "$temp_dir"
+     print_message "✅ Backup finished: ${backup_file}"
+
+     # Squash the commit history with super_squash_history
+     print_message "Squashing commit history..."
+     python3 <<EOF
+ import os
+ from huggingface_hub import HfApi
+ try:
+     api = HfApi(token=os.getenv("HF_TOKEN"))
+     repo_id = os.getenv("DATASET_ID")
+     hf_branch = "${HF_BRANCH}"
+     api.super_squash_history(repo_id=repo_id, repo_type="dataset", branch=hf_branch)
+     print(f"✅ Commit history squashed (branch: {hf_branch})")
+ except Exception as e:
+     print(f"⚠️ Failed to squash commit history: {str(e)}")
+ EOF
+     print_message "Commit-history squash done"
+ }
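+ # Note (editor's aside, not in the original commit): every upload_file and
+ # delete_file call above creates its own commit, and old archive versions stay
+ # referenced by that history; super_squash_history collapses the branch to a
+ # single commit so repo storage reflects only the files currently present.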
+
+ # Start the periodic data-sync loop
+ sync_data() {
+     while true; do
+         backup_data
+         print_message "⏳ Next sync in ${SYNC_INTERVAL} seconds..."
+         sleep "$SYNC_INTERVAL"
+     done
+ }
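+ # Note (editor's aside, not in the original commit): backup_data runs at the
+ # top of the loop, so the first backup is taken right after startup/restore
+ # rather than after the first SYNC_INTERVAL has elapsed.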
+
+ # Main program logic
+ (
+     print_message "🚀 Starting up; fetching the latest backup from Hugging Face..."
+     create_dataset
+     create_branch
+     download_data
+     sync_data &
+     exec java ${JVM_OPTS} -jar /opt/halo/halo.jar
+ ) 2>&1
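
For local testing, a minimal invocation sketch follows; the token, dataset name, and tunables are placeholder assumptions, not values from this commit. It presumes python3 with huggingface_hub installed and a Halo jar at /opt/halo/halo.jar, exactly as the script expects.

export HF_TOKEN="hf_xxxxxxxxxxxx"          # placeholder: write-enabled Hugging Face token
export DATASET_ID="your-user/halo-backup"  # placeholder: private dataset that holds the archives
export DATASET_N=10                        # optional: how many backups to keep
export SYNC_INTERVAL=36000                 # optional: seconds between backups
bash sync_data.sh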