HuggingFace0920 committed
Commit 709fd1c · verified · 1 Parent(s): 97622a8

Upload sync_data.sh

Files changed (1)
  1. sync_data.sh +47 -67
sync_data.sh CHANGED
@@ -1,27 +1,28 @@
 #!/bin/sh
 
-# Check that the required environment variables are set
+# Check the environment variables
 if [ -z "$HF_TOKEN" ] || [ -z "$DATASET_ID" ]; then
-    echo "The HF_TOKEN / DATASET_ID environment variables are not set"
+    echo "HF_TOKEN / DATASET_ID not found; backups are unavailable"
     exit 1
 fi
 
-# Activate the Python virtual environment
+# Activate the virtual environment
 . /app/venv/bin/activate
 
-# Generate the HuggingFace sync script (core sync logic implemented in Python)
+# Generate the sync script
 cat > hf_sync.py << 'EOL'
+# HuggingFace sync script
 from huggingface_hub import HfApi
 import sys
 import os
 import tarfile
 import tempfile
 
-# Manage backup files; delete the oldest automatically once the maximum count is exceeded
+# Cap the number of backup files; delete the oldest automatically once the maximum is exceeded
 def manage_backups(api, repo_id, max_files=50):
     files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
     backup_files = [f for f in files if f.startswith('backup_') and f.endswith('.tar.gz')]
-    backup_files.sort()  # oldest to newest
+    backup_files.sort()
     if len(backup_files) >= max_files:
         files_to_delete = backup_files[:(len(backup_files) - max_files + 1)]
         for file_to_delete in files_to_delete:
@@ -41,12 +42,13 @@ def upload_backup(file_path, file_name, token, repo_id):
             repo_id=repo_id,
             repo_type="dataset"
         )
-        print(f"Successfully uploaded backup: {file_name}")
+        print(f"Successfully uploaded {file_name}")
         manage_backups(api, repo_id)
     except Exception as e:
         print(f"Error uploading file: {str(e)}")
 
-# Download the latest backup and extract it to the given directory
+
+# Download the latest backup
 def download_latest_backup(token, repo_id, extract_path):
     try:
         api = HfApi(token=token)
@@ -66,95 +68,73 @@ def download_latest_backup(token, repo_id, extract_path):
         if filepath and os.path.exists(filepath):
             with tarfile.open(filepath, 'r:gz') as tar:
                 tar.extractall(extract_path)
-            print(f"Successfully restored backup: {latest_backup}")
+            print(f"Backup successfully restored: {latest_backup}")
     except Exception as e:
         print(f"Error downloading backup: {str(e)}")
 
-# Clean up all LFS (large file storage) backups, keeping only the latest keep_latest files
-def clear_lfs_files(token, repo_id, keep_latest=3):
-    try:
-        api = HfApi(token=token)
-        files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
-        lfs_files = [f for f in files if f.endswith('.tar.gz')]
-        lfs_files.sort()  # oldest to newest
-        files_to_delete = lfs_files[:-keep_latest] if len(lfs_files) > keep_latest else []
-        if not files_to_delete:
-            print("No old LFS files to delete")
-            return
-        for lfs_file in files_to_delete:
-            try:
-                api.delete_file(path_in_repo=lfs_file, repo_id=repo_id, repo_type="dataset")
-                print(f"Deleted old LFS file: {lfs_file}")
-            except Exception as e:
-                print(f"Error deleting LFS file: {str(e)}")
-    except Exception as e:
-        print(f"Error cleaning up LFS files: {str(e)}")
-
-# Squash the entire commit history into one (super squash), optionally keeping the latest few commits
-def super_squash_history(token, repo_id, keep_latest=1):
+# Squash the commit history, keeping only the most recent commits
+def super_squash_history(token, repo_id, keep_latest=3):
     try:
         api = HfApi(token=token)
         api.super_squash_history(repo_id=repo_id, repo_type="dataset", keep_last_commits=keep_latest)
-        print(f"super_squash_history succeeded; history squashed, keeping only the latest {keep_latest} commit(s).")
+        print(f"History squashed; only the latest {keep_latest} commit(s) kept.")
     except Exception as e:
-        print(f"Error running super_squash_history: {str(e)}")
+        print(f"Error squashing history: {str(e)}")
 
-# Main entry point; dispatch on the command-line arguments
+# Main entry point
 if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print("Usage: python hf_sync.py <action> [args]")
-        sys.exit(1)
     action = sys.argv[1]
-    token = os.getenv('HF_TOKEN')
-    repo_id = os.getenv('DATASET_ID')
+    token = sys.argv[2]
+    repo_id = sys.argv[3]
     if action == "upload":
-        if len(sys.argv) < 4:
-            print("Usage: python hf_sync.py upload <file_path> <file_name>")
-            sys.exit(1)
-        file_path = sys.argv[2]
-        file_name = sys.argv[3]
+        file_path = sys.argv[4]
+        file_name = sys.argv[5]
         upload_backup(file_path, file_name, token, repo_id)
     elif action == "download":
-        extract_path = sys.argv[2] if len(sys.argv) > 2 else '.'
+        extract_path = sys.argv[4] if len(sys.argv) > 4 else '.'  # default: current directory
        download_latest_backup(token, repo_id, extract_path)
-    elif action == "clear_lfs":
-        keep_latest = int(sys.argv[2]) if len(sys.argv) > 2 else 3
-        clear_lfs_files(token, repo_id, keep_latest)
     elif action == "super_squash":
-        keep_latest = int(sys.argv[2]) if len(sys.argv) > 2 else 1
+        keep_latest = int(sys.argv[4]) if len(sys.argv) > 4 else 3
         super_squash_history(token, repo_id, keep_latest)
-    else:
-        print("Invalid action. Available actions: upload, download, clear_lfs, super_squash")
-        sys.exit(1)
 EOL
-
-# Automatically download the latest backup on first start
+# Download the latest backup from HuggingFace on first start (extract into the app directory)
 echo "Downloading the latest backup from HuggingFace..."
-python hf_sync.py download "./"
+python hf_sync.py download "${HF_TOKEN}" "${DATASET_ID}" "./"
 
-# Periodic sync function
+# Sync function
 sync_data() {
     while true; do
-        echo "Starting sync run, current time: $(date)"
+        echo "Sync run started at $(date)"
+
+        # Make sure the data directory exists (adjust the path to your setup)
         STORAGE_PATH="./storage"
         if [ -d "${STORAGE_PATH}" ]; then
-            BACKUP_FILE="backup_$(date +%Y%m%d_%H%M%S).tar.gz"
-            tar -czf "${BACKUP_FILE}" -C "${STORAGE_PATH}" .
-            echo "Created backup: ${BACKUP_FILE}"
-            python hf_sync.py upload "${BACKUP_FILE}" "${BACKUP_FILE}"
-            rm -f "${BACKUP_FILE}"
-            python hf_sync.py clear_lfs 3
+            # Create the backup
+            timestamp=$(date +%Y%m%d_%H%M%S)
+            backup_file="backup_${timestamp}.tar.gz"
+
+            # Compress the directory (-C avoids embedding the parent path)
+            tar -czf "/tmp/${backup_file}" -C "$(dirname "${STORAGE_PATH}")" "$(basename "${STORAGE_PATH}")"
+
+            # Upload to HuggingFace
+            echo "Uploading backup to HuggingFace..."
+            python hf_sync.py upload "${HF_TOKEN}" "${DATASET_ID}" "/tmp/${backup_file}" "${backup_file}"
+
+            # Remove the temporary file
+            rm -f "/tmp/${backup_file}"
         else
-            echo "Data directory does not exist: ${STORAGE_PATH}"
+            echo "Storage directory ${STORAGE_PATH} does not exist; waiting..."
         fi
+
+        # Sync interval
         SYNC_INTERVAL=${SYNC_INTERVAL:-7200}
-        echo "Waiting ${SYNC_INTERVAL} seconds before the next sync..."
+        echo "Next sync in ${SYNC_INTERVAL} seconds..."
         sleep $SYNC_INTERVAL
    done
}
 
-# Start the sync process in the background
+# Start the sync process
 sync_data &
 
-# Start the main application (adjust the path as appropriate)
+# Start the main application (modify the path to your setup)
 exec bash install_reader.sh
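A note on the retention logic in manage_backups (unchanged by this commit): it relies on a plain lexicographic sort(), which matches chronological order only because the backup_%Y%m%d_%H%M%S.tar.gz names are zero-padded. A minimal check of that assumption:

    names = [
        "backup_20240101_120000.tar.gz",
        "backup_20231231_235959.tar.gz",
        "backup_20240101_115959.tar.gz",
    ]
    # Zero-padded timestamps make lexicographic order equal chronological order,
    # so backup_files.sort() really does put the oldest backups first.
    assert sorted(names) == [
        "backup_20231231_235959.tar.gz",
        "backup_20240101_115959.tar.gz",
        "backup_20240101_120000.tar.gz",
    ]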
 
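The rewritten __main__ block drops the old argument-count checks and usage messages, so invoking hf_sync.py with too few arguments now dies with a bare IndexError. A minimal sketch of how the checks could be restored while keeping the new positional interface (<action> <token> <repo_id> [args...]); the parse_args helper is illustrative, not part of the committed script:

    import sys

    USAGE = "usage: python hf_sync.py <upload|download|super_squash> <token> <repo_id> [args...]"

    def parse_args(argv):
        # Every action needs at least: action, token, repo_id.
        if len(argv) < 4:
            sys.exit(USAGE)
        action, token, repo_id = argv[1:4]
        # upload additionally needs <file_path> <file_name>.
        if action == "upload" and len(argv) < 6:
            sys.exit("usage: python hf_sync.py upload <token> <repo_id> <file_path> <file_name>")
        return action, token, repo_id, argv[4:]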
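Worth verifying against the installed huggingface_hub: in the releases I am aware of, HfApi.super_squash_history squashes a branch's entire history into a single commit and does not take a keep_last_commits keyword, so the call above may raise a TypeError that the except block then reports as an error. A sketch of the call shape as documented, assuming a recent huggingface_hub; check the actual signature before relying on it:

    from huggingface_hub import HfApi

    def squash_dataset_history(token, repo_id):
        # Collapses the whole history of the default branch into one commit.
        api = HfApi(token=token)
        api.super_squash_history(repo_id=repo_id, repo_type="dataset")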
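Finally, download_latest_backup runs tar.extractall(extract_path) on an archive fetched from the dataset repo. If that repo could ever hold an archive the script did not create, the stdlib extraction filter is a cheap safeguard; a sketch assuming Python 3.12+ (the filter argument was also backported to late patch releases of 3.8 through 3.11):

    import tarfile

    def safe_extract(archive_path, dest):
        with tarfile.open(archive_path, "r:gz") as tar:
            # The "data" filter rejects absolute paths, links escaping dest,
            # and other path-traversal tricks during extraction.
            tar.extractall(dest, filter="data")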