hequ committed on
Commit
9abffe0
·
verified ·
1 Parent(s): 7e811cb

Update tools/hf_backup.py

Browse files
Files changed (1) hide show
  1. tools/hf_backup.py +19 -17
tools/hf_backup.py CHANGED
@@ -4,9 +4,11 @@
4
  import argparse
5
  import os
6
  import sys
7
- import tempfile
8
  from huggingface_hub import HfApi
9
 
 
 
10
  def list_backups(api: HfApi, repo_id: str, prefix: str):
11
  files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
12
  backs = [f for f in files if f.startswith(prefix) and f.endswith(".tar.gz")]
@@ -15,16 +17,16 @@ def list_backups(api: HfApi, repo_id: str, prefix: str):
15
 
16
  def ensure_dataset(api: HfApi, repo_id: str):
17
  try:
18
- api.dataset_info(repo_id=repo_id, use_auth_token=api.token) # 兼容旧版
19
  except Exception:
20
- # 不存在则创建(可按需改为 public)
21
  api.create_repo(repo_id=repo_id, repo_type="dataset", private=True, exist_ok=True)
22
 
23
  def upload(args):
24
  api = HfApi(token=args.token)
25
  ensure_dataset(api, args.repo)
26
 
27
- # 上传
28
  api.upload_file(
29
  path_or_fileobj=args.file,
30
  path_in_repo=os.path.basename(args.file),
@@ -32,7 +34,7 @@ def upload(args):
32
  repo_type="dataset",
33
  )
34
 
35
- # 保留最新 N 份
36
  if args.max and args.max > 0:
37
  backs = list_backups(api, args.repo, args.prefix)
38
  if len(backs) > args.max:
@@ -41,25 +43,25 @@ def upload(args):
41
  try:
42
  api.delete_file(path_in_repo=f, repo_id=args.repo, repo_type="dataset")
43
  except Exception:
44
- # 删除失败不致命
45
- pass
46
 
47
  def restore(args):
48
  api = HfApi(token=args.token)
49
  backs = list_backups(api, args.repo, args.prefix)
50
  if not backs:
51
- return # 没有备份则安静退出,由 shell 脚本处理
52
 
53
  latest = backs[-1]
54
- with tempfile.TemporaryDirectory() as d:
55
- path = api.hf_hub_download(
56
- repo_id=args.repo,
57
- filename=latest,
58
- repo_type="dataset",
59
- local_dir=d
60
- )
61
- # 将本地文件路径直接输出给 shell
62
- print(path)
 
63
 
64
  def main():
65
  p = argparse.ArgumentParser()
 
4
  import argparse
5
  import os
6
  import sys
7
+ from pathlib import Path
8
  from huggingface_hub import HfApi
9
 
10
+ RESTORE_DIR = os.environ.get("HF_RESTORE_DIR", "/tmp/crs_backup") # 与 shell 的 TMP_DIR 对齐
11
+
12
  def list_backups(api: HfApi, repo_id: str, prefix: str):
13
  files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
14
  backs = [f for f in files if f.startswith(prefix) and f.endswith(".tar.gz")]
 
17
 
18
  def ensure_dataset(api: HfApi, repo_id: str):
19
  try:
20
+ api.dataset_info(repo_id=repo_id)
21
  except Exception:
22
+ # 若不存在则创建为私有数据集;如需公开可改 private=False
23
  api.create_repo(repo_id=repo_id, repo_type="dataset", private=True, exist_ok=True)
24
 
25
  def upload(args):
26
  api = HfApi(token=args.token)
27
  ensure_dataset(api, args.repo)
28
 
29
+ # 上传当前归档
30
  api.upload_file(
31
  path_or_fileobj=args.file,
32
  path_in_repo=os.path.basename(args.file),
 
34
  repo_type="dataset",
35
  )
36
 
37
+ # 仅保留最新 N 份
38
  if args.max and args.max > 0:
39
  backs = list_backups(api, args.repo, args.prefix)
40
  if len(backs) > args.max:
 
43
  try:
44
  api.delete_file(path_in_repo=f, repo_id=args.repo, repo_type="dataset")
45
  except Exception:
46
+ pass # 删除失败不致命
 
47
 
48
  def restore(args):
49
  api = HfApi(token=args.token)
50
  backs = list_backups(api, args.repo, args.prefix)
51
  if not backs:
52
+ return # 无备份,安静退出,由 shell 打日志
53
 
54
  latest = backs[-1]
55
+ # 用持久目录而不是临时目录(否则 Python 退出就删了)
56
+ Path(RESTORE_DIR).mkdir(parents=True, exist_ok=True)
57
+ path = api.hf_hub_download(
58
+ repo_id=args.repo,
59
+ filename=latest,
60
+ repo_type="dataset",
61
+ local_dir=RESTORE_DIR,
62
+ local_dir_use_symlinks=False, # 更稳妥,避免符号链接指向被清理
63
+ )
64
+ print(path) # 仅输出路径,供 shell 脚本接收
65
 
66
  def main():
67
  p = argparse.ArgumentParser()