hequ committed on
Commit
f949e55
·
verified ·
1 Parent(s): 398c93f

Update tools/hf_backup.py

Browse files
Files changed (1) hide show
  1. tools/hf_backup.py +67 -66
tools/hf_backup.py CHANGED
@@ -1,85 +1,86 @@
1
  #!/usr/bin/env python3
2
  # -*- coding: utf-8 -*-
3
- import os, sys, time, tarfile, tempfile, shutil
4
- from huggingface_hub import HfApi
5
-
6
- def log(*a): print(time.strftime("[%Y-%m-%d %H:%M:%S]"), *a, flush=True)
7
-
8
- token = os.environ.get("HF_TOKEN")
9
- repo_id = os.environ.get("DATASET_ID")
10
- max_backups = int(os.environ.get("MAX_BACKUPS", "10"))
11
- backup_paths = [p.strip() for p in os.environ.get("BACKUP_PATHS", "/app/data,/app/logs").split(",") if p.strip()]
12
- prefix = os.environ.get("BACKUP_PREFIX", "crs_backup_")
13
 
14
- if not token or not repo_id:
15
- log("HF_TOKEN/DATASET_ID 未配置,退出")
16
- sys.exit(0)
17
-
18
- api = HfApi(token=token)
19
 
20
- def list_backups():
21
  files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
22
  backs = [f for f in files if f.startswith(prefix) and f.endswith(".tar.gz")]
23
  backs.sort()
24
  return backs
25
 
26
- def create_archive():
27
- ts = time.strftime("%Y%m%d_%H%M%S")
28
- name = f"{prefix}{ts}.tar.gz"
29
- tmpdir = tempfile.mkdtemp()
30
- path = os.path.join(tmpdir, name)
 
 
31
 
32
- with tarfile.open(path, "w:gz") as tar:
33
- for d in backup_paths:
34
- if not os.path.exists(d):
35
- log("skip missing:", d)
36
- continue
37
- # 归档名尽量短,/app 前缀裁掉,方便恢复
38
- arc = os.path.relpath(d, "/app") if d.startswith("/app/") else os.path.basename(d.rstrip("/"))
39
- log("add to archive:", d, "as", arc)
40
- tar.add(d, arcname=arc, recursive=True)
41
- return path, name, tmpdir
42
 
43
- def upload_and_prune(path, name):
44
- log("Uploading", name)
45
- api.upload_file(path_or_fileobj=path, path_in_repo=name, repo_id=repo_id, repo_type="dataset")
46
- backs = list_backups()
47
- excess = len(backs) - max_backups
48
- for f in backs[:excess]:
49
- try:
50
- log("Pruning old backup:", f)
51
- api.delete_file(path_in_repo=f, repo_id=repo_id, repo_type="dataset")
52
- except Exception as e:
53
- log("Delete failed:", f, e)
54
 
55
- def restore_latest():
56
- backs = list_backups()
 
57
  if not backs:
58
- log("No backups in dataset")
59
- return
60
  latest = backs[-1]
61
- log("Restoring", latest)
62
- tmpdir = tempfile.mkdtemp()
63
- local = api.hf_hub_download(repo_id=repo_id, filename=latest, repo_type="dataset", local_dir=tmpdir)
64
- with tarfile.open(local, "r:gz") as tar:
65
- tar.extractall("/app")
66
- log("Restored", latest)
 
 
 
67
 
68
  def main():
69
- if len(sys.argv) < 2:
70
- log("Usage: hf_backup.py [restore|backup]")
71
- return
72
- cmd = sys.argv[1]
73
- if cmd == "restore":
74
- restore_latest()
75
- elif cmd == "backup":
76
- path, name, tmp = create_archive()
77
- try:
78
- upload_and_prune(path, name)
79
- finally:
80
- shutil.rmtree(os.path.dirname(path), ignore_errors=True)
81
- else:
82
- log("Unknown cmd:", cmd)
 
 
 
 
 
 
 
 
83
 
84
  if __name__ == "__main__":
85
  main()
 
1
  #!/usr/bin/env python3
2
  # -*- coding: utf-8 -*-
 
 
 
 
 
 
 
 
 
 
3
 
4
+ import argparse
5
+ import os
6
+ import sys
7
+ import tempfile
8
+ from huggingface_hub import HfApi
9
 
10
def list_backups(api: HfApi, repo_id: str, prefix: str):
    """Return the sorted list of backup archive names in the dataset repo.

    A backup is any repo file whose name starts with *prefix* and ends with
    ``.tar.gz``.  The result is sorted lexicographically, so the last entry
    is treated as the newest by callers.
    """
    repo_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    return sorted(
        name
        for name in repo_files
        if name.startswith(prefix) and name.endswith(".tar.gz")
    )
15
 
16
def upload(args):
    """Upload ``args.file`` into the dataset repo, then prune old backups.

    The archive is stored at the repo root under its basename.  When
    ``args.max`` is a positive integer, only the newest ``args.max`` backups
    (by sorted name) are kept; older ones are deleted on a best-effort basis.
    """
    api = HfApi(token=args.token)

    # Optional sanity check that the dataset exists; failures are ignored so
    # a missing or forbidden repo surfaces on the upload call itself.
    try:
        api.dataset_info(repo_id=args.repo)
    except Exception:
        pass

    # Upload the archive.
    api.upload_file(
        path_or_fileobj=args.file,
        path_in_repo=os.path.basename(args.file),
        repo_id=args.repo,
        repo_type="dataset",
    )

    # Keep only the newest N backups.
    if not (args.max and args.max > 0):
        return
    backs = list_backups(api, args.repo, args.prefix)
    surplus = len(backs) - args.max
    if surplus <= 0:
        return
    for stale in backs[:surplus]:
        try:
            api.delete_file(path_in_repo=stale, repo_id=args.repo, repo_type="dataset")
        except Exception:
            # Deletion failures are non-fatal: the backup itself succeeded.
            pass
43
 
44
def restore(args):
    """Download the newest backup archive and print its local path to stdout.

    The printed path is consumed by the calling shell script, which performs
    the actual extraction.  If no backups exist, nothing is printed and the
    function returns silently (the shell side handles that case).

    Bug fix: the previous version downloaded into a
    ``tempfile.TemporaryDirectory`` context manager, which deleted the
    directory — and the downloaded archive with it — as soon as the ``with``
    block exited, so the shell received a dangling path.  We now download
    into a persistent ``mkdtemp`` directory; the caller is responsible for
    removing it after extraction.
    """
    api = HfApi(token=args.token)
    backs = list_backups(api, args.repo, args.prefix)
    if not backs:
        return  # No backups: exit quietly; the shell script decides what to do.

    latest = backs[-1]
    # mkdtemp, NOT TemporaryDirectory: the downloaded file must outlive this
    # process so the shell can read the path we print below.
    workdir = tempfile.mkdtemp(prefix="hf_restore_")
    path = api.hf_hub_download(
        repo_id=args.repo,
        filename=latest,
        repo_type="dataset",
        local_dir=workdir,
    )
    # Hand the local archive path to the shell wrapper.
    print(path)
60
 
61
def main():
    """Parse the command line and dispatch to ``upload`` or ``restore``."""
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    uploader = commands.add_parser("upload")
    for flag in ("--token", "--repo", "--file", "--prefix"):
        uploader.add_argument(flag, required=True)
    uploader.add_argument("--max", type=int, default=10)
    uploader.set_defaults(func=upload)

    restorer = commands.add_parser("restore")
    for flag in ("--token", "--repo", "--prefix"):
        restorer.add_argument(flag, required=True)
    restorer.set_defaults(func=restore)

    args = parser.parse_args()
    try:
        args.func(args)
    except KeyboardInterrupt:
        # 130 is the conventional exit status for termination by SIGINT.
        sys.exit(130)
84
 
85
# Script entry point: run the CLI dispatcher when executed directly.
if __name__ == "__main__":
    main()