hequ committed on
Commit
4d6f8fd
·
verified ·
1 Parent(s): 0b18167

Create tools/hf_backup.py

Browse files
Files changed (1) hide show
  1. tools/hf_backup.py +85 -0
tools/hf_backup.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os, sys, time, tarfile, tempfile, shutil
from huggingface_hub import HfApi


def log(*a):
    """Print the arguments prefixed with a local timestamp, flushing immediately."""
    print(time.strftime("[%Y-%m-%d %H:%M:%S]"), *a, flush=True)


# All configuration comes from the environment.
token = os.environ.get("HF_TOKEN")
repo_id = os.environ.get("DATASET_ID")
max_backups = int(os.environ.get("MAX_BACKUPS", "10"))
backup_paths = [
    p.strip()
    for p in os.environ.get("BACKUP_PATHS", "/app/data,/app/logs").split(",")
    if p.strip()
]
prefix = os.environ.get("BACKUP_PREFIX", "crs_backup_")

# Without credentials and a target repo there is nothing to do; exit 0 so a
# container entrypoint that invokes this script is not failed by missing config.
if not token or not repo_id:
    log("HF_TOKEN/DATASET_ID 未配置,退出")
    sys.exit(0)

api = HfApi(token=token)
19
+
20
def list_backups():
    """Return all backup archive names in the dataset repo, oldest first.

    A backup is any repo file whose name starts with ``prefix`` and ends
    with ``.tar.gz``.  Lexicographic order equals chronological order
    because the names embed a zero-padded ``%Y%m%d_%H%M%S`` timestamp.
    """
    repo_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    return sorted(
        name
        for name in repo_files
        if name.startswith(prefix) and name.endswith(".tar.gz")
    )
25
+
26
def create_archive():
    """Build a gzipped tarball of every existing path in ``backup_paths``.

    Returns:
        (path, name, tmpdir): absolute path of the archive, its bare file
        name (``{prefix}{timestamp}.tar.gz``), and the temporary directory
        containing it.  The caller owns ``tmpdir`` and must remove it after
        a successful return.

    Fix over the original: if archiving itself raises, the freshly created
    temp dir is removed here, so the failure does not leak disk space
    (the caller's cleanup only runs after this function returns).
    """
    ts = time.strftime("%Y%m%d_%H%M%S")
    name = f"{prefix}{ts}.tar.gz"
    tmpdir = tempfile.mkdtemp()
    path = os.path.join(tmpdir, name)
    try:
        with tarfile.open(path, "w:gz") as tar:
            for d in backup_paths:
                if not os.path.exists(d):
                    log("skip missing:", d)
                    continue
                # Keep archive member names short: strip the /app prefix so a
                # later restore into /app recreates the original layout.
                arc = os.path.relpath(d, "/app") if d.startswith("/app/") else os.path.basename(d.rstrip("/"))
                log("add to archive:", d, "as", arc)
                tar.add(d, arcname=arc, recursive=True)
    except Exception:
        # Don't leak the temp dir when tar creation blows up.
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise
    return path, name, tmpdir
42
+
43
def upload_and_prune(path, name):
    """Upload the archive at *path* as *name*, then prune old backups.

    After uploading, at most ``max_backups`` archives are kept in the
    dataset repo; the oldest ones are deleted.  Individual delete failures
    are logged and skipped so one stale file cannot abort the run.
    """
    log("Uploading", name)
    api.upload_file(path_or_fileobj=path, path_in_repo=name, repo_id=repo_id, repo_type="dataset")
    backs = list_backups()
    excess = len(backs) - max_backups
    # BUG FIX: only prune when we are actually over the limit.  The original
    # sliced unconditionally, and with a negative excess (e.g. 8 backups,
    # limit 10 -> backs[:-2]) it deleted almost every existing backup.
    if excess > 0:
        for f in backs[:excess]:
            try:
                log("Pruning old backup:", f)
                api.delete_file(path_in_repo=f, repo_id=repo_id, repo_type="dataset")
            except Exception as e:
                log("Delete failed:", f, e)
54
+
55
def restore_latest():
    """Download the newest backup archive and unpack it under /app.

    No-op (with a log line) when the dataset holds no backups.

    Fix over the original: the downloaded tarball and its temporary
    directory are removed afterwards instead of being leaked.
    """
    backs = list_backups()
    if not backs:
        log("No backups in dataset")
        return
    latest = backs[-1]
    log("Restoring", latest)
    tmpdir = tempfile.mkdtemp()
    try:
        local = api.hf_hub_download(repo_id=repo_id, filename=latest, repo_type="dataset", local_dir=tmpdir)
        with tarfile.open(local, "r:gz") as tar:
            # NOTE(review): extractall trusts archive member paths.  These
            # archives are self-produced, but if the dataset repo could be
            # tampered with, pass filter="data" (Python 3.12+) or validate
            # member names first to block path traversal — confirm threat model.
            tar.extractall("/app")
        log("Restored", latest)
    finally:
        # Clean up the download dir whether or not extraction succeeded.
        shutil.rmtree(tmpdir, ignore_errors=True)
67
+
68
def main():
    """CLI entry point: ``hf_backup.py backup`` or ``hf_backup.py restore``.

    ``backup`` archives the configured paths, uploads the tarball, and
    prunes old backups; ``restore`` unpacks the newest backup into /app.
    Anything else (or no argument) prints usage and returns.
    """
    if len(sys.argv) < 2:
        log("Usage: hf_backup.py [restore|backup]")
        return
    cmd = sys.argv[1]
    if cmd == "restore":
        restore_latest()
    elif cmd == "backup":
        path, name, tmp = create_archive()
        try:
            upload_and_prune(path, name)
        finally:
            # `tmp` is the directory create_archive() made; the original
            # ignored it and recomputed os.path.dirname(path).  Remove it
            # (and the archive inside) whether or not the upload succeeded.
            shutil.rmtree(tmp, ignore_errors=True)
    else:
        log("Unknown cmd:", cmd)


if __name__ == "__main__":
    main()