Update tools/hf_backup.py
Browse files- tools/hf_backup.py +19 -17
tools/hf_backup.py
CHANGED
|
@@ -4,9 +4,11 @@
|
|
| 4 |
import argparse
|
| 5 |
import os
|
| 6 |
import sys
|
| 7 |
-
import
|
| 8 |
from huggingface_hub import HfApi
|
| 9 |
|
|
|
|
|
|
|
| 10 |
def list_backups(api: HfApi, repo_id: str, prefix: str):
|
| 11 |
files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
|
| 12 |
backs = [f for f in files if f.startswith(prefix) and f.endswith(".tar.gz")]
|
|
@@ -15,16 +17,16 @@ def list_backups(api: HfApi, repo_id: str, prefix: str):
|
|
| 15 |
|
| 16 |
def ensure_dataset(api: "HfApi", repo_id: str):
    """Ensure the backup dataset repo exists, creating it (private) when missing.

    Args:
        api: Authenticated ``HfApi`` client.
        repo_id: Target dataset repo id, e.g. ``"user/backups"``.
    """
    try:
        # BUG FIX: the original call was missing its closing parenthesis
        # (``api.dataset_info(repo_id=repo_id`` — a syntax error).
        api.dataset_info(repo_id=repo_id)
    except Exception:
        # Lookup failed — most likely the repo does not exist yet.
        # Create it as a private dataset; exist_ok guards against races.
        api.create_repo(repo_id=repo_id, repo_type="dataset", private=True, exist_ok=True)
|
| 22 |
|
| 23 |
def upload(args):
|
| 24 |
api = HfApi(token=args.token)
|
| 25 |
ensure_dataset(api, args.repo)
|
| 26 |
|
| 27 |
-
#
|
| 28 |
api.upload_file(
|
| 29 |
path_or_fileobj=args.file,
|
| 30 |
path_in_repo=os.path.basename(args.file),
|
|
@@ -32,7 +34,7 @@ def upload(args):
|
|
| 32 |
repo_type="dataset",
|
| 33 |
)
|
| 34 |
|
| 35 |
-
#
|
| 36 |
if args.max and args.max > 0:
|
| 37 |
backs = list_backups(api, args.repo, args.prefix)
|
| 38 |
if len(backs) > args.max:
|
|
@@ -41,25 +43,25 @@ def upload(args):
|
|
| 41 |
try:
|
| 42 |
api.delete_file(path_in_repo=f, repo_id=args.repo, repo_type="dataset")
|
| 43 |
except Exception:
|
| 44 |
-
# 删除失败不致命
|
| 45 |
-
pass
|
| 46 |
|
| 47 |
def restore(args):
|
| 48 |
api = HfApi(token=args.token)
|
| 49 |
backs = list_backups(api, args.repo, args.prefix)
|
| 50 |
if not backs:
|
| 51 |
-
return #
|
| 52 |
|
| 53 |
latest = backs[-1]
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
#
|
| 62 |
-
|
|
|
|
| 63 |
|
| 64 |
def main():
|
| 65 |
p = argparse.ArgumentParser()
|
|
|
|
| 4 |
import argparse
|
| 5 |
import os
|
| 6 |
import sys
|
| 7 |
+
from pathlib import Path
|
| 8 |
from huggingface_hub import HfApi
|
| 9 |
|
| 10 |
+
RESTORE_DIR = os.environ.get("HF_RESTORE_DIR", "/tmp/crs_backup") # 与 shell 的 TMP_DIR 对齐
|
| 11 |
+
|
| 12 |
def list_backups(api: HfApi, repo_id: str, prefix: str):
|
| 13 |
files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
|
| 14 |
backs = [f for f in files if f.startswith(prefix) and f.endswith(".tar.gz")]
|
|
|
|
| 17 |
|
| 18 |
def ensure_dataset(api: "HfApi", repo_id: str):
    """Make sure dataset repo *repo_id* exists; create it as private if absent."""
    try:
        api.dataset_info(repo_id=repo_id)
        return  # repo already exists — nothing to do
    except Exception:
        pass
    # Not found (or lookup failed): create it as a private dataset.
    # Change private=False here if the backups should be public.
    api.create_repo(repo_id=repo_id, repo_type="dataset", private=True, exist_ok=True)
|
| 24 |
|
| 25 |
def upload(args):
|
| 26 |
api = HfApi(token=args.token)
|
| 27 |
ensure_dataset(api, args.repo)
|
| 28 |
|
| 29 |
+
# 上传当前归档
|
| 30 |
api.upload_file(
|
| 31 |
path_or_fileobj=args.file,
|
| 32 |
path_in_repo=os.path.basename(args.file),
|
|
|
|
| 34 |
repo_type="dataset",
|
| 35 |
)
|
| 36 |
|
| 37 |
+
# 仅保留最新 N 份
|
| 38 |
if args.max and args.max > 0:
|
| 39 |
backs = list_backups(api, args.repo, args.prefix)
|
| 40 |
if len(backs) > args.max:
|
|
|
|
| 43 |
try:
|
| 44 |
api.delete_file(path_in_repo=f, repo_id=args.repo, repo_type="dataset")
|
| 45 |
except Exception:
|
| 46 |
+
pass # 删除失败不致命
|
|
|
|
| 47 |
|
| 48 |
def restore(args):
    """Download the newest backup archive and print its local path.

    When no backup exists, exits quietly without output — the calling shell
    script is expected to log that case itself.
    """
    api = HfApi(token=args.token)
    backups = list_backups(api, args.repo, args.prefix)
    if not backups:
        # No backups yet: stay silent; the shell wrapper handles logging.
        return

    newest = backups[-1]
    # Download into a persistent directory instead of a temp dir, which
    # would be cleaned up as soon as this Python process exits.
    Path(RESTORE_DIR).mkdir(parents=True, exist_ok=True)
    local_path = api.hf_hub_download(
        repo_id=args.repo,
        filename=newest,
        repo_type="dataset",
        local_dir=RESTORE_DIR,
        # Safer: avoid symlinks pointing into a cache that may get purged.
        # NOTE(review): this kwarg is deprecated in recent huggingface_hub —
        # confirm the installed version still accepts it.
        local_dir_use_symlinks=False,
    )
    # Print only the path so the shell script can capture it from stdout.
    print(local_path)
|
| 65 |
|
| 66 |
def main():
|
| 67 |
p = argparse.ArgumentParser()
|