File size: 2,906 Bytes
4d6f8fd f949e55 9abffe0 f949e55 4d6f8fd 9abffe0 f949e55 4d6f8fd 8a46a07 f949e55 9abffe0 f949e55 9abffe0 8a46a07 4d6f8fd 9abffe0 f949e55 4d6f8fd 9abffe0 f949e55 8a46a07 9abffe0 4d6f8fd f949e55 4d6f8fd 9abffe0 f949e55 4d6f8fd 9abffe0 4d6f8fd f949e55 4d6f8fd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import os
import sys
from pathlib import Path
from huggingface_hub import HfApi
# Directory that `restore` downloads backup archives into. It can be
# overridden through the HF_RESTORE_DIR environment variable.
RESTORE_DIR = os.environ.get("HF_RESTORE_DIR", "/tmp/crs_backup") # kept aligned with the shell script's TMP_DIR
def list_backups(api: HfApi, repo_id: str, prefix: str):
    """Return the backup archives found in a dataset repo, sorted by name.

    Args:
        api: Hub client used to list the repository contents.
        repo_id: Dataset repository to inspect.
        prefix: Leading string a file path must have to count as a backup.

    Returns:
        A list of repo file paths that start with ``prefix`` and end with
        ``.tar.gz``, in ascending lexicographic order.
    """
    repo_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    return sorted(
        name
        for name in repo_files
        if name.startswith(prefix) and name.endswith(".tar.gz")
    )
def ensure_dataset(api: HfApi, repo_id: str):
    """Make sure the dataset repo exists, creating it when the lookup fails.

    Args:
        api: Hub client used for the lookup and, if needed, the creation.
        repo_id: Dataset repository that must exist.

    Any exception raised by the lookup is treated as "repo missing" and
    triggers a creation attempt; errors from the creation itself propagate.
    """
    try:
        api.dataset_info(repo_id=repo_id)
    except Exception:
        # Lookup failed: create the repo as a private dataset. Switch to
        # private=False if it should be public. exist_ok=True keeps the call
        # harmless when the repo is actually there.
        creation_options = {
            "repo_id": repo_id,
            "repo_type": "dataset",
            "private": True,
            "exist_ok": True,
        }
        api.create_repo(**creation_options)
def upload(args):
    """Upload one archive to the dataset repo, then prune surplus backups.

    Args:
        args: Parsed CLI namespace providing ``token``, ``repo``, ``file``,
            ``prefix`` and ``max``.

    The archive is stored at the repo root under its base name. When
    ``args.max`` is a positive number, every backup except the last
    ``args.max`` entries of the name-sorted backup list is deleted (this
    keeps the newest ones as long as backup names sort chronologically).
    Pruning is best effort: a failed deletion is reported on stderr and does
    not abort the run or change the exit status.

    Raises:
        Exception: Whatever the Hub client raises while ensuring the repo,
            uploading the file, or listing the existing backups.
    """
    api = HfApi(token=args.token)
    ensure_dataset(api, args.repo)

    # Upload the current archive.
    api.upload_file(
        path_or_fileobj=args.file,
        path_in_repo=os.path.basename(args.file),
        repo_id=args.repo,
        repo_type="dataset",
    )

    # A missing or non-positive limit disables pruning altogether.
    if not args.max or args.max <= 0:
        return

    backups = list_backups(api, args.repo, args.prefix)
    # The list is sorted ascending, so everything before the last N entries
    # is surplus. The slice is empty when there are N or fewer backups.
    for stale in backups[: -args.max]:
        try:
            api.delete_file(path_in_repo=stale, repo_id=args.repo, repo_type="dataset")
        except Exception as exc:
            # A failed deletion is not fatal, but say so instead of hiding it,
            # otherwise old backups can pile up unnoticed.
            print(
                f"warning: could not delete old backup {stale!r}: {exc}",
                file=sys.stderr,
            )
def restore(args):
    """Download the last backup in name order and print its local path.

    Args:
        args: Parsed CLI namespace providing ``token``, ``repo`` and
            ``prefix``.

    Prints nothing and returns when the repo holds no matching backup, so
    the calling shell script can do the logging. Otherwise the only thing
    written to stdout is the path of the downloaded archive.
    """
    api = HfApi(token=args.token)
    candidates = list_backups(api, args.repo, args.prefix)
    if not candidates:
        return  # No backup available: exit quietly, the shell side logs it.

    # Candidates are sorted ascending, so the final entry is the one to fetch.
    *_, newest = candidates

    # Download into a persistent directory rather than a temporary one,
    # which would be removed as soon as Python exits.
    os.makedirs(RESTORE_DIR, exist_ok=True)
    download_options = {
        "repo_id": args.repo,
        "filename": newest,
        "repo_type": "dataset",
        "local_dir": RESTORE_DIR,
        # Safer: avoids a symlink whose target may later be cleaned up.
        "local_dir_use_symlinks": False,
    }
    local_path = api.hf_hub_download(**download_options)

    # Emit only the path; the shell script captures it from stdout.
    print(local_path)
def main():
    """Parse the command line and run the selected sub-command.

    Two sub-commands are available:

    * ``upload``  -- requires ``--token``, ``--repo``, ``--file`` and
      ``--prefix``; ``--max`` is an optional integer defaulting to 10.
    * ``restore`` -- requires ``--token``, ``--repo`` and ``--prefix``.

    A keyboard interrupt during the sub-command ends the process with exit
    status 130.
    """
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    upload_cmd = commands.add_parser("upload")
    for flag in ("--token", "--repo", "--file", "--prefix"):
        upload_cmd.add_argument(flag, required=True)
    upload_cmd.add_argument("--max", type=int, default=10)
    upload_cmd.set_defaults(func=upload)

    restore_cmd = commands.add_parser("restore")
    for flag in ("--token", "--repo", "--prefix"):
        restore_cmd.add_argument(flag, required=True)
    restore_cmd.set_defaults(func=restore)

    parsed = parser.parse_args()
    try:
        # Each sub-parser stored its handler under ``func``.
        parsed.func(parsed)
    except KeyboardInterrupt:
        sys.exit(130)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|