File size: 2,906 Bytes
4d6f8fd
 
 
f949e55
 
 
9abffe0
f949e55
4d6f8fd
9abffe0
 
f949e55
4d6f8fd
 
 
 
 
8a46a07
f949e55
9abffe0
f949e55
9abffe0
8a46a07
 
 
 
 
4d6f8fd
9abffe0
f949e55
 
 
 
 
 
4d6f8fd
9abffe0
f949e55
8a46a07
 
 
 
 
 
 
9abffe0
4d6f8fd
f949e55
 
 
4d6f8fd
9abffe0
f949e55
4d6f8fd
9abffe0
 
 
 
 
 
 
 
 
 
4d6f8fd
 
f949e55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4d6f8fd
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import os
import sys
from pathlib import Path
from huggingface_hub import HfApi

RESTORE_DIR = os.environ.get("HF_RESTORE_DIR", "/tmp/crs_backup")  # kept in sync with the shell script's TMP_DIR

def list_backups(api: HfApi, repo_id: str, prefix: str):
    """Return the backup archives (``<prefix>*.tar.gz``) in *repo_id*, sorted ascending.

    Sorting is lexicographic, so timestamp-named archives come out oldest-first.
    """
    repo_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    return sorted(
        name
        for name in repo_files
        if name.startswith(prefix) and name.endswith(".tar.gz")
    )

def ensure_dataset(api: HfApi, repo_id: str):
    """Ensure *repo_id* exists as a dataset repo, creating it (private) if missing."""
    try:
        api.dataset_info(repo_id=repo_id)
        return  # lookup succeeded -> repo already exists
    except Exception:
        pass
    # Lookup failed -> assume the repo is missing and create it as a private
    # dataset (change private=False here if it should be public).
    api.create_repo(repo_id=repo_id, repo_type="dataset", private=True, exist_ok=True)

def upload(args):
    """Upload the archive *args.file* to the dataset repo, then prune old backups.

    At most *args.max* archives matching *args.prefix* are kept; older ones are
    deleted best-effort (a failed deletion is ignored, not fatal).
    """
    api = HfApi(token=args.token)
    ensure_dataset(api, args.repo)

    # Push the current archive under its basename.
    api.upload_file(
        path_or_fileobj=args.file,
        path_in_repo=os.path.basename(args.file),
        repo_id=args.repo,
        repo_type="dataset",
    )

    # Retention disabled (max unset or non-positive) -> nothing more to do.
    if not args.max or args.max <= 0:
        return

    existing = list_backups(api, args.repo, args.prefix)
    surplus = len(existing) - args.max
    # Oldest entries sort first, so the slice holds exactly the excess archives.
    for stale in existing[:max(surplus, 0)]:
        try:
            api.delete_file(path_in_repo=stale, repo_id=args.repo, repo_type="dataset")
        except Exception:
            pass  # deletion failure is non-fatal; a later run can retry

def restore(args):
    """Download the newest backup archive and print its local path.

    Prints nothing and returns quietly when no backup exists — the calling
    shell script is responsible for logging that case.
    """
    api = HfApi(token=args.token)
    archives = list_backups(api, args.repo, args.prefix)
    if not archives:
        return

    newest = archives[-1]
    # Download into a persistent directory: a tempdir would vanish the moment
    # this Python process exits.
    Path(RESTORE_DIR).mkdir(parents=True, exist_ok=True)
    local_path = api.hf_hub_download(
        repo_id=args.repo,
        filename=newest,
        repo_type="dataset",
        local_dir=RESTORE_DIR,
        local_dir_use_symlinks=False,  # safer: avoid symlinks into a purgeable cache
    )
    # Emit only the path so the shell script can capture it from stdout.
    print(local_path)

def main():
    """CLI entry point: dispatch the ``upload`` / ``restore`` subcommands."""
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd", required=True)

    uploader = commands.add_parser("upload")
    for flag in ("--token", "--repo", "--file", "--prefix"):
        uploader.add_argument(flag, required=True)
    uploader.add_argument("--max", type=int, default=10)
    uploader.set_defaults(func=upload)

    restorer = commands.add_parser("restore")
    for flag in ("--token", "--repo", "--prefix"):
        restorer.add_argument(flag, required=True)
    restorer.set_defaults(func=restore)

    ns = parser.parse_args()
    try:
        ns.func(ns)
    except KeyboardInterrupt:
        sys.exit(130)  # conventional exit status for SIGINT

# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()