#!/bin/sh
set -e
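# Back up the data directory (${STORAGE_PATH}) to a HuggingFace dataset repository.
# On the first run it uploads a single backup and exits; on later runs it starts
# a background loop that uploads a fresh backup, restores the latest one, and
# squashes the repo history roughly once a week.
# Illustrative invocation (token, dataset ID, and script name are placeholders):
#   HF_TOKEN=hf_xxx DATASET_ID=username/my-backups sh backup.sh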

# Required environment variables
if [ -z "$HF_TOKEN" ] || [ -z "$DATASET_ID" ]; then
    echo "Please set the HF_TOKEN and DATASET_ID environment variables"
    exit 1
fi

# Activate the Python virtual environment (adjust the path to your setup)
. "$HOME/venv/bin/activate"

STORAGE_PATH="$HOME/app/data"
FLAG_FILE="$HOME/.hf_backup_first_done"

# Generate the hf_sync.py helper script
cat > hf_sync.py << 'EOL'
from huggingface_hub import HfApi
import sys, os, tarfile, tempfile

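# Keep at most max_files backup archives in the dataset repo; once the limit is
# reached, delete the oldest ones (the timestamped filenames sort chronologically).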
def manage_backups(api, repo_id, max_files=50):
    files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    backup_files = [f for f in files if f.startswith('backup_') and f.endswith('.tar.gz')]
    backup_files.sort()
    if len(backup_files) >= max_files:
        for file_to_delete in backup_files[:(len(backup_files) - max_files + 1)]:
            try:
                api.delete_file(path_in_repo=file_to_delete, repo_id=repo_id, repo_type="dataset")
                print(f'Deleted old backup: {file_to_delete}')
            except Exception as e:
                print(f'Error deleting {file_to_delete}: {str(e)}')

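# Upload a single backup archive to the dataset repo, then prune old backups.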
def upload_backup(file_path, file_name, token, repo_id):
    api = HfApi(token=token)
    try:
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=file_name,
            repo_id=repo_id,
            repo_type="dataset"
        )
        print(f"成功上传 {file_name}")
        manage_backups(api, repo_id)
    except Exception as e:
        print(f"上传文件出错: {str(e)}")

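# Download the newest backup archive from the dataset repo and extract it into extract_path.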
def download_latest_backup(token, repo_id, extract_path):
    try:
        api = HfApi(token=token)
        files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        backup_files = [f for f in files if f.startswith('backup_') and f.endswith('.tar.gz')]
        if not backup_files:
            print("未找到任何备份文件")
            return
        latest_backup = sorted(backup_files)[-1]
        with tempfile.TemporaryDirectory() as temp_dir:
            filepath = api.hf_hub_download(
                repo_id=repo_id,
                filename=latest_backup,
                repo_type="dataset",
                local_dir=temp_dir
            )
            if filepath and os.path.exists(filepath):
                with tarfile.open(filepath, 'r:gz') as tar:
                    tar.extractall(extract_path)
                print(f"已成功恢复备份: {latest_backup}")
    except Exception as e:
        print(f"下载备份出错: {str(e)}")

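# Collapse the dataset repo's commit history into a single commit to keep the repo small.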
def super_squash_history(token, repo_id):
    try:
        api = HfApi(token=token)
        api.super_squash_history(repo_id=repo_id, repo_type="dataset")
        print("历史合并完成。")
    except Exception as e:
        print(f"合并历史出错: {str(e)}")

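# Minimal CLI: hf_sync.py <upload|download|super_squash> <token> <repo_id> [extra args]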
if __name__ == "__main__":
    action = sys.argv[1]
    token = sys.argv[2]
    repo_id = sys.argv[3]
    if action == "upload":
        file_path = sys.argv[4]
        file_name = sys.argv[5]
        upload_backup(file_path, file_name, token, repo_id)
    elif action == "download":
        extract_path = sys.argv[4] if len(sys.argv) > 4 else '.'
        download_latest_backup(token, repo_id, extract_path)
    elif action == "super_squash":
        super_squash_history(token, repo_id)
EOL

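# Create a timestamped tar.gz of the data directory and upload it via hf_sync.py.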
upload_backup() {
    if [ -d "${STORAGE_PATH}" ]; then
        echo "上传备份数据..."
        timestamp=$(date +%Y%m%d_%H%M%S)
        backup_file="backup_${timestamp}.tar.gz"
        tar -czf "/tmp/${backup_file}" -C "$(dirname "${STORAGE_PATH}")" "$(basename "${STORAGE_PATH}")"
        python hf_sync.py upload "${HF_TOKEN}" "${DATASET_ID}" "/tmp/${backup_file}" "${backup_file}"
        rm -f "/tmp/${backup_file}"
    else
        echo "数据目录不存在,无法上传备份"
    fi
}

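# Restore the latest backup from the dataset repo into $HOME/app.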
download_restore() {
    echo "从 HuggingFace 下载最新备份并恢复..."
    python hf_sync.py download "${HF_TOKEN}" "${DATASET_ID}" "$HOME/app"
}

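# First run: upload one backup, create the flag file, and exit without starting the sync loop.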
if [ ! -f "$FLAG_FILE" ]; then
    echo "首次运行,开始上传备份..."
    upload_backup
    touch "$FLAG_FILE"
    echo "首次备份完成,程序退出。"
    exit 0
fi

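# Main loop: upload a backup, restore the latest one, squash history at most once
# every 7 days, then sleep for SYNC_INTERVAL seconds (default 7200).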
sync_data() {
    while true; do
        echo "同步进程启动于 $(date)"
        upload_backup
        download_restore

        SQUASH_FLAG_FILE="/tmp/last_squash_time"
        NOW=$(date +%s)
        SEVEN_DAYS=$((7*24*60*60))

        if [ ! -f "$SQUASH_FLAG_FILE" ]; then
            echo $NOW > "$SQUASH_FLAG_FILE"
            echo "首次合并历史提交..."
            python hf_sync.py super_squash "${HF_TOKEN}" "${DATASET_ID}"
        else
            LAST=$(cat "$SQUASH_FLAG_FILE")
            DIFF=$((NOW - LAST))
            if [ $DIFF -ge $SEVEN_DAYS ]; then
                echo $NOW > "$SQUASH_FLAG_FILE"
                echo "超过7天,合并历史提交..."
                python hf_sync.py super_squash "${HF_TOKEN}" "${DATASET_ID}"
            else
                echo "未满7天,跳过合并历史提交。"
            fi
        fi

        SYNC_INTERVAL=${SYNC_INTERVAL:-7200}
        echo "下次同步将在 ${SYNC_INTERVAL} 秒后进行..."
        sleep $SYNC_INTERVAL
    done
}

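# Run the sync loop in the background.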
sync_data &