File size: 4,506 Bytes
08e5d06
 
 
 
 
 
 
 
5fa24bd
6f33faf
5fa24bd
08e5d06
5fa24bd
08e5d06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fa24bd
08e5d06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fa24bd
08e5d06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fa24bd
 
08e5d06
 
5fa24bd
08e5d06
5fa24bd
08e5d06
 
 
 
 
 
5fa24bd
 
 
08e5d06
 
 
 
5fa24bd
 
08e5d06
 
 
 
 
 
 
 
5fa24bd
08e5d06
 
 
 
 
 
 
 
 
 
 
 
5fa24bd
39941cd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/bin/sh

# Require the HuggingFace credentials up front: everything below (restore on
# boot, periodic backup) depends on them, so abort early if they are missing.
# Fix: the old message claimed "Starting without backup functionality" while
# the script actually exited — the message now matches the real behavior.
if [ -z "$HF_TOKEN" ] || [ -z "$DATASET_ID" ]; then
    echo "Missing HF_TOKEN or DATASET_ID - backup functionality unavailable, exiting" >&2
    exit 1
fi

# Activate the Python virtual environment that provides huggingface_hub.
. /app/venv/bin/activate

# Generate the hf_sync.py helper (backup upload AND download) used below
cat > hf_sync.py << 'EOL'
from huggingface_hub import HfApi
import sys
import os
import tarfile
import tempfile

def manage_backups(api, repo_id, max_files=50):
    """Prune old backups so at most ``max_files`` remain in the dataset repo.

    Called right after a new backup is uploaded, so the fresh file is already
    present in the listing. Backup names embed a YYYYmmdd_HHMMSS timestamp,
    so lexicographic order equals chronological order.

    Args:
        api: huggingface_hub HfApi client (already authenticated).
        repo_id: target dataset repository id.
        max_files: maximum number of backup archives to keep.
    """
    files = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
    backup_files = [f for f in files if f.startswith('backup_') and f.endswith('.tar.gz')]
    backup_files.sort()

    # Off-by-one fix: the previous code deleted (len - max_files + 1) files,
    # leaving only max_files - 1 backups. Keep exactly max_files newest ones.
    if len(backup_files) > max_files:
        for file_to_delete in backup_files[:len(backup_files) - max_files]:
            try:
                api.delete_file(path_in_repo=file_to_delete, repo_id=repo_id, repo_type="dataset")
                print(f'Deleted old backup: {file_to_delete}')
            except Exception as e:
                # Best effort: a failed delete should not abort the sync loop.
                print(f'Error deleting {file_to_delete}: {str(e)}')

def upload_backup(file_path, file_name, token, repo_id):
    """Upload one backup archive to the dataset repo, then prune old backups.

    Any failure — during upload or pruning — is printed, never raised, so the
    caller's sync loop keeps running.
    """
    hf = HfApi(token=token)
    try:
        hf.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=file_name,
            repo_id=repo_id,
            repo_type="dataset",
        )
        print(f"Successfully uploaded {file_name}")
        # Pruning stays inside the try on purpose: its errors are reported
        # through the same handler as upload errors.
        manage_backups(hf, repo_id)
    except Exception as e:
        print(f"Error uploading file: {str(e)}")

# Restore helper: fetch and unpack the newest backup archive.
def download_latest_backup(token, repo_id, extract_path):
    """Download the most recent backup_*.tar.gz from the dataset repo and
    extract it under ``extract_path``. Errors are printed, never raised."""
    try:
        api = HfApi(token=token)
        names = api.list_repo_files(repo_id=repo_id, repo_type="dataset")
        backups = sorted(n for n in names if n.startswith('backup_') and n.endswith('.tar.gz'))

        if not backups:
            print("No backup files found")
            return

        newest = backups[-1]

        with tempfile.TemporaryDirectory() as workdir:
            # NOTE(review): hf_hub_download is called as an HfApi *method*;
            # this needs a reasonably recent huggingface_hub — confirm.
            archive = api.hf_hub_download(
                repo_id=repo_id,
                filename=newest,
                repo_type="dataset",
                local_dir=workdir
            )

            if archive and os.path.exists(archive):
                with tarfile.open(archive, 'r:gz') as tar:
                    tar.extractall(extract_path)  # unpack into the requested directory
                print(f"Successfully restored backup from {newest}")

    except Exception as e:
        print(f"Error downloading backup: {str(e)}")

if __name__ == "__main__":
    action = sys.argv[1]
    token = sys.argv[2]
    repo_id = sys.argv[3]
    
    if action == "upload":
        file_path = sys.argv[4]
        file_name = sys.argv[5]
        upload_backup(file_path, file_name, token, repo_id)
    elif action == "download":
        extract_path = sys.argv[4] if len(sys.argv) > 4 else '.'  # 默认为当前目录
        download_latest_backup(token, repo_id, extract_path)
EOL

# On first start, restore the latest backup from HuggingFace (extracted into the app directory)
echo "Downloading latest backup from HuggingFace..."
python hf_sync.py download "${HF_TOKEN}" "${DATASET_ID}" "./"  

# Periodic backup loop: archive ${STORAGE_PATH} and push it to the HF dataset
# repo. Runs forever; intended to be launched in the background.
sync_data() {
    while true; do
        echo "Starting sync process at $(date)"

        # Directory holding the application state to back up.
        STORAGE_PATH="./storage"
        if [ -d "${STORAGE_PATH}" ]; then
            timestamp=$(date +%Y%m%d_%H%M%S)
            backup_file="backup_${timestamp}.tar.gz"

            # Archive from the parent directory (-C) so the tarball contains
            # "storage/" rather than a parent path prefix.
            # Fix: only upload when tar succeeded — previously a failed tar
            # could push a truncated/empty archive to the repo.
            if tar -czf "/tmp/${backup_file}" -C "$(dirname "${STORAGE_PATH}")" "$(basename "${STORAGE_PATH}")"; then
                echo "Uploading backup to HuggingFace..."
                python hf_sync.py upload "${HF_TOKEN}" "${DATASET_ID}" "/tmp/${backup_file}" "${backup_file}"
            else
                echo "tar failed for ${STORAGE_PATH}, skipping upload" >&2
            fi

            # Remove the temporary archive regardless of upload outcome.
            rm -f "/tmp/${backup_file}"
        else
            echo "Storage directory ${STORAGE_PATH} does not exist, waiting..."
        fi

        # Sleep between sync cycles (default 7200 s = 2 hours); quote the
        # variable so an empty/whitespace value cannot mangle the command.
        SYNC_INTERVAL=${SYNC_INTERVAL:-7200}
        echo "Next sync in ${SYNC_INTERVAL} seconds..."
        sleep "$SYNC_INTERVAL"
    done
}

# Launch the backup loop in the background
sync_data &

# Start the main application (adjust to the actual entry point)
exec bash install_reader.sh  # or replace with your own start command