lydgs committed on
Commit
3f834dd
·
verified ·
1 Parent(s): d72ac9e

Create scripts/backup_to_dataset.py

Browse files
Files changed (1) hide show
  1. scripts/backup_to_dataset.py +83 -0
scripts/backup_to_dataset.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import sys
4
+ import datetime
5
+ from huggingface_hub import HfApi, login
6
+
7
+ # ===== Configuration =====
8
+ DB_PATH = "/data/freellm/database.sqlite" # Path to the SQLite database file to back up
9
+ DATASET_REPO = "lydgs/freellm-backup" # Target dataset repo; replace with your own private dataset
10
+ BACKUP_PREFIX = "freellm_backup" # Filename prefix used for uploaded backups
11
+ RETENTION_DAYS = 30 # Retention period in days (older backups are deleted)
12
+ # ===================
13
+
14
def _parse_backup_timestamp(filename, prefix):
    """Extract the creation time encoded in a backup filename.

    Args:
        filename: Repo-relative file name, expected to look like
            ``<prefix>_YYYYMMDD_HHMMSS.sqlite``.
        prefix: Backup filename prefix; it may itself contain underscores.

    Returns:
        The naive ``datetime`` encoded in the name.

    Raises:
        ValueError: If the name does not follow the expected layout.
    """
    head = f"{prefix}_"
    tail = ".sqlite"
    if not (filename.startswith(head) and filename.endswith(tail)):
        raise ValueError(f"unexpected backup filename layout: {filename!r}")
    # Slice the prefix and suffix off instead of splitting on "_": the prefix
    # contains underscores, so positional splitting selects the wrong fields
    # and the timestamp never parses.
    stamp = filename[len(head):-len(tail)]
    return datetime.datetime.strptime(stamp, "%Y%m%d_%H%M%S")


def _prune_old_backups(api):
    """Delete backups in DATASET_REPO that are older than RETENTION_DAYS days.

    Files that do not match the backup naming scheme are skipped with a
    warning, and a failed deletion does not stop the remaining files from
    being processed.

    Args:
        api: An ``HfApi`` client used to list and delete repo files.

    Returns:
        The number of backups that were deleted.
    """
    repo_files = api.list_repo_files(repo_id=DATASET_REPO, repo_type="dataset")
    now = datetime.datetime.now()
    deleted_count = 0
    for name in repo_files:
        if not name.startswith(BACKUP_PREFIX) or not name.endswith(".sqlite"):
            continue
        try:
            file_time = _parse_backup_timestamp(name, BACKUP_PREFIX)
        except ValueError as parse_err:
            print(f"⚠️ 跳过无法解析的文件: {name}, 原因: {parse_err}")
            continue
        if (now - file_time).days <= RETENTION_DAYS:
            continue
        # Report delete failures separately so they are not mistaken for
        # filename parse errors; keep going with the remaining files.
        try:
            api.delete_file(path_in_repo=name, repo_id=DATASET_REPO, repo_type="dataset")
        except Exception as delete_err:
            print(f"⚠️ 删除旧备份失败: {name}, 原因: {delete_err}")
            continue
        print(f"🗑️ 已删除旧备份: {name}")
        deleted_count += 1
    return deleted_count


def backup_database():
    """Upload the SQLite database to the dataset repo and prune old backups.

    The database file at DB_PATH is uploaded to DATASET_REPO under a
    timestamped name (``<BACKUP_PREFIX>_YYYYMMDD_HHMMSS.sqlite``). After a
    successful upload, backups older than RETENTION_DAYS days are deleted.
    Cleanup is best-effort: a cleanup error is printed but does not change
    the return value.

    Returns:
        True if the upload succeeded; False if the database file is missing,
        the HF_TOKEN environment variable is unset, or the upload failed.
    """
    # Make sure there is something to back up.
    if not os.path.exists(DB_PATH):
        print(f"❌ 数据库文件不存在: {DB_PATH}")
        return False

    # Build the timestamped backup file name.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_name = f"{BACKUP_PREFIX}_{timestamp}.sqlite"

    # Read the Hugging Face token from the environment.
    token = os.getenv("HF_TOKEN")
    if not token:
        print("❌ 环境变量 HF_TOKEN 未设置")
        return False

    # Authenticate.
    login(token=token)
    api = HfApi()

    # 1. Upload the current backup.
    # NOTE(review): the database file is uploaded as-is. If another process
    # is writing to it during the upload, the copy may be inconsistent;
    # consider snapshotting via sqlite3's backup API first - confirm.
    try:
        api.upload_file(
            path_or_fileobj=DB_PATH,
            path_in_repo=backup_name,
            repo_id=DATASET_REPO,
            repo_type="dataset",
        )
        print(f"✅ 备份成功: https://huggingface.co/datasets/{DATASET_REPO}/blob/main/{backup_name}")
    except Exception as e:
        print(f"❌ 上传失败: {e}")
        return False

    # 2. Prune old backups (keep only the last RETENTION_DAYS days).
    try:
        deleted_count = _prune_old_backups(api)
        if deleted_count:
            print(f"✅ 清理完成,共删除 {deleted_count} 个旧备份")
        else:
            print("✅ 没有需要清理的旧备份")
    except Exception as e:
        print(f"⚠️ 清理旧备份时出错: {e}")

    return True
75
+
76
if __name__ == "__main__":
    # Script entry point: announce the start time, run the backup job once,
    # then report the outcome and mirror it in the process exit code
    # (0 on success, 1 on failure).
    started_at = datetime.datetime.now()
    print(f"[{started_at}] 开始备份并清理...")
    succeeded = backup_database()
    print("任务完成" if succeeded else "任务失败")
    sys.exit(0 if succeeded else 1)