sharween committed on
Commit
13abd1d
·
verified ·
1 Parent(s): 4b045d4

Upload backup-manager.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. backup-manager.py +270 -0
backup-manager.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ backup-manager.py — WebDAV + HF Dataset 双引擎备份/恢复
4
+
5
+ 策略:
6
+ - 全量备份 (tar.gz): 每 24h → WebDAV + HF Dataset
7
+ - 增量备份 (SHA256 manifest): 每小时 → WebDAV (仅变更文件)
8
+ - 恢复: 优先 WebDAV (全量→增量叠加) → fallback HF Dataset
9
+ """
10
+
11
+ import hashlib, json, os, tarfile, time, sys, copy
12
+ from pathlib import Path
13
+
14
+ import requests
15
+
16
+ # ── 配置 ──────────────────────────────────────────────────────────
17
# Runtime configuration, read once from the environment at import time.
STATE_DIR = os.environ.get("OPENCLAW_STATE_DIR", "/root/.openclaw")
# Trailing slashes are dropped so URL joins do not produce "//".
WEBDAV_URL = os.environ.get("WEBDAV_URL", "").rstrip("/")
WEBDAV_USER = os.environ.get("WEBDAV_USERNAME", "")
WEBDAV_PASS = os.environ.get("WEBDAV_PASSWORD", "")
# Leading/trailing slashes are stripped for the same reason: the base path is
# interpolated between two "/" separators when request URLs are built.
WEBDAV_PATH = os.environ.get("WEBDAV_BASE_PATH", "openclaw-backup").strip("/")
HF_REPO = os.environ.get("HF_DATASET", "")
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Backup intervals, in minutes.
BACKUP_INCREMENT_INTERVAL = int(os.environ.get("BACKUP_INCREMENT_INTERVAL", "60"))
BACKUP_FULL_INTERVAL = int(os.environ.get("BACKUP_FULL_INTERVAL", "1440"))

# Remote object names for the full archive and the incremental index.
FULL_NAME = "openclaw-full.tar.gz"
MANIFEST_NAME = "_incremental_manifest.json"
31
+
32
+ # ── WebDAV 原始 HTTP 层 ──────────────────────────────────────────
33
+
34
def _wd_auth():
    """Return the ``(user, password)`` pair for HTTP auth, or ``None``.

    ``None`` is returned when no WebDAV user name is configured.
    """
    if not WEBDAV_USER:
        return None
    return WEBDAV_USER, WEBDAV_PASS
36
+
37
def _wd_url(path=""):
    """Build the absolute WebDAV URL for *path* under the backup base path.

    Args:
        path: Slash-separated path relative to ``WEBDAV_PATH``. A leading
            slash is ignored. An empty path yields the base collection URL
            (with a trailing slash).

    Returns:
        The URL as a string. Each path segment is percent-encoded so that
        file names containing characters such as ``#``, ``?``, ``%`` or
        spaces address the intended resource instead of being parsed as a
        fragment, query string or escape sequence.
    """
    from urllib.parse import quote

    # Skip empty components so an empty or slash-wrapped base path never
    # produces "//" in the URL.
    base = "/".join(
        seg for seg in (WEBDAV_URL.rstrip("/"), WEBDAV_PATH.strip("/")) if seg
    )
    return f"{base}/{quote(path.lstrip('/'), safe='/')}"
39
+
40
def _wd_req(method, path="", **kwargs):
    """Send one HTTP request to the WebDAV server and return the response.

    Args:
        method: HTTP/WebDAV verb, e.g. ``"GET"``, ``"PUT"``, ``"MKCOL"``.
        path: Path relative to the backup base path.
        **kwargs: Passed through to ``requests.request``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``).
    """
    response = requests.request(
        method,
        _wd_url(path),
        auth=_wd_auth(),
        timeout=60,
        **kwargs,
    )
    response.raise_for_status()
    return response
45
+
46
def wd_exists(path):
    """Return ``True`` if *path* exists on the WebDAV server.

    The check is a ``PROPFIND`` with ``Depth: 0``. Without that header the
    request means ``Depth: infinity``, which servers are allowed to refuse
    and which would otherwise enumerate a whole collection just to test for
    existence.

    Any request failure (HTTP error status, connection problem, timeout,
    invalid URL) is reported as ``False``.
    """
    try:
        _wd_req("PROPFIND", path, headers={"Depth": "0"})
    except requests.RequestException:
        return False
    return True
52
+
53
def wd_upload(path, data):
    """PUT *data* to *path* on the WebDAV server and return the response."""
    response = _wd_req("PUT", path, data=data)
    return response
55
+
56
def wd_download(path):
    """GET *path* from the WebDAV server and return the body as bytes."""
    response = _wd_req("GET", path)
    return response.content
58
+
59
def wd_mkdir(parts):
    """Create the backup base collection and every prefix of *parts*.

    Args:
        parts: Directory names below the backup base path, outermost first.
            An empty list creates only the base collection.

    ``MKCOL`` is issued for the base collection first (the empty relative
    path) and then for each successively deeper prefix, because a WebDAV
    server rejects ``MKCOL`` when the parent collection does not exist.
    """
    for depth in range(len(parts) + 1):
        collection = "/".join(parts[:depth])
        try:
            _wd_req("MKCOL", collection)
        except requests.RequestException:
            # Best effort: MKCOL on an already existing collection is
            # answered with an error status. A real problem will surface
            # on the upload that follows.
            pass
67
+
68
+ # ── HF Dataset 层 ────────────────────────────────────────────────
69
+
70
def _hf_upload(tarpath: str):
    """Upload the archive at *tarpath* to the configured HF dataset repo.

    Returns immediately when either ``HF_REPO`` or ``HF_TOKEN`` is unset.
    The file is stored in the repo under ``FULL_NAME``.
    """
    if not (HF_REPO and HF_TOKEN):
        return

    # Imported here rather than at module top, so huggingface_hub is only
    # loaded when a mirror upload actually happens.
    from huggingface_hub import HfApi

    client = HfApi()
    with open(tarpath, "rb") as archive:
        client.upload_file(
            path_or_fileobj=archive,
            path_in_repo=FULL_NAME,
            repo_id=HF_REPO,
            repo_type="dataset",
            token=HF_TOKEN,
        )
    print(f"[backup] Full backup mirrored to HF Dataset ({HF_REPO})")
84
+
85
+ def _hf_download() -> str | None:
86
+ if not HF_REPO or not HF_TOKEN:
87
+ return None
88
+ try:
89
+ from huggingface_hub import hf_hub_download
90
+ return hf_hub_download(
91
+ repo_id=HF_REPO, filename=FULL_NAME,
92
+ repo_type="dataset", token=HF_TOKEN,
93
+ )
94
+ except Exception as e:
95
+ print(f"[restore] HF fallback unavailable: {e}")
96
+ return None
97
+
98
+ # ── 文件哈希 ──────────────────────────────────────────────────────
99
+
100
def _file_hash(path: str) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in 64 KiB chunks, so it is never held in memory whole.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (EOF).
        for block_bytes in iter(lambda: stream.read(65536), b""):
            digest.update(block_bytes)
    return digest.hexdigest()
109
+
110
+ # ── Manifest (增量索引) ──────────────────────────────────────────
111
+
112
def _load_manifest() -> dict:
    """Download and parse the incremental manifest from WebDAV.

    Returns:
        The decoded JSON document, or an empty dict when the download or the
        parsing fails for any reason.
    """
    try:
        raw = wd_download(MANIFEST_NAME)
        parsed = json.loads(raw)
    except Exception:
        return {}
    return parsed
118
+
119
def _save_manifest(manifest: dict):
    """Serialize *manifest* as indented JSON and upload it to WebDAV."""
    payload = json.dumps(manifest, indent=2)
    wd_upload(MANIFEST_NAME, payload.encode())
121
+
122
+ # ── 全量备份 ──────────────────────────────────────────────────────
123
+
124
def full_backup():
    """Archive the state directory and push it to WebDAV and the HF dataset.

    Every top-level entry of ``STATE_DIR`` is added to a gzip-compressed tar
    archive. The archive is uploaded to WebDAV as ``FULL_NAME`` when
    ``WEBDAV_URL`` is set, and then handed to ``_hf_upload``.

    If ``STATE_DIR`` is not a directory the backup is skipped, so an empty
    archive never replaces an existing remote backup.

    The archive is written to a uniquely named temporary file, which is
    removed even if an upload raises.
    """
    import tempfile

    root = Path(STATE_DIR)
    if not root.is_dir():
        print(f"[backup] State dir {STATE_DIR} not found, skipping full backup")
        return

    # mkstemp gives an unpredictable, exclusively created name instead of a
    # fixed path in the shared temp directory.
    fd, tarpath = tempfile.mkstemp(prefix="openclaw-full-", suffix=".tar.gz")
    os.close(fd)
    try:
        with tarfile.open(tarpath, "w:gz") as tar:
            for item in root.iterdir():
                # exists() is False for entries that vanished meanwhile and
                # for dangling symlinks; those are left out.
                if item.exists():
                    tar.add(str(item), arcname=item.name)
        size = os.path.getsize(tarpath)
        print(f"[backup] Full archive created ({size} bytes)")

        # Upload to WebDAV
        if WEBDAV_URL:
            wd_mkdir([])
            with open(tarpath, "rb") as f:
                # Passing the open file lets requests stream the body
                # instead of loading the whole archive into memory.
                wd_upload(FULL_NAME, f)
            print("[backup] Full backup uploaded to WebDAV")

        # Mirror to HF Dataset
        _hf_upload(tarpath)
    finally:
        os.remove(tarpath)
146
+
147
+ # ── 增量备份 ──────────────────────────────────────────────────────
148
+
149
def incremental_backup() -> int:
    """Upload files whose SHA-256 changed since the last manifest.

    Walks ``STATE_DIR`` recursively. Paths whose relative name starts with
    ``.`` or ``_incremental`` (or equals ``MANIFEST_NAME``) are skipped. Each
    remaining file is hashed and, if the hash differs from the manifest
    entry, uploaded below ``files/`` on WebDAV.

    Returns:
        The number of files uploaded in this run. ``0`` is returned without
        any network traffic when WebDAV is not configured or ``STATE_DIR``
        does not exist.

    A file that cannot be read or uploaded is reported and skipped; the
    other files are still processed and the manifest is still saved.
    Manifest entries for files that no longer exist locally are dropped so
    that a later restore does not bring deleted files back.
    """
    root = Path(STATE_DIR)
    if not WEBDAV_URL or not root.exists():
        return 0

    manifest = _load_manifest()
    changed = 0
    seen = set()

    for fpath in root.rglob("*"):
        if not fpath.is_file():
            continue
        # as_posix() keeps manifest keys and remote paths "/"-separated
        # regardless of the local path separator.
        rel = fpath.relative_to(root).as_posix()
        if rel == MANIFEST_NAME or rel.startswith((".", "_incremental")):
            continue
        seen.add(rel)

        try:
            cur_h = _file_hash(str(fpath))
            if cur_h == manifest.get(rel, {}).get("sha256"):
                continue
            parts = ["files", *rel.split("/")]
            wd_mkdir(parts[:-1])
            wd_upload("/".join(parts), fpath.read_bytes())
            info = fpath.stat()
        except (OSError, requests.RequestException) as e:
            # The manifest entry is left untouched, so the file is retried
            # on the next run.
            print(f"[backup] Skipped {rel}: {e}")
            continue

        manifest[rel] = {
            "sha256": cur_h,
            "mtime": info.st_mtime,
            "size": info.st_size,
        }
        changed += 1

    # Only prune when at least one file was seen: an unexpectedly empty
    # state directory must not wipe the whole remote index.
    stale = [rel for rel in manifest if rel not in seen] if seen else []
    for rel in stale:
        del manifest[rel]

    if changed or stale:
        _save_manifest(manifest)
    return changed
181
+
182
+ # ── 恢复 ──────────────────────────────────────────────────────────
183
+
184
def _extract_archive(tarpath, dest):
    """Extract the gzip'd tar at *tarpath* into *dest*.

    Uses tarfile's ``data`` extraction filter when the running Python
    provides it, which rejects members that would land outside *dest*.
    """
    with tarfile.open(tarpath, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=dest, filter="data")
        else:
            tar.extractall(path=dest)


def _apply_incremental(root):
    """Overlay the files listed in the remote manifest onto *root*.

    Returns the number of files written. Entries that resolve outside
    *root* are skipped; files that fail to download or write are reported
    and skipped.
    """
    manifest = _load_manifest()
    if not isinstance(manifest, dict):
        return 0

    base = root.resolve()
    count = 0
    for rel in manifest:
        target = (base / rel).resolve()
        # The manifest comes from the remote server; never let a key such
        # as "../x" or "/etc/x" write outside the state directory.
        if target == base or not target.is_relative_to(base):
            print(f"[restore] Skipping unsafe manifest entry: {rel}")
            continue
        try:
            data = wd_download(f"files/{rel}")
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_bytes(data)
            count += 1
        except (OSError, requests.RequestException) as e:
            print(f"[restore] Could not restore {rel}: {e}")
    return count


def _restore_from_webdav(root):
    """Download and extract the full archive, then apply incremental files."""
    import tempfile

    print("[restore] Downloading full backup from WebDAV...")
    fd, tarpath = tempfile.mkstemp(prefix="openclaw-restore-", suffix=".tar.gz")
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(wd_download(FULL_NAME))
        _extract_archive(tarpath, STATE_DIR)
    finally:
        os.remove(tarpath)

    count = _apply_incremental(root)
    print(f"[restore] Applied {count} incremental file overrides")
    print("[restore] Restore from WebDAV complete")


def restore():
    """Restore: WebDAV primary (full → incremental) → HF Dataset fallback.

    ``STATE_DIR`` is created if missing. If WebDAV holds a full archive it
    is extracted and the incremental files are written on top. If WebDAV
    has no archive, or restoring from it fails, the HF dataset copy is
    tried.

    Raises:
        Exception: The WebDAV error is re-raised when the WebDAV restore
            failed and the HF fallback did not restore anything, so a
            broken backup is never mistaken for "no backup".
    """
    root = Path(STATE_DIR)
    root.mkdir(parents=True, exist_ok=True)

    restored = False
    webdav_error = None

    # Strategy 1: WebDAV full backup + incremental overrides
    if WEBDAV_URL and wd_exists(FULL_NAME):
        try:
            _restore_from_webdav(root)
            restored = True
        except Exception as e:
            webdav_error = e
            print(f"[restore] WebDAV restore failed: {e}")

    # Strategy 2: HF Dataset fallback
    if not restored:
        print("[restore] WebDAV unavailable, trying HF Dataset fallback...")
        path = _hf_download()
        if path:
            _extract_archive(path, STATE_DIR)
            print("[restore] Restore from HF Dataset complete")
            restored = True

    if not restored:
        if webdav_error is not None:
            raise webdav_error
        print("[restore] No backup found — fresh start")
234
+
235
+ # ── 调度器 ────────────────────────────────────────────────────────
236
+
237
def scheduler_loop():
    """Run incremental and full backups forever on their configured intervals.

    Each cycle sleeps ``BACKUP_INCREMENT_INTERVAL`` minutes, runs an
    incremental backup, and then runs a full backup if none has succeeded
    yet or if at least ``BACKUP_FULL_INTERVAL`` minutes have passed since
    the last successful one.

    A failing backup is logged and the loop keeps going. A failed full
    backup is retried on the next cycle because the timestamp is only
    advanced on success. This function does not return.
    """
    inc_interval = BACKUP_INCREMENT_INTERVAL
    full_interval = BACKUP_FULL_INTERVAL

    # time.monotonic() value of the last successful full backup. A monotonic
    # clock keeps the interval arithmetic immune to wall-clock adjustments.
    last_full: float | None = None

    while True:
        time.sleep(inc_interval * 60)

        try:
            c = incremental_backup()
            print(f"[scheduler] Incremental: {c} files changed")
        except Exception as e:
            print(f"[scheduler] Incremental backup failed: {e}")

        now = time.monotonic()
        if last_full is None or (now - last_full) / 60 >= full_interval:
            try:
                full_backup()
                last_full = now
            except Exception as e:
                print(f"[scheduler] Full backup failed: {e}")
253
+
254
+ # ── CLI ───────────────────────────────────────────────────────────
255
+
256
if __name__ == "__main__":
    # The first CLI argument selects the action; "restore" is the default.
    argv = sys.argv
    cmd = "restore" if len(argv) < 2 else argv[1]

    # Reject unknown commands up front with a usage line and exit status 1.
    if cmd not in ("restore", "incremental", "full", "scheduler"):
        print(f"Usage: {sys.argv[0]} {{restore|incremental|full|scheduler}}")
        sys.exit(1)

    if cmd == "scheduler":
        scheduler_loop()
    elif cmd == "full":
        full_backup()
    elif cmd == "incremental":
        c = incremental_backup()
        print(f"[backup] Incremental: {c} files changed")
    else:
        restore()