tao-shen committed on
Commit
e492daf
·
1 Parent(s): 5e95362

fix: use openclaw_data/ path, sanitize API keys before upload, add staging dir and tracebacks

Browse files
Files changed (1) hide show
  1. scripts/sync_hf.py +90 -28
scripts/sync_hf.py CHANGED
@@ -7,8 +7,8 @@ Simplified persistence: upload/download the entire ~/.openclaw directory
7
  as-is to/from a Hugging Face Dataset repo.
8
 
9
  - Startup: snapshot_download → ~/.openclaw
10
- - Periodic: upload_folder → dataset .openclaw/
11
- - Shutdown: final upload_folder → dataset .openclaw/
12
  """
13
 
14
  import os
@@ -18,6 +18,10 @@ import threading
18
  import subprocess
19
  import signal
20
  import json
 
 
 
 
21
  from pathlib import Path
22
  from datetime import datetime
23
  from huggingface_hub import HfApi, snapshot_download
@@ -46,6 +50,9 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
46
  OPENCLAW_HOME = Path.home() / ".openclaw"
47
  APP_DIR = Path("/app/openclaw")
48
 
 
 
 
49
  TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
50
  TELEGRAM_BOT_NAME = os.environ.get("TELEGRAM_BOT_NAME", "opentauronbot")
51
  TELEGRAM_ALLOW_USER = os.environ.get("TELEGRAM_ALLOW_USER", "taoshen1")
@@ -75,6 +82,7 @@ EXCLUDE_PATTERNS = [
75
  "**/extensions/**",
76
  "**/.persistence.lock",
77
  "**/.persistence-state.json",
 
78
  ]
79
 
80
  # ── Sync Manager ────────────────────────────────────────────────────────────
@@ -104,49 +112,46 @@ class OpenClawFullSync:
104
  # ── Restore (startup) ─────────────────────────────────────────────
105
 
106
  def load_from_repo(self):
107
- """Download the entire .openclaw/ directory from dataset → ~/.openclaw"""
108
  if not self.enabled:
109
  return
110
  print(f"[SYNC] β–Ά Restoring ~/.openclaw from dataset {HF_REPO_ID} ...")
111
  OPENCLAW_HOME.mkdir(parents=True, exist_ok=True)
112
 
113
  try:
114
- # Check if .openclaw/ folder exists in dataset
115
  files = self.api.list_repo_files(repo_id=HF_REPO_ID, repo_type="dataset")
116
- openclaw_files = [f for f in files if f.startswith(".openclaw/")]
117
  if not openclaw_files:
118
- print("[SYNC] No .openclaw/ folder in dataset. Starting fresh.")
119
  self._ensure_default_config()
120
  self._ensure_telegram_credentials()
121
  return
122
 
123
- print(f"[SYNC] Found {len(openclaw_files)} files under .openclaw/ in dataset")
124
 
125
- # Download .openclaw/* to a temp root, then move into place
126
- import tempfile, shutil
127
  with tempfile.TemporaryDirectory() as tmpdir:
128
  snapshot_download(
129
  repo_id=HF_REPO_ID,
130
  repo_type="dataset",
131
- allow_patterns=".openclaw/**",
132
  local_dir=tmpdir,
133
  token=HF_TOKEN,
134
  )
135
- downloaded_root = Path(tmpdir) / ".openclaw"
136
  if downloaded_root.exists():
137
- # Copy all contents into ~/.openclaw, preserving existing
138
  for item in downloaded_root.rglob("*"):
139
  if item.is_file():
140
  rel = item.relative_to(downloaded_root)
141
  dest = OPENCLAW_HOME / rel
142
  dest.parent.mkdir(parents=True, exist_ok=True)
143
  shutil.copy2(str(item), str(dest))
144
- print(f"[SYNC] βœ“ Restore completed.")
145
  else:
146
- print("[SYNC] Downloaded snapshot but .openclaw dir not found. Starting fresh.")
147
 
148
  except Exception as e:
149
  print(f"[SYNC] βœ— Restore failed: {e}")
 
150
 
151
  # Patch config & telegram after restore
152
  self._patch_config()
@@ -156,40 +161,100 @@ class OpenClawFullSync:
156
  # ── Save (periodic + shutdown) ─────────────────────────────────────
157
 
158
  def save_to_repo(self):
159
- """Upload entire ~/.openclaw directory → dataset .openclaw/"""
160
  if not self.enabled:
161
  return
162
  if not OPENCLAW_HOME.exists():
163
  print("[SYNC] ~/.openclaw does not exist, nothing to save.")
164
  return
165
 
166
- print(f"[SYNC] β–Ά Uploading ~/.openclaw β†’ dataset {HF_REPO_ID} ...")
 
 
 
167
  try:
 
 
 
 
 
 
 
 
 
 
 
 
168
  self.api.upload_folder(
169
- folder_path=str(OPENCLAW_HOME),
170
- path_in_repo=".openclaw",
171
  repo_id=HF_REPO_ID,
172
  repo_type="dataset",
173
  token=HF_TOKEN,
174
- commit_message=f"Sync .openclaw β€” {datetime.now().isoformat()}",
175
  ignore_patterns=EXCLUDE_PATTERNS,
176
  )
177
  print(f"[SYNC] βœ“ Upload completed at {datetime.now().isoformat()}")
178
 
179
- # Verify: list files after upload
180
  try:
181
  files = self.api.list_repo_files(repo_id=HF_REPO_ID, repo_type="dataset")
182
- oc_files = [f for f in files if f.startswith(".openclaw/")]
183
- print(f"[SYNC] Dataset now has {len(oc_files)} files under .openclaw/")
184
- for f in oc_files[:20]:
185
  print(f"[SYNC] {f}")
186
- if len(oc_files) > 20:
187
- print(f"[SYNC] ... and {len(oc_files) - 20} more")
188
  except Exception:
189
  pass
190
 
191
  except Exception as e:
192
  print(f"[SYNC] βœ— Upload failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
  # ── Config helpers ─────────────────────────────────────────────────
195
 
@@ -199,7 +264,6 @@ class OpenClawFullSync:
199
  return
200
  default_src = Path(__file__).parent / "openclaw.json.default"
201
  if default_src.exists():
202
- import shutil
203
  shutil.copy2(str(default_src), str(config_path))
204
  print("[SYNC] Created openclaw.json from default template")
205
  else:
@@ -239,7 +303,6 @@ class OpenClawFullSync:
239
  locs = data["plugins"].get("locations", [])
240
  if isinstance(locs, list) and "/dev/null" in locs:
241
  data["plugins"]["locations"] = [l for l in locs if l != "/dev/null"]
242
- print("[SYNC] Removed /dev/null from plugins.locations")
243
 
244
  # Ensure agents defaults
245
  data.setdefault("agents", {}).setdefault("defaults", {}).setdefault("model", {})
@@ -310,7 +373,6 @@ class OpenClawFullSync:
310
  try:
311
  count = 0
312
  for root, dirs, files in os.walk(OPENCLAW_HOME):
313
- # skip noisy dirs
314
  dirs[:] = [d for d in dirs if d not in {".cache", "node_modules", "__pycache__"}]
315
  for name in sorted(files):
316
  rel = os.path.relpath(os.path.join(root, name), OPENCLAW_HOME)
 
7
  as-is to/from a Hugging Face Dataset repo.
8
 
9
  - Startup: snapshot_download → ~/.openclaw
10
+ - Periodic: upload_folder → dataset openclaw_data/
11
+ - Shutdown: final upload_folder → dataset openclaw_data/
12
  """
13
 
14
  import os
 
18
  import subprocess
19
  import signal
20
  import json
21
+ import shutil
22
+ import tempfile
23
+ import traceback
24
+ import re
25
  from pathlib import Path
26
  from datetime import datetime
27
  from huggingface_hub import HfApi, snapshot_download
 
50
  OPENCLAW_HOME = Path.home() / ".openclaw"
51
  APP_DIR = Path("/app/openclaw")
52
 
53
+ # Use "openclaw_data" (no dot prefix) to be visible in HF dataset browser
54
+ DATASET_PATH = "openclaw_data"
55
+
56
  TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
57
  TELEGRAM_BOT_NAME = os.environ.get("TELEGRAM_BOT_NAME", "opentauronbot")
58
  TELEGRAM_ALLOW_USER = os.environ.get("TELEGRAM_ALLOW_USER", "taoshen1")
 
82
  "**/extensions/**",
83
  "**/.persistence.lock",
84
  "**/.persistence-state.json",
85
+ "**/sync.log", # don't upload our own log
86
  ]
87
 
88
  # ── Sync Manager ────────────────────────────────────────────────────────────
 
112
  # ── Restore (startup) ─────────────────────────────────────────────
113
 
114
  def load_from_repo(self):
115
+ """Download from dataset → ~/.openclaw"""
116
  if not self.enabled:
117
  return
118
  print(f"[SYNC] β–Ά Restoring ~/.openclaw from dataset {HF_REPO_ID} ...")
119
  OPENCLAW_HOME.mkdir(parents=True, exist_ok=True)
120
 
121
  try:
 
122
  files = self.api.list_repo_files(repo_id=HF_REPO_ID, repo_type="dataset")
123
+ openclaw_files = [f for f in files if f.startswith(f"{DATASET_PATH}/")]
124
  if not openclaw_files:
125
+ print(f"[SYNC] No {DATASET_PATH}/ folder in dataset. Starting fresh.")
126
  self._ensure_default_config()
127
  self._ensure_telegram_credentials()
128
  return
129
 
130
+ print(f"[SYNC] Found {len(openclaw_files)} files under {DATASET_PATH}/ in dataset")
131
 
 
 
132
  with tempfile.TemporaryDirectory() as tmpdir:
133
  snapshot_download(
134
  repo_id=HF_REPO_ID,
135
  repo_type="dataset",
136
+ allow_patterns=f"{DATASET_PATH}/**",
137
  local_dir=tmpdir,
138
  token=HF_TOKEN,
139
  )
140
+ downloaded_root = Path(tmpdir) / DATASET_PATH
141
  if downloaded_root.exists():
 
142
  for item in downloaded_root.rglob("*"):
143
  if item.is_file():
144
  rel = item.relative_to(downloaded_root)
145
  dest = OPENCLAW_HOME / rel
146
  dest.parent.mkdir(parents=True, exist_ok=True)
147
  shutil.copy2(str(item), str(dest))
148
+ print("[SYNC] βœ“ Restore completed.")
149
  else:
150
+ print("[SYNC] Downloaded snapshot but dir not found. Starting fresh.")
151
 
152
  except Exception as e:
153
  print(f"[SYNC] βœ— Restore failed: {e}")
154
+ traceback.print_exc()
155
 
156
  # Patch config & telegram after restore
157
  self._patch_config()
 
161
  # ── Save (periodic + shutdown) ─────────────────────────────────────
162
 
163
  def save_to_repo(self):
164
+ """Upload entire ~/.openclaw directory → dataset"""
165
  if not self.enabled:
166
  return
167
  if not OPENCLAW_HOME.exists():
168
  print("[SYNC] ~/.openclaw does not exist, nothing to save.")
169
  return
170
 
171
+ print(f"[SYNC] β–Ά Uploading ~/.openclaw β†’ dataset {HF_REPO_ID}/{DATASET_PATH}/ ...")
172
+
173
+ # Create a sanitized staging copy to avoid HF secret detection
174
+ staging_dir = None
175
  try:
176
+ staging_dir = tempfile.mkdtemp(prefix="openclaw-staging-")
177
+ staging_path = Path(staging_dir) / "stage"
178
+
179
+ # Copy the directory, skip symlinks and excluded patterns
180
+ self._copy_for_upload(OPENCLAW_HOME, staging_path)
181
+
182
+ # Sanitize openclaw.json in the staging copy
183
+ config_staged = staging_path / "openclaw.json"
184
+ if config_staged.exists():
185
+ self._sanitize_config_file(config_staged)
186
+
187
+ # Upload the sanitized staging directory
188
  self.api.upload_folder(
189
+ folder_path=str(staging_path),
190
+ path_in_repo=DATASET_PATH,
191
  repo_id=HF_REPO_ID,
192
  repo_type="dataset",
193
  token=HF_TOKEN,
194
+ commit_message=f"Sync openclaw_data β€” {datetime.now().isoformat()}",
195
  ignore_patterns=EXCLUDE_PATTERNS,
196
  )
197
  print(f"[SYNC] βœ“ Upload completed at {datetime.now().isoformat()}")
198
 
199
+ # Verify
200
  try:
201
  files = self.api.list_repo_files(repo_id=HF_REPO_ID, repo_type="dataset")
202
+ oc_files = [f for f in files if f.startswith(f"{DATASET_PATH}/")]
203
+ print(f"[SYNC] Dataset now has {len(oc_files)} files under {DATASET_PATH}/")
204
+ for f in oc_files[:30]:
205
  print(f"[SYNC] {f}")
206
+ if len(oc_files) > 30:
207
+ print(f"[SYNC] ... and {len(oc_files) - 30} more")
208
  except Exception:
209
  pass
210
 
211
  except Exception as e:
212
  print(f"[SYNC] βœ— Upload failed: {e}")
213
+ traceback.print_exc()
214
+ finally:
215
+ if staging_dir and os.path.exists(staging_dir):
216
+ shutil.rmtree(staging_dir, ignore_errors=True)
217
+
218
+ def _copy_for_upload(self, src: Path, dst: Path):
219
+ """Copy directory for upload, skipping symlinks and excluded items."""
220
+ skip_names = {".git", "node_modules", "__pycache__", ".cache",
221
+ "extensions", ".DS_Store"}
222
+ skip_exts = {".lock", ".tmp", ".socket", ".pid", ".pyc"}
223
+
224
+ dst.mkdir(parents=True, exist_ok=True)
225
+ for item in src.iterdir():
226
+ if item.name in skip_names:
227
+ continue
228
+ if item.is_symlink():
229
+ continue
230
+ if item.is_file():
231
+ if item.suffix in skip_exts:
232
+ continue
233
+ if item.name in ("sync.log", ".persistence.lock", ".persistence-state.json"):
234
+ continue
235
+ shutil.copy2(str(item), str(dst / item.name))
236
+ elif item.is_dir():
237
+ self._copy_for_upload(item, dst / item.name)
238
+
239
+ def _sanitize_config_file(self, config_path: Path):
240
+ """Sanitize openclaw.json to remove secrets before upload."""
241
+ try:
242
+ with open(config_path, "r") as f:
243
+ data = json.load(f)
244
+
245
+ # Replace any apiKey that looks like a real key
246
+ if "models" in data and "providers" in data["models"]:
247
+ for prov_name, prov in data["models"]["providers"].items():
248
+ if isinstance(prov, dict) and "apiKey" in prov:
249
+ key = prov["apiKey"]
250
+ if isinstance(key, str) and not key.startswith("${"):
251
+ prov["apiKey"] = "<REDACTED>"
252
+
253
+ with open(config_path, "w") as f:
254
+ json.dump(data, f, indent=2)
255
+ print("[SYNC] Sanitized openclaw.json for upload (redacted API keys)")
256
+ except Exception as e:
257
+ print(f"[SYNC] Warning: could not sanitize config: {e}")
258
 
259
  # ── Config helpers ─────────────────────────────────────────────────
260
 
 
264
  return
265
  default_src = Path(__file__).parent / "openclaw.json.default"
266
  if default_src.exists():
 
267
  shutil.copy2(str(default_src), str(config_path))
268
  print("[SYNC] Created openclaw.json from default template")
269
  else:
 
303
  locs = data["plugins"].get("locations", [])
304
  if isinstance(locs, list) and "/dev/null" in locs:
305
  data["plugins"]["locations"] = [l for l in locs if l != "/dev/null"]
 
306
 
307
  # Ensure agents defaults
308
  data.setdefault("agents", {}).setdefault("defaults", {}).setdefault("model", {})
 
373
  try:
374
  count = 0
375
  for root, dirs, files in os.walk(OPENCLAW_HOME):
 
376
  dirs[:] = [d for d in dirs if d not in {".cache", "node_modules", "__pycache__"}]
377
  for name in sorted(files):
378
  rel = os.path.relpath(os.path.join(root, name), OPENCLAW_HOME)