Anurag committed on
Commit
28ef70e
·
1 Parent(s): 960fd80

Sync only populated Jupyter devdata folders

Browse files
Files changed (3) hide show
  1. jupyter-devdata-sync.py +46 -14
  2. openclaw-sync.py +108 -23
  3. start.sh +17 -0
jupyter-devdata-sync.py CHANGED
@@ -18,6 +18,9 @@ MAX_FILE_SIZE_BYTES = int(
18
  # Max stale files to delete per commit. Mirrors openclaw-sync.py behaviour.
19
  # Override via DEVDATA_PRUNE_BATCH_SIZE.
20
  PRUNE_BATCH_SIZE = int((os.environ.get("DEVDATA_PRUNE_BATCH_SIZE", "").strip() or "50"))
 
 
 
21
 
22
  def is_true(value):
23
  return str(value).strip().lower() in {"1", "true", "yes", "on"}
@@ -35,33 +38,40 @@ def classify_error(exc: Exception) -> str:
35
  return "safety-scan"
36
  return "general"
37
 
38
- # BUG FIX #4: ".local/share/Trash" in the original EXCLUDE set was a
39
- # multi-component path string that was never matched because parts-based
40
- # lookup compares individual directory names. Added "Trash" as a standalone
41
- # component so any path with a "Trash" segment (e.g. .local/share/Trash/*)
42
- # is correctly skipped during snapshot and restore.
43
  EXCLUDE = {
44
  ".cache",
45
  "node_modules",
46
  ".npm",
47
  ".yarn",
48
- "Trash", # BUG FIX #4: covers .local/share/Trash (was ".local/share/Trash" never matched)
49
  ".ipynb_checkpoints",
50
  ".openclaw",
51
  "app",
52
  "HuggingClaw",
53
  "HuggingClaw-Workspace",
54
  "browser-deps",
55
- # Exclude Python/system package directories — these contain thousands of files
56
- # (e.g. .local/lib/python3.11/site-packages/) and must not be synced to the
57
- # HF Dataset. Syncing them causes 10,000+ file fetches on every restore and
58
- # can restore a broken jsonschema that crashes JupyterLab on boot.
59
- ".local",
60
- "lib",
61
  "site-packages",
62
  "__pycache__",
63
  }
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  def enabled():
67
  jupyter_override = os.environ.get("HUGGINGCLAW_JUPYTER_ENABLED", "")
@@ -150,11 +160,26 @@ def _name_is_secret(name: str) -> bool:
150
  return any(_fnmatch.fnmatch(name_lower, pat) for pat in SECRET_FILENAME_PATTERNS)
151
 
152
 
 
 
 
 
153
  def should_skip(p: Path):
 
 
 
 
 
154
  # Skip directories/files in the hard-coded exclude set.
155
  parts = p.parts
156
  if any(x in parts for x in EXCLUDE):
157
  return True
 
 
 
 
 
 
158
  # Skip any component whose name looks like a secret file/dir.
159
  return any(_name_is_secret(part) for part in parts)
160
 
@@ -169,6 +194,10 @@ def snapshot(src: Path, dst: Path) -> tuple[bool, set[str]]:
169
  continue
170
  target = dst / rel
171
  if p.is_dir():
 
 
 
 
172
  target.mkdir(parents=True, exist_ok=True)
173
  elif p.is_file():
174
  # BUG FIX #5: Skip files that exceed the size limit.
@@ -177,12 +206,15 @@ def snapshot(src: Path, dst: Path) -> tuple[bool, set[str]]:
177
  protected_large_files.add(rel.as_posix())
178
  continue
179
  except OSError:
 
 
180
  continue
181
  target.parent.mkdir(parents=True, exist_ok=True)
182
  try:
183
  shutil.copy2(p, target)
184
  except OSError:
185
  had_copy_failures = True
 
186
  return had_copy_failures, protected_large_files
187
 
188
  def is_jupyter_running(port: int = 8888) -> bool:
@@ -211,10 +243,10 @@ def restore_once(api, rid: str):
211
  snapshot_download(repo_id=rid, repo_type="dataset", local_dir=str(tmp), local_dir_use_symlinks=False, token=HF_TOKEN)
212
  for p in tmp.rglob("*"):
213
  rel = p.relative_to(tmp)
214
- if should_skip(rel):
215
- continue
216
  if str(rel) == ".gitattributes":
217
  continue
 
 
218
  target = JUPYTER_ROOT / rel
219
  if p.is_dir():
220
  target.mkdir(parents=True, exist_ok=True)
 
18
  # Max stale files to delete per commit. Mirrors openclaw-sync.py behaviour.
19
  # Override via DEVDATA_PRUNE_BATCH_SIZE.
20
  PRUNE_BATCH_SIZE = int((os.environ.get("DEVDATA_PRUNE_BATCH_SIZE", "").strip() or "50"))
21
+ # Reserved filenames previously generated by DevData. Never copy or restore
22
+ # them as user content, and let prune delete any old remote marker files.
23
+ RESERVED_SYNC_FILENAMES = {".huggingclaw-empty-dir"}
24
 
25
  def is_true(value):
26
  return str(value).strip().lower() in {"1", "true", "yes", "on"}
 
38
  return "safety-scan"
39
  return "general"
40
 
41
+ # Directory names that are always unsafe/noisy to persist. Keep this list to
42
+ # single path components; path-specific exclusions live in SKIP_PATH_PREFIXES.
 
 
 
43
  EXCLUDE = {
44
  ".cache",
45
  "node_modules",
46
  ".npm",
47
  ".yarn",
48
+ "Trash", # covers .local/share/Trash and any nested trash folder
49
  ".ipynb_checkpoints",
50
  ".openclaw",
51
  "app",
52
  "HuggingClaw",
53
  "HuggingClaw-Workspace",
54
  "browser-deps",
 
 
 
 
 
 
55
  "site-packages",
56
  "__pycache__",
57
  }
58
 
59
+ # Path prefixes, relative to JUPYTER_ROOT, that should not be synced. This keeps
60
+ # package/runtime trees out of the dataset without blocking real JupyterLab
61
+ # settings under .local/share/jupyter/lab/user-settings.
62
+ SKIP_PATH_PREFIXES = {
63
+ (".jupyter", "runtime"),
64
+ (".local", "bin"),
65
+ (".local", "lib"),
66
+ ("lib",),
67
+ }
68
+
69
+ JUPYTER_DATA_DIR_PREFIX = (".local", "share", "jupyter")
70
+ JUPYTER_DATA_ALLOW_PREFIXES = {
71
+ (".local", "share", "jupyter", "lab", "user-settings"),
72
+ (".local", "share", "jupyter", "lab", "workspaces"),
73
+ }
74
+
75
 
76
  def enabled():
77
  jupyter_override = os.environ.get("HUGGINGCLAW_JUPYTER_ENABLED", "")
 
160
  return any(_fnmatch.fnmatch(name_lower, pat) for pat in SECRET_FILENAME_PATTERNS)
161
 
162
 
163
+ def _matches_prefix(parts: tuple[str, ...], prefix: tuple[str, ...]) -> bool:
164
+ return len(parts) >= len(prefix) and parts[:len(prefix)] == prefix
165
+
166
+
167
  def should_skip(p: Path):
168
+ # Reserved sync helper files are not user data. Old datasets may still have
169
+ # these markers, but fresh snapshots no longer create them.
170
+ if p.name in RESERVED_SYNC_FILENAMES:
171
+ return True
172
+
173
  # Skip directories/files in the hard-coded exclude set.
174
  parts = p.parts
175
  if any(x in parts for x in EXCLUDE):
176
  return True
177
+ if _matches_prefix(parts, JUPYTER_DATA_DIR_PREFIX) and not any(
178
+ _matches_prefix(parts, prefix) for prefix in JUPYTER_DATA_ALLOW_PREFIXES
179
+ ):
180
+ return True
181
+ if any(_matches_prefix(parts, prefix) for prefix in SKIP_PATH_PREFIXES):
182
+ return True
183
  # Skip any component whose name looks like a secret file/dir.
184
  return any(_name_is_secret(part) for part in parts)
185
 
 
194
  continue
195
  target = dst / rel
196
  if p.is_dir():
197
+ # Keep parent directories for files copied later in this snapshot.
198
+ # Empty folders are intentionally not represented in the git-backed
199
+ # HF Dataset; once a folder contains syncable files, those files
200
+ # carry the folder path naturally.
201
  target.mkdir(parents=True, exist_ok=True)
202
  elif p.is_file():
203
  # BUG FIX #5: Skip files that exceed the size limit.
 
206
  protected_large_files.add(rel.as_posix())
207
  continue
208
  except OSError:
209
+ had_copy_failures = True
210
+ protected_large_files.add(rel.as_posix())
211
  continue
212
  target.parent.mkdir(parents=True, exist_ok=True)
213
  try:
214
  shutil.copy2(p, target)
215
  except OSError:
216
  had_copy_failures = True
217
+ protected_large_files.add(rel.as_posix())
218
  return had_copy_failures, protected_large_files
219
 
220
  def is_jupyter_running(port: int = 8888) -> bool:
 
243
  snapshot_download(repo_id=rid, repo_type="dataset", local_dir=str(tmp), local_dir_use_symlinks=False, token=HF_TOKEN)
244
  for p in tmp.rglob("*"):
245
  rel = p.relative_to(tmp)
 
 
246
  if str(rel) == ".gitattributes":
247
  continue
248
+ if should_skip(rel):
249
+ continue
250
  target = JUPYTER_ROOT / rel
251
  if p.is_dir():
252
  target.mkdir(parents=True, exist_ok=True)
openclaw-sync.py CHANGED
@@ -126,13 +126,96 @@ def count_files(path: Path) -> int:
126
  return sum(1 for child in path.rglob("*") if child.is_file())
127
 
128
 
129
- def copy_state_entry_with_retry(source_path: Path, backup_path: Path, attempts: int = 3) -> None:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  """Copy one top-level .openclaw entry with short retries for hot files/dirs.
131
 
132
  The state staging dir is seeded from the last known-good backup before this
133
- function runs. Never delete that seeded entry until a fresh copy has fully
134
  succeeded; hot session files can change mid-copy, and a failed copy must not
135
  turn into a deletion that later prunes valid remote session data.
 
 
 
136
  """
137
  last_exc: Exception | None = None
138
  parent = backup_path.parent
@@ -140,37 +223,33 @@ def copy_state_entry_with_retry(source_path: Path, backup_path: Path, attempts:
140
  tmp_path = parent / f".{backup_path.name}.snapshot-tmp-{os.getpid()}-{attempt}"
141
  old_path = parent / f".{backup_path.name}.snapshot-old-{os.getpid()}-{attempt}"
142
  for cleanup_path in (tmp_path, old_path):
143
- if cleanup_path.exists():
144
- if cleanup_path.is_dir():
145
- shutil.rmtree(cleanup_path, ignore_errors=True)
146
- else:
147
- cleanup_path.unlink(missing_ok=True)
148
  try:
 
149
  if source_path.is_dir():
150
- shutil.copytree(source_path, tmp_path)
 
 
 
 
151
  elif source_path.is_file():
152
  tmp_path.parent.mkdir(parents=True, exist_ok=True)
153
  shutil.copy2(source_path, tmp_path)
154
  else:
155
- return
156
 
157
- if backup_path.exists():
158
  backup_path.rename(old_path)
159
  tmp_path.rename(backup_path)
160
- if old_path.exists():
161
- if old_path.is_dir():
162
- shutil.rmtree(old_path, ignore_errors=True)
163
- else:
164
- old_path.unlink(missing_ok=True)
165
- return
166
  except Exception as exc:
167
  last_exc = exc
168
- if tmp_path.exists():
169
- if tmp_path.is_dir():
170
- shutil.rmtree(tmp_path, ignore_errors=True)
171
- else:
172
- tmp_path.unlink(missing_ok=True)
173
- if old_path.exists() and not backup_path.exists():
174
  old_path.rename(backup_path)
175
  if attempt < attempts:
176
  time.sleep(0.2 * attempt)
@@ -200,7 +279,13 @@ def snapshot_state_into_workspace() -> bool:
200
 
201
  backup_path = staging_dir / source_path.name
202
  try:
203
- copy_state_entry_with_retry(source_path, backup_path)
 
 
 
 
 
 
204
  copied_entry_names.add(source_path.name)
205
  except Exception as entry_exc:
206
  skipped_entries.append((source_path.name, entry_exc))
 
126
  return sum(1 for child in path.rglob("*") if child.is_file())
127
 
128
 
129
+ def _remove_path(path: Path) -> None:
130
+ if path.is_dir() and not path.is_symlink():
131
+ shutil.rmtree(path, ignore_errors=True)
132
+ else:
133
+ path.unlink(missing_ok=True)
134
+
135
+
136
+ def _copy_hot_directory_snapshot(source_path: Path, tmp_path: Path) -> bool:
137
+ """Best-effort mirror of a hot directory into *tmp_path*.
138
+
139
+ OpenClaw session files under .openclaw/agents can be rewritten or removed
140
+ while the sync process is copying them. A single raced file must not make the
141
+ whole agents tree keep its old backup forever, so copy files independently
142
+ and preserve the previous staged version only for paths that failed while
143
+ still existing in the source tree.
144
+ """
145
+ had_copy_failures = False
146
+ protected_rel_paths: set[Path] = set()
147
+
148
+ try:
149
+ discovered_paths = sorted(
150
+ source_path.rglob("*"),
151
+ key=lambda child: child.relative_to(source_path).as_posix(),
152
+ )
153
+ except OSError:
154
+ discovered_paths = []
155
+ had_copy_failures = True
156
+
157
+ for source_child in discovered_paths:
158
+ try:
159
+ rel = source_child.relative_to(source_path)
160
+ except ValueError:
161
+ continue
162
+ target_child = tmp_path / rel
163
+
164
+ try:
165
+ if source_child.is_symlink():
166
+ link_target = os.readlink(source_child)
167
+ target_child.parent.mkdir(parents=True, exist_ok=True)
168
+ if target_child.exists() or target_child.is_symlink():
169
+ _remove_path(target_child)
170
+ os.symlink(link_target, target_child)
171
+ elif source_child.is_dir():
172
+ if target_child.exists() and not target_child.is_dir():
173
+ _remove_path(target_child)
174
+ target_child.mkdir(parents=True, exist_ok=True)
175
+ elif source_child.is_file():
176
+ target_child.parent.mkdir(parents=True, exist_ok=True)
177
+ tmp_file = target_child.parent / f".{target_child.name}.copy-tmp-{os.getpid()}"
178
+ try:
179
+ shutil.copy2(source_child, tmp_file)
180
+ if target_child.is_dir() and not target_child.is_symlink():
181
+ _remove_path(target_child)
182
+ tmp_file.replace(target_child)
183
+ finally:
184
+ tmp_file.unlink(missing_ok=True)
185
+ except OSError:
186
+ had_copy_failures = True
187
+ # If the file/dir still exists, treat this as a hot-file race and
188
+ # keep the previously seeded backup for this exact path. If it no
189
+ # longer exists, the stale-prune pass below can remove it.
190
+ if source_child.exists() or source_child.is_symlink():
191
+ protected_rel_paths.add(rel)
192
+ continue
193
+
194
+ for staged_child in sorted(tmp_path.rglob("*"), key=lambda child: len(child.parts), reverse=True):
195
+ try:
196
+ rel = staged_child.relative_to(tmp_path)
197
+ except ValueError:
198
+ continue
199
+ if rel in protected_rel_paths:
200
+ continue
201
+ source_child = source_path / rel
202
+ if source_child.exists() or source_child.is_symlink():
203
+ continue
204
+ _remove_path(staged_child)
205
+
206
+ return had_copy_failures
207
+
208
+
209
+ def copy_state_entry_with_retry(source_path: Path, backup_path: Path, attempts: int = 3) -> bool:
210
  """Copy one top-level .openclaw entry with short retries for hot files/dirs.
211
 
212
  The state staging dir is seeded from the last known-good backup before this
213
+ function runs. Never delete that seeded entry until a fresh copy has fully
214
  succeeded; hot session files can change mid-copy, and a failed copy must not
215
  turn into a deletion that later prunes valid remote session data.
216
+
217
+ Returns True when an individual hot-file copy had to be skipped while the
218
+ rest of the entry was still refreshed.
219
  """
220
  last_exc: Exception | None = None
221
  parent = backup_path.parent
 
223
  tmp_path = parent / f".{backup_path.name}.snapshot-tmp-{os.getpid()}-{attempt}"
224
  old_path = parent / f".{backup_path.name}.snapshot-old-{os.getpid()}-{attempt}"
225
  for cleanup_path in (tmp_path, old_path):
226
+ if cleanup_path.exists() or cleanup_path.is_symlink():
227
+ _remove_path(cleanup_path)
 
 
 
228
  try:
229
+ entry_had_copy_failures = False
230
  if source_path.is_dir():
231
+ if backup_path.is_dir() and not backup_path.is_symlink():
232
+ shutil.copytree(backup_path, tmp_path, symlinks=True)
233
+ else:
234
+ tmp_path.mkdir(parents=True, exist_ok=True)
235
+ entry_had_copy_failures = _copy_hot_directory_snapshot(source_path, tmp_path)
236
  elif source_path.is_file():
237
  tmp_path.parent.mkdir(parents=True, exist_ok=True)
238
  shutil.copy2(source_path, tmp_path)
239
  else:
240
+ return False
241
 
242
+ if backup_path.exists() or backup_path.is_symlink():
243
  backup_path.rename(old_path)
244
  tmp_path.rename(backup_path)
245
+ if old_path.exists() or old_path.is_symlink():
246
+ _remove_path(old_path)
247
+ return entry_had_copy_failures
 
 
 
248
  except Exception as exc:
249
  last_exc = exc
250
+ if tmp_path.exists() or tmp_path.is_symlink():
251
+ _remove_path(tmp_path)
252
+ if (old_path.exists() or old_path.is_symlink()) and not backup_path.exists():
 
 
 
253
  old_path.rename(backup_path)
254
  if attempt < attempts:
255
  time.sleep(0.2 * attempt)
 
279
 
280
  backup_path = staging_dir / source_path.name
281
  try:
282
+ entry_had_copy_failures = copy_state_entry_with_retry(source_path, backup_path)
283
+ if entry_had_copy_failures:
284
+ had_copy_failures = True
285
+ print(
286
+ f"Warning: refreshed {source_path.name} with hot-file skips; "
287
+ "previous backup versions preserved for skipped paths."
288
+ )
289
  copied_entry_names.add(source_path.name)
290
  except Exception as entry_exc:
291
  skipped_entries.append((source_path.name, entry_exc))
start.sh CHANGED
@@ -1631,6 +1631,23 @@ export NPM_CONFIG_PREFIX="${NPM_CONFIG_PREFIX:-/home/node/.local}"
1631
  export npm_config_prefix="$NPM_CONFIG_PREFIX"
1632
  export PYTHONUSERBASE="${PYTHONUSERBASE:-/home/node/.local}"
1633
  export DEBIAN_FRONTEND="${DEBIAN_FRONTEND:-noninteractive}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1634
  if [ -z "${PS1:-}" ] || [ "$PS1" = "$ " ]; then
1635
  export PS1="\u@\h:\w\$ "
1636
  fi
 
1631
  export npm_config_prefix="$NPM_CONFIG_PREFIX"
1632
  export PYTHONUSERBASE="${PYTHONUSERBASE:-/home/node/.local}"
1633
  export DEBIAN_FRONTEND="${DEBIAN_FRONTEND:-noninteractive}"
1634
+ export HISTFILE="${HISTFILE:-/home/node/.bash_history}"
1635
+ export HISTSIZE="${HISTSIZE:-50000}"
1636
+ export HISTFILESIZE="${HISTFILESIZE:-100000}"
1637
+ mkdir -p "$(dirname "$HISTFILE")"
1638
+ touch "$HISTFILE" 2>/dev/null || true
1639
+ chmod 600 "$HISTFILE" 2>/dev/null || true
1640
+ shopt -s histappend 2>/dev/null || true
1641
+ _hc_history_sync_prompt() {
1642
+ history -a
1643
+ history -n
1644
+ }
1645
+ case ";${PROMPT_COMMAND:-};" in
1646
+ *";_hc_history_sync_prompt;"*) ;;
1647
+ ""|";") PROMPT_COMMAND="_hc_history_sync_prompt" ;;
1648
+ *) PROMPT_COMMAND="_hc_history_sync_prompt; ${PROMPT_COMMAND}" ;;
1649
+ esac
1650
+ export PROMPT_COMMAND
1651
  if [ -z "${PS1:-}" ] || [ "$PS1" = "$ " ]; then
1652
  export PS1="\u@\h:\w\$ "
1653
  fi