Anurag committed on
Commit
ce96c5b
·
1 Parent(s): e1afa1b

Refresh verified NVIDIA model catalogs

Browse files
Files changed (6) hide show
  1. .env.example +34 -2
  2. README.md +6 -0
  3. env-builder.js +14 -4
  4. jupyter-devdata-sync.py +46 -5
  5. openclaw-sync.py +275 -77
  6. start.sh +87 -48
.env.example CHANGED
@@ -105,6 +105,10 @@ LLM_API_KEY=your_api_key_here
105
  #
106
  # NVIDIA (NVIDIA_API_KEY):
107
  # - nvidia/nemotron-3-super-120b-a12b
 
 
 
 
108
  #
109
  # Groq (GROQ_API_KEY):
110
  # - groq/mixtral-8x7b-32768
@@ -272,8 +276,10 @@ LLM_API_KEY_FALLBACK_ENABLED=true
272
  #
273
  # Optional: explicitly pin model lists per provider for Control UI visibility
274
  # when provider keys are configured.
275
- # Format: comma-separated model IDs
276
- # NVIDIA_MODELS=meta/llama-3.1-70b-instruct,nvidia/llama-3.1-nemotron-70b-instruct
 
 
277
  # OPENAI_MODELS=gpt-4o-mini,gpt-4.1
278
  # GROQ_MODELS=llama-3.3-70b-versatile,deepseek-r1-distill-llama-70b
279
  # independently and in parallel.
@@ -395,6 +401,32 @@ OPENCLAW_CONFIG_SETTLE_SECONDS=3
395
  # Minimum gap between sessions-triggered immediate syncs (seconds). Default: 30.
396
  SESSIONS_MIN_SYNC_GAP=30
397
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
  # Webhooks: Standard POST notifications for lifecycle events
399
  # WEBHOOK_URL=https://your-webhook-endpoint.com/log
400
 
 
105
  #
106
  # NVIDIA (NVIDIA_API_KEY):
107
  # - nvidia/nemotron-3-super-120b-a12b
108
+ # - nvidia/deepseek-ai/deepseek-v4-flash
109
+ # - nvidia/qwen/qwen3-coder-480b-a35b-instruct
110
+ # - nvidia/moonshotai/kimi-k2.6
111
+ # - nvidia/stepfun-ai/step-3.7-flash
112
  #
113
  # Groq (GROQ_API_KEY):
114
  # - groq/mixtral-8x7b-32768
 
276
  #
277
  # Optional: explicitly pin model lists per provider for Control UI visibility
278
  # when provider keys are configured.
279
+ # Format: comma-separated provider-local model IDs. For NVIDIA, keep NVIDIA-owned
280
+ # API IDs as nvidia/... and third-party hosted IDs as their API owner/id; do not
281
+ # add an extra outer OpenClaw provider prefix in NVIDIA_MODELS.
282
+ # NVIDIA_MODELS=nvidia/nemotron-3-super-120b-a12b,deepseek-ai/deepseek-v4-flash,qwen/qwen3-coder-480b-a35b-instruct,moonshotai/kimi-k2.6,stepfun-ai/step-3.7-flash
283
  # OPENAI_MODELS=gpt-4o-mini,gpt-4.1
284
  # GROQ_MODELS=llama-3.3-70b-versatile,deepseek-r1-distill-llama-70b
285
  # independently and in parallel.
 
401
  # Minimum gap between sessions-triggered immediate syncs (seconds). Default: 30.
402
  SESSIONS_MIN_SYNC_GAP=30
403
 
404
+ # Max seconds a one-off sync waits for another sync process to release its lock.
405
+ # Startup background sync still keeps retrying; this mainly prevents shutdown or
406
+ # gateway-restart saves from hanging forever behind a stuck upload. Default: 20.
407
+ SYNC_LOCK_TIMEOUT=20
408
+
409
+ # Max seconds one HF upload call may run before the sync process marks it failed,
410
+ # releases the lock, and retries on the next pass. This prevents Backup from
411
+ # staying stuck at SYNCING forever. Set to 0 to disable this watchdog. Default: 180.
412
+ SYNC_UPLOAD_TIMEOUT=180
413
+
414
+ # Upload implementation. Default `folder` uses a normal commit upload and is
415
+ # safest for HuggingClaw's small/medium snapshots. Set `large_folder` only if you
416
+ # explicitly need Hugging Face's resumable large-folder uploader.
417
+ SYNC_UPLOAD_STRATEGY=folder
418
+
419
+ # Shutdown/restart one-shot sync upload budgets. These are intentionally much
420
+ # larger than SYNC_LOCK_TIMEOUT so 40-50 MB files are not killed mid-upload just
421
+ # because another sync lock check needed to be bounded. Set to 0 to disable the
422
+ # outer command timeout completely. Defaults: 120 seconds each.
423
+ SYNC_SETTLED_TIMEOUT=120
424
+ SYNC_FINAL_TIMEOUT=120
425
+
426
+ # Lock wait used only by shutdown/restart one-shot syncs. Keep this short
427
+ # because start.sh stops the background sync loop before these commands run.
428
+ SYNC_ONE_SHOT_LOCK_TIMEOUT=5
429
+
430
  # Webhooks: Standard POST notifications for lifecycle events
431
  # WEBHOOK_URL=https://your-webhook-endpoint.com/log
432
 
README.md CHANGED
@@ -188,6 +188,12 @@ HuggingClaw automatically syncs your workspace (chats, settings, sessions) to a
188
  | `OPENCLAW_CONFIG_WATCH_INTERVAL` | `1` | How often to check `openclaw.json` for immediate settings sync |
189
  | `OPENCLAW_CONFIG_SETTLE_SECONDS` | `3` | How long `openclaw.json` must stay valid and unchanged before syncing |
190
  | `SESSIONS_MIN_SYNC_GAP` | `30` | Minimum seconds between session-triggered immediate syncs |
 
 
 
 
 
 
191
 
192
  ## 📦 Ephemeral Package Re-install *(Optional)*
193
 
 
188
  | `OPENCLAW_CONFIG_WATCH_INTERVAL` | `1` | How often to check `openclaw.json` for immediate settings sync |
189
  | `OPENCLAW_CONFIG_SETTLE_SECONDS` | `3` | How long `openclaw.json` must stay valid and unchanged before syncing |
190
  | `SESSIONS_MIN_SYNC_GAP` | `30` | Minimum seconds between session-triggered immediate syncs |
191
+ | `SYNC_LOCK_TIMEOUT` | `20` | Max seconds one-off syncs wait for another sync lock before failing clearly |
192
+ | `SYNC_UPLOAD_TIMEOUT` | `180` | Max seconds one HF upload call can stay active before failing and retrying next pass; set `0` to disable |
193
+ | `SYNC_UPLOAD_STRATEGY` | `folder` | Upload method: `folder` for normal commit uploads, `large_folder` for HF resumable large-folder uploader |
194
+ | `SYNC_SETTLED_TIMEOUT` | `120` | Shutdown/restart settled-sync upload budget; set `0` to disable the outer timeout |
195
+ | `SYNC_FINAL_TIMEOUT` | `120` | Shutdown/restart final catch-up sync upload budget; set `0` to disable the outer timeout |
196
+ | `SYNC_ONE_SHOT_LOCK_TIMEOUT` | `5` | Short lock wait for shutdown/restart one-shot syncs after the background loop is stopped |
197
 
198
  ## 📦 Ephemeral Package Re-install *(Optional)*
199
 
env-builder.js CHANGED
@@ -141,8 +141,13 @@ const MODEL_CATALOGS = {
141
  ],
142
  "NVIDIA": [
143
  "nvidia/nemotron-3-super-120b-a12b",
144
- "nvidia/nemotron-4-340b-instruct",
145
- "nvidia/llama-3.1-nemotron-70b-instruct"
 
 
 
 
 
146
  ],
147
  "KiloCode": [
148
  "kilocode/anthropic/claude-opus-4.7",
@@ -375,8 +380,13 @@ const MODEL_CATALOGS = {
375
  ],
376
  "NVIDIA_MODELS": [
377
  "nvidia/nemotron-3-super-120b-a12b",
378
- "nvidia/nemotron-4-340b-instruct",
379
- "nvidia/llama-3.1-nemotron-70b-instruct"
 
 
 
 
 
380
  ],
381
  "KILOCODE_MODELS": [
382
  "kilocode/anthropic/claude-opus-4.7",
 
141
  ],
142
  "NVIDIA": [
143
  "nvidia/nemotron-3-super-120b-a12b",
144
+ "nvidia/deepseek-ai/deepseek-v4-flash",
145
+ "nvidia/qwen/qwen3-coder-480b-a35b-instruct",
146
+ "nvidia/moonshotai/kimi-k2.6",
147
+ "nvidia/minimaxai/minimax-m2.7",
148
+ "nvidia/openai/gpt-oss-120b",
149
+ "nvidia/z-ai/glm5.1",
150
+ "nvidia/stepfun-ai/step-3.7-flash"
151
  ],
152
  "KiloCode": [
153
  "kilocode/anthropic/claude-opus-4.7",
 
380
  ],
381
  "NVIDIA_MODELS": [
382
  "nvidia/nemotron-3-super-120b-a12b",
383
+ "deepseek-ai/deepseek-v4-flash",
384
+ "qwen/qwen3-coder-480b-a35b-instruct",
385
+ "moonshotai/kimi-k2.6",
386
+ "minimaxai/minimax-m2.7",
387
+ "openai/gpt-oss-120b",
388
+ "z-ai/glm5.1",
389
+ "stepfun-ai/step-3.7-flash"
390
  ],
391
  "KILOCODE_MODELS": [
392
  "kilocode/anthropic/claude-opus-4.7",
jupyter-devdata-sync.py CHANGED
@@ -167,6 +167,10 @@ def _matches_prefix(parts: tuple[str, ...], prefix: tuple[str, ...]) -> bool:
167
  return len(parts) >= len(prefix) and parts[:len(prefix)] == prefix
168
 
169
 
 
 
 
 
170
  def should_skip(p: Path):
171
  # Reserved sync helper files are not user data. Old datasets may still have
172
  # these markers, but fresh snapshots no longer create them.
@@ -188,15 +192,52 @@ def should_skip(p: Path):
188
  # Skip any component whose name looks like a secret file/dir.
189
  return any(_name_is_secret(part) for part in parts)
190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  def snapshot(src: Path, dst: Path) -> tuple[bool, set[str]]:
192
  had_copy_failures = False
193
  protected_large_files: set[str] = set()
194
- for p in src.rglob("*"):
195
  rel = p.relative_to(src)
196
- if should_skip(rel):
197
- continue
198
- if p.is_symlink():
199
- continue
200
  target = dst / rel
201
  if p.is_dir():
202
  # Keep parent directories for files copied later in this snapshot.
 
167
  return len(parts) >= len(prefix) and parts[:len(prefix)] == prefix
168
 
169
 
170
+ def _is_prefix_of(parts: tuple[str, ...], full_path: tuple[str, ...]) -> bool:
171
+ return len(parts) <= len(full_path) and full_path[:len(parts)] == parts
172
+
173
+
174
  def should_skip(p: Path):
175
  # Reserved sync helper files are not user data. Old datasets may still have
176
  # these markers, but fresh snapshots no longer create them.
 
192
  # Skip any component whose name looks like a secret file/dir.
193
  return any(_name_is_secret(part) for part in parts)
194
 
195
+
196
+ def iter_sync_tree(root: Path):
197
+ """Yield syncable DevData paths without descending into excluded trees."""
198
+ if not root.exists():
199
+ return
200
+
201
+ for dirpath, dirnames, filenames in os.walk(root):
202
+ dir_path = Path(dirpath)
203
+ try:
204
+ dir_rel = dir_path.relative_to(root)
205
+ except ValueError:
206
+ dir_rel = Path()
207
+
208
+ kept_dirnames: list[str] = []
209
+ for dirname in sorted(dirnames):
210
+ rel = dir_rel / dirname
211
+ child = dir_path / dirname
212
+ rel_parts = rel.parts
213
+ # Do not prune ancestors of explicitly allowed Jupyter settings
214
+ # paths. should_skip(.local/share/jupyter) is true by design for
215
+ # files under that tree, but we must still descend through the
216
+ # parent dirs to reach lab/user-settings and lab/workspaces.
217
+ allowed_ancestor = any(
218
+ _is_prefix_of(rel_parts, prefix) for prefix in JUPYTER_DATA_ALLOW_PREFIXES
219
+ )
220
+ if child.is_symlink() or (should_skip(rel) and not allowed_ancestor):
221
+ continue
222
+ kept_dirnames.append(dirname)
223
+ dirnames[:] = kept_dirnames
224
+
225
+ for dirname in kept_dirnames:
226
+ yield dir_path / dirname
227
+
228
+ for filename in sorted(filenames):
229
+ rel = dir_rel / filename
230
+ child = dir_path / filename
231
+ if child.is_symlink() or should_skip(rel):
232
+ continue
233
+ yield child
234
+
235
+
236
  def snapshot(src: Path, dst: Path) -> tuple[bool, set[str]]:
237
  had_copy_failures = False
238
  protected_large_files: set[str] = set()
239
+ for p in iter_sync_tree(src):
240
  rel = p.relative_to(src)
 
 
 
 
241
  target = dst / rel
242
  if p.is_dir():
243
  # Keep parent directories for files copied later in this snapshot.
openclaw-sync.py CHANGED
@@ -51,6 +51,9 @@ CONFIG_SETTLE_SECONDS = max(
51
  float(os.environ.get("OPENCLAW_CONFIG_SETTLE_SECONDS", "3")),
52
  )
53
  SESSIONS_MIN_SYNC_GAP = int(os.environ.get("SESSIONS_MIN_SYNC_GAP", "30"))
 
 
 
54
  HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
55
  HF_USERNAME = os.environ.get("HF_USERNAME", "").strip()
56
  SPACE_AUTHOR_NAME = os.environ.get("SPACE_AUTHOR_NAME", "").strip()
@@ -88,6 +91,22 @@ EXCLUDED_STATE_NAMES = {
88
  # "WhatsApp enabled but not installing on restart".
89
  "extensions",
90
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  SESSIONS_ROOT = OPENCLAW_HOME / "agents"
92
  WHATSAPP_CREDS_DIR = OPENCLAW_HOME / "credentials" / "whatsapp" / "default"
93
  WHATSAPP_BACKUP_DIR = STATE_DIR / "credentials" / "whatsapp" / "default"
@@ -95,11 +114,29 @@ RESET_MARKER = WORKSPACE / ".reset_credentials"
95
  HF_API = HfApi(token=HF_TOKEN) if HF_TOKEN else None
96
  STOP_EVENT = threading.Event()
97
  _REPO_ID_CACHE: str | None = None
98
- _SESSIONS_FILE_DIGEST_CACHE: dict[str, tuple[int, int, int, str]] = {}
99
  WorkspaceMarker: TypeAlias = tuple[int, int, int, str]
 
100
  # Set True when prune fails so the next sync pass bypasses the fingerprint
101
  # early-exit and retries the prune even if no local files changed.
102
  _prune_needed: bool = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
 
105
  def write_status(status: str, message: str) -> None:
@@ -133,6 +170,86 @@ def _remove_path(path: Path) -> None:
133
  path.unlink(missing_ok=True)
134
 
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  def _copy_hot_directory_snapshot(source_path: Path, tmp_path: Path) -> bool:
137
  """Best-effort mirror of a hot directory into *tmp_path*.
138
 
@@ -259,6 +376,7 @@ def copy_state_entry_with_retry(source_path: Path, backup_path: Path, attempts:
259
  def snapshot_state_into_workspace() -> bool:
260
  had_copy_failures = False
261
  try:
 
262
  STATE_DIR.mkdir(parents=True, exist_ok=True)
263
  # Atomic snapshot: copy to a staging dir first, then rename.
264
  # This prevents a half-written (or empty) backup if we crash mid-copy,
@@ -274,7 +392,7 @@ def snapshot_state_into_workspace() -> bool:
274
  skipped_entries: list[tuple[str, Exception]] = []
275
  copied_entry_names: set[str] = set()
276
  for source_path in OPENCLAW_HOME.iterdir():
277
- if source_path.name in EXCLUDED_STATE_NAMES:
278
  continue
279
 
280
  backup_path = staging_dir / source_path.name
@@ -300,7 +418,9 @@ def snapshot_state_into_workspace() -> bool:
300
  # dataset on the next sync. Only remove an entry from staging when the
301
  # source has genuinely been deleted from OPENCLAW_HOME.
302
  for staged_path in list(staging_dir.iterdir()):
303
- if staged_path.name in EXCLUDED_STATE_NAMES:
 
 
304
  continue
305
  if staged_path.name in copied_entry_names:
306
  continue
@@ -425,13 +545,8 @@ def locally_existing_large_files(root: Path) -> set[str]:
425
  protected: set[str] = set()
426
  if not root.exists():
427
  return protected
428
- for path in root.rglob("*"):
429
- if not path.is_file():
430
- continue
431
  rel = path.relative_to(root).as_posix()
432
- parts = Path(rel).parts
433
- if any(part in EXCLUDED_SYNC_DIRS for part in parts):
434
- continue
435
  try:
436
  if path.stat().st_size > MAX_FILE_SIZE_BYTES:
437
  protected.add(rel)
@@ -441,6 +556,7 @@ def locally_existing_large_files(root: Path) -> set[str]:
441
 
442
 
443
  def restore_embedded_state() -> None:
 
444
  state_backup_root = STATE_DIR / "openclaw"
445
 
446
  # Migration fix: old backups stored state in ".huggingclaw-state/openclaw"
@@ -462,7 +578,7 @@ def restore_embedded_state() -> None:
462
  if state_backup_root.is_dir():
463
  for source_path in state_backup_root.iterdir():
464
  name = source_path.name
465
- if name in EXCLUDED_STATE_NAMES:
466
  if source_path.is_dir():
467
  shutil.rmtree(source_path, ignore_errors=True)
468
  else:
@@ -527,7 +643,7 @@ def ensure_repo_exists() -> str:
527
 
528
  def _should_exclude(rel_posix: str, path: Path) -> bool:
529
  parts = Path(rel_posix).parts
530
- if any(part in EXCLUDED_SYNC_DIRS for part in parts):
531
  return True
532
  if path.is_file():
533
  try:
@@ -538,16 +654,28 @@ def _should_exclude(rel_posix: str, path: Path) -> bool:
538
  return False
539
 
540
 
541
- def file_marker(path: Path) -> tuple[int, int, int]:
542
  try:
543
  stat = path.stat()
544
  except OSError:
545
- return (0, 0, 0)
546
 
547
  if not path.is_file():
548
- return (0, 0, 0)
549
 
550
- return (1, int(stat.st_size), int(stat.st_mtime_ns))
 
 
 
 
 
 
 
 
 
 
 
 
551
 
552
 
553
  def metadata_marker(root: Path) -> WorkspaceMarker:
@@ -558,17 +686,12 @@ def metadata_marker(root: Path) -> WorkspaceMarker:
558
  total_size = 0
559
  newest_mtime = 0
560
  metadata_hasher = hashlib.sha256()
561
- for path in sorted(root.rglob("*")):
562
- if not path.is_file():
563
- continue
564
  rel = path.relative_to(root).as_posix()
565
  # BUG FIX: use directory-only exclusion here (not size-based) so that
566
  # deleting a large file changes the marker and triggers a sync/prune
567
  # pass. Large files are still excluded from the upload snapshot via
568
  # _should_exclude; this only controls change-detection.
569
- parts = Path(rel).parts
570
- if any(part in EXCLUDED_SYNC_DIRS for part in parts):
571
- continue
572
  try:
573
  stat = path.stat()
574
  except OSError:
@@ -576,14 +699,17 @@ def metadata_marker(root: Path) -> WorkspaceMarker:
576
  file_count += 1
577
  size = int(stat.st_size)
578
  mtime_ns = int(stat.st_mtime_ns)
 
579
  total_size += size
580
- newest_mtime = max(newest_mtime, mtime_ns)
581
  metadata_hasher.update(rel.encode("utf-8"))
582
  metadata_hasher.update(b"\0")
583
  metadata_hasher.update(str(size).encode("ascii"))
584
  metadata_hasher.update(b"\0")
585
  metadata_hasher.update(str(mtime_ns).encode("ascii"))
586
  metadata_hasher.update(b"\0")
 
 
587
  return (file_count, total_size, newest_mtime, metadata_hasher.hexdigest())
588
 
589
 
@@ -592,15 +718,12 @@ def fingerprint_dir(root: Path) -> str:
592
  if not root.exists():
593
  return hasher.hexdigest()
594
 
595
- for path in sorted(p for p in root.rglob("*") if p.is_file()):
596
  rel = path.relative_to(root).as_posix()
597
  # BUG FIX: use directory-only exclusion (not size-based) so that
598
  # creating or deleting a large file changes the fingerprint.
599
  # Large files are hashed by path + metadata only (no content read)
600
  # to avoid reading gigabytes for a change-detection hash.
601
- parts = Path(rel).parts
602
- if any(part in EXCLUDED_SYNC_DIRS for part in parts):
603
- continue
604
  hasher.update(rel.encode("utf-8"))
605
  try:
606
  stat = path.stat()
@@ -628,7 +751,7 @@ def fingerprint_dir(root: Path) -> str:
628
 
629
  def create_snapshot_dir(source_root: Path) -> Path:
630
  staging_root = Path(tempfile.mkdtemp(prefix="huggingclaw-sync-"))
631
- for path in sorted(source_root.rglob("*")):
632
  rel = path.relative_to(source_root)
633
  rel_posix = rel.as_posix()
634
  if _should_exclude(rel_posix, path):
@@ -648,6 +771,64 @@ def create_snapshot_dir(source_root: Path) -> Path:
648
  return staging_root
649
 
650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
651
  def prune_remote_deleted_files(
652
  repo_id: str,
653
  snapshot_dir: Path,
@@ -679,15 +860,19 @@ def prune_remote_deleted_files(
679
  and path not in protected_paths
680
  and not any(path == prefix or path.startswith(prefix + "/") for prefix in skip_prefixes)
681
  ]
682
- if not stale_files:
 
 
 
 
683
  return
684
 
685
- total = len(stale_files)
686
  num_batches = (total + PRUNE_BATCH_SIZE - 1) // PRUNE_BATCH_SIZE
687
  for batch_idx in range(num_batches):
688
- batch = stale_files[batch_idx * PRUNE_BATCH_SIZE:(batch_idx + 1) * PRUNE_BATCH_SIZE]
689
  batch_label = f"{batch_idx + 1}/{num_batches}" if num_batches > 1 else ""
690
- msg = f"Prune {len(batch)} stale file(s) after workspace sync"
691
  if batch_label:
692
  msg += f" (batch {batch_label})"
693
  operations = [CommitOperationDelete(path_in_repo=path) for path in batch]
@@ -698,7 +883,25 @@ def prune_remote_deleted_files(
698
  commit_message=msg,
699
  )
700
  if num_batches > 1:
701
- print(f"Pruned batch {batch_label}: {len(batch)} file(s)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
702
 
703
 
704
  def restore_workspace() -> bool:
@@ -795,10 +998,20 @@ def _sync_once_unlocked(
795
  write_status("disabled", "HF_TOKEN is not configured.")
796
  return (last_fingerprint or "", last_marker or (0, 0, 0, ""))
797
 
798
- global _prune_needed
799
 
800
  had_snapshot_copy_failures = snapshot_state_into_workspace()
801
  repo_id = ensure_repo_exists()
 
 
 
 
 
 
 
 
 
 
802
  current_marker = metadata_marker(WORKSPACE)
803
  # Session watcher uses content digests and can detect same-size rewrites
804
  # whose workspace metadata marker is unchanged. In that case, force the
@@ -817,25 +1030,11 @@ def _sync_once_unlocked(
817
  write_status("synced", "No workspace changes detected.")
818
  return (last_fingerprint, current_marker)
819
 
820
- write_status("syncing", f"Uploading workspace to {repo_id}")
 
821
  snapshot_dir = create_snapshot_dir(WORKSPACE)
822
  try:
823
- try:
824
- HF_API.upload_large_folder(
825
- repo_id=repo_id,
826
- repo_type="dataset",
827
- folder_path=str(snapshot_dir),
828
- num_workers=2,
829
- print_report=False,
830
- )
831
- except AttributeError:
832
- upload_folder(
833
- folder_path=str(snapshot_dir),
834
- repo_id=repo_id,
835
- repo_type="dataset",
836
- token=HF_TOKEN,
837
- commit_message=f"HuggingClaw sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
838
- )
839
  skip_prune_prefixes: set[str] = set()
840
  if had_snapshot_copy_failures:
841
  # BUG FIX: the old code added "huggingclaw-state/openclaw" to
@@ -879,7 +1078,18 @@ def sync_once(
879
  ) -> tuple[str, WorkspaceMarker]:
880
  SYNC_LOCK_FILE.parent.mkdir(parents=True, exist_ok=True)
881
  with SYNC_LOCK_FILE.open("w", encoding="utf-8") as lock_handle:
882
- fcntl.flock(lock_handle, fcntl.LOCK_EX)
 
 
 
 
 
 
 
 
 
 
 
883
  try:
884
  return _sync_once_unlocked(
885
  last_fingerprint,
@@ -920,9 +1130,6 @@ def sessions_marker() -> tuple[int, int, int, str]:
920
  newest_mtime = 0
921
  metadata_hasher = hashlib.sha256()
922
 
923
- global _SESSIONS_FILE_DIGEST_CACHE
924
- next_cache: dict[str, tuple[int, int, int, str]] = {}
925
-
926
  for profile_dir in sorted(SESSIONS_ROOT.iterdir()):
927
  if not profile_dir.is_dir():
928
  continue
@@ -943,7 +1150,6 @@ def sessions_marker() -> tuple[int, int, int, str]:
943
  size = int(stat.st_size)
944
  mtime_ns = int(stat.st_mtime_ns)
945
  ctime_ns = int(stat.st_ctime_ns)
946
- cache_key = f"{profile_dir.name}\0{rel}"
947
  digest.update(rel.encode("utf-8"))
948
  digest.update(b"\0")
949
  digest.update(str(size).encode("ascii"))
@@ -952,24 +1158,17 @@ def sessions_marker() -> tuple[int, int, int, str]:
952
  digest.update(b"\0")
953
  digest.update(str(ctime_ns).encode("ascii"))
954
  digest.update(b"\0")
955
- cached = _SESSIONS_FILE_DIGEST_CACHE.get(cache_key)
956
- if (
957
- cached is not None
958
- and cached[0] == size
959
- and cached[1] == mtime_ns
960
- and cached[2] == ctime_ns
961
- ):
962
- file_digest = cached[3]
963
- else:
964
- file_hasher = hashlib.sha256()
965
- try:
966
- with path.open("rb") as handle:
967
- for chunk in iter(lambda: handle.read(1024 * 1024), b""):
968
- file_hasher.update(chunk)
969
- except (FileNotFoundError, IsADirectoryError, NotADirectoryError):
970
- continue
971
- file_digest = file_hasher.hexdigest()
972
- next_cache[cache_key] = (size, mtime_ns, ctime_ns, file_digest)
973
  digest.update(file_digest.encode("ascii"))
974
  digest.update(b"\0")
975
 
@@ -981,11 +1180,10 @@ def sessions_marker() -> tuple[int, int, int, str]:
981
  metadata_hasher.update(digest.hexdigest().encode("ascii"))
982
  metadata_hasher.update(b"\0")
983
 
984
- _SESSIONS_FILE_DIGEST_CACHE = next_cache
985
  return (file_count, total_size, newest_mtime, metadata_hasher.hexdigest())
986
 
987
 
988
- def wait_for_config_settle(config_marker: tuple[int, int, int]) -> tuple[str, tuple[int, int, int]]:
989
  stable_since = time.monotonic()
990
  current_marker = config_marker
991
 
@@ -1008,9 +1206,9 @@ def wait_for_config_settle(config_marker: tuple[int, int, int]) -> tuple[str, tu
1008
 
1009
 
1010
  def wait_for_sync_trigger(
1011
- config_marker: tuple[int, int, int],
1012
  last_sessions_sync_time: float = 0.0,
1013
- ) -> tuple[str, tuple[int, int, int]]:
1014
  deadline = time.monotonic() + max(0, INTERVAL)
1015
  # BUG FIX: also watch sessions directory so new/updated sessions
1016
  # trigger an immediate sync instead of waiting the full interval.
 
51
  float(os.environ.get("OPENCLAW_CONFIG_SETTLE_SECONDS", "3")),
52
  )
53
  SESSIONS_MIN_SYNC_GAP = int(os.environ.get("SESSIONS_MIN_SYNC_GAP", "30"))
54
+ SYNC_LOCK_TIMEOUT = max(1.0, float(os.environ.get("SYNC_LOCK_TIMEOUT", "20")))
55
+ SYNC_UPLOAD_TIMEOUT = max(0.0, float(os.environ.get("SYNC_UPLOAD_TIMEOUT", "180")))
56
+ SYNC_UPLOAD_STRATEGY = os.environ.get("SYNC_UPLOAD_STRATEGY", "folder").strip().lower() or "folder"
57
  HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
58
  HF_USERNAME = os.environ.get("HF_USERNAME", "").strip()
59
  SPACE_AUTHOR_NAME = os.environ.get("SPACE_AUTHOR_NAME", "").strip()
 
91
  # "WhatsApp enabled but not installing on restart".
92
  "extensions",
93
  }
94
+ # Internal restore/snapshot working directories live beside the workspace under
95
+ # /home/node/.openclaw. If a previous container is killed mid-restore, these
96
+ # hidden directories can be left behind and may contain a full copy of the
97
+ # workspace, including huggingclaw-state/openclaw itself. Backing them up makes
98
+ # the dataset grow recursively under huggingclaw-state/openclaw and can keep the
99
+ # sync loop busy forever, which in turn blocks session deletes/updates from being
100
+ # pruned remotely. Treat every such helper path as disposable sync scratch.
101
+ INTERNAL_TEMP_STATE_NAMES = {
102
+ ".workspace-restore-staging",
103
+ ".workspace-restore-old",
104
+ ".openclaw-staging",
105
+ }
106
+ INTERNAL_TEMP_NAME_PREFIXES = (
107
+ ".workspace-restore-",
108
+ ".openclaw-staging",
109
+ )
110
  SESSIONS_ROOT = OPENCLAW_HOME / "agents"
111
  WHATSAPP_CREDS_DIR = OPENCLAW_HOME / "credentials" / "whatsapp" / "default"
112
  WHATSAPP_BACKUP_DIR = STATE_DIR / "credentials" / "whatsapp" / "default"
 
114
  HF_API = HfApi(token=HF_TOKEN) if HF_TOKEN else None
115
  STOP_EVENT = threading.Event()
116
  _REPO_ID_CACHE: str | None = None
 
117
  WorkspaceMarker: TypeAlias = tuple[int, int, int, str]
118
+ FileMarker: TypeAlias = tuple[int, int, int, int, str]
119
  # Set True when prune fails so the next sync pass bypasses the fingerprint
120
  # early-exit and retries the prune even if no local files changed.
121
  _prune_needed: bool = False
122
+ _remote_temp_prune_done: bool = False
123
+
124
+ # Workspace-relative temp paths managed by this sync script. Keep this narrow:
125
+ # user projects may legitimately contain folders with similar names, and those
126
+ # must keep syncing normally. Only the script-owned state/restore scratch
127
+ # locations are skipped and pruned.
128
+ INTERNAL_TEMP_WORKSPACE_PREFIXES: tuple[tuple[str, ...], ...] = (
129
+ (".workspace-restore-staging",),
130
+ (".workspace-restore-old",),
131
+ ("huggingclaw-state", ".openclaw-staging"),
132
+ ("huggingclaw-state", "openclaw", ".workspace-restore-staging"),
133
+ ("huggingclaw-state", "openclaw", ".workspace-restore-old"),
134
+ ("huggingclaw-state", "openclaw", ".openclaw-staging"),
135
+ )
136
+
137
+
138
+ class SyncUploadTimeoutError(TimeoutError):
139
+ pass
140
 
141
 
142
  def write_status(status: str, message: str) -> None:
 
170
  path.unlink(missing_ok=True)
171
 
172
 
173
+ def _is_internal_temp_name(name: str) -> bool:
174
+ return name in INTERNAL_TEMP_STATE_NAMES or any(
175
+ name.startswith(prefix) for prefix in INTERNAL_TEMP_NAME_PREFIXES
176
+ )
177
+
178
+
179
+ def _has_internal_temp_part(path: str) -> bool:
180
+ parts = Path(path).parts
181
+ return any(_matches_prefix(parts, prefix) for prefix in INTERNAL_TEMP_WORKSPACE_PREFIXES)
182
+
183
+
184
+ def _matches_prefix(parts: tuple[str, ...], prefix: tuple[str, ...]) -> bool:
185
+ return len(parts) >= len(prefix) and parts[:len(prefix)] == prefix
186
+
187
+
188
+ def _should_skip_state_entry_name(name: str) -> bool:
189
+ return (
190
+ name in EXCLUDED_STATE_NAMES
191
+ or name in EXCLUDED_SYNC_DIRS
192
+ or _is_internal_temp_name(name)
193
+ )
194
+
195
+
196
+ def _should_skip_sync_path(rel_parts: tuple[str, ...]) -> bool:
197
+ return any(part in EXCLUDED_SYNC_DIRS for part in rel_parts) or any(
198
+ _matches_prefix(rel_parts, prefix) for prefix in INTERNAL_TEMP_WORKSPACE_PREFIXES
199
+ )
200
+
201
+
202
+ def _iter_sync_tree(root: Path):
203
+ """Yield syncable paths without descending into ignored/temp directories."""
204
+ if not root.exists():
205
+ return
206
+
207
+ for dirpath, dirnames, filenames in os.walk(root):
208
+ dir_path = Path(dirpath)
209
+ try:
210
+ dir_rel_parts = dir_path.relative_to(root).parts
211
+ except ValueError:
212
+ dir_rel_parts = ()
213
+ if dir_rel_parts == (".",):
214
+ dir_rel_parts = ()
215
+ dirnames[:] = sorted(
216
+ name for name in dirnames
217
+ if not _should_skip_sync_path(dir_rel_parts + (name,))
218
+ )
219
+
220
+ for dirname in dirnames:
221
+ yield dir_path / dirname
222
+ for filename in sorted(filenames):
223
+ if _should_skip_sync_path(dir_rel_parts + (filename,)):
224
+ continue
225
+ yield dir_path / filename
226
+
227
+
228
+ def _iter_sync_files(root: Path):
229
+ for path in _iter_sync_tree(root):
230
+ if path.is_file():
231
+ yield path
232
+
233
+
234
+ def cleanup_internal_temp_paths() -> None:
235
+ """Remove stale sync/restore scratch dirs that must never be backed up."""
236
+ for root in (OPENCLAW_HOME, STATE_DIR):
237
+ if not root.exists() or not root.is_dir():
238
+ continue
239
+ try:
240
+ children = list(root.iterdir())
241
+ except OSError:
242
+ continue
243
+ for child in children:
244
+ if not _is_internal_temp_name(child.name):
245
+ continue
246
+ try:
247
+ _remove_path(child)
248
+ print(f"Removed stale sync scratch path: {child}")
249
+ except OSError as exc:
250
+ print(f"Warning: could not remove stale sync scratch path {child}: {exc}")
251
+
252
+
253
  def _copy_hot_directory_snapshot(source_path: Path, tmp_path: Path) -> bool:
254
  """Best-effort mirror of a hot directory into *tmp_path*.
255
 
 
376
  def snapshot_state_into_workspace() -> bool:
377
  had_copy_failures = False
378
  try:
379
+ cleanup_internal_temp_paths()
380
  STATE_DIR.mkdir(parents=True, exist_ok=True)
381
  # Atomic snapshot: copy to a staging dir first, then rename.
382
  # This prevents a half-written (or empty) backup if we crash mid-copy,
 
392
  skipped_entries: list[tuple[str, Exception]] = []
393
  copied_entry_names: set[str] = set()
394
  for source_path in OPENCLAW_HOME.iterdir():
395
+ if _should_skip_state_entry_name(source_path.name):
396
  continue
397
 
398
  backup_path = staging_dir / source_path.name
 
418
  # dataset on the next sync. Only remove an entry from staging when the
419
  # source has genuinely been deleted from OPENCLAW_HOME.
420
  for staged_path in list(staging_dir.iterdir()):
421
+ if _should_skip_state_entry_name(staged_path.name):
422
+ if staged_path.exists() or staged_path.is_symlink():
423
+ _remove_path(staged_path)
424
  continue
425
  if staged_path.name in copied_entry_names:
426
  continue
 
545
  protected: set[str] = set()
546
  if not root.exists():
547
  return protected
548
+ for path in _iter_sync_files(root):
 
 
549
  rel = path.relative_to(root).as_posix()
 
 
 
550
  try:
551
  if path.stat().st_size > MAX_FILE_SIZE_BYTES:
552
  protected.add(rel)
 
556
 
557
 
558
  def restore_embedded_state() -> None:
559
+ cleanup_internal_temp_paths()
560
  state_backup_root = STATE_DIR / "openclaw"
561
 
562
  # Migration fix: old backups stored state in ".huggingclaw-state/openclaw"
 
578
  if state_backup_root.is_dir():
579
  for source_path in state_backup_root.iterdir():
580
  name = source_path.name
581
+ if _should_skip_state_entry_name(name):
582
  if source_path.is_dir():
583
  shutil.rmtree(source_path, ignore_errors=True)
584
  else:
 
643
 
644
  def _should_exclude(rel_posix: str, path: Path) -> bool:
645
  parts = Path(rel_posix).parts
646
+ if _should_skip_sync_path(parts):
647
  return True
648
  if path.is_file():
649
  try:
 
654
  return False
655
 
656
 
657
def file_marker(path: Path) -> FileMarker:
    """Build a change-detection marker for a single file.

    Returns a 5-tuple ``(present, size, mtime_ns, ctime_ns, digest)``.
    ``present`` is 1 for a readable-or-unreadable regular file and 0 when the
    path cannot be stat'ed or is not a regular file, in which case the marker
    is ``(0, 0, 0, 0, "")``. ``digest`` is the SHA-256 hex digest of the file
    content, or a unique ``unreadable:<ns>`` token when the content could not
    be read.
    """
    # Local import under an alias: the stat *result* below is what matters and
    # the module may not be imported at file level.
    import stat as stat_module

    absent = (0, 0, 0, 0, "")
    try:
        info = path.stat()
    except OSError:
        return absent

    # Decide "regular file?" from the stat result we already hold instead of
    # probing the filesystem a second time, so size/mtime/ctime and the file
    # type always describe the same inode state.
    if not stat_module.S_ISREG(info.st_mode):
        return absent

    try:
        hasher = hashlib.sha256()
        with path.open("rb") as handle:
            while True:
                chunk = handle.read(1024 * 1024)
                if not chunk:
                    break
                hasher.update(chunk)
        digest = hasher.hexdigest()
    except OSError:
        # If the file is being rewritten, include ctime/mtime/size and let the
        # next watch loop re-read it. Never pretend the marker is unchanged.
        digest = f"unreadable:{time.monotonic_ns()}"

    return (1, int(info.st_size), int(info.st_mtime_ns), int(info.st_ctime_ns), digest)
679
 
680
 
681
  def metadata_marker(root: Path) -> WorkspaceMarker:
 
686
  total_size = 0
687
  newest_mtime = 0
688
  metadata_hasher = hashlib.sha256()
689
+ for path in _iter_sync_files(root):
 
 
690
  rel = path.relative_to(root).as_posix()
691
  # BUG FIX: use directory-only exclusion here (not size-based) so that
692
  # deleting a large file changes the marker and triggers a sync/prune
693
  # pass. Large files are still excluded from the upload snapshot via
694
  # _should_exclude; this only controls change-detection.
 
 
 
695
  try:
696
  stat = path.stat()
697
  except OSError:
 
699
  file_count += 1
700
  size = int(stat.st_size)
701
  mtime_ns = int(stat.st_mtime_ns)
702
+ ctime_ns = int(stat.st_ctime_ns)
703
  total_size += size
704
+ newest_mtime = max(newest_mtime, mtime_ns, ctime_ns)
705
  metadata_hasher.update(rel.encode("utf-8"))
706
  metadata_hasher.update(b"\0")
707
  metadata_hasher.update(str(size).encode("ascii"))
708
  metadata_hasher.update(b"\0")
709
  metadata_hasher.update(str(mtime_ns).encode("ascii"))
710
  metadata_hasher.update(b"\0")
711
+ metadata_hasher.update(str(ctime_ns).encode("ascii"))
712
+ metadata_hasher.update(b"\0")
713
  return (file_count, total_size, newest_mtime, metadata_hasher.hexdigest())
714
 
715
 
 
718
  if not root.exists():
719
  return hasher.hexdigest()
720
 
721
+ for path in _iter_sync_files(root):
722
  rel = path.relative_to(root).as_posix()
723
  # BUG FIX: use directory-only exclusion (not size-based) so that
724
  # creating or deleting a large file changes the fingerprint.
725
  # Large files are hashed by path + metadata only (no content read)
726
  # to avoid reading gigabytes for a change-detection hash.
 
 
 
727
  hasher.update(rel.encode("utf-8"))
728
  try:
729
  stat = path.stat()
 
751
 
752
  def create_snapshot_dir(source_root: Path) -> Path:
753
  staging_root = Path(tempfile.mkdtemp(prefix="huggingclaw-sync-"))
754
+ for path in _iter_sync_tree(source_root):
755
  rel = path.relative_to(source_root)
756
  rel_posix = rel.as_posix()
757
  if _should_exclude(rel_posix, path):
 
771
  return staging_root
772
 
773
 
774
+ def _run_with_timeout(seconds: float, label: str, func):
775
+ """Run *func* with a SIGALRM watchdog so hung HF calls release the sync lock."""
776
+ if seconds <= 0 or not hasattr(signal, "SIGALRM"):
777
+ return func()
778
+
779
+ previous_handler = signal.getsignal(signal.SIGALRM)
780
+ try:
781
+ previous_timer = signal.setitimer(signal.ITIMER_REAL, 0)
782
+ except Exception:
783
+ previous_timer = (0.0, 0.0)
784
+
785
+ def _timeout_handler(_sig, _frame):
786
+ raise SyncUploadTimeoutError(f"{label} timed out after {seconds:.0f}s")
787
+
788
+ signal.signal(signal.SIGALRM, _timeout_handler)
789
+ signal.setitimer(signal.ITIMER_REAL, seconds)
790
+ try:
791
+ return func()
792
+ finally:
793
+ signal.setitimer(signal.ITIMER_REAL, 0)
794
+ signal.signal(signal.SIGALRM, previous_handler)
795
+ if previous_timer and previous_timer[0] > 0:
796
+ signal.setitimer(signal.ITIMER_REAL, previous_timer[0], previous_timer[1])
797
+
798
+
799
def upload_snapshot(repo_id: str, snapshot_dir: Path) -> None:
    """Upload *snapshot_dir* to the dataset repo *repo_id*.

    The upload path is chosen by ``SYNC_UPLOAD_STRATEGY`` (dashes are treated
    as underscores): ``large_folder`` uses ``HF_API.upload_large_folder`` when
    that method exists, every other accepted value uses ``upload_folder``.
    Unknown values fall back to ``upload_folder`` with a warning. The whole
    upload runs under ``_run_with_timeout`` with ``SYNC_UPLOAD_TIMEOUT``.
    """
    strategy = SYNC_UPLOAD_STRATEGY.replace("-", "_")
    timeout_label = "no timeout" if SYNC_UPLOAD_TIMEOUT <= 0 else f"{SYNC_UPLOAD_TIMEOUT:.0f}s timeout"
    if strategy not in {"folder", "upload_folder", "large_folder"}:
        print(f"Warning: unknown SYNC_UPLOAD_STRATEGY={SYNC_UPLOAD_STRATEGY!r}; using upload_folder.")
        strategy = "folder"

    def _upload() -> None:
        if strategy == "large_folder":
            # Feature-detect the method instead of catching AttributeError
            # around the call: an AttributeError raised *inside* a running
            # upload must surface as a failure, not trigger a second upload.
            large_upload = getattr(HF_API, "upload_large_folder", None)
            if callable(large_upload):
                large_upload(
                    repo_id=repo_id,
                    repo_type="dataset",
                    folder_path=str(snapshot_dir),
                    num_workers=2,
                    print_report=False,
                )
                return
            print("Warning: upload_large_folder unavailable; falling back to upload_folder.")

        upload_folder(
            folder_path=str(snapshot_dir),
            repo_id=repo_id,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=f"HuggingClaw sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
        )

    print(f"Workspace upload: strategy={strategy}, {timeout_label}")
    _run_with_timeout(SYNC_UPLOAD_TIMEOUT, "Workspace upload", _upload)
830
+
831
+
832
  def prune_remote_deleted_files(
833
  repo_id: str,
834
  snapshot_dir: Path,
 
860
  and path not in protected_paths
861
  and not any(path == prefix or path.startswith(prefix + "/") for prefix in skip_prefixes)
862
  ]
863
+ _commit_delete_batches(repo_id, stale_files, "Prune", "stale file(s) after workspace sync")
864
+
865
+
866
+ def _commit_delete_batches(repo_id: str, paths: list[str], action: str, reason: str) -> None:
867
+ if not paths:
868
  return
869
 
870
+ total = len(paths)
871
  num_batches = (total + PRUNE_BATCH_SIZE - 1) // PRUNE_BATCH_SIZE
872
  for batch_idx in range(num_batches):
873
+ batch = paths[batch_idx * PRUNE_BATCH_SIZE:(batch_idx + 1) * PRUNE_BATCH_SIZE]
874
  batch_label = f"{batch_idx + 1}/{num_batches}" if num_batches > 1 else ""
875
+ msg = f"{action} {len(batch)} {reason}"
876
  if batch_label:
877
  msg += f" (batch {batch_label})"
878
  operations = [CommitOperationDelete(path_in_repo=path) for path in batch]
 
883
  commit_message=msg,
884
  )
885
  if num_batches > 1:
886
+ print(f"{action} batch {batch_label}: {len(batch)} file(s)")
887
+
888
+
889
def prune_remote_internal_temp_files(repo_id: str) -> None:
    """Delete internal sync scratch files from the remote dataset repo.

    Lists every file in *repo_id*, keeps the paths flagged by
    ``_has_internal_temp_part`` (never ``.gitattributes``), and hands them to
    ``_commit_delete_batches``. Does nothing when ``HF_API`` is not set.
    """
    if HF_API is None:
        return

    listing = list(HF_API.list_repo_files(repo_id=repo_id, repo_type="dataset"))

    scratch_paths = []
    for remote_path in listing:
        if remote_path == ".gitattributes":
            continue
        if _has_internal_temp_part(remote_path):
            scratch_paths.append(remote_path)

    _commit_delete_batches(
        repo_id,
        scratch_paths,
        "Prune internal sync scratch",
        "file(s) from backup",
    )
905
 
906
 
907
  def restore_workspace() -> bool:
 
998
  write_status("disabled", "HF_TOKEN is not configured.")
999
  return (last_fingerprint or "", last_marker or (0, 0, 0, ""))
1000
 
1001
+ global _prune_needed, _remote_temp_prune_done
1002
 
1003
  had_snapshot_copy_failures = snapshot_state_into_workspace()
1004
  repo_id = ensure_repo_exists()
1005
+ if not _remote_temp_prune_done:
1006
+ try:
1007
+ prune_remote_internal_temp_files(repo_id)
1008
+ _remote_temp_prune_done = True
1009
+ except Exception as temp_prune_exc:
1010
+ # Do not block normal user-data sync if the cleanup commit fails;
1011
+ # leave the flag false so the next pass retries before taking the
1012
+ # metadata/fingerprint early exit again.
1013
+ print(f"Warning: could not prune internal sync scratch files: {temp_prune_exc}")
1014
+ _prune_needed = True
1015
  current_marker = metadata_marker(WORKSPACE)
1016
  # Session watcher uses content digests and can detect same-size rewrites
1017
  # whose workspace metadata marker is unchanged. In that case, force the
 
1030
  write_status("synced", "No workspace changes detected.")
1031
  return (last_fingerprint, current_marker)
1032
 
1033
+ upload_timeout_label = "no timeout" if SYNC_UPLOAD_TIMEOUT <= 0 else f"timeout {SYNC_UPLOAD_TIMEOUT:.0f}s"
1034
+ write_status("syncing", f"Uploading workspace to {repo_id} ({SYNC_UPLOAD_STRATEGY}, {upload_timeout_label})")
1035
  snapshot_dir = create_snapshot_dir(WORKSPACE)
1036
  try:
1037
+ upload_snapshot(repo_id, snapshot_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1038
  skip_prune_prefixes: set[str] = set()
1039
  if had_snapshot_copy_failures:
1040
  # BUG FIX: the old code added "huggingclaw-state/openclaw" to
 
1078
  ) -> tuple[str, WorkspaceMarker]:
1079
  SYNC_LOCK_FILE.parent.mkdir(parents=True, exist_ok=True)
1080
  with SYNC_LOCK_FILE.open("w", encoding="utf-8") as lock_handle:
1081
+ deadline = time.monotonic() + SYNC_LOCK_TIMEOUT
1082
+ while True:
1083
+ try:
1084
+ fcntl.flock(lock_handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
1085
+ break
1086
+ except BlockingIOError:
1087
+ if time.monotonic() >= deadline:
1088
+ raise TimeoutError(
1089
+ f"Timed out waiting {SYNC_LOCK_TIMEOUT:.0f}s for workspace sync lock."
1090
+ )
1091
+ if STOP_EVENT.wait(0.25):
1092
+ raise TimeoutError("Stopped while waiting for workspace sync lock.")
1093
  try:
1094
  return _sync_once_unlocked(
1095
  last_fingerprint,
 
1130
  newest_mtime = 0
1131
  metadata_hasher = hashlib.sha256()
1132
 
 
 
 
1133
  for profile_dir in sorted(SESSIONS_ROOT.iterdir()):
1134
  if not profile_dir.is_dir():
1135
  continue
 
1150
  size = int(stat.st_size)
1151
  mtime_ns = int(stat.st_mtime_ns)
1152
  ctime_ns = int(stat.st_ctime_ns)
 
1153
  digest.update(rel.encode("utf-8"))
1154
  digest.update(b"\0")
1155
  digest.update(str(size).encode("ascii"))
 
1158
  digest.update(b"\0")
1159
  digest.update(str(ctime_ns).encode("ascii"))
1160
  digest.update(b"\0")
1161
+ # Sessions are the most important live data. Hash every scan
1162
+ # instead of trusting size/mtime/ctime caches so same-size rewrites
1163
+ # and very small edits are never missed by the sessions trigger.
1164
+ file_hasher = hashlib.sha256()
1165
+ try:
1166
+ with path.open("rb") as handle:
1167
+ for chunk in iter(lambda: handle.read(1024 * 1024), b""):
1168
+ file_hasher.update(chunk)
1169
+ except (FileNotFoundError, IsADirectoryError, NotADirectoryError):
1170
+ continue
1171
+ file_digest = file_hasher.hexdigest()
 
 
 
 
 
 
 
1172
  digest.update(file_digest.encode("ascii"))
1173
  digest.update(b"\0")
1174
 
 
1180
  metadata_hasher.update(digest.hexdigest().encode("ascii"))
1181
  metadata_hasher.update(b"\0")
1182
 
 
1183
  return (file_count, total_size, newest_mtime, metadata_hasher.hexdigest())
1184
 
1185
 
1186
+ def wait_for_config_settle(config_marker: FileMarker) -> tuple[str, FileMarker]:
1187
  stable_since = time.monotonic()
1188
  current_marker = config_marker
1189
 
 
1206
 
1207
 
1208
  def wait_for_sync_trigger(
1209
+ config_marker: FileMarker,
1210
  last_sessions_sync_time: float = 0.0,
1211
+ ) -> tuple[str, FileMarker]:
1212
  deadline = time.monotonic() + max(0, INTERVAL)
1213
  # BUG FIX: also watch sessions directory so new/updated sessions
1214
  # trigger an immediate sync instead of waiting the full interval.
start.sh CHANGED
@@ -357,12 +357,12 @@ case "$LLM_PROVIDER" in
357
  mistral-*|codestral-*|devstral-*|voxtral-*)
358
  export MISTRAL_API_KEY="$LLM_API_KEY"
359
  echo "Note: bare Mistral model '$LLM_MODEL' detected; mapped LLM_API_KEY β†’ MISTRAL_API_KEY. Use 'mistral/${LLM_MODEL}' prefix to be explicit." ;;
360
- moonshotai|meta-llama|deepseek-ai|MiniMaxAI|minimax-ai|Qwen|zai-org|mistralai|google)
361
  echo "Warning: LLM_MODEL='$LLM_MODEL' uses sub-provider prefix '$LLM_PROVIDER'. This is a router-namespaced model (Together/OpenRouter). Mapping LLM_API_KEY β†’ TOGETHER_API_KEY. If using OpenRouter, also set OPENROUTER_API_KEY as a separate secret."
362
  export TOGETHER_API_KEY="${TOGETHER_API_KEY:-$LLM_API_KEY}" ;;
363
  # ── Fallback: Anthropic (default) ──
364
  *)
365
- echo "Warning: Unknown provider prefix '$LLM_PROVIDER' in LLM_MODEL='$LLM_MODEL'. Defaulting to ANTHROPIC_API_KEY. If using a router-namespaced model (e.g. moonshotai/Kimi-K2.6), set TOGETHER_API_KEY or OPENROUTER_API_KEY as a separate secret."
366
  export ANTHROPIC_API_KEY="$LLM_API_KEY"
367
  ;;
368
  esac
@@ -657,7 +657,7 @@ _DEFAULT_XAI_MODELS="xai/grok-4.20,xai/grok-4.3,xai/grok-4.1,xai/grok-latest,xai
657
  _DEFAULT_COHERE_MODELS="cohere/command-a,cohere/command-a-03-2025,cohere/command-a-reasoning-08-2025,cohere/command-r-plus-08-2024"
658
  _DEFAULT_TOGETHER_MODELS="together/moonshotai/Kimi-K2.6,together/deepseek-ai/DeepSeek-V4-Pro,together/Qwen/Qwen3-235B-A22B-Instruct-2507-tput,together/meta-llama/Llama-3.3-70B-Instruct-Turbo"
659
  _DEFAULT_CEREBRAS_MODELS="cerebras/zai-glm-4.7,cerebras/gpt-oss-120b,cerebras/deepseek-r1,cerebras/qwen3-32b"
660
- _DEFAULT_NVIDIA_MODELS="nvidia/nemotron-3-super-120b-a12b,nvidia/nemotron-4-340b-instruct,nvidia/llama-3.1-nemotron-70b-instruct"
661
  _DEFAULT_KILOCODE_MODELS="kilocode/kilo/auto,kilocode/anthropic/claude-opus-4.7,kilocode/openai/gpt-5.4,kilocode/google/gemini-2.5-pro"
662
  _DEFAULT_MOONSHOT_MODELS="moonshot/kimi-k2.6,moonshot/kimi-k2.6-thinking,moonshot/kimi-k2-thinking"
663
  _DEFAULT_MINIMAX_MODELS="minimax/MiniMax-M2.7,minimax/minimax-m1.5"
@@ -725,26 +725,40 @@ inject_provider_models_from_env() {
725
  | awk 'NF' \
726
  | jq -R . \
727
  | jq -s --arg provider "$provider" '
728
- map(
729
- if contains("/") then
730
- # Fix cross-prefix: strip only the first "/" segment (the foreign provider
731
- # prefix) and reapply the correct one. Using `last` was a bug for 3-part IDs:
732
- # e.g. "openai/gpt-oss-120b" injected into "groq" would become
733
- # "groq/gpt-oss-120b" instead of "groq/openai/gpt-oss-120b".
734
- # Examples:
735
- # "google/gemini-2.5-pro" β†’ "google-vertex/gemini-2.5-pro"
736
- # "openai/gpt-oss-120b" β†’ "groq/openai/gpt-oss-120b"
737
- # "moonshotai/kimi-k2.6" β†’ "nvidia/moonshotai/kimi-k2.6"
738
- if startswith($provider + "/") then .
739
- else ($provider + "/" + (split("/") | .[1:] | join("/")))
 
 
 
 
 
 
 
 
 
 
 
 
 
740
  end
 
 
741
  else
742
- ($provider + "/" + .)
743
- end
744
- )
745
  | map({id: ., name: .})
746
  | unique_by(.id)')
747
-
748
  # Build provider patch: always inject models; conditionally inject apiKey, baseUrl, api.
749
  # Existing saved config wins on merge (see config-patch jq below), so this only fills
750
  # in missing fields — it never overwrites what the user already configured manually.
@@ -1414,32 +1428,63 @@ if [ -n "${WEBHOOK_URL:-}" ]; then
1414
  fi
1415
 
1416
  # ── Trap SIGTERM for graceful shutdown ──
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1417
  graceful_shutdown() {
1418
  echo "Shutting down..."
1419
  if [ -f "/home/node/app/openclaw-sync.py" ] && [ -n "${HF_TOKEN:-}" ]; then
1420
  echo "Saving state before exit..."
1421
- # BUG FIX: kill the background sync loop *before* running the shutdown
1422
- # syncs. The loop holds the sync lock while uploading; if it is
1423
- # mid-upload when sync-once-settled runs, the 8-second timeout fires
1424
- # before the lock is released and the settled-sync is silently skipped.
1425
- # Killing the loop first releases the lock immediately (fcntl.flock is
1426
- # released on process exit) so sync-once-settled can acquire it cleanly.
1427
- if [ -n "${SYNC_LOOP_PID:-}" ]; then
1428
- kill "$SYNC_LOOP_PID" 2>/dev/null || true
1429
- # Give Python a moment to flush and release the lock file.
1430
- # Reduced from 0.5 s β†’ 0.3 s to reclaim time for the upload.
1431
- sleep 0.3
1432
- fi
1433
  # Pass 1: wait for config to settle then upload — avoids pushing a
1434
- # half-written JSON config to the dataset. Timeout raised from 8 s
1435
- # to 15 s so the 3-second settle window + actual upload both fit
1436
- # within the budget (old 8 s left only 5 s for the upload itself).
1437
- timeout 15s python3 /home/node/app/openclaw-sync.py sync-once-settled || \
1438
  echo "Warning: could not complete settled shutdown sync"
1439
  # Pass 2: catch any writes that arrived after the settled sync completed.
1440
  # BUG FIX: added timeout (previously unbounded) so HF container kill
1441
  # can't interrupt a hung upload and lose all data silently.
1442
- timeout 8s python3 /home/node/app/openclaw-sync.py sync-once || \
1443
  echo "Warning: could not complete final shutdown sync"
1444
  elif [ -f "/home/node/app/openclaw-sync.py" ]; then
1445
  echo "HF_TOKEN not set; skipping shutdown backup sync."
@@ -2439,27 +2484,21 @@ sync_before_gateway_restart() {
2439
 
2440
  echo "Gateway stopped; saving latest OpenClaw state before restart..."
2441
  # Kill the background sync loop before syncing — same reason as in
2442
- # graceful_shutdown: the loop holds the fcntl.flock while uploading;
2443
- # if it is mid-upload when sync-once-settled runs, the lock contention
2444
- # will eat into (or exhaust) the 15s timeout budget, silently skipping
2445
- # the upload. Killing the loop releases the lock immediately (fcntl.flock
2446
- # is released on process exit) so sync-once-settled can acquire it cleanly.
2447
- if [ -n "${SYNC_LOOP_PID:-}" ]; then
2448
- kill "$SYNC_LOOP_PID" 2>/dev/null || true
2449
- sleep 0.3
2450
- SYNC_LOOP_PID=""
2451
- fi
2452
  # Pass 1: wait for config to settle then upload — avoids pushing a
2453
  # half-written JSON config to the dataset. Timeout added (was unbounded)
2454
  # so a slow HF upload cannot stall gateway restarts indefinitely.
2455
- timeout 15s python3 /home/node/app/openclaw-sync.py sync-once-settled || \
2456
  echo "Warning: could not sync settled state before gateway restart"
2457
  # Pass 2: catch any writes that arrived after the settled sync completed
2458
  # (e.g. session state flushed by OpenClaw just before exit).
2459
  # BUG FIX: this second pass was missing from the restart path (it existed
2460
  # only in graceful_shutdown), so last-second writes were lost on watchdog
2461
  # restarts β€” causing sessions to disappear after OpenClaw auto-restarts.
2462
- timeout 8s python3 /home/node/app/openclaw-sync.py sync-once || \
2463
  echo "Warning: could not complete final sync before gateway restart"
2464
  }
2465
 
 
357
  mistral-*|codestral-*|devstral-*|voxtral-*)
358
  export MISTRAL_API_KEY="$LLM_API_KEY"
359
  echo "Note: bare Mistral model '$LLM_MODEL' detected; mapped LLM_API_KEY β†’ MISTRAL_API_KEY. Use 'mistral/${LLM_MODEL}' prefix to be explicit." ;;
360
+ moonshotai|meta-llama|deepseek-ai|minimaxai|MiniMaxAI|minimax-ai|Qwen|zai-org|mistralai|google)
361
  echo "Warning: LLM_MODEL='$LLM_MODEL' uses sub-provider prefix '$LLM_PROVIDER'. This is a router-namespaced model (Together/OpenRouter). Mapping LLM_API_KEY β†’ TOGETHER_API_KEY. If using OpenRouter, also set OPENROUTER_API_KEY as a separate secret."
362
  export TOGETHER_API_KEY="${TOGETHER_API_KEY:-$LLM_API_KEY}" ;;
363
  # ── Fallback: Anthropic (default) ──
364
  *)
365
+ echo "Warning: Unknown provider prefix '$LLM_PROVIDER' in LLM_MODEL='$LLM_MODEL'. Defaulting to ANTHROPIC_API_KEY. If using a router-namespaced model (e.g. moonshotai/kimi-k2.6), set TOGETHER_API_KEY or OPENROUTER_API_KEY as a separate secret."
366
  export ANTHROPIC_API_KEY="$LLM_API_KEY"
367
  ;;
368
  esac
 
657
  _DEFAULT_COHERE_MODELS="cohere/command-a,cohere/command-a-03-2025,cohere/command-a-reasoning-08-2025,cohere/command-r-plus-08-2024"
658
  _DEFAULT_TOGETHER_MODELS="together/moonshotai/Kimi-K2.6,together/deepseek-ai/DeepSeek-V4-Pro,together/Qwen/Qwen3-235B-A22B-Instruct-2507-tput,together/meta-llama/Llama-3.3-70B-Instruct-Turbo"
659
  _DEFAULT_CEREBRAS_MODELS="cerebras/zai-glm-4.7,cerebras/gpt-oss-120b,cerebras/deepseek-r1,cerebras/qwen3-32b"
660
+ _DEFAULT_NVIDIA_MODELS="nvidia/nemotron-3-super-120b-a12b,deepseek-ai/deepseek-v4-flash,qwen/qwen3-coder-480b-a35b-instruct,moonshotai/kimi-k2.6,minimaxai/minimax-m2.7,openai/gpt-oss-120b,z-ai/glm5.1,stepfun-ai/step-3.7-flash"
661
  _DEFAULT_KILOCODE_MODELS="kilocode/kilo/auto,kilocode/anthropic/claude-opus-4.7,kilocode/openai/gpt-5.4,kilocode/google/gemini-2.5-pro"
662
  _DEFAULT_MOONSHOT_MODELS="moonshot/kimi-k2.6,moonshot/kimi-k2.6-thinking,moonshot/kimi-k2-thinking"
663
  _DEFAULT_MINIMAX_MODELS="minimax/MiniMax-M2.7,minimax/minimax-m1.5"
 
725
  | awk 'NF' \
726
  | jq -R . \
727
  | jq -s --arg provider "$provider" '
728
+ def strip_outer_provider:
729
+ (split("/") | .[1:] | join("/"));
730
+ def normalize_provider_model:
731
+ # OpenClaw stores provider-local model ids inside
732
+ # models.providers.<provider>.models[].id. The user-facing model ref
733
+ # is composed later as <provider>/<id>. Most providers use ids like
734
+ # "gpt-5.4", so "openai/gpt-5.4" becomes "gpt-5.4" here.
735
+ # NVIDIA is special because some valid NVIDIA API ids themselves
736
+ # start with "nvidia/" (for example nvidia/nemotron-...). Keep those
737
+ # single-prefix ids intact, but strip an outer OpenClaw provider prefix
738
+ # from full refs like "nvidia/deepseek-ai/..." or legacy
739
+ # "nvidia/nvidia/nemotron-..." so the provider sends the exact NVIDIA
740
+ # documented model id to integrate.api.nvidia.com.
741
+ if $provider == "nvidia" then
742
+ if startswith("nvidia/nvidia/")
743
+ or startswith("nvidia/deepseek-ai/")
744
+ or startswith("nvidia/qwen/")
745
+ or startswith("nvidia/moonshotai/")
746
+ or startswith("nvidia/minimaxai/")
747
+ or startswith("nvidia/openai/")
748
+ or startswith("nvidia/z-ai/")
749
+ or startswith("nvidia/stepfun-ai/") then
750
+ strip_outer_provider
751
+ else
752
+ .
753
  end
754
+ elif startswith($provider + "/") then
755
+ strip_outer_provider
756
  else
757
+ .
758
+ end;
759
+ map(normalize_provider_model)
760
  | map({id: ., name: .})
761
  | unique_by(.id)')
 
762
  # Build provider patch: always inject models; conditionally inject apiKey, baseUrl, api.
763
  # Existing saved config wins on merge (see config-patch jq below), so this only fills
764
  # in missing fields — it never overwrites what the user already configured manually.
 
1428
  fi
1429
 
1430
  # ── Trap SIGTERM for graceful shutdown ──
1431
stop_background_sync_loop() {
  # Stop the background workspace sync loop identified by SYNC_LOOP_PID and
  # clear the variable. Sends SIGTERM, polls for up to ~2 s, then escalates to
  # SIGKILL and confirms the process is gone. Always returns 0.
  [ -n "${SYNC_LOOP_PID:-}" ] || return 0

  # Keep the loop counter out of the global namespace.
  local _sync_stop_i

  if ! kill -0 "$SYNC_LOOP_PID" 2>/dev/null; then
    SYNC_LOOP_PID=""
    return 0
  fi

  kill "$SYNC_LOOP_PID" 2>/dev/null || true
  # Wait for the Python process to actually exit so its fcntl lock is released.
  # A fixed short sleep was not enough when the process was inside a HF upload,
  # which made the following one-shot syncs burn their timeout waiting on the
  # lock and left sessions/config changes unsaved.
  for _sync_stop_i in 1 2 3 4 5 6 7 8 9 10; do
    if ! kill -0 "$SYNC_LOOP_PID" 2>/dev/null; then
      SYNC_LOOP_PID=""
      return 0
    fi
    sleep 0.2
  done

  # The process may have exited during the final sleep; re-check before
  # escalating so we do not warn or SIGKILL needlessly.
  if ! kill -0 "$SYNC_LOOP_PID" 2>/dev/null; then
    SYNC_LOOP_PID=""
    return 0
  fi

  echo "Warning: workspace sync loop did not stop after SIGTERM; forcing it to release the sync lock."
  kill -9 "$SYNC_LOOP_PID" 2>/dev/null || true
  # Confirm the SIGKILL took effect (bounded to ~1 s) instead of assuming a
  # single fixed sleep is enough.
  for _sync_stop_i in 1 2 3 4 5; do
    kill -0 "$SYNC_LOOP_PID" 2>/dev/null || break
    sleep 0.2
  done
  SYNC_LOOP_PID=""
}
1459
+
1460
run_openclaw_sync_with_timeout() {
  # Run one openclaw-sync.py sub-command with SYNC_LOCK_TIMEOUT set.
  #   $1  outer timeout in seconds; "0" runs the command without `timeout`
  #   $2  sub-command passed to openclaw-sync.py
  #   $3  value exported to the command as SYNC_LOCK_TIMEOUT
  # Returns the exit status of the command that was run.
  local timeout_seconds="$1"
  local command_name="$2"
  local lock_timeout_seconds="$3"

  # Assemble the base command in the positional parameters, then wrap it in
  # `timeout` only when an outer limit was requested.
  set -- env SYNC_LOCK_TIMEOUT="$lock_timeout_seconds" \
    python3 /home/node/app/openclaw-sync.py "$command_name"
  if [ "${timeout_seconds}" != "0" ]; then
    set -- timeout --kill-after=10s "${timeout_seconds}s" "$@"
  fi

  "$@"
}
1472
+
1473
  graceful_shutdown() {
1474
  echo "Shutting down..."
1475
  if [ -f "/home/node/app/openclaw-sync.py" ] && [ -n "${HF_TOKEN:-}" ]; then
1476
  echo "Saving state before exit..."
1477
+ stop_background_sync_loop
 
 
 
 
 
 
 
 
 
 
 
1478
  # Pass 1: wait for config to settle then upload — avoids pushing a
1479
+ # half-written JSON config to the dataset. The per-command lock wait is
1480
+ # intentionally shorter than the outer timeout so a stuck lock fails clearly
1481
+ # and leaves time for the final catch-up pass.
1482
+ run_openclaw_sync_with_timeout "${SYNC_SETTLED_TIMEOUT:-120}" sync-once-settled "${SYNC_ONE_SHOT_LOCK_TIMEOUT:-5}" || \
1483
  echo "Warning: could not complete settled shutdown sync"
1484
  # Pass 2: catch any writes that arrived after the settled sync completed.
1485
  # BUG FIX: added timeout (previously unbounded) so HF container kill
1486
  # can't interrupt a hung upload and lose all data silently.
1487
+ run_openclaw_sync_with_timeout "${SYNC_FINAL_TIMEOUT:-120}" sync-once "${SYNC_ONE_SHOT_LOCK_TIMEOUT:-5}" || \
1488
  echo "Warning: could not complete final shutdown sync"
1489
  elif [ -f "/home/node/app/openclaw-sync.py" ]; then
1490
  echo "HF_TOKEN not set; skipping shutdown backup sync."
 
2484
 
2485
  echo "Gateway stopped; saving latest OpenClaw state before restart..."
2486
  # Kill the background sync loop before syncing — same reason as in
2487
+ # graceful_shutdown: the loop holds the fcntl.flock while uploading. Wait for
2488
+ # it to exit (and force-kill as a last resort) before the one-shot syncs so
2489
+ # gateway restarts cannot hang behind a stale lock.
2490
+ stop_background_sync_loop
 
 
 
 
 
 
2491
  # Pass 1: wait for config to settle then upload — avoids pushing a
2492
  # half-written JSON config to the dataset. Timeout added (was unbounded)
2493
  # so a slow HF upload cannot stall gateway restarts indefinitely.
2494
+ run_openclaw_sync_with_timeout "${SYNC_SETTLED_TIMEOUT:-120}" sync-once-settled "${SYNC_ONE_SHOT_LOCK_TIMEOUT:-5}" || \
2495
  echo "Warning: could not sync settled state before gateway restart"
2496
  # Pass 2: catch any writes that arrived after the settled sync completed
2497
  # (e.g. session state flushed by OpenClaw just before exit).
2498
  # BUG FIX: this second pass was missing from the restart path (it existed
2499
  # only in graceful_shutdown), so last-second writes were lost on watchdog
2500
  # restarts — causing sessions to disappear after OpenClaw auto-restarts.
2501
+ run_openclaw_sync_with_timeout "${SYNC_FINAL_TIMEOUT:-120}" sync-once "${SYNC_ONE_SHOT_LOCK_TIMEOUT:-5}" || \
2502
  echo "Warning: could not complete final sync before gateway restart"
2503
  }
2504