Spaces:

anurag008w
/

HuggingClaw

Running

App Files Community

Anurag committed 5 days ago

Commit

ce96c5b

1 Parent(s): e1afa1b

Refresh verified NVIDIA model catalogs

Browse files

Files changed (6) hide show

.env.example +34 -2
README.md +6 -0
env-builder.js +14 -4
jupyter-devdata-sync.py +46 -5
openclaw-sync.py +275 -77
start.sh +87 -48

.env.example CHANGED Viewed

@@ -105,6 +105,10 @@ LLM_API_KEY=your_api_key_here
 #
 # NVIDIA (NVIDIA_API_KEY):
 #   - nvidia/nemotron-3-super-120b-a12b
 #
 # Groq (GROQ_API_KEY):
 #   - groq/mixtral-8x7b-32768
@@ -272,8 +276,10 @@ LLM_API_KEY_FALLBACK_ENABLED=true
 #
 # Optional: explicitly pin model lists per provider for Control UI visibility
 # when provider keys are configured.
-# Format: comma-separated model IDs
-# NVIDIA_MODELS=meta/llama-3.1-70b-instruct,nvidia/llama-3.1-nemotron-70b-instruct
 # OPENAI_MODELS=gpt-4o-mini,gpt-4.1
 # GROQ_MODELS=llama-3.3-70b-versatile,deepseek-r1-distill-llama-70b
 #      independently and in parallel.
@@ -395,6 +401,32 @@ OPENCLAW_CONFIG_SETTLE_SECONDS=3
 # Minimum gap between sessions-triggered immediate syncs (seconds). Default: 30.
 SESSIONS_MIN_SYNC_GAP=30
 # Webhooks: Standard POST notifications for lifecycle events
 # WEBHOOK_URL=https://your-webhook-endpoint.com/log

 #
 # NVIDIA (NVIDIA_API_KEY):
 #   - nvidia/nemotron-3-super-120b-a12b
+#   - nvidia/deepseek-ai/deepseek-v4-flash
+#   - nvidia/qwen/qwen3-coder-480b-a35b-instruct
+#   - nvidia/moonshotai/kimi-k2.6
+#   - nvidia/stepfun-ai/step-3.7-flash
 #
 # Groq (GROQ_API_KEY):
 #   - groq/mixtral-8x7b-32768
 #
 # Optional: explicitly pin model lists per provider for Control UI visibility
 # when provider keys are configured.
+# Format: comma-separated provider-local model IDs. For NVIDIA, write NVIDIA-owned
+# models as nvidia/<id> and third-party hosted models as <owner>/<id> (for example
+# deepseek-ai/deepseek-v4-flash); do not add an extra outer OpenClaw "nvidia/"
+# provider prefix in NVIDIA_MODELS.
+# NVIDIA_MODELS=nvidia/nemotron-3-super-120b-a12b,deepseek-ai/deepseek-v4-flash,qwen/qwen3-coder-480b-a35b-instruct,moonshotai/kimi-k2.6,stepfun-ai/step-3.7-flash
 # OPENAI_MODELS=gpt-4o-mini,gpt-4.1
 # GROQ_MODELS=llama-3.3-70b-versatile,deepseek-r1-distill-llama-70b
 #      independently and in parallel.
 # Minimum gap between sessions-triggered immediate syncs (seconds). Default: 30.
 SESSIONS_MIN_SYNC_GAP=30
+# Max seconds a one-off sync waits for another sync process to release its lock.
+# Startup background sync still keeps retrying; this mainly prevents shutdown or
+# gateway-restart saves from hanging forever behind a stuck upload. Default: 20.
+SYNC_LOCK_TIMEOUT=20
+# Max seconds one HF upload call may run before the sync process marks it failed,
+# releases the lock, and retries on the next pass. This prevents Backup from
+# staying stuck at SYNCING forever. Set to 0 to disable this watchdog. Default: 180.
+SYNC_UPLOAD_TIMEOUT=180
+# Upload implementation. Default `folder` uses a normal commit upload and is
+# safest for HuggingClaw's small/medium snapshots. Set `large_folder` only if you
+# explicitly need Hugging Face's resumable large-folder uploader.
+SYNC_UPLOAD_STRATEGY=folder
+# Upload time budgets for the shutdown/restart one-shot syncs. These are
+# intentionally much larger than SYNC_LOCK_TIMEOUT so that 40-50 MB files are not
+# killed mid-upload merely because the lock wait has to be kept short. Set to 0
+# to disable the outer command timeout completely. Defaults: 120 seconds each.
+SYNC_SETTLED_TIMEOUT=120
+SYNC_FINAL_TIMEOUT=120
+# Lock wait used only by shutdown/restart one-shot syncs. Keep this short
+# because start.sh stops the background sync loop before these commands run.
+SYNC_ONE_SHOT_LOCK_TIMEOUT=5
 # Webhooks: Standard POST notifications for lifecycle events
 # WEBHOOK_URL=https://your-webhook-endpoint.com/log

README.md CHANGED Viewed

@@ -188,6 +188,12 @@ HuggingClaw automatically syncs your workspace (chats, settings, sessions) to a
 | `OPENCLAW_CONFIG_WATCH_INTERVAL` | `1` | How often to check `openclaw.json` for immediate settings sync |
 | `OPENCLAW_CONFIG_SETTLE_SECONDS` | `3` | How long `openclaw.json` must stay valid and unchanged before syncing |
 | `SESSIONS_MIN_SYNC_GAP` | `30` | Minimum seconds between session-triggered immediate syncs |
 ## 📦 Ephemeral Package Re-install *(Optional)*

 | `OPENCLAW_CONFIG_WATCH_INTERVAL` | `1` | How often to check `openclaw.json` for immediate settings sync |
 | `OPENCLAW_CONFIG_SETTLE_SECONDS` | `3` | How long `openclaw.json` must stay valid and unchanged before syncing |
 | `SESSIONS_MIN_SYNC_GAP` | `30` | Minimum seconds between session-triggered immediate syncs |
+| `SYNC_LOCK_TIMEOUT` | `20` | Max seconds one-off syncs wait for another sync lock before failing clearly |
+| `SYNC_UPLOAD_TIMEOUT` | `180` | Max seconds one HF upload call can stay active before failing and retrying next pass; set `0` to disable |
+| `SYNC_UPLOAD_STRATEGY` | `folder` | Upload method: `folder` for normal commit uploads, `large_folder` for HF resumable large-folder uploader |
+| `SYNC_SETTLED_TIMEOUT` | `120` | Shutdown/restart settled-sync upload budget; set `0` to disable the outer timeout |
+| `SYNC_FINAL_TIMEOUT` | `120` | Shutdown/restart final catch-up sync upload budget; set `0` to disable the outer timeout |
+| `SYNC_ONE_SHOT_LOCK_TIMEOUT` | `5` | Short lock wait for shutdown/restart one-shot syncs after the background loop is stopped |
 ## 📦 Ephemeral Package Re-install *(Optional)*

env-builder.js CHANGED Viewed

@@ -141,8 +141,13 @@ const MODEL_CATALOGS = {
     ],
     "NVIDIA": [
       "nvidia/nemotron-3-super-120b-a12b",
-      "nvidia/nemotron-4-340b-instruct",
-      "nvidia/llama-3.1-nemotron-70b-instruct"
     ],
     "KiloCode": [
       "kilocode/anthropic/claude-opus-4.7",
@@ -375,8 +380,13 @@ const MODEL_CATALOGS = {
   ],
   "NVIDIA_MODELS": [
     "nvidia/nemotron-3-super-120b-a12b",
-    "nvidia/nemotron-4-340b-instruct",
-    "nvidia/llama-3.1-nemotron-70b-instruct"
   ],
   "KILOCODE_MODELS": [
     "kilocode/anthropic/claude-opus-4.7",

     ],
     "NVIDIA": [
       "nvidia/nemotron-3-super-120b-a12b",
+      "nvidia/deepseek-ai/deepseek-v4-flash",
+      "nvidia/qwen/qwen3-coder-480b-a35b-instruct",
+      "nvidia/moonshotai/kimi-k2.6",
+      "nvidia/minimaxai/minimax-m2.7",
+      "nvidia/openai/gpt-oss-120b",
+      "nvidia/z-ai/glm5.1",
+      "nvidia/stepfun-ai/step-3.7-flash"
     ],
     "KiloCode": [
       "kilocode/anthropic/claude-opus-4.7",
   ],
   "NVIDIA_MODELS": [
     "nvidia/nemotron-3-super-120b-a12b",
+    "deepseek-ai/deepseek-v4-flash",
+    "qwen/qwen3-coder-480b-a35b-instruct",
+    "moonshotai/kimi-k2.6",
+    "minimaxai/minimax-m2.7",
+    "openai/gpt-oss-120b",
+    "z-ai/glm5.1",
+    "stepfun-ai/step-3.7-flash"
   ],
   "KILOCODE_MODELS": [
     "kilocode/anthropic/claude-opus-4.7",

jupyter-devdata-sync.py CHANGED Viewed

@@ -167,6 +167,10 @@ def _matches_prefix(parts: tuple[str, ...], prefix: tuple[str, ...]) -> bool:
     return len(parts) >= len(prefix) and parts[:len(prefix)] == prefix
 def should_skip(p: Path):
     # Reserved sync helper files are not user data. Old datasets may still have
     # these markers, but fresh snapshots no longer create them.
@@ -188,15 +192,52 @@ def should_skip(p: Path):
     # Skip any component whose name looks like a secret file/dir.
     return any(_name_is_secret(part) for part in parts)
 def snapshot(src: Path, dst: Path) -> tuple[bool, set[str]]:
     had_copy_failures = False
     protected_large_files: set[str] = set()
-    for p in src.rglob("*"):
         rel = p.relative_to(src)
-        if should_skip(rel):
-            continue
-        if p.is_symlink():
-            continue
         target = dst / rel
         if p.is_dir():
             # Keep parent directories for files copied later in this snapshot.

     return len(parts) >= len(prefix) and parts[:len(prefix)] == prefix
+def _is_prefix_of(parts: tuple[str, ...], full_path: tuple[str, ...]) -> bool:
+    return len(parts) <= len(full_path) and full_path[:len(parts)] == parts
 def should_skip(p: Path):
     # Reserved sync helper files are not user data. Old datasets may still have
     # these markers, but fresh snapshots no longer create them.
     # Skip any component whose name looks like a secret file/dir.
     return any(_name_is_secret(part) for part in parts)
+def iter_sync_tree(root: Path):
+    """Yield syncable DevData paths without descending into excluded trees."""
+    if not root.exists():
+        return
+    for dirpath, dirnames, filenames in os.walk(root):
+        dir_path = Path(dirpath)
+        try:
+            dir_rel = dir_path.relative_to(root)
+        except ValueError:
+            dir_rel = Path()
+        kept_dirnames: list[str] = []
+        for dirname in sorted(dirnames):
+            rel = dir_rel / dirname
+            child = dir_path / dirname
+            rel_parts = rel.parts
+            # Do not prune ancestors of explicitly allowed Jupyter settings
+            # paths. should_skip(.local/share/jupyter) is true by design for
+            # files under that tree, but we must still descend through the
+            # parent dirs to reach lab/user-settings and lab/workspaces.
+            allowed_ancestor = any(
+                _is_prefix_of(rel_parts, prefix) for prefix in JUPYTER_DATA_ALLOW_PREFIXES
+            )
+            if child.is_symlink() or (should_skip(rel) and not allowed_ancestor):
+                continue
+            kept_dirnames.append(dirname)
+        dirnames[:] = kept_dirnames
+        for dirname in kept_dirnames:
+            yield dir_path / dirname
+        for filename in sorted(filenames):
+            rel = dir_rel / filename
+            child = dir_path / filename
+            if child.is_symlink() or should_skip(rel):
+                continue
+            yield child
 def snapshot(src: Path, dst: Path) -> tuple[bool, set[str]]:
     had_copy_failures = False
     protected_large_files: set[str] = set()
+    for p in iter_sync_tree(src):
         rel = p.relative_to(src)
         target = dst / rel
         if p.is_dir():
             # Keep parent directories for files copied later in this snapshot.

openclaw-sync.py CHANGED Viewed

@@ -51,6 +51,9 @@ CONFIG_SETTLE_SECONDS = max(
     float(os.environ.get("OPENCLAW_CONFIG_SETTLE_SECONDS", "3")),
 )
 SESSIONS_MIN_SYNC_GAP = int(os.environ.get("SESSIONS_MIN_SYNC_GAP", "30"))
 HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
 HF_USERNAME = os.environ.get("HF_USERNAME", "").strip()
 SPACE_AUTHOR_NAME = os.environ.get("SPACE_AUTHOR_NAME", "").strip()
@@ -88,6 +91,22 @@ EXCLUDED_STATE_NAMES = {
     # "WhatsApp enabled but not installing on restart".
     "extensions",
 }
 SESSIONS_ROOT = OPENCLAW_HOME / "agents"
 WHATSAPP_CREDS_DIR = OPENCLAW_HOME / "credentials" / "whatsapp" / "default"
 WHATSAPP_BACKUP_DIR = STATE_DIR / "credentials" / "whatsapp" / "default"
@@ -95,11 +114,29 @@ RESET_MARKER = WORKSPACE / ".reset_credentials"
 HF_API = HfApi(token=HF_TOKEN) if HF_TOKEN else None
 STOP_EVENT = threading.Event()
 _REPO_ID_CACHE: str | None = None
-_SESSIONS_FILE_DIGEST_CACHE: dict[str, tuple[int, int, int, str]] = {}
 WorkspaceMarker: TypeAlias = tuple[int, int, int, str]
 # Set True when prune fails so the next sync pass bypasses the fingerprint
 # early-exit and retries the prune even if no local files changed.
 _prune_needed: bool = False
 def write_status(status: str, message: str) -> None:
@@ -133,6 +170,86 @@ def _remove_path(path: Path) -> None:
         path.unlink(missing_ok=True)
 def _copy_hot_directory_snapshot(source_path: Path, tmp_path: Path) -> bool:
     """Best-effort mirror of a hot directory into *tmp_path*.
@@ -259,6 +376,7 @@ def copy_state_entry_with_retry(source_path: Path, backup_path: Path, attempts:
 def snapshot_state_into_workspace() -> bool:
     had_copy_failures = False
     try:
         STATE_DIR.mkdir(parents=True, exist_ok=True)
         # Atomic snapshot: copy to a staging dir first, then rename.
         # This prevents a half-written (or empty) backup if we crash mid-copy,
@@ -274,7 +392,7 @@ def snapshot_state_into_workspace() -> bool:
         skipped_entries: list[tuple[str, Exception]] = []
         copied_entry_names: set[str] = set()
         for source_path in OPENCLAW_HOME.iterdir():
-            if source_path.name in EXCLUDED_STATE_NAMES:
                 continue
             backup_path = staging_dir / source_path.name
@@ -300,7 +418,9 @@ def snapshot_state_into_workspace() -> bool:
         # dataset on the next sync.  Only remove an entry from staging when the
         # source has genuinely been deleted from OPENCLAW_HOME.
         for staged_path in list(staging_dir.iterdir()):
-            if staged_path.name in EXCLUDED_STATE_NAMES:
                 continue
             if staged_path.name in copied_entry_names:
                 continue
@@ -425,13 +545,8 @@ def locally_existing_large_files(root: Path) -> set[str]:
     protected: set[str] = set()
     if not root.exists():
         return protected
-    for path in root.rglob("*"):
-        if not path.is_file():
-            continue
         rel = path.relative_to(root).as_posix()
-        parts = Path(rel).parts
-        if any(part in EXCLUDED_SYNC_DIRS for part in parts):
-            continue
         try:
             if path.stat().st_size > MAX_FILE_SIZE_BYTES:
                 protected.add(rel)
@@ -441,6 +556,7 @@ def locally_existing_large_files(root: Path) -> set[str]:
 def restore_embedded_state() -> None:
     state_backup_root = STATE_DIR / "openclaw"
     # Migration fix: old backups stored state in ".huggingclaw-state/openclaw"
@@ -462,7 +578,7 @@ def restore_embedded_state() -> None:
     if state_backup_root.is_dir():
         for source_path in state_backup_root.iterdir():
             name = source_path.name
-            if name in EXCLUDED_STATE_NAMES:
                 if source_path.is_dir():
                     shutil.rmtree(source_path, ignore_errors=True)
                 else:
@@ -527,7 +643,7 @@ def ensure_repo_exists() -> str:
 def _should_exclude(rel_posix: str, path: Path) -> bool:
     parts = Path(rel_posix).parts
-    if any(part in EXCLUDED_SYNC_DIRS for part in parts):
         return True
     if path.is_file():
         try:
@@ -538,16 +654,28 @@ def _should_exclude(rel_posix: str, path: Path) -> bool:
     return False
-def file_marker(path: Path) -> tuple[int, int, int]:
     try:
         stat = path.stat()
     except OSError:
-        return (0, 0, 0)
     if not path.is_file():
-        return (0, 0, 0)
-    return (1, int(stat.st_size), int(stat.st_mtime_ns))
 def metadata_marker(root: Path) -> WorkspaceMarker:
@@ -558,17 +686,12 @@ def metadata_marker(root: Path) -> WorkspaceMarker:
     total_size = 0
     newest_mtime = 0
     metadata_hasher = hashlib.sha256()
-    for path in sorted(root.rglob("*")):
-        if not path.is_file():
-            continue
         rel = path.relative_to(root).as_posix()
         # BUG FIX: use directory-only exclusion here (not size-based) so that
         # deleting a large file changes the marker and triggers a sync/prune
         # pass.  Large files are still excluded from the upload snapshot via
         # _should_exclude; this only controls change-detection.
-        parts = Path(rel).parts
-        if any(part in EXCLUDED_SYNC_DIRS for part in parts):
-            continue
         try:
             stat = path.stat()
         except OSError:
@@ -576,14 +699,17 @@ def metadata_marker(root: Path) -> WorkspaceMarker:
         file_count += 1
         size = int(stat.st_size)
         mtime_ns = int(stat.st_mtime_ns)
         total_size += size
-        newest_mtime = max(newest_mtime, mtime_ns)
         metadata_hasher.update(rel.encode("utf-8"))
         metadata_hasher.update(b"\0")
         metadata_hasher.update(str(size).encode("ascii"))
         metadata_hasher.update(b"\0")
         metadata_hasher.update(str(mtime_ns).encode("ascii"))
         metadata_hasher.update(b"\0")
     return (file_count, total_size, newest_mtime, metadata_hasher.hexdigest())
@@ -592,15 +718,12 @@ def fingerprint_dir(root: Path) -> str:
     if not root.exists():
         return hasher.hexdigest()
-    for path in sorted(p for p in root.rglob("*") if p.is_file()):
         rel = path.relative_to(root).as_posix()
         # BUG FIX: use directory-only exclusion (not size-based) so that
         # creating or deleting a large file changes the fingerprint.
         # Large files are hashed by path + metadata only (no content read)
         # to avoid reading gigabytes for a change-detection hash.
-        parts = Path(rel).parts
-        if any(part in EXCLUDED_SYNC_DIRS for part in parts):
-            continue
         hasher.update(rel.encode("utf-8"))
         try:
             stat = path.stat()
@@ -628,7 +751,7 @@ def fingerprint_dir(root: Path) -> str:
 def create_snapshot_dir(source_root: Path) -> Path:
     staging_root = Path(tempfile.mkdtemp(prefix="huggingclaw-sync-"))
-    for path in sorted(source_root.rglob("*")):
         rel = path.relative_to(source_root)
         rel_posix = rel.as_posix()
         if _should_exclude(rel_posix, path):
@@ -648,6 +771,64 @@ def create_snapshot_dir(source_root: Path) -> Path:
     return staging_root
 def prune_remote_deleted_files(
     repo_id: str,
     snapshot_dir: Path,
@@ -679,15 +860,19 @@ def prune_remote_deleted_files(
         and path not in protected_paths
         and not any(path == prefix or path.startswith(prefix + "/") for prefix in skip_prefixes)
     ]
-    if not stale_files:
         return
-    total = len(stale_files)
     num_batches = (total + PRUNE_BATCH_SIZE - 1) // PRUNE_BATCH_SIZE
     for batch_idx in range(num_batches):
-        batch = stale_files[batch_idx * PRUNE_BATCH_SIZE:(batch_idx + 1) * PRUNE_BATCH_SIZE]
         batch_label = f"{batch_idx + 1}/{num_batches}" if num_batches > 1 else ""
-        msg = f"Prune {len(batch)} stale file(s) after workspace sync"
         if batch_label:
             msg += f" (batch {batch_label})"
         operations = [CommitOperationDelete(path_in_repo=path) for path in batch]
@@ -698,7 +883,25 @@ def prune_remote_deleted_files(
             commit_message=msg,
         )
         if num_batches > 1:
-            print(f"Pruned batch {batch_label}: {len(batch)} file(s)")
 def restore_workspace() -> bool:
@@ -795,10 +998,20 @@ def _sync_once_unlocked(
         write_status("disabled", "HF_TOKEN is not configured.")
         return (last_fingerprint or "", last_marker or (0, 0, 0, ""))
-    global _prune_needed
     had_snapshot_copy_failures = snapshot_state_into_workspace()
     repo_id = ensure_repo_exists()
     current_marker = metadata_marker(WORKSPACE)
     # Session watcher uses content digests and can detect same-size rewrites
     # whose workspace metadata marker is unchanged.  In that case, force the
@@ -817,25 +1030,11 @@ def _sync_once_unlocked(
         write_status("synced", "No workspace changes detected.")
         return (last_fingerprint, current_marker)
-    write_status("syncing", f"Uploading workspace to {repo_id}")
     snapshot_dir = create_snapshot_dir(WORKSPACE)
     try:
-        try:
-            HF_API.upload_large_folder(
-                repo_id=repo_id,
-                repo_type="dataset",
-                folder_path=str(snapshot_dir),
-                num_workers=2,
-                print_report=False,
-            )
-        except AttributeError:
-            upload_folder(
-                folder_path=str(snapshot_dir),
-                repo_id=repo_id,
-                repo_type="dataset",
-                token=HF_TOKEN,
-                commit_message=f"HuggingClaw sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
-            )
         skip_prune_prefixes: set[str] = set()
         if had_snapshot_copy_failures:
             # BUG FIX: the old code added "huggingclaw-state/openclaw" to
@@ -879,7 +1078,18 @@ def sync_once(
 ) -> tuple[str, WorkspaceMarker]:
     SYNC_LOCK_FILE.parent.mkdir(parents=True, exist_ok=True)
     with SYNC_LOCK_FILE.open("w", encoding="utf-8") as lock_handle:
-        fcntl.flock(lock_handle, fcntl.LOCK_EX)
         try:
             return _sync_once_unlocked(
                 last_fingerprint,
@@ -920,9 +1130,6 @@ def sessions_marker() -> tuple[int, int, int, str]:
     newest_mtime = 0
     metadata_hasher = hashlib.sha256()
-    global _SESSIONS_FILE_DIGEST_CACHE
-    next_cache: dict[str, tuple[int, int, int, str]] = {}
     for profile_dir in sorted(SESSIONS_ROOT.iterdir()):
         if not profile_dir.is_dir():
             continue
@@ -943,7 +1150,6 @@ def sessions_marker() -> tuple[int, int, int, str]:
             size = int(stat.st_size)
             mtime_ns = int(stat.st_mtime_ns)
             ctime_ns = int(stat.st_ctime_ns)
-            cache_key = f"{profile_dir.name}\0{rel}"
             digest.update(rel.encode("utf-8"))
             digest.update(b"\0")
             digest.update(str(size).encode("ascii"))
@@ -952,24 +1158,17 @@ def sessions_marker() -> tuple[int, int, int, str]:
             digest.update(b"\0")
             digest.update(str(ctime_ns).encode("ascii"))
             digest.update(b"\0")
-            cached = _SESSIONS_FILE_DIGEST_CACHE.get(cache_key)
-            if (
-                cached is not None
-                and cached[0] == size
-                and cached[1] == mtime_ns
-                and cached[2] == ctime_ns
-            ):
-                file_digest = cached[3]
-            else:
-                file_hasher = hashlib.sha256()
-                try:
-                    with path.open("rb") as handle:
-                        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
-                            file_hasher.update(chunk)
-                except (FileNotFoundError, IsADirectoryError, NotADirectoryError):
-                    continue
-                file_digest = file_hasher.hexdigest()
-            next_cache[cache_key] = (size, mtime_ns, ctime_ns, file_digest)
             digest.update(file_digest.encode("ascii"))
             digest.update(b"\0")
@@ -981,11 +1180,10 @@ def sessions_marker() -> tuple[int, int, int, str]:
         metadata_hasher.update(digest.hexdigest().encode("ascii"))
         metadata_hasher.update(b"\0")
-    _SESSIONS_FILE_DIGEST_CACHE = next_cache
     return (file_count, total_size, newest_mtime, metadata_hasher.hexdigest())
-def wait_for_config_settle(config_marker: tuple[int, int, int]) -> tuple[str, tuple[int, int, int]]:
     stable_since = time.monotonic()
     current_marker = config_marker
@@ -1008,9 +1206,9 @@ def wait_for_config_settle(config_marker: tuple[int, int, int]) -> tuple[str, tu
 def wait_for_sync_trigger(
-    config_marker: tuple[int, int, int],
     last_sessions_sync_time: float = 0.0,
-) -> tuple[str, tuple[int, int, int]]:
     deadline = time.monotonic() + max(0, INTERVAL)
     # BUG FIX: also watch sessions directory so new/updated sessions
     # trigger an immediate sync instead of waiting the full interval.

     float(os.environ.get("OPENCLAW_CONFIG_SETTLE_SECONDS", "3")),
 )
 SESSIONS_MIN_SYNC_GAP = int(os.environ.get("SESSIONS_MIN_SYNC_GAP", "30"))
+SYNC_LOCK_TIMEOUT = max(1.0, float(os.environ.get("SYNC_LOCK_TIMEOUT", "20")))
+SYNC_UPLOAD_TIMEOUT = max(0.0, float(os.environ.get("SYNC_UPLOAD_TIMEOUT", "180")))
+SYNC_UPLOAD_STRATEGY = os.environ.get("SYNC_UPLOAD_STRATEGY", "folder").strip().lower() or "folder"
 HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
 HF_USERNAME = os.environ.get("HF_USERNAME", "").strip()
 SPACE_AUTHOR_NAME = os.environ.get("SPACE_AUTHOR_NAME", "").strip()
     # "WhatsApp enabled but not installing on restart".
     "extensions",
 }
+# Internal restore/snapshot working directories live beside the workspace under
+# /home/node/.openclaw.  If a previous container is killed mid-restore, these
+# hidden directories can be left behind and may contain a full copy of the
+# workspace, including huggingclaw-state/openclaw itself.  Backing them up makes
+# the dataset grow recursively under huggingclaw-state/openclaw and can keep the
+# sync loop busy forever, which in turn blocks session deletes/updates from being
+# pruned remotely.  Treat every such helper path as disposable sync scratch.
+INTERNAL_TEMP_STATE_NAMES = {
+    ".workspace-restore-staging",
+    ".workspace-restore-old",
+    ".openclaw-staging",
+}
+INTERNAL_TEMP_NAME_PREFIXES = (
+    ".workspace-restore-",
+    ".openclaw-staging",
+)
 SESSIONS_ROOT = OPENCLAW_HOME / "agents"
 WHATSAPP_CREDS_DIR = OPENCLAW_HOME / "credentials" / "whatsapp" / "default"
 WHATSAPP_BACKUP_DIR = STATE_DIR / "credentials" / "whatsapp" / "default"
 HF_API = HfApi(token=HF_TOKEN) if HF_TOKEN else None
 STOP_EVENT = threading.Event()
 _REPO_ID_CACHE: str | None = None
 WorkspaceMarker: TypeAlias = tuple[int, int, int, str]
+FileMarker: TypeAlias = tuple[int, int, int, int, str]
 # Set True when prune fails so the next sync pass bypasses the fingerprint
 # early-exit and retries the prune even if no local files changed.
 _prune_needed: bool = False
+_remote_temp_prune_done: bool = False
+# Workspace-relative temp paths managed by this sync script.  Keep this narrow:
+# user projects may legitimately contain folders with similar names, and those
+# must keep syncing normally.  Only the script-owned state/restore scratch
+# locations are skipped and pruned.
+INTERNAL_TEMP_WORKSPACE_PREFIXES: tuple[tuple[str, ...], ...] = (
+    (".workspace-restore-staging",),
+    (".workspace-restore-old",),
+    ("huggingclaw-state", ".openclaw-staging"),
+    ("huggingclaw-state", "openclaw", ".workspace-restore-staging"),
+    ("huggingclaw-state", "openclaw", ".workspace-restore-old"),
+    ("huggingclaw-state", "openclaw", ".openclaw-staging"),
+)
+class SyncUploadTimeoutError(TimeoutError):
+    pass
 def write_status(status: str, message: str) -> None:
         path.unlink(missing_ok=True)
+def _is_internal_temp_name(name: str) -> bool:
+    return name in INTERNAL_TEMP_STATE_NAMES or any(
+        name.startswith(prefix) for prefix in INTERNAL_TEMP_NAME_PREFIXES
+    )
+def _has_internal_temp_part(path: str) -> bool:
+    parts = Path(path).parts
+    return any(_matches_prefix(parts, prefix) for prefix in INTERNAL_TEMP_WORKSPACE_PREFIXES)
+def _matches_prefix(parts: tuple[str, ...], prefix: tuple[str, ...]) -> bool:
+    return len(parts) >= len(prefix) and parts[:len(prefix)] == prefix
+def _should_skip_state_entry_name(name: str) -> bool:
+    return (
+        name in EXCLUDED_STATE_NAMES
+        or name in EXCLUDED_SYNC_DIRS
+        or _is_internal_temp_name(name)
+    )
+def _should_skip_sync_path(rel_parts: tuple[str, ...]) -> bool:
+    return any(part in EXCLUDED_SYNC_DIRS for part in rel_parts) or any(
+        _matches_prefix(rel_parts, prefix) for prefix in INTERNAL_TEMP_WORKSPACE_PREFIXES
+    )
+def _iter_sync_tree(root: Path):
+    """Yield syncable paths without descending into ignored/temp directories."""
+    if not root.exists():
+        return
+    for dirpath, dirnames, filenames in os.walk(root):
+        dir_path = Path(dirpath)
+        try:
+            dir_rel_parts = dir_path.relative_to(root).parts
+        except ValueError:
+            dir_rel_parts = ()
+        if dir_rel_parts == (".",):
+            dir_rel_parts = ()
+        dirnames[:] = sorted(
+            name for name in dirnames
+            if not _should_skip_sync_path(dir_rel_parts + (name,))
+        )
+        for dirname in dirnames:
+            yield dir_path / dirname
+        for filename in sorted(filenames):
+            if _should_skip_sync_path(dir_rel_parts + (filename,)):
+                continue
+            yield dir_path / filename
+def _iter_sync_files(root: Path):
+    for path in _iter_sync_tree(root):
+        if path.is_file():
+            yield path
+def cleanup_internal_temp_paths() -> None:
+    """Remove stale sync/restore scratch dirs that must never be backed up."""
+    for root in (OPENCLAW_HOME, STATE_DIR):
+        if not root.exists() or not root.is_dir():
+            continue
+        try:
+            children = list(root.iterdir())
+        except OSError:
+            continue
+        for child in children:
+            if not _is_internal_temp_name(child.name):
+                continue
+            try:
+                _remove_path(child)
+                print(f"Removed stale sync scratch path: {child}")
+            except OSError as exc:
+                print(f"Warning: could not remove stale sync scratch path {child}: {exc}")
 def _copy_hot_directory_snapshot(source_path: Path, tmp_path: Path) -> bool:
     """Best-effort mirror of a hot directory into *tmp_path*.
 def snapshot_state_into_workspace() -> bool:
     had_copy_failures = False
     try:
+        cleanup_internal_temp_paths()
         STATE_DIR.mkdir(parents=True, exist_ok=True)
         # Atomic snapshot: copy to a staging dir first, then rename.
         # This prevents a half-written (or empty) backup if we crash mid-copy,
         skipped_entries: list[tuple[str, Exception]] = []
         copied_entry_names: set[str] = set()
         for source_path in OPENCLAW_HOME.iterdir():
+            if _should_skip_state_entry_name(source_path.name):
                 continue
             backup_path = staging_dir / source_path.name
         # dataset on the next sync.  Only remove an entry from staging when the
         # source has genuinely been deleted from OPENCLAW_HOME.
         for staged_path in list(staging_dir.iterdir()):
+            if _should_skip_state_entry_name(staged_path.name):
+                if staged_path.exists() or staged_path.is_symlink():
+                    _remove_path(staged_path)
                 continue
             if staged_path.name in copied_entry_names:
                 continue
     protected: set[str] = set()
     if not root.exists():
         return protected
+    for path in _iter_sync_files(root):
         rel = path.relative_to(root).as_posix()
         try:
             if path.stat().st_size > MAX_FILE_SIZE_BYTES:
                 protected.add(rel)
 def restore_embedded_state() -> None:
+    cleanup_internal_temp_paths()
     state_backup_root = STATE_DIR / "openclaw"
     # Migration fix: old backups stored state in ".huggingclaw-state/openclaw"
     if state_backup_root.is_dir():
         for source_path in state_backup_root.iterdir():
             name = source_path.name
+            if _should_skip_state_entry_name(name):
                 if source_path.is_dir():
                     shutil.rmtree(source_path, ignore_errors=True)
                 else:
 def _should_exclude(rel_posix: str, path: Path) -> bool:
     parts = Path(rel_posix).parts
+    if _should_skip_sync_path(parts):
         return True
     if path.is_file():
         try:
     return False
+def file_marker(path: Path) -> FileMarker:
     try:
         stat = path.stat()
     except OSError:
+        return (0, 0, 0, 0, "")
     if not path.is_file():
+        return (0, 0, 0, 0, "")
+    digest = ""
+    try:
+        hasher = hashlib.sha256()
+        with path.open("rb") as handle:
+            for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+                hasher.update(chunk)
+        digest = hasher.hexdigest()
+    except OSError:
+        # If the file is being rewritten, include ctime/mtime/size and let the
+        # next watch loop re-read it.  Never pretend the marker is unchanged.
+        digest = f"unreadable:{time.monotonic_ns()}"
+    return (1, int(stat.st_size), int(stat.st_mtime_ns), int(stat.st_ctime_ns), digest)
 def metadata_marker(root: Path) -> WorkspaceMarker:
     total_size = 0
     newest_mtime = 0
     metadata_hasher = hashlib.sha256()
+    for path in _iter_sync_files(root):
         rel = path.relative_to(root).as_posix()
         # BUG FIX: use directory-only exclusion here (not size-based) so that
         # deleting a large file changes the marker and triggers a sync/prune
         # pass.  Large files are still excluded from the upload snapshot via
         # _should_exclude; this only controls change-detection.
         try:
             stat = path.stat()
         except OSError:
         file_count += 1
         size = int(stat.st_size)
         mtime_ns = int(stat.st_mtime_ns)
+        ctime_ns = int(stat.st_ctime_ns)
         total_size += size
+        newest_mtime = max(newest_mtime, mtime_ns, ctime_ns)
         metadata_hasher.update(rel.encode("utf-8"))
         metadata_hasher.update(b"\0")
         metadata_hasher.update(str(size).encode("ascii"))
         metadata_hasher.update(b"\0")
         metadata_hasher.update(str(mtime_ns).encode("ascii"))
         metadata_hasher.update(b"\0")
+        metadata_hasher.update(str(ctime_ns).encode("ascii"))
+        metadata_hasher.update(b"\0")
     return (file_count, total_size, newest_mtime, metadata_hasher.hexdigest())
     if not root.exists():
         return hasher.hexdigest()
+    for path in _iter_sync_files(root):
         rel = path.relative_to(root).as_posix()
         # BUG FIX: use directory-only exclusion (not size-based) so that
         # creating or deleting a large file changes the fingerprint.
         # Large files are hashed by path + metadata only (no content read)
         # to avoid reading gigabytes for a change-detection hash.
         hasher.update(rel.encode("utf-8"))
         try:
             stat = path.stat()
 def create_snapshot_dir(source_root: Path) -> Path:
     staging_root = Path(tempfile.mkdtemp(prefix="huggingclaw-sync-"))
+    for path in _iter_sync_tree(source_root):
         rel = path.relative_to(source_root)
         rel_posix = rel.as_posix()
         if _should_exclude(rel_posix, path):
     return staging_root
+def _run_with_timeout(seconds: float, label: str, func):
+    """Run *func* with a SIGALRM watchdog so hung HF calls release the sync lock."""
+    if seconds <= 0 or not hasattr(signal, "SIGALRM"):
+        return func()
+    previous_handler = signal.getsignal(signal.SIGALRM)
+    try:
+        previous_timer = signal.setitimer(signal.ITIMER_REAL, 0)
+    except Exception:
+        previous_timer = (0.0, 0.0)
+    def _timeout_handler(_sig, _frame):
+        raise SyncUploadTimeoutError(f"{label} timed out after {seconds:.0f}s")
+    signal.signal(signal.SIGALRM, _timeout_handler)
+    signal.setitimer(signal.ITIMER_REAL, seconds)
+    try:
+        return func()
+    finally:
+        signal.setitimer(signal.ITIMER_REAL, 0)
+        signal.signal(signal.SIGALRM, previous_handler)
+        if previous_timer and previous_timer[0] > 0:
+            signal.setitimer(signal.ITIMER_REAL, previous_timer[0], previous_timer[1])
+def upload_snapshot(repo_id: str, snapshot_dir: Path) -> None:
+    strategy = SYNC_UPLOAD_STRATEGY.replace("-", "_")
+    timeout_label = "no timeout" if SYNC_UPLOAD_TIMEOUT <= 0 else f"{SYNC_UPLOAD_TIMEOUT:.0f}s timeout"
+    if strategy not in {"folder", "upload_folder", "large_folder"}:
+        print(f"Warning: unknown SYNC_UPLOAD_STRATEGY={SYNC_UPLOAD_STRATEGY!r}; using upload_folder.")
+        strategy = "folder"
+    def _upload() -> None:
+        if strategy == "large_folder":
+            try:
+                HF_API.upload_large_folder(
+                    repo_id=repo_id,
+                    repo_type="dataset",
+                    folder_path=str(snapshot_dir),
+                    num_workers=2,
+                    print_report=False,
+                )
+                return
+            except AttributeError:
+                print("Warning: upload_large_folder unavailable; falling back to upload_folder.")
+        upload_folder(
+            folder_path=str(snapshot_dir),
+            repo_id=repo_id,
+            repo_type="dataset",
+            token=HF_TOKEN,
+            commit_message=f"HuggingClaw sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
+        )
+    print(f"Workspace upload: strategy={strategy}, {timeout_label}")
+    _run_with_timeout(SYNC_UPLOAD_TIMEOUT, "Workspace upload", _upload)
 def prune_remote_deleted_files(
     repo_id: str,
     snapshot_dir: Path,
         and path not in protected_paths
         and not any(path == prefix or path.startswith(prefix + "/") for prefix in skip_prefixes)
     ]
+    _commit_delete_batches(repo_id, stale_files, "Prune", "stale file(s) after workspace sync")
+def _commit_delete_batches(repo_id: str, paths: list[str], action: str, reason: str) -> None:
+    if not paths:
         return
+    total = len(paths)
     num_batches = (total + PRUNE_BATCH_SIZE - 1) // PRUNE_BATCH_SIZE
     for batch_idx in range(num_batches):
+        batch = paths[batch_idx * PRUNE_BATCH_SIZE:(batch_idx + 1) * PRUNE_BATCH_SIZE]
         batch_label = f"{batch_idx + 1}/{num_batches}" if num_batches > 1 else ""
+        msg = f"{action} {len(batch)} {reason}"
         if batch_label:
             msg += f" (batch {batch_label})"
         operations = [CommitOperationDelete(path_in_repo=path) for path in batch]
             commit_message=msg,
         )
         if num_batches > 1:
+            print(f"{action} batch {batch_label}: {len(batch)} file(s)")
+def prune_remote_internal_temp_files(repo_id: str) -> None:
+    """Remove old recursive scratch files from the HF backup even if nothing changed locally."""
+    if HF_API is None:
+        return
+    remote_files = list(HF_API.list_repo_files(repo_id=repo_id, repo_type="dataset"))
+    internal_temp_files = [
+        path for path in remote_files
+        if path != ".gitattributes" and _has_internal_temp_part(path)
+    ]
+    _commit_delete_batches(
+        repo_id,
+        internal_temp_files,
+        "Prune internal sync scratch",
+        "file(s) from backup",
+    )
 def restore_workspace() -> bool:
         write_status("disabled", "HF_TOKEN is not configured.")
         return (last_fingerprint or "", last_marker or (0, 0, 0, ""))
+    global _prune_needed, _remote_temp_prune_done
     had_snapshot_copy_failures = snapshot_state_into_workspace()
     repo_id = ensure_repo_exists()
+    if not _remote_temp_prune_done:
+        try:
+            prune_remote_internal_temp_files(repo_id)
+            _remote_temp_prune_done = True
+        except Exception as temp_prune_exc:
+            # Do not block normal user-data sync if the cleanup commit fails;
+            # leave the flag false so the next pass retries before taking the
+            # metadata/fingerprint early exit again.
+            print(f"Warning: could not prune internal sync scratch files: {temp_prune_exc}")
+            _prune_needed = True
     current_marker = metadata_marker(WORKSPACE)
     # Session watcher uses content digests and can detect same-size rewrites
     # whose workspace metadata marker is unchanged.  In that case, force the
         write_status("synced", "No workspace changes detected.")
         return (last_fingerprint, current_marker)
+    upload_timeout_label = "no timeout" if SYNC_UPLOAD_TIMEOUT <= 0 else f"timeout {SYNC_UPLOAD_TIMEOUT:.0f}s"
+    write_status("syncing", f"Uploading workspace to {repo_id} ({SYNC_UPLOAD_STRATEGY}, {upload_timeout_label})")
     snapshot_dir = create_snapshot_dir(WORKSPACE)
     try:
+        upload_snapshot(repo_id, snapshot_dir)
         skip_prune_prefixes: set[str] = set()
         if had_snapshot_copy_failures:
             # BUG FIX: the old code added "huggingclaw-state/openclaw" to
 ) -> tuple[str, WorkspaceMarker]:
     SYNC_LOCK_FILE.parent.mkdir(parents=True, exist_ok=True)
     with SYNC_LOCK_FILE.open("w", encoding="utf-8") as lock_handle:
+        deadline = time.monotonic() + SYNC_LOCK_TIMEOUT
+        while True:
+            try:
+                fcntl.flock(lock_handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
+                break
+            except BlockingIOError:
+                if time.monotonic() >= deadline:
+                    raise TimeoutError(
+                        f"Timed out waiting {SYNC_LOCK_TIMEOUT:.0f}s for workspace sync lock."
+                    )
+                if STOP_EVENT.wait(0.25):
+                    raise TimeoutError("Stopped while waiting for workspace sync lock.")
         try:
             return _sync_once_unlocked(
                 last_fingerprint,
     newest_mtime = 0
     metadata_hasher = hashlib.sha256()
     for profile_dir in sorted(SESSIONS_ROOT.iterdir()):
         if not profile_dir.is_dir():
             continue
             size = int(stat.st_size)
             mtime_ns = int(stat.st_mtime_ns)
             ctime_ns = int(stat.st_ctime_ns)
             digest.update(rel.encode("utf-8"))
             digest.update(b"\0")
             digest.update(str(size).encode("ascii"))
             digest.update(b"\0")
             digest.update(str(ctime_ns).encode("ascii"))
             digest.update(b"\0")
+            # Sessions are the most important live data.  Hash every scan
+            # instead of trusting size/mtime/ctime caches so same-size rewrites
+            # and very small edits are never missed by the sessions trigger.
+            file_hasher = hashlib.sha256()
+            try:
+                with path.open("rb") as handle:
+                    for chunk in iter(lambda: handle.read(1024 * 1024), b""):
+                        file_hasher.update(chunk)
+            except (FileNotFoundError, IsADirectoryError, NotADirectoryError):
+                continue
+            file_digest = file_hasher.hexdigest()
             digest.update(file_digest.encode("ascii"))
             digest.update(b"\0")
         metadata_hasher.update(digest.hexdigest().encode("ascii"))
         metadata_hasher.update(b"\0")
     return (file_count, total_size, newest_mtime, metadata_hasher.hexdigest())
+def wait_for_config_settle(config_marker: FileMarker) -> tuple[str, FileMarker]:
     stable_since = time.monotonic()
     current_marker = config_marker
 def wait_for_sync_trigger(
+    config_marker: FileMarker,
     last_sessions_sync_time: float = 0.0,
+) -> tuple[str, FileMarker]:
     deadline = time.monotonic() + max(0, INTERVAL)
     # BUG FIX: also watch sessions directory so new/updated sessions
     # trigger an immediate sync instead of waiting the full interval.

start.sh CHANGED Viewed

@@ -357,12 +357,12 @@ case "$LLM_PROVIDER" in
   mistral-*|codestral-*|devstral-*|voxtral-*)
     export MISTRAL_API_KEY="$LLM_API_KEY"
     echo "Note: bare Mistral model '$LLM_MODEL' detected; mapped LLM_API_KEY → MISTRAL_API_KEY. Use 'mistral/${LLM_MODEL}' prefix to be explicit." ;;
-  moonshotai|meta-llama|deepseek-ai|MiniMaxAI|minimax-ai|Qwen|zai-org|mistralai|google)
     echo "Warning: LLM_MODEL='$LLM_MODEL' uses sub-provider prefix '$LLM_PROVIDER'. This is a router-namespaced model (Together/OpenRouter). Mapping LLM_API_KEY → TOGETHER_API_KEY. If using OpenRouter, also set OPENROUTER_API_KEY as a separate secret."
     export TOGETHER_API_KEY="${TOGETHER_API_KEY:-$LLM_API_KEY}" ;;
   # ── Fallback: Anthropic (default) ──
   *)
-    echo "Warning: Unknown provider prefix '$LLM_PROVIDER' in LLM_MODEL='$LLM_MODEL'. Defaulting to ANTHROPIC_API_KEY. If using a router-namespaced model (e.g. moonshotai/Kimi-K2.6), set TOGETHER_API_KEY or OPENROUTER_API_KEY as a separate secret."
     export ANTHROPIC_API_KEY="$LLM_API_KEY"
     ;;
 esac
@@ -657,7 +657,7 @@ _DEFAULT_XAI_MODELS="xai/grok-4.20,xai/grok-4.3,xai/grok-4.1,xai/grok-latest,xai
 _DEFAULT_COHERE_MODELS="cohere/command-a,cohere/command-a-03-2025,cohere/command-a-reasoning-08-2025,cohere/command-r-plus-08-2024"
 _DEFAULT_TOGETHER_MODELS="together/moonshotai/Kimi-K2.6,together/deepseek-ai/DeepSeek-V4-Pro,together/Qwen/Qwen3-235B-A22B-Instruct-2507-tput,together/meta-llama/Llama-3.3-70B-Instruct-Turbo"
 _DEFAULT_CEREBRAS_MODELS="cerebras/zai-glm-4.7,cerebras/gpt-oss-120b,cerebras/deepseek-r1,cerebras/qwen3-32b"
-_DEFAULT_NVIDIA_MODELS="nvidia/nemotron-3-super-120b-a12b,nvidia/nemotron-4-340b-instruct,nvidia/llama-3.1-nemotron-70b-instruct"
 _DEFAULT_KILOCODE_MODELS="kilocode/kilo/auto,kilocode/anthropic/claude-opus-4.7,kilocode/openai/gpt-5.4,kilocode/google/gemini-2.5-pro"
 _DEFAULT_MOONSHOT_MODELS="moonshot/kimi-k2.6,moonshot/kimi-k2.6-thinking,moonshot/kimi-k2-thinking"
 _DEFAULT_MINIMAX_MODELS="minimax/MiniMax-M2.7,minimax/minimax-m1.5"
@@ -725,26 +725,40 @@ inject_provider_models_from_env() {
     | awk 'NF' \
     | jq -R . \
     | jq -s --arg provider "$provider" '
-        map(
-          if contains("/") then
-            # Fix cross-prefix: strip only the first "/" segment (the foreign provider
-            # prefix) and reapply the correct one. Using `last` was a bug for 3-part IDs:
-            # e.g. "openai/gpt-oss-120b" injected into "groq" would become
-            #      "groq/gpt-oss-120b" instead of "groq/openai/gpt-oss-120b".
-            # Examples:
-            #   "google/gemini-2.5-pro"        → "google-vertex/gemini-2.5-pro"
-            #   "openai/gpt-oss-120b"          → "groq/openai/gpt-oss-120b"
-            #   "moonshotai/kimi-k2.6"         → "nvidia/moonshotai/kimi-k2.6"
-            if startswith($provider + "/") then .
-            else ($provider + "/" + (split("/") | .[1:] | join("/")))
             end
           else
-            ($provider + "/" + .)
-          end
-        )
         | map({id: ., name: .})
         | unique_by(.id)')
   # Build provider patch: always inject models; conditionally inject apiKey, baseUrl, api.
   # Existing saved config wins on merge (see config-patch jq below), so this only fills
   # in missing fields — it never overwrites what the user already configured manually.
@@ -1414,32 +1428,63 @@ if [ -n "${WEBHOOK_URL:-}" ]; then
 fi
 # ── Trap SIGTERM for graceful shutdown ──
 graceful_shutdown() {
   echo "Shutting down..."
   if [ -f "/home/node/app/openclaw-sync.py" ] && [ -n "${HF_TOKEN:-}" ]; then
     echo "Saving state before exit..."
-    # BUG FIX: kill the background sync loop *before* running the shutdown
-    # syncs.  The loop holds the sync lock while uploading; if it is
-    # mid-upload when sync-once-settled runs, the 8-second timeout fires
-    # before the lock is released and the settled-sync is silently skipped.
-    # Killing the loop first releases the lock immediately (fcntl.flock is
-    # released on process exit) so sync-once-settled can acquire it cleanly.
-    if [ -n "${SYNC_LOOP_PID:-}" ]; then
-      kill "$SYNC_LOOP_PID" 2>/dev/null || true
-      # Give Python a moment to flush and release the lock file.
-      # Reduced from 0.5 s → 0.3 s to reclaim time for the upload.
-      sleep 0.3
-    fi
     # Pass 1: wait for config to settle then upload — avoids pushing a
-    # half-written JSON config to the dataset.  Timeout raised from 8 s
-    # to 15 s so the 3-second settle window + actual upload both fit
-    # within the budget (old 8 s left only 5 s for the upload itself).
-    timeout 15s python3 /home/node/app/openclaw-sync.py sync-once-settled || \
       echo "Warning: could not complete settled shutdown sync"
     # Pass 2: catch any writes that arrived after the settled sync completed.
     # BUG FIX: added timeout (previously unbounded) so HF container kill
     # can't interrupt a hung upload and lose all data silently.
-    timeout 8s python3 /home/node/app/openclaw-sync.py sync-once || \
       echo "Warning: could not complete final shutdown sync"
   elif [ -f "/home/node/app/openclaw-sync.py" ]; then
     echo "HF_TOKEN not set; skipping shutdown backup sync."
@@ -2439,27 +2484,21 @@ sync_before_gateway_restart() {
   echo "Gateway stopped; saving latest OpenClaw state before restart..."
   # Kill the background sync loop before syncing — same reason as in
-  # graceful_shutdown: the loop holds the fcntl.flock while uploading;
-  # if it is mid-upload when sync-once-settled runs, the lock contention
-  # will eat into (or exhaust) the 15s timeout budget, silently skipping
-  # the upload.  Killing the loop releases the lock immediately (fcntl.flock
-  # is released on process exit) so sync-once-settled can acquire it cleanly.
-  if [ -n "${SYNC_LOOP_PID:-}" ]; then
-    kill "$SYNC_LOOP_PID" 2>/dev/null || true
-    sleep 0.3
-    SYNC_LOOP_PID=""
-  fi
   # Pass 1: wait for config to settle then upload — avoids pushing a
   # half-written JSON config to the dataset.  Timeout added (was unbounded)
   # so a slow HF upload cannot stall gateway restarts indefinitely.
-  timeout 15s python3 /home/node/app/openclaw-sync.py sync-once-settled || \
     echo "Warning: could not sync settled state before gateway restart"
   # Pass 2: catch any writes that arrived after the settled sync completed
   # (e.g. session state flushed by OpenClaw just before exit).
   # BUG FIX: this second pass was missing from the restart path (it existed
   # only in graceful_shutdown), so last-second writes were lost on watchdog
   # restarts — causing sessions to disappear after OpenClaw auto-restarts.
-  timeout 8s python3 /home/node/app/openclaw-sync.py sync-once || \
     echo "Warning: could not complete final sync before gateway restart"
 }

   mistral-*|codestral-*|devstral-*|voxtral-*)
     export MISTRAL_API_KEY="$LLM_API_KEY"
     echo "Note: bare Mistral model '$LLM_MODEL' detected; mapped LLM_API_KEY → MISTRAL_API_KEY. Use 'mistral/${LLM_MODEL}' prefix to be explicit." ;;
+  moonshotai|meta-llama|deepseek-ai|minimaxai|MiniMaxAI|minimax-ai|Qwen|zai-org|mistralai|google)
     echo "Warning: LLM_MODEL='$LLM_MODEL' uses sub-provider prefix '$LLM_PROVIDER'. This is a router-namespaced model (Together/OpenRouter). Mapping LLM_API_KEY → TOGETHER_API_KEY. If using OpenRouter, also set OPENROUTER_API_KEY as a separate secret."
     export TOGETHER_API_KEY="${TOGETHER_API_KEY:-$LLM_API_KEY}" ;;
   # ── Fallback: Anthropic (default) ──
   *)
+    echo "Warning: Unknown provider prefix '$LLM_PROVIDER' in LLM_MODEL='$LLM_MODEL'. Defaulting to ANTHROPIC_API_KEY. If using a router-namespaced model (e.g. moonshotai/kimi-k2.6), set TOGETHER_API_KEY or OPENROUTER_API_KEY as a separate secret."
     export ANTHROPIC_API_KEY="$LLM_API_KEY"
     ;;
 esac
 _DEFAULT_COHERE_MODELS="cohere/command-a,cohere/command-a-03-2025,cohere/command-a-reasoning-08-2025,cohere/command-r-plus-08-2024"
 _DEFAULT_TOGETHER_MODELS="together/moonshotai/Kimi-K2.6,together/deepseek-ai/DeepSeek-V4-Pro,together/Qwen/Qwen3-235B-A22B-Instruct-2507-tput,together/meta-llama/Llama-3.3-70B-Instruct-Turbo"
 _DEFAULT_CEREBRAS_MODELS="cerebras/zai-glm-4.7,cerebras/gpt-oss-120b,cerebras/deepseek-r1,cerebras/qwen3-32b"
+_DEFAULT_NVIDIA_MODELS="nvidia/nemotron-3-super-120b-a12b,deepseek-ai/deepseek-v4-flash,qwen/qwen3-coder-480b-a35b-instruct,moonshotai/kimi-k2.6,minimaxai/minimax-m2.7,openai/gpt-oss-120b,z-ai/glm5.1,stepfun-ai/step-3.7-flash"
 _DEFAULT_KILOCODE_MODELS="kilocode/kilo/auto,kilocode/anthropic/claude-opus-4.7,kilocode/openai/gpt-5.4,kilocode/google/gemini-2.5-pro"
 _DEFAULT_MOONSHOT_MODELS="moonshot/kimi-k2.6,moonshot/kimi-k2.6-thinking,moonshot/kimi-k2-thinking"
 _DEFAULT_MINIMAX_MODELS="minimax/MiniMax-M2.7,minimax/minimax-m1.5"
     | awk 'NF' \
     | jq -R . \
     | jq -s --arg provider "$provider" '
+        def strip_outer_provider:
+          (split("/") | .[1:] | join("/"));
+        def normalize_provider_model:
+          # OpenClaw stores provider-local model ids inside
+          # models.providers.<provider>.models[].id. The user-facing model ref
+          # is composed later as <provider>/<id>. Most providers use ids like
+          # "gpt-5.4", so "openai/gpt-5.4" becomes "gpt-5.4" here.
+          # NVIDIA is special because some valid NVIDIA API ids themselves
+          # start with "nvidia/" (for example nvidia/nemotron-...). Keep those
+          # single-prefix ids intact, but strip an outer OpenClaw provider prefix
+          # from full refs like "nvidia/deepseek-ai/..." or legacy
+          # "nvidia/nvidia/nemotron-..." so the provider sends the exact NVIDIA
+          # documented model id to integrate.api.nvidia.com.
+          if $provider == "nvidia" then
+            if startswith("nvidia/nvidia/")
+               or startswith("nvidia/deepseek-ai/")
+               or startswith("nvidia/qwen/")
+               or startswith("nvidia/moonshotai/")
+               or startswith("nvidia/minimaxai/")
+               or startswith("nvidia/openai/")
+               or startswith("nvidia/z-ai/")
+               or startswith("nvidia/stepfun-ai/") then
+              strip_outer_provider
+            else
+              .
             end
+          elif startswith($provider + "/") then
+            strip_outer_provider
           else
+            .
+          end;
+        map(normalize_provider_model)
         | map({id: ., name: .})
         | unique_by(.id)')
   # Build provider patch: always inject models; conditionally inject apiKey, baseUrl, api.
   # Existing saved config wins on merge (see config-patch jq below), so this only fills
   # in missing fields — it never overwrites what the user already configured manually.
 fi
 # ── Trap SIGTERM for graceful shutdown ──
+stop_background_sync_loop() {
+  # Stop the background sync loop whose PID is in SYNC_LOOP_PID, then clear
+  # the variable.  Sends SIGTERM, polls up to 10 times at 0.2 s intervals for
+  # the process to exit, and falls back to SIGKILL if it is still alive.
+  if [ -z "${SYNC_LOOP_PID:-}" ]; then
+    return 0
+  fi
+  # Already gone: just forget the PID.
+  kill -0 "$SYNC_LOOP_PID" 2>/dev/null || { SYNC_LOOP_PID=""; return 0; }
+  kill "$SYNC_LOOP_PID" 2>/dev/null || true
+  # Poll until the Python process has really exited so its fcntl lock is
+  # released.  A fixed short sleep was not enough when the process was inside
+  # a HF upload; the following one-shot syncs then spent their timeout waiting
+  # on the lock and left sessions/config changes unsaved.
+  _sync_stop_i=0
+  while [ "$_sync_stop_i" -lt 10 ]; do
+    if ! kill -0 "$SYNC_LOOP_PID" 2>/dev/null; then
+      SYNC_LOOP_PID=""
+      unset _sync_stop_i
+      return 0
+    fi
+    sleep 0.2
+    _sync_stop_i=$((_sync_stop_i + 1))
+  done
+  echo "Warning: workspace sync loop did not stop after SIGTERM; forcing it to release the sync lock."
+  kill -9 "$SYNC_LOOP_PID" 2>/dev/null || true
+  sleep 0.2
+  SYNC_LOOP_PID=""
+  unset _sync_stop_i
+}
+run_openclaw_sync_with_timeout() {
+  local timeout_seconds="$1"
+  local command_name="$2"
+  local lock_timeout_seconds="$3"
+  if [ "${timeout_seconds}" = "0" ]; then
+    env SYNC_LOCK_TIMEOUT="$lock_timeout_seconds" \
+      python3 /home/node/app/openclaw-sync.py "$command_name"
+  else
+    timeout --kill-after=10s "${timeout_seconds}s" env SYNC_LOCK_TIMEOUT="$lock_timeout_seconds" \
+      python3 /home/node/app/openclaw-sync.py "$command_name"
+  fi
+}
 graceful_shutdown() {
   echo "Shutting down..."
   if [ -f "/home/node/app/openclaw-sync.py" ] && [ -n "${HF_TOKEN:-}" ]; then
     echo "Saving state before exit..."
+    stop_background_sync_loop
     # Pass 1: wait for config to settle then upload — avoids pushing a
+    # half-written JSON config to the dataset.  The per-command lock wait is
+    # intentionally shorter than the outer timeout so a stuck lock fails clearly
+    # and leaves time for the final catch-up pass.
+    run_openclaw_sync_with_timeout "${SYNC_SETTLED_TIMEOUT:-120}" sync-once-settled "${SYNC_ONE_SHOT_LOCK_TIMEOUT:-5}" || \
       echo "Warning: could not complete settled shutdown sync"
     # Pass 2: catch any writes that arrived after the settled sync completed.
     # BUG FIX: added timeout (previously unbounded) so HF container kill
     # can't interrupt a hung upload and lose all data silently.
+    run_openclaw_sync_with_timeout "${SYNC_FINAL_TIMEOUT:-120}" sync-once "${SYNC_ONE_SHOT_LOCK_TIMEOUT:-5}" || \
       echo "Warning: could not complete final shutdown sync"
   elif [ -f "/home/node/app/openclaw-sync.py" ]; then
     echo "HF_TOKEN not set; skipping shutdown backup sync."
   echo "Gateway stopped; saving latest OpenClaw state before restart..."
   # Kill the background sync loop before syncing — same reason as in
+  # graceful_shutdown: the loop holds the fcntl.flock while uploading.  Wait for
+  # it to exit (and force-kill as a last resort) before the one-shot syncs so
+  # gateway restarts cannot hang behind a stale lock.
+  stop_background_sync_loop
   # Pass 1: wait for config to settle then upload — avoids pushing a
   # half-written JSON config to the dataset.  Timeout added (was unbounded)
   # so a slow HF upload cannot stall gateway restarts indefinitely.
+  run_openclaw_sync_with_timeout "${SYNC_SETTLED_TIMEOUT:-120}" sync-once-settled "${SYNC_ONE_SHOT_LOCK_TIMEOUT:-5}" || \
     echo "Warning: could not sync settled state before gateway restart"
   # Pass 2: catch any writes that arrived after the settled sync completed
   # (e.g. session state flushed by OpenClaw just before exit).
   # BUG FIX: this second pass was missing from the restart path (it existed
   # only in graceful_shutdown), so last-second writes were lost on watchdog
   # restarts — causing sessions to disappear after OpenClaw auto-restarts.
+  run_openclaw_sync_with_timeout "${SYNC_FINAL_TIMEOUT:-120}" sync-once "${SYNC_ONE_SHOT_LOCK_TIMEOUT:-5}" || \
     echo "Warning: could not complete final sync before gateway restart"
 }