Spaces:
Running
Running
Anurag committed on
Commit ·
ce96c5b
1
Parent(s): e1afa1b
Refresh verified NVIDIA model catalogs
Browse files
- .env.example +34 -2
- README.md +6 -0
- env-builder.js +14 -4
- jupyter-devdata-sync.py +46 -5
- openclaw-sync.py +275 -77
- start.sh +87 -48
.env.example
CHANGED
|
@@ -105,6 +105,10 @@ LLM_API_KEY=your_api_key_here
|
|
| 105 |
#
|
| 106 |
# NVIDIA (NVIDIA_API_KEY):
|
| 107 |
# - nvidia/nemotron-3-super-120b-a12b
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
#
|
| 109 |
# Groq (GROQ_API_KEY):
|
| 110 |
# - groq/mixtral-8x7b-32768
|
|
@@ -272,8 +276,10 @@ LLM_API_KEY_FALLBACK_ENABLED=true
|
|
| 272 |
#
|
| 273 |
# Optional: explicitly pin model lists per provider for Control UI visibility
|
| 274 |
# when provider keys are configured.
|
| 275 |
-
# Format: comma-separated model IDs
|
| 276 |
-
#
|
|
|
|
|
|
|
| 277 |
# OPENAI_MODELS=gpt-4o-mini,gpt-4.1
|
| 278 |
# GROQ_MODELS=llama-3.3-70b-versatile,deepseek-r1-distill-llama-70b
|
| 279 |
# independently and in parallel.
|
|
@@ -395,6 +401,32 @@ OPENCLAW_CONFIG_SETTLE_SECONDS=3
|
|
| 395 |
# Minimum gap between sessions-triggered immediate syncs (seconds). Default: 30.
|
| 396 |
SESSIONS_MIN_SYNC_GAP=30
|
| 397 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
# Webhooks: Standard POST notifications for lifecycle events
|
| 399 |
# WEBHOOK_URL=https://your-webhook-endpoint.com/log
|
| 400 |
|
|
|
|
| 105 |
#
|
| 106 |
# NVIDIA (NVIDIA_API_KEY):
|
| 107 |
# - nvidia/nemotron-3-super-120b-a12b
|
| 108 |
+
# - nvidia/deepseek-ai/deepseek-v4-flash
|
| 109 |
+
# - nvidia/qwen/qwen3-coder-480b-a35b-instruct
|
| 110 |
+
# - nvidia/moonshotai/kimi-k2.6
|
| 111 |
+
# - nvidia/stepfun-ai/step-3.7-flash
|
| 112 |
#
|
| 113 |
# Groq (GROQ_API_KEY):
|
| 114 |
# - groq/mixtral-8x7b-32768
|
|
|
|
| 276 |
#
|
| 277 |
# Optional: explicitly pin model lists per provider for Control UI visibility
|
| 278 |
# when provider keys are configured.
|
| 279 |
+
# Format: comma-separated provider-local model IDs. For NVIDIA, keep NVIDIA-owned
|
| 280 |
+
# API IDs as nvidia/... and third-party hosted IDs as their API owner/id; do not
|
| 281 |
+
# add an extra outer OpenClaw provider prefix in NVIDIA_MODELS.
|
| 282 |
+
# NVIDIA_MODELS=nvidia/nemotron-3-super-120b-a12b,deepseek-ai/deepseek-v4-flash,qwen/qwen3-coder-480b-a35b-instruct,moonshotai/kimi-k2.6,stepfun-ai/step-3.7-flash
|
| 283 |
# OPENAI_MODELS=gpt-4o-mini,gpt-4.1
|
| 284 |
# GROQ_MODELS=llama-3.3-70b-versatile,deepseek-r1-distill-llama-70b
|
| 285 |
# independently and in parallel.
|
|
|
|
| 401 |
# Minimum gap between sessions-triggered immediate syncs (seconds). Default: 30.
|
| 402 |
SESSIONS_MIN_SYNC_GAP=30
|
| 403 |
|
| 404 |
+
# Max seconds a one-off sync waits for another sync process to release its lock.
|
| 405 |
+
# Startup background sync still keeps retrying; this mainly prevents shutdown or
|
| 406 |
+
# gateway-restart saves from hanging forever behind a stuck upload. Default: 20.
|
| 407 |
+
SYNC_LOCK_TIMEOUT=20
|
| 408 |
+
|
| 409 |
+
# Max seconds one HF upload call may run before the sync process marks it failed,
|
| 410 |
+
# releases the lock, and retries on the next pass. This prevents Backup from
|
| 411 |
+
# staying stuck at SYNCING forever. Set to 0 to disable this watchdog. Default: 180.
|
| 412 |
+
SYNC_UPLOAD_TIMEOUT=180
|
| 413 |
+
|
| 414 |
+
# Upload implementation. Default `folder` uses a normal commit upload and is
|
| 415 |
+
# safest for HuggingClaw's small/medium snapshots. Set `large_folder` only if you
|
| 416 |
+
# explicitly need Hugging Face's resumable large-folder uploader.
|
| 417 |
+
SYNC_UPLOAD_STRATEGY=folder
|
| 418 |
+
|
| 419 |
+
# Shutdown/restart one-shot sync upload budgets. These are intentionally much
|
| 420 |
+
# larger than SYNC_LOCK_TIMEOUT so 40-50 MB files are not killed mid-upload just
|
| 421 |
+
# because another sync lock check needed to be bounded. Set to 0 to disable the
|
| 422 |
+
# outer command timeout completely. Defaults: 120 seconds each.
|
| 423 |
+
SYNC_SETTLED_TIMEOUT=120
|
| 424 |
+
SYNC_FINAL_TIMEOUT=120
|
| 425 |
+
|
| 426 |
+
# Lock wait used only by shutdown/restart one-shot syncs. Keep this short
|
| 427 |
+
# because start.sh stops the background sync loop before these commands run.
|
| 428 |
+
SYNC_ONE_SHOT_LOCK_TIMEOUT=5
|
| 429 |
+
|
| 430 |
# Webhooks: Standard POST notifications for lifecycle events
|
| 431 |
# WEBHOOK_URL=https://your-webhook-endpoint.com/log
|
| 432 |
|
README.md
CHANGED
|
@@ -188,6 +188,12 @@ HuggingClaw automatically syncs your workspace (chats, settings, sessions) to a
|
|
| 188 |
| `OPENCLAW_CONFIG_WATCH_INTERVAL` | `1` | How often to check `openclaw.json` for immediate settings sync |
|
| 189 |
| `OPENCLAW_CONFIG_SETTLE_SECONDS` | `3` | How long `openclaw.json` must stay valid and unchanged before syncing |
|
| 190 |
| `SESSIONS_MIN_SYNC_GAP` | `30` | Minimum seconds between session-triggered immediate syncs |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
## 📦 Ephemeral Package Re-install *(Optional)*
|
| 193 |
|
|
|
|
| 188 |
| `OPENCLAW_CONFIG_WATCH_INTERVAL` | `1` | How often to check `openclaw.json` for immediate settings sync |
|
| 189 |
| `OPENCLAW_CONFIG_SETTLE_SECONDS` | `3` | How long `openclaw.json` must stay valid and unchanged before syncing |
|
| 190 |
| `SESSIONS_MIN_SYNC_GAP` | `30` | Minimum seconds between session-triggered immediate syncs |
|
| 191 |
+
| `SYNC_LOCK_TIMEOUT` | `20` | Max seconds one-off syncs wait for another sync lock before failing clearly |
|
| 192 |
+
| `SYNC_UPLOAD_TIMEOUT` | `180` | Max seconds one HF upload call can stay active before failing and retrying next pass; set `0` to disable |
|
| 193 |
+
| `SYNC_UPLOAD_STRATEGY` | `folder` | Upload method: `folder` for normal commit uploads, `large_folder` for HF resumable large-folder uploader |
|
| 194 |
+
| `SYNC_SETTLED_TIMEOUT` | `120` | Shutdown/restart settled-sync upload budget; set `0` to disable the outer timeout |
|
| 195 |
+
| `SYNC_FINAL_TIMEOUT` | `120` | Shutdown/restart final catch-up sync upload budget; set `0` to disable the outer timeout |
|
| 196 |
+
| `SYNC_ONE_SHOT_LOCK_TIMEOUT` | `5` | Short lock wait for shutdown/restart one-shot syncs after the background loop is stopped |
|
| 197 |
|
| 198 |
## 📦 Ephemeral Package Re-install *(Optional)*
|
| 199 |
|
env-builder.js
CHANGED
|
@@ -141,8 +141,13 @@ const MODEL_CATALOGS = {
|
|
| 141 |
],
|
| 142 |
"NVIDIA": [
|
| 143 |
"nvidia/nemotron-3-super-120b-a12b",
|
| 144 |
-
"nvidia/
|
| 145 |
-
"nvidia/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
],
|
| 147 |
"KiloCode": [
|
| 148 |
"kilocode/anthropic/claude-opus-4.7",
|
|
@@ -375,8 +380,13 @@ const MODEL_CATALOGS = {
|
|
| 375 |
],
|
| 376 |
"NVIDIA_MODELS": [
|
| 377 |
"nvidia/nemotron-3-super-120b-a12b",
|
| 378 |
-
"
|
| 379 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
],
|
| 381 |
"KILOCODE_MODELS": [
|
| 382 |
"kilocode/anthropic/claude-opus-4.7",
|
|
|
|
| 141 |
],
|
| 142 |
"NVIDIA": [
|
| 143 |
"nvidia/nemotron-3-super-120b-a12b",
|
| 144 |
+
"nvidia/deepseek-ai/deepseek-v4-flash",
|
| 145 |
+
"nvidia/qwen/qwen3-coder-480b-a35b-instruct",
|
| 146 |
+
"nvidia/moonshotai/kimi-k2.6",
|
| 147 |
+
"nvidia/minimaxai/minimax-m2.7",
|
| 148 |
+
"nvidia/openai/gpt-oss-120b",
|
| 149 |
+
"nvidia/z-ai/glm5.1",
|
| 150 |
+
"nvidia/stepfun-ai/step-3.7-flash"
|
| 151 |
],
|
| 152 |
"KiloCode": [
|
| 153 |
"kilocode/anthropic/claude-opus-4.7",
|
|
|
|
| 380 |
],
|
| 381 |
"NVIDIA_MODELS": [
|
| 382 |
"nvidia/nemotron-3-super-120b-a12b",
|
| 383 |
+
"deepseek-ai/deepseek-v4-flash",
|
| 384 |
+
"qwen/qwen3-coder-480b-a35b-instruct",
|
| 385 |
+
"moonshotai/kimi-k2.6",
|
| 386 |
+
"minimaxai/minimax-m2.7",
|
| 387 |
+
"openai/gpt-oss-120b",
|
| 388 |
+
"z-ai/glm5.1",
|
| 389 |
+
"stepfun-ai/step-3.7-flash"
|
| 390 |
],
|
| 391 |
"KILOCODE_MODELS": [
|
| 392 |
"kilocode/anthropic/claude-opus-4.7",
|
jupyter-devdata-sync.py
CHANGED
|
@@ -167,6 +167,10 @@ def _matches_prefix(parts: tuple[str, ...], prefix: tuple[str, ...]) -> bool:
|
|
| 167 |
return len(parts) >= len(prefix) and parts[:len(prefix)] == prefix
|
| 168 |
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
def should_skip(p: Path):
|
| 171 |
# Reserved sync helper files are not user data. Old datasets may still have
|
| 172 |
# these markers, but fresh snapshots no longer create them.
|
|
@@ -188,15 +192,52 @@ def should_skip(p: Path):
|
|
| 188 |
# Skip any component whose name looks like a secret file/dir.
|
| 189 |
return any(_name_is_secret(part) for part in parts)
|
| 190 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
def snapshot(src: Path, dst: Path) -> tuple[bool, set[str]]:
|
| 192 |
had_copy_failures = False
|
| 193 |
protected_large_files: set[str] = set()
|
| 194 |
-
for p in
|
| 195 |
rel = p.relative_to(src)
|
| 196 |
-
if should_skip(rel):
|
| 197 |
-
continue
|
| 198 |
-
if p.is_symlink():
|
| 199 |
-
continue
|
| 200 |
target = dst / rel
|
| 201 |
if p.is_dir():
|
| 202 |
# Keep parent directories for files copied later in this snapshot.
|
|
|
|
| 167 |
return len(parts) >= len(prefix) and parts[:len(prefix)] == prefix
|
| 168 |
|
| 169 |
|
| 170 |
+
def _is_prefix_of(parts: tuple[str, ...], full_path: tuple[str, ...]) -> bool:
|
| 171 |
+
return len(parts) <= len(full_path) and full_path[:len(parts)] == parts
|
| 172 |
+
|
| 173 |
+
|
| 174 |
def should_skip(p: Path):
|
| 175 |
# Reserved sync helper files are not user data. Old datasets may still have
|
| 176 |
# these markers, but fresh snapshots no longer create them.
|
|
|
|
| 192 |
# Skip any component whose name looks like a secret file/dir.
|
| 193 |
return any(_name_is_secret(part) for part in parts)
|
| 194 |
|
| 195 |
+
|
| 196 |
+
def iter_sync_tree(root: Path):
|
| 197 |
+
"""Yield syncable DevData paths without descending into excluded trees."""
|
| 198 |
+
if not root.exists():
|
| 199 |
+
return
|
| 200 |
+
|
| 201 |
+
for dirpath, dirnames, filenames in os.walk(root):
|
| 202 |
+
dir_path = Path(dirpath)
|
| 203 |
+
try:
|
| 204 |
+
dir_rel = dir_path.relative_to(root)
|
| 205 |
+
except ValueError:
|
| 206 |
+
dir_rel = Path()
|
| 207 |
+
|
| 208 |
+
kept_dirnames: list[str] = []
|
| 209 |
+
for dirname in sorted(dirnames):
|
| 210 |
+
rel = dir_rel / dirname
|
| 211 |
+
child = dir_path / dirname
|
| 212 |
+
rel_parts = rel.parts
|
| 213 |
+
# Do not prune ancestors of explicitly allowed Jupyter settings
|
| 214 |
+
# paths. should_skip(.local/share/jupyter) is true by design for
|
| 215 |
+
# files under that tree, but we must still descend through the
|
| 216 |
+
# parent dirs to reach lab/user-settings and lab/workspaces.
|
| 217 |
+
allowed_ancestor = any(
|
| 218 |
+
_is_prefix_of(rel_parts, prefix) for prefix in JUPYTER_DATA_ALLOW_PREFIXES
|
| 219 |
+
)
|
| 220 |
+
if child.is_symlink() or (should_skip(rel) and not allowed_ancestor):
|
| 221 |
+
continue
|
| 222 |
+
kept_dirnames.append(dirname)
|
| 223 |
+
dirnames[:] = kept_dirnames
|
| 224 |
+
|
| 225 |
+
for dirname in kept_dirnames:
|
| 226 |
+
yield dir_path / dirname
|
| 227 |
+
|
| 228 |
+
for filename in sorted(filenames):
|
| 229 |
+
rel = dir_rel / filename
|
| 230 |
+
child = dir_path / filename
|
| 231 |
+
if child.is_symlink() or should_skip(rel):
|
| 232 |
+
continue
|
| 233 |
+
yield child
|
| 234 |
+
|
| 235 |
+
|
| 236 |
def snapshot(src: Path, dst: Path) -> tuple[bool, set[str]]:
|
| 237 |
had_copy_failures = False
|
| 238 |
protected_large_files: set[str] = set()
|
| 239 |
+
for p in iter_sync_tree(src):
|
| 240 |
rel = p.relative_to(src)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
target = dst / rel
|
| 242 |
if p.is_dir():
|
| 243 |
# Keep parent directories for files copied later in this snapshot.
|
openclaw-sync.py
CHANGED
|
@@ -51,6 +51,9 @@ CONFIG_SETTLE_SECONDS = max(
|
|
| 51 |
float(os.environ.get("OPENCLAW_CONFIG_SETTLE_SECONDS", "3")),
|
| 52 |
)
|
| 53 |
SESSIONS_MIN_SYNC_GAP = int(os.environ.get("SESSIONS_MIN_SYNC_GAP", "30"))
|
|
|
|
|
|
|
|
|
|
| 54 |
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
|
| 55 |
HF_USERNAME = os.environ.get("HF_USERNAME", "").strip()
|
| 56 |
SPACE_AUTHOR_NAME = os.environ.get("SPACE_AUTHOR_NAME", "").strip()
|
|
@@ -88,6 +91,22 @@ EXCLUDED_STATE_NAMES = {
|
|
| 88 |
# "WhatsApp enabled but not installing on restart".
|
| 89 |
"extensions",
|
| 90 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
SESSIONS_ROOT = OPENCLAW_HOME / "agents"
|
| 92 |
WHATSAPP_CREDS_DIR = OPENCLAW_HOME / "credentials" / "whatsapp" / "default"
|
| 93 |
WHATSAPP_BACKUP_DIR = STATE_DIR / "credentials" / "whatsapp" / "default"
|
|
@@ -95,11 +114,29 @@ RESET_MARKER = WORKSPACE / ".reset_credentials"
|
|
| 95 |
HF_API = HfApi(token=HF_TOKEN) if HF_TOKEN else None
|
| 96 |
STOP_EVENT = threading.Event()
|
| 97 |
_REPO_ID_CACHE: str | None = None
|
| 98 |
-
_SESSIONS_FILE_DIGEST_CACHE: dict[str, tuple[int, int, int, str]] = {}
|
| 99 |
WorkspaceMarker: TypeAlias = tuple[int, int, int, str]
|
|
|
|
| 100 |
# Set True when prune fails so the next sync pass bypasses the fingerprint
|
| 101 |
# early-exit and retries the prune even if no local files changed.
|
| 102 |
_prune_needed: bool = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
|
| 105 |
def write_status(status: str, message: str) -> None:
|
|
@@ -133,6 +170,86 @@ def _remove_path(path: Path) -> None:
|
|
| 133 |
path.unlink(missing_ok=True)
|
| 134 |
|
| 135 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
def _copy_hot_directory_snapshot(source_path: Path, tmp_path: Path) -> bool:
|
| 137 |
"""Best-effort mirror of a hot directory into *tmp_path*.
|
| 138 |
|
|
@@ -259,6 +376,7 @@ def copy_state_entry_with_retry(source_path: Path, backup_path: Path, attempts:
|
|
| 259 |
def snapshot_state_into_workspace() -> bool:
|
| 260 |
had_copy_failures = False
|
| 261 |
try:
|
|
|
|
| 262 |
STATE_DIR.mkdir(parents=True, exist_ok=True)
|
| 263 |
# Atomic snapshot: copy to a staging dir first, then rename.
|
| 264 |
# This prevents a half-written (or empty) backup if we crash mid-copy,
|
|
@@ -274,7 +392,7 @@ def snapshot_state_into_workspace() -> bool:
|
|
| 274 |
skipped_entries: list[tuple[str, Exception]] = []
|
| 275 |
copied_entry_names: set[str] = set()
|
| 276 |
for source_path in OPENCLAW_HOME.iterdir():
|
| 277 |
-
if source_path.name
|
| 278 |
continue
|
| 279 |
|
| 280 |
backup_path = staging_dir / source_path.name
|
|
@@ -300,7 +418,9 @@ def snapshot_state_into_workspace() -> bool:
|
|
| 300 |
# dataset on the next sync. Only remove an entry from staging when the
|
| 301 |
# source has genuinely been deleted from OPENCLAW_HOME.
|
| 302 |
for staged_path in list(staging_dir.iterdir()):
|
| 303 |
-
if staged_path.name
|
|
|
|
|
|
|
| 304 |
continue
|
| 305 |
if staged_path.name in copied_entry_names:
|
| 306 |
continue
|
|
@@ -425,13 +545,8 @@ def locally_existing_large_files(root: Path) -> set[str]:
|
|
| 425 |
protected: set[str] = set()
|
| 426 |
if not root.exists():
|
| 427 |
return protected
|
| 428 |
-
for path in
|
| 429 |
-
if not path.is_file():
|
| 430 |
-
continue
|
| 431 |
rel = path.relative_to(root).as_posix()
|
| 432 |
-
parts = Path(rel).parts
|
| 433 |
-
if any(part in EXCLUDED_SYNC_DIRS for part in parts):
|
| 434 |
-
continue
|
| 435 |
try:
|
| 436 |
if path.stat().st_size > MAX_FILE_SIZE_BYTES:
|
| 437 |
protected.add(rel)
|
|
@@ -441,6 +556,7 @@ def locally_existing_large_files(root: Path) -> set[str]:
|
|
| 441 |
|
| 442 |
|
| 443 |
def restore_embedded_state() -> None:
|
|
|
|
| 444 |
state_backup_root = STATE_DIR / "openclaw"
|
| 445 |
|
| 446 |
# Migration fix: old backups stored state in ".huggingclaw-state/openclaw"
|
|
@@ -462,7 +578,7 @@ def restore_embedded_state() -> None:
|
|
| 462 |
if state_backup_root.is_dir():
|
| 463 |
for source_path in state_backup_root.iterdir():
|
| 464 |
name = source_path.name
|
| 465 |
-
if name
|
| 466 |
if source_path.is_dir():
|
| 467 |
shutil.rmtree(source_path, ignore_errors=True)
|
| 468 |
else:
|
|
@@ -527,7 +643,7 @@ def ensure_repo_exists() -> str:
|
|
| 527 |
|
| 528 |
def _should_exclude(rel_posix: str, path: Path) -> bool:
|
| 529 |
parts = Path(rel_posix).parts
|
| 530 |
-
if
|
| 531 |
return True
|
| 532 |
if path.is_file():
|
| 533 |
try:
|
|
@@ -538,16 +654,28 @@ def _should_exclude(rel_posix: str, path: Path) -> bool:
|
|
| 538 |
return False
|
| 539 |
|
| 540 |
|
| 541 |
-
def file_marker(path: Path) ->
|
| 542 |
try:
|
| 543 |
stat = path.stat()
|
| 544 |
except OSError:
|
| 545 |
-
return (0, 0, 0)
|
| 546 |
|
| 547 |
if not path.is_file():
|
| 548 |
-
return (0, 0, 0)
|
| 549 |
|
| 550 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 551 |
|
| 552 |
|
| 553 |
def metadata_marker(root: Path) -> WorkspaceMarker:
|
|
@@ -558,17 +686,12 @@ def metadata_marker(root: Path) -> WorkspaceMarker:
|
|
| 558 |
total_size = 0
|
| 559 |
newest_mtime = 0
|
| 560 |
metadata_hasher = hashlib.sha256()
|
| 561 |
-
for path in
|
| 562 |
-
if not path.is_file():
|
| 563 |
-
continue
|
| 564 |
rel = path.relative_to(root).as_posix()
|
| 565 |
# BUG FIX: use directory-only exclusion here (not size-based) so that
|
| 566 |
# deleting a large file changes the marker and triggers a sync/prune
|
| 567 |
# pass. Large files are still excluded from the upload snapshot via
|
| 568 |
# _should_exclude; this only controls change-detection.
|
| 569 |
-
parts = Path(rel).parts
|
| 570 |
-
if any(part in EXCLUDED_SYNC_DIRS for part in parts):
|
| 571 |
-
continue
|
| 572 |
try:
|
| 573 |
stat = path.stat()
|
| 574 |
except OSError:
|
|
@@ -576,14 +699,17 @@ def metadata_marker(root: Path) -> WorkspaceMarker:
|
|
| 576 |
file_count += 1
|
| 577 |
size = int(stat.st_size)
|
| 578 |
mtime_ns = int(stat.st_mtime_ns)
|
|
|
|
| 579 |
total_size += size
|
| 580 |
-
newest_mtime = max(newest_mtime, mtime_ns)
|
| 581 |
metadata_hasher.update(rel.encode("utf-8"))
|
| 582 |
metadata_hasher.update(b"\0")
|
| 583 |
metadata_hasher.update(str(size).encode("ascii"))
|
| 584 |
metadata_hasher.update(b"\0")
|
| 585 |
metadata_hasher.update(str(mtime_ns).encode("ascii"))
|
| 586 |
metadata_hasher.update(b"\0")
|
|
|
|
|
|
|
| 587 |
return (file_count, total_size, newest_mtime, metadata_hasher.hexdigest())
|
| 588 |
|
| 589 |
|
|
@@ -592,15 +718,12 @@ def fingerprint_dir(root: Path) -> str:
|
|
| 592 |
if not root.exists():
|
| 593 |
return hasher.hexdigest()
|
| 594 |
|
| 595 |
-
for path in
|
| 596 |
rel = path.relative_to(root).as_posix()
|
| 597 |
# BUG FIX: use directory-only exclusion (not size-based) so that
|
| 598 |
# creating or deleting a large file changes the fingerprint.
|
| 599 |
# Large files are hashed by path + metadata only (no content read)
|
| 600 |
# to avoid reading gigabytes for a change-detection hash.
|
| 601 |
-
parts = Path(rel).parts
|
| 602 |
-
if any(part in EXCLUDED_SYNC_DIRS for part in parts):
|
| 603 |
-
continue
|
| 604 |
hasher.update(rel.encode("utf-8"))
|
| 605 |
try:
|
| 606 |
stat = path.stat()
|
|
@@ -628,7 +751,7 @@ def fingerprint_dir(root: Path) -> str:
|
|
| 628 |
|
| 629 |
def create_snapshot_dir(source_root: Path) -> Path:
|
| 630 |
staging_root = Path(tempfile.mkdtemp(prefix="huggingclaw-sync-"))
|
| 631 |
-
for path in
|
| 632 |
rel = path.relative_to(source_root)
|
| 633 |
rel_posix = rel.as_posix()
|
| 634 |
if _should_exclude(rel_posix, path):
|
|
@@ -648,6 +771,64 @@ def create_snapshot_dir(source_root: Path) -> Path:
|
|
| 648 |
return staging_root
|
| 649 |
|
| 650 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 651 |
def prune_remote_deleted_files(
|
| 652 |
repo_id: str,
|
| 653 |
snapshot_dir: Path,
|
|
@@ -679,15 +860,19 @@ def prune_remote_deleted_files(
|
|
| 679 |
and path not in protected_paths
|
| 680 |
and not any(path == prefix or path.startswith(prefix + "/") for prefix in skip_prefixes)
|
| 681 |
]
|
| 682 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 683 |
return
|
| 684 |
|
| 685 |
-
total = len(
|
| 686 |
num_batches = (total + PRUNE_BATCH_SIZE - 1) // PRUNE_BATCH_SIZE
|
| 687 |
for batch_idx in range(num_batches):
|
| 688 |
-
batch =
|
| 689 |
batch_label = f"{batch_idx + 1}/{num_batches}" if num_batches > 1 else ""
|
| 690 |
-
msg = f"
|
| 691 |
if batch_label:
|
| 692 |
msg += f" (batch {batch_label})"
|
| 693 |
operations = [CommitOperationDelete(path_in_repo=path) for path in batch]
|
|
@@ -698,7 +883,25 @@ def prune_remote_deleted_files(
|
|
| 698 |
commit_message=msg,
|
| 699 |
)
|
| 700 |
if num_batches > 1:
|
| 701 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 702 |
|
| 703 |
|
| 704 |
def restore_workspace() -> bool:
|
|
@@ -795,10 +998,20 @@ def _sync_once_unlocked(
|
|
| 795 |
write_status("disabled", "HF_TOKEN is not configured.")
|
| 796 |
return (last_fingerprint or "", last_marker or (0, 0, 0, ""))
|
| 797 |
|
| 798 |
-
global _prune_needed
|
| 799 |
|
| 800 |
had_snapshot_copy_failures = snapshot_state_into_workspace()
|
| 801 |
repo_id = ensure_repo_exists()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 802 |
current_marker = metadata_marker(WORKSPACE)
|
| 803 |
# Session watcher uses content digests and can detect same-size rewrites
|
| 804 |
# whose workspace metadata marker is unchanged. In that case, force the
|
|
@@ -817,25 +1030,11 @@ def _sync_once_unlocked(
|
|
| 817 |
write_status("synced", "No workspace changes detected.")
|
| 818 |
return (last_fingerprint, current_marker)
|
| 819 |
|
| 820 |
-
|
|
|
|
| 821 |
snapshot_dir = create_snapshot_dir(WORKSPACE)
|
| 822 |
try:
|
| 823 |
-
|
| 824 |
-
HF_API.upload_large_folder(
|
| 825 |
-
repo_id=repo_id,
|
| 826 |
-
repo_type="dataset",
|
| 827 |
-
folder_path=str(snapshot_dir),
|
| 828 |
-
num_workers=2,
|
| 829 |
-
print_report=False,
|
| 830 |
-
)
|
| 831 |
-
except AttributeError:
|
| 832 |
-
upload_folder(
|
| 833 |
-
folder_path=str(snapshot_dir),
|
| 834 |
-
repo_id=repo_id,
|
| 835 |
-
repo_type="dataset",
|
| 836 |
-
token=HF_TOKEN,
|
| 837 |
-
commit_message=f"HuggingClaw sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
|
| 838 |
-
)
|
| 839 |
skip_prune_prefixes: set[str] = set()
|
| 840 |
if had_snapshot_copy_failures:
|
| 841 |
# BUG FIX: the old code added "huggingclaw-state/openclaw" to
|
|
@@ -879,7 +1078,18 @@ def sync_once(
|
|
| 879 |
) -> tuple[str, WorkspaceMarker]:
|
| 880 |
SYNC_LOCK_FILE.parent.mkdir(parents=True, exist_ok=True)
|
| 881 |
with SYNC_LOCK_FILE.open("w", encoding="utf-8") as lock_handle:
|
| 882 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 883 |
try:
|
| 884 |
return _sync_once_unlocked(
|
| 885 |
last_fingerprint,
|
|
@@ -920,9 +1130,6 @@ def sessions_marker() -> tuple[int, int, int, str]:
|
|
| 920 |
newest_mtime = 0
|
| 921 |
metadata_hasher = hashlib.sha256()
|
| 922 |
|
| 923 |
-
global _SESSIONS_FILE_DIGEST_CACHE
|
| 924 |
-
next_cache: dict[str, tuple[int, int, int, str]] = {}
|
| 925 |
-
|
| 926 |
for profile_dir in sorted(SESSIONS_ROOT.iterdir()):
|
| 927 |
if not profile_dir.is_dir():
|
| 928 |
continue
|
|
@@ -943,7 +1150,6 @@ def sessions_marker() -> tuple[int, int, int, str]:
|
|
| 943 |
size = int(stat.st_size)
|
| 944 |
mtime_ns = int(stat.st_mtime_ns)
|
| 945 |
ctime_ns = int(stat.st_ctime_ns)
|
| 946 |
-
cache_key = f"{profile_dir.name}\0{rel}"
|
| 947 |
digest.update(rel.encode("utf-8"))
|
| 948 |
digest.update(b"\0")
|
| 949 |
digest.update(str(size).encode("ascii"))
|
|
@@ -952,24 +1158,17 @@ def sessions_marker() -> tuple[int, int, int, str]:
|
|
| 952 |
digest.update(b"\0")
|
| 953 |
digest.update(str(ctime_ns).encode("ascii"))
|
| 954 |
digest.update(b"\0")
|
| 955 |
-
|
| 956 |
-
|
| 957 |
-
|
| 958 |
-
|
| 959 |
-
|
| 960 |
-
|
| 961 |
-
|
| 962 |
-
|
| 963 |
-
|
| 964 |
-
|
| 965 |
-
|
| 966 |
-
with path.open("rb") as handle:
|
| 967 |
-
for chunk in iter(lambda: handle.read(1024 * 1024), b""):
|
| 968 |
-
file_hasher.update(chunk)
|
| 969 |
-
except (FileNotFoundError, IsADirectoryError, NotADirectoryError):
|
| 970 |
-
continue
|
| 971 |
-
file_digest = file_hasher.hexdigest()
|
| 972 |
-
next_cache[cache_key] = (size, mtime_ns, ctime_ns, file_digest)
|
| 973 |
digest.update(file_digest.encode("ascii"))
|
| 974 |
digest.update(b"\0")
|
| 975 |
|
|
@@ -981,11 +1180,10 @@ def sessions_marker() -> tuple[int, int, int, str]:
|
|
| 981 |
metadata_hasher.update(digest.hexdigest().encode("ascii"))
|
| 982 |
metadata_hasher.update(b"\0")
|
| 983 |
|
| 984 |
-
_SESSIONS_FILE_DIGEST_CACHE = next_cache
|
| 985 |
return (file_count, total_size, newest_mtime, metadata_hasher.hexdigest())
|
| 986 |
|
| 987 |
|
| 988 |
-
def wait_for_config_settle(config_marker:
|
| 989 |
stable_since = time.monotonic()
|
| 990 |
current_marker = config_marker
|
| 991 |
|
|
@@ -1008,9 +1206,9 @@ def wait_for_config_settle(config_marker: tuple[int, int, int]) -> tuple[str, tu
|
|
| 1008 |
|
| 1009 |
|
| 1010 |
def wait_for_sync_trigger(
|
| 1011 |
-
config_marker:
|
| 1012 |
last_sessions_sync_time: float = 0.0,
|
| 1013 |
-
) -> tuple[str,
|
| 1014 |
deadline = time.monotonic() + max(0, INTERVAL)
|
| 1015 |
# BUG FIX: also watch sessions directory so new/updated sessions
|
| 1016 |
# trigger an immediate sync instead of waiting the full interval.
|
|
|
|
| 51 |
float(os.environ.get("OPENCLAW_CONFIG_SETTLE_SECONDS", "3")),
|
| 52 |
)
|
| 53 |
SESSIONS_MIN_SYNC_GAP = int(os.environ.get("SESSIONS_MIN_SYNC_GAP", "30"))
|
| 54 |
+
SYNC_LOCK_TIMEOUT = max(1.0, float(os.environ.get("SYNC_LOCK_TIMEOUT", "20")))
|
| 55 |
+
SYNC_UPLOAD_TIMEOUT = max(0.0, float(os.environ.get("SYNC_UPLOAD_TIMEOUT", "180")))
|
| 56 |
+
SYNC_UPLOAD_STRATEGY = os.environ.get("SYNC_UPLOAD_STRATEGY", "folder").strip().lower() or "folder"
|
| 57 |
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
|
| 58 |
HF_USERNAME = os.environ.get("HF_USERNAME", "").strip()
|
| 59 |
SPACE_AUTHOR_NAME = os.environ.get("SPACE_AUTHOR_NAME", "").strip()
|
|
|
|
| 91 |
# "WhatsApp enabled but not installing on restart".
|
| 92 |
"extensions",
|
| 93 |
}
|
| 94 |
+
# Internal restore/snapshot working directories live beside the workspace under
|
| 95 |
+
# /home/node/.openclaw. If a previous container is killed mid-restore, these
|
| 96 |
+
# hidden directories can be left behind and may contain a full copy of the
|
| 97 |
+
# workspace, including huggingclaw-state/openclaw itself. Backing them up makes
|
| 98 |
+
# the dataset grow recursively under huggingclaw-state/openclaw and can keep the
|
| 99 |
+
# sync loop busy forever, which in turn blocks session deletes/updates from being
|
| 100 |
+
# pruned remotely. Treat every such helper path as disposable sync scratch.
|
| 101 |
+
INTERNAL_TEMP_STATE_NAMES = {
|
| 102 |
+
".workspace-restore-staging",
|
| 103 |
+
".workspace-restore-old",
|
| 104 |
+
".openclaw-staging",
|
| 105 |
+
}
|
| 106 |
+
INTERNAL_TEMP_NAME_PREFIXES = (
|
| 107 |
+
".workspace-restore-",
|
| 108 |
+
".openclaw-staging",
|
| 109 |
+
)
|
| 110 |
SESSIONS_ROOT = OPENCLAW_HOME / "agents"
|
| 111 |
WHATSAPP_CREDS_DIR = OPENCLAW_HOME / "credentials" / "whatsapp" / "default"
|
| 112 |
WHATSAPP_BACKUP_DIR = STATE_DIR / "credentials" / "whatsapp" / "default"
|
|
|
|
| 114 |
HF_API = HfApi(token=HF_TOKEN) if HF_TOKEN else None
|
| 115 |
STOP_EVENT = threading.Event()
|
| 116 |
_REPO_ID_CACHE: str | None = None
|
|
|
|
| 117 |
WorkspaceMarker: TypeAlias = tuple[int, int, int, str]
|
| 118 |
+
FileMarker: TypeAlias = tuple[int, int, int, int, str]
|
| 119 |
# Set True when prune fails so the next sync pass bypasses the fingerprint
|
| 120 |
# early-exit and retries the prune even if no local files changed.
|
| 121 |
_prune_needed: bool = False
|
| 122 |
+
_remote_temp_prune_done: bool = False
|
| 123 |
+
|
| 124 |
+
# Workspace-relative temp paths managed by this sync script. Keep this narrow:
|
| 125 |
+
# user projects may legitimately contain folders with similar names, and those
|
| 126 |
+
# must keep syncing normally. Only the script-owned state/restore scratch
|
| 127 |
+
# locations are skipped and pruned.
|
| 128 |
+
INTERNAL_TEMP_WORKSPACE_PREFIXES: tuple[tuple[str, ...], ...] = (
|
| 129 |
+
(".workspace-restore-staging",),
|
| 130 |
+
(".workspace-restore-old",),
|
| 131 |
+
("huggingclaw-state", ".openclaw-staging"),
|
| 132 |
+
("huggingclaw-state", "openclaw", ".workspace-restore-staging"),
|
| 133 |
+
("huggingclaw-state", "openclaw", ".workspace-restore-old"),
|
| 134 |
+
("huggingclaw-state", "openclaw", ".openclaw-staging"),
|
| 135 |
+
)
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
class SyncUploadTimeoutError(TimeoutError):
|
| 139 |
+
pass
|
| 140 |
|
| 141 |
|
| 142 |
def write_status(status: str, message: str) -> None:
|
|
|
|
| 170 |
path.unlink(missing_ok=True)
|
| 171 |
|
| 172 |
|
| 173 |
+
def _is_internal_temp_name(name: str) -> bool:
|
| 174 |
+
return name in INTERNAL_TEMP_STATE_NAMES or any(
|
| 175 |
+
name.startswith(prefix) for prefix in INTERNAL_TEMP_NAME_PREFIXES
|
| 176 |
+
)
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def _has_internal_temp_part(path: str) -> bool:
|
| 180 |
+
parts = Path(path).parts
|
| 181 |
+
return any(_matches_prefix(parts, prefix) for prefix in INTERNAL_TEMP_WORKSPACE_PREFIXES)
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def _matches_prefix(parts: tuple[str, ...], prefix: tuple[str, ...]) -> bool:
|
| 185 |
+
return len(parts) >= len(prefix) and parts[:len(prefix)] == prefix
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def _should_skip_state_entry_name(name: str) -> bool:
|
| 189 |
+
return (
|
| 190 |
+
name in EXCLUDED_STATE_NAMES
|
| 191 |
+
or name in EXCLUDED_SYNC_DIRS
|
| 192 |
+
or _is_internal_temp_name(name)
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def _should_skip_sync_path(rel_parts: tuple[str, ...]) -> bool:
|
| 197 |
+
return any(part in EXCLUDED_SYNC_DIRS for part in rel_parts) or any(
|
| 198 |
+
_matches_prefix(rel_parts, prefix) for prefix in INTERNAL_TEMP_WORKSPACE_PREFIXES
|
| 199 |
+
)
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def _iter_sync_tree(root: Path):
|
| 203 |
+
"""Yield syncable paths without descending into ignored/temp directories."""
|
| 204 |
+
if not root.exists():
|
| 205 |
+
return
|
| 206 |
+
|
| 207 |
+
for dirpath, dirnames, filenames in os.walk(root):
|
| 208 |
+
dir_path = Path(dirpath)
|
| 209 |
+
try:
|
| 210 |
+
dir_rel_parts = dir_path.relative_to(root).parts
|
| 211 |
+
except ValueError:
|
| 212 |
+
dir_rel_parts = ()
|
| 213 |
+
if dir_rel_parts == (".",):
|
| 214 |
+
dir_rel_parts = ()
|
| 215 |
+
dirnames[:] = sorted(
|
| 216 |
+
name for name in dirnames
|
| 217 |
+
if not _should_skip_sync_path(dir_rel_parts + (name,))
|
| 218 |
+
)
|
| 219 |
+
|
| 220 |
+
for dirname in dirnames:
|
| 221 |
+
yield dir_path / dirname
|
| 222 |
+
for filename in sorted(filenames):
|
| 223 |
+
if _should_skip_sync_path(dir_rel_parts + (filename,)):
|
| 224 |
+
continue
|
| 225 |
+
yield dir_path / filename
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def _iter_sync_files(root: Path):
    """Yield only the entries of ``_iter_sync_tree(root)`` that are files."""
    yield from (entry for entry in _iter_sync_tree(root) if entry.is_file())
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
def cleanup_internal_temp_paths() -> None:
    """Remove stale sync/restore scratch dirs that must never be backed up.

    Scans the direct children of ``OPENCLAW_HOME`` and ``STATE_DIR`` and
    removes every child whose name ``_is_internal_temp_name`` recognises.
    Listing or removal failures (``OSError``) are tolerated: an unreadable
    root is skipped and a failed removal is reported as a warning.
    """
    for base in (OPENCLAW_HOME, STATE_DIR):
        if not (base.exists() and base.is_dir()):
            continue

        try:
            entries = list(base.iterdir())
        except OSError:
            continue

        for entry in entries:
            if _is_internal_temp_name(entry.name):
                try:
                    _remove_path(entry)
                    print(f"Removed stale sync scratch path: {entry}")
                except OSError as exc:
                    print(f"Warning: could not remove stale sync scratch path {entry}: {exc}")
|
| 251 |
+
|
| 252 |
+
|
| 253 |
def _copy_hot_directory_snapshot(source_path: Path, tmp_path: Path) -> bool:
|
| 254 |
"""Best-effort mirror of a hot directory into *tmp_path*.
|
| 255 |
|
|
|
|
| 376 |
def snapshot_state_into_workspace() -> bool:
|
| 377 |
had_copy_failures = False
|
| 378 |
try:
|
| 379 |
+
cleanup_internal_temp_paths()
|
| 380 |
STATE_DIR.mkdir(parents=True, exist_ok=True)
|
| 381 |
# Atomic snapshot: copy to a staging dir first, then rename.
|
| 382 |
# This prevents a half-written (or empty) backup if we crash mid-copy,
|
|
|
|
| 392 |
skipped_entries: list[tuple[str, Exception]] = []
|
| 393 |
copied_entry_names: set[str] = set()
|
| 394 |
for source_path in OPENCLAW_HOME.iterdir():
|
| 395 |
+
if _should_skip_state_entry_name(source_path.name):
|
| 396 |
continue
|
| 397 |
|
| 398 |
backup_path = staging_dir / source_path.name
|
|
|
|
| 418 |
# dataset on the next sync. Only remove an entry from staging when the
|
| 419 |
# source has genuinely been deleted from OPENCLAW_HOME.
|
| 420 |
for staged_path in list(staging_dir.iterdir()):
|
| 421 |
+
if _should_skip_state_entry_name(staged_path.name):
|
| 422 |
+
if staged_path.exists() or staged_path.is_symlink():
|
| 423 |
+
_remove_path(staged_path)
|
| 424 |
continue
|
| 425 |
if staged_path.name in copied_entry_names:
|
| 426 |
continue
|
|
|
|
| 545 |
protected: set[str] = set()
|
| 546 |
if not root.exists():
|
| 547 |
return protected
|
| 548 |
+
for path in _iter_sync_files(root):
|
|
|
|
|
|
|
| 549 |
rel = path.relative_to(root).as_posix()
|
|
|
|
|
|
|
|
|
|
| 550 |
try:
|
| 551 |
if path.stat().st_size > MAX_FILE_SIZE_BYTES:
|
| 552 |
protected.add(rel)
|
|
|
|
| 556 |
|
| 557 |
|
| 558 |
def restore_embedded_state() -> None:
|
| 559 |
+
cleanup_internal_temp_paths()
|
| 560 |
state_backup_root = STATE_DIR / "openclaw"
|
| 561 |
|
| 562 |
# Migration fix: old backups stored state in ".huggingclaw-state/openclaw"
|
|
|
|
| 578 |
if state_backup_root.is_dir():
|
| 579 |
for source_path in state_backup_root.iterdir():
|
| 580 |
name = source_path.name
|
| 581 |
+
if _should_skip_state_entry_name(name):
|
| 582 |
if source_path.is_dir():
|
| 583 |
shutil.rmtree(source_path, ignore_errors=True)
|
| 584 |
else:
|
|
|
|
| 643 |
|
| 644 |
def _should_exclude(rel_posix: str, path: Path) -> bool:
|
| 645 |
parts = Path(rel_posix).parts
|
| 646 |
+
if _should_skip_sync_path(parts):
|
| 647 |
return True
|
| 648 |
if path.is_file():
|
| 649 |
try:
|
|
|
|
| 654 |
return False
|
| 655 |
|
| 656 |
|
| 657 |
+
def file_marker(path: Path) -> FileMarker:
    """Build a change-detection marker for a single file.

    Returns ``(0, 0, 0, 0, "")`` when *path* cannot be stat'ed or is not a
    regular file. Otherwise returns
    ``(1, size, mtime_ns, ctime_ns, digest)`` where *digest* is the SHA-256
    hex digest of the file content, read in 1 MiB chunks.
    """
    try:
        info = path.stat()
    except OSError:
        return (0, 0, 0, 0, "")

    if not path.is_file():
        return (0, 0, 0, 0, "")

    content_id = ""
    try:
        sha = hashlib.sha256()
        with path.open("rb") as stream:
            while True:
                block = stream.read(1024 * 1024)
                if block == b"":
                    break
                sha.update(block)
        content_id = sha.hexdigest()
    except OSError:
        # The file could not be read (e.g. it is mid-rewrite). Emit a value
        # that differs on every call so the marker never compares equal to a
        # previous one; a later scan will hash the file again.
        content_id = f"unreadable:{time.monotonic_ns()}"

    return (
        1,
        int(info.st_size),
        int(info.st_mtime_ns),
        int(info.st_ctime_ns),
        content_id,
    )
|
| 679 |
|
| 680 |
|
| 681 |
def metadata_marker(root: Path) -> WorkspaceMarker:
|
|
|
|
| 686 |
total_size = 0
|
| 687 |
newest_mtime = 0
|
| 688 |
metadata_hasher = hashlib.sha256()
|
| 689 |
+
for path in _iter_sync_files(root):
|
|
|
|
|
|
|
| 690 |
rel = path.relative_to(root).as_posix()
|
| 691 |
# BUG FIX: use directory-only exclusion here (not size-based) so that
|
| 692 |
# deleting a large file changes the marker and triggers a sync/prune
|
| 693 |
# pass. Large files are still excluded from the upload snapshot via
|
| 694 |
# _should_exclude; this only controls change-detection.
|
|
|
|
|
|
|
|
|
|
| 695 |
try:
|
| 696 |
stat = path.stat()
|
| 697 |
except OSError:
|
|
|
|
| 699 |
file_count += 1
|
| 700 |
size = int(stat.st_size)
|
| 701 |
mtime_ns = int(stat.st_mtime_ns)
|
| 702 |
+
ctime_ns = int(stat.st_ctime_ns)
|
| 703 |
total_size += size
|
| 704 |
+
newest_mtime = max(newest_mtime, mtime_ns, ctime_ns)
|
| 705 |
metadata_hasher.update(rel.encode("utf-8"))
|
| 706 |
metadata_hasher.update(b"\0")
|
| 707 |
metadata_hasher.update(str(size).encode("ascii"))
|
| 708 |
metadata_hasher.update(b"\0")
|
| 709 |
metadata_hasher.update(str(mtime_ns).encode("ascii"))
|
| 710 |
metadata_hasher.update(b"\0")
|
| 711 |
+
metadata_hasher.update(str(ctime_ns).encode("ascii"))
|
| 712 |
+
metadata_hasher.update(b"\0")
|
| 713 |
return (file_count, total_size, newest_mtime, metadata_hasher.hexdigest())
|
| 714 |
|
| 715 |
|
|
|
|
| 718 |
if not root.exists():
|
| 719 |
return hasher.hexdigest()
|
| 720 |
|
| 721 |
+
for path in _iter_sync_files(root):
|
| 722 |
rel = path.relative_to(root).as_posix()
|
| 723 |
# BUG FIX: use directory-only exclusion (not size-based) so that
|
| 724 |
# creating or deleting a large file changes the fingerprint.
|
| 725 |
# Large files are hashed by path + metadata only (no content read)
|
| 726 |
# to avoid reading gigabytes for a change-detection hash.
|
|
|
|
|
|
|
|
|
|
| 727 |
hasher.update(rel.encode("utf-8"))
|
| 728 |
try:
|
| 729 |
stat = path.stat()
|
|
|
|
| 751 |
|
| 752 |
def create_snapshot_dir(source_root: Path) -> Path:
|
| 753 |
staging_root = Path(tempfile.mkdtemp(prefix="huggingclaw-sync-"))
|
| 754 |
+
for path in _iter_sync_tree(source_root):
|
| 755 |
rel = path.relative_to(source_root)
|
| 756 |
rel_posix = rel.as_posix()
|
| 757 |
if _should_exclude(rel_posix, path):
|
|
|
|
| 771 |
return staging_root
|
| 772 |
|
| 773 |
|
| 774 |
+
def _run_with_timeout(seconds: float, label: str, func):
|
| 775 |
+
"""Run *func* with a SIGALRM watchdog so hung HF calls release the sync lock."""
|
| 776 |
+
if seconds <= 0 or not hasattr(signal, "SIGALRM"):
|
| 777 |
+
return func()
|
| 778 |
+
|
| 779 |
+
previous_handler = signal.getsignal(signal.SIGALRM)
|
| 780 |
+
try:
|
| 781 |
+
previous_timer = signal.setitimer(signal.ITIMER_REAL, 0)
|
| 782 |
+
except Exception:
|
| 783 |
+
previous_timer = (0.0, 0.0)
|
| 784 |
+
|
| 785 |
+
def _timeout_handler(_sig, _frame):
|
| 786 |
+
raise SyncUploadTimeoutError(f"{label} timed out after {seconds:.0f}s")
|
| 787 |
+
|
| 788 |
+
signal.signal(signal.SIGALRM, _timeout_handler)
|
| 789 |
+
signal.setitimer(signal.ITIMER_REAL, seconds)
|
| 790 |
+
try:
|
| 791 |
+
return func()
|
| 792 |
+
finally:
|
| 793 |
+
signal.setitimer(signal.ITIMER_REAL, 0)
|
| 794 |
+
signal.signal(signal.SIGALRM, previous_handler)
|
| 795 |
+
if previous_timer and previous_timer[0] > 0:
|
| 796 |
+
signal.setitimer(signal.ITIMER_REAL, previous_timer[0], previous_timer[1])
|
| 797 |
+
|
| 798 |
+
|
| 799 |
+
def upload_snapshot(repo_id: str, snapshot_dir: Path) -> None:
    """Upload *snapshot_dir* to the dataset repo *repo_id*.

    The upload strategy comes from ``SYNC_UPLOAD_STRATEGY`` (dashes are
    treated as underscores). ``large_folder`` uses
    ``HF_API.upload_large_folder`` when that method exists; every other
    accepted value, and any unknown value, uses ``upload_folder``. The whole
    upload runs under ``_run_with_timeout`` with ``SYNC_UPLOAD_TIMEOUT``.

    Args:
        repo_id: Target dataset repository id.
        snapshot_dir: Local directory whose content is uploaded.

    Raises:
        Whatever the upload call or ``_run_with_timeout`` raises.
    """
    strategy = SYNC_UPLOAD_STRATEGY.replace("-", "_")
    timeout_label = "no timeout" if SYNC_UPLOAD_TIMEOUT <= 0 else f"{SYNC_UPLOAD_TIMEOUT:.0f}s timeout"
    if strategy not in {"folder", "upload_folder", "large_folder"}:
        print(f"Warning: unknown SYNC_UPLOAD_STRATEGY={SYNC_UPLOAD_STRATEGY!r}; using upload_folder.")
        strategy = "folder"

    def _upload() -> None:
        if strategy == "large_folder":
            # Probe for the method up front instead of wrapping the call in
            # `except AttributeError`: that would also swallow AttributeErrors
            # raised in the middle of an upload and silently start a second
            # upload through upload_folder.
            large_upload = getattr(HF_API, "upload_large_folder", None)
            if callable(large_upload):
                large_upload(
                    repo_id=repo_id,
                    repo_type="dataset",
                    folder_path=str(snapshot_dir),
                    num_workers=2,
                    print_report=False,
                )
                return
            print("Warning: upload_large_folder unavailable; falling back to upload_folder.")

        upload_folder(
            folder_path=str(snapshot_dir),
            repo_id=repo_id,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=f"HuggingClaw sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
        )

    print(f"Workspace upload: strategy={strategy}, {timeout_label}")
    _run_with_timeout(SYNC_UPLOAD_TIMEOUT, "Workspace upload", _upload)
|
| 830 |
+
|
| 831 |
+
|
| 832 |
def prune_remote_deleted_files(
|
| 833 |
repo_id: str,
|
| 834 |
snapshot_dir: Path,
|
|
|
|
| 860 |
and path not in protected_paths
|
| 861 |
and not any(path == prefix or path.startswith(prefix + "/") for prefix in skip_prefixes)
|
| 862 |
]
|
| 863 |
+
_commit_delete_batches(repo_id, stale_files, "Prune", "stale file(s) after workspace sync")
|
| 864 |
+
|
| 865 |
+
|
| 866 |
+
def _commit_delete_batches(repo_id: str, paths: list[str], action: str, reason: str) -> None:
|
| 867 |
+
if not paths:
|
| 868 |
return
|
| 869 |
|
| 870 |
+
total = len(paths)
|
| 871 |
num_batches = (total + PRUNE_BATCH_SIZE - 1) // PRUNE_BATCH_SIZE
|
| 872 |
for batch_idx in range(num_batches):
|
| 873 |
+
batch = paths[batch_idx * PRUNE_BATCH_SIZE:(batch_idx + 1) * PRUNE_BATCH_SIZE]
|
| 874 |
batch_label = f"{batch_idx + 1}/{num_batches}" if num_batches > 1 else ""
|
| 875 |
+
msg = f"{action} {len(batch)} {reason}"
|
| 876 |
if batch_label:
|
| 877 |
msg += f" (batch {batch_label})"
|
| 878 |
operations = [CommitOperationDelete(path_in_repo=path) for path in batch]
|
|
|
|
| 883 |
commit_message=msg,
|
| 884 |
)
|
| 885 |
if num_batches > 1:
|
| 886 |
+
print(f"{action} batch {batch_label}: {len(batch)} file(s)")
|
| 887 |
+
|
| 888 |
+
|
| 889 |
+
def prune_remote_internal_temp_files(repo_id: str) -> None:
    """Remove old recursive scratch files from the HF backup even if nothing changed locally.

    Lists every file in the dataset repo *repo_id*, keeps those that
    ``_has_internal_temp_part`` flags (``.gitattributes`` is never touched),
    and hands them to ``_commit_delete_batches``. Does nothing when
    ``HF_API`` is ``None``.
    """
    if HF_API is None:
        return

    doomed = []
    for remote_path in list(HF_API.list_repo_files(repo_id=repo_id, repo_type="dataset")):
        if remote_path == ".gitattributes":
            continue
        if _has_internal_temp_part(remote_path):
            doomed.append(remote_path)

    _commit_delete_batches(
        repo_id,
        doomed,
        "Prune internal sync scratch",
        "file(s) from backup",
    )
|
| 905 |
|
| 906 |
|
| 907 |
def restore_workspace() -> bool:
|
|
|
|
| 998 |
write_status("disabled", "HF_TOKEN is not configured.")
|
| 999 |
return (last_fingerprint or "", last_marker or (0, 0, 0, ""))
|
| 1000 |
|
| 1001 |
+
global _prune_needed, _remote_temp_prune_done
|
| 1002 |
|
| 1003 |
had_snapshot_copy_failures = snapshot_state_into_workspace()
|
| 1004 |
repo_id = ensure_repo_exists()
|
| 1005 |
+
if not _remote_temp_prune_done:
|
| 1006 |
+
try:
|
| 1007 |
+
prune_remote_internal_temp_files(repo_id)
|
| 1008 |
+
_remote_temp_prune_done = True
|
| 1009 |
+
except Exception as temp_prune_exc:
|
| 1010 |
+
# Do not block normal user-data sync if the cleanup commit fails;
|
| 1011 |
+
# leave the flag false so the next pass retries before taking the
|
| 1012 |
+
# metadata/fingerprint early exit again.
|
| 1013 |
+
print(f"Warning: could not prune internal sync scratch files: {temp_prune_exc}")
|
| 1014 |
+
_prune_needed = True
|
| 1015 |
current_marker = metadata_marker(WORKSPACE)
|
| 1016 |
# Session watcher uses content digests and can detect same-size rewrites
|
| 1017 |
# whose workspace metadata marker is unchanged. In that case, force the
|
|
|
|
| 1030 |
write_status("synced", "No workspace changes detected.")
|
| 1031 |
return (last_fingerprint, current_marker)
|
| 1032 |
|
| 1033 |
+
upload_timeout_label = "no timeout" if SYNC_UPLOAD_TIMEOUT <= 0 else f"timeout {SYNC_UPLOAD_TIMEOUT:.0f}s"
|
| 1034 |
+
write_status("syncing", f"Uploading workspace to {repo_id} ({SYNC_UPLOAD_STRATEGY}, {upload_timeout_label})")
|
| 1035 |
snapshot_dir = create_snapshot_dir(WORKSPACE)
|
| 1036 |
try:
|
| 1037 |
+
upload_snapshot(repo_id, snapshot_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1038 |
skip_prune_prefixes: set[str] = set()
|
| 1039 |
if had_snapshot_copy_failures:
|
| 1040 |
# BUG FIX: the old code added "huggingclaw-state/openclaw" to
|
|
|
|
| 1078 |
) -> tuple[str, WorkspaceMarker]:
|
| 1079 |
SYNC_LOCK_FILE.parent.mkdir(parents=True, exist_ok=True)
|
| 1080 |
with SYNC_LOCK_FILE.open("w", encoding="utf-8") as lock_handle:
|
| 1081 |
+
deadline = time.monotonic() + SYNC_LOCK_TIMEOUT
|
| 1082 |
+
while True:
|
| 1083 |
+
try:
|
| 1084 |
+
fcntl.flock(lock_handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
|
| 1085 |
+
break
|
| 1086 |
+
except BlockingIOError:
|
| 1087 |
+
if time.monotonic() >= deadline:
|
| 1088 |
+
raise TimeoutError(
|
| 1089 |
+
f"Timed out waiting {SYNC_LOCK_TIMEOUT:.0f}s for workspace sync lock."
|
| 1090 |
+
)
|
| 1091 |
+
if STOP_EVENT.wait(0.25):
|
| 1092 |
+
raise TimeoutError("Stopped while waiting for workspace sync lock.")
|
| 1093 |
try:
|
| 1094 |
return _sync_once_unlocked(
|
| 1095 |
last_fingerprint,
|
|
|
|
| 1130 |
newest_mtime = 0
|
| 1131 |
metadata_hasher = hashlib.sha256()
|
| 1132 |
|
|
|
|
|
|
|
|
|
|
| 1133 |
for profile_dir in sorted(SESSIONS_ROOT.iterdir()):
|
| 1134 |
if not profile_dir.is_dir():
|
| 1135 |
continue
|
|
|
|
| 1150 |
size = int(stat.st_size)
|
| 1151 |
mtime_ns = int(stat.st_mtime_ns)
|
| 1152 |
ctime_ns = int(stat.st_ctime_ns)
|
|
|
|
| 1153 |
digest.update(rel.encode("utf-8"))
|
| 1154 |
digest.update(b"\0")
|
| 1155 |
digest.update(str(size).encode("ascii"))
|
|
|
|
| 1158 |
digest.update(b"\0")
|
| 1159 |
digest.update(str(ctime_ns).encode("ascii"))
|
| 1160 |
digest.update(b"\0")
|
| 1161 |
+
# Sessions are the most important live data. Hash every scan
|
| 1162 |
+
# instead of trusting size/mtime/ctime caches so same-size rewrites
|
| 1163 |
+
# and very small edits are never missed by the sessions trigger.
|
| 1164 |
+
file_hasher = hashlib.sha256()
|
| 1165 |
+
try:
|
| 1166 |
+
with path.open("rb") as handle:
|
| 1167 |
+
for chunk in iter(lambda: handle.read(1024 * 1024), b""):
|
| 1168 |
+
file_hasher.update(chunk)
|
| 1169 |
+
except (FileNotFoundError, IsADirectoryError, NotADirectoryError):
|
| 1170 |
+
continue
|
| 1171 |
+
file_digest = file_hasher.hexdigest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1172 |
digest.update(file_digest.encode("ascii"))
|
| 1173 |
digest.update(b"\0")
|
| 1174 |
|
|
|
|
| 1180 |
metadata_hasher.update(digest.hexdigest().encode("ascii"))
|
| 1181 |
metadata_hasher.update(b"\0")
|
| 1182 |
|
|
|
|
| 1183 |
return (file_count, total_size, newest_mtime, metadata_hasher.hexdigest())
|
| 1184 |
|
| 1185 |
|
| 1186 |
+
def wait_for_config_settle(config_marker: FileMarker) -> tuple[str, FileMarker]:
|
| 1187 |
stable_since = time.monotonic()
|
| 1188 |
current_marker = config_marker
|
| 1189 |
|
|
|
|
| 1206 |
|
| 1207 |
|
| 1208 |
def wait_for_sync_trigger(
|
| 1209 |
+
config_marker: FileMarker,
|
| 1210 |
last_sessions_sync_time: float = 0.0,
|
| 1211 |
+
) -> tuple[str, FileMarker]:
|
| 1212 |
deadline = time.monotonic() + max(0, INTERVAL)
|
| 1213 |
# BUG FIX: also watch sessions directory so new/updated sessions
|
| 1214 |
# trigger an immediate sync instead of waiting the full interval.
|
start.sh
CHANGED
|
@@ -357,12 +357,12 @@ case "$LLM_PROVIDER" in
|
|
| 357 |
mistral-*|codestral-*|devstral-*|voxtral-*)
|
| 358 |
export MISTRAL_API_KEY="$LLM_API_KEY"
|
| 359 |
echo "Note: bare Mistral model '$LLM_MODEL' detected; mapped LLM_API_KEY β MISTRAL_API_KEY. Use 'mistral/${LLM_MODEL}' prefix to be explicit." ;;
|
| 360 |
-
moonshotai|meta-llama|deepseek-ai|MiniMaxAI|minimax-ai|Qwen|zai-org|mistralai|google)
|
| 361 |
echo "Warning: LLM_MODEL='$LLM_MODEL' uses sub-provider prefix '$LLM_PROVIDER'. This is a router-namespaced model (Together/OpenRouter). Mapping LLM_API_KEY β TOGETHER_API_KEY. If using OpenRouter, also set OPENROUTER_API_KEY as a separate secret."
|
| 362 |
export TOGETHER_API_KEY="${TOGETHER_API_KEY:-$LLM_API_KEY}" ;;
|
| 363 |
# ββ Fallback: Anthropic (default) ββ
|
| 364 |
*)
|
| 365 |
-
echo "Warning: Unknown provider prefix '$LLM_PROVIDER' in LLM_MODEL='$LLM_MODEL'. Defaulting to ANTHROPIC_API_KEY. If using a router-namespaced model (e.g. moonshotai/
|
| 366 |
export ANTHROPIC_API_KEY="$LLM_API_KEY"
|
| 367 |
;;
|
| 368 |
esac
|
|
@@ -657,7 +657,7 @@ _DEFAULT_XAI_MODELS="xai/grok-4.20,xai/grok-4.3,xai/grok-4.1,xai/grok-latest,xai
|
|
| 657 |
_DEFAULT_COHERE_MODELS="cohere/command-a,cohere/command-a-03-2025,cohere/command-a-reasoning-08-2025,cohere/command-r-plus-08-2024"
|
| 658 |
_DEFAULT_TOGETHER_MODELS="together/moonshotai/Kimi-K2.6,together/deepseek-ai/DeepSeek-V4-Pro,together/Qwen/Qwen3-235B-A22B-Instruct-2507-tput,together/meta-llama/Llama-3.3-70B-Instruct-Turbo"
|
| 659 |
_DEFAULT_CEREBRAS_MODELS="cerebras/zai-glm-4.7,cerebras/gpt-oss-120b,cerebras/deepseek-r1,cerebras/qwen3-32b"
|
| 660 |
-
_DEFAULT_NVIDIA_MODELS="nvidia/nemotron-3-super-120b-a12b,
|
| 661 |
_DEFAULT_KILOCODE_MODELS="kilocode/kilo/auto,kilocode/anthropic/claude-opus-4.7,kilocode/openai/gpt-5.4,kilocode/google/gemini-2.5-pro"
|
| 662 |
_DEFAULT_MOONSHOT_MODELS="moonshot/kimi-k2.6,moonshot/kimi-k2.6-thinking,moonshot/kimi-k2-thinking"
|
| 663 |
_DEFAULT_MINIMAX_MODELS="minimax/MiniMax-M2.7,minimax/minimax-m1.5"
|
|
@@ -725,26 +725,40 @@ inject_provider_models_from_env() {
|
|
| 725 |
| awk 'NF' \
|
| 726 |
| jq -R . \
|
| 727 |
| jq -s --arg provider "$provider" '
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 740 |
end
|
|
|
|
|
|
|
| 741 |
else
|
| 742 |
-
|
| 743 |
-
end
|
| 744 |
-
)
|
| 745 |
| map({id: ., name: .})
|
| 746 |
| unique_by(.id)')
|
| 747 |
-
|
| 748 |
# Build provider patch: always inject models; conditionally inject apiKey, baseUrl, api.
|
| 749 |
# Existing saved config wins on merge (see config-patch jq below), so this only fills
|
| 750 |
# in missing fields β it never overwrites what the user already configured manually.
|
|
@@ -1414,32 +1428,63 @@ if [ -n "${WEBHOOK_URL:-}" ]; then
|
|
| 1414 |
fi
|
| 1415 |
|
| 1416 |
# ββ Trap SIGTERM for graceful shutdown ββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1417 |
graceful_shutdown() {
|
| 1418 |
echo "Shutting down..."
|
| 1419 |
if [ -f "/home/node/app/openclaw-sync.py" ] && [ -n "${HF_TOKEN:-}" ]; then
|
| 1420 |
echo "Saving state before exit..."
|
| 1421 |
-
|
| 1422 |
-
# syncs. The loop holds the sync lock while uploading; if it is
|
| 1423 |
-
# mid-upload when sync-once-settled runs, the 8-second timeout fires
|
| 1424 |
-
# before the lock is released and the settled-sync is silently skipped.
|
| 1425 |
-
# Killing the loop first releases the lock immediately (fcntl.flock is
|
| 1426 |
-
# released on process exit) so sync-once-settled can acquire it cleanly.
|
| 1427 |
-
if [ -n "${SYNC_LOOP_PID:-}" ]; then
|
| 1428 |
-
kill "$SYNC_LOOP_PID" 2>/dev/null || true
|
| 1429 |
-
# Give Python a moment to flush and release the lock file.
|
| 1430 |
-
# Reduced from 0.5 s β 0.3 s to reclaim time for the upload.
|
| 1431 |
-
sleep 0.3
|
| 1432 |
-
fi
|
| 1433 |
# Pass 1: wait for config to settle then upload β avoids pushing a
|
| 1434 |
-
# half-written JSON config to the dataset.
|
| 1435 |
-
#
|
| 1436 |
-
#
|
| 1437 |
-
|
| 1438 |
echo "Warning: could not complete settled shutdown sync"
|
| 1439 |
# Pass 2: catch any writes that arrived after the settled sync completed.
|
| 1440 |
# BUG FIX: added timeout (previously unbounded) so HF container kill
|
| 1441 |
# can't interrupt a hung upload and lose all data silently.
|
| 1442 |
-
|
| 1443 |
echo "Warning: could not complete final shutdown sync"
|
| 1444 |
elif [ -f "/home/node/app/openclaw-sync.py" ]; then
|
| 1445 |
echo "HF_TOKEN not set; skipping shutdown backup sync."
|
|
@@ -2439,27 +2484,21 @@ sync_before_gateway_restart() {
|
|
| 2439 |
|
| 2440 |
echo "Gateway stopped; saving latest OpenClaw state before restart..."
|
| 2441 |
# Kill the background sync loop before syncing β same reason as in
|
| 2442 |
-
# graceful_shutdown: the loop holds the fcntl.flock while uploading
|
| 2443 |
-
#
|
| 2444 |
-
#
|
| 2445 |
-
|
| 2446 |
-
# is released on process exit) so sync-once-settled can acquire it cleanly.
|
| 2447 |
-
if [ -n "${SYNC_LOOP_PID:-}" ]; then
|
| 2448 |
-
kill "$SYNC_LOOP_PID" 2>/dev/null || true
|
| 2449 |
-
sleep 0.3
|
| 2450 |
-
SYNC_LOOP_PID=""
|
| 2451 |
-
fi
|
| 2452 |
# Pass 1: wait for config to settle then upload β avoids pushing a
|
| 2453 |
# half-written JSON config to the dataset. Timeout added (was unbounded)
|
| 2454 |
# so a slow HF upload cannot stall gateway restarts indefinitely.
|
| 2455 |
-
|
| 2456 |
echo "Warning: could not sync settled state before gateway restart"
|
| 2457 |
# Pass 2: catch any writes that arrived after the settled sync completed
|
| 2458 |
# (e.g. session state flushed by OpenClaw just before exit).
|
| 2459 |
# BUG FIX: this second pass was missing from the restart path (it existed
|
| 2460 |
# only in graceful_shutdown), so last-second writes were lost on watchdog
|
| 2461 |
# restarts β causing sessions to disappear after OpenClaw auto-restarts.
|
| 2462 |
-
|
| 2463 |
echo "Warning: could not complete final sync before gateway restart"
|
| 2464 |
}
|
| 2465 |
|
|
|
|
| 357 |
mistral-*|codestral-*|devstral-*|voxtral-*)
|
| 358 |
export MISTRAL_API_KEY="$LLM_API_KEY"
|
| 359 |
echo "Note: bare Mistral model '$LLM_MODEL' detected; mapped LLM_API_KEY β MISTRAL_API_KEY. Use 'mistral/${LLM_MODEL}' prefix to be explicit." ;;
|
| 360 |
+
moonshotai|meta-llama|deepseek-ai|minimaxai|MiniMaxAI|minimax-ai|Qwen|zai-org|mistralai|google)
|
| 361 |
echo "Warning: LLM_MODEL='$LLM_MODEL' uses sub-provider prefix '$LLM_PROVIDER'. This is a router-namespaced model (Together/OpenRouter). Mapping LLM_API_KEY β TOGETHER_API_KEY. If using OpenRouter, also set OPENROUTER_API_KEY as a separate secret."
|
| 362 |
export TOGETHER_API_KEY="${TOGETHER_API_KEY:-$LLM_API_KEY}" ;;
|
| 363 |
# ββ Fallback: Anthropic (default) ββ
|
| 364 |
*)
|
| 365 |
+
echo "Warning: Unknown provider prefix '$LLM_PROVIDER' in LLM_MODEL='$LLM_MODEL'. Defaulting to ANTHROPIC_API_KEY. If using a router-namespaced model (e.g. moonshotai/kimi-k2.6), set TOGETHER_API_KEY or OPENROUTER_API_KEY as a separate secret."
|
| 366 |
export ANTHROPIC_API_KEY="$LLM_API_KEY"
|
| 367 |
;;
|
| 368 |
esac
|
|
|
|
| 657 |
_DEFAULT_COHERE_MODELS="cohere/command-a,cohere/command-a-03-2025,cohere/command-a-reasoning-08-2025,cohere/command-r-plus-08-2024"
|
| 658 |
_DEFAULT_TOGETHER_MODELS="together/moonshotai/Kimi-K2.6,together/deepseek-ai/DeepSeek-V4-Pro,together/Qwen/Qwen3-235B-A22B-Instruct-2507-tput,together/meta-llama/Llama-3.3-70B-Instruct-Turbo"
|
| 659 |
_DEFAULT_CEREBRAS_MODELS="cerebras/zai-glm-4.7,cerebras/gpt-oss-120b,cerebras/deepseek-r1,cerebras/qwen3-32b"
|
| 660 |
+
_DEFAULT_NVIDIA_MODELS="nvidia/nemotron-3-super-120b-a12b,deepseek-ai/deepseek-v4-flash,qwen/qwen3-coder-480b-a35b-instruct,moonshotai/kimi-k2.6,minimaxai/minimax-m2.7,openai/gpt-oss-120b,z-ai/glm5.1,stepfun-ai/step-3.7-flash"
|
| 661 |
_DEFAULT_KILOCODE_MODELS="kilocode/kilo/auto,kilocode/anthropic/claude-opus-4.7,kilocode/openai/gpt-5.4,kilocode/google/gemini-2.5-pro"
|
| 662 |
_DEFAULT_MOONSHOT_MODELS="moonshot/kimi-k2.6,moonshot/kimi-k2.6-thinking,moonshot/kimi-k2-thinking"
|
| 663 |
_DEFAULT_MINIMAX_MODELS="minimax/MiniMax-M2.7,minimax/minimax-m1.5"
|
|
|
|
| 725 |
| awk 'NF' \
|
| 726 |
| jq -R . \
|
| 727 |
| jq -s --arg provider "$provider" '
|
| 728 |
+
def strip_outer_provider:
|
| 729 |
+
(split("/") | .[1:] | join("/"));
|
| 730 |
+
def normalize_provider_model:
|
| 731 |
+
# OpenClaw stores provider-local model ids inside
|
| 732 |
+
# models.providers.<provider>.models[].id. The user-facing model ref
|
| 733 |
+
# is composed later as <provider>/<id>. Most providers use ids like
|
| 734 |
+
# "gpt-5.4", so "openai/gpt-5.4" becomes "gpt-5.4" here.
|
| 735 |
+
# NVIDIA is special because some valid NVIDIA API ids themselves
|
| 736 |
+
# start with "nvidia/" (for example nvidia/nemotron-...). Keep those
|
| 737 |
+
# single-prefix ids intact, but strip an outer OpenClaw provider prefix
|
| 738 |
+
# from full refs like "nvidia/deepseek-ai/..." or legacy
|
| 739 |
+
# "nvidia/nvidia/nemotron-..." so the provider sends the exact NVIDIA
|
| 740 |
+
# documented model id to integrate.api.nvidia.com.
|
| 741 |
+
if $provider == "nvidia" then
|
| 742 |
+
if startswith("nvidia/nvidia/")
|
| 743 |
+
or startswith("nvidia/deepseek-ai/")
|
| 744 |
+
or startswith("nvidia/qwen/")
|
| 745 |
+
or startswith("nvidia/moonshotai/")
|
| 746 |
+
or startswith("nvidia/minimaxai/")
|
| 747 |
+
or startswith("nvidia/openai/")
|
| 748 |
+
or startswith("nvidia/z-ai/")
|
| 749 |
+
or startswith("nvidia/stepfun-ai/") then
|
| 750 |
+
strip_outer_provider
|
| 751 |
+
else
|
| 752 |
+
.
|
| 753 |
end
|
| 754 |
+
elif startswith($provider + "/") then
|
| 755 |
+
strip_outer_provider
|
| 756 |
else
|
| 757 |
+
.
|
| 758 |
+
end;
|
| 759 |
+
map(normalize_provider_model)
|
| 760 |
| map({id: ., name: .})
|
| 761 |
| unique_by(.id)')
|
|
|
|
| 762 |
# Build provider patch: always inject models; conditionally inject apiKey, baseUrl, api.
|
| 763 |
# Existing saved config wins on merge (see config-patch jq below), so this only fills
|
| 764 |
# in missing fields — it never overwrites what the user already configured manually.
|
|
|
|
| 1428 |
fi
|
| 1429 |
|
| 1430 |
# ── Trap SIGTERM for graceful shutdown ──
|
| 1431 |
+
#######################################
# Stop the background workspace sync loop and wait for it to exit.
# Sends SIGTERM, polls up to 10 times (0.2 s apart) for the process to
# disappear, then falls back to SIGKILL.
# Globals:   SYNC_LOOP_PID (read; cleared once the process is gone or killed)
# Arguments: none
# Outputs:   a warning on stdout when SIGKILL has to be used
# Returns:   0
#######################################
stop_background_sync_loop() {
  [ -n "${SYNC_LOOP_PID:-}" ] || return 0

  # Already gone: just forget the PID.
  if ! kill -0 "$SYNC_LOOP_PID" 2>/dev/null; then
    SYNC_LOOP_PID=""
    return 0
  fi

  kill "$SYNC_LOOP_PID" 2>/dev/null || true
  # Wait for the Python process to actually exit so its fcntl lock is released.
  # A fixed short sleep was not enough when the process was inside a HF upload,
  # which made the following one-shot syncs burn their timeout waiting on the
  # lock and left sessions/config changes unsaved.
  #
  # The counter is function-local so a same-named global is never clobbered.
  local _sync_stop_i
  for _sync_stop_i in 1 2 3 4 5 6 7 8 9 10; do
    if ! kill -0 "$SYNC_LOOP_PID" 2>/dev/null; then
      SYNC_LOOP_PID=""
      return 0
    fi
    sleep 0.2
  done

  echo "Warning: workspace sync loop did not stop after SIGTERM; forcing it to release the sync lock."
  kill -9 "$SYNC_LOOP_PID" 2>/dev/null || true
  sleep 0.2
  SYNC_LOOP_PID=""
}
|
| 1459 |
+
|
| 1460 |
+
#######################################
# Run one openclaw-sync.py sub-command, optionally bounded by `timeout`.
# Arguments:
#   $1 - outer time limit in seconds; the literal "0" disables `timeout`
#   $2 - openclaw-sync.py sub-command name
#   $3 - value exported to the command as SYNC_LOCK_TIMEOUT
# Returns:   exit status of the command that was run
#######################################
run_openclaw_sync_with_timeout() {
  local outer_limit="$1"
  local sync_cmd="$2"
  local lock_wait="$3"

  case "$outer_limit" in
    0)
      env SYNC_LOCK_TIMEOUT="$lock_wait" \
        python3 /home/node/app/openclaw-sync.py "$sync_cmd"
      ;;
    *)
      timeout --kill-after=10s "${outer_limit}s" env SYNC_LOCK_TIMEOUT="$lock_wait" \
        python3 /home/node/app/openclaw-sync.py "$sync_cmd"
      ;;
  esac
}
|
| 1472 |
+
|
| 1473 |
graceful_shutdown() {
|
| 1474 |
echo "Shutting down..."
|
| 1475 |
if [ -f "/home/node/app/openclaw-sync.py" ] && [ -n "${HF_TOKEN:-}" ]; then
|
| 1476 |
echo "Saving state before exit..."
|
| 1477 |
+
stop_background_sync_loop
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1478 |
# Pass 1: wait for config to settle then upload β avoids pushing a
|
| 1479 |
+
# half-written JSON config to the dataset. The per-command lock wait is
|
| 1480 |
+
# intentionally shorter than the outer timeout so a stuck lock fails clearly
|
| 1481 |
+
# and leaves time for the final catch-up pass.
|
| 1482 |
+
run_openclaw_sync_with_timeout "${SYNC_SETTLED_TIMEOUT:-120}" sync-once-settled "${SYNC_ONE_SHOT_LOCK_TIMEOUT:-5}" || \
|
| 1483 |
echo "Warning: could not complete settled shutdown sync"
|
| 1484 |
# Pass 2: catch any writes that arrived after the settled sync completed.
|
| 1485 |
# BUG FIX: added timeout (previously unbounded) so HF container kill
|
| 1486 |
# can't interrupt a hung upload and lose all data silently.
|
| 1487 |
+
run_openclaw_sync_with_timeout "${SYNC_FINAL_TIMEOUT:-120}" sync-once "${SYNC_ONE_SHOT_LOCK_TIMEOUT:-5}" || \
|
| 1488 |
echo "Warning: could not complete final shutdown sync"
|
| 1489 |
elif [ -f "/home/node/app/openclaw-sync.py" ]; then
|
| 1490 |
echo "HF_TOKEN not set; skipping shutdown backup sync."
|
|
|
|
| 2484 |
|
| 2485 |
echo "Gateway stopped; saving latest OpenClaw state before restart..."
|
| 2486 |
# Kill the background sync loop before syncing β same reason as in
|
| 2487 |
+
# graceful_shutdown: the loop holds the fcntl.flock while uploading. Wait for
|
| 2488 |
+
# it to exit (and force-kill as a last resort) before the one-shot syncs so
|
| 2489 |
+
# gateway restarts cannot hang behind a stale lock.
|
| 2490 |
+
stop_background_sync_loop
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2491 |
# Pass 1: wait for config to settle then upload β avoids pushing a
|
| 2492 |
# half-written JSON config to the dataset. Timeout added (was unbounded)
|
| 2493 |
# so a slow HF upload cannot stall gateway restarts indefinitely.
|
| 2494 |
+
run_openclaw_sync_with_timeout "${SYNC_SETTLED_TIMEOUT:-120}" sync-once-settled "${SYNC_ONE_SHOT_LOCK_TIMEOUT:-5}" || \
|
| 2495 |
echo "Warning: could not sync settled state before gateway restart"
|
| 2496 |
# Pass 2: catch any writes that arrived after the settled sync completed
|
| 2497 |
# (e.g. session state flushed by OpenClaw just before exit).
|
| 2498 |
# BUG FIX: this second pass was missing from the restart path (it existed
|
| 2499 |
# only in graceful_shutdown), so last-second writes were lost on watchdog
|
| 2500 |
# restarts β causing sessions to disappear after OpenClaw auto-restarts.
|
| 2501 |
+
run_openclaw_sync_with_timeout "${SYNC_FINAL_TIMEOUT:-120}" sync-once "${SYNC_ONE_SHOT_LOCK_TIMEOUT:-5}" || \
|
| 2502 |
echo "Warning: could not complete final sync before gateway restart"
|
| 2503 |
}
|
| 2504 |
|