anurag008w committed on
Commit
9a6444d
·
1 Parent(s): 2eb43c4

fix: merge injected model providers using wildcard in start.sh

Browse files
Files changed (6) hide show
  1. .env.example +5 -1
  2. README.md +3 -1
  3. env-builder.js +20 -1
  4. multi-provider-key-rotator.cjs +51 -17
  5. openclaw-sync.py +18 -10
  6. start.sh +51 -8
.env.example CHANGED
@@ -169,7 +169,7 @@ LLM_API_KEY_FALLBACK_ENABLED=true
169
  # KEY_MAX_INFLIGHT_PER_KEY=3
170
  #
171
  # Auto-retry count for fetch requests on retryable errors/statuses.
172
- # Total attempts = 1 + retries (GET/HEAD/OPTIONS only to avoid duplicate writes).
173
  # KEY_FETCH_MAX_RETRIES=2
174
  #
175
  # Base delay (ms) between auto-retries. Exponential per attempt; also respects
@@ -180,6 +180,10 @@ LLM_API_KEY_FALLBACK_ENABLED=true
180
  # KEY_ROTATOR_DIAGNOSTICS=true
181
  # KEY_ROTATOR_DIAGNOSTICS_INTERVAL_MS=60000
182
  #
 
 
 
 
183
  # Note: This rotator does not re-send the same failed request automatically.
184
  # It blacklists/penalizes the failed key so the *next* request prefers
185
  # healthier keys.
 
169
  # KEY_MAX_INFLIGHT_PER_KEY=3
170
  #
171
  # Auto-retry count for fetch requests on retryable errors/statuses.
172
+ # Total attempts = 1 + retries (GET/HEAD/OPTIONS/POST).
173
  # KEY_FETCH_MAX_RETRIES=2
174
  #
175
  # Base delay (ms) between auto-retries. Exponential per attempt; also respects
 
180
  # KEY_ROTATOR_DIAGNOSTICS=true
181
  # KEY_ROTATOR_DIAGNOSTICS_INTERVAL_MS=60000
182
  #
183
+ # Log verbosity controls for rotator internals.
184
+ # KEY_ROTATOR_LOG_LEVEL=info
185
+ # KEY_ROTATOR_VERBOSE_PICKS=false
186
+ #
187
  # Note: This rotator does not re-send the same failed request automatically.
188
  # It blacklists/penalizes the failed key so the *next* request prefers
189
  # healthier keys.
README.md CHANGED
@@ -280,10 +280,12 @@ Optional tuning:
280
  - `KEY_PERM_SUSPEND_MS` (default `57600000`) — long suspend duration for exhausted/auth-invalid keys (**capped at 16h max**).
281
  - `KEY_FAILURE_DECAY_MS` (default `900000`) — recent-failure decay window used to deprioritize keys.
282
  - `KEY_MAX_INFLIGHT_PER_KEY` (default `3`) — soft concurrent request cap per key.
283
- - `KEY_FETCH_MAX_RETRIES` (default `2`) — auto-retry count for retryable failures on **GET/HEAD/OPTIONS** with a different key.
284
  - `KEY_FETCH_RETRY_BASE_DELAY_MS` (default `250`) — base delay for retry backoff (respects `Retry-After`, capped to 10s).
285
  - `KEY_ROTATOR_DIAGNOSTICS=true` — emit periodic provider/key health snapshots.
286
  - `KEY_ROTATOR_DIAGNOSTICS_INTERVAL_MS` (default `60000`) — diagnostics interval.
 
 
287
 
288
  Supported per-provider variables: `ANTHROPIC_API_KEYS`, `OPENAI_API_KEYS`, `GEMINI_API_KEYS`, `DEEPSEEK_API_KEYS`, `GROQ_API_KEYS`, `MISTRAL_API_KEYS`, `OPENROUTER_API_KEYS`, `XAI_API_KEYS`, `NVIDIA_API_KEYS`, `COHERE_API_KEYS`, `TOGETHER_API_KEYS`, `CEREBRAS_API_KEYS`, and more — see `.env.example` for the full list.
289
 
 
280
  - `KEY_PERM_SUSPEND_MS` (default `57600000`) — long suspend duration for exhausted/auth-invalid keys (**capped at 16h max**).
281
  - `KEY_FAILURE_DECAY_MS` (default `900000`) — recent-failure decay window used to deprioritize keys.
282
  - `KEY_MAX_INFLIGHT_PER_KEY` (default `3`) — soft concurrent request cap per key.
283
+ - `KEY_FETCH_MAX_RETRIES` (default `2`) — auto-retry count for retryable failures on **GET/HEAD/OPTIONS/POST** with a different key.
284
  - `KEY_FETCH_RETRY_BASE_DELAY_MS` (default `250`) — base delay for retry backoff (respects `Retry-After`, capped to 10s).
285
  - `KEY_ROTATOR_DIAGNOSTICS=true` — emit periodic provider/key health snapshots.
286
  - `KEY_ROTATOR_DIAGNOSTICS_INTERVAL_MS` (default `60000`) — diagnostics interval.
287
+ - `KEY_ROTATOR_LOG_LEVEL` (`info`/`debug`/`silent`, default `info`) — controls rotator log verbosity.
288
+ - `KEY_ROTATOR_VERBOSE_PICKS` (`true`/`false`, default `false`) — enable per-request key-pick logs (best with `KEY_ROTATOR_LOG_LEVEL=debug`).
289
 
290
  Supported per-provider variables: `ANTHROPIC_API_KEYS`, `OPENAI_API_KEYS`, `GEMINI_API_KEYS`, `DEEPSEEK_API_KEYS`, `GROQ_API_KEYS`, `MISTRAL_API_KEYS`, `OPENROUTER_API_KEYS`, `XAI_API_KEYS`, `NVIDIA_API_KEYS`, `COHERE_API_KEYS`, `TOGETHER_API_KEYS`, `CEREBRAS_API_KEYS`, and more — see `.env.example` for the full list.
291
 
env-builder.js CHANGED
@@ -529,7 +529,7 @@ const FIELDS = [
529
  "g": "Plugins",
530
  "icon": "πŸ”„",
531
  "k": "KEY_FETCH_MAX_RETRIES",
532
- "lbl": "Auto-retries for retryable failures (GET/HEAD/OPTIONS only)",
533
  "type": "text",
534
  "ph": "2",
535
  "tag": "advanced"
@@ -543,6 +543,25 @@ const FIELDS = [
543
  "ph": "250",
544
  "tag": "advanced"
545
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
546
  {
547
  "g": "Plugins",
548
  "icon": "πŸ“Š",
 
529
  "g": "Plugins",
530
  "icon": "πŸ”„",
531
  "k": "KEY_FETCH_MAX_RETRIES",
532
+ "lbl": "Auto-retries for retryable failures (GET/HEAD/OPTIONS/POST)",
533
  "type": "text",
534
  "ph": "2",
535
  "tag": "advanced"
 
543
  "ph": "250",
544
  "tag": "advanced"
545
  },
546
+
547
+ {
548
+ "g": "Plugins",
549
+ "icon": "🧾",
550
+ "k": "KEY_ROTATOR_LOG_LEVEL",
551
+ "lbl": "Key-rotator log level (info/debug/silent)",
552
+ "type": "text",
553
+ "ph": "info",
554
+ "tag": "advanced"
555
+ },
556
+ {
557
+ "g": "Plugins",
558
+ "icon": "🧾",
559
+ "k": "KEY_ROTATOR_VERBOSE_PICKS",
560
+ "lbl": "Verbose per-request key pick logs (use with debug)",
561
+ "type": "toggle",
562
+ "ph": "false",
563
+ "tag": "advanced"
564
+ },
565
  {
566
  "g": "Plugins",
567
  "icon": "πŸ“Š",
multi-provider-key-rotator.cjs CHANGED
@@ -13,13 +13,18 @@
13
  * KEY_BLACKLIST_COOLDOWN_MS base backoff ms (default 60 000)
14
  * KEY_MAX_STRIKES failures before perm (default 3)
15
  * LLM_API_KEY_FALLBACK_ENABLED true/false (default true)
 
 
16
  */
17
 
18
  const http = require('node:http');
19
  const https = require('node:https');
20
 
21
- const log = (...a) => console.error(...a);
22
- const warn = (...a) => console.warn(...a);
 
 
 
23
 
24
  // ─── Config ──────────────────────────────────────────────────────────────────
25
 
@@ -175,7 +180,7 @@ function isActive(p, key) {
175
  if (ks.blacklistedUntil === 0) return true; // not blacklisted
176
  if (Date.now() >= ks.blacklistedUntil) {
177
  ks.blacklistedUntil = 0; // expired β†’ back in pool
178
- log(`[key-rotator] ${p.name}: ...${key.slice(-6)} back in pool`);
179
  return true;
180
  }
181
  return false;
@@ -209,12 +214,27 @@ function recordFailure(p, key) {
209
  const jitter = 1 + ((Math.random() * 2 - 1) * (COOLDOWN_JITTER_PCT / 100));
210
  cooldown = Math.max(1000, Math.round(cooldown * jitter));
211
  const secs = Math.round(cooldown / 1000);
212
- log(`[key-rotator] ${p.name}: ...${key.slice(-6)} strike ${ks.strikes}/${MAX_STRIKES} β€” backoff ${secs}s`);
213
  }
214
 
215
  ks.blacklistedUntil = Date.now() + cooldown;
216
  }
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  /**
219
  * Called on any 2xx/3xx response — resets the key's strike counter.
220
  */
@@ -223,12 +243,12 @@ function recordSuccess(p, key) {
223
  if (ks && ks.strikes > 0) {
224
  ks.strikes = 0;
225
  ks.lastFailureAt = 0;
226
- log(`[key-rotator] ${p.name}: ...${key.slice(-6)} recovered β€” strikes reset`);
227
  }
228
  }
229
 
230
  function classifyRetryableFailure(status, errCode) {
231
- const retryableStatus = new Set([408, 425, 429, 500, 502, 503, 504, 529, 402]);
232
  const retryableErrorCodes = new Set([
233
  'ECONNRESET', 'ETIMEDOUT', 'EAI_AGAIN', 'ENOTFOUND',
234
  'ECONNREFUSED', 'EPIPE',
@@ -280,7 +300,7 @@ function nextKey(p) {
280
  const inflight = p.inFlight.get(key) || 0;
281
  if (inflight < MAX_INFLIGHT_PER_KEY) {
282
  p.idx = (i + 1) % total; // next call starts AFTER the key we just picked
283
- log(`[key-rotator] ${p.name}: picked ...${key.slice(-6)} inflight=${inflight + 1}/${MAX_INFLIGHT_PER_KEY}`);
284
  return key;
285
  }
286
  if (!bestPick) bestPick = { i, key, inflight, score: Number.POSITIVE_INFINITY };
@@ -353,10 +373,20 @@ function handleStatus(p, key, status) {
353
  warn(`[key-rotator] ${p.name}: ...${key.slice(-6)} auth-failed (${status}) β€” suspended for ${formatHours(PERM_SUSPEND_MS)} h`);
354
  return;
355
  }
356
- if (classifyRetryableFailure(status)) {
 
357
  recordFailure(p, key);
358
- warn(`[key-rotator] ${p.name}: retryable status=${status} on ...${key.slice(-6)}`);
359
- } else if (status >= 200 && status < 400) {
 
 
 
 
 
 
 
 
 
360
  recordSuccess(p, key);
361
  }
362
  }
@@ -364,9 +394,11 @@ function handleStatus(p, key, status) {
364
  function handleTransportError(p, key, err) {
365
  if (!p || !key) return;
366
  const code = err?.code ? String(err.code).toUpperCase() : '';
367
- if (classifyRetryableFailure(undefined, code)) {
 
 
368
  recordFailure(p, key);
369
- warn(`[key-rotator] ${p.name}: retryable network code=${code} on ...${key.slice(-6)}`);
370
  }
371
  }
372
 
@@ -428,7 +460,8 @@ function patchFetch() {
428
  const baseRequest = new Request(input, init);
429
  const method = String(baseRequest.method || 'GET').toUpperCase();
430
  const replaySafe = method === 'GET' || method === 'HEAD' || method === 'OPTIONS';
431
- const maxAttempts = replaySafe ? 1 + FETCH_MAX_RETRIES : 1;
 
432
  const triedKeys = new Set();
433
  let lastErr = null;
434
  let lastResponse = null;
@@ -478,7 +511,7 @@ function patchFetch() {
478
  10_000,
479
  Math.max(retryAfterMs, FETCH_RETRY_BASE_DELAY_MS * Math.pow(2, attempt - 1)),
480
  );
481
- warn(`[key-rotator] ${provider.name}: fetch retry ${attempt}/${maxAttempts - 1} after status=${response.status}`);
482
  await sleep(backoffMs);
483
  continue;
484
  }
@@ -488,10 +521,11 @@ function patchFetch() {
488
  try { handleTransportError(provider, usedKey, err); } catch (_) {}
489
  try { endInFlight(provider, usedKey); } catch (_) {}
490
  const code = err?.code ? String(err.code).toUpperCase() : '';
491
- const shouldRetry = attempt < maxAttempts && classifyRetryableFailure(undefined, code);
 
492
  if (shouldRetry) {
493
  const backoffMs = Math.min(10_000, FETCH_RETRY_BASE_DELAY_MS * Math.pow(2, attempt - 1));
494
- warn(`[key-rotator] ${provider.name}: fetch retry ${attempt}/${maxAttempts - 1} after network code=${code || 'unknown'}`);
495
  await sleep(backoffMs);
496
  continue;
497
  }
@@ -578,4 +612,4 @@ patchHttpModule(http);
578
  patchHttpModule(https);
579
  startDiagnostics();
580
 
581
- log(`[key-rotator] loaded β€” cooldown base:${BASE_COOLDOWN_MS/1000}s max-strikes:${MAX_STRIKES} perm-suspend:${formatHours(PERM_SUSPEND_MS)}h (cap 16h) max-inflight-per-key:${MAX_INFLIGHT_PER_KEY} diagnostics:${DIAGNOSTICS_ENABLED ? 'on' : 'off'}`);
 
13
  * KEY_BLACKLIST_COOLDOWN_MS base backoff ms (default 60 000)
14
  * KEY_MAX_STRIKES failures before perm (default 3)
15
  * LLM_API_KEY_FALLBACK_ENABLED true/false (default true)
16
+ * KEY_ROTATOR_LOG_LEVEL info/debug/silent (default info)
17
+ * KEY_ROTATOR_VERBOSE_PICKS true/false (default false)
18
  */
19
 
20
  const http = require('node:http');
21
  const https = require('node:https');
22
 
23
+ const LOG_LEVEL = String(process.env.KEY_ROTATOR_LOG_LEVEL || 'info').trim().toLowerCase();
24
+ const VERBOSE_PICKS = /^(1|true|yes|on)$/i.test(String(process.env.KEY_ROTATOR_VERBOSE_PICKS || '').trim());
25
+ const log = (...a) => { if (LOG_LEVEL !== 'silent') console.error(...a); };
26
+ const warn = (...a) => { if (LOG_LEVEL !== 'silent') console.warn(...a); };
27
+ const debug = (...a) => { if (LOG_LEVEL === 'debug') console.error(...a); };
28
 
29
  // ─── Config ──────────────────────────────────────────────────────────────────
30
 
 
180
  if (ks.blacklistedUntil === 0) return true; // not blacklisted
181
  if (Date.now() >= ks.blacklistedUntil) {
182
  ks.blacklistedUntil = 0; // expired β†’ back in pool
183
+ debug(`[key-rotator] ${p.name}: ...${key.slice(-6)} back in pool`);
184
  return true;
185
  }
186
  return false;
 
214
  const jitter = 1 + ((Math.random() * 2 - 1) * (COOLDOWN_JITTER_PCT / 100));
215
  cooldown = Math.max(1000, Math.round(cooldown * jitter));
216
  const secs = Math.round(cooldown / 1000);
217
+ debug(`[key-rotator] ${p.name}: ...${key.slice(-6)} strike ${ks.strikes}/${MAX_STRIKES} β€” backoff ${secs}s`);
218
  }
219
 
220
  ks.blacklistedUntil = Date.now() + cooldown;
221
  }
222
 
223
+ /**
224
+ * Called on transient retryable failures (non-quota/rate):
225
+ * applies short cooldown without incrementing strikes.
226
+ */
227
+ function recordTransientFailure(p, key) {
228
+ let ks = p.keyState.get(key);
229
+ if (!ks) { ks = makeKeyState(); p.keyState.set(key, ks); }
230
+ ks.lastFailureAt = Date.now();
231
+ const jitter = 1 + ((Math.random() * 2 - 1) * (COOLDOWN_JITTER_PCT / 100));
232
+ const cooldown = Math.max(1000, Math.round(BASE_COOLDOWN_MS * jitter));
233
+ ks.blacklistedUntil = Math.max(ks.blacklistedUntil || 0, Date.now() + cooldown);
234
+ const secs = Math.round(cooldown / 1000);
235
+ debug(`[key-rotator] ${p.name}: ...${key.slice(-6)} transient backoff ${secs}s (strikes unchanged)`);
236
+ }
237
+
238
  /**
239
  * Called on any 2xx/3xx response — resets the key's strike counter.
240
  */
 
243
  if (ks && ks.strikes > 0) {
244
  ks.strikes = 0;
245
  ks.lastFailureAt = 0;
246
+ debug(`[key-rotator] ${p.name}: ...${key.slice(-6)} recovered β€” strikes reset`);
247
  }
248
  }
249
 
250
  function classifyRetryableFailure(status, errCode) {
251
+ const retryableStatus = new Set([402, 408, 425, 429, 500, 502, 503, 504, 520, 521, 522, 523, 524, 529]);
252
  const retryableErrorCodes = new Set([
253
  'ECONNRESET', 'ETIMEDOUT', 'EAI_AGAIN', 'ENOTFOUND',
254
  'ECONNREFUSED', 'EPIPE',
 
300
  const inflight = p.inFlight.get(key) || 0;
301
  if (inflight < MAX_INFLIGHT_PER_KEY) {
302
  p.idx = (i + 1) % total; // next call starts AFTER the key we just picked
303
+ if (VERBOSE_PICKS) debug(`[key-rotator] ${p.name}: picked ...${key.slice(-6)} inflight=${inflight + 1}/${MAX_INFLIGHT_PER_KEY}`);
304
  return key;
305
  }
306
  if (!bestPick) bestPick = { i, key, inflight, score: Number.POSITIVE_INFINITY };
 
373
  warn(`[key-rotator] ${p.name}: ...${key.slice(-6)} auth-failed (${status}) β€” suspended for ${formatHours(PERM_SUSPEND_MS)} h`);
374
  return;
375
  }
376
+
377
+ if (status === 429 || status === 402) {
378
  recordFailure(p, key);
379
+ warn(`[key-rotator] ${p.name}: quota/rate status=${status} on ...${key.slice(-6)}`);
380
+ return;
381
+ }
382
+
383
+ if (classifyRetryableFailure(status)) {
384
+ recordTransientFailure(p, key);
385
+ warn(`[key-rotator] ${p.name}: transient status=${status} on ...${key.slice(-6)}`);
386
+ return;
387
+ }
388
+
389
+ if (status >= 200 && status < 400) {
390
  recordSuccess(p, key);
391
  }
392
  }
 
394
  function handleTransportError(p, key, err) {
395
  if (!p || !key) return;
396
  const code = err?.code ? String(err.code).toUpperCase() : '';
397
+ const name = String(err?.name || '');
398
+ const retryable = classifyRetryableFailure(undefined, code) || name === 'AbortError';
399
+ if (retryable) {
400
  recordFailure(p, key);
401
+ warn(`[key-rotator] ${p.name}: retryable network ${name || 'Error'}${code ? ` code=${code}` : ''} on ...${key.slice(-6)}`);
402
  }
403
  }
404
 
 
460
  const baseRequest = new Request(input, init);
461
  const method = String(baseRequest.method || 'GET').toUpperCase();
462
  const replaySafe = method === 'GET' || method === 'HEAD' || method === 'OPTIONS';
463
+ const retryEligible = replaySafe || method === 'POST';
464
+ const maxAttempts = retryEligible ? 1 + FETCH_MAX_RETRIES : 1;
465
  const triedKeys = new Set();
466
  let lastErr = null;
467
  let lastResponse = null;
 
511
  10_000,
512
  Math.max(retryAfterMs, FETCH_RETRY_BASE_DELAY_MS * Math.pow(2, attempt - 1)),
513
  );
514
+ warn(`[key-rotator] ${provider.name}: fetch retry ${attempt}/${maxAttempts - 1} after status=${response.status} method=${method}`);
515
  await sleep(backoffMs);
516
  continue;
517
  }
 
521
  try { handleTransportError(provider, usedKey, err); } catch (_) {}
522
  try { endInFlight(provider, usedKey); } catch (_) {}
523
  const code = err?.code ? String(err.code).toUpperCase() : '';
524
+ const isAbort = String(err?.name || '') === 'AbortError';
525
+ const shouldRetry = attempt < maxAttempts && (classifyRetryableFailure(undefined, code) || isAbort);
526
  if (shouldRetry) {
527
  const backoffMs = Math.min(10_000, FETCH_RETRY_BASE_DELAY_MS * Math.pow(2, attempt - 1));
528
+ warn(`[key-rotator] ${provider.name}: fetch retry ${attempt}/${maxAttempts - 1} after network ${isAbort ? 'AbortError' : `code=${code || 'unknown'}`} method=${method}`);
529
  await sleep(backoffMs);
530
  continue;
531
  }
 
612
  patchHttpModule(https);
613
  startDiagnostics();
614
 
615
+ log(`[key-rotator] loaded β€” cooldown base:${BASE_COOLDOWN_MS/1000}s max-strikes:${MAX_STRIKES} perm-suspend:${formatHours(PERM_SUSPEND_MS)}h (cap 16h) max-inflight-per-key:${MAX_INFLIGHT_PER_KEY} diagnostics:${DIAGNOSTICS_ENABLED ? 'on' : 'off'} log-level:${LOG_LEVEL} verbose-picks:${VERBOSE_PICKS ? 'on' : 'off'}`);
openclaw-sync.py CHANGED
@@ -140,7 +140,8 @@ def copy_state_entry_with_retry(source_path: Path, backup_path: Path, attempts:
140
  continue
141
  raise last_exc
142
 
143
- def snapshot_state_into_workspace() -> None:
 
144
  try:
145
  STATE_DIR.mkdir(parents=True, exist_ok=True)
146
  # Atomic snapshot: copy to a staging dir first, then rename.
@@ -185,6 +186,7 @@ def snapshot_state_into_workspace() -> None:
185
  # known-good version for only those entries (staging was seeded from
186
  # previous backup). This preserves forward progress for the rest.
187
  if skipped_entries:
 
188
  for name, entry_exc in skipped_entries:
189
  print(f"Warning: keeping previous state entry {name}: {entry_exc}")
190
  print(
@@ -200,10 +202,11 @@ def snapshot_state_into_workspace() -> None:
200
  if staging_dir.exists():
201
  shutil.rmtree(staging_dir, ignore_errors=True)
202
  print(f"Warning: could not snapshot OpenClaw state: {exc}")
 
203
 
204
  try:
205
  if not WHATSAPP_ENABLED:
206
- return
207
 
208
  STATE_DIR.mkdir(parents=True, exist_ok=True)
209
 
@@ -212,16 +215,16 @@ def snapshot_state_into_workspace() -> None:
212
  shutil.rmtree(WHATSAPP_BACKUP_DIR, ignore_errors=True)
213
  print("Removed backed-up WhatsApp credentials after reset request.")
214
  RESET_MARKER.unlink(missing_ok=True)
215
- return
216
 
217
  if not WHATSAPP_CREDS_DIR.exists():
218
- return
219
 
220
  file_count = count_files(WHATSAPP_CREDS_DIR)
221
  if file_count < 2:
222
  if file_count > 0:
223
  print(f"WhatsApp backup skipped: credentials incomplete ({file_count} files).")
224
- return
225
 
226
  WHATSAPP_BACKUP_DIR.parent.mkdir(parents=True, exist_ok=True)
227
  if WHATSAPP_BACKUP_DIR.exists():
@@ -229,6 +232,8 @@ def snapshot_state_into_workspace() -> None:
229
  shutil.copytree(WHATSAPP_CREDS_DIR, WHATSAPP_BACKUP_DIR)
230
  except Exception as exc:
231
  print(f"Warning: could not snapshot WhatsApp state: {exc}")
 
 
232
 
233
 
234
  def restore_embedded_state() -> None:
@@ -515,7 +520,7 @@ def _sync_once_unlocked(
515
  write_status("disabled", "HF_TOKEN is not configured.")
516
  return (last_fingerprint or "", last_marker or (0, 0, 0, ""))
517
 
518
- snapshot_state_into_workspace()
519
  repo_id = ensure_repo_exists()
520
  current_marker = metadata_marker(WORKSPACE)
521
  if last_marker is not None and current_marker == last_marker:
@@ -547,10 +552,13 @@ def _sync_once_unlocked(
547
  commit_message=f"HuggingClaw sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
548
  ignore_patterns=[".git/*", ".git"],
549
  )
550
- try:
551
- prune_remote_deleted_files(repo_id, snapshot_dir)
552
- except Exception as prune_exc:
553
- print(f"Warning: could not prune stale remote files: {prune_exc}")
 
 
 
554
  finally:
555
  shutil.rmtree(snapshot_dir, ignore_errors=True)
556
 
 
140
  continue
141
  raise last_exc
142
 
143
+ def snapshot_state_into_workspace() -> bool:
144
+ had_copy_failures = False
145
  try:
146
  STATE_DIR.mkdir(parents=True, exist_ok=True)
147
  # Atomic snapshot: copy to a staging dir first, then rename.
 
186
  # known-good version for only those entries (staging was seeded from
187
  # previous backup). This preserves forward progress for the rest.
188
  if skipped_entries:
189
+ had_copy_failures = True
190
  for name, entry_exc in skipped_entries:
191
  print(f"Warning: keeping previous state entry {name}: {entry_exc}")
192
  print(
 
202
  if staging_dir.exists():
203
  shutil.rmtree(staging_dir, ignore_errors=True)
204
  print(f"Warning: could not snapshot OpenClaw state: {exc}")
205
+ had_copy_failures = True
206
 
207
  try:
208
  if not WHATSAPP_ENABLED:
209
+ return had_copy_failures
210
 
211
  STATE_DIR.mkdir(parents=True, exist_ok=True)
212
 
 
215
  shutil.rmtree(WHATSAPP_BACKUP_DIR, ignore_errors=True)
216
  print("Removed backed-up WhatsApp credentials after reset request.")
217
  RESET_MARKER.unlink(missing_ok=True)
218
+ return had_copy_failures
219
 
220
  if not WHATSAPP_CREDS_DIR.exists():
221
+ return had_copy_failures
222
 
223
  file_count = count_files(WHATSAPP_CREDS_DIR)
224
  if file_count < 2:
225
  if file_count > 0:
226
  print(f"WhatsApp backup skipped: credentials incomplete ({file_count} files).")
227
+ return had_copy_failures
228
 
229
  WHATSAPP_BACKUP_DIR.parent.mkdir(parents=True, exist_ok=True)
230
  if WHATSAPP_BACKUP_DIR.exists():
 
232
  shutil.copytree(WHATSAPP_CREDS_DIR, WHATSAPP_BACKUP_DIR)
233
  except Exception as exc:
234
  print(f"Warning: could not snapshot WhatsApp state: {exc}")
235
+ had_copy_failures = True
236
+ return had_copy_failures
237
 
238
 
239
  def restore_embedded_state() -> None:
 
520
  write_status("disabled", "HF_TOKEN is not configured.")
521
  return (last_fingerprint or "", last_marker or (0, 0, 0, ""))
522
 
523
+ had_snapshot_copy_failures = snapshot_state_into_workspace()
524
  repo_id = ensure_repo_exists()
525
  current_marker = metadata_marker(WORKSPACE)
526
  if last_marker is not None and current_marker == last_marker:
 
552
  commit_message=f"HuggingClaw sync {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
553
  ignore_patterns=[".git/*", ".git"],
554
  )
555
+ if had_snapshot_copy_failures:
556
+ print("Warning: skipping remote prune this pass because local state snapshot had copy failures.")
557
+ else:
558
+ try:
559
+ prune_remote_deleted_files(repo_id, snapshot_dir)
560
+ except Exception as prune_exc:
561
+ print(f"Warning: could not prune stale remote files: {prune_exc}")
562
  finally:
563
  shutil.rmtree(snapshot_dir, ignore_errors=True)
564
 
start.sh CHANGED
@@ -783,6 +783,36 @@ if [ "$WHATSAPP_ENABLED_NORMALIZED" = "true" ]; then
783
  CONFIG_JSON=$(echo "$CONFIG_JSON" | jq '.channels.whatsapp = {"dmPolicy": "pairing"}')
784
  fi
785
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
786
  # Write config
787
  EXISTING_CONFIG="/home/node/.openclaw/openclaw.json"
788
  WHATSAPP_CONFIG_ENABLED=false
@@ -820,8 +850,8 @@ if [ -f "$EXISTING_CONFIG" ]; then
820
  | if (($injectedModelsProviders | length) > 0) then
821
  ($injectedModelsProviders | to_entries) as $entries
822
  | reduce $entries[] as $e (.;
823
- .models.providers[$e.key] = ((.models.providers[$e.key] // {})
824
- + {models: (($e.value.models // []) | unique_by(.id))})
825
  )
826
  else
827
  .
@@ -852,16 +882,29 @@ if [ -f "$EXISTING_CONFIG" ]; then
852
  "$EXISTING_CONFIG" 2>/dev/null)
853
 
854
  if [ -n "$PATCHED" ]; then
855
- echo "$PATCHED" > "$EXISTING_CONFIG.tmp" \
856
- && mv "$EXISTING_CONFIG.tmp" "$EXISTING_CONFIG"
857
- echo "Config patched successfully."
 
 
 
 
858
  else
859
- echo "Patch failed β€” writing fresh config."
860
- echo "$CONFIG_JSON" > "$EXISTING_CONFIG"
 
 
 
 
 
 
 
 
 
861
  fi
862
  else
863
  echo "No restored config β€” writing fresh config..."
864
- echo "$CONFIG_JSON" > "$EXISTING_CONFIG"
865
  fi
866
  chmod 600 "$EXISTING_CONFIG"
867
 
 
783
  CONFIG_JSON=$(echo "$CONFIG_JSON" | jq '.channels.whatsapp = {"dmPolicy": "pairing"}')
784
  fi
785
 
786
+
787
+ validate_json_file() {
788
+ local file="$1"
789
+ [ -f "$file" ] || return 1
790
+ jq -e . "$file" >/dev/null 2>&1
791
+ }
792
+
793
+ write_json_atomic() {
794
+ local dest="$1"
795
+ local payload="$2"
796
+ local tmp
797
+ tmp="${dest}.tmp.$$"
798
+ printf '%s\n' "$payload" > "$tmp" || return 1
799
+ if ! jq -e . "$tmp" >/dev/null 2>&1; then
800
+ echo "ERROR: refusing to write invalid JSON to $dest" >&2
801
+ rm -f "$tmp"
802
+ return 1
803
+ fi
804
+ mv "$tmp" "$dest"
805
+ }
806
+
807
+ backup_config_copy() {
808
+ local src="$1"
809
+ [ -f "$src" ] || return 0
810
+ local stamp backup
811
+ stamp="$(date +%Y%m%d-%H%M%S)"
812
+ backup="${src}.backup.${stamp}"
813
+ cp -a "$src" "$backup" 2>/dev/null || cp "$src" "$backup" 2>/dev/null || true
814
+ }
815
+
816
  # Write config
817
  EXISTING_CONFIG="/home/node/.openclaw/openclaw.json"
818
  WHATSAPP_CONFIG_ENABLED=false
 
850
  | if (($injectedModelsProviders | length) > 0) then
851
  ($injectedModelsProviders | to_entries) as $entries
852
  | reduce $entries[] as $e (.;
853
+ (($desired.models.providers[$e.key] // {}) * {models: (($e.value.models // []) | unique_by(.id))}) as $desiredProvider
854
+ | .models.providers[$e.key] = ((.models.providers[$e.key] // {}) * $desiredProvider)
855
  )
856
  else
857
  .
 
882
  "$EXISTING_CONFIG" 2>/dev/null)
883
 
884
  if [ -n "$PATCHED" ]; then
885
+ backup_config_copy "$EXISTING_CONFIG"
886
+ if write_json_atomic "$EXISTING_CONFIG" "$PATCHED"; then
887
+ echo "Config patched successfully."
888
+ else
889
+ echo "Patch produced invalid JSON β€” writing fresh config."
890
+ write_json_atomic "$EXISTING_CONFIG" "$CONFIG_JSON" || { echo "ERROR: could not write valid fallback config" >&2; exit 1; }
891
+ fi
892
  else
893
+ echo "Patch failed."
894
+ # Validate only on patch failure (as requested). If restored config is invalid,
895
+ # quarantine it and regenerate from runtime config; otherwise keep it untouched.
896
+ if ! validate_json_file "$EXISTING_CONFIG"; then
897
+ echo "Restored config is invalid JSON β€” backing up and regenerating from runtime config."
898
+ cp "$EXISTING_CONFIG" "${EXISTING_CONFIG}.invalid.$(date +%Y%m%d-%H%M%S)" 2>/dev/null || true
899
+ backup_config_copy "$EXISTING_CONFIG"
900
+ write_json_atomic "$EXISTING_CONFIG" "$CONFIG_JSON" || { echo "ERROR: could not write valid fallback config" >&2; exit 1; }
901
+ else
902
+ echo "Patch failed but restored config is valid β€” keeping existing config unchanged."
903
+ fi
904
  fi
905
  else
906
  echo "No restored config β€” writing fresh config..."
907
+ write_json_atomic "$EXISTING_CONFIG" "$CONFIG_JSON" || { echo "ERROR: could not write valid config" >&2; exit 1; }
908
  fi
909
  chmod 600 "$EXISTING_CONFIG"
910