import type { OpenClawConfig } from "../../config/config.js";
import { normalizeProviderId } from "../model-selection.js";
import { logAuthProfileFailureStateChange } from "./state-observation.js";
import { saveAuthProfileStore, updateAuthProfileStoreWithLock } from "./store.js";
import type { AuthProfileFailureReason, AuthProfileStore, ProfileUsageStats } from "./types.js";

/**
 * Known failure reasons in tie-break order: when two reasons end up with the
 * same score in `resolveProfilesUnavailableReason`, the one listed earlier wins.
 */
const FAILURE_REASON_PRIORITY: AuthProfileFailureReason[] = [
  "auth_permanent",
  "auth",
  "billing",
  "format",
  "model_not_found",
  "overloaded",
  "timeout",
  "rate_limit",
  "unknown",
];

/** Fast membership check for recognised failure reasons. */
const FAILURE_REASON_SET = new Set<AuthProfileFailureReason>(FAILURE_REASON_PRIORITY);

/** Maps each failure reason to its index in `FAILURE_REASON_PRIORITY`. */
const FAILURE_REASON_ORDER = new Map<AuthProfileFailureReason, number>(
  FAILURE_REASON_PRIORITY.map((reason, index) => [reason, index]),
);

/**
 * Whether cooldown/disable tracking is skipped for the given provider.
 *
 * The provider id is normalized first; a missing provider is treated as "".
 */
function isAuthCooldownBypassedForProvider(provider: string | undefined): boolean {
  const normalized = normalizeProviderId(provider ?? "");
  return normalized === "openrouter" || normalized === "kilocode";
}

/**
 * Resolve the timestamp until which a profile is unusable.
 *
 * @returns The later of `cooldownUntil` / `disabledUntil`, considering only
 *   finite, positive numbers, or `null` when neither qualifies.
 */
export function resolveProfileUnusableUntil(
  stats: Pick<ProfileUsageStats, "cooldownUntil" | "disabledUntil">,
): number | null {
  const values = [stats.cooldownUntil, stats.disabledUntil]
    .filter((value): value is number => typeof value === "number")
    .filter((value) => Number.isFinite(value) && value > 0);
  if (values.length === 0) {
    return null;
  }
  return Math.max(...values);
}

/**
 * Check if a profile is currently in cooldown (due to rate limits, overload, or other transient failures).
 *
 * @param now Optional timestamp to compare against; defaults to `Date.now()`.
 */
export function isProfileInCooldown(
  store: AuthProfileStore,
  profileId: string,
  now?: number,
): boolean {
  if (isAuthCooldownBypassedForProvider(store.profiles[profileId]?.provider)) {
    return false;
  }
  const stats = store.usageStats?.[profileId];
  if (!stats) {
    return false;
  }
  const unusableUntil = resolveProfileUnusableUntil(stats);
  const ts = now ?? Date.now();
  return unusableUntil ? ts < unusableUntil : false;
}

/** True when `until` is a finite, positive timestamp that lies after `now`. */
function isActiveUnusableWindow(until: number | undefined, now: number): boolean {
  return typeof until === "number" && Number.isFinite(until) && until > 0 && now < until;
}

/**
 * Infer the most likely reason all candidate profiles are currently unavailable.
 *
 * We prefer explicit active `disabledReason` values (for example billing/auth)
 * over generic cooldown buckets, then fall back to failure-count signals.
 *
 * @returns The highest-scoring reason (ties broken by `FAILURE_REASON_PRIORITY`
 *   order), or `null` when no profile contributes a score.
 */
export function resolveProfilesUnavailableReason(params: {
  store: AuthProfileStore;
  profileIds: string[];
  now?: number;
}): AuthProfileFailureReason | null {
  const now = params.now ?? Date.now();
  const scores = new Map<AuthProfileFailureReason, number>();
  const addScore = (reason: AuthProfileFailureReason, value: number) => {
    if (!FAILURE_REASON_SET.has(reason) || value <= 0 || !Number.isFinite(value)) {
      return;
    }
    scores.set(reason, (scores.get(reason) ?? 0) + value);
  };

  for (const profileId of params.profileIds) {
    const stats = params.store.usageStats?.[profileId];
    if (!stats) {
      continue;
    }

    const disabledActive = isActiveUnusableWindow(stats.disabledUntil, now);
    if (disabledActive && stats.disabledReason && FAILURE_REASON_SET.has(stats.disabledReason)) {
      // Disabled reasons are explicit and high-signal; weight heavily.
      addScore(stats.disabledReason, 1_000);
      continue;
    }

    const cooldownActive = isActiveUnusableWindow(stats.cooldownUntil, now);
    if (!cooldownActive) {
      continue;
    }

    let recordedReason = false;
    for (const [rawReason, rawCount] of Object.entries(stats.failureCounts ?? {})) {
      const reason = rawReason as AuthProfileFailureReason;
      const count = typeof rawCount === "number" ? rawCount : 0;
      if (!FAILURE_REASON_SET.has(reason) || count <= 0) {
        continue;
      }
      addScore(reason, count);
      recordedReason = true;
    }
    if (!recordedReason) {
      // No failure counts recorded for this cooldown window. Previously this
      // defaulted to "rate_limit", which caused false "rate limit reached"
      // warnings when the actual reason was unknown (e.g. transient network
      // blip or server error without a classified failure count).
      addScore("unknown", 1);
    }
  }

  if (scores.size === 0) {
    return null;
  }

  let best: AuthProfileFailureReason | null = null;
  let bestScore = -1;
  let bestPriority = Number.MAX_SAFE_INTEGER;
  for (const reason of FAILURE_REASON_PRIORITY) {
    const score = scores.get(reason);
    if (typeof score !== "number") {
      continue;
    }
    const priority = FAILURE_REASON_ORDER.get(reason) ?? Number.MAX_SAFE_INTEGER;
    if (score > bestScore || (score === bestScore && priority < bestPriority)) {
      best = reason;
      bestScore = score;
      bestPriority = priority;
    }
  }
  return best;
}

/**
 * Return the soonest `unusableUntil` timestamp (ms epoch) among the given
 * profiles, or `null` when no profile has a recorded cooldown. Note: the
 * returned timestamp may be in the past if the cooldown has already expired.
 */
export function getSoonestCooldownExpiry(
  store: AuthProfileStore,
  profileIds: string[],
): number | null {
  let soonest: number | null = null;
  for (const id of profileIds) {
    const stats = store.usageStats?.[id];
    if (!stats) {
      continue;
    }
    const until = resolveProfileUnusableUntil(stats);
    if (typeof until !== "number" || !Number.isFinite(until) || until <= 0) {
      continue;
    }
    if (soonest === null || until < soonest) {
      soonest = until;
    }
  }
  return soonest;
}

/**
 * Clear expired cooldowns from all profiles in the store.
 *
 * When `cooldownUntil` or `disabledUntil` has passed, the corresponding fields
 * are removed and error counters are reset so the profile gets a fresh start
 * (circuit-breaker half-open → closed). Without this, a stale `errorCount`
 * causes the *next* transient failure to immediately escalate to a much longer
 * cooldown — the root cause of profiles appearing "stuck" after rate limits.
 *
 * `cooldownUntil` and `disabledUntil` are handled independently: if a profile
 * has both and only one has expired, only that field is cleared.
 *
 * Mutates the in-memory store; disk persistence happens lazily on the next
 * store write (e.g. `markAuthProfileUsed` / `markAuthProfileFailure`), which
 * matches the existing save pattern throughout the auth-profiles module.
 *
 * @returns `true` if any profile was modified.
 */
export function clearExpiredCooldowns(store: AuthProfileStore, now?: number): boolean {
  const usageStats = store.usageStats;
  if (!usageStats) {
    return false;
  }

  const ts = now ?? Date.now();
  let mutated = false;

  for (const [profileId, stats] of Object.entries(usageStats)) {
    if (!stats) {
      continue;
    }

    let profileMutated = false;
    const cooldownExpired =
      typeof stats.cooldownUntil === "number" &&
      Number.isFinite(stats.cooldownUntil) &&
      stats.cooldownUntil > 0 &&
      ts >= stats.cooldownUntil;
    const disabledExpired =
      typeof stats.disabledUntil === "number" &&
      Number.isFinite(stats.disabledUntil) &&
      stats.disabledUntil > 0 &&
      ts >= stats.disabledUntil;

    if (cooldownExpired) {
      stats.cooldownUntil = undefined;
      profileMutated = true;
    }
    if (disabledExpired) {
      stats.disabledUntil = undefined;
      stats.disabledReason = undefined;
      profileMutated = true;
    }

    // Reset error counters when ALL cooldowns have expired so the profile gets
    // a fair retry window. Preserves lastFailureAt for the failureWindowMs
    // decay check in computeNextProfileUsageStats.
    if (profileMutated && !resolveProfileUnusableUntil(stats)) {
      stats.errorCount = 0;
      stats.failureCounts = undefined;
    }

    if (profileMutated) {
      usageStats[profileId] = stats;
      mutated = true;
    }
  }

  return mutated;
}

/**
 * Mark a profile as successfully used. Resets error count and updates lastUsed.
 * Uses store lock to avoid overwriting concurrent usage updates.
 *
 * When the locked update yields no result, the passed-in `store` is updated
 * directly and saved instead (only if it contains the profile).
 */
export async function markAuthProfileUsed(params: {
  store: AuthProfileStore;
  profileId: string;
  agentDir?: string;
}): Promise<void> {
  const { store, profileId, agentDir } = params;
  const updated = await updateAuthProfileStoreWithLock({
    agentDir,
    updater: (freshStore) => {
      if (!freshStore.profiles[profileId]) {
        return false;
      }
      updateUsageStatsEntry(freshStore, profileId, (existing) =>
        resetUsageStats(existing, { lastUsed: Date.now() }),
      );
      return true;
    },
  });
  if (updated) {
    // Adopt the freshly persisted stats so the caller's store stays in sync.
    store.usageStats = updated.usageStats;
    return;
  }
  if (!store.profiles[profileId]) {
    return;
  }

  updateUsageStatsEntry(store, profileId, (existing) =>
    resetUsageStats(existing, { lastUsed: Date.now() }),
  );
  saveAuthProfileStore(store, agentDir);
}

/**
 * Exponential cooldown for transient failures: 1 min, 5 min, 25 min, then
 * capped at 1 hour. Error counts below 1 are treated as 1.
 */
export function calculateAuthProfileCooldownMs(errorCount: number): number {
  const normalized = Math.max(1, errorCount);
  return Math.min(
    60 * 60 * 1000, // 1 hour max
    60 * 1000 * 5 ** Math.min(normalized - 1, 3),
  );
}

/** Cooldown settings resolved from config, expressed in milliseconds. */
type ResolvedAuthCooldownConfig = {
  billingBackoffMs: number;
  billingMaxMs: number;
  failureWindowMs: number;
};

/**
 * Resolve cooldown settings for a provider from `cfg.auth.cooldowns`.
 *
 * Values that are not finite positive numbers fall back to the defaults
 * (5 h billing backoff, 24 h billing max, 24 h failure window). A matching
 * entry in `billingBackoffHoursByProvider` (keys compared after
 * normalization) takes precedence over the global `billingBackoffHours`.
 */
function resolveAuthCooldownConfig(params: {
  cfg?: OpenClawConfig;
  providerId: string;
}): ResolvedAuthCooldownConfig {
  const defaults = {
    billingBackoffHours: 5,
    billingMaxHours: 24,
    failureWindowHours: 24,
  } as const;

  const resolveHours = (value: unknown, fallback: number) =>
    typeof value === "number" && Number.isFinite(value) && value > 0 ? value : fallback;

  const cooldowns = params.cfg?.auth?.cooldowns;
  const billingOverride = (() => {
    const map = cooldowns?.billingBackoffHoursByProvider;
    if (!map) {
      return undefined;
    }
    for (const [key, value] of Object.entries(map)) {
      if (normalizeProviderId(key) === params.providerId) {
        return value;
      }
    }
    return undefined;
  })();

  const billingBackoffHours = resolveHours(
    billingOverride ?? cooldowns?.billingBackoffHours,
    defaults.billingBackoffHours,
  );
  const billingMaxHours = resolveHours(cooldowns?.billingMaxHours, defaults.billingMaxHours);
  const failureWindowHours = resolveHours(
    cooldowns?.failureWindowHours,
    defaults.failureWindowHours,
  );

  return {
    billingBackoffMs: billingBackoffHours * 60 * 60 * 1000,
    billingMaxMs: billingMaxHours * 60 * 60 * 1000,
    failureWindowMs: failureWindowHours * 60 * 60 * 1000,
  };
}

/**
 * Disable duration for billing-style failures: `baseMs` doubled per additional
 * error (exponent capped at 10), clamped to `maxMs`. `baseMs` is floored at
 * one minute and `maxMs` is never allowed below `baseMs`.
 */
function calculateAuthProfileBillingDisableMsWithConfig(params: {
  errorCount: number;
  baseMs: number;
  maxMs: number;
}): number {
  const normalized = Math.max(1, params.errorCount);
  const baseMs = Math.max(60_000, params.baseMs);
  const maxMs = Math.max(baseMs, params.maxMs);
  const exponent = Math.min(normalized - 1, 10);
  const raw = baseMs * 2 ** exponent;
  return Math.min(maxMs, raw);
}

/**
 * Like `resolveProfileUnusableUntil`, but looked up by profile id and
 * returning `null` for providers whose cooldowns are bypassed.
 */
export function resolveProfileUnusableUntilForDisplay(
  store: AuthProfileStore,
  profileId: string,
): number | null {
  if (isAuthCooldownBypassedForProvider(store.profiles[profileId]?.provider)) {
    return null;
  }
  const stats = store.usageStats?.[profileId];
  if (!stats) {
    return null;
  }
  return resolveProfileUnusableUntil(stats);
}

/**
 * Build a copy of `existing` with all failure/cooldown state cleared, then
 * apply `overrides` on top.
 */
function resetUsageStats(
  existing: ProfileUsageStats | undefined,
  overrides?: Partial<ProfileUsageStats>,
): ProfileUsageStats {
  return {
    ...existing,
    errorCount: 0,
    cooldownUntil: undefined,
    disabledUntil: undefined,
    disabledReason: undefined,
    failureCounts: undefined,
    ...overrides,
  };
}

/**
 * Replace the usage-stats entry for `profileId` with the result of `updater`,
 * creating `store.usageStats` if it does not exist yet.
 */
function updateUsageStatsEntry(
  store: AuthProfileStore,
  profileId: string,
  updater: (existing: ProfileUsageStats | undefined) => ProfileUsageStats,
): void {
  store.usageStats = store.usageStats ?? {};
  store.usageStats[profileId] = updater(store.usageStats[profileId]);
}

/**
 * Return `existingUntil` while it is still in the future (finite and > `now`);
 * otherwise return `recomputedUntil`.
 */
function keepActiveWindowOrRecompute(params: {
  existingUntil: number | undefined;
  now: number;
  recomputedUntil: number;
}): number {
  const { existingUntil, now, recomputedUntil } = params;
  const hasActiveWindow =
    typeof existingUntil === "number" && Number.isFinite(existingUntil) && existingUntil > now;
  return hasActiveWindow ? existingUntil : recomputedUntil;
}

/**
 * Compute the usage stats that result from recording one more failure.
 *
 * Counters restart from zero when the last failure is older than the failure
 * window or when the previous unusable window has already expired. "billing"
 * and "auth_permanent" set `disabledUntil` / `disabledReason`; every other
 * reason sets `cooldownUntil`.
 */
function computeNextProfileUsageStats(params: {
  existing: ProfileUsageStats;
  now: number;
  reason: AuthProfileFailureReason;
  cfgResolved: ResolvedAuthCooldownConfig;
}): ProfileUsageStats {
  const windowMs = params.cfgResolved.failureWindowMs;
  const windowExpired =
    typeof params.existing.lastFailureAt === "number" &&
    params.existing.lastFailureAt > 0 &&
    params.now - params.existing.lastFailureAt > windowMs;

  // If the previous cooldown has already expired, reset error counters so the
  // profile gets a fresh backoff window. clearExpiredCooldowns() does this
  // in-memory during profile ordering, but the on-disk state may still carry
  // the old counters when the lock-based updater reads a fresh store. Without
  // this check, stale error counts from an expired cooldown cause the next
  // failure to escalate to a much longer cooldown (e.g. 1 min → 25 min).
  const unusableUntil = resolveProfileUnusableUntil(params.existing);
  const previousCooldownExpired = typeof unusableUntil === "number" && params.now >= unusableUntil;

  const shouldResetCounters = windowExpired || previousCooldownExpired;
  const baseErrorCount = shouldResetCounters ? 0 : (params.existing.errorCount ?? 0);
  const nextErrorCount = baseErrorCount + 1;
  const failureCounts = shouldResetCounters ? {} : { ...params.existing.failureCounts };
  failureCounts[params.reason] = (failureCounts[params.reason] ?? 0) + 1;

  const updatedStats: ProfileUsageStats = {
    ...params.existing,
    errorCount: nextErrorCount,
    failureCounts,
    lastFailureAt: params.now,
  };

  if (params.reason === "billing" || params.reason === "auth_permanent") {
    const billingCount = failureCounts[params.reason] ?? 1;
    const backoffMs = calculateAuthProfileBillingDisableMsWithConfig({
      errorCount: billingCount,
      baseMs: params.cfgResolved.billingBackoffMs,
      maxMs: params.cfgResolved.billingMaxMs,
    });
    // Keep active disable windows immutable so retries within the window cannot
    // extend recovery time indefinitely.
    updatedStats.disabledUntil = keepActiveWindowOrRecompute({
      existingUntil: params.existing.disabledUntil,
      now: params.now,
      recomputedUntil: params.now + backoffMs,
    });
    updatedStats.disabledReason = params.reason;
  } else {
    const backoffMs = calculateAuthProfileCooldownMs(nextErrorCount);
    // Keep active cooldown windows immutable so retries within the window
    // cannot push recovery further out.
    updatedStats.cooldownUntil = keepActiveWindowOrRecompute({
      existingUntil: params.existing.cooldownUntil,
      now: params.now,
      recomputedUntil: params.now + backoffMs,
    });
  }

  return updatedStats;
}

/**
 * Mark a profile as failed for a specific reason. Billing and permanent-auth
 * failures are treated as "disabled" (longer backoff) vs the regular cooldown
 * window.
 *
 * Does nothing when the profile is unknown or its provider bypasses cooldowns.
 * When the locked update yields no result, the passed-in `store` is updated
 * directly and saved instead. Either path logs the state change.
 */
export async function markAuthProfileFailure(params: {
  store: AuthProfileStore;
  profileId: string;
  reason: AuthProfileFailureReason;
  cfg?: OpenClawConfig;
  agentDir?: string;
  runId?: string;
}): Promise<void> {
  const { store, profileId, reason, agentDir, cfg, runId } = params;
  const profile = store.profiles[profileId];
  if (!profile || isAuthCooldownBypassedForProvider(profile.provider)) {
    return;
  }

  let nextStats: ProfileUsageStats | undefined;
  let previousStats: ProfileUsageStats | undefined;
  let updateTime = 0;
  const updated = await updateAuthProfileStoreWithLock({
    agentDir,
    updater: (freshStore) => {
      // Re-validate against the freshly loaded store, which may differ from
      // the in-memory copy the caller passed in.
      const freshProfile = freshStore.profiles[profileId];
      if (!freshProfile || isAuthCooldownBypassedForProvider(freshProfile.provider)) {
        return false;
      }
      const now = Date.now();
      const providerKey = normalizeProviderId(freshProfile.provider);
      const cfgResolved = resolveAuthCooldownConfig({
        cfg,
        providerId: providerKey,
      });

      previousStats = freshStore.usageStats?.[profileId];
      updateTime = now;
      const computed = computeNextProfileUsageStats({
        existing: previousStats ?? {},
        now,
        reason,
        cfgResolved,
      });
      nextStats = computed;
      updateUsageStatsEntry(freshStore, profileId, () => computed);
      return true;
    },
  });
  if (updated) {
    store.usageStats = updated.usageStats;
    if (nextStats) {
      logAuthProfileFailureStateChange({
        runId,
        profileId,
        provider: profile.provider,
        reason,
        previous: previousStats,
        next: nextStats,
        now: updateTime,
      });
    }
    return;
  }
  if (!store.profiles[profileId]) {
    return;
  }

  // Fallback: apply the same computation to the caller's in-memory store.
  const now = Date.now();
  const providerKey = normalizeProviderId(store.profiles[profileId]?.provider ?? "");
  const cfgResolved = resolveAuthCooldownConfig({
    cfg,
    providerId: providerKey,
  });

  previousStats = store.usageStats?.[profileId];
  const computed = computeNextProfileUsageStats({
    existing: previousStats ?? {},
    now,
    reason,
    cfgResolved,
  });
  nextStats = computed;
  updateUsageStatsEntry(store, profileId, () => computed);
  saveAuthProfileStore(store, agentDir);
  logAuthProfileFailureStateChange({
    runId,
    profileId,
    provider: store.profiles[profileId]?.provider ?? profile.provider,
    reason,
    previous: previousStats,
    next: nextStats,
    now,
  });
}

/**
 * Mark a profile as transiently failed. Applies exponential backoff cooldown.
 * Cooldown times: 1min, 5min, 25min, max 1 hour.
 * Uses store lock to avoid overwriting concurrent usage updates.
 */
export async function markAuthProfileCooldown(params: {
  store: AuthProfileStore;
  profileId: string;
  agentDir?: string;
  runId?: string;
}): Promise<void> {
  await markAuthProfileFailure({
    store: params.store,
    profileId: params.profileId,
    reason: "unknown",
    agentDir: params.agentDir,
    runId: params.runId,
  });
}

/**
 * Clear cooldown for a profile (e.g., manual reset).
 * Uses store lock to avoid overwriting concurrent usage updates.
 *
 * When the locked update yields no result, the passed-in `store` is reset
 * directly and saved instead (only if it has stats for the profile).
 */
export async function clearAuthProfileCooldown(params: {
  store: AuthProfileStore;
  profileId: string;
  agentDir?: string;
}): Promise<void> {
  const { store, profileId, agentDir } = params;
  const updated = await updateAuthProfileStoreWithLock({
    agentDir,
    updater: (freshStore) => {
      if (!freshStore.usageStats?.[profileId]) {
        return false;
      }

      updateUsageStatsEntry(freshStore, profileId, (existing) => resetUsageStats(existing));
      return true;
    },
  });
  if (updated) {
    store.usageStats = updated.usageStats;
    return;
  }
  if (!store.usageStats?.[profileId]) {
    return;
  }

  updateUsageStatsEntry(store, profileId, (existing) => resetUsageStats(existing));
  saveAuthProfileStore(store, agentDir);
}