/**
 * Production-grade retry logic — failure-type-aware.
 *
 * NOT "retry 3 times with delay" (naive approach).
 * Instead: each failure type gets a different response.
 *
 *   429                                  → honor Retry-After (seconds or HTTP-date),
 *                                          otherwise wait at least 10s, then retry
 *   503                                  → retry, waiting at least 5s
 *   500                                  → retry, waiting at least 3s
 *   400 / 422                            → permanent failure, do not retry (bad input)
 *   401 / 403                            → permanent failure, do not retry (auth)
 *   ECONNRESET / ENOTFOUND / ECONNREFUSED → network issue, retry after at least 2s
 *   ETIMEDOUT / ESOCKETTIMEDOUT          → timeout, retry after at least 3s
 *   anything else                        → retry after at least 2s
 *
 * Every retry waits max(per-type floor above, exponential backoff + jitter);
 * the jitter spreads retries out to prevent a thundering herd.
 *
 * This module does not dead-letter and does not change the timeout of the
 * wrapped call: once the attempts are exhausted the last error is rethrown
 * and the caller decides what to do with it.
 */
import { AxiosError } from "axios";
import { logger } from "./logger";

export interface RetryConfig {
  /** Key under which circuit-breaker state is tracked and logs are tagged. */
  provider: string;
  /** Maximum number of calls to `fn`, including the first one. Default 3. */
  maxRetries?: number;
  /** First exponential backoff step in milliseconds. Default 1000. */
  baseDelayMs?: number;
  /** Cap for the exponential backoff (before jitter) in milliseconds. Default 30000. */
  maxDelayMs?: number;
}

// ─── Circuit breaker state ───────────────────────────────────

interface CircuitState {
  failures: number;
  lastFailure: number;
  isOpen: boolean;
  halfOpenAt: number; // epoch ms at which the circuit may be tried again
}

const circuits = new Map<string, CircuitState>();

const CIRCUIT_THRESHOLD = 5; // failures before opening
const CIRCUIT_RESET_MS = 60_000; // 1 min cooldown

/**
 * Reports whether calls to `provider` should currently be blocked.
 *
 * Once the cooldown has elapsed the circuit is flipped back to closed so
 * traffic can flow again. The failure count is left untouched, so the next
 * recorded failure re-opens the circuit immediately unless `recordSuccess`
 * resets it first.
 *
 * @param provider Provider key, as used in `RetryConfig.provider`.
 * @returns `true` while the circuit is open and still cooling down.
 */
export function isCircuitOpen(provider: string): boolean {
  const state = circuits.get(provider);
  if (state === undefined || !state.isOpen) return false;

  // Cooldown elapsed → half-open: let attempts through again.
  if (Date.now() >= state.halfOpenAt) {
    state.isOpen = false;
    return false;
  }

  return true;
}

/**
 * Resets the circuit for `provider` to a clean, closed state.
 *
 * @param provider Provider key, as used in `RetryConfig.provider`.
 */
export function recordSuccess(provider: string): void {
  circuits.set(provider, {
    failures: 0,
    lastFailure: 0,
    isOpen: false,
    halfOpenAt: 0,
  });
}

/**
 * Counts one failure for `provider` and opens the circuit for
 * `CIRCUIT_RESET_MS` once `CIRCUIT_THRESHOLD` failures have accumulated.
 *
 * @param provider Provider key, as used in `RetryConfig.provider`.
 */
export function recordFailure(provider: string): void {
  const state: CircuitState = circuits.get(provider) ?? {
    failures: 0,
    lastFailure: 0,
    isOpen: false,
    halfOpenAt: 0,
  };

  state.failures++;
  state.lastFailure = Date.now();

  if (state.failures >= CIRCUIT_THRESHOLD) {
    state.isOpen = true;
    state.halfOpenAt = Date.now() + CIRCUIT_RESET_MS;
    logger.warn(
      { provider, failures: state.failures },
      "Circuit OPEN — provider temporarily disabled"
    );
  }

  circuits.set(provider, state);
}

// ─── Failure classification ──────────────────────────────────

type FailureType =
  | "rate_limited" // 429
  | "server_error" // 500
  | "service_unavailable" // 503
  | "bad_input" // 422, 400
  | "auth_failed" // 401, 403
  | "network_error" // ECONNRESET, ENOTFOUND, ECONNREFUSED
  | "timeout" // ETIMEDOUT, ESOCKETTIMEDOUT
  | "unknown";

/**
 * Converts a Retry-After header value into milliseconds.
 *
 * Accepts both forms allowed by HTTP: a number of seconds, or an HTTP-date.
 *
 * @param header Raw header value; may be absent.
 * @returns Milliseconds to wait, or 0 when the header is missing, unparseable,
 *          non-positive, or names a moment that has already passed.
 */
function retryAfterToMs(header: unknown): number {
  if (header === undefined || header === null) return 0;

  const text = String(header).trim();

  const seconds = parseInt(text, 10);
  if (Number.isFinite(seconds) && seconds > 0) return seconds * 1000;

  // Not a usable number of seconds — try the HTTP-date form.
  const when = Date.parse(text);
  return Number.isFinite(when) ? Math.max(0, when - Date.now()) : 0;
}

/**
 * Maps an error thrown by the wrapped call to a failure category.
 *
 * HTTP status takes precedence over the low-level error code. Anything that
 * is not recognised (including non-Axios errors and unlisted status codes)
 * is classified as "unknown" and treated as retryable.
 *
 * @param err The value caught from the wrapped call.
 * @returns The category, whether it may be retried, and the minimum wait in
 *          milliseconds before the next attempt.
 */
function classifyFailure(err: unknown): {
  type: FailureType;
  retryable: boolean;
  waitMs: number;
} {
  if (err instanceof AxiosError) {
    const status = err.response?.status;

    switch (status) {
      case 429: {
        const retryAfterMs = retryAfterToMs(err.response?.headers?.["retry-after"]);
        return {
          type: "rate_limited",
          retryable: true,
          // No usable Retry-After → fall back to a conservative 10s.
          waitMs: retryAfterMs > 0 ? retryAfterMs : 10_000,
        };
      }
      case 503:
        return { type: "service_unavailable", retryable: true, waitMs: 5_000 };
      case 500:
        return { type: "server_error", retryable: true, waitMs: 3_000 };
      case 422:
      case 400:
        return { type: "bad_input", retryable: false, waitMs: 0 };
      case 401:
      case 403:
        return { type: "auth_failed", retryable: false, waitMs: 0 };
      default:
        break;
    }

    // No decisive status — look at the transport-level error code.
    const code = err.code;
    if (code === "ECONNRESET" || code === "ENOTFOUND" || code === "ECONNREFUSED") {
      return { type: "network_error", retryable: true, waitMs: 2_000 };
    }
    if (code === "ETIMEDOUT" || code === "ESOCKETTIMEDOUT") {
      return { type: "timeout", retryable: true, waitMs: 3_000 };
    }
  }

  return { type: "unknown", retryable: true, waitMs: 2_000 };
}

// ─── Main retry function ────────────────────────────────────

/**
 * Runs `fn`, retrying retryable failures with failure-aware waits.
 *
 * `fn` is called at most `maxRetries` times in total. Between attempts the
 * function sleeps for the larger of the failure type's minimum wait and an
 * exponential backoff (`baseDelayMs * 2^(attempt-1)`, capped at `maxDelayMs`)
 * plus up to 30% random jitter. The jitter is added after the cap, and the
 * per-type minimum (e.g. a long Retry-After) is not capped, so an individual
 * wait can exceed `maxDelayMs`.
 *
 * Circuit-breaker bookkeeping: every successful call resets the provider's
 * circuit; a permanent failure or exhausting the attempts records one
 * failure. This function does not consult `isCircuitOpen` itself — callers
 * that want to short-circuit must check it before calling.
 *
 * @param fn     The operation to run; invoked afresh on every attempt.
 * @param config Provider key and optional retry tuning.
 * @returns The value `fn` resolves with.
 * @throws The last error thrown by `fn`, when it is not retryable or the
 *         attempts are exhausted.
 */
export async function withRetry<T>(
  fn: () => Promise<T>,
  config: RetryConfig
): Promise<T> {
  const maxRetries = config.maxRetries ?? 3;
  const baseDelay = config.baseDelayMs ?? 1000;
  const maxDelay = config.maxDelayMs ?? 30_000;

  let attempt = 0;

  while (true) {
    try {
      const result = await fn();

      // Reset the circuit on every success, not only after a recovery:
      // otherwise old failures accumulate forever and a successful
      // first-try probe after the cooldown would leave the counter at the
      // threshold.
      recordSuccess(config.provider);

      if (attempt > 0) {
        logger.info(
          { provider: config.provider, attempts: attempt + 1 },
          "Retry succeeded"
        );
      }

      return result;
    } catch (err) {
      attempt++;
      const failure = classifyFailure(err);

      // Permanent failure — don't retry
      if (!failure.retryable) {
        logger.error(
          { provider: config.provider, failureType: failure.type, attempt },
          "Permanent failure — not retrying"
        );
        recordFailure(config.provider);
        throw err;
      }

      // Max retries exceeded
      if (attempt >= maxRetries) {
        logger.error(
          { provider: config.provider, failureType: failure.type, attempts: attempt },
          "Max retries exceeded"
        );
        recordFailure(config.provider);
        throw err;
      }

      // Jitter prevents thundering herd: 1000 requests don't all retry at
      // the same instant.
      const exponentialDelay = Math.min(maxDelay, baseDelay * Math.pow(2, attempt - 1));
      const jitter = Math.random() * exponentialDelay * 0.3; // adds 0–30%, never subtracts
      const waitMs = Math.max(failure.waitMs, exponentialDelay + jitter);

      logger.warn(
        {
          provider: config.provider,
          failureType: failure.type,
          attempt,
          maxRetries,
          waitMs: Math.round(waitMs),
        },
        `Retry ${attempt}/${maxRetries} after ${Math.round(waitMs)}ms`
      );

      await sleep(waitMs);
    }
  }
}

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => setTimeout(resolve, ms));
}