clienttarget / src /shared /utils /retry.ts
iDevBuddy
feat: Phase 1 — AI Client Acquisition System
bd28470
/**
* Production-grade retry logic — failure-type-aware.
*
* NOT "retry 3 times with delay" (naive approach).
* Instead: each failure type gets a different response.
*
* 429 → respect Retry-After header, wait, then retry
* 503 → exponential backoff WITH JITTER (prevent thundering herd)
* 500 → retry 2x, then dead-letter for manual review
* 422 → permanent failure, do not retry (bad input)
* ECONNRESET → network issue, retry with short delay
* TIMEOUT → retry with longer timeout
*/
import { AxiosError } from "axios";
import { logger } from "./logger";
/**
 * Options accepted by `withRetry`.
 */
export interface RetryConfig {
  /** Provider name; used as the circuit-breaker key and attached to every log entry. */
  provider: string;

  /**
   * Attempt budget. `withRetry` gives up once the number of failed attempts
   * reaches this value, so it counts total attempts. Falls back to 3 when omitted.
   */
  maxRetries?: number;

  /** Starting delay for the exponential backoff, in milliseconds. Falls back to 1000 when omitted. */
  baseDelayMs?: number;

  /** Upper bound for the exponential backoff delay, in milliseconds. Falls back to 30000 when omitted. */
  maxDelayMs?: number;
}
// ─── Circuit breaker state ───────────────────────────────────
/** Per-provider bookkeeping kept by the circuit breaker. */
interface CircuitState {
  /** Failures recorded since the entry was last reset. */
  failures: number;
  /** `Date.now()` value of the most recent recorded failure; 0 when none is recorded. */
  lastFailure: number;
  /** True while the provider is treated as disabled. */
  isOpen: boolean;
  /** `Date.now()` value from which a new attempt is allowed again. */
  halfOpenAt: number;
}

/** Recorded-failure count at which a circuit opens. */
const CIRCUIT_THRESHOLD = 5;

/** Cooldown before an open circuit lets a call through again (1 minute, in milliseconds). */
const CIRCUIT_RESET_MS = 60_000;

/** Circuit state keyed by provider name, shared by everything in this module. */
const circuits: Map<string, CircuitState> = new Map();
/**
 * Reports whether calls to `provider` should currently be blocked.
 *
 * Side effect: once the cooldown deadline stored in `halfOpenAt` has passed,
 * the stored state is flipped back to closed, so this and later calls return
 * `false` until another failure re-opens the circuit.
 *
 * @param provider - Key identifying the provider's circuit.
 * @returns `true` while the circuit is open and still cooling down, `false` otherwise
 *          (including when no state exists for the provider).
 */
export function isCircuitOpen(provider: string): boolean {
  const circuit = circuits.get(provider);
  if (!circuit || !circuit.isOpen) {
    return false;
  }

  const cooldownElapsed = Date.now() >= circuit.halfOpenAt;
  if (cooldownElapsed) {
    // Half-open: close the circuit again so a call can probe the provider.
    circuit.isOpen = false;
  }
  return !cooldownElapsed;
}
/**
 * Marks `provider` as healthy by replacing its circuit entry with a fresh,
 * closed state (zero failures, no cooldown deadline).
 *
 * @param provider - Key identifying the provider's circuit.
 */
export function recordSuccess(provider: string): void {
  const healthy = { failures: 0, lastFailure: 0, isOpen: false, halfOpenAt: 0 };
  circuits.set(provider, healthy);
}
/**
 * Registers one failure for `provider` and opens its circuit once the
 * recorded failure count reaches `CIRCUIT_THRESHOLD`.
 *
 * Every failure at or above the threshold (re)opens the circuit and pushes the
 * cooldown deadline `CIRCUIT_RESET_MS` into the future again.
 *
 * @param provider - Key identifying the provider's circuit.
 */
export function recordFailure(provider: string): void {
  // Read the clock once so lastFailure and halfOpenAt describe the same instant.
  const now = Date.now();
  const state = circuits.get(provider) ?? {
    failures: 0,
    lastFailure: 0,
    isOpen: false,
    halfOpenAt: 0,
  };

  state.failures++;
  state.lastFailure = now;

  if (state.failures >= CIRCUIT_THRESHOLD) {
    state.isOpen = true;
    state.halfOpenAt = now + CIRCUIT_RESET_MS;
    logger.warn(
      { provider, failures: state.failures },
      "Circuit OPEN — provider temporarily disabled"
    );
  }

  // Needed for the first failure, when `state` is a fresh object not yet in the map.
  circuits.set(provider, state);
}
// ─── Failure classification ──────────────────────────────────
/** Categories derived from an HTTP response status. */
type HttpFailureType =
  | "rate_limited" // 429
  | "server_error" // 500
  | "service_unavailable" // 503
  | "bad_input" // 422, 400
  | "auth_failed"; // 401, 403

/** Categories derived from a transport-level error code. */
type TransportFailureType =
  | "network_error" // e.g. ECONNRESET, ENOTFOUND, ECONNREFUSED
  | "timeout"; // e.g. ETIMEDOUT, ESOCKETTIMEDOUT

/** Every category `classifyFailure` can report; "unknown" covers anything unrecognised. */
type FailureType = HttpFailureType | TransportFailureType | "unknown";
/**
 * Converts a `Retry-After` header value into a wait time in milliseconds.
 *
 * Accepts both forms the header may take: a non-negative integer number of
 * seconds, or an HTTP-date (converted to the time remaining from now).
 *
 * @param raw - Header value as found on the response (string, number, or an array of them).
 * @returns The positive wait in milliseconds, or `undefined` when the header is
 *          missing, unparseable, zero, or already in the past.
 */
function parseRetryAfterMs(raw: unknown): number | undefined {
  const first: unknown = Array.isArray(raw) ? raw[0] : raw;
  if (typeof first !== "string" && typeof first !== "number") {
    return undefined;
  }
  const text = String(first).trim();
  if (text === "") {
    return undefined;
  }

  const delayMs = /^\d+$/.test(text)
    ? Number(text) * 1000 // delay-seconds form
    : Date.parse(text) - Date.now(); // HTTP-date form; NaN when unparseable

  // Non-positive and NaN values are treated as "no hint" so callers fall back to their default wait.
  return Number.isFinite(delayMs) && delayMs > 0 ? delayMs : undefined;
}

/**
 * Maps a thrown value onto a failure category and a retry recommendation.
 *
 * Only `AxiosError` instances are inspected; the HTTP status is checked first,
 * then the transport error code. Anything else is reported as "unknown" and
 * considered retryable.
 *
 * @param err - The value caught from the failed call.
 * @returns `type` (the category), `retryable` (whether another attempt makes
 *          sense) and `waitMs` (minimum wait before that attempt, in milliseconds).
 */
function classifyFailure(err: unknown): { type: FailureType; retryable: boolean; waitMs: number } {
  if (err instanceof AxiosError) {
    const retryAfterMs = parseRetryAfterMs(err.response?.headers?.["retry-after"]);

    switch (err.response?.status) {
      case 429:
        return { type: "rate_limited", retryable: true, waitMs: retryAfterMs ?? 10_000 };
      case 503:
        // Retry-After is also defined for 503; honour it when the server sends one.
        return { type: "service_unavailable", retryable: true, waitMs: retryAfterMs ?? 5_000 };
      case 500:
        return { type: "server_error", retryable: true, waitMs: 3_000 };
      case 422:
      case 400:
        return { type: "bad_input", retryable: false, waitMs: 0 };
      case 401:
      case 403:
        return { type: "auth_failed", retryable: false, waitMs: 0 };
      default:
        break;
    }

    // Network errors
    const code = err.code;
    if (code === "ECONNRESET" || code === "ENOTFOUND" || code === "ECONNREFUSED") {
      return { type: "network_error", retryable: true, waitMs: 2_000 };
    }
    // ECONNABORTED is the code Axios uses for request timeouts by default;
    // ETIMEDOUT is used when timeout clarification is enabled.
    if (code === "ECONNABORTED" || code === "ETIMEDOUT" || code === "ESOCKETTIMEDOUT") {
      return { type: "timeout", retryable: true, waitMs: 3_000 };
    }
  }

  return { type: "unknown", retryable: true, waitMs: 2_000 };
}
// ─── Main retry function ────────────────────────────────────
/**
 * Runs `fn`, retrying failed attempts according to how each failure is classified.
 *
 * - A failure classified as non-retryable is rethrown immediately.
 * - Otherwise attempts continue until the number of failed attempts reaches
 *   `config.maxRetries` (default 3), i.e. `maxRetries` counts total attempts.
 * - Between attempts it waits for the larger of the classifier's suggested wait
 *   and an exponential backoff (`baseDelayMs * 2^(attempt-1)`, capped at
 *   `maxDelayMs`) plus up to 30% random jitter. `maxDelayMs` caps only the
 *   exponential part.
 * - Giving up records a failure on the provider's circuit; every successful
 *   call resets it.
 *
 * @param fn - The operation to run; invoked again for each retry.
 * @param config - Provider name and optional retry tuning.
 * @returns The value `fn` resolves with.
 * @throws The error from the last failed attempt, unchanged.
 */
export async function withRetry<T>(
  fn: () => Promise<T>,
  config: RetryConfig
): Promise<T> {
  const maxRetries = config.maxRetries ?? 3;
  const baseDelay = config.baseDelayMs ?? 1000;
  const maxDelay = config.maxDelayMs ?? 30_000;
  let attempt = 0;

  while (true) {
    try {
      const result = await fn();

      // Reset the circuit on every success, not only after a recovered retry.
      // Otherwise unrelated failures accumulate forever and eventually open the
      // circuit even though the provider is healthy.
      recordSuccess(config.provider);
      if (attempt > 0) {
        logger.info({ provider: config.provider, attempts: attempt + 1 }, "Retry succeeded");
      }
      return result;
    } catch (err) {
      attempt++;
      const failure = classifyFailure(err);

      // Permanent failure — don't retry
      if (!failure.retryable) {
        logger.error(
          { provider: config.provider, failureType: failure.type, attempt },
          "Permanent failure — not retrying"
        );
        recordFailure(config.provider);
        throw err;
      }

      // Attempt budget exhausted
      if (attempt >= maxRetries) {
        logger.error(
          { provider: config.provider, failureType: failure.type, attempts: attempt },
          "Max retries exceeded"
        );
        recordFailure(config.provider);
        throw err;
      }

      // Exponential backoff, capped at maxDelay.
      const exponentialDelay = Math.min(
        maxDelay,
        baseDelay * Math.pow(2, attempt - 1)
      );
      // Jitter adds between 0% and 30% on top of the backoff so that many
      // callers failing together do not all retry at the same instant.
      const jitter = Math.random() * exponentialDelay * 0.3;
      // Never wait less than the classifier asked for (e.g. a Retry-After hint).
      const waitMs = Math.max(failure.waitMs, exponentialDelay + jitter);

      logger.warn(
        {
          provider: config.provider,
          failureType: failure.type,
          attempt,
          maxRetries,
          waitMs: Math.round(waitMs),
        },
        `Retry ${attempt}/${maxRetries} after ${Math.round(waitMs)}ms`
      );
      await sleep(waitMs);
    }
  }
}
/**
 * Resolves after roughly `ms` milliseconds, using a timer.
 *
 * @param ms - Delay handed to `setTimeout`.
 * @returns A promise that resolves with no value once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}