Spaces:
Paused
Paused
File size: 5,088 Bytes
c1243f9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 | import type { startGatewayServer } from "../../gateway/server.js";
import type { defaultRuntime } from "../../runtime.js";
import { acquireGatewayLock } from "../../infra/gateway-lock.js";
import {
consumeGatewaySigusr1RestartAuthorization,
isGatewaySigusr1RestartExternallyAllowed,
markGatewaySigusr1RestartHandled,
} from "../../infra/restart.js";
import { createSubsystemLogger } from "../../logging/subsystem.js";
import {
getActiveTaskCount,
resetAllLanes,
waitForActiveTasks,
} from "../../process/command-queue.js";
import { createRestartIterationHook } from "../../process/restart-recovery.js";
const gatewayLog = createSubsystemLogger("gateway");
type GatewayRunSignalAction = "stop" | "restart";
export async function runGatewayLoop(params: {
start: () => Promise<Awaited<ReturnType<typeof startGatewayServer>>>;
runtime: typeof defaultRuntime;
}) {
const lock = await acquireGatewayLock();
let server: Awaited<ReturnType<typeof startGatewayServer>> | null = null;
let shuttingDown = false;
let restartResolver: (() => void) | null = null;
const cleanupSignals = () => {
process.removeListener("SIGTERM", onSigterm);
process.removeListener("SIGINT", onSigint);
process.removeListener("SIGUSR1", onSigusr1);
};
const DRAIN_TIMEOUT_MS = 30_000;
const SHUTDOWN_TIMEOUT_MS = 5_000;
const request = (action: GatewayRunSignalAction, signal: string) => {
if (shuttingDown) {
gatewayLog.info(`received ${signal} during shutdown; ignoring`);
return;
}
shuttingDown = true;
const isRestart = action === "restart";
gatewayLog.info(`received ${signal}; ${isRestart ? "restarting" : "shutting down"}`);
// Allow extra time for draining active turns on restart.
const forceExitMs = isRestart ? DRAIN_TIMEOUT_MS + SHUTDOWN_TIMEOUT_MS : SHUTDOWN_TIMEOUT_MS;
const forceExitTimer = setTimeout(() => {
gatewayLog.error("shutdown timed out; exiting without full cleanup");
cleanupSignals();
params.runtime.exit(0);
}, forceExitMs);
void (async () => {
try {
// On restart, wait for in-flight agent turns to finish before
// tearing down the server so buffered messages are delivered.
if (isRestart) {
const activeTasks = getActiveTaskCount();
if (activeTasks > 0) {
gatewayLog.info(
`draining ${activeTasks} active task(s) before restart (timeout ${DRAIN_TIMEOUT_MS}ms)`,
);
const { drained } = await waitForActiveTasks(DRAIN_TIMEOUT_MS);
if (drained) {
gatewayLog.info("all active tasks drained");
} else {
gatewayLog.warn("drain timeout reached; proceeding with restart");
}
}
}
await server?.close({
reason: isRestart ? "gateway restarting" : "gateway stopping",
restartExpectedMs: isRestart ? 1500 : null,
});
} catch (err) {
gatewayLog.error(`shutdown error: ${String(err)}`);
} finally {
clearTimeout(forceExitTimer);
server = null;
if (isRestart) {
shuttingDown = false;
restartResolver?.();
} else {
cleanupSignals();
params.runtime.exit(0);
}
}
})();
};
const onSigterm = () => {
gatewayLog.info("signal SIGTERM received");
request("stop", "SIGTERM");
};
const onSigint = () => {
gatewayLog.info("signal SIGINT received");
request("stop", "SIGINT");
};
const onSigusr1 = () => {
gatewayLog.info("signal SIGUSR1 received");
const authorized = consumeGatewaySigusr1RestartAuthorization();
if (!authorized && !isGatewaySigusr1RestartExternallyAllowed()) {
gatewayLog.warn(
"SIGUSR1 restart ignored (not authorized; enable commands.restart or use gateway tool).",
);
return;
}
markGatewaySigusr1RestartHandled();
request("restart", "SIGUSR1");
};
process.on("SIGTERM", onSigterm);
process.on("SIGINT", onSigint);
process.on("SIGUSR1", onSigusr1);
try {
const onIteration = createRestartIterationHook(() => {
// After an in-process restart (SIGUSR1), reset command-queue lane state.
// Interrupted tasks from the previous lifecycle may have left `active`
// counts elevated (their finally blocks never ran), permanently blocking
// new work from draining. This must happen here — at the restart
// coordinator level — rather than inside individual subsystem init
// functions, to avoid surprising cross-cutting side effects.
resetAllLanes();
});
// Keep process alive; SIGUSR1 triggers an in-process restart (no supervisor required).
// SIGTERM/SIGINT still exit after a graceful shutdown.
// eslint-disable-next-line no-constant-condition
while (true) {
onIteration();
server = await params.start();
await new Promise<void>((resolve) => {
restartResolver = resolve;
});
}
} finally {
await lock?.release();
cleanupSignals();
}
}
|