// doatlas-2 / artifacts /api-server /src /lib /agent-supervisor.test.ts
// Iostream-Li's picture
// Add files using upload-large-folder tool
// 5871090 verified
/**
* Regression coverage for the agent loop's recovery / self-check paths.
*
* Run with: pnpm --filter @workspace/api-server test
*
* These tests exercise pure / DB-free behavior on AgentRunState plus the
* exported prompt-formatting helper so the four canonical scenarios
* called out in the agent-loop spec stay green on every change:
*
* (a) circuit breaker tripping after the threshold
* (b) per-turn cache returning the same result for an identical
* (tool, args) call
* (c) validator downgrading a step that has no successful tool route
* (d) cross-turn working memory carrying an unresolved step into the
* next turn's system prompt
*/
import { test } from "node:test";
import assert from "node:assert/strict";
// `@workspace/db` reads DATABASE_URL at import time. Tests don't actually
// touch Postgres — every code path under test is DB-free — so a sentinel
// connection string is enough to satisfy the import guard.
process.env.DATABASE_URL ??= "postgres://test:test@127.0.0.1:5432/test";
import type { AgentRunState as AgentRunStateType } from "./agent-supervisor";
const {
AgentRunState,
CIRCUIT_BREAKER_THRESHOLD,
formatCarryOverPrompt,
} = await import("./agent-supervisor");
/**
 * Builds a fresh `AgentRunState` with fixed placeholder identifiers and an
 * `emit` callback that appends every payload to an in-memory array.
 *
 * @returns The new state plus the array collecting emitted payloads, in
 *   the order `emit` was called.
 */
function makeState() {
  type EmittedEvent = { event: string; data: Record<string, unknown> };

  const captured: EmittedEvent[] = [];
  const runState = new AgentRunState({
    conversationId: "cnv_test",
    userId: "usr_test",
    messageId: "msg_test",
    // Every emitted payload is appended to `captured`.
    emit: (ev) => captured.push(ev),
  });

  return { state: runState, events: captured };
}
/**
 * Feeds `state` a single-step plan followed by a step update that marks
 * that step as "running".
 *
 * Two text deltas are ingested, in order:
 *   1. `<plan>{...}</plan>\n` holding one step built from `stepId` and `opts`.
 *   2. `<step_update>{...}</step_update>\n` with `status: "running"`.
 *
 * @param state  Run state that receives the two deltas.
 * @param stepId Identifier used for the plan step and the step update.
 * @param opts   Optional overrides; `goal` falls back to `Goal for <stepId>`.
 *   `success_criteria` is passed through as given, so when it is
 *   `undefined` JSON.stringify leaves the key out of the payload.
 */
function feedPlanWithStep(state: AgentRunStateType, stepId: string, opts: {
  goal?: string;
  success_criteria?: string;
} = {}) {
  // Wraps a JSON payload in an agent tag and terminates it with a newline.
  const tagged = (tag: string, payload: unknown): string =>
    `<${tag}>${JSON.stringify(payload)}</${tag}>\n`;

  const planStep = {
    id: stepId,
    goal: opts.goal ?? `Goal for ${stepId}`,
    success_criteria: opts.success_criteria,
  };

  state.ingestTextDelta(tagged("plan", { steps: [planStep] }));
  state.ingestTextDelta(tagged("step_update", { id: stepId, status: "running" }));
}
// ----------------------------------------------------------------- (a)
test("circuit breaker trips after THRESHOLD transient failures and stays open", () => {
  const { state } = makeState();
  state.noteIteration();

  // Records one rate-limited `search_pubmed` failure under the given call id.
  const recordRateLimited = (toolCallId: string): void => {
    state.recordToolOutcome({
      toolCallId,
      toolName: "search_pubmed",
      args: { query: "x" },
      result: { error: "rate limited", error_code: "rate_limited" },
      isError: true,
      durationMs: 10,
    });
  };

  // Nothing has failed yet, so the check must not short-circuit.
  assert.equal(state.circuitCheck("search_pubmed", { query: "x" }), null);

  // One failure short of the threshold: the breaker is expected to stay closed.
  const justBelowThreshold = CIRCUIT_BREAKER_THRESHOLD - 1;
  let recorded = 0;
  while (recorded < justBelowThreshold) {
    recordRateLimited(`c_${recorded}`);
    recorded += 1;
  }
  assert.equal(state.circuitCheck("search_pubmed", { query: "x" }), null);

  // The failure that reaches the threshold is expected to trip the breaker.
  recordRateLimited("c_final");
  const verdict = state.circuitCheck("search_pubmed", { query: "anything else" });
  assert.ok(verdict, "circuit should be open");
  const { error_code, retryable, suggestion } = verdict!;
  assert.equal(error_code, "circuit_open");
  assert.equal(retryable, false);
  assert.match(suggestion, /opentargets|uniprot|alternative/i);

  // Same tool, different source key: expected to remain closed, because
  // the breaker is per-source rather than per-tool.
  const otherSource = state.circuitCheck("search_pubmed", { source: "europepmc" });
  assert.equal(otherSource, null);
});
test("non-transient errors do not contribute to the circuit breaker", () => {
  const { state } = makeState();

  // Record more `invalid_argument` failures than the breaker threshold.
  const totalFailures = CIRCUIT_BREAKER_THRESHOLD + 2;
  let attempt = 0;
  while (attempt < totalFailures) {
    state.recordToolOutcome({
      toolCallId: `c_${attempt}`,
      toolName: "lookup_uniprot",
      args: { id: "P12345" },
      result: { error: "bad request", error_code: "invalid_argument" },
      isError: true,
      durationMs: 5,
    });
    attempt += 1;
  }

  // The breaker is expected to stay closed for this kind of error.
  const verdict = state.circuitCheck("lookup_uniprot", { id: "P12345" });
  assert.equal(verdict, null);
});
// ----------------------------------------------------------------- (b)
test("per-turn cache returns the same result for an identical (tool, args) call", () => {
  const { state } = makeState();
  const TOOL = "search_pubmed";
  const searchArgs = { query: "BRCA1", limit: 5 };
  const payload = { hits: [{ pmid: "12345", title: "demo" }] };

  // Nothing recorded yet, so the lookup must miss.
  assert.equal(state.cacheLookup(TOOL, searchArgs), undefined);

  state.recordToolOutcome({
    toolCallId: "c1",
    toolName: "search_pubmed",
    args: searchArgs,
    result: payload,
    isError: false,
    durationMs: 42,
  });

  // The very same args object yields the recorded object by reference.
  const hitBySameRef = state.cacheLookup(TOOL, searchArgs);
  assert.strictEqual(hitBySameRef, payload);

  // A structurally equal but distinct args object must hit as well.
  const hitByEqualArgs = state.cacheLookup(TOOL, { query: "BRCA1", limit: 5 });
  assert.strictEqual(hitByEqualArgs, payload);

  // Unrelated args miss.
  assert.equal(state.cacheLookup(TOOL, { query: "TP53" }), undefined);

  // A failed outcome must not populate the cache.
  state.recordToolOutcome({
    toolCallId: "c2",
    toolName: "search_pubmed",
    args: { query: "TP53" },
    result: { error: "boom", error_code: "internal" },
    isError: true,
    durationMs: 1,
  });
  assert.equal(state.cacheLookup(TOOL, { query: "TP53" }), undefined);
});
// ----------------------------------------------------------------- (c)
test("validator downgrades a 'done' step that has no successful tool route", () => {
  const { state } = makeState();
  state.noteIteration();
  feedPlanWithStep(state, "s1", { goal: "Find something" });

  // The model reports the step as done although no tool call was recorded.
  const doneUpdate = `<step_update>${JSON.stringify({ id: "s1", status: "done" })}</step_update>\n`;
  state.ingestTextDelta(doneUpdate);

  const verdict = state.runValidator();
  assert.equal(verdict.passed, false);
  assert.equal(verdict.downgraded.length, 1);
  const [firstDowngrade] = verdict.downgraded;
  assert.equal(firstDowngrade!.id, "s1");
  assert.equal(firstDowngrade!.reason, "no_evidence");

  // The public snapshot must show the step as 'failed' with a downgrade note.
  const snapshot = state.toPublic("complete", new Date(), {
    input: 0,
    output: 0,
  });
  const downgradedStep = snapshot.steps.find((candidate) => candidate.id === "s1")!;
  assert.equal(downgradedStep.status, "failed");
  assert.match(downgradedStep.note ?? "", /downgraded.*no_evidence/);

  assert.equal(state.needsRecoveryRound(), true);
});
test("validator passes when a 'done' step has a source-linked tool route", () => {
  const { state } = makeState();
  state.noteIteration();
  feedPlanWithStep(state, "s1", { goal: "Look up paper" });

  // A successful call whose result carries a PMID and URL, so the
  // evidence-ref check is expected to pass.
  state.recordToolOutcome({
    toolCallId: "c1",
    toolName: "search_pubmed",
    args: { query: "BRCA1" },
    result: { hits: [{ pmid: "12345", url: "https://pubmed/12345" }] },
    isError: false,
    durationMs: 5,
  });

  const doneUpdate = `<step_update>${JSON.stringify({ id: "s1", status: "done" })}</step_update>\n`;
  state.ingestTextDelta(doneUpdate);

  const verdict = state.runValidator();
  assert.equal(verdict.passed, true);
  assert.equal(verdict.downgraded.length, 0);
  assert.equal(state.needsRecoveryRound(), false);
});
// ----------------------------------------------------------------- (d)
test("cross-turn working memory carries an unresolved step into the next turn's system prompt", () => {
  const { state } = makeState();
  state.noteIteration();
  feedPlanWithStep(state, "s1", { goal: "Investigate target X" });

  // Mark the step as failed, with a note explaining why.
  const failedUpdate = JSON.stringify({
    id: "s1",
    status: "failed",
    note: "rate limited",
  });
  state.ingestTextDelta(`<step_update>${failedUpdate}</step_update>\n`);

  // The pending memory record must list the failed step.
  const record = state.buildPendingWorkingMemory();
  assert.ok(record, "expected a memory record for a failed step");
  assert.equal(record!.unresolved_steps.length, 1);
  const [carried] = record!.unresolved_steps;
  assert.equal(carried!.id, "s1");
  assert.equal(carried!.goal, "Investigate target X");
  assert.equal(carried!.note, "rate limited");
  assert.equal(carried!.source_run_id, state.id);

  // The formatted system prompt must contain the unresolved step line.
  const prompt = formatCarryOverPrompt(record);
  assert.match(prompt, /Carry-over from the previous turn/);
  assert.match(prompt, /- s1: Investigate target X \(rate limited\)/);

  // A run whose only step completed with evidence yields no record, and
  // a null record formats to an empty prompt.
  const clean = makeState().state;
  clean.noteIteration();
  feedPlanWithStep(clean, "s1");
  clean.recordToolOutcome({
    toolCallId: "c1",
    toolName: "search_pubmed",
    args: { query: "x" },
    result: { hits: [{ pmid: "1", url: "https://e/1" }] },
    isError: false,
    durationMs: 1,
  });
  const doneUpdate = `<step_update>${JSON.stringify({ id: "s1", status: "done" })}</step_update>\n`;
  clean.ingestTextDelta(doneUpdate);
  assert.equal(clean.buildPendingWorkingMemory(), null);
  assert.equal(formatCarryOverPrompt(null), "");
});
test("reflection-only unresolved steps are also carried forward", () => {
  const { state } = makeState();
  state.noteIteration();
  feedPlanWithStep(state, "s2", { goal: "Cross-reference disease" });

  // No step_update follows the initial "running" one; only the reflection
  // block names s2 as unresolved.
  const reflection = JSON.stringify({
    step_states: [{ id: "s2", status: "pending" }],
    unresolved: [{ id: "s2", reason: "no time" }],
  });
  state.ingestTextDelta(`<reflection>${reflection}</reflection>\n`);

  const record = state.buildPendingWorkingMemory();
  assert.ok(record);
  assert.equal(record!.unresolved_steps.length, 1);
  const [carried] = record!.unresolved_steps;
  assert.equal(carried!.id, "s2");
});
// ----------------------------------------------------------------- bonus
test("agent tags are stripped from user-visible deltas", () => {
  const { state } = makeState();

  // One delta with plain text on either side of a <plan> block.
  const planJson = JSON.stringify({
    steps: [{ id: "s1", goal: "g" }],
  });
  const visible = state.ingestTextDelta(`Hello!\n<plan>${planJson}</plan>\nWorld\n`);

  // The surrounding text must pass through; neither plan tag may appear.
  for (const expected of [/Hello!/, /World/]) {
    assert.match(visible, expected);
  }
  for (const forbidden of [/<plan>/, /<\/plan>/]) {
    assert.doesNotMatch(visible, forbidden);
  }
});