/**
 * Regression coverage for the agent loop's recovery / self-check paths.
 *
 * Run with: pnpm --filter @workspace/api-server test
 *
 * These tests exercise pure / DB-free behavior on AgentRunState plus the
 * exported prompt-formatting helper so the four canonical scenarios
 * called out in the agent-loop spec stay green on every change:
 *
 *   (a) circuit breaker tripping after the threshold
 *   (b) per-turn cache returning the same result for an identical
 *       (tool, args) call
 *   (c) validator downgrading a step that has no successful tool route
 *   (d) cross-turn working memory carrying an unresolved step into the
 *       next turn's system prompt
 */
import { test } from "node:test";
import assert from "node:assert/strict";

// `@workspace/db` reads DATABASE_URL at import time. Tests don't actually
// touch Postgres — every code path under test is DB-free — so a sentinel
// connection string is enough to satisfy the import guard.
process.env.DATABASE_URL ??= "postgres://test:test@127.0.0.1:5432/test";

import type { AgentRunState as AgentRunStateType } from "./agent-supervisor";

// Loaded with a dynamic import (not a static one) so that the DATABASE_URL
// assignment above is evaluated first; static imports are hoisted above
// every statement in the module.
const {
  AgentRunState,
  CIRCUIT_BREAKER_THRESHOLD,
  formatCarryOverPrompt,
} = await import("./agent-supervisor");

/**
 * Builds a fresh AgentRunState wired to an in-memory event sink.
 *
 * @returns the state under test plus the array that collects every event
 *          handed to the `emit` callback.
 */
function makeState() {
  const events: Array<{ event: string; data: Record<string, unknown> }> = [];
  const state = new AgentRunState({
    conversationId: "cnv_test",
    userId: "usr_test",
    messageId: "msg_test",
    emit: (ev) => events.push(ev),
  });
  return { state, events };
}

/**
 * Feeds a single-step plan into `state` and then marks that step as running.
 *
 * @param state  run state that receives the text deltas
 * @param stepId id used for the only step in the plan
 * @param opts   optional goal / success criteria; the goal defaults to
 *               `Goal for <stepId>`
 */
function feedPlanWithStep(
  state: AgentRunStateType,
  stepId: string,
  opts: {
    goal?: string;
    success_criteria?: string;
  } = {},
) {
  const plan = {
    steps: [
      {
        id: stepId,
        goal: opts.goal ?? `Goal for ${stepId}`,
        success_criteria: opts.success_criteria,
      },
    ],
  };
  // The plan payload is wrapped in <plan>…</plan>, the agent tag that the
  // "agent tags are stripped" test below asserts is removed from visible text.
  state.ingestTextDelta(`<plan>${JSON.stringify(plan)}</plan>\n`);
  // NOTE(review): this status update presumably also needs an agent-tag
  // wrapper; the tag name is not recoverable from this file — confirm
  // against ./agent-supervisor.
  state.ingestTextDelta(
    `${JSON.stringify({ id: stepId, status: "running" })}\n`,
  );
}

// ----------------------------------------------------------------- (a)

test("circuit breaker trips after THRESHOLD transient failures and stays open", () => {
  const { state } = makeState();
  state.noteIteration();

  // No failures recorded yet — no short-circuit.
  assert.equal(state.circuitCheck("search_pubmed", { query: "x" }), null);

  // Record threshold-1 transient failures → still closed.
  for (let i = 0; i < CIRCUIT_BREAKER_THRESHOLD - 1; i++) {
    state.recordToolOutcome({
      toolCallId: `c_${i}`,
      toolName: "search_pubmed",
      args: { query: "x" },
      result: { error: "rate limited", error_code: "rate_limited" },
      isError: true,
      durationMs: 10,
    });
  }
  assert.equal(state.circuitCheck("search_pubmed", { query: "x" }), null);

  // One more transient failure → breaker trips.
  state.recordToolOutcome({
    toolCallId: "c_final",
    toolName: "search_pubmed",
    args: { query: "x" },
    result: { error: "rate limited", error_code: "rate_limited" },
    isError: true,
    durationMs: 10,
  });
  const open = state.circuitCheck("search_pubmed", { query: "anything else" });
  assert.ok(open, "circuit should be open");
  assert.equal(open!.error_code, "circuit_open");
  assert.equal(open!.retryable, false);
  assert.match(open!.suggestion, /opentargets|uniprot|alternative/i);

  // A different source key on the same tool should NOT be open — the
  // breaker is per-source, not per-tool.
  assert.equal(
    state.circuitCheck("search_pubmed", { source: "europepmc" }),
    null,
  );
});

test("non-transient errors do not contribute to the circuit breaker", () => {
  const { state } = makeState();
  // Deliberately exceed the threshold to show the count is never reached.
  for (let i = 0; i < CIRCUIT_BREAKER_THRESHOLD + 2; i++) {
    state.recordToolOutcome({
      toolCallId: `c_${i}`,
      toolName: "lookup_uniprot",
      args: { id: "P12345" },
      result: { error: "bad request", error_code: "invalid_argument" },
      isError: true,
      durationMs: 5,
    });
  }
  assert.equal(state.circuitCheck("lookup_uniprot", { id: "P12345" }), null);
});

// ----------------------------------------------------------------- (b)

test("per-turn cache returns the same result for an identical (tool, args) call", () => {
  const { state } = makeState();
  const args = { query: "BRCA1", limit: 5 };
  const result = { hits: [{ pmid: "12345", title: "demo" }] };

  // No cache hit before recording.
  assert.equal(state.cacheLookup("search_pubmed", args), undefined);

  state.recordToolOutcome({
    toolCallId: "c1",
    toolName: "search_pubmed",
    args,
    result,
    isError: false,
    durationMs: 42,
  });

  // Same args → same cached object reference.
  const cached = state.cacheLookup("search_pubmed", args);
  assert.strictEqual(cached, result);

  // Equivalent args object (different reference, same JSON) → still hits.
  const cachedClone = state.cacheLookup("search_pubmed", {
    query: "BRCA1",
    limit: 5,
  });
  assert.strictEqual(cachedClone, result);

  // Different args → no hit.
  assert.equal(state.cacheLookup("search_pubmed", { query: "TP53" }), undefined);

  // Errors are NOT cached.
  state.recordToolOutcome({
    toolCallId: "c2",
    toolName: "search_pubmed",
    args: { query: "TP53" },
    result: { error: "boom", error_code: "internal" },
    isError: true,
    durationMs: 1,
  });
  assert.equal(state.cacheLookup("search_pubmed", { query: "TP53" }), undefined);
});

// ----------------------------------------------------------------- (c)

test("validator downgrades a 'done' step that has no successful tool route", () => {
  const { state } = makeState();
  state.noteIteration();
  feedPlanWithStep(state, "s1", { goal: "Find something" });

  // Model claims done with zero recorded tool calls.
  // NOTE(review): payload presumably needs an agent-tag wrapper — confirm
  // the tag name against ./agent-supervisor.
  state.ingestTextDelta(
    `${JSON.stringify({ id: "s1", status: "done" })}\n`,
  );

  const v = state.runValidator();
  assert.equal(v.passed, false);
  assert.equal(v.downgraded.length, 1);
  assert.equal(v.downgraded[0]!.id, "s1");
  assert.equal(v.downgraded[0]!.reason, "no_evidence");

  // The step itself was mutated to 'failed' with a downgrade note.
  const publicRun = state.toPublic("complete", new Date(), {
    input: 0,
    output: 0,
  });
  const step = publicRun.steps.find((s) => s.id === "s1")!;
  assert.equal(step.status, "failed");
  assert.match(step.note ?? "", /downgraded.*no_evidence/);
  assert.equal(state.needsRecoveryRound(), true);
});

test("validator passes when a 'done' step has a source-linked tool route", () => {
  const { state } = makeState();
  state.noteIteration();
  feedPlanWithStep(state, "s1", { goal: "Look up paper" });

  state.recordToolOutcome({
    toolCallId: "c1",
    toolName: "search_pubmed",
    args: { query: "BRCA1" },
    // Result includes a source-linked PMID, so the evidence-ref check passes.
    result: { hits: [{ pmid: "12345", url: "https://pubmed/12345" }] },
    isError: false,
    durationMs: 5,
  });
  // NOTE(review): payload presumably needs an agent-tag wrapper — confirm
  // the tag name against ./agent-supervisor.
  state.ingestTextDelta(
    `${JSON.stringify({ id: "s1", status: "done" })}\n`,
  );

  const v = state.runValidator();
  assert.equal(v.passed, true);
  assert.equal(v.downgraded.length, 0);
  assert.equal(state.needsRecoveryRound(), false);
});

// ----------------------------------------------------------------- (d)

test("cross-turn working memory carries an unresolved step into the next turn's system prompt", () => {
  const { state } = makeState();
  state.noteIteration();
  feedPlanWithStep(state, "s1", { goal: "Investigate target X" });
  // NOTE(review): payload presumably needs an agent-tag wrapper — confirm
  // the tag name against ./agent-supervisor.
  state.ingestTextDelta(
    `${JSON.stringify({
      id: "s1",
      status: "failed",
      note: "rate limited",
    })}\n`,
  );

  // The pending memory record should include the failed step.
  const record = state.buildPendingWorkingMemory();
  assert.ok(record, "expected a memory record for a failed step");
  assert.equal(record!.unresolved_steps.length, 1);
  assert.equal(record!.unresolved_steps[0]!.id, "s1");
  assert.equal(record!.unresolved_steps[0]!.goal, "Investigate target X");
  assert.equal(record!.unresolved_steps[0]!.note, "rate limited");
  assert.equal(record!.unresolved_steps[0]!.source_run_id, state.id);

  // The system-prompt formatter must surface the unresolved step verbatim.
  const prompt = formatCarryOverPrompt(record);
  assert.match(prompt, /Carry-over from the previous turn/);
  // NOTE(review): the note is matched inside literal parentheses; a bare `$`
  // here is an end-of-input anchor and could never match. Confirm the
  // parenthesised form against formatCarryOverPrompt's actual output.
  assert.match(prompt, /- s1: Investigate target X \(rate limited\)/);

  // Resolved-only state → no carry-over (memory is cleared next turn).
  const { state: clean } = makeState();
  clean.noteIteration();
  feedPlanWithStep(clean, "s1");
  clean.recordToolOutcome({
    toolCallId: "c1",
    toolName: "search_pubmed",
    args: { query: "x" },
    result: { hits: [{ pmid: "1", url: "https://e/1" }] },
    isError: false,
    durationMs: 1,
  });
  // NOTE(review): payload presumably needs an agent-tag wrapper — confirm
  // the tag name against ./agent-supervisor.
  clean.ingestTextDelta(
    `${JSON.stringify({ id: "s1", status: "done" })}\n`,
  );
  assert.equal(clean.buildPendingWorkingMemory(), null);
  assert.equal(formatCarryOverPrompt(null), "");
});

test("reflection-only unresolved steps are also carried forward", () => {
  const { state } = makeState();
  state.noteIteration();
  feedPlanWithStep(state, "s2", { goal: "Cross-reference disease" });

  // Plan-side step is still 'running', but reflection lists it unresolved.
  // NOTE(review): the reflection payload presumably needs an agent-tag
  // wrapper — confirm the tag name against ./agent-supervisor.
  state.ingestTextDelta(
    `${JSON.stringify({
      step_states: [{ id: "s2", status: "pending" }],
      unresolved: [{ id: "s2", reason: "no time" }],
    })}\n`,
  );

  const record = state.buildPendingWorkingMemory();
  assert.ok(record);
  assert.equal(record!.unresolved_steps.length, 1);
  assert.equal(record!.unresolved_steps[0]!.id, "s2");
});

// ----------------------------------------------------------------- bonus

test("agent tags are stripped from user-visible deltas", () => {
  const { state } = makeState();
  const visible = state.ingestTextDelta(
    `Hello!\n<plan>${JSON.stringify({
      steps: [{ id: "s1", goal: "g" }],
    })}</plan>\nWorld\n`,
  );

  // 'Hello!' and 'World' both pass through; the plan tag does not.
  assert.match(visible, /Hello!/);
  assert.match(visible, /World/);
  assert.doesNotMatch(visible, /<plan>/);
  assert.doesNotMatch(visible, /<\/plan>/);
});