doatlas-2 / artifacts /api-server /src /lib /agent-supervisor.test.ts

Add files using upload-large-folder tool

5871090 verified 15 days ago

10.3 kB

	/**
	* Regression coverage for the agent loop's recovery / self-check paths.
	*
	* Run with: pnpm --filter @workspace/api-server test
	*
	* These tests exercise pure / DB-free behavior on AgentRunState plus the
	* exported prompt-formatting helper so the four canonical scenarios
	* called out in the agent-loop spec stay green on every change:
	*
	* (a) circuit breaker tripping after the threshold
	* (b) per-turn cache returning the same result for an identical
	* (tool, args) call
	* (c) validator downgrading a step that has no successful tool route
	* (d) cross-turn working memory carrying an unresolved step into the
	* next turn's system prompt
	*/
	import { test } from "node:test";
	import assert from "node:assert/strict";

	// `@workspace/db` reads DATABASE_URL at import time. Tests don't actually
	// touch Postgres — every code path under test is DB-free — so a sentinel
	// connection string is enough to satisfy the import guard.
	// `??=` assigns only when the variable is unset, so a DATABASE_URL already
	// provided by the environment is left as-is.
	process.env.DATABASE_URL ??= "postgres://test:test@127.0.0.1:5432/test";

	// Type-only import: erased at compile time, so it triggers no module load.
	import type { AgentRunState as AgentRunStateType } from "./agent-supervisor";

	// Loaded via a dynamic `await import` instead of a static import. Static
	// imports are hoisted and evaluated before any module-level statement, so
	// they would run before the DATABASE_URL assignment above.
	// NOTE(review): presumably ./agent-supervisor pulls in `@workspace/db`
	// (directly or transitively) — confirm against that module.
	const {
	AgentRunState,
	CIRCUIT_BREAKER_THRESHOLD,
	formatCarryOverPrompt,
	} = await import("./agent-supervisor");

	/**
	 * Creates a fresh AgentRunState with fixed test identifiers.
	 *
	 * @returns the new state together with the array that collects every
	 *   event handed to the state's `emit` callback.
	 */
	function makeState() {
	  const captured: Array<{ event: string; data: Record<string, unknown> }> = [];
	  const runState = new AgentRunState({
	    conversationId: "cnv_test",
	    userId: "usr_test",
	    messageId: "msg_test",
	    // Append each emitted event to the capture array.
	    emit: (ev) => captured.push(ev),
	  });
	  return { state: runState, events: captured };
	}

	/**
	 * Streams a single-step plan into `state`, then marks that step as running.
	 *
	 * Two text deltas are ingested, in order: a `<plan>` tag holding one step,
	 * followed by a `<step_update>` tag with status "running" for that step.
	 *
	 * @param state  Run state that receives the deltas via `ingestTextDelta`.
	 * @param stepId Identifier used for the plan step and the status update.
	 * @param opts   Optional step fields; `goal` falls back to `Goal for <stepId>`,
	 *   and an undefined `success_criteria` is omitted by JSON serialization.
	 */
	function feedPlanWithStep(state: AgentRunStateType, stepId: string, opts: {
	  goal?: string;
	  success_criteria?: string;
	} = {}) {
	  // Wraps a JSON payload in an agent tag, terminated by a newline.
	  const asTag = (tag: string, payload: unknown): string =>
	    `<${tag}>${JSON.stringify(payload)}</${tag}>\n`;

	  const onlyStep = {
	    id: stepId,
	    goal: opts.goal ?? `Goal for ${stepId}`,
	    success_criteria: opts.success_criteria,
	  };

	  state.ingestTextDelta(asTag("plan", { steps: [onlyStep] }));
	  state.ingestTextDelta(asTag("step_update", { id: stepId, status: "running" }));
	}

	// ----------------------------------------------------------------- (a)

	// Scenario (a): the breaker stays closed below the threshold, opens once the
	// threshold is reached, and is keyed per source rather than per tool.
	test("circuit breaker trips after THRESHOLD transient failures and stays open", () => {
	  const { state } = makeState();
	  state.noteIteration();

	  // Records one rate-limited failure for search_pubmed with the given call id.
	  const recordRateLimited = (toolCallId: string) =>
	    state.recordToolOutcome({
	      toolCallId,
	      toolName: "search_pubmed",
	      args: { query: "x" },
	      result: { error: "rate limited", error_code: "rate_limited" },
	      isError: true,
	      durationMs: 10,
	    });

	  // No failures recorded yet — no short-circuit.
	  assert.equal(state.circuitCheck("search_pubmed", { query: "x" }), null);

	  // Record threshold-1 transient failures → still closed.
	  for (let i = 0; i < CIRCUIT_BREAKER_THRESHOLD - 1; i++) {
	    recordRateLimited(`c_${i}`);
	  }
	  assert.equal(state.circuitCheck("search_pubmed", { query: "x" }), null);

	  // One more transient failure → breaker trips.
	  recordRateLimited("c_final");

	  const open = state.circuitCheck("search_pubmed", { query: "anything else" });
	  assert.ok(open, "circuit should be open");
	  assert.equal(open!.error_code, "circuit_open");
	  assert.equal(open!.retryable, false);
	  // Alternation: the suggestion must mention at least one fallback. The pipes
	  // are deliberately unescaped — `\|` would only match a literal "|" character.
	  assert.match(open!.suggestion, /opentargets|uniprot|alternative/i);

	  // A different source key on the same tool should NOT be open — the
	  // breaker is per-source, not per-tool.
	  assert.equal(
	    state.circuitCheck("search_pubmed", { source: "europepmc" }),
	    null,
	  );
	});

	// Companion to scenario (a): errors reported as `invalid_argument` must
	// leave the breaker closed no matter how many of them are recorded.
	test("non-transient errors do not contribute to the circuit breaker", () => {
	  const { state } = makeState();

	  // Go past the threshold on purpose so a miscounted breaker would trip.
	  const totalFailures = CIRCUIT_BREAKER_THRESHOLD + 2;
	  let attempt = 0;
	  while (attempt < totalFailures) {
	    state.recordToolOutcome({
	      toolCallId: `c_${attempt}`,
	      toolName: "lookup_uniprot",
	      args: { id: "P12345" },
	      result: { error: "bad request", error_code: "invalid_argument" },
	      isError: true,
	      durationMs: 5,
	    });
	    attempt += 1;
	  }

	  const verdict = state.circuitCheck("lookup_uniprot", { id: "P12345" });
	  assert.equal(verdict, null);
	});

	// ----------------------------------------------------------------- (b)

	// Scenario (b): successful outcomes are cached by (tool, args) and returned
	// by reference; failed outcomes never populate the cache.
	test("per-turn cache returns the same result for an identical (tool, args) call", () => {
	  const { state } = makeState();
	  const brcaArgs = { query: "BRCA1", limit: 5 };
	  const payload = { hits: [{ pmid: "12345", title: "demo" }] };

	  // Nothing has been recorded yet, so the lookup misses.
	  assert.equal(state.cacheLookup("search_pubmed", brcaArgs), undefined);

	  state.recordToolOutcome({
	    toolCallId: "c1",
	    toolName: "search_pubmed",
	    args: brcaArgs,
	    result: payload,
	    isError: false,
	    durationMs: 42,
	  });

	  // The very same args object yields the very same result object.
	  assert.strictEqual(state.cacheLookup("search_pubmed", brcaArgs), payload);

	  // A structurally equal args object (new reference) must hit as well.
	  const equivalentArgs = { query: "BRCA1", limit: 5 };
	  assert.strictEqual(state.cacheLookup("search_pubmed", equivalentArgs), payload);

	  // Args that differ do not hit.
	  assert.equal(state.cacheLookup("search_pubmed", { query: "TP53" }), undefined);

	  // A failed outcome must not be stored in the cache.
	  state.recordToolOutcome({
	    toolCallId: "c2",
	    toolName: "search_pubmed",
	    args: { query: "TP53" },
	    result: { error: "boom", error_code: "internal" },
	    isError: true,
	    durationMs: 1,
	  });
	  assert.equal(state.cacheLookup("search_pubmed", { query: "TP53" }), undefined);
	});

	// ----------------------------------------------------------------- (c)

	// Scenario (c): a step reported as done without any recorded tool call is
	// downgraded by the validator and flags the run for a recovery round.
	test("validator downgrades a 'done' step that has no successful tool route", () => {
	  const { state } = makeState();
	  state.noteIteration();
	  feedPlanWithStep(state, "s1", { goal: "Find something" });

	  // The model declares the step done although no tool outcome was recorded.
	  const doneUpdate = JSON.stringify({ id: "s1", status: "done" });
	  state.ingestTextDelta(`<step_update>${doneUpdate}</step_update>\n`);

	  const verdict = state.runValidator();
	  assert.equal(verdict.passed, false);
	  assert.equal(verdict.downgraded.length, 1);
	  const firstDowngrade = verdict.downgraded[0]!;
	  assert.equal(firstDowngrade.id, "s1");
	  assert.equal(firstDowngrade.reason, "no_evidence");

	  // The public view shows the step as 'failed', with a note naming the reason.
	  const zeroUsage = { input: 0, output: 0 };
	  const snapshot = state.toPublic("complete", new Date(), zeroUsage);
	  const downgradedStep = snapshot.steps.find((s) => s.id === "s1")!;
	  assert.equal(downgradedStep.status, "failed");
	  assert.match(downgradedStep.note ?? "", /downgraded.*no_evidence/);

	  assert.equal(state.needsRecoveryRound(), true);
	});

	// Counterpart to the downgrade case: a done step backed by a successful,
	// source-linked tool outcome passes validation and needs no recovery round.
	test("validator passes when a 'done' step has a source-linked tool route", () => {
	  const { state } = makeState();
	  state.noteIteration();
	  feedPlanWithStep(state, "s1", { goal: "Look up paper" });

	  // The result carries a source-linked PMID so the evidence-ref check passes.
	  const linkedResult = { hits: [{ pmid: "12345", url: "https://pubmed/12345" }] };
	  state.recordToolOutcome({
	    toolCallId: "c1",
	    toolName: "search_pubmed",
	    args: { query: "BRCA1" },
	    result: linkedResult,
	    isError: false,
	    durationMs: 5,
	  });

	  const doneUpdate = JSON.stringify({ id: "s1", status: "done" });
	  state.ingestTextDelta(`<step_update>${doneUpdate}</step_update>\n`);

	  const verdict = state.runValidator();
	  assert.equal(verdict.passed, true);
	  assert.equal(verdict.downgraded.length, 0);
	  assert.equal(state.needsRecoveryRound(), false);
	});

	// ----------------------------------------------------------------- (d)

	// Scenario (d): a failed step is captured in the pending working-memory
	// record and rendered into the carry-over prompt; a fully resolved run
	// produces no record and an empty prompt.
	test("cross-turn working memory carries an unresolved step into the next turn's system prompt", () => {
	  const { state } = makeState();
	  state.noteIteration();
	  feedPlanWithStep(state, "s1", { goal: "Investigate target X" });
	  state.ingestTextDelta(
	    `<step_update>${JSON.stringify({
	      id: "s1",
	      status: "failed",
	      note: "rate limited",
	    })}</step_update>\n`,
	  );

	  // The pending memory record should include the failed step.
	  const record = state.buildPendingWorkingMemory();
	  assert.ok(record, "expected a memory record for a failed step");
	  assert.equal(record!.unresolved_steps.length, 1);
	  assert.equal(record!.unresolved_steps[0]!.id, "s1");
	  assert.equal(record!.unresolved_steps[0]!.goal, "Investigate target X");
	  assert.equal(record!.unresolved_steps[0]!.note, "rate limited");
	  assert.equal(record!.unresolved_steps[0]!.source_run_id, state.id);

	  // The system-prompt formatter must surface the unresolved step verbatim.
	  const prompt = formatCarryOverPrompt(record);
	  assert.match(prompt, /Carry-over from the previous turn/);
	  // The parentheses are escaped so they match literally. An unescaped `$`
	  // anchors to end-of-input and can never be followed by more text, which
	  // would make this assertion fail unconditionally.
	  // NOTE(review): assumes the formatter renders the note in parentheses —
	  // confirm against formatCarryOverPrompt.
	  assert.match(prompt, /- s1: Investigate target X \(rate limited\)/);

	  // Resolved-only state → no carry-over (memory is cleared next turn).
	  const { state: clean } = makeState();
	  clean.noteIteration();
	  feedPlanWithStep(clean, "s1");
	  clean.recordToolOutcome({
	    toolCallId: "c1",
	    toolName: "search_pubmed",
	    args: { query: "x" },
	    result: { hits: [{ pmid: "1", url: "https://e/1" }] },
	    isError: false,
	    durationMs: 1,
	  });
	  clean.ingestTextDelta(
	    `<step_update>${JSON.stringify({ id: "s1", status: "done" })}</step_update>\n`,
	  );
	  assert.equal(clean.buildPendingWorkingMemory(), null);
	  assert.equal(formatCarryOverPrompt(null), "");
	});

	// Companion to scenario (d): a step listed as unresolved only inside a
	// <reflection> block must still appear in the pending working memory.
	test("reflection-only unresolved steps are also carried forward", () => {
	  const { state } = makeState();
	  state.noteIteration();
	  feedPlanWithStep(state, "s2", { goal: "Cross-reference disease" });

	  // The plan still has the step as 'running'; only the reflection marks it
	  // as unresolved.
	  const reflection = JSON.stringify({
	    step_states: [{ id: "s2", status: "pending" }],
	    unresolved: [{ id: "s2", reason: "no time" }],
	  });
	  state.ingestTextDelta(`<reflection>${reflection}</reflection>\n`);

	  const pending = state.buildPendingWorkingMemory();
	  assert.ok(pending);
	  assert.equal(pending!.unresolved_steps.length, 1);
	  assert.equal(pending!.unresolved_steps[0]!.id, "s2");
	});

	// ----------------------------------------------------------------- bonus

	// Bonus: text returned by ingestTextDelta keeps the surrounding prose but
	// contains neither the opening nor the closing <plan> tag.
	test("agent tags are stripped from user-visible deltas", () => {
	  const { state } = makeState();

	  const planJson = JSON.stringify({
	    steps: [{ id: "s1", goal: "g" }],
	  });
	  const shown = state.ingestTextDelta(`Hello!\n<plan>${planJson}</plan>\nWorld\n`);

	  // Prose on either side of the plan tag passes through.
	  for (const expected of [/Hello!/, /World/]) {
	    assert.match(shown, expected);
	  }
	  // Neither half of the tag may leak into the visible text.
	  for (const forbidden of [/<plan>/, /<\/plan>/]) {
	    assert.doesNotMatch(shown, forbidden);
	  }
	});