File size: 10,323 Bytes
/**
* Regression coverage for the agent loop's recovery / self-check paths.
*
* Run with: pnpm --filter @workspace/api-server test
*
* These tests exercise pure / DB-free behavior on AgentRunState plus the
* exported prompt-formatting helper so the four canonical scenarios
* called out in the agent-loop spec stay green on every change:
*
* (a) circuit breaker tripping after the threshold
* (b) per-turn cache returning the same result for an identical
* (tool, args) call
* (c) validator downgrading a step that has no successful tool route
* (d) cross-turn working memory carrying an unresolved step into the
* next turn's system prompt
*/
import { test } from "node:test";
import assert from "node:assert/strict";
// `@workspace/db` reads DATABASE_URL at import time. Tests don't actually
// touch Postgres — every code path under test is DB-free — so a sentinel
// connection string is enough to satisfy the import guard.
// `??=` only assigns when DATABASE_URL is not already set, so a value that
// comes from the real environment is left untouched.
process.env.DATABASE_URL ??= "postgres://test:test@127.0.0.1:5432/test";
// Type-only import: it is used for annotations only.
import type { AgentRunState as AgentRunStateType } from "./agent-supervisor";
// The runtime exports are loaded with a dynamic `await import(...)`, which is
// evaluated in statement order, i.e. after the DATABASE_URL assignment above.
// NOTE(review): presumably "./agent-supervisor" pulls in `@workspace/db`
// transitively, which is why a hoisted static import is avoided — confirm.
const {
AgentRunState,
CIRCUIT_BREAKER_THRESHOLD,
formatCarryOverPrompt,
} = await import("./agent-supervisor");
/**
 * Builds a fresh AgentRunState configured with fixed test identifiers.
 *
 * @returns The new state plus the array that receives every event passed to
 *   the state's `emit` callback, in call order.
 */
function makeState() {
  type EmittedEvent = { event: string; data: Record<string, unknown> };

  // Sink for emitted events; handed back so tests can inspect it.
  const captured: EmittedEvent[] = [];

  const run = new AgentRunState({
    conversationId: "cnv_test",
    userId: "usr_test",
    messageId: "msg_test",
    emit: (emitted) => captured.push(emitted),
  });

  return { state: run, events: captured };
}
/**
 * Feeds `state` a one-step plan followed by a step update that marks the
 * step as running, both as tagged text deltas.
 *
 * @param state - Run state whose `ingestTextDelta` receives the two deltas.
 * @param stepId - Id used for the plan step and for the step update.
 * @param opts - Optional `goal` (falls back to `Goal for <stepId>`) and
 *   `success_criteria` for the plan step.
 */
function feedPlanWithStep(state: AgentRunStateType, stepId: string, opts: {
  goal?: string;
  success_criteria?: string;
} = {}) {
  const onlyStep = {
    id: stepId,
    goal: opts.goal ?? `Goal for ${stepId}`,
    success_criteria: opts.success_criteria,
  };
  const markRunning = { id: stepId, status: "running" };

  // The plan goes in first, then the update for the step it declares.
  const deltas = [
    `<plan>${JSON.stringify({ steps: [onlyStep] })}</plan>\n`,
    `<step_update>${JSON.stringify(markRunning)}</step_update>\n`,
  ];
  for (const delta of deltas) {
    state.ingestTextDelta(delta);
  }
}
// ----------------------------------------------------------------- (a)
// Scenario (a): enough transient failures open the breaker.
test("circuit breaker trips after THRESHOLD transient failures and stays open", () => {
  const { state: run } = makeState();
  run.noteIteration();

  // Every failure in this scenario is the same rate-limit error for the same
  // tool and args; only the tool-call id differs between recordings.
  const recordRateLimited = (toolCallId: string): void => {
    run.recordToolOutcome({
      toolCallId,
      toolName: "search_pubmed",
      args: { query: "x" },
      result: { error: "rate limited", error_code: "rate_limited" },
      isError: true,
      durationMs: 10,
    });
  };
  const probe = () => run.circuitCheck("search_pubmed", { query: "x" });

  // With nothing recorded, the check must not short-circuit.
  assert.equal(probe(), null);

  // One failure short of the threshold: the breaker is still closed.
  let recorded = 0;
  while (recorded < CIRCUIT_BREAKER_THRESHOLD - 1) {
    recordRateLimited(`c_${recorded}`);
    recorded += 1;
  }
  assert.equal(probe(), null);

  // The failure that reaches the threshold trips the breaker; the check is
  // made with different query args than the ones that failed.
  recordRateLimited("c_final");
  const verdict = run.circuitCheck("search_pubmed", { query: "anything else" });
  assert.ok(verdict, "circuit should be open");
  assert.equal(verdict!.error_code, "circuit_open");
  assert.equal(verdict!.retryable, false);
  assert.match(verdict!.suggestion, /opentargets|uniprot|alternative/i);

  // A different source key on the same tool is expected to stay closed: the
  // breaker is per-source, not per-tool.
  const otherSource = run.circuitCheck("search_pubmed", { source: "europepmc" });
  assert.equal(otherSource, null);
});
// Counterpart to scenario (a): failures with a non-transient error code.
test("non-transient errors do not contribute to the circuit breaker", () => {
  const { state: run } = makeState();
  const lookupArgs = () => ({ id: "P12345" });

  // Record more failures than the threshold, all with an invalid-argument
  // error code.
  let attempt = 0;
  while (attempt < CIRCUIT_BREAKER_THRESHOLD + 2) {
    run.recordToolOutcome({
      toolCallId: `c_${attempt}`,
      toolName: "lookup_uniprot",
      args: lookupArgs(),
      result: { error: "bad request", error_code: "invalid_argument" },
      isError: true,
      durationMs: 5,
    });
    attempt += 1;
  }

  // The breaker must still report closed for the same tool and args.
  const verdict = run.circuitCheck("lookup_uniprot", lookupArgs());
  assert.equal(verdict, null);
});
// ----------------------------------------------------------------- (b)
// Scenario (b): successful results are cached per (tool, args).
test("per-turn cache returns the same result for an identical (tool, args) call", () => {
  const { state: run } = makeState();
  const tool = "search_pubmed";
  const brcaArgs = { query: "BRCA1", limit: 5 };
  const brcaResult = { hits: [{ pmid: "12345", title: "demo" }] };

  // Nothing has been recorded yet, so the lookup misses.
  assert.equal(run.cacheLookup(tool, brcaArgs), undefined);

  run.recordToolOutcome({
    toolCallId: "c1",
    toolName: tool,
    args: brcaArgs,
    result: brcaResult,
    isError: false,
    durationMs: 42,
  });

  // The very same args object yields the very same result object.
  const viaSameArgs = run.cacheLookup(tool, brcaArgs);
  assert.strictEqual(viaSameArgs, brcaResult);

  // A structurally equal args object (new reference) hits as well.
  const viaEqualArgs = run.cacheLookup(tool, { query: "BRCA1", limit: 5 });
  assert.strictEqual(viaEqualArgs, brcaResult);

  // Different args miss.
  assert.equal(run.cacheLookup(tool, { query: "TP53" }), undefined);

  // A failed outcome must not populate the cache.
  run.recordToolOutcome({
    toolCallId: "c2",
    toolName: tool,
    args: { query: "TP53" },
    result: { error: "boom", error_code: "internal" },
    isError: true,
    durationMs: 1,
  });
  assert.equal(run.cacheLookup(tool, { query: "TP53" }), undefined);
});
// ----------------------------------------------------------------- (c)
// Scenario (c): a 'done' claim with no recorded tool outcome is rejected.
test("validator downgrades a 'done' step that has no successful tool route", () => {
  const { state: run } = makeState();
  run.noteIteration();
  feedPlanWithStep(run, "s1", { goal: "Find something" });

  // The model reports the step as done although no tool call was recorded.
  const doneClaim = JSON.stringify({ id: "s1", status: "done" });
  run.ingestTextDelta(`<step_update>${doneClaim}</step_update>\n`);

  const verdict = run.runValidator();
  assert.equal(verdict.passed, false);
  assert.equal(verdict.downgraded.length, 1);
  assert.equal(verdict.downgraded[0]!.id, "s1");
  assert.equal(verdict.downgraded[0]!.reason, "no_evidence");

  // The public view of the run must show the step as failed, with a note
  // that mentions the downgrade reason.
  const tokenUsage = { input: 0, output: 0 };
  const snapshot = run.toPublic("complete", new Date(), tokenUsage);
  const downgradedStep = snapshot.steps.find((candidate) => candidate.id === "s1")!;
  assert.equal(downgradedStep.status, "failed");
  assert.match(downgradedStep.note ?? "", /downgraded.*no_evidence/);

  assert.equal(run.needsRecoveryRound(), true);
});
// Counterpart to scenario (c): a 'done' claim backed by a tool result.
test("validator passes when a 'done' step has a source-linked tool route", () => {
  const { state: run } = makeState();
  run.noteIteration();
  feedPlanWithStep(run, "s1", { goal: "Look up paper" });

  // The hit carries a PMID and a URL, which is what lets the evidence-ref
  // check pass.
  const sourceLinkedResult = {
    hits: [{ pmid: "12345", url: "https://pubmed/12345" }],
  };
  run.recordToolOutcome({
    toolCallId: "c1",
    toolName: "search_pubmed",
    args: { query: "BRCA1" },
    result: sourceLinkedResult,
    isError: false,
    durationMs: 5,
  });

  const doneClaim = JSON.stringify({ id: "s1", status: "done" });
  run.ingestTextDelta(`<step_update>${doneClaim}</step_update>\n`);

  const verdict = run.runValidator();
  assert.equal(verdict.passed, true);
  assert.equal(verdict.downgraded.length, 0);
  assert.equal(run.needsRecoveryRound(), false);
});
// ----------------------------------------------------------------- (d)
// Scenario (d): unresolved steps are recorded and rendered for the next turn.
test("cross-turn working memory carries an unresolved step into the next turn's system prompt", () => {
  // Part 1: a run whose only step ends up failed.
  const { state: failedRun } = makeState();
  failedRun.noteIteration();
  feedPlanWithStep(failedRun, "s1", { goal: "Investigate target X" });
  const failedUpdate = { id: "s1", status: "failed", note: "rate limited" };
  failedRun.ingestTextDelta(
    `<step_update>${JSON.stringify(failedUpdate)}</step_update>\n`,
  );

  // The pending memory record has to list the failed step with its goal,
  // note and the id of the run it came from.
  const memory = failedRun.buildPendingWorkingMemory();
  assert.ok(memory, "expected a memory record for a failed step");
  assert.equal(memory!.unresolved_steps.length, 1);
  const carried = memory!.unresolved_steps[0]!;
  assert.equal(carried.id, "s1");
  assert.equal(carried.goal, "Investigate target X");
  assert.equal(carried.note, "rate limited");
  assert.equal(carried.source_run_id, failedRun.id);

  // The formatted carry-over prompt must contain the step line verbatim.
  const carryOver = formatCarryOverPrompt(memory);
  assert.match(carryOver, /Carry-over from the previous turn/);
  assert.match(carryOver, /- s1: Investigate target X \(rate limited\)/);

  // Part 2: a run whose only step is done with a successful tool outcome
  // produces no memory record, and a null record formats to an empty string.
  const { state: resolvedRun } = makeState();
  resolvedRun.noteIteration();
  feedPlanWithStep(resolvedRun, "s1");
  resolvedRun.recordToolOutcome({
    toolCallId: "c1",
    toolName: "search_pubmed",
    args: { query: "x" },
    result: { hits: [{ pmid: "1", url: "https://e/1" }] },
    isError: false,
    durationMs: 1,
  });
  const doneUpdate = { id: "s1", status: "done" };
  resolvedRun.ingestTextDelta(
    `<step_update>${JSON.stringify(doneUpdate)}</step_update>\n`,
  );
  assert.equal(resolvedRun.buildPendingWorkingMemory(), null);
  assert.equal(formatCarryOverPrompt(null), "");
});
// Scenario (d), variant: the step is flagged only by a reflection block.
test("reflection-only unresolved steps are also carried forward", () => {
  const { state: run } = makeState();
  run.noteIteration();
  feedPlanWithStep(run, "s2", { goal: "Cross-reference disease" });

  // The last step update left s2 as 'running'; only the reflection lists it
  // as unresolved.
  const reflection = {
    step_states: [{ id: "s2", status: "pending" }],
    unresolved: [{ id: "s2", reason: "no time" }],
  };
  run.ingestTextDelta(`<reflection>${JSON.stringify(reflection)}</reflection>\n`);

  const memory = run.buildPendingWorkingMemory();
  assert.ok(memory);
  assert.equal(memory!.unresolved_steps.length, 1);
  assert.equal(memory!.unresolved_steps[0]!.id, "s2");
});
// ----------------------------------------------------------------- bonus
// Bonus: tagged agent blocks must not leak into the text shown to the user.
test("agent tags are stripped from user-visible deltas", () => {
  const { state: run } = makeState();
  const planJson = JSON.stringify({
    steps: [{ id: "s1", goal: "g" }],
  });
  const shown = run.ingestTextDelta(`Hello!\n<plan>${planJson}</plan>\nWorld\n`);

  // The plain text on both sides of the plan block passes through.
  assert.match(shown, /Hello!/);
  assert.match(shown, /World/);

  // Neither the opening nor the closing plan tag may appear.
  assert.doesNotMatch(shown, /<plan>/);
  assert.doesNotMatch(shown, /<\/plan>/);
});
|