File size: 10,323 Bytes
5871090
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
/**
 * Regression coverage for the agent loop's recovery / self-check paths.
 *
 * Run with:  pnpm --filter @workspace/api-server test
 *
 * These tests exercise pure / DB-free behavior on AgentRunState plus the
 * exported prompt-formatting helper so the four canonical scenarios
 * called out in the agent-loop spec stay green on every change:
 *
 *   (a) circuit breaker tripping after the threshold
 *   (b) per-turn cache returning the same result for an identical
 *       (tool, args) call
 *   (c) validator downgrading a step that has no successful tool route
 *   (d) cross-turn working memory carrying an unresolved step into the
 *       next turn's system prompt
 */
import { test } from "node:test";
import assert from "node:assert/strict";

// `@workspace/db` reads DATABASE_URL at import time. Tests don't actually
// touch Postgres — every code path under test is DB-free — so a sentinel
// connection string is enough to satisfy the import guard.
// `??=` only assigns when the variable is unset, so a real DATABASE_URL
// supplied by the environment is left untouched.
process.env.DATABASE_URL ??= "postgres://test:test@127.0.0.1:5432/test";

import type { AgentRunState as AgentRunStateType } from "./agent-supervisor";

// Loaded with a dynamic `import()` instead of a static import: static imports
// are hoisted and evaluated before any statement in this module's body, so
// the DATABASE_URL fallback set earlier in this file would run too late.
// NOTE(review): assumes ./agent-supervisor (transitively) imports
// `@workspace/db` — confirm against that module.
const {
  AgentRunState,
  CIRCUIT_BREAKER_THRESHOLD,
  formatCarryOverPrompt,
} = await import("./agent-supervisor");

/**
 * Builds a fresh AgentRunState wired to an in-memory event sink.
 *
 * @returns the new `state` plus the `events` array that collects every
 *          event the state passes to its `emit` callback.
 */
function makeState() {
  type CapturedEvent = { event: string; data: Record<string, unknown> };
  const events: CapturedEvent[] = [];

  return {
    state: new AgentRunState({
      conversationId: "cnv_test",
      userId: "usr_test",
      messageId: "msg_test",
      emit: (ev) => events.push(ev),
    }),
    events,
  };
}

/**
 * Streams a one-step `<plan>` into `state`, then a `<step_update>` that marks
 * that step as "running".
 *
 * @param state  run state that receives the two text deltas
 * @param stepId id used for the plan step and for the status update
 * @param opts   optional overrides; `goal` falls back to `Goal for <stepId>`
 *               and `success_criteria` is left out of the JSON when undefined
 */
function feedPlanWithStep(
  state: AgentRunStateType,
  stepId: string,
  opts: { goal?: string; success_criteria?: string } = {},
) {
  const step = {
    id: stepId,
    goal: opts.goal ?? `Goal for ${stepId}`,
    success_criteria: opts.success_criteria,
  };
  const planJson = JSON.stringify({ steps: [step] });
  const runningJson = JSON.stringify({ id: stepId, status: "running" });

  // Order matters: the plan has to be ingested before its step is updated.
  const deltas = [
    `<plan>${planJson}</plan>\n`,
    `<step_update>${runningJson}</step_update>\n`,
  ];
  for (const delta of deltas) {
    state.ingestTextDelta(delta);
  }
}

// ----------------------------------------------------------------- (a)

test("circuit breaker trips after THRESHOLD transient failures and stays open", () => {
  const TOOL = "search_pubmed";
  const { state } = makeState();
  state.noteIteration();

  // Logs one rate-limited (transient) failure for TOOL. Each call builds
  // fresh `args` / `result` objects, just like independent tool calls would.
  const failOnce = (toolCallId: string) => {
    state.recordToolOutcome({
      toolCallId,
      toolName: TOOL,
      args: { query: "x" },
      result: { error: "rate limited", error_code: "rate_limited" },
      isError: true,
      durationMs: 10,
    });
  };

  // Clean slate: nothing has failed yet, so there is no short-circuit.
  assert.equal(state.circuitCheck(TOOL, { query: "x" }), null);

  // One failure short of the threshold — the breaker is still closed.
  for (let n = 0; n < CIRCUIT_BREAKER_THRESHOLD - 1; n += 1) {
    failOnce(`c_${n}`);
  }
  assert.equal(state.circuitCheck(TOOL, { query: "x" }), null);

  // The failure that reaches the threshold → breaker trips.
  failOnce("c_final");

  const open = state.circuitCheck(TOOL, { query: "anything else" });
  assert.ok(open, "circuit should be open");
  const tripped = open!;
  assert.equal(tripped.error_code, "circuit_open");
  assert.equal(tripped.retryable, false);
  assert.match(tripped.suggestion, /opentargets|uniprot|alternative/i);

  // The breaker is keyed per source rather than per tool, so a different
  // source key on the same tool must NOT be reported as open.
  const otherSource = state.circuitCheck(TOOL, { source: "europepmc" });
  assert.equal(otherSource, null);
});

test("non-transient errors do not contribute to the circuit breaker", () => {
  const TOOL = "lookup_uniprot";
  const { state } = makeState();

  // Record more failures than the breaker threshold, all carrying the
  // `invalid_argument` error code.
  const attempts = CIRCUIT_BREAKER_THRESHOLD + 2;
  let attempt = 0;
  while (attempt < attempts) {
    state.recordToolOutcome({
      toolCallId: `c_${attempt}`,
      toolName: TOOL,
      args: { id: "P12345" },
      result: { error: "bad request", error_code: "invalid_argument" },
      isError: true,
      durationMs: 5,
    });
    attempt += 1;
  }

  // The breaker must still be closed for that same call.
  const verdict = state.circuitCheck(TOOL, { id: "P12345" });
  assert.equal(verdict, null);
});

// ----------------------------------------------------------------- (b)

test("per-turn cache returns the same result for an identical (tool, args) call", () => {
  const TOOL = "search_pubmed";
  const { state } = makeState();
  const brcaArgs = { query: "BRCA1", limit: 5 };
  const brcaResult = { hits: [{ pmid: "12345", title: "demo" }] };

  // Nothing recorded yet → lookup misses.
  assert.equal(state.cacheLookup(TOOL, brcaArgs), undefined);

  state.recordToolOutcome({
    toolCallId: "c1",
    toolName: TOOL,
    args: brcaArgs,
    result: brcaResult,
    isError: false,
    durationMs: 42,
  });

  // Both the original args object and an equivalent one (different
  // reference, same JSON) must return the very same cached object.
  for (const probe of [brcaArgs, { query: "BRCA1", limit: 5 }]) {
    assert.strictEqual(state.cacheLookup(TOOL, probe), brcaResult);
  }

  // Different args → no hit.
  assert.equal(state.cacheLookup(TOOL, { query: "TP53" }), undefined);

  // Errors are NOT cached: record a failure, then look the same args up.
  state.recordToolOutcome({
    toolCallId: "c2",
    toolName: TOOL,
    args: { query: "TP53" },
    result: { error: "boom", error_code: "internal" },
    isError: true,
    durationMs: 1,
  });
  const afterError = state.cacheLookup(TOOL, { query: "TP53" });
  assert.equal(afterError, undefined);
});

// ----------------------------------------------------------------- (c)

test("validator downgrades a 'done' step that has no successful tool route", () => {
  const STEP_ID = "s1";
  const { state } = makeState();
  state.noteIteration();
  feedPlanWithStep(state, STEP_ID, { goal: "Find something" });

  // The model reports the step as done although no tool call was recorded.
  const doneJson = JSON.stringify({ id: "s1", status: "done" });
  state.ingestTextDelta(`<step_update>${doneJson}</step_update>\n`);

  const verdict = state.runValidator();
  assert.equal(verdict.passed, false);
  assert.equal(verdict.downgraded.length, 1);
  const downgrade = verdict.downgraded[0]!;
  assert.equal(downgrade.id, STEP_ID);
  assert.equal(downgrade.reason, "no_evidence");

  // The step itself must now read 'failed' and carry a downgrade note.
  const usage = { input: 0, output: 0 };
  const snapshot = state.toPublic("complete", new Date(), usage);
  const downgradedStep = snapshot.steps.find((candidate) => candidate.id === STEP_ID)!;
  assert.equal(downgradedStep.status, "failed");
  assert.match(downgradedStep.note ?? "", /downgraded.*no_evidence/);

  assert.equal(state.needsRecoveryRound(), true);
});

test("validator passes when a 'done' step has a source-linked tool route", () => {
  const { state } = makeState();
  state.noteIteration();
  feedPlanWithStep(state, "s1", { goal: "Look up paper" });

  // A successful call whose result carries a source-linked PMID, so the
  // evidence-ref check has something to accept.
  state.recordToolOutcome({
    toolCallId: "c1",
    toolName: "search_pubmed",
    args: { query: "BRCA1" },
    result: { hits: [{ pmid: "12345", url: "https://pubmed/12345" }] },
    isError: false,
    durationMs: 5,
  });

  const doneJson = JSON.stringify({ id: "s1", status: "done" });
  state.ingestTextDelta(`<step_update>${doneJson}</step_update>\n`);

  const verdict = state.runValidator();
  assert.equal(verdict.passed, true);
  assert.equal(verdict.downgraded.length, 0);

  const wantsRecovery = state.needsRecoveryRound();
  assert.equal(wantsRecovery, false);
});

// ----------------------------------------------------------------- (d)

test("cross-turn working memory carries an unresolved step into the next turn's system prompt", () => {
  // --- Part 1: a failed step must be carried over ---------------------
  const { state } = makeState();
  state.noteIteration();
  feedPlanWithStep(state, "s1", { goal: "Investigate target X" });

  const failedJson = JSON.stringify({
    id: "s1",
    status: "failed",
    note: "rate limited",
  });
  state.ingestTextDelta(`<step_update>${failedJson}</step_update>\n`);

  // The pending memory record has to list the failed step.
  const record = state.buildPendingWorkingMemory();
  assert.ok(record, "expected a memory record for a failed step");
  assert.equal(record!.unresolved_steps.length, 1);
  const carried = record!.unresolved_steps[0]!;
  assert.equal(carried.id, "s1");
  assert.equal(carried.goal, "Investigate target X");
  assert.equal(carried.note, "rate limited");
  assert.equal(carried.source_run_id, state.id);

  // The system-prompt formatter has to surface that step verbatim.
  const prompt = formatCarryOverPrompt(record);
  for (const expected of [
    /Carry-over from the previous turn/,
    /- s1: Investigate target X \(rate limited\)/,
  ]) {
    assert.match(prompt, expected);
  }

  // --- Part 2: resolved-only state → no carry-over --------------------
  // (memory is cleared next turn)
  const clean = makeState().state;
  clean.noteIteration();
  feedPlanWithStep(clean, "s1");
  clean.recordToolOutcome({
    toolCallId: "c1",
    toolName: "search_pubmed",
    args: { query: "x" },
    result: { hits: [{ pmid: "1", url: "https://e/1" }] },
    isError: false,
    durationMs: 1,
  });
  const doneJson = JSON.stringify({ id: "s1", status: "done" });
  clean.ingestTextDelta(`<step_update>${doneJson}</step_update>\n`);

  assert.equal(clean.buildPendingWorkingMemory(), null);
  assert.equal(formatCarryOverPrompt(null), "");
});

test("reflection-only unresolved steps are also carried forward", () => {
  const { state } = makeState();
  state.noteIteration();
  feedPlanWithStep(state, "s2", { goal: "Cross-reference disease" });

  // On the plan side the step is still 'running'; only the reflection
  // block reports it as unresolved.
  const reflectionJson = JSON.stringify({
    step_states: [{ id: "s2", status: "pending" }],
    unresolved: [{ id: "s2", reason: "no time" }],
  });
  state.ingestTextDelta(`<reflection>${reflectionJson}</reflection>\n`);

  const record = state.buildPendingWorkingMemory();
  assert.ok(record);
  const pendingSteps = record!.unresolved_steps;
  assert.equal(pendingSteps.length, 1);
  assert.equal(pendingSteps[0]!.id, "s2");
});

// ----------------------------------------------------------------- bonus

test("agent tags are stripped from user-visible deltas", () => {
  const { state } = makeState();

  // One delta with plain text on either side of a <plan> block.
  const planJson = JSON.stringify({
    steps: [{ id: "s1", goal: "g" }],
  });
  const visible = state.ingestTextDelta(`Hello!\n<plan>${planJson}</plan>\nWorld\n`);

  // The surrounding prose passes through...
  for (const kept of [/Hello!/, /World/]) {
    assert.match(visible, kept);
  }
  // ...while neither the opening nor the closing plan tag does.
  for (const stripped of [/<plan>/, /<\/plan>/]) {
    assert.doesNotMatch(visible, stripped);
  }
});