// File size: 10,323 Bytes
/**
 * Regression coverage for the agent loop's recovery / self-check paths.
 *
 * Run with:  pnpm --filter @workspace/api-server test
 *
 * These tests exercise pure / DB-free behavior on AgentRunState plus the
 * exported prompt-formatting helper so the four canonical scenarios
 * called out in the agent-loop spec stay green on every change:
 *
 *   (a) circuit breaker tripping after the threshold
 *   (b) per-turn cache returning the same result for an identical
 *       (tool, args) call
 *   (c) validator downgrading a step that has no successful tool route
 *   (d) cross-turn working memory carrying an unresolved step into the
 *       next turn's system prompt
 */
import { test } from "node:test";
import assert from "node:assert/strict";

// `@workspace/db` reads DATABASE_URL at import time. Tests don't actually
// touch Postgres — every code path under test is DB-free — so a sentinel
// connection string is enough to satisfy the import guard.
process.env.DATABASE_URL ??= "postgres://test:test@127.0.0.1:5432/test";

import type { AgentRunState as AgentRunStateType } from "./agent-supervisor";

const {
  AgentRunState,
  CIRCUIT_BREAKER_THRESHOLD,
  formatCarryOverPrompt,
} = await import("./agent-supervisor");

/**
 * Creates a fresh `AgentRunState` with fixed test identifiers whose `emit`
 * callback appends every event it receives to an in-memory array.
 *
 * @returns The new state together with the array collecting emitted events.
 */
function makeState() {
  const sink: { event: string; data: Record<string, unknown> }[] = [];
  const runState = new AgentRunState({
    conversationId: "cnv_test",
    userId: "usr_test",
    messageId: "msg_test",
    // Keep the push() return value flowing back to the caller, as before.
    emit: (emitted) => sink.push(emitted),
  });
  return { state: runState, events: sink };
}

/**
 * Feeds `state` a one-step plan followed by a `running` update for that step.
 *
 * Two text deltas are ingested, in order: a `<plan>` tag holding the JSON
 * plan, then a `<step_update>` tag marking `stepId` as running. Each delta
 * ends with a newline.
 *
 * @param state Run state that receives the two deltas.
 * @param stepId Id used for the single plan step and its status update.
 * @param opts Optional overrides; `goal` falls back to `Goal for <stepId>`,
 *   and an undefined `success_criteria` is dropped by JSON serialization.
 */
function feedPlanWithStep(
  state: AgentRunStateType,
  stepId: string,
  opts: { goal?: string; success_criteria?: string } = {},
) {
  // Wraps a JSON payload in <tag>…</tag> plus a trailing newline and ingests it.
  const ingestTagged = (tag: string, payload: unknown) =>
    state.ingestTextDelta(`<${tag}>${JSON.stringify(payload)}</${tag}>\n`);

  const onlyStep = {
    id: stepId,
    goal: opts.goal ?? `Goal for ${stepId}`,
    success_criteria: opts.success_criteria,
  };

  ingestTagged("plan", { steps: [onlyStep] });
  ingestTagged("step_update", { id: stepId, status: "running" });
}

// ----------------------------------------------------------------- (a)

// Scenario (a): the breaker stays closed below the threshold, opens once
// CIRCUIT_BREAKER_THRESHOLD transient failures are recorded, and is expected
// to be scoped per source rather than per tool.
test("circuit breaker trips after THRESHOLD transient failures and stays open", () => {
  const { state } = makeState();
  state.noteIteration();

  // Records one rate-limited `search_pubmed` failure under `toolCallId`;
  // fresh args/result objects are built on every call.
  const recordRateLimited = (toolCallId: string) =>
    state.recordToolOutcome({
      toolCallId,
      toolName: "search_pubmed",
      args: { query: "x" },
      result: { error: "rate limited", error_code: "rate_limited" },
      isError: true,
      durationMs: 10,
    });

  // Nothing recorded so far, so the check must not short-circuit.
  assert.equal(state.circuitCheck("search_pubmed", { query: "x" }), null);

  // Stop one failure short of the threshold: breaker is still closed.
  let recorded = 0;
  while (recorded < CIRCUIT_BREAKER_THRESHOLD - 1) {
    recordRateLimited(`c_${recorded}`);
    recorded += 1;
  }
  assert.equal(state.circuitCheck("search_pubmed", { query: "x" }), null);

  // The failure that reaches the threshold opens the breaker.
  recordRateLimited("c_final");

  const shortCircuit = state.circuitCheck("search_pubmed", {
    query: "anything else",
  });
  assert.ok(shortCircuit, "circuit should be open");
  const { error_code, retryable, suggestion } = shortCircuit!;
  assert.equal(error_code, "circuit_open");
  assert.equal(retryable, false);
  assert.match(suggestion, /opentargets|uniprot|alternative/i);

  // Same tool, different source key: this one is expected to remain closed
  // because the breaker is keyed per source, not per tool.
  const otherSource = state.circuitCheck("search_pubmed", { source: "europepmc" });
  assert.equal(otherSource, null);
});

// Scenario (a), negative case: `invalid_argument` failures recorded well past
// the threshold must leave the breaker closed.
test("non-transient errors do not contribute to the circuit breaker", () => {
  const { state } = makeState();

  const attempts = CIRCUIT_BREAKER_THRESHOLD + 2;
  let attempt = 0;
  while (attempt < attempts) {
    state.recordToolOutcome({
      toolCallId: `c_${attempt}`,
      toolName: "lookup_uniprot",
      args: { id: "P12345" },
      result: { error: "bad request", error_code: "invalid_argument" },
      isError: true,
      durationMs: 5,
    });
    attempt += 1;
  }

  const verdict = state.circuitCheck("lookup_uniprot", { id: "P12345" });
  assert.equal(verdict, null);
});

// ----------------------------------------------------------------- (b)

// Scenario (b): a successful outcome is served back from the per-turn cache
// for structurally equal args, while failed outcomes are never cached.
test("per-turn cache returns the same result for an identical (tool, args) call", () => {
  const { state } = makeState();
  const brcaArgs = { query: "BRCA1", limit: 5 };
  const brcaResult = { hits: [{ pmid: "12345", title: "demo" }] };

  // Before anything is recorded the lookup misses.
  const beforeRecording = state.cacheLookup("search_pubmed", brcaArgs);
  assert.equal(beforeRecording, undefined);

  state.recordToolOutcome({
    toolCallId: "c1",
    toolName: "search_pubmed",
    args: brcaArgs,
    result: brcaResult,
    isError: false,
    durationMs: 42,
  });

  // The very same args object yields the very same result object.
  const sameReferenceHit = state.cacheLookup("search_pubmed", brcaArgs);
  assert.strictEqual(sameReferenceHit, brcaResult);

  // A structurally equal but distinct args object hits as well.
  const structuralHit = state.cacheLookup("search_pubmed", {
    query: "BRCA1",
    limit: 5,
  });
  assert.strictEqual(structuralHit, brcaResult);

  // Args that differ miss.
  const differentArgsMiss = state.cacheLookup("search_pubmed", { query: "TP53" });
  assert.equal(differentArgsMiss, undefined);

  // A failed outcome must not populate the cache.
  state.recordToolOutcome({
    toolCallId: "c2",
    toolName: "search_pubmed",
    args: { query: "TP53" },
    result: { error: "boom", error_code: "internal" },
    isError: true,
    durationMs: 1,
  });
  const afterErrorMiss = state.cacheLookup("search_pubmed", { query: "TP53" });
  assert.equal(afterErrorMiss, undefined);
});

// ----------------------------------------------------------------- (c)

// Scenario (c): a step reported as done without any recorded tool call is
// downgraded by the validator, shows up as failed in the public run, and
// triggers a recovery round.
test("validator downgrades a 'done' step that has no successful tool route", () => {
  const { state } = makeState();
  state.noteIteration();
  feedPlanWithStep(state, "s1", { goal: "Find something" });

  // The model reports completion although no tool outcome was recorded.
  const doneUpdate = JSON.stringify({ id: "s1", status: "done" });
  state.ingestTextDelta(`<step_update>${doneUpdate}</step_update>\n`);

  const verdict = state.runValidator();
  assert.equal(verdict.passed, false);
  assert.equal(verdict.downgraded.length, 1);
  const demoted = verdict.downgraded[0]!;
  assert.equal(demoted.id, "s1");
  assert.equal(demoted.reason, "no_evidence");

  // The public snapshot must show the step as failed, with a note that
  // mentions the downgrade reason.
  const snapshot = state.toPublic("complete", new Date(), {
    input: 0,
    output: 0,
  });
  const publicStep = snapshot.steps.find(({ id }) => id === "s1")!;
  assert.equal(publicStep.status, "failed");
  assert.match(publicStep.note ?? "", /downgraded.*no_evidence/);

  assert.equal(state.needsRecoveryRound(), true);
});

// Scenario (c), positive case: a done step backed by a successful tool
// outcome passes validation and needs no recovery round.
test("validator passes when a 'done' step has a source-linked tool route", () => {
  const { state } = makeState();
  state.noteIteration();
  feedPlanWithStep(state, "s1", { goal: "Look up paper" });

  state.recordToolOutcome({
    toolCallId: "c1",
    toolName: "search_pubmed",
    args: { query: "BRCA1" },
    // The hit carries a PMID and a URL, i.e. evidence linked to a source.
    result: { hits: [{ pmid: "12345", url: "https://pubmed/12345" }] },
    isError: false,
    durationMs: 5,
  });

  const doneUpdate = JSON.stringify({ id: "s1", status: "done" });
  state.ingestTextDelta(`<step_update>${doneUpdate}</step_update>\n`);

  const verdict = state.runValidator();
  assert.equal(verdict.passed, true);
  assert.equal(verdict.downgraded.length, 0);

  const recoveryNeeded = state.needsRecoveryRound();
  assert.equal(recoveryNeeded, false);
});

// ----------------------------------------------------------------- (d)

// Scenario (d): a failed step ends up in the pending working-memory record
// and in the carry-over prompt; a fully resolved run produces neither.
test("cross-turn working memory carries an unresolved step into the next turn's system prompt", () => {
  const { state } = makeState();
  state.noteIteration();
  feedPlanWithStep(state, "s1", { goal: "Investigate target X" });

  const failedUpdate = JSON.stringify({
    id: "s1",
    status: "failed",
    note: "rate limited",
  });
  state.ingestTextDelta(`<step_update>${failedUpdate}</step_update>\n`);

  // The failed step must appear in the pending memory record.
  const record = state.buildPendingWorkingMemory();
  assert.ok(record, "expected a memory record for a failed step");
  assert.equal(record!.unresolved_steps.length, 1);
  const carried = record!.unresolved_steps[0]!;
  assert.equal(carried.id, "s1");
  assert.equal(carried.goal, "Investigate target X");
  assert.equal(carried.note, "rate limited");
  assert.equal(carried.source_run_id, state.id);

  // The formatted prompt has to mention the unresolved step word for word.
  const carryOver = formatCarryOverPrompt(record);
  assert.match(carryOver, /Carry-over from the previous turn/);
  assert.match(carryOver, /- s1: Investigate target X \(rate limited\)/);

  // A run whose only step finished successfully leaves nothing to carry.
  const { state: resolvedRun } = makeState();
  resolvedRun.noteIteration();
  feedPlanWithStep(resolvedRun, "s1");
  resolvedRun.recordToolOutcome({
    toolCallId: "c1",
    toolName: "search_pubmed",
    args: { query: "x" },
    result: { hits: [{ pmid: "1", url: "https://e/1" }] },
    isError: false,
    durationMs: 1,
  });
  const doneUpdate = JSON.stringify({ id: "s1", status: "done" });
  resolvedRun.ingestTextDelta(`<step_update>${doneUpdate}</step_update>\n`);

  assert.equal(resolvedRun.buildPendingWorkingMemory(), null);
  assert.equal(formatCarryOverPrompt(null), "");
});

// Scenario (d), variant: the step is flagged as unresolved through a
// <reflection> block and must still land in the pending memory record.
test("reflection-only unresolved steps are also carried forward", () => {
  const { state } = makeState();
  state.noteIteration();
  feedPlanWithStep(state, "s2", { goal: "Cross-reference disease" });

  // feedPlanWithStep left the plan step at 'running'; the reflection is
  // what lists it as unresolved.
  const reflection = JSON.stringify({
    step_states: [{ id: "s2", status: "pending" }],
    unresolved: [{ id: "s2", reason: "no time" }],
  });
  state.ingestTextDelta(`<reflection>${reflection}</reflection>\n`);

  const record = state.buildPendingWorkingMemory();
  assert.ok(record);
  assert.equal(record!.unresolved_steps.length, 1);
  const carried = record!.unresolved_steps[0]!;
  assert.equal(carried.id, "s2");
});

// ----------------------------------------------------------------- bonus

// Bonus: plain text surrounding a <plan> block is returned from
// ingestTextDelta, while the plan tags themselves are not.
test("agent tags are stripped from user-visible deltas", () => {
  const { state } = makeState();

  const planJson = JSON.stringify({
    steps: [{ id: "s1", goal: "g" }],
  });
  const visible = state.ingestTextDelta(
    `Hello!\n<plan>${planJson}</plan>\nWorld\n`,
  );

  // Text on either side of the plan block passes through.
  for (const expected of [/Hello!/, /World/]) {
    assert.match(visible, expected);
  }
  // Neither the opening nor the closing plan tag may be returned.
  for (const forbidden of [/<plan>/, /<\/plan>/]) {
    assert.doesNotMatch(visible, forbidden);
  }
});