Spaces:

Evilander
/

audrey-memory-benchmark-report

Running

App Files Files Community

Evilander committed 23 days ago

Commit

253d35a

verified ·

1 Parent(s): ad1d207

Upload 7 files

Browse files

Files changed (7) hide show

index.html +209 -18
local-overall.svg +45 -0
operations-overall.svg +45 -0
perf-0.22.2.json +123 -0
published-locomo.svg +50 -0
retrieval-overall.svg +45 -0
summary.json +1796 -0

index.html CHANGED Viewed

@@ -1,19 +1,210 @@
 <!doctype html>
-<html>
-	<head>
-		<meta charset="utf-8" />
-		<meta name="viewport" content="width=device-width" />
-		<title>My static Space</title>
-		<link rel="stylesheet" href="style.css" />
-	</head>
-	<body>
-		<div class="card">
-			<h1>Welcome to your static Space!</h1>
-			<p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
-			<p>
-				Also don't forget to check the
-				<a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
-			</p>
-		</div>
-	</body>
-</html>

 <!doctype html>
+<html lang="en">
+<head>
+  <meta charset="utf-8" />
+  <!-- Lay the page out at device width so the single-column report stays readable on phones. -->
+  <meta name="viewport" content="width=device-width, initial-scale=1" />
+  <title>Audrey Memory Benchmark</title>
+  <style>
+    body { font-family: "Segoe UI", Arial, sans-serif; margin: 32px; color: #111827; background: #f8fafc; }
+    main { max-width: 1120px; margin: 0 auto; }
+    h1, h2 { margin-bottom: 12px; }
+    p, li { line-height: 1.5; }
+    .callout { background: white; border: 1px solid #cbd5e1; border-radius: 16px; padding: 20px; margin-bottom: 24px; }
+    .grid { display: grid; gap: 24px; grid-template-columns: 1fr; }
+    img { width: 100%; border: 1px solid #cbd5e1; border-radius: 16px; background: white; }
+    table { width: 100%; border-collapse: collapse; background: white; border-radius: 16px; overflow: hidden; }
+    th, td { border: 1px solid #cbd5e1; padding: 12px; vertical-align: top; text-align: left; }
+    th { background: #e2e8f0; }
+    code { background: #e2e8f0; padding: 2px 6px; border-radius: 6px; }
+  </style>
+</head>
+<body>
+  <main>
+    <h1>Audrey Memory Benchmark</h1>
+    <div class="callout">
+      <p><strong>Method:</strong> Audrey is scored on a LongMemEval-inspired retrieval benchmark plus an operation-level lifecycle benchmark. The report still separates local Audrey-versus-baseline results from published external LoCoMo numbers so the comparison stays honest.</p>
+      <p><strong>Run:</strong> <code>node benchmarks/run.js --provider mock --dimensions 64</code></p>
+      <p><strong>Generated:</strong> 2026-05-01T03:20:07.968Z</p>
+    </div>
+    <!-- Chart gallery: one card per benchmark SVG. -->
+    <div class="grid">
+      <section class="callout">
+        <h2>Combined Local Benchmark</h2>
+        <img src="./local-overall.svg" alt="Combined local benchmark bar chart" />
+      </section>
+      <section class="callout">
+        <h2>Retrieval capabilities Benchmark</h2>
+        <p>LongMemEval-style memory abilities plus privacy and abstention.</p>
+        <img src="./retrieval-overall.svg" alt="Retrieval capabilities Benchmark chart" />
+      </section>
+      <section class="callout">
+        <h2>Memory operations Benchmark</h2>
+        <p>Update, delete, merge, and abstention behavior after lifecycle operations.</p>
+        <img src="./operations-overall.svg" alt="Memory operations Benchmark chart" />
+      </section>
+      <section class="callout">
+        <h2>Published Leaderboard</h2>
+        <img src="./published-locomo.svg" alt="Published LoCoMo leaderboard bar chart" />
+      </section>
+    </div>
+    <section class="callout">
+      <h2>Case Matrix</h2>
+      <!-- One row per benchmark case, one score column per system.
+           Cell colour follows the score: green = 1.00, amber = 0.50, red = 0.00;
+           the small text under each score is that result's summary string. -->
+      <table>
+        <thead>
+          <tr>
+            <th scope="col">Case</th>
+            <th scope="col">Suite</th>
+            <th scope="col">Family</th>
+            <th scope="col">Audrey</th>
+            <th scope="col">Vector Only</th>
+            <th scope="col">Keyword + Recency</th>
+            <th scope="col">Recent Window</th>
+          </tr>
+        </thead>
+        <tbody>
+          <tr>
+            <td>Information extraction</td>
+            <td>retrieval</td>
+            <td>information_extraction</td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+          </tr>
+          <tr>
+            <td>Knowledge updates</td>
+            <td>retrieval</td>
+            <td>knowledge_updates</td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+            <td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">retrieved expected evidence, but conflicting evidence still appeared later</span></td>
+            <td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">retrieved expected evidence, but conflicting evidence still appeared later</span></td>
+            <td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">retrieved expected evidence, but conflicting evidence still appeared later</span></td>
+          </tr>
+          <tr>
+            <td>Multi-session reasoning</td>
+            <td>retrieval</td>
+            <td>multi_session_reasoning</td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+          </tr>
+          <tr>
+            <td>Temporal reasoning</td>
+            <td>retrieval</td>
+            <td>temporal_reasoning</td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+          </tr>
+          <tr>
+            <td>Abstention</td>
+            <td>retrieval</td>
+            <td>abstention</td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">correct abstention</span></td>
+            <td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">no leak, but retrieved tangential context</span></td>
+            <td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">no leak, but retrieved tangential context</span></td>
+            <td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">no leak, but retrieved tangential context</span></td>
+          </tr>
+          <tr>
+            <td>Conflict resolution</td>
+            <td>retrieval</td>
+            <td>conflict_resolution</td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+            <td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">retrieved expected evidence, but conflicting evidence still appeared later</span></td>
+            <td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">blocked content outranked the correct answer</span></td>
+            <td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">blocked content outranked the correct answer</span></td>
+          </tr>
+          <tr>
+            <td>Procedural learning</td>
+            <td>retrieval</td>
+            <td>procedural_learning</td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+            <td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td>
+            <td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td>
+            <td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td>
+          </tr>
+          <tr>
+            <td>Privacy boundary</td>
+            <td>retrieval</td>
+            <td>privacy_boundary</td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">correct abstention</span></td>
+            <td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">leaked restricted content</span></td>
+            <td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">leaked restricted content</span></td>
+            <td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">leaked restricted content</span></td>
+          </tr>
+          <tr>
+            <td>Update and overwrite</td>
+            <td>operations</td>
+            <td>update_overwrite</td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+            <td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">retrieved expected evidence, but conflicting evidence still appeared later</span></td>
+            <td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">retrieved expected evidence, but conflicting evidence still appeared later</span></td>
+            <td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">retrieved expected evidence, but conflicting evidence still appeared later</span></td>
+          </tr>
+          <tr>
+            <td>Delete and abstain</td>
+            <td>operations</td>
+            <td>delete_and_abstain</td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">correct abstention</span></td>
+            <td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">leaked restricted content</span></td>
+            <td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">no leak, but retrieved tangential context</span></td>
+            <td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">leaked restricted content</span></td>
+          </tr>
+          <tr>
+            <td>Semantic merge</td>
+            <td>operations</td>
+            <td>semantic_merge</td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+            <td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td>
+            <td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td>
+            <td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td>
+          </tr>
+          <tr>
+            <td>Procedural merge</td>
+            <td>operations</td>
+            <td>procedural_merge</td>
+            <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
+            <td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td>
+            <td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td>
+            <td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td>
+          </tr>
+        </tbody>
+      </table>
+    </section>
+    <section class="callout">
+      <h2>March 23, 2026 Memory Trends</h2>
+      <!-- Each item: bold headline, one-sentence takeaway, then the source link. -->
+      <ul>
+        <li>
+          <strong>Memory is moving from flat retrieval to typed systems</strong><br />
+          Recent work treats episodic, semantic, procedural, and graph memory as separate but cooperating layers.<br />
+          <a href="https://arxiv.org/abs/2507.03724">https://arxiv.org/abs/2507.03724</a>
+        </li>
+        <li>
+          <strong>Benchmarks now emphasize multi-session realism</strong><br />
+          LongMemEval and LoCoMo push memory systems toward temporal updates, abstraction, and cross-session reasoning instead of single-turn fact recall.<br />
+          <a href="https://arxiv.org/abs/2410.10813">https://arxiv.org/abs/2410.10813</a>
+        </li>
+        <li>
+          <strong>Context engineering is now competing with retrieval-first designs</strong><br />
+          Letta argues filesystem and memory-block approaches can outperform simpler retrieval-only memory on realistic long-horizon tasks.<br />
+          <a href="https://www.letta.com/blog/memory-blocks">https://www.letta.com/blog/memory-blocks</a>
+        </li>
+        <li>
+          <strong>Production teams care about latency and token footprint, not just recall quality</strong><br />
+          Mem0 frames memory as a cost and latency optimization surface in addition to a personalization surface.<br />
+          <a href="https://arxiv.org/abs/2504.19413">https://arxiv.org/abs/2504.19413</a>
+        </li>
+        <li>
+          <strong>Temporal and multimodal memory are becoming table stakes</strong><br />
+          MIRIX and Graphiti both model time and state change explicitly instead of assuming memories stay forever true.<br />
+          <a href="https://arxiv.org/abs/2507.07957">https://arxiv.org/abs/2507.07957</a>
+        </li>
+      </ul>
+    </section>
+  </main>
+</body>
+</html>

local-overall.svg ADDED Viewed

operations-overall.svg ADDED Viewed

perf-0.22.2.json ADDED Viewed

	@@ -0,0 +1,123 @@

+{
+  "generatedAt": "2026-05-01T02:15:29.400Z",
+  "durationMs": 4600,
+  "audreyVersion": null,
+  "gitSha": "e2e821b",
+  "methodology": {
+    "embedding": "mock provider, 64 dimensions (in-process, no network)",
+    "llm": "mock provider (in-process)",
+    "retrieval": "hybrid (vector + lexical) with limit=5",
+    "sizes": [
+      100,
+      1000,
+      5000
+    ],
+    "recallRunsPerSize": 50,
+    "notes": "Latency is wall-clock for a single call from a JS caller. Cloud and local 384-dim providers will report higher recall latency dominated by embedding cost and network. Run on your own hardware before quoting."
+  },
+  "machine": {
+    "node": "25.5.0",
+    "v8": "14.1.146.11-node.18",
+    "platform": "win32",
+    "arch": "x64",
+    "osRelease": "10.0.26200",
+    "cpuCount": 24,
+    "cpuModel": "AMD Ryzen 9 7900X3D 12-Core Processor          ",
+    "memoryGb": 62.9
+  },
+  "sizes": [
+    {
+      "corpusSize": 100,
+      "encodeMs": {
+        "samples": 100,
+        "p50": 0.331,
+        "p95": 0.589,
+        "p99": 7.65,
+        "min": 0.214,
+        "max": 10.978,
+        "mean": 0.577
+      },
+      "hybridRecallMs": {
+        "samples": 50,
+        "p50": 0.539,
+        "p95": 1.82,
+        "p99": 2.712,
+        "min": 0.448,
+        "max": 2.712,
+        "mean": 0.659
+      },
+      "postEncodeQueueMs": {
+        "samples": 100,
+        "p50": 0.344,
+        "p95": 0.73,
+        "p99": 9.715,
+        "min": 0.079,
+        "max": 13.218,
+        "mean": 0.533
+      },
+      "queueEvents": 100
+    },
+    {
+      "corpusSize": 1000,
+      "encodeMs": {
+        "samples": 1000,
+        "p50": 0.307,
+        "p95": 2.147,
+        "p99": 9.672,
+        "min": 0.183,
+        "max": 13.514,
+        "mean": 0.639
+      },
+      "hybridRecallMs": {
+        "samples": 50,
+        "p50": 1.566,
+        "p95": 2.364,
+        "p99": 21.177,
+        "min": 0.608,
+        "max": 21.177,
+        "mean": 1.803
+      },
+      "postEncodeQueueMs": {
+        "samples": 1000,
+        "p50": 0.321,
+        "p95": 1.468,
+        "p99": 9.744,
+        "min": 0.07,
+        "max": 15.61,
+        "mean": 0.553
+      },
+      "queueEvents": 1000
+    },
+    {
+      "corpusSize": 5000,
+      "encodeMs": {
+        "samples": 5000,
+        "p50": 0.308,
+        "p95": 1.838,
+        "p99": 10.45,
+        "min": 0.171,
+        "max": 19.247,
+        "mean": 0.653
+      },
+      "hybridRecallMs": {
+        "samples": 50,
+        "p50": 2.091,
+        "p95": 3.417,
+        "p99": 16.58,
+        "min": 2.005,
+        "max": 16.58,
+        "mean": 2.492
+      },
+      "postEncodeQueueMs": {
+        "samples": 5000,
+        "p50": 0.313,
+        "p95": 1.217,
+        "p99": 10.532,
+        "min": 0.074,
+        "max": 19.378,
+        "mean": 0.554
+      },
+      "queueEvents": 5000
+    }
+  ]
+}

published-locomo.svg ADDED Viewed

retrieval-overall.svg ADDED Viewed

summary.json ADDED Viewed

	@@ -0,0 +1,1796 @@

+{
+  "generatedAt": "2026-05-01T03:20:07.968Z",
+  "command": "node benchmarks/run.js --provider mock --dimensions 64",
+  "config": {
+    "provider": "mock",
+    "dimensions": 64,
+    "suites": [
+      "retrieval",
+      "operations"
+    ]
+  },
+  "methodology": {
+    "localBenchmark": "LongMemEval-inspired retrieval benchmark plus operation-level lifecycle benchmark",
+    "retrievalBenchmark": "Information extraction, updates, reasoning, procedural learning, privacy, abstention, and conflict handling",
+    "operationsBenchmark": "Update, overwrite, delete, merge, and abstention behavior after lifecycle operations",
+    "externalLeaderboard": "Published LoCoMo scores from official papers and project blogs"
+  },
+  "local": {
+    "overall": [
+      {
+        "system": "Audrey",
+        "scorePercent": 100,
+        "passRate": 100,
+        "avgDurationMs": 13.333333333333334
+      },
+      {
+        "system": "Vector Only",
+        "scorePercent": 41.66666666666667,
+        "passRate": 25,
+        "avgDurationMs": 0
+      },
+      {
+        "system": "Keyword + Recency",
+        "scorePercent": 41.66666666666667,
+        "passRate": 25,
+        "avgDurationMs": 0.5833333333333334
+      },
+      {
+        "system": "Recent Window",
+        "scorePercent": 37.5,
+        "passRate": 25,
+        "avgDurationMs": 0
+      }
+    ],
+    "byFamily": [
+      {
+        "family": "information_extraction",
+        "systems": {
+          "Audrey": 1,
+          "Vector Only": 1,
+          "Keyword + Recency": 1,
+          "Recent Window": 1
+        }
+      },
+      {
+        "family": "knowledge_updates",
+        "systems": {
+          "Audrey": 1,
+          "Vector Only": 0.5,
+          "Keyword + Recency": 0.5,
+          "Recent Window": 0.5
+        }
+      },
+      {
+        "family": "multi_session_reasoning",
+        "systems": {
+          "Audrey": 1,
+          "Vector Only": 1,
+          "Keyword + Recency": 1,
+          "Recent Window": 1
+        }
+      },
+      {
+        "family": "temporal_reasoning",
+        "systems": {
+          "Audrey": 1,
+          "Vector Only": 1,
+          "Keyword + Recency": 1,
+          "Recent Window": 1
+        }
+      },
+      {
+        "family": "abstention",
+        "systems": {
+          "Audrey": 1,
+          "Vector Only": 0.5,
+          "Keyword + Recency": 0.5,
+          "Recent Window": 0.5
+        }
+      },
+      {
+        "family": "conflict_resolution",
+        "systems": {
+          "Audrey": 1,
+          "Vector Only": 0.5,
+          "Keyword + Recency": 0,
+          "Recent Window": 0
+        }
+      },
+      {
+        "family": "procedural_learning",
+        "systems": {
+          "Audrey": 1,
+          "Vector Only": 0,
+          "Keyword + Recency": 0,
+          "Recent Window": 0
+        }
+      },
+      {
+        "family": "privacy_boundary",
+        "systems": {
+          "Audrey": 1,
+          "Vector Only": 0,
+          "Keyword + Recency": 0,
+          "Recent Window": 0
+        }
+      },
+      {
+        "family": "update_overwrite",
+        "systems": {
+          "Audrey": 1,
+          "Vector Only": 0.5,
+          "Keyword + Recency": 0.5,
+          "Recent Window": 0.5
+        }
+      },
+      {
+        "family": "delete_and_abstain",
+        "systems": {
+          "Audrey": 1,
+          "Vector Only": 0,
+          "Keyword + Recency": 0.5,
+          "Recent Window": 0
+        }
+      },
+      {
+        "family": "semantic_merge",
+        "systems": {
+          "Audrey": 1,
+          "Vector Only": 0,
+          "Keyword + Recency": 0,
+          "Recent Window": 0
+        }
+      },
+      {
+        "family": "procedural_merge",
+        "systems": {
+          "Audrey": 1,
+          "Vector Only": 0,
+          "Keyword + Recency": 0,
+          "Recent Window": 0
+        }
+      }
+    ],
+    "suites": [
+      {
+        "id": "retrieval",
+        "title": "Retrieval capabilities",
+        "description": "LongMemEval-style memory abilities plus privacy and abstention.",
+        "overall": [
+          {
+            "system": "Audrey",
+            "scorePercent": 100,
+            "passRate": 100,
+            "avgDurationMs": 13.625
+          },
+          {
+            "system": "Vector Only",
+            "scorePercent": 56.25,
+            "passRate": 37.5,
+            "avgDurationMs": 0
+          },
+          {
+            "system": "Keyword + Recency",
+            "scorePercent": 50,
+            "passRate": 37.5,
+            "avgDurationMs": 0.75
+          },
+          {
+            "system": "Recent Window",
+            "scorePercent": 50,
+            "passRate": 37.5,
+            "avgDurationMs": 0
+          }
+        ],
+        "byFamily": [
+          {
+            "family": "information_extraction",
+            "systems": {
+              "Audrey": 1,
+              "Vector Only": 1,
+              "Keyword + Recency": 1,
+              "Recent Window": 1
+            }
+          },
+          {
+            "family": "knowledge_updates",
+            "systems": {
+              "Audrey": 1,
+              "Vector Only": 0.5,
+              "Keyword + Recency": 0.5,
+              "Recent Window": 0.5
+            }
+          },
+          {
+            "family": "multi_session_reasoning",
+            "systems": {
+              "Audrey": 1,
+              "Vector Only": 1,
+              "Keyword + Recency": 1,
+              "Recent Window": 1
+            }
+          },
+          {
+            "family": "temporal_reasoning",
+            "systems": {
+              "Audrey": 1,
+              "Vector Only": 1,
+              "Keyword + Recency": 1,
+              "Recent Window": 1
+            }
+          },
+          {
+            "family": "abstention",
+            "systems": {
+              "Audrey": 1,
+              "Vector Only": 0.5,
+              "Keyword + Recency": 0.5,
+              "Recent Window": 0.5
+            }
+          },
+          {
+            "family": "conflict_resolution",
+            "systems": {
+              "Audrey": 1,
+              "Vector Only": 0.5,
+              "Keyword + Recency": 0,
+              "Recent Window": 0
+            }
+          },
+          {
+            "family": "procedural_learning",
+            "systems": {
+              "Audrey": 1,
+              "Vector Only": 0,
+              "Keyword + Recency": 0,
+              "Recent Window": 0
+            }
+          },
+          {
+            "family": "privacy_boundary",
+            "systems": {
+              "Audrey": 1,
+              "Vector Only": 0,
+              "Keyword + Recency": 0,
+              "Recent Window": 0
+            }
+          }
+        ],
+        "cases": [
+          {
+            "id": "information-extraction",
+            "suite": "retrieval",
+            "title": "Information extraction",
+            "family": "information_extraction",
+            "description": "Recover a directly stated user fact from durable memory.",
+            "query": "Where does Sam live now?",
+            "results": [
+              {
+                "system": "Audrey",
+                "durationMs": 21,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "Sam moved to Austin in March 2026 after leaving Denver.",
+                  "Sam likes to work from coffee shops on South Congress."
+                ],
+                "retrievalSummary": "Sam moved to Austin in March 2026 after leaving Denver. | Sam likes to work from coffee shops on South Congress."
+              },
+              {
+                "system": "Vector Only",
+                "durationMs": 0,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "Sam moved to Austin in March 2026 after leaving Denver.",
+                  "Sam likes to work from coffee shops on South Congress."
+                ],
+                "retrievalSummary": "Sam moved to Austin in March 2026 after leaving Denver. | Sam likes to work from coffee shops on South Congress."
+              },
+              {
+                "system": "Keyword + Recency",
+                "durationMs": 5,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "Sam likes to work from coffee shops on South Congress.",
+                  "Sam moved to Austin in March 2026 after leaving Denver."
+                ],
+                "retrievalSummary": "Sam likes to work from coffee shops on South Congress. | Sam moved to Austin in March 2026 after leaving Denver."
+              },
+              {
+                "system": "Recent Window",
+                "durationMs": 0,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "Sam likes to work from coffee shops on South Congress.",
+                  "Sam moved to Austin in March 2026 after leaving Denver."
+                ],
+                "retrievalSummary": "Sam likes to work from coffee shops on South Congress. | Sam moved to Austin in March 2026 after leaving Denver."
+              }
+            ]
+          },
+          {
+            "id": "knowledge-update",
+            "suite": "retrieval",
+            "title": "Knowledge updates",
+            "family": "knowledge_updates",
+            "description": "Prefer the newer fact over stale preferences.",
+            "query": "What drink does Sam prefer now?",
+            "results": [
+              {
+                "system": "Audrey",
+                "durationMs": 13,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "Sam switched from coffee to green tea after January 2026."
+                ],
+                "retrievalSummary": "Sam switched from coffee to green tea after January 2026."
+              },
+              {
+                "system": "Vector Only",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0.5,
+                "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
+                "topResults": [
+                  "Sam switched from coffee to green tea after January 2026.",
+                  "Sam prefers coffee before early meetings."
+                ],
+                "retrievalSummary": "Sam switched from coffee to green tea after January 2026. | Sam prefers coffee before early meetings."
+              },
+              {
+                "system": "Keyword + Recency",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0.5,
+                "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
+                "topResults": [
+                  "Sam switched from coffee to green tea after January 2026.",
+                  "Sam prefers coffee before early meetings."
+                ],
+                "retrievalSummary": "Sam switched from coffee to green tea after January 2026. | Sam prefers coffee before early meetings."
+              },
+              {
+                "system": "Recent Window",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0.5,
+                "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
+                "topResults": [
+                  "Sam switched from coffee to green tea after January 2026.",
+                  "Sam prefers coffee before early meetings."
+                ],
+                "retrievalSummary": "Sam switched from coffee to green tea after January 2026. | Sam prefers coffee before early meetings."
+              }
+            ]
+          },
+          {
+            "id": "multi-session-reasoning",
+            "suite": "retrieval",
+            "title": "Multi-session reasoning",
+            "family": "multi_session_reasoning",
+            "description": "Synthesize a decision from multiple related episodes.",
+            "query": "Which vendor was approved after the pilot budget review?",
+            "results": [
+              {
+                "system": "Audrey",
+                "durationMs": 13,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam.",
+                  "Finance rejected Fabricam because the support SLA was too weak.",
+                  "The pilot budget review approved Northwind for rollout after the support SLA review."
+                ],
+                "retrievalSummary": "During the January pilot, Sam requested budget approval for vendors Nort | Finance rejected Fabricam because the support SLA was too weak."
+              },
+              {
+                "system": "Vector Only",
+                "durationMs": 0,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam.",
+                  "Finance rejected Fabricam because the support SLA was too weak.",
+                  "The pilot budget review approved Northwind for rollout after the support SLA review."
+                ],
+                "retrievalSummary": "During the January pilot, Sam requested budget approval for vendors Nort | Finance rejected Fabricam because the support SLA was too weak."
+              },
+              {
+                "system": "Keyword + Recency",
+                "durationMs": 0,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "The pilot budget review approved Northwind for rollout after the support SLA review.",
+                  "During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam.",
+                  "Finance rejected Fabricam because the support SLA was too weak."
+                ],
+                "retrievalSummary": "The pilot budget review approved Northwind for rollout after the support | During the January pilot, Sam requested budget approval for vendors Nort"
+              },
+              {
+                "system": "Recent Window",
+                "durationMs": 0,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "The pilot budget review approved Northwind for rollout after the support SLA review.",
+                  "Finance rejected Fabricam because the support SLA was too weak.",
+                  "During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam."
+                ],
+                "retrievalSummary": "The pilot budget review approved Northwind for rollout after the support | Finance rejected Fabricam because the support SLA was too weak."
+              }
+            ]
+          },
+          {
+            "id": "temporal-reasoning",
+            "suite": "retrieval",
+            "title": "Temporal reasoning",
+            "family": "temporal_reasoning",
+            "description": "Answer by isolating the right time window.",
+            "query": "What happened in February 2026?",
+            "results": [
+              {
+                "system": "Audrey",
+                "durationMs": 13,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "In February 2026 Sam completed the architecture review."
+                ],
+                "retrievalSummary": "In February 2026 Sam completed the architecture review."
+              },
+              {
+                "system": "Vector Only",
+                "durationMs": 0,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "In March 2026 Sam started the rollout checklist.",
+                  "In February 2026 Sam completed the architecture review.",
+                  "In January 2026 Sam kicked off the migration plan."
+                ],
+                "retrievalSummary": "In March 2026 Sam started the rollout checklist. | In February 2026 Sam completed the architecture review."
+              },
+              {
+                "system": "Keyword + Recency",
+                "durationMs": 0,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "In February 2026 Sam completed the architecture review.",
+                  "In March 2026 Sam started the rollout checklist.",
+                  "In January 2026 Sam kicked off the migration plan."
+                ],
+                "retrievalSummary": "In February 2026 Sam completed the architecture review. | In March 2026 Sam started the rollout checklist."
+              },
+              {
+                "system": "Recent Window",
+                "durationMs": 0,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "In March 2026 Sam started the rollout checklist.",
+                  "In February 2026 Sam completed the architecture review.",
+                  "In January 2026 Sam kicked off the migration plan."
+                ],
+                "retrievalSummary": "In March 2026 Sam started the rollout checklist. | In February 2026 Sam completed the architecture review."
+              }
+            ]
+          },
+          {
+            "id": "abstention",
+            "suite": "retrieval",
+            "title": "Abstention",
+            "family": "abstention",
+            "description": "Avoid pretending to know a specific identifier that was never stored.",
+            "query": "What is Sam passport number?",
+            "results": [
+              {
+                "system": "Audrey",
+                "durationMs": 12,
+                "passed": true,
+                "score": 1,
+                "summary": "correct abstention",
+                "topResults": [],
+                "retrievalSummary": "no retrieval"
+              },
+              {
+                "system": "Vector Only",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0.5,
+                "summary": "no leak, but retrieved tangential context",
+                "topResults": [
+                  "Sam renewed a passport in February 2026.",
+                  "Sam has a trip to Toronto next month."
+                ],
+                "retrievalSummary": "Sam renewed a passport in February 2026. | Sam has a trip to Toronto next month."
+              },
+              {
+                "system": "Keyword + Recency",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0.5,
+                "summary": "no leak, but retrieved tangential context",
+                "topResults": [
+                  "Sam renewed a passport in February 2026.",
+                  "Sam has a trip to Toronto next month."
+                ],
+                "retrievalSummary": "Sam renewed a passport in February 2026. | Sam has a trip to Toronto next month."
+              },
+              {
+                "system": "Recent Window",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0.5,
+                "summary": "no leak, but retrieved tangential context",
+                "topResults": [
+                  "Sam has a trip to Toronto next month.",
+                  "Sam renewed a passport in February 2026."
+                ],
+                "retrievalSummary": "Sam has a trip to Toronto next month. | Sam renewed a passport in February 2026."
+              }
+            ]
+          },
+          {
+            "id": "conflict-resolution",
+            "suite": "retrieval",
+            "title": "Conflict resolution",
+            "family": "conflict_resolution",
+            "description": "Prefer high-reliability evidence over model-generated noise.",
+            "query": "What caused the outage?",
+            "results": [
+              {
+                "system": "Audrey",
+                "durationMs": 11,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "The outage was caused by an expired TLS certificate on api.example.com."
+                ],
+                "retrievalSummary": "The outage was caused by an expired TLS certificate on api.example.com."
+              },
+              {
+                "system": "Vector Only",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0.5,
+                "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
+                "topResults": [
+                  "The outage was caused by an expired TLS certificate on api.example.com.",
+                  "The outage was caused by database corruption."
+                ],
+                "retrievalSummary": "The outage was caused by an expired TLS certificate on api.example.com. | The outage was caused by database corruption."
+              },
+              {
+                "system": "Keyword + Recency",
+                "durationMs": 1,
+                "passed": false,
+                "score": 0,
+                "summary": "blocked content outranked the correct answer",
+                "topResults": [
+                  "The outage was caused by database corruption.",
+                  "The outage was caused by an expired TLS certificate on api.example.com."
+                ],
+                "retrievalSummary": "The outage was caused by database corruption. | The outage was caused by an expired TLS certificate on api.example.com."
+              },
+              {
+                "system": "Recent Window",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0,
+                "summary": "blocked content outranked the correct answer",
+                "topResults": [
+                  "The outage was caused by database corruption.",
+                  "The outage was caused by an expired TLS certificate on api.example.com."
+                ],
+                "retrievalSummary": "The outage was caused by database corruption. | The outage was caused by an expired TLS certificate on api.example.com."
+              }
+            ]
+          },
+          {
+            "id": "procedural-learning",
+            "suite": "retrieval",
+            "title": "Procedural learning",
+            "family": "procedural_learning",
+            "description": "Turn repeated incidents into an actionable operating rule.",
+            "query": "What should the agent do when payout retries start returning 429?",
+            "results": [
+              {
+                "system": "Audrey",
+                "durationMs": 15,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "When payout retries start returning 429, cap retry batches and stagger retries before escalating."
+                ],
+                "retrievalSummary": "When payout retries start returning 429, cap retry batches and stagger r"
+              },
+              {
+                "system": "Vector Only",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0,
+                "summary": "missed target evidence",
+                "topResults": [
+                  "Risk operations requested an escalation when multiple merchants were affected in the same hour.",
+                  "Payout incident volume dropped after retry batches were capped at 50 merchants per worker.",
+                  "Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute."
+                ],
+                "retrievalSummary": "Risk operations requested an escalation when multiple merchants were aff | Payout incident volume dropped after retry batches were capped at 50 mer"
+              },
+              {
+                "system": "Keyword + Recency",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0,
+                "summary": "missed target evidence",
+                "topResults": [
+                  "Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute.",
+                  "Risk operations requested an escalation when multiple merchants were affected in the same hour.",
+                  "Payout incident volume dropped after retry batches were capped at 50 merchants per worker."
+                ],
+                "retrievalSummary": "Processor X returned HTTP 429 when payout retries exceeded 120 requests  | Risk operations requested an escalation when multiple merchants were aff"
+              },
+              {
+                "system": "Recent Window",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0,
+                "summary": "missed target evidence",
+                "topResults": [
+                  "Risk operations requested an escalation when multiple merchants were affected in the same hour.",
+                  "Payout incident volume dropped after retry batches were capped at 50 merchants per worker.",
+                  "Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute."
+                ],
+                "retrievalSummary": "Risk operations requested an escalation when multiple merchants were aff | Payout incident volume dropped after retry batches were capped at 50 mer"
+              }
+            ]
+          },
+          {
+            "id": "privacy-boundary",
+            "suite": "retrieval",
+            "title": "Privacy boundary",
+            "family": "privacy_boundary",
+            "description": "Never leak private memory into public recall.",
+            "query": "What is Sam API key?",
+            "results": [
+              {
+                "system": "Audrey",
+                "durationMs": 11,
+                "passed": true,
+                "score": 1,
+                "summary": "correct abstention",
+                "topResults": [],
+                "retrievalSummary": "no retrieval"
+              },
+              {
+                "system": "Vector Only",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0,
+                "summary": "leaked restricted content",
+                "topResults": [
+                  "Sam API key is sk-secret-demo-key.",
+                  "Sam prefers dark mode in internal dashboards."
+                ],
+                "retrievalSummary": "Sam API key is sk-secret-demo-key. | Sam prefers dark mode in internal dashboards."
+              },
+              {
+                "system": "Keyword + Recency",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0,
+                "summary": "leaked restricted content",
+                "topResults": [
+                  "Sam API key is sk-secret-demo-key.",
+                  "Sam prefers dark mode in internal dashboards."
+                ],
+                "retrievalSummary": "Sam API key is sk-secret-demo-key. | Sam prefers dark mode in internal dashboards."
+              },
+              {
+                "system": "Recent Window",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0,
+                "summary": "leaked restricted content",
+                "topResults": [
+                  "Sam prefers dark mode in internal dashboards.",
+                  "Sam API key is sk-secret-demo-key."
+                ],
+                "retrievalSummary": "Sam prefers dark mode in internal dashboards. | Sam API key is sk-secret-demo-key."
+              }
+            ]
+          }
+        ]
+      },
+      {
+        "id": "operations",
+        "title": "Memory operations",
+        "description": "Update, delete, merge, and abstention behavior after lifecycle operations.",
+        "overall": [
+          {
+            "system": "Audrey",
+            "scorePercent": 100,
+            "passRate": 100,
+            "avgDurationMs": 12.75
+          },
+          {
+            "system": "Keyword + Recency",
+            "scorePercent": 25,
+            "passRate": 0,
+            "avgDurationMs": 0.25
+          },
+          {
+            "system": "Vector Only",
+            "scorePercent": 12.5,
+            "passRate": 0,
+            "avgDurationMs": 0
+          },
+          {
+            "system": "Recent Window",
+            "scorePercent": 12.5,
+            "passRate": 0,
+            "avgDurationMs": 0
+          }
+        ],
+        "byFamily": [
+          {
+            "family": "update_overwrite",
+            "systems": {
+              "Audrey": 1,
+              "Vector Only": 0.5,
+              "Keyword + Recency": 0.5,
+              "Recent Window": 0.5
+            }
+          },
+          {
+            "family": "delete_and_abstain",
+            "systems": {
+              "Audrey": 1,
+              "Vector Only": 0,
+              "Keyword + Recency": 0.5,
+              "Recent Window": 0
+            }
+          },
+          {
+            "family": "semantic_merge",
+            "systems": {
+              "Audrey": 1,
+              "Vector Only": 0,
+              "Keyword + Recency": 0,
+              "Recent Window": 0
+            }
+          },
+          {
+            "family": "procedural_merge",
+            "systems": {
+              "Audrey": 1,
+              "Vector Only": 0,
+              "Keyword + Recency": 0,
+              "Recent Window": 0
+            }
+          }
+        ],
+        "cases": [
+          {
+            "id": "operation-update-overwrite",
+            "suite": "operations",
+            "title": "Update and overwrite",
+            "family": "update_overwrite",
+            "description": "Current-state recall should prefer the new fact after an explicit overwrite.",
+            "query": "What is the primary deployment region now?",
+            "results": [
+              {
+                "system": "Audrey",
+                "durationMs": 13,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "As of March 2026, the primary deployment region is eu-west-1."
+                ],
+                "retrievalSummary": "As of March 2026, the primary deployment region is eu-west-1."
+              },
+              {
+                "system": "Vector Only",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0.5,
+                "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
+                "topResults": [
+                  "As of March 2026, the primary deployment region is eu-west-1.",
+                  "The primary deployment region is us-east-1."
+                ],
+                "retrievalSummary": "As of March 2026, the primary deployment region is eu-west-1. | The primary deployment region is us-east-1."
+              },
+              {
+                "system": "Keyword + Recency",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0.5,
+                "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
+                "topResults": [
+                  "As of March 2026, the primary deployment region is eu-west-1.",
+                  "The primary deployment region is us-east-1."
+                ],
+                "retrievalSummary": "As of March 2026, the primary deployment region is eu-west-1. | The primary deployment region is us-east-1."
+              },
+              {
+                "system": "Recent Window",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0.5,
+                "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
+                "topResults": [
+                  "As of March 2026, the primary deployment region is eu-west-1.",
+                  "The primary deployment region is us-east-1."
+                ],
+                "retrievalSummary": "As of March 2026, the primary deployment region is eu-west-1. | The primary deployment region is us-east-1."
+              }
+            ]
+          },
+          {
+            "id": "operation-delete-and-abstain",
+            "suite": "operations",
+            "title": "Delete and abstain",
+            "family": "delete_and_abstain",
+            "description": "Explicit deletion should remove a secret from later recall.",
+            "query": "What is the staging API token?",
+            "results": [
+              {
+                "system": "Audrey",
+                "durationMs": 11,
+                "passed": true,
+                "score": 1,
+                "summary": "correct abstention",
+                "topResults": [],
+                "retrievalSummary": "no retrieval"
+              },
+              {
+                "system": "Vector Only",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0,
+                "summary": "leaked restricted content",
+                "topResults": [
+                  "The staging API token is tok-demo-staging-1234."
+                ],
+                "retrievalSummary": "The staging API token is tok-demo-staging-1234."
+              },
+              {
+                "system": "Keyword + Recency",
+                "durationMs": 1,
+                "passed": false,
+                "score": 0.5,
+                "summary": "no leak, but retrieved tangential context",
+                "topResults": [
+                  "The staging environment rotates API credentials weekly."
+                ],
+                "retrievalSummary": "The staging environment rotates API credentials weekly."
+              },
+              {
+                "system": "Recent Window",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0,
+                "summary": "leaked restricted content",
+                "topResults": [
+                  "The staging API token is tok-demo-staging-1234."
+                ],
+                "retrievalSummary": "The staging API token is tok-demo-staging-1234."
+              }
+            ]
+          },
+          {
+            "id": "operation-semantic-merge",
+            "suite": "operations",
+            "title": "Semantic merge",
+            "family": "semantic_merge",
+            "description": "Related episodes should merge into a reusable semantic operating rule.",
+            "query": "When should the disputes queue trigger manual review?",
+            "results": [
+              {
+                "system": "Audrey",
+                "durationMs": 14,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "Repeated disputes from the same BIN in one hour should trigger manual review."
+                ],
+                "retrievalSummary": "Repeated disputes from the same BIN in one hour should trigger manual re"
+              },
+              {
+                "system": "Vector Only",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0,
+                "summary": "missed target evidence",
+                "topResults": [
+                  "Three charge disputes from the same BIN landed in the queue within one hour.",
+                  "The queue stabilized after repeated same-BIN disputes were reviewed manually.",
+                  "Fraud ops escalated repeated same-BIN disputes for analyst attention."
+                ],
+                "retrievalSummary": "Three charge disputes from the same BIN landed in the queue within one h | The queue stabilized after repeated same-BIN disputes were reviewed manu"
+              },
+              {
+                "system": "Keyword + Recency",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0,
+                "summary": "missed target evidence",
+                "topResults": [
+                  "The queue stabilized after repeated same-BIN disputes were reviewed manually.",
+                  "Three charge disputes from the same BIN landed in the queue within one hour.",
+                  "Fraud ops escalated repeated same-BIN disputes for analyst attention."
+                ],
+                "retrievalSummary": "The queue stabilized after repeated same-BIN disputes were reviewed manu | Three charge disputes from the same BIN landed in the queue within one h"
+              },
+              {
+                "system": "Recent Window",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0,
+                "summary": "missed target evidence",
+                "topResults": [
+                  "The queue stabilized after repeated same-BIN disputes were reviewed manually.",
+                  "Fraud ops escalated repeated same-BIN disputes for analyst attention.",
+                  "Three charge disputes from the same BIN landed in the queue within one hour."
+                ],
+                "retrievalSummary": "The queue stabilized after repeated same-BIN disputes were reviewed manu | Fraud ops escalated repeated same-BIN disputes for analyst attention."
+              }
+            ]
+          },
+          {
+            "id": "operation-procedural-merge",
+            "suite": "operations",
+            "title": "Procedural merge",
+            "family": "procedural_merge",
+            "description": "Related episodes should merge into an executable procedure, not just a loose fact.",
+            "query": "What should the agent do after two webhook signature failures?",
+            "results": [
+              {
+                "system": "Audrey",
+                "durationMs": 13,
+                "passed": true,
+                "score": 1,
+                "summary": "retrieved expected evidence",
+                "topResults": [
+                  "When webhook signature verification fails twice, rotate the signing secret and replay queued events."
+                ],
+                "retrievalSummary": "When webhook signature verification fails twice, rotate the signing secr"
+              },
+              {
+                "system": "Vector Only",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0,
+                "summary": "missed target evidence",
+                "topResults": [
+                  "Webhook signature verification failed twice for merchant ACME.",
+                  "Operations recovered the incident by rotating the signing secret.",
+                  "Queued webhook events were replayed after the signing secret changed."
+                ],
+                "retrievalSummary": "Webhook signature verification failed twice for merchant ACME. | Operations recovered the incident by rotating the signing secret."
+              },
+              {
+                "system": "Keyword + Recency",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0,
+                "summary": "missed target evidence",
+                "topResults": [
+                  "Queued webhook events were replayed after the signing secret changed.",
+                  "Webhook signature verification failed twice for merchant ACME.",
+                  "Operations recovered the incident by rotating the signing secret."
+                ],
+                "retrievalSummary": "Queued webhook events were replayed after the signing secret changed. | Webhook signature verification failed twice for merchant ACME."
+              },
+              {
+                "system": "Recent Window",
+                "durationMs": 0,
+                "passed": false,
+                "score": 0,
+                "summary": "missed target evidence",
+                "topResults": [
+                  "Queued webhook events were replayed after the signing secret changed.",
+                  "Operations recovered the incident by rotating the signing secret.",
+                  "Webhook signature verification failed twice for merchant ACME."
+                ],
+                "retrievalSummary": "Queued webhook events were replayed after the signing secret changed. | Operations recovered the incident by rotating the signing secret."
+              }
+            ]
+          }
+        ]
+      }
+    ],
+    "cases": [
+      {
+        "id": "information-extraction",
+        "suite": "retrieval",
+        "title": "Information extraction",
+        "family": "information_extraction",
+        "description": "Recover a directly stated user fact from durable memory.",
+        "query": "Where does Sam live now?",
+        "results": [
+          {
+            "system": "Audrey",
+            "durationMs": 21,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "Sam moved to Austin in March 2026 after leaving Denver.",
+              "Sam likes to work from coffee shops on South Congress."
+            ],
+            "retrievalSummary": "Sam moved to Austin in March 2026 after leaving Denver. | Sam likes to work from coffee shops on South Congress."
+          },
+          {
+            "system": "Vector Only",
+            "durationMs": 0,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "Sam moved to Austin in March 2026 after leaving Denver.",
+              "Sam likes to work from coffee shops on South Congress."
+            ],
+            "retrievalSummary": "Sam moved to Austin in March 2026 after leaving Denver. | Sam likes to work from coffee shops on South Congress."
+          },
+          {
+            "system": "Keyword + Recency",
+            "durationMs": 5,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "Sam likes to work from coffee shops on South Congress.",
+              "Sam moved to Austin in March 2026 after leaving Denver."
+            ],
+            "retrievalSummary": "Sam likes to work from coffee shops on South Congress. | Sam moved to Austin in March 2026 after leaving Denver."
+          },
+          {
+            "system": "Recent Window",
+            "durationMs": 0,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "Sam likes to work from coffee shops on South Congress.",
+              "Sam moved to Austin in March 2026 after leaving Denver."
+            ],
+            "retrievalSummary": "Sam likes to work from coffee shops on South Congress. | Sam moved to Austin in March 2026 after leaving Denver."
+          }
+        ]
+      },
+      {
+        "id": "knowledge-update",
+        "suite": "retrieval",
+        "title": "Knowledge updates",
+        "family": "knowledge_updates",
+        "description": "Prefer the newer fact over stale preferences.",
+        "query": "What drink does Sam prefer now?",
+        "results": [
+          {
+            "system": "Audrey",
+            "durationMs": 13,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "Sam switched from coffee to green tea after January 2026."
+            ],
+            "retrievalSummary": "Sam switched from coffee to green tea after January 2026."
+          },
+          {
+            "system": "Vector Only",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0.5,
+            "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
+            "topResults": [
+              "Sam switched from coffee to green tea after January 2026.",
+              "Sam prefers coffee before early meetings."
+            ],
+            "retrievalSummary": "Sam switched from coffee to green tea after January 2026. | Sam prefers coffee before early meetings."
+          },
+          {
+            "system": "Keyword + Recency",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0.5,
+            "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
+            "topResults": [
+              "Sam switched from coffee to green tea after January 2026.",
+              "Sam prefers coffee before early meetings."
+            ],
+            "retrievalSummary": "Sam switched from coffee to green tea after January 2026. | Sam prefers coffee before early meetings."
+          },
+          {
+            "system": "Recent Window",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0.5,
+            "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
+            "topResults": [
+              "Sam switched from coffee to green tea after January 2026.",
+              "Sam prefers coffee before early meetings."
+            ],
+            "retrievalSummary": "Sam switched from coffee to green tea after January 2026. | Sam prefers coffee before early meetings."
+          }
+        ]
+      },
+      {
+        "id": "multi-session-reasoning",
+        "suite": "retrieval",
+        "title": "Multi-session reasoning",
+        "family": "multi_session_reasoning",
+        "description": "Synthesize a decision from multiple related episodes.",
+        "query": "Which vendor was approved after the pilot budget review?",
+        "results": [
+          {
+            "system": "Audrey",
+            "durationMs": 13,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam.",
+              "Finance rejected Fabricam because the support SLA was too weak.",
+              "The pilot budget review approved Northwind for rollout after the support SLA review."
+            ],
+            "retrievalSummary": "During the January pilot, Sam requested budget approval for vendors Nort | Finance rejected Fabricam because the support SLA was too weak."
+          },
+          {
+            "system": "Vector Only",
+            "durationMs": 0,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam.",
+              "Finance rejected Fabricam because the support SLA was too weak.",
+              "The pilot budget review approved Northwind for rollout after the support SLA review."
+            ],
+            "retrievalSummary": "During the January pilot, Sam requested budget approval for vendors Nort | Finance rejected Fabricam because the support SLA was too weak."
+          },
+          {
+            "system": "Keyword + Recency",
+            "durationMs": 0,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "The pilot budget review approved Northwind for rollout after the support SLA review.",
+              "During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam.",
+              "Finance rejected Fabricam because the support SLA was too weak."
+            ],
+            "retrievalSummary": "The pilot budget review approved Northwind for rollout after the support | During the January pilot, Sam requested budget approval for vendors Nort"
+          },
+          {
+            "system": "Recent Window",
+            "durationMs": 0,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "The pilot budget review approved Northwind for rollout after the support SLA review.",
+              "Finance rejected Fabricam because the support SLA was too weak.",
+              "During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam."
+            ],
+            "retrievalSummary": "The pilot budget review approved Northwind for rollout after the support | Finance rejected Fabricam because the support SLA was too weak."
+          }
+        ]
+      },
+      {
+        "id": "temporal-reasoning",
+        "suite": "retrieval",
+        "title": "Temporal reasoning",
+        "family": "temporal_reasoning",
+        "description": "Answer by isolating the right time window.",
+        "query": "What happened in February 2026?",
+        "results": [
+          {
+            "system": "Audrey",
+            "durationMs": 13,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "In February 2026 Sam completed the architecture review."
+            ],
+            "retrievalSummary": "In February 2026 Sam completed the architecture review."
+          },
+          {
+            "system": "Vector Only",
+            "durationMs": 0,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "In March 2026 Sam started the rollout checklist.",
+              "In February 2026 Sam completed the architecture review.",
+              "In January 2026 Sam kicked off the migration plan."
+            ],
+            "retrievalSummary": "In March 2026 Sam started the rollout checklist. | In February 2026 Sam completed the architecture review."
+          },
+          {
+            "system": "Keyword + Recency",
+            "durationMs": 0,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "In February 2026 Sam completed the architecture review.",
+              "In March 2026 Sam started the rollout checklist.",
+              "In January 2026 Sam kicked off the migration plan."
+            ],
+            "retrievalSummary": "In February 2026 Sam completed the architecture review. | In March 2026 Sam started the rollout checklist."
+          },
+          {
+            "system": "Recent Window",
+            "durationMs": 0,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "In March 2026 Sam started the rollout checklist.",
+              "In February 2026 Sam completed the architecture review.",
+              "In January 2026 Sam kicked off the migration plan."
+            ],
+            "retrievalSummary": "In March 2026 Sam started the rollout checklist. | In February 2026 Sam completed the architecture review."
+          }
+        ]
+      },
+      {
+        "id": "abstention",
+        "suite": "retrieval",
+        "title": "Abstention",
+        "family": "abstention",
+        "description": "Avoid pretending to know a specific identifier that was never stored.",
+        "query": "What is Sam passport number?",
+        "results": [
+          {
+            "system": "Audrey",
+            "durationMs": 12,
+            "passed": true,
+            "score": 1,
+            "summary": "correct abstention",
+            "topResults": [],
+            "retrievalSummary": "no retrieval"
+          },
+          {
+            "system": "Vector Only",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0.5,
+            "summary": "no leak, but retrieved tangential context",
+            "topResults": [
+              "Sam renewed a passport in February 2026.",
+              "Sam has a trip to Toronto next month."
+            ],
+            "retrievalSummary": "Sam renewed a passport in February 2026. | Sam has a trip to Toronto next month."
+          },
+          {
+            "system": "Keyword + Recency",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0.5,
+            "summary": "no leak, but retrieved tangential context",
+            "topResults": [
+              "Sam renewed a passport in February 2026.",
+              "Sam has a trip to Toronto next month."
+            ],
+            "retrievalSummary": "Sam renewed a passport in February 2026. | Sam has a trip to Toronto next month."
+          },
+          {
+            "system": "Recent Window",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0.5,
+            "summary": "no leak, but retrieved tangential context",
+            "topResults": [
+              "Sam has a trip to Toronto next month.",
+              "Sam renewed a passport in February 2026."
+            ],
+            "retrievalSummary": "Sam has a trip to Toronto next month. | Sam renewed a passport in February 2026."
+          }
+        ]
+      },
+      {
+        "id": "conflict-resolution",
+        "suite": "retrieval",
+        "title": "Conflict resolution",
+        "family": "conflict_resolution",
+        "description": "Prefer high-reliability evidence over model-generated noise.",
+        "query": "What caused the outage?",
+        "results": [
+          {
+            "system": "Audrey",
+            "durationMs": 11,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "The outage was caused by an expired TLS certificate on api.example.com."
+            ],
+            "retrievalSummary": "The outage was caused by an expired TLS certificate on api.example.com."
+          },
+          {
+            "system": "Vector Only",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0.5,
+            "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
+            "topResults": [
+              "The outage was caused by an expired TLS certificate on api.example.com.",
+              "The outage was caused by database corruption."
+            ],
+            "retrievalSummary": "The outage was caused by an expired TLS certificate on api.example.com. | The outage was caused by database corruption."
+          },
+          {
+            "system": "Keyword + Recency",
+            "durationMs": 1,
+            "passed": false,
+            "score": 0,
+            "summary": "blocked content outranked the correct answer",
+            "topResults": [
+              "The outage was caused by database corruption.",
+              "The outage was caused by an expired TLS certificate on api.example.com."
+            ],
+            "retrievalSummary": "The outage was caused by database corruption. | The outage was caused by an expired TLS certificate on api.example.com."
+          },
+          {
+            "system": "Recent Window",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0,
+            "summary": "blocked content outranked the correct answer",
+            "topResults": [
+              "The outage was caused by database corruption.",
+              "The outage was caused by an expired TLS certificate on api.example.com."
+            ],
+            "retrievalSummary": "The outage was caused by database corruption. | The outage was caused by an expired TLS certificate on api.example.com."
+          }
+        ]
+      },
+      {
+        "id": "procedural-learning",
+        "suite": "retrieval",
+        "title": "Procedural learning",
+        "family": "procedural_learning",
+        "description": "Turn repeated incidents into an actionable operating rule.",
+        "query": "What should the agent do when payout retries start returning 429?",
+        "results": [
+          {
+            "system": "Audrey",
+            "durationMs": 15,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "When payout retries start returning 429, cap retry batches and stagger retries before escalating."
+            ],
+            "retrievalSummary": "When payout retries start returning 429, cap retry batches and stagger r"
+          },
+          {
+            "system": "Vector Only",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0,
+            "summary": "missed target evidence",
+            "topResults": [
+              "Risk operations requested an escalation when multiple merchants were affected in the same hour.",
+              "Payout incident volume dropped after retry batches were capped at 50 merchants per worker.",
+              "Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute."
+            ],
+            "retrievalSummary": "Risk operations requested an escalation when multiple merchants were aff | Payout incident volume dropped after retry batches were capped at 50 mer"
+          },
+          {
+            "system": "Keyword + Recency",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0,
+            "summary": "missed target evidence",
+            "topResults": [
+              "Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute.",
+              "Risk operations requested an escalation when multiple merchants were affected in the same hour.",
+              "Payout incident volume dropped after retry batches were capped at 50 merchants per worker."
+            ],
+            "retrievalSummary": "Processor X returned HTTP 429 when payout retries exceeded 120 requests  | Risk operations requested an escalation when multiple merchants were aff"
+          },
+          {
+            "system": "Recent Window",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0,
+            "summary": "missed target evidence",
+            "topResults": [
+              "Risk operations requested an escalation when multiple merchants were affected in the same hour.",
+              "Payout incident volume dropped after retry batches were capped at 50 merchants per worker.",
+              "Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute."
+            ],
+            "retrievalSummary": "Risk operations requested an escalation when multiple merchants were aff | Payout incident volume dropped after retry batches were capped at 50 mer"
+          }
+        ]
+      },
+      {
+        "id": "privacy-boundary",
+        "suite": "retrieval",
+        "title": "Privacy boundary",
+        "family": "privacy_boundary",
+        "description": "Never leak private memory into public recall.",
+        "query": "What is Sam API key?",
+        "results": [
+          {
+            "system": "Audrey",
+            "durationMs": 11,
+            "passed": true,
+            "score": 1,
+            "summary": "correct abstention",
+            "topResults": [],
+            "retrievalSummary": "no retrieval"
+          },
+          {
+            "system": "Vector Only",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0,
+            "summary": "leaked restricted content",
+            "topResults": [
+              "Sam API key is sk-secret-demo-key.",
+              "Sam prefers dark mode in internal dashboards."
+            ],
+            "retrievalSummary": "Sam API key is sk-secret-demo-key. | Sam prefers dark mode in internal dashboards."
+          },
+          {
+            "system": "Keyword + Recency",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0,
+            "summary": "leaked restricted content",
+            "topResults": [
+              "Sam API key is sk-secret-demo-key.",
+              "Sam prefers dark mode in internal dashboards."
+            ],
+            "retrievalSummary": "Sam API key is sk-secret-demo-key. | Sam prefers dark mode in internal dashboards."
+          },
+          {
+            "system": "Recent Window",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0,
+            "summary": "leaked restricted content",
+            "topResults": [
+              "Sam prefers dark mode in internal dashboards.",
+              "Sam API key is sk-secret-demo-key."
+            ],
+            "retrievalSummary": "Sam prefers dark mode in internal dashboards. | Sam API key is sk-secret-demo-key."
+          }
+        ]
+      },
+      {
+        "id": "operation-update-overwrite",
+        "suite": "operations",
+        "title": "Update and overwrite",
+        "family": "update_overwrite",
+        "description": "Current-state recall should prefer the new fact after an explicit overwrite.",
+        "query": "What is the primary deployment region now?",
+        "results": [
+          {
+            "system": "Audrey",
+            "durationMs": 13,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "As of March 2026, the primary deployment region is eu-west-1."
+            ],
+            "retrievalSummary": "As of March 2026, the primary deployment region is eu-west-1."
+          },
+          {
+            "system": "Vector Only",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0.5,
+            "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
+            "topResults": [
+              "As of March 2026, the primary deployment region is eu-west-1.",
+              "The primary deployment region is us-east-1."
+            ],
+            "retrievalSummary": "As of March 2026, the primary deployment region is eu-west-1. | The primary deployment region is us-east-1."
+          },
+          {
+            "system": "Keyword + Recency",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0.5,
+            "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
+            "topResults": [
+              "As of March 2026, the primary deployment region is eu-west-1.",
+              "The primary deployment region is us-east-1."
+            ],
+            "retrievalSummary": "As of March 2026, the primary deployment region is eu-west-1. | The primary deployment region is us-east-1."
+          },
+          {
+            "system": "Recent Window",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0.5,
+            "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
+            "topResults": [
+              "As of March 2026, the primary deployment region is eu-west-1.",
+              "The primary deployment region is us-east-1."
+            ],
+            "retrievalSummary": "As of March 2026, the primary deployment region is eu-west-1. | The primary deployment region is us-east-1."
+          }
+        ]
+      },
+      {
+        "id": "operation-delete-and-abstain",
+        "suite": "operations",
+        "title": "Delete and abstain",
+        "family": "delete_and_abstain",
+        "description": "Explicit deletion should remove a secret from later recall.",
+        "query": "What is the staging API token?",
+        "results": [
+          {
+            "system": "Audrey",
+            "durationMs": 11,
+            "passed": true,
+            "score": 1,
+            "summary": "correct abstention",
+            "topResults": [],
+            "retrievalSummary": "no retrieval"
+          },
+          {
+            "system": "Vector Only",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0,
+            "summary": "leaked restricted content",
+            "topResults": [
+              "The staging API token is tok-demo-staging-1234."
+            ],
+            "retrievalSummary": "The staging API token is tok-demo-staging-1234."
+          },
+          {
+            "system": "Keyword + Recency",
+            "durationMs": 1,
+            "passed": false,
+            "score": 0.5,
+            "summary": "no leak, but retrieved tangential context",
+            "topResults": [
+              "The staging environment rotates API credentials weekly."
+            ],
+            "retrievalSummary": "The staging environment rotates API credentials weekly."
+          },
+          {
+            "system": "Recent Window",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0,
+            "summary": "leaked restricted content",
+            "topResults": [
+              "The staging API token is tok-demo-staging-1234."
+            ],
+            "retrievalSummary": "The staging API token is tok-demo-staging-1234."
+          }
+        ]
+      },
+      {
+        "id": "operation-semantic-merge",
+        "suite": "operations",
+        "title": "Semantic merge",
+        "family": "semantic_merge",
+        "description": "Related episodes should merge into a reusable semantic operating rule.",
+        "query": "When should the disputes queue trigger manual review?",
+        "results": [
+          {
+            "system": "Audrey",
+            "durationMs": 14,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "Repeated disputes from the same BIN in one hour should trigger manual review."
+            ],
+            "retrievalSummary": "Repeated disputes from the same BIN in one hour should trigger manual re"
+          },
+          {
+            "system": "Vector Only",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0,
+            "summary": "missed target evidence",
+            "topResults": [
+              "Three charge disputes from the same BIN landed in the queue within one hour.",
+              "The queue stabilized after repeated same-BIN disputes were reviewed manually.",
+              "Fraud ops escalated repeated same-BIN disputes for analyst attention."
+            ],
+            "retrievalSummary": "Three charge disputes from the same BIN landed in the queue within one h | The queue stabilized after repeated same-BIN disputes were reviewed manu"
+          },
+          {
+            "system": "Keyword + Recency",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0,
+            "summary": "missed target evidence",
+            "topResults": [
+              "The queue stabilized after repeated same-BIN disputes were reviewed manually.",
+              "Three charge disputes from the same BIN landed in the queue within one hour.",
+              "Fraud ops escalated repeated same-BIN disputes for analyst attention."
+            ],
+            "retrievalSummary": "The queue stabilized after repeated same-BIN disputes were reviewed manu | Three charge disputes from the same BIN landed in the queue within one h"
+          },
+          {
+            "system": "Recent Window",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0,
+            "summary": "missed target evidence",
+            "topResults": [
+              "The queue stabilized after repeated same-BIN disputes were reviewed manually.",
+              "Fraud ops escalated repeated same-BIN disputes for analyst attention.",
+              "Three charge disputes from the same BIN landed in the queue within one hour."
+            ],
+            "retrievalSummary": "The queue stabilized after repeated same-BIN disputes were reviewed manu | Fraud ops escalated repeated same-BIN disputes for analyst attention."
+          }
+        ]
+      },
+      {
+        "id": "operation-procedural-merge",
+        "suite": "operations",
+        "title": "Procedural merge",
+        "family": "procedural_merge",
+        "description": "Related episodes should merge into an executable procedure, not just a loose fact.",
+        "query": "What should the agent do after two webhook signature failures?",
+        "results": [
+          {
+            "system": "Audrey",
+            "durationMs": 13,
+            "passed": true,
+            "score": 1,
+            "summary": "retrieved expected evidence",
+            "topResults": [
+              "When webhook signature verification fails twice, rotate the signing secret and replay queued events."
+            ],
+            "retrievalSummary": "When webhook signature verification fails twice, rotate the signing secr"
+          },
+          {
+            "system": "Vector Only",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0,
+            "summary": "missed target evidence",
+            "topResults": [
+              "Webhook signature verification failed twice for merchant ACME.",
+              "Operations recovered the incident by rotating the signing secret.",
+              "Queued webhook events were replayed after the signing secret changed."
+            ],
+            "retrievalSummary": "Webhook signature verification failed twice for merchant ACME. | Operations recovered the incident by rotating the signing secret."
+          },
+          {
+            "system": "Keyword + Recency",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0,
+            "summary": "missed target evidence",
+            "topResults": [
+              "Queued webhook events were replayed after the signing secret changed.",
+              "Webhook signature verification failed twice for merchant ACME.",
+              "Operations recovered the incident by rotating the signing secret."
+            ],
+            "retrievalSummary": "Queued webhook events were replayed after the signing secret changed. | Webhook signature verification failed twice for merchant ACME."
+          },
+          {
+            "system": "Recent Window",
+            "durationMs": 0,
+            "passed": false,
+            "score": 0,
+            "summary": "missed target evidence",
+            "topResults": [
+              "Queued webhook events were replayed after the signing secret changed.",
+              "Operations recovered the incident by rotating the signing secret.",
+              "Webhook signature verification failed twice for merchant ACME."
+            ],
+            "retrievalSummary": "Queued webhook events were replayed after the signing secret changed. | Operations recovered the incident by rotating the signing secret."
+          }
+        ]
+      }
+    ]
+  },
+  "external": {
+    "benchmark": "LoCoMo",
+    "leaderboard": [
+      {
+        "system": "MIRIX",
+        "benchmark": "LoCoMo",
+        "score": 85.4,
+        "unit": "accuracy",
+        "source": "https://arxiv.org/abs/2507.07957",
+        "note": "Published LoCoMo result from the MIRIX paper."
+      },
+      {
+        "system": "Letta Filesystem",
+        "benchmark": "LoCoMo",
+        "score": 74,
+        "unit": "accuracy",
+        "source": "https://www.letta.com/blog/benchmarking-ai-agent-memory",
+        "note": "Filesystem-style memory result reported by Letta."
+      },
+      {
+        "system": "Mem0 Graph Memory",
+        "benchmark": "LoCoMo",
+        "score": 68.5,
+        "unit": "accuracy",
+        "source": "https://arxiv.org/abs/2504.19413",
+        "note": "Graph memory variant reported in the Mem0 paper."
+      },
+      {
+        "system": "Mem0",
+        "benchmark": "LoCoMo",
+        "score": 66.9,
+        "unit": "accuracy",
+        "source": "https://arxiv.org/abs/2504.19413",
+        "note": "Core Mem0 LoCoMo score reported in the Mem0 paper."
+      },
+      {
+        "system": "OpenAI Memory",
+        "benchmark": "LoCoMo",
+        "score": 52.9,
+        "unit": "accuracy",
+        "source": "https://arxiv.org/abs/2504.19413",
+        "note": "OpenAI memory baseline as reported by the Mem0 paper."
+      }
+    ]
+  },
+  "trends": [
+    {
+      "title": "Memory is moving from flat retrieval to typed systems",
+      "summary": "Recent work treats episodic, semantic, procedural, and graph memory as separate but cooperating layers.",
+      "source": "https://arxiv.org/abs/2507.03724"
+    },
+    {
+      "title": "Benchmarks now emphasize multi-session realism",
+      "summary": "LongMemEval and LoCoMo push memory systems toward temporal updates, abstraction, and cross-session reasoning instead of single-turn fact recall.",
+      "source": "https://arxiv.org/abs/2410.10813"
+    },
+    {
+      "title": "Context engineering is now competing with retrieval-first designs",
+      "summary": "Letta argues filesystem and memory-block approaches can outperform simpler retrieval-only memory on realistic long-horizon tasks.",
+      "source": "https://www.letta.com/blog/memory-blocks"
+    },
+    {
+      "title": "Production teams care about latency and token footprint, not just recall quality",
+      "summary": "Mem0 frames memory as a cost and latency optimization surface in addition to a personalization surface.",
+      "source": "https://arxiv.org/abs/2504.19413"
+    },
+    {
+      "title": "Temporal and multimodal memory are becoming table stakes",
+      "summary": "MIRIX and Graphiti both model time and state change explicitly instead of assuming memories stay forever true.",
+      "source": "https://arxiv.org/abs/2507.07957"
+    }
+  ]
+}