Evilander committed on
Commit
253d35a
·
verified ·
1 Parent(s): ad1d207

Upload 7 files

Browse files
index.html CHANGED
@@ -1,19 +1,210 @@
1
  <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
19
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <title>Audrey Memory Benchmark</title>
6
+ <style>
7
+ body { font-family: "Segoe UI", Arial, sans-serif; margin: 32px; color: #111827; background: #f8fafc; }
8
+ main { max-width: 1120px; margin: 0 auto; }
9
+ h1, h2 { margin-bottom: 12px; }
10
+ p, li { line-height: 1.5; }
11
+ .callout { background: white; border: 1px solid #cbd5e1; border-radius: 16px; padding: 20px; margin-bottom: 24px; }
12
+ .grid { display: grid; gap: 24px; grid-template-columns: 1fr; }
13
+ img { width: 100%; border: 1px solid #cbd5e1; border-radius: 16px; background: white; }
14
+ table { width: 100%; border-collapse: collapse; background: white; border-radius: 16px; overflow: hidden; }
15
+ th, td { border: 1px solid #cbd5e1; padding: 12px; vertical-align: top; text-align: left; }
16
+ th { background: #e2e8f0; }
17
+ code { background: #e2e8f0; padding: 2px 6px; border-radius: 6px; }
18
+ </style>
19
+ </head>
20
+ <body>
21
+ <main>
22
+ <h1>Audrey Memory Benchmark</h1>
23
+ <div class="callout">
24
+ <p><strong>Method:</strong> Audrey is scored on a LongMemEval-inspired retrieval benchmark plus an operation-level lifecycle benchmark. The report still separates local Audrey-versus-baseline results from published external LoCoMo numbers so the comparison stays honest.</p>
25
+ <p><strong>Run:</strong> <code>node benchmarks/run.js --provider mock --dimensions 64</code></p>
26
+ <p><strong>Generated:</strong> 2026-05-01T03:20:07.968Z</p>
27
+ </div>
28
+
29
+ <div class="grid">
30
+ <section class="callout">
31
+ <h2>Combined Local Benchmark</h2>
32
+ <img src="./local-overall.svg" alt="Combined local benchmark bar chart" />
33
+ </section>
34
+
35
+
36
+ <section class="callout">
37
+ <h2>Retrieval capabilities Benchmark</h2>
38
+ <p>LongMemEval-style memory abilities plus privacy and abstention.</p>
39
+ <img src="./retrieval-overall.svg" alt="Retrieval capabilities Benchmark chart" />
40
+ </section>
41
+
42
+
43
+ <section class="callout">
44
+ <h2>Memory operations Benchmark</h2>
45
+ <p>Update, delete, merge, and abstention behavior after lifecycle operations.</p>
46
+ <img src="./operations-overall.svg" alt="Memory operations Benchmark chart" />
47
+ </section>
48
+
49
+
50
+ <section class="callout">
51
+ <h2>Published Leaderboard</h2>
52
+ <img src="./published-locomo.svg" alt="Published LoCoMo leaderboard bar chart" />
53
+ </section>
54
+ </div>
55
+
56
+ <section class="callout">
57
+ <h2>Case Matrix</h2>
58
+ <table>
59
+ <thead>
60
+ <tr>
61
+ <th>Case</th>
62
+ <th>Suite</th>
63
+ <th>Family</th>
64
+ <th>Audrey</th><th>Vector Only</th><th>Keyword + Recency</th><th>Recent Window</th>
65
+ </tr>
66
+ </thead>
67
+ <tbody>
68
+
69
+ <tr>
70
+ <td>Information extraction</td>
71
+ <td>retrieval</td>
72
+ <td>information_extraction</td>
73
+ <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td><td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td><td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td><td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
74
+ </tr>
75
+
76
+
77
+ <tr>
78
+ <td>Knowledge updates</td>
79
+ <td>retrieval</td>
80
+ <td>knowledge_updates</td>
81
+ <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td><td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">retrieved expected evidence, but conflicting evidence still appeared later</span></td><td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">retrieved expected evidence, but conflicting evidence still appeared later</span></td><td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">retrieved expected evidence, but conflicting evidence still appeared later</span></td>
82
+ </tr>
83
+
84
+
85
+ <tr>
86
+ <td>Multi-session reasoning</td>
87
+ <td>retrieval</td>
88
+ <td>multi_session_reasoning</td>
89
+ <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td><td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td><td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td><td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
90
+ </tr>
91
+
92
+
93
+ <tr>
94
+ <td>Temporal reasoning</td>
95
+ <td>retrieval</td>
96
+ <td>temporal_reasoning</td>
97
+ <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td><td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td><td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td><td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td>
98
+ </tr>
99
+
100
+
101
+ <tr>
102
+ <td>Abstention</td>
103
+ <td>retrieval</td>
104
+ <td>abstention</td>
105
+ <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">correct abstention</span></td><td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">no leak, but retrieved tangential context</span></td><td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">no leak, but retrieved tangential context</span></td><td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">no leak, but retrieved tangential context</span></td>
106
+ </tr>
107
+
108
+
109
+ <tr>
110
+ <td>Conflict resolution</td>
111
+ <td>retrieval</td>
112
+ <td>conflict_resolution</td>
113
+ <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td><td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">retrieved expected evidence, but conflicting evidence still appeared later</span></td><td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">blocked content outranked the correct answer</span></td><td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">blocked content outranked the correct answer</span></td>
114
+ </tr>
115
+
116
+
117
+ <tr>
118
+ <td>Procedural learning</td>
119
+ <td>retrieval</td>
120
+ <td>procedural_learning</td>
121
+ <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td><td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td><td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td><td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td>
122
+ </tr>
123
+
124
+
125
+ <tr>
126
+ <td>Privacy boundary</td>
127
+ <td>retrieval</td>
128
+ <td>privacy_boundary</td>
129
+ <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">correct abstention</span></td><td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">leaked restricted content</span></td><td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">leaked restricted content</span></td><td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">leaked restricted content</span></td>
130
+ </tr>
131
+
132
+
133
+ <tr>
134
+ <td>Update and overwrite</td>
135
+ <td>operations</td>
136
+ <td>update_overwrite</td>
137
+ <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td><td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">retrieved expected evidence, but conflicting evidence still appeared later</span></td><td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">retrieved expected evidence, but conflicting evidence still appeared later</span></td><td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">retrieved expected evidence, but conflicting evidence still appeared later</span></td>
138
+ </tr>
139
+
140
+
141
+ <tr>
142
+ <td>Delete and abstain</td>
143
+ <td>operations</td>
144
+ <td>delete_and_abstain</td>
145
+ <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">correct abstention</span></td><td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">leaked restricted content</span></td><td style="background:#fff7ed;color:#9a3412">0.50<br /><span style="font-size:12px">no leak, but retrieved tangential context</span></td><td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">leaked restricted content</span></td>
146
+ </tr>
147
+
148
+
149
+ <tr>
150
+ <td>Semantic merge</td>
151
+ <td>operations</td>
152
+ <td>semantic_merge</td>
153
+ <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td><td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td><td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td><td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td>
154
+ </tr>
155
+
156
+
157
+ <tr>
158
+ <td>Procedural merge</td>
159
+ <td>operations</td>
160
+ <td>procedural_merge</td>
161
+ <td style="background:#ecfdf5;color:#065f46">1.00<br /><span style="font-size:12px">retrieved expected evidence</span></td><td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td><td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td><td style="background:#fef2f2;color:#991b1b">0.00<br /><span style="font-size:12px">missed target evidence</span></td>
162
+ </tr>
163
+
164
+ </tbody>
165
+ </table>
166
+ </section>
167
+
168
+ <section class="callout">
169
+ <h2>March 23, 2026 Memory Trends</h2>
170
+ <ul>
171
+
172
+ <li>
173
+ <strong>Memory is moving from flat retrieval to typed systems</strong><br />
174
+ Recent work treats episodic, semantic, procedural, and graph memory as separate but cooperating layers.<br />
175
+ <a href="https://arxiv.org/abs/2507.03724">https://arxiv.org/abs/2507.03724</a>
176
+ </li>
177
+
178
+
179
+ <li>
180
+ <strong>Benchmarks now emphasize multi-session realism</strong><br />
181
+ LongMemEval and LoCoMo push memory systems toward temporal updates, abstraction, and cross-session reasoning instead of single-turn fact recall.<br />
182
+ <a href="https://arxiv.org/abs/2410.10813">https://arxiv.org/abs/2410.10813</a>
183
+ </li>
184
+
185
+
186
+ <li>
187
+ <strong>Context engineering is now competing with retrieval-first designs</strong><br />
188
+ Letta argues filesystem and memory-block approaches can outperform simpler retrieval-only memory on realistic long-horizon tasks.<br />
189
+ <a href="https://www.letta.com/blog/memory-blocks">https://www.letta.com/blog/memory-blocks</a>
190
+ </li>
191
+
192
+
193
+ <li>
194
+ <strong>Production teams care about latency and token footprint, not just recall quality</strong><br />
195
+ Mem0 frames memory as a cost and latency optimization surface in addition to a personalization surface.<br />
196
+ <a href="https://arxiv.org/abs/2504.19413">https://arxiv.org/abs/2504.19413</a>
197
+ </li>
198
+
199
+
200
+ <li>
201
+ <strong>Temporal and multimodal memory are becoming table stakes</strong><br />
202
+ MIRIX and Graphiti both model time and state change explicitly instead of assuming memories stay forever true.<br />
203
+ <a href="https://arxiv.org/abs/2507.07957">https://arxiv.org/abs/2507.07957</a>
204
+ </li>
205
+
206
+ </ul>
207
+ </section>
208
+ </main>
209
+ </body>
210
+ </html>
local-overall.svg ADDED
operations-overall.svg ADDED
perf-0.22.2.json ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "generatedAt": "2026-05-01T02:15:29.400Z",
3
+ "durationMs": 4600,
4
+ "audreyVersion": null,
5
+ "gitSha": "e2e821b",
6
+ "methodology": {
7
+ "embedding": "mock provider, 64 dimensions (in-process, no network)",
8
+ "llm": "mock provider (in-process)",
9
+ "retrieval": "hybrid (vector + lexical) with limit=5",
10
+ "sizes": [
11
+ 100,
12
+ 1000,
13
+ 5000
14
+ ],
15
+ "recallRunsPerSize": 50,
16
+ "notes": "Latency is wall-clock for a single call from a JS caller. Cloud and local 384-dim providers will report higher recall latency dominated by embedding cost and network. Run on your own hardware before quoting."
17
+ },
18
+ "machine": {
19
+ "node": "25.5.0",
20
+ "v8": "14.1.146.11-node.18",
21
+ "platform": "win32",
22
+ "arch": "x64",
23
+ "osRelease": "10.0.26200",
24
+ "cpuCount": 24,
25
+ "cpuModel": "AMD Ryzen 9 7900X3D 12-Core Processor ",
26
+ "memoryGb": 62.9
27
+ },
28
+ "sizes": [
29
+ {
30
+ "corpusSize": 100,
31
+ "encodeMs": {
32
+ "samples": 100,
33
+ "p50": 0.331,
34
+ "p95": 0.589,
35
+ "p99": 7.65,
36
+ "min": 0.214,
37
+ "max": 10.978,
38
+ "mean": 0.577
39
+ },
40
+ "hybridRecallMs": {
41
+ "samples": 50,
42
+ "p50": 0.539,
43
+ "p95": 1.82,
44
+ "p99": 2.712,
45
+ "min": 0.448,
46
+ "max": 2.712,
47
+ "mean": 0.659
48
+ },
49
+ "postEncodeQueueMs": {
50
+ "samples": 100,
51
+ "p50": 0.344,
52
+ "p95": 0.73,
53
+ "p99": 9.715,
54
+ "min": 0.079,
55
+ "max": 13.218,
56
+ "mean": 0.533
57
+ },
58
+ "queueEvents": 100
59
+ },
60
+ {
61
+ "corpusSize": 1000,
62
+ "encodeMs": {
63
+ "samples": 1000,
64
+ "p50": 0.307,
65
+ "p95": 2.147,
66
+ "p99": 9.672,
67
+ "min": 0.183,
68
+ "max": 13.514,
69
+ "mean": 0.639
70
+ },
71
+ "hybridRecallMs": {
72
+ "samples": 50,
73
+ "p50": 1.566,
74
+ "p95": 2.364,
75
+ "p99": 21.177,
76
+ "min": 0.608,
77
+ "max": 21.177,
78
+ "mean": 1.803
79
+ },
80
+ "postEncodeQueueMs": {
81
+ "samples": 1000,
82
+ "p50": 0.321,
83
+ "p95": 1.468,
84
+ "p99": 9.744,
85
+ "min": 0.07,
86
+ "max": 15.61,
87
+ "mean": 0.553
88
+ },
89
+ "queueEvents": 1000
90
+ },
91
+ {
92
+ "corpusSize": 5000,
93
+ "encodeMs": {
94
+ "samples": 5000,
95
+ "p50": 0.308,
96
+ "p95": 1.838,
97
+ "p99": 10.45,
98
+ "min": 0.171,
99
+ "max": 19.247,
100
+ "mean": 0.653
101
+ },
102
+ "hybridRecallMs": {
103
+ "samples": 50,
104
+ "p50": 2.091,
105
+ "p95": 3.417,
106
+ "p99": 16.58,
107
+ "min": 2.005,
108
+ "max": 16.58,
109
+ "mean": 2.492
110
+ },
111
+ "postEncodeQueueMs": {
112
+ "samples": 5000,
113
+ "p50": 0.313,
114
+ "p95": 1.217,
115
+ "p99": 10.532,
116
+ "min": 0.074,
117
+ "max": 19.378,
118
+ "mean": 0.554
119
+ },
120
+ "queueEvents": 5000
121
+ }
122
+ ]
123
+ }
published-locomo.svg ADDED
retrieval-overall.svg ADDED
summary.json ADDED
@@ -0,0 +1,1796 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "generatedAt": "2026-05-01T03:20:07.968Z",
3
+ "command": "node benchmarks/run.js --provider mock --dimensions 64",
4
+ "config": {
5
+ "provider": "mock",
6
+ "dimensions": 64,
7
+ "suites": [
8
+ "retrieval",
9
+ "operations"
10
+ ]
11
+ },
12
+ "methodology": {
13
+ "localBenchmark": "LongMemEval-inspired retrieval benchmark plus operation-level lifecycle benchmark",
14
+ "retrievalBenchmark": "Information extraction, updates, reasoning, procedural learning, privacy, abstention, and conflict handling",
15
+ "operationsBenchmark": "Update, overwrite, delete, merge, and abstention behavior after lifecycle operations",
16
+ "externalLeaderboard": "Published LoCoMo scores from official papers and project blogs"
17
+ },
18
+ "local": {
19
+ "overall": [
20
+ {
21
+ "system": "Audrey",
22
+ "scorePercent": 100,
23
+ "passRate": 100,
24
+ "avgDurationMs": 13.333333333333334
25
+ },
26
+ {
27
+ "system": "Vector Only",
28
+ "scorePercent": 41.66666666666667,
29
+ "passRate": 25,
30
+ "avgDurationMs": 0
31
+ },
32
+ {
33
+ "system": "Keyword + Recency",
34
+ "scorePercent": 41.66666666666667,
35
+ "passRate": 25,
36
+ "avgDurationMs": 0.5833333333333334
37
+ },
38
+ {
39
+ "system": "Recent Window",
40
+ "scorePercent": 37.5,
41
+ "passRate": 25,
42
+ "avgDurationMs": 0
43
+ }
44
+ ],
45
+ "byFamily": [
46
+ {
47
+ "family": "information_extraction",
48
+ "systems": {
49
+ "Audrey": 1,
50
+ "Vector Only": 1,
51
+ "Keyword + Recency": 1,
52
+ "Recent Window": 1
53
+ }
54
+ },
55
+ {
56
+ "family": "knowledge_updates",
57
+ "systems": {
58
+ "Audrey": 1,
59
+ "Vector Only": 0.5,
60
+ "Keyword + Recency": 0.5,
61
+ "Recent Window": 0.5
62
+ }
63
+ },
64
+ {
65
+ "family": "multi_session_reasoning",
66
+ "systems": {
67
+ "Audrey": 1,
68
+ "Vector Only": 1,
69
+ "Keyword + Recency": 1,
70
+ "Recent Window": 1
71
+ }
72
+ },
73
+ {
74
+ "family": "temporal_reasoning",
75
+ "systems": {
76
+ "Audrey": 1,
77
+ "Vector Only": 1,
78
+ "Keyword + Recency": 1,
79
+ "Recent Window": 1
80
+ }
81
+ },
82
+ {
83
+ "family": "abstention",
84
+ "systems": {
85
+ "Audrey": 1,
86
+ "Vector Only": 0.5,
87
+ "Keyword + Recency": 0.5,
88
+ "Recent Window": 0.5
89
+ }
90
+ },
91
+ {
92
+ "family": "conflict_resolution",
93
+ "systems": {
94
+ "Audrey": 1,
95
+ "Vector Only": 0.5,
96
+ "Keyword + Recency": 0,
97
+ "Recent Window": 0
98
+ }
99
+ },
100
+ {
101
+ "family": "procedural_learning",
102
+ "systems": {
103
+ "Audrey": 1,
104
+ "Vector Only": 0,
105
+ "Keyword + Recency": 0,
106
+ "Recent Window": 0
107
+ }
108
+ },
109
+ {
110
+ "family": "privacy_boundary",
111
+ "systems": {
112
+ "Audrey": 1,
113
+ "Vector Only": 0,
114
+ "Keyword + Recency": 0,
115
+ "Recent Window": 0
116
+ }
117
+ },
118
+ {
119
+ "family": "update_overwrite",
120
+ "systems": {
121
+ "Audrey": 1,
122
+ "Vector Only": 0.5,
123
+ "Keyword + Recency": 0.5,
124
+ "Recent Window": 0.5
125
+ }
126
+ },
127
+ {
128
+ "family": "delete_and_abstain",
129
+ "systems": {
130
+ "Audrey": 1,
131
+ "Vector Only": 0,
132
+ "Keyword + Recency": 0.5,
133
+ "Recent Window": 0
134
+ }
135
+ },
136
+ {
137
+ "family": "semantic_merge",
138
+ "systems": {
139
+ "Audrey": 1,
140
+ "Vector Only": 0,
141
+ "Keyword + Recency": 0,
142
+ "Recent Window": 0
143
+ }
144
+ },
145
+ {
146
+ "family": "procedural_merge",
147
+ "systems": {
148
+ "Audrey": 1,
149
+ "Vector Only": 0,
150
+ "Keyword + Recency": 0,
151
+ "Recent Window": 0
152
+ }
153
+ }
154
+ ],
155
+ "suites": [
156
+ {
157
+ "id": "retrieval",
158
+ "title": "Retrieval capabilities",
159
+ "description": "LongMemEval-style memory abilities plus privacy and abstention.",
160
+ "overall": [
161
+ {
162
+ "system": "Audrey",
163
+ "scorePercent": 100,
164
+ "passRate": 100,
165
+ "avgDurationMs": 13.625
166
+ },
167
+ {
168
+ "system": "Vector Only",
169
+ "scorePercent": 56.25,
170
+ "passRate": 37.5,
171
+ "avgDurationMs": 0
172
+ },
173
+ {
174
+ "system": "Keyword + Recency",
175
+ "scorePercent": 50,
176
+ "passRate": 37.5,
177
+ "avgDurationMs": 0.75
178
+ },
179
+ {
180
+ "system": "Recent Window",
181
+ "scorePercent": 50,
182
+ "passRate": 37.5,
183
+ "avgDurationMs": 0
184
+ }
185
+ ],
186
+ "byFamily": [
187
+ {
188
+ "family": "information_extraction",
189
+ "systems": {
190
+ "Audrey": 1,
191
+ "Vector Only": 1,
192
+ "Keyword + Recency": 1,
193
+ "Recent Window": 1
194
+ }
195
+ },
196
+ {
197
+ "family": "knowledge_updates",
198
+ "systems": {
199
+ "Audrey": 1,
200
+ "Vector Only": 0.5,
201
+ "Keyword + Recency": 0.5,
202
+ "Recent Window": 0.5
203
+ }
204
+ },
205
+ {
206
+ "family": "multi_session_reasoning",
207
+ "systems": {
208
+ "Audrey": 1,
209
+ "Vector Only": 1,
210
+ "Keyword + Recency": 1,
211
+ "Recent Window": 1
212
+ }
213
+ },
214
+ {
215
+ "family": "temporal_reasoning",
216
+ "systems": {
217
+ "Audrey": 1,
218
+ "Vector Only": 1,
219
+ "Keyword + Recency": 1,
220
+ "Recent Window": 1
221
+ }
222
+ },
223
+ {
224
+ "family": "abstention",
225
+ "systems": {
226
+ "Audrey": 1,
227
+ "Vector Only": 0.5,
228
+ "Keyword + Recency": 0.5,
229
+ "Recent Window": 0.5
230
+ }
231
+ },
232
+ {
233
+ "family": "conflict_resolution",
234
+ "systems": {
235
+ "Audrey": 1,
236
+ "Vector Only": 0.5,
237
+ "Keyword + Recency": 0,
238
+ "Recent Window": 0
239
+ }
240
+ },
241
+ {
242
+ "family": "procedural_learning",
243
+ "systems": {
244
+ "Audrey": 1,
245
+ "Vector Only": 0,
246
+ "Keyword + Recency": 0,
247
+ "Recent Window": 0
248
+ }
249
+ },
250
+ {
251
+ "family": "privacy_boundary",
252
+ "systems": {
253
+ "Audrey": 1,
254
+ "Vector Only": 0,
255
+ "Keyword + Recency": 0,
256
+ "Recent Window": 0
257
+ }
258
+ }
259
+ ],
260
+ "cases": [
261
+ {
262
+ "id": "information-extraction",
263
+ "suite": "retrieval",
264
+ "title": "Information extraction",
265
+ "family": "information_extraction",
266
+ "description": "Recover a directly stated user fact from durable memory.",
267
+ "query": "Where does Sam live now?",
268
+ "results": [
269
+ {
270
+ "system": "Audrey",
271
+ "durationMs": 21,
272
+ "passed": true,
273
+ "score": 1,
274
+ "summary": "retrieved expected evidence",
275
+ "topResults": [
276
+ "Sam moved to Austin in March 2026 after leaving Denver.",
277
+ "Sam likes to work from coffee shops on South Congress."
278
+ ],
279
+ "retrievalSummary": "Sam moved to Austin in March 2026 after leaving Denver. | Sam likes to work from coffee shops on South Congress."
280
+ },
281
+ {
282
+ "system": "Vector Only",
283
+ "durationMs": 0,
284
+ "passed": true,
285
+ "score": 1,
286
+ "summary": "retrieved expected evidence",
287
+ "topResults": [
288
+ "Sam moved to Austin in March 2026 after leaving Denver.",
289
+ "Sam likes to work from coffee shops on South Congress."
290
+ ],
291
+ "retrievalSummary": "Sam moved to Austin in March 2026 after leaving Denver. | Sam likes to work from coffee shops on South Congress."
292
+ },
293
+ {
294
+ "system": "Keyword + Recency",
295
+ "durationMs": 5,
296
+ "passed": true,
297
+ "score": 1,
298
+ "summary": "retrieved expected evidence",
299
+ "topResults": [
300
+ "Sam likes to work from coffee shops on South Congress.",
301
+ "Sam moved to Austin in March 2026 after leaving Denver."
302
+ ],
303
+ "retrievalSummary": "Sam likes to work from coffee shops on South Congress. | Sam moved to Austin in March 2026 after leaving Denver."
304
+ },
305
+ {
306
+ "system": "Recent Window",
307
+ "durationMs": 0,
308
+ "passed": true,
309
+ "score": 1,
310
+ "summary": "retrieved expected evidence",
311
+ "topResults": [
312
+ "Sam likes to work from coffee shops on South Congress.",
313
+ "Sam moved to Austin in March 2026 after leaving Denver."
314
+ ],
315
+ "retrievalSummary": "Sam likes to work from coffee shops on South Congress. | Sam moved to Austin in March 2026 after leaving Denver."
316
+ }
317
+ ]
318
+ },
319
+ {
320
+ "id": "knowledge-update",
321
+ "suite": "retrieval",
322
+ "title": "Knowledge updates",
323
+ "family": "knowledge_updates",
324
+ "description": "Prefer the newer fact over stale preferences.",
325
+ "query": "What drink does Sam prefer now?",
326
+ "results": [
327
+ {
328
+ "system": "Audrey",
329
+ "durationMs": 13,
330
+ "passed": true,
331
+ "score": 1,
332
+ "summary": "retrieved expected evidence",
333
+ "topResults": [
334
+ "Sam switched from coffee to green tea after January 2026."
335
+ ],
336
+ "retrievalSummary": "Sam switched from coffee to green tea after January 2026."
337
+ },
338
+ {
339
+ "system": "Vector Only",
340
+ "durationMs": 0,
341
+ "passed": false,
342
+ "score": 0.5,
343
+ "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
344
+ "topResults": [
345
+ "Sam switched from coffee to green tea after January 2026.",
346
+ "Sam prefers coffee before early meetings."
347
+ ],
348
+ "retrievalSummary": "Sam switched from coffee to green tea after January 2026. | Sam prefers coffee before early meetings."
349
+ },
350
+ {
351
+ "system": "Keyword + Recency",
352
+ "durationMs": 0,
353
+ "passed": false,
354
+ "score": 0.5,
355
+ "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
356
+ "topResults": [
357
+ "Sam switched from coffee to green tea after January 2026.",
358
+ "Sam prefers coffee before early meetings."
359
+ ],
360
+ "retrievalSummary": "Sam switched from coffee to green tea after January 2026. | Sam prefers coffee before early meetings."
361
+ },
362
+ {
363
+ "system": "Recent Window",
364
+ "durationMs": 0,
365
+ "passed": false,
366
+ "score": 0.5,
367
+ "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
368
+ "topResults": [
369
+ "Sam switched from coffee to green tea after January 2026.",
370
+ "Sam prefers coffee before early meetings."
371
+ ],
372
+ "retrievalSummary": "Sam switched from coffee to green tea after January 2026. | Sam prefers coffee before early meetings."
373
+ }
374
+ ]
375
+ },
376
+ {
377
+ "id": "multi-session-reasoning",
378
+ "suite": "retrieval",
379
+ "title": "Multi-session reasoning",
380
+ "family": "multi_session_reasoning",
381
+ "description": "Synthesize a decision from multiple related episodes.",
382
+ "query": "Which vendor was approved after the pilot budget review?",
383
+ "results": [
384
+ {
385
+ "system": "Audrey",
386
+ "durationMs": 13,
387
+ "passed": true,
388
+ "score": 1,
389
+ "summary": "retrieved expected evidence",
390
+ "topResults": [
391
+ "During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam.",
392
+ "Finance rejected Fabricam because the support SLA was too weak.",
393
+ "The pilot budget review approved Northwind for rollout after the support SLA review."
394
+ ],
395
+ "retrievalSummary": "During the January pilot, Sam requested budget approval for vendors Nort | Finance rejected Fabricam because the support SLA was too weak."
396
+ },
397
+ {
398
+ "system": "Vector Only",
399
+ "durationMs": 0,
400
+ "passed": true,
401
+ "score": 1,
402
+ "summary": "retrieved expected evidence",
403
+ "topResults": [
404
+ "During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam.",
405
+ "Finance rejected Fabricam because the support SLA was too weak.",
406
+ "The pilot budget review approved Northwind for rollout after the support SLA review."
407
+ ],
408
+ "retrievalSummary": "During the January pilot, Sam requested budget approval for vendors Nort | Finance rejected Fabricam because the support SLA was too weak."
409
+ },
410
+ {
411
+ "system": "Keyword + Recency",
412
+ "durationMs": 0,
413
+ "passed": true,
414
+ "score": 1,
415
+ "summary": "retrieved expected evidence",
416
+ "topResults": [
417
+ "The pilot budget review approved Northwind for rollout after the support SLA review.",
418
+ "During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam.",
419
+ "Finance rejected Fabricam because the support SLA was too weak."
420
+ ],
421
+ "retrievalSummary": "The pilot budget review approved Northwind for rollout after the support | During the January pilot, Sam requested budget approval for vendors Nort"
422
+ },
423
+ {
424
+ "system": "Recent Window",
425
+ "durationMs": 0,
426
+ "passed": true,
427
+ "score": 1,
428
+ "summary": "retrieved expected evidence",
429
+ "topResults": [
430
+ "The pilot budget review approved Northwind for rollout after the support SLA review.",
431
+ "Finance rejected Fabricam because the support SLA was too weak.",
432
+ "During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam."
433
+ ],
434
+ "retrievalSummary": "The pilot budget review approved Northwind for rollout after the support | Finance rejected Fabricam because the support SLA was too weak."
435
+ }
436
+ ]
437
+ },
438
+ {
439
+ "id": "temporal-reasoning",
440
+ "suite": "retrieval",
441
+ "title": "Temporal reasoning",
442
+ "family": "temporal_reasoning",
443
+ "description": "Answer by isolating the right time window.",
444
+ "query": "What happened in February 2026?",
445
+ "results": [
446
+ {
447
+ "system": "Audrey",
448
+ "durationMs": 13,
449
+ "passed": true,
450
+ "score": 1,
451
+ "summary": "retrieved expected evidence",
452
+ "topResults": [
453
+ "In February 2026 Sam completed the architecture review."
454
+ ],
455
+ "retrievalSummary": "In February 2026 Sam completed the architecture review."
456
+ },
457
+ {
458
+ "system": "Vector Only",
459
+ "durationMs": 0,
460
+ "passed": true,
461
+ "score": 1,
462
+ "summary": "retrieved expected evidence",
463
+ "topResults": [
464
+ "In March 2026 Sam started the rollout checklist.",
465
+ "In February 2026 Sam completed the architecture review.",
466
+ "In January 2026 Sam kicked off the migration plan."
467
+ ],
468
+ "retrievalSummary": "In March 2026 Sam started the rollout checklist. | In February 2026 Sam completed the architecture review."
469
+ },
470
+ {
471
+ "system": "Keyword + Recency",
472
+ "durationMs": 0,
473
+ "passed": true,
474
+ "score": 1,
475
+ "summary": "retrieved expected evidence",
476
+ "topResults": [
477
+ "In February 2026 Sam completed the architecture review.",
478
+ "In March 2026 Sam started the rollout checklist.",
479
+ "In January 2026 Sam kicked off the migration plan."
480
+ ],
481
+ "retrievalSummary": "In February 2026 Sam completed the architecture review. | In March 2026 Sam started the rollout checklist."
482
+ },
483
+ {
484
+ "system": "Recent Window",
485
+ "durationMs": 0,
486
+ "passed": true,
487
+ "score": 1,
488
+ "summary": "retrieved expected evidence",
489
+ "topResults": [
490
+ "In March 2026 Sam started the rollout checklist.",
491
+ "In February 2026 Sam completed the architecture review.",
492
+ "In January 2026 Sam kicked off the migration plan."
493
+ ],
494
+ "retrievalSummary": "In March 2026 Sam started the rollout checklist. | In February 2026 Sam completed the architecture review."
495
+ }
496
+ ]
497
+ },
498
+ {
499
+ "id": "abstention",
500
+ "suite": "retrieval",
501
+ "title": "Abstention",
502
+ "family": "abstention",
503
+ "description": "Avoid pretending to know a specific identifier that was never stored.",
504
+ "query": "What is Sam passport number?",
505
+ "results": [
506
+ {
507
+ "system": "Audrey",
508
+ "durationMs": 12,
509
+ "passed": true,
510
+ "score": 1,
511
+ "summary": "correct abstention",
512
+ "topResults": [],
513
+ "retrievalSummary": "no retrieval"
514
+ },
515
+ {
516
+ "system": "Vector Only",
517
+ "durationMs": 0,
518
+ "passed": false,
519
+ "score": 0.5,
520
+ "summary": "no leak, but retrieved tangential context",
521
+ "topResults": [
522
+ "Sam renewed a passport in February 2026.",
523
+ "Sam has a trip to Toronto next month."
524
+ ],
525
+ "retrievalSummary": "Sam renewed a passport in February 2026. | Sam has a trip to Toronto next month."
526
+ },
527
+ {
528
+ "system": "Keyword + Recency",
529
+ "durationMs": 0,
530
+ "passed": false,
531
+ "score": 0.5,
532
+ "summary": "no leak, but retrieved tangential context",
533
+ "topResults": [
534
+ "Sam renewed a passport in February 2026.",
535
+ "Sam has a trip to Toronto next month."
536
+ ],
537
+ "retrievalSummary": "Sam renewed a passport in February 2026. | Sam has a trip to Toronto next month."
538
+ },
539
+ {
540
+ "system": "Recent Window",
541
+ "durationMs": 0,
542
+ "passed": false,
543
+ "score": 0.5,
544
+ "summary": "no leak, but retrieved tangential context",
545
+ "topResults": [
546
+ "Sam has a trip to Toronto next month.",
547
+ "Sam renewed a passport in February 2026."
548
+ ],
549
+ "retrievalSummary": "Sam has a trip to Toronto next month. | Sam renewed a passport in February 2026."
550
+ }
551
+ ]
552
+ },
553
+ {
554
+ "id": "conflict-resolution",
555
+ "suite": "retrieval",
556
+ "title": "Conflict resolution",
557
+ "family": "conflict_resolution",
558
+ "description": "Prefer high-reliability evidence over model-generated noise.",
559
+ "query": "What caused the outage?",
560
+ "results": [
561
+ {
562
+ "system": "Audrey",
563
+ "durationMs": 11,
564
+ "passed": true,
565
+ "score": 1,
566
+ "summary": "retrieved expected evidence",
567
+ "topResults": [
568
+ "The outage was caused by an expired TLS certificate on api.example.com."
569
+ ],
570
+ "retrievalSummary": "The outage was caused by an expired TLS certificate on api.example.com."
571
+ },
572
+ {
573
+ "system": "Vector Only",
574
+ "durationMs": 0,
575
+ "passed": false,
576
+ "score": 0.5,
577
+ "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
578
+ "topResults": [
579
+ "The outage was caused by an expired TLS certificate on api.example.com.",
580
+ "The outage was caused by database corruption."
581
+ ],
582
+ "retrievalSummary": "The outage was caused by an expired TLS certificate on api.example.com. | The outage was caused by database corruption."
583
+ },
584
+ {
585
+ "system": "Keyword + Recency",
586
+ "durationMs": 1,
587
+ "passed": false,
588
+ "score": 0,
589
+ "summary": "blocked content outranked the correct answer",
590
+ "topResults": [
591
+ "The outage was caused by database corruption.",
592
+ "The outage was caused by an expired TLS certificate on api.example.com."
593
+ ],
594
+ "retrievalSummary": "The outage was caused by database corruption. | The outage was caused by an expired TLS certificate on api.example.com."
595
+ },
596
+ {
597
+ "system": "Recent Window",
598
+ "durationMs": 0,
599
+ "passed": false,
600
+ "score": 0,
601
+ "summary": "blocked content outranked the correct answer",
602
+ "topResults": [
603
+ "The outage was caused by database corruption.",
604
+ "The outage was caused by an expired TLS certificate on api.example.com."
605
+ ],
606
+ "retrievalSummary": "The outage was caused by database corruption. | The outage was caused by an expired TLS certificate on api.example.com."
607
+ }
608
+ ]
609
+ },
610
+ {
611
+ "id": "procedural-learning",
612
+ "suite": "retrieval",
613
+ "title": "Procedural learning",
614
+ "family": "procedural_learning",
615
+ "description": "Turn repeated incidents into an actionable operating rule.",
616
+ "query": "What should the agent do when payout retries start returning 429?",
617
+ "results": [
618
+ {
619
+ "system": "Audrey",
620
+ "durationMs": 15,
621
+ "passed": true,
622
+ "score": 1,
623
+ "summary": "retrieved expected evidence",
624
+ "topResults": [
625
+ "When payout retries start returning 429, cap retry batches and stagger retries before escalating."
626
+ ],
627
+ "retrievalSummary": "When payout retries start returning 429, cap retry batches and stagger r"
628
+ },
629
+ {
630
+ "system": "Vector Only",
631
+ "durationMs": 0,
632
+ "passed": false,
633
+ "score": 0,
634
+ "summary": "missed target evidence",
635
+ "topResults": [
636
+ "Risk operations requested an escalation when multiple merchants were affected in the same hour.",
637
+ "Payout incident volume dropped after retry batches were capped at 50 merchants per worker.",
638
+ "Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute."
639
+ ],
640
+ "retrievalSummary": "Risk operations requested an escalation when multiple merchants were aff | Payout incident volume dropped after retry batches were capped at 50 mer"
641
+ },
642
+ {
643
+ "system": "Keyword + Recency",
644
+ "durationMs": 0,
645
+ "passed": false,
646
+ "score": 0,
647
+ "summary": "missed target evidence",
648
+ "topResults": [
649
+ "Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute.",
650
+ "Risk operations requested an escalation when multiple merchants were affected in the same hour.",
651
+ "Payout incident volume dropped after retry batches were capped at 50 merchants per worker."
652
+ ],
653
+ "retrievalSummary": "Processor X returned HTTP 429 when payout retries exceeded 120 requests | Risk operations requested an escalation when multiple merchants were aff"
654
+ },
655
+ {
656
+ "system": "Recent Window",
657
+ "durationMs": 0,
658
+ "passed": false,
659
+ "score": 0,
660
+ "summary": "missed target evidence",
661
+ "topResults": [
662
+ "Risk operations requested an escalation when multiple merchants were affected in the same hour.",
663
+ "Payout incident volume dropped after retry batches were capped at 50 merchants per worker.",
664
+ "Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute."
665
+ ],
666
+ "retrievalSummary": "Risk operations requested an escalation when multiple merchants were aff | Payout incident volume dropped after retry batches were capped at 50 mer"
667
+ }
668
+ ]
669
+ },
670
+ {
671
+ "id": "privacy-boundary",
672
+ "suite": "retrieval",
673
+ "title": "Privacy boundary",
674
+ "family": "privacy_boundary",
675
+ "description": "Never leak private memory into public recall.",
676
+ "query": "What is Sam API key?",
677
+ "results": [
678
+ {
679
+ "system": "Audrey",
680
+ "durationMs": 11,
681
+ "passed": true,
682
+ "score": 1,
683
+ "summary": "correct abstention",
684
+ "topResults": [],
685
+ "retrievalSummary": "no retrieval"
686
+ },
687
+ {
688
+ "system": "Vector Only",
689
+ "durationMs": 0,
690
+ "passed": false,
691
+ "score": 0,
692
+ "summary": "leaked restricted content",
693
+ "topResults": [
694
+ "Sam API key is sk-secret-demo-key.",
695
+ "Sam prefers dark mode in internal dashboards."
696
+ ],
697
+ "retrievalSummary": "Sam API key is sk-secret-demo-key. | Sam prefers dark mode in internal dashboards."
698
+ },
699
+ {
700
+ "system": "Keyword + Recency",
701
+ "durationMs": 0,
702
+ "passed": false,
703
+ "score": 0,
704
+ "summary": "leaked restricted content",
705
+ "topResults": [
706
+ "Sam API key is sk-secret-demo-key.",
707
+ "Sam prefers dark mode in internal dashboards."
708
+ ],
709
+ "retrievalSummary": "Sam API key is sk-secret-demo-key. | Sam prefers dark mode in internal dashboards."
710
+ },
711
+ {
712
+ "system": "Recent Window",
713
+ "durationMs": 0,
714
+ "passed": false,
715
+ "score": 0,
716
+ "summary": "leaked restricted content",
717
+ "topResults": [
718
+ "Sam prefers dark mode in internal dashboards.",
719
+ "Sam API key is sk-secret-demo-key."
720
+ ],
721
+ "retrievalSummary": "Sam prefers dark mode in internal dashboards. | Sam API key is sk-secret-demo-key."
722
+ }
723
+ ]
724
+ }
725
+ ]
726
+ },
727
+ {
728
+ "id": "operations",
729
+ "title": "Memory operations",
730
+ "description": "Update, delete, merge, and abstention behavior after lifecycle operations.",
731
+ "overall": [
732
+ {
733
+ "system": "Audrey",
734
+ "scorePercent": 100,
735
+ "passRate": 100,
736
+ "avgDurationMs": 12.75
737
+ },
738
+ {
739
+ "system": "Keyword + Recency",
740
+ "scorePercent": 25,
741
+ "passRate": 0,
742
+ "avgDurationMs": 0.25
743
+ },
744
+ {
745
+ "system": "Vector Only",
746
+ "scorePercent": 12.5,
747
+ "passRate": 0,
748
+ "avgDurationMs": 0
749
+ },
750
+ {
751
+ "system": "Recent Window",
752
+ "scorePercent": 12.5,
753
+ "passRate": 0,
754
+ "avgDurationMs": 0
755
+ }
756
+ ],
757
+ "byFamily": [
758
+ {
759
+ "family": "update_overwrite",
760
+ "systems": {
761
+ "Audrey": 1,
762
+ "Vector Only": 0.5,
763
+ "Keyword + Recency": 0.5,
764
+ "Recent Window": 0.5
765
+ }
766
+ },
767
+ {
768
+ "family": "delete_and_abstain",
769
+ "systems": {
770
+ "Audrey": 1,
771
+ "Vector Only": 0,
772
+ "Keyword + Recency": 0.5,
773
+ "Recent Window": 0
774
+ }
775
+ },
776
+ {
777
+ "family": "semantic_merge",
778
+ "systems": {
779
+ "Audrey": 1,
780
+ "Vector Only": 0,
781
+ "Keyword + Recency": 0,
782
+ "Recent Window": 0
783
+ }
784
+ },
785
+ {
786
+ "family": "procedural_merge",
787
+ "systems": {
788
+ "Audrey": 1,
789
+ "Vector Only": 0,
790
+ "Keyword + Recency": 0,
791
+ "Recent Window": 0
792
+ }
793
+ }
794
+ ],
795
+ "cases": [
796
+ {
797
+ "id": "operation-update-overwrite",
798
+ "suite": "operations",
799
+ "title": "Update and overwrite",
800
+ "family": "update_overwrite",
801
+ "description": "Current-state recall should prefer the new fact after an explicit overwrite.",
802
+ "query": "What is the primary deployment region now?",
803
+ "results": [
804
+ {
805
+ "system": "Audrey",
806
+ "durationMs": 13,
807
+ "passed": true,
808
+ "score": 1,
809
+ "summary": "retrieved expected evidence",
810
+ "topResults": [
811
+ "As of March 2026, the primary deployment region is eu-west-1."
812
+ ],
813
+ "retrievalSummary": "As of March 2026, the primary deployment region is eu-west-1."
814
+ },
815
+ {
816
+ "system": "Vector Only",
817
+ "durationMs": 0,
818
+ "passed": false,
819
+ "score": 0.5,
820
+ "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
821
+ "topResults": [
822
+ "As of March 2026, the primary deployment region is eu-west-1.",
823
+ "The primary deployment region is us-east-1."
824
+ ],
825
+ "retrievalSummary": "As of March 2026, the primary deployment region is eu-west-1. | The primary deployment region is us-east-1."
826
+ },
827
+ {
828
+ "system": "Keyword + Recency",
829
+ "durationMs": 0,
830
+ "passed": false,
831
+ "score": 0.5,
832
+ "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
833
+ "topResults": [
834
+ "As of March 2026, the primary deployment region is eu-west-1.",
835
+ "The primary deployment region is us-east-1."
836
+ ],
837
+ "retrievalSummary": "As of March 2026, the primary deployment region is eu-west-1. | The primary deployment region is us-east-1."
838
+ },
839
+ {
840
+ "system": "Recent Window",
841
+ "durationMs": 0,
842
+ "passed": false,
843
+ "score": 0.5,
844
+ "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
845
+ "topResults": [
846
+ "As of March 2026, the primary deployment region is eu-west-1.",
847
+ "The primary deployment region is us-east-1."
848
+ ],
849
+ "retrievalSummary": "As of March 2026, the primary deployment region is eu-west-1. | The primary deployment region is us-east-1."
850
+ }
851
+ ]
852
+ },
853
+ {
854
+ "id": "operation-delete-and-abstain",
855
+ "suite": "operations",
856
+ "title": "Delete and abstain",
857
+ "family": "delete_and_abstain",
858
+ "description": "Explicit deletion should remove a secret from later recall.",
859
+ "query": "What is the staging API token?",
860
+ "results": [
861
+ {
862
+ "system": "Audrey",
863
+ "durationMs": 11,
864
+ "passed": true,
865
+ "score": 1,
866
+ "summary": "correct abstention",
867
+ "topResults": [],
868
+ "retrievalSummary": "no retrieval"
869
+ },
870
+ {
871
+ "system": "Vector Only",
872
+ "durationMs": 0,
873
+ "passed": false,
874
+ "score": 0,
875
+ "summary": "leaked restricted content",
876
+ "topResults": [
877
+ "The staging API token is tok-demo-staging-1234."
878
+ ],
879
+ "retrievalSummary": "The staging API token is tok-demo-staging-1234."
880
+ },
881
+ {
882
+ "system": "Keyword + Recency",
883
+ "durationMs": 1,
884
+ "passed": false,
885
+ "score": 0.5,
886
+ "summary": "no leak, but retrieved tangential context",
887
+ "topResults": [
888
+ "The staging environment rotates API credentials weekly."
889
+ ],
890
+ "retrievalSummary": "The staging environment rotates API credentials weekly."
891
+ },
892
+ {
893
+ "system": "Recent Window",
894
+ "durationMs": 0,
895
+ "passed": false,
896
+ "score": 0,
897
+ "summary": "leaked restricted content",
898
+ "topResults": [
899
+ "The staging API token is tok-demo-staging-1234."
900
+ ],
901
+ "retrievalSummary": "The staging API token is tok-demo-staging-1234."
902
+ }
903
+ ]
904
+ },
905
+ {
906
+ "id": "operation-semantic-merge",
907
+ "suite": "operations",
908
+ "title": "Semantic merge",
909
+ "family": "semantic_merge",
910
+ "description": "Related episodes should merge into a reusable semantic operating rule.",
911
+ "query": "When should the disputes queue trigger manual review?",
912
+ "results": [
913
+ {
914
+ "system": "Audrey",
915
+ "durationMs": 14,
916
+ "passed": true,
917
+ "score": 1,
918
+ "summary": "retrieved expected evidence",
919
+ "topResults": [
920
+ "Repeated disputes from the same BIN in one hour should trigger manual review."
921
+ ],
922
+ "retrievalSummary": "Repeated disputes from the same BIN in one hour should trigger manual re"
923
+ },
924
+ {
925
+ "system": "Vector Only",
926
+ "durationMs": 0,
927
+ "passed": false,
928
+ "score": 0,
929
+ "summary": "missed target evidence",
930
+ "topResults": [
931
+ "Three charge disputes from the same BIN landed in the queue within one hour.",
932
+ "The queue stabilized after repeated same-BIN disputes were reviewed manually.",
933
+ "Fraud ops escalated repeated same-BIN disputes for analyst attention."
934
+ ],
935
+ "retrievalSummary": "Three charge disputes from the same BIN landed in the queue within one h | The queue stabilized after repeated same-BIN disputes were reviewed manu"
936
+ },
937
+ {
938
+ "system": "Keyword + Recency",
939
+ "durationMs": 0,
940
+ "passed": false,
941
+ "score": 0,
942
+ "summary": "missed target evidence",
943
+ "topResults": [
944
+ "The queue stabilized after repeated same-BIN disputes were reviewed manually.",
945
+ "Three charge disputes from the same BIN landed in the queue within one hour.",
946
+ "Fraud ops escalated repeated same-BIN disputes for analyst attention."
947
+ ],
948
+ "retrievalSummary": "The queue stabilized after repeated same-BIN disputes were reviewed manu | Three charge disputes from the same BIN landed in the queue within one h"
949
+ },
950
+ {
951
+ "system": "Recent Window",
952
+ "durationMs": 0,
953
+ "passed": false,
954
+ "score": 0,
955
+ "summary": "missed target evidence",
956
+ "topResults": [
957
+ "The queue stabilized after repeated same-BIN disputes were reviewed manually.",
958
+ "Fraud ops escalated repeated same-BIN disputes for analyst attention.",
959
+ "Three charge disputes from the same BIN landed in the queue within one hour."
960
+ ],
961
+ "retrievalSummary": "The queue stabilized after repeated same-BIN disputes were reviewed manu | Fraud ops escalated repeated same-BIN disputes for analyst attention."
962
+ }
963
+ ]
964
+ },
965
+ {
966
+ "id": "operation-procedural-merge",
967
+ "suite": "operations",
968
+ "title": "Procedural merge",
969
+ "family": "procedural_merge",
970
+ "description": "Related episodes should merge into an executable procedure, not just a loose fact.",
971
+ "query": "What should the agent do after two webhook signature failures?",
972
+ "results": [
973
+ {
974
+ "system": "Audrey",
975
+ "durationMs": 13,
976
+ "passed": true,
977
+ "score": 1,
978
+ "summary": "retrieved expected evidence",
979
+ "topResults": [
980
+ "When webhook signature verification fails twice, rotate the signing secret and replay queued events."
981
+ ],
982
+ "retrievalSummary": "When webhook signature verification fails twice, rotate the signing secr"
983
+ },
984
+ {
985
+ "system": "Vector Only",
986
+ "durationMs": 0,
987
+ "passed": false,
988
+ "score": 0,
989
+ "summary": "missed target evidence",
990
+ "topResults": [
991
+ "Webhook signature verification failed twice for merchant ACME.",
992
+ "Operations recovered the incident by rotating the signing secret.",
993
+ "Queued webhook events were replayed after the signing secret changed."
994
+ ],
995
+ "retrievalSummary": "Webhook signature verification failed twice for merchant ACME. | Operations recovered the incident by rotating the signing secret."
996
+ },
997
+ {
998
+ "system": "Keyword + Recency",
999
+ "durationMs": 0,
1000
+ "passed": false,
1001
+ "score": 0,
1002
+ "summary": "missed target evidence",
1003
+ "topResults": [
1004
+ "Queued webhook events were replayed after the signing secret changed.",
1005
+ "Webhook signature verification failed twice for merchant ACME.",
1006
+ "Operations recovered the incident by rotating the signing secret."
1007
+ ],
1008
+ "retrievalSummary": "Queued webhook events were replayed after the signing secret changed. | Webhook signature verification failed twice for merchant ACME."
1009
+ },
1010
+ {
1011
+ "system": "Recent Window",
1012
+ "durationMs": 0,
1013
+ "passed": false,
1014
+ "score": 0,
1015
+ "summary": "missed target evidence",
1016
+ "topResults": [
1017
+ "Queued webhook events were replayed after the signing secret changed.",
1018
+ "Operations recovered the incident by rotating the signing secret.",
1019
+ "Webhook signature verification failed twice for merchant ACME."
1020
+ ],
1021
+ "retrievalSummary": "Queued webhook events were replayed after the signing secret changed. | Operations recovered the incident by rotating the signing secret."
1022
+ }
1023
+ ]
1024
+ }
1025
+ ]
1026
+ }
1027
+ ],
1028
+ "cases": [
1029
+ {
1030
+ "id": "information-extraction",
1031
+ "suite": "retrieval",
1032
+ "title": "Information extraction",
1033
+ "family": "information_extraction",
1034
+ "description": "Recover a directly stated user fact from durable memory.",
1035
+ "query": "Where does Sam live now?",
1036
+ "results": [
1037
+ {
1038
+ "system": "Audrey",
1039
+ "durationMs": 21,
1040
+ "passed": true,
1041
+ "score": 1,
1042
+ "summary": "retrieved expected evidence",
1043
+ "topResults": [
1044
+ "Sam moved to Austin in March 2026 after leaving Denver.",
1045
+ "Sam likes to work from coffee shops on South Congress."
1046
+ ],
1047
+ "retrievalSummary": "Sam moved to Austin in March 2026 after leaving Denver. | Sam likes to work from coffee shops on South Congress."
1048
+ },
1049
+ {
1050
+ "system": "Vector Only",
1051
+ "durationMs": 0,
1052
+ "passed": true,
1053
+ "score": 1,
1054
+ "summary": "retrieved expected evidence",
1055
+ "topResults": [
1056
+ "Sam moved to Austin in March 2026 after leaving Denver.",
1057
+ "Sam likes to work from coffee shops on South Congress."
1058
+ ],
1059
+ "retrievalSummary": "Sam moved to Austin in March 2026 after leaving Denver. | Sam likes to work from coffee shops on South Congress."
1060
+ },
1061
+ {
1062
+ "system": "Keyword + Recency",
1063
+ "durationMs": 5,
1064
+ "passed": true,
1065
+ "score": 1,
1066
+ "summary": "retrieved expected evidence",
1067
+ "topResults": [
1068
+ "Sam likes to work from coffee shops on South Congress.",
1069
+ "Sam moved to Austin in March 2026 after leaving Denver."
1070
+ ],
1071
+ "retrievalSummary": "Sam likes to work from coffee shops on South Congress. | Sam moved to Austin in March 2026 after leaving Denver."
1072
+ },
1073
+ {
1074
+ "system": "Recent Window",
1075
+ "durationMs": 0,
1076
+ "passed": true,
1077
+ "score": 1,
1078
+ "summary": "retrieved expected evidence",
1079
+ "topResults": [
1080
+ "Sam likes to work from coffee shops on South Congress.",
1081
+ "Sam moved to Austin in March 2026 after leaving Denver."
1082
+ ],
1083
+ "retrievalSummary": "Sam likes to work from coffee shops on South Congress. | Sam moved to Austin in March 2026 after leaving Denver."
1084
+ }
1085
+ ]
1086
+ },
1087
+ {
1088
+ "id": "knowledge-update",
1089
+ "suite": "retrieval",
1090
+ "title": "Knowledge updates",
1091
+ "family": "knowledge_updates",
1092
+ "description": "Prefer the newer fact over stale preferences.",
1093
+ "query": "What drink does Sam prefer now?",
1094
+ "results": [
1095
+ {
1096
+ "system": "Audrey",
1097
+ "durationMs": 13,
1098
+ "passed": true,
1099
+ "score": 1,
1100
+ "summary": "retrieved expected evidence",
1101
+ "topResults": [
1102
+ "Sam switched from coffee to green tea after January 2026."
1103
+ ],
1104
+ "retrievalSummary": "Sam switched from coffee to green tea after January 2026."
1105
+ },
1106
+ {
1107
+ "system": "Vector Only",
1108
+ "durationMs": 0,
1109
+ "passed": false,
1110
+ "score": 0.5,
1111
+ "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
1112
+ "topResults": [
1113
+ "Sam switched from coffee to green tea after January 2026.",
1114
+ "Sam prefers coffee before early meetings."
1115
+ ],
1116
+ "retrievalSummary": "Sam switched from coffee to green tea after January 2026. | Sam prefers coffee before early meetings."
1117
+ },
1118
+ {
1119
+ "system": "Keyword + Recency",
1120
+ "durationMs": 0,
1121
+ "passed": false,
1122
+ "score": 0.5,
1123
+ "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
1124
+ "topResults": [
1125
+ "Sam switched from coffee to green tea after January 2026.",
1126
+ "Sam prefers coffee before early meetings."
1127
+ ],
1128
+ "retrievalSummary": "Sam switched from coffee to green tea after January 2026. | Sam prefers coffee before early meetings."
1129
+ },
1130
+ {
1131
+ "system": "Recent Window",
1132
+ "durationMs": 0,
1133
+ "passed": false,
1134
+ "score": 0.5,
1135
+ "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
1136
+ "topResults": [
1137
+ "Sam switched from coffee to green tea after January 2026.",
1138
+ "Sam prefers coffee before early meetings."
1139
+ ],
1140
+ "retrievalSummary": "Sam switched from coffee to green tea after January 2026. | Sam prefers coffee before early meetings."
1141
+ }
1142
+ ]
1143
+ },
1144
+ {
1145
+ "id": "multi-session-reasoning",
1146
+ "suite": "retrieval",
1147
+ "title": "Multi-session reasoning",
1148
+ "family": "multi_session_reasoning",
1149
+ "description": "Synthesize a decision from multiple related episodes.",
1150
+ "query": "Which vendor was approved after the pilot budget review?",
1151
+ "results": [
1152
+ {
1153
+ "system": "Audrey",
1154
+ "durationMs": 13,
1155
+ "passed": true,
1156
+ "score": 1,
1157
+ "summary": "retrieved expected evidence",
1158
+ "topResults": [
1159
+ "During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam.",
1160
+ "Finance rejected Fabricam because the support SLA was too weak.",
1161
+ "The pilot budget review approved Northwind for rollout after the support SLA review."
1162
+ ],
1163
+ "retrievalSummary": "During the January pilot, Sam requested budget approval for vendors Nort | Finance rejected Fabricam because the support SLA was too weak."
1164
+ },
1165
+ {
1166
+ "system": "Vector Only",
1167
+ "durationMs": 0,
1168
+ "passed": true,
1169
+ "score": 1,
1170
+ "summary": "retrieved expected evidence",
1171
+ "topResults": [
1172
+ "During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam.",
1173
+ "Finance rejected Fabricam because the support SLA was too weak.",
1174
+ "The pilot budget review approved Northwind for rollout after the support SLA review."
1175
+ ],
1176
+ "retrievalSummary": "During the January pilot, Sam requested budget approval for vendors Nort | Finance rejected Fabricam because the support SLA was too weak."
1177
+ },
1178
+ {
1179
+ "system": "Keyword + Recency",
1180
+ "durationMs": 0,
1181
+ "passed": true,
1182
+ "score": 1,
1183
+ "summary": "retrieved expected evidence",
1184
+ "topResults": [
1185
+ "The pilot budget review approved Northwind for rollout after the support SLA review.",
1186
+ "During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam.",
1187
+ "Finance rejected Fabricam because the support SLA was too weak."
1188
+ ],
1189
+ "retrievalSummary": "The pilot budget review approved Northwind for rollout after the support | During the January pilot, Sam requested budget approval for vendors Nort"
1190
+ },
1191
+ {
1192
+ "system": "Recent Window",
1193
+ "durationMs": 0,
1194
+ "passed": true,
1195
+ "score": 1,
1196
+ "summary": "retrieved expected evidence",
1197
+ "topResults": [
1198
+ "The pilot budget review approved Northwind for rollout after the support SLA review.",
1199
+ "Finance rejected Fabricam because the support SLA was too weak.",
1200
+ "During the January pilot, Sam requested budget approval for vendors Northwind and Fabricam."
1201
+ ],
1202
+ "retrievalSummary": "The pilot budget review approved Northwind for rollout after the support | Finance rejected Fabricam because the support SLA was too weak."
1203
+ }
1204
+ ]
1205
+ },
1206
+ {
1207
+ "id": "temporal-reasoning",
1208
+ "suite": "retrieval",
1209
+ "title": "Temporal reasoning",
1210
+ "family": "temporal_reasoning",
1211
+ "description": "Answer by isolating the right time window.",
1212
+ "query": "What happened in February 2026?",
1213
+ "results": [
1214
+ {
1215
+ "system": "Audrey",
1216
+ "durationMs": 13,
1217
+ "passed": true,
1218
+ "score": 1,
1219
+ "summary": "retrieved expected evidence",
1220
+ "topResults": [
1221
+ "In February 2026 Sam completed the architecture review."
1222
+ ],
1223
+ "retrievalSummary": "In February 2026 Sam completed the architecture review."
1224
+ },
1225
+ {
1226
+ "system": "Vector Only",
1227
+ "durationMs": 0,
1228
+ "passed": true,
1229
+ "score": 1,
1230
+ "summary": "retrieved expected evidence",
1231
+ "topResults": [
1232
+ "In March 2026 Sam started the rollout checklist.",
1233
+ "In February 2026 Sam completed the architecture review.",
1234
+ "In January 2026 Sam kicked off the migration plan."
1235
+ ],
1236
+ "retrievalSummary": "In March 2026 Sam started the rollout checklist. | In February 2026 Sam completed the architecture review."
1237
+ },
1238
+ {
1239
+ "system": "Keyword + Recency",
1240
+ "durationMs": 0,
1241
+ "passed": true,
1242
+ "score": 1,
1243
+ "summary": "retrieved expected evidence",
1244
+ "topResults": [
1245
+ "In February 2026 Sam completed the architecture review.",
1246
+ "In March 2026 Sam started the rollout checklist.",
1247
+ "In January 2026 Sam kicked off the migration plan."
1248
+ ],
1249
+ "retrievalSummary": "In February 2026 Sam completed the architecture review. | In March 2026 Sam started the rollout checklist."
1250
+ },
1251
+ {
1252
+ "system": "Recent Window",
1253
+ "durationMs": 0,
1254
+ "passed": true,
1255
+ "score": 1,
1256
+ "summary": "retrieved expected evidence",
1257
+ "topResults": [
1258
+ "In March 2026 Sam started the rollout checklist.",
1259
+ "In February 2026 Sam completed the architecture review.",
1260
+ "In January 2026 Sam kicked off the migration plan."
1261
+ ],
1262
+ "retrievalSummary": "In March 2026 Sam started the rollout checklist. | In February 2026 Sam completed the architecture review."
1263
+ }
1264
+ ]
1265
+ },
1266
+ {
1267
+ "id": "abstention",
1268
+ "suite": "retrieval",
1269
+ "title": "Abstention",
1270
+ "family": "abstention",
1271
+ "description": "Avoid pretending to know a specific identifier that was never stored.",
1272
+ "query": "What is Sam passport number?",
1273
+ "results": [
1274
+ {
1275
+ "system": "Audrey",
1276
+ "durationMs": 12,
1277
+ "passed": true,
1278
+ "score": 1,
1279
+ "summary": "correct abstention",
1280
+ "topResults": [],
1281
+ "retrievalSummary": "no retrieval"
1282
+ },
1283
+ {
1284
+ "system": "Vector Only",
1285
+ "durationMs": 0,
1286
+ "passed": false,
1287
+ "score": 0.5,
1288
+ "summary": "no leak, but retrieved tangential context",
1289
+ "topResults": [
1290
+ "Sam renewed a passport in February 2026.",
1291
+ "Sam has a trip to Toronto next month."
1292
+ ],
1293
+ "retrievalSummary": "Sam renewed a passport in February 2026. | Sam has a trip to Toronto next month."
1294
+ },
1295
+ {
1296
+ "system": "Keyword + Recency",
1297
+ "durationMs": 0,
1298
+ "passed": false,
1299
+ "score": 0.5,
1300
+ "summary": "no leak, but retrieved tangential context",
1301
+ "topResults": [
1302
+ "Sam renewed a passport in February 2026.",
1303
+ "Sam has a trip to Toronto next month."
1304
+ ],
1305
+ "retrievalSummary": "Sam renewed a passport in February 2026. | Sam has a trip to Toronto next month."
1306
+ },
1307
+ {
1308
+ "system": "Recent Window",
1309
+ "durationMs": 0,
1310
+ "passed": false,
1311
+ "score": 0.5,
1312
+ "summary": "no leak, but retrieved tangential context",
1313
+ "topResults": [
1314
+ "Sam has a trip to Toronto next month.",
1315
+ "Sam renewed a passport in February 2026."
1316
+ ],
1317
+ "retrievalSummary": "Sam has a trip to Toronto next month. | Sam renewed a passport in February 2026."
1318
+ }
1319
+ ]
1320
+ },
1321
+ {
1322
+ "id": "conflict-resolution",
1323
+ "suite": "retrieval",
1324
+ "title": "Conflict resolution",
1325
+ "family": "conflict_resolution",
1326
+ "description": "Prefer high-reliability evidence over model-generated noise.",
1327
+ "query": "What caused the outage?",
1328
+ "results": [
1329
+ {
1330
+ "system": "Audrey",
1331
+ "durationMs": 11,
1332
+ "passed": true,
1333
+ "score": 1,
1334
+ "summary": "retrieved expected evidence",
1335
+ "topResults": [
1336
+ "The outage was caused by an expired TLS certificate on api.example.com."
1337
+ ],
1338
+ "retrievalSummary": "The outage was caused by an expired TLS certificate on api.example.com."
1339
+ },
1340
+ {
1341
+ "system": "Vector Only",
1342
+ "durationMs": 0,
1343
+ "passed": false,
1344
+ "score": 0.5,
1345
+ "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
1346
+ "topResults": [
1347
+ "The outage was caused by an expired TLS certificate on api.example.com.",
1348
+ "The outage was caused by database corruption."
1349
+ ],
1350
+ "retrievalSummary": "The outage was caused by an expired TLS certificate on api.example.com. | The outage was caused by database corruption."
1351
+ },
1352
+ {
1353
+ "system": "Keyword + Recency",
1354
+ "durationMs": 1,
1355
+ "passed": false,
1356
+ "score": 0,
1357
+ "summary": "blocked content outranked the correct answer",
1358
+ "topResults": [
1359
+ "The outage was caused by database corruption.",
1360
+ "The outage was caused by an expired TLS certificate on api.example.com."
1361
+ ],
1362
+ "retrievalSummary": "The outage was caused by database corruption. | The outage was caused by an expired TLS certificate on api.example.com."
1363
+ },
1364
+ {
1365
+ "system": "Recent Window",
1366
+ "durationMs": 0,
1367
+ "passed": false,
1368
+ "score": 0,
1369
+ "summary": "blocked content outranked the correct answer",
1370
+ "topResults": [
1371
+ "The outage was caused by database corruption.",
1372
+ "The outage was caused by an expired TLS certificate on api.example.com."
1373
+ ],
1374
+ "retrievalSummary": "The outage was caused by database corruption. | The outage was caused by an expired TLS certificate on api.example.com."
1375
+ }
1376
+ ]
1377
+ },
1378
+ {
1379
+ "id": "procedural-learning",
1380
+ "suite": "retrieval",
1381
+ "title": "Procedural learning",
1382
+ "family": "procedural_learning",
1383
+ "description": "Turn repeated incidents into an actionable operating rule.",
1384
+ "query": "What should the agent do when payout retries start returning 429?",
1385
+ "results": [
1386
+ {
1387
+ "system": "Audrey",
1388
+ "durationMs": 15,
1389
+ "passed": true,
1390
+ "score": 1,
1391
+ "summary": "retrieved expected evidence",
1392
+ "topResults": [
1393
+ "When payout retries start returning 429, cap retry batches and stagger retries before escalating."
1394
+ ],
1395
+ "retrievalSummary": "When payout retries start returning 429, cap retry batches and stagger r"
1396
+ },
1397
+ {
1398
+ "system": "Vector Only",
1399
+ "durationMs": 0,
1400
+ "passed": false,
1401
+ "score": 0,
1402
+ "summary": "missed target evidence",
1403
+ "topResults": [
1404
+ "Risk operations requested an escalation when multiple merchants were affected in the same hour.",
1405
+ "Payout incident volume dropped after retry batches were capped at 50 merchants per worker.",
1406
+ "Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute."
1407
+ ],
1408
+ "retrievalSummary": "Risk operations requested an escalation when multiple merchants were aff | Payout incident volume dropped after retry batches were capped at 50 mer"
1409
+ },
1410
+ {
1411
+ "system": "Keyword + Recency",
1412
+ "durationMs": 0,
1413
+ "passed": false,
1414
+ "score": 0,
1415
+ "summary": "missed target evidence",
1416
+ "topResults": [
1417
+ "Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute.",
1418
+ "Risk operations requested an escalation when multiple merchants were affected in the same hour.",
1419
+ "Payout incident volume dropped after retry batches were capped at 50 merchants per worker."
1420
+ ],
1421
+ "retrievalSummary": "Processor X returned HTTP 429 when payout retries exceeded 120 requests | Risk operations requested an escalation when multiple merchants were aff"
1422
+ },
1423
+ {
1424
+ "system": "Recent Window",
1425
+ "durationMs": 0,
1426
+ "passed": false,
1427
+ "score": 0,
1428
+ "summary": "missed target evidence",
1429
+ "topResults": [
1430
+ "Risk operations requested an escalation when multiple merchants were affected in the same hour.",
1431
+ "Payout incident volume dropped after retry batches were capped at 50 merchants per worker.",
1432
+ "Processor X returned HTTP 429 when payout retries exceeded 120 requests per minute."
1433
+ ],
1434
+ "retrievalSummary": "Risk operations requested an escalation when multiple merchants were aff | Payout incident volume dropped after retry batches were capped at 50 mer"
1435
+ }
1436
+ ]
1437
+ },
1438
+ {
1439
+ "id": "privacy-boundary",
1440
+ "suite": "retrieval",
1441
+ "title": "Privacy boundary",
1442
+ "family": "privacy_boundary",
1443
+ "description": "Never leak private memory into public recall.",
1444
+ "query": "What is Sam API key?",
1445
+ "results": [
1446
+ {
1447
+ "system": "Audrey",
1448
+ "durationMs": 11,
1449
+ "passed": true,
1450
+ "score": 1,
1451
+ "summary": "correct abstention",
1452
+ "topResults": [],
1453
+ "retrievalSummary": "no retrieval"
1454
+ },
1455
+ {
1456
+ "system": "Vector Only",
1457
+ "durationMs": 0,
1458
+ "passed": false,
1459
+ "score": 0,
1460
+ "summary": "leaked restricted content",
1461
+ "topResults": [
1462
+ "Sam API key is sk-secret-demo-key.",
1463
+ "Sam prefers dark mode in internal dashboards."
1464
+ ],
1465
+ "retrievalSummary": "Sam API key is sk-secret-demo-key. | Sam prefers dark mode in internal dashboards."
1466
+ },
1467
+ {
1468
+ "system": "Keyword + Recency",
1469
+ "durationMs": 0,
1470
+ "passed": false,
1471
+ "score": 0,
1472
+ "summary": "leaked restricted content",
1473
+ "topResults": [
1474
+ "Sam API key is sk-secret-demo-key.",
1475
+ "Sam prefers dark mode in internal dashboards."
1476
+ ],
1477
+ "retrievalSummary": "Sam API key is sk-secret-demo-key. | Sam prefers dark mode in internal dashboards."
1478
+ },
1479
+ {
1480
+ "system": "Recent Window",
1481
+ "durationMs": 0,
1482
+ "passed": false,
1483
+ "score": 0,
1484
+ "summary": "leaked restricted content",
1485
+ "topResults": [
1486
+ "Sam prefers dark mode in internal dashboards.",
1487
+ "Sam API key is sk-secret-demo-key."
1488
+ ],
1489
+ "retrievalSummary": "Sam prefers dark mode in internal dashboards. | Sam API key is sk-secret-demo-key."
1490
+ }
1491
+ ]
1492
+ },
1493
+ {
1494
+ "id": "operation-update-overwrite",
1495
+ "suite": "operations",
1496
+ "title": "Update and overwrite",
1497
+ "family": "update_overwrite",
1498
+ "description": "Current-state recall should prefer the new fact after an explicit overwrite.",
1499
+ "query": "What is the primary deployment region now?",
1500
+ "results": [
1501
+ {
1502
+ "system": "Audrey",
1503
+ "durationMs": 13,
1504
+ "passed": true,
1505
+ "score": 1,
1506
+ "summary": "retrieved expected evidence",
1507
+ "topResults": [
1508
+ "As of March 2026, the primary deployment region is eu-west-1."
1509
+ ],
1510
+ "retrievalSummary": "As of March 2026, the primary deployment region is eu-west-1."
1511
+ },
1512
+ {
1513
+ "system": "Vector Only",
1514
+ "durationMs": 0,
1515
+ "passed": false,
1516
+ "score": 0.5,
1517
+ "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
1518
+ "topResults": [
1519
+ "As of March 2026, the primary deployment region is eu-west-1.",
1520
+ "The primary deployment region is us-east-1."
1521
+ ],
1522
+ "retrievalSummary": "As of March 2026, the primary deployment region is eu-west-1. | The primary deployment region is us-east-1."
1523
+ },
1524
+ {
1525
+ "system": "Keyword + Recency",
1526
+ "durationMs": 0,
1527
+ "passed": false,
1528
+ "score": 0.5,
1529
+ "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
1530
+ "topResults": [
1531
+ "As of March 2026, the primary deployment region is eu-west-1.",
1532
+ "The primary deployment region is us-east-1."
1533
+ ],
1534
+ "retrievalSummary": "As of March 2026, the primary deployment region is eu-west-1. | The primary deployment region is us-east-1."
1535
+ },
1536
+ {
1537
+ "system": "Recent Window",
1538
+ "durationMs": 0,
1539
+ "passed": false,
1540
+ "score": 0.5,
1541
+ "summary": "retrieved expected evidence, but conflicting evidence still appeared later",
1542
+ "topResults": [
1543
+ "As of March 2026, the primary deployment region is eu-west-1.",
1544
+ "The primary deployment region is us-east-1."
1545
+ ],
1546
+ "retrievalSummary": "As of March 2026, the primary deployment region is eu-west-1. | The primary deployment region is us-east-1."
1547
+ }
1548
+ ]
1549
+ },
1550
+ {
1551
+ "id": "operation-delete-and-abstain",
1552
+ "suite": "operations",
1553
+ "title": "Delete and abstain",
1554
+ "family": "delete_and_abstain",
1555
+ "description": "Explicit deletion should remove a secret from later recall.",
1556
+ "query": "What is the staging API token?",
1557
+ "results": [
1558
+ {
1559
+ "system": "Audrey",
1560
+ "durationMs": 11,
1561
+ "passed": true,
1562
+ "score": 1,
1563
+ "summary": "correct abstention",
1564
+ "topResults": [],
1565
+ "retrievalSummary": "no retrieval"
1566
+ },
1567
+ {
1568
+ "system": "Vector Only",
1569
+ "durationMs": 0,
1570
+ "passed": false,
1571
+ "score": 0,
1572
+ "summary": "leaked restricted content",
1573
+ "topResults": [
1574
+ "The staging API token is tok-demo-staging-1234."
1575
+ ],
1576
+ "retrievalSummary": "The staging API token is tok-demo-staging-1234."
1577
+ },
1578
+ {
1579
+ "system": "Keyword + Recency",
1580
+ "durationMs": 1,
1581
+ "passed": false,
1582
+ "score": 0.5,
1583
+ "summary": "no leak, but retrieved tangential context",
1584
+ "topResults": [
1585
+ "The staging environment rotates API credentials weekly."
1586
+ ],
1587
+ "retrievalSummary": "The staging environment rotates API credentials weekly."
1588
+ },
1589
+ {
1590
+ "system": "Recent Window",
1591
+ "durationMs": 0,
1592
+ "passed": false,
1593
+ "score": 0,
1594
+ "summary": "leaked restricted content",
1595
+ "topResults": [
1596
+ "The staging API token is tok-demo-staging-1234."
1597
+ ],
1598
+ "retrievalSummary": "The staging API token is tok-demo-staging-1234."
1599
+ }
1600
+ ]
1601
+ },
1602
+ {
1603
+ "id": "operation-semantic-merge",
1604
+ "suite": "operations",
1605
+ "title": "Semantic merge",
1606
+ "family": "semantic_merge",
1607
+ "description": "Related episodes should merge into a reusable semantic operating rule.",
1608
+ "query": "When should the disputes queue trigger manual review?",
1609
+ "results": [
1610
+ {
1611
+ "system": "Audrey",
1612
+ "durationMs": 14,
1613
+ "passed": true,
1614
+ "score": 1,
1615
+ "summary": "retrieved expected evidence",
1616
+ "topResults": [
1617
+ "Repeated disputes from the same BIN in one hour should trigger manual review."
1618
+ ],
1619
+ "retrievalSummary": "Repeated disputes from the same BIN in one hour should trigger manual re"
1620
+ },
1621
+ {
1622
+ "system": "Vector Only",
1623
+ "durationMs": 0,
1624
+ "passed": false,
1625
+ "score": 0,
1626
+ "summary": "missed target evidence",
1627
+ "topResults": [
1628
+ "Three charge disputes from the same BIN landed in the queue within one hour.",
1629
+ "The queue stabilized after repeated same-BIN disputes were reviewed manually.",
1630
+ "Fraud ops escalated repeated same-BIN disputes for analyst attention."
1631
+ ],
1632
+ "retrievalSummary": "Three charge disputes from the same BIN landed in the queue within one h | The queue stabilized after repeated same-BIN disputes were reviewed manu"
1633
+ },
1634
+ {
1635
+ "system": "Keyword + Recency",
1636
+ "durationMs": 0,
1637
+ "passed": false,
1638
+ "score": 0,
1639
+ "summary": "missed target evidence",
1640
+ "topResults": [
1641
+ "The queue stabilized after repeated same-BIN disputes were reviewed manually.",
1642
+ "Three charge disputes from the same BIN landed in the queue within one hour.",
1643
+ "Fraud ops escalated repeated same-BIN disputes for analyst attention."
1644
+ ],
1645
+ "retrievalSummary": "The queue stabilized after repeated same-BIN disputes were reviewed manu | Three charge disputes from the same BIN landed in the queue within one h"
1646
+ },
1647
+ {
1648
+ "system": "Recent Window",
1649
+ "durationMs": 0,
1650
+ "passed": false,
1651
+ "score": 0,
1652
+ "summary": "missed target evidence",
1653
+ "topResults": [
1654
+ "The queue stabilized after repeated same-BIN disputes were reviewed manually.",
1655
+ "Fraud ops escalated repeated same-BIN disputes for analyst attention.",
1656
+ "Three charge disputes from the same BIN landed in the queue within one hour."
1657
+ ],
1658
+ "retrievalSummary": "The queue stabilized after repeated same-BIN disputes were reviewed manu | Fraud ops escalated repeated same-BIN disputes for analyst attention."
1659
+ }
1660
+ ]
1661
+ },
1662
+ {
1663
+ "id": "operation-procedural-merge",
1664
+ "suite": "operations",
1665
+ "title": "Procedural merge",
1666
+ "family": "procedural_merge",
1667
+ "description": "Related episodes should merge into an executable procedure, not just a loose fact.",
1668
+ "query": "What should the agent do after two webhook signature failures?",
1669
+ "results": [
1670
+ {
1671
+ "system": "Audrey",
1672
+ "durationMs": 13,
1673
+ "passed": true,
1674
+ "score": 1,
1675
+ "summary": "retrieved expected evidence",
1676
+ "topResults": [
1677
+ "When webhook signature verification fails twice, rotate the signing secret and replay queued events."
1678
+ ],
1679
+ "retrievalSummary": "When webhook signature verification fails twice, rotate the signing secr"
1680
+ },
1681
+ {
1682
+ "system": "Vector Only",
1683
+ "durationMs": 0,
1684
+ "passed": false,
1685
+ "score": 0,
1686
+ "summary": "missed target evidence",
1687
+ "topResults": [
1688
+ "Webhook signature verification failed twice for merchant ACME.",
1689
+ "Operations recovered the incident by rotating the signing secret.",
1690
+ "Queued webhook events were replayed after the signing secret changed."
1691
+ ],
1692
+ "retrievalSummary": "Webhook signature verification failed twice for merchant ACME. | Operations recovered the incident by rotating the signing secret."
1693
+ },
1694
+ {
1695
+ "system": "Keyword + Recency",
1696
+ "durationMs": 0,
1697
+ "passed": false,
1698
+ "score": 0,
1699
+ "summary": "missed target evidence",
1700
+ "topResults": [
1701
+ "Queued webhook events were replayed after the signing secret changed.",
1702
+ "Webhook signature verification failed twice for merchant ACME.",
1703
+ "Operations recovered the incident by rotating the signing secret."
1704
+ ],
1705
+ "retrievalSummary": "Queued webhook events were replayed after the signing secret changed. | Webhook signature verification failed twice for merchant ACME."
1706
+ },
1707
+ {
1708
+ "system": "Recent Window",
1709
+ "durationMs": 0,
1710
+ "passed": false,
1711
+ "score": 0,
1712
+ "summary": "missed target evidence",
1713
+ "topResults": [
1714
+ "Queued webhook events were replayed after the signing secret changed.",
1715
+ "Operations recovered the incident by rotating the signing secret.",
1716
+ "Webhook signature verification failed twice for merchant ACME."
1717
+ ],
1718
+ "retrievalSummary": "Queued webhook events were replayed after the signing secret changed. | Operations recovered the incident by rotating the signing secret."
1719
+ }
1720
+ ]
1721
+ }
1722
+ ]
1723
+ },
1724
+ "external": {
1725
+ "benchmark": "LoCoMo",
1726
+ "leaderboard": [
1727
+ {
1728
+ "system": "MIRIX",
1729
+ "benchmark": "LoCoMo",
1730
+ "score": 85.4,
1731
+ "unit": "accuracy",
1732
+ "source": "https://arxiv.org/abs/2507.07957",
1733
+ "note": "Published LoCoMo result from the MIRIX paper."
1734
+ },
1735
+ {
1736
+ "system": "Letta Filesystem",
1737
+ "benchmark": "LoCoMo",
1738
+ "score": 74,
1739
+ "unit": "accuracy",
1740
+ "source": "https://www.letta.com/blog/benchmarking-ai-agent-memory",
1741
+ "note": "Filesystem-style memory result reported by Letta."
1742
+ },
1743
+ {
1744
+ "system": "Mem0 Graph Memory",
1745
+ "benchmark": "LoCoMo",
1746
+ "score": 68.5,
1747
+ "unit": "accuracy",
1748
+ "source": "https://arxiv.org/abs/2504.19413",
1749
+ "note": "Graph memory variant reported in the Mem0 paper."
1750
+ },
1751
+ {
1752
+ "system": "Mem0",
1753
+ "benchmark": "LoCoMo",
1754
+ "score": 66.9,
1755
+ "unit": "accuracy",
1756
+ "source": "https://arxiv.org/abs/2504.19413",
1757
+ "note": "Core Mem0 LoCoMo score reported in the Mem0 paper."
1758
+ },
1759
+ {
1760
+ "system": "OpenAI Memory",
1761
+ "benchmark": "LoCoMo",
1762
+ "score": 52.9,
1763
+ "unit": "accuracy",
1764
+ "source": "https://arxiv.org/abs/2504.19413",
1765
+ "note": "OpenAI memory baseline as reported by the Mem0 paper."
1766
+ }
1767
+ ]
1768
+ },
1769
+ "trends": [
1770
+ {
1771
+ "title": "Memory is moving from flat retrieval to typed systems",
1772
+ "summary": "Recent work treats episodic, semantic, procedural, and graph memory as separate but cooperating layers.",
1773
+ "source": "https://arxiv.org/abs/2507.03724"
1774
+ },
1775
+ {
1776
+ "title": "Benchmarks now emphasize multi-session realism",
1777
+ "summary": "LongMemEval and LoCoMo push memory systems toward temporal updates, abstraction, and cross-session reasoning instead of single-turn fact recall.",
1778
+ "source": "https://arxiv.org/abs/2410.10813"
1779
+ },
1780
+ {
1781
+ "title": "Context engineering is now competing with retrieval-first designs",
1782
+ "summary": "Letta argues filesystem and memory-block approaches can outperform simpler retrieval-only memory on realistic long-horizon tasks.",
1783
+ "source": "https://www.letta.com/blog/memory-blocks"
1784
+ },
1785
+ {
1786
+ "title": "Production teams care about latency and token footprint, not just recall quality",
1787
+ "summary": "Mem0 frames memory as a cost and latency optimization surface in addition to a personalization surface.",
1788
+ "source": "https://arxiv.org/abs/2504.19413"
1789
+ },
1790
+ {
1791
+ "title": "Temporal and multimodal memory are becoming table stakes",
1792
+ "summary": "MIRIX and Graphiti both model time and state change explicitly instead of assuming memories stay forever true.",
1793
+ "source": "https://arxiv.org/abs/2507.07957"
1794
+ }
1795
+ ]
1796
+ }