2009YU committed on
Commit
3a64edb
·
verified ·
1 Parent(s): e8088bb

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. .gitattributes +5 -34
  2. .gitignore +5 -0
  3. LICENSE +21 -0
  4. README.md +207 -0
  5. README.zh-CN.md +207 -0
  6. assets/tmcra-logo.png +3 -0
  7. code/memory_adapters.py +0 -0
  8. code/profile_layer.py +748 -0
  9. code/run_lme_s10_native_tmcra.py +0 -0
  10. docs/BASELINE_S500_20260525.md +35 -0
  11. docs/OPTIONAL_MODULES_AND_PARALLEL.md +347 -0
  12. docs/OPTIONAL_MODULES_AND_PARALLEL.zh-CN.md +345 -0
  13. docs/TRAINING.md +113 -0
  14. github_push.err.log +2 -0
  15. github_push.out.log +2 -0
  16. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_01.pt +3 -0
  17. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_02.pt +3 -0
  18. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_03.pt +3 -0
  19. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_04.pt +3 -0
  20. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_05.pt +3 -0
  21. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_06.pt +3 -0
  22. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_07.pt +3 -0
  23. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_step_0000600.pt +3 -0
  24. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_step_0000650.pt +3 -0
  25. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_01.pt +3 -0
  26. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_02.pt +3 -0
  27. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_03.pt +3 -0
  28. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_04.pt +3 -0
  29. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_05.pt +3 -0
  30. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_07.pt +3 -0
  31. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_step_0000600.pt +3 -0
  32. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_step_0000650.pt +3 -0
  33. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/export_manifest.json +1 -0
  34. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/launch_train.sh +48 -0
  35. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer.pt +3 -0
  36. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_best.pt +3 -0
  37. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt +3 -0
  38. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/nohup.log +0 -0
  39. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer.pt +3 -0
  40. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_best.pt +3 -0
  41. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt +3 -0
  42. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/train.log +84 -0
  43. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/train.pid +1 -0
  44. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/train_summary.json +0 -0
  45. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/train_summary.partial.json +1 -0
  46. models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/training_issues.jsonl +1 -0
  47. results/judge_gpt4o_alias_vectorengine.jsonl +500 -0
  48. results/judge_gpt4o_alias_vectorengine.jsonl.summary.json +36 -0
  49. results/lme_s500_frozen_baseline38_full10_20260525_results.tar.gz +3 -0
  50. results/predictions.jsonl +500 -0
.gitattributes CHANGED
@@ -1,35 +1,6 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  *.pt filter=lfs diff=lfs merge=lfs -text
2
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
3
+ *.sqlite3 filter=lfs diff=lfs merge=lfs -text
4
+ *.jsonl filter=lfs diff=lfs merge=lfs -text
5
+ *.json linguist-detectable=true
6
+ assets/tmcra-logo.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ .DS_Store
4
+ Thumbs.db
5
+ *.pid
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2026 TMCRA contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # TMCRA Long-Memory Runtime
2
+
3
+ <p align="center">
4
+ <img src="assets/tmcra-logo.png" alt="TMCRA logo" width="420">
5
+ </p>
6
+
7
+ [中文版本](README.zh-CN.md)
8
+
9
+ TMCRA is a runtime layer that gives agent systems long-term memory. It helps an LLM retrieve, connect, and reason over long dialogue histories without exposing the full conversation context on every turn.
10
+
11
+ This repository contains a frozen TMCRA baseline package with model weights, runtime code snapshots, training metadata, and LongMemEval S500 benchmark results.
12
+
13
+ License: MIT.
14
+
15
+ ## What TMCRA Does
16
+
17
+ TMCRA adds a dedicated memory runtime between an agent application and its answer model.
18
+
19
+ At write time, TMCRA turns dialogue into memory nodes, event units, profile signals, and graph paths. This lets the system preserve not only isolated facts, but also relationships between facts across turns and sessions.
20
+
21
+ At retrieval time, TMCRA scores graph nodes and paths, selects compact evidence, and injects only the most relevant memory context into the answer model. The answer model still performs natural-language reasoning, while TMCRA handles long-memory organization, recall, and evidence surfacing.
22
+
23
+ The current runtime focuses on:
24
+
25
+ - user fact memory
26
+ - assistant-response memory
27
+ - profile and preference memory
28
+ - temporal memory
29
+ - cross-session graph tunneling
30
+ - learned node/path scoring
31
+ - compact evidence selection for downstream LLMs
32
+
33
+ ## Algorithm Structure
34
+
35
+ ```mermaid
36
+ flowchart LR
37
+ A["Dialogue Turns"] --> B["Writer Layer"]
38
+ B --> C["Memory Nodes"]
39
+ B --> D["Event Units"]
40
+ B --> E["Profile Signals"]
41
+ B --> F["Temporal Signals"]
42
+
43
+ C --> G["Graph Memory Store"]
44
+ D --> G
45
+ E --> G
46
+ F --> G
47
+
48
+ G --> H["Learned Node Scorer"]
49
+ G --> I["Learned Path Scorer"]
50
+ H --> J["Evidence Selection"]
51
+ I --> J
52
+
53
+ J --> K["Compact Memory Context"]
54
+ K --> L["Answer Model"]
55
+ L --> M["Agent Response"]
56
+
57
+ N["User Query"] --> H
58
+ N --> I
59
+ N --> J
60
+ N --> L
61
+ ```
62
+
63
+ The writer layer produces memory units from dialogue. The graph memory store preserves facts, events, profile signals, temporal signals, and cross-session links. The learned node/path scorers select relevant evidence for the current query, and the answer model uses that compact evidence to produce the final response.
64
+
65
+ ## Why TMCRA
66
+
67
+ Long-running agents need more than simple vector recall. They need to preserve user facts, preferences, timeline changes, cross-session events, and multi-step evidence chains.
68
+
69
+ TMCRA organizes memory into graph nodes and learned retrieval paths, then surfaces compact evidence to the answer model. The goal is to let external agents use long-term memory through a runtime/API layer while keeping the memory algorithm and model weights independently deployable.
70
+
71
+ ## How to Use
72
+
73
+ For inference/runtime use, load the graph scorer weights from:
74
+
75
+ ```text
76
+ models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/
77
+ ```
78
+
79
+ The main runtime files are:
80
+
81
+ ```text
82
+ node_scorer.pt
83
+ path_scorer.pt
84
+ export_manifest.json
85
+ ```
86
+
87
+ A typical runtime configuration points TMCRA to these weights:
88
+
89
+ ```bash
90
+ export TMCRA_NODE_MODEL_PATH="models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer.pt"
91
+ export TMCRA_PATH_MODEL_PATH="models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer.pt"
92
+ export TMCRA_RETRIEVAL_MODE="hybrid_node_scored"
93
+ export TMCRA_REQUIRE_LEARNED_SCORER="1"
94
+ ```
95
+
96
+ The evaluation entrypoint snapshot is:
97
+
98
+ ```text
99
+ code/run_lme_s10_native_tmcra.py
100
+ ```
101
+
102
+ The core adapter snapshot is:
103
+
104
+ ```text
105
+ code/memory_adapters.py
106
+ ```
107
+
108
+ For a deployment build, load the two scorer files into the TMCRA adapter and point the agent's memory middleware to the TMCRA retrieval API. The answer model can be any OpenAI-compatible or local LLM endpoint; TMCRA supplies the selected memory evidence, and the answer model produces the final response.
109
+
110
+ ## Dependency Environment
111
+
112
+ The included code snapshot is Python-based. A practical runtime environment should include:
113
+
114
+ - Python 3.10 or newer
115
+ - PyTorch, with CUDA recommended for model inference
116
+ - NumPy and standard Python data-processing libraries
117
+ - an OpenAI-compatible or local LLM endpoint for the answer layer and writer layer
118
+ - optional Git LFS support when pulling the full model package from GitHub
119
+
120
+ The benchmark scripts expect LongMemEval-format input data and write JSONL predictions and judge outputs. Runtime deployments can use the same model files without running the benchmark harness.
121
+
122
+ ## Optional Modules
123
+
124
+ TMCRA also keeps optional extension points for retrieval and planning experiments. These modules can be enabled in deployment or evaluation builds when the target use case needs them.
125
+
126
+ - **Embedder interface**: an optional semantic embedding channel that can run alongside the graph-memory scorer. It is intended to provide additional dense semantic recall before or during graph evidence selection, without replacing the learned graph node/path scorers.
127
+ - **LLM planner interface**: an optional planner hook that can use an external LLM to organize evidence, expand query intent, or create an answer plan before the final answer call. This is useful for experiments and higher-cost deployments, while the default baseline keeps the core graph scorer path independently measurable.
128
+
129
+ These interfaces are integration points, not required dependencies for the frozen S500 baseline. They are designed so downstream deployments can decide whether to run a lighter scorer-only path or a heavier path with embedder/planner assistance.
130
+
131
+ ## Included Artifacts
132
+
133
+ - `code/`: runtime and evaluation code snapshot for this baseline.
134
+ - `models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/`: full trained graph-model output directory.
135
+ - `results/`: predictions, judge output, summary metrics, and compressed run artifacts.
136
+ - `docs/`: baseline record and result notes.
137
+
138
+ Additional documentation:
139
+
140
+ - `docs/BASELINE_S500_20260525.md`: benchmark record and subtask metrics.
141
+ - `docs/TRAINING.md`: model training direction and released training artifacts.
142
+ - `docs/OPTIONAL_MODULES_AND_PARALLEL.md`: optional embedder/planner integration points and parallel evaluation plan.
143
+ - `docs/OPTIONAL_MODULES_AND_PARALLEL.zh-CN.md`: Chinese version of the optional module and parallel evaluation guide.
144
+
145
+ ## Model Package
146
+
147
+ The included model package preserves the full training output for the graph scorer stack:
148
+
149
+ - `node_scorer.pt` and `path_scorer.pt`: runtime graph scoring weights.
150
+ - `node_scorer_best.pt` and `path_scorer_best.pt`: best checkpoint aliases.
151
+ - `node_scorer_last.pt` and `path_scorer_last.pt`: final training aliases.
152
+ - `checkpoints/`: epoch and step checkpoints.
153
+ - `export_manifest.json`, `train_summary.json`, and `train.log`: model metadata and training trace.
154
+
155
+ ## Current Strengths
156
+
157
+ - Strong direct user-fact recall in single-session settings.
158
+ - Strong assistant-detail recall.
159
+ - Competitive knowledge-update behavior for changing facts.
160
+ - Working temporal and preference retrieval layers with clear room for further specialization.
161
+
162
+ ## Active Improvement Areas
163
+
164
+ - Multi-session aggregation and unit coverage.
165
+ - Deeper time-graph reasoning.
166
+ - Preference-profile abstraction and cross-session tunneling.
167
+ - Query-graph to memory-graph matching for complex questions.
168
+
169
+ ## Intended Use
170
+
171
+ This repository is a public-facing evidence package for TMCRA's long-memory runtime work. It is suitable for:
172
+
173
+ - Benchmark review.
174
+ - Model and result inspection.
175
+ - Reproducing the frozen baseline.
176
+ - Demonstrating how TMCRA can be packaged as an external memory runtime for agents.
177
+
178
+ ## Benchmark Result
179
+
180
+ This package includes a full LongMemEval S500 run.
181
+
182
+ - Benchmark: LongMemEval S set, 500 samples
183
+ - Evaluation: official-compatible LongMemEval judge prompt
184
+ - Judge model: `gpt-4o`, resolved as `gpt-4o-2024-08-06`
185
+ - Writer layer used in this run: DeepSeek v4 Flash
186
+ - Answer layer used in this run: GPT5.4-compatible API
187
+ - Overall accuracy: `310 / 500 = 62.00%`
188
+
189
+ ## Results by Task Type
190
+
191
+ | task type | accuracy | count |
192
+ | --- | ---: | ---: |
193
+ | single-session-user | 81.43% | 70 |
194
+ | single-session-assistant | 78.57% | 56 |
195
+ | knowledge-update | 70.51% | 78 |
196
+ | temporal-reasoning | 63.16% | 133 |
197
+ | single-session-preference | 56.67% | 30 |
198
+ | multi-session | 39.85% | 133 |
199
+
200
+ The benchmark outputs are available in:
201
+
202
+ ```text
203
+ results/predictions.jsonl
204
+ results/judge_gpt4o_alias_vectorengine.jsonl
205
+ results/judge_gpt4o_alias_vectorengine.jsonl.summary.json
206
+ results/lme_s500_frozen_baseline38_full10_20260525_results.tar.gz
207
+ ```
README.zh-CN.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # TMCRA 长记忆运行时
2
+
3
+ <p align="center">
4
+ <img src="assets/tmcra-logo.png" alt="TMCRA logo" width="420">
5
+ </p>
6
+
7
+ [English version](README.md)
8
+
9
+ TMCRA 是给 Agent 系统提供长期记忆能力的运行时层。它帮助大语言模型在长对话历史中检索、连接和推理相关记忆,而不需要在每一轮都暴露完整上下文。
10
+
11
+ 本仓库包含一个冻结的 TMCRA 基线包,包括模型权重、运行时代码快照、训练元数据和 LongMemEval S500 测评结果。
12
+
13
+ 许可证:MIT。
14
+
15
+ ## TMCRA 的作用
16
+
17
+ TMCRA 在 Agent 应用和回答模型之间增加一层专门的记忆运行时。
18
+
19
+ 在写入阶段,TMCRA 会把对话转成记忆节点、事件单元、profile 信号和图路径。这让系统不仅能保存孤立事实,也能保存不同轮次、不同会话之间的事实关系。
20
+
21
+ 在召回阶段,TMCRA 会对图节点和路径进行打分,选择紧凑的证据,并只把最相关的记忆上下文注入给回答模型。回答模型仍然负责自然语言推理和最终表达,TMCRA 负责长记忆组织、召回和证据呈现。
22
+
23
+ 当前运行时重点覆盖:
24
+
25
+ - 用户事实记忆
26
+ - 助手回答细节记忆
27
+ - profile 与偏好记忆
28
+ - 时间记忆
29
+ - 跨会话图隧穿
30
+ - 学习式节点/路径打分
31
+ - 面向下游 LLM 的紧凑证据选择
32
+
33
+ ## 算法结构图
34
+
35
+ ```mermaid
36
+ flowchart LR
37
+ A["对话轮次"] --> B["写入层"]
38
+ B --> C["记忆节点"]
39
+ B --> D["事件单元"]
40
+ B --> E["Profile 信号"]
41
+ B --> F["时间信号"]
42
+
43
+ C --> G["图记忆库"]
44
+ D --> G
45
+ E --> G
46
+ F --> G
47
+
48
+ G --> H["学习式节点 Scorer"]
49
+ G --> I["学习式路径 Scorer"]
50
+ H --> J["证据选择"]
51
+ I --> J
52
+
53
+ J --> K["紧凑记忆上下文"]
54
+ K --> L["回答模型"]
55
+ L --> M["Agent 回复"]
56
+
57
+ N["用户问题"] --> H
58
+ N --> I
59
+ N --> J
60
+ N --> L
61
+ ```
62
+
63
+ 写入层从对话中生成记忆单元。图记忆库保存事实、事件、profile 信号、时间信号和跨会话连接。学习式节点/路径 scorer 会根据当前问题选择相关证据,回答模型使用这些紧凑证据生成最终回复。
64
+
65
+ ## 为什么需要 TMCRA
66
+
67
+ 长期运行的 Agent 不能只依赖简单向量召回。它需要保存用户事实、偏好、时间线变化、跨会话事件,以及多步证据链。
68
+
69
+ TMCRA 将记忆组织成图节点和学习得到的召回路径,再把压缩后的证据提供给回答模型。目标是让外部 Agent 可以通过运行时/API 层使用长期记忆,同时保持记忆算法和模型权重可以独立部署。
70
+
71
+ ## 如何使用
72
+
73
+ 推理或运行时使用时,加载下面目录中的图 scorer 权重:
74
+
75
+ ```text
76
+ models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/
77
+ ```
78
+
79
+ 主要运行时文件是:
80
+
81
+ ```text
82
+ node_scorer.pt
83
+ path_scorer.pt
84
+ export_manifest.json
85
+ ```
86
+
87
+ 典型运行配置如下:
88
+
89
+ ```bash
90
+ export TMCRA_NODE_MODEL_PATH="models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer.pt"
91
+ export TMCRA_PATH_MODEL_PATH="models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer.pt"
92
+ export TMCRA_RETRIEVAL_MODE="hybrid_node_scored"
93
+ export TMCRA_REQUIRE_LEARNED_SCORER="1"
94
+ ```
95
+
96
+ 测评入口代码快照:
97
+
98
+ ```text
99
+ code/run_lme_s10_native_tmcra.py
100
+ ```
101
+
102
+ 核心适配器代码快照:
103
+
104
+ ```text
105
+ code/memory_adapters.py
106
+ ```
107
+
108
+ 部署时,将两个 scorer 文件加载到 TMCRA adapter 中,并让 Agent 的记忆中间件调用 TMCRA 召回 API。回答模型可以是任意 OpenAI 兼容接口或本地 LLM;TMCRA 提供选中的记忆证据,回答模型生成最终回复。
109
+
110
+ ## 依赖环境
111
+
112
+ 当前代码快照基于 Python。建议运行环境包括:
113
+
114
+ - Python 3.10 或更高版本
115
+ - PyTorch,推荐使用 CUDA 做模型推理
116
+ - NumPy 及常见 Python 数据处理库
117
+ - 用于回答层和写入层的 OpenAI 兼容接口或本地 LLM endpoint
118
+ - 从 GitHub 拉取完整模型包时建议支持 Git LFS
119
+
120
+ benchmark 脚本使用 LongMemEval 格式输入数据,并输出 JSONL 格式的预测和 judge 结果。实际运行时部署可以直接使用同一套模型文件,不需要运行 benchmark harness。
121
+
122
+ ## 可开启模块
123
+
124
+ TMCRA 当前也保留了用于召回和规划实验的可选接入口。部署或评估时,如果目标场景需要,可以按需开启这些模块。
125
+
126
+ - **Embedder 接入口**:可选的语义 embedding 通道,可以和图记忆 scorer 并行工作。它用于在图证据选择前或选择过程中提供额外的稠密语义召回,但不替代学习式图节点/路径 scorer。
127
+ - **LLM planner 接入口**:可选的规划 hook,可以调用外部 LLM 来整理证据、扩展 query intent,或在最终回答前生成 answer plan。它适合实验和更高成本的部署场景;默认基线仍保持核心图 scorer 链路可以独立测量。
128
+
129
+ 这些接口是可集成的扩展点,不是冻结 S500 基线的必需依赖。下游部署可以根据成本和效果选择轻量的 scorer-only 路径,或开启 embedder/planner 辅助的增强路径。
130
+
131
+ ## 包含内容
132
+
133
+ - `code/`:该基线使用的运行时和测评代码快照。
134
+ - `models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/`:完整训练后的图模型输出目录。
135
+ - `results/`:预测结果、judge 输出、汇总指标和压缩后的运行结果。
136
+ - `docs/`:基线记录和结果说明。
137
+
138
+ 补充文档:
139
+
140
+ - `docs/BASELINE_S500_20260525.md`:benchmark 记录和子任务指标。
141
+ - `docs/TRAINING.md`:模型训练方向和随包发布的训练产物说明。
142
+ - `docs/OPTIONAL_MODULES_AND_PARALLEL.zh-CN.md`:embedder/planner 可选接入口和并行评估推进方案。
143
+ - `docs/OPTIONAL_MODULES_AND_PARALLEL.md`:上述文档的英文版本。
144
+
145
+ ## 模型包
146
+
147
+ 随包发布的模型目录保留了图 scorer 栈的完整训练输出:
148
+
149
+ - `node_scorer.pt` 和 `path_scorer.pt`:运行时图打分权重。
150
+ - `node_scorer_best.pt` 和 `path_scorer_best.pt`:验证表现最好的 checkpoint 别名。
151
+ - `node_scorer_last.pt` 和 `path_scorer_last.pt`:训练结束时的 checkpoint 别名。
152
+ - `checkpoints/`:按 epoch 和 step 保存的中间 checkpoint。
153
+ - `export_manifest.json`、`train_summary.json` 和 `train.log`:模型元数据和训练轨迹。
154
+
155
+ ## 当前优势
156
+
157
+ - 单会话直接用户事实召回较强。
158
+ - 助手细节召回较强。
159
+ - 对变化事实的 knowledge-update 任务有较稳定表现。
160
+ - 已具备可工作的时间记忆和偏好召回层,后续仍有专项优化空间。
161
+
162
+ ## 当前改进方向
163
+
164
+ - multi-session 聚合和 unit coverage。
165
+ - 更深的时间图推理。
166
+ - 偏好 profile 抽象和跨会话隧穿。
167
+ - query graph 与 memory graph 的复杂问题匹配。
168
+
169
+ ## 适用场景
170
+
171
+ 本仓库是 TMCRA 长记忆运行时工作的对外展示包,适合用于:
172
+
173
+ - benchmark 审阅
174
+ - 模型和结果检查
175
+ - 复现冻结基线
176
+ - 展示 TMCRA 如何作为外部 Agent 的记忆运行时进行封装
177
+
178
+ ## Benchmark 结果
179
+
180
+ 本包包含一次完整 LongMemEval S500 测试。
181
+
182
+ - Benchmark:LongMemEval S set,500 条样本
183
+ - Evaluation:official-compatible LongMemEval judge prompt
184
+ - Judge 模型:`gpt-4o`,解析为 `gpt-4o-2024-08-06`
185
+ - 本轮使用的写入层:DeepSeek v4 Flash
186
+ - 本轮使用的回答层:GPT5.4-compatible API
187
+ - 总准确率:`310 / 500 = 62.00%`
188
+
189
+ ## 各任务类型结果
190
+
191
+ | 任务类型 | 准确率 | 数量 |
192
+ | --- | ---: | ---: |
193
+ | single-session-user | 81.43% | 70 |
194
+ | single-session-assistant | 78.57% | 56 |
195
+ | knowledge-update | 70.51% | 78 |
196
+ | temporal-reasoning | 63.16% | 133 |
197
+ | single-session-preference | 56.67% | 30 |
198
+ | multi-session | 39.85% | 133 |
199
+
200
+ benchmark 输出文件位于:
201
+
202
+ ```text
203
+ results/predictions.jsonl
204
+ results/judge_gpt4o_alias_vectorengine.jsonl
205
+ results/judge_gpt4o_alias_vectorengine.jsonl.summary.json
206
+ results/lme_s500_frozen_baseline38_full10_20260525_results.tar.gz
207
+ ```
assets/tmcra-logo.png ADDED

Git LFS Details

  • SHA256: 95a8fc61e6ace316cc46ccaba1208b5b40842008abe273c2f4d875dd5b5a3601
  • Pointer size: 132 Bytes
  • Size of remote file: 1.04 MB
code/memory_adapters.py ADDED
The diff for this file is too large to render. See raw diff
 
code/profile_layer.py ADDED
@@ -0,0 +1,748 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import Any, Iterable, Mapping, Sequence
5
+
6
+
7
# Record categories that is_profile_layer_record() accepts as profile-layer.
PROFILE_CATEGORIES = {"profile", "preference", "goal", "constraint", "stage_state", "status"}
# Closed set of values infer_profile_type() may return.
PROFILE_TYPES = {"setup", "preference", "constraint", "goal", "avoid", "usage_context"}
# Semantic slots that mark a record as profile-layer regardless of category.
PROFILE_SEMANTIC_SLOTS = {"identity", "research_topic", "education", "occupation"}
PROFILE_AGGREGATE_SOURCE_KIND = "public_dialog_profile"
PROFILE_AGGREGATE_CATEGORY = "profile"
PROFILE_CLUSTER_SOURCE_KIND = "public_dialog_profile_cluster"
PROFILE_CLUSTER_CATEGORY = "profile"
PROFILE_CONSOLIDATOR_VERSION = "profile_consolidator_v1_structured_summary"

# Lower-case English and Chinese phrases. NOTE(review): the consumer of this
# tuple is outside the visible part of the file; presumably used for substring
# matching against a normalized query - confirm against the caller.
_PROFILE_QUERY_MARKERS = (
    "preference",
    "prefer",
    "like",
    "dislike",
    "recommend",
    "suggest",
    "advice",
    "advise",
    "any advice",
    "any suggestions",
    "any tips",
    "tips",
    "trouble with",
    "struggling with",
    "what do you think",
    "learn more",
    "resources",
    "suited",
    "fit me",
    "for me",
    "based on me",
    "based on my",
    "my setup",
    "my profile",
    "my goal",
    "my constraint",
    "my occupation",
    "my previous occupation",
    "my role",
    "my previous role",
    "my job",
    "my previous job",
    "what was my",
    "where did i work",
    "worked as",
    "occupation",
    "previous occupation",
    "role",
    "previous role",
    "job",
    "career",
    "background",
    "identity",
    "experience",
    "avoid",
    "what should i",
    "should i",
    "what should",
    "serve",
    "dinner",
    "homegrown",
    "ingredients",
    "battery life",
    "getting around",
    "偏好",
    "喜欢",
    "不喜欢",
    "推荐",
    "建议",
    "适合我",
    "根据我",
    "我的情况",
    "我的配置",
    "我的目标",
    "我的约束",
    "画像",
    "避免",
)
85
+
86
+
87
def _clean_text(value: Any) -> str:
    """Return *value* as a string with whitespace trimmed and collapsed.

    Falsy inputs (``None``, ``""``, ``0``, empty containers) become ``""``;
    any run of whitespace inside the text is replaced by one space.
    """
    return re.sub(r"\s+", " ", str(value or "").strip())
89
+
90
+
91
def _normalize(value: Any) -> str:
    """Return the cleaned text of *value* (see ``_clean_text``) in lower case."""
    return _clean_text(value).lower()
93
+
94
+
95
def _tokens(value: Any) -> list[str]:
    """Split *value* into de-duplicated tokens.

    The normalized text yields two token streams: runs of ``[a-z0-9_]`` and
    each individual character in the U+4E00..U+9FFF range. All ASCII-run
    tokens come first, followed by the single-character tokens, with
    duplicates removed by ``_dedupe``.
    """
    text = _normalize(value)
    english = re.findall(r"[a-z0-9_]+", text)
    cjk = [char for char in text if "\u4e00" <= char <= "\u9fff"]
    return _dedupe([*english, *cjk])
100
+
101
+
102
def _slug(value: Any, *, fallback: str = "general") -> str:
    """Build an underscore-joined slug from *value*.

    The normalized text is split into runs of ``[a-z0-9]`` and single
    characters in U+4E00..U+9FFF; at most the first ten parts are joined with
    ``_``. Returns *fallback* when no part is found.
    """
    text = _normalize(value)
    parts = re.findall(r"[a-z0-9]+|[\u4e00-\u9fff]", text)
    slug = "_".join(parts[:10]).strip("_")
    return slug or fallback
107
+
108
+
109
def _dedupe(items: Iterable[Any], *, max_items: int | None = None) -> list[str]:
    """Return the cleaned, non-empty texts of *items* without duplicates.

    Items are compared case-insensitively on their normalized text; the first
    spelling encountered is the one kept, and input order is preserved.
    Collection stops as soon as *max_items* values have been gathered (the
    check runs after each append, so a non-positive limit still yields the
    first value).
    """
    values: list[str] = []
    seen: set[str] = set()
    for item in items:
        text = _clean_text(item)
        if not text:
            continue
        key = _normalize(text)
        if key in seen:
            continue
        seen.add(key)
        values.append(text)
        if max_items is not None and len(values) >= max_items:
            break
    return values
124
+
125
+
126
def _first_nonempty(*values: Any) -> str:
    """Return the cleaned text of the first argument that is non-empty.

    Returns ``""`` when every argument cleans to an empty string.
    """
    for value in values:
        text = _clean_text(value)
        if text:
            return text
    return ""
132
+
133
+
134
def _bounded_union(*groups: Iterable[Any], max_items: int) -> list[str]:
    """Concatenate *groups* in order and de-duplicate, keeping at most *max_items* texts."""
    return _dedupe([item for group in groups for item in group], max_items=max_items)
136
+
137
+
138
def _bounded_int_union(*groups: Iterable[Any], max_items: int) -> list[int]:
    """Collect distinct integers from *groups* and return them sorted.

    Each item is converted with ``int()``; items that cannot be converted are
    skipped. Values are taken in encounter order until *max_items* distinct
    integers have been gathered, and only then sorted - so the result is the
    sorted form of the first *max_items* distinct values, not the smallest
    *max_items* overall.
    """
    values: list[int] = []
    seen: set[int] = set()
    for group in groups:
        for item in group:
            try:
                value = int(item)
            except (TypeError, ValueError, OverflowError):
                # Unconvertible input (None, non-numeric text, NaN, infinity)
                # is ignored rather than aborting the whole union.
                continue
            if value in seen:
                continue
            seen.add(value)
            values.append(value)
            if len(values) >= max_items:
                return sorted(values)
    return sorted(values)
154
+
155
+
156
def is_profile_layer_record(*, category: Any, source_kind: Any = "", semantic_slot: Any = "", metadata: Mapping[str, Any] | None = None) -> bool:
    """Decide whether a record belongs to the profile layer.

    A record qualifies when any of the following holds, checked in order:

    1. ``metadata["profile_layer"]`` is truthy;
    2. the normalized *category* is in ``PROFILE_CATEGORIES``;
    3. the normalized semantic slot (*semantic_slot*, falling back to
       ``metadata["semantic_slot"]``) is in ``PROFILE_SEMANTIC_SLOTS`` or
       starts with ``"profile_"``;
    4. the normalized *source_kind* is one of the ``public_dialog_*`` profile
       source kinds listed below.
    """
    data = dict(metadata or {})
    if bool(data.get("profile_layer")):
        return True
    category_text = _normalize(category)
    slot_text = _normalize(semantic_slot or data.get("semantic_slot", ""))
    source_text = _normalize(source_kind)
    if category_text in PROFILE_CATEGORIES:
        return True
    if slot_text in PROFILE_SEMANTIC_SLOTS or slot_text.startswith("profile_"):
        return True
    return source_text in {
        "public_dialog_profile",
        "public_dialog_profile_cluster",
        "public_dialog_preference",
        "public_dialog_goal",
        "public_dialog_constraint",
    }
174
+
175
+
176
def infer_profile_type(*, category: Any, semantic_slot: Any = "", relation: Any = "", value: Any = "", metadata: Mapping[str, Any] | None = None) -> str:
    """Classify a profile record into one of ``PROFILE_TYPES``.

    An explicit ``metadata["profile_type"]`` wins when it is a known type.
    Otherwise the normalized *category* and substring markers found in the
    normalized ``"{semantic_slot} {relation} {value}"`` text are tried in this
    order: preference, constraint, goal, avoid, setup. ``"usage_context"`` is
    returned when nothing matches.
    """
    data = dict(metadata or {})
    explicit = _normalize(data.get("profile_type", ""))
    if explicit in PROFILE_TYPES:
        return explicit
    category_text = _normalize(category)
    combined = _normalize(f"{semantic_slot} {relation} {value}")

    def _has_any(text: str, markers: tuple[str, ...]) -> bool:
        return any(marker in text for marker in markers)

    # "dislike" and "不喜欢" contain the preference markers "like" and "喜欢".
    # Matching preference markers on the raw text would classify every
    # dislike statement as a preference and make those avoid markers
    # unreachable, so the negated forms are masked for the preference test only.
    preference_text = combined.replace("dislike", " ").replace("不喜欢", " ")
    if category_text == "preference" or _has_any(preference_text, ("prefer", "like", "default", "偏好", "喜欢", "默认")):
        return "preference"
    if category_text == "constraint" or _has_any(combined, ("must", "cannot", "forbid", "constraint", "必须", "不能", "约束")):
        return "constraint"
    if category_text == "goal" or _has_any(combined, ("goal", "target", "objective", "目标")):
        return "goal"
    if _has_any(combined, ("avoid", "dislike", "do not", "don't", "避免", "不喜欢", "不要")):
        return "avoid"
    if _has_any(combined, ("setup", "environment", "workflow", "current", "配置", "环境", "流程", "当前")):
        return "setup"
    return "usage_context"
194
+
195
+
196
def infer_profile_domain(
    *,
    category: Any,
    semantic_slot: Any = "",
    slot_key: Any = "",
    anchors: Sequence[Any] = (),
    value: Any = "",
    metadata: Mapping[str, Any] | None = None,
) -> tuple[str, str]:
    """Derive a ``(domain_slug, domain_label)`` pair for a profile record.

    Sources are tried in order and the first one that yields text wins:

    1. ``metadata`` keys ``profile_domain``, ``profile_domain_label``, ``domain``;
    2. ``metadata`` keys ``subject``, ``extracted_subject``;
    3. *semantic_slot*, unless it is one of the generic slot names;
    4. the first usable entry of *anchors*;
    5. the last dot-separated segment of *slot_key*;
    6. the first four tokens of *value*;
    7. *category* itself, with the label defaulting to ``"general"``.
    """
    data = dict(metadata or {})
    explicit = _first_nonempty(data.get("profile_domain", ""), data.get("profile_domain_label", ""), data.get("domain", ""))
    if explicit:
        return _slug(explicit), explicit
    subject = _first_nonempty(data.get("subject", ""), data.get("extracted_subject", ""))
    if subject:
        return _slug(subject), subject
    slot = _normalize(semantic_slot)
    if slot and slot not in {"profile", "preference", "goal", "constraint", "event", "fact", "status"}:
        return _slug(slot), slot.replace("_", " ")
    for anchor in anchors:
        anchor_text = _clean_text(anchor)
        if not anchor_text:
            continue
        # Skip anchors that start with four digits, contain one or two digits
        # followed by whitespace and letters, or are a single capitalized
        # ASCII word. NOTE(review): presumably these filter out dates and bare
        # proper names - confirm the intent with the author.
        if re.search(r"^\d{4}|\d{1,2}\s+[A-Za-z]+|^[A-Z][a-z]+$", anchor_text):
            continue
        return _slug(anchor_text), anchor_text
    slot_key_text = _clean_text(slot_key)
    if slot_key_text:
        tail = slot_key_text.split(".")[-1].replace("_", " ")
        if tail:
            return _slug(tail), tail
    value_tokens = _tokens(value)
    if value_tokens:
        label = " ".join(value_tokens[:4])
        return _slug(label), label
    return _slug(category), _normalize(category) or "general"
232
+
233
+
234
def profile_candidate_metadata(
    *,
    category: Any,
    semantic_slot: Any = "",
    relation: Any = "",
    value: Any = "",
    source_span: Any = "",
    slot_key: Any = "",
    anchors: Sequence[Any] = (),
    subject: Any = "",
    subject_signature: Any = "",
    proposal: Mapping[str, Any] | None = None,
) -> dict[str, Any]:
    """Build profile-layer metadata for a freshly written candidate record.

    Returns an empty dict when ``is_profile_layer_record`` rejects the
    candidate. Otherwise returns the inferred type, domain, subject
    signature, support key and route terms, with status and stage defaulting
    to ``"writer_candidate"`` / ``"pre_consolidation"`` unless the proposal
    supplies them. ``source_span`` substitutes for a falsy ``value``.
    """
    hints = dict(proposal or {})

    # Only these fields are shown to the profile-layer gate.
    gate_metadata = {
        "profile_type": hints.get("profile_type", ""),
        "profile_domain": hints.get("profile_domain", ""),
        "profile_domain_label": hints.get("profile_domain_label", ""),
        "subject": subject,
        "extracted_subject": hints.get("extracted_subject", ""),
    }
    if not is_profile_layer_record(category=category, semantic_slot=semantic_slot, metadata=gate_metadata):
        return {}

    evidence_text = value or source_span
    resolved_type = infer_profile_type(
        category=category,
        semantic_slot=semantic_slot,
        relation=relation,
        value=evidence_text,
        metadata=hints,
    )
    domain_slug, domain_label = infer_profile_domain(
        category=category,
        semantic_slot=semantic_slot,
        slot_key=slot_key,
        anchors=anchors,
        value=evidence_text,
        metadata={**hints, "subject": subject},
    )
    signature = _slug(subject_signature or subject or domain_slug, fallback=domain_slug)
    routing = _dedupe(
        [resolved_type, domain_label, semantic_slot, subject, *(anchors or [])],
        max_items=12,
    )
    status = _clean_text(hints.get("profile_candidate_status", "")) or "writer_candidate"
    stage = _clean_text(hints.get("profile_consolidation_stage", "")) or "pre_consolidation"

    return {
        "profile_layer": True,
        "profile_candidate_status": status,
        "profile_consolidation_stage": stage,
        "profile_type": resolved_type,
        "profile_domain": domain_slug,
        "profile_domain_label": domain_label,
        "profile_subject_signature": signature,
        "profile_support_key": f"{resolved_type}:{domain_slug}:{signature}",
        "profile_route_terms": routing,
    }
294
+
295
+
296
def profile_aggregate_slot_key(metadata: Mapping[str, Any]) -> str:
    """Return the slot key under which a profile aggregate node is stored.

    Uses ``profile_support_key`` when it is present; otherwise rebuilds a
    ``type:domain:subject`` key from the individual metadata fields, with
    ``"usage_context"`` and ``"general"`` as defaults.
    """
    fields = dict(metadata or {})
    key = _clean_text(fields.get("profile_support_key", ""))
    if not key:
        kind = _normalize(fields.get("profile_type", "")) or "usage_context"
        domain = _normalize(fields.get("profile_domain", "")) or _slug(fields.get("profile_domain_label", "general"))
        # The subject signature falls back to the domain.
        subject = _normalize(fields.get("profile_subject_signature", "")) or domain
        key = f"{kind}:{domain}:{subject}"
    return f"tmcra.profile.aggregate.{_slug(key)}"
305
+
306
+
307
def profile_aggregate_value(
    *,
    profile_type: Any,
    domain_label: Any,
    support_values: Sequence[Any],
) -> str:
    """Render the one-line text value of a profile aggregate.

    At most five de-duplicated support values are appended, separated by
    ``"; "``. With no values the sentence ends right after the domain.
    """
    kind = _clean_text(profile_type) or "usage_context"
    scope = _clean_text(domain_label) or "general"
    evidence = _dedupe(support_values, max_items=5)
    headline = f"User {kind} profile for {scope}"
    if evidence:
        return f"{headline}: " + "; ".join(evidence)
    return f"{headline}."
319
+
320
+
321
def _profile_output_kind(profile_types: Sequence[Any]) -> str:
    """Map a set of profile types to a single output kind.

    Priority is constraint > goal > preference/avoid > setup; anything else
    yields ``"usage_context_profile"``.
    """
    present = {_normalize(entry) for entry in profile_types if _clean_text(entry)}
    # Ordered from highest to lowest priority; the first hit wins.
    priority = (
        ("constraint_profile", {"constraint"}),
        ("goal_profile", {"goal"}),
        ("preference_profile", {"preference", "avoid"}),
        ("setup_profile", {"setup"}),
    )
    for output_kind, triggers in priority:
        if present & triggers:
            return output_kind
    return "usage_context_profile"
332
+
333
+
334
def _profile_update_policy(profile_types: Sequence[Any]) -> str:
    """Choose the update policy label for a set of profile types.

    A constraint anywhere in the set yields the preserve policy;
    preference, avoid, goal or setup yields the update-on-evidence policy;
    everything else is background context.
    """
    present = set()
    for entry in profile_types:
        if _clean_text(entry):
            present.add(_normalize(entry))
    if "constraint" in present:
        return "preserve_until_explicitly_changed"
    if not present.isdisjoint({"preference", "avoid", "goal", "setup"}):
        return "update_on_newer_user_evidence"
    return "background_context"
341
+
342
+
343
def _profile_memory_type(profile_types: Sequence[Any]) -> str:
    """Map a set of profile types to the memory-type label stored on the node.

    Constraint takes precedence over preference/avoid; everything else is
    ``"profile_context"``.
    """
    present = set()
    for entry in profile_types:
        if _clean_text(entry):
            present.add(_normalize(entry))
    if "constraint" in present:
        return "hard_constraint"
    if not present.isdisjoint({"preference", "avoid"}):
        return "durable_preference"
    return "profile_context"
350
+
351
+
352
def profile_summary(
    *,
    profile_types: Sequence[Any],
    domain_label: Any,
    support_values: Sequence[Any],
    stage: str,
) -> str:
    """Render a human-readable summary line for a profile node.

    The headline depends on the output kind derived from ``profile_types``.
    Up to six de-duplicated support values are listed when ``stage`` is
    ``"cluster"``, otherwise up to four.
    """
    headlines = {
        "constraint_profile": "User constraint profile",
        "goal_profile": "User goal profile",
        "preference_profile": "User preference profile",
        "setup_profile": "User setup profile",
        "usage_context_profile": "User usage-context profile",
    }
    output_kind = _profile_output_kind(profile_types)
    scope = _clean_text(domain_label) or "general"
    limit = 6 if stage == "cluster" else 4
    evidence = _dedupe(support_values, max_items=limit)
    headline = headlines.get(output_kind, "User profile")
    if evidence:
        return f"{headline} for {scope}: " + "; ".join(evidence)
    return f"{headline} for {scope}."
372
+
373
+
374
def build_profile_aggregate_metadata(
    *,
    support_record_id: Any,
    support_turn_index: int,
    support_value: Any,
    support_anchors: Sequence[Any],
    support_metadata: Mapping[str, Any],
    existing_metadata: Mapping[str, Any] | None = None,
) -> dict[str, Any]:
    """Fold one supporting record into a profile aggregate node's metadata.

    Starts from a copy of ``existing_metadata`` and merges in the new
    support record's id, turn index, value and route terms through the
    bounded-union helpers. The type, domain and subject fields are taken
    from ``support_metadata`` and overwrite any existing values, and the
    derived fields (value, summary, output kind, policies, memory type) are
    recomputed from that single profile type.
    """
    incoming = dict(support_metadata or {})
    previous = dict(existing_metadata or {})

    kind = _normalize(incoming.get("profile_type", "")) or "usage_context"
    domain_slug = _normalize(incoming.get("profile_domain", "")) or _slug(incoming.get("profile_domain_label", "general"))
    domain_label = _clean_text(incoming.get("profile_domain_label", "")) or domain_slug.replace("_", " ")
    signature = _clean_text(incoming.get("profile_subject_signature", "")) or domain_slug

    # Accumulated evidence; the bounds are the caps used by the aggregate stage.
    record_ids = _bounded_union(previous.get("profile_support_ids", []) or [], [support_record_id], max_items=64)
    turn_indexes = _bounded_int_union(previous.get("profile_support_turns", []) or [], [support_turn_index], max_items=64)
    evidence_values = _bounded_union(previous.get("profile_support_values", []) or [], [support_value], max_items=12)
    routing = _bounded_union(
        previous.get("profile_route_terms", []) or [],
        incoming.get("profile_route_terms", []) or [],
        support_anchors,
        max_items=24,
    )

    kinds = [kind]
    output_kind = _profile_output_kind(kinds)
    memory_kind = _profile_memory_type(kinds)
    summary_text = profile_summary(
        profile_types=kinds,
        domain_label=domain_label,
        support_values=evidence_values,
        stage="aggregate",
    )
    value_text = profile_aggregate_value(
        profile_type=kind,
        domain_label=domain_label,
        support_values=evidence_values,
    )

    merged = dict(previous)
    merged.update(
        {
            "profile_layer": True,
            "profile_candidate_status": "consolidated",
            "profile_consolidation_stage": "aggregate",
            "profile_consolidator_version": PROFILE_CONSOLIDATOR_VERSION,
            "profile_aggregate_node": True,
            "profile_type": kind,
            "profile_domain": domain_slug,
            "profile_domain_label": domain_label,
            "profile_subject_signature": signature,
            "profile_support_key": f"{kind}:{domain_slug}:{signature}",
            "profile_support_ids": record_ids,
            "profile_support_turns": turn_indexes,
            "profile_support_values": evidence_values,
            "profile_support_count": len(record_ids),
            "profile_route_terms": routing,
            "profile_value": value_text,
            "profile_summary": summary_text,
            "profile_output_kind": output_kind,
            "profile_update_policy": _profile_update_policy(kinds),
            "profile_conflict_policy": "latest_active_support_only",
            "profile_evidence_count": len(record_ids),
            "memory_type": memory_kind,
            "durable_memory_type": memory_kind,
            "memory_chain_depth_layer": "profile",
            "depth_layer": "profile",
        }
    )
    return merged
440
+
441
+
442
# Tokens ignored when comparing or keying profile clusters: profile-type
# vocabulary and common English function words.
_PROFILE_CLUSTER_STOPWORDS = {
    # profile vocabulary
    "user", "profile", "preference", "preferences", "constraint", "constraints",
    "goal", "goals", "avoid", "usage", "context", "general", "default",
    # modal / desire verbs
    "should", "would", "could", "want", "wants", "need", "needs",
    # function words
    "when", "then", "than", "with", "that", "this", "from", "into", "instead",
}
473
+
474
+
475
def profile_cluster_tokens(*values: Any) -> list[str]:
    """Tokenize the given values into de-duplicated cluster tokens.

    List, tuple and set arguments are tokenized item by item; any other
    argument is tokenized as a whole. Stopwords, pure-digit tokens, and
    tokens of length two or less that contain no character in the
    U+4E00..U+9FFF range are dropped. At most 32 tokens are returned.
    """
    collected: list[str] = []
    for entry in values:
        parts = entry if isinstance(entry, (list, tuple, set)) else (entry,)
        for part in parts:
            collected.extend(_tokens(part))

    kept: list[str] = []
    for token in collected:
        if not token or token in _PROFILE_CLUSTER_STOPWORDS or token.isdigit():
            continue
        # Short tokens survive only when they carry a CJK ideograph.
        if len(token) <= 2 and not any("\u4e00" <= char <= "\u9fff" for char in token):
            continue
        kept.append(token)
    return _dedupe(kept, max_items=32)
492
+
493
+
494
def profile_cluster_similarity(source_metadata: Mapping[str, Any], target_metadata: Mapping[str, Any]) -> float:
    """Score how similar two profile nodes are, in ``[0.0, 1.0]``.

    The score blends Jaccard overlap (weight 0.62) and containment relative
    to the smaller token set (weight 0.38), plus a 0.06 bonus when both
    nodes normalize to the same profile type. It is capped at 1.0, rounded
    to six decimals, and is 0.0 when either side has no cluster tokens.
    """
    source = dict(source_metadata or {})
    target = dict(target_metadata or {})

    def _token_set(meta: dict[str, Any]) -> set[str]:
        # Collect the cluster tokens from the metadata fields that describe a node.
        return set(
            profile_cluster_tokens(
                meta.get("profile_domain_label", ""),
                meta.get("profile_domain", ""),
                meta.get("profile_route_terms", []) or [],
                meta.get("profile_support_values", []) or [],
                meta.get("profile_value", ""),
            )
        )

    left = _token_set(source)
    right = _token_set(target)
    if not left or not right:
        return 0.0

    shared = len(left & right)
    overlap = shared / max(1, len(left | right))
    containment = shared / max(1, min(len(left), len(right)))
    same_type = _normalize(source.get("profile_type", "")) == _normalize(target.get("profile_type", ""))
    type_bonus = 0.06 if same_type else 0.0
    return round(min(1.0, (0.62 * overlap) + (0.38 * containment) + type_bonus), 6)
521
+
522
+
523
def profile_cluster_slot_key(metadata: Mapping[str, Any]) -> str:
    """Return the slot key under which a profile cluster node is stored.

    The key is built from the first five cluster tokens when any exist.
    Otherwise it is seeded from the first supporting profile id, or failing
    that from ``profile_support_key`` (default ``"general"``).
    """
    fields = dict(metadata or {})
    cluster_tokens = profile_cluster_tokens(
        fields.get("profile_domain_label", ""),
        fields.get("profile_domain", ""),
        fields.get("profile_route_terms", []) or [],
        fields.get("profile_support_values", []) or [],
        fields.get("profile_value", ""),
    )
    if cluster_tokens:
        return f"tmcra.profile.cluster.{_slug('_'.join(cluster_tokens[:5]))}"

    member_ids = fields.get("profile_support_profile_ids", []) or []
    first_member = _clean_text(member_ids[0] if member_ids else "")
    seed = first_member or _clean_text(fields.get("profile_support_key", "general"))
    return f"tmcra.profile.cluster.{_slug(seed)}"
538
+
539
+
540
def profile_cluster_value(*, support_values: Sequence[Any]) -> str:
    """Render the one-line text value of a profile cluster.

    Lists at most eight de-duplicated support values joined by ``"; "``.
    """
    evidence = _dedupe(support_values, max_items=8)
    if evidence:
        return "User profile cluster: " + "; ".join(evidence)
    return "User profile cluster."
545
+
546
+
547
def build_profile_cluster_metadata(
    *,
    support_profile_id: Any,
    support_metadata: Mapping[str, Any],
    existing_metadata: Mapping[str, Any] | None = None,
) -> dict[str, Any]:
    """Fold one supporting profile node into a cluster node's metadata.

    Starts from a copy of ``existing_metadata`` and merges the supporting
    profile's ids, turns, values, route terms, type and domain through the
    bounded-union helpers. The first entry of the merged type list and of
    the merged domain list become the cluster's primary type and domain.
    An existing subject signature and support key are kept when non-empty.
    """
    incoming = dict(support_metadata or {})
    previous = dict(existing_metadata or {})

    member_ids = _bounded_union(
        previous.get("profile_support_profile_ids", []) or [],
        [support_profile_id],
        max_items=32,
    )
    record_ids = _bounded_union(
        previous.get("profile_support_ids", []) or [],
        incoming.get("profile_support_ids", []) or [],
        max_items=96,
    )
    turn_indexes = _bounded_int_union(
        previous.get("profile_support_turns", []) or [],
        incoming.get("profile_support_turns", []) or [],
        max_items=96,
    )

    # The incoming profile_value is used only when the incoming node has no
    # support values of its own.
    incoming_values = incoming.get("profile_support_values", []) or []
    value_fallback = [] if incoming_values else [incoming.get("profile_value", "")]
    evidence_values = _bounded_union(
        previous.get("profile_support_values", []) or [],
        incoming_values,
        value_fallback,
        max_items=16,
    )

    routing = _bounded_union(
        previous.get("profile_route_terms", []) or [],
        previous.get("profile_cluster_route_terms", []) or [],
        incoming.get("profile_route_terms", []) or [],
        profile_cluster_tokens(
            incoming.get("profile_domain_label", ""),
            incoming.get("profile_domain", ""),
            incoming.get("profile_value", ""),
            incoming.get("profile_support_values", []) or [],
        ),
        max_items=32,
    )
    kinds = _bounded_union(
        previous.get("profile_cluster_types", []) or [],
        [incoming.get("profile_type", "")],
        max_items=8,
    )
    domains = _bounded_union(
        previous.get("profile_cluster_domains", []) or [],
        [incoming.get("profile_domain", ""), incoming.get("profile_domain_label", "")],
        max_items=16,
    )

    primary_kind = _clean_text(kinds[0] if kinds else incoming.get("profile_type", "")) or "usage_context"
    primary_domain = _clean_text(domains[0] if domains else incoming.get("profile_domain", "")) or "general"
    output_kind = _profile_output_kind(kinds)
    memory_kind = _profile_memory_type(kinds)
    summary_text = profile_summary(
        profile_types=kinds,
        domain_label=primary_domain,
        support_values=evidence_values,
        stage="cluster",
    )
    value_text = profile_cluster_value(support_values=evidence_values)
    domain_slug = _slug(primary_domain)

    merged = dict(previous)
    merged.update(
        {
            "profile_layer": True,
            "profile_candidate_status": "consolidated",
            "profile_consolidation_stage": "cluster",
            "profile_consolidator_version": PROFILE_CONSOLIDATOR_VERSION,
            "profile_cluster_node": True,
            "profile_type": primary_kind,
            "profile_domain": domain_slug,
            "profile_domain_label": primary_domain.replace("_", " "),
            "profile_subject_signature": _clean_text(previous.get("profile_subject_signature", "")) or domain_slug,
            "profile_support_key": _clean_text(previous.get("profile_support_key", "")) or f"cluster:{domain_slug}",
            "profile_support_profile_ids": member_ids,
            "profile_support_ids": record_ids,
            "profile_support_turns": turn_indexes,
            "profile_support_values": evidence_values,
            "profile_support_count": len(record_ids),
            "profile_cluster_profile_count": len(member_ids),
            "profile_cluster_types": kinds,
            "profile_cluster_domains": domains,
            "profile_cluster_route_terms": routing,
            "profile_route_terms": routing,
            "profile_value": value_text,
            "profile_summary": summary_text,
            "profile_output_kind": output_kind,
            "profile_update_policy": _profile_update_policy(kinds),
            "profile_conflict_policy": "latest_active_support_only",
            "profile_evidence_count": len(record_ids),
            "memory_type": memory_kind,
            "durable_memory_type": memory_kind,
            "memory_chain_depth_layer": "profile",
            "depth_layer": "profile",
        }
    )
    return merged
643
+
644
+
645
def infer_profile_query_intent(query: Any) -> dict[str, Any]:
    """Detect whether a query targets the profile layer, and which types.

    Returns a dict with:
        ``enabled``: whether any ``_PROFILE_QUERY_MARKERS`` entry occurs in
            the normalized query.
        ``types``: de-duplicated profile types whose markers occur in the
            query; ``"usage_context"`` is added for advice-style queries,
            and as the fallback when the query is enabled but matches no
            type.
        ``tokens``: the tokens of the cleaned query text.
    """
    query_text = _clean_text(query)
    lowered = _normalize(query_text)
    enabled = any(marker in lowered for marker in _PROFILE_QUERY_MARKERS)

    def _mentions(markers: tuple[str, ...]) -> bool:
        return any(marker in lowered for marker in markers)

    # Evaluated in this order; the order fixes the order of the result list.
    type_markers = (
        ("preference", ("prefer", "preference", "like", "dislike", "偏好", "喜欢", "不喜欢")),
        ("constraint", ("constraint", "must", "cannot", "policy", "约束", "限制", "必须", "不能")),
        ("goal", ("goal", "target", "objective", "目标")),
        ("avoid", ("avoid", "dislike", "避免", "不喜欢")),
        ("setup", ("setup", "profile", "current", "occupation", "role", "job", "career", "background", "identity", "experience", "配置", "画像", "当前")),
    )
    types: list[str] = [name for name, markers in type_markers if _mentions(markers)]

    advice_markers = (
        "recommend",
        "suggest",
        "advice",
        "tips",
        "trouble with",
        "struggling with",
        "what do you think",
        "should i",
        "what should",
        "serve",
        "dinner",
        "battery life",
        "getting around",
        "resources",
        "learn more",
        "推荐",
        "建议",
    )
    if enabled and _mentions(advice_markers):
        types.append("usage_context")
    if enabled and not types:
        types.append("usage_context")

    return {
        "enabled": enabled,
        "types": _dedupe(types),
        "tokens": _tokens(query_text),
    }
690
+
691
+
692
def profile_query_score_delta(
    *,
    query: Any,
    query_tokens: set[str],
    category: Any,
    source_kind: Any,
    semantic_slot: Any,
    value: Any,
    anchors: Sequence[Any],
    metadata: Mapping[str, Any] | None = None,
) -> tuple[float, str]:
    """Compute the retrieval score boost a profile record gets for a query.

    Returns ``(0.0, "")`` when the query has no profile intent or the
    record is not a profile-layer record. Otherwise returns
    ``(delta, "profile_route")`` where ``delta`` starts at 0.20 and adds
    0.16 for a type match, ``0.18 * token_overlap``, 0.06 for consolidated
    records and 0.08 for cluster nodes. It is capped at 0.54 and rounded to
    six decimals.
    """
    meta = dict(metadata or {})
    intent = infer_profile_query_intent(query)
    if not intent.get("enabled"):
        return 0.0, ""
    if not is_profile_layer_record(category=category, source_kind=source_kind, semantic_slot=semantic_slot, metadata=meta):
        return 0.0, ""

    record_type = _normalize(meta.get("profile_type", "")) or infer_profile_type(
        category=category,
        semantic_slot=semantic_slot,
        value=value,
        metadata=meta,
    )
    domain = _normalize(meta.get("profile_domain_label", "") or meta.get("profile_domain", ""))
    route_terms = " ".join(str(item) for item in meta.get("profile_route_terms", []) or [])
    anchor_text = " ".join(str(item) for item in anchors)
    record_tokens = set(_tokens(f"{record_type} {domain} {semantic_slot} {value} {anchor_text} {route_terms}"))

    # Jaccard overlap between query and record tokens; 0.0 when both are empty.
    if query_tokens or record_tokens:
        query_set = set(query_tokens)
        overlap = len(query_set & record_tokens) / max(1, len(query_set | record_tokens))
    else:
        overlap = 0.0

    type_match = record_type in set(intent.get("types", []) or [])
    delta = 0.20 + (0.16 if type_match else 0.0) + (0.18 * overlap)
    if meta.get("profile_candidate_status") == "consolidated":
        delta += 0.06
    if meta.get("profile_cluster_node") or meta.get("profile_consolidation_stage") == "cluster":
        delta += 0.08
    return round(min(delta, 0.54), 6), "profile_route"
726
+
727
+
728
def profile_edge_score(source_metadata: Mapping[str, Any], target_metadata: Mapping[str, Any], *, source_value: Any = "", target_value: Any = "") -> tuple[float, str]:
    """Score a candidate edge between two profile-layer nodes.

    Args:
        source_metadata: Metadata of the source node.
        target_metadata: Metadata of the target node.
        source_value: Text value of the source node, used for token overlap.
        target_value: Text value of the target node, used for token overlap.

    Returns:
        ``(score, edge_kind)``. The first matching rule wins:

        * same non-empty ``profile_support_key`` -> ``"profile_support"``
        * same non-empty ``profile_domain`` -> ``"profile_tunnel"``
        * same non-empty ``profile_type`` with token overlap >= 0.22
          -> ``"profile_soft_tunnel"``

        ``(0.0, "")`` when either node lacks a truthy ``profile_layer``
        flag or no rule matches. Scores are rounded to six decimals.
    """
    source = dict(source_metadata or {})
    target = dict(target_metadata or {})
    if not source.get("profile_layer") or not target.get("profile_layer"):
        return 0.0, ""

    def _edge_tokens(meta: dict[str, Any], text: Any) -> set[str]:
        # Route terms are built from Any-typed slots, subjects and anchors, so
        # coerce each item with str() before joining; a bare " ".join(...)
        # raises TypeError on the first non-string term.
        route_terms = " ".join(str(item) for item in meta.get("profile_route_terms", []) or [])
        return set(_tokens(f"{text} {meta.get('profile_domain_label', '')} {route_terms}"))

    source_key = _normalize(source.get("profile_support_key", ""))
    target_key = _normalize(target.get("profile_support_key", ""))
    source_domain = _normalize(source.get("profile_domain", ""))
    target_domain = _normalize(target.get("profile_domain", ""))
    source_type = _normalize(source.get("profile_type", ""))
    target_type = _normalize(target.get("profile_type", ""))
    source_tokens = _edge_tokens(source, source_value)
    target_tokens = _edge_tokens(target, target_value)
    # Jaccard overlap; 0.0 when both token sets are empty.
    overlap = len(source_tokens & target_tokens) / max(1, len(source_tokens | target_tokens)) if source_tokens or target_tokens else 0.0
    if source_key and source_key == target_key:
        return round(0.72 + (0.14 * overlap), 6), "profile_support"
    if source_domain and source_domain == target_domain:
        return round(0.58 + (0.12 if source_type == target_type else 0.04) + (0.12 * overlap), 6), "profile_tunnel"
    if source_type and source_type == target_type and overlap >= 0.22:
        return round(0.44 + (0.18 * overlap), 6), "profile_soft_tunnel"
    return 0.0, ""
code/run_lme_s10_native_tmcra.py ADDED
The diff for this file is too large to render. See raw diff
 
docs/BASELINE_S500_20260525.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # S500 Baseline Record
2
+
3
+ ## Run Identity
4
+
5
+ - Name: frozen baseline38 GPT5.4 full10
6
+ - Date: 2026-05-25
7
+ - Local package: `tmcra_s500_baseline38_release_20260525`
8
+ - Result archive: `results/lme_s500_frozen_baseline38_full10_20260525_results.tar.gz`
9
+ - Judge output: `results/judge_gpt4o_alias_vectorengine.jsonl`
10
+ - Summary: `results/judge_gpt4o_alias_vectorengine.jsonl.summary.json`
11
+
12
+ ## Metrics
13
+
14
+ - Evaluated samples: 500
15
+ - Correct: 310
16
+ - Accuracy: 62.00%
17
+ - Judge alias: `gpt-4o`
18
+ - Judge resolved model: `gpt-4o-2024-08-06`
19
+
20
+ ## Subtask Metrics
21
+
22
+ | subtask | accuracy | count |
23
+ | --- | ---: | ---: |
24
+ | knowledge-update | 70.51% | 78 |
25
+ | multi-session | 39.85% | 133 |
26
+ | single-session-assistant | 78.57% | 56 |
27
+ | single-session-preference | 56.67% | 30 |
28
+ | single-session-user | 81.43% | 70 |
29
+ | temporal-reasoning | 63.16% | 133 |
30
+
31
+ ## Interpretation
32
+
33
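+ This baseline is useful as a frozen comparison point. Stronger areas are single-session direct user facts (81.43%), assistant detail recall (78.57%), and knowledge-update (70.51%). The weakest areas are multi-session aggregation (39.85%) and preference abstraction (56.67%); temporal reasoning (63.16%) sits only marginally above the overall 62.00% accuracy and still has clear room to improve.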
34
+
35
+ The full model output directory is included to preserve all runtime weights and training checkpoints, not only deployment weights.
docs/OPTIONAL_MODULES_AND_PARALLEL.md ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Optional Modules and Parallel Evaluation Plan
2
+
3
+ This document describes two optional extension points already preserved in the current TMCRA package:
4
+
5
+ - Embedder interface
6
+ - LLM planner interface
7
+
8
+ It also summarizes the parallel evaluation pattern used for the S500 baseline so later deployment, evaluation, and ablation runs can reuse the same structure.
9
+
10
+ ## 1. Current Main Path
11
+
12
+ The frozen S500 baseline uses this core path:
13
+
14
+ ```text
15
+ dialogue -> writer layer -> graph memory -> learned node/path scorer -> evidence selection -> answer layer
16
+ ```
17
+
18
+ The responsibilities are:
19
+
20
+ - The writer layer converts dialogue into memory nodes, event units, profile signals, and temporal signals.
21
+ - The graph memory layer stores nodes, paths, and tunnel links.
22
+ - `node_scorer.pt` and `path_scorer.pt` perform learned node/path scoring.
23
+ - Evidence selection converts candidate memories into compact evidence.
24
+ - The answer-layer LLM produces the final response from the selected evidence.
25
+
26
+ Embedder and LLM planner modules are optional enhancement modules. They should not replace the main graph model. They are better treated as auxiliary channels, ablation switches, or higher-cost deployment paths.
27
+
28
+ ## 2. Embedder Interface
29
+
30
+ The current code exposes three embedder integration points.
31
+
32
+ ### 2.1 Write-time Indexing
33
+
34
+ At write time, TMCRA can build an embedding index for newly written memory nodes. That index can later serve as an auxiliary candidate source during retrieval.
35
+
36
+ Relevant configuration:
37
+
38
+ ```bash
39
+ export TMCRA_EMBEDDER_MODEL_PATH="BAAI/bge-m3"
40
+ export TMCRA_EMBEDDER_DEVICE="cuda"
41
+ export TMCRA_EMBEDDER_MODEL_MAX_LENGTH="512"
42
+ export TMCRA_WRITE_EMBEDDER_INDEX_MODE="bge_m3"
43
+ export TMCRA_WRITE_EMBEDDER_INDEX_MAX_TERMS="96"
44
+ ```
45
+
46
+ Purpose:
47
+
48
+ - Build semantic indexes after the writer stores memory nodes.
49
+ - Keep the original graph structure unchanged.
50
+ - Add a semantic candidate channel without replacing the learned node/path scorers.
51
+
52
+ ### 2.2 Pre-recall Candidate Expansion
53
+
54
+ Before graph retrieval, the embedder can find candidate event ids, which are then passed into graph retrieval and scorer ranking.
55
+
56
+ Relevant configuration:
57
+
58
+ ```bash
59
+ export TMCRA_EMBEDDER_PRE_RECALL_MODE="bge_m3"
60
+ export TMCRA_EMBEDDER_PRE_RECALL_K="16"
61
+ export TMCRA_EMBEDDER_INDEX_RECALL_MODE="bge_m3"
62
+ export TMCRA_EMBEDDER_INDEX_RECALL_K="24"
63
+ ```
64
+
65
+ Purpose:
66
+
67
+ - Expand the candidate range before retrieval.
68
+ - Help when query wording and memory wording differ.
69
+ - Provide an auxiliary path for semantically close memories with weak graph paths.
70
+
71
+ ### 2.3 Post-recall Fusion
72
+
73
+ After retrieval, embedder-matched events can be fused with graph-model results so semantically relevant nodes receive a limited boost.
74
+
75
+ Relevant configuration:
76
+
77
+ ```bash
78
+ export TMCRA_EMBEDDER_FUSION_MODE="on"
79
+ export TMCRA_EMBEDDER_FUSION_WEIGHT="0.35"
80
+ export TMCRA_EMBEDDER_FUSION_SCORE_FLOOR="0.62"
81
+ export TMCRA_EMBEDDER_FUSION_TOP_K="16"
82
+ export TMCRA_EMBEDDER_FUSION_SELECT_K="4"
83
+ export TMCRA_EMBEDDER_FUSION_MAX_BOOST="0.42"
84
+ ```
85
+
86
+ Purpose:
87
+
88
+ - Give semantically similar candidates a bounded score boost.
89
+ - Prevent the embedder from directly replacing main evidence ranking.
90
+ - Use embedding as an auxiliary recall layer for the learned graph scorer.
91
+
92
+ ## 3. LLM Planner Interface
93
+
94
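+ The current code exposes three main LLM planner paths. The evidence-unit planner runs after retrieval, the LLM channel planner runs before final evidence is sent to the answer layer, and the query graph builder runs before retrieval.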
95
+
96
+ ### 3.1 Evidence-unit Planner
97
+
98
+ The evidence-unit planner runs after retrieval and uses an LLM to normalize retrieved windows into evidence units.
99
+
100
+ Relevant configuration:
101
+
102
+ ```bash
103
+ export TMCRA_EVIDENCE_UNIT_PLANNER_MODE="on"
104
+ export TMCRA_EVIDENCE_UNIT_PLANNER_BASE_URL="<openai-compatible-base-url>"
105
+ export TMCRA_EVIDENCE_UNIT_PLANNER_MODEL="<planner-model>"
106
+ export TMCRA_EVIDENCE_UNIT_PLANNER_API_KEY="<planner-api-key>"
107
+ export TMCRA_EVIDENCE_UNIT_PLANNER_MAX_CANDIDATES="10"
108
+ export TMCRA_EVIDENCE_UNIT_PLANNER_CHARS="1100"
109
+ export TMCRA_EVIDENCE_UNIT_PLANNER_MAX_TOKENS="760"
110
+ export TMCRA_EVIDENCE_UNIT_PLANNER_REORDER="0"
111
+ ```
112
+
113
+ If planner-specific base/model/key values are not configured, this planner inherits answer-layer configuration:
114
+
115
+ ```bash
116
+ export TMCRA_ANSWER_BASE_URL="<openai-compatible-base-url>"
117
+ export TMCRA_ANSWER_MODEL="<answer-model>"
118
+ export TMCRA_ANSWER_API_KEY="<answer-api-key>"
119
+ ```
120
+
121
+ Purpose:
122
+
123
+ - Mark answer units, positive evidence, temporal anchors, current values, old values, constraints, and negative evidence.
124
+ - Help the final answer layer understand how to use the retrieved evidence.
125
+ - Organize evidence without replacing graph retrieval.
126
+
127
+ ### 3.2 LLM Channel Planner
128
+
129
+ The LLM channel planner runs before final evidence is sent to the answer layer. It separates main evidence, coverage evidence, support evidence, and suppressed evidence.
130
+
131
+ Relevant configuration:
132
+
133
+ ```bash
134
+ export TMCRA_LLM_CHANNEL_PLANNER_MODE="on"
135
+ export TMCRA_LLM_CHANNEL_PLANNER_MAX_WINDOWS="16"
136
+ export TMCRA_LLM_CHANNEL_PLANNER_WINDOW_CHARS="520"
137
+ export TMCRA_LLM_CHANNEL_PLANNER_MAX_TOKENS="700"
138
+ ```
139
+
140
+ Purpose:
141
+
142
+ - Make coverage evidence supplement main facts instead of replacing them.
143
+ - Improve count/sum/ratio/duration/multi-unit tasks.
144
+ - Provide a higher-cost quality mode for experiments and selected deployments.
145
+
146
+ In the frozen S500 baseline record, this module was:
147
+
148
+ ```text
149
+ llm_channel_planner=off
150
+ ```
151
+
152
+ ### 3.3 Query Graph Builder
153
+
154
+ The query graph builder runs before retrieval. It converts the user question into a compact query graph and can expand that graph into sidecar retrieval queries.
155
+
156
+ Relevant configuration:
157
+
158
+ ```bash
159
+ export TMCRA_QUERY_GRAPH_BUILDER_MODE="on"
160
+ export TMCRA_QUERY_GRAPH_BASE_URL="<openai-compatible-base-url>"
161
+ export TMCRA_QUERY_GRAPH_MODEL="<query-graph-model>"
162
+ export TMCRA_QUERY_GRAPH_API_KEY="<query-graph-api-key>"
163
+ export TMCRA_QUERY_GRAPH_MAX_TOKENS="700"
164
+ export TMCRA_QUERY_GRAPH_SIDECAR_RETRIEVAL_MODE="on"
165
+ export TMCRA_QUERY_GRAPH_SIDECAR_MAX_QUERIES="6"
166
+ export TMCRA_QUERY_GRAPH_SIDECAR_TOP_K="4"
167
+ ```
168
+
169
+ Purpose:
170
+
171
+ - Convert the question into task intent, required units, operation, and tunnel needs.
172
+ - Give complex multi-session, temporal, and profile questions a clearer retrieval direction.
173
+ - Test whether building a query graph before retrieval improves candidate recall.
174
+
175
+ ## 4. Local Model Planner vs LLM Planner
176
+
177
+ The code also contains local model planner interfaces, for example:
178
+
179
+ ```bash
180
+ export TMCRA_ANSWER_WINDOW_PLANNER_MODE="on"
181
+ export TMCRA_ANSWER_WINDOW_PLANNER_MODEL_PATH="<planner-checkpoint>"
182
+ export TMCRA_UNIFIED_OPERATION_PLANNER_MODE="on"
183
+ export TMCRA_UNIFIED_OPERATION_PLANNER_MODEL_PATH="<planner-checkpoint>"
184
+ export TMCRA_INJECTION_PLANNER_MODE="guided"
185
+ export TMCRA_INJECTION_PLANNER_MODEL_PATH="<planner-checkpoint>"
186
+ ```
187
+
188
+ These are local model interfaces, not LLM planner interfaces.
189
+
190
+ The distinction is:
191
+
192
+ - LLM planner: calls an external or local LLM; higher cost; useful for validating capability ceilings.
193
+ - Local model planner: lower cost and better for productization, but requires targeted training and stability validation.
194
+
195
+ Recommended workflow:
196
+
197
+ ```text
198
+ validate behavior with an LLM planner -> distill or train the useful behavior into the graph model or a local planner head
199
+ ```
200
+
201
+ ## 5. Parallel Evaluation Plan
202
+
203
+ The S500 baseline used shard-level parallelism:
204
+
205
+ ```text
206
+ 500 samples -> 10 shards -> 50 samples per shard
207
+ ```
208
+
209
+ Each shard runs independently:
210
+
211
+ ```text
212
+ input_shard_N.json -> shard_N/ -> predictions/debug/summary
213
+ ```
214
+
215
+ Core parallelization principles:
216
+
217
+ - One independent process per shard.
218
+ - One independent output directory per shard.
219
+ - Writer key pool is rotated by shard index.
220
+ - Main model weights are read-only and shared.
221
+ - Predictions, samples_debug, and judge results are merged after all shards complete.
222
+
223
+ Key S500 baseline runtime configuration:
224
+
225
+ ```text
226
+ samples=500
227
+ shards=10
228
+ per_shard=50
229
+ writer=DeepSeek v4 Flash
230
+ answer_layer=GPT5.4
231
+ llm_channel_planner=off
232
+ history_mode=controlled_answer_plus_distractors
233
+ ```
234
+
235
+ ### 5.1 Reusable Parallel Template
236
+
237
+ Recommended baseline template:
238
+
239
+ ```bash
240
+ export TMCRA_RETRIEVAL_MODE="hybrid_node_scored"
241
+ export TMCRA_REQUIRE_LEARNED_SCORER="1"
242
+ export TMCRA_NODE_MODEL_DEVICE="cuda"
243
+ export TMCRA_NODE_MODEL_PATH="models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer.pt"
244
+ export TMCRA_PATH_MODEL_PATH="models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer.pt"
245
+
246
+ export TMCRA_WRITER_MODEL="deepseek-chat"
247
+ export TMCRA_WRITER_MAX_TOKENS="512"
248
+ export TMCRA_WRITER_TIMEOUT_SECONDS="180"
249
+ export TMCRA_WRITER_TEMPERATURE="0"
250
+ export TMCRA_WRITER_INPUT_MODE="delta"
251
+ export TMCRA_WRITER_MAX_PROPOSALS="2"
252
+
253
+ export TMCRA_ANSWER_MAX_TOKENS="512"
254
+ ```
255
+
256
+ Single-shard execution shape:
257
+
258
+ ```bash
259
+ python code/run_lme_s10_native_tmcra.py \
260
+ --data "<run-root>/input_shard_N.json" \
261
+ --repo "<tmcra-repo-root>" \
262
+ --service-root "<tmcra-service-root>" \
263
+ --out "<run-root>/shard_N" \
264
+ --limit 50 \
265
+ --top-k 10 \
266
+ --max-distractor-sessions 5 \
267
+ --max-distractor-chunks 1 \
268
+ --max-answer-chunks 4 \
269
+ --chunk-chars 7000
270
+ ```
271
+
272
+ ### 5.2 Suggested Rollout Order
273
+
274
+ Do not enable all optional modules at once. Use staged A/B testing:
275
+
276
+ 1. **Baseline scorer-only**
277
+ - embedder off
278
+ - LLM channel planner off
279
+ - query graph builder off
280
+ - confirms frozen baseline stability
281
+
282
+ 2. **Embedder pre-recall A/B**
283
+ - enable write-time indexing and pre-recall candidate expansion only
284
+ - measure candidate hit rate, retrieval latency, and error-type shifts
285
+
286
+ 3. **Embedder fusion A/B**
287
+ - enable fusion only after pre-recall is stable
288
+ - keep boost bounded so embedder does not override the main graph scorer
289
+
290
+ 4. **Evidence-unit planner A/B**
291
+ - enable LLM evidence-unit planner
292
+ - measure whether the answer layer uses retrieved evidence better
293
+
294
+ 5. **LLM channel planner A/B**
295
+ - test mainly on multi/aggregation/temporal error clusters
296
+ - verify coverage evidence supplements main facts instead of replacing them
297
+
298
+ 6. **Query graph builder A/B**
299
+ - validate the ceiling of query-graph-first retrieval
300
+ - if effective, distill the behavior into query-understanding or graph scorer training
301
+
302
+ ### 5.3 Parallel Scale Guidance
303
+
304
+ Parallelism should not be determined only by the number of API keys. Also watch:
305
+
306
+ - GPU memory
307
+ - CPU memory
308
+ - writer latency
309
+ - answer-layer latency
310
+ - graph ingest / SQLite write overhead
311
+ - average writer calls per shard
312
+
313
+ Scale gradually:
314
+
315
+ ```text
316
+ 5 shards smoke -> 10 shards stable -> 20 shards stress -> 30 shards only if no memory/API/IO issue
317
+ ```
318
+
319
+ If the error rate rises, available memory drops sharply, API 402/429 errors appear, chunk errors occur, or shards stall, reduce parallelism first and then rerun the missing samples.
320
+
321
+ ## 6. Recommended Experiment Matrix
322
+
323
+ Minimal interpretable matrix:
324
+
325
+ | experiment | Embedder | LLM planner | purpose |
326
+ | --- | --- | --- | --- |
327
+ | baseline | off | off | fixed main graph-model baseline |
328
+ | embedder-pre | pre-recall on | off | test candidate expansion |
329
+ | embedder-fusion | pre-recall + fusion on | off | test semantic fusion |
330
+ | evidence-unit | off | evidence-unit on | test pre-answer evidence organization |
331
+ | channel-planner | off | channel planner on | test main/coverage separation |
332
+ | query-graph | off | query graph on | test query-graph retrieval |
333
+ | combined-light | pre-recall on | evidence-unit on | test lower-cost combined path |
334
+ | combined-heavy | pre-recall + fusion on | evidence-unit + channel planner on | test capability ceiling |
335
+
336
+ Each run should preserve:
337
+
338
+ - predictions
339
+ - samples_debug
340
+ - judge output
341
+ - by-task accuracy
342
+ - writer calls
343
+ - retrieval latency
344
+ - answer latency
345
+ - per-sample error type
346
+
347
+ This makes it possible to separate recall errors, evidence-selection errors, planner errors, answer-layer errors, and parallel-runtime instability.
docs/OPTIONAL_MODULES_AND_PARALLEL.zh-CN.md ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 可开启模块接入与并行推进说明
2
+
3
+ 本文说明当前 TMCRA 包中已经保留的两个可开启接入口:
4
+
5
+ - Embedder 接入口
6
+ - LLM planner 接入口
7
+
8
+ 同时说明 S500 基线测试中使用过的并行推进方式,方便后续在部署、评估或消融实验中复用。
9
+
10
+ ## 1. 当前主链路
11
+
12
+ 冻结 S500 基线的核心链路是:
13
+
14
+ ```text
15
+ dialogue -> writer layer -> graph memory -> learned node/path scorer -> evidence selection -> answer layer
16
+ ```
17
+
18
+ 其中:
19
+
20
+ - 写入层负责把对话转成记忆节点、事件单元、profile 信号、时间信号。
21
+ - 图记忆层保存节点、路径和隧穿关系。
22
+ - `node_scorer.pt` 和 `path_scorer.pt` 负责学习式节点/路径打分。
23
+ - evidence selection 把候选记忆整理成紧凑证据。
24
+ - 回答层 LLM 根据证据生成最终回复。
25
+
26
+ Embedder 和 LLM planner 都是可开启增强模块,不应该替代主图模型。它们更适合作为辅助通道、对比实验或高成本部署路径。
27
+
28
+ ## 2. Embedder 接入口
29
+
30
+ Embedder 当前有三类接入位置。
31
+
32
+ ### 2.1 写入阶段索引
33
+
34
+ 写入阶段可以为新写入的记忆建立 embedding 索引,后续召回时作为辅助候选来源。
35
+
36
+ 相关配置:
37
+
38
+ ```bash
39
+ export TMCRA_EMBEDDER_MODEL_PATH="BAAI/bge-m3"
40
+ export TMCRA_EMBEDDER_DEVICE="cuda"
41
+ export TMCRA_EMBEDDER_MODEL_MAX_LENGTH="512"
42
+ export TMCRA_WRITE_EMBEDDER_INDEX_MODE="bge_m3"
43
+ export TMCRA_WRITE_EMBEDDER_INDEX_MAX_TERMS="96"
44
+ ```
45
+
46
+ 作用:
47
+
48
+ - 在 writer 写入记忆节点后,为节点文本建立语义索引。
49
+ - 不改变原图结构。
50
+ - 不替代 node/path scorer,只是给召回增加一条语义候选通道。
51
+
52
+ ### 2.2 召回前候选补充
53
+
54
+ 召回前可以先用 embedder 找到一批候选 event id,再交给图召回和 scorer 做后续排序。
55
+
56
+ 相关配置:
57
+
58
+ ```bash
59
+ export TMCRA_EMBEDDER_PRE_RECALL_MODE="bge_m3"
60
+ export TMCRA_EMBEDDER_PRE_RECALL_K="16"
61
+ export TMCRA_EMBEDDER_INDEX_RECALL_MODE="bge_m3"
62
+ export TMCRA_EMBEDDER_INDEX_RECALL_K="24"
63
+ ```
64
+
65
+ 作用:
66
+
67
+ - 帮助召回阶段扩大候选范围。
68
+ - 对语义相近但图路径弱的记忆提供补充入口。
69
+ - 适合测试 query 与 memory 表达不完全一致的场景。
70
+
71
+ ### 2.3 召回后融合加权
72
+
73
+ 召回后可以把 embedder 命中的 event 与主图模型结果融合,让高语义相关的节点获得有限 boost。
74
+
75
+ 相关配置:
76
+
77
+ ```bash
78
+ export TMCRA_EMBEDDER_FUSION_MODE="on"
79
+ export TMCRA_EMBEDDER_FUSION_WEIGHT="0.35"
80
+ export TMCRA_EMBEDDER_FUSION_SCORE_FLOOR="0.62"
81
+ export TMCRA_EMBEDDER_FUSION_TOP_K="16"
82
+ export TMCRA_EMBEDDER_FUSION_SELECT_K="4"
83
+ export TMCRA_EMBEDDER_FUSION_MAX_BOOST="0.42"
84
+ ```
85
+
86
+ 作用:
87
+
88
+ - 给语义相似候选增加有限分数。
89
+ - 避免 embedder 直接重排主证据。
90
+ - 适合作为主图 scorer 的辅助召回层。
91
+
92
+ ## 3. LLM Planner 接入口
93
+
94
+ LLM planner 当前主要有三类接入方式。它们都位于召回之后或 query 进入召回之前,用于增强证据组织能力。
95
+
96
+ ### 3.1 Evidence-unit planner
97
+
98
+ Evidence-unit planner 在召回后运行,用 LLM 把候选窗口整理成 evidence unit。
99
+
100
+ 相关配置:
101
+
102
+ ```bash
103
+ export TMCRA_EVIDENCE_UNIT_PLANNER_MODE="on"
104
+ export TMCRA_EVIDENCE_UNIT_PLANNER_BASE_URL="<openai-compatible-base-url>"
105
+ export TMCRA_EVIDENCE_UNIT_PLANNER_MODEL="<planner-model>"
106
+ export TMCRA_EVIDENCE_UNIT_PLANNER_API_KEY="<planner-api-key>"
107
+ export TMCRA_EVIDENCE_UNIT_PLANNER_MAX_CANDIDATES="10"
108
+ export TMCRA_EVIDENCE_UNIT_PLANNER_CHARS="1100"
109
+ export TMCRA_EVIDENCE_UNIT_PLANNER_MAX_TOKENS="760"
110
+ export TMCRA_EVIDENCE_UNIT_PLANNER_REORDER="0"
111
+ ```
112
+
113
+ 如果不单独设置 planner 的 base/model/key,它会继承回答层配置:
114
+
115
+ ```bash
116
+ export TMCRA_ANSWER_BASE_URL="<openai-compatible-base-url>"
117
+ export TMCRA_ANSWER_MODEL="<answer-model>"
118
+ export TMCRA_ANSWER_API_KEY="<answer-api-key>"
119
+ ```
120
+
121
+ 作用:
122
+
123
+ - 标注候选窗口里的 answer unit、positive evidence、temporal anchor、current value、old value、constraint、negative evidence。
124
+ - 帮助最终回答层理解“这批证据应该怎么用”。
125
+ - 默认更适合做证据整理,不建议让它直接替代图召回。
126
+
127
+ ### 3.2 LLM channel planner
128
+
129
+ LLM channel planner 在最终证据进入回答层前运行,用 LLM 区分 main evidence、coverage evidence、support evidence 和 suppress evidence。
130
+
131
+ 相关配置:
132
+
133
+ ```bash
134
+ export TMCRA_LLM_CHANNEL_PLANNER_MODE="on"
135
+ export TMCRA_LLM_CHANNEL_PLANNER_MAX_WINDOWS="16"
136
+ export TMCRA_LLM_CHANNEL_PLANNER_WINDOW_CHARS="520"
137
+ export TMCRA_LLM_CHANNEL_PLANNER_MAX_TOKENS="700"
138
+ ```
139
+
140
+ 作用:
141
+
142
+ - 让 coverage 证据补充主事实,而不是替代主事实。
143
+ - 对 count/sum/ratio/duration/multi-unit 问题特别有用。
144
+ - 成本高于纯模型 scorer,适合高质量模式或实验开关。
145
+
146
+ 冻结 S500 基线记录中,该项为:
147
+
148
+ ```text
149
+ llm_channel_planner=off
150
+ ```
151
+
152
+ ### 3.3 Query graph builder
153
+
154
+ Query graph builder 在召回前运行,把用户问题转成 query graph,再扩展为 sidecar retrieval queries。
155
+
156
+ 相关配置:
157
+
158
+ ```bash
159
+ export TMCRA_QUERY_GRAPH_BUILDER_MODE="on"
160
+ export TMCRA_QUERY_GRAPH_BASE_URL="<openai-compatible-base-url>"
161
+ export TMCRA_QUERY_GRAPH_MODEL="<query-graph-model>"
162
+ export TMCRA_QUERY_GRAPH_API_KEY="<query-graph-api-key>"
163
+ export TMCRA_QUERY_GRAPH_MAX_TOKENS="700"
164
+ export TMCRA_QUERY_GRAPH_SIDECAR_RETRIEVAL_MODE="on"
165
+ export TMCRA_QUERY_GRAPH_SIDECAR_MAX_QUERIES="6"
166
+ export TMCRA_QUERY_GRAPH_SIDECAR_TOP_K="4"
167
+ ```
168
+
169
+ 作用:
170
+
171
+ - 把问题拆成 task intent、required units、operation、tunnel needs。
172
+ - 对复杂 multi-session、temporal、profile 问题提供更明确的召回方向。
173
+ - 适合做对比实验,观察“问题建图后再召回”是否提升候选命中。
174
+
175
+ ## 4. 本地模型 planner 与 LLM planner 的区别
176
+
177
+ 当前代码里也有本地模型 planner 接口,例如:
178
+
179
+ ```bash
180
+ export TMCRA_ANSWER_WINDOW_PLANNER_MODE="on"
181
+ export TMCRA_ANSWER_WINDOW_PLANNER_MODEL_PATH="<planner-checkpoint>"
182
+ export TMCRA_UNIFIED_OPERATION_PLANNER_MODE="on"
183
+ export TMCRA_UNIFIED_OPERATION_PLANNER_MODEL_PATH="<planner-checkpoint>"
184
+ export TMCRA_INJECTION_PLANNER_MODE="guided"
185
+ export TMCRA_INJECTION_PLANNER_MODEL_PATH="<planner-checkpoint>"
186
+ ```
187
+
188
+ 这些是本地模型接入口,不是 LLM planner。区别是:
189
+
190
+ - LLM planner:调用外部或本地 LLM,成本更高,适合验证能力上限。
191
+ - 本地模型 planner:成本更低,更适合产品化,但需要专项训练和稳定性验证。
192
+
193
+ 建议流程是:
194
+
195
+ ```text
196
+ 先用 LLM planner 验证能力是否有效 -> 再把有效行为蒸馏/训练进本地图模型或 planner head
197
+ ```
198
+
199
+ ## 5. 并行推进方案
200
+
201
+ S500 基线采用过分片并行方式:
202
+
203
+ ```text
204
+ 500 samples -> 10 shards -> 50 samples per shard
205
+ ```
206
+
207
+ 每个 shard 独立运行:
208
+
209
+ ```text
210
+ input_shard_N.json -> shard_N/ -> predictions/debug/summary
211
+ ```
212
+
213
+ 核心并行原则:
214
+
215
+ - 每个 shard 独立进程。
216
+ - 每个 shard 独立输出目录。
217
+ - writer key pool 按 shard index 轮转。
218
+ - 主模型权重只读共享。
219
+ - 最终合并 predictions、samples_debug、judge 结果。
220
+
221
+ 冻结 S500 记录中的关键运行配置:
222
+
223
+ ```text
224
+ samples=500
225
+ shards=10
226
+ per_shard=50
227
+ writer=DeepSeek v4 Flash
228
+ answer_layer=GPT5.4
229
+ llm_channel_planner=off
230
+ history_mode=controlled_answer_plus_distractors
231
+ ```
232
+
233
+ ### 5.1 复用的并行模板
234
+
235
+ 推荐的并行模板:
236
+
237
+ ```bash
238
+ export TMCRA_RETRIEVAL_MODE="hybrid_node_scored"
239
+ export TMCRA_REQUIRE_LEARNED_SCORER="1"
240
+ export TMCRA_NODE_MODEL_DEVICE="cuda"
241
+ export TMCRA_NODE_MODEL_PATH="models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer.pt"
242
+ export TMCRA_PATH_MODEL_PATH="models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer.pt"
243
+
244
+ export TMCRA_WRITER_MODEL="deepseek-chat"
245
+ export TMCRA_WRITER_MAX_TOKENS="512"
246
+ export TMCRA_WRITER_TIMEOUT_SECONDS="180"
247
+ export TMCRA_WRITER_TEMPERATURE="0"
248
+ export TMCRA_WRITER_INPUT_MODE="delta"
249
+ export TMCRA_WRITER_MAX_PROPOSALS="2"
250
+
251
+ export TMCRA_ANSWER_MAX_TOKENS="512"
252
+ ```
253
+
254
+ 单 shard 执行形态:
255
+
256
+ ```bash
257
+ python code/run_lme_s10_native_tmcra.py \
258
+ --data "<run-root>/input_shard_N.json" \
259
+ --repo "<tmcra-repo-root>" \
260
+ --service-root "<tmcra-service-root>" \
261
+ --out "<run-root>/shard_N" \
262
+ --limit 50 \
263
+ --top-k 10 \
264
+ --max-distractor-sessions 5 \
265
+ --max-distractor-chunks 1 \
266
+ --max-answer-chunks 4 \
267
+ --chunk-chars 7000
268
+ ```
269
+
270
+ ### 5.2 推进顺序
271
+
272
+ 建议按下面顺序推进,不要一次把所有模块全打开:
273
+
274
+ 1. **Baseline scorer-only**
275
+ - embedder off
276
+ - LLM channel planner off
277
+ - query graph builder off
278
+ - 用于确认 frozen baseline 是否稳定。
279
+
280
+ 2. **Embedder pre-recall A/B**
281
+ - 只打开写入索引和召回前候选补充。
282
+ - 观察候选命中率、召回延迟、错误类型是否变化。
283
+
284
+ 3. **Embedder fusion A/B**
285
+ - 在 pre-recall 稳定后打开 fusion。
286
+ - 控制 boost,不允许 embedder 直接压过主图 scorer。
287
+
288
+ 4. **Evidence-unit planner A/B**
289
+ - 打开 LLM evidence-unit planner。
290
+ - 观察 answer 层是否更会使用召回证据。
291
+
292
+ 5. **LLM channel planner A/B**
293
+ - 只在 multi/aggregation/temporal 错误集中验证。
294
+ - 重点观察 coverage 证据是否补充主事实,而不是替换主事实。
295
+
296
+ 6. **Query graph builder A/B**
297
+ - 用于验证“问题建图后再召回”的上限。
298
+ - 如果有效,再考虑训练进 query-understanding 或 graph scorer。
299
+
300
+ ### 5.3 并行规模建议
301
+
302
+ 并行数不要只看 API key 数量,还要看:
303
+
304
+ - GPU 显存
305
+ - CPU 内存
306
+ - writer 延迟
307
+ - answer 层延迟
308
+ - graph ingest/SQLite 写入开销
309
+ - 每 shard 平均 writer calls
310
+
311
+ 建议从小到大:
312
+
313
+ ```text
314
+ 5 shards smoke -> 10 shards stable -> 20 shards stress -> 30 shards only if no memory/API/IO issue
315
+ ```
316
+
317
+ 如果出现错误率升高、可用内存明显下降、API 402/429 错误、chunk error 或 shard 卡住,应先降低并行数,再补跑缺失样本。
318
+
319
+ ## 6. 推荐实验矩阵
320
+
321
+ 最小可解释矩阵:
322
+
323
+ | 实验 | Embedder | LLM planner | 目的 |
324
+ | --- | --- | --- | --- |
325
+ | baseline | off | off | 固定主图模型基线 |
326
+ | embedder-pre | pre-recall on | off | 测候选扩展是否提升 |
327
+ | embedder-fusion | pre-recall + fusion on | off | 测语义融合是否提升 |
328
+ | evidence-unit | off | evidence-unit on | 测回答前证据整理 |
329
+ | channel-planner | off | channel planner on | 测 main/coverage 分离 |
330
+ | query-graph | off | query graph on | 测问题建图召回 |
331
+ | combined-light | pre-recall on | evidence-unit on | 测较低成本组合 |
332
+ | combined-heavy | pre-recall + fusion on | evidence-unit + channel planner on | 测能力上限 |
333
+
334
+ 每一组都应保留:
335
+
336
+ - predictions
337
+ - samples_debug
338
+ - judge output
339
+ - by-task accuracy
340
+ - writer calls
341
+ - retrieval latency
342
+ - answer latency
343
+ - per-sample error type
344
+
345
+ 这样后续可以判断问题来自召回、证据选择、planner、回答层,还是并行运行不稳定。
docs/TRAINING.md ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # TMCRA Training Notes
2
+
3
+ This document summarizes the training direction behind the graph scorer package included in this release.
4
+
5
+ ## Training Goal
6
+
7
+ TMCRA trains graph-scoring components for long-memory retrieval. The goal is to help the runtime decide which memory nodes and graph paths should be surfaced to an answer model for a given user query.
8
+
9
+ The trained model is not intended to replace the answer LLM. It is responsible for memory selection:
10
+
11
+ - identify relevant memory nodes
12
+ - score graph paths between related memory events
13
+ - preserve useful cross-turn and cross-session links
14
+ - reduce noisy or stale evidence before answer generation
15
+
16
+ ## Model Components
17
+
18
+ The released model directory contains two main runtime scorers:
19
+
20
+ - `node_scorer.pt`: scores candidate memory nodes.
21
+ - `path_scorer.pt`: scores graph paths and tunnel links between memory nodes.
22
+
23
+ The training output also includes:
24
+
25
+ - best checkpoints
26
+ - last checkpoints
27
+ - epoch and step checkpoints
28
+ - training summary and logs
29
+ - export manifest
30
+
31
+ ## Training Data Direction
32
+
33
+ Training data is built around dialogue-memory behavior rather than isolated QA pairs. Samples are designed to teach the model how memory should connect across turns and sessions.
34
+
35
+ The major training directions include:
36
+
37
+ - direct user facts
38
+ - assistant-provided details
39
+ - preference/profile signals
40
+ - temporal state changes
41
+ - old-value vs current-value selection
42
+ - cross-session event links
43
+ - multi-evidence aggregation
44
+ - evidence-positive vs noise/negative memory separation
45
+ - unit-to-unit coverage for count/sum/compare tasks
46
+
47
+ ## Graph Memory Supervision
48
+
49
+ Each training example is converted into graph-oriented supervision. Instead of only asking whether a text chunk is relevant, TMCRA trains over:
50
+
51
+ - memory node relevance
52
+ - event-unit relevance
53
+ - path usefulness
54
+ - tunnel/link usefulness
55
+ - evidence role
56
+ - currentness and temporal state
57
+ - whether a candidate should be injected into answer context
58
+
59
+ This allows the runtime to learn memory structure, not only lexical similarity.
60
+
61
+ ## Writer and Scorer Separation
62
+
63
+ TMCRA separates memory writing from graph scoring.
64
+
65
+ The writer extracts candidate memory records from dialogue. The graph model then learns how those records should be selected and connected during retrieval.
66
+
67
+ This separation is important because a long-memory system needs two different abilities:
68
+
69
+ - write useful memory units from conversation
70
+ - retrieve and connect the right units later under noise
71
+
72
+ ## Training Output Included
73
+
74
+ The packaged model output is located at:
75
+
76
+ ```text
77
+ models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/
78
+ ```
79
+
80
+ Runtime files:
81
+
82
+ ```text
83
+ node_scorer.pt
84
+ path_scorer.pt
85
+ export_manifest.json
86
+ ```
87
+
88
+ Full training trace:
89
+
90
+ ```text
91
+ checkpoints/
92
+ node_scorer_best.pt
93
+ path_scorer_best.pt
94
+ node_scorer_last.pt
95
+ path_scorer_last.pt
96
+ train_summary.json
97
+ train.log
98
+ training_issues.jsonl
99
+ ```
100
+
101
+ ## Current Training Lessons
102
+
103
+ The current baseline shows that TMCRA has strong single-session fact recall and assistant-detail recall. It also has working temporal and preference layers.
104
+
105
+ The main remaining training targets are:
106
+
107
+ - stronger multi-session aggregation
108
+ - better unit coverage for count/sum/compare questions
109
+ - deeper temporal graph planning
110
+ - query-graph to memory-graph matching
111
+ - more stable preference abstraction under indirect user requests
112
+
113
+ These directions are the next steps for improving TMCRA from a working long-memory runtime into a stronger general agent-memory layer.
github_push.err.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ To https://github.com/reshuibuduo/TMCRA-agent-memory-algorithm.git
2
+ * [new branch] master -> master
github_push.out.log ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Uploading LFS objects: 100% (27/27), 5.6 GB | 0 B/s, done.
2
+ branch 'master' set up to track 'origin/master'.
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_01.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8aaa0153ae0642e709d287b947403493b461594dc8f7ac7f94b31b00aa32c7cf
3
+ size 238030855
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_02.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9eec8b8fcef26d82c9306d8f96b98915724e8c0d0967ba33ddcfba7dbf983cb4
3
+ size 238030855
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_03.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3137ad50512ef6890eecf0d241348844e2ae43c4a46649b4ade50f29f68c4c33
3
+ size 238030855
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_04.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38c5e248f449080f4316c8f0a28ea8490bb2d9413afda7ce0d4324b256ce025a
3
+ size 238030855
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_05.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61e6ee0e0f02de94c46b7d3bfb8bf954a7588282592fd6507068cb46942bca16
3
+ size 238030855
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_06.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92af8e2ae8e55a29b9abcaffc02f37bb09544188f591b00fb5200fadfb46fbed
3
+ size 238030855
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_07.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dec148fe044f7f94694b4f72b49a01e7a74455f1bee17253beb53f6e6f94ce7
3
+ size 238030855
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_step_0000600.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29d3d1436839617bdda708090cac2727cb66c19dcdcf8634c6474f907b2c82c2
3
+ size 238032983
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_step_0000650.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02077ad6b1df777cf8d9478a3fda234e7059340bce7d320f12184e89225e9510
3
+ size 238032983
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_01.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28e8279e49f7dcc7a9edacf7c827de6bb2d0bc8ac3c2725a12dae314dd847ee1
3
+ size 238030855
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_02.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b895a3a0f2fdb0690ae9ea26b04f2c93a67b23a66a988fab2ad5db12b7c35c7
3
+ size 238030855
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_03.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b6a0480a36e0665efd92edab06f51a3deec6195e72a09bdb253205f91cb4fec
3
+ size 238030855
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_04.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b13d4ca8479fbdec63eebefdf40cbdabc700fb8074b8b7317a619b66260dafb
3
+ size 238030855
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_05.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:182fe18bb0288645186d13e677fbd2b560681794a007efd02de7ad21c39cf581
3
+ size 238030855
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_07.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d1961ac8ee0bcb0d8f9e5ea82de282e990f26fbbe237e8bb8e45d60be827749
3
+ size 238030855
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_step_0000600.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fbbc7510ecc077fe9724593568ea829f5bf2b90e8e59388675052853c1d9d99
3
+ size 238032983
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_step_0000650.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:148b3461a74a390534a1cd5161555e5c310d11c2d0406e785654957eae9178b8
3
+ size 238032983
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/export_manifest.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"answer_calibration_variant": "competition_alignment_v3_trimaze", "answer_plan_variant": "candidate_event_answer_plan_v1", "checkpoint_audit": {"node": {"missing_prefixes": [], "ok": true, "path": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer.pt", "prefix_counts": {"event_head": 10, "event_tunnel_head": 4, "final_event_fusion_head": 4, "final_path_fusion_head": 4, "memory_router_head": 6, "path_head": 11, "path_tunnel_head": 4, "question_intent_head": 12, "temporal_head": 4}, "required_prefixes": ["memory_router_head", "temporal_head", "question_intent_head", "event_head", "path_head", "event_tunnel_head", "path_tunnel_head", "final_event_fusion_head", "final_path_fusion_head"], "saved_head": "event", "state_key_count": 448}, "path": {"missing_prefixes": [], "ok": true, "path": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer.pt", "prefix_counts": {"event_head": 10, "event_tunnel_head": 4, "final_event_fusion_head": 4, "final_path_fusion_head": 4, "memory_router_head": 6, "path_head": 11, "path_tunnel_head": 4, "question_intent_head": 12, "temporal_head": 4}, "required_prefixes": ["memory_router_head", "temporal_head", "question_intent_head", "event_head", "path_head", "event_tunnel_head", "path_tunnel_head", "final_event_fusion_head", "final_path_fusion_head"], "saved_head": "path", "state_key_count": 448}}, "completed_at": "2026-05-24T04:38:50", "created_at": "2026-05-23T20:23:49.564683+00:00", "dataset_version": "typed_tunnel_graph_v1", "encoder_attention_heads": 8, "encoder_attention_layers": 6, "encoder_structural_bias_variant": "tmcra_token_role_relbias_v1", "event_distractor_variant": "candidate_distractor_delta_bce_v1", "event_subgraph_attention_heads": 8, "event_subgraph_attention_layers": 6, "event_subgraph_refiner_variant": "event_centered_subgraph_attn_6l_h448", "focus_counts": {"action_frame_temporal_binding": 58, "cross_session_multi_event_chain": 104, 
"hard_distractor_similar_entity": 61, "normal_non_chain_stability": 16, "numeric_aggregation_unit_coverage": 116, "profile_boundary_mixed_case": 36, "same_entity_unit_to_unit_chain": 104, "status_negative_exclusion_update": 53}, "graph_count": 333, "input_jsonl": "outputs/action_frame_tunnel_final_accepted_20260524/typed_tunnel_accepted.jsonl", "memory_router_variant": "question_pooled_multilabel_v1", "memory_tunnel_variant": "chain_depth_tunnel_support_delta_v1", "message_passing_variant": "support_to_event_bidirectional_shared_typed_v1", "model_arch_version": "node_memory_v11_tmcra_scale49m_trimaze", "node_encoder_variant": "hashed_tmcra_token_attn_pool_6l_w256", "node_max_tokens": 32, "notes": ["Benchmark data is not an input to this builder.", "Typed tunnel signals are encoded as event/path metadata and existing graph node/path types.", "No new node type, edge type, path type, or model tensor shape is introduced.", "Train with train_locomo_node_memory.py and a compatible node-memory checkpoint."], "output_dir": "outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset", "pair_feature_adapter_variant": "residual_pair_feature_adapter_v1", "query_counts": {"test": 54, "train": 439, "val": 55}, "question_encoder_variant": "hashed_tmcra_token_attn_pool_6l_w256", "question_intent_variant": "text_pooled_multihead_v1", "question_max_tokens": 48, "resume": {"resume_checkpoint": "<tmcra-service-root>/models/tmcra_node_memory_v11_priorfusion_zeroload_20260517/node_scorer.pt", "resume_mode": "weights_only_forced", "resume_weights_only": true}, "sample_count": 548, "sample_id_prefix": "actionframe548_", "skipped_count": 0, "source_data_dirs": ["outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset"], "source_manifests": ["outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset/export_manifest.json"], "split_policy": {"test_ratio": 0.09999999999999995, "train_ratio": 0.8, "val_ratio": 0.1}, "status": "completed", 
"trainable_stage": {"frozen_parameter_count": 47963050, "trainable_name_prefixes": ["event_tunnel_head", "final_event_fusion_head", "final_path_fusion_head", "path_tunnel_head"], "trainable_parameter_count": 3831302, "trainable_stage": "tunnel_fusion_only"}, "training_sampling": {"answer_type_counts": {"multi_evidence": 422, "time": 17}, "base_row_count": 439, "effective_row_count": 697, "loss_weight_range_estimate": {"max": 1.549193, "min": 1.319091}, "mode": "source_aware_balanced", "repeat_histogram": {"1": 118, "2": 144}, "source_count": 1, "source_details": {"dataset": {"avg_example_weight_estimate": 1.540283, "loss_source_factor": 1.0, "row_count": 439, "row_share": 1.0, "sampled_row_count_estimate": 697, "sampled_row_share_estimate": 1.0, "sampling_source_factor": 1.0, "weighted_loss_mass": 676.184129}}, "source_row_counts": {"dataset": 439}, "supervision_bucket_counts": {"typed_tunnel_multi_positive": 439}, "weighted_answer_mass_estimate": {"multi_evidence": 653.759589, "time": 22.42454}, "weighted_supervision_bucket_mass_estimate": {"typed_tunnel_multi_positive": 680.095876}}, "typed_tunnel_edge_count": 3855}
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/launch_train.sh ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ cd <tmcra-repo-root>
4
+ DATA=outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset
5
+ OUT="$1"
6
+ <tmcra-service-root>/.venv/bin/python scripts/train_locomo_node_memory.py \
7
+ --data-dir "$DATA" \
8
+ --output-dir "$OUT" \
9
+ --resume-checkpoint <tmcra-service-root>/models/tmcra_node_memory_v11_priorfusion_zeroload_20260517/node_scorer.pt \
10
+ --resume-weights-only \
11
+ --trainable-stage tunnel_fusion_only \
12
+ --epochs 8 \
13
+ --batch-size 8 \
14
+ --lr 8e-05 \
15
+ --fail-on-bad-query-rows \
16
+ --epoch-train-eval-max-rows 80 \
17
+ --epoch-val-eval-max-rows 32 \
18
+ --graph-memory-cache-size 64 \
19
+ --lazy-prefetch-workers 6 \
20
+ --lazy-prefetch-window 24 \
21
+ --batch-prepare-workers 16 \
22
+ --batch-prepare-lookahead-batches 64 \
23
+ --graph-prefetch-lookahead-batches 3 \
24
+ --reserve-cpu-cores 2 \
25
+ --torch-cpu-threads 4 \
26
+ --torch-interop-threads 4 \
27
+ --log-every-steps 20 \
28
+ --checkpoint-every-steps 50 \
29
+ --keep-step-checkpoints 2 \
30
+ --train-sampling-mode source_aware_balanced \
31
+ --loss-group-balancing-mode supervision_bucket \
32
+ --loss-source-alpha 0.4 \
33
+ --loss-blend-uniform-ratio 0.25 \
34
+ --loss-weight-power 0.5 \
35
+ --loss-time-boost 1.45 \
36
+ --loss-multi-evidence-boost 2.0 \
37
+ --loss-temporal-positive-boost 1.2 \
38
+ --loss-min-example-weight 0.65 \
39
+ --loss-max-example-weight 1.85 \
40
+ --l2sp-loss-weight 0.02 \
41
+ --event-selection-positive-coverage-count 3 \
42
+ --path-selection-positive-coverage-count 3 \
43
+ --multi-positive-coverage-fraction 0.7 \
44
+ --multi-positive-recall-coverage-count 6 \
45
+ --multi-positive-event-coverage-count 5 \
46
+ --multi-positive-path-coverage-count 3 \
47
+ --multi-positive-final-event-set-coverage-count 5 \
48
+ > "$OUT/train.log" 2>&1
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb4e0d3f51143ec2822315721f76d2ed636218ae3c6edd488d7fa53774193739
3
+ size 238028279
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb4e0d3f51143ec2822315721f76d2ed636218ae3c6edd488d7fa53774193739
3
+ size 238028279
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7aeba343e330c918956dd53cca9104c580c8d45051ae108608fff0404af5b7c
3
+ size 238028279
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/nohup.log ADDED
File without changes
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6cc3e720625ccbfb9ad7f322cd5e23c4b7b87d31cbb780c30d94ca7d2573c80
3
+ size 238028279
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6cc3e720625ccbfb9ad7f322cd5e23c4b7b87d31cbb780c30d94ca7d2573c80
3
+ size 238028279
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ec5a311d5a1fcf78436310c846f03bc4b70491a7a5b68a065cd485298fbc63e
3
+ size 238028279
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/train.log ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [train_locomo_node_memory] 2026-05-24T04:25:59 gc_process_disabled reason='python_3_12_runtime_stability' scope='cli_process'
2
+ [train_locomo_node_memory] 2026-05-24T04:25:59 query_split_loaded elapsed_seconds=0.022 rows=439 source_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset' split='train'
3
+ [train_locomo_node_memory] 2026-05-24T04:25:59 query_split_loaded elapsed_seconds=0.003 rows=55 source_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset' split='val'
4
+ [train_locomo_node_memory] 2026-05-24T04:25:59 query_split_loaded elapsed_seconds=0.003 rows=54 source_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset' split='test'
5
+ [train_locomo_node_memory] 2026-05-24T04:25:59 graph_source_index_started cache_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset/graph_tensor_cache_v2' cache_write_enabled=True prebuild_skipped_for_warm_cache=False requested_conversation_ids=333 require_cache_hit=False source_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset'
6
+ [train_locomo_node_memory] 2026-05-24T04:25:59 graph_source_index_ready graph_count=333 source_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset' test_rows=54 train_rows=439 val_rows=55
7
+ [train_locomo_node_memory] 2026-05-24T04:26:00 resume_training_state_skipped current_batch_size=8 reason='resume_weights_only' resume_checkpoint='<tmcra-service-root>/models/tmcra_node_memory_v11_priorfusion_zeroload_20260517/node_scorer.pt' saved_batch_size=8
8
+ [train_locomo_node_memory] 2026-05-24T04:26:00 resume_checkpoint_loaded resume_checkpoint='<tmcra-service-root>/models/tmcra_node_memory_v11_priorfusion_zeroload_20260517/node_scorer.pt' resume_mode='weights_only_forced' saved_head='event' state_key_count=448
9
+ [train_locomo_node_memory] 2026-05-24T04:26:00 training_started batch_prepare_lookahead_batches=64 batch_prepare_workers=16 batch_size=8 checkpoint_every_steps=50 cpu_count=36 data_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset' device='cuda' epoch_train_eval=True epoch_train_eval_max_rows=80 epoch_val_eval=True epoch_val_eval_max_rows=32 epochs=8 gc_mode='disabled_during_training' graph_cache_device='cpu' graph_cache_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset/graph_tensor_cache_v2' graph_count=333 graph_memory_cache_size=64 graph_prefetch_lookahead_batches=3 issue_log='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/training_issues.jsonl' lazy_prefetch_window=24 lazy_prefetch_workers=6 log_every_steps=20 max_train_steps=0 output_dir='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557' prebuild_issue_log='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/graph_cache_prebuild_errors.jsonl' reserve_cpu_cores=2 resume_checkpoint='<tmcra-service-root>/models/tmcra_node_memory_v11_priorfusion_zeroload_20260517/node_scorer.pt' resume_mode='weights_only_forced' resume_weights_only=True skip_bad_query_rows=False skip_final_eval=False test_row_limit=0 test_rows=54 torch_cpu_threads=4 torch_interop_threads=4 train_row_limit=0 train_rows=697 train_rows_before_limit=439 train_rows_effective=697 train_sampling_mode='source_aware_balanced' trainable_stage='tunnel_fusion_only' usable_cpu_cores=34 val_row_limit=0 val_rows=55 val_rows_before_limit=55
10
+ [train_locomo_node_memory] 2026-05-24T04:26:00 training_sampling_ready base_row_count=439 effective_row_count=697 loss_weight_max=1.549193 loss_weight_min=1.319091 mode='source_aware_balanced' source_count=1
11
+ [train_locomo_node_memory] 2026-05-24T04:26:31 batch_prepare_pool_event backend='process' event_type='batch_prepare_process_pool_ready' graph_error_stage='train_batch_prepare' worker_count=16
12
+ [train_locomo_node_memory] 2026-05-24T04:26:33 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.061843 batch_graph_error_count=0 batch_l2sp_loss=0.0 batch_loss=7.617344379425049 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=1 epoch_step=1 global_step=1 total_steps=704
13
+ [train_locomo_node_memory] 2026-05-24T04:26:49 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.064798 batch_graph_error_count=0 batch_l2sp_loss=7e-08 batch_loss=6.46565055847168 batch_loss_group_count=1 batch_path_recall_at_3=0.833333 epoch=1 epoch_step=20 global_step=20 total_steps=704
14
+ [train_locomo_node_memory] 2026-05-24T04:27:05 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.086709 batch_graph_error_count=0 batch_l2sp_loss=1.03e-06 batch_loss=7.7209601402282715 batch_loss_group_count=1 batch_path_recall_at_3=0.875 epoch=1 epoch_step=40 global_step=40 total_steps=704
15
+ [train_locomo_node_memory] 2026-05-24T04:27:15 step_checkpoint_saved epoch=1 epoch_step=50 global_step=50 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
16
+ [train_locomo_node_memory] 2026-05-24T04:27:22 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.126139 batch_graph_error_count=0 batch_l2sp_loss=3.54e-06 batch_loss=6.973794937133789 batch_loss_group_count=1 batch_path_recall_at_3=0.875 epoch=1 epoch_step=60 global_step=60 total_steps=704
17
+ [train_locomo_node_memory] 2026-05-24T04:27:39 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.029379 batch_graph_error_count=0 batch_l2sp_loss=5.95e-06 batch_loss=6.7596940994262695 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=1 epoch_step=80 global_step=80 total_steps=704
18
+ [train_locomo_node_memory] 2026-05-24T04:28:03 epoch_completed best_val_loss=4.024884 best_val_selection_score=116.975116 epoch=1 global_step=97 is_best=True train_event_recall_at_5=1.0 train_graph_error_count=0 train_loss=4.217114 train_path_recall_at_3=0.95 val_event_recall_at_5=1.0 val_graph_error_count=0 val_loss=4.024884 val_path_recall_at_3=0.96875 val_selection_score=116.975116
19
+ [train_locomo_node_memory] 2026-05-24T04:28:06 checkpoint_saved epoch=1 global_step=97 is_best=True node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt'
20
+ [train_locomo_node_memory] 2026-05-24T04:28:09 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.131766 batch_graph_error_count=0 batch_l2sp_loss=8.54e-06 batch_loss=6.195566654205322 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=2 epoch_step=3 global_step=100 total_steps=704
21
+ [train_locomo_node_memory] 2026-05-24T04:28:11 step_checkpoint_saved epoch=2 epoch_step=3 global_step=100 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
22
+ [train_locomo_node_memory] 2026-05-24T04:28:30 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.003715 batch_graph_error_count=0 batch_l2sp_loss=1.191e-05 batch_loss=6.659519195556641 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=2 epoch_step=23 global_step=120 total_steps=704
23
+ [train_locomo_node_memory] 2026-05-24T04:28:46 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.061323 batch_graph_error_count=0 batch_l2sp_loss=1.57e-05 batch_loss=5.569675445556641 batch_loss_group_count=1 batch_path_recall_at_3=0.833333 epoch=2 epoch_step=43 global_step=140 total_steps=704
24
+ [train_locomo_node_memory] 2026-05-24T04:28:57 step_checkpoint_saved epoch=2 epoch_step=53 global_step=150 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
25
+ [train_locomo_node_memory] 2026-05-24T04:29:05 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.038297 batch_graph_error_count=0 batch_l2sp_loss=1.985e-05 batch_loss=7.193849563598633 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=2 epoch_step=63 global_step=160 total_steps=704
26
+ [train_locomo_node_memory] 2026-05-24T04:29:21 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.046331 batch_graph_error_count=0 batch_l2sp_loss=2.431e-05 batch_loss=5.3383002281188965 batch_loss_group_count=1 batch_path_recall_at_3=0.875 epoch=2 epoch_step=83 global_step=180 total_steps=704
27
+ [train_locomo_node_memory] 2026-05-24T04:29:40 epoch_completed best_val_loss=3.758544 best_val_selection_score=117.128901 epoch=2 global_step=191 is_best=True train_event_recall_at_5=1.0 train_graph_error_count=0 train_loss=3.925607 train_path_recall_at_3=0.9625 val_event_recall_at_5=1.0 val_graph_error_count=0 val_loss=3.758544 val_path_recall_at_3=0.96875 val_selection_score=117.128901
28
+ [train_locomo_node_memory] 2026-05-24T04:29:44 checkpoint_saved epoch=2 global_step=191 is_best=True node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt'
29
+ [train_locomo_node_memory] 2026-05-24T04:29:53 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.13841 batch_graph_error_count=0 batch_l2sp_loss=2.782e-05 batch_loss=6.168829917907715 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=3 epoch_step=9 global_step=200 total_steps=704
30
+ [train_locomo_node_memory] 2026-05-24T04:29:56 step_checkpoint_saved epoch=3 epoch_step=9 global_step=200 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
31
+ [train_locomo_node_memory] 2026-05-24T04:30:12 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.050877 batch_graph_error_count=0 batch_l2sp_loss=3.105e-05 batch_loss=5.412087917327881 batch_loss_group_count=1 batch_path_recall_at_3=0.75 epoch=3 epoch_step=29 global_step=220 total_steps=704
32
+ [train_locomo_node_memory] 2026-05-24T04:30:27 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.133808 batch_graph_error_count=0 batch_l2sp_loss=3.419e-05 batch_loss=7.1404523849487305 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=3 epoch_step=49 global_step=240 total_steps=704
33
+ [train_locomo_node_memory] 2026-05-24T04:30:38 step_checkpoint_saved epoch=3 epoch_step=59 global_step=250 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
34
+ [train_locomo_node_memory] 2026-05-24T04:30:46 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.0 batch_graph_error_count=0 batch_l2sp_loss=3.686e-05 batch_loss=4.634302616119385 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=3 epoch_step=69 global_step=260 total_steps=704
35
+ [train_locomo_node_memory] 2026-05-24T04:31:02 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.08086 batch_graph_error_count=0 batch_l2sp_loss=3.993e-05 batch_loss=6.546285629272461 batch_loss_group_count=1 batch_path_recall_at_3=0.875 epoch=3 epoch_step=89 global_step=280 total_steps=704
36
+ [train_locomo_node_memory] 2026-05-24T04:31:17 epoch_completed best_val_loss=3.710867 best_val_selection_score=118.164133 epoch=3 global_step=286 is_best=True train_event_recall_at_5=1.0 train_graph_error_count=0 train_loss=3.793919 train_path_recall_at_3=0.9875 val_event_recall_at_5=1.0 val_graph_error_count=0 val_loss=3.710867 val_path_recall_at_3=1.0 val_selection_score=118.164133
37
+ [train_locomo_node_memory] 2026-05-24T04:31:21 checkpoint_saved epoch=3 global_step=286 is_best=True node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt'
38
+ [train_locomo_node_memory] 2026-05-24T04:31:34 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.0 batch_graph_error_count=0 batch_l2sp_loss=4.268e-05 batch_loss=6.9818620681762695 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=4 epoch_step=14 global_step=300 total_steps=704
39
+ [train_locomo_node_memory] 2026-05-24T04:31:36 step_checkpoint_saved epoch=4 epoch_step=14 global_step=300 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
40
+ [train_locomo_node_memory] 2026-05-24T04:31:52 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.02366 batch_graph_error_count=0 batch_l2sp_loss=4.505e-05 batch_loss=6.924047470092773 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=4 epoch_step=34 global_step=320 total_steps=704
41
+ [train_locomo_node_memory] 2026-05-24T04:32:09 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.065861 batch_graph_error_count=0 batch_l2sp_loss=4.73e-05 batch_loss=5.456366062164307 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=4 epoch_step=54 global_step=340 total_steps=704
42
+ [train_locomo_node_memory] 2026-05-24T04:32:20 step_checkpoint_saved epoch=4 epoch_step=64 global_step=350 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
43
+ [train_locomo_node_memory] 2026-05-24T04:32:28 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.033157 batch_graph_error_count=0 batch_l2sp_loss=4.912e-05 batch_loss=7.173478603363037 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=4 epoch_step=74 global_step=360 total_steps=704
44
+ [train_locomo_node_memory] 2026-05-24T04:32:44 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.0 batch_graph_error_count=0 batch_l2sp_loss=5.091e-05 batch_loss=4.523404121398926 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=4 epoch_step=94 global_step=380 total_steps=704
45
+ [train_locomo_node_memory] 2026-05-24T04:32:54 epoch_completed best_val_loss=3.718748 best_val_selection_score=118.424107 epoch=4 global_step=380 is_best=True train_event_recall_at_5=1.0 train_graph_error_count=0 train_loss=3.824029 train_path_recall_at_3=0.9875 val_event_recall_at_5=1.0 val_graph_error_count=0 val_loss=3.718748 val_path_recall_at_3=1.0 val_selection_score=118.424107
46
+ [train_locomo_node_memory] 2026-05-24T04:32:58 checkpoint_saved epoch=4 global_step=380 is_best=True node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt'
47
+ [train_locomo_node_memory] 2026-05-24T04:33:16 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.0 batch_graph_error_count=0 batch_l2sp_loss=5.232e-05 batch_loss=6.215372085571289 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=5 epoch_step=20 global_step=400 total_steps=704
48
+ [train_locomo_node_memory] 2026-05-24T04:33:18 step_checkpoint_saved epoch=5 epoch_step=20 global_step=400 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
49
+ [train_locomo_node_memory] 2026-05-24T04:33:35 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.02998 batch_graph_error_count=0 batch_l2sp_loss=5.362e-05 batch_loss=6.076883316040039 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=5 epoch_step=40 global_step=420 total_steps=704
50
+ [train_locomo_node_memory] 2026-05-24T04:33:50 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.026733 batch_graph_error_count=0 batch_l2sp_loss=5.463e-05 batch_loss=4.687601566314697 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=5 epoch_step=60 global_step=440 total_steps=704
51
+ [train_locomo_node_memory] 2026-05-24T04:34:00 step_checkpoint_saved epoch=5 epoch_step=70 global_step=450 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
52
+ [train_locomo_node_memory] 2026-05-24T04:34:08 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.017054 batch_graph_error_count=0 batch_l2sp_loss=5.565e-05 batch_loss=6.018680095672607 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=5 epoch_step=80 global_step=460 total_steps=704
53
+ [train_locomo_node_memory] 2026-05-24T04:34:29 epoch_completed best_val_loss=3.718748 best_val_selection_score=118.424107 epoch=5 global_step=474 is_best=False train_event_recall_at_5=1.0 train_graph_error_count=0 train_loss=3.451428 train_path_recall_at_3=0.975 val_event_recall_at_5=1.0 val_graph_error_count=0 val_loss=3.512776 val_path_recall_at_3=1.0 val_selection_score=118.487224
54
+ [train_locomo_node_memory] 2026-05-24T04:34:32 checkpoint_saved epoch=5 global_step=474 is_best=False node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt'
55
+ [train_locomo_node_memory] 2026-05-24T04:34:38 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.011568 batch_graph_error_count=0 batch_l2sp_loss=5.656e-05 batch_loss=4.576869964599609 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=6 epoch_step=6 global_step=480 total_steps=704
56
+ [train_locomo_node_memory] 2026-05-24T04:34:54 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.061455 batch_graph_error_count=0 batch_l2sp_loss=5.735e-05 batch_loss=4.150051116943359 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=6 epoch_step=26 global_step=500 total_steps=704
57
+ [train_locomo_node_memory] 2026-05-24T04:34:57 step_checkpoint_saved epoch=6 epoch_step=26 global_step=500 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
58
+ [train_locomo_node_memory] 2026-05-24T04:35:12 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.000704 batch_graph_error_count=0 batch_l2sp_loss=5.803e-05 batch_loss=5.257463455200195 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=6 epoch_step=46 global_step=520 total_steps=704
59
+ [train_locomo_node_memory] 2026-05-24T04:35:28 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.0 batch_graph_error_count=0 batch_l2sp_loss=5.871e-05 batch_loss=5.609147071838379 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=6 epoch_step=66 global_step=540 total_steps=704
60
+ [train_locomo_node_memory] 2026-05-24T04:35:39 step_checkpoint_saved epoch=6 epoch_step=76 global_step=550 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
61
+ [train_locomo_node_memory] 2026-05-24T04:35:46 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.284378 batch_graph_error_count=0 batch_l2sp_loss=5.923e-05 batch_loss=6.41322135925293 batch_loss_group_count=1 batch_path_recall_at_3=0.857143 epoch=6 epoch_step=86 global_step=560 total_steps=704
62
+ [train_locomo_node_memory] 2026-05-24T04:36:03 epoch_completed best_val_loss=3.718748 best_val_selection_score=118.424107 epoch=6 global_step=569 is_best=False train_event_recall_at_5=1.0 train_graph_error_count=0 train_loss=3.54849 train_path_recall_at_3=0.95 val_event_recall_at_5=1.0 val_graph_error_count=0 val_loss=3.330123 val_path_recall_at_3=1.0 val_selection_score=117.669877
63
+ [train_locomo_node_memory] 2026-05-24T04:36:05 checkpoint_saved epoch=6 global_step=569 is_best=False node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt'
64
+ [train_locomo_node_memory] 2026-05-24T04:36:16 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.051387 batch_graph_error_count=0 batch_l2sp_loss=5.974e-05 batch_loss=5.32041072845459 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=7 epoch_step=11 global_step=580 total_steps=704
65
+ [train_locomo_node_memory] 2026-05-24T04:36:32 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.042565 batch_graph_error_count=0 batch_l2sp_loss=6.013e-05 batch_loss=5.378664970397949 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=7 epoch_step=31 global_step=600 total_steps=704
66
+ [train_locomo_node_memory] 2026-05-24T04:36:35 step_checkpoint_saved epoch=7 epoch_step=31 global_step=600 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
67
+ [train_locomo_node_memory] 2026-05-24T04:36:50 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.023837 batch_graph_error_count=0 batch_l2sp_loss=6.043e-05 batch_loss=4.947947978973389 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=7 epoch_step=51 global_step=620 total_steps=704
68
+ [train_locomo_node_memory] 2026-05-24T04:37:05 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.0 batch_graph_error_count=0 batch_l2sp_loss=6.069e-05 batch_loss=5.096177101135254 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=7 epoch_step=71 global_step=640 total_steps=704
69
+ [train_locomo_node_memory] 2026-05-24T04:37:16 step_checkpoint_saved epoch=7 epoch_step=81 global_step=650 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
70
+ [train_locomo_node_memory] 2026-05-24T04:37:24 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.008784 batch_graph_error_count=0 batch_l2sp_loss=6.089e-05 batch_loss=6.337571144104004 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=7 epoch_step=91 global_step=660 total_steps=704
71
+ [train_locomo_node_memory] 2026-05-24T04:37:37 epoch_completed best_val_loss=3.718748 best_val_selection_score=118.424107 epoch=7 global_step=665 is_best=False train_event_recall_at_5=1.0 train_graph_error_count=0 train_loss=3.466264 train_path_recall_at_3=1.0 val_event_recall_at_5=1.0 val_graph_error_count=0 val_loss=3.604544 val_path_recall_at_3=1.0 val_selection_score=117.895456
72
+ [train_locomo_node_memory] 2026-05-24T04:37:40 checkpoint_saved epoch=7 global_step=665 is_best=False node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt'
73
+ [train_locomo_node_memory] 2026-05-24T04:37:41 final_eval_started batch_size=8 rows=439 split='train'
74
+ [train_locomo_node_memory] 2026-05-24T04:37:41 lazy_graph_cache_progress disk_cache_hits=1 graph_count=333 known_graph_count=333 last_conversation_id='actionframe548_typed_tunnel_v1_50' loaded_from='disk_cache' loads=1 memory_cache_hits=0 memory_cache_size=1 prefetch_completed=1 prefetch_pending=11 prefetch_submitted=12 source_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset' tensorized=0
75
+ [train_locomo_node_memory] 2026-05-24T04:38:34 final_eval_completed event_recall_at_5=0.997722 path_recall_at_3=0.98861 rows=439 split='train' temporal_accuracy=0.235294
76
+ [train_locomo_node_memory] 2026-05-24T04:38:34 final_eval_started batch_size=8 rows=55 split='val'
77
+ [train_locomo_node_memory] 2026-05-24T04:38:42 final_eval_completed event_recall_at_5=1.0 path_recall_at_3=1.0 rows=55 split='val' temporal_accuracy=0.388889
78
+ [train_locomo_node_memory] 2026-05-24T04:38:42 query_split_load_deferred_started source_count=1 split='test'
79
+ [train_locomo_node_memory] 2026-05-24T04:38:42 query_split_loaded elapsed_seconds=0.016 rows=54 source_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset' split='test'
80
+ [train_locomo_node_memory] 2026-05-24T04:38:42 query_split_load_deferred_completed rows=54 split='test'
81
+ [train_locomo_node_memory] 2026-05-24T04:38:42 final_eval_started batch_size=8 rows=54 split='test'
82
+ [train_locomo_node_memory] 2026-05-24T04:38:47 final_eval_completed event_recall_at_5=1.0 path_recall_at_3=1.0 rows=54 split='test' temporal_accuracy=0.434783
83
+ [train_locomo_node_memory] 2026-05-24T04:38:50 unified_checkpoint_audit_passed node_missing_prefixes='' path_missing_prefixes='' required_prefix_count=9
84
+ [train_locomo_node_memory] 2026-05-24T04:38:50 training_completed best_val_loss=3.718748 elapsed_seconds=770.323 issue_count=1 node_model_path='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer.pt' path_model_path='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer.pt' promoted_checkpoint_source='best' test_event_recall_at_5=1.0 test_path_recall_at_3=1.0 val_event_recall_at_5=1.0 val_path_recall_at_3=1.0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/train.pid ADDED
@@ -0,0 +1 @@
 
 
1
+ 826216
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/train_summary.json ADDED
The diff for this file is too large to render. See raw diff
 
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/train_summary.partial.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"answer_calibration_variant": "competition_alignment_v3_trimaze", "answer_plan_variant": "candidate_event_answer_plan_v1", "artifacts": {"checkpoints_dir": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints", "node_model_best_path": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_best.pt", "node_model_last_path": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt", "path_model_best_path": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_best.pt", "path_model_last_path": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt"}, "best_val_loss": 3.718748, "best_val_selection_score": 118.424107, "config": {"amp": true, "answer_calibration_variant": "competition_alignment_v3_trimaze", "answer_plan_current_old_margin": 0.2, "answer_plan_current_old_margin_loss_weight": 0.0, "answer_plan_loss_weight": 0.18, "answer_plan_selected_negative_margin": 0.3, "answer_plan_selected_negative_margin_loss_weight": 0.0, "answer_plan_selection_margin": 0.12, "answer_plan_selection_top_k": 5, "answer_plan_variant": "candidate_event_answer_plan_v1", "answer_refusal_loss_weight": 0.08, "answer_refusal_margin": 0.15, "answer_type_loss_weight": 0.2, "batch_prepare_lookahead_batches": 64, "batch_prepare_workers": 16, "batch_size": 8, "checkpoint_every_steps": 50, "early_stopping_patience": 3, "encoder_attention_heads": 8, "encoder_attention_layers": 6, "encoder_structural_bias_variant": "tmcra_token_role_relbias_v1", "epoch_train_eval_enabled": true, "epoch_train_eval_max_rows": 80, "epoch_val_eval_enabled": true, "epoch_val_eval_max_rows": 32, "epochs": 8, "event_distractor_loss_weight": 0.15, "event_distractor_variant": "candidate_distractor_delta_bce_v1", "event_hard_negative_loss_weight": 0.35, "event_hard_negative_margin": 0.35, "event_loss_weight": 1.1, "event_matrix_delta_loss_weight": 0.08, 
"event_matrix_delta_margin": 0.15, "event_selection_loss_weight": 0.35, "event_selection_margin": 0.2, "event_selection_positive_coverage_count": 3, "event_selection_top_k": 5, "event_subgraph_attention_heads": 8, "event_subgraph_attention_layers": 6, "event_subgraph_refiner_variant": "event_centered_subgraph_attn_6l_h448", "event_tunnel_loss_weight": 0.12, "event_tunnel_margin": 0.16, "event_tunnel_selection_loss_weight": 0.08, "final_event_set_loss_weight": 0.35, "final_event_set_margin": 0.08, "final_event_set_positive_coverage_count": 3, "final_event_set_support_path_k": 3, "final_event_set_top_k": 10, "grad_clip": 1.0, "graph_prefetch_lookahead_batches": 3, "l2sp_loss_weight": 0.02, "log_every_steps": 20, "loss_blend_uniform_ratio": 0.25, "loss_group_balancing_mode": "supervision_bucket", "loss_max_example_weight": 1.85, "loss_min_example_weight": 0.65, "loss_multi_evidence_boost": 2.0, "loss_source_alpha": 0.4, "loss_temporal_positive_boost": 1.2, "loss_time_boost": 1.45, "loss_weight_power": 0.5, "lr": 8e-05, "max_train_steps": 0, "memory_router_loss_weight": 0.18, "memory_router_variant": "question_pooled_multilabel_v1", "memory_tunnel_variant": "chain_depth_tunnel_support_delta_v1", "message_passing_variant": "support_to_event_bidirectional_shared_typed_v1", "model_arch_version": "node_memory_v11_tmcra_scale49m_trimaze", "multi_positive_coverage_fraction": 0.7, "multi_positive_event_coverage_count": 5, "multi_positive_final_event_set_coverage_count": 5, "multi_positive_path_coverage_count": 3, "multi_positive_recall_coverage_count": 6, "node_encoder_variant": "hashed_tmcra_token_attn_pool_6l_w256", "node_max_tokens": 32, "online_event_hard_negative_limit": 6, "online_path_hard_negative_limit": 8, "pair_feature_adapter_variant": "residual_pair_feature_adapter_v1", "path_hard_negative_loss_weight": 0.25, "path_hard_negative_margin": 0.28, "path_loss_weight": 0.55, "path_matrix_delta_loss_weight": 0.08, "path_matrix_delta_margin": 0.12, 
"path_selection_loss_weight": 0.3, "path_selection_margin": 0.15, "path_selection_positive_coverage_count": 3, "path_selection_top_k": 3, "path_tunnel_delta_loss_weight": 0.12, "path_tunnel_loss_weight": 0.06, "path_tunnel_margin": 0.16, "path_tunnel_selection_loss_weight": 0.04, "question_encoder_variant": "hashed_tmcra_token_attn_pool_6l_w256", "question_intent_variant": "text_pooled_multihead_v1", "question_max_tokens": 48, "question_understanding_loss_weight": 0.12, "recall_loss_weight": 0.6, "recall_selection_loss_weight": 0.3, "recall_selection_margin": 0.12, "recall_selection_positive_coverage_count": 3, "recall_selection_top_k": 24, "sampling_blend_uniform_ratio": 0.35, "sampling_max_conversation_multiplier": 2.0, "sampling_max_group_repeat": 2, "sampling_multi_evidence_boost": 1.45, "sampling_source_alpha": 0.35, "sampling_temporal_positive_boost": 1.15, "sampling_time_boost": 1.2, "temporal_loss_weight": 0.3, "token_role_loss_weight": 0.08, "train_sampling_mode": "source_aware_balanced", "trainable_stage": "tunnel_fusion_only", "warmup_ratio": 0.06, "weight_decay": 0.01}, "device": "cuda", "encoder_attention_heads": 8, "encoder_attention_layers": 6, "encoder_structural_bias_variant": "tmcra_token_role_relbias_v1", "epoch": 7, "event_distractor_variant": "candidate_distractor_delta_bce_v1", "event_subgraph_attention_heads": 8, "event_subgraph_attention_layers": 6, "event_subgraph_refiner_variant": "event_centered_subgraph_attn_6l_h448", "final_eval_progress": {"completed_splits": ["train", "val", "test"], "test_metrics": {"answer_plan_current_top1_accuracy": 0.296296, "answer_plan_current_total": 54, "answer_plan_loss": 0.519994, "answer_plan_loss_count": 54, "answer_plan_selected_positive_coverage_at_5": 0.778761, "answer_plan_selected_positive_total": 113, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 54, "answer_refusal_loss": 0.0, "answer_refusal_loss_count": 54, "answer_type_loss": 3.65495, "answer_type_loss_count": 54, 
"answer_type_metrics": {"multi_evidence": {"answer_plan_current_top1_accuracy": 0.129032, "answer_plan_current_total": 31, "answer_plan_selected_positive_coverage_at_5": 0.776119, "answer_plan_selected_positive_total": 67, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 31, "event5_positive_total": 67, "event_positive_coverage_at_5": 0.940298, "event_recall_at_1": 0.903226, "event_recall_at_5": 1.0, "event_recall_total": 31, "path3_positive_total": 93, "path_positive_coverage_at_3": 0.849462, "path_recall_at_3": 1.0, "path_recall_total": 31, "recall_event24_positive_total": 67, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 31, "samples": 31, "temporal_accuracy": 0.0, "temporal_total": 0}, "time": {"answer_plan_current_top1_accuracy": 0.521739, "answer_plan_current_total": 23, "answer_plan_selected_positive_coverage_at_5": 0.782609, "answer_plan_selected_positive_total": 46, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 23, "event5_positive_total": 46, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.913044, "event_recall_at_5": 1.0, "event_recall_total": 23, "path3_positive_total": 69, "path_positive_coverage_at_3": 0.855072, "path_recall_at_3": 1.0, "path_recall_total": 23, "recall_event24_positive_total": 46, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 23, "samples": 23, "temporal_accuracy": 0.434783, "temporal_total": 23}}, "evaluation_limited": false, "event5_positive_total": 113, "event_distractor_loss": 0.652414, "event_distractor_loss_count": 54, "event_hard_negative_loss": 0.095313, "event_hard_negative_loss_count": 54, "event_loss": 0.388887, "event_loss_count": 54, "event_matrix_delta_loss": 0.121482, "event_matrix_delta_loss_count": 54, "event_positive_coverage_at_5": 0.964602, "event_recall_at_1": 0.907407, "event_recall_at_5": 1.0, "event_recall_total": 54, 
"event_selection_loss": 0.033558, "event_selection_loss_count": 54, "event_tunnel_loss": 0.257923, "event_tunnel_loss_count": 54, "event_tunnel_selection_loss": 0.0, "event_tunnel_selection_loss_count": 54, "final_event_set_loss": 0.006052, "final_event_set_loss_count": 54, "graph_error_count": 0, "loss": 3.097328, "loss_group_count": 7, "memory_router_exact_match": 0.851852, "memory_router_f1": 0.962963, "memory_router_loss": 0.407471, "memory_router_loss_count": 54, "memory_router_total": 54, "path3_positive_total": 162, "path_hard_negative_loss": 0.211284, "path_hard_negative_loss_count": 54, "path_loss": 0.429968, "path_loss_count": 54, "path_matrix_delta_loss": 0.494685, "path_matrix_delta_loss_count": 54, "path_positive_coverage_at_3": 0.851852, "path_recall_at_3": 1.0, "path_recall_total": 54, "path_selection_loss": 0.245353, "path_selection_loss_count": 54, "path_tunnel_delta_loss": 0.720266, "path_tunnel_delta_loss_count": 54, "path_tunnel_delta_positive_coverage_at_3": 0.487654, "path_tunnel_delta_positive_total": 162, "path_tunnel_delta_recall_at_3": 0.87037, "path_tunnel_delta_recall_total": 54, "path_tunnel_loss": 0.348827, "path_tunnel_loss_count": 54, "path_tunnel_rescue025_recall_at_3": 1.0, "path_tunnel_rescue025_recall_total": 54, "path_tunnel_rescue050_recall_at_3": 0.981481, "path_tunnel_rescue050_recall_total": 54, "path_tunnel_rescue100_recall_at_3": 0.981481, "path_tunnel_rescue100_recall_total": 54, "path_tunnel_selection_loss": 0.168474, "path_tunnel_selection_loss_count": 54, "path_tunnel_support_positive_coverage_at_3": 0.777778, "path_tunnel_support_positive_total": 162, "path_tunnel_support_recall_at_3": 0.981481, "path_tunnel_support_recall_total": 54, "question_understanding_loss": 3.784985, "question_understanding_loss_count": 54, "recall_event24_positive_total": 113, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 54, "recall_loss": 0.925862, "recall_loss_count": 54, 
"recall_selection_loss": 0.0, "recall_selection_loss_count": 54, "rows_evaluated": 54, "rows_total": 54, "samples": 54, "supervision_bucket_metrics": {"typed_tunnel_multi_positive": {"answer_plan_current_top1_accuracy": 0.296296, "answer_plan_current_total": 54, "answer_plan_selected_positive_coverage_at_5": 0.778761, "answer_plan_selected_positive_total": 113, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 54, "event5_positive_total": 113, "event_positive_coverage_at_5": 0.964602, "event_recall_at_1": 0.907407, "event_recall_at_5": 1.0, "event_recall_total": 54, "path3_positive_total": 162, "path_positive_coverage_at_3": 0.851852, "path_recall_at_3": 1.0, "path_recall_total": 54, "recall_event24_positive_total": 113, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 54, "samples": 54, "temporal_accuracy": 0.434783, "temporal_total": 23}}, "temporal_accuracy": 0.434783, "temporal_loss": 0.466158, "temporal_loss_count": 23, "temporal_total": 23, "token_role_loss": 0.0, "token_role_loss_count": 7, "training_weight_max": 1.35, "training_weight_mean": 1.35, "training_weight_min": 1.35}, "train_metrics": {"answer_plan_current_top1_accuracy": 0.418764, "answer_plan_current_total": 437, "answer_plan_loss": 0.551131, "answer_plan_loss_count": 439, "answer_plan_selected_positive_coverage_at_5": 0.794829, "answer_plan_selected_positive_total": 1199, "answer_plan_selected_recall_at_5": 0.970387, "answer_plan_selected_total": 439, "answer_refusal_loss": 0.0, "answer_refusal_loss_count": 439, "answer_type_loss": 5.186379, "answer_type_loss_count": 439, "answer_type_metrics": {"multi_evidence": {"answer_plan_current_top1_accuracy": 0.428571, "answer_plan_current_total": 420, "answer_plan_selected_positive_coverage_at_5": 0.796217, "answer_plan_selected_positive_total": 1163, "answer_plan_selected_recall_at_5": 0.971564, "answer_plan_selected_total": 422, "event5_positive_total": 1163, 
"event_positive_coverage_at_5": 0.986242, "event_recall_at_1": 0.845972, "event_recall_at_5": 0.99763, "event_recall_total": 422, "path3_positive_total": 1266, "path_positive_coverage_at_3": 0.888626, "path_recall_at_3": 0.990521, "path_recall_total": 422, "recall_event24_positive_total": 1163, "recall_event_positive_coverage_at_24": 0.993121, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 422, "samples": 422, "temporal_accuracy": 0.0, "temporal_total": 0}, "time": {"answer_plan_current_top1_accuracy": 0.176471, "answer_plan_current_total": 17, "answer_plan_selected_positive_coverage_at_5": 0.75, "answer_plan_selected_positive_total": 36, "answer_plan_selected_recall_at_5": 0.941176, "answer_plan_selected_total": 17, "event5_positive_total": 36, "event_positive_coverage_at_5": 0.944444, "event_recall_at_1": 0.647059, "event_recall_at_5": 1.0, "event_recall_total": 17, "path3_positive_total": 51, "path_positive_coverage_at_3": 0.784314, "path_recall_at_3": 0.941176, "path_recall_total": 17, "recall_event24_positive_total": 36, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 17, "samples": 17, "temporal_accuracy": 0.235294, "temporal_total": 17}}, "evaluation_limited": false, "event5_positive_total": 1199, "event_distractor_loss": 0.671777, "event_distractor_loss_count": 439, "event_hard_negative_loss": 0.090675, "event_hard_negative_loss_count": 439, "event_loss": 0.352816, "event_loss_count": 439, "event_matrix_delta_loss": 0.134491, "event_matrix_delta_loss_count": 439, "event_positive_coverage_at_5": 0.984987, "event_recall_at_1": 0.838269, "event_recall_at_5": 0.997722, "event_recall_total": 439, "event_selection_loss": 0.011378, "event_selection_loss_count": 439, "event_tunnel_loss": 0.235026, "event_tunnel_loss_count": 439, "event_tunnel_selection_loss": 0.0, "event_tunnel_selection_loss_count": 439, "final_event_set_loss": 0.023022, "final_event_set_loss_count": 439, 
"graph_error_count": 0, "loss": 3.572644, "loss_group_count": 60, "memory_router_exact_match": 0.366743, "memory_router_f1": 0.845193, "memory_router_loss": 1.455285, "memory_router_loss_count": 439, "memory_router_total": 439, "path3_positive_total": 1317, "path_hard_negative_loss": 0.128464, "path_hard_negative_loss_count": 439, "path_loss": 0.392987, "path_loss_count": 439, "path_matrix_delta_loss": 0.489592, "path_matrix_delta_loss_count": 439, "path_positive_coverage_at_3": 0.884586, "path_recall_at_3": 0.98861, "path_recall_total": 439, "path_selection_loss": 0.139808, "path_selection_loss_count": 439, "path_tunnel_delta_loss": 0.629687, "path_tunnel_delta_loss_count": 439, "path_tunnel_delta_positive_coverage_at_3": 0.458618, "path_tunnel_delta_positive_total": 1317, "path_tunnel_delta_recall_at_3": 0.788155, "path_tunnel_delta_recall_total": 439, "path_tunnel_loss": 0.352045, "path_tunnel_loss_count": 439, "path_tunnel_rescue025_recall_at_3": 0.98861, "path_tunnel_rescue025_recall_total": 439, "path_tunnel_rescue050_recall_at_3": 0.981777, "path_tunnel_rescue050_recall_total": 439, "path_tunnel_rescue100_recall_at_3": 0.977221, "path_tunnel_rescue100_recall_total": 439, "path_tunnel_selection_loss": 0.15888, "path_tunnel_selection_loss_count": 439, "path_tunnel_support_positive_coverage_at_3": 0.800304, "path_tunnel_support_positive_total": 1317, "path_tunnel_support_recall_at_3": 0.938497, "path_tunnel_support_recall_total": 439, "question_understanding_loss": 5.304419, "question_understanding_loss_count": 439, "recall_event24_positive_total": 1199, "recall_event_positive_coverage_at_24": 0.993328, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 439, "recall_loss": 0.878647, "recall_loss_count": 439, "recall_selection_loss": 0.0, "recall_selection_loss_count": 439, "rows_evaluated": 439, "rows_total": 439, "samples": 439, "supervision_bucket_metrics": {"typed_tunnel_multi_positive": {"answer_plan_current_top1_accuracy": 0.418764, 
"answer_plan_current_total": 437, "answer_plan_selected_positive_coverage_at_5": 0.794829, "answer_plan_selected_positive_total": 1199, "answer_plan_selected_recall_at_5": 0.970387, "answer_plan_selected_total": 439, "event5_positive_total": 1199, "event_positive_coverage_at_5": 0.984987, "event_recall_at_1": 0.838269, "event_recall_at_5": 0.997722, "event_recall_total": 439, "path3_positive_total": 1317, "path_positive_coverage_at_3": 0.884586, "path_recall_at_3": 0.98861, "path_recall_total": 439, "recall_event24_positive_total": 1199, "recall_event_positive_coverage_at_24": 0.993328, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 439, "samples": 439, "temporal_accuracy": 0.235294, "temporal_total": 17}}, "temporal_accuracy": 0.235294, "temporal_loss": 0.558015, "temporal_loss_count": 17, "temporal_total": 17, "token_role_loss": 0.0, "token_role_loss_count": 60, "training_weight_max": 1.35, "training_weight_mean": 1.35, "training_weight_min": 1.35}, "val_metrics": {"answer_plan_current_top1_accuracy": 0.490909, "answer_plan_current_total": 55, "answer_plan_loss": 0.528866, "answer_plan_loss_count": 55, "answer_plan_selected_positive_coverage_at_5": 0.843284, "answer_plan_selected_positive_total": 134, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 55, "answer_refusal_loss": 0.0, "answer_refusal_loss_count": 55, "answer_type_loss": 5.884438, "answer_type_loss_count": 55, "answer_type_metrics": {"multi_evidence": {"answer_plan_current_top1_accuracy": 0.486486, "answer_plan_current_total": 37, "answer_plan_selected_positive_coverage_at_5": 0.816327, "answer_plan_selected_positive_total": 98, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 37, "event5_positive_total": 98, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.918919, "event_recall_at_5": 1.0, "event_recall_total": 37, "path3_positive_total": 111, "path_positive_coverage_at_3": 0.900901, "path_recall_at_3": 1.0, "path_recall_total": 
37, "recall_event24_positive_total": 98, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 37, "samples": 37, "temporal_accuracy": 0.0, "temporal_total": 0}, "time": {"answer_plan_current_top1_accuracy": 0.5, "answer_plan_current_total": 18, "answer_plan_selected_positive_coverage_at_5": 0.916666, "answer_plan_selected_positive_total": 36, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 18, "event5_positive_total": 36, "event_positive_coverage_at_5": 0.972222, "event_recall_at_1": 0.833333, "event_recall_at_5": 1.0, "event_recall_total": 18, "path3_positive_total": 54, "path_positive_coverage_at_3": 0.740741, "path_recall_at_3": 1.0, "path_recall_total": 18, "recall_event24_positive_total": 36, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 18, "samples": 18, "temporal_accuracy": 0.388889, "temporal_total": 18}}, "evaluation_limited": false, "event5_positive_total": 134, "event_distractor_loss": 0.642121, "event_distractor_loss_count": 55, "event_hard_negative_loss": 0.068247, "event_hard_negative_loss_count": 55, "event_loss": 0.364887, "event_loss_count": 55, "event_matrix_delta_loss": 0.122631, "event_matrix_delta_loss_count": 55, "event_positive_coverage_at_5": 0.992537, "event_recall_at_1": 0.890909, "event_recall_at_5": 1.0, "event_recall_total": 55, "event_selection_loss": 0.009935, "event_selection_loss_count": 55, "event_tunnel_loss": 0.240878, "event_tunnel_loss_count": 55, "event_tunnel_selection_loss": 0.0, "event_tunnel_selection_loss_count": 55, "final_event_set_loss": 0.014737, "final_event_set_loss_count": 55, "graph_error_count": 0, "loss": 3.474828, "loss_group_count": 7, "memory_router_exact_match": 0.381818, "memory_router_f1": 0.921212, "memory_router_loss": 0.550699, "memory_router_loss_count": 55, "memory_router_total": 55, "path3_positive_total": 165, "path_hard_negative_loss": 0.235719, 
"path_hard_negative_loss_count": 55, "path_loss": 0.424168, "path_loss_count": 55, "path_matrix_delta_loss": 0.554701, "path_matrix_delta_loss_count": 55, "path_positive_coverage_at_3": 0.848485, "path_recall_at_3": 1.0, "path_recall_total": 55, "path_selection_loss": 0.204065, "path_selection_loss_count": 55, "path_tunnel_delta_loss": 0.663551, "path_tunnel_delta_loss_count": 55, "path_tunnel_delta_positive_coverage_at_3": 0.630303, "path_tunnel_delta_positive_total": 165, "path_tunnel_delta_recall_at_3": 0.854546, "path_tunnel_delta_recall_total": 55, "path_tunnel_loss": 0.364049, "path_tunnel_loss_count": 55, "path_tunnel_rescue025_recall_at_3": 1.0, "path_tunnel_rescue025_recall_total": 55, "path_tunnel_rescue050_recall_at_3": 1.0, "path_tunnel_rescue050_recall_total": 55, "path_tunnel_rescue100_recall_at_3": 1.0, "path_tunnel_rescue100_recall_total": 55, "path_tunnel_selection_loss": 0.172115, "path_tunnel_selection_loss_count": 55, "path_tunnel_support_positive_coverage_at_3": 0.812121, "path_tunnel_support_positive_total": 165, "path_tunnel_support_recall_at_3": 0.981818, "path_tunnel_support_recall_total": 55, "question_understanding_loss": 3.861762, "question_understanding_loss_count": 55, "recall_event24_positive_total": 134, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 55, "recall_loss": 0.859693, "recall_loss_count": 55, "recall_selection_loss": 0.0, "recall_selection_loss_count": 55, "rows_evaluated": 55, "rows_total": 55, "samples": 55, "supervision_bucket_metrics": {"typed_tunnel_multi_positive": {"answer_plan_current_top1_accuracy": 0.490909, "answer_plan_current_total": 55, "answer_plan_selected_positive_coverage_at_5": 0.843284, "answer_plan_selected_positive_total": 134, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 55, "event5_positive_total": 134, "event_positive_coverage_at_5": 0.992537, "event_recall_at_1": 0.890909, "event_recall_at_5": 1.0, 
"event_recall_total": 55, "path3_positive_total": 165, "path_positive_coverage_at_3": 0.848485, "path_recall_at_3": 1.0, "path_recall_total": 55, "recall_event24_positive_total": 134, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 55, "samples": 55, "temporal_accuracy": 0.388889, "temporal_total": 18}}, "temporal_accuracy": 0.388889, "temporal_loss": 0.502006, "temporal_loss_count": 18, "temporal_total": 18, "token_role_loss": 0.0, "token_role_loss_count": 7, "training_weight_max": 1.35, "training_weight_mean": 1.35, "training_weight_min": 1.35}}, "global_step": 665, "issue_counts": {"batch_prepare_pool_event": 1}, "issue_log": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/training_issues.jsonl", "last_epoch": {"best_val_loss": 3.718748, "best_val_selection_key": [1.0, 1.0, 1.0, 0.428571, -3.718748], "best_val_selection_score": 118.424107, "epoch": 7, "global_step": 665, "is_best": false, "patience": 3, "total_steps": 704, "train": {"answer_plan_current_top1_accuracy": 0.379747, "answer_plan_current_total": 79, "answer_plan_loss": 0.549526, "answer_plan_loss_count": 80, "answer_plan_selected_positive_coverage_at_5": 0.821101, "answer_plan_selected_positive_total": 218, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 80, "answer_refusal_loss": 0.0, "answer_refusal_loss_count": 80, "answer_type_loss": 5.16927, "answer_type_loss_count": 80, "answer_type_metrics": {"multi_evidence": {"answer_plan_current_top1_accuracy": 0.376623, "answer_plan_current_total": 77, "answer_plan_selected_positive_coverage_at_5": 0.821596, "answer_plan_selected_positive_total": 213, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 78, "event5_positive_total": 213, "event_positive_coverage_at_5": 0.981221, "event_recall_at_1": 0.846154, "event_recall_at_5": 1.0, "event_recall_total": 78, "path3_positive_total": 234, "path_positive_coverage_at_3": 0.910256, 
"path_recall_at_3": 1.0, "path_recall_total": 78, "recall_event24_positive_total": 213, "recall_event_positive_coverage_at_24": 0.995305, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 78, "samples": 78, "temporal_accuracy": 0.0, "temporal_total": 0}, "time": {"answer_plan_current_top1_accuracy": 0.5, "answer_plan_current_total": 2, "answer_plan_selected_positive_coverage_at_5": 0.8, "answer_plan_selected_positive_total": 5, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 2, "event5_positive_total": 5, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 1.0, "event_recall_at_5": 1.0, "event_recall_total": 2, "path3_positive_total": 6, "path_positive_coverage_at_3": 1.0, "path_recall_at_3": 1.0, "path_recall_total": 2, "recall_event24_positive_total": 5, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 2, "samples": 2, "temporal_accuracy": 0.0, "temporal_total": 2}}, "evaluation_limited": true, "event5_positive_total": 218, "event_distractor_loss": 0.666098, "event_distractor_loss_count": 80, "event_hard_negative_loss": 0.078884, "event_hard_negative_loss_count": 80, "event_loss": 0.323909, "event_loss_count": 80, "event_matrix_delta_loss": 0.128518, "event_matrix_delta_loss_count": 80, "event_positive_coverage_at_5": 0.981651, "event_recall_at_1": 0.85, "event_recall_at_5": 1.0, "event_recall_total": 80, "event_selection_loss": 0.014046, "event_selection_loss_count": 80, "event_tunnel_loss": 0.213497, "event_tunnel_loss_count": 80, "event_tunnel_selection_loss": 0.0, "event_tunnel_selection_loss_count": 80, "final_event_set_loss": 0.013138, "final_event_set_loss_count": 80, "graph_error_count": 0, "loss": 3.466264, "loss_group_count": 11, "memory_router_exact_match": 0.375, "memory_router_f1": 0.845089, "memory_router_loss": 1.479962, "memory_router_loss_count": 80, "memory_router_total": 80, "path3_positive_total": 240, "path_hard_negative_loss": 0.117886, 
"path_hard_negative_loss_count": 80, "path_loss": 0.369553, "path_loss_count": 80, "path_matrix_delta_loss": 0.454712, "path_matrix_delta_loss_count": 80, "path_positive_coverage_at_3": 0.9125, "path_recall_at_3": 1.0, "path_recall_total": 80, "path_selection_loss": 0.12883, "path_selection_loss_count": 80, "path_tunnel_delta_loss": 0.60523, "path_tunnel_delta_loss_count": 80, "path_tunnel_delta_positive_coverage_at_3": 0.479167, "path_tunnel_delta_positive_total": 240, "path_tunnel_delta_recall_at_3": 0.75, "path_tunnel_delta_recall_total": 80, "path_tunnel_loss": 0.338907, "path_tunnel_loss_count": 80, "path_tunnel_rescue025_recall_at_3": 1.0, "path_tunnel_rescue025_recall_total": 80, "path_tunnel_rescue050_recall_at_3": 0.9875, "path_tunnel_rescue050_recall_total": 80, "path_tunnel_rescue100_recall_at_3": 0.9875, "path_tunnel_rescue100_recall_total": 80, "path_tunnel_selection_loss": 0.14157, "path_tunnel_selection_loss_count": 80, "path_tunnel_support_positive_coverage_at_3": 0.829167, "path_tunnel_support_positive_total": 240, "path_tunnel_support_recall_at_3": 0.9375, "path_tunnel_support_recall_total": 80, "question_understanding_loss": 5.293586, "question_understanding_loss_count": 80, "recall_event24_positive_total": 218, "recall_event_positive_coverage_at_24": 0.995413, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 80, "recall_loss": 0.820672, "recall_loss_count": 80, "recall_selection_loss": 0.0, "recall_selection_loss_count": 80, "rows_evaluated": 80, "rows_total": 439, "samples": 80, "supervision_bucket_metrics": {"typed_tunnel_multi_positive": {"answer_plan_current_top1_accuracy": 0.379747, "answer_plan_current_total": 79, "answer_plan_selected_positive_coverage_at_5": 0.821101, "answer_plan_selected_positive_total": 218, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 80, "event5_positive_total": 218, "event_positive_coverage_at_5": 0.981651, "event_recall_at_1": 0.85, "event_recall_at_5": 1.0, 
"event_recall_total": 80, "path3_positive_total": 240, "path_positive_coverage_at_3": 0.9125, "path_recall_at_3": 1.0, "path_recall_total": 80, "recall_event24_positive_total": 218, "recall_event_positive_coverage_at_24": 0.995413, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 80, "samples": 80, "temporal_accuracy": 0.0, "temporal_total": 2}}, "temporal_accuracy": 0.0, "temporal_loss": 0.513602, "temporal_loss_count": 2, "temporal_total": 2, "token_role_loss": 0.0, "token_role_loss_count": 11, "training_weight_max": 1.35, "training_weight_mean": 1.35, "training_weight_min": 1.35}, "val": {"answer_plan_current_top1_accuracy": 0.53125, "answer_plan_current_total": 32, "answer_plan_loss": 0.530247, "answer_plan_loss_count": 32, "answer_plan_selected_positive_coverage_at_5": 0.848101, "answer_plan_selected_positive_total": 79, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 32, "answer_refusal_loss": 0.0, "answer_refusal_loss_count": 32, "answer_type_loss": 6.441788, "answer_type_loss_count": 32, "answer_type_metrics": {"multi_evidence": {"answer_plan_current_top1_accuracy": 0.545455, "answer_plan_current_total": 22, "answer_plan_selected_positive_coverage_at_5": 0.847458, "answer_plan_selected_positive_total": 59, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 22, "event5_positive_total": 59, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.909091, "event_recall_at_5": 1.0, "event_recall_total": 22, "path3_positive_total": 66, "path_positive_coverage_at_3": 0.924242, "path_recall_at_3": 1.0, "path_recall_total": 22, "recall_event24_positive_total": 59, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 22, "samples": 22, "temporal_accuracy": 0.0, "temporal_total": 0}, "time": {"answer_plan_current_top1_accuracy": 0.5, "answer_plan_current_total": 10, "answer_plan_selected_positive_coverage_at_5": 0.85, 
"answer_plan_selected_positive_total": 20, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 10, "event5_positive_total": 20, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.7, "event_recall_at_5": 1.0, "event_recall_total": 10, "path3_positive_total": 30, "path_positive_coverage_at_3": 0.733334, "path_recall_at_3": 1.0, "path_recall_total": 10, "recall_event24_positive_total": 20, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 10, "samples": 10, "temporal_accuracy": 0.3, "temporal_total": 10}}, "evaluation_limited": true, "event5_positive_total": 79, "event_distractor_loss": 0.646066, "event_distractor_loss_count": 32, "event_hard_negative_loss": 0.062803, "event_hard_negative_loss_count": 32, "event_loss": 0.379233, "event_loss_count": 32, "event_matrix_delta_loss": 0.125441, "event_matrix_delta_loss_count": 32, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.84375, "event_recall_at_5": 1.0, "event_recall_total": 32, "event_selection_loss": 0.001734, "event_selection_loss_count": 32, "event_tunnel_loss": 0.248372, "event_tunnel_loss_count": 32, "event_tunnel_selection_loss": 0.0, "event_tunnel_selection_loss_count": 32, "final_event_set_loss": 0.012893, "final_event_set_loss_count": 32, "graph_error_count": 0, "loss": 3.604544, "loss_group_count": 4, "memory_router_exact_match": 0.34375, "memory_router_f1": 0.918403, "memory_router_loss": 0.575108, "memory_router_loss_count": 32, "memory_router_total": 32, "path3_positive_total": 96, "path_hard_negative_loss": 0.23831, "path_hard_negative_loss_count": 32, "path_loss": 0.422076, "path_loss_count": 32, "path_matrix_delta_loss": 0.564214, "path_matrix_delta_loss_count": 32, "path_positive_coverage_at_3": 0.864584, "path_recall_at_3": 1.0, "path_recall_total": 32, "path_selection_loss": 0.189845, "path_selection_loss_count": 32, "path_tunnel_delta_loss": 0.658501, "path_tunnel_delta_loss_count": 32, 
"path_tunnel_delta_positive_coverage_at_3": 0.614584, "path_tunnel_delta_positive_total": 96, "path_tunnel_delta_recall_at_3": 0.84375, "path_tunnel_delta_recall_total": 32, "path_tunnel_loss": 0.360756, "path_tunnel_loss_count": 32, "path_tunnel_rescue025_recall_at_3": 1.0, "path_tunnel_rescue025_recall_total": 32, "path_tunnel_rescue050_recall_at_3": 1.0, "path_tunnel_rescue050_recall_total": 32, "path_tunnel_rescue100_recall_at_3": 1.0, "path_tunnel_rescue100_recall_total": 32, "path_tunnel_selection_loss": 0.183911, "path_tunnel_selection_loss_count": 32, "path_tunnel_support_positive_coverage_at_3": 0.802083, "path_tunnel_support_positive_total": 96, "path_tunnel_support_recall_at_3": 0.96875, "path_tunnel_support_recall_total": 32, "question_understanding_loss": 3.751869, "question_understanding_loss_count": 32, "recall_event24_positive_total": 79, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 32, "recall_loss": 0.894676, "recall_loss_count": 32, "recall_selection_loss": 0.0, "recall_selection_loss_count": 32, "rows_evaluated": 32, "rows_total": 55, "samples": 32, "supervision_bucket_metrics": {"typed_tunnel_multi_positive": {"answer_plan_current_top1_accuracy": 0.53125, "answer_plan_current_total": 32, "answer_plan_selected_positive_coverage_at_5": 0.848101, "answer_plan_selected_positive_total": 79, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 32, "event5_positive_total": 79, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.84375, "event_recall_at_5": 1.0, "event_recall_total": 32, "path3_positive_total": 96, "path_positive_coverage_at_3": 0.864584, "path_recall_at_3": 1.0, "path_recall_total": 32, "recall_event24_positive_total": 79, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 32, "samples": 32, "temporal_accuracy": 0.3, "temporal_total": 10}}, "temporal_accuracy": 0.3, "temporal_loss": 0.50511, 
"temporal_loss_count": 10, "temporal_total": 10, "token_role_loss": 0.0, "token_role_loss_count": 4, "training_weight_max": 1.35, "training_weight_mean": 1.35, "training_weight_min": 1.35}, "val_selection_enabled": true, "val_selection_score": 117.895456}, "last_step_checkpoint": {"epoch": 7, "epoch_step": 81, "global_step": 650, "node_model_last_path": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt", "path_model_last_path": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt", "total_steps": 704}, "latest_epoch": 7, "latest_global_step": 665, "memory_router_variant": "question_pooled_multilabel_v1", "memory_tunnel_variant": "chain_depth_tunnel_support_delta_v1", "message_passing_variant": "support_to_event_bidirectional_shared_typed_v1", "model_arch_version": "node_memory_v11_tmcra_scale49m_trimaze", "node_encoder_variant": "hashed_tmcra_token_attn_pool_6l_w256", "node_max_tokens": 32, "pair_feature_adapter_variant": "residual_pair_feature_adapter_v1", "prebuild_issue_log": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/graph_cache_prebuild_errors.jsonl", "question_encoder_variant": "hashed_tmcra_token_attn_pool_6l_w256", "question_intent_variant": "text_pooled_multihead_v1", "question_max_tokens": 48, "status": "running", "train": {"answer_plan_current_top1_accuracy": 0.379747, "answer_plan_current_total": 79, "answer_plan_loss": 0.549526, "answer_plan_loss_count": 80, "answer_plan_selected_positive_coverage_at_5": 0.821101, "answer_plan_selected_positive_total": 218, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 80, "answer_refusal_loss": 0.0, "answer_refusal_loss_count": 80, "answer_type_loss": 5.16927, "answer_type_loss_count": 80, "answer_type_metrics": {"multi_evidence": {"answer_plan_current_top1_accuracy": 0.376623, "answer_plan_current_total": 77, "answer_plan_selected_positive_coverage_at_5": 0.821596, 
"answer_plan_selected_positive_total": 213, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 78, "event5_positive_total": 213, "event_positive_coverage_at_5": 0.981221, "event_recall_at_1": 0.846154, "event_recall_at_5": 1.0, "event_recall_total": 78, "path3_positive_total": 234, "path_positive_coverage_at_3": 0.910256, "path_recall_at_3": 1.0, "path_recall_total": 78, "recall_event24_positive_total": 213, "recall_event_positive_coverage_at_24": 0.995305, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 78, "samples": 78, "temporal_accuracy": 0.0, "temporal_total": 0}, "time": {"answer_plan_current_top1_accuracy": 0.5, "answer_plan_current_total": 2, "answer_plan_selected_positive_coverage_at_5": 0.8, "answer_plan_selected_positive_total": 5, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 2, "event5_positive_total": 5, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 1.0, "event_recall_at_5": 1.0, "event_recall_total": 2, "path3_positive_total": 6, "path_positive_coverage_at_3": 1.0, "path_recall_at_3": 1.0, "path_recall_total": 2, "recall_event24_positive_total": 5, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 2, "samples": 2, "temporal_accuracy": 0.0, "temporal_total": 2}}, "evaluation_limited": true, "event5_positive_total": 218, "event_distractor_loss": 0.666098, "event_distractor_loss_count": 80, "event_hard_negative_loss": 0.078884, "event_hard_negative_loss_count": 80, "event_loss": 0.323909, "event_loss_count": 80, "event_matrix_delta_loss": 0.128518, "event_matrix_delta_loss_count": 80, "event_positive_coverage_at_5": 0.981651, "event_recall_at_1": 0.85, "event_recall_at_5": 1.0, "event_recall_total": 80, "event_selection_loss": 0.014046, "event_selection_loss_count": 80, "event_tunnel_loss": 0.213497, "event_tunnel_loss_count": 80, "event_tunnel_selection_loss": 0.0, "event_tunnel_selection_loss_count": 80, 
"final_event_set_loss": 0.013138, "final_event_set_loss_count": 80, "graph_error_count": 0, "loss": 3.466264, "loss_group_count": 11, "memory_router_exact_match": 0.375, "memory_router_f1": 0.845089, "memory_router_loss": 1.479962, "memory_router_loss_count": 80, "memory_router_total": 80, "path3_positive_total": 240, "path_hard_negative_loss": 0.117886, "path_hard_negative_loss_count": 80, "path_loss": 0.369553, "path_loss_count": 80, "path_matrix_delta_loss": 0.454712, "path_matrix_delta_loss_count": 80, "path_positive_coverage_at_3": 0.9125, "path_recall_at_3": 1.0, "path_recall_total": 80, "path_selection_loss": 0.12883, "path_selection_loss_count": 80, "path_tunnel_delta_loss": 0.60523, "path_tunnel_delta_loss_count": 80, "path_tunnel_delta_positive_coverage_at_3": 0.479167, "path_tunnel_delta_positive_total": 240, "path_tunnel_delta_recall_at_3": 0.75, "path_tunnel_delta_recall_total": 80, "path_tunnel_loss": 0.338907, "path_tunnel_loss_count": 80, "path_tunnel_rescue025_recall_at_3": 1.0, "path_tunnel_rescue025_recall_total": 80, "path_tunnel_rescue050_recall_at_3": 0.9875, "path_tunnel_rescue050_recall_total": 80, "path_tunnel_rescue100_recall_at_3": 0.9875, "path_tunnel_rescue100_recall_total": 80, "path_tunnel_selection_loss": 0.14157, "path_tunnel_selection_loss_count": 80, "path_tunnel_support_positive_coverage_at_3": 0.829167, "path_tunnel_support_positive_total": 240, "path_tunnel_support_recall_at_3": 0.9375, "path_tunnel_support_recall_total": 80, "question_understanding_loss": 5.293586, "question_understanding_loss_count": 80, "recall_event24_positive_total": 218, "recall_event_positive_coverage_at_24": 0.995413, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 80, "recall_loss": 0.820672, "recall_loss_count": 80, "recall_selection_loss": 0.0, "recall_selection_loss_count": 80, "rows_evaluated": 80, "rows_total": 439, "samples": 80, "supervision_bucket_metrics": {"typed_tunnel_multi_positive": {"answer_plan_current_top1_accuracy": 
0.379747, "answer_plan_current_total": 79, "answer_plan_selected_positive_coverage_at_5": 0.821101, "answer_plan_selected_positive_total": 218, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 80, "event5_positive_total": 218, "event_positive_coverage_at_5": 0.981651, "event_recall_at_1": 0.85, "event_recall_at_5": 1.0, "event_recall_total": 80, "path3_positive_total": 240, "path_positive_coverage_at_3": 0.9125, "path_recall_at_3": 1.0, "path_recall_total": 80, "recall_event24_positive_total": 218, "recall_event_positive_coverage_at_24": 0.995413, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 80, "samples": 80, "temporal_accuracy": 0.0, "temporal_total": 2}}, "temporal_accuracy": 0.0, "temporal_loss": 0.513602, "temporal_loss_count": 2, "temporal_total": 2, "token_role_loss": 0.0, "token_role_loss_count": 11, "training_weight_max": 1.35, "training_weight_mean": 1.35, "training_weight_min": 1.35}, "training_sampling": {"answer_type_counts": {"multi_evidence": 422, "time": 17}, "base_row_count": 439, "effective_row_count": 697, "loss_weight_range_estimate": {"max": 1.549193, "min": 1.319091}, "mode": "source_aware_balanced", "repeat_histogram": {"1": 118, "2": 144}, "source_count": 1, "source_details": {"dataset": {"avg_example_weight_estimate": 1.540283, "loss_source_factor": 1.0, "row_count": 439, "row_share": 1.0, "sampled_row_count_estimate": 697, "sampled_row_share_estimate": 1.0, "sampling_source_factor": 1.0, "weighted_loss_mass": 676.184129}}, "source_row_counts": {"dataset": 439}, "supervision_bucket_counts": {"typed_tunnel_multi_positive": 439}, "weighted_answer_mass_estimate": {"multi_evidence": 653.759589, "time": 22.42454}, "weighted_supervision_bucket_mass_estimate": {"typed_tunnel_multi_positive": 680.095876}}, "val": {"answer_plan_current_top1_accuracy": 0.53125, "answer_plan_current_total": 32, "answer_plan_loss": 0.530247, "answer_plan_loss_count": 32, "answer_plan_selected_positive_coverage_at_5": 0.848101, 
"answer_plan_selected_positive_total": 79, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 32, "answer_refusal_loss": 0.0, "answer_refusal_loss_count": 32, "answer_type_loss": 6.441788, "answer_type_loss_count": 32, "answer_type_metrics": {"multi_evidence": {"answer_plan_current_top1_accuracy": 0.545455, "answer_plan_current_total": 22, "answer_plan_selected_positive_coverage_at_5": 0.847458, "answer_plan_selected_positive_total": 59, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 22, "event5_positive_total": 59, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.909091, "event_recall_at_5": 1.0, "event_recall_total": 22, "path3_positive_total": 66, "path_positive_coverage_at_3": 0.924242, "path_recall_at_3": 1.0, "path_recall_total": 22, "recall_event24_positive_total": 59, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 22, "samples": 22, "temporal_accuracy": 0.0, "temporal_total": 0}, "time": {"answer_plan_current_top1_accuracy": 0.5, "answer_plan_current_total": 10, "answer_plan_selected_positive_coverage_at_5": 0.85, "answer_plan_selected_positive_total": 20, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 10, "event5_positive_total": 20, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.7, "event_recall_at_5": 1.0, "event_recall_total": 10, "path3_positive_total": 30, "path_positive_coverage_at_3": 0.733334, "path_recall_at_3": 1.0, "path_recall_total": 10, "recall_event24_positive_total": 20, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 10, "samples": 10, "temporal_accuracy": 0.3, "temporal_total": 10}}, "evaluation_limited": true, "event5_positive_total": 79, "event_distractor_loss": 0.646066, "event_distractor_loss_count": 32, "event_hard_negative_loss": 0.062803, "event_hard_negative_loss_count": 32, "event_loss": 0.379233, "event_loss_count": 32, 
"event_matrix_delta_loss": 0.125441, "event_matrix_delta_loss_count": 32, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.84375, "event_recall_at_5": 1.0, "event_recall_total": 32, "event_selection_loss": 0.001734, "event_selection_loss_count": 32, "event_tunnel_loss": 0.248372, "event_tunnel_loss_count": 32, "event_tunnel_selection_loss": 0.0, "event_tunnel_selection_loss_count": 32, "final_event_set_loss": 0.012893, "final_event_set_loss_count": 32, "graph_error_count": 0, "loss": 3.604544, "loss_group_count": 4, "memory_router_exact_match": 0.34375, "memory_router_f1": 0.918403, "memory_router_loss": 0.575108, "memory_router_loss_count": 32, "memory_router_total": 32, "path3_positive_total": 96, "path_hard_negative_loss": 0.23831, "path_hard_negative_loss_count": 32, "path_loss": 0.422076, "path_loss_count": 32, "path_matrix_delta_loss": 0.564214, "path_matrix_delta_loss_count": 32, "path_positive_coverage_at_3": 0.864584, "path_recall_at_3": 1.0, "path_recall_total": 32, "path_selection_loss": 0.189845, "path_selection_loss_count": 32, "path_tunnel_delta_loss": 0.658501, "path_tunnel_delta_loss_count": 32, "path_tunnel_delta_positive_coverage_at_3": 0.614584, "path_tunnel_delta_positive_total": 96, "path_tunnel_delta_recall_at_3": 0.84375, "path_tunnel_delta_recall_total": 32, "path_tunnel_loss": 0.360756, "path_tunnel_loss_count": 32, "path_tunnel_rescue025_recall_at_3": 1.0, "path_tunnel_rescue025_recall_total": 32, "path_tunnel_rescue050_recall_at_3": 1.0, "path_tunnel_rescue050_recall_total": 32, "path_tunnel_rescue100_recall_at_3": 1.0, "path_tunnel_rescue100_recall_total": 32, "path_tunnel_selection_loss": 0.183911, "path_tunnel_selection_loss_count": 32, "path_tunnel_support_positive_coverage_at_3": 0.802083, "path_tunnel_support_positive_total": 96, "path_tunnel_support_recall_at_3": 0.96875, "path_tunnel_support_recall_total": 32, "question_understanding_loss": 3.751869, "question_understanding_loss_count": 32, 
"recall_event24_positive_total": 79, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 32, "recall_loss": 0.894676, "recall_loss_count": 32, "recall_selection_loss": 0.0, "recall_selection_loss_count": 32, "rows_evaluated": 32, "rows_total": 55, "samples": 32, "supervision_bucket_metrics": {"typed_tunnel_multi_positive": {"answer_plan_current_top1_accuracy": 0.53125, "answer_plan_current_total": 32, "answer_plan_selected_positive_coverage_at_5": 0.848101, "answer_plan_selected_positive_total": 79, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 32, "event5_positive_total": 79, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.84375, "event_recall_at_5": 1.0, "event_recall_total": 32, "path3_positive_total": 96, "path_positive_coverage_at_3": 0.864584, "path_recall_at_3": 1.0, "path_recall_total": 32, "recall_event24_positive_total": 79, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 32, "samples": 32, "temporal_accuracy": 0.3, "temporal_total": 10}}, "temporal_accuracy": 0.3, "temporal_loss": 0.50511, "temporal_loss_count": 10, "temporal_total": 10, "token_role_loss": 0.0, "token_role_loss_count": 4, "training_weight_max": 1.35, "training_weight_mean": 1.35, "training_weight_min": 1.35}}
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/training_issues.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"backend": "process", "event_type": "batch_prepare_process_pool_ready", "graph_error_stage": "train_batch_prepare", "issue_type": "batch_prepare_pool_event", "timestamp": "2026-05-24T04:26:31", "worker_count": 16}
results/judge_gpt4o_alias_vectorengine.jsonl ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"question_id": "7e00a6cb", "hypothesis": "International Budget Hostel", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
2
+ {"question_id": "1903aded", "hypothesis": "I can’t recover the 7th job from the provided memory evidence. The retrieved notes only show unrelated lists, like living room color schemes, dessert ideas, and sales tax questions, not the earlier work-from-home jobs for seniors list.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
3
+ {"question_id": "ceb54acb", "hypothesis": "Sexual fixations, Problematic sexual behaviors, Sexual impulsivity, Compulsive sexuality", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
4
+ {"question_id": "f523d9fe", "hypothesis": "I can’t tell from the available memory.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
5
+ {"question_id": "0e5e2d1a", "hypothesis": "38 subjects", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
6
+ {"question_id": "fea54f57", "hypothesis": "Evolution", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
7
+ {"question_id": "cc539528", "hypothesis": "Ruby, Python, or PHP", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
8
+ {"question_id": "dc439ea3", "hypothesis": "Hoop Dance", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
9
+ {"question_id": "18dcd5a5", "hypothesis": "4", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
10
+ {"question_id": "488d3006", "hypothesis": "GR-90", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
11
+ {"question_id": "58470ed2", "hypothesis": "The Library is a sphere whose exact center is any one of its hexagons and whose circumference is inaccessible.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
12
+ {"question_id": "8cf51dda", "hypothesis": "1. To identify molecular subtypes of endometrial cancer using a combination of genomic and transcriptomic approaches. 2. To investigate the clinical and biological significance of the identified molecular subtypes, including their association with patient outcomes. 3. To develop biomarkers for the early detection and prognosis of endometrial cancer based on the identified molecular subtypes.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
13
+ {"question_id": "1d4da289", "hypothesis": "biometric authentication and one-time passwords (OTP)", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
14
+ {"question_id": "8464fc84", "hypothesis": "Roscioli", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
15
+ {"question_id": "8aef76bc", "hypothesis": "Mod Podge or another sealant", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
16
+ {"question_id": "71a3fd6b", "hypothesis": "+49 (0) 62 32 / 14 23 - 0", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
17
+ {"question_id": "2bf43736", "hypothesis": "Chapter 4 of Book 1, titled \"Vocal Prayer and Meditation.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
18
+ {"question_id": "70b3e69b", "hypothesis": "Marina Rossell", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
19
+ {"question_id": "8752c811", "hypothesis": "The 27th parameter was Perspective.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
20
+ {"question_id": "3249768e", "hypothesis": "Absinthe", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
21
+ {"question_id": "1b9b7252", "hypothesis": "Mindful.org", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
22
+ {"question_id": "1568498a", "hypothesis": "29. Rd3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
23
+ {"question_id": "6222b6eb", "hypothesis": "6S", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
24
+ {"question_id": "e8a79c70", "hypothesis": "2-3 eggs", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
25
+ {"question_id": "d596882b", "hypothesis": "By Chloe", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
26
+ {"question_id": "e3fc4d6e", "hypothesis": "Dr. Arati Prabhakar", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
27
+ {"question_id": "51b23612", "hypothesis": "Nu, pogodi!", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
28
+ {"question_id": "3e321797", "hypothesis": "10 minutes", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
29
+ {"question_id": "e982271f", "hypothesis": "Revolution Hall", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
30
+ {"question_id": "352ab8bd", "hypothesis": "approximately 20%", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
31
+ {"question_id": "fca762bc", "hypothesis": "Memrise", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
32
+ {"question_id": "7a8d0b71", "hypothesis": "In the visible remembered campaign-plan excerpt, I can confirm we discussed the DHL Wellness Retreats email marketing campaign, but the specific influencer marketing allocation amount is not shown in the evidence provided.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
33
+ {"question_id": "a40e080f", "hypothesis": "Patagonia and Southwest Airlines", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
34
+ {"question_id": "8b9d4367", "hypothesis": "Jaipur Rugs", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
35
+ {"question_id": "5809eb10", "hypothesis": "2015", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
36
+ {"question_id": "41275add", "hypothesis": "How to Sit Properly at a Desk to Avoid Back Pain (Mayo Clinic)", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
37
+ {"question_id": "4388e9dd", "hypothesis": "an untidy, stained white shirt", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
38
+ {"question_id": "4baee567", "hypothesis": "12", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
39
+ {"question_id": "561fabcd", "hypothesis": "Contaminated Colossus", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
40
+ {"question_id": "b759caee", "hypothesis": "@jessica_poole_jewellery", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
41
+ {"question_id": "ac031881", "hypothesis": "squares", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
42
+ {"question_id": "28bcfaac", "hypothesis": "MusicTheory.net", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
43
+ {"question_id": "16c90bf4", "hypothesis": "A pilsner or lager.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
44
+ {"question_id": "c8f1aeed", "hypothesis": "Pennsylvania", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
45
+ {"question_id": "eaca4986", "hypothesis": "C D E F G A B A G F E D C", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
46
+ {"question_id": "c7cf7dfd", "hypothesis": "Nostalgia", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
47
+ {"question_id": "e48988bc", "hypothesis": "Patagonia", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
48
+ {"question_id": "1de5cff2", "hypothesis": "Veja", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
49
+ {"question_id": "65240037", "hypothesis": "1:10 ratio", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
50
+ {"question_id": "778164c6", "hypothesis": "The evidence only shows that we discussed popular Caribbean dishes featuring snapper, but it does not include the actual Jamaican fruit-based dish name.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
51
+ {"question_id": "e47becba", "hypothesis": "Business Administration", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
52
+ {"question_id": "118b2229", "hypothesis": "45 minutes each way", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
53
+ {"question_id": "51a45a95", "hypothesis": "The evidence does not say where you redeemed the $5 coffee creamer coupon.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
54
+ {"question_id": "58bf7951", "hypothesis": "The Glass Menagerie", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
55
+ {"question_id": "1e043500", "hypothesis": "Summer Vibes", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
56
+ {"question_id": "c5e8278d", "hypothesis": "Johnson", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
57
+ {"question_id": "6ade9755", "hypothesis": "Serenity Yoga", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
58
+ {"question_id": "6f9b354f", "hypothesis": "a lighter shade of gray", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
59
+ {"question_id": "58ef2f1c", "hypothesis": "back on Valentine's Day", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
60
+ {"question_id": "f8c5f88b", "hypothesis": "The retrieved evidence does not say where you bought the new tennis racket from.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
61
+ {"question_id": "5d3d2817", "hypothesis": "marketing specialist at a small startup", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
62
+ {"question_id": "7527f7e2", "hypothesis": "$800", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
63
+ {"question_id": "c960da58", "hypothesis": "20", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
64
+ {"question_id": "3b6f954b", "hypothesis": "University of Melbourne", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
65
+ {"question_id": "726462e0", "hypothesis": "10%", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
66
+ {"question_id": "94f70d80", "hypothesis": "The retrieved evidence does not state any assembly time for an IKEA bookshelf. It only mentions plans to visit IKEA to look at coffee tables and read reviews, so the duration cannot be determined from this evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
67
+ {"question_id": "66f24dbb", "hypothesis": "No sister gift is shown in memory; the only recorded birthday gift purchase was a customized phone case with his favorite game's logo for your brother.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
68
+ {"question_id": "ad7109d1", "hypothesis": "The retrieved evidence does not specify the speed of your new internet plan.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
69
+ {"question_id": "af8d2e46", "hypothesis": "7", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
70
+ {"question_id": "dccbc061", "hypothesis": "staunch atheist", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
71
+ {"question_id": "c8c3f81d", "hypothesis": "Nike", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
72
+ {"question_id": "8ebdbe50", "hypothesis": "Data Science certification", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
73
+ {"question_id": "6b168ec8", "hypothesis": "3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
74
+ {"question_id": "75499fd8", "hypothesis": "Golden Retriever", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
75
+ {"question_id": "21436231", "hypothesis": "Not stated", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
76
+ {"question_id": "95bcc1c8", "hypothesis": "10", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
77
+ {"question_id": "0862e8bf", "hypothesis": "Luna", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
78
+ {"question_id": "853b0a1d", "hypothesis": "I can tell it was a gift from your grandma, but the evidence here doesn’t include your age when she gave you the silver necklace.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
79
+ {"question_id": "a06e4cfe", "hypothesis": "3:1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
80
+ {"question_id": "37d43f65", "hypothesis": "16GB", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
81
+ {"question_id": "b86304ba", "hypothesis": "Triple what you paid for it.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
82
+ {"question_id": "d52b4f67", "hypothesis": "Grand Ballroom", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
83
+ {"question_id": "25e5aa4f", "hypothesis": "UCLA", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
84
+ {"question_id": "caf9ead2", "hypothesis": "around 5 hours", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
85
+ {"question_id": "8550ddae", "hypothesis": "Lavender Dream cocktail", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
86
+ {"question_id": "60d45044", "hypothesis": "Japanese short-grain rice", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
87
+ {"question_id": "3f1e9474", "hypothesis": "Sarah", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
88
+ {"question_id": "86b68151", "hypothesis": "IKEA", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
89
+ {"question_id": "577d4d32", "hypothesis": "7 pm", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
90
+ {"question_id": "ec81a493", "hypothesis": "500 copies", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
91
+ {"question_id": "15745da0", "hypothesis": "about three months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
92
+ {"question_id": "e01b8e2f", "hypothesis": "Hawaii", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
93
+ {"question_id": "bc8a6e93", "hypothesis": "lemon blueberry cake", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
94
+ {"question_id": "ccb36322", "hypothesis": "There’s no evidence here of any music streaming service you’ve been using lately; the only services mentioned are photo album services like Shutterfly, Snapfish, and Mixbook.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
95
+ {"question_id": "001be529", "hypothesis": "Your asylum application had been approved by May 28, 2023, and there was a waiting period, but the evidence does not state how long you waited for the decision.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
96
+ {"question_id": "b320f3f8", "hypothesis": "a rare blue Snaggletooth action figure", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
97
+ {"question_id": "19b5f2b3", "hypothesis": "The evidence only shows that you were in Japan/Kyoto and planning to travel elsewhere in Asia afterward, but it does not state how long you were in Japan.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
98
+ {"question_id": "4fd1909e", "hypothesis": "Xfinity Center", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
99
+ {"question_id": "545bd2b5", "hypothesis": "2 hours per day", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
100
+ {"question_id": "8a137a7f", "hypothesis": "Philips LED bulb", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
101
+ {"question_id": "76d63226", "hypothesis": "55-inch", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
102
+ {"question_id": "86f00804", "hypothesis": "The Seven Husbands of Evelyn Hugo", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
103
+ {"question_id": "8e9d538c", "hypothesis": "17 skeins", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
104
+ {"question_id": "311778f1", "hypothesis": "around 10 hours", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
105
+ {"question_id": "c19f7a0b", "hypothesis": "around 6:30 pm", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
106
+ {"question_id": "4100d0a0", "hypothesis": "Irish and Italian", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
107
+ {"question_id": "29f2956b", "hypothesis": "30 minutes daily", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
108
+ {"question_id": "1faac195", "hypothesis": "Denver", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
109
+ {"question_id": "faba32e5", "hypothesis": "24 hours", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
110
+ {"question_id": "f4f1d8a4", "hypothesis": "my sister", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
111
+ {"question_id": "c14c00dd", "hypothesis": "Trader Joe's", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
112
+ {"question_id": "36580ce8", "hypothesis": "The evidence only says you’d been dealing with unspecified health issues for a few months; it does not identify any specific issue you initially thought was just a cold.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
113
+ {"question_id": "3d86fd0a", "hypothesis": "Information unavailable", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
114
+ {"question_id": "a82c026e", "hypothesis": "Dark Souls 3 DLC", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
115
+ {"question_id": "0862e8bf_abs", "hypothesis": "I can’t determine your hamster’s name from the provided evidence because the memories only mention your cat and unrelated coding discussion, not any hamster.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
116
+ {"question_id": "15745da0_abs", "hypothesis": "No evidence says how long you've been collecting vintage films.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
117
+ {"question_id": "bc8a6e93_abs", "hypothesis": "I don't know", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
118
+ {"question_id": "19b5f2b3_abs", "hypothesis": "The available evidence doesn’t mention a Korea trip or its duration, so I can’t determine how long you were in Korea.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
119
+ {"question_id": "29f2956b_abs", "hypothesis": "I only have evidence that you practice guitar for 30 minutes daily; there’s no evidence here about a daily violin practice time.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
120
+ {"question_id": "f4f1d8a4_abs", "hypothesis": "I only remember a new stand mixer as a birthday gift from your sister, not your dad.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
121
+ {"question_id": "0a995998", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
122
+ {"question_id": "6d550036", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
123
+ {"question_id": "gpt4_59c863d7", "hypothesis": "4", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
124
+ {"question_id": "b5ef892d", "hypothesis": "5 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
125
+ {"question_id": "e831120c", "hypothesis": "3.5 weeks", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
126
+ {"question_id": "3a704032", "hypothesis": "3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
127
+ {"question_id": "gpt4_d84a3211", "hypothesis": "I can identify bike-related activity this year, like getting bike lights installed, but there are no explicit dollar amounts for any bike-related purchases or services in the evidence, so I can’t compute a total spent.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
128
+ {"question_id": "aae3761f", "hypothesis": "10 hours", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
129
+ {"question_id": "gpt4_f2262a51", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
130
+ {"question_id": "dd2973ad", "hypothesis": "I can only narrow it down indirectly: your doctor’s appointment was at 10 AM on Thursday 2023-05-18, so the day before was Wednesday 2023-05-17. The only bedtime evidence says you went to bed at 2 AM on last Wednesday relative to 2023-05-29, which refers to 2023-05-24, not 2023-05-17. So the closest bedtime mentioned is 2 AM, but it does not match the day before that appointment.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
131
+ {"question_id": "c4a1ceb8", "hypothesis": "3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
132
+ {"question_id": "gpt4_a56e767c", "hypothesis": "3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
133
+ {"question_id": "6cb6f249", "hypothesis": "17 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
134
+ {"question_id": "46a3abf7", "hypothesis": "3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
135
+ {"question_id": "36b9f61e", "hypothesis": "2000", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
136
+ {"question_id": "28dc39ac", "hypothesis": "29 hours", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
137
+ {"question_id": "gpt4_2f8be40d", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
138
+ {"question_id": "2e6d26dc", "hypothesis": "3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
139
+ {"question_id": "gpt4_15e38248", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
140
+ {"question_id": "88432d0a", "hypothesis": "4", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
141
+ {"question_id": "80ec1f4f", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
142
+ {"question_id": "d23cf73b", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
143
+ {"question_id": "gpt4_7fce9456", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
144
+ {"question_id": "d682f1a2", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
145
+ {"question_id": "7024f17c", "hypothesis": "0.5 hours", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
146
+ {"question_id": "gpt4_5501fe77", "hypothesis": "TikTok", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
147
+ {"question_id": "gpt4_2ba83207", "hypothesis": "Trader Joe's", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
148
+ {"question_id": "2318644b", "hypothesis": "Tokyo is the only place with a numeric nightly accommodation cost in the evidence: about $30/night. For Hawaii, the evidence only says you had a luxurious resort stay in Maui, which suggests it was more expensive but does not give a dollar amount, so the exact per-night difference cannot be calculated from the evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
149
+ {"question_id": "2ce6a0f2", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
150
+ {"question_id": "gpt4_d12ceb0e", "hypothesis": "Insufficient evidence to compute the average age of you, your parents, and your grandparents.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
151
+ {"question_id": "00ca467f", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
152
+ {"question_id": "b3c15d39", "hypothesis": "5 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
153
+ {"question_id": "gpt4_31ff4165", "hypothesis": "4", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
154
+ {"question_id": "eeda8a6d", "hypothesis": "17", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
155
+ {"question_id": "2788b940", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
156
+ {"question_id": "60bf93ed", "hypothesis": "It arrived on 1/20, but I don’t see the purchase date in the evidence, so I can’t calculate the number of days exactly.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
157
+ {"question_id": "9d25d4e0", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
158
+ {"question_id": "129d1232", "hypothesis": "600 dollars", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
159
+ {"question_id": "60472f9c", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
160
+ {"question_id": "gpt4_194be4b3", "hypothesis": "3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
161
+ {"question_id": "a9f6b44c", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
162
+ {"question_id": "d851d5ba", "hypothesis": "At least 2,250 USD", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
163
+ {"question_id": "5a7937c8", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
164
+ {"question_id": "gpt4_ab202e7f", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
165
+ {"question_id": "gpt4_e05b82a6", "hypothesis": "7", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
166
+ {"question_id": "gpt4_731e37d7", "hypothesis": "$200", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
167
+ {"question_id": "edced276", "hypothesis": "8 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
168
+ {"question_id": "10d9b85a", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
169
+ {"question_id": "e3038f8c", "hypothesis": "87", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
170
+ {"question_id": "2b8f3739", "hypothesis": "$495", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
171
+ {"question_id": "1a8a66a6", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
172
+ {"question_id": "c2ac3c61", "hypothesis": "5", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
173
+ {"question_id": "bf659f65", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
174
+ {"question_id": "gpt4_372c3eed", "hypothesis": "8 years", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
175
+ {"question_id": "gpt4_2f91af09", "hypothesis": "6", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
176
+ {"question_id": "81507db6", "hypothesis": "3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
177
+ {"question_id": "88432d0a_abs", "hypothesis": "0", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
178
+ {"question_id": "80ec1f4f_abs", "hypothesis": "0", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
179
+ {"question_id": "eeda8a6d_abs", "hypothesis": "not evidenced in the retrieved memory", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
180
+ {"question_id": "60bf93ed_abs", "hypothesis": "70 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
181
+ {"question_id": "edced276_abs", "hypothesis": "I can only confirm a Hawaii trip, and no number of days is given for it. There’s also no evidence here of a Seattle trip duration, so the total can’t be determined from the provided evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
182
+ {"question_id": "gpt4_372c3eed_abs", "hypothesis": "8 years documented; no evidence of a completed Master's degree", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
183
+ {"question_id": "8a2466db", "hypothesis": "Sorry, I don’t know of any specific resources from memory.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
184
+ {"question_id": "06878be2", "hypothesis": "A comfortable, durable Sony-compatible camera bag, a Godox V1 hard case or Think Tank Photo Flash Pouch, and a lens cleaning kit with a microfiber cloth, blower, and lens cleaning solution would complement your Sony A7R IV setup.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
185
+ {"question_id": "75832dbd", "hypothesis": "Recent publications or conferences on the impact of mega-churches on Baptist church vitality, Baptist identity and mission, and practical collaboration models among smaller Baptist churches—especially case-study-focused work on regional or denominational cooperation.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
186
+ {"question_id": "0edc2aef", "hypothesis": "Look for a Miami hotel with a balcony hot tub and a standout view, similar to your preferred room at The Edgewater Hotel.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
187
+ {"question_id": "35a27287", "hypothesis": "French- or Spanish-focused language exchange meetups, conversation groups, or cultural events nearby would fit you best; check Meetup, Facebook Groups, local universities, or language schools this weekend.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
188
+ {"question_id": "32260d93", "hypothesis": "Mike Birbiglia: My Girlfriend's Boyfriend", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
189
+ {"question_id": "195a1a1b", "hypothesis": "Try a calm wind-down routine: read a non-stimulating book, do a guided sleep meditation or body scan, practice deep breathing, do gentle yoga or stretching, listen to soothing music or nature sounds, and take a 30-minute break from screens before bed.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
190
+ {"question_id": "afdc33df", "hypothesis": "Regularly clean your granite countertop, use coasters or pads under pots and utensils, always use a cutting board, and check the faucet/aerator for moisture or buildup so the sink area stays cleaner.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
191
+ {"question_id": "caf03d32", "hypothesis": "Try more plant-based slow cooker recipes like lentil curry, roasted vegetable stew, black bean chili, eggplant parmesan, vegan chili, or a quinoa and black bean bowl.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
192
+ {"question_id": "54026fce", "hypothesis": "Try suggesting virtual coffee breaks in a collaborative way: ask open-ended questions about staying connected, share that you miss casual chats, propose a weekly virtual coffee break, and invite your team’s feedback before deciding.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
193
+ {"question_id": "06f04340", "hypothesis": "Serve a hearty quinoa salad with your roasted homegrown vegetables over mixed greens, finished with vinaigrette; roast the vegetables first, then toss with the greens and quinoa.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
194
+ {"question_id": "6b7dfb22", "hypothesis": "Study real flowers from life or photos, browse Instagram and art communities for ideas, experiment with new techniques like palette-knife texture, and take breaks while tracking small wins to help inspiration return.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
195
+ {"question_id": "1a1907b4", "hypothesis": "Try a Pimm's Cup with a Twist, especially with Ruby Red Grapefruit simple syrup.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
196
+ {"question_id": "09d032c9", "hypothesis": "A compact portable power bank could help as a backup.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
197
+ {"question_id": "38146c39", "hypothesis": "Try adding warm spices, nuts like walnuts or pecans, a little citrus zest, or a touch of espresso or coffee for extra flavor.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
198
+ {"question_id": "d24813b1", "hypothesis": "Bake chocolate chip cookies or a lemon poppyseed cake; both fit your recent baking interests, and the lemon poppyseed cake worked well for a colleague event before.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
199
+ {"question_id": "57f827a0", "hypothesis": "No direct memory-backed evidence shows you were planning to rearrange your bedroom furniture this weekend.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
200
+ {"question_id": "95228167", "hypothesis": "Try several guitars and focus on comfort, playability, build quality, tuning stability, and the sound you like best.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
201
+ {"question_id": "505af2f5", "hypothesis": "Try a homemade vanilla-cinnamon creamer with unsweetened almond or oat milk, a little monk fruit or stevia instead of honey, and optional coconut or hazelnut extract.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
202
+ {"question_id": "75f70248", "hypothesis": "Possibly yes—the prior memory suggests your living room had a dust issue and a shedding cat, so dust or pet hair there could be contributing to your sneezing.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
203
+ {"question_id": "d6233ab6", "hypothesis": "Yes—if you’re feeling nostalgic and want to reconnect with old high school friends, attending your reunion sounds like a good idea.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
204
+ {"question_id": "1da05512", "hypothesis": "Buy now if your storage issue is already a problem and a beginner-friendly 2-bay model fits your budget; wait only if you expect much larger future storage needs or still need to compare options.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
205
+ {"question_id": "fca70973", "hypothesis": "Try Universal Studios Hollywood’s VIP Experience with the Gourmet Buffet and Behind-the-Scenes Tour as the top pick; Disneyland Resort Halloween dining or Knott’s Scary Farm Boo-fet would also fit your immersive, exclusive theme park weekend vibe.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
206
+ {"question_id": "b6025781", "hypothesis": "Try herb-rosemary roasted sweet potatoes, quinoa and roasted vegetable bowls with salmon, tofu, or lentil bolognese, and a batch of homemade granola for snacks.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
207
+ {"question_id": "a89d7624", "hypothesis": "Visit Red Rocks Park and Amphitheatre and check out other live music spots like The Ogden Theatre, The Bluebird Theater, The Fillmore Auditorium, and Larimer Lounge.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
208
+ {"question_id": "b0479f84", "hypothesis": "Try Planet Earth, Blue Planet, Dynasties, Chasing Coral, and March of the Penguins—especially the nature and wildlife ones since that’s what you were most interested in.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
209
+ {"question_id": "1d4e3b97", "hypothesis": "Yes — the recent upgrade to your bike’s chain and cassette could be why it’s performing better.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
210
+ {"question_id": "07b6f563", "hypothesis": "An iPhone 13 Pro-compatible wallet case like the OtterBox wallet case would be a useful accessory; I’d skip suggesting a power bank since you already have one that’s working well.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
211
+ {"question_id": "1c0ddc50", "hypothesis": "Listen to history and science podcasts during your 40-minute commute—try starter episodes from Hardcore History, Lore, The Dollop, StarTalk Radio, or Radiolab.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
212
+ {"question_id": "0a34ad58", "hypothesis": "Since you already have a Suica card, use it for trains and subways to make getting around easier. If you’re heading out from Shinjuku, plan a little extra time, especially for early or busy destinations like Tsukiji, where it’s best to leave very early and expect crowds. Keep some yen/cash with you because some places, especially around markets, may not take cards, and if needed you can exchange money near Shinjuku.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
213
+ {"question_id": "d3ab962e", "hypothesis": "8 miles", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
214
+ {"question_id": "2311e44b", "hypothesis": "The Nightingale' is 440 pages long, but I don’t have a numeric page-progress value for how far you’ve already read, so the exact pages left can’t be calculated from the evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
215
+ {"question_id": "cc06de0d", "hypothesis": "I can’t compute the difference from the provided evidence because it gives your daily train fare as $6, but it doesn’t include the taxi ride cost.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
216
+ {"question_id": "a11281a2", "hypothesis": "about 100 followers", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
217
+ {"question_id": "4f54b7c9", "hypothesis": "5", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
218
+ {"question_id": "85fa3a3f", "hypothesis": "50", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
219
+ {"question_id": "9aaed6a3", "hypothesis": "$0.75", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
220
+ {"question_id": "1f2b8d4f", "hypothesis": "I can’t determine an exact price difference from the evidence. The price of your luxury boots isn’t given, and no specific similar pair found at a budget store has a stated price. The closest budget-store-style prices mentioned were about $20-$50 for affordable ankle boots, but that still isn’t enough to calculate the difference.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
221
+ {"question_id": "e6041065", "hypothesis": "40%", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
222
+ {"question_id": "51c32626", "hypothesis": "No explicit submission date was given; it was only mentioned by 2023/05/22 as a past event.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
223
+ {"question_id": "d905b33f", "hypothesis": "20%", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
224
+ {"question_id": "7405e8b1", "hypothesis": "HelloFresh: 40% on your first order. For UberEats, the evidence only says you got 20% off an order last week, not that it was your first order.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
225
+ {"question_id": "f35224e0", "hypothesis": "At least 12 episodes", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
226
+ {"question_id": "6456829e", "hypothesis": "5 tomato plants and 3 cucumber plants", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
227
+ {"question_id": "a4996e51", "hypothesis": "Up to 50 hours/week.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
228
+ {"question_id": "3c1045c8", "hypothesis": "unavailable", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
229
+ {"question_id": "60036106", "hypothesis": "12,000 people", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
230
+ {"question_id": "681a1674", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
231
+ {"question_id": "e25c3b8d", "hypothesis": "I can only tell that you paid $200 for the designer handbag at TK Maxx; the savings amount isn’t stated in the evidence, so it can’t be determined from this information alone.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
232
+ {"question_id": "4adc0475", "hypothesis": "Insufficient evidence for an exact total.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
233
+ {"question_id": "4bc144e2", "hypothesis": "Parking ticket: $50; car wash not found in the retrieved evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
234
+ {"question_id": "ef66a6e5", "hypothesis": "4", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
235
+ {"question_id": "5025383b", "hypothesis": "cooking", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
236
+ {"question_id": "a1cc6108", "hypothesis": "You were about 0 years old when Alex was born.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
237
+ {"question_id": "9ee3ecd6", "hypothesis": "300 points", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
238
+ {"question_id": "3fdac837", "hypothesis": "12 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
239
+ {"question_id": "91b15a6e", "hypothesis": "$5,150", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
240
+ {"question_id": "27016adc", "hypothesis": "The planned renovations are $20,000. I don’t see the countryside property’s purchase price in the provided evidence, so the exact percentage can’t be computed from this evidence alone.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
241
+ {"question_id": "720133ac", "hypothesis": "$75", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
242
+ {"question_id": "77eafa52", "hypothesis": "$300", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
243
+ {"question_id": "8979f9ec", "hypothesis": "I can only confirm those were lunch meal-prep recipe ideas mentioned on 2023/05/26; there’s no evidence here giving a number of meals for the chicken fajitas or any lentil soup, so the total can’t be computed from the provided evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
244
+ {"question_id": "0100672e", "hypothesis": "$12 per coffee mug", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
245
+ {"question_id": "a96c20ee", "hypothesis": "I presented a poster on my thesis research at my first research conference over the summer, but the university isn't stated in the evidence provided.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
246
+ {"question_id": "92a0aa75", "hypothesis": "Senior Marketing Specialist at the company", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
247
+ {"question_id": "3fe836c9", "hypothesis": "The final sale price was $325,000. The only other amount mentioned is $350,000, but in the evidence it appears as a generic mortgage example, not a clearly stated pre-approval amount, so the exact difference can’t be determined from the evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
248
+ {"question_id": "1c549ce4", "hypothesis": "The only purchase cost shown is the waterproof car cover at $120. I don’t see evidence that you purchased a detailing spray or what it cost, so I can’t add a total beyond the confirmed $120.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
249
+ {"question_id": "6c49646a", "hypothesis": "1940 miles", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
250
+ {"question_id": "1192316e", "hypothesis": "I can’t compute a total from the provided evidence because it only mentions meditation and breakfast timing, not how long you take to get ready or commute to work.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
251
+ {"question_id": "0ea62687", "hypothesis": "I can’t determine the difference from the retrieved memory: a few months ago it was 30 mpg, but your current mpg isn’t stated.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
252
+ {"question_id": "67e0d0f2", "hypothesis": "At least 8. You explicitly mentioned completing 8 edX courses, and you also said you had completed some Coursera courses, but without a number, so the exact total isn’t determined from the evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
253
+ {"question_id": "bb7c3b45", "hypothesis": "The evidence only gives the original retail price of the Jimmy Choo heels as $500. It does not say what you actually paid, so the amount saved cannot be determined from the provided evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
254
+ {"question_id": "ba358f49", "hypothesis": "I don't know—Rachel is expected to get married in 2023, but your age isn't provided.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
255
+ {"question_id": "61f8c8f8", "hypothesis": "10 minutes faster", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
256
+ {"question_id": "60159905", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
257
+ {"question_id": "ef9cf60a", "hypothesis": "$100", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
258
+ {"question_id": "73d42213", "hypothesis": "No explicit arrival time is stated for Monday. The only clinic-travel detail says it took two hours to get there last time, but that doesn’t give a reach time.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
259
+ {"question_id": "bc149d6b", "hypothesis": "70 pounds", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
260
+ {"question_id": "099778bb", "hypothesis": "Insufficient evidence to determine the percentage; only the total of 100 leadership positions is provided, not how many are held by women.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
261
+ {"question_id": "09ba9854", "hypothesis": "About $50.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
262
+ {"question_id": "d6062bb9", "hypothesis": "1998", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
263
+ {"question_id": "157a136e", "hypothesis": "I don't know", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
264
+ {"question_id": "c18a7dc8", "hypothesis": "7", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
265
+ {"question_id": "a3332713", "hypothesis": "About $200 total", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
266
+ {"question_id": "55241a1f", "hypothesis": "Your recent Facebook Live session had 12 comments, but I don’t have evidence here for the comment count of your most popular YouTube video, so I can’t compute the total from the available information.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
267
+ {"question_id": "a08a253f", "hypothesis": "4", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
268
+ {"question_id": "f0e564bc", "hypothesis": "The evidence shows $800 for the designer handbag, but no amount is provided for the high-end skincare products, so the total cannot be determined.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
269
+ {"question_id": "078150f1", "hypothesis": "$50 more", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
270
+ {"question_id": "8cf4d046", "hypothesis": "I can’t compute an average GPA from the evidence provided. It mentions your undergraduate result as a First-Class distinction from the University of Mumbai and that you completed a Master’s in Data Science at UIUC, but it does not give explicit GPA values for either degree.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
271
+ {"question_id": "a346bb18", "hypothesis": "12 minutes", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
272
+ {"question_id": "37f165cf", "hypothesis": "416 pages and 440 pages", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
273
+ {"question_id": "8e91e7d9", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
274
+ {"question_id": "87f22b4a", "hypothesis": "$120", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
275
+ {"question_id": "e56a43b9", "hypothesis": "$5 discount", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
276
+ {"question_id": "efc3f7c2", "hypothesis": "30 minutes earlier", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
277
+ {"question_id": "21d02d0d", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
278
+ {"question_id": "2311e44b_abs", "hypothesis": "You mentioned that you've been reading \"Sapiens\" at a pace of 10–20 pages a week, but there’s no evidence here stating the book’s total page count or how many pages you’ve already finished, so the exact number of pages left can’t be determined from the provided evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
279
+ {"question_id": "6456829e_abs", "hypothesis": "5", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
280
+ {"question_id": "e5ba910e_abs", "hypothesis": "$378", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
281
+ {"question_id": "a96c20ee_abs", "hypothesis": "Harvard University", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
282
+ {"question_id": "ba358f49_abs", "hypothesis": "Insufficient evidence to determine Rachel's age when you get married.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
283
+ {"question_id": "09ba9854_abs", "hypothesis": "The evidence doesn’t give enough to calculate the savings exactly. It does mention the Airport Limousine Bus costs ¥3,200 one way to Shinjuku Station, but it does not provide a full taxi fare from the airport to your hotel, only that a short taxi ride from Shinjuku Station to the hotel might be needed.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
284
+ {"question_id": "gpt4_59149c77", "hypothesis": "7 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
285
+ {"question_id": "gpt4_f49edff3", "hypothesis": "helped my friend prepare a nursery, helped my cousin pick out stuff for her baby shower, ordered a customized phone case for my friend's birthday", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
286
+ {"question_id": "71017276", "hypothesis": "4 weeks", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
287
+ {"question_id": "b46e15ed", "hypothesis": "2 months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
288
+ {"question_id": "gpt4_fa19884c", "hypothesis": "6 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
289
+ {"question_id": "0bc8ad92", "hypothesis": "2 months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
290
+ {"question_id": "af082822", "hypothesis": "2 weeks", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
291
+ {"question_id": "gpt4_4929293a", "hypothesis": "Michael's engagement party", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
292
+ {"question_id": "gpt4_b5700ca9", "hypothesis": "4 days ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
293
+ {"question_id": "9a707b81", "hypothesis": "21 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
294
+ {"question_id": "gpt4_1d4ab0c9", "hypothesis": "24 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
295
+ {"question_id": "gpt4_e072b769", "hypothesis": "about 3 weeks ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
296
+ {"question_id": "0db4c65d", "hypothesis": "18 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
297
+ {"question_id": "gpt4_1d80365e", "hypothesis": "2 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
298
+ {"question_id": "gpt4_7f6b06db", "hypothesis": "Yosemite, Big Sur and Monterey, Yosemite National Park", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
299
+ {"question_id": "gpt4_6dc9b45b", "hypothesis": "4 months ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
300
+ {"question_id": "gpt4_8279ba02", "hypothesis": "10 days ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
301
+ {"question_id": "gpt4_18c2b244", "hypothesis": "I used a Buy One Get One Free coupon on Luvs diapers at Walmart, then I redeemed $12 cashback for a $10 Amazon gift card from Ibotta, then I signed up for the rewards program at ShopRite.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
302
+ {"question_id": "gpt4_a1b77f9c", "hypothesis": "Insufficient evidence to calculate the total weeks spent.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
303
+ {"question_id": "gpt4_1916e0ea", "hypothesis": "I can identify the Instacart online grocery shopping date as 2023/02/28, but the provided evidence does not include the date you cancelled your FarmFresh subscription, so the number of days passed cannot be determined from this evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
304
+ {"question_id": "gpt4_7a0daae1", "hypothesis": "I can tell you received your new tennis racket on 2023/03/17, but the buy date isn’t provided in the evidence, so the number of weeks between buying and receiving can’t be determined from the available information.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
305
+ {"question_id": "gpt4_468eb063", "hypothesis": "9 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
306
+ {"question_id": "gpt4_7abb270c", "hypothesis": "Science Museum, Metropolitan Museum of Art, Museum of Contemporary Art, Modern Art Museum, and two missing museum visits", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
307
+ {"question_id": "gpt4_1e4a8aeb", "hypothesis": "6 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
308
+ {"question_id": "gpt4_4fc4f797", "hypothesis": "38 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
309
+ {"question_id": "4dfccbf7", "hypothesis": "24 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
310
+ {"question_id": "gpt4_61e13b3c", "hypothesis": "3 weeks", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
311
+ {"question_id": "gpt4_45189cb4", "hypothesis": "College Football National Championship game (Georgia vs. Alabama), then Lakers game, then Kansas City Chiefs vs. Buffalo Bills Divisional Round game", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
312
+ {"question_id": "2ebe6c90", "hypothesis": "21 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
313
+ {"question_id": "gpt4_e061b84f", "hypothesis": "volleyball league game, Spring Sprint Triathlon, company's annual charity soccer tournament", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
314
+ {"question_id": "370a8ff4", "hypothesis": "11 weeks and 4 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
315
+ {"question_id": "gpt4_d6585ce8", "hypothesis": "free outdoor concert series in the park, jazz night at a local bar", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
316
+ {"question_id": "gpt4_4ef30696", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
317
+ {"question_id": "gpt4_ec93e27f", "hypothesis": "train", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
318
+ {"question_id": "6e984301", "hypothesis": "about 9 weeks", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
319
+ {"question_id": "8077ef71", "hypothesis": "26 days ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
320
+ {"question_id": "gpt4_f420262c", "hypothesis": "Delta, American Airlines", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
321
+ {"question_id": "gpt4_8e165409", "hypothesis": "22", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
322
+ {"question_id": "gpt4_74aed68e", "hypothesis": "29 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
323
+ {"question_id": "bcbe585f", "hypothesis": "4 weeks and 2 days ago (about 4.3 weeks)", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
324
+ {"question_id": "gpt4_21adecb5", "hypothesis": "6 months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
325
+ {"question_id": "5e1b23de", "hypothesis": "3 months ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
326
+ {"question_id": "gpt4_98f46fc6", "hypothesis": "Insufficient evidence to determine", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
327
+ {"question_id": "gpt4_af6db32f", "hypothesis": "17 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
328
+ {"question_id": "eac54adc", "hypothesis": "19 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
329
+ {"question_id": "gpt4_7ddcf75f", "hypothesis": "3 days ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
330
+ {"question_id": "gpt4_a2d1d1f6", "hypothesis": "3 days ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
331
+ {"question_id": "gpt4_85da3956", "hypothesis": "3 weeks", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
332
+ {"question_id": "gpt4_b0863698", "hypothesis": "7 days ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
333
+ {"question_id": "gpt4_68e94287", "hypothesis": "The vegan chili recipe post", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
334
+ {"question_id": "gpt4_e414231e", "hypothesis": "4 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
335
+ {"question_id": "gpt4_7ca326fa", "hypothesis": "Rachel first, Alex second, Emma third", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
336
+ {"question_id": "gpt4_7bc6cf22", "hypothesis": "12 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
337
+ {"question_id": "2ebe6c92", "hypothesis": "The Song of Achilles", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
338
+ {"question_id": "gpt4_e061b84g", "hypothesis": "the company's annual charity soccer tournament", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
339
+ {"question_id": "71017277", "hypothesis": "my aunt", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
340
+ {"question_id": "b46e15ee", "hypothesis": "Walk for Hunger", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
341
+ {"question_id": "gpt4_d6585ce9", "hypothesis": "my sister", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
342
+ {"question_id": "gpt4_1e4a8aec", "hypothesis": "planted 12 new tomato saplings", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
343
+ {"question_id": "gpt4_f420262d", "hypothesis": "Delta", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
344
+ {"question_id": "gpt4_59149c78", "hypothesis": "the Metropolitan Museum of Art", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
345
+ {"question_id": "gpt4_e414231f", "hypothesis": "mountain bike", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
346
+ {"question_id": "gpt4_4929293b", "hypothesis": "your cousin's wedding", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
347
+ {"question_id": "gpt4_468eb064", "hypothesis": "Emma", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
348
+ {"question_id": "gpt4_fa19884d", "hypothesis": "Last Friday was 2023/03/31. The evidence from that day only shows artist recommendations, not a single explicit artist you definitely started listening to. The artists suggested then were Miles Davis, John Coltrane, Charlie Parker, and Thelonious Monk, plus bluegrass acts like Bill Monroe and the Blue Grass Boys, Earl Scruggs, Flatt and Scruggs, Alison Krauss and Union Station, and The Infamous Stringdusters.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
349
+ {"question_id": "9a707b82", "hypothesis": "croissants", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
350
+ {"question_id": "eac54add", "hypothesis": "I can’t identify any business milestone from the evidence. The only milestone-like detail mentioned around the retrieved time is that you had lost about 2 pounds after replacing your bathroom scale, which is a personal health update, not a business milestone.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
351
+ {"question_id": "4dfccbf8", "hypothesis": "Ukulele lessons with Rachel", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
352
+ {"question_id": "0bc8ad93", "hypothesis": "with a friend", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
353
+ {"question_id": "6e984302", "hypothesis": "a set of sculpting tools: a modeling tool set, a wire cutter, and a sculpting mat", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
354
+ {"question_id": "gpt4_8279ba03", "hypothesis": "I only have evidence of you buying a Samsung Galaxy S21 for about $800 at Best Buy, not a kitchen appliance 10 days before 2023/03/25.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
355
+ {"question_id": "gpt4_b5700ca0", "hypothesis": "the Episcopal Church", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
356
+ {"question_id": "gpt4_68e94288", "hypothesis": "#PlankChallenge", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
357
+ {"question_id": "gpt4_2655b836", "hypothesis": "No retrieved evidence mentions your new car’s first service or any issue you had after it, so the first post-service issue cannot be determined from the available evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
358
+ {"question_id": "gpt4_2487a7cb", "hypothesis": "The 'Data Analysis using Python' webinar", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
359
+ {"question_id": "gpt4_76048e76", "hypothesis": "bike", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
360
+ {"question_id": "gpt4_2312f94c", "hypothesis": "Samsung Galaxy S22", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
361
+ {"question_id": "0bb5a684", "hypothesis": "7 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
362
+ {"question_id": "08f4fc43", "hypothesis": "30 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
363
+ {"question_id": "2c63a862", "hypothesis": "14 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
364
+ {"question_id": "gpt4_385a5000", "hypothesis": "Tomatoes", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
365
+ {"question_id": "2a1811e2", "hypothesis": "11", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
366
+ {"question_id": "bbf86515", "hypothesis": "4 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
367
+ {"question_id": "gpt4_5dcc0aab", "hypothesis": "white Adidas sneakers", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
368
+ {"question_id": "gpt4_0b2f1d21", "hypothesis": "The stand mixer malfunction happened first.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
369
+ {"question_id": "f0853d11", "hypothesis": "14 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
370
+ {"question_id": "gpt4_6ed717ea", "hypothesis": "the training pads for Luna", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
371
+ {"question_id": "gpt4_70e84552", "hypothesis": "Fixing the fence", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
372
+ {"question_id": "a3838d2b", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
373
+ {"question_id": "gpt4_93159ced", "hypothesis": "I can’t compute the exact amount from the evidence: it shows you’ve been at NovaTech for about 4 years and 3 months, and that you started working in the field when you graduated, but it doesn’t give your graduation date or total career length before NovaTech.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
374
+ {"question_id": "gpt4_2d58bcd6", "hypothesis": "The Hate U Give", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
375
+ {"question_id": "gpt4_65aabe59", "hypothesis": "Can't tell from the evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
376
+ {"question_id": "982b5123", "hypothesis": "3 months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
377
+ {"question_id": "b9cfe692", "hypothesis": "I have evidence only for one of the two books: \"The Seven Husbands of Evelyn Hugo\" took you two and a half weeks to finish. There’s no evidence here for how long \"The Nightingale\" took, so I can’t compute a combined total from the provided memory.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
378
+ {"question_id": "gpt4_4edbafa2", "hypothesis": "2023-06-03", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
379
+ {"question_id": "c8090214", "hypothesis": "7 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
380
+ {"question_id": "gpt4_483dd43c", "hypothesis": "Game of Thrones", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
381
+ {"question_id": "e4e14d04", "hypothesis": "cannot be determined from the provided evidence", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
382
+ {"question_id": "c9f37c46", "hypothesis": "unresolved", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
383
+ {"question_id": "gpt4_2c50253f", "hypothesis": "I wake up at 6:30 AM on Tuesdays and Thursdays.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
384
+ {"question_id": "dcfa8644", "hypothesis": "14 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
385
+ {"question_id": "gpt4_b4a80587", "hypothesis": "the arrival of the new prime lens", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
386
+ {"question_id": "gpt4_9a159967", "hypothesis": "Southwest Airlines and American Airlines (tie)", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
387
+ {"question_id": "cc6d1ec1", "hypothesis": "about two months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
388
+ {"question_id": "gpt4_8c8961ae", "hypothesis": "the solo trip to Thailand", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
389
+ {"question_id": "gpt4_d9af6064", "hypothesis": "new router", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
390
+ {"question_id": "gpt4_7de946e7", "hypothesis": "persistent cough", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
391
+ {"question_id": "d01c6aa8", "hypothesis": "Cannot be determined from the retrieved evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
392
+ {"question_id": "993da5e2", "hypothesis": "I can only infer that you rearranged the furniture about three weeks before 2023/05/26; there’s no evidence here about when you started using the new area rug, so the duration before the rearrangement can’t be determined from the provided memory.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
393
+ {"question_id": "a3045048", "hypothesis": "30 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
394
+ {"question_id": "gpt4_d31cdae3", "hypothesis": "The family road trip across the American Southwest.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
395
+ {"question_id": "gpt4_cd90e484", "hypothesis": "Insufficient evidence to determine exactly", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
396
+ {"question_id": "gpt4_88806d6e", "hypothesis": "Can't tell from the evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
397
+ {"question_id": "gpt4_4cd9eba1", "hypothesis": "1 week", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
398
+ {"question_id": "gpt4_93f6379c", "hypothesis": "Page Turners", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
399
+ {"question_id": "b29f3365", "hypothesis": "six weeks", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
400
+ {"question_id": "gpt4_2f56ae70", "hypothesis": "There’s no evidence here of you starting any streaming service, so I can’t identify a most recent one from the provided memory.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
401
+ {"question_id": "6613b389", "hypothesis": "2 months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
402
+ {"question_id": "gpt4_78cf46a3", "hypothesis": "Receiving the new phone case happened first.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
403
+ {"question_id": "gpt4_0a05b494", "hypothesis": "the tourist from Australia", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
404
+ {"question_id": "gpt4_1a1dc16d", "hypothesis": "The pride parade", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
405
+ {"question_id": "gpt4_2f584639", "hypothesis": "The necklace for your sister.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
406
+ {"question_id": "gpt4_213fd887", "hypothesis": "the volleyball league", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
407
+ {"question_id": "gpt4_5438fa52", "hypothesis": "the start of your Spanish classes", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
408
+ {"question_id": "gpt4_c27434e8", "hypothesis": "Japanese Zero fighter plane model", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
409
+ {"question_id": "gpt4_fe651585", "hypothesis": "Rachel", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
410
+ {"question_id": "8c18457d", "hypothesis": "21 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
411
+ {"question_id": "gpt4_70e84552_abs", "hypothesis": "Fixing the fence", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
412
+ {"question_id": "gpt4_93159ced_abs", "hypothesis": "The evidence doesn’t support a current job at Google. On 2023/05/30, you were described as working at NovaTech, and while there’s a truncated note saying you’d been in software engineering before mentioning your current job, it doesn’t include the duration. So I can’t compute how long you’d been working before starting a job at Google from the provided evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
413
+ {"question_id": "982b5123_abs", "hypothesis": "There’s no evidence here of you booking an Airbnb in Sacramento. The only Airbnb mention is for San Francisco, tied to your stay in Haight-Ashbury on 2023/05/27.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
414
+ {"question_id": "c8090214_abs", "hypothesis": "I attended the Holiday Market 7 days before Black Friday, but there’s no evidence here that you bought an iPad, so the day difference to an iPad purchase can’t be determined from the provided information.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
415
+ {"question_id": "gpt4_c27434e8_abs", "hypothesis": "Insufficient evidence to determine", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
416
+ {"question_id": "gpt4_fe651585_abs", "hypothesis": "Alex", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
417
+ {"question_id": "6a1eabeb", "hypothesis": "25:50", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
418
+ {"question_id": "6aeb4375", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
419
+ {"question_id": "830ce83f", "hypothesis": "Chicago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
420
+ {"question_id": "852ce960", "hypothesis": "$350,000", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
421
+ {"question_id": "945e3d21", "hypothesis": "three times a week", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
422
+ {"question_id": "d7c942c3", "hypothesis": "Yes — your mom is using the same grocery list app as you.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
423
+ {"question_id": "71315a70", "hypothesis": "10-12 hours", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
424
+ {"question_id": "89941a93", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
425
+ {"question_id": "ce6d2d27", "hypothesis": "Friday", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
426
+ {"question_id": "9ea5eabc", "hypothesis": "Paris", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
427
+ {"question_id": "07741c44", "hypothesis": "shoe rack", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
428
+ {"question_id": "a1eacc2a", "hypothesis": "7", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
429
+ {"question_id": "184da446", "hypothesis": "220 pages", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
430
+ {"question_id": "031748ae", "hypothesis": "4 when you started; 5 now", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
431
+ {"question_id": "4d6b87c8", "hypothesis": "25", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
432
+ {"question_id": "0f05491a", "hypothesis": "120 stars", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
433
+ {"question_id": "08e075c7", "hypothesis": "I can’t determine that from the provided evidence. The memory snippets only mention your bedtime routine, reading, and using a massage roller, with no evidence about when you started using a Fitbit Charge 3.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
434
+ {"question_id": "f9e8c073", "hypothesis": "5 sessions", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
435
+ {"question_id": "41698283", "hypothesis": "70-200mm zoom lens", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
436
+ {"question_id": "2698e78f", "hypothesis": "Not specified; the memory only says to review it regularly with Dr. Smith.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
437
+ {"question_id": "b6019101", "hypothesis": "5", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
438
+ {"question_id": "45dc21b6", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
439
+ {"question_id": "5a4f22c0", "hypothesis": "TechCorp", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
440
+ {"question_id": "6071bd76", "hypothesis": "less", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
441
+ {"question_id": "e493bb7c", "hypothesis": "above my bed in my bedroom", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
442
+ {"question_id": "618f13b2", "hypothesis": "6", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
443
+ {"question_id": "72e3ee87", "hypothesis": "50", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
444
+ {"question_id": "c4ea545c", "hypothesis": "Insufficient evidence", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
445
+ {"question_id": "01493427", "hypothesis": "17", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
446
+ {"question_id": "6a27ffc2", "hypothesis": "30", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
447
+ {"question_id": "2133c1b5", "hypothesis": "3 months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
448
+ {"question_id": "18bc8abd", "hypothesis": "Kansas City Masterpiece BBQ sauce", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
449
+ {"question_id": "db467c8c", "hypothesis": "about nine months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
450
+ {"question_id": "7a87bd0c", "hypothesis": "4 weeks", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
451
+ {"question_id": "e61a7584", "hypothesis": "about 9 months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
452
+ {"question_id": "1cea1afa", "hypothesis": "600", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
453
+ {"question_id": "ed4ddc30", "hypothesis": "20 dozen eggs", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
454
+ {"question_id": "8fb83627", "hypothesis": "5", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
455
+ {"question_id": "b01defab", "hypothesis": "Yes", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
456
+ {"question_id": "22d2cb42", "hypothesis": "the music shop on Main St", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
457
+ {"question_id": "0e4e4c46", "hypothesis": "132 points", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
458
+ {"question_id": "4b24c848", "hypothesis": "5", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
459
+ {"question_id": "7e974930", "hypothesis": "Your most recent Downtown Farmers Market visit was on 2023/09/30, but the exact earnings amount isn’t shown in the retrieved evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
460
+ {"question_id": "603deb26", "hypothesis": "10", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
461
+ {"question_id": "59524333", "hypothesis": "6:00 pm", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
462
+ {"question_id": "5831f84d", "hypothesis": "10", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
463
+ {"question_id": "eace081b", "hypothesis": "Oahu", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
464
+ {"question_id": "affe2881", "hypothesis": "32", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
465
+ {"question_id": "50635ada", "hypothesis": "Premier Gold status", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
466
+ {"question_id": "e66b632c", "hypothesis": "26 minutes and 30 seconds", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
467
+ {"question_id": "0ddfec37", "hypothesis": "Cannot be determined from available evidence", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
468
+ {"question_id": "f685340e", "hypothesis": "Previously, every other week. Now, not explicitly stated.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
469
+ {"question_id": "cc5ded98", "hypothesis": "an hour each day", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
470
+ {"question_id": "dfde3500", "hypothesis": "Wednesday", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
471
+ {"question_id": "69fee5aa", "hypothesis": "37", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
472
+ {"question_id": "7401057b", "hypothesis": "1 free night's stay", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
473
+ {"question_id": "cf22b7bf", "hypothesis": "10 pounds", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
474
+ {"question_id": "a2f3aa27", "hypothesis": "close to 1300 followers", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
475
+ {"question_id": "c7dc5443", "hypothesis": "5-2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
476
+ {"question_id": "06db6396", "hypothesis": "5", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
477
+ {"question_id": "3ba21379", "hypothesis": "Ford F-150 pickup truck model", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
478
+ {"question_id": "9bbe84a2", "hypothesis": "level 100 before the end of the year", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
479
+ {"question_id": "10e09553", "hypothesis": "7", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
480
+ {"question_id": "dad224aa", "hypothesis": "7:30 am", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
481
+ {"question_id": "ba61f0b9", "hypothesis": "6", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
482
+ {"question_id": "42ec0761", "hypothesis": "Yes, you have a spare screwdriver.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
483
+ {"question_id": "5c40ec5b", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
484
+ {"question_id": "c6853660", "hypothesis": "increase", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
485
+ {"question_id": "26bdc477", "hypothesis": "5", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
486
+ {"question_id": "0977f2af", "hypothesis": "I can’t tell from the retrieved evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
487
+ {"question_id": "6aeb4375_abs", "hypothesis": "{\"answer\":0}", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
488
+ {"question_id": "031748ae_abs", "hypothesis": "4", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
489
+ {"question_id": "2698e78f_abs", "hypothesis": "No evidence provided states how often you see Dr. Johnson.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
490
+ {"question_id": "2133c1b5_abs", "hypothesis": "About 7 months. The evidence says it had been 1 month in your new studio apartment in Harajuku on 2023/04/11, which makes it about 7 months by 2023/10/15. The memory mentions Harajuku, not Shinjuku.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
491
+ {"question_id": "0ddfec37_abs", "hypothesis": "0", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
492
+ {"question_id": "f685340e_abs", "hypothesis": "Every other week.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
493
+ {"question_id": "89941a94", "hypothesis": "Yes — a road bike.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
494
+ {"question_id": "07741c45", "hypothesis": "shoe rack", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
495
+ {"question_id": "7161e7e2", "hypothesis": "8 am - 4 pm (Day Shift)", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
496
+ {"question_id": "c4f10528", "hypothesis": "Miss Bee Providore", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
497
+ {"question_id": "89527b6b", "hypothesis": "I can only recover that the dinosaur-book conversation was on 2023/05/23 and the visible excerpt includes a T-Rex with a green scaly body; the Plesiosaur image description and its body color are not present in the provided evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
498
+ {"question_id": "e9327a54", "hypothesis": "I don’t have the dessert shop’s name in the retrieved memory.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
499
+ {"question_id": "4c36ccef", "hypothesis": "Roscioli", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
500
+ {"question_id": "6ae235be", "hypothesis": "Atmospheric distillation, fluid catalytic cracking (FCC), alkylation, and hydrotreating.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
results/judge_gpt4o_alias_vectorengine.jsonl.summary.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metric_model_alias": "gpt-4o",
3
+ "metric_model": "gpt-4o-2024-08-06",
4
+ "base_url": "OpenAI-compatible judge endpoint",
5
+ "hyp_file": "<longmemeval-root>/runs/lme_s500_frozen_baseline38_gpt54_full10_20260525_085540/off/predictions.jsonl",
6
+ "ref_file": "<longmemeval-root>/data/longmemeval_s_cleaned.json",
7
+ "result_file": "<longmemeval-root>/runs/lme_s500_frozen_baseline38_gpt54_full10_20260525_085540/off/judge_gpt4o_alias_vectorengine.jsonl",
8
+ "evaluated_count": 500,
9
+ "accuracy": 0.62,
10
+ "by_question_type": {
11
+ "knowledge-update": {
12
+ "accuracy": 0.7051,
13
+ "count": 78
14
+ },
15
+ "multi-session": {
16
+ "accuracy": 0.3985,
17
+ "count": 133
18
+ },
19
+ "single-session-assistant": {
20
+ "accuracy": 0.7857,
21
+ "count": 56
22
+ },
23
+ "single-session-preference": {
24
+ "accuracy": 0.5667,
25
+ "count": 30
26
+ },
27
+ "single-session-user": {
28
+ "accuracy": 0.8143,
29
+ "count": 70
30
+ },
31
+ "temporal-reasoning": {
32
+ "accuracy": 0.6316,
33
+ "count": 133
34
+ }
35
+ }
36
+ }
results/lme_s500_frozen_baseline38_full10_20260525_results.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e88380901df26ac2374524fb812f6405d994775d1163db228fd9aedf1e56acea
3
+ size 79506510
results/predictions.jsonl ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"question_id": "7e00a6cb", "hypothesis": "International Budget Hostel"}
2
+ {"question_id": "1903aded", "hypothesis": "I can’t recover the 7th job from the provided memory evidence. The retrieved notes only show unrelated lists, like living room color schemes, dessert ideas, and sales tax questions, not the earlier work-from-home jobs for seniors list."}
3
+ {"question_id": "ceb54acb", "hypothesis": "Sexual fixations, Problematic sexual behaviors, Sexual impulsivity, Compulsive sexuality"}
4
+ {"question_id": "f523d9fe", "hypothesis": "I can’t tell from the available memory."}
5
+ {"question_id": "0e5e2d1a", "hypothesis": "38 subjects"}
6
+ {"question_id": "fea54f57", "hypothesis": "Evolution"}
7
+ {"question_id": "cc539528", "hypothesis": "Ruby, Python, or PHP"}
8
+ {"question_id": "dc439ea3", "hypothesis": "Hoop Dance"}
9
+ {"question_id": "18dcd5a5", "hypothesis": "4"}
10
+ {"question_id": "488d3006", "hypothesis": "GR-90"}
11
+ {"question_id": "58470ed2", "hypothesis": "The Library is a sphere whose exact center is any one of its hexagons and whose circumference is inaccessible."}
12
+ {"question_id": "8cf51dda", "hypothesis": "1. To identify molecular subtypes of endometrial cancer using a combination of genomic and transcriptomic approaches. 2. To investigate the clinical and biological significance of the identified molecular subtypes, including their association with patient outcomes. 3. To develop biomarkers for the early detection and prognosis of endometrial cancer based on the identified molecular subtypes."}
13
+ {"question_id": "1d4da289", "hypothesis": "biometric authentication and one-time passwords (OTP)"}
14
+ {"question_id": "8464fc84", "hypothesis": "Roscioli"}
15
+ {"question_id": "8aef76bc", "hypothesis": "Mod Podge or another sealant"}
16
+ {"question_id": "71a3fd6b", "hypothesis": "+49 (0) 62 32 / 14 23 - 0"}
17
+ {"question_id": "2bf43736", "hypothesis": "Chapter 4 of Book 1, titled \"Vocal Prayer and Meditation."}
18
+ {"question_id": "70b3e69b", "hypothesis": "Marina Rossell"}
19
+ {"question_id": "8752c811", "hypothesis": "The 27th parameter was Perspective."}
20
+ {"question_id": "3249768e", "hypothesis": "Absinthe"}
21
+ {"question_id": "1b9b7252", "hypothesis": "Mindful.org"}
22
+ {"question_id": "1568498a", "hypothesis": "29. Rd3"}
23
+ {"question_id": "6222b6eb", "hypothesis": "6S"}
24
+ {"question_id": "e8a79c70", "hypothesis": "2-3 eggs"}
25
+ {"question_id": "d596882b", "hypothesis": "By Chloe"}
26
+ {"question_id": "e3fc4d6e", "hypothesis": "Dr. Arati Prabhakar"}
27
+ {"question_id": "51b23612", "hypothesis": "Nu, pogodi!"}
28
+ {"question_id": "3e321797", "hypothesis": "10 minutes"}
29
+ {"question_id": "e982271f", "hypothesis": "Revolution Hall"}
30
+ {"question_id": "352ab8bd", "hypothesis": "approximately 20%"}
31
+ {"question_id": "fca762bc", "hypothesis": "Memrise"}
32
+ {"question_id": "7a8d0b71", "hypothesis": "In the visible remembered campaign-plan excerpt, I can confirm we discussed the DHL Wellness Retreats email marketing campaign, but the specific influencer marketing allocation amount is not shown in the evidence provided."}
33
+ {"question_id": "a40e080f", "hypothesis": "Patagonia and Southwest Airlines"}
34
+ {"question_id": "8b9d4367", "hypothesis": "Jaipur Rugs"}
35
+ {"question_id": "5809eb10", "hypothesis": "2015"}
36
+ {"question_id": "41275add", "hypothesis": "How to Sit Properly at a Desk to Avoid Back Pain (Mayo Clinic)"}
37
+ {"question_id": "4388e9dd", "hypothesis": "an untidy, stained white shirt"}
38
+ {"question_id": "4baee567", "hypothesis": "12"}
39
+ {"question_id": "561fabcd", "hypothesis": "Contaminated Colossus"}
40
+ {"question_id": "b759caee", "hypothesis": "@jessica_poole_jewellery"}
41
+ {"question_id": "ac031881", "hypothesis": "squares"}
42
+ {"question_id": "28bcfaac", "hypothesis": "MusicTheory.net"}
43
+ {"question_id": "16c90bf4", "hypothesis": "A pilsner or lager."}
44
+ {"question_id": "c8f1aeed", "hypothesis": "Pennsylvania"}
45
+ {"question_id": "eaca4986", "hypothesis": "C D E F G A B A G F E D C"}
46
+ {"question_id": "c7cf7dfd", "hypothesis": "Nostalgia"}
47
+ {"question_id": "e48988bc", "hypothesis": "Patagonia"}
48
+ {"question_id": "1de5cff2", "hypothesis": "Veja"}
49
+ {"question_id": "65240037", "hypothesis": "1:10 ratio"}
50
+ {"question_id": "778164c6", "hypothesis": "The evidence only shows that we discussed popular Caribbean dishes featuring snapper, but it does not include the actual Jamaican fruit-based dish name."}
51
+ {"question_id": "e47becba", "hypothesis": "Business Administration"}
52
+ {"question_id": "118b2229", "hypothesis": "45 minutes each way"}
53
+ {"question_id": "51a45a95", "hypothesis": "The evidence does not say where you redeemed the $5 coffee creamer coupon."}
54
+ {"question_id": "58bf7951", "hypothesis": "The Glass Menagerie"}
55
+ {"question_id": "1e043500", "hypothesis": "Summer Vibes"}
56
+ {"question_id": "c5e8278d", "hypothesis": "Johnson"}
57
+ {"question_id": "6ade9755", "hypothesis": "Serenity Yoga"}
58
+ {"question_id": "6f9b354f", "hypothesis": "a lighter shade of gray"}
59
+ {"question_id": "58ef2f1c", "hypothesis": "back on Valentine's Day"}
60
+ {"question_id": "f8c5f88b", "hypothesis": "The retrieved evidence does not say where you bought the new tennis racket from."}
61
+ {"question_id": "5d3d2817", "hypothesis": "marketing specialist at a small startup"}
62
+ {"question_id": "7527f7e2", "hypothesis": "$800"}
63
+ {"question_id": "c960da58", "hypothesis": "20"}
64
+ {"question_id": "3b6f954b", "hypothesis": "University of Melbourne"}
65
+ {"question_id": "726462e0", "hypothesis": "10%"}
66
+ {"question_id": "94f70d80", "hypothesis": "The retrieved evidence does not state any assembly time for an IKEA bookshelf. It only mentions plans to visit IKEA to look at coffee tables and read reviews, so the duration cannot be determined from this evidence."}
67
+ {"question_id": "66f24dbb", "hypothesis": "No sister gift is shown in memory; the only recorded birthday gift purchase was a customized phone case with his favorite game's logo for your brother."}
68
+ {"question_id": "ad7109d1", "hypothesis": "The retrieved evidence does not specify the speed of your new internet plan."}
69
+ {"question_id": "af8d2e46", "hypothesis": "7"}
70
+ {"question_id": "dccbc061", "hypothesis": "staunch atheist"}
71
+ {"question_id": "c8c3f81d", "hypothesis": "Nike"}
72
+ {"question_id": "8ebdbe50", "hypothesis": "Data Science certification"}
73
+ {"question_id": "6b168ec8", "hypothesis": "3"}
74
+ {"question_id": "75499fd8", "hypothesis": "Golden Retriever"}
75
+ {"question_id": "21436231", "hypothesis": "Not stated"}
76
+ {"question_id": "95bcc1c8", "hypothesis": "10"}
77
+ {"question_id": "0862e8bf", "hypothesis": "Luna"}
78
+ {"question_id": "853b0a1d", "hypothesis": "I can tell it was a gift from your grandma, but the evidence here doesn’t include your age when she gave you the silver necklace."}
79
+ {"question_id": "a06e4cfe", "hypothesis": "3:1"}
80
+ {"question_id": "37d43f65", "hypothesis": "16GB"}
81
+ {"question_id": "b86304ba", "hypothesis": "Triple what you paid for it."}
82
+ {"question_id": "d52b4f67", "hypothesis": "Grand Ballroom"}
83
+ {"question_id": "25e5aa4f", "hypothesis": "UCLA"}
84
+ {"question_id": "caf9ead2", "hypothesis": "around 5 hours"}
85
+ {"question_id": "8550ddae", "hypothesis": "Lavender Dream cocktail"}
86
+ {"question_id": "60d45044", "hypothesis": "Japanese short-grain rice"}
87
+ {"question_id": "3f1e9474", "hypothesis": "Sarah"}
88
+ {"question_id": "86b68151", "hypothesis": "IKEA"}
89
+ {"question_id": "577d4d32", "hypothesis": "7 pm"}
90
+ {"question_id": "ec81a493", "hypothesis": "500 copies"}
91
+ {"question_id": "15745da0", "hypothesis": "about three months"}
92
+ {"question_id": "e01b8e2f", "hypothesis": "Hawaii"}
93
+ {"question_id": "bc8a6e93", "hypothesis": "lemon blueberry cake"}
94
+ {"question_id": "ccb36322", "hypothesis": "There’s no evidence here of any music streaming service you’ve been using lately; the only services mentioned are photo album services like Shutterfly, Snapfish, and Mixbook."}
95
+ {"question_id": "001be529", "hypothesis": "Your asylum application had been approved by May 28, 2023, and there was a waiting period, but the evidence does not state how long you waited for the decision."}
96
+ {"question_id": "b320f3f8", "hypothesis": "a rare blue Snaggletooth action figure"}
97
+ {"question_id": "19b5f2b3", "hypothesis": "The evidence only shows that you were in Japan/Kyoto and planning to travel elsewhere in Asia afterward, but it does not state how long you were in Japan."}
98
+ {"question_id": "4fd1909e", "hypothesis": "Xfinity Center"}
99
+ {"question_id": "545bd2b5", "hypothesis": "2 hours per day"}
100
+ {"question_id": "8a137a7f", "hypothesis": "Philips LED bulb"}
101
+ {"question_id": "76d63226", "hypothesis": "55-inch"}
102
+ {"question_id": "86f00804", "hypothesis": "The Seven Husbands of Evelyn Hugo"}
103
+ {"question_id": "8e9d538c", "hypothesis": "17 skeins"}
104
+ {"question_id": "311778f1", "hypothesis": "around 10 hours"}
105
+ {"question_id": "c19f7a0b", "hypothesis": "around 6:30 pm"}
106
+ {"question_id": "4100d0a0", "hypothesis": "Irish and Italian"}
107
+ {"question_id": "29f2956b", "hypothesis": "30 minutes daily"}
108
+ {"question_id": "1faac195", "hypothesis": "Denver"}
109
+ {"question_id": "faba32e5", "hypothesis": "24 hours"}
110
+ {"question_id": "f4f1d8a4", "hypothesis": "my sister"}
111
+ {"question_id": "c14c00dd", "hypothesis": "Trader Joe's"}
112
+ {"question_id": "36580ce8", "hypothesis": "The evidence only says you’d been dealing with unspecified health issues for a few months; it does not identify any specific issue you initially thought was just a cold."}
113
+ {"question_id": "3d86fd0a", "hypothesis": "Information unavailable"}
114
+ {"question_id": "a82c026e", "hypothesis": "Dark Souls 3 DLC"}
115
+ {"question_id": "0862e8bf_abs", "hypothesis": "I can’t determine your hamster’s name from the provided evidence because the memories only mention your cat and unrelated coding discussion, not any hamster."}
116
+ {"question_id": "15745da0_abs", "hypothesis": "No evidence says how long you've been collecting vintage films."}
117
+ {"question_id": "bc8a6e93_abs", "hypothesis": "I don't know"}
118
+ {"question_id": "19b5f2b3_abs", "hypothesis": "The available evidence doesn’t mention a Korea trip or its duration, so I can’t determine how long you were in Korea."}
119
+ {"question_id": "29f2956b_abs", "hypothesis": "I only have evidence that you practice guitar for 30 minutes daily; there’s no evidence here about a daily violin practice time."}
120
+ {"question_id": "f4f1d8a4_abs", "hypothesis": "I only remember a new stand mixer as a birthday gift from your sister, not your dad."}
121
+ {"question_id": "0a995998", "hypothesis": "2"}
122
+ {"question_id": "6d550036", "hypothesis": "1"}
123
+ {"question_id": "gpt4_59c863d7", "hypothesis": "4"}
124
+ {"question_id": "b5ef892d", "hypothesis": "5 days"}
125
+ {"question_id": "e831120c", "hypothesis": "3.5 weeks"}
126
+ {"question_id": "3a704032", "hypothesis": "3"}
127
+ {"question_id": "gpt4_d84a3211", "hypothesis": "I can identify bike-related activity this year, like getting bike lights installed, but there are no explicit dollar amounts for any bike-related purchases or services in the evidence, so I can’t compute a total spent."}
128
+ {"question_id": "aae3761f", "hypothesis": "10 hours"}
129
+ {"question_id": "gpt4_f2262a51", "hypothesis": "1"}
130
+ {"question_id": "dd2973ad", "hypothesis": "I can only narrow it down indirectly: your doctor’s appointment was at 10 AM on Thursday 2023-05-18, so the day before was Wednesday 2023-05-17. The only bedtime evidence says you went to bed at 2 AM on last Wednesday relative to 2023-05-29, which refers to 2023-05-24, not 2023-05-17. So the closest bedtime mentioned is 2 AM, but it does not match the day before that appointment."}
131
+ {"question_id": "c4a1ceb8", "hypothesis": "3"}
132
+ {"question_id": "gpt4_a56e767c", "hypothesis": "3"}
133
+ {"question_id": "6cb6f249", "hypothesis": "17 days"}
134
+ {"question_id": "46a3abf7", "hypothesis": "3"}
135
+ {"question_id": "36b9f61e", "hypothesis": "2000"}
136
+ {"question_id": "28dc39ac", "hypothesis": "29 hours"}
137
+ {"question_id": "gpt4_2f8be40d", "hypothesis": "2"}
138
+ {"question_id": "2e6d26dc", "hypothesis": "3"}
139
+ {"question_id": "gpt4_15e38248", "hypothesis": "2"}
140
+ {"question_id": "88432d0a", "hypothesis": "4"}
141
+ {"question_id": "80ec1f4f", "hypothesis": "1"}
142
+ {"question_id": "d23cf73b", "hypothesis": "2"}
143
+ {"question_id": "gpt4_7fce9456", "hypothesis": "2"}
144
+ {"question_id": "d682f1a2", "hypothesis": "2"}
145
+ {"question_id": "7024f17c", "hypothesis": "0.5 hours"}
146
+ {"question_id": "gpt4_5501fe77", "hypothesis": "TikTok"}
147
+ {"question_id": "gpt4_2ba83207", "hypothesis": "Trader Joe's"}
148
+ {"question_id": "2318644b", "hypothesis": "Tokyo is the only place with a numeric nightly accommodation cost in the evidence: about $30/night. For Hawaii, the evidence only says you had a luxurious resort stay in Maui, which suggests it was more expensive but does not give a dollar amount, so the exact per-night difference cannot be calculated from the evidence."}
149
+ {"question_id": "2ce6a0f2", "hypothesis": "1"}
150
+ {"question_id": "gpt4_d12ceb0e", "hypothesis": "Insufficient evidence to compute the average age of you, your parents, and your grandparents."}
151
+ {"question_id": "00ca467f", "hypothesis": "2"}
152
+ {"question_id": "b3c15d39", "hypothesis": "5 days"}
153
+ {"question_id": "gpt4_31ff4165", "hypothesis": "4"}
154
+ {"question_id": "eeda8a6d", "hypothesis": "17"}
155
+ {"question_id": "2788b940", "hypothesis": "2"}
156
+ {"question_id": "60bf93ed", "hypothesis": "It arrived on 1/20, but I don’t see the purchase date in the evidence, so I can’t calculate the number of days exactly."}
157
+ {"question_id": "9d25d4e0", "hypothesis": "2"}
158
+ {"question_id": "129d1232", "hypothesis": "600 dollars"}
159
+ {"question_id": "60472f9c", "hypothesis": "1"}
160
+ {"question_id": "gpt4_194be4b3", "hypothesis": "3"}
161
+ {"question_id": "a9f6b44c", "hypothesis": "2"}
162
+ {"question_id": "d851d5ba", "hypothesis": "At least 2,250 USD"}
163
+ {"question_id": "5a7937c8", "hypothesis": "2"}
164
+ {"question_id": "gpt4_ab202e7f", "hypothesis": "2"}
165
+ {"question_id": "gpt4_e05b82a6", "hypothesis": "7"}
166
+ {"question_id": "gpt4_731e37d7", "hypothesis": "$200"}
167
+ {"question_id": "edced276", "hypothesis": "8 days"}
168
+ {"question_id": "10d9b85a", "hypothesis": "2"}
169
+ {"question_id": "e3038f8c", "hypothesis": "87"}
170
+ {"question_id": "2b8f3739", "hypothesis": "$495"}
171
+ {"question_id": "1a8a66a6", "hypothesis": "2"}
172
+ {"question_id": "c2ac3c61", "hypothesis": "5"}
173
+ {"question_id": "bf659f65", "hypothesis": "1"}
174
+ {"question_id": "gpt4_372c3eed", "hypothesis": "8 years"}
175
+ {"question_id": "gpt4_2f91af09", "hypothesis": "6"}
176
+ {"question_id": "81507db6", "hypothesis": "3"}
177
+ {"question_id": "88432d0a_abs", "hypothesis": "0"}
178
+ {"question_id": "80ec1f4f_abs", "hypothesis": "0"}
179
+ {"question_id": "eeda8a6d_abs", "hypothesis": "not evidenced in the retrieved memory"}
180
+ {"question_id": "60bf93ed_abs", "hypothesis": "70 days"}
181
+ {"question_id": "edced276_abs", "hypothesis": "I can only confirm a Hawaii trip, and no number of days is given for it. There’s also no evidence here of a Seattle trip duration, so the total can’t be determined from the provided evidence."}
182
+ {"question_id": "gpt4_372c3eed_abs", "hypothesis": "8 years documented; no evidence of a completed Master's degree"}
183
+ {"question_id": "8a2466db", "hypothesis": "Sorry, I don’t know of any specific resources from memory."}
184
+ {"question_id": "06878be2", "hypothesis": "A comfortable, durable Sony-compatible camera bag, a Godox V1 hard case or Think Tank Photo Flash Pouch, and a lens cleaning kit with a microfiber cloth, blower, and lens cleaning solution would complement your Sony A7R IV setup."}
185
+ {"question_id": "75832dbd", "hypothesis": "Recent publications or conferences on the impact of mega-churches on Baptist church vitality, Baptist identity and mission, and practical collaboration models among smaller Baptist churches—especially case-study-focused work on regional or denominational cooperation."}
186
+ {"question_id": "0edc2aef", "hypothesis": "Look for a Miami hotel with a balcony hot tub and a standout view, similar to your preferred room at The Edgewater Hotel."}
187
+ {"question_id": "35a27287", "hypothesis": "French- or Spanish-focused language exchange meetups, conversation groups, or cultural events nearby would fit you best; check Meetup, Facebook Groups, local universities, or language schools this weekend."}
188
+ {"question_id": "32260d93", "hypothesis": "Mike Birbiglia: My Girlfriend's Boyfriend"}
189
+ {"question_id": "195a1a1b", "hypothesis": "Try a calm wind-down routine: read a non-stimulating book, do a guided sleep meditation or body scan, practice deep breathing, do gentle yoga or stretching, listen to soothing music or nature sounds, and take a 30-minute break from screens before bed."}
190
+ {"question_id": "afdc33df", "hypothesis": "Regularly clean your granite countertop, use coasters or pads under pots and utensils, always use a cutting board, and check the faucet/aerator for moisture or buildup so the sink area stays cleaner."}
191
+ {"question_id": "caf03d32", "hypothesis": "Try more plant-based slow cooker recipes like lentil curry, roasted vegetable stew, black bean chili, eggplant parmesan, vegan chili, or a quinoa and black bean bowl."}
192
+ {"question_id": "54026fce", "hypothesis": "Try suggesting virtual coffee breaks in a collaborative way: ask open-ended questions about staying connected, share that you miss casual chats, propose a weekly virtual coffee break, and invite your team’s feedback before deciding."}
193
+ {"question_id": "06f04340", "hypothesis": "Serve a hearty quinoa salad with your roasted homegrown vegetables over mixed greens, finished with vinaigrette; roast the vegetables first, then toss with the greens and quinoa."}
194
+ {"question_id": "6b7dfb22", "hypothesis": "Study real flowers from life or photos, browse Instagram and art communities for ideas, experiment with new techniques like palette-knife texture, and take breaks while tracking small wins to help inspiration return."}
195
+ {"question_id": "1a1907b4", "hypothesis": "Try a Pimm's Cup with a Twist, especially with Ruby Red Grapefruit simple syrup."}
196
+ {"question_id": "09d032c9", "hypothesis": "A compact portable power bank could help as a backup."}
197
+ {"question_id": "38146c39", "hypothesis": "Try adding warm spices, nuts like walnuts or pecans, a little citrus zest, or a touch of espresso or coffee for extra flavor."}
198
+ {"question_id": "d24813b1", "hypothesis": "Bake chocolate chip cookies or a lemon poppyseed cake; both fit your recent baking interests, and the lemon poppyseed cake worked well for a colleague event before."}
199
+ {"question_id": "57f827a0", "hypothesis": "No direct memory-backed evidence shows you were planning to rearrange your bedroom furniture this weekend."}
200
+ {"question_id": "95228167", "hypothesis": "Try several guitars and focus on comfort, playability, build quality, tuning stability, and the sound you like best."}
201
+ {"question_id": "505af2f5", "hypothesis": "Try a homemade vanilla-cinnamon creamer with unsweetened almond or oat milk, a little monk fruit or stevia instead of honey, and optional coconut or hazelnut extract."}
202
+ {"question_id": "75f70248", "hypothesis": "Possibly yes—the prior memory suggests your living room had a dust issue and a shedding cat, so dust or pet hair there could be contributing to your sneezing."}
203
+ {"question_id": "d6233ab6", "hypothesis": "Yes—if you’re feeling nostalgic and want to reconnect with old high school friends, attending your reunion sounds like a good idea."}
204
+ {"question_id": "1da05512", "hypothesis": "Buy now if your storage issue is already a problem and a beginner-friendly 2-bay model fits your budget; wait only if you expect much larger future storage needs or still need to compare options."}
205
+ {"question_id": "fca70973", "hypothesis": "Try Universal Studios Hollywood’s VIP Experience with the Gourmet Buffet and Behind-the-Scenes Tour as the top pick; Disneyland Resort Halloween dining or Knott’s Scary Farm Boo-fet would also fit your immersive, exclusive theme park weekend vibe."}
206
+ {"question_id": "b6025781", "hypothesis": "Try herb-rosemary roasted sweet potatoes, quinoa and roasted vegetable bowls with salmon, tofu, or lentil bolognese, and a batch of homemade granola for snacks."}
207
+ {"question_id": "a89d7624", "hypothesis": "Visit Red Rocks Park and Amphitheatre and check out other live music spots like The Ogden Theatre, The Bluebird Theater, The Fillmore Auditorium, and Larimer Lounge."}
208
+ {"question_id": "b0479f84", "hypothesis": "Try Planet Earth, Blue Planet, Dynasties, Chasing Coral, and March of the Penguins—especially the nature and wildlife ones since that’s what you were most interested in."}
209
+ {"question_id": "1d4e3b97", "hypothesis": "Yes — the recent upgrade to your bike’s chain and cassette could be why it’s performing better."}
210
+ {"question_id": "07b6f563", "hypothesis": "An iPhone 13 Pro-compatible wallet case like the OtterBox wallet case would be a useful accessory; I’d skip suggesting a power bank since you already have one that’s working well."}
211
+ {"question_id": "1c0ddc50", "hypothesis": "Listen to history and science podcasts during your 40-minute commute—try starter episodes from Hardcore History, Lore, The Dollop, StarTalk Radio, or Radiolab."}
212
+ {"question_id": "0a34ad58", "hypothesis": "Since you already have a Suica card, use it for trains and subways to make getting around easier. If you’re heading out from Shinjuku, plan a little extra time, especially for early or busy destinations like Tsukiji, where it’s best to leave very early and expect crowds. Keep some yen/cash with you because some places, especially around markets, may not take cards, and if needed you can exchange money near Shinjuku."}
213
+ {"question_id": "d3ab962e", "hypothesis": "8 miles"}
214
+ {"question_id": "2311e44b", "hypothesis": "The Nightingale' is 440 pages long, but I don’t have a numeric page-progress value for how far you’ve already read, so the exact pages left can’t be calculated from the evidence."}
215
+ {"question_id": "cc06de0d", "hypothesis": "I can’t compute the difference from the provided evidence because it gives your daily train fare as $6, but it doesn’t include the taxi ride cost."}
216
+ {"question_id": "a11281a2", "hypothesis": "about 100 followers"}
217
+ {"question_id": "4f54b7c9", "hypothesis": "5"}
218
+ {"question_id": "85fa3a3f", "hypothesis": "50"}
219
+ {"question_id": "9aaed6a3", "hypothesis": "$0.75"}
220
+ {"question_id": "1f2b8d4f", "hypothesis": "I can’t determine an exact price difference from the evidence. The price of your luxury boots isn’t given, and no specific similar pair found at a budget store has a stated price. The closest budget-store-style prices mentioned were about $20-$50 for affordable ankle boots, but that still isn’t enough to calculate the difference."}
221
+ {"question_id": "e6041065", "hypothesis": "40%"}
222
+ {"question_id": "51c32626", "hypothesis": "No explicit submission date was given; it was only mentioned by 2023/05/22 as a past event."}
223
+ {"question_id": "d905b33f", "hypothesis": "20%"}
224
+ {"question_id": "7405e8b1", "hypothesis": "HelloFresh: 40% on your first order. For UberEats, the evidence only says you got 20% off an order last week, not that it was your first order."}
225
+ {"question_id": "f35224e0", "hypothesis": "At least 12 episodes"}
226
+ {"question_id": "6456829e", "hypothesis": "5 tomato plants and 3 cucumber plants"}
227
+ {"question_id": "a4996e51", "hypothesis": "Up to 50 hours/week."}
228
+ {"question_id": "3c1045c8", "hypothesis": "unavailable"}
229
+ {"question_id": "60036106", "hypothesis": "12,000 people"}
230
+ {"question_id": "681a1674", "hypothesis": "1"}
231
+ {"question_id": "e25c3b8d", "hypothesis": "I can only tell that you paid $200 for the designer handbag at TK Maxx; the savings amount isn’t stated in the evidence, so it can’t be determined from this information alone."}
232
+ {"question_id": "4adc0475", "hypothesis": "Insufficient evidence for an exact total."}
233
+ {"question_id": "4bc144e2", "hypothesis": "Parking ticket: $50; car wash not found in the retrieved evidence."}
234
+ {"question_id": "ef66a6e5", "hypothesis": "4"}
235
+ {"question_id": "5025383b", "hypothesis": "cooking"}
236
+ {"question_id": "a1cc6108", "hypothesis": "You were about 0 years old when Alex was born."}
237
+ {"question_id": "9ee3ecd6", "hypothesis": "300 points"}
238
+ {"question_id": "3fdac837", "hypothesis": "12 days"}
239
+ {"question_id": "91b15a6e", "hypothesis": "$5,150"}
240
+ {"question_id": "27016adc", "hypothesis": "The planned renovations are $20,000. I don’t see the countryside property’s purchase price in the provided evidence, so the exact percentage can’t be computed from this evidence alone."}
241
+ {"question_id": "720133ac", "hypothesis": "$75"}
242
+ {"question_id": "77eafa52", "hypothesis": "$300"}
243
+ {"question_id": "8979f9ec", "hypothesis": "I can only confirm those were lunch meal-prep recipe ideas mentioned on 2023/05/26; there’s no evidence here giving a number of meals for the chicken fajitas or any lentil soup, so the total can’t be computed from the provided evidence."}
244
+ {"question_id": "0100672e", "hypothesis": "$12 per coffee mug"}
245
+ {"question_id": "a96c20ee", "hypothesis": "I presented a poster on my thesis research at my first research conference over the summer, but the university isn't stated in the evidence provided."}
246
+ {"question_id": "92a0aa75", "hypothesis": "Senior Marketing Specialist at the company"}
247
+ {"question_id": "3fe836c9", "hypothesis": "The final sale price was $325,000. The only other amount mentioned is $350,000, but in the evidence it appears as a generic mortgage example, not a clearly stated pre-approval amount, so the exact difference can’t be determined from the evidence."}
248
+ {"question_id": "1c549ce4", "hypothesis": "The only purchase cost shown is the waterproof car cover at $120. I don’t see evidence that you purchased a detailing spray or what it cost, so I can’t add a total beyond the confirmed $120."}
249
+ {"question_id": "6c49646a", "hypothesis": "1940 miles"}
250
+ {"question_id": "1192316e", "hypothesis": "I can’t compute a total from the provided evidence because it only mentions meditation and breakfast timing, not how long you take to get ready or commute to work."}
251
+ {"question_id": "0ea62687", "hypothesis": "I can’t determine the difference from the retrieved memory: a few months ago it was 30 mpg, but your current mpg isn’t stated."}
252
+ {"question_id": "67e0d0f2", "hypothesis": "At least 8. You explicitly mentioned completing 8 edX courses, and you also said you had completed some Coursera courses, but without a number, so the exact total isn’t determined from the evidence."}
253
+ {"question_id": "bb7c3b45", "hypothesis": "The evidence only gives the original retail price of the Jimmy Choo heels as $500. It does not say what you actually paid, so the amount saved cannot be determined from the provided evidence."}
254
+ {"question_id": "ba358f49", "hypothesis": "I don't know—Rachel is expected to get married in 2023, but your age isn't provided."}
255
+ {"question_id": "61f8c8f8", "hypothesis": "10 minutes faster"}
256
+ {"question_id": "60159905", "hypothesis": "1"}
257
+ {"question_id": "ef9cf60a", "hypothesis": "$100"}
258
+ {"question_id": "73d42213", "hypothesis": "No explicit arrival time is stated for Monday. The only clinic-travel detail says it took two hours to get there last time, but that doesn’t give a reach time."}
259
+ {"question_id": "bc149d6b", "hypothesis": "70 pounds"}
260
+ {"question_id": "099778bb", "hypothesis": "Insufficient evidence to determine the percentage; only the total of 100 leadership positions is provided, not how many are held by women."}
261
+ {"question_id": "09ba9854", "hypothesis": "About $50."}
262
+ {"question_id": "d6062bb9", "hypothesis": "1998"}
263
+ {"question_id": "157a136e", "hypothesis": "I don't know"}
264
+ {"question_id": "c18a7dc8", "hypothesis": "7"}
265
+ {"question_id": "a3332713", "hypothesis": "About $200 total"}
266
+ {"question_id": "55241a1f", "hypothesis": "Your recent Facebook Live session had 12 comments, but I don’t have evidence here for the comment count of your most popular YouTube video, so I can’t compute the total from the available information."}
267
+ {"question_id": "a08a253f", "hypothesis": "4"}
268
+ {"question_id": "f0e564bc", "hypothesis": "The evidence shows $800 for the designer handbag, but no amount is provided for the high-end skincare products, so the total cannot be determined."}
269
+ {"question_id": "078150f1", "hypothesis": "$50 more"}
270
+ {"question_id": "8cf4d046", "hypothesis": "I can’t compute an average GPA from the evidence provided. It mentions your undergraduate result as a First-Class distinction from the University of Mumbai and that you completed a Master’s in Data Science at UIUC, but it does not give explicit GPA values for either degree."}
271
+ {"question_id": "a346bb18", "hypothesis": "12 minutes"}
272
+ {"question_id": "37f165cf", "hypothesis": "416 pages and 440 pages"}
273
+ {"question_id": "8e91e7d9", "hypothesis": "1"}
274
+ {"question_id": "87f22b4a", "hypothesis": "$120"}
275
+ {"question_id": "e56a43b9", "hypothesis": "$5 discount"}
276
+ {"question_id": "efc3f7c2", "hypothesis": "30 minutes earlier"}
277
+ {"question_id": "21d02d0d", "hypothesis": "2"}
278
+ {"question_id": "2311e44b_abs", "hypothesis": "You mentioned that you've been reading \"Sapiens\" at a pace of 10–20 pages a week, but there’s no evidence here stating the book’s total page count or how many pages you’ve already finished, so the exact number of pages left can’t be determined from the provided evidence."}
279
+ {"question_id": "6456829e_abs", "hypothesis": "5"}
280
+ {"question_id": "e5ba910e_abs", "hypothesis": "$378"}
281
+ {"question_id": "a96c20ee_abs", "hypothesis": "Harvard University"}
282
+ {"question_id": "ba358f49_abs", "hypothesis": "Insufficient evidence to determine Rachel's age when you get married."}
283
+ {"question_id": "09ba9854_abs", "hypothesis": "The evidence doesn’t give enough to calculate the savings exactly. It does mention the Airport Limousine Bus costs ¥3,200 one way to Shinjuku Station, but it does not provide a full taxi fare from the airport to your hotel, only that a short taxi ride from Shinjuku Station to the hotel might be needed."}
284
+ {"question_id": "gpt4_59149c77", "hypothesis": "7 days"}
285
+ {"question_id": "gpt4_f49edff3", "hypothesis": "helped my friend prepare a nursery, helped my cousin pick out stuff for her baby shower, ordered a customized phone case for my friend's birthday"}
286
+ {"question_id": "71017276", "hypothesis": "4 weeks"}
287
+ {"question_id": "b46e15ed", "hypothesis": "2 months"}
288
+ {"question_id": "gpt4_fa19884c", "hypothesis": "6 days"}
289
+ {"question_id": "0bc8ad92", "hypothesis": "2 months"}
290
+ {"question_id": "af082822", "hypothesis": "2 weeks"}
291
+ {"question_id": "gpt4_4929293a", "hypothesis": "Michael's engagement party"}
292
+ {"question_id": "gpt4_b5700ca9", "hypothesis": "4 days ago"}
293
+ {"question_id": "9a707b81", "hypothesis": "21 days"}
294
+ {"question_id": "gpt4_1d4ab0c9", "hypothesis": "24 days"}
295
+ {"question_id": "gpt4_e072b769", "hypothesis": "about 3 weeks ago"}
296
+ {"question_id": "0db4c65d", "hypothesis": "18 days"}
297
+ {"question_id": "gpt4_1d80365e", "hypothesis": "2 days"}
298
+ {"question_id": "gpt4_7f6b06db", "hypothesis": "Yosemite, Big Sur and Monterey, Yosemite National Park"}
299
+ {"question_id": "gpt4_6dc9b45b", "hypothesis": "4 months ago"}
300
+ {"question_id": "gpt4_8279ba02", "hypothesis": "10 days ago"}
301
+ {"question_id": "gpt4_18c2b244", "hypothesis": "I used a Buy One Get One Free coupon on Luvs diapers at Walmart, then I redeemed $12 cashback for a $10 Amazon gift card from Ibotta, then I signed up for the rewards program at ShopRite."}
302
+ {"question_id": "gpt4_a1b77f9c", "hypothesis": "Insufficient evidence to calculate the total weeks spent."}
303
+ {"question_id": "gpt4_1916e0ea", "hypothesis": "I can identify the Instacart online grocery shopping date as 2023/02/28, but the provided evidence does not include the date you cancelled your FarmFresh subscription, so the number of days passed cannot be determined from this evidence."}
304
+ {"question_id": "gpt4_7a0daae1", "hypothesis": "I can tell you received your new tennis racket on 2023/03/17, but the buy date isn’t provided in the evidence, so the number of weeks between buying and receiving can’t be determined from the available information."}
305
+ {"question_id": "gpt4_468eb063", "hypothesis": "9 days"}
306
+ {"question_id": "gpt4_7abb270c", "hypothesis": "Science Museum, Metropolitan Museum of Art, Museum of Contemporary Art, Modern Art Museum, and two missing museum visits"}
307
+ {"question_id": "gpt4_1e4a8aeb", "hypothesis": "6 days"}
308
+ {"question_id": "gpt4_4fc4f797", "hypothesis": "38 days"}
309
+ {"question_id": "4dfccbf7", "hypothesis": "24 days"}
310
+ {"question_id": "gpt4_61e13b3c", "hypothesis": "3 weeks"}
311
+ {"question_id": "gpt4_45189cb4", "hypothesis": "College Football National Championship game (Georgia vs. Alabama), then Lakers game, then Kansas City Chiefs vs. Buffalo Bills Divisional Round game"}
312
+ {"question_id": "2ebe6c90", "hypothesis": "21 days"}
313
+ {"question_id": "gpt4_e061b84f", "hypothesis": "volleyball league game, Spring Sprint Triathlon, company's annual charity soccer tournament"}
314
+ {"question_id": "370a8ff4", "hypothesis": "11 weeks and 4 days"}
315
+ {"question_id": "gpt4_d6585ce8", "hypothesis": "free outdoor concert series in the park, jazz night at a local bar"}
316
+ {"question_id": "gpt4_4ef30696", "hypothesis": "1"}
317
+ {"question_id": "gpt4_ec93e27f", "hypothesis": "train"}
318
+ {"question_id": "6e984301", "hypothesis": "about 9 weeks"}
319
+ {"question_id": "8077ef71", "hypothesis": "26 days ago"}
320
+ {"question_id": "gpt4_f420262c", "hypothesis": "Delta, American Airlines"}
321
+ {"question_id": "gpt4_8e165409", "hypothesis": "22"}
322
+ {"question_id": "gpt4_74aed68e", "hypothesis": "29 days"}
323
+ {"question_id": "bcbe585f", "hypothesis": "4 weeks and 2 days ago (about 4.3 weeks)"}
324
+ {"question_id": "gpt4_21adecb5", "hypothesis": "6 months"}
325
+ {"question_id": "5e1b23de", "hypothesis": "3 months ago"}
326
+ {"question_id": "gpt4_98f46fc6", "hypothesis": "Insufficient evidence to determine"}
327
+ {"question_id": "gpt4_af6db32f", "hypothesis": "17 days"}
328
+ {"question_id": "eac54adc", "hypothesis": "19 days"}
329
+ {"question_id": "gpt4_7ddcf75f", "hypothesis": "3 days ago"}
330
+ {"question_id": "gpt4_a2d1d1f6", "hypothesis": "3 days ago"}
331
+ {"question_id": "gpt4_85da3956", "hypothesis": "3 weeks"}
332
+ {"question_id": "gpt4_b0863698", "hypothesis": "7 days ago"}
333
+ {"question_id": "gpt4_68e94287", "hypothesis": "The vegan chili recipe post"}
334
+ {"question_id": "gpt4_e414231e", "hypothesis": "4 days"}
335
+ {"question_id": "gpt4_7ca326fa", "hypothesis": "Rachel first, Alex second, Emma third"}
336
+ {"question_id": "gpt4_7bc6cf22", "hypothesis": "12 days"}
337
+ {"question_id": "2ebe6c92", "hypothesis": "The Song of Achilles"}
338
+ {"question_id": "gpt4_e061b84g", "hypothesis": "the company's annual charity soccer tournament"}
339
+ {"question_id": "71017277", "hypothesis": "my aunt"}
340
+ {"question_id": "b46e15ee", "hypothesis": "Walk for Hunger"}
341
+ {"question_id": "gpt4_d6585ce9", "hypothesis": "my sister"}
342
+ {"question_id": "gpt4_1e4a8aec", "hypothesis": "planted 12 new tomato saplings"}
343
+ {"question_id": "gpt4_f420262d", "hypothesis": "Delta"}
344
+ {"question_id": "gpt4_59149c78", "hypothesis": "the Metropolitan Museum of Art"}
345
+ {"question_id": "gpt4_e414231f", "hypothesis": "mountain bike"}
346
+ {"question_id": "gpt4_4929293b", "hypothesis": "your cousin's wedding"}
347
+ {"question_id": "gpt4_468eb064", "hypothesis": "Emma"}
348
+ {"question_id": "gpt4_fa19884d", "hypothesis": "Last Friday was 2023/03/31. The evidence from that day only shows artist recommendations, not a single explicit artist you definitely started listening to. The artists suggested then were Miles Davis, John Coltrane, Charlie Parker, and Thelonious Monk, plus bluegrass acts like Bill Monroe and the Blue Grass Boys, Earl Scruggs, Flatt and Scruggs, Alison Krauss and Union Station, and The Infamous Stringdusters."}
349
+ {"question_id": "9a707b82", "hypothesis": "croissants"}
350
+ {"question_id": "eac54add", "hypothesis": "I can’t identify any business milestone from the evidence. The only milestone-like detail mentioned around the retrieved time is that you had lost about 2 pounds after replacing your bathroom scale, which is a personal health update, not a business milestone."}
351
+ {"question_id": "4dfccbf8", "hypothesis": "Ukulele lessons with Rachel"}
352
+ {"question_id": "0bc8ad93", "hypothesis": "with a friend"}
353
+ {"question_id": "6e984302", "hypothesis": "a set of sculpting tools: a modeling tool set, a wire cutter, and a sculpting mat"}
354
+ {"question_id": "gpt4_8279ba03", "hypothesis": "I only have evidence of you buying a Samsung Galaxy S21 for about $800 at Best Buy, not a kitchen appliance 10 days before 2023/03/25."}
355
+ {"question_id": "gpt4_b5700ca0", "hypothesis": "the Episcopal Church"}
356
+ {"question_id": "gpt4_68e94288", "hypothesis": "#PlankChallenge"}
357
+ {"question_id": "gpt4_2655b836", "hypothesis": "No retrieved evidence mentions your new car’s first service or any issue you had after it, so the first post-service issue cannot be determined from the available evidence."}
358
+ {"question_id": "gpt4_2487a7cb", "hypothesis": "The 'Data Analysis using Python' webinar"}
359
+ {"question_id": "gpt4_76048e76", "hypothesis": "bike"}
360
+ {"question_id": "gpt4_2312f94c", "hypothesis": "Samsung Galaxy S22"}
361
+ {"question_id": "0bb5a684", "hypothesis": "7 days"}
362
+ {"question_id": "08f4fc43", "hypothesis": "30 days"}
363
+ {"question_id": "2c63a862", "hypothesis": "14 days"}
364
+ {"question_id": "gpt4_385a5000", "hypothesis": "Tomatoes"}
365
+ {"question_id": "2a1811e2", "hypothesis": "11"}
366
+ {"question_id": "bbf86515", "hypothesis": "4 days"}
367
+ {"question_id": "gpt4_5dcc0aab", "hypothesis": "white Adidas sneakers"}
368
+ {"question_id": "gpt4_0b2f1d21", "hypothesis": "The stand mixer malfunction happened first."}
369
+ {"question_id": "f0853d11", "hypothesis": "14 days"}
370
+ {"question_id": "gpt4_6ed717ea", "hypothesis": "the training pads for Luna"}
371
+ {"question_id": "gpt4_70e84552", "hypothesis": "Fixing the fence"}
372
+ {"question_id": "a3838d2b", "hypothesis": "2"}
373
+ {"question_id": "gpt4_93159ced", "hypothesis": "I can’t compute the exact amount from the evidence: it shows you’ve been at NovaTech for about 4 years and 3 months, and that you started working in the field when you graduated, but it doesn’t give your graduation date or total career length before NovaTech."}
374
+ {"question_id": "gpt4_2d58bcd6", "hypothesis": "The Hate U Give"}
375
+ {"question_id": "gpt4_65aabe59", "hypothesis": "Can't tell from the evidence."}
376
+ {"question_id": "982b5123", "hypothesis": "3 months"}
377
+ {"question_id": "b9cfe692", "hypothesis": "I have evidence only for one of the two books: \"The Seven Husbands of Evelyn Hugo\" took you two and a half weeks to finish. There’s no evidence here for how long \"The Nightingale\" took, so I can’t compute a combined total from the provided memory."}
378
+ {"question_id": "gpt4_4edbafa2", "hypothesis": "2023-06-03"}
379
+ {"question_id": "c8090214", "hypothesis": "7 days"}
380
+ {"question_id": "gpt4_483dd43c", "hypothesis": "Game of Thrones"}
381
+ {"question_id": "e4e14d04", "hypothesis": "cannot be determined from the provided evidence"}
382
+ {"question_id": "c9f37c46", "hypothesis": "unresolved"}
383
+ {"question_id": "gpt4_2c50253f", "hypothesis": "I wake up at 6:30 AM on Tuesdays and Thursdays."}
384
+ {"question_id": "dcfa8644", "hypothesis": "14 days"}
385
+ {"question_id": "gpt4_b4a80587", "hypothesis": "the arrival of the new prime lens"}
386
+ {"question_id": "gpt4_9a159967", "hypothesis": "Southwest Airlines and American Airlines (tie)"}
387
+ {"question_id": "cc6d1ec1", "hypothesis": "about two months"}
388
+ {"question_id": "gpt4_8c8961ae", "hypothesis": "the solo trip to Thailand"}
389
+ {"question_id": "gpt4_d9af6064", "hypothesis": "new router"}
390
+ {"question_id": "gpt4_7de946e7", "hypothesis": "persistent cough"}
391
+ {"question_id": "d01c6aa8", "hypothesis": "Cannot be determined from the retrieved evidence."}
392
+ {"question_id": "993da5e2", "hypothesis": "I can only infer that you rearranged the furniture about three weeks before 2023/05/26; there’s no evidence here about when you started using the new area rug, so the duration before the rearrangement can’t be determined from the provided memory."}
393
+ {"question_id": "a3045048", "hypothesis": "30 days"}
394
+ {"question_id": "gpt4_d31cdae3", "hypothesis": "The family road trip across the American Southwest."}
395
+ {"question_id": "gpt4_cd90e484", "hypothesis": "Insufficient evidence to determine exactly"}
396
+ {"question_id": "gpt4_88806d6e", "hypothesis": "Can't tell from the evidence."}
397
+ {"question_id": "gpt4_4cd9eba1", "hypothesis": "1 week"}
398
+ {"question_id": "gpt4_93f6379c", "hypothesis": "Page Turners"}
399
+ {"question_id": "b29f3365", "hypothesis": "six weeks"}
400
+ {"question_id": "gpt4_2f56ae70", "hypothesis": "There’s no evidence here of you starting any streaming service, so I can’t identify a most recent one from the provided memory."}
401
+ {"question_id": "6613b389", "hypothesis": "2 months"}
402
+ {"question_id": "gpt4_78cf46a3", "hypothesis": "Receiving the new phone case happened first."}
403
+ {"question_id": "gpt4_0a05b494", "hypothesis": "the tourist from Australia"}
404
+ {"question_id": "gpt4_1a1dc16d", "hypothesis": "The pride parade"}
405
+ {"question_id": "gpt4_2f584639", "hypothesis": "The necklace for your sister."}
406
+ {"question_id": "gpt4_213fd887", "hypothesis": "the volleyball league"}
407
+ {"question_id": "gpt4_5438fa52", "hypothesis": "the start of your Spanish classes"}
408
+ {"question_id": "gpt4_c27434e8", "hypothesis": "Japanese Zero fighter plane model"}
409
+ {"question_id": "gpt4_fe651585", "hypothesis": "Rachel"}
410
+ {"question_id": "8c18457d", "hypothesis": "21 days"}
411
+ {"question_id": "gpt4_70e84552_abs", "hypothesis": "Fixing the fence"}
412
+ {"question_id": "gpt4_93159ced_abs", "hypothesis": "The evidence doesn’t support a current job at Google. On 2023/05/30, you were described as working at NovaTech, and while there’s a truncated note saying you’d been in software engineering before mentioning your current job, it doesn’t include the duration. So I can’t compute how long you’d been working before starting a job at Google from the provided evidence."}
413
+ {"question_id": "982b5123_abs", "hypothesis": "There’s no evidence here of you booking an Airbnb in Sacramento. The only Airbnb mention is for San Francisco, tied to your stay in Haight-Ashbury on 2023/05/27."}
414
+ {"question_id": "c8090214_abs", "hypothesis": "I attended the Holiday Market 7 days before Black Friday, but there’s no evidence here that you bought an iPad, so the day difference to an iPad purchase can’t be determined from the provided information."}
415
+ {"question_id": "gpt4_c27434e8_abs", "hypothesis": "Insufficient evidence to determine"}
416
+ {"question_id": "gpt4_fe651585_abs", "hypothesis": "Alex"}
417
+ {"question_id": "6a1eabeb", "hypothesis": "25:50"}
418
+ {"question_id": "6aeb4375", "hypothesis": "2"}
419
+ {"question_id": "830ce83f", "hypothesis": "Chicago"}
420
+ {"question_id": "852ce960", "hypothesis": "$350,000"}
421
+ {"question_id": "945e3d21", "hypothesis": "three times a week"}
422
+ {"question_id": "d7c942c3", "hypothesis": "Yes — your mom is using the same grocery list app as you."}
423
+ {"question_id": "71315a70", "hypothesis": "10-12 hours"}
424
+ {"question_id": "89941a93", "hypothesis": "1"}
425
+ {"question_id": "ce6d2d27", "hypothesis": "Friday"}
426
+ {"question_id": "9ea5eabc", "hypothesis": "Paris"}
427
+ {"question_id": "07741c44", "hypothesis": "shoe rack"}
428
+ {"question_id": "a1eacc2a", "hypothesis": "7"}
429
+ {"question_id": "184da446", "hypothesis": "220 pages"}
430
+ {"question_id": "031748ae", "hypothesis": "4 when you started; 5 now"}
431
+ {"question_id": "4d6b87c8", "hypothesis": "25"}
432
+ {"question_id": "0f05491a", "hypothesis": "120 stars"}
433
+ {"question_id": "08e075c7", "hypothesis": "I can’t determine that from the provided evidence. The memory snippets only mention your bedtime routine, reading, and using a massage roller, with no evidence about when you started using a Fitbit Charge 3."}
434
+ {"question_id": "f9e8c073", "hypothesis": "5 sessions"}
435
+ {"question_id": "41698283", "hypothesis": "70-200mm zoom lens"}
436
+ {"question_id": "2698e78f", "hypothesis": "Not specified; the memory only says to review it regularly with Dr. Smith."}
437
+ {"question_id": "b6019101", "hypothesis": "5"}
438
+ {"question_id": "45dc21b6", "hypothesis": "2"}
439
+ {"question_id": "5a4f22c0", "hypothesis": "TechCorp"}
440
+ {"question_id": "6071bd76", "hypothesis": "less"}
441
+ {"question_id": "e493bb7c", "hypothesis": "above my bed in my bedroom"}
442
+ {"question_id": "618f13b2", "hypothesis": "6"}
443
+ {"question_id": "72e3ee87", "hypothesis": "50"}
444
+ {"question_id": "c4ea545c", "hypothesis": "Insufficient evidence"}
445
+ {"question_id": "01493427", "hypothesis": "17"}
446
+ {"question_id": "6a27ffc2", "hypothesis": "30"}
447
+ {"question_id": "2133c1b5", "hypothesis": "3 months"}
448
+ {"question_id": "18bc8abd", "hypothesis": "Kansas City Masterpiece BBQ sauce"}
449
+ {"question_id": "db467c8c", "hypothesis": "about nine months"}
450
+ {"question_id": "7a87bd0c", "hypothesis": "4 weeks"}
451
+ {"question_id": "e61a7584", "hypothesis": "about 9 months"}
452
+ {"question_id": "1cea1afa", "hypothesis": "600"}
453
+ {"question_id": "ed4ddc30", "hypothesis": "20 dozen eggs"}
454
+ {"question_id": "8fb83627", "hypothesis": "5"}
455
+ {"question_id": "b01defab", "hypothesis": "Yes"}
456
+ {"question_id": "22d2cb42", "hypothesis": "the music shop on Main St"}
457
+ {"question_id": "0e4e4c46", "hypothesis": "132 points"}
458
+ {"question_id": "4b24c848", "hypothesis": "5"}
459
+ {"question_id": "7e974930", "hypothesis": "Your most recent Downtown Farmers Market visit was on 2023/09/30, but the exact earnings amount isn’t shown in the retrieved evidence."}
460
+ {"question_id": "603deb26", "hypothesis": "10"}
461
+ {"question_id": "59524333", "hypothesis": "6:00 pm"}
462
+ {"question_id": "5831f84d", "hypothesis": "10"}
463
+ {"question_id": "eace081b", "hypothesis": "Oahu"}
464
+ {"question_id": "affe2881", "hypothesis": "32"}
465
+ {"question_id": "50635ada", "hypothesis": "Premier Gold status"}
466
+ {"question_id": "e66b632c", "hypothesis": "26 minutes and 30 seconds"}
467
+ {"question_id": "0ddfec37", "hypothesis": "Cannot be determined from available evidence"}
468
+ {"question_id": "f685340e", "hypothesis": "Previously, every other week. Now, not explicitly stated."}
469
+ {"question_id": "cc5ded98", "hypothesis": "an hour each day"}
470
+ {"question_id": "dfde3500", "hypothesis": "Wednesday"}
471
+ {"question_id": "69fee5aa", "hypothesis": "37"}
472
+ {"question_id": "7401057b", "hypothesis": "1 free night's stay"}
473
+ {"question_id": "cf22b7bf", "hypothesis": "10 pounds"}
474
+ {"question_id": "a2f3aa27", "hypothesis": "close to 1300 followers"}
475
+ {"question_id": "c7dc5443", "hypothesis": "5-2"}
476
+ {"question_id": "06db6396", "hypothesis": "5"}
477
+ {"question_id": "3ba21379", "hypothesis": "Ford F-150 pickup truck model"}
478
+ {"question_id": "9bbe84a2", "hypothesis": "level 100 before the end of the year"}
479
+ {"question_id": "10e09553", "hypothesis": "7"}
480
+ {"question_id": "dad224aa", "hypothesis": "7:30 am"}
481
+ {"question_id": "ba61f0b9", "hypothesis": "6"}
482
+ {"question_id": "42ec0761", "hypothesis": "Yes, you have a spare screwdriver."}
483
+ {"question_id": "5c40ec5b", "hypothesis": "2"}
484
+ {"question_id": "c6853660", "hypothesis": "increase"}
485
+ {"question_id": "26bdc477", "hypothesis": "5"}
486
+ {"question_id": "0977f2af", "hypothesis": "I can’t tell from the retrieved evidence."}
487
+ {"question_id": "6aeb4375_abs", "hypothesis": "{\"answer\":0}"}
488
+ {"question_id": "031748ae_abs", "hypothesis": "4"}
489
+ {"question_id": "2698e78f_abs", "hypothesis": "No evidence provided states how often you see Dr. Johnson."}
490
+ {"question_id": "2133c1b5_abs", "hypothesis": "About 7 months. The evidence says it had been 1 month in your new studio apartment in Harajuku on 2023/04/11, which makes it about 7 months by 2023/10/15. The memory mentions Harajuku, not Shinjuku."}
491
+ {"question_id": "0ddfec37_abs", "hypothesis": "0"}
492
+ {"question_id": "f685340e_abs", "hypothesis": "Every other week."}
493
+ {"question_id": "89941a94", "hypothesis": "Yes — a road bike."}
494
+ {"question_id": "07741c45", "hypothesis": "shoe rack"}
495
+ {"question_id": "7161e7e2", "hypothesis": "8 am - 4 pm (Day Shift)"}
496
+ {"question_id": "c4f10528", "hypothesis": "Miss Bee Providore"}
497
+ {"question_id": "89527b6b", "hypothesis": "I can only recover that the dinosaur-book conversation was on 2023/05/23 and the visible excerpt includes a T-Rex with a green scaly body; the Plesiosaur image description and its body color are not present in the provided evidence."}
498
+ {"question_id": "e9327a54", "hypothesis": "I don’t have the dessert shop’s name in the retrieved memory."}
499
+ {"question_id": "4c36ccef", "hypothesis": "Roscioli"}
500
+ {"question_id": "6ae235be", "hypothesis": "Atmospheric distillation, fluid catalytic cracking (FCC), alkylation, and hydrotreating."}