2009YU committed 24 days ago

Commit

3a64edb

verified ·

1 Parent(s): e8088bb

Add files using upload-large-folder tool

Browse files

Files changed (50) hide show

.gitattributes +5 -34
.gitignore +5 -0
LICENSE +21 -0
README.md +207 -0
README.zh-CN.md +207 -0
assets/tmcra-logo.png +3 -0
code/memory_adapters.py +0 -0
code/profile_layer.py +748 -0
code/run_lme_s10_native_tmcra.py +0 -0
docs/BASELINE_S500_20260525.md +35 -0
docs/OPTIONAL_MODULES_AND_PARALLEL.md +347 -0
docs/OPTIONAL_MODULES_AND_PARALLEL.zh-CN.md +345 -0
docs/TRAINING.md +113 -0
github_push.err.log +2 -0
github_push.out.log +2 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_01.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_02.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_03.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_04.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_05.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_06.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_07.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_step_0000600.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_step_0000650.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_01.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_02.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_03.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_04.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_05.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_07.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_step_0000600.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_step_0000650.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/export_manifest.json +1 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/launch_train.sh +48 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_best.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/nohup.log +0 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_best.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt +3 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/train.log +84 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/train.pid +1 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/train_summary.json +0 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/train_summary.partial.json +1 -0
models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/training_issues.jsonl +1 -0
results/judge_gpt4o_alias_vectorengine.jsonl +500 -0
results/judge_gpt4o_alias_vectorengine.jsonl.summary.json +36 -0
results/lme_s500_frozen_baseline38_full10_20260525_results.tar.gz +3 -0
results/predictions.jsonl +500 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,6 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

 *.pt filter=lfs diff=lfs merge=lfs -text
+*.tar.gz filter=lfs diff=lfs merge=lfs -text
+*.sqlite3 filter=lfs diff=lfs merge=lfs -text
+*.jsonl filter=lfs diff=lfs merge=lfs -text
+*.json linguist-detectable=true
+assets/tmcra-logo.png filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+__pycache__/
+*.pyc
+.DS_Store
+Thumbs.db
+*.pid

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2026 TMCRA contributors
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+# TMCRA Long-Memory Runtime
+<p align="center">
+  <img src="assets/tmcra-logo.png" alt="TMCRA logo" width="420">
+</p>
+[中文版本](README.zh-CN.md)
+TMCRA is a runtime layer that gives agent systems long-term memory. It helps an LLM retrieve, connect, and reason over long dialogue histories without exposing the full conversation context on every turn.
+This repository contains a frozen TMCRA baseline package with model weights, runtime code snapshots, training metadata, and LongMemEval S500 benchmark results.
+License: MIT.
+## What TMCRA Does
+TMCRA adds a dedicated memory runtime between an agent application and its answer model.
+At write time, TMCRA turns dialogue into memory nodes, event units, profile signals, and graph paths. This lets the system preserve not only isolated facts, but also relationships between facts across turns and sessions.
+At retrieval time, TMCRA scores graph nodes and paths, selects compact evidence, and injects only the most relevant memory context into the answer model. The answer model still performs natural-language reasoning, while TMCRA handles long-memory organization, recall, and evidence surfacing.
+The current runtime focuses on:
+- user fact memory
+- assistant-response memory
+- profile and preference memory
+- temporal memory
+- cross-session graph tunneling
+- learned node/path scoring
+- compact evidence selection for downstream LLMs
+## Algorithm Structure
+```mermaid
+flowchart LR
+    A["Dialogue Turns"] --> B["Writer Layer"]
+    B --> C["Memory Nodes"]
+    B --> D["Event Units"]
+    B --> E["Profile Signals"]
+    B --> F["Temporal Signals"]
+    C --> G["Graph Memory Store"]
+    D --> G
+    E --> G
+    F --> G
+    G --> H["Learned Node Scorer"]
+    G --> I["Learned Path Scorer"]
+    H --> J["Evidence Selection"]
+    I --> J
+    J --> K["Compact Memory Context"]
+    K --> L["Answer Model"]
+    L --> M["Agent Response"]
+    N["User Query"] --> H
+    N --> I
+    N --> J
+    N --> L
+```
+The writer layer produces memory units from dialogue. The graph memory store preserves facts, events, profile signals, temporal signals, and cross-session links. The learned node/path scorers select relevant evidence for the current query, and the answer model uses that compact evidence to produce the final response.
+## Why TMCRA
+Long-running agents need more than simple vector recall. They need to preserve user facts, preferences, timeline changes, cross-session events, and multi-step evidence chains.
+TMCRA organizes memory into graph nodes and learned retrieval paths, then surfaces compact evidence to the answer model. The goal is to let external agents use long-term memory through a runtime/API layer while keeping the memory algorithm and model weights independently deployable.
+## How to Use
+For inference/runtime use, load the graph scorer weights from:
+```text
+models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/
+```
+The main runtime files are:
+```text
+node_scorer.pt
+path_scorer.pt
+export_manifest.json
+```
+A typical runtime configuration points TMCRA to these weights:
+```bash
+export TMCRA_NODE_MODEL_PATH="models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer.pt"
+export TMCRA_PATH_MODEL_PATH="models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer.pt"
+export TMCRA_RETRIEVAL_MODE="hybrid_node_scored"
+export TMCRA_REQUIRE_LEARNED_SCORER="1"
+```
+The evaluation entrypoint snapshot is:
+```text
+code/run_lme_s10_native_tmcra.py
+```
+The core adapter snapshot is:
+```text
+code/memory_adapters.py
+```
+For a deployment build, load the two scorer files into the TMCRA adapter and point the agent's memory middleware to the TMCRA retrieval API. The answer model can be any OpenAI-compatible or local LLM endpoint; TMCRA supplies the selected memory evidence, and the answer model produces the final response.
+## Dependency Environment
+The included code snapshot is Python-based. A practical runtime environment should include:
+- Python 3.10 or newer
+- PyTorch, with CUDA recommended for model inference
+- NumPy and standard Python data-processing libraries
+- an OpenAI-compatible or local LLM endpoint for the answer layer and writer layer
+- optional Git LFS support when pulling the full model package from GitHub
+The benchmark scripts expect LongMemEval-format input data and write JSONL predictions and judge outputs. Runtime deployments can use the same model files without running the benchmark harness.
+## Optional Modules
+TMCRA also keeps optional extension points for retrieval and planning experiments. These modules can be enabled in deployment or evaluation builds when the target use case needs them.
+- **Embedder interface**: an optional semantic embedding channel that can run alongside the graph-memory scorer. It is intended to provide additional dense semantic recall before or during graph evidence selection, without replacing the learned graph node/path scorers.
+- **LLM planner interface**: an optional planner hook that can use an external LLM to organize evidence, expand query intent, or create an answer plan before the final answer call. This is useful for experiments and higher-cost deployments, while the default baseline keeps the core graph scorer path independently measurable.
+These interfaces are integration points, not required dependencies for the frozen S500 baseline. They are designed so downstream deployments can decide whether to run a lighter scorer-only path or a heavier path with embedder/planner assistance.
+## Included Artifacts
+- `code/`: runtime and evaluation code snapshot for this baseline.
+- `models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/`: full trained graph-model output directory.
+- `results/`: predictions, judge output, summary metrics, and compressed run artifacts.
+- `docs/`: baseline record and result notes.
+Additional documentation:
+- `docs/BASELINE_S500_20260525.md`: benchmark record and subtask metrics.
+- `docs/TRAINING.md`: model training direction and released training artifacts.
+- `docs/OPTIONAL_MODULES_AND_PARALLEL.md`: optional embedder/planner integration points and parallel evaluation plan.
+- `docs/OPTIONAL_MODULES_AND_PARALLEL.zh-CN.md`: Chinese version of the optional module and parallel evaluation guide.
+## Model Package
+The included model package preserves the full training output for the graph scorer stack:
+- `node_scorer.pt` and `path_scorer.pt`: runtime graph scoring weights.
+- `node_scorer_best.pt` and `path_scorer_best.pt`: best checkpoint aliases.
+- `node_scorer_last.pt` and `path_scorer_last.pt`: final training aliases.
+- `checkpoints/`: epoch and step checkpoints.
+- `export_manifest.json`, `train_summary.json`, and `train.log`: model metadata and training trace.
+## Current Strengths
+- Strong direct user-fact recall in single-session settings.
+- Strong assistant-detail recall.
+- Competitive knowledge-update behavior for changing facts.
+- Working temporal and preference retrieval layers with clear room for further specialization.
+## Active Improvement Areas
+- Multi-session aggregation and unit coverage.
+- Deeper time-graph reasoning.
+- Preference-profile abstraction and cross-session tunneling.
+- Query-graph to memory-graph matching for complex questions.
+## Intended Use
+This repository is a public-facing evidence package for TMCRA's long-memory runtime work. It is suitable for:
+- Benchmark review.
+- Model and result inspection.
+- Reproducing the frozen baseline.
+- Demonstrating how TMCRA can be packaged as an external memory runtime for agents.
+## Benchmark Result
+This package includes a full LongMemEval S500 run.
+- Benchmark: LongMemEval S set, 500 samples
+- Evaluation: official-compatible LongMemEval judge prompt
+- Judge model: `gpt-4o`, resolved as `gpt-4o-2024-08-06`
+- Writer layer used in this run: DeepSeek v4 Flash
+- Answer layer used in this run: GPT5.4-compatible API
+- Overall accuracy: `310 / 500 = 62.00%`
+## Results by Task Type
+| task type | accuracy | count |
+| --- | ---: | ---: |
+| single-session-user | 81.43% | 70 |
+| single-session-assistant | 78.57% | 56 |
+| knowledge-update | 70.51% | 78 |
+| temporal-reasoning | 63.16% | 133 |
+| single-session-preference | 56.67% | 30 |
+| multi-session | 39.85% | 133 |
+The benchmark outputs are available in:
+```text
+results/predictions.jsonl
+results/judge_gpt4o_alias_vectorengine.jsonl
+results/judge_gpt4o_alias_vectorengine.jsonl.summary.json
+results/lme_s500_frozen_baseline38_full10_20260525_results.tar.gz
+```

README.zh-CN.md ADDED Viewed

	@@ -0,0 +1,207 @@

+# TMCRA 长记忆运行时
+<p align="center">
+  <img src="assets/tmcra-logo.png" alt="TMCRA logo" width="420">
+</p>
+[English version](README.md)
+TMCRA 是给 Agent 系统提供长期记忆能力的运行时层。它帮助大语言模型在长对话历史中检索、连接和推理相关记忆，而不需要在每一轮都暴露完整上下文。
+本仓库包含一个冻结的 TMCRA 基线包，包括模型权重、运行时代码快照、训练元数据和 LongMemEval S500 测评结果。
+许可证：MIT。
+## TMCRA 的作用
+TMCRA 在 Agent 应用和回答模型之间增加一层专门的记忆运行时。
+在写入阶段，TMCRA 会把对话转成记忆节点、事件单元、profile 信号和图路径。这让系统不仅能保存孤立事实，也能保存不同轮次、不同会话之间的事实关系。
+在召回阶段，TMCRA 会对图节点和路径进行打分，选择紧凑的证据，并只把最相关的记忆上下文注入给回答模型。回答模型仍然负责自然语言推理和最终表达，TMCRA 负责长记忆组织、召回和证据呈现。
+当前运行时重点覆盖：
+- 用户事实记忆
+- 助手回答细节记忆
+- profile 与偏好记忆
+- 时间记忆
+- 跨会话图隧穿
+- 学习式节点/路径打分
+- 面向下游 LLM 的紧凑证据选择
+## 算法结构图
+```mermaid
+flowchart LR
+    A["对话轮次"] --> B["写入层"]
+    B --> C["记忆节点"]
+    B --> D["事件单元"]
+    B --> E["Profile 信号"]
+    B --> F["时间信号"]
+    C --> G["图记忆库"]
+    D --> G
+    E --> G
+    F --> G
+    G --> H["学习式节点 Scorer"]
+    G --> I["学习式路径 Scorer"]
+    H --> J["证据选择"]
+    I --> J
+    J --> K["紧凑记忆上下文"]
+    K --> L["回答模型"]
+    L --> M["Agent 回复"]
+    N["用户问题"] --> H
+    N --> I
+    N --> J
+    N --> L
+```
+写入层从对话中生成记忆单元。图记忆库保存事实、事件、profile 信号、时间信号和跨会话连接。学习式节点/路径 scorer 会根据当前问题选择相关证据，回答模型使用这些紧凑证据生成最终回复。
+## 为什么需要 TMCRA
+长期运行的 Agent 不能只依赖简单向量召回。它需要保存用户事实、偏好、时间线变化、跨会话事件，以及多步证据链。
+TMCRA 将记忆组织成图节点和学习得到的召回路径，再把压缩后的证据提供给回答模型。目标是让外部 Agent 可以通过运行时/API 层使用长期记忆，同时保持记忆算法和模型权重可以独立部署。
+## 如何使用
+推理或运行时使用时，加载下面目录中的图 scorer 权重：
+```text
+models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/
+```
+主要运行时文件是：
+```text
+node_scorer.pt
+path_scorer.pt
+export_manifest.json
+```
+典型运行配置如下：
+```bash
+export TMCRA_NODE_MODEL_PATH="models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer.pt"
+export TMCRA_PATH_MODEL_PATH="models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer.pt"
+export TMCRA_RETRIEVAL_MODE="hybrid_node_scored"
+export TMCRA_REQUIRE_LEARNED_SCORER="1"
+```
+测评入口代码快照：
+```text
+code/run_lme_s10_native_tmcra.py
+```
+核心适配器代码快照：
+```text
+code/memory_adapters.py
+```
+部署时，将两个 scorer 文件加载到 TMCRA adapter 中，并让 Agent 的记忆中间件调用 TMCRA 召回 API。回答模型可以是任意 OpenAI 兼容接口或本地 LLM；TMCRA 提供选中的记忆证据，回答模型生成最终回复。
+## 依赖环境
+当前代码快照基于 Python。建议运行环境包括：
+- Python 3.10 或更高版本
+- PyTorch，推荐使用 CUDA 做模型推理
+- NumPy 及常见 Python 数据处理库
+- 用于回答层和写入层的 OpenAI 兼容接口或本地 LLM endpoint
+- 从 GitHub 拉取完整模型包时建议支持 Git LFS
+benchmark 脚本使用 LongMemEval 格式输入数据，并输出 JSONL 格式的预测和 judge 结果。实际运行时部署可以直接使用同一套模型文件，不需要运行 benchmark harness。
+## 可开启模块
+TMCRA 当前也保留了用于召回和规划实验的可选接入口。部署或评估时，如果目标场景需要，可以按需开启这些模块。
+- **Embedder 接入口**：可选的语义 embedding 通道，可以和图记忆 scorer 并行工作。它用于在图证据选择前或选择过程中提供额外的稠密语义召回，但不替代学习式图节点/路径 scorer。
+- **LLM planner 接入口**：可选的规划 hook，可以调用外部 LLM 来整理证据、扩展 query intent，或在最终回答前生成 answer plan。它适合实验和更高成本的部署场景；默认基线仍保持核心图 scorer 链路可以独立测量。
+这些接口是可集成的扩展点，不是冻结 S500 基线的必需依赖。下游部署可以根据成本和效果选择轻量的 scorer-only 路径，或开启 embedder/planner 辅助的增强路径。
+## 包含内容
+- `code/`：该基线使用的运行时和测评代码快照。
+- `models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/`：完整训练后的图模型输出目录。
+- `results/`：预测结果、judge 输出、汇总指标和压缩后的运行结果。
+- `docs/`：基线记录和结果说明。
+补充文档：
+- `docs/BASELINE_S500_20260525.md`：benchmark 记录和子任务指标。
+- `docs/TRAINING.md`：模型训练方向和随包发布的训练产物说明。
+- `docs/OPTIONAL_MODULES_AND_PARALLEL.zh-CN.md`：embedder/planner 可选接入口和并行评估推进方案。
+- `docs/OPTIONAL_MODULES_AND_PARALLEL.md`：上述文档的英文版本。
+## 模型包
+随包发布的模型目录保留了图 scorer 栈的完整训练输出：
+- `node_scorer.pt` 和 `path_scorer.pt`：运行时图打分权重。
+- `node_scorer_best.pt` 和 `path_scorer_best.pt`：验证表现最好的 checkpoint 别名。
+- `node_scorer_last.pt` 和 `path_scorer_last.pt`：训练结束时的 checkpoint 别名。
+- `checkpoints/`：按 epoch 和 step 保存的中间 checkpoint。
+- `export_manifest.json`、`train_summary.json` 和 `train.log`：模型元数据和训练轨迹。
+## 当前优势
+- 单会话直接用户事实召回较强。
+- 助手细节召回较强。
+- 对变化事实的 knowledge-update 任务有较稳定表现。
+- 已具备可工作的时间记忆和偏好召回层，后续仍有专项优化空间。
+## 当前改进方向
+- multi-session 聚合和 unit coverage。
+- 更深的时间图推理。
+- 偏好 profile 抽象和跨会话隧穿。
+- query graph 与 memory graph 的复杂问题匹配。
+## 适用场景
+本仓库是 TMCRA 长记忆运行时工作的对外展示包，适合用于：
+- benchmark 审阅
+- 模型和结果检查
+- 复现冻结基线
+- 展示 TMCRA 如何作为外部 Agent 的记忆运行时进行封装
+## Benchmark 结果
+本包包含一次完整 LongMemEval S500 测试。
+- Benchmark：LongMemEval S set，500 条样本
+- Evaluation：official-compatible LongMemEval judge prompt
+- Judge 模型：`gpt-4o`，解析为 `gpt-4o-2024-08-06`
+- 本轮使用的写入层：DeepSeek v4 Flash
+- 本轮使用的回答层：GPT5.4-compatible API
+- 总准确率：`310 / 500 = 62.00%`
+## 各任务类型结果
+| 任务类型 | 准确率 | 数量 |
+| --- | ---: | ---: |
+| single-session-user | 81.43% | 70 |
+| single-session-assistant | 78.57% | 56 |
+| knowledge-update | 70.51% | 78 |
+| temporal-reasoning | 63.16% | 133 |
+| single-session-preference | 56.67% | 30 |
+| multi-session | 39.85% | 133 |
+benchmark 输出文件位于：
+```text
+results/predictions.jsonl
+results/judge_gpt4o_alias_vectorengine.jsonl
+results/judge_gpt4o_alias_vectorengine.jsonl.summary.json
+results/lme_s500_frozen_baseline38_full10_20260525_results.tar.gz
+```

assets/tmcra-logo.png ADDED Viewed

Git LFS Details

SHA256: 95a8fc61e6ace316cc46ccaba1208b5b40842008abe273c2f4d875dd5b5a3601
Pointer size: 132 Bytes
Size of remote file: 1.04 MB

code/memory_adapters.py ADDED Viewed

The diff for this file is too large to render. See raw diff

code/profile_layer.py ADDED Viewed

	@@ -0,0 +1,748 @@

+from __future__ import annotations
+import re
+from typing import Any, Iterable, Mapping, Sequence
# Record categories that is_profile_layer_record() accepts as profile-layer.
PROFILE_CATEGORIES = {
    "profile",
    "preference",
    "goal",
    "constraint",
    "stage_state",
    "status",
}

# Closed set of values infer_profile_type() may return.
PROFILE_TYPES = {
    "setup",
    "preference",
    "constraint",
    "goal",
    "avoid",
    "usage_context",
}

# Semantic slots that mark a record as profile-layer regardless of category.
PROFILE_SEMANTIC_SLOTS = {
    "identity",
    "research_topic",
    "education",
    "occupation",
}

# Source-kind / category labels for aggregate and cluster profile records.
PROFILE_AGGREGATE_SOURCE_KIND = "public_dialog_profile"
PROFILE_AGGREGATE_CATEGORY = "profile"
PROFILE_CLUSTER_SOURCE_KIND = "public_dialog_profile_cluster"
PROFILE_CLUSTER_CATEGORY = "profile"

PROFILE_CONSOLIDATOR_VERSION = "profile_consolidator_v1_structured_summary"

# Lower-case phrases (English, then Chinese) associated with profile-style
# queries.  NOTE(review): the consumer is outside this view; the entry order is
# kept exactly as authored in case it matters there.
_PROFILE_QUERY_MARKERS = (
    # Preference / recommendation wording.
    "preference", "prefer", "like", "dislike",
    "recommend", "suggest", "advice", "advise",
    "any advice", "any suggestions", "any tips", "tips",
    "trouble with", "struggling with", "what do you think",
    "learn more", "resources",
    # "About me" wording.
    "suited", "fit me", "for me", "based on me", "based on my",
    "my setup", "my profile", "my goal", "my constraint",
    # Occupation / background wording.
    "my occupation", "my previous occupation",
    "my role", "my previous role",
    "my job", "my previous job",
    "what was my", "where did i work", "worked as",
    "occupation", "previous occupation",
    "role", "previous role",
    "job", "career", "background", "identity", "experience",
    # Decision-seeking wording.
    "avoid", "what should i", "should i", "what should",
    # Topic-specific cues.
    "serve", "dinner", "homegrown", "ingredients",
    "battery life", "getting around",
    # Chinese equivalents.
    "偏好", "喜欢", "不喜欢", "推荐", "建议",
    "适合我", "根据我", "我的情况", "我的配置",
    "我的目标", "我的约束", "画像", "避免",
)
+def _clean_text(value: Any) -> str:
+    return re.sub(r"\s+", " ", str(value or "").strip())
def _normalize(value: Any) -> str:
    """Return the whitespace-collapsed, lower-cased text form of ``value``."""
    cleaned = _clean_text(value)
    return cleaned.lower()
def _tokens(value: Any) -> list[str]:
    """Tokenize ``value`` into unique tokens.

    ASCII runs of ``[a-z0-9_]`` come first (in order of appearance), followed
    by each individual CJK character in the U+4E00..U+9FFF range.
    """
    normalized = _normalize(value)
    found = re.findall(r"[a-z0-9_]+", normalized)
    found.extend(ch for ch in normalized if "\u4e00" <= ch <= "\u9fff")
    return _dedupe(found)
def _slug(value: Any, *, fallback: str = "general") -> str:
    """Build an underscore-joined identifier from the first ten pieces of ``value``.

    A piece is an ASCII alphanumeric run or a single CJK character.  When no
    piece survives, ``fallback`` is returned instead.
    """
    pieces = re.findall(r"[a-z0-9]+|[\u4e00-\u9fff]", _normalize(value))
    joined = "_".join(pieces[:10]).strip("_")
    if joined:
        return joined
    return fallback
def _dedupe(items: Iterable[Any], *, max_items: int | None = None) -> list[str]:
    """Return cleaned, non-empty texts from ``items`` without duplicates.

    Duplicates are detected case-insensitively; the first spelling seen wins
    and input order is preserved.  When ``max_items`` is given, iteration
    stops as soon as that many entries have been collected.
    """
    unique: list[str] = []
    seen_keys: set[str] = set()
    for candidate in items:
        cleaned = _clean_text(candidate)
        key = _normalize(cleaned)
        if not cleaned or key in seen_keys:
            continue
        seen_keys.add(key)
        unique.append(cleaned)
        # The cap is checked after appending, so at least one entry is kept
        # even for a non-positive ``max_items``.
        if max_items is not None and len(unique) >= max_items:
            break
    return unique
def _first_nonempty(*values: Any) -> str:
    """Return the first argument whose cleaned text is non-empty, else ``""``."""
    cleaned = (_clean_text(candidate) for candidate in values)
    return next((text for text in cleaned if text), "")
def _bounded_union(*groups: Iterable[Any], max_items: int) -> list[str]:
    """Flatten ``groups`` in order and de-duplicate, keeping at most ``max_items``."""
    flattened = [member for group in groups for member in group]
    return _dedupe(flattened, max_items=max_items)
+def _bounded_int_union(*groups: Iterable[Any], max_items: int) -> list[int]:
+    values: list[int] = []
+    seen: set[int] = set()
+    for group in groups:
+        for item in group:
+            try:
+                value = int(item)
+            except Exception:
+                continue
+            if value in seen:
+                continue
+            seen.add(value)
+            values.append(value)
+            if len(values) >= max_items:
+                return sorted(values)
+    return sorted(values)
def is_profile_layer_record(*, category: Any, source_kind: Any = "", semantic_slot: Any = "", metadata: Mapping[str, Any] | None = None) -> bool:
    """Decide whether a record belongs to the profile layer.

    A record qualifies when any of these holds:
    - ``metadata["profile_layer"]`` is truthy;
    - the normalized ``category`` is in ``PROFILE_CATEGORIES``;
    - the semantic slot (argument, or ``metadata["semantic_slot"]`` when the
      argument is empty) is in ``PROFILE_SEMANTIC_SLOTS`` or starts with
      ``"profile_"``;
    - the normalized ``source_kind`` is one of the profile source kinds.
    """
    meta = dict(metadata or {})
    if meta.get("profile_layer"):
        return True
    normalized_category = _normalize(category)
    normalized_slot = _normalize(semantic_slot or meta.get("semantic_slot", ""))
    normalized_source = _normalize(source_kind)
    profile_source_kinds = {
        "public_dialog_profile",
        "public_dialog_profile_cluster",
        "public_dialog_preference",
        "public_dialog_goal",
        "public_dialog_constraint",
    }
    return (
        normalized_category in PROFILE_CATEGORIES
        or normalized_slot in PROFILE_SEMANTIC_SLOTS
        or normalized_slot.startswith("profile_")
        or normalized_source in profile_source_kinds
    )
def infer_profile_type(*, category: Any, semantic_slot: Any = "", relation: Any = "", value: Any = "", metadata: Mapping[str, Any] | None = None) -> str:
    """Classify a profile record into one of ``PROFILE_TYPES``.

    Resolution order:
    1. ``metadata["profile_type"]`` when it is already a known type;
    2. "preference" (category, or positive preference wording);
    3. "constraint" (category, or obligation/prohibition wording);
    4. "goal" (category, or goal wording);
    5. "avoid" (negative wording);
    6. "setup" (environment/workflow wording);
    7. "usage_context" as the fallback.

    Markers are matched as substrings of the normalized
    ``semantic_slot``/``relation``/``value`` text.
    """
    data = dict(metadata or {})
    explicit = _normalize(data.get("profile_type", ""))
    if explicit in PROFILE_TYPES:
        return explicit
    category_text = _normalize(category)
    combined = _normalize(f"{semantic_slot} {relation} {value}")
    # "dislike" contains "like" and "不喜欢" contains "喜欢".  Without masking,
    # the preference check below would claim every stated dislike and the
    # corresponding "avoid" markers could never be reached.
    positive_text = combined.replace("dislike", " ").replace("不喜欢", " ")
    if category_text == "preference" or any(marker in positive_text for marker in ("prefer", "like", "default", "偏好", "喜欢", "默认")):
        return "preference"
    if category_text == "constraint" or any(marker in combined for marker in ("must", "cannot", "forbid", "constraint", "必须", "不能", "约束")):
        return "constraint"
    if category_text == "goal" or any(marker in combined for marker in ("goal", "target", "objective", "目标")):
        return "goal"
    if any(marker in combined for marker in ("avoid", "dislike", "do not", "don't", "避免", "不喜欢", "不要")):
        return "avoid"
    if any(marker in combined for marker in ("setup", "environment", "workflow", "current", "配置", "环境", "流程", "当前")):
        return "setup"
    return "usage_context"
def infer_profile_domain(
    *,
    category: Any,
    semantic_slot: Any = "",
    slot_key: Any = "",
    anchors: Sequence[Any] = (),
    value: Any = "",
    metadata: Mapping[str, Any] | None = None,
) -> tuple[str, str]:
    """Pick a ``(domain_slug, domain_label)`` pair for a profile record.

    The first source that yields a label wins:
    1. explicit domain fields in ``metadata``;
    2. subject fields in ``metadata``;
    3. ``semantic_slot``, unless it is one of the generic slot names;
    4. the first anchor that does not look like a date or a single
       capitalized word;
    5. the last dotted segment of ``slot_key``;
    6. the first four tokens of ``value``;
    7. the category itself (label ``"general"`` when the category is empty).
    """
    meta = dict(metadata or {})

    # Metadata-provided labels: domain fields outrank subject fields.
    for keys in (
        ("profile_domain", "profile_domain_label", "domain"),
        ("subject", "extracted_subject"),
    ):
        label = _first_nonempty(*(meta.get(key, "") for key in keys))
        if label:
            return _slug(label), label

    generic_slots = {"profile", "preference", "goal", "constraint", "event", "fact", "status"}
    slot_name = _normalize(semantic_slot)
    if slot_name and slot_name not in generic_slots:
        return _slug(slot_name), slot_name.replace("_", " ")

    for raw_anchor in anchors:
        candidate = _clean_text(raw_anchor)
        if not candidate:
            continue
        # Skip anchors starting with four digits, containing "<1-2 digits> <word>",
        # or consisting of one capitalized word.
        if re.search(r"^\d{4}|\d{1,2}\s+[A-Za-z]+|^[A-Z][a-z]+$", candidate):
            continue
        return _slug(candidate), candidate

    key_text = _clean_text(slot_key)
    if key_text:
        last_segment = key_text.rpartition(".")[2].replace("_", " ")
        if last_segment:
            return _slug(last_segment), last_segment

    tokens = _tokens(value)
    if tokens:
        token_label = " ".join(tokens[:4])
        return _slug(token_label), token_label

    return _slug(category), _normalize(category) or "general"
def profile_candidate_metadata(
    *,
    category: Any,
    semantic_slot: Any = "",
    relation: Any = "",
    value: Any = "",
    source_span: Any = "",
    slot_key: Any = "",
    anchors: Sequence[Any] = (),
    subject: Any = "",
    subject_signature: Any = "",
    proposal: Mapping[str, Any] | None = None,
) -> dict[str, Any]:
    """Build the profile-layer metadata for a candidate record.

    Returns an empty dict when ``is_profile_layer_record`` rejects the record.
    Otherwise returns the inferred profile type and domain, a subject
    signature, the ``"<type>:<domain>:<signature>"`` support key, and up to
    twelve route terms.  Status and stage default to ``"writer_candidate"`` /
    ``"pre_consolidation"`` when ``proposal`` does not provide them.
    """
    writer_fields = dict(proposal or {})

    # Only the fields relevant to the profile-layer gate are forwarded.
    gate_metadata = {
        "profile_type": writer_fields.get("profile_type", ""),
        "profile_domain": writer_fields.get("profile_domain", ""),
        "profile_domain_label": writer_fields.get("profile_domain_label", ""),
        "subject": subject,
        "extracted_subject": writer_fields.get("extracted_subject", ""),
    }
    accepted = is_profile_layer_record(
        category=category,
        semantic_slot=semantic_slot,
        metadata=gate_metadata,
    )
    if not accepted:
        return {}

    # The raw source span stands in for the value when no value was extracted.
    evidence_text = value or source_span
    profile_type = infer_profile_type(
        category=category,
        semantic_slot=semantic_slot,
        relation=relation,
        value=evidence_text,
        metadata=writer_fields,
    )
    domain, domain_label = infer_profile_domain(
        category=category,
        semantic_slot=semantic_slot,
        slot_key=slot_key,
        anchors=anchors,
        value=evidence_text,
        metadata={**writer_fields, "subject": subject},
    )
    signature = _slug(subject_signature or subject or domain, fallback=domain)
    route_terms = _dedupe(
        [profile_type, domain_label, semantic_slot, subject, *(anchors or ())],
        max_items=12,
    )
    status = _clean_text(writer_fields.get("profile_candidate_status", "")) or "writer_candidate"
    stage = _clean_text(writer_fields.get("profile_consolidation_stage", "")) or "pre_consolidation"

    return {
        "profile_layer": True,
        "profile_candidate_status": status,
        "profile_consolidation_stage": stage,
        "profile_type": profile_type,
        "profile_domain": domain,
        "profile_domain_label": domain_label,
        "profile_subject_signature": signature,
        "profile_support_key": f"{profile_type}:{domain}:{signature}",
        "profile_route_terms": route_terms,
    }
def profile_aggregate_slot_key(metadata: Mapping[str, Any]) -> str:
    """Return the ``tmcra.profile.aggregate.<slug>`` slot key for ``metadata``.

    The slug comes from ``profile_support_key`` when present; otherwise it is
    rebuilt from the profile type, domain, and subject signature, with
    defaults filled in for missing parts.
    """
    meta = dict(metadata or {})
    key_source = _clean_text(meta.get("profile_support_key", ""))
    if not key_source:
        profile_type = _normalize(meta.get("profile_type", "")) or "usage_context"
        domain = _normalize(meta.get("profile_domain", "")) or _slug(meta.get("profile_domain_label", "general"))
        subject = _normalize(meta.get("profile_subject_signature", "")) or domain
        key_source = f"{profile_type}:{domain}:{subject}"
    return f"tmcra.profile.aggregate.{_slug(key_source)}"
def profile_aggregate_value(
    *,
    profile_type: Any,
    domain_label: Any,
    support_values: Sequence[Any],
) -> str:
    """Render the text value of an aggregate profile record.

    Up to five distinct support values are appended after the header,
    separated by ``"; "``.  With no usable values, the header alone is
    returned, terminated with a period.
    """
    typed = _clean_text(profile_type) or "usage_context"
    domain = _clean_text(domain_label) or "general"
    evidence = _dedupe(support_values, max_items=5)
    if evidence:
        return f"User {typed} profile for {domain}: " + "; ".join(evidence)
    return f"User {typed} profile for {domain}."
def _profile_output_kind(profile_types: Sequence[Any]) -> str:
    """Map a collection of profile types to a single output-kind label.

    Priority: constraint, goal, preference/avoid, setup; anything else is
    reported as ``"usage_context_profile"``.
    """
    present = {_normalize(entry) for entry in profile_types if _clean_text(entry)}
    priority = (
        ({"constraint"}, "constraint_profile"),
        ({"goal"}, "goal_profile"),
        ({"preference", "avoid"}, "preference_profile"),
        ({"setup"}, "setup_profile"),
    )
    for triggers, kind in priority:
        if present & triggers:
            return kind
    return "usage_context_profile"
+def _profile_update_policy(profile_types: Sequence[Any]) -> str:
+    normalized = {_normalize(item) for item in profile_types if _clean_text(item)}
+    if "constraint" in normalized:
+        return "preserve_until_explicitly_changed"
+    if normalized.intersection({"preference", "avoid", "goal", "setup"}):
+        return "update_on_newer_user_evidence"
+    return "background_context"
+def _profile_memory_type(profile_types: Sequence[Any]) -> str:
+    normalized = {_normalize(item) for item in profile_types if _clean_text(item)}
+    if "constraint" in normalized:
+        return "hard_constraint"
+    if normalized.intersection({"preference", "avoid"}):
+        return "durable_preference"
+    return "profile_context"
+def profile_summary(
+    *,
+    profile_types: Sequence[Any],
+    domain_label: Any,
+    support_values: Sequence[Any],
+    stage: str,
+) -> str:
+    kind = _profile_output_kind(profile_types)
+    domain = _clean_text(domain_label) or "general"
+    values = _dedupe(support_values, max_items=6 if stage == "cluster" else 4)
+    prefix = {
+        "constraint_profile": "User constraint profile",
+        "goal_profile": "User goal profile",
+        "preference_profile": "User preference profile",
+        "setup_profile": "User setup profile",
+        "usage_context_profile": "User usage-context profile",
+    }.get(kind, "User profile")
+    if not values:
+        return f"{prefix} for {domain}."
+    return f"{prefix} for {domain}: " + "; ".join(values)
+def build_profile_aggregate_metadata(
+    *,
+    support_record_id: Any,
+    support_turn_index: int,
+    support_value: Any,
+    support_anchors: Sequence[Any],
+    support_metadata: Mapping[str, Any],
+    existing_metadata: Mapping[str, Any] | None = None,
+) -> dict[str, Any]:
+    """Fold one supporting record into the metadata of an aggregate profile node.
+
+    Args:
+        support_record_id: Identifier of the supporting record being added.
+        support_turn_index: Turn index of that supporting record.
+        support_value: Value text of the supporting record.
+        support_anchors: Anchor terms of the supporting record; merged into the route terms.
+        support_metadata: Profile metadata of the supporting record (type, domain, subject, route terms).
+        existing_metadata: Metadata of the aggregate node so far, or ``None`` for a new aggregate.
+
+    Returns:
+        A new dict that starts from ``existing_metadata`` and overrides every
+        profile field listed below with the merged values.
+    """
+    support = dict(support_metadata or {})
+    existing = dict(existing_metadata or {})
+    # Type/domain/subject come from the incoming support only, with defaults when missing.
+    profile_type = _normalize(support.get("profile_type", "")) or "usage_context"
+    domain = _normalize(support.get("profile_domain", "")) or _slug(support.get("profile_domain_label", "general"))
+    domain_label = _clean_text(support.get("profile_domain_label", "")) or domain.replace("_", " ")
+    subject_signature = _clean_text(support.get("profile_subject_signature", "")) or domain
+    # Merge the new support into the accumulated lists.
+    # NOTE(review): _bounded_union / _bounded_int_union are defined elsewhere; presumably a capped, de-duplicating union — confirm.
+    support_ids = _bounded_union(existing.get("profile_support_ids", []) or [], [support_record_id], max_items=64)
+    support_turns = _bounded_int_union(existing.get("profile_support_turns", []) or [], [support_turn_index], max_items=64)
+    support_values = _bounded_union(existing.get("profile_support_values", []) or [], [support_value], max_items=12)
+    support_route_terms = _bounded_union(
+        existing.get("profile_route_terms", []) or [],
+        support.get("profile_route_terms", []) or [],
+        support_anchors,
+        max_items=24,
+    )
+    # Only the incoming support's type drives output kind, memory type and update policy.
+    profile_types = [profile_type]
+    output_kind = _profile_output_kind(profile_types)
+    memory_type = _profile_memory_type(profile_types)
+    summary = profile_summary(
+        profile_types=profile_types,
+        domain_label=domain_label,
+        support_values=support_values,
+        stage="aggregate",
+    )
+    value = profile_aggregate_value(
+        profile_type=profile_type,
+        domain_label=domain_label,
+        support_values=support_values,
+    )
+    return {
+        # Existing keys are spread first so every key written below takes precedence.
+        **existing,
+        "profile_layer": True,
+        "profile_candidate_status": "consolidated",
+        "profile_consolidation_stage": "aggregate",
+        "profile_consolidator_version": PROFILE_CONSOLIDATOR_VERSION,
+        "profile_aggregate_node": True,
+        "profile_type": profile_type,
+        "profile_domain": domain,
+        "profile_domain_label": domain_label,
+        "profile_subject_signature": subject_signature,
+        "profile_support_key": f"{profile_type}:{domain}:{subject_signature}",
+        "profile_support_ids": support_ids,
+        "profile_support_turns": support_turns,
+        "profile_support_values": support_values,
+        # Both counts are the length of the merged id list (built with max_items=64 above).
+        "profile_support_count": len(support_ids),
+        "profile_route_terms": support_route_terms,
+        "profile_value": value,
+        "profile_summary": summary,
+        "profile_output_kind": output_kind,
+        "profile_update_policy": _profile_update_policy(profile_types),
+        "profile_conflict_policy": "latest_active_support_only",
+        "profile_evidence_count": len(support_ids),
+        "memory_type": memory_type,
+        "durable_memory_type": memory_type,
+        "memory_chain_depth_layer": "profile",
+        "depth_layer": "profile",
+    }
+# Tokens that profile_cluster_tokens() discards before clustering: profile-type
+# vocabulary ("preference", "constraint", "goal", ...) and generic function words.
+# Presumably excluded because they appear in most profile texts and would inflate
+# token overlap between unrelated profiles.
+_PROFILE_CLUSTER_STOPWORDS = {
+    "user",
+    "profile",
+    "preference",
+    "preferences",
+    "constraint",
+    "constraints",
+    "goal",
+    "goals",
+    "avoid",
+    "usage",
+    "context",
+    "general",
+    "default",
+    "should",
+    "would",
+    "could",
+    "want",
+    "wants",
+    "need",
+    "needs",
+    "when",
+    "then",
+    "than",
+    "with",
+    "that",
+    "this",
+    "from",
+    "into",
+    "instead",
+}
+def profile_cluster_tokens(*values: Any) -> list[str]:
+    tokens: list[str] = []
+    for value in values:
+        if isinstance(value, (list, tuple, set)):
+            for item in value:
+                tokens.extend(_tokens(item))
+        else:
+            tokens.extend(_tokens(value))
+    filtered = [
+        token
+        for token in tokens
+        if token
+        and token not in _PROFILE_CLUSTER_STOPWORDS
+        and not token.isdigit()
+        and (len(token) > 2 or any("\u4e00" <= char <= "\u9fff" for char in token))
+    ]
+    return _dedupe(filtered, max_items=32)
+def profile_cluster_similarity(source_metadata: Mapping[str, Any], target_metadata: Mapping[str, Any]) -> float:
+    source = dict(source_metadata or {})
+    target = dict(target_metadata or {})
+    source_tokens = set(
+        profile_cluster_tokens(
+            source.get("profile_domain_label", ""),
+            source.get("profile_domain", ""),
+            source.get("profile_route_terms", []) or [],
+            source.get("profile_support_values", []) or [],
+            source.get("profile_value", ""),
+        )
+    )
+    target_tokens = set(
+        profile_cluster_tokens(
+            target.get("profile_domain_label", ""),
+            target.get("profile_domain", ""),
+            target.get("profile_route_terms", []) or [],
+            target.get("profile_support_values", []) or [],
+            target.get("profile_value", ""),
+        )
+    )
+    if not source_tokens or not target_tokens:
+        return 0.0
+    overlap = len(source_tokens & target_tokens) / max(1, len(source_tokens | target_tokens))
+    containment = len(source_tokens & target_tokens) / max(1, min(len(source_tokens), len(target_tokens)))
+    type_bonus = 0.06 if _normalize(source.get("profile_type", "")) == _normalize(target.get("profile_type", "")) else 0.0
+    return round(min(1.0, (0.62 * overlap) + (0.38 * containment) + type_bonus), 6)
+def profile_cluster_slot_key(metadata: Mapping[str, Any]) -> str:
+    data = dict(metadata or {})
+    tokens = profile_cluster_tokens(
+        data.get("profile_domain_label", ""),
+        data.get("profile_domain", ""),
+        data.get("profile_route_terms", []) or [],
+        data.get("profile_support_values", []) or [],
+        data.get("profile_value", ""),
+    )
+    if tokens:
+        return f"tmcra.profile.cluster.{_slug('_'.join(tokens[:5]))}"
+    support_profiles = data.get("profile_support_profile_ids", []) or []
+    support_seed = _clean_text(support_profiles[0] if support_profiles else "")
+    seed = support_seed or _clean_text(data.get("profile_support_key", "general"))
+    return f"tmcra.profile.cluster.{_slug(seed)}"
+def profile_cluster_value(*, support_values: Sequence[Any]) -> str:
+    values = _dedupe(support_values, max_items=8)
+    if not values:
+        return "User profile cluster."
+    return "User profile cluster: " + "; ".join(values)
+def build_profile_cluster_metadata(
+    *,
+    support_profile_id: Any,
+    support_metadata: Mapping[str, Any],
+    existing_metadata: Mapping[str, Any] | None = None,
+) -> dict[str, Any]:
+    """Fold one supporting profile into the metadata of a profile cluster node.
+
+    Args:
+        support_profile_id: Identifier of the profile node being added to the cluster.
+        support_metadata: Profile metadata of that node (support ids/turns/values, route terms, type, domain).
+        existing_metadata: Metadata of the cluster node so far, or ``None`` for a new cluster.
+
+    Returns:
+        A new dict that starts from ``existing_metadata`` and overrides every
+        profile field listed below with the merged cluster values.
+    """
+    support = dict(support_metadata or {})
+    existing = dict(existing_metadata or {})
+    # NOTE(review): _bounded_union / _bounded_int_union are defined elsewhere; presumably a capped, de-duplicating union — confirm.
+    support_profile_ids = _bounded_union(
+        existing.get("profile_support_profile_ids", []) or [],
+        [support_profile_id],
+        max_items=32,
+    )
+    support_ids = _bounded_union(
+        existing.get("profile_support_ids", []) or [],
+        support.get("profile_support_ids", []) or [],
+        max_items=96,
+    )
+    support_turns = _bounded_int_union(
+        existing.get("profile_support_turns", []) or [],
+        support.get("profile_support_turns", []) or [],
+        max_items=96,
+    )
+    raw_support_values = support.get("profile_support_values", []) or []
+    support_values = _bounded_union(
+        existing.get("profile_support_values", []) or [],
+        raw_support_values,
+        # The support's rendered profile_value is used only when it has no raw support values.
+        [] if raw_support_values else [support.get("profile_value", "")],
+        max_items=16,
+    )
+    # Route terms combine both existing term lists, the support's own terms, and tokens derived from its text fields.
+    route_terms = _bounded_union(
+        existing.get("profile_route_terms", []) or [],
+        existing.get("profile_cluster_route_terms", []) or [],
+        support.get("profile_route_terms", []) or [],
+        profile_cluster_tokens(
+            support.get("profile_domain_label", ""),
+            support.get("profile_domain", ""),
+            support.get("profile_value", ""),
+            support.get("profile_support_values", []) or [],
+        ),
+        max_items=32,
+    )
+    profile_types = _bounded_union(
+        existing.get("profile_cluster_types", []) or [],
+        [support.get("profile_type", "")],
+        max_items=8,
+    )
+    domains = _bounded_union(
+        existing.get("profile_cluster_domains", []) or [],
+        [support.get("profile_domain", ""), support.get("profile_domain_label", "")],
+        max_items=16,
+    )
+    # The first merged entry is taken as the cluster's primary type and domain.
+    primary_type = _clean_text(profile_types[0] if profile_types else support.get("profile_type", "")) or "usage_context"
+    primary_domain = _clean_text(domains[0] if domains else support.get("profile_domain", "")) or "general"
+    # Output kind, memory type and update policy consider every type in the cluster.
+    output_kind = _profile_output_kind(profile_types)
+    memory_type = _profile_memory_type(profile_types)
+    summary = profile_summary(
+        profile_types=profile_types,
+        domain_label=primary_domain,
+        support_values=support_values,
+        stage="cluster",
+    )
+    value = profile_cluster_value(support_values=support_values)
+    return {
+        # Existing keys are spread first so every key written below takes precedence.
+        **existing,
+        "profile_layer": True,
+        "profile_candidate_status": "consolidated",
+        "profile_consolidation_stage": "cluster",
+        "profile_consolidator_version": PROFILE_CONSOLIDATOR_VERSION,
+        "profile_cluster_node": True,
+        "profile_type": primary_type,
+        "profile_domain": _slug(primary_domain),
+        "profile_domain_label": primary_domain.replace("_", " "),
+        # Subject signature and support key are kept from the existing cluster when already set.
+        "profile_subject_signature": _clean_text(existing.get("profile_subject_signature", "")) or _slug(primary_domain),
+        "profile_support_key": _clean_text(existing.get("profile_support_key", "")) or f"cluster:{_slug(primary_domain)}",
+        "profile_support_profile_ids": support_profile_ids,
+        "profile_support_ids": support_ids,
+        "profile_support_turns": support_turns,
+        "profile_support_values": support_values,
+        "profile_support_count": len(support_ids),
+        "profile_cluster_profile_count": len(support_profile_ids),
+        "profile_cluster_types": profile_types,
+        "profile_cluster_domains": domains,
+        # The same merged term list is stored under both route-term keys.
+        "profile_cluster_route_terms": route_terms,
+        "profile_route_terms": route_terms,
+        "profile_value": value,
+        "profile_summary": summary,
+        "profile_output_kind": output_kind,
+        "profile_update_policy": _profile_update_policy(profile_types),
+        "profile_conflict_policy": "latest_active_support_only",
+        "profile_evidence_count": len(support_ids),
+        "memory_type": memory_type,
+        "durable_memory_type": memory_type,
+        "memory_chain_depth_layer": "profile",
+        "depth_layer": "profile",
+    }
+def infer_profile_query_intent(query: Any) -> dict[str, Any]:
+    query_text = _clean_text(query)
+    lowered = _normalize(query_text)
+    enabled = any(marker in lowered for marker in _PROFILE_QUERY_MARKERS)
+    types: list[str] = []
+    if any(marker in lowered for marker in ("prefer", "preference", "like", "dislike", "偏好", "喜欢", "不喜欢")):
+        types.append("preference")
+    if any(marker in lowered for marker in ("constraint", "must", "cannot", "policy", "约束", "限制", "必须", "不能")):
+        types.append("constraint")
+    if any(marker in lowered for marker in ("goal", "target", "objective", "目标")):
+        types.append("goal")
+    if any(marker in lowered for marker in ("avoid", "dislike", "避免", "不喜欢")):
+        types.append("avoid")
+    if any(marker in lowered for marker in ("setup", "profile", "current", "occupation", "role", "job", "career", "background", "identity", "experience", "配置", "画像", "当前")):
+        types.append("setup")
+    if enabled and any(
+        marker in lowered
+        for marker in (
+            "recommend",
+            "suggest",
+            "advice",
+            "tips",
+            "trouble with",
+            "struggling with",
+            "what do you think",
+            "should i",
+            "what should",
+            "serve",
+            "dinner",
+            "battery life",
+            "getting around",
+            "resources",
+            "learn more",
+            "推荐",
+            "建议",
+        )
+    ):
+        types.append("usage_context")
+    if not types and enabled:
+        types.append("usage_context")
+    return {
+        "enabled": enabled,
+        "types": _dedupe(types),
+        "tokens": _tokens(query_text),
+    }
+def profile_query_score_delta(
+    *,
+    query: Any,
+    query_tokens: set[str],
+    category: Any,
+    source_kind: Any,
+    semantic_slot: Any,
+    value: Any,
+    anchors: Sequence[Any],
+    metadata: Mapping[str, Any] | None = None,
+) -> tuple[float, str]:
+    data = dict(metadata or {})
+    intent = infer_profile_query_intent(query)
+    if not intent.get("enabled"):
+        return 0.0, ""
+    if not is_profile_layer_record(category=category, source_kind=source_kind, semantic_slot=semantic_slot, metadata=data):
+        return 0.0, ""
+    profile_type = _normalize(data.get("profile_type", "")) or infer_profile_type(
+        category=category,
+        semantic_slot=semantic_slot,
+        value=value,
+        metadata=data,
+    )
+    domain = _normalize(data.get("profile_domain_label", "") or data.get("profile_domain", ""))
+    route_terms = " ".join(str(item) for item in data.get("profile_route_terms", []) or [])
+    record_tokens = set(_tokens(f"{profile_type} {domain} {semantic_slot} {value} {' '.join(str(item) for item in anchors)} {route_terms}"))
+    overlap = len(set(query_tokens) & record_tokens) / max(1, len(set(query_tokens) | record_tokens)) if query_tokens or record_tokens else 0.0
+    type_match = profile_type in set(intent.get("types", []) or [])
+    delta = 0.20 + (0.16 if type_match else 0.0) + (0.18 * overlap)
+    if data.get("profile_candidate_status") == "consolidated":
+        delta += 0.06
+    if data.get("profile_cluster_node") or data.get("profile_consolidation_stage") == "cluster":
+        delta += 0.08
+    return round(min(delta, 0.54), 6), "profile_route"
+def profile_edge_score(source_metadata: Mapping[str, Any], target_metadata: Mapping[str, Any], *, source_value: Any = "", target_value: Any = "") -> tuple[float, str]:
+    source = dict(source_metadata or {})
+    target = dict(target_metadata or {})
+    if not source.get("profile_layer") or not target.get("profile_layer"):
+        return 0.0, ""
+    source_key = _normalize(source.get("profile_support_key", ""))
+    target_key = _normalize(target.get("profile_support_key", ""))
+    source_domain = _normalize(source.get("profile_domain", ""))
+    target_domain = _normalize(target.get("profile_domain", ""))
+    source_type = _normalize(source.get("profile_type", ""))
+    target_type = _normalize(target.get("profile_type", ""))
+    source_tokens = set(_tokens(f"{source_value} {source.get('profile_domain_label', '')} {' '.join(source.get('profile_route_terms', []) or [])}"))
+    target_tokens = set(_tokens(f"{target_value} {target.get('profile_domain_label', '')} {' '.join(target.get('profile_route_terms', []) or [])}"))
+    overlap = len(source_tokens & target_tokens) / max(1, len(source_tokens | target_tokens)) if source_tokens or target_tokens else 0.0
+    if source_key and source_key == target_key:
+        return round(0.72 + (0.14 * overlap), 6), "profile_support"
+    if source_domain and source_domain == target_domain:
+        return round(0.58 + (0.12 if source_type == target_type else 0.04) + (0.12 * overlap), 6), "profile_tunnel"
+    if source_type and source_type == target_type and overlap >= 0.22:
+        return round(0.44 + (0.18 * overlap), 6), "profile_soft_tunnel"
+    return 0.0, ""

code/run_lme_s10_native_tmcra.py ADDED Viewed

The diff for this file is too large to render. See raw diff

docs/BASELINE_S500_20260525.md ADDED Viewed

	@@ -0,0 +1,35 @@

+# S500 Baseline Record
+## Run Identity
+- Name: frozen baseline38 GPT5.4 full10
+- Date: 2026-05-25
+- Local package: `tmcra_s500_baseline38_release_20260525`
+- Result archive: `results/lme_s500_frozen_baseline38_full10_20260525_results.tar.gz`
+- Judge output: `results/judge_gpt4o_alias_vectorengine.jsonl`
+- Summary: `results/judge_gpt4o_alias_vectorengine.jsonl.summary.json`
+## Metrics
+- Evaluated samples: 500
+- Correct: 310
+- Accuracy: 62.00%
+- Judge alias: `gpt-4o`
+- Judge resolved model: `gpt-4o-2024-08-06`
+## Subtask Metrics
+| subtask | accuracy | count |
+| --- | ---: | ---: |
+| knowledge-update | 70.51% | 78 |
+| multi-session | 39.85% | 133 |
+| single-session-assistant | 78.57% | 56 |
+| single-session-preference | 56.67% | 30 |
+| single-session-user | 81.43% | 70 |
+| temporal-reasoning | 63.16% | 133 |
+## Interpretation
+This baseline is useful as a frozen comparison point. Stronger areas are single-session direct user facts, assistant detail recall, and knowledge-update. Weak areas remain multi-session aggregation, preference abstraction, and temporal reasoning.
+The full model output directory is included to preserve all runtime weights and training checkpoints, not only deployment weights.

docs/OPTIONAL_MODULES_AND_PARALLEL.md ADDED Viewed

	@@ -0,0 +1,347 @@

+# Optional Modules and Parallel Evaluation Plan
+This document describes two optional extension points already preserved in the current TMCRA package:
+- Embedder interface
+- LLM planner interface
+It also summarizes the parallel evaluation pattern used for the S500 baseline so later deployment, evaluation, and ablation runs can reuse the same structure.
+## 1. Current Main Path
+The frozen S500 baseline uses this core path:
+```text
+dialogue -> writer layer -> graph memory -> learned node/path scorer -> evidence selection -> answer layer
+```
+The responsibilities are:
+- The writer layer converts dialogue into memory nodes, event units, profile signals, and temporal signals.
+- The graph memory layer stores nodes, paths, and tunnel links.
+- `node_scorer.pt` and `path_scorer.pt` perform learned node/path scoring.
+- Evidence selection converts candidate memories into compact evidence.
+- The answer-layer LLM produces the final response from the selected evidence.
+Embedder and LLM planner modules are optional enhancement modules. They should not replace the main graph model. They are better treated as auxiliary channels, ablation switches, or higher-cost deployment paths.
+## 2. Embedder Interface
+The current code exposes three embedder integration points.
+### 2.1 Write-time Indexing
+At write time, TMCRA can build an embedding index for newly written memory nodes. That index can later serve as an auxiliary candidate source during retrieval.
+Relevant configuration:
+```bash
+export TMCRA_EMBEDDER_MODEL_PATH="BAAI/bge-m3"
+export TMCRA_EMBEDDER_DEVICE="cuda"
+export TMCRA_EMBEDDER_MODEL_MAX_LENGTH="512"
+export TMCRA_WRITE_EMBEDDER_INDEX_MODE="bge_m3"
+export TMCRA_WRITE_EMBEDDER_INDEX_MAX_TERMS="96"
+```
+Purpose:
+- Build semantic indexes after the writer stores memory nodes.
+- Keep the original graph structure unchanged.
+- Add a semantic candidate channel without replacing the learned node/path scorers.
+### 2.2 Pre-recall Candidate Expansion
+Before graph retrieval, the embedder can find candidate event ids, which are then passed into graph retrieval and scorer ranking.
+Relevant configuration:
+```bash
+export TMCRA_EMBEDDER_PRE_RECALL_MODE="bge_m3"
+export TMCRA_EMBEDDER_PRE_RECALL_K="16"
+export TMCRA_EMBEDDER_INDEX_RECALL_MODE="bge_m3"
+export TMCRA_EMBEDDER_INDEX_RECALL_K="24"
+```
+Purpose:
+- Expand the candidate range before retrieval.
+- Help when query wording and memory wording differ.
+- Provide an auxiliary path for semantically close memories with weak graph paths.
+### 2.3 Post-recall Fusion
+After retrieval, embedder-matched events can be fused with graph-model results so semantically relevant nodes receive a limited boost.
+Relevant configuration:
+```bash
+export TMCRA_EMBEDDER_FUSION_MODE="on"
+export TMCRA_EMBEDDER_FUSION_WEIGHT="0.35"
+export TMCRA_EMBEDDER_FUSION_SCORE_FLOOR="0.62"
+export TMCRA_EMBEDDER_FUSION_TOP_K="16"
+export TMCRA_EMBEDDER_FUSION_SELECT_K="4"
+export TMCRA_EMBEDDER_FUSION_MAX_BOOST="0.42"
+```
+Purpose:
+- Give semantically similar candidates a bounded score boost.
+- Prevent the embedder from directly replacing main evidence ranking.
+- Use embedding as an auxiliary recall layer for the learned graph scorer.
+## 3. LLM Planner Interface
+The current code exposes three main LLM planner paths. Each one runs either after retrieval, to organize the retrieved evidence, or on the query before it enters retrieval.
+### 3.1 Evidence-unit Planner
+The evidence-unit planner runs after retrieval and uses an LLM to normalize retrieved windows into evidence units.
+Relevant configuration:
+```bash
+export TMCRA_EVIDENCE_UNIT_PLANNER_MODE="on"
+export TMCRA_EVIDENCE_UNIT_PLANNER_BASE_URL="<openai-compatible-base-url>"
+export TMCRA_EVIDENCE_UNIT_PLANNER_MODEL="<planner-model>"
+export TMCRA_EVIDENCE_UNIT_PLANNER_API_KEY="<planner-api-key>"
+export TMCRA_EVIDENCE_UNIT_PLANNER_MAX_CANDIDATES="10"
+export TMCRA_EVIDENCE_UNIT_PLANNER_CHARS="1100"
+export TMCRA_EVIDENCE_UNIT_PLANNER_MAX_TOKENS="760"
+export TMCRA_EVIDENCE_UNIT_PLANNER_REORDER="0"
+```
+If planner-specific base/model/key values are not configured, this planner inherits answer-layer configuration:
+```bash
+export TMCRA_ANSWER_BASE_URL="<openai-compatible-base-url>"
+export TMCRA_ANSWER_MODEL="<answer-model>"
+export TMCRA_ANSWER_API_KEY="<answer-api-key>"
+```
+Purpose:
+- Mark answer units, positive evidence, temporal anchors, current values, old values, constraints, and negative evidence.
+- Help the final answer layer understand how to use the retrieved evidence.
+- Organize evidence without replacing graph retrieval.
+### 3.2 LLM Channel Planner
+The LLM channel planner runs before final evidence is sent to the answer layer. It separates main evidence, coverage evidence, support evidence, and suppressed evidence.
+Relevant configuration:
+```bash
+export TMCRA_LLM_CHANNEL_PLANNER_MODE="on"
+export TMCRA_LLM_CHANNEL_PLANNER_MAX_WINDOWS="16"
+export TMCRA_LLM_CHANNEL_PLANNER_WINDOW_CHARS="520"
+export TMCRA_LLM_CHANNEL_PLANNER_MAX_TOKENS="700"
+```
+Purpose:
+- Make coverage evidence supplement main facts instead of replacing them.
+- Improve count/sum/ratio/duration/multi-unit tasks.
+- Provide a higher-cost quality mode for experiments and selected deployments.
+In the frozen S500 baseline record, this module was:
+```text
+llm_channel_planner=off
+```
+### 3.3 Query Graph Builder
+The query graph builder runs before retrieval. It converts the user question into a compact query graph and can expand that graph into sidecar retrieval queries.
+Relevant configuration:
+```bash
+export TMCRA_QUERY_GRAPH_BUILDER_MODE="on"
+export TMCRA_QUERY_GRAPH_BASE_URL="<openai-compatible-base-url>"
+export TMCRA_QUERY_GRAPH_MODEL="<query-graph-model>"
+export TMCRA_QUERY_GRAPH_API_KEY="<query-graph-api-key>"
+export TMCRA_QUERY_GRAPH_MAX_TOKENS="700"
+export TMCRA_QUERY_GRAPH_SIDECAR_RETRIEVAL_MODE="on"
+export TMCRA_QUERY_GRAPH_SIDECAR_MAX_QUERIES="6"
+export TMCRA_QUERY_GRAPH_SIDECAR_TOP_K="4"
+```
+Purpose:
+- Convert the question into task intent, required units, operation, and tunnel needs.
+- Give complex multi-session, temporal, and profile questions a clearer retrieval direction.
+- Test whether building a query graph before retrieval improves candidate recall.
+## 4. Local Model Planner vs LLM Planner
+The code also contains local model planner interfaces, for example:
+```bash
+export TMCRA_ANSWER_WINDOW_PLANNER_MODE="on"
+export TMCRA_ANSWER_WINDOW_PLANNER_MODEL_PATH="<planner-checkpoint>"
+export TMCRA_UNIFIED_OPERATION_PLANNER_MODE="on"
+export TMCRA_UNIFIED_OPERATION_PLANNER_MODEL_PATH="<planner-checkpoint>"
+export TMCRA_INJECTION_PLANNER_MODE="guided"
+export TMCRA_INJECTION_PLANNER_MODEL_PATH="<planner-checkpoint>"
+```
+These are local model interfaces, not LLM planner interfaces.
+The distinction is:
+- LLM planner: calls an external or local LLM; higher cost; useful for validating capability ceilings.
+- Local model planner: lower cost and better for productization, but requires targeted training and stability validation.
+Recommended workflow:
+```text
+validate behavior with an LLM planner -> distill or train the useful behavior into the graph model or a local planner head
+```
+## 5. Parallel Evaluation Plan
+The S500 baseline used shard-level parallelism:
+```text
+500 samples -> 10 shards -> 50 samples per shard
+```
+Each shard runs independently:
+```text
+input_shard_N.json -> shard_N/ -> predictions/debug/summary
+```
+Core parallelization principles:
+- One independent process per shard.
+- One independent output directory per shard.
+- Writer key pool is rotated by shard index.
+- Main model weights are read-only and shared.
+- Predictions, samples_debug, and judge results are merged after all shards complete.
+Key S500 baseline runtime configuration:
+```text
+samples=500
+shards=10
+per_shard=50
+writer=DeepSeek v4 Flash
+answer_layer=GPT5.4
+llm_channel_planner=off
+history_mode=controlled_answer_plus_distractors
+```
+### 5.1 Reusable Parallel Template
+Recommended baseline template:
+```bash
+export TMCRA_RETRIEVAL_MODE="hybrid_node_scored"
+export TMCRA_REQUIRE_LEARNED_SCORER="1"
+export TMCRA_NODE_MODEL_DEVICE="cuda"
+export TMCRA_NODE_MODEL_PATH="models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer.pt"
+export TMCRA_PATH_MODEL_PATH="models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer.pt"
+export TMCRA_WRITER_MODEL="deepseek-chat"
+export TMCRA_WRITER_MAX_TOKENS="512"
+export TMCRA_WRITER_TIMEOUT_SECONDS="180"
+export TMCRA_WRITER_TEMPERATURE="0"
+export TMCRA_WRITER_INPUT_MODE="delta"
+export TMCRA_WRITER_MAX_PROPOSALS="2"
+export TMCRA_ANSWER_MAX_TOKENS="512"
+```
+Single-shard execution shape:
+```bash
+python code/run_lme_s10_native_tmcra.py \
+  --data "<run-root>/input_shard_N.json" \
+  --repo "<tmcra-repo-root>" \
+  --service-root "<tmcra-service-root>" \
+  --out "<run-root>/shard_N" \
+  --limit 50 \
+  --top-k 10 \
+  --max-distractor-sessions 5 \
+  --max-distractor-chunks 1 \
+  --max-answer-chunks 4 \
+  --chunk-chars 7000
+```
+### 5.2 Suggested Rollout Order
+Do not enable all optional modules at once. Use staged A/B testing:
+1. **Baseline scorer-only**
+   - embedder off
+   - LLM channel planner off
+   - query graph builder off
+   - confirms frozen baseline stability
+2. **Embedder pre-recall A/B**
+   - enable write-time indexing and pre-recall candidate expansion only
+   - measure candidate hit rate, retrieval latency, and error-type shifts
+3. **Embedder fusion A/B**
+   - enable fusion only after pre-recall is stable
+   - keep boost bounded so embedder does not override the main graph scorer
+4. **Evidence-unit planner A/B**
+   - enable LLM evidence-unit planner
+   - measure whether the answer layer uses retrieved evidence better
+5. **LLM channel planner A/B**
+   - test mainly on multi/aggregation/temporal error clusters
+   - verify coverage evidence supplements main facts instead of replacing them
+6. **Query graph builder A/B**
+   - validate the ceiling of query-graph-first retrieval
+   - if effective, distill the behavior into query-understanding or graph scorer training
+### 5.3 Parallel Scale Guidance
+Parallelism should not be determined only by the number of API keys. Also watch:
+- GPU memory
+- CPU memory
+- writer latency
+- answer-layer latency
+- graph ingest / SQLite write overhead
+- average writer calls per shard
+Scale gradually:
+```text
+5 shards smoke -> 10 shards stable -> 20 shards stress -> 30 shards only if no memory/API/IO issue
+```
+If the error rate rises, available memory drops sharply, API 402/429 responses appear, chunk errors occur, or shards stall, reduce parallelism first and then resume the missing samples.
+## 6. Recommended Experiment Matrix
+Minimal interpretable matrix:
+| experiment | Embedder | LLM planner | purpose |
+| --- | --- | --- | --- |
+| baseline | off | off | fixed main graph-model baseline |
+| embedder-pre | pre-recall on | off | test candidate expansion |
+| embedder-fusion | pre-recall + fusion on | off | test semantic fusion |
+| evidence-unit | off | evidence-unit on | test pre-answer evidence organization |
+| channel-planner | off | channel planner on | test main/coverage separation |
+| query-graph | off | query graph on | test query-graph retrieval |
+| combined-light | pre-recall on | evidence-unit on | test lower-cost combined path |
+| combined-heavy | pre-recall + fusion on | evidence-unit + channel planner on | test capability ceiling |
+Each run should preserve:
+- predictions
+- samples_debug
+- judge output
+- by-task accuracy
+- writer calls
+- retrieval latency
+- answer latency
+- per-sample error type
+This makes it possible to separate recall errors, evidence-selection errors, planner errors, answer-layer errors, and parallel-runtime instability.

docs/OPTIONAL_MODULES_AND_PARALLEL.zh-CN.md ADDED Viewed

	@@ -0,0 +1,345 @@

+# 可开启模块接入与并行推进说明
+本文说明当前 TMCRA 包中已经保留的两个可开启接入口：
+- Embedder 接入口
+- LLM planner 接入口
+同时说明 S500 基线测试中使用过的并行推进方式，方便后续在部署、评估或消融实验中复用。
+## 1. 当前主链路
+冻结 S500 基线的核心链路是：
+```text
+dialogue -> writer layer -> graph memory -> learned node/path scorer -> evidence selection -> answer layer
+```
+其中：
+- 写入层负责把对话转成记忆节点、事件单元、profile 信号、时间信号。
+- 图记忆层保存节点、路径和隧穿关系。
+- `node_scorer.pt` 和 `path_scorer.pt` 负责学习式节点/路径打分。
+- evidence selection 把候选记忆整理成紧凑证据。
+- 回答层 LLM 根据证据生成最终回复。
+Embedder 和 LLM planner 都是可开启增强模块，不应该替代主图模型。它们更适合作为辅助通道、对比实验或高成本部署路径。
+## 2. Embedder 接入口
+Embedder 当前有三类接入位置。
+### 2.1 写入阶段索引
+写入阶段可以为新写入的记忆建立 embedding 索引，后续召回时作为辅助候选来源。
+相关配置：
+```bash
+export TMCRA_EMBEDDER_MODEL_PATH="BAAI/bge-m3"
+export TMCRA_EMBEDDER_DEVICE="cuda"
+export TMCRA_EMBEDDER_MODEL_MAX_LENGTH="512"
+export TMCRA_WRITE_EMBEDDER_INDEX_MODE="bge_m3"
+export TMCRA_WRITE_EMBEDDER_INDEX_MAX_TERMS="96"
+```
+作用：
+- 在 writer 写入记忆节点后，为节点文本建立语义索引。
+- 不改变原图结构。
+- 不替代 node/path scorer，只是给召回增加一条语义候选通道。
+### 2.2 召回前候选补充
+召回前可以先用 embedder 找到一批候选 event id，再交给图召回和 scorer 做后续排序。
+相关配置：
+```bash
+export TMCRA_EMBEDDER_PRE_RECALL_MODE="bge_m3"
+export TMCRA_EMBEDDER_PRE_RECALL_K="16"
+export TMCRA_EMBEDDER_INDEX_RECALL_MODE="bge_m3"
+export TMCRA_EMBEDDER_INDEX_RECALL_K="24"
+```
+作用：
+- 帮助召回阶段扩大候选范围。
+- 对语义相近但图路径弱的记忆提供补充入口。
+- 适合测试 query 与 memory 表达不完全一致的场景。
+### 2.3 召回后融合加权
+召回后可以把 embedder 命中的 event 与主图模型结果融合，让高语义相关的节点获得有限 boost。
+相关配置：
+```bash
+export TMCRA_EMBEDDER_FUSION_MODE="on"
+export TMCRA_EMBEDDER_FUSION_WEIGHT="0.35"
+export TMCRA_EMBEDDER_FUSION_SCORE_FLOOR="0.62"
+export TMCRA_EMBEDDER_FUSION_TOP_K="16"
+export TMCRA_EMBEDDER_FUSION_SELECT_K="4"
+export TMCRA_EMBEDDER_FUSION_MAX_BOOST="0.42"
+```
+作用：
+- 给语义相似候选增加有限分数。
+- 避免 embedder 直接重排主证据。
+- 适合作为主图 scorer 的辅助召回层。
+## 3. LLM Planner 接入口
+LLM planner 当前主要有三类接入方式。它们分别位于召回之后或 query 进入召回之前，用于增强召回方向和证据组织能力。
+### 3.1 Evidence-unit planner
+Evidence-unit planner 在召回后运行，用 LLM 把候选窗口整理成 evidence unit。
+相关配置：
+```bash
+export TMCRA_EVIDENCE_UNIT_PLANNER_MODE="on"
+export TMCRA_EVIDENCE_UNIT_PLANNER_BASE_URL="<openai-compatible-base-url>"
+export TMCRA_EVIDENCE_UNIT_PLANNER_MODEL="<planner-model>"
+export TMCRA_EVIDENCE_UNIT_PLANNER_API_KEY="<planner-api-key>"
+export TMCRA_EVIDENCE_UNIT_PLANNER_MAX_CANDIDATES="10"
+export TMCRA_EVIDENCE_UNIT_PLANNER_CHARS="1100"
+export TMCRA_EVIDENCE_UNIT_PLANNER_MAX_TOKENS="760"
+export TMCRA_EVIDENCE_UNIT_PLANNER_REORDER="0"
+```
+如果不单独设置 planner 的 base/model/key，它会继承回答层配置：
+```bash
+export TMCRA_ANSWER_BASE_URL="<openai-compatible-base-url>"
+export TMCRA_ANSWER_MODEL="<answer-model>"
+export TMCRA_ANSWER_API_KEY="<answer-api-key>"
+```
+作用：
+- 标注候选窗口里的 answer unit、positive evidence、temporal anchor、current value、old value、constraint、negative evidence。
+- 帮助最终回答层理解“这批证据应该怎么用”。
+- 默认更适合做证据整理，不建议让它直接替代图召回。
+### 3.2 LLM channel planner
+LLM channel planner 在最终证据进入回答层前运行，用 LLM 区分 main evidence、coverage evidence、support evidence 和 suppress evidence。
+相关配置：
+```bash
+export TMCRA_LLM_CHANNEL_PLANNER_MODE="on"
+export TMCRA_LLM_CHANNEL_PLANNER_MAX_WINDOWS="16"
+export TMCRA_LLM_CHANNEL_PLANNER_WINDOW_CHARS="520"
+export TMCRA_LLM_CHANNEL_PLANNER_MAX_TOKENS="700"
+```
+作用：
+- 让 coverage 证据补充主事实，而不是替代主事实。
+- 对 count/sum/ratio/duration/multi-unit 问题特别有用。
+- 成本高于纯模型 scorer，适合高质量模式或实验开关。
+冻结 S500 基线记录中，该项为：
+```text
+llm_channel_planner=off
+```
+### 3.3 Query graph builder
+Query graph builder 在召回前运行，把用户问题转成 query graph，再扩展为 sidecar retrieval queries。
+相关配置：
+```bash
+export TMCRA_QUERY_GRAPH_BUILDER_MODE="on"
+export TMCRA_QUERY_GRAPH_BASE_URL="<openai-compatible-base-url>"
+export TMCRA_QUERY_GRAPH_MODEL="<query-graph-model>"
+export TMCRA_QUERY_GRAPH_API_KEY="<query-graph-api-key>"
+export TMCRA_QUERY_GRAPH_MAX_TOKENS="700"
+export TMCRA_QUERY_GRAPH_SIDECAR_RETRIEVAL_MODE="on"
+export TMCRA_QUERY_GRAPH_SIDECAR_MAX_QUERIES="6"
+export TMCRA_QUERY_GRAPH_SIDECAR_TOP_K="4"
+```
+作用：
+- 把问题拆成 task intent、required units、operation、tunnel needs。
+- 对复杂 multi-session、temporal、profile 问题提供更明确的召回方向。
+- 适合做对比实验，观察“问题建图后再召回”是否提升候选命中。
+## 4. 本地模型 planner 与 LLM planner 的区别
+当前代码里也有本地模型 planner 接口，例如：
+```bash
+export TMCRA_ANSWER_WINDOW_PLANNER_MODE="on"
+export TMCRA_ANSWER_WINDOW_PLANNER_MODEL_PATH="<planner-checkpoint>"
+export TMCRA_UNIFIED_OPERATION_PLANNER_MODE="on"
+export TMCRA_UNIFIED_OPERATION_PLANNER_MODEL_PATH="<planner-checkpoint>"
+export TMCRA_INJECTION_PLANNER_MODE="guided"
+export TMCRA_INJECTION_PLANNER_MODEL_PATH="<planner-checkpoint>"
+```
+这些是本地模型接入口，不是 LLM planner。区别是：
+- LLM planner：调用外部或本地 LLM，成本更高，适合验证能力上限。
+- 本地模型 planner：成本更低，更适合产品化，但需要专项训练和稳定性验证。
+建议流程是：
+```text
+先用 LLM planner 验证能力是否有效 -> 再把有效行为蒸馏/训练进本地的图模型或 planner head
+```
+## 5. 并行推进方案
+S500 基线采用过分片并行方式：
+```text
+500 samples -> 10 shards -> 50 samples per shard
+```
+每个 shard 独立运行：
+```text
+input_shard_N.json -> shard_N/ -> predictions/debug/summary
+```
+核心并行原则：
+- 每个 shard 独立进程。
+- 每个 shard 独立输出目录。
+- writer key pool 按 shard index 轮转。
+- 主模型权重只读共享。
+- 最终合并 predictions、samples_debug、judge 结果。
+冻结 S500 记录中的关键运行配置：
+```text
+samples=500
+shards=10
+per_shard=50
+writer=DeepSeek v4 Flash
+answer_layer=GPT5.4
+llm_channel_planner=off
+history_mode=controlled_answer_plus_distractors
+```
+### 5.1 复用的并行模板
+推荐的并行模板：
+```bash
+export TMCRA_RETRIEVAL_MODE="hybrid_node_scored"
+export TMCRA_REQUIRE_LEARNED_SCORER="1"
+export TMCRA_NODE_MODEL_DEVICE="cuda"
+export TMCRA_NODE_MODEL_PATH="models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer.pt"
+export TMCRA_PATH_MODEL_PATH="models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer.pt"
+export TMCRA_WRITER_MODEL="deepseek-chat"
+export TMCRA_WRITER_MAX_TOKENS="512"
+export TMCRA_WRITER_TIMEOUT_SECONDS="180"
+export TMCRA_WRITER_TEMPERATURE="0"
+export TMCRA_WRITER_INPUT_MODE="delta"
+export TMCRA_WRITER_MAX_PROPOSALS="2"
+export TMCRA_ANSWER_MAX_TOKENS="512"
+```
+单 shard 执行形态：
+```bash
+python code/run_lme_s10_native_tmcra.py \
+  --data "<run-root>/input_shard_N.json" \
+  --repo "<tmcra-repo-root>" \
+  --service-root "<tmcra-service-root>" \
+  --out "<run-root>/shard_N" \
+  --limit 50 \
+  --top-k 10 \
+  --max-distractor-sessions 5 \
+  --max-distractor-chunks 1 \
+  --max-answer-chunks 4 \
+  --chunk-chars 7000
+```
+### 5.2 推进顺序
+建议按下面顺序推进，不要一次把所有模块全打开：
+1. **Baseline scorer-only**
+   - embedder off
+   - LLM channel planner off
+   - query graph builder off
+   - 用于确认 frozen baseline 是否稳定。
+2. **Embedder pre-recall A/B**
+   - 只打开写入索引和召回前候选补充。
+   - 观察候选命中率、召回延迟、错误类型是否变化。
+3. **Embedder fusion A/B**
+   - 在 pre-recall 稳定后打开 fusion。
+   - 控制 boost，不允许 embedder 直接压过主图 scorer。
+4. **Evidence-unit planner A/B**
+   - 打开 LLM evidence-unit planner。
+   - 观察 answer 层是否更会使用召回证据。
+5. **LLM channel planner A/B**
+   - 只在 multi/aggregation/temporal 类错误样本集上验证。
+   - 重点观察 coverage 证据是否补充主事实，而不是替换主事实。
+6. **Query graph builder A/B**
+   - 用于验证“问题建图后再召回”的上限。
+   - 如果有效，再考虑训练进 query-understanding 或 graph scorer。
+### 5.3 并行规模建议
+并行数不要只看 API 数量，还要看：
+- GPU 显存
+- CPU 内存
+- writer 延迟
+- answer 层延迟
+- graph ingest/SQLite 写入开销
+- 每 shard 平均 writer calls
+建议从小到大：
+```text
+5 shards smoke -> 10 shards stable -> 20 shards stress -> 30 shards only if no memory/API/IO issue
+```
+如果出现错误率升高、可用内存明显下降、API 402/429、chunk error 或 shard 卡住，应先降低并行数，再补跑缺失样本。
+## 6. 推荐实验矩阵
+最小可解释矩阵：
+| 实验 | Embedder | LLM planner | 目的 |
+| --- | --- | --- | --- |
+| baseline | off | off | 固定主图模型基线 |
+| embedder-pre | pre-recall on | off | 测候选扩展是否提升 |
+| embedder-fusion | pre-recall + fusion on | off | 测语义融合是否提升 |
+| evidence-unit | off | evidence-unit on | 测回答前证据整理 |
+| channel-planner | off | channel planner on | 测 main/coverage 分离 |
+| query-graph | off | query graph on | 测问题建图召回 |
+| combined-light | pre-recall on | evidence-unit on | 测较低成本组合 |
+| combined-heavy | pre-recall + fusion on | evidence-unit + channel planner on | 测能力上限 |
+每一组都应保留：
+- predictions
+- samples_debug
+- judge output
+- by-task accuracy
+- writer calls
+- retrieval latency
+- answer latency
+- per-sample error type
+这样后续可以判断问题来自召回、证据选择、planner、回答层，还是并行运行不稳定。

docs/TRAINING.md ADDED Viewed

	@@ -0,0 +1,113 @@

+# TMCRA Training Notes
+This document summarizes the training direction behind the graph scorer package included in this release.
+## Training Goal
+TMCRA trains graph-scoring components for long-memory retrieval. The goal is to help the runtime decide which memory nodes and graph paths should be surfaced to an answer model for a given user query.
+The trained model is not intended to replace the answer LLM. It is responsible for memory selection:
+- identify relevant memory nodes
+- score graph paths between related memory events
+- preserve useful cross-turn and cross-session links
+- reduce noisy or stale evidence before answer generation
+## Model Components
+The released model directory contains two main runtime scorers:
+- `node_scorer.pt`: scores candidate memory nodes.
+- `path_scorer.pt`: scores graph paths and tunnel links between memory nodes.
+The training output also includes:
+- best checkpoints
+- last checkpoints
+- epoch and step checkpoints
+- training summary and logs
+- export manifest
+## Training Data Direction
+Training data is built around dialogue-memory behavior rather than isolated QA pairs. Samples are designed to teach the model how memory should connect across turns and sessions.
+The major training directions include:
+- direct user facts
+- assistant-provided details
+- preference/profile signals
+- temporal state changes
+- old-value vs current-value selection
+- cross-session event links
+- multi-evidence aggregation
+- evidence-positive vs noise/negative memory separation
+- unit-to-unit coverage for count/sum/compare tasks
+## Graph Memory Supervision
+Each training example is converted into graph-oriented supervision. Instead of only asking whether a text chunk is relevant, TMCRA trains over:
+- memory node relevance
+- event-unit relevance
+- path usefulness
+- tunnel/link usefulness
+- evidence role
+- currentness and temporal state
+- whether a candidate should be injected into answer context
+This allows the runtime to learn memory structure, not only lexical similarity.
+## Writer and Scorer Separation
+TMCRA separates memory writing from graph scoring.
+The writer extracts candidate memory records from dialogue. The graph model then learns how those records should be selected and connected during retrieval.
+This separation is important because a long-memory system needs two different abilities:
+- write useful memory units from conversation
+- retrieve and connect the right units later under noise
+## Training Output Included
+The packaged model output is located at:
+```text
+models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/
+```
+Runtime files:
+```text
+node_scorer.pt
+path_scorer.pt
+export_manifest.json
+```
+Full training trace:
+```text
+checkpoints/
+node_scorer_best.pt
+path_scorer_best.pt
+node_scorer_last.pt
+path_scorer_last.pt
+train_summary.json
+train.log
+training_issues.jsonl
+```
+## Current Training Lessons
+The current baseline shows that TMCRA has strong single-session fact recall and assistant-detail recall. It also has working temporal and preference layers.
+The main remaining training targets are:
+- stronger multi-session aggregation
+- better unit coverage for count/sum/compare questions
+- deeper temporal graph planning
+- query-graph to memory-graph matching
+- more stable preference abstraction under indirect user requests
+These directions are the next steps for improving TMCRA from a working long-memory runtime into a stronger general agent-memory layer.

github_push.err.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ To https://github.com/reshuibuduo/TMCRA-agent-memory-algorithm.git
2	+ * [new branch] master -> master

github_push.out.log ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ Uploading LFS objects: 100% (27/27), 5.6 GB \| 0 B/s, done.
2	+ branch 'master' set up to track 'origin/master'.

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_01.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8aaa0153ae0642e709d287b947403493b461594dc8f7ac7f94b31b00aa32c7cf
+size 238030855

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_02.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9eec8b8fcef26d82c9306d8f96b98915724e8c0d0967ba33ddcfba7dbf983cb4
+size 238030855

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_03.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3137ad50512ef6890eecf0d241348844e2ae43c4a46649b4ade50f29f68c4c33
+size 238030855

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_04.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38c5e248f449080f4316c8f0a28ea8490bb2d9413afda7ce0d4324b256ce025a
+size 238030855

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_05.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:61e6ee0e0f02de94c46b7d3bfb8bf954a7588282592fd6507068cb46942bca16
+size 238030855

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_06.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:92af8e2ae8e55a29b9abcaffc02f37bb09544188f591b00fb5200fadfb46fbed
+size 238030855

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_epoch_07.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5dec148fe044f7f94694b4f72b49a01e7a74455f1bee17253beb53f6e6f94ce7
+size 238030855

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_step_0000600.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:29d3d1436839617bdda708090cac2727cb66c19dcdcf8634c6474f907b2c82c2
+size 238032983

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/node_scorer_step_0000650.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02077ad6b1df777cf8d9478a3fda234e7059340bce7d320f12184e89225e9510
+size 238032983

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_01.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:28e8279e49f7dcc7a9edacf7c827de6bb2d0bc8ac3c2725a12dae314dd847ee1
+size 238030855

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_02.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b895a3a0f2fdb0690ae9ea26b04f2c93a67b23a66a988fab2ad5db12b7c35c7
+size 238030855

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_03.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b6a0480a36e0665efd92edab06f51a3deec6195e72a09bdb253205f91cb4fec
+size 238030855

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_04.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2b13d4ca8479fbdec63eebefdf40cbdabc700fb8074b8b7317a619b66260dafb
+size 238030855

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_05.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:182fe18bb0288645186d13e677fbd2b560681794a007efd02de7ad21c39cf581
+size 238030855

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_epoch_07.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5d1961ac8ee0bcb0d8f9e5ea82de282e990f26fbbe237e8bb8e45d60be827749
+size 238030855

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_step_0000600.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2fbbc7510ecc077fe9724593568ea829f5bf2b90e8e59388675052853c1d9d99
+size 238032983

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints/path_scorer_step_0000650.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:148b3461a74a390534a1cd5161555e5c310d11c2d0406e785654957eae9178b8
+size 238032983

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/export_manifest.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"answer_calibration_variant": "competition_alignment_v3_trimaze", "answer_plan_variant": "candidate_event_answer_plan_v1", "checkpoint_audit": {"node": {"missing_prefixes": [], "ok": true, "path": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer.pt", "prefix_counts": {"event_head": 10, "event_tunnel_head": 4, "final_event_fusion_head": 4, "final_path_fusion_head": 4, "memory_router_head": 6, "path_head": 11, "path_tunnel_head": 4, "question_intent_head": 12, "temporal_head": 4}, "required_prefixes": ["memory_router_head", "temporal_head", "question_intent_head", "event_head", "path_head", "event_tunnel_head", "path_tunnel_head", "final_event_fusion_head", "final_path_fusion_head"], "saved_head": "event", "state_key_count": 448}, "path": {"missing_prefixes": [], "ok": true, "path": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer.pt", "prefix_counts": {"event_head": 10, "event_tunnel_head": 4, "final_event_fusion_head": 4, "final_path_fusion_head": 4, "memory_router_head": 6, "path_head": 11, "path_tunnel_head": 4, "question_intent_head": 12, "temporal_head": 4}, "required_prefixes": ["memory_router_head", "temporal_head", "question_intent_head", "event_head", "path_head", "event_tunnel_head", "path_tunnel_head", "final_event_fusion_head", "final_path_fusion_head"], "saved_head": "path", "state_key_count": 448}}, "completed_at": "2026-05-24T04:38:50", "created_at": "2026-05-23T20:23:49.564683+00:00", "dataset_version": "typed_tunnel_graph_v1", "encoder_attention_heads": 8, "encoder_attention_layers": 6, "encoder_structural_bias_variant": "tmcra_token_role_relbias_v1", "event_distractor_variant": "candidate_distractor_delta_bce_v1", "event_subgraph_attention_heads": 8, "event_subgraph_attention_layers": 6, "event_subgraph_refiner_variant": "event_centered_subgraph_attn_6l_h448", "focus_counts": {"action_frame_temporal_binding": 58, "cross_session_multi_event_chain": 104, 
"hard_distractor_similar_entity": 61, "normal_non_chain_stability": 16, "numeric_aggregation_unit_coverage": 116, "profile_boundary_mixed_case": 36, "same_entity_unit_to_unit_chain": 104, "status_negative_exclusion_update": 53}, "graph_count": 333, "input_jsonl": "outputs/action_frame_tunnel_final_accepted_20260524/typed_tunnel_accepted.jsonl", "memory_router_variant": "question_pooled_multilabel_v1", "memory_tunnel_variant": "chain_depth_tunnel_support_delta_v1", "message_passing_variant": "support_to_event_bidirectional_shared_typed_v1", "model_arch_version": "node_memory_v11_tmcra_scale49m_trimaze", "node_encoder_variant": "hashed_tmcra_token_attn_pool_6l_w256", "node_max_tokens": 32, "notes": ["Benchmark data is not an input to this builder.", "Typed tunnel signals are encoded as event/path metadata and existing graph node/path types.", "No new node type, edge type, path type, or model tensor shape is introduced.", "Train with train_locomo_node_memory.py and a compatible node-memory checkpoint."], "output_dir": "outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset", "pair_feature_adapter_variant": "residual_pair_feature_adapter_v1", "query_counts": {"test": 54, "train": 439, "val": 55}, "question_encoder_variant": "hashed_tmcra_token_attn_pool_6l_w256", "question_intent_variant": "text_pooled_multihead_v1", "question_max_tokens": 48, "resume": {"resume_checkpoint": "<tmcra-service-root>/models/tmcra_node_memory_v11_priorfusion_zeroload_20260517/node_scorer.pt", "resume_mode": "weights_only_forced", "resume_weights_only": true}, "sample_count": 548, "sample_id_prefix": "actionframe548_", "skipped_count": 0, "source_data_dirs": ["outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset"], "source_manifests": ["outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset/export_manifest.json"], "split_policy": {"test_ratio": 0.09999999999999995, "train_ratio": 0.8, "val_ratio": 0.1}, "status": "completed", 
"trainable_stage": {"frozen_parameter_count": 47963050, "trainable_name_prefixes": ["event_tunnel_head", "final_event_fusion_head", "final_path_fusion_head", "path_tunnel_head"], "trainable_parameter_count": 3831302, "trainable_stage": "tunnel_fusion_only"}, "training_sampling": {"answer_type_counts": {"multi_evidence": 422, "time": 17}, "base_row_count": 439, "effective_row_count": 697, "loss_weight_range_estimate": {"max": 1.549193, "min": 1.319091}, "mode": "source_aware_balanced", "repeat_histogram": {"1": 118, "2": 144}, "source_count": 1, "source_details": {"dataset": {"avg_example_weight_estimate": 1.540283, "loss_source_factor": 1.0, "row_count": 439, "row_share": 1.0, "sampled_row_count_estimate": 697, "sampled_row_share_estimate": 1.0, "sampling_source_factor": 1.0, "weighted_loss_mass": 676.184129}}, "source_row_counts": {"dataset": 439}, "supervision_bucket_counts": {"typed_tunnel_multi_positive": 439}, "weighted_answer_mass_estimate": {"multi_evidence": 653.759589, "time": 22.42454}, "weighted_supervision_bucket_mass_estimate": {"typed_tunnel_multi_positive": 680.095876}}, "typed_tunnel_edge_count": 3855}

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/launch_train.sh ADDED Viewed

	@@ -0,0 +1,48 @@

+#!/usr/bin/env bash
+set -euo pipefail
+cd <tmcra-repo-root>
+DATA=outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset
+OUT="$1"
+<tmcra-service-root>/.venv/bin/python scripts/train_locomo_node_memory.py \
+  --data-dir "$DATA" \
+  --output-dir "$OUT" \
+  --resume-checkpoint <tmcra-service-root>/models/tmcra_node_memory_v11_priorfusion_zeroload_20260517/node_scorer.pt \
+  --resume-weights-only \
+  --trainable-stage tunnel_fusion_only \
+  --epochs 8 \
+  --batch-size 8 \
+  --lr 8e-05 \
+  --fail-on-bad-query-rows \
+  --epoch-train-eval-max-rows 80 \
+  --epoch-val-eval-max-rows 32 \
+  --graph-memory-cache-size 64 \
+  --lazy-prefetch-workers 6 \
+  --lazy-prefetch-window 24 \
+  --batch-prepare-workers 16 \
+  --batch-prepare-lookahead-batches 64 \
+  --graph-prefetch-lookahead-batches 3 \
+  --reserve-cpu-cores 2 \
+  --torch-cpu-threads 4 \
+  --torch-interop-threads 4 \
+  --log-every-steps 20 \
+  --checkpoint-every-steps 50 \
+  --keep-step-checkpoints 2 \
+  --train-sampling-mode source_aware_balanced \
+  --loss-group-balancing-mode supervision_bucket \
+  --loss-source-alpha 0.4 \
+  --loss-blend-uniform-ratio 0.25 \
+  --loss-weight-power 0.5 \
+  --loss-time-boost 1.45 \
+  --loss-multi-evidence-boost 2.0 \
+  --loss-temporal-positive-boost 1.2 \
+  --loss-min-example-weight 0.65 \
+  --loss-max-example-weight 1.85 \
+  --l2sp-loss-weight 0.02 \
+  --event-selection-positive-coverage-count 3 \
+  --path-selection-positive-coverage-count 3 \
+  --multi-positive-coverage-fraction 0.7 \
+  --multi-positive-recall-coverage-count 6 \
+  --multi-positive-event-coverage-count 5 \
+  --multi-positive-path-coverage-count 3 \
+  --multi-positive-final-event-set-coverage-count 5 \
+  > "$OUT/train.log" 2>&1

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fb4e0d3f51143ec2822315721f76d2ed636218ae3c6edd488d7fa53774193739
+size 238028279

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_best.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fb4e0d3f51143ec2822315721f76d2ed636218ae3c6edd488d7fa53774193739
+size 238028279

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d7aeba343e330c918956dd53cca9104c580c8d45051ae108608fff0404af5b7c
+size 238028279

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/nohup.log ADDED Viewed

File without changes

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b6cc3e720625ccbfb9ad7f322cd5e23c4b7b87d31cbb780c30d94ca7d2573c80
+size 238028279

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_best.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b6cc3e720625ccbfb9ad7f322cd5e23c4b7b87d31cbb780c30d94ca7d2573c80
+size 238028279

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4ec5a311d5a1fcf78436310c846f03bc4b70491a7a5b68a065cd485298fbc63e
+size 238028279

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/train.log ADDED Viewed

	@@ -0,0 +1,84 @@

+[train_locomo_node_memory] 2026-05-24T04:25:59 gc_process_disabled reason='python_3_12_runtime_stability' scope='cli_process'
+[train_locomo_node_memory] 2026-05-24T04:25:59 query_split_loaded elapsed_seconds=0.022 rows=439 source_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset' split='train'
+[train_locomo_node_memory] 2026-05-24T04:25:59 query_split_loaded elapsed_seconds=0.003 rows=55 source_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset' split='val'
+[train_locomo_node_memory] 2026-05-24T04:25:59 query_split_loaded elapsed_seconds=0.003 rows=54 source_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset' split='test'
+[train_locomo_node_memory] 2026-05-24T04:25:59 graph_source_index_started cache_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset/graph_tensor_cache_v2' cache_write_enabled=True prebuild_skipped_for_warm_cache=False requested_conversation_ids=333 require_cache_hit=False source_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset'
+[train_locomo_node_memory] 2026-05-24T04:25:59 graph_source_index_ready graph_count=333 source_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset' test_rows=54 train_rows=439 val_rows=55
+[train_locomo_node_memory] 2026-05-24T04:26:00 resume_training_state_skipped current_batch_size=8 reason='resume_weights_only' resume_checkpoint='<tmcra-service-root>/models/tmcra_node_memory_v11_priorfusion_zeroload_20260517/node_scorer.pt' saved_batch_size=8
+[train_locomo_node_memory] 2026-05-24T04:26:00 resume_checkpoint_loaded resume_checkpoint='<tmcra-service-root>/models/tmcra_node_memory_v11_priorfusion_zeroload_20260517/node_scorer.pt' resume_mode='weights_only_forced' saved_head='event' state_key_count=448
+[train_locomo_node_memory] 2026-05-24T04:26:00 training_started batch_prepare_lookahead_batches=64 batch_prepare_workers=16 batch_size=8 checkpoint_every_steps=50 cpu_count=36 data_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset' device='cuda' epoch_train_eval=True epoch_train_eval_max_rows=80 epoch_val_eval=True epoch_val_eval_max_rows=32 epochs=8 gc_mode='disabled_during_training' graph_cache_device='cpu' graph_cache_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset/graph_tensor_cache_v2' graph_count=333 graph_memory_cache_size=64 graph_prefetch_lookahead_batches=3 issue_log='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/training_issues.jsonl' lazy_prefetch_window=24 lazy_prefetch_workers=6 log_every_steps=20 max_train_steps=0 output_dir='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557' prebuild_issue_log='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/graph_cache_prebuild_errors.jsonl' reserve_cpu_cores=2 resume_checkpoint='<tmcra-service-root>/models/tmcra_node_memory_v11_priorfusion_zeroload_20260517/node_scorer.pt' resume_mode='weights_only_forced' resume_weights_only=True skip_bad_query_rows=False skip_final_eval=False test_row_limit=0 test_rows=54 torch_cpu_threads=4 torch_interop_threads=4 train_row_limit=0 train_rows=697 train_rows_before_limit=439 train_rows_effective=697 train_sampling_mode='source_aware_balanced' trainable_stage='tunnel_fusion_only' usable_cpu_cores=34 val_row_limit=0 val_rows=55 val_rows_before_limit=55
+[train_locomo_node_memory] 2026-05-24T04:26:00 training_sampling_ready base_row_count=439 effective_row_count=697 loss_weight_max=1.549193 loss_weight_min=1.319091 mode='source_aware_balanced' source_count=1
+[train_locomo_node_memory] 2026-05-24T04:26:31 batch_prepare_pool_event backend='process' event_type='batch_prepare_process_pool_ready' graph_error_stage='train_batch_prepare' worker_count=16
+[train_locomo_node_memory] 2026-05-24T04:26:33 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.061843 batch_graph_error_count=0 batch_l2sp_loss=0.0 batch_loss=7.617344379425049 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=1 epoch_step=1 global_step=1 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:26:49 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.064798 batch_graph_error_count=0 batch_l2sp_loss=7e-08 batch_loss=6.46565055847168 batch_loss_group_count=1 batch_path_recall_at_3=0.833333 epoch=1 epoch_step=20 global_step=20 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:27:05 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.086709 batch_graph_error_count=0 batch_l2sp_loss=1.03e-06 batch_loss=7.7209601402282715 batch_loss_group_count=1 batch_path_recall_at_3=0.875 epoch=1 epoch_step=40 global_step=40 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:27:15 step_checkpoint_saved epoch=1 epoch_step=50 global_step=50 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:27:22 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.126139 batch_graph_error_count=0 batch_l2sp_loss=3.54e-06 batch_loss=6.973794937133789 batch_loss_group_count=1 batch_path_recall_at_3=0.875 epoch=1 epoch_step=60 global_step=60 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:27:39 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.029379 batch_graph_error_count=0 batch_l2sp_loss=5.95e-06 batch_loss=6.7596940994262695 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=1 epoch_step=80 global_step=80 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:28:03 epoch_completed best_val_loss=4.024884 best_val_selection_score=116.975116 epoch=1 global_step=97 is_best=True train_event_recall_at_5=1.0 train_graph_error_count=0 train_loss=4.217114 train_path_recall_at_3=0.95 val_event_recall_at_5=1.0 val_graph_error_count=0 val_loss=4.024884 val_path_recall_at_3=0.96875 val_selection_score=116.975116
+[train_locomo_node_memory] 2026-05-24T04:28:06 checkpoint_saved epoch=1 global_step=97 is_best=True node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt'
+[train_locomo_node_memory] 2026-05-24T04:28:09 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.131766 batch_graph_error_count=0 batch_l2sp_loss=8.54e-06 batch_loss=6.195566654205322 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=2 epoch_step=3 global_step=100 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:28:11 step_checkpoint_saved epoch=2 epoch_step=3 global_step=100 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:28:30 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.003715 batch_graph_error_count=0 batch_l2sp_loss=1.191e-05 batch_loss=6.659519195556641 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=2 epoch_step=23 global_step=120 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:28:46 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.061323 batch_graph_error_count=0 batch_l2sp_loss=1.57e-05 batch_loss=5.569675445556641 batch_loss_group_count=1 batch_path_recall_at_3=0.833333 epoch=2 epoch_step=43 global_step=140 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:28:57 step_checkpoint_saved epoch=2 epoch_step=53 global_step=150 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:29:05 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.038297 batch_graph_error_count=0 batch_l2sp_loss=1.985e-05 batch_loss=7.193849563598633 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=2 epoch_step=63 global_step=160 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:29:21 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.046331 batch_graph_error_count=0 batch_l2sp_loss=2.431e-05 batch_loss=5.3383002281188965 batch_loss_group_count=1 batch_path_recall_at_3=0.875 epoch=2 epoch_step=83 global_step=180 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:29:40 epoch_completed best_val_loss=3.758544 best_val_selection_score=117.128901 epoch=2 global_step=191 is_best=True train_event_recall_at_5=1.0 train_graph_error_count=0 train_loss=3.925607 train_path_recall_at_3=0.9625 val_event_recall_at_5=1.0 val_graph_error_count=0 val_loss=3.758544 val_path_recall_at_3=0.96875 val_selection_score=117.128901
+[train_locomo_node_memory] 2026-05-24T04:29:44 checkpoint_saved epoch=2 global_step=191 is_best=True node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt'
+[train_locomo_node_memory] 2026-05-24T04:29:53 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.13841 batch_graph_error_count=0 batch_l2sp_loss=2.782e-05 batch_loss=6.168829917907715 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=3 epoch_step=9 global_step=200 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:29:56 step_checkpoint_saved epoch=3 epoch_step=9 global_step=200 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:30:12 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.050877 batch_graph_error_count=0 batch_l2sp_loss=3.105e-05 batch_loss=5.412087917327881 batch_loss_group_count=1 batch_path_recall_at_3=0.75 epoch=3 epoch_step=29 global_step=220 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:30:27 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.133808 batch_graph_error_count=0 batch_l2sp_loss=3.419e-05 batch_loss=7.1404523849487305 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=3 epoch_step=49 global_step=240 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:30:38 step_checkpoint_saved epoch=3 epoch_step=59 global_step=250 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:30:46 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.0 batch_graph_error_count=0 batch_l2sp_loss=3.686e-05 batch_loss=4.634302616119385 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=3 epoch_step=69 global_step=260 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:31:02 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.08086 batch_graph_error_count=0 batch_l2sp_loss=3.993e-05 batch_loss=6.546285629272461 batch_loss_group_count=1 batch_path_recall_at_3=0.875 epoch=3 epoch_step=89 global_step=280 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:31:17 epoch_completed best_val_loss=3.710867 best_val_selection_score=118.164133 epoch=3 global_step=286 is_best=True train_event_recall_at_5=1.0 train_graph_error_count=0 train_loss=3.793919 train_path_recall_at_3=0.9875 val_event_recall_at_5=1.0 val_graph_error_count=0 val_loss=3.710867 val_path_recall_at_3=1.0 val_selection_score=118.164133
+[train_locomo_node_memory] 2026-05-24T04:31:21 checkpoint_saved epoch=3 global_step=286 is_best=True node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt'
+[train_locomo_node_memory] 2026-05-24T04:31:34 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.0 batch_graph_error_count=0 batch_l2sp_loss=4.268e-05 batch_loss=6.9818620681762695 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=4 epoch_step=14 global_step=300 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:31:36 step_checkpoint_saved epoch=4 epoch_step=14 global_step=300 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:31:52 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.02366 batch_graph_error_count=0 batch_l2sp_loss=4.505e-05 batch_loss=6.924047470092773 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=4 epoch_step=34 global_step=320 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:32:09 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.065861 batch_graph_error_count=0 batch_l2sp_loss=4.73e-05 batch_loss=5.456366062164307 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=4 epoch_step=54 global_step=340 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:32:20 step_checkpoint_saved epoch=4 epoch_step=64 global_step=350 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:32:28 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.033157 batch_graph_error_count=0 batch_l2sp_loss=4.912e-05 batch_loss=7.173478603363037 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=4 epoch_step=74 global_step=360 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:32:44 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.0 batch_graph_error_count=0 batch_l2sp_loss=5.091e-05 batch_loss=4.523404121398926 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=4 epoch_step=94 global_step=380 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:32:54 epoch_completed best_val_loss=3.718748 best_val_selection_score=118.424107 epoch=4 global_step=380 is_best=True train_event_recall_at_5=1.0 train_graph_error_count=0 train_loss=3.824029 train_path_recall_at_3=0.9875 val_event_recall_at_5=1.0 val_graph_error_count=0 val_loss=3.718748 val_path_recall_at_3=1.0 val_selection_score=118.424107
+[train_locomo_node_memory] 2026-05-24T04:32:58 checkpoint_saved epoch=4 global_step=380 is_best=True node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt'
+[train_locomo_node_memory] 2026-05-24T04:33:16 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.0 batch_graph_error_count=0 batch_l2sp_loss=5.232e-05 batch_loss=6.215372085571289 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=5 epoch_step=20 global_step=400 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:33:18 step_checkpoint_saved epoch=5 epoch_step=20 global_step=400 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:33:35 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.02998 batch_graph_error_count=0 batch_l2sp_loss=5.362e-05 batch_loss=6.076883316040039 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=5 epoch_step=40 global_step=420 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:33:50 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.026733 batch_graph_error_count=0 batch_l2sp_loss=5.463e-05 batch_loss=4.687601566314697 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=5 epoch_step=60 global_step=440 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:34:00 step_checkpoint_saved epoch=5 epoch_step=70 global_step=450 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:34:08 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.017054 batch_graph_error_count=0 batch_l2sp_loss=5.565e-05 batch_loss=6.018680095672607 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=5 epoch_step=80 global_step=460 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:34:29 epoch_completed best_val_loss=3.718748 best_val_selection_score=118.424107 epoch=5 global_step=474 is_best=False train_event_recall_at_5=1.0 train_graph_error_count=0 train_loss=3.451428 train_path_recall_at_3=0.975 val_event_recall_at_5=1.0 val_graph_error_count=0 val_loss=3.512776 val_path_recall_at_3=1.0 val_selection_score=118.487224
+[train_locomo_node_memory] 2026-05-24T04:34:32 checkpoint_saved epoch=5 global_step=474 is_best=False node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt'
+[train_locomo_node_memory] 2026-05-24T04:34:38 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.011568 batch_graph_error_count=0 batch_l2sp_loss=5.656e-05 batch_loss=4.576869964599609 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=6 epoch_step=6 global_step=480 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:34:54 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.061455 batch_graph_error_count=0 batch_l2sp_loss=5.735e-05 batch_loss=4.150051116943359 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=6 epoch_step=26 global_step=500 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:34:57 step_checkpoint_saved epoch=6 epoch_step=26 global_step=500 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:35:12 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.000704 batch_graph_error_count=0 batch_l2sp_loss=5.803e-05 batch_loss=5.257463455200195 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=6 epoch_step=46 global_step=520 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:35:28 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.0 batch_graph_error_count=0 batch_l2sp_loss=5.871e-05 batch_loss=5.609147071838379 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=6 epoch_step=66 global_step=540 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:35:39 step_checkpoint_saved epoch=6 epoch_step=76 global_step=550 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:35:46 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.284378 batch_graph_error_count=0 batch_l2sp_loss=5.923e-05 batch_loss=6.41322135925293 batch_loss_group_count=1 batch_path_recall_at_3=0.857143 epoch=6 epoch_step=86 global_step=560 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:36:03 epoch_completed best_val_loss=3.718748 best_val_selection_score=118.424107 epoch=6 global_step=569 is_best=False train_event_recall_at_5=1.0 train_graph_error_count=0 train_loss=3.54849 train_path_recall_at_3=0.95 val_event_recall_at_5=1.0 val_graph_error_count=0 val_loss=3.330123 val_path_recall_at_3=1.0 val_selection_score=117.669877
+[train_locomo_node_memory] 2026-05-24T04:36:05 checkpoint_saved epoch=6 global_step=569 is_best=False node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt'
+[train_locomo_node_memory] 2026-05-24T04:36:16 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.051387 batch_graph_error_count=0 batch_l2sp_loss=5.974e-05 batch_loss=5.32041072845459 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=7 epoch_step=11 global_step=580 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:36:32 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.042565 batch_graph_error_count=0 batch_l2sp_loss=6.013e-05 batch_loss=5.378664970397949 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=7 epoch_step=31 global_step=600 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:36:35 step_checkpoint_saved epoch=7 epoch_step=31 global_step=600 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:36:50 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.023837 batch_graph_error_count=0 batch_l2sp_loss=6.043e-05 batch_loss=4.947947978973389 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=7 epoch_step=51 global_step=620 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:37:05 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.0 batch_graph_error_count=0 batch_l2sp_loss=6.069e-05 batch_loss=5.096177101135254 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=7 epoch_step=71 global_step=640 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:37:16 step_checkpoint_saved epoch=7 epoch_step=81 global_step=650 node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt' total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:37:24 step_completed batch_event_recall_at_5=1.0 batch_final_event_set_loss=0.008784 batch_graph_error_count=0 batch_l2sp_loss=6.089e-05 batch_loss=6.337571144104004 batch_loss_group_count=1 batch_path_recall_at_3=1.0 epoch=7 epoch_step=91 global_step=660 total_steps=704
+[train_locomo_node_memory] 2026-05-24T04:37:37 epoch_completed best_val_loss=3.718748 best_val_selection_score=118.424107 epoch=7 global_step=665 is_best=False train_event_recall_at_5=1.0 train_graph_error_count=0 train_loss=3.466264 train_path_recall_at_3=1.0 val_event_recall_at_5=1.0 val_graph_error_count=0 val_loss=3.604544 val_path_recall_at_3=1.0 val_selection_score=117.895456
+[train_locomo_node_memory] 2026-05-24T04:37:40 checkpoint_saved epoch=7 global_step=665 is_best=False node_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt' path_model_last='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt'
+[train_locomo_node_memory] 2026-05-24T04:37:41 final_eval_started batch_size=8 rows=439 split='train'
+[train_locomo_node_memory] 2026-05-24T04:37:41 lazy_graph_cache_progress disk_cache_hits=1 graph_count=333 known_graph_count=333 last_conversation_id='actionframe548_typed_tunnel_v1_50' loaded_from='disk_cache' loads=1 memory_cache_hits=0 memory_cache_size=1 prefetch_completed=1 prefetch_pending=11 prefetch_submitted=12 source_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset' tensorized=0
+[train_locomo_node_memory] 2026-05-24T04:38:34 final_eval_completed event_recall_at_5=0.997722 path_recall_at_3=0.98861 rows=439 split='train' temporal_accuracy=0.235294
+[train_locomo_node_memory] 2026-05-24T04:38:34 final_eval_started batch_size=8 rows=55 split='val'
+[train_locomo_node_memory] 2026-05-24T04:38:42 final_eval_completed event_recall_at_5=1.0 path_recall_at_3=1.0 rows=55 split='val' temporal_accuracy=0.388889
+[train_locomo_node_memory] 2026-05-24T04:38:42 query_split_load_deferred_started source_count=1 split='test'
+[train_locomo_node_memory] 2026-05-24T04:38:42 query_split_loaded elapsed_seconds=0.016 rows=54 source_dir='outputs/action_frame_tunnel_graph_dataset_548_event_units_20260524/dataset' split='test'
+[train_locomo_node_memory] 2026-05-24T04:38:42 query_split_load_deferred_completed rows=54 split='test'
+[train_locomo_node_memory] 2026-05-24T04:38:42 final_eval_started batch_size=8 rows=54 split='test'
+[train_locomo_node_memory] 2026-05-24T04:38:47 final_eval_completed event_recall_at_5=1.0 path_recall_at_3=1.0 rows=54 split='test' temporal_accuracy=0.434783
+[train_locomo_node_memory] 2026-05-24T04:38:50 unified_checkpoint_audit_passed node_missing_prefixes='' path_missing_prefixes='' required_prefix_count=9
+[train_locomo_node_memory] 2026-05-24T04:38:50 training_completed best_val_loss=3.718748 elapsed_seconds=770.323 issue_count=1 node_model_path='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer.pt' path_model_path='outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer.pt' promoted_checkpoint_source='best' test_event_recall_at_5=1.0 test_path_recall_at_3=1.0 val_event_recall_at_5=1.0 val_path_recall_at_3=1.0

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/train.pid ADDED Viewed

	@@ -0,0 +1 @@


1	+ 826216

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/train_summary.json ADDED Viewed

The diff for this file is too large to render. See raw diff

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/train_summary.partial.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"answer_calibration_variant": "competition_alignment_v3_trimaze", "answer_plan_variant": "candidate_event_answer_plan_v1", "artifacts": {"checkpoints_dir": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/checkpoints", "node_model_best_path": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_best.pt", "node_model_last_path": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt", "path_model_best_path": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_best.pt", "path_model_last_path": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt"}, "best_val_loss": 3.718748, "best_val_selection_score": 118.424107, "config": {"amp": true, "answer_calibration_variant": "competition_alignment_v3_trimaze", "answer_plan_current_old_margin": 0.2, "answer_plan_current_old_margin_loss_weight": 0.0, "answer_plan_loss_weight": 0.18, "answer_plan_selected_negative_margin": 0.3, "answer_plan_selected_negative_margin_loss_weight": 0.0, "answer_plan_selection_margin": 0.12, "answer_plan_selection_top_k": 5, "answer_plan_variant": "candidate_event_answer_plan_v1", "answer_refusal_loss_weight": 0.08, "answer_refusal_margin": 0.15, "answer_type_loss_weight": 0.2, "batch_prepare_lookahead_batches": 64, "batch_prepare_workers": 16, "batch_size": 8, "checkpoint_every_steps": 50, "early_stopping_patience": 3, "encoder_attention_heads": 8, "encoder_attention_layers": 6, "encoder_structural_bias_variant": "tmcra_token_role_relbias_v1", "epoch_train_eval_enabled": true, "epoch_train_eval_max_rows": 80, "epoch_val_eval_enabled": true, "epoch_val_eval_max_rows": 32, "epochs": 8, "event_distractor_loss_weight": 0.15, "event_distractor_variant": "candidate_distractor_delta_bce_v1", "event_hard_negative_loss_weight": 0.35, "event_hard_negative_margin": 0.35, "event_loss_weight": 1.1, "event_matrix_delta_loss_weight": 0.08, 
"event_matrix_delta_margin": 0.15, "event_selection_loss_weight": 0.35, "event_selection_margin": 0.2, "event_selection_positive_coverage_count": 3, "event_selection_top_k": 5, "event_subgraph_attention_heads": 8, "event_subgraph_attention_layers": 6, "event_subgraph_refiner_variant": "event_centered_subgraph_attn_6l_h448", "event_tunnel_loss_weight": 0.12, "event_tunnel_margin": 0.16, "event_tunnel_selection_loss_weight": 0.08, "final_event_set_loss_weight": 0.35, "final_event_set_margin": 0.08, "final_event_set_positive_coverage_count": 3, "final_event_set_support_path_k": 3, "final_event_set_top_k": 10, "grad_clip": 1.0, "graph_prefetch_lookahead_batches": 3, "l2sp_loss_weight": 0.02, "log_every_steps": 20, "loss_blend_uniform_ratio": 0.25, "loss_group_balancing_mode": "supervision_bucket", "loss_max_example_weight": 1.85, "loss_min_example_weight": 0.65, "loss_multi_evidence_boost": 2.0, "loss_source_alpha": 0.4, "loss_temporal_positive_boost": 1.2, "loss_time_boost": 1.45, "loss_weight_power": 0.5, "lr": 8e-05, "max_train_steps": 0, "memory_router_loss_weight": 0.18, "memory_router_variant": "question_pooled_multilabel_v1", "memory_tunnel_variant": "chain_depth_tunnel_support_delta_v1", "message_passing_variant": "support_to_event_bidirectional_shared_typed_v1", "model_arch_version": "node_memory_v11_tmcra_scale49m_trimaze", "multi_positive_coverage_fraction": 0.7, "multi_positive_event_coverage_count": 5, "multi_positive_final_event_set_coverage_count": 5, "multi_positive_path_coverage_count": 3, "multi_positive_recall_coverage_count": 6, "node_encoder_variant": "hashed_tmcra_token_attn_pool_6l_w256", "node_max_tokens": 32, "online_event_hard_negative_limit": 6, "online_path_hard_negative_limit": 8, "pair_feature_adapter_variant": "residual_pair_feature_adapter_v1", "path_hard_negative_loss_weight": 0.25, "path_hard_negative_margin": 0.28, "path_loss_weight": 0.55, "path_matrix_delta_loss_weight": 0.08, "path_matrix_delta_margin": 0.12, 
"path_selection_loss_weight": 0.3, "path_selection_margin": 0.15, "path_selection_positive_coverage_count": 3, "path_selection_top_k": 3, "path_tunnel_delta_loss_weight": 0.12, "path_tunnel_loss_weight": 0.06, "path_tunnel_margin": 0.16, "path_tunnel_selection_loss_weight": 0.04, "question_encoder_variant": "hashed_tmcra_token_attn_pool_6l_w256", "question_intent_variant": "text_pooled_multihead_v1", "question_max_tokens": 48, "question_understanding_loss_weight": 0.12, "recall_loss_weight": 0.6, "recall_selection_loss_weight": 0.3, "recall_selection_margin": 0.12, "recall_selection_positive_coverage_count": 3, "recall_selection_top_k": 24, "sampling_blend_uniform_ratio": 0.35, "sampling_max_conversation_multiplier": 2.0, "sampling_max_group_repeat": 2, "sampling_multi_evidence_boost": 1.45, "sampling_source_alpha": 0.35, "sampling_temporal_positive_boost": 1.15, "sampling_time_boost": 1.2, "temporal_loss_weight": 0.3, "token_role_loss_weight": 0.08, "train_sampling_mode": "source_aware_balanced", "trainable_stage": "tunnel_fusion_only", "warmup_ratio": 0.06, "weight_decay": 0.01}, "device": "cuda", "encoder_attention_heads": 8, "encoder_attention_layers": 6, "encoder_structural_bias_variant": "tmcra_token_role_relbias_v1", "epoch": 7, "event_distractor_variant": "candidate_distractor_delta_bce_v1", "event_subgraph_attention_heads": 8, "event_subgraph_attention_layers": 6, "event_subgraph_refiner_variant": "event_centered_subgraph_attn_6l_h448", "final_eval_progress": {"completed_splits": ["train", "val", "test"], "test_metrics": {"answer_plan_current_top1_accuracy": 0.296296, "answer_plan_current_total": 54, "answer_plan_loss": 0.519994, "answer_plan_loss_count": 54, "answer_plan_selected_positive_coverage_at_5": 0.778761, "answer_plan_selected_positive_total": 113, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 54, "answer_refusal_loss": 0.0, "answer_refusal_loss_count": 54, "answer_type_loss": 3.65495, "answer_type_loss_count": 54, 
"answer_type_metrics": {"multi_evidence": {"answer_plan_current_top1_accuracy": 0.129032, "answer_plan_current_total": 31, "answer_plan_selected_positive_coverage_at_5": 0.776119, "answer_plan_selected_positive_total": 67, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 31, "event5_positive_total": 67, "event_positive_coverage_at_5": 0.940298, "event_recall_at_1": 0.903226, "event_recall_at_5": 1.0, "event_recall_total": 31, "path3_positive_total": 93, "path_positive_coverage_at_3": 0.849462, "path_recall_at_3": 1.0, "path_recall_total": 31, "recall_event24_positive_total": 67, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 31, "samples": 31, "temporal_accuracy": 0.0, "temporal_total": 0}, "time": {"answer_plan_current_top1_accuracy": 0.521739, "answer_plan_current_total": 23, "answer_plan_selected_positive_coverage_at_5": 0.782609, "answer_plan_selected_positive_total": 46, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 23, "event5_positive_total": 46, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.913044, "event_recall_at_5": 1.0, "event_recall_total": 23, "path3_positive_total": 69, "path_positive_coverage_at_3": 0.855072, "path_recall_at_3": 1.0, "path_recall_total": 23, "recall_event24_positive_total": 46, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 23, "samples": 23, "temporal_accuracy": 0.434783, "temporal_total": 23}}, "evaluation_limited": false, "event5_positive_total": 113, "event_distractor_loss": 0.652414, "event_distractor_loss_count": 54, "event_hard_negative_loss": 0.095313, "event_hard_negative_loss_count": 54, "event_loss": 0.388887, "event_loss_count": 54, "event_matrix_delta_loss": 0.121482, "event_matrix_delta_loss_count": 54, "event_positive_coverage_at_5": 0.964602, "event_recall_at_1": 0.907407, "event_recall_at_5": 1.0, "event_recall_total": 54, 
"event_selection_loss": 0.033558, "event_selection_loss_count": 54, "event_tunnel_loss": 0.257923, "event_tunnel_loss_count": 54, "event_tunnel_selection_loss": 0.0, "event_tunnel_selection_loss_count": 54, "final_event_set_loss": 0.006052, "final_event_set_loss_count": 54, "graph_error_count": 0, "loss": 3.097328, "loss_group_count": 7, "memory_router_exact_match": 0.851852, "memory_router_f1": 0.962963, "memory_router_loss": 0.407471, "memory_router_loss_count": 54, "memory_router_total": 54, "path3_positive_total": 162, "path_hard_negative_loss": 0.211284, "path_hard_negative_loss_count": 54, "path_loss": 0.429968, "path_loss_count": 54, "path_matrix_delta_loss": 0.494685, "path_matrix_delta_loss_count": 54, "path_positive_coverage_at_3": 0.851852, "path_recall_at_3": 1.0, "path_recall_total": 54, "path_selection_loss": 0.245353, "path_selection_loss_count": 54, "path_tunnel_delta_loss": 0.720266, "path_tunnel_delta_loss_count": 54, "path_tunnel_delta_positive_coverage_at_3": 0.487654, "path_tunnel_delta_positive_total": 162, "path_tunnel_delta_recall_at_3": 0.87037, "path_tunnel_delta_recall_total": 54, "path_tunnel_loss": 0.348827, "path_tunnel_loss_count": 54, "path_tunnel_rescue025_recall_at_3": 1.0, "path_tunnel_rescue025_recall_total": 54, "path_tunnel_rescue050_recall_at_3": 0.981481, "path_tunnel_rescue050_recall_total": 54, "path_tunnel_rescue100_recall_at_3": 0.981481, "path_tunnel_rescue100_recall_total": 54, "path_tunnel_selection_loss": 0.168474, "path_tunnel_selection_loss_count": 54, "path_tunnel_support_positive_coverage_at_3": 0.777778, "path_tunnel_support_positive_total": 162, "path_tunnel_support_recall_at_3": 0.981481, "path_tunnel_support_recall_total": 54, "question_understanding_loss": 3.784985, "question_understanding_loss_count": 54, "recall_event24_positive_total": 113, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 54, "recall_loss": 0.925862, "recall_loss_count": 54, 
"recall_selection_loss": 0.0, "recall_selection_loss_count": 54, "rows_evaluated": 54, "rows_total": 54, "samples": 54, "supervision_bucket_metrics": {"typed_tunnel_multi_positive": {"answer_plan_current_top1_accuracy": 0.296296, "answer_plan_current_total": 54, "answer_plan_selected_positive_coverage_at_5": 0.778761, "answer_plan_selected_positive_total": 113, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 54, "event5_positive_total": 113, "event_positive_coverage_at_5": 0.964602, "event_recall_at_1": 0.907407, "event_recall_at_5": 1.0, "event_recall_total": 54, "path3_positive_total": 162, "path_positive_coverage_at_3": 0.851852, "path_recall_at_3": 1.0, "path_recall_total": 54, "recall_event24_positive_total": 113, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 54, "samples": 54, "temporal_accuracy": 0.434783, "temporal_total": 23}}, "temporal_accuracy": 0.434783, "temporal_loss": 0.466158, "temporal_loss_count": 23, "temporal_total": 23, "token_role_loss": 0.0, "token_role_loss_count": 7, "training_weight_max": 1.35, "training_weight_mean": 1.35, "training_weight_min": 1.35}, "train_metrics": {"answer_plan_current_top1_accuracy": 0.418764, "answer_plan_current_total": 437, "answer_plan_loss": 0.551131, "answer_plan_loss_count": 439, "answer_plan_selected_positive_coverage_at_5": 0.794829, "answer_plan_selected_positive_total": 1199, "answer_plan_selected_recall_at_5": 0.970387, "answer_plan_selected_total": 439, "answer_refusal_loss": 0.0, "answer_refusal_loss_count": 439, "answer_type_loss": 5.186379, "answer_type_loss_count": 439, "answer_type_metrics": {"multi_evidence": {"answer_plan_current_top1_accuracy": 0.428571, "answer_plan_current_total": 420, "answer_plan_selected_positive_coverage_at_5": 0.796217, "answer_plan_selected_positive_total": 1163, "answer_plan_selected_recall_at_5": 0.971564, "answer_plan_selected_total": 422, "event5_positive_total": 1163, 
"event_positive_coverage_at_5": 0.986242, "event_recall_at_1": 0.845972, "event_recall_at_5": 0.99763, "event_recall_total": 422, "path3_positive_total": 1266, "path_positive_coverage_at_3": 0.888626, "path_recall_at_3": 0.990521, "path_recall_total": 422, "recall_event24_positive_total": 1163, "recall_event_positive_coverage_at_24": 0.993121, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 422, "samples": 422, "temporal_accuracy": 0.0, "temporal_total": 0}, "time": {"answer_plan_current_top1_accuracy": 0.176471, "answer_plan_current_total": 17, "answer_plan_selected_positive_coverage_at_5": 0.75, "answer_plan_selected_positive_total": 36, "answer_plan_selected_recall_at_5": 0.941176, "answer_plan_selected_total": 17, "event5_positive_total": 36, "event_positive_coverage_at_5": 0.944444, "event_recall_at_1": 0.647059, "event_recall_at_5": 1.0, "event_recall_total": 17, "path3_positive_total": 51, "path_positive_coverage_at_3": 0.784314, "path_recall_at_3": 0.941176, "path_recall_total": 17, "recall_event24_positive_total": 36, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 17, "samples": 17, "temporal_accuracy": 0.235294, "temporal_total": 17}}, "evaluation_limited": false, "event5_positive_total": 1199, "event_distractor_loss": 0.671777, "event_distractor_loss_count": 439, "event_hard_negative_loss": 0.090675, "event_hard_negative_loss_count": 439, "event_loss": 0.352816, "event_loss_count": 439, "event_matrix_delta_loss": 0.134491, "event_matrix_delta_loss_count": 439, "event_positive_coverage_at_5": 0.984987, "event_recall_at_1": 0.838269, "event_recall_at_5": 0.997722, "event_recall_total": 439, "event_selection_loss": 0.011378, "event_selection_loss_count": 439, "event_tunnel_loss": 0.235026, "event_tunnel_loss_count": 439, "event_tunnel_selection_loss": 0.0, "event_tunnel_selection_loss_count": 439, "final_event_set_loss": 0.023022, "final_event_set_loss_count": 439, 
"graph_error_count": 0, "loss": 3.572644, "loss_group_count": 60, "memory_router_exact_match": 0.366743, "memory_router_f1": 0.845193, "memory_router_loss": 1.455285, "memory_router_loss_count": 439, "memory_router_total": 439, "path3_positive_total": 1317, "path_hard_negative_loss": 0.128464, "path_hard_negative_loss_count": 439, "path_loss": 0.392987, "path_loss_count": 439, "path_matrix_delta_loss": 0.489592, "path_matrix_delta_loss_count": 439, "path_positive_coverage_at_3": 0.884586, "path_recall_at_3": 0.98861, "path_recall_total": 439, "path_selection_loss": 0.139808, "path_selection_loss_count": 439, "path_tunnel_delta_loss": 0.629687, "path_tunnel_delta_loss_count": 439, "path_tunnel_delta_positive_coverage_at_3": 0.458618, "path_tunnel_delta_positive_total": 1317, "path_tunnel_delta_recall_at_3": 0.788155, "path_tunnel_delta_recall_total": 439, "path_tunnel_loss": 0.352045, "path_tunnel_loss_count": 439, "path_tunnel_rescue025_recall_at_3": 0.98861, "path_tunnel_rescue025_recall_total": 439, "path_tunnel_rescue050_recall_at_3": 0.981777, "path_tunnel_rescue050_recall_total": 439, "path_tunnel_rescue100_recall_at_3": 0.977221, "path_tunnel_rescue100_recall_total": 439, "path_tunnel_selection_loss": 0.15888, "path_tunnel_selection_loss_count": 439, "path_tunnel_support_positive_coverage_at_3": 0.800304, "path_tunnel_support_positive_total": 1317, "path_tunnel_support_recall_at_3": 0.938497, "path_tunnel_support_recall_total": 439, "question_understanding_loss": 5.304419, "question_understanding_loss_count": 439, "recall_event24_positive_total": 1199, "recall_event_positive_coverage_at_24": 0.993328, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 439, "recall_loss": 0.878647, "recall_loss_count": 439, "recall_selection_loss": 0.0, "recall_selection_loss_count": 439, "rows_evaluated": 439, "rows_total": 439, "samples": 439, "supervision_bucket_metrics": {"typed_tunnel_multi_positive": {"answer_plan_current_top1_accuracy": 0.418764, 
"answer_plan_current_total": 437, "answer_plan_selected_positive_coverage_at_5": 0.794829, "answer_plan_selected_positive_total": 1199, "answer_plan_selected_recall_at_5": 0.970387, "answer_plan_selected_total": 439, "event5_positive_total": 1199, "event_positive_coverage_at_5": 0.984987, "event_recall_at_1": 0.838269, "event_recall_at_5": 0.997722, "event_recall_total": 439, "path3_positive_total": 1317, "path_positive_coverage_at_3": 0.884586, "path_recall_at_3": 0.98861, "path_recall_total": 439, "recall_event24_positive_total": 1199, "recall_event_positive_coverage_at_24": 0.993328, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 439, "samples": 439, "temporal_accuracy": 0.235294, "temporal_total": 17}}, "temporal_accuracy": 0.235294, "temporal_loss": 0.558015, "temporal_loss_count": 17, "temporal_total": 17, "token_role_loss": 0.0, "token_role_loss_count": 60, "training_weight_max": 1.35, "training_weight_mean": 1.35, "training_weight_min": 1.35}, "val_metrics": {"answer_plan_current_top1_accuracy": 0.490909, "answer_plan_current_total": 55, "answer_plan_loss": 0.528866, "answer_plan_loss_count": 55, "answer_plan_selected_positive_coverage_at_5": 0.843284, "answer_plan_selected_positive_total": 134, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 55, "answer_refusal_loss": 0.0, "answer_refusal_loss_count": 55, "answer_type_loss": 5.884438, "answer_type_loss_count": 55, "answer_type_metrics": {"multi_evidence": {"answer_plan_current_top1_accuracy": 0.486486, "answer_plan_current_total": 37, "answer_plan_selected_positive_coverage_at_5": 0.816327, "answer_plan_selected_positive_total": 98, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 37, "event5_positive_total": 98, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.918919, "event_recall_at_5": 1.0, "event_recall_total": 37, "path3_positive_total": 111, "path_positive_coverage_at_3": 0.900901, "path_recall_at_3": 1.0, "path_recall_total": 
37, "recall_event24_positive_total": 98, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 37, "samples": 37, "temporal_accuracy": 0.0, "temporal_total": 0}, "time": {"answer_plan_current_top1_accuracy": 0.5, "answer_plan_current_total": 18, "answer_plan_selected_positive_coverage_at_5": 0.916666, "answer_plan_selected_positive_total": 36, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 18, "event5_positive_total": 36, "event_positive_coverage_at_5": 0.972222, "event_recall_at_1": 0.833333, "event_recall_at_5": 1.0, "event_recall_total": 18, "path3_positive_total": 54, "path_positive_coverage_at_3": 0.740741, "path_recall_at_3": 1.0, "path_recall_total": 18, "recall_event24_positive_total": 36, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 18, "samples": 18, "temporal_accuracy": 0.388889, "temporal_total": 18}}, "evaluation_limited": false, "event5_positive_total": 134, "event_distractor_loss": 0.642121, "event_distractor_loss_count": 55, "event_hard_negative_loss": 0.068247, "event_hard_negative_loss_count": 55, "event_loss": 0.364887, "event_loss_count": 55, "event_matrix_delta_loss": 0.122631, "event_matrix_delta_loss_count": 55, "event_positive_coverage_at_5": 0.992537, "event_recall_at_1": 0.890909, "event_recall_at_5": 1.0, "event_recall_total": 55, "event_selection_loss": 0.009935, "event_selection_loss_count": 55, "event_tunnel_loss": 0.240878, "event_tunnel_loss_count": 55, "event_tunnel_selection_loss": 0.0, "event_tunnel_selection_loss_count": 55, "final_event_set_loss": 0.014737, "final_event_set_loss_count": 55, "graph_error_count": 0, "loss": 3.474828, "loss_group_count": 7, "memory_router_exact_match": 0.381818, "memory_router_f1": 0.921212, "memory_router_loss": 0.550699, "memory_router_loss_count": 55, "memory_router_total": 55, "path3_positive_total": 165, "path_hard_negative_loss": 0.235719, 
"path_hard_negative_loss_count": 55, "path_loss": 0.424168, "path_loss_count": 55, "path_matrix_delta_loss": 0.554701, "path_matrix_delta_loss_count": 55, "path_positive_coverage_at_3": 0.848485, "path_recall_at_3": 1.0, "path_recall_total": 55, "path_selection_loss": 0.204065, "path_selection_loss_count": 55, "path_tunnel_delta_loss": 0.663551, "path_tunnel_delta_loss_count": 55, "path_tunnel_delta_positive_coverage_at_3": 0.630303, "path_tunnel_delta_positive_total": 165, "path_tunnel_delta_recall_at_3": 0.854546, "path_tunnel_delta_recall_total": 55, "path_tunnel_loss": 0.364049, "path_tunnel_loss_count": 55, "path_tunnel_rescue025_recall_at_3": 1.0, "path_tunnel_rescue025_recall_total": 55, "path_tunnel_rescue050_recall_at_3": 1.0, "path_tunnel_rescue050_recall_total": 55, "path_tunnel_rescue100_recall_at_3": 1.0, "path_tunnel_rescue100_recall_total": 55, "path_tunnel_selection_loss": 0.172115, "path_tunnel_selection_loss_count": 55, "path_tunnel_support_positive_coverage_at_3": 0.812121, "path_tunnel_support_positive_total": 165, "path_tunnel_support_recall_at_3": 0.981818, "path_tunnel_support_recall_total": 55, "question_understanding_loss": 3.861762, "question_understanding_loss_count": 55, "recall_event24_positive_total": 134, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 55, "recall_loss": 0.859693, "recall_loss_count": 55, "recall_selection_loss": 0.0, "recall_selection_loss_count": 55, "rows_evaluated": 55, "rows_total": 55, "samples": 55, "supervision_bucket_metrics": {"typed_tunnel_multi_positive": {"answer_plan_current_top1_accuracy": 0.490909, "answer_plan_current_total": 55, "answer_plan_selected_positive_coverage_at_5": 0.843284, "answer_plan_selected_positive_total": 134, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 55, "event5_positive_total": 134, "event_positive_coverage_at_5": 0.992537, "event_recall_at_1": 0.890909, "event_recall_at_5": 1.0, 
"event_recall_total": 55, "path3_positive_total": 165, "path_positive_coverage_at_3": 0.848485, "path_recall_at_3": 1.0, "path_recall_total": 55, "recall_event24_positive_total": 134, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 55, "samples": 55, "temporal_accuracy": 0.388889, "temporal_total": 18}}, "temporal_accuracy": 0.388889, "temporal_loss": 0.502006, "temporal_loss_count": 18, "temporal_total": 18, "token_role_loss": 0.0, "token_role_loss_count": 7, "training_weight_max": 1.35, "training_weight_mean": 1.35, "training_weight_min": 1.35}}, "global_step": 665, "issue_counts": {"batch_prepare_pool_event": 1}, "issue_log": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/training_issues.jsonl", "last_epoch": {"best_val_loss": 3.718748, "best_val_selection_key": [1.0, 1.0, 1.0, 0.428571, -3.718748], "best_val_selection_score": 118.424107, "epoch": 7, "global_step": 665, "is_best": false, "patience": 3, "total_steps": 704, "train": {"answer_plan_current_top1_accuracy": 0.379747, "answer_plan_current_total": 79, "answer_plan_loss": 0.549526, "answer_plan_loss_count": 80, "answer_plan_selected_positive_coverage_at_5": 0.821101, "answer_plan_selected_positive_total": 218, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 80, "answer_refusal_loss": 0.0, "answer_refusal_loss_count": 80, "answer_type_loss": 5.16927, "answer_type_loss_count": 80, "answer_type_metrics": {"multi_evidence": {"answer_plan_current_top1_accuracy": 0.376623, "answer_plan_current_total": 77, "answer_plan_selected_positive_coverage_at_5": 0.821596, "answer_plan_selected_positive_total": 213, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 78, "event5_positive_total": 213, "event_positive_coverage_at_5": 0.981221, "event_recall_at_1": 0.846154, "event_recall_at_5": 1.0, "event_recall_total": 78, "path3_positive_total": 234, "path_positive_coverage_at_3": 0.910256, 
"path_recall_at_3": 1.0, "path_recall_total": 78, "recall_event24_positive_total": 213, "recall_event_positive_coverage_at_24": 0.995305, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 78, "samples": 78, "temporal_accuracy": 0.0, "temporal_total": 0}, "time": {"answer_plan_current_top1_accuracy": 0.5, "answer_plan_current_total": 2, "answer_plan_selected_positive_coverage_at_5": 0.8, "answer_plan_selected_positive_total": 5, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 2, "event5_positive_total": 5, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 1.0, "event_recall_at_5": 1.0, "event_recall_total": 2, "path3_positive_total": 6, "path_positive_coverage_at_3": 1.0, "path_recall_at_3": 1.0, "path_recall_total": 2, "recall_event24_positive_total": 5, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 2, "samples": 2, "temporal_accuracy": 0.0, "temporal_total": 2}}, "evaluation_limited": true, "event5_positive_total": 218, "event_distractor_loss": 0.666098, "event_distractor_loss_count": 80, "event_hard_negative_loss": 0.078884, "event_hard_negative_loss_count": 80, "event_loss": 0.323909, "event_loss_count": 80, "event_matrix_delta_loss": 0.128518, "event_matrix_delta_loss_count": 80, "event_positive_coverage_at_5": 0.981651, "event_recall_at_1": 0.85, "event_recall_at_5": 1.0, "event_recall_total": 80, "event_selection_loss": 0.014046, "event_selection_loss_count": 80, "event_tunnel_loss": 0.213497, "event_tunnel_loss_count": 80, "event_tunnel_selection_loss": 0.0, "event_tunnel_selection_loss_count": 80, "final_event_set_loss": 0.013138, "final_event_set_loss_count": 80, "graph_error_count": 0, "loss": 3.466264, "loss_group_count": 11, "memory_router_exact_match": 0.375, "memory_router_f1": 0.845089, "memory_router_loss": 1.479962, "memory_router_loss_count": 80, "memory_router_total": 80, "path3_positive_total": 240, "path_hard_negative_loss": 0.117886, 
"path_hard_negative_loss_count": 80, "path_loss": 0.369553, "path_loss_count": 80, "path_matrix_delta_loss": 0.454712, "path_matrix_delta_loss_count": 80, "path_positive_coverage_at_3": 0.9125, "path_recall_at_3": 1.0, "path_recall_total": 80, "path_selection_loss": 0.12883, "path_selection_loss_count": 80, "path_tunnel_delta_loss": 0.60523, "path_tunnel_delta_loss_count": 80, "path_tunnel_delta_positive_coverage_at_3": 0.479167, "path_tunnel_delta_positive_total": 240, "path_tunnel_delta_recall_at_3": 0.75, "path_tunnel_delta_recall_total": 80, "path_tunnel_loss": 0.338907, "path_tunnel_loss_count": 80, "path_tunnel_rescue025_recall_at_3": 1.0, "path_tunnel_rescue025_recall_total": 80, "path_tunnel_rescue050_recall_at_3": 0.9875, "path_tunnel_rescue050_recall_total": 80, "path_tunnel_rescue100_recall_at_3": 0.9875, "path_tunnel_rescue100_recall_total": 80, "path_tunnel_selection_loss": 0.14157, "path_tunnel_selection_loss_count": 80, "path_tunnel_support_positive_coverage_at_3": 0.829167, "path_tunnel_support_positive_total": 240, "path_tunnel_support_recall_at_3": 0.9375, "path_tunnel_support_recall_total": 80, "question_understanding_loss": 5.293586, "question_understanding_loss_count": 80, "recall_event24_positive_total": 218, "recall_event_positive_coverage_at_24": 0.995413, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 80, "recall_loss": 0.820672, "recall_loss_count": 80, "recall_selection_loss": 0.0, "recall_selection_loss_count": 80, "rows_evaluated": 80, "rows_total": 439, "samples": 80, "supervision_bucket_metrics": {"typed_tunnel_multi_positive": {"answer_plan_current_top1_accuracy": 0.379747, "answer_plan_current_total": 79, "answer_plan_selected_positive_coverage_at_5": 0.821101, "answer_plan_selected_positive_total": 218, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 80, "event5_positive_total": 218, "event_positive_coverage_at_5": 0.981651, "event_recall_at_1": 0.85, "event_recall_at_5": 1.0, 
"event_recall_total": 80, "path3_positive_total": 240, "path_positive_coverage_at_3": 0.9125, "path_recall_at_3": 1.0, "path_recall_total": 80, "recall_event24_positive_total": 218, "recall_event_positive_coverage_at_24": 0.995413, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 80, "samples": 80, "temporal_accuracy": 0.0, "temporal_total": 2}}, "temporal_accuracy": 0.0, "temporal_loss": 0.513602, "temporal_loss_count": 2, "temporal_total": 2, "token_role_loss": 0.0, "token_role_loss_count": 11, "training_weight_max": 1.35, "training_weight_mean": 1.35, "training_weight_min": 1.35}, "val": {"answer_plan_current_top1_accuracy": 0.53125, "answer_plan_current_total": 32, "answer_plan_loss": 0.530247, "answer_plan_loss_count": 32, "answer_plan_selected_positive_coverage_at_5": 0.848101, "answer_plan_selected_positive_total": 79, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 32, "answer_refusal_loss": 0.0, "answer_refusal_loss_count": 32, "answer_type_loss": 6.441788, "answer_type_loss_count": 32, "answer_type_metrics": {"multi_evidence": {"answer_plan_current_top1_accuracy": 0.545455, "answer_plan_current_total": 22, "answer_plan_selected_positive_coverage_at_5": 0.847458, "answer_plan_selected_positive_total": 59, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 22, "event5_positive_total": 59, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.909091, "event_recall_at_5": 1.0, "event_recall_total": 22, "path3_positive_total": 66, "path_positive_coverage_at_3": 0.924242, "path_recall_at_3": 1.0, "path_recall_total": 22, "recall_event24_positive_total": 59, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 22, "samples": 22, "temporal_accuracy": 0.0, "temporal_total": 0}, "time": {"answer_plan_current_top1_accuracy": 0.5, "answer_plan_current_total": 10, "answer_plan_selected_positive_coverage_at_5": 0.85, 
"answer_plan_selected_positive_total": 20, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 10, "event5_positive_total": 20, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.7, "event_recall_at_5": 1.0, "event_recall_total": 10, "path3_positive_total": 30, "path_positive_coverage_at_3": 0.733334, "path_recall_at_3": 1.0, "path_recall_total": 10, "recall_event24_positive_total": 20, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 10, "samples": 10, "temporal_accuracy": 0.3, "temporal_total": 10}}, "evaluation_limited": true, "event5_positive_total": 79, "event_distractor_loss": 0.646066, "event_distractor_loss_count": 32, "event_hard_negative_loss": 0.062803, "event_hard_negative_loss_count": 32, "event_loss": 0.379233, "event_loss_count": 32, "event_matrix_delta_loss": 0.125441, "event_matrix_delta_loss_count": 32, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.84375, "event_recall_at_5": 1.0, "event_recall_total": 32, "event_selection_loss": 0.001734, "event_selection_loss_count": 32, "event_tunnel_loss": 0.248372, "event_tunnel_loss_count": 32, "event_tunnel_selection_loss": 0.0, "event_tunnel_selection_loss_count": 32, "final_event_set_loss": 0.012893, "final_event_set_loss_count": 32, "graph_error_count": 0, "loss": 3.604544, "loss_group_count": 4, "memory_router_exact_match": 0.34375, "memory_router_f1": 0.918403, "memory_router_loss": 0.575108, "memory_router_loss_count": 32, "memory_router_total": 32, "path3_positive_total": 96, "path_hard_negative_loss": 0.23831, "path_hard_negative_loss_count": 32, "path_loss": 0.422076, "path_loss_count": 32, "path_matrix_delta_loss": 0.564214, "path_matrix_delta_loss_count": 32, "path_positive_coverage_at_3": 0.864584, "path_recall_at_3": 1.0, "path_recall_total": 32, "path_selection_loss": 0.189845, "path_selection_loss_count": 32, "path_tunnel_delta_loss": 0.658501, "path_tunnel_delta_loss_count": 32, 
"path_tunnel_delta_positive_coverage_at_3": 0.614584, "path_tunnel_delta_positive_total": 96, "path_tunnel_delta_recall_at_3": 0.84375, "path_tunnel_delta_recall_total": 32, "path_tunnel_loss": 0.360756, "path_tunnel_loss_count": 32, "path_tunnel_rescue025_recall_at_3": 1.0, "path_tunnel_rescue025_recall_total": 32, "path_tunnel_rescue050_recall_at_3": 1.0, "path_tunnel_rescue050_recall_total": 32, "path_tunnel_rescue100_recall_at_3": 1.0, "path_tunnel_rescue100_recall_total": 32, "path_tunnel_selection_loss": 0.183911, "path_tunnel_selection_loss_count": 32, "path_tunnel_support_positive_coverage_at_3": 0.802083, "path_tunnel_support_positive_total": 96, "path_tunnel_support_recall_at_3": 0.96875, "path_tunnel_support_recall_total": 32, "question_understanding_loss": 3.751869, "question_understanding_loss_count": 32, "recall_event24_positive_total": 79, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 32, "recall_loss": 0.894676, "recall_loss_count": 32, "recall_selection_loss": 0.0, "recall_selection_loss_count": 32, "rows_evaluated": 32, "rows_total": 55, "samples": 32, "supervision_bucket_metrics": {"typed_tunnel_multi_positive": {"answer_plan_current_top1_accuracy": 0.53125, "answer_plan_current_total": 32, "answer_plan_selected_positive_coverage_at_5": 0.848101, "answer_plan_selected_positive_total": 79, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 32, "event5_positive_total": 79, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.84375, "event_recall_at_5": 1.0, "event_recall_total": 32, "path3_positive_total": 96, "path_positive_coverage_at_3": 0.864584, "path_recall_at_3": 1.0, "path_recall_total": 32, "recall_event24_positive_total": 79, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 32, "samples": 32, "temporal_accuracy": 0.3, "temporal_total": 10}}, "temporal_accuracy": 0.3, "temporal_loss": 0.50511, 
"temporal_loss_count": 10, "temporal_total": 10, "token_role_loss": 0.0, "token_role_loss_count": 4, "training_weight_max": 1.35, "training_weight_mean": 1.35, "training_weight_min": 1.35}, "val_selection_enabled": true, "val_selection_score": 117.895456}, "last_step_checkpoint": {"epoch": 7, "epoch_step": 81, "global_step": 650, "node_model_last_path": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/node_scorer_last.pt", "path_model_last_path": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/path_scorer_last.pt", "total_steps": 704}, "latest_epoch": 7, "latest_global_step": 665, "memory_router_variant": "question_pooled_multilabel_v1", "memory_tunnel_variant": "chain_depth_tunnel_support_delta_v1", "message_passing_variant": "support_to_event_bidirectional_shared_typed_v1", "model_arch_version": "node_memory_v11_tmcra_scale49m_trimaze", "node_encoder_variant": "hashed_tmcra_token_attn_pool_6l_w256", "node_max_tokens": 32, "pair_feature_adapter_variant": "residual_pair_feature_adapter_v1", "prebuild_issue_log": "outputs/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/graph_cache_prebuild_errors.jsonl", "question_encoder_variant": "hashed_tmcra_token_attn_pool_6l_w256", "question_intent_variant": "text_pooled_multihead_v1", "question_max_tokens": 48, "status": "running", "train": {"answer_plan_current_top1_accuracy": 0.379747, "answer_plan_current_total": 79, "answer_plan_loss": 0.549526, "answer_plan_loss_count": 80, "answer_plan_selected_positive_coverage_at_5": 0.821101, "answer_plan_selected_positive_total": 218, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 80, "answer_refusal_loss": 0.0, "answer_refusal_loss_count": 80, "answer_type_loss": 5.16927, "answer_type_loss_count": 80, "answer_type_metrics": {"multi_evidence": {"answer_plan_current_top1_accuracy": 0.376623, "answer_plan_current_total": 77, "answer_plan_selected_positive_coverage_at_5": 0.821596, 
"answer_plan_selected_positive_total": 213, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 78, "event5_positive_total": 213, "event_positive_coverage_at_5": 0.981221, "event_recall_at_1": 0.846154, "event_recall_at_5": 1.0, "event_recall_total": 78, "path3_positive_total": 234, "path_positive_coverage_at_3": 0.910256, "path_recall_at_3": 1.0, "path_recall_total": 78, "recall_event24_positive_total": 213, "recall_event_positive_coverage_at_24": 0.995305, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 78, "samples": 78, "temporal_accuracy": 0.0, "temporal_total": 0}, "time": {"answer_plan_current_top1_accuracy": 0.5, "answer_plan_current_total": 2, "answer_plan_selected_positive_coverage_at_5": 0.8, "answer_plan_selected_positive_total": 5, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 2, "event5_positive_total": 5, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 1.0, "event_recall_at_5": 1.0, "event_recall_total": 2, "path3_positive_total": 6, "path_positive_coverage_at_3": 1.0, "path_recall_at_3": 1.0, "path_recall_total": 2, "recall_event24_positive_total": 5, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 2, "samples": 2, "temporal_accuracy": 0.0, "temporal_total": 2}}, "evaluation_limited": true, "event5_positive_total": 218, "event_distractor_loss": 0.666098, "event_distractor_loss_count": 80, "event_hard_negative_loss": 0.078884, "event_hard_negative_loss_count": 80, "event_loss": 0.323909, "event_loss_count": 80, "event_matrix_delta_loss": 0.128518, "event_matrix_delta_loss_count": 80, "event_positive_coverage_at_5": 0.981651, "event_recall_at_1": 0.85, "event_recall_at_5": 1.0, "event_recall_total": 80, "event_selection_loss": 0.014046, "event_selection_loss_count": 80, "event_tunnel_loss": 0.213497, "event_tunnel_loss_count": 80, "event_tunnel_selection_loss": 0.0, "event_tunnel_selection_loss_count": 80, 
"final_event_set_loss": 0.013138, "final_event_set_loss_count": 80, "graph_error_count": 0, "loss": 3.466264, "loss_group_count": 11, "memory_router_exact_match": 0.375, "memory_router_f1": 0.845089, "memory_router_loss": 1.479962, "memory_router_loss_count": 80, "memory_router_total": 80, "path3_positive_total": 240, "path_hard_negative_loss": 0.117886, "path_hard_negative_loss_count": 80, "path_loss": 0.369553, "path_loss_count": 80, "path_matrix_delta_loss": 0.454712, "path_matrix_delta_loss_count": 80, "path_positive_coverage_at_3": 0.9125, "path_recall_at_3": 1.0, "path_recall_total": 80, "path_selection_loss": 0.12883, "path_selection_loss_count": 80, "path_tunnel_delta_loss": 0.60523, "path_tunnel_delta_loss_count": 80, "path_tunnel_delta_positive_coverage_at_3": 0.479167, "path_tunnel_delta_positive_total": 240, "path_tunnel_delta_recall_at_3": 0.75, "path_tunnel_delta_recall_total": 80, "path_tunnel_loss": 0.338907, "path_tunnel_loss_count": 80, "path_tunnel_rescue025_recall_at_3": 1.0, "path_tunnel_rescue025_recall_total": 80, "path_tunnel_rescue050_recall_at_3": 0.9875, "path_tunnel_rescue050_recall_total": 80, "path_tunnel_rescue100_recall_at_3": 0.9875, "path_tunnel_rescue100_recall_total": 80, "path_tunnel_selection_loss": 0.14157, "path_tunnel_selection_loss_count": 80, "path_tunnel_support_positive_coverage_at_3": 0.829167, "path_tunnel_support_positive_total": 240, "path_tunnel_support_recall_at_3": 0.9375, "path_tunnel_support_recall_total": 80, "question_understanding_loss": 5.293586, "question_understanding_loss_count": 80, "recall_event24_positive_total": 218, "recall_event_positive_coverage_at_24": 0.995413, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 80, "recall_loss": 0.820672, "recall_loss_count": 80, "recall_selection_loss": 0.0, "recall_selection_loss_count": 80, "rows_evaluated": 80, "rows_total": 439, "samples": 80, "supervision_bucket_metrics": {"typed_tunnel_multi_positive": {"answer_plan_current_top1_accuracy": 
0.379747, "answer_plan_current_total": 79, "answer_plan_selected_positive_coverage_at_5": 0.821101, "answer_plan_selected_positive_total": 218, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 80, "event5_positive_total": 218, "event_positive_coverage_at_5": 0.981651, "event_recall_at_1": 0.85, "event_recall_at_5": 1.0, "event_recall_total": 80, "path3_positive_total": 240, "path_positive_coverage_at_3": 0.9125, "path_recall_at_3": 1.0, "path_recall_total": 80, "recall_event24_positive_total": 218, "recall_event_positive_coverage_at_24": 0.995413, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 80, "samples": 80, "temporal_accuracy": 0.0, "temporal_total": 2}}, "temporal_accuracy": 0.0, "temporal_loss": 0.513602, "temporal_loss_count": 2, "temporal_total": 2, "token_role_loss": 0.0, "token_role_loss_count": 11, "training_weight_max": 1.35, "training_weight_mean": 1.35, "training_weight_min": 1.35}, "training_sampling": {"answer_type_counts": {"multi_evidence": 422, "time": 17}, "base_row_count": 439, "effective_row_count": 697, "loss_weight_range_estimate": {"max": 1.549193, "min": 1.319091}, "mode": "source_aware_balanced", "repeat_histogram": {"1": 118, "2": 144}, "source_count": 1, "source_details": {"dataset": {"avg_example_weight_estimate": 1.540283, "loss_source_factor": 1.0, "row_count": 439, "row_share": 1.0, "sampled_row_count_estimate": 697, "sampled_row_share_estimate": 1.0, "sampling_source_factor": 1.0, "weighted_loss_mass": 676.184129}}, "source_row_counts": {"dataset": 439}, "supervision_bucket_counts": {"typed_tunnel_multi_positive": 439}, "weighted_answer_mass_estimate": {"multi_evidence": 653.759589, "time": 22.42454}, "weighted_supervision_bucket_mass_estimate": {"typed_tunnel_multi_positive": 680.095876}}, "val": {"answer_plan_current_top1_accuracy": 0.53125, "answer_plan_current_total": 32, "answer_plan_loss": 0.530247, "answer_plan_loss_count": 32, "answer_plan_selected_positive_coverage_at_5": 0.848101, 
"answer_plan_selected_positive_total": 79, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 32, "answer_refusal_loss": 0.0, "answer_refusal_loss_count": 32, "answer_type_loss": 6.441788, "answer_type_loss_count": 32, "answer_type_metrics": {"multi_evidence": {"answer_plan_current_top1_accuracy": 0.545455, "answer_plan_current_total": 22, "answer_plan_selected_positive_coverage_at_5": 0.847458, "answer_plan_selected_positive_total": 59, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 22, "event5_positive_total": 59, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.909091, "event_recall_at_5": 1.0, "event_recall_total": 22, "path3_positive_total": 66, "path_positive_coverage_at_3": 0.924242, "path_recall_at_3": 1.0, "path_recall_total": 22, "recall_event24_positive_total": 59, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 22, "samples": 22, "temporal_accuracy": 0.0, "temporal_total": 0}, "time": {"answer_plan_current_top1_accuracy": 0.5, "answer_plan_current_total": 10, "answer_plan_selected_positive_coverage_at_5": 0.85, "answer_plan_selected_positive_total": 20, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 10, "event5_positive_total": 20, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.7, "event_recall_at_5": 1.0, "event_recall_total": 10, "path3_positive_total": 30, "path_positive_coverage_at_3": 0.733334, "path_recall_at_3": 1.0, "path_recall_total": 10, "recall_event24_positive_total": 20, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 10, "samples": 10, "temporal_accuracy": 0.3, "temporal_total": 10}}, "evaluation_limited": true, "event5_positive_total": 79, "event_distractor_loss": 0.646066, "event_distractor_loss_count": 32, "event_hard_negative_loss": 0.062803, "event_hard_negative_loss_count": 32, "event_loss": 0.379233, "event_loss_count": 32, 
"event_matrix_delta_loss": 0.125441, "event_matrix_delta_loss_count": 32, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.84375, "event_recall_at_5": 1.0, "event_recall_total": 32, "event_selection_loss": 0.001734, "event_selection_loss_count": 32, "event_tunnel_loss": 0.248372, "event_tunnel_loss_count": 32, "event_tunnel_selection_loss": 0.0, "event_tunnel_selection_loss_count": 32, "final_event_set_loss": 0.012893, "final_event_set_loss_count": 32, "graph_error_count": 0, "loss": 3.604544, "loss_group_count": 4, "memory_router_exact_match": 0.34375, "memory_router_f1": 0.918403, "memory_router_loss": 0.575108, "memory_router_loss_count": 32, "memory_router_total": 32, "path3_positive_total": 96, "path_hard_negative_loss": 0.23831, "path_hard_negative_loss_count": 32, "path_loss": 0.422076, "path_loss_count": 32, "path_matrix_delta_loss": 0.564214, "path_matrix_delta_loss_count": 32, "path_positive_coverage_at_3": 0.864584, "path_recall_at_3": 1.0, "path_recall_total": 32, "path_selection_loss": 0.189845, "path_selection_loss_count": 32, "path_tunnel_delta_loss": 0.658501, "path_tunnel_delta_loss_count": 32, "path_tunnel_delta_positive_coverage_at_3": 0.614584, "path_tunnel_delta_positive_total": 96, "path_tunnel_delta_recall_at_3": 0.84375, "path_tunnel_delta_recall_total": 32, "path_tunnel_loss": 0.360756, "path_tunnel_loss_count": 32, "path_tunnel_rescue025_recall_at_3": 1.0, "path_tunnel_rescue025_recall_total": 32, "path_tunnel_rescue050_recall_at_3": 1.0, "path_tunnel_rescue050_recall_total": 32, "path_tunnel_rescue100_recall_at_3": 1.0, "path_tunnel_rescue100_recall_total": 32, "path_tunnel_selection_loss": 0.183911, "path_tunnel_selection_loss_count": 32, "path_tunnel_support_positive_coverage_at_3": 0.802083, "path_tunnel_support_positive_total": 96, "path_tunnel_support_recall_at_3": 0.96875, "path_tunnel_support_recall_total": 32, "question_understanding_loss": 3.751869, "question_understanding_loss_count": 32, 
"recall_event24_positive_total": 79, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 32, "recall_loss": 0.894676, "recall_loss_count": 32, "recall_selection_loss": 0.0, "recall_selection_loss_count": 32, "rows_evaluated": 32, "rows_total": 55, "samples": 32, "supervision_bucket_metrics": {"typed_tunnel_multi_positive": {"answer_plan_current_top1_accuracy": 0.53125, "answer_plan_current_total": 32, "answer_plan_selected_positive_coverage_at_5": 0.848101, "answer_plan_selected_positive_total": 79, "answer_plan_selected_recall_at_5": 1.0, "answer_plan_selected_total": 32, "event5_positive_total": 79, "event_positive_coverage_at_5": 1.0, "event_recall_at_1": 0.84375, "event_recall_at_5": 1.0, "event_recall_total": 32, "path3_positive_total": 96, "path_positive_coverage_at_3": 0.864584, "path_recall_at_3": 1.0, "path_recall_total": 32, "recall_event24_positive_total": 79, "recall_event_positive_coverage_at_24": 1.0, "recall_event_recall_at_24": 1.0, "recall_event_recall_total": 32, "samples": 32, "temporal_accuracy": 0.3, "temporal_total": 10}}, "temporal_accuracy": 0.3, "temporal_loss": 0.50511, "temporal_loss_count": 10, "temporal_total": 10, "token_role_loss": 0.0, "token_role_loss_count": 4, "training_weight_max": 1.35, "training_weight_mean": 1.35, "training_weight_min": 1.35}}

models/action_frame_tunnel_graph548_tunnel_fusion_train_20260524_042557/training_issues.jsonl ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"backend": "process", "event_type": "batch_prepare_process_pool_ready", "graph_error_stage": "train_batch_prepare", "issue_type": "batch_prepare_pool_event", "timestamp": "2026-05-24T04:26:31", "worker_count": 16}

results/judge_gpt4o_alias_vectorengine.jsonl ADDED Viewed

	@@ -0,0 +1,500 @@

+{"question_id": "7e00a6cb", "hypothesis": "International Budget Hostel", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "1903aded", "hypothesis": "I can’t recover the 7th job from the provided memory evidence. The retrieved notes only show unrelated lists, like living room color schemes, dessert ideas, and sales tax questions, not the earlier work-from-home jobs for seniors list.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "ceb54acb", "hypothesis": "Sexual fixations, Problematic sexual behaviors, Sexual impulsivity, Compulsive sexuality", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "f523d9fe", "hypothesis": "I can’t tell from the available memory.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "0e5e2d1a", "hypothesis": "38 subjects", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "fea54f57", "hypothesis": "Evolution", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "cc539528", "hypothesis": "Ruby, Python, or PHP", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "dc439ea3", "hypothesis": "Hoop Dance", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "18dcd5a5", "hypothesis": "4", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "488d3006", "hypothesis": "GR-90", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "58470ed2", "hypothesis": "The Library is a sphere whose exact center is any one of its hexagons and whose circumference is inaccessible.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "8cf51dda", "hypothesis": "1. To identify molecular subtypes of endometrial cancer using a combination of genomic and transcriptomic approaches. 2. To investigate the clinical and biological significance of the identified molecular subtypes, including their association with patient outcomes. 3. To develop biomarkers for the early detection and prognosis of endometrial cancer based on the identified molecular subtypes.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "1d4da289", "hypothesis": "biometric authentication and one-time passwords (OTP)", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "8464fc84", "hypothesis": "Roscioli", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "8aef76bc", "hypothesis": "Mod Podge or another sealant", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "71a3fd6b", "hypothesis": "+49 (0) 62 32 / 14 23 - 0", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "2bf43736", "hypothesis": "Chapter 4 of Book 1, titled \"Vocal Prayer and Meditation.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "70b3e69b", "hypothesis": "Marina Rossell", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "8752c811", "hypothesis": "The 27th parameter was Perspective.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "3249768e", "hypothesis": "Absinthe", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "1b9b7252", "hypothesis": "Mindful.org", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "1568498a", "hypothesis": "29. Rd3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "6222b6eb", "hypothesis": "6S", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "e8a79c70", "hypothesis": "2-3 eggs", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "d596882b", "hypothesis": "By Chloe", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "e3fc4d6e", "hypothesis": "Dr. Arati Prabhakar", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "51b23612", "hypothesis": "Nu, pogodi!", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "3e321797", "hypothesis": "10 minutes", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "e982271f", "hypothesis": "Revolution Hall", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "352ab8bd", "hypothesis": "approximately 20%", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "fca762bc", "hypothesis": "Memrise", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "7a8d0b71", "hypothesis": "In the visible remembered campaign-plan excerpt, I can confirm we discussed the DHL Wellness Retreats email marketing campaign, but the specific influencer marketing allocation amount is not shown in the evidence provided.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "a40e080f", "hypothesis": "Patagonia and Southwest Airlines", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "8b9d4367", "hypothesis": "Jaipur Rugs", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "5809eb10", "hypothesis": "2015", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "41275add", "hypothesis": "How to Sit Properly at a Desk to Avoid Back Pain (Mayo Clinic)", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "4388e9dd", "hypothesis": "an untidy, stained white shirt", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "4baee567", "hypothesis": "12", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "561fabcd", "hypothesis": "Contaminated Colossus", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "b759caee", "hypothesis": "@jessica_poole_jewellery", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "ac031881", "hypothesis": "squares", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "28bcfaac", "hypothesis": "MusicTheory.net", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "16c90bf4", "hypothesis": "A pilsner or lager.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "c8f1aeed", "hypothesis": "Pennsylvania", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "eaca4986", "hypothesis": "C D E F G A B A G F E D C", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "c7cf7dfd", "hypothesis": "Nostalgia", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "e48988bc", "hypothesis": "Patagonia", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "1de5cff2", "hypothesis": "Veja", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "65240037", "hypothesis": "1:10 ratio", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "778164c6", "hypothesis": "The evidence only shows that we discussed popular Caribbean dishes featuring snapper, but it does not include the actual Jamaican fruit-based dish name.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "e47becba", "hypothesis": "Business Administration", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "118b2229", "hypothesis": "45 minutes each way", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "51a45a95", "hypothesis": "The evidence does not say where you redeemed the $5 coffee creamer coupon.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "58bf7951", "hypothesis": "The Glass Menagerie", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "1e043500", "hypothesis": "Summer Vibes", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "c5e8278d", "hypothesis": "Johnson", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "6ade9755", "hypothesis": "Serenity Yoga", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "6f9b354f", "hypothesis": "a lighter shade of gray", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "58ef2f1c", "hypothesis": "back on Valentine's Day", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "f8c5f88b", "hypothesis": "The retrieved evidence does not say where you bought the new tennis racket from.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "5d3d2817", "hypothesis": "marketing specialist at a small startup", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "7527f7e2", "hypothesis": "$800", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "c960da58", "hypothesis": "20", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "3b6f954b", "hypothesis": "University of Melbourne", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "726462e0", "hypothesis": "10%", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "94f70d80", "hypothesis": "The retrieved evidence does not state any assembly time for an IKEA bookshelf. It only mentions plans to visit IKEA to look at coffee tables and read reviews, so the duration cannot be determined from this evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "66f24dbb", "hypothesis": "No sister gift is shown in memory; the only recorded birthday gift purchase was a customized phone case with his favorite game's logo for your brother.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "ad7109d1", "hypothesis": "The retrieved evidence does not specify the speed of your new internet plan.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "af8d2e46", "hypothesis": "7", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "dccbc061", "hypothesis": "staunch atheist", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "c8c3f81d", "hypothesis": "Nike", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "8ebdbe50", "hypothesis": "Data Science certification", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "6b168ec8", "hypothesis": "3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "75499fd8", "hypothesis": "Golden Retriever", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "21436231", "hypothesis": "Not stated", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "95bcc1c8", "hypothesis": "10", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "0862e8bf", "hypothesis": "Luna", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "853b0a1d", "hypothesis": "I can tell it was a gift from your grandma, but the evidence here doesn’t include your age when she gave you the silver necklace.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "a06e4cfe", "hypothesis": "3:1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "37d43f65", "hypothesis": "16GB", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "b86304ba", "hypothesis": "Triple what you paid for it.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "d52b4f67", "hypothesis": "Grand Ballroom", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "25e5aa4f", "hypothesis": "UCLA", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "caf9ead2", "hypothesis": "around 5 hours", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "8550ddae", "hypothesis": "Lavender Dream cocktail", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "60d45044", "hypothesis": "Japanese short-grain rice", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "3f1e9474", "hypothesis": "Sarah", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "86b68151", "hypothesis": "IKEA", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "577d4d32", "hypothesis": "7 pm", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "ec81a493", "hypothesis": "500 copies", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "15745da0", "hypothesis": "about three months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "e01b8e2f", "hypothesis": "Hawaii", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "bc8a6e93", "hypothesis": "lemon blueberry cake", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "ccb36322", "hypothesis": "There’s no evidence here of any music streaming service you’ve been using lately; the only services mentioned are photo album services like Shutterfly, Snapfish, and Mixbook.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "001be529", "hypothesis": "Your asylum application had been approved by May 28, 2023, and there was a waiting period, but the evidence does not state how long you waited for the decision.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "b320f3f8", "hypothesis": "a rare blue Snaggletooth action figure", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "19b5f2b3", "hypothesis": "The evidence only shows that you were in Japan/Kyoto and planning to travel elsewhere in Asia afterward, but it does not state how long you were in Japan.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "4fd1909e", "hypothesis": "Xfinity Center", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "545bd2b5", "hypothesis": "2 hours per day", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "8a137a7f", "hypothesis": "Philips LED bulb", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "76d63226", "hypothesis": "55-inch", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "86f00804", "hypothesis": "The Seven Husbands of Evelyn Hugo", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "8e9d538c", "hypothesis": "17 skeins", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "311778f1", "hypothesis": "around 10 hours", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "c19f7a0b", "hypothesis": "around 6:30 pm", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "4100d0a0", "hypothesis": "Irish and Italian", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "29f2956b", "hypothesis": "30 minutes daily", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "1faac195", "hypothesis": "Denver", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "faba32e5", "hypothesis": "24 hours", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "f4f1d8a4", "hypothesis": "my sister", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "c14c00dd", "hypothesis": "Trader Joe's", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "36580ce8", "hypothesis": "The evidence only says you’d been dealing with unspecified health issues for a few months; it does not identify any specific issue you initially thought was just a cold.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "3d86fd0a", "hypothesis": "Information unavailable", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "a82c026e", "hypothesis": "Dark Souls 3 DLC", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "0862e8bf_abs", "hypothesis": "I can’t determine your hamster’s name from the provided evidence because the memories only mention your cat and unrelated coding discussion, not any hamster.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "15745da0_abs", "hypothesis": "No evidence says how long you've been collecting vintage films.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "bc8a6e93_abs", "hypothesis": "I don't know", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "19b5f2b3_abs", "hypothesis": "The available evidence doesn’t mention a Korea trip or its duration, so I can’t determine how long you were in Korea.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "29f2956b_abs", "hypothesis": "I only have evidence that you practice guitar for 30 minutes daily; there’s no evidence here about a daily violin practice time.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "f4f1d8a4_abs", "hypothesis": "I only remember a new stand mixer as a birthday gift from your sister, not your dad.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "0a995998", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "6d550036", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "gpt4_59c863d7", "hypothesis": "4", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "b5ef892d", "hypothesis": "5 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "e831120c", "hypothesis": "3.5 weeks", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "3a704032", "hypothesis": "3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "gpt4_d84a3211", "hypothesis": "I can identify bike-related activity this year, like getting bike lights installed, but there are no explicit dollar amounts for any bike-related purchases or services in the evidence, so I can’t compute a total spent.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "aae3761f", "hypothesis": "10 hours", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "gpt4_f2262a51", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "dd2973ad", "hypothesis": "I can only narrow it down indirectly: your doctor’s appointment was at 10 AM on Thursday 2023-05-18, so the day before was Wednesday 2023-05-17. The only bedtime evidence says you went to bed at 2 AM on last Wednesday relative to 2023-05-29, which refers to 2023-05-24, not 2023-05-17. So the closest bedtime mentioned is 2 AM, but it does not match the day before that appointment.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "c4a1ceb8", "hypothesis": "3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "gpt4_a56e767c", "hypothesis": "3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "6cb6f249", "hypothesis": "17 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "46a3abf7", "hypothesis": "3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "36b9f61e", "hypothesis": "2000", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "28dc39ac", "hypothesis": "29 hours", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "gpt4_2f8be40d", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "2e6d26dc", "hypothesis": "3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "gpt4_15e38248", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "88432d0a", "hypothesis": "4", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "80ec1f4f", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "d23cf73b", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_7fce9456", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "d682f1a2", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "7024f17c", "hypothesis": "0.5 hours", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_5501fe77", "hypothesis": "TikTok", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "gpt4_2ba83207", "hypothesis": "Trader Joe's", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "2318644b", "hypothesis": "Tokyo is the only place with a numeric nightly accommodation cost in the evidence: about $30/night. For Hawaii, the evidence only says you had a luxurious resort stay in Maui, which suggests it was more expensive but does not give a dollar amount, so the exact per-night difference cannot be calculated from the evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "2ce6a0f2", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_d12ceb0e", "hypothesis": "Insufficient evidence to compute the average age of you, your parents, and your grandparents.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "00ca467f", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "b3c15d39", "hypothesis": "5 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_31ff4165", "hypothesis": "4", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "eeda8a6d", "hypothesis": "17", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "2788b940", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "60bf93ed", "hypothesis": "It arrived on 1/20, but I don’t see the purchase date in the evidence, so I can’t calculate the number of days exactly.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "9d25d4e0", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "129d1232", "hypothesis": "600 dollars", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "60472f9c", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_194be4b3", "hypothesis": "3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "a9f6b44c", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "d851d5ba", "hypothesis": "At least 2,250 USD", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "5a7937c8", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "gpt4_ab202e7f", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "gpt4_e05b82a6", "hypothesis": "7", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_731e37d7", "hypothesis": "$200", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "edced276", "hypothesis": "8 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "10d9b85a", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "e3038f8c", "hypothesis": "87", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "2b8f3739", "hypothesis": "$495", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "1a8a66a6", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "c2ac3c61", "hypothesis": "5", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "bf659f65", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "gpt4_372c3eed", "hypothesis": "8 years", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "gpt4_2f91af09", "hypothesis": "6", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "81507db6", "hypothesis": "3", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "88432d0a_abs", "hypothesis": "0", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "80ec1f4f_abs", "hypothesis": "0", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "eeda8a6d_abs", "hypothesis": "not evidenced in the retrieved memory", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "60bf93ed_abs", "hypothesis": "70 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "edced276_abs", "hypothesis": "I can only confirm a Hawaii trip, and no number of days is given for it. There’s also no evidence here of a Seattle trip duration, so the total can’t be determined from the provided evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "gpt4_372c3eed_abs", "hypothesis": "8 years documented; no evidence of a completed Master's degree", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "8a2466db", "hypothesis": "Sorry, I don’t know of any specific resources from memory.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "06878be2", "hypothesis": "A comfortable, durable Sony-compatible camera bag, a Godox V1 hard case or Think Tank Photo Flash Pouch, and a lens cleaning kit with a microfiber cloth, blower, and lens cleaning solution would complement your Sony A7R IV setup.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "75832dbd", "hypothesis": "Recent publications or conferences on the impact of mega-churches on Baptist church vitality, Baptist identity and mission, and practical collaboration models among smaller Baptist churches—especially case-study-focused work on regional or denominational cooperation.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "0edc2aef", "hypothesis": "Look for a Miami hotel with a balcony hot tub and a standout view, similar to your preferred room at The Edgewater Hotel.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "35a27287", "hypothesis": "French- or Spanish-focused language exchange meetups, conversation groups, or cultural events nearby would fit you best; check Meetup, Facebook Groups, local universities, or language schools this weekend.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "32260d93", "hypothesis": "Mike Birbiglia: My Girlfriend's Boyfriend", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "195a1a1b", "hypothesis": "Try a calm wind-down routine: read a non-stimulating book, do a guided sleep meditation or body scan, practice deep breathing, do gentle yoga or stretching, listen to soothing music or nature sounds, and take a 30-minute break from screens before bed.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "afdc33df", "hypothesis": "Regularly clean your granite countertop, use coasters or pads under pots and utensils, always use a cutting board, and check the faucet/aerator for moisture or buildup so the sink area stays cleaner.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "caf03d32", "hypothesis": "Try more plant-based slow cooker recipes like lentil curry, roasted vegetable stew, black bean chili, eggplant parmesan, vegan chili, or a quinoa and black bean bowl.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "54026fce", "hypothesis": "Try suggesting virtual coffee breaks in a collaborative way: ask open-ended questions about staying connected, share that you miss casual chats, propose a weekly virtual coffee break, and invite your team’s feedback before deciding.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "06f04340", "hypothesis": "Serve a hearty quinoa salad with your roasted homegrown vegetables over mixed greens, finished with vinaigrette; roast the vegetables first, then toss with the greens and quinoa.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "6b7dfb22", "hypothesis": "Study real flowers from life or photos, browse Instagram and art communities for ideas, experiment with new techniques like palette-knife texture, and take breaks while tracking small wins to help inspiration return.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "1a1907b4", "hypothesis": "Try a Pimm's Cup with a Twist, especially with Ruby Red Grapefruit simple syrup.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "09d032c9", "hypothesis": "A compact portable power bank could help as a backup.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "38146c39", "hypothesis": "Try adding warm spices, nuts like walnuts or pecans, a little citrus zest, or a touch of espresso or coffee for extra flavor.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "d24813b1", "hypothesis": "Bake chocolate chip cookies or a lemon poppyseed cake; both fit your recent baking interests, and the lemon poppyseed cake worked well for a colleague event before.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "57f827a0", "hypothesis": "No direct memory-backed evidence shows you were planning to rearrange your bedroom furniture this weekend.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "95228167", "hypothesis": "Try several guitars and focus on comfort, playability, build quality, tuning stability, and the sound you like best.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "505af2f5", "hypothesis": "Try a homemade vanilla-cinnamon creamer with unsweetened almond or oat milk, a little monk fruit or stevia instead of honey, and optional coconut or hazelnut extract.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "75f70248", "hypothesis": "Possibly yes—the prior memory suggests your living room had a dust issue and a shedding cat, so dust or pet hair there could be contributing to your sneezing.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "d6233ab6", "hypothesis": "Yes—if you’re feeling nostalgic and want to reconnect with old high school friends, attending your reunion sounds like a good idea.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "1da05512", "hypothesis": "Buy now if your storage issue is already a problem and a beginner-friendly 2-bay model fits your budget; wait only if you expect much larger future storage needs or still need to compare options.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "fca70973", "hypothesis": "Try Universal Studios Hollywood’s VIP Experience with the Gourmet Buffet and Behind-the-Scenes Tour as the top pick; Disneyland Resort Halloween dining or Knott’s Scary Farm Boo-fet would also fit your immersive, exclusive theme park weekend vibe.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "b6025781", "hypothesis": "Try herb-rosemary roasted sweet potatoes, quinoa and roasted vegetable bowls with salmon, tofu, or lentil bolognese, and a batch of homemade granola for snacks.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "a89d7624", "hypothesis": "Visit Red Rocks Park and Amphitheatre and check out other live music spots like The Ogden Theatre, The Bluebird Theater, The Fillmore Auditorium, and Larimer Lounge.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "b0479f84", "hypothesis": "Try Planet Earth, Blue Planet, Dynasties, Chasing Coral, and March of the Penguins—especially the nature and wildlife ones since that’s what you were most interested in.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "1d4e3b97", "hypothesis": "Yes — the recent upgrade to your bike’s chain and cassette could be why it’s performing better.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "07b6f563", "hypothesis": "An iPhone 13 Pro-compatible wallet case like the OtterBox wallet case would be a useful accessory; I’d skip suggesting a power bank since you already have one that’s working well.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "1c0ddc50", "hypothesis": "Listen to history and science podcasts during your 40-minute commute—try starter episodes from Hardcore History, Lore, The Dollop, StarTalk Radio, or Radiolab.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "0a34ad58", "hypothesis": "Since you already have a Suica card, use it for trains and subways to make getting around easier. If you’re heading out from Shinjuku, plan a little extra time, especially for early or busy destinations like Tsukiji, where it’s best to leave very early and expect crowds. Keep some yen/cash with you because some places, especially around markets, may not take cards, and if needed you can exchange money near Shinjuku.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "d3ab962e", "hypothesis": "8 miles", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "2311e44b", "hypothesis": "The Nightingale' is 440 pages long, but I don’t have a numeric page-progress value for how far you’ve already read, so the exact pages left can’t be calculated from the evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "cc06de0d", "hypothesis": "I can’t compute the difference from the provided evidence because it gives your daily train fare as $6, but it doesn’t include the taxi ride cost.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "a11281a2", "hypothesis": "about 100 followers", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "4f54b7c9", "hypothesis": "5", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "85fa3a3f", "hypothesis": "50", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "9aaed6a3", "hypothesis": "$0.75", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "1f2b8d4f", "hypothesis": "I can’t determine an exact price difference from the evidence. The price of your luxury boots isn’t given, and no specific similar pair found at a budget store has a stated price. The closest budget-store-style prices mentioned were about $20-$50 for affordable ankle boots, but that still isn’t enough to calculate the difference.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "e6041065", "hypothesis": "40%", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "51c32626", "hypothesis": "No explicit submission date was given; it was only mentioned by 2023/05/22 as a past event.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "d905b33f", "hypothesis": "20%", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "7405e8b1", "hypothesis": "HelloFresh: 40% on your first order. For UberEats, the evidence only says you got 20% off an order last week, not that it was your first order.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "f35224e0", "hypothesis": "At least 12 episodes", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "6456829e", "hypothesis": "5 tomato plants and 3 cucumber plants", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "a4996e51", "hypothesis": "Up to 50 hours/week.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "3c1045c8", "hypothesis": "unavailable", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "60036106", "hypothesis": "12,000 people", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "681a1674", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "e25c3b8d", "hypothesis": "I can only tell that you paid $200 for the designer handbag at TK Maxx; the savings amount isn’t stated in the evidence, so it can’t be determined from this information alone.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "4adc0475", "hypothesis": "Insufficient evidence for an exact total.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "4bc144e2", "hypothesis": "Parking ticket: $50; car wash not found in the retrieved evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "ef66a6e5", "hypothesis": "4", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "5025383b", "hypothesis": "cooking", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "a1cc6108", "hypothesis": "You were about 0 years old when Alex was born.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "9ee3ecd6", "hypothesis": "300 points", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "3fdac837", "hypothesis": "12 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "91b15a6e", "hypothesis": "$5,150", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "27016adc", "hypothesis": "The planned renovations are $20,000. I don’t see the countryside property’s purchase price in the provided evidence, so the exact percentage can’t be computed from this evidence alone.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "720133ac", "hypothesis": "$75", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "77eafa52", "hypothesis": "$300", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "8979f9ec", "hypothesis": "I can only confirm those were lunch meal-prep recipe ideas mentioned on 2023/05/26; there’s no evidence here giving a number of meals for the chicken fajitas or any lentil soup, so the total can’t be computed from the provided evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "0100672e", "hypothesis": "$12 per coffee mug", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "a96c20ee", "hypothesis": "I presented a poster on my thesis research at my first research conference over the summer, but the university isn't stated in the evidence provided.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "92a0aa75", "hypothesis": "Senior Marketing Specialist at the company", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "3fe836c9", "hypothesis": "The final sale price was $325,000. The only other amount mentioned is $350,000, but in the evidence it appears as a generic mortgage example, not a clearly stated pre-approval amount, so the exact difference can’t be determined from the evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "1c549ce4", "hypothesis": "The only purchase cost shown is the waterproof car cover at $120. I don’t see evidence that you purchased a detailing spray or what it cost, so I can’t add a total beyond the confirmed $120.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "6c49646a", "hypothesis": "1940 miles", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "1192316e", "hypothesis": "I can’t compute a total from the provided evidence because it only mentions meditation and breakfast timing, not how long you take to get ready or commute to work.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "0ea62687", "hypothesis": "I can’t determine the difference from the retrieved memory: a few months ago it was 30 mpg, but your current mpg isn’t stated.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "67e0d0f2", "hypothesis": "At least 8. You explicitly mentioned completing 8 edX courses, and you also said you had completed some Coursera courses, but without a number, so the exact total isn’t determined from the evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "bb7c3b45", "hypothesis": "The evidence only gives the original retail price of the Jimmy Choo heels as $500. It does not say what you actually paid, so the amount saved cannot be determined from the provided evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "ba358f49", "hypothesis": "I don't know—Rachel is expected to get married in 2023, but your age isn't provided.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "61f8c8f8", "hypothesis": "10 minutes faster", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "60159905", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "ef9cf60a", "hypothesis": "$100", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "73d42213", "hypothesis": "No explicit arrival time is stated for Monday. The only clinic-travel detail says it took two hours to get there last time, but that doesn’t give a reach time.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "bc149d6b", "hypothesis": "70 pounds", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "099778bb", "hypothesis": "Insufficient evidence to determine the percentage; only the total of 100 leadership positions is provided, not how many are held by women.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "09ba9854", "hypothesis": "About $50.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "d6062bb9", "hypothesis": "1998", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "157a136e", "hypothesis": "I don't know", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "c18a7dc8", "hypothesis": "7", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "a3332713", "hypothesis": "About $200 total", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "55241a1f", "hypothesis": "Your recent Facebook Live session had 12 comments, but I don’t have evidence here for the comment count of your most popular YouTube video, so I can’t compute the total from the available information.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "a08a253f", "hypothesis": "4", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "f0e564bc", "hypothesis": "The evidence shows $800 for the designer handbag, but no amount is provided for the high-end skincare products, so the total cannot be determined.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "078150f1", "hypothesis": "$50 more", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "8cf4d046", "hypothesis": "I can’t compute an average GPA from the evidence provided. It mentions your undergraduate result as a First-Class distinction from the University of Mumbai and that you completed a Master’s in Data Science at UIUC, but it does not give explicit GPA values for either degree.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "a346bb18", "hypothesis": "12 minutes", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "37f165cf", "hypothesis": "416 pages and 440 pages", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "8e91e7d9", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "87f22b4a", "hypothesis": "$120", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "e56a43b9", "hypothesis": "$5 discount", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "efc3f7c2", "hypothesis": "30 minutes earlier", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "21d02d0d", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "2311e44b_abs", "hypothesis": "You mentioned that you've been reading \"Sapiens\" at a pace of 10–20 pages a week, but there’s no evidence here stating the book’s total page count or how many pages you’ve already finished, so the exact number of pages left can’t be determined from the provided evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "6456829e_abs", "hypothesis": "5", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "e5ba910e_abs", "hypothesis": "$378", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "a96c20ee_abs", "hypothesis": "Harvard University", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "ba358f49_abs", "hypothesis": "Insufficient evidence to determine Rachel's age when you get married.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "09ba9854_abs", "hypothesis": "The evidence doesn’t give enough to calculate the savings exactly. It does mention the Airport Limousine Bus costs ¥3,200 one way to Shinjuku Station, but it does not provide a full taxi fare from the airport to your hotel, only that a short taxi ride from Shinjuku Station to the hotel might be needed.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "gpt4_59149c77", "hypothesis": "7 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_f49edff3", "hypothesis": "helped my friend prepare a nursery, helped my cousin pick out stuff for her baby shower, ordered a customized phone case for my friend's birthday", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "71017276", "hypothesis": "4 weeks", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "b46e15ed", "hypothesis": "2 months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_fa19884c", "hypothesis": "6 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "0bc8ad92", "hypothesis": "2 months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "af082822", "hypothesis": "2 weeks", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_4929293a", "hypothesis": "Michael's engagement party", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_b5700ca9", "hypothesis": "4 days ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "9a707b81", "hypothesis": "21 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_1d4ab0c9", "hypothesis": "24 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_e072b769", "hypothesis": "about 3 weeks ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "0db4c65d", "hypothesis": "18 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_1d80365e", "hypothesis": "2 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_7f6b06db", "hypothesis": "Yosemite, Big Sur and Monterey, Yosemite National Park", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_6dc9b45b", "hypothesis": "4 months ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_8279ba02", "hypothesis": "10 days ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_18c2b244", "hypothesis": "I used a Buy One Get One Free coupon on Luvs diapers at Walmart, then I redeemed $12 cashback for a $10 Amazon gift card from Ibotta, then I signed up for the rewards program at ShopRite.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_a1b77f9c", "hypothesis": "Insufficient evidence to calculate the total weeks spent.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_1916e0ea", "hypothesis": "I can identify the Instacart online grocery shopping date as 2023/02/28, but the provided evidence does not include the date you cancelled your FarmFresh subscription, so the number of days passed cannot be determined from this evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "gpt4_7a0daae1", "hypothesis": "I can tell you received your new tennis racket on 2023/03/17, but the buy date isn’t provided in the evidence, so the number of weeks between buying and receiving can’t be determined from the available information.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_468eb063", "hypothesis": "9 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_7abb270c", "hypothesis": "Science Museum, Metropolitan Museum of Art, Museum of Contemporary Art, Modern Art Museum, and two missing museum visits", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_1e4a8aeb", "hypothesis": "6 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_4fc4f797", "hypothesis": "38 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "4dfccbf7", "hypothesis": "24 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_61e13b3c", "hypothesis": "3 weeks", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "gpt4_45189cb4", "hypothesis": "College Football National Championship game (Georgia vs. Alabama), then Lakers game, then Kansas City Chiefs vs. Buffalo Bills Divisional Round game", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "2ebe6c90", "hypothesis": "21 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_e061b84f", "hypothesis": "volleyball league game, Spring Sprint Triathlon, company's annual charity soccer tournament", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "370a8ff4", "hypothesis": "11 weeks and 4 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_d6585ce8", "hypothesis": "free outdoor concert series in the park, jazz night at a local bar", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_4ef30696", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_ec93e27f", "hypothesis": "train", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "6e984301", "hypothesis": "about 9 weeks", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "8077ef71", "hypothesis": "26 days ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_f420262c", "hypothesis": "Delta, American Airlines", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_8e165409", "hypothesis": "22", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_74aed68e", "hypothesis": "29 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "bcbe585f", "hypothesis": "4 weeks and 2 days ago (about 4.3 weeks)", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_21adecb5", "hypothesis": "6 months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "5e1b23de", "hypothesis": "3 months ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_98f46fc6", "hypothesis": "Insufficient evidence to determine", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_af6db32f", "hypothesis": "17 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "eac54adc", "hypothesis": "19 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_7ddcf75f", "hypothesis": "3 days ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_a2d1d1f6", "hypothesis": "3 days ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_85da3956", "hypothesis": "3 weeks", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_b0863698", "hypothesis": "7 days ago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_68e94287", "hypothesis": "The vegan chili recipe post", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_e414231e", "hypothesis": "4 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_7ca326fa", "hypothesis": "Rachel first, Alex second, Emma third", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_7bc6cf22", "hypothesis": "12 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "2ebe6c92", "hypothesis": "The Song of Achilles", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "gpt4_e061b84g", "hypothesis": "the company's annual charity soccer tournament", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "71017277", "hypothesis": "my aunt", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "b46e15ee", "hypothesis": "Walk for Hunger", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_d6585ce9", "hypothesis": "my sister", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_1e4a8aec", "hypothesis": "planted 12 new tomato saplings", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_f420262d", "hypothesis": "Delta", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_59149c78", "hypothesis": "the Metropolitan Museum of Art", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_e414231f", "hypothesis": "mountain bike", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_4929293b", "hypothesis": "your cousin's wedding", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_468eb064", "hypothesis": "Emma", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_fa19884d", "hypothesis": "Last Friday was 2023/03/31. The evidence from that day only shows artist recommendations, not a single explicit artist you definitely started listening to. The artists suggested then were Miles Davis, John Coltrane, Charlie Parker, and Thelonious Monk, plus bluegrass acts like Bill Monroe and the Blue Grass Boys, Earl Scruggs, Flatt and Scruggs, Alison Krauss and Union Station, and The Infamous Stringdusters.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "9a707b82", "hypothesis": "croissants", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "eac54add", "hypothesis": "I can’t identify any business milestone from the evidence. The only milestone-like detail mentioned around the retrieved time is that you had lost about 2 pounds after replacing your bathroom scale, which is a personal health update, not a business milestone.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "4dfccbf8", "hypothesis": "Ukulele lessons with Rachel", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "0bc8ad93", "hypothesis": "with a friend", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "6e984302", "hypothesis": "a set of sculpting tools: a modeling tool set, a wire cutter, and a sculpting mat", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_8279ba03", "hypothesis": "I only have evidence of you buying a Samsung Galaxy S21 for about $800 at Best Buy, not a kitchen appliance 10 days before 2023/03/25.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_b5700ca0", "hypothesis": "the Episcopal Church", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_68e94288", "hypothesis": "#PlankChallenge", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_2655b836", "hypothesis": "No retrieved evidence mentions your new car’s first service or any issue you had after it, so the first post-service issue cannot be determined from the available evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_2487a7cb", "hypothesis": "The 'Data Analysis using Python' webinar", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_76048e76", "hypothesis": "bike", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "gpt4_2312f94c", "hypothesis": "Samsung Galaxy S22", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "0bb5a684", "hypothesis": "7 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "08f4fc43", "hypothesis": "30 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "2c63a862", "hypothesis": "14 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_385a5000", "hypothesis": "Tomatoes", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "2a1811e2", "hypothesis": "11", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "bbf86515", "hypothesis": "4 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "gpt4_5dcc0aab", "hypothesis": "white Adidas sneakers", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "gpt4_0b2f1d21", "hypothesis": "The stand mixer malfunction happened first.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "f0853d11", "hypothesis": "14 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "gpt4_6ed717ea", "hypothesis": "the training pads for Luna", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_70e84552", "hypothesis": "Fixing the fence", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "a3838d2b", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_93159ced", "hypothesis": "I can’t compute the exact amount from the evidence: it shows you’ve been at NovaTech for about 4 years and 3 months, and that you started working in the field when you graduated, but it doesn’t give your graduation date or total career length before NovaTech.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_2d58bcd6", "hypothesis": "The Hate U Give", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_65aabe59", "hypothesis": "Can't tell from the evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "982b5123", "hypothesis": "3 months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "b9cfe692", "hypothesis": "I have evidence only for one of the two books: \"The Seven Husbands of Evelyn Hugo\" took you two and a half weeks to finish. There’s no evidence here for how long \"The Nightingale\" took, so I can’t compute a combined total from the provided memory.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "gpt4_4edbafa2", "hypothesis": "2023-06-03", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "c8090214", "hypothesis": "7 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_483dd43c", "hypothesis": "Game of Thrones", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "e4e14d04", "hypothesis": "cannot be determined from the provided evidence", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "c9f37c46", "hypothesis": "unresolved", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_2c50253f", "hypothesis": "I wake up at 6:30 AM on Tuesdays and Thursdays.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "dcfa8644", "hypothesis": "14 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_b4a80587", "hypothesis": "the arrival of the new prime lens", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "gpt4_9a159967", "hypothesis": "Southwest Airlines and American Airlines (tie)", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "cc6d1ec1", "hypothesis": "about two months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_8c8961ae", "hypothesis": "the solo trip to Thailand", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "gpt4_d9af6064", "hypothesis": "new router", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_7de946e7", "hypothesis": "persistent cough", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "d01c6aa8", "hypothesis": "Cannot be determined from the retrieved evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "993da5e2", "hypothesis": "I can only infer that you rearranged the furniture about three weeks before 2023/05/26; there’s no evidence here about when you started using the new area rug, so the duration before the rearrangement can’t be determined from the provided memory.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "a3045048", "hypothesis": "30 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_d31cdae3", "hypothesis": "The family road trip across the American Southwest.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_cd90e484", "hypothesis": "Insufficient evidence to determine exactly", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_88806d6e", "hypothesis": "Can't tell from the evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_4cd9eba1", "hypothesis": "1 week", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "gpt4_93f6379c", "hypothesis": "Page Turners", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "b29f3365", "hypothesis": "six weeks", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_2f56ae70", "hypothesis": "There’s no evidence here of you starting any streaming service, so I can’t identify a most recent one from the provided memory.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "6613b389", "hypothesis": "2 months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_78cf46a3", "hypothesis": "Receiving the new phone case happened first.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_0a05b494", "hypothesis": "the tourist from Australia", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_1a1dc16d", "hypothesis": "The pride parade", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_2f584639", "hypothesis": "The necklace for your sister.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "gpt4_213fd887", "hypothesis": "the volleyball league", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "gpt4_5438fa52", "hypothesis": "the start of your Spanish classes", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "gpt4_c27434e8", "hypothesis": "Japanese Zero fighter plane model", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_fe651585", "hypothesis": "Rachel", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "8c18457d", "hypothesis": "21 days", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_70e84552_abs", "hypothesis": "Fixing the fence", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "gpt4_93159ced_abs", "hypothesis": "The evidence doesn’t support a current job at Google. On 2023/05/30, you were described as working at NovaTech, and while there’s a truncated note saying you’d been in software engineering before mentioning your current job, it doesn’t include the duration. So I can’t compute how long you’d been working before starting a job at Google from the provided evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "982b5123_abs", "hypothesis": "There’s no evidence here of you booking an Airbnb in Sacramento. The only Airbnb mention is for San Francisco, tied to your stay in Haight-Ashbury on 2023/05/27.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "c8090214_abs", "hypothesis": "I attended the Holiday Market 7 days before Black Friday, but there’s no evidence here that you bought an iPad, so the day difference to an iPad purchase can’t be determined from the provided information.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_c27434e8_abs", "hypothesis": "Insufficient evidence to determine", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "gpt4_fe651585_abs", "hypothesis": "Alex", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "6a1eabeb", "hypothesis": "25:50", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "6aeb4375", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "830ce83f", "hypothesis": "Chicago", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "852ce960", "hypothesis": "$350,000", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "945e3d21", "hypothesis": "three times a week", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "d7c942c3", "hypothesis": "Yes — your mom is using the same grocery list app as you.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "71315a70", "hypothesis": "10-12 hours", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "89941a93", "hypothesis": "1", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "ce6d2d27", "hypothesis": "Friday", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "9ea5eabc", "hypothesis": "Paris", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "07741c44", "hypothesis": "shoe rack", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "a1eacc2a", "hypothesis": "7", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "184da446", "hypothesis": "220 pages", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "031748ae", "hypothesis": "4 when you started; 5 now", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "4d6b87c8", "hypothesis": "25", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "0f05491a", "hypothesis": "120 stars", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "08e075c7", "hypothesis": "I can’t determine that from the provided evidence. The memory snippets only mention your bedtime routine, reading, and using a massage roller, with no evidence about when you started using a Fitbit Charge 3.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "f9e8c073", "hypothesis": "5 sessions", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "41698283", "hypothesis": "70-200mm zoom lens", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "2698e78f", "hypothesis": "Not specified; the memory only says to review it regularly with Dr. Smith.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "b6019101", "hypothesis": "5", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "45dc21b6", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "5a4f22c0", "hypothesis": "TechCorp", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "6071bd76", "hypothesis": "less", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "e493bb7c", "hypothesis": "above my bed in my bedroom", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "618f13b2", "hypothesis": "6", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "72e3ee87", "hypothesis": "50", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "c4ea545c", "hypothesis": "Insufficient evidence", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "01493427", "hypothesis": "17", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "6a27ffc2", "hypothesis": "30", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "2133c1b5", "hypothesis": "3 months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "18bc8abd", "hypothesis": "Kansas City Masterpiece BBQ sauce", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "db467c8c", "hypothesis": "about nine months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "7a87bd0c", "hypothesis": "4 weeks", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "e61a7584", "hypothesis": "about 9 months", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "1cea1afa", "hypothesis": "600", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "ed4ddc30", "hypothesis": "20 dozen eggs", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "8fb83627", "hypothesis": "5", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "b01defab", "hypothesis": "Yes", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "22d2cb42", "hypothesis": "the music shop on Main St", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "0e4e4c46", "hypothesis": "132 points", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "4b24c848", "hypothesis": "5", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "7e974930", "hypothesis": "Your most recent Downtown Farmers Market visit was on 2023/09/30, but the exact earnings amount isn’t shown in the retrieved evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "603deb26", "hypothesis": "10", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "59524333", "hypothesis": "6:00 pm", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "5831f84d", "hypothesis": "10", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "eace081b", "hypothesis": "Oahu", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "affe2881", "hypothesis": "32", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "50635ada", "hypothesis": "Premier Gold status", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "e66b632c", "hypothesis": "26 minutes and 30 seconds", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "0ddfec37", "hypothesis": "Cannot be determined from available evidence", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "f685340e", "hypothesis": "Previously, every other week. Now, not explicitly stated.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "cc5ded98", "hypothesis": "an hour each day", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "dfde3500", "hypothesis": "Wednesday", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "69fee5aa", "hypothesis": "37", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "7401057b", "hypothesis": "1 free night's stay", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "cf22b7bf", "hypothesis": "10 pounds", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "a2f3aa27", "hypothesis": "close to 1300 followers", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "c7dc5443", "hypothesis": "5-2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "06db6396", "hypothesis": "5", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "3ba21379", "hypothesis": "Ford F-150 pickup truck model", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "9bbe84a2", "hypothesis": "level 100 before the end of the year", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "10e09553", "hypothesis": "7", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "dad224aa", "hypothesis": "7:30 am", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "yes"}}
+{"question_id": "ba61f0b9", "hypothesis": "6", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "42ec0761", "hypothesis": "Yes, you have a spare screwdriver.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "5c40ec5b", "hypothesis": "2", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "c6853660", "hypothesis": "increase", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "26bdc477", "hypothesis": "5", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "0977f2af", "hypothesis": "I can’t tell from the retrieved evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "6aeb4375_abs", "hypothesis": "{\"answer\":0}", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "031748ae_abs", "hypothesis": "4", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "2698e78f_abs", "hypothesis": "No evidence provided states how often you see Dr. Johnson.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "2133c1b5_abs", "hypothesis": "About 7 months. The evidence says it had been 1 month in your new studio apartment in Harajuku on 2023/04/11, which makes it about 7 months by 2023/10/15. The memory mentions Harajuku, not Shinjuku.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "0ddfec37_abs", "hypothesis": "0", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "f685340e_abs", "hypothesis": "Every other week.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No"}}
+{"question_id": "89941a94", "hypothesis": "Yes — a road bike.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "07741c45", "hypothesis": "shoe rack", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "7161e7e2", "hypothesis": "8 am - 4 pm (Day Shift)", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}
+{"question_id": "c4f10528", "hypothesis": "Miss Bee Providore", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "89527b6b", "hypothesis": "I can only recover that the dinosaur-book conversation was on 2023/05/23 and the visible excerpt includes a T-Rex with a green scaly body; the Plesiosaur image description and its body color are not present in the provided evidence.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "e9327a54", "hypothesis": "I don’t have the dessert shop’s name in the retrieved memory.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": false, "raw_response": "No."}}
+{"question_id": "4c36ccef", "hypothesis": "Roscioli", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes"}}
+{"question_id": "6ae235be", "hypothesis": "Atmospheric distillation, fluid catalytic cracking (FCC), alkylation, and hydrotreating.", "autoeval_label": {"model": "gpt-4o-2024-08-06", "label": true, "raw_response": "Yes."}}

results/judge_gpt4o_alias_vectorengine.jsonl.summary.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "metric_model_alias": "gpt-4o",
+  "metric_model": "gpt-4o-2024-08-06",
+  "base_url": "OpenAI-compatible judge endpoint",
+  "hyp_file": "<longmemeval-root>/runs/lme_s500_frozen_baseline38_gpt54_full10_20260525_085540/off/predictions.jsonl",
+  "ref_file": "<longmemeval-root>/data/longmemeval_s_cleaned.json",
+  "result_file": "<longmemeval-root>/runs/lme_s500_frozen_baseline38_gpt54_full10_20260525_085540/off/judge_gpt4o_alias_vectorengine.jsonl",
+  "evaluated_count": 500,
+  "accuracy": 0.62,
+  "by_question_type": {
+    "knowledge-update": {
+      "accuracy": 0.7051,
+      "count": 78
+    },
+    "multi-session": {
+      "accuracy": 0.3985,
+      "count": 133
+    },
+    "single-session-assistant": {
+      "accuracy": 0.7857,
+      "count": 56
+    },
+    "single-session-preference": {
+      "accuracy": 0.5667,
+      "count": 30
+    },
+    "single-session-user": {
+      "accuracy": 0.8143,
+      "count": 70
+    },
+    "temporal-reasoning": {
+      "accuracy": 0.6316,
+      "count": 133
+    }
+  }
+}

results/lme_s500_frozen_baseline38_full10_20260525_results.tar.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e88380901df26ac2374524fb812f6405d994775d1163db228fd9aedf1e56acea
+size 79506510

results/predictions.jsonl ADDED Viewed

	@@ -0,0 +1,500 @@

+{"question_id": "7e00a6cb", "hypothesis": "International Budget Hostel"}
+{"question_id": "1903aded", "hypothesis": "I can’t recover the 7th job from the provided memory evidence. The retrieved notes only show unrelated lists, like living room color schemes, dessert ideas, and sales tax questions, not the earlier work-from-home jobs for seniors list."}
+{"question_id": "ceb54acb", "hypothesis": "Sexual fixations, Problematic sexual behaviors, Sexual impulsivity, Compulsive sexuality"}
+{"question_id": "f523d9fe", "hypothesis": "I can’t tell from the available memory."}
+{"question_id": "0e5e2d1a", "hypothesis": "38 subjects"}
+{"question_id": "fea54f57", "hypothesis": "Evolution"}
+{"question_id": "cc539528", "hypothesis": "Ruby, Python, or PHP"}
+{"question_id": "dc439ea3", "hypothesis": "Hoop Dance"}
+{"question_id": "18dcd5a5", "hypothesis": "4"}
+{"question_id": "488d3006", "hypothesis": "GR-90"}
+{"question_id": "58470ed2", "hypothesis": "The Library is a sphere whose exact center is any one of its hexagons and whose circumference is inaccessible."}
+{"question_id": "8cf51dda", "hypothesis": "1. To identify molecular subtypes of endometrial cancer using a combination of genomic and transcriptomic approaches. 2. To investigate the clinical and biological significance of the identified molecular subtypes, including their association with patient outcomes. 3. To develop biomarkers for the early detection and prognosis of endometrial cancer based on the identified molecular subtypes."}
+{"question_id": "1d4da289", "hypothesis": "biometric authentication and one-time passwords (OTP)"}
+{"question_id": "8464fc84", "hypothesis": "Roscioli"}
+{"question_id": "8aef76bc", "hypothesis": "Mod Podge or another sealant"}
+{"question_id": "71a3fd6b", "hypothesis": "+49 (0) 62 32 / 14 23 - 0"}
+{"question_id": "2bf43736", "hypothesis": "Chapter 4 of Book 1, titled \"Vocal Prayer and Meditation."}
+{"question_id": "70b3e69b", "hypothesis": "Marina Rossell"}
+{"question_id": "8752c811", "hypothesis": "The 27th parameter was Perspective."}
+{"question_id": "3249768e", "hypothesis": "Absinthe"}
+{"question_id": "1b9b7252", "hypothesis": "Mindful.org"}
+{"question_id": "1568498a", "hypothesis": "29. Rd3"}
+{"question_id": "6222b6eb", "hypothesis": "6S"}
+{"question_id": "e8a79c70", "hypothesis": "2-3 eggs"}
+{"question_id": "d596882b", "hypothesis": "By Chloe"}
+{"question_id": "e3fc4d6e", "hypothesis": "Dr. Arati Prabhakar"}
+{"question_id": "51b23612", "hypothesis": "Nu, pogodi!"}
+{"question_id": "3e321797", "hypothesis": "10 minutes"}
+{"question_id": "e982271f", "hypothesis": "Revolution Hall"}
+{"question_id": "352ab8bd", "hypothesis": "approximately 20%"}
+{"question_id": "fca762bc", "hypothesis": "Memrise"}
+{"question_id": "7a8d0b71", "hypothesis": "In the visible remembered campaign-plan excerpt, I can confirm we discussed the DHL Wellness Retreats email marketing campaign, but the specific influencer marketing allocation amount is not shown in the evidence provided."}
+{"question_id": "a40e080f", "hypothesis": "Patagonia and Southwest Airlines"}
+{"question_id": "8b9d4367", "hypothesis": "Jaipur Rugs"}
+{"question_id": "5809eb10", "hypothesis": "2015"}
+{"question_id": "41275add", "hypothesis": "How to Sit Properly at a Desk to Avoid Back Pain (Mayo Clinic)"}
+{"question_id": "4388e9dd", "hypothesis": "an untidy, stained white shirt"}
+{"question_id": "4baee567", "hypothesis": "12"}
+{"question_id": "561fabcd", "hypothesis": "Contaminated Colossus"}
+{"question_id": "b759caee", "hypothesis": "@jessica_poole_jewellery"}
+{"question_id": "ac031881", "hypothesis": "squares"}
+{"question_id": "28bcfaac", "hypothesis": "MusicTheory.net"}
+{"question_id": "16c90bf4", "hypothesis": "A pilsner or lager."}
+{"question_id": "c8f1aeed", "hypothesis": "Pennsylvania"}
+{"question_id": "eaca4986", "hypothesis": "C D E F G A B A G F E D C"}
+{"question_id": "c7cf7dfd", "hypothesis": "Nostalgia"}
+{"question_id": "e48988bc", "hypothesis": "Patagonia"}
+{"question_id": "1de5cff2", "hypothesis": "Veja"}
+{"question_id": "65240037", "hypothesis": "1:10 ratio"}
+{"question_id": "778164c6", "hypothesis": "The evidence only shows that we discussed popular Caribbean dishes featuring snapper, but it does not include the actual Jamaican fruit-based dish name."}
+{"question_id": "e47becba", "hypothesis": "Business Administration"}
+{"question_id": "118b2229", "hypothesis": "45 minutes each way"}
+{"question_id": "51a45a95", "hypothesis": "The evidence does not say where you redeemed the $5 coffee creamer coupon."}
+{"question_id": "58bf7951", "hypothesis": "The Glass Menagerie"}
+{"question_id": "1e043500", "hypothesis": "Summer Vibes"}
+{"question_id": "c5e8278d", "hypothesis": "Johnson"}
+{"question_id": "6ade9755", "hypothesis": "Serenity Yoga"}
+{"question_id": "6f9b354f", "hypothesis": "a lighter shade of gray"}
+{"question_id": "58ef2f1c", "hypothesis": "back on Valentine's Day"}
+{"question_id": "f8c5f88b", "hypothesis": "The retrieved evidence does not say where you bought the new tennis racket from."}
+{"question_id": "5d3d2817", "hypothesis": "marketing specialist at a small startup"}
+{"question_id": "7527f7e2", "hypothesis": "$800"}
+{"question_id": "c960da58", "hypothesis": "20"}
+{"question_id": "3b6f954b", "hypothesis": "University of Melbourne"}
+{"question_id": "726462e0", "hypothesis": "10%"}
+{"question_id": "94f70d80", "hypothesis": "The retrieved evidence does not state any assembly time for an IKEA bookshelf. It only mentions plans to visit IKEA to look at coffee tables and read reviews, so the duration cannot be determined from this evidence."}
+{"question_id": "66f24dbb", "hypothesis": "No sister gift is shown in memory; the only recorded birthday gift purchase was a customized phone case with his favorite game's logo for your brother."}
+{"question_id": "ad7109d1", "hypothesis": "The retrieved evidence does not specify the speed of your new internet plan."}
+{"question_id": "af8d2e46", "hypothesis": "7"}
+{"question_id": "dccbc061", "hypothesis": "staunch atheist"}
+{"question_id": "c8c3f81d", "hypothesis": "Nike"}
+{"question_id": "8ebdbe50", "hypothesis": "Data Science certification"}
+{"question_id": "6b168ec8", "hypothesis": "3"}
+{"question_id": "75499fd8", "hypothesis": "Golden Retriever"}
+{"question_id": "21436231", "hypothesis": "Not stated"}
+{"question_id": "95bcc1c8", "hypothesis": "10"}
+{"question_id": "0862e8bf", "hypothesis": "Luna"}
+{"question_id": "853b0a1d", "hypothesis": "I can tell it was a gift from your grandma, but the evidence here doesn’t include your age when she gave you the silver necklace."}
+{"question_id": "a06e4cfe", "hypothesis": "3:1"}
+{"question_id": "37d43f65", "hypothesis": "16GB"}
+{"question_id": "b86304ba", "hypothesis": "Triple what you paid for it."}
+{"question_id": "d52b4f67", "hypothesis": "Grand Ballroom"}
+{"question_id": "25e5aa4f", "hypothesis": "UCLA"}
+{"question_id": "caf9ead2", "hypothesis": "around 5 hours"}
+{"question_id": "8550ddae", "hypothesis": "Lavender Dream cocktail"}
+{"question_id": "60d45044", "hypothesis": "Japanese short-grain rice"}
+{"question_id": "3f1e9474", "hypothesis": "Sarah"}
+{"question_id": "86b68151", "hypothesis": "IKEA"}
+{"question_id": "577d4d32", "hypothesis": "7 pm"}
+{"question_id": "ec81a493", "hypothesis": "500 copies"}
+{"question_id": "15745da0", "hypothesis": "about three months"}
+{"question_id": "e01b8e2f", "hypothesis": "Hawaii"}
+{"question_id": "bc8a6e93", "hypothesis": "lemon blueberry cake"}
+{"question_id": "ccb36322", "hypothesis": "There’s no evidence here of any music streaming service you’ve been using lately; the only services mentioned are photo album services like Shutterfly, Snapfish, and Mixbook."}
+{"question_id": "001be529", "hypothesis": "Your asylum application had been approved by May 28, 2023, and there was a waiting period, but the evidence does not state how long you waited for the decision."}
+{"question_id": "b320f3f8", "hypothesis": "a rare blue Snaggletooth action figure"}
+{"question_id": "19b5f2b3", "hypothesis": "The evidence only shows that you were in Japan/Kyoto and planning to travel elsewhere in Asia afterward, but it does not state how long you were in Japan."}
+{"question_id": "4fd1909e", "hypothesis": "Xfinity Center"}
+{"question_id": "545bd2b5", "hypothesis": "2 hours per day"}
+{"question_id": "8a137a7f", "hypothesis": "Philips LED bulb"}
+{"question_id": "76d63226", "hypothesis": "55-inch"}
+{"question_id": "86f00804", "hypothesis": "The Seven Husbands of Evelyn Hugo"}
+{"question_id": "8e9d538c", "hypothesis": "17 skeins"}
+{"question_id": "311778f1", "hypothesis": "around 10 hours"}
+{"question_id": "c19f7a0b", "hypothesis": "around 6:30 pm"}
+{"question_id": "4100d0a0", "hypothesis": "Irish and Italian"}
+{"question_id": "29f2956b", "hypothesis": "30 minutes daily"}
+{"question_id": "1faac195", "hypothesis": "Denver"}
+{"question_id": "faba32e5", "hypothesis": "24 hours"}
+{"question_id": "f4f1d8a4", "hypothesis": "my sister"}
+{"question_id": "c14c00dd", "hypothesis": "Trader Joe's"}
+{"question_id": "36580ce8", "hypothesis": "The evidence only says you’d been dealing with unspecified health issues for a few months; it does not identify any specific issue you initially thought was just a cold."}
+{"question_id": "3d86fd0a", "hypothesis": "Information unavailable"}
+{"question_id": "a82c026e", "hypothesis": "Dark Souls 3 DLC"}
+{"question_id": "0862e8bf_abs", "hypothesis": "I can’t determine your hamster’s name from the provided evidence because the memories only mention your cat and unrelated coding discussion, not any hamster."}
+{"question_id": "15745da0_abs", "hypothesis": "No evidence says how long you've been collecting vintage films."}
+{"question_id": "bc8a6e93_abs", "hypothesis": "I don't know"}
+{"question_id": "19b5f2b3_abs", "hypothesis": "The available evidence doesn’t mention a Korea trip or its duration, so I can’t determine how long you were in Korea."}
+{"question_id": "29f2956b_abs", "hypothesis": "I only have evidence that you practice guitar for 30 minutes daily; there’s no evidence here about a daily violin practice time."}
+{"question_id": "f4f1d8a4_abs", "hypothesis": "I only remember a new stand mixer as a birthday gift from your sister, not your dad."}
+{"question_id": "0a995998", "hypothesis": "2"}
+{"question_id": "6d550036", "hypothesis": "1"}
+{"question_id": "gpt4_59c863d7", "hypothesis": "4"}
+{"question_id": "b5ef892d", "hypothesis": "5 days"}
+{"question_id": "e831120c", "hypothesis": "3.5 weeks"}
+{"question_id": "3a704032", "hypothesis": "3"}
+{"question_id": "gpt4_d84a3211", "hypothesis": "I can identify bike-related activity this year, like getting bike lights installed, but there are no explicit dollar amounts for any bike-related purchases or services in the evidence, so I can’t compute a total spent."}
+{"question_id": "aae3761f", "hypothesis": "10 hours"}
+{"question_id": "gpt4_f2262a51", "hypothesis": "1"}
+{"question_id": "dd2973ad", "hypothesis": "I can only narrow it down indirectly: your doctor’s appointment was at 10 AM on Thursday 2023-05-18, so the day before was Wednesday 2023-05-17. The only bedtime evidence says you went to bed at 2 AM on last Wednesday relative to 2023-05-29, which refers to 2023-05-24, not 2023-05-17. So the closest bedtime mentioned is 2 AM, but it does not match the day before that appointment."}
+{"question_id": "c4a1ceb8", "hypothesis": "3"}
+{"question_id": "gpt4_a56e767c", "hypothesis": "3"}
+{"question_id": "6cb6f249", "hypothesis": "17 days"}
+{"question_id": "46a3abf7", "hypothesis": "3"}
+{"question_id": "36b9f61e", "hypothesis": "2000"}
+{"question_id": "28dc39ac", "hypothesis": "29 hours"}
+{"question_id": "gpt4_2f8be40d", "hypothesis": "2"}
+{"question_id": "2e6d26dc", "hypothesis": "3"}
+{"question_id": "gpt4_15e38248", "hypothesis": "2"}
+{"question_id": "88432d0a", "hypothesis": "4"}
+{"question_id": "80ec1f4f", "hypothesis": "1"}
+{"question_id": "d23cf73b", "hypothesis": "2"}
+{"question_id": "gpt4_7fce9456", "hypothesis": "2"}
+{"question_id": "d682f1a2", "hypothesis": "2"}
+{"question_id": "7024f17c", "hypothesis": "0.5 hours"}
+{"question_id": "gpt4_5501fe77", "hypothesis": "TikTok"}
+{"question_id": "gpt4_2ba83207", "hypothesis": "Trader Joe's"}
+{"question_id": "2318644b", "hypothesis": "Tokyo is the only place with a numeric nightly accommodation cost in the evidence: about $30/night. For Hawaii, the evidence only says you had a luxurious resort stay in Maui, which suggests it was more expensive but does not give a dollar amount, so the exact per-night difference cannot be calculated from the evidence."}
+{"question_id": "2ce6a0f2", "hypothesis": "1"}
+{"question_id": "gpt4_d12ceb0e", "hypothesis": "Insufficient evidence to compute the average age of you, your parents, and your grandparents."}
+{"question_id": "00ca467f", "hypothesis": "2"}
+{"question_id": "b3c15d39", "hypothesis": "5 days"}
+{"question_id": "gpt4_31ff4165", "hypothesis": "4"}
+{"question_id": "eeda8a6d", "hypothesis": "17"}
+{"question_id": "2788b940", "hypothesis": "2"}
+{"question_id": "60bf93ed", "hypothesis": "It arrived on 1/20, but I don’t see the purchase date in the evidence, so I can’t calculate the number of days exactly."}
+{"question_id": "9d25d4e0", "hypothesis": "2"}
+{"question_id": "129d1232", "hypothesis": "600 dollars"}
+{"question_id": "60472f9c", "hypothesis": "1"}
+{"question_id": "gpt4_194be4b3", "hypothesis": "3"}
+{"question_id": "a9f6b44c", "hypothesis": "2"}
+{"question_id": "d851d5ba", "hypothesis": "At least 2,250 USD"}
+{"question_id": "5a7937c8", "hypothesis": "2"}
+{"question_id": "gpt4_ab202e7f", "hypothesis": "2"}
+{"question_id": "gpt4_e05b82a6", "hypothesis": "7"}
+{"question_id": "gpt4_731e37d7", "hypothesis": "$200"}
+{"question_id": "edced276", "hypothesis": "8 days"}
+{"question_id": "10d9b85a", "hypothesis": "2"}
+{"question_id": "e3038f8c", "hypothesis": "87"}
+{"question_id": "2b8f3739", "hypothesis": "$495"}
+{"question_id": "1a8a66a6", "hypothesis": "2"}
+{"question_id": "c2ac3c61", "hypothesis": "5"}
+{"question_id": "bf659f65", "hypothesis": "1"}
+{"question_id": "gpt4_372c3eed", "hypothesis": "8 years"}
+{"question_id": "gpt4_2f91af09", "hypothesis": "6"}
+{"question_id": "81507db6", "hypothesis": "3"}
+{"question_id": "88432d0a_abs", "hypothesis": "0"}
+{"question_id": "80ec1f4f_abs", "hypothesis": "0"}
+{"question_id": "eeda8a6d_abs", "hypothesis": "not evidenced in the retrieved memory"}
+{"question_id": "60bf93ed_abs", "hypothesis": "70 days"}
+{"question_id": "edced276_abs", "hypothesis": "I can only confirm a Hawaii trip, and no number of days is given for it. There’s also no evidence here of a Seattle trip duration, so the total can’t be determined from the provided evidence."}
+{"question_id": "gpt4_372c3eed_abs", "hypothesis": "8 years documented; no evidence of a completed Master's degree"}
+{"question_id": "8a2466db", "hypothesis": "Sorry, I don’t know of any specific resources from memory."}
+{"question_id": "06878be2", "hypothesis": "A comfortable, durable Sony-compatible camera bag, a Godox V1 hard case or Think Tank Photo Flash Pouch, and a lens cleaning kit with a microfiber cloth, blower, and lens cleaning solution would complement your Sony A7R IV setup."}
+{"question_id": "75832dbd", "hypothesis": "Recent publications or conferences on the impact of mega-churches on Baptist church vitality, Baptist identity and mission, and practical collaboration models among smaller Baptist churches—especially case-study-focused work on regional or denominational cooperation."}
+{"question_id": "0edc2aef", "hypothesis": "Look for a Miami hotel with a balcony hot tub and a standout view, similar to your preferred room at The Edgewater Hotel."}
+{"question_id": "35a27287", "hypothesis": "French- or Spanish-focused language exchange meetups, conversation groups, or cultural events nearby would fit you best; check Meetup, Facebook Groups, local universities, or language schools this weekend."}
+{"question_id": "32260d93", "hypothesis": "Mike Birbiglia: My Girlfriend's Boyfriend"}
+{"question_id": "195a1a1b", "hypothesis": "Try a calm wind-down routine: read a non-stimulating book, do a guided sleep meditation or body scan, practice deep breathing, do gentle yoga or stretching, listen to soothing music or nature sounds, and take a 30-minute break from screens before bed."}
+{"question_id": "afdc33df", "hypothesis": "Regularly clean your granite countertop, use coasters or pads under pots and utensils, always use a cutting board, and check the faucet/aerator for moisture or buildup so the sink area stays cleaner."}
+{"question_id": "caf03d32", "hypothesis": "Try more plant-based slow cooker recipes like lentil curry, roasted vegetable stew, black bean chili, eggplant parmesan, vegan chili, or a quinoa and black bean bowl."}
+{"question_id": "54026fce", "hypothesis": "Try suggesting virtual coffee breaks in a collaborative way: ask open-ended questions about staying connected, share that you miss casual chats, propose a weekly virtual coffee break, and invite your team’s feedback before deciding."}
+{"question_id": "06f04340", "hypothesis": "Serve a hearty quinoa salad with your roasted homegrown vegetables over mixed greens, finished with vinaigrette; roast the vegetables first, then toss with the greens and quinoa."}
+{"question_id": "6b7dfb22", "hypothesis": "Study real flowers from life or photos, browse Instagram and art communities for ideas, experiment with new techniques like palette-knife texture, and take breaks while tracking small wins to help inspiration return."}
+{"question_id": "1a1907b4", "hypothesis": "Try a Pimm's Cup with a Twist, especially with Ruby Red Grapefruit simple syrup."}
+{"question_id": "09d032c9", "hypothesis": "A compact portable power bank could help as a backup."}
+{"question_id": "38146c39", "hypothesis": "Try adding warm spices, nuts like walnuts or pecans, a little citrus zest, or a touch of espresso or coffee for extra flavor."}
+{"question_id": "d24813b1", "hypothesis": "Bake chocolate chip cookies or a lemon poppyseed cake; both fit your recent baking interests, and the lemon poppyseed cake worked well for a colleague event before."}
+{"question_id": "57f827a0", "hypothesis": "No direct memory-backed evidence shows you were planning to rearrange your bedroom furniture this weekend."}
+{"question_id": "95228167", "hypothesis": "Try several guitars and focus on comfort, playability, build quality, tuning stability, and the sound you like best."}
+{"question_id": "505af2f5", "hypothesis": "Try a homemade vanilla-cinnamon creamer with unsweetened almond or oat milk, a little monk fruit or stevia instead of honey, and optional coconut or hazelnut extract."}
+{"question_id": "75f70248", "hypothesis": "Possibly yes—the prior memory suggests your living room had a dust issue and a shedding cat, so dust or pet hair there could be contributing to your sneezing."}
+{"question_id": "d6233ab6", "hypothesis": "Yes—if you’re feeling nostalgic and want to reconnect with old high school friends, attending your reunion sounds like a good idea."}
+{"question_id": "1da05512", "hypothesis": "Buy now if your storage issue is already a problem and a beginner-friendly 2-bay model fits your budget; wait only if you expect much larger future storage needs or still need to compare options."}
+{"question_id": "fca70973", "hypothesis": "Try Universal Studios Hollywood’s VIP Experience with the Gourmet Buffet and Behind-the-Scenes Tour as the top pick; Disneyland Resort Halloween dining or Knott’s Scary Farm Boo-fet would also fit your immersive, exclusive theme park weekend vibe."}
+{"question_id": "b6025781", "hypothesis": "Try herb-rosemary roasted sweet potatoes, quinoa and roasted vegetable bowls with salmon, tofu, or lentil bolognese, and a batch of homemade granola for snacks."}
+{"question_id": "a89d7624", "hypothesis": "Visit Red Rocks Park and Amphitheatre and check out other live music spots like The Ogden Theatre, The Bluebird Theater, The Fillmore Auditorium, and Larimer Lounge."}
+{"question_id": "b0479f84", "hypothesis": "Try Planet Earth, Blue Planet, Dynasties, Chasing Coral, and March of the Penguins—especially the nature and wildlife ones since that’s what you were most interested in."}
+{"question_id": "1d4e3b97", "hypothesis": "Yes — the recent upgrade to your bike’s chain and cassette could be why it’s performing better."}
+{"question_id": "07b6f563", "hypothesis": "An iPhone 13 Pro-compatible wallet case like the OtterBox wallet case would be a useful accessory; I’d skip suggesting a power bank since you already have one that’s working well."}
+{"question_id": "1c0ddc50", "hypothesis": "Listen to history and science podcasts during your 40-minute commute—try starter episodes from Hardcore History, Lore, The Dollop, StarTalk Radio, or Radiolab."}
+{"question_id": "0a34ad58", "hypothesis": "Since you already have a Suica card, use it for trains and subways to make getting around easier. If you’re heading out from Shinjuku, plan a little extra time, especially for early or busy destinations like Tsukiji, where it’s best to leave very early and expect crowds. Keep some yen/cash with you because some places, especially around markets, may not take cards, and if needed you can exchange money near Shinjuku."}
+{"question_id": "d3ab962e", "hypothesis": "8 miles"}
+{"question_id": "2311e44b", "hypothesis": "The Nightingale' is 440 pages long, but I don’t have a numeric page-progress value for how far you’ve already read, so the exact pages left can’t be calculated from the evidence."}
+{"question_id": "cc06de0d", "hypothesis": "I can’t compute the difference from the provided evidence because it gives your daily train fare as $6, but it doesn’t include the taxi ride cost."}
+{"question_id": "a11281a2", "hypothesis": "about 100 followers"}
+{"question_id": "4f54b7c9", "hypothesis": "5"}
+{"question_id": "85fa3a3f", "hypothesis": "50"}
+{"question_id": "9aaed6a3", "hypothesis": "$0.75"}
+{"question_id": "1f2b8d4f", "hypothesis": "I can’t determine an exact price difference from the evidence. The price of your luxury boots isn’t given, and no specific similar pair found at a budget store has a stated price. The closest budget-store-style prices mentioned were about $20-$50 for affordable ankle boots, but that still isn’t enough to calculate the difference."}
+{"question_id": "e6041065", "hypothesis": "40%"}
+{"question_id": "51c32626", "hypothesis": "No explicit submission date was given; it was only mentioned by 2023/05/22 as a past event."}
+{"question_id": "d905b33f", "hypothesis": "20%"}
+{"question_id": "7405e8b1", "hypothesis": "HelloFresh: 40% on your first order. For UberEats, the evidence only says you got 20% off an order last week, not that it was your first order."}
+{"question_id": "f35224e0", "hypothesis": "At least 12 episodes"}
+{"question_id": "6456829e", "hypothesis": "5 tomato plants and 3 cucumber plants"}
+{"question_id": "a4996e51", "hypothesis": "Up to 50 hours/week."}
+{"question_id": "3c1045c8", "hypothesis": "unavailable"}
+{"question_id": "60036106", "hypothesis": "12,000 people"}
+{"question_id": "681a1674", "hypothesis": "1"}
+{"question_id": "e25c3b8d", "hypothesis": "I can only tell that you paid $200 for the designer handbag at TK Maxx; the savings amount isn’t stated in the evidence, so it can’t be determined from this information alone."}
+{"question_id": "4adc0475", "hypothesis": "Insufficient evidence for an exact total."}
+{"question_id": "4bc144e2", "hypothesis": "Parking ticket: $50; car wash not found in the retrieved evidence."}
+{"question_id": "ef66a6e5", "hypothesis": "4"}
+{"question_id": "5025383b", "hypothesis": "cooking"}
+{"question_id": "a1cc6108", "hypothesis": "You were about 0 years old when Alex was born."}
+{"question_id": "9ee3ecd6", "hypothesis": "300 points"}
+{"question_id": "3fdac837", "hypothesis": "12 days"}
+{"question_id": "91b15a6e", "hypothesis": "$5,150"}
+{"question_id": "27016adc", "hypothesis": "The planned renovations are $20,000. I don’t see the countryside property’s purchase price in the provided evidence, so the exact percentage can’t be computed from this evidence alone."}
+{"question_id": "720133ac", "hypothesis": "$75"}
+{"question_id": "77eafa52", "hypothesis": "$300"}
+{"question_id": "8979f9ec", "hypothesis": "I can only confirm those were lunch meal-prep recipe ideas mentioned on 2023/05/26; there’s no evidence here giving a number of meals for the chicken fajitas or any lentil soup, so the total can’t be computed from the provided evidence."}
+{"question_id": "0100672e", "hypothesis": "$12 per coffee mug"}
+{"question_id": "a96c20ee", "hypothesis": "I presented a poster on my thesis research at my first research conference over the summer, but the university isn't stated in the evidence provided."}
+{"question_id": "92a0aa75", "hypothesis": "Senior Marketing Specialist at the company"}
+{"question_id": "3fe836c9", "hypothesis": "The final sale price was $325,000. The only other amount mentioned is $350,000, but in the evidence it appears as a generic mortgage example, not a clearly stated pre-approval amount, so the exact difference can’t be determined from the evidence."}
+{"question_id": "1c549ce4", "hypothesis": "The only purchase cost shown is the waterproof car cover at $120. I don’t see evidence that you purchased a detailing spray or what it cost, so I can’t add a total beyond the confirmed $120."}
+{"question_id": "6c49646a", "hypothesis": "1940 miles"}
+{"question_id": "1192316e", "hypothesis": "I can’t compute a total from the provided evidence because it only mentions meditation and breakfast timing, not how long you take to get ready or commute to work."}
+{"question_id": "0ea62687", "hypothesis": "I can’t determine the difference from the retrieved memory: a few months ago it was 30 mpg, but your current mpg isn’t stated."}
+{"question_id": "67e0d0f2", "hypothesis": "At least 8. You explicitly mentioned completing 8 edX courses, and you also said you had completed some Coursera courses, but without a number, so the exact total isn’t determined from the evidence."}
+{"question_id": "bb7c3b45", "hypothesis": "The evidence only gives the original retail price of the Jimmy Choo heels as $500. It does not say what you actually paid, so the amount saved cannot be determined from the provided evidence."}
+{"question_id": "ba358f49", "hypothesis": "I don't know—Rachel is expected to get married in 2023, but your age isn't provided."}
+{"question_id": "61f8c8f8", "hypothesis": "10 minutes faster"}
+{"question_id": "60159905", "hypothesis": "1"}
+{"question_id": "ef9cf60a", "hypothesis": "$100"}
+{"question_id": "73d42213", "hypothesis": "No explicit arrival time is stated for Monday. The only clinic-travel detail says it took two hours to get there last time, but that doesn’t give a reach time."}
+{"question_id": "bc149d6b", "hypothesis": "70 pounds"}
+{"question_id": "099778bb", "hypothesis": "Insufficient evidence to determine the percentage; only the total of 100 leadership positions is provided, not how many are held by women."}
+{"question_id": "09ba9854", "hypothesis": "About $50."}
+{"question_id": "d6062bb9", "hypothesis": "1998"}
+{"question_id": "157a136e", "hypothesis": "I don't know"}
+{"question_id": "c18a7dc8", "hypothesis": "7"}
+{"question_id": "a3332713", "hypothesis": "About $200 total"}
+{"question_id": "55241a1f", "hypothesis": "Your recent Facebook Live session had 12 comments, but I don’t have evidence here for the comment count of your most popular YouTube video, so I can’t compute the total from the available information."}
+{"question_id": "a08a253f", "hypothesis": "4"}
+{"question_id": "f0e564bc", "hypothesis": "The evidence shows $800 for the designer handbag, but no amount is provided for the high-end skincare products, so the total cannot be determined."}
+{"question_id": "078150f1", "hypothesis": "$50 more"}
+{"question_id": "8cf4d046", "hypothesis": "I can’t compute an average GPA from the evidence provided. It mentions your undergraduate result as a First-Class distinction from the University of Mumbai and that you completed a Master’s in Data Science at UIUC, but it does not give explicit GPA values for either degree."}
+{"question_id": "a346bb18", "hypothesis": "12 minutes"}
+{"question_id": "37f165cf", "hypothesis": "416 pages and 440 pages"}
+{"question_id": "8e91e7d9", "hypothesis": "1"}
+{"question_id": "87f22b4a", "hypothesis": "$120"}
+{"question_id": "e56a43b9", "hypothesis": "$5 discount"}
+{"question_id": "efc3f7c2", "hypothesis": "30 minutes earlier"}
+{"question_id": "21d02d0d", "hypothesis": "2"}
+{"question_id": "2311e44b_abs", "hypothesis": "You mentioned that you've been reading \"Sapiens\" at a pace of 10–20 pages a week, but there’s no evidence here stating the book’s total page count or how many pages you’ve already finished, so the exact number of pages left can’t be determined from the provided evidence."}
+{"question_id": "6456829e_abs", "hypothesis": "5"}
+{"question_id": "e5ba910e_abs", "hypothesis": "$378"}
+{"question_id": "a96c20ee_abs", "hypothesis": "Harvard University"}
+{"question_id": "ba358f49_abs", "hypothesis": "Insufficient evidence to determine Rachel's age when you get married."}
+{"question_id": "09ba9854_abs", "hypothesis": "The evidence doesn’t give enough to calculate the savings exactly. It does mention the Airport Limousine Bus costs ¥3,200 one way to Shinjuku Station, but it does not provide a full taxi fare from the airport to your hotel, only that a short taxi ride from Shinjuku Station to the hotel might be needed."}
+{"question_id": "gpt4_59149c77", "hypothesis": "7 days"}
+{"question_id": "gpt4_f49edff3", "hypothesis": "helped my friend prepare a nursery, helped my cousin pick out stuff for her baby shower, ordered a customized phone case for my friend's birthday"}
+{"question_id": "71017276", "hypothesis": "4 weeks"}
+{"question_id": "b46e15ed", "hypothesis": "2 months"}
+{"question_id": "gpt4_fa19884c", "hypothesis": "6 days"}
+{"question_id": "0bc8ad92", "hypothesis": "2 months"}
+{"question_id": "af082822", "hypothesis": "2 weeks"}
+{"question_id": "gpt4_4929293a", "hypothesis": "Michael's engagement party"}
+{"question_id": "gpt4_b5700ca9", "hypothesis": "4 days ago"}
+{"question_id": "9a707b81", "hypothesis": "21 days"}
+{"question_id": "gpt4_1d4ab0c9", "hypothesis": "24 days"}
+{"question_id": "gpt4_e072b769", "hypothesis": "about 3 weeks ago"}
+{"question_id": "0db4c65d", "hypothesis": "18 days"}
+{"question_id": "gpt4_1d80365e", "hypothesis": "2 days"}
+{"question_id": "gpt4_7f6b06db", "hypothesis": "Yosemite, Big Sur and Monterey, Yosemite National Park"}
+{"question_id": "gpt4_6dc9b45b", "hypothesis": "4 months ago"}
+{"question_id": "gpt4_8279ba02", "hypothesis": "10 days ago"}
+{"question_id": "gpt4_18c2b244", "hypothesis": "I used a Buy One Get One Free coupon on Luvs diapers at Walmart, then I redeemed $12 cashback for a $10 Amazon gift card from Ibotta, then I signed up for the rewards program at ShopRite."}
+{"question_id": "gpt4_a1b77f9c", "hypothesis": "Insufficient evidence to calculate the total weeks spent."}
+{"question_id": "gpt4_1916e0ea", "hypothesis": "I can identify the Instacart online grocery shopping date as 2023/02/28, but the provided evidence does not include the date you cancelled your FarmFresh subscription, so the number of days passed cannot be determined from this evidence."}
+{"question_id": "gpt4_7a0daae1", "hypothesis": "I can tell you received your new tennis racket on 2023/03/17, but the buy date isn’t provided in the evidence, so the number of weeks between buying and receiving can’t be determined from the available information."}
+{"question_id": "gpt4_468eb063", "hypothesis": "9 days"}
+{"question_id": "gpt4_7abb270c", "hypothesis": "Science Museum, Metropolitan Museum of Art, Museum of Contemporary Art, Modern Art Museum, and two missing museum visits"}
+{"question_id": "gpt4_1e4a8aeb", "hypothesis": "6 days"}
+{"question_id": "gpt4_4fc4f797", "hypothesis": "38 days"}
+{"question_id": "4dfccbf7", "hypothesis": "24 days"}
+{"question_id": "gpt4_61e13b3c", "hypothesis": "3 weeks"}
+{"question_id": "gpt4_45189cb4", "hypothesis": "College Football National Championship game (Georgia vs. Alabama), then Lakers game, then Kansas City Chiefs vs. Buffalo Bills Divisional Round game"}
+{"question_id": "2ebe6c90", "hypothesis": "21 days"}
+{"question_id": "gpt4_e061b84f", "hypothesis": "volleyball league game, Spring Sprint Triathlon, company's annual charity soccer tournament"}
+{"question_id": "370a8ff4", "hypothesis": "11 weeks and 4 days"}
+{"question_id": "gpt4_d6585ce8", "hypothesis": "free outdoor concert series in the park, jazz night at a local bar"}
+{"question_id": "gpt4_4ef30696", "hypothesis": "1"}
+{"question_id": "gpt4_ec93e27f", "hypothesis": "train"}
+{"question_id": "6e984301", "hypothesis": "about 9 weeks"}
+{"question_id": "8077ef71", "hypothesis": "26 days ago"}
+{"question_id": "gpt4_f420262c", "hypothesis": "Delta, American Airlines"}
+{"question_id": "gpt4_8e165409", "hypothesis": "22"}
+{"question_id": "gpt4_74aed68e", "hypothesis": "29 days"}
+{"question_id": "bcbe585f", "hypothesis": "4 weeks and 2 days ago (about 4.3 weeks)"}
+{"question_id": "gpt4_21adecb5", "hypothesis": "6 months"}
+{"question_id": "5e1b23de", "hypothesis": "3 months ago"}
+{"question_id": "gpt4_98f46fc6", "hypothesis": "Insufficient evidence to determine"}
+{"question_id": "gpt4_af6db32f", "hypothesis": "17 days"}
+{"question_id": "eac54adc", "hypothesis": "19 days"}
+{"question_id": "gpt4_7ddcf75f", "hypothesis": "3 days ago"}
+{"question_id": "gpt4_a2d1d1f6", "hypothesis": "3 days ago"}
+{"question_id": "gpt4_85da3956", "hypothesis": "3 weeks"}
+{"question_id": "gpt4_b0863698", "hypothesis": "7 days ago"}
+{"question_id": "gpt4_68e94287", "hypothesis": "The vegan chili recipe post"}
+{"question_id": "gpt4_e414231e", "hypothesis": "4 days"}
+{"question_id": "gpt4_7ca326fa", "hypothesis": "Rachel first, Alex second, Emma third"}
+{"question_id": "gpt4_7bc6cf22", "hypothesis": "12 days"}
+{"question_id": "2ebe6c92", "hypothesis": "The Song of Achilles"}
+{"question_id": "gpt4_e061b84g", "hypothesis": "the company's annual charity soccer tournament"}
+{"question_id": "71017277", "hypothesis": "my aunt"}
+{"question_id": "b46e15ee", "hypothesis": "Walk for Hunger"}
+{"question_id": "gpt4_d6585ce9", "hypothesis": "my sister"}
+{"question_id": "gpt4_1e4a8aec", "hypothesis": "planted 12 new tomato saplings"}
+{"question_id": "gpt4_f420262d", "hypothesis": "Delta"}
+{"question_id": "gpt4_59149c78", "hypothesis": "the Metropolitan Museum of Art"}
+{"question_id": "gpt4_e414231f", "hypothesis": "mountain bike"}
+{"question_id": "gpt4_4929293b", "hypothesis": "your cousin's wedding"}
+{"question_id": "gpt4_468eb064", "hypothesis": "Emma"}
+{"question_id": "gpt4_fa19884d", "hypothesis": "Last Friday was 2023/03/31. The evidence from that day only shows artist recommendations, not a single explicit artist you definitely started listening to. The artists suggested then were Miles Davis, John Coltrane, Charlie Parker, and Thelonious Monk, plus bluegrass acts like Bill Monroe and the Blue Grass Boys, Earl Scruggs, Flatt and Scruggs, Alison Krauss and Union Station, and The Infamous Stringdusters."}
+{"question_id": "9a707b82", "hypothesis": "croissants"}
+{"question_id": "eac54add", "hypothesis": "I can’t identify any business milestone from the evidence. The only milestone-like detail mentioned around the retrieved time is that you had lost about 2 pounds after replacing your bathroom scale, which is a personal health update, not a business milestone."}
+{"question_id": "4dfccbf8", "hypothesis": "Ukulele lessons with Rachel"}
+{"question_id": "0bc8ad93", "hypothesis": "with a friend"}
+{"question_id": "6e984302", "hypothesis": "a set of sculpting tools: a modeling tool set, a wire cutter, and a sculpting mat"}
+{"question_id": "gpt4_8279ba03", "hypothesis": "I only have evidence of you buying a Samsung Galaxy S21 for about $800 at Best Buy, not a kitchen appliance 10 days before 2023/03/25."}
+{"question_id": "gpt4_b5700ca0", "hypothesis": "the Episcopal Church"}
+{"question_id": "gpt4_68e94288", "hypothesis": "#PlankChallenge"}
+{"question_id": "gpt4_2655b836", "hypothesis": "No retrieved evidence mentions your new car’s first service or any issue you had after it, so the first post-service issue cannot be determined from the available evidence."}
+{"question_id": "gpt4_2487a7cb", "hypothesis": "The 'Data Analysis using Python' webinar"}
+{"question_id": "gpt4_76048e76", "hypothesis": "bike"}
+{"question_id": "gpt4_2312f94c", "hypothesis": "Samsung Galaxy S22"}
+{"question_id": "0bb5a684", "hypothesis": "7 days"}
+{"question_id": "08f4fc43", "hypothesis": "30 days"}
+{"question_id": "2c63a862", "hypothesis": "14 days"}
+{"question_id": "gpt4_385a5000", "hypothesis": "Tomatoes"}
+{"question_id": "2a1811e2", "hypothesis": "11"}
+{"question_id": "bbf86515", "hypothesis": "4 days"}
+{"question_id": "gpt4_5dcc0aab", "hypothesis": "white Adidas sneakers"}
+{"question_id": "gpt4_0b2f1d21", "hypothesis": "The stand mixer malfunction happened first."}
+{"question_id": "f0853d11", "hypothesis": "14 days"}
+{"question_id": "gpt4_6ed717ea", "hypothesis": "the training pads for Luna"}
+{"question_id": "gpt4_70e84552", "hypothesis": "Fixing the fence"}
+{"question_id": "a3838d2b", "hypothesis": "2"}
+{"question_id": "gpt4_93159ced", "hypothesis": "I can’t compute the exact amount from the evidence: it shows you’ve been at NovaTech for about 4 years and 3 months, and that you started working in the field when you graduated, but it doesn’t give your graduation date or total career length before NovaTech."}
+{"question_id": "gpt4_2d58bcd6", "hypothesis": "The Hate U Give"}
+{"question_id": "gpt4_65aabe59", "hypothesis": "Can't tell from the evidence."}
+{"question_id": "982b5123", "hypothesis": "3 months"}
+{"question_id": "b9cfe692", "hypothesis": "I have evidence only for one of the two books: \"The Seven Husbands of Evelyn Hugo\" took you two and a half weeks to finish. There’s no evidence here for how long \"The Nightingale\" took, so I can’t compute a combined total from the provided memory."}
+{"question_id": "gpt4_4edbafa2", "hypothesis": "2023-06-03"}
+{"question_id": "c8090214", "hypothesis": "7 days"}
+{"question_id": "gpt4_483dd43c", "hypothesis": "Game of Thrones"}
+{"question_id": "e4e14d04", "hypothesis": "cannot be determined from the provided evidence"}
+{"question_id": "c9f37c46", "hypothesis": "unresolved"}
+{"question_id": "gpt4_2c50253f", "hypothesis": "I wake up at 6:30 AM on Tuesdays and Thursdays."}
+{"question_id": "dcfa8644", "hypothesis": "14 days"}
+{"question_id": "gpt4_b4a80587", "hypothesis": "the arrival of the new prime lens"}
+{"question_id": "gpt4_9a159967", "hypothesis": "Southwest Airlines and American Airlines (tie)"}
+{"question_id": "cc6d1ec1", "hypothesis": "about two months"}
+{"question_id": "gpt4_8c8961ae", "hypothesis": "the solo trip to Thailand"}
+{"question_id": "gpt4_d9af6064", "hypothesis": "new router"}
+{"question_id": "gpt4_7de946e7", "hypothesis": "persistent cough"}
+{"question_id": "d01c6aa8", "hypothesis": "Cannot be determined from the retrieved evidence."}
+{"question_id": "993da5e2", "hypothesis": "I can only infer that you rearranged the furniture about three weeks before 2023/05/26; there’s no evidence here about when you started using the new area rug, so the duration before the rearrangement can’t be determined from the provided memory."}
+{"question_id": "a3045048", "hypothesis": "30 days"}
+{"question_id": "gpt4_d31cdae3", "hypothesis": "The family road trip across the American Southwest."}
+{"question_id": "gpt4_cd90e484", "hypothesis": "Insufficient evidence to determine exactly"}
+{"question_id": "gpt4_88806d6e", "hypothesis": "Can't tell from the evidence."}
+{"question_id": "gpt4_4cd9eba1", "hypothesis": "1 week"}
+{"question_id": "gpt4_93f6379c", "hypothesis": "Page Turners"}
+{"question_id": "b29f3365", "hypothesis": "six weeks"}
+{"question_id": "gpt4_2f56ae70", "hypothesis": "There’s no evidence here of you starting any streaming service, so I can’t identify a most recent one from the provided memory."}
+{"question_id": "6613b389", "hypothesis": "2 months"}
+{"question_id": "gpt4_78cf46a3", "hypothesis": "Receiving the new phone case happened first."}
+{"question_id": "gpt4_0a05b494", "hypothesis": "the tourist from Australia"}
+{"question_id": "gpt4_1a1dc16d", "hypothesis": "The pride parade"}
+{"question_id": "gpt4_2f584639", "hypothesis": "The necklace for your sister."}
+{"question_id": "gpt4_213fd887", "hypothesis": "the volleyball league"}
+{"question_id": "gpt4_5438fa52", "hypothesis": "the start of your Spanish classes"}
+{"question_id": "gpt4_c27434e8", "hypothesis": "Japanese Zero fighter plane model"}
+{"question_id": "gpt4_fe651585", "hypothesis": "Rachel"}
+{"question_id": "8c18457d", "hypothesis": "21 days"}
+{"question_id": "gpt4_70e84552_abs", "hypothesis": "Fixing the fence"}
+{"question_id": "gpt4_93159ced_abs", "hypothesis": "The evidence doesn’t support a current job at Google. On 2023/05/30, you were described as working at NovaTech, and while there’s a truncated note saying you’d been in software engineering before mentioning your current job, it doesn’t include the duration. So I can’t compute how long you’d been working before starting a job at Google from the provided evidence."}
+{"question_id": "982b5123_abs", "hypothesis": "There’s no evidence here of you booking an Airbnb in Sacramento. The only Airbnb mention is for San Francisco, tied to your stay in Haight-Ashbury on 2023/05/27."}
+{"question_id": "c8090214_abs", "hypothesis": "I attended the Holiday Market 7 days before Black Friday, but there’s no evidence here that you bought an iPad, so the day difference to an iPad purchase can’t be determined from the provided information."}
+{"question_id": "gpt4_c27434e8_abs", "hypothesis": "Insufficient evidence to determine"}
+{"question_id": "gpt4_fe651585_abs", "hypothesis": "Alex"}
+{"question_id": "6a1eabeb", "hypothesis": "25:50"}
+{"question_id": "6aeb4375", "hypothesis": "2"}
+{"question_id": "830ce83f", "hypothesis": "Chicago"}
+{"question_id": "852ce960", "hypothesis": "$350,000"}
+{"question_id": "945e3d21", "hypothesis": "three times a week"}
+{"question_id": "d7c942c3", "hypothesis": "Yes — your mom is using the same grocery list app as you."}
+{"question_id": "71315a70", "hypothesis": "10-12 hours"}
+{"question_id": "89941a93", "hypothesis": "1"}
+{"question_id": "ce6d2d27", "hypothesis": "Friday"}
+{"question_id": "9ea5eabc", "hypothesis": "Paris"}
+{"question_id": "07741c44", "hypothesis": "shoe rack"}
+{"question_id": "a1eacc2a", "hypothesis": "7"}
+{"question_id": "184da446", "hypothesis": "220 pages"}
+{"question_id": "031748ae", "hypothesis": "4 when you started; 5 now"}
+{"question_id": "4d6b87c8", "hypothesis": "25"}
+{"question_id": "0f05491a", "hypothesis": "120 stars"}
+{"question_id": "08e075c7", "hypothesis": "I can’t determine that from the provided evidence. The memory snippets only mention your bedtime routine, reading, and using a massage roller, with no evidence about when you started using a Fitbit Charge 3."}
+{"question_id": "f9e8c073", "hypothesis": "5 sessions"}
+{"question_id": "41698283", "hypothesis": "70-200mm zoom lens"}
+{"question_id": "2698e78f", "hypothesis": "Not specified; the memory only says to review it regularly with Dr. Smith."}
+{"question_id": "b6019101", "hypothesis": "5"}
+{"question_id": "45dc21b6", "hypothesis": "2"}
+{"question_id": "5a4f22c0", "hypothesis": "TechCorp"}
+{"question_id": "6071bd76", "hypothesis": "less"}
+{"question_id": "e493bb7c", "hypothesis": "above my bed in my bedroom"}
+{"question_id": "618f13b2", "hypothesis": "6"}
+{"question_id": "72e3ee87", "hypothesis": "50"}
+{"question_id": "c4ea545c", "hypothesis": "Insufficient evidence"}
+{"question_id": "01493427", "hypothesis": "17"}
+{"question_id": "6a27ffc2", "hypothesis": "30"}
+{"question_id": "2133c1b5", "hypothesis": "3 months"}
+{"question_id": "18bc8abd", "hypothesis": "Kansas City Masterpiece BBQ sauce"}
+{"question_id": "db467c8c", "hypothesis": "about nine months"}
+{"question_id": "7a87bd0c", "hypothesis": "4 weeks"}
+{"question_id": "e61a7584", "hypothesis": "about 9 months"}
+{"question_id": "1cea1afa", "hypothesis": "600"}
+{"question_id": "ed4ddc30", "hypothesis": "20 dozen eggs"}
+{"question_id": "8fb83627", "hypothesis": "5"}
+{"question_id": "b01defab", "hypothesis": "Yes"}
+{"question_id": "22d2cb42", "hypothesis": "the music shop on Main St"}
+{"question_id": "0e4e4c46", "hypothesis": "132 points"}
+{"question_id": "4b24c848", "hypothesis": "5"}
+{"question_id": "7e974930", "hypothesis": "Your most recent Downtown Farmers Market visit was on 2023/09/30, but the exact earnings amount isn’t shown in the retrieved evidence."}
+{"question_id": "603deb26", "hypothesis": "10"}
+{"question_id": "59524333", "hypothesis": "6:00 pm"}
+{"question_id": "5831f84d", "hypothesis": "10"}
+{"question_id": "eace081b", "hypothesis": "Oahu"}
+{"question_id": "affe2881", "hypothesis": "32"}
+{"question_id": "50635ada", "hypothesis": "Premier Gold status"}
+{"question_id": "e66b632c", "hypothesis": "26 minutes and 30 seconds"}
+{"question_id": "0ddfec37", "hypothesis": "Cannot be determined from available evidence"}
+{"question_id": "f685340e", "hypothesis": "Previously, every other week. Now, not explicitly stated."}
+{"question_id": "cc5ded98", "hypothesis": "an hour each day"}
+{"question_id": "dfde3500", "hypothesis": "Wednesday"}
+{"question_id": "69fee5aa", "hypothesis": "37"}
+{"question_id": "7401057b", "hypothesis": "1 free night's stay"}
+{"question_id": "cf22b7bf", "hypothesis": "10 pounds"}
+{"question_id": "a2f3aa27", "hypothesis": "close to 1300 followers"}
+{"question_id": "c7dc5443", "hypothesis": "5-2"}
+{"question_id": "06db6396", "hypothesis": "5"}
+{"question_id": "3ba21379", "hypothesis": "Ford F-150 pickup truck model"}
+{"question_id": "9bbe84a2", "hypothesis": "level 100 before the end of the year"}
+{"question_id": "10e09553", "hypothesis": "7"}
+{"question_id": "dad224aa", "hypothesis": "7:30 am"}
+{"question_id": "ba61f0b9", "hypothesis": "6"}
+{"question_id": "42ec0761", "hypothesis": "Yes, you have a spare screwdriver."}
+{"question_id": "5c40ec5b", "hypothesis": "2"}
+{"question_id": "c6853660", "hypothesis": "increase"}
+{"question_id": "26bdc477", "hypothesis": "5"}
+{"question_id": "0977f2af", "hypothesis": "I can’t tell from the retrieved evidence."}
+{"question_id": "6aeb4375_abs", "hypothesis": "{\"answer\":0}"}
+{"question_id": "031748ae_abs", "hypothesis": "4"}
+{"question_id": "2698e78f_abs", "hypothesis": "No evidence provided states how often you see Dr. Johnson."}
+{"question_id": "2133c1b5_abs", "hypothesis": "About 7 months. The evidence says it had been 1 month in your new studio apartment in Harajuku on 2023/04/11, which makes it about 7 months by 2023/10/15. The memory mentions Harajuku, not Shinjuku."}
+{"question_id": "0ddfec37_abs", "hypothesis": "0"}
+{"question_id": "f685340e_abs", "hypothesis": "Every other week."}
+{"question_id": "89941a94", "hypothesis": "Yes — a road bike."}
+{"question_id": "07741c45", "hypothesis": "shoe rack"}
+{"question_id": "7161e7e2", "hypothesis": "8 am - 4 pm (Day Shift)"}
+{"question_id": "c4f10528", "hypothesis": "Miss Bee Providore"}
+{"question_id": "89527b6b", "hypothesis": "I can only recover that the dinosaur-book conversation was on 2023/05/23 and the visible excerpt includes a T-Rex with a green scaly body; the Plesiosaur image description and its body color are not present in the provided evidence."}
+{"question_id": "e9327a54", "hypothesis": "I don’t have the dessert shop’s name in the retrieved memory."}
+{"question_id": "4c36ccef", "hypothesis": "Roscioli"}
+{"question_id": "6ae235be", "hypothesis": "Atmospheric distillation, fluid catalytic cracking (FCC), alkylation, and hydrotreating."}