Borchmann committed on
Commit
50d53bd
·
verified ·
1 Parent(s): 5754893

Upload folder using huggingface_hub

Browse files
Files changed (33) hide show
  1. app.py +7 -0
  2. eval/cleanup_submissions.py +181 -0
  3. eval/delete_unlinked.py +87 -0
  4. eval/link_file_search_predictions.py +103 -0
  5. eval/metrics.py +5 -1
  6. eval/reevaluate_submissions.py +1 -0
  7. eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_results_20260110_131629.json +33 -28
  8. eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_results_20260110_132054.json +33 -28
  9. eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_results_20260110_132358.json +33 -28
  10. eval/reevaluated_results/Anthropic/Claude_Haiku_4.5_(2025-10-01)_results_20260109_130319.json +23 -23
  11. eval/reevaluated_results/Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_125816.json +25 -25
  12. eval/reevaluated_results/Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_182559.json +24 -24
  13. eval/reevaluated_results/Google/Gemini_2.5_Flash_with_File_Search_results_20260103_221253.json +120 -0
  14. eval/reevaluated_results/Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_183030.json +28 -28
  15. eval/reevaluated_results/Google/Gemini_2.5_Pro_with_File_Search_results_20260103_221943.json +120 -0
  16. eval/reevaluated_results/Google/Gemini_3.0_Pro_(Preview)_with_File_Search_results_20260104_120431.json +120 -0
  17. eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260117_193634.json +121 -0
  18. eval/reevaluated_results/Humanity/Human_with_Oracle_Retriever_results_20260122_214532.json +119 -0
  19. eval/reevaluated_results/OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153221.json +25 -25
  20. eval/reevaluated_results/OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153812.json +8 -8
  21. eval/reevaluated_results/OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_results_20260109_151912.json +27 -27
  22. eval/reevaluated_results/OpenAI/GPT-5.2_(2025-12-11)_with_File_Search_results_20260104_121551.json +120 -0
  23. eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152104.json +27 -27
  24. eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_File_Search_results_20260104_140537.json +20 -20
  25. eval/reevaluated_results/OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152650.json +25 -25
  26. eval/reevaluated_results/OpenAI/GPT-5_Mini_(2025-08-07)_with_File_Search_results_20260104_122026.json +120 -0
  27. eval/reevaluated_results/OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152828.json +25 -25
  28. eval/reevaluated_results/OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153516.json +22 -22
  29. eval/reevaluated_results/OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_results_20260109_185347.json +27 -27
  30. eval/reevaluated_results/OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_154427.json +24 -24
  31. eval/reevaluated_results/OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_175639.json +21 -21
  32. eval/reevaluated_results/Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_results_20260110_132227.json +34 -29
  33. eval/reevaluated_results/Z.AI/GLM-4.6V_with_BM25_Search_Tool_results_20260110_131826.json +34 -29
app.py CHANGED
@@ -1172,6 +1172,7 @@ def render_leaderboard_table(df: pd.DataFrame, columns: list, show_analyze_colum
1172
  cells = []
1173
  model_name = row.get("Model", "")
1174
  organization = row.get("Organization", "")
 
1175
 
1176
  # Check if this is a human performance row (should merge Model, Organization, Model Type)
1177
  is_human_row = organization == "Humanity"
@@ -1273,6 +1274,9 @@ def render_leaderboard_table(df: pd.DataFrame, columns: list, show_analyze_colum
1273
  cells.append(f'<td style="text-align: center;">{cell_html}</td>')
1274
  elif col.startswith("Attribution"):
1275
  # Format F1 scores (scale 0-100) - NOT bias-adjusted
 
 
 
1276
  try:
1277
  attr_val = f"{float(value):.1f}" if value else "0"
1278
  attr_float = float(value) if value else 0
@@ -1309,6 +1313,9 @@ def render_leaderboard_table(df: pd.DataFrame, columns: list, show_analyze_colum
1309
  elif col == "Effort (Kuiper)":
1310
  # Format Kuiper statistic (lower is better for calibration)
1311
  # Hide for Conventional RAG models (not meaningful)
 
 
 
1312
  tags = row.get("Tags", [])
1313
  is_conventional_rag = "Conventional RAG" in tags if isinstance(tags, list) else False
1314
  if is_conventional_rag:
 
1172
  cells = []
1173
  model_name = row.get("Model", "")
1174
  organization = row.get("Organization", "")
1175
+ hide_attrib_kuiper = model_name == "Human with Oracle Retriever"
1176
 
1177
  # Check if this is a human performance row (should merge Model, Organization, Model Type)
1178
  is_human_row = organization == "Humanity"
 
1274
  cells.append(f'<td style="text-align: center;">{cell_html}</td>')
1275
  elif col.startswith("Attribution"):
1276
  # Format F1 scores (scale 0-100) - NOT bias-adjusted
1277
+ if hide_attrib_kuiper:
1278
+ cells.append('<td style="text-align: center;">—</td>')
1279
+ continue
1280
  try:
1281
  attr_val = f"{float(value):.1f}" if value else "0"
1282
  attr_float = float(value) if value else 0
 
1313
  elif col == "Effort (Kuiper)":
1314
  # Format Kuiper statistic (lower is better for calibration)
1315
  # Hide for Conventional RAG models (not meaningful)
1316
+ if hide_attrib_kuiper:
1317
+ cells.append('<td style="text-align: center;">—</td>')
1318
+ continue
1319
  tags = row.get("Tags", [])
1320
  is_conventional_rag = "Conventional RAG" in tags if isinstance(tags, list) else False
1321
  if is_conventional_rag:
eval/cleanup_submissions.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Clean up backend-results submissions by keeping only the newest prediction per prefix
4
+ and the matching results file (if available). Optionally updates kept results to
5
+ record source_predictions_file and result_file_path.
6
+ """
7
+
8
+ import argparse
9
+ import json
10
+ import os
11
+ import re
12
+ from dataclasses import dataclass
13
+ from typing import Dict, List, Optional, Tuple
14
+
15
+ from huggingface_hub import HfApi, hf_hub_download, list_repo_files
16
+
17
+
18
+ RESULTS_REPO = "agentic-document-ai/backend-results"
19
+ TOKEN = os.environ.get("HF_TOKEN")
20
+
21
+
22
@dataclass
class FileEntry:
    """A repo file whose name was split into a prefix and a timestamp."""

    # Repo-relative path, exactly as it was passed to the split helper.
    path: str
    # Everything before the last "_predictions_" / "_results_" marker.
    prefix: str
    # Timestamp text taken from the file name, with the extension removed.
    ts_raw: str
    # (year, month, day, hour, minute, second); all zeros when ts_raw is unparseable.
    ts_key: Tuple[int, int, int, int, int, int]


# The two timestamp spellings accepted in file names. They are applied with
# fullmatch(), so surrounding junk (including a trailing newline, which a
# "$"-anchored match() would tolerate) is rejected.
_COMPACT_TS_RE = re.compile(r"(\d{4})(\d{2})(\d{2})_(\d{2})(\d{2})(\d{2})")
_ISO_TS_RE = re.compile(r"(\d{4})-(\d{2})-(\d{2})T(\d{2})-(\d{2})-(\d{2})Z")


def _parse_timestamp(ts: str) -> Tuple[int, int, int, int, int, int]:
    """Parse timestamps in YYYYMMDD_HHMMSS or YYYY-MM-DDTHH-MM-SSZ format.

    Returns:
        A (year, month, day, hour, minute, second) tuple. Text that matches
        neither format yields (0, 0, 0, 0, 0, 0), which sorts before every
        successfully parsed timestamp.
    """
    for pattern in (_COMPACT_TS_RE, _ISO_TS_RE):
        match = pattern.fullmatch(ts)
        if match:
            year, month, day, hour, minute, second = (int(g) for g in match.groups())
            return (year, month, day, hour, minute, second)

    # Fallback: treat as zero to make it always older than parsed timestamps
    return (0, 0, 0, 0, 0, 0)


def _split_entry(path: str, marker: str, extension: str) -> Optional[FileEntry]:
    """Split ``{prefix}{marker}{timestamp}{extension}`` into a FileEntry.

    Returns None when ``path`` does not contain ``marker`` or does not end
    with ``extension``. The split happens at the LAST occurrence of the
    marker, and only the trailing extension is removed, so an extension-like
    substring elsewhere in the name is left untouched.
    """
    if marker not in path or not path.endswith(extension):
        return None
    prefix, tail = path.rsplit(marker, 1)
    ts = tail[: -len(extension)]
    return FileEntry(path=path, prefix=prefix, ts_raw=ts, ts_key=_parse_timestamp(ts))


def _split_predictions(path: str) -> Optional[FileEntry]:
    """Return a FileEntry for ``*_predictions_<ts>.jsonl`` paths, else None."""
    return _split_entry(path, "_predictions_", ".jsonl")


def _split_results(path: str) -> Optional[FileEntry]:
    """Return a FileEntry for ``*_results_<ts>.json`` paths, else None."""
    return _split_entry(path, "_results_", ".json")
66
+
67
+
68
def main() -> int:
    """Plan, and with --apply execute, a cleanup of the backend-results repo.

    For every file-name prefix the newest predictions file is kept and all
    older ones are queued for deletion. Exactly one results file is kept per
    prefix: the one whose timestamp text equals the newest prediction's, or,
    when there is no such file, the newest results file. A kept results file
    that matches is rewritten with ``source_predictions_file`` and
    ``result_file_path`` fields; a kept results file that does not match is
    deleted only when --drop-unmatched-results is given.

    Returns:
        0 in both dry-run and apply mode. Hub errors propagate as exceptions.
    """
    parser = argparse.ArgumentParser(description="Clean backend-results submissions")
    parser.add_argument("--apply", action="store_true", help="Apply deletions/updates (default is dry-run)")
    parser.add_argument("--drop-unmatched-results", action="store_true",
                        help="Delete results that do not match the latest prediction timestamp")
    args = parser.parse_args()

    api = HfApi(token=TOKEN)
    files = list_repo_files(RESULTS_REPO, repo_type="dataset", token=TOKEN)

    predictions: List[FileEntry] = []
    results: List[FileEntry] = []

    for f in files:
        p = _split_predictions(f)
        if p:
            predictions.append(p)
            continue
        r = _split_results(f)
        if r:
            results.append(r)

    preds_by_prefix: Dict[str, List[FileEntry]] = {}
    results_by_prefix: Dict[str, List[FileEntry]] = {}

    for p in predictions:
        preds_by_prefix.setdefault(p.prefix, []).append(p)
    for r in results:
        results_by_prefix.setdefault(r.prefix, []).append(r)

    to_delete: List[str] = []
    to_update: List[Tuple[str, str]] = []  # (result_path, prediction_path)
    unmatched_latest: List[str] = []

    all_prefixes = sorted(set(preds_by_prefix) | set(results_by_prefix))
    for prefix in all_prefixes:
        preds = sorted(preds_by_prefix.get(prefix, []), key=lambda x: x.ts_key, reverse=True)
        res = sorted(results_by_prefix.get(prefix, []), key=lambda x: x.ts_key, reverse=True)

        latest_pred = preds[0] if preds else None

        # Remove all older predictions
        to_delete.extend(old.path for old in preds[1:])

        if not res:
            continue

        # Look for the result paired with the newest prediction among ALL
        # results, not just the newest one. Otherwise a newer, unpaired result
        # would cause the correctly paired one to be deleted as "old".
        matching = None
        if latest_pred is not None:
            matching = next((r for r in res if r.ts_raw == latest_pred.ts_raw), None)

        kept = matching if matching is not None else res[0]
        to_delete.extend(r.path for r in res if r is not kept)

        if matching is not None:
            to_update.append((matching.path, latest_pred.path))
            continue

        # No result pairs with the newest prediction (or there are no
        # predictions at all for this prefix).
        if latest_pred is not None:
            unmatched_latest.append(prefix)
        if args.drop_unmatched_results:
            to_delete.append(kept.path)

    print(f"Predictions: {len(predictions)}")
    print(f"Results: {len(results)}")
    print(f"Delete candidates: {len(to_delete)}")
    print(f"Results to update (link to latest predictions): {len(to_update)}")
    if unmatched_latest:
        print("\nPrefixes where no result matches the latest prediction:")
        for p in unmatched_latest:
            print(" ", p)

    if not args.apply:
        print("\nDry-run only. Re-run with --apply to execute changes.")
        return 0

    # Apply updates
    for result_path, pred_path in to_update:
        local_path = hf_hub_download(
            repo_id=RESULTS_REPO,
            filename=result_path,
            repo_type="dataset",
            token=TOKEN,
        )
        # Read as UTF-8 explicitly instead of relying on the platform's
        # locale encoding.
        with open(local_path, encoding="utf-8") as f:
            data = json.load(f)
        data["source_predictions_file"] = pred_path
        data["result_file_path"] = result_path
        api.upload_file(
            path_or_fileobj=json.dumps(data, indent=2).encode("utf-8"),
            path_in_repo=result_path,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            token=TOKEN,
            commit_message=f"Link result to latest prediction: {result_path}",
        )

    # Apply deletions
    for path in to_delete:
        api.delete_file(
            path_in_repo=path,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            token=TOKEN,
        )

    print("Cleanup complete.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
eval/delete_unlinked.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Delete Humanity submissions and unlinked results from backend-results.
4
+
5
+ Usage:
6
+ export HF_TOKEN=...
7
+ python streamlit_app/eval/delete_unlinked.py # dry-run
8
+ python streamlit_app/eval/delete_unlinked.py --apply # actually delete
9
+ """
10
+
11
+ import argparse
12
+ import os
13
+
14
+ from huggingface_hub import HfApi, list_repo_files
15
+
16
+
17
+ RESULTS_REPO = "agentic-document-ai/backend-results"
18
+ TOKEN = os.environ.get("HF_TOKEN")
19
+
20
+
21
def main() -> int:
    """Delete Humanity files and results that have no paired predictions file.

    A results file ``{prefix}_results_{ts}.json`` counts as linked when the
    repo also lists ``{prefix}_predictions_{ts}.jsonl``. Every other results
    file, plus every path starting with ``Humanity/``, is queued for deletion.
    Nothing is deleted unless --apply is given.

    Returns:
        0 for a dry run, when there is nothing to delete, or when every
        deletion succeeded; 1 when at least one deletion failed.
    """
    parser = argparse.ArgumentParser(description="Delete Humanity and unlinked files")
    parser.add_argument("--apply", action="store_true", help="Actually delete (default: dry-run)")
    args = parser.parse_args()

    api = HfApi(token=TOKEN)
    files = list_repo_files(RESULTS_REPO, repo_type="dataset", token=TOKEN)

    result_files = [f for f in files if f.endswith('.json') and '_results_' in f]
    pred_files = [f for f in files if f.endswith('.jsonl') and '_predictions_' in f]

    def pair_key(name: str, marker: str, extension: str) -> str:
        # Callers only pass names that contain `marker` and end with
        # `extension`, so the split always succeeds. Only the trailing
        # extension is stripped, not every occurrence of it.
        stem, _, tail = name.rpartition(marker)
        return f"{stem}_{tail[:-len(extension)]}"

    result_keys = {pair_key(f, '_results_', '.json'): f for f in result_files}
    pred_keys = {pair_key(f, '_predictions_', '.jsonl'): f for f in pred_files}

    # Find unlinked results (no matching prediction)
    unlinked_results = [result_keys[k] for k in set(result_keys) - set(pred_keys)]

    # Find all Humanity files
    humanity_files = [f for f in files if f.startswith("Humanity/")]

    # Combine into deletion list (deduplicated)
    to_delete = sorted(set(unlinked_results + humanity_files))

    print(f"Files to delete: {len(to_delete)}")
    for f in to_delete:
        print(f"  {f}")

    if not to_delete:
        print("Nothing to delete.")
        return 0

    if not args.apply:
        print("\nDry-run mode. Add --apply to actually delete.")
        return 0

    print(f"\nDeleting {len(to_delete)} files...")
    failed = 0
    for f in to_delete:
        # Best effort: one failing file must not stop the remaining deletions,
        # but failures are counted so the exit status can report them.
        try:
            api.delete_file(
                path_in_repo=f,
                repo_id=RESULTS_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message=f"Cleanup: delete {f}"
            )
            print(f"  ✓ Deleted: {f}")
        except Exception as e:
            failed += 1
            print(f"  ✗ Error deleting {f}: {e}")

    print("\nDone!")
    if failed:
        print(f"{failed} of {len(to_delete)} deletions failed.")
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
eval/link_file_search_predictions.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Upload missing File Search predictions and link them to existing results.
4
+
5
+ Usage:
6
+ export HF_TOKEN=...
7
+ python streamlit_app/eval/link_file_search_predictions.py --apply
8
+ """
9
+
10
+ import argparse
11
+ import json
12
+ import os
13
+ from pathlib import Path
14
+
15
+ from huggingface_hub import HfApi, hf_hub_download
16
+
17
+
18
+ RESULTS_REPO = "agentic-document-ai/backend-results"
19
+ TOKEN = os.environ.get("HF_TOKEN")
20
+ BASE_DIR = Path(__file__).resolve().parents[2] # Project root
21
+ FILE_SEARCH_DIR = BASE_DIR / "file_search_results"
22
+
23
+ # Map missing results -> local prediction file
24
+ MISSING_RESULTS = {
25
+ "Google/Gemini_2.5_Flash_with_File_Search_results_20260103_221253.json": "gemini-2.5-flash.jsonl",
26
+ "Google/Gemini_2.5_Pro_with_File_Search_results_20260103_221943.json": "gemini-2.5-pro.jsonl",
27
+ "Google/Gemini_3.0_Pro_(Preview)_with_File_Search_results_20260104_120431.json": "gemini-3-pro-preview.jsonl",
28
+ "OpenAI/GPT-5.2_(2025-12-11)_with_File_Search_results_20260104_121551.json": "gpt-5.2-2025-12-11.jsonl",
29
+ "OpenAI/GPT-5_Mini_(2025-08-07)_with_File_Search_results_20260104_122026.json": "gpt-5-mini-2025-08-07.jsonl",
30
+ }
31
+
32
+
33
def _pred_path_from_result(result_path: str) -> str:
    """Derive the predictions path that pairs with a results path.

    ``{org}/{model}_results_{ts}.json`` becomes
    ``{org}/{model}_predictions_{ts}.jsonl``. The split happens at the last
    ``_results_`` marker and only a trailing ``.json`` is removed.

    Raises:
        ValueError: if ``result_path`` does not contain ``_results_``.
    """
    base, marker, tail = result_path.rpartition("_results_")
    if not marker:
        raise ValueError(f"Not a results path (missing '_results_'): {result_path}")
    extension = ".json"
    ts = tail[: -len(extension)] if tail.endswith(extension) else tail
    return f"{base}_predictions_{ts}.jsonl"
38
+
39
+
40
def main() -> int:
    """Upload local File Search predictions and link them to existing results.

    For every entry in MISSING_RESULTS the local predictions file is uploaded
    next to its results file, and the results JSON is re-uploaded with
    ``source_predictions_file`` and ``result_file_path`` set. Without --apply
    only the planned uploads are printed.

    Returns:
        0 in both dry-run and apply mode.

    Raises:
        FileNotFoundError: if FILE_SEARCH_DIR or any mapped local predictions
            file is missing. Hub errors propagate unchanged.
    """
    parser = argparse.ArgumentParser(description="Upload file_search_results predictions and link them to results.")
    parser.add_argument("--apply", action="store_true", help="Apply uploads/updates (default: dry-run)")
    args = parser.parse_args()

    if not FILE_SEARCH_DIR.exists():
        raise FileNotFoundError(f"Missing directory: {FILE_SEARCH_DIR}")

    api = HfApi(token=TOKEN)

    # Validate every local file up front so nothing is uploaded when the
    # mapping is only partially satisfiable.
    actions = []
    for result_path, local_name in MISSING_RESULTS.items():
        local_file = FILE_SEARCH_DIR / local_name
        if not local_file.exists():
            raise FileNotFoundError(f"Missing local prediction file: {local_file}")

        pred_path = _pred_path_from_result(result_path)
        actions.append((result_path, pred_path, local_file))

    print(f"Planned uploads: {len(actions)}")
    for result_path, pred_path, local_file in actions:
        print(f"- {local_file.name} -> {pred_path}")

    if not args.apply:
        print("\nDry-run only. Re-run with --apply to execute.")
        return 0

    for result_path, pred_path, local_file in actions:
        # Fetch and parse the results JSON first. If it is missing on the Hub
        # this fails before the predictions file is uploaded, so no orphaned
        # predictions file is left behind.
        local_result = hf_hub_download(
            repo_id=RESULTS_REPO,
            filename=result_path,
            repo_type="dataset",
            token=TOKEN,
        )
        # Read as UTF-8 explicitly instead of relying on the platform's
        # locale encoding.
        with open(local_result, encoding="utf-8") as f:
            data = json.load(f)

        # Upload predictions file
        api.upload_file(
            path_or_fileobj=str(local_file),
            path_in_repo=pred_path,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            token=TOKEN,
            commit_message=f"Add predictions for {pred_path}",
        )

        # Update results JSON with linkage fields
        data["source_predictions_file"] = pred_path
        data["result_file_path"] = result_path
        api.upload_file(
            path_or_fileobj=json.dumps(data, indent=2).encode("utf-8"),
            path_in_repo=result_path,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            token=TOKEN,
            commit_message=f"Link result to predictions: {result_path}",
        )

    print("Done.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
eval/metrics.py CHANGED
@@ -328,13 +328,17 @@ def _get_gemini_model():
328
  def _call_gemini_with_timeout(model, prompt, timeout=30):
329
  """Call Gemini with a timeout using threading."""
330
  from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
 
 
 
 
331
 
332
  def _call():
333
  return model.generate_content(
334
  prompt,
335
  tools=[_LLM_JUDGE_TOOL],
336
  tool_config={"function_calling_config": {"mode": "ANY"}},
337
- generation_config={"temperature": 0},
338
  request_options={"timeout": timeout},
339
  )
340
 
 
328
  def _call_gemini_with_timeout(model, prompt, timeout=30):
329
  """Call Gemini with a timeout using threading."""
330
  from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
331
+ try:
332
+ temperature = float(os.environ.get("GEMINI_TEMPERATURE", "0"))
333
+ except ValueError:
334
+ temperature = 0.0
335
 
336
  def _call():
337
  return model.generate_content(
338
  prompt,
339
  tools=[_LLM_JUDGE_TOOL],
340
  tool_config={"function_calling_config": {"mode": "ANY"}},
341
+ generation_config={"temperature": temperature},
342
  request_options={"timeout": timeout},
343
  )
344
 
eval/reevaluate_submissions.py CHANGED
@@ -30,6 +30,7 @@ from metrics import (
30
  # Config
31
  RESULTS_REPO = "agentic-document-ai/backend-results"
32
  TOKEN = os.environ.get("HF_TOKEN")
 
33
 
34
 
35
  def load_gold_data():
 
30
  # Config
31
  RESULTS_REPO = "agentic-document-ai/backend-results"
32
  TOKEN = os.environ.get("HF_TOKEN")
33
+ os.environ.setdefault("GEMINI_TEMPERATURE", "0")
34
 
35
 
36
  def load_gold_data():
eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_results_20260110_131629.json CHANGED
@@ -15,28 +15,28 @@
15
  "submission_date": "2026-01-10T13:16:29.905067+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 59.40452333237904,
19
  "semantic_ci": [
20
- 54.988846362852016,
21
- 63.82020030190607
22
  ],
23
- "anls": 57.61603118163428,
24
- "page_f1": 58.72697776505391,
25
- "doc_f1": 80.62601393262716,
26
- "kuiper": 34.73747494989995
27
  },
28
  "single_evidence": {
29
- "semantic": 65.45750168199147,
30
- "anls": 63.05477065574925,
31
- "n": 364
32
  },
33
  "multi_evidence_same_doc": {
34
  "semantic": 36.443148688046655,
35
- "anls": 36.39125352974683,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
- "semantic": 54.02160864345738,
40
  "anls": 53.75681851851068,
41
  "n": 51
42
  },
@@ -52,33 +52,33 @@
52
  "n": 22
53
  },
54
  "Events": {
55
- "semantic": 68.02721088435374,
56
- "anls": 66.4976376669925,
57
  "n": 24
58
  },
59
  "Financial": {
60
- "semantic": 45.474711623779946,
61
- "anls": 49.53672826145982,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
65
- "semantic": 63.775510204081634,
66
  "anls": 57.39996898263027,
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
  "semantic": 62.961354754667845,
71
- "anls": 60.56474101398679,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
- "semantic": 59.731209556993534,
76
- "anls": 53.957859669066565,
77
  "n": 41
78
  },
79
  "Legal": {
80
- "semantic": 61.69909824394873,
81
- "anls": 54.79335264218985,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
@@ -91,13 +91,18 @@
91
  "anls": 73.07522250524337,
92
  "n": 24
93
  },
 
 
 
 
 
94
  "Reference": {
95
- "semantic": 61.813186813186825,
96
- "anls": 63.19327183267644,
97
  "n": 52
98
  },
99
  "Reports": {
100
- "semantic": 57.82312925170068,
101
  "anls": 53.11616787903517,
102
  "n": 75
103
  },
@@ -107,10 +112,10 @@
107
  "n": 23
108
  }
109
  },
110
- "n_evaluated": 499,
111
- "n_unmatched": 1767
112
  },
113
- "reevaluated_date": "2026-01-16T15:38:39.375804+00:00",
114
  "source_predictions_file": "Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_predictions_20260110_131629.jsonl",
115
  "result_file_path": "Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_results_20260110_131629.json"
116
  }
 
15
  "submission_date": "2026-01-10T13:16:29.905067+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 60.30612244897959,
19
  "semantic_ci": [
20
+ 55.908758540500614,
21
+ 64.70348635745857
22
  ],
23
+ "anls": 58.219456137310175,
24
+ "page_f1": 58.609523809523814,
25
+ "doc_f1": 80.4647619047619,
26
+ "kuiper": 36.59000000000024
27
  },
28
  "single_evidence": {
29
+ "semantic": 66.53620352250489,
30
+ "anls": 63.67469870606113,
31
+ "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
  "semantic": 36.443148688046655,
35
+ "anls": 37.224586863080155,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
+ "semantic": 55.022008803521395,
40
  "anls": 53.75681851851068,
41
  "n": 51
42
  },
 
52
  "n": 22
53
  },
54
  "Events": {
55
+ "semantic": 72.27891156462587,
56
+ "anls": 68.58097100032585,
57
  "n": 24
58
  },
59
  "Financial": {
60
+ "semantic": 48.802129547471154,
61
+ "anls": 51.018941700195,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
65
+ "semantic": 57.39795918367348,
66
  "anls": 57.39996898263027,
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
  "semantic": 62.961354754667845,
71
+ "anls": 60.57273973419155,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
+ "semantic": 57.24240915878547,
76
+ "anls": 55.177371864188515,
77
  "n": 41
78
  },
79
  "Legal": {
80
+ "semantic": 59.32605600379688,
81
+ "anls": 55.25846892125962,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
 
91
  "anls": 73.07522250524337,
92
  "n": 24
93
  },
94
+ "Other": {
95
+ "semantic": 100.0,
96
+ "anls": 100.0,
97
+ "n": 1
98
+ },
99
  "Reference": {
100
+ "semantic": 65.73783359497645,
101
+ "anls": 63.24305900202882,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 59.18367346938776,
106
  "anls": 53.11616787903517,
107
  "n": 75
108
  },
 
112
  "n": 23
113
  }
114
  },
115
+ "n_evaluated": 500,
116
+ "n_unmatched": 1766
117
  },
118
+ "reevaluated_date": "2026-01-22T21:56:11.212274+00:00",
119
  "source_predictions_file": "Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_predictions_20260110_131629.jsonl",
120
  "result_file_path": "Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_results_20260110_131629.json"
121
  }
eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_results_20260110_132054.json CHANGED
@@ -15,28 +15,28 @@
15
  "submission_date": "2026-01-10T13:20:54.125677+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 57.87084372827287,
19
  "semantic_ci": [
20
- 53.43484625371868,
21
- 62.30684120282706
22
  ],
23
- "anls": 57.937064653000625,
24
- "page_f1": 54.83061360816872,
25
- "doc_f1": 78.76514934631167,
26
- "kuiper": 36.665330661322855
27
  },
28
  "single_evidence": {
29
- "semantic": 65.03700381251402,
30
- "anls": 64.58920167676482,
31
- "n": 364
32
  },
33
  "multi_evidence_same_doc": {
34
- "semantic": 27.332361516034982,
35
- "anls": 29.952414571366546,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
- "semantic": 57.022809123649466,
40
  "anls": 56.55143191196322,
41
  "n": 51
42
  },
@@ -53,12 +53,12 @@
53
  },
54
  "Events": {
55
  "semantic": 80.78231292517005,
56
- "anls": 77.14578581514066,
57
  "n": 24
58
  },
59
  "Financial": {
60
- "semantic": 46.58385093167702,
61
- "anls": 47.03746065646829,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
@@ -67,13 +67,13 @@
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
- "semantic": 53.19148936170214,
71
- "anls": 56.60682613221971,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
- "semantic": 63.46441015430564,
76
- "anls": 61.11435746903807,
77
  "n": 41
78
  },
79
  "Legal": {
@@ -82,7 +82,7 @@
82
  "n": 43
83
  },
84
  "Media/Publishing": {
85
- "semantic": 40.81632653061224,
86
  "anls": 54.65844817149165,
87
  "n": 25
88
  },
@@ -91,26 +91,31 @@
91
  "anls": 73.59601449275362,
92
  "n": 24
93
  },
 
 
 
 
 
94
  "Reference": {
95
- "semantic": 64.75667189952904,
96
- "anls": 68.57667578882189,
97
  "n": 52
98
  },
99
  "Reports": {
100
- "semantic": 57.14285714285715,
101
  "anls": 56.44955119487462,
102
  "n": 75
103
  },
104
  "Technical": {
105
- "semantic": 57.67524401064772,
106
  "anls": 51.60498619336015,
107
  "n": 23
108
  }
109
  },
110
- "n_evaluated": 499,
111
- "n_unmatched": 1767
112
  },
113
- "reevaluated_date": "2026-01-16T15:40:22.888859+00:00",
114
  "source_predictions_file": "Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_predictions_20260110_132054.jsonl",
115
  "result_file_path": "Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_results_20260110_132054.json"
116
  }
 
15
  "submission_date": "2026-01-10T13:20:54.125677+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 58.77551020408164,
19
  "semantic_ci": [
20
+ 54.35541948342135,
21
+ 63.195600924741925
22
  ],
23
+ "anls": 58.50093245097659,
24
+ "page_f1": 54.72095238095238,
25
+ "doc_f1": 78.60761904761905,
26
+ "kuiper": 38.259999999999735
27
  },
28
  "single_evidence": {
29
+ "semantic": 65.97707576181158,
30
+ "anls": 65.20641198351612,
31
+ "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
+ "semantic": 27.939747327502424,
35
+ "anls": 30.547652666604648,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
+ "semantic": 58.023209283713484,
40
  "anls": 56.55143191196322,
41
  "n": 51
42
  },
 
53
  },
54
  "Events": {
55
  "semantic": 80.78231292517005,
56
+ "anls": 79.22911914847398,
57
  "n": 24
58
  },
59
  "Financial": {
60
+ "semantic": 48.802129547471154,
61
+ "anls": 48.51967409520347,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
 
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
+ "semantic": 51.02040816326531,
71
+ "anls": 56.616157972458616,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
+ "semantic": 64.70881035340967,
76
+ "anls": 62.33386966416003,
77
  "n": 41
78
  },
79
  "Legal": {
 
82
  "n": 43
83
  },
84
  "Media/Publishing": {
85
+ "semantic": 42.85714285714285,
86
  "anls": 54.65844817149165,
87
  "n": 25
88
  },
 
91
  "anls": 73.59601449275362,
92
  "n": 24
93
  },
94
+ "Other": {
95
+ "semantic": 100.0,
96
+ "anls": 100.0,
97
+ "n": 1
98
+ },
99
  "Reference": {
100
+ "semantic": 65.73783359497645,
101
+ "anls": 68.63568984240118,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 57.82312925170068,
106
  "anls": 56.44955119487462,
107
  "n": 75
108
  },
109
  "Technical": {
110
+ "semantic": 62.111801242236034,
111
  "anls": 51.60498619336015,
112
  "n": 23
113
  }
114
  },
115
+ "n_evaluated": 500,
116
+ "n_unmatched": 1766
117
  },
118
+ "reevaluated_date": "2026-01-22T21:57:09.736897+00:00",
119
  "source_predictions_file": "Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_predictions_20260110_132054.jsonl",
120
  "result_file_path": "Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_results_20260110_132054.json"
121
  }
eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_results_20260110_132358.json CHANGED
@@ -15,29 +15,29 @@
15
  "submission_date": "2026-01-10T13:23:58.123387+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 46.623859964827616,
19
  "semantic_ci": [
20
- 42.1639790166841,
21
- 51.08374091297113
22
  ],
23
- "anls": 45.43424080850834,
24
- "page_f1": 47.685529789738204,
25
- "doc_f1": 69.57247828991316,
26
- "kuiper": 48.63927855711413
27
  },
28
  "single_evidence": {
29
- "semantic": 54.10405920610001,
30
- "anls": 51.808356346955755,
31
- "n": 364
32
  },
33
  "multi_evidence_same_doc": {
34
- "semantic": 16.399416909621,
35
- "anls": 18.845466696680308,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
- "semantic": 43.01720688275311,
40
- "anls": 43.73382844377694,
41
  "n": 51
42
  },
43
  "by_domain": {
@@ -52,13 +52,13 @@
52
  "n": 22
53
  },
54
  "Events": {
55
- "semantic": 59.52380952380955,
56
  "anls": 54.598842018196855,
57
  "n": 24
58
  },
59
  "Financial": {
60
- "semantic": 29.94676131322094,
61
- "anls": 29.932472094079802,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
@@ -68,12 +68,12 @@
68
  },
69
  "Government/Regulatory": {
70
  "semantic": 44.507164567954845,
71
- "anls": 44.439106365198185,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
- "semantic": 47.28720756595321,
76
- "anls": 50.16056789323261,
77
  "n": 41
78
  },
79
  "Legal": {
@@ -87,18 +87,23 @@
87
  "n": 25
88
  },
89
  "Misc": {
90
- "semantic": 59.52380952380955,
91
  "anls": 59.60305559882987,
92
  "n": 24
93
  },
 
 
 
 
 
94
  "Reference": {
95
- "semantic": 43.171114599686035,
96
- "anls": 50.44584246011934,
97
  "n": 52
98
  },
99
  "Reports": {
100
- "semantic": 57.14285714285715,
101
- "anls": 50.000135852648256,
102
  "n": 75
103
  },
104
  "Technical": {
@@ -107,10 +112,10 @@
107
  "n": 23
108
  }
109
  },
110
- "n_evaluated": 499,
111
- "n_unmatched": 1767
112
  },
113
- "reevaluated_date": "2026-01-16T15:41:38.978677+00:00",
114
  "source_predictions_file": "Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_predictions_20260110_132358.jsonl",
115
  "result_file_path": "Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_results_20260110_132358.json"
116
  }
 
15
  "submission_date": "2026-01-10T13:23:58.123387+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 47.3469387755102,
19
  "semantic_ci": [
20
+ 42.886491622183534,
21
+ 51.807385928836865
22
  ],
23
+ "anls": 45.99676623639152,
24
+ "page_f1": 47.590158730158734,
25
+ "doc_f1": 69.43333333333334,
26
+ "kuiper": 50.216000000000136
27
  },
28
  "single_evidence": {
29
+ "semantic": 54.794520547945204,
30
+ "anls": 52.49777090665106,
31
+ "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
+ "semantic": 17.006802721088444,
35
+ "anls": 18.782788634002245,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
+ "semantic": 44.01760704281713,
40
+ "anls": 44.29299003944969,
41
  "n": 51
42
  },
43
  "by_domain": {
 
52
  "n": 22
53
  },
54
  "Events": {
55
+ "semantic": 63.775510204081634,
56
  "anls": 54.598842018196855,
57
  "n": 24
58
  },
59
  "Financial": {
60
+ "semantic": 31.055900621118017,
61
+ "anls": 31.32939863081144,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
 
68
  },
69
  "Government/Regulatory": {
70
  "semantic": 44.507164567954845,
71
+ "anls": 44.44710508540295,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
+ "semantic": 49.77600796416127,
76
+ "anls": 52.59959228347652,
77
  "n": 41
78
  },
79
  "Legal": {
 
87
  "n": 25
88
  },
89
  "Misc": {
90
+ "semantic": 61.64965986394556,
91
  "anls": 59.60305559882987,
92
  "n": 24
93
  },
94
+ "Other": {
95
+ "semantic": 100.0,
96
+ "anls": 100.0,
97
+ "n": 1
98
+ },
99
  "Reference": {
100
+ "semantic": 44.15227629513344,
101
+ "anls": 50.50485651369861,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 55.78231292517006,
106
+ "anls": 49.929936422448826,
107
  "n": 75
108
  },
109
  "Technical": {
 
112
  "n": 23
113
  }
114
  },
115
+ "n_evaluated": 500,
116
+ "n_unmatched": 1766
117
  },
118
+ "reevaluated_date": "2026-01-22T21:58:16.385696+00:00",
119
  "source_predictions_file": "Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_predictions_20260110_132358.jsonl",
120
  "result_file_path": "Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_results_20260110_132358.json"
121
  }
eval/reevaluated_results/Anthropic/Claude_Haiku_4.5_(2025-10-01)_results_20260109_130319.json CHANGED
@@ -15,24 +15,24 @@
15
  "submission_date": "2026-01-09T13:03:19.649656+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 67.24489795918367,
19
  "semantic_ci": [
20
- 63.00498530397807,
21
- 71.48481061438929
22
  ],
23
- "anls": 61.60747574238133,
24
  "page_f1": 72.02476190476192,
25
  "doc_f1": 88.24761904761905,
26
- "kuiper": 51.64257028112422
27
  },
28
  "single_evidence": {
29
- "semantic": 72.54682694995807,
30
- "anls": 66.17043328346772,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
- "semantic": 45.55393586005831,
35
- "anls": 42.55992435652824,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
@@ -42,7 +42,7 @@
42
  },
43
  "by_domain": {
44
  "Cases/Logs": {
45
- "semantic": 71.42857142857143,
46
  "anls": 63.92691050779287,
47
  "n": 15
48
  },
@@ -52,23 +52,23 @@
52
  "n": 22
53
  },
54
  "Events": {
55
- "semantic": 87.15986394557822,
56
  "anls": 72.62325637325637,
57
  "n": 24
58
  },
59
  "Financial": {
60
- "semantic": 57.12067435669922,
61
- "anls": 54.29593695395653,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
65
- "semantic": 73.34183673469387,
66
  "anls": 68.77016129032259,
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
- "semantic": 64.04689535388624,
71
- "anls": 62.779826338896896,
72
  "n": 47
73
  },
74
  "HR/Employment": {
@@ -82,7 +82,7 @@
82
  "n": 43
83
  },
84
  "Media/Publishing": {
85
- "semantic": 48.97959183673469,
86
  "anls": 41.69842237151431,
87
  "n": 25
88
  },
@@ -97,17 +97,17 @@
97
  "n": 1
98
  },
99
  "Reference": {
100
- "semantic": 61.813186813186825,
101
- "anls": 64.3484267705628,
102
  "n": 52
103
  },
104
  "Reports": {
105
- "semantic": 79.59183673469387,
106
- "anls": 65.36479556179735,
107
  "n": 75
108
  },
109
  "Technical": {
110
- "semantic": 77.63975155279503,
111
  "anls": 64.75817505570946,
112
  "n": 23
113
  }
@@ -115,7 +115,7 @@
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
- "reevaluated_date": "2026-01-16T15:42:51.557033+00:00",
119
  "source_predictions_file": "Anthropic/Claude_Haiku_4.5_(2025-10-01)_predictions_20260109_130319.jsonl",
120
  "result_file_path": "Anthropic/Claude_Haiku_4.5_(2025-10-01)_results_20260109_130319.json"
121
  }
 
15
  "submission_date": "2026-01-09T13:03:19.649656+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 68.16326530612247,
19
  "semantic_ci": [
20
+ 63.95120785444595,
21
+ 72.37532275779898
22
  ],
23
+ "anls": 61.45994021171436,
24
  "page_f1": 72.02476190476192,
25
  "doc_f1": 88.24761904761905,
26
+ "kuiper": 50.710843373494285
27
  },
28
  "single_evidence": {
29
+ "semantic": 73.24573665082471,
30
+ "anls": 66.15560605815631,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
+ "semantic": 47.9834791059281,
35
+ "anls": 41.74616449825649,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
 
42
  },
43
  "by_domain": {
44
  "Cases/Logs": {
45
+ "semantic": 68.02721088435374,
46
  "anls": 63.92691050779287,
47
  "n": 15
48
  },
 
52
  "n": 22
53
  },
54
  "Events": {
55
+ "semantic": 89.28571428571429,
56
  "anls": 72.62325637325637,
57
  "n": 24
58
  },
59
  "Financial": {
60
+ "semantic": 59.89352262644188,
61
+ "anls": 54.1464804322174,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
65
+ "semantic": 76.53061224489795,
66
  "anls": 68.77016129032259,
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
+ "semantic": 62.961354754667845,
71
+ "anls": 61.67613431405342,
72
  "n": 47
73
  },
74
  "HR/Employment": {
 
82
  "n": 43
83
  },
84
  "Media/Publishing": {
85
+ "semantic": 51.02040816326531,
86
  "anls": 41.69842237151431,
87
  "n": 25
88
  },
 
97
  "n": 1
98
  },
99
  "Reference": {
100
+ "semantic": 64.75667189952904,
101
+ "anls": 64.40744082414207,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 80.95238095238096,
106
+ "anls": 65.21528928243777,
107
  "n": 75
108
  },
109
  "Technical": {
110
+ "semantic": 73.20319432120675,
111
  "anls": 64.75817505570946,
112
  "n": 23
113
  }
 
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
+ "reevaluated_date": "2026-01-22T21:59:23.189394+00:00",
119
  "source_predictions_file": "Anthropic/Claude_Haiku_4.5_(2025-10-01)_predictions_20260109_130319.jsonl",
120
  "result_file_path": "Anthropic/Claude_Haiku_4.5_(2025-10-01)_results_20260109_130319.json"
121
  }
eval/reevaluated_results/Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_125816.json CHANGED
@@ -15,24 +15,24 @@
15
  "submission_date": "2026-01-09T12:58:16.611348+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 79.18367346938776,
19
  "semantic_ci": [
20
- 75.45467776603695,
21
- 82.91266917273857
22
  ],
23
- "anls": 71.74787642305597,
24
  "page_f1": 79.12333333333333,
25
  "doc_f1": 92.98636363636363,
26
- "kuiper": 35.64777327935229
27
  },
28
  "single_evidence": {
29
- "semantic": 82.05199888174448,
30
- "anls": 74.4069329276591,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
- "semantic": 64.99028182701652,
35
- "anls": 60.572680530273026,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
@@ -42,8 +42,8 @@
42
  },
43
  "by_domain": {
44
  "Cases/Logs": {
45
- "semantic": 78.2312925170068,
46
- "anls": 69.51092117758785,
47
  "n": 15
48
  },
49
  "Education": {
@@ -57,27 +57,27 @@
57
  "n": 24
58
  },
59
  "Financial": {
60
- "semantic": 72.09405501330967,
61
- "anls": 66.81148919563769,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
65
- "semantic": 70.1530612244898,
66
  "anls": 76.26728110599078,
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
- "semantic": 87.92878853669127,
71
- "anls": 74.90457714355891,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
  "semantic": 77.15281234444997,
76
- "anls": 72.85160396238213,
77
  "n": 41
78
  },
79
  "Legal": {
80
- "semantic": 90.17560512577124,
81
  "anls": 72.74221043114129,
82
  "n": 43
83
  },
@@ -93,21 +93,21 @@
93
  },
94
  "Other": {
95
  "semantic": 0.0,
96
- "anls": 0.0,
97
  "n": 1
98
  },
99
  "Reference": {
100
- "semantic": 70.6436420722135,
101
- "anls": 72.21619612753193,
102
  "n": 52
103
  },
104
  "Reports": {
105
- "semantic": 85.0340136054422,
106
- "anls": 74.0536995032274,
107
  "n": 75
108
  },
109
  "Technical": {
110
- "semantic": 70.9849157054126,
111
  "anls": 60.23577215564363,
112
  "n": 23
113
  }
@@ -115,7 +115,7 @@
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
- "reevaluated_date": "2026-01-16T15:45:07.483636+00:00",
119
  "source_predictions_file": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_predictions_20260109_125816.jsonl",
120
  "result_file_path": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_125816.json"
121
  }
 
15
  "submission_date": "2026-01-09T12:58:16.611348+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 80.61224489795919,
19
  "semantic_ci": [
20
+ 76.96923621662829,
21
+ 84.25525357929008
22
  ],
23
+ "anls": 72.85884587946542,
24
  "page_f1": 79.12333333333333,
25
  "doc_f1": 92.98636363636363,
26
+ "kuiper": 35.0526315789474
27
  },
28
  "single_evidence": {
29
+ "semantic": 83.58960022365112,
30
+ "anls": 75.8229313455021,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
+ "semantic": 66.81243926141885,
35
+ "anls": 61.03274369327326,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
 
42
  },
43
  "by_domain": {
44
  "Cases/Logs": {
45
+ "semantic": 85.0340136054422,
46
+ "anls": 76.17758784425452,
47
  "n": 15
48
  },
49
  "Education": {
 
57
  "n": 24
58
  },
59
  "Financial": {
60
+ "semantic": 73.75776397515527,
61
+ "anls": 69.52764532211991,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
65
+ "semantic": 76.53061224489795,
66
  "anls": 76.26728110599078,
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
+ "semantic": 86.84324793747285,
71
+ "anls": 76.0805203770741,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
  "semantic": 77.15281234444997,
76
+ "anls": 74.07111615750408,
77
  "n": 41
78
  },
79
  "Legal": {
80
+ "semantic": 91.36212624584718,
81
  "anls": 72.74221043114129,
82
  "n": 43
83
  },
 
93
  },
94
  "Other": {
95
  "semantic": 0.0,
96
+ "anls": 76.92307692307692,
97
  "n": 1
98
  },
99
  "Reference": {
100
+ "semantic": 72.60596546310833,
101
+ "anls": 72.8846701655712,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 87.07482993197281,
106
+ "anls": 73.90230357912102,
107
  "n": 75
108
  },
109
  "Technical": {
110
+ "semantic": 75.42147293700089,
111
  "anls": 60.23577215564363,
112
  "n": 23
113
  }
 
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
+ "reevaluated_date": "2026-01-22T22:00:12.094993+00:00",
119
  "source_predictions_file": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_predictions_20260109_125816.jsonl",
120
  "result_file_path": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_125816.json"
121
  }
eval/reevaluated_results/Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_182559.json CHANGED
@@ -15,28 +15,28 @@
15
  "submission_date": "2026-01-09T18:25:59.636344+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 58.06122448979592,
19
  "semantic_ci": [
20
- 53.63195829834202,
21
- 62.49049068124981
22
  ],
23
- "anls": 55.486869478144165,
24
  "page_f1": 60.9663492063492,
25
  "doc_f1": 78.82920634920634,
26
- "kuiper": 45.71999999999974
27
  },
28
  "single_evidence": {
29
- "semantic": 64.71903830025161,
30
- "anls": 60.42073737602547,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
- "semantic": 30.976676384839653,
35
- "anls": 33.035531481450356,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
- "semantic": 55.022008803521395,
40
  "anls": 57.1545284780579,
41
  "n": 51
42
  },
@@ -52,13 +52,13 @@
52
  "n": 22
53
  },
54
  "Events": {
55
- "semantic": 82.90816326530613,
56
- "anls": 76.85643564356435,
57
  "n": 24
58
  },
59
  "Financial": {
60
- "semantic": 42.701863354037265,
61
- "anls": 40.952902757926644,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
@@ -68,26 +68,26 @@
68
  },
69
  "Government/Regulatory": {
70
  "semantic": 66.21797655232307,
71
- "anls": 67.70262933196864,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
- "semantic": 59.731209556993534,
76
  "anls": 60.95035529628296,
77
  "n": 41
78
  },
79
  "Legal": {
80
- "semantic": 55.766492643569066,
81
  "anls": 51.45105745077384,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
85
- "semantic": 48.97959183673469,
86
  "anls": 54.40739778239778,
87
  "n": 25
88
  },
89
  "Misc": {
90
- "semantic": 76.53061224489795,
91
  "anls": 73.82172131147541,
92
  "n": 24
93
  },
@@ -97,17 +97,17 @@
97
  "n": 1
98
  },
99
  "Reference": {
100
- "semantic": 60.83202511773942,
101
- "anls": 64.46714691613596,
102
  "n": 52
103
  },
104
  "Reports": {
105
- "semantic": 54.42176870748299,
106
  "anls": 45.47473759975617,
107
  "n": 75
108
  },
109
  "Technical": {
110
- "semantic": 51.02040816326531,
111
  "anls": 35.96181299748582,
112
  "n": 23
113
  }
@@ -115,7 +115,7 @@
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
- "reevaluated_date": "2026-01-16T15:48:34.415858+00:00",
119
  "source_predictions_file": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_predictions_20260109_182559.jsonl",
120
  "result_file_path": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_182559.json"
121
  }
 
15
  "submission_date": "2026-01-09T18:25:59.636344+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 58.46938775510204,
19
  "semantic_ci": [
20
+ 54.04525386645418,
21
+ 62.89352164374991
22
  ],
23
+ "anls": 55.989429815086645,
24
  "page_f1": 60.9663492063492,
25
  "doc_f1": 78.82920634920634,
26
+ "kuiper": 46.50600000000012
27
  },
28
  "single_evidence": {
29
+ "semantic": 65.27816606094493,
30
+ "anls": 60.972189892385046,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
+ "semantic": 30.36929057337221,
35
+ "anls": 33.63076957668846,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
+ "semantic": 56.022408963585434,
40
  "anls": 57.1545284780579,
41
  "n": 51
42
  },
 
52
  "n": 22
53
  },
54
  "Events": {
55
+ "semantic": 85.0340136054422,
56
+ "anls": 78.93976897689768,
57
  "n": 24
58
  },
59
  "Financial": {
60
+ "semantic": 42.14729370008874,
61
+ "anls": 43.10328860396467,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
 
68
  },
69
  "Government/Regulatory": {
70
  "semantic": 66.21797655232307,
71
+ "anls": 67.7106280521734,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
+ "semantic": 60.97560975609757,
76
  "anls": 60.95035529628296,
77
  "n": 41
78
  },
79
  "Legal": {
80
+ "semantic": 53.393450403417184,
81
  "anls": 51.45105745077384,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
85
+ "semantic": 53.06122448979592,
86
  "anls": 54.40739778239778,
87
  "n": 25
88
  },
89
  "Misc": {
90
+ "semantic": 78.65646258503402,
91
  "anls": 73.82172131147541,
92
  "n": 24
93
  },
 
97
  "n": 1
98
  },
99
  "Reference": {
100
+ "semantic": 62.79434850863422,
101
+ "anls": 64.52616096971524,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 55.102040816326536,
106
  "anls": 45.47473759975617,
107
  "n": 75
108
  },
109
  "Technical": {
110
+ "semantic": 48.802129547471154,
111
  "anls": 35.96181299748582,
112
  "n": 23
113
  }
 
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
+ "reevaluated_date": "2026-01-22T22:01:09.960296+00:00",
119
  "source_predictions_file": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_predictions_20260109_182559.jsonl",
120
  "result_file_path": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_182559.json"
121
  }
eval/reevaluated_results/Google/Gemini_2.5_Flash_with_File_Search_results_20260103_221253.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Gemini 2.5 Flash with File Search",
3
+ "organization": "Google",
4
+ "description": "Managed, single-shot retrieval mechanism.",
5
+ "link": "https://ai.google.dev/gemini-api/docs/file-search",
6
+ "tags": [
7
+ "Conventional RAG",
8
+ "Semantic Search Tool"
9
+ ],
10
+ "submitted_by": "Borchmann",
11
+ "metadata": {
12
+ "model_type": "api"
13
+ },
14
+ "submission_date": "2026-01-03T22:12:53.645813+00:00",
15
+ "results": {
16
+ "overall": {
17
+ "semantic": 71.83673469387755,
18
+ "semantic_ci": [
19
+ 67.75383041519983,
20
+ 75.91963897255526
21
+ ],
22
+ "anls": 56.38605375030021,
23
+ "page_f1": 52.15333333333333,
24
+ "doc_f1": 80.91445887445887,
25
+ "kuiper": 14.495999999999947
26
+ },
27
+ "single_evidence": {
28
+ "semantic": 74.0844282918647,
29
+ "anls": 59.70060588908858,
30
+ "n": 365
31
+ },
32
+ "multi_evidence_same_doc": {
33
+ "semantic": 61.34596695821186,
34
+ "anls": 44.2307193586438,
35
+ "n": 84
36
+ },
37
+ "multi_evidence_multi_doc": {
38
+ "semantic": 73.02921168467388,
39
+ "anls": 52.68480979424892,
40
+ "n": 51
41
+ },
42
+ "by_domain": {
43
+ "Cases/Logs": {
44
+ "semantic": 78.2312925170068,
45
+ "anls": 76.85185185185186,
46
+ "n": 15
47
+ },
48
+ "Education": {
49
+ "semantic": 83.4879406307978,
50
+ "anls": 59.32605273514364,
51
+ "n": 22
52
+ },
53
+ "Events": {
54
+ "semantic": 87.15986394557822,
55
+ "anls": 67.94733044733044,
56
+ "n": 24
57
+ },
58
+ "Financial": {
59
+ "semantic": 64.88464951197871,
60
+ "anls": 53.16591091793465,
61
+ "n": 92
62
+ },
63
+ "Financial/Tax": {
64
+ "semantic": 60.58673469387755,
65
+ "anls": 45.19230769230769,
66
+ "n": 16
67
+ },
68
+ "Government/Regulatory": {
69
+ "semantic": 68.38905775075989,
70
+ "anls": 52.69755357661292,
71
+ "n": 47
72
+ },
73
+ "HR/Employment": {
74
+ "semantic": 70.93081134892981,
75
+ "anls": 56.2715906011529,
76
+ "n": 41
77
+ },
78
+ "Legal": {
79
+ "semantic": 85.4295206454675,
80
+ "anls": 63.338167000957704,
81
+ "n": 43
82
+ },
83
+ "Media/Publishing": {
84
+ "semantic": 65.30612244897961,
85
+ "anls": 61.1117685382045,
86
+ "n": 25
87
+ },
88
+ "Misc": {
89
+ "semantic": 70.1530612244898,
90
+ "anls": 60.28005464480874,
91
+ "n": 24
92
+ },
93
+ "Other": {
94
+ "semantic": 51.02040816326531,
95
+ "anls": 59.09090909090908,
96
+ "n": 1
97
+ },
98
+ "Reference": {
99
+ "semantic": 72.60596546310833,
100
+ "anls": 56.51193666615843,
101
+ "n": 52
102
+ },
103
+ "Reports": {
104
+ "semantic": 69.38775510204084,
105
+ "anls": 50.21750576702014,
106
+ "n": 75
107
+ },
108
+ "Technical": {
109
+ "semantic": 75.42147293700089,
110
+ "anls": 54.08686051995165,
111
+ "n": 23
112
+ }
113
+ },
114
+ "n_evaluated": 500,
115
+ "n_unmatched": 0
116
+ },
117
+ "reevaluated_date": "2026-01-22T22:02:26.923804+00:00",
118
+ "source_predictions_file": "Google/Gemini_2.5_Flash_with_File_Search_predictions_20260103_221253.jsonl",
119
+ "result_file_path": "Google/Gemini_2.5_Flash_with_File_Search_results_20260103_221253.json"
120
+ }
eval/reevaluated_results/Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_183030.json CHANGED
@@ -15,28 +15,28 @@
15
  "submission_date": "2026-01-09T18:30:30.608183+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 60.20408163265305,
19
  "semantic_ci": [
20
- 55.80507165182195,
21
- 64.60309161348415
22
  ],
23
- "anls": 55.97919862778078,
24
  "page_f1": 60.299220779220775,
25
  "doc_f1": 74.23636363636363,
26
- "kuiper": 40.40399999999971
27
  },
28
  "single_evidence": {
29
- "semantic": 68.0738048644115,
30
- "anls": 61.04578922425167,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
- "semantic": 31.584062196307094,
35
- "anls": 36.778404555871965,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
- "semantic": 51.02040816326531,
40
  "anls": 51.343142438142856,
41
  "n": 51
42
  },
@@ -52,13 +52,13 @@
52
  "n": 22
53
  },
54
  "Events": {
55
- "semantic": 68.02721088435374,
56
- "anls": 62.67819322254806,
57
  "n": 24
58
  },
59
  "Financial": {
60
- "semantic": 47.6929902395741,
61
- "anls": 48.11929370300614,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
@@ -67,23 +67,23 @@
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
- "semantic": 68.38905775075989,
71
- "anls": 64.23333377770668,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
- "semantic": 59.731209556993534,
76
- "anls": 48.92979153124233,
77
  "n": 41
78
  },
79
  "Legal": {
80
- "semantic": 68.81822496440435,
81
  "anls": 60.44220952048519,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
85
- "semantic": 48.97959183673469,
86
- "anls": 52.95641025641026,
87
  "n": 25
88
  },
89
  "Misc": {
@@ -93,21 +93,21 @@
93
  },
94
  "Other": {
95
  "semantic": 0.0,
96
- "anls": 0.0,
97
  "n": 1
98
  },
99
  "Reference": {
100
- "semantic": 64.75667189952904,
101
- "anls": 63.5134680018838,
102
  "n": 52
103
  },
104
  "Reports": {
105
- "semantic": 62.58503401360544,
106
- "anls": 54.81415365192609,
107
  "n": 75
108
  },
109
  "Technical": {
110
- "semantic": 46.58385093167702,
111
  "anls": 40.50127359810298,
112
  "n": 23
113
  }
@@ -115,7 +115,7 @@
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
- "reevaluated_date": "2026-01-16T15:51:00.509214+00:00",
119
  "source_predictions_file": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_predictions_20260109_183030.jsonl",
120
  "result_file_path": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_183030.json"
121
  }
 
15
  "submission_date": "2026-01-09T18:30:30.608183+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 60.10204081632653,
19
  "semantic_ci": [
20
+ 55.70140355733895,
21
+ 64.50267807531411
22
  ],
23
+ "anls": 56.41686335308852,
24
  "page_f1": 60.299220779220775,
25
  "doc_f1": 74.23636363636363,
26
+ "kuiper": 38.901999999999795
27
  },
28
  "single_evidence": {
29
+ "semantic": 67.0953312831982,
30
+ "anls": 61.720866213550785,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
+ "semantic": 34.62099125364433,
35
+ "anls": 36.450181479201696,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
+ "semantic": 52.02080832332933,
40
  "anls": 51.343142438142856,
41
  "n": 51
42
  },
 
52
  "n": 22
53
  },
54
  "Events": {
55
+ "semantic": 72.27891156462587,
56
+ "anls": 64.7615265558814,
57
  "n": 24
58
  },
59
  "Financial": {
60
+ "semantic": 50.46583850931676,
61
+ "anls": 49.206250224745276,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
 
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
+ "semantic": 65.13243595310465,
71
+ "anls": 62.58648616221405,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
+ "semantic": 58.4868093578895,
76
+ "anls": 49.471796951296525,
77
  "n": 41
78
  },
79
  "Legal": {
80
+ "semantic": 66.44518272425249,
81
  "anls": 60.44220952048519,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
85
+ "semantic": 44.897959183673464,
86
+ "anls": 53.44125874125874,
87
  "n": 25
88
  },
89
  "Misc": {
 
93
  },
94
  "Other": {
95
  "semantic": 0.0,
96
+ "anls": 76.92307692307692,
97
  "n": 1
98
  },
99
  "Reference": {
100
+ "semantic": 66.71899529042385,
101
+ "anls": 64.18194203992306,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 61.904761904761905,
106
+ "anls": 54.81691417642573,
107
  "n": 75
108
  },
109
  "Technical": {
110
+ "semantic": 44.365572315882865,
111
  "anls": 40.50127359810298,
112
  "n": 23
113
  }
 
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
+ "reevaluated_date": "2026-01-22T22:03:24.876553+00:00",
119
  "source_predictions_file": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_predictions_20260109_183030.jsonl",
120
  "result_file_path": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_183030.json"
121
  }
eval/reevaluated_results/Google/Gemini_2.5_Pro_with_File_Search_results_20260103_221943.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Gemini 2.5 Pro with File Search",
3
+ "organization": "Google",
4
+ "description": "Managed, single-shot retrieval mechanism.",
5
+ "link": "https://ai.google.dev/gemini-api/docs/file-search",
6
+ "tags": [
7
+ "Conventional RAG",
8
+ "Semantic Search Tool"
9
+ ],
10
+ "submitted_by": "Borchmann",
11
+ "metadata": {
12
+ "model_type": "api"
13
+ },
14
+ "submission_date": "2026-01-03T22:19:43.085381+00:00",
15
+ "results": {
16
+ "overall": {
17
+ "semantic": 73.06122448979592,
18
+ "semantic_ci": [
19
+ 69.02799722537132,
20
+ 77.0944517542205
21
+ ],
22
+ "anls": 67.06445662551177,
23
+ "page_f1": 60.936190476190475,
24
+ "doc_f1": 87.83333333333334,
25
+ "kuiper": 15.25
26
+ },
27
+ "single_evidence": {
28
+ "semantic": 75.06290187307802,
29
+ "anls": 69.51189266445841,
30
+ "n": 365
31
+ },
32
+ "multi_evidence_same_doc": {
33
+ "semantic": 70.45675413022352,
34
+ "anls": 62.81319563063059,
35
+ "n": 84
36
+ },
37
+ "multi_evidence_multi_doc": {
38
+ "semantic": 63.025210084033624,
39
+ "anls": 56.5505697501097,
40
+ "n": 51
41
+ },
42
+ "by_domain": {
43
+ "Cases/Logs": {
44
+ "semantic": 74.82993197278913,
45
+ "anls": 72.85185185185185,
46
+ "n": 15
47
+ },
48
+ "Education": {
49
+ "semantic": 83.4879406307978,
50
+ "anls": 66.82605273514365,
51
+ "n": 22
52
+ },
53
+ "Events": {
54
+ "semantic": 80.78231292517005,
55
+ "anls": 63.51613609636826,
56
+ "n": 24
57
+ },
58
+ "Financial": {
59
+ "semantic": 73.75776397515527,
60
+ "anls": 71.81492329679233,
61
+ "n": 92
62
+ },
63
+ "Financial/Tax": {
64
+ "semantic": 44.642857142857146,
65
+ "anls": 48.56036324786325,
66
+ "n": 16
67
+ },
68
+ "Government/Regulatory": {
69
+ "semantic": 82.50108554059923,
70
+ "anls": 71.42658144393866,
71
+ "n": 47
72
+ },
73
+ "HR/Employment": {
74
+ "semantic": 74.66401194624191,
75
+ "anls": 67.74502588179764,
76
+ "n": 41
77
+ },
78
+ "Legal": {
79
+ "semantic": 77.12387280493593,
80
+ "anls": 68.13676633444075,
81
+ "n": 43
82
+ },
83
+ "Media/Publishing": {
84
+ "semantic": 69.38775510204084,
85
+ "anls": 72.05791173717081,
86
+ "n": 25
87
+ },
88
+ "Misc": {
89
+ "semantic": 72.27891156462587,
90
+ "anls": 69.65505464480874,
91
+ "n": 24
92
+ },
93
+ "Other": {
94
+ "semantic": 51.02040816326531,
95
+ "anls": 59.09090909090908,
96
+ "n": 1
97
+ },
98
+ "Reference": {
99
+ "semantic": 71.62480376766091,
100
+ "anls": 72.50339016950552,
101
+ "n": 52
102
+ },
103
+ "Reports": {
104
+ "semantic": 67.34693877551022,
105
+ "anls": 59.42273394165809,
106
+ "n": 75
107
+ },
108
+ "Technical": {
109
+ "semantic": 68.76663708961844,
110
+ "anls": 53.79711579945216,
111
+ "n": 23
112
+ }
113
+ },
114
+ "n_evaluated": 500,
115
+ "n_unmatched": 0
116
+ },
117
+ "reevaluated_date": "2026-01-22T22:04:21.890237+00:00",
118
+ "source_predictions_file": "Google/Gemini_2.5_Pro_with_File_Search_predictions_20260103_221943.jsonl",
119
+ "result_file_path": "Google/Gemini_2.5_Pro_with_File_Search_results_20260103_221943.json"
120
+ }
eval/reevaluated_results/Google/Gemini_3.0_Pro_(Preview)_with_File_Search_results_20260104_120431.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Gemini 3 Pro (Preview) with File Search",
3
+ "organization": "Google",
4
+ "description": "Managed, single-shot retrieval mechanism.",
5
+ "link": "https://ai.google.dev/gemini-api/docs/file-search",
6
+ "tags": [
7
+ "Conventional RAG",
8
+ "Semantic Search Tool"
9
+ ],
10
+ "submitted_by": "Borchmann",
11
+ "metadata": {
12
+ "model_type": "api"
13
+ },
14
+ "submission_date": "2026-01-04T12:04:31.393913+00:00",
15
+ "results": {
16
+ "overall": {
17
+ "semantic": 78.57142857142857,
18
+ "semantic_ci": [
19
+ 74.80745735419877,
20
+ 82.33539978865836
21
+ ],
22
+ "anls": 68.92843881871933,
23
+ "page_f1": 70.0995238095238,
24
+ "doc_f1": 94.17333333333333,
25
+ "kuiper": 12.06600000000001
26
+ },
27
+ "single_evidence": {
28
+ "semantic": 80.09505171931788,
29
+ "anls": 71.30190923977649,
30
+ "n": 365
31
+ },
32
+ "multi_evidence_same_doc": {
33
+ "semantic": 74.1010689990282,
34
+ "anls": 67.2325874522391,
35
+ "n": 84
36
+ },
37
+ "multi_evidence_multi_doc": {
38
+ "semantic": 75.03001200480193,
39
+ "anls": 54.73500374221878,
40
+ "n": 51
41
+ },
42
+ "by_domain": {
43
+ "Cases/Logs": {
44
+ "semantic": 85.0340136054422,
45
+ "anls": 74.51761001296605,
46
+ "n": 15
47
+ },
48
+ "Education": {
49
+ "semantic": 90.44526901669758,
50
+ "anls": 62.23980564889655,
51
+ "n": 22
52
+ },
53
+ "Events": {
54
+ "semantic": 87.15986394557822,
55
+ "anls": 63.97470042977855,
56
+ "n": 24
57
+ },
58
+ "Financial": {
59
+ "semantic": 77.08518189884651,
60
+ "anls": 71.47919047104104,
61
+ "n": 92
62
+ },
63
+ "Financial/Tax": {
64
+ "semantic": 76.53061224489795,
65
+ "anls": 75.04133597883597,
66
+ "n": 16
67
+ },
68
+ "Government/Regulatory": {
69
+ "semantic": 80.33000434216238,
70
+ "anls": 67.12054458180654,
71
+ "n": 47
72
+ },
73
+ "HR/Employment": {
74
+ "semantic": 74.66401194624191,
75
+ "anls": 70.8778954315538,
76
+ "n": 41
77
+ },
78
+ "Legal": {
79
+ "semantic": 80.68343616516374,
80
+ "anls": 64.51676230745998,
81
+ "n": 43
82
+ },
83
+ "Media/Publishing": {
84
+ "semantic": 65.30612244897961,
85
+ "anls": 71.26169358330307,
86
+ "n": 25
87
+ },
88
+ "Misc": {
89
+ "semantic": 85.0340136054422,
90
+ "anls": 71.74962035802997,
91
+ "n": 24
92
+ },
93
+ "Other": {
94
+ "semantic": 0.0,
95
+ "anls": 29.54545454545454,
96
+ "n": 1
97
+ },
98
+ "Reference": {
99
+ "semantic": 80.45525902668759,
100
+ "anls": 76.46179993780217,
101
+ "n": 52
102
+ },
103
+ "Reports": {
104
+ "semantic": 80.95238095238096,
105
+ "anls": 69.86245910414694,
106
+ "n": 75
107
+ },
108
+ "Technical": {
109
+ "semantic": 59.89352262644188,
110
+ "anls": 47.01665330524952,
111
+ "n": 23
112
+ }
113
+ },
114
+ "n_evaluated": 500,
115
+ "n_unmatched": 0
116
+ },
117
+ "reevaluated_date": "2026-01-22T22:05:27.215227+00:00",
118
+ "source_predictions_file": "Google/Gemini_3.0_Pro_(Preview)_with_File_Search_predictions_20260104_120431.jsonl",
119
+ "result_file_path": "Google/Gemini_3.0_Pro_(Preview)_with_File_Search_results_20260104_120431.json"
120
+ }
eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260117_193634.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Human with BM25 Search Tool",
3
+ "organization": "Humanity",
4
+ "description": "Human equipped with the same search engine as agentic baselines.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-17T19:36:34.967206+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 82.14285714285717,
19
+ "semantic_ci": [
20
+ 78.5991954078934,
21
+ 85.68651887782092
22
+ ],
23
+ "anls": 83.70514272504292,
24
+ "page_f1": 79.25590111642744,
25
+ "doc_f1": 93.42612554112554,
26
+ "kuiper": 8.696969696969703
27
+ },
28
+ "single_evidence": {
29
+ "semantic": 84.14872798434443,
30
+ "anls": 85.91721757063848,
31
+ "n": 365
32
+ },
33
+ "multi_evidence_same_doc": {
34
+ "semantic": 79.56754130223517,
35
+ "anls": 82.31765689212445,
36
+ "n": 84
37
+ },
38
+ "multi_evidence_multi_doc": {
39
+ "semantic": 72.02881152460986,
40
+ "anls": 70.15889745686191,
41
+ "n": 51
42
+ },
43
+ "by_domain": {
44
+ "Cases/Logs": {
45
+ "semantic": 68.02721088435374,
46
+ "anls": 72.72727272727272,
47
+ "n": 15
48
+ },
49
+ "Education": {
50
+ "semantic": 100.0,
51
+ "anls": 92.81192695119195,
52
+ "n": 22
53
+ },
54
+ "Events": {
55
+ "semantic": 87.15986394557822,
56
+ "anls": 83.66935483870968,
57
+ "n": 24
58
+ },
59
+ "Financial": {
60
+ "semantic": 78.19432120674357,
61
+ "anls": 79.59127298509043,
62
+ "n": 92
63
+ },
64
+ "Financial/Tax": {
65
+ "semantic": 82.90816326530613,
66
+ "anls": 83.28405017921146,
67
+ "n": 16
68
+ },
69
+ "Government/Regulatory": {
70
+ "semantic": 84.67216673903604,
71
+ "anls": 87.11278292004198,
72
+ "n": 47
73
+ },
74
+ "HR/Employment": {
75
+ "semantic": 87.10801393728221,
76
+ "anls": 81.26741515002432,
77
+ "n": 41
78
+ },
79
+ "Legal": {
80
+ "semantic": 88.9890840056953,
81
+ "anls": 85.09035538105306,
82
+ "n": 43
83
+ },
84
+ "Media/Publishing": {
85
+ "semantic": 75.51020408163265,
86
+ "anls": 79.91696395686839,
87
+ "n": 25
88
+ },
89
+ "Misc": {
90
+ "semantic": 74.4047619047619,
91
+ "anls": 79.0967077930904,
92
+ "n": 24
93
+ },
94
+ "Other": {
95
+ "semantic": 0.0,
96
+ "anls": 76.92307692307692,
97
+ "n": 1
98
+ },
99
+ "Reference": {
100
+ "semantic": 83.39874411302984,
101
+ "anls": 92.19517190376465,
102
+ "n": 52
103
+ },
104
+ "Reports": {
105
+ "semantic": 78.2312925170068,
106
+ "anls": 83.77634694564375,
107
+ "n": 75
108
+ },
109
+ "Technical": {
110
+ "semantic": 84.29458740017748,
111
+ "anls": 83.52609662978936,
112
+ "n": 23
113
+ }
114
+ },
115
+ "n_evaluated": 500,
116
+ "n_unmatched": 0
117
+ },
118
+ "reevaluated_date": "2026-01-22T22:06:05.498305+00:00",
119
+ "source_predictions_file": "Humanity/Human_with_BM25_Search_Tool_predictions_20260117_193634.jsonl",
120
+ "result_file_path": "Humanity/Human_with_BM25_Search_Tool_results_20260117_193634.json"
121
+ }
eval/reevaluated_results/Humanity/Human_with_Oracle_Retriever_results_20260122_214532.json ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Human with Oracle Retriever",
3
+ "organization": "Humanity",
4
+ "description": "Human given gold standard evidence pages.",
5
+ "link": "",
6
+ "tags": [
7
+ "Vision and Language"
8
+ ],
9
+ "submitted_by": "Borchmann",
10
+ "metadata": {
11
+ "model_type": "api"
12
+ },
13
+ "submission_date": "2026-01-22T21:45:32.545870+00:00",
14
+ "results": {
15
+ "overall": {
16
+ "semantic": 99.38775510204081,
17
+ "semantic_ci": [
18
+ 97.96443309097894,
19
+ 100.0
20
+ ],
21
+ "anls": 93.7121419059059,
22
+ "page_f1": 88.62341991341991,
23
+ "doc_f1": 97.18112554112554,
24
+ "kuiper": 4.630303030303009
25
+ },
26
+ "single_evidence": {
27
+ "semantic": 99.24517752306402,
28
+ "anls": 94.03593158263021,
29
+ "n": 365
30
+ },
31
+ "multi_evidence_same_doc": {
32
+ "semantic": 100.0,
33
+ "anls": 95.10936020828757,
34
+ "n": 84
35
+ },
36
+ "multi_evidence_multi_doc": {
37
+ "semantic": 98.03921568627453,
38
+ "anls": 89.09352289797572,
39
+ "n": 51
40
+ },
41
+ "by_domain": {
42
+ "Cases/Logs": {
43
+ "semantic": 95.23809523809524,
44
+ "anls": 93.33333333333333,
45
+ "n": 15
46
+ },
47
+ "Education": {
48
+ "semantic": 100.0,
49
+ "anls": 92.99351054603811,
50
+ "n": 22
51
+ },
52
+ "Events": {
53
+ "semantic": 100.0,
54
+ "anls": 97.74763766699252,
55
+ "n": 24
56
+ },
57
+ "Financial": {
58
+ "semantic": 98.15882874889085,
59
+ "anls": 92.61042950143054,
60
+ "n": 92
61
+ },
62
+ "Financial/Tax": {
63
+ "semantic": 100.0,
64
+ "anls": 96.38888888888889,
65
+ "n": 16
66
+ },
67
+ "Government/Regulatory": {
68
+ "semantic": 100.0,
69
+ "anls": 96.77157151500082,
70
+ "n": 47
71
+ },
72
+ "HR/Employment": {
73
+ "semantic": 97.06321553011448,
74
+ "anls": 91.56441076843375,
75
+ "n": 41
76
+ },
77
+ "Legal": {
78
+ "semantic": 100.0,
79
+ "anls": 95.3032819893285,
80
+ "n": 43
81
+ },
82
+ "Media/Publishing": {
83
+ "semantic": 93.87755102040816,
84
+ "anls": 90.74640522875816,
85
+ "n": 25
86
+ },
87
+ "Misc": {
88
+ "semantic": 97.78911564625852,
89
+ "anls": 90.94982803662161,
90
+ "n": 24
91
+ },
92
+ "Other": {
93
+ "semantic": 0.0,
94
+ "anls": 76.92307692307692,
95
+ "n": 1
96
+ },
97
+ "Reference": {
98
+ "semantic": 100.0,
99
+ "anls": 96.68889612023322,
100
+ "n": 52
101
+ },
102
+ "Reports": {
103
+ "semantic": 98.63945578231294,
104
+ "anls": 93.02938751681923,
105
+ "n": 75
106
+ },
107
+ "Technical": {
108
+ "semantic": 100.0,
109
+ "anls": 89.9146569474778,
110
+ "n": 23
111
+ }
112
+ },
113
+ "n_evaluated": 500,
114
+ "n_unmatched": 0
115
+ },
116
+ "reevaluated_date": "2026-01-22T22:06:32.856334+00:00",
117
+ "source_predictions_file": "Humanity/Human_with_Oracle_Retriever_predictions_20260122_214532.jsonl",
118
+ "result_file_path": "Humanity/Human_with_Oracle_Retriever_results_20260122_214532.json"
119
+ }
eval/reevaluated_results/OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153221.json CHANGED
@@ -15,28 +15,28 @@
15
  "submission_date": "2026-01-09T15:32:21.908816+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 58.6734693877551,
19
  "semantic_ci": [
20
- 54.252012440670164,
21
- 63.09492633484003
22
  ],
23
- "anls": 53.29254644474454,
24
  "page_f1": 64.14190476190477,
25
  "doc_f1": 82.82666666666667,
26
- "kuiper": 42.51800000000039
27
  },
28
  "single_evidence": {
29
- "semantic": 65.41794800111826,
30
- "anls": 58.84025149533676,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
- "semantic": 37.65792031098154,
35
- "anls": 35.98946829890793,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
- "semantic": 45.01800720288115,
40
  "anls": 42.087570381688025,
41
  "n": 51
42
  },
@@ -52,33 +52,33 @@
52
  "n": 22
53
  },
54
  "Events": {
55
- "semantic": 70.1530612244898,
56
  "anls": 67.55050505050505,
57
  "n": 24
58
  },
59
  "Financial": {
60
- "semantic": 42.701863354037265,
61
- "anls": 43.62404525327831,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
65
- "semantic": 73.34183673469387,
66
  "anls": 64.58333333333334,
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
  "semantic": 57.533651758575765,
71
- "anls": 51.52629513848961,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
- "semantic": 70.93081134892981,
76
- "anls": 55.117501174925685,
77
  "n": 41
78
  },
79
  "Legal": {
80
- "semantic": 60.51257712387282,
81
- "anls": 55.94315245478037,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
@@ -93,21 +93,21 @@
93
  },
94
  "Other": {
95
  "semantic": 0.0,
96
- "anls": 0.0,
97
  "n": 1
98
  },
99
  "Reference": {
100
  "semantic": 64.75667189952904,
101
- "anls": 60.011945621794936,
102
  "n": 52
103
  },
104
  "Reports": {
105
- "semantic": 61.22448979591838,
106
- "anls": 47.26331129213486,
107
  "n": 75
108
  },
109
  "Technical": {
110
- "semantic": 66.54835847382431,
111
  "anls": 61.60068502092203,
112
  "n": 23
113
  }
@@ -115,7 +115,7 @@
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
- "reevaluated_date": "2026-01-16T16:01:46.742921+00:00",
119
  "source_predictions_file": "OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153221.jsonl",
120
  "result_file_path": "OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153221.json"
121
  }
 
15
  "submission_date": "2026-01-09T15:32:21.908816+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 60.00000000000001,
19
  "semantic_ci": [
20
+ 55.59775423620991,
21
+ 64.4022457637901
22
  ],
23
+ "anls": 54.023238374203174,
24
  "page_f1": 64.14190476190477,
25
  "doc_f1": 82.82666666666667,
26
+ "kuiper": 43.20000000000029
27
  },
28
  "single_evidence": {
29
+ "semantic": 66.25663964215825,
30
+ "anls": 59.82892458572295,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
+ "semantic": 42.5170068027211,
35
+ "anls": 36.04280504579306,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
+ "semantic": 44.01760704281713,
40
  "anls": 42.087570381688025,
41
  "n": 51
42
  },
 
52
  "n": 22
53
  },
54
  "Events": {
55
+ "semantic": 74.4047619047619,
56
  "anls": 67.55050505050505,
57
  "n": 24
58
  },
59
  "Financial": {
60
+ "semantic": 43.81100266193434,
61
+ "anls": 46.01082061559715,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
65
+ "semantic": 76.53061224489795,
66
  "anls": 64.58333333333334,
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
  "semantic": 57.533651758575765,
71
+ "anls": 51.55429065920628,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
+ "semantic": 72.17521154803384,
76
+ "anls": 56.33701337004763,
77
  "n": 41
78
  },
79
  "Legal": {
80
+ "semantic": 61.69909824394873,
81
+ "anls": 57.105943152454785,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
 
93
  },
94
  "Other": {
95
  "semantic": 0.0,
96
+ "anls": 76.92307692307692,
97
  "n": 1
98
  },
99
  "Reference": {
100
  "semantic": 64.75667189952904,
101
+ "anls": 60.26278174219628,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 62.58503401360544,
106
+ "anls": 46.65638178197955,
107
  "n": 75
108
  },
109
  "Technical": {
110
+ "semantic": 75.42147293700089,
111
  "anls": 61.60068502092203,
112
  "n": 23
113
  }
 
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
+ "reevaluated_date": "2026-01-22T22:07:42.289399+00:00",
119
  "source_predictions_file": "OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153221.jsonl",
120
  "result_file_path": "OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153221.json"
121
  }
eval/reevaluated_results/OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153812.json CHANGED
@@ -23,15 +23,15 @@
23
  "anls": 19.21201395702391,
24
  "page_f1": 27.60809523809524,
25
  "doc_f1": 40.18095238095238,
26
- "kuiper": 27.428000000000136
27
  },
28
  "single_evidence": {
29
- "semantic": 23.623147889292703,
30
  "anls": 22.4105437044892,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
- "semantic": 7.896015549076768,
35
  "anls": 9.153597726228908,
36
  "n": 84
37
  },
@@ -77,7 +77,7 @@
77
  "n": 41
78
  },
79
  "Legal": {
80
- "semantic": 26.10346464167062,
81
  "anls": 20.54263565891473,
82
  "n": 43
83
  },
@@ -97,17 +97,17 @@
97
  "n": 1
98
  },
99
  "Reference": {
100
- "semantic": 17.66091051805338,
101
  "anls": 20.3827772417516,
102
  "n": 52
103
  },
104
  "Reports": {
105
- "semantic": 23.12925170068026,
106
  "anls": 19.284216647617285,
107
  "n": 75
108
  },
109
  "Technical": {
110
- "semantic": 22.182786157941443,
111
  "anls": 27.075249588209658,
112
  "n": 23
113
  }
@@ -115,7 +115,7 @@
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
- "reevaluated_date": "2026-01-16T16:02:35.269375+00:00",
119
  "source_predictions_file": "OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153812.jsonl",
120
  "result_file_path": "OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153812.json"
121
  }
 
23
  "anls": 19.21201395702391,
24
  "page_f1": 27.60809523809524,
25
  "doc_f1": 40.18095238095238,
26
+ "kuiper": 28.60000000000006
27
  },
28
  "single_evidence": {
29
+ "semantic": 24.042493709812685,
30
  "anls": 22.4105437044892,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
+ "semantic": 6.073858114674442,
35
  "anls": 9.153597726228908,
36
  "n": 84
37
  },
 
77
  "n": 41
78
  },
79
  "Legal": {
80
+ "semantic": 23.730422401518744,
81
  "anls": 20.54263565891473,
82
  "n": 43
83
  },
 
97
  "n": 1
98
  },
99
  "Reference": {
100
+ "semantic": 16.679748822605976,
101
  "anls": 20.3827772417516,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 23.809523809523817,
106
  "anls": 19.284216647617285,
107
  "n": 75
108
  },
109
  "Technical": {
110
+ "semantic": 26.619343389529732,
111
  "anls": 27.075249588209658,
112
  "n": 23
113
  }
 
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
+ "reevaluated_date": "2026-01-22T22:08:24.315093+00:00",
119
  "source_predictions_file": "OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153812.jsonl",
120
  "result_file_path": "OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153812.json"
121
  }
eval/reevaluated_results/OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_results_20260109_151912.json CHANGED
@@ -15,29 +15,29 @@
15
  "submission_date": "2026-01-09T15:19:12.016451+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 67.04081632653062,
19
  "semantic_ci": [
20
- 62.79494568958393,
21
- 71.2866869634773
22
  ],
23
- "anls": 57.28438090955278,
24
  "page_f1": 67.62380952380951,
25
  "doc_f1": 83.72666666666666,
26
- "kuiper": 63.51399999999979
27
  },
28
  "single_evidence": {
29
- "semantic": 72.2672630696114,
30
- "anls": 61.48347787804644,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
- "semantic": 50.41302235179786,
35
- "anls": 40.75979036064321,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
- "semantic": 57.022809123649466,
40
- "anls": 54.44899292147869,
41
  "n": 51
42
  },
43
  "by_domain": {
@@ -52,13 +52,13 @@
52
  "n": 22
53
  },
54
  "Events": {
55
- "semantic": 72.27891156462587,
56
  "anls": 57.55050505050505,
57
  "n": 24
58
  },
59
  "Financial": {
60
- "semantic": 55.4569653948536,
61
- "anls": 49.679264550051975,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
@@ -68,46 +68,46 @@
68
  },
69
  "Government/Regulatory": {
70
  "semantic": 68.38905775075989,
71
- "anls": 58.551919442177,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
- "semantic": 62.220009955201604,
76
  "anls": 44.265703074651974,
77
  "n": 41
78
  },
79
  "Legal": {
80
- "semantic": 81.86995728523969,
81
- "anls": 66.19399979865096,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
85
- "semantic": 38.77551020408162,
86
  "anls": 35.05751747729549,
87
  "n": 25
88
  },
89
  "Misc": {
90
- "semantic": 87.15986394557822,
91
  "anls": 82.5164707977208,
92
  "n": 24
93
  },
94
  "Other": {
95
  "semantic": 0.0,
96
- "anls": 0.0,
97
  "n": 1
98
  },
99
  "Reference": {
100
- "semantic": 74.56828885400316,
101
- "anls": 67.62508443509842,
102
  "n": 52
103
  },
104
  "Reports": {
105
- "semantic": 68.02721088435374,
106
- "anls": 59.65381728416852,
107
  "n": 75
108
  },
109
  "Technical": {
110
- "semantic": 62.111801242236034,
111
  "anls": 55.55075090789312,
112
  "n": 23
113
  }
@@ -115,7 +115,7 @@
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
- "reevaluated_date": "2026-01-16T16:03:33.475794+00:00",
119
  "source_predictions_file": "OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_predictions_20260109_151912.jsonl",
120
  "result_file_path": "OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_results_20260109_151912.json"
121
  }
 
15
  "submission_date": "2026-01-09T15:19:12.016451+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 67.75510204081634,
19
  "semantic_ci": [
20
+ 63.53045246104174,
21
+ 71.97975162059093
22
  ],
23
+ "anls": 57.79787184551629,
24
  "page_f1": 67.62380952380951,
25
  "doc_f1": 83.72666666666666,
26
+ "kuiper": 64.75199999999975
27
  },
28
  "single_evidence": {
29
+ "semantic": 73.24573665082471,
30
+ "anls": 62.10194433589725,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
+ "semantic": 51.62779397473275,
35
+ "anls": 41.00509049000287,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
+ "semantic": 55.022008803521395,
40
+ "anls": 54.65291449010614,
41
  "n": 51
42
  },
43
  "by_domain": {
 
52
  "n": 22
53
  },
54
  "Events": {
55
+ "semantic": 76.53061224489795,
56
  "anls": 57.55050505050505,
57
  "n": 24
58
  },
59
  "Financial": {
60
+ "semantic": 57.67524401064772,
61
+ "anls": 51.01775086245134,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
 
68
  },
69
  "Government/Regulatory": {
70
  "semantic": 68.38905775075989,
71
+ "anls": 58.57991496289369,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
+ "semantic": 60.97560975609757,
76
  "anls": 44.265703074651974,
77
  "n": 41
78
  },
79
  "Legal": {
80
+ "semantic": 83.05647840531563,
81
+ "anls": 66.93395751535287,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
85
+ "semantic": 36.734693877551,
86
  "anls": 35.05751747729549,
87
  "n": 25
88
  },
89
  "Misc": {
90
+ "semantic": 85.0340136054422,
91
  "anls": 82.5164707977208,
92
  "n": 24
93
  },
94
  "Other": {
95
  "semantic": 0.0,
96
+ "anls": 76.92307692307692,
97
  "n": 1
98
  },
99
  "Reference": {
100
+ "semantic": 77.51177394034536,
101
+ "anls": 68.29355847313767,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 68.70748299319727,
106
+ "anls": 59.504311004808905,
107
  "n": 75
108
  },
109
  "Technical": {
110
+ "semantic": 59.89352262644188,
111
  "anls": 55.55075090789312,
112
  "n": 23
113
  }
 
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
+ "reevaluated_date": "2026-01-22T22:09:18.181737+00:00",
119
  "source_predictions_file": "OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_predictions_20260109_151912.jsonl",
120
  "result_file_path": "OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_results_20260109_151912.json"
121
  }
eval/reevaluated_results/OpenAI/GPT-5.2_(2025-12-11)_with_File_Search_results_20260104_121551.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-5.2 (2025-12-11) with File Search",
3
+ "organization": "OpenAI",
4
+ "description": "Managed, single-shot retrieval mechanism.",
5
+ "link": "https://platform.openai.com/docs/guides/tools-file-search",
6
+ "tags": [
7
+ "Conventional RAG",
8
+ "Semantic Search Tool"
9
+ ],
10
+ "submitted_by": "Borchmann",
11
+ "metadata": {
12
+ "model_type": "api"
13
+ },
14
+ "submission_date": "2026-01-04T12:15:51.350064+00:00",
15
+ "results": {
16
+ "overall": {
17
+ "semantic": 50.0,
18
+ "semantic_ci": [
19
+ 45.52884072176271,
20
+ 54.471159278237295
21
+ ],
22
+ "anls": 46.08410854382378,
23
+ "page_f1": 28.519365079365084,
24
+ "doc_f1": 68.52666666666667,
25
+ "kuiper": 26.695999999999877
26
+ },
27
+ "single_evidence": {
28
+ "semantic": 56.192339949678505,
29
+ "anls": 50.86266041052095,
30
+ "n": 365
31
+ },
32
+ "multi_evidence_same_doc": {
33
+ "semantic": 39.48007774538386,
34
+ "anls": 38.16677868634344,
35
+ "n": 84
36
+ },
37
+ "multi_evidence_multi_doc": {
38
+ "semantic": 23.0092036814726,
39
+ "anls": 24.92497671409598,
40
+ "n": 51
41
+ },
42
+ "by_domain": {
43
+ "Cases/Logs": {
44
+ "semantic": 20.40816326530612,
45
+ "anls": 20.0,
46
+ "n": 15
47
+ },
48
+ "Education": {
49
+ "semantic": 74.21150278293136,
50
+ "anls": 60.780598189689094,
51
+ "n": 22
52
+ },
53
+ "Events": {
54
+ "semantic": 63.775510204081634,
55
+ "anls": 49.319377600130515,
56
+ "n": 24
57
+ },
58
+ "Financial": {
59
+ "semantic": 54.902395740905064,
60
+ "anls": 52.85763703072669,
61
+ "n": 92
62
+ },
63
+ "Financial/Tax": {
64
+ "semantic": 25.510204081632654,
65
+ "anls": 24.39516129032258,
66
+ "n": 16
67
+ },
68
+ "Government/Regulatory": {
69
+ "semantic": 53.19148936170214,
70
+ "anls": 44.987482344705185,
71
+ "n": 47
72
+ },
73
+ "HR/Employment": {
74
+ "semantic": 42.309606769537076,
75
+ "anls": 45.208397582300094,
76
+ "n": 41
77
+ },
78
+ "Legal": {
79
+ "semantic": 48.647365923113426,
80
+ "anls": 39.61843128160958,
81
+ "n": 43
82
+ },
83
+ "Media/Publishing": {
84
+ "semantic": 40.81632653061224,
85
+ "anls": 38.34620945100294,
86
+ "n": 25
87
+ },
88
+ "Misc": {
89
+ "semantic": 61.64965986394556,
90
+ "anls": 59.15390227662317,
91
+ "n": 24
92
+ },
93
+ "Other": {
94
+ "semantic": 0.0,
95
+ "anls": 0.0,
96
+ "n": 1
97
+ },
98
+ "Reference": {
99
+ "semantic": 38.265306122448976,
100
+ "anls": 41.31709677341507,
101
+ "n": 52
102
+ },
103
+ "Reports": {
104
+ "semantic": 47.619047619047635,
105
+ "anls": 43.77803423273486,
106
+ "n": 75
107
+ },
108
+ "Technical": {
109
+ "semantic": 73.20319432120675,
110
+ "anls": 64.61964587357369,
111
+ "n": 23
112
+ }
113
+ },
114
+ "n_evaluated": 500,
115
+ "n_unmatched": 0
116
+ },
117
+ "reevaluated_date": "2026-01-22T22:10:25.673064+00:00",
118
+ "source_predictions_file": "OpenAI/GPT-5.2_(2025-12-11)_with_File_Search_predictions_20260104_121551.jsonl",
119
+ "result_file_path": "OpenAI/GPT-5.2_(2025-12-11)_with_File_Search_results_20260104_121551.json"
120
+ }
eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152104.json CHANGED
@@ -15,28 +15,28 @@
15
  "submission_date": "2026-01-09T15:21:04.336083+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 77.14285714285715,
19
  "semantic_ci": [
20
- 73.30142802417011,
21
- 80.98428626154418
22
  ],
23
- "anls": 70.03817583122695,
24
  "page_f1": 74.16285714285713,
25
  "doc_f1": 86.45064935064934,
26
- "kuiper": 51.96199999999987
27
  },
28
  "single_evidence": {
29
- "semantic": 81.49287112105115,
30
- "anls": 73.42321103264047,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
- "semantic": 58.91642371234208,
35
- "anls": 56.78859482300274,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
- "semantic": 76.03041216486595,
40
  "anls": 67.63478281504847,
41
  "n": 51
42
  },
@@ -57,8 +57,8 @@
57
  "n": 24
58
  },
59
  "Financial": {
60
- "semantic": 67.10292812777284,
61
- "anls": 62.36899647186356,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
@@ -67,43 +67,43 @@
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
- "semantic": 74.90230134607035,
71
- "anls": 68.7671602173282,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
- "semantic": 79.64161274265804,
76
- "anls": 64.5688672367669,
77
  "n": 41
78
  },
79
  "Legal": {
80
- "semantic": 81.86995728523969,
81
- "anls": 70.27143399236422,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
85
- "semantic": 63.265306122449,
86
  "anls": 65.71897407160566,
87
  "n": 25
88
  },
89
  "Misc": {
90
- "semantic": 85.0340136054422,
91
  "anls": 86.70405982905983,
92
  "n": 24
93
  },
94
  "Other": {
95
- "semantic": 0.0,
96
- "anls": 0.0,
97
  "n": 1
98
  },
99
  "Reference": {
100
- "semantic": 78.49293563579278,
101
- "anls": 76.57306264232653,
102
  "n": 52
103
  },
104
  "Reports": {
105
- "semantic": 82.31292517006803,
106
- "anls": 71.72139814224423,
107
  "n": 75
108
  },
109
  "Technical": {
@@ -115,7 +115,7 @@
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
- "reevaluated_date": "2026-01-16T16:04:34.030101+00:00",
119
  "source_predictions_file": "OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152104.jsonl",
120
  "result_file_path": "OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152104.json"
121
  }
 
15
  "submission_date": "2026-01-09T15:21:04.336083+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 77.6530612244898,
19
  "semantic_ci": [
20
+ 73.83864314648315,
21
+ 81.46747930249646
22
  ],
23
+ "anls": 71.05072433302601,
24
  "page_f1": 74.16285714285713,
25
  "doc_f1": 86.45064935064934,
26
+ "kuiper": 52.62199999999985
27
  },
28
  "single_evidence": {
29
+ "semantic": 82.19178082191782,
30
+ "anls": 74.70786997380705,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
+ "semantic": 60.13119533527698,
35
+ "anls": 57.23352026792817,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
+ "semantic": 74.02961184473791,
40
  "anls": 67.63478281504847,
41
  "n": 51
42
  },
 
57
  "n": 24
58
  },
59
  "Financial": {
60
+ "semantic": 68.76663708961844,
61
+ "anls": 65.45826780841752,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
 
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
+ "semantic": 77.07338254450717,
71
+ "anls": 69.85898552527891,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
+ "semantic": 74.66401194624191,
76
+ "anls": 65.78837943188886,
77
  "n": 41
78
  },
79
  "Legal": {
80
+ "semantic": 83.05647840531563,
81
+ "anls": 71.01139170906613,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
85
+ "semantic": 61.22448979591838,
86
  "anls": 65.71897407160566,
87
  "n": 25
88
  },
89
  "Misc": {
90
+ "semantic": 87.15986394557822,
91
  "anls": 86.70405982905983,
92
  "n": 24
93
  },
94
  "Other": {
95
+ "semantic": 100.0,
96
+ "anls": 100.0,
97
  "n": 1
98
  },
99
  "Reference": {
100
+ "semantic": 80.45525902668759,
101
+ "anls": 77.21482727865639,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 81.63265306122449,
106
+ "anls": 71.12880554965163,
107
  "n": 75
108
  },
109
  "Technical": {
 
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
+ "reevaluated_date": "2026-01-22T22:11:16.583710+00:00",
119
  "source_predictions_file": "OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152104.jsonl",
120
  "result_file_path": "OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152104.json"
121
  }
eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_File_Search_results_20260104_140537.json CHANGED
@@ -14,24 +14,24 @@
14
  "submission_date": "2026-01-04T14:05:37.240829+00:00",
15
  "results": {
16
  "overall": {
17
- "semantic": 48.367346938775505,
18
  "semantic_ci": [
19
- 43.90134346980983,
20
- 52.833350407741186
21
  ],
22
- "anls": 44.84773268944071,
23
  "page_f1": 29.277142857142856,
24
  "doc_f1": 66.60666666666667,
25
- "kuiper": 28.83600000000007
26
  },
27
  "single_evidence": {
28
- "semantic": 54.375174727425204,
29
- "anls": 48.77673180310333,
30
  "n": 365
31
  },
32
  "multi_evidence_same_doc": {
33
  "semantic": 36.443148688046655,
34
- "anls": 38.29854300170963,
35
  "n": 84
36
  },
37
  "multi_evidence_multi_doc": {
@@ -41,7 +41,7 @@
41
  },
42
  "by_domain": {
43
  "Cases/Logs": {
44
- "semantic": 20.40816326530612,
45
  "anls": 14.833333333333334,
46
  "n": 15
47
  },
@@ -51,13 +51,13 @@
51
  "n": 22
52
  },
53
  "Events": {
54
- "semantic": 68.02721088435374,
55
  "anls": 55.83149489399489,
56
  "n": 24
57
  },
58
  "Financial": {
59
- "semantic": 51.02040816326531,
60
- "anls": 46.26513610007698,
61
  "n": 92
62
  },
63
  "Financial/Tax": {
@@ -66,8 +66,8 @@
66
  "n": 16
67
  },
68
  "Government/Regulatory": {
69
- "semantic": 48.84932696482849,
70
- "anls": 41.75603723934328,
71
  "n": 47
72
  },
73
  "HR/Employment": {
@@ -76,8 +76,8 @@
76
  "n": 41
77
  },
78
  "Legal": {
79
- "semantic": 37.968675842429995,
80
- "anls": 32.74308378959542,
81
  "n": 43
82
  },
83
  "Media/Publishing": {
@@ -96,17 +96,17 @@
96
  "n": 1
97
  },
98
  "Reference": {
99
- "semantic": 40.22762951334379,
100
  "anls": 40.48309244262362,
101
  "n": 52
102
  },
103
  "Reports": {
104
  "semantic": 47.619047619047635,
105
- "anls": 46.80494177991155,
106
  "n": 75
107
  },
108
  "Technical": {
109
- "semantic": 66.54835847382431,
110
  "anls": 62.77759844334801,
111
  "n": 23
112
  }
@@ -114,7 +114,7 @@
114
  "n_evaluated": 500,
115
  "n_unmatched": 0
116
  },
117
- "reevaluated_date": "2026-01-16T16:06:19.286211+00:00",
118
  "source_predictions_file": "OpenAI/GPT-5_(2025-08-07)_with_File_Search_predictions_20260104_140537.jsonl",
119
  "result_file_path": "OpenAI/GPT-5_(2025-08-07)_with_File_Search_results_20260104_140537.json"
120
  }
 
14
  "submission_date": "2026-01-04T14:05:37.240829+00:00",
15
  "results": {
16
  "overall": {
17
+ "semantic": 49.59183673469388,
18
  "semantic_ci": [
19
+ 45.12153634505467,
20
+ 54.06213712433308
21
  ],
22
+ "anls": 45.66299621788548,
23
  "page_f1": 29.277142857142856,
24
  "doc_f1": 66.60666666666667,
25
+ "kuiper": 29.030000000000086
26
  },
27
  "single_evidence": {
28
+ "semantic": 56.05255800950517,
29
+ "anls": 49.87536451502012,
30
  "n": 365
31
  },
32
  "multi_evidence_same_doc": {
33
  "semantic": 36.443148688046655,
34
+ "anls": 38.377481387099635,
35
  "n": 84
36
  },
37
  "multi_evidence_multi_doc": {
 
41
  },
42
  "by_domain": {
43
  "Cases/Logs": {
44
+ "semantic": 13.605442176870747,
45
  "anls": 14.833333333333334,
46
  "n": 15
47
  },
 
51
  "n": 22
52
  },
53
  "Events": {
54
+ "semantic": 76.53061224489795,
55
  "anls": 55.83149489399489,
56
  "n": 24
57
  },
58
  "Financial": {
59
+ "semantic": 56.011535048802116,
60
+ "anls": 50.61975566529438,
61
  "n": 92
62
  },
63
  "Financial/Tax": {
 
66
  "n": 16
67
  },
68
  "Government/Regulatory": {
69
+ "semantic": 46.678245766391655,
70
+ "anls": 41.76403595954804,
71
  "n": 47
72
  },
73
  "HR/Employment": {
 
76
  "n": 41
77
  },
78
  "Legal": {
79
+ "semantic": 40.34171808258187,
80
+ "anls": 33.905874487269834,
81
  "n": 43
82
  },
83
  "Media/Publishing": {
 
96
  "n": 1
97
  },
98
  "Reference": {
99
+ "semantic": 42.1899529042386,
100
  "anls": 40.48309244262362,
101
  "n": 52
102
  },
103
  "Reports": {
104
  "semantic": 47.619047619047635,
105
+ "anls": 46.22668610488168,
106
  "n": 75
107
  },
108
  "Technical": {
109
+ "semantic": 64.33007985803019,
110
  "anls": 62.77759844334801,
111
  "n": 23
112
  }
 
114
  "n_evaluated": 500,
115
  "n_unmatched": 0
116
  },
117
+ "reevaluated_date": "2026-01-22T22:12:20.346525+00:00",
118
  "source_predictions_file": "OpenAI/GPT-5_(2025-08-07)_with_File_Search_predictions_20260104_140537.jsonl",
119
  "result_file_path": "OpenAI/GPT-5_(2025-08-07)_with_File_Search_results_20260104_140537.json"
120
  }
eval/reevaluated_results/OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152650.json CHANGED
@@ -15,34 +15,34 @@
15
  "submission_date": "2026-01-09T15:26:50.820104+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 65.51020408163264,
19
  "semantic_ci": [
20
- 61.22229033719482,
21
- 69.79811782607047
22
  ],
23
- "anls": 55.16542612989696,
24
  "page_f1": 67.57095238095239,
25
  "doc_f1": 82.35303030303031,
26
- "kuiper": 73.26853707414845
27
  },
28
  "single_evidence": {
29
- "semantic": 72.2672630696114,
30
- "anls": 60.98668315256136,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
- "semantic": 47.37609329446065,
35
- "anls": 37.944647803393245,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
- "semantic": 47.0188075230092,
40
  "anls": 41.86712350546175,
41
  "n": 51
42
  },
43
  "by_domain": {
44
  "Cases/Logs": {
45
- "semantic": 68.02721088435374,
46
  "anls": 57.16524216524217,
47
  "n": 15
48
  },
@@ -52,13 +52,13 @@
52
  "n": 22
53
  },
54
  "Events": {
55
- "semantic": 63.775510204081634,
56
  "anls": 53.63190419293608,
57
  "n": 24
58
  },
59
  "Financial": {
60
- "semantic": 49.3566992014197,
61
- "anls": 43.770804881794874,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
@@ -67,8 +67,8 @@
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
- "semantic": 74.90230134607035,
71
- "anls": 62.856694438441366,
72
  "n": 47
73
  },
74
  "HR/Employment": {
@@ -77,33 +77,33 @@
77
  "n": 41
78
  },
79
  "Legal": {
80
- "semantic": 75.93735168485999,
81
  "anls": 62.31744836688789,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
85
- "semantic": 51.02040816326531,
86
  "anls": 39.93216037493774,
87
  "n": 25
88
  },
89
  "Misc": {
90
- "semantic": 72.27891156462587,
91
- "anls": 63.35950315116982,
92
  "n": 24
93
  },
94
  "Other": {
95
- "semantic": 0.0,
96
  "anls": 0.0,
97
  "n": 1
98
  },
99
  "Reference": {
100
  "semantic": 80.45525902668759,
101
- "anls": 73.02503210878088,
102
  "n": 52
103
  },
104
  "Reports": {
105
- "semantic": 67.34693877551022,
106
- "anls": 54.869395530526155,
107
  "n": 75
108
  },
109
  "Technical": {
@@ -115,7 +115,7 @@
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
- "reevaluated_date": "2026-01-16T16:07:26.362120+00:00",
119
  "source_predictions_file": "OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152650.jsonl",
120
  "result_file_path": "OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152650.json"
121
  }
 
15
  "submission_date": "2026-01-09T15:26:50.820104+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 66.9387755102041,
19
  "semantic_ci": [
20
+ 62.68995725738806,
21
+ 71.18759376302013
22
  ],
23
+ "anls": 55.23436182110524,
24
  "page_f1": 67.57095238095239,
25
  "doc_f1": 82.35303030303031,
26
+ "kuiper": 73.23246492985982
27
  },
28
  "single_evidence": {
29
+ "semantic": 73.94464635169136,
30
+ "anls": 61.111836074632876,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
+ "semantic": 47.9834791059281,
35
+ "anls": 37.81116005396503,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
+ "semantic": 48.01920768307324,
40
  "anls": 41.86712350546175,
41
  "n": 51
42
  },
43
  "by_domain": {
44
  "Cases/Logs": {
45
+ "semantic": 71.42857142857143,
46
  "anls": 57.16524216524217,
47
  "n": 15
48
  },
 
52
  "n": 22
53
  },
54
  "Events": {
55
+ "semantic": 68.02721088435374,
56
  "anls": 53.63190419293608,
57
  "n": 24
58
  },
59
  "Financial": {
60
+ "semantic": 52.68411712511091,
61
+ "anls": 44.09196027631984,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
 
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
+ "semantic": 73.81676074685193,
71
+ "anls": 62.884689959158045,
72
  "n": 47
73
  },
74
  "HR/Employment": {
 
77
  "n": 41
78
  },
79
  "Legal": {
80
+ "semantic": 78.31039392501187,
81
  "anls": 62.31744836688789,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
85
+ "semantic": 53.06122448979592,
86
  "anls": 39.93216037493774,
87
  "n": 25
88
  },
89
  "Misc": {
90
+ "semantic": 74.4047619047619,
91
+ "anls": 63.84908648450315,
92
  "n": 24
93
  },
94
  "Other": {
95
+ "semantic": 51.02040816326531,
96
  "anls": 0.0,
97
  "n": 1
98
  },
99
  "Reference": {
100
  "semantic": 80.45525902668759,
101
+ "anls": 73.08404616236015,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 68.02721088435374,
106
+ "anls": 54.719889251166556,
107
  "n": 75
108
  },
109
  "Technical": {
 
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
+ "reevaluated_date": "2026-01-22T22:13:30.005424+00:00",
119
  "source_predictions_file": "OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152650.jsonl",
120
  "result_file_path": "OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152650.json"
121
  }
eval/reevaluated_results/OpenAI/GPT-5_Mini_(2025-08-07)_with_File_Search_results_20260104_122026.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-5 Mini (2025-08-07) with File Search",
3
+ "organization": "OpenAI",
4
+ "description": "Managed, single-shot retrieval mechanism.",
5
+ "link": "https://platform.openai.com/docs/guides/tools-file-search",
6
+ "tags": [
7
+ "Conventional RAG",
8
+ "Semantic Search Tool"
9
+ ],
10
+ "submitted_by": "Borchmann",
11
+ "metadata": {
12
+ "model_type": "api"
13
+ },
14
+ "submission_date": "2026-01-04T12:20:26.152729+00:00",
15
+ "results": {
16
+ "overall": {
17
+ "semantic": 48.46938775510205,
18
+ "semantic_ci": [
19
+ 44.00292754203926,
20
+ 52.935847968164836
21
+ ],
22
+ "anls": 41.81689987677872,
23
+ "page_f1": 29.013073593073592,
24
+ "doc_f1": 67.32666666666667,
25
+ "kuiper": 28.0
26
+ },
27
+ "single_evidence": {
28
+ "semantic": 55.21386636846521,
29
+ "anls": 47.22776663670515,
30
+ "n": 365
31
+ },
32
+ "multi_evidence_same_doc": {
33
+ "semantic": 32.79883381924198,
34
+ "anls": 30.536053736441175,
35
+ "n": 84
36
+ },
37
+ "multi_evidence_multi_doc": {
38
+ "semantic": 26.010404161664663,
39
+ "anls": 21.672286316292585,
40
+ "n": 51
41
+ },
42
+ "by_domain": {
43
+ "Cases/Logs": {
44
+ "semantic": 13.605442176870747,
45
+ "anls": 16.666666666666664,
46
+ "n": 15
47
+ },
48
+ "Education": {
49
+ "semantic": 81.16883116883116,
50
+ "anls": 68.20851085673219,
51
+ "n": 22
52
+ },
53
+ "Events": {
54
+ "semantic": 65.9013605442177,
55
+ "anls": 53.08302808302808,
56
+ "n": 24
57
+ },
58
+ "Financial": {
59
+ "semantic": 52.12954747116238,
60
+ "anls": 46.51029159374941,
61
+ "n": 92
62
+ },
63
+ "Financial/Tax": {
64
+ "semantic": 19.132653061224488,
65
+ "anls": 17.775843108504397,
66
+ "n": 16
67
+ },
68
+ "Government/Regulatory": {
69
+ "semantic": 45.59270516717324,
70
+ "anls": 37.66328697492653,
71
+ "n": 47
72
+ },
73
+ "HR/Employment": {
74
+ "semantic": 41.06520657043304,
75
+ "anls": 36.31954842987894,
76
+ "n": 41
77
+ },
78
+ "Legal": {
79
+ "semantic": 43.90128144280967,
80
+ "anls": 39.4998961859427,
81
+ "n": 43
82
+ },
83
+ "Media/Publishing": {
84
+ "semantic": 36.734693877551,
85
+ "anls": 37.934534534534535,
86
+ "n": 25
87
+ },
88
+ "Misc": {
89
+ "semantic": 72.27891156462587,
90
+ "anls": 64.10710364514712,
91
+ "n": 24
92
+ },
93
+ "Other": {
94
+ "semantic": 0.0,
95
+ "anls": 0.0,
96
+ "n": 1
97
+ },
98
+ "Reference": {
99
+ "semantic": 41.208791208791204,
100
+ "anls": 36.730980853494025,
101
+ "n": 52
102
+ },
103
+ "Reports": {
104
+ "semantic": 47.619047619047635,
105
+ "anls": 37.56027917001248,
106
+ "n": 75
107
+ },
108
+ "Technical": {
109
+ "semantic": 64.33007985803019,
110
+ "anls": 49.94662638112699,
111
+ "n": 23
112
+ }
113
+ },
114
+ "n_evaluated": 500,
115
+ "n_unmatched": 0
116
+ },
117
+ "reevaluated_date": "2026-01-22T22:14:40.534356+00:00",
118
+ "source_predictions_file": "OpenAI/GPT-5_Mini_(2025-08-07)_with_File_Search_predictions_20260104_122026.jsonl",
119
+ "result_file_path": "OpenAI/GPT-5_Mini_(2025-08-07)_with_File_Search_results_20260104_122026.json"
120
+ }
eval/reevaluated_results/OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152828.json CHANGED
@@ -15,28 +15,28 @@
15
  "submission_date": "2026-01-09T15:28:28.366309+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 56.122448979591844,
19
  "semantic_ci": [
20
- 51.67281165196697,
21
- 60.57208630721672
22
  ],
23
- "anls": 52.255247982009955,
24
  "page_f1": 60.877142857142864,
25
  "doc_f1": 82.2030303030303,
26
- "kuiper": 48.159999999999975
27
  },
28
  "single_evidence": {
29
- "semantic": 64.85882024042495,
30
- "anls": 59.369915601211495,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
- "semantic": 31.584062196307094,
35
- "anls": 34.367489115024085,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
- "semantic": 34.01360544217687,
40
  "anls": 30.798739429426515,
41
  "n": 51
42
  },
@@ -52,13 +52,13 @@
52
  "n": 22
53
  },
54
  "Events": {
55
- "semantic": 59.52380952380955,
56
  "anls": 51.78930433365917,
57
  "n": 24
58
  },
59
  "Financial": {
60
- "semantic": 41.038154392191664,
61
- "anls": 40.14762316798784,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
@@ -67,18 +67,18 @@
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
- "semantic": 66.21797655232307,
71
- "anls": 56.496054764723326,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
- "semantic": 52.264808362369344,
76
- "anls": 42.85858107680723,
77
  "n": 41
78
  },
79
  "Legal": {
80
- "semantic": 60.51257712387282,
81
- "anls": 55.28314708547266,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
@@ -87,22 +87,22 @@
87
  "n": 25
88
  },
89
  "Misc": {
90
- "semantic": 72.27891156462587,
91
  "anls": 74.53137140637142,
92
  "n": 24
93
  },
94
  "Other": {
95
- "semantic": 0.0,
96
  "anls": 0.0,
97
  "n": 1
98
  },
99
  "Reference": {
100
- "semantic": 57.88854003139717,
101
- "anls": 61.940508414693205,
102
  "n": 52
103
  },
104
  "Reports": {
105
- "semantic": 54.42176870748299,
106
  "anls": 48.18660787855504,
107
  "n": 75
108
  },
@@ -115,7 +115,7 @@
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
- "reevaluated_date": "2026-01-16T16:08:36.360733+00:00",
119
  "source_predictions_file": "OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152828.jsonl",
120
  "result_file_path": "OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152828.json"
121
  }
 
15
  "submission_date": "2026-01-09T15:28:28.366309+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 58.16326530612243,
19
  "semantic_ci": [
20
+ 53.735254533391874,
21
+ 62.591276078853
22
  ],
23
+ "anls": 52.7083705578831,
24
  "page_f1": 60.877142857142864,
25
  "doc_f1": 82.2030303030303,
26
+ "kuiper": 49.84000000000006
27
  },
28
  "single_evidence": {
29
+ "semantic": 67.23511322337154,
30
+ "anls": 60.04044465907011,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
+ "semantic": 32.79883381924198,
35
+ "anls": 34.15103889857387,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
+ "semantic": 35.014005602240914,
40
  "anls": 30.798739429426515,
41
  "n": 51
42
  },
 
52
  "n": 22
53
  },
54
  "Events": {
55
+ "semantic": 63.775510204081634,
56
  "anls": 51.78930433365917,
57
  "n": 24
58
  },
59
  "Financial": {
60
+ "semantic": 44.365572315882865,
61
+ "anls": 41.87226084914726,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
 
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
+ "semantic": 64.04689535388624,
71
+ "anls": 55.460220498205956,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
+ "semantic": 54.75360876057741,
76
+ "anls": 44.078093271929184,
77
  "n": 41
78
  },
79
  "Legal": {
80
+ "semantic": 64.07214048410061,
81
+ "anls": 56.02310480217457,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
 
87
  "n": 25
88
  },
89
  "Misc": {
90
+ "semantic": 74.4047619047619,
91
  "anls": 74.53137140637142,
92
  "n": 24
93
  },
94
  "Other": {
95
+ "semantic": 51.02040816326531,
96
  "anls": 0.0,
97
  "n": 1
98
  },
99
  "Reference": {
100
+ "semantic": 62.79434850863422,
101
+ "anls": 62.608982452732455,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 55.78231292517006,
106
  "anls": 48.18660787855504,
107
  "n": 75
108
  },
 
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
+ "reevaluated_date": "2026-01-22T22:15:43.985703+00:00",
119
  "source_predictions_file": "OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152828.jsonl",
120
  "result_file_path": "OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152828.json"
121
  }
eval/reevaluated_results/OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153516.json CHANGED
@@ -15,23 +15,23 @@
15
  "submission_date": "2026-01-09T15:35:16.458002+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 51.63265306122449,
19
  "semantic_ci": [
20
- 47.16092128223699,
21
- 56.104384840211985
22
  ],
23
- "anls": 46.26708858125157,
24
  "page_f1": 59.905054945054935,
25
  "doc_f1": 77.61731601731601,
26
- "kuiper": 41.783673469388106
27
  },
28
  "single_evidence": {
29
- "semantic": 61.50405367626502,
30
- "anls": 54.710786964911776,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
- "semantic": 29.154518950437307,
35
  "anls": 26.237172573133016,
36
  "n": 84
37
  },
@@ -42,7 +42,7 @@
42
  },
43
  "by_domain": {
44
  "Cases/Logs": {
45
- "semantic": 47.619047619047635,
46
  "anls": 39.64209401709402,
47
  "n": 15
48
  },
@@ -52,13 +52,13 @@
52
  "n": 22
53
  },
54
  "Events": {
55
- "semantic": 55.2721088435374,
56
  "anls": 53.83018770627063,
57
  "n": 24
58
  },
59
  "Financial": {
60
- "semantic": 36.04702750665483,
61
- "anls": 34.96285359224887,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
@@ -67,23 +67,23 @@
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
- "semantic": 45.59270516717324,
71
- "anls": 44.19583719868558,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
- "semantic": 54.75360876057741,
76
- "anls": 46.501429746354255,
77
  "n": 41
78
  },
79
  "Legal": {
80
- "semantic": 51.02040816326531,
81
  "anls": 43.64210613408689,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
85
  "semantic": 48.97959183673469,
86
- "anls": 46.71106819031614,
87
  "n": 25
88
  },
89
  "Misc": {
@@ -98,16 +98,16 @@
98
  },
99
  "Reference": {
100
  "semantic": 66.71899529042385,
101
- "anls": 62.86510186138165,
102
  "n": 52
103
  },
104
  "Reports": {
105
- "semantic": 53.74149659863945,
106
  "anls": 45.15164464860224,
107
  "n": 75
108
  },
109
  "Technical": {
110
- "semantic": 70.9849157054126,
111
  "anls": 53.71736172158072,
112
  "n": 23
113
  }
@@ -115,7 +115,7 @@
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
- "reevaluated_date": "2026-01-16T16:10:10.834088+00:00",
119
  "source_predictions_file": "OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153516.jsonl",
120
  "result_file_path": "OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153516.json"
121
  }
 
15
  "submission_date": "2026-01-09T15:35:16.458002+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 52.959183673469404,
19
  "semantic_ci": [
20
+ 48.490359884292864,
21
+ 57.42800746264594
22
  ],
23
+ "anls": 46.73693856273607,
24
  "page_f1": 59.905054945054935,
25
  "doc_f1": 77.61731601731601,
26
+ "kuiper": 40.7714285714291
27
  },
28
  "single_evidence": {
29
+ "semantic": 63.181436958344975,
30
+ "anls": 55.35441707653439,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
+ "semantic": 29.761904761904773,
35
  "anls": 26.237172573133016,
36
  "n": 84
37
  },
 
42
  },
43
  "by_domain": {
44
  "Cases/Logs": {
45
+ "semantic": 51.02040816326531,
46
  "anls": 39.64209401709402,
47
  "n": 15
48
  },
 
52
  "n": 22
53
  },
54
  "Events": {
55
+ "semantic": 59.52380952380955,
56
  "anls": 53.83018770627063,
57
  "n": 24
58
  },
59
  "Financial": {
60
+ "semantic": 37.710736468500436,
61
+ "anls": 36.592053196991955,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
 
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
+ "semantic": 46.678245766391655,
71
+ "anls": 44.3103860759977,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
+ "semantic": 57.24240915878547,
76
+ "anls": 47.47703950245181,
77
  "n": 41
78
  },
79
  "Legal": {
80
+ "semantic": 55.766492643569066,
81
  "anls": 43.64210613408689,
82
  "n": 43
83
  },
84
  "Media/Publishing": {
85
  "semantic": 48.97959183673469,
86
+ "anls": 47.21106819031614,
87
  "n": 25
88
  },
89
  "Misc": {
 
98
  },
99
  "Reference": {
100
  "semantic": 66.71899529042385,
101
+ "anls": 63.387310128155136,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 54.42176870748299,
106
  "anls": 45.15164464860224,
107
  "n": 75
108
  },
109
  "Technical": {
110
+ "semantic": 68.76663708961844,
111
  "anls": 53.71736172158072,
112
  "n": 23
113
  }
 
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
+ "reevaluated_date": "2026-01-22T22:17:05.838647+00:00",
119
  "source_predictions_file": "OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153516.jsonl",
120
  "result_file_path": "OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153516.json"
121
  }
eval/reevaluated_results/OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_results_20260109_185347.json CHANGED
@@ -15,35 +15,35 @@
15
  "submission_date": "2026-01-09T18:53:47.189606+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 80.97828309680585,
19
  "semantic_ci": [
20
- 77.35470686592657,
21
- 84.60185932768512
22
  ],
23
- "anls": 73.52101315170081,
24
  "page_f1": 78.4607309857811,
25
  "doc_f1": 90.20248288785363,
26
- "kuiper": 27.436873747495046
27
  },
28
  "single_evidence": {
29
- "semantic": 85.36106750392466,
30
- "anls": 76.30968193570469,
31
  "n": 364
32
  },
33
  "multi_evidence_same_doc": {
34
- "semantic": 65.59766763848395,
35
- "anls": 63.78712934826842,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
- "semantic": 75.03001200480193,
40
  "anls": 69.64985240877743,
41
  "n": 51
42
  },
43
  "by_domain": {
44
  "Cases/Logs": {
45
- "semantic": 88.43537414965988,
46
- "anls": 85.12820512820514,
47
  "n": 15
48
  },
49
  "Education": {
@@ -52,13 +52,13 @@
52
  "n": 22
53
  },
54
  "Events": {
55
- "semantic": 82.90816326530613,
56
  "anls": 79.84423442344234,
57
  "n": 24
58
  },
59
  "Financial": {
60
- "semantic": 68.21206743566992,
61
- "anls": 63.13552237747254,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
@@ -67,17 +67,17 @@
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
- "semantic": 89.01432913590969,
71
- "anls": 78.26722646935413,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
  "semantic": 89.59681433549028,
76
- "anls": 77.34609828919353,
77
  "n": 41
78
  },
79
  "Legal": {
80
- "semantic": 84.24299952539157,
81
  "anls": 68.10496996543507,
82
  "n": 43
83
  },
@@ -92,22 +92,22 @@
92
  "n": 24
93
  },
94
  "Other": {
95
- "semantic": 0.0,
96
- "anls": 0.0,
97
  "n": 1
98
  },
99
  "Reference": {
100
- "semantic": 84.37990580847723,
101
- "anls": 83.68517307852197,
102
  "n": 52
103
  },
104
  "Reports": {
105
- "semantic": 77.90954219525649,
106
- "anls": 71.94584088751826,
107
  "n": 74
108
  },
109
  "Technical": {
110
- "semantic": 79.85803016858917,
111
  "anls": 55.56822369489126,
112
  "n": 23
113
  }
@@ -115,7 +115,7 @@
115
  "n_evaluated": 499,
116
  "n_unmatched": 0
117
  },
118
- "reevaluated_date": "2026-01-16T16:11:07.316820+00:00",
119
  "source_predictions_file": "OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_predictions_20260109_185347.jsonl",
120
  "result_file_path": "OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_results_20260109_185347.json"
121
  }
 
15
  "submission_date": "2026-01-09T18:53:47.189606+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 82.20522678009081,
19
  "semantic_ci": [
20
+ 78.66223544202387,
21
+ 85.74821811815777
22
  ],
23
+ "anls": 74.76545756458952,
24
  "page_f1": 78.4607309857811,
25
  "doc_f1": 90.20248288785363,
26
+ "kuiper": 25.80160320641279
27
  },
28
  "single_evidence": {
29
+ "semantic": 87.04305898183449,
30
+ "anls": 77.74037073431458,
31
  "n": 364
32
  },
33
  "multi_evidence_same_doc": {
34
+ "semantic": 66.81243926141885,
35
+ "anls": 64.98007029276216,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
+ "semantic": 73.02921168467388,
40
  "anls": 69.64985240877743,
41
  "n": 51
42
  },
43
  "by_domain": {
44
  "Cases/Logs": {
45
+ "semantic": 95.23809523809524,
46
+ "anls": 91.7948717948718,
47
  "n": 15
48
  },
49
  "Education": {
 
52
  "n": 22
53
  },
54
  "Events": {
55
+ "semantic": 87.15986394557822,
56
  "anls": 79.84423442344234,
57
  "n": 24
58
  },
59
  "Financial": {
60
+ "semantic": 68.76663708961844,
61
+ "anls": 65.75226955943711,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
 
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
+ "semantic": 90.09986973512808,
71
+ "anls": 80.50699949010338,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
  "semantic": 89.59681433549028,
76
+ "anls": 78.32170804529109,
77
  "n": 41
78
  },
79
  "Legal": {
80
+ "semantic": 85.4295206454675,
81
  "anls": 68.10496996543507,
82
  "n": 43
83
  },
 
92
  "n": 24
93
  },
94
  "Other": {
95
+ "semantic": 100.0,
96
+ "anls": 100.0,
97
  "n": 1
98
  },
99
  "Reference": {
100
+ "semantic": 88.30455259026688,
101
+ "anls": 84.35364711656122,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 78.59900717043573,
106
+ "anls": 71.94863871640305,
107
  "n": 74
108
  },
109
  "Technical": {
110
+ "semantic": 75.42147293700089,
111
  "anls": 55.56822369489126,
112
  "n": 23
113
  }
 
115
  "n_evaluated": 499,
116
  "n_unmatched": 0
117
  },
118
+ "reevaluated_date": "2026-01-22T22:17:55.213452+00:00",
119
  "source_predictions_file": "OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_predictions_20260109_185347.jsonl",
120
  "result_file_path": "OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_results_20260109_185347.json"
121
  }
eval/reevaluated_results/OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_154427.json CHANGED
@@ -15,35 +15,35 @@
15
  "submission_date": "2026-01-09T15:44:27.735534+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 46.73469387755102,
19
  "semantic_ci": [
20
- 42.278445434797305,
21
- 51.190942320304735
22
  ],
23
- "anls": 45.649762341432954,
24
  "page_f1": 43.169719169719166,
25
  "doc_f1": 59.24761904761905,
26
  "kuiper": null
27
  },
28
  "single_evidence": {
29
- "semantic": 51.02040816326531,
30
- "anls": 49.144211108257174,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
- "semantic": 32.79883381924198,
35
- "anls": 35.078637953143826,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
- "semantic": 39.015606242497,
40
  "anls": 38.05173584585349,
41
  "n": 51
42
  },
43
  "by_domain": {
44
  "Cases/Logs": {
45
  "semantic": 54.42176870748299,
46
- "anls": 48.75783475783476,
47
  "n": 15
48
  },
49
  "Education": {
@@ -52,32 +52,32 @@
52
  "n": 22
53
  },
54
  "Events": {
55
- "semantic": 57.39795918367348,
56
- "anls": 55.056754787358244,
57
  "n": 24
58
  },
59
  "Financial": {
60
- "semantic": 44.365572315882865,
61
- "anls": 48.16466676354977,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
65
- "semantic": 51.02040816326531,
66
  "anls": 44.99022482893451,
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
  "semantic": 54.277029960920544,
71
- "anls": 47.1956486962086,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
- "semantic": 46.042807366849175,
76
  "anls": 32.93040293040293,
77
  "n": 41
78
  },
79
  "Legal": {
80
- "semantic": 39.15519696250593,
81
  "anls": 35.73555320648344,
82
  "n": 43
83
  },
@@ -97,17 +97,17 @@
97
  "n": 1
98
  },
99
  "Reference": {
100
- "semantic": 45.13343799058085,
101
- "anls": 48.954805357154754,
102
  "n": 52
103
  },
104
  "Reports": {
105
- "semantic": 51.02040816326531,
106
- "anls": 50.65907330372244,
107
  "n": 75
108
  },
109
  "Technical": {
110
- "semantic": 51.02040816326531,
111
  "anls": 46.20014437749956,
112
  "n": 23
113
  }
@@ -115,7 +115,7 @@
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
- "reevaluated_date": "2026-01-16T16:14:49.523407+00:00",
119
  "source_predictions_file": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_predictions_20260109_154427.jsonl",
120
  "result_file_path": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_154427.json"
121
  }
 
15
  "submission_date": "2026-01-09T15:44:27.735534+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 48.57142857142857,
19
  "semantic_ci": [
20
+ 44.10452956979305,
21
+ 53.038327573064095
22
  ],
23
+ "anls": 46.86058848865213,
24
  "page_f1": 43.169719169719166,
25
  "doc_f1": 59.24761904761905,
26
  "kuiper": null
27
  },
28
  "single_evidence": {
29
+ "semantic": 53.955828906905246,
30
+ "anls": 50.80230983229035,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
+ "semantic": 32.19144800777454,
35
+ "anls": 35.08110270716138,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
+ "semantic": 37.01480592236896,
40
  "anls": 38.05173584585349,
41
  "n": 51
42
  },
43
  "by_domain": {
44
  "Cases/Logs": {
45
  "semantic": 54.42176870748299,
46
+ "anls": 52.09116809116809,
47
  "n": 15
48
  },
49
  "Education": {
 
52
  "n": 22
53
  },
54
  "Events": {
55
+ "semantic": 61.64965986394556,
56
+ "anls": 57.140088120691566,
57
  "n": 24
58
  },
59
  "Financial": {
60
+ "semantic": 51.02040816326531,
61
+ "anls": 53.968561691086,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
65
+ "semantic": 44.642857142857146,
66
  "anls": 44.99022482893451,
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
  "semantic": 54.277029960920544,
71
+ "anls": 46.13981762917933,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
+ "semantic": 42.309606769537076,
76
  "anls": 32.93040293040293,
77
  "n": 41
78
  },
79
  "Legal": {
80
+ "semantic": 40.34171808258187,
81
  "anls": 35.73555320648344,
82
  "n": 43
83
  },
 
97
  "n": 1
98
  },
99
  "Reference": {
100
+ "semantic": 47.09576138147568,
101
+ "anls": 49.356185378099994,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 54.42176870748299,
106
+ "anls": 50.66183382822209,
107
  "n": 75
108
  },
109
  "Technical": {
110
+ "semantic": 53.238686779059464,
111
  "anls": 46.20014437749956,
112
  "n": 23
113
  }
 
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
+ "reevaluated_date": "2026-01-22T22:18:55.670607+00:00",
119
  "source_predictions_file": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_predictions_20260109_154427.jsonl",
120
  "result_file_path": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_154427.json"
121
  }
eval/reevaluated_results/OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_175639.json CHANGED
@@ -15,34 +15,34 @@
15
  "submission_date": "2026-01-09T17:56:39.771528+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 50.7142857142857,
19
  "semantic_ci": [
20
- 46.24231243352625,
21
- 55.186258995045144
22
  ],
23
- "anls": 47.46445252141211,
24
  "page_f1": 48.43228327228327,
25
  "doc_f1": 62.30761904761904,
26
  "kuiper": null
27
  },
28
  "single_evidence": {
29
- "semantic": 54.095610847078554,
30
- "anls": 49.17254529050683,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
- "semantic": 35.22837706511177,
35
- "anls": 39.524985989825936,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
- "semantic": 52.02080832332933,
40
  "anls": 48.31663542207227,
41
  "n": 51
42
  },
43
  "by_domain": {
44
  "Cases/Logs": {
45
- "semantic": 57.82312925170068,
46
  "anls": 43.64672364672364,
47
  "n": 15
48
  },
@@ -52,13 +52,13 @@
52
  "n": 22
53
  },
54
  "Events": {
55
- "semantic": 48.89455782312926,
56
  "anls": 46.90982404692082,
57
  "n": 24
58
  },
59
  "Financial": {
60
- "semantic": 48.802129547471154,
61
- "anls": 48.83531625708929,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
@@ -67,17 +67,17 @@
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
- "semantic": 61.87581415544942,
71
- "anls": 49.070286122357786,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
- "semantic": 41.06520657043304,
76
  "anls": 34.149915125524885,
77
  "n": 41
78
  },
79
  "Legal": {
80
- "semantic": 46.27432368296155,
81
  "anls": 46.299372462163156,
82
  "n": 43
83
  },
@@ -97,12 +97,12 @@
97
  "n": 1
98
  },
99
  "Reference": {
100
- "semantic": 58.86970172684458,
101
- "anls": 58.28202679165414,
102
  "n": 52
103
  },
104
  "Reports": {
105
- "semantic": 54.42176870748299,
106
  "anls": 52.18098320525303,
107
  "n": 75
108
  },
@@ -115,7 +115,7 @@
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
- "reevaluated_date": "2026-01-16T16:17:00.809294+00:00",
119
  "source_predictions_file": "OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_predictions_20260109_175639.jsonl",
120
  "result_file_path": "OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_175639.json"
121
  }
 
15
  "submission_date": "2026-01-09T17:56:39.771528+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 52.85714285714286,
19
  "semantic_ci": [
20
+ 48.3879879090649,
21
+ 57.326297805220825
22
  ],
23
+ "anls": 48.84111465957483,
24
  "page_f1": 48.43228327228327,
25
  "doc_f1": 62.30761904761904,
26
  "kuiper": null
27
  },
28
  "single_evidence": {
29
+ "semantic": 56.05255800950517,
30
+ "anls": 50.78441123319547,
31
  "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
+ "semantic": 38.87269193391642,
35
+ "anls": 40.715462180302126,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
+ "semantic": 53.02120848339336,
40
  "anls": 48.31663542207227,
41
  "n": 51
42
  },
43
  "by_domain": {
44
  "Cases/Logs": {
45
+ "semantic": 61.22448979591838,
46
  "anls": 43.64672364672364,
47
  "n": 15
48
  },
 
52
  "n": 22
53
  },
54
  "Events": {
55
+ "semantic": 53.14625850340138,
56
  "anls": 46.90982404692082,
57
  "n": 24
58
  },
59
  "Financial": {
60
+ "semantic": 56.011535048802116,
61
+ "anls": 54.83808397045483,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
 
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
+ "semantic": 66.21797655232307,
71
+ "anls": 51.225941217542555,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
+ "semantic": 39.82080637132901,
76
  "anls": 34.149915125524885,
77
  "n": 41
78
  },
79
  "Legal": {
80
+ "semantic": 47.46084480303749,
81
  "anls": 46.299372462163156,
82
  "n": 43
83
  },
 
97
  "n": 1
98
  },
99
  "Reference": {
100
+ "semantic": 56.907378335949765,
101
+ "anls": 58.9505008296934,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 56.4625850340136,
106
  "anls": 52.18098320525303,
107
  "n": 75
108
  },
 
115
  "n_evaluated": 500,
116
  "n_unmatched": 0
117
  },
118
+ "reevaluated_date": "2026-01-22T22:19:59.654603+00:00",
119
  "source_predictions_file": "OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_predictions_20260109_175639.jsonl",
120
  "result_file_path": "OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_175639.json"
121
  }
eval/reevaluated_results/Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_results_20260110_132227.json CHANGED
@@ -15,28 +15,28 @@
15
  "submission_date": "2026-01-10T13:22:27.811792+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 44.57895382601939,
19
  "semantic_ci": [
20
- 40.138241872067184,
21
- 49.01966577997159
22
  ],
23
- "anls": 30.17090068718362,
24
- "page_f1": 28.991793110029583,
25
- "doc_f1": 51.58650634602539,
26
- "kuiper": 26.255020080321316
27
  },
28
  "single_evidence": {
29
- "semantic": 50.88024220677282,
30
- "anls": 35.972008760814205,
31
- "n": 364
32
  },
33
  "multi_evidence_same_doc": {
34
- "semantic": 16.399416909621,
35
- "anls": 12.405045351473925,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
- "semantic": 46.018407362945176,
40
  "anls": 18.02832244008715,
41
  "n": 51
42
  },
@@ -47,32 +47,32 @@
47
  "n": 15
48
  },
49
  "Education": {
50
- "semantic": 57.97773654916514,
51
  "anls": 34.34782608695652,
52
  "n": 22
53
  },
54
  "Events": {
55
- "semantic": 72.27891156462587,
56
- "anls": 52.92922722985768,
57
  "n": 24
58
  },
59
  "Financial": {
60
- "semantic": 34.38331854480922,
61
- "anls": 23.538822057620244,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
65
- "semantic": 19.132653061224488,
66
  "anls": 21.39516129032258,
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
- "semantic": 37.99392097264438,
71
- "anls": 29.5464725643897,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
- "semantic": 53.50920856147338,
76
  "anls": 37.17815890071988,
77
  "n": 41
78
  },
@@ -87,17 +87,22 @@
87
  "n": 25
88
  },
89
  "Misc": {
90
- "semantic": 68.02721088435374,
91
  "anls": 48.707026404394824,
92
  "n": 24
93
  },
 
 
 
 
 
94
  "Reference": {
95
- "semantic": 40.22762951334379,
96
- "anls": 23.25877926421405,
97
  "n": 52
98
  },
99
  "Reports": {
100
- "semantic": 42.85714285714285,
101
  "anls": 25.79399206429042,
102
  "n": 75
103
  },
@@ -107,10 +112,10 @@
107
  "n": 23
108
  }
109
  },
110
- "n_evaluated": 499,
111
- "n_unmatched": 1767
112
  },
113
- "reevaluated_date": "2026-01-16T16:19:24.691340+00:00",
114
  "source_predictions_file": "Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_predictions_20260110_132227.jsonl",
115
  "result_file_path": "Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_results_20260110_132227.json"
116
  }
 
15
  "submission_date": "2026-01-10T13:22:27.811792+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 46.020408163265316,
19
  "semantic_ci": [
20
+ 41.56988099714986,
21
+ 50.470935329380765
22
  ],
23
+ "anls": 30.43927742127836,
24
+ "page_f1": 28.93380952380953,
25
+ "doc_f1": 51.483333333333334,
26
+ "kuiper": 27.468937875751397
27
  },
28
  "single_evidence": {
29
+ "semantic": 52.278445624825274,
30
+ "anls": 36.186768374440895,
31
+ "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
+ "semantic": 18.221574344023328,
35
+ "anls": 13.000283446712018,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
+ "semantic": 47.0188075230092,
40
  "anls": 18.02832244008715,
41
  "n": 51
42
  },
 
47
  "n": 15
48
  },
49
  "Education": {
50
+ "semantic": 60.29684601113172,
51
  "anls": 34.34782608695652,
52
  "n": 22
53
  },
54
  "Events": {
55
+ "semantic": 74.4047619047619,
56
+ "anls": 55.012560563191016,
57
  "n": 24
58
  },
59
  "Financial": {
60
+ "semantic": 36.601597160603376,
61
+ "anls": 24.625778579359373,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
65
+ "semantic": 22.321428571428573,
66
  "anls": 21.39516129032258,
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
+ "semantic": 40.16500217108119,
71
+ "anls": 29.574468085106382,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
+ "semantic": 49.77600796416127,
76
  "anls": 37.17815890071988,
77
  "n": 41
78
  },
 
87
  "n": 25
88
  },
89
  "Misc": {
90
+ "semantic": 70.1530612244898,
91
  "anls": 48.707026404394824,
92
  "n": 24
93
  },
94
+ "Other": {
95
+ "semantic": 100.0,
96
+ "anls": 0.0,
97
+ "n": 1
98
+ },
99
  "Reference": {
100
+ "semantic": 43.171114599686035,
101
+ "anls": 23.509615384615383,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 44.897959183673464,
106
  "anls": 25.79399206429042,
107
  "n": 75
108
  },
 
112
  "n": 23
113
  }
114
  },
115
+ "n_evaluated": 500,
116
+ "n_unmatched": 1766
117
  },
118
+ "reevaluated_date": "2026-01-22T22:21:10.223032+00:00",
119
  "source_predictions_file": "Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_predictions_20260110_132227.jsonl",
120
  "result_file_path": "Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_results_20260110_132227.json"
121
  }
eval/reevaluated_results/Z.AI/GLM-4.6V_with_BM25_Search_Tool_results_20260110_131826.json CHANGED
@@ -15,28 +15,28 @@
15
  "submission_date": "2026-01-10T13:18:26.686587+00:00",
16
  "results": {
17
  "overall": {
18
- "semantic": 65.64148705574414,
19
  "semantic_ci": [
20
- 61.35270759605711,
21
- 69.93026651543116
22
  ],
23
- "anls": 59.661893537203156,
24
- "page_f1": 66.02347552247352,
25
- "doc_f1": 86.7908978129419,
26
- "kuiper": 50.23185483870943
27
  },
28
  "single_evidence": {
29
- "semantic": 71.34447185467593,
30
- "anls": 64.16155076739506,
31
- "n": 364
32
  },
33
  "multi_evidence_same_doc": {
34
  "semantic": 37.0505344995141,
35
- "anls": 36.01688306453112,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
- "semantic": 72.02881152460986,
40
  "anls": 66.49141604533267,
41
  "n": 51
42
  },
@@ -52,32 +52,32 @@
52
  "n": 22
53
  },
54
  "Events": {
55
- "semantic": 78.65646258503402,
56
- "anls": 67.87290397408577,
57
  "n": 24
58
  },
59
  "Financial": {
60
  "semantic": 53.79325643300798,
61
- "anls": 51.19993983845437,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
65
- "semantic": 63.775510204081634,
66
  "anls": 62.5648667601683,
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
- "semantic": 75.98784194528876,
71
- "anls": 70.93589720557641,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
- "semantic": 67.19761075161773,
76
- "anls": 59.735891761304075,
77
  "n": 41
78
  },
79
  "Legal": {
80
- "semantic": 70.0047460844803,
81
  "anls": 55.536175710594314,
82
  "n": 43
83
  },
@@ -87,30 +87,35 @@
87
  "n": 25
88
  },
89
  "Misc": {
90
- "semantic": 74.4047619047619,
91
  "anls": 75.10160446706249,
92
  "n": 24
93
  },
 
 
 
 
 
94
  "Reference": {
95
- "semantic": 59.85086342229201,
96
- "anls": 60.632124141167864,
97
  "n": 52
98
  },
99
  "Reports": {
100
- "semantic": 63.94557823129252,
101
  "anls": 56.89167319856098,
102
  "n": 75
103
  },
104
  "Technical": {
105
- "semantic": 68.76663708961844,
106
  "anls": 51.450020851943364,
107
  "n": 23
108
  }
109
  },
110
- "n_evaluated": 499,
111
- "n_unmatched": 1767
112
  },
113
- "reevaluated_date": "2026-01-16T16:20:28.331848+00:00",
114
  "source_predictions_file": "Z.AI/GLM-4.6V_with_BM25_Search_Tool_predictions_20260110_131826.jsonl",
115
  "result_file_path": "Z.AI/GLM-4.6V_with_BM25_Search_Tool_results_20260110_131826.json"
116
  }
 
15
  "submission_date": "2026-01-10T13:18:26.686587+00:00",
16
  "results": {
17
  "overall": {
18
+ "semantic": 66.12244897959185,
19
  "semantic_ci": [
20
+ 61.850797145071965,
21
+ 70.39410081411174
22
  ],
23
+ "anls": 60.25345765733381,
24
+ "page_f1": 65.89142857142856,
25
+ "doc_f1": 86.61731601731603,
26
+ "kuiper": 51.43661971830999
27
  },
28
  "single_evidence": {
29
+ "semantic": 72.2672630696114,
30
+ "anls": 64.8225984463954,
31
+ "n": 365
32
  },
33
  "multi_evidence_same_doc": {
34
  "semantic": 37.0505344995141,
35
+ "anls": 36.61212115976921,
36
  "n": 84
37
  },
38
  "multi_evidence_multi_doc": {
39
+ "semantic": 70.0280112044818,
40
  "anls": 66.49141604533267,
41
  "n": 51
42
  },
 
52
  "n": 22
53
  },
54
  "Events": {
55
+ "semantic": 80.78231292517005,
56
+ "anls": 69.9562373074191,
57
  "n": 24
58
  },
59
  "Financial": {
60
  "semantic": 53.79325643300798,
61
+ "anls": 52.97859596493658,
62
  "n": 92
63
  },
64
  "Financial/Tax": {
65
+ "semantic": 70.1530612244898,
66
  "anls": 62.5648667601683,
67
  "n": 16
68
  },
69
  "Government/Regulatory": {
70
+ "semantic": 77.07338254450717,
71
+ "anls": 71.3587742867764,
72
  "n": 47
73
  },
74
  "HR/Employment": {
75
+ "semantic": 64.70881035340967,
76
+ "anls": 60.27789718135828,
77
  "n": 41
78
  },
79
  "Legal": {
80
+ "semantic": 67.63170384432841,
81
  "anls": 55.536175710594314,
82
  "n": 43
83
  },
 
87
  "n": 25
88
  },
89
  "Misc": {
90
+ "semantic": 72.27891156462587,
91
  "anls": 75.10160446706249,
92
  "n": 24
93
  },
94
+ "Other": {
95
+ "semantic": 100.0,
96
+ "anls": 86.66666666666667,
97
+ "n": 1
98
+ },
99
  "Reference": {
100
+ "semantic": 63.775510204081634,
101
+ "anls": 60.88296026156921,
102
  "n": 52
103
  },
104
  "Reports": {
105
+ "semantic": 65.30612244897961,
106
  "anls": 56.89167319856098,
107
  "n": 75
108
  },
109
  "Technical": {
110
+ "semantic": 66.54835847382431,
111
  "anls": 51.450020851943364,
112
  "n": 23
113
  }
114
  },
115
+ "n_evaluated": 500,
116
+ "n_unmatched": 1766
117
  },
118
+ "reevaluated_date": "2026-01-22T22:22:05.057646+00:00",
119
  "source_predictions_file": "Z.AI/GLM-4.6V_with_BM25_Search_Tool_predictions_20260110_131826.jsonl",
120
  "result_file_path": "Z.AI/GLM-4.6V_with_BM25_Search_Tool_results_20260110_131826.json"
121
  }