Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- app.py +7 -0
- eval/cleanup_submissions.py +181 -0
- eval/delete_unlinked.py +87 -0
- eval/link_file_search_predictions.py +103 -0
- eval/metrics.py +5 -1
- eval/reevaluate_submissions.py +1 -0
- eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_results_20260110_131629.json +33 -28
- eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_results_20260110_132054.json +33 -28
- eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_results_20260110_132358.json +33 -28
- eval/reevaluated_results/Anthropic/Claude_Haiku_4.5_(2025-10-01)_results_20260109_130319.json +23 -23
- eval/reevaluated_results/Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_125816.json +25 -25
- eval/reevaluated_results/Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_182559.json +24 -24
- eval/reevaluated_results/Google/Gemini_2.5_Flash_with_File_Search_results_20260103_221253.json +120 -0
- eval/reevaluated_results/Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_183030.json +28 -28
- eval/reevaluated_results/Google/Gemini_2.5_Pro_with_File_Search_results_20260103_221943.json +120 -0
- eval/reevaluated_results/Google/Gemini_3.0_Pro_(Preview)_with_File_Search_results_20260104_120431.json +120 -0
- eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260117_193634.json +121 -0
- eval/reevaluated_results/Humanity/Human_with_Oracle_Retriever_results_20260122_214532.json +119 -0
- eval/reevaluated_results/OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153221.json +25 -25
- eval/reevaluated_results/OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153812.json +8 -8
- eval/reevaluated_results/OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_results_20260109_151912.json +27 -27
- eval/reevaluated_results/OpenAI/GPT-5.2_(2025-12-11)_with_File_Search_results_20260104_121551.json +120 -0
- eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152104.json +27 -27
- eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_File_Search_results_20260104_140537.json +20 -20
- eval/reevaluated_results/OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152650.json +25 -25
- eval/reevaluated_results/OpenAI/GPT-5_Mini_(2025-08-07)_with_File_Search_results_20260104_122026.json +120 -0
- eval/reevaluated_results/OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152828.json +25 -25
- eval/reevaluated_results/OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153516.json +22 -22
- eval/reevaluated_results/OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_results_20260109_185347.json +27 -27
- eval/reevaluated_results/OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_154427.json +24 -24
- eval/reevaluated_results/OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_175639.json +21 -21
- eval/reevaluated_results/Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_results_20260110_132227.json +34 -29
- eval/reevaluated_results/Z.AI/GLM-4.6V_with_BM25_Search_Tool_results_20260110_131826.json +34 -29
app.py
CHANGED
|
@@ -1172,6 +1172,7 @@ def render_leaderboard_table(df: pd.DataFrame, columns: list, show_analyze_colum
|
|
| 1172 |
cells = []
|
| 1173 |
model_name = row.get("Model", "")
|
| 1174 |
organization = row.get("Organization", "")
|
|
|
|
| 1175 |
|
| 1176 |
# Check if this is a human performance row (should merge Model, Organization, Model Type)
|
| 1177 |
is_human_row = organization == "Humanity"
|
|
@@ -1273,6 +1274,9 @@ def render_leaderboard_table(df: pd.DataFrame, columns: list, show_analyze_colum
|
|
| 1273 |
cells.append(f'<td style="text-align: center;">{cell_html}</td>')
|
| 1274 |
elif col.startswith("Attribution"):
|
| 1275 |
# Format F1 scores (scale 0-100) - NOT bias-adjusted
|
|
|
|
|
|
|
|
|
|
| 1276 |
try:
|
| 1277 |
attr_val = f"{float(value):.1f}" if value else "0"
|
| 1278 |
attr_float = float(value) if value else 0
|
|
@@ -1309,6 +1313,9 @@ def render_leaderboard_table(df: pd.DataFrame, columns: list, show_analyze_colum
|
|
| 1309 |
elif col == "Effort (Kuiper)":
|
| 1310 |
# Format Kuiper statistic (lower is better for calibration)
|
| 1311 |
# Hide for Conventional RAG models (not meaningful)
|
|
|
|
|
|
|
|
|
|
| 1312 |
tags = row.get("Tags", [])
|
| 1313 |
is_conventional_rag = "Conventional RAG" in tags if isinstance(tags, list) else False
|
| 1314 |
if is_conventional_rag:
|
|
|
|
| 1172 |
cells = []
|
| 1173 |
model_name = row.get("Model", "")
|
| 1174 |
organization = row.get("Organization", "")
|
| 1175 |
+
hide_attrib_kuiper = model_name == "Human with Oracle Retriever"
|
| 1176 |
|
| 1177 |
# Check if this is a human performance row (should merge Model, Organization, Model Type)
|
| 1178 |
is_human_row = organization == "Humanity"
|
|
|
|
| 1274 |
cells.append(f'<td style="text-align: center;">{cell_html}</td>')
|
| 1275 |
elif col.startswith("Attribution"):
|
| 1276 |
# Format F1 scores (scale 0-100) - NOT bias-adjusted
|
| 1277 |
+
if hide_attrib_kuiper:
|
| 1278 |
+
cells.append('<td style="text-align: center;">—</td>')
|
| 1279 |
+
continue
|
| 1280 |
try:
|
| 1281 |
attr_val = f"{float(value):.1f}" if value else "0"
|
| 1282 |
attr_float = float(value) if value else 0
|
|
|
|
| 1313 |
elif col == "Effort (Kuiper)":
|
| 1314 |
# Format Kuiper statistic (lower is better for calibration)
|
| 1315 |
# Hide for Conventional RAG models (not meaningful)
|
| 1316 |
+
if hide_attrib_kuiper:
|
| 1317 |
+
cells.append('<td style="text-align: center;">—</td>')
|
| 1318 |
+
continue
|
| 1319 |
tags = row.get("Tags", [])
|
| 1320 |
is_conventional_rag = "Conventional RAG" in tags if isinstance(tags, list) else False
|
| 1321 |
if is_conventional_rag:
|
eval/cleanup_submissions.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Clean up backend-results submissions by keeping only the newest prediction per prefix
|
| 4 |
+
and the matching results file (if available). Optionally updates kept results to
|
| 5 |
+
record source_predictions_file and result_file_path.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import argparse
|
| 9 |
+
import json
|
| 10 |
+
import os
|
| 11 |
+
import re
|
| 12 |
+
from dataclasses import dataclass
|
| 13 |
+
from typing import Dict, List, Optional, Tuple
|
| 14 |
+
|
| 15 |
+
from huggingface_hub import HfApi, hf_hub_download, list_repo_files
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
RESULTS_REPO = "agentic-document-ai/backend-results"
|
| 19 |
+
TOKEN = os.environ.get("HF_TOKEN")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass
class FileEntry:
    """A timestamped repository file, split into the parts of its name."""

    # Full path of the file inside the repository.
    path: str
    # Text before the "_predictions_" / "_results_" marker in the path.
    prefix: str
    # Timestamp text as it appears in the file name (extension removed).
    ts_raw: str
    # Parsed (year, month, day, hour, minute, second); used as the sort key.
    ts_key: Tuple[int, int, int, int, int, int]
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _parse_timestamp(ts: str) -> Tuple[int, int, int, int, int, int]:
|
| 31 |
+
"""Parse timestamps in YYYYMMDD_HHMMSS or YYYY-MM-DDTHH-MM-SSZ format."""
|
| 32 |
+
match_compact = re.match(r"^(\d{8})_(\d{6})$", ts)
|
| 33 |
+
if match_compact:
|
| 34 |
+
ymd, hms = match_compact.groups()
|
| 35 |
+
return (
|
| 36 |
+
int(ymd[0:4]),
|
| 37 |
+
int(ymd[4:6]),
|
| 38 |
+
int(ymd[6:8]),
|
| 39 |
+
int(hms[0:2]),
|
| 40 |
+
int(hms[2:4]),
|
| 41 |
+
int(hms[4:6]),
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
match_iso = re.match(r"^(\d{4})-(\d{2})-(\d{2})T(\d{2})-(\d{2})-(\d{2})Z$", ts)
|
| 45 |
+
if match_iso:
|
| 46 |
+
return tuple(int(x) for x in match_iso.groups()) # type: ignore[return-value]
|
| 47 |
+
|
| 48 |
+
# Fallback: treat as zero to make it always older than parsed timestamps
|
| 49 |
+
return (0, 0, 0, 0, 0, 0)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _split_predictions(path: str) -> Optional[FileEntry]:
    """Split a predictions path into prefix and timestamp.

    Args:
        path: Repository path to inspect.

    Returns:
        A ``FileEntry`` when ``path`` contains ``_predictions_`` and ends
        with ``.jsonl``; otherwise ``None``.
    """
    marker, suffix = "_predictions_", ".jsonl"
    if marker not in path or not path.endswith(suffix):
        return None
    prefix, ts = path.rsplit(marker, 1)
    # Drop only the trailing extension; str.replace would also delete any
    # ".jsonl" that happens to appear earlier in the timestamp part.
    if ts.endswith(suffix):
        ts = ts[: -len(suffix)]
    return FileEntry(path=path, prefix=prefix, ts_raw=ts, ts_key=_parse_timestamp(ts))
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def _split_results(path: str) -> Optional[FileEntry]:
    """Split a results path into prefix and timestamp.

    Args:
        path: Repository path to inspect.

    Returns:
        A ``FileEntry`` when ``path`` contains ``_results_`` and ends with
        ``.json``; otherwise ``None``.
    """
    marker, suffix = "_results_", ".json"
    if marker not in path or not path.endswith(suffix):
        return None
    prefix, ts = path.rsplit(marker, 1)
    # Drop only the trailing extension; str.replace would also delete any
    # ".json" that happens to appear earlier in the timestamp part.
    if ts.endswith(suffix):
        ts = ts[: -len(suffix)]
    return FileEntry(path=path, prefix=prefix, ts_raw=ts, ts_key=_parse_timestamp(ts))
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def main() -> int:
    """Plan, and with ``--apply`` execute, a cleanup of the results repo.

    Files are grouped by prefix. Within each prefix every predictions file
    and every results file except the newest is queued for deletion. When
    the newest results file carries the same timestamp text as the newest
    predictions file, the results JSON is queued for an update that records
    ``source_predictions_file`` and ``result_file_path``. Otherwise the
    prefix is reported as unmatched, and with ``--drop-unmatched-results``
    its newest results file is queued for deletion too.

    Without ``--apply`` the plan is only printed.

    Returns:
        ``0`` in all cases that return normally.
    """
    parser = argparse.ArgumentParser(description="Clean backend-results submissions")
    parser.add_argument("--apply", action="store_true", help="Apply deletions/updates (default is dry-run)")
    parser.add_argument("--drop-unmatched-results", action="store_true",
                        help="Delete results that do not match the latest prediction timestamp")
    args = parser.parse_args()

    api = HfApi(token=TOKEN)
    files = list_repo_files(RESULTS_REPO, repo_type="dataset", token=TOKEN)

    predictions: List[FileEntry] = []
    results: List[FileEntry] = []

    for f in files:
        p = _split_predictions(f)
        if p:
            predictions.append(p)
            continue
        r = _split_results(f)
        if r:
            results.append(r)

    preds_by_prefix: Dict[str, List[FileEntry]] = {}
    results_by_prefix: Dict[str, List[FileEntry]] = {}

    for p in predictions:
        preds_by_prefix.setdefault(p.prefix, []).append(p)
    for r in results:
        results_by_prefix.setdefault(r.prefix, []).append(r)

    to_delete: List[str] = []
    to_update: List[Tuple[str, str]] = []  # (result_path, prediction_path)
    unmatched_latest: List[str] = []

    all_prefixes = sorted(set(preds_by_prefix) | set(results_by_prefix))
    for prefix in all_prefixes:
        # Newest first, so index 0 is the entry to keep.
        preds = sorted(preds_by_prefix.get(prefix, []), key=lambda x: x.ts_key, reverse=True)
        res = sorted(results_by_prefix.get(prefix, []), key=lambda x: x.ts_key, reverse=True)

        latest_pred = preds[0] if preds else None
        latest_res = res[0] if res else None

        # Remove all older predictions and older results; keep the newest.
        to_delete.extend(old.path for old in preds[1:])
        to_delete.extend(old.path for old in res[1:])

        if latest_pred and latest_res:
            # If a result matches the latest prediction timestamp, link it.
            if latest_res.ts_raw == latest_pred.ts_raw:
                to_update.append((latest_res.path, latest_pred.path))
            else:
                unmatched_latest.append(prefix)
                if args.drop_unmatched_results:
                    to_delete.append(latest_res.path)
        elif latest_res and not latest_pred:
            # Results without any predictions.
            if args.drop_unmatched_results:
                to_delete.append(latest_res.path)

    print(f"Predictions: {len(predictions)}")
    print(f"Results: {len(results)}")
    print(f"Delete candidates: {len(to_delete)}")
    # List every candidate so a dry-run can be reviewed before --apply.
    for path in to_delete:
        print("  ", path)
    print(f"Results to update (link to latest predictions): {len(to_update)}")
    if unmatched_latest:
        print("\nPrefixes where latest result does NOT match latest prediction:")
        for p in unmatched_latest:
            print("  ", p)

    if not args.apply:
        print("\nDry-run only. Re-run with --apply to execute changes.")
        return 0

    # Apply updates
    for result_path, pred_path in to_update:
        local_path = hf_hub_download(
            repo_id=RESULTS_REPO,
            filename=result_path,
            repo_type="dataset",
            token=TOKEN,
        )
        # Read as UTF-8 explicitly: the file is re-uploaded as UTF-8 below,
        # so decoding must not depend on the platform default encoding.
        with open(local_path, encoding="utf-8") as f:
            data = json.load(f)
        data["source_predictions_file"] = pred_path
        data["result_file_path"] = result_path
        api.upload_file(
            path_or_fileobj=json.dumps(data, indent=2).encode("utf-8"),
            path_in_repo=result_path,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            token=TOKEN,
            commit_message=f"Link result to latest prediction: {result_path}",
        )

    # Apply deletions
    for path in to_delete:
        api.delete_file(
            path_in_repo=path,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            token=TOKEN,
        )

    print("Cleanup complete.")
    return 0
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
if __name__ == "__main__":
|
| 181 |
+
raise SystemExit(main())
|
eval/delete_unlinked.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Delete Humanity submissions and unlinked results from backend-results.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
export HF_TOKEN=...
|
| 7 |
+
python streamlit_app/eval/delete_unlinked.py # dry-run
|
| 8 |
+
python streamlit_app/eval/delete_unlinked.py --apply # actually delete
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import argparse
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
from huggingface_hub import HfApi, list_repo_files
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
RESULTS_REPO = "agentic-document-ai/backend-results"
|
| 18 |
+
TOKEN = os.environ.get("HF_TOKEN")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def main() -> int:
    """List, and with ``--apply`` delete, Humanity and unlinked result files.

    A results file is "unlinked" when no predictions file shares its
    ``<prefix>_<timestamp>`` key. Every file under ``Humanity/`` is also
    selected. Without ``--apply`` the selection is only printed.

    Returns:
        ``0`` when nothing needed deleting, on a dry-run, or when every
        deletion succeeded; ``1`` when at least one deletion failed.
    """
    parser = argparse.ArgumentParser(description="Delete Humanity and unlinked files")
    parser.add_argument("--apply", action="store_true", help="Actually delete (default: dry-run)")
    args = parser.parse_args()

    api = HfApi(token=TOKEN)
    files = list_repo_files(RESULTS_REPO, repo_type="dataset", token=TOKEN)

    def link_key(name: str, marker: str, suffix: str) -> str:
        """Return "<prefix>_<timestamp>" for a path known to contain *marker*."""
        prefix, _, tail = name.rpartition(marker)
        # Strip only the trailing extension, not every occurrence of it.
        if tail.endswith(suffix):
            tail = tail[: -len(suffix)]
        return f"{prefix}_{tail}"

    # key -> results path; a later file with the same key replaces an earlier one.
    result_keys = {
        link_key(f, "_results_", ".json"): f
        for f in files
        if f.endswith(".json") and "_results_" in f
    }
    pred_keys = {
        link_key(f, "_predictions_", ".jsonl")
        for f in files
        if f.endswith(".jsonl") and "_predictions_" in f
    }

    # Find unlinked results (no matching prediction)
    unlinked_results = [path for key, path in result_keys.items() if key not in pred_keys]

    # Find all Humanity files
    humanity_files = [f for f in files if f.startswith("Humanity/")]

    # Combine into deletion list (deduplicated)
    to_delete = sorted(set(unlinked_results + humanity_files))

    print(f"Files to delete: {len(to_delete)}")
    for f in to_delete:
        print(f"  {f}")

    if not to_delete:
        print("Nothing to delete.")
        return 0

    if not args.apply:
        print("\nDry-run mode. Add --apply to actually delete.")
        return 0

    print(f"\nDeleting {len(to_delete)} files...")
    failures = 0
    for f in to_delete:
        try:
            api.delete_file(
                path_in_repo=f,
                repo_id=RESULTS_REPO,
                repo_type="dataset",
                token=TOKEN,
                commit_message=f"Cleanup: delete {f}"
            )
            print(f"  ✓ Deleted: {f}")
        except Exception as e:
            # Keep going so one failure does not block the remaining files,
            # but remember it so the exit status reflects the outcome.
            failures += 1
            print(f"  ✗ Error deleting {f}: {e}")

    print("\nDone!")
    return 1 if failures else 0
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
if __name__ == "__main__":
|
| 87 |
+
raise SystemExit(main())
|
eval/link_file_search_predictions.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Upload missing File Search predictions and link them to existing results.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
export HF_TOKEN=...
|
| 7 |
+
python streamlit_app/eval/link_file_search_predictions.py --apply
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import argparse
|
| 11 |
+
import json
|
| 12 |
+
import os
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
|
| 15 |
+
from huggingface_hub import HfApi, hf_hub_download
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
RESULTS_REPO = "agentic-document-ai/backend-results"
|
| 19 |
+
TOKEN = os.environ.get("HF_TOKEN")
|
| 20 |
+
BASE_DIR = Path(__file__).resolve().parents[2] # Project root
|
| 21 |
+
FILE_SEARCH_DIR = BASE_DIR / "file_search_results"
|
| 22 |
+
|
| 23 |
+
# Map missing results -> local prediction file
|
| 24 |
+
MISSING_RESULTS = {
|
| 25 |
+
"Google/Gemini_2.5_Flash_with_File_Search_results_20260103_221253.json": "gemini-2.5-flash.jsonl",
|
| 26 |
+
"Google/Gemini_2.5_Pro_with_File_Search_results_20260103_221943.json": "gemini-2.5-pro.jsonl",
|
| 27 |
+
"Google/Gemini_3.0_Pro_(Preview)_with_File_Search_results_20260104_120431.json": "gemini-3-pro-preview.jsonl",
|
| 28 |
+
"OpenAI/GPT-5.2_(2025-12-11)_with_File_Search_results_20260104_121551.json": "gpt-5.2-2025-12-11.jsonl",
|
| 29 |
+
"OpenAI/GPT-5_Mini_(2025-08-07)_with_File_Search_results_20260104_122026.json": "gpt-5-mini-2025-08-07.jsonl",
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _pred_path_from_result(result_path: str) -> str:
|
| 34 |
+
# {org}/{model}_results_{ts}.json -> {org}/{model}_predictions_{ts}.jsonl
|
| 35 |
+
base, ts = result_path.rsplit("_results_", 1)
|
| 36 |
+
ts = ts.replace(".json", "")
|
| 37 |
+
return f"{base}_predictions_{ts}.jsonl"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def main() -> int:
    """Upload local File Search predictions and link them to their results.

    For every entry in ``MISSING_RESULTS`` the local predictions file is
    located under ``FILE_SEARCH_DIR`` and its target repository path is
    derived from the results path. Without ``--apply`` the plan is only
    printed. With ``--apply`` each predictions file is uploaded, then the
    results JSON is downloaded, given ``source_predictions_file`` and
    ``result_file_path`` fields, and uploaded back.

    Returns:
        ``0`` in all cases that return normally.

    Raises:
        FileNotFoundError: If ``FILE_SEARCH_DIR`` or any mapped local
            predictions file does not exist (checked before any upload).
    """
    parser = argparse.ArgumentParser(description="Upload file_search_results predictions and link them to results.")
    parser.add_argument("--apply", action="store_true", help="Apply uploads/updates (default: dry-run)")
    args = parser.parse_args()

    if not FILE_SEARCH_DIR.exists():
        raise FileNotFoundError(f"Missing directory: {FILE_SEARCH_DIR}")

    api = HfApi(token=TOKEN)

    # Resolve and validate the whole plan before touching the repository.
    actions = []
    for result_path, local_name in MISSING_RESULTS.items():
        local_file = FILE_SEARCH_DIR / local_name
        if not local_file.exists():
            raise FileNotFoundError(f"Missing local prediction file: {local_file}")

        pred_path = _pred_path_from_result(result_path)
        actions.append((result_path, pred_path, local_file))

    print(f"Planned uploads: {len(actions)}")
    for result_path, pred_path, local_file in actions:
        print(f"- {local_file.name} -> {pred_path}")

    if not args.apply:
        print("\nDry-run only. Re-run with --apply to execute.")
        return 0

    for result_path, pred_path, local_file in actions:
        # Upload predictions file
        api.upload_file(
            path_or_fileobj=str(local_file),
            path_in_repo=pred_path,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            token=TOKEN,
            commit_message=f"Add predictions for {pred_path}",
        )

        # Update results JSON with linkage fields
        local_result = hf_hub_download(
            repo_id=RESULTS_REPO,
            filename=result_path,
            repo_type="dataset",
            token=TOKEN,
        )
        # Read as UTF-8 explicitly: the file is re-uploaded as UTF-8 below,
        # so decoding must not depend on the platform default encoding.
        with open(local_result, encoding="utf-8") as f:
            data = json.load(f)
        data["source_predictions_file"] = pred_path
        data["result_file_path"] = result_path
        api.upload_file(
            path_or_fileobj=json.dumps(data, indent=2).encode("utf-8"),
            path_in_repo=result_path,
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            token=TOKEN,
            commit_message=f"Link result to predictions: {result_path}",
        )

    print("Done.")
    return 0
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
if __name__ == "__main__":
|
| 103 |
+
raise SystemExit(main())
|
eval/metrics.py
CHANGED
|
@@ -328,13 +328,17 @@ def _get_gemini_model():
|
|
| 328 |
def _call_gemini_with_timeout(model, prompt, timeout=30):
|
| 329 |
"""Call Gemini with a timeout using threading."""
|
| 330 |
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
|
| 332 |
def _call():
|
| 333 |
return model.generate_content(
|
| 334 |
prompt,
|
| 335 |
tools=[_LLM_JUDGE_TOOL],
|
| 336 |
tool_config={"function_calling_config": {"mode": "ANY"}},
|
| 337 |
-
generation_config={"temperature":
|
| 338 |
request_options={"timeout": timeout},
|
| 339 |
)
|
| 340 |
|
|
|
|
| 328 |
def _call_gemini_with_timeout(model, prompt, timeout=30):
|
| 329 |
"""Call Gemini with a timeout using threading."""
|
| 330 |
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
|
| 331 |
+
try:
|
| 332 |
+
temperature = float(os.environ.get("GEMINI_TEMPERATURE", "0"))
|
| 333 |
+
except ValueError:
|
| 334 |
+
temperature = 0.0
|
| 335 |
|
| 336 |
def _call():
|
| 337 |
return model.generate_content(
|
| 338 |
prompt,
|
| 339 |
tools=[_LLM_JUDGE_TOOL],
|
| 340 |
tool_config={"function_calling_config": {"mode": "ANY"}},
|
| 341 |
+
generation_config={"temperature": temperature},
|
| 342 |
request_options={"timeout": timeout},
|
| 343 |
)
|
| 344 |
|
eval/reevaluate_submissions.py
CHANGED
|
@@ -30,6 +30,7 @@ from metrics import (
|
|
| 30 |
# Config
|
| 31 |
RESULTS_REPO = "agentic-document-ai/backend-results"
|
| 32 |
TOKEN = os.environ.get("HF_TOKEN")
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def load_gold_data():
|
|
|
|
| 30 |
# Config
|
| 31 |
RESULTS_REPO = "agentic-document-ai/backend-results"
|
| 32 |
TOKEN = os.environ.get("HF_TOKEN")
|
| 33 |
+
os.environ.setdefault("GEMINI_TEMPERATURE", "0")
|
| 34 |
|
| 35 |
|
| 36 |
def load_gold_data():
|
eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_results_20260110_131629.json
CHANGED
|
@@ -15,28 +15,28 @@
|
|
| 15 |
"submission_date": "2026-01-10T13:16:29.905067+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic":
|
| 19 |
"semantic_ci": [
|
| 20 |
-
|
| 21 |
-
|
| 22 |
],
|
| 23 |
-
"anls":
|
| 24 |
-
"page_f1": 58.
|
| 25 |
-
"doc_f1": 80.
|
| 26 |
-
"kuiper":
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic":
|
| 30 |
-
"anls": 63.
|
| 31 |
-
"n":
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
"semantic": 36.443148688046655,
|
| 35 |
-
"anls":
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
-
"semantic":
|
| 40 |
"anls": 53.75681851851068,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
@@ -52,33 +52,33 @@
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
-
"semantic":
|
| 56 |
-
"anls":
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
-
"semantic":
|
| 61 |
-
"anls":
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
| 65 |
-
"semantic":
|
| 66 |
"anls": 57.39996898263027,
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
"semantic": 62.961354754667845,
|
| 71 |
-
"anls": 60.
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
-
"semantic":
|
| 76 |
-
"anls":
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
-
"semantic":
|
| 81 |
-
"anls":
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
|
@@ -91,13 +91,18 @@
|
|
| 91 |
"anls": 73.07522250524337,
|
| 92 |
"n": 24
|
| 93 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
"Reference": {
|
| 95 |
-
"semantic":
|
| 96 |
-
"anls": 63.
|
| 97 |
"n": 52
|
| 98 |
},
|
| 99 |
"Reports": {
|
| 100 |
-
"semantic":
|
| 101 |
"anls": 53.11616787903517,
|
| 102 |
"n": 75
|
| 103 |
},
|
|
@@ -107,10 +112,10 @@
|
|
| 107 |
"n": 23
|
| 108 |
}
|
| 109 |
},
|
| 110 |
-
"n_evaluated":
|
| 111 |
-
"n_unmatched":
|
| 112 |
},
|
| 113 |
-
"reevaluated_date": "2026-01-
|
| 114 |
"source_predictions_file": "Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_predictions_20260110_131629.jsonl",
|
| 115 |
"result_file_path": "Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_results_20260110_131629.json"
|
| 116 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-10T13:16:29.905067+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 60.30612244897959,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
55.908758540500614,
|
| 21 |
+
64.70348635745857
|
| 22 |
],
|
| 23 |
+
"anls": 58.219456137310175,
|
| 24 |
+
"page_f1": 58.609523809523814,
|
| 25 |
+
"doc_f1": 80.4647619047619,
|
| 26 |
+
"kuiper": 36.59000000000024
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 66.53620352250489,
|
| 30 |
+
"anls": 63.67469870606113,
|
| 31 |
+
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
"semantic": 36.443148688046655,
|
| 35 |
+
"anls": 37.224586863080155,
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
+
"semantic": 55.022008803521395,
|
| 40 |
"anls": 53.75681851851068,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
+
"semantic": 72.27891156462587,
|
| 56 |
+
"anls": 68.58097100032585,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
+
"semantic": 48.802129547471154,
|
| 61 |
+
"anls": 51.018941700195,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
| 65 |
+
"semantic": 57.39795918367348,
|
| 66 |
"anls": 57.39996898263027,
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
"semantic": 62.961354754667845,
|
| 71 |
+
"anls": 60.57273973419155,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
+
"semantic": 57.24240915878547,
|
| 76 |
+
"anls": 55.177371864188515,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
+
"semantic": 59.32605600379688,
|
| 81 |
+
"anls": 55.25846892125962,
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
|
|
|
| 91 |
"anls": 73.07522250524337,
|
| 92 |
"n": 24
|
| 93 |
},
|
| 94 |
+
"Other": {
|
| 95 |
+
"semantic": 100.0,
|
| 96 |
+
"anls": 100.0,
|
| 97 |
+
"n": 1
|
| 98 |
+
},
|
| 99 |
"Reference": {
|
| 100 |
+
"semantic": 65.73783359497645,
|
| 101 |
+
"anls": 63.24305900202882,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 59.18367346938776,
|
| 106 |
"anls": 53.11616787903517,
|
| 107 |
"n": 75
|
| 108 |
},
|
|
|
|
| 112 |
"n": 23
|
| 113 |
}
|
| 114 |
},
|
| 115 |
+
"n_evaluated": 500,
|
| 116 |
+
"n_unmatched": 1766
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T21:56:11.212274+00:00",
|
| 119 |
"source_predictions_file": "Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_predictions_20260110_131629.jsonl",
|
| 120 |
"result_file_path": "Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_results_20260110_131629.json"
|
| 121 |
}
|
eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_results_20260110_132054.json
CHANGED
|
@@ -15,28 +15,28 @@
|
|
| 15 |
"submission_date": "2026-01-10T13:20:54.125677+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic":
|
| 19 |
"semantic_ci": [
|
| 20 |
-
|
| 21 |
-
|
| 22 |
],
|
| 23 |
-
"anls":
|
| 24 |
-
"page_f1": 54.
|
| 25 |
-
"doc_f1": 78.
|
| 26 |
-
"kuiper":
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic": 65.
|
| 30 |
-
"anls":
|
| 31 |
-
"n":
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
-
"semantic": 27.
|
| 35 |
-
"anls":
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
-
"semantic":
|
| 40 |
"anls": 56.55143191196322,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
@@ -53,12 +53,12 @@
|
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
"semantic": 80.78231292517005,
|
| 56 |
-
"anls":
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
-
"semantic":
|
| 61 |
-
"anls":
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
@@ -67,13 +67,13 @@
|
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
-
"semantic":
|
| 71 |
-
"anls": 56.
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
-
"semantic":
|
| 76 |
-
"anls":
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
|
@@ -82,7 +82,7 @@
|
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
| 85 |
-
"semantic":
|
| 86 |
"anls": 54.65844817149165,
|
| 87 |
"n": 25
|
| 88 |
},
|
|
@@ -91,26 +91,31 @@
|
|
| 91 |
"anls": 73.59601449275362,
|
| 92 |
"n": 24
|
| 93 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
"Reference": {
|
| 95 |
-
"semantic":
|
| 96 |
-
"anls": 68.
|
| 97 |
"n": 52
|
| 98 |
},
|
| 99 |
"Reports": {
|
| 100 |
-
"semantic": 57.
|
| 101 |
"anls": 56.44955119487462,
|
| 102 |
"n": 75
|
| 103 |
},
|
| 104 |
"Technical": {
|
| 105 |
-
"semantic":
|
| 106 |
"anls": 51.60498619336015,
|
| 107 |
"n": 23
|
| 108 |
}
|
| 109 |
},
|
| 110 |
-
"n_evaluated":
|
| 111 |
-
"n_unmatched":
|
| 112 |
},
|
| 113 |
-
"reevaluated_date": "2026-01-
|
| 114 |
"source_predictions_file": "Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_predictions_20260110_132054.jsonl",
|
| 115 |
"result_file_path": "Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_results_20260110_132054.json"
|
| 116 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-10T13:20:54.125677+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 58.77551020408164,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
54.35541948342135,
|
| 21 |
+
63.195600924741925
|
| 22 |
],
|
| 23 |
+
"anls": 58.50093245097659,
|
| 24 |
+
"page_f1": 54.72095238095238,
|
| 25 |
+
"doc_f1": 78.60761904761905,
|
| 26 |
+
"kuiper": 38.259999999999735
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 65.97707576181158,
|
| 30 |
+
"anls": 65.20641198351612,
|
| 31 |
+
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 27.939747327502424,
|
| 35 |
+
"anls": 30.547652666604648,
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
+
"semantic": 58.023209283713484,
|
| 40 |
"anls": 56.55143191196322,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
|
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
"semantic": 80.78231292517005,
|
| 56 |
+
"anls": 79.22911914847398,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
+
"semantic": 48.802129547471154,
|
| 61 |
+
"anls": 48.51967409520347,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
|
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
+
"semantic": 51.02040816326531,
|
| 71 |
+
"anls": 56.616157972458616,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
+
"semantic": 64.70881035340967,
|
| 76 |
+
"anls": 62.33386966416003,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
|
|
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
| 85 |
+
"semantic": 42.85714285714285,
|
| 86 |
"anls": 54.65844817149165,
|
| 87 |
"n": 25
|
| 88 |
},
|
|
|
|
| 91 |
"anls": 73.59601449275362,
|
| 92 |
"n": 24
|
| 93 |
},
|
| 94 |
+
"Other": {
|
| 95 |
+
"semantic": 100.0,
|
| 96 |
+
"anls": 100.0,
|
| 97 |
+
"n": 1
|
| 98 |
+
},
|
| 99 |
"Reference": {
|
| 100 |
+
"semantic": 65.73783359497645,
|
| 101 |
+
"anls": 68.63568984240118,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 57.82312925170068,
|
| 106 |
"anls": 56.44955119487462,
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
+
"semantic": 62.111801242236034,
|
| 111 |
"anls": 51.60498619336015,
|
| 112 |
"n": 23
|
| 113 |
}
|
| 114 |
},
|
| 115 |
+
"n_evaluated": 500,
|
| 116 |
+
"n_unmatched": 1766
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T21:57:09.736897+00:00",
|
| 119 |
"source_predictions_file": "Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_predictions_20260110_132054.jsonl",
|
| 120 |
"result_file_path": "Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_results_20260110_132054.json"
|
| 121 |
}
|
eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_results_20260110_132358.json
CHANGED
|
@@ -15,29 +15,29 @@
|
|
| 15 |
"submission_date": "2026-01-10T13:23:58.123387+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic":
|
| 19 |
"semantic_ci": [
|
| 20 |
-
42.
|
| 21 |
-
51.
|
| 22 |
],
|
| 23 |
-
"anls": 45.
|
| 24 |
-
"page_f1": 47.
|
| 25 |
-
"doc_f1": 69.
|
| 26 |
-
"kuiper":
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic": 54.
|
| 30 |
-
"anls":
|
| 31 |
-
"n":
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
-
"semantic":
|
| 35 |
-
"anls": 18.
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
-
"semantic":
|
| 40 |
-
"anls":
|
| 41 |
"n": 51
|
| 42 |
},
|
| 43 |
"by_domain": {
|
|
@@ -52,13 +52,13 @@
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
-
"semantic":
|
| 56 |
"anls": 54.598842018196855,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
-
"semantic":
|
| 61 |
-
"anls":
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
@@ -68,12 +68,12 @@
|
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
"semantic": 44.507164567954845,
|
| 71 |
-
"anls": 44.
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
-
"semantic":
|
| 76 |
-
"anls":
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
|
@@ -87,18 +87,23 @@
|
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
| 90 |
-
"semantic":
|
| 91 |
"anls": 59.60305559882987,
|
| 92 |
"n": 24
|
| 93 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
"Reference": {
|
| 95 |
-
"semantic":
|
| 96 |
-
"anls": 50.
|
| 97 |
"n": 52
|
| 98 |
},
|
| 99 |
"Reports": {
|
| 100 |
-
"semantic":
|
| 101 |
-
"anls":
|
| 102 |
"n": 75
|
| 103 |
},
|
| 104 |
"Technical": {
|
|
@@ -107,10 +112,10 @@
|
|
| 107 |
"n": 23
|
| 108 |
}
|
| 109 |
},
|
| 110 |
-
"n_evaluated":
|
| 111 |
-
"n_unmatched":
|
| 112 |
},
|
| 113 |
-
"reevaluated_date": "2026-01-
|
| 114 |
"source_predictions_file": "Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_predictions_20260110_132358.jsonl",
|
| 115 |
"result_file_path": "Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_results_20260110_132358.json"
|
| 116 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-10T13:23:58.123387+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 47.3469387755102,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
42.886491622183534,
|
| 21 |
+
51.807385928836865
|
| 22 |
],
|
| 23 |
+
"anls": 45.99676623639152,
|
| 24 |
+
"page_f1": 47.590158730158734,
|
| 25 |
+
"doc_f1": 69.43333333333334,
|
| 26 |
+
"kuiper": 50.216000000000136
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 54.794520547945204,
|
| 30 |
+
"anls": 52.49777090665106,
|
| 31 |
+
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 17.006802721088444,
|
| 35 |
+
"anls": 18.782788634002245,
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
+
"semantic": 44.01760704281713,
|
| 40 |
+
"anls": 44.29299003944969,
|
| 41 |
"n": 51
|
| 42 |
},
|
| 43 |
"by_domain": {
|
|
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
+
"semantic": 63.775510204081634,
|
| 56 |
"anls": 54.598842018196855,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
+
"semantic": 31.055900621118017,
|
| 61 |
+
"anls": 31.32939863081144,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
|
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
"semantic": 44.507164567954845,
|
| 71 |
+
"anls": 44.44710508540295,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
+
"semantic": 49.77600796416127,
|
| 76 |
+
"anls": 52.59959228347652,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
|
|
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
| 90 |
+
"semantic": 61.64965986394556,
|
| 91 |
"anls": 59.60305559882987,
|
| 92 |
"n": 24
|
| 93 |
},
|
| 94 |
+
"Other": {
|
| 95 |
+
"semantic": 100.0,
|
| 96 |
+
"anls": 100.0,
|
| 97 |
+
"n": 1
|
| 98 |
+
},
|
| 99 |
"Reference": {
|
| 100 |
+
"semantic": 44.15227629513344,
|
| 101 |
+
"anls": 50.50485651369861,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 55.78231292517006,
|
| 106 |
+
"anls": 49.929936422448826,
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
|
|
|
| 112 |
"n": 23
|
| 113 |
}
|
| 114 |
},
|
| 115 |
+
"n_evaluated": 500,
|
| 116 |
+
"n_unmatched": 1766
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T21:58:16.385696+00:00",
|
| 119 |
"source_predictions_file": "Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_predictions_20260110_132358.jsonl",
|
| 120 |
"result_file_path": "Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_results_20260110_132358.json"
|
| 121 |
}
|
eval/reevaluated_results/Anthropic/Claude_Haiku_4.5_(2025-10-01)_results_20260109_130319.json
CHANGED
|
@@ -15,24 +15,24 @@
|
|
| 15 |
"submission_date": "2026-01-09T13:03:19.649656+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic":
|
| 19 |
"semantic_ci": [
|
| 20 |
-
63.
|
| 21 |
-
|
| 22 |
],
|
| 23 |
-
"anls": 61.
|
| 24 |
"page_f1": 72.02476190476192,
|
| 25 |
"doc_f1": 88.24761904761905,
|
| 26 |
-
"kuiper":
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic":
|
| 30 |
-
"anls": 66.
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
-
"semantic":
|
| 35 |
-
"anls":
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
|
@@ -42,7 +42,7 @@
|
|
| 42 |
},
|
| 43 |
"by_domain": {
|
| 44 |
"Cases/Logs": {
|
| 45 |
-
"semantic":
|
| 46 |
"anls": 63.92691050779287,
|
| 47 |
"n": 15
|
| 48 |
},
|
|
@@ -52,23 +52,23 @@
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
-
"semantic":
|
| 56 |
"anls": 72.62325637325637,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
-
"semantic":
|
| 61 |
-
"anls": 54.
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
| 65 |
-
"semantic":
|
| 66 |
"anls": 68.77016129032259,
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
-
"semantic":
|
| 71 |
-
"anls":
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
|
@@ -82,7 +82,7 @@
|
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
| 85 |
-
"semantic":
|
| 86 |
"anls": 41.69842237151431,
|
| 87 |
"n": 25
|
| 88 |
},
|
|
@@ -97,17 +97,17 @@
|
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
-
"semantic":
|
| 101 |
-
"anls": 64.
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
-
"semantic":
|
| 106 |
-
"anls": 65.
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
-
"semantic":
|
| 111 |
"anls": 64.75817505570946,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
-
"reevaluated_date": "2026-01-
|
| 119 |
"source_predictions_file": "Anthropic/Claude_Haiku_4.5_(2025-10-01)_predictions_20260109_130319.jsonl",
|
| 120 |
"result_file_path": "Anthropic/Claude_Haiku_4.5_(2025-10-01)_results_20260109_130319.json"
|
| 121 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-09T13:03:19.649656+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 68.16326530612247,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
63.95120785444595,
|
| 21 |
+
72.37532275779898
|
| 22 |
],
|
| 23 |
+
"anls": 61.45994021171436,
|
| 24 |
"page_f1": 72.02476190476192,
|
| 25 |
"doc_f1": 88.24761904761905,
|
| 26 |
+
"kuiper": 50.710843373494285
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 73.24573665082471,
|
| 30 |
+
"anls": 66.15560605815631,
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 47.9834791059281,
|
| 35 |
+
"anls": 41.74616449825649,
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
|
|
|
| 42 |
},
|
| 43 |
"by_domain": {
|
| 44 |
"Cases/Logs": {
|
| 45 |
+
"semantic": 68.02721088435374,
|
| 46 |
"anls": 63.92691050779287,
|
| 47 |
"n": 15
|
| 48 |
},
|
|
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
+
"semantic": 89.28571428571429,
|
| 56 |
"anls": 72.62325637325637,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
+
"semantic": 59.89352262644188,
|
| 61 |
+
"anls": 54.1464804322174,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
| 65 |
+
"semantic": 76.53061224489795,
|
| 66 |
"anls": 68.77016129032259,
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
+
"semantic": 62.961354754667845,
|
| 71 |
+
"anls": 61.67613431405342,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
|
|
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
| 85 |
+
"semantic": 51.02040816326531,
|
| 86 |
"anls": 41.69842237151431,
|
| 87 |
"n": 25
|
| 88 |
},
|
|
|
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
+
"semantic": 64.75667189952904,
|
| 101 |
+
"anls": 64.40744082414207,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 80.95238095238096,
|
| 106 |
+
"anls": 65.21528928243777,
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
+
"semantic": 73.20319432120675,
|
| 111 |
"anls": 64.75817505570946,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T21:59:23.189394+00:00",
|
| 119 |
"source_predictions_file": "Anthropic/Claude_Haiku_4.5_(2025-10-01)_predictions_20260109_130319.jsonl",
|
| 120 |
"result_file_path": "Anthropic/Claude_Haiku_4.5_(2025-10-01)_results_20260109_130319.json"
|
| 121 |
}
|
eval/reevaluated_results/Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_125816.json
CHANGED
|
@@ -15,24 +15,24 @@
|
|
| 15 |
"submission_date": "2026-01-09T12:58:16.611348+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic":
|
| 19 |
"semantic_ci": [
|
| 20 |
-
|
| 21 |
-
|
| 22 |
],
|
| 23 |
-
"anls":
|
| 24 |
"page_f1": 79.12333333333333,
|
| 25 |
"doc_f1": 92.98636363636363,
|
| 26 |
-
"kuiper": 35.
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic":
|
| 30 |
-
"anls":
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
-
"semantic":
|
| 35 |
-
"anls":
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
|
@@ -42,8 +42,8 @@
|
|
| 42 |
},
|
| 43 |
"by_domain": {
|
| 44 |
"Cases/Logs": {
|
| 45 |
-
"semantic":
|
| 46 |
-
"anls":
|
| 47 |
"n": 15
|
| 48 |
},
|
| 49 |
"Education": {
|
|
@@ -57,27 +57,27 @@
|
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
-
"semantic":
|
| 61 |
-
"anls":
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
| 65 |
-
"semantic":
|
| 66 |
"anls": 76.26728110599078,
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
-
"semantic":
|
| 71 |
-
"anls":
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
"semantic": 77.15281234444997,
|
| 76 |
-
"anls":
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
-
"semantic":
|
| 81 |
"anls": 72.74221043114129,
|
| 82 |
"n": 43
|
| 83 |
},
|
|
@@ -93,21 +93,21 @@
|
|
| 93 |
},
|
| 94 |
"Other": {
|
| 95 |
"semantic": 0.0,
|
| 96 |
-
"anls":
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
-
"semantic":
|
| 101 |
-
"anls": 72.
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
-
"semantic":
|
| 106 |
-
"anls":
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
-
"semantic":
|
| 111 |
"anls": 60.23577215564363,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
-
"reevaluated_date": "2026-01-
|
| 119 |
"source_predictions_file": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_predictions_20260109_125816.jsonl",
|
| 120 |
"result_file_path": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_125816.json"
|
| 121 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-09T12:58:16.611348+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 80.61224489795919,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
76.96923621662829,
|
| 21 |
+
84.25525357929008
|
| 22 |
],
|
| 23 |
+
"anls": 72.85884587946542,
|
| 24 |
"page_f1": 79.12333333333333,
|
| 25 |
"doc_f1": 92.98636363636363,
|
| 26 |
+
"kuiper": 35.0526315789474
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 83.58960022365112,
|
| 30 |
+
"anls": 75.8229313455021,
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 66.81243926141885,
|
| 35 |
+
"anls": 61.03274369327326,
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
|
|
|
| 42 |
},
|
| 43 |
"by_domain": {
|
| 44 |
"Cases/Logs": {
|
| 45 |
+
"semantic": 85.0340136054422,
|
| 46 |
+
"anls": 76.17758784425452,
|
| 47 |
"n": 15
|
| 48 |
},
|
| 49 |
"Education": {
|
|
|
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
+
"semantic": 73.75776397515527,
|
| 61 |
+
"anls": 69.52764532211991,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
| 65 |
+
"semantic": 76.53061224489795,
|
| 66 |
"anls": 76.26728110599078,
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
+
"semantic": 86.84324793747285,
|
| 71 |
+
"anls": 76.0805203770741,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
"semantic": 77.15281234444997,
|
| 76 |
+
"anls": 74.07111615750408,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
+
"semantic": 91.36212624584718,
|
| 81 |
"anls": 72.74221043114129,
|
| 82 |
"n": 43
|
| 83 |
},
|
|
|
|
| 93 |
},
|
| 94 |
"Other": {
|
| 95 |
"semantic": 0.0,
|
| 96 |
+
"anls": 76.92307692307692,
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
+
"semantic": 72.60596546310833,
|
| 101 |
+
"anls": 72.8846701655712,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 87.07482993197281,
|
| 106 |
+
"anls": 73.90230357912102,
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
+
"semantic": 75.42147293700089,
|
| 111 |
"anls": 60.23577215564363,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T22:00:12.094993+00:00",
|
| 119 |
"source_predictions_file": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_predictions_20260109_125816.jsonl",
|
| 120 |
"result_file_path": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_125816.json"
|
| 121 |
}
|
eval/reevaluated_results/Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_182559.json
CHANGED
|
@@ -15,28 +15,28 @@
|
|
| 15 |
"submission_date": "2026-01-09T18:25:59.636344+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic": 58.
|
| 19 |
"semantic_ci": [
|
| 20 |
-
|
| 21 |
-
62.
|
| 22 |
],
|
| 23 |
-
"anls": 55.
|
| 24 |
"page_f1": 60.9663492063492,
|
| 25 |
"doc_f1": 78.82920634920634,
|
| 26 |
-
"kuiper":
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic":
|
| 30 |
-
"anls": 60.
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
-
"semantic": 30.
|
| 35 |
-
"anls": 33.
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
-
"semantic":
|
| 40 |
"anls": 57.1545284780579,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
@@ -52,13 +52,13 @@
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
-
"semantic":
|
| 56 |
-
"anls":
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
-
"semantic": 42.
|
| 61 |
-
"anls":
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
@@ -68,26 +68,26 @@
|
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
"semantic": 66.21797655232307,
|
| 71 |
-
"anls": 67.
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
-
"semantic":
|
| 76 |
"anls": 60.95035529628296,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
-
"semantic":
|
| 81 |
"anls": 51.45105745077384,
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
| 85 |
-
"semantic":
|
| 86 |
"anls": 54.40739778239778,
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
| 90 |
-
"semantic":
|
| 91 |
"anls": 73.82172131147541,
|
| 92 |
"n": 24
|
| 93 |
},
|
|
@@ -97,17 +97,17 @@
|
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
-
"semantic":
|
| 101 |
-
"anls": 64.
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
-
"semantic":
|
| 106 |
"anls": 45.47473759975617,
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
-
"semantic":
|
| 111 |
"anls": 35.96181299748582,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
-
"reevaluated_date": "2026-01-
|
| 119 |
"source_predictions_file": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_predictions_20260109_182559.jsonl",
|
| 120 |
"result_file_path": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_182559.json"
|
| 121 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-09T18:25:59.636344+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 58.46938775510204,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
54.04525386645418,
|
| 21 |
+
62.89352164374991
|
| 22 |
],
|
| 23 |
+
"anls": 55.989429815086645,
|
| 24 |
"page_f1": 60.9663492063492,
|
| 25 |
"doc_f1": 78.82920634920634,
|
| 26 |
+
"kuiper": 46.50600000000012
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 65.27816606094493,
|
| 30 |
+
"anls": 60.972189892385046,
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 30.36929057337221,
|
| 35 |
+
"anls": 33.63076957668846,
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
+
"semantic": 56.022408963585434,
|
| 40 |
"anls": 57.1545284780579,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
+
"semantic": 85.0340136054422,
|
| 56 |
+
"anls": 78.93976897689768,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
+
"semantic": 42.14729370008874,
|
| 61 |
+
"anls": 43.10328860396467,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
|
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
"semantic": 66.21797655232307,
|
| 71 |
+
"anls": 67.7106280521734,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
+
"semantic": 60.97560975609757,
|
| 76 |
"anls": 60.95035529628296,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
+
"semantic": 53.393450403417184,
|
| 81 |
"anls": 51.45105745077384,
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
| 85 |
+
"semantic": 53.06122448979592,
|
| 86 |
"anls": 54.40739778239778,
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
| 90 |
+
"semantic": 78.65646258503402,
|
| 91 |
"anls": 73.82172131147541,
|
| 92 |
"n": 24
|
| 93 |
},
|
|
|
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
+
"semantic": 62.79434850863422,
|
| 101 |
+
"anls": 64.52616096971524,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 55.102040816326536,
|
| 106 |
"anls": 45.47473759975617,
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
+
"semantic": 48.802129547471154,
|
| 111 |
"anls": 35.96181299748582,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T22:01:09.960296+00:00",
|
| 119 |
"source_predictions_file": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_predictions_20260109_182559.jsonl",
|
| 120 |
"result_file_path": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_182559.json"
|
| 121 |
}
|
eval/reevaluated_results/Google/Gemini_2.5_Flash_with_File_Search_results_20260103_221253.json
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Gemini 2.5 Flash with File Search",
|
| 3 |
+
"organization": "Google",
|
| 4 |
+
"description": "Managed, single-shot retrieval mechanism.",
|
| 5 |
+
"link": "https://ai.google.dev/gemini-api/docs/file-search",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Conventional RAG",
|
| 8 |
+
"Semantic Search Tool"
|
| 9 |
+
],
|
| 10 |
+
"submitted_by": "Borchmann",
|
| 11 |
+
"metadata": {
|
| 12 |
+
"model_type": "api"
|
| 13 |
+
},
|
| 14 |
+
"submission_date": "2026-01-03T22:12:53.645813+00:00",
|
| 15 |
+
"results": {
|
| 16 |
+
"overall": {
|
| 17 |
+
"semantic": 71.83673469387755,
|
| 18 |
+
"semantic_ci": [
|
| 19 |
+
67.75383041519983,
|
| 20 |
+
75.91963897255526
|
| 21 |
+
],
|
| 22 |
+
"anls": 56.38605375030021,
|
| 23 |
+
"page_f1": 52.15333333333333,
|
| 24 |
+
"doc_f1": 80.91445887445887,
|
| 25 |
+
"kuiper": 14.495999999999947
|
| 26 |
+
},
|
| 27 |
+
"single_evidence": {
|
| 28 |
+
"semantic": 74.0844282918647,
|
| 29 |
+
"anls": 59.70060588908858,
|
| 30 |
+
"n": 365
|
| 31 |
+
},
|
| 32 |
+
"multi_evidence_same_doc": {
|
| 33 |
+
"semantic": 61.34596695821186,
|
| 34 |
+
"anls": 44.2307193586438,
|
| 35 |
+
"n": 84
|
| 36 |
+
},
|
| 37 |
+
"multi_evidence_multi_doc": {
|
| 38 |
+
"semantic": 73.02921168467388,
|
| 39 |
+
"anls": 52.68480979424892,
|
| 40 |
+
"n": 51
|
| 41 |
+
},
|
| 42 |
+
"by_domain": {
|
| 43 |
+
"Cases/Logs": {
|
| 44 |
+
"semantic": 78.2312925170068,
|
| 45 |
+
"anls": 76.85185185185186,
|
| 46 |
+
"n": 15
|
| 47 |
+
},
|
| 48 |
+
"Education": {
|
| 49 |
+
"semantic": 83.4879406307978,
|
| 50 |
+
"anls": 59.32605273514364,
|
| 51 |
+
"n": 22
|
| 52 |
+
},
|
| 53 |
+
"Events": {
|
| 54 |
+
"semantic": 87.15986394557822,
|
| 55 |
+
"anls": 67.94733044733044,
|
| 56 |
+
"n": 24
|
| 57 |
+
},
|
| 58 |
+
"Financial": {
|
| 59 |
+
"semantic": 64.88464951197871,
|
| 60 |
+
"anls": 53.16591091793465,
|
| 61 |
+
"n": 92
|
| 62 |
+
},
|
| 63 |
+
"Financial/Tax": {
|
| 64 |
+
"semantic": 60.58673469387755,
|
| 65 |
+
"anls": 45.19230769230769,
|
| 66 |
+
"n": 16
|
| 67 |
+
},
|
| 68 |
+
"Government/Regulatory": {
|
| 69 |
+
"semantic": 68.38905775075989,
|
| 70 |
+
"anls": 52.69755357661292,
|
| 71 |
+
"n": 47
|
| 72 |
+
},
|
| 73 |
+
"HR/Employment": {
|
| 74 |
+
"semantic": 70.93081134892981,
|
| 75 |
+
"anls": 56.2715906011529,
|
| 76 |
+
"n": 41
|
| 77 |
+
},
|
| 78 |
+
"Legal": {
|
| 79 |
+
"semantic": 85.4295206454675,
|
| 80 |
+
"anls": 63.338167000957704,
|
| 81 |
+
"n": 43
|
| 82 |
+
},
|
| 83 |
+
"Media/Publishing": {
|
| 84 |
+
"semantic": 65.30612244897961,
|
| 85 |
+
"anls": 61.1117685382045,
|
| 86 |
+
"n": 25
|
| 87 |
+
},
|
| 88 |
+
"Misc": {
|
| 89 |
+
"semantic": 70.1530612244898,
|
| 90 |
+
"anls": 60.28005464480874,
|
| 91 |
+
"n": 24
|
| 92 |
+
},
|
| 93 |
+
"Other": {
|
| 94 |
+
"semantic": 51.02040816326531,
|
| 95 |
+
"anls": 59.09090909090908,
|
| 96 |
+
"n": 1
|
| 97 |
+
},
|
| 98 |
+
"Reference": {
|
| 99 |
+
"semantic": 72.60596546310833,
|
| 100 |
+
"anls": 56.51193666615843,
|
| 101 |
+
"n": 52
|
| 102 |
+
},
|
| 103 |
+
"Reports": {
|
| 104 |
+
"semantic": 69.38775510204084,
|
| 105 |
+
"anls": 50.21750576702014,
|
| 106 |
+
"n": 75
|
| 107 |
+
},
|
| 108 |
+
"Technical": {
|
| 109 |
+
"semantic": 75.42147293700089,
|
| 110 |
+
"anls": 54.08686051995165,
|
| 111 |
+
"n": 23
|
| 112 |
+
}
|
| 113 |
+
},
|
| 114 |
+
"n_evaluated": 500,
|
| 115 |
+
"n_unmatched": 0
|
| 116 |
+
},
|
| 117 |
+
"reevaluated_date": "2026-01-22T22:02:26.923804+00:00",
|
| 118 |
+
"source_predictions_file": "Google/Gemini_2.5_Flash_with_File_Search_predictions_20260103_221253.jsonl",
|
| 119 |
+
"result_file_path": "Google/Gemini_2.5_Flash_with_File_Search_results_20260103_221253.json"
|
| 120 |
+
}
|
eval/reevaluated_results/Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_183030.json
CHANGED
|
@@ -15,28 +15,28 @@
|
|
| 15 |
"submission_date": "2026-01-09T18:30:30.608183+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic": 60.
|
| 19 |
"semantic_ci": [
|
| 20 |
-
55.
|
| 21 |
-
64.
|
| 22 |
],
|
| 23 |
-
"anls":
|
| 24 |
"page_f1": 60.299220779220775,
|
| 25 |
"doc_f1": 74.23636363636363,
|
| 26 |
-
"kuiper":
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic":
|
| 30 |
-
"anls": 61.
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
-
"semantic":
|
| 35 |
-
"anls": 36.
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
-
"semantic":
|
| 40 |
"anls": 51.343142438142856,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
@@ -52,13 +52,13 @@
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
-
"semantic":
|
| 56 |
-
"anls":
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
-
"semantic":
|
| 61 |
-
"anls":
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
@@ -67,23 +67,23 @@
|
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
-
"semantic":
|
| 71 |
-
"anls":
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
-
"semantic":
|
| 76 |
-
"anls":
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
-
"semantic":
|
| 81 |
"anls": 60.44220952048519,
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
| 85 |
-
"semantic":
|
| 86 |
-
"anls":
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
|
@@ -93,21 +93,21 @@
|
|
| 93 |
},
|
| 94 |
"Other": {
|
| 95 |
"semantic": 0.0,
|
| 96 |
-
"anls":
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
-
"semantic":
|
| 101 |
-
"anls":
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
-
"semantic":
|
| 106 |
-
"anls": 54.
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
-
"semantic":
|
| 111 |
"anls": 40.50127359810298,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
-
"reevaluated_date": "2026-01-
|
| 119 |
"source_predictions_file": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_predictions_20260109_183030.jsonl",
|
| 120 |
"result_file_path": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_183030.json"
|
| 121 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-09T18:30:30.608183+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 60.10204081632653,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
55.70140355733895,
|
| 21 |
+
64.50267807531411
|
| 22 |
],
|
| 23 |
+
"anls": 56.41686335308852,
|
| 24 |
"page_f1": 60.299220779220775,
|
| 25 |
"doc_f1": 74.23636363636363,
|
| 26 |
+
"kuiper": 38.901999999999795
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 67.0953312831982,
|
| 30 |
+
"anls": 61.720866213550785,
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 34.62099125364433,
|
| 35 |
+
"anls": 36.450181479201696,
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
+
"semantic": 52.02080832332933,
|
| 40 |
"anls": 51.343142438142856,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
+
"semantic": 72.27891156462587,
|
| 56 |
+
"anls": 64.7615265558814,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
+
"semantic": 50.46583850931676,
|
| 61 |
+
"anls": 49.206250224745276,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
|
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
+
"semantic": 65.13243595310465,
|
| 71 |
+
"anls": 62.58648616221405,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
+
"semantic": 58.4868093578895,
|
| 76 |
+
"anls": 49.471796951296525,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
+
"semantic": 66.44518272425249,
|
| 81 |
"anls": 60.44220952048519,
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
| 85 |
+
"semantic": 44.897959183673464,
|
| 86 |
+
"anls": 53.44125874125874,
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
|
|
|
| 93 |
},
|
| 94 |
"Other": {
|
| 95 |
"semantic": 0.0,
|
| 96 |
+
"anls": 76.92307692307692,
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
+
"semantic": 66.71899529042385,
|
| 101 |
+
"anls": 64.18194203992306,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 61.904761904761905,
|
| 106 |
+
"anls": 54.81691417642573,
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
+
"semantic": 44.365572315882865,
|
| 111 |
"anls": 40.50127359810298,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T22:03:24.876553+00:00",
|
| 119 |
"source_predictions_file": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_predictions_20260109_183030.jsonl",
|
| 120 |
"result_file_path": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_183030.json"
|
| 121 |
}
|
eval/reevaluated_results/Google/Gemini_2.5_Pro_with_File_Search_results_20260103_221943.json
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Gemini 2.5 Pro with File Search",
|
| 3 |
+
"organization": "Google",
|
| 4 |
+
"description": "Managed, single-shot retrieval mechanism.",
|
| 5 |
+
"link": "https://ai.google.dev/gemini-api/docs/file-search",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Conventional RAG",
|
| 8 |
+
"Semantic Search Tool"
|
| 9 |
+
],
|
| 10 |
+
"submitted_by": "Borchmann",
|
| 11 |
+
"metadata": {
|
| 12 |
+
"model_type": "api"
|
| 13 |
+
},
|
| 14 |
+
"submission_date": "2026-01-03T22:19:43.085381+00:00",
|
| 15 |
+
"results": {
|
| 16 |
+
"overall": {
|
| 17 |
+
"semantic": 73.06122448979592,
|
| 18 |
+
"semantic_ci": [
|
| 19 |
+
69.02799722537132,
|
| 20 |
+
77.0944517542205
|
| 21 |
+
],
|
| 22 |
+
"anls": 67.06445662551177,
|
| 23 |
+
"page_f1": 60.936190476190475,
|
| 24 |
+
"doc_f1": 87.83333333333334,
|
| 25 |
+
"kuiper": 15.25
|
| 26 |
+
},
|
| 27 |
+
"single_evidence": {
|
| 28 |
+
"semantic": 75.06290187307802,
|
| 29 |
+
"anls": 69.51189266445841,
|
| 30 |
+
"n": 365
|
| 31 |
+
},
|
| 32 |
+
"multi_evidence_same_doc": {
|
| 33 |
+
"semantic": 70.45675413022352,
|
| 34 |
+
"anls": 62.81319563063059,
|
| 35 |
+
"n": 84
|
| 36 |
+
},
|
| 37 |
+
"multi_evidence_multi_doc": {
|
| 38 |
+
"semantic": 63.025210084033624,
|
| 39 |
+
"anls": 56.5505697501097,
|
| 40 |
+
"n": 51
|
| 41 |
+
},
|
| 42 |
+
"by_domain": {
|
| 43 |
+
"Cases/Logs": {
|
| 44 |
+
"semantic": 74.82993197278913,
|
| 45 |
+
"anls": 72.85185185185185,
|
| 46 |
+
"n": 15
|
| 47 |
+
},
|
| 48 |
+
"Education": {
|
| 49 |
+
"semantic": 83.4879406307978,
|
| 50 |
+
"anls": 66.82605273514365,
|
| 51 |
+
"n": 22
|
| 52 |
+
},
|
| 53 |
+
"Events": {
|
| 54 |
+
"semantic": 80.78231292517005,
|
| 55 |
+
"anls": 63.51613609636826,
|
| 56 |
+
"n": 24
|
| 57 |
+
},
|
| 58 |
+
"Financial": {
|
| 59 |
+
"semantic": 73.75776397515527,
|
| 60 |
+
"anls": 71.81492329679233,
|
| 61 |
+
"n": 92
|
| 62 |
+
},
|
| 63 |
+
"Financial/Tax": {
|
| 64 |
+
"semantic": 44.642857142857146,
|
| 65 |
+
"anls": 48.56036324786325,
|
| 66 |
+
"n": 16
|
| 67 |
+
},
|
| 68 |
+
"Government/Regulatory": {
|
| 69 |
+
"semantic": 82.50108554059923,
|
| 70 |
+
"anls": 71.42658144393866,
|
| 71 |
+
"n": 47
|
| 72 |
+
},
|
| 73 |
+
"HR/Employment": {
|
| 74 |
+
"semantic": 74.66401194624191,
|
| 75 |
+
"anls": 67.74502588179764,
|
| 76 |
+
"n": 41
|
| 77 |
+
},
|
| 78 |
+
"Legal": {
|
| 79 |
+
"semantic": 77.12387280493593,
|
| 80 |
+
"anls": 68.13676633444075,
|
| 81 |
+
"n": 43
|
| 82 |
+
},
|
| 83 |
+
"Media/Publishing": {
|
| 84 |
+
"semantic": 69.38775510204084,
|
| 85 |
+
"anls": 72.05791173717081,
|
| 86 |
+
"n": 25
|
| 87 |
+
},
|
| 88 |
+
"Misc": {
|
| 89 |
+
"semantic": 72.27891156462587,
|
| 90 |
+
"anls": 69.65505464480874,
|
| 91 |
+
"n": 24
|
| 92 |
+
},
|
| 93 |
+
"Other": {
|
| 94 |
+
"semantic": 51.02040816326531,
|
| 95 |
+
"anls": 59.09090909090908,
|
| 96 |
+
"n": 1
|
| 97 |
+
},
|
| 98 |
+
"Reference": {
|
| 99 |
+
"semantic": 71.62480376766091,
|
| 100 |
+
"anls": 72.50339016950552,
|
| 101 |
+
"n": 52
|
| 102 |
+
},
|
| 103 |
+
"Reports": {
|
| 104 |
+
"semantic": 67.34693877551022,
|
| 105 |
+
"anls": 59.42273394165809,
|
| 106 |
+
"n": 75
|
| 107 |
+
},
|
| 108 |
+
"Technical": {
|
| 109 |
+
"semantic": 68.76663708961844,
|
| 110 |
+
"anls": 53.79711579945216,
|
| 111 |
+
"n": 23
|
| 112 |
+
}
|
| 113 |
+
},
|
| 114 |
+
"n_evaluated": 500,
|
| 115 |
+
"n_unmatched": 0
|
| 116 |
+
},
|
| 117 |
+
"reevaluated_date": "2026-01-22T22:04:21.890237+00:00",
|
| 118 |
+
"source_predictions_file": "Google/Gemini_2.5_Pro_with_File_Search_predictions_20260103_221943.jsonl",
|
| 119 |
+
"result_file_path": "Google/Gemini_2.5_Pro_with_File_Search_results_20260103_221943.json"
|
| 120 |
+
}
|
eval/reevaluated_results/Google/Gemini_3.0_Pro_(Preview)_with_File_Search_results_20260104_120431.json
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Gemini 3 Pro (Preview) with File Search",
|
| 3 |
+
"organization": "Google",
|
| 4 |
+
"description": "Managed, single-shot retrieval mechanism.",
|
| 5 |
+
"link": "https://ai.google.dev/gemini-api/docs/file-search",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Conventional RAG",
|
| 8 |
+
"Semantic Search Tool"
|
| 9 |
+
],
|
| 10 |
+
"submitted_by": "Borchmann",
|
| 11 |
+
"metadata": {
|
| 12 |
+
"model_type": "api"
|
| 13 |
+
},
|
| 14 |
+
"submission_date": "2026-01-04T12:04:31.393913+00:00",
|
| 15 |
+
"results": {
|
| 16 |
+
"overall": {
|
| 17 |
+
"semantic": 78.57142857142857,
|
| 18 |
+
"semantic_ci": [
|
| 19 |
+
74.80745735419877,
|
| 20 |
+
82.33539978865836
|
| 21 |
+
],
|
| 22 |
+
"anls": 68.92843881871933,
|
| 23 |
+
"page_f1": 70.0995238095238,
|
| 24 |
+
"doc_f1": 94.17333333333333,
|
| 25 |
+
"kuiper": 12.06600000000001
|
| 26 |
+
},
|
| 27 |
+
"single_evidence": {
|
| 28 |
+
"semantic": 80.09505171931788,
|
| 29 |
+
"anls": 71.30190923977649,
|
| 30 |
+
"n": 365
|
| 31 |
+
},
|
| 32 |
+
"multi_evidence_same_doc": {
|
| 33 |
+
"semantic": 74.1010689990282,
|
| 34 |
+
"anls": 67.2325874522391,
|
| 35 |
+
"n": 84
|
| 36 |
+
},
|
| 37 |
+
"multi_evidence_multi_doc": {
|
| 38 |
+
"semantic": 75.03001200480193,
|
| 39 |
+
"anls": 54.73500374221878,
|
| 40 |
+
"n": 51
|
| 41 |
+
},
|
| 42 |
+
"by_domain": {
|
| 43 |
+
"Cases/Logs": {
|
| 44 |
+
"semantic": 85.0340136054422,
|
| 45 |
+
"anls": 74.51761001296605,
|
| 46 |
+
"n": 15
|
| 47 |
+
},
|
| 48 |
+
"Education": {
|
| 49 |
+
"semantic": 90.44526901669758,
|
| 50 |
+
"anls": 62.23980564889655,
|
| 51 |
+
"n": 22
|
| 52 |
+
},
|
| 53 |
+
"Events": {
|
| 54 |
+
"semantic": 87.15986394557822,
|
| 55 |
+
"anls": 63.97470042977855,
|
| 56 |
+
"n": 24
|
| 57 |
+
},
|
| 58 |
+
"Financial": {
|
| 59 |
+
"semantic": 77.08518189884651,
|
| 60 |
+
"anls": 71.47919047104104,
|
| 61 |
+
"n": 92
|
| 62 |
+
},
|
| 63 |
+
"Financial/Tax": {
|
| 64 |
+
"semantic": 76.53061224489795,
|
| 65 |
+
"anls": 75.04133597883597,
|
| 66 |
+
"n": 16
|
| 67 |
+
},
|
| 68 |
+
"Government/Regulatory": {
|
| 69 |
+
"semantic": 80.33000434216238,
|
| 70 |
+
"anls": 67.12054458180654,
|
| 71 |
+
"n": 47
|
| 72 |
+
},
|
| 73 |
+
"HR/Employment": {
|
| 74 |
+
"semantic": 74.66401194624191,
|
| 75 |
+
"anls": 70.8778954315538,
|
| 76 |
+
"n": 41
|
| 77 |
+
},
|
| 78 |
+
"Legal": {
|
| 79 |
+
"semantic": 80.68343616516374,
|
| 80 |
+
"anls": 64.51676230745998,
|
| 81 |
+
"n": 43
|
| 82 |
+
},
|
| 83 |
+
"Media/Publishing": {
|
| 84 |
+
"semantic": 65.30612244897961,
|
| 85 |
+
"anls": 71.26169358330307,
|
| 86 |
+
"n": 25
|
| 87 |
+
},
|
| 88 |
+
"Misc": {
|
| 89 |
+
"semantic": 85.0340136054422,
|
| 90 |
+
"anls": 71.74962035802997,
|
| 91 |
+
"n": 24
|
| 92 |
+
},
|
| 93 |
+
"Other": {
|
| 94 |
+
"semantic": 0.0,
|
| 95 |
+
"anls": 29.54545454545454,
|
| 96 |
+
"n": 1
|
| 97 |
+
},
|
| 98 |
+
"Reference": {
|
| 99 |
+
"semantic": 80.45525902668759,
|
| 100 |
+
"anls": 76.46179993780217,
|
| 101 |
+
"n": 52
|
| 102 |
+
},
|
| 103 |
+
"Reports": {
|
| 104 |
+
"semantic": 80.95238095238096,
|
| 105 |
+
"anls": 69.86245910414694,
|
| 106 |
+
"n": 75
|
| 107 |
+
},
|
| 108 |
+
"Technical": {
|
| 109 |
+
"semantic": 59.89352262644188,
|
| 110 |
+
"anls": 47.01665330524952,
|
| 111 |
+
"n": 23
|
| 112 |
+
}
|
| 113 |
+
},
|
| 114 |
+
"n_evaluated": 500,
|
| 115 |
+
"n_unmatched": 0
|
| 116 |
+
},
|
| 117 |
+
"reevaluated_date": "2026-01-22T22:05:27.215227+00:00",
|
| 118 |
+
"source_predictions_file": "Google/Gemini_3.0_Pro_(Preview)_with_File_Search_predictions_20260104_120431.jsonl",
|
| 119 |
+
"result_file_path": "Google/Gemini_3.0_Pro_(Preview)_with_File_Search_results_20260104_120431.json"
|
| 120 |
+
}
|
eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260117_193634.json
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Human with BM25 Search Tool",
|
| 3 |
+
"organization": "Humanity",
|
| 4 |
+
"description": "Human equipped with the same search engine as agentic baselines.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "api"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-17T19:36:34.967206+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 82.14285714285717,
|
| 19 |
+
"semantic_ci": [
|
| 20 |
+
78.5991954078934,
|
| 21 |
+
85.68651887782092
|
| 22 |
+
],
|
| 23 |
+
"anls": 83.70514272504292,
|
| 24 |
+
"page_f1": 79.25590111642744,
|
| 25 |
+
"doc_f1": 93.42612554112554,
|
| 26 |
+
"kuiper": 8.696969696969703
|
| 27 |
+
},
|
| 28 |
+
"single_evidence": {
|
| 29 |
+
"semantic": 84.14872798434443,
|
| 30 |
+
"anls": 85.91721757063848,
|
| 31 |
+
"n": 365
|
| 32 |
+
},
|
| 33 |
+
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 79.56754130223517,
|
| 35 |
+
"anls": 82.31765689212445,
|
| 36 |
+
"n": 84
|
| 37 |
+
},
|
| 38 |
+
"multi_evidence_multi_doc": {
|
| 39 |
+
"semantic": 72.02881152460986,
|
| 40 |
+
"anls": 70.15889745686191,
|
| 41 |
+
"n": 51
|
| 42 |
+
},
|
| 43 |
+
"by_domain": {
|
| 44 |
+
"Cases/Logs": {
|
| 45 |
+
"semantic": 68.02721088435374,
|
| 46 |
+
"anls": 72.72727272727272,
|
| 47 |
+
"n": 15
|
| 48 |
+
},
|
| 49 |
+
"Education": {
|
| 50 |
+
"semantic": 100.0,
|
| 51 |
+
"anls": 92.81192695119195,
|
| 52 |
+
"n": 22
|
| 53 |
+
},
|
| 54 |
+
"Events": {
|
| 55 |
+
"semantic": 87.15986394557822,
|
| 56 |
+
"anls": 83.66935483870968,
|
| 57 |
+
"n": 24
|
| 58 |
+
},
|
| 59 |
+
"Financial": {
|
| 60 |
+
"semantic": 78.19432120674357,
|
| 61 |
+
"anls": 79.59127298509043,
|
| 62 |
+
"n": 92
|
| 63 |
+
},
|
| 64 |
+
"Financial/Tax": {
|
| 65 |
+
"semantic": 82.90816326530613,
|
| 66 |
+
"anls": 83.28405017921146,
|
| 67 |
+
"n": 16
|
| 68 |
+
},
|
| 69 |
+
"Government/Regulatory": {
|
| 70 |
+
"semantic": 84.67216673903604,
|
| 71 |
+
"anls": 87.11278292004198,
|
| 72 |
+
"n": 47
|
| 73 |
+
},
|
| 74 |
+
"HR/Employment": {
|
| 75 |
+
"semantic": 87.10801393728221,
|
| 76 |
+
"anls": 81.26741515002432,
|
| 77 |
+
"n": 41
|
| 78 |
+
},
|
| 79 |
+
"Legal": {
|
| 80 |
+
"semantic": 88.9890840056953,
|
| 81 |
+
"anls": 85.09035538105306,
|
| 82 |
+
"n": 43
|
| 83 |
+
},
|
| 84 |
+
"Media/Publishing": {
|
| 85 |
+
"semantic": 75.51020408163265,
|
| 86 |
+
"anls": 79.91696395686839,
|
| 87 |
+
"n": 25
|
| 88 |
+
},
|
| 89 |
+
"Misc": {
|
| 90 |
+
"semantic": 74.4047619047619,
|
| 91 |
+
"anls": 79.0967077930904,
|
| 92 |
+
"n": 24
|
| 93 |
+
},
|
| 94 |
+
"Other": {
|
| 95 |
+
"semantic": 0.0,
|
| 96 |
+
"anls": 76.92307692307692,
|
| 97 |
+
"n": 1
|
| 98 |
+
},
|
| 99 |
+
"Reference": {
|
| 100 |
+
"semantic": 83.39874411302984,
|
| 101 |
+
"anls": 92.19517190376465,
|
| 102 |
+
"n": 52
|
| 103 |
+
},
|
| 104 |
+
"Reports": {
|
| 105 |
+
"semantic": 78.2312925170068,
|
| 106 |
+
"anls": 83.77634694564375,
|
| 107 |
+
"n": 75
|
| 108 |
+
},
|
| 109 |
+
"Technical": {
|
| 110 |
+
"semantic": 84.29458740017748,
|
| 111 |
+
"anls": 83.52609662978936,
|
| 112 |
+
"n": 23
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"n_evaluated": 500,
|
| 116 |
+
"n_unmatched": 0
|
| 117 |
+
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T22:06:05.498305+00:00",
|
| 119 |
+
"source_predictions_file": "Humanity/Human_with_BM25_Search_Tool_predictions_20260117_193634.jsonl",
|
| 120 |
+
"result_file_path": "Humanity/Human_with_BM25_Search_Tool_results_20260117_193634.json"
|
| 121 |
+
}
|
eval/reevaluated_results/Humanity/Human_with_Oracle_Retriever_results_20260122_214532.json
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Human with Oracle Retriever",
|
| 3 |
+
"organization": "Humanity",
|
| 4 |
+
"description": "Human given gold standard evidence pages.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Vision and Language"
|
| 8 |
+
],
|
| 9 |
+
"submitted_by": "Borchmann",
|
| 10 |
+
"metadata": {
|
| 11 |
+
"model_type": "api"
|
| 12 |
+
},
|
| 13 |
+
"submission_date": "2026-01-22T21:45:32.545870+00:00",
|
| 14 |
+
"results": {
|
| 15 |
+
"overall": {
|
| 16 |
+
"semantic": 99.38775510204081,
|
| 17 |
+
"semantic_ci": [
|
| 18 |
+
97.96443309097894,
|
| 19 |
+
100.0
|
| 20 |
+
],
|
| 21 |
+
"anls": 93.7121419059059,
|
| 22 |
+
"page_f1": 88.62341991341991,
|
| 23 |
+
"doc_f1": 97.18112554112554,
|
| 24 |
+
"kuiper": 4.630303030303009
|
| 25 |
+
},
|
| 26 |
+
"single_evidence": {
|
| 27 |
+
"semantic": 99.24517752306402,
|
| 28 |
+
"anls": 94.03593158263021,
|
| 29 |
+
"n": 365
|
| 30 |
+
},
|
| 31 |
+
"multi_evidence_same_doc": {
|
| 32 |
+
"semantic": 100.0,
|
| 33 |
+
"anls": 95.10936020828757,
|
| 34 |
+
"n": 84
|
| 35 |
+
},
|
| 36 |
+
"multi_evidence_multi_doc": {
|
| 37 |
+
"semantic": 98.03921568627453,
|
| 38 |
+
"anls": 89.09352289797572,
|
| 39 |
+
"n": 51
|
| 40 |
+
},
|
| 41 |
+
"by_domain": {
|
| 42 |
+
"Cases/Logs": {
|
| 43 |
+
"semantic": 95.23809523809524,
|
| 44 |
+
"anls": 93.33333333333333,
|
| 45 |
+
"n": 15
|
| 46 |
+
},
|
| 47 |
+
"Education": {
|
| 48 |
+
"semantic": 100.0,
|
| 49 |
+
"anls": 92.99351054603811,
|
| 50 |
+
"n": 22
|
| 51 |
+
},
|
| 52 |
+
"Events": {
|
| 53 |
+
"semantic": 100.0,
|
| 54 |
+
"anls": 97.74763766699252,
|
| 55 |
+
"n": 24
|
| 56 |
+
},
|
| 57 |
+
"Financial": {
|
| 58 |
+
"semantic": 98.15882874889085,
|
| 59 |
+
"anls": 92.61042950143054,
|
| 60 |
+
"n": 92
|
| 61 |
+
},
|
| 62 |
+
"Financial/Tax": {
|
| 63 |
+
"semantic": 100.0,
|
| 64 |
+
"anls": 96.38888888888889,
|
| 65 |
+
"n": 16
|
| 66 |
+
},
|
| 67 |
+
"Government/Regulatory": {
|
| 68 |
+
"semantic": 100.0,
|
| 69 |
+
"anls": 96.77157151500082,
|
| 70 |
+
"n": 47
|
| 71 |
+
},
|
| 72 |
+
"HR/Employment": {
|
| 73 |
+
"semantic": 97.06321553011448,
|
| 74 |
+
"anls": 91.56441076843375,
|
| 75 |
+
"n": 41
|
| 76 |
+
},
|
| 77 |
+
"Legal": {
|
| 78 |
+
"semantic": 100.0,
|
| 79 |
+
"anls": 95.3032819893285,
|
| 80 |
+
"n": 43
|
| 81 |
+
},
|
| 82 |
+
"Media/Publishing": {
|
| 83 |
+
"semantic": 93.87755102040816,
|
| 84 |
+
"anls": 90.74640522875816,
|
| 85 |
+
"n": 25
|
| 86 |
+
},
|
| 87 |
+
"Misc": {
|
| 88 |
+
"semantic": 97.78911564625852,
|
| 89 |
+
"anls": 90.94982803662161,
|
| 90 |
+
"n": 24
|
| 91 |
+
},
|
| 92 |
+
"Other": {
|
| 93 |
+
"semantic": 0.0,
|
| 94 |
+
"anls": 76.92307692307692,
|
| 95 |
+
"n": 1
|
| 96 |
+
},
|
| 97 |
+
"Reference": {
|
| 98 |
+
"semantic": 100.0,
|
| 99 |
+
"anls": 96.68889612023322,
|
| 100 |
+
"n": 52
|
| 101 |
+
},
|
| 102 |
+
"Reports": {
|
| 103 |
+
"semantic": 98.63945578231294,
|
| 104 |
+
"anls": 93.02938751681923,
|
| 105 |
+
"n": 75
|
| 106 |
+
},
|
| 107 |
+
"Technical": {
|
| 108 |
+
"semantic": 100.0,
|
| 109 |
+
"anls": 89.9146569474778,
|
| 110 |
+
"n": 23
|
| 111 |
+
}
|
| 112 |
+
},
|
| 113 |
+
"n_evaluated": 500,
|
| 114 |
+
"n_unmatched": 0
|
| 115 |
+
},
|
| 116 |
+
"reevaluated_date": "2026-01-22T22:06:32.856334+00:00",
|
| 117 |
+
"source_predictions_file": "Humanity/Human_with_Oracle_Retriever_predictions_20260122_214532.jsonl",
|
| 118 |
+
"result_file_path": "Humanity/Human_with_Oracle_Retriever_results_20260122_214532.json"
|
| 119 |
+
}
|
eval/reevaluated_results/OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153221.json
CHANGED
|
@@ -15,28 +15,28 @@
|
|
| 15 |
"submission_date": "2026-01-09T15:32:21.908816+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic":
|
| 19 |
"semantic_ci": [
|
| 20 |
-
|
| 21 |
-
|
| 22 |
],
|
| 23 |
-
"anls":
|
| 24 |
"page_f1": 64.14190476190477,
|
| 25 |
"doc_f1": 82.82666666666667,
|
| 26 |
-
"kuiper":
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic":
|
| 30 |
-
"anls":
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
-
"semantic":
|
| 35 |
-
"anls":
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
-
"semantic":
|
| 40 |
"anls": 42.087570381688025,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
@@ -52,33 +52,33 @@
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
-
"semantic":
|
| 56 |
"anls": 67.55050505050505,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
-
"semantic":
|
| 61 |
-
"anls":
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
| 65 |
-
"semantic":
|
| 66 |
"anls": 64.58333333333334,
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
"semantic": 57.533651758575765,
|
| 71 |
-
"anls": 51.
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
-
"semantic":
|
| 76 |
-
"anls":
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
-
"semantic":
|
| 81 |
-
"anls":
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
|
@@ -93,21 +93,21 @@
|
|
| 93 |
},
|
| 94 |
"Other": {
|
| 95 |
"semantic": 0.0,
|
| 96 |
-
"anls":
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
"semantic": 64.75667189952904,
|
| 101 |
-
"anls": 60.
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
-
"semantic":
|
| 106 |
-
"anls":
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
-
"semantic":
|
| 111 |
"anls": 61.60068502092203,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
-
"reevaluated_date": "2026-01-
|
| 119 |
"source_predictions_file": "OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153221.jsonl",
|
| 120 |
"result_file_path": "OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153221.json"
|
| 121 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-09T15:32:21.908816+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 60.00000000000001,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
55.59775423620991,
|
| 21 |
+
64.4022457637901
|
| 22 |
],
|
| 23 |
+
"anls": 54.023238374203174,
|
| 24 |
"page_f1": 64.14190476190477,
|
| 25 |
"doc_f1": 82.82666666666667,
|
| 26 |
+
"kuiper": 43.20000000000029
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 66.25663964215825,
|
| 30 |
+
"anls": 59.82892458572295,
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 42.5170068027211,
|
| 35 |
+
"anls": 36.04280504579306,
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
+
"semantic": 44.01760704281713,
|
| 40 |
"anls": 42.087570381688025,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
+
"semantic": 74.4047619047619,
|
| 56 |
"anls": 67.55050505050505,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
+
"semantic": 43.81100266193434,
|
| 61 |
+
"anls": 46.01082061559715,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
| 65 |
+
"semantic": 76.53061224489795,
|
| 66 |
"anls": 64.58333333333334,
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
"semantic": 57.533651758575765,
|
| 71 |
+
"anls": 51.55429065920628,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
+
"semantic": 72.17521154803384,
|
| 76 |
+
"anls": 56.33701337004763,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
+
"semantic": 61.69909824394873,
|
| 81 |
+
"anls": 57.105943152454785,
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
|
|
|
| 93 |
},
|
| 94 |
"Other": {
|
| 95 |
"semantic": 0.0,
|
| 96 |
+
"anls": 76.92307692307692,
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
"semantic": 64.75667189952904,
|
| 101 |
+
"anls": 60.26278174219628,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 62.58503401360544,
|
| 106 |
+
"anls": 46.65638178197955,
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
+
"semantic": 75.42147293700089,
|
| 111 |
"anls": 61.60068502092203,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T22:07:42.289399+00:00",
|
| 119 |
"source_predictions_file": "OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153221.jsonl",
|
| 120 |
"result_file_path": "OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153221.json"
|
| 121 |
}
|
eval/reevaluated_results/OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153812.json
CHANGED
|
@@ -23,15 +23,15 @@
|
|
| 23 |
"anls": 19.21201395702391,
|
| 24 |
"page_f1": 27.60809523809524,
|
| 25 |
"doc_f1": 40.18095238095238,
|
| 26 |
-
"kuiper":
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic":
|
| 30 |
"anls": 22.4105437044892,
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
-
"semantic":
|
| 35 |
"anls": 9.153597726228908,
|
| 36 |
"n": 84
|
| 37 |
},
|
|
@@ -77,7 +77,7 @@
|
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
-
"semantic":
|
| 81 |
"anls": 20.54263565891473,
|
| 82 |
"n": 43
|
| 83 |
},
|
|
@@ -97,17 +97,17 @@
|
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
-
"semantic":
|
| 101 |
"anls": 20.3827772417516,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
-
"semantic": 23.
|
| 106 |
"anls": 19.284216647617285,
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
-
"semantic":
|
| 111 |
"anls": 27.075249588209658,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
-
"reevaluated_date": "2026-01-
|
| 119 |
"source_predictions_file": "OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153812.jsonl",
|
| 120 |
"result_file_path": "OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153812.json"
|
| 121 |
}
|
|
|
|
| 23 |
"anls": 19.21201395702391,
|
| 24 |
"page_f1": 27.60809523809524,
|
| 25 |
"doc_f1": 40.18095238095238,
|
| 26 |
+
"kuiper": 28.60000000000006
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 24.042493709812685,
|
| 30 |
"anls": 22.4105437044892,
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 6.073858114674442,
|
| 35 |
"anls": 9.153597726228908,
|
| 36 |
"n": 84
|
| 37 |
},
|
|
|
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
+
"semantic": 23.730422401518744,
|
| 81 |
"anls": 20.54263565891473,
|
| 82 |
"n": 43
|
| 83 |
},
|
|
|
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
+
"semantic": 16.679748822605976,
|
| 101 |
"anls": 20.3827772417516,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 23.809523809523817,
|
| 106 |
"anls": 19.284216647617285,
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
+
"semantic": 26.619343389529732,
|
| 111 |
"anls": 27.075249588209658,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T22:08:24.315093+00:00",
|
| 119 |
"source_predictions_file": "OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153812.jsonl",
|
| 120 |
"result_file_path": "OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153812.json"
|
| 121 |
}
|
eval/reevaluated_results/OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_results_20260109_151912.json
CHANGED
|
@@ -15,29 +15,29 @@
|
|
| 15 |
"submission_date": "2026-01-09T15:19:12.016451+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic": 67.
|
| 19 |
"semantic_ci": [
|
| 20 |
-
|
| 21 |
-
71.
|
| 22 |
],
|
| 23 |
-
"anls": 57.
|
| 24 |
"page_f1": 67.62380952380951,
|
| 25 |
"doc_f1": 83.72666666666666,
|
| 26 |
-
"kuiper":
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic":
|
| 30 |
-
"anls":
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
-
"semantic":
|
| 35 |
-
"anls":
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
-
"semantic":
|
| 40 |
-
"anls": 54.
|
| 41 |
"n": 51
|
| 42 |
},
|
| 43 |
"by_domain": {
|
|
@@ -52,13 +52,13 @@
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
-
"semantic":
|
| 56 |
"anls": 57.55050505050505,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
-
"semantic":
|
| 61 |
-
"anls":
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
@@ -68,46 +68,46 @@
|
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
"semantic": 68.38905775075989,
|
| 71 |
-
"anls": 58.
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
-
"semantic":
|
| 76 |
"anls": 44.265703074651974,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
-
"semantic":
|
| 81 |
-
"anls": 66.
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
| 85 |
-
"semantic":
|
| 86 |
"anls": 35.05751747729549,
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
| 90 |
-
"semantic":
|
| 91 |
"anls": 82.5164707977208,
|
| 92 |
"n": 24
|
| 93 |
},
|
| 94 |
"Other": {
|
| 95 |
"semantic": 0.0,
|
| 96 |
-
"anls":
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
-
"semantic":
|
| 101 |
-
"anls":
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
-
"semantic": 68.
|
| 106 |
-
"anls": 59.
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
-
"semantic":
|
| 111 |
"anls": 55.55075090789312,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
-
"reevaluated_date": "2026-01-
|
| 119 |
"source_predictions_file": "OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_predictions_20260109_151912.jsonl",
|
| 120 |
"result_file_path": "OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_results_20260109_151912.json"
|
| 121 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-09T15:19:12.016451+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 67.75510204081634,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
63.53045246104174,
|
| 21 |
+
71.97975162059093
|
| 22 |
],
|
| 23 |
+
"anls": 57.79787184551629,
|
| 24 |
"page_f1": 67.62380952380951,
|
| 25 |
"doc_f1": 83.72666666666666,
|
| 26 |
+
"kuiper": 64.75199999999975
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 73.24573665082471,
|
| 30 |
+
"anls": 62.10194433589725,
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 51.62779397473275,
|
| 35 |
+
"anls": 41.00509049000287,
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
+
"semantic": 55.022008803521395,
|
| 40 |
+
"anls": 54.65291449010614,
|
| 41 |
"n": 51
|
| 42 |
},
|
| 43 |
"by_domain": {
|
|
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
+
"semantic": 76.53061224489795,
|
| 56 |
"anls": 57.55050505050505,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
+
"semantic": 57.67524401064772,
|
| 61 |
+
"anls": 51.01775086245134,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
|
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
"semantic": 68.38905775075989,
|
| 71 |
+
"anls": 58.57991496289369,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
+
"semantic": 60.97560975609757,
|
| 76 |
"anls": 44.265703074651974,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
+
"semantic": 83.05647840531563,
|
| 81 |
+
"anls": 66.93395751535287,
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
| 85 |
+
"semantic": 36.734693877551,
|
| 86 |
"anls": 35.05751747729549,
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
| 90 |
+
"semantic": 85.0340136054422,
|
| 91 |
"anls": 82.5164707977208,
|
| 92 |
"n": 24
|
| 93 |
},
|
| 94 |
"Other": {
|
| 95 |
"semantic": 0.0,
|
| 96 |
+
"anls": 76.92307692307692,
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
+
"semantic": 77.51177394034536,
|
| 101 |
+
"anls": 68.29355847313767,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 68.70748299319727,
|
| 106 |
+
"anls": 59.504311004808905,
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
+
"semantic": 59.89352262644188,
|
| 111 |
"anls": 55.55075090789312,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T22:09:18.181737+00:00",
|
| 119 |
"source_predictions_file": "OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_predictions_20260109_151912.jsonl",
|
| 120 |
"result_file_path": "OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_results_20260109_151912.json"
|
| 121 |
}
|
eval/reevaluated_results/OpenAI/GPT-5.2_(2025-12-11)_with_File_Search_results_20260104_121551.json
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GPT-5.2 (2025-12-11) with File Search",
|
| 3 |
+
"organization": "OpenAI",
|
| 4 |
+
"description": "Managed, single-shot retrieval mechanism.",
|
| 5 |
+
"link": "https://platform.openai.com/docs/guides/tools-file-search",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Conventional RAG",
|
| 8 |
+
"Semantic Search Tool"
|
| 9 |
+
],
|
| 10 |
+
"submitted_by": "Borchmann",
|
| 11 |
+
"metadata": {
|
| 12 |
+
"model_type": "api"
|
| 13 |
+
},
|
| 14 |
+
"submission_date": "2026-01-04T12:15:51.350064+00:00",
|
| 15 |
+
"results": {
|
| 16 |
+
"overall": {
|
| 17 |
+
"semantic": 50.0,
|
| 18 |
+
"semantic_ci": [
|
| 19 |
+
45.52884072176271,
|
| 20 |
+
54.471159278237295
|
| 21 |
+
],
|
| 22 |
+
"anls": 46.08410854382378,
|
| 23 |
+
"page_f1": 28.519365079365084,
|
| 24 |
+
"doc_f1": 68.52666666666667,
|
| 25 |
+
"kuiper": 26.695999999999877
|
| 26 |
+
},
|
| 27 |
+
"single_evidence": {
|
| 28 |
+
"semantic": 56.192339949678505,
|
| 29 |
+
"anls": 50.86266041052095,
|
| 30 |
+
"n": 365
|
| 31 |
+
},
|
| 32 |
+
"multi_evidence_same_doc": {
|
| 33 |
+
"semantic": 39.48007774538386,
|
| 34 |
+
"anls": 38.16677868634344,
|
| 35 |
+
"n": 84
|
| 36 |
+
},
|
| 37 |
+
"multi_evidence_multi_doc": {
|
| 38 |
+
"semantic": 23.0092036814726,
|
| 39 |
+
"anls": 24.92497671409598,
|
| 40 |
+
"n": 51
|
| 41 |
+
},
|
| 42 |
+
"by_domain": {
|
| 43 |
+
"Cases/Logs": {
|
| 44 |
+
"semantic": 20.40816326530612,
|
| 45 |
+
"anls": 20.0,
|
| 46 |
+
"n": 15
|
| 47 |
+
},
|
| 48 |
+
"Education": {
|
| 49 |
+
"semantic": 74.21150278293136,
|
| 50 |
+
"anls": 60.780598189689094,
|
| 51 |
+
"n": 22
|
| 52 |
+
},
|
| 53 |
+
"Events": {
|
| 54 |
+
"semantic": 63.775510204081634,
|
| 55 |
+
"anls": 49.319377600130515,
|
| 56 |
+
"n": 24
|
| 57 |
+
},
|
| 58 |
+
"Financial": {
|
| 59 |
+
"semantic": 54.902395740905064,
|
| 60 |
+
"anls": 52.85763703072669,
|
| 61 |
+
"n": 92
|
| 62 |
+
},
|
| 63 |
+
"Financial/Tax": {
|
| 64 |
+
"semantic": 25.510204081632654,
|
| 65 |
+
"anls": 24.39516129032258,
|
| 66 |
+
"n": 16
|
| 67 |
+
},
|
| 68 |
+
"Government/Regulatory": {
|
| 69 |
+
"semantic": 53.19148936170214,
|
| 70 |
+
"anls": 44.987482344705185,
|
| 71 |
+
"n": 47
|
| 72 |
+
},
|
| 73 |
+
"HR/Employment": {
|
| 74 |
+
"semantic": 42.309606769537076,
|
| 75 |
+
"anls": 45.208397582300094,
|
| 76 |
+
"n": 41
|
| 77 |
+
},
|
| 78 |
+
"Legal": {
|
| 79 |
+
"semantic": 48.647365923113426,
|
| 80 |
+
"anls": 39.61843128160958,
|
| 81 |
+
"n": 43
|
| 82 |
+
},
|
| 83 |
+
"Media/Publishing": {
|
| 84 |
+
"semantic": 40.81632653061224,
|
| 85 |
+
"anls": 38.34620945100294,
|
| 86 |
+
"n": 25
|
| 87 |
+
},
|
| 88 |
+
"Misc": {
|
| 89 |
+
"semantic": 61.64965986394556,
|
| 90 |
+
"anls": 59.15390227662317,
|
| 91 |
+
"n": 24
|
| 92 |
+
},
|
| 93 |
+
"Other": {
|
| 94 |
+
"semantic": 0.0,
|
| 95 |
+
"anls": 0.0,
|
| 96 |
+
"n": 1
|
| 97 |
+
},
|
| 98 |
+
"Reference": {
|
| 99 |
+
"semantic": 38.265306122448976,
|
| 100 |
+
"anls": 41.31709677341507,
|
| 101 |
+
"n": 52
|
| 102 |
+
},
|
| 103 |
+
"Reports": {
|
| 104 |
+
"semantic": 47.619047619047635,
|
| 105 |
+
"anls": 43.77803423273486,
|
| 106 |
+
"n": 75
|
| 107 |
+
},
|
| 108 |
+
"Technical": {
|
| 109 |
+
"semantic": 73.20319432120675,
|
| 110 |
+
"anls": 64.61964587357369,
|
| 111 |
+
"n": 23
|
| 112 |
+
}
|
| 113 |
+
},
|
| 114 |
+
"n_evaluated": 500,
|
| 115 |
+
"n_unmatched": 0
|
| 116 |
+
},
|
| 117 |
+
"reevaluated_date": "2026-01-22T22:10:25.673064+00:00",
|
| 118 |
+
"source_predictions_file": "OpenAI/GPT-5.2_(2025-12-11)_with_File_Search_predictions_20260104_121551.jsonl",
|
| 119 |
+
"result_file_path": "OpenAI/GPT-5.2_(2025-12-11)_with_File_Search_results_20260104_121551.json"
|
| 120 |
+
}
|
eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152104.json
CHANGED
|
@@ -15,28 +15,28 @@
|
|
| 15 |
"submission_date": "2026-01-09T15:21:04.336083+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic": 77.
|
| 19 |
"semantic_ci": [
|
| 20 |
-
73.
|
| 21 |
-
|
| 22 |
],
|
| 23 |
-
"anls":
|
| 24 |
"page_f1": 74.16285714285713,
|
| 25 |
"doc_f1": 86.45064935064934,
|
| 26 |
-
"kuiper":
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic":
|
| 30 |
-
"anls":
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
-
"semantic":
|
| 35 |
-
"anls":
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
-
"semantic":
|
| 40 |
"anls": 67.63478281504847,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
@@ -57,8 +57,8 @@
|
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
-
"semantic":
|
| 61 |
-
"anls":
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
@@ -67,43 +67,43 @@
|
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
-
"semantic":
|
| 71 |
-
"anls":
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
-
"semantic":
|
| 76 |
-
"anls":
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
-
"semantic":
|
| 81 |
-
"anls":
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
| 85 |
-
"semantic":
|
| 86 |
"anls": 65.71897407160566,
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
| 90 |
-
"semantic":
|
| 91 |
"anls": 86.70405982905983,
|
| 92 |
"n": 24
|
| 93 |
},
|
| 94 |
"Other": {
|
| 95 |
-
"semantic":
|
| 96 |
-
"anls":
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
-
"semantic":
|
| 101 |
-
"anls":
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
-
"semantic":
|
| 106 |
-
"anls": 71.
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
-
"reevaluated_date": "2026-01-
|
| 119 |
"source_predictions_file": "OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152104.jsonl",
|
| 120 |
"result_file_path": "OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152104.json"
|
| 121 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-09T15:21:04.336083+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 77.6530612244898,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
73.83864314648315,
|
| 21 |
+
81.46747930249646
|
| 22 |
],
|
| 23 |
+
"anls": 71.05072433302601,
|
| 24 |
"page_f1": 74.16285714285713,
|
| 25 |
"doc_f1": 86.45064935064934,
|
| 26 |
+
"kuiper": 52.62199999999985
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 82.19178082191782,
|
| 30 |
+
"anls": 74.70786997380705,
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 60.13119533527698,
|
| 35 |
+
"anls": 57.23352026792817,
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
+
"semantic": 74.02961184473791,
|
| 40 |
"anls": 67.63478281504847,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
|
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
+
"semantic": 68.76663708961844,
|
| 61 |
+
"anls": 65.45826780841752,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
|
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
+
"semantic": 77.07338254450717,
|
| 71 |
+
"anls": 69.85898552527891,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
+
"semantic": 74.66401194624191,
|
| 76 |
+
"anls": 65.78837943188886,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
+
"semantic": 83.05647840531563,
|
| 81 |
+
"anls": 71.01139170906613,
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
| 85 |
+
"semantic": 61.22448979591838,
|
| 86 |
"anls": 65.71897407160566,
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
| 90 |
+
"semantic": 87.15986394557822,
|
| 91 |
"anls": 86.70405982905983,
|
| 92 |
"n": 24
|
| 93 |
},
|
| 94 |
"Other": {
|
| 95 |
+
"semantic": 100.0,
|
| 96 |
+
"anls": 100.0,
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
+
"semantic": 80.45525902668759,
|
| 101 |
+
"anls": 77.21482727865639,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 81.63265306122449,
|
| 106 |
+
"anls": 71.12880554965163,
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
|
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T22:11:16.583710+00:00",
|
| 119 |
"source_predictions_file": "OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152104.jsonl",
|
| 120 |
"result_file_path": "OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152104.json"
|
| 121 |
}
|
eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_File_Search_results_20260104_140537.json
CHANGED
|
@@ -14,24 +14,24 @@
|
|
| 14 |
"submission_date": "2026-01-04T14:05:37.240829+00:00",
|
| 15 |
"results": {
|
| 16 |
"overall": {
|
| 17 |
-
"semantic":
|
| 18 |
"semantic_ci": [
|
| 19 |
-
|
| 20 |
-
|
| 21 |
],
|
| 22 |
-
"anls":
|
| 23 |
"page_f1": 29.277142857142856,
|
| 24 |
"doc_f1": 66.60666666666667,
|
| 25 |
-
"kuiper":
|
| 26 |
},
|
| 27 |
"single_evidence": {
|
| 28 |
-
"semantic":
|
| 29 |
-
"anls":
|
| 30 |
"n": 365
|
| 31 |
},
|
| 32 |
"multi_evidence_same_doc": {
|
| 33 |
"semantic": 36.443148688046655,
|
| 34 |
-
"anls": 38.
|
| 35 |
"n": 84
|
| 36 |
},
|
| 37 |
"multi_evidence_multi_doc": {
|
|
@@ -41,7 +41,7 @@
|
|
| 41 |
},
|
| 42 |
"by_domain": {
|
| 43 |
"Cases/Logs": {
|
| 44 |
-
"semantic":
|
| 45 |
"anls": 14.833333333333334,
|
| 46 |
"n": 15
|
| 47 |
},
|
|
@@ -51,13 +51,13 @@
|
|
| 51 |
"n": 22
|
| 52 |
},
|
| 53 |
"Events": {
|
| 54 |
-
"semantic":
|
| 55 |
"anls": 55.83149489399489,
|
| 56 |
"n": 24
|
| 57 |
},
|
| 58 |
"Financial": {
|
| 59 |
-
"semantic":
|
| 60 |
-
"anls":
|
| 61 |
"n": 92
|
| 62 |
},
|
| 63 |
"Financial/Tax": {
|
|
@@ -66,8 +66,8 @@
|
|
| 66 |
"n": 16
|
| 67 |
},
|
| 68 |
"Government/Regulatory": {
|
| 69 |
-
"semantic":
|
| 70 |
-
"anls": 41.
|
| 71 |
"n": 47
|
| 72 |
},
|
| 73 |
"HR/Employment": {
|
|
@@ -76,8 +76,8 @@
|
|
| 76 |
"n": 41
|
| 77 |
},
|
| 78 |
"Legal": {
|
| 79 |
-
"semantic":
|
| 80 |
-
"anls":
|
| 81 |
"n": 43
|
| 82 |
},
|
| 83 |
"Media/Publishing": {
|
|
@@ -96,17 +96,17 @@
|
|
| 96 |
"n": 1
|
| 97 |
},
|
| 98 |
"Reference": {
|
| 99 |
-
"semantic":
|
| 100 |
"anls": 40.48309244262362,
|
| 101 |
"n": 52
|
| 102 |
},
|
| 103 |
"Reports": {
|
| 104 |
"semantic": 47.619047619047635,
|
| 105 |
-
"anls": 46.
|
| 106 |
"n": 75
|
| 107 |
},
|
| 108 |
"Technical": {
|
| 109 |
-
"semantic":
|
| 110 |
"anls": 62.77759844334801,
|
| 111 |
"n": 23
|
| 112 |
}
|
|
@@ -114,7 +114,7 @@
|
|
| 114 |
"n_evaluated": 500,
|
| 115 |
"n_unmatched": 0
|
| 116 |
},
|
| 117 |
-
"reevaluated_date": "2026-01-
|
| 118 |
"source_predictions_file": "OpenAI/GPT-5_(2025-08-07)_with_File_Search_predictions_20260104_140537.jsonl",
|
| 119 |
"result_file_path": "OpenAI/GPT-5_(2025-08-07)_with_File_Search_results_20260104_140537.json"
|
| 120 |
}
|
|
|
|
| 14 |
"submission_date": "2026-01-04T14:05:37.240829+00:00",
|
| 15 |
"results": {
|
| 16 |
"overall": {
|
| 17 |
+
"semantic": 49.59183673469388,
|
| 18 |
"semantic_ci": [
|
| 19 |
+
45.12153634505467,
|
| 20 |
+
54.06213712433308
|
| 21 |
],
|
| 22 |
+
"anls": 45.66299621788548,
|
| 23 |
"page_f1": 29.277142857142856,
|
| 24 |
"doc_f1": 66.60666666666667,
|
| 25 |
+
"kuiper": 29.030000000000086
|
| 26 |
},
|
| 27 |
"single_evidence": {
|
| 28 |
+
"semantic": 56.05255800950517,
|
| 29 |
+
"anls": 49.87536451502012,
|
| 30 |
"n": 365
|
| 31 |
},
|
| 32 |
"multi_evidence_same_doc": {
|
| 33 |
"semantic": 36.443148688046655,
|
| 34 |
+
"anls": 38.377481387099635,
|
| 35 |
"n": 84
|
| 36 |
},
|
| 37 |
"multi_evidence_multi_doc": {
|
|
|
|
| 41 |
},
|
| 42 |
"by_domain": {
|
| 43 |
"Cases/Logs": {
|
| 44 |
+
"semantic": 13.605442176870747,
|
| 45 |
"anls": 14.833333333333334,
|
| 46 |
"n": 15
|
| 47 |
},
|
|
|
|
| 51 |
"n": 22
|
| 52 |
},
|
| 53 |
"Events": {
|
| 54 |
+
"semantic": 76.53061224489795,
|
| 55 |
"anls": 55.83149489399489,
|
| 56 |
"n": 24
|
| 57 |
},
|
| 58 |
"Financial": {
|
| 59 |
+
"semantic": 56.011535048802116,
|
| 60 |
+
"anls": 50.61975566529438,
|
| 61 |
"n": 92
|
| 62 |
},
|
| 63 |
"Financial/Tax": {
|
|
|
|
| 66 |
"n": 16
|
| 67 |
},
|
| 68 |
"Government/Regulatory": {
|
| 69 |
+
"semantic": 46.678245766391655,
|
| 70 |
+
"anls": 41.76403595954804,
|
| 71 |
"n": 47
|
| 72 |
},
|
| 73 |
"HR/Employment": {
|
|
|
|
| 76 |
"n": 41
|
| 77 |
},
|
| 78 |
"Legal": {
|
| 79 |
+
"semantic": 40.34171808258187,
|
| 80 |
+
"anls": 33.905874487269834,
|
| 81 |
"n": 43
|
| 82 |
},
|
| 83 |
"Media/Publishing": {
|
|
|
|
| 96 |
"n": 1
|
| 97 |
},
|
| 98 |
"Reference": {
|
| 99 |
+
"semantic": 42.1899529042386,
|
| 100 |
"anls": 40.48309244262362,
|
| 101 |
"n": 52
|
| 102 |
},
|
| 103 |
"Reports": {
|
| 104 |
"semantic": 47.619047619047635,
|
| 105 |
+
"anls": 46.22668610488168,
|
| 106 |
"n": 75
|
| 107 |
},
|
| 108 |
"Technical": {
|
| 109 |
+
"semantic": 64.33007985803019,
|
| 110 |
"anls": 62.77759844334801,
|
| 111 |
"n": 23
|
| 112 |
}
|
|
|
|
| 114 |
"n_evaluated": 500,
|
| 115 |
"n_unmatched": 0
|
| 116 |
},
|
| 117 |
+
"reevaluated_date": "2026-01-22T22:12:20.346525+00:00",
|
| 118 |
"source_predictions_file": "OpenAI/GPT-5_(2025-08-07)_with_File_Search_predictions_20260104_140537.jsonl",
|
| 119 |
"result_file_path": "OpenAI/GPT-5_(2025-08-07)_with_File_Search_results_20260104_140537.json"
|
| 120 |
}
|
eval/reevaluated_results/OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152650.json
CHANGED
|
@@ -15,34 +15,34 @@
|
|
| 15 |
"submission_date": "2026-01-09T15:26:50.820104+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic":
|
| 19 |
"semantic_ci": [
|
| 20 |
-
|
| 21 |
-
|
| 22 |
],
|
| 23 |
-
"anls": 55.
|
| 24 |
"page_f1": 67.57095238095239,
|
| 25 |
"doc_f1": 82.35303030303031,
|
| 26 |
-
"kuiper": 73.
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic":
|
| 30 |
-
"anls":
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
-
"semantic": 47.
|
| 35 |
-
"anls": 37.
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
-
"semantic":
|
| 40 |
"anls": 41.86712350546175,
|
| 41 |
"n": 51
|
| 42 |
},
|
| 43 |
"by_domain": {
|
| 44 |
"Cases/Logs": {
|
| 45 |
-
"semantic":
|
| 46 |
"anls": 57.16524216524217,
|
| 47 |
"n": 15
|
| 48 |
},
|
|
@@ -52,13 +52,13 @@
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
-
"semantic":
|
| 56 |
"anls": 53.63190419293608,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
-
"semantic":
|
| 61 |
-
"anls":
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
@@ -67,8 +67,8 @@
|
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
-
"semantic":
|
| 71 |
-
"anls": 62.
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
|
@@ -77,33 +77,33 @@
|
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
-
"semantic":
|
| 81 |
"anls": 62.31744836688789,
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
| 85 |
-
"semantic":
|
| 86 |
"anls": 39.93216037493774,
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
| 90 |
-
"semantic":
|
| 91 |
-
"anls": 63.
|
| 92 |
"n": 24
|
| 93 |
},
|
| 94 |
"Other": {
|
| 95 |
-
"semantic":
|
| 96 |
"anls": 0.0,
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
"semantic": 80.45525902668759,
|
| 101 |
-
"anls": 73.
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
-
"semantic":
|
| 106 |
-
"anls": 54.
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
-
"reevaluated_date": "2026-01-
|
| 119 |
"source_predictions_file": "OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152650.jsonl",
|
| 120 |
"result_file_path": "OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152650.json"
|
| 121 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-09T15:26:50.820104+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 66.9387755102041,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
62.68995725738806,
|
| 21 |
+
71.18759376302013
|
| 22 |
],
|
| 23 |
+
"anls": 55.23436182110524,
|
| 24 |
"page_f1": 67.57095238095239,
|
| 25 |
"doc_f1": 82.35303030303031,
|
| 26 |
+
"kuiper": 73.23246492985982
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 73.94464635169136,
|
| 30 |
+
"anls": 61.111836074632876,
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 47.9834791059281,
|
| 35 |
+
"anls": 37.81116005396503,
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
+
"semantic": 48.01920768307324,
|
| 40 |
"anls": 41.86712350546175,
|
| 41 |
"n": 51
|
| 42 |
},
|
| 43 |
"by_domain": {
|
| 44 |
"Cases/Logs": {
|
| 45 |
+
"semantic": 71.42857142857143,
|
| 46 |
"anls": 57.16524216524217,
|
| 47 |
"n": 15
|
| 48 |
},
|
|
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
+
"semantic": 68.02721088435374,
|
| 56 |
"anls": 53.63190419293608,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
+
"semantic": 52.68411712511091,
|
| 61 |
+
"anls": 44.09196027631984,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
|
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
+
"semantic": 73.81676074685193,
|
| 71 |
+
"anls": 62.884689959158045,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
|
|
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
+
"semantic": 78.31039392501187,
|
| 81 |
"anls": 62.31744836688789,
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
| 85 |
+
"semantic": 53.06122448979592,
|
| 86 |
"anls": 39.93216037493774,
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
| 90 |
+
"semantic": 74.4047619047619,
|
| 91 |
+
"anls": 63.84908648450315,
|
| 92 |
"n": 24
|
| 93 |
},
|
| 94 |
"Other": {
|
| 95 |
+
"semantic": 51.02040816326531,
|
| 96 |
"anls": 0.0,
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
"semantic": 80.45525902668759,
|
| 101 |
+
"anls": 73.08404616236015,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 68.02721088435374,
|
| 106 |
+
"anls": 54.719889251166556,
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
|
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T22:13:30.005424+00:00",
|
| 119 |
"source_predictions_file": "OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152650.jsonl",
|
| 120 |
"result_file_path": "OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152650.json"
|
| 121 |
}
|
eval/reevaluated_results/OpenAI/GPT-5_Mini_(2025-08-07)_with_File_Search_results_20260104_122026.json
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GPT-5 Mini (2025-08-07) with File Search",
|
| 3 |
+
"organization": "OpenAI",
|
| 4 |
+
"description": "Managed, single-shot retrieval mechanism.",
|
| 5 |
+
"link": "https://platform.openai.com/docs/guides/tools-file-search",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Conventional RAG",
|
| 8 |
+
"Semantic Search Tool"
|
| 9 |
+
],
|
| 10 |
+
"submitted_by": "Borchmann",
|
| 11 |
+
"metadata": {
|
| 12 |
+
"model_type": "api"
|
| 13 |
+
},
|
| 14 |
+
"submission_date": "2026-01-04T12:20:26.152729+00:00",
|
| 15 |
+
"results": {
|
| 16 |
+
"overall": {
|
| 17 |
+
"semantic": 48.46938775510205,
|
| 18 |
+
"semantic_ci": [
|
| 19 |
+
44.00292754203926,
|
| 20 |
+
52.935847968164836
|
| 21 |
+
],
|
| 22 |
+
"anls": 41.81689987677872,
|
| 23 |
+
"page_f1": 29.013073593073592,
|
| 24 |
+
"doc_f1": 67.32666666666667,
|
| 25 |
+
"kuiper": 28.0
|
| 26 |
+
},
|
| 27 |
+
"single_evidence": {
|
| 28 |
+
"semantic": 55.21386636846521,
|
| 29 |
+
"anls": 47.22776663670515,
|
| 30 |
+
"n": 365
|
| 31 |
+
},
|
| 32 |
+
"multi_evidence_same_doc": {
|
| 33 |
+
"semantic": 32.79883381924198,
|
| 34 |
+
"anls": 30.536053736441175,
|
| 35 |
+
"n": 84
|
| 36 |
+
},
|
| 37 |
+
"multi_evidence_multi_doc": {
|
| 38 |
+
"semantic": 26.010404161664663,
|
| 39 |
+
"anls": 21.672286316292585,
|
| 40 |
+
"n": 51
|
| 41 |
+
},
|
| 42 |
+
"by_domain": {
|
| 43 |
+
"Cases/Logs": {
|
| 44 |
+
"semantic": 13.605442176870747,
|
| 45 |
+
"anls": 16.666666666666664,
|
| 46 |
+
"n": 15
|
| 47 |
+
},
|
| 48 |
+
"Education": {
|
| 49 |
+
"semantic": 81.16883116883116,
|
| 50 |
+
"anls": 68.20851085673219,
|
| 51 |
+
"n": 22
|
| 52 |
+
},
|
| 53 |
+
"Events": {
|
| 54 |
+
"semantic": 65.9013605442177,
|
| 55 |
+
"anls": 53.08302808302808,
|
| 56 |
+
"n": 24
|
| 57 |
+
},
|
| 58 |
+
"Financial": {
|
| 59 |
+
"semantic": 52.12954747116238,
|
| 60 |
+
"anls": 46.51029159374941,
|
| 61 |
+
"n": 92
|
| 62 |
+
},
|
| 63 |
+
"Financial/Tax": {
|
| 64 |
+
"semantic": 19.132653061224488,
|
| 65 |
+
"anls": 17.775843108504397,
|
| 66 |
+
"n": 16
|
| 67 |
+
},
|
| 68 |
+
"Government/Regulatory": {
|
| 69 |
+
"semantic": 45.59270516717324,
|
| 70 |
+
"anls": 37.66328697492653,
|
| 71 |
+
"n": 47
|
| 72 |
+
},
|
| 73 |
+
"HR/Employment": {
|
| 74 |
+
"semantic": 41.06520657043304,
|
| 75 |
+
"anls": 36.31954842987894,
|
| 76 |
+
"n": 41
|
| 77 |
+
},
|
| 78 |
+
"Legal": {
|
| 79 |
+
"semantic": 43.90128144280967,
|
| 80 |
+
"anls": 39.4998961859427,
|
| 81 |
+
"n": 43
|
| 82 |
+
},
|
| 83 |
+
"Media/Publishing": {
|
| 84 |
+
"semantic": 36.734693877551,
|
| 85 |
+
"anls": 37.934534534534535,
|
| 86 |
+
"n": 25
|
| 87 |
+
},
|
| 88 |
+
"Misc": {
|
| 89 |
+
"semantic": 72.27891156462587,
|
| 90 |
+
"anls": 64.10710364514712,
|
| 91 |
+
"n": 24
|
| 92 |
+
},
|
| 93 |
+
"Other": {
|
| 94 |
+
"semantic": 0.0,
|
| 95 |
+
"anls": 0.0,
|
| 96 |
+
"n": 1
|
| 97 |
+
},
|
| 98 |
+
"Reference": {
|
| 99 |
+
"semantic": 41.208791208791204,
|
| 100 |
+
"anls": 36.730980853494025,
|
| 101 |
+
"n": 52
|
| 102 |
+
},
|
| 103 |
+
"Reports": {
|
| 104 |
+
"semantic": 47.619047619047635,
|
| 105 |
+
"anls": 37.56027917001248,
|
| 106 |
+
"n": 75
|
| 107 |
+
},
|
| 108 |
+
"Technical": {
|
| 109 |
+
"semantic": 64.33007985803019,
|
| 110 |
+
"anls": 49.94662638112699,
|
| 111 |
+
"n": 23
|
| 112 |
+
}
|
| 113 |
+
},
|
| 114 |
+
"n_evaluated": 500,
|
| 115 |
+
"n_unmatched": 0
|
| 116 |
+
},
|
| 117 |
+
"reevaluated_date": "2026-01-22T22:14:40.534356+00:00",
|
| 118 |
+
"source_predictions_file": "OpenAI/GPT-5_Mini_(2025-08-07)_with_File_Search_predictions_20260104_122026.jsonl",
|
| 119 |
+
"result_file_path": "OpenAI/GPT-5_Mini_(2025-08-07)_with_File_Search_results_20260104_122026.json"
|
| 120 |
+
}
|
eval/reevaluated_results/OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152828.json
CHANGED
|
@@ -15,28 +15,28 @@
|
|
| 15 |
"submission_date": "2026-01-09T15:28:28.366309+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic":
|
| 19 |
"semantic_ci": [
|
| 20 |
-
|
| 21 |
-
|
| 22 |
],
|
| 23 |
-
"anls": 52.
|
| 24 |
"page_f1": 60.877142857142864,
|
| 25 |
"doc_f1": 82.2030303030303,
|
| 26 |
-
"kuiper":
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic":
|
| 30 |
-
"anls":
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
-
"semantic":
|
| 35 |
-
"anls": 34.
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
-
"semantic":
|
| 40 |
"anls": 30.798739429426515,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
@@ -52,13 +52,13 @@
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
-
"semantic":
|
| 56 |
"anls": 51.78930433365917,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
-
"semantic":
|
| 61 |
-
"anls":
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
@@ -67,18 +67,18 @@
|
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
-
"semantic":
|
| 71 |
-
"anls":
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
-
"semantic":
|
| 76 |
-
"anls":
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
-
"semantic":
|
| 81 |
-
"anls":
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
|
@@ -87,22 +87,22 @@
|
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
| 90 |
-
"semantic":
|
| 91 |
"anls": 74.53137140637142,
|
| 92 |
"n": 24
|
| 93 |
},
|
| 94 |
"Other": {
|
| 95 |
-
"semantic":
|
| 96 |
"anls": 0.0,
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
-
"semantic":
|
| 101 |
-
"anls":
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
-
"semantic":
|
| 106 |
"anls": 48.18660787855504,
|
| 107 |
"n": 75
|
| 108 |
},
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
-
"reevaluated_date": "2026-01-
|
| 119 |
"source_predictions_file": "OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152828.jsonl",
|
| 120 |
"result_file_path": "OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152828.json"
|
| 121 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-09T15:28:28.366309+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 58.16326530612243,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
53.735254533391874,
|
| 21 |
+
62.591276078853
|
| 22 |
],
|
| 23 |
+
"anls": 52.7083705578831,
|
| 24 |
"page_f1": 60.877142857142864,
|
| 25 |
"doc_f1": 82.2030303030303,
|
| 26 |
+
"kuiper": 49.84000000000006
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 67.23511322337154,
|
| 30 |
+
"anls": 60.04044465907011,
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 32.79883381924198,
|
| 35 |
+
"anls": 34.15103889857387,
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
+
"semantic": 35.014005602240914,
|
| 40 |
"anls": 30.798739429426515,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
+
"semantic": 63.775510204081634,
|
| 56 |
"anls": 51.78930433365917,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
+
"semantic": 44.365572315882865,
|
| 61 |
+
"anls": 41.87226084914726,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
|
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
+
"semantic": 64.04689535388624,
|
| 71 |
+
"anls": 55.460220498205956,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
+
"semantic": 54.75360876057741,
|
| 76 |
+
"anls": 44.078093271929184,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
+
"semantic": 64.07214048410061,
|
| 81 |
+
"anls": 56.02310480217457,
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
|
|
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
| 90 |
+
"semantic": 74.4047619047619,
|
| 91 |
"anls": 74.53137140637142,
|
| 92 |
"n": 24
|
| 93 |
},
|
| 94 |
"Other": {
|
| 95 |
+
"semantic": 51.02040816326531,
|
| 96 |
"anls": 0.0,
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
+
"semantic": 62.79434850863422,
|
| 101 |
+
"anls": 62.608982452732455,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 55.78231292517006,
|
| 106 |
"anls": 48.18660787855504,
|
| 107 |
"n": 75
|
| 108 |
},
|
|
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T22:15:43.985703+00:00",
|
| 119 |
"source_predictions_file": "OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152828.jsonl",
|
| 120 |
"result_file_path": "OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152828.json"
|
| 121 |
}
|
eval/reevaluated_results/OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153516.json
CHANGED
|
@@ -15,23 +15,23 @@
|
|
| 15 |
"submission_date": "2026-01-09T15:35:16.458002+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic":
|
| 19 |
"semantic_ci": [
|
| 20 |
-
|
| 21 |
-
|
| 22 |
],
|
| 23 |
-
"anls": 46.
|
| 24 |
"page_f1": 59.905054945054935,
|
| 25 |
"doc_f1": 77.61731601731601,
|
| 26 |
-
"kuiper":
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic":
|
| 30 |
-
"anls":
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
-
"semantic": 29.
|
| 35 |
"anls": 26.237172573133016,
|
| 36 |
"n": 84
|
| 37 |
},
|
|
@@ -42,7 +42,7 @@
|
|
| 42 |
},
|
| 43 |
"by_domain": {
|
| 44 |
"Cases/Logs": {
|
| 45 |
-
"semantic":
|
| 46 |
"anls": 39.64209401709402,
|
| 47 |
"n": 15
|
| 48 |
},
|
|
@@ -52,13 +52,13 @@
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
-
"semantic":
|
| 56 |
"anls": 53.83018770627063,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
-
"semantic":
|
| 61 |
-
"anls":
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
@@ -67,23 +67,23 @@
|
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
-
"semantic":
|
| 71 |
-
"anls": 44.
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
-
"semantic":
|
| 76 |
-
"anls":
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
-
"semantic":
|
| 81 |
"anls": 43.64210613408689,
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
| 85 |
"semantic": 48.97959183673469,
|
| 86 |
-
"anls":
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
|
@@ -98,16 +98,16 @@
|
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
"semantic": 66.71899529042385,
|
| 101 |
-
"anls":
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
-
"semantic":
|
| 106 |
"anls": 45.15164464860224,
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
-
"semantic":
|
| 111 |
"anls": 53.71736172158072,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
-
"reevaluated_date": "2026-01-
|
| 119 |
"source_predictions_file": "OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153516.jsonl",
|
| 120 |
"result_file_path": "OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153516.json"
|
| 121 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-09T15:35:16.458002+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 52.959183673469404,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
48.490359884292864,
|
| 21 |
+
57.42800746264594
|
| 22 |
],
|
| 23 |
+
"anls": 46.73693856273607,
|
| 24 |
"page_f1": 59.905054945054935,
|
| 25 |
"doc_f1": 77.61731601731601,
|
| 26 |
+
"kuiper": 40.7714285714291
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 63.181436958344975,
|
| 30 |
+
"anls": 55.35441707653439,
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 29.761904761904773,
|
| 35 |
"anls": 26.237172573133016,
|
| 36 |
"n": 84
|
| 37 |
},
|
|
|
|
| 42 |
},
|
| 43 |
"by_domain": {
|
| 44 |
"Cases/Logs": {
|
| 45 |
+
"semantic": 51.02040816326531,
|
| 46 |
"anls": 39.64209401709402,
|
| 47 |
"n": 15
|
| 48 |
},
|
|
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
+
"semantic": 59.52380952380955,
|
| 56 |
"anls": 53.83018770627063,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
+
"semantic": 37.710736468500436,
|
| 61 |
+
"anls": 36.592053196991955,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
|
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
+
"semantic": 46.678245766391655,
|
| 71 |
+
"anls": 44.3103860759977,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
+
"semantic": 57.24240915878547,
|
| 76 |
+
"anls": 47.47703950245181,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
+
"semantic": 55.766492643569066,
|
| 81 |
"anls": 43.64210613408689,
|
| 82 |
"n": 43
|
| 83 |
},
|
| 84 |
"Media/Publishing": {
|
| 85 |
"semantic": 48.97959183673469,
|
| 86 |
+
"anls": 47.21106819031614,
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
|
|
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
"semantic": 66.71899529042385,
|
| 101 |
+
"anls": 63.387310128155136,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 54.42176870748299,
|
| 106 |
"anls": 45.15164464860224,
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
+
"semantic": 68.76663708961844,
|
| 111 |
"anls": 53.71736172158072,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T22:17:05.838647+00:00",
|
| 119 |
"source_predictions_file": "OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153516.jsonl",
|
| 120 |
"result_file_path": "OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153516.json"
|
| 121 |
}
|
eval/reevaluated_results/OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_results_20260109_185347.json
CHANGED
|
@@ -15,35 +15,35 @@
|
|
| 15 |
"submission_date": "2026-01-09T18:53:47.189606+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic":
|
| 19 |
"semantic_ci": [
|
| 20 |
-
|
| 21 |
-
|
| 22 |
],
|
| 23 |
-
"anls":
|
| 24 |
"page_f1": 78.4607309857811,
|
| 25 |
"doc_f1": 90.20248288785363,
|
| 26 |
-
"kuiper":
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic":
|
| 30 |
-
"anls":
|
| 31 |
"n": 364
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
-
"semantic":
|
| 35 |
-
"anls":
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
-
"semantic":
|
| 40 |
"anls": 69.64985240877743,
|
| 41 |
"n": 51
|
| 42 |
},
|
| 43 |
"by_domain": {
|
| 44 |
"Cases/Logs": {
|
| 45 |
-
"semantic":
|
| 46 |
-
"anls":
|
| 47 |
"n": 15
|
| 48 |
},
|
| 49 |
"Education": {
|
|
@@ -52,13 +52,13 @@
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
-
"semantic":
|
| 56 |
"anls": 79.84423442344234,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
-
"semantic": 68.
|
| 61 |
-
"anls":
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
@@ -67,17 +67,17 @@
|
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
-
"semantic":
|
| 71 |
-
"anls":
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
"semantic": 89.59681433549028,
|
| 76 |
-
"anls":
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
-
"semantic":
|
| 81 |
"anls": 68.10496996543507,
|
| 82 |
"n": 43
|
| 83 |
},
|
|
@@ -92,22 +92,22 @@
|
|
| 92 |
"n": 24
|
| 93 |
},
|
| 94 |
"Other": {
|
| 95 |
-
"semantic":
|
| 96 |
-
"anls":
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
-
"semantic":
|
| 101 |
-
"anls":
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
-
"semantic":
|
| 106 |
-
"anls": 71.
|
| 107 |
"n": 74
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
-
"semantic":
|
| 111 |
"anls": 55.56822369489126,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
"n_evaluated": 499,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
-
"reevaluated_date": "2026-01-
|
| 119 |
"source_predictions_file": "OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_predictions_20260109_185347.jsonl",
|
| 120 |
"result_file_path": "OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_results_20260109_185347.json"
|
| 121 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-09T18:53:47.189606+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 82.20522678009081,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
78.66223544202387,
|
| 21 |
+
85.74821811815777
|
| 22 |
],
|
| 23 |
+
"anls": 74.76545756458952,
|
| 24 |
"page_f1": 78.4607309857811,
|
| 25 |
"doc_f1": 90.20248288785363,
|
| 26 |
+
"kuiper": 25.80160320641279
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 87.04305898183449,
|
| 30 |
+
"anls": 77.74037073431458,
|
| 31 |
"n": 364
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 66.81243926141885,
|
| 35 |
+
"anls": 64.98007029276216,
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
+
"semantic": 73.02921168467388,
|
| 40 |
"anls": 69.64985240877743,
|
| 41 |
"n": 51
|
| 42 |
},
|
| 43 |
"by_domain": {
|
| 44 |
"Cases/Logs": {
|
| 45 |
+
"semantic": 95.23809523809524,
|
| 46 |
+
"anls": 91.7948717948718,
|
| 47 |
"n": 15
|
| 48 |
},
|
| 49 |
"Education": {
|
|
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
+
"semantic": 87.15986394557822,
|
| 56 |
"anls": 79.84423442344234,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
+
"semantic": 68.76663708961844,
|
| 61 |
+
"anls": 65.75226955943711,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
|
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
+
"semantic": 90.09986973512808,
|
| 71 |
+
"anls": 80.50699949010338,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
"semantic": 89.59681433549028,
|
| 76 |
+
"anls": 78.32170804529109,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
+
"semantic": 85.4295206454675,
|
| 81 |
"anls": 68.10496996543507,
|
| 82 |
"n": 43
|
| 83 |
},
|
|
|
|
| 92 |
"n": 24
|
| 93 |
},
|
| 94 |
"Other": {
|
| 95 |
+
"semantic": 100.0,
|
| 96 |
+
"anls": 100.0,
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
+
"semantic": 88.30455259026688,
|
| 101 |
+
"anls": 84.35364711656122,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 78.59900717043573,
|
| 106 |
+
"anls": 71.94863871640305,
|
| 107 |
"n": 74
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
+
"semantic": 75.42147293700089,
|
| 111 |
"anls": 55.56822369489126,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
|
|
| 115 |
"n_evaluated": 499,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T22:17:55.213452+00:00",
|
| 119 |
"source_predictions_file": "OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_predictions_20260109_185347.jsonl",
|
| 120 |
"result_file_path": "OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_results_20260109_185347.json"
|
| 121 |
}
|
eval/reevaluated_results/OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_154427.json
CHANGED
|
@@ -15,35 +15,35 @@
|
|
| 15 |
"submission_date": "2026-01-09T15:44:27.735534+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic":
|
| 19 |
"semantic_ci": [
|
| 20 |
-
|
| 21 |
-
|
| 22 |
],
|
| 23 |
-
"anls":
|
| 24 |
"page_f1": 43.169719169719166,
|
| 25 |
"doc_f1": 59.24761904761905,
|
| 26 |
"kuiper": null
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic":
|
| 30 |
-
"anls":
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
-
"semantic": 32.
|
| 35 |
-
"anls": 35.
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
-
"semantic":
|
| 40 |
"anls": 38.05173584585349,
|
| 41 |
"n": 51
|
| 42 |
},
|
| 43 |
"by_domain": {
|
| 44 |
"Cases/Logs": {
|
| 45 |
"semantic": 54.42176870748299,
|
| 46 |
-
"anls":
|
| 47 |
"n": 15
|
| 48 |
},
|
| 49 |
"Education": {
|
|
@@ -52,32 +52,32 @@
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
-
"semantic":
|
| 56 |
-
"anls":
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
-
"semantic":
|
| 61 |
-
"anls":
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
| 65 |
-
"semantic":
|
| 66 |
"anls": 44.99022482893451,
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
"semantic": 54.277029960920544,
|
| 71 |
-
"anls":
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
-
"semantic":
|
| 76 |
"anls": 32.93040293040293,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
-
"semantic":
|
| 81 |
"anls": 35.73555320648344,
|
| 82 |
"n": 43
|
| 83 |
},
|
|
@@ -97,17 +97,17 @@
|
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
-
"semantic":
|
| 101 |
-
"anls":
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
-
"semantic":
|
| 106 |
-
"anls": 50.
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
-
"semantic":
|
| 111 |
"anls": 46.20014437749956,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
-
"reevaluated_date": "2026-01-
|
| 119 |
"source_predictions_file": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_predictions_20260109_154427.jsonl",
|
| 120 |
"result_file_path": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_154427.json"
|
| 121 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-09T15:44:27.735534+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 48.57142857142857,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
44.10452956979305,
|
| 21 |
+
53.038327573064095
|
| 22 |
],
|
| 23 |
+
"anls": 46.86058848865213,
|
| 24 |
"page_f1": 43.169719169719166,
|
| 25 |
"doc_f1": 59.24761904761905,
|
| 26 |
"kuiper": null
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 53.955828906905246,
|
| 30 |
+
"anls": 50.80230983229035,
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 32.19144800777454,
|
| 35 |
+
"anls": 35.08110270716138,
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
+
"semantic": 37.01480592236896,
|
| 40 |
"anls": 38.05173584585349,
|
| 41 |
"n": 51
|
| 42 |
},
|
| 43 |
"by_domain": {
|
| 44 |
"Cases/Logs": {
|
| 45 |
"semantic": 54.42176870748299,
|
| 46 |
+
"anls": 52.09116809116809,
|
| 47 |
"n": 15
|
| 48 |
},
|
| 49 |
"Education": {
|
|
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
+
"semantic": 61.64965986394556,
|
| 56 |
+
"anls": 57.140088120691566,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
+
"semantic": 51.02040816326531,
|
| 61 |
+
"anls": 53.968561691086,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
| 65 |
+
"semantic": 44.642857142857146,
|
| 66 |
"anls": 44.99022482893451,
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
"semantic": 54.277029960920544,
|
| 71 |
+
"anls": 46.13981762917933,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
+
"semantic": 42.309606769537076,
|
| 76 |
"anls": 32.93040293040293,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
+
"semantic": 40.34171808258187,
|
| 81 |
"anls": 35.73555320648344,
|
| 82 |
"n": 43
|
| 83 |
},
|
|
|
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
+
"semantic": 47.09576138147568,
|
| 101 |
+
"anls": 49.356185378099994,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 54.42176870748299,
|
| 106 |
+
"anls": 50.66183382822209,
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
+
"semantic": 53.238686779059464,
|
| 111 |
"anls": 46.20014437749956,
|
| 112 |
"n": 23
|
| 113 |
}
|
|
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T22:18:55.670607+00:00",
|
| 119 |
"source_predictions_file": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_predictions_20260109_154427.jsonl",
|
| 120 |
"result_file_path": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_154427.json"
|
| 121 |
}
|
eval/reevaluated_results/OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_175639.json
CHANGED
|
@@ -15,34 +15,34 @@
|
|
| 15 |
"submission_date": "2026-01-09T17:56:39.771528+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic":
|
| 19 |
"semantic_ci": [
|
| 20 |
-
|
| 21 |
-
|
| 22 |
],
|
| 23 |
-
"anls":
|
| 24 |
"page_f1": 48.43228327228327,
|
| 25 |
"doc_f1": 62.30761904761904,
|
| 26 |
"kuiper": null
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic":
|
| 30 |
-
"anls":
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
-
"semantic":
|
| 35 |
-
"anls":
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
-
"semantic":
|
| 40 |
"anls": 48.31663542207227,
|
| 41 |
"n": 51
|
| 42 |
},
|
| 43 |
"by_domain": {
|
| 44 |
"Cases/Logs": {
|
| 45 |
-
"semantic":
|
| 46 |
"anls": 43.64672364672364,
|
| 47 |
"n": 15
|
| 48 |
},
|
|
@@ -52,13 +52,13 @@
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
-
"semantic":
|
| 56 |
"anls": 46.90982404692082,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
-
"semantic":
|
| 61 |
-
"anls":
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
@@ -67,17 +67,17 @@
|
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
-
"semantic":
|
| 71 |
-
"anls":
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
-
"semantic":
|
| 76 |
"anls": 34.149915125524885,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
-
"semantic":
|
| 81 |
"anls": 46.299372462163156,
|
| 82 |
"n": 43
|
| 83 |
},
|
|
@@ -97,12 +97,12 @@
|
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
-
"semantic":
|
| 101 |
-
"anls": 58.
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
-
"semantic":
|
| 106 |
"anls": 52.18098320525303,
|
| 107 |
"n": 75
|
| 108 |
},
|
|
@@ -115,7 +115,7 @@
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
-
"reevaluated_date": "2026-01-
|
| 119 |
"source_predictions_file": "OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_predictions_20260109_175639.jsonl",
|
| 120 |
"result_file_path": "OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_175639.json"
|
| 121 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-09T17:56:39.771528+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 52.85714285714286,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
48.3879879090649,
|
| 21 |
+
57.326297805220825
|
| 22 |
],
|
| 23 |
+
"anls": 48.84111465957483,
|
| 24 |
"page_f1": 48.43228327228327,
|
| 25 |
"doc_f1": 62.30761904761904,
|
| 26 |
"kuiper": null
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 56.05255800950517,
|
| 30 |
+
"anls": 50.78441123319547,
|
| 31 |
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 38.87269193391642,
|
| 35 |
+
"anls": 40.715462180302126,
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
+
"semantic": 53.02120848339336,
|
| 40 |
"anls": 48.31663542207227,
|
| 41 |
"n": 51
|
| 42 |
},
|
| 43 |
"by_domain": {
|
| 44 |
"Cases/Logs": {
|
| 45 |
+
"semantic": 61.22448979591838,
|
| 46 |
"anls": 43.64672364672364,
|
| 47 |
"n": 15
|
| 48 |
},
|
|
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
+
"semantic": 53.14625850340138,
|
| 56 |
"anls": 46.90982404692082,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
+
"semantic": 56.011535048802116,
|
| 61 |
+
"anls": 54.83808397045483,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
|
|
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
+
"semantic": 66.21797655232307,
|
| 71 |
+
"anls": 51.225941217542555,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
+
"semantic": 39.82080637132901,
|
| 76 |
"anls": 34.149915125524885,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
+
"semantic": 47.46084480303749,
|
| 81 |
"anls": 46.299372462163156,
|
| 82 |
"n": 43
|
| 83 |
},
|
|
|
|
| 97 |
"n": 1
|
| 98 |
},
|
| 99 |
"Reference": {
|
| 100 |
+
"semantic": 56.907378335949765,
|
| 101 |
+
"anls": 58.9505008296934,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 56.4625850340136,
|
| 106 |
"anls": 52.18098320525303,
|
| 107 |
"n": 75
|
| 108 |
},
|
|
|
|
| 115 |
"n_evaluated": 500,
|
| 116 |
"n_unmatched": 0
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T22:19:59.654603+00:00",
|
| 119 |
"source_predictions_file": "OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_predictions_20260109_175639.jsonl",
|
| 120 |
"result_file_path": "OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_175639.json"
|
| 121 |
}
|
eval/reevaluated_results/Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_results_20260110_132227.json
CHANGED
|
@@ -15,28 +15,28 @@
|
|
| 15 |
"submission_date": "2026-01-10T13:22:27.811792+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic":
|
| 19 |
"semantic_ci": [
|
| 20 |
-
|
| 21 |
-
|
| 22 |
],
|
| 23 |
-
"anls": 30.
|
| 24 |
-
"page_f1": 28.
|
| 25 |
-
"doc_f1": 51.
|
| 26 |
-
"kuiper":
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic":
|
| 30 |
-
"anls":
|
| 31 |
-
"n":
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
-
"semantic":
|
| 35 |
-
"anls":
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
-
"semantic":
|
| 40 |
"anls": 18.02832244008715,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
@@ -47,32 +47,32 @@
|
|
| 47 |
"n": 15
|
| 48 |
},
|
| 49 |
"Education": {
|
| 50 |
-
"semantic":
|
| 51 |
"anls": 34.34782608695652,
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
-
"semantic":
|
| 56 |
-
"anls":
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
-
"semantic":
|
| 61 |
-
"anls":
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
| 65 |
-
"semantic":
|
| 66 |
"anls": 21.39516129032258,
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
-
"semantic":
|
| 71 |
-
"anls": 29.
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
-
"semantic":
|
| 76 |
"anls": 37.17815890071988,
|
| 77 |
"n": 41
|
| 78 |
},
|
|
@@ -87,17 +87,22 @@
|
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
| 90 |
-
"semantic":
|
| 91 |
"anls": 48.707026404394824,
|
| 92 |
"n": 24
|
| 93 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
"Reference": {
|
| 95 |
-
"semantic":
|
| 96 |
-
"anls": 23.
|
| 97 |
"n": 52
|
| 98 |
},
|
| 99 |
"Reports": {
|
| 100 |
-
"semantic":
|
| 101 |
"anls": 25.79399206429042,
|
| 102 |
"n": 75
|
| 103 |
},
|
|
@@ -107,10 +112,10 @@
|
|
| 107 |
"n": 23
|
| 108 |
}
|
| 109 |
},
|
| 110 |
-
"n_evaluated":
|
| 111 |
-
"n_unmatched":
|
| 112 |
},
|
| 113 |
-
"reevaluated_date": "2026-01-
|
| 114 |
"source_predictions_file": "Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_predictions_20260110_132227.jsonl",
|
| 115 |
"result_file_path": "Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_results_20260110_132227.json"
|
| 116 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-10T13:22:27.811792+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 46.020408163265316,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
41.56988099714986,
|
| 21 |
+
50.470935329380765
|
| 22 |
],
|
| 23 |
+
"anls": 30.43927742127836,
|
| 24 |
+
"page_f1": 28.93380952380953,
|
| 25 |
+
"doc_f1": 51.483333333333334,
|
| 26 |
+
"kuiper": 27.468937875751397
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 52.278445624825274,
|
| 30 |
+
"anls": 36.186768374440895,
|
| 31 |
+
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
+
"semantic": 18.221574344023328,
|
| 35 |
+
"anls": 13.000283446712018,
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
+
"semantic": 47.0188075230092,
|
| 40 |
"anls": 18.02832244008715,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
|
|
| 47 |
"n": 15
|
| 48 |
},
|
| 49 |
"Education": {
|
| 50 |
+
"semantic": 60.29684601113172,
|
| 51 |
"anls": 34.34782608695652,
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
+
"semantic": 74.4047619047619,
|
| 56 |
+
"anls": 55.012560563191016,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
+
"semantic": 36.601597160603376,
|
| 61 |
+
"anls": 24.625778579359373,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
| 65 |
+
"semantic": 22.321428571428573,
|
| 66 |
"anls": 21.39516129032258,
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
+
"semantic": 40.16500217108119,
|
| 71 |
+
"anls": 29.574468085106382,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
+
"semantic": 49.77600796416127,
|
| 76 |
"anls": 37.17815890071988,
|
| 77 |
"n": 41
|
| 78 |
},
|
|
|
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
| 90 |
+
"semantic": 70.1530612244898,
|
| 91 |
"anls": 48.707026404394824,
|
| 92 |
"n": 24
|
| 93 |
},
|
| 94 |
+
"Other": {
|
| 95 |
+
"semantic": 100.0,
|
| 96 |
+
"anls": 0.0,
|
| 97 |
+
"n": 1
|
| 98 |
+
},
|
| 99 |
"Reference": {
|
| 100 |
+
"semantic": 43.171114599686035,
|
| 101 |
+
"anls": 23.509615384615383,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 44.897959183673464,
|
| 106 |
"anls": 25.79399206429042,
|
| 107 |
"n": 75
|
| 108 |
},
|
|
|
|
| 112 |
"n": 23
|
| 113 |
}
|
| 114 |
},
|
| 115 |
+
"n_evaluated": 500,
|
| 116 |
+
"n_unmatched": 1766
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T22:21:10.223032+00:00",
|
| 119 |
"source_predictions_file": "Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_predictions_20260110_132227.jsonl",
|
| 120 |
"result_file_path": "Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_results_20260110_132227.json"
|
| 121 |
}
|
eval/reevaluated_results/Z.AI/GLM-4.6V_with_BM25_Search_Tool_results_20260110_131826.json
CHANGED
|
@@ -15,28 +15,28 @@
|
|
| 15 |
"submission_date": "2026-01-10T13:18:26.686587+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
-
"semantic":
|
| 19 |
"semantic_ci": [
|
| 20 |
-
61.
|
| 21 |
-
|
| 22 |
],
|
| 23 |
-
"anls":
|
| 24 |
-
"page_f1":
|
| 25 |
-
"doc_f1": 86.
|
| 26 |
-
"kuiper":
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
-
"semantic":
|
| 30 |
-
"anls": 64.
|
| 31 |
-
"n":
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
"semantic": 37.0505344995141,
|
| 35 |
-
"anls": 36.
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
-
"semantic":
|
| 40 |
"anls": 66.49141604533267,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
@@ -52,32 +52,32 @@
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
-
"semantic":
|
| 56 |
-
"anls":
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
"semantic": 53.79325643300798,
|
| 61 |
-
"anls":
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
| 65 |
-
"semantic":
|
| 66 |
"anls": 62.5648667601683,
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
-
"semantic":
|
| 71 |
-
"anls":
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
-
"semantic":
|
| 76 |
-
"anls":
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
-
"semantic":
|
| 81 |
"anls": 55.536175710594314,
|
| 82 |
"n": 43
|
| 83 |
},
|
|
@@ -87,30 +87,35 @@
|
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
| 90 |
-
"semantic":
|
| 91 |
"anls": 75.10160446706249,
|
| 92 |
"n": 24
|
| 93 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
"Reference": {
|
| 95 |
-
"semantic":
|
| 96 |
-
"anls": 60.
|
| 97 |
"n": 52
|
| 98 |
},
|
| 99 |
"Reports": {
|
| 100 |
-
"semantic":
|
| 101 |
"anls": 56.89167319856098,
|
| 102 |
"n": 75
|
| 103 |
},
|
| 104 |
"Technical": {
|
| 105 |
-
"semantic":
|
| 106 |
"anls": 51.450020851943364,
|
| 107 |
"n": 23
|
| 108 |
}
|
| 109 |
},
|
| 110 |
-
"n_evaluated":
|
| 111 |
-
"n_unmatched":
|
| 112 |
},
|
| 113 |
-
"reevaluated_date": "2026-01-
|
| 114 |
"source_predictions_file": "Z.AI/GLM-4.6V_with_BM25_Search_Tool_predictions_20260110_131826.jsonl",
|
| 115 |
"result_file_path": "Z.AI/GLM-4.6V_with_BM25_Search_Tool_results_20260110_131826.json"
|
| 116 |
}
|
|
|
|
| 15 |
"submission_date": "2026-01-10T13:18:26.686587+00:00",
|
| 16 |
"results": {
|
| 17 |
"overall": {
|
| 18 |
+
"semantic": 66.12244897959185,
|
| 19 |
"semantic_ci": [
|
| 20 |
+
61.850797145071965,
|
| 21 |
+
70.39410081411174
|
| 22 |
],
|
| 23 |
+
"anls": 60.25345765733381,
|
| 24 |
+
"page_f1": 65.89142857142856,
|
| 25 |
+
"doc_f1": 86.61731601731603,
|
| 26 |
+
"kuiper": 51.43661971830999
|
| 27 |
},
|
| 28 |
"single_evidence": {
|
| 29 |
+
"semantic": 72.2672630696114,
|
| 30 |
+
"anls": 64.8225984463954,
|
| 31 |
+
"n": 365
|
| 32 |
},
|
| 33 |
"multi_evidence_same_doc": {
|
| 34 |
"semantic": 37.0505344995141,
|
| 35 |
+
"anls": 36.61212115976921,
|
| 36 |
"n": 84
|
| 37 |
},
|
| 38 |
"multi_evidence_multi_doc": {
|
| 39 |
+
"semantic": 70.0280112044818,
|
| 40 |
"anls": 66.49141604533267,
|
| 41 |
"n": 51
|
| 42 |
},
|
|
|
|
| 52 |
"n": 22
|
| 53 |
},
|
| 54 |
"Events": {
|
| 55 |
+
"semantic": 80.78231292517005,
|
| 56 |
+
"anls": 69.9562373074191,
|
| 57 |
"n": 24
|
| 58 |
},
|
| 59 |
"Financial": {
|
| 60 |
"semantic": 53.79325643300798,
|
| 61 |
+
"anls": 52.97859596493658,
|
| 62 |
"n": 92
|
| 63 |
},
|
| 64 |
"Financial/Tax": {
|
| 65 |
+
"semantic": 70.1530612244898,
|
| 66 |
"anls": 62.5648667601683,
|
| 67 |
"n": 16
|
| 68 |
},
|
| 69 |
"Government/Regulatory": {
|
| 70 |
+
"semantic": 77.07338254450717,
|
| 71 |
+
"anls": 71.3587742867764,
|
| 72 |
"n": 47
|
| 73 |
},
|
| 74 |
"HR/Employment": {
|
| 75 |
+
"semantic": 64.70881035340967,
|
| 76 |
+
"anls": 60.27789718135828,
|
| 77 |
"n": 41
|
| 78 |
},
|
| 79 |
"Legal": {
|
| 80 |
+
"semantic": 67.63170384432841,
|
| 81 |
"anls": 55.536175710594314,
|
| 82 |
"n": 43
|
| 83 |
},
|
|
|
|
| 87 |
"n": 25
|
| 88 |
},
|
| 89 |
"Misc": {
|
| 90 |
+
"semantic": 72.27891156462587,
|
| 91 |
"anls": 75.10160446706249,
|
| 92 |
"n": 24
|
| 93 |
},
|
| 94 |
+
"Other": {
|
| 95 |
+
"semantic": 100.0,
|
| 96 |
+
"anls": 86.66666666666667,
|
| 97 |
+
"n": 1
|
| 98 |
+
},
|
| 99 |
"Reference": {
|
| 100 |
+
"semantic": 63.775510204081634,
|
| 101 |
+
"anls": 60.88296026156921,
|
| 102 |
"n": 52
|
| 103 |
},
|
| 104 |
"Reports": {
|
| 105 |
+
"semantic": 65.30612244897961,
|
| 106 |
"anls": 56.89167319856098,
|
| 107 |
"n": 75
|
| 108 |
},
|
| 109 |
"Technical": {
|
| 110 |
+
"semantic": 66.54835847382431,
|
| 111 |
"anls": 51.450020851943364,
|
| 112 |
"n": 23
|
| 113 |
}
|
| 114 |
},
|
| 115 |
+
"n_evaluated": 500,
|
| 116 |
+
"n_unmatched": 1766
|
| 117 |
},
|
| 118 |
+
"reevaluated_date": "2026-01-22T22:22:05.057646+00:00",
|
| 119 |
"source_predictions_file": "Z.AI/GLM-4.6V_with_BM25_Search_Tool_predictions_20260110_131826.jsonl",
|
| 120 |
"result_file_path": "Z.AI/GLM-4.6V_with_BM25_Search_Tool_results_20260110_131826.json"
|
| 121 |
}
|