Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- app.py +273 -83
- eval/batch_reevaluate.py +434 -0
- eval/evaluate.py +93 -27
- eval/metrics.py +500 -1
- eval/reevaluate_submissions.py +254 -0
- eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_results_20260110_131629.json +112 -0
- eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_results_20260110_132054.json +112 -0
- eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_results_20260110_132358.json +112 -0
- eval/reevaluated_results/Anthropic/Claude_Haiku_4.5_(2025-10-01)_results_20260109_130319.json +117 -0
- eval/reevaluated_results/Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_002125.json +115 -0
- eval/reevaluated_results/Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_125816.json +117 -0
- eval/reevaluated_results/Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_003320.json +115 -0
- eval/reevaluated_results/Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_182559.json +117 -0
- eval/reevaluated_results/Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_005202.json +115 -0
- eval/reevaluated_results/Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_183030.json +117 -0
- eval/reevaluated_results/Google/Gemini_3_Pro_(Preview)_with_BM25_Search_Tool_results_20260109_002711.json +110 -0
- eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_234108.json +115 -0
- eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_235325.json +115 -0
- eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_235724.json +117 -0
- eval/reevaluated_results/OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153221.json +117 -0
- eval/reevaluated_results/OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153812.json +117 -0
- eval/reevaluated_results/OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_results_20260109_151912.json +117 -0
- eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152104.json +117 -0
- eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_File_Search_results_20260104_140537.json +116 -0
- eval/reevaluated_results/OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152650.json +117 -0
- eval/reevaluated_results/OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152828.json +117 -0
- eval/reevaluated_results/OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153516.json +117 -0
- eval/reevaluated_results/OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_results_20260109_185347.json +117 -0
- eval/reevaluated_results/OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260107_113714.json +115 -0
- eval/reevaluated_results/OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_154427.json +117 -0
- eval/reevaluated_results/OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_175639.json +117 -0
- eval/reevaluated_results/OpenAI_-_KAIST/GPT-5.2_(2025-12-11)_with_HEAVEN_Retrieval_results_20260107_153009.json +115 -0
- eval/reevaluated_results/Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_results_20260110_132227.json +112 -0
- eval/reevaluated_results/Z.AI/GLM-4.6V_with_BM25_Search_Tool_results_20260110_131826.json +112 -0
app.py
CHANGED
|
@@ -22,10 +22,15 @@ import os
|
|
| 22 |
import secrets
|
| 23 |
import shutil
|
| 24 |
import sys
|
|
|
|
|
|
|
| 25 |
from datetime import datetime, timezone
|
| 26 |
from pathlib import Path
|
| 27 |
from urllib.parse import urlencode, quote, unquote
|
| 28 |
|
|
|
|
|
|
|
|
|
|
| 29 |
import pandas as pd
|
| 30 |
import plotly.graph_objects as go
|
| 31 |
import requests
|
|
@@ -35,7 +40,15 @@ from huggingface_hub import snapshot_download, HfApi, hf_hub_download
|
|
| 35 |
# Add eval module to path
|
| 36 |
sys.path.insert(0, str(Path(__file__).parent / "eval"))
|
| 37 |
try:
|
| 38 |
-
from metrics import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
from datasets import load_dataset
|
| 40 |
EVAL_AVAILABLE = True
|
| 41 |
except ImportError:
|
|
@@ -916,10 +929,17 @@ def get_model_type_html(model_type: str) -> str:
|
|
| 916 |
return f'<span style="color: {color}; font-weight: 500;">{fallback_emoji} {model_type}</span>'
|
| 917 |
|
| 918 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 919 |
@st.cache_data(ttl=300) # Cache for 5 minutes
|
| 920 |
def load_eval_results() -> pd.DataFrame:
|
| 921 |
-
"""Load evaluation results from JSON files."""
|
| 922 |
-
|
| 923 |
|
| 924 |
results_path = Path(EVAL_RESULTS_PATH)
|
| 925 |
if not results_path.exists():
|
|
@@ -945,36 +965,76 @@ def load_eval_results() -> pd.DataFrame:
|
|
| 945 |
# Get per-domain scores if available
|
| 946 |
by_domain = result_scores.get("by_domain", {})
|
| 947 |
|
| 948 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 949 |
"Model": model_name,
|
| 950 |
"Organization": data.get("organization", data.get("submitted_by", org_dir.name)),
|
| 951 |
"Model Type": metadata.get("model_type", "unknown"),
|
| 952 |
"Tags": tags, # Store as list
|
| 953 |
-
#
|
| 954 |
-
"Accuracy (
|
| 955 |
-
"
|
| 956 |
-
"
|
| 957 |
-
"Acc.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 958 |
# Attribution metrics
|
| 959 |
-
"Attribution (Page F1)":
|
| 960 |
-
"Attribution (Doc F1)":
|
| 961 |
# Calibration metric
|
| 962 |
-
"Effort (Kuiper)":
|
| 963 |
"Submission Date": data.get("submission_date", ""),
|
| 964 |
"Link": data.get("link", ""),
|
| 965 |
"Description": data.get("description", metadata.get("description", "")) or
|
| 966 |
generate_placeholder_description(model_name, tags, metadata.get("model_type", "")),
|
| 967 |
# Per-domain scores (stored as JSON string for DataFrame compatibility)
|
| 968 |
"_by_domain": json.dumps(by_domain) if by_domain else "{}",
|
| 969 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 970 |
except Exception as e:
|
| 971 |
st.warning(f"Error loading {result_file}: {e}")
|
| 972 |
|
| 973 |
-
if not
|
| 974 |
return pd.DataFrame()
|
| 975 |
|
|
|
|
|
|
|
|
|
|
| 976 |
df = pd.DataFrame(results)
|
| 977 |
-
df = df.sort_values("Accuracy (
|
| 978 |
return df
|
| 979 |
|
| 980 |
|
|
@@ -1045,7 +1105,8 @@ def format_model_type(model_type: str) -> str:
|
|
| 1045 |
|
| 1046 |
# Metric tooltips for table headers
|
| 1047 |
METRIC_TOOLTIPS = {
|
| 1048 |
-
"Accuracy (
|
|
|
|
| 1049 |
"Acc. Single-Hop": "Accuracy on questions requiring evidence from a single page.",
|
| 1050 |
"Acc. Cross-Page": "Accuracy on multi-hop questions requiring evidence from multiple pages within the same document.",
|
| 1051 |
"Acc. Cross-Doc": "Accuracy on multi-hop questions requiring evidence from multiple documents.",
|
|
@@ -1130,12 +1191,26 @@ def render_leaderboard_table(df: pd.DataFrame, columns: list, show_analyze_colum
|
|
| 1130 |
# Render tags as badges
|
| 1131 |
cell_html = render_tags_html(value)
|
| 1132 |
cells.append(f'<td>{cell_html}</td>')
|
| 1133 |
-
elif col == "Accuracy (ANLS*)" or col.startswith("Acc."):
|
| 1134 |
-
# Format accuracy scores (
|
| 1135 |
try:
|
| 1136 |
-
|
| 1137 |
except (ValueError, TypeError):
|
| 1138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1139 |
cells.append(f'<td style="text-align: center;">{cell_html}</td>')
|
| 1140 |
elif col.startswith("Attribution"):
|
| 1141 |
# Format F1 scores (scale 0-100)
|
|
@@ -1274,7 +1349,7 @@ def create_accuracy_vs_attribution_plot(df: pd.DataFrame) -> go.Figure:
|
|
| 1274 |
df_type = df[df["Model Type"] == model_type]
|
| 1275 |
fig.add_trace(go.Scatter(
|
| 1276 |
x=df_type["Attribution (Page F1)"],
|
| 1277 |
-
y=df_type["Accuracy (
|
| 1278 |
mode="markers",
|
| 1279 |
name=model_type,
|
| 1280 |
text=df_type["Model"],
|
|
@@ -1289,7 +1364,7 @@ def create_accuracy_vs_attribution_plot(df: pd.DataFrame) -> go.Figure:
|
|
| 1289 |
fig.update_layout(
|
| 1290 |
title=dict(text="Accuracy vs Attribution", font=dict(color="white")),
|
| 1291 |
xaxis_title="Attribution (Page F1)",
|
| 1292 |
-
yaxis_title="Accuracy (
|
| 1293 |
hovermode="closest",
|
| 1294 |
template="plotly_dark",
|
| 1295 |
height=650,
|
|
@@ -1335,7 +1410,7 @@ def create_accuracy_vs_effort_plot(df: pd.DataFrame) -> go.Figure:
|
|
| 1335 |
df_type = df_filtered[df_filtered["Model Type"] == model_type]
|
| 1336 |
fig.add_trace(go.Scatter(
|
| 1337 |
x=df_type["Effort (Kuiper)"],
|
| 1338 |
-
y=df_type["Accuracy (
|
| 1339 |
mode="markers",
|
| 1340 |
name=model_type,
|
| 1341 |
text=df_type["Model"],
|
|
@@ -1350,7 +1425,7 @@ def create_accuracy_vs_effort_plot(df: pd.DataFrame) -> go.Figure:
|
|
| 1350 |
fig.update_layout(
|
| 1351 |
title=dict(text="Accuracy vs Effort", font=dict(color="white")),
|
| 1352 |
xaxis_title="Effort (Kuiper) — lower is better",
|
| 1353 |
-
yaxis_title="Accuracy (
|
| 1354 |
hovermode="closest",
|
| 1355 |
template="plotly_dark",
|
| 1356 |
height=650,
|
|
@@ -1460,7 +1535,7 @@ def show_model_details(model_name: str):
|
|
| 1460 |
# Display main metrics
|
| 1461 |
col1, col2, col3 = st.columns(3)
|
| 1462 |
with col1:
|
| 1463 |
-
st.metric("
|
| 1464 |
with col2:
|
| 1465 |
st.metric("Attribution (Page F1)", f"{model_data['Attribution (Page F1)']:.1f}%")
|
| 1466 |
with col3:
|
|
@@ -1495,7 +1570,7 @@ def show_model_details(model_name: str):
|
|
| 1495 |
|
| 1496 |
if by_domain:
|
| 1497 |
# Show per-domain chart (use overall accuracy as threshold for coloring)
|
| 1498 |
-
overall_accuracy = model_data.get('Accuracy (
|
| 1499 |
fig = create_domain_accuracy_chart(by_domain, model_name, overall_accuracy)
|
| 1500 |
st.plotly_chart(fig, width="stretch")
|
| 1501 |
else:
|
|
@@ -1620,12 +1695,73 @@ def load_gold_standard(dataset_name: str = "agentic-document-ai/dataset-PRIVATE"
|
|
| 1620 |
return {}, {}
|
| 1621 |
|
| 1622 |
|
| 1623 |
-
def
|
| 1624 |
-
"""Evaluate
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1625 |
if not EVAL_AVAILABLE:
|
| 1626 |
return {"error": "Evaluation module not available"}
|
| 1627 |
|
| 1628 |
-
|
|
|
|
| 1629 |
unmatched = []
|
| 1630 |
|
| 1631 |
for pred in predictions:
|
|
@@ -1633,45 +1769,54 @@ def evaluate_predictions(predictions: list, gold_by_text: dict, gold_by_id: dict
|
|
| 1633 |
qid = pred.get('id', '')
|
| 1634 |
|
| 1635 |
# Match to gold
|
|
|
|
| 1636 |
if question in gold_by_text:
|
| 1637 |
gold_data = gold_by_text[question]
|
| 1638 |
elif qid and qid in gold_by_id:
|
| 1639 |
gold_data = gold_by_id[qid]
|
|
|
|
|
|
|
|
|
|
| 1640 |
else:
|
| 1641 |
unmatched.append(question[:50] + "..." if len(question) > 50 else question)
|
| 1642 |
-
|
| 1643 |
-
|
| 1644 |
-
# Get prediction data
|
| 1645 |
-
answer = pred.get('answer', '')
|
| 1646 |
-
citations = pred.get('citations', [])
|
| 1647 |
-
search_history = pred.get('search_history', [])
|
| 1648 |
-
steps = len(search_history) if search_history else pred.get('iterations', 0)
|
| 1649 |
-
|
| 1650 |
-
# Calculate metrics
|
| 1651 |
-
anls = anls_star(answer, gold_data['answers'])
|
| 1652 |
-
correct = anls >= 0.5
|
| 1653 |
-
doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
|
| 1654 |
-
page_f1 = citation_f1(citations, gold_data['evidence'], level='page')
|
| 1655 |
-
|
| 1656 |
-
evals.append({
|
| 1657 |
-
'question': question,
|
| 1658 |
-
'anls': anls,
|
| 1659 |
-
'correct': correct,
|
| 1660 |
-
'doc_f1': doc_f1['f1'],
|
| 1661 |
-
'page_f1': page_f1['f1'],
|
| 1662 |
-
'steps': steps,
|
| 1663 |
-
'hop_type': gold_data.get('hop_type', 'single'),
|
| 1664 |
-
'category': gold_data['category'],
|
| 1665 |
-
'domain': gold_data['domain']
|
| 1666 |
-
})
|
| 1667 |
-
|
| 1668 |
-
if not evals:
|
| 1669 |
return {"error": "No predictions matched the gold standard"}
|
| 1670 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1671 |
# Aggregate overall metrics
|
| 1672 |
n = len(evals)
|
| 1673 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1674 |
mean_anls = sum(e['anls'] for e in evals) / n * 100
|
|
|
|
| 1675 |
mean_doc_f1 = sum(e['doc_f1'] for e in evals) / n * 100
|
| 1676 |
mean_page_f1 = sum(e['page_f1'] for e in evals) / n * 100
|
| 1677 |
|
|
@@ -1684,7 +1829,6 @@ def evaluate_predictions(predictions: list, gold_by_text: dict, gold_by_id: dict
|
|
| 1684 |
cross_doc = [e for e in evals if e['hop_type'] == 'cross_doc']
|
| 1685 |
|
| 1686 |
# By domain
|
| 1687 |
-
from collections import defaultdict
|
| 1688 |
by_domain = defaultdict(list)
|
| 1689 |
for e in evals:
|
| 1690 |
domain = e['domain'] or 'Other'
|
|
@@ -1693,6 +1837,7 @@ def evaluate_predictions(predictions: list, gold_by_text: dict, gold_by_id: dict
|
|
| 1693 |
domain_scores = {}
|
| 1694 |
for domain, domain_evals in sorted(by_domain.items()):
|
| 1695 |
domain_scores[domain] = {
|
|
|
|
| 1696 |
'anls': sum(e['anls'] for e in domain_evals) / len(domain_evals) * 100,
|
| 1697 |
'n': len(domain_evals)
|
| 1698 |
}
|
|
@@ -1700,27 +1845,33 @@ def evaluate_predictions(predictions: list, gold_by_text: dict, gold_by_id: dict
|
|
| 1700 |
results = {
|
| 1701 |
'n_evaluated': n,
|
| 1702 |
'n_unmatched': len(unmatched),
|
| 1703 |
-
'unmatched_samples': unmatched[:5],
|
| 1704 |
'overall': {
|
| 1705 |
-
'
|
|
|
|
|
|
|
| 1706 |
'accuracy': accuracy,
|
| 1707 |
'doc_f1': mean_doc_f1,
|
| 1708 |
'page_f1': mean_page_f1,
|
| 1709 |
'kuiper': kuiper['kuiper_stat'] if not kuiper.get('degenerate') else None,
|
| 1710 |
},
|
| 1711 |
'single_evidence': {
|
|
|
|
| 1712 |
'anls': sum(e['anls'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
|
| 1713 |
'n': len(single_hop)
|
| 1714 |
},
|
| 1715 |
'multi_evidence_same_doc': {
|
|
|
|
| 1716 |
'anls': sum(e['anls'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
|
| 1717 |
'n': len(cross_page)
|
| 1718 |
},
|
| 1719 |
'multi_evidence_multi_doc': {
|
|
|
|
| 1720 |
'anls': sum(e['anls'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
|
| 1721 |
'n': len(cross_doc)
|
| 1722 |
},
|
| 1723 |
-
'by_domain': domain_scores
|
|
|
|
| 1724 |
}
|
| 1725 |
|
| 1726 |
return results
|
|
@@ -1822,14 +1973,30 @@ def submit_results_fragment():
|
|
| 1822 |
|
| 1823 |
# Evaluate button
|
| 1824 |
if st.button("Run Evaluation", type="primary"):
|
| 1825 |
-
with st.spinner("Loading gold standard
|
| 1826 |
gold_by_text, gold_by_id = load_gold_standard()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1827 |
|
| 1828 |
-
|
| 1829 |
-
|
| 1830 |
-
|
| 1831 |
-
|
| 1832 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1833 |
|
| 1834 |
# Show evaluation results
|
| 1835 |
if st.session_state.eval_results:
|
|
@@ -1840,10 +2007,15 @@ def submit_results_fragment():
|
|
| 1840 |
else:
|
| 1841 |
st.markdown("#### Evaluation Results")
|
| 1842 |
|
| 1843 |
-
# Summary metrics
|
| 1844 |
col1, col2, col3, col4 = st.columns(4)
|
| 1845 |
with col1:
|
| 1846 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1847 |
with col2:
|
| 1848 |
st.metric("Attribution (Page F1)", f"{results['overall']['page_f1']:.1f}")
|
| 1849 |
with col3:
|
|
@@ -1854,16 +2026,32 @@ def submit_results_fragment():
|
|
| 1854 |
|
| 1855 |
# Detailed breakdown
|
| 1856 |
with st.expander("Detailed Breakdown"):
|
| 1857 |
-
|
| 1858 |
-
|
| 1859 |
-
|
| 1860 |
-
|
| 1861 |
-
|
| 1862 |
-
|
| 1863 |
-
|
| 1864 |
-
|
| 1865 |
-
|
| 1866 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1867 |
|
| 1868 |
if results['n_unmatched'] > 0:
|
| 1869 |
with st.expander(f"{results['n_unmatched']} unmatched questions"):
|
|
@@ -2333,10 +2521,11 @@ def main():
|
|
| 2333 |
# COLUMN SELECTOR - chips use SNOWFLAKE_BLUE (lighter, gradient end)
|
| 2334 |
# Mapping: short chip name -> full column name
|
| 2335 |
COLUMN_CHIP_NAMES = {
|
| 2336 |
-
"Accuracy": "Accuracy (
|
| 2337 |
"Acc. Single-Hop": "Acc. Single-Hop",
|
| 2338 |
"Acc. Cross-Page": "Acc. Cross-Page",
|
| 2339 |
"Acc. Cross-Doc": "Acc. Cross-Doc",
|
|
|
|
| 2340 |
"Attribution": "Attribution (Page F1)",
|
| 2341 |
"Attribution (Doc)": "Attribution (Doc F1)",
|
| 2342 |
"Effort": "Effort (Kuiper)",
|
|
@@ -2351,7 +2540,7 @@ def main():
|
|
| 2351 |
# Model and Organization are always visible (not in selector)
|
| 2352 |
always_visible = ["Model", "Organization"]
|
| 2353 |
# Hidden columns (used internally but not shown as separate columns)
|
| 2354 |
-
hidden_cols = ["Link", "Submission Date", "Description", "_by_domain"]
|
| 2355 |
# Full column names that are optional (Tags moved to end)
|
| 2356 |
optional_full_cols = [c for c in all_columns if c not in hidden_cols + always_visible and c != "Tags"]
|
| 2357 |
optional_full_cols.append("Tags") # Add Tags at the end
|
|
@@ -2524,8 +2713,9 @@ The task is characterized by five formal properties:
|
|
| 2524 |
st.markdown("""
|
| 2525 |
#### Metrics
|
| 2526 |
|
| 2527 |
-
##### Accuracy (
|
| 2528 |
-
- **Accuracy (
|
|
|
|
| 2529 |
- **Acc. Single-Hop**: Accuracy on questions requiring a single evidence page
|
| 2530 |
- **Acc. Cross-Page**: Accuracy on multi-hop questions within the same document
|
| 2531 |
- **Acc. Cross-Doc**: Accuracy on multi-hop questions spanning multiple documents
|
|
|
|
| 22 |
import secrets
|
| 23 |
import shutil
|
| 24 |
import sys
|
| 25 |
+
from collections import defaultdict
|
| 26 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 27 |
from datetime import datetime, timezone
|
| 28 |
from pathlib import Path
|
| 29 |
from urllib.parse import urlencode, quote, unquote
|
| 30 |
|
| 31 |
+
# Parallelization config for LLM evaluation
|
| 32 |
+
MAX_EVAL_WORKERS = 24
|
| 33 |
+
|
| 34 |
import pandas as pd
|
| 35 |
import plotly.graph_objects as go
|
| 36 |
import requests
|
|
|
|
| 40 |
# Add eval module to path
|
| 41 |
sys.path.insert(0, str(Path(__file__).parent / "eval"))
|
| 42 |
try:
|
| 43 |
+
from metrics import (
|
| 44 |
+
anls_star,
|
| 45 |
+
anls_star_llm,
|
| 46 |
+
aggregate_anls_star_llm,
|
| 47 |
+
standard_error,
|
| 48 |
+
confidence_interval,
|
| 49 |
+
citation_f1,
|
| 50 |
+
kuiper_statistic
|
| 51 |
+
)
|
| 52 |
from datasets import load_dataset
|
| 53 |
EVAL_AVAILABLE = True
|
| 54 |
except ImportError:
|
|
|
|
| 929 |
return f'<span style="color: {color}; font-weight: 500;">{fallback_emoji} {model_type}</span>'
|
| 930 |
|
| 931 |
|
| 932 |
+
def _extract_timestamp_from_filename(filename: str) -> str:
|
| 933 |
+
"""Extract timestamp from filename like 'Model_results_20260109_152104.json'."""
|
| 934 |
+
import re
|
| 935 |
+
match = re.search(r'_(\d{8}_\d{6})\.json$', filename)
|
| 936 |
+
return match.group(1) if match else "00000000_000000"
|
| 937 |
+
|
| 938 |
+
|
| 939 |
@st.cache_data(ttl=300) # Cache for 5 minutes
|
| 940 |
def load_eval_results() -> pd.DataFrame:
|
| 941 |
+
"""Load evaluation results from JSON files, keeping only the most recent per model."""
|
| 942 |
+
seen_models = {} # Track: model_name -> (timestamp, result_dict, filepath)
|
| 943 |
|
| 944 |
results_path = Path(EVAL_RESULTS_PATH)
|
| 945 |
if not results_path.exists():
|
|
|
|
| 965 |
# Get per-domain scores if available
|
| 966 |
by_domain = result_scores.get("by_domain", {})
|
| 967 |
|
| 968 |
+
# Use semantic accuracy if available, otherwise fall back to ANLS*
|
| 969 |
+
overall = result_scores.get("overall", {})
|
| 970 |
+
single_ev = result_scores.get("single_evidence", {})
|
| 971 |
+
multi_page = result_scores.get("multi_evidence_same_doc", {})
|
| 972 |
+
multi_doc = result_scores.get("multi_evidence_multi_doc", {})
|
| 973 |
+
|
| 974 |
+
# Primary metric: semantic (ANLS* + LLM) if available, otherwise ANLS*
|
| 975 |
+
semantic_acc = overall.get("semantic", overall.get("anls", 0.0))
|
| 976 |
+
semantic_ci = overall.get("semantic_ci") # 95% CI tuple
|
| 977 |
+
|
| 978 |
+
# Calculate CI on-the-fly using bias correction method if not stored
|
| 979 |
+
if not semantic_ci and semantic_acc > 0:
|
| 980 |
+
try:
|
| 981 |
+
from metrics import confidence_interval, standard_error
|
| 982 |
+
n = result_scores.get("single_evidence", {}).get("n", 500)
|
| 983 |
+
p = semantic_acc / 100.0 # Convert to proportion
|
| 984 |
+
ci = confidence_interval(p, n) # Uses calibrated q0, q1, m0, m1
|
| 985 |
+
semantic_ci = (ci[0] * 100, ci[1] * 100)
|
| 986 |
+
semantic_se = standard_error(p, n) * 100 # SE in percentage points
|
| 987 |
+
except Exception:
|
| 988 |
+
semantic_ci = None
|
| 989 |
+
semantic_se = None
|
| 990 |
+
|
| 991 |
+
anls_acc = overall.get("anls", 0.0)
|
| 992 |
+
|
| 993 |
+
result_dict = {
|
| 994 |
"Model": model_name,
|
| 995 |
"Organization": data.get("organization", data.get("submitted_by", org_dir.name)),
|
| 996 |
"Model Type": metadata.get("model_type", "unknown"),
|
| 997 |
"Tags": tags, # Store as list
|
| 998 |
+
# Primary: Accuracy with LLM judge (ANLS* + LLM with bias correction)
|
| 999 |
+
"Accuracy (LLM judge)": semantic_acc,
|
| 1000 |
+
"_Accuracy_SE": semantic_se, # Hidden: for ±SE display
|
| 1001 |
+
"_Accuracy_CI": semantic_ci, # Hidden: for tooltip display
|
| 1002 |
+
"Acc. Single-Hop": single_ev.get("semantic", single_ev.get("anls", 0.0)),
|
| 1003 |
+
"Acc. Cross-Page": multi_page.get("semantic", multi_page.get("anls", 0.0)),
|
| 1004 |
+
"Acc. Cross-Doc": multi_doc.get("semantic", multi_doc.get("anls", 0.0)),
|
| 1005 |
+
# Secondary: Pure string-based ANLS* (hidden by default)
|
| 1006 |
+
"ANLS* (string)": anls_acc,
|
| 1007 |
# Attribution metrics
|
| 1008 |
+
"Attribution (Page F1)": overall.get("page_f1", 0.0),
|
| 1009 |
+
"Attribution (Doc F1)": overall.get("doc_f1", 0.0),
|
| 1010 |
# Calibration metric
|
| 1011 |
+
"Effort (Kuiper)": overall.get("kuiper", 0.0),
|
| 1012 |
"Submission Date": data.get("submission_date", ""),
|
| 1013 |
"Link": data.get("link", ""),
|
| 1014 |
"Description": data.get("description", metadata.get("description", "")) or
|
| 1015 |
generate_placeholder_description(model_name, tags, metadata.get("model_type", "")),
|
| 1016 |
# Per-domain scores (stored as JSON string for DataFrame compatibility)
|
| 1017 |
"_by_domain": json.dumps(by_domain) if by_domain else "{}",
|
| 1018 |
+
}
|
| 1019 |
+
|
| 1020 |
+
# Extract timestamp from filename
|
| 1021 |
+
file_timestamp = _extract_timestamp_from_filename(result_file.name)
|
| 1022 |
+
|
| 1023 |
+
# Keep only the most recent result per model
|
| 1024 |
+
if model_name not in seen_models or file_timestamp > seen_models[model_name][0]:
|
| 1025 |
+
seen_models[model_name] = (file_timestamp, result_dict)
|
| 1026 |
+
|
| 1027 |
except Exception as e:
|
| 1028 |
st.warning(f"Error loading {result_file}: {e}")
|
| 1029 |
|
| 1030 |
+
if not seen_models:
|
| 1031 |
return pd.DataFrame()
|
| 1032 |
|
| 1033 |
+
# Build results list from deduplicated models
|
| 1034 |
+
results = [result_dict for _, result_dict in seen_models.values()]
|
| 1035 |
+
|
| 1036 |
df = pd.DataFrame(results)
|
| 1037 |
+
df = df.sort_values("Accuracy (LLM judge)", ascending=False).reset_index(drop=True)
|
| 1038 |
return df
|
| 1039 |
|
| 1040 |
|
|
|
|
| 1105 |
|
| 1106 |
# Metric tooltips for table headers
|
| 1107 |
METRIC_TOOLTIPS = {
|
| 1108 |
+
"Accuracy (LLM judge)": "Answer accuracy using ANLS* + LLM judge with bias correction. Captures semantic correctness beyond string matching. Higher is better.",
|
| 1109 |
+
"ANLS* (string)": "String-based accuracy using ANLS* (Average Normalized Levenshtein Similarity). Stricter than semantic. Higher is better.",
|
| 1110 |
"Acc. Single-Hop": "Accuracy on questions requiring evidence from a single page.",
|
| 1111 |
"Acc. Cross-Page": "Accuracy on multi-hop questions requiring evidence from multiple pages within the same document.",
|
| 1112 |
"Acc. Cross-Doc": "Accuracy on multi-hop questions requiring evidence from multiple documents.",
|
|
|
|
| 1191 |
# Render tags as badges
|
| 1192 |
cell_html = render_tags_html(value)
|
| 1193 |
cells.append(f'<td>{cell_html}</td>')
|
| 1194 |
+
elif col == "Accuracy (LLM judge)" or col == "ANLS* (string)" or col.startswith("Acc."):
|
| 1195 |
+
# Format accuracy scores (scale 0-100)
|
| 1196 |
try:
|
| 1197 |
+
acc_val = f"{float(value):.1f}" if value else "0"
|
| 1198 |
except (ValueError, TypeError):
|
| 1199 |
+
acc_val = str(value)
|
| 1200 |
+
|
| 1201 |
+
# Add ±SE for main accuracy column
|
| 1202 |
+
if col == "Accuracy (LLM judge)":
|
| 1203 |
+
se = row.get("_Accuracy_SE")
|
| 1204 |
+
ci = row.get("_Accuracy_CI")
|
| 1205 |
+
if se is not None and se > 0:
|
| 1206 |
+
# Show ±SE with 95% CI as tooltip
|
| 1207 |
+
ci_tooltip = f"95% CI: [{ci[0]:.1f}, {ci[1]:.1f}]" if ci else ""
|
| 1208 |
+
se_text = f'<span style="font-size: 0.85em; color: #888;" title="{ci_tooltip}"> ± {se:.1f}</span>'
|
| 1209 |
+
cell_html = f'{acc_val}{se_text}'
|
| 1210 |
+
else:
|
| 1211 |
+
cell_html = acc_val
|
| 1212 |
+
else:
|
| 1213 |
+
cell_html = acc_val
|
| 1214 |
cells.append(f'<td style="text-align: center;">{cell_html}</td>')
|
| 1215 |
elif col.startswith("Attribution"):
|
| 1216 |
# Format F1 scores (scale 0-100)
|
|
|
|
| 1349 |
df_type = df[df["Model Type"] == model_type]
|
| 1350 |
fig.add_trace(go.Scatter(
|
| 1351 |
x=df_type["Attribution (Page F1)"],
|
| 1352 |
+
y=df_type["Accuracy (LLM judge)"],
|
| 1353 |
mode="markers",
|
| 1354 |
name=model_type,
|
| 1355 |
text=df_type["Model"],
|
|
|
|
| 1364 |
fig.update_layout(
|
| 1365 |
title=dict(text="Accuracy vs Attribution", font=dict(color="white")),
|
| 1366 |
xaxis_title="Attribution (Page F1)",
|
| 1367 |
+
yaxis_title="Accuracy (LLM judge)",
|
| 1368 |
hovermode="closest",
|
| 1369 |
template="plotly_dark",
|
| 1370 |
height=650,
|
|
|
|
| 1410 |
df_type = df_filtered[df_filtered["Model Type"] == model_type]
|
| 1411 |
fig.add_trace(go.Scatter(
|
| 1412 |
x=df_type["Effort (Kuiper)"],
|
| 1413 |
+
y=df_type["Accuracy (LLM judge)"],
|
| 1414 |
mode="markers",
|
| 1415 |
name=model_type,
|
| 1416 |
text=df_type["Model"],
|
|
|
|
| 1425 |
fig.update_layout(
|
| 1426 |
title=dict(text="Accuracy vs Effort", font=dict(color="white")),
|
| 1427 |
xaxis_title="Effort (Kuiper) — lower is better",
|
| 1428 |
+
yaxis_title="Accuracy (LLM judge)",
|
| 1429 |
hovermode="closest",
|
| 1430 |
template="plotly_dark",
|
| 1431 |
height=650,
|
|
|
|
| 1535 |
# Display main metrics
|
| 1536 |
col1, col2, col3 = st.columns(3)
|
| 1537 |
with col1:
|
| 1538 |
+
st.metric("Accuracy (LLM judge)", f"{model_data['Accuracy (LLM judge)']:.1f}%")
|
| 1539 |
with col2:
|
| 1540 |
st.metric("Attribution (Page F1)", f"{model_data['Attribution (Page F1)']:.1f}%")
|
| 1541 |
with col3:
|
|
|
|
| 1570 |
|
| 1571 |
if by_domain:
|
| 1572 |
# Show per-domain chart (use overall accuracy as threshold for coloring)
|
| 1573 |
+
overall_accuracy = model_data.get('Accuracy (LLM judge)', 0)
|
| 1574 |
fig = create_domain_accuracy_chart(by_domain, model_name, overall_accuracy)
|
| 1575 |
st.plotly_chart(fig, width="stretch")
|
| 1576 |
else:
|
|
|
|
| 1695 |
return {}, {}
|
| 1696 |
|
| 1697 |
|
| 1698 |
+
def _evaluate_single_item(args, max_retries=3):
|
| 1699 |
+
"""Evaluate a single prediction item (for parallel processing)."""
|
| 1700 |
+
import time as _time
|
| 1701 |
+
idx, pred, gold_data, use_llm_judge = args
|
| 1702 |
+
|
| 1703 |
+
question = pred.get('question', '').strip()
|
| 1704 |
+
answer = pred.get('answer', '')
|
| 1705 |
+
citations = pred.get('citations', [])
|
| 1706 |
+
search_history = pred.get('search_history', [])
|
| 1707 |
+
steps = len(search_history) if search_history else pred.get('iterations', 0)
|
| 1708 |
+
|
| 1709 |
+
# Calculate non-LLM metrics first
|
| 1710 |
+
anls = anls_star(answer, gold_data['answers'])
|
| 1711 |
+
doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
|
| 1712 |
+
page_f1 = citation_f1(citations, gold_data['evidence'], level='page')
|
| 1713 |
+
|
| 1714 |
+
# Semantic accuracy with LLM judge (or just ANLS* if disabled)
|
| 1715 |
+
if use_llm_judge:
|
| 1716 |
+
for attempt in range(max_retries):
|
| 1717 |
+
try:
|
| 1718 |
+
llm_result = anls_star_llm(answer, gold_data['answers'], question)
|
| 1719 |
+
semantic_score = llm_result['score']
|
| 1720 |
+
break
|
| 1721 |
+
except Exception:
|
| 1722 |
+
if attempt < max_retries - 1:
|
| 1723 |
+
_time.sleep(2 ** attempt) # Exponential backoff
|
| 1724 |
+
else:
|
| 1725 |
+
raise
|
| 1726 |
+
else:
|
| 1727 |
+
semantic_score = anls
|
| 1728 |
+
|
| 1729 |
+
return {
|
| 1730 |
+
'idx': idx,
|
| 1731 |
+
'question': question,
|
| 1732 |
+
'anls': anls,
|
| 1733 |
+
'semantic_score': semantic_score,
|
| 1734 |
+
'correct': semantic_score >= 0.5,
|
| 1735 |
+
'doc_f1': doc_f1['f1'],
|
| 1736 |
+
'page_f1': page_f1['f1'],
|
| 1737 |
+
'steps': steps,
|
| 1738 |
+
'hop_type': gold_data.get('hop_type', 'single'),
|
| 1739 |
+
'category': gold_data['category'],
|
| 1740 |
+
'domain': gold_data['domain']
|
| 1741 |
+
}
|
| 1742 |
+
|
| 1743 |
+
|
| 1744 |
+
def evaluate_predictions(
|
| 1745 |
+
predictions: list,
|
| 1746 |
+
gold_by_text: dict,
|
| 1747 |
+
gold_by_id: dict,
|
| 1748 |
+
use_llm_judge: bool = True,
|
| 1749 |
+
progress_callback=None
|
| 1750 |
+
) -> dict:
|
| 1751 |
+
"""Evaluate predictions against gold standard (parallelized when using LLM judge).
|
| 1752 |
+
|
| 1753 |
+
Args:
|
| 1754 |
+
predictions: List of prediction dicts
|
| 1755 |
+
gold_by_text: Gold data indexed by question text
|
| 1756 |
+
gold_by_id: Gold data indexed by question ID
|
| 1757 |
+
use_llm_judge: If True, use ANLS*+LLM for semantic accuracy (default)
|
| 1758 |
+
progress_callback: Optional callback(current, total) for progress updates
|
| 1759 |
+
"""
|
| 1760 |
if not EVAL_AVAILABLE:
|
| 1761 |
return {"error": "Evaluation module not available"}
|
| 1762 |
|
| 1763 |
+
# First pass: match predictions to gold standard
|
| 1764 |
+
matched_items = []
|
| 1765 |
unmatched = []
|
| 1766 |
|
| 1767 |
for pred in predictions:
|
|
|
|
| 1769 |
qid = pred.get('id', '')
|
| 1770 |
|
| 1771 |
# Match to gold
|
| 1772 |
+
gold_data = None
|
| 1773 |
if question in gold_by_text:
|
| 1774 |
gold_data = gold_by_text[question]
|
| 1775 |
elif qid and qid in gold_by_id:
|
| 1776 |
gold_data = gold_by_id[qid]
|
| 1777 |
+
|
| 1778 |
+
if gold_data:
|
| 1779 |
+
matched_items.append((pred, gold_data, use_llm_judge))
|
| 1780 |
else:
|
| 1781 |
unmatched.append(question[:50] + "..." if len(question) > 50 else question)
|
| 1782 |
+
|
| 1783 |
+
if not matched_items:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1784 |
return {"error": "No predictions matched the gold standard"}
|
| 1785 |
|
| 1786 |
+
# Prepare items with index
|
| 1787 |
+
items_with_idx = [(i, pred, gold, llm) for i, (pred, gold, llm) in enumerate(matched_items)]
|
| 1788 |
+
|
| 1789 |
+
total = len(items_with_idx)
|
| 1790 |
+
evals = []
|
| 1791 |
+
completed = 0
|
| 1792 |
+
|
| 1793 |
+
# Parallel evaluation with ThreadPoolExecutor (much faster for LLM calls)
|
| 1794 |
+
with ThreadPoolExecutor(max_workers=MAX_EVAL_WORKERS) as executor:
|
| 1795 |
+
futures = {executor.submit(_evaluate_single_item, item): item[0]
|
| 1796 |
+
for item in items_with_idx}
|
| 1797 |
+
|
| 1798 |
+
for future in as_completed(futures):
|
| 1799 |
+
result = future.result() # Will raise if failed after retries
|
| 1800 |
+
evals.append(result)
|
| 1801 |
+
completed += 1
|
| 1802 |
+
if progress_callback:
|
| 1803 |
+
progress_callback(completed, total)
|
| 1804 |
+
|
| 1805 |
# Aggregate overall metrics
|
| 1806 |
n = len(evals)
|
| 1807 |
+
semantic_scores = [e['semantic_score'] for e in evals]
|
| 1808 |
+
|
| 1809 |
+
# Apply bias correction for semantic accuracy
|
| 1810 |
+
if use_llm_judge:
|
| 1811 |
+
agg = aggregate_anls_star_llm(semantic_scores, apply_bias_correction=True)
|
| 1812 |
+
mean_semantic = agg['adjusted_score'] * 100
|
| 1813 |
+
semantic_ci = (agg['ci_lower'] * 100, agg['ci_upper'] * 100)
|
| 1814 |
+
else:
|
| 1815 |
+
mean_semantic = sum(semantic_scores) / n * 100
|
| 1816 |
+
semantic_ci = None
|
| 1817 |
+
|
| 1818 |
mean_anls = sum(e['anls'] for e in evals) / n * 100
|
| 1819 |
+
accuracy = sum(e['correct'] for e in evals) / n * 100
|
| 1820 |
mean_doc_f1 = sum(e['doc_f1'] for e in evals) / n * 100
|
| 1821 |
mean_page_f1 = sum(e['page_f1'] for e in evals) / n * 100
|
| 1822 |
|
|
|
|
| 1829 |
cross_doc = [e for e in evals if e['hop_type'] == 'cross_doc']
|
| 1830 |
|
| 1831 |
# By domain
|
|
|
|
| 1832 |
by_domain = defaultdict(list)
|
| 1833 |
for e in evals:
|
| 1834 |
domain = e['domain'] or 'Other'
|
|
|
|
| 1837 |
domain_scores = {}
|
| 1838 |
for domain, domain_evals in sorted(by_domain.items()):
|
| 1839 |
domain_scores[domain] = {
|
| 1840 |
+
'semantic': sum(e['semantic_score'] for e in domain_evals) / len(domain_evals) * 100,
|
| 1841 |
'anls': sum(e['anls'] for e in domain_evals) / len(domain_evals) * 100,
|
| 1842 |
'n': len(domain_evals)
|
| 1843 |
}
|
|
|
|
| 1845 |
results = {
|
| 1846 |
'n_evaluated': n,
|
| 1847 |
'n_unmatched': len(unmatched),
|
| 1848 |
+
'unmatched_samples': unmatched[:5],
|
| 1849 |
'overall': {
|
| 1850 |
+
'semantic': mean_semantic, # Primary metric (ANLS* + LLM judge)
|
| 1851 |
+
'semantic_ci': semantic_ci, # 95% CI if LLM judge used
|
| 1852 |
+
'anls': mean_anls, # Secondary metric (pure ANLS*)
|
| 1853 |
'accuracy': accuracy,
|
| 1854 |
'doc_f1': mean_doc_f1,
|
| 1855 |
'page_f1': mean_page_f1,
|
| 1856 |
'kuiper': kuiper['kuiper_stat'] if not kuiper.get('degenerate') else None,
|
| 1857 |
},
|
| 1858 |
'single_evidence': {
|
| 1859 |
+
'semantic': sum(e['semantic_score'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
|
| 1860 |
'anls': sum(e['anls'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
|
| 1861 |
'n': len(single_hop)
|
| 1862 |
},
|
| 1863 |
'multi_evidence_same_doc': {
|
| 1864 |
+
'semantic': sum(e['semantic_score'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
|
| 1865 |
'anls': sum(e['anls'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
|
| 1866 |
'n': len(cross_page)
|
| 1867 |
},
|
| 1868 |
'multi_evidence_multi_doc': {
|
| 1869 |
+
'semantic': sum(e['semantic_score'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
|
| 1870 |
'anls': sum(e['anls'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
|
| 1871 |
'n': len(cross_doc)
|
| 1872 |
},
|
| 1873 |
+
'by_domain': domain_scores,
|
| 1874 |
+
'used_llm_judge': use_llm_judge
|
| 1875 |
}
|
| 1876 |
|
| 1877 |
return results
|
|
|
|
| 1973 |
|
| 1974 |
# Evaluate button
|
| 1975 |
if st.button("Run Evaluation", type="primary"):
|
| 1976 |
+
with st.spinner("Loading gold standard..."):
|
| 1977 |
gold_by_text, gold_by_id = load_gold_standard()
|
| 1978 |
+
|
| 1979 |
+
if not gold_by_text:
|
| 1980 |
+
st.error("Failed to load gold standard dataset")
|
| 1981 |
+
else:
|
| 1982 |
+
# Progress bar for evaluation
|
| 1983 |
+
progress_bar = st.progress(0, text="Evaluating predictions with semantic accuracy...")
|
| 1984 |
+
status_text = st.empty()
|
| 1985 |
|
| 1986 |
+
def update_progress(current, total):
|
| 1987 |
+
progress_bar.progress(current / total, text=f"Evaluating {current}/{total}...")
|
| 1988 |
+
|
| 1989 |
+
results = evaluate_predictions(
|
| 1990 |
+
predictions,
|
| 1991 |
+
gold_by_text,
|
| 1992 |
+
gold_by_id,
|
| 1993 |
+
use_llm_judge=True,
|
| 1994 |
+
progress_callback=update_progress
|
| 1995 |
+
)
|
| 1996 |
+
|
| 1997 |
+
progress_bar.empty()
|
| 1998 |
+
status_text.empty()
|
| 1999 |
+
st.session_state.eval_results = results
|
| 2000 |
|
| 2001 |
# Show evaluation results
|
| 2002 |
if st.session_state.eval_results:
|
|
|
|
| 2007 |
else:
|
| 2008 |
st.markdown("#### Evaluation Results")
|
| 2009 |
|
| 2010 |
+
# Summary metrics - use semantic accuracy as primary if available
|
| 2011 |
col1, col2, col3, col4 = st.columns(4)
|
| 2012 |
with col1:
|
| 2013 |
+
if 'semantic' in results['overall']:
|
| 2014 |
+
ci = results['overall'].get('semantic_ci')
|
| 2015 |
+
ci_text = f" [{ci[0]:.1f}-{ci[1]:.1f}]" if ci else ""
|
| 2016 |
+
st.metric("Accuracy (LLM judge)", f"{results['overall']['semantic']:.1f}{ci_text}")
|
| 2017 |
+
else:
|
| 2018 |
+
st.metric("Accuracy (ANLS*)", f"{results['overall']['anls']:.1f}")
|
| 2019 |
with col2:
|
| 2020 |
st.metric("Attribution (Page F1)", f"{results['overall']['page_f1']:.1f}")
|
| 2021 |
with col3:
|
|
|
|
| 2026 |
|
| 2027 |
# Detailed breakdown
|
| 2028 |
with st.expander("Detailed Breakdown"):
|
| 2029 |
+
# Check which metrics are available
|
| 2030 |
+
has_semantic = 'semantic' in results['overall']
|
| 2031 |
+
|
| 2032 |
+
if has_semantic:
|
| 2033 |
+
st.markdown(f"""
|
| 2034 |
+
| Metric | Value |
|
| 2035 |
+
|--------|-------|
|
| 2036 |
+
| **Accuracy (LLM judge)** | {results['overall']['semantic']:.1f} |
|
| 2037 |
+
| **ANLS*** (string match) | {results['overall']['anls']:.1f} |
|
| 2038 |
+
| **Acc. Single-Hop** (n={results['single_evidence']['n']}) | {results['single_evidence'].get('semantic', results['single_evidence']['anls']):.1f} |
|
| 2039 |
+
| **Acc. Cross-Page** (n={results['multi_evidence_same_doc']['n']}) | {results['multi_evidence_same_doc'].get('semantic', results['multi_evidence_same_doc']['anls']):.1f} |
|
| 2040 |
+
| **Acc. Cross-Doc** (n={results['multi_evidence_multi_doc']['n']}) | {results['multi_evidence_multi_doc'].get('semantic', results['multi_evidence_multi_doc']['anls']):.1f} |
|
| 2041 |
+
| **Attribution (Doc F1)** | {results['overall']['doc_f1']:.1f} |
|
| 2042 |
+
| **Attribution (Page F1)** | {results['overall']['page_f1']:.1f} |
|
| 2043 |
+
""")
|
| 2044 |
+
else:
|
| 2045 |
+
st.markdown(f"""
|
| 2046 |
+
| Metric | Value |
|
| 2047 |
+
|--------|-------|
|
| 2048 |
+
| **Overall ANLS*** | {results['overall']['anls']:.1f} |
|
| 2049 |
+
| **Acc. Single-Hop** (n={results['single_evidence']['n']}) | {results['single_evidence']['anls']:.1f} |
|
| 2050 |
+
| **Acc. Cross-Page** (n={results['multi_evidence_same_doc']['n']}) | {results['multi_evidence_same_doc']['anls']:.1f} |
|
| 2051 |
+
| **Acc. Cross-Doc** (n={results['multi_evidence_multi_doc']['n']}) | {results['multi_evidence_multi_doc']['anls']:.1f} |
|
| 2052 |
+
| **Attribution (Doc F1)** | {results['overall']['doc_f1']:.1f} |
|
| 2053 |
+
| **Attribution (Page F1)** | {results['overall']['page_f1']:.1f} |
|
| 2054 |
+
""")
|
| 2055 |
|
| 2056 |
if results['n_unmatched'] > 0:
|
| 2057 |
with st.expander(f"{results['n_unmatched']} unmatched questions"):
|
|
|
|
| 2521 |
# COLUMN SELECTOR - chips use SNOWFLAKE_BLUE (lighter, gradient end)
|
| 2522 |
# Mapping: short chip name -> full column name
|
| 2523 |
COLUMN_CHIP_NAMES = {
|
| 2524 |
+
"Accuracy": "Accuracy (LLM judge)",
|
| 2525 |
"Acc. Single-Hop": "Acc. Single-Hop",
|
| 2526 |
"Acc. Cross-Page": "Acc. Cross-Page",
|
| 2527 |
"Acc. Cross-Doc": "Acc. Cross-Doc",
|
| 2528 |
+
"ANLS*": "ANLS* (string)",
|
| 2529 |
"Attribution": "Attribution (Page F1)",
|
| 2530 |
"Attribution (Doc)": "Attribution (Doc F1)",
|
| 2531 |
"Effort": "Effort (Kuiper)",
|
|
|
|
| 2540 |
# Model and Organization are always visible (not in selector)
|
| 2541 |
always_visible = ["Model", "Organization"]
|
| 2542 |
# Hidden columns (used internally but not shown as separate columns)
|
| 2543 |
+
hidden_cols = ["Link", "Submission Date", "Description", "_by_domain", "_Accuracy_CI", "_Accuracy_SE"]
|
| 2544 |
# Full column names that are optional (Tags moved to end)
|
| 2545 |
optional_full_cols = [c for c in all_columns if c not in hidden_cols + always_visible and c != "Tags"]
|
| 2546 |
optional_full_cols.append("Tags") # Add Tags at the end
|
|
|
|
| 2713 |
st.markdown("""
|
| 2714 |
#### Metrics
|
| 2715 |
|
| 2716 |
+
##### Accuracy (LLM judge)
|
| 2717 |
+
- **Accuracy (LLM judge)**: Primary metric combining ANLS* string matching with an LLM judge (G-Eval framework). Captures semantic correctness beyond exact string matching, with statistical bias correction
|
| 2718 |
+
- **ANLS* (string)**: Pure string-based score using Average Normalized Levenshtein Similarity with optimal element alignment for lists/sets
|
| 2719 |
- **Acc. Single-Hop**: Accuracy on questions requiring a single evidence page
|
| 2720 |
- **Acc. Cross-Page**: Accuracy on multi-hop questions within the same document
|
| 2721 |
- **Acc. Cross-Doc**: Accuracy on multi-hop questions spanning multiple documents
|
eval/batch_reevaluate.py
ADDED
|
@@ -0,0 +1,434 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Batch re-evaluate all submissions with the new Semantic Accuracy metric.
|
| 4 |
+
|
| 5 |
+
This script downloads all prediction files from HuggingFace Hub and re-evaluates
|
| 6 |
+
them with the ANLS* + LLM judge metric.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
# Dry run - list files only
|
| 10 |
+
python batch_reevaluate.py --dry-run
|
| 11 |
+
|
| 12 |
+
# Re-evaluate all files
|
| 13 |
+
python batch_reevaluate.py
|
| 14 |
+
|
| 15 |
+
# Re-evaluate specific organization
|
| 16 |
+
python batch_reevaluate.py --org OpenAI
|
| 17 |
+
|
| 18 |
+
# Upload results after review
|
| 19 |
+
python batch_reevaluate.py --upload
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
import json
|
| 23 |
+
import os
|
| 24 |
+
import sys
|
| 25 |
+
import time
|
| 26 |
+
from collections import defaultdict
|
| 27 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 28 |
+
from datetime import datetime, timezone
|
| 29 |
+
from pathlib import Path
|
| 30 |
+
|
| 31 |
+
from huggingface_hub import HfApi, hf_hub_download, list_repo_files
|
| 32 |
+
from datasets import load_dataset
|
| 33 |
+
|
| 34 |
+
# Add parent for imports
|
| 35 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 36 |
+
from metrics import (
|
| 37 |
+
anls_star,
|
| 38 |
+
anls_star_llm,
|
| 39 |
+
aggregate_anls_star_llm,
|
| 40 |
+
citation_f1,
|
| 41 |
+
kuiper_statistic
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
# Parallelization config
|
| 45 |
+
MAX_WORKERS = 24
|
| 46 |
+
|
| 47 |
+
# Config
|
| 48 |
+
RESULTS_REPO = "agentic-document-ai/backend-results"
|
| 49 |
+
TOKEN = os.environ.get("HF_TOKEN")
|
| 50 |
+
OUTPUT_DIR = Path(__file__).parent / "reevaluated_results"
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def load_gold_data():
    """Load the private gold-standard test split and index it two ways.

    Returns:
        Tuple ``(gold_by_id, gold_by_text)``: the same per-question record
        dicts, keyed by question id and by stripped question text.  Records
        carry the answer variants, evidence, category, domain and hop type
        used downstream by the scorer.
    """
    print("Loading gold standard...")
    test_split = load_dataset("agentic-document-ai/dataset-PRIVATE", split="test")

    by_id = {}
    by_text = {}
    for example in test_split:
        text = example['question'].strip()
        record = {
            'question': text,
            'answers': example.get('answer_variants', []),
            'evidence': example.get('evidence', []),
            'category': example.get('document_category', ''),
            'domain': example.get('domain', ''),
            'hop_type': example.get('hop_type', 'single'),
        }
        # Both indexes point at the same record object; later entries with a
        # duplicate key overwrite earlier ones (same as the original loop).
        by_id[example.get('id', '')] = record
        by_text[text] = record

    return by_id, by_text
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def find_prediction_files(org_filter: str = None):
    """List prediction JSONL files in the results repo.

    Args:
        org_filter: If truthy, keep only files under ``{org_filter}/``.

    Returns:
        List of repo-relative paths ending in ``.jsonl`` that contain the
        ``_predictions`` marker.
    """
    repo_files = list_repo_files(RESULTS_REPO, repo_type="dataset", token=TOKEN)
    prefix = org_filter + '/' if org_filter else None

    matches = []
    for path in repo_files:
        if '_predictions' not in path or not path.endswith('.jsonl'):
            continue
        if prefix is not None and not path.startswith(prefix):
            continue
        matches.append(path)
    return matches
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def find_result_file(pred_file: str):
    """Map a predictions path to its sibling results JSON path.

    ``{org}/{model}_predictions_{ts}.jsonl`` becomes
    ``{org}/{model}_results_{ts}.json``.  Splits on the LAST
    ``_predictions_`` marker so model names containing the word survive.

    Returns:
        The results path, or None when the marker is absent.
    """
    head, marker, tail = pred_file.rpartition('_predictions_')
    if not marker:
        return None
    return head + '_results_' + tail.replace('.jsonl', '.json')
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def download_file(filepath: str) -> str:
    """Fetch ``filepath`` from the results dataset repo.

    Returns:
        Local filesystem path of the cached download.
    """
    download_args = dict(
        repo_id=RESULTS_REPO,
        filename=filepath,
        repo_type="dataset",
        token=TOKEN,
    )
    return hf_hub_download(**download_args)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def _evaluate_single_prediction(args, max_retries=3):
    """Score one prediction against its gold record (thread-pool worker).

    Args:
        args: Tuple ``(idx, prediction_dict, gold_record)``.
        max_retries: Attempts for the LLM-judge call before giving up.

    Returns:
        Flat metrics dict for this item; ``idx`` echoes the input position
        so callers can track completion order.

    Raises:
        Exception: Re-raises the last LLM-judge error once retries are spent.
    """
    idx, pred, gold_data = args

    answer = pred.get('answer', '')
    question = pred.get('question', '').strip()
    citations = pred.get('citations', [])
    search_history = pred.get('search_history', [])
    # Prefer the recorded search history length; fall back to 'iterations'.
    steps = len(search_history) if search_history else pred.get('iterations', 0)

    # Deterministic (non-LLM) metrics first.
    anls = anls_star(answer, gold_data['answers'])
    doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')['f1']
    page_f1 = citation_f1(citations, gold_data['evidence'], level='page')['f1']

    # LLM judge with exponential backoff; re-raise after the final attempt.
    semantic_score = None
    for attempt in range(max_retries):
        try:
            semantic_score = anls_star_llm(answer, gold_data['answers'], question)['score']
            break
        except Exception as e:
            if attempt == max_retries - 1:
                print(f" Failed item {idx} after {max_retries} retries: {e}")
                raise
            print(f" Item {idx} attempt {attempt+1} failed: {e}, retrying...")
            time.sleep(2 ** attempt)  # 1s, 2s, 4s, ...

    return {
        'idx': idx,
        'anls': anls,
        'semantic_score': semantic_score,
        'correct': semantic_score >= 0.5,
        'doc_f1': doc_f1,
        'page_f1': page_f1,
        'steps': steps,
        'hop_type': gold_data.get('hop_type', 'single'),
        'category': gold_data['category'],
        'domain': gold_data['domain']
    }
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def evaluate_with_semantic(predictions: list, gold_by_id: dict, gold_by_text: dict) -> dict:
    """Evaluate predictions with the semantic accuracy metric (parallelized).

    Matches each prediction to the gold set by question text (falling back to
    question id), scores every match in a thread pool via
    ``_evaluate_single_prediction``, then aggregates overall, per-hop-type and
    per-domain metrics.

    Args:
        predictions: Prediction dicts ('question', 'answer', 'citations', ...).
        gold_by_id: Gold records keyed by question id.
        gold_by_text: Gold records keyed by stripped question text.

    Returns:
        Nested dict of aggregate metrics, or None if nothing was evaluated.
    """
    # FIX: before Python 3.11, concurrent.futures raised its own TimeoutError
    # (NOT the builtin), so a plain `except TimeoutError` never caught the
    # timeouts from as_completed()/future.result() and a timeout crashed the
    # whole batch.  Alias it and catch both (on 3.11+ they are the same class).
    from concurrent.futures import TimeoutError as FuturesTimeoutError

    # First, filter predictions to only those present in the test set.
    matched_predictions = []
    for pred in predictions:
        question = pred.get('question', '').strip()
        qid = pred.get('id', '')

        gold_data = None
        if question in gold_by_text:
            gold_data = gold_by_text[question]
        elif qid and qid in gold_by_id:
            gold_data = gold_by_id[qid]

        if gold_data:
            matched_predictions.append((pred, gold_data))

    unmatched = len(predictions) - len(matched_predictions)
    print(f" Matched {len(matched_predictions)}/{len(predictions)} predictions to test set (skipping {unmatched})")

    total = len(matched_predictions)
    evals = []
    completed = 0

    # Prepare items with their index so completion can be tracked out of order.
    items_with_idx = [(i, pred, gold) for i, (pred, gold) in enumerate(matched_predictions)]

    # Parallel evaluation; the LLM-judge call is I/O-bound so threads suffice.
    print(f" Evaluating with {MAX_WORKERS} parallel workers...")
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {executor.submit(_evaluate_single_prediction, item): item[0]
                   for item in items_with_idx}

        completed_indices = set()
        try:
            for future in as_completed(futures, timeout=600):  # 10 min overall timeout
                try:
                    result = future.result(timeout=120)  # 2 min per item max
                    evals.append(result)
                    completed_indices.add(result['idx'])
                    completed += 1
                    if completed % 50 == 0 or completed == total:
                        print(f" Progress: {completed}/{total}")
                except (TimeoutError, FuturesTimeoutError):
                    idx = futures[future]
                    print(f" TIMEOUT: Item {idx} took too long, skipping")
                    completed += 1
        except (TimeoutError, FuturesTimeoutError):
            # The overall as_completed() deadline expired: report what is
            # still pending and cancel whatever has not started yet.
            pending = set(range(total)) - completed_indices
            print(f" OVERALL TIMEOUT: {len(pending)} items still pending: {sorted(pending)[:10]}...")
            for future in futures:
                future.cancel()

    if not evals:
        return None

    # --- Aggregation -------------------------------------------------------
    n = len(evals)
    semantic_scores = [e['semantic_score'] for e in evals]

    # Bias-corrected LLM-judge aggregate with a 95% confidence interval.
    agg = aggregate_anls_star_llm(semantic_scores, apply_bias_correction=True)

    mean_anls = sum(e['anls'] for e in evals) / n * 100
    mean_doc_f1 = sum(e['doc_f1'] for e in evals) / n * 100
    mean_page_f1 = sum(e['page_f1'] for e in evals) / n * 100

    kuiper = kuiper_statistic(evals)

    # By hop type.
    single_hop = [e for e in evals if e['hop_type'] == 'single']
    cross_page = [e for e in evals if e['hop_type'] == 'cross_page']
    cross_doc = [e for e in evals if e['hop_type'] == 'cross_doc']

    # By domain (empty domain bucketed as 'Other').
    by_domain = defaultdict(list)
    for e in evals:
        domain = e['domain'] or 'Other'
        by_domain[domain].append(e)

    domain_scores = {}
    for domain, domain_evals in sorted(by_domain.items()):
        domain_scores[domain] = {
            'semantic': sum(e['semantic_score'] for e in domain_evals) / len(domain_evals) * 100,
            'anls': sum(e['anls'] for e in domain_evals) / len(domain_evals) * 100,
            'n': len(domain_evals)
        }

    return {
        'overall': {
            'semantic': agg['adjusted_score'] * 100,
            'semantic_ci': (agg['ci_lower'] * 100, agg['ci_upper'] * 100),  # 95% CI
            'anls': mean_anls,
            'page_f1': mean_page_f1,
            'doc_f1': mean_doc_f1,
            'kuiper': kuiper['kuiper_stat'] if not kuiper.get('degenerate') else None,
        },
        'single_evidence': {
            'semantic': sum(e['semantic_score'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
            'anls': sum(e['anls'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
            'n': len(single_hop)
        },
        'multi_evidence_same_doc': {
            'semantic': sum(e['semantic_score'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
            'anls': sum(e['anls'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
            'n': len(cross_page)
        },
        'multi_evidence_multi_doc': {
            'semantic': sum(e['semantic_score'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
            'anls': sum(e['anls'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
            'n': len(cross_doc)
        },
        'by_domain': domain_scores,
        'n_evaluated': n,
        'n_unmatched': unmatched
    }
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
def main():
    """CLI entry point: batch re-evaluate prediction files or upload results.

    Modes:
        --upload: upload previously saved results from OUTPUT_DIR and exit.
        --dry-run: list matching prediction files and exit.
        default: download each predictions file, re-evaluate it with the
            semantic metric, and save results under ``OUTPUT_DIR/{org}/``.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Batch re-evaluate submissions")
    parser.add_argument('--dry-run', action='store_true', help="List files only, don't evaluate")
    parser.add_argument('--org', type=str, help="Filter by organization (e.g., 'OpenAI')")
    parser.add_argument('--upload', action='store_true', help="Upload already processed results to HuggingFace Hub (no re-evaluation)")
    parser.add_argument('--skip-existing', action='store_true', help="Skip already evaluated files")
    args = parser.parse_args()

    OUTPUT_DIR.mkdir(exist_ok=True)

    # Upload-only mode: just upload existing files
    if args.upload:
        print("Uploading existing results to HuggingFace Hub...")
        api = HfApi()
        result_files = list(OUTPUT_DIR.glob("**/*.json"))
        print(f"Found {len(result_files)} result files to upload")

        for result_file in result_files:
            rel_path = result_file.relative_to(OUTPUT_DIR)
            print(f" Uploading: {rel_path}")
            try:
                api.upload_file(
                    path_or_fileobj=str(result_file),
                    path_in_repo=str(rel_path),
                    repo_id=RESULTS_REPO,
                    repo_type="dataset",
                    token=TOKEN,
                    commit_message=f"Re-evaluate with semantic accuracy: {rel_path.stem}"
                )
                print(f" ✓ Done")
            except Exception as e:
                print(f" ✗ Error: {e}")
        print("\nUpload complete!")
        return

    # Find prediction files
    print("Finding prediction files...")
    pred_files = find_prediction_files(args.org)
    print(f"Found {len(pred_files)} prediction files")

    if args.dry_run:
        for f in pred_files:
            print(f" - {f}")
        return

    # Load gold standard
    gold_by_id, gold_by_text = load_gold_data()
    print(f"Loaded {len(gold_by_id)} gold examples")

    # Process each file
    for i, pred_file in enumerate(pred_files):
        print(f"\n{'='*60}")
        print(f"[{i+1}/{len(pred_files)}] Processing: {pred_file}")
        print('='*60)

        # FIX: compute the output path the same way results are saved below
        # (OUTPUT_DIR/{org}/{name}_results_{ts}.json).  The previous check
        # looked for a '*_reevaluated.json' file in the OUTPUT_DIR root that
        # was never written, so --skip-existing never skipped anything.
        org = Path(pred_file).parts[0] if '/' in pred_file else 'Unknown'
        output_filename = Path(pred_file).name.replace('_predictions', '_results').replace('.jsonl', '.json')
        output_file = OUTPUT_DIR / org / output_filename
        if args.skip_existing and output_file.exists():
            print(" Skipping (already processed)")
            continue

        try:
            # Download predictions (JSONL: one prediction per line)
            print(" Downloading predictions...")
            local_pred = download_file(pred_file)

            predictions = []
            with open(local_pred) as f:
                for line in f:
                    if line.strip():
                        predictions.append(json.loads(line))
            print(f" Loaded {len(predictions)} predictions")

            # Download original results to preserve submission metadata
            result_file = find_result_file(pred_file)
            original_metadata = {}
            if result_file:
                try:
                    local_result = download_file(result_file)
                    with open(local_result) as f:
                        original_data = json.load(f)
                    original_metadata = {
                        'model_name': original_data.get('model_name'),
                        'organization': original_data.get('organization'),
                        'description': original_data.get('description'),
                        'link': original_data.get('link'),
                        'tags': original_data.get('tags'),
                        'submitted_by': original_data.get('submitted_by'),
                        'metadata': original_data.get('metadata'),
                        'submission_date': original_data.get('submission_date'),
                    }
                    print(f" Loaded metadata: model_name={original_metadata.get('model_name')}")
                except Exception as e:
                    # Best effort: fall through to the filename-based fallback.
                    print(f" Warning: Could not load original results: {e}")

            # Fallback: extract metadata from filename if not found
            if not original_metadata.get('model_name'):
                # Pattern: Org/Model_Name_with_Stuff_predictions_timestamp.jsonl
                filename = Path(pred_file).stem
                parts = filename.rsplit('_predictions_', 1)
                if parts:
                    model_name = parts[0].replace('_', ' ')  # Underscores to spaces
                    original_metadata = {
                        'model_name': model_name,
                        'organization': org.replace('_', ' '),
                        'description': '',
                        'tags': ['Agentic'],
                        'metadata': {'model_type': 'unknown'},
                    }
                    print(f" Using fallback metadata: model_name={model_name}, org={org}")

            # Evaluate
            print(" Evaluating with semantic accuracy...")
            start_time = time.time()
            results = evaluate_with_semantic(predictions, gold_by_id, gold_by_text)
            elapsed = time.time() - start_time

            if results:
                print(f"\n Results (took {elapsed:.1f}s):")
                print(f" Semantic Accuracy: {results['overall']['semantic']:.1f}")
                print(f" ANLS*: {results['overall']['anls']:.1f}")
                print(f" Page F1: {results['overall']['page_f1']:.1f}")

                # Save re-evaluated results alongside the original metadata
                full_result = {
                    **original_metadata,
                    'results': results,
                    'reevaluated_date': datetime.now(timezone.utc).isoformat(),
                    'source_predictions_file': pred_file,
                    'result_file_path': f"{org}/{output_filename}",
                }

                # Create org subfolder
                org_dir = OUTPUT_DIR / org
                org_dir.mkdir(exist_ok=True)

                with open(output_file, 'w') as f:
                    json.dump(full_result, f, indent=2)
                print(f" Saved to: {output_file}")
            else:
                print(" No valid evaluations")

        except Exception as e:
            # One bad file must not abort the whole batch.
            print(f" Error: {e}")
            import traceback
            traceback.print_exc()
            continue

    print(f"\n{'='*60}")
    print("DONE!")
    print(f"Results saved to: {OUTPUT_DIR}")
    print(f"\nTo upload results, run: python batch_reevaluate.py --upload")
|
| 430 |
+
|
| 431 |
+
|
| 432 |
+
if __name__ == "__main__":
|
| 433 |
+
main()
|
| 434 |
+
|
eval/evaluate.py
CHANGED
|
@@ -18,7 +18,14 @@ from typing import Any, Dict, List, Optional, Tuple
|
|
| 18 |
|
| 19 |
from datasets import load_dataset
|
| 20 |
|
| 21 |
-
from metrics import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
def derive_hop_type(evidence: list) -> str:
|
|
@@ -106,11 +113,18 @@ def load_results(filepath: Path) -> List[Dict]:
|
|
| 106 |
def evaluate_single(
|
| 107 |
result: Dict,
|
| 108 |
gold_by_text: Dict[str, Dict],
|
| 109 |
-
gold_by_id: Dict[str, Dict]
|
|
|
|
| 110 |
) -> Optional[Dict[str, Any]]:
|
| 111 |
"""Evaluate a single prediction.
|
| 112 |
|
| 113 |
Matches by question text first, falls back to question ID if not found.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
"""
|
| 115 |
question = result.get('question', '').strip()
|
| 116 |
qid = result.get('id', '')
|
|
@@ -128,7 +142,15 @@ def evaluate_single(
|
|
| 128 |
|
| 129 |
# ANLS*
|
| 130 |
anls = anls_star(answer, gold_data['answers'])
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
# Citation F1
|
| 134 |
doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
|
|
@@ -141,6 +163,7 @@ def evaluate_single(
|
|
| 141 |
return {
|
| 142 |
'question': question,
|
| 143 |
'anls': anls,
|
|
|
|
| 144 |
'correct': correct,
|
| 145 |
'doc_f1': doc_f1['f1'],
|
| 146 |
'page_f1': page_f1['f1'],
|
|
@@ -151,7 +174,7 @@ def evaluate_single(
|
|
| 151 |
}
|
| 152 |
|
| 153 |
|
| 154 |
-
def aggregate_metrics(evals: List[Dict]) -> Dict[str, Any]:
|
| 155 |
"""Aggregate metrics across evaluations."""
|
| 156 |
if not evals:
|
| 157 |
return {}
|
|
@@ -162,6 +185,16 @@ def aggregate_metrics(evals: List[Dict]) -> Dict[str, Any]:
|
|
| 162 |
mean_doc_f1 = sum(e['doc_f1'] for e in evals) / n
|
| 163 |
mean_page_f1 = sum(e['page_f1'] for e in evals) / n
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
# Kuiper
|
| 166 |
kuiper = kuiper_statistic(evals)
|
| 167 |
wasted = wasted_effort_ratio(evals)
|
|
@@ -170,6 +203,8 @@ def aggregate_metrics(evals: List[Dict]) -> Dict[str, Any]:
|
|
| 170 |
'n': n,
|
| 171 |
'accuracy': accuracy,
|
| 172 |
'mean_anls': mean_anls,
|
|
|
|
|
|
|
| 173 |
'doc_f1': mean_doc_f1,
|
| 174 |
'page_f1': mean_page_f1,
|
| 175 |
'kuiper_stat': kuiper['kuiper_stat'],
|
|
@@ -180,7 +215,7 @@ def aggregate_metrics(evals: List[Dict]) -> Dict[str, Any]:
|
|
| 180 |
}
|
| 181 |
|
| 182 |
|
| 183 |
-
def print_metrics(name: str, metrics: Dict, indent: int = 0):
|
| 184 |
"""Print metrics in a formatted way."""
|
| 185 |
prefix = " " * indent
|
| 186 |
|
|
@@ -189,8 +224,16 @@ def print_metrics(name: str, metrics: Dict, indent: int = 0):
|
|
| 189 |
return
|
| 190 |
|
| 191 |
print(f"{prefix}{name} (n={metrics['n']}):")
|
| 192 |
-
|
| 193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
print(f"{prefix} Document F1: {metrics['doc_f1']:.4f}")
|
| 195 |
print(f"{prefix} Page F1: {metrics['page_f1']:.4f}")
|
| 196 |
|
|
@@ -207,16 +250,20 @@ def evaluate_file(
|
|
| 207 |
gold_by_id: Dict[str, Dict],
|
| 208 |
by_category: bool = False,
|
| 209 |
by_domain: bool = False,
|
| 210 |
-
by_hop_type: bool = True
|
|
|
|
| 211 |
) -> Dict[str, Any]:
|
| 212 |
"""Evaluate a single results file."""
|
| 213 |
results = load_results(filepath)
|
| 214 |
|
| 215 |
evals = []
|
| 216 |
unmatched = 0
|
|
|
|
| 217 |
|
| 218 |
-
for result in results:
|
| 219 |
-
|
|
|
|
|
|
|
| 220 |
if ev:
|
| 221 |
evals.append(ev)
|
| 222 |
else:
|
|
@@ -226,30 +273,30 @@ def evaluate_file(
|
|
| 226 |
print(f" Warning: {unmatched} questions not found in gold standard")
|
| 227 |
|
| 228 |
# Overall metrics
|
| 229 |
-
overall = aggregate_metrics(evals)
|
| 230 |
|
| 231 |
-
output = {'overall': overall}
|
| 232 |
|
| 233 |
# By hop type (always included by default)
|
| 234 |
if by_hop_type:
|
| 235 |
by_hop = defaultdict(list)
|
| 236 |
for e in evals:
|
| 237 |
by_hop[e.get('hop_type', 'single')].append(e)
|
| 238 |
-
output['by_hop_type'] = {hop: aggregate_metrics(items) for hop, items in sorted(by_hop.items())}
|
| 239 |
|
| 240 |
# By category
|
| 241 |
if by_category:
|
| 242 |
by_cat = defaultdict(list)
|
| 243 |
for e in evals:
|
| 244 |
by_cat[e['category'] or 'Unknown'].append(e)
|
| 245 |
-
output['by_category'] = {cat: aggregate_metrics(items) for cat, items in sorted(by_cat.items())}
|
| 246 |
|
| 247 |
# By domain
|
| 248 |
if by_domain:
|
| 249 |
by_dom = defaultdict(list)
|
| 250 |
for e in evals:
|
| 251 |
by_dom[e['domain'] or 'Other'].append(e)
|
| 252 |
-
output['by_domain'] = {dom: aggregate_metrics(items) for dom, items in sorted(by_dom.items())}
|
| 253 |
|
| 254 |
return output
|
| 255 |
|
|
@@ -273,6 +320,8 @@ Examples:
|
|
| 273 |
parser.add_argument('--by-domain', action='store_true', help='Show metrics by domain')
|
| 274 |
parser.add_argument('--compare', action='store_true', help='Compare multiple models side-by-side')
|
| 275 |
parser.add_argument('--json', action='store_true', help='Output as JSON')
|
|
|
|
|
|
|
| 276 |
|
| 277 |
args = parser.parse_args()
|
| 278 |
|
|
@@ -298,7 +347,13 @@ Examples:
|
|
| 298 |
name = name[:-8]
|
| 299 |
|
| 300 |
print(f"\nEvaluating: {filepath.name}")
|
| 301 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
all_results[name] = result
|
| 303 |
|
| 304 |
# Output
|
|
@@ -324,31 +379,42 @@ Examples:
|
|
| 324 |
# Comparison table
|
| 325 |
models = list(all_results.keys())
|
| 326 |
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
else:
|
| 336 |
# Detailed per-model output
|
| 337 |
for model, result in all_results.items():
|
| 338 |
print(f"\n{'─' * 40}")
|
| 339 |
-
|
|
|
|
| 340 |
|
| 341 |
if 'by_category' in result:
|
| 342 |
print(f"\n By Category:")
|
| 343 |
for cat, metrics in sorted(result['by_category'].items(),
|
| 344 |
key=lambda x: -x[1].get('n', 0)):
|
| 345 |
-
print_metrics(cat, metrics, indent=2)
|
| 346 |
|
| 347 |
if 'by_domain' in result:
|
| 348 |
print(f"\n By Domain:")
|
| 349 |
for dom, metrics in sorted(result['by_domain'].items(),
|
| 350 |
key=lambda x: -x[1].get('n', 0)):
|
| 351 |
-
print_metrics(dom, metrics, indent=2)
|
| 352 |
|
| 353 |
print()
|
| 354 |
|
|
|
|
| 18 |
|
| 19 |
from datasets import load_dataset
|
| 20 |
|
| 21 |
+
from metrics import (
|
| 22 |
+
anls_star,
|
| 23 |
+
anls_star_llm,
|
| 24 |
+
aggregate_anls_star_llm,
|
| 25 |
+
citation_f1,
|
| 26 |
+
kuiper_statistic,
|
| 27 |
+
wasted_effort_ratio
|
| 28 |
+
)
|
| 29 |
|
| 30 |
|
| 31 |
def derive_hop_type(evidence: list) -> str:
|
|
|
|
| 113 |
def evaluate_single(
|
| 114 |
result: Dict,
|
| 115 |
gold_by_text: Dict[str, Dict],
|
| 116 |
+
gold_by_id: Dict[str, Dict],
|
| 117 |
+
use_semantic: bool = False
|
| 118 |
) -> Optional[Dict[str, Any]]:
|
| 119 |
"""Evaluate a single prediction.
|
| 120 |
|
| 121 |
Matches by question text first, falls back to question ID if not found.
|
| 122 |
+
|
| 123 |
+
Args:
|
| 124 |
+
result: Prediction dict with 'question', 'answer', 'citations'
|
| 125 |
+
gold_by_text: Gold data indexed by question text
|
| 126 |
+
gold_by_id: Gold data indexed by question ID
|
| 127 |
+
use_semantic: If True, also compute semantic accuracy with LLM judge
|
| 128 |
"""
|
| 129 |
question = result.get('question', '').strip()
|
| 130 |
qid = result.get('id', '')
|
|
|
|
| 142 |
|
| 143 |
# ANLS*
|
| 144 |
anls = anls_star(answer, gold_data['answers'])
|
| 145 |
+
|
| 146 |
+
# Semantic accuracy with LLM judge (if enabled)
|
| 147 |
+
if use_semantic:
|
| 148 |
+
llm_result = anls_star_llm(answer, gold_data['answers'], question)
|
| 149 |
+
semantic = llm_result['score']
|
| 150 |
+
correct = semantic >= 0.5
|
| 151 |
+
else:
|
| 152 |
+
semantic = anls
|
| 153 |
+
correct = anls >= 0.5
|
| 154 |
|
| 155 |
# Citation F1
|
| 156 |
doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
|
|
|
|
| 163 |
return {
|
| 164 |
'question': question,
|
| 165 |
'anls': anls,
|
| 166 |
+
'semantic': semantic,
|
| 167 |
'correct': correct,
|
| 168 |
'doc_f1': doc_f1['f1'],
|
| 169 |
'page_f1': page_f1['f1'],
|
|
|
|
| 174 |
}
|
| 175 |
|
| 176 |
|
| 177 |
+
def aggregate_metrics(evals: List[Dict], use_semantic: bool = False) -> Dict[str, Any]:
|
| 178 |
"""Aggregate metrics across evaluations."""
|
| 179 |
if not evals:
|
| 180 |
return {}
|
|
|
|
| 185 |
mean_doc_f1 = sum(e['doc_f1'] for e in evals) / n
|
| 186 |
mean_page_f1 = sum(e['page_f1'] for e in evals) / n
|
| 187 |
|
| 188 |
+
# Semantic accuracy with bias correction
|
| 189 |
+
if use_semantic and 'semantic' in evals[0]:
|
| 190 |
+
semantic_scores = [e['semantic'] for e in evals]
|
| 191 |
+
agg = aggregate_anls_star_llm(semantic_scores, apply_bias_correction=True)
|
| 192 |
+
mean_semantic = agg['adjusted_score']
|
| 193 |
+
semantic_ci = (agg['ci_lower'], agg['ci_upper'])
|
| 194 |
+
else:
|
| 195 |
+
mean_semantic = mean_anls
|
| 196 |
+
semantic_ci = None
|
| 197 |
+
|
| 198 |
# Kuiper
|
| 199 |
kuiper = kuiper_statistic(evals)
|
| 200 |
wasted = wasted_effort_ratio(evals)
|
|
|
|
| 203 |
'n': n,
|
| 204 |
'accuracy': accuracy,
|
| 205 |
'mean_anls': mean_anls,
|
| 206 |
+
'mean_semantic': mean_semantic,
|
| 207 |
+
'semantic_ci': semantic_ci,
|
| 208 |
'doc_f1': mean_doc_f1,
|
| 209 |
'page_f1': mean_page_f1,
|
| 210 |
'kuiper_stat': kuiper['kuiper_stat'],
|
|
|
|
| 215 |
}
|
| 216 |
|
| 217 |
|
| 218 |
+
def print_metrics(name: str, metrics: Dict, indent: int = 0, use_semantic: bool = False):
|
| 219 |
"""Print metrics in a formatted way."""
|
| 220 |
prefix = " " * indent
|
| 221 |
|
|
|
|
| 224 |
return
|
| 225 |
|
| 226 |
print(f"{prefix}{name} (n={metrics['n']}):")
|
| 227 |
+
|
| 228 |
+
if use_semantic and 'mean_semantic' in metrics:
|
| 229 |
+
ci = metrics.get('semantic_ci')
|
| 230 |
+
ci_str = f" [{ci[0]:.2%}-{ci[1]:.2%}]" if ci else ""
|
| 231 |
+
print(f"{prefix} Semantic Accuracy: {metrics['mean_semantic']:.2%}{ci_str}")
|
| 232 |
+
print(f"{prefix} ANLS* (string): {metrics['mean_anls']:.4f}")
|
| 233 |
+
else:
|
| 234 |
+
print(f"{prefix} Accuracy (ANLS*≥0.5): {metrics['accuracy']:.1%}")
|
| 235 |
+
print(f"{prefix} Mean ANLS*: {metrics['mean_anls']:.4f}")
|
| 236 |
+
|
| 237 |
print(f"{prefix} Document F1: {metrics['doc_f1']:.4f}")
|
| 238 |
print(f"{prefix} Page F1: {metrics['page_f1']:.4f}")
|
| 239 |
|
|
|
|
| 250 |
gold_by_id: Dict[str, Dict],
|
| 251 |
by_category: bool = False,
|
| 252 |
by_domain: bool = False,
|
| 253 |
+
by_hop_type: bool = True,
|
| 254 |
+
use_semantic: bool = False
|
| 255 |
) -> Dict[str, Any]:
|
| 256 |
"""Evaluate a single results file."""
|
| 257 |
results = load_results(filepath)
|
| 258 |
|
| 259 |
evals = []
|
| 260 |
unmatched = 0
|
| 261 |
+
total = len(results)
|
| 262 |
|
| 263 |
+
for i, result in enumerate(results):
|
| 264 |
+
if use_semantic and (i + 1) % 50 == 0:
|
| 265 |
+
print(f" Processing {i+1}/{total}...")
|
| 266 |
+
ev = evaluate_single(result, gold_by_text, gold_by_id, use_semantic=use_semantic)
|
| 267 |
if ev:
|
| 268 |
evals.append(ev)
|
| 269 |
else:
|
|
|
|
| 273 |
print(f" Warning: {unmatched} questions not found in gold standard")
|
| 274 |
|
| 275 |
# Overall metrics
|
| 276 |
+
overall = aggregate_metrics(evals, use_semantic=use_semantic)
|
| 277 |
|
| 278 |
+
output = {'overall': overall, 'use_semantic': use_semantic}
|
| 279 |
|
| 280 |
# By hop type (always included by default)
|
| 281 |
if by_hop_type:
|
| 282 |
by_hop = defaultdict(list)
|
| 283 |
for e in evals:
|
| 284 |
by_hop[e.get('hop_type', 'single')].append(e)
|
| 285 |
+
output['by_hop_type'] = {hop: aggregate_metrics(items, use_semantic) for hop, items in sorted(by_hop.items())}
|
| 286 |
|
| 287 |
# By category
|
| 288 |
if by_category:
|
| 289 |
by_cat = defaultdict(list)
|
| 290 |
for e in evals:
|
| 291 |
by_cat[e['category'] or 'Unknown'].append(e)
|
| 292 |
+
output['by_category'] = {cat: aggregate_metrics(items, use_semantic) for cat, items in sorted(by_cat.items())}
|
| 293 |
|
| 294 |
# By domain
|
| 295 |
if by_domain:
|
| 296 |
by_dom = defaultdict(list)
|
| 297 |
for e in evals:
|
| 298 |
by_dom[e['domain'] or 'Other'].append(e)
|
| 299 |
+
output['by_domain'] = {dom: aggregate_metrics(items, use_semantic) for dom, items in sorted(by_dom.items())}
|
| 300 |
|
| 301 |
return output
|
| 302 |
|
|
|
|
| 320 |
parser.add_argument('--by-domain', action='store_true', help='Show metrics by domain')
|
| 321 |
parser.add_argument('--compare', action='store_true', help='Compare multiple models side-by-side')
|
| 322 |
parser.add_argument('--json', action='store_true', help='Output as JSON')
|
| 323 |
+
parser.add_argument('--semantic', action='store_true',
|
| 324 |
+
help='Use semantic accuracy (ANLS* + LLM judge) instead of pure ANLS*. Requires GOOGLE_API_KEY.')
|
| 325 |
|
| 326 |
args = parser.parse_args()
|
| 327 |
|
|
|
|
| 347 |
name = name[:-8]
|
| 348 |
|
| 349 |
print(f"\nEvaluating: {filepath.name}")
|
| 350 |
+
if args.semantic:
|
| 351 |
+
print(" Using semantic accuracy (ANLS* + LLM judge)...")
|
| 352 |
+
result = evaluate_file(
|
| 353 |
+
filepath, gold_by_text, gold_by_id,
|
| 354 |
+
args.by_category, args.by_domain,
|
| 355 |
+
use_semantic=args.semantic
|
| 356 |
+
)
|
| 357 |
all_results[name] = result
|
| 358 |
|
| 359 |
# Output
|
|
|
|
| 379 |
# Comparison table
|
| 380 |
models = list(all_results.keys())
|
| 381 |
|
| 382 |
+
if args.semantic:
|
| 383 |
+
print(f"\n{'Model':<35} {'Semantic':<10} {'ANLS*':<8} {'Doc F1':<8} {'Page F1':<8} {'Kuiper':<8}")
|
| 384 |
+
print("-" * 85)
|
| 385 |
+
|
| 386 |
+
for model in sorted(models, key=lambda m: -all_results[m]['overall'].get('mean_semantic', 0)):
|
| 387 |
+
m = all_results[model]['overall']
|
| 388 |
+
kuiper_str = f"{m['kuiper_stat']:.2f}" if not m.get('kuiper_degenerate') else "N/A"
|
| 389 |
+
print(f"{model:<35} {m.get('mean_semantic', 0):.1%} {m.get('mean_anls', 0):.4f} "
|
| 390 |
+
f"{m.get('doc_f1', 0):.4f} {m.get('page_f1', 0):.4f} {kuiper_str}")
|
| 391 |
+
else:
|
| 392 |
+
print(f"\n{'Model':<35} {'Acc':<8} {'ANLS*':<8} {'Doc F1':<8} {'Page F1':<8} {'Kuiper':<8}")
|
| 393 |
+
print("-" * 75)
|
| 394 |
+
|
| 395 |
+
for model in sorted(models, key=lambda m: -all_results[m]['overall'].get('accuracy', 0)):
|
| 396 |
+
m = all_results[model]['overall']
|
| 397 |
+
kuiper_str = f"{m['kuiper_stat']:.2f}" if not m.get('kuiper_degenerate') else "N/A"
|
| 398 |
+
print(f"{model:<35} {m.get('accuracy', 0):.1%} {m.get('mean_anls', 0):.4f} "
|
| 399 |
+
f"{m.get('doc_f1', 0):.4f} {m.get('page_f1', 0):.4f} {kuiper_str}")
|
| 400 |
else:
|
| 401 |
# Detailed per-model output
|
| 402 |
for model, result in all_results.items():
|
| 403 |
print(f"\n{'─' * 40}")
|
| 404 |
+
use_sem = result.get('use_semantic', False)
|
| 405 |
+
print_metrics(model, result['overall'], use_semantic=use_sem)
|
| 406 |
|
| 407 |
if 'by_category' in result:
|
| 408 |
print(f"\n By Category:")
|
| 409 |
for cat, metrics in sorted(result['by_category'].items(),
|
| 410 |
key=lambda x: -x[1].get('n', 0)):
|
| 411 |
+
print_metrics(cat, metrics, indent=2, use_semantic=use_sem)
|
| 412 |
|
| 413 |
if 'by_domain' in result:
|
| 414 |
print(f"\n By Domain:")
|
| 415 |
for dom, metrics in sorted(result['by_domain'].items(),
|
| 416 |
key=lambda x: -x[1].get('n', 0)):
|
| 417 |
+
print_metrics(dom, metrics, indent=2, use_semantic=use_sem)
|
| 418 |
|
| 419 |
print()
|
| 420 |
|
eval/metrics.py
CHANGED
|
@@ -3,15 +3,180 @@ Core evaluation metrics for document QA.
|
|
| 3 |
|
| 4 |
Metrics:
|
| 5 |
- ANLS*: Answer-level Normalized Levenshtein Similarity
|
|
|
|
| 6 |
- Citation F1: Document-level and Page-level F1 scores
|
| 7 |
- Kuiper Statistic: Effort-accuracy calibration measure
|
|
|
|
|
|
|
|
|
|
| 8 |
"""
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
import numpy as np
|
|
|
|
| 12 |
from anls_star import anls_score
|
| 13 |
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
def anls_star(predicted: Any, ground_truths: List[List[str]]) -> float:
|
| 16 |
"""
|
| 17 |
Calculate ANLS* score (case-insensitive).
|
|
@@ -49,6 +214,340 @@ def anls_star(predicted: Any, ground_truths: List[List[str]]) -> float:
|
|
| 49 |
return max_score
|
| 50 |
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
def citation_f1(
|
| 53 |
predicted_citations: List[Dict[str, Any]],
|
| 54 |
gold_locations: List[Dict[str, Any]],
|
|
|
|
| 3 |
|
| 4 |
Metrics:
|
| 5 |
- ANLS*: Answer-level Normalized Levenshtein Similarity
|
| 6 |
+
- ANLS*+LLM: ANLS* with LLM fallback for semantic equivalence
|
| 7 |
- Citation F1: Document-level and Page-level F1 scores
|
| 8 |
- Kuiper Statistic: Effort-accuracy calibration measure
|
| 9 |
+
|
| 10 |
+
Bias Correction:
|
| 11 |
+
Based on "How to Correctly Report LLM-as-a-Judge Evaluations" (2511.21140v2)
|
| 12 |
"""
|
| 13 |
|
| 14 |
+
import json
|
| 15 |
+
import os
|
| 16 |
+
import time
|
| 17 |
+
from math import sqrt
|
| 18 |
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
| 19 |
import numpy as np
|
| 20 |
+
from scipy.stats import norm
|
| 21 |
from anls_star import anls_score
|
| 22 |
|
| 23 |
|
| 24 |
+
# ============================================================================
|
| 25 |
+
# LLM Judge Calibration (from human evaluation)
|
| 26 |
+
# ============================================================================
|
| 27 |
+
|
| 28 |
+
# Calibration values from 200-sample human evaluation
|
| 29 |
+
# Sensitivity: P(LLM=correct | Human=correct)
|
| 30 |
+
LLM_JUDGE_SENSITIVITY = 0.980 # q1
|
| 31 |
+
# Specificity: P(LLM=incorrect | Human=incorrect)
|
| 32 |
+
LLM_JUDGE_SPECIFICITY = 1.000 # q0
|
| 33 |
+
# Calibration sample sizes (for confidence intervals)
|
| 34 |
+
LLM_JUDGE_CALIBRATION_M1 = 152 # samples where human=correct
|
| 35 |
+
LLM_JUDGE_CALIBRATION_M0 = 48 # samples where human=incorrect
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def bias_adjusted_score(
|
| 39 |
+
raw_score: float,
|
| 40 |
+
q0: float = LLM_JUDGE_SPECIFICITY,
|
| 41 |
+
q1: float = LLM_JUDGE_SENSITIVITY
|
| 42 |
+
) -> float:
|
| 43 |
+
"""
|
| 44 |
+
Compute bias-adjusted score using Rogan-Gladen correction.
|
| 45 |
+
|
| 46 |
+
From "How to Correctly Report LLM-as-a-Judge Evaluations":
|
| 47 |
+
θ̂ = (p̂ + q₀ - 1) / (q₀ + q₁ - 1)
|
| 48 |
+
|
| 49 |
+
Args:
|
| 50 |
+
raw_score: Raw LLM judgment score (p̂)
|
| 51 |
+
q0: Specificity - P(LLM=incorrect | true=incorrect)
|
| 52 |
+
q1: Sensitivity - P(LLM=correct | true=correct)
|
| 53 |
+
|
| 54 |
+
Returns:
|
| 55 |
+
Bias-adjusted score, clipped to [0, 1]
|
| 56 |
+
"""
|
| 57 |
+
if q0 + q1 <= 1:
|
| 58 |
+
# Degenerate case - judge is no better than random
|
| 59 |
+
return raw_score
|
| 60 |
+
|
| 61 |
+
adjusted = (raw_score + q0 - 1) / (q0 + q1 - 1)
|
| 62 |
+
return max(0.0, min(1.0, adjusted))
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def standard_error(
|
| 66 |
+
raw_score: float,
|
| 67 |
+
n_samples: int,
|
| 68 |
+
q0: float = LLM_JUDGE_SPECIFICITY,
|
| 69 |
+
q1: float = LLM_JUDGE_SENSITIVITY
|
| 70 |
+
) -> float:
|
| 71 |
+
"""
|
| 72 |
+
Compute bias-adjusted standard error.
|
| 73 |
+
|
| 74 |
+
SE is scaled by the bias adjustment factor to account for
|
| 75 |
+
the transformation from raw to adjusted score.
|
| 76 |
+
|
| 77 |
+
Args:
|
| 78 |
+
raw_score: Raw LLM judgment score (p̂)
|
| 79 |
+
n_samples: Number of test samples
|
| 80 |
+
q0: Specificity
|
| 81 |
+
q1: Sensitivity
|
| 82 |
+
|
| 83 |
+
Returns:
|
| 84 |
+
Bias-adjusted standard error
|
| 85 |
+
"""
|
| 86 |
+
if n_samples <= 0 or q0 + q1 <= 1:
|
| 87 |
+
return 0.0
|
| 88 |
+
|
| 89 |
+
# Raw binomial SE
|
| 90 |
+
p = raw_score
|
| 91 |
+
se_raw = sqrt(p * (1 - p) / n_samples) if 0 < p < 1 else 0
|
| 92 |
+
|
| 93 |
+
# Scale by bias adjustment factor
|
| 94 |
+
se_adjusted = se_raw / (q0 + q1 - 1)
|
| 95 |
+
|
| 96 |
+
return se_adjusted
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def confidence_interval(
|
| 100 |
+
raw_score: float,
|
| 101 |
+
n_samples: int,
|
| 102 |
+
q0: float = LLM_JUDGE_SPECIFICITY,
|
| 103 |
+
q1: float = LLM_JUDGE_SENSITIVITY,
|
| 104 |
+
m0: int = LLM_JUDGE_CALIBRATION_M0,
|
| 105 |
+
m1: int = LLM_JUDGE_CALIBRATION_M1,
|
| 106 |
+
alpha: float = 0.05
|
| 107 |
+
) -> Tuple[float, float]:
|
| 108 |
+
"""
|
| 109 |
+
Compute confidence interval for bias-adjusted score.
|
| 110 |
+
|
| 111 |
+
Simplified version that uses observed q0, q1 directly when calibration
|
| 112 |
+
is high quality (q0 + q1 > 1.9). Falls back to full formula otherwise.
|
| 113 |
+
|
| 114 |
+
Args:
|
| 115 |
+
raw_score: Raw LLM judgment score (p̂)
|
| 116 |
+
n_samples: Number of test samples
|
| 117 |
+
q0: Specificity
|
| 118 |
+
q1: Sensitivity
|
| 119 |
+
m0: Calibration samples where human=incorrect
|
| 120 |
+
m1: Calibration samples where human=correct
|
| 121 |
+
alpha: Significance level (default 0.05 for 95% CI)
|
| 122 |
+
|
| 123 |
+
Returns:
|
| 124 |
+
Tuple of (lower_bound, upper_bound)
|
| 125 |
+
"""
|
| 126 |
+
z = norm.ppf(1 - alpha / 2)
|
| 127 |
+
|
| 128 |
+
# For high-quality calibration (q0 + q1 > 1.9), use simplified CI
|
| 129 |
+
# that trusts the observed sensitivity/specificity
|
| 130 |
+
if q0 + q1 > 1.9:
|
| 131 |
+
# Bias-adjusted point estimate
|
| 132 |
+
theta = bias_adjusted_score(raw_score, q0, q1)
|
| 133 |
+
|
| 134 |
+
# Simple binomial SE for the test dataset only
|
| 135 |
+
# (calibration is trusted to be accurate)
|
| 136 |
+
p = raw_score
|
| 137 |
+
se_raw = sqrt(p * (1 - p) / n_samples) if n_samples > 0 else 0
|
| 138 |
+
|
| 139 |
+
# Scale SE by the bias adjustment factor
|
| 140 |
+
se_adjusted = se_raw / (q0 + q1 - 1)
|
| 141 |
+
|
| 142 |
+
lower = max(0.0, theta - z * se_adjusted)
|
| 143 |
+
upper = min(1.0, theta + z * se_adjusted)
|
| 144 |
+
return (lower, upper)
|
| 145 |
+
|
| 146 |
+
# Full formula with regularization for lower-quality calibration
|
| 147 |
+
p = (n_samples * raw_score + z**2 / 2) / (n_samples + z**2)
|
| 148 |
+
q0_adj = (m0 * q0 + 1) / (m0 + 2)
|
| 149 |
+
q1_adj = (m1 * q1 + 1) / (m1 + 2)
|
| 150 |
+
|
| 151 |
+
n_adj = n_samples + z**2
|
| 152 |
+
m0_adj = m0 + 2
|
| 153 |
+
m1_adj = m1 + 2
|
| 154 |
+
|
| 155 |
+
# Point estimate
|
| 156 |
+
if q0_adj + q1_adj <= 1:
|
| 157 |
+
return (0.0, 1.0)
|
| 158 |
+
|
| 159 |
+
theta = (p + q0_adj - 1) / (q0_adj + q1_adj - 1)
|
| 160 |
+
|
| 161 |
+
# Bias correction term
|
| 162 |
+
dth = 2 * z**2 * (
|
| 163 |
+
-(1 - theta) * q0_adj * (1 - q0_adj) / m0_adj
|
| 164 |
+
+ theta * q1_adj * (1 - q1_adj) / m1_adj
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
# Standard error
|
| 168 |
+
se = sqrt(
|
| 169 |
+
p * (1 - p) / n_adj
|
| 170 |
+
+ (1 - theta)**2 * q0_adj * (1 - q0_adj) / m0_adj
|
| 171 |
+
+ theta**2 * q1_adj * (1 - q1_adj) / m1_adj
|
| 172 |
+
) / (q0_adj + q1_adj - 1)
|
| 173 |
+
|
| 174 |
+
lower = max(0.0, theta + dth - z * se)
|
| 175 |
+
upper = min(1.0, theta + dth + z * se)
|
| 176 |
+
|
| 177 |
+
return (lower, upper)
|
| 178 |
+
|
| 179 |
+
|
| 180 |
def anls_star(predicted: Any, ground_truths: List[List[str]]) -> float:
|
| 181 |
"""
|
| 182 |
Calculate ANLS* score (case-insensitive).
|
|
|
|
| 214 |
return max_score
|
| 215 |
|
| 216 |
|
| 217 |
+
# ============================================================================
|
| 218 |
+
# ANLS* + LLM Judge Metric
|
| 219 |
+
# ============================================================================
|
| 220 |
+
|
| 221 |
+
_GEVAL_PROMPT_TEMPLATE = """You are evaluating answer correctness for a Document QA benchmark.
|
| 222 |
+
|
| 223 |
+
## Input
|
| 224 |
+
Question: {question}
|
| 225 |
+
Predicted Answer: {predicted}
|
| 226 |
+
Gold Answer Variants: {gold_variants}
|
| 227 |
+
|
| 228 |
+
## Evaluation Criteria
|
| 229 |
+
|
| 230 |
+
**correct**: Predicted answer is semantically equivalent to at least one gold variant. Minor format differences are acceptable.
|
| 231 |
+
|
| 232 |
+
**partial**: Predicted answer contains correct core information but has a significant format issue (e.g., list presented as comma-separated string when items are short/atomic) OR includes irrelevant additions.
|
| 233 |
+
|
| 234 |
+
**incorrect**: Predicted answer is factually wrong, missing, contains different information, or fails to answer the question type (e.g., no Yes/No for binary questions). Missing unit qualifiers that change magnitude (thousands, millions) are incorrect.
|
| 235 |
+
|
| 236 |
+
## Evaluation Steps
|
| 237 |
+
|
| 238 |
+
Follow these steps in order:
|
| 239 |
+
|
| 240 |
+
Step 1 - Check for refusal: Does the answer refuse or claim inability to answer? If yes → incorrect.
|
| 241 |
+
|
| 242 |
+
Step 2 - Compare content: Does the predicted answer match the core meaning of any gold variant? If content is wrong or different → incorrect.
|
| 243 |
+
|
| 244 |
+
Step 3 - Check critical errors (any of these → incorrect):
|
| 245 |
+
- Missing scale qualifiers that change magnitude: "50" vs "$50 million" → incorrect
|
| 246 |
+
- Binary questions without explicit Yes/No: Q: "Is X true?" A: "X is observed" → incorrect (must say Yes or No)
|
| 247 |
+
- Wrong entity/value: different person, company, number than gold → incorrect
|
| 248 |
+
- Partial list with wrong items mixed in: some correct + some wrong items → incorrect
|
| 249 |
+
|
| 250 |
+
Step 4 - Check format (only if content is correct):
|
| 251 |
+
- If gold expects multiple items AND predicted is a comma-separated string (not a list) → partial
|
| 252 |
+
- If gold expects single item → no format issue possible
|
| 253 |
+
|
| 254 |
+
Step 5 - Check verbosity (only if content is correct):
|
| 255 |
+
- CORRECT (acceptable verbosity):
|
| 256 |
+
* Extra qualifiers: "three security questions" when gold is "3" → correct
|
| 257 |
+
* Relevant context: "No — Massachusetts; Washington" for "same state?" question → correct
|
| 258 |
+
* Clarifying phrases: "in his personal capacity", "per annum" → correct
|
| 259 |
+
- PARTIAL (medium verbosity) - ONLY when additions are truly irrelevant:
|
| 260 |
+
* Adding unrequested details to list items
|
| 261 |
+
* Over-specific precision: date+time when only date asked → partial
|
| 262 |
+
- INCORRECT (high verbosity):
|
| 263 |
+
* Multi-sentence responses when a word/phrase suffices
|
| 264 |
+
* Full paragraphs of explanation
|
| 265 |
+
* Conversational preambles: "Based on the document...", "The answer is..."
|
| 266 |
+
|
| 267 |
+
Based on your step-by-step analysis, provide your final judgment.
|
| 268 |
+
|
| 269 |
+
After your reasoning, you MUST call submit_judgment with your final decision."""
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
_LLM_JUDGE_TOOL = {
|
| 273 |
+
"function_declarations": [{
|
| 274 |
+
"name": "submit_judgment",
|
| 275 |
+
"description": "Submit your final judgment after reasoning through the evaluation steps",
|
| 276 |
+
"parameters": {
|
| 277 |
+
"type": "object",
|
| 278 |
+
"properties": {
|
| 279 |
+
"judgment": {
|
| 280 |
+
"type": "string",
|
| 281 |
+
"enum": ["correct", "partial", "incorrect"],
|
| 282 |
+
"description": "Final judgment: correct, partial, or incorrect"
|
| 283 |
+
},
|
| 284 |
+
"main_issue": {
|
| 285 |
+
"type": "string",
|
| 286 |
+
"enum": ["none", "refusal", "wrong_content", "missing_unit", "no_yes_no", "list_format", "verbosity_medium", "verbosity_high"],
|
| 287 |
+
"description": "The primary issue found, if any"
|
| 288 |
+
},
|
| 289 |
+
"explanation": {
|
| 290 |
+
"type": "string",
|
| 291 |
+
"description": "Brief explanation of your judgment"
|
| 292 |
+
}
|
| 293 |
+
},
|
| 294 |
+
"required": ["judgment", "main_issue", "explanation"]
|
| 295 |
+
}
|
| 296 |
+
}]
|
| 297 |
+
}
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def _get_gemini_model():
|
| 301 |
+
"""Initialize Gemini model (lazy loading)."""
|
| 302 |
+
import google.generativeai as genai
|
| 303 |
+
api_key = os.environ.get("GOOGLE_API_KEY")
|
| 304 |
+
if not api_key:
|
| 305 |
+
raise ValueError("GOOGLE_API_KEY environment variable not set")
|
| 306 |
+
genai.configure(api_key=api_key)
|
| 307 |
+
return genai.GenerativeModel('gemini-2.5-flash')
|
| 308 |
+
|
| 309 |
+
|
| 310 |
+
def _call_gemini_with_timeout(model, prompt, timeout=30):
|
| 311 |
+
"""Call Gemini with a timeout using threading."""
|
| 312 |
+
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
|
| 313 |
+
|
| 314 |
+
def _call():
|
| 315 |
+
return model.generate_content(
|
| 316 |
+
prompt,
|
| 317 |
+
tools=[_LLM_JUDGE_TOOL],
|
| 318 |
+
tool_config={"function_calling_config": {"mode": "ANY"}},
|
| 319 |
+
request_options={"timeout": timeout}
|
| 320 |
+
)
|
| 321 |
+
|
| 322 |
+
with ThreadPoolExecutor(max_workers=1) as executor:
|
| 323 |
+
future = executor.submit(_call)
|
| 324 |
+
try:
|
| 325 |
+
return future.result(timeout=timeout)
|
| 326 |
+
except FuturesTimeoutError:
|
| 327 |
+
raise TimeoutError(f"Gemini API call timed out after {timeout}s")
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
def _call_llm_judge(
|
| 331 |
+
question: str,
|
| 332 |
+
predicted: Any,
|
| 333 |
+
gold_variants: List[List[str]],
|
| 334 |
+
max_retries: int = 3,
|
| 335 |
+
retry_delay: float = 1.0,
|
| 336 |
+
timeout: float = 30.0
|
| 337 |
+
) -> Dict[str, Any]:
|
| 338 |
+
"""
|
| 339 |
+
Call Gemini LLM judge with retries and timeout.
|
| 340 |
+
|
| 341 |
+
Returns:
|
| 342 |
+
Dict with 'judgment', 'main_issue', 'explanation', 'score'
|
| 343 |
+
"""
|
| 344 |
+
prompt = _GEVAL_PROMPT_TEMPLATE.format(
|
| 345 |
+
question=question,
|
| 346 |
+
predicted=json.dumps(predicted),
|
| 347 |
+
gold_variants=json.dumps(gold_variants)
|
| 348 |
+
)
|
| 349 |
+
|
| 350 |
+
model = _get_gemini_model()
|
| 351 |
+
|
| 352 |
+
for attempt in range(max_retries):
|
| 353 |
+
try:
|
| 354 |
+
response = _call_gemini_with_timeout(model, prompt, timeout=timeout)
|
| 355 |
+
|
| 356 |
+
# Extract function call result
|
| 357 |
+
if response.candidates and response.candidates[0].content.parts:
|
| 358 |
+
for part in response.candidates[0].content.parts:
|
| 359 |
+
if hasattr(part, 'function_call') and part.function_call.name == "submit_judgment":
|
| 360 |
+
args = dict(part.function_call.args)
|
| 361 |
+
judgment = args.get('judgment', 'incorrect')
|
| 362 |
+
|
| 363 |
+
# Map judgment to score
|
| 364 |
+
score_map = {'correct': 1.0, 'partial': 0.5, 'incorrect': 0.0}
|
| 365 |
+
args['score'] = score_map.get(judgment, 0.0)
|
| 366 |
+
return args
|
| 367 |
+
|
| 368 |
+
# No function call found - retry
|
| 369 |
+
if attempt < max_retries - 1:
|
| 370 |
+
time.sleep(retry_delay)
|
| 371 |
+
continue
|
| 372 |
+
|
| 373 |
+
except TimeoutError as e:
|
| 374 |
+
if attempt < max_retries - 1:
|
| 375 |
+
time.sleep(retry_delay)
|
| 376 |
+
continue
|
| 377 |
+
return {
|
| 378 |
+
'judgment': 'error',
|
| 379 |
+
'main_issue': 'timeout',
|
| 380 |
+
'explanation': str(e),
|
| 381 |
+
'score': 0.0
|
| 382 |
+
}
|
| 383 |
+
except Exception as e:
|
| 384 |
+
if attempt < max_retries - 1:
|
| 385 |
+
time.sleep(retry_delay * (attempt + 1)) # Exponential backoff
|
| 386 |
+
continue
|
| 387 |
+
return {
|
| 388 |
+
'judgment': 'error',
|
| 389 |
+
'main_issue': 'error',
|
| 390 |
+
'explanation': str(e),
|
| 391 |
+
'score': 0.0
|
| 392 |
+
}
|
| 393 |
+
|
| 394 |
+
return {
|
| 395 |
+
'judgment': 'error',
|
| 396 |
+
'main_issue': 'parse_error',
|
| 397 |
+
'explanation': 'Failed to get valid response after retries',
|
| 398 |
+
'score': 0.0
|
| 399 |
+
}
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
def anls_star_llm(
|
| 403 |
+
predicted: Any,
|
| 404 |
+
ground_truths: List[List[str]],
|
| 405 |
+
question: str = "",
|
| 406 |
+
threshold: float = 1.0
|
| 407 |
+
) -> Dict[str, Any]:
|
| 408 |
+
"""
|
| 409 |
+
ANLS* with LLM fallback for semantic equivalence checking.
|
| 410 |
+
|
| 411 |
+
If ANLS* >= threshold (default 1.0), returns ANLS* score.
|
| 412 |
+
Otherwise, calls Gemini LLM judge to evaluate semantic correctness.
|
| 413 |
+
|
| 414 |
+
Args:
|
| 415 |
+
predicted: Predicted answer (string or list)
|
| 416 |
+
ground_truths: List of answer variants
|
| 417 |
+
question: The question text (needed for LLM judge)
|
| 418 |
+
threshold: ANLS* threshold above which to skip LLM (default 1.0)
|
| 419 |
+
|
| 420 |
+
Returns:
|
| 421 |
+
Dict with:
|
| 422 |
+
- 'score': Final score (0.0, 0.5, or 1.0)
|
| 423 |
+
- 'anls_score': Raw ANLS* score
|
| 424 |
+
- 'used_llm': Whether LLM judge was called
|
| 425 |
+
- 'llm_judgment': LLM judgment details (if used)
|
| 426 |
+
"""
|
| 427 |
+
# Check for empty prediction (optimization: skip LLM, return 0)
|
| 428 |
+
is_empty = (
|
| 429 |
+
predicted is None
|
| 430 |
+
or predicted == ""
|
| 431 |
+
or predicted == []
|
| 432 |
+
or (isinstance(predicted, list) and all(not p for p in predicted))
|
| 433 |
+
)
|
| 434 |
+
|
| 435 |
+
if is_empty:
|
| 436 |
+
return {
|
| 437 |
+
'score': 0.0,
|
| 438 |
+
'anls_score': 0.0,
|
| 439 |
+
'used_llm': False,
|
| 440 |
+
'llm_judgment': {'judgment': 'incorrect', 'main_issue': 'empty', 'explanation': 'Empty prediction'}
|
| 441 |
+
}
|
| 442 |
+
|
| 443 |
+
# Check for overly long answers (optimization: skip LLM, return 0)
|
| 444 |
+
MAX_ANSWER_LENGTH = 2000
|
| 445 |
+
try:
|
| 446 |
+
answer_length = len(json.dumps(predicted))
|
| 447 |
+
except (TypeError, ValueError):
|
| 448 |
+
answer_length = len(str(predicted))
|
| 449 |
+
|
| 450 |
+
if answer_length > MAX_ANSWER_LENGTH:
|
| 451 |
+
return {
|
| 452 |
+
'score': 0.0,
|
| 453 |
+
'anls_score': 0.0,
|
| 454 |
+
'used_llm': False,
|
| 455 |
+
'llm_judgment': {
|
| 456 |
+
'judgment': 'incorrect',
|
| 457 |
+
'main_issue': 'too_long',
|
| 458 |
+
'explanation': f'Answer too long ({answer_length} chars > {MAX_ANSWER_LENGTH})'
|
| 459 |
+
}
|
| 460 |
+
}
|
| 461 |
+
|
| 462 |
+
# Check ANLS*
|
| 463 |
+
anls = anls_star(predicted, ground_truths)
|
| 464 |
+
|
| 465 |
+
result = {
|
| 466 |
+
'score': anls,
|
| 467 |
+
'anls_score': anls,
|
| 468 |
+
'used_llm': False,
|
| 469 |
+
'llm_judgment': None
|
| 470 |
+
}
|
| 471 |
+
|
| 472 |
+
# If ANLS* is perfect, no need for LLM
|
| 473 |
+
if anls >= threshold:
|
| 474 |
+
result['score'] = 1.0
|
| 475 |
+
return result
|
| 476 |
+
|
| 477 |
+
# Call LLM judge for cases where ANLS* < threshold
|
| 478 |
+
if question:
|
| 479 |
+
llm_result = _call_llm_judge(question, predicted, ground_truths)
|
| 480 |
+
result['used_llm'] = True
|
| 481 |
+
result['llm_judgment'] = llm_result
|
| 482 |
+
result['score'] = llm_result.get('score', 0.0)
|
| 483 |
+
|
| 484 |
+
return result
|
| 485 |
+
|
| 486 |
+
|
| 487 |
+
def aggregate_anls_star_llm(
    scores: List[float],
    apply_bias_correction: bool = True
) -> Dict[str, Any]:
    """
    Compute aggregate ANLS*+LLM score with optional bias correction.

    Based on "How to Correctly Report LLM-as-a-Judge Evaluations" (2511.21140v2).

    Args:
        scores: List of individual ANLS*+LLM scores (0.0, 0.5, or 1.0)
        apply_bias_correction: Whether to apply Rogan-Gladen correction

    Returns:
        Dict with:
            - 'raw_score': Mean of raw scores
            - 'adjusted_score': Bias-adjusted score (if correction applied)
            - 'se': Bias-adjusted standard error
            - 'ci_lower': 95% CI lower bound
            - 'ci_upper': 95% CI upper bound
            - 'n_samples': Number of samples
            - 'q0': Specificity used
            - 'q1': Sensitivity used
    """
    if not scores:
        return {
            'raw_score': 0.0,
            'adjusted_score': 0.0,
            'se': 0.0,
            'ci_lower': 0.0,
            'ci_upper': 0.0,
            'n_samples': 0,
            'q0': LLM_JUDGE_SPECIFICITY,
            'q1': LLM_JUDGE_SENSITIVITY
        }

    n = len(scores)
    raw = sum(scores) / n

    result = {
        'raw_score': raw,
        'n_samples': n,
        'q0': LLM_JUDGE_SPECIFICITY,
        'q1': LLM_JUDGE_SENSITIVITY
    }

    if apply_bias_correction:
        result['adjusted_score'] = bias_adjusted_score(raw)
        result['se'] = standard_error(raw, n)
        ci = confidence_interval(raw, n)
        result['ci_lower'] = ci[0]
        result['ci_upper'] = ci[1]
    else:
        # Simple binomial SE / normal-approximation CI without calibration
        # correction. raw * (1 - raw) == 0 at the boundaries (raw in {0, 1}),
        # so no guard beyond n > 0 is needed; the original computed this
        # quantity twice with slightly different guards — the value is the
        # same, so it is computed once here.
        result['adjusted_score'] = raw
        se = sqrt(raw * (1 - raw) / n) if n > 0 else 0.0
        z = 1.96  # z-score for a 95% confidence interval
        result['se'] = se
        result['ci_lower'] = max(0.0, raw - z * se)
        result['ci_upper'] = min(1.0, raw + z * se)

    return result
|
| 549 |
+
|
| 550 |
+
|
| 551 |
def citation_f1(
|
| 552 |
predicted_citations: List[Dict[str, Any]],
|
| 553 |
gold_locations: List[Dict[str, Any]],
|
eval/reevaluate_submissions.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Re-evaluate existing submissions with the new Semantic Accuracy metric.

This script:
1. Downloads prediction files from HuggingFace Hub
2. Re-evaluates them with ANLS* + LLM judge
3. Updates the results files with new metrics
"""

import json
import os
import sys
from pathlib import Path
from datetime import datetime, timezone

from huggingface_hub import HfApi, hf_hub_download, list_repo_files
from datasets import load_dataset

# Add parent for imports
sys.path.insert(0, str(Path(__file__).parent))
from metrics import (
    anls_star,
    anls_star_llm,
    aggregate_anls_star_llm,
    citation_f1,
    kuiper_statistic
)

# Config
# HF dataset repo that stores submitted prediction JSONL files.
RESULTS_REPO = "agentic-document-ai/backend-results"
# May be None; Hub calls then run unauthenticated and will fail on
# private repos — TODO confirm the deployment always sets HF_TOKEN.
TOKEN = os.environ.get("HF_TOKEN")
|
| 34 |
+
|
| 35 |
+
def load_gold_data():
    """Load the gold-standard test split and index it two ways.

    Returns:
        Tuple of (gold_by_id, gold_by_text): the same record objects keyed
        by example id and by whitespace-stripped question text.
    """
    print("Loading gold standard...")
    dataset = load_dataset("agentic-document-ai/dataset-PRIVATE", split="test")

    by_id = {}
    by_text = {}

    for example in dataset:
        normalized_question = example['question'].strip()
        record = {
            'question': normalized_question,
            'answers': example.get('answer_variants', []),
            'evidence': example.get('evidence', []),
            'category': example.get('category', ''),
            'domain': example.get('domain', ''),
            'hop_type': example.get('hop_type', 'single'),
        }
        # Both indexes share the same record object on purpose.
        by_id[example.get('id', '')] = record
        by_text[normalized_question] = record

    return by_id, by_text
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def find_prediction_files():
    """Find all prediction JSONL files in the results repo.

    Returns:
        List of repo-relative paths whose name ends with
        '_predictions.jsonl' or contains '_predictions_'.
    """
    # list_repo_files is a module-level helper; the original also created
    # an unused HfApi() instance, which has been removed.
    files = list_repo_files(RESULTS_REPO, repo_type="dataset", token=TOKEN)

    prediction_files = [f for f in files if f.endswith('_predictions.jsonl') or '_predictions_' in f]
    return prediction_files
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def download_predictions(filepath: str) -> list:
    """Download and parse a predictions file.

    Args:
        filepath: Repo-relative path of the JSONL predictions file.

    Returns:
        List of prediction dicts, one per non-blank line.
    """
    local_path = hf_hub_download(
        repo_id=RESULTS_REPO,
        filename=filepath,
        repo_type="dataset",
        token=TOKEN
    )

    predictions = []
    # JSON text is UTF-8; fix the encoding explicitly instead of relying on
    # the platform default (the original used the locale default, which can
    # mis-decode on non-UTF-8 systems).
    with open(local_path, encoding="utf-8") as f:
        for line in f:
            if line.strip():
                predictions.append(json.loads(line))
    return predictions
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def evaluate_with_semantic(predictions: list, gold_by_id: dict, gold_by_text: dict) -> dict:
|
| 87 |
+
"""Evaluate predictions with semantic accuracy metric."""
|
| 88 |
+
from collections import defaultdict
|
| 89 |
+
|
| 90 |
+
evals = []
|
| 91 |
+
unmatched = 0
|
| 92 |
+
|
| 93 |
+
total = len(predictions)
|
| 94 |
+
for i, pred in enumerate(predictions):
|
| 95 |
+
if (i + 1) % 50 == 0:
|
| 96 |
+
print(f" Processing {i+1}/{total}...")
|
| 97 |
+
|
| 98 |
+
question = pred.get('question', '').strip()
|
| 99 |
+
qid = pred.get('id', '')
|
| 100 |
+
|
| 101 |
+
# Match to gold
|
| 102 |
+
gold_data = None
|
| 103 |
+
if question in gold_by_text:
|
| 104 |
+
gold_data = gold_by_text[question]
|
| 105 |
+
elif qid and qid in gold_by_id:
|
| 106 |
+
gold_data = gold_by_id[qid]
|
| 107 |
+
|
| 108 |
+
if not gold_data:
|
| 109 |
+
unmatched += 1
|
| 110 |
+
continue
|
| 111 |
+
|
| 112 |
+
answer = pred.get('answer', '')
|
| 113 |
+
citations = pred.get('citations', [])
|
| 114 |
+
search_history = pred.get('search_history', [])
|
| 115 |
+
steps = len(search_history) if search_history else pred.get('iterations', 0)
|
| 116 |
+
|
| 117 |
+
# Calculate metrics
|
| 118 |
+
anls = anls_star(answer, gold_data['answers'])
|
| 119 |
+
|
| 120 |
+
# Semantic accuracy with LLM judge
|
| 121 |
+
llm_result = anls_star_llm(answer, gold_data['answers'], question)
|
| 122 |
+
semantic_score = llm_result['score']
|
| 123 |
+
|
| 124 |
+
doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
|
| 125 |
+
page_f1 = citation_f1(citations, gold_data['evidence'], level='page')
|
| 126 |
+
|
| 127 |
+
evals.append({
|
| 128 |
+
'anls': anls,
|
| 129 |
+
'semantic_score': semantic_score,
|
| 130 |
+
'correct': semantic_score >= 0.5,
|
| 131 |
+
'doc_f1': doc_f1['f1'],
|
| 132 |
+
'page_f1': page_f1['f1'],
|
| 133 |
+
'steps': steps,
|
| 134 |
+
'hop_type': gold_data.get('hop_type', 'single'),
|
| 135 |
+
'category': gold_data['category'],
|
| 136 |
+
'domain': gold_data['domain']
|
| 137 |
+
})
|
| 138 |
+
|
| 139 |
+
if not evals:
|
| 140 |
+
return None
|
| 141 |
+
|
| 142 |
+
# Aggregate
|
| 143 |
+
n = len(evals)
|
| 144 |
+
semantic_scores = [e['semantic_score'] for e in evals]
|
| 145 |
+
|
| 146 |
+
agg = aggregate_anls_star_llm(semantic_scores, apply_bias_correction=True)
|
| 147 |
+
mean_semantic = agg['adjusted_score'] * 100
|
| 148 |
+
|
| 149 |
+
mean_anls = sum(e['anls'] for e in evals) / n * 100
|
| 150 |
+
mean_doc_f1 = sum(e['doc_f1'] for e in evals) / n * 100
|
| 151 |
+
mean_page_f1 = sum(e['page_f1'] for e in evals) / n * 100
|
| 152 |
+
|
| 153 |
+
kuiper = kuiper_statistic(evals)
|
| 154 |
+
|
| 155 |
+
# By hop type
|
| 156 |
+
single_hop = [e for e in evals if e['hop_type'] == 'single']
|
| 157 |
+
cross_page = [e for e in evals if e['hop_type'] == 'cross_page']
|
| 158 |
+
cross_doc = [e for e in evals if e['hop_type'] == 'cross_doc']
|
| 159 |
+
|
| 160 |
+
# By domain
|
| 161 |
+
by_domain = defaultdict(list)
|
| 162 |
+
for e in evals:
|
| 163 |
+
domain = e['domain'] or 'Other'
|
| 164 |
+
by_domain[domain].append(e)
|
| 165 |
+
|
| 166 |
+
domain_scores = {}
|
| 167 |
+
for domain, domain_evals in sorted(by_domain.items()):
|
| 168 |
+
domain_scores[domain] = {
|
| 169 |
+
'semantic': sum(e['semantic_score'] for e in domain_evals) / len(domain_evals) * 100,
|
| 170 |
+
'anls': sum(e['anls'] for e in domain_evals) / len(domain_evals) * 100,
|
| 171 |
+
'n': len(domain_evals)
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
return {
|
| 175 |
+
'overall': {
|
| 176 |
+
'semantic': mean_semantic,
|
| 177 |
+
'anls': mean_anls,
|
| 178 |
+
'page_f1': mean_page_f1,
|
| 179 |
+
'doc_f1': mean_doc_f1,
|
| 180 |
+
'kuiper': kuiper['kuiper_stat'] if not kuiper.get('degenerate') else None,
|
| 181 |
+
},
|
| 182 |
+
'single_evidence': {
|
| 183 |
+
'semantic': sum(e['semantic_score'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
|
| 184 |
+
'anls': sum(e['anls'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
|
| 185 |
+
'n': len(single_hop)
|
| 186 |
+
},
|
| 187 |
+
'multi_evidence_same_doc': {
|
| 188 |
+
'semantic': sum(e['semantic_score'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
|
| 189 |
+
'anls': sum(e['anls'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
|
| 190 |
+
'n': len(cross_page)
|
| 191 |
+
},
|
| 192 |
+
'multi_evidence_multi_doc': {
|
| 193 |
+
'semantic': sum(e['semantic_score'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
|
| 194 |
+
'anls': sum(e['anls'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
|
| 195 |
+
'n': len(cross_doc)
|
| 196 |
+
},
|
| 197 |
+
'by_domain': domain_scores,
|
| 198 |
+
'n_evaluated': n,
|
| 199 |
+
'n_unmatched': unmatched
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def main():
    """CLI entry point: re-evaluate one or all submitted prediction files."""
    import argparse

    parser = argparse.ArgumentParser(description="Re-evaluate submissions with semantic accuracy")
    parser.add_argument('--dry-run', action='store_true', help="Don't upload results")
    parser.add_argument('--file', type=str, help="Re-evaluate specific prediction file")
    args = parser.parse_args()
    # NOTE(review): --dry-run is accepted but never consulted below; nothing
    # in this script uploads, so the flag is currently inert.

    # Load gold standard
    gold_by_id, gold_by_text = load_gold_data()
    print(f"Loaded {len(gold_by_id)} gold examples")

    # Decide which prediction files to process.
    if args.file:
        pred_files = [args.file]
    else:
        print("\nFinding prediction files...")
        pred_files = find_prediction_files()
        print(f"Found {len(pred_files)} prediction files")

    for pred_file in pred_files:
        print(f"\n{'='*60}")
        print(f"Processing: {pred_file}")
        print('='*60)

        try:
            predictions = download_predictions(pred_file)
            print(f"Loaded {len(predictions)} predictions")

            results = evaluate_with_semantic(predictions, gold_by_id, gold_by_text)

            if not results:
                print("No valid evaluations")
                continue

            print(f"\nResults:")
            print(f"  Semantic Accuracy: {results['overall']['semantic']:.1f}")
            print(f"  ANLS*: {results['overall']['anls']:.1f}")
            print(f"  Page F1: {results['overall']['page_f1']:.1f}")

            # Save locally for review
            output_file = Path(pred_file).stem + "_reevaluated.json"
            with open(output_file, 'w') as f:
                json.dump(results, f, indent=2)
            print(f"\nSaved to: {output_file}")

        except Exception as e:
            # Best-effort batch: report and move on to the next file.
            print(f"Error: {e}")
            continue


if __name__ == "__main__":
    main()
|
| 254 |
+
|
eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_results_20260110_131629.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Qwen3-VL (235B-A22B-Thinking) with BM25 Search Tool",
|
| 3 |
+
"organization": "Alibaba Group",
|
| 4 |
+
"description": "Max 10 iterations, up to 5 result pages consumed as images.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "open-weight"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-10T13:16:29.905067+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 59.09778741155781,
|
| 19 |
+
"anls": 57.61603118163428,
|
| 20 |
+
"page_f1": 58.72697776505391,
|
| 21 |
+
"doc_f1": 80.62601393262716,
|
| 22 |
+
"kuiper": 34.044088176352815
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 57.91583166332666,
|
| 26 |
+
"anls": 57.61603118163428,
|
| 27 |
+
"n": 499
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 66.66666666666666,
|
| 42 |
+
"anls": 61.98005698005697,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 81.81818181818183,
|
| 47 |
+
"anls": 80.39465804287939,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 66.66666666666666,
|
| 52 |
+
"anls": 66.4976376669925,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 44.565217391304344,
|
| 57 |
+
"anls": 49.53672826145982,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 56.25,
|
| 62 |
+
"anls": 57.39996898263027,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 61.702127659574465,
|
| 67 |
+
"anls": 60.56474101398679,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 56.09756097560976,
|
| 72 |
+
"anls": 53.957859669066565,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 60.46511627906976,
|
| 77 |
+
"anls": 54.79335264218985,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 40.0,
|
| 82 |
+
"anls": 49.05833333333333,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 68.75,
|
| 87 |
+
"anls": 73.07522250524337,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Reference": {
|
| 91 |
+
"semantic": 59.61538461538461,
|
| 92 |
+
"anls": 63.19327183267644,
|
| 93 |
+
"n": 52
|
| 94 |
+
},
|
| 95 |
+
"Reports": {
|
| 96 |
+
"semantic": 57.333333333333336,
|
| 97 |
+
"anls": 53.11616787903517,
|
| 98 |
+
"n": 75
|
| 99 |
+
},
|
| 100 |
+
"Technical": {
|
| 101 |
+
"semantic": 71.73913043478261,
|
| 102 |
+
"anls": 57.18864273121033,
|
| 103 |
+
"n": 23
|
| 104 |
+
}
|
| 105 |
+
},
|
| 106 |
+
"n_evaluated": 499,
|
| 107 |
+
"n_unmatched": 1767
|
| 108 |
+
},
|
| 109 |
+
"reevaluated_date": "2026-01-15T19:55:28.547801+00:00",
|
| 110 |
+
"source_predictions_file": "Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_predictions_20260110_131629.jsonl",
|
| 111 |
+
"result_file_path": "Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_results_20260110_131629.json"
|
| 112 |
+
}
|
eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_results_20260110_132054.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Qwen3-VL (32B-Thinking) with BM25 Search Tool",
|
| 3 |
+
"organization": "Alibaba Group",
|
| 4 |
+
"description": "Max 10 iterations, up to 5 result pages consumed as images.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "open-weight"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-10T13:20:54.125677+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 57.666353114392045,
|
| 19 |
+
"anls": 57.937064653000625,
|
| 20 |
+
"page_f1": 54.83061360816872,
|
| 21 |
+
"doc_f1": 78.76514934631167,
|
| 22 |
+
"kuiper": 36.33667334669349
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 56.51302605210421,
|
| 26 |
+
"anls": 57.937064653000625,
|
| 27 |
+
"n": 499
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 60.0,
|
| 42 |
+
"anls": 55.37037037037037,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 54.54545454545454,
|
| 47 |
+
"anls": 54.61297760210804,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 75.0,
|
| 52 |
+
"anls": 77.14578581514066,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 45.65217391304348,
|
| 57 |
+
"anls": 47.03746065646829,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 43.75,
|
| 62 |
+
"anls": 50.93257767828244,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 52.12765957446809,
|
| 67 |
+
"anls": 56.60682613221971,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 63.41463414634146,
|
| 72 |
+
"anls": 61.11435746903807,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 65.11627906976744,
|
| 77 |
+
"anls": 60.44405852545388,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 40.0,
|
| 82 |
+
"anls": 54.65844817149165,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 75.0,
|
| 87 |
+
"anls": 73.59601449275362,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Reference": {
|
| 91 |
+
"semantic": 63.46153846153846,
|
| 92 |
+
"anls": 68.57667578882189,
|
| 93 |
+
"n": 52
|
| 94 |
+
},
|
| 95 |
+
"Reports": {
|
| 96 |
+
"semantic": 54.0,
|
| 97 |
+
"anls": 56.44955119487462,
|
| 98 |
+
"n": 75
|
| 99 |
+
},
|
| 100 |
+
"Technical": {
|
| 101 |
+
"semantic": 60.86956521739131,
|
| 102 |
+
"anls": 51.60498619336015,
|
| 103 |
+
"n": 23
|
| 104 |
+
}
|
| 105 |
+
},
|
| 106 |
+
"n_evaluated": 499,
|
| 107 |
+
"n_unmatched": 1767
|
| 108 |
+
},
|
| 109 |
+
"reevaluated_date": "2026-01-15T19:56:35.003631+00:00",
|
| 110 |
+
"source_predictions_file": "Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_predictions_20260110_132054.jsonl",
|
| 111 |
+
"result_file_path": "Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_results_20260110_132054.json"
|
| 112 |
+
}
|
eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_results_20260110_132358.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Qwen3-VL (8B-Thinking) with BM25 Search Tool",
|
| 3 |
+
"organization": "Alibaba Group",
|
| 4 |
+
"description": "Max 10 iterations, up to 5 result pages consumed as images.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "open-weight"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-10T13:23:58.123387+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 46.623859964827616,
|
| 19 |
+
"anls": 45.43424080850834,
|
| 20 |
+
"page_f1": 47.685529789738204,
|
| 21 |
+
"doc_f1": 69.57247828991316,
|
| 22 |
+
"kuiper": 48.30060120240493
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 45.69138276553106,
|
| 26 |
+
"anls": 45.43424080850834,
|
| 27 |
+
"n": 499
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 40.0,
|
| 42 |
+
"anls": 36.91358024691358,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 59.09090909090909,
|
| 47 |
+
"anls": 55.55994729907773,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 58.333333333333336,
|
| 52 |
+
"anls": 54.598842018196855,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 29.347826086956523,
|
| 57 |
+
"anls": 29.932472094079802,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 37.5,
|
| 62 |
+
"anls": 51.55757767828245,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 43.61702127659575,
|
| 67 |
+
"anls": 44.439106365198185,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 46.34146341463415,
|
| 72 |
+
"anls": 50.16056789323261,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 53.48837209302325,
|
| 77 |
+
"anls": 44.799741602067186,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 40.0,
|
| 82 |
+
"anls": 49.070641025641024,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 58.333333333333336,
|
| 87 |
+
"anls": 59.60305559882987,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Reference": {
|
| 91 |
+
"semantic": 44.230769230769226,
|
| 92 |
+
"anls": 50.44584246011934,
|
| 93 |
+
"n": 52
|
| 94 |
+
},
|
| 95 |
+
"Reports": {
|
| 96 |
+
"semantic": 54.0,
|
| 97 |
+
"anls": 50.000135852648256,
|
| 98 |
+
"n": 75
|
| 99 |
+
},
|
| 100 |
+
"Technical": {
|
| 101 |
+
"semantic": 52.17391304347826,
|
| 102 |
+
"anls": 39.32779159365883,
|
| 103 |
+
"n": 23
|
| 104 |
+
}
|
| 105 |
+
},
|
| 106 |
+
"n_evaluated": 499,
|
| 107 |
+
"n_unmatched": 1767
|
| 108 |
+
},
|
| 109 |
+
"reevaluated_date": "2026-01-15T19:58:05.119474+00:00",
|
| 110 |
+
"source_predictions_file": "Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_predictions_20260110_132358.jsonl",
|
| 111 |
+
"result_file_path": "Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_results_20260110_132358.json"
|
| 112 |
+
}
|
eval/reevaluated_results/Anthropic/Claude_Haiku_4.5_(2025-10-01)_results_20260109_130319.json
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Claude Haiku 4.5 (2025-10-01) with BM25 Search Tool",
|
| 3 |
+
"organization": "Anthropic",
|
| 4 |
+
"description": "Max 10 iterations, up to 5 result pages consumed as images.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "api"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-09T13:03:19.649656+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 66.9387755102041,
|
| 19 |
+
"anls": 61.60747574238133,
|
| 20 |
+
"page_f1": 72.02476190476192,
|
| 21 |
+
"doc_f1": 88.24761904761905,
|
| 22 |
+
"kuiper": 50.36144578313238
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 65.60000000000001,
|
| 26 |
+
"anls": 61.60747574238133,
|
| 27 |
+
"n": 500
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 70.0,
|
| 42 |
+
"anls": 63.92691050779287,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 72.72727272727273,
|
| 47 |
+
"anls": 73.53318618140752,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 85.41666666666666,
|
| 52 |
+
"anls": 72.62325637325637,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 58.152173913043484,
|
| 57 |
+
"anls": 54.29593695395653,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 71.875,
|
| 62 |
+
"anls": 68.77016129032259,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 61.702127659574465,
|
| 67 |
+
"anls": 62.779826338896896,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 53.65853658536586,
|
| 72 |
+
"anls": 52.054645053208425,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 66.27906976744185,
|
| 77 |
+
"anls": 60.50249169435216,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 48.0,
|
| 82 |
+
"anls": 41.69842237151431,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 81.25,
|
| 87 |
+
"anls": 80.07172131147541,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Other": {
|
| 91 |
+
"semantic": 0.0,
|
| 92 |
+
"anls": 0.0,
|
| 93 |
+
"n": 1
|
| 94 |
+
},
|
| 95 |
+
"Reference": {
|
| 96 |
+
"semantic": 60.57692307692307,
|
| 97 |
+
"anls": 64.3484267705628,
|
| 98 |
+
"n": 52
|
| 99 |
+
},
|
| 100 |
+
"Reports": {
|
| 101 |
+
"semantic": 76.0,
|
| 102 |
+
"anls": 65.36479556179735,
|
| 103 |
+
"n": 75
|
| 104 |
+
},
|
| 105 |
+
"Technical": {
|
| 106 |
+
"semantic": 71.73913043478261,
|
| 107 |
+
"anls": 64.75817505570946,
|
| 108 |
+
"n": 23
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"n_evaluated": 500,
|
| 112 |
+
"n_unmatched": 0
|
| 113 |
+
},
|
| 114 |
+
"reevaluated_date": "2026-01-15T19:59:11.288336+00:00",
|
| 115 |
+
"source_predictions_file": "Anthropic/Claude_Haiku_4.5_(2025-10-01)_predictions_20260109_130319.jsonl",
|
| 116 |
+
"result_file_path": "Anthropic/Claude_Haiku_4.5_(2025-10-01)_results_20260109_130319.json"
|
| 117 |
+
}
|
eval/reevaluated_results/Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_002125.json
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Claude Sonnet 4.5 (2025-09-29) with BM25 Search Tool",
|
| 3 |
+
"organization": "Anthropic",
|
| 4 |
+
"description": "",
|
| 5 |
+
"link": null,
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic"
|
| 8 |
+
],
|
| 9 |
+
"submitted_by": null,
|
| 10 |
+
"metadata": {
|
| 11 |
+
"model_type": "unknown"
|
| 12 |
+
},
|
| 13 |
+
"submission_date": null,
|
| 14 |
+
"results": {
|
| 15 |
+
"overall": {
|
| 16 |
+
"semantic": 76.83673469387756,
|
| 17 |
+
"anls": 71.84394202116125,
|
| 18 |
+
"page_f1": 79.30920634920635,
|
| 19 |
+
"doc_f1": 92.87777777777778,
|
| 20 |
+
"kuiper": 45.31237322515194
|
| 21 |
+
},
|
| 22 |
+
"single_evidence": {
|
| 23 |
+
"semantic": 75.3,
|
| 24 |
+
"anls": 71.84394202116125,
|
| 25 |
+
"n": 500
|
| 26 |
+
},
|
| 27 |
+
"multi_evidence_same_doc": {
|
| 28 |
+
"semantic": 0,
|
| 29 |
+
"anls": 0,
|
| 30 |
+
"n": 0
|
| 31 |
+
},
|
| 32 |
+
"multi_evidence_multi_doc": {
|
| 33 |
+
"semantic": 0,
|
| 34 |
+
"anls": 0,
|
| 35 |
+
"n": 0
|
| 36 |
+
},
|
| 37 |
+
"by_domain": {
|
| 38 |
+
"Cases/Logs": {
|
| 39 |
+
"semantic": 76.66666666666667,
|
| 40 |
+
"anls": 69.17913105413105,
|
| 41 |
+
"n": 15
|
| 42 |
+
},
|
| 43 |
+
"Education": {
|
| 44 |
+
"semantic": 84.0909090909091,
|
| 45 |
+
"anls": 78.67387882210018,
|
| 46 |
+
"n": 22
|
| 47 |
+
},
|
| 48 |
+
"Events": {
|
| 49 |
+
"semantic": 87.5,
|
| 50 |
+
"anls": 78.43471847184719,
|
| 51 |
+
"n": 24
|
| 52 |
+
},
|
| 53 |
+
"Financial": {
|
| 54 |
+
"semantic": 67.93478260869566,
|
| 55 |
+
"anls": 67.53424957183847,
|
| 56 |
+
"n": 92
|
| 57 |
+
},
|
| 58 |
+
"Financial/Tax": {
|
| 59 |
+
"semantic": 75.0,
|
| 60 |
+
"anls": 79.76190476190477,
|
| 61 |
+
"n": 16
|
| 62 |
+
},
|
| 63 |
+
"Government/Regulatory": {
|
| 64 |
+
"semantic": 81.91489361702128,
|
| 65 |
+
"anls": 76.76053472269231,
|
| 66 |
+
"n": 47
|
| 67 |
+
},
|
| 68 |
+
"HR/Employment": {
|
| 69 |
+
"semantic": 76.82926829268293,
|
| 70 |
+
"anls": 71.16619587453502,
|
| 71 |
+
"n": 41
|
| 72 |
+
},
|
| 73 |
+
"Legal": {
|
| 74 |
+
"semantic": 75.5813953488372,
|
| 75 |
+
"anls": 63.583816672634086,
|
| 76 |
+
"n": 43
|
| 77 |
+
},
|
| 78 |
+
"Media/Publishing": {
|
| 79 |
+
"semantic": 48.0,
|
| 80 |
+
"anls": 56.169284632785775,
|
| 81 |
+
"n": 25
|
| 82 |
+
},
|
| 83 |
+
"Misc": {
|
| 84 |
+
"semantic": 89.58333333333334,
|
| 85 |
+
"anls": 83.87132448607858,
|
| 86 |
+
"n": 24
|
| 87 |
+
},
|
| 88 |
+
"Other": {
|
| 89 |
+
"semantic": 0.0,
|
| 90 |
+
"anls": 0.0,
|
| 91 |
+
"n": 1
|
| 92 |
+
},
|
| 93 |
+
"Reference": {
|
| 94 |
+
"semantic": 71.15384615384616,
|
| 95 |
+
"anls": 80.42617278480002,
|
| 96 |
+
"n": 52
|
| 97 |
+
},
|
| 98 |
+
"Reports": {
|
| 99 |
+
"semantic": 82.66666666666667,
|
| 100 |
+
"anls": 74.25747226815201,
|
| 101 |
+
"n": 75
|
| 102 |
+
},
|
| 103 |
+
"Technical": {
|
| 104 |
+
"semantic": 69.56521739130434,
|
| 105 |
+
"anls": 58.84371488722767,
|
| 106 |
+
"n": 23
|
| 107 |
+
}
|
| 108 |
+
},
|
| 109 |
+
"n_evaluated": 500,
|
| 110 |
+
"n_unmatched": 1811
|
| 111 |
+
},
|
| 112 |
+
"reevaluated_date": "2026-01-15T20:00:06.481610+00:00",
|
| 113 |
+
"source_predictions_file": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_predictions_20260109_002125.jsonl",
|
| 114 |
+
"result_file_path": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_002125.json"
|
| 115 |
+
}
|
eval/reevaluated_results/Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_125816.json
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Claude Sonnet 4.5 (2025-09-29) with BM25 Search Tool",
|
| 3 |
+
"organization": "Anthropic",
|
| 4 |
+
"description": "Max 10 iterations, up to 5 result pages consumed as images.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "api"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-09T12:58:16.611348+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 79.08163265306122,
|
| 19 |
+
"anls": 71.74787642305597,
|
| 20 |
+
"page_f1": 79.12333333333333,
|
| 21 |
+
"doc_f1": 92.98636363636363,
|
| 22 |
+
"kuiper": 36.338056680162076
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 77.5,
|
| 26 |
+
"anls": 71.74787642305597,
|
| 27 |
+
"n": 500
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 76.66666666666667,
|
| 42 |
+
"anls": 69.51092117758785,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 81.81818181818183,
|
| 47 |
+
"anls": 78.15439830261965,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 91.66666666666666,
|
| 52 |
+
"anls": 78.43471847184719,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 70.1086956521739,
|
| 57 |
+
"anls": 66.81148919563769,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 75.0,
|
| 62 |
+
"anls": 76.26728110599078,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 86.17021276595744,
|
| 67 |
+
"anls": 74.90457714355891,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 75.60975609756098,
|
| 72 |
+
"anls": 72.85160396238213,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 88.37209302325581,
|
| 77 |
+
"anls": 72.74221043114129,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 56.00000000000001,
|
| 82 |
+
"anls": 60.75987316199324,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 89.58333333333334,
|
| 87 |
+
"anls": 83.89482072668008,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Other": {
|
| 91 |
+
"semantic": 0.0,
|
| 92 |
+
"anls": 0.0,
|
| 93 |
+
"n": 1
|
| 94 |
+
},
|
| 95 |
+
"Reference": {
|
| 96 |
+
"semantic": 69.23076923076923,
|
| 97 |
+
"anls": 72.21619612753193,
|
| 98 |
+
"n": 52
|
| 99 |
+
},
|
| 100 |
+
"Reports": {
|
| 101 |
+
"semantic": 83.33333333333334,
|
| 102 |
+
"anls": 74.0536995032274,
|
| 103 |
+
"n": 75
|
| 104 |
+
},
|
| 105 |
+
"Technical": {
|
| 106 |
+
"semantic": 69.56521739130434,
|
| 107 |
+
"anls": 60.23577215564363,
|
| 108 |
+
"n": 23
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"n_evaluated": 500,
|
| 112 |
+
"n_unmatched": 0
|
| 113 |
+
},
|
| 114 |
+
"reevaluated_date": "2026-01-15T20:01:02.709110+00:00",
|
| 115 |
+
"source_predictions_file": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_predictions_20260109_125816.jsonl",
|
| 116 |
+
"result_file_path": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_125816.json"
|
| 117 |
+
}
|
eval/reevaluated_results/Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_003320.json
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Gemini 2.5 Flash with BM25 Search Tool",
|
| 3 |
+
"organization": "Google",
|
| 4 |
+
"description": "",
|
| 5 |
+
"link": null,
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic"
|
| 8 |
+
],
|
| 9 |
+
"submitted_by": null,
|
| 10 |
+
"metadata": {
|
| 11 |
+
"model_type": "unknown"
|
| 12 |
+
},
|
| 13 |
+
"submission_date": null,
|
| 14 |
+
"results": {
|
| 15 |
+
"overall": {
|
| 16 |
+
"semantic": 57.34693877551022,
|
| 17 |
+
"anls": 52.71594015359682,
|
| 18 |
+
"page_f1": 59.910952380952374,
|
| 19 |
+
"doc_f1": 76.47380952380952,
|
| 20 |
+
"kuiper": 40.01599999999986
|
| 21 |
+
},
|
| 22 |
+
"single_evidence": {
|
| 23 |
+
"semantic": 56.2,
|
| 24 |
+
"anls": 52.71594015359682,
|
| 25 |
+
"n": 500
|
| 26 |
+
},
|
| 27 |
+
"multi_evidence_same_doc": {
|
| 28 |
+
"semantic": 0,
|
| 29 |
+
"anls": 0,
|
| 30 |
+
"n": 0
|
| 31 |
+
},
|
| 32 |
+
"multi_evidence_multi_doc": {
|
| 33 |
+
"semantic": 0,
|
| 34 |
+
"anls": 0,
|
| 35 |
+
"n": 0
|
| 36 |
+
},
|
| 37 |
+
"by_domain": {
|
| 38 |
+
"Cases/Logs": {
|
| 39 |
+
"semantic": 73.33333333333333,
|
| 40 |
+
"anls": 63.64672364672364,
|
| 41 |
+
"n": 15
|
| 42 |
+
},
|
| 43 |
+
"Education": {
|
| 44 |
+
"semantic": 77.27272727272727,
|
| 45 |
+
"anls": 68.27344592166726,
|
| 46 |
+
"n": 22
|
| 47 |
+
},
|
| 48 |
+
"Events": {
|
| 49 |
+
"semantic": 72.91666666666666,
|
| 50 |
+
"anls": 67.7894121245185,
|
| 51 |
+
"n": 24
|
| 52 |
+
},
|
| 53 |
+
"Financial": {
|
| 54 |
+
"semantic": 41.30434782608695,
|
| 55 |
+
"anls": 42.928812913087185,
|
| 56 |
+
"n": 92
|
| 57 |
+
},
|
| 58 |
+
"Financial/Tax": {
|
| 59 |
+
"semantic": 43.75,
|
| 60 |
+
"anls": 44.89996898263027,
|
| 61 |
+
"n": 16
|
| 62 |
+
},
|
| 63 |
+
"Government/Regulatory": {
|
| 64 |
+
"semantic": 63.829787234042556,
|
| 65 |
+
"anls": 62.64704717952198,
|
| 66 |
+
"n": 47
|
| 67 |
+
},
|
| 68 |
+
"HR/Employment": {
|
| 69 |
+
"semantic": 56.09756097560976,
|
| 70 |
+
"anls": 49.2663477551747,
|
| 71 |
+
"n": 41
|
| 72 |
+
},
|
| 73 |
+
"Legal": {
|
| 74 |
+
"semantic": 55.81395348837209,
|
| 75 |
+
"anls": 47.23635639486312,
|
| 76 |
+
"n": 43
|
| 77 |
+
},
|
| 78 |
+
"Media/Publishing": {
|
| 79 |
+
"semantic": 52.0,
|
| 80 |
+
"anls": 46.0,
|
| 81 |
+
"n": 25
|
| 82 |
+
},
|
| 83 |
+
"Misc": {
|
| 84 |
+
"semantic": 66.66666666666666,
|
| 85 |
+
"anls": 68.74370865688812,
|
| 86 |
+
"n": 24
|
| 87 |
+
},
|
| 88 |
+
"Other": {
|
| 89 |
+
"semantic": 0.0,
|
| 90 |
+
"anls": 0.0,
|
| 91 |
+
"n": 1
|
| 92 |
+
},
|
| 93 |
+
"Reference": {
|
| 94 |
+
"semantic": 61.53846153846154,
|
| 95 |
+
"anls": 64.74843671979198,
|
| 96 |
+
"n": 52
|
| 97 |
+
},
|
| 98 |
+
"Reports": {
|
| 99 |
+
"semantic": 53.333333333333336,
|
| 100 |
+
"anls": 46.187273968786066,
|
| 101 |
+
"n": 75
|
| 102 |
+
},
|
| 103 |
+
"Technical": {
|
| 104 |
+
"semantic": 54.347826086956516,
|
| 105 |
+
"anls": 42.61518103800272,
|
| 106 |
+
"n": 23
|
| 107 |
+
}
|
| 108 |
+
},
|
| 109 |
+
"n_evaluated": 500,
|
| 110 |
+
"n_unmatched": 1811
|
| 111 |
+
},
|
| 112 |
+
"reevaluated_date": "2026-01-15T20:02:15.855307+00:00",
|
| 113 |
+
"source_predictions_file": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_predictions_20260109_003320.jsonl",
|
| 114 |
+
"result_file_path": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_003320.json"
|
| 115 |
+
}
|
eval/reevaluated_results/Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_182559.json
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Gemini 2.5 Flash with BM25 Search Tool",
|
| 3 |
+
"organization": "Google",
|
| 4 |
+
"description": "Max 10 iterations, up to 5 result pages consumed as images.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "api"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-09T18:25:59.636344+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 58.46938775510204,
|
| 19 |
+
"anls": 55.486869478144165,
|
| 20 |
+
"page_f1": 60.9663492063492,
|
| 21 |
+
"doc_f1": 78.82920634920634,
|
| 22 |
+
"kuiper": 45.08800000000012
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 57.3,
|
| 26 |
+
"anls": 55.486869478144165,
|
| 27 |
+
"n": 500
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 73.33333333333333,
|
| 42 |
+
"anls": 71.7948717948718,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 77.27272727272727,
|
| 47 |
+
"anls": 72.81890046712182,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 85.41666666666666,
|
| 52 |
+
"anls": 76.85643564356435,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 40.76086956521739,
|
| 57 |
+
"anls": 40.952902757926644,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 50.0,
|
| 62 |
+
"anls": 52.31036324786324,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 64.8936170212766,
|
| 67 |
+
"anls": 67.70262933196864,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 58.536585365853654,
|
| 72 |
+
"anls": 60.95035529628296,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 54.65116279069767,
|
| 77 |
+
"anls": 51.45105745077384,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 52.0,
|
| 82 |
+
"anls": 54.40739778239778,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 77.08333333333334,
|
| 87 |
+
"anls": 73.82172131147541,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Other": {
|
| 91 |
+
"semantic": 0.0,
|
| 92 |
+
"anls": 0.0,
|
| 93 |
+
"n": 1
|
| 94 |
+
},
|
| 95 |
+
"Reference": {
|
| 96 |
+
"semantic": 61.53846153846154,
|
| 97 |
+
"anls": 64.46714691613596,
|
| 98 |
+
"n": 52
|
| 99 |
+
},
|
| 100 |
+
"Reports": {
|
| 101 |
+
"semantic": 53.333333333333336,
|
| 102 |
+
"anls": 45.47473759975617,
|
| 103 |
+
"n": 75
|
| 104 |
+
},
|
| 105 |
+
"Technical": {
|
| 106 |
+
"semantic": 47.82608695652174,
|
| 107 |
+
"anls": 35.96181299748582,
|
| 108 |
+
"n": 23
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"n_evaluated": 500,
|
| 112 |
+
"n_unmatched": 0
|
| 113 |
+
},
|
| 114 |
+
"reevaluated_date": "2026-01-15T20:03:19.966069+00:00",
|
| 115 |
+
"source_predictions_file": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_predictions_20260109_182559.jsonl",
|
| 116 |
+
"result_file_path": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_182559.json"
|
| 117 |
+
}
|
eval/reevaluated_results/Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_005202.json
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Gemini 2.5 Pro with BM25 Search Tool",
|
| 3 |
+
"organization": "Google",
|
| 4 |
+
"description": "",
|
| 5 |
+
"link": null,
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic"
|
| 8 |
+
],
|
| 9 |
+
"submitted_by": null,
|
| 10 |
+
"metadata": {
|
| 11 |
+
"model_type": "unknown"
|
| 12 |
+
},
|
| 13 |
+
"submission_date": null,
|
| 14 |
+
"results": {
|
| 15 |
+
"overall": {
|
| 16 |
+
"semantic": 59.6938775510204,
|
| 17 |
+
"anls": 56.04493493183149,
|
| 18 |
+
"page_f1": 61.64985569985569,
|
| 19 |
+
"doc_f1": 74.58080808080808,
|
| 20 |
+
"kuiper": 28.047999999999792
|
| 21 |
+
},
|
| 22 |
+
"single_evidence": {
|
| 23 |
+
"semantic": 58.5,
|
| 24 |
+
"anls": 56.04493493183149,
|
| 25 |
+
"n": 500
|
| 26 |
+
},
|
| 27 |
+
"multi_evidence_same_doc": {
|
| 28 |
+
"semantic": 0,
|
| 29 |
+
"anls": 0,
|
| 30 |
+
"n": 0
|
| 31 |
+
},
|
| 32 |
+
"multi_evidence_multi_doc": {
|
| 33 |
+
"semantic": 0,
|
| 34 |
+
"anls": 0,
|
| 35 |
+
"n": 0
|
| 36 |
+
},
|
| 37 |
+
"by_domain": {
|
| 38 |
+
"Cases/Logs": {
|
| 39 |
+
"semantic": 73.33333333333333,
|
| 40 |
+
"anls": 63.64672364672364,
|
| 41 |
+
"n": 15
|
| 42 |
+
},
|
| 43 |
+
"Education": {
|
| 44 |
+
"semantic": 81.81818181818183,
|
| 45 |
+
"anls": 72.3102424584638,
|
| 46 |
+
"n": 22
|
| 47 |
+
},
|
| 48 |
+
"Events": {
|
| 49 |
+
"semantic": 64.58333333333334,
|
| 50 |
+
"anls": 60.78335195270679,
|
| 51 |
+
"n": 24
|
| 52 |
+
},
|
| 53 |
+
"Financial": {
|
| 54 |
+
"semantic": 40.21739130434783,
|
| 55 |
+
"anls": 43.86116897464483,
|
| 56 |
+
"n": 92
|
| 57 |
+
},
|
| 58 |
+
"Financial/Tax": {
|
| 59 |
+
"semantic": 56.25,
|
| 60 |
+
"anls": 61.754807692307686,
|
| 61 |
+
"n": 16
|
| 62 |
+
},
|
| 63 |
+
"Government/Regulatory": {
|
| 64 |
+
"semantic": 68.08510638297872,
|
| 65 |
+
"anls": 64.75420262164383,
|
| 66 |
+
"n": 47
|
| 67 |
+
},
|
| 68 |
+
"HR/Employment": {
|
| 69 |
+
"semantic": 64.63414634146342,
|
| 70 |
+
"anls": 52.864704856399555,
|
| 71 |
+
"n": 41
|
| 72 |
+
},
|
| 73 |
+
"Legal": {
|
| 74 |
+
"semantic": 60.46511627906976,
|
| 75 |
+
"anls": 51.79586563307493,
|
| 76 |
+
"n": 43
|
| 77 |
+
},
|
| 78 |
+
"Media/Publishing": {
|
| 79 |
+
"semantic": 52.0,
|
| 80 |
+
"anls": 53.47808414475082,
|
| 81 |
+
"n": 25
|
| 82 |
+
},
|
| 83 |
+
"Misc": {
|
| 84 |
+
"semantic": 64.58333333333334,
|
| 85 |
+
"anls": 66.18283242258653,
|
| 86 |
+
"n": 24
|
| 87 |
+
},
|
| 88 |
+
"Other": {
|
| 89 |
+
"semantic": 0.0,
|
| 90 |
+
"anls": 0.0,
|
| 91 |
+
"n": 1
|
| 92 |
+
},
|
| 93 |
+
"Reference": {
|
| 94 |
+
"semantic": 61.53846153846154,
|
| 95 |
+
"anls": 67.82233913890543,
|
| 96 |
+
"n": 52
|
| 97 |
+
},
|
| 98 |
+
"Reports": {
|
| 99 |
+
"semantic": 56.666666666666664,
|
| 100 |
+
"anls": 53.53357754327678,
|
| 101 |
+
"n": 75
|
| 102 |
+
},
|
| 103 |
+
"Technical": {
|
| 104 |
+
"semantic": 63.04347826086957,
|
| 105 |
+
"anls": 47.37363844512718,
|
| 106 |
+
"n": 23
|
| 107 |
+
}
|
| 108 |
+
},
|
| 109 |
+
"n_evaluated": 500,
|
| 110 |
+
"n_unmatched": 1811
|
| 111 |
+
},
|
| 112 |
+
"reevaluated_date": "2026-01-15T20:04:22.366647+00:00",
|
| 113 |
+
"source_predictions_file": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_predictions_20260109_005202.jsonl",
|
| 114 |
+
"result_file_path": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_005202.json"
|
| 115 |
+
}
|
eval/reevaluated_results/Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_183030.json
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Gemini 2.5 Pro with BM25 Search Tool",
|
| 3 |
+
"organization": "Google",
|
| 4 |
+
"description": "Max 10 iterations, up to 5 result pages consumed as images.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "api"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-09T18:30:30.608183+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 59.6938775510204,
|
| 19 |
+
"anls": 55.97919862778078,
|
| 20 |
+
"page_f1": 60.299220779220775,
|
| 21 |
+
"doc_f1": 74.23636363636363,
|
| 22 |
+
"kuiper": 38.90600000000025
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 58.5,
|
| 26 |
+
"anls": 55.97919862778078,
|
| 27 |
+
"n": 500
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 66.66666666666666,
|
| 42 |
+
"anls": 56.98005698005698,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 72.72727272727273,
|
| 47 |
+
"anls": 66.75468690290825,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 66.66666666666666,
|
| 52 |
+
"anls": 62.67819322254806,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 47.82608695652174,
|
| 57 |
+
"anls": 48.11929370300614,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 50.0,
|
| 62 |
+
"anls": 46.96314102564102,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 65.95744680851064,
|
| 67 |
+
"anls": 64.23333377770668,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 56.09756097560976,
|
| 72 |
+
"anls": 48.92979153124233,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 63.95348837209303,
|
| 77 |
+
"anls": 60.44220952048519,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 48.0,
|
| 82 |
+
"anls": 52.95641025641026,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 64.58333333333334,
|
| 87 |
+
"anls": 70.43036975102193,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Other": {
|
| 91 |
+
"semantic": 0.0,
|
| 92 |
+
"anls": 0.0,
|
| 93 |
+
"n": 1
|
| 94 |
+
},
|
| 95 |
+
"Reference": {
|
| 96 |
+
"semantic": 63.46153846153846,
|
| 97 |
+
"anls": 63.5134680018838,
|
| 98 |
+
"n": 52
|
| 99 |
+
},
|
| 100 |
+
"Reports": {
|
| 101 |
+
"semantic": 60.0,
|
| 102 |
+
"anls": 54.81415365192609,
|
| 103 |
+
"n": 75
|
| 104 |
+
},
|
| 105 |
+
"Technical": {
|
| 106 |
+
"semantic": 50.0,
|
| 107 |
+
"anls": 40.50127359810298,
|
| 108 |
+
"n": 23
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"n_evaluated": 500,
|
| 112 |
+
"n_unmatched": 0
|
| 113 |
+
},
|
| 114 |
+
"reevaluated_date": "2026-01-15T20:06:43.674600+00:00",
|
| 115 |
+
"source_predictions_file": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_predictions_20260109_183030.jsonl",
|
| 116 |
+
"result_file_path": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_183030.json"
|
| 117 |
+
}
|
eval/reevaluated_results/Google/Gemini_3_Pro_(Preview)_with_BM25_Search_Tool_results_20260109_002711.json
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Gemini 3 Pro (Preview) with BM25 Search Tool",
|
| 3 |
+
"organization": "Google",
|
| 4 |
+
"description": "",
|
| 5 |
+
"link": null,
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic"
|
| 8 |
+
],
|
| 9 |
+
"submitted_by": null,
|
| 10 |
+
"metadata": {
|
| 11 |
+
"model_type": "unknown"
|
| 12 |
+
},
|
| 13 |
+
"submission_date": null,
|
| 14 |
+
"results": {
|
| 15 |
+
"overall": {
|
| 16 |
+
"semantic": 84.8636047605415,
|
| 17 |
+
"anls": 78.46249371016062,
|
| 18 |
+
"page_f1": 80.40956371617695,
|
| 19 |
+
"doc_f1": 91.83908943427981,
|
| 20 |
+
"kuiper": 27.13226452905815
|
| 21 |
+
},
|
| 22 |
+
"single_evidence": {
|
| 23 |
+
"semantic": 83.16633266533067,
|
| 24 |
+
"anls": 78.46249371016062,
|
| 25 |
+
"n": 499
|
| 26 |
+
},
|
| 27 |
+
"multi_evidence_same_doc": {
|
| 28 |
+
"semantic": 0,
|
| 29 |
+
"anls": 0,
|
| 30 |
+
"n": 0
|
| 31 |
+
},
|
| 32 |
+
"multi_evidence_multi_doc": {
|
| 33 |
+
"semantic": 0,
|
| 34 |
+
"anls": 0,
|
| 35 |
+
"n": 0
|
| 36 |
+
},
|
| 37 |
+
"by_domain": {
|
| 38 |
+
"Cases/Logs": {
|
| 39 |
+
"semantic": 83.33333333333334,
|
| 40 |
+
"anls": 75.31339031339031,
|
| 41 |
+
"n": 15
|
| 42 |
+
},
|
| 43 |
+
"Education": {
|
| 44 |
+
"semantic": 86.36363636363636,
|
| 45 |
+
"anls": 74.02302243211334,
|
| 46 |
+
"n": 22
|
| 47 |
+
},
|
| 48 |
+
"Events": {
|
| 49 |
+
"semantic": 83.33333333333334,
|
| 50 |
+
"anls": 77.06645664566456,
|
| 51 |
+
"n": 24
|
| 52 |
+
},
|
| 53 |
+
"Financial": {
|
| 54 |
+
"semantic": 72.28260869565217,
|
| 55 |
+
"anls": 69.36362154126739,
|
| 56 |
+
"n": 92
|
| 57 |
+
},
|
| 58 |
+
"Financial/Tax": {
|
| 59 |
+
"semantic": 81.25,
|
| 60 |
+
"anls": 80.57571684587813,
|
| 61 |
+
"n": 16
|
| 62 |
+
},
|
| 63 |
+
"Government/Regulatory": {
|
| 64 |
+
"semantic": 87.2340425531915,
|
| 65 |
+
"anls": 82.3254828677961,
|
| 66 |
+
"n": 47
|
| 67 |
+
},
|
| 68 |
+
"HR/Employment": {
|
| 69 |
+
"semantic": 86.58536585365853,
|
| 70 |
+
"anls": 79.69007037401929,
|
| 71 |
+
"n": 41
|
| 72 |
+
},
|
| 73 |
+
"Legal": {
|
| 74 |
+
"semantic": 93.02325581395348,
|
| 75 |
+
"anls": 85.19782543038357,
|
| 76 |
+
"n": 43
|
| 77 |
+
},
|
| 78 |
+
"Media/Publishing": {
|
| 79 |
+
"semantic": 74.0,
|
| 80 |
+
"anls": 79.1167050771702,
|
| 81 |
+
"n": 25
|
| 82 |
+
},
|
| 83 |
+
"Misc": {
|
| 84 |
+
"semantic": 93.75,
|
| 85 |
+
"anls": 91.86959699974574,
|
| 86 |
+
"n": 24
|
| 87 |
+
},
|
| 88 |
+
"Reference": {
|
| 89 |
+
"semantic": 88.46153846153845,
|
| 90 |
+
"anls": 87.98053049887939,
|
| 91 |
+
"n": 52
|
| 92 |
+
},
|
| 93 |
+
"Reports": {
|
| 94 |
+
"semantic": 84.66666666666667,
|
| 95 |
+
"anls": 78.78023745578506,
|
| 96 |
+
"n": 75
|
| 97 |
+
},
|
| 98 |
+
"Technical": {
|
| 99 |
+
"semantic": 73.91304347826086,
|
| 100 |
+
"anls": 61.21421646346686,
|
| 101 |
+
"n": 23
|
| 102 |
+
}
|
| 103 |
+
},
|
| 104 |
+
"n_evaluated": 499,
|
| 105 |
+
"n_unmatched": 1767
|
| 106 |
+
},
|
| 107 |
+
"reevaluated_date": "2026-01-15T20:07:35.074484+00:00",
|
| 108 |
+
"source_predictions_file": "Google/Gemini_3_Pro_(Preview)_with_BM25_Search_Tool_predictions_20260109_002711.jsonl",
|
| 109 |
+
"result_file_path": "Google/Gemini_3_Pro_(Preview)_with_BM25_Search_Tool_results_20260109_002711.json"
|
| 110 |
+
}
|
eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_234108.json
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Human with BM25 Search Tool",
|
| 3 |
+
"organization": "Humanity",
|
| 4 |
+
"description": "",
|
| 5 |
+
"link": null,
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic"
|
| 8 |
+
],
|
| 9 |
+
"submitted_by": null,
|
| 10 |
+
"metadata": {
|
| 11 |
+
"model_type": "unknown"
|
| 12 |
+
},
|
| 13 |
+
"submission_date": null,
|
| 14 |
+
"results": {
|
| 15 |
+
"overall": {
|
| 16 |
+
"semantic": 76.53061224489795,
|
| 17 |
+
"anls": 76.89084569144522,
|
| 18 |
+
"page_f1": 74.27484848484849,
|
| 19 |
+
"doc_f1": 87.1077922077922,
|
| 20 |
+
"kuiper": 6.584782608695652
|
| 21 |
+
},
|
| 22 |
+
"single_evidence": {
|
| 23 |
+
"semantic": 75.0,
|
| 24 |
+
"anls": 76.89084569144522,
|
| 25 |
+
"n": 500
|
| 26 |
+
},
|
| 27 |
+
"multi_evidence_same_doc": {
|
| 28 |
+
"semantic": 0,
|
| 29 |
+
"anls": 0,
|
| 30 |
+
"n": 0
|
| 31 |
+
},
|
| 32 |
+
"multi_evidence_multi_doc": {
|
| 33 |
+
"semantic": 0,
|
| 34 |
+
"anls": 0,
|
| 35 |
+
"n": 0
|
| 36 |
+
},
|
| 37 |
+
"by_domain": {
|
| 38 |
+
"Cases/Logs": {
|
| 39 |
+
"semantic": 66.66666666666666,
|
| 40 |
+
"anls": 72.72727272727272,
|
| 41 |
+
"n": 15
|
| 42 |
+
},
|
| 43 |
+
"Education": {
|
| 44 |
+
"semantic": 95.45454545454545,
|
| 45 |
+
"anls": 88.2664724057374,
|
| 46 |
+
"n": 22
|
| 47 |
+
},
|
| 48 |
+
"Events": {
|
| 49 |
+
"semantic": 83.33333333333334,
|
| 50 |
+
"anls": 81.58602150537635,
|
| 51 |
+
"n": 24
|
| 52 |
+
},
|
| 53 |
+
"Financial": {
|
| 54 |
+
"semantic": 72.28260869565217,
|
| 55 |
+
"anls": 72.19996863726435,
|
| 56 |
+
"n": 92
|
| 57 |
+
},
|
| 58 |
+
"Financial/Tax": {
|
| 59 |
+
"semantic": 68.75,
|
| 60 |
+
"anls": 65.13888888888889,
|
| 61 |
+
"n": 16
|
| 62 |
+
},
|
| 63 |
+
"Government/Regulatory": {
|
| 64 |
+
"semantic": 77.6595744680851,
|
| 65 |
+
"anls": 80.70180867592104,
|
| 66 |
+
"n": 47
|
| 67 |
+
},
|
| 68 |
+
"HR/Employment": {
|
| 69 |
+
"semantic": 82.92682926829268,
|
| 70 |
+
"anls": 78.22470188707081,
|
| 71 |
+
"n": 41
|
| 72 |
+
},
|
| 73 |
+
"Legal": {
|
| 74 |
+
"semantic": 77.90697674418605,
|
| 75 |
+
"anls": 78.11361119500656,
|
| 76 |
+
"n": 43
|
| 77 |
+
},
|
| 78 |
+
"Media/Publishing": {
|
| 79 |
+
"semantic": 62.0,
|
| 80 |
+
"anls": 69.07251951242394,
|
| 81 |
+
"n": 25
|
| 82 |
+
},
|
| 83 |
+
"Misc": {
|
| 84 |
+
"semantic": 68.75,
|
| 85 |
+
"anls": 71.50538359217717,
|
| 86 |
+
"n": 24
|
| 87 |
+
},
|
| 88 |
+
"Other": {
|
| 89 |
+
"semantic": 0.0,
|
| 90 |
+
"anls": 0.0,
|
| 91 |
+
"n": 1
|
| 92 |
+
},
|
| 93 |
+
"Reference": {
|
| 94 |
+
"semantic": 77.88461538461539,
|
| 95 |
+
"anls": 87.90759949333756,
|
| 96 |
+
"n": 52
|
| 97 |
+
},
|
| 98 |
+
"Reports": {
|
| 99 |
+
"semantic": 70.0,
|
| 100 |
+
"anls": 75.95612610368379,
|
| 101 |
+
"n": 75
|
| 102 |
+
},
|
| 103 |
+
"Technical": {
|
| 104 |
+
"semantic": 76.08695652173914,
|
| 105 |
+
"anls": 73.91467899702451,
|
| 106 |
+
"n": 23
|
| 107 |
+
}
|
| 108 |
+
},
|
| 109 |
+
"n_evaluated": 500,
|
| 110 |
+
"n_unmatched": 0
|
| 111 |
+
},
|
| 112 |
+
"reevaluated_date": "2026-01-15T20:08:22.133765+00:00",
|
| 113 |
+
"source_predictions_file": "Humanity/Human_with_BM25_Search_Tool_predictions_20260109_234108.jsonl",
|
| 114 |
+
"result_file_path": "Humanity/Human_with_BM25_Search_Tool_results_20260109_234108.json"
|
| 115 |
+
}
|
eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_235325.json
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Human with BM25 Search Tool",
|
| 3 |
+
"organization": "Humanity",
|
| 4 |
+
"description": "",
|
| 5 |
+
"link": null,
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic"
|
| 8 |
+
],
|
| 9 |
+
"submitted_by": null,
|
| 10 |
+
"metadata": {
|
| 11 |
+
"model_type": "unknown"
|
| 12 |
+
},
|
| 13 |
+
"submission_date": null,
|
| 14 |
+
"results": {
|
| 15 |
+
"overall": {
|
| 16 |
+
"semantic": 80.3061224489796,
|
| 17 |
+
"anls": 81.0431479932892,
|
| 18 |
+
"page_f1": 77.30151515151516,
|
| 19 |
+
"doc_f1": 90.80112554112554,
|
| 20 |
+
"kuiper": 7.623700623700628
|
| 21 |
+
},
|
| 22 |
+
"single_evidence": {
|
| 23 |
+
"semantic": 78.7,
|
| 24 |
+
"anls": 81.0431479932892,
|
| 25 |
+
"n": 500
|
| 26 |
+
},
|
| 27 |
+
"multi_evidence_same_doc": {
|
| 28 |
+
"semantic": 0,
|
| 29 |
+
"anls": 0,
|
| 30 |
+
"n": 0
|
| 31 |
+
},
|
| 32 |
+
"multi_evidence_multi_doc": {
|
| 33 |
+
"semantic": 0,
|
| 34 |
+
"anls": 0,
|
| 35 |
+
"n": 0
|
| 36 |
+
},
|
| 37 |
+
"by_domain": {
|
| 38 |
+
"Cases/Logs": {
|
| 39 |
+
"semantic": 66.66666666666666,
|
| 40 |
+
"anls": 72.72727272727272,
|
| 41 |
+
"n": 15
|
| 42 |
+
},
|
| 43 |
+
"Education": {
|
| 44 |
+
"semantic": 95.45454545454545,
|
| 45 |
+
"anls": 88.2664724057374,
|
| 46 |
+
"n": 22
|
| 47 |
+
},
|
| 48 |
+
"Events": {
|
| 49 |
+
"semantic": 81.25,
|
| 50 |
+
"anls": 79.50268817204301,
|
| 51 |
+
"n": 24
|
| 52 |
+
},
|
| 53 |
+
"Financial": {
|
| 54 |
+
"semantic": 77.17391304347827,
|
| 55 |
+
"anls": 77.5668164633513,
|
| 56 |
+
"n": 92
|
| 57 |
+
},
|
| 58 |
+
"Financial/Tax": {
|
| 59 |
+
"semantic": 81.25,
|
| 60 |
+
"anls": 77.63888888888889,
|
| 61 |
+
"n": 16
|
| 62 |
+
},
|
| 63 |
+
"Government/Regulatory": {
|
| 64 |
+
"semantic": 80.85106382978722,
|
| 65 |
+
"anls": 84.95712782485721,
|
| 66 |
+
"n": 47
|
| 67 |
+
},
|
| 68 |
+
"HR/Employment": {
|
| 69 |
+
"semantic": 81.70731707317073,
|
| 70 |
+
"anls": 79.43814685734138,
|
| 71 |
+
"n": 41
|
| 72 |
+
},
|
| 73 |
+
"Legal": {
|
| 74 |
+
"semantic": 89.53488372093024,
|
| 75 |
+
"anls": 85.09035538105306,
|
| 76 |
+
"n": 43
|
| 77 |
+
},
|
| 78 |
+
"Media/Publishing": {
|
| 79 |
+
"semantic": 70.0,
|
| 80 |
+
"anls": 75.91696395686839,
|
| 81 |
+
"n": 25
|
| 82 |
+
},
|
| 83 |
+
"Misc": {
|
| 84 |
+
"semantic": 72.91666666666666,
|
| 85 |
+
"anls": 78.44982803662161,
|
| 86 |
+
"n": 24
|
| 87 |
+
},
|
| 88 |
+
"Other": {
|
| 89 |
+
"semantic": 0.0,
|
| 90 |
+
"anls": 0.0,
|
| 91 |
+
"n": 1
|
| 92 |
+
},
|
| 93 |
+
"Reference": {
|
| 94 |
+
"semantic": 78.84615384615384,
|
| 95 |
+
"anls": 89.83067641641446,
|
| 96 |
+
"n": 52
|
| 97 |
+
},
|
| 98 |
+
"Reports": {
|
| 99 |
+
"semantic": 70.66666666666667,
|
| 100 |
+
"anls": 78.94025308781077,
|
| 101 |
+
"n": 75
|
| 102 |
+
},
|
| 103 |
+
"Technical": {
|
| 104 |
+
"semantic": 86.95652173913044,
|
| 105 |
+
"anls": 83.52609662978936,
|
| 106 |
+
"n": 23
|
| 107 |
+
}
|
| 108 |
+
},
|
| 109 |
+
"n_evaluated": 500,
|
| 110 |
+
"n_unmatched": 0
|
| 111 |
+
},
|
| 112 |
+
"reevaluated_date": "2026-01-15T20:09:11.287271+00:00",
|
| 113 |
+
"source_predictions_file": "Humanity/Human_with_BM25_Search_Tool_predictions_20260109_235325.jsonl",
|
| 114 |
+
"result_file_path": "Humanity/Human_with_BM25_Search_Tool_results_20260109_235325.json"
|
| 115 |
+
}
|
eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_235724.json
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Human with BM25 Search Tool",
|
| 3 |
+
"organization": "Humanity",
|
| 4 |
+
"description": "Human equipped with the same search engine as agentic baselines.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Vision and Language",
|
| 9 |
+
"Sparse Search Tool"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "open-weight"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-09T23:57:24.249882+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 81.02040816326532,
|
| 19 |
+
"anls": 82.43662306660298,
|
| 20 |
+
"page_f1": 78.83484848484848,
|
| 21 |
+
"doc_f1": 92.80112554112554,
|
| 22 |
+
"kuiper": 8.217922606924656
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 79.4,
|
| 26 |
+
"anls": 82.43662306660298,
|
| 27 |
+
"n": 500
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 66.66666666666666,
|
| 42 |
+
"anls": 72.72727272727272,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 90.9090909090909,
|
| 47 |
+
"anls": 88.2664724057374,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 81.25,
|
| 52 |
+
"anls": 79.50268817204301,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 76.08695652173914,
|
| 57 |
+
"anls": 77.5668164633513,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 81.25,
|
| 62 |
+
"anls": 82.47759856630825,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 80.85106382978722,
|
| 67 |
+
"anls": 84.95712782485721,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 84.14634146341463,
|
| 72 |
+
"anls": 81.26741515002432,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 89.53488372093024,
|
| 77 |
+
"anls": 85.09035538105306,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 70.0,
|
| 82 |
+
"anls": 79.91696395686839,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 72.91666666666666,
|
| 87 |
+
"anls": 78.44982803662161,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Other": {
|
| 91 |
+
"semantic": 0.0,
|
| 92 |
+
"anls": 0.0,
|
| 93 |
+
"n": 1
|
| 94 |
+
},
|
| 95 |
+
"Reference": {
|
| 96 |
+
"semantic": 79.8076923076923,
|
| 97 |
+
"anls": 91.40410298984105,
|
| 98 |
+
"n": 52
|
| 99 |
+
},
|
| 100 |
+
"Reports": {
|
| 101 |
+
"semantic": 76.0,
|
| 102 |
+
"anls": 83.7735864211441,
|
| 103 |
+
"n": 75
|
| 104 |
+
},
|
| 105 |
+
"Technical": {
|
| 106 |
+
"semantic": 86.95652173913044,
|
| 107 |
+
"anls": 83.52609662978936,
|
| 108 |
+
"n": 23
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"n_evaluated": 500,
|
| 112 |
+
"n_unmatched": 0
|
| 113 |
+
},
|
| 114 |
+
"reevaluated_date": "2026-01-15T20:09:56.056259+00:00",
|
| 115 |
+
"source_predictions_file": "Humanity/Human_with_BM25_Search_Tool_predictions_20260109_235724.jsonl",
|
| 116 |
+
"result_file_path": "Humanity/Human_with_BM25_Search_Tool_results_20260109_235724.json"
|
| 117 |
+
}
|
eval/reevaluated_results/OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153221.json
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GPT-4.1 (2025-04-14) with BM25 Search Tool",
|
| 3 |
+
"organization": "OpenAI",
|
| 4 |
+
"description": "Max 10 iterations, up to 5 result pages consumed as images.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "api"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-09T15:32:21.908816+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 58.571428571428555,
|
| 19 |
+
"anls": 53.29254644474454,
|
| 20 |
+
"page_f1": 64.14190476190477,
|
| 21 |
+
"doc_f1": 82.82666666666667,
|
| 22 |
+
"kuiper": 43.93199999999983
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 57.4,
|
| 26 |
+
"anls": 53.29254644474454,
|
| 27 |
+
"n": 500
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 53.333333333333336,
|
| 42 |
+
"anls": 48.59180666077218,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 52.27272727272727,
|
| 47 |
+
"anls": 48.04545454545455,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 68.75,
|
| 52 |
+
"anls": 67.55050505050505,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 40.76086956521739,
|
| 57 |
+
"anls": 43.62404525327831,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 71.875,
|
| 62 |
+
"anls": 64.58333333333334,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 55.319148936170215,
|
| 67 |
+
"anls": 51.52629513848961,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 70.73170731707317,
|
| 72 |
+
"anls": 55.117501174925685,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 58.139534883720934,
|
| 77 |
+
"anls": 55.94315245478037,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 40.0,
|
| 82 |
+
"anls": 54.188065268065266,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 77.08333333333334,
|
| 87 |
+
"anls": 69.51844262295083,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Other": {
|
| 91 |
+
"semantic": 0.0,
|
| 92 |
+
"anls": 0.0,
|
| 93 |
+
"n": 1
|
| 94 |
+
},
|
| 95 |
+
"Reference": {
|
| 96 |
+
"semantic": 62.5,
|
| 97 |
+
"anls": 60.011945621794936,
|
| 98 |
+
"n": 52
|
| 99 |
+
},
|
| 100 |
+
"Reports": {
|
| 101 |
+
"semantic": 60.66666666666667,
|
| 102 |
+
"anls": 47.26331129213486,
|
| 103 |
+
"n": 75
|
| 104 |
+
},
|
| 105 |
+
"Technical": {
|
| 106 |
+
"semantic": 67.3913043478261,
|
| 107 |
+
"anls": 61.60068502092203,
|
| 108 |
+
"n": 23
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"n_evaluated": 500,
|
| 112 |
+
"n_unmatched": 0
|
| 113 |
+
},
|
| 114 |
+
"reevaluated_date": "2026-01-15T20:11:50.993374+00:00",
|
| 115 |
+
"source_predictions_file": "OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153221.jsonl",
|
| 116 |
+
"result_file_path": "OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153221.json"
|
| 117 |
+
}
|
eval/reevaluated_results/OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153812.json
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GPT-4.1 Nano (2025-04-14) with BM25 Search Tool",
|
| 3 |
+
"organization": "OpenAI",
|
| 4 |
+
"description": "Max 10 iterations, up to 5 result pages consumed as images.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "api"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-09T15:38:12.353112+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 19.18367346938775,
|
| 19 |
+
"anls": 19.21201395702391,
|
| 20 |
+
"page_f1": 27.60809523809524,
|
| 21 |
+
"doc_f1": 40.18095238095238,
|
| 22 |
+
"kuiper": 27.656000000000265
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 18.8,
|
| 26 |
+
"anls": 19.21201395702391,
|
| 27 |
+
"n": 500
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 13.333333333333334,
|
| 42 |
+
"anls": 12.5,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 36.36363636363637,
|
| 47 |
+
"anls": 33.85540184453228,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 25.0,
|
| 52 |
+
"anls": 24.252897639994416,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 15.217391304347828,
|
| 57 |
+
"anls": 15.744375438721086,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 0.0,
|
| 62 |
+
"anls": 3.125,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 20.212765957446805,
|
| 67 |
+
"anls": 18.040407652422672,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 17.073170731707318,
|
| 72 |
+
"anls": 17.049790482898338,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 23.25581395348837,
|
| 77 |
+
"anls": 20.54263565891473,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 4.0,
|
| 82 |
+
"anls": 13.666666666666666,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 25.0,
|
| 87 |
+
"anls": 28.843503294839174,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Other": {
|
| 91 |
+
"semantic": 0.0,
|
| 92 |
+
"anls": 0.0,
|
| 93 |
+
"n": 1
|
| 94 |
+
},
|
| 95 |
+
"Reference": {
|
| 96 |
+
"semantic": 16.346153846153847,
|
| 97 |
+
"anls": 20.3827772417516,
|
| 98 |
+
"n": 52
|
| 99 |
+
},
|
| 100 |
+
"Reports": {
|
| 101 |
+
"semantic": 23.333333333333332,
|
| 102 |
+
"anls": 19.284216647617285,
|
| 103 |
+
"n": 75
|
| 104 |
+
},
|
| 105 |
+
"Technical": {
|
| 106 |
+
"semantic": 19.565217391304348,
|
| 107 |
+
"anls": 27.075249588209658,
|
| 108 |
+
"n": 23
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"n_evaluated": 500,
|
| 112 |
+
"n_unmatched": 0
|
| 113 |
+
},
|
| 114 |
+
"reevaluated_date": "2026-01-15T20:12:41.288382+00:00",
|
| 115 |
+
"source_predictions_file": "OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153812.jsonl",
|
| 116 |
+
"result_file_path": "OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153812.json"
|
| 117 |
+
}
|
eval/reevaluated_results/OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_results_20260109_151912.json
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GPT-5.2 (2025-12-11) with BM25 Search Tool",
|
| 3 |
+
"organization": "OpenAI",
|
| 4 |
+
"description": "Max 10 iterations, up to 5 result pages consumed as images. GPT-5.2 exhibits more conservative behavior than GPT-5, refusing to provide an answer when uncertain.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "api"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-09T15:19:12.016451+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 66.22448979591837,
|
| 19 |
+
"anls": 57.28438090955278,
|
| 20 |
+
"page_f1": 67.62380952380951,
|
| 21 |
+
"doc_f1": 83.72666666666666,
|
| 22 |
+
"kuiper": 62.57199999999988
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 64.9,
|
| 26 |
+
"anls": 57.28438090955278,
|
| 27 |
+
"n": 500
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 73.33333333333333,
|
| 42 |
+
"anls": 58.46153846153847,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 77.27272727272727,
|
| 47 |
+
"anls": 59.00137741046832,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 72.91666666666666,
|
| 52 |
+
"anls": 57.55050505050505,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 53.2608695652174,
|
| 57 |
+
"anls": 49.679264550051975,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 65.625,
|
| 62 |
+
"anls": 61.08221187025536,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 67.02127659574468,
|
| 67 |
+
"anls": 58.551919442177,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 60.97560975609756,
|
| 72 |
+
"anls": 44.265703074651974,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 77.90697674418605,
|
| 77 |
+
"anls": 66.19399979865096,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 38.0,
|
| 82 |
+
"anls": 35.05751747729549,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 83.33333333333334,
|
| 87 |
+
"anls": 82.5164707977208,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Other": {
|
| 91 |
+
"semantic": 0.0,
|
| 92 |
+
"anls": 0.0,
|
| 93 |
+
"n": 1
|
| 94 |
+
},
|
| 95 |
+
"Reference": {
|
| 96 |
+
"semantic": 72.11538461538461,
|
| 97 |
+
"anls": 67.62508443509842,
|
| 98 |
+
"n": 52
|
| 99 |
+
},
|
| 100 |
+
"Reports": {
|
| 101 |
+
"semantic": 64.66666666666666,
|
| 102 |
+
"anls": 59.65381728416852,
|
| 103 |
+
"n": 75
|
| 104 |
+
},
|
| 105 |
+
"Technical": {
|
| 106 |
+
"semantic": 60.86956521739131,
|
| 107 |
+
"anls": 55.55075090789312,
|
| 108 |
+
"n": 23
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"n_evaluated": 500,
|
| 112 |
+
"n_unmatched": 0
|
| 113 |
+
},
|
| 114 |
+
"reevaluated_date": "2026-01-15T20:14:52.407712+00:00",
|
| 115 |
+
"source_predictions_file": "OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_predictions_20260109_151912.jsonl",
|
| 116 |
+
"result_file_path": "OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_results_20260109_151912.json"
|
| 117 |
+
}
|
eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152104.json
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GPT-5 (2025-08-07) with BM25 Search Tool",
|
| 3 |
+
"organization": "OpenAI",
|
| 4 |
+
"description": "Max 10 iterations, up to 5 result pages consumed as images.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "api"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-09T15:21:04.336083+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 76.02040816326532,
|
| 19 |
+
"anls": 70.03817583122695,
|
| 20 |
+
"page_f1": 74.16285714285713,
|
| 21 |
+
"doc_f1": 86.45064935064934,
|
| 22 |
+
"kuiper": 52.256000000000114
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 74.5,
|
| 26 |
+
"anls": 70.03817583122695,
|
| 27 |
+
"n": 500
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 66.66666666666666,
|
| 42 |
+
"anls": 62.757834757834765,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 68.18181818181817,
|
| 47 |
+
"anls": 63.54683195592287,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 83.33333333333334,
|
| 52 |
+
"anls": 78.3838383838384,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 65.21739130434783,
|
| 57 |
+
"anls": 62.36899647186356,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 81.25,
|
| 62 |
+
"anls": 86.77496898263027,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 73.40425531914893,
|
| 67 |
+
"anls": 68.7671602173282,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 71.95121951219512,
|
| 72 |
+
"anls": 64.5688672367669,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 79.06976744186046,
|
| 77 |
+
"anls": 70.27143399236422,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 60.0,
|
| 82 |
+
"anls": 65.71897407160566,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 83.33333333333334,
|
| 87 |
+
"anls": 86.70405982905983,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Other": {
|
| 91 |
+
"semantic": 0.0,
|
| 92 |
+
"anls": 0.0,
|
| 93 |
+
"n": 1
|
| 94 |
+
},
|
| 95 |
+
"Reference": {
|
| 96 |
+
"semantic": 76.92307692307693,
|
| 97 |
+
"anls": 76.57306264232653,
|
| 98 |
+
"n": 52
|
| 99 |
+
},
|
| 100 |
+
"Reports": {
|
| 101 |
+
"semantic": 80.0,
|
| 102 |
+
"anls": 71.72139814224423,
|
| 103 |
+
"n": 75
|
| 104 |
+
},
|
| 105 |
+
"Technical": {
|
| 106 |
+
"semantic": 93.47826086956522,
|
| 107 |
+
"anls": 73.31752767476483,
|
| 108 |
+
"n": 23
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"n_evaluated": 500,
|
| 112 |
+
"n_unmatched": 0
|
| 113 |
+
},
|
| 114 |
+
"reevaluated_date": "2026-01-15T20:15:52.047010+00:00",
|
| 115 |
+
"source_predictions_file": "OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152104.jsonl",
|
| 116 |
+
"result_file_path": "OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152104.json"
|
| 117 |
+
}
|
eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_File_Search_results_20260104_140537.json
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GPT-5 (2025-08-07) with File Search",
|
| 3 |
+
"organization": "OpenAI",
|
| 4 |
+
"description": "Managed, single-shot retrieval mechanism.",
|
| 5 |
+
"link": "https://platform.openai.com/docs/guides/tools-file-search",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Conventional RAG",
|
| 8 |
+
"Semantic Search Tool"
|
| 9 |
+
],
|
| 10 |
+
"submitted_by": "Borchmann",
|
| 11 |
+
"metadata": {
|
| 12 |
+
"model_type": "api"
|
| 13 |
+
},
|
| 14 |
+
"submission_date": "2026-01-04T14:05:37.240829+00:00",
|
| 15 |
+
"results": {
|
| 16 |
+
"overall": {
|
| 17 |
+
"semantic": 48.061224489795926,
|
| 18 |
+
"anls": 44.84773268944071,
|
| 19 |
+
"page_f1": 29.277142857142856,
|
| 20 |
+
"doc_f1": 66.60666666666667,
|
| 21 |
+
"kuiper": 31.15400000000007
|
| 22 |
+
},
|
| 23 |
+
"single_evidence": {
|
| 24 |
+
"semantic": 47.099999999999994,
|
| 25 |
+
"anls": 44.84773268944071,
|
| 26 |
+
"n": 500
|
| 27 |
+
},
|
| 28 |
+
"multi_evidence_same_doc": {
|
| 29 |
+
"semantic": 0,
|
| 30 |
+
"anls": 0,
|
| 31 |
+
"n": 0
|
| 32 |
+
},
|
| 33 |
+
"multi_evidence_multi_doc": {
|
| 34 |
+
"semantic": 0,
|
| 35 |
+
"anls": 0,
|
| 36 |
+
"n": 0
|
| 37 |
+
},
|
| 38 |
+
"by_domain": {
|
| 39 |
+
"Cases/Logs": {
|
| 40 |
+
"semantic": 13.333333333333334,
|
| 41 |
+
"anls": 14.833333333333334,
|
| 42 |
+
"n": 15
|
| 43 |
+
},
|
| 44 |
+
"Education": {
|
| 45 |
+
"semantic": 79.54545454545455,
|
| 46 |
+
"anls": 63.871507280598195,
|
| 47 |
+
"n": 22
|
| 48 |
+
},
|
| 49 |
+
"Events": {
|
| 50 |
+
"semantic": 72.91666666666666,
|
| 51 |
+
"anls": 55.83149489399489,
|
| 52 |
+
"n": 24
|
| 53 |
+
},
|
| 54 |
+
"Financial": {
|
| 55 |
+
"semantic": 49.45652173913043,
|
| 56 |
+
"anls": 46.26513610007698,
|
| 57 |
+
"n": 92
|
| 58 |
+
},
|
| 59 |
+
"Financial/Tax": {
|
| 60 |
+
"semantic": 15.625,
|
| 61 |
+
"anls": 17.540322580645164,
|
| 62 |
+
"n": 16
|
| 63 |
+
},
|
| 64 |
+
"Government/Regulatory": {
|
| 65 |
+
"semantic": 45.744680851063826,
|
| 66 |
+
"anls": 41.75603723934328,
|
| 67 |
+
"n": 47
|
| 68 |
+
},
|
| 69 |
+
"HR/Employment": {
|
| 70 |
+
"semantic": 39.02439024390244,
|
| 71 |
+
"anls": 42.22238179140625,
|
| 72 |
+
"n": 41
|
| 73 |
+
},
|
| 74 |
+
"Legal": {
|
| 75 |
+
"semantic": 37.2093023255814,
|
| 76 |
+
"anls": 32.74308378959542,
|
| 77 |
+
"n": 43
|
| 78 |
+
},
|
| 79 |
+
"Media/Publishing": {
|
| 80 |
+
"semantic": 46.0,
|
| 81 |
+
"anls": 45.83167739167739,
|
| 82 |
+
"n": 25
|
| 83 |
+
},
|
| 84 |
+
"Misc": {
|
| 85 |
+
"semantic": 64.58333333333334,
|
| 86 |
+
"anls": 67.18447826857438,
|
| 87 |
+
"n": 24
|
| 88 |
+
},
|
| 89 |
+
"Other": {
|
| 90 |
+
"semantic": 0.0,
|
| 91 |
+
"anls": 0.0,
|
| 92 |
+
"n": 1
|
| 93 |
+
},
|
| 94 |
+
"Reference": {
|
| 95 |
+
"semantic": 39.42307692307692,
|
| 96 |
+
"anls": 40.48309244262362,
|
| 97 |
+
"n": 52
|
| 98 |
+
},
|
| 99 |
+
"Reports": {
|
| 100 |
+
"semantic": 46.666666666666664,
|
| 101 |
+
"anls": 46.80494177991155,
|
| 102 |
+
"n": 75
|
| 103 |
+
},
|
| 104 |
+
"Technical": {
|
| 105 |
+
"semantic": 63.04347826086957,
|
| 106 |
+
"anls": 62.77759844334801,
|
| 107 |
+
"n": 23
|
| 108 |
+
}
|
| 109 |
+
},
|
| 110 |
+
"n_evaluated": 500,
|
| 111 |
+
"n_unmatched": 0
|
| 112 |
+
},
|
| 113 |
+
"reevaluated_date": "2026-01-15T20:17:01.554804+00:00",
|
| 114 |
+
"source_predictions_file": "OpenAI/GPT-5_(2025-08-07)_with_File_Search_predictions_20260104_140537.jsonl",
|
| 115 |
+
"result_file_path": "OpenAI/GPT-5_(2025-08-07)_with_File_Search_results_20260104_140537.json"
|
| 116 |
+
}
|
eval/reevaluated_results/OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152650.json
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GPT-5 Mini (2025-08-07) with BM25 Search Tool",
|
| 3 |
+
"organization": "OpenAI",
|
| 4 |
+
"description": "Max 10 iterations, up to 5 result pages consumed as images.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "api"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-09T15:26:50.820104+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 65.0,
|
| 19 |
+
"anls": 55.16542612989696,
|
| 20 |
+
"page_f1": 67.57095238095239,
|
| 21 |
+
"doc_f1": 82.35303030303031,
|
| 22 |
+
"kuiper": 71.86573146292572
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 63.7,
|
| 26 |
+
"anls": 55.16542612989696,
|
| 27 |
+
"n": 500
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 66.66666666666666,
|
| 42 |
+
"anls": 57.16524216524217,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 68.18181818181817,
|
| 47 |
+
"anls": 63.349203497424845,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 62.5,
|
| 52 |
+
"anls": 53.63190419293608,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 47.28260869565217,
|
| 57 |
+
"anls": 43.770804881794874,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 46.875,
|
| 62 |
+
"anls": 39.15760869565217,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 71.27659574468085,
|
| 67 |
+
"anls": 62.856694438441366,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 60.97560975609756,
|
| 72 |
+
"anls": 51.21538014830698,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 75.5813953488372,
|
| 77 |
+
"anls": 62.31744836688789,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 56.00000000000001,
|
| 82 |
+
"anls": 39.93216037493774,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 70.83333333333334,
|
| 87 |
+
"anls": 63.35950315116982,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Other": {
|
| 91 |
+
"semantic": 0.0,
|
| 92 |
+
"anls": 0.0,
|
| 93 |
+
"n": 1
|
| 94 |
+
},
|
| 95 |
+
"Reference": {
|
| 96 |
+
"semantic": 76.92307692307693,
|
| 97 |
+
"anls": 73.02503210878088,
|
| 98 |
+
"n": 52
|
| 99 |
+
},
|
| 100 |
+
"Reports": {
|
| 101 |
+
"semantic": 66.66666666666666,
|
| 102 |
+
"anls": 54.869395530526155,
|
| 103 |
+
"n": 75
|
| 104 |
+
},
|
| 105 |
+
"Technical": {
|
| 106 |
+
"semantic": 67.3913043478261,
|
| 107 |
+
"anls": 53.29419750997293,
|
| 108 |
+
"n": 23
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"n_evaluated": 500,
|
| 112 |
+
"n_unmatched": 0
|
| 113 |
+
},
|
| 114 |
+
"reevaluated_date": "2026-01-15T20:18:52.086804+00:00",
|
| 115 |
+
"source_predictions_file": "OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152650.jsonl",
|
| 116 |
+
"result_file_path": "OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152650.json"
|
| 117 |
+
}
|
eval/reevaluated_results/OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152828.json
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GPT-5 Nano (2025-08-07) with BM25 Search Tool",
|
| 3 |
+
"organization": "OpenAI",
|
| 4 |
+
"description": "Max 10 iterations, up to 5 result pages consumed as images.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "api"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-09T15:28:28.366309+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 56.6326530612245,
|
| 19 |
+
"anls": 52.255247982009955,
|
| 20 |
+
"page_f1": 60.877142857142864,
|
| 21 |
+
"doc_f1": 82.2030303030303,
|
| 22 |
+
"kuiper": 47.40000000000003
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 55.50000000000001,
|
| 26 |
+
"anls": 52.255247982009955,
|
| 27 |
+
"n": 500
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 53.333333333333336,
|
| 42 |
+
"anls": 53.461538461538474,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 63.63636363636363,
|
| 47 |
+
"anls": 54.95375836284927,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 62.5,
|
| 52 |
+
"anls": 51.78930433365917,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 40.21739130434783,
|
| 57 |
+
"anls": 40.14762316798784,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 68.75,
|
| 62 |
+
"anls": 69.68257767828244,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 64.8936170212766,
|
| 67 |
+
"anls": 56.496054764723326,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 52.4390243902439,
|
| 72 |
+
"anls": 42.85858107680723,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 58.139534883720934,
|
| 77 |
+
"anls": 55.28314708547266,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 50.0,
|
| 82 |
+
"anls": 51.784085491742935,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 70.83333333333334,
|
| 87 |
+
"anls": 74.53137140637142,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Other": {
|
| 91 |
+
"semantic": 0.0,
|
| 92 |
+
"anls": 0.0,
|
| 93 |
+
"n": 1
|
| 94 |
+
},
|
| 95 |
+
"Reference": {
|
| 96 |
+
"semantic": 57.692307692307686,
|
| 97 |
+
"anls": 61.940508414693205,
|
| 98 |
+
"n": 52
|
| 99 |
+
},
|
| 100 |
+
"Reports": {
|
| 101 |
+
"semantic": 54.666666666666664,
|
| 102 |
+
"anls": 48.18660787855504,
|
| 103 |
+
"n": 75
|
| 104 |
+
},
|
| 105 |
+
"Technical": {
|
| 106 |
+
"semantic": 65.21739130434783,
|
| 107 |
+
"anls": 59.014067370235345,
|
| 108 |
+
"n": 23
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"n_evaluated": 500,
|
| 112 |
+
"n_unmatched": 0
|
| 113 |
+
},
|
| 114 |
+
"reevaluated_date": "2026-01-15T20:19:54.021229+00:00",
|
| 115 |
+
"source_predictions_file": "OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152828.jsonl",
|
| 116 |
+
"result_file_path": "OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152828.json"
|
| 117 |
+
}
|
eval/reevaluated_results/OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153516.json
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GPT 4.1 Mini (2025-04-14) with BM25 Search Tool",
|
| 3 |
+
"organization": "OpenAI",
|
| 4 |
+
"description": "Max 10 iterations, up to 5 result pages consumed as images.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "api"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-09T15:35:16.458002+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 51.22448979591837,
|
| 19 |
+
"anls": 46.26708858125157,
|
| 20 |
+
"page_f1": 59.905054945054935,
|
| 21 |
+
"doc_f1": 77.61731601731601,
|
| 22 |
+
"kuiper": 40.01224489795946
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 50.2,
|
| 26 |
+
"anls": 46.26708858125157,
|
| 27 |
+
"n": 500
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 43.333333333333336,
|
| 42 |
+
"anls": 39.64209401709402,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 59.09090909090909,
|
| 47 |
+
"anls": 48.57647622469757,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 56.25,
|
| 52 |
+
"anls": 53.83018770627063,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 35.869565217391305,
|
| 57 |
+
"anls": 34.96285359224887,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 46.875,
|
| 62 |
+
"anls": 44.4215309712932,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 44.680851063829785,
|
| 67 |
+
"anls": 44.19583719868558,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 53.65853658536586,
|
| 72 |
+
"anls": 46.501429746354255,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 53.48837209302325,
|
| 77 |
+
"anls": 43.64210613408689,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 48.0,
|
| 82 |
+
"anls": 46.71106819031614,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 56.25,
|
| 87 |
+
"anls": 55.49877795634123,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Other": {
|
| 91 |
+
"semantic": 0.0,
|
| 92 |
+
"anls": 0.0,
|
| 93 |
+
"n": 1
|
| 94 |
+
},
|
| 95 |
+
"Reference": {
|
| 96 |
+
"semantic": 62.5,
|
| 97 |
+
"anls": 62.86510186138165,
|
| 98 |
+
"n": 52
|
| 99 |
+
},
|
| 100 |
+
"Reports": {
|
| 101 |
+
"semantic": 51.33333333333333,
|
| 102 |
+
"anls": 45.15164464860224,
|
| 103 |
+
"n": 75
|
| 104 |
+
},
|
| 105 |
+
"Technical": {
|
| 106 |
+
"semantic": 65.21739130434783,
|
| 107 |
+
"anls": 53.71736172158072,
|
| 108 |
+
"n": 23
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"n_evaluated": 500,
|
| 112 |
+
"n_unmatched": 0
|
| 113 |
+
},
|
| 114 |
+
"reevaluated_date": "2026-01-15T20:21:13.745638+00:00",
|
| 115 |
+
"source_predictions_file": "OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153516.jsonl",
|
| 116 |
+
"result_file_path": "OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153516.json"
|
| 117 |
+
}
|
eval/reevaluated_results/OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_results_20260109_185347.json
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "Gemini 3 Pro with BM25 Search Tool",
|
| 3 |
+
"organization": "OpenAI",
|
| 4 |
+
"description": "Max 10 iterations, up to 5 result pages consumed as images.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "api"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-09T18:53:47.189606+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 80.16032064128255,
|
| 19 |
+
"anls": 73.52101315170081,
|
| 20 |
+
"page_f1": 78.4607309857811,
|
| 21 |
+
"doc_f1": 90.20248288785363,
|
| 22 |
+
"kuiper": 26.781563126252323
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 78.55711422845691,
|
| 26 |
+
"anls": 73.52101315170081,
|
| 27 |
+
"n": 499
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 80.0,
|
| 42 |
+
"anls": 85.12820512820514,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 77.27272727272727,
|
| 47 |
+
"anls": 64.8800482891392,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 81.25,
|
| 52 |
+
"anls": 79.84423442344234,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 66.84782608695652,
|
| 57 |
+
"anls": 63.13552237747254,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 93.75,
|
| 62 |
+
"anls": 93.48332554153032,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 87.2340425531915,
|
| 67 |
+
"anls": 78.26722646935413,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 86.58536585365853,
|
| 72 |
+
"anls": 77.34609828919353,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 82.55813953488372,
|
| 77 |
+
"anls": 68.10496996543507,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 78.0,
|
| 82 |
+
"anls": 79.13892729939242,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 85.41666666666666,
|
| 87 |
+
"anls": 85.4921497584541,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Other": {
|
| 91 |
+
"semantic": 0.0,
|
| 92 |
+
"anls": 0.0,
|
| 93 |
+
"n": 1
|
| 94 |
+
},
|
| 95 |
+
"Reference": {
|
| 96 |
+
"semantic": 81.73076923076923,
|
| 97 |
+
"anls": 83.68517307852197,
|
| 98 |
+
"n": 52
|
| 99 |
+
},
|
| 100 |
+
"Reports": {
|
| 101 |
+
"semantic": 74.32432432432432,
|
| 102 |
+
"anls": 71.94584088751826,
|
| 103 |
+
"n": 74
|
| 104 |
+
},
|
| 105 |
+
"Technical": {
|
| 106 |
+
"semantic": 76.08695652173914,
|
| 107 |
+
"anls": 55.56822369489126,
|
| 108 |
+
"n": 23
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"n_evaluated": 499,
|
| 112 |
+
"n_unmatched": 0
|
| 113 |
+
},
|
| 114 |
+
"reevaluated_date": "2026-01-15T20:23:21.812681+00:00",
|
| 115 |
+
"source_predictions_file": "OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_predictions_20260109_185347.jsonl",
|
| 116 |
+
"result_file_path": "OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_results_20260109_185347.json"
|
| 117 |
+
}
|
eval/reevaluated_results/OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260107_113714.json
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GPT-4o (2024-08-06) with HEAVEN Retrieval",
|
| 3 |
+
"organization": "OpenAI - KAIST",
|
| 4 |
+
"description": "",
|
| 5 |
+
"link": null,
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic"
|
| 8 |
+
],
|
| 9 |
+
"submitted_by": null,
|
| 10 |
+
"metadata": {
|
| 11 |
+
"model_type": "unknown"
|
| 12 |
+
},
|
| 13 |
+
"submission_date": null,
|
| 14 |
+
"results": {
|
| 15 |
+
"overall": {
|
| 16 |
+
"semantic": 43.469387755102034,
|
| 17 |
+
"anls": 40.039307087937075,
|
| 18 |
+
"page_f1": 43.05228327228327,
|
| 19 |
+
"doc_f1": 56.64095238095238,
|
| 20 |
+
"kuiper": null
|
| 21 |
+
},
|
| 22 |
+
"single_evidence": {
|
| 23 |
+
"semantic": 42.6,
|
| 24 |
+
"anls": 40.039307087937075,
|
| 25 |
+
"n": 500
|
| 26 |
+
},
|
| 27 |
+
"multi_evidence_same_doc": {
|
| 28 |
+
"semantic": 0,
|
| 29 |
+
"anls": 0,
|
| 30 |
+
"n": 0
|
| 31 |
+
},
|
| 32 |
+
"multi_evidence_multi_doc": {
|
| 33 |
+
"semantic": 0,
|
| 34 |
+
"anls": 0,
|
| 35 |
+
"n": 0
|
| 36 |
+
},
|
| 37 |
+
"by_domain": {
|
| 38 |
+
"Cases/Logs": {
|
| 39 |
+
"semantic": 46.666666666666664,
|
| 40 |
+
"anls": 46.75783475783476,
|
| 41 |
+
"n": 15
|
| 42 |
+
},
|
| 43 |
+
"Education": {
|
| 44 |
+
"semantic": 36.36363636363637,
|
| 45 |
+
"anls": 36.95054945054945,
|
| 46 |
+
"n": 22
|
| 47 |
+
},
|
| 48 |
+
"Events": {
|
| 49 |
+
"semantic": 43.75,
|
| 50 |
+
"anls": 38.03661616161616,
|
| 51 |
+
"n": 24
|
| 52 |
+
},
|
| 53 |
+
"Financial": {
|
| 54 |
+
"semantic": 42.934782608695656,
|
| 55 |
+
"anls": 42.52300514978308,
|
| 56 |
+
"n": 92
|
| 57 |
+
},
|
| 58 |
+
"Financial/Tax": {
|
| 59 |
+
"semantic": 31.25,
|
| 60 |
+
"anls": 31.922043010752688,
|
| 61 |
+
"n": 16
|
| 62 |
+
},
|
| 63 |
+
"Government/Regulatory": {
|
| 64 |
+
"semantic": 44.680851063829785,
|
| 65 |
+
"anls": 36.32965392203914,
|
| 66 |
+
"n": 47
|
| 67 |
+
},
|
| 68 |
+
"HR/Employment": {
|
| 69 |
+
"semantic": 39.02439024390244,
|
| 70 |
+
"anls": 33.06592985170988,
|
| 71 |
+
"n": 41
|
| 72 |
+
},
|
| 73 |
+
"Legal": {
|
| 74 |
+
"semantic": 41.86046511627907,
|
| 75 |
+
"anls": 33.1515319306017,
|
| 76 |
+
"n": 43
|
| 77 |
+
},
|
| 78 |
+
"Media/Publishing": {
|
| 79 |
+
"semantic": 20.0,
|
| 80 |
+
"anls": 31.078787878787878,
|
| 81 |
+
"n": 25
|
| 82 |
+
},
|
| 83 |
+
"Misc": {
|
| 84 |
+
"semantic": 52.083333333333336,
|
| 85 |
+
"anls": 51.80921052631579,
|
| 86 |
+
"n": 24
|
| 87 |
+
},
|
| 88 |
+
"Other": {
|
| 89 |
+
"semantic": 0.0,
|
| 90 |
+
"anls": 0.0,
|
| 91 |
+
"n": 1
|
| 92 |
+
},
|
| 93 |
+
"Reference": {
|
| 94 |
+
"semantic": 42.30769230769231,
|
| 95 |
+
"anls": 46.93060276608143,
|
| 96 |
+
"n": 52
|
| 97 |
+
},
|
| 98 |
+
"Reports": {
|
| 99 |
+
"semantic": 52.0,
|
| 100 |
+
"anls": 44.41148230399428,
|
| 101 |
+
"n": 75
|
| 102 |
+
},
|
| 103 |
+
"Technical": {
|
| 104 |
+
"semantic": 41.30434782608695,
|
| 105 |
+
"anls": 38.6639124934416,
|
| 106 |
+
"n": 23
|
| 107 |
+
}
|
| 108 |
+
},
|
| 109 |
+
"n_evaluated": 500,
|
| 110 |
+
"n_unmatched": 0
|
| 111 |
+
},
|
| 112 |
+
"reevaluated_date": "2026-01-15T20:24:35.010694+00:00",
|
| 113 |
+
"source_predictions_file": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_predictions_20260107_113714.jsonl",
|
| 114 |
+
"result_file_path": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260107_113714.json"
|
| 115 |
+
}
|
eval/reevaluated_results/OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_154427.json
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GPT-4o (2024-08-06) with HEAVEN Retrieval",
|
| 3 |
+
"organization": "OpenAI / KAIST",
|
| 4 |
+
"description": "Image-based retrieval. Best setup described in HEAVEN paper.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Conventional RAG",
|
| 8 |
+
"Semantic Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "api"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-09T15:44:27.735534+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 46.73469387755102,
|
| 19 |
+
"anls": 45.649762341432954,
|
| 20 |
+
"page_f1": 43.169719169719166,
|
| 21 |
+
"doc_f1": 59.24761904761905,
|
| 22 |
+
"kuiper": null
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 45.800000000000004,
|
| 26 |
+
"anls": 45.649762341432954,
|
| 27 |
+
"n": 500
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 53.333333333333336,
|
| 42 |
+
"anls": 48.75783475783476,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 40.909090909090914,
|
| 47 |
+
"anls": 38.506493506493506,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 56.25,
|
| 52 |
+
"anls": 55.056754787358244,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 43.47826086956522,
|
| 57 |
+
"anls": 48.16466676354977,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 50.0,
|
| 62 |
+
"anls": 44.99022482893451,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 53.191489361702125,
|
| 67 |
+
"anls": 47.1956486962086,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 40.243902439024396,
|
| 72 |
+
"anls": 32.93040293040293,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 37.2093023255814,
|
| 77 |
+
"anls": 35.73555320648344,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 28.000000000000004,
|
| 82 |
+
"anls": 43.22,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 47.91666666666667,
|
| 87 |
+
"anls": 49.20255183413078,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Other": {
|
| 91 |
+
"semantic": 0.0,
|
| 92 |
+
"anls": 0.0,
|
| 93 |
+
"n": 1
|
| 94 |
+
},
|
| 95 |
+
"Reference": {
|
| 96 |
+
"semantic": 46.15384615384615,
|
| 97 |
+
"anls": 48.954805357154754,
|
| 98 |
+
"n": 52
|
| 99 |
+
},
|
| 100 |
+
"Reports": {
|
| 101 |
+
"semantic": 52.0,
|
| 102 |
+
"anls": 50.65907330372244,
|
| 103 |
+
"n": 75
|
| 104 |
+
},
|
| 105 |
+
"Technical": {
|
| 106 |
+
"semantic": 50.0,
|
| 107 |
+
"anls": 46.20014437749956,
|
| 108 |
+
"n": 23
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"n_evaluated": 500,
|
| 112 |
+
"n_unmatched": 0
|
| 113 |
+
},
|
| 114 |
+
"reevaluated_date": "2026-01-15T20:25:44.256079+00:00",
|
| 115 |
+
"source_predictions_file": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_predictions_20260109_154427.jsonl",
|
| 116 |
+
"result_file_path": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_154427.json"
|
| 117 |
+
}
|
eval/reevaluated_results/OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_175639.json
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GPT-5.2 (2024-08-06) with HEAVEN Retrieval",
|
| 3 |
+
"organization": "OpenAI / KAIST",
|
| 4 |
+
"description": "Image-based retrieval. Best setup described in HEAVEN paper, but with newer GPT.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Conventional RAG",
|
| 8 |
+
"Semantic Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "api"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-09T17:56:39.771528+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 50.0,
|
| 19 |
+
"anls": 47.46445252141211,
|
| 20 |
+
"page_f1": 48.43228327228327,
|
| 21 |
+
"doc_f1": 62.30761904761904,
|
| 22 |
+
"kuiper": null
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 49.0,
|
| 26 |
+
"anls": 47.46445252141211,
|
| 27 |
+
"n": 500
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 53.333333333333336,
|
| 42 |
+
"anls": 43.64672364672364,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 52.27272727272727,
|
| 47 |
+
"anls": 51.569264069264065,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 47.91666666666667,
|
| 52 |
+
"anls": 46.90982404692082,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 48.369565217391305,
|
| 57 |
+
"anls": 48.83531625708929,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 43.75,
|
| 62 |
+
"anls": 43.92031798457114,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 59.57446808510638,
|
| 67 |
+
"anls": 49.070286122357786,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 39.02439024390244,
|
| 72 |
+
"anls": 34.149915125524885,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 45.348837209302324,
|
| 77 |
+
"anls": 46.299372462163156,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 26.0,
|
| 82 |
+
"anls": 33.613578417414736,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 50.0,
|
| 87 |
+
"anls": 51.63690476190476,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Other": {
|
| 91 |
+
"semantic": 0.0,
|
| 92 |
+
"anls": 0.0,
|
| 93 |
+
"n": 1
|
| 94 |
+
},
|
| 95 |
+
"Reference": {
|
| 96 |
+
"semantic": 53.84615384615385,
|
| 97 |
+
"anls": 58.28202679165414,
|
| 98 |
+
"n": 52
|
| 99 |
+
},
|
| 100 |
+
"Reports": {
|
| 101 |
+
"semantic": 52.666666666666664,
|
| 102 |
+
"anls": 52.18098320525303,
|
| 103 |
+
"n": 75
|
| 104 |
+
},
|
| 105 |
+
"Technical": {
|
| 106 |
+
"semantic": 56.52173913043478,
|
| 107 |
+
"anls": 39.14801495210919,
|
| 108 |
+
"n": 23
|
| 109 |
+
}
|
| 110 |
+
},
|
| 111 |
+
"n_evaluated": 500,
|
| 112 |
+
"n_unmatched": 0
|
| 113 |
+
},
|
| 114 |
+
"reevaluated_date": "2026-01-15T20:27:00.066247+00:00",
|
| 115 |
+
"source_predictions_file": "OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_predictions_20260109_175639.jsonl",
|
| 116 |
+
"result_file_path": "OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_175639.json"
|
| 117 |
+
}
|
eval/reevaluated_results/OpenAI_-_KAIST/GPT-5.2_(2025-12-11)_with_HEAVEN_Retrieval_results_20260107_153009.json
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GPT-5.2 (2025-12-11) with HEAVEN Retrieval",
|
| 3 |
+
"organization": "OpenAI - KAIST",
|
| 4 |
+
"description": "",
|
| 5 |
+
"link": null,
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic"
|
| 8 |
+
],
|
| 9 |
+
"submitted_by": null,
|
| 10 |
+
"metadata": {
|
| 11 |
+
"model_type": "unknown"
|
| 12 |
+
},
|
| 13 |
+
"submission_date": null,
|
| 14 |
+
"results": {
|
| 15 |
+
"overall": {
|
| 16 |
+
"semantic": 48.16326530612245,
|
| 17 |
+
"anls": 43.22495703626025,
|
| 18 |
+
"page_f1": 46.402539682539675,
|
| 19 |
+
"doc_f1": 57.27428571428571,
|
| 20 |
+
"kuiper": null
|
| 21 |
+
},
|
| 22 |
+
"single_evidence": {
|
| 23 |
+
"semantic": 47.199999999999996,
|
| 24 |
+
"anls": 43.22495703626025,
|
| 25 |
+
"n": 500
|
| 26 |
+
},
|
| 27 |
+
"multi_evidence_same_doc": {
|
| 28 |
+
"semantic": 0,
|
| 29 |
+
"anls": 0,
|
| 30 |
+
"n": 0
|
| 31 |
+
},
|
| 32 |
+
"multi_evidence_multi_doc": {
|
| 33 |
+
"semantic": 0,
|
| 34 |
+
"anls": 0,
|
| 35 |
+
"n": 0
|
| 36 |
+
},
|
| 37 |
+
"by_domain": {
|
| 38 |
+
"Cases/Logs": {
|
| 39 |
+
"semantic": 60.0,
|
| 40 |
+
"anls": 43.64672364672364,
|
| 41 |
+
"n": 15
|
| 42 |
+
},
|
| 43 |
+
"Education": {
|
| 44 |
+
"semantic": 45.45454545454545,
|
| 45 |
+
"anls": 41.99134199134198,
|
| 46 |
+
"n": 22
|
| 47 |
+
},
|
| 48 |
+
"Events": {
|
| 49 |
+
"semantic": 47.91666666666667,
|
| 50 |
+
"anls": 42.272727272727266,
|
| 51 |
+
"n": 24
|
| 52 |
+
},
|
| 53 |
+
"Financial": {
|
| 54 |
+
"semantic": 43.47826086956522,
|
| 55 |
+
"anls": 39.79157919704788,
|
| 56 |
+
"n": 92
|
| 57 |
+
},
|
| 58 |
+
"Financial/Tax": {
|
| 59 |
+
"semantic": 43.75,
|
| 60 |
+
"anls": 45.17687392862708,
|
| 61 |
+
"n": 16
|
| 62 |
+
},
|
| 63 |
+
"Government/Regulatory": {
|
| 64 |
+
"semantic": 53.191489361702125,
|
| 65 |
+
"anls": 47.41888368008188,
|
| 66 |
+
"n": 47
|
| 67 |
+
},
|
| 68 |
+
"HR/Employment": {
|
| 69 |
+
"semantic": 40.243902439024396,
|
| 70 |
+
"anls": 32.19869561332976,
|
| 71 |
+
"n": 41
|
| 72 |
+
},
|
| 73 |
+
"Legal": {
|
| 74 |
+
"semantic": 43.02325581395349,
|
| 75 |
+
"anls": 42.43497069635968,
|
| 76 |
+
"n": 43
|
| 77 |
+
},
|
| 78 |
+
"Media/Publishing": {
|
| 79 |
+
"semantic": 26.0,
|
| 80 |
+
"anls": 30.585587652734088,
|
| 81 |
+
"n": 25
|
| 82 |
+
},
|
| 83 |
+
"Misc": {
|
| 84 |
+
"semantic": 45.83333333333333,
|
| 85 |
+
"anls": 47.470238095238095,
|
| 86 |
+
"n": 24
|
| 87 |
+
},
|
| 88 |
+
"Other": {
|
| 89 |
+
"semantic": 0.0,
|
| 90 |
+
"anls": 0.0,
|
| 91 |
+
"n": 1
|
| 92 |
+
},
|
| 93 |
+
"Reference": {
|
| 94 |
+
"semantic": 53.84615384615385,
|
| 95 |
+
"anls": 56.89824130630616,
|
| 96 |
+
"n": 52
|
| 97 |
+
},
|
| 98 |
+
"Reports": {
|
| 99 |
+
"semantic": 54.0,
|
| 100 |
+
"anls": 46.64353866236792,
|
| 101 |
+
"n": 75
|
| 102 |
+
},
|
| 103 |
+
"Technical": {
|
| 104 |
+
"semantic": 54.347826086956516,
|
| 105 |
+
"anls": 39.18827260106249,
|
| 106 |
+
"n": 23
|
| 107 |
+
}
|
| 108 |
+
},
|
| 109 |
+
"n_evaluated": 500,
|
| 110 |
+
"n_unmatched": 0
|
| 111 |
+
},
|
| 112 |
+
"reevaluated_date": "2026-01-15T20:28:06.717531+00:00",
|
| 113 |
+
"source_predictions_file": "OpenAI_-_KAIST/GPT-5.2_(2025-12-11)_with_HEAVEN_Retrieval_predictions_20260107_153009.jsonl",
|
| 114 |
+
"result_file_path": "OpenAI_-_KAIST/GPT-5.2_(2025-12-11)_with_HEAVEN_Retrieval_results_20260107_153009.json"
|
| 115 |
+
}
|
eval/reevaluated_results/Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_results_20260110_132227.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GLM-4.6V Flash with BM25 Search Tool",
|
| 3 |
+
"organization": "Z.AI",
|
| 4 |
+
"description": "Max 10 iterations, up to 5 result pages consumed as images.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "open-weight"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-10T13:22:27.811792+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 43.658746063555675,
|
| 19 |
+
"anls": 30.17090068718362,
|
| 20 |
+
"page_f1": 28.991793110029583,
|
| 21 |
+
"doc_f1": 51.58650634602539,
|
| 22 |
+
"kuiper": 29.321285140562065
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 42.78557114228457,
|
| 26 |
+
"anls": 30.17090068718362,
|
| 27 |
+
"n": 499
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 43.333333333333336,
|
| 42 |
+
"anls": 30.313390313390315,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 54.54545454545454,
|
| 47 |
+
"anls": 34.34782608695652,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 64.58333333333334,
|
| 52 |
+
"anls": 52.92922722985768,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 34.78260869565217,
|
| 57 |
+
"anls": 23.538822057620244,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 21.875,
|
| 62 |
+
"anls": 21.39516129032258,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 37.234042553191486,
|
| 67 |
+
"anls": 29.5464725643897,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 46.34146341463415,
|
| 72 |
+
"anls": 37.17815890071988,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 50.0,
|
| 77 |
+
"anls": 37.64410653945538,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 38.0,
|
| 82 |
+
"anls": 26.401353874883288,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 68.75,
|
| 87 |
+
"anls": 48.707026404394824,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Reference": {
|
| 91 |
+
"semantic": 40.38461538461539,
|
| 92 |
+
"anls": 23.25877926421405,
|
| 93 |
+
"n": 52
|
| 94 |
+
},
|
| 95 |
+
"Reports": {
|
| 96 |
+
"semantic": 40.666666666666664,
|
| 97 |
+
"anls": 25.79399206429042,
|
| 98 |
+
"n": 75
|
| 99 |
+
},
|
| 100 |
+
"Technical": {
|
| 101 |
+
"semantic": 36.95652173913043,
|
| 102 |
+
"anls": 24.436392914653783,
|
| 103 |
+
"n": 23
|
| 104 |
+
}
|
| 105 |
+
},
|
| 106 |
+
"n_evaluated": 499,
|
| 107 |
+
"n_unmatched": 1767
|
| 108 |
+
},
|
| 109 |
+
"reevaluated_date": "2026-01-15T20:30:38.431851+00:00",
|
| 110 |
+
"source_predictions_file": "Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_predictions_20260110_132227.jsonl",
|
| 111 |
+
"result_file_path": "Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_results_20260110_132227.json"
|
| 112 |
+
}
|
eval/reevaluated_results/Z.AI/GLM-4.6V_with_BM25_Search_Tool_results_20260110_131826.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "GLM-4.6V with BM25 Search Tool",
|
| 3 |
+
"organization": "Z.AI",
|
| 4 |
+
"description": "Max 10 iterations, up to 5 result pages consumed as images.",
|
| 5 |
+
"link": "",
|
| 6 |
+
"tags": [
|
| 7 |
+
"Agentic",
|
| 8 |
+
"Sparse Search Tool",
|
| 9 |
+
"Vision and Language"
|
| 10 |
+
],
|
| 11 |
+
"submitted_by": "Borchmann",
|
| 12 |
+
"metadata": {
|
| 13 |
+
"model_type": "open-weight"
|
| 14 |
+
},
|
| 15 |
+
"submission_date": "2026-01-10T13:18:26.686587+00:00",
|
| 16 |
+
"results": {
|
| 17 |
+
"overall": {
|
| 18 |
+
"semantic": 64.92576990716128,
|
| 19 |
+
"anls": 59.661893537203156,
|
| 20 |
+
"page_f1": 66.02347552247352,
|
| 21 |
+
"doc_f1": 86.7908978129419,
|
| 22 |
+
"kuiper": 49.83064516129022
|
| 23 |
+
},
|
| 24 |
+
"single_evidence": {
|
| 25 |
+
"semantic": 63.62725450901804,
|
| 26 |
+
"anls": 59.661893537203156,
|
| 27 |
+
"n": 499
|
| 28 |
+
},
|
| 29 |
+
"multi_evidence_same_doc": {
|
| 30 |
+
"semantic": 0,
|
| 31 |
+
"anls": 0,
|
| 32 |
+
"n": 0
|
| 33 |
+
},
|
| 34 |
+
"multi_evidence_multi_doc": {
|
| 35 |
+
"semantic": 0,
|
| 36 |
+
"anls": 0,
|
| 37 |
+
"n": 0
|
| 38 |
+
},
|
| 39 |
+
"by_domain": {
|
| 40 |
+
"Cases/Logs": {
|
| 41 |
+
"semantic": 73.33333333333333,
|
| 42 |
+
"anls": 62.16524216524218,
|
| 43 |
+
"n": 15
|
| 44 |
+
},
|
| 45 |
+
"Education": {
|
| 46 |
+
"semantic": 61.36363636363637,
|
| 47 |
+
"anls": 54.25829440651575,
|
| 48 |
+
"n": 22
|
| 49 |
+
},
|
| 50 |
+
"Events": {
|
| 51 |
+
"semantic": 77.08333333333334,
|
| 52 |
+
"anls": 67.87290397408577,
|
| 53 |
+
"n": 24
|
| 54 |
+
},
|
| 55 |
+
"Financial": {
|
| 56 |
+
"semantic": 51.63043478260869,
|
| 57 |
+
"anls": 51.19993983845437,
|
| 58 |
+
"n": 92
|
| 59 |
+
},
|
| 60 |
+
"Financial/Tax": {
|
| 61 |
+
"semantic": 68.75,
|
| 62 |
+
"anls": 62.5648667601683,
|
| 63 |
+
"n": 16
|
| 64 |
+
},
|
| 65 |
+
"Government/Regulatory": {
|
| 66 |
+
"semantic": 73.40425531914893,
|
| 67 |
+
"anls": 70.93589720557641,
|
| 68 |
+
"n": 47
|
| 69 |
+
},
|
| 70 |
+
"HR/Employment": {
|
| 71 |
+
"semantic": 63.41463414634146,
|
| 72 |
+
"anls": 59.735891761304075,
|
| 73 |
+
"n": 41
|
| 74 |
+
},
|
| 75 |
+
"Legal": {
|
| 76 |
+
"semantic": 67.44186046511628,
|
| 77 |
+
"anls": 55.536175710594314,
|
| 78 |
+
"n": 43
|
| 79 |
+
},
|
| 80 |
+
"Media/Publishing": {
|
| 81 |
+
"semantic": 68.0,
|
| 82 |
+
"anls": 69.11970073982938,
|
| 83 |
+
"n": 25
|
| 84 |
+
},
|
| 85 |
+
"Misc": {
|
| 86 |
+
"semantic": 72.91666666666666,
|
| 87 |
+
"anls": 75.10160446706249,
|
| 88 |
+
"n": 24
|
| 89 |
+
},
|
| 90 |
+
"Reference": {
|
| 91 |
+
"semantic": 59.61538461538461,
|
| 92 |
+
"anls": 60.632124141167864,
|
| 93 |
+
"n": 52
|
| 94 |
+
},
|
| 95 |
+
"Reports": {
|
| 96 |
+
"semantic": 63.33333333333333,
|
| 97 |
+
"anls": 56.89167319856098,
|
| 98 |
+
"n": 75
|
| 99 |
+
},
|
| 100 |
+
"Technical": {
|
| 101 |
+
"semantic": 58.69565217391305,
|
| 102 |
+
"anls": 51.450020851943364,
|
| 103 |
+
"n": 23
|
| 104 |
+
}
|
| 105 |
+
},
|
| 106 |
+
"n_evaluated": 499,
|
| 107 |
+
"n_unmatched": 1767
|
| 108 |
+
},
|
| 109 |
+
"reevaluated_date": "2026-01-15T20:31:43.022276+00:00",
|
| 110 |
+
"source_predictions_file": "Z.AI/GLM-4.6V_with_BM25_Search_Tool_predictions_20260110_131826.jsonl",
|
| 111 |
+
"result_file_path": "Z.AI/GLM-4.6V_with_BM25_Search_Tool_results_20260110_131826.json"
|
| 112 |
+
}
|