Borchmann committed on
Commit
4829aac
·
verified ·
1 Parent(s): 59e29ca

Upload folder using huggingface_hub

Browse files
Files changed (34) hide show
  1. app.py +273 -83
  2. eval/batch_reevaluate.py +434 -0
  3. eval/evaluate.py +93 -27
  4. eval/metrics.py +500 -1
  5. eval/reevaluate_submissions.py +254 -0
  6. eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_results_20260110_131629.json +112 -0
  7. eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_results_20260110_132054.json +112 -0
  8. eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_results_20260110_132358.json +112 -0
  9. eval/reevaluated_results/Anthropic/Claude_Haiku_4.5_(2025-10-01)_results_20260109_130319.json +117 -0
  10. eval/reevaluated_results/Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_002125.json +115 -0
  11. eval/reevaluated_results/Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_125816.json +117 -0
  12. eval/reevaluated_results/Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_003320.json +115 -0
  13. eval/reevaluated_results/Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_182559.json +117 -0
  14. eval/reevaluated_results/Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_005202.json +115 -0
  15. eval/reevaluated_results/Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_183030.json +117 -0
  16. eval/reevaluated_results/Google/Gemini_3_Pro_(Preview)_with_BM25_Search_Tool_results_20260109_002711.json +110 -0
  17. eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_234108.json +115 -0
  18. eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_235325.json +115 -0
  19. eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_235724.json +117 -0
  20. eval/reevaluated_results/OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153221.json +117 -0
  21. eval/reevaluated_results/OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153812.json +117 -0
  22. eval/reevaluated_results/OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_results_20260109_151912.json +117 -0
  23. eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152104.json +117 -0
  24. eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_File_Search_results_20260104_140537.json +116 -0
  25. eval/reevaluated_results/OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152650.json +117 -0
  26. eval/reevaluated_results/OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152828.json +117 -0
  27. eval/reevaluated_results/OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153516.json +117 -0
  28. eval/reevaluated_results/OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_results_20260109_185347.json +117 -0
  29. eval/reevaluated_results/OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260107_113714.json +115 -0
  30. eval/reevaluated_results/OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_154427.json +117 -0
  31. eval/reevaluated_results/OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_175639.json +117 -0
  32. eval/reevaluated_results/OpenAI_-_KAIST/GPT-5.2_(2025-12-11)_with_HEAVEN_Retrieval_results_20260107_153009.json +115 -0
  33. eval/reevaluated_results/Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_results_20260110_132227.json +112 -0
  34. eval/reevaluated_results/Z.AI/GLM-4.6V_with_BM25_Search_Tool_results_20260110_131826.json +112 -0
app.py CHANGED
@@ -22,10 +22,15 @@ import os
22
  import secrets
23
  import shutil
24
  import sys
 
 
25
  from datetime import datetime, timezone
26
  from pathlib import Path
27
  from urllib.parse import urlencode, quote, unquote
28
 
 
 
 
29
  import pandas as pd
30
  import plotly.graph_objects as go
31
  import requests
@@ -35,7 +40,15 @@ from huggingface_hub import snapshot_download, HfApi, hf_hub_download
35
  # Add eval module to path
36
  sys.path.insert(0, str(Path(__file__).parent / "eval"))
37
  try:
38
- from metrics import anls_star, citation_f1, kuiper_statistic
 
 
 
 
 
 
 
 
39
  from datasets import load_dataset
40
  EVAL_AVAILABLE = True
41
  except ImportError:
@@ -916,10 +929,17 @@ def get_model_type_html(model_type: str) -> str:
916
  return f'<span style="color: {color}; font-weight: 500;">{fallback_emoji} {model_type}</span>'
917
 
918
 
 
 
 
 
 
 
 
919
  @st.cache_data(ttl=300) # Cache for 5 minutes
920
  def load_eval_results() -> pd.DataFrame:
921
- """Load evaluation results from JSON files."""
922
- results = []
923
 
924
  results_path = Path(EVAL_RESULTS_PATH)
925
  if not results_path.exists():
@@ -945,36 +965,76 @@ def load_eval_results() -> pd.DataFrame:
945
  # Get per-domain scores if available
946
  by_domain = result_scores.get("by_domain", {})
947
 
948
- results.append({
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
949
  "Model": model_name,
950
  "Organization": data.get("organization", data.get("submitted_by", org_dir.name)),
951
  "Model Type": metadata.get("model_type", "unknown"),
952
  "Tags": tags, # Store as list
953
- # Answer correctness metrics (ANLS*)
954
- "Accuracy (ANLS*)": result_scores.get("overall", {}).get("anls", 0.0),
955
- "Acc. Single-Hop": result_scores.get("single_evidence", {}).get("anls", 0.0),
956
- "Acc. Cross-Page": result_scores.get("multi_evidence_same_doc", {}).get("anls", 0.0),
957
- "Acc. Cross-Doc": result_scores.get("multi_evidence_multi_doc", {}).get("anls", 0.0),
 
 
 
 
958
  # Attribution metrics
959
- "Attribution (Page F1)": result_scores.get("overall", {}).get("page_f1", 0.0),
960
- "Attribution (Doc F1)": result_scores.get("overall", {}).get("doc_f1", 0.0),
961
  # Calibration metric
962
- "Effort (Kuiper)": result_scores.get("overall", {}).get("kuiper", 0.0),
963
  "Submission Date": data.get("submission_date", ""),
964
  "Link": data.get("link", ""),
965
  "Description": data.get("description", metadata.get("description", "")) or
966
  generate_placeholder_description(model_name, tags, metadata.get("model_type", "")),
967
  # Per-domain scores (stored as JSON string for DataFrame compatibility)
968
  "_by_domain": json.dumps(by_domain) if by_domain else "{}",
969
- })
 
 
 
 
 
 
 
 
970
  except Exception as e:
971
  st.warning(f"Error loading {result_file}: {e}")
972
 
973
- if not results:
974
  return pd.DataFrame()
975
 
 
 
 
976
  df = pd.DataFrame(results)
977
- df = df.sort_values("Accuracy (ANLS*)", ascending=False).reset_index(drop=True)
978
  return df
979
 
980
 
@@ -1045,7 +1105,8 @@ def format_model_type(model_type: str) -> str:
1045
 
1046
  # Metric tooltips for table headers
1047
  METRIC_TOOLTIPS = {
1048
- "Accuracy (ANLS*)": "Overall answer accuracy using ANLS* (Average Normalized Levenshtein Similarity). Higher is better.",
 
1049
  "Acc. Single-Hop": "Accuracy on questions requiring evidence from a single page.",
1050
  "Acc. Cross-Page": "Accuracy on multi-hop questions requiring evidence from multiple pages within the same document.",
1051
  "Acc. Cross-Doc": "Accuracy on multi-hop questions requiring evidence from multiple documents.",
@@ -1130,12 +1191,26 @@ def render_leaderboard_table(df: pd.DataFrame, columns: list, show_analyze_colum
1130
  # Render tags as badges
1131
  cell_html = render_tags_html(value)
1132
  cells.append(f'<td>{cell_html}</td>')
1133
- elif col == "Accuracy (ANLS*)" or col.startswith("Acc."):
1134
- # Format accuracy scores (ANLS*, scale 0-100)
1135
  try:
1136
- cell_html = f"{float(value):.1f}" if value else "0"
1137
  except (ValueError, TypeError):
1138
- cell_html = str(value)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1139
  cells.append(f'<td style="text-align: center;">{cell_html}</td>')
1140
  elif col.startswith("Attribution"):
1141
  # Format F1 scores (scale 0-100)
@@ -1274,7 +1349,7 @@ def create_accuracy_vs_attribution_plot(df: pd.DataFrame) -> go.Figure:
1274
  df_type = df[df["Model Type"] == model_type]
1275
  fig.add_trace(go.Scatter(
1276
  x=df_type["Attribution (Page F1)"],
1277
- y=df_type["Accuracy (ANLS*)"],
1278
  mode="markers",
1279
  name=model_type,
1280
  text=df_type["Model"],
@@ -1289,7 +1364,7 @@ def create_accuracy_vs_attribution_plot(df: pd.DataFrame) -> go.Figure:
1289
  fig.update_layout(
1290
  title=dict(text="Accuracy vs Attribution", font=dict(color="white")),
1291
  xaxis_title="Attribution (Page F1)",
1292
- yaxis_title="Accuracy (ANLS*)",
1293
  hovermode="closest",
1294
  template="plotly_dark",
1295
  height=650,
@@ -1335,7 +1410,7 @@ def create_accuracy_vs_effort_plot(df: pd.DataFrame) -> go.Figure:
1335
  df_type = df_filtered[df_filtered["Model Type"] == model_type]
1336
  fig.add_trace(go.Scatter(
1337
  x=df_type["Effort (Kuiper)"],
1338
- y=df_type["Accuracy (ANLS*)"],
1339
  mode="markers",
1340
  name=model_type,
1341
  text=df_type["Model"],
@@ -1350,7 +1425,7 @@ def create_accuracy_vs_effort_plot(df: pd.DataFrame) -> go.Figure:
1350
  fig.update_layout(
1351
  title=dict(text="Accuracy vs Effort", font=dict(color="white")),
1352
  xaxis_title="Effort (Kuiper) — lower is better",
1353
- yaxis_title="Accuracy (ANLS*)",
1354
  hovermode="closest",
1355
  template="plotly_dark",
1356
  height=650,
@@ -1460,7 +1535,7 @@ def show_model_details(model_name: str):
1460
  # Display main metrics
1461
  col1, col2, col3 = st.columns(3)
1462
  with col1:
1463
- st.metric("Overall Accuracy", f"{model_data['Accuracy (ANLS*)']:.1f}%")
1464
  with col2:
1465
  st.metric("Attribution (Page F1)", f"{model_data['Attribution (Page F1)']:.1f}%")
1466
  with col3:
@@ -1495,7 +1570,7 @@ def show_model_details(model_name: str):
1495
 
1496
  if by_domain:
1497
  # Show per-domain chart (use overall accuracy as threshold for coloring)
1498
- overall_accuracy = model_data.get('Accuracy (ANLS*)', 0)
1499
  fig = create_domain_accuracy_chart(by_domain, model_name, overall_accuracy)
1500
  st.plotly_chart(fig, width="stretch")
1501
  else:
@@ -1620,12 +1695,73 @@ def load_gold_standard(dataset_name: str = "agentic-document-ai/dataset-PRIVATE"
1620
  return {}, {}
1621
 
1622
 
1623
- def evaluate_predictions(predictions: list, gold_by_text: dict, gold_by_id: dict) -> dict:
1624
- """Evaluate predictions against gold standard."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1625
  if not EVAL_AVAILABLE:
1626
  return {"error": "Evaluation module not available"}
1627
 
1628
- evals = []
 
1629
  unmatched = []
1630
 
1631
  for pred in predictions:
@@ -1633,45 +1769,54 @@ def evaluate_predictions(predictions: list, gold_by_text: dict, gold_by_id: dict
1633
  qid = pred.get('id', '')
1634
 
1635
  # Match to gold
 
1636
  if question in gold_by_text:
1637
  gold_data = gold_by_text[question]
1638
  elif qid and qid in gold_by_id:
1639
  gold_data = gold_by_id[qid]
 
 
 
1640
  else:
1641
  unmatched.append(question[:50] + "..." if len(question) > 50 else question)
1642
- continue
1643
-
1644
- # Get prediction data
1645
- answer = pred.get('answer', '')
1646
- citations = pred.get('citations', [])
1647
- search_history = pred.get('search_history', [])
1648
- steps = len(search_history) if search_history else pred.get('iterations', 0)
1649
-
1650
- # Calculate metrics
1651
- anls = anls_star(answer, gold_data['answers'])
1652
- correct = anls >= 0.5
1653
- doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
1654
- page_f1 = citation_f1(citations, gold_data['evidence'], level='page')
1655
-
1656
- evals.append({
1657
- 'question': question,
1658
- 'anls': anls,
1659
- 'correct': correct,
1660
- 'doc_f1': doc_f1['f1'],
1661
- 'page_f1': page_f1['f1'],
1662
- 'steps': steps,
1663
- 'hop_type': gold_data.get('hop_type', 'single'),
1664
- 'category': gold_data['category'],
1665
- 'domain': gold_data['domain']
1666
- })
1667
-
1668
- if not evals:
1669
  return {"error": "No predictions matched the gold standard"}
1670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1671
  # Aggregate overall metrics
1672
  n = len(evals)
1673
- accuracy = sum(e['correct'] for e in evals) / n * 100 # Scale to 0-100
 
 
 
 
 
 
 
 
 
 
1674
  mean_anls = sum(e['anls'] for e in evals) / n * 100
 
1675
  mean_doc_f1 = sum(e['doc_f1'] for e in evals) / n * 100
1676
  mean_page_f1 = sum(e['page_f1'] for e in evals) / n * 100
1677
 
@@ -1684,7 +1829,6 @@ def evaluate_predictions(predictions: list, gold_by_text: dict, gold_by_id: dict
1684
  cross_doc = [e for e in evals if e['hop_type'] == 'cross_doc']
1685
 
1686
  # By domain
1687
- from collections import defaultdict
1688
  by_domain = defaultdict(list)
1689
  for e in evals:
1690
  domain = e['domain'] or 'Other'
@@ -1693,6 +1837,7 @@ def evaluate_predictions(predictions: list, gold_by_text: dict, gold_by_id: dict
1693
  domain_scores = {}
1694
  for domain, domain_evals in sorted(by_domain.items()):
1695
  domain_scores[domain] = {
 
1696
  'anls': sum(e['anls'] for e in domain_evals) / len(domain_evals) * 100,
1697
  'n': len(domain_evals)
1698
  }
@@ -1700,27 +1845,33 @@ def evaluate_predictions(predictions: list, gold_by_text: dict, gold_by_id: dict
1700
  results = {
1701
  'n_evaluated': n,
1702
  'n_unmatched': len(unmatched),
1703
- 'unmatched_samples': unmatched[:5], # Show first 5
1704
  'overall': {
1705
- 'anls': mean_anls,
 
 
1706
  'accuracy': accuracy,
1707
  'doc_f1': mean_doc_f1,
1708
  'page_f1': mean_page_f1,
1709
  'kuiper': kuiper['kuiper_stat'] if not kuiper.get('degenerate') else None,
1710
  },
1711
  'single_evidence': {
 
1712
  'anls': sum(e['anls'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
1713
  'n': len(single_hop)
1714
  },
1715
  'multi_evidence_same_doc': {
 
1716
  'anls': sum(e['anls'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
1717
  'n': len(cross_page)
1718
  },
1719
  'multi_evidence_multi_doc': {
 
1720
  'anls': sum(e['anls'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
1721
  'n': len(cross_doc)
1722
  },
1723
- 'by_domain': domain_scores
 
1724
  }
1725
 
1726
  return results
@@ -1822,14 +1973,30 @@ def submit_results_fragment():
1822
 
1823
  # Evaluate button
1824
  if st.button("Run Evaluation", type="primary"):
1825
- with st.spinner("Loading gold standard and evaluating..."):
1826
  gold_by_text, gold_by_id = load_gold_standard()
 
 
 
 
 
 
 
1827
 
1828
- if not gold_by_text:
1829
- st.error("Failed to load gold standard dataset")
1830
- else:
1831
- results = evaluate_predictions(predictions, gold_by_text, gold_by_id)
1832
- st.session_state.eval_results = results
 
 
 
 
 
 
 
 
 
1833
 
1834
  # Show evaluation results
1835
  if st.session_state.eval_results:
@@ -1840,10 +2007,15 @@ def submit_results_fragment():
1840
  else:
1841
  st.markdown("#### Evaluation Results")
1842
 
1843
- # Summary metrics
1844
  col1, col2, col3, col4 = st.columns(4)
1845
  with col1:
1846
- st.metric("Accuracy (ANLS*)", f"{results['overall']['anls']:.1f}")
 
 
 
 
 
1847
  with col2:
1848
  st.metric("Attribution (Page F1)", f"{results['overall']['page_f1']:.1f}")
1849
  with col3:
@@ -1854,16 +2026,32 @@ def submit_results_fragment():
1854
 
1855
  # Detailed breakdown
1856
  with st.expander("Detailed Breakdown"):
1857
- st.markdown(f"""
1858
- | Metric | Value |
1859
- |--------|-------|
1860
- | **Overall ANLS*** | {results['overall']['anls']:.1f} |
1861
- | **Acc. Single-Hop** (n={results['single_evidence']['n']}) | {results['single_evidence']['anls']:.1f} |
1862
- | **Acc. Cross-Page** (n={results['multi_evidence_same_doc']['n']}) | {results['multi_evidence_same_doc']['anls']:.1f} |
1863
- | **Acc. Cross-Doc** (n={results['multi_evidence_multi_doc']['n']}) | {results['multi_evidence_multi_doc']['anls']:.1f} |
1864
- | **Attribution (Doc F1)** | {results['overall']['doc_f1']:.1f} |
1865
- | **Attribution (Page F1)** | {results['overall']['page_f1']:.1f} |
1866
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1867
 
1868
  if results['n_unmatched'] > 0:
1869
  with st.expander(f"{results['n_unmatched']} unmatched questions"):
@@ -2333,10 +2521,11 @@ def main():
2333
  # COLUMN SELECTOR - chips use SNOWFLAKE_BLUE (lighter, gradient end)
2334
  # Mapping: short chip name -> full column name
2335
  COLUMN_CHIP_NAMES = {
2336
- "Accuracy": "Accuracy (ANLS*)",
2337
  "Acc. Single-Hop": "Acc. Single-Hop",
2338
  "Acc. Cross-Page": "Acc. Cross-Page",
2339
  "Acc. Cross-Doc": "Acc. Cross-Doc",
 
2340
  "Attribution": "Attribution (Page F1)",
2341
  "Attribution (Doc)": "Attribution (Doc F1)",
2342
  "Effort": "Effort (Kuiper)",
@@ -2351,7 +2540,7 @@ def main():
2351
  # Model and Organization are always visible (not in selector)
2352
  always_visible = ["Model", "Organization"]
2353
  # Hidden columns (used internally but not shown as separate columns)
2354
- hidden_cols = ["Link", "Submission Date", "Description", "_by_domain"]
2355
  # Full column names that are optional (Tags moved to end)
2356
  optional_full_cols = [c for c in all_columns if c not in hidden_cols + always_visible and c != "Tags"]
2357
  optional_full_cols.append("Tags") # Add Tags at the end
@@ -2524,8 +2713,9 @@ The task is characterized by five formal properties:
2524
  st.markdown("""
2525
  #### Metrics
2526
 
2527
- ##### Accuracy (ANLS*)
2528
- - **Accuracy (ANLS*)**: Main score using Average Normalized Levenshtein Similarity with optimal element alignment for lists/sets
 
2529
  - **Acc. Single-Hop**: Accuracy on questions requiring a single evidence page
2530
  - **Acc. Cross-Page**: Accuracy on multi-hop questions within the same document
2531
  - **Acc. Cross-Doc**: Accuracy on multi-hop questions spanning multiple documents
 
22
  import secrets
23
  import shutil
24
  import sys
25
+ from collections import defaultdict
26
+ from concurrent.futures import ThreadPoolExecutor, as_completed
27
  from datetime import datetime, timezone
28
  from pathlib import Path
29
  from urllib.parse import urlencode, quote, unquote
30
 
31
+ # Parallelization config for LLM evaluation
32
+ MAX_EVAL_WORKERS = 24
33
+
34
  import pandas as pd
35
  import plotly.graph_objects as go
36
  import requests
 
40
  # Add eval module to path
41
  sys.path.insert(0, str(Path(__file__).parent / "eval"))
42
  try:
43
+ from metrics import (
44
+ anls_star,
45
+ anls_star_llm,
46
+ aggregate_anls_star_llm,
47
+ standard_error,
48
+ confidence_interval,
49
+ citation_f1,
50
+ kuiper_statistic
51
+ )
52
  from datasets import load_dataset
53
  EVAL_AVAILABLE = True
54
  except ImportError:
 
929
  return f'<span style="color: {color}; font-weight: 500;">{fallback_emoji} {model_type}</span>'
930
 
931
 
932
+ def _extract_timestamp_from_filename(filename: str) -> str:
933
+ """Extract timestamp from filename like 'Model_results_20260109_152104.json'."""
934
+ import re
935
+ match = re.search(r'_(\d{8}_\d{6})\.json$', filename)
936
+ return match.group(1) if match else "00000000_000000"
937
+
938
+
939
  @st.cache_data(ttl=300) # Cache for 5 minutes
940
  def load_eval_results() -> pd.DataFrame:
941
+ """Load evaluation results from JSON files, keeping only the most recent per model."""
942
+ seen_models = {} # Track: model_name -> (timestamp, result_dict, filepath)
943
 
944
  results_path = Path(EVAL_RESULTS_PATH)
945
  if not results_path.exists():
 
965
  # Get per-domain scores if available
966
  by_domain = result_scores.get("by_domain", {})
967
 
968
+ # Use semantic accuracy if available, otherwise fall back to ANLS*
969
+ overall = result_scores.get("overall", {})
970
+ single_ev = result_scores.get("single_evidence", {})
971
+ multi_page = result_scores.get("multi_evidence_same_doc", {})
972
+ multi_doc = result_scores.get("multi_evidence_multi_doc", {})
973
+
974
+ # Primary metric: semantic (ANLS* + LLM) if available, otherwise ANLS*
975
+ semantic_acc = overall.get("semantic", overall.get("anls", 0.0))
976
+ semantic_ci = overall.get("semantic_ci") # 95% CI tuple
977
+
978
+ # Calculate CI on-the-fly using bias correction method if not stored
979
+ if not semantic_ci and semantic_acc > 0:
980
+ try:
981
+ from metrics import confidence_interval, standard_error
982
+ n = result_scores.get("single_evidence", {}).get("n", 500)
983
+ p = semantic_acc / 100.0 # Convert to proportion
984
+ ci = confidence_interval(p, n) # Uses calibrated q0, q1, m0, m1
985
+ semantic_ci = (ci[0] * 100, ci[1] * 100)
986
+ semantic_se = standard_error(p, n) * 100 # SE in percentage points
987
+ except Exception:
988
+ semantic_ci = None
989
+ semantic_se = None
990
+
991
+ anls_acc = overall.get("anls", 0.0)
992
+
993
+ result_dict = {
994
  "Model": model_name,
995
  "Organization": data.get("organization", data.get("submitted_by", org_dir.name)),
996
  "Model Type": metadata.get("model_type", "unknown"),
997
  "Tags": tags, # Store as list
998
+ # Primary: Accuracy with LLM judge (ANLS* + LLM with bias correction)
999
+ "Accuracy (LLM judge)": semantic_acc,
1000
+ "_Accuracy_SE": semantic_se, # Hidden: for ±SE display
1001
+ "_Accuracy_CI": semantic_ci, # Hidden: for tooltip display
1002
+ "Acc. Single-Hop": single_ev.get("semantic", single_ev.get("anls", 0.0)),
1003
+ "Acc. Cross-Page": multi_page.get("semantic", multi_page.get("anls", 0.0)),
1004
+ "Acc. Cross-Doc": multi_doc.get("semantic", multi_doc.get("anls", 0.0)),
1005
+ # Secondary: Pure string-based ANLS* (hidden by default)
1006
+ "ANLS* (string)": anls_acc,
1007
  # Attribution metrics
1008
+ "Attribution (Page F1)": overall.get("page_f1", 0.0),
1009
+ "Attribution (Doc F1)": overall.get("doc_f1", 0.0),
1010
  # Calibration metric
1011
+ "Effort (Kuiper)": overall.get("kuiper", 0.0),
1012
  "Submission Date": data.get("submission_date", ""),
1013
  "Link": data.get("link", ""),
1014
  "Description": data.get("description", metadata.get("description", "")) or
1015
  generate_placeholder_description(model_name, tags, metadata.get("model_type", "")),
1016
  # Per-domain scores (stored as JSON string for DataFrame compatibility)
1017
  "_by_domain": json.dumps(by_domain) if by_domain else "{}",
1018
+ }
1019
+
1020
+ # Extract timestamp from filename
1021
+ file_timestamp = _extract_timestamp_from_filename(result_file.name)
1022
+
1023
+ # Keep only the most recent result per model
1024
+ if model_name not in seen_models or file_timestamp > seen_models[model_name][0]:
1025
+ seen_models[model_name] = (file_timestamp, result_dict)
1026
+
1027
  except Exception as e:
1028
  st.warning(f"Error loading {result_file}: {e}")
1029
 
1030
+ if not seen_models:
1031
  return pd.DataFrame()
1032
 
1033
+ # Build results list from deduplicated models
1034
+ results = [result_dict for _, result_dict in seen_models.values()]
1035
+
1036
  df = pd.DataFrame(results)
1037
+ df = df.sort_values("Accuracy (LLM judge)", ascending=False).reset_index(drop=True)
1038
  return df
1039
 
1040
 
 
1105
 
1106
  # Metric tooltips for table headers
1107
  METRIC_TOOLTIPS = {
1108
+ "Accuracy (LLM judge)": "Answer accuracy using ANLS* + LLM judge with bias correction. Captures semantic correctness beyond string matching. Higher is better.",
1109
+ "ANLS* (string)": "String-based accuracy using ANLS* (Average Normalized Levenshtein Similarity). Stricter than semantic. Higher is better.",
1110
  "Acc. Single-Hop": "Accuracy on questions requiring evidence from a single page.",
1111
  "Acc. Cross-Page": "Accuracy on multi-hop questions requiring evidence from multiple pages within the same document.",
1112
  "Acc. Cross-Doc": "Accuracy on multi-hop questions requiring evidence from multiple documents.",
 
1191
  # Render tags as badges
1192
  cell_html = render_tags_html(value)
1193
  cells.append(f'<td>{cell_html}</td>')
1194
+ elif col == "Accuracy (LLM judge)" or col == "ANLS* (string)" or col.startswith("Acc."):
1195
+ # Format accuracy scores (scale 0-100)
1196
  try:
1197
+ acc_val = f"{float(value):.1f}" if value else "0"
1198
  except (ValueError, TypeError):
1199
+ acc_val = str(value)
1200
+
1201
+ # Add ±SE for main accuracy column
1202
+ if col == "Accuracy (LLM judge)":
1203
+ se = row.get("_Accuracy_SE")
1204
+ ci = row.get("_Accuracy_CI")
1205
+ if se is not None and se > 0:
1206
+ # Show ±SE with 95% CI as tooltip
1207
+ ci_tooltip = f"95% CI: [{ci[0]:.1f}, {ci[1]:.1f}]" if ci else ""
1208
+ se_text = f'<span style="font-size: 0.85em; color: #888;" title="{ci_tooltip}"> ± {se:.1f}</span>'
1209
+ cell_html = f'{acc_val}{se_text}'
1210
+ else:
1211
+ cell_html = acc_val
1212
+ else:
1213
+ cell_html = acc_val
1214
  cells.append(f'<td style="text-align: center;">{cell_html}</td>')
1215
  elif col.startswith("Attribution"):
1216
  # Format F1 scores (scale 0-100)
 
1349
  df_type = df[df["Model Type"] == model_type]
1350
  fig.add_trace(go.Scatter(
1351
  x=df_type["Attribution (Page F1)"],
1352
+ y=df_type["Accuracy (LLM judge)"],
1353
  mode="markers",
1354
  name=model_type,
1355
  text=df_type["Model"],
 
1364
  fig.update_layout(
1365
  title=dict(text="Accuracy vs Attribution", font=dict(color="white")),
1366
  xaxis_title="Attribution (Page F1)",
1367
+ yaxis_title="Accuracy (LLM judge)",
1368
  hovermode="closest",
1369
  template="plotly_dark",
1370
  height=650,
 
1410
  df_type = df_filtered[df_filtered["Model Type"] == model_type]
1411
  fig.add_trace(go.Scatter(
1412
  x=df_type["Effort (Kuiper)"],
1413
+ y=df_type["Accuracy (LLM judge)"],
1414
  mode="markers",
1415
  name=model_type,
1416
  text=df_type["Model"],
 
1425
  fig.update_layout(
1426
  title=dict(text="Accuracy vs Effort", font=dict(color="white")),
1427
  xaxis_title="Effort (Kuiper) — lower is better",
1428
+ yaxis_title="Accuracy (LLM judge)",
1429
  hovermode="closest",
1430
  template="plotly_dark",
1431
  height=650,
 
1535
  # Display main metrics
1536
  col1, col2, col3 = st.columns(3)
1537
  with col1:
1538
+ st.metric("Accuracy (LLM judge)", f"{model_data['Accuracy (LLM judge)']:.1f}%")
1539
  with col2:
1540
  st.metric("Attribution (Page F1)", f"{model_data['Attribution (Page F1)']:.1f}%")
1541
  with col3:
 
1570
 
1571
  if by_domain:
1572
  # Show per-domain chart (use overall accuracy as threshold for coloring)
1573
+ overall_accuracy = model_data.get('Accuracy (LLM judge)', 0)
1574
  fig = create_domain_accuracy_chart(by_domain, model_name, overall_accuracy)
1575
  st.plotly_chart(fig, width="stretch")
1576
  else:
 
1695
  return {}, {}
1696
 
1697
 
1698
+ def _evaluate_single_item(args, max_retries=3):
1699
+ """Evaluate a single prediction item (for parallel processing)."""
1700
+ import time as _time
1701
+ idx, pred, gold_data, use_llm_judge = args
1702
+
1703
+ question = pred.get('question', '').strip()
1704
+ answer = pred.get('answer', '')
1705
+ citations = pred.get('citations', [])
1706
+ search_history = pred.get('search_history', [])
1707
+ steps = len(search_history) if search_history else pred.get('iterations', 0)
1708
+
1709
+ # Calculate non-LLM metrics first
1710
+ anls = anls_star(answer, gold_data['answers'])
1711
+ doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
1712
+ page_f1 = citation_f1(citations, gold_data['evidence'], level='page')
1713
+
1714
+ # Semantic accuracy with LLM judge (or just ANLS* if disabled)
1715
+ if use_llm_judge:
1716
+ for attempt in range(max_retries):
1717
+ try:
1718
+ llm_result = anls_star_llm(answer, gold_data['answers'], question)
1719
+ semantic_score = llm_result['score']
1720
+ break
1721
+ except Exception:
1722
+ if attempt < max_retries - 1:
1723
+ _time.sleep(2 ** attempt) # Exponential backoff
1724
+ else:
1725
+ raise
1726
+ else:
1727
+ semantic_score = anls
1728
+
1729
+ return {
1730
+ 'idx': idx,
1731
+ 'question': question,
1732
+ 'anls': anls,
1733
+ 'semantic_score': semantic_score,
1734
+ 'correct': semantic_score >= 0.5,
1735
+ 'doc_f1': doc_f1['f1'],
1736
+ 'page_f1': page_f1['f1'],
1737
+ 'steps': steps,
1738
+ 'hop_type': gold_data.get('hop_type', 'single'),
1739
+ 'category': gold_data['category'],
1740
+ 'domain': gold_data['domain']
1741
+ }
1742
+
1743
+
1744
+ def evaluate_predictions(
1745
+ predictions: list,
1746
+ gold_by_text: dict,
1747
+ gold_by_id: dict,
1748
+ use_llm_judge: bool = True,
1749
+ progress_callback=None
1750
+ ) -> dict:
1751
+ """Evaluate predictions against gold standard (parallelized when using LLM judge).
1752
+
1753
+ Args:
1754
+ predictions: List of prediction dicts
1755
+ gold_by_text: Gold data indexed by question text
1756
+ gold_by_id: Gold data indexed by question ID
1757
+ use_llm_judge: If True, use ANLS*+LLM for semantic accuracy (default)
1758
+ progress_callback: Optional callback(current, total) for progress updates
1759
+ """
1760
  if not EVAL_AVAILABLE:
1761
  return {"error": "Evaluation module not available"}
1762
 
1763
+ # First pass: match predictions to gold standard
1764
+ matched_items = []
1765
  unmatched = []
1766
 
1767
  for pred in predictions:
 
1769
  qid = pred.get('id', '')
1770
 
1771
  # Match to gold
1772
+ gold_data = None
1773
  if question in gold_by_text:
1774
  gold_data = gold_by_text[question]
1775
  elif qid and qid in gold_by_id:
1776
  gold_data = gold_by_id[qid]
1777
+
1778
+ if gold_data:
1779
+ matched_items.append((pred, gold_data, use_llm_judge))
1780
  else:
1781
  unmatched.append(question[:50] + "..." if len(question) > 50 else question)
1782
+
1783
+ if not matched_items:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1784
  return {"error": "No predictions matched the gold standard"}
1785
 
1786
+ # Prepare items with index
1787
+ items_with_idx = [(i, pred, gold, llm) for i, (pred, gold, llm) in enumerate(matched_items)]
1788
+
1789
+ total = len(items_with_idx)
1790
+ evals = []
1791
+ completed = 0
1792
+
1793
+ # Parallel evaluation with ThreadPoolExecutor (much faster for LLM calls)
1794
+ with ThreadPoolExecutor(max_workers=MAX_EVAL_WORKERS) as executor:
1795
+ futures = {executor.submit(_evaluate_single_item, item): item[0]
1796
+ for item in items_with_idx}
1797
+
1798
+ for future in as_completed(futures):
1799
+ result = future.result() # Will raise if failed after retries
1800
+ evals.append(result)
1801
+ completed += 1
1802
+ if progress_callback:
1803
+ progress_callback(completed, total)
1804
+
1805
  # Aggregate overall metrics
1806
  n = len(evals)
1807
+ semantic_scores = [e['semantic_score'] for e in evals]
1808
+
1809
+ # Apply bias correction for semantic accuracy
1810
+ if use_llm_judge:
1811
+ agg = aggregate_anls_star_llm(semantic_scores, apply_bias_correction=True)
1812
+ mean_semantic = agg['adjusted_score'] * 100
1813
+ semantic_ci = (agg['ci_lower'] * 100, agg['ci_upper'] * 100)
1814
+ else:
1815
+ mean_semantic = sum(semantic_scores) / n * 100
1816
+ semantic_ci = None
1817
+
1818
  mean_anls = sum(e['anls'] for e in evals) / n * 100
1819
+ accuracy = sum(e['correct'] for e in evals) / n * 100
1820
  mean_doc_f1 = sum(e['doc_f1'] for e in evals) / n * 100
1821
  mean_page_f1 = sum(e['page_f1'] for e in evals) / n * 100
1822
 
 
1829
  cross_doc = [e for e in evals if e['hop_type'] == 'cross_doc']
1830
 
1831
  # By domain
 
1832
  by_domain = defaultdict(list)
1833
  for e in evals:
1834
  domain = e['domain'] or 'Other'
 
1837
  domain_scores = {}
1838
  for domain, domain_evals in sorted(by_domain.items()):
1839
  domain_scores[domain] = {
1840
+ 'semantic': sum(e['semantic_score'] for e in domain_evals) / len(domain_evals) * 100,
1841
  'anls': sum(e['anls'] for e in domain_evals) / len(domain_evals) * 100,
1842
  'n': len(domain_evals)
1843
  }
 
1845
  results = {
1846
  'n_evaluated': n,
1847
  'n_unmatched': len(unmatched),
1848
+ 'unmatched_samples': unmatched[:5],
1849
  'overall': {
1850
+ 'semantic': mean_semantic, # Primary metric (ANLS* + LLM judge)
1851
+ 'semantic_ci': semantic_ci, # 95% CI if LLM judge used
1852
+ 'anls': mean_anls, # Secondary metric (pure ANLS*)
1853
  'accuracy': accuracy,
1854
  'doc_f1': mean_doc_f1,
1855
  'page_f1': mean_page_f1,
1856
  'kuiper': kuiper['kuiper_stat'] if not kuiper.get('degenerate') else None,
1857
  },
1858
  'single_evidence': {
1859
+ 'semantic': sum(e['semantic_score'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
1860
  'anls': sum(e['anls'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
1861
  'n': len(single_hop)
1862
  },
1863
  'multi_evidence_same_doc': {
1864
+ 'semantic': sum(e['semantic_score'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
1865
  'anls': sum(e['anls'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
1866
  'n': len(cross_page)
1867
  },
1868
  'multi_evidence_multi_doc': {
1869
+ 'semantic': sum(e['semantic_score'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
1870
  'anls': sum(e['anls'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
1871
  'n': len(cross_doc)
1872
  },
1873
+ 'by_domain': domain_scores,
1874
+ 'used_llm_judge': use_llm_judge
1875
  }
1876
 
1877
  return results
 
1973
 
1974
  # Evaluate button
1975
  if st.button("Run Evaluation", type="primary"):
1976
+ with st.spinner("Loading gold standard..."):
1977
  gold_by_text, gold_by_id = load_gold_standard()
1978
+
1979
+ if not gold_by_text:
1980
+ st.error("Failed to load gold standard dataset")
1981
+ else:
1982
+ # Progress bar for evaluation
1983
+ progress_bar = st.progress(0, text="Evaluating predictions with semantic accuracy...")
1984
+ status_text = st.empty()
1985
 
1986
+ def update_progress(current, total):
1987
+ progress_bar.progress(current / total, text=f"Evaluating {current}/{total}...")
1988
+
1989
+ results = evaluate_predictions(
1990
+ predictions,
1991
+ gold_by_text,
1992
+ gold_by_id,
1993
+ use_llm_judge=True,
1994
+ progress_callback=update_progress
1995
+ )
1996
+
1997
+ progress_bar.empty()
1998
+ status_text.empty()
1999
+ st.session_state.eval_results = results
2000
 
2001
  # Show evaluation results
2002
  if st.session_state.eval_results:
 
2007
  else:
2008
  st.markdown("#### Evaluation Results")
2009
 
2010
+ # Summary metrics - use semantic accuracy as primary if available
2011
  col1, col2, col3, col4 = st.columns(4)
2012
  with col1:
2013
+ if 'semantic' in results['overall']:
2014
+ ci = results['overall'].get('semantic_ci')
2015
+ ci_text = f" [{ci[0]:.1f}-{ci[1]:.1f}]" if ci else ""
2016
+ st.metric("Accuracy (LLM judge)", f"{results['overall']['semantic']:.1f}{ci_text}")
2017
+ else:
2018
+ st.metric("Accuracy (ANLS*)", f"{results['overall']['anls']:.1f}")
2019
  with col2:
2020
  st.metric("Attribution (Page F1)", f"{results['overall']['page_f1']:.1f}")
2021
  with col3:
 
2026
 
2027
  # Detailed breakdown
2028
  with st.expander("Detailed Breakdown"):
2029
+ # Check which metrics are available
2030
+ has_semantic = 'semantic' in results['overall']
2031
+
2032
+ if has_semantic:
2033
+ st.markdown(f"""
2034
+ | Metric | Value |
2035
+ |--------|-------|
2036
+ | **Accuracy (LLM judge)** | {results['overall']['semantic']:.1f} |
2037
+ | **ANLS*** (string match) | {results['overall']['anls']:.1f} |
2038
+ | **Acc. Single-Hop** (n={results['single_evidence']['n']}) | {results['single_evidence'].get('semantic', results['single_evidence']['anls']):.1f} |
2039
+ | **Acc. Cross-Page** (n={results['multi_evidence_same_doc']['n']}) | {results['multi_evidence_same_doc'].get('semantic', results['multi_evidence_same_doc']['anls']):.1f} |
2040
+ | **Acc. Cross-Doc** (n={results['multi_evidence_multi_doc']['n']}) | {results['multi_evidence_multi_doc'].get('semantic', results['multi_evidence_multi_doc']['anls']):.1f} |
2041
+ | **Attribution (Doc F1)** | {results['overall']['doc_f1']:.1f} |
2042
+ | **Attribution (Page F1)** | {results['overall']['page_f1']:.1f} |
2043
+ """)
2044
+ else:
2045
+ st.markdown(f"""
2046
+ | Metric | Value |
2047
+ |--------|-------|
2048
+ | **Overall ANLS*** | {results['overall']['anls']:.1f} |
2049
+ | **Acc. Single-Hop** (n={results['single_evidence']['n']}) | {results['single_evidence']['anls']:.1f} |
2050
+ | **Acc. Cross-Page** (n={results['multi_evidence_same_doc']['n']}) | {results['multi_evidence_same_doc']['anls']:.1f} |
2051
+ | **Acc. Cross-Doc** (n={results['multi_evidence_multi_doc']['n']}) | {results['multi_evidence_multi_doc']['anls']:.1f} |
2052
+ | **Attribution (Doc F1)** | {results['overall']['doc_f1']:.1f} |
2053
+ | **Attribution (Page F1)** | {results['overall']['page_f1']:.1f} |
2054
+ """)
2055
 
2056
  if results['n_unmatched'] > 0:
2057
  with st.expander(f"{results['n_unmatched']} unmatched questions"):
 
2521
  # COLUMN SELECTOR - chips use SNOWFLAKE_BLUE (lighter, gradient end)
2522
  # Mapping: short chip name -> full column name
2523
  COLUMN_CHIP_NAMES = {
2524
+ "Accuracy": "Accuracy (LLM judge)",
2525
  "Acc. Single-Hop": "Acc. Single-Hop",
2526
  "Acc. Cross-Page": "Acc. Cross-Page",
2527
  "Acc. Cross-Doc": "Acc. Cross-Doc",
2528
+ "ANLS*": "ANLS* (string)",
2529
  "Attribution": "Attribution (Page F1)",
2530
  "Attribution (Doc)": "Attribution (Doc F1)",
2531
  "Effort": "Effort (Kuiper)",
 
2540
  # Model and Organization are always visible (not in selector)
2541
  always_visible = ["Model", "Organization"]
2542
  # Hidden columns (used internally but not shown as separate columns)
2543
+ hidden_cols = ["Link", "Submission Date", "Description", "_by_domain", "_Accuracy_CI", "_Accuracy_SE"]
2544
  # Full column names that are optional (Tags moved to end)
2545
  optional_full_cols = [c for c in all_columns if c not in hidden_cols + always_visible and c != "Tags"]
2546
  optional_full_cols.append("Tags") # Add Tags at the end
 
2713
  st.markdown("""
2714
  #### Metrics
2715
 
2716
+ ##### Accuracy (LLM judge)
2717
+ - **Accuracy (LLM judge)**: Primary metric combining ANLS* string matching with an LLM judge (G-Eval framework). Captures semantic correctness beyond exact string matching, with statistical bias correction
2718
+ - **ANLS* (string)**: Pure string-based score using Average Normalized Levenshtein Similarity with optimal element alignment for lists/sets
2719
  - **Acc. Single-Hop**: Accuracy on questions requiring a single evidence page
2720
  - **Acc. Cross-Page**: Accuracy on multi-hop questions within the same document
2721
  - **Acc. Cross-Doc**: Accuracy on multi-hop questions spanning multiple documents
eval/batch_reevaluate.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Batch re-evaluate all submissions with the new Semantic Accuracy metric.
4
+
5
+ This script downloads all prediction files from HuggingFace Hub and re-evaluates
6
+ them with the ANLS* + LLM judge metric.
7
+
8
+ Usage:
9
+ # Dry run - list files only
10
+ python batch_reevaluate.py --dry-run
11
+
12
+ # Re-evaluate all files
13
+ python batch_reevaluate.py
14
+
15
+ # Re-evaluate specific organization
16
+ python batch_reevaluate.py --org OpenAI
17
+
18
+ # Upload results after review
19
+ python batch_reevaluate.py --upload
20
+ """
21
+
22
+ import json
23
+ import os
24
+ import sys
25
+ import time
26
+ from collections import defaultdict
27
+ from concurrent.futures import ThreadPoolExecutor, as_completed
28
+ from datetime import datetime, timezone
29
+ from pathlib import Path
30
+
31
+ from huggingface_hub import HfApi, hf_hub_download, list_repo_files
32
+ from datasets import load_dataset
33
+
34
+ # Add parent for imports
35
+ sys.path.insert(0, str(Path(__file__).parent))
36
+ from metrics import (
37
+ anls_star,
38
+ anls_star_llm,
39
+ aggregate_anls_star_llm,
40
+ citation_f1,
41
+ kuiper_statistic
42
+ )
43
+
44
+ # Parallelization config
45
+ MAX_WORKERS = 24
46
+
47
+ # Config
48
+ RESULTS_REPO = "agentic-document-ai/backend-results"
49
+ TOKEN = os.environ.get("HF_TOKEN")
50
+ OUTPUT_DIR = Path(__file__).parent / "reevaluated_results"
51
+
52
+
53
def load_gold_data():
    """Load the gold standard test split from HuggingFace.

    Returns:
        Tuple (gold_by_id, gold_by_text): the same per-question record
        dicts, indexed by question ID and by stripped question text.
    """
    print("Loading gold standard...")
    test_split = load_dataset("agentic-document-ai/dataset-PRIVATE", split="test")

    gold_by_id = {}
    gold_by_text = {}

    for example in test_split:
        question_text = example['question'].strip()
        record = {
            'question': question_text,
            'answers': example.get('answer_variants', []),
            'evidence': example.get('evidence', []),
            'category': example.get('document_category', ''),
            'domain': example.get('domain', ''),
            'hop_type': example.get('hop_type', 'single'),
        }
        gold_by_id[example.get('id', '')] = record
        gold_by_text[question_text] = record

    return gold_by_id, gold_by_text
76
+
77
+
78
def find_prediction_files(org_filter: str = None):
    """List all prediction JSONL files in the results repo.

    Args:
        org_filter: If given, keep only files under that organization folder.
    """
    all_files = list_repo_files(RESULTS_REPO, repo_type="dataset", token=TOKEN)

    matches = []
    for path in all_files:
        if not path.endswith('.jsonl') or '_predictions' not in path:
            continue
        if org_filter and not path.startswith(org_filter + '/'):
            continue
        matches.append(path)
    return matches
87
+
88
+
89
def find_result_file(pred_file: str):
    """Map a predictions file path to its corresponding results JSON path.

    Pattern: {org}/{model}_predictions_{ts}.jsonl -> {org}/{model}_results_{ts}.json

    Returns:
        The results path, or None when the name lacks a '_predictions_' marker.
    """
    head, sep, tail = pred_file.rpartition('_predictions_')
    if not sep:
        return None
    return head + '_results_' + tail.replace('.jsonl', '.json')
97
+
98
+
99
def download_file(filepath: str) -> str:
    """Fetch one file from the results repo; returns the local cache path."""
    local_path = hf_hub_download(
        repo_id=RESULTS_REPO,
        repo_type="dataset",
        filename=filepath,
        token=TOKEN,
    )
    return local_path
107
+
108
+
109
def _evaluate_single_prediction(args, max_retries=3):
    """Score one (idx, prediction, gold) triple; safe to call from worker threads.

    String-based metrics are computed locally; the LLM-judge call is retried
    with exponential backoff and the last error is re-raised once retries
    are exhausted.
    """
    idx, pred, gold_data = args

    answer = pred.get('answer', '')
    question = pred.get('question', '').strip()
    citations = pred.get('citations', [])
    search_history = pred.get('search_history', [])
    steps = len(search_history) if search_history else pred.get('iterations', 0)

    # Cheap, deterministic metrics first — no retry needed.
    anls = anls_star(answer, gold_data['answers'])
    doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
    page_f1 = citation_f1(citations, gold_data['evidence'], level='page')

    # LLM judge with retry + exponential backoff on any failure.
    semantic_score = None
    for attempt in range(max_retries):
        try:
            semantic_score = anls_star_llm(answer, gold_data['answers'], question)['score']
            break
        except Exception as e:
            if attempt >= max_retries - 1:
                print(f"    Failed item {idx} after {max_retries} retries: {e}")
                raise
            print(f"    Item {idx} attempt {attempt+1} failed: {e}, retrying...")
            time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...

    return {
        'idx': idx,
        'anls': anls,
        'semantic_score': semantic_score,
        'correct': semantic_score >= 0.5,
        'doc_f1': doc_f1['f1'],
        'page_f1': page_f1['f1'],
        'steps': steps,
        'hop_type': gold_data.get('hop_type', 'single'),
        'category': gold_data['category'],
        'domain': gold_data['domain']
    }
150
+
151
+
152
def evaluate_with_semantic(predictions: list, gold_by_id: dict, gold_by_text: dict) -> dict:
    """Evaluate predictions with the semantic accuracy metric (parallelized).

    Matches each prediction to the gold set (by question text first, then by
    question ID), scores items in parallel via `_evaluate_single_prediction`,
    and aggregates overall / per-hop-type / per-domain metrics with bias
    correction for the LLM judge.

    Args:
        predictions: Prediction dicts with 'question', 'answer', 'citations'.
        gold_by_id: Gold records indexed by question ID.
        gold_by_text: Gold records indexed by stripped question text.

    Returns:
        Aggregated results dict, or None if nothing could be evaluated.
    """
    # BUGFIX: on Python < 3.11, concurrent.futures raises its own
    # TimeoutError which is NOT the builtin — `except TimeoutError` there
    # silently missed it, disabling both timeout handlers below. Use the
    # futures alias explicitly (on 3.11+ the two classes are the same).
    from concurrent.futures import TimeoutError as FuturesTimeout

    # First, filter predictions to only those in the test set.
    matched_predictions = []
    for pred in predictions:
        question = pred.get('question', '').strip()
        qid = pred.get('id', '')

        gold_data = gold_by_text.get(question)
        if gold_data is None and qid:
            gold_data = gold_by_id.get(qid)

        if gold_data:
            matched_predictions.append((pred, gold_data))

    unmatched = len(predictions) - len(matched_predictions)
    print(f"  Matched {len(matched_predictions)}/{len(predictions)} predictions to test set (skipping {unmatched})")

    total = len(matched_predictions)
    evals = []
    completed = 0

    # Prepare items with index for tracking.
    items_with_idx = [(i, pred, gold) for i, (pred, gold) in enumerate(matched_predictions)]

    # Parallel evaluation with ThreadPoolExecutor.
    print(f"  Evaluating with {MAX_WORKERS} parallel workers...")
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {executor.submit(_evaluate_single_prediction, item): item[0]
                   for item in items_with_idx}

        completed_indices = set()
        try:
            for future in as_completed(futures, timeout=600):  # 10 min overall timeout
                try:
                    result = future.result(timeout=120)  # 2 min per item max
                    evals.append(result)
                    completed_indices.add(result['idx'])
                    completed += 1
                    if completed % 50 == 0 or completed == total:
                        print(f"    Progress: {completed}/{total}")
                except FuturesTimeout:
                    idx = futures[future]
                    print(f"    TIMEOUT: Item {idx} took too long, skipping")
                    completed += 1
        except FuturesTimeout:
            # Overall deadline hit — report which items never finished.
            pending = set(range(total)) - completed_indices
            print(f"  OVERALL TIMEOUT: {len(pending)} items still pending: {sorted(pending)[:10]}...")
            # Cancel remaining futures (only un-started ones can be cancelled).
            for future in futures:
                future.cancel()

    if not evals:
        return None

    # Aggregate overall metrics.
    n = len(evals)
    semantic_scores = [e['semantic_score'] for e in evals]

    # Bias correction for the LLM-judge scores (G-Eval style aggregation).
    agg = aggregate_anls_star_llm(semantic_scores, apply_bias_correction=True)

    mean_anls = sum(e['anls'] for e in evals) / n * 100
    mean_doc_f1 = sum(e['doc_f1'] for e in evals) / n * 100
    mean_page_f1 = sum(e['page_f1'] for e in evals) / n * 100

    kuiper = kuiper_statistic(evals)

    # By hop type.
    single_hop = [e for e in evals if e['hop_type'] == 'single']
    cross_page = [e for e in evals if e['hop_type'] == 'cross_page']
    cross_doc = [e for e in evals if e['hop_type'] == 'cross_doc']

    # By domain.
    by_domain = defaultdict(list)
    for e in evals:
        domain = e['domain'] or 'Other'
        by_domain[domain].append(e)

    domain_scores = {}
    for domain, domain_evals in sorted(by_domain.items()):
        domain_scores[domain] = {
            'semantic': sum(e['semantic_score'] for e in domain_evals) / len(domain_evals) * 100,
            'anls': sum(e['anls'] for e in domain_evals) / len(domain_evals) * 100,
            'n': len(domain_evals)
        }

    return {
        'overall': {
            'semantic': agg['adjusted_score'] * 100,
            'semantic_ci': (agg['ci_lower'] * 100, agg['ci_upper'] * 100),  # 95% CI
            'anls': mean_anls,
            'page_f1': mean_page_f1,
            'doc_f1': mean_doc_f1,
            'kuiper': kuiper['kuiper_stat'] if not kuiper.get('degenerate') else None,
        },
        'single_evidence': {
            'semantic': sum(e['semantic_score'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
            'anls': sum(e['anls'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
            'n': len(single_hop)
        },
        'multi_evidence_same_doc': {
            'semantic': sum(e['semantic_score'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
            'anls': sum(e['anls'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
            'n': len(cross_page)
        },
        'multi_evidence_multi_doc': {
            'semantic': sum(e['semantic_score'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
            'anls': sum(e['anls'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
            'n': len(cross_doc)
        },
        'by_domain': domain_scores,
        'n_evaluated': n,
        'n_unmatched': unmatched
    }
270
+
271
+
272
def main():
    """CLI entry point: re-evaluate prediction files and save/upload results.

    Modes:
        --dry-run        list candidate prediction files and exit
        --upload         upload previously saved results, no re-evaluation
        --org ORG        restrict to one organization folder
        --skip-existing  skip files whose result JSON already exists locally
    """
    import argparse
    parser = argparse.ArgumentParser(description="Batch re-evaluate submissions")
    parser.add_argument('--dry-run', action='store_true', help="List files only, don't evaluate")
    parser.add_argument('--org', type=str, help="Filter by organization (e.g., 'OpenAI')")
    parser.add_argument('--upload', action='store_true', help="Upload already processed results to HuggingFace Hub (no re-evaluation)")
    parser.add_argument('--skip-existing', action='store_true', help="Skip already evaluated files")
    args = parser.parse_args()

    OUTPUT_DIR.mkdir(exist_ok=True)

    # Upload-only mode: just upload existing files.
    if args.upload:
        print("Uploading existing results to HuggingFace Hub...")
        api = HfApi()
        result_files = list(OUTPUT_DIR.glob("**/*.json"))
        print(f"Found {len(result_files)} result files to upload")

        for result_file in result_files:
            rel_path = result_file.relative_to(OUTPUT_DIR)
            print(f"  Uploading: {rel_path}")
            try:
                api.upload_file(
                    path_or_fileobj=str(result_file),
                    path_in_repo=str(rel_path),
                    repo_id=RESULTS_REPO,
                    repo_type="dataset",
                    token=TOKEN,
                    commit_message=f"Re-evaluate with semantic accuracy: {rel_path.stem}"
                )
                print(f"    ✓ Done")
            except Exception as e:
                print(f"    ✗ Error: {e}")
        print("\nUpload complete!")
        return

    # Find prediction files.
    print("Finding prediction files...")
    pred_files = find_prediction_files(args.org)
    print(f"Found {len(pred_files)} prediction files")

    if args.dry_run:
        for f in pred_files:
            print(f"  - {f}")
        return

    # Load gold standard.
    gold_by_id, gold_by_text = load_gold_data()
    print(f"Loaded {len(gold_by_id)} gold examples")

    # Process each file.
    for i, pred_file in enumerate(pred_files):
        print(f"\n{'='*60}")
        print(f"[{i+1}/{len(pred_files)}] Processing: {pred_file}")
        print('='*60)

        # Compute the output path exactly as it will be saved below:
        # {OUTPUT_DIR}/{org}/{name}_results_{ts}.json
        # BUGFIX: the skip check previously probed a '*_reevaluated.json'
        # path directly under OUTPUT_DIR, which is never where results are
        # written, so --skip-existing re-evaluated everything.
        org = Path(pred_file).parts[0] if '/' in pred_file else 'Unknown'
        output_filename = Path(pred_file).name.replace('_predictions', '_results').replace('.jsonl', '.json')
        output_file = OUTPUT_DIR / org / output_filename
        if args.skip_existing and output_file.exists():
            print("  Skipping (already processed)")
            continue

        try:
            # Download predictions.
            print("  Downloading predictions...")
            local_pred = download_file(pred_file)

            predictions = []
            with open(local_pred) as f:
                for line in f:
                    if line.strip():
                        predictions.append(json.loads(line))
            print(f"  Loaded {len(predictions)} predictions")

            # Download original results to preserve submission metadata.
            result_file = find_result_file(pred_file)
            original_metadata = {}
            if result_file:
                try:
                    local_result = download_file(result_file)
                    with open(local_result) as f:
                        original_data = json.load(f)
                    original_metadata = {
                        'model_name': original_data.get('model_name'),
                        'organization': original_data.get('organization'),
                        'description': original_data.get('description'),
                        'link': original_data.get('link'),
                        'tags': original_data.get('tags'),
                        'submitted_by': original_data.get('submitted_by'),
                        'metadata': original_data.get('metadata'),
                        'submission_date': original_data.get('submission_date'),
                    }
                    print(f"  Loaded metadata: model_name={original_metadata.get('model_name')}")
                except Exception as e:
                    print(f"  Warning: Could not load original results: {e}")

            # Fallback: derive metadata from the filename if not found.
            if not original_metadata.get('model_name'):
                # Pattern: Org/Model_Name_with_Stuff_predictions_timestamp.jsonl
                filename = Path(pred_file).stem
                parts = filename.rsplit('_predictions_', 1)
                # BUGFIX: rsplit always returns a non-empty list, so the old
                # `if parts:` guard was always true; require an actual split.
                if len(parts) == 2:
                    model_name = parts[0].replace('_', ' ')
                    original_metadata = {
                        'model_name': model_name,
                        'organization': org.replace('_', ' '),
                        'description': '',
                        'tags': ['Agentic'],
                        'metadata': {'model_type': 'unknown'},
                    }
                    print(f"  Using fallback metadata: model_name={model_name}, org={org}")

            # Evaluate.
            print("  Evaluating with semantic accuracy...")
            start_time = time.time()
            results = evaluate_with_semantic(predictions, gold_by_id, gold_by_text)
            elapsed = time.time() - start_time

            if results:
                print(f"\n  Results (took {elapsed:.1f}s):")
                print(f"    Semantic Accuracy: {results['overall']['semantic']:.1f}")
                print(f"    ANLS*: {results['overall']['anls']:.1f}")
                print(f"    Page F1: {results['overall']['page_f1']:.1f}")

                # Save with original metadata.
                full_result = {
                    **original_metadata,
                    'results': results,
                    'reevaluated_date': datetime.now(timezone.utc).isoformat(),
                    'source_predictions_file': pred_file,
                    'result_file_path': f"{org}/{output_filename}",
                }

                # Create org subfolder (parents=True so a fresh checkout works).
                output_file.parent.mkdir(parents=True, exist_ok=True)
                with open(output_file, 'w') as f:
                    json.dump(full_result, f, indent=2)
                print(f"  Saved to: {output_file}")
            else:
                print("  No valid evaluations")

        except Exception as e:
            print(f"  Error: {e}")
            import traceback
            traceback.print_exc()
            continue

    print(f"\n{'='*60}")
    print("DONE!")
    print(f"Results saved to: {OUTPUT_DIR}")
    print(f"\nTo upload results, run: python batch_reevaluate.py --upload")


if __name__ == "__main__":
    main()
434
+
eval/evaluate.py CHANGED
@@ -18,7 +18,14 @@ from typing import Any, Dict, List, Optional, Tuple
18
 
19
  from datasets import load_dataset
20
 
21
- from metrics import anls_star, citation_f1, kuiper_statistic, wasted_effort_ratio
 
 
 
 
 
 
 
22
 
23
 
24
  def derive_hop_type(evidence: list) -> str:
@@ -106,11 +113,18 @@ def load_results(filepath: Path) -> List[Dict]:
106
  def evaluate_single(
107
  result: Dict,
108
  gold_by_text: Dict[str, Dict],
109
- gold_by_id: Dict[str, Dict]
 
110
  ) -> Optional[Dict[str, Any]]:
111
  """Evaluate a single prediction.
112
 
113
  Matches by question text first, falls back to question ID if not found.
 
 
 
 
 
 
114
  """
115
  question = result.get('question', '').strip()
116
  qid = result.get('id', '')
@@ -128,7 +142,15 @@ def evaluate_single(
128
 
129
  # ANLS*
130
  anls = anls_star(answer, gold_data['answers'])
131
- correct = anls >= 0.5
 
 
 
 
 
 
 
 
132
 
133
  # Citation F1
134
  doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
@@ -141,6 +163,7 @@ def evaluate_single(
141
  return {
142
  'question': question,
143
  'anls': anls,
 
144
  'correct': correct,
145
  'doc_f1': doc_f1['f1'],
146
  'page_f1': page_f1['f1'],
@@ -151,7 +174,7 @@ def evaluate_single(
151
  }
152
 
153
 
154
- def aggregate_metrics(evals: List[Dict]) -> Dict[str, Any]:
155
  """Aggregate metrics across evaluations."""
156
  if not evals:
157
  return {}
@@ -162,6 +185,16 @@ def aggregate_metrics(evals: List[Dict]) -> Dict[str, Any]:
162
  mean_doc_f1 = sum(e['doc_f1'] for e in evals) / n
163
  mean_page_f1 = sum(e['page_f1'] for e in evals) / n
164
 
 
 
 
 
 
 
 
 
 
 
165
  # Kuiper
166
  kuiper = kuiper_statistic(evals)
167
  wasted = wasted_effort_ratio(evals)
@@ -170,6 +203,8 @@ def aggregate_metrics(evals: List[Dict]) -> Dict[str, Any]:
170
  'n': n,
171
  'accuracy': accuracy,
172
  'mean_anls': mean_anls,
 
 
173
  'doc_f1': mean_doc_f1,
174
  'page_f1': mean_page_f1,
175
  'kuiper_stat': kuiper['kuiper_stat'],
@@ -180,7 +215,7 @@ def aggregate_metrics(evals: List[Dict]) -> Dict[str, Any]:
180
  }
181
 
182
 
183
- def print_metrics(name: str, metrics: Dict, indent: int = 0):
184
  """Print metrics in a formatted way."""
185
  prefix = " " * indent
186
 
@@ -189,8 +224,16 @@ def print_metrics(name: str, metrics: Dict, indent: int = 0):
189
  return
190
 
191
  print(f"{prefix}{name} (n={metrics['n']}):")
192
- print(f"{prefix} Accuracy (ANLS*≥0.5): {metrics['accuracy']:.1%}")
193
- print(f"{prefix} Mean ANLS*: {metrics['mean_anls']:.4f}")
 
 
 
 
 
 
 
 
194
  print(f"{prefix} Document F1: {metrics['doc_f1']:.4f}")
195
  print(f"{prefix} Page F1: {metrics['page_f1']:.4f}")
196
 
@@ -207,16 +250,20 @@ def evaluate_file(
207
  gold_by_id: Dict[str, Dict],
208
  by_category: bool = False,
209
  by_domain: bool = False,
210
- by_hop_type: bool = True
 
211
  ) -> Dict[str, Any]:
212
  """Evaluate a single results file."""
213
  results = load_results(filepath)
214
 
215
  evals = []
216
  unmatched = 0
 
217
 
218
- for result in results:
219
- ev = evaluate_single(result, gold_by_text, gold_by_id)
 
 
220
  if ev:
221
  evals.append(ev)
222
  else:
@@ -226,30 +273,30 @@ def evaluate_file(
226
  print(f" Warning: {unmatched} questions not found in gold standard")
227
 
228
  # Overall metrics
229
- overall = aggregate_metrics(evals)
230
 
231
- output = {'overall': overall}
232
 
233
  # By hop type (always included by default)
234
  if by_hop_type:
235
  by_hop = defaultdict(list)
236
  for e in evals:
237
  by_hop[e.get('hop_type', 'single')].append(e)
238
- output['by_hop_type'] = {hop: aggregate_metrics(items) for hop, items in sorted(by_hop.items())}
239
 
240
  # By category
241
  if by_category:
242
  by_cat = defaultdict(list)
243
  for e in evals:
244
  by_cat[e['category'] or 'Unknown'].append(e)
245
- output['by_category'] = {cat: aggregate_metrics(items) for cat, items in sorted(by_cat.items())}
246
 
247
  # By domain
248
  if by_domain:
249
  by_dom = defaultdict(list)
250
  for e in evals:
251
  by_dom[e['domain'] or 'Other'].append(e)
252
- output['by_domain'] = {dom: aggregate_metrics(items) for dom, items in sorted(by_dom.items())}
253
 
254
  return output
255
 
@@ -273,6 +320,8 @@ Examples:
273
  parser.add_argument('--by-domain', action='store_true', help='Show metrics by domain')
274
  parser.add_argument('--compare', action='store_true', help='Compare multiple models side-by-side')
275
  parser.add_argument('--json', action='store_true', help='Output as JSON')
 
 
276
 
277
  args = parser.parse_args()
278
 
@@ -298,7 +347,13 @@ Examples:
298
  name = name[:-8]
299
 
300
  print(f"\nEvaluating: {filepath.name}")
301
- result = evaluate_file(filepath, gold_by_text, gold_by_id, args.by_category, args.by_domain)
 
 
 
 
 
 
302
  all_results[name] = result
303
 
304
  # Output
@@ -324,31 +379,42 @@ Examples:
324
  # Comparison table
325
  models = list(all_results.keys())
326
 
327
- print(f"\n{'Model':<35} {'Acc':<8} {'ANLS*':<8} {'Doc F1':<8} {'Page F1':<8} {'Kuiper':<8}")
328
- print("-" * 75)
329
-
330
- for model in sorted(models, key=lambda m: -all_results[m]['overall'].get('accuracy', 0)):
331
- m = all_results[model]['overall']
332
- kuiper_str = f"{m['kuiper_stat']:.2f}" if not m.get('kuiper_degenerate') else "N/A"
333
- print(f"{model:<35} {m.get('accuracy', 0):.1%} {m.get('mean_anls', 0):.4f} "
334
- f"{m.get('doc_f1', 0):.4f} {m.get('page_f1', 0):.4f} {kuiper_str}")
 
 
 
 
 
 
 
 
 
 
335
  else:
336
  # Detailed per-model output
337
  for model, result in all_results.items():
338
  print(f"\n{'─' * 40}")
339
- print_metrics(model, result['overall'])
 
340
 
341
  if 'by_category' in result:
342
  print(f"\n By Category:")
343
  for cat, metrics in sorted(result['by_category'].items(),
344
  key=lambda x: -x[1].get('n', 0)):
345
- print_metrics(cat, metrics, indent=2)
346
 
347
  if 'by_domain' in result:
348
  print(f"\n By Domain:")
349
  for dom, metrics in sorted(result['by_domain'].items(),
350
  key=lambda x: -x[1].get('n', 0)):
351
- print_metrics(dom, metrics, indent=2)
352
 
353
  print()
354
 
 
18
 
19
  from datasets import load_dataset
20
 
21
+ from metrics import (
22
+ anls_star,
23
+ anls_star_llm,
24
+ aggregate_anls_star_llm,
25
+ citation_f1,
26
+ kuiper_statistic,
27
+ wasted_effort_ratio
28
+ )
29
 
30
 
31
  def derive_hop_type(evidence: list) -> str:
 
113
  def evaluate_single(
114
  result: Dict,
115
  gold_by_text: Dict[str, Dict],
116
+ gold_by_id: Dict[str, Dict],
117
+ use_semantic: bool = False
118
  ) -> Optional[Dict[str, Any]]:
119
  """Evaluate a single prediction.
120
 
121
  Matches by question text first, falls back to question ID if not found.
122
+
123
+ Args:
124
+ result: Prediction dict with 'question', 'answer', 'citations'
125
+ gold_by_text: Gold data indexed by question text
126
+ gold_by_id: Gold data indexed by question ID
127
+ use_semantic: If True, also compute semantic accuracy with LLM judge
128
  """
129
  question = result.get('question', '').strip()
130
  qid = result.get('id', '')
 
142
 
143
  # ANLS*
144
  anls = anls_star(answer, gold_data['answers'])
145
+
146
+ # Semantic accuracy with LLM judge (if enabled)
147
+ if use_semantic:
148
+ llm_result = anls_star_llm(answer, gold_data['answers'], question)
149
+ semantic = llm_result['score']
150
+ correct = semantic >= 0.5
151
+ else:
152
+ semantic = anls
153
+ correct = anls >= 0.5
154
 
155
  # Citation F1
156
  doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
 
163
  return {
164
  'question': question,
165
  'anls': anls,
166
+ 'semantic': semantic,
167
  'correct': correct,
168
  'doc_f1': doc_f1['f1'],
169
  'page_f1': page_f1['f1'],
 
174
  }
175
 
176
 
177
+ def aggregate_metrics(evals: List[Dict], use_semantic: bool = False) -> Dict[str, Any]:
178
  """Aggregate metrics across evaluations."""
179
  if not evals:
180
  return {}
 
185
  mean_doc_f1 = sum(e['doc_f1'] for e in evals) / n
186
  mean_page_f1 = sum(e['page_f1'] for e in evals) / n
187
 
188
+ # Semantic accuracy with bias correction
189
+ if use_semantic and 'semantic' in evals[0]:
190
+ semantic_scores = [e['semantic'] for e in evals]
191
+ agg = aggregate_anls_star_llm(semantic_scores, apply_bias_correction=True)
192
+ mean_semantic = agg['adjusted_score']
193
+ semantic_ci = (agg['ci_lower'], agg['ci_upper'])
194
+ else:
195
+ mean_semantic = mean_anls
196
+ semantic_ci = None
197
+
198
  # Kuiper
199
  kuiper = kuiper_statistic(evals)
200
  wasted = wasted_effort_ratio(evals)
 
203
  'n': n,
204
  'accuracy': accuracy,
205
  'mean_anls': mean_anls,
206
+ 'mean_semantic': mean_semantic,
207
+ 'semantic_ci': semantic_ci,
208
  'doc_f1': mean_doc_f1,
209
  'page_f1': mean_page_f1,
210
  'kuiper_stat': kuiper['kuiper_stat'],
 
215
  }
216
 
217
 
218
+ def print_metrics(name: str, metrics: Dict, indent: int = 0, use_semantic: bool = False):
219
  """Print metrics in a formatted way."""
220
  prefix = " " * indent
221
 
 
224
  return
225
 
226
  print(f"{prefix}{name} (n={metrics['n']}):")
227
+
228
+ if use_semantic and 'mean_semantic' in metrics:
229
+ ci = metrics.get('semantic_ci')
230
+ ci_str = f" [{ci[0]:.2%}-{ci[1]:.2%}]" if ci else ""
231
+ print(f"{prefix} Semantic Accuracy: {metrics['mean_semantic']:.2%}{ci_str}")
232
+ print(f"{prefix} ANLS* (string): {metrics['mean_anls']:.4f}")
233
+ else:
234
+ print(f"{prefix} Accuracy (ANLS*≥0.5): {metrics['accuracy']:.1%}")
235
+ print(f"{prefix} Mean ANLS*: {metrics['mean_anls']:.4f}")
236
+
237
  print(f"{prefix} Document F1: {metrics['doc_f1']:.4f}")
238
  print(f"{prefix} Page F1: {metrics['page_f1']:.4f}")
239
 
 
250
  gold_by_id: Dict[str, Dict],
251
  by_category: bool = False,
252
  by_domain: bool = False,
253
+ by_hop_type: bool = True,
254
+ use_semantic: bool = False
255
  ) -> Dict[str, Any]:
256
  """Evaluate a single results file."""
257
  results = load_results(filepath)
258
 
259
  evals = []
260
  unmatched = 0
261
+ total = len(results)
262
 
263
+ for i, result in enumerate(results):
264
+ if use_semantic and (i + 1) % 50 == 0:
265
+ print(f" Processing {i+1}/{total}...")
266
+ ev = evaluate_single(result, gold_by_text, gold_by_id, use_semantic=use_semantic)
267
  if ev:
268
  evals.append(ev)
269
  else:
 
273
  print(f" Warning: {unmatched} questions not found in gold standard")
274
 
275
  # Overall metrics
276
+ overall = aggregate_metrics(evals, use_semantic=use_semantic)
277
 
278
+ output = {'overall': overall, 'use_semantic': use_semantic}
279
 
280
  # By hop type (always included by default)
281
  if by_hop_type:
282
  by_hop = defaultdict(list)
283
  for e in evals:
284
  by_hop[e.get('hop_type', 'single')].append(e)
285
+ output['by_hop_type'] = {hop: aggregate_metrics(items, use_semantic) for hop, items in sorted(by_hop.items())}
286
 
287
  # By category
288
  if by_category:
289
  by_cat = defaultdict(list)
290
  for e in evals:
291
  by_cat[e['category'] or 'Unknown'].append(e)
292
+ output['by_category'] = {cat: aggregate_metrics(items, use_semantic) for cat, items in sorted(by_cat.items())}
293
 
294
  # By domain
295
  if by_domain:
296
  by_dom = defaultdict(list)
297
  for e in evals:
298
  by_dom[e['domain'] or 'Other'].append(e)
299
+ output['by_domain'] = {dom: aggregate_metrics(items, use_semantic) for dom, items in sorted(by_dom.items())}
300
 
301
  return output
302
 
 
320
  parser.add_argument('--by-domain', action='store_true', help='Show metrics by domain')
321
  parser.add_argument('--compare', action='store_true', help='Compare multiple models side-by-side')
322
  parser.add_argument('--json', action='store_true', help='Output as JSON')
323
+ parser.add_argument('--semantic', action='store_true',
324
+ help='Use semantic accuracy (ANLS* + LLM judge) instead of pure ANLS*. Requires GOOGLE_API_KEY.')
325
 
326
  args = parser.parse_args()
327
 
 
347
  name = name[:-8]
348
 
349
  print(f"\nEvaluating: {filepath.name}")
350
+ if args.semantic:
351
+ print(" Using semantic accuracy (ANLS* + LLM judge)...")
352
+ result = evaluate_file(
353
+ filepath, gold_by_text, gold_by_id,
354
+ args.by_category, args.by_domain,
355
+ use_semantic=args.semantic
356
+ )
357
  all_results[name] = result
358
 
359
  # Output
 
379
  # Comparison table
380
  models = list(all_results.keys())
381
 
382
+ if args.semantic:
383
+ print(f"\n{'Model':<35} {'Semantic':<10} {'ANLS*':<8} {'Doc F1':<8} {'Page F1':<8} {'Kuiper':<8}")
384
+ print("-" * 85)
385
+
386
+ for model in sorted(models, key=lambda m: -all_results[m]['overall'].get('mean_semantic', 0)):
387
+ m = all_results[model]['overall']
388
+ kuiper_str = f"{m['kuiper_stat']:.2f}" if not m.get('kuiper_degenerate') else "N/A"
389
+ print(f"{model:<35} {m.get('mean_semantic', 0):.1%} {m.get('mean_anls', 0):.4f} "
390
+ f"{m.get('doc_f1', 0):.4f} {m.get('page_f1', 0):.4f} {kuiper_str}")
391
+ else:
392
+ print(f"\n{'Model':<35} {'Acc':<8} {'ANLS*':<8} {'Doc F1':<8} {'Page F1':<8} {'Kuiper':<8}")
393
+ print("-" * 75)
394
+
395
+ for model in sorted(models, key=lambda m: -all_results[m]['overall'].get('accuracy', 0)):
396
+ m = all_results[model]['overall']
397
+ kuiper_str = f"{m['kuiper_stat']:.2f}" if not m.get('kuiper_degenerate') else "N/A"
398
+ print(f"{model:<35} {m.get('accuracy', 0):.1%} {m.get('mean_anls', 0):.4f} "
399
+ f"{m.get('doc_f1', 0):.4f} {m.get('page_f1', 0):.4f} {kuiper_str}")
400
  else:
401
  # Detailed per-model output
402
  for model, result in all_results.items():
403
  print(f"\n{'─' * 40}")
404
+ use_sem = result.get('use_semantic', False)
405
+ print_metrics(model, result['overall'], use_semantic=use_sem)
406
 
407
  if 'by_category' in result:
408
  print(f"\n By Category:")
409
  for cat, metrics in sorted(result['by_category'].items(),
410
  key=lambda x: -x[1].get('n', 0)):
411
+ print_metrics(cat, metrics, indent=2, use_semantic=use_sem)
412
 
413
  if 'by_domain' in result:
414
  print(f"\n By Domain:")
415
  for dom, metrics in sorted(result['by_domain'].items(),
416
  key=lambda x: -x[1].get('n', 0)):
417
+ print_metrics(dom, metrics, indent=2, use_semantic=use_sem)
418
 
419
  print()
420
 
eval/metrics.py CHANGED
@@ -3,15 +3,180 @@ Core evaluation metrics for document QA.
3
 
4
  Metrics:
5
  - ANLS*: Answer-level Normalized Levenshtein Similarity
 
6
  - Citation F1: Document-level and Page-level F1 scores
7
  - Kuiper Statistic: Effort-accuracy calibration measure
 
 
 
8
  """
9
 
10
- from typing import Any, Dict, List, Set, Tuple
 
 
 
 
11
  import numpy as np
 
12
  from anls_star import anls_score
13
 
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def anls_star(predicted: Any, ground_truths: List[List[str]]) -> float:
16
  """
17
  Calculate ANLS* score (case-insensitive).
@@ -49,6 +214,340 @@ def anls_star(predicted: Any, ground_truths: List[List[str]]) -> float:
49
  return max_score
50
 
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  def citation_f1(
53
  predicted_citations: List[Dict[str, Any]],
54
  gold_locations: List[Dict[str, Any]],
 
3
 
4
  Metrics:
5
  - ANLS*: Answer-level Normalized Levenshtein Similarity
6
+ - ANLS*+LLM: ANLS* with LLM fallback for semantic equivalence
7
  - Citation F1: Document-level and Page-level F1 scores
8
  - Kuiper Statistic: Effort-accuracy calibration measure
9
+
10
+ Bias Correction:
11
+ Based on "How to Correctly Report LLM-as-a-Judge Evaluations" (2511.21140v2)
12
  """
13
 
14
+ import json
15
+ import os
16
+ import time
17
+ from math import sqrt
18
+ from typing import Any, Dict, List, Optional, Set, Tuple
19
  import numpy as np
20
+ from scipy.stats import norm
21
  from anls_star import anls_score
22
 
23
 
24
# ============================================================================
# LLM Judge Calibration (from human evaluation)
# ============================================================================

# Calibration measured on a 200-sample human-labelled evaluation set.
LLM_JUDGE_SENSITIVITY = 0.980  # q1 = P(LLM says correct | human says correct)
LLM_JUDGE_SPECIFICITY = 1.000  # q0 = P(LLM says incorrect | human says incorrect)
# Calibration split sizes (feed the confidence-interval formulas).
LLM_JUDGE_CALIBRATION_M1 = 152  # calibration samples where human = correct
LLM_JUDGE_CALIBRATION_M0 = 48   # calibration samples where human = incorrect


def bias_adjusted_score(
    raw_score: float,
    q0: float = LLM_JUDGE_SPECIFICITY,
    q1: float = LLM_JUDGE_SENSITIVITY
) -> float:
    """
    Rogan-Gladen bias correction for an LLM-judged score.

    From "How to Correctly Report LLM-as-a-Judge Evaluations":
        theta = (p + q0 - 1) / (q0 + q1 - 1)

    Args:
        raw_score: Raw LLM judgment score (p).
        q0: Specificity - P(LLM=incorrect | true=incorrect).
        q1: Sensitivity - P(LLM=correct | true=correct).

    Returns:
        Bias-adjusted score, clipped to [0, 1]. When q0 + q1 <= 1 the
        judge carries no signal, so the raw score is passed through.
    """
    denom = q0 + q1 - 1
    if denom <= 0:
        # Degenerate judge - no better than random; correction undefined.
        return raw_score
    corrected = (raw_score + q0 - 1) / denom
    return min(1.0, max(0.0, corrected))
63
+
64
+
65
def standard_error(
    raw_score: float,
    n_samples: int,
    q0: float = LLM_JUDGE_SPECIFICITY,
    q1: float = LLM_JUDGE_SENSITIVITY
) -> float:
    """
    Bias-adjusted standard error for an LLM-judge score.

    The plain binomial SE of the raw score is scaled by 1/(q0 + q1 - 1),
    mirroring the Rogan-Gladen point correction.

    Args:
        raw_score: Raw LLM judgment score (p).
        n_samples: Number of test samples.
        q0: Specificity.
        q1: Sensitivity.

    Returns:
        Bias-adjusted standard error; 0.0 for empty samples, a
        degenerate judge, or a raw score at 0 or 1.
    """
    if n_samples <= 0 or q0 + q1 <= 1:
        return 0.0

    # Binomial SE of the observed (uncorrected) proportion.
    se_raw = 0.0
    if 0 < raw_score < 1:
        se_raw = sqrt(raw_score * (1 - raw_score) / n_samples)

    # Propagate through the linear bias correction.
    return se_raw / (q0 + q1 - 1)
97
+
98
+
99
def confidence_interval(
    raw_score: float,
    n_samples: int,
    q0: float = LLM_JUDGE_SPECIFICITY,
    q1: float = LLM_JUDGE_SENSITIVITY,
    m0: int = LLM_JUDGE_CALIBRATION_M0,
    m1: int = LLM_JUDGE_CALIBRATION_M1,
    alpha: float = 0.05
) -> Tuple[float, float]:
    """
    Confidence interval for the bias-adjusted score.

    For a high-quality calibration (q0 + q1 > 1.9) the observed
    sensitivity/specificity are trusted directly and only test-set
    sampling noise enters the interval. Otherwise the full formula is
    used: all three proportions are regularized (Agresti-Coull style)
    and calibration uncertainty is propagated.

    Args:
        raw_score: Raw LLM judgment score (p).
        n_samples: Number of test samples.
        q0: Specificity.
        q1: Sensitivity.
        m0: Calibration samples where human=incorrect.
        m1: Calibration samples where human=correct.
        alpha: Significance level (0.05 gives a 95% CI).

    Returns:
        (lower_bound, upper_bound), each clipped to [0, 1].
    """
    z = norm.ppf(1 - alpha / 2)

    if q0 + q1 > 1.9:
        # High-quality calibration: simple binomial SE on the test set,
        # scaled by the bias-correction factor.
        theta = bias_adjusted_score(raw_score, q0, q1)
        se = 0.0
        if n_samples > 0:
            se = sqrt(raw_score * (1 - raw_score) / n_samples)
        se /= (q0 + q1 - 1)
        return (max(0.0, theta - z * se), min(1.0, theta + z * se))

    # Full formula with regularization for lower-quality calibration.
    p = (n_samples * raw_score + z**2 / 2) / (n_samples + z**2)
    q0_adj = (m0 * q0 + 1) / (m0 + 2)
    q1_adj = (m1 * q1 + 1) / (m1 + 2)

    n_adj = n_samples + z**2
    m0_adj = m0 + 2
    m1_adj = m1 + 2

    if q0_adj + q1_adj <= 1:
        # Judge degenerate even after regularization: no information.
        return (0.0, 1.0)

    theta = (p + q0_adj - 1) / (q0_adj + q1_adj - 1)

    # Second-order bias term of the ratio estimator.
    dth = 2 * z**2 * (
        -(1 - theta) * q0_adj * (1 - q0_adj) / m0_adj
        + theta * q1_adj * (1 - q1_adj) / m1_adj
    )

    # Standard error combining test-set and calibration variance.
    se = sqrt(
        p * (1 - p) / n_adj
        + (1 - theta)**2 * q0_adj * (1 - q0_adj) / m0_adj
        + theta**2 * q1_adj * (1 - q1_adj) / m1_adj
    ) / (q0_adj + q1_adj - 1)

    return (max(0.0, theta + dth - z * se), min(1.0, theta + dth + z * se))
178
+
179
+
180
  def anls_star(predicted: Any, ground_truths: List[List[str]]) -> float:
181
  """
182
  Calculate ANLS* score (case-insensitive).
 
214
  return max_score
215
 
216
 
217
+ # ============================================================================
218
+ # ANLS* + LLM Judge Metric
219
+ # ============================================================================
220
+
221
+ _GEVAL_PROMPT_TEMPLATE = """You are evaluating answer correctness for a Document QA benchmark.
222
+
223
+ ## Input
224
+ Question: {question}
225
+ Predicted Answer: {predicted}
226
+ Gold Answer Variants: {gold_variants}
227
+
228
+ ## Evaluation Criteria
229
+
230
+ **correct**: Predicted answer is semantically equivalent to at least one gold variant. Minor format differences are acceptable.
231
+
232
+ **partial**: Predicted answer contains correct core information but has a significant format issue (e.g., list presented as comma-separated string when items are short/atomic) OR includes irrelevant additions.
233
+
234
+ **incorrect**: Predicted answer is factually wrong, missing, contains different information, or fails to answer the question type (e.g., no Yes/No for binary questions). Missing unit qualifiers that change magnitude (thousands, millions) are incorrect.
235
+
236
+ ## Evaluation Steps
237
+
238
+ Follow these steps in order:
239
+
240
+ Step 1 - Check for refusal: Does the answer refuse or claim inability to answer? If yes → incorrect.
241
+
242
+ Step 2 - Compare content: Does the predicted answer match the core meaning of any gold variant? If content is wrong or different → incorrect.
243
+
244
+ Step 3 - Check critical errors (any of these → incorrect):
245
+ - Missing scale qualifiers that change magnitude: "50" vs "$50 million" → incorrect
246
+ - Binary questions without explicit Yes/No: Q: "Is X true?" A: "X is observed" → incorrect (must say Yes or No)
247
+ - Wrong entity/value: different person, company, number than gold → incorrect
248
+ - Partial list with wrong items mixed in: some correct + some wrong items → incorrect
249
+
250
+ Step 4 - Check format (only if content is correct):
251
+ - If gold expects multiple items AND predicted is a comma-separated string (not a list) → partial
252
+ - If gold expects single item → no format issue possible
253
+
254
+ Step 5 - Check verbosity (only if content is correct):
255
+ - CORRECT (acceptable verbosity):
256
+ * Extra qualifiers: "three security questions" when gold is "3" → correct
257
+ * Relevant context: "No — Massachusetts; Washington" for "same state?" question → correct
258
+ * Clarifying phrases: "in his personal capacity", "per annum" → correct
259
+ - PARTIAL (medium verbosity) - ONLY when additions are truly irrelevant:
260
+ * Adding unrequested details to list items
261
+ * Over-specific precision: date+time when only date asked → partial
262
+ - INCORRECT (high verbosity):
263
+ * Multi-sentence responses when a word/phrase suffices
264
+ * Full paragraphs of explanation
265
+ * Conversational preambles: "Based on the document...", "The answer is..."
266
+
267
+ Based on your step-by-step analysis, provide your final judgment.
268
+
269
+ After your reasoning, you MUST call submit_judgment with your final decision."""
270
+
271
+
272
+ _LLM_JUDGE_TOOL = {
273
+ "function_declarations": [{
274
+ "name": "submit_judgment",
275
+ "description": "Submit your final judgment after reasoning through the evaluation steps",
276
+ "parameters": {
277
+ "type": "object",
278
+ "properties": {
279
+ "judgment": {
280
+ "type": "string",
281
+ "enum": ["correct", "partial", "incorrect"],
282
+ "description": "Final judgment: correct, partial, or incorrect"
283
+ },
284
+ "main_issue": {
285
+ "type": "string",
286
+ "enum": ["none", "refusal", "wrong_content", "missing_unit", "no_yes_no", "list_format", "verbosity_medium", "verbosity_high"],
287
+ "description": "The primary issue found, if any"
288
+ },
289
+ "explanation": {
290
+ "type": "string",
291
+ "description": "Brief explanation of your judgment"
292
+ }
293
+ },
294
+ "required": ["judgment", "main_issue", "explanation"]
295
+ }
296
+ }]
297
+ }
298
+
299
+
300
def _get_gemini_model():
    """Lazily construct the Gemini judge model; requires GOOGLE_API_KEY to be set."""
    import google.generativeai as genai

    key = os.environ.get("GOOGLE_API_KEY")
    if not key:
        raise ValueError("GOOGLE_API_KEY environment variable not set")
    genai.configure(api_key=key)
    return genai.GenerativeModel('gemini-2.5-flash')
308
+
309
+
310
def _call_gemini_with_timeout(model, prompt, timeout=30):
    """
    Run one Gemini generate_content call, bounded by `timeout` seconds.

    The call runs in a single worker thread; if it does not finish in
    time a built-in TimeoutError is raised (the caller retries on it).
    Forces tool calling via the submit_judgment function declaration.
    """
    from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError

    def _invoke():
        return model.generate_content(
            prompt,
            tools=[_LLM_JUDGE_TOOL],
            tool_config={"function_calling_config": {"mode": "ANY"}},
            request_options={"timeout": timeout}
        )

    with ThreadPoolExecutor(max_workers=1) as pool:
        pending = pool.submit(_invoke)
        try:
            return pending.result(timeout=timeout)
        except FuturesTimeoutError:
            raise TimeoutError(f"Gemini API call timed out after {timeout}s")
328
+
329
+
330
def _call_llm_judge(
    question: str,
    predicted: Any,
    gold_variants: List[List[str]],
    max_retries: int = 3,
    retry_delay: float = 1.0,
    timeout: float = 30.0
) -> Dict[str, Any]:
    """
    Evaluate one prediction with the Gemini judge, retrying on failure.

    Retries cover timeouts, transient API errors, and responses that
    lack the required submit_judgment tool call.

    Returns:
        Dict with 'judgment', 'main_issue', 'explanation', 'score'
        (1.0 / 0.5 / 0.0; any error path scores 0.0).
    """
    prompt = _GEVAL_PROMPT_TEMPLATE.format(
        question=question,
        predicted=json.dumps(predicted),
        gold_variants=json.dumps(gold_variants)
    )
    model = _get_gemini_model()
    score_map = {'correct': 1.0, 'partial': 0.5, 'incorrect': 0.0}

    for attempt in range(max_retries):
        last_attempt = attempt == max_retries - 1
        try:
            response = _call_gemini_with_timeout(model, prompt, timeout=timeout)

            # Pull the structured verdict out of the tool call, if present.
            candidates = response.candidates
            parts = candidates[0].content.parts if candidates else None
            if parts:
                for part in parts:
                    if hasattr(part, 'function_call') and part.function_call.name == "submit_judgment":
                        verdict = dict(part.function_call.args)
                        verdict['score'] = score_map.get(verdict.get('judgment', 'incorrect'), 0.0)
                        return verdict

            # Model answered without the required tool call - retry.
            if not last_attempt:
                time.sleep(retry_delay)
                continue

        except TimeoutError as e:
            if not last_attempt:
                time.sleep(retry_delay)
                continue
            return {
                'judgment': 'error',
                'main_issue': 'timeout',
                'explanation': str(e),
                'score': 0.0
            }
        except Exception as e:
            if not last_attempt:
                time.sleep(retry_delay * (attempt + 1))  # linearly growing backoff
                continue
            return {
                'judgment': 'error',
                'main_issue': 'error',
                'explanation': str(e),
                'score': 0.0
            }

    return {
        'judgment': 'error',
        'main_issue': 'parse_error',
        'explanation': 'Failed to get valid response after retries',
        'score': 0.0
    }
400
+
401
+
402
def anls_star_llm(
    predicted: Any,
    ground_truths: List[List[str]],
    question: str = "",
    threshold: float = 1.0
) -> Dict[str, Any]:
    """
    ANLS* with an LLM-judge fallback for semantic equivalence.

    Perfect string matches (ANLS* >= threshold, default 1.0) short-circuit
    to 1.0. Empty or pathologically long answers score 0.0 without an API
    call. Everything else is sent to the Gemini judge, provided a question
    text is available.

    Args:
        predicted: Predicted answer (string or list).
        ground_truths: List of gold answer variants.
        question: The question text (needed for the LLM judge).
        threshold: ANLS* value at or above which the judge is skipped.

    Returns:
        Dict with:
        - 'score': final score (0.0, 0.5, or 1.0)
        - 'anls_score': raw ANLS* score
        - 'used_llm': whether the judge was called
        - 'llm_judgment': judge details (or None if skipped)
    """
    # Cheap reject: nothing was predicted.
    empty = (
        predicted is None
        or predicted == ""
        or predicted == []
        or (isinstance(predicted, list) and all(not p for p in predicted))
    )
    if empty:
        return {
            'score': 0.0,
            'anls_score': 0.0,
            'used_llm': False,
            'llm_judgment': {'judgment': 'incorrect', 'main_issue': 'empty', 'explanation': 'Empty prediction'}
        }

    # Cheap reject: answer far too long to be a sensible extraction.
    MAX_ANSWER_LENGTH = 2000
    try:
        answer_length = len(json.dumps(predicted))
    except (TypeError, ValueError):
        answer_length = len(str(predicted))

    if answer_length > MAX_ANSWER_LENGTH:
        return {
            'score': 0.0,
            'anls_score': 0.0,
            'used_llm': False,
            'llm_judgment': {
                'judgment': 'incorrect',
                'main_issue': 'too_long',
                'explanation': f'Answer too long ({answer_length} chars > {MAX_ANSWER_LENGTH})'
            }
        }

    # String-similarity score first.
    anls = anls_star(predicted, ground_truths)

    outcome = {
        'score': anls,
        'anls_score': anls,
        'used_llm': False,
        'llm_judgment': None
    }

    # A (near-)exact string match needs no semantic check.
    if anls >= threshold:
        outcome['score'] = 1.0
        return outcome

    # Below threshold: defer to the LLM judge when we have the question.
    if question:
        judged = _call_llm_judge(question, predicted, ground_truths)
        outcome['used_llm'] = True
        outcome['llm_judgment'] = judged
        outcome['score'] = judged.get('score', 0.0)

    return outcome
485
+
486
+
487
def aggregate_anls_star_llm(
    scores: List[float],
    apply_bias_correction: bool = True
) -> Dict[str, Any]:
    """
    Aggregate per-question ANLS*+LLM scores into a reportable estimate.

    With bias correction enabled, applies the Rogan-Gladen adjustment and
    calibration-aware CI from "How to Correctly Report LLM-as-a-Judge
    Evaluations" (2511.21140v2); otherwise reports the raw mean with a
    plain binomial 95% CI.

    Args:
        scores: Individual ANLS*+LLM scores (0.0, 0.5, or 1.0).
        apply_bias_correction: Whether to apply the correction.

    Returns:
        Dict with 'raw_score', 'adjusted_score', 'se', 'ci_lower',
        'ci_upper', 'n_samples', 'q0', 'q1'.
    """
    if not scores:
        return {
            'raw_score': 0.0,
            'adjusted_score': 0.0,
            'se': 0.0,
            'ci_lower': 0.0,
            'ci_upper': 0.0,
            'n_samples': 0,
            'q0': LLM_JUDGE_SPECIFICITY,
            'q1': LLM_JUDGE_SENSITIVITY
        }

    n = len(scores)
    raw = sum(scores) / n

    result = {
        'raw_score': raw,
        'n_samples': n,
        'q0': LLM_JUDGE_SPECIFICITY,
        'q1': LLM_JUDGE_SENSITIVITY
    }

    if apply_bias_correction:
        ci_lo, ci_hi = confidence_interval(raw, n)
        result['adjusted_score'] = bias_adjusted_score(raw)
        result['se'] = standard_error(raw, n)
        result['ci_lower'] = ci_lo
        result['ci_upper'] = ci_hi
    else:
        result['adjusted_score'] = raw
        result['se'] = sqrt(raw * (1 - raw) / n) if n > 0 and 0 < raw < 1 else 0.0
        # Normal-approximation binomial CI, no calibration correction.
        se = sqrt(raw * (1 - raw) / n) if n > 0 else 0
        z = 1.96
        result['ci_lower'] = max(0.0, raw - z * se)
        result['ci_upper'] = min(1.0, raw + z * se)

    return result
549
+
550
+
551
  def citation_f1(
552
  predicted_citations: List[Dict[str, Any]],
553
  gold_locations: List[Dict[str, Any]],
eval/reevaluate_submissions.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Re-evaluate existing submissions with the new Semantic Accuracy metric.
4
+
5
+ This script:
6
+ 1. Downloads prediction files from HuggingFace Hub
7
+ 2. Re-evaluates them with ANLS* + LLM judge
8
+ 3. Updates the results files with new metrics
9
+ """
10
+
11
+ import json
12
+ import os
13
+ import sys
14
+ from pathlib import Path
15
+ from datetime import datetime, timezone
16
+
17
+ from huggingface_hub import HfApi, hf_hub_download, list_repo_files
18
+ from datasets import load_dataset
19
+
20
+ # Add parent for imports
21
+ sys.path.insert(0, str(Path(__file__).parent))
22
+ from metrics import (
23
+ anls_star,
24
+ anls_star_llm,
25
+ aggregate_anls_star_llm,
26
+ citation_f1,
27
+ kuiper_statistic
28
+ )
29
+
30
+ # Config
31
+ RESULTS_REPO = "agentic-document-ai/backend-results"
32
+ TOKEN = os.environ.get("HF_TOKEN")
33
+
34
+
35
def load_gold_data():
    """
    Load the private gold standard from HuggingFace.

    Returns:
        (gold_by_id, gold_by_text): the same entry dicts indexed by
        question id and by stripped question text.
    """
    print("Loading gold standard...")
    dataset = load_dataset("agentic-document-ai/dataset-PRIVATE", split="test")

    gold_by_id = {}
    gold_by_text = {}

    for example in dataset:
        question = example['question'].strip()
        entry = {
            'question': question,
            'answers': example.get('answer_variants', []),
            'evidence': example.get('evidence', []),
            'category': example.get('category', ''),
            'domain': example.get('domain', ''),
            'hop_type': example.get('hop_type', 'single'),
        }
        gold_by_id[example.get('id', '')] = entry
        gold_by_text[question] = entry

    return gold_by_id, gold_by_text
58
+
59
+
60
def find_prediction_files():
    """
    List all prediction JSONL files in the results repo.

    Matches files ending in '_predictions.jsonl' or containing
    '_predictions_' anywhere in the path.

    Returns:
        List of repo-relative file paths.
    """
    # Note: the original constructed an unused HfApi() instance here;
    # list_repo_files is a module-level helper and needs no client object.
    files = list_repo_files(RESULTS_REPO, repo_type="dataset", token=TOKEN)
    return [f for f in files if f.endswith('_predictions.jsonl') or '_predictions_' in f]
67
+
68
+
69
def download_predictions(filepath: str) -> list:
    """Fetch one predictions file from the Hub and parse it as JSONL (one dict per non-blank line)."""
    local_path = hf_hub_download(
        repo_id=RESULTS_REPO,
        filename=filepath,
        repo_type="dataset",
        token=TOKEN
    )

    records = []
    with open(local_path) as fh:
        for raw_line in fh:
            stripped = raw_line.strip()
            if stripped:
                records.append(json.loads(stripped))
    return records
84
+
85
+
86
def evaluate_with_semantic(predictions: list, gold_by_id: dict, gold_by_text: dict) -> dict:
    """
    Score a list of predictions with the semantic (ANLS* + LLM judge) metric.

    Each prediction is matched to the gold standard by question text first,
    then by id. The overall semantic score is bias-corrected; per-hop and
    per-domain breakdowns report raw means.

    Returns:
        Aggregate results dict (overall, hop-type and domain breakdowns,
        match counts), or None when nothing could be matched.
    """
    from collections import defaultdict

    def subset_stats(subset):
        # Raw (uncorrected) semantic/ANLS means for a slice, in percent.
        if not subset:
            return {'semantic': 0, 'anls': 0, 'n': 0}
        count = len(subset)
        return {
            'semantic': sum(e['semantic_score'] for e in subset) / count * 100,
            'anls': sum(e['anls'] for e in subset) / count * 100,
            'n': count
        }

    evals = []
    unmatched = 0
    total = len(predictions)

    for idx, pred in enumerate(predictions):
        if (idx + 1) % 50 == 0:
            print(f" Processing {idx+1}/{total}...")

        question = pred.get('question', '').strip()
        qid = pred.get('id', '')

        # Match to gold: question text first, then id.
        gold_data = gold_by_text.get(question)
        if gold_data is None and qid:
            gold_data = gold_by_id.get(qid)
        if not gold_data:
            unmatched += 1
            continue

        answer = pred.get('answer', '')
        citations = pred.get('citations', [])
        search_history = pred.get('search_history', [])
        steps = len(search_history) if search_history else pred.get('iterations', 0)

        # Per-question metrics: string ANLS*, judge-backed semantic score,
        # and citation F1 at both granularities.
        anls = anls_star(answer, gold_data['answers'])
        llm_result = anls_star_llm(answer, gold_data['answers'], question)
        semantic_score = llm_result['score']

        doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
        page_f1 = citation_f1(citations, gold_data['evidence'], level='page')

        evals.append({
            'anls': anls,
            'semantic_score': semantic_score,
            'correct': semantic_score >= 0.5,
            'doc_f1': doc_f1['f1'],
            'page_f1': page_f1['f1'],
            'steps': steps,
            'hop_type': gold_data.get('hop_type', 'single'),
            'category': gold_data['category'],
            'domain': gold_data['domain']
        })

    if not evals:
        return None

    n = len(evals)

    # Bias-corrected overall semantic score.
    agg = aggregate_anls_star_llm(
        [e['semantic_score'] for e in evals], apply_bias_correction=True
    )

    kuiper = kuiper_statistic(evals)

    # Domain breakdown.
    by_domain = defaultdict(list)
    for e in evals:
        by_domain[e['domain'] or 'Other'].append(e)
    domain_scores = {dom: subset_stats(items) for dom, items in sorted(by_domain.items())}

    return {
        'overall': {
            'semantic': agg['adjusted_score'] * 100,
            'anls': sum(e['anls'] for e in evals) / n * 100,
            'page_f1': sum(e['page_f1'] for e in evals) / n * 100,
            'doc_f1': sum(e['doc_f1'] for e in evals) / n * 100,
            'kuiper': kuiper['kuiper_stat'] if not kuiper.get('degenerate') else None,
        },
        'single_evidence': subset_stats([e for e in evals if e['hop_type'] == 'single']),
        'multi_evidence_same_doc': subset_stats([e for e in evals if e['hop_type'] == 'cross_page']),
        'multi_evidence_multi_doc': subset_stats([e for e in evals if e['hop_type'] == 'cross_doc']),
        'by_domain': domain_scores,
        'n_evaluated': n,
        'n_unmatched': unmatched
    }
201
+
202
+
203
def main():
    """CLI driver: re-evaluate one or all prediction files and save results locally."""
    import argparse
    parser = argparse.ArgumentParser(description="Re-evaluate submissions with semantic accuracy")
    parser.add_argument('--dry-run', action='store_true', help="Don't upload results")
    parser.add_argument('--file', type=str, help="Re-evaluate specific prediction file")
    args = parser.parse_args()

    gold_by_id, gold_by_text = load_gold_data()
    print(f"Loaded {len(gold_by_id)} gold examples")

    # Either a single explicit file or everything discovered in the repo.
    if args.file:
        pred_files = [args.file]
    else:
        print("\nFinding prediction files...")
        pred_files = find_prediction_files()
        print(f"Found {len(pred_files)} prediction files")

    for pred_file in pred_files:
        print(f"\n{'='*60}")
        print(f"Processing: {pred_file}")
        print('='*60)

        # One failing file must not abort the whole batch.
        try:
            predictions = download_predictions(pred_file)
            print(f"Loaded {len(predictions)} predictions")

            results = evaluate_with_semantic(predictions, gold_by_id, gold_by_text)

            if not results:
                print("No valid evaluations")
                continue

            print(f"\nResults:")
            print(f" Semantic Accuracy: {results['overall']['semantic']:.1f}")
            print(f" ANLS*: {results['overall']['anls']:.1f}")
            print(f" Page F1: {results['overall']['page_f1']:.1f}")

            # Save locally for review before any upload.
            output_file = Path(pred_file).stem + "_reevaluated.json"
            with open(output_file, 'w') as out:
                json.dump(results, out, indent=2)
            print(f"\nSaved to: {output_file}")

        except Exception as e:
            print(f"Error: {e}")
            continue
250
+
251
+
252
+ if __name__ == "__main__":
253
+ main()
254
+
eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_results_20260110_131629.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Qwen3-VL (235B-A22B-Thinking) with BM25 Search Tool",
3
+ "organization": "Alibaba Group",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "open-weight"
14
+ },
15
+ "submission_date": "2026-01-10T13:16:29.905067+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 59.09778741155781,
19
+ "anls": 57.61603118163428,
20
+ "page_f1": 58.72697776505391,
21
+ "doc_f1": 80.62601393262716,
22
+ "kuiper": 34.044088176352815
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 57.91583166332666,
26
+ "anls": 57.61603118163428,
27
+ "n": 499
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 66.66666666666666,
42
+ "anls": 61.98005698005697,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 81.81818181818183,
47
+ "anls": 80.39465804287939,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 66.66666666666666,
52
+ "anls": 66.4976376669925,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 44.565217391304344,
57
+ "anls": 49.53672826145982,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 56.25,
62
+ "anls": 57.39996898263027,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 61.702127659574465,
67
+ "anls": 60.56474101398679,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 56.09756097560976,
72
+ "anls": 53.957859669066565,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 60.46511627906976,
77
+ "anls": 54.79335264218985,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 40.0,
82
+ "anls": 49.05833333333333,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 68.75,
87
+ "anls": 73.07522250524337,
88
+ "n": 24
89
+ },
90
+ "Reference": {
91
+ "semantic": 59.61538461538461,
92
+ "anls": 63.19327183267644,
93
+ "n": 52
94
+ },
95
+ "Reports": {
96
+ "semantic": 57.333333333333336,
97
+ "anls": 53.11616787903517,
98
+ "n": 75
99
+ },
100
+ "Technical": {
101
+ "semantic": 71.73913043478261,
102
+ "anls": 57.18864273121033,
103
+ "n": 23
104
+ }
105
+ },
106
+ "n_evaluated": 499,
107
+ "n_unmatched": 1767
108
+ },
109
+ "reevaluated_date": "2026-01-15T19:55:28.547801+00:00",
110
+ "source_predictions_file": "Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_predictions_20260110_131629.jsonl",
111
+ "result_file_path": "Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_results_20260110_131629.json"
112
+ }
eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_results_20260110_132054.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Qwen3-VL (32B-Thinking) with BM25 Search Tool",
3
+ "organization": "Alibaba Group",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "open-weight"
14
+ },
15
+ "submission_date": "2026-01-10T13:20:54.125677+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 57.666353114392045,
19
+ "anls": 57.937064653000625,
20
+ "page_f1": 54.83061360816872,
21
+ "doc_f1": 78.76514934631167,
22
+ "kuiper": 36.33667334669349
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 56.51302605210421,
26
+ "anls": 57.937064653000625,
27
+ "n": 499
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 60.0,
42
+ "anls": 55.37037037037037,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 54.54545454545454,
47
+ "anls": 54.61297760210804,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 75.0,
52
+ "anls": 77.14578581514066,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 45.65217391304348,
57
+ "anls": 47.03746065646829,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 43.75,
62
+ "anls": 50.93257767828244,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 52.12765957446809,
67
+ "anls": 56.60682613221971,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 63.41463414634146,
72
+ "anls": 61.11435746903807,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 65.11627906976744,
77
+ "anls": 60.44405852545388,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 40.0,
82
+ "anls": 54.65844817149165,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 75.0,
87
+ "anls": 73.59601449275362,
88
+ "n": 24
89
+ },
90
+ "Reference": {
91
+ "semantic": 63.46153846153846,
92
+ "anls": 68.57667578882189,
93
+ "n": 52
94
+ },
95
+ "Reports": {
96
+ "semantic": 54.0,
97
+ "anls": 56.44955119487462,
98
+ "n": 75
99
+ },
100
+ "Technical": {
101
+ "semantic": 60.86956521739131,
102
+ "anls": 51.60498619336015,
103
+ "n": 23
104
+ }
105
+ },
106
+ "n_evaluated": 499,
107
+ "n_unmatched": 1767
108
+ },
109
+ "reevaluated_date": "2026-01-15T19:56:35.003631+00:00",
110
+ "source_predictions_file": "Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_predictions_20260110_132054.jsonl",
111
+ "result_file_path": "Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_results_20260110_132054.json"
112
+ }
eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_results_20260110_132358.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Qwen3-VL (8B-Thinking) with BM25 Search Tool",
3
+ "organization": "Alibaba Group",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "open-weight"
14
+ },
15
+ "submission_date": "2026-01-10T13:23:58.123387+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 46.623859964827616,
19
+ "anls": 45.43424080850834,
20
+ "page_f1": 47.685529789738204,
21
+ "doc_f1": 69.57247828991316,
22
+ "kuiper": 48.30060120240493
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 45.69138276553106,
26
+ "anls": 45.43424080850834,
27
+ "n": 499
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 40.0,
42
+ "anls": 36.91358024691358,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 59.09090909090909,
47
+ "anls": 55.55994729907773,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 58.333333333333336,
52
+ "anls": 54.598842018196855,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 29.347826086956523,
57
+ "anls": 29.932472094079802,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 37.5,
62
+ "anls": 51.55757767828245,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 43.61702127659575,
67
+ "anls": 44.439106365198185,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 46.34146341463415,
72
+ "anls": 50.16056789323261,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 53.48837209302325,
77
+ "anls": 44.799741602067186,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 40.0,
82
+ "anls": 49.070641025641024,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 58.333333333333336,
87
+ "anls": 59.60305559882987,
88
+ "n": 24
89
+ },
90
+ "Reference": {
91
+ "semantic": 44.230769230769226,
92
+ "anls": 50.44584246011934,
93
+ "n": 52
94
+ },
95
+ "Reports": {
96
+ "semantic": 54.0,
97
+ "anls": 50.000135852648256,
98
+ "n": 75
99
+ },
100
+ "Technical": {
101
+ "semantic": 52.17391304347826,
102
+ "anls": 39.32779159365883,
103
+ "n": 23
104
+ }
105
+ },
106
+ "n_evaluated": 499,
107
+ "n_unmatched": 1767
108
+ },
109
+ "reevaluated_date": "2026-01-15T19:58:05.119474+00:00",
110
+ "source_predictions_file": "Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_predictions_20260110_132358.jsonl",
111
+ "result_file_path": "Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_results_20260110_132358.json"
112
+ }
eval/reevaluated_results/Anthropic/Claude_Haiku_4.5_(2025-10-01)_results_20260109_130319.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Claude Haiku 4.5 (2025-10-01) with BM25 Search Tool",
3
+ "organization": "Anthropic",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T13:03:19.649656+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 66.9387755102041,
19
+ "anls": 61.60747574238133,
20
+ "page_f1": 72.02476190476192,
21
+ "doc_f1": 88.24761904761905,
22
+ "kuiper": 50.36144578313238
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 65.60000000000001,
26
+ "anls": 61.60747574238133,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 70.0,
42
+ "anls": 63.92691050779287,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 72.72727272727273,
47
+ "anls": 73.53318618140752,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 85.41666666666666,
52
+ "anls": 72.62325637325637,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 58.152173913043484,
57
+ "anls": 54.29593695395653,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 71.875,
62
+ "anls": 68.77016129032259,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 61.702127659574465,
67
+ "anls": 62.779826338896896,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 53.65853658536586,
72
+ "anls": 52.054645053208425,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 66.27906976744185,
77
+ "anls": 60.50249169435216,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 48.0,
82
+ "anls": 41.69842237151431,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 81.25,
87
+ "anls": 80.07172131147541,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 60.57692307692307,
97
+ "anls": 64.3484267705628,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 76.0,
102
+ "anls": 65.36479556179735,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 71.73913043478261,
107
+ "anls": 64.75817505570946,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T19:59:11.288336+00:00",
115
+ "source_predictions_file": "Anthropic/Claude_Haiku_4.5_(2025-10-01)_predictions_20260109_130319.jsonl",
116
+ "result_file_path": "Anthropic/Claude_Haiku_4.5_(2025-10-01)_results_20260109_130319.json"
117
+ }
eval/reevaluated_results/Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_002125.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Claude Sonnet 4.5 (2025-09-29) with BM25 Search Tool",
3
+ "organization": "Anthropic",
4
+ "description": "",
5
+ "link": null,
6
+ "tags": [
7
+ "Agentic"
8
+ ],
9
+ "submitted_by": null,
10
+ "metadata": {
11
+ "model_type": "unknown"
12
+ },
13
+ "submission_date": null,
14
+ "results": {
15
+ "overall": {
16
+ "semantic": 76.83673469387756,
17
+ "anls": 71.84394202116125,
18
+ "page_f1": 79.30920634920635,
19
+ "doc_f1": 92.87777777777778,
20
+ "kuiper": 45.31237322515194
21
+ },
22
+ "single_evidence": {
23
+ "semantic": 75.3,
24
+ "anls": 71.84394202116125,
25
+ "n": 500
26
+ },
27
+ "multi_evidence_same_doc": {
28
+ "semantic": 0,
29
+ "anls": 0,
30
+ "n": 0
31
+ },
32
+ "multi_evidence_multi_doc": {
33
+ "semantic": 0,
34
+ "anls": 0,
35
+ "n": 0
36
+ },
37
+ "by_domain": {
38
+ "Cases/Logs": {
39
+ "semantic": 76.66666666666667,
40
+ "anls": 69.17913105413105,
41
+ "n": 15
42
+ },
43
+ "Education": {
44
+ "semantic": 84.0909090909091,
45
+ "anls": 78.67387882210018,
46
+ "n": 22
47
+ },
48
+ "Events": {
49
+ "semantic": 87.5,
50
+ "anls": 78.43471847184719,
51
+ "n": 24
52
+ },
53
+ "Financial": {
54
+ "semantic": 67.93478260869566,
55
+ "anls": 67.53424957183847,
56
+ "n": 92
57
+ },
58
+ "Financial/Tax": {
59
+ "semantic": 75.0,
60
+ "anls": 79.76190476190477,
61
+ "n": 16
62
+ },
63
+ "Government/Regulatory": {
64
+ "semantic": 81.91489361702128,
65
+ "anls": 76.76053472269231,
66
+ "n": 47
67
+ },
68
+ "HR/Employment": {
69
+ "semantic": 76.82926829268293,
70
+ "anls": 71.16619587453502,
71
+ "n": 41
72
+ },
73
+ "Legal": {
74
+ "semantic": 75.5813953488372,
75
+ "anls": 63.583816672634086,
76
+ "n": 43
77
+ },
78
+ "Media/Publishing": {
79
+ "semantic": 48.0,
80
+ "anls": 56.169284632785775,
81
+ "n": 25
82
+ },
83
+ "Misc": {
84
+ "semantic": 89.58333333333334,
85
+ "anls": 83.87132448607858,
86
+ "n": 24
87
+ },
88
+ "Other": {
89
+ "semantic": 0.0,
90
+ "anls": 0.0,
91
+ "n": 1
92
+ },
93
+ "Reference": {
94
+ "semantic": 71.15384615384616,
95
+ "anls": 80.42617278480002,
96
+ "n": 52
97
+ },
98
+ "Reports": {
99
+ "semantic": 82.66666666666667,
100
+ "anls": 74.25747226815201,
101
+ "n": 75
102
+ },
103
+ "Technical": {
104
+ "semantic": 69.56521739130434,
105
+ "anls": 58.84371488722767,
106
+ "n": 23
107
+ }
108
+ },
109
+ "n_evaluated": 500,
110
+ "n_unmatched": 1811
111
+ },
112
+ "reevaluated_date": "2026-01-15T20:00:06.481610+00:00",
113
+ "source_predictions_file": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_predictions_20260109_002125.jsonl",
114
+ "result_file_path": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_002125.json"
115
+ }
eval/reevaluated_results/Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_125816.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Claude Sonnet 4.5 (2025-09-29) with BM25 Search Tool",
3
+ "organization": "Anthropic",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T12:58:16.611348+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 79.08163265306122,
19
+ "anls": 71.74787642305597,
20
+ "page_f1": 79.12333333333333,
21
+ "doc_f1": 92.98636363636363,
22
+ "kuiper": 36.338056680162076
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 77.5,
26
+ "anls": 71.74787642305597,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 76.66666666666667,
42
+ "anls": 69.51092117758785,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 81.81818181818183,
47
+ "anls": 78.15439830261965,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 91.66666666666666,
52
+ "anls": 78.43471847184719,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 70.1086956521739,
57
+ "anls": 66.81148919563769,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 75.0,
62
+ "anls": 76.26728110599078,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 86.17021276595744,
67
+ "anls": 74.90457714355891,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 75.60975609756098,
72
+ "anls": 72.85160396238213,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 88.37209302325581,
77
+ "anls": 72.74221043114129,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 56.00000000000001,
82
+ "anls": 60.75987316199324,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 89.58333333333334,
87
+ "anls": 83.89482072668008,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 69.23076923076923,
97
+ "anls": 72.21619612753193,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 83.33333333333334,
102
+ "anls": 74.0536995032274,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 69.56521739130434,
107
+ "anls": 60.23577215564363,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:01:02.709110+00:00",
115
+ "source_predictions_file": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_predictions_20260109_125816.jsonl",
116
+ "result_file_path": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_125816.json"
117
+ }
eval/reevaluated_results/Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_003320.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Gemini 2.5 Flash with BM25 Search Tool",
3
+ "organization": "Google",
4
+ "description": "",
5
+ "link": null,
6
+ "tags": [
7
+ "Agentic"
8
+ ],
9
+ "submitted_by": null,
10
+ "metadata": {
11
+ "model_type": "unknown"
12
+ },
13
+ "submission_date": null,
14
+ "results": {
15
+ "overall": {
16
+ "semantic": 57.34693877551022,
17
+ "anls": 52.71594015359682,
18
+ "page_f1": 59.910952380952374,
19
+ "doc_f1": 76.47380952380952,
20
+ "kuiper": 40.01599999999986
21
+ },
22
+ "single_evidence": {
23
+ "semantic": 56.2,
24
+ "anls": 52.71594015359682,
25
+ "n": 500
26
+ },
27
+ "multi_evidence_same_doc": {
28
+ "semantic": 0,
29
+ "anls": 0,
30
+ "n": 0
31
+ },
32
+ "multi_evidence_multi_doc": {
33
+ "semantic": 0,
34
+ "anls": 0,
35
+ "n": 0
36
+ },
37
+ "by_domain": {
38
+ "Cases/Logs": {
39
+ "semantic": 73.33333333333333,
40
+ "anls": 63.64672364672364,
41
+ "n": 15
42
+ },
43
+ "Education": {
44
+ "semantic": 77.27272727272727,
45
+ "anls": 68.27344592166726,
46
+ "n": 22
47
+ },
48
+ "Events": {
49
+ "semantic": 72.91666666666666,
50
+ "anls": 67.7894121245185,
51
+ "n": 24
52
+ },
53
+ "Financial": {
54
+ "semantic": 41.30434782608695,
55
+ "anls": 42.928812913087185,
56
+ "n": 92
57
+ },
58
+ "Financial/Tax": {
59
+ "semantic": 43.75,
60
+ "anls": 44.89996898263027,
61
+ "n": 16
62
+ },
63
+ "Government/Regulatory": {
64
+ "semantic": 63.829787234042556,
65
+ "anls": 62.64704717952198,
66
+ "n": 47
67
+ },
68
+ "HR/Employment": {
69
+ "semantic": 56.09756097560976,
70
+ "anls": 49.2663477551747,
71
+ "n": 41
72
+ },
73
+ "Legal": {
74
+ "semantic": 55.81395348837209,
75
+ "anls": 47.23635639486312,
76
+ "n": 43
77
+ },
78
+ "Media/Publishing": {
79
+ "semantic": 52.0,
80
+ "anls": 46.0,
81
+ "n": 25
82
+ },
83
+ "Misc": {
84
+ "semantic": 66.66666666666666,
85
+ "anls": 68.74370865688812,
86
+ "n": 24
87
+ },
88
+ "Other": {
89
+ "semantic": 0.0,
90
+ "anls": 0.0,
91
+ "n": 1
92
+ },
93
+ "Reference": {
94
+ "semantic": 61.53846153846154,
95
+ "anls": 64.74843671979198,
96
+ "n": 52
97
+ },
98
+ "Reports": {
99
+ "semantic": 53.333333333333336,
100
+ "anls": 46.187273968786066,
101
+ "n": 75
102
+ },
103
+ "Technical": {
104
+ "semantic": 54.347826086956516,
105
+ "anls": 42.61518103800272,
106
+ "n": 23
107
+ }
108
+ },
109
+ "n_evaluated": 500,
110
+ "n_unmatched": 1811
111
+ },
112
+ "reevaluated_date": "2026-01-15T20:02:15.855307+00:00",
113
+ "source_predictions_file": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_predictions_20260109_003320.jsonl",
114
+ "result_file_path": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_003320.json"
115
+ }
eval/reevaluated_results/Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_182559.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Gemini 2.5 Flash with BM25 Search Tool",
3
+ "organization": "Google",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T18:25:59.636344+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 58.46938775510204,
19
+ "anls": 55.486869478144165,
20
+ "page_f1": 60.9663492063492,
21
+ "doc_f1": 78.82920634920634,
22
+ "kuiper": 45.08800000000012
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 57.3,
26
+ "anls": 55.486869478144165,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 73.33333333333333,
42
+ "anls": 71.7948717948718,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 77.27272727272727,
47
+ "anls": 72.81890046712182,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 85.41666666666666,
52
+ "anls": 76.85643564356435,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 40.76086956521739,
57
+ "anls": 40.952902757926644,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 50.0,
62
+ "anls": 52.31036324786324,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 64.8936170212766,
67
+ "anls": 67.70262933196864,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 58.536585365853654,
72
+ "anls": 60.95035529628296,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 54.65116279069767,
77
+ "anls": 51.45105745077384,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 52.0,
82
+ "anls": 54.40739778239778,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 77.08333333333334,
87
+ "anls": 73.82172131147541,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 61.53846153846154,
97
+ "anls": 64.46714691613596,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 53.333333333333336,
102
+ "anls": 45.47473759975617,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 47.82608695652174,
107
+ "anls": 35.96181299748582,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:03:19.966069+00:00",
115
+ "source_predictions_file": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_predictions_20260109_182559.jsonl",
116
+ "result_file_path": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_182559.json"
117
+ }
eval/reevaluated_results/Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_005202.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Gemini 2.5 Pro with BM25 Search Tool",
3
+ "organization": "Google",
4
+ "description": "",
5
+ "link": null,
6
+ "tags": [
7
+ "Agentic"
8
+ ],
9
+ "submitted_by": null,
10
+ "metadata": {
11
+ "model_type": "unknown"
12
+ },
13
+ "submission_date": null,
14
+ "results": {
15
+ "overall": {
16
+ "semantic": 59.6938775510204,
17
+ "anls": 56.04493493183149,
18
+ "page_f1": 61.64985569985569,
19
+ "doc_f1": 74.58080808080808,
20
+ "kuiper": 28.047999999999792
21
+ },
22
+ "single_evidence": {
23
+ "semantic": 58.5,
24
+ "anls": 56.04493493183149,
25
+ "n": 500
26
+ },
27
+ "multi_evidence_same_doc": {
28
+ "semantic": 0,
29
+ "anls": 0,
30
+ "n": 0
31
+ },
32
+ "multi_evidence_multi_doc": {
33
+ "semantic": 0,
34
+ "anls": 0,
35
+ "n": 0
36
+ },
37
+ "by_domain": {
38
+ "Cases/Logs": {
39
+ "semantic": 73.33333333333333,
40
+ "anls": 63.64672364672364,
41
+ "n": 15
42
+ },
43
+ "Education": {
44
+ "semantic": 81.81818181818183,
45
+ "anls": 72.3102424584638,
46
+ "n": 22
47
+ },
48
+ "Events": {
49
+ "semantic": 64.58333333333334,
50
+ "anls": 60.78335195270679,
51
+ "n": 24
52
+ },
53
+ "Financial": {
54
+ "semantic": 40.21739130434783,
55
+ "anls": 43.86116897464483,
56
+ "n": 92
57
+ },
58
+ "Financial/Tax": {
59
+ "semantic": 56.25,
60
+ "anls": 61.754807692307686,
61
+ "n": 16
62
+ },
63
+ "Government/Regulatory": {
64
+ "semantic": 68.08510638297872,
65
+ "anls": 64.75420262164383,
66
+ "n": 47
67
+ },
68
+ "HR/Employment": {
69
+ "semantic": 64.63414634146342,
70
+ "anls": 52.864704856399555,
71
+ "n": 41
72
+ },
73
+ "Legal": {
74
+ "semantic": 60.46511627906976,
75
+ "anls": 51.79586563307493,
76
+ "n": 43
77
+ },
78
+ "Media/Publishing": {
79
+ "semantic": 52.0,
80
+ "anls": 53.47808414475082,
81
+ "n": 25
82
+ },
83
+ "Misc": {
84
+ "semantic": 64.58333333333334,
85
+ "anls": 66.18283242258653,
86
+ "n": 24
87
+ },
88
+ "Other": {
89
+ "semantic": 0.0,
90
+ "anls": 0.0,
91
+ "n": 1
92
+ },
93
+ "Reference": {
94
+ "semantic": 61.53846153846154,
95
+ "anls": 67.82233913890543,
96
+ "n": 52
97
+ },
98
+ "Reports": {
99
+ "semantic": 56.666666666666664,
100
+ "anls": 53.53357754327678,
101
+ "n": 75
102
+ },
103
+ "Technical": {
104
+ "semantic": 63.04347826086957,
105
+ "anls": 47.37363844512718,
106
+ "n": 23
107
+ }
108
+ },
109
+ "n_evaluated": 500,
110
+ "n_unmatched": 1811
111
+ },
112
+ "reevaluated_date": "2026-01-15T20:04:22.366647+00:00",
113
+ "source_predictions_file": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_predictions_20260109_005202.jsonl",
114
+ "result_file_path": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_005202.json"
115
+ }
eval/reevaluated_results/Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_183030.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Gemini 2.5 Pro with BM25 Search Tool",
3
+ "organization": "Google",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T18:30:30.608183+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 59.6938775510204,
19
+ "anls": 55.97919862778078,
20
+ "page_f1": 60.299220779220775,
21
+ "doc_f1": 74.23636363636363,
22
+ "kuiper": 38.90600000000025
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 58.5,
26
+ "anls": 55.97919862778078,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 66.66666666666666,
42
+ "anls": 56.98005698005698,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 72.72727272727273,
47
+ "anls": 66.75468690290825,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 66.66666666666666,
52
+ "anls": 62.67819322254806,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 47.82608695652174,
57
+ "anls": 48.11929370300614,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 50.0,
62
+ "anls": 46.96314102564102,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 65.95744680851064,
67
+ "anls": 64.23333377770668,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 56.09756097560976,
72
+ "anls": 48.92979153124233,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 63.95348837209303,
77
+ "anls": 60.44220952048519,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 48.0,
82
+ "anls": 52.95641025641026,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 64.58333333333334,
87
+ "anls": 70.43036975102193,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 63.46153846153846,
97
+ "anls": 63.5134680018838,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 60.0,
102
+ "anls": 54.81415365192609,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 50.0,
107
+ "anls": 40.50127359810298,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:06:43.674600+00:00",
115
+ "source_predictions_file": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_predictions_20260109_183030.jsonl",
116
+ "result_file_path": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_183030.json"
117
+ }
eval/reevaluated_results/Google/Gemini_3_Pro_(Preview)_with_BM25_Search_Tool_results_20260109_002711.json ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Gemini 3 Pro (Preview) with BM25 Search Tool",
3
+ "organization": "Google",
4
+ "description": "",
5
+ "link": null,
6
+ "tags": [
7
+ "Agentic"
8
+ ],
9
+ "submitted_by": null,
10
+ "metadata": {
11
+ "model_type": "unknown"
12
+ },
13
+ "submission_date": null,
14
+ "results": {
15
+ "overall": {
16
+ "semantic": 84.8636047605415,
17
+ "anls": 78.46249371016062,
18
+ "page_f1": 80.40956371617695,
19
+ "doc_f1": 91.83908943427981,
20
+ "kuiper": 27.13226452905815
21
+ },
22
+ "single_evidence": {
23
+ "semantic": 83.16633266533067,
24
+ "anls": 78.46249371016062,
25
+ "n": 499
26
+ },
27
+ "multi_evidence_same_doc": {
28
+ "semantic": 0,
29
+ "anls": 0,
30
+ "n": 0
31
+ },
32
+ "multi_evidence_multi_doc": {
33
+ "semantic": 0,
34
+ "anls": 0,
35
+ "n": 0
36
+ },
37
+ "by_domain": {
38
+ "Cases/Logs": {
39
+ "semantic": 83.33333333333334,
40
+ "anls": 75.31339031339031,
41
+ "n": 15
42
+ },
43
+ "Education": {
44
+ "semantic": 86.36363636363636,
45
+ "anls": 74.02302243211334,
46
+ "n": 22
47
+ },
48
+ "Events": {
49
+ "semantic": 83.33333333333334,
50
+ "anls": 77.06645664566456,
51
+ "n": 24
52
+ },
53
+ "Financial": {
54
+ "semantic": 72.28260869565217,
55
+ "anls": 69.36362154126739,
56
+ "n": 92
57
+ },
58
+ "Financial/Tax": {
59
+ "semantic": 81.25,
60
+ "anls": 80.57571684587813,
61
+ "n": 16
62
+ },
63
+ "Government/Regulatory": {
64
+ "semantic": 87.2340425531915,
65
+ "anls": 82.3254828677961,
66
+ "n": 47
67
+ },
68
+ "HR/Employment": {
69
+ "semantic": 86.58536585365853,
70
+ "anls": 79.69007037401929,
71
+ "n": 41
72
+ },
73
+ "Legal": {
74
+ "semantic": 93.02325581395348,
75
+ "anls": 85.19782543038357,
76
+ "n": 43
77
+ },
78
+ "Media/Publishing": {
79
+ "semantic": 74.0,
80
+ "anls": 79.1167050771702,
81
+ "n": 25
82
+ },
83
+ "Misc": {
84
+ "semantic": 93.75,
85
+ "anls": 91.86959699974574,
86
+ "n": 24
87
+ },
88
+ "Reference": {
89
+ "semantic": 88.46153846153845,
90
+ "anls": 87.98053049887939,
91
+ "n": 52
92
+ },
93
+ "Reports": {
94
+ "semantic": 84.66666666666667,
95
+ "anls": 78.78023745578506,
96
+ "n": 75
97
+ },
98
+ "Technical": {
99
+ "semantic": 73.91304347826086,
100
+ "anls": 61.21421646346686,
101
+ "n": 23
102
+ }
103
+ },
104
+ "n_evaluated": 499,
105
+ "n_unmatched": 1767
106
+ },
107
+ "reevaluated_date": "2026-01-15T20:07:35.074484+00:00",
108
+ "source_predictions_file": "Google/Gemini_3_Pro_(Preview)_with_BM25_Search_Tool_predictions_20260109_002711.jsonl",
109
+ "result_file_path": "Google/Gemini_3_Pro_(Preview)_with_BM25_Search_Tool_results_20260109_002711.json"
110
+ }
eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_234108.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Human with BM25 Search Tool",
3
+ "organization": "Humanity",
4
+ "description": "",
5
+ "link": null,
6
+ "tags": [
7
+ "Agentic"
8
+ ],
9
+ "submitted_by": null,
10
+ "metadata": {
11
+ "model_type": "unknown"
12
+ },
13
+ "submission_date": null,
14
+ "results": {
15
+ "overall": {
16
+ "semantic": 76.53061224489795,
17
+ "anls": 76.89084569144522,
18
+ "page_f1": 74.27484848484849,
19
+ "doc_f1": 87.1077922077922,
20
+ "kuiper": 6.584782608695652
21
+ },
22
+ "single_evidence": {
23
+ "semantic": 75.0,
24
+ "anls": 76.89084569144522,
25
+ "n": 500
26
+ },
27
+ "multi_evidence_same_doc": {
28
+ "semantic": 0,
29
+ "anls": 0,
30
+ "n": 0
31
+ },
32
+ "multi_evidence_multi_doc": {
33
+ "semantic": 0,
34
+ "anls": 0,
35
+ "n": 0
36
+ },
37
+ "by_domain": {
38
+ "Cases/Logs": {
39
+ "semantic": 66.66666666666666,
40
+ "anls": 72.72727272727272,
41
+ "n": 15
42
+ },
43
+ "Education": {
44
+ "semantic": 95.45454545454545,
45
+ "anls": 88.2664724057374,
46
+ "n": 22
47
+ },
48
+ "Events": {
49
+ "semantic": 83.33333333333334,
50
+ "anls": 81.58602150537635,
51
+ "n": 24
52
+ },
53
+ "Financial": {
54
+ "semantic": 72.28260869565217,
55
+ "anls": 72.19996863726435,
56
+ "n": 92
57
+ },
58
+ "Financial/Tax": {
59
+ "semantic": 68.75,
60
+ "anls": 65.13888888888889,
61
+ "n": 16
62
+ },
63
+ "Government/Regulatory": {
64
+ "semantic": 77.6595744680851,
65
+ "anls": 80.70180867592104,
66
+ "n": 47
67
+ },
68
+ "HR/Employment": {
69
+ "semantic": 82.92682926829268,
70
+ "anls": 78.22470188707081,
71
+ "n": 41
72
+ },
73
+ "Legal": {
74
+ "semantic": 77.90697674418605,
75
+ "anls": 78.11361119500656,
76
+ "n": 43
77
+ },
78
+ "Media/Publishing": {
79
+ "semantic": 62.0,
80
+ "anls": 69.07251951242394,
81
+ "n": 25
82
+ },
83
+ "Misc": {
84
+ "semantic": 68.75,
85
+ "anls": 71.50538359217717,
86
+ "n": 24
87
+ },
88
+ "Other": {
89
+ "semantic": 0.0,
90
+ "anls": 0.0,
91
+ "n": 1
92
+ },
93
+ "Reference": {
94
+ "semantic": 77.88461538461539,
95
+ "anls": 87.90759949333756,
96
+ "n": 52
97
+ },
98
+ "Reports": {
99
+ "semantic": 70.0,
100
+ "anls": 75.95612610368379,
101
+ "n": 75
102
+ },
103
+ "Technical": {
104
+ "semantic": 76.08695652173914,
105
+ "anls": 73.91467899702451,
106
+ "n": 23
107
+ }
108
+ },
109
+ "n_evaluated": 500,
110
+ "n_unmatched": 0
111
+ },
112
+ "reevaluated_date": "2026-01-15T20:08:22.133765+00:00",
113
+ "source_predictions_file": "Humanity/Human_with_BM25_Search_Tool_predictions_20260109_234108.jsonl",
114
+ "result_file_path": "Humanity/Human_with_BM25_Search_Tool_results_20260109_234108.json"
115
+ }
eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_235325.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Human with BM25 Search Tool",
3
+ "organization": "Humanity",
4
+ "description": "",
5
+ "link": null,
6
+ "tags": [
7
+ "Agentic"
8
+ ],
9
+ "submitted_by": null,
10
+ "metadata": {
11
+ "model_type": "unknown"
12
+ },
13
+ "submission_date": null,
14
+ "results": {
15
+ "overall": {
16
+ "semantic": 80.3061224489796,
17
+ "anls": 81.0431479932892,
18
+ "page_f1": 77.30151515151516,
19
+ "doc_f1": 90.80112554112554,
20
+ "kuiper": 7.623700623700628
21
+ },
22
+ "single_evidence": {
23
+ "semantic": 78.7,
24
+ "anls": 81.0431479932892,
25
+ "n": 500
26
+ },
27
+ "multi_evidence_same_doc": {
28
+ "semantic": 0,
29
+ "anls": 0,
30
+ "n": 0
31
+ },
32
+ "multi_evidence_multi_doc": {
33
+ "semantic": 0,
34
+ "anls": 0,
35
+ "n": 0
36
+ },
37
+ "by_domain": {
38
+ "Cases/Logs": {
39
+ "semantic": 66.66666666666666,
40
+ "anls": 72.72727272727272,
41
+ "n": 15
42
+ },
43
+ "Education": {
44
+ "semantic": 95.45454545454545,
45
+ "anls": 88.2664724057374,
46
+ "n": 22
47
+ },
48
+ "Events": {
49
+ "semantic": 81.25,
50
+ "anls": 79.50268817204301,
51
+ "n": 24
52
+ },
53
+ "Financial": {
54
+ "semantic": 77.17391304347827,
55
+ "anls": 77.5668164633513,
56
+ "n": 92
57
+ },
58
+ "Financial/Tax": {
59
+ "semantic": 81.25,
60
+ "anls": 77.63888888888889,
61
+ "n": 16
62
+ },
63
+ "Government/Regulatory": {
64
+ "semantic": 80.85106382978722,
65
+ "anls": 84.95712782485721,
66
+ "n": 47
67
+ },
68
+ "HR/Employment": {
69
+ "semantic": 81.70731707317073,
70
+ "anls": 79.43814685734138,
71
+ "n": 41
72
+ },
73
+ "Legal": {
74
+ "semantic": 89.53488372093024,
75
+ "anls": 85.09035538105306,
76
+ "n": 43
77
+ },
78
+ "Media/Publishing": {
79
+ "semantic": 70.0,
80
+ "anls": 75.91696395686839,
81
+ "n": 25
82
+ },
83
+ "Misc": {
84
+ "semantic": 72.91666666666666,
85
+ "anls": 78.44982803662161,
86
+ "n": 24
87
+ },
88
+ "Other": {
89
+ "semantic": 0.0,
90
+ "anls": 0.0,
91
+ "n": 1
92
+ },
93
+ "Reference": {
94
+ "semantic": 78.84615384615384,
95
+ "anls": 89.83067641641446,
96
+ "n": 52
97
+ },
98
+ "Reports": {
99
+ "semantic": 70.66666666666667,
100
+ "anls": 78.94025308781077,
101
+ "n": 75
102
+ },
103
+ "Technical": {
104
+ "semantic": 86.95652173913044,
105
+ "anls": 83.52609662978936,
106
+ "n": 23
107
+ }
108
+ },
109
+ "n_evaluated": 500,
110
+ "n_unmatched": 0
111
+ },
112
+ "reevaluated_date": "2026-01-15T20:09:11.287271+00:00",
113
+ "source_predictions_file": "Humanity/Human_with_BM25_Search_Tool_predictions_20260109_235325.jsonl",
114
+ "result_file_path": "Humanity/Human_with_BM25_Search_Tool_results_20260109_235325.json"
115
+ }
eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_235724.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Human with BM25 Search Tool",
3
+ "organization": "Humanity",
4
+ "description": "Human equipped with the same search engine as agentic baselines.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Vision and Language",
9
+ "Sparse Search Tool"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "open-weight"
14
+ },
15
+ "submission_date": "2026-01-09T23:57:24.249882+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 81.02040816326532,
19
+ "anls": 82.43662306660298,
20
+ "page_f1": 78.83484848484848,
21
+ "doc_f1": 92.80112554112554,
22
+ "kuiper": 8.217922606924656
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 79.4,
26
+ "anls": 82.43662306660298,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 66.66666666666666,
42
+ "anls": 72.72727272727272,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 90.9090909090909,
47
+ "anls": 88.2664724057374,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 81.25,
52
+ "anls": 79.50268817204301,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 76.08695652173914,
57
+ "anls": 77.5668164633513,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 81.25,
62
+ "anls": 82.47759856630825,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 80.85106382978722,
67
+ "anls": 84.95712782485721,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 84.14634146341463,
72
+ "anls": 81.26741515002432,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 89.53488372093024,
77
+ "anls": 85.09035538105306,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 70.0,
82
+ "anls": 79.91696395686839,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 72.91666666666666,
87
+ "anls": 78.44982803662161,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 79.8076923076923,
97
+ "anls": 91.40410298984105,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 76.0,
102
+ "anls": 83.7735864211441,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 86.95652173913044,
107
+ "anls": 83.52609662978936,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:09:56.056259+00:00",
115
+ "source_predictions_file": "Humanity/Human_with_BM25_Search_Tool_predictions_20260109_235724.jsonl",
116
+ "result_file_path": "Humanity/Human_with_BM25_Search_Tool_results_20260109_235724.json"
117
+ }
eval/reevaluated_results/OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153221.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-4.1 (2025-04-14) with BM25 Search Tool",
3
+ "organization": "OpenAI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T15:32:21.908816+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 58.571428571428555,
19
+ "anls": 53.29254644474454,
20
+ "page_f1": 64.14190476190477,
21
+ "doc_f1": 82.82666666666667,
22
+ "kuiper": 43.93199999999983
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 57.4,
26
+ "anls": 53.29254644474454,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 53.333333333333336,
42
+ "anls": 48.59180666077218,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 52.27272727272727,
47
+ "anls": 48.04545454545455,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 68.75,
52
+ "anls": 67.55050505050505,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 40.76086956521739,
57
+ "anls": 43.62404525327831,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 71.875,
62
+ "anls": 64.58333333333334,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 55.319148936170215,
67
+ "anls": 51.52629513848961,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 70.73170731707317,
72
+ "anls": 55.117501174925685,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 58.139534883720934,
77
+ "anls": 55.94315245478037,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 40.0,
82
+ "anls": 54.188065268065266,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 77.08333333333334,
87
+ "anls": 69.51844262295083,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 62.5,
97
+ "anls": 60.011945621794936,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 60.66666666666667,
102
+ "anls": 47.26331129213486,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 67.3913043478261,
107
+ "anls": 61.60068502092203,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:11:50.993374+00:00",
115
+ "source_predictions_file": "OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153221.jsonl",
116
+ "result_file_path": "OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153221.json"
117
+ }
eval/reevaluated_results/OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153812.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-4.1 Nano (2025-04-14) with BM25 Search Tool",
3
+ "organization": "OpenAI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T15:38:12.353112+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 19.18367346938775,
19
+ "anls": 19.21201395702391,
20
+ "page_f1": 27.60809523809524,
21
+ "doc_f1": 40.18095238095238,
22
+ "kuiper": 27.656000000000265
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 18.8,
26
+ "anls": 19.21201395702391,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 13.333333333333334,
42
+ "anls": 12.5,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 36.36363636363637,
47
+ "anls": 33.85540184453228,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 25.0,
52
+ "anls": 24.252897639994416,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 15.217391304347828,
57
+ "anls": 15.744375438721086,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 0.0,
62
+ "anls": 3.125,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 20.212765957446805,
67
+ "anls": 18.040407652422672,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 17.073170731707318,
72
+ "anls": 17.049790482898338,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 23.25581395348837,
77
+ "anls": 20.54263565891473,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 4.0,
82
+ "anls": 13.666666666666666,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 25.0,
87
+ "anls": 28.843503294839174,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 16.346153846153847,
97
+ "anls": 20.3827772417516,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 23.333333333333332,
102
+ "anls": 19.284216647617285,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 19.565217391304348,
107
+ "anls": 27.075249588209658,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:12:41.288382+00:00",
115
+ "source_predictions_file": "OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153812.jsonl",
116
+ "result_file_path": "OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153812.json"
117
+ }
eval/reevaluated_results/OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_results_20260109_151912.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-5.2 (2025-12-11) with BM25 Search Tool",
3
+ "organization": "OpenAI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images. GPT-5.2 exhibits more conservative behavior than GPT-5, refusing to provide an answer when uncertain.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T15:19:12.016451+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 66.22448979591837,
19
+ "anls": 57.28438090955278,
20
+ "page_f1": 67.62380952380951,
21
+ "doc_f1": 83.72666666666666,
22
+ "kuiper": 62.57199999999988
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 64.9,
26
+ "anls": 57.28438090955278,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 73.33333333333333,
42
+ "anls": 58.46153846153847,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 77.27272727272727,
47
+ "anls": 59.00137741046832,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 72.91666666666666,
52
+ "anls": 57.55050505050505,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 53.2608695652174,
57
+ "anls": 49.679264550051975,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 65.625,
62
+ "anls": 61.08221187025536,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 67.02127659574468,
67
+ "anls": 58.551919442177,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 60.97560975609756,
72
+ "anls": 44.265703074651974,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 77.90697674418605,
77
+ "anls": 66.19399979865096,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 38.0,
82
+ "anls": 35.05751747729549,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 83.33333333333334,
87
+ "anls": 82.5164707977208,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 72.11538461538461,
97
+ "anls": 67.62508443509842,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 64.66666666666666,
102
+ "anls": 59.65381728416852,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 60.86956521739131,
107
+ "anls": 55.55075090789312,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:14:52.407712+00:00",
115
+ "source_predictions_file": "OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_predictions_20260109_151912.jsonl",
116
+ "result_file_path": "OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_results_20260109_151912.json"
117
+ }
eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152104.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-5 (2025-08-07) with BM25 Search Tool",
3
+ "organization": "OpenAI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T15:21:04.336083+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 76.02040816326532,
19
+ "anls": 70.03817583122695,
20
+ "page_f1": 74.16285714285713,
21
+ "doc_f1": 86.45064935064934,
22
+ "kuiper": 52.256000000000114
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 74.5,
26
+ "anls": 70.03817583122695,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 66.66666666666666,
42
+ "anls": 62.757834757834765,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 68.18181818181817,
47
+ "anls": 63.54683195592287,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 83.33333333333334,
52
+ "anls": 78.3838383838384,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 65.21739130434783,
57
+ "anls": 62.36899647186356,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 81.25,
62
+ "anls": 86.77496898263027,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 73.40425531914893,
67
+ "anls": 68.7671602173282,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 71.95121951219512,
72
+ "anls": 64.5688672367669,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 79.06976744186046,
77
+ "anls": 70.27143399236422,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 60.0,
82
+ "anls": 65.71897407160566,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 83.33333333333334,
87
+ "anls": 86.70405982905983,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 76.92307692307693,
97
+ "anls": 76.57306264232653,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 80.0,
102
+ "anls": 71.72139814224423,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 93.47826086956522,
107
+ "anls": 73.31752767476483,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:15:52.047010+00:00",
115
+ "source_predictions_file": "OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152104.jsonl",
116
+ "result_file_path": "OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152104.json"
117
+ }
eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_File_Search_results_20260104_140537.json ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-5 (2025-08-07) with File Search",
3
+ "organization": "OpenAI",
4
+ "description": "Managed, single-shot retrieval mechanism.",
5
+ "link": "https://platform.openai.com/docs/guides/tools-file-search",
6
+ "tags": [
7
+ "Conventional RAG",
8
+ "Semantic Search Tool"
9
+ ],
10
+ "submitted_by": "Borchmann",
11
+ "metadata": {
12
+ "model_type": "api"
13
+ },
14
+ "submission_date": "2026-01-04T14:05:37.240829+00:00",
15
+ "results": {
16
+ "overall": {
17
+ "semantic": 48.061224489795926,
18
+ "anls": 44.84773268944071,
19
+ "page_f1": 29.277142857142856,
20
+ "doc_f1": 66.60666666666667,
21
+ "kuiper": 31.15400000000007
22
+ },
23
+ "single_evidence": {
24
+ "semantic": 47.099999999999994,
25
+ "anls": 44.84773268944071,
26
+ "n": 500
27
+ },
28
+ "multi_evidence_same_doc": {
29
+ "semantic": 0,
30
+ "anls": 0,
31
+ "n": 0
32
+ },
33
+ "multi_evidence_multi_doc": {
34
+ "semantic": 0,
35
+ "anls": 0,
36
+ "n": 0
37
+ },
38
+ "by_domain": {
39
+ "Cases/Logs": {
40
+ "semantic": 13.333333333333334,
41
+ "anls": 14.833333333333334,
42
+ "n": 15
43
+ },
44
+ "Education": {
45
+ "semantic": 79.54545454545455,
46
+ "anls": 63.871507280598195,
47
+ "n": 22
48
+ },
49
+ "Events": {
50
+ "semantic": 72.91666666666666,
51
+ "anls": 55.83149489399489,
52
+ "n": 24
53
+ },
54
+ "Financial": {
55
+ "semantic": 49.45652173913043,
56
+ "anls": 46.26513610007698,
57
+ "n": 92
58
+ },
59
+ "Financial/Tax": {
60
+ "semantic": 15.625,
61
+ "anls": 17.540322580645164,
62
+ "n": 16
63
+ },
64
+ "Government/Regulatory": {
65
+ "semantic": 45.744680851063826,
66
+ "anls": 41.75603723934328,
67
+ "n": 47
68
+ },
69
+ "HR/Employment": {
70
+ "semantic": 39.02439024390244,
71
+ "anls": 42.22238179140625,
72
+ "n": 41
73
+ },
74
+ "Legal": {
75
+ "semantic": 37.2093023255814,
76
+ "anls": 32.74308378959542,
77
+ "n": 43
78
+ },
79
+ "Media/Publishing": {
80
+ "semantic": 46.0,
81
+ "anls": 45.83167739167739,
82
+ "n": 25
83
+ },
84
+ "Misc": {
85
+ "semantic": 64.58333333333334,
86
+ "anls": 67.18447826857438,
87
+ "n": 24
88
+ },
89
+ "Other": {
90
+ "semantic": 0.0,
91
+ "anls": 0.0,
92
+ "n": 1
93
+ },
94
+ "Reference": {
95
+ "semantic": 39.42307692307692,
96
+ "anls": 40.48309244262362,
97
+ "n": 52
98
+ },
99
+ "Reports": {
100
+ "semantic": 46.666666666666664,
101
+ "anls": 46.80494177991155,
102
+ "n": 75
103
+ },
104
+ "Technical": {
105
+ "semantic": 63.04347826086957,
106
+ "anls": 62.77759844334801,
107
+ "n": 23
108
+ }
109
+ },
110
+ "n_evaluated": 500,
111
+ "n_unmatched": 0
112
+ },
113
+ "reevaluated_date": "2026-01-15T20:17:01.554804+00:00",
114
+ "source_predictions_file": "OpenAI/GPT-5_(2025-08-07)_with_File_Search_predictions_20260104_140537.jsonl",
115
+ "result_file_path": "OpenAI/GPT-5_(2025-08-07)_with_File_Search_results_20260104_140537.json"
116
+ }
eval/reevaluated_results/OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152650.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-5 Mini (2025-08-07) with BM25 Search Tool",
3
+ "organization": "OpenAI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T15:26:50.820104+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 65.0,
19
+ "anls": 55.16542612989696,
20
+ "page_f1": 67.57095238095239,
21
+ "doc_f1": 82.35303030303031,
22
+ "kuiper": 71.86573146292572
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 63.7,
26
+ "anls": 55.16542612989696,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 66.66666666666666,
42
+ "anls": 57.16524216524217,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 68.18181818181817,
47
+ "anls": 63.349203497424845,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 62.5,
52
+ "anls": 53.63190419293608,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 47.28260869565217,
57
+ "anls": 43.770804881794874,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 46.875,
62
+ "anls": 39.15760869565217,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 71.27659574468085,
67
+ "anls": 62.856694438441366,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 60.97560975609756,
72
+ "anls": 51.21538014830698,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 75.5813953488372,
77
+ "anls": 62.31744836688789,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 56.00000000000001,
82
+ "anls": 39.93216037493774,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 70.83333333333334,
87
+ "anls": 63.35950315116982,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 76.92307692307693,
97
+ "anls": 73.02503210878088,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 66.66666666666666,
102
+ "anls": 54.869395530526155,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 67.3913043478261,
107
+ "anls": 53.29419750997293,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:18:52.086804+00:00",
115
+ "source_predictions_file": "OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152650.jsonl",
116
+ "result_file_path": "OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152650.json"
117
+ }
eval/reevaluated_results/OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152828.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-5 Nano (2025-08-07) with BM25 Search Tool",
3
+ "organization": "OpenAI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T15:28:28.366309+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 56.6326530612245,
19
+ "anls": 52.255247982009955,
20
+ "page_f1": 60.877142857142864,
21
+ "doc_f1": 82.2030303030303,
22
+ "kuiper": 47.40000000000003
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 55.50000000000001,
26
+ "anls": 52.255247982009955,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 53.333333333333336,
42
+ "anls": 53.461538461538474,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 63.63636363636363,
47
+ "anls": 54.95375836284927,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 62.5,
52
+ "anls": 51.78930433365917,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 40.21739130434783,
57
+ "anls": 40.14762316798784,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 68.75,
62
+ "anls": 69.68257767828244,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 64.8936170212766,
67
+ "anls": 56.496054764723326,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 52.4390243902439,
72
+ "anls": 42.85858107680723,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 58.139534883720934,
77
+ "anls": 55.28314708547266,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 50.0,
82
+ "anls": 51.784085491742935,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 70.83333333333334,
87
+ "anls": 74.53137140637142,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 57.692307692307686,
97
+ "anls": 61.940508414693205,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 54.666666666666664,
102
+ "anls": 48.18660787855504,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 65.21739130434783,
107
+ "anls": 59.014067370235345,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:19:54.021229+00:00",
115
+ "source_predictions_file": "OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152828.jsonl",
116
+ "result_file_path": "OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152828.json"
117
+ }
eval/reevaluated_results/OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153516.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT 4.1 Mini (2025-04-14) with BM25 Search Tool",
3
+ "organization": "OpenAI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T15:35:16.458002+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 51.22448979591837,
19
+ "anls": 46.26708858125157,
20
+ "page_f1": 59.905054945054935,
21
+ "doc_f1": 77.61731601731601,
22
+ "kuiper": 40.01224489795946
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 50.2,
26
+ "anls": 46.26708858125157,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 43.333333333333336,
42
+ "anls": 39.64209401709402,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 59.09090909090909,
47
+ "anls": 48.57647622469757,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 56.25,
52
+ "anls": 53.83018770627063,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 35.869565217391305,
57
+ "anls": 34.96285359224887,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 46.875,
62
+ "anls": 44.4215309712932,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 44.680851063829785,
67
+ "anls": 44.19583719868558,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 53.65853658536586,
72
+ "anls": 46.501429746354255,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 53.48837209302325,
77
+ "anls": 43.64210613408689,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 48.0,
82
+ "anls": 46.71106819031614,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 56.25,
87
+ "anls": 55.49877795634123,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 62.5,
97
+ "anls": 62.86510186138165,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 51.33333333333333,
102
+ "anls": 45.15164464860224,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 65.21739130434783,
107
+ "anls": 53.71736172158072,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:21:13.745638+00:00",
115
+ "source_predictions_file": "OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153516.jsonl",
116
+ "result_file_path": "OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153516.json"
117
+ }
eval/reevaluated_results/OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_results_20260109_185347.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Gemini 3 Pro with BM25 Search Tool",
3
+ "organization": "OpenAI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T18:53:47.189606+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 80.16032064128255,
19
+ "anls": 73.52101315170081,
20
+ "page_f1": 78.4607309857811,
21
+ "doc_f1": 90.20248288785363,
22
+ "kuiper": 26.781563126252323
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 78.55711422845691,
26
+ "anls": 73.52101315170081,
27
+ "n": 499
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 80.0,
42
+ "anls": 85.12820512820514,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 77.27272727272727,
47
+ "anls": 64.8800482891392,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 81.25,
52
+ "anls": 79.84423442344234,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 66.84782608695652,
57
+ "anls": 63.13552237747254,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 93.75,
62
+ "anls": 93.48332554153032,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 87.2340425531915,
67
+ "anls": 78.26722646935413,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 86.58536585365853,
72
+ "anls": 77.34609828919353,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 82.55813953488372,
77
+ "anls": 68.10496996543507,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 78.0,
82
+ "anls": 79.13892729939242,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 85.41666666666666,
87
+ "anls": 85.4921497584541,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 81.73076923076923,
97
+ "anls": 83.68517307852197,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 74.32432432432432,
102
+ "anls": 71.94584088751826,
103
+ "n": 74
104
+ },
105
+ "Technical": {
106
+ "semantic": 76.08695652173914,
107
+ "anls": 55.56822369489126,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 499,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:23:21.812681+00:00",
115
+ "source_predictions_file": "OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_predictions_20260109_185347.jsonl",
116
+ "result_file_path": "OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_results_20260109_185347.json"
117
+ }
eval/reevaluated_results/OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260107_113714.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-4o (2024-08-06) with HEAVEN Retrieval",
3
+ "organization": "OpenAI - KAIST",
4
+ "description": "",
5
+ "link": null,
6
+ "tags": [
7
+ "Agentic"
8
+ ],
9
+ "submitted_by": null,
10
+ "metadata": {
11
+ "model_type": "unknown"
12
+ },
13
+ "submission_date": null,
14
+ "results": {
15
+ "overall": {
16
+ "semantic": 43.469387755102034,
17
+ "anls": 40.039307087937075,
18
+ "page_f1": 43.05228327228327,
19
+ "doc_f1": 56.64095238095238,
20
+ "kuiper": null
21
+ },
22
+ "single_evidence": {
23
+ "semantic": 42.6,
24
+ "anls": 40.039307087937075,
25
+ "n": 500
26
+ },
27
+ "multi_evidence_same_doc": {
28
+ "semantic": 0,
29
+ "anls": 0,
30
+ "n": 0
31
+ },
32
+ "multi_evidence_multi_doc": {
33
+ "semantic": 0,
34
+ "anls": 0,
35
+ "n": 0
36
+ },
37
+ "by_domain": {
38
+ "Cases/Logs": {
39
+ "semantic": 46.666666666666664,
40
+ "anls": 46.75783475783476,
41
+ "n": 15
42
+ },
43
+ "Education": {
44
+ "semantic": 36.36363636363637,
45
+ "anls": 36.95054945054945,
46
+ "n": 22
47
+ },
48
+ "Events": {
49
+ "semantic": 43.75,
50
+ "anls": 38.03661616161616,
51
+ "n": 24
52
+ },
53
+ "Financial": {
54
+ "semantic": 42.934782608695656,
55
+ "anls": 42.52300514978308,
56
+ "n": 92
57
+ },
58
+ "Financial/Tax": {
59
+ "semantic": 31.25,
60
+ "anls": 31.922043010752688,
61
+ "n": 16
62
+ },
63
+ "Government/Regulatory": {
64
+ "semantic": 44.680851063829785,
65
+ "anls": 36.32965392203914,
66
+ "n": 47
67
+ },
68
+ "HR/Employment": {
69
+ "semantic": 39.02439024390244,
70
+ "anls": 33.06592985170988,
71
+ "n": 41
72
+ },
73
+ "Legal": {
74
+ "semantic": 41.86046511627907,
75
+ "anls": 33.1515319306017,
76
+ "n": 43
77
+ },
78
+ "Media/Publishing": {
79
+ "semantic": 20.0,
80
+ "anls": 31.078787878787878,
81
+ "n": 25
82
+ },
83
+ "Misc": {
84
+ "semantic": 52.083333333333336,
85
+ "anls": 51.80921052631579,
86
+ "n": 24
87
+ },
88
+ "Other": {
89
+ "semantic": 0.0,
90
+ "anls": 0.0,
91
+ "n": 1
92
+ },
93
+ "Reference": {
94
+ "semantic": 42.30769230769231,
95
+ "anls": 46.93060276608143,
96
+ "n": 52
97
+ },
98
+ "Reports": {
99
+ "semantic": 52.0,
100
+ "anls": 44.41148230399428,
101
+ "n": 75
102
+ },
103
+ "Technical": {
104
+ "semantic": 41.30434782608695,
105
+ "anls": 38.6639124934416,
106
+ "n": 23
107
+ }
108
+ },
109
+ "n_evaluated": 500,
110
+ "n_unmatched": 0
111
+ },
112
+ "reevaluated_date": "2026-01-15T20:24:35.010694+00:00",
113
+ "source_predictions_file": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_predictions_20260107_113714.jsonl",
114
+ "result_file_path": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260107_113714.json"
115
+ }
eval/reevaluated_results/OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_154427.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-4o (2024-08-06) with HEAVEN Retrieval",
3
+ "organization": "OpenAI / KAIST",
4
+ "description": "Image-based retrieval. Best setup described in HEAVEN paper.",
5
+ "link": "",
6
+ "tags": [
7
+ "Conventional RAG",
8
+ "Semantic Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T15:44:27.735534+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 46.73469387755102,
19
+ "anls": 45.649762341432954,
20
+ "page_f1": 43.169719169719166,
21
+ "doc_f1": 59.24761904761905,
22
+ "kuiper": null
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 45.800000000000004,
26
+ "anls": 45.649762341432954,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 53.333333333333336,
42
+ "anls": 48.75783475783476,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 40.909090909090914,
47
+ "anls": 38.506493506493506,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 56.25,
52
+ "anls": 55.056754787358244,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 43.47826086956522,
57
+ "anls": 48.16466676354977,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 50.0,
62
+ "anls": 44.99022482893451,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 53.191489361702125,
67
+ "anls": 47.1956486962086,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 40.243902439024396,
72
+ "anls": 32.93040293040293,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 37.2093023255814,
77
+ "anls": 35.73555320648344,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 28.000000000000004,
82
+ "anls": 43.22,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 47.91666666666667,
87
+ "anls": 49.20255183413078,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 46.15384615384615,
97
+ "anls": 48.954805357154754,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 52.0,
102
+ "anls": 50.65907330372244,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 50.0,
107
+ "anls": 46.20014437749956,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:25:44.256079+00:00",
115
+ "source_predictions_file": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_predictions_20260109_154427.jsonl",
116
+ "result_file_path": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_154427.json"
117
+ }
eval/reevaluated_results/OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_175639.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-5.2 (2024-08-06) with HEAVEN Retrieval",
3
+ "organization": "OpenAI / KAIST",
4
+ "description": "Image-based retrieval. Best setup described in HEAVEN paper, but with newer GPT.",
5
+ "link": "",
6
+ "tags": [
7
+ "Conventional RAG",
8
+ "Semantic Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T17:56:39.771528+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 50.0,
19
+ "anls": 47.46445252141211,
20
+ "page_f1": 48.43228327228327,
21
+ "doc_f1": 62.30761904761904,
22
+ "kuiper": null
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 49.0,
26
+ "anls": 47.46445252141211,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 53.333333333333336,
42
+ "anls": 43.64672364672364,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 52.27272727272727,
47
+ "anls": 51.569264069264065,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 47.91666666666667,
52
+ "anls": 46.90982404692082,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 48.369565217391305,
57
+ "anls": 48.83531625708929,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 43.75,
62
+ "anls": 43.92031798457114,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 59.57446808510638,
67
+ "anls": 49.070286122357786,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 39.02439024390244,
72
+ "anls": 34.149915125524885,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 45.348837209302324,
77
+ "anls": 46.299372462163156,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 26.0,
82
+ "anls": 33.613578417414736,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 50.0,
87
+ "anls": 51.63690476190476,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 53.84615384615385,
97
+ "anls": 58.28202679165414,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 52.666666666666664,
102
+ "anls": 52.18098320525303,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 56.52173913043478,
107
+ "anls": 39.14801495210919,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:27:00.066247+00:00",
115
+ "source_predictions_file": "OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_predictions_20260109_175639.jsonl",
116
+ "result_file_path": "OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_175639.json"
117
+ }
eval/reevaluated_results/OpenAI_-_KAIST/GPT-5.2_(2025-12-11)_with_HEAVEN_Retrieval_results_20260107_153009.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-5.2 (2025-12-11) with HEAVEN Retrieval",
3
+ "organization": "OpenAI - KAIST",
4
+ "description": "",
5
+ "link": null,
6
+ "tags": [
7
+ "Agentic"
8
+ ],
9
+ "submitted_by": null,
10
+ "metadata": {
11
+ "model_type": "unknown"
12
+ },
13
+ "submission_date": null,
14
+ "results": {
15
+ "overall": {
16
+ "semantic": 48.16326530612245,
17
+ "anls": 43.22495703626025,
18
+ "page_f1": 46.402539682539675,
19
+ "doc_f1": 57.27428571428571,
20
+ "kuiper": null
21
+ },
22
+ "single_evidence": {
23
+ "semantic": 47.199999999999996,
24
+ "anls": 43.22495703626025,
25
+ "n": 500
26
+ },
27
+ "multi_evidence_same_doc": {
28
+ "semantic": 0,
29
+ "anls": 0,
30
+ "n": 0
31
+ },
32
+ "multi_evidence_multi_doc": {
33
+ "semantic": 0,
34
+ "anls": 0,
35
+ "n": 0
36
+ },
37
+ "by_domain": {
38
+ "Cases/Logs": {
39
+ "semantic": 60.0,
40
+ "anls": 43.64672364672364,
41
+ "n": 15
42
+ },
43
+ "Education": {
44
+ "semantic": 45.45454545454545,
45
+ "anls": 41.99134199134198,
46
+ "n": 22
47
+ },
48
+ "Events": {
49
+ "semantic": 47.91666666666667,
50
+ "anls": 42.272727272727266,
51
+ "n": 24
52
+ },
53
+ "Financial": {
54
+ "semantic": 43.47826086956522,
55
+ "anls": 39.79157919704788,
56
+ "n": 92
57
+ },
58
+ "Financial/Tax": {
59
+ "semantic": 43.75,
60
+ "anls": 45.17687392862708,
61
+ "n": 16
62
+ },
63
+ "Government/Regulatory": {
64
+ "semantic": 53.191489361702125,
65
+ "anls": 47.41888368008188,
66
+ "n": 47
67
+ },
68
+ "HR/Employment": {
69
+ "semantic": 40.243902439024396,
70
+ "anls": 32.19869561332976,
71
+ "n": 41
72
+ },
73
+ "Legal": {
74
+ "semantic": 43.02325581395349,
75
+ "anls": 42.43497069635968,
76
+ "n": 43
77
+ },
78
+ "Media/Publishing": {
79
+ "semantic": 26.0,
80
+ "anls": 30.585587652734088,
81
+ "n": 25
82
+ },
83
+ "Misc": {
84
+ "semantic": 45.83333333333333,
85
+ "anls": 47.470238095238095,
86
+ "n": 24
87
+ },
88
+ "Other": {
89
+ "semantic": 0.0,
90
+ "anls": 0.0,
91
+ "n": 1
92
+ },
93
+ "Reference": {
94
+ "semantic": 53.84615384615385,
95
+ "anls": 56.89824130630616,
96
+ "n": 52
97
+ },
98
+ "Reports": {
99
+ "semantic": 54.0,
100
+ "anls": 46.64353866236792,
101
+ "n": 75
102
+ },
103
+ "Technical": {
104
+ "semantic": 54.347826086956516,
105
+ "anls": 39.18827260106249,
106
+ "n": 23
107
+ }
108
+ },
109
+ "n_evaluated": 500,
110
+ "n_unmatched": 0
111
+ },
112
+ "reevaluated_date": "2026-01-15T20:28:06.717531+00:00",
113
+ "source_predictions_file": "OpenAI_-_KAIST/GPT-5.2_(2025-12-11)_with_HEAVEN_Retrieval_predictions_20260107_153009.jsonl",
114
+ "result_file_path": "OpenAI_-_KAIST/GPT-5.2_(2025-12-11)_with_HEAVEN_Retrieval_results_20260107_153009.json"
115
+ }
eval/reevaluated_results/Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_results_20260110_132227.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GLM-4.6V Flash with BM25 Search Tool",
3
+ "organization": "Z.AI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "open-weight"
14
+ },
15
+ "submission_date": "2026-01-10T13:22:27.811792+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 43.658746063555675,
19
+ "anls": 30.17090068718362,
20
+ "page_f1": 28.991793110029583,
21
+ "doc_f1": 51.58650634602539,
22
+ "kuiper": 29.321285140562065
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 42.78557114228457,
26
+ "anls": 30.17090068718362,
27
+ "n": 499
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 43.333333333333336,
42
+ "anls": 30.313390313390315,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 54.54545454545454,
47
+ "anls": 34.34782608695652,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 64.58333333333334,
52
+ "anls": 52.92922722985768,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 34.78260869565217,
57
+ "anls": 23.538822057620244,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 21.875,
62
+ "anls": 21.39516129032258,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 37.234042553191486,
67
+ "anls": 29.5464725643897,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 46.34146341463415,
72
+ "anls": 37.17815890071988,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 50.0,
77
+ "anls": 37.64410653945538,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 38.0,
82
+ "anls": 26.401353874883288,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 68.75,
87
+ "anls": 48.707026404394824,
88
+ "n": 24
89
+ },
90
+ "Reference": {
91
+ "semantic": 40.38461538461539,
92
+ "anls": 23.25877926421405,
93
+ "n": 52
94
+ },
95
+ "Reports": {
96
+ "semantic": 40.666666666666664,
97
+ "anls": 25.79399206429042,
98
+ "n": 75
99
+ },
100
+ "Technical": {
101
+ "semantic": 36.95652173913043,
102
+ "anls": 24.436392914653783,
103
+ "n": 23
104
+ }
105
+ },
106
+ "n_evaluated": 499,
107
+ "n_unmatched": 1767
108
+ },
109
+ "reevaluated_date": "2026-01-15T20:30:38.431851+00:00",
110
+ "source_predictions_file": "Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_predictions_20260110_132227.jsonl",
111
+ "result_file_path": "Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_results_20260110_132227.json"
112
+ }
eval/reevaluated_results/Z.AI/GLM-4.6V_with_BM25_Search_Tool_results_20260110_131826.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GLM-4.6V with BM25 Search Tool",
3
+ "organization": "Z.AI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "open-weight"
14
+ },
15
+ "submission_date": "2026-01-10T13:18:26.686587+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 64.92576990716128,
19
+ "anls": 59.661893537203156,
20
+ "page_f1": 66.02347552247352,
21
+ "doc_f1": 86.7908978129419,
22
+ "kuiper": 49.83064516129022
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 63.62725450901804,
26
+ "anls": 59.661893537203156,
27
+ "n": 499
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 73.33333333333333,
42
+ "anls": 62.16524216524218,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 61.36363636363637,
47
+ "anls": 54.25829440651575,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 77.08333333333334,
52
+ "anls": 67.87290397408577,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 51.63043478260869,
57
+ "anls": 51.19993983845437,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 68.75,
62
+ "anls": 62.5648667601683,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 73.40425531914893,
67
+ "anls": 70.93589720557641,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 63.41463414634146,
72
+ "anls": 59.735891761304075,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 67.44186046511628,
77
+ "anls": 55.536175710594314,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 68.0,
82
+ "anls": 69.11970073982938,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 72.91666666666666,
87
+ "anls": 75.10160446706249,
88
+ "n": 24
89
+ },
90
+ "Reference": {
91
+ "semantic": 59.61538461538461,
92
+ "anls": 60.632124141167864,
93
+ "n": 52
94
+ },
95
+ "Reports": {
96
+ "semantic": 63.33333333333333,
97
+ "anls": 56.89167319856098,
98
+ "n": 75
99
+ },
100
+ "Technical": {
101
+ "semantic": 58.69565217391305,
102
+ "anls": 51.450020851943364,
103
+ "n": 23
104
+ }
105
+ },
106
+ "n_evaluated": 499,
107
+ "n_unmatched": 1767
108
+ },
109
+ "reevaluated_date": "2026-01-15T20:31:43.022276+00:00",
110
+ "source_predictions_file": "Z.AI/GLM-4.6V_with_BM25_Search_Tool_predictions_20260110_131826.jsonl",
111
+ "result_file_path": "Z.AI/GLM-4.6V_with_BM25_Search_Tool_results_20260110_131826.json"
112
+ }