Borchmann committed on
Commit
4829aac
·
verified ·
1 Parent(s): 59e29ca

Upload folder using huggingface_hub

Browse files
Files changed (34) hide show
  1. app.py +273 -83
  2. eval/batch_reevaluate.py +434 -0
  3. eval/evaluate.py +93 -27
  4. eval/metrics.py +500 -1
  5. eval/reevaluate_submissions.py +254 -0
  6. eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_results_20260110_131629.json +112 -0
  7. eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_results_20260110_132054.json +112 -0
  8. eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_results_20260110_132358.json +112 -0
  9. eval/reevaluated_results/Anthropic/Claude_Haiku_4.5_(2025-10-01)_results_20260109_130319.json +117 -0
  10. eval/reevaluated_results/Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_002125.json +115 -0
  11. eval/reevaluated_results/Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_125816.json +117 -0
  12. eval/reevaluated_results/Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_003320.json +115 -0
  13. eval/reevaluated_results/Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_182559.json +117 -0
  14. eval/reevaluated_results/Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_005202.json +115 -0
  15. eval/reevaluated_results/Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_183030.json +117 -0
  16. eval/reevaluated_results/Google/Gemini_3_Pro_(Preview)_with_BM25_Search_Tool_results_20260109_002711.json +110 -0
  17. eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_234108.json +115 -0
  18. eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_235325.json +115 -0
  19. eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_235724.json +117 -0
  20. eval/reevaluated_results/OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153221.json +117 -0
  21. eval/reevaluated_results/OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153812.json +117 -0
  22. eval/reevaluated_results/OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_results_20260109_151912.json +117 -0
  23. eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152104.json +117 -0
  24. eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_File_Search_results_20260104_140537.json +116 -0
  25. eval/reevaluated_results/OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152650.json +117 -0
  26. eval/reevaluated_results/OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152828.json +117 -0
  27. eval/reevaluated_results/OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153516.json +117 -0
  28. eval/reevaluated_results/OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_results_20260109_185347.json +117 -0
  29. eval/reevaluated_results/OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260107_113714.json +115 -0
  30. eval/reevaluated_results/OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_154427.json +117 -0
  31. eval/reevaluated_results/OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_175639.json +117 -0
  32. eval/reevaluated_results/OpenAI_-_KAIST/GPT-5.2_(2025-12-11)_with_HEAVEN_Retrieval_results_20260107_153009.json +115 -0
  33. eval/reevaluated_results/Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_results_20260110_132227.json +112 -0
  34. eval/reevaluated_results/Z.AI/GLM-4.6V_with_BM25_Search_Tool_results_20260110_131826.json +112 -0
app.py CHANGED
@@ -22,10 +22,15 @@ import os
22
  import secrets
23
  import shutil
24
  import sys
 
 
25
  from datetime import datetime, timezone
26
  from pathlib import Path
27
  from urllib.parse import urlencode, quote, unquote
28
 
 
 
 
29
  import pandas as pd
30
  import plotly.graph_objects as go
31
  import requests
@@ -35,7 +40,15 @@ from huggingface_hub import snapshot_download, HfApi, hf_hub_download
35
  # Add eval module to path
36
  sys.path.insert(0, str(Path(__file__).parent / "eval"))
37
  try:
38
- from metrics import anls_star, citation_f1, kuiper_statistic
 
 
 
 
 
 
 
 
39
  from datasets import load_dataset
40
  EVAL_AVAILABLE = True
41
  except ImportError:
@@ -916,10 +929,17 @@ def get_model_type_html(model_type: str) -> str:
916
  return f'<span style="color: {color}; font-weight: 500;">{fallback_emoji} {model_type}</span>'
917
 
918
 
 
 
 
 
 
 
 
919
  @st.cache_data(ttl=300) # Cache for 5 minutes
920
  def load_eval_results() -> pd.DataFrame:
921
- """Load evaluation results from JSON files."""
922
- results = []
923
 
924
  results_path = Path(EVAL_RESULTS_PATH)
925
  if not results_path.exists():
@@ -945,36 +965,76 @@ def load_eval_results() -> pd.DataFrame:
945
  # Get per-domain scores if available
946
  by_domain = result_scores.get("by_domain", {})
947
 
948
- results.append({
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
949
  "Model": model_name,
950
  "Organization": data.get("organization", data.get("submitted_by", org_dir.name)),
951
  "Model Type": metadata.get("model_type", "unknown"),
952
  "Tags": tags, # Store as list
953
- # Answer correctness metrics (ANLS*)
954
- "Accuracy (ANLS*)": result_scores.get("overall", {}).get("anls", 0.0),
955
- "Acc. Single-Hop": result_scores.get("single_evidence", {}).get("anls", 0.0),
956
- "Acc. Cross-Page": result_scores.get("multi_evidence_same_doc", {}).get("anls", 0.0),
957
- "Acc. Cross-Doc": result_scores.get("multi_evidence_multi_doc", {}).get("anls", 0.0),
 
 
 
 
958
  # Attribution metrics
959
- "Attribution (Page F1)": result_scores.get("overall", {}).get("page_f1", 0.0),
960
- "Attribution (Doc F1)": result_scores.get("overall", {}).get("doc_f1", 0.0),
961
  # Calibration metric
962
- "Effort (Kuiper)": result_scores.get("overall", {}).get("kuiper", 0.0),
963
  "Submission Date": data.get("submission_date", ""),
964
  "Link": data.get("link", ""),
965
  "Description": data.get("description", metadata.get("description", "")) or
966
  generate_placeholder_description(model_name, tags, metadata.get("model_type", "")),
967
  # Per-domain scores (stored as JSON string for DataFrame compatibility)
968
  "_by_domain": json.dumps(by_domain) if by_domain else "{}",
969
- })
 
 
 
 
 
 
 
 
970
  except Exception as e:
971
  st.warning(f"Error loading {result_file}: {e}")
972
 
973
- if not results:
974
  return pd.DataFrame()
975
 
 
 
 
976
  df = pd.DataFrame(results)
977
- df = df.sort_values("Accuracy (ANLS*)", ascending=False).reset_index(drop=True)
978
  return df
979
 
980
 
@@ -1045,7 +1105,8 @@ def format_model_type(model_type: str) -> str:
1045
 
1046
  # Metric tooltips for table headers
1047
  METRIC_TOOLTIPS = {
1048
- "Accuracy (ANLS*)": "Overall answer accuracy using ANLS* (Average Normalized Levenshtein Similarity). Higher is better.",
 
1049
  "Acc. Single-Hop": "Accuracy on questions requiring evidence from a single page.",
1050
  "Acc. Cross-Page": "Accuracy on multi-hop questions requiring evidence from multiple pages within the same document.",
1051
  "Acc. Cross-Doc": "Accuracy on multi-hop questions requiring evidence from multiple documents.",
@@ -1130,12 +1191,26 @@ def render_leaderboard_table(df: pd.DataFrame, columns: list, show_analyze_colum
1130
  # Render tags as badges
1131
  cell_html = render_tags_html(value)
1132
  cells.append(f'<td>{cell_html}</td>')
1133
- elif col == "Accuracy (ANLS*)" or col.startswith("Acc."):
1134
- # Format accuracy scores (ANLS*, scale 0-100)
1135
  try:
1136
- cell_html = f"{float(value):.1f}" if value else "0"
1137
  except (ValueError, TypeError):
1138
- cell_html = str(value)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1139
  cells.append(f'<td style="text-align: center;">{cell_html}</td>')
1140
  elif col.startswith("Attribution"):
1141
  # Format F1 scores (scale 0-100)
@@ -1274,7 +1349,7 @@ def create_accuracy_vs_attribution_plot(df: pd.DataFrame) -> go.Figure:
1274
  df_type = df[df["Model Type"] == model_type]
1275
  fig.add_trace(go.Scatter(
1276
  x=df_type["Attribution (Page F1)"],
1277
- y=df_type["Accuracy (ANLS*)"],
1278
  mode="markers",
1279
  name=model_type,
1280
  text=df_type["Model"],
@@ -1289,7 +1364,7 @@ def create_accuracy_vs_attribution_plot(df: pd.DataFrame) -> go.Figure:
1289
  fig.update_layout(
1290
  title=dict(text="Accuracy vs Attribution", font=dict(color="white")),
1291
  xaxis_title="Attribution (Page F1)",
1292
- yaxis_title="Accuracy (ANLS*)",
1293
  hovermode="closest",
1294
  template="plotly_dark",
1295
  height=650,
@@ -1335,7 +1410,7 @@ def create_accuracy_vs_effort_plot(df: pd.DataFrame) -> go.Figure:
1335
  df_type = df_filtered[df_filtered["Model Type"] == model_type]
1336
  fig.add_trace(go.Scatter(
1337
  x=df_type["Effort (Kuiper)"],
1338
- y=df_type["Accuracy (ANLS*)"],
1339
  mode="markers",
1340
  name=model_type,
1341
  text=df_type["Model"],
@@ -1350,7 +1425,7 @@ def create_accuracy_vs_effort_plot(df: pd.DataFrame) -> go.Figure:
1350
  fig.update_layout(
1351
  title=dict(text="Accuracy vs Effort", font=dict(color="white")),
1352
  xaxis_title="Effort (Kuiper) — lower is better",
1353
- yaxis_title="Accuracy (ANLS*)",
1354
  hovermode="closest",
1355
  template="plotly_dark",
1356
  height=650,
@@ -1460,7 +1535,7 @@ def show_model_details(model_name: str):
1460
  # Display main metrics
1461
  col1, col2, col3 = st.columns(3)
1462
  with col1:
1463
- st.metric("Overall Accuracy", f"{model_data['Accuracy (ANLS*)']:.1f}%")
1464
  with col2:
1465
  st.metric("Attribution (Page F1)", f"{model_data['Attribution (Page F1)']:.1f}%")
1466
  with col3:
@@ -1495,7 +1570,7 @@ def show_model_details(model_name: str):
1495
 
1496
  if by_domain:
1497
  # Show per-domain chart (use overall accuracy as threshold for coloring)
1498
- overall_accuracy = model_data.get('Accuracy (ANLS*)', 0)
1499
  fig = create_domain_accuracy_chart(by_domain, model_name, overall_accuracy)
1500
  st.plotly_chart(fig, width="stretch")
1501
  else:
@@ -1620,12 +1695,73 @@ def load_gold_standard(dataset_name: str = "agentic-document-ai/dataset-PRIVATE"
1620
  return {}, {}
1621
 
1622
 
1623
- def evaluate_predictions(predictions: list, gold_by_text: dict, gold_by_id: dict) -> dict:
1624
- """Evaluate predictions against gold standard."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1625
  if not EVAL_AVAILABLE:
1626
  return {"error": "Evaluation module not available"}
1627
 
1628
- evals = []
 
1629
  unmatched = []
1630
 
1631
  for pred in predictions:
@@ -1633,45 +1769,54 @@ def evaluate_predictions(predictions: list, gold_by_text: dict, gold_by_id: dict
1633
  qid = pred.get('id', '')
1634
 
1635
  # Match to gold
 
1636
  if question in gold_by_text:
1637
  gold_data = gold_by_text[question]
1638
  elif qid and qid in gold_by_id:
1639
  gold_data = gold_by_id[qid]
 
 
 
1640
  else:
1641
  unmatched.append(question[:50] + "..." if len(question) > 50 else question)
1642
- continue
1643
-
1644
- # Get prediction data
1645
- answer = pred.get('answer', '')
1646
- citations = pred.get('citations', [])
1647
- search_history = pred.get('search_history', [])
1648
- steps = len(search_history) if search_history else pred.get('iterations', 0)
1649
-
1650
- # Calculate metrics
1651
- anls = anls_star(answer, gold_data['answers'])
1652
- correct = anls >= 0.5
1653
- doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
1654
- page_f1 = citation_f1(citations, gold_data['evidence'], level='page')
1655
-
1656
- evals.append({
1657
- 'question': question,
1658
- 'anls': anls,
1659
- 'correct': correct,
1660
- 'doc_f1': doc_f1['f1'],
1661
- 'page_f1': page_f1['f1'],
1662
- 'steps': steps,
1663
- 'hop_type': gold_data.get('hop_type', 'single'),
1664
- 'category': gold_data['category'],
1665
- 'domain': gold_data['domain']
1666
- })
1667
-
1668
- if not evals:
1669
  return {"error": "No predictions matched the gold standard"}
1670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1671
  # Aggregate overall metrics
1672
  n = len(evals)
1673
- accuracy = sum(e['correct'] for e in evals) / n * 100 # Scale to 0-100
 
 
 
 
 
 
 
 
 
 
1674
  mean_anls = sum(e['anls'] for e in evals) / n * 100
 
1675
  mean_doc_f1 = sum(e['doc_f1'] for e in evals) / n * 100
1676
  mean_page_f1 = sum(e['page_f1'] for e in evals) / n * 100
1677
 
@@ -1684,7 +1829,6 @@ def evaluate_predictions(predictions: list, gold_by_text: dict, gold_by_id: dict
1684
  cross_doc = [e for e in evals if e['hop_type'] == 'cross_doc']
1685
 
1686
  # By domain
1687
- from collections import defaultdict
1688
  by_domain = defaultdict(list)
1689
  for e in evals:
1690
  domain = e['domain'] or 'Other'
@@ -1693,6 +1837,7 @@ def evaluate_predictions(predictions: list, gold_by_text: dict, gold_by_id: dict
1693
  domain_scores = {}
1694
  for domain, domain_evals in sorted(by_domain.items()):
1695
  domain_scores[domain] = {
 
1696
  'anls': sum(e['anls'] for e in domain_evals) / len(domain_evals) * 100,
1697
  'n': len(domain_evals)
1698
  }
@@ -1700,27 +1845,33 @@ def evaluate_predictions(predictions: list, gold_by_text: dict, gold_by_id: dict
1700
  results = {
1701
  'n_evaluated': n,
1702
  'n_unmatched': len(unmatched),
1703
- 'unmatched_samples': unmatched[:5], # Show first 5
1704
  'overall': {
1705
- 'anls': mean_anls,
 
 
1706
  'accuracy': accuracy,
1707
  'doc_f1': mean_doc_f1,
1708
  'page_f1': mean_page_f1,
1709
  'kuiper': kuiper['kuiper_stat'] if not kuiper.get('degenerate') else None,
1710
  },
1711
  'single_evidence': {
 
1712
  'anls': sum(e['anls'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
1713
  'n': len(single_hop)
1714
  },
1715
  'multi_evidence_same_doc': {
 
1716
  'anls': sum(e['anls'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
1717
  'n': len(cross_page)
1718
  },
1719
  'multi_evidence_multi_doc': {
 
1720
  'anls': sum(e['anls'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
1721
  'n': len(cross_doc)
1722
  },
1723
- 'by_domain': domain_scores
 
1724
  }
1725
 
1726
  return results
@@ -1822,14 +1973,30 @@ def submit_results_fragment():
1822
 
1823
  # Evaluate button
1824
  if st.button("Run Evaluation", type="primary"):
1825
- with st.spinner("Loading gold standard and evaluating..."):
1826
  gold_by_text, gold_by_id = load_gold_standard()
 
 
 
 
 
 
 
1827
 
1828
- if not gold_by_text:
1829
- st.error("Failed to load gold standard dataset")
1830
- else:
1831
- results = evaluate_predictions(predictions, gold_by_text, gold_by_id)
1832
- st.session_state.eval_results = results
 
 
 
 
 
 
 
 
 
1833
 
1834
  # Show evaluation results
1835
  if st.session_state.eval_results:
@@ -1840,10 +2007,15 @@ def submit_results_fragment():
1840
  else:
1841
  st.markdown("#### Evaluation Results")
1842
 
1843
- # Summary metrics
1844
  col1, col2, col3, col4 = st.columns(4)
1845
  with col1:
1846
- st.metric("Accuracy (ANLS*)", f"{results['overall']['anls']:.1f}")
 
 
 
 
 
1847
  with col2:
1848
  st.metric("Attribution (Page F1)", f"{results['overall']['page_f1']:.1f}")
1849
  with col3:
@@ -1854,16 +2026,32 @@ def submit_results_fragment():
1854
 
1855
  # Detailed breakdown
1856
  with st.expander("Detailed Breakdown"):
1857
- st.markdown(f"""
1858
- | Metric | Value |
1859
- |--------|-------|
1860
- | **Overall ANLS*** | {results['overall']['anls']:.1f} |
1861
- | **Acc. Single-Hop** (n={results['single_evidence']['n']}) | {results['single_evidence']['anls']:.1f} |
1862
- | **Acc. Cross-Page** (n={results['multi_evidence_same_doc']['n']}) | {results['multi_evidence_same_doc']['anls']:.1f} |
1863
- | **Acc. Cross-Doc** (n={results['multi_evidence_multi_doc']['n']}) | {results['multi_evidence_multi_doc']['anls']:.1f} |
1864
- | **Attribution (Doc F1)** | {results['overall']['doc_f1']:.1f} |
1865
- | **Attribution (Page F1)** | {results['overall']['page_f1']:.1f} |
1866
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1867
 
1868
  if results['n_unmatched'] > 0:
1869
  with st.expander(f"{results['n_unmatched']} unmatched questions"):
@@ -2333,10 +2521,11 @@ def main():
2333
  # COLUMN SELECTOR - chips use SNOWFLAKE_BLUE (lighter, gradient end)
2334
  # Mapping: short chip name -> full column name
2335
  COLUMN_CHIP_NAMES = {
2336
- "Accuracy": "Accuracy (ANLS*)",
2337
  "Acc. Single-Hop": "Acc. Single-Hop",
2338
  "Acc. Cross-Page": "Acc. Cross-Page",
2339
  "Acc. Cross-Doc": "Acc. Cross-Doc",
 
2340
  "Attribution": "Attribution (Page F1)",
2341
  "Attribution (Doc)": "Attribution (Doc F1)",
2342
  "Effort": "Effort (Kuiper)",
@@ -2351,7 +2540,7 @@ def main():
2351
  # Model and Organization are always visible (not in selector)
2352
  always_visible = ["Model", "Organization"]
2353
  # Hidden columns (used internally but not shown as separate columns)
2354
- hidden_cols = ["Link", "Submission Date", "Description", "_by_domain"]
2355
  # Full column names that are optional (Tags moved to end)
2356
  optional_full_cols = [c for c in all_columns if c not in hidden_cols + always_visible and c != "Tags"]
2357
  optional_full_cols.append("Tags") # Add Tags at the end
@@ -2524,8 +2713,9 @@ The task is characterized by five formal properties:
2524
  st.markdown("""
2525
  #### Metrics
2526
 
2527
- ##### Accuracy (ANLS*)
2528
- - **Accuracy (ANLS*)**: Main score using Average Normalized Levenshtein Similarity with optimal element alignment for lists/sets
 
2529
  - **Acc. Single-Hop**: Accuracy on questions requiring a single evidence page
2530
  - **Acc. Cross-Page**: Accuracy on multi-hop questions within the same document
2531
  - **Acc. Cross-Doc**: Accuracy on multi-hop questions spanning multiple documents
 
22
  import secrets
23
  import shutil
24
  import sys
25
+ from collections import defaultdict
26
+ from concurrent.futures import ThreadPoolExecutor, as_completed
27
  from datetime import datetime, timezone
28
  from pathlib import Path
29
  from urllib.parse import urlencode, quote, unquote
30
 
31
+ # Parallelization config for LLM evaluation
32
+ MAX_EVAL_WORKERS = 24
33
+
34
  import pandas as pd
35
  import plotly.graph_objects as go
36
  import requests
 
40
  # Add eval module to path
41
  sys.path.insert(0, str(Path(__file__).parent / "eval"))
42
  try:
43
+ from metrics import (
44
+ anls_star,
45
+ anls_star_llm,
46
+ aggregate_anls_star_llm,
47
+ standard_error,
48
+ confidence_interval,
49
+ citation_f1,
50
+ kuiper_statistic
51
+ )
52
  from datasets import load_dataset
53
  EVAL_AVAILABLE = True
54
  except ImportError:
 
929
  return f'<span style="color: {color}; font-weight: 500;">{fallback_emoji} {model_type}</span>'
930
 
931
 
932
+ def _extract_timestamp_from_filename(filename: str) -> str:
933
+ """Extract timestamp from filename like 'Model_results_20260109_152104.json'."""
934
+ import re
935
+ match = re.search(r'_(\d{8}_\d{6})\.json$', filename)
936
+ return match.group(1) if match else "00000000_000000"
937
+
938
+
939
  @st.cache_data(ttl=300) # Cache for 5 minutes
940
  def load_eval_results() -> pd.DataFrame:
941
+ """Load evaluation results from JSON files, keeping only the most recent per model."""
942
+ seen_models = {} # Track: model_name -> (timestamp, result_dict, filepath)
943
 
944
  results_path = Path(EVAL_RESULTS_PATH)
945
  if not results_path.exists():
 
965
  # Get per-domain scores if available
966
  by_domain = result_scores.get("by_domain", {})
967
 
968
+ # Use semantic accuracy if available, otherwise fall back to ANLS*
969
+ overall = result_scores.get("overall", {})
970
+ single_ev = result_scores.get("single_evidence", {})
971
+ multi_page = result_scores.get("multi_evidence_same_doc", {})
972
+ multi_doc = result_scores.get("multi_evidence_multi_doc", {})
973
+
974
+ # Primary metric: semantic (ANLS* + LLM) if available, otherwise ANLS*
975
+ semantic_acc = overall.get("semantic", overall.get("anls", 0.0))
976
+ semantic_ci = overall.get("semantic_ci") # 95% CI tuple
977
+
978
+ # Calculate CI on-the-fly using bias correction method if not stored
979
+ if not semantic_ci and semantic_acc > 0:
980
+ try:
981
+ from metrics import confidence_interval, standard_error
982
+ n = result_scores.get("single_evidence", {}).get("n", 500)
983
+ p = semantic_acc / 100.0 # Convert to proportion
984
+ ci = confidence_interval(p, n) # Uses calibrated q0, q1, m0, m1
985
+ semantic_ci = (ci[0] * 100, ci[1] * 100)
986
+ semantic_se = standard_error(p, n) * 100 # SE in percentage points
987
+ except Exception:
988
+ semantic_ci = None
989
+ semantic_se = None
990
+
991
+ anls_acc = overall.get("anls", 0.0)
992
+
993
+ result_dict = {
994
  "Model": model_name,
995
  "Organization": data.get("organization", data.get("submitted_by", org_dir.name)),
996
  "Model Type": metadata.get("model_type", "unknown"),
997
  "Tags": tags, # Store as list
998
+ # Primary: Accuracy with LLM judge (ANLS* + LLM with bias correction)
999
+ "Accuracy (LLM judge)": semantic_acc,
1000
+ "_Accuracy_SE": semantic_se, # Hidden: for ±SE display
1001
+ "_Accuracy_CI": semantic_ci, # Hidden: for tooltip display
1002
+ "Acc. Single-Hop": single_ev.get("semantic", single_ev.get("anls", 0.0)),
1003
+ "Acc. Cross-Page": multi_page.get("semantic", multi_page.get("anls", 0.0)),
1004
+ "Acc. Cross-Doc": multi_doc.get("semantic", multi_doc.get("anls", 0.0)),
1005
+ # Secondary: Pure string-based ANLS* (hidden by default)
1006
+ "ANLS* (string)": anls_acc,
1007
  # Attribution metrics
1008
+ "Attribution (Page F1)": overall.get("page_f1", 0.0),
1009
+ "Attribution (Doc F1)": overall.get("doc_f1", 0.0),
1010
  # Calibration metric
1011
+ "Effort (Kuiper)": overall.get("kuiper", 0.0),
1012
  "Submission Date": data.get("submission_date", ""),
1013
  "Link": data.get("link", ""),
1014
  "Description": data.get("description", metadata.get("description", "")) or
1015
  generate_placeholder_description(model_name, tags, metadata.get("model_type", "")),
1016
  # Per-domain scores (stored as JSON string for DataFrame compatibility)
1017
  "_by_domain": json.dumps(by_domain) if by_domain else "{}",
1018
+ }
1019
+
1020
+ # Extract timestamp from filename
1021
+ file_timestamp = _extract_timestamp_from_filename(result_file.name)
1022
+
1023
+ # Keep only the most recent result per model
1024
+ if model_name not in seen_models or file_timestamp > seen_models[model_name][0]:
1025
+ seen_models[model_name] = (file_timestamp, result_dict)
1026
+
1027
  except Exception as e:
1028
  st.warning(f"Error loading {result_file}: {e}")
1029
 
1030
+ if not seen_models:
1031
  return pd.DataFrame()
1032
 
1033
+ # Build results list from deduplicated models
1034
+ results = [result_dict for _, result_dict in seen_models.values()]
1035
+
1036
  df = pd.DataFrame(results)
1037
+ df = df.sort_values("Accuracy (LLM judge)", ascending=False).reset_index(drop=True)
1038
  return df
1039
 
1040
 
 
1105
 
1106
  # Metric tooltips for table headers
1107
  METRIC_TOOLTIPS = {
1108
+ "Accuracy (LLM judge)": "Answer accuracy using ANLS* + LLM judge with bias correction. Captures semantic correctness beyond string matching. Higher is better.",
1109
+ "ANLS* (string)": "String-based accuracy using ANLS* (Average Normalized Levenshtein Similarity). Stricter than semantic. Higher is better.",
1110
  "Acc. Single-Hop": "Accuracy on questions requiring evidence from a single page.",
1111
  "Acc. Cross-Page": "Accuracy on multi-hop questions requiring evidence from multiple pages within the same document.",
1112
  "Acc. Cross-Doc": "Accuracy on multi-hop questions requiring evidence from multiple documents.",
 
1191
  # Render tags as badges
1192
  cell_html = render_tags_html(value)
1193
  cells.append(f'<td>{cell_html}</td>')
1194
+ elif col == "Accuracy (LLM judge)" or col == "ANLS* (string)" or col.startswith("Acc."):
1195
+ # Format accuracy scores (scale 0-100)
1196
  try:
1197
+ acc_val = f"{float(value):.1f}" if value else "0"
1198
  except (ValueError, TypeError):
1199
+ acc_val = str(value)
1200
+
1201
+ # Add ±SE for main accuracy column
1202
+ if col == "Accuracy (LLM judge)":
1203
+ se = row.get("_Accuracy_SE")
1204
+ ci = row.get("_Accuracy_CI")
1205
+ if se is not None and se > 0:
1206
+ # Show ±SE with 95% CI as tooltip
1207
+ ci_tooltip = f"95% CI: [{ci[0]:.1f}, {ci[1]:.1f}]" if ci else ""
1208
+ se_text = f'<span style="font-size: 0.85em; color: #888;" title="{ci_tooltip}"> ± {se:.1f}</span>'
1209
+ cell_html = f'{acc_val}{se_text}'
1210
+ else:
1211
+ cell_html = acc_val
1212
+ else:
1213
+ cell_html = acc_val
1214
  cells.append(f'<td style="text-align: center;">{cell_html}</td>')
1215
  elif col.startswith("Attribution"):
1216
  # Format F1 scores (scale 0-100)
 
1349
  df_type = df[df["Model Type"] == model_type]
1350
  fig.add_trace(go.Scatter(
1351
  x=df_type["Attribution (Page F1)"],
1352
+ y=df_type["Accuracy (LLM judge)"],
1353
  mode="markers",
1354
  name=model_type,
1355
  text=df_type["Model"],
 
1364
  fig.update_layout(
1365
  title=dict(text="Accuracy vs Attribution", font=dict(color="white")),
1366
  xaxis_title="Attribution (Page F1)",
1367
+ yaxis_title="Accuracy (LLM judge)",
1368
  hovermode="closest",
1369
  template="plotly_dark",
1370
  height=650,
 
1410
  df_type = df_filtered[df_filtered["Model Type"] == model_type]
1411
  fig.add_trace(go.Scatter(
1412
  x=df_type["Effort (Kuiper)"],
1413
+ y=df_type["Accuracy (LLM judge)"],
1414
  mode="markers",
1415
  name=model_type,
1416
  text=df_type["Model"],
 
1425
  fig.update_layout(
1426
  title=dict(text="Accuracy vs Effort", font=dict(color="white")),
1427
  xaxis_title="Effort (Kuiper) — lower is better",
1428
+ yaxis_title="Accuracy (LLM judge)",
1429
  hovermode="closest",
1430
  template="plotly_dark",
1431
  height=650,
 
1535
  # Display main metrics
1536
  col1, col2, col3 = st.columns(3)
1537
  with col1:
1538
+ st.metric("Accuracy (LLM judge)", f"{model_data['Accuracy (LLM judge)']:.1f}%")
1539
  with col2:
1540
  st.metric("Attribution (Page F1)", f"{model_data['Attribution (Page F1)']:.1f}%")
1541
  with col3:
 
1570
 
1571
  if by_domain:
1572
  # Show per-domain chart (use overall accuracy as threshold for coloring)
1573
+ overall_accuracy = model_data.get('Accuracy (LLM judge)', 0)
1574
  fig = create_domain_accuracy_chart(by_domain, model_name, overall_accuracy)
1575
  st.plotly_chart(fig, width="stretch")
1576
  else:
 
1695
  return {}, {}
1696
 
1697
 
1698
+ def _evaluate_single_item(args, max_retries=3):
1699
+ """Evaluate a single prediction item (for parallel processing)."""
1700
+ import time as _time
1701
+ idx, pred, gold_data, use_llm_judge = args
1702
+
1703
+ question = pred.get('question', '').strip()
1704
+ answer = pred.get('answer', '')
1705
+ citations = pred.get('citations', [])
1706
+ search_history = pred.get('search_history', [])
1707
+ steps = len(search_history) if search_history else pred.get('iterations', 0)
1708
+
1709
+ # Calculate non-LLM metrics first
1710
+ anls = anls_star(answer, gold_data['answers'])
1711
+ doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
1712
+ page_f1 = citation_f1(citations, gold_data['evidence'], level='page')
1713
+
1714
+ # Semantic accuracy with LLM judge (or just ANLS* if disabled)
1715
+ if use_llm_judge:
1716
+ for attempt in range(max_retries):
1717
+ try:
1718
+ llm_result = anls_star_llm(answer, gold_data['answers'], question)
1719
+ semantic_score = llm_result['score']
1720
+ break
1721
+ except Exception:
1722
+ if attempt < max_retries - 1:
1723
+ _time.sleep(2 ** attempt) # Exponential backoff
1724
+ else:
1725
+ raise
1726
+ else:
1727
+ semantic_score = anls
1728
+
1729
+ return {
1730
+ 'idx': idx,
1731
+ 'question': question,
1732
+ 'anls': anls,
1733
+ 'semantic_score': semantic_score,
1734
+ 'correct': semantic_score >= 0.5,
1735
+ 'doc_f1': doc_f1['f1'],
1736
+ 'page_f1': page_f1['f1'],
1737
+ 'steps': steps,
1738
+ 'hop_type': gold_data.get('hop_type', 'single'),
1739
+ 'category': gold_data['category'],
1740
+ 'domain': gold_data['domain']
1741
+ }
1742
+
1743
+
1744
+ def evaluate_predictions(
1745
+ predictions: list,
1746
+ gold_by_text: dict,
1747
+ gold_by_id: dict,
1748
+ use_llm_judge: bool = True,
1749
+ progress_callback=None
1750
+ ) -> dict:
1751
+ """Evaluate predictions against gold standard (parallelized when using LLM judge).
1752
+
1753
+ Args:
1754
+ predictions: List of prediction dicts
1755
+ gold_by_text: Gold data indexed by question text
1756
+ gold_by_id: Gold data indexed by question ID
1757
+ use_llm_judge: If True, use ANLS*+LLM for semantic accuracy (default)
1758
+ progress_callback: Optional callback(current, total) for progress updates
1759
+ """
1760
  if not EVAL_AVAILABLE:
1761
  return {"error": "Evaluation module not available"}
1762
 
1763
+ # First pass: match predictions to gold standard
1764
+ matched_items = []
1765
  unmatched = []
1766
 
1767
  for pred in predictions:
 
1769
  qid = pred.get('id', '')
1770
 
1771
  # Match to gold
1772
+ gold_data = None
1773
  if question in gold_by_text:
1774
  gold_data = gold_by_text[question]
1775
  elif qid and qid in gold_by_id:
1776
  gold_data = gold_by_id[qid]
1777
+
1778
+ if gold_data:
1779
+ matched_items.append((pred, gold_data, use_llm_judge))
1780
  else:
1781
  unmatched.append(question[:50] + "..." if len(question) > 50 else question)
1782
+
1783
+ if not matched_items:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1784
  return {"error": "No predictions matched the gold standard"}
1785
 
1786
+ # Prepare items with index
1787
+ items_with_idx = [(i, pred, gold, llm) for i, (pred, gold, llm) in enumerate(matched_items)]
1788
+
1789
+ total = len(items_with_idx)
1790
+ evals = []
1791
+ completed = 0
1792
+
1793
+ # Parallel evaluation with ThreadPoolExecutor (much faster for LLM calls)
1794
+ with ThreadPoolExecutor(max_workers=MAX_EVAL_WORKERS) as executor:
1795
+ futures = {executor.submit(_evaluate_single_item, item): item[0]
1796
+ for item in items_with_idx}
1797
+
1798
+ for future in as_completed(futures):
1799
+ result = future.result() # Will raise if failed after retries
1800
+ evals.append(result)
1801
+ completed += 1
1802
+ if progress_callback:
1803
+ progress_callback(completed, total)
1804
+
1805
  # Aggregate overall metrics
1806
  n = len(evals)
1807
+ semantic_scores = [e['semantic_score'] for e in evals]
1808
+
1809
+ # Apply bias correction for semantic accuracy
1810
+ if use_llm_judge:
1811
+ agg = aggregate_anls_star_llm(semantic_scores, apply_bias_correction=True)
1812
+ mean_semantic = agg['adjusted_score'] * 100
1813
+ semantic_ci = (agg['ci_lower'] * 100, agg['ci_upper'] * 100)
1814
+ else:
1815
+ mean_semantic = sum(semantic_scores) / n * 100
1816
+ semantic_ci = None
1817
+
1818
  mean_anls = sum(e['anls'] for e in evals) / n * 100
1819
+ accuracy = sum(e['correct'] for e in evals) / n * 100
1820
  mean_doc_f1 = sum(e['doc_f1'] for e in evals) / n * 100
1821
  mean_page_f1 = sum(e['page_f1'] for e in evals) / n * 100
1822
 
 
1829
  cross_doc = [e for e in evals if e['hop_type'] == 'cross_doc']
1830
 
1831
  # By domain
 
1832
  by_domain = defaultdict(list)
1833
  for e in evals:
1834
  domain = e['domain'] or 'Other'
 
1837
  domain_scores = {}
1838
  for domain, domain_evals in sorted(by_domain.items()):
1839
  domain_scores[domain] = {
1840
+ 'semantic': sum(e['semantic_score'] for e in domain_evals) / len(domain_evals) * 100,
1841
  'anls': sum(e['anls'] for e in domain_evals) / len(domain_evals) * 100,
1842
  'n': len(domain_evals)
1843
  }
 
1845
  results = {
1846
  'n_evaluated': n,
1847
  'n_unmatched': len(unmatched),
1848
+ 'unmatched_samples': unmatched[:5],
1849
  'overall': {
1850
+ 'semantic': mean_semantic, # Primary metric (ANLS* + LLM judge)
1851
+ 'semantic_ci': semantic_ci, # 95% CI if LLM judge used
1852
+ 'anls': mean_anls, # Secondary metric (pure ANLS*)
1853
  'accuracy': accuracy,
1854
  'doc_f1': mean_doc_f1,
1855
  'page_f1': mean_page_f1,
1856
  'kuiper': kuiper['kuiper_stat'] if not kuiper.get('degenerate') else None,
1857
  },
1858
  'single_evidence': {
1859
+ 'semantic': sum(e['semantic_score'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
1860
  'anls': sum(e['anls'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
1861
  'n': len(single_hop)
1862
  },
1863
  'multi_evidence_same_doc': {
1864
+ 'semantic': sum(e['semantic_score'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
1865
  'anls': sum(e['anls'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
1866
  'n': len(cross_page)
1867
  },
1868
  'multi_evidence_multi_doc': {
1869
+ 'semantic': sum(e['semantic_score'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
1870
  'anls': sum(e['anls'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
1871
  'n': len(cross_doc)
1872
  },
1873
+ 'by_domain': domain_scores,
1874
+ 'used_llm_judge': use_llm_judge
1875
  }
1876
 
1877
  return results
 
1973
 
1974
  # Evaluate button
1975
  if st.button("Run Evaluation", type="primary"):
1976
+ with st.spinner("Loading gold standard..."):
1977
  gold_by_text, gold_by_id = load_gold_standard()
1978
+
1979
+ if not gold_by_text:
1980
+ st.error("Failed to load gold standard dataset")
1981
+ else:
1982
+ # Progress bar for evaluation
1983
+ progress_bar = st.progress(0, text="Evaluating predictions with semantic accuracy...")
1984
+ status_text = st.empty()
1985
 
1986
+ def update_progress(current, total):
1987
+ progress_bar.progress(current / total, text=f"Evaluating {current}/{total}...")
1988
+
1989
+ results = evaluate_predictions(
1990
+ predictions,
1991
+ gold_by_text,
1992
+ gold_by_id,
1993
+ use_llm_judge=True,
1994
+ progress_callback=update_progress
1995
+ )
1996
+
1997
+ progress_bar.empty()
1998
+ status_text.empty()
1999
+ st.session_state.eval_results = results
2000
 
2001
  # Show evaluation results
2002
  if st.session_state.eval_results:
 
2007
  else:
2008
  st.markdown("#### Evaluation Results")
2009
 
2010
+ # Summary metrics - use semantic accuracy as primary if available
2011
  col1, col2, col3, col4 = st.columns(4)
2012
  with col1:
2013
+ if 'semantic' in results['overall']:
2014
+ ci = results['overall'].get('semantic_ci')
2015
+ ci_text = f" [{ci[0]:.1f}-{ci[1]:.1f}]" if ci else ""
2016
+ st.metric("Accuracy (LLM judge)", f"{results['overall']['semantic']:.1f}{ci_text}")
2017
+ else:
2018
+ st.metric("Accuracy (ANLS*)", f"{results['overall']['anls']:.1f}")
2019
  with col2:
2020
  st.metric("Attribution (Page F1)", f"{results['overall']['page_f1']:.1f}")
2021
  with col3:
 
2026
 
2027
  # Detailed breakdown
2028
  with st.expander("Detailed Breakdown"):
2029
+ # Check which metrics are available
2030
+ has_semantic = 'semantic' in results['overall']
2031
+
2032
+ if has_semantic:
2033
+ st.markdown(f"""
2034
+ | Metric | Value |
2035
+ |--------|-------|
2036
+ | **Accuracy (LLM judge)** | {results['overall']['semantic']:.1f} |
2037
+ | **ANLS*** (string match) | {results['overall']['anls']:.1f} |
2038
+ | **Acc. Single-Hop** (n={results['single_evidence']['n']}) | {results['single_evidence'].get('semantic', results['single_evidence']['anls']):.1f} |
2039
+ | **Acc. Cross-Page** (n={results['multi_evidence_same_doc']['n']}) | {results['multi_evidence_same_doc'].get('semantic', results['multi_evidence_same_doc']['anls']):.1f} |
2040
+ | **Acc. Cross-Doc** (n={results['multi_evidence_multi_doc']['n']}) | {results['multi_evidence_multi_doc'].get('semantic', results['multi_evidence_multi_doc']['anls']):.1f} |
2041
+ | **Attribution (Doc F1)** | {results['overall']['doc_f1']:.1f} |
2042
+ | **Attribution (Page F1)** | {results['overall']['page_f1']:.1f} |
2043
+ """)
2044
+ else:
2045
+ st.markdown(f"""
2046
+ | Metric | Value |
2047
+ |--------|-------|
2048
+ | **Overall ANLS*** | {results['overall']['anls']:.1f} |
2049
+ | **Acc. Single-Hop** (n={results['single_evidence']['n']}) | {results['single_evidence']['anls']:.1f} |
2050
+ | **Acc. Cross-Page** (n={results['multi_evidence_same_doc']['n']}) | {results['multi_evidence_same_doc']['anls']:.1f} |
2051
+ | **Acc. Cross-Doc** (n={results['multi_evidence_multi_doc']['n']}) | {results['multi_evidence_multi_doc']['anls']:.1f} |
2052
+ | **Attribution (Doc F1)** | {results['overall']['doc_f1']:.1f} |
2053
+ | **Attribution (Page F1)** | {results['overall']['page_f1']:.1f} |
2054
+ """)
2055
 
2056
  if results['n_unmatched'] > 0:
2057
  with st.expander(f"{results['n_unmatched']} unmatched questions"):
 
2521
  # COLUMN SELECTOR - chips use SNOWFLAKE_BLUE (lighter, gradient end)
2522
  # Mapping: short chip name -> full column name
2523
  COLUMN_CHIP_NAMES = {
2524
+ "Accuracy": "Accuracy (LLM judge)",
2525
  "Acc. Single-Hop": "Acc. Single-Hop",
2526
  "Acc. Cross-Page": "Acc. Cross-Page",
2527
  "Acc. Cross-Doc": "Acc. Cross-Doc",
2528
+ "ANLS*": "ANLS* (string)",
2529
  "Attribution": "Attribution (Page F1)",
2530
  "Attribution (Doc)": "Attribution (Doc F1)",
2531
  "Effort": "Effort (Kuiper)",
 
2540
  # Model and Organization are always visible (not in selector)
2541
  always_visible = ["Model", "Organization"]
2542
  # Hidden columns (used internally but not shown as separate columns)
2543
+ hidden_cols = ["Link", "Submission Date", "Description", "_by_domain", "_Accuracy_CI", "_Accuracy_SE"]
2544
  # Full column names that are optional (Tags moved to end)
2545
  optional_full_cols = [c for c in all_columns if c not in hidden_cols + always_visible and c != "Tags"]
2546
  optional_full_cols.append("Tags") # Add Tags at the end
 
2713
  st.markdown("""
2714
  #### Metrics
2715
 
2716
+ ##### Accuracy (LLM judge)
2717
+ - **Accuracy (LLM judge)**: Primary metric combining ANLS* string matching with an LLM judge (G-Eval framework). Captures semantic correctness beyond exact string matching, with statistical bias correction
2718
+ - **ANLS* (string)**: Pure string-based score using Average Normalized Levenshtein Similarity with optimal element alignment for lists/sets
2719
  - **Acc. Single-Hop**: Accuracy on questions requiring a single evidence page
2720
  - **Acc. Cross-Page**: Accuracy on multi-hop questions within the same document
2721
  - **Acc. Cross-Doc**: Accuracy on multi-hop questions spanning multiple documents
eval/batch_reevaluate.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Batch re-evaluate all submissions with the new Semantic Accuracy metric.
4
+
5
+ This script downloads all prediction files from HuggingFace Hub and re-evaluates
6
+ them with the ANLS* + LLM judge metric.
7
+
8
+ Usage:
9
+ # Dry run - list files only
10
+ python batch_reevaluate.py --dry-run
11
+
12
+ # Re-evaluate all files
13
+ python batch_reevaluate.py
14
+
15
+ # Re-evaluate specific organization
16
+ python batch_reevaluate.py --org OpenAI
17
+
18
+ # Upload results after review
19
+ python batch_reevaluate.py --upload
20
+ """
21
+
22
+ import json
23
+ import os
24
+ import sys
25
+ import time
26
+ from collections import defaultdict
27
+ from concurrent.futures import ThreadPoolExecutor, as_completed
28
+ from datetime import datetime, timezone
29
+ from pathlib import Path
30
+
31
+ from huggingface_hub import HfApi, hf_hub_download, list_repo_files
32
+ from datasets import load_dataset
33
+
34
+ # Add parent for imports
35
+ sys.path.insert(0, str(Path(__file__).parent))
36
+ from metrics import (
37
+ anls_star,
38
+ anls_star_llm,
39
+ aggregate_anls_star_llm,
40
+ citation_f1,
41
+ kuiper_statistic
42
+ )
43
+
44
+ # Parallelization config
45
+ MAX_WORKERS = 24
46
+
47
+ # Config
48
+ RESULTS_REPO = "agentic-document-ai/backend-results"
49
+ TOKEN = os.environ.get("HF_TOKEN")
50
+ OUTPUT_DIR = Path(__file__).parent / "reevaluated_results"
51
+
52
+
53
def load_gold_data():
    """Load the gold standard test split from HuggingFace.

    Returns:
        Tuple (gold_by_id, gold_by_text): the same per-question record
        dicts, indexed by question ID and by stripped question text.
    """
    print("Loading gold standard...")
    test_split = load_dataset("agentic-document-ai/dataset-PRIVATE", split="test")

    gold_by_id = {}
    gold_by_text = {}

    for example in test_split:
        question_text = example['question'].strip()
        record = {
            'question': question_text,
            'answers': example.get('answer_variants', []),
            'evidence': example.get('evidence', []),
            'category': example.get('document_category', ''),
            'domain': example.get('domain', ''),
            'hop_type': example.get('hop_type', 'single'),
        }
        gold_by_id[example.get('id', '')] = record
        gold_by_text[question_text] = record

    return gold_by_id, gold_by_text
76
+
77
+
78
def find_prediction_files(org_filter: str = None):
    """List all prediction JSONL files in the results repo.

    Args:
        org_filter: If given, keep only files under that organization folder.
    """
    all_files = list_repo_files(RESULTS_REPO, repo_type="dataset", token=TOKEN)

    matches = []
    for path in all_files:
        if not path.endswith('.jsonl') or '_predictions' not in path:
            continue
        if org_filter and not path.startswith(org_filter + '/'):
            continue
        matches.append(path)
    return matches
87
+
88
+
89
def find_result_file(pred_file: str):
    """Map a predictions file path to its corresponding results JSON path.

    Pattern: {org}/{model}_predictions_{ts}.jsonl -> {org}/{model}_results_{ts}.json

    Returns:
        The results path, or None when the name lacks a '_predictions_' marker.
    """
    head, sep, tail = pred_file.rpartition('_predictions_')
    if not sep:
        return None
    return head + '_results_' + tail.replace('.jsonl', '.json')
97
+
98
+
99
def download_file(filepath: str) -> str:
    """Fetch one file from the results repo; returns the local cache path."""
    local_path = hf_hub_download(
        repo_id=RESULTS_REPO,
        repo_type="dataset",
        filename=filepath,
        token=TOKEN,
    )
    return local_path
107
+
108
+
109
def _evaluate_single_prediction(args, max_retries=3):
    """Score one (idx, prediction, gold) triple; safe to call from worker threads.

    String-based metrics are computed locally; the LLM-judge call is retried
    with exponential backoff and the last error is re-raised once retries
    are exhausted.
    """
    idx, pred, gold_data = args

    answer = pred.get('answer', '')
    question = pred.get('question', '').strip()
    citations = pred.get('citations', [])
    search_history = pred.get('search_history', [])
    steps = len(search_history) if search_history else pred.get('iterations', 0)

    # Cheap, deterministic metrics first — no retry needed.
    anls = anls_star(answer, gold_data['answers'])
    doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
    page_f1 = citation_f1(citations, gold_data['evidence'], level='page')

    # LLM judge with retry + exponential backoff on any failure.
    semantic_score = None
    for attempt in range(max_retries):
        try:
            semantic_score = anls_star_llm(answer, gold_data['answers'], question)['score']
            break
        except Exception as e:
            if attempt >= max_retries - 1:
                print(f"    Failed item {idx} after {max_retries} retries: {e}")
                raise
            print(f"    Item {idx} attempt {attempt+1} failed: {e}, retrying...")
            time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...

    return {
        'idx': idx,
        'anls': anls,
        'semantic_score': semantic_score,
        'correct': semantic_score >= 0.5,
        'doc_f1': doc_f1['f1'],
        'page_f1': page_f1['f1'],
        'steps': steps,
        'hop_type': gold_data.get('hop_type', 'single'),
        'category': gold_data['category'],
        'domain': gold_data['domain']
    }
150
+
151
+
152
def evaluate_with_semantic(predictions: list, gold_by_id: dict, gold_by_text: dict) -> dict:
    """Evaluate predictions with the semantic accuracy metric (parallelized).

    Matches each prediction to the gold set (by question text first, then by
    question ID), scores items in parallel via `_evaluate_single_prediction`,
    and aggregates overall / per-hop-type / per-domain metrics with bias
    correction for the LLM judge.

    Args:
        predictions: Prediction dicts with 'question', 'answer', 'citations'.
        gold_by_id: Gold records indexed by question ID.
        gold_by_text: Gold records indexed by stripped question text.

    Returns:
        Aggregated results dict, or None if nothing could be evaluated.
    """
    # BUGFIX: on Python < 3.11, concurrent.futures raises its own
    # TimeoutError which is NOT the builtin — `except TimeoutError` there
    # silently missed it, disabling both timeout handlers below. Use the
    # futures alias explicitly (on 3.11+ the two classes are the same).
    from concurrent.futures import TimeoutError as FuturesTimeout

    # First, filter predictions to only those in the test set.
    matched_predictions = []
    for pred in predictions:
        question = pred.get('question', '').strip()
        qid = pred.get('id', '')

        gold_data = gold_by_text.get(question)
        if gold_data is None and qid:
            gold_data = gold_by_id.get(qid)

        if gold_data:
            matched_predictions.append((pred, gold_data))

    unmatched = len(predictions) - len(matched_predictions)
    print(f"  Matched {len(matched_predictions)}/{len(predictions)} predictions to test set (skipping {unmatched})")

    total = len(matched_predictions)
    evals = []
    completed = 0

    # Prepare items with index for tracking.
    items_with_idx = [(i, pred, gold) for i, (pred, gold) in enumerate(matched_predictions)]

    # Parallel evaluation with ThreadPoolExecutor.
    print(f"  Evaluating with {MAX_WORKERS} parallel workers...")
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = {executor.submit(_evaluate_single_prediction, item): item[0]
                   for item in items_with_idx}

        completed_indices = set()
        try:
            for future in as_completed(futures, timeout=600):  # 10 min overall timeout
                try:
                    result = future.result(timeout=120)  # 2 min per item max
                    evals.append(result)
                    completed_indices.add(result['idx'])
                    completed += 1
                    if completed % 50 == 0 or completed == total:
                        print(f"    Progress: {completed}/{total}")
                except FuturesTimeout:
                    idx = futures[future]
                    print(f"    TIMEOUT: Item {idx} took too long, skipping")
                    completed += 1
        except FuturesTimeout:
            # Overall deadline hit — report which items never finished.
            pending = set(range(total)) - completed_indices
            print(f"  OVERALL TIMEOUT: {len(pending)} items still pending: {sorted(pending)[:10]}...")
            # Cancel remaining futures (only un-started ones can be cancelled).
            for future in futures:
                future.cancel()

    if not evals:
        return None

    # Aggregate overall metrics.
    n = len(evals)
    semantic_scores = [e['semantic_score'] for e in evals]

    # Bias correction for the LLM-judge scores (G-Eval style aggregation).
    agg = aggregate_anls_star_llm(semantic_scores, apply_bias_correction=True)

    mean_anls = sum(e['anls'] for e in evals) / n * 100
    mean_doc_f1 = sum(e['doc_f1'] for e in evals) / n * 100
    mean_page_f1 = sum(e['page_f1'] for e in evals) / n * 100

    kuiper = kuiper_statistic(evals)

    # By hop type.
    single_hop = [e for e in evals if e['hop_type'] == 'single']
    cross_page = [e for e in evals if e['hop_type'] == 'cross_page']
    cross_doc = [e for e in evals if e['hop_type'] == 'cross_doc']

    # By domain.
    by_domain = defaultdict(list)
    for e in evals:
        domain = e['domain'] or 'Other'
        by_domain[domain].append(e)

    domain_scores = {}
    for domain, domain_evals in sorted(by_domain.items()):
        domain_scores[domain] = {
            'semantic': sum(e['semantic_score'] for e in domain_evals) / len(domain_evals) * 100,
            'anls': sum(e['anls'] for e in domain_evals) / len(domain_evals) * 100,
            'n': len(domain_evals)
        }

    return {
        'overall': {
            'semantic': agg['adjusted_score'] * 100,
            'semantic_ci': (agg['ci_lower'] * 100, agg['ci_upper'] * 100),  # 95% CI
            'anls': mean_anls,
            'page_f1': mean_page_f1,
            'doc_f1': mean_doc_f1,
            'kuiper': kuiper['kuiper_stat'] if not kuiper.get('degenerate') else None,
        },
        'single_evidence': {
            'semantic': sum(e['semantic_score'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
            'anls': sum(e['anls'] for e in single_hop) / len(single_hop) * 100 if single_hop else 0,
            'n': len(single_hop)
        },
        'multi_evidence_same_doc': {
            'semantic': sum(e['semantic_score'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
            'anls': sum(e['anls'] for e in cross_page) / len(cross_page) * 100 if cross_page else 0,
            'n': len(cross_page)
        },
        'multi_evidence_multi_doc': {
            'semantic': sum(e['semantic_score'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
            'anls': sum(e['anls'] for e in cross_doc) / len(cross_doc) * 100 if cross_doc else 0,
            'n': len(cross_doc)
        },
        'by_domain': domain_scores,
        'n_evaluated': n,
        'n_unmatched': unmatched
    }
270
+
271
+
272
def main():
    """CLI entry point: re-evaluate prediction files and save/upload results.

    Modes:
        --dry-run        list candidate prediction files and exit
        --upload         upload previously saved results, no re-evaluation
        --org ORG        restrict to one organization folder
        --skip-existing  skip files whose result JSON already exists locally
    """
    import argparse
    parser = argparse.ArgumentParser(description="Batch re-evaluate submissions")
    parser.add_argument('--dry-run', action='store_true', help="List files only, don't evaluate")
    parser.add_argument('--org', type=str, help="Filter by organization (e.g., 'OpenAI')")
    parser.add_argument('--upload', action='store_true', help="Upload already processed results to HuggingFace Hub (no re-evaluation)")
    parser.add_argument('--skip-existing', action='store_true', help="Skip already evaluated files")
    args = parser.parse_args()

    OUTPUT_DIR.mkdir(exist_ok=True)

    # Upload-only mode: just upload existing files.
    if args.upload:
        print("Uploading existing results to HuggingFace Hub...")
        api = HfApi()
        result_files = list(OUTPUT_DIR.glob("**/*.json"))
        print(f"Found {len(result_files)} result files to upload")

        for result_file in result_files:
            rel_path = result_file.relative_to(OUTPUT_DIR)
            print(f"  Uploading: {rel_path}")
            try:
                api.upload_file(
                    path_or_fileobj=str(result_file),
                    path_in_repo=str(rel_path),
                    repo_id=RESULTS_REPO,
                    repo_type="dataset",
                    token=TOKEN,
                    commit_message=f"Re-evaluate with semantic accuracy: {rel_path.stem}"
                )
                print(f"    ✓ Done")
            except Exception as e:
                print(f"    ✗ Error: {e}")
        print("\nUpload complete!")
        return

    # Find prediction files.
    print("Finding prediction files...")
    pred_files = find_prediction_files(args.org)
    print(f"Found {len(pred_files)} prediction files")

    if args.dry_run:
        for f in pred_files:
            print(f"  - {f}")
        return

    # Load gold standard.
    gold_by_id, gold_by_text = load_gold_data()
    print(f"Loaded {len(gold_by_id)} gold examples")

    # Process each file.
    for i, pred_file in enumerate(pred_files):
        print(f"\n{'='*60}")
        print(f"[{i+1}/{len(pred_files)}] Processing: {pred_file}")
        print('='*60)

        # Compute the output path exactly as it will be saved below:
        # {OUTPUT_DIR}/{org}/{name}_results_{ts}.json
        # BUGFIX: the skip check previously probed a '*_reevaluated.json'
        # path directly under OUTPUT_DIR, which is never where results are
        # written, so --skip-existing re-evaluated everything.
        org = Path(pred_file).parts[0] if '/' in pred_file else 'Unknown'
        output_filename = Path(pred_file).name.replace('_predictions', '_results').replace('.jsonl', '.json')
        output_file = OUTPUT_DIR / org / output_filename
        if args.skip_existing and output_file.exists():
            print("  Skipping (already processed)")
            continue

        try:
            # Download predictions.
            print("  Downloading predictions...")
            local_pred = download_file(pred_file)

            predictions = []
            with open(local_pred) as f:
                for line in f:
                    if line.strip():
                        predictions.append(json.loads(line))
            print(f"  Loaded {len(predictions)} predictions")

            # Download original results to preserve submission metadata.
            result_file = find_result_file(pred_file)
            original_metadata = {}
            if result_file:
                try:
                    local_result = download_file(result_file)
                    with open(local_result) as f:
                        original_data = json.load(f)
                    original_metadata = {
                        'model_name': original_data.get('model_name'),
                        'organization': original_data.get('organization'),
                        'description': original_data.get('description'),
                        'link': original_data.get('link'),
                        'tags': original_data.get('tags'),
                        'submitted_by': original_data.get('submitted_by'),
                        'metadata': original_data.get('metadata'),
                        'submission_date': original_data.get('submission_date'),
                    }
                    print(f"  Loaded metadata: model_name={original_metadata.get('model_name')}")
                except Exception as e:
                    print(f"  Warning: Could not load original results: {e}")

            # Fallback: derive metadata from the filename if not found.
            if not original_metadata.get('model_name'):
                # Pattern: Org/Model_Name_with_Stuff_predictions_timestamp.jsonl
                filename = Path(pred_file).stem
                parts = filename.rsplit('_predictions_', 1)
                # BUGFIX: rsplit always returns a non-empty list, so the old
                # `if parts:` guard was always true; require an actual split.
                if len(parts) == 2:
                    model_name = parts[0].replace('_', ' ')
                    original_metadata = {
                        'model_name': model_name,
                        'organization': org.replace('_', ' '),
                        'description': '',
                        'tags': ['Agentic'],
                        'metadata': {'model_type': 'unknown'},
                    }
                    print(f"  Using fallback metadata: model_name={model_name}, org={org}")

            # Evaluate.
            print("  Evaluating with semantic accuracy...")
            start_time = time.time()
            results = evaluate_with_semantic(predictions, gold_by_id, gold_by_text)
            elapsed = time.time() - start_time

            if results:
                print(f"\n  Results (took {elapsed:.1f}s):")
                print(f"    Semantic Accuracy: {results['overall']['semantic']:.1f}")
                print(f"    ANLS*: {results['overall']['anls']:.1f}")
                print(f"    Page F1: {results['overall']['page_f1']:.1f}")

                # Save with original metadata.
                full_result = {
                    **original_metadata,
                    'results': results,
                    'reevaluated_date': datetime.now(timezone.utc).isoformat(),
                    'source_predictions_file': pred_file,
                    'result_file_path': f"{org}/{output_filename}",
                }

                # Create org subfolder (parents=True so a fresh checkout works).
                output_file.parent.mkdir(parents=True, exist_ok=True)
                with open(output_file, 'w') as f:
                    json.dump(full_result, f, indent=2)
                print(f"  Saved to: {output_file}")
            else:
                print("  No valid evaluations")

        except Exception as e:
            print(f"  Error: {e}")
            import traceback
            traceback.print_exc()
            continue

    print(f"\n{'='*60}")
    print("DONE!")
    print(f"Results saved to: {OUTPUT_DIR}")
    print(f"\nTo upload results, run: python batch_reevaluate.py --upload")


if __name__ == "__main__":
    main()
434
+
eval/evaluate.py CHANGED
@@ -18,7 +18,14 @@ from typing import Any, Dict, List, Optional, Tuple
18
 
19
  from datasets import load_dataset
20
 
21
- from metrics import anls_star, citation_f1, kuiper_statistic, wasted_effort_ratio
 
 
 
 
 
 
 
22
 
23
 
24
  def derive_hop_type(evidence: list) -> str:
@@ -106,11 +113,18 @@ def load_results(filepath: Path) -> List[Dict]:
106
  def evaluate_single(
107
  result: Dict,
108
  gold_by_text: Dict[str, Dict],
109
- gold_by_id: Dict[str, Dict]
 
110
  ) -> Optional[Dict[str, Any]]:
111
  """Evaluate a single prediction.
112
 
113
  Matches by question text first, falls back to question ID if not found.
 
 
 
 
 
 
114
  """
115
  question = result.get('question', '').strip()
116
  qid = result.get('id', '')
@@ -128,7 +142,15 @@ def evaluate_single(
128
 
129
  # ANLS*
130
  anls = anls_star(answer, gold_data['answers'])
131
- correct = anls >= 0.5
 
 
 
 
 
 
 
 
132
 
133
  # Citation F1
134
  doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
@@ -141,6 +163,7 @@ def evaluate_single(
141
  return {
142
  'question': question,
143
  'anls': anls,
 
144
  'correct': correct,
145
  'doc_f1': doc_f1['f1'],
146
  'page_f1': page_f1['f1'],
@@ -151,7 +174,7 @@ def evaluate_single(
151
  }
152
 
153
 
154
- def aggregate_metrics(evals: List[Dict]) -> Dict[str, Any]:
155
  """Aggregate metrics across evaluations."""
156
  if not evals:
157
  return {}
@@ -162,6 +185,16 @@ def aggregate_metrics(evals: List[Dict]) -> Dict[str, Any]:
162
  mean_doc_f1 = sum(e['doc_f1'] for e in evals) / n
163
  mean_page_f1 = sum(e['page_f1'] for e in evals) / n
164
 
 
 
 
 
 
 
 
 
 
 
165
  # Kuiper
166
  kuiper = kuiper_statistic(evals)
167
  wasted = wasted_effort_ratio(evals)
@@ -170,6 +203,8 @@ def aggregate_metrics(evals: List[Dict]) -> Dict[str, Any]:
170
  'n': n,
171
  'accuracy': accuracy,
172
  'mean_anls': mean_anls,
 
 
173
  'doc_f1': mean_doc_f1,
174
  'page_f1': mean_page_f1,
175
  'kuiper_stat': kuiper['kuiper_stat'],
@@ -180,7 +215,7 @@ def aggregate_metrics(evals: List[Dict]) -> Dict[str, Any]:
180
  }
181
 
182
 
183
- def print_metrics(name: str, metrics: Dict, indent: int = 0):
184
  """Print metrics in a formatted way."""
185
  prefix = " " * indent
186
 
@@ -189,8 +224,16 @@ def print_metrics(name: str, metrics: Dict, indent: int = 0):
189
  return
190
 
191
  print(f"{prefix}{name} (n={metrics['n']}):")
192
- print(f"{prefix} Accuracy (ANLS*≥0.5): {metrics['accuracy']:.1%}")
193
- print(f"{prefix} Mean ANLS*: {metrics['mean_anls']:.4f}")
 
 
 
 
 
 
 
 
194
  print(f"{prefix} Document F1: {metrics['doc_f1']:.4f}")
195
  print(f"{prefix} Page F1: {metrics['page_f1']:.4f}")
196
 
@@ -207,16 +250,20 @@ def evaluate_file(
207
  gold_by_id: Dict[str, Dict],
208
  by_category: bool = False,
209
  by_domain: bool = False,
210
- by_hop_type: bool = True
 
211
  ) -> Dict[str, Any]:
212
  """Evaluate a single results file."""
213
  results = load_results(filepath)
214
 
215
  evals = []
216
  unmatched = 0
 
217
 
218
- for result in results:
219
- ev = evaluate_single(result, gold_by_text, gold_by_id)
 
 
220
  if ev:
221
  evals.append(ev)
222
  else:
@@ -226,30 +273,30 @@ def evaluate_file(
226
  print(f" Warning: {unmatched} questions not found in gold standard")
227
 
228
  # Overall metrics
229
- overall = aggregate_metrics(evals)
230
 
231
- output = {'overall': overall}
232
 
233
  # By hop type (always included by default)
234
  if by_hop_type:
235
  by_hop = defaultdict(list)
236
  for e in evals:
237
  by_hop[e.get('hop_type', 'single')].append(e)
238
- output['by_hop_type'] = {hop: aggregate_metrics(items) for hop, items in sorted(by_hop.items())}
239
 
240
  # By category
241
  if by_category:
242
  by_cat = defaultdict(list)
243
  for e in evals:
244
  by_cat[e['category'] or 'Unknown'].append(e)
245
- output['by_category'] = {cat: aggregate_metrics(items) for cat, items in sorted(by_cat.items())}
246
 
247
  # By domain
248
  if by_domain:
249
  by_dom = defaultdict(list)
250
  for e in evals:
251
  by_dom[e['domain'] or 'Other'].append(e)
252
- output['by_domain'] = {dom: aggregate_metrics(items) for dom, items in sorted(by_dom.items())}
253
 
254
  return output
255
 
@@ -273,6 +320,8 @@ Examples:
273
  parser.add_argument('--by-domain', action='store_true', help='Show metrics by domain')
274
  parser.add_argument('--compare', action='store_true', help='Compare multiple models side-by-side')
275
  parser.add_argument('--json', action='store_true', help='Output as JSON')
 
 
276
 
277
  args = parser.parse_args()
278
 
@@ -298,7 +347,13 @@ Examples:
298
  name = name[:-8]
299
 
300
  print(f"\nEvaluating: {filepath.name}")
301
- result = evaluate_file(filepath, gold_by_text, gold_by_id, args.by_category, args.by_domain)
 
 
 
 
 
 
302
  all_results[name] = result
303
 
304
  # Output
@@ -324,31 +379,42 @@ Examples:
324
  # Comparison table
325
  models = list(all_results.keys())
326
 
327
- print(f"\n{'Model':<35} {'Acc':<8} {'ANLS*':<8} {'Doc F1':<8} {'Page F1':<8} {'Kuiper':<8}")
328
- print("-" * 75)
329
-
330
- for model in sorted(models, key=lambda m: -all_results[m]['overall'].get('accuracy', 0)):
331
- m = all_results[model]['overall']
332
- kuiper_str = f"{m['kuiper_stat']:.2f}" if not m.get('kuiper_degenerate') else "N/A"
333
- print(f"{model:<35} {m.get('accuracy', 0):.1%} {m.get('mean_anls', 0):.4f} "
334
- f"{m.get('doc_f1', 0):.4f} {m.get('page_f1', 0):.4f} {kuiper_str}")
 
 
 
 
 
 
 
 
 
 
335
  else:
336
  # Detailed per-model output
337
  for model, result in all_results.items():
338
  print(f"\n{'─' * 40}")
339
- print_metrics(model, result['overall'])
 
340
 
341
  if 'by_category' in result:
342
  print(f"\n By Category:")
343
  for cat, metrics in sorted(result['by_category'].items(),
344
  key=lambda x: -x[1].get('n', 0)):
345
- print_metrics(cat, metrics, indent=2)
346
 
347
  if 'by_domain' in result:
348
  print(f"\n By Domain:")
349
  for dom, metrics in sorted(result['by_domain'].items(),
350
  key=lambda x: -x[1].get('n', 0)):
351
- print_metrics(dom, metrics, indent=2)
352
 
353
  print()
354
 
 
18
 
19
  from datasets import load_dataset
20
 
21
+ from metrics import (
22
+ anls_star,
23
+ anls_star_llm,
24
+ aggregate_anls_star_llm,
25
+ citation_f1,
26
+ kuiper_statistic,
27
+ wasted_effort_ratio
28
+ )
29
 
30
 
31
  def derive_hop_type(evidence: list) -> str:
 
113
  def evaluate_single(
114
  result: Dict,
115
  gold_by_text: Dict[str, Dict],
116
+ gold_by_id: Dict[str, Dict],
117
+ use_semantic: bool = False
118
  ) -> Optional[Dict[str, Any]]:
119
  """Evaluate a single prediction.
120
 
121
  Matches by question text first, falls back to question ID if not found.
122
+
123
+ Args:
124
+ result: Prediction dict with 'question', 'answer', 'citations'
125
+ gold_by_text: Gold data indexed by question text
126
+ gold_by_id: Gold data indexed by question ID
127
+ use_semantic: If True, also compute semantic accuracy with LLM judge
128
  """
129
  question = result.get('question', '').strip()
130
  qid = result.get('id', '')
 
142
 
143
  # ANLS*
144
  anls = anls_star(answer, gold_data['answers'])
145
+
146
+ # Semantic accuracy with LLM judge (if enabled)
147
+ if use_semantic:
148
+ llm_result = anls_star_llm(answer, gold_data['answers'], question)
149
+ semantic = llm_result['score']
150
+ correct = semantic >= 0.5
151
+ else:
152
+ semantic = anls
153
+ correct = anls >= 0.5
154
 
155
  # Citation F1
156
  doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
 
163
  return {
164
  'question': question,
165
  'anls': anls,
166
+ 'semantic': semantic,
167
  'correct': correct,
168
  'doc_f1': doc_f1['f1'],
169
  'page_f1': page_f1['f1'],
 
174
  }
175
 
176
 
177
+ def aggregate_metrics(evals: List[Dict], use_semantic: bool = False) -> Dict[str, Any]:
178
  """Aggregate metrics across evaluations."""
179
  if not evals:
180
  return {}
 
185
  mean_doc_f1 = sum(e['doc_f1'] for e in evals) / n
186
  mean_page_f1 = sum(e['page_f1'] for e in evals) / n
187
 
188
+ # Semantic accuracy with bias correction
189
+ if use_semantic and 'semantic' in evals[0]:
190
+ semantic_scores = [e['semantic'] for e in evals]
191
+ agg = aggregate_anls_star_llm(semantic_scores, apply_bias_correction=True)
192
+ mean_semantic = agg['adjusted_score']
193
+ semantic_ci = (agg['ci_lower'], agg['ci_upper'])
194
+ else:
195
+ mean_semantic = mean_anls
196
+ semantic_ci = None
197
+
198
  # Kuiper
199
  kuiper = kuiper_statistic(evals)
200
  wasted = wasted_effort_ratio(evals)
 
203
  'n': n,
204
  'accuracy': accuracy,
205
  'mean_anls': mean_anls,
206
+ 'mean_semantic': mean_semantic,
207
+ 'semantic_ci': semantic_ci,
208
  'doc_f1': mean_doc_f1,
209
  'page_f1': mean_page_f1,
210
  'kuiper_stat': kuiper['kuiper_stat'],
 
215
  }
216
 
217
 
218
+ def print_metrics(name: str, metrics: Dict, indent: int = 0, use_semantic: bool = False):
219
  """Print metrics in a formatted way."""
220
  prefix = " " * indent
221
 
 
224
  return
225
 
226
  print(f"{prefix}{name} (n={metrics['n']}):")
227
+
228
+ if use_semantic and 'mean_semantic' in metrics:
229
+ ci = metrics.get('semantic_ci')
230
+ ci_str = f" [{ci[0]:.2%}-{ci[1]:.2%}]" if ci else ""
231
+ print(f"{prefix} Semantic Accuracy: {metrics['mean_semantic']:.2%}{ci_str}")
232
+ print(f"{prefix} ANLS* (string): {metrics['mean_anls']:.4f}")
233
+ else:
234
+ print(f"{prefix} Accuracy (ANLS*≥0.5): {metrics['accuracy']:.1%}")
235
+ print(f"{prefix} Mean ANLS*: {metrics['mean_anls']:.4f}")
236
+
237
  print(f"{prefix} Document F1: {metrics['doc_f1']:.4f}")
238
  print(f"{prefix} Page F1: {metrics['page_f1']:.4f}")
239
 
 
250
  gold_by_id: Dict[str, Dict],
251
  by_category: bool = False,
252
  by_domain: bool = False,
253
+ by_hop_type: bool = True,
254
+ use_semantic: bool = False
255
  ) -> Dict[str, Any]:
256
  """Evaluate a single results file."""
257
  results = load_results(filepath)
258
 
259
  evals = []
260
  unmatched = 0
261
+ total = len(results)
262
 
263
+ for i, result in enumerate(results):
264
+ if use_semantic and (i + 1) % 50 == 0:
265
+ print(f" Processing {i+1}/{total}...")
266
+ ev = evaluate_single(result, gold_by_text, gold_by_id, use_semantic=use_semantic)
267
  if ev:
268
  evals.append(ev)
269
  else:
 
273
  print(f" Warning: {unmatched} questions not found in gold standard")
274
 
275
  # Overall metrics
276
+ overall = aggregate_metrics(evals, use_semantic=use_semantic)
277
 
278
+ output = {'overall': overall, 'use_semantic': use_semantic}
279
 
280
  # By hop type (always included by default)
281
  if by_hop_type:
282
  by_hop = defaultdict(list)
283
  for e in evals:
284
  by_hop[e.get('hop_type', 'single')].append(e)
285
+ output['by_hop_type'] = {hop: aggregate_metrics(items, use_semantic) for hop, items in sorted(by_hop.items())}
286
 
287
  # By category
288
  if by_category:
289
  by_cat = defaultdict(list)
290
  for e in evals:
291
  by_cat[e['category'] or 'Unknown'].append(e)
292
+ output['by_category'] = {cat: aggregate_metrics(items, use_semantic) for cat, items in sorted(by_cat.items())}
293
 
294
  # By domain
295
  if by_domain:
296
  by_dom = defaultdict(list)
297
  for e in evals:
298
  by_dom[e['domain'] or 'Other'].append(e)
299
+ output['by_domain'] = {dom: aggregate_metrics(items, use_semantic) for dom, items in sorted(by_dom.items())}
300
 
301
  return output
302
 
 
320
  parser.add_argument('--by-domain', action='store_true', help='Show metrics by domain')
321
  parser.add_argument('--compare', action='store_true', help='Compare multiple models side-by-side')
322
  parser.add_argument('--json', action='store_true', help='Output as JSON')
323
+ parser.add_argument('--semantic', action='store_true',
324
+ help='Use semantic accuracy (ANLS* + LLM judge) instead of pure ANLS*. Requires GOOGLE_API_KEY.')
325
 
326
  args = parser.parse_args()
327
 
 
347
  name = name[:-8]
348
 
349
  print(f"\nEvaluating: {filepath.name}")
350
+ if args.semantic:
351
+ print(" Using semantic accuracy (ANLS* + LLM judge)...")
352
+ result = evaluate_file(
353
+ filepath, gold_by_text, gold_by_id,
354
+ args.by_category, args.by_domain,
355
+ use_semantic=args.semantic
356
+ )
357
  all_results[name] = result
358
 
359
  # Output
 
379
  # Comparison table
380
  models = list(all_results.keys())
381
 
382
+ if args.semantic:
383
+ print(f"\n{'Model':<35} {'Semantic':<10} {'ANLS*':<8} {'Doc F1':<8} {'Page F1':<8} {'Kuiper':<8}")
384
+ print("-" * 85)
385
+
386
+ for model in sorted(models, key=lambda m: -all_results[m]['overall'].get('mean_semantic', 0)):
387
+ m = all_results[model]['overall']
388
+ kuiper_str = f"{m['kuiper_stat']:.2f}" if not m.get('kuiper_degenerate') else "N/A"
389
+ print(f"{model:<35} {m.get('mean_semantic', 0):.1%} {m.get('mean_anls', 0):.4f} "
390
+ f"{m.get('doc_f1', 0):.4f} {m.get('page_f1', 0):.4f} {kuiper_str}")
391
+ else:
392
+ print(f"\n{'Model':<35} {'Acc':<8} {'ANLS*':<8} {'Doc F1':<8} {'Page F1':<8} {'Kuiper':<8}")
393
+ print("-" * 75)
394
+
395
+ for model in sorted(models, key=lambda m: -all_results[m]['overall'].get('accuracy', 0)):
396
+ m = all_results[model]['overall']
397
+ kuiper_str = f"{m['kuiper_stat']:.2f}" if not m.get('kuiper_degenerate') else "N/A"
398
+ print(f"{model:<35} {m.get('accuracy', 0):.1%} {m.get('mean_anls', 0):.4f} "
399
+ f"{m.get('doc_f1', 0):.4f} {m.get('page_f1', 0):.4f} {kuiper_str}")
400
  else:
401
  # Detailed per-model output
402
  for model, result in all_results.items():
403
  print(f"\n{'─' * 40}")
404
+ use_sem = result.get('use_semantic', False)
405
+ print_metrics(model, result['overall'], use_semantic=use_sem)
406
 
407
  if 'by_category' in result:
408
  print(f"\n By Category:")
409
  for cat, metrics in sorted(result['by_category'].items(),
410
  key=lambda x: -x[1].get('n', 0)):
411
+ print_metrics(cat, metrics, indent=2, use_semantic=use_sem)
412
 
413
  if 'by_domain' in result:
414
  print(f"\n By Domain:")
415
  for dom, metrics in sorted(result['by_domain'].items(),
416
  key=lambda x: -x[1].get('n', 0)):
417
+ print_metrics(dom, metrics, indent=2, use_semantic=use_sem)
418
 
419
  print()
420
 
eval/metrics.py CHANGED
@@ -3,15 +3,180 @@ Core evaluation metrics for document QA.
3
 
4
  Metrics:
5
  - ANLS*: Answer-level Normalized Levenshtein Similarity
 
6
  - Citation F1: Document-level and Page-level F1 scores
7
  - Kuiper Statistic: Effort-accuracy calibration measure
 
 
 
8
  """
9
 
10
- from typing import Any, Dict, List, Set, Tuple
 
 
 
 
11
  import numpy as np
 
12
  from anls_star import anls_score
13
 
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def anls_star(predicted: Any, ground_truths: List[List[str]]) -> float:
16
  """
17
  Calculate ANLS* score (case-insensitive).
@@ -49,6 +214,340 @@ def anls_star(predicted: Any, ground_truths: List[List[str]]) -> float:
49
  return max_score
50
 
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  def citation_f1(
53
  predicted_citations: List[Dict[str, Any]],
54
  gold_locations: List[Dict[str, Any]],
 
3
 
4
  Metrics:
5
  - ANLS*: Answer-level Normalized Levenshtein Similarity
6
+ - ANLS*+LLM: ANLS* with LLM fallback for semantic equivalence
7
  - Citation F1: Document-level and Page-level F1 scores
8
  - Kuiper Statistic: Effort-accuracy calibration measure
9
+
10
+ Bias Correction:
11
+ Based on "How to Correctly Report LLM-as-a-Judge Evaluations" (2511.21140v2)
12
  """
13
 
14
+ import json
15
+ import os
16
+ import time
17
+ from math import sqrt
18
+ from typing import Any, Dict, List, Optional, Set, Tuple
19
  import numpy as np
20
+ from scipy.stats import norm
21
  from anls_star import anls_score
22
 
23
 
24
# ============================================================================
# LLM Judge Calibration (from human evaluation)
# ============================================================================

# Calibration measured on a 200-sample human-labelled evaluation set.
LLM_JUDGE_SENSITIVITY = 0.980  # q1 = P(LLM says correct | human says correct)
LLM_JUDGE_SPECIFICITY = 1.000  # q0 = P(LLM says incorrect | human says incorrect)
# Calibration split sizes (feed the confidence-interval formulas).
LLM_JUDGE_CALIBRATION_M1 = 152  # calibration samples where human = correct
LLM_JUDGE_CALIBRATION_M0 = 48   # calibration samples where human = incorrect


def bias_adjusted_score(
    raw_score: float,
    q0: float = LLM_JUDGE_SPECIFICITY,
    q1: float = LLM_JUDGE_SENSITIVITY
) -> float:
    """
    Rogan-Gladen bias correction for an LLM-judged score.

    From "How to Correctly Report LLM-as-a-Judge Evaluations":
        theta = (p + q0 - 1) / (q0 + q1 - 1)

    Args:
        raw_score: Raw LLM judgment score (p).
        q0: Specificity - P(LLM=incorrect | true=incorrect).
        q1: Sensitivity - P(LLM=correct | true=correct).

    Returns:
        Bias-adjusted score, clipped to [0, 1]. When q0 + q1 <= 1 the
        judge carries no signal, so the raw score is passed through.
    """
    denom = q0 + q1 - 1
    if denom <= 0:
        # Degenerate judge - no better than random; correction undefined.
        return raw_score
    corrected = (raw_score + q0 - 1) / denom
    return min(1.0, max(0.0, corrected))
63
+
64
+
65
def standard_error(
    raw_score: float,
    n_samples: int,
    q0: float = LLM_JUDGE_SPECIFICITY,
    q1: float = LLM_JUDGE_SENSITIVITY
) -> float:
    """
    Bias-adjusted standard error for an LLM-judge score.

    The plain binomial SE of the raw score is scaled by 1/(q0 + q1 - 1),
    mirroring the Rogan-Gladen point correction.

    Args:
        raw_score: Raw LLM judgment score (p).
        n_samples: Number of test samples.
        q0: Specificity.
        q1: Sensitivity.

    Returns:
        Bias-adjusted standard error; 0.0 for empty samples, a
        degenerate judge, or a raw score at 0 or 1.
    """
    if n_samples <= 0 or q0 + q1 <= 1:
        return 0.0

    # Binomial SE of the observed (uncorrected) proportion.
    se_raw = 0.0
    if 0 < raw_score < 1:
        se_raw = sqrt(raw_score * (1 - raw_score) / n_samples)

    # Propagate through the linear bias correction.
    return se_raw / (q0 + q1 - 1)
97
+
98
+
99
def confidence_interval(
    raw_score: float,
    n_samples: int,
    q0: float = LLM_JUDGE_SPECIFICITY,
    q1: float = LLM_JUDGE_SENSITIVITY,
    m0: int = LLM_JUDGE_CALIBRATION_M0,
    m1: int = LLM_JUDGE_CALIBRATION_M1,
    alpha: float = 0.05
) -> Tuple[float, float]:
    """
    Confidence interval for the bias-adjusted score.

    For a high-quality calibration (q0 + q1 > 1.9) the observed
    sensitivity/specificity are trusted directly and only test-set
    sampling noise enters the interval. Otherwise the full formula is
    used: all three proportions are regularized (Agresti-Coull style)
    and calibration uncertainty is propagated.

    Args:
        raw_score: Raw LLM judgment score (p).
        n_samples: Number of test samples.
        q0: Specificity.
        q1: Sensitivity.
        m0: Calibration samples where human=incorrect.
        m1: Calibration samples where human=correct.
        alpha: Significance level (0.05 gives a 95% CI).

    Returns:
        (lower_bound, upper_bound), each clipped to [0, 1].
    """
    z = norm.ppf(1 - alpha / 2)

    if q0 + q1 > 1.9:
        # High-quality calibration: simple binomial SE on the test set,
        # scaled by the bias-correction factor.
        theta = bias_adjusted_score(raw_score, q0, q1)
        se = 0.0
        if n_samples > 0:
            se = sqrt(raw_score * (1 - raw_score) / n_samples)
        se /= (q0 + q1 - 1)
        return (max(0.0, theta - z * se), min(1.0, theta + z * se))

    # Full formula with regularization for lower-quality calibration.
    p = (n_samples * raw_score + z**2 / 2) / (n_samples + z**2)
    q0_adj = (m0 * q0 + 1) / (m0 + 2)
    q1_adj = (m1 * q1 + 1) / (m1 + 2)

    n_adj = n_samples + z**2
    m0_adj = m0 + 2
    m1_adj = m1 + 2

    if q0_adj + q1_adj <= 1:
        # Judge degenerate even after regularization: no information.
        return (0.0, 1.0)

    theta = (p + q0_adj - 1) / (q0_adj + q1_adj - 1)

    # Second-order bias term of the ratio estimator.
    dth = 2 * z**2 * (
        -(1 - theta) * q0_adj * (1 - q0_adj) / m0_adj
        + theta * q1_adj * (1 - q1_adj) / m1_adj
    )

    # Standard error combining test-set and calibration variance.
    se = sqrt(
        p * (1 - p) / n_adj
        + (1 - theta)**2 * q0_adj * (1 - q0_adj) / m0_adj
        + theta**2 * q1_adj * (1 - q1_adj) / m1_adj
    ) / (q0_adj + q1_adj - 1)

    return (max(0.0, theta + dth - z * se), min(1.0, theta + dth + z * se))
178
+
179
+
180
  def anls_star(predicted: Any, ground_truths: List[List[str]]) -> float:
181
  """
182
  Calculate ANLS* score (case-insensitive).
 
214
  return max_score
215
 
216
 
217
+ # ============================================================================
218
+ # ANLS* + LLM Judge Metric
219
+ # ============================================================================
220
+
221
+ _GEVAL_PROMPT_TEMPLATE = """You are evaluating answer correctness for a Document QA benchmark.
222
+
223
+ ## Input
224
+ Question: {question}
225
+ Predicted Answer: {predicted}
226
+ Gold Answer Variants: {gold_variants}
227
+
228
+ ## Evaluation Criteria
229
+
230
+ **correct**: Predicted answer is semantically equivalent to at least one gold variant. Minor format differences are acceptable.
231
+
232
+ **partial**: Predicted answer contains correct core information but has a significant format issue (e.g., list presented as comma-separated string when items are short/atomic) OR includes irrelevant additions.
233
+
234
+ **incorrect**: Predicted answer is factually wrong, missing, contains different information, or fails to answer the question type (e.g., no Yes/No for binary questions). Missing unit qualifiers that change magnitude (thousands, millions) are incorrect.
235
+
236
+ ## Evaluation Steps
237
+
238
+ Follow these steps in order:
239
+
240
+ Step 1 - Check for refusal: Does the answer refuse or claim inability to answer? If yes → incorrect.
241
+
242
+ Step 2 - Compare content: Does the predicted answer match the core meaning of any gold variant? If content is wrong or different → incorrect.
243
+
244
+ Step 3 - Check critical errors (any of these → incorrect):
245
+ - Missing scale qualifiers that change magnitude: "50" vs "$50 million" → incorrect
246
+ - Binary questions without explicit Yes/No: Q: "Is X true?" A: "X is observed" → incorrect (must say Yes or No)
247
+ - Wrong entity/value: different person, company, number than gold → incorrect
248
+ - Partial list with wrong items mixed in: some correct + some wrong items → incorrect
249
+
250
+ Step 4 - Check format (only if content is correct):
251
+ - If gold expects multiple items AND predicted is a comma-separated string (not a list) → partial
252
+ - If gold expects single item → no format issue possible
253
+
254
+ Step 5 - Check verbosity (only if content is correct):
255
+ - CORRECT (acceptable verbosity):
256
+ * Extra qualifiers: "three security questions" when gold is "3" → correct
257
+ * Relevant context: "No — Massachusetts; Washington" for "same state?" question → correct
258
+ * Clarifying phrases: "in his personal capacity", "per annum" → correct
259
+ - PARTIAL (medium verbosity) - ONLY when additions are truly irrelevant:
260
+ * Adding unrequested details to list items
261
+ * Over-specific precision: date+time when only date asked → partial
262
+ - INCORRECT (high verbosity):
263
+ * Multi-sentence responses when a word/phrase suffices
264
+ * Full paragraphs of explanation
265
+ * Conversational preambles: "Based on the document...", "The answer is..."
266
+
267
+ Based on your step-by-step analysis, provide your final judgment.
268
+
269
+ After your reasoning, you MUST call submit_judgment with your final decision."""
270
+
271
+
272
+ _LLM_JUDGE_TOOL = {
273
+ "function_declarations": [{
274
+ "name": "submit_judgment",
275
+ "description": "Submit your final judgment after reasoning through the evaluation steps",
276
+ "parameters": {
277
+ "type": "object",
278
+ "properties": {
279
+ "judgment": {
280
+ "type": "string",
281
+ "enum": ["correct", "partial", "incorrect"],
282
+ "description": "Final judgment: correct, partial, or incorrect"
283
+ },
284
+ "main_issue": {
285
+ "type": "string",
286
+ "enum": ["none", "refusal", "wrong_content", "missing_unit", "no_yes_no", "list_format", "verbosity_medium", "verbosity_high"],
287
+ "description": "The primary issue found, if any"
288
+ },
289
+ "explanation": {
290
+ "type": "string",
291
+ "description": "Brief explanation of your judgment"
292
+ }
293
+ },
294
+ "required": ["judgment", "main_issue", "explanation"]
295
+ }
296
+ }]
297
+ }
298
+
299
+
300
def _get_gemini_model():
    """Lazily construct the Gemini judge model; requires GOOGLE_API_KEY to be set."""
    import google.generativeai as genai

    key = os.environ.get("GOOGLE_API_KEY")
    if not key:
        raise ValueError("GOOGLE_API_KEY environment variable not set")
    genai.configure(api_key=key)
    return genai.GenerativeModel('gemini-2.5-flash')
308
+
309
+
310
def _call_gemini_with_timeout(model, prompt, timeout=30):
    """
    Run one Gemini generate_content call, bounded by `timeout` seconds.

    The call runs in a single worker thread; if it does not finish in
    time a built-in TimeoutError is raised (the caller retries on it).
    Forces tool calling via the submit_judgment function declaration.
    """
    from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError

    def _invoke():
        return model.generate_content(
            prompt,
            tools=[_LLM_JUDGE_TOOL],
            tool_config={"function_calling_config": {"mode": "ANY"}},
            request_options={"timeout": timeout}
        )

    with ThreadPoolExecutor(max_workers=1) as pool:
        pending = pool.submit(_invoke)
        try:
            return pending.result(timeout=timeout)
        except FuturesTimeoutError:
            raise TimeoutError(f"Gemini API call timed out after {timeout}s")
328
+
329
+
330
def _call_llm_judge(
    question: str,
    predicted: Any,
    gold_variants: List[List[str]],
    max_retries: int = 3,
    retry_delay: float = 1.0,
    timeout: float = 30.0
) -> Dict[str, Any]:
    """
    Evaluate one prediction with the Gemini judge, retrying on failure.

    Retries cover timeouts, transient API errors, and responses that
    lack the required submit_judgment tool call.

    Returns:
        Dict with 'judgment', 'main_issue', 'explanation', 'score'
        (1.0 / 0.5 / 0.0; any error path scores 0.0).
    """
    prompt = _GEVAL_PROMPT_TEMPLATE.format(
        question=question,
        predicted=json.dumps(predicted),
        gold_variants=json.dumps(gold_variants)
    )
    model = _get_gemini_model()
    score_map = {'correct': 1.0, 'partial': 0.5, 'incorrect': 0.0}

    for attempt in range(max_retries):
        last_attempt = attempt == max_retries - 1
        try:
            response = _call_gemini_with_timeout(model, prompt, timeout=timeout)

            # Pull the structured verdict out of the tool call, if present.
            candidates = response.candidates
            parts = candidates[0].content.parts if candidates else None
            if parts:
                for part in parts:
                    if hasattr(part, 'function_call') and part.function_call.name == "submit_judgment":
                        verdict = dict(part.function_call.args)
                        verdict['score'] = score_map.get(verdict.get('judgment', 'incorrect'), 0.0)
                        return verdict

            # Model answered without the required tool call - retry.
            if not last_attempt:
                time.sleep(retry_delay)
                continue

        except TimeoutError as e:
            if not last_attempt:
                time.sleep(retry_delay)
                continue
            return {
                'judgment': 'error',
                'main_issue': 'timeout',
                'explanation': str(e),
                'score': 0.0
            }
        except Exception as e:
            if not last_attempt:
                time.sleep(retry_delay * (attempt + 1))  # linearly growing backoff
                continue
            return {
                'judgment': 'error',
                'main_issue': 'error',
                'explanation': str(e),
                'score': 0.0
            }

    return {
        'judgment': 'error',
        'main_issue': 'parse_error',
        'explanation': 'Failed to get valid response after retries',
        'score': 0.0
    }
400
+
401
+
402
def anls_star_llm(
    predicted: Any,
    ground_truths: List[List[str]],
    question: str = "",
    threshold: float = 1.0
) -> Dict[str, Any]:
    """
    ANLS* with an LLM-judge fallback for semantic equivalence.

    Perfect string matches (ANLS* >= threshold, default 1.0) short-circuit
    to 1.0. Empty or pathologically long answers score 0.0 without an API
    call. Everything else is sent to the Gemini judge, provided a question
    text is available.

    Args:
        predicted: Predicted answer (string or list).
        ground_truths: List of gold answer variants.
        question: The question text (needed for the LLM judge).
        threshold: ANLS* value at or above which the judge is skipped.

    Returns:
        Dict with:
        - 'score': final score (0.0, 0.5, or 1.0)
        - 'anls_score': raw ANLS* score
        - 'used_llm': whether the judge was called
        - 'llm_judgment': judge details (or None if skipped)
    """
    # Cheap reject: nothing was predicted.
    empty = (
        predicted is None
        or predicted == ""
        or predicted == []
        or (isinstance(predicted, list) and all(not p for p in predicted))
    )
    if empty:
        return {
            'score': 0.0,
            'anls_score': 0.0,
            'used_llm': False,
            'llm_judgment': {'judgment': 'incorrect', 'main_issue': 'empty', 'explanation': 'Empty prediction'}
        }

    # Cheap reject: answer far too long to be a sensible extraction.
    MAX_ANSWER_LENGTH = 2000
    try:
        answer_length = len(json.dumps(predicted))
    except (TypeError, ValueError):
        answer_length = len(str(predicted))

    if answer_length > MAX_ANSWER_LENGTH:
        return {
            'score': 0.0,
            'anls_score': 0.0,
            'used_llm': False,
            'llm_judgment': {
                'judgment': 'incorrect',
                'main_issue': 'too_long',
                'explanation': f'Answer too long ({answer_length} chars > {MAX_ANSWER_LENGTH})'
            }
        }

    # String-similarity score first.
    anls = anls_star(predicted, ground_truths)

    outcome = {
        'score': anls,
        'anls_score': anls,
        'used_llm': False,
        'llm_judgment': None
    }

    # A (near-)exact string match needs no semantic check.
    if anls >= threshold:
        outcome['score'] = 1.0
        return outcome

    # Below threshold: defer to the LLM judge when we have the question.
    if question:
        judged = _call_llm_judge(question, predicted, ground_truths)
        outcome['used_llm'] = True
        outcome['llm_judgment'] = judged
        outcome['score'] = judged.get('score', 0.0)

    return outcome
485
+
486
+
487
def aggregate_anls_star_llm(
    scores: List[float],
    apply_bias_correction: bool = True
) -> Dict[str, Any]:
    """
    Aggregate per-question ANLS*+LLM scores into a reportable estimate.

    With bias correction enabled, applies the Rogan-Gladen adjustment and
    calibration-aware CI from "How to Correctly Report LLM-as-a-Judge
    Evaluations" (2511.21140v2); otherwise reports the raw mean with a
    plain binomial 95% CI.

    Args:
        scores: Individual ANLS*+LLM scores (0.0, 0.5, or 1.0).
        apply_bias_correction: Whether to apply the correction.

    Returns:
        Dict with 'raw_score', 'adjusted_score', 'se', 'ci_lower',
        'ci_upper', 'n_samples', 'q0', 'q1'.
    """
    if not scores:
        return {
            'raw_score': 0.0,
            'adjusted_score': 0.0,
            'se': 0.0,
            'ci_lower': 0.0,
            'ci_upper': 0.0,
            'n_samples': 0,
            'q0': LLM_JUDGE_SPECIFICITY,
            'q1': LLM_JUDGE_SENSITIVITY
        }

    n = len(scores)
    raw = sum(scores) / n

    result = {
        'raw_score': raw,
        'n_samples': n,
        'q0': LLM_JUDGE_SPECIFICITY,
        'q1': LLM_JUDGE_SENSITIVITY
    }

    if apply_bias_correction:
        ci_lo, ci_hi = confidence_interval(raw, n)
        result['adjusted_score'] = bias_adjusted_score(raw)
        result['se'] = standard_error(raw, n)
        result['ci_lower'] = ci_lo
        result['ci_upper'] = ci_hi
    else:
        result['adjusted_score'] = raw
        result['se'] = sqrt(raw * (1 - raw) / n) if n > 0 and 0 < raw < 1 else 0.0
        # Normal-approximation binomial CI, no calibration correction.
        se = sqrt(raw * (1 - raw) / n) if n > 0 else 0
        z = 1.96
        result['ci_lower'] = max(0.0, raw - z * se)
        result['ci_upper'] = min(1.0, raw + z * se)

    return result
549
+
550
+
551
  def citation_f1(
552
  predicted_citations: List[Dict[str, Any]],
553
  gold_locations: List[Dict[str, Any]],
eval/reevaluate_submissions.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Re-evaluate existing submissions with the new Semantic Accuracy metric.
4
+
5
+ This script:
6
+ 1. Downloads prediction files from HuggingFace Hub
7
+ 2. Re-evaluates them with ANLS* + LLM judge
8
+ 3. Updates the results files with new metrics
9
+ """
10
+
11
+ import json
12
+ import os
13
+ import sys
14
+ from pathlib import Path
15
+ from datetime import datetime, timezone
16
+
17
+ from huggingface_hub import HfApi, hf_hub_download, list_repo_files
18
+ from datasets import load_dataset
19
+
20
+ # Add parent for imports
21
+ sys.path.insert(0, str(Path(__file__).parent))
22
+ from metrics import (
23
+ anls_star,
24
+ anls_star_llm,
25
+ aggregate_anls_star_llm,
26
+ citation_f1,
27
+ kuiper_statistic
28
+ )
29
+
30
+ # Config
31
+ RESULTS_REPO = "agentic-document-ai/backend-results"
32
+ TOKEN = os.environ.get("HF_TOKEN")
33
+
34
+
35
def load_gold_data():
    """
    Load the private gold standard from HuggingFace.

    Returns:
        (gold_by_id, gold_by_text): the same entry dicts indexed by
        question id and by stripped question text.
    """
    print("Loading gold standard...")
    dataset = load_dataset("agentic-document-ai/dataset-PRIVATE", split="test")

    gold_by_id = {}
    gold_by_text = {}

    for example in dataset:
        question = example['question'].strip()
        entry = {
            'question': question,
            'answers': example.get('answer_variants', []),
            'evidence': example.get('evidence', []),
            'category': example.get('category', ''),
            'domain': example.get('domain', ''),
            'hop_type': example.get('hop_type', 'single'),
        }
        gold_by_id[example.get('id', '')] = entry
        gold_by_text[question] = entry

    return gold_by_id, gold_by_text
58
+
59
+
60
def find_prediction_files():
    """
    List all prediction JSONL files in the results repo.

    Matches files ending in '_predictions.jsonl' or containing
    '_predictions_' anywhere in the path.

    Returns:
        List of repo-relative file paths.
    """
    # Note: the original constructed an unused HfApi() instance here;
    # list_repo_files is a module-level helper and needs no client object.
    files = list_repo_files(RESULTS_REPO, repo_type="dataset", token=TOKEN)
    return [f for f in files if f.endswith('_predictions.jsonl') or '_predictions_' in f]
67
+
68
+
69
def download_predictions(filepath: str) -> list:
    """Fetch one predictions file from the Hub and parse it as JSONL (one dict per non-blank line)."""
    local_path = hf_hub_download(
        repo_id=RESULTS_REPO,
        filename=filepath,
        repo_type="dataset",
        token=TOKEN
    )

    records = []
    with open(local_path) as fh:
        for raw_line in fh:
            stripped = raw_line.strip()
            if stripped:
                records.append(json.loads(stripped))
    return records
84
+
85
+
86
def evaluate_with_semantic(predictions: list, gold_by_id: dict, gold_by_text: dict) -> dict:
    """
    Score a list of predictions with the semantic (ANLS* + LLM judge) metric.

    Each prediction is matched to the gold standard by question text first,
    then by id. The overall semantic score is bias-corrected; per-hop and
    per-domain breakdowns report raw means.

    Returns:
        Aggregate results dict (overall, hop-type and domain breakdowns,
        match counts), or None when nothing could be matched.
    """
    from collections import defaultdict

    def subset_stats(subset):
        # Raw (uncorrected) semantic/ANLS means for a slice, in percent.
        if not subset:
            return {'semantic': 0, 'anls': 0, 'n': 0}
        count = len(subset)
        return {
            'semantic': sum(e['semantic_score'] for e in subset) / count * 100,
            'anls': sum(e['anls'] for e in subset) / count * 100,
            'n': count
        }

    evals = []
    unmatched = 0
    total = len(predictions)

    for idx, pred in enumerate(predictions):
        if (idx + 1) % 50 == 0:
            print(f" Processing {idx+1}/{total}...")

        question = pred.get('question', '').strip()
        qid = pred.get('id', '')

        # Match to gold: question text first, then id.
        gold_data = gold_by_text.get(question)
        if gold_data is None and qid:
            gold_data = gold_by_id.get(qid)
        if not gold_data:
            unmatched += 1
            continue

        answer = pred.get('answer', '')
        citations = pred.get('citations', [])
        search_history = pred.get('search_history', [])
        steps = len(search_history) if search_history else pred.get('iterations', 0)

        # Per-question metrics: string ANLS*, judge-backed semantic score,
        # and citation F1 at both granularities.
        anls = anls_star(answer, gold_data['answers'])
        llm_result = anls_star_llm(answer, gold_data['answers'], question)
        semantic_score = llm_result['score']

        doc_f1 = citation_f1(citations, gold_data['evidence'], level='document')
        page_f1 = citation_f1(citations, gold_data['evidence'], level='page')

        evals.append({
            'anls': anls,
            'semantic_score': semantic_score,
            'correct': semantic_score >= 0.5,
            'doc_f1': doc_f1['f1'],
            'page_f1': page_f1['f1'],
            'steps': steps,
            'hop_type': gold_data.get('hop_type', 'single'),
            'category': gold_data['category'],
            'domain': gold_data['domain']
        })

    if not evals:
        return None

    n = len(evals)

    # Bias-corrected overall semantic score.
    agg = aggregate_anls_star_llm(
        [e['semantic_score'] for e in evals], apply_bias_correction=True
    )

    kuiper = kuiper_statistic(evals)

    # Domain breakdown.
    by_domain = defaultdict(list)
    for e in evals:
        by_domain[e['domain'] or 'Other'].append(e)
    domain_scores = {dom: subset_stats(items) for dom, items in sorted(by_domain.items())}

    return {
        'overall': {
            'semantic': agg['adjusted_score'] * 100,
            'anls': sum(e['anls'] for e in evals) / n * 100,
            'page_f1': sum(e['page_f1'] for e in evals) / n * 100,
            'doc_f1': sum(e['doc_f1'] for e in evals) / n * 100,
            'kuiper': kuiper['kuiper_stat'] if not kuiper.get('degenerate') else None,
        },
        'single_evidence': subset_stats([e for e in evals if e['hop_type'] == 'single']),
        'multi_evidence_same_doc': subset_stats([e for e in evals if e['hop_type'] == 'cross_page']),
        'multi_evidence_multi_doc': subset_stats([e for e in evals if e['hop_type'] == 'cross_doc']),
        'by_domain': domain_scores,
        'n_evaluated': n,
        'n_unmatched': unmatched
    }
201
+
202
+
203
def main():
    """CLI driver: re-evaluate one or all prediction files and save results locally."""
    import argparse
    parser = argparse.ArgumentParser(description="Re-evaluate submissions with semantic accuracy")
    parser.add_argument('--dry-run', action='store_true', help="Don't upload results")
    parser.add_argument('--file', type=str, help="Re-evaluate specific prediction file")
    args = parser.parse_args()

    gold_by_id, gold_by_text = load_gold_data()
    print(f"Loaded {len(gold_by_id)} gold examples")

    # Either a single explicit file or everything discovered in the repo.
    if args.file:
        pred_files = [args.file]
    else:
        print("\nFinding prediction files...")
        pred_files = find_prediction_files()
        print(f"Found {len(pred_files)} prediction files")

    for pred_file in pred_files:
        print(f"\n{'='*60}")
        print(f"Processing: {pred_file}")
        print('='*60)

        # One failing file must not abort the whole batch.
        try:
            predictions = download_predictions(pred_file)
            print(f"Loaded {len(predictions)} predictions")

            results = evaluate_with_semantic(predictions, gold_by_id, gold_by_text)

            if not results:
                print("No valid evaluations")
                continue

            print(f"\nResults:")
            print(f" Semantic Accuracy: {results['overall']['semantic']:.1f}")
            print(f" ANLS*: {results['overall']['anls']:.1f}")
            print(f" Page F1: {results['overall']['page_f1']:.1f}")

            # Save locally for review before any upload.
            output_file = Path(pred_file).stem + "_reevaluated.json"
            with open(output_file, 'w') as out:
                json.dump(results, out, indent=2)
            print(f"\nSaved to: {output_file}")

        except Exception as e:
            print(f"Error: {e}")
            continue
250
+
251
+
252
+ if __name__ == "__main__":
253
+ main()
254
+
eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_results_20260110_131629.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Qwen3-VL (235B-A22B-Thinking) with BM25 Search Tool",
3
+ "organization": "Alibaba Group",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "open-weight"
14
+ },
15
+ "submission_date": "2026-01-10T13:16:29.905067+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 59.09778741155781,
19
+ "anls": 57.61603118163428,
20
+ "page_f1": 58.72697776505391,
21
+ "doc_f1": 80.62601393262716,
22
+ "kuiper": 34.044088176352815
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 57.91583166332666,
26
+ "anls": 57.61603118163428,
27
+ "n": 499
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 66.66666666666666,
42
+ "anls": 61.98005698005697,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 81.81818181818183,
47
+ "anls": 80.39465804287939,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 66.66666666666666,
52
+ "anls": 66.4976376669925,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 44.565217391304344,
57
+ "anls": 49.53672826145982,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 56.25,
62
+ "anls": 57.39996898263027,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 61.702127659574465,
67
+ "anls": 60.56474101398679,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 56.09756097560976,
72
+ "anls": 53.957859669066565,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 60.46511627906976,
77
+ "anls": 54.79335264218985,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 40.0,
82
+ "anls": 49.05833333333333,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 68.75,
87
+ "anls": 73.07522250524337,
88
+ "n": 24
89
+ },
90
+ "Reference": {
91
+ "semantic": 59.61538461538461,
92
+ "anls": 63.19327183267644,
93
+ "n": 52
94
+ },
95
+ "Reports": {
96
+ "semantic": 57.333333333333336,
97
+ "anls": 53.11616787903517,
98
+ "n": 75
99
+ },
100
+ "Technical": {
101
+ "semantic": 71.73913043478261,
102
+ "anls": 57.18864273121033,
103
+ "n": 23
104
+ }
105
+ },
106
+ "n_evaluated": 499,
107
+ "n_unmatched": 1767
108
+ },
109
+ "reevaluated_date": "2026-01-15T19:55:28.547801+00:00",
110
+ "source_predictions_file": "Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_predictions_20260110_131629.jsonl",
111
+ "result_file_path": "Alibaba_Group/Qwen3-VL_(235B-A22B-Thinking)_with_BM25_Search_Tool_results_20260110_131629.json"
112
+ }
eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_results_20260110_132054.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Qwen3-VL (32B-Thinking) with BM25 Search Tool",
3
+ "organization": "Alibaba Group",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "open-weight"
14
+ },
15
+ "submission_date": "2026-01-10T13:20:54.125677+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 57.666353114392045,
19
+ "anls": 57.937064653000625,
20
+ "page_f1": 54.83061360816872,
21
+ "doc_f1": 78.76514934631167,
22
+ "kuiper": 36.33667334669349
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 56.51302605210421,
26
+ "anls": 57.937064653000625,
27
+ "n": 499
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 60.0,
42
+ "anls": 55.37037037037037,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 54.54545454545454,
47
+ "anls": 54.61297760210804,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 75.0,
52
+ "anls": 77.14578581514066,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 45.65217391304348,
57
+ "anls": 47.03746065646829,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 43.75,
62
+ "anls": 50.93257767828244,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 52.12765957446809,
67
+ "anls": 56.60682613221971,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 63.41463414634146,
72
+ "anls": 61.11435746903807,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 65.11627906976744,
77
+ "anls": 60.44405852545388,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 40.0,
82
+ "anls": 54.65844817149165,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 75.0,
87
+ "anls": 73.59601449275362,
88
+ "n": 24
89
+ },
90
+ "Reference": {
91
+ "semantic": 63.46153846153846,
92
+ "anls": 68.57667578882189,
93
+ "n": 52
94
+ },
95
+ "Reports": {
96
+ "semantic": 54.0,
97
+ "anls": 56.44955119487462,
98
+ "n": 75
99
+ },
100
+ "Technical": {
101
+ "semantic": 60.86956521739131,
102
+ "anls": 51.60498619336015,
103
+ "n": 23
104
+ }
105
+ },
106
+ "n_evaluated": 499,
107
+ "n_unmatched": 1767
108
+ },
109
+ "reevaluated_date": "2026-01-15T19:56:35.003631+00:00",
110
+ "source_predictions_file": "Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_predictions_20260110_132054.jsonl",
111
+ "result_file_path": "Alibaba_Group/Qwen3-VL_(32B-Thinking)_with_BM25_Search_Tool_results_20260110_132054.json"
112
+ }
eval/reevaluated_results/Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_results_20260110_132358.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Qwen3-VL (8B-Thinking) with BM25 Search Tool",
3
+ "organization": "Alibaba Group",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "open-weight"
14
+ },
15
+ "submission_date": "2026-01-10T13:23:58.123387+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 46.623859964827616,
19
+ "anls": 45.43424080850834,
20
+ "page_f1": 47.685529789738204,
21
+ "doc_f1": 69.57247828991316,
22
+ "kuiper": 48.30060120240493
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 45.69138276553106,
26
+ "anls": 45.43424080850834,
27
+ "n": 499
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 40.0,
42
+ "anls": 36.91358024691358,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 59.09090909090909,
47
+ "anls": 55.55994729907773,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 58.333333333333336,
52
+ "anls": 54.598842018196855,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 29.347826086956523,
57
+ "anls": 29.932472094079802,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 37.5,
62
+ "anls": 51.55757767828245,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 43.61702127659575,
67
+ "anls": 44.439106365198185,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 46.34146341463415,
72
+ "anls": 50.16056789323261,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 53.48837209302325,
77
+ "anls": 44.799741602067186,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 40.0,
82
+ "anls": 49.070641025641024,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 58.333333333333336,
87
+ "anls": 59.60305559882987,
88
+ "n": 24
89
+ },
90
+ "Reference": {
91
+ "semantic": 44.230769230769226,
92
+ "anls": 50.44584246011934,
93
+ "n": 52
94
+ },
95
+ "Reports": {
96
+ "semantic": 54.0,
97
+ "anls": 50.000135852648256,
98
+ "n": 75
99
+ },
100
+ "Technical": {
101
+ "semantic": 52.17391304347826,
102
+ "anls": 39.32779159365883,
103
+ "n": 23
104
+ }
105
+ },
106
+ "n_evaluated": 499,
107
+ "n_unmatched": 1767
108
+ },
109
+ "reevaluated_date": "2026-01-15T19:58:05.119474+00:00",
110
+ "source_predictions_file": "Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_predictions_20260110_132358.jsonl",
111
+ "result_file_path": "Alibaba_Group/Qwen3-VL_(8B-Thinking)_with_BM25_Search_Tool_results_20260110_132358.json"
112
+ }
eval/reevaluated_results/Anthropic/Claude_Haiku_4.5_(2025-10-01)_results_20260109_130319.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Claude Haiku 4.5 (2025-10-01) with BM25 Search Tool",
3
+ "organization": "Anthropic",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T13:03:19.649656+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 66.9387755102041,
19
+ "anls": 61.60747574238133,
20
+ "page_f1": 72.02476190476192,
21
+ "doc_f1": 88.24761904761905,
22
+ "kuiper": 50.36144578313238
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 65.60000000000001,
26
+ "anls": 61.60747574238133,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 70.0,
42
+ "anls": 63.92691050779287,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 72.72727272727273,
47
+ "anls": 73.53318618140752,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 85.41666666666666,
52
+ "anls": 72.62325637325637,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 58.152173913043484,
57
+ "anls": 54.29593695395653,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 71.875,
62
+ "anls": 68.77016129032259,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 61.702127659574465,
67
+ "anls": 62.779826338896896,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 53.65853658536586,
72
+ "anls": 52.054645053208425,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 66.27906976744185,
77
+ "anls": 60.50249169435216,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 48.0,
82
+ "anls": 41.69842237151431,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 81.25,
87
+ "anls": 80.07172131147541,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 60.57692307692307,
97
+ "anls": 64.3484267705628,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 76.0,
102
+ "anls": 65.36479556179735,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 71.73913043478261,
107
+ "anls": 64.75817505570946,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T19:59:11.288336+00:00",
115
+ "source_predictions_file": "Anthropic/Claude_Haiku_4.5_(2025-10-01)_predictions_20260109_130319.jsonl",
116
+ "result_file_path": "Anthropic/Claude_Haiku_4.5_(2025-10-01)_results_20260109_130319.json"
117
+ }
eval/reevaluated_results/Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_002125.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Claude Sonnet 4.5 (2025-09-29) with BM25 Search Tool",
3
+ "organization": "Anthropic",
4
+ "description": "",
5
+ "link": null,
6
+ "tags": [
7
+ "Agentic"
8
+ ],
9
+ "submitted_by": null,
10
+ "metadata": {
11
+ "model_type": "unknown"
12
+ },
13
+ "submission_date": null,
14
+ "results": {
15
+ "overall": {
16
+ "semantic": 76.83673469387756,
17
+ "anls": 71.84394202116125,
18
+ "page_f1": 79.30920634920635,
19
+ "doc_f1": 92.87777777777778,
20
+ "kuiper": 45.31237322515194
21
+ },
22
+ "single_evidence": {
23
+ "semantic": 75.3,
24
+ "anls": 71.84394202116125,
25
+ "n": 500
26
+ },
27
+ "multi_evidence_same_doc": {
28
+ "semantic": 0,
29
+ "anls": 0,
30
+ "n": 0
31
+ },
32
+ "multi_evidence_multi_doc": {
33
+ "semantic": 0,
34
+ "anls": 0,
35
+ "n": 0
36
+ },
37
+ "by_domain": {
38
+ "Cases/Logs": {
39
+ "semantic": 76.66666666666667,
40
+ "anls": 69.17913105413105,
41
+ "n": 15
42
+ },
43
+ "Education": {
44
+ "semantic": 84.0909090909091,
45
+ "anls": 78.67387882210018,
46
+ "n": 22
47
+ },
48
+ "Events": {
49
+ "semantic": 87.5,
50
+ "anls": 78.43471847184719,
51
+ "n": 24
52
+ },
53
+ "Financial": {
54
+ "semantic": 67.93478260869566,
55
+ "anls": 67.53424957183847,
56
+ "n": 92
57
+ },
58
+ "Financial/Tax": {
59
+ "semantic": 75.0,
60
+ "anls": 79.76190476190477,
61
+ "n": 16
62
+ },
63
+ "Government/Regulatory": {
64
+ "semantic": 81.91489361702128,
65
+ "anls": 76.76053472269231,
66
+ "n": 47
67
+ },
68
+ "HR/Employment": {
69
+ "semantic": 76.82926829268293,
70
+ "anls": 71.16619587453502,
71
+ "n": 41
72
+ },
73
+ "Legal": {
74
+ "semantic": 75.5813953488372,
75
+ "anls": 63.583816672634086,
76
+ "n": 43
77
+ },
78
+ "Media/Publishing": {
79
+ "semantic": 48.0,
80
+ "anls": 56.169284632785775,
81
+ "n": 25
82
+ },
83
+ "Misc": {
84
+ "semantic": 89.58333333333334,
85
+ "anls": 83.87132448607858,
86
+ "n": 24
87
+ },
88
+ "Other": {
89
+ "semantic": 0.0,
90
+ "anls": 0.0,
91
+ "n": 1
92
+ },
93
+ "Reference": {
94
+ "semantic": 71.15384615384616,
95
+ "anls": 80.42617278480002,
96
+ "n": 52
97
+ },
98
+ "Reports": {
99
+ "semantic": 82.66666666666667,
100
+ "anls": 74.25747226815201,
101
+ "n": 75
102
+ },
103
+ "Technical": {
104
+ "semantic": 69.56521739130434,
105
+ "anls": 58.84371488722767,
106
+ "n": 23
107
+ }
108
+ },
109
+ "n_evaluated": 500,
110
+ "n_unmatched": 1811
111
+ },
112
+ "reevaluated_date": "2026-01-15T20:00:06.481610+00:00",
113
+ "source_predictions_file": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_predictions_20260109_002125.jsonl",
114
+ "result_file_path": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_002125.json"
115
+ }
eval/reevaluated_results/Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_125816.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Claude Sonnet 4.5 (2025-09-29) with BM25 Search Tool",
3
+ "organization": "Anthropic",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T12:58:16.611348+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 79.08163265306122,
19
+ "anls": 71.74787642305597,
20
+ "page_f1": 79.12333333333333,
21
+ "doc_f1": 92.98636363636363,
22
+ "kuiper": 36.338056680162076
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 77.5,
26
+ "anls": 71.74787642305597,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 76.66666666666667,
42
+ "anls": 69.51092117758785,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 81.81818181818183,
47
+ "anls": 78.15439830261965,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 91.66666666666666,
52
+ "anls": 78.43471847184719,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 70.1086956521739,
57
+ "anls": 66.81148919563769,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 75.0,
62
+ "anls": 76.26728110599078,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 86.17021276595744,
67
+ "anls": 74.90457714355891,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 75.60975609756098,
72
+ "anls": 72.85160396238213,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 88.37209302325581,
77
+ "anls": 72.74221043114129,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 56.00000000000001,
82
+ "anls": 60.75987316199324,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 89.58333333333334,
87
+ "anls": 83.89482072668008,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 69.23076923076923,
97
+ "anls": 72.21619612753193,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 83.33333333333334,
102
+ "anls": 74.0536995032274,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 69.56521739130434,
107
+ "anls": 60.23577215564363,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:01:02.709110+00:00",
115
+ "source_predictions_file": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_predictions_20260109_125816.jsonl",
116
+ "result_file_path": "Anthropic/Claude_Sonnet_4.5_(2025-09-29)_with_BM25_Search_Tool_results_20260109_125816.json"
117
+ }
eval/reevaluated_results/Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_003320.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Gemini 2.5 Flash with BM25 Search Tool",
3
+ "organization": "Google",
4
+ "description": "",
5
+ "link": null,
6
+ "tags": [
7
+ "Agentic"
8
+ ],
9
+ "submitted_by": null,
10
+ "metadata": {
11
+ "model_type": "unknown"
12
+ },
13
+ "submission_date": null,
14
+ "results": {
15
+ "overall": {
16
+ "semantic": 57.34693877551022,
17
+ "anls": 52.71594015359682,
18
+ "page_f1": 59.910952380952374,
19
+ "doc_f1": 76.47380952380952,
20
+ "kuiper": 40.01599999999986
21
+ },
22
+ "single_evidence": {
23
+ "semantic": 56.2,
24
+ "anls": 52.71594015359682,
25
+ "n": 500
26
+ },
27
+ "multi_evidence_same_doc": {
28
+ "semantic": 0,
29
+ "anls": 0,
30
+ "n": 0
31
+ },
32
+ "multi_evidence_multi_doc": {
33
+ "semantic": 0,
34
+ "anls": 0,
35
+ "n": 0
36
+ },
37
+ "by_domain": {
38
+ "Cases/Logs": {
39
+ "semantic": 73.33333333333333,
40
+ "anls": 63.64672364672364,
41
+ "n": 15
42
+ },
43
+ "Education": {
44
+ "semantic": 77.27272727272727,
45
+ "anls": 68.27344592166726,
46
+ "n": 22
47
+ },
48
+ "Events": {
49
+ "semantic": 72.91666666666666,
50
+ "anls": 67.7894121245185,
51
+ "n": 24
52
+ },
53
+ "Financial": {
54
+ "semantic": 41.30434782608695,
55
+ "anls": 42.928812913087185,
56
+ "n": 92
57
+ },
58
+ "Financial/Tax": {
59
+ "semantic": 43.75,
60
+ "anls": 44.89996898263027,
61
+ "n": 16
62
+ },
63
+ "Government/Regulatory": {
64
+ "semantic": 63.829787234042556,
65
+ "anls": 62.64704717952198,
66
+ "n": 47
67
+ },
68
+ "HR/Employment": {
69
+ "semantic": 56.09756097560976,
70
+ "anls": 49.2663477551747,
71
+ "n": 41
72
+ },
73
+ "Legal": {
74
+ "semantic": 55.81395348837209,
75
+ "anls": 47.23635639486312,
76
+ "n": 43
77
+ },
78
+ "Media/Publishing": {
79
+ "semantic": 52.0,
80
+ "anls": 46.0,
81
+ "n": 25
82
+ },
83
+ "Misc": {
84
+ "semantic": 66.66666666666666,
85
+ "anls": 68.74370865688812,
86
+ "n": 24
87
+ },
88
+ "Other": {
89
+ "semantic": 0.0,
90
+ "anls": 0.0,
91
+ "n": 1
92
+ },
93
+ "Reference": {
94
+ "semantic": 61.53846153846154,
95
+ "anls": 64.74843671979198,
96
+ "n": 52
97
+ },
98
+ "Reports": {
99
+ "semantic": 53.333333333333336,
100
+ "anls": 46.187273968786066,
101
+ "n": 75
102
+ },
103
+ "Technical": {
104
+ "semantic": 54.347826086956516,
105
+ "anls": 42.61518103800272,
106
+ "n": 23
107
+ }
108
+ },
109
+ "n_evaluated": 500,
110
+ "n_unmatched": 1811
111
+ },
112
+ "reevaluated_date": "2026-01-15T20:02:15.855307+00:00",
113
+ "source_predictions_file": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_predictions_20260109_003320.jsonl",
114
+ "result_file_path": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_003320.json"
115
+ }
eval/reevaluated_results/Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_182559.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Gemini 2.5 Flash with BM25 Search Tool",
3
+ "organization": "Google",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T18:25:59.636344+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 58.46938775510204,
19
+ "anls": 55.486869478144165,
20
+ "page_f1": 60.9663492063492,
21
+ "doc_f1": 78.82920634920634,
22
+ "kuiper": 45.08800000000012
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 57.3,
26
+ "anls": 55.486869478144165,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 73.33333333333333,
42
+ "anls": 71.7948717948718,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 77.27272727272727,
47
+ "anls": 72.81890046712182,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 85.41666666666666,
52
+ "anls": 76.85643564356435,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 40.76086956521739,
57
+ "anls": 40.952902757926644,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 50.0,
62
+ "anls": 52.31036324786324,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 64.8936170212766,
67
+ "anls": 67.70262933196864,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 58.536585365853654,
72
+ "anls": 60.95035529628296,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 54.65116279069767,
77
+ "anls": 51.45105745077384,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 52.0,
82
+ "anls": 54.40739778239778,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 77.08333333333334,
87
+ "anls": 73.82172131147541,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 61.53846153846154,
97
+ "anls": 64.46714691613596,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 53.333333333333336,
102
+ "anls": 45.47473759975617,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 47.82608695652174,
107
+ "anls": 35.96181299748582,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:03:19.966069+00:00",
115
+ "source_predictions_file": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_predictions_20260109_182559.jsonl",
116
+ "result_file_path": "Google/Gemini_2.5_Flash_with_BM25_Search_Tool_results_20260109_182559.json"
117
+ }
eval/reevaluated_results/Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_005202.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Gemini 2.5 Pro with BM25 Search Tool",
3
+ "organization": "Google",
4
+ "description": "",
5
+ "link": null,
6
+ "tags": [
7
+ "Agentic"
8
+ ],
9
+ "submitted_by": null,
10
+ "metadata": {
11
+ "model_type": "unknown"
12
+ },
13
+ "submission_date": null,
14
+ "results": {
15
+ "overall": {
16
+ "semantic": 59.6938775510204,
17
+ "anls": 56.04493493183149,
18
+ "page_f1": 61.64985569985569,
19
+ "doc_f1": 74.58080808080808,
20
+ "kuiper": 28.047999999999792
21
+ },
22
+ "single_evidence": {
23
+ "semantic": 58.5,
24
+ "anls": 56.04493493183149,
25
+ "n": 500
26
+ },
27
+ "multi_evidence_same_doc": {
28
+ "semantic": 0,
29
+ "anls": 0,
30
+ "n": 0
31
+ },
32
+ "multi_evidence_multi_doc": {
33
+ "semantic": 0,
34
+ "anls": 0,
35
+ "n": 0
36
+ },
37
+ "by_domain": {
38
+ "Cases/Logs": {
39
+ "semantic": 73.33333333333333,
40
+ "anls": 63.64672364672364,
41
+ "n": 15
42
+ },
43
+ "Education": {
44
+ "semantic": 81.81818181818183,
45
+ "anls": 72.3102424584638,
46
+ "n": 22
47
+ },
48
+ "Events": {
49
+ "semantic": 64.58333333333334,
50
+ "anls": 60.78335195270679,
51
+ "n": 24
52
+ },
53
+ "Financial": {
54
+ "semantic": 40.21739130434783,
55
+ "anls": 43.86116897464483,
56
+ "n": 92
57
+ },
58
+ "Financial/Tax": {
59
+ "semantic": 56.25,
60
+ "anls": 61.754807692307686,
61
+ "n": 16
62
+ },
63
+ "Government/Regulatory": {
64
+ "semantic": 68.08510638297872,
65
+ "anls": 64.75420262164383,
66
+ "n": 47
67
+ },
68
+ "HR/Employment": {
69
+ "semantic": 64.63414634146342,
70
+ "anls": 52.864704856399555,
71
+ "n": 41
72
+ },
73
+ "Legal": {
74
+ "semantic": 60.46511627906976,
75
+ "anls": 51.79586563307493,
76
+ "n": 43
77
+ },
78
+ "Media/Publishing": {
79
+ "semantic": 52.0,
80
+ "anls": 53.47808414475082,
81
+ "n": 25
82
+ },
83
+ "Misc": {
84
+ "semantic": 64.58333333333334,
85
+ "anls": 66.18283242258653,
86
+ "n": 24
87
+ },
88
+ "Other": {
89
+ "semantic": 0.0,
90
+ "anls": 0.0,
91
+ "n": 1
92
+ },
93
+ "Reference": {
94
+ "semantic": 61.53846153846154,
95
+ "anls": 67.82233913890543,
96
+ "n": 52
97
+ },
98
+ "Reports": {
99
+ "semantic": 56.666666666666664,
100
+ "anls": 53.53357754327678,
101
+ "n": 75
102
+ },
103
+ "Technical": {
104
+ "semantic": 63.04347826086957,
105
+ "anls": 47.37363844512718,
106
+ "n": 23
107
+ }
108
+ },
109
+ "n_evaluated": 500,
110
+ "n_unmatched": 1811
111
+ },
112
+ "reevaluated_date": "2026-01-15T20:04:22.366647+00:00",
113
+ "source_predictions_file": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_predictions_20260109_005202.jsonl",
114
+ "result_file_path": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_005202.json"
115
+ }
eval/reevaluated_results/Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_183030.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Gemini 2.5 Pro with BM25 Search Tool",
3
+ "organization": "Google",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T18:30:30.608183+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 59.6938775510204,
19
+ "anls": 55.97919862778078,
20
+ "page_f1": 60.299220779220775,
21
+ "doc_f1": 74.23636363636363,
22
+ "kuiper": 38.90600000000025
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 58.5,
26
+ "anls": 55.97919862778078,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 66.66666666666666,
42
+ "anls": 56.98005698005698,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 72.72727272727273,
47
+ "anls": 66.75468690290825,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 66.66666666666666,
52
+ "anls": 62.67819322254806,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 47.82608695652174,
57
+ "anls": 48.11929370300614,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 50.0,
62
+ "anls": 46.96314102564102,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 65.95744680851064,
67
+ "anls": 64.23333377770668,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 56.09756097560976,
72
+ "anls": 48.92979153124233,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 63.95348837209303,
77
+ "anls": 60.44220952048519,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 48.0,
82
+ "anls": 52.95641025641026,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 64.58333333333334,
87
+ "anls": 70.43036975102193,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 63.46153846153846,
97
+ "anls": 63.5134680018838,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 60.0,
102
+ "anls": 54.81415365192609,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 50.0,
107
+ "anls": 40.50127359810298,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:06:43.674600+00:00",
115
+ "source_predictions_file": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_predictions_20260109_183030.jsonl",
116
+ "result_file_path": "Google/Gemini_2.5_Pro_with_BM25_Search_Tool_results_20260109_183030.json"
117
+ }
eval/reevaluated_results/Google/Gemini_3_Pro_(Preview)_with_BM25_Search_Tool_results_20260109_002711.json ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Gemini 3 Pro (Preview) with BM25 Search Tool",
3
+ "organization": "Google",
4
+ "description": "",
5
+ "link": null,
6
+ "tags": [
7
+ "Agentic"
8
+ ],
9
+ "submitted_by": null,
10
+ "metadata": {
11
+ "model_type": "unknown"
12
+ },
13
+ "submission_date": null,
14
+ "results": {
15
+ "overall": {
16
+ "semantic": 84.8636047605415,
17
+ "anls": 78.46249371016062,
18
+ "page_f1": 80.40956371617695,
19
+ "doc_f1": 91.83908943427981,
20
+ "kuiper": 27.13226452905815
21
+ },
22
+ "single_evidence": {
23
+ "semantic": 83.16633266533067,
24
+ "anls": 78.46249371016062,
25
+ "n": 499
26
+ },
27
+ "multi_evidence_same_doc": {
28
+ "semantic": 0,
29
+ "anls": 0,
30
+ "n": 0
31
+ },
32
+ "multi_evidence_multi_doc": {
33
+ "semantic": 0,
34
+ "anls": 0,
35
+ "n": 0
36
+ },
37
+ "by_domain": {
38
+ "Cases/Logs": {
39
+ "semantic": 83.33333333333334,
40
+ "anls": 75.31339031339031,
41
+ "n": 15
42
+ },
43
+ "Education": {
44
+ "semantic": 86.36363636363636,
45
+ "anls": 74.02302243211334,
46
+ "n": 22
47
+ },
48
+ "Events": {
49
+ "semantic": 83.33333333333334,
50
+ "anls": 77.06645664566456,
51
+ "n": 24
52
+ },
53
+ "Financial": {
54
+ "semantic": 72.28260869565217,
55
+ "anls": 69.36362154126739,
56
+ "n": 92
57
+ },
58
+ "Financial/Tax": {
59
+ "semantic": 81.25,
60
+ "anls": 80.57571684587813,
61
+ "n": 16
62
+ },
63
+ "Government/Regulatory": {
64
+ "semantic": 87.2340425531915,
65
+ "anls": 82.3254828677961,
66
+ "n": 47
67
+ },
68
+ "HR/Employment": {
69
+ "semantic": 86.58536585365853,
70
+ "anls": 79.69007037401929,
71
+ "n": 41
72
+ },
73
+ "Legal": {
74
+ "semantic": 93.02325581395348,
75
+ "anls": 85.19782543038357,
76
+ "n": 43
77
+ },
78
+ "Media/Publishing": {
79
+ "semantic": 74.0,
80
+ "anls": 79.1167050771702,
81
+ "n": 25
82
+ },
83
+ "Misc": {
84
+ "semantic": 93.75,
85
+ "anls": 91.86959699974574,
86
+ "n": 24
87
+ },
88
+ "Reference": {
89
+ "semantic": 88.46153846153845,
90
+ "anls": 87.98053049887939,
91
+ "n": 52
92
+ },
93
+ "Reports": {
94
+ "semantic": 84.66666666666667,
95
+ "anls": 78.78023745578506,
96
+ "n": 75
97
+ },
98
+ "Technical": {
99
+ "semantic": 73.91304347826086,
100
+ "anls": 61.21421646346686,
101
+ "n": 23
102
+ }
103
+ },
104
+ "n_evaluated": 499,
105
+ "n_unmatched": 1767
106
+ },
107
+ "reevaluated_date": "2026-01-15T20:07:35.074484+00:00",
108
+ "source_predictions_file": "Google/Gemini_3_Pro_(Preview)_with_BM25_Search_Tool_predictions_20260109_002711.jsonl",
109
+ "result_file_path": "Google/Gemini_3_Pro_(Preview)_with_BM25_Search_Tool_results_20260109_002711.json"
110
+ }
eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_234108.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Human with BM25 Search Tool",
3
+ "organization": "Humanity",
4
+ "description": "",
5
+ "link": null,
6
+ "tags": [
7
+ "Agentic"
8
+ ],
9
+ "submitted_by": null,
10
+ "metadata": {
11
+ "model_type": "unknown"
12
+ },
13
+ "submission_date": null,
14
+ "results": {
15
+ "overall": {
16
+ "semantic": 76.53061224489795,
17
+ "anls": 76.89084569144522,
18
+ "page_f1": 74.27484848484849,
19
+ "doc_f1": 87.1077922077922,
20
+ "kuiper": 6.584782608695652
21
+ },
22
+ "single_evidence": {
23
+ "semantic": 75.0,
24
+ "anls": 76.89084569144522,
25
+ "n": 500
26
+ },
27
+ "multi_evidence_same_doc": {
28
+ "semantic": 0,
29
+ "anls": 0,
30
+ "n": 0
31
+ },
32
+ "multi_evidence_multi_doc": {
33
+ "semantic": 0,
34
+ "anls": 0,
35
+ "n": 0
36
+ },
37
+ "by_domain": {
38
+ "Cases/Logs": {
39
+ "semantic": 66.66666666666666,
40
+ "anls": 72.72727272727272,
41
+ "n": 15
42
+ },
43
+ "Education": {
44
+ "semantic": 95.45454545454545,
45
+ "anls": 88.2664724057374,
46
+ "n": 22
47
+ },
48
+ "Events": {
49
+ "semantic": 83.33333333333334,
50
+ "anls": 81.58602150537635,
51
+ "n": 24
52
+ },
53
+ "Financial": {
54
+ "semantic": 72.28260869565217,
55
+ "anls": 72.19996863726435,
56
+ "n": 92
57
+ },
58
+ "Financial/Tax": {
59
+ "semantic": 68.75,
60
+ "anls": 65.13888888888889,
61
+ "n": 16
62
+ },
63
+ "Government/Regulatory": {
64
+ "semantic": 77.6595744680851,
65
+ "anls": 80.70180867592104,
66
+ "n": 47
67
+ },
68
+ "HR/Employment": {
69
+ "semantic": 82.92682926829268,
70
+ "anls": 78.22470188707081,
71
+ "n": 41
72
+ },
73
+ "Legal": {
74
+ "semantic": 77.90697674418605,
75
+ "anls": 78.11361119500656,
76
+ "n": 43
77
+ },
78
+ "Media/Publishing": {
79
+ "semantic": 62.0,
80
+ "anls": 69.07251951242394,
81
+ "n": 25
82
+ },
83
+ "Misc": {
84
+ "semantic": 68.75,
85
+ "anls": 71.50538359217717,
86
+ "n": 24
87
+ },
88
+ "Other": {
89
+ "semantic": 0.0,
90
+ "anls": 0.0,
91
+ "n": 1
92
+ },
93
+ "Reference": {
94
+ "semantic": 77.88461538461539,
95
+ "anls": 87.90759949333756,
96
+ "n": 52
97
+ },
98
+ "Reports": {
99
+ "semantic": 70.0,
100
+ "anls": 75.95612610368379,
101
+ "n": 75
102
+ },
103
+ "Technical": {
104
+ "semantic": 76.08695652173914,
105
+ "anls": 73.91467899702451,
106
+ "n": 23
107
+ }
108
+ },
109
+ "n_evaluated": 500,
110
+ "n_unmatched": 0
111
+ },
112
+ "reevaluated_date": "2026-01-15T20:08:22.133765+00:00",
113
+ "source_predictions_file": "Humanity/Human_with_BM25_Search_Tool_predictions_20260109_234108.jsonl",
114
+ "result_file_path": "Humanity/Human_with_BM25_Search_Tool_results_20260109_234108.json"
115
+ }
eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_235325.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Human with BM25 Search Tool",
3
+ "organization": "Humanity",
4
+ "description": "",
5
+ "link": null,
6
+ "tags": [
7
+ "Agentic"
8
+ ],
9
+ "submitted_by": null,
10
+ "metadata": {
11
+ "model_type": "unknown"
12
+ },
13
+ "submission_date": null,
14
+ "results": {
15
+ "overall": {
16
+ "semantic": 80.3061224489796,
17
+ "anls": 81.0431479932892,
18
+ "page_f1": 77.30151515151516,
19
+ "doc_f1": 90.80112554112554,
20
+ "kuiper": 7.623700623700628
21
+ },
22
+ "single_evidence": {
23
+ "semantic": 78.7,
24
+ "anls": 81.0431479932892,
25
+ "n": 500
26
+ },
27
+ "multi_evidence_same_doc": {
28
+ "semantic": 0,
29
+ "anls": 0,
30
+ "n": 0
31
+ },
32
+ "multi_evidence_multi_doc": {
33
+ "semantic": 0,
34
+ "anls": 0,
35
+ "n": 0
36
+ },
37
+ "by_domain": {
38
+ "Cases/Logs": {
39
+ "semantic": 66.66666666666666,
40
+ "anls": 72.72727272727272,
41
+ "n": 15
42
+ },
43
+ "Education": {
44
+ "semantic": 95.45454545454545,
45
+ "anls": 88.2664724057374,
46
+ "n": 22
47
+ },
48
+ "Events": {
49
+ "semantic": 81.25,
50
+ "anls": 79.50268817204301,
51
+ "n": 24
52
+ },
53
+ "Financial": {
54
+ "semantic": 77.17391304347827,
55
+ "anls": 77.5668164633513,
56
+ "n": 92
57
+ },
58
+ "Financial/Tax": {
59
+ "semantic": 81.25,
60
+ "anls": 77.63888888888889,
61
+ "n": 16
62
+ },
63
+ "Government/Regulatory": {
64
+ "semantic": 80.85106382978722,
65
+ "anls": 84.95712782485721,
66
+ "n": 47
67
+ },
68
+ "HR/Employment": {
69
+ "semantic": 81.70731707317073,
70
+ "anls": 79.43814685734138,
71
+ "n": 41
72
+ },
73
+ "Legal": {
74
+ "semantic": 89.53488372093024,
75
+ "anls": 85.09035538105306,
76
+ "n": 43
77
+ },
78
+ "Media/Publishing": {
79
+ "semantic": 70.0,
80
+ "anls": 75.91696395686839,
81
+ "n": 25
82
+ },
83
+ "Misc": {
84
+ "semantic": 72.91666666666666,
85
+ "anls": 78.44982803662161,
86
+ "n": 24
87
+ },
88
+ "Other": {
89
+ "semantic": 0.0,
90
+ "anls": 0.0,
91
+ "n": 1
92
+ },
93
+ "Reference": {
94
+ "semantic": 78.84615384615384,
95
+ "anls": 89.83067641641446,
96
+ "n": 52
97
+ },
98
+ "Reports": {
99
+ "semantic": 70.66666666666667,
100
+ "anls": 78.94025308781077,
101
+ "n": 75
102
+ },
103
+ "Technical": {
104
+ "semantic": 86.95652173913044,
105
+ "anls": 83.52609662978936,
106
+ "n": 23
107
+ }
108
+ },
109
+ "n_evaluated": 500,
110
+ "n_unmatched": 0
111
+ },
112
+ "reevaluated_date": "2026-01-15T20:09:11.287271+00:00",
113
+ "source_predictions_file": "Humanity/Human_with_BM25_Search_Tool_predictions_20260109_235325.jsonl",
114
+ "result_file_path": "Humanity/Human_with_BM25_Search_Tool_results_20260109_235325.json"
115
+ }
eval/reevaluated_results/Humanity/Human_with_BM25_Search_Tool_results_20260109_235724.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Human with BM25 Search Tool",
3
+ "organization": "Humanity",
4
+ "description": "Human equipped with the same search engine as agentic baselines.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Vision and Language",
9
+ "Sparse Search Tool"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "open-weight"
14
+ },
15
+ "submission_date": "2026-01-09T23:57:24.249882+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 81.02040816326532,
19
+ "anls": 82.43662306660298,
20
+ "page_f1": 78.83484848484848,
21
+ "doc_f1": 92.80112554112554,
22
+ "kuiper": 8.217922606924656
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 79.4,
26
+ "anls": 82.43662306660298,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 66.66666666666666,
42
+ "anls": 72.72727272727272,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 90.9090909090909,
47
+ "anls": 88.2664724057374,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 81.25,
52
+ "anls": 79.50268817204301,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 76.08695652173914,
57
+ "anls": 77.5668164633513,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 81.25,
62
+ "anls": 82.47759856630825,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 80.85106382978722,
67
+ "anls": 84.95712782485721,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 84.14634146341463,
72
+ "anls": 81.26741515002432,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 89.53488372093024,
77
+ "anls": 85.09035538105306,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 70.0,
82
+ "anls": 79.91696395686839,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 72.91666666666666,
87
+ "anls": 78.44982803662161,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 79.8076923076923,
97
+ "anls": 91.40410298984105,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 76.0,
102
+ "anls": 83.7735864211441,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 86.95652173913044,
107
+ "anls": 83.52609662978936,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:09:56.056259+00:00",
115
+ "source_predictions_file": "Humanity/Human_with_BM25_Search_Tool_predictions_20260109_235724.jsonl",
116
+ "result_file_path": "Humanity/Human_with_BM25_Search_Tool_results_20260109_235724.json"
117
+ }
eval/reevaluated_results/OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153221.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-4.1 (2025-04-14) with BM25 Search Tool",
3
+ "organization": "OpenAI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T15:32:21.908816+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 58.571428571428555,
19
+ "anls": 53.29254644474454,
20
+ "page_f1": 64.14190476190477,
21
+ "doc_f1": 82.82666666666667,
22
+ "kuiper": 43.93199999999983
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 57.4,
26
+ "anls": 53.29254644474454,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 53.333333333333336,
42
+ "anls": 48.59180666077218,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 52.27272727272727,
47
+ "anls": 48.04545454545455,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 68.75,
52
+ "anls": 67.55050505050505,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 40.76086956521739,
57
+ "anls": 43.62404525327831,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 71.875,
62
+ "anls": 64.58333333333334,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 55.319148936170215,
67
+ "anls": 51.52629513848961,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 70.73170731707317,
72
+ "anls": 55.117501174925685,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 58.139534883720934,
77
+ "anls": 55.94315245478037,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 40.0,
82
+ "anls": 54.188065268065266,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 77.08333333333334,
87
+ "anls": 69.51844262295083,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 62.5,
97
+ "anls": 60.011945621794936,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 60.66666666666667,
102
+ "anls": 47.26331129213486,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 67.3913043478261,
107
+ "anls": 61.60068502092203,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:11:50.993374+00:00",
115
+ "source_predictions_file": "OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153221.jsonl",
116
+ "result_file_path": "OpenAI/GPT-4.1_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153221.json"
117
+ }
eval/reevaluated_results/OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153812.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-4.1 Nano (2025-04-14) with BM25 Search Tool",
3
+ "organization": "OpenAI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T15:38:12.353112+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 19.18367346938775,
19
+ "anls": 19.21201395702391,
20
+ "page_f1": 27.60809523809524,
21
+ "doc_f1": 40.18095238095238,
22
+ "kuiper": 27.656000000000265
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 18.8,
26
+ "anls": 19.21201395702391,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 13.333333333333334,
42
+ "anls": 12.5,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 36.36363636363637,
47
+ "anls": 33.85540184453228,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 25.0,
52
+ "anls": 24.252897639994416,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 15.217391304347828,
57
+ "anls": 15.744375438721086,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 0.0,
62
+ "anls": 3.125,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 20.212765957446805,
67
+ "anls": 18.040407652422672,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 17.073170731707318,
72
+ "anls": 17.049790482898338,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 23.25581395348837,
77
+ "anls": 20.54263565891473,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 4.0,
82
+ "anls": 13.666666666666666,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 25.0,
87
+ "anls": 28.843503294839174,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 16.346153846153847,
97
+ "anls": 20.3827772417516,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 23.333333333333332,
102
+ "anls": 19.284216647617285,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 19.565217391304348,
107
+ "anls": 27.075249588209658,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:12:41.288382+00:00",
115
+ "source_predictions_file": "OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153812.jsonl",
116
+ "result_file_path": "OpenAI/GPT-4.1_Nano_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153812.json"
117
+ }
eval/reevaluated_results/OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_results_20260109_151912.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-5.2 (2025-12-11) with BM25 Search Tool",
3
+ "organization": "OpenAI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images. GPT-5.2 exhibits more conservative behavior than GPT-5, refusing to provide an answer when uncertain.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T15:19:12.016451+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 66.22448979591837,
19
+ "anls": 57.28438090955278,
20
+ "page_f1": 67.62380952380951,
21
+ "doc_f1": 83.72666666666666,
22
+ "kuiper": 62.57199999999988
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 64.9,
26
+ "anls": 57.28438090955278,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 73.33333333333333,
42
+ "anls": 58.46153846153847,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 77.27272727272727,
47
+ "anls": 59.00137741046832,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 72.91666666666666,
52
+ "anls": 57.55050505050505,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 53.2608695652174,
57
+ "anls": 49.679264550051975,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 65.625,
62
+ "anls": 61.08221187025536,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 67.02127659574468,
67
+ "anls": 58.551919442177,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 60.97560975609756,
72
+ "anls": 44.265703074651974,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 77.90697674418605,
77
+ "anls": 66.19399979865096,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 38.0,
82
+ "anls": 35.05751747729549,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 83.33333333333334,
87
+ "anls": 82.5164707977208,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 72.11538461538461,
97
+ "anls": 67.62508443509842,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 64.66666666666666,
102
+ "anls": 59.65381728416852,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 60.86956521739131,
107
+ "anls": 55.55075090789312,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:14:52.407712+00:00",
115
+ "source_predictions_file": "OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_predictions_20260109_151912.jsonl",
116
+ "result_file_path": "OpenAI/GPT-5.2_(2025-12-11)_with_BM25_Search_Tool_results_20260109_151912.json"
117
+ }
eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152104.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-5 (2025-08-07) with BM25 Search Tool",
3
+ "organization": "OpenAI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T15:21:04.336083+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 76.02040816326532,
19
+ "anls": 70.03817583122695,
20
+ "page_f1": 74.16285714285713,
21
+ "doc_f1": 86.45064935064934,
22
+ "kuiper": 52.256000000000114
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 74.5,
26
+ "anls": 70.03817583122695,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 66.66666666666666,
42
+ "anls": 62.757834757834765,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 68.18181818181817,
47
+ "anls": 63.54683195592287,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 83.33333333333334,
52
+ "anls": 78.3838383838384,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 65.21739130434783,
57
+ "anls": 62.36899647186356,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 81.25,
62
+ "anls": 86.77496898263027,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 73.40425531914893,
67
+ "anls": 68.7671602173282,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 71.95121951219512,
72
+ "anls": 64.5688672367669,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 79.06976744186046,
77
+ "anls": 70.27143399236422,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 60.0,
82
+ "anls": 65.71897407160566,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 83.33333333333334,
87
+ "anls": 86.70405982905983,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 76.92307692307693,
97
+ "anls": 76.57306264232653,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 80.0,
102
+ "anls": 71.72139814224423,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 93.47826086956522,
107
+ "anls": 73.31752767476483,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:15:52.047010+00:00",
115
+ "source_predictions_file": "OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152104.jsonl",
116
+ "result_file_path": "OpenAI/GPT-5_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152104.json"
117
+ }
eval/reevaluated_results/OpenAI/GPT-5_(2025-08-07)_with_File_Search_results_20260104_140537.json ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-5 (2025-08-07) with File Search",
3
+ "organization": "OpenAI",
4
+ "description": "Managed, single-shot retrieval mechanism.",
5
+ "link": "https://platform.openai.com/docs/guides/tools-file-search",
6
+ "tags": [
7
+ "Conventional RAG",
8
+ "Semantic Search Tool"
9
+ ],
10
+ "submitted_by": "Borchmann",
11
+ "metadata": {
12
+ "model_type": "api"
13
+ },
14
+ "submission_date": "2026-01-04T14:05:37.240829+00:00",
15
+ "results": {
16
+ "overall": {
17
+ "semantic": 48.061224489795926,
18
+ "anls": 44.84773268944071,
19
+ "page_f1": 29.277142857142856,
20
+ "doc_f1": 66.60666666666667,
21
+ "kuiper": 31.15400000000007
22
+ },
23
+ "single_evidence": {
24
+ "semantic": 47.099999999999994,
25
+ "anls": 44.84773268944071,
26
+ "n": 500
27
+ },
28
+ "multi_evidence_same_doc": {
29
+ "semantic": 0,
30
+ "anls": 0,
31
+ "n": 0
32
+ },
33
+ "multi_evidence_multi_doc": {
34
+ "semantic": 0,
35
+ "anls": 0,
36
+ "n": 0
37
+ },
38
+ "by_domain": {
39
+ "Cases/Logs": {
40
+ "semantic": 13.333333333333334,
41
+ "anls": 14.833333333333334,
42
+ "n": 15
43
+ },
44
+ "Education": {
45
+ "semantic": 79.54545454545455,
46
+ "anls": 63.871507280598195,
47
+ "n": 22
48
+ },
49
+ "Events": {
50
+ "semantic": 72.91666666666666,
51
+ "anls": 55.83149489399489,
52
+ "n": 24
53
+ },
54
+ "Financial": {
55
+ "semantic": 49.45652173913043,
56
+ "anls": 46.26513610007698,
57
+ "n": 92
58
+ },
59
+ "Financial/Tax": {
60
+ "semantic": 15.625,
61
+ "anls": 17.540322580645164,
62
+ "n": 16
63
+ },
64
+ "Government/Regulatory": {
65
+ "semantic": 45.744680851063826,
66
+ "anls": 41.75603723934328,
67
+ "n": 47
68
+ },
69
+ "HR/Employment": {
70
+ "semantic": 39.02439024390244,
71
+ "anls": 42.22238179140625,
72
+ "n": 41
73
+ },
74
+ "Legal": {
75
+ "semantic": 37.2093023255814,
76
+ "anls": 32.74308378959542,
77
+ "n": 43
78
+ },
79
+ "Media/Publishing": {
80
+ "semantic": 46.0,
81
+ "anls": 45.83167739167739,
82
+ "n": 25
83
+ },
84
+ "Misc": {
85
+ "semantic": 64.58333333333334,
86
+ "anls": 67.18447826857438,
87
+ "n": 24
88
+ },
89
+ "Other": {
90
+ "semantic": 0.0,
91
+ "anls": 0.0,
92
+ "n": 1
93
+ },
94
+ "Reference": {
95
+ "semantic": 39.42307692307692,
96
+ "anls": 40.48309244262362,
97
+ "n": 52
98
+ },
99
+ "Reports": {
100
+ "semantic": 46.666666666666664,
101
+ "anls": 46.80494177991155,
102
+ "n": 75
103
+ },
104
+ "Technical": {
105
+ "semantic": 63.04347826086957,
106
+ "anls": 62.77759844334801,
107
+ "n": 23
108
+ }
109
+ },
110
+ "n_evaluated": 500,
111
+ "n_unmatched": 0
112
+ },
113
+ "reevaluated_date": "2026-01-15T20:17:01.554804+00:00",
114
+ "source_predictions_file": "OpenAI/GPT-5_(2025-08-07)_with_File_Search_predictions_20260104_140537.jsonl",
115
+ "result_file_path": "OpenAI/GPT-5_(2025-08-07)_with_File_Search_results_20260104_140537.json"
116
+ }
eval/reevaluated_results/OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152650.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-5 Mini (2025-08-07) with BM25 Search Tool",
3
+ "organization": "OpenAI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T15:26:50.820104+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 65.0,
19
+ "anls": 55.16542612989696,
20
+ "page_f1": 67.57095238095239,
21
+ "doc_f1": 82.35303030303031,
22
+ "kuiper": 71.86573146292572
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 63.7,
26
+ "anls": 55.16542612989696,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 66.66666666666666,
42
+ "anls": 57.16524216524217,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 68.18181818181817,
47
+ "anls": 63.349203497424845,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 62.5,
52
+ "anls": 53.63190419293608,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 47.28260869565217,
57
+ "anls": 43.770804881794874,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 46.875,
62
+ "anls": 39.15760869565217,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 71.27659574468085,
67
+ "anls": 62.856694438441366,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 60.97560975609756,
72
+ "anls": 51.21538014830698,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 75.5813953488372,
77
+ "anls": 62.31744836688789,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 56.00000000000001,
82
+ "anls": 39.93216037493774,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 70.83333333333334,
87
+ "anls": 63.35950315116982,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 76.92307692307693,
97
+ "anls": 73.02503210878088,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 66.66666666666666,
102
+ "anls": 54.869395530526155,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 67.3913043478261,
107
+ "anls": 53.29419750997293,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:18:52.086804+00:00",
115
+ "source_predictions_file": "OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152650.jsonl",
116
+ "result_file_path": "OpenAI/GPT-5_Mini_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152650.json"
117
+ }
eval/reevaluated_results/OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152828.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-5 Nano (2025-08-07) with BM25 Search Tool",
3
+ "organization": "OpenAI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T15:28:28.366309+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 56.6326530612245,
19
+ "anls": 52.255247982009955,
20
+ "page_f1": 60.877142857142864,
21
+ "doc_f1": 82.2030303030303,
22
+ "kuiper": 47.40000000000003
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 55.50000000000001,
26
+ "anls": 52.255247982009955,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 53.333333333333336,
42
+ "anls": 53.461538461538474,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 63.63636363636363,
47
+ "anls": 54.95375836284927,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 62.5,
52
+ "anls": 51.78930433365917,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 40.21739130434783,
57
+ "anls": 40.14762316798784,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 68.75,
62
+ "anls": 69.68257767828244,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 64.8936170212766,
67
+ "anls": 56.496054764723326,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 52.4390243902439,
72
+ "anls": 42.85858107680723,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 58.139534883720934,
77
+ "anls": 55.28314708547266,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 50.0,
82
+ "anls": 51.784085491742935,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 70.83333333333334,
87
+ "anls": 74.53137140637142,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 57.692307692307686,
97
+ "anls": 61.940508414693205,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 54.666666666666664,
102
+ "anls": 48.18660787855504,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 65.21739130434783,
107
+ "anls": 59.014067370235345,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:19:54.021229+00:00",
115
+ "source_predictions_file": "OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_predictions_20260109_152828.jsonl",
116
+ "result_file_path": "OpenAI/GPT-5_Nano_(2025-08-07)_with_BM25_Search_Tool_results_20260109_152828.json"
117
+ }
eval/reevaluated_results/OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153516.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT 4.1 Mini (2025-04-14) with BM25 Search Tool",
3
+ "organization": "OpenAI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T15:35:16.458002+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 51.22448979591837,
19
+ "anls": 46.26708858125157,
20
+ "page_f1": 59.905054945054935,
21
+ "doc_f1": 77.61731601731601,
22
+ "kuiper": 40.01224489795946
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 50.2,
26
+ "anls": 46.26708858125157,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 43.333333333333336,
42
+ "anls": 39.64209401709402,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 59.09090909090909,
47
+ "anls": 48.57647622469757,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 56.25,
52
+ "anls": 53.83018770627063,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 35.869565217391305,
57
+ "anls": 34.96285359224887,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 46.875,
62
+ "anls": 44.4215309712932,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 44.680851063829785,
67
+ "anls": 44.19583719868558,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 53.65853658536586,
72
+ "anls": 46.501429746354255,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 53.48837209302325,
77
+ "anls": 43.64210613408689,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 48.0,
82
+ "anls": 46.71106819031614,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 56.25,
87
+ "anls": 55.49877795634123,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 62.5,
97
+ "anls": 62.86510186138165,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 51.33333333333333,
102
+ "anls": 45.15164464860224,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 65.21739130434783,
107
+ "anls": 53.71736172158072,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:21:13.745638+00:00",
115
+ "source_predictions_file": "OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_predictions_20260109_153516.jsonl",
116
+ "result_file_path": "OpenAI/GPT_4.1_Mini_(2025-04-14)_with_BM25_Search_Tool_results_20260109_153516.json"
117
+ }
eval/reevaluated_results/OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_results_20260109_185347.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Gemini 3 Pro with BM25 Search Tool",
3
+ "organization": "OpenAI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T18:53:47.189606+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 80.16032064128255,
19
+ "anls": 73.52101315170081,
20
+ "page_f1": 78.4607309857811,
21
+ "doc_f1": 90.20248288785363,
22
+ "kuiper": 26.781563126252323
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 78.55711422845691,
26
+ "anls": 73.52101315170081,
27
+ "n": 499
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 80.0,
42
+ "anls": 85.12820512820514,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 77.27272727272727,
47
+ "anls": 64.8800482891392,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 81.25,
52
+ "anls": 79.84423442344234,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 66.84782608695652,
57
+ "anls": 63.13552237747254,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 93.75,
62
+ "anls": 93.48332554153032,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 87.2340425531915,
67
+ "anls": 78.26722646935413,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 86.58536585365853,
72
+ "anls": 77.34609828919353,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 82.55813953488372,
77
+ "anls": 68.10496996543507,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 78.0,
82
+ "anls": 79.13892729939242,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 85.41666666666666,
87
+ "anls": 85.4921497584541,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 81.73076923076923,
97
+ "anls": 83.68517307852197,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 74.32432432432432,
102
+ "anls": 71.94584088751826,
103
+ "n": 74
104
+ },
105
+ "Technical": {
106
+ "semantic": 76.08695652173914,
107
+ "anls": 55.56822369489126,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 499,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:23:21.812681+00:00",
115
+ "source_predictions_file": "OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_predictions_20260109_185347.jsonl",
116
+ "result_file_path": "OpenAI/Gemini_3_Pro_with_BM25_Search_Tool_results_20260109_185347.json"
117
+ }
eval/reevaluated_results/OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260107_113714.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-4o (2024-08-06) with HEAVEN Retrieval",
3
+ "organization": "OpenAI - KAIST",
4
+ "description": "",
5
+ "link": null,
6
+ "tags": [
7
+ "Agentic"
8
+ ],
9
+ "submitted_by": null,
10
+ "metadata": {
11
+ "model_type": "unknown"
12
+ },
13
+ "submission_date": null,
14
+ "results": {
15
+ "overall": {
16
+ "semantic": 43.469387755102034,
17
+ "anls": 40.039307087937075,
18
+ "page_f1": 43.05228327228327,
19
+ "doc_f1": 56.64095238095238,
20
+ "kuiper": null
21
+ },
22
+ "single_evidence": {
23
+ "semantic": 42.6,
24
+ "anls": 40.039307087937075,
25
+ "n": 500
26
+ },
27
+ "multi_evidence_same_doc": {
28
+ "semantic": 0,
29
+ "anls": 0,
30
+ "n": 0
31
+ },
32
+ "multi_evidence_multi_doc": {
33
+ "semantic": 0,
34
+ "anls": 0,
35
+ "n": 0
36
+ },
37
+ "by_domain": {
38
+ "Cases/Logs": {
39
+ "semantic": 46.666666666666664,
40
+ "anls": 46.75783475783476,
41
+ "n": 15
42
+ },
43
+ "Education": {
44
+ "semantic": 36.36363636363637,
45
+ "anls": 36.95054945054945,
46
+ "n": 22
47
+ },
48
+ "Events": {
49
+ "semantic": 43.75,
50
+ "anls": 38.03661616161616,
51
+ "n": 24
52
+ },
53
+ "Financial": {
54
+ "semantic": 42.934782608695656,
55
+ "anls": 42.52300514978308,
56
+ "n": 92
57
+ },
58
+ "Financial/Tax": {
59
+ "semantic": 31.25,
60
+ "anls": 31.922043010752688,
61
+ "n": 16
62
+ },
63
+ "Government/Regulatory": {
64
+ "semantic": 44.680851063829785,
65
+ "anls": 36.32965392203914,
66
+ "n": 47
67
+ },
68
+ "HR/Employment": {
69
+ "semantic": 39.02439024390244,
70
+ "anls": 33.06592985170988,
71
+ "n": 41
72
+ },
73
+ "Legal": {
74
+ "semantic": 41.86046511627907,
75
+ "anls": 33.1515319306017,
76
+ "n": 43
77
+ },
78
+ "Media/Publishing": {
79
+ "semantic": 20.0,
80
+ "anls": 31.078787878787878,
81
+ "n": 25
82
+ },
83
+ "Misc": {
84
+ "semantic": 52.083333333333336,
85
+ "anls": 51.80921052631579,
86
+ "n": 24
87
+ },
88
+ "Other": {
89
+ "semantic": 0.0,
90
+ "anls": 0.0,
91
+ "n": 1
92
+ },
93
+ "Reference": {
94
+ "semantic": 42.30769230769231,
95
+ "anls": 46.93060276608143,
96
+ "n": 52
97
+ },
98
+ "Reports": {
99
+ "semantic": 52.0,
100
+ "anls": 44.41148230399428,
101
+ "n": 75
102
+ },
103
+ "Technical": {
104
+ "semantic": 41.30434782608695,
105
+ "anls": 38.6639124934416,
106
+ "n": 23
107
+ }
108
+ },
109
+ "n_evaluated": 500,
110
+ "n_unmatched": 0
111
+ },
112
+ "reevaluated_date": "2026-01-15T20:24:35.010694+00:00",
113
+ "source_predictions_file": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_predictions_20260107_113714.jsonl",
114
+ "result_file_path": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260107_113714.json"
115
+ }
eval/reevaluated_results/OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_154427.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-4o (2024-08-06) with HEAVEN Retrieval",
3
+ "organization": "OpenAI / KAIST",
4
+ "description": "Image-based retrieval. Best setup described in HEAVEN paper.",
5
+ "link": "",
6
+ "tags": [
7
+ "Conventional RAG",
8
+ "Semantic Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T15:44:27.735534+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 46.73469387755102,
19
+ "anls": 45.649762341432954,
20
+ "page_f1": 43.169719169719166,
21
+ "doc_f1": 59.24761904761905,
22
+ "kuiper": null
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 45.800000000000004,
26
+ "anls": 45.649762341432954,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 53.333333333333336,
42
+ "anls": 48.75783475783476,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 40.909090909090914,
47
+ "anls": 38.506493506493506,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 56.25,
52
+ "anls": 55.056754787358244,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 43.47826086956522,
57
+ "anls": 48.16466676354977,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 50.0,
62
+ "anls": 44.99022482893451,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 53.191489361702125,
67
+ "anls": 47.1956486962086,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 40.243902439024396,
72
+ "anls": 32.93040293040293,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 37.2093023255814,
77
+ "anls": 35.73555320648344,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 28.000000000000004,
82
+ "anls": 43.22,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 47.91666666666667,
87
+ "anls": 49.20255183413078,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 46.15384615384615,
97
+ "anls": 48.954805357154754,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 52.0,
102
+ "anls": 50.65907330372244,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 50.0,
107
+ "anls": 46.20014437749956,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:25:44.256079+00:00",
115
+ "source_predictions_file": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_predictions_20260109_154427.jsonl",
116
+ "result_file_path": "OpenAI_-_KAIST/GPT-4o_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_154427.json"
117
+ }
eval/reevaluated_results/OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_175639.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-5.2 (2024-08-06) with HEAVEN Retrieval",
3
+ "organization": "OpenAI / KAIST",
4
+ "description": "Image-based retrieval. Best setup described in HEAVEN paper, but with newer GPT.",
5
+ "link": "",
6
+ "tags": [
7
+ "Conventional RAG",
8
+ "Semantic Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "api"
14
+ },
15
+ "submission_date": "2026-01-09T17:56:39.771528+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 50.0,
19
+ "anls": 47.46445252141211,
20
+ "page_f1": 48.43228327228327,
21
+ "doc_f1": 62.30761904761904,
22
+ "kuiper": null
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 49.0,
26
+ "anls": 47.46445252141211,
27
+ "n": 500
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 53.333333333333336,
42
+ "anls": 43.64672364672364,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 52.27272727272727,
47
+ "anls": 51.569264069264065,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 47.91666666666667,
52
+ "anls": 46.90982404692082,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 48.369565217391305,
57
+ "anls": 48.83531625708929,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 43.75,
62
+ "anls": 43.92031798457114,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 59.57446808510638,
67
+ "anls": 49.070286122357786,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 39.02439024390244,
72
+ "anls": 34.149915125524885,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 45.348837209302324,
77
+ "anls": 46.299372462163156,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 26.0,
82
+ "anls": 33.613578417414736,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 50.0,
87
+ "anls": 51.63690476190476,
88
+ "n": 24
89
+ },
90
+ "Other": {
91
+ "semantic": 0.0,
92
+ "anls": 0.0,
93
+ "n": 1
94
+ },
95
+ "Reference": {
96
+ "semantic": 53.84615384615385,
97
+ "anls": 58.28202679165414,
98
+ "n": 52
99
+ },
100
+ "Reports": {
101
+ "semantic": 52.666666666666664,
102
+ "anls": 52.18098320525303,
103
+ "n": 75
104
+ },
105
+ "Technical": {
106
+ "semantic": 56.52173913043478,
107
+ "anls": 39.14801495210919,
108
+ "n": 23
109
+ }
110
+ },
111
+ "n_evaluated": 500,
112
+ "n_unmatched": 0
113
+ },
114
+ "reevaluated_date": "2026-01-15T20:27:00.066247+00:00",
115
+ "source_predictions_file": "OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_predictions_20260109_175639.jsonl",
116
+ "result_file_path": "OpenAI_-_KAIST/GPT-5.2_(2024-08-06)_with_HEAVEN_Retrieval_results_20260109_175639.json"
117
+ }
eval/reevaluated_results/OpenAI_-_KAIST/GPT-5.2_(2025-12-11)_with_HEAVEN_Retrieval_results_20260107_153009.json ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GPT-5.2 (2025-12-11) with HEAVEN Retrieval",
3
+ "organization": "OpenAI - KAIST",
4
+ "description": "",
5
+ "link": null,
6
+ "tags": [
7
+ "Agentic"
8
+ ],
9
+ "submitted_by": null,
10
+ "metadata": {
11
+ "model_type": "unknown"
12
+ },
13
+ "submission_date": null,
14
+ "results": {
15
+ "overall": {
16
+ "semantic": 48.16326530612245,
17
+ "anls": 43.22495703626025,
18
+ "page_f1": 46.402539682539675,
19
+ "doc_f1": 57.27428571428571,
20
+ "kuiper": null
21
+ },
22
+ "single_evidence": {
23
+ "semantic": 47.199999999999996,
24
+ "anls": 43.22495703626025,
25
+ "n": 500
26
+ },
27
+ "multi_evidence_same_doc": {
28
+ "semantic": 0,
29
+ "anls": 0,
30
+ "n": 0
31
+ },
32
+ "multi_evidence_multi_doc": {
33
+ "semantic": 0,
34
+ "anls": 0,
35
+ "n": 0
36
+ },
37
+ "by_domain": {
38
+ "Cases/Logs": {
39
+ "semantic": 60.0,
40
+ "anls": 43.64672364672364,
41
+ "n": 15
42
+ },
43
+ "Education": {
44
+ "semantic": 45.45454545454545,
45
+ "anls": 41.99134199134198,
46
+ "n": 22
47
+ },
48
+ "Events": {
49
+ "semantic": 47.91666666666667,
50
+ "anls": 42.272727272727266,
51
+ "n": 24
52
+ },
53
+ "Financial": {
54
+ "semantic": 43.47826086956522,
55
+ "anls": 39.79157919704788,
56
+ "n": 92
57
+ },
58
+ "Financial/Tax": {
59
+ "semantic": 43.75,
60
+ "anls": 45.17687392862708,
61
+ "n": 16
62
+ },
63
+ "Government/Regulatory": {
64
+ "semantic": 53.191489361702125,
65
+ "anls": 47.41888368008188,
66
+ "n": 47
67
+ },
68
+ "HR/Employment": {
69
+ "semantic": 40.243902439024396,
70
+ "anls": 32.19869561332976,
71
+ "n": 41
72
+ },
73
+ "Legal": {
74
+ "semantic": 43.02325581395349,
75
+ "anls": 42.43497069635968,
76
+ "n": 43
77
+ },
78
+ "Media/Publishing": {
79
+ "semantic": 26.0,
80
+ "anls": 30.585587652734088,
81
+ "n": 25
82
+ },
83
+ "Misc": {
84
+ "semantic": 45.83333333333333,
85
+ "anls": 47.470238095238095,
86
+ "n": 24
87
+ },
88
+ "Other": {
89
+ "semantic": 0.0,
90
+ "anls": 0.0,
91
+ "n": 1
92
+ },
93
+ "Reference": {
94
+ "semantic": 53.84615384615385,
95
+ "anls": 56.89824130630616,
96
+ "n": 52
97
+ },
98
+ "Reports": {
99
+ "semantic": 54.0,
100
+ "anls": 46.64353866236792,
101
+ "n": 75
102
+ },
103
+ "Technical": {
104
+ "semantic": 54.347826086956516,
105
+ "anls": 39.18827260106249,
106
+ "n": 23
107
+ }
108
+ },
109
+ "n_evaluated": 500,
110
+ "n_unmatched": 0
111
+ },
112
+ "reevaluated_date": "2026-01-15T20:28:06.717531+00:00",
113
+ "source_predictions_file": "OpenAI_-_KAIST/GPT-5.2_(2025-12-11)_with_HEAVEN_Retrieval_predictions_20260107_153009.jsonl",
114
+ "result_file_path": "OpenAI_-_KAIST/GPT-5.2_(2025-12-11)_with_HEAVEN_Retrieval_results_20260107_153009.json"
115
+ }
eval/reevaluated_results/Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_results_20260110_132227.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GLM-4.6V Flash with BM25 Search Tool",
3
+ "organization": "Z.AI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "open-weight"
14
+ },
15
+ "submission_date": "2026-01-10T13:22:27.811792+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 43.658746063555675,
19
+ "anls": 30.17090068718362,
20
+ "page_f1": 28.991793110029583,
21
+ "doc_f1": 51.58650634602539,
22
+ "kuiper": 29.321285140562065
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 42.78557114228457,
26
+ "anls": 30.17090068718362,
27
+ "n": 499
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 43.333333333333336,
42
+ "anls": 30.313390313390315,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 54.54545454545454,
47
+ "anls": 34.34782608695652,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 64.58333333333334,
52
+ "anls": 52.92922722985768,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 34.78260869565217,
57
+ "anls": 23.538822057620244,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 21.875,
62
+ "anls": 21.39516129032258,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 37.234042553191486,
67
+ "anls": 29.5464725643897,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 46.34146341463415,
72
+ "anls": 37.17815890071988,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 50.0,
77
+ "anls": 37.64410653945538,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 38.0,
82
+ "anls": 26.401353874883288,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 68.75,
87
+ "anls": 48.707026404394824,
88
+ "n": 24
89
+ },
90
+ "Reference": {
91
+ "semantic": 40.38461538461539,
92
+ "anls": 23.25877926421405,
93
+ "n": 52
94
+ },
95
+ "Reports": {
96
+ "semantic": 40.666666666666664,
97
+ "anls": 25.79399206429042,
98
+ "n": 75
99
+ },
100
+ "Technical": {
101
+ "semantic": 36.95652173913043,
102
+ "anls": 24.436392914653783,
103
+ "n": 23
104
+ }
105
+ },
106
+ "n_evaluated": 499,
107
+ "n_unmatched": 1767
108
+ },
109
+ "reevaluated_date": "2026-01-15T20:30:38.431851+00:00",
110
+ "source_predictions_file": "Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_predictions_20260110_132227.jsonl",
111
+ "result_file_path": "Z.AI/GLM-4.6V_Flash_with_BM25_Search_Tool_results_20260110_132227.json"
112
+ }
eval/reevaluated_results/Z.AI/GLM-4.6V_with_BM25_Search_Tool_results_20260110_131826.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "GLM-4.6V with BM25 Search Tool",
3
+ "organization": "Z.AI",
4
+ "description": "Max 10 iterations, up to 5 result pages consumed as images.",
5
+ "link": "",
6
+ "tags": [
7
+ "Agentic",
8
+ "Sparse Search Tool",
9
+ "Vision and Language"
10
+ ],
11
+ "submitted_by": "Borchmann",
12
+ "metadata": {
13
+ "model_type": "open-weight"
14
+ },
15
+ "submission_date": "2026-01-10T13:18:26.686587+00:00",
16
+ "results": {
17
+ "overall": {
18
+ "semantic": 64.92576990716128,
19
+ "anls": 59.661893537203156,
20
+ "page_f1": 66.02347552247352,
21
+ "doc_f1": 86.7908978129419,
22
+ "kuiper": 49.83064516129022
23
+ },
24
+ "single_evidence": {
25
+ "semantic": 63.62725450901804,
26
+ "anls": 59.661893537203156,
27
+ "n": 499
28
+ },
29
+ "multi_evidence_same_doc": {
30
+ "semantic": 0,
31
+ "anls": 0,
32
+ "n": 0
33
+ },
34
+ "multi_evidence_multi_doc": {
35
+ "semantic": 0,
36
+ "anls": 0,
37
+ "n": 0
38
+ },
39
+ "by_domain": {
40
+ "Cases/Logs": {
41
+ "semantic": 73.33333333333333,
42
+ "anls": 62.16524216524218,
43
+ "n": 15
44
+ },
45
+ "Education": {
46
+ "semantic": 61.36363636363637,
47
+ "anls": 54.25829440651575,
48
+ "n": 22
49
+ },
50
+ "Events": {
51
+ "semantic": 77.08333333333334,
52
+ "anls": 67.87290397408577,
53
+ "n": 24
54
+ },
55
+ "Financial": {
56
+ "semantic": 51.63043478260869,
57
+ "anls": 51.19993983845437,
58
+ "n": 92
59
+ },
60
+ "Financial/Tax": {
61
+ "semantic": 68.75,
62
+ "anls": 62.5648667601683,
63
+ "n": 16
64
+ },
65
+ "Government/Regulatory": {
66
+ "semantic": 73.40425531914893,
67
+ "anls": 70.93589720557641,
68
+ "n": 47
69
+ },
70
+ "HR/Employment": {
71
+ "semantic": 63.41463414634146,
72
+ "anls": 59.735891761304075,
73
+ "n": 41
74
+ },
75
+ "Legal": {
76
+ "semantic": 67.44186046511628,
77
+ "anls": 55.536175710594314,
78
+ "n": 43
79
+ },
80
+ "Media/Publishing": {
81
+ "semantic": 68.0,
82
+ "anls": 69.11970073982938,
83
+ "n": 25
84
+ },
85
+ "Misc": {
86
+ "semantic": 72.91666666666666,
87
+ "anls": 75.10160446706249,
88
+ "n": 24
89
+ },
90
+ "Reference": {
91
+ "semantic": 59.61538461538461,
92
+ "anls": 60.632124141167864,
93
+ "n": 52
94
+ },
95
+ "Reports": {
96
+ "semantic": 63.33333333333333,
97
+ "anls": 56.89167319856098,
98
+ "n": 75
99
+ },
100
+ "Technical": {
101
+ "semantic": 58.69565217391305,
102
+ "anls": 51.450020851943364,
103
+ "n": 23
104
+ }
105
+ },
106
+ "n_evaluated": 499,
107
+ "n_unmatched": 1767
108
+ },
109
+ "reevaluated_date": "2026-01-15T20:31:43.022276+00:00",
110
+ "source_predictions_file": "Z.AI/GLM-4.6V_with_BM25_Search_Tool_predictions_20260110_131826.jsonl",
111
+ "result_file_path": "Z.AI/GLM-4.6V_with_BM25_Search_Tool_results_20260110_131826.json"
112
+ }