|
|
<!DOCTYPE html> |
|
|
<html lang="en"> |
|
|
|
|
|
<head> |
|
|
<!-- Document metadata. -->
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="description" content="DDR-Bench: A Deep Data Research Agent Benchmark for LLMs">
<title>DDR-Bench | Deep Data Research Benchmark</title>

<!-- Early connection hints for the font origins. -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>

<!--
  Third-party libraries. They are deferred so they no longer block HTML parsing.
  Deferred scripts execute in document order, so these still run before the
  local deferred scripts listed further down.
-->
<script src="https://cdn.plot.ly/plotly-2.27.0.min.js" defer></script>
<script src="https://cdn.jsdelivr.net/npm/marked@4.3.0/marked.min.js" defer></script>
<link rel="stylesheet"
  href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/atom-one-dark.min.css">
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js" defer></script>
<script src="https://unpkg.com/sql-formatter@4.0.2/dist/sql-formatter.min.js" defer></script>

<!-- Local scripts, deferred; listed in the order they execute. -->
<script src="data.js" defer></script>
<script src="entropy_data.js" defer></script>
<script src="trajectory_data.js" defer></script>
<script src="trajectory.js" defer></script>
<script src="benchmarking_data.js" defer></script>
<script src="benchmarking.js" defer></script>
<script src="charts.js" defer></script>

<!-- Site stylesheet; the ?v= query string is a cache-busting version tag. -->
<link rel="stylesheet" href="styles.css?v=3">
|
|
<style>
  /* Centred, muted placeholder box with a minimum height of 300px. */
  .chart-loading {
    display: flex;
    align-items: center;
    justify-content: center;
    min-height: 300px;
    color: var(--color-text-muted, #64748B); /* falls back to slate grey if the variable is unset */
    font-size: 14px;
  }

  /* The placeholder text is generated content, so no markup is needed for it. */
  .chart-loading::after {
    content: 'Loading chart...';
    animation: pulse 1.5s ease-in-out infinite;
  }

  /* Fades the placeholder text between 40% and 100% opacity. */
  @keyframes pulse {
    0%,
    100% {
      opacity: 0.4;
    }

    50% {
      opacity: 1;
    }
  }

  /* Respect the user's reduced-motion preference: show the text statically. */
  @media (prefers-reduced-motion: reduce) {
    .chart-loading::after {
      animation: none;
    }
  }
</style>
|
|
</head> |
|
|
|
|
|
<body> |
|
|
<header class="hero"> |
|
|
<div class="hero-content"> |
|
|
|
|
|
<img src="assets/social_preview.png" alt="DDR-Bench - Deep Data Research" class="hero-preview-img"> |
|
|
<h2>Hunt Instead of Wait: Evaluating Deep Data Research on Large Language Models</h2> |
|
|
<p class="description"> |
|
|
We distinguish <em>investigatory intelligence</em> (autonomously setting goals and exploring) from |
|
|
<em>executional intelligence</em> (completing assigned tasks), arguing that true agency requires the |
|
|
former. |
|
|
To evaluate this, we introduce <strong>Deep Data Research (DDR)</strong>, an open-ended task where LLMs |
|
|
autonomously extract insights from databases, and <strong>DDR-Bench</strong>, a large-scale, |
|
|
checklist-based benchmark enabling verifiable evaluation. |
|
|
Results show that while frontier models display emerging agency, long-horizon exploration remains |
|
|
challenging, with effective investigatory intelligence depending on intrinsic agentic strategies beyond |
|
|
mere scaffolding or scaling. |
|
|
</p> |
|
|
<div class="meta-info"> |
|
|
<div class="meta-row authors"> |
|
|
<span class="meta-item"> |
|
|
<a href="https://thinkwee.top/about" target="_blank" rel="noopener noreferrer">Wei Liu</a>, |
|
|
<a href="https://github.com/yupeijei1997" target="_blank" rel="noopener noreferrer">Peijie |
|
|
Yu</a>, |
|
|
<a href="https://www.kcl.ac.uk/people/michele-orini" target="_blank" |
|
|
rel="noopener noreferrer">Michele Orini</a>, |
|
|
<a href="https://yalidu.github.io/" target="_blank" rel="noopener noreferrer">Yali Du</a>, |
|
|
<a href="https://sites.google.com/view/yulanhe/home" target="_blank" |
|
|
rel="noopener noreferrer">Yulan He</a> |
|
|
</span> |
|
|
</div> |
|
|
<div class="meta-row affiliations"> |
|
|
<a href="https://kclnlp.github.io/" target="_blank" rel="noopener noreferrer"> |
|
|
<img src="assets/kcl.svg" alt="King's College London" class="affiliation-logo kcl-logo"> |
|
|
</a> |
|
|
<a href="https://www.tencent.com/en-us/" target="_blank" rel="noopener noreferrer"> |
|
|
<img src="assets/tencent.png" alt="Tencent" class="affiliation-logo"> |
|
|
</a> |
|
|
<a href="https://www.turing.ac.uk/" target="_blank" rel="noopener noreferrer"> |
|
|
<img src="assets/alan.png" alt="The Alan Turing Institute" class="affiliation-logo"> |
|
|
</a> |
|
|
</div> |
|
|
<!--
  External resource buttons. Every link carries visible text, so the icons are
  decorative: inline SVGs are hidden from assistive technology and icon images
  use an empty alt so the link name is not announced twice.
-->
<div class="meta-row links">
  <a href="https://huggingface.co/collections/thinkwee/ddrbench" class="platform-btn dataset-btn"
    target="_blank" rel="noopener noreferrer">
    <svg viewBox="0 0 24 24" width="30" height="30" fill="none" stroke="currentColor"
      stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true">
      <ellipse cx="12" cy="5" rx="9" ry="3" />
      <path d="M3 5v14c0 1.66 4 3 9 3s9-1.34 9-3V5" />
      <path d="M3 12c0 1.66 4 3 9 3s9-1.34 9-3" />
    </svg>
    Dataset
  </a>
  <a href="https://github.com/thinkwee/DDR_Bench" class="platform-btn github-btn" target="_blank"
    rel="noopener noreferrer">
    <svg viewBox="0 0 24 24" width="30" height="30" fill="currentColor" aria-hidden="true">
      <path
        d="M12 2C6.477 2 2 6.477 2 12c0 4.42 2.865 8.17 6.839 9.49.5.092.682-.217.682-.482 0-.237-.008-.866-.013-1.7-2.782.603-3.369-1.34-3.369-1.34-.454-1.156-1.11-1.463-1.11-1.463-.908-.62.069-.608.069-.608 1.003.07 1.531 1.03 1.531 1.03.892 1.529 2.341 1.087 2.91.831.092-.646.35-1.086.636-1.336-2.22-.253-4.555-1.11-4.555-4.943 0-1.091.39-1.984 1.029-2.683-.103-.253-.446-1.27.098-2.647 0 0 .84-.269 2.75 1.025A9.578 9.578 0 0112 6.836c.85.004 1.705.114 2.504.336 1.909-1.294 2.747-1.025 2.747-1.025.546 1.377.203 2.394.1 2.647.64.699 1.028 1.592 1.028 2.683 0 3.842-2.339 4.687-4.566 4.935.359.309.678.919.678 1.852 0 1.336-.012 2.415-.012 2.743 0 .267.18.578.688.48C19.138 20.167 22 16.418 22 12c0-5.523-4.477-10-10-10z" />
    </svg>
    Code
  </a>
  <a href="https://huggingface.co/papers/2602.02039" class="platform-btn huggingface-btn"
    target="_blank" rel="noopener noreferrer">
    <img src="assets/hf-logo-pirate.svg" alt="" width="30" height="30"
      class="platform-icon">
    HuggingFace
  </a>
  <a href="https://arxiv.org/abs/2602.02039" class="platform-btn arxiv-btn" target="_blank"
    rel="noopener noreferrer">
    <img src="assets/arxiv-logomark-small.svg" alt="" width="30" height="30"
      class="platform-icon">
    arXiv
  </a>
  <a href="https://www.alphaxiv.org/abs/2602.02039" class="platform-btn alphaxiv-btn" target="_blank"
    rel="noopener noreferrer">
    <img src="assets/alphaxiv_logo.png" alt="" width="30" height="30" class="platform-icon">
    AlphaXiv
  </a>
  <a href="https://thinkwee.notion.site/ddrbench" class="platform-btn notion-btn" target="_blank"
    rel="noopener noreferrer">
    <svg viewBox="0 0 24 24" width="30" height="30" fill="currentColor" aria-hidden="true">
      <path
        d="M19 3H5c-1.103 0-2 .897-2 2v14c0 1.103.897 2 2 2h14c1.103 0 2-.897 2-2V5c0-1.103-.897-2-2-2zM9 17H7.17V7H9l5.83 6.91V7H16.83v10H15L9.17 10.09V17z" />
    </svg>
    Notion Blog
  </a>
</div>
|
|
</div> |
|
|
</div> |
|
|
</header> |
|
|
|
|
|
|
|
|
<main class="content"> |
|
|
|
|
|
|
|
|
<section id="framework" class="section visible framework-section"> |
|
|
<div class="section-header"> |
|
|
<h2> |
|
|
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
|
|
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"> |
|
|
<rect width="18" height="18" x="3" y="3" rx="2" ry="2" /> |
|
|
<line x1="3" x2="21" y1="9" y2="9" /> |
|
|
<line x1="9" x2="9" y1="21" y2="9" /> |
|
|
</svg> |
|
|
Framework Overview |
|
|
</h2> |
|
|
<p>Overview of DDR-Bench.</p> |
|
|
</div> |
|
|
<div class="framework-grid"> |
|
|
<div class="framework-card"> |
|
|
<div class="framework-img-wrapper"> |
|
|
<div class="skeleton-loader"></div> |
|
|
<img src="assets/framework_task.png" alt="Task Formulation Framework" class="framework-img" |
|
|
loading="lazy" |
|
|
onload="this.classList.add('loaded'); this.previousElementSibling.style.display='none';"> |
|
|
</div> |
|
|
<h3>Task Formulation</h3> |
|
|
<p class="framework-description">A case of Claude Sonnet 4.5's trajectory and evaluation checklist |
|
|
in the MIMIC scenario of DDR-Bench. Verified facts and supporting insights are
<u>underlined</u>. The agent is asked to perform multiple ReAct turns to explore the database
without predefined targets or queries and to autonomously mine insights from the exploration.
|
|
</p> |
|
|
</div> |
|
|
<div class="framework-card"> |
|
|
<div class="framework-img-wrapper"> |
|
|
<div class="skeleton-loader"></div> |
|
|
<img src="assets/framework_pipeline.png" alt="Evaluation Pipeline Framework" |
|
|
class="framework-img" loading="lazy" |
|
|
onload="this.classList.add('loaded'); this.previousElementSibling.style.display='none';"> |
|
|
</div> |
|
|
<h3>Evaluation Pipeline</h3> |
|
|
<p class="framework-description"><b>Left</b>: Compared with previous tasks, <i>DDR</i> maximises |
|
|
exploration openness and agency, focusing on the direct evaluation of insight quality. |
|
|
<b>Right</b>: Overview of the DDR-Bench. The checklist derived from the freeform parts of the |
|
|
database is used to evaluate the agent-generated insights from the exploration on the structured
parts of the database.
|
|
</p> |
|
|
</div> |
|
|
</div> |
|
|
</section> |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<section id="trajectory" class="section visible trajectory-section"> |
|
|
<div class="section-header"> |
|
|
<h2> |
|
|
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
|
|
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"> |
|
|
<polyline points="22 12 18 12 15 21 9 3 6 12 2 12"></polyline> |
|
|
</svg> |
|
|
Agent Trajectory |
|
|
</h2> |
|
|
<p>Observe the autonomous decision-making process of the agent across different scenarios.</p> |
|
|
</div> |
|
|
|
|
|
<div class="dimension-toggle"> |
|
|
<button class="dim-btn active" data-traj-scenario="mimic">MIMIC</button> |
|
|
<button class="dim-btn" data-traj-scenario="10k">10-K</button> |
|
|
<button class="dim-btn" data-traj-scenario="globem">GLOBEM</button> |
|
|
</div> |
|
|
|
|
|
<p id="trajectory-scenario-description" class="trajectory-description"> |
|
|
Exploring clinical patterns and patient outcomes in a large-scale electronic health record (EHR) |
|
|
database. |
|
|
</p> |
|
|
|
|
|
<div class="trajectory-container"> |
|
|
<div id="chat-window" class="chat-window"> |
|
|
|
|
|
<div class="loading-message">Loading trajectory data...</div> |
|
|
</div> |
|
|
<div class="scroll-hint" id="scroll-hint"> |
|
|
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24" fill="none" |
|
|
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"> |
|
|
<path d="M12 5v14M19 12l-7 7-7-7" /> |
|
|
</svg> |
|
|
<span>Scroll to see more</span> |
|
|
</div> |
|
|
</div> |
|
|
</section> |
|
|
|
|
|
|
|
|
<section id="benchmarking" class="section visible benchmarking-section"> |
|
|
<div class="section-header"> |
|
|
<h2> |
|
|
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
|
|
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"> |
|
|
<line x1="12" x2="12" y1="20" y2="10" /> |
|
|
<line x1="18" x2="18" y1="20" y2="4" /> |
|
|
<line x1="6" x2="6" y1="20" y2="16" /> |
|
|
</svg> |
|
|
Benchmarking |
|
|
</h2> |
|
|
<p>Overall average accuracy across all scenarios and evaluation metrics. |
|
|
<br> |
|
|
<span class="model-badge proprietary">Purple = Proprietary</span> |
|
|
<span class="model-badge opensource">Green = Open-source</span> |
|
|
</p> |
|
|
</div> |
|
|
<div class="charts-grid single"> |
|
|
<div class="chart-card wide"> |
|
|
<div id="benchmarking-chart" class="chart-container-benchmarking"></div> |
|
|
</div> |
|
|
</div> |
|
|
<p class="section-description">Claude 4.5 Sonnet achieves the highest overall average accuracy at 47.73%, |
|
|
significantly outperforming other models. Among open-source models, DeepSeek-V3.2 leads with 38.80%, |
|
|
followed closely by GLM-4.6 (37.52%) and Kimi K2 (36.42%). The results demonstrate a clear performance |
|
|
gap between frontier proprietary models and open-source alternatives, though top open-source models |
|
|
remain competitive with mid-tier proprietary offerings.</p> |
|
|
</section> |
|
|
|
|
|
|
|
|
|
|
|
<section id="results" class="section visible results-section"> |
|
|
<div class="section-header"> |
|
|
<h2> |
|
|
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
|
|
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"> |
|
|
<path d="M3 3v18h18" /> |
|
|
<path d="m19 9-5 5-4-4-3 3" /> |
|
|
</svg> |
|
|
Experiments |
|
|
</h2> |
|
|
<p>Main benchmark results and in-depth analysis of agent capabilities.</p> |
|
|
</div> |
|
|
|
|
|
|
|
|
<div class="carousel-wrapper"> |
|
|
<button class="carousel-btn carousel-prev" aria-label="Previous"> |
|
|
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
|
|
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"> |
|
|
<path d="m15 18-6-6 6-6" /> |
|
|
</svg> |
|
|
</button> |
|
|
|
|
|
<div class="carousel-track" id="results-carousel"> |
|
|
|
|
|
<div class="carousel-card"> |
|
|
<img src="assets/overall.png" alt="Overall Performance"> |
|
|
<h4>Overall Performance</h4> |
|
|
<p class="card-caption">Systematic evaluation of mainstream LLMs across MIMIC, 10-K, and GLOBEM |
|
|
datasets reveals persistent limitations in frontier models.</p> |
|
|
</div> |
|
|
|
|
|
|
|
|
<div class="carousel-card"> |
|
|
<img src="assets/qwenfamily.png" alt="Qwen Family Performance"> |
|
|
<h4>Training-time Factors Analysis</h4> |
|
|
<p class="card-caption">Training-time factors study within the Qwen family. From left to right, |
|
|
the three columns examine inference-time scaling performance across all scenarios for models |
|
|
with different parameter scales, context optimisation methods, and model generations with |
|
|
different training strategies.</p> |
|
|
</div> |
|
|
|
|
|
|
|
|
<div class="carousel-card"> |
|
|
<img src="assets/reasoning.png" alt="Reasoning Budget"> |
|
|
<h4>Reasoning Budget</h4> |
|
|
<p class="card-caption">Increasing the reasoning budget reduces interaction rounds but |
|
|
illustrates |
|
|
a |
|
|
trade-off between reasoning depth and exploration efficiency.</p> |
|
|
</div> |
|
|
|
|
|
|
|
|
<div class="carousel-card"> |
|
|
<img src="assets/memory.png" alt="Memory Mechanism"> |
|
|
<h4>Memory Mechanism</h4> |
|
|
<p class="card-caption">Long-short-term memory can create unpredictable behavior, often |
|
|
increasing |
|
|
tool usage without consistently improving final accuracy.</p> |
|
|
</div> |
|
|
|
|
|
|
|
|
<div class="carousel-card"> |
|
|
<img src="assets/agency.png" alt="Proactive vs Reactive"> |
|
|
<h4>Proactive vs Reactive</h4> |
|
|
<p class="card-caption">Models perform significantly better with explicit queries (Reactive), |
|
|
highlighting the difficulty of true proactive goal formulation.</p> |
|
|
</div> |
|
|
|
|
|
|
|
|
<div class="carousel-card"> |
|
|
<img src="assets/hallucination.png" alt="Hallucination Analysis"> |
|
|
<h4>Hallucination Analysis</h4> |
|
|
<p class="card-caption">Hallucination rates (%) across models in DDR-Bench, measured as the |
|
|
proportion of insights containing factual but unfaithful information that is not derivable
from the provided inputs. The rates are low.</p>
|
|
</div> |
|
|
|
|
|
|
|
|
<div class="carousel-card"> |
|
|
<img src="assets/hallu_acc_corr.png" alt="Hallucination-Accuracy Correlation"> |
|
|
<h4>Hallucination-Accuracy Correlation</h4> |
|
|
<p class="card-caption">Hallucination rates show almost no correlation with final accuracy, |
|
|
indicating |
|
|
robustness against metric inflation via memorization.</p> |
|
|
</div> |
|
|
|
|
|
|
|
|
<div class="carousel-card"> |
|
|
<img src="assets/trustworthiness.png" alt="Trustworthiness"> |
|
|
<h4>Trustworthiness</h4> |
|
|
<p class="card-caption">Verification of the LLM-as-a-Checker pipeline demonstrating high |
|
|
alignment with human expert judgments and stability across multiple runs.</p>
|
|
</div> |
|
|
</div> |
|
|
|
|
|
<button class="carousel-btn carousel-next" aria-label="Next"> |
|
|
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
|
|
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"> |
|
|
<path d="m9 18 6-6-6-6" /> |
|
|
</svg> |
|
|
</button> |
|
|
</div> |
|
|
|
|
|
|
|
|
<div class="carousel-dots" id="results-dots"></div> |
|
|
</section> |
|
|
|
|
|
|
|
|
<section id="scaling" class="section visible"> |
|
|
<div class="section-header"> |
|
|
<h2> |
|
|
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
|
|
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"> |
|
|
<line x1="12" x2="12" y1="20" y2="10" /> |
|
|
<line x1="18" x2="18" y1="20" y2="4" /> |
|
|
<line x1="6" x2="6" y1="20" y2="16" /> |
|
|
</svg> |
|
|
Scaling Analysis |
|
|
</h2> |
|
|
<p>Explore how model performance scales with interaction turns, token usage, and inference cost.</p> |
|
|
</div> |
|
|
<div class="dimension-toggle"> |
|
|
<button class="dim-btn active" data-dim="turn">Turns</button> |
|
|
<button class="dim-btn" data-dim="token">Tokens</button> |
|
|
<button class="dim-btn" data-dim="cost">Cost</button> |
|
|
</div> |
|
|
<div id="scaling-legend" class="shared-legend"></div> |
|
|
<div class="charts-grid three-col"> |
|
|
<div class="chart-card"> |
|
|
<h3>MIMIC</h3> |
|
|
<div id="scaling-mimic" class="chart-container"></div> |
|
|
</div> |
|
|
<div class="chart-card"> |
|
|
<h3>10-K</h3> |
|
|
<div id="scaling-10k" class="chart-container"></div> |
|
|
</div> |
|
|
<div class="chart-card"> |
|
|
<h3>GLOBEM</h3> |
|
|
<div id="scaling-globem" class="chart-container"></div> |
|
|
</div> |
|
|
</div> |
|
|
<p class="section-description">LLMs extract more accurate insights by delaying commitment, and they
concentrate reasoning into a small number of highly valuable late-stage interactions. These targeted
interactions are built upon longer early exploration.</p>
|
|
</section> |
|
|
|
|
|
|
|
|
<section id="ranking" class="section visible"> |
|
|
<div class="section-header"> |
|
|
<h2> |
|
|
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
|
|
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"> |
|
|
<path d="M6 9H4.5a2.5 2.5 0 0 1 0-5H6" /> |
|
|
<path d="M18 9h1.5a2.5 2.5 0 0 0 0-5H18" /> |
|
|
<path d="M4 22h16" /> |
|
|
<path d="M10 14.66V17c0 .55-.47.98-.97 1.21C7.85 18.75 7 20.24 7 22" /> |
|
|
<path d="M14 14.66V17c0 .55.47.98.97 1.21C16.15 18.75 17 20.24 17 22" /> |
|
|
<path d="M18 2H6v7a6 6 0 0 0 12 0V2Z" /> |
|
|
</svg> |
|
|
Novelty vs Accuracy |
|
|
</h2> |
|
|
<p> |
|
|
Novelty (Bradley-Terry) vs Accuracy ranking |
|
|
<br> |
|
|
● = Novelty, ◇ = Accuracy. |
|
|
<br> |
|
|
<span class="model-badge proprietary">Purple = Proprietary</span> |
|
|
<span class="model-badge opensource">Green = Open-source</span> |
|
|
</p> |
|
|
</div> |
|
|
<div class="dimension-toggle"> |
|
|
<button class="dim-btn ranking-dim active" data-mode="novelty">Sort by Novelty</button> |
|
|
<button class="dim-btn ranking-dim" data-mode="accuracy">Sort by Accuracy</button> |
|
|
</div> |
|
|
<div class="charts-grid three-col"> |
|
|
|
|
|
<div class="chart-card"> |
|
|
<h3>MIMIC</h3> |
|
|
<div id="ranking-mimic" class="chart-container-tall"></div> |
|
|
</div> |
|
|
<div class="chart-card"> |
|
|
<h3>10-K</h3> |
|
|
<div id="ranking-10k" class="chart-container-tall"></div> |
|
|
</div> |
|
|
<div class="chart-card"> |
|
|
<h3>GLOBEM</h3> |
|
|
<div id="ranking-globem" class="chart-container-tall"></div> |
|
|
|
|
|
</div> |
|
|
</div> |
|
|
<p class="section-description">The ranking induced by novel insight usefulness closely aligns with the |
|
|
ranking based on checklist accuracy. Differences between the two rankings are small, especially among |
|
|
the top-performing models.</p> |
|
|
</section> |
|
|
|
|
|
|
|
|
<section id="turn" class="section visible"> |
|
|
<div class="section-header"> |
|
|
<h2> |
|
|
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
|
|
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"> |
|
|
<path d="M21 12a9 9 0 1 1-9-9c2.52 0 4.93 1 6.74 2.74L21 8" /> |
|
|
<path d="M21 3v5h-5" /> |
|
|
</svg> |
|
|
Turn Distribution |
|
|
</h2> |
|
|
<p>Analyze the distribution of interaction turns across different models and datasets.</p> |
|
|
</div> |
|
|
<div class="charts-grid three-col"> |
|
|
<div class="chart-card"> |
|
|
<h3>MIMIC</h3> |
|
|
<div id="turn-mimic" class="chart-container-tall"></div> |
|
|
</div> |
|
|
<div class="chart-card"> |
|
|
<h3>10-K</h3> |
|
|
<div id="turn-10k" class="chart-container-tall"></div> |
|
|
</div> |
|
|
<div class="chart-card"> |
|
|
<h3>GLOBEM</h3> |
|
|
<div id="turn-globem" class="chart-container-tall"></div> |
|
|
</div> |
|
|
</div> |
|
|
<p class="section-description">Stronger models tend to explore for more rounds without external prompting. |
|
|
Knowledge-intensive databases such as 10-K and MIMIC induce more interaction rounds than signal-based |
|
|
datasets such as GLOBEM, and the resulting distributions are also more uniform.</p> |
|
|
</section> |
|
|
|
|
|
|
|
|
<section id="entropy" class="section visible"> |
|
|
<div class="section-header"> |
|
|
<h2> |
|
|
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
|
|
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"> |
|
|
<circle cx="7.5" cy="7.5" r="1.5" /> |
|
|
<circle cx="18.5" cy="5.5" r="1.5" /> |
|
|
<circle cx="11.5" cy="11.5" r="1.5" /> |
|
|
<circle cx="7.5" cy="16.5" r="1.5" /> |
|
|
<circle cx="17.5" cy="14.5" r="1.5" /> |
|
|
</svg> |
|
|
Exploration Pattern |
|
|
</h2> |
|
|
<p>Scatter plot showing Access Entropy vs Coverage by model. Opacity represents accuracy. Higher entropy |
|
|
= more uniform access; higher coverage = more fields explored.</p>
|
|
</div> |
|
|
<div class="dimension-toggle"> |
|
|
<button class="toggle-btn active" data-entropy-scenario="10k">10-K</button> |
|
|
<button class="toggle-btn" data-entropy-scenario="mimic">MIMIC</button> |
|
|
</div> |
|
|
<div class="charts-grid three-col"> |
|
|
<div class="chart-card"> |
|
|
<h3 id="entropy-model-0-title">GPT-5.2</h3> |
|
|
<div id="entropy-model-0" class="chart-container-tall"></div> |
|
|
</div> |
|
|
<div class="chart-card"> |
|
|
<h3 id="entropy-model-1-title">Claude-4.5-Sonnet</h3> |
|
|
<div id="entropy-model-1" class="chart-container-tall"></div> |
|
|
</div> |
|
|
<div class="chart-card"> |
|
|
<h3 id="entropy-model-2-title">Gemini-3-Flash</h3> |
|
|
<div id="entropy-model-2" class="chart-container-tall"></div> |
|
|
</div> |
|
|
<div class="chart-card"> |
|
|
<h3 id="entropy-model-3-title">GLM-4.6</h3> |
|
|
<div id="entropy-model-3" class="chart-container-tall"></div> |
|
|
</div> |
|
|
<div class="chart-card"> |
|
|
<h3 id="entropy-model-4-title">Qwen3-Next-80B-A3B</h3> |
|
|
<div id="entropy-model-4" class="chart-container-tall"></div> |
|
|
</div> |
|
|
<div class="chart-card"> |
|
|
<h3 id="entropy-model-5-title">DeepSeek-V3.2</h3> |
|
|
<div id="entropy-model-5" class="chart-container-tall"></div> |
|
|
</div> |
|
|
</div> |
|
|
<p class="section-description">Advanced LLMs tend to operate in a balanced exploration regime that combines |
|
|
adequate coverage with focused access. Such a regime is consistently observed across different |
|
|
scenarios.</p> |
|
|
</section> |
|
|
|
|
|
|
|
|
<section id="error" class="section visible"> |
|
|
<div class="section-header"> |
|
|
<h2> |
|
|
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
|
|
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"> |
|
|
<path d="m21.73 18-8-14a2 2 0 0 0-3.48 0l-8 14A2 2 0 0 0 4 21h16a2 2 0 0 0 1.73-3Z" /> |
|
|
<line x1="12" x2="12" y1="9" y2="13" /> |
|
|
<line x1="12" x2="12.01" y1="17" y2="17" /> |
|
|
</svg> |
|
|
Error Analysis |
|
|
</h2> |
|
|
<p>Breakdown of error types encountered during agent interactions, grouped by main categories.</p> |
|
|
</div> |
|
|
<div class="charts-grid single"> |
|
|
<div class="chart-card wide"> |
|
|
<div id="error-chart" class="chart-container-double"></div> |
|
|
</div> |
|
|
</div> |
|
|
<p class="section-description">Our findings revealed that 58% of errors stemmed from insufficient |
|
|
exploration, both in terms of breadth and depth. This imbalance in exploration often leads to suboptimal |
|
|
results, regardless of the model’s overall capability. |
|
|
Additionally, around 40% of the errors were attributed to other factors. For more powerful models, |
|
|
over-reasoning was common, where the model made assumptions not fully supported by the data. In other |
|
|
cases, models misinterpreted the insights, such as mistaking a downward trend for an upward one. Less |
|
|
capable models, on the other hand, tended to make more fundamental errors, such as repeatedly debugging |
|
|
or struggling with missing data, which could disrupt the overall coherence of the analysis.</p> |
|
|
</section> |
|
|
|
|
|
|
|
|
<section id="probing" class="section visible"> |
|
|
<div class="section-header"> |
|
|
<h2> |
|
|
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
|
|
stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"> |
|
|
<circle cx="11" cy="11" r="8" /> |
|
|
<path d="m21 21-4.3-4.3" /> |
|
|
</svg> |
|
|
Self-Termination |
|
|
</h2> |
|
|
<p>Analyze the willingness of models to terminate their own analysis.</p> |
|
|
</div> |
|
|
<div id="probing-legend" class="shared-legend"></div> |
|
|
<div class="charts-grid three-col"> |
|
|
<div class="chart-card"> |
|
|
<h3>MIMIC</h3> |
|
|
<div id="probing-mimic" class="chart-container-tall"></div> |
|
|
</div> |
|
|
<div class="chart-card"> |
|
|
<h3>GLOBEM</h3> |
|
|
<div id="probing-globem" class="chart-container-tall"></div> |
|
|
</div> |
|
|
<div class="chart-card"> |
|
|
<h3>10-K</h3> |
|
|
<div id="probing-10k" class="chart-container-tall"></div> |
|
|
</div> |
|
|
</div> |
|
|
<p class="section-description"> Clear differences emerge across model generations. Qwen3 and Qwen3-Next |
|
|
exhibit a consistently increasing probability, indicating growing confidence that a complete report can |
|
|
be produced as more information is accumulated, whereas the Qwen2.5 series shows pronounced fluctuations |
|
|
and remains uncertain about whether exploration can be terminated at the current step. Moreover, |
|
|
Qwen3-Next maintains higher confidence with lower variance throughout, suggesting that it has more |
|
|
confidence that exploration is progressing towards a more comprehensive and deeper report.</p> |
|
|
</section> |
|
|
</main> |
|
|
|
|
|
|
|
|
<footer class="footer"> |
|
|
<p>DDR-Bench © 2026 | King's College London · Tencent · The Alan Turing Institute</p> |
|
|
</footer> |
|
|
|
|
|
|
|
|
<script> |
|
|
// Results carousel: centres the active card inside #results-carousel, builds one
// navigation dot per card, and supports prev/next buttons, arrow keys, touch
// swipes and timed auto-advance while the #results section is on screen.
document.addEventListener('DOMContentLoaded', function () {
    const track = document.getElementById('results-carousel');
    const dotsContainer = document.getElementById('results-dots');
    const prevBtn = document.querySelector('.carousel-prev');
    const nextBtn = document.querySelector('.carousel-next');
    const carouselWrapper = document.querySelector('.carousel-wrapper');
    const resultsSection = document.getElementById('results');

    if (!track) return;

    const cards = Array.from(track.querySelectorAll('.carousel-card'));
    const cardCount = cards.length;
    if (cardCount === 0) return; // nothing to rotate

    const AUTO_PLAY_DELAY = 5000;      // ms between automatic advances
    const SWIPE_THRESHOLD_PX = 50;     // minimum horizontal travel that counts as a swipe
    // NOTE(review): these two values presumably mirror the card width and gap
    // declared in styles.css — confirm they stay in sync with the stylesheet.
    const CARD_WIDTH_RATIO = 0.66666;
    const CARD_GAP_PX = 32;

    // Users who ask for reduced motion get a carousel that only moves on request.
    const prefersReducedMotion = typeof window.matchMedia === 'function'
        && window.matchMedia('(prefers-reduced-motion: reduce)').matches;

    let currentIndex = 0;
    let autoPlayInterval = null;
    let isSectionVisible = false; // last state reported by the IntersectionObserver
    let isHovered = false;        // pointer is currently over the carousel wrapper

    // Build one dot per card. The dots are optional: a missing container must
    // not prevent the rest of the carousel from working.
    const dots = [];
    if (dotsContainer) {
        for (let i = 0; i < cardCount; i++) {
            const dot = document.createElement('button');
            dot.type = 'button';
            dot.className = 'carousel-dot' + (i === 0 ? ' active' : '');
            dot.setAttribute('aria-label', `Go to slide ${i + 1}`);
            dot.addEventListener('click', () => goToSlide(i));
            dotsContainer.appendChild(dot);
            dots.push(dot);
        }
    }

    // Re-positions the track so the current card is centred and refreshes the
    // active/side classes on cards and dots.
    function updateCarousel() {
        const cardWidth = track.offsetWidth * CARD_WIDTH_RATIO;
        const offset = (track.offsetWidth - cardWidth) / 2 - currentIndex * (cardWidth + CARD_GAP_PX);

        track.style.transform = `translateX(${offset}px)`;

        cards.forEach((card, i) => {
            card.classList.remove('active', 'side');
            card.classList.add(i === currentIndex ? 'active' : 'side');
        });

        dots.forEach((dot, i) => {
            dot.classList.toggle('active', i === currentIndex);
        });
    }

    // Selects a slide; indices past either end wrap around to the other end.
    function goToSlide(index) {
        if (index < 0) {
            currentIndex = cardCount - 1;
        } else if (index >= cardCount) {
            currentIndex = 0;
        } else {
            currentIndex = index;
        }
        updateCarousel();
    }

    function nextSlide() { goToSlide(currentIndex + 1); }
    function prevSlide() { goToSlide(currentIndex - 1); }

    if (prevBtn) prevBtn.addEventListener('click', prevSlide);
    if (nextBtn) nextBtn.addEventListener('click', nextSlide);

    // Arrow-key navigation. Skipped while the user is typing in an editable
    // element or holding a modifier (e.g. Alt+Left is the browser's "back").
    document.addEventListener('keydown', (e) => {
        if (e.defaultPrevented || e.altKey || e.ctrlKey || e.metaKey) return;

        const target = e.target;
        const tagName = target && target.tagName;
        if (target && (target.isContentEditable
            || tagName === 'INPUT' || tagName === 'TEXTAREA' || tagName === 'SELECT')) {
            return;
        }

        if (e.key === 'ArrowLeft') prevSlide();
        if (e.key === 'ArrowRight') nextSlide();
    });

    // Touch swipe: compare the horizontal position at touchstart and touchend.
    let touchStartX = 0;
    track.addEventListener('touchstart', (e) => {
        touchStartX = e.changedTouches[0].screenX;
    }, { passive: true });

    track.addEventListener('touchend', (e) => {
        const diff = touchStartX - e.changedTouches[0].screenX;
        if (Math.abs(diff) > SWIPE_THRESHOLD_PX) {
            if (diff > 0) nextSlide();
            else prevSlide();
        }
    }, { passive: true });

    function stopAutoPlay() {
        if (autoPlayInterval !== null) {
            clearInterval(autoPlayInterval);
            autoPlayInterval = null;
        }
    }

    // (Re)starts the timer; always clears a previous one so timers never stack.
    function startAutoPlay() {
        stopAutoPlay();
        if (prefersReducedMotion) return;
        autoPlayInterval = setInterval(nextSlide, AUTO_PLAY_DELAY);
    }

    // Auto-play runs only while the section is on screen and not hovered.
    function syncAutoPlay() {
        if (isSectionVisible && !isHovered) {
            startAutoPlay();
        } else {
            stopAutoPlay();
        }
    }

    if (carouselWrapper) {
        carouselWrapper.addEventListener('mouseenter', () => {
            isHovered = true;
            syncAutoPlay();
        });
        carouselWrapper.addEventListener('mouseleave', () => {
            isHovered = false;
            syncAutoPlay();
        });
    }

    // The offset is computed in pixels from the track width, so it has to be
    // recomputed whenever the viewport size changes.
    window.addEventListener('resize', updateCarousel);

    updateCarousel();

    // Start auto-play when at least 30% of the results section is visible and
    // stop it again when the section scrolls away.
    if (resultsSection && 'IntersectionObserver' in window) {
        const carouselObserver = new IntersectionObserver((entries) => {
            entries.forEach((entry) => {
                isSectionVisible = entry.isIntersecting;
                syncAutoPlay();
            });
        }, {
            threshold: 0.3
        });

        carouselObserver.observe(resultsSection);
    }
});
|
|
</script> |
|
|
|
|
|
|
|
|
<script> |
|
|
// Scroll hint for the trajectory chat window: the hint is faded in one second
// after the DOM is ready and receives the 'hidden' class once the chat window
// has been scrolled more than 50px from the top.
document.addEventListener('DOMContentLoaded', () => {
    const hintEl = document.getElementById('scroll-hint');
    const chatEl = document.getElementById('chat-window');

    // Both elements are required; do nothing when either one is absent.
    if (!chatEl || !hintEl) return;

    const hideHintAfterScrolling = () => {
        const scrolledPastThreshold = chatEl.scrollTop > 50;
        if (scrolledPastThreshold) {
            hintEl.classList.add('hidden');
        }
    };
    chatEl.addEventListener('scroll', hideHintAfterScrolling);

    // Delayed reveal of the hint via an inline opacity override.
    const revealHint = () => {
        hintEl.style.opacity = '1';
    };
    setTimeout(revealHint, 1000);
});
|
|
</script> |
|
|
</body> |
|
|
|
|
|
</html> |