| <!DOCTYPE html> |
| <html lang="en"> |
|
|
| <head> |
| <meta charset="UTF-8"> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| <meta name="description" content="DDR-Bench: A Deep Data Research Agent Benchmark for LLMs"> |
| <title>DDR-Bench | Deep Data Research Benchmark</title> |
| <link rel="preconnect" href="https://fonts.googleapis.com"> |
| <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> |
| <!-- Third-party libraries. `defer` stops them from blocking HTML parsing; deferred scripts still execute in document order, so these run before the local deferred scripts listed after them. --> |
| <script src="https://cdn.plot.ly/plotly-2.27.0.min.js" defer></script> |
| <script src="https://cdn.jsdelivr.net/npm/marked@4.3.0/marked.min.js" defer></script> |
| <link rel="stylesheet" |
| href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/atom-one-dark.min.css"> |
| <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js" defer></script> |
| <script src="https://unpkg.com/sql-formatter@4.0.2/dist/sql-formatter.min.js" defer></script> |
| <!-- Local data and rendering scripts (each data file precedes the script that presumably consumes it; confirm before reordering). --> |
| <script src="data.js" defer></script> |
| <script src="entropy_data.js" defer></script> |
| <script src="trajectory_data.js" defer></script> |
| <script src="trajectory.js" defer></script> |
| <script src="benchmarking_data.js" defer></script> |
| <script src="benchmarking.js" defer></script> |
| <script src="charts.js" defer></script> |
| <link rel="stylesheet" href="styles.css?v=3"> |
| <style> |
| /* Placeholder shown inside a chart container: centred, muted "Loading chart..." text. */ |
| .chart-loading { |
| display: flex; |
| align-items: center; |
| justify-content: center; |
| min-height: 300px; |
| color: var(--color-text-muted, #64748B); |
| font-size: 14px; |
| } |
| |
| /* The label is generated content so the placeholder element itself can stay empty. */ |
| .chart-loading::after { |
| content: 'Loading chart...'; |
| animation: pulse 1.5s ease-in-out infinite; |
| } |
| |
| /* Fades the label between 40% and 100% opacity. */ |
| @keyframes pulse { |
| |
| 0%, |
| 100% { |
| opacity: 0.4; |
| } |
| |
| 50% { |
| opacity: 1; |
| } |
| } |
| |
| /* Honour the user's reduced-motion preference: show the label without pulsing. */ |
| @media (prefers-reduced-motion: reduce) { |
| .chart-loading::after { |
| animation: none; |
| } |
| } |
| </style> |
| </head> |
|
|
| <body> |
| <header class="hero"> |
| <!-- Hero: preview image, paper title, abstract, authors, affiliation logos and external resource links. --> |
| <div class="hero-content"> |
| |
| <img src="assets/social_preview.png" alt="DDR-Bench - Deep Data Research" class="hero-preview-img"> |
| <!-- NOTE(review): this is the page's main title but it is marked up as <h2>, and the document contains no <h1>. Promoting it needs a matching check of styles.css. --> |
| <h2>Hunt Instead of Wait: Evaluating Deep Data Research on Large Language Models</h2> |
| <p class="description"> |
| We distinguish <em>investigatory intelligence</em> (autonomously setting goals and exploring) from |
| <em>executional intelligence</em> (completing assigned tasks), arguing that true agency requires the |
| former. |
| To evaluate this, we introduce <strong>Deep Data Research (DDR)</strong>, an open-ended task where LLMs |
| autonomously extract insights from databases, and <strong>DDR-Bench</strong>, a large-scale, |
| checklist-based benchmark enabling verifiable evaluation. |
| Results show that while frontier models display emerging agency, long-horizon exploration remains |
| challenging, with effective investigatory intelligence depending on intrinsic agentic strategies beyond |
| mere scaffolding or scaling. |
| </p> |
| <div class="meta-info"> |
| <div class="meta-row authors"> |
| <span class="meta-item"> |
| <a href="https://thinkwee.top/about" target="_blank" rel="noopener noreferrer">Wei Liu</a>, |
| <a href="https://github.com/yupeijei1997" target="_blank" rel="noopener noreferrer">Peijie |
| Yu</a>, |
| <a href="https://www.kcl.ac.uk/people/michele-orini" target="_blank" |
| rel="noopener noreferrer">Michele Orini</a>, |
| <a href="https://yalidu.github.io/" target="_blank" rel="noopener noreferrer">Yali Du</a>, |
| <a href="https://sites.google.com/view/yulanhe/home" target="_blank" |
| rel="noopener noreferrer">Yulan He</a> |
| </span> |
| </div> |
| <!-- Each logo is the only content of its link, so its alt text doubles as the link's accessible name. --> |
| <div class="meta-row affiliations"> |
| <a href="https://kclnlp.github.io/" target="_blank" rel="noopener noreferrer"> |
| <img src="assets/kcl.svg" alt="King's College London" class="affiliation-logo kcl-logo"> |
| </a> |
| <a href="https://www.tencent.com/en-us/" target="_blank" rel="noopener noreferrer"> |
| <img src="assets/tencent.png" alt="Tencent" class="affiliation-logo"> |
| </a> |
| <a href="https://www.turing.ac.uk/" target="_blank" rel="noopener noreferrer"> |
| <img src="assets/alan.png" alt="The Alan Turing Institute" class="affiliation-logo"> |
| </a> |
| </div> |
| <!-- Resource links. Every link has visible text, so its icon is decorative: inline SVGs are aria-hidden and icon images use an empty alt to avoid announcing the name twice. --> |
| <div class="meta-row links"> |
| <a href="https://huggingface.co/collections/thinkwee/ddrbench" class="platform-btn dataset-btn" |
| target="_blank" rel="noopener noreferrer"> |
| <svg viewBox="0 0 24 24" width="30" height="30" fill="none" stroke="currentColor" |
| stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"> |
| <ellipse cx="12" cy="5" rx="9" ry="3" /> |
| <path d="M3 5v14c0 1.66 4 3 9 3s9-1.34 9-3V5" /> |
| <path d="M3 12c0 1.66 4 3 9 3s9-1.34 9-3" /> |
| </svg> |
| Dataset |
| </a> |
| <a href="https://github.com/thinkwee/DDR_Bench" class="platform-btn github-btn" target="_blank" |
| rel="noopener noreferrer"> |
| <svg viewBox="0 0 24 24" width="30" height="30" fill="currentColor" aria-hidden="true"> |
| <path |
| d="M12 2C6.477 2 2 6.477 2 12c0 4.42 2.865 8.17 6.839 9.49.5.092.682-.217.682-.482 0-.237-.008-.866-.013-1.7-2.782.603-3.369-1.34-3.369-1.34-.454-1.156-1.11-1.463-1.11-1.463-.908-.62.069-.608.069-.608 1.003.07 1.531 1.03 1.531 1.03.892 1.529 2.341 1.087 2.91.831.092-.646.35-1.086.636-1.336-2.22-.253-4.555-1.11-4.555-4.943 0-1.091.39-1.984 1.029-2.683-.103-.253-.446-1.27.098-2.647 0 0 .84-.269 2.75 1.025A9.578 9.578 0 0112 6.836c.85.004 1.705.114 2.504.336 1.909-1.294 2.747-1.025 2.747-1.025.546 1.377.203 2.394.1 2.647.64.699 1.028 1.592 1.028 2.683 0 3.842-2.339 4.687-4.566 4.935.359.309.678.919.678 1.852 0 1.336-.012 2.415-.012 2.743 0 .267.18.578.688.48C19.138 20.167 22 16.418 22 12c0-5.523-4.477-10-10-10z" /> |
| </svg> |
| Code |
| </a> |
| <a href="https://huggingface.co/papers/2602.02039" class="platform-btn huggingface-btn" |
| target="_blank" rel="noopener noreferrer"> |
| <img src="assets/hf-logo-pirate.svg" alt="" width="30" height="30" |
| class="platform-icon"> |
| HuggingFace |
| </a> |
| <a href="https://arxiv.org/abs/2602.02039" class="platform-btn arxiv-btn" target="_blank" |
| rel="noopener noreferrer"> |
| <img src="assets/arxiv-logomark-small.svg" alt="" width="30" height="30" |
| class="platform-icon"> |
| arXiv |
| </a> |
| <a href="https://www.alphaxiv.org/abs/2602.02039" class="platform-btn alphaxiv-btn" target="_blank" |
| rel="noopener noreferrer"> |
| <img src="assets/alphaxiv_logo.png" alt="" width="30" height="30" class="platform-icon"> |
| AlphaXiv |
| </a> |
| <a href="https://thinkwee.notion.site/ddrbench" class="platform-btn notion-btn" target="_blank" |
| rel="noopener noreferrer"> |
| <svg viewBox="0 0 24 24" width="30" height="30" fill="currentColor" aria-hidden="true"> |
| <path |
| d="M19 3H5c-1.103 0-2 .897-2 2v14c0 1.103.897 2 2 2h14c1.103 0 2-.897 2-2V5c0-1.103-.897-2-2-2zM9 17H7.17V7H9l5.83 6.91V7H16.83v10H15L9.17 10.09V17z" /> |
| </svg> |
| Notion Blog |
| </a> |
| </div> |
| </div> |
| </div> |
| </header> |
|
|
| |
| <main class="content"> |
|
|
| |
| <section id="framework" class="section visible framework-section"> |
| <!-- Framework Overview: two figure cards. Each image sits behind a skeleton placeholder that its onload handler hides once the image has loaded. --> |
| <div class="section-header"> |
| <h2> |
| <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
| stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"> |
| <rect width="18" height="18" x="3" y="3" rx="2" ry="2" /> |
| <line x1="3" x2="21" y1="9" y2="9" /> |
| <line x1="9" x2="9" y1="21" y2="9" /> |
| </svg> |
| Framework Overview |
| </h2> |
| <p>Overview of DDR-Bench.</p> |
| </div> |
| <div class="framework-grid"> |
| <div class="framework-card"> |
| <div class="framework-img-wrapper"> |
| <!-- The onload handler below relies on the skeleton being the image's previous sibling. --> |
| <div class="skeleton-loader" aria-hidden="true"></div> |
| <img src="assets/framework_task.png" alt="Task Formulation Framework" class="framework-img" |
| loading="lazy" |
| onload="this.classList.add('loaded'); this.previousElementSibling.style.display='none';"> |
| </div> |
| <h3>Task Formulation</h3> |
| <p class="framework-description">A case of Claude Sonnet 4.5's trajectory and evaluation checklist |
| in the MIMIC scenario of DDR-Bench. Verified facts and supporting insights are |
| <u>underlined</u>. The agent is asked to perform multiple ReAct turns to explore the database |
| without predefined targets or queries, and to autonomously mine insights from the exploration. |
| </p> |
| </div> |
| <div class="framework-card"> |
| <div class="framework-img-wrapper"> |
| <div class="skeleton-loader" aria-hidden="true"></div> |
| <img src="assets/framework_pipeline.png" alt="Evaluation Pipeline Framework" |
| class="framework-img" loading="lazy" |
| onload="this.classList.add('loaded'); this.previousElementSibling.style.display='none';"> |
| </div> |
| <h3>Evaluation Pipeline</h3> |
| <p class="framework-description"><b>Left</b>: Compared with previous tasks, <i>DDR</i> maximises |
| exploration openness and agency, focusing on the direct evaluation of insight quality. |
| <b>Right</b>: Overview of DDR-Bench. The checklist derived from the freeform parts of the |
| database is used to evaluate the agent-generated insights from the exploration of the structured |
| parts of the database. |
| </p> |
| </div> |
| </div> |
| </section> |
|
|
|
|
|
|
| |
| <section id="trajectory" class="section visible trajectory-section"> |
| <!-- Agent Trajectory: scenario toggle, scenario description and a chat window. The chat window starts with a loading placeholder; its content is presumably rendered by the trajectory scripts. --> |
| <div class="section-header"> |
| <h2> |
| <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
| stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"> |
| <polyline points="22 12 18 12 15 21 9 3 6 12 2 12"></polyline> |
| </svg> |
| Agent Trajectory |
| </h2> |
| <p>Observe the autonomous decision-making process of the agent across different scenarios.</p> |
| </div> |
| |
| <!-- Scenario switcher: explicit type="button" keeps these from acting as submit buttons if the markup is ever placed inside a form. --> |
| <div class="dimension-toggle" role="group" aria-label="Trajectory scenario"> |
| <button type="button" class="dim-btn active" data-traj-scenario="mimic">MIMIC</button> |
| <button type="button" class="dim-btn" data-traj-scenario="10k">10-K</button> |
| <button type="button" class="dim-btn" data-traj-scenario="globem">GLOBEM</button> |
| </div> |
| |
| <p id="trajectory-scenario-description" class="trajectory-description"> |
| Exploring clinical patterns and patient outcomes in a large-scale electronic health record (EHR) |
| database. |
| </p> |
| |
| <div class="trajectory-container"> |
| <div id="chat-window" class="chat-window"> |
| |
| <div class="loading-message">Loading trajectory data...</div> |
| </div> |
| <!-- Hint overlay; an inline script at the end of the page hides it once #chat-window has been scrolled. --> |
| <div class="scroll-hint" id="scroll-hint"> |
| <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24" fill="none" |
| stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"> |
| <path d="M12 5v14M19 12l-7 7-7-7" /> |
| </svg> |
| <span>Scroll to see more</span> |
| </div> |
| </div> |
| </section> |
|
|
| |
| <section id="benchmarking" class="section visible benchmarking-section"> |
| <!-- Benchmarking: one wide chart container (#benchmarking-chart, empty in the markup and presumably filled by the benchmarking script) plus a written summary. --> |
| <div class="section-header"> |
| <h2> |
| <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
| stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"> |
| <line x1="12" x2="12" y1="20" y2="10" /> |
| <line x1="18" x2="18" y1="20" y2="4" /> |
| <line x1="6" x2="6" y1="20" y2="16" /> |
| </svg> |
| Benchmarking |
| </h2> |
| <p>Overall average accuracy across all scenarios and evaluation metrics. |
| <br> |
| <span class="model-badge proprietary">Purple = Proprietary</span> |
| <span class="model-badge opensource">Green = Open-source</span> |
| </p> |
| </div> |
| <div class="charts-grid single"> |
| <div class="chart-card wide"> |
| <div id="benchmarking-chart" class="chart-container-benchmarking"></div> |
| </div> |
| </div> |
| <p class="section-description">Claude 4.5 Sonnet achieves the highest overall average accuracy at 47.73%, |
| significantly outperforming other models. Among open-source models, DeepSeek-V3.2 leads with 38.80%, |
| followed closely by GLM-4.6 (37.52%) and Kimi K2 (36.42%). The results demonstrate a clear performance |
| gap between frontier proprietary models and open-source alternatives, though top open-source models |
| remain competitive with mid-tier proprietary offerings.</p> |
| </section> |
|
|
| |
|
|
| <section id="results" class="section visible results-section"> |
| <!-- Experiments: a carousel of result figures. An inline script at the end of the page drives #results-carousel, fills #results-dots and binds the .carousel-prev and .carousel-next buttons. --> |
| <div class="section-header"> |
| <h2> |
| <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
| stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"> |
| <path d="M3 3v18h18" /> |
| <path d="m19 9-5 5-4-4-3 3" /> |
| </svg> |
| Experiments |
| </h2> |
| <p>Main benchmark results and in-depth analysis of agent capabilities.</p> |
| </div> |
| |
| <div class="carousel-wrapper"> |
| <!-- Icon-only buttons: aria-label is the accessible name, so the SVG is hidden from assistive technology. --> |
| <button type="button" class="carousel-btn carousel-prev" aria-label="Previous"> |
| <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
| stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"> |
| <path d="m15 18-6-6 6-6" /> |
| </svg> |
| </button> |
| |
| <div class="carousel-track" id="results-carousel"> |
| |
| <div class="carousel-card"> |
| <img src="assets/overall.png" alt="Overall Performance"> |
| <h4>Overall Performance</h4> |
| <p class="card-caption">Systematic evaluation of mainstream LLMs across MIMIC, 10-K, and GLOBEM |
| datasets reveals persistent limitations in frontier models.</p> |
| </div> |
| |
| <div class="carousel-card"> |
| <img src="assets/qwenfamily.png" alt="Qwen Family Performance"> |
| <h4>Training-time Factors Analysis</h4> |
| <p class="card-caption">Training-time factors study within the Qwen family. From left to right, |
| the three columns examine inference-time scaling performance across all scenarios for models |
| with different parameter scales, context optimisation methods, and model generations with |
| different training strategies.</p> |
| </div> |
| |
| <div class="carousel-card"> |
| <img src="assets/reasoning.png" alt="Reasoning Budget"> |
| <h4>Reasoning Budget</h4> |
| <p class="card-caption">Increasing the reasoning budget reduces interaction rounds but |
| illustrates a trade-off between reasoning depth and exploration efficiency.</p> |
| </div> |
| |
| <div class="carousel-card"> |
| <img src="assets/memory.png" alt="Memory Mechanism"> |
| <h4>Memory Mechanism</h4> |
| <p class="card-caption">Long-short-term memory can create unpredictable behavior, often |
| increasing tool usage without consistently improving final accuracy.</p> |
| </div> |
| |
| <div class="carousel-card"> |
| <img src="assets/agency.png" alt="Proactive vs Reactive"> |
| <h4>Proactive vs Reactive</h4> |
| <p class="card-caption">Models perform significantly better with explicit queries (Reactive), |
| highlighting the difficulty of true proactive goal formulation.</p> |
| </div> |
| |
| <div class="carousel-card"> |
| <img src="assets/hallucination.png" alt="Hallucination Analysis"> |
| <h4>Hallucination Analysis</h4> |
| <p class="card-caption">Hallucination rates (%) across models in DDR-Bench, measured as the |
| proportion of insights containing factual but unfaithful information that is not derivable |
| from the provided inputs. The rates are low.</p> |
| </div> |
| |
| <div class="carousel-card"> |
| <img src="assets/hallu_acc_corr.png" alt="Hallucination-Accuracy Correlation"> |
| <h4>Hallucination-Accuracy Correlation</h4> |
| <p class="card-caption">Hallucination rates show almost no correlation with final accuracy, |
| indicating robustness against metric inflation via memorization.</p> |
| </div> |
| |
| <div class="carousel-card"> |
| <img src="assets/trustworthiness.png" alt="Trustworthiness"> |
| <h4>Trustworthiness</h4> |
| <p class="card-caption">Verification of the LLM-as-a-Checker pipeline demonstrates high |
| alignment with human expert judgments and stability across multiple runs.</p> |
| </div> |
| </div> |
| |
| <button type="button" class="carousel-btn carousel-next" aria-label="Next"> |
| <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
| stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"> |
| <path d="m9 18 6-6-6-6" /> |
| </svg> |
| </button> |
| </div> |
| |
| <!-- Left empty in the markup: the carousel script appends one dot button per card. --> |
| <div class="carousel-dots" id="results-dots"></div> |
| </section> |
|
|
| |
| <section id="scaling" class="section visible"> |
| <!-- Scaling Analysis: a Turns/Tokens/Cost toggle, a shared legend and one chart container per dataset. The containers are empty in the markup and presumably filled by the chart scripts. --> |
| <div class="section-header"> |
| <h2> |
| <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
| stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"> |
| <line x1="12" x2="12" y1="20" y2="10" /> |
| <line x1="18" x2="18" y1="20" y2="4" /> |
| <line x1="6" x2="6" y1="20" y2="16" /> |
| </svg> |
| Scaling Analysis |
| </h2> |
| <p>Explore how model performance scales with interaction turns, token usage, and inference cost.</p> |
| </div> |
| <div class="dimension-toggle" role="group" aria-label="Scaling dimension"> |
| <button type="button" class="dim-btn active" data-dim="turn">Turns</button> |
| <button type="button" class="dim-btn" data-dim="token">Tokens</button> |
| <button type="button" class="dim-btn" data-dim="cost">Cost</button> |
| </div> |
| <div id="scaling-legend" class="shared-legend"></div> |
| <div class="charts-grid three-col"> |
| <div class="chart-card"> |
| <h3>MIMIC</h3> |
| <div id="scaling-mimic" class="chart-container"></div> |
| </div> |
| <div class="chart-card"> |
| <h3>10-K</h3> |
| <div id="scaling-10k" class="chart-container"></div> |
| </div> |
| <div class="chart-card"> |
| <h3>GLOBEM</h3> |
| <div id="scaling-globem" class="chart-container"></div> |
| </div> |
| </div> |
| <p class="section-description">LLMs extract more accurate insights by delaying commitment, and they |
| concentrate reasoning into a small number of highly valuable late-stage interactions. These targeted |
| interactions are built upon longer early exploration.</p> |
| </section> |
|
|
| |
| <section id="ranking" class="section visible"> |
| <!-- Novelty vs Accuracy: a sort-mode toggle and one ranking chart container per dataset. The containers are empty in the markup and presumably filled by the chart scripts. --> |
| <div class="section-header"> |
| <h2> |
| <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
| stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"> |
| <path d="M6 9H4.5a2.5 2.5 0 0 1 0-5H6" /> |
| <path d="M18 9h1.5a2.5 2.5 0 0 0 0-5H18" /> |
| <path d="M4 22h16" /> |
| <path d="M10 14.66V17c0 .55-.47.98-.97 1.21C7.85 18.75 7 20.24 7 22" /> |
| <path d="M14 14.66V17c0 .55.47.98.97 1.21C16.15 18.75 17 20.24 17 22" /> |
| <path d="M18 2H6v7a6 6 0 0 0 12 0V2Z" /> |
| </svg> |
| Novelty vs Accuracy |
| </h2> |
| <p> |
| Novelty (Bradley-Terry) vs Accuracy ranking |
| <br> |
| ● = Novelty, ◇ = Accuracy. |
| <br> |
| <span class="model-badge proprietary">Purple = Proprietary</span> |
| <span class="model-badge opensource">Green = Open-source</span> |
| </p> |
| </div> |
| <div class="dimension-toggle" role="group" aria-label="Ranking sort order"> |
| <button type="button" class="dim-btn ranking-dim active" data-mode="novelty">Sort by Novelty</button> |
| <button type="button" class="dim-btn ranking-dim" data-mode="accuracy">Sort by Accuracy</button> |
| </div> |
| <div class="charts-grid three-col"> |
| <div class="chart-card"> |
| <h3>MIMIC</h3> |
| <div id="ranking-mimic" class="chart-container-tall"></div> |
| </div> |
| <div class="chart-card"> |
| <h3>10-K</h3> |
| <div id="ranking-10k" class="chart-container-tall"></div> |
| </div> |
| <div class="chart-card"> |
| <h3>GLOBEM</h3> |
| <div id="ranking-globem" class="chart-container-tall"></div> |
| </div> |
| </div> |
| <p class="section-description">The ranking induced by novel insight usefulness closely aligns with the |
| ranking based on checklist accuracy. Differences between the two rankings are small, especially among |
| the top-performing models.</p> |
| </section> |
|
|
| |
| <section id="turn" class="section visible"> |
| <!-- Turn Distribution: one chart container per dataset (empty in the markup and presumably filled by the chart scripts) plus a written summary. --> |
| <div class="section-header"> |
| <h2> |
| <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
| stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"> |
| <path d="M21 12a9 9 0 1 1-9-9c2.52 0 4.93 1 6.74 2.74L21 8" /> |
| <path d="M21 3v5h-5" /> |
| </svg> |
| Turn Distribution |
| </h2> |
| <p>Analyze the distribution of interaction turns across different models and datasets.</p> |
| </div> |
| <div class="charts-grid three-col"> |
| <div class="chart-card"> |
| <h3>MIMIC</h3> |
| <div id="turn-mimic" class="chart-container-tall"></div> |
| </div> |
| <div class="chart-card"> |
| <h3>10-K</h3> |
| <div id="turn-10k" class="chart-container-tall"></div> |
| </div> |
| <div class="chart-card"> |
| <h3>GLOBEM</h3> |
| <div id="turn-globem" class="chart-container-tall"></div> |
| </div> |
| </div> |
| <p class="section-description">Stronger models tend to explore for more rounds without external prompting. |
| Knowledge-intensive databases such as 10-K and MIMIC induce more interaction rounds than signal-based |
| datasets such as GLOBEM, and the resulting distributions are also more uniform.</p> |
| </section> |
|
|
| |
| <section id="entropy" class="section visible"> |
| <!-- Exploration Pattern: a 10-K/MIMIC toggle and six per-model chart cards. Each card title carries an id (entropy-model-N-title), presumably so a script can update it; the chart containers are empty in the markup. --> |
| <div class="section-header"> |
| <h2> |
| <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
| stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"> |
| <circle cx="7.5" cy="7.5" r="1.5" /> |
| <circle cx="18.5" cy="5.5" r="1.5" /> |
| <circle cx="11.5" cy="11.5" r="1.5" /> |
| <circle cx="7.5" cy="16.5" r="1.5" /> |
| <circle cx="17.5" cy="14.5" r="1.5" /> |
| </svg> |
| Exploration Pattern |
| </h2> |
| <p>Scatter plot showing Access Entropy vs Coverage by model. Opacity represents accuracy. Higher entropy |
| = more uniform access; higher coverage = more fields explored.</p> |
| </div> |
| <!-- This toggle uses the toggle-btn class, unlike the dim-btn toggles elsewhere on the page; the class is left unchanged because scripts or styles may select on it. --> |
| <div class="dimension-toggle" role="group" aria-label="Exploration scenario"> |
| <button type="button" class="toggle-btn active" data-entropy-scenario="10k">10-K</button> |
| <button type="button" class="toggle-btn" data-entropy-scenario="mimic">MIMIC</button> |
| </div> |
| <div class="charts-grid three-col"> |
| <div class="chart-card"> |
| <h3 id="entropy-model-0-title">GPT-5.2</h3> |
| <div id="entropy-model-0" class="chart-container-tall"></div> |
| </div> |
| <div class="chart-card"> |
| <h3 id="entropy-model-1-title">Claude-4.5-Sonnet</h3> |
| <div id="entropy-model-1" class="chart-container-tall"></div> |
| </div> |
| <div class="chart-card"> |
| <h3 id="entropy-model-2-title">Gemini-3-Flash</h3> |
| <div id="entropy-model-2" class="chart-container-tall"></div> |
| </div> |
| <div class="chart-card"> |
| <h3 id="entropy-model-3-title">GLM-4.6</h3> |
| <div id="entropy-model-3" class="chart-container-tall"></div> |
| </div> |
| <div class="chart-card"> |
| <h3 id="entropy-model-4-title">Qwen3-Next-80B-A3B</h3> |
| <div id="entropy-model-4" class="chart-container-tall"></div> |
| </div> |
| <div class="chart-card"> |
| <h3 id="entropy-model-5-title">DeepSeek-V3.2</h3> |
| <div id="entropy-model-5" class="chart-container-tall"></div> |
| </div> |
| </div> |
| <p class="section-description">Advanced LLMs tend to operate in a balanced exploration regime that combines |
| adequate coverage with focused access. Such a regime is consistently observed across different |
| scenarios.</p> |
| </section> |
|
|
| |
| <section id="error" class="section visible"> |
| <!-- Error Analysis: one wide chart container (#error-chart, empty in the markup and presumably filled by the chart scripts) plus a written summary. --> |
| <div class="section-header"> |
| <h2> |
| <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
| stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"> |
| <path d="m21.73 18-8-14a2 2 0 0 0-3.48 0l-8 14A2 2 0 0 0 4 21h16a2 2 0 0 0 1.73-3Z" /> |
| <line x1="12" x2="12" y1="9" y2="13" /> |
| <line x1="12" x2="12.01" y1="17" y2="17" /> |
| </svg> |
| Error Analysis |
| </h2> |
| <p>Breakdown of error types encountered during agent interactions, grouped by main categories.</p> |
| </div> |
| <div class="charts-grid single"> |
| <div class="chart-card wide"> |
| <div id="error-chart" class="chart-container-double"></div> |
| </div> |
| </div> |
| <p class="section-description">Our findings revealed that 58% of errors stemmed from insufficient |
| exploration, both in terms of breadth and depth. This imbalance in exploration often leads to suboptimal |
| results, regardless of the model’s overall capability. |
| Additionally, around 40% of the errors were attributed to other factors. For more powerful models, |
| over-reasoning was common, where the model made assumptions not fully supported by the data. In other |
| cases, models misinterpreted the insights, such as mistaking a downward trend for an upward one. Less |
| capable models, on the other hand, tended to make more fundamental errors, such as repeatedly debugging |
| or struggling with missing data, which could disrupt the overall coherence of the analysis.</p> |
| </section> |
|
|
| |
| <section id="probing" class="section visible"> |
| <!-- Self-Termination: a shared legend and one chart container per dataset (empty in the markup and presumably filled by the chart scripts) plus a written summary. --> |
| <div class="section-header"> |
| <h2> |
| <svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" |
| stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" aria-hidden="true"> |
| <circle cx="11" cy="11" r="8" /> |
| <path d="m21 21-4.3-4.3" /> |
| </svg> |
| Self-Termination |
| </h2> |
| <p>Analyze the willingness of models to terminate their own analysis.</p> |
| </div> |
| <div id="probing-legend" class="shared-legend"></div> |
| <!-- Card order here is MIMIC, GLOBEM, 10-K, which differs from the other three-column sections; it is left unchanged in case it is intentional. --> |
| <div class="charts-grid three-col"> |
| <div class="chart-card"> |
| <h3>MIMIC</h3> |
| <div id="probing-mimic" class="chart-container-tall"></div> |
| </div> |
| <div class="chart-card"> |
| <h3>GLOBEM</h3> |
| <div id="probing-globem" class="chart-container-tall"></div> |
| </div> |
| <div class="chart-card"> |
| <h3>10-K</h3> |
| <div id="probing-10k" class="chart-container-tall"></div> |
| </div> |
| </div> |
| <p class="section-description">Clear differences emerge across model generations. Qwen3 and Qwen3-Next |
| exhibit a consistently increasing probability, indicating growing confidence that a complete report can |
| be produced as more information is accumulated, whereas the Qwen2.5 series shows pronounced fluctuations |
| and remains uncertain about whether exploration can be terminated at the current step. Moreover, |
| Qwen3-Next maintains higher confidence with lower variance throughout, suggesting that it has more |
| confidence that exploration is progressing towards a more comprehensive and deeper report.</p> |
| </section> |
| </main> |
|
|
| |
| <footer class="footer"> <!-- Page footer: copyright line naming the three affiliated institutions. --> |
| <p>DDR-Bench © 2026 | King's College London · Tencent · The Alan Turing Institute</p> |
| </footer> |
|
|
| |
| <script> |
| /* |
|  * Results carousel. |
|  * |
|  * Drives the #results-carousel track: creates one dot button per |
|  * .carousel-card inside #results-dots, centres the active card by |
|  * translating the track, and supports the prev/next buttons, the arrow |
|  * keys, touch swipes and timed auto-play. |
|  * |
|  * Auto-play runs only while #results is at least 30% visible, the pointer |
|  * is outside .carousel-wrapper and the user has not asked for reduced |
|  * motion. Missing optional elements (dots container, buttons, wrapper) are |
|  * skipped rather than treated as errors. |
|  */ |
| document.addEventListener('DOMContentLoaded', function () { |
|     const track = document.getElementById('results-carousel'); |
|     if (!track) return; |
| |
|     const cards = Array.from(track.querySelectorAll('.carousel-card')); |
|     const cardCount = cards.length; |
|     // With no cards the wrap-around logic would produce index -1; nothing to drive. |
|     if (cardCount === 0) return; |
| |
|     const dotsContainer = document.getElementById('results-dots'); |
|     const prevBtn = document.querySelector('.carousel-prev'); |
|     const nextBtn = document.querySelector('.carousel-next'); |
|     const carouselWrapper = document.querySelector('.carousel-wrapper'); |
|     const resultsSection = document.getElementById('results'); |
| |
|     // Layout constants. NOTE(review): presumably these mirror the card width and gap set in styles.css; confirm. |
|     const CARD_WIDTH_RATIO = 0.66666; |
|     const CARD_GAP_PX = 32; |
|     const SWIPE_THRESHOLD_PX = 50; |
|     const AUTO_PLAY_DELAY = 5000; |
| |
|     let currentIndex = 0; |
|     let autoPlayInterval = null; // interval id while auto-play is running, otherwise null |
|     let sectionInView = false;   // updated by the IntersectionObserver below |
|     let pointerInside = false;   // true while the pointer hovers the carousel wrapper |
| |
|     const prefersReducedMotion = typeof window.matchMedia === 'function' |
|         && window.matchMedia('(prefers-reduced-motion: reduce)').matches; |
| |
|     // Build one navigation dot per card (skipped when the container is absent). |
|     const dots = []; |
|     if (dotsContainer) { |
|         cards.forEach((_, i) => { |
|             const dot = document.createElement('button'); |
|             dot.type = 'button'; |
|             dot.className = 'carousel-dot' + (i === 0 ? ' active' : ''); |
|             dot.setAttribute('aria-label', `Go to slide ${i + 1}`); |
|             dot.addEventListener('click', () => userNavigate(i)); |
|             dotsContainer.appendChild(dot); |
|             dots.push(dot); |
|         }); |
|     } |
| |
|     /** Re-position the track and refresh the active/side classes and dots. */ |
|     function updateCarousel() { |
|         const cardWidth = track.offsetWidth * CARD_WIDTH_RATIO; |
|         // Centre the active card: half the leftover width minus the width of the preceding cards. |
|         const offset = (track.offsetWidth - cardWidth) / 2 - currentIndex * (cardWidth + CARD_GAP_PX); |
|         track.style.transform = `translateX(${offset}px)`; |
| |
|         cards.forEach((card, i) => { |
|             const isActive = i === currentIndex; |
|             card.classList.toggle('active', isActive); |
|             card.classList.toggle('side', !isActive); |
|         }); |
| |
|         dots.forEach((dot, i) => { |
|             dot.classList.toggle('active', i === currentIndex); |
|         }); |
|     } |
| |
|     /** Show the slide at `index`, wrapping past either end. */ |
|     function goToSlide(index) { |
|         if (index < 0) { |
|             currentIndex = cardCount - 1; |
|         } else if (index >= cardCount) { |
|             currentIndex = 0; |
|         } else { |
|             currentIndex = index; |
|         } |
|         updateCarousel(); |
|     } |
| |
|     function nextSlide() { goToSlide(currentIndex + 1); } |
| |
|     /** |
|      * Navigation triggered by the user. If auto-play is running, its timer |
|      * is restarted so the slide is not replaced moments after being chosen. |
|      */ |
|     function userNavigate(index) { |
|         goToSlide(index); |
|         if (autoPlayInterval !== null) startAutoPlay(); |
|     } |
| |
|     if (prevBtn) prevBtn.addEventListener('click', () => userNavigate(currentIndex - 1)); |
|     if (nextBtn) nextBtn.addEventListener('click', () => userNavigate(currentIndex + 1)); |
| |
|     // Arrow keys work page-wide, but never steal shortcuts that use a |
|     // modifier key or arrow presses made while editing text. |
|     document.addEventListener('keydown', (e) => { |
|         if (e.defaultPrevented || e.altKey || e.ctrlKey || e.metaKey || e.shiftKey) return; |
|         const target = e.target; |
|         if (target instanceof Element |
|             && target.closest('input, textarea, select, [contenteditable]:not([contenteditable="false"])')) { |
|             return; |
|         } |
|         if (e.key === 'ArrowLeft') userNavigate(currentIndex - 1); |
|         else if (e.key === 'ArrowRight') userNavigate(currentIndex + 1); |
|     }); |
| |
|     // Horizontal swipe: compare the X position at touch start and touch end. |
|     let touchStartX = 0; |
|     track.addEventListener('touchstart', (e) => { |
|         touchStartX = e.changedTouches[0].screenX; |
|     }, { passive: true }); |
| |
|     track.addEventListener('touchend', (e) => { |
|         const diff = touchStartX - e.changedTouches[0].screenX; |
|         if (Math.abs(diff) > SWIPE_THRESHOLD_PX) { |
|             userNavigate(diff > 0 ? currentIndex + 1 : currentIndex - 1); |
|         } |
|     }, { passive: true }); |
| |
|     function stopAutoPlay() { |
|         if (autoPlayInterval !== null) { |
|             clearInterval(autoPlayInterval); |
|             autoPlayInterval = null; |
|         } |
|     } |
| |
|     function startAutoPlay() { |
|         stopAutoPlay(); |
|         autoPlayInterval = setInterval(nextSlide, AUTO_PLAY_DELAY); |
|     } |
| |
|     /** Start or stop auto-play so that it matches the current visibility and hover state. */ |
|     function syncAutoPlay() { |
|         if (sectionInView && !pointerInside && !prefersReducedMotion) { |
|             startAutoPlay(); |
|         } else { |
|             stopAutoPlay(); |
|         } |
|     } |
| |
|     // Pause while hovering. Leaving resumes only if the section is still in view. |
|     if (carouselWrapper) { |
|         carouselWrapper.addEventListener('mouseenter', () => { |
|             pointerInside = true; |
|             syncAutoPlay(); |
|         }); |
|         carouselWrapper.addEventListener('mouseleave', () => { |
|             pointerInside = false; |
|             syncAutoPlay(); |
|         }); |
|     } |
| |
|     // The offset is computed in pixels from the track width, so it must be recomputed on resize. |
|     window.addEventListener('resize', updateCarousel); |
| |
|     updateCarousel(); |
| |
|     // Auto-play only while the results section is sufficiently visible. |
|     if (resultsSection && 'IntersectionObserver' in window) { |
|         const carouselObserver = new IntersectionObserver((entries) => { |
|             entries.forEach((entry) => { |
|                 sectionInView = entry.isIntersecting; |
|                 syncAutoPlay(); |
|             }); |
|         }, { |
|             threshold: 0.3 |
|         }); |
| |
|         carouselObserver.observe(resultsSection); |
|     } |
| }); |
| </script> |
|
|
| |
| <script> |
| /* |
|  * Scroll hint for the trajectory chat window. |
|  * |
|  * Reveals #scroll-hint one second after the DOM is ready and marks it |
|  * `hidden` once #chat-window has been scrolled more than 50px. If the user |
|  * scrolls before the reveal fires, the reveal is skipped so the inline |
|  * opacity cannot bring an already dismissed hint back. |
|  */ |
| document.addEventListener('DOMContentLoaded', function () { |
|     const chatWindow = document.getElementById('chat-window'); |
|     const scrollHint = document.getElementById('scroll-hint'); |
| |
|     if (!chatWindow || !scrollHint) return; |
| |
|     const HIDE_AFTER_SCROLL_PX = 50; |
|     const REVEAL_DELAY_MS = 1000; |
| |
|     // The handler never calls preventDefault, so it is registered as passive. |
|     chatWindow.addEventListener('scroll', function () { |
|         if (chatWindow.scrollTop > HIDE_AFTER_SCROLL_PX) { |
|             scrollHint.classList.add('hidden'); |
|         } |
|     }, { passive: true }); |
| |
|     setTimeout(() => { |
|         // Skip the reveal if the hint was dismissed during the delay. |
|         if (!scrollHint.classList.contains('hidden')) { |
|             scrollHint.style.opacity = '1'; |
|         } |
|     }, REVEAL_DELAY_MS); |
| }); |
| </script> |
| </body> |
|
|
| </html> |