Spaces:

thinkwee
/

DDR_Bench

Running

App Files Files Community

thinkwee committed on Feb 2

Commit

9f54287

1 Parent(s): 9711e49

update links

Browse files

Files changed (3) hide show

index.html +67 -41
styles.css +28 -7
trajectory.js +13 -0

index.html CHANGED Viewed

@@ -59,18 +59,25 @@
                 We distinguish <em>investigatory intelligence</em> (autonomously setting goals and exploring) from
                 <em>executional intelligence</em> (completing assigned tasks), arguing that true agency requires the
                 former.
-                <br>
                 To evaluate this, we introduce <strong>Deep Data Research (DDR)</strong>, an open-ended task where LLMs
                 autonomously extract insights from databases, and <strong>DDR-Bench</strong>, a large-scale,
                 checklist-based benchmark enabling verifiable evaluation.
-                <br>
                 Results show that while frontier models display emerging agency, long-horizon exploration remains
                 challenging, with effective investigatory intelligence depending on intrinsic agentic strategies beyond
                 mere scaffolding or scaling.
             </p>
             <div class="meta-info">
                 <div class="meta-row authors">
-                    <span class="meta-item">Wei Liu, Peijie Yu, Michele Orini, Yali Du, Yulan He</span>
                 </div>
                 <div class="meta-row affiliations">
                     <a href="https://kclnlp.github.io/" target="_blank" rel="noopener noreferrer">
@@ -84,12 +91,12 @@
                     </a>
                 </div>
                 <div class="meta-row links">
-                    <a href="https://huggingface.co/spaces/DDR-Bench" class="platform-btn huggingface-btn">
                         <img src="assets/hf-logo-pirate.svg" alt="HuggingFace" width="30" height="30"
                             class="platform-icon">
                         Data
                     </a>
-                    <a href="https://github.com/DDR-Bench" class="platform-btn github-btn">
                         <svg viewBox="0 0 24 24" width="30" height="30" fill="currentColor">
                             <path
                                 d="M12 2C6.477 2 2 6.477 2 12c0 4.42 2.865 8.17 6.839 9.49.5.092.682-.217.682-.482 0-.237-.008-.866-.013-1.7-2.782.603-3.369-1.34-3.369-1.34-.454-1.156-1.11-1.463-1.11-1.463-.908-.62.069-.608.069-.608 1.003.07 1.531 1.03 1.531 1.03.892 1.529 2.341 1.087 2.91.831.092-.646.35-1.086.636-1.336-2.22-.253-4.555-1.11-4.555-4.943 0-1.091.39-1.984 1.029-2.683-.103-.253-.446-1.27.098-2.647 0 0 .84-.269 2.75 1.025A9.578 9.578 0 0112 6.836c.85.004 1.705.114 2.504.336 1.909-1.294 2.747-1.025 2.747-1.025.546 1.377.203 2.394.1 2.647.64.699 1.028 1.592 1.028 2.683 0 3.842-2.339 4.687-4.566 4.935.359.309.678.919.678 1.852 0 1.336-.012 2.415-.012 2.743 0 .267.18.578.688.48C19.138 20.167 22 16.418 22 12c0-5.523-4.477-10-10-10z" />
@@ -121,24 +128,29 @@
                     </svg>
                     Framework Overview
                 </h2>
-                <p>System architecture and evaluation pipeline of DDR-Bench.</p>
             </div>
             <div class="framework-grid">
                 <div class="framework-card">
                     <img src="assets/framework_task.png" alt="Task Formulation Framework"
                         style="border-radius: var(--radius-md);">
                     <h3>Task Formulation</h3>
-                    <p class="framework-description">Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do
-                        eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
-                        nostrud exercitation ullamco laboris.</p>
                 </div>
                 <div class="framework-card">
                     <img src="assets/framework_pipeline.png" alt="Evaluation Pipeline Framework"
                         style="border-radius: var(--radius-md);">
                     <h3>Evaluation Pipeline</h3>
-                    <p class="framework-description">Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do
-                        eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
-                        nostrud exercitation ullamco laboris.</p>
                 </div>
             </div>
         </section>
@@ -164,6 +176,11 @@
                 <button class="dim-btn" data-traj-scenario="globem">GLOBEM</button>
             </div>
             <div class="trajectory-container">
                 <div id="chat-window" class="chat-window">
                     <!-- Messages will be injected here via JS -->
@@ -177,9 +194,6 @@
                     <span>Scroll to see more</span>
                 </div>
             </div>
-            <p class="trajectory-description">Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod
-                tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
-                ullamco laboris nisi ut aliquip ex ea commodo consequat.</p>
         </section>
         <!-- 2. Experiment Results Section -->
@@ -217,9 +231,11 @@
                     <!-- 2. Qwen Family -->
                     <div class="carousel-card">
                         <img src="assets/qwenfamily.png" alt="Qwen Family Performance">
-                        <h4>Qwen Family Analysis</h4>
-                        <p class="card-caption">Performance scaling and behavioral differences within the Qwen model
-                            series (Qwen3-Next-80B vs 30B).</p>
                     </div>
                     <!-- 3. Reasoning -->
@@ -253,7 +269,9 @@
                     <div class="carousel-card">
                         <img src="assets/hallucination.png" alt="Hallucination Analysis">
                         <h4>Hallucination Analysis</h4>
-                        <p class="card-caption">Hallucination rates is low.</p>
                     </div>
                     <!-- 6.5 Hallucination-Accuracy Correlation -->
@@ -271,7 +289,7 @@
                         <h4>Trustworthiness</h4>
                         <p class="card-caption">Verification of the LLM-as-a-Checker pipeline demonstrating high
                             alignment
-                            with human expert judgments.</p>
                     </div>
                 </div>
@@ -321,9 +339,9 @@
                     <div id="scaling-globem" class="chart-container"></div>
                 </div>
             </div>
-            <p class="section-description">Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod
-                tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
-                ullamco laboris nisi ut aliquip ex ea commodo consequat.</p>
         </section>
         <!-- 2. Ranking Comparison Section -->
@@ -370,9 +388,9 @@
                 </div>
             </div>
-            <p class="section-description">Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod
-                tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
-                ullamco laboris nisi ut aliquip ex ea commodo consequat.</p>
         </section>
         <!-- 3. Turn Distribution Section -->
@@ -402,9 +420,9 @@
                     <div id="turn-globem" class="chart-container-tall"></div>
                 </div>
             </div>
-            <p class="section-description">Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod
-                tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
-                ullamco laboris nisi ut aliquip ex ea commodo consequat.</p>
         </section>
         <!-- 4. Entropy Analysis Section -->
@@ -419,7 +437,7 @@
                         <circle cx="7.5" cy="16.5" r="1.5" />
                         <circle cx="17.5" cy="14.5" r="1.5" />
                     </svg>
-                    Entropy Analysis
                 </h2>
                 <p>Scatter plot showing Access Entropy vs Coverage by model. Opacity represents accuracy. Higher entropy
                     = more uniform access; Higher coverage = more fields explored.</p>
@@ -454,9 +472,9 @@
                     <div id="entropy-model-5" class="chart-container-tall"></div>
                 </div>
             </div>
-            <p class="section-description">Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod
-                tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
-                ullamco laboris nisi ut aliquip ex ea commodo consequat.</p>
         </section>
         <!-- 5. Error Analysis Section -->
@@ -478,9 +496,14 @@
                     <div id="error-chart" class="chart-container-double"></div>
                 </div>
             </div>
-            <p class="section-description">Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod
-                tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
-                ullamco laboris nisi ut aliquip ex ea commodo consequat.</p>
         </section>
         <!-- 6. Probing Results Section -->
@@ -492,9 +515,9 @@
                         <circle cx="11" cy="11" r="8" />
                         <path d="m21 21-4.3-4.3" />
                     </svg>
-                    Probing Results
                 </h2>
-                <p>Analyze the average log probability of FINISH messages across conversation turns and progress.</p>
             </div>
             <div id="probing-legend" class="shared-legend"></div>
             <div class="charts-grid three-col">
@@ -511,9 +534,12 @@
                     <div id="probing-10k" class="chart-container-tall"></div>
                 </div>
             </div>
-            <p class="section-description">Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod
-                tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation
-                ullamco laboris nisi ut aliquip ex ea commodo consequat.</p>
         </section>
     </main>

                 We distinguish <em>investigatory intelligence</em> (autonomously setting goals and exploring) from
                 <em>executional intelligence</em> (completing assigned tasks), arguing that true agency requires the
                 former.
                 To evaluate this, we introduce <strong>Deep Data Research (DDR)</strong>, an open-ended task where LLMs
                 autonomously extract insights from databases, and <strong>DDR-Bench</strong>, a large-scale,
                 checklist-based benchmark enabling verifiable evaluation.
                 Results show that while frontier models display emerging agency, long-horizon exploration remains
                 challenging, with effective investigatory intelligence depending on intrinsic agentic strategies beyond
                 mere scaffolding or scaling.
             </p>
             <div class="meta-info">
                 <div class="meta-row authors">
+                    <span class="meta-item">
+                        <a href="https://thinkwee.top/about" target="_blank" rel="noopener noreferrer">Wei Liu</a>,
+                        <a href="https://github.com/yupeijei1997" target="_blank" rel="noopener noreferrer">Peijie
+                            Yu</a>,
+                        <a href="https://www.kcl.ac.uk/people/michele-orini" target="_blank"
+                            rel="noopener noreferrer">Michele Orini</a>,
+                        <a href="https://yalidu.github.io/" target="_blank" rel="noopener noreferrer">Yali Du</a>,
+                        <a href="https://sites.google.com/view/yulanhe/home" target="_blank"
+                            rel="noopener noreferrer">Yulan He</a>
+                    </span>
                 </div>
                 <div class="meta-row affiliations">
                     <a href="https://kclnlp.github.io/" target="_blank" rel="noopener noreferrer">
                     </a>
                 </div>
                 <div class="meta-row links">
+                    <a href="https://huggingface.co/collections/thinkwee/ddrbench" class="platform-btn huggingface-btn">
                         <img src="assets/hf-logo-pirate.svg" alt="HuggingFace" width="30" height="30"
                             class="platform-icon">
                         Data
                     </a>
+                    <a href="https://github.com/thinkwee/DDR_Bench" class="platform-btn github-btn">
                         <svg viewBox="0 0 24 24" width="30" height="30" fill="currentColor">
                             <path
                                 d="M12 2C6.477 2 2 6.477 2 12c0 4.42 2.865 8.17 6.839 9.49.5.092.682-.217.682-.482 0-.237-.008-.866-.013-1.7-2.782.603-3.369-1.34-3.369-1.34-.454-1.156-1.11-1.463-1.11-1.463-.908-.62.069-.608.069-.608 1.003.07 1.531 1.03 1.531 1.03.892 1.529 2.341 1.087 2.91.831.092-.646.35-1.086.636-1.336-2.22-.253-4.555-1.11-4.555-4.943 0-1.091.39-1.984 1.029-2.683-.103-.253-.446-1.27.098-2.647 0 0 .84-.269 2.75 1.025A9.578 9.578 0 0112 6.836c.85.004 1.705.114 2.504.336 1.909-1.294 2.747-1.025 2.747-1.025.546 1.377.203 2.394.1 2.647.64.699 1.028 1.592 1.028 2.683 0 3.842-2.339 4.687-4.566 4.935.359.309.678.919.678 1.852 0 1.336-.012 2.415-.012 2.743 0 .267.18.578.688.48C19.138 20.167 22 16.418 22 12c0-5.523-4.477-10-10-10z" />
                     </svg>
                     Framework Overview
                 </h2>
+                <p>Overview of DDR-Bench.</p>
             </div>
             <div class="framework-grid">
                 <div class="framework-card">
                     <img src="assets/framework_task.png" alt="Task Formulation Framework"
                         style="border-radius: var(--radius-md);">
                     <h3>Task Formulation</h3>
+                    <p class="framework-description">A case of Claude Sonnet 4.5's trajectory and evaluation checklist
+                        in the MIMIC scenario of DDR-Bench. Verified facts and supporting insights are
+                        <u>underlined</u>. The agent is asked to perform multiple ReAct turns to explore the database
+                        without predefined targets or queries and to autonomously mine insights from the exploration.
+                    </p>
                 </div>
                 <div class="framework-card">
                     <img src="assets/framework_pipeline.png" alt="Evaluation Pipeline Framework"
                         style="border-radius: var(--radius-md);">
                     <h3>Evaluation Pipeline</h3>
+                    <p class="framework-description"><b>Left</b>: Compared with previous tasks, <i>DDR</i> maximises
+                        exploration openness and agency, focusing on the direct evaluation of insight quality.
+                        <b>Right</b>: Overview of DDR-Bench. The checklist derived from the freeform parts of the
+                        database is used to evaluate the agent-generated insights from the exploration on the structured
+                        parts of the database.
+                    </p>
                 </div>
             </div>
         </section>
                 <button class="dim-btn" data-traj-scenario="globem">GLOBEM</button>
             </div>
+            <p id="trajectory-scenario-description" class="trajectory-description">
+                Exploring clinical patterns and patient outcomes in a large-scale electronic health record (EHR)
+                database.
+            </p>
             <div class="trajectory-container">
                 <div id="chat-window" class="chat-window">
                     <!-- Messages will be injected here via JS -->
                     <span>Scroll to see more</span>
                 </div>
             </div>
         </section>
         <!-- 2. Experiment Results Section -->
                     <!-- 2. Qwen Family -->
                     <div class="carousel-card">
                         <img src="assets/qwenfamily.png" alt="Qwen Family Performance">
+                        <h4>Training-time Factors Analysis</h4>
+                        <p class="card-caption">Training-time factors study within the Qwen family. From left to right,
+                            the three columns examine inference-time scaling performance across all scenarios for models
+                            with different parameter scales, context optimisation methods, and model generations with
+                            different training strategies.</p>
                     </div>
                     <!-- 3. Reasoning -->
                     <div class="carousel-card">
                         <img src="assets/hallucination.png" alt="Hallucination Analysis">
                         <h4>Hallucination Analysis</h4>
+                        <p class="card-caption">Hallucination rates (%) across models in DDR-Bench, measured as the
+                            proportion of insights containing factual but unfaithful information that is not derivable
+                            from the provided inputs. The rates are low.</p>
                     </div>
                     <!-- 6.5 Hallucination-Accuracy Correlation -->
                         <h4>Trustworthiness</h4>
                         <p class="card-caption">Verification of the LLM-as-a-Checker pipeline demonstrating high
                             alignment
+                            with human expert judgments and stability across multiple runs.</p>
                     </div>
                 </div>
                     <div id="scaling-globem" class="chart-container"></div>
                 </div>
             </div>
+            <p class="section-description">LLMs extract more accurate insights by delaying commitment, and they
+                concentrate reasoning into a small number of highly valuable late-stage interactions. These targeted
+                interactions are built upon longer early exploration.</p>
         </section>
         <!-- 2. Ranking Comparison Section -->
                 </div>
             </div>
+            <p class="section-description">The ranking induced by novel insight usefulness closely aligns with the
+                ranking based on checklist accuracy. Differences between the two rankings are small, especially among
+                the top-performing models.</p>
         </section>
         <!-- 3. Turn Distribution Section -->
                     <div id="turn-globem" class="chart-container-tall"></div>
                 </div>
             </div>
+            <p class="section-description">Stronger models tend to explore for more rounds without external prompting.
+                Knowledge-intensive databases such as 10-K and MIMIC induce more interaction rounds than signal-based
+                datasets such as GLOBEM, and the resulting distributions are also more uniform.</p>
         </section>
         <!-- 4. Entropy Analysis Section -->
                         <circle cx="7.5" cy="16.5" r="1.5" />
                         <circle cx="17.5" cy="14.5" r="1.5" />
                     </svg>
+                    Exploration Pattern Analysis
                 </h2>
                 <p>Scatter plot showing Access Entropy vs Coverage by model. Opacity represents accuracy. Higher entropy
                     = more uniform access; Higher coverage = more fields explored.</p>
                     <div id="entropy-model-5" class="chart-container-tall"></div>
                 </div>
             </div>
+            <p class="section-description">Advanced LLMs tend to operate in a balanced exploration regime that combines
+                adequate coverage with focused access. Such a regime is consistently observed across different
+                scenarios.</p>
         </section>
         <!-- 5. Error Analysis Section -->
                     <div id="error-chart" class="chart-container-double"></div>
                 </div>
             </div>
+            <p class="section-description">Our findings revealed that 58% of errors stemmed from insufficient
+                exploration, both in terms of breadth and depth. This imbalance in exploration often leads to suboptimal
+                results, regardless of the model’s overall capability.
+                Additionally, around 40% of the errors were attributed to other factors. For more powerful models,
+                over-reasoning was common, where the model made assumptions not fully supported by the data. In other
+                cases, models misinterpreted the insights, such as mistaking a downward trend for an upward one. Less
+                capable models, on the other hand, tended to make more fundamental errors, such as repeatedly debugging
+                or struggling with missing data, which could disrupt the overall coherence of the analysis.</p>
         </section>
         <!-- 6. Probing Results Section -->
                         <circle cx="11" cy="11" r="8" />
                         <path d="m21 21-4.3-4.3" />
                     </svg>
+                    Self-Termination
                 </h2>
+                <p>Analyze the willingness of models to terminate their own analysis.</p>
             </div>
             <div id="probing-legend" class="shared-legend"></div>
             <div class="charts-grid three-col">
                     <div id="probing-10k" class="chart-container-tall"></div>
                 </div>
             </div>
+            <p class="section-description"> Clear differences emerge across model generations. Qwen3 and Qwen3-Next
+                exhibit a consistently increasing probability, indicating growing confidence that a complete report can
+                be produced as more information is accumulated, whereas the Qwen2.5 series shows pronounced fluctuations
+                and remains uncertain about whether exploration can be terminated at the current step. Moreover,
+                Qwen3-Next maintains higher confidence with lower variance throughout, suggesting that it has more
+                confidence that exploration is progressing towards a more comprehensive and deeper report.</p>
         </section>
     </main>

styles.css CHANGED Viewed

@@ -150,7 +150,7 @@ img {
     font-size: 21px;
     line-height: 1.7;
     color: var(--color-text-muted);
-    max-width: 900px;
     margin: 0 auto var(--space-lg);
     text-align: left;
 }
@@ -179,6 +179,26 @@ img {
     font-size: 22px;
     font-weight: 500;
     color: var(--color-text);
 }
 .meta-row.affiliations {
@@ -552,7 +572,8 @@ img {
     line-height: 1.6;
     color: var(--color-text-muted);
     text-align: center;
-    margin-top: var(--space-md);
     max-width: 1400px;
     margin-left: auto;
     margin-right: auto;
@@ -849,9 +870,8 @@ footer a:hover {
     margin: 0 auto;
     background: var(--color-surface);
     border-radius: var(--radius-lg);
-    box-shadow: var(--shadow-card);
-    border: 1px solid rgba(0, 0, 0, 0.05);
-    /* Subtle border */
     overflow: hidden;
     /* Ensure rounded corners */
     position: relative;
@@ -907,8 +927,9 @@ footer a:hover {
     overflow-y: auto;
     /* Enable vertical scroll */
     padding: var(--space-md);
-    background: #f5f5f7;
-    /* Chat background */
     font-size: 16px;
 }

     font-size: 21px;
     line-height: 1.7;
     color: var(--color-text-muted);
+    max-width: 1200px;
     margin: 0 auto var(--space-lg);
     text-align: left;
 }
     font-size: 22px;
     font-weight: 500;
     color: var(--color-text);
+    margin-bottom: var(--space-xs);
+}
+/* Author name links inside the authors meta row. */
+.meta-row.authors a {
+    color: var(--color-text);
+    text-decoration: none;
+    position: relative;
+    padding: 2px 6px;
+    margin: 0 -2px;
+    border-radius: 6px;
+    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
+    /* Very faint underline in the initial state, as a subtle visual hint */
+    border-bottom: 1.5px solid rgba(0, 113, 227, 0.1);
+}
+/* Highlight an author link on pointer hover and on keyboard focus, so the
+   visual feedback is not limited to mouse users. */
+.meta-row.authors a:hover,
+.meta-row.authors a:focus-visible {
+    color: var(--color-primary);
+    background-color: rgba(0, 113, 227, 0.05);
+    border-bottom-color: var(--color-primary);
+    transform: translateY(-1px);
+}
 .meta-row.affiliations {
     line-height: 1.6;
     color: var(--color-text-muted);
     text-align: center;
+    margin-top: 0;
+    margin-bottom: var(--space-md);
     max-width: 1400px;
     margin-left: auto;
     margin-right: auto;
     margin: 0 auto;
     background: var(--color-surface);
     border-radius: var(--radius-lg);
+    box-shadow: 0 4px 20px rgba(0, 0, 0, 0.04);
+    border: 1px solid rgba(0, 0, 0, 0.08);
     overflow: hidden;
     /* Ensure rounded corners */
     position: relative;
     overflow-y: auto;
     /* Enable vertical scroll */
     padding: var(--space-md);
+    background: #f4f9f4;
+    box-shadow: inset 0 1px 4px rgba(0, 0, 0, 0.02);
+    /* Soft light mint green background */
     font-size: 16px;
 }

trajectory.js CHANGED Viewed

@@ -3,6 +3,12 @@
 // AGENT TRAJECTORY - Chat Interface
 // ============================================================================
 let currentTrajScenario = 'mimic';
 function initTrajectory() {
@@ -41,6 +47,13 @@ function initTrajectory() {
             document.querySelectorAll('[data-traj-scenario]').forEach(b => b.classList.remove('active'));
             btn.classList.add('active');
             currentTrajScenario = btn.dataset.trajScenario;
             renderTrajectory(currentTrajScenario);
         });
     });

 // AGENT TRAJECTORY - Chat Interface
 // ============================================================================
+// One-line description for each trajectory scenario, keyed by the scenario
+// button's data-traj-scenario value. The click handler writes the matching
+// text into the #trajectory-scenario-description element.
+const SCENARIO_DESCRIPTIONS = {
+    'mimic': 'Exploring clinical patterns and patient outcomes in a large-scale electronic health record (EHR) database.',
+    '10k': 'Extracting deep insights from SEC 10-K annual reports for longitudinal financial performance analysis.',
+    'globem': 'Analyzing multi-modal longitudinal behavioral and sensor data for detecting mental health trends.'
+};
 let currentTrajScenario = 'mimic';
 function initTrajectory() {
             document.querySelectorAll('[data-traj-scenario]').forEach(b => b.classList.remove('active'));
             btn.classList.add('active');
             currentTrajScenario = btn.dataset.trajScenario;
+            // Update description
+            const descEl = document.getElementById('trajectory-scenario-description');
+            if (descEl && SCENARIO_DESCRIPTIONS[currentTrajScenario]) {
+                descEl.textContent = SCENARIO_DESCRIPTIONS[currentTrajScenario];
+            }
             renderTrajectory(currentTrajScenario);
         });
     });