Spaces:

thinkwee
/

DDR_Bench

Running

App Files Files Community

thinkwee committed on Jan 8

Commit

41d056a

1 Parent(s): 5026fae

fix display

Browse files

Files changed (4) hide show

charts.js +239 -81
data.js +59 -1
index.html +24 -4
styles.css +16 -0

charts.js CHANGED Viewed

@@ -377,9 +377,11 @@ document.querySelectorAll('.dim-btn:not(.probing-dim)').forEach(btn => {
 });
 // ============================================================================
-// RANKING COMPARISON - 3 Charts
 // ============================================================================
-function initRankingCharts() {
     const scenarios = [
         { key: 'MIMIC', id: 'mimic' },
         { key: '10K', id: '10k' },
@@ -390,106 +392,162 @@ function initRankingCharts() {
         const data = DDR_DATA.ranking[key];
         if (!data) return;
-        const models = data.slice(0, 15); // Top 15 models
         const traces = [];
-        // Connection lines
-        models.forEach((m, i) => {
             traces.push({
-                x: [m.bt_rank, m.acc_rank],
-                y: [i, i],
-                mode: 'lines',
-                line: {
-                    color: 'rgba(148, 163, 184, 0.25)',
-                    width: 1,
-                    dash: 'dash'
                 },
-                showlegend: false,
-                hoverinfo: 'skip'
             });
-        });
-        // Novelty rank points
-        traces.push({
-            x: models.map(m => m.bt_rank),
-            y: models.map((_, i) => i),
-            mode: 'markers',
-            name: 'Novelty',
-            marker: {
-                size: 10,
-                symbol: 'circle',
-                color: models.map(m => m.is_proprietary ? '#8B5CF6' : '#22C55E'),
-                line: { color: '#000', width: 0.5 }
-            },
-            text: models.map(m => `${m.model}<br>Novelty: #${m.bt_rank}<br>Win: ${m.win_rate}%`),
-            hovertemplate: '%{text}<extra></extra>'
-        });
-        // Accuracy rank points
-        traces.push({
-            x: models.map(m => m.acc_rank),
-            y: models.map((_, i) => i),
-            mode: 'markers',
-            name: 'Accuracy',
-            marker: {
-                size: 12,
-                symbol: 'diamond-open',
-                color: models.map(m => m.is_proprietary ? '#8B5CF6' : '#22C55E'),
-                line: { width: 2 }
-            },
-            text: models.map(m => `${m.model}<br>Accuracy: #${m.acc_rank}<br>${m.accuracy}%`),
-            hovertemplate: '%{text}<extra></extra>'
-        });
         const layout = {
             ...darkLayout,
             xaxis: {
                 ...darkLayout.xaxis,
                 title: { text: 'Rank', font: { size: 11, color: '#e2e8f0' } },
-                range: [Math.max(...models.map(m => Math.max(m.bt_rank, m.acc_rank))) + 1, 0],
                 dtick: 2
             },
             yaxis: {
                 ...darkLayout.yaxis,
                 tickmode: 'array',
                 tickvals: models.map((_, i) => i),
-                ticktext: models.map(m => m.model.substring(0, 15)),
                 automargin: true
             },
             showlegend: true,
             legend: {
                 ...darkLayout.legend,
-                y: -0.12
             },
-            margin: { ...darkLayout.margin, l: 120 }
         };
-        Plotly.newPlot(`ranking-${id}`, traces, layout, plotlyConfig);
     });
 }
 // ============================================================================
-// TURN DISTRIBUTION - 3 Charts (Box plots)
 // ============================================================================
 function initTurnCharts() {
     const scenarios = ['mimic', '10k', 'globem'];
     // Family colors
     const familyColors = {
-        'Claude': '#FF6D00',
-        'GPT': '#00C853',
-        'Gemini': '#2196F3',
-        'DeepSeek': '#E91E63',
-        'GLM': '#9C27B0',
-        'Kimi': '#FFA500',
-        'MiniMax': '#20B2AA',
-        'Qwen': '#0EA5E9',
-        'Llama': '#F59E0B'
     };
     function getModelColor(modelName) {
         for (const [family, color] of Object.entries(familyColors)) {
-            if (modelName.includes(family)) return color;
         }
         return '#888';
     }
@@ -498,43 +556,70 @@ function initTurnCharts() {
         const data = DDR_DATA.turn[scenario];
         if (!data) return;
-        const sortedData = [...data].sort((a, b) => a.median - b.median);
-        const traces = sortedData.map((model, i) => {
             const color = getModelColor(model.model);
-            return {
-                y: [model.model],
-                x: [model.median],
-                type: 'bar',
-                orientation: 'h',
                 name: model.model,
-                marker: {
-                    color: color,
-                    opacity: 0.8
-                },
-                text: [`${model.median}`],
-                textposition: 'outside',
-                textfont: { size: 9, color: '#94a3b8' },
-                hovertemplate: `<b>${model.model}</b><br>Median: ${model.median} turns<extra></extra>`,
                 showlegend: false
-            };
         });
         const layout = {
             ...darkLayout,
-            barmode: 'group',
             xaxis: {
                 ...darkLayout.xaxis,
                 title: { text: 'Number of Turns', font: { size: 11, color: '#e2e8f0' } },
-                range: [0, Math.max(...sortedData.map(d => d.median)) * 1.15]
             },
             yaxis: {
                 ...darkLayout.yaxis,
                 automargin: true,
-                tickfont: { size: 9 }
             },
-            margin: { ...darkLayout.margin, l: 130 }
         };
         Plotly.newPlot(`turn-${scenario}`, traces, layout, plotlyConfig);
@@ -638,6 +723,75 @@ document.querySelectorAll('.probing-dim').forEach(btn => {
     });
 });
 // ============================================================================
 // INITIALIZE ALL CHARTS
 // ============================================================================
@@ -645,6 +799,7 @@ document.addEventListener('DOMContentLoaded', () => {
     initScalingCharts();
     initRankingCharts();
     initTurnCharts();
     initProbingCharts();
 });
@@ -659,5 +814,8 @@ window.addEventListener('resize', () => {
             Plotly.Plots.resize(`turn-${s}`);
             Plotly.Plots.resize(`probing-${s}`);
         });
     }, 100);
 });

 });
 // ============================================================================
+// RANKING COMPARISON - 3 Charts with animated mode switching
 // ============================================================================
+let currentRankingMode = 'comparison';
+function renderRankingCharts(mode) {
     const scenarios = [
         { key: 'MIMIC', id: 'mimic' },
         { key: '10K', id: '10k' },
         const data = DDR_DATA.ranking[key];
         if (!data) return;
+        const models = data.slice(0, 12); // Top 12 models for better fit
         const traces = [];
+        // Get x-axis values based on mode
+        const getXValue = (m) => {
+            switch (mode) {
+                case 'novelty': return m.bt_rank;
+                case 'accuracy': return m.acc_rank;
+                default: return m.bt_rank; // For comparison, use bt_rank as base
+            }
+        };
+        if (mode === 'comparison') {
+            // Connection lines
+            models.forEach((m, i) => {
+                traces.push({
+                    x: [m.bt_rank, m.acc_rank],
+                    y: [i, i],
+                    mode: 'lines',
+                    line: {
+                        color: 'rgba(148, 163, 184, 0.3)',
+                        width: 1.5,
+                        dash: 'dot'
+                    },
+                    showlegend: false,
+                    hoverinfo: 'skip'
+                });
+            });
+            // Novelty rank points
             traces.push({
+                x: models.map(m => m.bt_rank),
+                y: models.map((_, i) => i),
+                mode: 'markers',
+                name: 'Novelty Rank',
+                marker: {
+                    size: 10,
+                    symbol: 'circle',
+                    color: '#8B5CF6',
+                    line: { color: '#fff', width: 1 }
                 },
+                text: models.map(m => `${m.model}<br>Novelty: #${m.bt_rank}<br>Win Rate: ${m.win_rate}%`),
+                hovertemplate: '%{text}<extra></extra>'
             });
+            // Accuracy rank points
+            traces.push({
+                x: models.map(m => m.acc_rank),
+                y: models.map((_, i) => i),
+                mode: 'markers',
+                name: 'Accuracy Rank',
+                marker: {
+                    size: 10,
+                    symbol: 'diamond',
+                    color: '#22C55E',
+                    line: { color: '#fff', width: 1 }
+                },
+                text: models.map(m => `${m.model}<br>Accuracy: #${m.acc_rank}<br>${m.accuracy}%`),
+                hovertemplate: '%{text}<extra></extra>'
+            });
+        } else {
+            // Single mode - just points
+            const xVals = models.map(m => mode === 'novelty' ? m.bt_rank : m.acc_rank);
+            const color = mode === 'novelty' ? '#8B5CF6' : '#22C55E';
+            const label = mode === 'novelty' ? 'Novelty' : 'Accuracy';
+            traces.push({
+                x: xVals,
+                y: models.map((_, i) => i),
+                mode: 'markers',
+                name: label,
+                marker: {
+                    size: 12,
+                    symbol: 'circle',
+                    color: color,
+                    line: { color: '#fff', width: 1 }
+                },
+                text: models.map(m => {
+                    if (mode === 'novelty') {
+                        return `${m.model}<br>Novelty: #${m.bt_rank}<br>Win Rate: ${m.win_rate}%`;
+                    } else {
+                        return `${m.model}<br>Accuracy: #${m.acc_rank}<br>${m.accuracy}%`;
+                    }
+                }),
+                hovertemplate: '%{text}<extra></extra>'
+            });
+        }
+        const maxRank = Math.max(...models.map(m => Math.max(m.bt_rank, m.acc_rank)));
         const layout = {
             ...darkLayout,
             xaxis: {
                 ...darkLayout.xaxis,
                 title: { text: 'Rank', font: { size: 11, color: '#e2e8f0' } },
+                range: [maxRank + 1, 0],
                 dtick: 2
             },
             yaxis: {
                 ...darkLayout.yaxis,
                 tickmode: 'array',
                 tickvals: models.map((_, i) => i),
+                ticktext: models.map(m => m.model.length > 18 ? m.model.substring(0, 16) + '...' : m.model),
                 automargin: true
             },
             showlegend: true,
             legend: {
                 ...darkLayout.legend,
+                y: -0.15
             },
+            margin: { ...darkLayout.margin, l: 130, b: 70 }
         };
+        Plotly.react(`ranking-${id}`, traces, layout, plotlyConfig);
     });
 }
+function initRankingCharts() { // Initial draw of the ranking charts; invoked from the DOMContentLoaded initializer alongside the other init* functions.
+    renderRankingCharts('comparison'); // 'comparison' matches the initial value of currentRankingMode and the toggle button marked active in index.html
+}
+// Ranking mode toggle: clicking a .ranking-dim button marks it active and redraws the ranking charts in that button's mode. NOTE(review): registered at script load, so this assumes the buttons already exist in the DOM at that point - confirm script placement.
+document.querySelectorAll('.ranking-dim').forEach(btn => {
+    btn.addEventListener('click', () => {
+        document.querySelectorAll('.ranking-dim').forEach(b => b.classList.remove('active')); // clear the previous selection on every toggle button
+        btn.classList.add('active');
+        const mode = btn.dataset.mode; // data-mode attribute; index.html defines 'comparison', 'novelty' and 'accuracy'
+        currentRankingMode = mode; // stored in the module-level variable; not read anywhere in the visible part of this diff
+        renderRankingCharts(mode);
+    });
+});
 // ============================================================================
+// TURN DISTRIBUTION - 3 Charts (Ridgeline style)
 // ============================================================================
 function initTurnCharts() {
     const scenarios = ['mimic', '10k', 'globem'];
     // Family colors
     const familyColors = {
+        'claude': '#FF6D00',
+        'gpt': '#00C853',
+        'gemini': '#2196F3',
+        'deepseek': '#E91E63',
+        'glm': '#9C27B0',
+        'kimi': '#FFA500',
+        'minimax': '#20B2AA',
+        'qwen': '#0EA5E9',
+        'llama': '#F59E0B'
     };
     function getModelColor(modelName) {
+        const lower = modelName.toLowerCase();
         for (const [family, color] of Object.entries(familyColors)) {
+            if (lower.includes(family)) return color;
         }
         return '#888';
     }
         const data = DDR_DATA.turn[scenario];
         if (!data) return;
+        // Sort by median descending (highest median at top)
+        const sortedData = [...data].sort((a, b) => b.median - a.median);
+        // Limit to top 15 models for readability
+        const displayData = sortedData.slice(0, 15);
+        const traces = [];
+        const binLabels = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100'];
+        const binCenters = [5, 15, 25, 35, 45, 55, 65, 75, 85, 95];
+        // Create ridgeline traces (area charts stacked vertically)
+        displayData.forEach((model, idx) => {
             const color = getModelColor(model.model);
+            const yOffset = idx;
+            // Scale distribution to fit in the row (max height ~0.8)
+            const maxDist = Math.max(...model.distribution) || 1;
+            const scaledDist = model.distribution.map(d => d / maxDist * 0.7);
+            // Create filled area trace
+            traces.push({
+                x: binCenters,
+                y: scaledDist.map(d => yOffset + d),
+                mode: 'lines',
+                fill: 'toself',
+                fillcolor: color + '40', // 25% opacity
+                line: { color: color, width: 1.5 },
                 name: model.model,
+                text: model.distribution.map((d, i) =>
+                    `${model.model}<br>${binLabels[i]} turns: ${d.toFixed(1)}%<br>Median: ${model.median}`
+                ),
+                hovertemplate: '%{text}<extra></extra>',
+                showlegend: false
+            });
+            // Add baseline
+            traces.push({
+                x: [0, 100],
+                y: [yOffset, yOffset],
+                mode: 'lines',
+                line: { color: 'rgba(148, 163, 184, 0.2)', width: 0.5 },
+                hoverinfo: 'skip',
                 showlegend: false
+            });
         });
         const layout = {
             ...darkLayout,
             xaxis: {
                 ...darkLayout.xaxis,
                 title: { text: 'Number of Turns', font: { size: 11, color: '#e2e8f0' } },
+                range: [0, 100],
+                dtick: 20
             },
             yaxis: {
                 ...darkLayout.yaxis,
+                tickmode: 'array',
+                tickvals: displayData.map((_, i) => i),
+                ticktext: displayData.map(m => m.model.length > 20 ? m.model.substring(0, 18) + '...' : m.model),
                 automargin: true,
+                range: [-0.5, displayData.length]
             },
+            margin: { ...darkLayout.margin, l: 140 },
+            showlegend: false
         };
         Plotly.newPlot(`turn-${scenario}`, traces, layout, plotlyConfig);
     });
 });
+// ============================================================================
+// ERROR ANALYSIS - Hierarchical Bar Chart
+// ============================================================================
+function initErrorChart() { // Draws the error-type bar chart into #error-chart from DDR_DATA.error, with one bold label per main category above the bars.
+    const data = DDR_DATA.error;
+    if (!data || data.length === 0) return; // nothing to draw when the dataset is missing or empty
+    // Group rows by main category, recording the first (start) and last (end) row index seen, to place one label per category
+    const categoryGroups = {};
+    data.forEach((item, idx) => {
+        if (!categoryGroups[item.main_category]) {
+            categoryGroups[item.main_category] = { start: idx, end: idx, items: [] };
+        }
+        categoryGroups[item.main_category].end = idx; // NOTE(review): the midpoint computed below only centres the label if a category's rows are contiguous, as they are in data.js
+        categoryGroups[item.main_category].items.push(item); // items is collected but not read in this function
+    });
+    const traces = [{
+        x: data.map(d => d.subcategory),
+        y: data.map(d => d.percentage),
+        type: 'bar',
+        marker: {
+            color: data.map(d => d.color), // per-bar colour taken from each data row
+            line: { color: '#fff', width: 0.5 }
+        },
+        text: data.map(d => `${d.percentage}%`),
+        textposition: 'outside',
+        textfont: { size: 11, color: '#e2e8f0' },
+        hovertemplate: '<b>%{x}</b><br>%{y:.1f}%<br>Count: %{customdata}<extra></extra>',
+        customdata: data.map(d => d.count), // feeds %{customdata} in the hover template above
+        showlegend: false
+    }];
+    const maxPct = Math.max(...data.map(d => d.percentage)); // tallest bar; used to position the labels and size the y-range
+    // Create annotations for main category labels
+    const annotations = [];
+    Object.entries(categoryGroups).forEach(([catName, group]) => {
+        const midIdx = (group.start + group.end) / 2; // midpoint between the category's first and last row index
+        annotations.push({
+            x: midIdx, // presumably Plotly reads a number on this category axis as a position by category index - TODO confirm
+            y: maxPct * 1.15, // above the tallest bar, inside the 1.25 * maxPct y-range set below
+            text: `<b>${catName}</b>`,
+            showarrow: false,
+            font: { size: 10, color: '#e2e8f0' },
+            xanchor: 'center',
+            yanchor: 'bottom'
+        });
+    });
+    const layout = {
+        ...darkLayout,
+        xaxis: {
+            ...darkLayout.xaxis,
+            tickangle: -30,
+            tickfont: { size: 10, color: '#94a3b8' }
+        },
+        yaxis: {
+            ...darkLayout.yaxis,
+            title: { text: 'Percentage (%)', font: { size: 11, color: '#e2e8f0' } },
+            range: [0, maxPct * 1.25] // headroom for the outside bar text and the category labels
+        },
+        annotations: annotations,
+        margin: { t: 50, r: 20, b: 100, l: 50 } // replaces darkLayout.margin entirely instead of extending it as the other charts do
+    };
+    Plotly.newPlot('error-chart', traces, layout, plotlyConfig);
+}
 // ============================================================================
 // INITIALIZE ALL CHARTS
 // ============================================================================
     initScalingCharts();
     initRankingCharts();
     initTurnCharts();
+    initErrorChart();
     initProbingCharts();
 });
             Plotly.Plots.resize(`turn-${s}`);
             Plotly.Plots.resize(`probing-${s}`);
         });
+        if (document.getElementById('error-chart')) {
+            Plotly.Plots.resize('error-chart');
+        }
     }, 100);
 });

data.js CHANGED Viewed

@@ -3756,5 +3756,63 @@ const DDR_DATA = {
             "Qwen3-4B": "#57E389",
             "Qwen3-30B-A3B": "#26A269",
             "Qwen3-Next-80B-A3B": "#9141AC"
-    }
 };

             "Qwen3-4B": "#57E389",
             "Qwen3-30B-A3B": "#26A269",
             "Qwen3-Next-80B-A3B": "#9141AC"
+    },
+    error: [
+            {
+                    "main_category": "Fail in Exploration",
+                    "subcategory": "Insufficient Breadth",
+                    "count": 64,
+                    "percentage": 31.1,
+                    "color": "#1565C0"
+            },
+            {
+                    "main_category": "Fail in Exploration",
+                    "subcategory": "Insufficient Depth",
+                    "count": 56,
+                    "percentage": 27.2,
+                    "color": "#42A5F5"
+            },
+            {
+                    "main_category": "Poor Data-to-Insight",
+                    "subcategory": "Insight Misinterpretation",
+                    "count": 19,
+                    "percentage": 9.2,
+                    "color": "#2E7D32"
+            },
+            {
+                    "main_category": "Poor Data-to-Insight",
+                    "subcategory": "Superficial Analysis",
+                    "count": 16,
+                    "percentage": 7.8,
+                    "color": "#43A047"
+            },
+            {
+                    "main_category": "Poor Data-to-Insight",
+                    "subcategory": "Over Reasoning",
+                    "count": 15,
+                    "percentage": 7.3,
+                    "color": "#81C784"
+            },
+            {
+                    "main_category": "Lost in Context",
+                    "subcategory": "Lost in Debugging",
+                    "count": 18,
+                    "percentage": 8.7,
+                    "color": "#C62828"
+            },
+            {
+                    "main_category": "Lost in Context",
+                    "subcategory": "Fail in Summarization",
+                    "count": 10,
+                    "percentage": 4.9,
+                    "color": "#E53935"
+            },
+            {
+                    "main_category": "Lost in Context",
+                    "subcategory": "Poor Instruction Following",
+                    "count": 8,
+                    "percentage": 3.9,
+                    "color": "#EF9A9A"
+            }
+    ]
 };

index.html CHANGED Viewed

@@ -46,6 +46,7 @@
         <button class="nav-tab active" data-section="scaling">📈 Scaling Analysis</button>
         <button class="nav-tab" data-section="ranking">🏆 Ranking Comparison</button>
         <button class="nav-tab" data-section="turn">🔄 Turn Distribution</button>
         <button class="nav-tab" data-section="probing">🔍 Probing Results</button>
     </nav>
@@ -79,29 +80,35 @@
             </div>
         </section>
-        <!-- Ranking Comparison Section - 3 charts -->
         <section id="ranking" class="section">
             <div class="section-header">
                 <h2>Novelty vs Accuracy Ranking</h2>
                 <p>Compare model rankings based on Bradley-Terry pairwise ranking against traditional accuracy ranking.
                 </p>
             </div>
             <div class="charts-grid three-col">
                 <div class="chart-card">
                     <h3>MIMIC</h3>
-                    <div id="ranking-mimic" class="chart-container-tall"></div>
                 </div>
                 <div class="chart-card">
                     <h3>10-K</h3>
-                    <div id="ranking-10k" class="chart-container-tall"></div>
                 </div>
                 <div class="chart-card">
                     <h3>GLOBEM</h3>
-                    <div id="ranking-globem" class="chart-container-tall"></div>
                 </div>
             </div>
         </section>
         <!-- Turn Distribution Section - 3 charts -->
         <section id="turn" class="section">
             <div class="section-header">
@@ -124,6 +131,19 @@
             </div>
         </section>
         <!-- Probing Results Section -->
         <section id="probing" class="section">
             <div class="section-header">

         <button class="nav-tab active" data-section="scaling">📈 Scaling Analysis</button>
         <button class="nav-tab" data-section="ranking">🏆 Ranking Comparison</button>
         <button class="nav-tab" data-section="turn">🔄 Turn Distribution</button>
+        <button class="nav-tab" data-section="error">⚠️ Error Analysis</button>
         <button class="nav-tab" data-section="probing">🔍 Probing Results</button>
     </nav>
             </div>
         </section>
+        <!-- Ranking Comparison Section - 3 charts; the .ranking-dim buttons below switch all three between comparison, novelty and accuracy views (handler in charts.js) -->
         <section id="ranking" class="section">
             <div class="section-header">
                 <h2>Novelty vs Accuracy Ranking</h2>
                 <p>Compare model rankings based on Bradley-Terry pairwise ranking against traditional accuracy ranking.
                 </p>
             </div>
+            <div class="dimension-toggle">
+                <button class="dim-btn ranking-dim active" data-mode="comparison">🔀 Comparison View</button>
+                <button class="dim-btn ranking-dim" data-mode="novelty">🎯 Novelty Rank</button>
+                <button class="dim-btn ranking-dim" data-mode="accuracy">📊 Accuracy Rank</button>
+            </div>
             <div class="charts-grid three-col">
                 <div class="chart-card">
                     <h3>MIMIC</h3>
+                    <div id="ranking-mimic" class="chart-container-md"></div>
                 </div>
                 <div class="chart-card">
                     <h3>10-K</h3>
+                    <div id="ranking-10k" class="chart-container-md"></div>
                 </div>
                 <div class="chart-card">
                     <h3>GLOBEM</h3>
+                    <div id="ranking-globem" class="chart-container-md"></div>
                 </div>
             </div>
         </section>
         <!-- Turn Distribution Section - 3 charts -->
         <section id="turn" class="section">
             <div class="section-header">
             </div>
         </section>
+        <!-- Error Analysis Section: one wide card; #error-chart is the Plotly target drawn by initErrorChart() in charts.js -->
+        <section id="error" class="section">
+            <div class="section-header">
+                <h2>Error Type Analysis</h2>
+                <p>Breakdown of error types encountered during agent interactions, grouped by main categories.</p>
+            </div>
+            <div class="charts-grid single">
+                <div class="chart-card wide">
+                    <div id="error-chart" class="chart-container-md"></div>
+                </div>
+            </div>
+        </section>
         <!-- Probing Results Section -->
         <section id="probing" class="section">
             <div class="section-header">

styles.css CHANGED Viewed

@@ -294,11 +294,27 @@ body {
     min-height: 300px;
 }
 .chart-container-tall {
     height: 550px;
     min-height: 500px;
 }
 /* Footer */
 .footer {
     text-align: center;

     min-height: 300px;
 }
+.chart-container-md { /* Medium chart box: shorter than .chart-container-tall (550px); used by the ranking and error chart containers in index.html */
+    height: 450px;
+    min-height: 400px;
+}
 .chart-container-tall {
     height: 550px;
     min-height: 500px;
 }
+/* Single chart grid: one full-width column, capped at 1000px and centred horizontally; used by the Error Analysis section */
+.charts-grid.single {
+    grid-template-columns: 1fr;
+    max-width: 1000px;
+    margin: 0 auto;
+}
+.chart-card.wide { /* Padding for the wide card variant that wraps #error-chart in index.html */
+    padding: 1.5rem;
+}
 /* Footer */
 .footer {
     text-align: center;