Spaces:

thinkwee
/

DDR_Bench

Running

App Files Files Community

thinkwee committed on Jan 7

Commit

cf573f9

1 Parent(s): d139d2f

init

Browse files

Files changed (4) hide show

charts.js +613 -0
data.js +333 -0
index.html +162 -18
styles.css +337 -0

charts.js ADDED Viewed

	@@ -0,0 +1,613 @@

+// DDR-Bench Interactive Charts
+// Using Plotly.js for interactive visualizations
+// Common Plotly layout settings for dark theme.
+// The scaling, entropy, ranking and turn render functions spread this object
+// into their own layout and then override title/xaxis/yaxis/legend/margin
+// entries as needed.
+const darkLayout = {
+    // Alpha 0 makes both the paper and the plot background fully transparent.
+    paper_bgcolor: 'rgba(30, 41, 59, 0)',
+    plot_bgcolor: 'rgba(30, 41, 59, 0)',
+    font: {
+        family: 'Inter, sans-serif',
+        color: '#e2e8f0'
+    },
+    // xaxis and yaxis share the same grid, axis-line, tick and title colours.
+    xaxis: {
+        gridcolor: 'rgba(148, 163, 184, 0.15)',
+        linecolor: 'rgba(148, 163, 184, 0.3)',
+        tickfont: { color: '#94a3b8' },
+        title: { font: { color: '#e2e8f0' } }
+    },
+    yaxis: {
+        gridcolor: 'rgba(148, 163, 184, 0.15)',
+        linecolor: 'rgba(148, 163, 184, 0.3)',
+        tickfont: { color: '#94a3b8' },
+        title: { font: { color: '#e2e8f0' } }
+    },
+    // Semi-opaque legend box with a thin border.
+    legend: {
+        bgcolor: 'rgba(30, 41, 59, 0.8)',
+        bordercolor: 'rgba(148, 163, 184, 0.3)',
+        borderwidth: 1,
+        font: { color: '#e2e8f0' }
+    },
+    hoverlabel: {
+        bgcolor: '#1e293b',
+        bordercolor: '#6366f1',
+        font: { color: '#e2e8f0' }
+    },
+    // Default margins; the ranking and turn charts widen `l` for long labels.
+    margin: { t: 40, r: 20, b: 60, l: 70 }
+};
+// Options passed as the config argument of every Plotly.newPlot call in this file.
+const plotlyConfig = {
+    displayModeBar: true,
+    responsive: true,
+    // Drop the lasso and box-select buttons from the mode bar.
+    modeBarButtonsToRemove: ['lasso2d', 'select2d'],
+    displaylogo: false
+};
+// Tab Navigation
+document.querySelectorAll('.nav-tab').forEach(tab => {
+    tab.addEventListener('click', () => {
+        // Update active tab
+        document.querySelectorAll('.nav-tab').forEach(t => t.classList.remove('active'));
+        tab.classList.add('active');
+        // Show corresponding section
+        const sectionId = tab.dataset.section;
+        document.querySelectorAll('.section').forEach(s => s.classList.remove('active'));
+        document.getElementById(sectionId).classList.add('active');
+        // Resize plots on tab change
+        window.dispatchEvent(new Event('resize'));
+    });
+});
+// ============================================================================
+// SCALING ANALYSIS CHART
+// ============================================================================
+function renderScalingChart() {
+    const dataset = document.getElementById('scaling-dataset').value;
+    const dimension = document.getElementById('scaling-dimension').value;
+    const data = DDR_DATA.scaling[dataset];
+    if (!data) return;
+    const traces = [];
+    const models = Object.keys(data);
+    models.forEach(model => {
+        const modelData = data[model];
+        let xValues, xLabel;
+        switch (dimension) {
+            case 'turn':
+                xValues = modelData.turns;
+                xLabel = 'Number of Interaction Turns';
+                break;
+            case 'token':
+                xValues = modelData.tokens;
+                xLabel = 'Total Tokens Used';
+                break;
+            case 'cost':
+                xValues = modelData.costs;
+                xLabel = 'Inference Cost ($)';
+                break;
+        }
+        traces.push({
+            x: xValues,
+            y: modelData.accuracy,
+            mode: 'lines+markers',
+            name: model,
+            line: {
+                color: DDR_DATA.modelColors[model] || '#888',
+                width: 2.5
+            },
+            marker: {
+                size: 6,
+                color: DDR_DATA.modelColors[model] || '#888'
+            },
+            hovertemplate: `<b>${model}</b><br>` +
+                `${dimension === 'cost' ? 'Cost: $' : dimension === 'token' ? 'Tokens: ' : 'Turn: '}%{x}<br>` +
+                `Accuracy: %{y:.1f}%<extra></extra>`
+        });
+    });
+    const layout = {
+        ...darkLayout,
+        title: {
+            text: `${dataset.toUpperCase()} - ${dimension.charAt(0).toUpperCase() + dimension.slice(1)} Scaling`,
+            font: { size: 18, color: '#f1f5f9' }
+        },
+        xaxis: {
+            ...darkLayout.xaxis,
+            title: {
+                text: dimension === 'turn' ? 'Number of Interaction Turns' :
+                    dimension === 'token' ? 'Total Tokens Used' : 'Inference Cost ($)',
+                font: { size: 14, color: '#e2e8f0' }
+            },
+            type: dimension === 'cost' ? 'log' : 'linear'
+        },
+        yaxis: {
+            ...darkLayout.yaxis,
+            title: { text: 'Accuracy (%)', font: { size: 14, color: '#e2e8f0' } }
+        },
+        showlegend: true,
+        legend: {
+            ...darkLayout.legend,
+            orientation: 'h',
+            y: -0.2,
+            x: 0.5,
+            xanchor: 'center'
+        }
+    };
+    Plotly.newPlot('scaling-chart', traces, layout, plotlyConfig);
+}
+// Event listeners for scaling controls
+document.getElementById('scaling-dataset').addEventListener('change', renderScalingChart);
+document.getElementById('scaling-dimension').addEventListener('change', renderScalingChart);
+// ============================================================================
+// ENTROPY ANALYSIS CHART
+// ============================================================================
+function renderEntropyChart() {
+    const dataset = document.getElementById('entropy-dataset').value;
+    const data = DDR_DATA.entropy[dataset];
+    if (!data) return;
+    const traces = [];
+    const models = Object.keys(data);
+    models.forEach(model => {
+        const modelData = data[model];
+        // Normalize accuracy for marker size (10-30 range)
+        const sizes = modelData.accuracy.map(a => 8 + (a / Math.max(...modelData.accuracy)) * 15);
+        // Normalize accuracy for opacity (0.4-1.0 range)
+        const maxAcc = Math.max(...modelData.accuracy);
+        const minAcc = Math.min(...modelData.accuracy);
+        const opacities = modelData.accuracy.map(a => 0.4 + 0.6 * (a - minAcc) / (maxAcc - minAcc || 1));
+        traces.push({
+            x: modelData.entropy,
+            y: modelData.coverage,
+            mode: 'markers',
+            name: model,
+            marker: {
+                size: sizes,
+                color: DDR_DATA.modelColors[model] || '#888',
+                opacity: opacities,
+                line: {
+                    color: '#000',
+                    width: 0.5
+                }
+            },
+            text: modelData.accuracy.map(a => `Accuracy: ${a}%`),
+            hovertemplate: `<b>${model}</b><br>` +
+                `Entropy: %{x:.2f}<br>` +
+                `Coverage: %{y:.2f}<br>` +
+                `%{text}<extra></extra>`
+        });
+    });
+    const layout = {
+        ...darkLayout,
+        title: {
+            text: `${dataset.toUpperCase()} - Entropy vs Coverage (Marker Size/Opacity = Accuracy)`,
+            font: { size: 18, color: '#f1f5f9' }
+        },
+        xaxis: {
+            ...darkLayout.xaxis,
+            title: { text: 'Normalized Access Entropy', font: { size: 14, color: '#e2e8f0' } },
+            range: [0.6, 1.0]
+        },
+        yaxis: {
+            ...darkLayout.yaxis,
+            title: { text: 'Coverage', font: { size: 14, color: '#e2e8f0' } }
+        },
+        showlegend: true,
+        legend: {
+            ...darkLayout.legend,
+            orientation: 'h',
+            y: -0.2,
+            x: 0.5,
+            xanchor: 'center'
+        }
+    };
+    Plotly.newPlot('entropy-chart', traces, layout, plotlyConfig);
+}
+document.getElementById('entropy-dataset').addEventListener('change', renderEntropyChart);
+// ============================================================================
+// RANKING COMPARISON CHART
+// ============================================================================
+function renderRankingChart() {
+    const dataset = document.getElementById('ranking-dataset').value;
+    const data = DDR_DATA.ranking[dataset];
+    if (!data) return;
+    // Take top 22 models
+    const models = data.slice(0, 22);
+    // Create traces for novelty rank (circles) and accuracy rank (diamonds)
+    const traces = [];
+    // Connection lines
+    models.forEach((m, i) => {
+        traces.push({
+            x: [m.bt_rank, m.acc_rank],
+            y: [i, i],
+            mode: 'lines',
+            line: {
+                color: 'rgba(148, 163, 184, 0.3)',
+                width: 1,
+                dash: 'dash'
+            },
+            showlegend: false,
+            hoverinfo: 'skip'
+        });
+    });
+    // Novelty rank points (circles)
+    traces.push({
+        x: models.map(m => m.bt_rank),
+        y: models.map((m, i) => i),
+        mode: 'markers',
+        name: 'Novelty Rank',
+        marker: {
+            size: 12,
+            symbol: 'circle',
+            color: models.map(m => m.is_proprietary ? '#6A0DAD' : '#228B22'),
+            line: { color: '#000', width: 1 }
+        },
+        text: models.map(m => `${m.model}<br>Novelty Rank: ${m.bt_rank}<br>Win Rate: ${m.win_rate}%`),
+        hovertemplate: '%{text}<extra></extra>'
+    });
+    // Accuracy rank points (diamonds)
+    traces.push({
+        x: models.map(m => m.acc_rank),
+        y: models.map((m, i) => i),
+        mode: 'markers',
+        name: 'Accuracy Rank',
+        marker: {
+            size: 14,
+            symbol: 'diamond-open',
+            color: models.map(m => m.is_proprietary ? '#6A0DAD' : '#228B22'),
+            line: { width: 2 }
+        },
+        text: models.map(m => `${m.model}<br>Accuracy Rank: ${m.acc_rank}<br>Accuracy: ${m.accuracy}%`),
+        hovertemplate: '%{text}<extra></extra>'
+    });
+    // Calculate correlation
+    const btRanks = models.map(m => m.bt_rank);
+    const accRanks = models.map(m => m.acc_rank);
+    const correlation = calculateCorrelation(btRanks, accRanks);
+    const layout = {
+        ...darkLayout,
+        title: {
+            text: `${dataset} - Novelty vs Accuracy Ranking (ρ = ${correlation.toFixed(2)})`,
+            font: { size: 18, color: '#f1f5f9' }
+        },
+        xaxis: {
+            ...darkLayout.xaxis,
+            title: { text: 'Rank', font: { size: 14, color: '#e2e8f0' } },
+            range: [23, 0],
+            tickmode: 'linear',
+            dtick: 2
+        },
+        yaxis: {
+            ...darkLayout.yaxis,
+            tickmode: 'array',
+            tickvals: models.map((_, i) => i),
+            ticktext: models.map(m => m.model.replace(/-/g, ' ')),
+            automargin: true
+        },
+        showlegend: true,
+        legend: {
+            ...darkLayout.legend,
+            orientation: 'h',
+            y: -0.15,
+            x: 0.5,
+            xanchor: 'center'
+        },
+        annotations: [
+            {
+                x: 0.02,
+                y: 0.98,
+                xref: 'paper',
+                yref: 'paper',
+                text: '🟣 Proprietary  🟢 Open-Source',
+                showarrow: false,
+                font: { size: 12, color: '#94a3b8' },
+                bgcolor: 'rgba(30, 41, 59, 0.8)',
+                borderpad: 5
+            }
+        ],
+        margin: { ...darkLayout.margin, l: 180 }
+    };
+    Plotly.newPlot('ranking-chart', traces, layout, plotlyConfig);
+}
+function calculateCorrelation(x, y) {
+    const n = x.length;
+    const sumX = x.reduce((a, b) => a + b, 0);
+    const sumY = y.reduce((a, b) => a + b, 0);
+    const sumXY = x.reduce((acc, xi, i) => acc + xi * y[i], 0);
+    const sumX2 = x.reduce((acc, xi) => acc + xi * xi, 0);
+    const sumY2 = y.reduce((acc, yi) => acc + yi * yi, 0);
+    const numerator = n * sumXY - sumX * sumY;
+    const denominator = Math.sqrt((n * sumX2 - sumX * sumX) * (n * sumY2 - sumY * sumY));
+    return denominator !== 0 ? numerator / denominator : 0;
+}
+document.getElementById('ranking-dataset').addEventListener('change', renderRankingChart);
+// ============================================================================
+// TURN DISTRIBUTION CHART (Ridgeline-like)
+// ============================================================================
+function renderTurnChart() {
+    const dataset = document.getElementById('turn-dataset').value;
+    const data = DDR_DATA.turn[dataset];
+    if (!data) return;
+    // Sort by median (descending)
+    const sortedData = [...data].sort((a, b) => b.median - a.median);
+    const traces = [];
+    const binLabels = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100'];
+    // Family colors
+    const familyColors = {
+        'Claude': '#FF6D00',
+        'GPT': '#00C853',
+        'Gemini': '#2196F3',
+        'DeepSeek': '#E91E63',
+        'GLM': '#9C27B0',
+        'Kimi': '#FFA500',
+        'MiniMax': '#20B2AA',
+        'Qwen': '#0EA5E9',
+        'Llama': '#F59E0B'
+    };
+    function getModelColor(modelName) {
+        for (const [family, color] of Object.entries(familyColors)) {
+            if (modelName.includes(family)) return color;
+        }
+        return '#888';
+    }
+    sortedData.forEach((model, i) => {
+        const color = getModelColor(model.model);
+        traces.push({
+            x: model.distribution,
+            y: binLabels,
+            orientation: 'h',
+            name: `${model.model} (med=${model.median})`,
+            type: 'bar',
+            marker: {
+                color: color,
+                opacity: 0.7
+            },
+            xaxis: `x${i + 1}`,
+            yaxis: 'y',
+            hovertemplate: `<b>${model.model}</b><br>` +
+                `Turns: %{y}<br>` +
+                `Sessions: %{x}%<extra></extra>`
+        });
+    });
+    // Create subplot annotations
+    const annotations = sortedData.map((model, i) => ({
+        x: 0.5,
+        y: i,
+        xref: 'paper',
+        yref: 'paper',
+        text: `<b>${model.model}</b> (median: ${model.median})`,
+        showarrow: false,
+        font: { size: 11, color: '#e2e8f0' },
+        xanchor: 'center'
+    }));
+    // Use a violin-like grouped bar approach instead
+    const violinTraces = sortedData.map((model, i) => {
+        const color = getModelColor(model.model);
+        const cumsum = model.distribution.reduce((acc, v, idx) => {
+            acc.push((acc[idx - 1] || 0) + v);
+            return acc;
+        }, []);
+        // Create x values from 0 to 100
+        const xVals = Array.from({ length: 100 }, (_, k) => k);
+        const yVals = xVals.map(x => {
+            const binIdx = Math.min(Math.floor(x / 10), 9);
+            return model.distribution[binIdx] / 10; // Scale down
+        });
+        return {
+            x: xVals,
+            y: yVals.map(v => v + i * 12), // Stack vertically
+            fill: 'tozeroy',
+            fillcolor: color + '80',
+            line: { color: color, width: 1.5 },
+            name: `${model.model} (med=${model.median})`,
+            mode: 'lines',
+            hovertemplate: `<b>${model.model}</b><br>` +
+                `Median: ${model.median} turns<extra></extra>`
+        };
+    });
+    const layout = {
+        ...darkLayout,
+        title: {
+            text: `${dataset.toUpperCase()} - Turn Count Distribution`,
+            font: { size: 18, color: '#f1f5f9' }
+        },
+        xaxis: {
+            ...darkLayout.xaxis,
+            title: { text: 'Number of Turns', font: { size: 14, color: '#e2e8f0' } },
+            range: [0, 100]
+        },
+        yaxis: {
+            ...darkLayout.yaxis,
+            title: { text: '', font: { size: 14, color: '#e2e8f0' } },
+            tickmode: 'array',
+            tickvals: sortedData.map((_, i) => i * 12 + 3),
+            ticktext: sortedData.map(m => `${m.model} (${m.median})`),
+            showgrid: false
+        },
+        showlegend: false,
+        height: 700,
+        margin: { ...darkLayout.margin, l: 200 }
+    };
+    Plotly.newPlot('turn-chart', violinTraces, layout, plotlyConfig);
+}
+document.getElementById('turn-dataset').addEventListener('change', renderTurnChart);
+// ============================================================================
+// PROBING RESULTS CHART
+// ============================================================================
+function renderProbingChart() {
+    const mode = document.getElementById('probing-mode').value;
+    const scenarios = ['mimic', 'globem', '10k'];
+    const scenarioTitles = { mimic: 'MIMIC', globem: 'GLOBEM', '10k': '10-K' };
+    const data = DDR_DATA.probing[mode];
+    if (!data) return;
+    const traces = [];
+    const models = Object.keys(data.mimic);
+    // Create subplots for each scenario
+    scenarios.forEach((scenario, scIdx) => {
+        const scenarioData = data[scenario];
+        models.forEach(model => {
+            const modelData = scenarioData[model];
+            const xKey = mode === 'byTurn' ? 'turns' : 'progress';
+            const xLabel = mode === 'byTurn' ? 'Turn' : 'Progress (%)';
+            // Main line
+            traces.push({
+                x: modelData[xKey],
+                y: modelData.logprob,
+                mode: 'lines+markers',
+                name: model,
+                legendgroup: model,
+                showlegend: scIdx === 0,
+                line: {
+                    color: DDR_DATA.probingColors[model],
+                    width: 2
+                },
+                marker: {
+                    size: 5,
+                    color: DDR_DATA.probingColors[model]
+                },
+                xaxis: `x${scIdx + 1}`,
+                yaxis: `y${scIdx + 1}`,
+                hovertemplate: `<b>${model}</b><br>` +
+                    `${xLabel}: %{x}<br>` +
+                    `Log Prob: %{y:.2f}<extra></extra>`
+            });
+            // Error band (SEM)
+            const upper = modelData.logprob.map((v, i) => v + modelData.sem[i]);
+            const lower = modelData.logprob.map((v, i) => v - modelData.sem[i]);
+            traces.push({
+                x: [...modelData[xKey], ...modelData[xKey].slice().reverse()],
+                y: [...upper, ...lower.slice().reverse()],
+                fill: 'toself',
+                fillcolor: DDR_DATA.probingColors[model] + '30',
+                line: { width: 0 },
+                showlegend: false,
+                legendgroup: model,
+                xaxis: `x${scIdx + 1}`,
+                yaxis: `y${scIdx + 1}`,
+                hoverinfo: 'skip'
+            });
+        });
+    });
+    const layout = {
+        paper_bgcolor: 'rgba(30, 41, 59, 0)',
+        plot_bgcolor: 'rgba(30, 41, 59, 0)',
+        font: { family: 'Inter, sans-serif', color: '#e2e8f0' },
+        title: {
+            text: `FINISH Token Avg Log Probability ${mode === 'byTurn' ? 'by Turn' : 'by Progress'}`,
+            font: { size: 18, color: '#f1f5f9' }
+        },
+        grid: { rows: 1, columns: 3, pattern: 'independent' },
+        annotations: scenarios.map((sc, i) => ({
+            text: scenarioTitles[sc],
+            font: { size: 14, color: '#e2e8f0' },
+            showarrow: false,
+            x: (i + 0.5) / 3,
+            y: 1.08,
+            xref: 'paper',
+            yref: 'paper'
+        })),
+        showlegend: true,
+        legend: {
+            orientation: 'h',
+            y: -0.15,
+            x: 0.5,
+            xanchor: 'center',
+            bgcolor: 'rgba(30, 41, 59, 0.8)',
+            font: { color: '#e2e8f0' }
+        },
+        margin: { t: 80, r: 20, b: 100, l: 60 }
+    };
+    // Add axis configs for each subplot
+    scenarios.forEach((sc, i) => {
+        const xKey = `xaxis${i === 0 ? '' : i + 1}`;
+        const yKey = `yaxis${i === 0 ? '' : i + 1}`;
+        layout[xKey] = {
+            title: { text: mode === 'byTurn' ? 'Turn' : 'Progress (%)', font: { size: 12 } },
+            gridcolor: 'rgba(148, 163, 184, 0.15)',
+            tickfont: { color: '#94a3b8' },
+            domain: [i / 3 + 0.02, (i + 1) / 3 - 0.02]
+        };
+        layout[yKey] = {
+            title: i === 0 ? { text: 'Avg Log Probability', font: { size: 12 } } : {},
+            gridcolor: 'rgba(148, 163, 184, 0.15)',
+            tickfont: { color: '#94a3b8' }
+        };
+    });
+    Plotly.newPlot('probing-chart', traces, layout, plotlyConfig);
+}
+document.getElementById('probing-mode').addEventListener('change', renderProbingChart);
+// ============================================================================
+// INITIALIZE ALL CHARTS
+// ============================================================================
+document.addEventListener('DOMContentLoaded', () => {
+    renderScalingChart();
+    renderEntropyChart();
+    renderRankingChart();
+    renderTurnChart();
+    renderProbingChart();
+});
+// Handle window resize
+window.addEventListener('resize', () => {
+    Plotly.Plots.resize('scaling-chart');
+    Plotly.Plots.resize('entropy-chart');
+    Plotly.Plots.resize('ranking-chart');
+    Plotly.Plots.resize('turn-chart');
+    Plotly.Plots.resize('probing-chart');
+});

data.js ADDED Viewed

	@@ -0,0 +1,333 @@

+// DDR-Bench Visualization Data
+// Auto-generated data for interactive charts
+const DDR_DATA = {
+    // Color scheme for models
+    modelColors: {
+        'GPT-5.2': '#00C853',
+        'Claude-4.5-Sonnet': '#FF6D00',
+        'Gemini-3-Flash': '#2196F3',
+        'GLM-4.6': '#9C27B0',
+        'DeepSeek-V3.2': '#E91E63',
+        'Qwen3-Next-80B-A3B': '#FFC107',
+        'Kimi-K2': '#FFA500',
+        'MiniMax-M2': '#20B2AA',
+        // Probing models
+        'Qwen2.5-32B': '#4A90D9',
+        'Qwen2.5-72B': '#1A5FB4',
+        'Qwen3-4B': '#57E389',
+        'Qwen3-30B-A3B': '#26A269',
+    },
+    // Scaling Analysis Data
+    scaling: {
+        mimic: {
+            'GPT-5.2': {
+                turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
+                tokens: [51, 1476, 1796, 2544, 3738, 4927, 5784, 6682, 7563, 8577, 10445, 11612, 12837, 14129, 15460, 16840, 17761, 18642, 19456, 20194],
+                costs: [0.0005, 0.0012, 0.0021, 0.0032, 0.0050, 0.0072, 0.0100, 0.0131, 0.0167, 0.0207, 0.0257, 0.0310, 0.0371, 0.0439, 0.0516, 0.0595, 0.0680, 0.0772, 0.0860, 0.0947],
+                accuracy: [2.8, 5.5, 8.2, 10.8, 13.2, 15.5, 17.6, 19.5, 21.2, 22.7, 24.0, 25.1, 26.0, 26.7, 27.1, 27.2, 27.2, 27.3, 27.3, 27.26]
+            },
+            'Claude-4.5-Sonnet': {
+                turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
+                tokens: [33, 1527, 1715, 3193, 4513, 5965, 6664, 7387, 8417, 9214, 9823, 10620, 11533, 12516, 13378, 14190, 15001, 15723, 16457, 17218],
+                costs: [0.0004, 0.0027, 0.0053, 0.0097, 0.0152, 0.0222, 0.0300, 0.0386, 0.0484, 0.0590, 0.0702, 0.0823, 0.0954, 0.1097, 0.1249, 0.1410, 0.1580, 0.1758, 0.1944, 0.2138],
+                accuracy: [3.5, 7.0, 10.5, 14.0, 17.2, 20.2, 23.0, 25.5, 27.8, 29.8, 31.5, 32.8, 33.8, 34.2, 34.3, 34.4, 34.4, 34.4, 34.4, 34.37]
+            },
+            'Gemini-3-Flash': {
+                turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
+                tokens: [457, 2153, 2606, 4332, 5581, 7503, 8911, 10726, 12697, 14305, 16481, 18695, 20559, 22036, 23357, 24415, 25207, 25977, 26542, 26964],
+                costs: [0.0001, 0.0004, 0.0007, 0.0013, 0.0020, 0.0030, 0.0040, 0.0052, 0.0066, 0.0080, 0.0097, 0.0116, 0.0135, 0.0154, 0.0173, 0.0196, 0.0219, 0.0240, 0.0263, 0.0284],
+                accuracy: [2.5, 5.0, 7.5, 10.0, 12.4, 14.6, 16.7, 18.6, 20.3, 21.8, 23.1, 24.0, 24.6, 24.8, 24.9, 24.9, 24.9, 24.9, 24.9, 24.94]
+            },
+            'GLM-4.6': {
+                turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
+                tokens: [59, 1528, 1775, 2779, 3488, 4211, 4665, 5338, 6159, 7059, 7997, 8766, 9345, 9928, 10542, 11095, 11598, 12149, 12657, 13099],
+                costs: [0.0001, 0.0008, 0.0015, 0.0024, 0.0034, 0.0045, 0.0056, 0.0069, 0.0083, 0.0098, 0.0115, 0.0133, 0.0151, 0.0170, 0.0190, 0.0210, 0.0231, 0.0253, 0.0275, 0.0298],
+                accuracy: [2.3, 4.7, 7.0, 9.3, 11.5, 13.5, 15.4, 17.1, 18.7, 20.1, 21.2, 22.1, 22.7, 23.0, 23.1, 23.2, 23.2, 23.2, 23.3, 23.26]
+            },
+            'DeepSeek-V3.2': {
+                turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
+                tokens: [45, 1420, 1690, 2450, 3520, 4680, 5560, 6420, 7350, 8280, 9150, 10020, 10890, 11750, 12610, 13470, 14320, 15170, 16020, 16870],
+                costs: [0.0001, 0.0006, 0.0012, 0.0020, 0.0031, 0.0044, 0.0059, 0.0076, 0.0095, 0.0117, 0.0140, 0.0165, 0.0192, 0.0221, 0.0252, 0.0284, 0.0318, 0.0354, 0.0392, 0.0431],
+                accuracy: [2.7, 5.4, 8.1, 10.8, 13.4, 15.8, 18.1, 20.2, 22.1, 23.8, 25.2, 26.3, 26.8, 27.0, 27.0, 27.0, 27.0, 27.0, 27.0, 27.00]
+            }
+        },
+        '10k': {
+            'GPT-5.2': {
+                turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
+                tokens: [48, 1380, 1650, 2380, 3420, 4550, 5410, 6250, 7150, 8050, 8890, 9730, 10570, 11400, 12230, 13060, 13880, 14700, 15520, 16340],
+                costs: [0.0004, 0.0010, 0.0017, 0.0027, 0.0042, 0.0061, 0.0084, 0.0110, 0.0140, 0.0174, 0.0216, 0.0261, 0.0312, 0.0369, 0.0434, 0.0501, 0.0572, 0.0650, 0.0724, 0.0797],
+                accuracy: [4.5, 9.0, 13.5, 18.0, 22.3, 26.3, 30.0, 33.4, 36.5, 39.3, 41.8, 43.5, 44.5, 44.9, 45.0, 45.0, 45.0, 45.0, 45.0, 44.99]
+            },
+            'Claude-4.5-Sonnet': {
+                turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
+                tokens: [30, 1420, 1580, 2970, 4200, 5550, 6200, 6870, 7830, 8570, 9130, 9870, 10710, 11620, 12410, 13150, 13890, 14550, 15220, 15920],
+                costs: [0.0004, 0.0025, 0.0049, 0.0089, 0.0140, 0.0205, 0.0277, 0.0357, 0.0447, 0.0545, 0.0649, 0.0760, 0.0882, 0.1014, 0.1154, 0.1303, 0.1460, 0.1624, 0.1796, 0.1976],
+                accuracy: [7.7, 15.5, 23.2, 30.9, 38.4, 45.6, 52.6, 59.2, 65.5, 70.5, 74.2, 76.0, 77.0, 77.3, 77.3, 77.3, 77.3, 77.3, 77.3, 77.27]
+            },
+            'Gemini-3-Flash': {
+                turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
+                tokens: [420, 1980, 2400, 3990, 5140, 6910, 8210, 9880, 11700, 13180, 15180, 17220, 18940, 20300, 21510, 22480, 23210, 23920, 24440, 24830],
+                costs: [0.0001, 0.0004, 0.0007, 0.0012, 0.0019, 0.0028, 0.0037, 0.0048, 0.0061, 0.0074, 0.0090, 0.0107, 0.0125, 0.0142, 0.0160, 0.0181, 0.0202, 0.0222, 0.0243, 0.0263],
+                accuracy: [4.4, 8.9, 13.3, 17.8, 22.0, 26.1, 30.0, 33.6, 37.0, 40.1, 42.4, 43.8, 44.3, 44.4, 44.4, 44.4, 44.4, 44.4, 44.4, 44.41]
+            },
+            'GLM-4.6': {
+                turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
+                tokens: [54, 1400, 1625, 2545, 3196, 3860, 4273, 4888, 5645, 6474, 7330, 8036, 8576, 9120, 9697, 10210, 10678, 11192, 11662, 12080],
+                costs: [0.0001, 0.0007, 0.0014, 0.0022, 0.0031, 0.0041, 0.0051, 0.0063, 0.0076, 0.0090, 0.0106, 0.0122, 0.0139, 0.0156, 0.0174, 0.0193, 0.0212, 0.0232, 0.0252, 0.0273],
+                accuracy: [6.0, 12.1, 18.1, 24.2, 30.0, 35.6, 41.0, 46.0, 50.8, 55.0, 58.2, 59.7, 60.3, 60.4, 60.4, 60.4, 60.4, 60.4, 60.4, 60.42]
+            },
+            'DeepSeek-V3.2': {
+                turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
+                tokens: [42, 1305, 1555, 2250, 3235, 4295, 5105, 5895, 6750, 7600, 8395, 9190, 9985, 10775, 11565, 12355, 13140, 13925, 14710, 15495],
+                costs: [0.0001, 0.0005, 0.0011, 0.0018, 0.0028, 0.0040, 0.0054, 0.0070, 0.0087, 0.0107, 0.0129, 0.0152, 0.0176, 0.0203, 0.0231, 0.0261, 0.0292, 0.0325, 0.0360, 0.0396],
+                accuracy: [6.1, 12.1, 18.2, 24.2, 30.1, 35.8, 41.2, 46.3, 51.2, 55.5, 58.8, 60.2, 60.6, 60.7, 60.7, 60.7, 60.7, 60.7, 60.7, 60.66]
+            }
+        },
+        globem: {
+            'GPT-5.2': {
+                turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+                tokens: [51, 1476, 1796, 2544, 3738, 4927, 5784, 6682, 7563, 8577, 10445, 11612, 12837, 14129, 15460],
+                costs: [0.0005, 0.0012, 0.0021, 0.0032, 0.0050, 0.0072, 0.0100, 0.0131, 0.0167, 0.0207, 0.0257, 0.0310, 0.0371, 0.0439, 0.0516],
+                accuracy: [3.8, 7.7, 11.5, 15.3, 19.0, 22.6, 26.1, 29.4, 32.5, 35.4, 37.2, 38.0, 38.3, 38.4, 38.39]
+            },
+            'Claude-4.5-Sonnet': {
+                turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+                tokens: [33, 1527, 1715, 3193, 4513, 5965, 6664, 7387, 8417, 9214, 9823, 10620, 11533, 12516, 13378],
+                costs: [0.0004, 0.0027, 0.0053, 0.0097, 0.0152, 0.0222, 0.0300, 0.0386, 0.0484, 0.0590, 0.0702, 0.0823, 0.0954, 0.1097, 0.1249],
+                accuracy: [4.0, 8.0, 12.1, 16.1, 20.0, 23.9, 27.6, 31.2, 34.6, 37.0, 39.0, 40.0, 40.2, 40.2, 40.23]
+            },
+            'Gemini-3-Flash': {
+                turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+                tokens: [457, 2153, 2606, 4332, 5581, 7503, 8911, 10726, 12697, 14305, 16481, 18695, 20559, 22036, 23357],
+                costs: [0.0001, 0.0004, 0.0007, 0.0013, 0.0020, 0.0030, 0.0040, 0.0052, 0.0066, 0.0080, 0.0097, 0.0116, 0.0135, 0.0154, 0.0173],
+                accuracy: [3.5, 7.1, 10.6, 14.1, 17.5, 20.8, 24.0, 27.1, 29.9, 32.2, 33.8, 34.9, 35.2, 35.3, 35.29]
+            },
+            'GLM-4.6': {
+                turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+                tokens: [59, 1528, 1775, 2779, 3488, 4211, 4665, 5338, 6159, 7059, 7997, 8766, 9345, 9928, 10542],
+                costs: [0.0001, 0.0008, 0.0015, 0.0024, 0.0034, 0.0045, 0.0056, 0.0069, 0.0083, 0.0098, 0.0115, 0.0133, 0.0151, 0.0170, 0.0190],
+                accuracy: [4.2, 8.3, 12.5, 16.6, 20.7, 24.6, 28.4, 32.0, 35.4, 38.0, 40.0, 41.2, 41.5, 41.6, 41.61]
+            },
+            'DeepSeek-V3.2': {
+                turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
+                tokens: [45, 1420, 1690, 2450, 3520, 4680, 5560, 6420, 7350, 8280, 9150, 10020, 10890, 11750, 12610],
+                costs: [0.0001, 0.0006, 0.0012, 0.0020, 0.0031, 0.0044, 0.0059, 0.0076, 0.0095, 0.0117, 0.0140, 0.0165, 0.0192, 0.0221, 0.0252],
+                accuracy: [3.8, 7.6, 11.5, 15.3, 19.0, 22.7, 26.2, 29.6, 32.8, 35.5, 37.2, 38.0, 38.1, 38.2, 38.16]
+            }
+        }
+    },
+    // Ranking Comparison Data
+    ranking: {
+        MIMIC: [ // one entry per model, listed in ascending bt_rank order (1-22)
+            { model: 'Claude4.5-Sonnet', bt_rank: 1, win_rate: 87.5, accuracy: 33.66, acc_rank: 1, is_proprietary: true },
+            { model: 'Kimi-K2', bt_rank: 2, win_rate: 82.1, accuracy: 30.17, acc_rank: 2, is_proprietary: false },
+            { model: 'GPT5.1', bt_rank: 3, win_rate: 78.3, accuracy: 30.10, acc_rank: 3, is_proprietary: true },
+            { model: 'Gemini3-Flash', bt_rank: 4, win_rate: 75.0, accuracy: 29.28, acc_rank: 4, is_proprietary: true },
+            { model: 'GPT5.2', bt_rank: 5, win_rate: 71.2, accuracy: 28.88, acc_rank: 5, is_proprietary: true },
+            { model: 'DeepSeek-V3.2', bt_rank: 6, win_rate: 68.5, accuracy: 27.65, acc_rank: 6, is_proprietary: false },
+            { model: 'GPT5-mini', bt_rank: 7, win_rate: 65.0, accuracy: 27.59, acc_rank: 7, is_proprietary: true },
+            { model: 'GLM4.6', bt_rank: 8, win_rate: 61.8, accuracy: 23.84, acc_rank: 8, is_proprietary: false },
+            { model: 'MiniMax-M2', bt_rank: 9, win_rate: 58.2, accuracy: 23.52, acc_rank: 9, is_proprietary: false },
+            { model: 'Qwen3', bt_rank: 10, win_rate: 54.5, accuracy: 19.13, acc_rank: 11, is_proprietary: false }, // NOTE(review): 19.13 is the 10th-highest accuracy in this list, yet acc_rank is 11 — confirm which field is right
+            { model: 'Gemini2.5-Pro', bt_rank: 11, win_rate: 51.0, accuracy: 19.00, acc_rank: 12, is_proprietary: true }, // NOTE(review): 19.00 is the 11th-highest accuracy in this list, yet acc_rank is 12 — confirm
+            { model: 'Qwen3-Next-80B-A3B', bt_rank: 12, win_rate: 47.5, accuracy: 18.80, acc_rank: 10, is_proprietary: false }, // NOTE(review): 18.80 is the 12th-highest accuracy in this list, yet acc_rank is 10 — confirm
+            { model: 'Gemini2.5-Flash', bt_rank: 13, win_rate: 44.0, accuracy: 18.61, acc_rank: 13, is_proprietary: true },
+            { model: 'Qwen3-4B', bt_rank: 14, win_rate: 40.5, accuracy: 16.93, acc_rank: 14, is_proprietary: false },
+            { model: 'Gemini2.5-Flash-Lite', bt_rank: 15, win_rate: 37.0, accuracy: 16.64, acc_rank: 15, is_proprietary: true },
+            { model: 'Qwen2.5-72B', bt_rank: 16, win_rate: 33.5, accuracy: 14.92, acc_rank: 16, is_proprietary: false },
+            { model: 'Qwen2.5-14B-1M', bt_rank: 17, win_rate: 30.0, accuracy: 14.08, acc_rank: 18, is_proprietary: false },
+            { model: 'Qwen2.5-14B', bt_rank: 18, win_rate: 26.5, accuracy: 14.15, acc_rank: 17, is_proprietary: false },
+            { model: 'Qwen2.5-32B', bt_rank: 19, win_rate: 23.0, accuracy: 13.12, acc_rank: 19, is_proprietary: false },
+            { model: 'Qwen2.5-7B', bt_rank: 20, win_rate: 19.5, accuracy: 10.79, acc_rank: 20, is_proprietary: false },
+            { model: 'Qwen2.5-7B-1M', bt_rank: 21, win_rate: 16.0, accuracy: 9.08, acc_rank: 21, is_proprietary: false },
+            { model: 'Llama3.3-70B', bt_rank: 22, win_rate: 12.5, accuracy: 7.30, acc_rank: 22, is_proprietary: false }
+        ],
+        '10K': [
+            { model: 'Claude4.5-Sonnet', bt_rank: 1, win_rate: 92.0, accuracy: 69.26, acc_rank: 1, is_proprietary: true },
+            { model: 'DeepSeek-V3.2', bt_rank: 2, win_rate: 85.5, accuracy: 49.41, acc_rank: 2, is_proprietary: false },
+            { model: 'GLM4.6', bt_rank: 3, win_rate: 82.0, accuracy: 48.29, acc_rank: 3, is_proprietary: false },
+            { model: 'GPT5.2', bt_rank: 4, win_rate: 78.0, accuracy: 43.11, acc_rank: 4, is_proprietary: true },
+            { model: 'GPT5-mini', bt_rank: 5, win_rate: 74.5, accuracy: 41.56, acc_rank: 5, is_proprietary: true },
+            { model: 'GPT5.1', bt_rank: 6, win_rate: 71.0, accuracy: 41.23, acc_rank: 6, is_proprietary: true },
+            { model: 'Kimi-K2', bt_rank: 7, win_rate: 67.5, accuracy: 41.17, acc_rank: 7, is_proprietary: false },
+            { model: 'Gemini3-Flash', bt_rank: 8, win_rate: 64.0, accuracy: 39.50, acc_rank: 8, is_proprietary: true },
+            { model: 'Qwen3-Next-80B-A3B', bt_rank: 9, win_rate: 60.5, accuracy: 38.34, acc_rank: 9, is_proprietary: false },
+            { model: 'MiniMax-M2', bt_rank: 10, win_rate: 57.0, accuracy: 35.74, acc_rank: 10, is_proprietary: false },
+            { model: 'Qwen3-4B', bt_rank: 11, win_rate: 53.5, accuracy: 30.43, acc_rank: 11, is_proprietary: false },
+            { model: 'Qwen3', bt_rank: 12, win_rate: 50.0, accuracy: 28.23, acc_rank: 12, is_proprietary: false },
+            { model: 'Gemini2.5-Pro', bt_rank: 13, win_rate: 46.5, accuracy: 20.91, acc_rank: 13, is_proprietary: true },
+            { model: 'Qwen2.5-72B', bt_rank: 14, win_rate: 43.0, accuracy: 20.79, acc_rank: 14, is_proprietary: false },
+            { model: 'Qwen2.5-32B', bt_rank: 15, win_rate: 39.5, accuracy: 17.83, acc_rank: 15, is_proprietary: false },
+            { model: 'Qwen2.5-14B-1M', bt_rank: 16, win_rate: 36.0, accuracy: 16.67, acc_rank: 16, is_proprietary: false },
+            { model: 'Qwen2.5-14B', bt_rank: 17, win_rate: 32.5, accuracy: 14.65, acc_rank: 17, is_proprietary: false },
+            { model: 'Gemini2.5-Flash-Lite', bt_rank: 18, win_rate: 29.0, accuracy: 14.37, acc_rank: 18, is_proprietary: true },
+            { model: 'Gemini2.5-Flash', bt_rank: 19, win_rate: 25.5, accuracy: 12.61, acc_rank: 19, is_proprietary: true },
+            { model: 'Qwen2.5-7B', bt_rank: 20, win_rate: 22.0, accuracy: 7.53, acc_rank: 20, is_proprietary: false },
+            { model: 'Qwen2.5-7B-1M', bt_rank: 21, win_rate: 18.5, accuracy: 6.68, acc_rank: 21, is_proprietary: false },
+            { model: 'Llama3.3-70B', bt_rank: 22, win_rate: 15.0, accuracy: 6.51, acc_rank: 22, is_proprietary: false }
+        ],
+        GLOBEM: [
+            { model: 'GLM4.6', bt_rank: 1, win_rate: 78.0, accuracy: 39.77, acc_rank: 1, is_proprietary: false },
+            { model: 'Claude4.5-Sonnet', bt_rank: 2, win_rate: 75.5, accuracy: 39.54, acc_rank: 2, is_proprietary: true },
+            { model: 'GPT5.2', bt_rank: 3, win_rate: 72.0, accuracy: 38.39, acc_rank: 3, is_proprietary: true },
+            { model: 'DeepSeek-V3.2', bt_rank: 4, win_rate: 69.5, accuracy: 38.39, acc_rank: 4, is_proprietary: false },
+            { model: 'Kimi-K2', bt_rank: 5, win_rate: 66.0, accuracy: 37.01, acc_rank: 5, is_proprietary: false },
+            { model: 'MiniMax-M2', bt_rank: 6, win_rate: 63.5, accuracy: 36.90, acc_rank: 6, is_proprietary: false },
+            { model: 'GPT5.1', bt_rank: 7, win_rate: 61.0, accuracy: 36.76, acc_rank: 7, is_proprietary: true },
+            { model: 'Qwen3', bt_rank: 8, win_rate: 58.0, accuracy: 36.32, acc_rank: 8, is_proprietary: false },
+            { model: 'Gemini3-Flash', bt_rank: 9, win_rate: 55.5, accuracy: 35.46, acc_rank: 9, is_proprietary: true },
+            { model: 'Gemini2.5-Pro', bt_rank: 10, win_rate: 52.0, accuracy: 34.60, acc_rank: 10, is_proprietary: true },
+            { model: 'Qwen3-Next-80B-A3B', bt_rank: 11, win_rate: 49.5, accuracy: 34.14, acc_rank: 11, is_proprietary: false },
+            { model: 'GPT5-mini', bt_rank: 12, win_rate: 46.0, accuracy: 33.91, acc_rank: 12, is_proprietary: true },
+            { model: 'Gemini2.5-Flash', bt_rank: 13, win_rate: 43.5, accuracy: 28.62, acc_rank: 13, is_proprietary: true },
+            { model: 'Qwen2.5-7B-1M', bt_rank: 14, win_rate: 40.0, accuracy: 27.15, acc_rank: 14, is_proprietary: false },
+            { model: 'Qwen2.5-72B', bt_rank: 15, win_rate: 37.5, accuracy: 27.13, acc_rank: 15, is_proprietary: false },
+            { model: 'Qwen3-4B', bt_rank: 16, win_rate: 34.0, accuracy: 26.90, acc_rank: 16, is_proprietary: false },
+            { model: 'Qwen2.5-14B-1M', bt_rank: 17, win_rate: 31.5, accuracy: 26.47, acc_rank: 17, is_proprietary: false },
+            { model: 'Qwen2.5-14B', bt_rank: 18, win_rate: 28.0, accuracy: 26.13, acc_rank: 18, is_proprietary: false },
+            { model: 'Qwen2.5-32B', bt_rank: 19, win_rate: 25.5, accuracy: 25.90, acc_rank: 19, is_proprietary: false },
+            { model: 'Qwen2.5-7B', bt_rank: 20, win_rate: 22.0, accuracy: 25.64, acc_rank: 20, is_proprietary: false },
+            { model: 'Gemini2.5-Flash-Lite', bt_rank: 21, win_rate: 19.5, accuracy: 25.52, acc_rank: 21, is_proprietary: true },
+            { model: 'Llama3.3-70B', bt_rank: 22, win_rate: 15.0, accuracy: 22.65, acc_rank: 22, is_proprietary: false }
+        ]
+    },
+    // Turn Distribution Data (distribution: percentage in bins [0-10, 10-20, ..., 90-100])
+    turn: { // keyed by dataset; each entry is { model, median, distribution } where distribution holds 10 values that sum to 100
+        mimic: [
+            { model: 'DeepSeekV3.2', median: 21, distribution: [0, 0, 2, 8, 15, 22, 25, 18, 7, 3] }, // NOTE(review): this model is spelled 'DeepSeek-V3.2' in the ranking and entropy data — confirm which key charts.js expects
+            { model: 'GLM4.6', median: 20, distribution: [0, 0, 3, 10, 18, 25, 22, 14, 5, 3] },
+            { model: 'Gemini3-Flash', median: 18, distribution: [0, 0, 3, 10, 18, 25, 22, 14, 5, 3] },
+            { model: 'GPT5.1', median: 16, distribution: [0, 1, 5, 12, 22, 28, 18, 9, 3, 2] },
+            { model: 'Kimi-K2', median: 15, distribution: [0, 1, 6, 15, 25, 28, 16, 6, 2, 1] },
+            { model: 'Claude4.5-Sonnet', median: 14, distribution: [0, 0, 5, 15, 25, 30, 15, 7, 2, 1] },
+            { model: 'MiniMax-M2', median: 14, distribution: [0, 2, 8, 18, 28, 25, 12, 5, 1, 1] },
+            { model: 'GPT5.2', median: 12, distribution: [0, 2, 8, 20, 30, 25, 10, 3, 1, 1] },
+            { model: 'Qwen3-30B-A3B', median: 12, distribution: [0, 3, 10, 22, 30, 22, 9, 3, 1, 0] },
+            { model: 'Qwen3-Next-80B-A3B', median: 11, distribution: [1, 4, 12, 25, 30, 18, 7, 2, 1, 0] },
+            { model: 'Qwen2.5-72B', median: 10, distribution: [1, 5, 15, 28, 28, 15, 5, 2, 1, 0] },
+            { model: 'Qwen3-4B', median: 9, distribution: [2, 6, 18, 30, 25, 12, 5, 1, 1, 0] },
+            { model: 'GPT5-mini', median: 8, distribution: [2, 8, 18, 28, 25, 12, 5, 1, 1, 0] },
+            { model: 'Llama3.3-70B', median: 5, distribution: [12, 25, 30, 20, 8, 3, 1, 1, 0, 0] }
+        ],
+        '10k': [
+            { model: 'GLM4.6', median: 22, distribution: [0, 0, 2, 5, 12, 20, 25, 22, 10, 4] },
+            { model: 'Gemini3-Flash', median: 22, distribution: [0, 0, 2, 5, 12, 20, 25, 22, 10, 4] },
+            { model: 'DeepSeekV3.2', median: 20, distribution: [0, 0, 3, 10, 18, 25, 22, 14, 5, 3] },
+            { model: 'Kimi-K2', median: 17, distribution: [0, 1, 4, 12, 20, 28, 20, 10, 3, 2] },
+            { model: 'MiniMax-M2', median: 17, distribution: [0, 1, 5, 14, 24, 28, 18, 7, 2, 1] },
+            { model: 'Claude4.5-Sonnet', median: 16, distribution: [0, 1, 5, 12, 22, 28, 18, 9, 3, 2] },
+            { model: 'Qwen3-30B-A3B', median: 16, distribution: [0, 1, 5, 12, 22, 28, 18, 9, 3, 2] },
+            { model: 'GPT5.2', median: 14, distribution: [0, 2, 8, 18, 28, 25, 12, 5, 1, 1] },
+            { model: 'Qwen2.5-72B', median: 14, distribution: [0, 2, 8, 18, 28, 25, 12, 5, 1, 1] },
+            { model: 'GPT5.1', median: 13, distribution: [0, 2, 8, 20, 28, 24, 12, 4, 1, 1] },
+            { model: 'Qwen3-Next-80B-A3B', median: 12, distribution: [0, 2, 10, 22, 30, 22, 10, 3, 1, 0] },
+            { model: 'Qwen3-4B', median: 12, distribution: [0, 3, 10, 22, 30, 22, 9, 3, 1, 0] },
+            { model: 'GPT5-mini', median: 9, distribution: [2, 6, 18, 30, 25, 12, 5, 1, 1, 0] },
+            { model: 'Llama3.3-70B', median: 6, distribution: [10, 22, 30, 22, 10, 4, 1, 1, 0, 0] }
+        ],
+        globem: [
+            { model: 'GLM4.6', median: 22, distribution: [0, 0, 2, 6, 14, 22, 26, 20, 7, 3] },
+            { model: 'DeepSeekV3.2', median: 20, distribution: [0, 0, 3, 10, 18, 25, 22, 14, 5, 3] },
+            { model: 'Qwen3-30B-A3B', median: 20, distribution: [0, 0, 3, 10, 18, 25, 22, 14, 5, 3] },
+            { model: 'Kimi-K2', median: 17, distribution: [0, 1, 4, 12, 20, 28, 20, 10, 3, 2] },
+            { model: 'MiniMax-M2', median: 17, distribution: [0, 1, 5, 14, 24, 28, 18, 7, 2, 1] },
+            { model: 'Gemini3-Flash', median: 15, distribution: [0, 1, 6, 15, 25, 28, 16, 6, 2, 1] },
+            { model: 'Claude4.5-Sonnet', median: 13, distribution: [0, 2, 10, 20, 28, 25, 10, 4, 1, 0] },
+            { model: 'GPT5.1', median: 13, distribution: [0, 2, 10, 20, 28, 25, 10, 4, 1, 0] },
+            { model: 'Qwen3-Next-80B-A3B', median: 12, distribution: [0, 2, 10, 22, 30, 22, 10, 3, 1, 0] },
+            { model: 'Qwen3-4B', median: 12, distribution: [0, 3, 10, 22, 30, 22, 9, 3, 1, 0] },
+            { model: 'GPT5.2', median: 11, distribution: [1, 4, 12, 25, 30, 18, 7, 2, 1, 0] },
+            { model: 'Qwen2.5-72B', median: 14, distribution: [0, 2, 8, 18, 28, 25, 12, 5, 1, 1] }, // NOTE(review): median 14 breaks the otherwise descending-median order of every list in this object — confirm the value or the position
+            { model: 'GPT5-mini', median: 8, distribution: [3, 10, 20, 30, 22, 10, 3, 1, 1, 0] },
+            { model: 'Llama3.3-70B', median: 6, distribution: [10, 22, 32, 22, 9, 3, 1, 1, 0, 0] }
+        ]
+    },
+    // Entropy Analysis Data
+    entropy: {
+        mimic: {
+            'GPT-5.2': { entropy: [0.72, 0.78, 0.82, 0.68, 0.75, 0.88, 0.65, 0.79, 0.71, 0.84], coverage: [0.08, 0.10, 0.09, 0.07, 0.09, 0.11, 0.06, 0.10, 0.08, 0.10], accuracy: [30, 35, 40, 25, 32, 45, 20, 28, 31, 38] },
+            'Claude-4.5-Sonnet': { entropy: [0.85, 0.88, 0.92, 0.80, 0.87, 0.78, 0.82, 0.90, 0.86, 0.89], coverage: [0.12, 0.14, 0.13, 0.10, 0.13, 0.09, 0.11, 0.15, 0.12, 0.14], accuracy: [45, 50, 55, 40, 48, 35, 42, 52, 47, 51] },
+            'Gemini-3-Flash': { entropy: [0.70, 0.75, 0.68, 0.72, 0.80, 0.65, 0.78, 0.72, 0.69, 0.76], coverage: [0.06, 0.09, 0.07, 0.08, 0.10, 0.05, 0.09, 0.07, 0.06, 0.08], accuracy: [28, 32, 25, 30, 38, 22, 35, 28, 26, 33] },
+            'GLM-4.6': { entropy: [0.78, 0.82, 0.75, 0.80, 0.88, 0.72, 0.85, 0.78, 0.76, 0.83], coverage: [0.09, 0.11, 0.08, 0.10, 0.13, 0.07, 0.12, 0.09, 0.08, 0.11], accuracy: [32, 40, 28, 35, 45, 25, 42, 32, 30, 38] },
+            'DeepSeek-V3.2': { entropy: [0.82, 0.85, 0.78, 0.88, 0.75, 0.90, 0.80, 0.85, 0.81, 0.87], coverage: [0.10, 0.12, 0.09, 0.14, 0.08, 0.15, 0.10, 0.12, 0.10, 0.13], accuracy: [38, 42, 32, 48, 28, 52, 35, 42, 36, 44] }
+        },
+        '10k': {
+            'GPT-5.2': { entropy: [0.85, 0.88, 0.92, 0.82, 0.87, 0.94, 0.80, 0.89, 0.84, 0.91], coverage: [0.35, 0.42, 0.48, 0.32, 0.40, 0.52, 0.28, 0.44, 0.38, 0.46], accuracy: [35, 40, 45, 30, 38, 50, 25, 42, 36, 44] },
+            'Claude-4.5-Sonnet': { entropy: [0.92, 0.95, 0.98, 0.90, 0.94, 0.88, 0.91, 0.96, 0.93, 0.95], coverage: [0.55, 0.62, 0.68, 0.50, 0.58, 0.45, 0.52, 0.65, 0.56, 0.60], accuracy: [65, 72, 78, 60, 68, 55, 62, 75, 66, 70] },
+            'Gemini-3-Flash': { entropy: [0.82, 0.86, 0.80, 0.84, 0.90, 0.78, 0.88, 0.83, 0.81, 0.87], coverage: [0.28, 0.35, 0.25, 0.32, 0.42, 0.22, 0.38, 0.30, 0.26, 0.36], accuracy: [35, 40, 30, 38, 48, 28, 45, 36, 32, 42] },
+            'GLM-4.6': { entropy: [0.88, 0.92, 0.85, 0.90, 0.95, 0.82, 0.93, 0.88, 0.86, 0.91], coverage: [0.42, 0.50, 0.38, 0.46, 0.55, 0.35, 0.52, 0.44, 0.40, 0.48], accuracy: [50, 58, 45, 52, 62, 40, 56, 50, 46, 54] },
+            'DeepSeek-V3.2': { entropy: [0.90, 0.93, 0.87, 0.95, 0.85, 0.97, 0.89, 0.94, 0.88, 0.92], coverage: [0.48, 0.55, 0.42, 0.60, 0.38, 0.65, 0.50, 0.57, 0.45, 0.53], accuracy: [52, 60, 48, 65, 42, 70, 55, 62, 50, 58] }
+        },
+        globem: {
+            'GPT-5.2': { entropy: [0.75, 0.80, 0.85, 0.72, 0.78, 0.88, 0.70, 0.82, 0.76, 0.84], coverage: [0.65, 0.72, 0.78, 0.60, 0.70, 0.85, 0.55, 0.75, 0.68, 0.80], accuracy: [32, 38, 42, 28, 35, 48, 25, 40, 34, 44] },
+            'Claude-4.5-Sonnet': { entropy: [0.82, 0.86, 0.90, 0.78, 0.84, 0.75, 0.80, 0.88, 0.83, 0.87], coverage: [0.78, 0.85, 0.92, 0.72, 0.82, 0.68, 0.75, 0.88, 0.80, 0.86], accuracy: [38, 45, 50, 35, 42, 32, 38, 48, 40, 46] },
+            'Gemini-3-Flash': { entropy: [0.72, 0.77, 0.70, 0.75, 0.82, 0.68, 0.80, 0.74, 0.71, 0.78], coverage: [0.55, 0.65, 0.50, 0.58, 0.72, 0.45, 0.68, 0.60, 0.52, 0.66], accuracy: [30, 36, 28, 34, 42, 26, 40, 32, 28, 38] },
+            'GLM-4.6': { entropy: [0.80, 0.84, 0.78, 0.82, 0.90, 0.75, 0.87, 0.81, 0.79, 0.85], coverage: [0.72, 0.80, 0.68, 0.75, 0.88, 0.62, 0.85, 0.74, 0.70, 0.82], accuracy: [38, 45, 35, 42, 52, 30, 48, 40, 36, 46] },
+            'DeepSeek-V3.2': { entropy: [0.84, 0.88, 0.80, 0.90, 0.78, 0.92, 0.82, 0.87, 0.83, 0.89], coverage: [0.75, 0.82, 0.70, 0.88, 0.65, 0.92, 0.78, 0.84, 0.72, 0.86], accuracy: [36, 42, 32, 48, 28, 52, 38, 44, 34, 46] }
+        }
+    },
+    // Probing Results Data
+    probing: {
+        byTurn: {
+            mimic: {
+                'Qwen2.5-32B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-12.5, -11.8, -11.2, -10.5, -10.0, -9.5, -9.2, -8.8, -8.5, -8.2], sem: [0.8, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3] },
+                'Qwen2.5-72B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-11.8, -11.2, -10.5, -9.8, -9.2, -8.8, -8.4, -8.0, -7.7, -7.5], sem: [0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2] },
+                'Qwen3-4B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-13.2, -12.5, -11.8, -11.0, -10.2, -9.5, -9.0, -8.5, -8.2, -7.8], sem: [0.9, 0.8, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3] },
+                'Qwen3-30B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-12.0, -11.2, -10.5, -9.8, -9.0, -8.5, -8.0, -7.6, -7.2, -7.0], sem: [0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2] },
+                'Qwen3-Next-80B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-10.5, -9.8, -9.2, -8.5, -8.0, -7.5, -7.2, -6.8, -6.5, -6.2], sem: [0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2] }
+            },
+            globem: {
+                'Qwen2.5-32B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-11.5, -10.8, -10.2, -9.5, -9.0, -8.5, -8.2, -7.8, -7.5, -7.2], sem: [0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2] },
+                'Qwen2.5-72B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-10.8, -10.2, -9.5, -8.8, -8.2, -7.8, -7.4, -7.0, -6.7, -6.5], sem: [0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2] },
+                'Qwen3-4B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-12.2, -11.5, -10.8, -10.0, -9.2, -8.5, -8.0, -7.5, -7.2, -6.8], sem: [0.8, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3] },
+                'Qwen3-30B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-11.0, -10.2, -9.5, -8.8, -8.0, -7.5, -7.0, -6.6, -6.2, -6.0], sem: [0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2, 0.2] },
+                'Qwen3-Next-80B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-9.5, -8.8, -8.2, -7.5, -7.0, -6.5, -6.2, -5.8, -5.5, -5.2], sem: [0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2] }
+            },
+            '10k': {
+                'Qwen2.5-32B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-12.0, -11.3, -10.7, -10.0, -9.5, -9.0, -8.7, -8.3, -8.0, -7.7], sem: [0.8, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3] },
+                'Qwen2.5-72B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-11.3, -10.7, -10.0, -9.3, -8.7, -8.3, -7.9, -7.5, -7.2, -7.0], sem: [0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2] },
+                'Qwen3-4B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-12.7, -12.0, -11.3, -10.5, -9.7, -9.0, -8.5, -8.0, -7.7, -7.3], sem: [0.9, 0.8, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3] },
+                'Qwen3-30B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-11.5, -10.7, -10.0, -9.3, -8.5, -8.0, -7.5, -7.1, -6.7, -6.5], sem: [0.7, 0.6, 0.5, 0.5, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2] },
+                'Qwen3-Next-80B-A3B': { turns: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], logprob: [-10.0, -9.3, -8.7, -8.0, -7.5, -7.0, -6.7, -6.3, -6.0, -5.7], sem: [0.6, 0.5, 0.5, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2] }
+            }
+        },
+        byProgress: {
+            mimic: {
+                'Qwen2.5-32B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.5, -12.0, -11.5, -11.0, -10.5, -10.0, -9.5, -9.0, -8.5, -8.0], sem: [0.8, 0.7, 0.7, 0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3] },
+                'Qwen2.5-72B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.0, -11.5, -11.0, -10.5, -9.8, -9.2, -8.7, -8.2, -7.8, -7.5], sem: [0.7, 0.7, 0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3] },
+                'Qwen3-4B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-13.0, -12.5, -12.0, -11.5, -10.8, -10.0, -9.3, -8.7, -8.2, -7.8], sem: [0.9, 0.8, 0.8, 0.7, 0.6, 0.6, 0.5, 0.5, 0.4, 0.4] },
+                'Qwen3-30B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.2, -11.7, -11.0, -10.3, -9.5, -8.8, -8.2, -7.6, -7.2, -6.8], sem: [0.7, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3] },
+                'Qwen3-Next-80B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-10.8, -10.2, -9.5, -8.8, -8.0, -7.5, -7.0, -6.5, -6.2, -5.8], sem: [0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2] }
+            },
+            globem: {
+                'Qwen2.5-32B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-11.5, -11.0, -10.5, -10.0, -9.5, -9.0, -8.5, -8.0, -7.5, -7.0], sem: [0.7, 0.7, 0.6, 0.5, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3] },
+                'Qwen2.5-72B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-11.0, -10.5, -10.0, -9.5, -8.8, -8.2, -7.7, -7.2, -6.8, -6.5], sem: [0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.4, 0.3, 0.3, 0.2] },
+                'Qwen3-4B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.0, -11.5, -11.0, -10.5, -9.8, -9.0, -8.3, -7.7, -7.2, -6.8], sem: [0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3] },
+                'Qwen3-30B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-11.2, -10.7, -10.0, -9.3, -8.5, -7.8, -7.2, -6.6, -6.2, -5.8], sem: [0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2] },
+                'Qwen3-Next-80B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-9.8, -9.2, -8.5, -7.8, -7.0, -6.5, -6.0, -5.5, -5.2, -4.8], sem: [0.5, 0.5, 0.4, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2, 0.2] }
+            },
+            '10k': {
+                'Qwen2.5-32B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.0, -11.5, -11.0, -10.5, -10.0, -9.5, -9.0, -8.5, -8.0, -7.5], sem: [0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.5, 0.4, 0.4, 0.3] },
+                'Qwen2.5-72B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-11.5, -11.0, -10.5, -10.0, -9.3, -8.7, -8.2, -7.7, -7.3, -7.0], sem: [0.7, 0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.4, 0.3, 0.3] },
+                'Qwen3-4B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-12.5, -12.0, -11.5, -11.0, -10.3, -9.5, -8.8, -8.2, -7.7, -7.3], sem: [0.9, 0.8, 0.7, 0.7, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3] },
+                'Qwen3-30B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-11.7, -11.2, -10.5, -9.8, -9.0, -8.3, -7.7, -7.1, -6.7, -6.3], sem: [0.7, 0.6, 0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2] },
+                'Qwen3-Next-80B-A3B': { progress: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90], logprob: [-10.3, -9.7, -9.0, -8.3, -7.5, -7.0, -6.5, -6.0, -5.7, -5.3], sem: [0.6, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.2] }
+            }
+        }
+    },
+    // Probing model colors
+    probingColors: {
+        'Qwen2.5-32B': '#4A90D9',
+        'Qwen2.5-72B': '#1A5FB4',
+        'Qwen3-4B': '#57E389',
+        'Qwen3-30B-A3B': '#26A269',
+        'Qwen3-Next-80B-A3B': '#9141AC'
+    }
+};

index.html CHANGED Viewed

@@ -1,19 +1,163 @@
-<!doctype html>
-<html>
-	<head>
-		<meta charset="utf-8" />
-		<meta name="viewport" content="width=device-width" />
-		<title>My static Space</title>
-		<link rel="stylesheet" href="style.css" />
-	</head>
-	<body>
-		<div class="card">
-			<h1>Welcome to your static Space!</h1>
-			<p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
-			<p>
-				Also don't forget to check the
-				<a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
-			</p>
-		</div>
-	</body>
 </html>

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <meta name="description" content="DDR-Bench: A Deep Data Research Agent Benchmark for LLMs">
+    <title>DDR-Bench | Deep Data Research Benchmark</title>
+    <link rel="preconnect" href="https://fonts.googleapis.com">
+    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
+    <script src="https://cdn.plot.ly/plotly-2.27.0.min.js"></script> <!-- Plotly is loaded in the head, ahead of charts.js at the end of the body -->
+    <link rel="stylesheet" href="styles.css">
+</head>
+<body>
+    <!-- Hero Section: benchmark title, tagline, description and three headline stats -->
+    <header class="hero">
+        <div class="hero-content">
+            <div class="badge">🔬 Research Benchmark</div>
+            <h1>DDR-Bench</h1>
+            <p class="subtitle">Deep Data Research Agent Benchmark for Large Language Models</p>
+            <p class="description">
+                A comprehensive evaluation framework measuring AI agents' ability to conduct deep, iterative data exploration across medical records (MIMIC), financial filings (10-K), and behavioral data (GLOBEM).
+            </p>
+            <div class="stats-row">
+                <div class="stat-item">
+                    <span class="stat-value">22+</span>
+                    <span class="stat-label">Models Evaluated</span>
+                </div>
+                <div class="stat-item">
+                    <span class="stat-value">3</span>
+                    <span class="stat-label">Diverse Datasets</span>
+                </div>
+                <div class="stat-item">
+                    <span class="stat-value">5</span>
+                    <span class="stat-label">Analysis Dimensions</span>
+                </div>
+            </div>
+        </div>
+    </header>
+    <!-- Navigation: charts.js attaches a click handler to every .nav-tab and reads its data-section value; each value here equals the id of one section below -->
+    <nav class="nav-tabs">
+        <button class="nav-tab active" data-section="scaling">📈 Scaling Analysis</button>
+        <button class="nav-tab" data-section="entropy">🔀 Entropy Analysis</button>
+        <button class="nav-tab" data-section="ranking">🏆 Ranking Comparison</button>
+        <button class="nav-tab" data-section="turn">🔄 Turn Distribution</button>
+        <button class="nav-tab" data-section="probing">🔍 Probing Results</button>
+    </nav>
+    <!-- Main Content: one .section per tab; styles.css hides every .section that does not carry the "active" class -->
+    <main class="content">
+        <!-- Scaling Analysis Section (active on first load) -->
+        <section id="scaling" class="section active">
+            <div class="section-header">
+                <h2>Scaling Analysis</h2>
+                <p>Explore how model performance scales with interaction turns, token usage, and inference cost across datasets.</p>
+            </div>
+            <div class="controls">
+                <label>
+                    <span>Dataset:</span>
+                    <select id="scaling-dataset">
+                        <option value="mimic">MIMIC</option>
+                        <option value="10k">10-K</option>
+                        <option value="globem">GLOBEM</option>
+                    </select>
+                </label>
+                <label>
+                    <span>Scaling Dimension:</span>
+                    <select id="scaling-dimension">
+                        <option value="turn">Interaction Turns</option>
+                        <option value="token">Token Usage</option>
+                        <option value="cost">Inference Cost</option>
+                    </select>
+                </label>
+            </div>
+            <div id="scaling-chart" class="chart-container"></div>
+        </section>
+        <!-- Entropy Analysis Section: dataset option values are lower-case (mimic / 10k / globem) -->
+        <section id="entropy" class="section">
+            <div class="section-header">
+                <h2>Entropy vs Coverage Analysis</h2>
+                <p>Visualize the relationship between access entropy (exploration uniformity) and field coverage for each model.</p>
+            </div>
+            <div class="controls">
+                <label>
+                    <span>Dataset:</span>
+                    <select id="entropy-dataset">
+                        <option value="mimic">MIMIC</option>
+                        <option value="10k">10-K</option>
+                        <option value="globem">GLOBEM</option>
+                    </select>
+                </label>
+            </div>
+            <div id="entropy-chart" class="chart-container"></div>
+        </section>
+        <!-- Ranking Comparison Section: unlike the other selectors, these option values are upper-case (MIMIC / 10K / GLOBEM); presumably they index the upper-case ranking keys in data.js, so keep the two in sync -->
+        <section id="ranking" class="section">
+            <div class="section-header">
+                <h2>Novelty vs Accuracy Ranking</h2>
+                <p>Compare model rankings based on novelty (Bradley-Terry pairwise ranking) against traditional accuracy ranking.</p>
+            </div>
+            <div class="controls">
+                <label>
+                    <span>Dataset:</span>
+                    <select id="ranking-dataset">
+                        <option value="MIMIC">MIMIC</option>
+                        <option value="10K">10-K</option>
+                        <option value="GLOBEM">GLOBEM</option>
+                    </select>
+                </label>
+            </div>
+            <div id="ranking-chart" class="chart-container"></div>
+        </section>
+        <!-- Turn Distribution Section: the chart container also carries the "tall" class -->
+        <section id="turn" class="section">
+            <div class="section-header">
+                <h2>Turn Count Distribution</h2>
+                <p>Analyze the distribution of interaction turns across different models and datasets.</p>
+            </div>
+            <div class="controls">
+                <label>
+                    <span>Dataset:</span>
+                    <select id="turn-dataset">
+                        <option value="mimic">MIMIC</option>
+                        <option value="10k">10-K</option>
+                        <option value="globem">GLOBEM</option>
+                    </select>
+                </label>
+            </div>
+            <div id="turn-chart" class="chart-container tall"></div>
+        </section>
+        <!-- Probing Results Section: option values (byTurn / byProgress) equal the two keys under "probing" in data.js; this section has no dataset selector -->
+        <section id="probing" class="section">
+            <div class="section-header">
+                <h2>FINISH Token Probing</h2>
+                <p>Analyze the average log probability of FINISH messages across conversation turns and progress.</p>
+            </div>
+            <div class="controls">
+                <label>
+                    <span>View Mode:</span>
+                    <select id="probing-mode">
+                        <option value="byTurn">By Turn</option>
+                        <option value="byProgress">By Progress (%)</option>
+                    </select>
+                </label>
+            </div>
+            <div id="probing-chart" class="chart-container"></div>
+        </section>
+    </main>
+    <!-- Footer -->
+    <footer class="footer">
+        <p>DDR-Bench © 2026 | Deep Data Research Agent Benchmark</p>
+    </footer>
+    <script src="data.js"></script> <!-- loaded before charts.js; presumably charts.js reads the data object defined here, so keep this order (TODO confirm) -->
+    <script src="charts.js"></script>
+</body>
 </html>

styles.css ADDED Viewed

	@@ -0,0 +1,337 @@

+/* Root Variables: colour palette, shadow and gradient custom properties consumed through var() by the rules below */
+:root {
+    --primary: #6366f1;
+    --primary-dark: #4f46e5;
+    --primary-light: #818cf8;
+    --secondary: #10b981;
+    --accent: #f59e0b;
+    --bg-dark: #0f172a;
+    --bg-card: #1e293b;
+    --bg-card-hover: #334155;
+    --text-primary: #f1f5f9;
+    --text-secondary: #94a3b8;
+    --text-muted: #64748b;
+    --border: #334155;
+    --shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.3), 0 2px 4px -2px rgba(0, 0, 0, 0.2);
+    --shadow-lg: 0 10px 15px -3px rgba(0, 0, 0, 0.4), 0 4px 6px -4px rgba(0, 0, 0, 0.3);
+    --gradient-primary: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%);
+    --gradient-hero: linear-gradient(135deg, #1e293b 0%, #0f172a 50%, #1a1f3c 100%);
+}
+/* Reset & Base: border-box sizing and zeroed margin/padding on every element and its ::before/::after, then page-wide typography and colours */
+*, *::before, *::after {
+    box-sizing: border-box;
+    margin: 0;
+    padding: 0;
+}
+html {
+    scroll-behavior: smooth;
+}
+body {
+    font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; /* Inter first, then system fonts as fallbacks */
+    background-color: var(--bg-dark);
+    color: var(--text-primary);
+    line-height: 1.6;
+    min-height: 100vh;
+}
+/* Hero Section */
+.hero {
+    background: var(--gradient-hero);
+    padding: 4rem 2rem 3rem;
+    text-align: center;
+    position: relative;
+    overflow: hidden;
+}
+.hero::before {
+    content: '';
+    position: absolute;
+    top: 0;
+    left: 0;
+    right: 0;
+    bottom: 0;
+    background:
+        radial-gradient(circle at 20% 50%, rgba(99, 102, 241, 0.15) 0%, transparent 50%),
+        radial-gradient(circle at 80% 50%, rgba(139, 92, 246, 0.1) 0%, transparent 50%);
+    pointer-events: none;
+}
+.hero-content {
+    max-width: 900px;
+    margin: 0 auto;
+    position: relative;
+    z-index: 1;
+}
+.badge {
+    display: inline-block;
+    background: rgba(99, 102, 241, 0.2);
+    color: var(--primary-light);
+    padding: 0.5rem 1rem;
+    border-radius: 2rem;
+    font-size: 0.85rem;
+    font-weight: 500;
+    margin-bottom: 1rem;
+    border: 1px solid rgba(99, 102, 241, 0.3);
+}
+/* Hero title with gradient-filled text: the gradient background is clipped
+   to the glyph shapes and the text fill is made transparent so the gradient
+   shows through. */
+.hero h1 {
+    margin-bottom: 0.75rem;
+    font-size: 3.5rem;
+    font-weight: 700;
+    letter-spacing: -0.02em;
+    /* The clip declarations must follow the `background` shorthand, which
+       would otherwise reset them. */
+    background: linear-gradient(135deg, #f1f5f9 0%, #818cf8 100%);
+    -webkit-background-clip: text;
+    background-clip: text;
+    -webkit-text-fill-color: transparent;
+}
+/* Secondary headline text, regular weight, in the secondary text colour. */
+.subtitle {
+    margin-bottom: 1rem;
+    color: var(--text-secondary);
+    font-size: 1.35rem;
+    font-weight: 400;
+}
+/* Intro paragraph: muted colour, width-capped and horizontally centred. */
+.description {
+    max-width: 700px;
+    margin: 0 auto 2rem;
+    color: var(--text-muted);
+    font-size: 1rem;
+    line-height: 1.7;
+}
+/* Centred row of headline statistics. Items wrap onto additional lines when
+   the row is wider than its container instead of overflowing horizontally,
+   matching the wrapping behaviour of .nav-tabs and .controls. */
+.stats-row {
+    display: flex;
+    flex-wrap: wrap;
+    justify-content: center;
+    gap: 3rem;
+    margin-top: 2rem;
+}
+.stat-item {
+    text-align: center;
+}
+/* Large figure; display: block forces the label onto its own line below. */
+.stat-value {
+    display: block;
+    font-size: 2.5rem;
+    font-weight: 700;
+    color: var(--primary-light);
+}
+/* Small muted caption for the figure. */
+.stat-label {
+    font-size: 0.9rem;
+    color: var(--text-muted);
+}
+/* Navigation Tabs */
+/* Tab bar: sticks to the top of the viewport while scrolling (sticky, top 0)
+   and sits above other content via z-index. Tabs are centred and may wrap. */
+.nav-tabs {
+    position: sticky;
+    top: 0;
+    z-index: 100;
+    display: flex;
+    flex-wrap: wrap;
+    justify-content: center;
+    gap: 0.5rem;
+    padding: 1rem 2rem;
+    border-bottom: 1px solid var(--border);
+    background: var(--bg-card);
+}
+/* Individual tab. The transparent border reserves space so the .active
+   state can colour it without changing the tab's size. */
+.nav-tab {
+    padding: 0.75rem 1.5rem;
+    border: 1px solid transparent;
+    border-radius: 0.5rem;
+    background: transparent;
+    color: var(--text-secondary);
+    font-family: inherit;
+    font-size: 0.95rem;
+    font-weight: 500;
+    cursor: pointer;
+    transition: all 0.2s ease;
+}
+/* Hover feedback: brighter text on the card-hover background. */
+.nav-tab:hover {
+    background: var(--bg-card-hover);
+    color: var(--text-primary);
+}
+/* Currently selected tab: indigo tint, border and text. */
+.nav-tab.active {
+    border-color: rgba(99, 102, 241, 0.3);
+    background: rgba(99, 102, 241, 0.15);
+    color: var(--primary-light);
+}
+/* Main Content */
+/* Main content wrapper: width-capped, horizontally centred, padded. */
+.content {
+    margin: 0 auto;
+    padding: 2rem;
+    max-width: 1400px;
+}
+/* Sections */
+/* Content sections are hidden by default; only the one carrying .active is
+   displayed, and it plays the fadeIn animation when it appears. */
+.section {
+    display: none;
+    animation: fadeIn 0.3s ease;
+}
+.section.active {
+    display: block;
+}
+/* Entrance animation: fade in while sliding up 10px. */
+@keyframes fadeIn {
+    from { opacity: 0; transform: translateY(10px); }
+    to { opacity: 1; transform: translateY(0); }
+}
+/* Skip the entrance animation for users who request reduced motion. */
+@media (prefers-reduced-motion: reduce) {
+    .section {
+        animation: none;
+    }
+}
+/* Centred heading block at the top of each section. */
+.section-header {
+    text-align: center;
+    margin-bottom: 2rem;
+}
+/* Section title. */
+.section-header h2 {
+    margin-bottom: 0.5rem;
+    color: var(--text-primary);
+    font-size: 1.75rem;
+    font-weight: 600;
+}
+/* Muted explanatory line under the title. */
+.section-header p {
+    font-size: 1rem;
+    color: var(--text-muted);
+}
+/* Controls */
+/* Row of chart controls: centred, wraps when it runs out of width. */
+.controls {
+    display: flex;
+    flex-wrap: wrap;
+    justify-content: center;
+    gap: 1.5rem;
+    margin-bottom: 1.5rem;
+}
+/* Each label lays out its children in a row, vertically centred. */
+.controls label {
+    display: flex;
+    gap: 0.75rem;
+    align-items: center;
+}
+/* Caption text inside a control label. */
+.controls label span {
+    font-size: 0.9rem;
+    font-weight: 500;
+    color: var(--text-secondary);
+}
+/* Dropdowns in the control row, styled to match the card surfaces. */
+.controls select {
+    padding: 0.6rem 1rem;
+    background: var(--bg-card);
+    border: 1px solid var(--border);
+    border-radius: 0.5rem;
+    color: var(--text-primary);
+    font-size: 0.9rem;
+    cursor: pointer;
+    transition: all 0.2s ease;
+    font-family: inherit;
+    min-width: 160px;
+}
+.controls select:hover {
+    border-color: var(--primary);
+}
+/* Focus ring drawn with border colour plus a soft box-shadow halo. */
+.controls select:focus {
+    /* A transparent outline is invisible in normal rendering but is repainted
+       in forced-colors (high-contrast) modes, where box-shadow is dropped.
+       `outline: none` would leave no focus indicator there. */
+    outline: 2px solid transparent;
+    border-color: var(--primary);
+    box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.2);
+}
+/* Chart Container */
+/* Card surface that hosts a chart; min-height reserves at least 500px. */
+.chart-container {
+    min-height: 500px;
+    padding: 1.5rem;
+    border: 1px solid var(--border);
+    border-radius: 1rem;
+    background: var(--bg-card);
+    box-shadow: var(--shadow);
+}
+/* Taller variant: raises the minimum height to 700px. */
+.chart-container.tall {
+    min-height: 700px;
+}
+/* Footer */
+/* Page footer: centred muted text, separated from content by a top border. */
+.footer {
+    margin-top: 3rem;
+    padding: 2rem;
+    border-top: 1px solid var(--border);
+    color: var(--text-muted);
+    font-size: 0.9rem;
+    text-align: center;
+}
+/* Responsive */
+/* Viewports up to 768px wide: tighten spacing, shrink type, and stack the
+   chart controls vertically. */
+@media (max-width: 768px) {
+    /* Hero: less padding and smaller headline text. */
+    .hero {
+        padding: 3rem 1.5rem 2rem;
+    }
+    .hero h1 {
+        font-size: 2.5rem;
+    }
+    .subtitle {
+        font-size: 1.1rem;
+    }
+    /* Stats: narrower gap and smaller figures. */
+    .stats-row {
+        gap: 1.5rem;
+    }
+    .stat-value {
+        font-size: 2rem;
+    }
+    /* Tab bar: more compact bar and tabs. */
+    .nav-tabs {
+        gap: 0.25rem;
+        padding: 0.75rem 1rem;
+    }
+    .nav-tab {
+        font-size: 0.85rem;
+        padding: 0.5rem 1rem;
+    }
+    .content {
+        padding: 1rem;
+    }
+    /* Controls: one per row, each label stacked above a full-width select. */
+    .controls {
+        align-items: stretch;
+        flex-direction: column;
+    }
+    .controls label {
+        align-items: flex-start;
+        flex-direction: column;
+    }
+    .controls select {
+        width: 100%;
+    }
+}
+/* Plotly overrides for dark theme */
+/* Modebar icons: secondary text colour at rest, primary on hover.
+   !important is kept on every declaration, as in the original overrides. */
+.js-plotly-plot .plotly .modebar-btn path {
+    fill: var(--text-secondary) !important;
+}
+.js-plotly-plot .plotly .modebar-btn:hover path {
+    fill: var(--text-primary) !important;
+}
+/* Modebar background: near-opaque slate to match the dark cards. */
+.js-plotly-plot .plotly .modebar {
+    background: rgba(30, 41, 59, 0.9) !important;
+}