Spaces:

adgw
/

Text_Quality_Rating_Benchmark

Running

App Files Files Community

adgw committed 3 days ago

Commit

28913ff

verified ·

1 Parent(s): 00e5e3d

Update benchmark leaderboard

Browse files

Files changed (1) hide show

index.html +77 -165

index.html CHANGED Viewed

@@ -24,6 +24,23 @@
       color: #64748b; font-size: 12px; margin-bottom: 20px;
       line-height: 1.8;
     }
     .sep { color: #334155; margin: 0 8px; }
     .scoring-note {
       display: inline-flex; gap: 16px; flex-wrap: wrap;
@@ -33,7 +50,6 @@
     }
     .scoring-note span { display: flex; align-items: center; gap: 5px; }
     .dot { width: 9px; height: 9px; border-radius: 50%; flex-shrink: 0; }
     .filter-label { font-size: 10px; text-transform: uppercase; letter-spacing: 0.1em; color: #475569; margin-bottom: 10px; }
     #chips { display: flex; flex-wrap: wrap; gap: 6px; margin-bottom: 24px; }
     .chip {
@@ -44,7 +60,6 @@
     }
     .chip:hover { border-color: #7dd3fc; color: #e2e8f0; }
     .chip.active { background: #1e3a5f; border-color: #38bdf8; color: #7dd3fc; }
     .metric-toggle {
       display: flex; width: fit-content;
       border: 1px solid #1e2a3a; border-radius: 6px;
@@ -55,7 +70,6 @@
       border: none; background: #131820; color: #64748b; transition: all .15s;
     }
     .mt-btn.active { background: #1e3a5f; color: #7dd3fc; }
     .table-wrap {
       overflow-x: auto; border-radius: 10px;
       border: 1px solid #1e2a3a; margin-bottom: 52px;
@@ -85,7 +99,6 @@
       border-right: 1px solid rgba(255,255,255,0.04);
     }
     td.empty { text-align: center; color: #2d3748; background: #111520; border-right: 1px solid rgba(255,255,255,0.04); }
     .section-title {
       font-family: 'Syne', sans-serif; font-size: 18px; font-weight: 700;
       color: #f1f5f9; margin-bottom: 4px;
@@ -94,35 +107,6 @@
       background: #111827; border: 1px solid #1e2a3a;
       border-radius: 10px; padding: 24px 20px;
     }
-    .dist-wrap {
-      overflow-x: auto; border-radius: 10px;
-      border: 1px solid #1e2a3a; margin-bottom: 16px;
-    }
-    .dist-wrap table { border-collapse: collapse; width: auto; min-width: 100%; font-size: 12px; }
-    .dist-wrap thead tr { background: #111827; border-bottom: 2px solid #1e2a3a; }
-    .dist-wrap th {
-      padding: 10px 10px; white-space: nowrap; font-size: 10px;
-      text-transform: uppercase; letter-spacing: 0.07em; color: #475569; font-weight: 700;
-    }
-    .dist-wrap th.lang-h { text-align: left; width: 140px; padding-left: 14px; color: #64748b; }
-    .dist-wrap th.score-h { width: 70px; text-align: center; }
-    .dist-wrap th.total-h { width: 80px; text-align: center; color: #94a3b8; }
-    .dist-wrap td { padding: 8px 10px; border-bottom: 1px solid #0f1520; white-space: nowrap; }
-    .dist-wrap td.lang-d { padding-left: 14px; color: #cbd5e1; font-weight: 600; font-size: 12px; }
-    .dist-wrap td.count-d { text-align: center; font-size: 12px; }
-    .dist-wrap td.total-d { text-align: center; font-weight: 700; font-size: 12px; color: #94a3b8; }
-    .dist-bar {
-      display: inline-block; height: 6px; border-radius: 3px;
-      background: #2563eb; vertical-align: middle; margin-left: 4px; opacity: 0.7;
-    }
-    /* ── analysis sections ── */
-    .analysis-grid {
-      display: grid;
-      grid-template-columns: 1fr 1fr;
-      gap: 24px;
-      margin-bottom: 52px;
-    }
-    @media (max-width: 900px) { .analysis-grid { grid-template-columns: 1fr; } }
     .analysis-card {
       background: #111827; border: 1px solid #1e2a3a;
       border-radius: 10px; padding: 22px 20px;
@@ -131,6 +115,12 @@
       font-family: 'Syne', sans-serif; font-size: 14px; font-weight: 700;
       color: #f1f5f9; margin-bottom: 4px;
     }
     .analysis-card .card-sub {
       font-size: 11px; color: #475569; margin-bottom: 16px; line-height: 1.5;
     }
@@ -141,7 +131,7 @@
     }
     .model-select:focus { outline: none; border-color: #38bdf8; }
     .footer { margin-top: 20px; font-size: 11px; color: #2d3748; text-align: right; }
-    ::-webkit-scrollbar { height: 5px; background: #0d1117; }
     ::-webkit-scrollbar-thumb { background: #2d3748; border-radius: 3px; }
   </style>
 </head>
@@ -149,10 +139,20 @@
   <h1>Text Quality Rating Benchmark</h1>
   <p class="meta-subtitle">
     LLM accuracy at rating text quality on a 1–6 scale across multiple languages
-    <span class="sep">·</span> Labeled by DeepSeek V3.2 &amp; judged by Gemini 3 Flash
     <span class="sep">·</span> Documents sourced from FineWeb dataset
   </p>
   <div class="scoring-note">
     <span><span class="dot" style="background:#22c55e"></span>Exact match = 1.0 pt</span>
     <span><span class="dot" style="background:#eab308"></span>Off by ±1 = 0.5 pt</span>
@@ -174,17 +174,6 @@
     </table>
   </div>
-  <!-- DISTRIBUTION SECTION (disabled for testing)
-  <p class="section-title">Dataset Distribution</p>
-  <p class="subtitle" style="margin-bottom:20px">Number of unique texts per rating score (1–6) for each language</p>
-  <div class="dist-wrap">
-    <table id="dist-table">
-      <thead id="dist-head"></thead>
-      <tbody id="dist-body"></tbody>
-    </table>
-  </div>
-  -->
   <p class="section-title" style="margin-top:52px">Global Model Comparison</p>
   <p class="subtitle" style="margin-bottom:20px">Weighted Score vs Exact Accuracy — all languages combined, sorted by Weighted Score</p>
   <div class="chart-wrap">
@@ -194,24 +183,28 @@
   <p class="section-title" style="margin-bottom:4px;margin-top:52px">Model Error Analysis</p>
   <p class="subtitle" style="margin-bottom:20px">Bias, critical misclassifications and confusion patterns</p>
-  <!-- Bias lollipop -->
   <div class="analysis-card" style="margin-bottom:24px">
     <h3>Prediction Bias</h3>
     <p class="card-sub">Average error (predicted − ground truth). Negative = underestimation, positive = overestimation.</p>
-    <div id="biasChartContainer" style="position:relative">
-      <canvas id="biasChart"></canvas>
     </div>
   </div>
-  <!-- Critical confusion 1/2 vs 5/6 -->
   <div class="analysis-card" style="margin-bottom:52px">
     <h3>Critical Confusion Rate</h3>
     <p class="card-sub">
       % of low-quality texts (rating 1–2) predicted as high-quality (5–6) and vice versa.
       These are the most dangerous misclassifications.
     </p>
-    <div id="criticalChartContainer" style="position:relative">
-      <canvas id="criticalChart"></canvas>
     </div>
   </div>
@@ -233,7 +226,6 @@
   const ALL_LANGS   = ["ar", "az", "be", "bg", "bo", "ca", "cn", "cs", "da", "el", "es", "et", "eu", "fa", "fi", "fr", "gl", "hu", "hv", "is", "it", "ka", "la", "li", "lv", "mk", "mt", "nl", "no", "pt", "ro", "ru", "sk", "sl", "sq", "sr", "stack", "sv", "tr", "uk"];
   const LANG_NAMES  = {"af": "Afrikaans", "ab": "Arabic", "az": "Azerbaijani", "be": "Belarusian", "bo": "Bosnian", "bg": "Bulgarian", "bn": "Brunei", "ca": "Catalan", "cs": "Czech", "cn": "Chinese", "cy": "Welsh", "da": "Danish", "de": "German", "el": "Greek", "en": "English", "eo": "Esperanto", "es": "Spanish", "et": "Estonian", "eu": "Basque", "fa": "Faroese", "fi": "Finnish", "fr": "French", "ga": "Irish", "gl": "Galician", "gu": "Gujarati", "he": "Hebrew", "hi": "Hindi", "hr": "Croatian", "hu": "Hungarian", "ar": "Armenian", "hv": "Croatia", "id": "Indonesian", "is": "Icelandic", "it": "Italian", "ja": "Japanese", "ka": "Georgian", "kk": "Kazakh", "km": "Khmer", "kn": "Kannada", "ko": "Korean", "la": "Latin", "li": "Lithuanian", "lv": "Latvian", "mk": "Macedonian", "ml": "Malayalam", "mn": "Mongolian", "mr": "Marathi", "ms": "Malay", "mt": "Maltese", "my": "Burmese", "ne": "Nepali", "nl": "Dutch", "no": "Norwegian", "pa": "Punjabi", "pe": "Persian", "pl": "Polish", "pt": "Portuguese", "ro": "Romanian", "ru": "Russian", "si": "Sinhala", "sk": "Slovak", "sl": "Slovenian", "sq": "Albanian", "sr": "Serbian", "sv": "Swedish", "sw": "Swahili", "ta": "Tamil", "te": "Telugu", "th": "Thai", "tl": "Filipino", "tr": "Turkish", "uk": "Ukrainian", "ur": "Urdu", "uz": "Uzbek", "vi": "Vietnamese", "zh": "Chinese", "zu": "Zulu"};
   const LANG_COUNTS = {"ar": 1790, "az": 1795, "be": 1794, "bg": 1721, "bo": 1795, "ca": 1712, "cn": 2686, "cs": 1779, "da": 1784, "el": 1794, "es": 1652, "et": 1788, "eu": 2684, "fa": 2697, "fi": 1784, "fr": 1761, "gl": 2698, "hu": 1796, "hv": 1796, "is": 1790, "it": 1791, "ka": 1629, "la": 2522, "li": 1793, "lv": 1795, "mk": 1799, "mt": 1797, "nl": 1795, "no": 1799, "pt": 1795, "ro": 1790, "ru": 1424, "sk": 1784, "sl": 1788, "sq": 1793, "sr": 1798, "stack": 2590, "sv": 1797, "tr": 1799, "uk": 1747};
-  const LANG_DIST   = {"ar": {"1": 100, "2": 100, "5": 100, "6": 100}, "az": {"1": 100, "2": 100, "5": 100, "6": 100}, "be": {"1": 100, "2": 100, "5": 100, "6": 100}, "bg": {"1": 100, "2": 100, "5": 100, "6": 62}, "bo": {"1": 100, "2": 100, "5": 100, "6": 100}, "ca": {"1": 73, "2": 100, "5": 100, "6": 86}, "cn": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "cs": {"1": 100, "2": 100, "5": 100, "6": 100}, "da": {"1": 100, "2": 100, "5": 100, "6": 100}, "el": {"1": 100, "2": 100, "5": 100, "6": 100}, "es": {"1": 100, "2": 100, "5": 100, "6": 33}, "et": {"1": 100, "2": 100, "5": 100, "6": 100}, "eu": {"1": 97, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "fa": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "fi": {"1": 100, "2": 100, "5": 100, "6": 100}, "fr": {"1": 100, "2": 100, "5": 100, "6": 82}, "gl": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "hu": {"1": 100, "2": 100, "5": 100, "6": 100}, "hv": {"1": 100, "2": 100, "5": 100, "6": 100}, "is": {"1": 100, "2": 100, "5": 100, "6": 100}, "it": {"1": 100, "2": 100, "5": 100, "6": 100}, "ka": {"1": 18, "2": 100, "5": 100, "6": 100}, "la": {"1": 100, "2": 96, "3": 17, "4": 100, "5": 100, "6": 100}, "li": {"1": 100, "2": 100, "5": 100, "6": 100}, "lv": {"1": 100, "2": 100, "5": 100, "6": 100}, "mk": {"1": 100, "2": 100, "5": 100, "6": 100}, "mt": {"1": 100, "2": 100, "5": 100, "6": 100}, "nl": {"1": 100, "2": 100, "5": 100, "6": 100}, "no": {"1": 100, "2": 100, "5": 100, "6": 100}, "pt": {"1": 100, "2": 100, "5": 100, "6": 100}, "ro": {"1": 100, "2": 100, "5": 100, "6": 100}, "ru": {"1": 100, "2": 100, "5": 100, "6": 3}, "sk": {"1": 100, "2": 100, "5": 100, "6": 100}, "sl": {"1": 100, "2": 100, "5": 100, "6": 100}, "sq": {"1": 100, "2": 100, "5": 100, "6": 100}, "sr": {"1": 100, "2": 100, "5": 100, "6": 100}, "stack": {"1": 49, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "sv": {"1": 100, "2": 100, "5": 100, "6": 100}, "tr": {"1": 100, "2": 100, "5": 100, 
"6": 100}, "uk": {"1": 100, "2": 100, "5": 100, "6": 74}};
   function langName(c) {
     // Prefer the display name registered in LANG_NAMES; otherwise show the raw code upper-cased.
     const displayName = LANG_NAMES[c];
     if (displayName) return displayName;
     return c.toUpperCase();
   }
@@ -283,7 +275,7 @@
     ALL_LANGS.forEach(l => mk(langName(l), l, selLangs.includes(l)));
   }
-  // chart
   let chartInstance = null;
   function renderChart() {
     const sorted = [...ALL_ROWS].sort((a, b) => b.avg_wp - a.avg_wp);
@@ -302,116 +294,27 @@
       data: {
         labels,
         datasets: [
-          {
-            label: 'Weighted Score',
-            data: wpData,
-            backgroundColor: '#2563eb',
-            borderRadius: 3,
-            barPercentage: 0.72,
-          },
-          {
-            label: 'Exact Accuracy',
-            data: exData,
-            backgroundColor: '#16a34a',
-            borderRadius: 3,
-            barPercentage: 0.72,
-          },
         ]
       },
       options: {
-        indexAxis: 'y',
-        responsive: true,
-        maintainAspectRatio: false,
-        animation: { duration: 500 },
         plugins: {
-          legend: {
-            position: 'bottom',
-            labels: {
-              color: '#94a3b8',
-              font: { family: 'JetBrains Mono', size: 11 },
-              boxWidth: 14, padding: 20,
-            }
-          },
-          tooltip: {
-            backgroundColor: '#1e2a3a',
-            titleColor: '#e2e8f0',
-            bodyColor: '#94a3b8',
-            callbacks: {
-              label: ctx => ` ${ctx.dataset.label}: ${ctx.parsed.x}%`
-            }
-          },
         },
         scales: {
-          x: {
-            min: 0, max: 108,
-            grid: { color: '#1a2236' },
-            ticks: {
-              color: '#64748b',
-              font: { family: 'JetBrains Mono', size: 10 },
-              callback: v => v + '%',
-            },
-            title: {
-              display: true, text: 'Percent (%)',
-              color: '#64748b',
-              font: { family: 'JetBrains Mono', size: 11 },
-            }
-          },
-          y: {
-            grid: { display: false },
-            ticks: {
-              color: '#cbd5e1',
-              font: { family: 'JetBrains Mono', size: 11 },
-            }
-          }
         }
       }
     });
   }
-  // distribution table
-  function renderDist() {
-    const scores = [1, 2, 3, 4, 5, 6];
-    const langs  = [...ALL_LANGS];
-    // max count for bar scaling
-    let maxCount = 0;
-    langs.forEach(l => {
-      const d = LANG_DIST[l] || {};
-      scores.forEach(s => { if ((d[s]||0) > maxCount) maxCount = d[s]||0; });
-    });
-    // header
-    const head = document.getElementById('dist-head');
-    head.innerHTML = `<tr>
-      <th class="lang-h">Language</th>
-      ${scores.map(s => `<th class="score-h">Rating ${s}</th>`).join('')}
-      <th class="total-h">Total texts</th>
-    </tr>`;
-    // body
-    const body = document.getElementById('dist-body');
-    body.innerHTML = langs.map(lang => {
-      const d     = LANG_DIST[lang] || {};
-      const total = Object.values(d).reduce((a,b) => a+b, 0);
-      const cells = scores.map(s => {
-        const n   = d[s] || 0;
-        const bar = maxCount > 0 ? Math.round((n / maxCount) * 48) : 0;
-        return `<td class="count-d">
-          ${n > 0 ? `${n}<span class="dist-bar" style="width:${bar}px"></span>` : '<span style="color:#2d3748">—</span>'}
-        </td>`;
-      }).join('');
-      return `<tr>
-        <td class="lang-d">${langName(lang)}</td>
-        ${cells}
-        <td class="total-d">${total.toLocaleString()}</td>
-      </tr>`;
-    }).join('');
-  }
   // table
   function render() {
     renderChips();
     const visLangs = selLangs.length > 0 ? [...selLangs].sort() : [...ALL_LANGS];
     let rows = ALL_ROWS.map(row => {
@@ -430,7 +333,6 @@
       return sortDir * (va - vb);
     });
-    // header
     const head = document.getElementById('lb-head');
     const mkBtn = (label, col) => {
       const active = sortCol === col;
@@ -452,7 +354,6 @@
       };
     });
-    // body
     const body = document.getElementById('lb-body');
     body.innerHTML = rows.map((row, i) => {
       const avgPct   = (row._avg * 100).toFixed(1) + '%';
@@ -482,7 +383,7 @@
     const ctx = document.getElementById('biasChart').getContext('2d');
     const h   = Math.max(260, sorted.length * 26 + 40);
-    document.getElementById('biasChart').style.height = h + 'px';
     new Chart(ctx, {
       type: 'bar',
@@ -544,7 +445,7 @@
     const ctx = document.getElementById('criticalChart').getContext('2d');
     const h   = Math.max(260, sorted.length * 26 + 60);
-    document.getElementById('criticalChart').style.height = h + 'px';
     new Chart(ctx, {
       type: 'bar',
@@ -593,8 +494,8 @@
     const data = [];
     scores.forEach((gt, ri) => {
-      const preds   = conf[gt] || {};
-      const rowSum  = Object.values(preds).reduce((a, b) => a + b, 0);
       scores.forEach((pred, ci) => {
         const v = rowSum > 0 ? (preds[pred] || 0) : 0;
         data.push({ x: ci, y: ri, v });
@@ -606,9 +507,9 @@
     document.getElementById('confusionChart').style.height = '340px';
     function cellColor(ri, ci, v) {
-      if (ri === ci)           return `rgba(22,163,74,${0.15 + v * 0.85})`;
       if (Math.abs(ri-ci)>=3) return `rgba(220,38,38,${v * 0.9})`;
-      return                          `rgba(37,99,235,${v * 0.75})`;
     }
     confChartInstance = new Chart(ctx, {
@@ -624,12 +525,24 @@
           } }
         },
         scales: {
-          x: { type: 'linear', min: -0.5, max: 5.5,
-            ticks: { stepSize: 1, callback: v => 'Pred ' + (scores[v] || ''), color: '#64748b', font: { family: 'JetBrains Mono', size: 10 } },
             grid: { color: '#1a2236' },
           },
-          y: { type: 'linear', min: -0.5, max: 5.5,
-            ticks: { stepSize: 1, callback: v => 'GT ' + (scores[v] || ''), color: '#64748b', font: { family: 'JetBrains Mono', size: 10 } },
             grid: { color: '#1a2236' },
           }
         }
@@ -638,8 +551,8 @@
         id: 'heatmap',
         afterDraw(chart) {
           const {ctx, scales: {x, y}} = chart;
-          const cellW = x.getPixelForValue(1) - x.getPixelForValue(0);
-          const cellH = y.getPixelForValue(0) - y.getPixelForValue(1);
           data.forEach(d => {
             const cx = x.getPixelForValue(d.x);
             const cy = y.getPixelForValue(d.y);
@@ -660,7 +573,6 @@
   render();
   renderChart();
-  // renderDist();  // disabled for testing
   renderBias();
   renderCritical();
   populateConfSelect();

       color: #64748b; font-size: 12px; margin-bottom: 20px;
       line-height: 1.8;
     }
+    .methodology-box {
+      background: #111827; border: 1px solid #1e2a3a;
+      border-radius: 8px; padding: 18px 22px;
+      margin-bottom: 24px; max-width: 900px;
+    }
+    .methodology-box h3 {
+      font-family: 'Syne', sans-serif; font-size: 14px;
+      color: #e2e8f0; margin-bottom: 8px; font-weight: 700;
+    }
+    .methodology-box p, .methodology-box li {
+      font-size: 11.5px; color: #94a3b8; line-height: 1.6;
+    }
+    .methodology-box ul {
+      margin-top: 8px; padding-left: 20px;
+    }
+    .methodology-box li { margin-bottom: 4px; }
+    .highlight { color: #7dd3fc; font-weight: 600; }
     .sep { color: #334155; margin: 0 8px; }
     .scoring-note {
       display: inline-flex; gap: 16px; flex-wrap: wrap;
     }
     .scoring-note span { display: flex; align-items: center; gap: 5px; }
     .dot { width: 9px; height: 9px; border-radius: 50%; flex-shrink: 0; }
     .filter-label { font-size: 10px; text-transform: uppercase; letter-spacing: 0.1em; color: #475569; margin-bottom: 10px; }
     #chips { display: flex; flex-wrap: wrap; gap: 6px; margin-bottom: 24px; }
     .chip {
     }
     .chip:hover { border-color: #7dd3fc; color: #e2e8f0; }
     .chip.active { background: #1e3a5f; border-color: #38bdf8; color: #7dd3fc; }
     .metric-toggle {
       display: flex; width: fit-content;
       border: 1px solid #1e2a3a; border-radius: 6px;
       border: none; background: #131820; color: #64748b; transition: all .15s;
     }
     .mt-btn.active { background: #1e3a5f; color: #7dd3fc; }
     .table-wrap {
       overflow-x: auto; border-radius: 10px;
       border: 1px solid #1e2a3a; margin-bottom: 52px;
       border-right: 1px solid rgba(255,255,255,0.04);
     }
     td.empty { text-align: center; color: #2d3748; background: #111520; border-right: 1px solid rgba(255,255,255,0.04); }
     .section-title {
       font-family: 'Syne', sans-serif; font-size: 18px; font-weight: 700;
       color: #f1f5f9; margin-bottom: 4px;
       background: #111827; border: 1px solid #1e2a3a;
       border-radius: 10px; padding: 24px 20px;
     }
     .analysis-card {
       background: #111827; border: 1px solid #1e2a3a;
       border-radius: 10px; padding: 22px 20px;
       font-family: 'Syne', sans-serif; font-size: 14px; font-weight: 700;
       color: #f1f5f9; margin-bottom: 4px;
     }
+    .chart-scroll-wrap {
+      max-height: 380px;
+      overflow-y: auto;
+      overflow-x: hidden;
+      padding-right: 8px;
+    }
     .analysis-card .card-sub {
       font-size: 11px; color: #475569; margin-bottom: 16px; line-height: 1.5;
     }
     }
     .model-select:focus { outline: none; border-color: #38bdf8; }
     .footer { margin-top: 20px; font-size: 11px; color: #2d3748; text-align: right; }
+    ::-webkit-scrollbar { width: 6px; height: 6px; background: #0d1117; }
     ::-webkit-scrollbar-thumb { background: #2d3748; border-radius: 3px; }
   </style>
 </head>
   <h1>Text Quality Rating Benchmark</h1>
   <p class="meta-subtitle">
     LLM accuracy at rating text quality on a 1–6 scale across multiple languages
     <span class="sep">·</span> Documents sourced from FineWeb dataset
   </p>
+  <div class="methodology-box">
+    <h3>Methodology</h3>
+    <p>The core objective of this benchmark is to evaluate how effectively Large Language Models can assess text quality, simulating the process of filtering data for LLM pre-training. The dataset curation followed a strict pipeline:</p>
+    <ul>
+      <li><span class="highlight">Initial Scoring:</span> Multilingual texts sampled from the FineWeb dataset were evaluated by <strong>DeepSeek V3.2</strong>, which assigned them a quality and substantiveness rating on a scale from 1 (lowest quality) to 6 (highest quality).</li>
+      <li><span class="highlight">Verification:</span> These initial scores were subsequently verified by an independent judge, <strong>Gemini 3 Flash</strong>.</li>
+      <li><span class="highlight">Filtering:</span> To ensure the highest ground-truth reliability, only the documents that received the absolute highest approval rating during the Gemini verification phase were included in this benchmark.</li>
+      <li><span class="highlight">Version:</span> 1.0. Note: German (de) is excluded in this version.</li>
+    </ul>
+  </div>
   <div class="scoring-note">
     <span><span class="dot" style="background:#22c55e"></span>Exact match = 1.0 pt</span>
     <span><span class="dot" style="background:#eab308"></span>Off by ±1 = 0.5 pt</span>
     </table>
   </div>
   <p class="section-title" style="margin-top:52px">Global Model Comparison</p>
   <p class="subtitle" style="margin-bottom:20px">Weighted Score vs Exact Accuracy — all languages combined, sorted by Weighted Score</p>
   <div class="chart-wrap">
   <p class="section-title" style="margin-bottom:4px;margin-top:52px">Model Error Analysis</p>
   <p class="subtitle" style="margin-bottom:20px">Bias, critical misclassifications and confusion patterns</p>
+  <!-- Bias lollipop — full width -->
   <div class="analysis-card" style="margin-bottom:24px">
     <h3>Prediction Bias</h3>
     <p class="card-sub">Average error (predicted − ground truth). Negative = underestimation, positive = overestimation.</p>
+    <div class="chart-scroll-wrap">
+      <div id="biasChartContainer" style="position:relative">
+        <canvas id="biasChart"></canvas>
+      </div>
     </div>
   </div>
+  <!-- Critical confusion — full width, below bias -->
   <div class="analysis-card" style="margin-bottom:52px">
     <h3>Critical Confusion Rate</h3>
     <p class="card-sub">
       % of low-quality texts (rating 1–2) predicted as high-quality (5–6) and vice versa.
       These are the most dangerous misclassifications.
     </p>
+    <div class="chart-scroll-wrap">
+      <div id="criticalChartContainer" style="position:relative">
+        <canvas id="criticalChart"></canvas>
+      </div>
     </div>
   </div>
   const ALL_LANGS   = ["ar", "az", "be", "bg", "bo", "ca", "cn", "cs", "da", "el", "es", "et", "eu", "fa", "fi", "fr", "gl", "hu", "hv", "is", "it", "ka", "la", "li", "lv", "mk", "mt", "nl", "no", "pt", "ro", "ru", "sk", "sl", "sq", "sr", "stack", "sv", "tr", "uk"];
   const LANG_NAMES  = {"af": "Afrikaans", "ab": "Arabic", "az": "Azerbaijani", "be": "Belarusian", "bo": "Bosnian", "bg": "Bulgarian", "bn": "Brunei", "ca": "Catalan", "cs": "Czech", "cn": "Chinese", "cy": "Welsh", "da": "Danish", "de": "German", "el": "Greek", "en": "English", "eo": "Esperanto", "es": "Spanish", "et": "Estonian", "eu": "Basque", "fa": "Faroese", "fi": "Finnish", "fr": "French", "ga": "Irish", "gl": "Galician", "gu": "Gujarati", "he": "Hebrew", "hi": "Hindi", "hr": "Croatian", "hu": "Hungarian", "ar": "Armenian", "hv": "Croatia", "id": "Indonesian", "is": "Icelandic", "it": "Italian", "ja": "Japanese", "ka": "Georgian", "kk": "Kazakh", "km": "Khmer", "kn": "Kannada", "ko": "Korean", "la": "Latin", "li": "Lithuanian", "lv": "Latvian", "mk": "Macedonian", "ml": "Malayalam", "mn": "Mongolian", "mr": "Marathi", "ms": "Malay", "mt": "Maltese", "my": "Burmese", "ne": "Nepali", "nl": "Dutch", "no": "Norwegian", "pa": "Punjabi", "pe": "Persian", "pl": "Polish", "pt": "Portuguese", "ro": "Romanian", "ru": "Russian", "si": "Sinhala", "sk": "Slovak", "sl": "Slovenian", "sq": "Albanian", "sr": "Serbian", "sv": "Swedish", "sw": "Swahili", "ta": "Tamil", "te": "Telugu", "th": "Thai", "tl": "Filipino", "tr": "Turkish", "uk": "Ukrainian", "ur": "Urdu", "uz": "Uzbek", "vi": "Vietnamese", "zh": "Chinese", "zu": "Zulu"};
   const LANG_COUNTS = {"ar": 1790, "az": 1795, "be": 1794, "bg": 1721, "bo": 1795, "ca": 1712, "cn": 2686, "cs": 1779, "da": 1784, "el": 1794, "es": 1652, "et": 1788, "eu": 2684, "fa": 2697, "fi": 1784, "fr": 1761, "gl": 2698, "hu": 1796, "hv": 1796, "is": 1790, "it": 1791, "ka": 1629, "la": 2522, "li": 1793, "lv": 1795, "mk": 1799, "mt": 1797, "nl": 1795, "no": 1799, "pt": 1795, "ro": 1790, "ru": 1424, "sk": 1784, "sl": 1788, "sq": 1793, "sr": 1798, "stack": 2590, "sv": 1797, "tr": 1799, "uk": 1747};
   function langName(c) {
     // Prefer the display name registered in LANG_NAMES; otherwise show the raw code upper-cased.
     const displayName = LANG_NAMES[c];
     if (displayName) return displayName;
     return c.toUpperCase();
   }
     ALL_LANGS.forEach(l => mk(langName(l), l, selLangs.includes(l)));
   }
+  // global chart
   let chartInstance = null;
   function renderChart() {
     const sorted = [...ALL_ROWS].sort((a, b) => b.avg_wp - a.avg_wp);
       data: {
         labels,
         datasets: [
+          { label: 'Weighted Score', data: wpData, backgroundColor: '#2563eb', borderRadius: 3, barPercentage: 0.72 },
+          { label: 'Exact Accuracy', data: exData, backgroundColor: '#16a34a', borderRadius: 3, barPercentage: 0.72 },
         ]
       },
       options: {
+        indexAxis: 'y', responsive: true, maintainAspectRatio: false, animation: { duration: 500 },
         plugins: {
+          legend: { position: 'bottom', labels: { color: '#94a3b8', font: { family: 'JetBrains Mono', size: 11 }, boxWidth: 14, padding: 20 } },
+          tooltip: { backgroundColor: '#1e2a3a', titleColor: '#e2e8f0', bodyColor: '#94a3b8', callbacks: { label: ctx => ` ${ctx.dataset.label}: ${ctx.parsed.x}%` } },
         },
         scales: {
+          x: { min: 0, max: 108, grid: { color: '#1a2236' }, ticks: { color: '#64748b', font: { family: 'JetBrains Mono', size: 10 }, callback: v => v + '%' }, title: { display: true, text: 'Percent (%)', color: '#64748b', font: { family: 'JetBrains Mono', size: 11 } } },
+          y: { grid: { display: false }, ticks: { color: '#cbd5e1', font: { family: 'JetBrains Mono', size: 11 } } }
         }
       }
     });
   }
   // table
   function render() {
     renderChips();
     const visLangs = selLangs.length > 0 ? [...selLangs].sort() : [...ALL_LANGS];
     let rows = ALL_ROWS.map(row => {
       return sortDir * (va - vb);
     });
     const head = document.getElementById('lb-head');
     const mkBtn = (label, col) => {
       const active = sortCol === col;
       };
     });
     const body = document.getElementById('lb-body');
     body.innerHTML = rows.map((row, i) => {
       const avgPct   = (row._avg * 100).toFixed(1) + '%';
     const ctx = document.getElementById('biasChart').getContext('2d');
     const h   = Math.max(260, sorted.length * 26 + 40);
+    document.getElementById('biasChartContainer').style.height = h + 'px';
     new Chart(ctx, {
       type: 'bar',
     const ctx = document.getElementById('criticalChart').getContext('2d');
     const h   = Math.max(260, sorted.length * 26 + 60);
+    document.getElementById('criticalChartContainer').style.height = h + 'px';
     new Chart(ctx, {
       type: 'bar',
     const data = [];
     scores.forEach((gt, ri) => {
+      const preds  = conf[gt] || {};
+      const rowSum = Object.values(preds).reduce((a, b) => a + b, 0);
       scores.forEach((pred, ci) => {
         const v = rowSum > 0 ? (preds[pred] || 0) : 0;
         data.push({ x: ci, y: ri, v });
     document.getElementById('confusionChart').style.height = '340px';
     function cellColor(ri, ci, v) {
+      if (ri === ci)          return `rgba(22,163,74,${0.15 + v * 0.85})`;
       if (Math.abs(ri-ci)>=3) return `rgba(220,38,38,${v * 0.9})`;
+      return                         `rgba(37,99,235,${v * 0.75})`;
     }
     confChartInstance = new Chart(ctx, {
           } }
         },
         scales: {
+          x: {
+            type: 'linear', min: -0.5, max: 5.5,
+            ticks: {
+              stepSize: 1,
+              callback: v => (Number.isInteger(v) && v >= 0 && v <= 5) ? 'Pred ' + scores[v] : '',
+              color: '#64748b', font: { family: 'JetBrains Mono', size: 10 }
+            },
             grid: { color: '#1a2236' },
+            position: 'top'
           },
+          y: {
+            type: 'linear', min: -0.5, max: 5.5,
+            reverse: true,
+            ticks: {
+              stepSize: 1,
+              callback: v => (Number.isInteger(v) && v >= 0 && v <= 5) ? 'GT ' + scores[v] : '',
+              color: '#64748b', font: { family: 'JetBrains Mono', size: 10 }
+            },
             grid: { color: '#1a2236' },
           }
         }
         id: 'heatmap',
         afterDraw(chart) {
           const {ctx, scales: {x, y}} = chart;
+          const cellW = Math.abs(x.getPixelForValue(1) - x.getPixelForValue(0));
+          const cellH = Math.abs(y.getPixelForValue(1) - y.getPixelForValue(0));
           data.forEach(d => {
             const cx = x.getPixelForValue(d.x);
             const cy = y.getPixelForValue(d.y);
   render();
   renderChart();
   renderBias();
   renderCritical();
   populateConfSelect();