adgw committed on
Commit
28913ff
·
verified ·
1 Parent(s): 00e5e3d

Update benchmark leaderboard

Browse files
Files changed (1) hide show
  1. index.html +77 -165
index.html CHANGED
@@ -24,6 +24,23 @@
24
  color: #64748b; font-size: 12px; margin-bottom: 20px;
25
  line-height: 1.8;
26
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  .sep { color: #334155; margin: 0 8px; }
28
  .scoring-note {
29
  display: inline-flex; gap: 16px; flex-wrap: wrap;
@@ -33,7 +50,6 @@
33
  }
34
  .scoring-note span { display: flex; align-items: center; gap: 5px; }
35
  .dot { width: 9px; height: 9px; border-radius: 50%; flex-shrink: 0; }
36
-
37
  .filter-label { font-size: 10px; text-transform: uppercase; letter-spacing: 0.1em; color: #475569; margin-bottom: 10px; }
38
  #chips { display: flex; flex-wrap: wrap; gap: 6px; margin-bottom: 24px; }
39
  .chip {
@@ -44,7 +60,6 @@
44
  }
45
  .chip:hover { border-color: #7dd3fc; color: #e2e8f0; }
46
  .chip.active { background: #1e3a5f; border-color: #38bdf8; color: #7dd3fc; }
47
-
48
  .metric-toggle {
49
  display: flex; width: fit-content;
50
  border: 1px solid #1e2a3a; border-radius: 6px;
@@ -55,7 +70,6 @@
55
  border: none; background: #131820; color: #64748b; transition: all .15s;
56
  }
57
  .mt-btn.active { background: #1e3a5f; color: #7dd3fc; }
58
-
59
  .table-wrap {
60
  overflow-x: auto; border-radius: 10px;
61
  border: 1px solid #1e2a3a; margin-bottom: 52px;
@@ -85,7 +99,6 @@
85
  border-right: 1px solid rgba(255,255,255,0.04);
86
  }
87
  td.empty { text-align: center; color: #2d3748; background: #111520; border-right: 1px solid rgba(255,255,255,0.04); }
88
-
89
  .section-title {
90
  font-family: 'Syne', sans-serif; font-size: 18px; font-weight: 700;
91
  color: #f1f5f9; margin-bottom: 4px;
@@ -94,35 +107,6 @@
94
  background: #111827; border: 1px solid #1e2a3a;
95
  border-radius: 10px; padding: 24px 20px;
96
  }
97
- .dist-wrap {
98
- overflow-x: auto; border-radius: 10px;
99
- border: 1px solid #1e2a3a; margin-bottom: 16px;
100
- }
101
- .dist-wrap table { border-collapse: collapse; width: auto; min-width: 100%; font-size: 12px; }
102
- .dist-wrap thead tr { background: #111827; border-bottom: 2px solid #1e2a3a; }
103
- .dist-wrap th {
104
- padding: 10px 10px; white-space: nowrap; font-size: 10px;
105
- text-transform: uppercase; letter-spacing: 0.07em; color: #475569; font-weight: 700;
106
- }
107
- .dist-wrap th.lang-h { text-align: left; width: 140px; padding-left: 14px; color: #64748b; }
108
- .dist-wrap th.score-h { width: 70px; text-align: center; }
109
- .dist-wrap th.total-h { width: 80px; text-align: center; color: #94a3b8; }
110
- .dist-wrap td { padding: 8px 10px; border-bottom: 1px solid #0f1520; white-space: nowrap; }
111
- .dist-wrap td.lang-d { padding-left: 14px; color: #cbd5e1; font-weight: 600; font-size: 12px; }
112
- .dist-wrap td.count-d { text-align: center; font-size: 12px; }
113
- .dist-wrap td.total-d { text-align: center; font-weight: 700; font-size: 12px; color: #94a3b8; }
114
- .dist-bar {
115
- display: inline-block; height: 6px; border-radius: 3px;
116
- background: #2563eb; vertical-align: middle; margin-left: 4px; opacity: 0.7;
117
- }
118
- /* ── analysis sections ── */
119
- .analysis-grid {
120
- display: grid;
121
- grid-template-columns: 1fr 1fr;
122
- gap: 24px;
123
- margin-bottom: 52px;
124
- }
125
- @media (max-width: 900px) { .analysis-grid { grid-template-columns: 1fr; } }
126
  .analysis-card {
127
  background: #111827; border: 1px solid #1e2a3a;
128
  border-radius: 10px; padding: 22px 20px;
@@ -131,6 +115,12 @@
131
  font-family: 'Syne', sans-serif; font-size: 14px; font-weight: 700;
132
  color: #f1f5f9; margin-bottom: 4px;
133
  }
 
 
 
 
 
 
134
  .analysis-card .card-sub {
135
  font-size: 11px; color: #475569; margin-bottom: 16px; line-height: 1.5;
136
  }
@@ -141,7 +131,7 @@
141
  }
142
  .model-select:focus { outline: none; border-color: #38bdf8; }
143
  .footer { margin-top: 20px; font-size: 11px; color: #2d3748; text-align: right; }
144
- ::-webkit-scrollbar { height: 5px; background: #0d1117; }
145
  ::-webkit-scrollbar-thumb { background: #2d3748; border-radius: 3px; }
146
  </style>
147
  </head>
@@ -149,10 +139,20 @@
149
  <h1>Text Quality Rating Benchmark</h1>
150
  <p class="meta-subtitle">
151
  LLM accuracy at rating text quality on a 1–6 scale across multiple languages
152
- <span class="sep">·</span> Labeled by DeepSeek V3.2 &amp; judged by Gemini 3 Flash
153
  <span class="sep">·</span> Documents sourced from FineWeb dataset
154
  </p>
155
 
 
 
 
 
 
 
 
 
 
 
 
156
  <div class="scoring-note">
157
  <span><span class="dot" style="background:#22c55e"></span>Exact match = 1.0 pt</span>
158
  <span><span class="dot" style="background:#eab308"></span>Off by ±1 = 0.5 pt</span>
@@ -174,17 +174,6 @@
174
  </table>
175
  </div>
176
 
177
- <!-- DISTRIBUTION SECTION (disabled for testing)
178
- <p class="section-title">Dataset Distribution</p>
179
- <p class="subtitle" style="margin-bottom:20px">Number of unique texts per rating score (1–6) for each language</p>
180
- <div class="dist-wrap">
181
- <table id="dist-table">
182
- <thead id="dist-head"></thead>
183
- <tbody id="dist-body"></tbody>
184
- </table>
185
- </div>
186
- -->
187
-
188
  <p class="section-title" style="margin-top:52px">Global Model Comparison</p>
189
  <p class="subtitle" style="margin-bottom:20px">Weighted Score vs Exact Accuracy — all languages combined, sorted by Weighted Score</p>
190
  <div class="chart-wrap">
@@ -194,24 +183,28 @@
194
  <p class="section-title" style="margin-bottom:4px;margin-top:52px">Model Error Analysis</p>
195
  <p class="subtitle" style="margin-bottom:20px">Bias, critical misclassifications and confusion patterns</p>
196
 
197
- <!-- Bias lollipop -->
198
  <div class="analysis-card" style="margin-bottom:24px">
199
  <h3>Prediction Bias</h3>
200
  <p class="card-sub">Average error (predicted − ground truth). Negative = underestimation, positive = overestimation.</p>
201
- <div id="biasChartContainer" style="position:relative">
202
- <canvas id="biasChart"></canvas>
 
 
203
  </div>
204
  </div>
205
 
206
- <!-- Critical confusion 1/2 vs 5/6 -->
207
  <div class="analysis-card" style="margin-bottom:52px">
208
  <h3>Critical Confusion Rate</h3>
209
  <p class="card-sub">
210
  % of low-quality texts (rating 1–2) predicted as high-quality (5–6) and vice versa.
211
  These are the most dangerous misclassifications.
212
  </p>
213
- <div id="criticalChartContainer" style="position:relative">
214
- <canvas id="criticalChart"></canvas>
 
 
215
  </div>
216
  </div>
217
 
@@ -233,7 +226,6 @@
233
  const ALL_LANGS = ["ar", "az", "be", "bg", "bo", "ca", "cn", "cs", "da", "el", "es", "et", "eu", "fa", "fi", "fr", "gl", "hu", "hv", "is", "it", "ka", "la", "li", "lv", "mk", "mt", "nl", "no", "pt", "ro", "ru", "sk", "sl", "sq", "sr", "stack", "sv", "tr", "uk"];
234
  const LANG_NAMES = {"af": "Afrikaans", "ab": "Arabic", "az": "Azerbaijani", "be": "Belarusian", "bo": "Bosnian", "bg": "Bulgarian", "bn": "Brunei", "ca": "Catalan", "cs": "Czech", "cn": "Chinese", "cy": "Welsh", "da": "Danish", "de": "German", "el": "Greek", "en": "English", "eo": "Esperanto", "es": "Spanish", "et": "Estonian", "eu": "Basque", "fa": "Faroese", "fi": "Finnish", "fr": "French", "ga": "Irish", "gl": "Galician", "gu": "Gujarati", "he": "Hebrew", "hi": "Hindi", "hr": "Croatian", "hu": "Hungarian", "ar": "Armenian", "hv": "Croatia", "id": "Indonesian", "is": "Icelandic", "it": "Italian", "ja": "Japanese", "ka": "Georgian", "kk": "Kazakh", "km": "Khmer", "kn": "Kannada", "ko": "Korean", "la": "Latin", "li": "Lithuanian", "lv": "Latvian", "mk": "Macedonian", "ml": "Malayalam", "mn": "Mongolian", "mr": "Marathi", "ms": "Malay", "mt": "Maltese", "my": "Burmese", "ne": "Nepali", "nl": "Dutch", "no": "Norwegian", "pa": "Punjabi", "pe": "Persian", "pl": "Polish", "pt": "Portuguese", "ro": "Romanian", "ru": "Russian", "si": "Sinhala", "sk": "Slovak", "sl": "Slovenian", "sq": "Albanian", "sr": "Serbian", "sv": "Swedish", "sw": "Swahili", "ta": "Tamil", "te": "Telugu", "th": "Thai", "tl": "Filipino", "tr": "Turkish", "uk": "Ukrainian", "ur": "Urdu", "uz": "Uzbek", "vi": "Vietnamese", "zh": "Chinese", "zu": "Zulu"};
235
  const LANG_COUNTS = {"ar": 1790, "az": 1795, "be": 1794, "bg": 1721, "bo": 1795, "ca": 1712, "cn": 2686, "cs": 1779, "da": 1784, "el": 1794, "es": 1652, "et": 1788, "eu": 2684, "fa": 2697, "fi": 1784, "fr": 1761, "gl": 2698, "hu": 1796, "hv": 1796, "is": 1790, "it": 1791, "ka": 1629, "la": 2522, "li": 1793, "lv": 1795, "mk": 1799, "mt": 1797, "nl": 1795, "no": 1799, "pt": 1795, "ro": 1790, "ru": 1424, "sk": 1784, "sl": 1788, "sq": 1793, "sr": 1798, "stack": 2590, "sv": 1797, "tr": 1799, "uk": 1747};
236
- const LANG_DIST = {"ar": {"1": 100, "2": 100, "5": 100, "6": 100}, "az": {"1": 100, "2": 100, "5": 100, "6": 100}, "be": {"1": 100, "2": 100, "5": 100, "6": 100}, "bg": {"1": 100, "2": 100, "5": 100, "6": 62}, "bo": {"1": 100, "2": 100, "5": 100, "6": 100}, "ca": {"1": 73, "2": 100, "5": 100, "6": 86}, "cn": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "cs": {"1": 100, "2": 100, "5": 100, "6": 100}, "da": {"1": 100, "2": 100, "5": 100, "6": 100}, "el": {"1": 100, "2": 100, "5": 100, "6": 100}, "es": {"1": 100, "2": 100, "5": 100, "6": 33}, "et": {"1": 100, "2": 100, "5": 100, "6": 100}, "eu": {"1": 97, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "fa": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "fi": {"1": 100, "2": 100, "5": 100, "6": 100}, "fr": {"1": 100, "2": 100, "5": 100, "6": 82}, "gl": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "hu": {"1": 100, "2": 100, "5": 100, "6": 100}, "hv": {"1": 100, "2": 100, "5": 100, "6": 100}, "is": {"1": 100, "2": 100, "5": 100, "6": 100}, "it": {"1": 100, "2": 100, "5": 100, "6": 100}, "ka": {"1": 18, "2": 100, "5": 100, "6": 100}, "la": {"1": 100, "2": 96, "3": 17, "4": 100, "5": 100, "6": 100}, "li": {"1": 100, "2": 100, "5": 100, "6": 100}, "lv": {"1": 100, "2": 100, "5": 100, "6": 100}, "mk": {"1": 100, "2": 100, "5": 100, "6": 100}, "mt": {"1": 100, "2": 100, "5": 100, "6": 100}, "nl": {"1": 100, "2": 100, "5": 100, "6": 100}, "no": {"1": 100, "2": 100, "5": 100, "6": 100}, "pt": {"1": 100, "2": 100, "5": 100, "6": 100}, "ro": {"1": 100, "2": 100, "5": 100, "6": 100}, "ru": {"1": 100, "2": 100, "5": 100, "6": 3}, "sk": {"1": 100, "2": 100, "5": 100, "6": 100}, "sl": {"1": 100, "2": 100, "5": 100, "6": 100}, "sq": {"1": 100, "2": 100, "5": 100, "6": 100}, "sr": {"1": 100, "2": 100, "5": 100, "6": 100}, "stack": {"1": 49, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "sv": {"1": 100, "2": 100, "5": 100, "6": 100}, "tr": {"1": 100, "2": 100, "5": 100, "6": 
100}, "uk": {"1": 100, "2": 100, "5": 100, "6": 74}};
237
 
238
  function langName(c) { return LANG_NAMES[c] || c.toUpperCase(); }
239
 
@@ -283,7 +275,7 @@
283
  ALL_LANGS.forEach(l => mk(langName(l), l, selLangs.includes(l)));
284
  }
285
 
286
- // chart
287
  let chartInstance = null;
288
  function renderChart() {
289
  const sorted = [...ALL_ROWS].sort((a, b) => b.avg_wp - a.avg_wp);
@@ -302,116 +294,27 @@
302
  data: {
303
  labels,
304
  datasets: [
305
- {
306
- label: 'Weighted Score',
307
- data: wpData,
308
- backgroundColor: '#2563eb',
309
- borderRadius: 3,
310
- barPercentage: 0.72,
311
- },
312
- {
313
- label: 'Exact Accuracy',
314
- data: exData,
315
- backgroundColor: '#16a34a',
316
- borderRadius: 3,
317
- barPercentage: 0.72,
318
- },
319
  ]
320
  },
321
  options: {
322
- indexAxis: 'y',
323
- responsive: true,
324
- maintainAspectRatio: false,
325
- animation: { duration: 500 },
326
  plugins: {
327
- legend: {
328
- position: 'bottom',
329
- labels: {
330
- color: '#94a3b8',
331
- font: { family: 'JetBrains Mono', size: 11 },
332
- boxWidth: 14, padding: 20,
333
- }
334
- },
335
- tooltip: {
336
- backgroundColor: '#1e2a3a',
337
- titleColor: '#e2e8f0',
338
- bodyColor: '#94a3b8',
339
- callbacks: {
340
- label: ctx => ` ${ctx.dataset.label}: ${ctx.parsed.x}%`
341
- }
342
- },
343
  },
344
  scales: {
345
- x: {
346
- min: 0, max: 108,
347
- grid: { color: '#1a2236' },
348
- ticks: {
349
- color: '#64748b',
350
- font: { family: 'JetBrains Mono', size: 10 },
351
- callback: v => v + '%',
352
- },
353
- title: {
354
- display: true, text: 'Percent (%)',
355
- color: '#64748b',
356
- font: { family: 'JetBrains Mono', size: 11 },
357
- }
358
- },
359
- y: {
360
- grid: { display: false },
361
- ticks: {
362
- color: '#cbd5e1',
363
- font: { family: 'JetBrains Mono', size: 11 },
364
- }
365
- }
366
  }
367
  }
368
  });
369
  }
370
 
371
- // distribution table
372
- function renderDist() {
373
- const scores = [1, 2, 3, 4, 5, 6];
374
- const langs = [...ALL_LANGS];
375
-
376
- // max count for bar scaling
377
- let maxCount = 0;
378
- langs.forEach(l => {
379
- const d = LANG_DIST[l] || {};
380
- scores.forEach(s => { if ((d[s]||0) > maxCount) maxCount = d[s]||0; });
381
- });
382
-
383
- // header
384
- const head = document.getElementById('dist-head');
385
- head.innerHTML = `<tr>
386
- <th class="lang-h">Language</th>
387
- ${scores.map(s => `<th class="score-h">Rating ${s}</th>`).join('')}
388
- <th class="total-h">Total texts</th>
389
- </tr>`;
390
-
391
- // body
392
- const body = document.getElementById('dist-body');
393
- body.innerHTML = langs.map(lang => {
394
- const d = LANG_DIST[lang] || {};
395
- const total = Object.values(d).reduce((a,b) => a+b, 0);
396
- const cells = scores.map(s => {
397
- const n = d[s] || 0;
398
- const bar = maxCount > 0 ? Math.round((n / maxCount) * 48) : 0;
399
- return `<td class="count-d">
400
- ${n > 0 ? `${n}<span class="dist-bar" style="width:${bar}px"></span>` : '<span style="color:#2d3748">—</span>'}
401
- </td>`;
402
- }).join('');
403
- return `<tr>
404
- <td class="lang-d">${langName(lang)}</td>
405
- ${cells}
406
- <td class="total-d">${total.toLocaleString()}</td>
407
- </tr>`;
408
- }).join('');
409
- }
410
-
411
  // table
412
  function render() {
413
  renderChips();
414
-
415
  const visLangs = selLangs.length > 0 ? [...selLangs].sort() : [...ALL_LANGS];
416
 
417
  let rows = ALL_ROWS.map(row => {
@@ -430,7 +333,6 @@
430
  return sortDir * (va - vb);
431
  });
432
 
433
- // header
434
  const head = document.getElementById('lb-head');
435
  const mkBtn = (label, col) => {
436
  const active = sortCol === col;
@@ -452,7 +354,6 @@
452
  };
453
  });
454
 
455
- // body
456
  const body = document.getElementById('lb-body');
457
  body.innerHTML = rows.map((row, i) => {
458
  const avgPct = (row._avg * 100).toFixed(1) + '%';
@@ -482,7 +383,7 @@
482
 
483
  const ctx = document.getElementById('biasChart').getContext('2d');
484
  const h = Math.max(260, sorted.length * 26 + 40);
485
- document.getElementById('biasChart').style.height = h + 'px';
486
 
487
  new Chart(ctx, {
488
  type: 'bar',
@@ -544,7 +445,7 @@
544
 
545
  const ctx = document.getElementById('criticalChart').getContext('2d');
546
  const h = Math.max(260, sorted.length * 26 + 60);
547
- document.getElementById('criticalChart').style.height = h + 'px';
548
 
549
  new Chart(ctx, {
550
  type: 'bar',
@@ -593,8 +494,8 @@
593
 
594
  const data = [];
595
  scores.forEach((gt, ri) => {
596
- const preds = conf[gt] || {};
597
- const rowSum = Object.values(preds).reduce((a, b) => a + b, 0);
598
  scores.forEach((pred, ci) => {
599
  const v = rowSum > 0 ? (preds[pred] || 0) : 0;
600
  data.push({ x: ci, y: ri, v });
@@ -606,9 +507,9 @@
606
  document.getElementById('confusionChart').style.height = '340px';
607
 
608
  function cellColor(ri, ci, v) {
609
- if (ri === ci) return `rgba(22,163,74,${0.15 + v * 0.85})`;
610
  if (Math.abs(ri-ci)>=3) return `rgba(220,38,38,${v * 0.9})`;
611
- return `rgba(37,99,235,${v * 0.75})`;
612
  }
613
 
614
  confChartInstance = new Chart(ctx, {
@@ -624,12 +525,24 @@
624
  } }
625
  },
626
  scales: {
627
- x: { type: 'linear', min: -0.5, max: 5.5,
628
- ticks: { stepSize: 1, callback: v => 'Pred ' + (scores[v] || ''), color: '#64748b', font: { family: 'JetBrains Mono', size: 10 } },
 
 
 
 
 
629
  grid: { color: '#1a2236' },
 
630
  },
631
- y: { type: 'linear', min: -0.5, max: 5.5,
632
- ticks: { stepSize: 1, callback: v => 'GT ' + (scores[v] || ''), color: '#64748b', font: { family: 'JetBrains Mono', size: 10 } },
 
 
 
 
 
 
633
  grid: { color: '#1a2236' },
634
  }
635
  }
@@ -638,8 +551,8 @@
638
  id: 'heatmap',
639
  afterDraw(chart) {
640
  const {ctx, scales: {x, y}} = chart;
641
- const cellW = x.getPixelForValue(1) - x.getPixelForValue(0);
642
- const cellH = y.getPixelForValue(0) - y.getPixelForValue(1);
643
  data.forEach(d => {
644
  const cx = x.getPixelForValue(d.x);
645
  const cy = y.getPixelForValue(d.y);
@@ -660,7 +573,6 @@
660
 
661
  render();
662
  renderChart();
663
- // renderDist(); // disabled for testing
664
  renderBias();
665
  renderCritical();
666
  populateConfSelect();
 
24
  color: #64748b; font-size: 12px; margin-bottom: 20px;
25
  line-height: 1.8;
26
  }
27
+ .methodology-box {
28
+ background: #111827; border: 1px solid #1e2a3a;
29
+ border-radius: 8px; padding: 18px 22px;
30
+ margin-bottom: 24px; max-width: 900px;
31
+ }
32
+ .methodology-box h3 {
33
+ font-family: 'Syne', sans-serif; font-size: 14px;
34
+ color: #e2e8f0; margin-bottom: 8px; font-weight: 700;
35
+ }
36
+ .methodology-box p, .methodology-box li {
37
+ font-size: 11.5px; color: #94a3b8; line-height: 1.6;
38
+ }
39
+ .methodology-box ul {
40
+ margin-top: 8px; padding-left: 20px;
41
+ }
42
+ .methodology-box li { margin-bottom: 4px; }
43
+ .highlight { color: #7dd3fc; font-weight: 600; }
44
  .sep { color: #334155; margin: 0 8px; }
45
  .scoring-note {
46
  display: inline-flex; gap: 16px; flex-wrap: wrap;
 
50
  }
51
  .scoring-note span { display: flex; align-items: center; gap: 5px; }
52
  .dot { width: 9px; height: 9px; border-radius: 50%; flex-shrink: 0; }
 
53
  .filter-label { font-size: 10px; text-transform: uppercase; letter-spacing: 0.1em; color: #475569; margin-bottom: 10px; }
54
  #chips { display: flex; flex-wrap: wrap; gap: 6px; margin-bottom: 24px; }
55
  .chip {
 
60
  }
61
  .chip:hover { border-color: #7dd3fc; color: #e2e8f0; }
62
  .chip.active { background: #1e3a5f; border-color: #38bdf8; color: #7dd3fc; }
 
63
  .metric-toggle {
64
  display: flex; width: fit-content;
65
  border: 1px solid #1e2a3a; border-radius: 6px;
 
70
  border: none; background: #131820; color: #64748b; transition: all .15s;
71
  }
72
  .mt-btn.active { background: #1e3a5f; color: #7dd3fc; }
 
73
  .table-wrap {
74
  overflow-x: auto; border-radius: 10px;
75
  border: 1px solid #1e2a3a; margin-bottom: 52px;
 
99
  border-right: 1px solid rgba(255,255,255,0.04);
100
  }
101
  td.empty { text-align: center; color: #2d3748; background: #111520; border-right: 1px solid rgba(255,255,255,0.04); }
 
102
  .section-title {
103
  font-family: 'Syne', sans-serif; font-size: 18px; font-weight: 700;
104
  color: #f1f5f9; margin-bottom: 4px;
 
107
  background: #111827; border: 1px solid #1e2a3a;
108
  border-radius: 10px; padding: 24px 20px;
109
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  .analysis-card {
111
  background: #111827; border: 1px solid #1e2a3a;
112
  border-radius: 10px; padding: 22px 20px;
 
115
  font-family: 'Syne', sans-serif; font-size: 14px; font-weight: 700;
116
  color: #f1f5f9; margin-bottom: 4px;
117
  }
118
+ .chart-scroll-wrap {
119
+ max-height: 380px;
120
+ overflow-y: auto;
121
+ overflow-x: hidden;
122
+ padding-right: 8px;
123
+ }
124
  .analysis-card .card-sub {
125
  font-size: 11px; color: #475569; margin-bottom: 16px; line-height: 1.5;
126
  }
 
131
  }
132
  .model-select:focus { outline: none; border-color: #38bdf8; }
133
  .footer { margin-top: 20px; font-size: 11px; color: #2d3748; text-align: right; }
134
+ ::-webkit-scrollbar { width: 6px; height: 6px; background: #0d1117; }
135
  ::-webkit-scrollbar-thumb { background: #2d3748; border-radius: 3px; }
136
  </style>
137
  </head>
 
139
  <h1>Text Quality Rating Benchmark</h1>
140
  <p class="meta-subtitle">
141
  LLM accuracy at rating text quality on a 1–6 scale across multiple languages
 
142
  <span class="sep">·</span> Documents sourced from FineWeb dataset
143
  </p>
144
 
145
+ <div class="methodology-box">
146
+ <h3>Methodology</h3>
147
+ <p>The core objective of this benchmark is to evaluate how effectively Large Language Models can assess text quality, simulating the process of filtering data for LLM pre-training. The dataset curation followed a strict pipeline:</p>
148
+ <ul>
149
+ <li><span class="highlight">Initial Scoring:</span> Multilingual texts sampled from the FineWeb dataset were evaluated by <strong>DeepSeek V3.2</strong>, which assigned them a quality and substantiveness rating on a scale from 1 (lowest quality) to 6 (highest quality).</li>
150
+ <li><span class="highlight">Verification:</span> These initial scores were subsequently verified by an independent judge, <strong>Gemini 3 Flash</strong>.</li>
151
+ <li><span class="highlight">Filtering:</span> To ensure the highest ground-truth reliability, only the documents that received the absolute highest approval rating during the Gemini verification phase were included in this benchmark.</li>
152
+ <li><span class="highlight">Version:</span> 1.0. Note: German (de) is excluded in this version.</li>
153
+ </ul>
154
+ </div>
155
+
156
  <div class="scoring-note">
157
  <span><span class="dot" style="background:#22c55e"></span>Exact match = 1.0 pt</span>
158
  <span><span class="dot" style="background:#eab308"></span>Off by ±1 = 0.5 pt</span>
 
174
  </table>
175
  </div>
176
 
 
 
 
 
 
 
 
 
 
 
 
177
  <p class="section-title" style="margin-top:52px">Global Model Comparison</p>
178
  <p class="subtitle" style="margin-bottom:20px">Weighted Score vs Exact Accuracy — all languages combined, sorted by Weighted Score</p>
179
  <div class="chart-wrap">
 
183
  <p class="section-title" style="margin-bottom:4px;margin-top:52px">Model Error Analysis</p>
184
  <p class="subtitle" style="margin-bottom:20px">Bias, critical misclassifications and confusion patterns</p>
185
 
186
+ <!-- Bias lollipop — full width -->
187
  <div class="analysis-card" style="margin-bottom:24px">
188
  <h3>Prediction Bias</h3>
189
  <p class="card-sub">Average error (predicted − ground truth). Negative = underestimation, positive = overestimation.</p>
190
+ <div class="chart-scroll-wrap">
191
+ <div id="biasChartContainer" style="position:relative">
192
+ <canvas id="biasChart"></canvas>
193
+ </div>
194
  </div>
195
  </div>
196
 
197
+ <!-- Critical confusion full width, below bias -->
198
  <div class="analysis-card" style="margin-bottom:52px">
199
  <h3>Critical Confusion Rate</h3>
200
  <p class="card-sub">
201
  % of low-quality texts (rating 1–2) predicted as high-quality (5–6) and vice versa.
202
  These are the most dangerous misclassifications.
203
  </p>
204
+ <div class="chart-scroll-wrap">
205
+ <div id="criticalChartContainer" style="position:relative">
206
+ <canvas id="criticalChart"></canvas>
207
+ </div>
208
  </div>
209
  </div>
210
 
 
226
  const ALL_LANGS = ["ar", "az", "be", "bg", "bo", "ca", "cn", "cs", "da", "el", "es", "et", "eu", "fa", "fi", "fr", "gl", "hu", "hv", "is", "it", "ka", "la", "li", "lv", "mk", "mt", "nl", "no", "pt", "ro", "ru", "sk", "sl", "sq", "sr", "stack", "sv", "tr", "uk"];
227
  const LANG_NAMES = {"af": "Afrikaans", "ab": "Arabic", "az": "Azerbaijani", "be": "Belarusian", "bo": "Bosnian", "bg": "Bulgarian", "bn": "Brunei", "ca": "Catalan", "cs": "Czech", "cn": "Chinese", "cy": "Welsh", "da": "Danish", "de": "German", "el": "Greek", "en": "English", "eo": "Esperanto", "es": "Spanish", "et": "Estonian", "eu": "Basque", "fa": "Faroese", "fi": "Finnish", "fr": "French", "ga": "Irish", "gl": "Galician", "gu": "Gujarati", "he": "Hebrew", "hi": "Hindi", "hr": "Croatian", "hu": "Hungarian", "ar": "Armenian", "hv": "Croatia", "id": "Indonesian", "is": "Icelandic", "it": "Italian", "ja": "Japanese", "ka": "Georgian", "kk": "Kazakh", "km": "Khmer", "kn": "Kannada", "ko": "Korean", "la": "Latin", "li": "Lithuanian", "lv": "Latvian", "mk": "Macedonian", "ml": "Malayalam", "mn": "Mongolian", "mr": "Marathi", "ms": "Malay", "mt": "Maltese", "my": "Burmese", "ne": "Nepali", "nl": "Dutch", "no": "Norwegian", "pa": "Punjabi", "pe": "Persian", "pl": "Polish", "pt": "Portuguese", "ro": "Romanian", "ru": "Russian", "si": "Sinhala", "sk": "Slovak", "sl": "Slovenian", "sq": "Albanian", "sr": "Serbian", "sv": "Swedish", "sw": "Swahili", "ta": "Tamil", "te": "Telugu", "th": "Thai", "tl": "Filipino", "tr": "Turkish", "uk": "Ukrainian", "ur": "Urdu", "uz": "Uzbek", "vi": "Vietnamese", "zh": "Chinese", "zu": "Zulu"};
228
  const LANG_COUNTS = {"ar": 1790, "az": 1795, "be": 1794, "bg": 1721, "bo": 1795, "ca": 1712, "cn": 2686, "cs": 1779, "da": 1784, "el": 1794, "es": 1652, "et": 1788, "eu": 2684, "fa": 2697, "fi": 1784, "fr": 1761, "gl": 2698, "hu": 1796, "hv": 1796, "is": 1790, "it": 1791, "ka": 1629, "la": 2522, "li": 1793, "lv": 1795, "mk": 1799, "mt": 1797, "nl": 1795, "no": 1799, "pt": 1795, "ro": 1790, "ru": 1424, "sk": 1784, "sl": 1788, "sq": 1793, "sr": 1798, "stack": 2590, "sv": 1797, "tr": 1799, "uk": 1747};
 
229
 
230
  function langName(c) { return LANG_NAMES[c] || c.toUpperCase(); }
231
 
 
275
  ALL_LANGS.forEach(l => mk(langName(l), l, selLangs.includes(l)));
276
  }
277
 
278
+ // global chart
279
  let chartInstance = null;
280
  function renderChart() {
281
  const sorted = [...ALL_ROWS].sort((a, b) => b.avg_wp - a.avg_wp);
 
294
  data: {
295
  labels,
296
  datasets: [
297
+ { label: 'Weighted Score', data: wpData, backgroundColor: '#2563eb', borderRadius: 3, barPercentage: 0.72 },
298
+ { label: 'Exact Accuracy', data: exData, backgroundColor: '#16a34a', borderRadius: 3, barPercentage: 0.72 },
 
 
 
 
 
 
 
 
 
 
 
 
299
  ]
300
  },
301
  options: {
302
+ indexAxis: 'y', responsive: true, maintainAspectRatio: false, animation: { duration: 500 },
 
 
 
303
  plugins: {
304
+ legend: { position: 'bottom', labels: { color: '#94a3b8', font: { family: 'JetBrains Mono', size: 11 }, boxWidth: 14, padding: 20 } },
305
+ tooltip: { backgroundColor: '#1e2a3a', titleColor: '#e2e8f0', bodyColor: '#94a3b8', callbacks: { label: ctx => ` ${ctx.dataset.label}: ${ctx.parsed.x}%` } },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
  },
307
  scales: {
308
+ x: { min: 0, max: 108, grid: { color: '#1a2236' }, ticks: { color: '#64748b', font: { family: 'JetBrains Mono', size: 10 }, callback: v => v + '%' }, title: { display: true, text: 'Percent (%)', color: '#64748b', font: { family: 'JetBrains Mono', size: 11 } } },
309
+ y: { grid: { display: false }, ticks: { color: '#cbd5e1', font: { family: 'JetBrains Mono', size: 11 } } }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
  }
311
  }
312
  });
313
  }
314
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
  // table
316
  function render() {
317
  renderChips();
 
318
  const visLangs = selLangs.length > 0 ? [...selLangs].sort() : [...ALL_LANGS];
319
 
320
  let rows = ALL_ROWS.map(row => {
 
333
  return sortDir * (va - vb);
334
  });
335
 
 
336
  const head = document.getElementById('lb-head');
337
  const mkBtn = (label, col) => {
338
  const active = sortCol === col;
 
354
  };
355
  });
356
 
 
357
  const body = document.getElementById('lb-body');
358
  body.innerHTML = rows.map((row, i) => {
359
  const avgPct = (row._avg * 100).toFixed(1) + '%';
 
383
 
384
  const ctx = document.getElementById('biasChart').getContext('2d');
385
  const h = Math.max(260, sorted.length * 26 + 40);
386
+ document.getElementById('biasChartContainer').style.height = h + 'px';
387
 
388
  new Chart(ctx, {
389
  type: 'bar',
 
445
 
446
  const ctx = document.getElementById('criticalChart').getContext('2d');
447
  const h = Math.max(260, sorted.length * 26 + 60);
448
+ document.getElementById('criticalChartContainer').style.height = h + 'px';
449
 
450
  new Chart(ctx, {
451
  type: 'bar',
 
494
 
495
  const data = [];
496
  scores.forEach((gt, ri) => {
497
+ const preds = conf[gt] || {};
498
+ const rowSum = Object.values(preds).reduce((a, b) => a + b, 0);
499
  scores.forEach((pred, ci) => {
500
  const v = rowSum > 0 ? (preds[pred] || 0) : 0;
501
  data.push({ x: ci, y: ri, v });
 
507
  document.getElementById('confusionChart').style.height = '340px';
508
 
509
  function cellColor(ri, ci, v) {
510
+ if (ri === ci) return `rgba(22,163,74,${0.15 + v * 0.85})`;
511
  if (Math.abs(ri-ci)>=3) return `rgba(220,38,38,${v * 0.9})`;
512
+ return `rgba(37,99,235,${v * 0.75})`;
513
  }
514
 
515
  confChartInstance = new Chart(ctx, {
 
525
  } }
526
  },
527
  scales: {
528
+ x: {
529
+ type: 'linear', min: -0.5, max: 5.5,
530
+ ticks: {
531
+ stepSize: 1,
532
+ callback: v => (Number.isInteger(v) && v >= 0 && v <= 5) ? 'Pred ' + scores[v] : '',
533
+ color: '#64748b', font: { family: 'JetBrains Mono', size: 10 }
534
+ },
535
  grid: { color: '#1a2236' },
536
+ position: 'top'
537
  },
538
+ y: {
539
+ type: 'linear', min: -0.5, max: 5.5,
540
+ reverse: true,
541
+ ticks: {
542
+ stepSize: 1,
543
+ callback: v => (Number.isInteger(v) && v >= 0 && v <= 5) ? 'GT ' + scores[v] : '',
544
+ color: '#64748b', font: { family: 'JetBrains Mono', size: 10 }
545
+ },
546
  grid: { color: '#1a2236' },
547
  }
548
  }
 
551
  id: 'heatmap',
552
  afterDraw(chart) {
553
  const {ctx, scales: {x, y}} = chart;
554
+ const cellW = Math.abs(x.getPixelForValue(1) - x.getPixelForValue(0));
555
+ const cellH = Math.abs(y.getPixelForValue(1) - y.getPixelForValue(0));
556
  data.forEach(d => {
557
  const cx = x.getPixelForValue(d.x);
558
  const cy = y.getPixelForValue(d.y);
 
573
 
574
  render();
575
  renderChart();
 
576
  renderBias();
577
  renderCritical();
578
  populateConfSelect();