Update benchmark leaderboard
Browse files- index.html +77 -165
index.html
CHANGED
|
@@ -24,6 +24,23 @@
|
|
| 24 |
color: #64748b; font-size: 12px; margin-bottom: 20px;
|
| 25 |
line-height: 1.8;
|
| 26 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
.sep { color: #334155; margin: 0 8px; }
|
| 28 |
.scoring-note {
|
| 29 |
display: inline-flex; gap: 16px; flex-wrap: wrap;
|
|
@@ -33,7 +50,6 @@
|
|
| 33 |
}
|
| 34 |
.scoring-note span { display: flex; align-items: center; gap: 5px; }
|
| 35 |
.dot { width: 9px; height: 9px; border-radius: 50%; flex-shrink: 0; }
|
| 36 |
-
|
| 37 |
.filter-label { font-size: 10px; text-transform: uppercase; letter-spacing: 0.1em; color: #475569; margin-bottom: 10px; }
|
| 38 |
#chips { display: flex; flex-wrap: wrap; gap: 6px; margin-bottom: 24px; }
|
| 39 |
.chip {
|
|
@@ -44,7 +60,6 @@
|
|
| 44 |
}
|
| 45 |
.chip:hover { border-color: #7dd3fc; color: #e2e8f0; }
|
| 46 |
.chip.active { background: #1e3a5f; border-color: #38bdf8; color: #7dd3fc; }
|
| 47 |
-
|
| 48 |
.metric-toggle {
|
| 49 |
display: flex; width: fit-content;
|
| 50 |
border: 1px solid #1e2a3a; border-radius: 6px;
|
|
@@ -55,7 +70,6 @@
|
|
| 55 |
border: none; background: #131820; color: #64748b; transition: all .15s;
|
| 56 |
}
|
| 57 |
.mt-btn.active { background: #1e3a5f; color: #7dd3fc; }
|
| 58 |
-
|
| 59 |
.table-wrap {
|
| 60 |
overflow-x: auto; border-radius: 10px;
|
| 61 |
border: 1px solid #1e2a3a; margin-bottom: 52px;
|
|
@@ -85,7 +99,6 @@
|
|
| 85 |
border-right: 1px solid rgba(255,255,255,0.04);
|
| 86 |
}
|
| 87 |
td.empty { text-align: center; color: #2d3748; background: #111520; border-right: 1px solid rgba(255,255,255,0.04); }
|
| 88 |
-
|
| 89 |
.section-title {
|
| 90 |
font-family: 'Syne', sans-serif; font-size: 18px; font-weight: 700;
|
| 91 |
color: #f1f5f9; margin-bottom: 4px;
|
|
@@ -94,35 +107,6 @@
|
|
| 94 |
background: #111827; border: 1px solid #1e2a3a;
|
| 95 |
border-radius: 10px; padding: 24px 20px;
|
| 96 |
}
|
| 97 |
-
.dist-wrap {
|
| 98 |
-
overflow-x: auto; border-radius: 10px;
|
| 99 |
-
border: 1px solid #1e2a3a; margin-bottom: 16px;
|
| 100 |
-
}
|
| 101 |
-
.dist-wrap table { border-collapse: collapse; width: auto; min-width: 100%; font-size: 12px; }
|
| 102 |
-
.dist-wrap thead tr { background: #111827; border-bottom: 2px solid #1e2a3a; }
|
| 103 |
-
.dist-wrap th {
|
| 104 |
-
padding: 10px 10px; white-space: nowrap; font-size: 10px;
|
| 105 |
-
text-transform: uppercase; letter-spacing: 0.07em; color: #475569; font-weight: 700;
|
| 106 |
-
}
|
| 107 |
-
.dist-wrap th.lang-h { text-align: left; width: 140px; padding-left: 14px; color: #64748b; }
|
| 108 |
-
.dist-wrap th.score-h { width: 70px; text-align: center; }
|
| 109 |
-
.dist-wrap th.total-h { width: 80px; text-align: center; color: #94a3b8; }
|
| 110 |
-
.dist-wrap td { padding: 8px 10px; border-bottom: 1px solid #0f1520; white-space: nowrap; }
|
| 111 |
-
.dist-wrap td.lang-d { padding-left: 14px; color: #cbd5e1; font-weight: 600; font-size: 12px; }
|
| 112 |
-
.dist-wrap td.count-d { text-align: center; font-size: 12px; }
|
| 113 |
-
.dist-wrap td.total-d { text-align: center; font-weight: 700; font-size: 12px; color: #94a3b8; }
|
| 114 |
-
.dist-bar {
|
| 115 |
-
display: inline-block; height: 6px; border-radius: 3px;
|
| 116 |
-
background: #2563eb; vertical-align: middle; margin-left: 4px; opacity: 0.7;
|
| 117 |
-
}
|
| 118 |
-
/* ── analysis sections ── */
|
| 119 |
-
.analysis-grid {
|
| 120 |
-
display: grid;
|
| 121 |
-
grid-template-columns: 1fr 1fr;
|
| 122 |
-
gap: 24px;
|
| 123 |
-
margin-bottom: 52px;
|
| 124 |
-
}
|
| 125 |
-
@media (max-width: 900px) { .analysis-grid { grid-template-columns: 1fr; } }
|
| 126 |
.analysis-card {
|
| 127 |
background: #111827; border: 1px solid #1e2a3a;
|
| 128 |
border-radius: 10px; padding: 22px 20px;
|
|
@@ -131,6 +115,12 @@
|
|
| 131 |
font-family: 'Syne', sans-serif; font-size: 14px; font-weight: 700;
|
| 132 |
color: #f1f5f9; margin-bottom: 4px;
|
| 133 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
.analysis-card .card-sub {
|
| 135 |
font-size: 11px; color: #475569; margin-bottom: 16px; line-height: 1.5;
|
| 136 |
}
|
|
@@ -141,7 +131,7 @@
|
|
| 141 |
}
|
| 142 |
.model-select:focus { outline: none; border-color: #38bdf8; }
|
| 143 |
.footer { margin-top: 20px; font-size: 11px; color: #2d3748; text-align: right; }
|
| 144 |
-
::-webkit-scrollbar { height:
|
| 145 |
::-webkit-scrollbar-thumb { background: #2d3748; border-radius: 3px; }
|
| 146 |
</style>
|
| 147 |
</head>
|
|
@@ -149,10 +139,20 @@
|
|
| 149 |
<h1>Text Quality Rating Benchmark</h1>
|
| 150 |
<p class="meta-subtitle">
|
| 151 |
LLM accuracy at rating text quality on a 1–6 scale across multiple languages
|
| 152 |
-
<span class="sep">·</span> Labeled by DeepSeek V3.2 & judged by Gemini 3 Flash
|
| 153 |
<span class="sep">·</span> Documents sourced from FineWeb dataset
|
| 154 |
</p>
|
| 155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
<div class="scoring-note">
|
| 157 |
<span><span class="dot" style="background:#22c55e"></span>Exact match = 1.0 pt</span>
|
| 158 |
<span><span class="dot" style="background:#eab308"></span>Off by ±1 = 0.5 pt</span>
|
|
@@ -174,17 +174,6 @@
|
|
| 174 |
</table>
|
| 175 |
</div>
|
| 176 |
|
| 177 |
-
<!-- DISTRIBUTION SECTION (disabled for testing)
|
| 178 |
-
<p class="section-title">Dataset Distribution</p>
|
| 179 |
-
<p class="subtitle" style="margin-bottom:20px">Number of unique texts per rating score (1–6) for each language</p>
|
| 180 |
-
<div class="dist-wrap">
|
| 181 |
-
<table id="dist-table">
|
| 182 |
-
<thead id="dist-head"></thead>
|
| 183 |
-
<tbody id="dist-body"></tbody>
|
| 184 |
-
</table>
|
| 185 |
-
</div>
|
| 186 |
-
-->
|
| 187 |
-
|
| 188 |
<p class="section-title" style="margin-top:52px">Global Model Comparison</p>
|
| 189 |
<p class="subtitle" style="margin-bottom:20px">Weighted Score vs Exact Accuracy — all languages combined, sorted by Weighted Score</p>
|
| 190 |
<div class="chart-wrap">
|
|
@@ -194,24 +183,28 @@
|
|
| 194 |
<p class="section-title" style="margin-bottom:4px;margin-top:52px">Model Error Analysis</p>
|
| 195 |
<p class="subtitle" style="margin-bottom:20px">Bias, critical misclassifications and confusion patterns</p>
|
| 196 |
|
| 197 |
-
<!-- Bias lollipop -->
|
| 198 |
<div class="analysis-card" style="margin-bottom:24px">
|
| 199 |
<h3>Prediction Bias</h3>
|
| 200 |
<p class="card-sub">Average error (predicted − ground truth). Negative = underestimation, positive = overestimation.</p>
|
| 201 |
-
<div
|
| 202 |
-
<
|
|
|
|
|
|
|
| 203 |
</div>
|
| 204 |
</div>
|
| 205 |
|
| 206 |
-
<!-- Critical confusion
|
| 207 |
<div class="analysis-card" style="margin-bottom:52px">
|
| 208 |
<h3>Critical Confusion Rate</h3>
|
| 209 |
<p class="card-sub">
|
| 210 |
% of low-quality texts (rating 1–2) predicted as high-quality (5–6) and vice versa.
|
| 211 |
These are the most dangerous misclassifications.
|
| 212 |
</p>
|
| 213 |
-
<div
|
| 214 |
-
<
|
|
|
|
|
|
|
| 215 |
</div>
|
| 216 |
</div>
|
| 217 |
|
|
@@ -233,7 +226,6 @@
|
|
| 233 |
const ALL_LANGS = ["ar", "az", "be", "bg", "bo", "ca", "cn", "cs", "da", "el", "es", "et", "eu", "fa", "fi", "fr", "gl", "hu", "hv", "is", "it", "ka", "la", "li", "lv", "mk", "mt", "nl", "no", "pt", "ro", "ru", "sk", "sl", "sq", "sr", "stack", "sv", "tr", "uk"];
|
| 234 |
// Display names keyed by language code; langName() looks codes up here and falls back to the upper-cased code.
// NOTE(review): some entries look inconsistent and should be confirmed against the dataset's code scheme —
// "hv" maps to "Croatia" (a country name, while "hr" is "Croatian"), "ar" maps to "Armenian" while "ab" maps
// to "Arabic", and "fa" maps to "Faroese" while "pe" maps to "Persian".
const LANG_NAMES = {"af": "Afrikaans", "ab": "Arabic", "az": "Azerbaijani", "be": "Belarusian", "bo": "Bosnian", "bg": "Bulgarian", "bn": "Brunei", "ca": "Catalan", "cs": "Czech", "cn": "Chinese", "cy": "Welsh", "da": "Danish", "de": "German", "el": "Greek", "en": "English", "eo": "Esperanto", "es": "Spanish", "et": "Estonian", "eu": "Basque", "fa": "Faroese", "fi": "Finnish", "fr": "French", "ga": "Irish", "gl": "Galician", "gu": "Gujarati", "he": "Hebrew", "hi": "Hindi", "hr": "Croatian", "hu": "Hungarian", "ar": "Armenian", "hv": "Croatia", "id": "Indonesian", "is": "Icelandic", "it": "Italian", "ja": "Japanese", "ka": "Georgian", "kk": "Kazakh", "km": "Khmer", "kn": "Kannada", "ko": "Korean", "la": "Latin", "li": "Lithuanian", "lv": "Latvian", "mk": "Macedonian", "ml": "Malayalam", "mn": "Mongolian", "mr": "Marathi", "ms": "Malay", "mt": "Maltese", "my": "Burmese", "ne": "Nepali", "nl": "Dutch", "no": "Norwegian", "pa": "Punjabi", "pe": "Persian", "pl": "Polish", "pt": "Portuguese", "ro": "Romanian", "ru": "Russian", "si": "Sinhala", "sk": "Slovak", "sl": "Slovenian", "sq": "Albanian", "sr": "Serbian", "sv": "Swedish", "sw": "Swahili", "ta": "Tamil", "te": "Telugu", "th": "Thai", "tl": "Filipino", "tr": "Turkish", "uk": "Ukrainian", "ur": "Urdu", "uz": "Uzbek", "vi": "Vietnamese", "zh": "Chinese", "zu": "Zulu"};
|
| 235 |
const LANG_COUNTS = {"ar": 1790, "az": 1795, "be": 1794, "bg": 1721, "bo": 1795, "ca": 1712, "cn": 2686, "cs": 1779, "da": 1784, "el": 1794, "es": 1652, "et": 1788, "eu": 2684, "fa": 2697, "fi": 1784, "fr": 1761, "gl": 2698, "hu": 1796, "hv": 1796, "is": 1790, "it": 1791, "ka": 1629, "la": 2522, "li": 1793, "lv": 1795, "mk": 1799, "mt": 1797, "nl": 1795, "no": 1799, "pt": 1795, "ro": 1790, "ru": 1424, "sk": 1784, "sl": 1788, "sq": 1793, "sr": 1798, "stack": 2590, "sv": 1797, "tr": 1799, "uk": 1747};
|
| 236 |
-
const LANG_DIST = {"ar": {"1": 100, "2": 100, "5": 100, "6": 100}, "az": {"1": 100, "2": 100, "5": 100, "6": 100}, "be": {"1": 100, "2": 100, "5": 100, "6": 100}, "bg": {"1": 100, "2": 100, "5": 100, "6": 62}, "bo": {"1": 100, "2": 100, "5": 100, "6": 100}, "ca": {"1": 73, "2": 100, "5": 100, "6": 86}, "cn": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "cs": {"1": 100, "2": 100, "5": 100, "6": 100}, "da": {"1": 100, "2": 100, "5": 100, "6": 100}, "el": {"1": 100, "2": 100, "5": 100, "6": 100}, "es": {"1": 100, "2": 100, "5": 100, "6": 33}, "et": {"1": 100, "2": 100, "5": 100, "6": 100}, "eu": {"1": 97, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "fa": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "fi": {"1": 100, "2": 100, "5": 100, "6": 100}, "fr": {"1": 100, "2": 100, "5": 100, "6": 82}, "gl": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "hu": {"1": 100, "2": 100, "5": 100, "6": 100}, "hv": {"1": 100, "2": 100, "5": 100, "6": 100}, "is": {"1": 100, "2": 100, "5": 100, "6": 100}, "it": {"1": 100, "2": 100, "5": 100, "6": 100}, "ka": {"1": 18, "2": 100, "5": 100, "6": 100}, "la": {"1": 100, "2": 96, "3": 17, "4": 100, "5": 100, "6": 100}, "li": {"1": 100, "2": 100, "5": 100, "6": 100}, "lv": {"1": 100, "2": 100, "5": 100, "6": 100}, "mk": {"1": 100, "2": 100, "5": 100, "6": 100}, "mt": {"1": 100, "2": 100, "5": 100, "6": 100}, "nl": {"1": 100, "2": 100, "5": 100, "6": 100}, "no": {"1": 100, "2": 100, "5": 100, "6": 100}, "pt": {"1": 100, "2": 100, "5": 100, "6": 100}, "ro": {"1": 100, "2": 100, "5": 100, "6": 100}, "ru": {"1": 100, "2": 100, "5": 100, "6": 3}, "sk": {"1": 100, "2": 100, "5": 100, "6": 100}, "sl": {"1": 100, "2": 100, "5": 100, "6": 100}, "sq": {"1": 100, "2": 100, "5": 100, "6": 100}, "sr": {"1": 100, "2": 100, "5": 100, "6": 100}, "stack": {"1": 49, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "sv": {"1": 100, "2": 100, "5": 100, "6": 100}, "tr": {"1": 100, "2": 100, "5": 100, "6": 
100}, "uk": {"1": 100, "2": 100, "5": 100, "6": 74}};
|
| 237 |
|
| 238 |
// Resolve a language code to its display name from LANG_NAMES.
// Codes without a (truthy) entry are shown as the code itself in upper case.
function langName(c) {
  const label = LANG_NAMES[c];
  if (label) {
    return label;
  }
  return c.toUpperCase();
}
|
| 239 |
|
|
@@ -283,7 +275,7 @@
|
|
| 283 |
ALL_LANGS.forEach(l => mk(langName(l), l, selLangs.includes(l)));
|
| 284 |
}
|
| 285 |
|
| 286 |
-
// chart
|
| 287 |
let chartInstance = null;
|
| 288 |
function renderChart() {
|
| 289 |
const sorted = [...ALL_ROWS].sort((a, b) => b.avg_wp - a.avg_wp);
|
|
@@ -302,116 +294,27 @@
|
|
| 302 |
data: {
|
| 303 |
labels,
|
| 304 |
datasets: [
|
| 305 |
-
{
|
| 306 |
-
|
| 307 |
-
data: wpData,
|
| 308 |
-
backgroundColor: '#2563eb',
|
| 309 |
-
borderRadius: 3,
|
| 310 |
-
barPercentage: 0.72,
|
| 311 |
-
},
|
| 312 |
-
{
|
| 313 |
-
label: 'Exact Accuracy',
|
| 314 |
-
data: exData,
|
| 315 |
-
backgroundColor: '#16a34a',
|
| 316 |
-
borderRadius: 3,
|
| 317 |
-
barPercentage: 0.72,
|
| 318 |
-
},
|
| 319 |
]
|
| 320 |
},
|
| 321 |
options: {
|
| 322 |
-
indexAxis: 'y',
|
| 323 |
-
responsive: true,
|
| 324 |
-
maintainAspectRatio: false,
|
| 325 |
-
animation: { duration: 500 },
|
| 326 |
plugins: {
|
| 327 |
-
legend: {
|
| 328 |
-
|
| 329 |
-
labels: {
|
| 330 |
-
color: '#94a3b8',
|
| 331 |
-
font: { family: 'JetBrains Mono', size: 11 },
|
| 332 |
-
boxWidth: 14, padding: 20,
|
| 333 |
-
}
|
| 334 |
-
},
|
| 335 |
-
tooltip: {
|
| 336 |
-
backgroundColor: '#1e2a3a',
|
| 337 |
-
titleColor: '#e2e8f0',
|
| 338 |
-
bodyColor: '#94a3b8',
|
| 339 |
-
callbacks: {
|
| 340 |
-
label: ctx => ` ${ctx.dataset.label}: ${ctx.parsed.x}%`
|
| 341 |
-
}
|
| 342 |
-
},
|
| 343 |
},
|
| 344 |
scales: {
|
| 345 |
-
x: {
|
| 346 |
-
|
| 347 |
-
grid: { color: '#1a2236' },
|
| 348 |
-
ticks: {
|
| 349 |
-
color: '#64748b',
|
| 350 |
-
font: { family: 'JetBrains Mono', size: 10 },
|
| 351 |
-
callback: v => v + '%',
|
| 352 |
-
},
|
| 353 |
-
title: {
|
| 354 |
-
display: true, text: 'Percent (%)',
|
| 355 |
-
color: '#64748b',
|
| 356 |
-
font: { family: 'JetBrains Mono', size: 11 },
|
| 357 |
-
}
|
| 358 |
-
},
|
| 359 |
-
y: {
|
| 360 |
-
grid: { display: false },
|
| 361 |
-
ticks: {
|
| 362 |
-
color: '#cbd5e1',
|
| 363 |
-
font: { family: 'JetBrains Mono', size: 11 },
|
| 364 |
-
}
|
| 365 |
-
}
|
| 366 |
}
|
| 367 |
}
|
| 368 |
});
|
| 369 |
}
|
| 370 |
|
| 371 |
-
// distribution table
|
| 372 |
-
function renderDist() {
|
| 373 |
-
const scores = [1, 2, 3, 4, 5, 6];
|
| 374 |
-
const langs = [...ALL_LANGS];
|
| 375 |
-
|
| 376 |
-
// max count for bar scaling
|
| 377 |
-
let maxCount = 0;
|
| 378 |
-
langs.forEach(l => {
|
| 379 |
-
const d = LANG_DIST[l] || {};
|
| 380 |
-
scores.forEach(s => { if ((d[s]||0) > maxCount) maxCount = d[s]||0; });
|
| 381 |
-
});
|
| 382 |
-
|
| 383 |
-
// header
|
| 384 |
-
const head = document.getElementById('dist-head');
|
| 385 |
-
head.innerHTML = `<tr>
|
| 386 |
-
<th class="lang-h">Language</th>
|
| 387 |
-
${scores.map(s => `<th class="score-h">Rating ${s}</th>`).join('')}
|
| 388 |
-
<th class="total-h">Total texts</th>
|
| 389 |
-
</tr>`;
|
| 390 |
-
|
| 391 |
-
// body
|
| 392 |
-
const body = document.getElementById('dist-body');
|
| 393 |
-
body.innerHTML = langs.map(lang => {
|
| 394 |
-
const d = LANG_DIST[lang] || {};
|
| 395 |
-
const total = Object.values(d).reduce((a,b) => a+b, 0);
|
| 396 |
-
const cells = scores.map(s => {
|
| 397 |
-
const n = d[s] || 0;
|
| 398 |
-
const bar = maxCount > 0 ? Math.round((n / maxCount) * 48) : 0;
|
| 399 |
-
return `<td class="count-d">
|
| 400 |
-
${n > 0 ? `${n}<span class="dist-bar" style="width:${bar}px"></span>` : '<span style="color:#2d3748">—</span>'}
|
| 401 |
-
</td>`;
|
| 402 |
-
}).join('');
|
| 403 |
-
return `<tr>
|
| 404 |
-
<td class="lang-d">${langName(lang)}</td>
|
| 405 |
-
${cells}
|
| 406 |
-
<td class="total-d">${total.toLocaleString()}</td>
|
| 407 |
-
</tr>`;
|
| 408 |
-
}).join('');
|
| 409 |
-
}
|
| 410 |
-
|
| 411 |
// table
|
| 412 |
function render() {
|
| 413 |
renderChips();
|
| 414 |
-
|
| 415 |
const visLangs = selLangs.length > 0 ? [...selLangs].sort() : [...ALL_LANGS];
|
| 416 |
|
| 417 |
let rows = ALL_ROWS.map(row => {
|
|
@@ -430,7 +333,6 @@
|
|
| 430 |
return sortDir * (va - vb);
|
| 431 |
});
|
| 432 |
|
| 433 |
-
// header
|
| 434 |
const head = document.getElementById('lb-head');
|
| 435 |
const mkBtn = (label, col) => {
|
| 436 |
const active = sortCol === col;
|
|
@@ -452,7 +354,6 @@
|
|
| 452 |
};
|
| 453 |
});
|
| 454 |
|
| 455 |
-
// body
|
| 456 |
const body = document.getElementById('lb-body');
|
| 457 |
body.innerHTML = rows.map((row, i) => {
|
| 458 |
const avgPct = (row._avg * 100).toFixed(1) + '%';
|
|
@@ -482,7 +383,7 @@
|
|
| 482 |
|
| 483 |
const ctx = document.getElementById('biasChart').getContext('2d');
|
| 484 |
const h = Math.max(260, sorted.length * 26 + 40);
|
| 485 |
-
document.getElementById('
|
| 486 |
|
| 487 |
new Chart(ctx, {
|
| 488 |
type: 'bar',
|
|
@@ -544,7 +445,7 @@
|
|
| 544 |
|
| 545 |
const ctx = document.getElementById('criticalChart').getContext('2d');
|
| 546 |
const h = Math.max(260, sorted.length * 26 + 60);
|
| 547 |
-
document.getElementById('
|
| 548 |
|
| 549 |
new Chart(ctx, {
|
| 550 |
type: 'bar',
|
|
@@ -593,8 +494,8 @@
|
|
| 593 |
|
| 594 |
const data = [];
|
| 595 |
scores.forEach((gt, ri) => {
|
| 596 |
-
const preds
|
| 597 |
-
const rowSum
|
| 598 |
scores.forEach((pred, ci) => {
|
| 599 |
const v = rowSum > 0 ? (preds[pred] || 0) : 0;
|
| 600 |
data.push({ x: ci, y: ri, v });
|
|
@@ -606,9 +507,9 @@
|
|
| 606 |
document.getElementById('confusionChart').style.height = '340px';
|
| 607 |
|
| 608 |
function cellColor(ri, ci, v) {
|
| 609 |
-
if (ri === ci)
|
| 610 |
if (Math.abs(ri-ci)>=3) return `rgba(220,38,38,${v * 0.9})`;
|
| 611 |
-
return
|
| 612 |
}
|
| 613 |
|
| 614 |
confChartInstance = new Chart(ctx, {
|
|
@@ -624,12 +525,24 @@
|
|
| 624 |
} }
|
| 625 |
},
|
| 626 |
scales: {
|
| 627 |
-
x: {
|
| 628 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 629 |
grid: { color: '#1a2236' },
|
|
|
|
| 630 |
},
|
| 631 |
-
y: {
|
| 632 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 633 |
grid: { color: '#1a2236' },
|
| 634 |
}
|
| 635 |
}
|
|
@@ -638,8 +551,8 @@
|
|
| 638 |
id: 'heatmap',
|
| 639 |
afterDraw(chart) {
|
| 640 |
const {ctx, scales: {x, y}} = chart;
|
| 641 |
-
const cellW = x.getPixelForValue(1) - x.getPixelForValue(0);
|
| 642 |
-
const cellH = y.getPixelForValue(
|
| 643 |
data.forEach(d => {
|
| 644 |
const cx = x.getPixelForValue(d.x);
|
| 645 |
const cy = y.getPixelForValue(d.y);
|
|
@@ -660,7 +573,6 @@
|
|
| 660 |
|
| 661 |
render();
|
| 662 |
renderChart();
|
| 663 |
-
// renderDist(); // disabled for testing
|
| 664 |
renderBias();
|
| 665 |
renderCritical();
|
| 666 |
populateConfSelect();
|
|
|
|
| 24 |
color: #64748b; font-size: 12px; margin-bottom: 20px;
|
| 25 |
line-height: 1.8;
|
| 26 |
}
|
| 27 |
+
.methodology-box {
|
| 28 |
+
background: #111827; border: 1px solid #1e2a3a;
|
| 29 |
+
border-radius: 8px; padding: 18px 22px;
|
| 30 |
+
margin-bottom: 24px; max-width: 900px;
|
| 31 |
+
}
|
| 32 |
+
.methodology-box h3 {
|
| 33 |
+
font-family: 'Syne', sans-serif; font-size: 14px;
|
| 34 |
+
color: #e2e8f0; margin-bottom: 8px; font-weight: 700;
|
| 35 |
+
}
|
| 36 |
+
.methodology-box p, .methodology-box li {
|
| 37 |
+
font-size: 11.5px; color: #94a3b8; line-height: 1.6;
|
| 38 |
+
}
|
| 39 |
+
.methodology-box ul {
|
| 40 |
+
margin-top: 8px; padding-left: 20px;
|
| 41 |
+
}
|
| 42 |
+
.methodology-box li { margin-bottom: 4px; }
|
| 43 |
+
.highlight { color: #7dd3fc; font-weight: 600; }
|
| 44 |
.sep { color: #334155; margin: 0 8px; }
|
| 45 |
.scoring-note {
|
| 46 |
display: inline-flex; gap: 16px; flex-wrap: wrap;
|
|
|
|
| 50 |
}
|
| 51 |
.scoring-note span { display: flex; align-items: center; gap: 5px; }
|
| 52 |
.dot { width: 9px; height: 9px; border-radius: 50%; flex-shrink: 0; }
|
|
|
|
| 53 |
.filter-label { font-size: 10px; text-transform: uppercase; letter-spacing: 0.1em; color: #475569; margin-bottom: 10px; }
|
| 54 |
#chips { display: flex; flex-wrap: wrap; gap: 6px; margin-bottom: 24px; }
|
| 55 |
.chip {
|
|
|
|
| 60 |
}
|
| 61 |
.chip:hover { border-color: #7dd3fc; color: #e2e8f0; }
|
| 62 |
.chip.active { background: #1e3a5f; border-color: #38bdf8; color: #7dd3fc; }
|
|
|
|
| 63 |
.metric-toggle {
|
| 64 |
display: flex; width: fit-content;
|
| 65 |
border: 1px solid #1e2a3a; border-radius: 6px;
|
|
|
|
| 70 |
border: none; background: #131820; color: #64748b; transition: all .15s;
|
| 71 |
}
|
| 72 |
.mt-btn.active { background: #1e3a5f; color: #7dd3fc; }
|
|
|
|
| 73 |
.table-wrap {
|
| 74 |
overflow-x: auto; border-radius: 10px;
|
| 75 |
border: 1px solid #1e2a3a; margin-bottom: 52px;
|
|
|
|
| 99 |
border-right: 1px solid rgba(255,255,255,0.04);
|
| 100 |
}
|
| 101 |
td.empty { text-align: center; color: #2d3748; background: #111520; border-right: 1px solid rgba(255,255,255,0.04); }
|
|
|
|
| 102 |
.section-title {
|
| 103 |
font-family: 'Syne', sans-serif; font-size: 18px; font-weight: 700;
|
| 104 |
color: #f1f5f9; margin-bottom: 4px;
|
|
|
|
| 107 |
background: #111827; border: 1px solid #1e2a3a;
|
| 108 |
border-radius: 10px; padding: 24px 20px;
|
| 109 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
.analysis-card {
|
| 111 |
background: #111827; border: 1px solid #1e2a3a;
|
| 112 |
border-radius: 10px; padding: 22px 20px;
|
|
|
|
| 115 |
font-family: 'Syne', sans-serif; font-size: 14px; font-weight: 700;
|
| 116 |
color: #f1f5f9; margin-bottom: 4px;
|
| 117 |
}
|
| 118 |
+
.chart-scroll-wrap {
|
| 119 |
+
max-height: 380px;
|
| 120 |
+
overflow-y: auto;
|
| 121 |
+
overflow-x: hidden;
|
| 122 |
+
padding-right: 8px;
|
| 123 |
+
}
|
| 124 |
.analysis-card .card-sub {
|
| 125 |
font-size: 11px; color: #475569; margin-bottom: 16px; line-height: 1.5;
|
| 126 |
}
|
|
|
|
| 131 |
}
|
| 132 |
.model-select:focus { outline: none; border-color: #38bdf8; }
|
| 133 |
.footer { margin-top: 20px; font-size: 11px; color: #2d3748; text-align: right; }
|
| 134 |
+
::-webkit-scrollbar { width: 6px; height: 6px; background: #0d1117; }
|
| 135 |
::-webkit-scrollbar-thumb { background: #2d3748; border-radius: 3px; }
|
| 136 |
</style>
|
| 137 |
</head>
|
|
|
|
| 139 |
<h1>Text Quality Rating Benchmark</h1>
|
| 140 |
<p class="meta-subtitle">
|
| 141 |
LLM accuracy at rating text quality on a 1–6 scale across multiple languages
|
|
|
|
| 142 |
<span class="sep">·</span> Documents sourced from FineWeb dataset
|
| 143 |
</p>
|
| 144 |
|
| 145 |
+
<div class="methodology-box">
|
| 146 |
+
<h3>Methodology</h3>
|
| 147 |
+
<p>The core objective of this benchmark is to evaluate how effectively Large Language Models can assess text quality, simulating the process of filtering data for LLM pre-training. The dataset curation followed a strict pipeline:</p>
|
| 148 |
+
<ul>
|
| 149 |
+
<li><span class="highlight">Initial Scoring:</span> Multilingual texts sampled from the FineWeb dataset were evaluated by <strong>DeepSeek V3.2</strong>, which assigned them a quality and substantiveness rating on a scale from 1 (lowest quality) to 6 (highest quality).</li>
|
| 150 |
+
<li><span class="highlight">Verification:</span> These initial scores were subsequently verified by an independent judge, <strong>Gemini 3 Flash</strong>.</li>
|
| 151 |
+
<li><span class="highlight">Filtering:</span> To ensure the highest ground-truth reliability, only the documents that received the absolute highest approval rating during the Gemini verification phase were included in this benchmark.</li>
|
| 152 |
+
<li><span class="highlight">Version:</span> 1.0 (German, "de", is excluded in this version)</li>
|
| 153 |
+
</ul>
|
| 154 |
+
</div>
|
| 155 |
+
|
| 156 |
<div class="scoring-note">
|
| 157 |
<span><span class="dot" style="background:#22c55e"></span>Exact match = 1.0 pt</span>
|
| 158 |
<span><span class="dot" style="background:#eab308"></span>Off by ±1 = 0.5 pt</span>
|
|
|
|
| 174 |
</table>
|
| 175 |
</div>
|
| 176 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
<p class="section-title" style="margin-top:52px">Global Model Comparison</p>
|
| 178 |
<p class="subtitle" style="margin-bottom:20px">Weighted Score vs Exact Accuracy — all languages combined, sorted by Weighted Score</p>
|
| 179 |
<div class="chart-wrap">
|
|
|
|
| 183 |
<p class="section-title" style="margin-bottom:4px;margin-top:52px">Model Error Analysis</p>
|
| 184 |
<p class="subtitle" style="margin-bottom:20px">Bias, critical misclassifications and confusion patterns</p>
|
| 185 |
|
| 186 |
+
<!-- Bias lollipop — full width -->
|
| 187 |
<div class="analysis-card" style="margin-bottom:24px">
|
| 188 |
<h3>Prediction Bias</h3>
|
| 189 |
<p class="card-sub">Average error (predicted − ground truth). Negative = underestimation, positive = overestimation.</p>
|
| 190 |
+
<div class="chart-scroll-wrap">
|
| 191 |
+
<div id="biasChartContainer" style="position:relative">
|
| 192 |
+
<canvas id="biasChart"></canvas>
|
| 193 |
+
</div>
|
| 194 |
</div>
|
| 195 |
</div>
|
| 196 |
|
| 197 |
+
<!-- Critical confusion — full width, below bias -->
|
| 198 |
<div class="analysis-card" style="margin-bottom:52px">
|
| 199 |
<h3>Critical Confusion Rate</h3>
|
| 200 |
<p class="card-sub">
|
| 201 |
% of low-quality texts (rating 1–2) predicted as high-quality (5–6) and vice versa.
|
| 202 |
These are the most dangerous misclassifications.
|
| 203 |
</p>
|
| 204 |
+
<div class="chart-scroll-wrap">
|
| 205 |
+
<div id="criticalChartContainer" style="position:relative">
|
| 206 |
+
<canvas id="criticalChart"></canvas>
|
| 207 |
+
</div>
|
| 208 |
</div>
|
| 209 |
</div>
|
| 210 |
|
|
|
|
| 226 |
const ALL_LANGS = ["ar", "az", "be", "bg", "bo", "ca", "cn", "cs", "da", "el", "es", "et", "eu", "fa", "fi", "fr", "gl", "hu", "hv", "is", "it", "ka", "la", "li", "lv", "mk", "mt", "nl", "no", "pt", "ro", "ru", "sk", "sl", "sq", "sr", "stack", "sv", "tr", "uk"];
|
| 227 |
// Display names keyed by language code; langName() looks codes up here and falls back to the upper-cased code.
// NOTE(review): some entries look inconsistent and should be confirmed against the dataset's code scheme —
// "hv" maps to "Croatia" (a country name, while "hr" is "Croatian"), "ar" maps to "Armenian" while "ab" maps
// to "Arabic", and "fa" maps to "Faroese" while "pe" maps to "Persian".
const LANG_NAMES = {"af": "Afrikaans", "ab": "Arabic", "az": "Azerbaijani", "be": "Belarusian", "bo": "Bosnian", "bg": "Bulgarian", "bn": "Brunei", "ca": "Catalan", "cs": "Czech", "cn": "Chinese", "cy": "Welsh", "da": "Danish", "de": "German", "el": "Greek", "en": "English", "eo": "Esperanto", "es": "Spanish", "et": "Estonian", "eu": "Basque", "fa": "Faroese", "fi": "Finnish", "fr": "French", "ga": "Irish", "gl": "Galician", "gu": "Gujarati", "he": "Hebrew", "hi": "Hindi", "hr": "Croatian", "hu": "Hungarian", "ar": "Armenian", "hv": "Croatia", "id": "Indonesian", "is": "Icelandic", "it": "Italian", "ja": "Japanese", "ka": "Georgian", "kk": "Kazakh", "km": "Khmer", "kn": "Kannada", "ko": "Korean", "la": "Latin", "li": "Lithuanian", "lv": "Latvian", "mk": "Macedonian", "ml": "Malayalam", "mn": "Mongolian", "mr": "Marathi", "ms": "Malay", "mt": "Maltese", "my": "Burmese", "ne": "Nepali", "nl": "Dutch", "no": "Norwegian", "pa": "Punjabi", "pe": "Persian", "pl": "Polish", "pt": "Portuguese", "ro": "Romanian", "ru": "Russian", "si": "Sinhala", "sk": "Slovak", "sl": "Slovenian", "sq": "Albanian", "sr": "Serbian", "sv": "Swedish", "sw": "Swahili", "ta": "Tamil", "te": "Telugu", "th": "Thai", "tl": "Filipino", "tr": "Turkish", "uk": "Ukrainian", "ur": "Urdu", "uz": "Uzbek", "vi": "Vietnamese", "zh": "Chinese", "zu": "Zulu"};
|
| 228 |
const LANG_COUNTS = {"ar": 1790, "az": 1795, "be": 1794, "bg": 1721, "bo": 1795, "ca": 1712, "cn": 2686, "cs": 1779, "da": 1784, "el": 1794, "es": 1652, "et": 1788, "eu": 2684, "fa": 2697, "fi": 1784, "fr": 1761, "gl": 2698, "hu": 1796, "hv": 1796, "is": 1790, "it": 1791, "ka": 1629, "la": 2522, "li": 1793, "lv": 1795, "mk": 1799, "mt": 1797, "nl": 1795, "no": 1799, "pt": 1795, "ro": 1790, "ru": 1424, "sk": 1784, "sl": 1788, "sq": 1793, "sr": 1798, "stack": 2590, "sv": 1797, "tr": 1799, "uk": 1747};
|
|
|
|
| 229 |
|
| 230 |
// Resolve a language code to its display name from LANG_NAMES.
// Codes without a (truthy) entry are shown as the code itself in upper case.
function langName(c) {
  const label = LANG_NAMES[c];
  if (label) {
    return label;
  }
  return c.toUpperCase();
}
|
| 231 |
|
|
|
|
| 275 |
ALL_LANGS.forEach(l => mk(langName(l), l, selLangs.includes(l)));
|
| 276 |
}
|
| 277 |
|
| 278 |
+
// global chart
|
| 279 |
let chartInstance = null;
|
| 280 |
function renderChart() {
|
| 281 |
const sorted = [...ALL_ROWS].sort((a, b) => b.avg_wp - a.avg_wp);
|
|
|
|
| 294 |
data: {
|
| 295 |
labels,
|
| 296 |
datasets: [
|
| 297 |
+
{ label: 'Weighted Score', data: wpData, backgroundColor: '#2563eb', borderRadius: 3, barPercentage: 0.72 },
|
| 298 |
+
{ label: 'Exact Accuracy', data: exData, backgroundColor: '#16a34a', borderRadius: 3, barPercentage: 0.72 },
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
]
|
| 300 |
},
|
| 301 |
options: {
|
| 302 |
+
indexAxis: 'y', responsive: true, maintainAspectRatio: false, animation: { duration: 500 },
|
|
|
|
|
|
|
|
|
|
| 303 |
plugins: {
|
| 304 |
+
legend: { position: 'bottom', labels: { color: '#94a3b8', font: { family: 'JetBrains Mono', size: 11 }, boxWidth: 14, padding: 20 } },
|
| 305 |
+
tooltip: { backgroundColor: '#1e2a3a', titleColor: '#e2e8f0', bodyColor: '#94a3b8', callbacks: { label: ctx => ` ${ctx.dataset.label}: ${ctx.parsed.x}%` } },
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
},
|
| 307 |
scales: {
|
| 308 |
+
x: { min: 0, max: 108, grid: { color: '#1a2236' }, ticks: { color: '#64748b', font: { family: 'JetBrains Mono', size: 10 }, callback: v => v + '%' }, title: { display: true, text: 'Percent (%)', color: '#64748b', font: { family: 'JetBrains Mono', size: 11 } } },
|
| 309 |
+
y: { grid: { display: false }, ticks: { color: '#cbd5e1', font: { family: 'JetBrains Mono', size: 11 } } }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
}
|
| 311 |
}
|
| 312 |
});
|
| 313 |
}
|
| 314 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
// table
|
| 316 |
function render() {
|
| 317 |
renderChips();
|
|
|
|
| 318 |
const visLangs = selLangs.length > 0 ? [...selLangs].sort() : [...ALL_LANGS];
|
| 319 |
|
| 320 |
let rows = ALL_ROWS.map(row => {
|
|
|
|
| 333 |
return sortDir * (va - vb);
|
| 334 |
});
|
| 335 |
|
|
|
|
| 336 |
const head = document.getElementById('lb-head');
|
| 337 |
const mkBtn = (label, col) => {
|
| 338 |
const active = sortCol === col;
|
|
|
|
| 354 |
};
|
| 355 |
});
|
| 356 |
|
|
|
|
| 357 |
const body = document.getElementById('lb-body');
|
| 358 |
body.innerHTML = rows.map((row, i) => {
|
| 359 |
const avgPct = (row._avg * 100).toFixed(1) + '%';
|
|
|
|
| 383 |
|
| 384 |
const ctx = document.getElementById('biasChart').getContext('2d');
|
| 385 |
const h = Math.max(260, sorted.length * 26 + 40);
|
| 386 |
+
document.getElementById('biasChartContainer').style.height = h + 'px';
|
| 387 |
|
| 388 |
new Chart(ctx, {
|
| 389 |
type: 'bar',
|
|
|
|
| 445 |
|
| 446 |
const ctx = document.getElementById('criticalChart').getContext('2d');
|
| 447 |
const h = Math.max(260, sorted.length * 26 + 60);
|
| 448 |
+
document.getElementById('criticalChartContainer').style.height = h + 'px';
|
| 449 |
|
| 450 |
new Chart(ctx, {
|
| 451 |
type: 'bar',
|
|
|
|
| 494 |
|
| 495 |
const data = [];
|
| 496 |
scores.forEach((gt, ri) => {
|
| 497 |
+
const preds = conf[gt] || {};
|
| 498 |
+
const rowSum = Object.values(preds).reduce((a, b) => a + b, 0);
|
| 499 |
scores.forEach((pred, ci) => {
|
| 500 |
const v = rowSum > 0 ? (preds[pred] || 0) : 0;
|
| 501 |
data.push({ x: ci, y: ri, v });
|
|
|
|
| 507 |
document.getElementById('confusionChart').style.height = '340px';
|
| 508 |
|
| 509 |
// Chooses the fill colour for one confusion-matrix cell as a CSS rgba() string.
//   ri - row index of the cell (presumably the ground-truth score position; confirm at the call site)
//   ci - column index of the cell (presumably the predicted score position; confirm at the call site)
//   v  - cell value; it scales the alpha channel directly, so it is assumed to lie in 0..1 (TODO confirm)
// Returns green for diagonal cells, red for cells three or more steps off the diagonal, blue otherwise.
function cellColor(ri, ci, v) {
  // Diagonal: the 0.15 floor keeps the cell faintly visible even when v is 0.
  if (ri === ci) return `rgba(22,163,74,${0.15 + v * 0.85})`;
  // Index distance of 3 or more is highlighted in red.
  if (Math.abs(ri - ci) >= 3) return `rgba(220,38,38,${v * 0.9})`;
  // Remaining off-diagonal cells.
  return `rgba(37,99,235,${v * 0.75})`;
}
|
| 514 |
|
| 515 |
confChartInstance = new Chart(ctx, {
|
|
|
|
| 525 |
} }
|
| 526 |
},
|
| 527 |
scales: {
|
| 528 |
+
x: {
|
| 529 |
+
type: 'linear', min: -0.5, max: 5.5,
|
| 530 |
+
ticks: {
|
| 531 |
+
stepSize: 1,
|
| 532 |
+
callback: v => (Number.isInteger(v) && v >= 0 && v <= 5) ? 'Pred ' + scores[v] : '',
|
| 533 |
+
color: '#64748b', font: { family: 'JetBrains Mono', size: 10 }
|
| 534 |
+
},
|
| 535 |
grid: { color: '#1a2236' },
|
| 536 |
+
position: 'top'
|
| 537 |
},
|
| 538 |
+
y: {
|
| 539 |
+
type: 'linear', min: -0.5, max: 5.5,
|
| 540 |
+
reverse: true,
|
| 541 |
+
ticks: {
|
| 542 |
+
stepSize: 1,
|
| 543 |
+
callback: v => (Number.isInteger(v) && v >= 0 && v <= 5) ? 'GT ' + scores[v] : '',
|
| 544 |
+
color: '#64748b', font: { family: 'JetBrains Mono', size: 10 }
|
| 545 |
+
},
|
| 546 |
grid: { color: '#1a2236' },
|
| 547 |
}
|
| 548 |
}
|
|
|
|
| 551 |
id: 'heatmap',
|
| 552 |
afterDraw(chart) {
|
| 553 |
const {ctx, scales: {x, y}} = chart;
|
| 554 |
+
const cellW = Math.abs(x.getPixelForValue(1) - x.getPixelForValue(0));
|
| 555 |
+
const cellH = Math.abs(y.getPixelForValue(1) - y.getPixelForValue(0));
|
| 556 |
data.forEach(d => {
|
| 557 |
const cx = x.getPixelForValue(d.x);
|
| 558 |
const cy = y.getPixelForValue(d.y);
|
|
|
|
| 573 |
|
| 574 |
render();
|
| 575 |
renderChart();
|
|
|
|
| 576 |
renderBias();
|
| 577 |
renderCritical();
|
| 578 |
populateConfSelect();
|