<!-- adgw's picture · Update benchmark leaderboard · 1aa5456 verified -->
<!DOCTYPE html>
<html lang="en">
<head>
<!-- Document metadata. The charset declaration stays first so the encoding is known before any text is parsed. -->
<meta charset="UTF-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
<title>Text Quality Rating Benchmark</title>
<!-- Web fonts: the stylesheet comes from fonts.googleapis.com, the font files themselves from
     fonts.gstatic.com (fetched in CORS mode, hence the crossorigin preconnect). -->
<link rel="preconnect" href="https://fonts.googleapis.com"/>
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin="anonymous"/>
<!-- Ampersands inside the attribute value are written as &amp; so the URL is unambiguous to the parser. -->
<link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;600;700&amp;family=Syne:wght@700;800&amp;display=swap" rel="stylesheet"/>
<!-- Chart.js is loaded as a blocking script (no defer/async); presumably the inline script at the end of
     <body> uses it straight away, so do not defer this without checking that script. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/4.4.1/chart.umd.min.js"></script>
<style>
/* Reset: border-box sizing and zero margin/padding on every element and its ::before/::after. */
*,
*::before,
*::after {
  box-sizing: border-box;
  margin: 0;
  padding: 0;
}

/* Page canvas: dark background, light monospace text, at least one viewport tall. */
body {
  background: #0d1117;
  color: #e2e8f0;
  font-family: 'JetBrains Mono', monospace;
  padding: 36px 28px 80px;
  min-height: 100vh;
}

/* Page title: font size follows viewport width, clamped between 18px and 28px. */
h1 {
  font-family: 'Syne', sans-serif;
  font-size: clamp(18px, 3vw, 28px);
  font-weight: 800;
  letter-spacing: -0.02em;
  color: #f1f5f9;
  margin-bottom: 6px;
}

/* Muted caption line. */
.subtitle {
  color: #64748b;
  font-size: 12px;
  margin-bottom: 12px;
}

/* Same muted caption, with looser line height and a larger gap below. */
.meta-subtitle {
  color: #64748b;
  font-size: 12px;
  margin-bottom: 20px;
  line-height: 1.8;
}
/* Methodology panel: bordered card, capped at 900px wide. */
.methodology-box {
  background: #111827;
  border: 1px solid #1e2a3a;
  border-radius: 8px;
  padding: 18px 22px;
  margin-bottom: 24px;
  max-width: 900px;
}

.methodology-box h3 {
  font-family: 'Syne', sans-serif;
  font-size: 14px;
  color: #e2e8f0;
  margin-bottom: 8px;
  font-weight: 700;
}

/* Body copy inside the panel (paragraphs and list items share one text style). */
.methodology-box p,
.methodology-box li {
  font-size: 11.5px;
  color: #94a3b8;
  line-height: 1.6;
}

/* The global reset removes list padding, so the indent for bullets is restored here. */
.methodology-box ul {
  margin-top: 8px;
  padding-left: 20px;
}

.methodology-box li {
  margin-bottom: 4px;
}

/* Inline emphasis for the lead-in term of each list item. */
.highlight {
  color: #7dd3fc;
  font-weight: 600;
}

/* Dim separator glyph with horizontal breathing room. */
.sep {
  color: #334155;
  margin: 0 8px;
}
/* Scoring legend: a wrapping inline-flex row of "dot + label" entries. */
.scoring-note {
  display: inline-flex;
  gap: 16px;
  flex-wrap: wrap;
  background: #131820;
  border: 1px solid #1e2a3a;
  border-radius: 8px;
  padding: 8px 14px;
  font-size: 11px;
  color: #94a3b8;
  margin-bottom: 28px;
}

/* Descendant selector: applies to every span inside the legend, including the nested .dot spans. */
.scoring-note span {
  display: flex;
  align-items: center;
  gap: 5px;
}

/* Round colour swatch; the colour itself is set inline on each element. */
.dot {
  width: 9px;
  height: 9px;
  border-radius: 50%;
  flex-shrink: 0;
}
/* Dataset switcher: buttons sit flush inside one rounded, clipped border. */
.dataset-toggle {
  display: inline-flex;
  margin-bottom: 20px;
  border: 1px solid #1e2a3a;
  border-radius: 8px;
  overflow: hidden;
}

/* `font: inherit` comes first so the size/weight declarations after it win. */
.ds-btn {
  padding: 8px 20px;
  font: inherit;
  font-size: 12px;
  font-weight: 700;
  cursor: pointer;
  border: none;
  background: #131820;
  color: #475569;
  transition: all .15s;
  letter-spacing: 0.03em;
}

.ds-btn:hover {
  color: #94a3b8;
}

/* Selected dataset. */
.ds-btn.active {
  background: #1e3a5f;
  color: #7dd3fc;
}

.ds-btn:disabled {
  opacity: 0.3;
  cursor: not-allowed;
}

/* Small tag shown next to the dataset name inside a button. */
.ds-badge {
  display: inline-block;
  font-size: 9px;
  font-weight: 700;
  padding: 1px 5px;
  border-radius: 4px;
  margin-left: 6px;
  background: #0f2840;
  color: #38bdf8;
  vertical-align: middle;
  letter-spacing: 0.05em;
}
/* Small uppercase caption above a control group. */
.filter-label {
  font-size: 10px;
  text-transform: uppercase;
  letter-spacing: 0.1em;
  color: #475569;
  margin-bottom: 10px;
}

/* Wrapping row that holds the language chips. */
#chips {
  display: flex;
  flex-wrap: wrap;
  gap: 6px;
  margin-bottom: 24px;
}

/* Pill-shaped, non-selectable filter chip. */
.chip {
  padding: 4px 11px;
  border-radius: 20px;
  font-size: 11px;
  font-family: 'JetBrains Mono', monospace;
  cursor: pointer;
  border: 1px solid #2d3748;
  background: #161b26;
  color: #94a3b8;
  transition: all .15s;
  user-select: none;
}

.chip:hover {
  border-color: #7dd3fc;
  color: #e2e8f0;
}

.chip.active {
  background: #1e3a5f;
  border-color: #38bdf8;
  color: #7dd3fc;
}

/* Metric switcher: shrink-wrapped segmented control. */
.metric-toggle {
  display: flex;
  width: fit-content;
  border: 1px solid #1e2a3a;
  border-radius: 6px;
  overflow: hidden;
  margin-bottom: 16px;
}

.mt-btn {
  padding: 6px 14px;
  font: inherit;
  font-size: 11px;
  cursor: pointer;
  border: none;
  background: #131820;
  color: #64748b;
  transition: all .15s;
}

.mt-btn.active {
  background: #1e3a5f;
  color: #7dd3fc;
}
/* Horizontally scrollable frame around the leaderboard. */
.table-wrap {
  overflow-x: auto;
  border-radius: 10px;
  border: 1px solid #1e2a3a;
  margin-bottom: 52px;
}

/* Element-level table rules: these apply to every table on the page. */
table {
  border-collapse: collapse;
  width: auto;
  min-width: 100%;
  font-size: 12px;
}

thead tr {
  background: #111827;
  border-bottom: 2px solid #1e2a3a;
}

th {
  padding: 11px 6px;
  white-space: nowrap;
}

/* Column widths by header role. */
th.rank-col {
  width: 44px;
  padding-left: 14px;
}

th.model-col {
  text-align: left;
  width: 220px;
  min-width: 180px;
  padding-left: 14px;
}

th.avg-col {
  width: 110px;
}

th.lang-col {
  width: 90px;
}

/* Header sort control: a button stripped of its chrome so it reads as header text. */
.sort-btn {
  background: none;
  border: none;
  color: #64748b;
  cursor: pointer;
  font: inherit;
  font-size: 10px;
  font-weight: 700;
  text-transform: uppercase;
  letter-spacing: 0.07em;
  display: inline-flex;
  align-items: center;
  gap: 3px;
  padding: 0;
  white-space: nowrap;
}

.sort-btn:hover {
  color: #7dd3fc;
}

.sort-btn.active {
  color: #e2e8f0;
}

/* Body rows brighten slightly on hover. */
tbody tr {
  border-bottom: 1px solid #0f1520;
  transition: filter .1s;
}

tbody tr:hover {
  filter: brightness(1.15);
}

td {
  padding: 8px 6px;
  white-space: nowrap;
}

td.rank {
  padding-left: 14px;
  color: #475569;
  font-weight: 700;
  font-size: 13px;
}

/* Model name cell: fixed width, overflow truncated with an ellipsis. */
td.model {
  padding-left: 14px;
  color: #cbd5e1;
  font-weight: 600;
  width: 220px;
  max-width: 220px;
  overflow: hidden;
  text-overflow: ellipsis;
}

td.score {
  text-align: center;
  font-weight: 700;
  font-size: 11.5px;
  border-right: 1px solid rgba(255,255,255,0.04);
}

/* Placeholder cell style for a missing value. */
td.empty {
  text-align: center;
  color: #2d3748;
  background: #111520;
  border-right: 1px solid rgba(255,255,255,0.04);
}
/* Section heading text style. */
.section-title {
  font-family: 'Syne', sans-serif;
  font-size: 18px;
  font-weight: 700;
  color: #f1f5f9;
  margin-bottom: 4px;
}

/* Bordered panel around a chart canvas. */
.chart-wrap {
  background: #111827;
  border: 1px solid #1e2a3a;
  border-radius: 10px;
  padding: 24px 20px;
}

/* Analysis card: same panel look with slightly tighter vertical padding. */
.analysis-card {
  background: #111827;
  border: 1px solid #1e2a3a;
  border-radius: 10px;
  padding: 22px 20px;
}

.analysis-card h3 {
  font-family: 'Syne', sans-serif;
  font-size: 14px;
  font-weight: 700;
  color: #f1f5f9;
  margin-bottom: 4px;
}

/* Caps chart height at 380px and scrolls vertically; right padding keeps content off the scrollbar. */
.chart-scroll-wrap {
  max-height: 380px;
  overflow-y: auto;
  overflow-x: hidden;
  padding-right: 8px;
}

.analysis-card .card-sub {
  font-size: 11px;
  color: #475569;
  margin-bottom: 16px;
  line-height: 1.5;
}

/* Full-width dropdown. */
.model-select {
  background: #1a2236;
  border: 1px solid #2d3748;
  border-radius: 6px;
  color: #cbd5e1;
  font: inherit;
  font-size: 11px;
  padding: 5px 10px;
  margin-bottom: 14px;
  cursor: pointer;
  width: 100%;
}

/* The default outline is removed; the border colour change is the only focus indicator. */
.model-select:focus {
  outline: none;
  border-color: #38bdf8;
}
/* Horizontally scrollable frame around the distribution table. */
.dist-wrap {
  overflow-x: auto;
  border-radius: 10px;
  border: 1px solid #1e2a3a;
  margin-bottom: 16px;
}

.dist-wrap table {
  border-collapse: collapse;
  width: auto;
  min-width: 100%;
  font-size: 12px;
}

.dist-wrap thead tr {
  background: #111827;
  border-bottom: 2px solid #1e2a3a;
}

/* Header cells: small uppercase labels. */
.dist-wrap th {
  padding: 10px 10px;
  white-space: nowrap;
  font-size: 10px;
  text-transform: uppercase;
  letter-spacing: 0.07em;
  color: #475569;
  font-weight: 700;
}

.dist-wrap th.lang-h {
  text-align: left;
  width: 140px;
  padding-left: 14px;
  color: #64748b;
}

.dist-wrap th.score-h {
  width: 70px;
  text-align: center;
}

.dist-wrap th.total-h {
  width: 80px;
  text-align: center;
  color: #94a3b8;
}

/* Body cells. */
.dist-wrap td {
  padding: 8px 10px;
  border-bottom: 1px solid #0f1520;
  white-space: nowrap;
}

.dist-wrap td.lang-d {
  padding-left: 14px;
  color: #cbd5e1;
  font-weight: 600;
  font-size: 12px;
}

.dist-wrap td.count-d {
  text-align: center;
  font-size: 12px;
}

.dist-wrap td.total-d {
  text-align: center;
  font-weight: 700;
  font-size: 12px;
  color: #94a3b8;
}

/* Thin inline bar; no width is set here, so it must be supplied wherever the bar is created. */
.dist-bar {
  display: inline-block;
  height: 6px;
  border-radius: 3px;
  background: #2563eb;
  vertical-align: middle;
  margin-left: 4px;
  opacity: 0.7;
}
/* Right-aligned, low-contrast footer line. */
.footer {
  margin-top: 20px;
  font-size: 11px;
  color: #2d3748;
  text-align: right;
}

/* Slim dark scrollbars via the -webkit- pseudo-elements only. */
::-webkit-scrollbar {
  width: 6px;
  height: 6px;
  background: #0d1117;
}

::-webkit-scrollbar-thumb {
  background: #2d3748;
  border-radius: 3px;
}
</style>
</head>
<body>
<!-- Page title and one-line description of the benchmark. -->
<h1>Text Quality Rating Benchmark</h1>
<p class="meta-subtitle">
  LLM accuracy at rating text quality on a 1–6 scale across multiple languages
  <span class="sep">·</span> Documents sourced from FineWeb dataset
</p>
<!-- Methodology panel: how the ground-truth ratings were produced, verified and filtered. -->
<div class="methodology-box">
  <h3>Methodology</h3>
  <p>The core objective of this benchmark is to evaluate how effectively Large Language Models can assess text quality, simulating the process of filtering data for LLM pre-training. The dataset curation followed a strict pipeline:</p>
  <ul>
    <li><span class="highlight">Initial Scoring:</span> Multilingual texts sampled from the FineWeb dataset were evaluated by <strong>DeepSeek V3.2</strong>, which assigned them a quality and substantiveness rating on a scale from 1 (lowest quality) to 6 (highest quality).</li>
    <li><span class="highlight">Verification:</span> These initial scores were subsequently verified by an independent judge, <strong>Gemini 3 Flash</strong>.</li>
    <li><span class="highlight">Filtering:</span> To ensure the highest ground-truth reliability, only the documents that received the absolute highest approval rating during the Gemini verification phase were included in this benchmark.</li>
    <li><span class="highlight">Version:</span> 1.0</li>
  </ul>
</div>
<!-- Scoring legend: points awarded by distance between predicted and true rating. -->
<div class="scoring-note">
  <span><span class="dot" style="background:#22c55e"></span>Exact match = 1.0 pt</span>
  <span><span class="dot" style="background:#eab308"></span>Off by ±1 = 0.5 pt</span>
  <span><span class="dot" style="background:#ef4444"></span>Off by ≥2 = 0.0 pt</span>
</div>
<!-- Dataset switcher. Each button calls setDataset(n), which is not defined in this part of the file.
     Every button carries exactly one id attribute (a repeated attribute on a tag is a parse error)
     and an explicit type="button" so it can never act as a form submit button. -->
<div id="dataset-toggle-wrap">
  <div class="filter-label" style="margin-bottom:8px">Dataset</div>
  <div class="dataset-toggle" style="margin-bottom:20px">
    <button type="button" class="ds-btn active" id="ds-btn-1" onclick="setDataset(1)">
      FineWeb <span class="ds-badge">WEB</span>
    </button>
    <button type="button" class="ds-btn" id="ds-btn-2" onclick="setDataset(2)">
      FinePDF <span class="ds-badge">PDF</span>
    </button>
  </div>
</div>
<!-- Language filter: the #chips container is empty in the markup (presumably filled by script — verify). -->
<div class="filter-label">Filter by language</div>
<div id="chips"></div>
<!-- Metric switcher: each button passes its metric key to setMetric(). -->
<div class="metric-toggle">
  <button id="btn-wp" class="mt-btn active" onclick="setMetric('wp')">Weighted Score</button>
  <button id="btn-ex" class="mt-btn" onclick="setMetric('ex')">Exact Accuracy</button>
  <button id="btn-parsed" class="mt-btn" onclick="setMetric('parsed')">Parse Rate</button>
  <button id="btn-mae" class="mt-btn" onclick="setMetric('mae')">MAE</button>
</div>
<!-- Leaderboard table shell: head and body are empty in the markup (presumably rendered by script — verify). -->
<div class="table-wrap">
  <table id="lb-table">
    <thead id="lb-head"></thead>
    <tbody id="lb-body"></tbody>
  </table>
</div>
<!-- Section title is a real heading so the page has a navigable outline. The .section-title class and
     the global margin reset supply all of its styling, so it renders the same as the former <p>. -->
<h2 class="section-title" style="margin-top:52px">Global Model Comparison</h2>
<p class="subtitle" style="margin-bottom:20px">Weighted Score vs Exact Accuracy — all languages combined, sorted by Weighted Score</p>
<div class="chart-wrap">
  <!-- role="img" plus aria-label give the canvas an accessible name for assistive technology. -->
  <canvas id="globalChart" role="img" aria-label="Chart of Weighted Score versus Exact Accuracy, all languages combined"></canvas>
</div>
<!-- Section title is a real heading so the page has a navigable outline. The .section-title class and
     the global margin reset supply all of its styling, so it renders the same as the former <p>. -->
<h2 class="section-title" style="margin-top:52px">Dataset Distribution</h2>
<p class="subtitle" style="margin-bottom:20px">Number of unique texts per rating score (1–6) for each language — sourced from original files</p>
<!-- Distribution table shell: head and body are empty in the markup (presumably rendered by script — verify). -->
<div class="dist-wrap">
  <table id="dist-table">
    <thead id="dist-head"></thead>
    <tbody id="dist-body"></tbody>
  </table>
</div>
<!-- Section title is a real heading, which also gives the <h3> card titles below a parent level.
     The .section-title class and the global margin reset supply all of its styling. -->
<h2 class="section-title" style="margin-bottom:4px;margin-top:52px">Model Error Analysis</h2>
<p class="subtitle" style="margin-bottom:20px">Bias, critical misclassifications and confusion patterns</p>
<!-- Bias lollipop — full width -->
<div class="analysis-card" style="margin-bottom:24px">
  <h3>Prediction Bias</h3>
  <p class="card-sub">Average error (predicted − ground truth). Negative = underestimation, positive = overestimation.</p>
  <div class="chart-scroll-wrap">
    <div id="biasChartContainer" style="position:relative">
      <!-- role="img" plus aria-label give the canvas an accessible name for assistive technology. -->
      <canvas id="biasChart" role="img" aria-label="Prediction bias chart: average error, predicted minus ground truth"></canvas>
    </div>
  </div>
</div>
<!-- Critical confusion — full width, below bias -->
<div class="analysis-card" style="margin-bottom:52px">
  <h3>Critical Confusion Rate</h3>
  <p class="card-sub">
    % of low-quality texts (rating 1–2) predicted as high-quality (5–6) and vice versa.
    These are the most dangerous misclassifications.
  </p>
  <div class="chart-scroll-wrap">
    <div id="criticalChartContainer" style="position:relative">
      <canvas id="criticalChart" role="img" aria-label="Critical confusion rate chart: low-quality texts predicted as high-quality and vice versa"></canvas>
    </div>
  </div>
</div>
<!-- Full confusion heatmap with model dropdown -->
<div class="analysis-card" style="margin-bottom:52px">
  <h3>Confusion Matrix</h3>
  <p class="card-sub">Row = ground truth rating, column = predicted rating. Values show % of predictions within each true class.</p>
  <!-- The dropdown has no visible <label>, so aria-label supplies its accessible name.
       Options are absent from the markup (presumably added by script — verify); a change calls renderConfusion(). -->
  <select class="model-select" id="confModelSelect" aria-label="Model shown in the confusion matrix" onchange="renderConfusion()"></select>
  <div id="confusionWrap" style="overflow-x:auto">
    <!-- role="img" plus aria-label give the canvas an accessible name for assistive technology. -->
    <canvas id="confusionChart" role="img" aria-label="Confusion matrix: ground truth rating by predicted rating"></canvas>
  </div>
</div>
<!-- Footer line: empty in the markup; presumably filled in by the page script via id="footer" — verify. -->
<div class="footer" id="footer"></div>
<script>
(function() {
const ALL_ROWS = [{"model": "Qwen/Qwen3.5-397B-A17B-FP8", "avg_exact": 0.60605, "avg_wp": 0.768996, "avg_bias": 0.2132, "avg_parsed": 1.0, "avg_mae": 0.4719, "total": 21952, "lang_exact": {"ab": 0.508333, "ar": 0.6725, "az": 0.695, "be": 0.73, "bg": 0.712707, "bo": 0.7575, "ca": 0.657382, "cn": 0.546667, "cs": 0.7525, "cy": 0.471667, "da": 0.585, "de": 0.56, "el": 0.6325, "en": 0.648333, "es": 0.786787, "et": 0.6775, "eu": 0.420436, "fa": 0.395, "fi": 0.7575, "fr": 0.643979, "gl": 0.5, "he": 0.511667, "hi": 0.568015, "hu": 0.6525, "hv": 0.715, "ir": 0.591667, "is": 0.675, "it": 0.76, "ka": 0.556604, "kz": 0.43, "la": 0.692008, "li": 0.625, "lv": 0.625, "mk": 0.5125, "mt": 0.655, "nl": 0.655, "no": 0.5625, "pl": 0.508772, "pt": 0.665, "ro": 0.61, "ru": 0.581667, "sk": 0.665, "sl": 0.7625, "sq": 0.7325, "sr": 0.615, "sv": 0.6125, "tr": 0.62, "uk": 0.489547}, "lang_wp": {"ab": 0.7075, "ar": 0.80375, "az": 0.805, "be": 0.83125, "bg": 0.839779, "bo": 0.87375, "ca": 0.786908, "cn": 0.721667, "cs": 0.8625, "cy": 0.664167, "da": 0.7825, "de": 0.765, "el": 0.805, "en": 0.806667, "es": 0.887387, "et": 0.82875, "eu": 0.623116, "fa": 0.606667, "fi": 0.8675, "fr": 0.793194, "gl": 0.6825, "he": 0.721667, "hi": 0.751838, "hu": 0.81, "hv": 0.855, "ir": 0.750833, "is": 0.805, "it": 0.87375, "ka": 0.687107, "kz": 0.644167, "la": 0.824561, "li": 0.80625, "lv": 0.7875, "mk": 0.69375, "mt": 0.76125, "nl": 0.82625, "no": 0.715, "pl": 0.70614, "pt": 0.815, "ro": 0.765, "ru": 0.7625, "sk": 0.8275, "sl": 0.87625, "sq": 0.85375, "sr": 0.7975, "sv": 0.79375, "tr": 0.7975, "uk": 0.702091}, "lang_parsed": {"ab": 1.0, "ar": 1.0, "az": 1.0, "be": 1.0, "bg": 1.0, "bo": 1.0, "ca": 1.0, "cn": 1.0, "cs": 1.0, "cy": 1.0, "da": 1.0, "de": 1.0, "el": 1.0, "en": 1.0, "es": 1.0, "et": 1.0, "eu": 1.0, "fa": 1.0, "fi": 1.0, "fr": 1.0, "gl": 1.0, "he": 1.0, "hi": 1.0, "hu": 1.0, "hv": 1.0, "ir": 1.0, "is": 1.0, "it": 1.0, "ka": 1.0, "kz": 1.0, "la": 1.0, "li": 1.0, "lv": 1.0, "mk": 1.0, "mt": 1.0, "nl": 
1.0, "no": 1.0, "pl": 1.0, "pt": 1.0, "ro": 1.0, "ru": 1.0, "sk": 1.0, "sl": 1.0, "sq": 1.0, "sr": 1.0, "sv": 1.0, "tr": 1.0, "uk": 1.0}, "lang_mae": {"ab": 0.5933, "ar": 0.4025, "az": 0.405, "be": 0.36, "bg": 0.3232, "bo": 0.255, "ca": 0.429, "cn": 0.5617, "cs": 0.2925, "cy": 0.6883, "da": 0.435, "de": 0.47, "el": 0.3925, "en": 0.3917, "es": 0.2252, "et": 0.3425, "eu": 0.804, "fa": 0.8133, "fi": 0.2675, "fr": 0.4162, "gl": 0.66, "he": 0.5583, "hi": 0.4982, "hu": 0.3825, "hv": 0.29, "ir": 0.5017, "is": 0.405, "it": 0.2575, "ka": 0.6352, "kz": 0.7283, "la": 0.3548, "li": 0.3875, "lv": 0.43, "mk": 0.6325, "mt": 0.5075, "nl": 0.3475, "no": 0.6175, "pl": 0.6, "pt": 0.3775, "ro": 0.5025, "ru": 0.4817, "sk": 0.3475, "sl": 0.25, "sq": 0.2925, "sr": 0.4075, "sv": 0.4125, "tr": 0.41, "uk": 0.601}, "confusion": {"1": {"1": 0.7238, "2": 0.2468, "4": 0.0134, "3": 0.0139, "6": 0.0006, "5": 0.0015}, "2": {"4": 0.162, "2": 0.5413, "3": 0.1193, "5": 0.014, "1": 0.1614, "6": 0.0021}, "3": {"4": 0.5814, "5": 0.1859, "2": 0.0798, "3": 0.1411, "6": 0.0099, "1": 0.002}, "4": {"4": 0.3681, "5": 0.5431, "6": 0.0494, "3": 0.02, "2": 0.0187, "1": 0.0006}, "5": {"5": 0.6818, "6": 0.2325, "4": 0.0831, "2": 0.0023, "3": 0.0002}, "6": {"6": 0.7118, "5": 0.2707, "4": 0.0156, "2": 0.0015, "1": 0.0002, "3": 0.0002}}}, {"model": "speakleash/Bielik-11B-v3.0-Instruct", "avg_exact": 0.445539, "avg_wp": 0.650048, "avg_bias": -0.0673, "avg_parsed": 0.981352, "avg_mae": 0.7343, "total": 21933, "lang_exact": {"ab": 0.360601, "ar": 0.094148, "az": 0.5325, "be": 0.4675, "bg": 0.574586, "bo": 0.565, "ca": 0.367688, "cn": 0.416388, "cs": 0.5575, "cy": 0.335, "da": 0.4975, "de": 0.371667, "el": 0.505, "en": 0.493333, "es": 0.696697, "et": 0.5525, "eu": 0.304858, "fa": 0.43, "fi": 0.5475, "fr": 0.60733, "gl": 0.356667, "he": 0.335017, "hi": 0.379374, "hu": 0.5425, "hv": 0.575, "ir": 0.373333, "is": 0.5, "it": 0.64, "ka": 0.310127, "kz": 0.266667, "la": 0.393762, "li": 0.47, "lv": 0.4225, "mk": 0.3725, "mt": 
0.4125, "nl": 0.5375, "no": 0.485, "pl": 0.349123, "pt": 0.53, "ro": 0.4575, "ru": 0.423333, "sk": 0.505, "sl": 0.515, "sq": 0.5775, "sr": 0.4325, "sv": 0.5175, "tr": 0.5125, "uk": 0.449477}, "lang_wp": {"ab": 0.570952, "ar": 0.232824, "az": 0.72625, "be": 0.685, "bg": 0.75, "bo": 0.72625, "ca": 0.58078, "cn": 0.646321, "cs": 0.725, "cy": 0.544167, "da": 0.715, "de": 0.630833, "el": 0.71375, "en": 0.716667, "es": 0.837838, "et": 0.76625, "eu": 0.5, "fa": 0.658333, "fi": 0.695, "fr": 0.786649, "gl": 0.583333, "he": 0.560606, "hi": 0.563536, "hu": 0.7325, "hv": 0.7225, "ir": 0.575, "is": 0.715, "it": 0.80375, "ka": 0.503165, "kz": 0.520833, "la": 0.554581, "li": 0.70125, "lv": 0.65125, "mk": 0.5775, "mt": 0.63875, "nl": 0.76, "no": 0.65875, "pl": 0.558772, "pt": 0.7275, "ro": 0.67375, "ru": 0.639167, "sk": 0.69125, "sl": 0.73125, "sq": 0.7675, "sr": 0.6725, "sv": 0.7375, "tr": 0.745, "uk": 0.650697}, "lang_parsed": {"ab": 0.926544, "ar": 0.913486, "az": 1.0, "be": 0.98, "bg": 0.972376, "bo": 0.935, "ca": 1.0, "cn": 0.996656, "cs": 0.9225, "cy": 1.0, "da": 0.9825, "de": 0.996667, "el": 1.0, "en": 0.995, "es": 1.0, "et": 0.995, "eu": 0.99665, "fa": 1.0, "fi": 0.955, "fr": 1.0, "gl": 1.0, "he": 1.0, "hi": 0.972376, "hu": 0.9675, "hv": 0.905, "ir": 1.0, "is": 0.9775, "it": 0.9975, "ka": 1.0, "kz": 1.0, "la": 0.998051, "li": 1.0, "lv": 0.9925, "mk": 0.9975, "mt": 1.0, "nl": 1.0, "no": 1.0, "pl": 0.875439, "pt": 1.0, "ro": 1.0, "ru": 0.991667, "sk": 0.9575, "sl": 0.955, "sq": 0.97, "sr": 0.9825, "sv": 1.0, "tr": 0.995, "uk": 0.987805}, "lang_mae": {"ab": 0.8216, "ar": 2.0585, "az": 0.5775, "be": 0.6454, "bg": 0.4943, "bo": 0.5909, "ca": 0.8719, "cn": 0.7215, "cs": 0.4444, "cy": 0.9967, "da": 0.5496, "de": 0.7358, "el": 0.59, "en": 0.5678, "es": 0.3273, "et": 0.4648, "eu": 1.1714, "fa": 0.705, "fi": 0.7853, "fr": 0.4346, "gl": 0.8667, "he": 0.9192, "hi": 0.9034, "hu": 0.4884, "hv": 0.5083, "ir": 0.9583, "is": 0.5473, "it": 0.3885, "ka": 1.1361, "kz": 1.0033, "la": 1.2793, 
"li": 0.605, "lv": 0.7657, "mk": 0.8972, "mt": 0.7475, "nl": 0.48, "no": 0.7225, "pl": 0.7395, "pt": 0.5475, "ro": 0.7025, "ru": 0.7277, "sk": 0.6057, "sl": 0.4764, "sq": 0.4175, "sr": 0.6845, "sv": 0.525, "tr": 0.5176, "uk": 0.6896}, "confusion": {"1": {"1": 0.8352, "4": 0.0327, "2": 0.0513, "3": 0.0734, "5": 0.0074}, "2": {"4": 0.1633, "1": 0.3314, "3": 0.2951, "2": 0.1803, "5": 0.0298}, "3": {"2": 0.0419, "3": 0.1863, "4": 0.4225, "1": 0.0572, "5": 0.2914, "6": 0.0007}, "4": {"4": 0.3742, "3": 0.0815, "5": 0.4861, "1": 0.0373, "2": 0.0202, "6": 0.0006}, "5": {"4": 0.0846, "5": 0.8737, "3": 0.02, "1": 0.0204, "6": 0.0002, "2": 0.0011}, "6": {"3": 0.0125, "5": 0.8709, "4": 0.0618, "1": 0.0444, "2": 0.0028, "6": 0.0077}}}, {"model": "allenai/Olmo-3.1-32B-Instruct", "avg_exact": 0.376751, "avg_wp": 0.571491, "avg_bias": 0.5526, "avg_parsed": 1.0, "avg_mae": 0.9768, "total": 14848, "lang_exact": {"ab": 0.392453, "ar": 0.09375, "az": 0.453165, "be": 0.323834, "bg": 0.319865, "bo": 0.448795, "ca": 0.437143, "cn": 0.441176, "cs": 0.505988, "cy": 0.408475, "da": 0.449239, "de": 0.360544, "el": 0.367454, "en": 0.478261, "es": 0.415625, "et": 0.37931, "eu": 0.322635, "fa": 0.392256, "fi": 0.49226, "fr": 0.467213, "gl": 0.322148, "he": 0.326087, "hi": 0.420945, "hu": 0.435013, "hv": 0.419048, "ir": 0.368866, "is": 0.344828, "it": 0.536232, "ka": 0.068182, "kz": 0.347518, "la": 0.269565, "li": 0.333333, "lv": 0.309524, "mk": 0.315789, "mt": 0.234375}, "lang_wp": {"ab": 0.582075, "ar": 0.225446, "az": 0.64557, "be": 0.541451, "bg": 0.557239, "bo": 0.680723, "ca": 0.584286, "cn": 0.650519, "cs": 0.691617, "cy": 0.614407, "da": 0.640863, "de": 0.581633, "el": 0.562992, "en": 0.660535, "es": 0.657813, "et": 0.62069, "eu": 0.518581, "fa": 0.571549, "fi": 0.660991, "fr": 0.643443, "gl": 0.495805, "he": 0.555254, "hi": 0.575975, "hu": 0.616711, "hv": 0.666667, "ir": 0.566836, "is": 0.514368, "it": 0.723188, "ka": 0.201299, "kz": 0.531915, "la": 0.423913, "li": 0.580103, "lv": 
0.547619, "mk": 0.489975, "mt": 0.419271}, "lang_parsed": {"ab": 1.0, "ar": 1.0, "az": 1.0, "be": 1.0, "bg": 1.0, "bo": 1.0, "ca": 1.0, "cn": 1.0, "cs": 1.0, "cy": 1.0, "da": 1.0, "de": 1.0, "el": 1.0, "en": 1.0, "es": 1.0, "et": 1.0, "eu": 1.0, "fa": 1.0, "fi": 1.0, "fr": 1.0, "gl": 1.0, "he": 1.0, "hi": 1.0, "hu": 1.0, "hv": 1.0, "ir": 1.0, "is": 1.0, "it": 1.0, "ka": 1.0, "kz": 1.0, "la": 1.0, "li": 1.0, "lv": 1.0, "mk": 1.0, "mt": 1.0}, "lang_mae": {"ab": 0.9642, "ar": 1.9152, "az": 0.7696, "be": 0.9663, "bg": 0.9933, "bo": 0.6867, "ca": 0.9514, "cn": 0.7388, "cs": 0.6677, "cy": 0.8508, "da": 0.7893, "de": 0.8759, "el": 0.9606, "en": 0.7642, "es": 0.7219, "et": 0.7931, "eu": 1.0861, "fa": 0.9259, "fi": 0.774, "fr": 0.7896, "gl": 1.1426, "he": 0.942, "hi": 0.9815, "hu": 0.8674, "hv": 0.6857, "ir": 0.934, "is": 1.5776, "it": 0.5913, "ka": 1.8214, "kz": 1.0993, "la": 1.6674, "li": 0.8889, "lv": 1.1032, "mk": 1.1855, "mt": 1.3438}, "confusion": {"1": {"2": 0.216, "6": 0.0532, "3": 0.2946, "1": 0.383, "4": 0.0421, "5": 0.0111}, "2": {"3": 0.5044, "4": 0.1503, "1": 0.0884, "6": 0.0605, "2": 0.1195, "5": 0.0769}, "3": {"3": 0.4307, "4": 0.2787, "6": 0.0785, "5": 0.1782, "1": 0.0228, "2": 0.011}, "4": {"4": 0.3239, "3": 0.2, "5": 0.2714, "6": 0.1788, "2": 0.0047, "1": 0.0212}, "5": {"5": 0.4194, "6": 0.3548, "4": 0.1425, "3": 0.0704, "1": 0.012, "2": 0.0009}, "6": {"6": 0.6655, "5": 0.2015, "4": 0.0861, "3": 0.0337, "1": 0.0116, "2": 0.0016}}}, {"model": "meta-llama/Llama-3.3-70B-Instruct", "avg_exact": 0.421966, "avg_wp": 0.552045, "avg_bias": 0.9204, "avg_parsed": 0.999818, "avg_mae": 1.0478, "total": 21952, "lang_exact": {"ab": 0.3, "ar": 0.485, "az": 0.505, "be": 0.4825, "bg": 0.549724, "bo": 0.6125, "ca": 0.481894, "cn": 0.386667, "cs": 0.57, "cy": 0.275, "da": 0.4825, "de": 0.363333, "el": 0.4175, "en": 0.358333, "es": 0.489489, "et": 0.545, "eu": 0.249581, "fa": 0.341667, "fi": 0.535, "fr": 0.439791, "gl": 0.263333, "he": 0.326667, "hi": 0.301471, "hu": 0.4625, 
"hv": 0.5825, "ir": 0.335, "is": 0.62, "it": 0.5375, "ka": 0.477987, "kz": 0.228333, "la": 0.463938, "li": 0.4275, "lv": 0.4825, "mk": 0.3325, "mt": 0.4675, "nl": 0.5625, "no": 0.4925, "pl": 0.280702, "pt": 0.5025, "ro": 0.415, "ru": 0.388333, "sk": 0.46, "sl": 0.5875, "sq": 0.495, "sr": 0.45, "sv": 0.425, "tr": 0.5475, "uk": 0.283972}, "lang_wp": {"ab": 0.453333, "ar": 0.565, "az": 0.605, "be": 0.5775, "bg": 0.640884, "bo": 0.70875, "ca": 0.547354, "cn": 0.530833, "cs": 0.665, "cy": 0.454167, "da": 0.6275, "de": 0.519167, "el": 0.55875, "en": 0.5425, "es": 0.605105, "et": 0.695, "eu": 0.38526, "fa": 0.485833, "fi": 0.64125, "fr": 0.552356, "gl": 0.4375, "he": 0.4875, "hi": 0.440257, "hu": 0.58625, "hv": 0.6925, "ir": 0.485833, "is": 0.725, "it": 0.63625, "ka": 0.561321, "kz": 0.376667, "la": 0.615984, "li": 0.55875, "lv": 0.60875, "mk": 0.44125, "mt": 0.59875, "nl": 0.68125, "no": 0.58625, "pl": 0.435088, "pt": 0.60125, "ro": 0.5325, "ru": 0.529167, "sk": 0.58125, "sl": 0.71, "sq": 0.63625, "sr": 0.5775, "sv": 0.5475, "tr": 0.69125, "uk": 0.399826}, "lang_parsed": {"ab": 1.0, "ar": 1.0, "az": 1.0, "be": 1.0, "bg": 1.0, "bo": 1.0, "ca": 1.0, "cn": 1.0, "cs": 1.0, "cy": 1.0, "da": 1.0, "de": 1.0, "el": 1.0, "en": 1.0, "es": 1.0, "et": 1.0, "eu": 1.0, "fa": 1.0, "fi": 0.9925, "fr": 1.0, "gl": 1.0, "he": 1.0, "hi": 1.0, "hu": 1.0, "hv": 1.0, "ir": 1.0, "is": 1.0, "it": 1.0, "ka": 1.0, "kz": 1.0, "la": 1.0, "li": 1.0, "lv": 1.0, "mk": 1.0, "mt": 1.0, "nl": 1.0, "no": 1.0, "pl": 1.0, "pt": 1.0, "ro": 1.0, "ru": 1.0, "sk": 1.0, "sl": 1.0, "sq": 1.0, "sr": 1.0, "sv": 0.9975, "tr": 1.0, "uk": 1.0}, "lang_mae": {"ab": 1.29, "ar": 1.2725, "az": 1.025, "be": 1.0575, "bg": 0.8204, "bo": 0.6125, "ca": 1.2033, "cn": 1.04, "cs": 0.725, "cy": 1.2683, "da": 0.83, "de": 1.035, "el": 1.0475, "en": 0.9967, "es": 0.8649, "et": 0.6425, "eu": 1.5394, "fa": 1.2017, "fi": 0.7632, "fr": 1.0471, "gl": 1.385, "he": 1.1683, "hi": 1.3015, "hu": 1.0175, "hv": 0.65, "ir": 1.1633, "is": 0.65, 
"it": 0.795, "ka": 1.2013, "kz": 1.5567, "la": 0.922, "li": 1.0325, "lv": 0.9475, "mk": 1.5725, "mt": 1.02, "nl": 0.6775, "no": 0.96, "pl": 1.2246, "pt": 0.9025, "ro": 1.1375, "ru": 0.995, "sk": 0.95, "sl": 0.6275, "sq": 0.7875, "sr": 1.0225, "sv": 1.0526, "tr": 0.655, "uk": 1.3432}, "confusion": {"1": {"1": 0.4084, "2": 0.2733, "5": 0.0553, "4": 0.2302, "6": 0.0038, "3": 0.029}, "2": {"4": 0.4936, "5": 0.2946, "2": 0.1358, "6": 0.0159, "1": 0.0455, "3": 0.0146}, "3": {"5": 0.7686, "6": 0.0758, "4": 0.1457, "2": 0.004, "1": 0.0059}, "4": {"5": 0.7606, "6": 0.1988, "4": 0.0381, "1": 0.0013, "2": 0.0013}, "5": {"5": 0.6645, "6": 0.3313, "4": 0.004, "2": 0.0002}, "6": {"6": 0.7575, "5": 0.2392, "4": 0.0029, "1": 0.0004}}}, {"model": "CYFRAGOVPL/Llama-PLLuM-70B-chat-250801", "avg_exact": 0.298105, "avg_wp": 0.518928, "avg_bias": -0.3311, "avg_parsed": 0.996492, "avg_mae": 1.0453, "total": 21952, "lang_exact": {"ab": 0.296667, "ar": 0.395, "az": 0.2425, "be": 0.345, "bg": 0.303867, "bo": 0.33, "ca": 0.169916, "cn": 0.323333, "cs": 0.4025, "cy": 0.361667, "da": 0.3525, "de": 0.335, "el": 0.1825, "en": 0.295, "es": 0.312312, "et": 0.3075, "eu": 0.271357, "fa": 0.315, "fi": 0.2375, "fr": 0.374346, "gl": 0.283333, "he": 0.323333, "hi": 0.395221, "hu": 0.275, "hv": 0.3525, "ir": 0.298333, "is": 0.405, "it": 0.315, "ka": 0.081761, "kz": 0.305, "la": 0.2846, "li": 0.21, "lv": 0.27, "mk": 0.075, "mt": 0.3225, "nl": 0.3125, "no": 0.295, "pl": 0.315789, "pt": 0.2975, "ro": 0.2475, "ru": 0.335, "sk": 0.2675, "sl": 0.2125, "sq": 0.1675, "sr": 0.32, "sv": 0.3375, "tr": 0.2925, "uk": 0.358885}, "lang_wp": {"ab": 0.515833, "ar": 0.61125, "az": 0.4675, "be": 0.5825, "bg": 0.517956, "bo": 0.55875, "ca": 0.4039, "cn": 0.520833, "cs": 0.61875, "cy": 0.583333, "da": 0.5775, "de": 0.5775, "el": 0.41375, "en": 0.4825, "es": 0.555556, "et": 0.515, "eu": 0.464824, "fa": 0.515833, "fi": 0.455, "fr": 0.590314, "gl": 0.48, "he": 0.535, "hi": 0.606618, "hu": 0.49875, "hv": 0.585, "ir": 0.465833, 
"is": 0.63625, "it": 0.53125, "ka": 0.292453, "kz": 0.520833, "la": 0.475634, "li": 0.47375, "lv": 0.515, "mk": 0.305, "mt": 0.58, "nl": 0.55625, "no": 0.495, "pl": 0.534211, "pt": 0.515, "ro": 0.455, "ru": 0.59, "sk": 0.47625, "sl": 0.4625, "sq": 0.40625, "sr": 0.56875, "sv": 0.57875, "tr": 0.5325, "uk": 0.589721}, "lang_parsed": {"ab": 1.0, "ar": 1.0, "az": 1.0, "be": 1.0, "bg": 0.997238, "bo": 0.9975, "ca": 1.0, "cn": 1.0, "cs": 1.0, "cy": 1.0, "da": 0.9975, "de": 1.0, "el": 1.0, "en": 0.995, "es": 1.0, "et": 1.0, "eu": 0.998325, "fa": 1.0, "fi": 0.9925, "fr": 1.0, "gl": 0.998333, "he": 0.998333, "hi": 1.0, "hu": 0.9975, "hv": 1.0, "ir": 0.998333, "is": 1.0, "it": 0.9975, "ka": 0.996855, "kz": 0.995, "la": 1.0, "li": 1.0, "lv": 1.0, "mk": 0.9975, "mt": 1.0, "nl": 1.0, "no": 0.9975, "pl": 0.907018, "pt": 1.0, "ro": 0.9975, "ru": 1.0, "sk": 1.0, "sl": 1.0, "sq": 1.0, "sr": 1.0, "sv": 0.9975, "tr": 0.9975, "uk": 1.0}, "lang_mae": {"ab": 1.0233, "ar": 0.885, "az": 1.15, "be": 0.885, "bg": 1.036, "bo": 0.9323, "ca": 1.2897, "cn": 1.0767, "cs": 0.85, "cy": 0.92, "da": 0.8822, "de": 0.905, "el": 1.245, "en": 1.1993, "es": 0.9129, "et": 1.155, "eu": 1.2383, "fa": 1.09, "fi": 1.2368, "fr": 0.8586, "gl": 1.1235, "he": 0.9666, "hi": 0.8199, "hu": 1.0426, "hv": 0.9125, "ir": 1.3556, "is": 0.765, "it": 0.9975, "ka": 1.5331, "kz": 1.0519, "la": 1.3957, "li": 1.065, "lv": 1.0075, "mk": 1.4211, "mt": 0.8875, "nl": 0.92, "no": 1.1378, "pl": 0.8781, "pt": 1.0225, "ro": 1.193, "ru": 0.8533, "sk": 1.1675, "sl": 1.11, "sq": 1.2375, "sr": 0.92, "sv": 0.8972, "tr": 0.9474, "uk": 0.8554}, "confusion": {"1": {"1": 0.6736, "3": 0.2988, "4": 0.0116, "5": 0.0075, "2": 0.0079, "6": 0.0006}, "2": {"3": 0.5543, "1": 0.3632, "5": 0.0304, "4": 0.044, "6": 0.0017, "2": 0.0065}, "3": {"3": 0.5333, "4": 0.1967, "1": 0.1281, "5": 0.1406, "6": 0.0007, "2": 0.0007}, "4": {"3": 0.4161, "4": 0.266, "5": 0.2365, "1": 0.0776, "6": 0.0025, "2": 0.0013}, "5": {"4": 0.3081, "3": 0.2347, "5": 0.427, "1": 
0.0258, "6": 0.0038, "2": 0.0006}, "6": {"4": 0.2463, "5": 0.539, "3": 0.1553, "1": 0.0383, "6": 0.0209, "2": 0.0002}}}, {"model": "speakleash/Bielik-11B-v2.6-Instruct", "avg_exact": 0.26891, "avg_wp": 0.480144, "avg_bias": 0.4487, "avg_parsed": 1.0, "avg_mae": 1.1156, "total": 21933, "lang_exact": {"ab": 0.297162, "ar": 0.002545, "az": 0.2625, "be": 0.2875, "bg": 0.292818, "bo": 0.365, "ca": 0.306407, "cn": 0.274247, "cs": 0.3325, "cy": 0.196667, "da": 0.415, "de": 0.288333, "el": 0.28, "en": 0.301667, "es": 0.42042, "et": 0.28, "eu": 0.232831, "fa": 0.293333, "fi": 0.295, "fr": 0.272251, "gl": 0.258333, "he": 0.215488, "hi": 0.267035, "hu": 0.2325, "hv": 0.2975, "ir": 0.186667, "is": 0.1125, "it": 0.33, "ka": 0.132911, "kz": 0.215, "la": 0.204678, "li": 0.23, "lv": 0.1775, "mk": 0.305, "mt": 0.2825, "nl": 0.45, "no": 0.2725, "pl": 0.292982, "pt": 0.2525, "ro": 0.215, "ru": 0.293333, "sk": 0.32, "sl": 0.38, "sq": 0.16, "sr": 0.355, "sv": 0.3575, "tr": 0.2675, "uk": 0.249129}, "lang_wp": {"ab": 0.488314, "ar": 0.128499, "az": 0.4725, "be": 0.49125, "bg": 0.524862, "bo": 0.58, "ca": 0.493036, "cn": 0.441472, "cs": 0.595, "cy": 0.393333, "da": 0.63125, "de": 0.48, "el": 0.48625, "en": 0.525833, "es": 0.600601, "et": 0.50375, "eu": 0.449749, "fa": 0.506667, "fi": 0.53, "fr": 0.502618, "gl": 0.460833, "he": 0.43266, "hi": 0.443831, "hu": 0.46875, "hv": 0.5525, "ir": 0.360833, "is": 0.2825, "it": 0.58125, "ka": 0.349684, "kz": 0.4175, "la": 0.397661, "li": 0.47, "lv": 0.43125, "mk": 0.49125, "mt": 0.44375, "nl": 0.675, "no": 0.4775, "pl": 0.510526, "pt": 0.47625, "ro": 0.47, "ru": 0.5325, "sk": 0.5675, "sl": 0.6125, "sq": 0.3875, "sr": 0.56125, "sv": 0.5525, "tr": 0.4825, "uk": 0.482578}, "lang_parsed": {"ab": 1.0, "ar": 1.0, "az": 1.0, "be": 1.0, "bg": 1.0, "bo": 1.0, "ca": 1.0, "cn": 1.0, "cs": 1.0, "cy": 1.0, "da": 1.0, "de": 1.0, "el": 1.0, "en": 1.0, "es": 1.0, "et": 1.0, "eu": 1.0, "fa": 1.0, "fi": 1.0, "fr": 1.0, "gl": 1.0, "he": 1.0, "hi": 1.0, "hu": 1.0, "hv": 
1.0, "ir": 1.0, "is": 1.0, "it": 1.0, "ka": 1.0, "kz": 1.0, "la": 1.0, "li": 1.0, "lv": 1.0, "mk": 1.0, "mt": 1.0, "nl": 1.0, "no": 1.0, "pl": 1.0, "pt": 1.0, "ro": 1.0, "ru": 1.0, "sk": 1.0, "sl": 1.0, "sq": 1.0, "sr": 1.0, "sv": 1.0, "tr": 1.0, "uk": 1.0}, "lang_mae": {"ab": 1.1302, "ar": 2.0204, "az": 1.1125, "be": 1.0675, "bg": 0.989, "bo": 0.85, "ca": 1.1783, "cn": 1.2324, "cs": 0.8525, "cy": 1.3217, "da": 0.76, "de": 1.0733, "el": 1.1475, "en": 0.9983, "es": 0.8228, "et": 1.0075, "eu": 1.1859, "fa": 1.0367, "fi": 0.99, "fr": 1.0183, "gl": 1.16, "he": 1.2121, "hi": 1.2449, "hu": 1.1325, "hv": 0.905, "ir": 1.4883, "is": 1.81, "it": 0.8575, "ka": 1.5158, "kz": 1.2533, "la": 1.3899, "li": 1.08, "lv": 1.1575, "mk": 1.095, "mt": 1.3, "nl": 0.655, "no": 1.175, "pl": 1.0175, "pt": 1.0825, "ro": 1.105, "ru": 0.95, "sk": 0.8725, "sl": 0.7775, "sq": 1.265, "sr": 0.9025, "sv": 0.92, "tr": 1.055, "uk": 1.0418}, "confusion": {"1": {"3": 0.7791, "5": 0.0137, "1": 0.1906, "6": 0.0115, "4": 0.0036, "2": 0.0015}, "2": {"3": 0.8773, "6": 0.0123, "5": 0.0831, "4": 0.0161, "1": 0.0111, "2": 0.0002}, "3": {"5": 0.3681, "3": 0.5488, "6": 0.06, "4": 0.0218, "1": 0.0013}, "4": {"3": 0.4065, "5": 0.4784, "6": 0.0926, "4": 0.0163, "1": 0.0063}, "5": {"5": 0.5982, "3": 0.1713, "6": 0.2082, "4": 0.0213, "1": 0.001}, "6": {"5": 0.5402, "3": 0.1576, "6": 0.2809, "4": 0.0203, "1": 0.0011}}}, {"model": "CYFRAGOVPL/pllum-12b-nc-chat-250715", "avg_exact": 0.181031, "avg_wp": 0.290794, "avg_bias": -0.1791, "avg_parsed": 0.682125, "avg_mae": 1.7019, "total": 21952, "lang_exact": {"ab": 0.161667, "ar": 0.0975, "az": 0.2425, "be": 0.2625, "bg": 0.198895, "bo": 0.1675, "ca": 0.147632, "cn": 0.131667, "cs": 0.1425, "cy": 0.153333, "da": 0.2825, "de": 0.146667, "el": 0.1975, "en": 0.196667, "es": 0.24024, "et": 0.24, "eu": 0.155779, "fa": 0.16, "fi": 0.195, "fr": 0.180628, "gl": 0.2, "he": 0.166667, "hi": 0.183824, "hu": 0.215, "hv": 0.235, "ir": 0.111667, "is": 0.17, "it": 0.1825, "ka": 0.157233, 
"kz": 0.136667, "la": 0.251462, "li": 0.215, "lv": 0.155, "mk": 0.1775, "mt": 0.15, "nl": 0.1925, "no": 0.215, "pl": 0.115789, "pt": 0.1625, "ro": 0.2425, "ru": 0.236667, "sk": 0.2175, "sl": 0.2425, "sq": 0.1075, "sr": 0.155, "sv": 0.22, "tr": 0.195, "uk": 0.121951}, "lang_wp": {"ab": 0.265833, "ar": 0.17375, "az": 0.3925, "be": 0.3925, "bg": 0.299724, "bo": 0.24625, "ca": 0.236769, "cn": 0.211667, "cs": 0.2425, "cy": 0.285, "da": 0.4425, "de": 0.2475, "el": 0.335, "en": 0.264167, "es": 0.355856, "et": 0.34, "eu": 0.250419, "fa": 0.263333, "fi": 0.2975, "fr": 0.306283, "gl": 0.345, "he": 0.26, "hi": 0.318015, "hu": 0.3275, "hv": 0.34375, "ir": 0.189167, "is": 0.30125, "it": 0.2575, "ka": 0.295597, "kz": 0.264167, "la": 0.359649, "li": 0.28625, "lv": 0.26125, "mk": 0.2925, "mt": 0.29125, "nl": 0.335, "no": 0.28875, "pl": 0.17193, "pt": 0.25375, "ro": 0.41375, "ru": 0.358333, "sk": 0.3175, "sl": 0.39875, "sq": 0.21125, "sr": 0.2725, "sv": 0.39, "tr": 0.29625, "uk": 0.213415}, "lang_parsed": {"ab": 0.63, "ar": 0.4875, "az": 0.8725, "be": 0.6975, "bg": 0.588398, "bo": 0.515, "ca": 0.481894, "cn": 0.588333, "cs": 0.5275, "cy": 0.86, "da": 0.84, "de": 0.57, "el": 0.795, "en": 0.51, "es": 0.672673, "et": 0.7525, "eu": 0.748744, "fa": 0.801667, "fi": 0.645, "fr": 0.612565, "gl": 0.69, "he": 0.643333, "hi": 0.871324, "hu": 0.7525, "hv": 0.63, "ir": 0.588333, "is": 0.7675, "it": 0.5375, "ka": 0.748428, "kz": 0.836667, "la": 0.807018, "li": 0.585, "lv": 0.545, "mk": 0.74, "mt": 0.7725, "nl": 0.7175, "no": 0.6175, "pl": 0.477193, "pt": 0.5375, "ro": 0.855, "ru": 0.816667, "sk": 0.695, "sl": 0.83, "sq": 0.5775, "sr": 0.525, "sv": 0.7975, "tr": 0.76, "uk": 0.721254}, "lang_mae": {"ab": 1.4735, "ar": 2.2154, "az": 1.6848, "be": 1.3728, "bg": 1.4507, "bo": 1.7233, "ca": 1.3873, "cn": 1.9773, "cs": 1.6635, "cy": 1.6667, "da": 1.2232, "de": 1.4474, "el": 2.0472, "en": 1.3954, "es": 1.442, "et": 1.7807, "eu": 2.0045, "fa": 1.9459, "fi": 1.845, "fr": 1.6068, "gl": 1.2367, "he": 
1.6632, "hi": 1.7511, "hu": 1.9967, "hv": 1.5238, "ir": 1.9065, "is": 1.8078, "it": 1.6512, "ka": 1.7521, "kz": 2.0159, "la": 1.7101, "li": 1.8077, "lv": 1.6743, "mk": 1.7872, "mt": 1.7508, "nl": 1.331, "no": 1.8462, "pl": 1.7353, "pt": 1.7116, "ro": 1.4123, "ru": 1.4878, "sk": 1.8094, "sl": 1.5361, "sq": 2.0346, "sr": 1.1524, "sv": 1.442, "tr": 2.2368, "uk": 2.1304}, "confusion": {"1": {"6": 0.1641, "2": 0.1494, "1": 0.5374, "3": 0.0636, "5": 0.0381, "4": 0.0473}, "2": {"2": 0.1685, "1": 0.3392, "4": 0.0745, "3": 0.1322, "6": 0.217, "5": 0.0686}, "3": {"4": 0.1226, "3": 0.0934, "1": 0.2472, "6": 0.2585, "5": 0.1802, "2": 0.0981}, "4": {"6": 0.2685, "5": 0.2005, "4": 0.1191, "1": 0.2601, "3": 0.0856, "2": 0.0663}, "5": {"4": 0.1975, "2": 0.032, "6": 0.2114, "5": 0.2705, "1": 0.2123, "3": 0.0763}, "6": {"4": 0.1714, "5": 0.2743, "6": 0.2136, "1": 0.2624, "2": 0.0327, "3": 0.0457}}}];
// Language codes available in the first dataset (48 entries).
// The order follows the English display names in LANG_NAMES (Albanian, Arabic, …, Welsh),
// not the codes themselves.
const ALL_LANGS = ["sq", "ab", "ar", "az", "eu", "be", "bo", "bg", "ca", "cn", "hv", "cs", "da", "nl", "en", "et", "fa", "fi", "fr", "gl", "ka", "de", "el", "he", "hi", "hu", "is", "ir", "it", "kz", "la", "lv", "li", "mk", "mt", "no", "pl", "pt", "ro", "ru", "sr", "sk", "sl", "es", "sv", "tr", "uk", "cy"];
// Display names for the language codes used by the benchmark data.
// The codes are the benchmark's own scheme, not ISO 639-1: e.g. "ab" is Arabic,
// "ar" is Armenian, "fa" is Faroese and "pe" is Persian. Codes missing from this
// map are rendered upper-cased by langName().
// "hv" previously read "Croatia" (a country name among language names); it now
// matches the "Croatian" spelling already used for "hr".
// NOTE(review): "bn": "Brunei" also reads like a place name rather than a
// language — confirm the intended label with the data producer.
const LANG_NAMES = {"af": "Afrikaans", "ab": "Arabic", "az": "Azerbaijani", "ar": "Armenian", "be": "Belarusian", "bo": "Bosnian", "bg": "Bulgarian", "bn": "Brunei", "ca": "Catalan", "cs": "Czech", "cn": "Chinese", "cy": "Welsh", "da": "Danish", "de": "German", "el": "Greek", "en": "English", "eo": "Esperanto", "es": "Spanish", "et": "Estonian", "eu": "Basque", "fa": "Faroese", "fi": "Finnish", "fr": "French", "ga": "Irish", "gl": "Galician", "gu": "Gujarati", "he": "Hebrew", "hi": "Hindi", "hr": "Croatian", "hu": "Hungarian", "hv": "Croatian", "id": "Indonesian", "is": "Icelandic", "it": "Italian", "ir": "Irish", "ja": "Japanese", "ka": "Georgian", "kz": "Kazakh", "km": "Khmer", "kn": "Kannada", "ko": "Korean", "la": "Latin", "li": "Lithuanian", "lv": "Latvian", "mk": "Macedonian", "ml": "Malayalam", "mn": "Mongolian", "mr": "Marathi", "ms": "Malay", "mt": "Maltese", "my": "Burmese", "ne": "Nepali", "nl": "Dutch", "no": "Norwegian", "pa": "Punjabi", "pe": "Persian", "pl": "Polish", "pt": "Portuguese", "ro": "Romanian", "ru": "Russian", "si": "Sinhala", "sk": "Slovak", "sl": "Slovenian", "sq": "Albanian", "sr": "Serbian", "sv": "Swedish", "sw": "Swahili", "ta": "Tamil", "te": "Telugu", "th": "Thai", "tl": "Filipino", "tr": "Turkish", "uk": "Ukrainian", "ur": "Urdu", "uz": "Uzbek", "vi": "Vietnamese", "zh": "Chinese", "zu": "Zulu"};
// Sample count per language for the first dataset.
// Each value equals the sum of that language's buckets in LANG_DIST
// (e.g. "bg": 100 + 100 + 100 + 62 = 362).
const LANG_COUNTS = {"ab": 600, "ar": 400, "az": 400, "be": 400, "bg": 362, "bo": 400, "ca": 359, "cn": 600, "cs": 400, "cy": 600, "da": 400, "de": 600, "el": 400, "en": 600, "es": 333, "et": 400, "eu": 597, "fa": 600, "fi": 400, "fr": 382, "gl": 600, "he": 600, "hi": 544, "hu": 400, "hv": 400, "ir": 600, "is": 400, "it": 400, "ka": 318, "kz": 600, "la": 513, "li": 400, "lv": 400, "mk": 400, "mt": 400, "nl": 400, "no": 400, "pl": 571, "pt": 400, "ro": 400, "ru": 600, "sk": 400, "sl": 400, "sq": 400, "sr": 400, "sv": 400, "tr": 400, "uk": 574};
// Per-language sample counts for the first dataset, keyed by bucket "1".."6"
// (presumably the quality-rating classes — confirm against the data producer).
// Many languages list only buckets 1, 2, 5 and 6; others list all six.
// No bucket exceeds 100, and the buckets of a language add up to its LANG_COUNTS entry.
const LANG_DIST = {"ab": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "ar": {"1": 100, "2": 100, "5": 100, "6": 100}, "az": {"1": 100, "2": 100, "5": 100, "6": 100}, "be": {"1": 100, "2": 100, "5": 100, "6": 100}, "bg": {"1": 100, "2": 100, "5": 100, "6": 62}, "bo": {"1": 100, "2": 100, "5": 100, "6": 100}, "ca": {"1": 73, "2": 100, "5": 100, "6": 86}, "cn": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "cs": {"1": 100, "2": 100, "5": 100, "6": 100}, "cy": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "da": {"1": 100, "2": 100, "5": 100, "6": 100}, "de": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "el": {"1": 100, "2": 100, "5": 100, "6": 100}, "en": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "es": {"1": 100, "2": 100, "5": 100, "6": 33}, "et": {"1": 100, "2": 100, "5": 100, "6": 100}, "eu": {"1": 97, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "fa": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "fi": {"1": 100, "2": 100, "5": 100, "6": 100}, "fr": {"1": 100, "2": 100, "5": 100, "6": 82}, "gl": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "he": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "hi": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 44}, "hu": {"1": 100, "2": 100, "5": 100, "6": 100}, "hv": {"1": 100, "2": 100, "5": 100, "6": 100}, "ir": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "is": {"1": 100, "2": 100, "5": 100, "6": 100}, "it": {"1": 100, "2": 100, "5": 100, "6": 100}, "ka": {"1": 18, "2": 100, "5": 100, "6": 100}, "kz": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "la": {"1": 100, "2": 96, "3": 17, "4": 100, "5": 100, "6": 100}, "li": {"1": 100, "2": 100, "5": 100, "6": 100}, "lv": {"1": 100, "2": 100, "5": 100, "6": 100}, "mk": {"1": 100, "2": 100, "5": 100, "6": 100}, "mt": {"1": 100, "2": 100, "5": 100, "6": 100}, "nl": {"1": 100, "2": 100, "5": 100, "6": 100}, 
"no": {"1": 100, "2": 100, "5": 100, "6": 100}, "pl": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 71}, "pt": {"1": 100, "2": 100, "5": 100, "6": 100}, "ro": {"1": 100, "2": 100, "5": 100, "6": 100}, "ru": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "sk": {"1": 100, "2": 100, "5": 100, "6": 100}, "sl": {"1": 100, "2": 100, "5": 100, "6": 100}, "sq": {"1": 100, "2": 100, "5": 100, "6": 100}, "sr": {"1": 100, "2": 100, "5": 100, "6": 100}, "sv": {"1": 100, "2": 100, "5": 100, "6": 100}, "tr": {"1": 100, "2": 100, "5": 100, "6": 100}, "uk": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 74}};
// ── Second dataset ──
// Per the activeDS comment further down, dataset 1 is FineWeb and dataset 2 is FinePDF.
// The *_2 constants that follow are the dataset-2 counterparts of
// ALL_ROWS / ALL_LANGS / LANG_COUNTS / LANG_DIST.
// HAS_SECOND gates the switch: when it is false, setDataset(2) returns without changing anything.
const HAS_SECOND = true;
const ALL_ROWS_2 = [{"model": "mistralai/Mistral-Small-24B-Instruct-2501", "avg_exact": 0.31779, "avg_wp": 0.560195, "avg_bias": 0.0284, "avg_parsed": 1.0, "avg_mae": 0.9057, "total": 7999, "lang_exact": {"ab": 0.294382, "cn": 0.307407, "cy": 0.321932, "de": 0.336824, "en": 0.415755, "es": 0.382353, "eu": 0.289817, "fr": 0.302867, "gl": 0.305609, "he": 0.306641, "hi": 0.241055, "ir": 0.272727, "kz": 0.376923, "la": 0.319444, "pl": 0.341727, "sl": 0.295585, "uk": 0.318681}, "lang_wp": {"ab": 0.529213, "cn": 0.544444, "cy": 0.560362, "de": 0.576789, "en": 0.671772, "es": 0.639706, "eu": 0.523499, "fr": 0.580645, "gl": 0.56383, "he": 0.544922, "hi": 0.461394, "ir": 0.513834, "kz": 0.621154, "la": 0.402778, "pl": 0.570144, "sl": 0.536468, "uk": 0.596703}, "lang_parsed": {"ab": 1.0, "cn": 1.0, "cy": 1.0, "de": 1.0, "en": 1.0, "es": 1.0, "eu": 1.0, "fr": 1.0, "gl": 1.0, "he": 1.0, "hi": 1.0, "ir": 1.0, "kz": 1.0, "la": 1.0, "pl": 1.0, "sl": 1.0, "uk": 1.0}, "lang_mae": {"ab": 0.9843, "cn": 0.963, "cy": 0.8954, "de": 0.8709, "en": 0.6718, "es": 0.7243, "eu": 0.9687, "fr": 0.8548, "gl": 0.8878, "he": 0.9414, "hi": 1.1337, "ir": 1.0099, "kz": 0.7577, "la": 1.3056, "pl": 0.8759, "sl": 0.9539, "uk": 0.8154}, "confusion": {"3": {"4": 0.5322, "2": 0.0946, "3": 0.346, "1": 0.01, "6": 0.0007, "5": 0.0165}, "2": {"4": 0.3167, "2": 0.2579, "3": 0.4001, "1": 0.0219, "5": 0.0034}, "1": {"2": 0.4823, "3": 0.2102, "1": 0.2138, "4": 0.0919, "6": 0.0018}, "4": {"4": 0.6669, "3": 0.2057, "2": 0.0512, "5": 0.0708, "1": 0.0054}, "5": {"4": 0.5988, "5": 0.3031, "3": 0.0791, "2": 0.0132, "6": 0.0059}, "6": {"4": 0.399, "5": 0.5197, "3": 0.0335, "6": 0.0369, "2": 0.0075, "1": 0.0034}}}, {"model": "allenai/Olmo-3.1-32B-Instruct", "avg_exact": 0.35418, "avg_wp": 0.549877, "avg_bias": 0.6144, "avg_parsed": 1.0, "avg_mae": 1.0553, "total": 7739, "lang_exact": {"ab": 0.355499, "cn": 0.368715, "cy": 0.346154, "de": 0.37766, "en": 0.369892, "es": 0.468635, "eu": 0.312668, "fr": 0.4, "gl": 0.321569, 
"he": 0.281319, "hi": 0.357853, "ir": 0.348606, "kz": 0.164444, "la": 0.464789, "pl": 0.388385, "sl": 0.327485, "uk": 0.295943}, "lang_wp": {"ab": 0.528133, "cn": 0.566108, "cy": 0.552632, "de": 0.575355, "en": 0.563441, "es": 0.664207, "eu": 0.494609, "fr": 0.597297, "gl": 0.52549, "he": 0.497802, "hi": 0.549702, "ir": 0.557769, "kz": 0.386667, "la": 0.489437, "pl": 0.595281, "sl": 0.515595, "uk": 0.502387}, "lang_parsed": {"ab": 1.0, "cn": 1.0, "cy": 1.0, "de": 1.0, "en": 1.0, "es": 1.0, "eu": 1.0, "fr": 1.0, "gl": 1.0, "he": 1.0, "hi": 1.0, "ir": 1.0, "kz": 1.0, "la": 1.0, "pl": 1.0, "sl": 1.0, "uk": 1.0}, "lang_mae": {"ab": 1.202, "cn": 0.9777, "cy": 1.0223, "de": 1.0018, "en": 1.0817, "es": 0.7583, "eu": 1.1968, "fr": 0.9099, "gl": 1.1, "he": 1.1231, "hi": 1.0676, "ir": 0.9622, "kz": 1.44, "la": 1.8592, "pl": 0.9274, "sl": 1.1384, "uk": 1.1074}, "confusion": {"1": {"1": 0.5078, "6": 0.0683, "3": 0.1588, "2": 0.2133, "5": 0.0185, "4": 0.0332}, "2": {"1": 0.1237, "3": 0.3462, "4": 0.1493, "5": 0.0891, "2": 0.1465, "6": 0.1451}, "3": {"1": 0.0211, "2": 0.0605, "6": 0.1355, "3": 0.3336, "4": 0.2658, "5": 0.1835}, "4": {"5": 0.2575, "3": 0.2026, "4": 0.2484, "6": 0.2498, "2": 0.0236, "1": 0.018}, "5": {"4": 0.1315, "6": 0.5058, "5": 0.2848, "3": 0.0623, "1": 0.0086, "2": 0.007}, "6": {"6": 0.718, "5": 0.1694, "4": 0.0613, "3": 0.027, "2": 0.0009, "1": 0.0234}}}, {"model": "Qwen/Qwen2.5-14B-Instruct", "avg_exact": 0.278385, "avg_wp": 0.516447, "avg_bias": 0.053, "avg_parsed": 1.0, "avg_mae": 0.9821, "total": 7874, "lang_exact": {"ab": 0.253968, "cn": 0.291367, "cy": 0.308316, "de": 0.278571, "en": 0.324561, "es": 0.305556, "eu": 0.248649, "fr": 0.337545, "gl": 0.252964, "he": 0.276062, "hi": 0.226824, "ir": 0.282869, "kz": 0.341772, "la": 0.262411, "pl": 0.21558, "sl": 0.268627, "uk": 0.269142}, "lang_wp": {"ab": 0.503401, "cn": 0.555755, "cy": 0.548682, "de": 0.5125, "en": 0.569079, "es": 0.557407, "eu": 0.467568, "fr": 0.563177, "gl": 0.499012, "he": 0.52027, 
"hi": 0.455621, "ir": 0.513944, "kz": 0.586498, "la": 0.358156, "pl": 0.447464, "sl": 0.510784, "uk": 0.520882}, "lang_parsed": {"ab": 1.0, "cn": 1.0, "cy": 1.0, "de": 1.0, "en": 1.0, "es": 1.0, "eu": 1.0, "fr": 1.0, "gl": 1.0, "he": 1.0, "hi": 1.0, "ir": 1.0, "kz": 1.0, "la": 1.0, "pl": 1.0, "sl": 1.0, "uk": 1.0}, "lang_mae": {"ab": 1.0113, "cn": 0.8903, "cy": 0.9148, "de": 0.9839, "en": 0.8684, "es": 0.8889, "eu": 1.0973, "fr": 0.8809, "gl": 1.0158, "he": 0.9807, "hi": 1.1144, "ir": 1.0, "kz": 0.827, "la": 1.3333, "pl": 1.125, "sl": 1.0039, "uk": 0.9606}, "confusion": {"1": {"1": 0.1435, "2": 0.4625, "3": 0.3373, "4": 0.0567}, "2": {"3": 0.4684, "2": 0.1699, "4": 0.3432, "1": 0.011, "5": 0.0076}, "3": {"4": 0.5922, "2": 0.049, "3": 0.3429, "5": 0.0144, "1": 0.0007, "6": 0.0007}, "4": {"4": 0.7609, "3": 0.1776, "5": 0.043, "2": 0.0178, "6": 0.0007}, "5": {"4": 0.8014, "5": 0.143, "3": 0.0504, "2": 0.003, "6": 0.0023}, "6": {"5": 0.2314, "4": 0.7293, "6": 0.007, "3": 0.0297, "2": 0.0026}}}, {"model": "utter-project/EuroLLM-22B-Instruct-2512", "avg_exact": 0.298877, "avg_wp": 0.513694, "avg_bias": 0.574, "avg_parsed": 0.999874, "avg_mae": 1.1195, "total": 7923, "lang_exact": {"ab": 0.361751, "cn": 0.33945, "cy": 0.254582, "de": 0.326353, "en": 0.253363, "es": 0.368324, "eu": 0.22372, "fr": 0.389892, "gl": 0.294004, "he": 0.257778, "hi": 0.302026, "ir": 0.351248, "kz": 0.293878, "la": 0.194444, "pl": 0.216696, "sl": 0.272553, "uk": 0.266234}, "lang_wp": {"ab": 0.570276, "cn": 0.569725, "cy": 0.482688, "de": 0.546248, "en": 0.439462, "es": 0.581952, "eu": 0.443396, "fr": 0.591155, "gl": 0.525145, "he": 0.444444, "hi": 0.524862, "ir": 0.56238, "kz": 0.504082, "la": 0.524306, "pl": 0.409414, "sl": 0.482726, "uk": 0.492424}, "lang_parsed": {"ab": 1.0, "cn": 0.998165, "cy": 1.0, "de": 1.0, "en": 1.0, "es": 1.0, "eu": 1.0, "fr": 1.0, "gl": 1.0, "he": 1.0, "hi": 1.0, "ir": 1.0, "kz": 1.0, "la": 1.0, "pl": 1.0, "sl": 1.0, "uk": 1.0}, "lang_mae": {"ab": 0.9401, "cn": 0.9154, 
"cy": 1.1527, "de": 1.0489, "en": 1.3161, "es": 0.9134, "eu": 1.3477, "fr": 0.9206, "gl": 1.0387, "he": 1.3711, "hi": 1.0829, "ir": 0.9712, "kz": 1.1796, "la": 1.2083, "pl": 1.4938, "sl": 1.2054, "uk": 1.132}, "confusion": {"2": {"1": 0.068, "5": 0.455, "2": 0.3491, "4": 0.0034, "3": 0.1203, "6": 0.0041}, "1": {"1": 0.3846, "2": 0.4139, "5": 0.1465, "3": 0.0513, "6": 0.0037}, "3": {"5": 0.7158, "2": 0.1568, "1": 0.018, "3": 0.1065, "6": 0.0022, "4": 0.0007}, "4": {"5": 0.8645, "2": 0.0735, "3": 0.049, "4": 0.0027, "6": 0.0027, "1": 0.0075}, "5": {"5": 0.936, "2": 0.0387, "3": 0.0097, "6": 0.0082, "1": 0.0045, "4": 0.003}, "6": {"5": 0.9403, "1": 0.0094, "6": 0.0256, "2": 0.0179, "4": 0.0017, "3": 0.0051}}}, {"model": "CYFRAGOVPL/Llama-PLLuM-70B-chat-250801", "avg_exact": 0.275562, "avg_wp": 0.474108, "avg_bias": -0.6122, "avg_parsed": 0.996364, "avg_mae": 1.2257, "total": 9076, "lang_exact": {"ab": 0.297479, "cn": 0.248333, "cy": 0.276786, "de": 0.363333, "en": 0.243762, "es": 0.295, "eu": 0.166667, "fr": 0.318333, "gl": 0.201923, "he": 0.243655, "hi": 0.281667, "ir": 0.262887, "kz": 0.303681, "la": 0.216931, "pl": 0.316667, "sl": 0.331667, "uk": 0.245283}, "lang_wp": {"ab": 0.485714, "cn": 0.39, "cy": 0.504464, "de": 0.561667, "en": 0.487524, "es": 0.491667, "eu": 0.376623, "fr": 0.511667, "gl": 0.414423, "he": 0.423012, "hi": 0.471667, "ir": 0.444158, "kz": 0.546012, "la": 0.304233, "pl": 0.520833, "sl": 0.5425, "uk": 0.471698}, "lang_parsed": {"ab": 0.986555, "cn": 1.0, "cy": 0.998214, "de": 1.0, "en": 0.992322, "es": 0.996667, "eu": 0.997835, "fr": 0.998333, "gl": 1.0, "he": 0.99154, "hi": 1.0, "ir": 0.994845, "kz": 0.996933, "la": 1.0, "pl": 0.993333, "sl": 0.996667, "uk": 0.998113}, "lang_mae": {"ab": 1.1806, "cn": 1.5733, "cy": 1.102, "de": 1.0117, "en": 1.1277, "es": 1.1488, "eu": 1.4967, "fr": 1.1002, "gl": 1.3058, "he": 1.3379, "hi": 1.2433, "ir": 1.3368, "kz": 1.0092, "la": 2.3439, "pl": 1.0419, "sl": 0.9916, "uk": 1.2042}, "confusion": {"1": {"3": 
0.1541, "1": 0.786, "4": 0.0317, "5": 0.0154, "6": 0.0051, "2": 0.0077}, "2": {"3": 0.3323, "4": 0.1092, "1": 0.4874, "5": 0.0595, "2": 0.0071, "6": 0.0045}, "3": {"1": 0.3083, "3": 0.3807, "5": 0.1244, "4": 0.1765, "2": 0.0054, "6": 0.0047}, "4": {"4": 0.2148, "3": 0.3557, "5": 0.216, "1": 0.2037, "6": 0.0055, "2": 0.0043}, "5": {"5": 0.3859, "4": 0.2643, "3": 0.2506, "1": 0.0842, "2": 0.0044, "6": 0.0106}, "6": {"4": 0.2679, "3": 0.1772, "5": 0.4247, "1": 0.1019, "6": 0.0253, "2": 0.0031}}}, {"model": "mistralai/Mistral-Nemo-Instruct-2407", "avg_exact": 0.282611, "avg_wp": 0.468668, "avg_bias": -0.2413, "avg_parsed": 0.963425, "avg_mae": 1.2449, "total": 8011, "lang_exact": {"ab": 0.29148, "cn": 0.3, "cy": 0.181087, "de": 0.279232, "en": 0.323851, "es": 0.310662, "eu": 0.263708, "fr": 0.351254, "gl": 0.257253, "he": 0.289062, "hi": 0.265683, "ir": 0.243083, "kz": 0.296154, "la": 0.180556, "pl": 0.354317, "sl": 0.28215, "uk": 0.248352}, "lang_wp": {"ab": 0.483184, "cn": 0.492593, "cy": 0.343058, "de": 0.507853, "en": 0.507659, "es": 0.511949, "eu": 0.425587, "fr": 0.562724, "gl": 0.434236, "he": 0.487305, "hi": 0.417897, "ir": 0.380435, "kz": 0.457692, "la": 0.322917, "pl": 0.546763, "sl": 0.478887, "uk": 0.465934}, "lang_parsed": {"ab": 0.950673, "cn": 0.983333, "cy": 0.987928, "de": 0.95637, "en": 0.964989, "es": 0.977941, "eu": 0.942559, "fr": 0.958781, "gl": 0.912959, "he": 0.988281, "hi": 0.968635, "ir": 0.966403, "kz": 0.915385, "la": 0.9375, "pl": 0.992806, "sl": 0.955854, "uk": 0.967033}, "lang_mae": {"ab": 1.1085, "cn": 1.2298, "cy": 1.8045, "de": 1.042, "en": 1.0, "es": 1.1015, "eu": 1.3989, "fr": 0.8991, "gl": 1.2712, "he": 1.1739, "hi": 1.499, "ir": 1.5808, "kz": 1.1723, "la": 2.2148, "pl": 1.0145, "sl": 1.2129, "uk": 1.1795}, "confusion": {"1": {"1": 0.4749, "2": 0.3597, "5": 0.0799, "4": 0.0288, "3": 0.0483, "6": 0.0084}, "2": {"4": 0.1264, "2": 0.4316, "5": 0.176, "1": 0.1669, "3": 0.0866, "6": 0.0126}, "3": {"4": 0.2181, "1": 0.0934, "2": 0.3071, 
"3": 0.1225, "5": 0.248, "6": 0.0109}, "4": {"4": 0.2169, "2": 0.2479, "5": 0.3336, "1": 0.076, "3": 0.1105, "6": 0.0152}, "5": {"4": 0.1957, "2": 0.1995, "5": 0.4612, "1": 0.0491, "6": 0.0269, "3": 0.0675}, "6": {"5": 0.5312, "4": 0.1379, "2": 0.1544, "6": 0.0478, "1": 0.08, "3": 0.0487}}}, {"model": "Qwen/Qwen2.5-7B-Instruct", "avg_exact": 0.285497, "avg_wp": 0.461773, "avg_bias": 0.9564, "avg_parsed": 1.0, "avg_mae": 1.212, "total": 7874, "lang_exact": {"ab": 0.283447, "cn": 0.305755, "cy": 0.219067, "de": 0.341071, "en": 0.296053, "es": 0.362963, "eu": 0.267568, "fr": 0.370036, "gl": 0.304348, "he": 0.272201, "hi": 0.258383, "ir": 0.213147, "kz": 0.278481, "la": 0.35461, "pl": 0.286232, "sl": 0.190196, "uk": 0.266821}, "lang_wp": {"ab": 0.451247, "cn": 0.48741, "cy": 0.408722, "de": 0.525893, "en": 0.501096, "es": 0.531481, "eu": 0.422973, "fr": 0.546029, "gl": 0.503953, "he": 0.44305, "hi": 0.431953, "ir": 0.376494, "kz": 0.440928, "la": 0.574468, "pl": 0.429348, "sl": 0.347059, "uk": 0.472158}, "lang_parsed": {"ab": 1.0, "cn": 1.0, "cy": 1.0, "de": 1.0, "en": 1.0, "es": 1.0, "eu": 1.0, "fr": 1.0, "gl": 1.0, "he": 1.0, "hi": 1.0, "ir": 1.0, "kz": 1.0, "la": 1.0, "pl": 1.0, "sl": 1.0, "uk": 1.0}, "lang_mae": {"ab": 1.3107, "cn": 1.1259, "cy": 1.3266, "de": 1.0429, "en": 1.0658, "es": 1.0093, "eu": 1.3081, "fr": 0.9585, "gl": 1.0514, "he": 1.2838, "hi": 1.284, "ir": 1.4422, "kz": 1.1772, "la": 1.0, "pl": 1.4094, "sl": 1.5529, "uk": 1.1439}, "confusion": {"1": {"1": 0.3062, "3": 0.2678, "4": 0.3291, "6": 0.0283, "2": 0.0375, "5": 0.0311}, "2": {"2": 0.0186, "3": 0.1403, "4": 0.6272, "5": 0.1039, "1": 0.0413, "6": 0.0688}, "3": {"4": 0.611, "6": 0.1318, "2": 0.0029, "5": 0.2089, "3": 0.0389, "1": 0.0065}, "4": {"6": 0.2281, "4": 0.4597, "5": 0.2937, "3": 0.0143, "1": 0.0041}, "5": {"6": 0.4387, "4": 0.2092, "5": 0.3506, "3": 0.0015}, "6": {"6": 0.6052, "4": 0.1231, "5": 0.2681, "3": 0.0026, "1": 0.0009}}}, {"model": "allenai/Olmo-3-7B-Instruct", "avg_exact": 
0.228195, "avg_wp": 0.409937, "avg_bias": 0.5317, "avg_parsed": 0.999612, "avg_mae": 1.4272, "total": 7739, "lang_exact": {"ab": 0.176471, "cn": 0.214153, "cy": 0.204453, "de": 0.23227, "en": 0.292473, "es": 0.287823, "eu": 0.221024, "fr": 0.23964, "gl": 0.264706, "he": 0.178022, "hi": 0.252485, "ir": 0.181275, "kz": 0.213333, "la": 0.211268, "pl": 0.257713, "sl": 0.163743, "uk": 0.250597}, "lang_wp": {"ab": 0.391304, "cn": 0.368715, "cy": 0.385628, "de": 0.437943, "en": 0.489247, "es": 0.511993, "eu": 0.392183, "fr": 0.423423, "gl": 0.431373, "he": 0.36044, "hi": 0.437376, "ir": 0.35757, "kz": 0.397778, "la": 0.31338, "pl": 0.434664, "sl": 0.309942, "uk": 0.435561}, "lang_parsed": {"ab": 1.0, "cn": 1.0, "cy": 1.0, "de": 0.998227, "en": 1.0, "es": 1.0, "eu": 1.0, "fr": 1.0, "gl": 1.0, "he": 1.0, "hi": 0.998012, "ir": 1.0, "kz": 1.0, "la": 1.0, "pl": 0.998185, "sl": 1.0, "uk": 1.0}, "lang_mae": {"ab": 1.3632, "cn": 1.5736, "cy": 1.4555, "de": 1.3091, "en": 1.1634, "es": 1.0996, "eu": 1.5553, "fr": 1.3892, "gl": 1.4078, "he": 1.5231, "hi": 1.3625, "ir": 1.498, "kz": 1.3911, "la": 1.7042, "pl": 1.4691, "sl": 1.8226, "uk": 1.3771}, "confusion": {"1": {"3": 0.2114, "4": 0.157, "2": 0.4035, "1": 0.0905, "5": 0.0766, "6": 0.0609}, "2": {"1": 0.027, "3": 0.1728, "4": 0.2122, "5": 0.1638, "6": 0.114, "2": 0.3103}, "3": {"3": 0.1151, "2": 0.2367, "4": 0.2251, "6": 0.1653, "5": 0.2535, "1": 0.0044}, "4": {"4": 0.1985, "3": 0.0985, "6": 0.2415, "2": 0.186, "5": 0.2741, "1": 0.0014}, "5": {"4": 0.1877, "6": 0.3879, "3": 0.0724, "5": 0.2033, "2": 0.148, "1": 0.0008}, "6": {"6": 0.4639, "5": 0.1597, "4": 0.1634, "3": 0.0731, "2": 0.139, "1": 0.0009}}}, {"model": "mistralai/Mistral-7B-Instruct-v0.3", "avg_exact": 0.229035, "avg_wp": 0.398141, "avg_bias": 0.9606, "avg_parsed": 0.994066, "avg_mae": 1.5188, "total": 7584, "lang_exact": {"ab": 0.224599, "cn": 0.208566, "cy": 0.220619, "de": 0.253142, "en": 0.16895, "es": 0.36194, "eu": 0.208914, "fr": 0.261993, "gl": 0.208791, "he": 
0.199557, "hi": 0.240481, "ir": 0.197154, "kz": 0.228814, "la": 0.154412, "pl": 0.259191, "sl": 0.182711, "uk": 0.223502}, "lang_wp": {"ab": 0.415775, "cn": 0.388268, "cy": 0.371134, "de": 0.435368, "en": 0.326484, "es": 0.539179, "eu": 0.410864, "fr": 0.445572, "gl": 0.369231, "he": 0.379157, "hi": 0.403808, "ir": 0.327236, "kz": 0.427966, "la": 0.386029, "pl": 0.409007, "sl": 0.319253, "uk": 0.396313}, "lang_parsed": {"ab": 0.994652, "cn": 0.998138, "cy": 0.995876, "de": 0.994614, "en": 0.993151, "es": 0.994403, "eu": 0.997214, "fr": 0.99262, "gl": 0.997802, "he": 0.997783, "hi": 0.98998, "ir": 0.989837, "kz": 0.970339, "la": 0.977941, "pl": 1.0, "sl": 0.996071, "uk": 0.995392}, "lang_mae": {"ab": 1.371, "cn": 1.4981, "cy": 1.6418, "de": 1.37, "en": 1.7586, "es": 1.1032, "eu": 1.486, "fr": 1.3309, "gl": 1.5991, "he": 1.4422, "hi": 1.5668, "ir": 1.8029, "kz": 1.31, "la": 1.6692, "pl": 1.5625, "sl": 1.8679, "uk": 1.4769}, "confusion": {"2": {"4": 0.0677, "5": 0.3656, "1": 0.1214, "3": 0.2357, "2": 0.0247, "6": 0.1849}, "1": {"3": 0.2608, "1": 0.3499, "2": 0.0582, "5": 0.1839, "6": 0.1126, "4": 0.0347}, "4": {"3": 0.1364, "5": 0.4961, "4": 0.0473, "6": 0.2608, "1": 0.0382, "2": 0.0212}, "5": {"5": 0.5188, "2": 0.016, "3": 0.0721, "6": 0.3443, "4": 0.0328, "1": 0.016}, "3": {"5": 0.4767, "4": 0.0665, "3": 0.1663, "2": 0.0237, "6": 0.2106, "1": 0.0562}, "6": {"5": 0.4899, "4": 0.0308, "3": 0.0645, "6": 0.3744, "2": 0.0115, "1": 0.0289}}}, {"model": "cjvt/GaMS3-12B-Instruct", "avg_exact": 0.223201, "avg_wp": 0.38902, "avg_bias": 0.7368, "avg_parsed": 0.999876, "avg_mae": 1.4632, "total": 8060, "lang_exact": {"ab": 0.301339, "cn": 0.212613, "cy": 0.157258, "de": 0.217014, "en": 0.235931, "es": 0.210238, "eu": 0.18254, "fr": 0.183124, "gl": 0.223938, "he": 0.25, "hi": 0.268116, "ir": 0.200393, "kz": 0.155894, "la": 0.536913, "pl": 0.2375, "sl": 0.193858, "uk": 0.21692}, "lang_wp": {"ab": 0.459821, "cn": 0.364865, "cy": 0.336694, "de": 0.378472, "en": 0.452381, "es": 
0.390311, "eu": 0.371693, "fr": 0.35368, "gl": 0.399614, "he": 0.436024, "hi": 0.424819, "ir": 0.311395, "kz": 0.368821, "la": 0.583893, "pl": 0.372321, "sl": 0.350288, "uk": 0.402386}, "lang_parsed": {"ab": 1.0, "cn": 1.0, "cy": 1.0, "de": 1.0, "en": 1.0, "es": 1.0, "eu": 1.0, "fr": 1.0, "gl": 1.0, "he": 1.0, "hi": 1.0, "ir": 1.0, "kz": 1.0, "la": 1.0, "pl": 1.0, "sl": 1.0, "uk": 0.997831}, "lang_mae": {"ab": 1.2612, "cn": 1.5964, "cy": 1.5323, "de": 1.5556, "en": 1.1645, "es": 1.3346, "eu": 1.4603, "fr": 1.4237, "gl": 1.444, "he": 1.2559, "hi": 1.3967, "ir": 1.8468, "kz": 1.4335, "la": 1.255, "pl": 1.7446, "sl": 1.5566, "uk": 1.3543}, "confusion": {"2": {"4": 0.3051, "1": 0.0567, "5": 0.2137, "3": 0.2594, "6": 0.1563, "2": 0.0089}, "3": {"4": 0.269, "5": 0.3915, "6": 0.1516, "3": 0.1715, "1": 0.0114, "2": 0.005}, "1": {"1": 0.3159, "6": 0.1182, "4": 0.1336, "3": 0.3655, "5": 0.0379, "2": 0.0289}, "4": {"4": 0.2047, "6": 0.1785, "3": 0.1604, "5": 0.4396, "1": 0.0114, "2": 0.0054}, "5": {"5": 0.3671, "4": 0.2143, "6": 0.2339, "3": 0.1752, "1": 0.0072, "2": 0.0022}, "6": {"6": 0.3165, "5": 0.2314, "4": 0.243, "3": 0.1901, "1": 0.0165, "2": 0.0025}}}, {"model": "swiss-ai/Apertus-8B-Instruct-2509", "avg_exact": 0.226841, "avg_wp": 0.368914, "avg_bias": 1.3785, "avg_parsed": 0.989763, "avg_mae": 1.7385, "total": 8010, "lang_exact": {"ab": 0.238202, "cn": 0.22037, "cy": 0.205231, "de": 0.247818, "en": 0.207877, "es": 0.277574, "eu": 0.224543, "fr": 0.340502, "gl": 0.224371, "he": 0.132812, "hi": 0.208487, "ir": 0.20751, "kz": 0.165385, "la": 0.506944, "pl": 0.223022, "sl": 0.197697, "uk": 0.178022}, "lang_wp": {"ab": 0.347191, "cn": 0.387963, "cy": 0.325956, "de": 0.410122, "en": 0.364333, "es": 0.418199, "eu": 0.390339, "fr": 0.537634, "gl": 0.341393, "he": 0.24707, "hi": 0.353321, "ir": 0.368577, "kz": 0.35, "la": 0.548611, "pl": 0.317446, "sl": 0.330134, "uk": 0.331868}, "lang_parsed": {"ab": 0.966292, "cn": 0.998148, "cy": 0.993964, "de": 1.0, "en": 1.0, "es": 
0.996324, "eu": 0.986945, "fr": 0.996416, "gl": 0.996132, "he": 0.986328, "hi": 0.990775, "ir": 0.998024, "kz": 0.903846, "la": 0.972222, "pl": 0.998201, "sl": 0.996161, "uk": 0.984615}, "lang_mae": {"ab": 1.8721, "cn": 1.5158, "cy": 1.913, "de": 1.5445, "en": 1.6893, "es": 1.4668, "eu": 1.6323, "fr": 1.0917, "gl": 1.8699, "he": 2.2693, "hi": 1.8994, "ir": 1.7446, "kz": 1.4723, "la": 1.3929, "pl": 2.1369, "sl": 1.8825, "uk": 1.8438}, "confusion": {"2": {"6": 0.5169, "3": 0.1884, "1": 0.0856, "2": 0.1304, "5": 0.0649, "4": 0.0138}, "1": {"6": 0.3163, "3": 0.2203, "1": 0.2477, "4": 0.0165, "5": 0.0238, "2": 0.1755}, "3": {"6": 0.6273, "3": 0.1288, "1": 0.0317, "2": 0.0791, "4": 0.0187, "5": 0.1144}, "4": {"6": 0.6785, "5": 0.1539, "3": 0.0828, "1": 0.0321, "4": 0.0178, "2": 0.0349}, "5": {"5": 0.1532, "6": 0.7654, "3": 0.0244, "1": 0.0222, "2": 0.0185, "4": 0.0163}, "6": {"6": 0.7995, "5": 0.1261, "2": 0.0068, "4": 0.0161, "3": 0.0279, "1": 0.0237}}}, {"model": "speakleash/Bielik-1.5B-v3.0-Instruct", "avg_exact": 0.206902, "avg_wp": 0.363133, "avg_bias": -0.3521, "avg_parsed": 0.999854, "avg_mae": 1.7134, "total": 6868, "lang_exact": {"ab": 0.178694, "cn": 0.230932, "cy": 0.164927, "de": 0.225746, "en": 0.265306, "es": 0.238921, "eu": 0.143258, "fr": 0.210832, "gl": 0.229167, "he": 0.174263, "hi": 0.178082, "ir": 0.164241, "kz": 0.114094, "la": 0.126866, "pl": 0.268078, "sl": 0.268537, "uk": 0.143791}, "lang_wp": {"ab": 0.302405, "cn": 0.377119, "cy": 0.317328, "de": 0.403918, "en": 0.452806, "es": 0.416185, "eu": 0.296348, "fr": 0.404255, "gl": 0.436343, "he": 0.323056, "hi": 0.286301, "ir": 0.277547, "kz": 0.288591, "la": 0.164179, "pl": 0.417989, "sl": 0.428858, "uk": 0.289216}, "lang_parsed": {"ab": 1.0, "cn": 1.0, "cy": 1.0, "de": 1.0, "en": 0.997449, "es": 1.0, "eu": 1.0, "fr": 1.0, "gl": 1.0, "he": 1.0, "hi": 1.0, "ir": 1.0, "kz": 1.0, "la": 1.0, "pl": 1.0, "sl": 1.0, "uk": 1.0}, "lang_mae": {"ab": 2.1512, "cn": 1.6589, "cy": 1.9019, "de": 1.5392, "en": 
1.2506, "es": 1.4644, "eu": 2.014, "fr": 1.4255, "gl": 1.1968, "he": 1.8365, "hi": 2.2658, "ir": 2.1497, "kz": 2.0738, "la": 2.6493, "pl": 1.5185, "sl": 1.4469, "uk": 1.9869}, "confusion": {"1": {"1": 0.5015, "5": 0.2251, "6": 0.0336, "2": 0.0809, "4": 0.1333, "3": 0.0257}, "2": {"1": 0.4241, "5": 0.1975, "4": 0.2221, "2": 0.1047, "6": 0.0269, "3": 0.0247}, "3": {"1": 0.3597, "2": 0.0854, "5": 0.2182, "4": 0.2877, "3": 0.0253, "6": 0.0237}, "4": {"1": 0.3423, "5": 0.2571, "6": 0.0384, "4": 0.2678, "2": 0.0806, "3": 0.0138}, "5": {"1": 0.2749, "5": 0.3123, "6": 0.0447, "4": 0.2603, "2": 0.0868, "3": 0.021}, "6": {"1": 0.2752, "5": 0.3571, "2": 0.0539, "6": 0.0585, "4": 0.2307, "3": 0.0246}}}, {"model": "speakleash/Bielik-4.5B-v3.0-Instruct", "avg_exact": 0.188847, "avg_wp": 0.338818, "avg_bias": 1.1003, "avg_parsed": 1.0, "avg_mae": 1.7567, "total": 6868, "lang_exact": {"ab": 0.154639, "cn": 0.152542, "cy": 0.17119, "de": 0.169776, "en": 0.191327, "es": 0.22158, "eu": 0.129213, "fr": 0.193424, "gl": 0.196759, "he": 0.201072, "hi": 0.191781, "ir": 0.2079, "kz": 0.255034, "la": 0.462687, "pl": 0.17284, "sl": 0.162325, "uk": 0.202614}, "lang_wp": {"ab": 0.286942, "cn": 0.313559, "cy": 0.331942, "de": 0.29291, "en": 0.359694, "es": 0.32948, "eu": 0.29073, "fr": 0.330754, "gl": 0.359954, "he": 0.348525, "hi": 0.349315, "ir": 0.373181, "kz": 0.489933, "la": 0.55597, "pl": 0.324515, "sl": 0.313627, "uk": 0.367647}, "lang_parsed": {"ab": 1.0, "cn": 1.0, "cy": 1.0, "de": 1.0, "en": 1.0, "es": 1.0, "eu": 1.0, "fr": 1.0, "gl": 1.0, "he": 1.0, "hi": 1.0, "ir": 1.0, "kz": 1.0, "la": 1.0, "pl": 1.0, "sl": 1.0, "uk": 1.0}, "lang_mae": {"ab": 1.8007, "cn": 1.786, "cy": 1.739, "de": 1.9888, "en": 1.5357, "es": 1.9094, "eu": 2.0674, "fr": 1.7988, "gl": 1.5995, "he": 1.6756, "hi": 1.7342, "ir": 1.5738, "kz": 1.1812, "la": 1.2015, "pl": 1.8554, "sl": 1.8858, "uk": 1.6471}, "confusion": {"2": {"4": 0.1429, "5": 0.6133, "1": 0.101, "3": 0.0868, "6": 0.0561}, "3": {"5": 0.6411, "4": 
0.1186, "1": 0.0822, "6": 0.0909, "3": 0.0672}, "1": {"1": 0.1598, "4": 0.1696, "5": 0.5276, "3": 0.1016, "6": 0.0414}, "4": {"4": 0.0875, "5": 0.6454, "1": 0.086, "6": 0.1266, "3": 0.0537, "2": 0.0008}, "5": {"5": 0.6694, "4": 0.0502, "6": 0.1763, "3": 0.0247, "1": 0.0795}, "6": {"5": 0.5855, "4": 0.0691, "3": 0.0187, "6": 0.2377, "1": 0.089}}}, {"model": "utter-project/EuroLLM-9B-Instruct-2512", "avg_exact": 0.205604, "avg_wp": 0.316168, "avg_bias": 1.7841, "avg_parsed": 0.999874, "avg_mae": 2.0126, "total": 7923, "lang_exact": {"ab": 0.239631, "cn": 0.212844, "cy": 0.219959, "de": 0.157068, "en": 0.136771, "es": 0.235727, "eu": 0.188679, "fr": 0.194946, "gl": 0.197292, "he": 0.171111, "hi": 0.233886, "ir": 0.253359, "kz": 0.102041, "la": 0.534722, "pl": 0.213144, "sl": 0.195777, "uk": 0.177489}, "lang_wp": {"ab": 0.35023, "cn": 0.317431, "cy": 0.385947, "de": 0.254799, "en": 0.25, "es": 0.346225, "eu": 0.295148, "fr": 0.311372, "gl": 0.296905, "he": 0.278889, "hi": 0.3407, "ir": 0.37524, "kz": 0.265306, "la": 0.604167, "pl": 0.29929, "sl": 0.287908, "uk": 0.287879}, "lang_parsed": {"ab": 1.0, "cn": 1.0, "cy": 0.997963, "de": 1.0, "en": 1.0, "es": 1.0, "eu": 1.0, "fr": 1.0, "gl": 1.0, "he": 1.0, "hi": 1.0, "ir": 1.0, "kz": 1.0, "la": 1.0, "pl": 1.0, "sl": 1.0, "uk": 1.0}, "lang_mae": {"ab": 1.9885, "cn": 2.0661, "cy": 1.5184, "de": 2.3874, "en": 2.0987, "es": 1.8858, "eu": 2.0377, "fr": 2.0018, "gl": 2.0406, "he": 2.1822, "hi": 2.0166, "ir": 1.7332, "kz": 1.8612, "la": 1.1319, "pl": 2.2274, "sl": 2.2188, "uk": 2.0649}, "confusion": {"1": {"6": 0.4478, "1": 0.294, "3": 0.0907, "2": 0.0833, "5": 0.0815, "4": 0.0027}, "2": {"1": 0.068, "3": 0.1003, "6": 0.7127, "2": 0.0536, "5": 0.0605, "4": 0.0048}, "3": {"6": 0.8115, "1": 0.0237, "2": 0.0245, "3": 0.0827, "5": 0.0532, "4": 0.0043}, "4": {"6": 0.8735, "2": 0.0102, "1": 0.0177, "3": 0.0633, "5": 0.0327, "4": 0.0027}, "5": {"6": 0.9033, "3": 0.0536, "5": 0.0186, "1": 0.0186, "2": 0.006}, "6": {"6": 0.9274, "1": 
0.0111, "3": 0.0384, "5": 0.0179, "2": 0.0017, "4": 0.0034}}}, {"model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503", "avg_exact": 0.094012, "avg_wp": 0.168646, "avg_bias": 0.3332, "avg_parsed": 0.36242, "avg_mae": 1.2653, "total": 7999, "lang_exact": {"ab": 0.105618, "cn": 0.092593, "cy": 0.104628, "de": 0.099476, "en": 0.098468, "es": 0.09375, "eu": 0.078329, "fr": 0.129032, "gl": 0.090909, "he": 0.080078, "hi": 0.111111, "ir": 0.088933, "kz": 0.111538, "la": 0.034722, "pl": 0.06295, "sl": 0.101727, "uk": 0.074725}, "lang_wp": {"ab": 0.186517, "cn": 0.167593, "cy": 0.165996, "de": 0.173647, "en": 0.173961, "es": 0.176471, "eu": 0.159269, "fr": 0.206093, "gl": 0.166344, "he": 0.158203, "hi": 0.19774, "ir": 0.153162, "kz": 0.201923, "la": 0.059028, "pl": 0.130396, "sl": 0.177543, "uk": 0.146154}, "lang_parsed": {"ab": 0.4, "cn": 0.366667, "cy": 0.331992, "de": 0.354276, "en": 0.371991, "es": 0.380515, "eu": 0.318538, "fr": 0.399642, "gl": 0.353965, "he": 0.378906, "hi": 0.416196, "ir": 0.326087, "kz": 0.415385, "la": 0.201389, "pl": 0.31295, "sl": 0.414587, "uk": 0.314286}, "lang_mae": {"ab": 1.3034, "cn": 1.2778, "cy": 1.2, "de": 1.1527, "en": 1.2647, "es": 1.2367, "eu": 1.1803, "fr": 1.148, "gl": 1.2623, "he": 1.3557, "hi": 1.2579, "ir": 1.2485, "kz": 1.2315, "la": 2.0345, "pl": 1.3563, "sl": 1.4074, "uk": 1.1888}, "confusion": {"1": {"5": 0.1048, "1": 0.3238, "3": 0.1683, "4": 0.2127, "6": 0.0381, "2": 0.1524}, "2": {"5": 0.3046, "1": 0.1996, "3": 0.0966, "6": 0.0231, "2": 0.0987, "4": 0.2773}, "3": {"2": 0.0522, "3": 0.0703, "5": 0.3775, "4": 0.3735, "1": 0.0904, "6": 0.0361}, "4": {"5": 0.5109, "4": 0.2945, "1": 0.0782, "2": 0.0382, "6": 0.0455, "3": 0.0327}, "5": {"5": 0.6578, "4": 0.1765, "6": 0.0606, "1": 0.0588, "3": 0.0285, "2": 0.0178}, "6": {"5": 0.7234, "4": 0.1263, "3": 0.006, "1": 0.0441, "2": 0.0261, "6": 0.0741}}}];
// Language codes available in the second dataset (17 entries), ordered by
// English display name (Arabic, Basque, …, Welsh) like ALL_LANGS.
const ALL_LANGS_2 = ["ab", "eu", "cn", "en", "fr", "gl", "de", "he", "hi", "ir", "kz", "la", "pl", "sl", "es", "uk", "cy"];
// Sample count per language for the second dataset; each value is the sum of
// that language's LANG_DIST_2 buckets (e.g. "kz": 6 + 53 + 37 + 100 + 100 + 32 = 328).
const LANG_COUNTS_2 = {"ab": 600, "cn": 600, "cy": 560, "de": 600, "en": 521, "es": 600, "eu": 462, "fr": 600, "gl": 520, "he": 600, "hi": 600, "ir": 582, "kz": 328, "la": 189, "pl": 600, "sl": 600, "uk": 530};
// Per-language sample counts for the second dataset, keyed by bucket "1".."6"
// (presumably the rating classes — confirm). Every language lists all six
// buckets here, and no bucket exceeds 100.
const LANG_DIST_2 = {"ab": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "cn": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "cy": {"1": 60, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "de": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "en": {"1": 21, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "es": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "eu": {"1": 10, "2": 100, "3": 52, "4": 100, "5": 100, "6": 100}, "fr": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "gl": {"1": 20, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "he": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "hi": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "ir": {"1": 82, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "kz": {"1": 6, "2": 53, "3": 37, "4": 100, "5": 100, "6": 32}, "la": {"1": 34, "2": 3, "3": 3, "4": 35, "5": 14, "6": 100}, "pl": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "sl": {"1": 100, "2": 100, "3": 100, "4": 100, "5": 100, "6": 100}, "uk": {"1": 40, "2": 100, "3": 90, "4": 100, "5": 100, "6": 100}};
// Prediction total reported for the second dataset; equals the sum of the
// `total` fields across the 15 rows of ALL_ROWS_2.
const TOTAL_PREDS_2 = 117547;
// Display name for language code `c`; falls back to the upper-cased code
// when LANG_NAMES has no (truthy) entry for it.
function langName(c) {
  const known = LANG_NAMES[c];
  if (known) return known;
  return c.toUpperCase();
}
// Map a score `v` (intended range 0..1) to a CSS rgb() colour on a
// red → yellow → green ramp: green rises over the lower half of the range,
// red falls over the upper half, blue is fixed at 40.
function scoreColor(v) {
  let red;
  if (v < 0.5) {
    red = 220;
  } else {
    red = Math.round(220 - (v - 0.5) * 2 * 160);
  }
  let green;
  if (v > 0.5) {
    green = 160;
  } else {
    green = Math.round(v * 2 * 160);
  }
  return 'rgb(' + red + ',' + green + ',40)';
}
// Foreground colour for a score cell: white above 0.35, light grey otherwise.
function textColor(v) {
  if (v > 0.35) return '#fff';
  return '#ccc';
}
// Mutable UI state shared by the render functions.
let selLangs = [],    // language codes picked via the chips; empty means "All"
    sortCol = 'avg',  // 'avg' or the language code of the sorted column
    sortDir = -1,     // -1 = descending (↓), 1 = ascending (↑)
    metric = 'wp',    // 'wp' | 'ex' | 'parsed' | 'mae'
    activeDS = 1;     // 1 = FineWeb, 2 = FinePDF
// active dataset accessors
// Per-model result rows of the active dataset.
function dsRows() {
  if (activeDS === 1) return ALL_ROWS;
  return ALL_ROWS_2;
}
// Language codes of the active dataset.
function dsLangs() {
  if (activeDS === 1) return ALL_LANGS;
  return ALL_LANGS_2;
}
// Per-language counts of the active dataset (shown as "n=…" in the table header).
function dsLangCounts() {
  if (activeDS === 1) return LANG_COUNTS;
  return LANG_COUNTS_2;
}
// Per-language rating distribution of the active dataset.
function dsLangDist() {
  if (activeDS === 1) return LANG_DIST;
  return LANG_DIST_2;
}
// Total prediction count of the active dataset.
function dsTotalPreds() {
  if (activeDS === 1) return 146522;
  return TOTAL_PREDS_2;
}
/**
 * Switch the page to dataset `ds` and redraw every section.
 *
 * @param {number} ds  1 (FineWeb) or 2 (FinePDF). Any other value is ignored,
 *                     as is 2 when no second dataset is loaded.
 *
 * Resets the language filter and the sort order, because language sets can
 * differ between datasets.
 */
window.setDataset = function(ds) {
  // Only 1 and 2 are real datasets. Anything else used to slip past the
  // HAS_SECOND guard below and make every accessor return dataset-2 data.
  if (ds !== 1 && ds !== 2) return;
  if (ds === 2 && !HAS_SECOND) return;

  activeDS = ds;
  document.getElementById('ds-btn-1').classList.toggle('active', ds === 1);
  document.getElementById('ds-btn-2').classList.toggle('active', ds === 2);

  selLangs = []; // reset language filter when switching dataset
  sortCol = 'avg';
  sortDir = -1;

  render();
  renderChart();
  renderDist();
  renderBias();
  renderCritical();

  // Rebuild the confusion dropdown for the new dataset: drop the old options,
  // then let the shared helper append one option per model.
  document.getElementById('confModelSelect').innerHTML = '';
  populateConfSelect();
  renderConfusion();
};
// Select the metric shown in the leaderboard table ('wp', 'ex', 'parsed' or
// 'mae'), highlight the matching toggle button and redraw the table.
window.setMetric = function(m) {
  metric = m;
  const buttonIdByMetric = {
    wp: 'btn-wp',
    ex: 'btn-ex',
    parsed: 'btn-parsed',
    mae: 'btn-mae',
  };
  for (const [key, id] of Object.entries(buttonIdByMetric)) {
    document.getElementById(id).classList.toggle('active', m === key);
  }
  render();
};
/**
 * Value of the currently selected metric for one model row.
 *
 * @param {object} row   A model row; reads `lang_<m>` / `avg_<m>` fields.
 * @param {?string} lang Language code, or a falsy value for the dataset-wide
 *                       average.
 * @returns {number|undefined} The metric value, or `undefined` when the row
 *          has no such field, the value is null, or the metric is unknown.
 */
function getScore(row, lang) {
  // Field-name suffix used in the row data for each metric key.
  let suffix;
  switch (metric) {
    case 'wp': suffix = 'wp'; break;
    case 'ex': suffix = 'exact'; break;
    case 'parsed': suffix = 'parsed'; break;
    case 'mae': suffix = 'mae'; break;
    default: return undefined;
  }
  // Every metric is looked up the same null-safe way: a row without the
  // per-language map yields undefined instead of throwing.
  const value = lang ? row['lang_' + suffix]?.[lang] : row['avg_' + suffix];
  // Normalise null to undefined so callers have a single "missing" value.
  return value ?? undefined;
}
// chips
const chipsEl = document.getElementById('chips');
// Rebuild the language filter chips: an "All" chip followed by one chip per
// language of the active dataset. Clicking a chip updates `selLangs` and
// redraws the table.
function renderChips() {
  chipsEl.innerHTML = '';

  const addChip = (label, key, active) => {
    const chip = document.createElement('span');
    chip.className = active ? 'chip active' : 'chip';
    chip.textContent = label;
    chip.onclick = () => {
      if (key === '__all__') {
        // "All" clears the selection.
        selLangs = [];
      } else {
        // Language chips toggle membership in the selection.
        const pos = selLangs.indexOf(key);
        if (pos >= 0) {
          selLangs.splice(pos, 1);
        } else {
          selLangs.push(key);
        }
      }
      render();
    };
    chipsEl.appendChild(chip);
  };

  addChip('All', '__all__', selLangs.length === 0);
  for (const code of dsLangs()) {
    addChip(langName(code), code, selLangs.includes(code));
  }
}
// global chart
// Chart.js instance of the global comparison chart, kept so it can be
// destroyed before the chart is redrawn.
let chartInstance = null;
// Draw the horizontal bar chart comparing all models of the active dataset on
// weighted score and exact accuracy (in percent), highest weighted score first.
function renderChart() {
  const monoFont = size => ({ family: 'JetBrains Mono', size });
  const asPercent = fraction => +(fraction * 100).toFixed(1);

  const ranked = [...dsRows()].sort((a, b) => b.avg_wp - a.avg_wp);
  const labels = ranked.map(r => r.model);
  const weighted = ranked.map(r => asPercent(r.avg_wp));
  const exact = ranked.map(r => asPercent(r.avg_exact));

  const canvas = document.getElementById('globalChart');
  const ctx = canvas.getContext('2d');
  if (chartInstance) chartInstance.destroy();
  // 32px per model plus 80px for axis and legend, never less than 320px.
  canvas.style.height = Math.max(320, ranked.length * 32 + 80) + 'px';

  const series = (label, data, backgroundColor) => ({
    label,
    data,
    backgroundColor,
    borderRadius: 3,
    barPercentage: 0.72,
  });

  chartInstance = new Chart(ctx, {
    type: 'bar',
    data: {
      labels,
      datasets: [
        series('Weighted Score', weighted, '#2563eb'),
        series('Exact Accuracy', exact, '#16a34a'),
      ],
    },
    options: {
      indexAxis: 'y',
      responsive: true,
      maintainAspectRatio: false,
      animation: { duration: 500 },
      plugins: {
        legend: {
          position: 'bottom',
          labels: { color: '#94a3b8', font: monoFont(11), boxWidth: 14, padding: 20 },
        },
        tooltip: {
          backgroundColor: '#1e2a3a',
          titleColor: '#e2e8f0',
          bodyColor: '#94a3b8',
          callbacks: { label: item => ` ${item.dataset.label}: ${item.parsed.x}%` },
        },
      },
      scales: {
        x: {
          min: 0,
          max: 108,
          grid: { color: '#1a2236' },
          ticks: { color: '#64748b', font: monoFont(10), callback: tick => tick + '%' },
          title: { display: true, text: 'Percent (%)', color: '#64748b', font: monoFont(11) },
        },
        y: {
          grid: { display: false },
          ticks: { color: '#cbd5e1', font: monoFont(11) },
        },
      },
    },
  });
}
// table
/**
 * Redraw the leaderboard: language chips, table header (with sort buttons),
 * table body and footer, for the active dataset and selected metric.
 *
 * With no language selected the "avg" column is the row's dataset-wide value;
 * otherwise it is the mean over the selected languages that have a value.
 * Rows with no average for the current metric are omitted.
 */
function render() {
  renderChips();

  const isMae = metric === 'mae';
  const byName = (a, b) => langName(a).localeCompare(langName(b));
  const visLangs = selLangs.length > 0 ? [...selLangs].sort(byName) : [...dsLangs()].sort(byName);
  const langCounts = dsLangCounts();

  const rows = dsRows().map(row => {
    const ls = {};
    visLangs.forEach(l => {
      const v = getScore(row, l);
      // Skip missing values (undefined or null) so they cannot drag the mean down.
      if (v !== undefined && v !== null) ls[l] = v;
    });
    const vals = Object.values(ls);
    const avg = selLangs.length === 0
      ? getScore(row, null)
      : (vals.length ? vals.reduce((a, b) => a + b, 0) / vals.length : null);
    return { ...row, _avg: avg, _ls: ls };
  // `!= null` also drops undefined averages (row lacks the metric), which
  // would otherwise render as "NaN%" or crash in toFixed for MAE.
  }).filter(r => r._avg != null);

  // NOTE(review): for MAE lower is better, yet the default descending sort
  // puts the highest error at rank 1 — confirm whether that is intended.
  rows.sort((a, b) => {
    const va = sortCol === 'avg' ? a._avg : (a._ls[sortCol] ?? -1);
    const vb = sortCol === 'avg' ? b._avg : (b._ls[sortCol] ?? -1);
    return sortDir * (va - vb);
  });

  // ── header ──
  const head = document.getElementById('lb-head');
  const mkBtn = (label, col) => {
    const active = sortCol === col;
    const arrow = active ? (sortDir === -1 ? ' ↓' : ' ↑') : '';
    return `<button class="sort-btn${active ? ' active' : ''}" data-col="${col}">${label}${arrow}</button>`;
  };
  const ml = metric === 'wp' ? 'Wtd Score' : metric === 'ex' ? 'Exact Acc' : metric === 'parsed' ? 'Parse Rate' : 'MAE';
  head.innerHTML = `<tr>
<th class="rank-col"></th>
<th class="model-col"><span style="font-size:10px;text-transform:uppercase;letter-spacing:.08em;color:#475569">Model</span></th>
<th class="avg-col">${mkBtn(ml + ' avg', 'avg')}</th>
${visLangs.map(l => `<th class="lang-col">${mkBtn(langName(l), l)}<br><span style="font-size:9px;color:#475569;font-weight:400;letter-spacing:0">${langCounts[l] !== undefined ? 'n=' + langCounts[l] : ''}</span></th>`).join('')}
</tr>`;
  head.querySelectorAll('.sort-btn').forEach(btn => {
    btn.onclick = () => {
      const col = btn.dataset.col;
      // Same column: flip direction. New column: start descending.
      sortCol === col ? (sortDir = -sortDir) : (sortCol = col, sortDir = -1);
      render();
    };
  });

  // ── body ──
  const fmtVal = v => isMae ? v.toFixed(2) : (v * 100).toFixed(1) + '%';
  // MAE: lower is better so invert for colouring (0=best=green, 5=worst=red).
  const goodness = v => isMae ? 1 - Math.min(v, 5) / 5 : v;

  const body = document.getElementById('lb-body');
  body.innerHTML = rows.map((row, i) => {
    const langCells = visLangs.map(l => {
      const v = row._ls[l];
      if (v === undefined || v === null) return `<td class="empty">—</td>`;
      return `<td class="score" style="background:${scoreColor(goodness(v))};color:${textColor(goodness(v))}">${fmtVal(v)}</td>`;
    }).join('');
    const avgBg = scoreColor(goodness(row._avg));
    const avgTxt = textColor(goodness(row._avg));
    return `<tr>
<td class="rank">${i + 1}</td>
<td class="model" title="${row.model}">${row.model}</td>
<td class="score" style="background:${avgBg};color:${avgTxt};font-size:13px;font-weight:800">${fmtVal(row._avg)}</td>
${langCells}
</tr>`;
  }).join('');

  // Use the active dataset's total rather than a fixed number, so the footer
  // stays correct after switching datasets.
  document.getElementById('footer').textContent =
    `${dsTotalPreds()} predictions · ${dsLangs().length} languages · ${dsRows().length} models`;
}
// ── dataset distribution ──
/**
 * Render the rating-distribution table of the active dataset: one row per
 * language, one column per rating 1–6 with a proportional bar, plus a total.
 * Languages whose total is 0 are omitted. When the active dataset has no
 * distribution data, a placeholder message is shown and the body is cleared.
 */
function renderDist() {
  const scores = [1, 2, 3, 4, 5, 6];
  const langs = [...dsLangs()];
  const dist = dsLangDist();
  const headEl = document.getElementById('dist-head');
  const bodyEl = document.getElementById('dist-body');

  // Check the ACTIVE dataset's distribution (not always dataset 1), and clear
  // the body so rows from a previously shown dataset do not linger.
  if (!dist || Object.keys(dist).length === 0) {
    headEl.innerHTML =
      '<tr><th class="lang-h" colspan="8" style="color:#475569;text-align:center;padding:20px">No source directory provided — run with --source-dir to enable this section.</th></tr>';
    bodyEl.innerHTML = '';
    return;
  }

  // Largest single cell count; bar widths are scaled against it.
  let maxCount = 0;
  langs.forEach(l => {
    const d = dist[l] || {};
    scores.forEach(s => { if ((d[s] || 0) > maxCount) maxCount = d[s] || 0; });
  });

  headEl.innerHTML = `<tr>
<th class="lang-h">Language</th>
${scores.map(s => `<th class="score-h">Rating ${s}</th>`).join('')}
<th class="total-h">Total</th>
</tr>`;

  bodyEl.innerHTML = langs.map(lang => {
    const d = dist[lang] || {};
    const total = Object.values(d).reduce((a, b) => a + b, 0);
    if (total === 0) return '';
    const cells = scores.map(s => {
      const n = d[s] || 0;
      // Bar width in px, at most 48 for the largest cell.
      const bar = maxCount > 0 ? Math.round((n / maxCount) * 48) : 0;
      return `<td class="count-d">${
        n > 0
          ? `${n}<span class="dist-bar" style="width:${bar}px"></span>`
          : '<span style="color:#2d3748">—</span>'
      }</td>`;
    }).join('');
    return `<tr>
<td class="lang-d">${langName(lang)}</td>
${cells}
<td class="total-d">${total.toLocaleString()}</td>
</tr>`;
  }).join('');
}
// ── bias lollipop ──
/**
 * Draw the bias chart: one horizontal bar per model of the active dataset
 * showing `avg_bias` (mean of pred − gt, per the axis title), sorted
 * ascending. Non-negative bars are green, negative bars red.
 */
function renderBias() {
  const sorted = [...dsRows()].sort((a, b) => a.avg_bias - b.avg_bias);
  const labels = sorted.map(r => r.model);
  const values = sorted.map(r => +(r.avg_bias).toFixed(3));
  const colors = values.map(v => v >= 0 ? '#16a34a' : '#dc2626');

  const canvas = document.getElementById('biasChart');
  // Chart.js refuses to create a second chart on a canvas that is still in
  // use, so destroy the previous one (e.g. when switching datasets).
  const previous = Chart.getChart(canvas);
  if (previous) previous.destroy();
  const ctx = canvas.getContext('2d');

  // 26px per model plus 40px for the axis, never less than 260px.
  const h = Math.max(260, sorted.length * 26 + 40);
  document.getElementById('biasChartContainer').style.height = h + 'px';

  new Chart(ctx, {
    type: 'bar',
    data: { labels, datasets: [{
      label: 'Mean Error', data: values, backgroundColor: colors,
      borderRadius: 3, barPercentage: 0.45,
    }] },
    options: {
      indexAxis: 'y', responsive: true, maintainAspectRatio: false,
      animation: { duration: 400 },
      plugins: {
        legend: { display: false },
        tooltip: { backgroundColor: '#1e2a3a', callbacks: {
          label: ctx => ` Bias: ${ctx.parsed.x > 0 ? '+' : ''}${ctx.parsed.x.toFixed(3)}`
        } }
      },
      scales: {
        x: { grid: { color: '#1a2236' },
          ticks: { color: '#64748b', font: { family: 'JetBrains Mono', size: 10 } },
          title: { display: true, text: 'Mean Error (pred − gt)', color: '#64748b', font: { family: 'JetBrains Mono', size: 10 } }
        },
        y: { grid: { display: false },
          ticks: { color: '#cbd5e1', font: { family: 'JetBrains Mono', size: 10 } }
        }
      }
    }
  });
}
// ── critical confusion ──
/**
 * Draw the "critical confusion" chart for the active dataset. For each model
 * (highest weighted score first) it shows two percentages derived from
 * `row.confusion`:
 *   - Low→High: true ratings 1–2 predicted as 5–6
 *   - High→Low: true ratings 5–6 predicted as 1–2
 * Each true-rating row is weighted by the sum of its own entries.
 */
function renderCritical() {
  const LOW = new Set([1, 2]);
  const HIGH = new Set([5, 6]);
  const sorted = [...dsRows()].sort((a, b) => b.avg_wp - a.avg_wp);
  const labels = sorted.map(r => r.model);
  const lh = [], hl = [];

  sorted.forEach(row => {
    let lhNumer = 0, lhDenom = 0, hlNumer = 0, hlDenom = 0;
    Object.entries(row.confusion || {}).forEach(([gtStr, preds]) => {
      const gt = parseInt(gtStr, 10);
      const rowTotal = Object.values(preds).reduce((a, b) => a + b, 0);
      if (LOW.has(gt)) {
        lhDenom += rowTotal;
        Object.entries(preds).forEach(([pStr, v]) => {
          if (HIGH.has(parseInt(pStr, 10))) lhNumer += v * rowTotal;
        });
      }
      if (HIGH.has(gt)) {
        hlDenom += rowTotal;
        Object.entries(preds).forEach(([pStr, v]) => {
          if (LOW.has(parseInt(pStr, 10))) hlNumer += v * rowTotal;
        });
      }
    });
    // A model without data for a class reports 0 rather than dividing by zero.
    lh.push(lhDenom > 0 ? +(lhNumer / lhDenom * 100).toFixed(1) : 0);
    hl.push(hlDenom > 0 ? +(hlNumer / hlDenom * 100).toFixed(1) : 0);
  });

  const canvas = document.getElementById('criticalChart');
  // Chart.js refuses to create a second chart on a canvas that is still in
  // use, so destroy the previous one (e.g. when switching datasets).
  const previous = Chart.getChart(canvas);
  if (previous) previous.destroy();
  const ctx = canvas.getContext('2d');

  // 26px per model plus 60px for axis and legend, never less than 260px.
  const h = Math.max(260, sorted.length * 26 + 60);
  document.getElementById('criticalChartContainer').style.height = h + 'px';

  new Chart(ctx, {
    type: 'bar',
    data: { labels, datasets: [
      { label: 'Low→High (1–2 pred as 5–6)', data: lh, backgroundColor: '#dc2626', borderRadius: 3, barPercentage: 0.7 },
      { label: 'High→Low (5–6 pred as 1–2)', data: hl, backgroundColor: '#f97316', borderRadius: 3, barPercentage: 0.7 },
    ] },
    options: {
      indexAxis: 'y', responsive: true, maintainAspectRatio: false,
      animation: { duration: 400 },
      plugins: {
        legend: { position: 'bottom', labels: { color: '#94a3b8', font: { family: 'JetBrains Mono', size: 10 }, boxWidth: 12, padding: 16 } },
        tooltip: { backgroundColor: '#1e2a3a', callbacks: { label: ctx => ` ${ctx.dataset.label}: ${ctx.parsed.x}%` } }
      },
      scales: {
        x: { min: 0, grid: { color: '#1a2236' },
          ticks: { color: '#64748b', font: { family: 'JetBrains Mono', size: 10 }, callback: v => v + '%' },
          title: { display: true, text: '% of predictions within true class', color: '#64748b', font: { family: 'JetBrains Mono', size: 10 } }
        },
        y: { grid: { display: false },
          ticks: { color: '#cbd5e1', font: { family: 'JetBrains Mono', size: 10 } }
        }
      }
    }
  });
}
// ── confusion heatmap with dropdown ──
// Chart.js instance of the confusion heatmap, kept so it can be destroyed
// before the heatmap is redrawn.
let confChartInstance = null;
// Append one <option> per model of the active dataset to the confusion
// dropdown; the option value is the model's index in dsRows().
// Existing options are not removed.
function populateConfSelect() {
  const select = document.getElementById('confModelSelect');
  for (const [position, entry] of dsRows().entries()) {
    const option = document.createElement('option');
    option.value = position;
    option.textContent = entry.model;
    select.appendChild(option);
  }
}
/**
 * Draw the 6×6 confusion heatmap for the model selected in the
 * `confModelSelect` dropdown, using the ACTIVE dataset's rows.
 *
 * Rows are true ratings (GT 1–6), columns predicted ratings. Cell values are
 * read from `row.confusion[gt][pred]` and displayed as percentages. The
 * heatmap is painted by a custom `afterDraw` plugin on top of an otherwise
 * empty scatter chart that only provides the axes and tooltips.
 * If the selected index has no row, the previous chart is removed and
 * nothing is drawn.
 */
window.renderConfusion = function() {
  const idx = parseInt(document.getElementById('confModelSelect').value || '0', 10);
  // Dropdown values index into the active dataset's rows, so look the row up
  // there rather than always in dataset 1.
  const row = dsRows()[idx];

  if (confChartInstance) {
    confChartInstance.destroy();
    confChartInstance = null;
  }
  if (!row) return;

  const conf = row.confusion || {};
  const scores = [1, 2, 3, 4, 5, 6];
  // One point per cell: x = predicted index, y = true index, v = cell value.
  const data = [];
  scores.forEach((gt, ri) => {
    const preds = conf[gt] || {};
    const rowSum = Object.values(preds).reduce((a, b) => a + b, 0);
    scores.forEach((pred, ci) => {
      const v = rowSum > 0 ? (preds[pred] || 0) : 0;
      data.push({ x: ci, y: ri, v });
    });
  });

  const canvas = document.getElementById('confusionChart');
  const ctx = canvas.getContext('2d');
  canvas.style.height = '340px';

  // Green on the diagonal (correct), red when off by 3 or more, blue otherwise;
  // opacity grows with the cell value.
  function cellColor(ri, ci, v) {
    if (ri === ci) return `rgba(22,163,74,${0.15 + v * 0.85})`;
    if (Math.abs(ri - ci) >= 3) return `rgba(220,38,38,${v * 0.9})`;
    return `rgba(37,99,235,${v * 0.75})`;
  }

  confChartInstance = new Chart(ctx, {
    type: 'scatter',
    // Points are invisible; they exist only so tooltips have something to hit.
    data: { datasets: [{ data, pointRadius: 0 }] },
    options: {
      responsive: true, maintainAspectRatio: false, animation: { duration: 300 },
      plugins: {
        legend: { display: false },
        tooltip: { backgroundColor: '#1e2a3a', callbacks: {
          title: items => `GT ${scores[items[0].raw.y]} → Pred ${scores[items[0].raw.x]}`,
          label: item => ` ${(item.raw.v * 100).toFixed(1)}% of true-class predictions`
        } }
      },
      scales: {
        x: {
          type: 'linear', min: -0.5, max: 5.5,
          ticks: {
            stepSize: 1,
            callback: v => (Number.isInteger(v) && v >= 0 && v <= 5) ? 'Pred ' + scores[v] : '',
            color: '#64748b', font: { family: 'JetBrains Mono', size: 10 }
          },
          grid: { color: '#1a2236' },
          position: 'top'
        },
        y: {
          type: 'linear', min: -0.5, max: 5.5,
          reverse: true,
          ticks: {
            stepSize: 1,
            callback: v => (Number.isInteger(v) && v >= 0 && v <= 5) ? 'GT ' + scores[v] : '',
            color: '#64748b', font: { family: 'JetBrains Mono', size: 10 }
          },
          grid: { color: '#1a2236' },
        }
      }
    },
    plugins: [{
      id: 'heatmap',
      afterDraw(chart) {
        const {ctx, scales: {x, y}} = chart;
        // Size of one cell in pixels, derived from the axis scales.
        const cellW = Math.abs(x.getPixelForValue(1) - x.getPixelForValue(0));
        const cellH = Math.abs(y.getPixelForValue(1) - y.getPixelForValue(0));
        data.forEach(d => {
          const cx = x.getPixelForValue(d.x);
          const cy = y.getPixelForValue(d.y);
          ctx.fillStyle = cellColor(d.y, d.x, d.v);
          // 1px inset on every side leaves a thin gap between cells.
          ctx.fillRect(cx - cellW / 2 + 1, cy - cellH / 2 + 1, cellW - 2, cellH - 2);
          // Label only cells above 0.5% to avoid a grid of "0%".
          if (d.v > 0.005) {
            ctx.fillStyle = d.v > 0.3 ? '#fff' : '#94a3b8';
            ctx.font = `bold 11px JetBrains Mono, monospace`;
            ctx.textAlign = 'center';
            ctx.textBaseline = 'middle';
            ctx.fillText((d.v * 100).toFixed(0) + '%', cx, cy);
          }
        });
      }
    }]
  });
};
// Without a second dataset, disable and dim its toggle button (it stays visible).
if (!HAS_SECOND) {
  const secondBtn = document.getElementById('ds-btn-2');
  secondBtn.disabled = true;
  secondBtn.style.opacity = '0.35';
  secondBtn.title = 'FinePDF dataset not loaded';
}
// Initial draw of every section, in order: table, global chart, distribution,
// bias, critical confusion, then the confusion dropdown and its heatmap.
[
  render,
  renderChart,
  renderDist,
  renderBias,
  renderCritical,
  populateConfSelect,
  renderConfusion,
].forEach(step => step());
})();
</script>
</body>
</html>