adgw committed on
Commit
95330bf
·
verified ·
1 Parent(s): e33afa4

Update benchmark leaderboard

Browse files
Files changed (1) hide show
  1. index.html +260 -4
index.html CHANGED
@@ -20,6 +20,11 @@
20
  letter-spacing: -0.02em; color: #f1f5f9; margin-bottom: 6px;
21
  }
22
  .subtitle { color: #64748b; font-size: 12px; margin-bottom: 12px; }
 
 
 
 
 
23
  .scoring-note {
24
  display: inline-flex; gap: 16px; flex-wrap: wrap;
25
  background: #131820; border: 1px solid #1e2a3a;
@@ -110,6 +115,31 @@
110
  display: inline-block; height: 6px; border-radius: 3px;
111
  background: #2563eb; vertical-align: middle; margin-left: 4px; opacity: 0.7;
112
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  .footer { margin-top: 20px; font-size: 11px; color: #2d3748; text-align: right; }
114
  ::-webkit-scrollbar { height: 5px; background: #0d1117; }
115
  ::-webkit-scrollbar-thumb { background: #2d3748; border-radius: 3px; }
@@ -119,7 +149,7 @@
119
  <h1>Text Quality Rating Benchmark</h1>
120
  <p class="meta-subtitle">
121
  LLM accuracy at rating text quality on a 1–6 scale across multiple languages
122
- <span class="sep">·</span> Labeled by DeepSeek V3.2 &amp; judged by Gemini 3.0 Flash
123
  <span class="sep">·</span> Documents sourced from FineWeb dataset
124
  </p>
125
 
@@ -144,7 +174,7 @@
144
  </table>
145
  </div>
146
 
147
- DISTRIBUTION SECTION (disabled for testing)
148
  <p class="section-title">Dataset Distribution</p>
149
  <p class="subtitle" style="margin-bottom:20px">Number of unique texts per rating score (1–6) for each language</p>
150
  <div class="dist-wrap">
@@ -153,6 +183,7 @@
153
  <tbody id="dist-body"></tbody>
154
  </table>
155
  </div>
 
156
 
157
  <p class="section-title" style="margin-top:52px">Global Model Comparison</p>
158
  <p class="subtitle" style="margin-bottom:20px">Weighted Score vs Exact Accuracy — all languages combined, sorted by Weighted Score</p>
@@ -160,11 +191,47 @@
160
  <canvas id="globalChart"></canvas>
161
  </div>
162
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  <div class="footer" id="footer"></div>
164
 
165
  <script>
166
  (function() {
167
- const ALL_ROWS = [{"model": "Qwen/Qwen3.5-397B-A17B-FP8", "avg_exact": 0.656148, "avg_wp": 0.808234, "total": 17112, "lang_exact": {"ar": 0.675, "az": 0.7025, "be": 0.785, "bg": 0.759669, "bo": 0.735, "ca": 0.732591, "cn": 0.628333, "cs": 0.7425, "da": 0.565, "el": 0.6075, "en": 0.46, "es": 0.777778, "et": 0.6575, "eu": 0.474037, "fa": 0.565, "fi": 0.77, "fr": 0.769634, "gl": 0.521667, "hu": 0.6475, "hv": 0.7175, "is": 0.715, "it": 0.78, "ka": 0.688679, "la": 0.662768, "li": 0.6, "lv": 0.725, "mk": 0.54, "mt": 0.74, "nl": 0.58, "no": 0.695, "pl": 0.486865, "pt": 0.695, "ro": 0.625, "ru": 0.759076, "sk": 0.63, "sl": 0.715, "sq": 0.8025, "sr": 0.5625, "sv": 0.66, "tr": 0.55, "uk": 0.764706}, "lang_wp": {"ar": 0.8225, "az": 0.83375, "be": 0.87375, "bg": 0.870166, "bo": 0.85625, "ca": 0.85376, "cn": 0.7875, "cs": 0.85125, "da": 0.77125, "el": 0.79625, "en": 0.71125, "es": 0.876877, "et": 0.8175, "eu": 0.68593, "fa": 0.758333, "fi": 0.87125, "fr": 0.870419, "gl": 0.721667, "hu": 0.79625, "hv": 0.85625, "is": 0.8475, "it": 0.87875, "ka": 0.794025, "la": 0.798246, "li": 0.78875, "lv": 0.85625, "mk": 0.75125, "mt": 0.8475, "nl": 0.78, "no": 0.8075, "pl": 0.707531, "pt": 0.80625, "ro": 0.7825, "ru": 0.866337, "sk": 0.8, "sl": 0.85, "sq": 0.89375, "sr": 0.77375, "sv": 0.82125, "tr": 0.755, "uk": 0.874332}}, {"model": "google/gemini-3-flash-preview", "avg_exact": 0.577296, "avg_wp": 0.760411, "total": 1753, "lang_exact": {"ar": 0.575, "az": 0.55, "be": 0.625, "bg": 0.75, "bo": 0.575, "ca": 0.725, "cn": 0.566667, "cs": 0.525, "da": 0.475, "el": 0.575, "en": 0.4, "es": 0.825, "et": 0.625, "eu": 0.416667, "fa": 0.516667, "fi": 0.625, "fr": 0.675, "gl": 0.533333, "hu": 0.575, "hv": 0.7, "is": 0.55, "it": 0.725, "ka": 0.6, "la": 0.5, "li": 0.5, "lv": 0.6, "mk": 0.525, "mt": 0.7, "nl": 0.45, "no": 0.6, "pl": 0.366667, "pt": 0.525, "ro": 0.625, "ru": 0.69697, "sk": 0.525, "sl": 0.7, "sq": 0.675, "sr": 0.475, "sv": 0.6, "tr": 0.6, "uk": 0.6}, "lang_wp": {"ar": 0.7875, "az": 0.6875, 
"be": 0.8125, "bg": 0.8625, "bo": 0.775, "ca": 0.8375, "cn": 0.783333, "cs": 0.725, "da": 0.725, "el": 0.775, "en": 0.65, "es": 0.9125, "et": 0.8125, "eu": 0.608333, "fa": 0.725, "fi": 0.775, "fr": 0.8125, "gl": 0.7, "hu": 0.7875, "hv": 0.85, "is": 0.7125, "it": 0.85, "ka": 0.8, "la": 0.658333, "li": 0.725, "lv": 0.7875, "mk": 0.7125, "mt": 0.8, "nl": 0.7, "no": 0.775, "pl": 0.641667, "pt": 0.75, "ro": 0.75, "ru": 0.833333, "sk": 0.7625, "sl": 0.8375, "sq": 0.825, "sr": 0.7375, "sv": 0.8, "tr": 0.7625, "uk": 0.7875}}, {"model": "openai/gpt-4o-mini", "avg_exact": 0.596461, "avg_wp": 0.757991, "total": 1752, "lang_exact": {"ar": 0.625, "az": 0.5, "be": 0.725, "bg": 0.675, "bo": 0.675, "ca": 0.475, "cn": 0.7, "cs": 0.6, "da": 0.525, "el": 0.525, "en": 0.425, "es": 0.8, "et": 0.875, "eu": 0.35, "fa": 0.533333, "fi": 0.65, "fr": 0.575, "gl": 0.433333, "hu": 0.65, "hv": 0.7, "is": 0.6, "it": 0.725, "ka": 0.475, "la": 0.4, "li": 0.675, "lv": 0.725, "mk": 0.4, "mt": 0.6, "nl": 0.475, "no": 0.675, "pl": 0.383333, "pt": 0.75, "ro": 0.5, "ru": 0.848485, "sk": 0.7, "sl": 0.7, "sq": 0.7, "sr": 0.475, "sv": 0.725, "tr": 0.65, "uk": 0.692308}, "lang_wp": {"ar": 0.7875, "az": 0.725, "be": 0.8625, "bg": 0.8125, "bo": 0.8375, "ca": 0.675, "cn": 0.841667, "cs": 0.7625, "da": 0.725, "el": 0.7375, "en": 0.5875, "es": 0.9, "et": 0.9375, "eu": 0.583333, "fa": 0.733333, "fi": 0.775, "fr": 0.7625, "gl": 0.666667, "hu": 0.8125, "hv": 0.825, "is": 0.7875, "it": 0.8375, "ka": 0.6625, "la": 0.566667, "li": 0.8125, "lv": 0.8625, "mk": 0.575, "mt": 0.7875, "nl": 0.7, "no": 0.7875, "pl": 0.566667, "pt": 0.8625, "ro": 0.7, "ru": 0.893939, "sk": 0.825, "sl": 0.8375, "sq": 0.825, "sr": 0.6875, "sv": 0.8375, "tr": 0.8, "uk": 0.833333}}, {"model": "qwen/qwen3-235b-a22b-2507", "avg_exact": 0.496292, "avg_wp": 0.693953, "total": 1753, "lang_exact": {"ar": 0.6, "az": 0.475, "be": 0.55, "bg": 0.65, "bo": 0.55, "ca": 0.525, "cn": 0.433333, "cs": 0.425, "da": 0.45, "el": 0.55, "en": 0.425, "es": 0.525, 
"et": 0.5, "eu": 0.416667, "fa": 0.616667, "fi": 0.55, "fr": 0.6, "gl": 0.433333, "hu": 0.65, "hv": 0.625, "is": 0.475, "it": 0.65, "ka": 0.325, "la": 0.433333, "li": 0.4, "lv": 0.575, "mk": 0.35, "mt": 0.475, "nl": 0.35, "no": 0.4, "pl": 0.316667, "pt": 0.575, "ro": 0.5, "ru": 0.454545, "sk": 0.45, "sl": 0.575, "sq": 0.6, "sr": 0.5, "sv": 0.475, "tr": 0.45, "uk": 0.625}, "lang_wp": {"ar": 0.775, "az": 0.7, "be": 0.75, "bg": 0.8, "bo": 0.75, "ca": 0.7, "cn": 0.616667, "cs": 0.6375, "da": 0.6625, "el": 0.725, "en": 0.55, "es": 0.725, "et": 0.75, "eu": 0.608333, "fa": 0.783333, "fi": 0.725, "fr": 0.7625, "gl": 0.625, "hu": 0.8125, "hv": 0.8, "is": 0.7, "it": 0.8125, "ka": 0.6, "la": 0.558333, "li": 0.6625, "lv": 0.775, "mk": 0.575, "mt": 0.675, "nl": 0.6, "no": 0.65, "pl": 0.458333, "pt": 0.775, "ro": 0.7375, "ru": 0.712121, "sk": 0.7, "sl": 0.775, "sq": 0.7625, "sr": 0.725, "sv": 0.725, "tr": 0.675, "uk": 0.8}}, {"model": "deepseek/deepseek-v3.2", "avg_exact": 0.403879, "avg_wp": 0.629492, "total": 1753, "lang_exact": {"ar": 0.375, "az": 0.325, "be": 0.475, "bg": 0.475, "bo": 0.475, "ca": 0.425, "cn": 0.533333, "cs": 0.5, "da": 0.275, "el": 0.25, "en": 0.525, "es": 0.425, "et": 0.425, "eu": 0.333333, "fa": 0.35, "fi": 0.375, "fr": 0.45, "gl": 0.483333, "hu": 0.4, "hv": 0.475, "is": 0.575, "it": 0.425, "ka": 0.35, "la": 0.3, "li": 0.375, "lv": 0.4, "mk": 0.325, "mt": 0.375, "nl": 0.325, "no": 0.4, "pl": 0.366667, "pt": 0.475, "ro": 0.25, "ru": 0.484848, "sk": 0.375, "sl": 0.6, "sq": 0.375, "sr": 0.3, "sv": 0.375, "tr": 0.375, "uk": 0.425}, "lang_wp": {"ar": 0.625, "az": 0.5625, "be": 0.725, "bg": 0.675, "bo": 0.7, "ca": 0.6375, "cn": 0.741667, "cs": 0.6875, "da": 0.5125, "el": 0.475, "en": 0.6875, "es": 0.7, "et": 0.7, "eu": 0.541667, "fa": 0.633333, "fi": 0.5875, "fr": 0.6875, "gl": 0.683333, "hu": 0.65, "hv": 0.7125, "is": 0.7625, "it": 0.6875, "ka": 0.575, "la": 0.5, "li": 0.5375, "lv": 0.6625, "mk": 0.55, "mt": 0.5625, "nl": 0.5625, "no": 0.625, "pl": 0.558333, 
"pt": 0.7, "ro": 0.4625, "ru": 0.712121, "sk": 0.625, "sl": 0.775, "sq": 0.6625, "sr": 0.6, "sv": 0.6, "tr": 0.625, "uk": 0.6125}}, {"model": "z-ai/glm-4-32b", "avg_exact": 0.432402, "avg_wp": 0.620936, "total": 1753, "lang_exact": {"ar": 0.375, "az": 0.5, "be": 0.625, "bg": 0.375, "bo": 0.625, "ca": 0.35, "cn": 0.383333, "cs": 0.625, "da": 0.35, "el": 0.55, "en": 0.3, "es": 0.35, "et": 0.525, "eu": 0.3, "fa": 0.383333, "fi": 0.525, "fr": 0.525, "gl": 0.25, "hu": 0.5, "hv": 0.625, "is": 0.35, "it": 0.4, "ka": 0.5, "la": 0.433333, "li": 0.425, "lv": 0.525, "mk": 0.225, "mt": 0.35, "nl": 0.425, "no": 0.475, "pl": 0.35, "pt": 0.25, "ro": 0.3, "ru": 0.515152, "sk": 0.525, "sl": 0.475, "sq": 0.7, "sr": 0.375, "sv": 0.4, "tr": 0.425, "uk": 0.525}, "lang_wp": {"ar": 0.525, "az": 0.7, "be": 0.775, "bg": 0.5375, "bo": 0.7625, "ca": 0.5, "cn": 0.583333, "cs": 0.775, "da": 0.6, "el": 0.7375, "en": 0.45, "es": 0.575, "et": 0.725, "eu": 0.533333, "fa": 0.608333, "fi": 0.6375, "fr": 0.6625, "gl": 0.375, "hu": 0.65, "hv": 0.8125, "is": 0.5875, "it": 0.6, "ka": 0.65, "la": 0.558333, "li": 0.65, "lv": 0.7125, "mk": 0.4125, "mt": 0.5625, "nl": 0.675, "no": 0.675, "pl": 0.575, "pt": 0.4375, "ro": 0.525, "ru": 0.69697, "sk": 0.7375, "sl": 0.6875, "sq": 0.8375, "sr": 0.6, "sv": 0.625, "tr": 0.675, "uk": 0.7125}}, {"model": "speakleash/Bielik-11B-v3.0-Instruct", "avg_exact": 0.41462, "avg_wp": 0.601942, "total": 1751, "lang_exact": {"ar": 0.025, "az": 0.525, "be": 0.2, "bg": 0.525, "bo": 0.675, "ca": 0.325, "cn": 0.271186, "cs": 0.5, "da": 0.425, "el": 0.325, "en": 0.5, "es": 0.475, "et": 0.55, "eu": 0.25, "fa": 0.366667, "fi": 0.425, "fr": 0.5, "gl": 0.4, "hu": 0.475, "hv": 0.525, "is": 0.225, "it": 0.475, "ka": 0.230769, "la": 0.3, "li": 0.45, "lv": 0.425, "mk": 0.275, "mt": 0.375, "nl": 0.45, "no": 0.475, "pl": 0.366667, "pt": 0.475, "ro": 0.425, "ru": 0.606061, "sk": 0.475, "sl": 0.55, "sq": 0.5, "sr": 0.45, "sv": 0.425, "tr": 0.45, "uk": 0.625}, "lang_wp": {"ar": 0.15, "az": 
0.6875, "be": 0.275, "bg": 0.675, "bo": 0.7875, "ca": 0.6125, "cn": 0.40678, "cs": 0.6125, "da": 0.675, "el": 0.5625, "en": 0.65, "es": 0.7125, "et": 0.7375, "eu": 0.483333, "fa": 0.583333, "fi": 0.5375, "fr": 0.725, "gl": 0.608333, "hu": 0.6625, "hv": 0.675, "is": 0.3875, "it": 0.6875, "ka": 0.371795, "la": 0.483333, "li": 0.675, "lv": 0.675, "mk": 0.575, "mt": 0.575, "nl": 0.7125, "no": 0.6875, "pl": 0.541667, "pt": 0.6875, "ro": 0.65, "ru": 0.727273, "sk": 0.6125, "sl": 0.6625, "sq": 0.6625, "sr": 0.6, "sv": 0.7, "tr": 0.7, "uk": 0.75}}, {"model": "google/gemini-2.0-flash-lite-001", "avg_exact": 0.385054, "avg_wp": 0.586423, "total": 1753, "lang_exact": {"ar": 0.325, "az": 0.325, "be": 0.55, "bg": 0.6, "bo": 0.4, "ca": 0.275, "cn": 0.45, "cs": 0.3, "da": 0.4, "el": 0.225, "en": 0.525, "es": 0.4, "et": 0.55, "eu": 0.333333, "fa": 0.516667, "fi": 0.4, "fr": 0.425, "gl": 0.383333, "hu": 0.45, "hv": 0.425, "is": 0.2, "it": 0.425, "ka": 0.3, "la": 0.216667, "li": 0.425, "lv": 0.475, "mk": 0.2, "mt": 0.425, "nl": 0.4, "no": 0.325, "pl": 0.366667, "pt": 0.375, "ro": 0.275, "ru": 0.606061, "sk": 0.425, "sl": 0.375, "sq": 0.425, "sr": 0.25, "sv": 0.425, "tr": 0.4, "uk": 0.275}, "lang_wp": {"ar": 0.525, "az": 0.6, "be": 0.75, "bg": 0.7625, "bo": 0.5875, "ca": 0.525, "cn": 0.608333, "cs": 0.5, "da": 0.55, "el": 0.5, "en": 0.7125, "es": 0.65, "et": 0.6875, "eu": 0.566667, "fa": 0.691667, "fi": 0.6375, "fr": 0.6125, "gl": 0.633333, "hu": 0.6375, "hv": 0.625, "is": 0.45, "it": 0.6125, "ka": 0.525, "la": 0.383333, "li": 0.575, "lv": 0.7, "mk": 0.4625, "mt": 0.65, "nl": 0.6125, "no": 0.4875, "pl": 0.55, "pt": 0.525, "ro": 0.4375, "ru": 0.712121, "sk": 0.6, "sl": 0.6125, "sq": 0.6125, "sr": 0.4875, "sv": 0.6375, "tr": 0.5875, "uk": 0.525}}, {"model": "google/gemma-3-12b-it", "avg_exact": 0.337707, "avg_wp": 0.573873, "total": 1753, "lang_exact": {"ar": 0.275, "az": 0.35, "be": 0.35, "bg": 0.475, "bo": 0.45, "ca": 0.225, "cn": 0.383333, "cs": 0.45, "da": 0.375, "el": 0.275, "en": 
0.45, "es": 0.325, "et": 0.4, "eu": 0.233333, "fa": 0.25, "fi": 0.4, "fr": 0.425, "gl": 0.166667, "hu": 0.35, "hv": 0.4, "is": 0.475, "it": 0.325, "ka": 0.3, "la": 0.3, "li": 0.25, "lv": 0.425, "mk": 0.275, "mt": 0.25, "nl": 0.45, "no": 0.375, "pl": 0.366667, "pt": 0.25, "ro": 0.25, "ru": 0.575758, "sk": 0.275, "sl": 0.35, "sq": 0.35, "sr": 0.325, "sv": 0.175, "tr": 0.325, "uk": 0.375}, "lang_wp": {"ar": 0.5, "az": 0.625, "be": 0.6125, "bg": 0.6625, "bo": 0.675, "ca": 0.4875, "cn": 0.583333, "cs": 0.625, "da": 0.5875, "el": 0.5125, "en": 0.5625, "es": 0.5875, "et": 0.625, "eu": 0.466667, "fa": 0.533333, "fi": 0.625, "fr": 0.675, "gl": 0.375, "hu": 0.6125, "hv": 0.6375, "is": 0.6875, "it": 0.5375, "ka": 0.475, "la": 0.525, "li": 0.5875, "lv": 0.6625, "mk": 0.55, "mt": 0.5375, "nl": 0.725, "no": 0.5625, "pl": 0.575, "pt": 0.525, "ro": 0.4875, "ru": 0.787879, "sk": 0.5625, "sl": 0.6, "sq": 0.5125, "sr": 0.5875, "sv": 0.5125, "tr": 0.6125, "uk": 0.575}}, {"model": "mistralai/mistral-nemo", "avg_exact": 0.309184, "avg_wp": 0.499715, "total": 1753, "lang_exact": {"ar": 0.325, "az": 0.45, "be": 0.475, "bg": 0.325, "bo": 0.375, "ca": 0.25, "cn": 0.383333, "cs": 0.425, "da": 0.375, "el": 0.3, "en": 0.25, "es": 0.35, "et": 0.225, "eu": 0.216667, "fa": 0.266667, "fi": 0.35, "fr": 0.275, "gl": 0.283333, "hu": 0.2, "hv": 0.425, "is": 0.3, "it": 0.2, "ka": 0.425, "la": 0.183333, "li": 0.325, "lv": 0.425, "mk": 0.375, "mt": 0.325, "nl": 0.35, "no": 0.375, "pl": 0.25, "pt": 0.35, "ro": 0.2, "ru": 0.212121, "sk": 0.375, "sl": 0.35, "sq": 0.3, "sr": 0.25, "sv": 0.175, "tr": 0.275, "uk": 0.25}, "lang_wp": {"ar": 0.5, "az": 0.6125, "be": 0.675, "bg": 0.4625, "bo": 0.5, "ca": 0.5125, "cn": 0.608333, "cs": 0.5125, "da": 0.5375, "el": 0.4625, "en": 0.5, "es": 0.575, "et": 0.4875, "eu": 0.441667, "fa": 0.5, "fi": 0.525, "fr": 0.5625, "gl": 0.475, "hu": 0.425, "hv": 0.5875, "is": 0.4625, "it": 0.4, "ka": 0.5875, "la": 0.333333, "li": 0.525, "lv": 0.6625, "mk": 0.5375, "mt": 0.425, "nl": 
0.4875, "no": 0.5375, "pl": 0.466667, "pt": 0.525, "ro": 0.3, "ru": 0.484848, "sk": 0.6, "sl": 0.575, "sq": 0.425, "sr": 0.45, "sv": 0.4375, "tr": 0.45, "uk": 0.4375}}, {"model": "z-ai/glm-4.5-air", "avg_exact": 0.36203, "avg_wp": 0.498575, "total": 1754, "lang_exact": {"ar": 0.317073, "az": 0.4, "be": 0.525, "bg": 0.475, "bo": 0.45, "ca": 0.375, "cn": 0.4, "cs": 0.35, "da": 0.325, "el": 0.325, "en": 0.275, "es": 0.35, "et": 0.6, "eu": 0.3, "fa": 0.283333, "fi": 0.45, "fr": 0.3, "gl": 0.383333, "hu": 0.375, "hv": 0.175, "is": 0.25, "it": 0.125, "ka": 0.35, "la": 0.15, "li": 0.175, "lv": 0.1, "mk": 0.2, "mt": 0.275, "nl": 0.225, "no": 0.375, "pl": 0.35, "pt": 0.525, "ro": 0.4, "ru": 0.484848, "sk": 0.55, "sl": 0.625, "sq": 0.65, "sr": 0.325, "sv": 0.5, "tr": 0.45, "uk": 0.5}, "lang_wp": {"ar": 0.52439, "az": 0.55, "be": 0.6375, "bg": 0.6375, "bo": 0.575, "ca": 0.475, "cn": 0.6, "cs": 0.5, "da": 0.4875, "el": 0.5125, "en": 0.4375, "es": 0.5, "et": 0.75, "eu": 0.433333, "fa": 0.525, "fi": 0.5125, "fr": 0.475, "gl": 0.541667, "hu": 0.425, "hv": 0.1875, "is": 0.3625, "it": 0.15, "ka": 0.4125, "la": 0.208333, "li": 0.25, "lv": 0.2, "mk": 0.3375, "mt": 0.4375, "nl": 0.375, "no": 0.5125, "pl": 0.5, "pt": 0.6625, "ro": 0.5375, "ru": 0.621212, "sk": 0.7, "sl": 0.775, "sq": 0.75, "sr": 0.55, "sv": 0.625, "tr": 0.6375, "uk": 0.6625}}, {"model": "meta-llama/llama-4-scout", "avg_exact": 0.380137, "avg_wp": 0.497717, "total": 1752, "lang_exact": {"ar": 0.325, "az": 0.475, "be": 0.3, "bg": 0.375, "bo": 0.425, "ca": 0.3, "cn": 0.25, "cs": 0.525, "da": 0.375, "el": 0.275, "en": 0.225, "es": 0.475, "et": 0.425, "eu": 0.254237, "fa": 0.4, "fi": 0.5, "fr": 0.375, "gl": 0.2, "hu": 0.45, "hv": 0.45, "is": 0.55, "it": 0.425, "ka": 0.3, "la": 0.316667, "li": 0.35, "lv": 0.45, "mk": 0.275, "mt": 0.35, "nl": 0.325, "no": 0.475, "pl": 0.35, "pt": 0.5, "ro": 0.325, "ru": 0.545455, "sk": 0.5, "sl": 0.4, "sq": 0.575, "sr": 0.275, "sv": 0.425, "tr": 0.425, "uk": 0.35}, "lang_wp": {"ar": 0.4625, 
"az": 0.5625, "be": 0.4, "bg": 0.45, "bo": 0.5375, "ca": 0.3875, "cn": 0.441667, "cs": 0.6, "da": 0.4875, "el": 0.4, "en": 0.325, "es": 0.5875, "et": 0.5875, "eu": 0.389831, "fa": 0.55, "fi": 0.625, "fr": 0.4375, "gl": 0.433333, "hu": 0.5375, "hv": 0.525, "is": 0.65, "it": 0.4625, "ka": 0.4125, "la": 0.425, "li": 0.45, "lv": 0.575, "mk": 0.425, "mt": 0.5125, "nl": 0.475, "no": 0.6125, "pl": 0.441667, "pt": 0.6, "ro": 0.475, "ru": 0.575758, "sk": 0.55, "sl": 0.525, "sq": 0.6625, "sr": 0.4125, "sv": 0.5375, "tr": 0.6, "uk": 0.4625}}, {"model": "meta-llama/llama-3.3-70b-instruct", "avg_exact": 0.366589, "avg_wp": 0.49652, "total": 1724, "lang_exact": {"ar": 0.384615, "az": 0.394737, "be": 0.475, "bg": 0.4, "bo": 0.45, "ca": 0.25, "cn": 0.305085, "cs": 0.525, "da": 0.358974, "el": 0.447368, "en": 0.25641, "es": 0.324324, "et": 0.512821, "eu": 0.293103, "fa": 0.310345, "fi": 0.538462, "fr": 0.384615, "gl": 0.183333, "hu": 0.45, "hv": 0.4, "is": 0.45, "it": 0.384615, "ka": 0.35, "la": 0.310345, "li": 0.25641, "lv": 0.425, "mk": 0.225, "mt": 0.275, "nl": 0.425, "no": 0.375, "pl": 0.4, "pt": 0.358974, "ro": 0.131579, "ru": 0.515152, "sk": 0.425, "sl": 0.435897, "sq": 0.282051, "sr": 0.333333, "sv": 0.410256, "tr": 0.475, "uk": 0.35}, "lang_wp": {"ar": 0.5, "az": 0.513158, "be": 0.5625, "bg": 0.5375, "bo": 0.5375, "ca": 0.3625, "cn": 0.466102, "cs": 0.5875, "da": 0.5, "el": 0.5, "en": 0.435897, "es": 0.405405, "et": 0.679487, "eu": 0.431034, "fa": 0.465517, "fi": 0.653846, "fr": 0.448718, "gl": 0.383333, "hu": 0.5375, "hv": 0.5125, "is": 0.625, "it": 0.512821, "ka": 0.4625, "la": 0.431034, "li": 0.423077, "lv": 0.6, "mk": 0.3625, "mt": 0.3875, "nl": 0.5875, "no": 0.475, "pl": 0.558333, "pt": 0.474359, "ro": 0.315789, "ru": 0.530303, "sk": 0.6375, "sl": 0.551282, "sq": 0.487179, "sr": 0.448718, "sv": 0.512821, "tr": 0.625, "uk": 0.4375}}, {"model": "openai/gpt-4.1-nano", "avg_exact": 0.293212, "avg_wp": 0.494295, "total": 1753, "lang_exact": {"ar": 0.25, "az": 0.275, "be": 
0.25, "bg": 0.35, "bo": 0.325, "ca": 0.125, "cn": 0.4, "cs": 0.35, "da": 0.375, "el": 0.225, "en": 0.1, "es": 0.275, "et": 0.45, "eu": 0.2, "fa": 0.366667, "fi": 0.5, "fr": 0.325, "gl": 0.3, "hu": 0.25, "hv": 0.35, "is": 0.3, "it": 0.3, "ka": 0.125, "la": 0.2, "li": 0.275, "lv": 0.425, "mk": 0.125, "mt": 0.25, "nl": 0.3, "no": 0.275, "pl": 0.3, "pt": 0.225, "ro": 0.25, "ru": 0.393939, "sk": 0.275, "sl": 0.3, "sq": 0.325, "sr": 0.325, "sv": 0.375, "tr": 0.325, "uk": 0.325}, "lang_wp": {"ar": 0.5, "az": 0.5125, "be": 0.4875, "bg": 0.55, "bo": 0.525, "ca": 0.4, "cn": 0.616667, "cs": 0.55, "da": 0.425, "el": 0.475, "en": 0.4125, "es": 0.4625, "et": 0.6125, "eu": 0.391667, "fa": 0.6, "fi": 0.6, "fr": 0.5125, "gl": 0.5, "hu": 0.475, "hv": 0.5375, "is": 0.55, "it": 0.5125, "ka": 0.3125, "la": 0.316667, "li": 0.5375, "lv": 0.6125, "mk": 0.3125, "mt": 0.3875, "nl": 0.4125, "no": 0.425, "pl": 0.541667, "pt": 0.425, "ro": 0.4625, "ru": 0.651515, "sk": 0.4625, "sl": 0.575, "sq": 0.5625, "sr": 0.5375, "sv": 0.5, "tr": 0.4875, "uk": 0.5625}}, {"model": "google/gemma-3-27b-it", "avg_exact": 0.252139, "avg_wp": 0.488591, "total": 1753, "lang_exact": {"ar": 0.3, "az": 0.15, "be": 0.225, "bg": 0.3, "bo": 0.325, "ca": 0.175, "cn": 0.333333, "cs": 0.325, "da": 0.1, "el": 0.15, "en": 0.5, "es": 0.3, "et": 0.225, "eu": 0.233333, "fa": 0.266667, "fi": 0.225, "fr": 0.2, "gl": 0.25, "hu": 0.25, "hv": 0.25, "is": 0.375, "it": 0.225, "ka": 0.175, "la": 0.366667, "li": 0.1, "lv": 0.325, "mk": 0.15, "mt": 0.225, "nl": 0.325, "no": 0.175, "pl": 0.416667, "pt": 0.25, "ro": 0.05, "ru": 0.454545, "sk": 0.2, "sl": 0.275, "sq": 0.25, "sr": 0.1, "sv": 0.025, "tr": 0.325, "uk": 0.325}, "lang_wp": {"ar": 0.4875, "az": 0.4, "be": 0.4625, "bg": 0.4875, "bo": 0.5875, "ca": 0.4, "cn": 0.575, "cs": 0.4625, "da": 0.375, "el": 0.325, "en": 0.65, "es": 0.5125, "et": 0.5125, "eu": 0.475, "fa": 0.55, "fi": 0.475, "fr": 0.475, "gl": 0.466667, "hu": 0.55, "hv": 0.5, "is": 0.5625, "it": 0.4875, "ka": 0.375, "la": 
0.525, "li": 0.4125, "lv": 0.5875, "mk": 0.275, "mt": 0.5125, "nl": 0.5875, "no": 0.3875, "pl": 0.591667, "pt": 0.4625, "ro": 0.35, "ru": 0.666667, "sk": 0.5375, "sl": 0.5125, "sq": 0.475, "sr": 0.4125, "sv": 0.3625, "tr": 0.5625, "uk": 0.5625}}, {"model": "qwen/qwen-2.5-7b-instruct", "avg_exact": 0.26526, "avg_wp": 0.484598, "total": 1753, "lang_exact": {"ar": 0.2, "az": 0.15, "be": 0.25, "bg": 0.35, "bo": 0.4, "ca": 0.225, "cn": 0.316667, "cs": 0.3, "da": 0.325, "el": 0.2, "en": 0.4, "es": 0.325, "et": 0.25, "eu": 0.216667, "fa": 0.3, "fi": 0.275, "fr": 0.35, "gl": 0.366667, "hu": 0.225, "hv": 0.3, "is": 0.325, "it": 0.3, "ka": 0.125, "la": 0.183333, "li": 0.2, "lv": 0.35, "mk": 0.15, "mt": 0.225, "nl": 0.275, "no": 0.3, "pl": 0.216667, "pt": 0.25, "ro": 0.2, "ru": 0.393939, "sk": 0.325, "sl": 0.275, "sq": 0.1, "sr": 0.05, "sv": 0.35, "tr": 0.3, "uk": 0.275}, "lang_wp": {"ar": 0.3625, "az": 0.4, "be": 0.5125, "bg": 0.575, "bo": 0.625, "ca": 0.425, "cn": 0.525, "cs": 0.4875, "da": 0.525, "el": 0.425, "en": 0.6125, "es": 0.5625, "et": 0.5, "eu": 0.425, "fa": 0.466667, "fi": 0.5125, "fr": 0.5375, "gl": 0.566667, "hu": 0.475, "hv": 0.525, "is": 0.5125, "it": 0.5, "ka": 0.3125, "la": 0.383333, "li": 0.3875, "lv": 0.65, "mk": 0.3375, "mt": 0.4125, "nl": 0.525, "no": 0.5125, "pl": 0.425, "pt": 0.475, "ro": 0.4125, "ru": 0.621212, "sk": 0.5625, "sl": 0.5, "sq": 0.35, "sr": 0.35, "sv": 0.6, "tr": 0.5625, "uk": 0.5125}}, {"model": "meta-llama/llama-4-maverick", "avg_exact": 0.268291, "avg_wp": 0.47344, "total": 17112, "lang_exact": {"ar": 0.27, "az": 0.235, "be": 0.2225, "bg": 0.273481, "bo": 0.285, "ca": 0.259053, "cn": 0.388333, "cs": 0.285, "da": 0.225, "el": 0.26, "en": 0.28, "es": 0.267267, "et": 0.315, "eu": 0.221106, "fa": 0.278333, "fi": 0.2425, "fr": 0.225131, "gl": 0.271667, "hu": 0.24, "hv": 0.31, "is": 0.41, "it": 0.2775, "ka": 0.113208, "la": 0.397661, "li": 0.2375, "lv": 0.2725, "mk": 0.19, "mt": 0.2125, "nl": 0.3575, "no": 0.185, "pl": 0.355517, "pt": 0.235, 
"ro": 0.1525, "ru": 0.330033, "sk": 0.2025, "sl": 0.2675, "sq": 0.2825, "sr": 0.2, "sv": 0.185, "tr": 0.395, "uk": 0.248663}, "lang_wp": {"ar": 0.435, "az": 0.38125, "be": 0.475, "bg": 0.476519, "bo": 0.54, "ca": 0.415042, "cn": 0.620833, "cs": 0.52125, "da": 0.37125, "el": 0.4625, "en": 0.4425, "es": 0.462462, "et": 0.5675, "eu": 0.403685, "fa": 0.490833, "fi": 0.5075, "fr": 0.454188, "gl": 0.45, "hu": 0.42875, "hv": 0.53875, "is": 0.57625, "it": 0.48875, "ka": 0.278302, "la": 0.562378, "li": 0.43375, "lv": 0.485, "mk": 0.3675, "mt": 0.40125, "nl": 0.59, "no": 0.38125, "pl": 0.573555, "pt": 0.425, "ro": 0.38625, "ru": 0.617162, "sk": 0.42375, "sl": 0.48125, "sq": 0.56625, "sr": 0.3875, "sv": 0.3425, "tr": 0.60875, "uk": 0.471925}}, {"model": "google/gemma-3-4b-it", "avg_exact": 0.222031, "avg_wp": 0.460126, "total": 1743, "lang_exact": {"ar": 0.1, "az": 0.15, "be": 0.225, "bg": 0.2, "bo": 0.125, "ca": 0.051282, "cn": 0.298246, "cs": 0.394737, "da": 0.25, "el": 0.175, "en": 0.425, "es": 0.225, "et": 0.375, "eu": 0.216667, "fa": 0.216667, "fi": 0.25, "fr": 0.125, "gl": 0.25, "hu": 0.125, "hv": 0.225, "is": 0.15, "it": 0.275, "ka": 0.1, "la": 0.166667, "li": 0.175, "lv": 0.2, "mk": 0.15, "mt": 0.025, "nl": 0.2, "no": 0.25, "pl": 0.316667, "pt": 0.461538, "ro": 0.125, "ru": 0.272727, "sk": 0.263158, "sl": 0.225, "sq": 0.358974, "sr": 0.225, "sv": 0.25, "tr": 0.2, "uk": 0.275}, "lang_wp": {"ar": 0.425, "az": 0.5, "be": 0.5, "bg": 0.4125, "bo": 0.45, "ca": 0.294872, "cn": 0.54386, "cs": 0.539474, "da": 0.45, "el": 0.375, "en": 0.6375, "es": 0.425, "et": 0.5875, "eu": 0.441667, "fa": 0.45, "fi": 0.525, "fr": 0.35, "gl": 0.541667, "hu": 0.4, "hv": 0.475, "is": 0.375, "it": 0.4875, "ka": 0.325, "la": 0.35, "li": 0.4875, "lv": 0.525, "mk": 0.3625, "mt": 0.325, "nl": 0.4375, "no": 0.475, "pl": 0.516667, "pt": 0.615385, "ro": 0.3375, "ru": 0.545455, "sk": 0.526316, "sl": 0.425, "sq": 0.538462, "sr": 0.4875, "sv": 0.5, "tr": 0.4375, "uk": 0.45}}, {"model": 
"mistralai/mixtral-8x7b-instruct", "avg_exact": 0.245143, "avg_wp": 0.453429, "total": 1750, "lang_exact": {"ar": 0.175, "az": 0.2, "be": 0.15, "bg": 0.225, "bo": 0.25, "ca": 0.175, "cn": 0.254237, "cs": 0.4, "da": 0.25, "el": 0.2, "en": 0.3, "es": 0.3, "et": 0.275, "eu": 0.25, "fa": 0.183333, "fi": 0.275, "fr": 0.25, "gl": 0.216667, "hu": 0.25, "hv": 0.225, "is": 0.15, "it": 0.225, "ka": 0.153846, "la": 0.283333, "li": 0.2, "lv": 0.25, "mk": 0.225, "mt": 0.225, "nl": 0.3, "no": 0.325, "pl": 0.283333, "pt": 0.225, "ro": 0.3, "ru": 0.34375, "sk": 0.225, "sl": 0.325, "sq": 0.2, "sr": 0.25, "sv": 0.225, "tr": 0.275, "uk": 0.275}, "lang_wp": {"ar": 0.2875, "az": 0.3875, "be": 0.45, "bg": 0.4375, "bo": 0.425, "ca": 0.3375, "cn": 0.466102, "cs": 0.5625, "da": 0.5125, "el": 0.4, "en": 0.5125, "es": 0.5125, "et": 0.4875, "eu": 0.408333, "fa": 0.408333, "fi": 0.525, "fr": 0.4125, "gl": 0.408333, "hu": 0.4875, "hv": 0.475, "is": 0.4, "it": 0.4125, "ka": 0.269231, "la": 0.466667, "li": 0.4375, "lv": 0.5, "mk": 0.3875, "mt": 0.3625, "nl": 0.5625, "no": 0.55, "pl": 0.45, "pt": 0.475, "ro": 0.525, "ru": 0.53125, "sk": 0.4625, "sl": 0.5, "sq": 0.4, "sr": 0.475, "sv": 0.55, "tr": 0.4875, "uk": 0.55}}, {"model": "mistralai/mistral-small-3.2-24b-instruct", "avg_exact": 0.25029, "avg_wp": 0.450929, "total": 1722, "lang_exact": {"ar": 0.2, "az": 0.25, "be": 0.275, "bg": 0.225, "bo": 0.275, "ca": 0.225, "cn": 0.316667, "cs": 0.225, "da": 0.35, "el": 0.25, "en": 0.3, "es": 0.3, "et": 0.15, "eu": 0.183333, "fa": 0.333333, "fi": 0.275, "fr": 0.25, "gl": 0.166667, "hu": 0.275, "hv": 0.275, "is": 0.375, "it": 0.275, "ka": 0.2, "la": 0.4, "li": 0.205128, "lv": 0.325, "mk": 0.2, "mt": 0.1, "nl": 0.225, "no": 0.25, "pl": 0.3, "pt": 0.25, "ro": 0.222222, "ru": 0.3, "sk": 0.142857, "sl": 0.193548, "sq": 0.157895, "sr": 0.2, "sv": 0.3, "tr": 0.175, "uk": 0.225}, "lang_wp": {"ar": 0.4125, "az": 0.4375, "be": 0.4875, "bg": 0.425, "bo": 0.45, "ca": 0.425, "cn": 0.55, "cs": 0.3875, "da": 0.55, "el": 
0.3625, "en": 0.4125, "es": 0.425, "et": 0.4, "eu": 0.458333, "fa": 0.575, "fi": 0.3875, "fr": 0.425, "gl": 0.408333, "hu": 0.4875, "hv": 0.5125, "is": 0.6, "it": 0.4125, "ka": 0.375, "la": 0.566667, "li": 0.423077, "lv": 0.575, "mk": 0.3625, "mt": 0.35, "nl": 0.4125, "no": 0.375, "pl": 0.483333, "pt": 0.4, "ro": 0.375, "ru": 0.416667, "sk": 0.410714, "sl": 0.467742, "sq": 0.513158, "sr": 0.4625, "sv": 0.4375, "tr": 0.5125, "uk": 0.3875}}, {"model": "mistralai/mistral-7b-instruct", "avg_exact": 0.243871, "avg_wp": 0.418065, "total": 1550, "lang_exact": {"ar": 0.058824, "az": 0.27027, "be": 0.382353, "bg": 0.243243, "bo": 0.314286, "ca": 0.285714, "cn": 0.339286, "cs": 0.428571, "da": 0.28, "el": 0.194444, "en": 0.125, "es": 0.241379, "et": 0.482759, "eu": 0.226415, "fa": 0.355932, "fi": 0.269231, "fr": 0.184211, "gl": 0.206897, "hu": 0.138889, "hv": 0.361111, "is": 0.166667, "it": 0.28125, "ka": 0.277778, "la": 0.241379, "li": 0.171429, "lv": 0.142857, "mk": 0.205128, "mt": 0.162162, "nl": 0.285714, "no": 0.205128, "pl": 0.272727, "pt": 0.189189, "ro": 0.138889, "ru": 0.354839, "sk": 0.236842, "sl": 0.289474, "sq": 0.27027, "sr": 0.282051, "sv": 0.210526, "tr": 0.102564, "uk": 0.175}, "lang_wp": {"ar": 0.191176, "az": 0.445946, "be": 0.558824, "bg": 0.432432, "bo": 0.542857, "ca": 0.457143, "cn": 0.508929, "cs": 0.52381, "da": 0.4, "el": 0.388889, "en": 0.265625, "es": 0.396552, "et": 0.62069, "eu": 0.349057, "fa": 0.542373, "fi": 0.365385, "fr": 0.381579, "gl": 0.387931, "hu": 0.319444, "hv": 0.513889, "is": 0.416667, "it": 0.5, "ka": 0.416667, "la": 0.431034, "li": 0.285714, "lv": 0.328571, "mk": 0.397436, "mt": 0.337838, "nl": 0.5, "no": 0.384615, "pl": 0.445455, "pt": 0.378378, "ro": 0.333333, "ru": 0.467742, "sk": 0.434211, "sl": 0.394737, "sq": 0.486486, "sr": 0.487179, "sv": 0.421053, "tr": 0.320513, "uk": 0.35}}, {"model": "mistralai/mistral-small-24b-instruct-2501", "avg_exact": 0.179897, "avg_wp": 0.363221, "total": 1751, "lang_exact": {"ar": 0.05, "az": 
0.25, "be": 0.075, "bg": 0.2, "bo": 0.25, "ca": 0.25, "cn": 0.254237, "cs": 0.15, "da": 0.225, "el": 0.2, "en": 0.25, "es": 0.25, "et": 0.25, "eu": 0.2, "fa": 0.2, "fi": 0.225, "fr": 0.225, "gl": 0.183333, "hu": 0.125, "hv": 0.175, "is": 0.15, "it": 0.25, "ka": 0.025, "la": 0.35, "li": 0.05, "lv": 0.125, "mk": 0.225, "mt": 0.25, "nl": 0.175, "no": 0.225, "pl": 0.233333, "pt": 0.225, "ro": 0.1, "ru": 0.0, "sk": 0.15, "sl": 0.225, "sq": 0.025, "sr": 0.0, "sv": 0.25, "tr": 0.075, "uk": 0.075}, "lang_wp": {"ar": 0.225, "az": 0.45, "be": 0.3125, "bg": 0.4375, "bo": 0.3875, "ca": 0.3625, "cn": 0.457627, "cs": 0.3375, "da": 0.375, "el": 0.35, "en": 0.4125, "es": 0.4125, "et": 0.4125, "eu": 0.358333, "fa": 0.4, "fi": 0.3375, "fr": 0.375, "gl": 0.291667, "hu": 0.275, "hv": 0.35, "is": 0.375, "it": 0.4, "ka": 0.2625, "la": 0.516667, "li": 0.3, "lv": 0.3625, "mk": 0.375, "mt": 0.4, "nl": 0.3375, "no": 0.3625, "pl": 0.416667, "pt": 0.4125, "ro": 0.225, "ru": 0.28125, "sk": 0.325, "sl": 0.475, "sq": 0.375, "sr": 0.225, "sv": 0.4, "tr": 0.275, "uk": 0.325}}, {"model": "mistralai/ministral-14b-2512", "avg_exact": 0.196235, "avg_wp": 0.353394, "total": 1753, "lang_exact": {"ar": 0.15, "az": 0.225, "be": 0.175, "bg": 0.2, "bo": 0.25, "ca": 0.225, "cn": 0.233333, "cs": 0.25, "da": 0.175, "el": 0.25, "en": 0.275, "es": 0.175, "et": 0.075, "eu": 0.266667, "fa": 0.216667, "fi": 0.125, "fr": 0.25, "gl": 0.216667, "hu": 0.175, "hv": 0.2, "is": 0.325, "it": 0.125, "ka": 0.05, "la": 0.416667, "li": 0.125, "lv": 0.1, "mk": 0.175, "mt": 0.05, "nl": 0.1, "no": 0.125, "pl": 0.233333, "pt": 0.15, "ro": 0.175, "ru": 0.181818, "sk": 0.175, "sl": 0.125, "sq": 0.175, "sr": 0.225, "sv": 0.275, "tr": 0.225, "uk": 0.2}, "lang_wp": {"ar": 0.3125, "az": 0.3625, "be": 0.35, "bg": 0.4125, "bo": 0.35, "ca": 0.4, "cn": 0.425, "cs": 0.4125, "da": 0.325, "el": 0.375, "en": 0.4125, "es": 0.35, "et": 0.225, "eu": 0.425, "fa": 0.441667, "fi": 0.275, "fr": 0.375, "gl": 0.383333, "hu": 0.3, "hv": 0.3375, "is": 
0.4875, "it": 0.3125, "ka": 0.2125, "la": 0.533333, "li": 0.2625, "lv": 0.2875, "mk": 0.3375, "mt": 0.175, "nl": 0.225, "no": 0.225, "pl": 0.433333, "pt": 0.2875, "ro": 0.3375, "ru": 0.333333, "sk": 0.3375, "sl": 0.325, "sq": 0.3375, "sr": 0.325, "sv": 0.4, "tr": 0.425, "uk": 0.375}}, {"model": "meta-llama/llama-3.1-8b-instruct", "avg_exact": 0.209262, "avg_wp": 0.341052, "total": 1749, "lang_exact": {"ar": 0.135135, "az": 0.2, "be": 0.3, "bg": 0.175, "bo": 0.2, "ca": 0.225, "cn": 0.3, "cs": 0.25, "da": 0.25, "el": 0.225, "en": 0.275, "es": 0.075, "et": 0.275, "eu": 0.116667, "fa": 0.083333, "fi": 0.333333, "fr": 0.1, "gl": 0.133333, "hu": 0.175, "hv": 0.375, "is": 0.25, "it": 0.2, "ka": 0.225, "la": 0.116667, "li": 0.2, "lv": 0.225, "mk": 0.275, "mt": 0.2, "nl": 0.225, "no": 0.225, "pl": 0.083333, "pt": 0.225, "ro": 0.075, "ru": 0.242424, "sk": 0.225, "sl": 0.25, "sq": 0.225, "sr": 0.25, "sv": 0.375, "tr": 0.275, "uk": 0.225}, "lang_wp": {"ar": 0.297297, "az": 0.275, "be": 0.4625, "bg": 0.3375, "bo": 0.3125, "ca": 0.35, "cn": 0.441667, "cs": 0.4, "da": 0.3875, "el": 0.325, "en": 0.4, "es": 0.2125, "et": 0.3875, "eu": 0.191667, "fa": 0.216667, "fi": 0.5, "fr": 0.2375, "gl": 0.266667, "hu": 0.3375, "hv": 0.5375, "is": 0.375, "it": 0.325, "ka": 0.35, "la": 0.325, "li": 0.3, "lv": 0.4, "mk": 0.4, "mt": 0.3, "nl": 0.3625, "no": 0.3375, "pl": 0.258333, "pt": 0.35, "ro": 0.2, "ru": 0.30303, "sk": 0.3625, "sl": 0.375, "sq": 0.3375, "sr": 0.375, "sv": 0.475, "tr": 0.4, "uk": 0.3625}}, {"model": "speakleash/Bielik-11B-v2.6-Instruct", "avg_exact": 0.1502, "avg_wp": 0.340091, "total": 1751, "lang_exact": {"ar": 0.05, "az": 0.075, "be": 0.0, "bg": 0.175, "bo": 0.2, "ca": 0.175, "cn": 0.186441, "cs": 0.1, "da": 0.225, "el": 0.275, "en": 0.025, "es": 0.2, "et": 0.2, "eu": 0.166667, "fa": 0.266667, "fi": 0.125, "fr": 0.225, "gl": 0.216667, "hu": 0.2, "hv": 0.225, "is": 0.0, "it": 0.125, "ka": 0.0, "la": 0.2, "li": 0.225, "lv": 0.125, "mk": 0.2, "mt": 0.15, "nl": 0.225, "no": 
0.15, "pl": 0.183333, "pt": 0.075, "ro": 0.075, "ru": 0.090909, "sk": 0.125, "sl": 0.225, "sq": 0.0, "sr": 0.05, "sv": 0.15, "tr": 0.225, "uk": 0.075}, "lang_wp": {"ar": 0.175, "az": 0.25, "be": 0.1875, "bg": 0.3875, "bo": 0.4, "ca": 0.425, "cn": 0.29661, "cs": 0.3, "da": 0.4125, "el": 0.45, "en": 0.2, "es": 0.4625, "et": 0.4375, "eu": 0.333333, "fa": 0.433333, "fi": 0.35, "fr": 0.4375, "gl": 0.333333, "hu": 0.3375, "hv": 0.4375, "is": 0.1125, "it": 0.375, "ka": 0.128205, "la": 0.308333, "li": 0.4375, "lv": 0.3875, "mk": 0.4125, "mt": 0.35, "nl": 0.4625, "no": 0.4125, "pl": 0.4, "pt": 0.4125, "ro": 0.2625, "ru": 0.30303, "sk": 0.2875, "sl": 0.4375, "sq": 0.2, "sr": 0.2, "sv": 0.3375, "tr": 0.375, "uk": 0.25}}, {"model": "CYFRAGOVPL/Llama-PLLuM-70B-chat-250801", "avg_exact": 0.227169, "avg_wp": 0.335616, "total": 1752, "lang_exact": {"ar": 0.15, "az": 0.275, "be": 0.25, "bg": 0.175, "bo": 0.25, "ca": 0.175, "cn": 0.152542, "cs": 0.225, "da": 0.175, "el": 0.225, "en": 0.225, "es": 0.125, "et": 0.25, "eu": 0.1, "fa": 0.35, "fi": 0.25, "fr": 0.225, "gl": 0.133333, "hu": 0.175, "hv": 0.375, "is": 0.4, "it": 0.2, "ka": 0.175, "la": 0.3, "li": 0.25, "lv": 0.325, "mk": 0.225, "mt": 0.275, "nl": 0.25, "no": 0.25, "pl": 0.1, "pt": 0.25, "ro": 0.25, "ru": 0.30303, "sk": 0.2, "sl": 0.175, "sq": 0.25, "sr": 0.325, "sv": 0.25, "tr": 0.275, "uk": 0.15}, "lang_wp": {"ar": 0.35, "az": 0.4, "be": 0.3625, "bg": 0.275, "bo": 0.4, "ca": 0.2375, "cn": 0.194915, "cs": 0.375, "da": 0.2875, "el": 0.3125, "en": 0.2375, "es": 0.125, "et": 0.375, "eu": 0.183333, "fa": 0.508333, "fi": 0.3625, "fr": 0.35, "gl": 0.2, "hu": 0.2125, "hv": 0.55, "is": 0.5875, "it": 0.275, "ka": 0.2375, "la": 0.5, "li": 0.35, "lv": 0.4625, "mk": 0.275, "mt": 0.4625, "nl": 0.3625, "no": 0.3125, "pl": 0.141667, "pt": 0.3625, "ro": 0.3625, "ru": 0.439394, "sk": 0.325, "sl": 0.2875, "sq": 0.375, "sr": 0.475, "sv": 0.375, "tr": 0.4125, "uk": 0.2375}}, {"model": "CYFRAGOVPL/pllum-12b-nc-chat-250715", "avg_exact": 
0.146689, "avg_wp": 0.234018, "total": 1752, "lang_exact": {"ar": 0.075, "az": 0.175, "be": 0.275, "bg": 0.125, "bo": 0.325, "ca": 0.275, "cn": 0.101695, "cs": 0.075, "da": 0.125, "el": 0.175, "en": 0.15, "es": 0.2, "et": 0.075, "eu": 0.083333, "fa": 0.216667, "fi": 0.075, "fr": 0.25, "gl": 0.1, "hu": 0.025, "hv": 0.325, "is": 0.15, "it": 0.125, "ka": 0.075, "la": 0.133333, "li": 0.15, "lv": 0.1, "mk": 0.075, "mt": 0.1, "nl": 0.075, "no": 0.2, "pl": 0.116667, "pt": 0.1, "ro": 0.35, "ru": 0.212121, "sk": 0.15, "sl": 0.15, "sq": 0.075, "sr": 0.175, "sv": 0.125, "tr": 0.15, "uk": 0.075}, "lang_wp": {"ar": 0.0875, "az": 0.3375, "be": 0.3625, "bg": 0.2, "bo": 0.4875, "ca": 0.3625, "cn": 0.144068, "cs": 0.1125, "da": 0.2125, "el": 0.25, "en": 0.275, "es": 0.275, "et": 0.15, "eu": 0.116667, "fa": 0.35, "fi": 0.1375, "fr": 0.3875, "gl": 0.183333, "hu": 0.125, "hv": 0.475, "is": 0.225, "it": 0.225, "ka": 0.1125, "la": 0.308333, "li": 0.2125, "lv": 0.2375, "mk": 0.15, "mt": 0.2, "nl": 0.15, "no": 0.275, "pl": 0.141667, "pt": 0.1375, "ro": 0.425, "ru": 0.30303, "sk": 0.2625, "sl": 0.3, "sq": 0.1375, "sr": 0.25, "sv": 0.1625, "tr": 0.2625, "uk": 0.175}}];
168
  const ALL_LANGS = ["ar", "az", "be", "bg", "bo", "ca", "cn", "cs", "da", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "gl", "hu", "hv", "is", "it", "ka", "la", "li", "lv", "mk", "mt", "nl", "no", "pl", "pt", "ro", "ru", "sk", "sl", "sq", "sr", "sv", "tr", "uk"];
169
  const LANG_NAMES = {"af": "Afrikaans", "ar": "Arabic", "az": "Azerbaijani", "be": "Belarusian", "bo": "Bosnian", "bg": "Bulgarian", "bn": "Bulgarian", "ca": "Catalan", "cs": "Czech", "cn": "Chinese", "cy": "Welsh", "da": "Danish", "de": "German", "el": "Greek", "en": "English", "eo": "Esperanto", "es": "Spanish", "et": "Estonian", "eu": "Basque", "fa": "Faroese", "fi": "Finnish", "fr": "French", "ga": "Irish", "gl": "Galician", "gu": "Gujarati", "he": "Hebrew", "hi": "Hindi", "hr": "Croatian", "hu": "Hungarian", "hy": "Armenian", "hv": "Croatia", "id": "Indonesian", "is": "Icelandic", "it": "Italian", "ja": "Japanese", "ka": "Georgian", "kk": "Kazakh", "km": "Khmer", "kn": "Kannada", "ko": "Korean", "la": "Latin", "li": "Lithuanian", "lv": "Latvian", "mk": "Macedonian", "ml": "Malayalam", "mn": "Mongolian", "mr": "Marathi", "ms": "Malay", "mt": "Maltese", "my": "Burmese", "ne": "Nepali", "nl": "Dutch", "no": "Norwegian", "pa": "Punjabi", "pe": "Persian", "pl": "Polish", "pt": "Portuguese", "ro": "Romanian", "ru": "Russian", "si": "Sinhala", "sk": "Slovak", "sl": "Slovenian", "sq": "Albanian", "sr": "Serbian", "sv": "Swedish", "sw": "Swahili", "ta": "Tamil", "te": "Telugu", "th": "Thai", "tl": "Filipino", "tr": "Turkish", "uk": "Ukrainian", "ur": "Urdu", "uz": "Uzbek", "vi": "Vietnamese", "zh": "Chinese", "zu": "Zulu"};
170
  const LANG_COUNTS = {"ar": 1790, "az": 1795, "be": 1794, "bg": 1721, "bo": 1795, "ca": 1712, "cn": 2686, "cs": 1779, "da": 1784, "el": 1794, "en": 1791, "es": 1652, "et": 1788, "eu": 2684, "fa": 2697, "fi": 1784, "fr": 1761, "gl": 2698, "hu": 1796, "hv": 1796, "is": 1790, "it": 1791, "ka": 1629, "la": 2522, "li": 1793, "lv": 1795, "mk": 1799, "mt": 1797, "nl": 1795, "no": 1799, "pl": 2637, "pt": 1795, "ro": 1790, "ru": 1424, "sk": 1784, "sl": 1788, "sq": 1793, "sr": 1798, "sv": 1797, "tr": 1799, "uk": 1747};
@@ -408,9 +475,198 @@
408
  `77760 predictions · ${ALL_LANGS.length} languages · ${ALL_ROWS.length} models`;
409
  }
410
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
  render();
412
  renderChart();
413
- renderDist(); // disabled for testing
 
 
 
 
414
  })();
415
  </script>
416
  </body>
 
20
  letter-spacing: -0.02em; color: #f1f5f9; margin-bottom: 6px;
21
  }
22
  .subtitle { color: #64748b; font-size: 12px; margin-bottom: 12px; }
23
+ .meta-subtitle {
24
+ color: #64748b; font-size: 12px; margin-bottom: 20px;
25
+ line-height: 1.8;
26
+ }
27
+ .sep { color: #334155; margin: 0 8px; }
28
  .scoring-note {
29
  display: inline-flex; gap: 16px; flex-wrap: wrap;
30
  background: #131820; border: 1px solid #1e2a3a;
 
115
  display: inline-block; height: 6px; border-radius: 3px;
116
  background: #2563eb; vertical-align: middle; margin-left: 4px; opacity: 0.7;
117
  }
118
+ /* ── analysis sections ── */
119
+ .analysis-grid {
120
+ display: grid;
121
+ grid-template-columns: 1fr 1fr;
122
+ gap: 24px;
123
+ margin-bottom: 52px;
124
+ }
125
+ @media (max-width: 900px) { .analysis-grid { grid-template-columns: 1fr; } }
126
+ .analysis-card {
127
+ background: #111827; border: 1px solid #1e2a3a;
128
+ border-radius: 10px; padding: 22px 20px;
129
+ }
130
+ .analysis-card h3 {
131
+ font-family: 'Syne', sans-serif; font-size: 14px; font-weight: 700;
132
+ color: #f1f5f9; margin-bottom: 4px;
133
+ }
134
+ .analysis-card .card-sub {
135
+ font-size: 11px; color: #475569; margin-bottom: 16px; line-height: 1.5;
136
+ }
137
+ .model-select {
138
+ background: #1a2236; border: 1px solid #2d3748; border-radius: 6px;
139
+ color: #cbd5e1; font: inherit; font-size: 11px;
140
+ padding: 5px 10px; margin-bottom: 14px; cursor: pointer; width: 100%;
141
+ }
142
+ .model-select:focus { outline: none; border-color: #38bdf8; }
143
  .footer { margin-top: 20px; font-size: 11px; color: #2d3748; text-align: right; }
144
  ::-webkit-scrollbar { height: 5px; background: #0d1117; }
145
  ::-webkit-scrollbar-thumb { background: #2d3748; border-radius: 3px; }
 
149
  <h1>Text Quality Rating Benchmark</h1>
150
  <p class="meta-subtitle">
151
  LLM accuracy at rating text quality on a 1–6 scale across multiple languages
152
+ <span class="sep">·</span> Labeled by DeepSeek V3.2 &amp; judged by Gemini 3 Flash
153
  <span class="sep">·</span> Documents sourced from FineWeb dataset
154
  </p>
155
 
 
174
  </table>
175
  </div>
176
 
177
+ <!-- DISTRIBUTION SECTION (disabled for testing)
178
  <p class="section-title">Dataset Distribution</p>
179
  <p class="subtitle" style="margin-bottom:20px">Number of unique texts per rating score (1–6) for each language</p>
180
  <div class="dist-wrap">
 
183
  <tbody id="dist-body"></tbody>
184
  </table>
185
  </div>
186
+ -->
187
 
188
  <p class="section-title" style="margin-top:52px">Global Model Comparison</p>
189
  <p class="subtitle" style="margin-bottom:20px">Weighted Score vs Exact Accuracy — all languages combined, sorted by Weighted Score</p>
 
191
  <canvas id="globalChart"></canvas>
192
  </div>
193
 
194
+ <p class="section-title" style="margin-bottom:4px;margin-top:52px">Model Error Analysis</p>
195
+ <p class="subtitle" style="margin-bottom:20px">Bias, critical misclassifications and confusion patterns</p>
196
+
197
+ <div class="analysis-grid">
198
+
199
+ <!-- Bias lollipop -->
200
+ <div class="analysis-card">
201
+ <h3>Prediction Bias</h3>
202
+ <p class="card-sub">Average error (predicted − ground truth). Negative = underestimation, positive = overestimation.</p>
203
+ <div style="position:relative">
204
+ <canvas id="biasChart"></canvas>
205
+ </div>
206
+ </div>
207
+
208
+ <!-- Critical confusion 1/2 vs 5/6 -->
209
+ <div class="analysis-card">
210
+ <h3>Critical Confusion Rate</h3>
211
+ <p class="card-sub">
212
+ % of low-quality texts (rating 1–2) predicted as high-quality (5–6) and vice versa.
213
+ These are the most dangerous misclassifications.
214
+ </p>
215
+ <canvas id="criticalChart"></canvas>
216
+ </div>
217
+
218
+ </div>
219
+
220
+ <!-- Full confusion heatmap with model dropdown -->
221
+ <div class="analysis-card" style="margin-bottom:52px">
222
+ <h3>Confusion Matrix</h3>
223
+ <p class="card-sub">Row = ground truth rating, column = predicted rating. Values show % of predictions within each true class.</p>
224
+ <select class="model-select" id="confModelSelect" onchange="renderConfusion()"></select>
225
+ <div id="confusionWrap" style="overflow-x:auto">
226
+ <canvas id="confusionChart"></canvas>
227
+ </div>
228
+ </div>
229
+
230
  <div class="footer" id="footer"></div>
231
 
232
  <script>
233
  (function() {
234
+ const ALL_ROWS = [{"model": "Qwen/Qwen3.5-397B-A17B-FP8", "avg_exact": 0.656148, "avg_wp": 0.808234, "avg_bias": -0.1102, "total": 17112, "lang_exact": {"ar": 0.675, "az": 0.7025, "be": 0.785, "bg": 0.759669, "bo": 0.735, "ca": 0.732591, "cn": 0.628333, "cs": 0.7425, "da": 0.565, "el": 0.6075, "en": 0.46, "es": 0.777778, "et": 0.6575, "eu": 0.474037, "fa": 0.565, "fi": 0.77, "fr": 0.769634, "gl": 0.521667, "hu": 0.6475, "hv": 0.7175, "is": 0.715, "it": 0.78, "ka": 0.688679, "la": 0.662768, "li": 0.6, "lv": 0.725, "mk": 0.54, "mt": 0.74, "nl": 0.58, "no": 0.695, "pl": 0.486865, "pt": 0.695, "ro": 0.625, "ru": 0.759076, "sk": 0.63, "sl": 0.715, "sq": 0.8025, "sr": 0.5625, "sv": 0.66, "tr": 0.55, "uk": 0.764706}, "lang_wp": {"ar": 0.8225, "az": 0.83375, "be": 0.87375, "bg": 0.870166, "bo": 0.85625, "ca": 0.85376, "cn": 0.7875, "cs": 0.85125, "da": 0.77125, "el": 0.79625, "en": 0.71125, "es": 0.876877, "et": 0.8175, "eu": 0.68593, "fa": 0.758333, "fi": 0.87125, "fr": 0.870419, "gl": 0.721667, "hu": 0.79625, "hv": 0.85625, "is": 0.8475, "it": 0.87875, "ka": 0.794025, "la": 0.798246, "li": 0.78875, "lv": 0.85625, "mk": 0.75125, "mt": 0.8475, "nl": 0.78, "no": 0.8075, "pl": 0.707531, "pt": 0.80625, "ro": 0.7825, "ru": 0.866337, "sk": 0.8, "sl": 0.85, "sq": 0.89375, "sr": 0.77375, "sv": 0.82125, "tr": 0.755, "uk": 0.874332}, "confusion": {"1": {"1": 0.7778, "2": 0.2149, "3": 0.0035, "4": 0.003, "5": 0.0003, "6": 0.0005}, "2": {"2": 0.6939, "1": 0.239, "3": 0.0258, "4": 0.0335, "5": 0.002, "6": 0.0058}, "5": {"5": 0.659, "4": 0.1832, "6": 0.1293, "2": 0.0205, "3": 0.008}, "6": {"5": 0.3482, "6": 0.5891, "3": 0.0008, "4": 0.0455, "2": 0.0154, "1": 0.0011}, "3": {"4": 0.376, "2": 0.3614, "5": 0.0535, "3": 0.1977, "1": 0.0081, "6": 0.0032}, "4": {"4": 0.49, "5": 0.2871, "3": 0.0871, "2": 0.1114, "6": 0.02, "1": 0.0043}}}, {"model": "google/gemini-3-flash-preview", "avg_exact": 0.577296, "avg_wp": 0.760411, "avg_bias": 0.0496, "total": 1753, "lang_exact": {"ar": 0.575, "az": 
0.55, "be": 0.625, "bg": 0.75, "bo": 0.575, "ca": 0.725, "cn": 0.566667, "cs": 0.525, "da": 0.475, "el": 0.575, "en": 0.4, "es": 0.825, "et": 0.625, "eu": 0.416667, "fa": 0.516667, "fi": 0.625, "fr": 0.675, "gl": 0.533333, "hu": 0.575, "hv": 0.7, "is": 0.55, "it": 0.725, "ka": 0.6, "la": 0.5, "li": 0.5, "lv": 0.6, "mk": 0.525, "mt": 0.7, "nl": 0.45, "no": 0.6, "pl": 0.366667, "pt": 0.525, "ro": 0.625, "ru": 0.69697, "sk": 0.525, "sl": 0.7, "sq": 0.675, "sr": 0.475, "sv": 0.6, "tr": 0.6, "uk": 0.6}, "lang_wp": {"ar": 0.7875, "az": 0.6875, "be": 0.8125, "bg": 0.8625, "bo": 0.775, "ca": 0.8375, "cn": 0.783333, "cs": 0.725, "da": 0.725, "el": 0.775, "en": 0.65, "es": 0.9125, "et": 0.8125, "eu": 0.608333, "fa": 0.725, "fi": 0.775, "fr": 0.8125, "gl": 0.7, "hu": 0.7875, "hv": 0.85, "is": 0.7125, "it": 0.85, "ka": 0.8, "la": 0.658333, "li": 0.725, "lv": 0.7875, "mk": 0.7125, "mt": 0.8, "nl": 0.7, "no": 0.775, "pl": 0.641667, "pt": 0.75, "ro": 0.75, "ru": 0.833333, "sk": 0.7625, "sl": 0.8375, "sq": 0.825, "sr": 0.7375, "sv": 0.8, "tr": 0.7625, "uk": 0.7875}, "confusion": {"2": {"2": 0.435, "1": 0.3325, "3": 0.135, "5": 0.03, "4": 0.06, "6": 0.0075}, "1": {"2": 0.1463, "1": 0.822, "4": 0.0073, "5": 0.0073, "3": 0.0171}, "5": {"5": 0.5268, "6": 0.3537, "4": 0.0976, "3": 0.0195, "2": 0.0024}, "6": {"6": 0.6336, "5": 0.3333, "4": 0.028, "3": 0.0025, "2": 0.0025}, "3": {"4": 0.3286, "3": 0.2286, "2": 0.2429, "5": 0.1571, "1": 0.0429}, "4": {"4": 0.2857, "5": 0.4286, "3": 0.1286, "6": 0.1, "2": 0.0571}}}, {"model": "openai/gpt-4o-mini", "avg_exact": 0.596461, "avg_wp": 0.757991, "avg_bias": -0.1427, "total": 1752, "lang_exact": {"ar": 0.625, "az": 0.5, "be": 0.725, "bg": 0.675, "bo": 0.675, "ca": 0.475, "cn": 0.7, "cs": 0.6, "da": 0.525, "el": 0.525, "en": 0.425, "es": 0.8, "et": 0.875, "eu": 0.35, "fa": 0.533333, "fi": 0.65, "fr": 0.575, "gl": 0.433333, "hu": 0.65, "hv": 0.7, "is": 0.6, "it": 0.725, "ka": 0.475, "la": 0.4, "li": 0.675, "lv": 0.725, "mk": 0.4, "mt": 0.6, "nl": 
0.475, "no": 0.675, "pl": 0.383333, "pt": 0.75, "ro": 0.5, "ru": 0.848485, "sk": 0.7, "sl": 0.7, "sq": 0.7, "sr": 0.475, "sv": 0.725, "tr": 0.65, "uk": 0.692308}, "lang_wp": {"ar": 0.7875, "az": 0.725, "be": 0.8625, "bg": 0.8125, "bo": 0.8375, "ca": 0.675, "cn": 0.841667, "cs": 0.7625, "da": 0.725, "el": 0.7375, "en": 0.5875, "es": 0.9, "et": 0.9375, "eu": 0.583333, "fa": 0.733333, "fi": 0.775, "fr": 0.7625, "gl": 0.666667, "hu": 0.8125, "hv": 0.825, "is": 0.7875, "it": 0.8375, "ka": 0.6625, "la": 0.566667, "li": 0.8125, "lv": 0.8625, "mk": 0.575, "mt": 0.7875, "nl": 0.7, "no": 0.7875, "pl": 0.566667, "pt": 0.8625, "ro": 0.7, "ru": 0.893939, "sk": 0.825, "sl": 0.8375, "sq": 0.825, "sr": 0.6875, "sv": 0.8375, "tr": 0.8, "uk": 0.833333}, "confusion": {"5": {"5": 0.6537, "4": 0.1439, "6": 0.1561, "2": 0.0146, "3": 0.0244, "1": 0.0073}, "2": {"3": 0.155, "1": 0.2625, "4": 0.0275, "2": 0.54, "5": 0.0125, "6": 0.0025}, "1": {"2": 0.1951, "1": 0.7488, "3": 0.0512, "4": 0.0049}, "6": {"5": 0.3316, "6": 0.5306, "4": 0.0561, "3": 0.0281, "2": 0.0306, "1": 0.023}, "3": {"4": 0.2286, "3": 0.3286, "2": 0.2429, "1": 0.1714, "5": 0.0286}, "4": {"4": 0.3286, "3": 0.1429, "5": 0.3286, "2": 0.1286, "6": 0.0286, "1": 0.0429}}}, {"model": "qwen/qwen3-235b-a22b-2507", "avg_exact": 0.496292, "avg_wp": 0.693953, "avg_bias": -0.4398, "total": 1753, "lang_exact": {"ar": 0.6, "az": 0.475, "be": 0.55, "bg": 0.65, "bo": 0.55, "ca": 0.525, "cn": 0.433333, "cs": 0.425, "da": 0.45, "el": 0.55, "en": 0.425, "es": 0.525, "et": 0.5, "eu": 0.416667, "fa": 0.616667, "fi": 0.55, "fr": 0.6, "gl": 0.433333, "hu": 0.65, "hv": 0.625, "is": 0.475, "it": 0.65, "ka": 0.325, "la": 0.433333, "li": 0.4, "lv": 0.575, "mk": 0.35, "mt": 0.475, "nl": 0.35, "no": 0.4, "pl": 0.316667, "pt": 0.575, "ro": 0.5, "ru": 0.454545, "sk": 0.45, "sl": 0.575, "sq": 0.6, "sr": 0.5, "sv": 0.475, "tr": 0.45, "uk": 0.625}, "lang_wp": {"ar": 0.775, "az": 0.7, "be": 0.75, "bg": 0.8, "bo": 0.75, "ca": 0.7, "cn": 0.616667, "cs": 
0.6375, "da": 0.6625, "el": 0.725, "en": 0.55, "es": 0.725, "et": 0.75, "eu": 0.608333, "fa": 0.783333, "fi": 0.725, "fr": 0.7625, "gl": 0.625, "hu": 0.8125, "hv": 0.8, "is": 0.7, "it": 0.8125, "ka": 0.6, "la": 0.558333, "li": 0.6625, "lv": 0.775, "mk": 0.575, "mt": 0.675, "nl": 0.6, "no": 0.65, "pl": 0.458333, "pt": 0.775, "ro": 0.7375, "ru": 0.712121, "sk": 0.7, "sl": 0.775, "sq": 0.7625, "sr": 0.725, "sv": 0.725, "tr": 0.675, "uk": 0.8}, "confusion": {"1": {"1": 0.9293, "2": 0.0439, "3": 0.0244, "4": 0.0024}, "2": {"4": 0.02, "3": 0.095, "1": 0.81, "6": 0.0025, "2": 0.0725}, "5": {"5": 0.4171, "4": 0.2585, "3": 0.061, "6": 0.2073, "1": 0.0463, "2": 0.0098}, "6": {"6": 0.6438, "4": 0.0662, "5": 0.2239, "3": 0.0382, "2": 0.0051, "1": 0.0229}, "3": {"2": 0.0857, "1": 0.5714, "3": 0.2143, "4": 0.1286}, "4": {"1": 0.2714, "4": 0.3, "2": 0.1143, "6": 0.0429, "3": 0.1714, "5": 0.1}}}, {"model": "deepseek/deepseek-v3.2", "avg_exact": 0.403879, "avg_wp": 0.629492, "avg_bias": -0.5031, "total": 1753, "lang_exact": {"ar": 0.375, "az": 0.325, "be": 0.475, "bg": 0.475, "bo": 0.475, "ca": 0.425, "cn": 0.533333, "cs": 0.5, "da": 0.275, "el": 0.25, "en": 0.525, "es": 0.425, "et": 0.425, "eu": 0.333333, "fa": 0.35, "fi": 0.375, "fr": 0.45, "gl": 0.483333, "hu": 0.4, "hv": 0.475, "is": 0.575, "it": 0.425, "ka": 0.35, "la": 0.3, "li": 0.375, "lv": 0.4, "mk": 0.325, "mt": 0.375, "nl": 0.325, "no": 0.4, "pl": 0.366667, "pt": 0.475, "ro": 0.25, "ru": 0.484848, "sk": 0.375, "sl": 0.6, "sq": 0.375, "sr": 0.3, "sv": 0.375, "tr": 0.375, "uk": 0.425}, "lang_wp": {"ar": 0.625, "az": 0.5625, "be": 0.725, "bg": 0.675, "bo": 0.7, "ca": 0.6375, "cn": 0.741667, "cs": 0.6875, "da": 0.5125, "el": 0.475, "en": 0.6875, "es": 0.7, "et": 0.7, "eu": 0.541667, "fa": 0.633333, "fi": 0.5875, "fr": 0.6875, "gl": 0.683333, "hu": 0.65, "hv": 0.7125, "is": 0.7625, "it": 0.6875, "ka": 0.575, "la": 0.5, "li": 0.5375, "lv": 0.6625, "mk": 0.55, "mt": 0.5625, "nl": 0.5625, "no": 0.625, "pl": 0.558333, "pt": 0.7, 
"ro": 0.4625, "ru": 0.712121, "sk": 0.625, "sl": 0.775, "sq": 0.6625, "sr": 0.6, "sv": 0.6, "tr": 0.625, "uk": 0.6125}, "confusion": {"1": {"1": 0.8829, "2": 0.0683, "3": 0.0195, "4": 0.022, "6": 0.0024, "5": 0.0049}, "2": {"1": 0.5925, "2": 0.205, "3": 0.11, "4": 0.0725, "6": 0.01, "5": 0.01}, "5": {"4": 0.5195, "6": 0.0317, "5": 0.3366, "3": 0.0829, "2": 0.0171, "1": 0.0122}, "6": {"4": 0.2316, "6": 0.1832, "5": 0.5242, "1": 0.0305, "3": 0.0254, "2": 0.0051}, "3": {"3": 0.2286, "4": 0.3286, "2": 0.1429, "1": 0.2571, "5": 0.0429}, "4": {"3": 0.1571, "4": 0.5429, "5": 0.0857, "1": 0.1714, "2": 0.0429}}}, {"model": "z-ai/glm-4-32b", "avg_exact": 0.432402, "avg_wp": 0.620936, "avg_bias": 0.0877, "total": 1753, "lang_exact": {"ar": 0.375, "az": 0.5, "be": 0.625, "bg": 0.375, "bo": 0.625, "ca": 0.35, "cn": 0.383333, "cs": 0.625, "da": 0.35, "el": 0.55, "en": 0.3, "es": 0.35, "et": 0.525, "eu": 0.3, "fa": 0.383333, "fi": 0.525, "fr": 0.525, "gl": 0.25, "hu": 0.5, "hv": 0.625, "is": 0.35, "it": 0.4, "ka": 0.5, "la": 0.433333, "li": 0.425, "lv": 0.525, "mk": 0.225, "mt": 0.35, "nl": 0.425, "no": 0.475, "pl": 0.35, "pt": 0.25, "ro": 0.3, "ru": 0.515152, "sk": 0.525, "sl": 0.475, "sq": 0.7, "sr": 0.375, "sv": 0.4, "tr": 0.425, "uk": 0.525}, "lang_wp": {"ar": 0.525, "az": 0.7, "be": 0.775, "bg": 0.5375, "bo": 0.7625, "ca": 0.5, "cn": 0.583333, "cs": 0.775, "da": 0.6, "el": 0.7375, "en": 0.45, "es": 0.575, "et": 0.725, "eu": 0.533333, "fa": 0.608333, "fi": 0.6375, "fr": 0.6625, "gl": 0.375, "hu": 0.65, "hv": 0.8125, "is": 0.5875, "it": 0.6, "ka": 0.65, "la": 0.558333, "li": 0.65, "lv": 0.7125, "mk": 0.4125, "mt": 0.5625, "nl": 0.675, "no": 0.675, "pl": 0.575, "pt": 0.4375, "ro": 0.525, "ru": 0.69697, "sk": 0.7375, "sl": 0.6875, "sq": 0.8375, "sr": 0.6, "sv": 0.625, "tr": 0.675, "uk": 0.7125}, "confusion": {"1": {"1": 0.7217, "3": 0.2389, "5": 0.0369, "4": 0.0025}, "2": {"3": 0.5682, "5": 0.0732, "1": 0.3207, "4": 0.0278, "6": 0.0101}, "5": {"5": 0.5479, "6": 0.1818, "3": 
0.1425, "4": 0.1057, "1": 0.0221}, "6": {"6": 0.553, "5": 0.3488, "3": 0.0646, "4": 0.0181, "1": 0.0155}, "3": {"3": 0.2429, "4": 0.1571, "5": 0.3857, "1": 0.1857, "6": 0.0286}, "4": {"4": 0.1618, "5": 0.4853, "3": 0.1912, "1": 0.1176, "6": 0.0441}}}, {"model": "speakleash/Bielik-11B-v3.0-Instruct", "avg_exact": 0.41462, "avg_wp": 0.601942, "avg_bias": -0.2942, "total": 1751, "lang_exact": {"ar": 0.025, "az": 0.525, "be": 0.2, "bg": 0.525, "bo": 0.675, "ca": 0.325, "cn": 0.271186, "cs": 0.5, "da": 0.425, "el": 0.325, "en": 0.5, "es": 0.475, "et": 0.55, "eu": 0.25, "fa": 0.366667, "fi": 0.425, "fr": 0.5, "gl": 0.4, "hu": 0.475, "hv": 0.525, "is": 0.225, "it": 0.475, "ka": 0.230769, "la": 0.3, "li": 0.45, "lv": 0.425, "mk": 0.275, "mt": 0.375, "nl": 0.45, "no": 0.475, "pl": 0.366667, "pt": 0.475, "ro": 0.425, "ru": 0.606061, "sk": 0.475, "sl": 0.55, "sq": 0.5, "sr": 0.45, "sv": 0.425, "tr": 0.45, "uk": 0.625}, "lang_wp": {"ar": 0.15, "az": 0.6875, "be": 0.275, "bg": 0.675, "bo": 0.7875, "ca": 0.6125, "cn": 0.40678, "cs": 0.6125, "da": 0.675, "el": 0.5625, "en": 0.65, "es": 0.7125, "et": 0.7375, "eu": 0.483333, "fa": 0.583333, "fi": 0.5375, "fr": 0.725, "gl": 0.608333, "hu": 0.6625, "hv": 0.675, "is": 0.3875, "it": 0.6875, "ka": 0.371795, "la": 0.483333, "li": 0.675, "lv": 0.675, "mk": 0.575, "mt": 0.575, "nl": 0.7125, "no": 0.6875, "pl": 0.541667, "pt": 0.6875, "ro": 0.65, "ru": 0.727273, "sk": 0.6125, "sl": 0.6625, "sq": 0.6625, "sr": 0.6, "sv": 0.7, "tr": 0.7, "uk": 0.75}, "confusion": {"1": {"3": 0.0442, "5": 0.026, "1": 0.8623, "2": 0.0338, "4": 0.0338}, "2": {"1": 0.5652, "3": 0.1576, "5": 0.0408, "2": 0.1087, "4": 0.1277}, "5": {"3": 0.0281, "5": 0.8235, "1": 0.0358, "4": 0.1125}, "6": {"3": 0.0278, "5": 0.8056, "4": 0.0741, "1": 0.0833, "6": 0.0093}, "3": {"1": 0.2687, "3": 0.0597, "5": 0.0746, "4": 0.5821, "2": 0.0149}, "4": {"3": 0.0909, "5": 0.3939, "1": 0.1061, "2": 0.0303, "4": 0.3788}}}, {"model": "google/gemini-2.0-flash-lite-001", "avg_exact": 
0.385054, "avg_wp": 0.586423, "avg_bias": -0.2835, "total": 1753, "lang_exact": {"ar": 0.325, "az": 0.325, "be": 0.55, "bg": 0.6, "bo": 0.4, "ca": 0.275, "cn": 0.45, "cs": 0.3, "da": 0.4, "el": 0.225, "en": 0.525, "es": 0.4, "et": 0.55, "eu": 0.333333, "fa": 0.516667, "fi": 0.4, "fr": 0.425, "gl": 0.383333, "hu": 0.45, "hv": 0.425, "is": 0.2, "it": 0.425, "ka": 0.3, "la": 0.216667, "li": 0.425, "lv": 0.475, "mk": 0.2, "mt": 0.425, "nl": 0.4, "no": 0.325, "pl": 0.366667, "pt": 0.375, "ro": 0.275, "ru": 0.606061, "sk": 0.425, "sl": 0.375, "sq": 0.425, "sr": 0.25, "sv": 0.425, "tr": 0.4, "uk": 0.275}, "lang_wp": {"ar": 0.525, "az": 0.6, "be": 0.75, "bg": 0.7625, "bo": 0.5875, "ca": 0.525, "cn": 0.608333, "cs": 0.5, "da": 0.55, "el": 0.5, "en": 0.7125, "es": 0.65, "et": 0.6875, "eu": 0.566667, "fa": 0.691667, "fi": 0.6375, "fr": 0.6125, "gl": 0.633333, "hu": 0.6375, "hv": 0.625, "is": 0.45, "it": 0.6125, "ka": 0.525, "la": 0.383333, "li": 0.575, "lv": 0.7, "mk": 0.4625, "mt": 0.65, "nl": 0.6125, "no": 0.4875, "pl": 0.55, "pt": 0.525, "ro": 0.4375, "ru": 0.712121, "sk": 0.6, "sl": 0.6125, "sq": 0.6125, "sr": 0.4875, "sv": 0.6375, "tr": 0.5875, "uk": 0.525}, "confusion": {"2": {"3": 0.3775, "1": 0.2725, "2": 0.28, "4": 0.0525, "6": 0.0175}, "1": {"3": 0.1098, "1": 0.7171, "2": 0.1585, "4": 0.0098, "6": 0.0049}, "5": {"4": 0.5293, "3": 0.1341, "5": 0.178, "6": 0.1561, "1": 0.0024}, "6": {"6": 0.3333, "4": 0.3868, "5": 0.1323, "3": 0.1349, "1": 0.0102, "2": 0.0025}, "3": {"6": 0.0571, "3": 0.5143, "4": 0.2143, "5": 0.0714, "1": 0.0571, "2": 0.0857}, "4": {"4": 0.4143, "6": 0.1714, "3": 0.3571, "2": 0.0286, "5": 0.0286}}}, {"model": "google/gemma-3-12b-it", "avg_exact": 0.337707, "avg_wp": 0.573873, "avg_bias": -0.0331, "total": 1753, "lang_exact": {"ar": 0.275, "az": 0.35, "be": 0.35, "bg": 0.475, "bo": 0.45, "ca": 0.225, "cn": 0.383333, "cs": 0.45, "da": 0.375, "el": 0.275, "en": 0.45, "es": 0.325, "et": 0.4, "eu": 0.233333, "fa": 0.25, "fi": 0.4, "fr": 0.425, "gl": 
0.166667, "hu": 0.35, "hv": 0.4, "is": 0.475, "it": 0.325, "ka": 0.3, "la": 0.3, "li": 0.25, "lv": 0.425, "mk": 0.275, "mt": 0.25, "nl": 0.45, "no": 0.375, "pl": 0.366667, "pt": 0.25, "ro": 0.25, "ru": 0.575758, "sk": 0.275, "sl": 0.35, "sq": 0.35, "sr": 0.325, "sv": 0.175, "tr": 0.325, "uk": 0.375}, "lang_wp": {"ar": 0.5, "az": 0.625, "be": 0.6125, "bg": 0.6625, "bo": 0.675, "ca": 0.4875, "cn": 0.583333, "cs": 0.625, "da": 0.5875, "el": 0.5125, "en": 0.5625, "es": 0.5875, "et": 0.625, "eu": 0.466667, "fa": 0.533333, "fi": 0.625, "fr": 0.675, "gl": 0.375, "hu": 0.6125, "hv": 0.6375, "is": 0.6875, "it": 0.5375, "ka": 0.475, "la": 0.525, "li": 0.5875, "lv": 0.6625, "mk": 0.55, "mt": 0.5375, "nl": 0.725, "no": 0.5625, "pl": 0.575, "pt": 0.525, "ro": 0.4875, "ru": 0.787879, "sk": 0.5625, "sl": 0.6, "sq": 0.5125, "sr": 0.5875, "sv": 0.5125, "tr": 0.6125, "uk": 0.575}, "confusion": {"1": {"2": 0.4366, "4": 0.0195, "1": 0.3195, "3": 0.2024, "6": 0.0122, "5": 0.0098}, "2": {"4": 0.0675, "3": 0.4425, "2": 0.24, "5": 0.045, "1": 0.1825, "6": 0.0225}, "5": {"5": 0.7171, "3": 0.0268, "4": 0.0854, "1": 0.1244, "6": 0.0439, "2": 0.0024}, "6": {"6": 0.1148, "5": 0.6862, "3": 0.0459, "4": 0.0995, "1": 0.0434, "2": 0.0102}, "3": {"3": 0.2429, "4": 0.3143, "5": 0.2, "1": 0.1857, "2": 0.0571}, "4": {"5": 0.6286, "4": 0.1286, "3": 0.1, "1": 0.1286, "6": 0.0143}}}, {"model": "mistralai/mistral-nemo", "avg_exact": 0.309184, "avg_wp": 0.499715, "avg_bias": -0.3506, "total": 1753, "lang_exact": {"ar": 0.325, "az": 0.45, "be": 0.475, "bg": 0.325, "bo": 0.375, "ca": 0.25, "cn": 0.383333, "cs": 0.425, "da": 0.375, "el": 0.3, "en": 0.25, "es": 0.35, "et": 0.225, "eu": 0.216667, "fa": 0.266667, "fi": 0.35, "fr": 0.275, "gl": 0.283333, "hu": 0.2, "hv": 0.425, "is": 0.3, "it": 0.2, "ka": 0.425, "la": 0.183333, "li": 0.325, "lv": 0.425, "mk": 0.375, "mt": 0.325, "nl": 0.35, "no": 0.375, "pl": 0.25, "pt": 0.35, "ro": 0.2, "ru": 0.212121, "sk": 0.375, "sl": 0.35, "sq": 0.3, "sr": 0.25, "sv": 0.175, 
"tr": 0.275, "uk": 0.25}, "lang_wp": {"ar": 0.5, "az": 0.6125, "be": 0.675, "bg": 0.4625, "bo": 0.5, "ca": 0.5125, "cn": 0.608333, "cs": 0.5125, "da": 0.5375, "el": 0.4625, "en": 0.5, "es": 0.575, "et": 0.4875, "eu": 0.441667, "fa": 0.5, "fi": 0.525, "fr": 0.5625, "gl": 0.475, "hu": 0.425, "hv": 0.5875, "is": 0.4625, "it": 0.4, "ka": 0.5875, "la": 0.333333, "li": 0.525, "lv": 0.6625, "mk": 0.5375, "mt": 0.425, "nl": 0.4875, "no": 0.5375, "pl": 0.466667, "pt": 0.525, "ro": 0.3, "ru": 0.484848, "sk": 0.6, "sl": 0.575, "sq": 0.425, "sr": 0.45, "sv": 0.4375, "tr": 0.45, "uk": 0.4375}, "confusion": {"2": {"1": 0.2982, "5": 0.1028, "2": 0.3759, "4": 0.0602, "3": 0.1604, "6": 0.0025}, "1": {"2": 0.2689, "1": 0.4572, "5": 0.11, "3": 0.1149, "4": 0.0342, "6": 0.0147}, "5": {"4": 0.2836, "5": 0.4132, "2": 0.0856, "1": 0.066, "3": 0.1467, "6": 0.0049}, "6": {"5": 0.5109, "2": 0.0656, "1": 0.1148, "4": 0.1967, "6": 0.0137, "3": 0.0984}, "3": {"2": 0.2429, "4": 0.2143, "3": 0.1857, "5": 0.2714, "1": 0.0857}, "4": {"5": 0.4143, "2": 0.1429, "3": 0.1286, "4": 0.2571, "1": 0.0571}}}, {"model": "z-ai/glm-4.5-air", "avg_exact": 0.36203, "avg_wp": 0.498575, "avg_bias": -0.5908, "total": 1754, "lang_exact": {"ar": 0.317073, "az": 0.4, "be": 0.525, "bg": 0.475, "bo": 0.45, "ca": 0.375, "cn": 0.4, "cs": 0.35, "da": 0.325, "el": 0.325, "en": 0.275, "es": 0.35, "et": 0.6, "eu": 0.3, "fa": 0.283333, "fi": 0.45, "fr": 0.3, "gl": 0.383333, "hu": 0.375, "hv": 0.175, "is": 0.25, "it": 0.125, "ka": 0.35, "la": 0.15, "li": 0.175, "lv": 0.1, "mk": 0.2, "mt": 0.275, "nl": 0.225, "no": 0.375, "pl": 0.35, "pt": 0.525, "ro": 0.4, "ru": 0.484848, "sk": 0.55, "sl": 0.625, "sq": 0.65, "sr": 0.325, "sv": 0.5, "tr": 0.45, "uk": 0.5}, "lang_wp": {"ar": 0.52439, "az": 0.55, "be": 0.6375, "bg": 0.6375, "bo": 0.575, "ca": 0.475, "cn": 0.6, "cs": 0.5, "da": 0.4875, "el": 0.5125, "en": 0.4375, "es": 0.5, "et": 0.75, "eu": 0.433333, "fa": 0.525, "fi": 0.5125, "fr": 0.475, "gl": 0.541667, "hu": 0.425, "hv": 
0.1875, "is": 0.3625, "it": 0.15, "ka": 0.4125, "la": 0.208333, "li": 0.25, "lv": 0.2, "mk": 0.3375, "mt": 0.4375, "nl": 0.375, "no": 0.5125, "pl": 0.5, "pt": 0.6625, "ro": 0.5375, "ru": 0.621212, "sk": 0.7, "sl": 0.775, "sq": 0.75, "sr": 0.55, "sv": 0.625, "tr": 0.6375, "uk": 0.6625}, "confusion": {"1": {"1": 0.7601, "3": 0.0665, "2": 0.1676, "6": 0.0029, "5": 0.0029}, "2": {"1": 0.3848, "3": 0.2182, "2": 0.3909, "6": 0.003, "5": 0.003}, "5": {"4": 0.0496, "6": 0.0496, "5": 0.3469, "3": 0.5364, "2": 0.0117, "1": 0.0058}, "6": {"6": 0.2669, "5": 0.4141, "3": 0.2638, "4": 0.0337, "2": 0.0031, "1": 0.0184}, "3": {"3": 0.6, "2": 0.1167, "1": 0.2833}, "4": {"3": 0.661, "5": 0.1186, "2": 0.1017, "6": 0.0339, "1": 0.0678, "4": 0.0169}}}, {"model": "meta-llama/llama-4-scout", "avg_exact": 0.380137, "avg_wp": 0.497717, "avg_bias": 1.0869, "total": 1752, "lang_exact": {"ar": 0.325, "az": 0.475, "be": 0.3, "bg": 0.375, "bo": 0.425, "ca": 0.3, "cn": 0.25, "cs": 0.525, "da": 0.375, "el": 0.275, "en": 0.225, "es": 0.475, "et": 0.425, "eu": 0.254237, "fa": 0.4, "fi": 0.5, "fr": 0.375, "gl": 0.2, "hu": 0.45, "hv": 0.45, "is": 0.55, "it": 0.425, "ka": 0.3, "la": 0.316667, "li": 0.35, "lv": 0.45, "mk": 0.275, "mt": 0.35, "nl": 0.325, "no": 0.475, "pl": 0.35, "pt": 0.5, "ro": 0.325, "ru": 0.545455, "sk": 0.5, "sl": 0.4, "sq": 0.575, "sr": 0.275, "sv": 0.425, "tr": 0.425, "uk": 0.35}, "lang_wp": {"ar": 0.4625, "az": 0.5625, "be": 0.4, "bg": 0.45, "bo": 0.5375, "ca": 0.3875, "cn": 0.441667, "cs": 0.6, "da": 0.4875, "el": 0.4, "en": 0.325, "es": 0.5875, "et": 0.5875, "eu": 0.389831, "fa": 0.55, "fi": 0.625, "fr": 0.4375, "gl": 0.433333, "hu": 0.5375, "hv": 0.525, "is": 0.65, "it": 0.4625, "ka": 0.4125, "la": 0.425, "li": 0.45, "lv": 0.575, "mk": 0.425, "mt": 0.5125, "nl": 0.475, "no": 0.6125, "pl": 0.441667, "pt": 0.6, "ro": 0.475, "ru": 0.575758, "sk": 0.55, "sl": 0.525, "sq": 0.6625, "sr": 0.4125, "sv": 0.5375, "tr": 0.6, "uk": 0.4625}, "confusion": {"5": {"5": 0.6634, "6": 0.2317, 
"4": 0.0537, "1": 0.0488, "2": 0.0024}, "2": {"4": 0.1253, "5": 0.5965, "2": 0.0251, "6": 0.0877, "1": 0.1404, "3": 0.0251}, "1": {"1": 0.357, "6": 0.1296, "4": 0.0733, "5": 0.3741, "2": 0.0416, "3": 0.0244}, "6": {"5": 0.3766, "6": 0.5903, "4": 0.0305, "1": 0.0025}, "3": {"4": 0.2, "5": 0.6571, "6": 0.0571, "1": 0.0714, "3": 0.0143}, "4": {"5": 0.7246, "6": 0.1739, "4": 0.0725, "1": 0.0145, "2": 0.0145}}}, {"model": "meta-llama/llama-3.3-70b-instruct", "avg_exact": 0.366589, "avg_wp": 0.49652, "avg_bias": 0.846, "total": 1724, "lang_exact": {"ar": 0.384615, "az": 0.394737, "be": 0.475, "bg": 0.4, "bo": 0.45, "ca": 0.25, "cn": 0.305085, "cs": 0.525, "da": 0.358974, "el": 0.447368, "en": 0.25641, "es": 0.324324, "et": 0.512821, "eu": 0.293103, "fa": 0.310345, "fi": 0.538462, "fr": 0.384615, "gl": 0.183333, "hu": 0.45, "hv": 0.4, "is": 0.45, "it": 0.384615, "ka": 0.35, "la": 0.310345, "li": 0.25641, "lv": 0.425, "mk": 0.225, "mt": 0.275, "nl": 0.425, "no": 0.375, "pl": 0.4, "pt": 0.358974, "ro": 0.131579, "ru": 0.515152, "sk": 0.425, "sl": 0.435897, "sq": 0.282051, "sr": 0.333333, "sv": 0.410256, "tr": 0.475, "uk": 0.35}, "lang_wp": {"ar": 0.5, "az": 0.513158, "be": 0.5625, "bg": 0.5375, "bo": 0.5375, "ca": 0.3625, "cn": 0.466102, "cs": 0.5875, "da": 0.5, "el": 0.5, "en": 0.435897, "es": 0.405405, "et": 0.679487, "eu": 0.431034, "fa": 0.465517, "fi": 0.653846, "fr": 0.448718, "gl": 0.383333, "hu": 0.5375, "hv": 0.5125, "is": 0.625, "it": 0.512821, "ka": 0.4625, "la": 0.431034, "li": 0.423077, "lv": 0.6, "mk": 0.3625, "mt": 0.3875, "nl": 0.5875, "no": 0.475, "pl": 0.558333, "pt": 0.474359, "ro": 0.315789, "ru": 0.530303, "sk": 0.6375, "sl": 0.551282, "sq": 0.487179, "sr": 0.448718, "sv": 0.512821, "tr": 0.625, "uk": 0.4375}, "confusion": {"1": {"5": 0.041, "4": 0.4641, "1": 0.3026, "3": 0.0436, "2": 0.1462, "6": 0.0026}, "2": {"4": 0.6972, "5": 0.1349, "2": 0.0636, "6": 0.0229, "3": 0.0433, "1": 0.0382}, "5": {"4": 0.06, "6": 0.2025, "5": 0.7325, "1": 0.005}, "6": 
{"5": 0.5026, "6": 0.4689, "4": 0.0259, "1": 0.0026}, "3": {"5": 0.5397, "4": 0.3016, "6": 0.0952, "1": 0.0476, "2": 0.0159}, "4": {"5": 0.5797, "6": 0.1884, "4": 0.2174, "1": 0.0145}}}, {"model": "openai/gpt-4.1-nano", "avg_exact": 0.293212, "avg_wp": 0.494295, "avg_bias": -0.6606, "total": 1753, "lang_exact": {"ar": 0.25, "az": 0.275, "be": 0.25, "bg": 0.35, "bo": 0.325, "ca": 0.125, "cn": 0.4, "cs": 0.35, "da": 0.375, "el": 0.225, "en": 0.1, "es": 0.275, "et": 0.45, "eu": 0.2, "fa": 0.366667, "fi": 0.5, "fr": 0.325, "gl": 0.3, "hu": 0.25, "hv": 0.35, "is": 0.3, "it": 0.3, "ka": 0.125, "la": 0.2, "li": 0.275, "lv": 0.425, "mk": 0.125, "mt": 0.25, "nl": 0.3, "no": 0.275, "pl": 0.3, "pt": 0.225, "ro": 0.25, "ru": 0.393939, "sk": 0.275, "sl": 0.3, "sq": 0.325, "sr": 0.325, "sv": 0.375, "tr": 0.325, "uk": 0.325}, "lang_wp": {"ar": 0.5, "az": 0.5125, "be": 0.4875, "bg": 0.55, "bo": 0.525, "ca": 0.4, "cn": 0.616667, "cs": 0.55, "da": 0.425, "el": 0.475, "en": 0.4125, "es": 0.4625, "et": 0.6125, "eu": 0.391667, "fa": 0.6, "fi": 0.6, "fr": 0.5125, "gl": 0.5, "hu": 0.475, "hv": 0.5375, "is": 0.55, "it": 0.5125, "ka": 0.3125, "la": 0.316667, "li": 0.5375, "lv": 0.6125, "mk": 0.3125, "mt": 0.3875, "nl": 0.4125, "no": 0.425, "pl": 0.541667, "pt": 0.425, "ro": 0.4625, "ru": 0.651515, "sk": 0.4625, "sl": 0.575, "sq": 0.5625, "sr": 0.5375, "sv": 0.5, "tr": 0.4875, "uk": 0.5625}, "confusion": {"1": {"2": 0.6683, "4": 0.0244, "1": 0.2659, "3": 0.0415}, "2": {"2": 0.7325, "4": 0.035, "1": 0.0575, "3": 0.1675, "5": 0.0075}, "5": {"4": 0.5927, "2": 0.0951, "5": 0.0439, "3": 0.2463, "6": 0.0195, "1": 0.0024}, "6": {"6": 0.1018, "5": 0.0763, "4": 0.4148, "2": 0.2188, "3": 0.1705, "1": 0.0178}, "3": {"3": 0.4, "2": 0.3429, "4": 0.1571, "1": 0.0857, "5": 0.0143}, "4": {"4": 0.3714, "3": 0.3286, "2": 0.2571, "5": 0.0286, "1": 0.0143}}}, {"model": "google/gemma-3-27b-it", "avg_exact": 0.252139, "avg_wp": 0.488591, "avg_bias": 0.3009, "total": 1753, "lang_exact": {"ar": 0.3, "az": 0.15, 
"be": 0.225, "bg": 0.3, "bo": 0.325, "ca": 0.175, "cn": 0.333333, "cs": 0.325, "da": 0.1, "el": 0.15, "en": 0.5, "es": 0.3, "et": 0.225, "eu": 0.233333, "fa": 0.266667, "fi": 0.225, "fr": 0.2, "gl": 0.25, "hu": 0.25, "hv": 0.25, "is": 0.375, "it": 0.225, "ka": 0.175, "la": 0.366667, "li": 0.1, "lv": 0.325, "mk": 0.15, "mt": 0.225, "nl": 0.325, "no": 0.175, "pl": 0.416667, "pt": 0.25, "ro": 0.05, "ru": 0.454545, "sk": 0.2, "sl": 0.275, "sq": 0.25, "sr": 0.1, "sv": 0.025, "tr": 0.325, "uk": 0.325}, "lang_wp": {"ar": 0.4875, "az": 0.4, "be": 0.4625, "bg": 0.4875, "bo": 0.5875, "ca": 0.4, "cn": 0.575, "cs": 0.4625, "da": 0.375, "el": 0.325, "en": 0.65, "es": 0.5125, "et": 0.5125, "eu": 0.475, "fa": 0.55, "fi": 0.475, "fr": 0.475, "gl": 0.466667, "hu": 0.55, "hv": 0.5, "is": 0.5625, "it": 0.4875, "ka": 0.375, "la": 0.525, "li": 0.4125, "lv": 0.5875, "mk": 0.275, "mt": 0.5125, "nl": 0.5875, "no": 0.3875, "pl": 0.591667, "pt": 0.4625, "ro": 0.35, "ru": 0.666667, "sk": 0.5375, "sl": 0.5125, "sq": 0.475, "sr": 0.4125, "sv": 0.3625, "tr": 0.5625, "uk": 0.5625}, "confusion": {"1": {"3": 0.4415, "1": 0.1951, "4": 0.0976, "2": 0.2537, "5": 0.0049, "6": 0.0073}, "2": {"4": 0.3175, "3": 0.5875, "2": 0.0675, "5": 0.0175, "1": 0.01}, "5": {"4": 0.3716, "5": 0.6186, "3": 0.0073, "6": 0.0024}, "6": {"4": 0.2519, "5": 0.6889, "3": 0.0206, "6": 0.036, "1": 0.0026}, "3": {"4": 0.6429, "5": 0.0714, "3": 0.2571, "1": 0.0286}, "4": {"4": 0.7143, "5": 0.2143, "3": 0.0714}}}, {"model": "qwen/qwen-2.5-7b-instruct", "avg_exact": 0.26526, "avg_wp": 0.484598, "avg_bias": -0.2856, "total": 1753, "lang_exact": {"ar": 0.2, "az": 0.15, "be": 0.25, "bg": 0.35, "bo": 0.4, "ca": 0.225, "cn": 0.316667, "cs": 0.3, "da": 0.325, "el": 0.2, "en": 0.4, "es": 0.325, "et": 0.25, "eu": 0.216667, "fa": 0.3, "fi": 0.275, "fr": 0.35, "gl": 0.366667, "hu": 0.225, "hv": 0.3, "is": 0.325, "it": 0.3, "ka": 0.125, "la": 0.183333, "li": 0.2, "lv": 0.35, "mk": 0.15, "mt": 0.225, "nl": 0.275, "no": 0.3, "pl": 0.216667, 
"pt": 0.25, "ro": 0.2, "ru": 0.393939, "sk": 0.325, "sl": 0.275, "sq": 0.1, "sr": 0.05, "sv": 0.35, "tr": 0.3, "uk": 0.275}, "lang_wp": {"ar": 0.3625, "az": 0.4, "be": 0.5125, "bg": 0.575, "bo": 0.625, "ca": 0.425, "cn": 0.525, "cs": 0.4875, "da": 0.525, "el": 0.425, "en": 0.6125, "es": 0.5625, "et": 0.5, "eu": 0.425, "fa": 0.466667, "fi": 0.5125, "fr": 0.5375, "gl": 0.566667, "hu": 0.475, "hv": 0.525, "is": 0.5125, "it": 0.5, "ka": 0.3125, "la": 0.383333, "li": 0.3875, "lv": 0.65, "mk": 0.3375, "mt": 0.4125, "nl": 0.525, "no": 0.5125, "pl": 0.425, "pt": 0.475, "ro": 0.4125, "ru": 0.621212, "sk": 0.5625, "sl": 0.5, "sq": 0.35, "sr": 0.35, "sv": 0.6, "tr": 0.5625, "uk": 0.5125}, "confusion": {"1": {"3": 0.2073, "1": 0.6415, "4": 0.0854, "2": 0.061, "5": 0.0049}, "2": {"4": 0.1859, "3": 0.3593, "1": 0.397, "2": 0.0427, "5": 0.0126, "6": 0.0025}, "5": {"4": 0.622, "6": 0.0732, "5": 0.2195, "3": 0.0683, "1": 0.0146, "2": 0.0024}, "6": {"4": 0.5725, "6": 0.0992, "3": 0.056, "5": 0.2595, "1": 0.0051, "2": 0.0076}, "3": {"3": 0.2286, "4": 0.4571, "1": 0.1857, "2": 0.0429, "5": 0.0714, "6": 0.0143}, "4": {"2": 0.0429, "4": 0.5714, "3": 0.2143, "5": 0.0857, "6": 0.0143, "1": 0.0714}}}, {"model": "meta-llama/llama-4-maverick", "avg_exact": 0.268291, "avg_wp": 0.47344, "avg_bias": -0.0631, "total": 17112, "lang_exact": {"ar": 0.27, "az": 0.235, "be": 0.2225, "bg": 0.273481, "bo": 0.285, "ca": 0.259053, "cn": 0.388333, "cs": 0.285, "da": 0.225, "el": 0.26, "en": 0.28, "es": 0.267267, "et": 0.315, "eu": 0.221106, "fa": 0.278333, "fi": 0.2425, "fr": 0.225131, "gl": 0.271667, "hu": 0.24, "hv": 0.31, "is": 0.41, "it": 0.2775, "ka": 0.113208, "la": 0.397661, "li": 0.2375, "lv": 0.2725, "mk": 0.19, "mt": 0.2125, "nl": 0.3575, "no": 0.185, "pl": 0.355517, "pt": 0.235, "ro": 0.1525, "ru": 0.330033, "sk": 0.2025, "sl": 0.2675, "sq": 0.2825, "sr": 0.2, "sv": 0.185, "tr": 0.395, "uk": 0.248663}, "lang_wp": {"ar": 0.435, "az": 0.38125, "be": 0.475, "bg": 0.476519, "bo": 0.54, "ca": 
0.415042, "cn": 0.620833, "cs": 0.52125, "da": 0.37125, "el": 0.4625, "en": 0.4425, "es": 0.462462, "et": 0.5675, "eu": 0.403685, "fa": 0.490833, "fi": 0.5075, "fr": 0.454188, "gl": 0.45, "hu": 0.42875, "hv": 0.53875, "is": 0.57625, "it": 0.48875, "ka": 0.278302, "la": 0.562378, "li": 0.43375, "lv": 0.485, "mk": 0.3675, "mt": 0.40125, "nl": 0.59, "no": 0.38125, "pl": 0.573555, "pt": 0.425, "ro": 0.38625, "ru": 0.617162, "sk": 0.42375, "sl": 0.48125, "sq": 0.56625, "sr": 0.3875, "sv": 0.3425, "tr": 0.60875, "uk": 0.471925}, "confusion": {"1": {"4": 0.2138, "1": 0.6218, "3": 0.0839, "2": 0.0678, "5": 0.0101, "6": 0.0026}, "2": {"4": 0.477, "3": 0.1474, "2": 0.0717, "1": 0.2827, "5": 0.0186, "6": 0.0026}, "5": {"4": 0.6117, "5": 0.2445, "6": 0.038, "1": 0.0987, "3": 0.0065, "2": 0.0005}, "6": {"5": 0.5201, "4": 0.3008, "6": 0.1339, "1": 0.0377, "3": 0.005, "2": 0.0025}, "3": {"3": 0.0877, "4": 0.7083, "5": 0.1298, "2": 0.0253, "1": 0.0455, "6": 0.0034}, "4": {"4": 0.6123, "5": 0.2996, "3": 0.0206, "6": 0.0206, "2": 0.0132, "1": 0.0338}}}, {"model": "google/gemma-3-4b-it", "avg_exact": 0.222031, "avg_wp": 0.460126, "avg_bias": -0.1819, "total": 1743, "lang_exact": {"ar": 0.1, "az": 0.15, "be": 0.225, "bg": 0.2, "bo": 0.125, "ca": 0.051282, "cn": 0.298246, "cs": 0.394737, "da": 0.25, "el": 0.175, "en": 0.425, "es": 0.225, "et": 0.375, "eu": 0.216667, "fa": 0.216667, "fi": 0.25, "fr": 0.125, "gl": 0.25, "hu": 0.125, "hv": 0.225, "is": 0.15, "it": 0.275, "ka": 0.1, "la": 0.166667, "li": 0.175, "lv": 0.2, "mk": 0.15, "mt": 0.025, "nl": 0.2, "no": 0.25, "pl": 0.316667, "pt": 0.461538, "ro": 0.125, "ru": 0.272727, "sk": 0.263158, "sl": 0.225, "sq": 0.358974, "sr": 0.225, "sv": 0.25, "tr": 0.2, "uk": 0.275}, "lang_wp": {"ar": 0.425, "az": 0.5, "be": 0.5, "bg": 0.4125, "bo": 0.45, "ca": 0.294872, "cn": 0.54386, "cs": 0.539474, "da": 0.45, "el": 0.375, "en": 0.6375, "es": 0.425, "et": 0.5875, "eu": 0.441667, "fa": 0.45, "fi": 0.525, "fr": 0.35, "gl": 0.541667, "hu": 0.4, "hv": 
0.475, "is": 0.375, "it": 0.4875, "ka": 0.325, "la": 0.35, "li": 0.4875, "lv": 0.525, "mk": 0.3625, "mt": 0.325, "nl": 0.4375, "no": 0.475, "pl": 0.516667, "pt": 0.615385, "ro": 0.3375, "ru": 0.545455, "sk": 0.526316, "sl": 0.425, "sq": 0.538462, "sr": 0.4875, "sv": 0.5, "tr": 0.4375, "uk": 0.45}, "confusion": {"1": {"2": 0.5575, "3": 0.2812, "1": 0.1174, "6": 0.0147, "4": 0.022, "5": 0.0073}, "2": {"5": 0.0126, "4": 0.0578, "6": 0.0075, "3": 0.6131, "2": 0.2864, "1": 0.0226}, "5": {"4": 0.3985, "6": 0.0196, "5": 0.3496, "3": 0.2274, "2": 0.0049}, "6": {"5": 0.2828, "4": 0.2699, "3": 0.329, "6": 0.0617, "2": 0.0206, "1": 0.036}, "3": {"3": 0.6029, "5": 0.1176, "4": 0.1471, "2": 0.1029, "1": 0.0294}, "4": {"4": 0.2429, "3": 0.4286, "5": 0.3, "2": 0.0143, "1": 0.0143}}}, {"model": "mistralai/mixtral-8x7b-instruct", "avg_exact": 0.245143, "avg_wp": 0.453429, "avg_bias": 0.4636, "total": 1750, "lang_exact": {"ar": 0.175, "az": 0.2, "be": 0.15, "bg": 0.225, "bo": 0.25, "ca": 0.175, "cn": 0.254237, "cs": 0.4, "da": 0.25, "el": 0.2, "en": 0.3, "es": 0.3, "et": 0.275, "eu": 0.25, "fa": 0.183333, "fi": 0.275, "fr": 0.25, "gl": 0.216667, "hu": 0.25, "hv": 0.225, "is": 0.15, "it": 0.225, "ka": 0.153846, "la": 0.283333, "li": 0.2, "lv": 0.25, "mk": 0.225, "mt": 0.225, "nl": 0.3, "no": 0.325, "pl": 0.283333, "pt": 0.225, "ro": 0.3, "ru": 0.34375, "sk": 0.225, "sl": 0.325, "sq": 0.2, "sr": 0.25, "sv": 0.225, "tr": 0.275, "uk": 0.275}, "lang_wp": {"ar": 0.2875, "az": 0.3875, "be": 0.45, "bg": 0.4375, "bo": 0.425, "ca": 0.3375, "cn": 0.466102, "cs": 0.5625, "da": 0.5125, "el": 0.4, "en": 0.5125, "es": 0.5125, "et": 0.4875, "eu": 0.408333, "fa": 0.408333, "fi": 0.525, "fr": 0.4125, "gl": 0.408333, "hu": 0.4875, "hv": 0.475, "is": 0.4, "it": 0.4125, "ka": 0.269231, "la": 0.466667, "li": 0.4375, "lv": 0.5, "mk": 0.3875, "mt": 0.3625, "nl": 0.5625, "no": 0.55, "pl": 0.45, "pt": 0.475, "ro": 0.525, "ru": 0.53125, "sk": 0.4625, "sl": 0.5, "sq": 0.4, "sr": 0.475, "sv": 0.55, "tr": 
0.4875, "uk": 0.55}, "confusion": {"1": {"4": 0.1744, "3": 0.398, "2": 0.2875, "5": 0.0491, "1": 0.0909}, "2": {"4": 0.3501, "5": 0.1511, "1": 0.063, "3": 0.3753, "2": 0.0605}, "5": {"5": 0.8, "3": 0.0366, "4": 0.1415, "1": 0.0122, "6": 0.0098}, "6": {"5": 0.7621, "3": 0.0588, "4": 0.1228, "2": 0.0128, "6": 0.0281, "1": 0.0153}, "3": {"3": 0.1143, "5": 0.3714, "4": 0.4429, "1": 0.0429, "2": 0.0286}, "4": {"5": 0.5857, "4": 0.3, "6": 0.0286, "1": 0.0286, "3": 0.0571}}}, {"model": "mistralai/mistral-small-3.2-24b-instruct", "avg_exact": 0.25029, "avg_wp": 0.450929, "avg_bias": 0.6868, "total": 1722, "lang_exact": {"ar": 0.2, "az": 0.25, "be": 0.275, "bg": 0.225, "bo": 0.275, "ca": 0.225, "cn": 0.316667, "cs": 0.225, "da": 0.35, "el": 0.25, "en": 0.3, "es": 0.3, "et": 0.15, "eu": 0.183333, "fa": 0.333333, "fi": 0.275, "fr": 0.25, "gl": 0.166667, "hu": 0.275, "hv": 0.275, "is": 0.375, "it": 0.275, "ka": 0.2, "la": 0.4, "li": 0.205128, "lv": 0.325, "mk": 0.2, "mt": 0.1, "nl": 0.225, "no": 0.25, "pl": 0.3, "pt": 0.25, "ro": 0.222222, "ru": 0.3, "sk": 0.142857, "sl": 0.193548, "sq": 0.157895, "sr": 0.2, "sv": 0.3, "tr": 0.175, "uk": 0.225}, "lang_wp": {"ar": 0.4125, "az": 0.4375, "be": 0.4875, "bg": 0.425, "bo": 0.45, "ca": 0.425, "cn": 0.55, "cs": 0.3875, "da": 0.55, "el": 0.3625, "en": 0.4125, "es": 0.425, "et": 0.4, "eu": 0.458333, "fa": 0.575, "fi": 0.3875, "fr": 0.425, "gl": 0.408333, "hu": 0.4875, "hv": 0.5125, "is": 0.6, "it": 0.4125, "ka": 0.375, "la": 0.566667, "li": 0.423077, "lv": 0.575, "mk": 0.3625, "mt": 0.35, "nl": 0.4125, "no": 0.375, "pl": 0.483333, "pt": 0.4, "ro": 0.375, "ru": 0.416667, "sk": 0.410714, "sl": 0.467742, "sq": 0.513158, "sr": 0.4625, "sv": 0.4375, "tr": 0.5125, "uk": 0.3875}, "confusion": {"1": {"3": 0.5664, "4": 0.1228, "2": 0.0902, "1": 0.0902, "5": 0.1303}, "2": {"3": 0.4444, "4": 0.2879, "5": 0.2348, "2": 0.0126, "1": 0.0202}, "5": {"4": 0.1156, "5": 0.8668, "1": 0.005, "6": 0.0075, "3": 0.005}, "6": {"5": 0.8892, "4": 0.0722, "6": 
0.0335, "3": 0.0052}, "3": {"4": 0.4143, "5": 0.4, "3": 0.1714, "6": 0.0143}, "4": {"5": 0.5857, "4": 0.2857, "3": 0.1, "6": 0.0286}}}, {"model": "mistralai/mistral-7b-instruct", "avg_exact": 0.243871, "avg_wp": 0.418065, "avg_bias": 0.3484, "total": 1550, "lang_exact": {"ar": 0.058824, "az": 0.27027, "be": 0.382353, "bg": 0.243243, "bo": 0.314286, "ca": 0.285714, "cn": 0.339286, "cs": 0.428571, "da": 0.28, "el": 0.194444, "en": 0.125, "es": 0.241379, "et": 0.482759, "eu": 0.226415, "fa": 0.355932, "fi": 0.269231, "fr": 0.184211, "gl": 0.206897, "hu": 0.138889, "hv": 0.361111, "is": 0.166667, "it": 0.28125, "ka": 0.277778, "la": 0.241379, "li": 0.171429, "lv": 0.142857, "mk": 0.205128, "mt": 0.162162, "nl": 0.285714, "no": 0.205128, "pl": 0.272727, "pt": 0.189189, "ro": 0.138889, "ru": 0.354839, "sk": 0.236842, "sl": 0.289474, "sq": 0.27027, "sr": 0.282051, "sv": 0.210526, "tr": 0.102564, "uk": 0.175}, "lang_wp": {"ar": 0.191176, "az": 0.445946, "be": 0.558824, "bg": 0.432432, "bo": 0.542857, "ca": 0.457143, "cn": 0.508929, "cs": 0.52381, "da": 0.4, "el": 0.388889, "en": 0.265625, "es": 0.396552, "et": 0.62069, "eu": 0.349057, "fa": 0.542373, "fi": 0.365385, "fr": 0.381579, "gl": 0.387931, "hu": 0.319444, "hv": 0.513889, "is": 0.416667, "it": 0.5, "ka": 0.416667, "la": 0.431034, "li": 0.285714, "lv": 0.328571, "mk": 0.397436, "mt": 0.337838, "nl": 0.5, "no": 0.384615, "pl": 0.445455, "pt": 0.378378, "ro": 0.333333, "ru": 0.467742, "sk": 0.434211, "sl": 0.394737, "sq": 0.486486, "sr": 0.487179, "sv": 0.421053, "tr": 0.320513, "uk": 0.35}, "confusion": {"1": {"6": 0.0292, "3": 0.414, "5": 0.0962, "4": 0.1224, "2": 0.1341, "1": 0.2041}, "2": {"5": 0.2367, "3": 0.3373, "6": 0.0355, "2": 0.071, "4": 0.2219, "1": 0.0976}, "5": {"6": 0.0875, "4": 0.1545, "1": 0.0875, "5": 0.6093, "3": 0.0554, "2": 0.0058}, "6": {"5": 0.5989, "1": 0.0917, "6": 0.1519, "3": 0.0544, "4": 0.0974, "2": 0.0057}, "3": {"5": 0.4308, "2": 0.1077, "3": 0.2, "1": 0.0923, "4": 0.1385, "6": 0.0308}, 
"4": {"5": 0.5152, "4": 0.1364, "1": 0.1364, "6": 0.0758, "3": 0.0758, "2": 0.0606}}}, {"model": "mistralai/mistral-small-24b-instruct-2501", "avg_exact": 0.179897, "avg_wp": 0.363221, "avg_bias": -0.8472, "total": 1751, "lang_exact": {"ar": 0.05, "az": 0.25, "be": 0.075, "bg": 0.2, "bo": 0.25, "ca": 0.25, "cn": 0.254237, "cs": 0.15, "da": 0.225, "el": 0.2, "en": 0.25, "es": 0.25, "et": 0.25, "eu": 0.2, "fa": 0.2, "fi": 0.225, "fr": 0.225, "gl": 0.183333, "hu": 0.125, "hv": 0.175, "is": 0.15, "it": 0.25, "ka": 0.025, "la": 0.35, "li": 0.05, "lv": 0.125, "mk": 0.225, "mt": 0.25, "nl": 0.175, "no": 0.225, "pl": 0.233333, "pt": 0.225, "ro": 0.1, "ru": 0.0, "sk": 0.15, "sl": 0.225, "sq": 0.025, "sr": 0.0, "sv": 0.25, "tr": 0.075, "uk": 0.075}, "lang_wp": {"ar": 0.225, "az": 0.45, "be": 0.3125, "bg": 0.4375, "bo": 0.3875, "ca": 0.3625, "cn": 0.457627, "cs": 0.3375, "da": 0.375, "el": 0.35, "en": 0.4125, "es": 0.4125, "et": 0.4125, "eu": 0.358333, "fa": 0.4, "fi": 0.3375, "fr": 0.375, "gl": 0.291667, "hu": 0.275, "hv": 0.35, "is": 0.375, "it": 0.4, "ka": 0.2625, "la": 0.516667, "li": 0.3, "lv": 0.3625, "mk": 0.375, "mt": 0.4, "nl": 0.3375, "no": 0.3625, "pl": 0.416667, "pt": 0.4125, "ro": 0.225, "ru": 0.28125, "sk": 0.325, "sl": 0.475, "sq": 0.375, "sr": 0.225, "sv": 0.4, "tr": 0.275, "uk": 0.325}, "confusion": {"1": {"4": 0.1324, "2": 0.1961, "3": 0.1225, "1": 0.5319, "6": 0.0123, "5": 0.0049}, "2": {"4": 0.25, "3": 0.1725, "2": 0.0875, "1": 0.4775, "5": 0.0075, "6": 0.005}, "5": {"4": 0.527, "2": 0.0098, "3": 0.0343, "1": 0.4093, "5": 0.0172, "6": 0.0025}, "6": {"4": 0.6752, "5": 0.0997, "6": 0.0153, "1": 0.179, "3": 0.0307}, "3": {"3": 0.1571, "4": 0.4571, "5": 0.0143, "6": 0.0143, "1": 0.2571, "2": 0.1}, "4": {"4": 0.5571, "6": 0.0143, "1": 0.2714, "2": 0.0429, "3": 0.1143}}}, {"model": "mistralai/ministral-14b-2512", "avg_exact": 0.196235, "avg_wp": 0.353394, "avg_bias": 0.71, "total": 1753, "lang_exact": {"ar": 0.15, "az": 0.225, "be": 0.175, "bg": 0.2, "bo": 0.25, 
"ca": 0.225, "cn": 0.233333, "cs": 0.25, "da": 0.175, "el": 0.25, "en": 0.275, "es": 0.175, "et": 0.075, "eu": 0.266667, "fa": 0.216667, "fi": 0.125, "fr": 0.25, "gl": 0.216667, "hu": 0.175, "hv": 0.2, "is": 0.325, "it": 0.125, "ka": 0.05, "la": 0.416667, "li": 0.125, "lv": 0.1, "mk": 0.175, "mt": 0.05, "nl": 0.1, "no": 0.125, "pl": 0.233333, "pt": 0.15, "ro": 0.175, "ru": 0.181818, "sk": 0.175, "sl": 0.125, "sq": 0.175, "sr": 0.225, "sv": 0.275, "tr": 0.225, "uk": 0.2}, "lang_wp": {"ar": 0.3125, "az": 0.3625, "be": 0.35, "bg": 0.4125, "bo": 0.35, "ca": 0.4, "cn": 0.425, "cs": 0.4125, "da": 0.325, "el": 0.375, "en": 0.4125, "es": 0.35, "et": 0.225, "eu": 0.425, "fa": 0.441667, "fi": 0.275, "fr": 0.375, "gl": 0.383333, "hu": 0.3, "hv": 0.3375, "is": 0.4875, "it": 0.3125, "ka": 0.2125, "la": 0.533333, "li": 0.2625, "lv": 0.2875, "mk": 0.3375, "mt": 0.175, "nl": 0.225, "no": 0.225, "pl": 0.433333, "pt": 0.2875, "ro": 0.3375, "ru": 0.333333, "sk": 0.3375, "sl": 0.325, "sq": 0.3375, "sr": 0.325, "sv": 0.4, "tr": 0.425, "uk": 0.375}, "confusion": {"1": {"3": 0.3, "5": 0.2293, "4": 0.3268, "6": 0.0537, "1": 0.0902}, "2": {"4": 0.335, "5": 0.29, "3": 0.225, "6": 0.0475, "1": 0.1025}, "5": {"4": 0.2296, "5": 0.5506, "3": 0.0543, "6": 0.079, "1": 0.0864}, "6": {"5": 0.5872, "4": 0.1795, "6": 0.1385, "1": 0.0615, "3": 0.0333}, "3": {"5": 0.4143, "4": 0.3143, "3": 0.1571, "6": 0.0714, "1": 0.0429}, "4": {"5": 0.5571, "4": 0.2714, "3": 0.0714, "6": 0.1}}}, {"model": "meta-llama/llama-3.1-8b-instruct", "avg_exact": 0.209262, "avg_wp": 0.341052, "avg_bias": -0.1998, "total": 1749, "lang_exact": {"ar": 0.135135, "az": 0.2, "be": 0.3, "bg": 0.175, "bo": 0.2, "ca": 0.225, "cn": 0.3, "cs": 0.25, "da": 0.25, "el": 0.225, "en": 0.275, "es": 0.075, "et": 0.275, "eu": 0.116667, "fa": 0.083333, "fi": 0.333333, "fr": 0.1, "gl": 0.133333, "hu": 0.175, "hv": 0.375, "is": 0.25, "it": 0.2, "ka": 0.225, "la": 0.116667, "li": 0.2, "lv": 0.225, "mk": 0.275, "mt": 0.2, "nl": 0.225, "no": 0.225, 
"pl": 0.083333, "pt": 0.225, "ro": 0.075, "ru": 0.242424, "sk": 0.225, "sl": 0.25, "sq": 0.225, "sr": 0.25, "sv": 0.375, "tr": 0.275, "uk": 0.225}, "lang_wp": {"ar": 0.297297, "az": 0.275, "be": 0.4625, "bg": 0.3375, "bo": 0.3125, "ca": 0.35, "cn": 0.441667, "cs": 0.4, "da": 0.3875, "el": 0.325, "en": 0.4, "es": 0.2125, "et": 0.3875, "eu": 0.191667, "fa": 0.216667, "fi": 0.5, "fr": 0.2375, "gl": 0.266667, "hu": 0.3375, "hv": 0.5375, "is": 0.375, "it": 0.325, "ka": 0.35, "la": 0.325, "li": 0.3, "lv": 0.4, "mk": 0.4, "mt": 0.3, "nl": 0.3625, "no": 0.3375, "pl": 0.258333, "pt": 0.35, "ro": 0.2, "ru": 0.30303, "sk": 0.3625, "sl": 0.375, "sq": 0.3375, "sr": 0.375, "sv": 0.475, "tr": 0.4, "uk": 0.3625}, "confusion": {"1": {"1": 0.388, "2": 0.0651, "5": 0.2161, "4": 0.2396, "6": 0.0755, "3": 0.0156}, "2": {"1": 0.4027, "4": 0.1973, "5": 0.2453, "6": 0.104, "3": 0.0027, "2": 0.048}, "5": {"5": 0.2825, "4": 0.155, "1": 0.415, "2": 0.005, "6": 0.1425}, "6": {"4": 0.1362, "1": 0.3316, "2": 0.0051, "5": 0.3111, "6": 0.2159}, "3": {"5": 0.3857, "6": 0.1143, "1": 0.3714, "4": 0.1, "2": 0.0286}, "4": {"4": 0.029, "5": 0.4783, "6": 0.087, "1": 0.3478, "2": 0.029, "3": 0.029}}}, {"model": "speakleash/Bielik-11B-v2.6-Instruct", "avg_exact": 0.1502, "avg_wp": 0.340091, "avg_bias": 0.5597, "total": 1751, "lang_exact": {"ar": 0.05, "az": 0.075, "be": 0.0, "bg": 0.175, "bo": 0.2, "ca": 0.175, "cn": 0.186441, "cs": 0.1, "da": 0.225, "el": 0.275, "en": 0.025, "es": 0.2, "et": 0.2, "eu": 0.166667, "fa": 0.266667, "fi": 0.125, "fr": 0.225, "gl": 0.216667, "hu": 0.2, "hv": 0.225, "is": 0.0, "it": 0.125, "ka": 0.0, "la": 0.2, "li": 0.225, "lv": 0.125, "mk": 0.2, "mt": 0.15, "nl": 0.225, "no": 0.15, "pl": 0.183333, "pt": 0.075, "ro": 0.075, "ru": 0.090909, "sk": 0.125, "sl": 0.225, "sq": 0.0, "sr": 0.05, "sv": 0.15, "tr": 0.225, "uk": 0.075}, "lang_wp": {"ar": 0.175, "az": 0.25, "be": 0.1875, "bg": 0.3875, "bo": 0.4, "ca": 0.425, "cn": 0.29661, "cs": 0.3, "da": 0.4125, "el": 0.45, "en": 0.2, 
"es": 0.4625, "et": 0.4375, "eu": 0.333333, "fa": 0.433333, "fi": 0.35, "fr": 0.4375, "gl": 0.333333, "hu": 0.3375, "hv": 0.4375, "is": 0.1125, "it": 0.375, "ka": 0.128205, "la": 0.308333, "li": 0.4375, "lv": 0.3875, "mk": 0.4125, "mt": 0.35, "nl": 0.4625, "no": 0.4125, "pl": 0.4, "pt": 0.4125, "ro": 0.2625, "ru": 0.30303, "sk": 0.2875, "sl": 0.4375, "sq": 0.2, "sr": 0.2, "sv": 0.3375, "tr": 0.375, "uk": 0.25}, "confusion": {"1": {"3": 0.8971, "2": 0.0049, "6": 0.076, "5": 0.0074, "1": 0.0147}, "2": {"3": 0.84, "6": 0.13, "5": 0.0225, "4": 0.005, "1": 0.0025}, "5": {"3": 0.3659, "6": 0.4439, "4": 0.0512, "5": 0.1366, "1": 0.0024}, "6": {"3": 0.2901, "6": 0.4148, "4": 0.0712, "5": 0.2239}, "3": {"6": 0.4143, "5": 0.0429, "3": 0.5429}, "4": {"6": 0.5, "3": 0.4571, "5": 0.0429}}}, {"model": "CYFRAGOVPL/Llama-PLLuM-70B-chat-250801", "avg_exact": 0.227169, "avg_wp": 0.335616, "avg_bias": -1.8853, "total": 1752, "lang_exact": {"ar": 0.15, "az": 0.275, "be": 0.25, "bg": 0.175, "bo": 0.25, "ca": 0.175, "cn": 0.152542, "cs": 0.225, "da": 0.175, "el": 0.225, "en": 0.225, "es": 0.125, "et": 0.25, "eu": 0.1, "fa": 0.35, "fi": 0.25, "fr": 0.225, "gl": 0.133333, "hu": 0.175, "hv": 0.375, "is": 0.4, "it": 0.2, "ka": 0.175, "la": 0.3, "li": 0.25, "lv": 0.325, "mk": 0.225, "mt": 0.275, "nl": 0.25, "no": 0.25, "pl": 0.1, "pt": 0.25, "ro": 0.25, "ru": 0.30303, "sk": 0.2, "sl": 0.175, "sq": 0.25, "sr": 0.325, "sv": 0.25, "tr": 0.275, "uk": 0.15}, "lang_wp": {"ar": 0.35, "az": 0.4, "be": 0.3625, "bg": 0.275, "bo": 0.4, "ca": 0.2375, "cn": 0.194915, "cs": 0.375, "da": 0.2875, "el": 0.3125, "en": 0.2375, "es": 0.125, "et": 0.375, "eu": 0.183333, "fa": 0.508333, "fi": 0.3625, "fr": 0.35, "gl": 0.2, "hu": 0.2125, "hv": 0.55, "is": 0.5875, "it": 0.275, "ka": 0.2375, "la": 0.5, "li": 0.35, "lv": 0.4625, "mk": 0.275, "mt": 0.4625, "nl": 0.3625, "no": 0.3125, "pl": 0.141667, "pt": 0.3625, "ro": 0.3625, "ru": 0.439394, "sk": 0.325, "sl": 0.2875, "sq": 0.375, "sr": 0.475, "sv": 0.375, "tr": 
0.4125, "uk": 0.2375}, "confusion": {"1": {"1": 0.9601, "2": 0.0114, "3": 0.0199, "5": 0.0028, "6": 0.0057}, "2": {"4": 0.016, "1": 0.9169, "3": 0.0479, "5": 0.016, "2": 0.0032}, "5": {"4": 0.0897, "5": 0.1063, "1": 0.7774, "3": 0.0199, "6": 0.0066}, "6": {"5": 0.0915, "4": 0.0881, "1": 0.7695, "6": 0.0441, "3": 0.0068}, "3": {"1": 0.7407, "3": 0.2037, "4": 0.0556}, "4": {"1": 0.5652, "5": 0.1304, "3": 0.1957, "4": 0.087, "6": 0.0217}}}, {"model": "CYFRAGOVPL/pllum-12b-nc-chat-250715", "avg_exact": 0.146689, "avg_wp": 0.234018, "avg_bias": -1.0019, "total": 1752, "lang_exact": {"ar": 0.075, "az": 0.175, "be": 0.275, "bg": 0.125, "bo": 0.325, "ca": 0.275, "cn": 0.101695, "cs": 0.075, "da": 0.125, "el": 0.175, "en": 0.15, "es": 0.2, "et": 0.075, "eu": 0.083333, "fa": 0.216667, "fi": 0.075, "fr": 0.25, "gl": 0.1, "hu": 0.025, "hv": 0.325, "is": 0.15, "it": 0.125, "ka": 0.075, "la": 0.133333, "li": 0.15, "lv": 0.1, "mk": 0.075, "mt": 0.1, "nl": 0.075, "no": 0.2, "pl": 0.116667, "pt": 0.1, "ro": 0.35, "ru": 0.212121, "sk": 0.15, "sl": 0.15, "sq": 0.075, "sr": 0.175, "sv": 0.125, "tr": 0.15, "uk": 0.075}, "lang_wp": {"ar": 0.0875, "az": 0.3375, "be": 0.3625, "bg": 0.2, "bo": 0.4875, "ca": 0.3625, "cn": 0.144068, "cs": 0.1125, "da": 0.2125, "el": 0.25, "en": 0.275, "es": 0.275, "et": 0.15, "eu": 0.116667, "fa": 0.35, "fi": 0.1375, "fr": 0.3875, "gl": 0.183333, "hu": 0.125, "hv": 0.475, "is": 0.225, "it": 0.225, "ka": 0.1125, "la": 0.308333, "li": 0.2125, "lv": 0.2375, "mk": 0.15, "mt": 0.2, "nl": 0.15, "no": 0.275, "pl": 0.141667, "pt": 0.1375, "ro": 0.425, "ru": 0.30303, "sk": 0.2625, "sl": 0.3, "sq": 0.1375, "sr": 0.25, "sv": 0.1625, "tr": 0.2625, "uk": 0.175}, "confusion": {"1": {"1": 0.6739, "4": 0.0739, "3": 0.0783, "2": 0.0565, "6": 0.1, "5": 0.0174}, "5": {"1": 0.5176, "3": 0.0211, "6": 0.1373, "4": 0.1585, "5": 0.162, "2": 0.0035}, "6": {"1": 0.5607, "5": 0.1308, "4": 0.1402, "2": 0.028, "6": 0.1168, "3": 0.0234}, "2": {"1": 0.5738, "5": 0.0779, "2": 0.0738, "6": 
0.0779, "4": 0.1189, "3": 0.0779}, "3": {"3": 0.1316, "1": 0.3158, "6": 0.1579, "2": 0.1053, "4": 0.2632, "5": 0.0263}, "4": {"1": 0.3415, "6": 0.2683, "5": 0.0976, "4": 0.1951, "3": 0.0976}}}];
235
  const ALL_LANGS = ["ar", "az", "be", "bg", "bo", "ca", "cn", "cs", "da", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "gl", "hu", "hv", "is", "it", "ka", "la", "li", "lv", "mk", "mt", "nl", "no", "pl", "pt", "ro", "ru", "sk", "sl", "sq", "sr", "sv", "tr", "uk"];
236
  const LANG_NAMES = {"af": "Afrikaans", "ar": "Arabic", "az": "Azerbaijani", "be": "Belarusian", "bo": "Bosnian", "bg": "Bulgarian", "bn": "Bulgarian", "ca": "Catalan", "cs": "Czech", "cn": "Chinese", "cy": "Welsh", "da": "Danish", "de": "German", "el": "Greek", "en": "English", "eo": "Esperanto", "es": "Spanish", "et": "Estonian", "eu": "Basque", "fa": "Faroese", "fi": "Finnish", "fr": "French", "ga": "Irish", "gl": "Galician", "gu": "Gujarati", "he": "Hebrew", "hi": "Hindi", "hr": "Croatian", "hu": "Hungarian", "hy": "Armenian", "hv": "Croatia", "id": "Indonesian", "is": "Icelandic", "it": "Italian", "ja": "Japanese", "ka": "Georgian", "kk": "Kazakh", "km": "Khmer", "kn": "Kannada", "ko": "Korean", "la": "Latin", "li": "Lithuanian", "lv": "Latvian", "mk": "Macedonian", "ml": "Malayalam", "mn": "Mongolian", "mr": "Marathi", "ms": "Malay", "mt": "Maltese", "my": "Burmese", "ne": "Nepali", "nl": "Dutch", "no": "Norwegian", "pa": "Punjabi", "pe": "Persian", "pl": "Polish", "pt": "Portuguese", "ro": "Romanian", "ru": "Russian", "si": "Sinhala", "sk": "Slovak", "sl": "Slovenian", "sq": "Albanian", "sr": "Serbian", "sv": "Swedish", "sw": "Swahili", "ta": "Tamil", "te": "Telugu", "th": "Thai", "tl": "Filipino", "tr": "Turkish", "uk": "Ukrainian", "ur": "Urdu", "uz": "Uzbek", "vi": "Vietnamese", "zh": "Chinese", "zu": "Zulu"};
237
  const LANG_COUNTS = {"ar": 1790, "az": 1795, "be": 1794, "bg": 1721, "bo": 1795, "ca": 1712, "cn": 2686, "cs": 1779, "da": 1784, "el": 1794, "en": 1791, "es": 1652, "et": 1788, "eu": 2684, "fa": 2697, "fi": 1784, "fr": 1761, "gl": 2698, "hu": 1796, "hv": 1796, "is": 1790, "it": 1791, "ka": 1629, "la": 2522, "li": 1793, "lv": 1795, "mk": 1799, "mt": 1797, "nl": 1795, "no": 1799, "pl": 2637, "pt": 1795, "ro": 1790, "ru": 1424, "sk": 1784, "sl": 1788, "sq": 1793, "sr": 1798, "sv": 1797, "tr": 1799, "uk": 1747};
 
475
  `77760 predictions · ${ALL_LANGS.length} languages · ${ALL_ROWS.length} models`;
476
  }
477
 
478
+ // ── bias lollipop ──
479
/**
 * Draw the per-model bias chart on the #biasChart canvas.
 *
 * Models are ordered by ascending `avg_bias`. Each model gets one horizontal
 * bar whose length is `avg_bias` rounded to three decimals; bars for values
 * >= 0 are green and bars for negative values are red. The canvas height
 * grows with the number of models (26px per model, 260px minimum).
 */
function renderBias() {
  // Fresh font object per use so no Chart.js option object is shared.
  const monoFont = () => ({ family: 'JetBrains Mono', size: 10 });

  const byBias = ALL_ROWS.slice().sort((left, right) => left.avg_bias - right.avg_bias);

  const modelNames = [];
  const biasValues = [];
  const barColors = [];
  for (const entry of byBias) {
    const rounded = Number(entry.avg_bias.toFixed(3));
    modelNames.push(entry.model);
    biasValues.push(rounded);
    barColors.push(rounded >= 0 ? '#16a34a' : '#dc2626');
  }

  const canvas = document.getElementById('biasChart');
  const drawingContext = canvas.getContext('2d');
  const heightPx = Math.max(260, byBias.length * 26 + 40);
  canvas.style.height = `${heightPx}px`;

  // Tooltip text: explicit "+" for positive bias, three decimals.
  const describeBias = (item) => {
    const x = item.parsed.x;
    const sign = x > 0 ? '+' : '';
    return ` Bias: ${sign}${x.toFixed(3)}`;
  };

  const biasDataset = {
    label: 'Mean Error',
    data: biasValues,
    backgroundColor: barColors,
    borderRadius: 3,
    barPercentage: 0.45,
  };

  const xAxis = {
    grid: { color: '#1a2236' },
    ticks: { color: '#64748b', font: monoFont() },
    title: { display: true, text: 'Mean Error (pred − gt)', color: '#64748b', font: monoFont() },
  };

  const yAxis = {
    grid: { display: false },
    ticks: { color: '#cbd5e1', font: monoFont() },
  };

  new Chart(drawingContext, {
    type: 'bar',
    data: { labels: modelNames, datasets: [biasDataset] },
    options: {
      indexAxis: 'y', // horizontal bars: models on the y axis
      responsive: true,
      maintainAspectRatio: false,
      animation: { duration: 400 },
      plugins: {
        legend: { display: false },
        tooltip: { backgroundColor: '#1e2a3a', callbacks: { label: describeBias } },
      },
      scales: { x: xAxis, y: yAxis },
    },
  });
}
516
+
517
+ // ── critical confusion ──
518
/**
 * Percentage of predictions that jump from one score band to another.
 *
 * `confusion` maps a ground-truth score (string key) to a map of predicted
 * score -> share. For every ground-truth row whose score is in `fromScores`,
 * the shares of predictions falling in `toScores` are accumulated, weighted by
 * that row's total, and divided by the summed row totals. In the data visible
 * in this file each row sums to roughly 1, so this is effectively the mean
 * per-class rate.
 *
 * @param {Object<string, Object<string, number>>|undefined} confusion
 * @param {Set<number>} fromScores - ground-truth scores to consider.
 * @param {Set<number>} toScores - predicted scores that count as a hit.
 * @returns {number} percentage rounded to one decimal; 0 when no row matches.
 */
function criticalConfusionRate(confusion, fromScores, toScores) {
  let numer = 0;
  let denom = 0;
  for (const [gtKey, preds] of Object.entries(confusion || {})) {
    if (!fromScores.has(Number.parseInt(gtKey, 10))) continue;
    const rowTotal = Object.values(preds).reduce((acc, share) => acc + share, 0);
    denom += rowTotal;
    for (const [predKey, share] of Object.entries(preds)) {
      if (toScores.has(Number.parseInt(predKey, 10))) numer += share * rowTotal;
    }
  }
  return denom > 0 ? Number((numer / denom * 100).toFixed(1)) : 0;
}

/**
 * Draw the "critical confusion" grouped bar chart on the #criticalChart canvas.
 *
 * Models are ordered by descending `avg_wp`. For each model two bars are shown:
 * how often ground-truth 1–2 texts were predicted as 5–6 (Low→High) and how
 * often ground-truth 5–6 texts were predicted as 1–2 (High→Low), both as
 * percentages. The canvas height grows with the number of models
 * (26px per model, 260px minimum).
 */
function renderCritical() {
  const LOW = new Set([1, 2]);
  const HIGH = new Set([5, 6]);
  const sorted = [...ALL_ROWS].sort((a, b) => b.avg_wp - a.avg_wp);
  const labels = sorted.map(r => r.model);
  const lh = sorted.map(row => criticalConfusionRate(row.confusion, LOW, HIGH));
  const hl = sorted.map(row => criticalConfusionRate(row.confusion, HIGH, LOW));

  const canvas = document.getElementById('criticalChart');
  const ctx = canvas.getContext('2d');
  const h = Math.max(260, sorted.length * 26 + 60);
  canvas.style.height = `${h}px`;

  new Chart(ctx, {
    type: 'bar',
    data: { labels, datasets: [
      { label: 'Low→High (1–2 pred as 5–6)', data: lh, backgroundColor: '#dc2626', borderRadius: 3, barPercentage: 0.7 },
      { label: 'High→Low (5–6 pred as 1–2)', data: hl, backgroundColor: '#f97316', borderRadius: 3, barPercentage: 0.7 },
    ] },
    options: {
      indexAxis: 'y', // horizontal bars: models on the y axis
      responsive: true, maintainAspectRatio: false,
      animation: { duration: 400 },
      plugins: {
        legend: { position: 'bottom', labels: { color: '#94a3b8', font: { family: 'JetBrains Mono', size: 10 }, boxWidth: 12, padding: 16 } },
        tooltip: { backgroundColor: '#1e2a3a', callbacks: { label: item => ` ${item.dataset.label}: ${item.parsed.x}%` } }
      },
      scales: {
        x: { min: 0, grid: { color: '#1a2236' },
          ticks: { color: '#64748b', font: { family: 'JetBrains Mono', size: 10 }, callback: v => `${v}%` },
          title: { display: true, text: '% of predictions within true class', color: '#64748b', font: { family: 'JetBrains Mono', size: 10 } }
        },
        y: { grid: { display: false },
          ticks: { color: '#cbd5e1', font: { family: 'JetBrains Mono', size: 10 } }
        }
      }
    }
  });
}
576
+
577
+ // ── confusion heatmap with dropdown ──
578
// Chart.js instance currently drawn on the confusion heatmap canvas, or null
// before the first render. renderConfusion destroys it before drawing anew.
let confChartInstance = null;

/**
 * Fill the #confModelSelect dropdown with one <option> per model.
 * Each option's value is the model's index in ALL_ROWS and its visible text
 * is the model name.
 */
function populateConfSelect() {
  const dropdown = document.getElementById('confModelSelect');
  for (const [position, entry] of ALL_ROWS.entries()) {
    const option = document.createElement('option');
    option.value = position;
    option.textContent = entry.model;
    dropdown.appendChild(option);
  }
}
589
+
590
/**
 * Draw the 6×6 confusion heatmap for the model selected in #confModelSelect.
 *
 * Exposed on `window` so it can be invoked from outside this script scope.
 * The heatmap is a Chart.js scatter chart with invisible points; the cells
 * and their percentage labels are painted by an inline `afterDraw` plugin.
 * Column index = predicted score, row index = ground-truth score
 * (both 0-based indexes into `scores`).
 *
 * Does nothing when the dropdown does not resolve to an entry of ALL_ROWS
 * (for example when ALL_ROWS is empty), instead of throwing.
 */
window.renderConfusion = function() {
  const select = document.getElementById('confModelSelect');
  // An empty value falls back to the first model.
  const idx = Number.parseInt(select.value || '0', 10);
  const row = ALL_ROWS[idx];
  if (!row) return; // no such model: keep whatever is currently drawn
  const conf = row.confusion || {};
  const scores = [1, 2, 3, 4, 5, 6];

  // One cell per (ground truth, prediction) pair; v is the stored share of
  // predictions for that ground-truth row (0 when the pair is absent).
  const data = [];
  scores.forEach((gt, ri) => {
    const preds = conf[gt] || {};
    const rowSum = Object.values(preds).reduce((a, b) => a + b, 0);
    scores.forEach((pred, ci) => {
      const v = rowSum > 0 ? (preds[pred] || 0) : 0;
      data.push({ x: ci, y: ri, v });
    });
  });

  const canvas = document.getElementById('confusionChart');
  const ctx = canvas.getContext('2d');
  // Release the previous chart before re-using the canvas.
  if (confChartInstance) confChartInstance.destroy();
  canvas.style.height = '340px';

  // Green on the diagonal (exact match), red when prediction and ground truth
  // are 3 or more steps apart, blue otherwise; opacity scales with the share.
  function cellColor(ri, ci, v) {
    if (ri === ci) return `rgba(22,163,74,${0.15 + v * 0.85})`;
    if (Math.abs(ri - ci) >= 3) return `rgba(220,38,38,${v * 0.9})`;
    return `rgba(37,99,235,${v * 0.75})`;
  }

  confChartInstance = new Chart(ctx, {
    type: 'scatter',
    data: { datasets: [{ data, pointRadius: 0 }] },
    options: {
      responsive: true, maintainAspectRatio: false, animation: { duration: 300 },
      plugins: {
        legend: { display: false },
        tooltip: { backgroundColor: '#1e2a3a', callbacks: {
          title: items => `GT ${scores[items[0].raw.y]} → Pred ${scores[items[0].raw.x]}`,
          label: item => ` ${(item.raw.v * 100).toFixed(1)}% of true-class predictions`
        } }
      },
      scales: {
        // Axes span -0.5..5.5 so each integer index sits in the middle of a cell.
        x: { type: 'linear', min: -0.5, max: 5.5,
          ticks: { stepSize: 1, callback: v => 'Pred ' + (scores[v] || ''), color: '#64748b', font: { family: 'JetBrains Mono', size: 10 } },
          grid: { color: '#1a2236' },
        },
        y: { type: 'linear', min: -0.5, max: 5.5,
          ticks: { stepSize: 1, callback: v => 'GT ' + (scores[v] || ''), color: '#64748b', font: { family: 'JetBrains Mono', size: 10 } },
          grid: { color: '#1a2236' },
        }
      }
    },
    plugins: [{
      id: 'heatmap',
      afterDraw(chart) {
        const {ctx, scales: {x, y}} = chart;
        // Pixel size of one cell; y pixels decrease as values grow, hence 0 minus 1.
        const cellW = x.getPixelForValue(1) - x.getPixelForValue(0);
        const cellH = y.getPixelForValue(0) - y.getPixelForValue(1);
        data.forEach(d => {
          const cx = x.getPixelForValue(d.x);
          const cy = y.getPixelForValue(d.y);
          ctx.fillStyle = cellColor(d.y, d.x, d.v);
          // 1px inset on every side leaves a thin gap between cells.
          ctx.fillRect(cx - cellW / 2 + 1, cy - cellH / 2 + 1, cellW - 2, cellH - 2);
          // Label only cells above 0.5%; white text on strongly filled cells.
          if (d.v > 0.005) {
            ctx.fillStyle = d.v > 0.3 ? '#fff' : '#94a3b8';
            ctx.font = 'bold 11px JetBrains Mono, monospace';
            ctx.textAlign = 'center';
            ctx.textBaseline = 'middle';
            ctx.fillText((d.v * 100).toFixed(0) + '%', cx, cy);
          }
        });
      }
    }]
  });
};
662
+
663
  render();
664
  renderChart();
665
+ // renderDist(); // disabled for testing
666
+ renderBias();
667
+ renderCritical();
668
+ populateConfSelect();
669
+ renderConfusion();
670
  })();
671
  </script>
672
  </body>