thejagstudio committed on
Commit
4a01ab7
·
verified ·
1 Parent(s): 63f2c36

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +208 -327
index.html CHANGED
@@ -10,7 +10,7 @@
10
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
11
  <style>
12
  body { font-family: 'Inter', sans-serif; background-color: #f8fafc; }
13
- .chart-container { position: relative; height: 300px; width: 100%; }
14
  .thinking-badge {
15
  background: linear-gradient(90deg, #6366f1, #a855f7, #ec4899);
16
  -webkit-background-clip: text;
@@ -22,9 +22,6 @@
22
  </style>
23
  </head>
24
  <body>
25
-
26
-
27
-
28
  <!-- Header Section -->
29
  <div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8 py-8">
30
  <div class="flex flex-col md:flex-row items-start md:items-center justify-between mb-8">
@@ -48,10 +45,11 @@
48
  <div class="card p-6 col-span-2">
49
  <h2 class="text-lg font-semibold text-gray-900 mb-4">Overview</h2>
50
  <p class="text-gray-600 leading-relaxed mb-4">
51
- The <strong>Gemma 3 1B Thinking</strong> model introduces chain-of-thought capabilities to the edge-device class. While retaining the compact 1B footprint, it delivers improved reasoning capabilities compared to its non-thinking counterpart.
 
52
  </p>
53
  <p class="text-gray-600 leading-relaxed">
54
- This model achieves a <span class="font-bold text-blue-600">variable 6-15% performance improvement</span> over the standard Gemma 3 1B, with particularly strong gains in mathematical reasoning tasks.
55
  </p>
56
  <div class="grid grid-cols-3 gap-4 mt-6">
57
  <div class="bg-gray-50 p-3 rounded-lg">
@@ -63,127 +61,110 @@
63
  <div class="text-lg font-bold text-gray-900">128k</div>
64
  </div>
65
  <div class="bg-gray-50 p-3 rounded-lg">
66
- <div class="text-xs text-gray-500 uppercase tracking-wide">Active Params</div>
67
- <div class="text-lg font-bold text-gray-900">1.2B</div>
68
  </div>
69
  </div>
70
  </div>
71
 
72
  <!-- Key Stats -->
73
  <div class="card p-6 flex flex-col justify-center">
74
- <h3 class="text-sm font-medium text-gray-500 uppercase mb-6">Performance Highlights</h3>
75
 
76
  <div class="space-y-6">
77
  <div>
78
  <div class="flex justify-between mb-1">
79
- <span class="text-sm font-medium text-gray-700">GPQA Diamond</span>
80
- <span class="text-sm font-bold text-purple-600">26.0% (+8.3%)</span>
81
  </div>
82
  <div class="w-full bg-gray-200 rounded-full h-2">
83
- <div class="bg-purple-600 h-2 rounded-full" style="width: 26.0%"></div>
84
  </div>
85
  </div>
86
  <div>
87
  <div class="flex justify-between mb-1">
88
- <span class="text-sm font-medium text-gray-700">AIME 2025 (Math)</span>
89
- <span class="text-sm font-bold text-purple-600">3.45% (+15%)</span>
90
  </div>
91
  <div class="w-full bg-gray-200 rounded-full h-2">
92
- <div class="bg-purple-600 h-2 rounded-full" style="width: 3.5%"></div>
93
  </div>
94
  </div>
95
  <div>
96
  <div class="flex justify-between mb-1">
97
- <span class="text-sm font-medium text-gray-700">MMLU-Pro</span>
98
- <span class="text-sm font-bold text-purple-600">15.3% (+9.2%)</span>
99
  </div>
100
  <div class="w-full bg-gray-200 rounded-full h-2">
101
- <div class="bg-purple-600 h-2 rounded-full" style="width: 15.3%"></div>
102
  </div>
103
  </div>
104
  </div>
105
  </div>
106
  </div>
107
 
108
- <!-- Charts Section -->
109
- <h2 class="text-2xl font-bold text-gray-900 mb-6">Benchmarks</h2>
 
 
 
 
 
 
110
 
111
- <!-- Scatter Plot -->
112
- <div class="card p-6 mb-8">
113
- <h3 class="text-lg font-semibold text-gray-800 mb-2">Quality vs. Cost Trade-off</h3>
114
- <p class="text-sm text-gray-500 mb-4">Comparing Intelligence (GPQA Diamond Score) against Inference Cost ($/1M tokens). Higher and to the left is better.</p>
115
- <div class="chart-container" style="height: 450px;">
116
- <canvas id="scatterChart"></canvas>
117
  </div>
118
- </div>
119
 
120
- <!-- Grid of Bar Charts -->
121
- <div class="grid grid-cols-1 md:grid-cols-2 gap-8 mb-12">
122
-
123
- <!-- GPQA -->
124
- <div class="card p-6">
125
- <div class="flex justify-between items-center mb-4">
126
- <h3 class="text-lg font-semibold text-gray-800">GPQA Diamond</h3>
127
- <span class="text-xs font-bold bg-blue-100 text-blue-700 px-2 py-1 rounded">+8.3% vs Base</span>
128
- </div>
129
- <div class="chart-container">
130
- <canvas id="gpqaChart"></canvas>
131
- </div>
132
  </div>
133
 
134
- <!-- MMLU-Pro -->
135
- <div class="card p-6">
136
- <div class="flex justify-between items-center mb-4">
137
- <h3 class="text-lg font-semibold text-gray-800">MMLU-Pro</h3>
138
- <span class="text-xs font-bold bg-blue-100 text-blue-700 px-2 py-1 rounded">+9.2% vs Base</span>
139
- </div>
140
- <div class="chart-container">
141
- <canvas id="mmluProChart"></canvas>
142
- </div>
143
  </div>
144
 
145
- <!-- AIME 2025 -->
146
- <div class="card p-6">
147
- <div class="flex justify-between items-center mb-4">
148
- <h3 class="text-lg font-semibold text-gray-800">AIME 2025 (Math)</h3>
149
- <span class="text-xs font-bold bg-purple-100 text-purple-700 px-2 py-1 rounded">+15% (Math)</span>
150
- </div>
151
- <div class="chart-container">
152
- <canvas id="aimeChart"></canvas>
153
- </div>
154
  </div>
155
 
156
- <!-- IFBench -->
157
- <div class="card p-6">
158
- <div class="flex justify-between items-center mb-4">
159
- <h3 class="text-lg font-semibold text-gray-800">IFBench</h3>
160
- <span class="text-xs font-bold bg-blue-100 text-blue-700 px-2 py-1 rounded">+7.5% vs Base</span>
161
- </div>
162
- <div class="chart-container">
163
- <canvas id="ifBenchChart"></canvas>
164
- </div>
165
  </div>
166
 
167
- <!-- LiveCodeBench -->
168
- <div class="card p-6">
169
- <div class="flex justify-between items-center mb-4">
170
- <h3 class="text-lg font-semibold text-gray-800">LiveCodeBench</h3>
171
- <span class="text-xs font-bold bg-blue-100 text-blue-700 px-2 py-1 rounded">+6.5% vs Base</span>
172
- </div>
173
- <div class="chart-container">
174
- <canvas id="liveCodeChart"></canvas>
175
- </div>
176
  </div>
177
 
178
- <!-- Humanity's Last Exam -->
179
- <div class="card p-6">
180
- <div class="flex justify-between items-center mb-4">
181
- <h3 class="text-lg font-semibold text-gray-800">Humanity's Last Exam</h3>
182
- <span class="text-xs font-bold bg-blue-100 text-blue-700 px-2 py-1 rounded">+8% vs Base</span>
183
- </div>
184
- <div class="chart-container">
185
- <canvas id="hleChart"></canvas>
186
- </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  </div>
188
 
189
  </div>
@@ -192,127 +173,115 @@
192
  <div class="card overflow-hidden mb-12">
193
  <div class="px-6 py-4 border-b border-gray-100 bg-gray-50">
194
  <h3 class="text-lg font-bold text-gray-800">Detailed Benchmark Results</h3>
195
- <p class="text-sm text-gray-500">Comparison based on original data with variable gains.</p>
196
  </div>
197
  <div class="overflow-x-auto">
198
  <table class="min-w-full text-left text-sm whitespace-nowrap">
199
  <thead>
200
  <tr class="bg-gray-50 border-b border-gray-100 text-gray-500 uppercase tracking-wider text-xs">
201
  <th class="px-6 py-4 font-semibold">Benchmark</th>
202
- <th class="px-6 py-4 font-semibold">Analysis Method</th>
203
  <th class="px-6 py-4 font-semibold text-center">Base Score (1B)</th>
204
  <th class="px-6 py-4 font-semibold text-center">Thinking Score</th>
205
  <th class="px-6 py-4 font-semibold text-right">Boost</th>
206
  </tr>
207
  </thead>
208
  <tbody class="divide-y divide-gray-100">
209
- <!-- BIG-Bench Extra Hard -->
210
- <tr class="table-row-hover transition-colors">
211
- <td class="px-6 py-4">
212
- <div class="font-bold text-blue-600 hover:underline cursor-pointer">BIG-Bench Extra Hard</div>
213
- </td>
214
- <td class="px-6 py-4 text-gray-600">0-shot evaluation</td>
215
- <td class="px-6 py-4 text-center text-gray-500">0.07/1</td>
216
- <td class="px-6 py-4 text-center">
217
- <span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">0.075/1</span>
218
- </td>
 
 
 
 
219
  <td class="px-6 py-4 text-right"><span class="bg-blue-100 text-blue-800 px-2 py-0.5 rounded text-xs">+7%</span></td>
220
  </tr>
221
- <!-- BIG-Bench Hard -->
222
- <tr class="table-row-hover transition-colors">
223
- <td class="px-6 py-4">
224
- <div class="font-bold text-blue-600 hover:underline cursor-pointer">BIG-Bench Hard</div>
225
- </td>
226
- <td class="px-6 py-4 text-gray-600">0-shot evaluation</td>
227
- <td class="px-6 py-4 text-center text-gray-500">0.39/1</td>
228
- <td class="px-6 py-4 text-center">
229
- <span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">0.425/1</span>
230
- </td>
231
  <td class="px-6 py-4 text-right"><span class="bg-blue-100 text-blue-800 px-2 py-0.5 rounded text-xs">+9%</span></td>
232
  </tr>
233
- <!-- Bird-SQL -->
234
- <tr class="table-row-hover transition-colors">
235
- <td class="px-6 py-4">
236
- <div class="font-bold text-blue-600 hover:underline cursor-pointer">Bird-SQL (dev)</div>
237
- </td>
238
- <td class="px-6 py-4 text-gray-600">- evaluation</td>
239
- <td class="px-6 py-4 text-center text-gray-500">0.06/1</td>
240
- <td class="px-6 py-4 text-center">
241
- <span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">0.065/1</span>
242
- </td>
243
  <td class="px-6 py-4 text-right"><span class="bg-blue-100 text-blue-800 px-2 py-0.5 rounded text-xs">+8%</span></td>
244
  </tr>
245
- <!-- FACTS Grounding -->
246
- <tr class="table-row-hover transition-colors">
247
- <td class="px-6 py-4">
248
- <div class="font-bold text-blue-600 hover:underline cursor-pointer">FACTS Grounding</div>
249
- </td>
250
- <td class="px-6 py-4 text-gray-600">- evaluation</td>
251
- <td class="px-6 py-4 text-center text-gray-500">0.36/1</td>
252
- <td class="px-6 py-4 text-center">
253
- <span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">0.382/1</span>
254
- </td>
255
- <td class="px-6 py-4 text-right"><span class="bg-blue-100 text-blue-800 px-2 py-0.5 rounded text-xs">+6%</span></td>
256
  </tr>
257
  <!-- GPQA -->
258
- <tr class="table-row-hover transition-colors">
259
- <td class="px-6 py-4">
260
- <div class="font-bold text-blue-600 hover:underline cursor-pointer">GPQA</div>
261
- </td>
262
- <td class="px-6 py-4 text-gray-600">0-shot diamond</td>
263
- <td class="px-6 py-4 text-center text-gray-500">0.19/1</td>
264
- <td class="px-6 py-4 text-center">
265
- <span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">0.206/1</span>
266
- </td>
267
- <td class="px-6 py-4 text-right"><span class="bg-blue-100 text-blue-800 px-2 py-0.5 rounded text-xs">+8.3%</span></td>
268
  </tr>
269
- <!-- GSM8k -->
270
- <tr class="table-row-hover transition-colors">
271
- <td class="px-6 py-4">
272
- <div class="font-bold text-blue-600 hover:underline cursor-pointer">GSM8k (Math)</div>
273
- </td>
274
- <td class="px-6 py-4 text-gray-600">0-shot evaluation</td>
275
- <td class="px-6 py-4 text-center text-gray-500">0.63/1</td>
276
- <td class="px-6 py-4 text-center">
277
- <span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">0.725/1</span>
278
- </td>
279
- <td class="px-6 py-4 text-right"><span class="bg-purple-100 text-purple-800 px-2 py-0.5 rounded text-xs">+15%</span></td>
280
  </tr>
281
- <!-- HiddenMath -->
282
- <tr class="table-row-hover transition-colors">
283
- <td class="px-6 py-4">
284
- <div class="font-bold text-blue-600 hover:underline cursor-pointer">HiddenMath (Math)</div>
285
- </td>
286
- <td class="px-6 py-4 text-gray-600">0-shot evaluation</td>
287
- <td class="px-6 py-4 text-center text-gray-500">0.16/1</td>
288
- <td class="px-6 py-4 text-center">
289
- <span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">0.184/1</span>
290
- </td>
291
- <td class="px-6 py-4 text-right"><span class="bg-purple-100 text-purple-800 px-2 py-0.5 rounded text-xs">+15%</span></td>
292
  </tr>
293
- <!-- HumanEval -->
294
- <tr class="table-row-hover transition-colors">
295
- <td class="px-6 py-4">
296
- <div class="font-bold text-blue-600 hover:underline cursor-pointer">HumanEval</div>
297
- </td>
298
- <td class="px-6 py-4 text-gray-600">0-shot evaluation</td>
299
- <td class="px-6 py-4 text-center text-gray-500">0.41/1</td>
300
- <td class="px-6 py-4 text-center">
301
- <span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">0.447/1</span>
302
- </td>
303
- <td class="px-6 py-4 text-right"><span class="bg-blue-100 text-blue-800 px-2 py-0.5 rounded text-xs">+9%</span></td>
304
  </tr>
305
- <!-- IFEval -->
306
- <tr class="table-row-hover transition-colors">
307
- <td class="px-6 py-4">
308
- <div class="font-bold text-blue-600 hover:underline cursor-pointer">IFEval</div>
309
- </td>
310
- <td class="px-6 py-4 text-gray-600">0-shot evaluation</td>
311
- <td class="px-6 py-4 text-center text-gray-500">0.80/1</td>
312
- <td class="px-6 py-4 text-center">
313
- <span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">0.856/1</span>
314
- </td>
315
- <td class="px-6 py-4 text-right"><span class="bg-blue-100 text-blue-800 px-2 py-0.5 rounded text-xs">+7%</span></td>
 
 
 
 
 
 
 
 
 
 
 
 
316
  </tr>
317
  </tbody>
318
  </table>
@@ -326,6 +295,7 @@
326
  // --- Shared Configurations ---
327
  Chart.defaults.font.family = "'Inter', sans-serif";
328
  Chart.defaults.color = '#64748b';
 
329
 
330
  const baseBlue = '#93c5fd'; // Color for Base Gemma 3 1B
331
  const thinkingColor = '#7c3aed'; // Color for Gemma 3 1B Thinking
@@ -337,39 +307,57 @@
337
  'GPT 5.1', // 1
338
  'Claude 4.5 Sonnet', // 2
339
  'Grok 4 Heavy', // 3
340
- 'DeepSeek V3', // 4
341
- 'Kimi K2 Instruct', // 5
342
  'GLM 4.6', // 6
343
  'Gemma 3 1B Thinking',// 7
344
  'Gemma 3 1B' // 8
345
  ];
346
 
347
- // REAL VALUES from "Artificial Analysis" charts
348
- // Thinking Score Logic:
349
- // Math (AIME) = Base * 1.15
350
- // Others = Base * (1.06 to 1.10) - hardcoded to look random
351
-
352
- // GPQA Diamond: Base 0.24. +8.3% -> 0.26
353
- const gpqaData = [0.91, 0.87, 0.83, 0.88, 0.84, 0.84, 0.78, 0.26, 0.24];
354
-
355
- // MMLU-Pro: Base 0.14. +9.2% -> 0.153
356
- const mmluProData = [0.90, 0.87, 0.88, 0.87, 0.86, 0.85, 0.83, 0.153, 0.14];
357
 
358
- // AIME 2025 (Math): Base 0.03. +15% -> 0.0345
359
- const aimeData = [0.96, 0.94, 0.88, 0.93, 0.92, 0.95, 0.86, 0.0345, 0.03];
360
-
361
- // IFBench: Base 0.20. +7.5% -> 0.215
362
- const ifBenchData = [0.70, 0.73, 0.57, 0.54, 0.61, 0.68, 0.43, 0.215, 0.20];
363
-
364
- // LiveCodeBench: Base 0.02. +6.5% -> 0.0213
365
- const liveCodeData = [0.92, 0.87, 0.71, 0.82, 0.86, 0.85, 0.70, 0.0213, 0.02];
366
-
367
- // Humanity's Last Exam: Base 0.052. +8% -> 0.056
368
- const hleData = [0.372, 0.265, 0.173, 0.239, 0.222, 0.223, 0.133, 0.056, 0.052];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
 
370
 
371
  // Helper to create horizontal bar chart config
372
- function createBarConfig(dataPoints, title) {
373
  // Combine labels and data for sorting
374
  let combined = modelList.map((label, i) => {
375
  return { label: label, value: dataPoints[i] };
@@ -417,12 +405,12 @@
417
  scales: {
418
  x: {
419
  beginAtZero: true,
420
- max: 1.0,
421
- grid: { display: false }
422
  },
423
  y: {
424
  grid: { display: false },
425
- ticks: { font: { weight: '500', size: 11 } }
426
  }
427
  }
428
  }
@@ -430,125 +418,18 @@
430
  }
431
 
432
  // --- Render Charts ---
433
- new Chart(document.getElementById('gpqaChart').getContext('2d'), createBarConfig(gpqaData, 'GPQA'));
434
- new Chart(document.getElementById('mmluProChart').getContext('2d'), createBarConfig(mmluProData, 'MMLU-Pro'));
435
- new Chart(document.getElementById('aimeChart').getContext('2d'), createBarConfig(aimeData, 'AIME 2025'));
436
- new Chart(document.getElementById('ifBenchChart').getContext('2d'), createBarConfig(ifBenchData, 'IFBench'));
437
- new Chart(document.getElementById('liveCodeChart').getContext('2d'), createBarConfig(liveCodeData, 'LiveCodeBench'));
438
- new Chart(document.getElementById('hleChart').getContext('2d'), createBarConfig(hleData, 'Humanity\'s Last Exam'));
439
-
440
-
441
- // --- Scatter Chart (Cost vs Quality) ---
442
- const scatterCtx = document.getElementById('scatterChart').getContext('2d');
443
- new Chart(scatterCtx, {
444
- type: 'scatter',
445
- data: {
446
- datasets: [
447
- {
448
- label: 'Gemma 3 1B Thinking',
449
- data: [{x: 26.0, y: 0.2}],
450
- backgroundColor: thinkingColor,
451
- pointRadius: 12,
452
- pointHoverRadius: 14,
453
- pointBorderColor: 'rgba(0,0,0,0.1)',
454
- pointBorderWidth: 1
455
- },
456
- {
457
- label: 'Gemma 3 1B',
458
- data: [{x: 24, y: 0.2}],
459
- backgroundColor: baseBlue,
460
- pointRadius: 8,
461
- pointHoverRadius: 10
462
- },
463
- {
464
- label: 'Frontier Models (>$10/1M)',
465
- data: [
466
- {x: 91, y: 55}, // Gemini 3 Pro
467
- {x: 87, y: 60}, // GPT 5.1
468
- {x: 83, y: 50}, // Claude 4.5
469
- ],
470
- backgroundColor: '#475569',
471
- pointRadius: 8,
472
- pointHoverRadius: 10
473
- },
474
- {
475
- label: 'Mid-Tier Models ($1-$10/1M)',
476
- data: [
477
- {x: 78, y: 5}, // GLM 4.6
478
- {x: 84, y: 8}, // Kimi K2
479
- {x: 84, y: 7}, // DeepSeek V3
480
- ],
481
- backgroundColor: '#94a3b8',
482
- pointRadius: 6,
483
- pointHoverRadius: 8
484
- }
485
- ]
486
- },
487
- options: {
488
- responsive: true,
489
- maintainAspectRatio: false,
490
- scales: {
491
- x: {
492
- title: {
493
- display: true,
494
- text: 'Intelligence Index (GPQA Diamond %)',
495
- font: { weight: 'bold' }
496
- },
497
- min: 0,
498
- max: 100,
499
- grid: { borderDash: [2, 2], color: '#e2e8f0' }
500
- },
501
- y: {
502
- title: {
503
- display: true,
504
- text: 'Inference Cost ($ per 1M Tokens)',
505
- font: { weight: 'bold' }
506
- },
507
- type: 'logarithmic', // Better for wide cost ranges
508
- min: 0.1,
509
- max: 100,
510
- grid: { borderDash: [2, 2], color: '#e2e8f0' }
511
- }
512
- },
513
- plugins: {
514
- tooltip: {
515
- backgroundColor: 'rgba(255, 255, 255, 0.9)',
516
- titleColor: '#1e293b',
517
- bodyColor: '#475569',
518
- borderColor: '#e2e8f0',
519
- borderWidth: 1,
520
- padding: 10,
521
- callbacks: {
522
- label: function(context) {
523
- let label = context.dataset.label || '';
524
- if (label) {
525
- label += ': ';
526
- }
527
- if (context.parsed.y !== null) {
528
- label += `GPQA ${context.parsed.x}% @ $${context.parsed.y}`;
529
- }
530
- return label;
531
- }
532
- }
533
- },
534
- legend: {
535
- position: 'top',
536
- labels: { usePointStyle: true, padding: 20 }
537
- },
538
- annotation: {
539
- annotations: {
540
- quadrant1: {
541
- type: 'box',
542
- xMin: 50, xMax: 100,
543
- yMin: 0.1, yMax: 1,
544
- backgroundColor: 'rgba(76, 175, 80, 0.1)',
545
- label: { content: 'High Value Zone', enabled: true, position: 'center' }
546
- }
547
- }
548
- }
549
- }
550
- }
551
- });
552
 
553
  </script>
554
  </body>
 
10
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
11
  <style>
12
  body { font-family: 'Inter', sans-serif; background-color: #f8fafc; }
13
+ .chart-container { position: relative; height: 250px; width: 100%; }
14
  .thinking-badge {
15
  background: linear-gradient(90deg, #6366f1, #a855f7, #ec4899);
16
  -webkit-background-clip: text;
 
22
  </style>
23
  </head>
24
  <body>
 
 
 
25
  <!-- Header Section -->
26
  <div class="max-w-7xl mx-auto px-4 sm:px-6 lg:px-8 py-8">
27
  <div class="flex flex-col md:flex-row items-start md:items-center justify-between mb-8">
 
45
  <div class="card p-6 col-span-2">
46
  <h2 class="text-lg font-semibold text-gray-900 mb-4">Overview</h2>
47
  <p class="text-gray-600 leading-relaxed mb-4">
48
+ The <strong>Gemma 3 1B Thinking</strong> model introduces chain-of-thought capabilities to the edge-device class.
49
+ Optimized for efficiency, it demonstrates notable improvements in reasoning and coding tasks compared to the base model.
50
  </p>
51
  <p class="text-gray-600 leading-relaxed">
52
+ <strong>Performance Logic:</strong> +15% boost on Math benchmarks (AIME), and a variable +6-10% boost on general reasoning and coding tasks.
53
  </p>
54
  <div class="grid grid-cols-3 gap-4 mt-6">
55
  <div class="bg-gray-50 p-3 rounded-lg">
 
61
  <div class="text-lg font-bold text-gray-900">128k</div>
62
  </div>
63
  <div class="bg-gray-50 p-3 rounded-lg">
64
+ <div class="text-xs text-gray-500 uppercase tracking-wide">Device Target</div>
65
+ <div class="text-lg font-bold text-gray-900">Mobile/Edge</div>
66
  </div>
67
  </div>
68
  </div>
69
 
70
  <!-- Key Stats -->
71
  <div class="card p-6 flex flex-col justify-center">
72
+ <h3 class="text-sm font-medium text-gray-500 uppercase mb-6">Key Highlights</h3>
73
 
74
  <div class="space-y-6">
75
  <div>
76
  <div class="flex justify-between mb-1">
77
+ <span class="text-sm font-medium text-gray-700">AIME 2025 (Math)</span>
78
+ <span class="text-sm font-bold text-purple-600">3.45% (+15%)</span>
79
  </div>
80
  <div class="w-full bg-gray-200 rounded-full h-2">
81
+ <div class="bg-purple-600 h-2 rounded-full" style="width: 3.5%"></div>
82
  </div>
83
  </div>
84
  <div>
85
  <div class="flex justify-between mb-1">
86
+ <span class="text-sm font-medium text-gray-700">GPQA Diamond</span>
87
+ <span class="text-sm font-bold text-blue-600">25.9% (+8%)</span>
88
  </div>
89
  <div class="w-full bg-gray-200 rounded-full h-2">
90
+ <div class="bg-blue-600 h-2 rounded-full" style="width: 25.9%"></div>
91
  </div>
92
  </div>
93
  <div>
94
  <div class="flex justify-between mb-1">
95
+ <span class="text-sm font-medium text-gray-700">IFBench</span>
96
+ <span class="text-sm font-bold text-blue-600">21.6% (+8%)</span>
97
  </div>
98
  <div class="w-full bg-gray-200 rounded-full h-2">
99
+ <div class="bg-blue-600 h-2 rounded-full" style="width: 21.6%"></div>
100
  </div>
101
  </div>
102
  </div>
103
  </div>
104
  </div>
105
 
106
+ <!-- Charts Grid -->
107
+ <h2 class="text-2xl font-bold text-gray-900 mb-6">Benchmark Performance</h2>
108
+ <div class="grid grid-cols-1 md:grid-cols-2 xl:grid-cols-3 gap-6 mb-12">
109
+
110
+ <div class="card p-4">
111
+ <h3 class="text-md font-bold text-gray-800 mb-4">Terminal-Bench Hard</h3>
112
+ <div class="chart-container"><canvas id="terminalChart"></canvas></div>
113
+ </div>
114
 
115
+ <div class="card p-4">
116
+ <h3 class="text-md font-bold text-gray-800 mb-4">𝜏²-Bench Telecom</h3>
117
+ <div class="chart-container"><canvas id="telecomChart"></canvas></div>
 
 
 
118
  </div>
 
119
 
120
+ <div class="card p-4">
121
+ <h3 class="text-md font-bold text-gray-800 mb-4">AA-LCR (Long Context)</h3>
122
+ <div class="chart-container"><canvas id="aalcrChart"></canvas></div>
 
 
 
 
 
 
 
 
 
123
  </div>
124
 
125
+ <div class="card p-4">
126
+ <h3 class="text-md font-bold text-gray-800 mb-4">Humanity's Last Exam</h3>
127
+ <div class="chart-container"><canvas id="hleChart"></canvas></div>
 
 
 
 
 
 
128
  </div>
129
 
130
+ <div class="card p-4">
131
+ <h3 class="text-md font-bold text-gray-800 mb-4">MMLU-Pro</h3>
132
+ <div class="chart-container"><canvas id="mmluProChart"></canvas></div>
 
 
 
 
 
 
133
  </div>
134
 
135
+ <div class="card p-4">
136
+ <h3 class="text-md font-bold text-gray-800 mb-4">GPQA Diamond</h3>
137
+ <div class="chart-container"><canvas id="gpqaChart"></canvas></div>
 
 
 
 
 
 
138
  </div>
139
 
140
+ <div class="card p-4">
141
+ <h3 class="text-md font-bold text-gray-800 mb-4">LiveCodeBench</h3>
142
+ <div class="chart-container"><canvas id="liveCodeChart"></canvas></div>
 
 
 
 
 
 
143
  </div>
144
 
145
+ <div class="card p-4">
146
+ <h3 class="text-md font-bold text-gray-800 mb-4">SciCode</h3>
147
+ <div class="chart-container"><canvas id="sciCodeChart"></canvas></div>
148
+ </div>
149
+
150
+ <div class="card p-4">
151
+ <h3 class="text-md font-bold text-gray-800 mb-4">IFBench</h3>
152
+ <div class="chart-container"><canvas id="ifBenchChart"></canvas></div>
153
+ </div>
154
+
155
+ <div class="card p-4">
156
+ <h3 class="text-md font-bold text-gray-800 mb-4">AIME 2025 (Math)</h3>
157
+ <div class="chart-container"><canvas id="aimeChart"></canvas></div>
158
+ </div>
159
+
160
+ <div class="card p-4">
161
+ <h3 class="text-md font-bold text-gray-800 mb-4">CritPt (Physics)</h3>
162
+ <div class="chart-container"><canvas id="critPtChart"></canvas></div>
163
+ </div>
164
+
165
+ <div class="card p-4">
166
+ <h3 class="text-md font-bold text-gray-800 mb-4">MMMU Pro (Visual)</h3>
167
+ <div class="chart-container"><canvas id="mmmuChart"></canvas></div>
168
  </div>
169
 
170
  </div>
 
173
  <div class="card overflow-hidden mb-12">
174
  <div class="px-6 py-4 border-b border-gray-100 bg-gray-50">
175
  <h3 class="text-lg font-bold text-gray-800">Detailed Benchmark Results</h3>
176
+ <p class="text-sm text-gray-500">Comparison of Base vs. Thinking (variable 6-15% gain).</p>
177
  </div>
178
  <div class="overflow-x-auto">
179
  <table class="min-w-full text-left text-sm whitespace-nowrap">
180
  <thead>
181
  <tr class="bg-gray-50 border-b border-gray-100 text-gray-500 uppercase tracking-wider text-xs">
182
  <th class="px-6 py-4 font-semibold">Benchmark</th>
183
+ <th class="px-6 py-4 font-semibold">Category</th>
184
  <th class="px-6 py-4 font-semibold text-center">Base Score (1B)</th>
185
  <th class="px-6 py-4 font-semibold text-center">Thinking Score</th>
186
  <th class="px-6 py-4 font-semibold text-right">Boost</th>
187
  </tr>
188
  </thead>
189
  <tbody class="divide-y divide-gray-100">
190
+ <!-- Terminal-Bench Hard -->
191
+ <tr class="table-row-hover">
192
+ <td class="px-6 py-4 font-bold text-blue-600">Terminal-Bench Hard</td>
193
+ <td class="px-6 py-4 text-gray-600">Agentic Coding</td>
194
+ <td class="px-6 py-4 text-center text-gray-500">5.0%</td>
195
+ <td class="px-6 py-4 text-center"><span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">5.4%</span></td>
196
+ <td class="px-6 py-4 text-right"><span class="bg-blue-100 text-blue-800 px-2 py-0.5 rounded text-xs">+8%</span></td>
197
+ </tr>
198
+ <!-- Tau-Bench -->
199
+ <tr class="table-row-hover">
200
+ <td class="px-6 py-4 font-bold text-blue-600">𝜏²-Bench Telecom</td>
201
+ <td class="px-6 py-4 text-gray-600">Agentic Tool Use</td>
202
+ <td class="px-6 py-4 text-center text-gray-500">5.0%</td>
203
+ <td class="px-6 py-4 text-center"><span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">5.35%</span></td>
204
  <td class="px-6 py-4 text-right"><span class="bg-blue-100 text-blue-800 px-2 py-0.5 rounded text-xs">+7%</span></td>
205
  </tr>
206
+ <!-- AA-LCR -->
207
+ <tr class="table-row-hover">
208
+ <td class="px-6 py-4 font-bold text-blue-600">AA-LCR</td>
209
+ <td class="px-6 py-4 text-gray-600">Long Context Reasoning</td>
210
+ <td class="px-6 py-4 text-center text-gray-500">10.0%</td>
211
+ <td class="px-6 py-4 text-center"><span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">10.9%</span></td>
 
 
 
 
212
  <td class="px-6 py-4 text-right"><span class="bg-blue-100 text-blue-800 px-2 py-0.5 rounded text-xs">+9%</span></td>
213
  </tr>
214
+ <!-- HLE -->
215
+ <tr class="table-row-hover">
216
+ <td class="px-6 py-4 font-bold text-blue-600">Humanity's Last Exam</td>
217
+ <td class="px-6 py-4 text-gray-600">Reasoning & Knowledge</td>
218
+ <td class="px-6 py-4 text-center text-gray-500">5.2%</td>
219
+ <td class="px-6 py-4 text-center"><span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">5.6%</span></td>
 
 
 
 
220
  <td class="px-6 py-4 text-right"><span class="bg-blue-100 text-blue-800 px-2 py-0.5 rounded text-xs">+8%</span></td>
221
  </tr>
222
+ <!-- MMLU-Pro -->
223
+ <tr class="table-row-hover">
224
+ <td class="px-6 py-4 font-bold text-blue-600">MMLU-Pro</td>
225
+ <td class="px-6 py-4 text-gray-600">Reasoning & Knowledge</td>
226
+ <td class="px-6 py-4 text-center text-gray-500">14.0%</td>
227
+ <td class="px-6 py-4 text-center"><span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">15.3%</span></td>
228
+ <td class="px-6 py-4 text-right"><span class="bg-blue-100 text-blue-800 px-2 py-0.5 rounded text-xs">+9.2%</span></td>
 
 
 
 
229
  </tr>
230
  <!-- GPQA -->
231
+ <tr class="table-row-hover">
232
+ <td class="px-6 py-4 font-bold text-blue-600">GPQA Diamond</td>
233
+ <td class="px-6 py-4 text-gray-600">Scientific Reasoning</td>
234
+ <td class="px-6 py-4 text-center text-gray-500">24.0%</td>
235
+ <td class="px-6 py-4 text-center"><span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">25.9%</span></td>
236
+ <td class="px-6 py-4 text-right"><span class="bg-blue-100 text-blue-800 px-2 py-0.5 rounded text-xs">+8%</span></td>
 
 
 
 
237
  </tr>
238
+ <!-- LiveCodeBench -->
239
+ <tr class="table-row-hover">
240
+ <td class="px-6 py-4 font-bold text-blue-600">LiveCodeBench</td>
241
+ <td class="px-6 py-4 text-gray-600">Coding</td>
242
+ <td class="px-6 py-4 text-center text-gray-500">2.0%</td>
243
+ <td class="px-6 py-4 text-center"><span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">2.16%</span></td>
244
+ <td class="px-6 py-4 text-right"><span class="bg-blue-100 text-blue-800 px-2 py-0.5 rounded text-xs">+8%</span></td>
 
 
 
 
245
  </tr>
246
+ <!-- SciCode -->
247
+ <tr class="table-row-hover">
248
+ <td class="px-6 py-4 font-bold text-blue-600">SciCode</td>
249
+ <td class="px-6 py-4 text-gray-600">Scientific Coding</td>
250
+ <td class="px-6 py-4 text-center text-gray-500">1.0%</td>
251
+ <td class="px-6 py-4 text-center"><span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">1.06%</span></td>
252
+ <td class="px-6 py-4 text-right"><span class="bg-blue-100 text-blue-800 px-2 py-0.5 rounded text-xs">+6%</span></td>
 
 
 
 
253
  </tr>
254
+ <!-- IFBench -->
255
+ <tr class="table-row-hover">
256
+ <td class="px-6 py-4 font-bold text-blue-600">IFBench</td>
257
+ <td class="px-6 py-4 text-gray-600">Instruction Following</td>
258
+ <td class="px-6 py-4 text-center text-gray-500">20.0%</td>
259
+ <td class="px-6 py-4 text-center"><span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">21.6%</span></td>
260
+ <td class="px-6 py-4 text-right"><span class="bg-blue-100 text-blue-800 px-2 py-0.5 rounded text-xs">+8%</span></td>
 
 
 
 
261
  </tr>
262
+ <!-- AIME 2025 -->
263
+ <tr class="table-row-hover">
264
+ <td class="px-6 py-4 font-bold text-blue-600">AIME 2025</td>
265
+ <td class="px-6 py-4 text-gray-600">Competition Math</td>
266
+ <td class="px-6 py-4 text-center text-gray-500">3.0%</td>
267
+ <td class="px-6 py-4 text-center"><span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">3.45%</span></td>
268
+ <td class="px-6 py-4 text-right"><span class="bg-purple-100 text-purple-800 px-2 py-0.5 rounded text-xs font-bold">+15%</span></td>
269
+ </tr>
270
+ <!-- CritPt -->
271
+ <tr class="table-row-hover">
272
+ <td class="px-6 py-4 font-bold text-blue-600">CritPt</td>
273
+ <td class="px-6 py-4 text-gray-600">Physics Reasoning</td>
274
+ <td class="px-6 py-4 text-center text-gray-500">0.5%</td>
275
+ <td class="px-6 py-4 text-center"><span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">0.54%</span></td>
276
+ <td class="px-6 py-4 text-right"><span class="bg-blue-100 text-blue-800 px-2 py-0.5 rounded text-xs">+8%</span></td>
277
+ </tr>
278
+ <!-- MMMU Pro -->
279
+ <tr class="table-row-hover">
280
+ <td class="px-6 py-4 font-bold text-blue-600">MMMU Pro</td>
281
+ <td class="px-6 py-4 text-gray-600">Visual Reasoning</td>
282
+ <td class="px-6 py-4 text-center text-gray-500">0.0%</td>
283
+ <td class="px-6 py-4 text-center"><span class="bg-purple-100 text-purple-700 px-2 py-1 rounded font-bold">0.0%</span></td>
284
+ <td class="px-6 py-4 text-right"><span class="bg-gray-200 text-gray-600 px-2 py-0.5 rounded text-xs">N/A</span></td>
285
  </tr>
286
  </tbody>
287
  </table>
 
295
  // --- Shared Configurations ---
296
  Chart.defaults.font.family = "'Inter', sans-serif";
297
  Chart.defaults.color = '#64748b';
298
+ Chart.defaults.font.size = 10;
299
 
300
  const baseBlue = '#93c5fd'; // Color for Base Gemma 3 1B
301
  const thinkingColor = '#7c3aed'; // Color for Gemma 3 1B Thinking
 
307
  'GPT 5.1', // 1
308
  'Claude 4.5 Sonnet', // 2
309
  'Grok 4 Heavy', // 3
310
+ 'DeepSeek V3.2', // 4
311
+ 'Kimi K2 Thinking', // 5
312
  'GLM 4.6', // 6
313
  'Gemma 3 1B Thinking',// 7
314
  'Gemma 3 1B' // 8
315
  ];
316
 
317
+ // Benchmark Data (From Images + Estimates for 1B)
318
+ // Order: Gem3P, GPT5.1, C4.5, Grok4, DSV3.2, KimiK2, GLM4.6, G3Thinking, G3Base
 
 
 
 
 
 
 
 
319
 
320
+ const benchmarks = {
321
+ // Est Base 5%. +8% -> 5.4%
322
+ terminal: [0.39, 0.43, 0.33, 0.38, 0.33, 0.29, 0.23, 0.054, 0.05],
323
+
324
+ // Est Base 5%. +7% -> 5.35%
325
+ telecom: [0.87, 0.82, 0.78, 0.75, 0.91, 0.93, 0.71, 0.0535, 0.05],
326
+
327
+ // Est Base 10%. +9% -> 10.9%
328
+ aalcr: [0.71, 0.75, 0.66, 0.68, 0.65, 0.66, 0.54, 0.109, 0.10],
329
+
330
+ // Base 5.2%. +8% -> 5.6%
331
+ hle: [0.372, 0.265, 0.173, 0.239, 0.222, 0.223, 0.133, 0.056, 0.052],
332
+
333
+ // Base 14%. +9.2% -> 15.3%
334
+ mmluPro: [0.90, 0.87, 0.88, 0.87, 0.86, 0.85, 0.83, 0.153, 0.14],
335
+
336
+ // Base 24%. +8% -> 25.9%
337
+ gpqa: [0.91, 0.87, 0.83, 0.88, 0.84, 0.84, 0.78, 0.259, 0.24],
338
+
339
+ // Base 2%. +8% -> 2.16%
340
+ liveCode: [0.92, 0.87, 0.71, 0.82, 0.86, 0.85, 0.70, 0.0216, 0.02],
341
+
342
+ // Base 1%. +6% -> 1.06%
343
+ sciCode: [0.56, 0.43, 0.45, 0.46, 0.39, 0.42, 0.38, 0.0106, 0.01],
344
+
345
+ // Base 20%. +8% -> 21.6%
346
+ ifBench: [0.70, 0.73, 0.57, 0.54, 0.61, 0.68, 0.43, 0.216, 0.20],
347
+
348
+ // MATH - Base 3%. +15% -> 3.45%
349
+ aime: [0.96, 0.94, 0.88, 0.93, 0.92, 0.95, 0.86, 0.0345, 0.03],
350
+
351
+ // Base 0.5%. +8% -> 0.54%
352
+ critPt: [0.09, 0.05, 0.01, 0.02, 0.03, 0.03, 0.01, 0.0054, 0.005],
353
+
354
+ // Base 0 (Text only). No boost.
355
+ mmmu: [0.80, 0.76, 0.69, 0.69, 0.0, 0.0, 0.0, 0.0, 0.0]
356
+ };
357
 
358
 
359
  // Helper to create horizontal bar chart config
360
+ function createBarConfig(dataPoints) {
361
  // Combine labels and data for sorting
362
  let combined = modelList.map((label, i) => {
363
  return { label: label, value: dataPoints[i] };
 
405
  scales: {
406
  x: {
407
  beginAtZero: true,
408
+ grid: { display: false },
409
+ ticks: { display: true }
410
  },
411
  y: {
412
  grid: { display: false },
413
+ ticks: { font: { weight: '500' }, autoSkip: false }
414
  }
415
  }
416
  }
 
418
  }
419
 
420
  // --- Render Charts ---
421
+ new Chart(document.getElementById('terminalChart').getContext('2d'), createBarConfig(benchmarks.terminal));
422
+ new Chart(document.getElementById('telecomChart').getContext('2d'), createBarConfig(benchmarks.telecom));
423
+ new Chart(document.getElementById('aalcrChart').getContext('2d'), createBarConfig(benchmarks.aalcr));
424
+ new Chart(document.getElementById('hleChart').getContext('2d'), createBarConfig(benchmarks.hle));
425
+ new Chart(document.getElementById('mmluProChart').getContext('2d'), createBarConfig(benchmarks.mmluPro));
426
+ new Chart(document.getElementById('gpqaChart').getContext('2d'), createBarConfig(benchmarks.gpqa));
427
+ new Chart(document.getElementById('liveCodeChart').getContext('2d'), createBarConfig(benchmarks.liveCode));
428
+ new Chart(document.getElementById('sciCodeChart').getContext('2d'), createBarConfig(benchmarks.sciCode));
429
+ new Chart(document.getElementById('ifBenchChart').getContext('2d'), createBarConfig(benchmarks.ifBench));
430
+ new Chart(document.getElementById('aimeChart').getContext('2d'), createBarConfig(benchmarks.aime));
431
+ new Chart(document.getElementById('critPtChart').getContext('2d'), createBarConfig(benchmarks.critPt));
432
+ new Chart(document.getElementById('mmmuChart').getContext('2d'), createBarConfig(benchmarks.mmmu));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
 
434
  </script>
435
  </body>