appvoid committed on
Commit
bfd97f5
·
verified ·
1 Parent(s): 2faf7af

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +81 -69
index.html CHANGED
@@ -3,6 +3,7 @@
3
  <head>
4
  <meta charset="UTF-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
 
6
  <style>
7
  * {
8
  margin: 0;
@@ -41,8 +42,9 @@
41
 
42
  .table-wrapper {
43
  text-align: center;
44
- background: black;
45
  border-radius: 16px;
 
46
  box-shadow: 0 20px 60px rgba(0,0,0,0.3);
47
  overflow: hidden;
48
  animation: fadeIn 0.6s ease-out;
@@ -78,10 +80,6 @@
78
  letter-spacing: 0.5px;
79
  }
80
 
81
- th:first-child {
82
- border-radius: 0;
83
- }
84
-
85
  tbody tr {
86
  border-bottom: 1px solid #222;
87
  transition: all 0.3s ease;
@@ -99,6 +97,11 @@
99
  td {
100
  padding: 18px 20px;
101
  font-size: 0.95rem;
 
 
 
 
 
102
  }
103
 
104
  .rank {
@@ -119,12 +122,13 @@
119
  .score {
120
  font-weight: 700;
121
  font-size: .8rem;
 
122
  }
123
 
124
  .progress-container {
125
  width: 100%;
126
  height: 8px;
127
- background: #e2e8f0;
128
  border-radius: 10px;
129
  overflow: hidden;
130
  margin-top: 8px;
@@ -154,7 +158,7 @@
154
  letter-spacing: 0.5px;
155
  }
156
 
157
- .badge-best {
158
  background: linear-gradient(135deg, #48bb78 0%, #38a169 100%);
159
  color: white;
160
  }
@@ -185,7 +189,29 @@
185
  font-weight: 500;
186
  font-size: 0.85rem;
187
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
 
 
 
 
 
 
189
  @media (max-width: 768px) {
190
  h1 {
191
  font-size: 1.8rem;
@@ -209,27 +235,6 @@
209
  }
210
  }
211
 
212
- .legend {
213
- display: flex;
214
- justify-content: center;
215
- gap: 20px;
216
- margin-top: 30px;
217
- flex-wrap: wrap;
218
- }
219
-
220
- .legend-item {
221
- display: flex;
222
- align-items: center;
223
- gap: 8px;
224
- color: white;
225
- font-size: 0.9rem;
226
- }
227
-
228
- .legend-color {
229
- width: 30px;
230
- height: 8px;
231
- border-radius: 4px;
232
- }
233
  </style>
234
  </head>
235
  <body>
@@ -269,7 +274,7 @@
269
  </div>
270
  <div class="legend-item">
271
  <div class="legend-color" style="background: linear-gradient(90deg, #f56565 0%, #e53e3e 100%);"></div>
272
- <span>Poor (<69)</span>
273
  </div>
274
  </div>
275
  </div>
@@ -280,7 +285,6 @@
280
  rank: 1,
281
  name: "granite-4.0-h-tiny",
282
  score: 103.5,
283
- maxScore: 125,
284
  strengths: "Extremely well-rounded; top-tier in logic, math, translation, and synonyms.",
285
  weaknesses: "Fails completely at rhyming; hallucinates facts in summarization tasks."
286
  },
@@ -288,7 +292,6 @@
288
  rank: 2,
289
  name: "Qwen3-4B-Instruct",
290
  score: 102,
291
- maxScore: 125,
292
  strengths: "Top performer, excels in core NLP, logic, and factual recall.",
293
  weaknesses: "Prone to factual hallucinations in summarization tasks."
294
  },
@@ -296,126 +299,134 @@
296
  rank: 3,
297
  name: "lfm2-8b",
298
  score: 99,
299
- maxScore: 125,
300
  strengths: "Very logical, provides detailed, nuanced answers, strong at misconception correction.",
301
  weaknesses: "Struggles with creative tasks like rhyming and procedural sequencing."
302
  },
303
  {
304
  rank: 4,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  name: "granite-3.1-3b-instruct",
306
  score: 93.5,
307
- maxScore: 125,
308
  strengths: "Highly capable when it works; excellent at summarization and logic.",
309
  weaknesses: "Unreliable; frequently outputs junk characters ('{') instead of answering."
310
  },
311
  {
312
- rank: 5,
313
  name: "lfm2-2.6b",
314
  score: 93.5,
315
- maxScore: 125,
316
  strengths: "Strong core capabilities, great at grammar and misconception correction.",
317
  weaknesses: "Significant weakness in analogy, rhyming, and sequencing tasks."
 
 
 
 
 
 
 
318
  },
319
  {
320
- rank: 6,
321
  name: "Qwen3-1.7B",
322
  score: 92.5,
323
- maxScore: 125,
324
  strengths: "Good overall performance on core tasks and math.",
325
  weaknesses: "Fails completely on rhyming and has some odd analogy mistakes."
326
  },
327
  {
328
- rank: 7,
329
  name: "Llama-3.2-1B-Instruct",
330
  score: 92,
331
- maxScore: 125,
332
  strengths: "Great at core NLP, math, and code generation.",
333
  weaknesses: "Fails badly on misconception correction, sequencing, and paraphrasing."
334
  },
335
  {
336
- rank: 8,
337
  name: "lfm2-1.2b",
338
  score: 90.5,
339
- maxScore: 125,
340
  strengths: "Strong core skills like grammar, math, and translation.",
341
  weaknesses: "Knowledge gaps (object location) and hallucinates facts in headlines."
342
  },
343
  {
344
- rank: 9,
345
  name: "Falcon-H1-1.5B-Deep-Instruct",
346
  score: 89,
347
- maxScore: 125,
348
  strengths: "Excellent summarizer and paraphraser, strong on synonyms.",
349
  weaknesses: "Very poor at logical deduction, rhyming, and categorization."
350
  },
351
  {
352
- rank: 10,
353
  name: "Falcon-H1-1.5B-Instruct",
354
  score: 81,
355
- maxScore: 125,
356
  strengths: "Good at logic, math, and factual questions.",
357
  weaknesses: "Fails translation completely and often gives blank/junk answers."
358
  },
359
  {
360
- rank: 11,
361
  name: "lfm2-700m",
362
  score: 75.5,
363
- maxScore: 125,
364
  strengths: "Handles sentiment, math, and logic correctly.",
365
  weaknesses: "Many failures in reasoning (cause/effect), tool use, synonyms, and grammar."
366
  },
367
  {
368
- rank: 12,
369
  name: "qwen2.5-0.5b-instruct",
370
  score: 72,
371
- maxScore: 125,
372
  strengths: "Decent at math, basic commands, and some logic.",
373
  weaknesses: "Fails creative tasks (rhyming, synonyms) and suffers major headline hallucinations."
374
  },
375
  {
376
- rank: 13,
377
  name: "Dolphin3.0-Qwen2.5-0.5B",
378
  score: 69.5,
379
- maxScore: 125,
380
  strengths: "Best of the small models; handles math and antonyms well.",
381
  weaknesses: "Completely fails synonym generation and most grammar correction tasks."
382
  },
383
  {
384
- rank: 14,
385
  name: "qwen3-0.6B",
386
  score: 67,
387
- maxScore: 125,
388
  strengths: "Correct on basic math and antonyms.",
389
  weaknesses: "Riddled with bizarre, nonsensical answers (e.g., '3D', '2D Notation')."
390
  },
391
  {
392
- rank: 15,
 
 
 
 
 
 
 
393
  name: "qwen2.5-0.5B",
394
  score: 60,
395
- maxScore: 125,
396
  strengths: "Passes basic math and antonym tasks.",
397
  weaknesses: "Very unreliable; outputs long numbers for text tasks, fails creative tasks."
398
  },
399
  {
400
- rank: 16,
401
  name: "NxMobileLM-1.5B-SFT",
402
  score: 59.5,
403
- maxScore: 125,
404
  strengths: "Passes math and some grammar/logic.",
405
  weaknesses: "Extremely unreliable, with frequent junk ('{', '1', emojis) or non-English outputs."
406
- },
407
- {
408
- rank: 17,
409
- name: "prithivMLmods-QWQ-500M",
410
- score: 55,
411
- maxScore: 125,
412
- strengths: "Handles math and most logic correctly.",
413
- weaknesses: "Very poor overall; fails most creative tasks, hallucinates facts, outputs numbers for text."
414
  }
415
  ];
416
 
 
 
417
  function getRatingBadge(score) {
418
- if (score >= 108) return '<span class="badge badge-best">Excellent</span>';
419
  if (score >= 91) return '<span class="badge badge-good">Good</span>';
420
  if (score >= 69) return '<span class="badge badge-average">Average</span>';
421
  return '<span class="badge badge-poor">Poor</span>';
@@ -425,16 +436,17 @@
425
  const tbody = document.querySelector('#performanceTable tbody');
426
 
427
  models.forEach((model, index) => {
428
- const percentage = (model.score / model.maxScore) * 100;
429
 
430
  const row = document.createElement('tr');
431
- row.style.animationDelay = `${index * 0.1}s`;
 
432
 
433
  row.innerHTML = `
434
  <td class="rank">#${model.rank}</td>
435
  <td class="model-name">${model.name}</td>
436
  <td>
437
- <div class="score">${model.score} / ${model.maxScore}</div>
438
  <div class="progress-container">
439
  <div class="progress-bar" style="width: ${percentage}%"></div>
440
  </div>
 
3
  <head>
4
  <meta charset="UTF-8">
5
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Meta Leaderboard - Top 20 Models</title>
7
  <style>
8
  * {
9
  margin: 0;
 
42
 
43
  .table-wrapper {
44
  text-align: center;
45
+ background: #111;
46
  border-radius: 16px;
47
+ border: 1px solid #333;
48
  box-shadow: 0 20px 60px rgba(0,0,0,0.3);
49
  overflow: hidden;
50
  animation: fadeIn 0.6s ease-out;
 
80
  letter-spacing: 0.5px;
81
  }
82
 
 
 
 
 
83
  tbody tr {
84
  border-bottom: 1px solid #222;
85
  transition: all 0.3s ease;
 
97
  td {
98
  padding: 18px 20px;
99
  font-size: 0.95rem;
100
+ text-align: left;
101
+ }
102
+
103
+ td:first-child, td:last-child {
104
+ text-align: center;
105
  }
106
 
107
  .rank {
 
122
  .score {
123
  font-weight: 700;
124
  font-size: .8rem;
125
+ text-align: center;
126
  }
127
 
128
  .progress-container {
129
  width: 100%;
130
  height: 8px;
131
+ background: #444;
132
  border-radius: 10px;
133
  overflow: hidden;
134
  margin-top: 8px;
 
158
  letter-spacing: 0.5px;
159
  }
160
 
161
+ .badge-excellent {
162
  background: linear-gradient(135deg, #48bb78 0%, #38a169 100%);
163
  color: white;
164
  }
 
189
  font-weight: 500;
190
  font-size: 0.85rem;
191
  }
192
+
193
+ .legend {
194
+ display: flex;
195
+ justify-content: center;
196
+ gap: 20px;
197
+ margin-top: 30px;
198
+ flex-wrap: wrap;
199
+ }
200
+
201
+ .legend-item {
202
+ display: flex;
203
+ align-items: center;
204
+ gap: 8px;
205
+ color: white;
206
+ font-size: 0.9rem;
207
+ }
208
 
209
+ .legend-color {
210
+ width: 30px;
211
+ height: 8px;
212
+ border-radius: 4px;
213
+ }
214
+
215
  @media (max-width: 768px) {
216
  h1 {
217
  font-size: 1.8rem;
 
235
  }
236
  }
237
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  </style>
239
  </head>
240
  <body>
 
274
  </div>
275
  <div class="legend-item">
276
  <div class="legend-color" style="background: linear-gradient(90deg, #f56565 0%, #e53e3e 100%);"></div>
277
+ <span>Poor (&lt;69)</span>
278
  </div>
279
  </div>
280
  </div>
 
285
  rank: 1,
286
  name: "granite-4.0-h-tiny",
287
  score: 103.5,
 
288
  strengths: "Extremely well-rounded; top-tier in logic, math, translation, and synonyms.",
289
  weaknesses: "Fails completely at rhyming; hallucinates facts in summarization tasks."
290
  },
 
292
  rank: 2,
293
  name: "Qwen3-4B-Instruct",
294
  score: 102,
 
295
  strengths: "Top performer, excels in core NLP, logic, and factual recall.",
296
  weaknesses: "Prone to factual hallucinations in summarization tasks."
297
  },
 
299
  rank: 3,
300
  name: "lfm2-8b",
301
  score: 99,
 
302
  strengths: "Very logical, provides detailed, nuanced answers, strong at misconception correction.",
303
  weaknesses: "Struggles with creative tasks like rhyming and procedural sequencing."
304
  },
305
  {
306
  rank: 4,
307
+ name: "Qwen3-MOE-4x0.6B-2.4B-Writing-Thunder-V1.2.Q8_0.gguf",
308
+ score: 96,
309
+ strengths: "Strong in logic, math, grammar, and summarization.",
310
+ weaknesses: "Struggles with rhyming, synonyms, some translation, and procedural sequencing."
311
+ },
312
+ {
313
+ rank: 5,
314
+ name: "granite-3.3-2b-instruct-Q8_0.gguf",
315
+ score: 95,
316
+ strengths: "Excels at core NLP, logic, math, and misconception correction.",
317
+ weaknesses: "Fails completely at NER, rhyming, and procedural sequencing."
318
+ },
319
+ {
320
+ rank: 6,
321
  name: "granite-3.1-3b-instruct",
322
  score: 93.5,
 
323
  strengths: "Highly capable when it works; excellent at summarization and logic.",
324
  weaknesses: "Unreliable; frequently outputs junk characters ('{') instead of answering."
325
  },
326
  {
327
+ rank: 7,
328
  name: "lfm2-2.6b",
329
  score: 93.5,
 
330
  strengths: "Strong core capabilities, great at grammar and misconception correction.",
331
  weaknesses: "Significant weakness in analogy, rhyming, and sequencing tasks."
332
+ },
333
+ {
334
+ rank: 8,
335
+ name: "EXAONE-3.5-2.4B-Instruct-abliterated.Q8_0.gguf",
336
+ score: 93,
337
+ strengths: "Excellent at reasoning, summarization, grammar, and misconception correction.",
338
+ weaknesses: "Fails completely at translation and sequencing; unreliable output formatting."
339
  },
340
  {
341
+ rank: 9,
342
  name: "Qwen3-1.7B",
343
  score: 92.5,
 
344
  strengths: "Good overall performance on core tasks and math.",
345
  weaknesses: "Fails completely on rhyming and has some odd analogy mistakes."
346
  },
347
  {
348
+ rank: 10,
349
  name: "Llama-3.2-1B-Instruct",
350
  score: 92,
 
351
  strengths: "Great at core NLP, math, and code generation.",
352
  weaknesses: "Fails badly on misconception correction, sequencing, and paraphrasing."
353
  },
354
  {
355
+ rank: 11,
356
  name: "lfm2-1.2b",
357
  score: 90.5,
 
358
  strengths: "Strong core skills like grammar, math, and translation.",
359
  weaknesses: "Knowledge gaps (object location) and hallucinates facts in headlines."
360
  },
361
  {
362
+ rank: 12,
363
  name: "Falcon-H1-1.5B-Deep-Instruct",
364
  score: 89,
 
365
  strengths: "Excellent summarizer and paraphraser, strong on synonyms.",
366
  weaknesses: "Very poor at logical deduction, rhyming, and categorization."
367
  },
368
  {
369
+ rank: 13,
370
  name: "Falcon-H1-1.5B-Instruct",
371
  score: 81,
 
372
  strengths: "Good at logic, math, and factual questions.",
373
  weaknesses: "Fails translation completely and often gives blank/junk answers."
374
  },
375
  {
376
+ rank: 14,
377
  name: "lfm2-700m",
378
  score: 75.5,
 
379
  strengths: "Handles sentiment, math, and logic correctly.",
380
  weaknesses: "Many failures in reasoning (cause/effect), tool use, synonyms, and grammar."
381
  },
382
  {
383
+ rank: 15,
384
  name: "qwen2.5-0.5b-instruct",
385
  score: 72,
 
386
  strengths: "Decent at math, basic commands, and some logic.",
387
  weaknesses: "Fails creative tasks (rhyming, synonyms) and suffers major headline hallucinations."
388
  },
389
  {
390
+ rank: 16,
391
  name: "Dolphin3.0-Qwen2.5-0.5B",
392
  score: 69.5,
 
393
  strengths: "Best of the small models; handles math and antonyms well.",
394
  weaknesses: "Completely fails synonym generation and most grammar correction tasks."
395
  },
396
  {
397
+ rank: 17,
398
  name: "qwen3-0.6B",
399
  score: 67,
 
400
  strengths: "Correct on basic math and antonyms.",
401
  weaknesses: "Riddled with bizarre, nonsensical answers (e.g., '3D', '2D Notation')."
402
  },
403
  {
404
+ rank: 18,
405
+ name: "Auto-Completer-0.2.Q8_0.gguf",
406
+ score: 60,
407
+ strengths: "Perfect in Antonyms, Translation, Math, and Logic.",
408
+ weaknesses: "Complete failure in most other areas; reinforces misconceptions, cannot follow sequences."
409
+ },
410
+ {
411
+ rank: 19,
412
  name: "qwen2.5-0.5B",
413
  score: 60,
 
414
  strengths: "Passes basic math and antonym tasks.",
415
  weaknesses: "Very unreliable; outputs long numbers for text tasks, fails creative tasks."
416
  },
417
  {
418
+ rank: 20,
419
  name: "NxMobileLM-1.5B-SFT",
420
  score: 59.5,
 
421
  strengths: "Passes math and some grammar/logic.",
422
  weaknesses: "Extremely unreliable, with frequent junk ('{', '1', emojis) or non-English outputs."
 
 
 
 
 
 
 
 
423
  }
424
  ];
425
 
426
+ const maxScore = 125;
427
+
428
  function getRatingBadge(score) {
429
+ if (score >= 108) return '<span class="badge badge-excellent">Excellent</span>';
430
  if (score >= 91) return '<span class="badge badge-good">Good</span>';
431
  if (score >= 69) return '<span class="badge badge-average">Average</span>';
432
  return '<span class="badge badge-poor">Poor</span>';
 
436
  const tbody = document.querySelector('#performanceTable tbody');
437
 
438
  models.forEach((model, index) => {
439
+ const percentage = (model.score / maxScore) * 100;
440
 
441
  const row = document.createElement('tr');
442
+ row.style.animation = `fadeIn 0.5s ease-out ${index * 0.05}s forwards`;
443
+ row.style.opacity = 0;
444
 
445
  row.innerHTML = `
446
  <td class="rank">#${model.rank}</td>
447
  <td class="model-name">${model.name}</td>
448
  <td>
449
+ <div class="score">${model.score} / ${maxScore}</div>
450
  <div class="progress-container">
451
  <div class="progress-bar" style="width: ${percentage}%"></div>
452
  </div>