appvoid committed on
Commit
2faf7af
·
verified ·
1 Parent(s): 0102db1

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +23 -15
index.html CHANGED
@@ -278,6 +278,14 @@
278
  const models = [
279
  {
280
  rank: 1,
 
 
 
 
 
 
 
 
281
  name: "Qwen3-4B-Instruct",
282
  score: 102,
283
  maxScore: 125,
@@ -285,7 +293,7 @@
285
  weaknesses: "Prone to factual hallucinations in summarization tasks."
286
  },
287
  {
288
- rank: 2,
289
  name: "lfm2-8b",
290
  score: 99,
291
  maxScore: 125,
@@ -293,7 +301,7 @@
293
  weaknesses: "Struggles with creative tasks like rhyming and procedural sequencing."
294
  },
295
  {
296
- rank: 3,
297
  name: "granite-3.1-3b-instruct",
298
  score: 93.5,
299
  maxScore: 125,
@@ -301,7 +309,7 @@
301
  weaknesses: "Unreliable; frequently outputs junk characters ('{') instead of answering."
302
  },
303
  {
304
- rank: 4,
305
  name: "lfm2-2.6b",
306
  score: 93.5,
307
  maxScore: 125,
@@ -309,7 +317,7 @@
309
  weaknesses: "Significant weakness in analogy, rhyming, and sequencing tasks."
310
  },
311
  {
312
- rank: 5,
313
  name: "Qwen3-1.7B",
314
  score: 92.5,
315
  maxScore: 125,
@@ -317,7 +325,7 @@
317
  weaknesses: "Fails completely on rhyming and has some odd analogy mistakes."
318
  },
319
  {
320
- rank: 6,
321
  name: "Llama-3.2-1B-Instruct",
322
  score: 92,
323
  maxScore: 125,
@@ -325,7 +333,7 @@
325
  weaknesses: "Fails badly on misconception correction, sequencing, and paraphrasing."
326
  },
327
  {
328
- rank: 7,
329
  name: "lfm2-1.2b",
330
  score: 90.5,
331
  maxScore: 125,
@@ -333,7 +341,7 @@
333
  weaknesses: "Knowledge gaps (object location) and hallucinates facts in headlines."
334
  },
335
  {
336
- rank: 8,
337
  name: "Falcon-H1-1.5B-Deep-Instruct",
338
  score: 89,
339
  maxScore: 125,
@@ -341,7 +349,7 @@
341
  weaknesses: "Very poor at logical deduction, rhyming, and categorization."
342
  },
343
  {
344
- rank: 9,
345
  name: "Falcon-H1-1.5B-Instruct",
346
  score: 81,
347
  maxScore: 125,
@@ -349,7 +357,7 @@
349
  weaknesses: "Fails translation completely and often gives blank/junk answers."
350
  },
351
  {
352
- rank: 10,
353
  name: "lfm2-700m",
354
  score: 75.5,
355
  maxScore: 125,
@@ -357,7 +365,7 @@
357
  weaknesses: "Many failures in reasoning (cause/effect), tool use, synonyms, and grammar."
358
  },
359
  {
360
- rank: 11,
361
  name: "qwen2.5-0.5b-instruct",
362
  score: 72,
363
  maxScore: 125,
@@ -365,7 +373,7 @@
365
  weaknesses: "Fails creative tasks (rhyming, synonyms) and suffers major headline hallucinations."
366
  },
367
  {
368
- rank: 12,
369
  name: "Dolphin3.0-Qwen2.5-0.5B",
370
  score: 69.5,
371
  maxScore: 125,
@@ -373,7 +381,7 @@
373
  weaknesses: "Completely fails synonym generation and most grammar correction tasks."
374
  },
375
  {
376
- rank: 13,
377
  name: "qwen3-0.6B",
378
  score: 67,
379
  maxScore: 125,
@@ -381,7 +389,7 @@
381
  weaknesses: "Riddled with bizarre, nonsensical answers (e.g., '3D', '2D Notation')."
382
  },
383
  {
384
- rank: 14,
385
  name: "qwen2.5-0.5B",
386
  score: 60,
387
  maxScore: 125,
@@ -389,7 +397,7 @@
389
  weaknesses: "Very unreliable; outputs long numbers for text tasks, fails creative tasks."
390
  },
391
  {
392
- rank: 15,
393
  name: "NxMobileLM-1.5B-SFT",
394
  score: 59.5,
395
  maxScore: 125,
@@ -397,7 +405,7 @@
397
  weaknesses: "Extremely unreliable, with frequent junk ('{', '1', emojis) or non-English outputs."
398
  },
399
  {
400
- rank: 16,
401
  name: "prithivMLmods-QWQ-500M",
402
  score: 55,
403
  maxScore: 125,
 
278
  const models = [
279
  {
280
  rank: 1,
281
+ name: "granite-4.0-h-tiny",
282
+ score: 103.5,
283
+ maxScore: 125,
284
+ strengths: "Extremely well-rounded; top-tier in logic, math, translation, and synonyms.",
285
+ weaknesses: "Fails completely at rhyming; hallucinates facts in summarization tasks."
286
+ },
287
+ {
288
+ rank: 2,
289
  name: "Qwen3-4B-Instruct",
290
  score: 102,
291
  maxScore: 125,
 
293
  weaknesses: "Prone to factual hallucinations in summarization tasks."
294
  },
295
  {
296
+ rank: 3,
297
  name: "lfm2-8b",
298
  score: 99,
299
  maxScore: 125,
 
301
  weaknesses: "Struggles with creative tasks like rhyming and procedural sequencing."
302
  },
303
  {
304
+ rank: 4,
305
  name: "granite-3.1-3b-instruct",
306
  score: 93.5,
307
  maxScore: 125,
 
309
  weaknesses: "Unreliable; frequently outputs junk characters ('{') instead of answering."
310
  },
311
  {
312
+ rank: 5,
313
  name: "lfm2-2.6b",
314
  score: 93.5,
315
  maxScore: 125,
 
317
  weaknesses: "Significant weakness in analogy, rhyming, and sequencing tasks."
318
  },
319
  {
320
+ rank: 6,
321
  name: "Qwen3-1.7B",
322
  score: 92.5,
323
  maxScore: 125,
 
325
  weaknesses: "Fails completely on rhyming and has some odd analogy mistakes."
326
  },
327
  {
328
+ rank: 7,
329
  name: "Llama-3.2-1B-Instruct",
330
  score: 92,
331
  maxScore: 125,
 
333
  weaknesses: "Fails badly on misconception correction, sequencing, and paraphrasing."
334
  },
335
  {
336
+ rank: 8,
337
  name: "lfm2-1.2b",
338
  score: 90.5,
339
  maxScore: 125,
 
341
  weaknesses: "Knowledge gaps (object location) and hallucinates facts in headlines."
342
  },
343
  {
344
+ rank: 9,
345
  name: "Falcon-H1-1.5B-Deep-Instruct",
346
  score: 89,
347
  maxScore: 125,
 
349
  weaknesses: "Very poor at logical deduction, rhyming, and categorization."
350
  },
351
  {
352
+ rank: 10,
353
  name: "Falcon-H1-1.5B-Instruct",
354
  score: 81,
355
  maxScore: 125,
 
357
  weaknesses: "Fails translation completely and often gives blank/junk answers."
358
  },
359
  {
360
+ rank: 11,
361
  name: "lfm2-700m",
362
  score: 75.5,
363
  maxScore: 125,
 
365
  weaknesses: "Many failures in reasoning (cause/effect), tool use, synonyms, and grammar."
366
  },
367
  {
368
+ rank: 12,
369
  name: "qwen2.5-0.5b-instruct",
370
  score: 72,
371
  maxScore: 125,
 
373
  weaknesses: "Fails creative tasks (rhyming, synonyms) and suffers major headline hallucinations."
374
  },
375
  {
376
+ rank: 13,
377
  name: "Dolphin3.0-Qwen2.5-0.5B",
378
  score: 69.5,
379
  maxScore: 125,
 
381
  weaknesses: "Completely fails synonym generation and most grammar correction tasks."
382
  },
383
  {
384
+ rank: 14,
385
  name: "qwen3-0.6B",
386
  score: 67,
387
  maxScore: 125,
 
389
  weaknesses: "Riddled with bizarre, nonsensical answers (e.g., '3D', '2D Notation')."
390
  },
391
  {
392
+ rank: 15,
393
  name: "qwen2.5-0.5B",
394
  score: 60,
395
  maxScore: 125,
 
397
  weaknesses: "Very unreliable; outputs long numbers for text tasks, fails creative tasks."
398
  },
399
  {
400
+ rank: 16,
401
  name: "NxMobileLM-1.5B-SFT",
402
  score: 59.5,
403
  maxScore: 125,
 
405
  weaknesses: "Extremely unreliable, with frequent junk ('{', '1', emojis) or non-English outputs."
406
  },
407
  {
408
+ rank: 17,
409
  name: "prithivMLmods-QWQ-500M",
410
  score: 55,
411
  maxScore: 125,