Spaces:
Running
Running
Update index.html
Browse files- index.html +23 -15
index.html
CHANGED
|
@@ -278,6 +278,14 @@
|
|
| 278 |
const models = [
|
| 279 |
{
|
| 280 |
rank: 1,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 281 |
name: "Qwen3-4B-Instruct",
|
| 282 |
score: 102,
|
| 283 |
maxScore: 125,
|
|
@@ -285,7 +293,7 @@
|
|
| 285 |
weaknesses: "Prone to factual hallucinations in summarization tasks."
|
| 286 |
},
|
| 287 |
{
|
| 288 |
-
rank:
|
| 289 |
name: "lfm2-8b",
|
| 290 |
score: 99,
|
| 291 |
maxScore: 125,
|
|
@@ -293,7 +301,7 @@
|
|
| 293 |
weaknesses: "Struggles with creative tasks like rhyming and procedural sequencing."
|
| 294 |
},
|
| 295 |
{
|
| 296 |
-
rank:
|
| 297 |
name: "granite-3.1-3b-instruct",
|
| 298 |
score: 93.5,
|
| 299 |
maxScore: 125,
|
|
@@ -301,7 +309,7 @@
|
|
| 301 |
weaknesses: "Unreliable; frequently outputs junk characters ('{') instead of answering."
|
| 302 |
},
|
| 303 |
{
|
| 304 |
-
rank:
|
| 305 |
name: "lfm2-2.6b",
|
| 306 |
score: 93.5,
|
| 307 |
maxScore: 125,
|
|
@@ -309,7 +317,7 @@
|
|
| 309 |
weaknesses: "Significant weakness in analogy, rhyming, and sequencing tasks."
|
| 310 |
},
|
| 311 |
{
|
| 312 |
-
rank:
|
| 313 |
name: "Qwen3-1.7B",
|
| 314 |
score: 92.5,
|
| 315 |
maxScore: 125,
|
|
@@ -317,7 +325,7 @@
|
|
| 317 |
weaknesses: "Fails completely on rhyming and has some odd analogy mistakes."
|
| 318 |
},
|
| 319 |
{
|
| 320 |
-
rank:
|
| 321 |
name: "Llama-3.2-1B-Instruct",
|
| 322 |
score: 92,
|
| 323 |
maxScore: 125,
|
|
@@ -325,7 +333,7 @@
|
|
| 325 |
weaknesses: "Fails badly on misconception correction, sequencing, and paraphrasing."
|
| 326 |
},
|
| 327 |
{
|
| 328 |
-
rank:
|
| 329 |
name: "lfm2-1.2b",
|
| 330 |
score: 90.5,
|
| 331 |
maxScore: 125,
|
|
@@ -333,7 +341,7 @@
|
|
| 333 |
weaknesses: "Knowledge gaps (object location) and hallucinates facts in headlines."
|
| 334 |
},
|
| 335 |
{
|
| 336 |
-
rank:
|
| 337 |
name: "Falcon-H1-1.5B-Deep-Instruct",
|
| 338 |
score: 89,
|
| 339 |
maxScore: 125,
|
|
@@ -341,7 +349,7 @@
|
|
| 341 |
weaknesses: "Very poor at logical deduction, rhyming, and categorization."
|
| 342 |
},
|
| 343 |
{
|
| 344 |
-
rank:
|
| 345 |
name: "Falcon-H1-1.5B-Instruct",
|
| 346 |
score: 81,
|
| 347 |
maxScore: 125,
|
|
@@ -349,7 +357,7 @@
|
|
| 349 |
weaknesses: "Fails translation completely and often gives blank/junk answers."
|
| 350 |
},
|
| 351 |
{
|
| 352 |
-
rank:
|
| 353 |
name: "lfm2-700m",
|
| 354 |
score: 75.5,
|
| 355 |
maxScore: 125,
|
|
@@ -357,7 +365,7 @@
|
|
| 357 |
weaknesses: "Many failures in reasoning (cause/effect), tool use, synonyms, and grammar."
|
| 358 |
},
|
| 359 |
{
|
| 360 |
-
rank:
|
| 361 |
name: "qwen2.5-0.5b-instruct",
|
| 362 |
score: 72,
|
| 363 |
maxScore: 125,
|
|
@@ -365,7 +373,7 @@
|
|
| 365 |
weaknesses: "Fails creative tasks (rhyming, synonyms) and suffers major headline hallucinations."
|
| 366 |
},
|
| 367 |
{
|
| 368 |
-
rank:
|
| 369 |
name: "Dolphin3.0-Qwen2.5-0.5B",
|
| 370 |
score: 69.5,
|
| 371 |
maxScore: 125,
|
|
@@ -373,7 +381,7 @@
|
|
| 373 |
weaknesses: "Completely fails synonym generation and most grammar correction tasks."
|
| 374 |
},
|
| 375 |
{
|
| 376 |
-
rank:
|
| 377 |
name: "qwen3-0.6B",
|
| 378 |
score: 67,
|
| 379 |
maxScore: 125,
|
|
@@ -381,7 +389,7 @@
|
|
| 381 |
weaknesses: "Riddled with bizarre, nonsensical answers (e.g., '3D', '2D Notation')."
|
| 382 |
},
|
| 383 |
{
|
| 384 |
-
rank:
|
| 385 |
name: "qwen2.5-0.5B",
|
| 386 |
score: 60,
|
| 387 |
maxScore: 125,
|
|
@@ -389,7 +397,7 @@
|
|
| 389 |
weaknesses: "Very unreliable; outputs long numbers for text tasks, fails creative tasks."
|
| 390 |
},
|
| 391 |
{
|
| 392 |
-
rank:
|
| 393 |
name: "NxMobileLM-1.5B-SFT",
|
| 394 |
score: 59.5,
|
| 395 |
maxScore: 125,
|
|
@@ -397,7 +405,7 @@
|
|
| 397 |
weaknesses: "Extremely unreliable, with frequent junk ('{', '1', emojis) or non-English outputs."
|
| 398 |
},
|
| 399 |
{
|
| 400 |
-
rank:
|
| 401 |
name: "prithivMLmods-QWQ-500M",
|
| 402 |
score: 55,
|
| 403 |
maxScore: 125,
|
|
|
|
| 278 |
const models = [
|
| 279 |
{
|
| 280 |
rank: 1,
|
| 281 |
+
name: "granite-4.0-h-tiny",
|
| 282 |
+
score: 103.5,
|
| 283 |
+
maxScore: 125,
|
| 284 |
+
strengths: "Extremely well-rounded; top-tier in logic, math, translation, and synonyms.",
|
| 285 |
+
weaknesses: "Fails completely at rhyming; hallucinates facts in summarization tasks."
|
| 286 |
+
},
|
| 287 |
+
{
|
| 288 |
+
rank: 2,
|
| 289 |
name: "Qwen3-4B-Instruct",
|
| 290 |
score: 102,
|
| 291 |
maxScore: 125,
|
|
|
|
| 293 |
weaknesses: "Prone to factual hallucinations in summarization tasks."
|
| 294 |
},
|
| 295 |
{
|
| 296 |
+
rank: 3,
|
| 297 |
name: "lfm2-8b",
|
| 298 |
score: 99,
|
| 299 |
maxScore: 125,
|
|
|
|
| 301 |
weaknesses: "Struggles with creative tasks like rhyming and procedural sequencing."
|
| 302 |
},
|
| 303 |
{
|
| 304 |
+
rank: 4,
|
| 305 |
name: "granite-3.1-3b-instruct",
|
| 306 |
score: 93.5,
|
| 307 |
maxScore: 125,
|
|
|
|
| 309 |
weaknesses: "Unreliable; frequently outputs junk characters ('{') instead of answering."
|
| 310 |
},
|
| 311 |
{
|
| 312 |
+
rank: 5,
|
| 313 |
name: "lfm2-2.6b",
|
| 314 |
score: 93.5,
|
| 315 |
maxScore: 125,
|
|
|
|
| 317 |
weaknesses: "Significant weakness in analogy, rhyming, and sequencing tasks."
|
| 318 |
},
|
| 319 |
{
|
| 320 |
+
rank: 6,
|
| 321 |
name: "Qwen3-1.7B",
|
| 322 |
score: 92.5,
|
| 323 |
maxScore: 125,
|
|
|
|
| 325 |
weaknesses: "Fails completely on rhyming and has some odd analogy mistakes."
|
| 326 |
},
|
| 327 |
{
|
| 328 |
+
rank: 7,
|
| 329 |
name: "Llama-3.2-1B-Instruct",
|
| 330 |
score: 92,
|
| 331 |
maxScore: 125,
|
|
|
|
| 333 |
weaknesses: "Fails badly on misconception correction, sequencing, and paraphrasing."
|
| 334 |
},
|
| 335 |
{
|
| 336 |
+
rank: 8,
|
| 337 |
name: "lfm2-1.2b",
|
| 338 |
score: 90.5,
|
| 339 |
maxScore: 125,
|
|
|
|
| 341 |
weaknesses: "Knowledge gaps (object location) and hallucinates facts in headlines."
|
| 342 |
},
|
| 343 |
{
|
| 344 |
+
rank: 9,
|
| 345 |
name: "Falcon-H1-1.5B-Deep-Instruct",
|
| 346 |
score: 89,
|
| 347 |
maxScore: 125,
|
|
|
|
| 349 |
weaknesses: "Very poor at logical deduction, rhyming, and categorization."
|
| 350 |
},
|
| 351 |
{
|
| 352 |
+
rank: 10,
|
| 353 |
name: "Falcon-H1-1.5B-Instruct",
|
| 354 |
score: 81,
|
| 355 |
maxScore: 125,
|
|
|
|
| 357 |
weaknesses: "Fails translation completely and often gives blank/junk answers."
|
| 358 |
},
|
| 359 |
{
|
| 360 |
+
rank: 11,
|
| 361 |
name: "lfm2-700m",
|
| 362 |
score: 75.5,
|
| 363 |
maxScore: 125,
|
|
|
|
| 365 |
weaknesses: "Many failures in reasoning (cause/effect), tool use, synonyms, and grammar."
|
| 366 |
},
|
| 367 |
{
|
| 368 |
+
rank: 12,
|
| 369 |
name: "qwen2.5-0.5b-instruct",
|
| 370 |
score: 72,
|
| 371 |
maxScore: 125,
|
|
|
|
| 373 |
weaknesses: "Fails creative tasks (rhyming, synonyms) and suffers major headline hallucinations."
|
| 374 |
},
|
| 375 |
{
|
| 376 |
+
rank: 13,
|
| 377 |
name: "Dolphin3.0-Qwen2.5-0.5B",
|
| 378 |
score: 69.5,
|
| 379 |
maxScore: 125,
|
|
|
|
| 381 |
weaknesses: "Completely fails synonym generation and most grammar correction tasks."
|
| 382 |
},
|
| 383 |
{
|
| 384 |
+
rank: 14,
|
| 385 |
name: "qwen3-0.6B",
|
| 386 |
score: 67,
|
| 387 |
maxScore: 125,
|
|
|
|
| 389 |
weaknesses: "Riddled with bizarre, nonsensical answers (e.g., '3D', '2D Notation')."
|
| 390 |
},
|
| 391 |
{
|
| 392 |
+
rank: 15,
|
| 393 |
name: "qwen2.5-0.5B",
|
| 394 |
score: 60,
|
| 395 |
maxScore: 125,
|
|
|
|
| 397 |
weaknesses: "Very unreliable; outputs long numbers for text tasks, fails creative tasks."
|
| 398 |
},
|
| 399 |
{
|
| 400 |
+
rank: 16,
|
| 401 |
name: "NxMobileLM-1.5B-SFT",
|
| 402 |
score: 59.5,
|
| 403 |
maxScore: 125,
|
|
|
|
| 405 |
weaknesses: "Extremely unreliable, with frequent junk ('{', '1', emojis) or non-English outputs."
|
| 406 |
},
|
| 407 |
{
|
| 408 |
+
rank: 17,
|
| 409 |
name: "prithivMLmods-QWQ-500M",
|
| 410 |
score: 55,
|
| 411 |
maxScore: 125,
|