Spaces:

appvoid
/

meta-arena

Running

App Files Files Community

appvoid committed on Oct 15, 2025

Commit

2faf7af

verified ·

1 Parent(s): 0102db1

Update index.html

Browse files

Files changed (1) hide show

index.html +23 -15

index.html CHANGED Viewed

@@ -278,6 +278,14 @@
         const models = [
             {
                 rank: 1,
                 name: "Qwen3-4B-Instruct",
                 score: 102,
                 maxScore: 125,
@@ -285,7 +293,7 @@
                 weaknesses: "Prone to factual hallucinations in summarization tasks."
             },
             {
-                rank: 2,
                 name: "lfm2-8b",
                 score: 99,
                 maxScore: 125,
@@ -293,7 +301,7 @@
                 weaknesses: "Struggles with creative tasks like rhyming and procedural sequencing."
             },
             {
-                rank: 3,
                 name: "granite-3.1-3b-instruct",
                 score: 93.5,
                 maxScore: 125,
@@ -301,7 +309,7 @@
                 weaknesses: "Unreliable; frequently outputs junk characters ('{') instead of answering."
             },
             {
-                rank: 4,
                 name: "lfm2-2.6b",
                 score: 93.5,
                 maxScore: 125,
@@ -309,7 +317,7 @@
                 weaknesses: "Significant weakness in analogy, rhyming, and sequencing tasks."
             },
             {
-                rank: 5,
                 name: "Qwen3-1.7B",
                 score: 92.5,
                 maxScore: 125,
@@ -317,7 +325,7 @@
                 weaknesses: "Fails completely on rhyming and has some odd analogy mistakes."
             },
             {
-                rank: 6,
                 name: "Llama-3.2-1B-Instruct",
                 score: 92,
                 maxScore: 125,
@@ -325,7 +333,7 @@
                 weaknesses: "Fails badly on misconception correction, sequencing, and paraphrasing."
             },
             {
-                rank: 7,
                 name: "lfm2-1.2b",
                 score: 90.5,
                 maxScore: 125,
@@ -333,7 +341,7 @@
                 weaknesses: "Knowledge gaps (object location) and hallucinates facts in headlines."
             },
             {
-                rank: 8,
                 name: "Falcon-H1-1.5B-Deep-Instruct",
                 score: 89,
                 maxScore: 125,
@@ -341,7 +349,7 @@
                 weaknesses: "Very poor at logical deduction, rhyming, and categorization."
             },
             {
-                rank: 9,
                 name: "Falcon-H1-1.5B-Instruct",
                 score: 81,
                 maxScore: 125,
@@ -349,7 +357,7 @@
                 weaknesses: "Fails translation completely and often gives blank/junk answers."
             },
             {
-                rank: 10,
                 name: "lfm2-700m",
                 score: 75.5,
                 maxScore: 125,
@@ -357,7 +365,7 @@
                 weaknesses: "Many failures in reasoning (cause/effect), tool use, synonyms, and grammar."
             },
             {
-                rank: 11,
                 name: "qwen2.5-0.5b-instruct",
                 score: 72,
                 maxScore: 125,
@@ -365,7 +373,7 @@
                 weaknesses: "Fails creative tasks (rhyming, synonyms) and suffers major headline hallucinations."
             },
             {
-                rank: 12,
                 name: "Dolphin3.0-Qwen2.5-0.5B",
                 score: 69.5,
                 maxScore: 125,
@@ -373,7 +381,7 @@
                 weaknesses: "Completely fails synonym generation and most grammar correction tasks."
             },
             {
-                rank: 13,
                 name: "qwen3-0.6B",
                 score: 67,
                 maxScore: 125,
@@ -381,7 +389,7 @@
                 weaknesses: "Riddled with bizarre, nonsensical answers (e.g., '3D', '2D Notation')."
             },
             {
-                rank: 14,
                 name: "qwen2.5-0.5B",
                 score: 60,
                 maxScore: 125,
@@ -389,7 +397,7 @@
                 weaknesses: "Very unreliable; outputs long numbers for text tasks, fails creative tasks."
             },
             {
-                rank: 15,
                 name: "NxMobileLM-1.5B-SFT",
                 score: 59.5,
                 maxScore: 125,
@@ -397,7 +405,7 @@
                 weaknesses: "Extremely unreliable, with frequent junk ('{', '1', emojis) or non-English outputs."
             },
             {
-                rank: 16,
                 name: "prithivMLmods-QWQ-500M",
                 score: 55,
                 maxScore: 125,

         const models = [
             {
                 rank: 1,
+                name: "granite-4.0-h-tiny",
+                score: 103.5,
+                maxScore: 125,
+                strengths: "Extremely well-rounded; top-tier in logic, math, translation, and synonyms.",
+                weaknesses: "Fails completely at rhyming; hallucinates facts in summarization tasks."
+            },
+            {
+                rank: 2,
                 name: "Qwen3-4B-Instruct",
                 score: 102,
                 maxScore: 125,
                 weaknesses: "Prone to factual hallucinations in summarization tasks."
             },
             {
+                rank: 3,
                 name: "lfm2-8b",
                 score: 99,
                 maxScore: 125,
                 weaknesses: "Struggles with creative tasks like rhyming and procedural sequencing."
             },
             {
+                rank: 4,
                 name: "granite-3.1-3b-instruct",
                 score: 93.5,
                 maxScore: 125,
                 weaknesses: "Unreliable; frequently outputs junk characters ('{') instead of answering."
             },
             {
+                rank: 5,
                 name: "lfm2-2.6b",
                 score: 93.5,
                 maxScore: 125,
                 weaknesses: "Significant weakness in analogy, rhyming, and sequencing tasks."
             },
             {
+                rank: 6,
                 name: "Qwen3-1.7B",
                 score: 92.5,
                 maxScore: 125,
                 weaknesses: "Fails completely on rhyming and has some odd analogy mistakes."
             },
             {
+                rank: 7,
                 name: "Llama-3.2-1B-Instruct",
                 score: 92,
                 maxScore: 125,
                 weaknesses: "Fails badly on misconception correction, sequencing, and paraphrasing."
             },
             {
+                rank: 8,
                 name: "lfm2-1.2b",
                 score: 90.5,
                 maxScore: 125,
                 weaknesses: "Knowledge gaps (object location) and hallucinates facts in headlines."
             },
             {
+                rank: 9,
                 name: "Falcon-H1-1.5B-Deep-Instruct",
                 score: 89,
                 maxScore: 125,
                 weaknesses: "Very poor at logical deduction, rhyming, and categorization."
             },
             {
+                rank: 10,
                 name: "Falcon-H1-1.5B-Instruct",
                 score: 81,
                 maxScore: 125,
                 weaknesses: "Fails translation completely and often gives blank/junk answers."
             },
             {
+                rank: 11,
                 name: "lfm2-700m",
                 score: 75.5,
                 maxScore: 125,
                 weaknesses: "Many failures in reasoning (cause/effect), tool use, synonyms, and grammar."
             },
             {
+                rank: 12,
                 name: "qwen2.5-0.5b-instruct",
                 score: 72,
                 maxScore: 125,
                 weaknesses: "Fails creative tasks (rhyming, synonyms) and suffers major headline hallucinations."
             },
             {
+                rank: 13,
                 name: "Dolphin3.0-Qwen2.5-0.5B",
                 score: 69.5,
                 maxScore: 125,
                 weaknesses: "Completely fails synonym generation and most grammar correction tasks."
             },
             {
+                rank: 14,
                 name: "qwen3-0.6B",
                 score: 67,
                 maxScore: 125,
                 weaknesses: "Riddled with bizarre, nonsensical answers (e.g., '3D', '2D Notation')."
             },
             {
+                rank: 15,
                 name: "qwen2.5-0.5B",
                 score: 60,
                 maxScore: 125,
                 weaknesses: "Very unreliable; outputs long numbers for text tasks, fails creative tasks."
             },
             {
+                rank: 16,
                 name: "NxMobileLM-1.5B-SFT",
                 score: 59.5,
                 maxScore: 125,
                 weaknesses: "Extremely unreliable, with frequent junk ('{', '1', emojis) or non-English outputs."
             },
             {
+                rank: 17,
                 name: "prithivMLmods-QWQ-500M",
                 score: 55,
                 maxScore: 125,