Spaces:

appvoid
/

meta-arena

Running

App Files Community

appvoid committed on Oct 19, 2025

Commit

1b51b33

verified ·

1 Parent(s): bfd97f5

Update index.html

Browse files

Files changed (1) hide show

index.html +36 -33

index.html CHANGED Viewed

@@ -367,59 +367,59 @@
             },
             {
                 rank: 13,
                 name: "Falcon-H1-1.5B-Instruct",
                 score: 81,
                 strengths: "Good at logic, math, and factual questions.",
                 weaknesses: "Fails translation completely and often gives blank/junk answers."
             },
-            {
-                rank: 14,
-                name: "lfm2-700m",
-                score: 75.5,
-                strengths: "Handles sentiment, math, and logic correctly.",
-                weaknesses: "Many failures in reasoning (cause/effect), tool use, synonyms, and grammar."
-            },
             {
                 rank: 15,
-                name: "qwen2.5-0.5b-instruct",
-                score: 72,
-                strengths: "Decent at math, basic commands, and some logic.",
-                weaknesses: "Fails creative tasks (rhyming, synonyms) and suffers major headline hallucinations."
             },
             {
                 rank: 16,
-                name: "Dolphin3.0-Qwen2.5-0.5B",
-                score: 69.5,
-                strengths: "Best of the small models; handles math and antonyms well.",
-                weaknesses: "Completely fails synonym generation and most grammar correction tasks."
             },
             {
                 rank: 17,
-                name: "qwen3-0.6B",
-                score: 67,
-                strengths: "Correct on basic math and antonyms.",
-                weaknesses: "Riddled with bizarre, nonsensical answers (e.g., '3D', '2D Notation')."
             },
             {
                 rank: 18,
-                name: "Auto-Completer-0.2.Q8_0.gguf",
-                score: 60,
-                strengths: "Perfect in Antonyms, Translation, Math, and Logic.",
-                weaknesses: "Complete failure in most other areas; reinforces misconceptions, cannot follow sequences."
             },
             {
                 rank: 19,
-                name: "qwen2.5-0.5B",
-                score: 60,
-                strengths: "Passes basic math and antonym tasks.",
-                weaknesses: "Very unreliable; outputs long numbers for text tasks, fails creative tasks."
             },
             {
                 rank: 20,
-                name: "NxMobileLM-1.5B-SFT",
-                score: 59.5,
-                strengths: "Passes math and some grammar/logic.",
-                weaknesses: "Extremely unreliable, with frequent junk ('{', '1', emojis) or non-English outputs."
             }
         ];
@@ -435,7 +435,10 @@
         function populateTable() {
             const tbody = document.querySelector('#performanceTable tbody');
-            models.forEach((model, index) => {
                 const percentage = (model.score / maxScore) * 100;
                 const row = document.createElement('tr');
@@ -446,7 +449,7 @@
                     <td class="rank">#${model.rank}</td>
                     <td class="model-name">${model.name}</td>
                     <td>
-                        <div class="score">${model.score} / ${maxScore}</div>
                         <div class="progress-container">
                             <div class="progress-bar" style="width: ${percentage}%"></div>
                         </div>

             },
             {
                 rank: 13,
+                name: "arco-3",
+                score: 83,
+                strengths: "One of the most powerful 0.6b models; perfect at code gen, sentiment, math, and core knowledge.",
+                weaknesses: "Fails completely at summarization (hallucinations), sequencing, and rhyming. Poor reasoning."
+            },
+            {
+                rank: 14,
                 name: "Falcon-H1-1.5B-Instruct",
                 score: 81,
                 strengths: "Good at logic, math, and factual questions.",
                 weaknesses: "Fails translation completely and often gives blank/junk answers."
             },
             {
                 rank: 15,
+                name: "Llama-3.2-SUN-HDIC-1B-Instruct.Q8_0.gguf",
+                score: 79,
+                strengths: "Strong in synonyms, math, and factual recall; decent at core NLP.",
+                weaknesses: "Complete failure at summarization and misconception correction; bad factual hallucinations."
             },
             {
                 rank: 16,
+                name: "Piaget-0.6B.Q8_0.gguf",
+                score: 78,
+                strengths: "Excellent at core knowledge tasks: Sentiment, Object Location, Antonyms, Categorization, Math, Factual QA.",
+                weaknesses: "Complete failure at Summarization, Sequencing, and Rhyming. Very poor at Grammar and Misconception Correction."
             },
             {
                 rank: 17,
+                name: "lfm2-700m",
+                score: 75.5,
+                strengths: "Handles sentiment, math, and logic correctly.",
+                weaknesses: "Many failures in reasoning (cause/effect), tool use, synonyms, and grammar."
             },
             {
                 rank: 18,
+                name: "Qwen3-psychological-reasoning-0.6B.Q8_0.gguf",
+                score: 73,
+                strengths: "Excels at factual recall and classification (Sentiment, Object Location, Math, Factual QA, NER).",
+                weaknesses: "Very poor at reasoning and creativity; complete failure in summarization, sequencing, and rhyming."
             },
             {
                 rank: 19,
+                name: "qwen2.5-0.5b-instruct",
+                score: 72,
+                strengths: "Decent at math, basic commands, and some logic.",
+                weaknesses: "Fails creative tasks (rhyming, synonyms) and suffers major headline hallucinations."
             },
             {
                 rank: 20,
+                name: "qwen3-0.6b-notetaker-q8_0.gguf",
+                score: 71,
+                strengths: "Excels at a wide range of core knowledge and classification tasks (sentiment, math, NER, factual QA).",
+                weaknesses: "Complete failure at complex reasoning, creativity, and nuanced language (cause/effect, idioms, sequencing)."
             }
         ];
         function populateTable() {
             const tbody = document.querySelector('#performanceTable tbody');
+            // Filter to top 20 for display
+            const top20Models = models.slice(0, 20);
+            top20Models.forEach((model, index) => {
                 const percentage = (model.score / maxScore) * 100;
                 const row = document.createElement('tr');
                     <td class="rank">#${model.rank}</td>
                     <td class="model-name">${model.name}</td>
                     <td>
+                        <div class="score">${model.score.toFixed(1)} / ${maxScore}</div>
                         <div class="progress-container">
                             <div class="progress-bar" style="width: ${percentage}%"></div>
                         </div>