Spaces:

appvoid
/

meta-arena

Running

App Files Files Community

appvoid committed on Oct 19, 2025

Commit

bfd97f5

verified ·

1 Parent(s): 2faf7af

Update index.html

Browse files

Files changed (1) hide show

index.html +81 -69

index.html CHANGED Viewed

@@ -3,6 +3,7 @@
 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
     <style>
         * {
             margin: 0;
@@ -41,8 +42,9 @@
         .table-wrapper {
             text-align: center;
-            background: black;
             border-radius: 16px;
             box-shadow: 0 20px 60px rgba(0,0,0,0.3);
             overflow: hidden;
             animation: fadeIn 0.6s ease-out;
@@ -78,10 +80,6 @@
             letter-spacing: 0.5px;
         }
-        th:first-child {
-            border-radius: 0;
-        }
         tbody tr {
             border-bottom: 1px solid #222;
             transition: all 0.3s ease;
@@ -99,6 +97,11 @@
         td {
             padding: 18px 20px;
             font-size: 0.95rem;
         }
         .rank {
@@ -119,12 +122,13 @@
         .score {
             font-weight: 700;
             font-size: .8rem;
         }
         .progress-container {
             width: 100%;
             height: 8px;
-            background: #e2e8f0;
             border-radius: 10px;
             overflow: hidden;
             margin-top: 8px;
@@ -154,7 +158,7 @@
             letter-spacing: 0.5px;
         }
-        .badge-best {
             background: linear-gradient(135deg, #48bb78 0%, #38a169 100%);
             color: white;
         }
@@ -185,7 +189,29 @@
             font-weight: 500;
             font-size: 0.85rem;
         }
         @media (max-width: 768px) {
             h1 {
                 font-size: 1.8rem;
@@ -209,27 +235,6 @@
             }
         }
-        .legend {
-            display: flex;
-            justify-content: center;
-            gap: 20px;
-            margin-top: 30px;
-            flex-wrap: wrap;
-        }
-        .legend-item {
-            display: flex;
-            align-items: center;
-            gap: 8px;
-            color: white;
-            font-size: 0.9rem;
-        }
-        .legend-color {
-            width: 30px;
-            height: 8px;
-            border-radius: 4px;
-        }
     </style>
 </head>
 <body>
@@ -269,7 +274,7 @@
             </div>
             <div class="legend-item">
                 <div class="legend-color" style="background: linear-gradient(90deg, #f56565 0%, #e53e3e 100%);"></div>
-                <span>Poor (<69)</span>
             </div>
         </div>
     </div>
@@ -280,7 +285,6 @@
                 rank: 1,
                 name: "granite-4.0-h-tiny",
                 score: 103.5,
-                maxScore: 125,
                 strengths: "Extremely well-rounded; top-tier in logic, math, translation, and synonyms.",
                 weaknesses: "Fails completely at rhyming; hallucinates facts in summarization tasks."
             },
@@ -288,7 +292,6 @@
                 rank: 2,
                 name: "Qwen3-4B-Instruct",
                 score: 102,
-                maxScore: 125,
                 strengths: "Top performer, excels in core NLP, logic, and factual recall.",
                 weaknesses: "Prone to factual hallucinations in summarization tasks."
             },
@@ -296,126 +299,134 @@
                 rank: 3,
                 name: "lfm2-8b",
                 score: 99,
-                maxScore: 125,
                 strengths: "Very logical, provides detailed, nuanced answers, strong at misconception correction.",
                 weaknesses: "Struggles with creative tasks like rhyming and procedural sequencing."
             },
             {
                 rank: 4,
                 name: "granite-3.1-3b-instruct",
                 score: 93.5,
-                maxScore: 125,
                 strengths: "Highly capable when it works; excellent at summarization and logic.",
                 weaknesses: "Unreliable; frequently outputs junk characters ('{') instead of answering."
             },
             {
-                rank: 5,
                 name: "lfm2-2.6b",
                 score: 93.5,
-                maxScore: 125,
                 strengths: "Strong core capabilities, great at grammar and misconception correction.",
                 weaknesses: "Significant weakness in analogy, rhyming, and sequencing tasks."
             },
             {
-                rank: 6,
                 name: "Qwen3-1.7B",
                 score: 92.5,
-                maxScore: 125,
                 strengths: "Good overall performance on core tasks and math.",
                 weaknesses: "Fails completely on rhyming and has some odd analogy mistakes."
             },
             {
-                rank: 7,
                 name: "Llama-3.2-1B-Instruct",
                 score: 92,
-                maxScore: 125,
                 strengths: "Great at core NLP, math, and code generation.",
                 weaknesses: "Fails badly on misconception correction, sequencing, and paraphrasing."
             },
             {
-                rank: 8,
                 name: "lfm2-1.2b",
                 score: 90.5,
-                maxScore: 125,
                 strengths: "Strong core skills like grammar, math, and translation.",
                 weaknesses: "Knowledge gaps (object location) and hallucinates facts in headlines."
             },
             {
-                rank: 9,
                 name: "Falcon-H1-1.5B-Deep-Instruct",
                 score: 89,
-                maxScore: 125,
                 strengths: "Excellent summarizer and paraphraser, strong on synonyms.",
                 weaknesses: "Very poor at logical deduction, rhyming, and categorization."
             },
             {
-                rank: 10,
                 name: "Falcon-H1-1.5B-Instruct",
                 score: 81,
-                maxScore: 125,
                 strengths: "Good at logic, math, and factual questions.",
                 weaknesses: "Fails translation completely and often gives blank/junk answers."
             },
             {
-                rank: 11,
                 name: "lfm2-700m",
                 score: 75.5,
-                maxScore: 125,
                 strengths: "Handles sentiment, math, and logic correctly.",
                 weaknesses: "Many failures in reasoning (cause/effect), tool use, synonyms, and grammar."
             },
             {
-                rank: 12,
                 name: "qwen2.5-0.5b-instruct",
                 score: 72,
-                maxScore: 125,
                 strengths: "Decent at math, basic commands, and some logic.",
                 weaknesses: "Fails creative tasks (rhyming, synonyms) and suffers major headline hallucinations."
             },
             {
-                rank: 13,
                 name: "Dolphin3.0-Qwen2.5-0.5B",
                 score: 69.5,
-                maxScore: 125,
                 strengths: "Best of the small models; handles math and antonyms well.",
                 weaknesses: "Completely fails synonym generation and most grammar correction tasks."
             },
             {
-                rank: 14,
                 name: "qwen3-0.6B",
                 score: 67,
-                maxScore: 125,
                 strengths: "Correct on basic math and antonyms.",
                 weaknesses: "Riddled with bizarre, nonsensical answers (e.g., '3D', '2D Notation')."
             },
             {
-                rank: 15,
                 name: "qwen2.5-0.5B",
                 score: 60,
-                maxScore: 125,
                 strengths: "Passes basic math and antonym tasks.",
                 weaknesses: "Very unreliable; outputs long numbers for text tasks, fails creative tasks."
             },
             {
-                rank: 16,
                 name: "NxMobileLM-1.5B-SFT",
                 score: 59.5,
-                maxScore: 125,
                 strengths: "Passes math and some grammar/logic.",
                 weaknesses: "Extremely unreliable, with frequent junk ('{', '1', emojis) or non-English outputs."
-            },
-            {
-                rank: 17,
-                name: "prithivMLmods-QWQ-500M",
-                score: 55,
-                maxScore: 125,
-                strengths: "Handles math and most logic correctly.",
-                weaknesses: "Very poor overall; fails most creative tasks, hallucinates facts, outputs numbers for text."
             }
         ];
         function getRatingBadge(score) {
-            if (score >= 108) return '<span class="badge badge-best">Excellent</span>';
             if (score >= 91) return '<span class="badge badge-good">Good</span>';
             if (score >= 69) return '<span class="badge badge-average">Average</span>';
             return '<span class="badge badge-poor">Poor</span>';
@@ -425,16 +436,17 @@
             const tbody = document.querySelector('#performanceTable tbody');
             models.forEach((model, index) => {
-                const percentage = (model.score / model.maxScore) * 100;
                 const row = document.createElement('tr');
-                row.style.animationDelay = `${index * 0.1}s`;
                 row.innerHTML = `
                     <td class="rank">#${model.rank}</td>
                     <td class="model-name">${model.name}</td>
                     <td>
-                        <div class="score">${model.score} / ${model.maxScore}</div>
                         <div class="progress-container">
                             <div class="progress-bar" style="width: ${percentage}%"></div>
                         </div>

 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Meta Leaderboard - Top 20 Models</title>
     <style>
         * {
             margin: 0;
         .table-wrapper {
             text-align: center;
+            background: #111;
             border-radius: 16px;
+            border: 1px solid #333;
             box-shadow: 0 20px 60px rgba(0,0,0,0.3);
             overflow: hidden;
             animation: fadeIn 0.6s ease-out;
             letter-spacing: 0.5px;
         }
         tbody tr {
             border-bottom: 1px solid #222;
             transition: all 0.3s ease;
         td {
             padding: 18px 20px;
             font-size: 0.95rem;
+            text-align: left;
+        }
+        td:first-child, td:last-child {
+            text-align: center;
         }
         .rank {
         .score {
             font-weight: 700;
             font-size: .8rem;
+            text-align: center;
         }
         .progress-container {
             width: 100%;
             height: 8px;
+            background: #444;
             border-radius: 10px;
             overflow: hidden;
             margin-top: 8px;
             letter-spacing: 0.5px;
         }
+        .badge-excellent {
             background: linear-gradient(135deg, #48bb78 0%, #38a169 100%);
             color: white;
         }
             font-weight: 500;
             font-size: 0.85rem;
         }
+        .legend {
+            display: flex;
+            justify-content: center;
+            gap: 20px;
+            margin-top: 30px;
+            flex-wrap: wrap;
+        }
+        .legend-item {
+            display: flex;
+            align-items: center;
+            gap: 8px;
+            color: white;
+            font-size: 0.9rem;
+        }
+        .legend-color {
+            width: 30px;
+            height: 8px;
+            border-radius: 4px;
+        }
         @media (max-width: 768px) {
             h1 {
                 font-size: 1.8rem;
             }
         }
     </style>
 </head>
 <body>
             </div>
             <div class="legend-item">
                 <div class="legend-color" style="background: linear-gradient(90deg, #f56565 0%, #e53e3e 100%);"></div>
+                <span>Poor (&lt;69)</span>
             </div>
         </div>
     </div>
                 rank: 1,
                 name: "granite-4.0-h-tiny",
                 score: 103.5,
                 strengths: "Extremely well-rounded; top-tier in logic, math, translation, and synonyms.",
                 weaknesses: "Fails completely at rhyming; hallucinates facts in summarization tasks."
             },
                 rank: 2,
                 name: "Qwen3-4B-Instruct",
                 score: 102,
                 strengths: "Top performer, excels in core NLP, logic, and factual recall.",
                 weaknesses: "Prone to factual hallucinations in summarization tasks."
             },
                 rank: 3,
                 name: "lfm2-8b",
                 score: 99,
                 strengths: "Very logical, provides detailed, nuanced answers, strong at misconception correction.",
                 weaknesses: "Struggles with creative tasks like rhyming and procedural sequencing."
             },
             {
                 rank: 4,
+                name: "Qwen3-MOE-4x0.6B-2.4B-Writing-Thunder-V1.2.Q8_0.gguf",
+                score: 96,
+                strengths: "Strong in logic, math, grammar, and summarization.",
+                weaknesses: "Struggles with rhyming, synonyms, some translation, and procedural sequencing."
+            },
+            {
+                rank: 5,
+                name: "granite-3.3-2b-instruct-Q8_0.gguf",
+                score: 95,
+                strengths: "Excels at core NLP, logic, math, and misconception correction.",
+                weaknesses: "Fails completely at NER, rhyming, and procedural sequencing."
+            },
+            {
+                rank: 6,
                 name: "granite-3.1-3b-instruct",
                 score: 93.5,
                 strengths: "Highly capable when it works; excellent at summarization and logic.",
                 weaknesses: "Unreliable; frequently outputs junk characters ('{') instead of answering."
             },
             {
+                rank: 7,
                 name: "lfm2-2.6b",
                 score: 93.5,
                 strengths: "Strong core capabilities, great at grammar and misconception correction.",
                 weaknesses: "Significant weakness in analogy, rhyming, and sequencing tasks."
+            },
+             {
+                rank: 8,
+                name: "EXAONE-3.5-2.4B-Instruct-abliterated.Q8_0.gguf",
+                score: 93,
+                strengths: "Excellent at reasoning, summarization, grammar, and misconception correction.",
+                weaknesses: "Fails completely at translation and sequencing; unreliable output formatting."
             },
             {
+                rank: 9,
                 name: "Qwen3-1.7B",
                 score: 92.5,
                 strengths: "Good overall performance on core tasks and math.",
                 weaknesses: "Fails completely on rhyming and has some odd analogy mistakes."
             },
             {
+                rank: 10,
                 name: "Llama-3.2-1B-Instruct",
                 score: 92,
                 strengths: "Great at core NLP, math, and code generation.",
                 weaknesses: "Fails badly on misconception correction, sequencing, and paraphrasing."
             },
             {
+                rank: 11,
                 name: "lfm2-1.2b",
                 score: 90.5,
                 strengths: "Strong core skills like grammar, math, and translation.",
                 weaknesses: "Knowledge gaps (object location) and hallucinates facts in headlines."
             },
             {
+                rank: 12,
                 name: "Falcon-H1-1.5B-Deep-Instruct",
                 score: 89,
                 strengths: "Excellent summarizer and paraphraser, strong on synonyms.",
                 weaknesses: "Very poor at logical deduction, rhyming, and categorization."
             },
             {
+                rank: 13,
                 name: "Falcon-H1-1.5B-Instruct",
                 score: 81,
                 strengths: "Good at logic, math, and factual questions.",
                 weaknesses: "Fails translation completely and often gives blank/junk answers."
             },
             {
+                rank: 14,
                 name: "lfm2-700m",
                 score: 75.5,
                 strengths: "Handles sentiment, math, and logic correctly.",
                 weaknesses: "Many failures in reasoning (cause/effect), tool use, synonyms, and grammar."
             },
             {
+                rank: 15,
                 name: "qwen2.5-0.5b-instruct",
                 score: 72,
                 strengths: "Decent at math, basic commands, and some logic.",
                 weaknesses: "Fails creative tasks (rhyming, synonyms) and suffers major headline hallucinations."
             },
             {
+                rank: 16,
                 name: "Dolphin3.0-Qwen2.5-0.5B",
                 score: 69.5,
                 strengths: "Best of the small models; handles math and antonyms well.",
                 weaknesses: "Completely fails synonym generation and most grammar correction tasks."
             },
             {
+                rank: 17,
                 name: "qwen3-0.6B",
                 score: 67,
                 strengths: "Correct on basic math and antonyms.",
                 weaknesses: "Riddled with bizarre, nonsensical answers (e.g., '3D', '2D Notation')."
             },
             {
+                rank: 18,
+                name: "Auto-Completer-0.2.Q8_0.gguf",
+                score: 60,
+                strengths: "Perfect in Antonyms, Translation, Math, and Logic.",
+                weaknesses: "Complete failure in most other areas; reinforces misconceptions, cannot follow sequences."
+            },
+            {
+                rank: 19,
                 name: "qwen2.5-0.5B",
                 score: 60,
                 strengths: "Passes basic math and antonym tasks.",
                 weaknesses: "Very unreliable; outputs long numbers for text tasks, fails creative tasks."
             },
             {
+                rank: 20,
                 name: "NxMobileLM-1.5B-SFT",
                 score: 59.5,
                 strengths: "Passes math and some grammar/logic.",
                 weaknesses: "Extremely unreliable, with frequent junk ('{', '1', emojis) or non-English outputs."
             }
         ];
+        const maxScore = 125;
         function getRatingBadge(score) {
+            if (score >= 108) return '<span class="badge badge-excellent">Excellent</span>';
             if (score >= 91) return '<span class="badge badge-good">Good</span>';
             if (score >= 69) return '<span class="badge badge-average">Average</span>';
             return '<span class="badge badge-poor">Poor</span>';
             const tbody = document.querySelector('#performanceTable tbody');
             models.forEach((model, index) => {
+                const percentage = (model.score / maxScore) * 100;
                 const row = document.createElement('tr');
+                row.style.animation = `fadeIn 0.5s ease-out ${index * 0.05}s forwards`;
+                row.style.opacity = 0;
                 row.innerHTML = `
                     <td class="rank">#${model.rank}</td>
                     <td class="model-name">${model.name}</td>
                     <td>
+                        <div class="score">${model.score} / ${maxScore}</div>
                         <div class="progress-container">
                             <div class="progress-bar" style="width: ${percentage}%"></div>
                         </div>