Spaces:
Running
Running
Update index.html
Browse files- index.html +9 -3
index.html
CHANGED
|
@@ -525,13 +525,13 @@ footer{padding:40px 0;border-top:1px solid var(--border);text-align:center}
|
|
| 525 |
<a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank">Dataset</a>
|
| 526 |
<a href="https://huggingface.co/blog/FINAL-Bench/metacognitive" target="_blank">Article</a>
|
| 527 |
<a href="https://huggingface.co/spaces/FINAL-Bench/Leaderboard" target="_blank">Leaderboard</a>
|
| 528 |
-
<a href="
|
| 529 |
</div>
|
| 530 |
<div class="footer-copy">FINAL Bench v3.0 — Kim, Kim, Choi, Jang — VIDRAFT / Ginigen AI, Seoul</div>
|
| 531 |
</footer>
|
| 532 |
</div>
|
| 533 |
|
| 534 |
-
<script
|
| 535 |
const models=[
|
| 536 |
{name:'Kimi K2.5',provider:'Moonshot AI',bl:{score:68.71,pq:.775,ma:.775,er:.450,id:.767,fc:.750},mc:{score:78.54,pq:.767,ma:.742,er:.908,id:.750,fc:.725}},
|
| 537 |
{name:'GPT-5.2',provider:'OpenAI',bl:{score:62.76,pq:.750,ma:.750,er:.336,id:.724,fc:.681},mc:{score:76.50,pq:.758,ma:.767,er:.792,id:.733,fc:.767}},
|
|
@@ -566,4 +566,10 @@ function renderAnalysisCharts(){
|
|
| 566 |
const dd=models.map(m=>({x:m.bl.score,y:m.mc.score-m.bl.score}));new Chart(document.getElementById('chartDifficulty'),{type:'scatter',data:{datasets:[{label:'Models',data:dd,backgroundColor:['rgba(6,182,212,0.8)','rgba(59,130,246,0.8)','rgba(139,92,246,0.8)','rgba(245,158,11,0.8)','rgba(236,72,153,0.8)','rgba(16,185,129,0.8)','rgba(239,68,68,0.8)','rgba(251,191,36,0.8)','rgba(168,85,247,0.8)'],pointRadius:10,pointHoverRadius:14}]},options:{responsive:true,maintainAspectRatio:false,plugins:{legend:{display:false},tooltip:{callbacks:{label:function(c){return models[c.dataIndex].name+': BL='+c.parsed.x.toFixed(1)+' Δ=+'+c.parsed.y.toFixed(1);}}}},scales:{x:{title:{display:true,text:'Baseline Score',color:'#94a3b8'},ticks:{color:'#64748b'},grid:{color:'rgba(255,255,255,0.05)'}},y:{title:{display:true,text:'MetaCog Gain',color:'#94a3b8'},ticks:{color:'#64748b'},grid:{color:'rgba(255,255,255,0.05)'}}}}});
|
| 567 |
const tl=['A:Trap','B:Contradict','C:Progressive','D:MultiConst','E:SelfCorr','F:ExpertPanel','G:Pivot','H:Uncertain'];const td=[13.8,15.2,14.1,13.5,14.8,13.2,14.5,14.9];new Chart(document.getElementById('chartTicos'),{type:'bar',data:{labels:tl,datasets:[{label:'Mean Δ',data:td,backgroundColor:['rgba(239,68,68,0.7)','rgba(245,158,11,0.7)','rgba(16,185,129,0.7)','rgba(59,130,246,0.7)','rgba(6,182,212,0.7)','rgba(139,92,246,0.7)','rgba(236,72,153,0.7)','rgba(251,191,36,0.7)'],borderRadius:6}]},options:{responsive:true,maintainAspectRatio:false,plugins:{legend:{display:false}},scales:{x:{ticks:{color:'#94a3b8',font:{size:10}},grid:{display:false}},y:{min:10,max:18,title:{display:true,text:'Mean Δ_MC',color:'#94a3b8'},ticks:{color:'#64748b'},grid:{color:'rgba(255,255,255,0.05)'}}}}});}
|
| 568 |
|
| 569 |
-
function renderSafetyCharts(){const s=[...models].sort((a,b)=>(b.bl.ma-b.bl.er)-(a.bl.ma-a.bl.er));new Chart(document.getElementById('chartSafetyGap'),{type:'bar',data:{labels:s.map(m=>m.name),datasets:[{label:'MA (Declarative)',data:s.map(m=>m.bl.ma),backgroundColor:'rgba(245,158,11,0.7)',borderRadius:4},{label:'ER (Procedural)',data:s.map(m=>m.bl.er),backgroundColor:'rgba(239,68,68,0.7)',borderRadius:4}]},options:{responsive:true,maintainAspectRatio:false,plugins:{legend:{labels:{color:'#94a3b8'}},tooltip:{callbacks:{afterBody:function(c){const i=c[0].dataIndex;return 'Gap: '+(s[i].bl.ma-s[i].bl.er).toFixed(3);}}}},scales:{x:{ticks:{color:'#94a3b8',font:{size:11}},grid:{display:false}},y:{min:0,max:1,ticks:{color:'#64748b'},grid:{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 525 |
<a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank">Dataset</a>
|
| 526 |
<a href="https://huggingface.co/blog/FINAL-Bench/metacognitive" target="_blank">Article</a>
|
| 527 |
<a href="https://huggingface.co/spaces/FINAL-Bench/Leaderboard" target="_blank">Leaderboard</a>
|
| 528 |
+
<a href="mailto:arxivgpt@gmail.com">Contact</a>
|
| 529 |
</div>
|
| 530 |
<div class="footer-copy">FINAL Bench v3.0 — Kim, Kim, Choi, Jang — VIDRAFT / Ginigen AI, Seoul</div>
|
| 531 |
</footer>
|
| 532 |
</div>
|
| 533 |
|
| 534 |
+
<script>
|
| 535 |
const models=[
|
| 536 |
{name:'Kimi K2.5',provider:'Moonshot AI',bl:{score:68.71,pq:.775,ma:.775,er:.450,id:.767,fc:.750},mc:{score:78.54,pq:.767,ma:.742,er:.908,id:.750,fc:.725}},
|
| 537 |
{name:'GPT-5.2',provider:'OpenAI',bl:{score:62.76,pq:.750,ma:.750,er:.336,id:.724,fc:.681},mc:{score:76.50,pq:.758,ma:.767,er:.792,id:.733,fc:.767}},
|
|
|
|
| 566 |
const dd=models.map(m=>({x:m.bl.score,y:m.mc.score-m.bl.score}));new Chart(document.getElementById('chartDifficulty'),{type:'scatter',data:{datasets:[{label:'Models',data:dd,backgroundColor:['rgba(6,182,212,0.8)','rgba(59,130,246,0.8)','rgba(139,92,246,0.8)','rgba(245,158,11,0.8)','rgba(236,72,153,0.8)','rgba(16,185,129,0.8)','rgba(239,68,68,0.8)','rgba(251,191,36,0.8)','rgba(168,85,247,0.8)'],pointRadius:10,pointHoverRadius:14}]},options:{responsive:true,maintainAspectRatio:false,plugins:{legend:{display:false},tooltip:{callbacks:{label:function(c){return models[c.dataIndex].name+': BL='+c.parsed.x.toFixed(1)+' Δ=+'+c.parsed.y.toFixed(1);}}}},scales:{x:{title:{display:true,text:'Baseline Score',color:'#94a3b8'},ticks:{color:'#64748b'},grid:{color:'rgba(255,255,255,0.05)'}},y:{title:{display:true,text:'MetaCog Gain',color:'#94a3b8'},ticks:{color:'#64748b'},grid:{color:'rgba(255,255,255,0.05)'}}}}});
|
| 567 |
const tl=['A:Trap','B:Contradict','C:Progressive','D:MultiConst','E:SelfCorr','F:ExpertPanel','G:Pivot','H:Uncertain'];const td=[13.8,15.2,14.1,13.5,14.8,13.2,14.5,14.9];new Chart(document.getElementById('chartTicos'),{type:'bar',data:{labels:tl,datasets:[{label:'Mean Δ',data:td,backgroundColor:['rgba(239,68,68,0.7)','rgba(245,158,11,0.7)','rgba(16,185,129,0.7)','rgba(59,130,246,0.7)','rgba(6,182,212,0.7)','rgba(139,92,246,0.7)','rgba(236,72,153,0.7)','rgba(251,191,36,0.7)'],borderRadius:6}]},options:{responsive:true,maintainAspectRatio:false,plugins:{legend:{display:false}},scales:{x:{ticks:{color:'#94a3b8',font:{size:10}},grid:{display:false}},y:{min:10,max:18,title:{display:true,text:'Mean Δ_MC',color:'#94a3b8'},ticks:{color:'#64748b'},grid:{color:'rgba(255,255,255,0.05)'}}}}});}
|
| 568 |
|
| 569 |
+
/**
 * Draws a grouped bar chart on the element with id `chartSafetyGap`,
 * comparing each model's baseline `bl.ma` and `bl.er` values.
 *
 * Models are ordered by descending baseline gap (`bl.ma - bl.er`), and the
 * tooltip appends that gap formatted to three decimals.
 *
 * Reads the outer-scope `models` array and the `Chart` constructor
 * (NOTE(review): call shape matches Chart.js — confirm the loaded library).
 */
function renderSafetyCharts() {
  // Baseline gap between the MA and ER values of a single model entry.
  const gapOf = (entry) => entry.bl.ma - entry.bl.er;

  // Sort a shallow copy so the outer `models` array keeps its order.
  const ranked = [...models].sort((first, second) => gapOf(second) - gapOf(first));

  const canvas = document.getElementById('chartSafetyGap');

  const maSeries = {
    label: 'MA (Declarative)',
    data: ranked.map((entry) => entry.bl.ma),
    backgroundColor: 'rgba(245,158,11,0.7)',
    borderRadius: 4,
  };
  const erSeries = {
    label: 'ER (Procedural)',
    data: ranked.map((entry) => entry.bl.er),
    backgroundColor: 'rgba(239,68,68,0.7)',
    borderRadius: 4,
  };

  // Extra tooltip line: the gap for the bar group being hovered.
  const describeGap = (tooltipItems) => {
    const position = tooltipItems[0].dataIndex;
    return `Gap: ${gapOf(ranked[position]).toFixed(3)}`;
  };

  new Chart(canvas, {
    type: 'bar',
    data: {
      labels: ranked.map((entry) => entry.name),
      datasets: [maSeries, erSeries],
    },
    options: {
      responsive: true,
      maintainAspectRatio: false,
      plugins: {
        legend: { labels: { color: '#94a3b8' } },
        tooltip: { callbacks: { afterBody: describeGap } },
      },
      scales: {
        x: {
          ticks: { color: '#94a3b8', font: { size: 11 } },
          grid: { display: false },
        },
        // Y axis is pinned to the range 0..1.
        y: {
          min: 0,
          max: 1,
          ticks: { color: '#64748b' },
          grid: { color: 'rgba(255,255,255,0.05)' },
        },
      },
    },
  });
}
|
| 570 |
+
|
| 571 |
+
// Initialize
|
| 572 |
+
// Run the five render functions in order as the script executes.
// NOTE(review): they are presumably defined earlier in this script — confirm.
renderBaseline();
renderMetacog();
renderDelta();
renderGapViz();
renderLeaderboardChart();
|
| 573 |
+
</script>
|
| 574 |
+
</body>
|
| 575 |
+
</html>
|