SeaWolf-AI committed on
Commit
1711cad
·
verified ·
1 Parent(s): 8fa7460

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +9 -3
index.html CHANGED
@@ -525,13 +525,13 @@ footer{padding:40px 0;border-top:1px solid var(--border);text-align:center}
525
  <a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank">Dataset</a>
526
  <a href="https://huggingface.co/blog/FINAL-Bench/metacognitive" target="_blank">Article</a>
527
  <a href="https://huggingface.co/spaces/FINAL-Bench/Leaderboard" target="_blank">Leaderboard</a>
528
- <a href="/cdn-cgi/l/email-protection#ddbcafa5b4abbaada99dbab0bcb4b1f3beb2b0">Contact</a>
529
  </div>
530
  <div class="footer-copy">FINAL Bench v3.0 — Kim, Kim, Choi, Jang — VIDRAFT / Ginigen AI, Seoul</div>
531
  </footer>
532
  </div>
533
 
534
- <script data-cfasync="false" src="/cdn-cgi/scripts/5c5dd728/cloudflare-static/email-decode.min.js"></script><script>
535
  const models=[
536
  {name:'Kimi K2.5',provider:'Moonshot AI',bl:{score:68.71,pq:.775,ma:.775,er:.450,id:.767,fc:.750},mc:{score:78.54,pq:.767,ma:.742,er:.908,id:.750,fc:.725}},
537
  {name:'GPT-5.2',provider:'OpenAI',bl:{score:62.76,pq:.750,ma:.750,er:.336,id:.724,fc:.681},mc:{score:76.50,pq:.758,ma:.767,er:.792,id:.733,fc:.767}},
@@ -566,4 +566,10 @@ function renderAnalysisCharts(){
566
  const dd=models.map(m=>({x:m.bl.score,y:m.mc.score-m.bl.score}));new Chart(document.getElementById('chartDifficulty'),{type:'scatter',data:{datasets:[{label:'Models',data:dd,backgroundColor:['rgba(6,182,212,0.8)','rgba(59,130,246,0.8)','rgba(139,92,246,0.8)','rgba(245,158,11,0.8)','rgba(236,72,153,0.8)','rgba(16,185,129,0.8)','rgba(239,68,68,0.8)','rgba(251,191,36,0.8)','rgba(168,85,247,0.8)'],pointRadius:10,pointHoverRadius:14}]},options:{responsive:true,maintainAspectRatio:false,plugins:{legend:{display:false},tooltip:{callbacks:{label:function(c){return models[c.dataIndex].name+': BL='+c.parsed.x.toFixed(1)+' Δ=+'+c.parsed.y.toFixed(1);}}}},scales:{x:{title:{display:true,text:'Baseline Score',color:'#94a3b8'},ticks:{color:'#64748b'},grid:{color:'rgba(255,255,255,0.05)'}},y:{title:{display:true,text:'MetaCog Gain',color:'#94a3b8'},ticks:{color:'#64748b'},grid:{color:'rgba(255,255,255,0.05)'}}}}});
567
  const tl=['A:Trap','B:Contradict','C:Progressive','D:MultiConst','E:SelfCorr','F:ExpertPanel','G:Pivot','H:Uncertain'];const td=[13.8,15.2,14.1,13.5,14.8,13.2,14.5,14.9];new Chart(document.getElementById('chartTicos'),{type:'bar',data:{labels:tl,datasets:[{label:'Mean Δ',data:td,backgroundColor:['rgba(239,68,68,0.7)','rgba(245,158,11,0.7)','rgba(16,185,129,0.7)','rgba(59,130,246,0.7)','rgba(6,182,212,0.7)','rgba(139,92,246,0.7)','rgba(236,72,153,0.7)','rgba(251,191,36,0.7)'],borderRadius:6}]},options:{responsive:true,maintainAspectRatio:false,plugins:{legend:{display:false}},scales:{x:{ticks:{color:'#94a3b8',font:{size:10}},grid:{display:false}},y:{min:10,max:18,title:{display:true,text:'Mean Δ_MC',color:'#94a3b8'},ticks:{color:'#64748b'},grid:{color:'rgba(255,255,255,0.05)'}}}}});}
568
 
569
- function renderSafetyCharts(){const s=[...models].sort((a,b)=>(b.bl.ma-b.bl.er)-(a.bl.ma-a.bl.er));new Chart(document.getElementById('chartSafetyGap'),{type:'bar',data:{labels:s.map(m=>m.name),datasets:[{label:'MA (Declarative)',data:s.map(m=>m.bl.ma),backgroundColor:'rgba(245,158,11,0.7)',borderRadius:4},{label:'ER (Procedural)',data:s.map(m=>m.bl.er),backgroundColor:'rgba(239,68,68,0.7)',borderRadius:4}]},options:{responsive:true,maintainAspectRatio:false,plugins:{legend:{labels:{color:'#94a3b8'}},tooltip:{callbacks:{afterBody:function(c){const i=c[0].dataIndex;return 'Gap: '+(s[i].bl.ma-s[i].bl.er).toFixed(3);}}}},scales:{x:{ticks:{color:'#94a3b8',font:{size:11}},grid:{display:false}},y:{min:0,max:1,ticks:{color:'#64748b'},grid:{colo
 
 
 
 
 
 
 
525
  <a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank">Dataset</a>
526
  <a href="https://huggingface.co/blog/FINAL-Bench/metacognitive" target="_blank">Article</a>
527
  <a href="https://huggingface.co/spaces/FINAL-Bench/Leaderboard" target="_blank">Leaderboard</a>
528
+ <a href="mailto:arxivgpt@gmail.com">Contact</a>
529
  </div>
530
  <div class="footer-copy">FINAL Bench v3.0 — Kim, Kim, Choi, Jang — VIDRAFT / Ginigen AI, Seoul</div>
531
  </footer>
532
  </div>
533
 
534
+ <script>
535
  const models=[
536
  {name:'Kimi K2.5',provider:'Moonshot AI',bl:{score:68.71,pq:.775,ma:.775,er:.450,id:.767,fc:.750},mc:{score:78.54,pq:.767,ma:.742,er:.908,id:.750,fc:.725}},
537
  {name:'GPT-5.2',provider:'OpenAI',bl:{score:62.76,pq:.750,ma:.750,er:.336,id:.724,fc:.681},mc:{score:76.50,pq:.758,ma:.767,er:.792,id:.733,fc:.767}},
 
566
  const dd=models.map(m=>({x:m.bl.score,y:m.mc.score-m.bl.score}));new Chart(document.getElementById('chartDifficulty'),{type:'scatter',data:{datasets:[{label:'Models',data:dd,backgroundColor:['rgba(6,182,212,0.8)','rgba(59,130,246,0.8)','rgba(139,92,246,0.8)','rgba(245,158,11,0.8)','rgba(236,72,153,0.8)','rgba(16,185,129,0.8)','rgba(239,68,68,0.8)','rgba(251,191,36,0.8)','rgba(168,85,247,0.8)'],pointRadius:10,pointHoverRadius:14}]},options:{responsive:true,maintainAspectRatio:false,plugins:{legend:{display:false},tooltip:{callbacks:{label:function(c){return models[c.dataIndex].name+': BL='+c.parsed.x.toFixed(1)+' Δ=+'+c.parsed.y.toFixed(1);}}}},scales:{x:{title:{display:true,text:'Baseline Score',color:'#94a3b8'},ticks:{color:'#64748b'},grid:{color:'rgba(255,255,255,0.05)'}},y:{title:{display:true,text:'MetaCog Gain',color:'#94a3b8'},ticks:{color:'#64748b'},grid:{color:'rgba(255,255,255,0.05)'}}}}});
567
  const tl=['A:Trap','B:Contradict','C:Progressive','D:MultiConst','E:SelfCorr','F:ExpertPanel','G:Pivot','H:Uncertain'];const td=[13.8,15.2,14.1,13.5,14.8,13.2,14.5,14.9];new Chart(document.getElementById('chartTicos'),{type:'bar',data:{labels:tl,datasets:[{label:'Mean Δ',data:td,backgroundColor:['rgba(239,68,68,0.7)','rgba(245,158,11,0.7)','rgba(16,185,129,0.7)','rgba(59,130,246,0.7)','rgba(6,182,212,0.7)','rgba(139,92,246,0.7)','rgba(236,72,153,0.7)','rgba(251,191,36,0.7)'],borderRadius:6}]},options:{responsive:true,maintainAspectRatio:false,plugins:{legend:{display:false}},scales:{x:{ticks:{color:'#94a3b8',font:{size:10}},grid:{display:false}},y:{min:10,max:18,title:{display:true,text:'Mean Δ_MC',color:'#94a3b8'},ticks:{color:'#64748b'},grid:{color:'rgba(255,255,255,0.05)'}}}}});}
568
 
569
// renderSafetyCharts: draws a grouped bar chart on the element with id 'chartSafetyGap'.
// - Works on a sorted copy of `models` ([...models]) so the original array order is untouched;
//   the copy is ordered by descending baseline gap (bl.ma - bl.er).
// - Two datasets per model: baseline `bl.ma` labelled 'MA (Declarative)' and baseline `bl.er`
//   labelled 'ER (Procedural)'; the y axis is pinned to the 0..1 range.
// - The tooltip's afterBody callback appends 'Gap: ' followed by (bl.ma - bl.er) to 3 decimals,
//   indexing into the sorted copy so the gap matches the hovered bar.
// Relies on `models` and `Chart` being defined elsewhere on the page.
+ function renderSafetyCharts(){const s=[...models].sort((a,b)=>(b.bl.ma-b.bl.er)-(a.bl.ma-a.bl.er));new Chart(document.getElementById('chartSafetyGap'),{type:'bar',data:{labels:s.map(m=>m.name),datasets:[{label:'MA (Declarative)',data:s.map(m=>m.bl.ma),backgroundColor:'rgba(245,158,11,0.7)',borderRadius:4},{label:'ER (Procedural)',data:s.map(m=>m.bl.er),backgroundColor:'rgba(239,68,68,0.7)',borderRadius:4}]},options:{responsive:true,maintainAspectRatio:false,plugins:{legend:{labels:{color:'#94a3b8'}},tooltip:{callbacks:{afterBody:function(c){const i=c[0].dataIndex;return 'Gap: '+(s[i].bl.ma-s[i].bl.er).toFixed(3);}}}},scales:{x:{ticks:{color:'#94a3b8',font:{size:11}},grid:{display:false}},y:{min:0,max:1,ticks:{color:'#64748b'},grid:{color:'rgba(255,255,255,0.05)'}}}}});}
570
+
571
+ // Initialize
572
+ renderBaseline();renderMetacog();renderDelta();renderGapViz();renderLeaderboardChart();
573
+ </script>
574
+ </body>
575
+ </html>