Spaces:
Sleeping
Sleeping
<!doctype html>
<html lang="en">
<head>
<!-- Encoding is declared first so the non-ASCII text below (dashes, emoji) decodes correctly. -->
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>SocraticEnv — Model Leaderboard</title>
<style>
/*
 * Leaderboard page styles (dark theme).
 * Colours that recur are declared once as custom properties and referenced by
 * role; one-off colours stay inline next to the rule that uses them.
 */
:root {
  --page:#0d1117; --panel:#161b22; --raised:#21262d; --tint:#13111e;
  --line:#30363d; --ink:#e6edf3; --muted:#8b949e; --faint:#484f58;
  --accent:#7c3aed; --accent-soft:#a855f7;
  --good:#3fb950; --good-bg:#1a3a2a;
  --warn:#d29922; --warn-bg:#332d1a;
  --bad:#f85149; --bad-bg:#3a1a1a;
}

/* Reset and page base */
* { box-sizing:border-box; margin:0; padding:0; }
body { min-height:100vh; background:var(--page); color:var(--ink); font-family:'Segoe UI',system-ui,sans-serif; }

/* Top banner */
.header { display:flex; align-items:center; justify-content:space-between; padding:16px 32px; background:var(--panel); border-bottom:1px solid var(--line); }
.header-left { display:flex; align-items:center; gap:12px; }
.logo { display:flex; align-items:center; justify-content:center; width:36px; height:36px; border-radius:8px; background:linear-gradient(135deg,var(--accent),var(--accent-soft)); font-size:18px; }
.header h1 { font-size:18px; font-weight:600; }
.header p { margin-top:2px; font-size:12px; color:var(--muted); }

/* Navigation pills */
.nav-links { display:flex; gap:8px; }
.nav-link { padding:6px 14px; border:1px solid var(--line); border-radius:8px; background:var(--raised); color:var(--muted); font-size:12px; font-weight:600; text-decoration:none; transition:all 0.2s; }
.nav-link:hover { color:var(--ink); border-color:var(--accent); }
.nav-link.active { color:var(--accent-soft); border-color:var(--accent); background:var(--tint); }

/* Page frame */
.container { max-width:1000px; margin:0 auto; padding:32px 24px; }
.page-title { margin-bottom:6px; font-size:24px; font-weight:700; }
.page-sub { margin-bottom:28px; font-size:13px; color:var(--muted); }

/* Run panel */
.run-panel { margin-bottom:28px; padding:20px; border:1px solid var(--line); border-radius:12px; background:var(--panel); }
.run-title { margin-bottom:14px; font-size:14px; font-weight:600; color:var(--ink); }
.run-row { display:flex; align-items:center; gap:10px; }
.run-input { flex:1; padding:9px 14px; border:1px solid var(--line); border-radius:8px; background:var(--page); color:var(--ink); font-size:13px; font-family:inherit; }
.run-input:focus { outline:none; border-color:var(--accent); }
.run-input::placeholder { color:var(--faint); }

/* Buttons (base + primary; the secondary variant is declared at the end) */
.btn { padding:9px 18px; border:none; border-radius:8px; font-size:13px; font-weight:600; white-space:nowrap; cursor:pointer; transition:all 0.2s; }
.btn-primary { background:var(--accent); color:white; }
.btn-primary:hover { background:#6d28d9; }
.btn-primary:disabled { background:#3d2070; color:#8b6bb5; cursor:not-allowed; }

/* Status line and spinner (the spinner stays hidden until the script shows it) */
.run-status { display:flex; align-items:center; gap:8px; min-height:20px; margin-top:12px; font-size:12px; color:var(--muted); }
.spinner { display:none; width:14px; height:14px; border:2px solid var(--line); border-top-color:var(--accent); border-radius:50%; animation:spin 0.8s linear infinite; }
@keyframes spin { to { transform:rotate(360deg); } }

/* Stats row */
.stats-row { display:grid; grid-template-columns:repeat(3,1fr); gap:12px; margin-bottom:24px; }
.stat-card { padding:16px; border:1px solid var(--line); border-radius:10px; background:var(--panel); text-align:center; }
.stat-val { font-size:28px; font-weight:700; color:var(--accent); }
.stat-lbl { margin-top:4px; font-size:11px; color:var(--muted); }

/* Table: header and rows share one column template so cells line up */
.table-wrap { overflow:hidden; border:1px solid var(--line); border-radius:12px; background:var(--panel); }
.table-header,
.table-row { display:grid; grid-template-columns:40px 1fr 100px 100px 100px 110px 140px; }
.table-header { padding:10px 16px; border-bottom:1px solid var(--line); background:var(--page); color:var(--muted); font-size:10px; font-weight:600; letter-spacing:0.8px; text-transform:uppercase; }
.table-row { align-items:center; padding:14px 16px; border-bottom:1px solid var(--raised); transition:background 0.15s; }
.table-row:last-child { border-bottom:none; }
.table-row:hover { background:#1c2128; }
/* Declared after :hover on purpose: the leading row keeps its tint while hovered. */
.table-row.top { background:var(--tint); }

/* Rank badges */
.rank { font-size:14px; font-weight:700; color:var(--muted); }
.rank.gold { color:#f59e0b; }
.rank.silver { color:#94a3b8; }
.rank.bronze { color:#cd7f32; }

/* Model cell */
.model-name { font-size:13px; font-weight:600; color:var(--ink); }
.model-time { margin-top:2px; font-size:10px; color:var(--faint); }

/* Score pills */
.score-cell { text-align:center; }
.score-val { display:inline-block; padding:3px 10px; border-radius:6px; font-size:13px; font-weight:600; }
.score-high { background:var(--good-bg); color:var(--good); }
.score-mid { background:var(--warn-bg); color:var(--warn); }
.score-low { background:var(--bad-bg); color:var(--bad); }
.overall-val { font-size:15px; font-weight:700; text-align:center; }

/* Progress bar and per-row delete button */
.bar-wrap { display:flex; align-items:center; gap:6px; }
.bar-bg { flex:1; height:6px; overflow:hidden; border-radius:3px; background:var(--raised); }
.bar-fill { height:100%; border-radius:3px; transition:width 0.6s ease; }
.delete-btn { padding:4px 8px; border:none; border-radius:4px; background:none; color:var(--faint); font-size:12px; cursor:pointer; transition:all 0.2s; }
.delete-btn:hover { color:var(--bad); background:var(--bad-bg); }

/* Empty state */
.empty { padding:48px 24px; color:var(--muted); text-align:center; }
.empty-icon { margin-bottom:12px; font-size:40px; opacity:0.3; }
.empty-title { margin-bottom:6px; font-size:15px; font-weight:600; }
.empty-sub { font-size:12px; }

/* Seed panel */
.seed-panel { display:flex; align-items:center; justify-content:space-between; gap:16px; margin-bottom:20px; padding:16px 20px; border:1px solid var(--line); border-radius:12px; background:var(--panel); }
.seed-text { font-size:12px; color:var(--muted); }
.seed-text strong { color:var(--ink); }
/* Must follow .btn so its border replaces the base "border:none". */
.btn-secondary { background:var(--raised); color:var(--ink); border:1px solid var(--line); }
.btn-secondary:hover { background:var(--line); }
</style>
</head>
<body>
<!-- Site banner: project identity on the left, primary navigation on the right. -->
<header class="header">
  <div class="header-left">
    <!-- NOTE(review): "π" looks like a mis-decoded emoji; restore the intended logo glyph. -->
    <div class="logo" aria-hidden="true">π</div>
    <div>
      <h1>SocraticEnv</h1>
      <p>OpenEnv Hackathon · Meta × PyTorch × Scaler</p>
    </div>
  </div>
  <nav class="nav-links" aria-label="Primary">
    <a href="/ui" class="nav-link">Live Demo</a>
    <a href="/leaderboard" class="nav-link active" aria-current="page">Leaderboard</a>
    <a href="/docs" class="nav-link">API Docs</a>
  </nav>
</header>
<!-- Page content: intro, seed prompt, evaluation form, summary stats and the ranking table. -->
<main class="container">
  <h2 class="page-title">Model Leaderboard</h2>
  <p class="page-sub">Compare AI models on Socratic reasoning ability across all 3 tasks. Which model thinks best under pressure?</p>

  <!-- Seed prompt. Hidden via inline style because the script toggles style.display directly. -->
  <div class="seed-panel" id="seedPanel" style="display:none">
    <div class="seed-text">No entries yet. <strong>Seed with baseline scores</strong> to populate the leaderboard with known model performance.</div>
    <button type="button" class="btn btn-secondary" onclick="seedBaseline()">Seed Baseline Data</button>
  </div>

  <!-- Run evaluation panel -->
  <div class="run-panel">
    <div class="run-title">Run a new model evaluation</div>
    <div class="run-row">
      <!-- The placeholder is only a hint; aria-label gives the field its accessible name. -->
      <input class="run-input" id="modelName" type="text" autocomplete="off" placeholder="Enter a display name e.g. Llama 3.1 8B, GPT-4o, Mistral 7B..." aria-label="Model display name">
      <button type="button" class="btn btn-primary" id="runBtn" onclick="runEval()">Run Evaluation</button>
    </div>
    <!-- role="status" makes the progress and result messages written by the script audible to screen readers. -->
    <div class="run-status" id="runStatus" role="status">
      <div class="spinner" id="spinner" aria-hidden="true"></div>
      <span id="statusText">Enter a model name and click Run to benchmark the current model against all 3 tasks.</span>
    </div>
  </div>

  <!-- Stats: values are filled in by the script; "—" is the no-data placeholder. -->
  <div class="stats-row">
    <div class="stat-card">
      <div class="stat-val" id="statModels">0</div>
      <div class="stat-lbl">Models evaluated</div>
    </div>
    <div class="stat-card">
      <div class="stat-val" id="statBest">—</div>
      <div class="stat-lbl">Best overall score</div>
    </div>
    <div class="stat-card">
      <div class="stat-val" id="statHardest">—</div>
      <div class="stat-lbl">Hardest task avg</div>
    </div>
  </div>

  <!-- Table: the header is static; the script replaces the contents of #tableBody. -->
  <div class="table-wrap">
    <div class="table-header">
      <div>Rank</div>
      <div>Model</div>
      <div>Easy</div>
      <div>Medium</div>
      <div>Hard</div>
      <div>Overall</div>
      <div>Progress</div>
    </div>
    <div id="tableBody">
      <div class="empty">
        <!-- NOTE(review): "π" looks like a mis-decoded emoji; restore the intended icon. -->
        <div class="empty-icon" aria-hidden="true">π</div>
        <div class="empty-title">No models evaluated yet</div>
        <div class="empty-sub">Run an evaluation above to add the first entry</div>
      </div>
    </div>
  </div>
</main>
<script>
// Base URL for every API call: the origin this page was loaded from.
const API = window.location.origin;

/**
 * Fetch the leaderboard and refresh the table, the stat cards and the seed panel.
 *
 * The seed panel is shown only when the server reports zero entries.
 * Any failure (network error, non-2xx status, unparsable body) is logged to the
 * console and leaves the current view untouched.
 */
async function loadLeaderboard() {
  try {
    const response = await fetch(`${API}/leaderboard`);
    if (!response.ok) {
      throw new Error(`GET /leaderboard failed with HTTP ${response.status}`);
    }
    const data = await response.json();
    // Tolerate a payload without an `entries` array instead of half-rendering and then throwing.
    const entries = data && Array.isArray(data.entries) ? data.entries : [];
    renderTable(entries);
    updateStats(entries);
    document.getElementById('seedPanel').style.display = entries.length === 0 ? 'flex' : 'none';
  } catch (err) {
    console.error(err);
  }
}
/**
 * Pick the CSS class for a score pill.
 * @param {number} s score to classify
 * @returns {string} 'score-high' at 0.7 and above, 'score-mid' at 0.5 and above, otherwise 'score-low'
 */
function scoreClass(s) {
  if (s >= 0.7) {
    return 'score-high';
  }
  if (s >= 0.5) {
    return 'score-mid';
  }
  return 'score-low';
}
/**
 * Pick the hex colour used for an overall score and its progress bar.
 * @param {number} s score to classify
 * @returns {string} green at 0.7 and above, amber at 0.5 and above, otherwise red
 */
function overallColor(s) {
  if (s >= 0.7) {
    return '#3fb950';
  }
  if (s >= 0.5) {
    return '#d29922';
  }
  return '#f85149';
}
/**
 * Build the rank badge markup for a row.
 *
 * The first three positions get a medal emoji with a matching colour class;
 * every later position shows its 1-based number. The medal glyphs had been
 * mis-decoded into "π₯" and are restored here.
 *
 * @param {number} i zero-based position of the row
 * @returns {string} HTML for a `<span class="rank …">`
 */
function rankLabel(i) {
  const medals = [
    ['gold', '🥇'],
    ['silver', '🥈'],
    ['bronze', '🥉'],
  ];
  const medal = medals[i];
  if (medal) {
    return `<span class="rank ${medal[0]}">${medal[1]}</span>`;
  }
  return `<span class="rank">${i + 1}</span>`;
}
/** Escape text so it can be interpolated into HTML content or a quoted attribute. */
function escapeHtml(value) {
  const replacements = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  return String(value).replace(/[&<>"']/g, (ch) => replacements[ch]);
}

/**
 * Render the leaderboard rows into #tableBody, or the empty state when there are none.
 *
 * Model names and timestamps come from the server and are user-supplied, so they
 * are HTML-escaped before being written through innerHTML. Delete buttons are
 * wired with addEventListener rather than an inline onclick string, so a name
 * containing quotes cannot break (or inject into) the handler.
 *
 * @param {Array<Object>} entries rows to show, in display order; each row is read for
 *   model_name, timestamp, factual_recall, socratic_dialogue, misconception_trap and overall
 */
function renderTable(entries) {
  const body = document.getElementById('tableBody');
  if (!entries || entries.length === 0) {
    // NOTE(review): "π" looks like a mis-decoded emoji; restore the intended icon.
    body.innerHTML = `
<div class="empty">
<div class="empty-icon">π</div>
<div class="empty-title">No models evaluated yet</div>
<div class="empty-sub">Run an evaluation above to add the first entry</div>
</div>`;
    return;
  }

  const isScore = (v) => typeof v === 'number' && Number.isFinite(v);
  // A missing or non-numeric score renders as a dash instead of throwing and blanking the table.
  const scoreText = (v) => (isScore(v) ? v.toFixed(3) : '—');
  // Keep the bar inside its track even if a score falls outside 0..1.
  const barWidth = (v) => (isScore(v) ? Math.max(0, Math.min(100, v * 100)) : 0);
  const pill = (v) => `
<div class="score-cell">
<span class="score-val ${scoreClass(v)}">${scoreText(v)}</span>
</div>`;

  body.innerHTML = entries.map((e, i) => {
    const safeName = escapeHtml(e.model_name);
    return `
<div class="table-row ${i === 0 ? 'top' : ''}">
<div>${rankLabel(i)}</div>
<div>
<div class="model-name">${safeName}</div>
<div class="model-time">${escapeHtml(e.timestamp || '')}</div>
</div>${pill(e.factual_recall)}${pill(e.socratic_dialogue)}${pill(e.misconception_trap)}
<div class="overall-val" style="color:${overallColor(e.overall)}">${scoreText(e.overall)}</div>
<div>
<div class="bar-wrap">
<div class="bar-bg">
<div class="bar-fill" style="width:${barWidth(e.overall)}%;background:${overallColor(e.overall)}"></div>
</div>
<button type="button" class="delete-btn" aria-label="Remove ${safeName} from leaderboard">✕</button>
</div>
</div>
</div>`;
  }).join('');

  // Buttons appear in the same order as `entries`, so the index pairs each one with its row.
  body.querySelectorAll('.delete-btn').forEach((btn, i) => {
    btn.addEventListener('click', () => deleteEntry(entries[i].model_name));
  });
}
/**
 * Refresh the three stat cards from the current leaderboard entries.
 *
 * With no entries the "best" and "hardest" cards are reset to a dash, so values
 * from a previous render do not linger after the last entry is deleted.
 *
 * @param {Array<Object>} entries leaderboard rows; anything that is not an array counts as empty
 */
function updateStats(entries) {
  const rows = Array.isArray(entries) ? entries : [];
  const setText = (id, text) => {
    document.getElementById(id).textContent = text;
  };

  setText('statModels', String(rows.length));
  if (rows.length === 0) {
    setText('statBest', '—');
    setText('statHardest', '—');
    return;
  }

  // NOTE(review): treats the first row as the best one — assumes the API returns rows sorted best-first; confirm.
  setText('statBest', rows[0].overall.toFixed(3));
  const hardTotal = rows.reduce((sum, row) => sum + row.misconception_trap, 0);
  setText('statHardest', (hardTotal / rows.length).toFixed(3));
}
/**
 * Run the benchmark for the name typed into #modelName and report the outcome in the status line.
 *
 * POSTs `{ model_name }` to `/leaderboard/run`. While the request is in flight the
 * Run button is disabled and the spinner is shown; both are restored afterwards
 * whatever the outcome. On success the input is cleared and the leaderboard reloaded.
 *
 * The status glyphs had been mis-decoded. The warning sign is recoverable from the
 * remaining bytes; the cross and check mark are chosen from context.
 */
async function runEval() {
  const input = document.getElementById('modelName');
  const statusText = document.getElementById('statusText');
  const name = input.value.trim();
  if (!name) {
    statusText.textContent = '⚠️ Please enter a model name first.';
    return;
  }

  const btn = document.getElementById('runBtn');
  const spinner = document.getElementById('spinner');
  btn.disabled = true;
  spinner.style.display = 'block';
  statusText.textContent = `Running ${name} against all 3 tasks... this takes ~30 seconds.`;

  try {
    const response = await fetch(`${API}/leaderboard/run`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ model_name: name }),
    });
    // Parse defensively so a non-JSON error page does not hide the HTTP status.
    const data = await response.json().catch(() => null);

    if (data && data.error) {
      statusText.textContent = `❌ Error: ${data.error}`;
    } else if (!response.ok || !data || typeof data.overall !== 'number') {
      statusText.textContent = `❌ Error: unexpected response (HTTP ${response.status})`;
    } else {
      statusText.textContent = `✅ Done! ${name} scored ${data.overall.toFixed(3)} overall.`;
      input.value = '';
      await loadLeaderboard();
    }
  } catch (e) {
    statusText.textContent = `❌ Failed: ${e.message}`;
  } finally {
    btn.disabled = false;
    spinner.style.display = 'none';
  }
}
/**
 * Ask for confirmation, then delete one leaderboard entry and reload the table.
 *
 * A failed request is logged rather than left as an unhandled promise rejection,
 * and the table is reloaded either way so the view matches the server.
 *
 * @param {string} modelName name of the entry to remove; URL-encoded into the request path
 */
async function deleteEntry(modelName) {
  if (!confirm(`Remove ${modelName} from leaderboard?`)) return;
  try {
    const response = await fetch(`${API}/leaderboard/${encodeURIComponent(modelName)}`, { method: 'DELETE' });
    if (!response.ok) {
      console.error(`DELETE for "${modelName}" failed with HTTP ${response.status}`);
    }
  } catch (err) {
    console.error(err);
  }
  await loadLeaderboard();
}
/**
 * Populate an empty leaderboard with two hard-coded baseline entries, then reload it.
 *
 * Entries are POSTed one at a time, in order. A failed POST is logged and the
 * remaining entries are still attempted, so one error neither aborts the seeding
 * nor skips the final reload. The dash in the timestamps had been mis-decoded
 * and is restored here.
 */
async function seedBaseline() {
  const baseline = [
    { model_name: "Llama 3.1 8B (baseline)", factual_recall: 0.71, socratic_dialogue: 0.68, misconception_trap: 0.58, overall: 0.657, timestamp: "Baseline — 2026-04-06" },
    { model_name: "Random agent", factual_recall: 0.18, socratic_dialogue: 0.22, misconception_trap: 0.10, overall: 0.167, timestamp: "Baseline — 2026-04-06" },
  ];
  for (const entry of baseline) {
    try {
      const response = await fetch(`${API}/leaderboard`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(entry),
      });
      if (!response.ok) {
        console.error(`Seeding "${entry.model_name}" failed with HTTP ${response.status}`);
      }
    } catch (err) {
      console.error(err);
    }
  }
  await loadLeaderboard();
}
// Initial render: fetch the leaderboard as soon as this script runs.
// `void` marks the returned promise as deliberately not awaited.
void loadLeaderboard();
</script>
</body>
</html>