<!--
socratic-env / static/leaderboard.html
Developer-Amar's picture
Initial Commit
519736d
-->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
<title>SocraticEnv — Model Leaderboard</title>
<style>
* { margin:0; padding:0; box-sizing:border-box; }
body { font-family:'Segoe UI',system-ui,sans-serif; background:#0d1117; color:#e6edf3; min-height:100vh; }
.header {
background:#161b22; border-bottom:1px solid #30363d;
padding:16px 32px; display:flex; align-items:center;
justify-content:space-between;
}
.header-left { display:flex; align-items:center; gap:12px; }
.logo {
width:36px; height:36px;
background:linear-gradient(135deg,#7c3aed,#a855f7);
border-radius:8px; display:flex; align-items:center;
justify-content:center; font-size:18px;
}
.header h1 { font-size:18px; font-weight:600; }
.header p { font-size:12px; color:#8b949e; margin-top:2px; }
.nav-links { display:flex; gap:8px; }
.nav-link {
padding:6px 14px; border-radius:8px; font-size:12px;
font-weight:600; text-decoration:none; border:1px solid #30363d;
color:#8b949e; background:#21262d; transition:all 0.2s;
}
.nav-link:hover { color:#e6edf3; border-color:#7c3aed; }
.nav-link.active { color:#a855f7; border-color:#7c3aed; background:#13111e; }
.container { max-width:1000px; margin:0 auto; padding:32px 24px; }
.page-title { font-size:24px; font-weight:700; margin-bottom:6px; }
.page-sub { font-size:13px; color:#8b949e; margin-bottom:28px; }
/* Run panel */
.run-panel {
background:#161b22; border:1px solid #30363d;
border-radius:12px; padding:20px; margin-bottom:28px;
}
.run-title { font-size:14px; font-weight:600; margin-bottom:14px; color:#e6edf3; }
.run-row { display:flex; gap:10px; align-items:center; }
.run-input {
flex:1; background:#0d1117; border:1px solid #30363d;
border-radius:8px; padding:9px 14px; color:#e6edf3;
font-size:13px; font-family:inherit;
}
.run-input:focus { outline:none; border-color:#7c3aed; }
.run-input::placeholder { color:#484f58; }
.btn {
padding:9px 18px; border-radius:8px; font-size:13px;
font-weight:600; border:none; cursor:pointer;
transition:all 0.2s; white-space:nowrap;
}
.btn-primary { background:#7c3aed; color:white; }
.btn-primary:hover { background:#6d28d9; }
.btn-primary:disabled { background:#3d2070; color:#8b6bb5; cursor:not-allowed; }
.run-status {
margin-top:12px; font-size:12px; color:#8b949e;
min-height:20px; display:flex; align-items:center; gap:8px;
}
.spinner {
width:14px; height:14px; border:2px solid #30363d;
border-top-color:#7c3aed; border-radius:50%;
animation:spin 0.8s linear infinite; display:none;
}
@keyframes spin { to { transform:rotate(360deg); } }
/* Stats row */
.stats-row { display:grid; grid-template-columns:repeat(3,1fr); gap:12px; margin-bottom:24px; }
.stat-card {
background:#161b22; border:1px solid #30363d;
border-radius:10px; padding:16px; text-align:center;
}
.stat-val { font-size:28px; font-weight:700; color:#7c3aed; }
.stat-lbl { font-size:11px; color:#8b949e; margin-top:4px; }
/* Table */
.table-wrap {
background:#161b22; border:1px solid #30363d;
border-radius:12px; overflow:hidden;
}
.table-header {
display:grid;
grid-template-columns:40px 1fr 100px 100px 100px 110px 140px;
padding:10px 16px; background:#0d1117;
border-bottom:1px solid #30363d;
font-size:10px; font-weight:600; color:#8b949e;
letter-spacing:0.8px; text-transform:uppercase;
}
.table-row {
display:grid;
grid-template-columns:40px 1fr 100px 100px 100px 110px 140px;
padding:14px 16px; border-bottom:1px solid #21262d;
align-items:center; transition:background 0.15s;
}
.table-row:last-child { border-bottom:none; }
.table-row:hover { background:#1c2128; }
.table-row.top { background:#13111e; }
.rank { font-size:14px; font-weight:700; color:#8b949e; }
.rank.gold { color:#f59e0b; }
.rank.silver { color:#94a3b8; }
.rank.bronze { color:#cd7f32; }
.model-name { font-size:13px; font-weight:600; color:#e6edf3; }
.model-time { font-size:10px; color:#484f58; margin-top:2px; }
.score-cell { text-align:center; }
.score-val {
font-size:13px; font-weight:600;
padding:3px 10px; border-radius:6px; display:inline-block;
}
.score-high { background:#1a3a2a; color:#3fb950; }
.score-mid { background:#332d1a; color:#d29922; }
.score-low { background:#3a1a1a; color:#f85149; }
.overall-val {
font-size:15px; font-weight:700; text-align:center;
}
.bar-wrap { display:flex; align-items:center; gap:6px; }
.bar-bg { flex:1; height:6px; background:#21262d; border-radius:3px; overflow:hidden; }
.bar-fill { height:100%; border-radius:3px; transition:width 0.6s ease; }
.delete-btn {
background:none; border:none; color:#484f58;
cursor:pointer; font-size:12px; padding:4px 8px;
border-radius:4px; transition:all 0.2s;
}
.delete-btn:hover { color:#f85149; background:#3a1a1a; }
/* Empty state */
.empty {
text-align:center; padding:48px 24px;
color:#8b949e;
}
.empty-icon { font-size:40px; opacity:0.3; margin-bottom:12px; }
.empty-title { font-size:15px; font-weight:600; margin-bottom:6px; }
.empty-sub { font-size:12px; }
/* Seed panel */
.seed-panel {
background:#161b22; border:1px solid #30363d;
border-radius:12px; padding:16px 20px;
margin-bottom:20px; display:flex;
align-items:center; justify-content:space-between;
gap:16px;
}
.seed-text { font-size:12px; color:#8b949e; }
.seed-text strong { color:#e6edf3; }
.btn-secondary {
background:#21262d; color:#e6edf3;
border:1px solid #30363d;
}
.btn-secondary:hover { background:#30363d; }
</style>
</head>
<body>
<!-- Page banner: branding on the left, primary site navigation on the right.
     The "header" / "nav-links" classes are kept so the existing CSS still applies. -->
<header class="header">
  <div class="header-left">
    <!-- Decorative logo glyph; hidden from assistive tech because the h1 names the site. -->
    <div class="logo" aria-hidden="true">🎓</div>
    <div>
      <h1>SocraticEnv</h1>
      <p>OpenEnv Hackathon · Meta × PyTorch × Scaler</p>
    </div>
  </div>
  <nav class="nav-links" aria-label="Primary">
    <a href="/ui" class="nav-link">Live Demo</a>
    <!-- aria-current mirrors the visual "active" state for screen readers. -->
    <a href="/leaderboard" class="nav-link active" aria-current="page">Leaderboard</a>
    <a href="/docs" class="nav-link">API Docs</a>
  </nav>
</header>
<div class="container">
<div class="page-title">Model Leaderboard</div>
<div class="page-sub">Compare AI models on Socratic reasoning ability across all 3 tasks. Which model thinks best under pressure?</div>
<!-- Seed with default data -->
<div class="seed-panel" id="seedPanel" style="display:none">
<div class="seed-text">No entries yet. <strong>Seed with baseline scores</strong> to populate the leaderboard with known model performance.</div>
<button class="btn btn-secondary" onclick="seedBaseline()">Seed Baseline Data</button>
</div>
<!-- Run evaluation panel -->
<!-- Run panel: name input + trigger button, followed by a status line that the
     script updates while an evaluation is in flight. -->
<div class="run-panel">
  <div class="run-title">Run a new model evaluation</div>
  <div class="run-row">
    <!-- The placeholder is only a hint; aria-label gives the field an accessible name. -->
    <input class="run-input" id="modelName" type="text" autocomplete="off" placeholder="Enter a display name e.g. Llama 3.1 8B, GPT-4o, Mistral 7B..." aria-label="Model display name">
    <button type="button" class="btn btn-primary" id="runBtn" onclick="runEval()">Run Evaluation</button>
  </div>
  <!-- role="status" makes the asynchronously updated text a polite live region. -->
  <div class="run-status" id="runStatus" role="status">
    <div class="spinner" id="spinner" aria-hidden="true"></div>
    <span id="statusText">Enter a model name and click Run to benchmark the current model against all 3 tasks.</span>
  </div>
</div>
<!-- Stats -->
<div class="stats-row">
<div class="stat-card">
<div class="stat-val" id="statModels">0</div>
<div class="stat-lbl">Models evaluated</div>
</div>
<div class="stat-card">
<div class="stat-val" id="statBest">—</div>
<div class="stat-lbl">Best overall score</div>
</div>
<div class="stat-card">
<div class="stat-val" id="statHardest">—</div>
<div class="stat-lbl">Hardest task avg</div>
</div>
</div>
<!-- Table -->
<div class="table-wrap">
<div class="table-header">
<div>Rank</div>
<div>Model</div>
<div>Easy</div>
<div>Medium</div>
<div>Hard</div>
<div>Overall</div>
<div>Progress</div>
</div>
<div id="tableBody">
<div class="empty">
<div class="empty-icon">🏆</div>
<div class="empty-title">No models evaluated yet</div>
<div class="empty-sub">Run an evaluation above to add the first entry</div>
</div>
</div>
</div>
</div>
<script>
// Base URL prefixed to every API request: the origin this page was loaded from.
const API = window.location.origin;
/**
 * Fetch the leaderboard from `${API}/leaderboard` and refresh the table,
 * the stats cards and the visibility of the seed panel.
 *
 * Never rejects: any network, HTTP or parsing failure is logged to the
 * console and the page is left in its previous state.
 */
async function loadLeaderboard() {
  try {
    const r = await fetch(`${API}/leaderboard`);
    // fetch() only rejects on network failure; HTTP errors must be checked explicitly.
    if (!r.ok) {
      throw new Error(`GET /leaderboard failed with HTTP ${r.status}`);
    }
    const data = await r.json();
    // Treat a payload without an `entries` array as an empty leaderboard
    // instead of letting the renderers throw on `undefined`.
    const entries = Array.isArray(data && data.entries) ? data.entries : [];
    renderTable(entries);
    updateStats(entries);
    // The seed panel is only offered while the leaderboard is empty.
    document.getElementById('seedPanel').style.display = entries.length === 0 ? 'flex' : 'none';
  } catch (e) {
    console.error(e);
  }
}
/**
 * Pick the badge CSS class for a score.
 * @param {number} s Score to classify.
 * @returns {string} 'score-high' for s >= 0.7, 'score-mid' for s >= 0.5, otherwise 'score-low'.
 */
function scoreClass(s) {
  if (s >= 0.7) {
    return 'score-high';
  }
  if (s >= 0.5) {
    return 'score-mid';
  }
  return 'score-low';
}
/**
 * Pick the hex colour used for an overall score and its progress bar.
 * @param {number} s Score to classify.
 * @returns {string} Green for s >= 0.7, amber for s >= 0.5, otherwise red.
 */
function overallColor(s) {
  if (s >= 0.7) {
    return '#3fb950';
  }
  if (s >= 0.5) {
    return '#d29922';
  }
  return '#f85149';
}
/**
 * Build the HTML for the rank cell of a leaderboard row.
 * @param {number} i Zero-based position of the row in the list.
 * @returns {string} A medal span for the first three positions, otherwise
 *   a span holding the one-based rank number.
 */
function rankLabel(i) {
  // The medal glyphs were stored as mis-decoded UTF-8 and rendered as garbage.
  if (i === 0) return '<span class="rank gold">🥇</span>';
  if (i === 1) return '<span class="rank silver">🥈</span>';
  if (i === 2) return '<span class="rank bronze">🥉</span>';
  return `<span class="rank">${i + 1}</span>`;
}
/**
 * Escape a value so it can be placed in HTML text content or inside a
 * quoted attribute value without being interpreted as markup.
 * @param {*} value Any value; null/undefined become an empty string.
 * @returns {string} The escaped string.
 */
function escapeHtml(value) {
  const replacements = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  return String(value == null ? '' : value).replace(/[&<>"']/g, (ch) => replacements[ch]);
}

/**
 * Render the leaderboard rows into #tableBody, or the empty-state
 * placeholder when there is nothing to show.
 * @param {Array<Object>} entries Leaderboard entries in display order. Each
 *   entry is read for model_name, timestamp, factual_recall,
 *   socratic_dialogue, misconception_trap and overall.
 */
function renderTable(entries) {
  const body = document.getElementById('tableBody');
  if (!entries || entries.length === 0) {
    body.innerHTML = `
<div class="empty">
<div class="empty-icon">🏆</div>
<div class="empty-title">No models evaluated yet</div>
<div class="empty-sub">Run an evaluation above to add the first entry</div>
</div>`;
    return;
  }
  // Coerce before formatting so one malformed score cannot throw and blank the whole table.
  const fmt = (v) => Number(v).toFixed(3);
  body.innerHTML = entries.map((e, i) => {
    // model_name and timestamp are user-supplied; escape them before they reach innerHTML.
    const safeName = escapeHtml(e.model_name);
    const safeTime = escapeHtml(e.timestamp || '');
    const colour = overallColor(e.overall);
    return `
<div class="table-row ${i === 0 ? 'top' : ''}">
<div>${rankLabel(i)}</div>
<div>
<div class="model-name">${safeName}</div>
<div class="model-time">${safeTime}</div>
</div>
<div class="score-cell">
<span class="score-val ${scoreClass(e.factual_recall)}">${fmt(e.factual_recall)}</span>
</div>
<div class="score-cell">
<span class="score-val ${scoreClass(e.socratic_dialogue)}">${fmt(e.socratic_dialogue)}</span>
</div>
<div class="score-cell">
<span class="score-val ${scoreClass(e.misconception_trap)}">${fmt(e.misconception_trap)}</span>
</div>
<div class="overall-val" style="color:${colour}">${fmt(e.overall)}</div>
<div>
<div class="bar-wrap">
<div class="bar-bg">
<div class="bar-fill" style="width:${e.overall * 100}%;background:${colour}"></div>
</div>
<button type="button" class="delete-btn" data-model="${safeName}" onclick="deleteEntry(this.dataset.model)" aria-label="Remove ${safeName}">✕</button>
</div>
</div>
</div>`;
  }).join('');
  // The name travels in a data attribute rather than being spliced into the
  // onclick source, so quotes in a model name can neither break the handler
  // nor inject script.
}
/**
 * Refresh the three summary cards above the table.
 * @param {Array<Object>} entries Leaderboard entries; the first entry's
 *   `overall` is shown as the best score, and `misconception_trap` is
 *   averaged over all entries for the hardest-task card.
 *   NOTE(review): showing entries[0] as "best" assumes the API returns the
 *   list sorted best-first — confirm against the server.
 */
function updateStats(entries) {
  const list = Array.isArray(entries) ? entries : [];
  const bestEl = document.getElementById('statBest');
  const hardestEl = document.getElementById('statHardest');
  document.getElementById('statModels').textContent = String(list.length);
  if (list.length === 0) {
    // Reset to placeholders; otherwise stale values linger after the last entry is deleted.
    bestEl.textContent = '—';
    hardestEl.textContent = '—';
    return;
  }
  bestEl.textContent = list[0].overall.toFixed(3);
  const hardAvg = list.reduce((sum, e) => sum + e.misconception_trap, 0) / list.length;
  hardestEl.textContent = hardAvg.toFixed(3);
}
/**
 * Start an evaluation for the name typed into #modelName by POSTing to
 * `${API}/leaderboard/run`, reporting progress in the status line.
 *
 * The Run button is disabled and the spinner shown while the request is in
 * flight; both are restored afterwards whatever the outcome. On success the
 * input is cleared and the leaderboard reloaded.
 */
async function runEval() {
  const input = document.getElementById('modelName');
  const statusText = document.getElementById('statusText');
  const name = input.value.trim();
  if (!name) {
    statusText.textContent = '⚠️ Please enter a model name first.';
    return;
  }
  const btn = document.getElementById('runBtn');
  const spinner = document.getElementById('spinner');
  btn.disabled = true;
  spinner.style.display = 'block';
  statusText.textContent = `Running ${name} against all 3 tasks... this takes ~30 seconds.`;
  try {
    const r = await fetch(`${API}/leaderboard/run`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ model_name: name }),
    });
    // An error response may not carry a JSON body; don't let the parse
    // failure hide the HTTP status.
    let data = null;
    try {
      data = await r.json();
    } catch (parseErr) {
      data = null;
    }
    if (!r.ok || !data || data.error) {
      const reason = (data && data.error) || `HTTP ${r.status}`;
      statusText.textContent = `❌ Error: ${reason}`;
    } else {
      // Only format the score when the server actually returned a number.
      statusText.textContent = typeof data.overall === 'number'
        ? `✅ Done! ${name} scored ${data.overall.toFixed(3)} overall.`
        : `✅ Done! ${name} was evaluated.`;
      input.value = '';
      await loadLeaderboard();
    }
  } catch (e) {
    statusText.textContent = `❌ Failed: ${e.message}`;
  } finally {
    btn.disabled = false;
    spinner.style.display = 'none';
  }
}
/**
 * Ask for confirmation, then DELETE the named entry and reload the table.
 * @param {string} modelName Name of the leaderboard entry to remove; it is
 *   URL-encoded into the request path.
 */
async function deleteEntry(modelName) {
  if (!confirm(`Remove ${modelName} from leaderboard?`)) return;
  try {
    const r = await fetch(`${API}/leaderboard/${encodeURIComponent(modelName)}`, { method: 'DELETE' });
    // fetch() resolves on HTTP errors too, so a failed delete must be detected here.
    if (!r.ok) {
      throw new Error(`HTTP ${r.status}`);
    }
  } catch (e) {
    console.error(e);
    const statusText = document.getElementById('statusText');
    if (statusText) {
      statusText.textContent = `❌ Could not remove ${modelName}: ${e.message}`;
    }
  }
  // Reload in every case so the table reflects what the server actually holds.
  loadLeaderboard();
}
/**
 * Populate an empty leaderboard with two hard-coded baseline entries by
 * POSTing each one to `${API}/leaderboard` in turn, then reload the table.
 * Seeding stops at the first failed request and the failure is reported.
 */
async function seedBaseline() {
  const baseline = [
    { model_name: "Llama 3.1 8B (baseline)", factual_recall: 0.71, socratic_dialogue: 0.68, misconception_trap: 0.58, overall: 0.657, timestamp: "Baseline — 2026-04-06" },
    { model_name: "Random agent", factual_recall: 0.18, socratic_dialogue: 0.22, misconception_trap: 0.10, overall: 0.167, timestamp: "Baseline — 2026-04-06" },
  ];
  try {
    for (const entry of baseline) {
      const r = await fetch(`${API}/leaderboard`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(entry),
      });
      if (!r.ok) {
        throw new Error(`seeding "${entry.model_name}" failed with HTTP ${r.status}`);
      }
    }
  } catch (e) {
    console.error(e);
    const statusText = document.getElementById('statusText');
    if (statusText) {
      statusText.textContent = `❌ Failed: ${e.message}`;
    }
  }
  // Reload even after a partial failure so the table matches the server.
  loadLeaderboard();
}
// Initial load: this script sits after the page markup, so the elements
// that loadLeaderboard() writes to already exist when it runs.
loadLeaderboard();
</script>
</body>
</html>