| | <!DOCTYPE html> |
| | <html lang="en"> |
| | <head> |
| | <meta charset="UTF-8"> |
| | <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| | <title>Eval Suite Visualization</title> |
| | <script src="https://cdn.plot.ly/plotly-2.35.2.min.js" charset="utf-8"></script> |
| | <style> |
| | * { box-sizing: border-box; margin: 0; padding: 0; } |
| | body { |
| | font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; |
| | background: #f8f9fa; |
| | color: #1a1a2e; |
| | padding: 24px; |
| | } |
| | |
| | |
| | .page-header { |
| | display: flex; |
| | align-items: center; |
| | justify-content: space-between; |
| | margin-bottom: 24px; |
| | } |
| | .page-header h1 { |
| | font-size: 1.5rem; |
| | font-weight: 600; |
| | color: #1a1a2e; |
| | } |
| | .btn { |
| | padding: 8px 16px; |
| | border: 1px solid #dee2e6; |
| | border-radius: 6px; |
| | background: #fff; |
| | font-size: 0.875rem; |
| | color: #495057; |
| | cursor: pointer; |
| | transition: background 0.15s; |
| | } |
| | .btn:hover { background: #e9ecef; } |
| | .btn-primary { |
| | background: #4361ee; |
| | color: #fff; |
| | border-color: #4361ee; |
| | } |
| | .btn-primary:hover { background: #3a56d4; } |
| | .btn-sm { |
| | padding: 4px 10px; |
| | font-size: 0.75rem; |
| | } |
| | .btn-danger { color: #e63946; border-color: #e6394640; } |
| | .btn-danger:hover { background: #e6394610; } |
| | |
| | |
| | #panels-container { |
| | display: grid; |
| | grid-template-columns: repeat(2, 1fr); |
| | gap: 20px; |
| | } |
| | |
| | |
| | .panel { |
| | background: #fff; |
| | border: 1px solid #dee2e6; |
| | border-radius: 8px; |
| | overflow: hidden; |
| | } |
| | .panel-toolbar { |
| | display: flex; |
| | align-items: center; |
| | justify-content: flex-end; |
| | gap: 6px; |
| | padding: 6px 10px; |
| | border-bottom: 1px solid #dee2e6; |
| | background: #f8f9fa; |
| | } |
| | .panel-controls { |
| | padding: 16px; |
| | border-bottom: 1px solid #dee2e6; |
| | } |
| | .panel-controls.collapsed { display: none; } |
| | .controls-row { |
| | display: flex; |
| | flex-wrap: wrap; |
| | gap: 12px; |
| | align-items: flex-end; |
| | } |
| | .controls-row + .controls-row { margin-top: 12px; } |
| | .control-group { |
| | display: flex; |
| | flex-direction: column; |
| | gap: 4px; |
| | } |
| | .control-group label { |
| | font-size: 0.7rem; |
| | font-weight: 600; |
| | text-transform: uppercase; |
| | letter-spacing: 0.05em; |
| | color: #6c757d; |
| | } |
| | select { |
| | padding: 6px 10px; |
| | border: 1px solid #dee2e6; |
| | border-radius: 6px; |
| | background: #fff; |
| | font-size: 0.8rem; |
| | color: #1a1a2e; |
| | min-width: 160px; |
| | cursor: pointer; |
| | } |
| | select:focus { |
| | outline: none; |
| | border-color: #4361ee; |
| | box-shadow: 0 0 0 3px rgba(67, 97, 238, 0.15); |
| | } |
| | |
| | |
| | .models-section { |
| | margin-top: 12px; |
| | } |
| | .models-header { |
| | display: flex; |
| | align-items: center; |
| | gap: 8px; |
| | margin-bottom: 8px; |
| | } |
| | .models-header span { |
| | font-size: 0.7rem; |
| | font-weight: 600; |
| | text-transform: uppercase; |
| | letter-spacing: 0.05em; |
| | color: #6c757d; |
| | } |
| | .checkbox-grid { |
| | display: flex; |
| | flex-wrap: wrap; |
| | gap: 6px 16px; |
| | } |
| | .checkbox-item { |
| | display: flex; |
| | align-items: center; |
| | gap: 5px; |
| | cursor: pointer; |
| | font-size: 0.8rem; |
| | } |
| | .checkbox-item input[type="checkbox"] { |
| | width: 14px; |
| | height: 14px; |
| | cursor: pointer; |
| | accent-color: #4361ee; |
| | } |
| | .checkbox-item .model-name.missing { |
| | text-decoration: line-through; |
| | opacity: 0.5; |
| | cursor: help; |
| | } |
| | .checkbox-item .model-name.missing:hover { |
| | opacity: 0.8; |
| | } |
| | .model-separator { |
| | width: 100%; |
| | border-top: 1px solid #eee; |
| | margin: 4px 0; |
| | } |
| | |
| | |
| | .panel-chart-wrapper { |
| | position: relative; |
| | } |
| | .panel-chart { |
| | min-height: 100px; |
| | overflow: hidden; |
| | } |
| | .title-hover-zone { |
| | position: absolute; |
| | top: 0; |
| | left: 50px; |
| | right: 50px; |
| | height: 40px; |
| | cursor: pointer; |
| | z-index: 10; |
| | display: flex; |
| | align-items: center; |
| | justify-content: center; |
| | } |
| | .title-info-icon { |
| | position: absolute; |
| | top: 50%; |
| | transform: translateY(-50%); |
| | width: 18px; |
| | height: 18px; |
| | border-radius: 50%; |
| | background: #e9ecef; |
| | color: #495057; |
| | font-size: 11px; |
| | font-weight: 600; |
| | display: flex; |
| | align-items: center; |
| | justify-content: center; |
| | opacity: 0.6; |
| | transition: opacity 0.15s; |
| | } |
| | .title-hover-zone:hover .title-info-icon { |
| | opacity: 1; |
| | } |
| | |
| | |
| | .panel-resize-handle { |
| | height: 6px; |
| | cursor: ns-resize; |
| | background: linear-gradient(to bottom, #dee2e6 1px, transparent 1px, transparent 3px, #dee2e6 3px); |
| | background-size: 100% 4px; |
| | background-position: center; |
| | transition: background-color 0.15s; |
| | } |
| | .panel-resize-handle:hover, |
| | .panel-resize-handle.active { |
| | background-color: #e9ecef; |
| | } |
| | .loading { |
| | display: flex; |
| | align-items: center; |
| | justify-content: center; |
| | padding: 1rem 0; |
| | color: #adb5bd; |
| | font-size: 0.85rem; |
| | } |
| | |
| | |
| | .custom-tooltip { |
| | position: fixed; |
| | pointer-events: none; |
| | background: rgba(0, 0, 0, 0.85); |
| | color: #fff; |
| | padding: 8px 12px 12px; |
| | border-radius: 4px; |
| | font-size: 11px; |
| | line-height: 1.5; |
| | z-index: 9999; |
| | display: none; |
| | white-space: nowrap; |
| | } |
| | .custom-tooltip.scrollable { |
| | pointer-events: auto; |
| | overflow-y: auto; |
| | white-space: normal; |
| | min-width: 200px; |
| | max-width: 400px; |
| | } |
| | |
| | |
| | .add-panel-row { |
| | display: flex; |
| | justify-content: center; |
| | padding: 20px; |
| | } |
| | |
| | |
| | #init-loading { |
| | display: flex; |
| | align-items: center; |
| | justify-content: center; |
| | height: 300px; |
| | color: #6c757d; |
| | font-size: 1rem; |
| | } |
| | </style> |
| | </head> |
| | <body> |
| | <div class="page-header"> |
| | <h1>Eval Suite Visualization</h1> |
| | </div> |
| |
|
| | <div id="init-loading">Initializing DuckDB...</div> |
| | <div id="panels-container"></div> |
| | <div class="custom-tooltip" id="custom-tooltip"></div> |
| |
|
| | <div class="add-panel-row" id="add-panel-row" style="display:none"> |
| | <button class="btn btn-primary" id="btn-add-panel">+ Add Panel</button> |
| | </div> |
| |
|
| | <script type="module"> |
| | import * as duckdb from 'https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@1.29.0/+esm'; |
| | import jsyaml from 'https://cdn.jsdelivr.net/npm/js-yaml@4.1.0/+esm'; |
| | |
| | |
// ---- Shared module state ----------------------------------------------

// DuckDB-WASM database handle and its open connection. Both stay null until
// initDuckDB() assigns them.
let db = null;
let conn = null;

// Counter used when creating panels (its use lies outside this section).
let panelCounter = 0;

// Live panels; Panel.remove() deletes its own entry by panel id.
const panels = new Map();

// Fallback trace colours, handed out round-robin by loadModels() to every
// model without an explicit colour in the config.
const COLOR_PALETTE = [
  '#4361ee', '#e63946', '#2a9d8f', '#e9c46a',
  '#f4a261', '#264653', '#7209b7', '#06d6a0',
  '#ef476f', '#ff6b6b', '#48bfe3', '#d4a017',
  '#b5838d', '#588157', '#9d4edd', '#f77f00',
  '#3a86a7', '#8338ec', '#ff006e', '#fb5607',
];

// Remote parquet file that loadParquet() downloads and exposes as `scores`.
const PARQUET_URL = 'https://huggingface.co/datasets/ellamind/eval-scores/resolve/main/scores.parquet';

// Populated at startup: model rows (loadModels), display-name -> colour map
// (loadModels) and the parsed config.yaml (loadConfig).
let ALL_MODELS = [];
let MODEL_COLORS = {};
let CONFIG = {};
| | |
| | |
/**
 * Start DuckDB-WASM in a Web Worker and open the shared connection.
 *
 * Assigns the module-level `db` and `conn`. Rejects if the bundle cannot be
 * selected, the worker cannot be created, or instantiation/connection fails.
 */
async function initDuckDB() {
  const bundles = duckdb.getJsDelivrBundles();
  const bundle = await duckdb.selectBundle(bundles);

  // The worker is started from a Blob URL whose only statement pulls in the
  // bundle's real worker script via importScripts().
  const workerUrl = URL.createObjectURL(
    new Blob([`importScripts("${bundle.mainWorker}");`], { type: 'text/javascript' })
  );
  try {
    const worker = new Worker(workerUrl);
    const logger = new duckdb.ConsoleLogger();
    db = new duckdb.AsyncDuckDB(logger, worker);
    await db.instantiate(bundle.mainModule, bundle.pthreadWorker);
  } finally {
    // Release the Blob URL even when worker creation or instantiate() throws,
    // otherwise the object URL leaks for the lifetime of the page.
    URL.revokeObjectURL(workerUrl);
  }
  conn = await db.connect();
}
| | |
/**
 * Download the scores parquet file, register it with DuckDB under the name
 * `scores.parquet`, and expose it through the `scores` view.
 *
 * @throws {Error} when the download responds with a non-2xx status.
 */
async function loadParquet() {
  const response = await fetch(PARQUET_URL);
  if (!response.ok) {
    // Without this check an HTML error page would be registered as parquet
    // and only fail later with an unrelated-looking parser error.
    throw new Error(
      `Failed to download scores parquet: HTTP ${response.status} ${response.statusText}`
    );
  }
  const buffer = new Uint8Array(await response.arrayBuffer());
  await db.registerFileBuffer('scores.parquet', buffer);
  // OR REPLACE keeps a repeated call (e.g. a reload) from failing on the
  // already-existing view.
  await conn.query(`CREATE OR REPLACE VIEW scores AS SELECT * FROM 'scores.parquet'`);
}
| | |
| | |
/**
 * Run `sql` on the shared connection and return the result rows as plain
 * objects (each row converted with its own `toJSON()`).
 *
 * @param {string} sql - statement to execute.
 * @returns {Promise<object[]>}
 */
async function query(sql) {
  const table = await conn.query(sql);
  const toPlain = (row) => row.toJSON();
  return table.toArray().map(toPlain);
}
| | |
/**
 * Escape a string for use inside a single-quoted SQL literal by doubling
 * every single quote.
 *
 * @param {string} s
 * @returns {string}
 */
function esc(s) {
  const SINGLE_QUOTE = /'/g;
  return s.replace(SINGLE_QUOTE, "''");
}
/**
 * Render a list of strings as the comma-separated, quoted body of a SQL
 * `IN (...)` clause; each value is escaped with esc().
 *
 * @param {string[]} vals
 * @returns {string}
 */
function sqlIn(vals) {
  const quote = (v) => "'" + esc(v) + "'";
  const quoted = vals.map((v) => quote(v));
  return quoted.join(', ');
}
| | |
| | |
/**
 * Replace the options of a <select>.
 *
 * @param {HTMLSelectElement} el - select to refill; existing content is cleared.
 * @param {Array<string|{value: string, label: string}>} options - plain values
 *   (used as both value and label) or explicit value/label pairs.
 * @param {string} [selected] - value to select; applied only when it is truthy
 *   and matches one of `options`.
 */
function populateSelect(el, options, selected) {
  const valueOf = (opt) => (typeof opt === 'object' ? opt.value : opt);
  const labelOf = (opt) => (typeof opt === 'object' ? opt.label : opt);

  el.innerHTML = '';
  options.forEach((opt) => {
    const node = document.createElement('option');
    node.value = valueOf(opt);
    node.textContent = labelOf(opt);
    el.appendChild(node);
  });

  if (!selected) return;
  const isKnown = options.some((opt) => valueOf(opt) === selected);
  if (isKnown) {
    el.value = selected;
  }
}
| | |
/**
 * Format a token count with a K / M / B / T suffix.
 *
 * Values are shown with at most one decimal. K/M/B drop a trailing ".0"
 * ("2B", "1.5B"); T always keeps one decimal ("1.0T"). Rounding to a whole
 * number, as before, labelled 1.5e9 and 2.5e9 as "2B" and "3B", which
 * produced duplicate or misleading axis ticks. A value that rounds up to
 * 1000 of a unit is promoted to the next unit ("1M" instead of "1000K").
 *
 * @param {number|bigint} value - token count.
 * @returns {string} formatted label; values below 1000 are returned as-is.
 */
function formatTokens(value) {
  const units = [
    { scale: 1e12, suffix: 'T' },
    { scale: 1e9, suffix: 'B' },
    { scale: 1e6, suffix: 'M' },
    { scale: 1e3, suffix: 'K' },
  ];
  const n = Number(value);
  let i = units.findIndex((u) => n >= u.scale);
  if (i === -1) return String(value);

  let text = (n / units[i].scale).toFixed(1);
  if (i > 0 && Number(text) >= 1000) {
    // e.g. 999,960 -> "1000.0K": express it in the next larger unit instead.
    i -= 1;
    text = (n / units[i].scale).toFixed(1);
  }
  if (units[i].suffix !== 'T' && text.endsWith('.0')) {
    text = text.slice(0, -2);
  }
  return text + units[i].suffix;
}
| | |
/**
 * Pick "nice" axis tick positions (multiples of 1, 2, 5 or 10 times a power
 * of ten) covering [min, max], plus the end points when the nearest nice
 * tick is more than 30% of a step away from them.
 *
 * Ticks are generated as integer multiples of the step (no accumulated float
 * error), rounded to whole numbers when the step is >= 1 and to the step's
 * own precision otherwise, and de-duplicated. Rounding everything to whole
 * numbers, as before, collapsed small ranges such as 0..1 into repeated
 * 0 and 1 ticks.
 *
 * @param {number} min - one end of the range.
 * @param {number} max - other end of the range (order does not matter).
 * @param {number} [maxTicks=8] - target number of intervals.
 * @returns {number[]} ascending tick values; `[min]` when min === max and
 *   `[]` for non-finite input.
 */
function niceTicks(min, max, maxTicks = 8) {
  if (min === max) return [min];
  const lo = Math.min(min, max);
  const hi = Math.max(min, max);
  if (!Number.isFinite(lo) || !Number.isFinite(hi)) return [];

  const rawStep = (hi - lo) / maxTicks;
  const exponent = Math.floor(Math.log10(rawStep));
  const mag = Math.pow(10, exponent);
  const normalized = rawStep / mag;
  let step;
  if (normalized <= 1.5) step = 1 * mag;
  else if (normalized <= 3.5) step = 2 * mag;
  else if (normalized <= 7.5) step = 5 * mag;
  else step = 10 * mag;
  // Degenerate maxTicks (0, negative, NaN) gives an unusable step.
  if (!Number.isFinite(step) || step <= 0) return [lo, hi];

  // Digits after the decimal point needed to represent a multiple of `step`
  // (toFixed accepts at most 100).
  const decimals = Math.min(100, Math.max(0, -exponent));
  const snap = (v) => {
    const r = decimals === 0 ? Math.round(v) : Number(v.toFixed(decimals));
    return r === 0 ? 0 : r; // normalise -0
  };

  // The small epsilon absorbs float noise in lo/step and hi/step.
  const firstIdx = Math.ceil(lo / step - 1e-9);
  const lastIdx = Math.floor(hi / step + 1e-9);
  const ticks = [];
  for (let k = firstIdx; k <= lastIdx; k++) {
    ticks.push(snap(k * step));
  }

  if (ticks.length === 0 || ticks[0] - lo > step * 0.3) ticks.unshift(snap(lo));
  if (hi - ticks[ticks.length - 1] > step * 0.3) ticks.push(snap(hi));

  // Rounding an end point can land on its neighbour; keep each value once.
  return ticks.filter((t, i) => i === 0 || t !== ticks[i - 1]);
}
| | |
/**
 * Trailing moving average: element i is the mean of the last `w` samples
 * ending at i (fewer at the start of the series).
 *
 * Missing samples (null, undefined, NaN or other non-finite values) are left
 * out of the mean instead of being summed as 0 or turning the window into
 * NaN; a position whose own sample is missing yields `null`.
 *
 * @param {Array<number|null|undefined>} values - samples in plotting order.
 * @param {number} w - window size; values <= 1 return `values` unchanged.
 * @returns {Array<number|null>} smoothed series of the same length.
 */
function movingAverage(values, w) {
  if (w <= 1) return values;
  const span = Math.floor(w);
  const isSample = (v) => typeof v === 'number' && Number.isFinite(v);

  return values.map((current, i) => {
    if (!isSample(current)) return null;
    let sum = 0;
    let count = 0;
    for (let j = Math.max(0, i - span + 1); j <= i; j++) {
      if (isSample(values[j])) {
        sum += values[j];
        count += 1;
      }
    }
    return sum / count;
  });
}
| | |
/**
 * Load the optional `config.yaml` next to the page into the module-level
 * CONFIG. A non-OK response leaves CONFIG untouched; a fetch or parse error
 * is logged as a warning and otherwise ignored.
 */
async function loadConfig() {
  try {
    const resp = await fetch('config.yaml');
    if (!resp.ok) return;
    const parsed = jsyaml.load(await resp.text());
    CONFIG = parsed || {};
  } catch (e) {
    console.warn('Could not load config.yaml, using defaults:', e);
  }
}
| | |
/**
 * Fill the module-level ALL_MODELS and MODEL_COLORS.
 *
 * The query keeps every checkpoint row, plus non-checkpoint rows whose
 * `model` never appears as a checkpoint, ordered checkpoints first and then
 * by display name. Colours come from `CONFIG.model_colors` when present for
 * a display name; all other models take the next COLOR_PALETTE entry,
 * wrapping around.
 */
async function loadModels() {
  ALL_MODELS = await query(`
    WITH raw AS (
    SELECT DISTINCT model, model_display_name, is_checkpoint
    FROM scores
    ),
    ckpt_models AS (
    SELECT model FROM raw WHERE is_checkpoint = true
    )
    SELECT r.model, r.model_display_name, r.is_checkpoint
    FROM raw r
    WHERE r.is_checkpoint = true
    OR r.model NOT IN (SELECT model FROM ckpt_models)
    ORDER BY r.is_checkpoint DESC, r.model_display_name
  `);

  const overrides = CONFIG.model_colors || {};
  const assigned = {};
  let nextPalette = 0;
  for (const { model_display_name: name } of ALL_MODELS) {
    if (overrides[name]) {
      assigned[name] = overrides[name];
      continue;
    }
    // Only models without an override consume a palette slot.
    assigned[name] = COLOR_PALETTE[nextPalette % COLOR_PALETTE.length];
    nextPalette += 1;
  }
  MODEL_COLORS = assigned;
}
| | |
| | |
| | class Panel { |
| | constructor(id) { |
| | this.id = id; |
| | this.el = {}; |
| | this.collapsed = false; |
| | this.build(); |
| | } |
| | |
| | build() { |
| | const container = document.getElementById('panels-container'); |
| | const panel = document.createElement('div'); |
| | panel.className = 'panel'; |
| | panel.id = `panel-${this.id}`; |
| | |
| | panel.innerHTML = ` |
| | <div class="panel-toolbar"> |
| | <button class="btn btn-sm" id="ptoggle-${this.id}">Collapse</button> |
| | <button class="btn btn-sm" id="pexport-png-${this.id}">PNG</button> |
| | <button class="btn btn-sm" id="pexport-svg-${this.id}">SVG</button> |
| | <button class="btn btn-sm btn-danger" id="premove-${this.id}">Remove</button> |
| | </div> |
| | <div class="panel-controls" id="pcontrols-${this.id}"> |
| | <div class="controls-row"> |
| | <div class="control-group"> |
| | <label>Eval Suite</label> |
| | <select id="psuite-${this.id}"></select> |
| | </div> |
| | <div class="control-group"> |
| | <label>Group</label> |
| | <select id="pgroup-${this.id}"></select> |
| | </div> |
| | <div class="control-group"> |
| | <label>Task</label> |
| | <select id="ptask-${this.id}"></select> |
| | </div> |
| | <div class="control-group"> |
| | <label>Metric</label> |
| | <select id="pmetric-${this.id}"></select> |
| | </div> |
| | <div class="control-group"> |
| | <label>Smoothing</label> |
| | <select id="psmooth-${this.id}"> |
| | <option value="1" selected>None</option> |
| | <option value="2">2</option> |
| | <option value="3">3</option> |
| | <option value="4">4</option> |
| | <option value="5">5</option> |
| | </select> |
| | </div> |
| | <div class="control-group"> |
| | <label>Chart Type</label> |
| | <select id="pchart-type-${this.id}"> |
| | <option value="auto" selected>Auto</option> |
| | <option value="line">Line</option> |
| | <option value="bar">Bar</option> |
| | </select> |
| | </div> |
| | </div> |
| | <div class="models-section"> |
| | <div class="models-header"> |
| | <span>Models</span> |
| | <button class="btn btn-sm" id="pmodels-all-${this.id}">All</button> |
| | <button class="btn btn-sm" id="pmodels-none-${this.id}">None</button> |
| | <button class="btn btn-sm" id="pmodels-ckpt-${this.id}">Checkpoints</button> |
| | <button class="btn btn-sm" id="pmodels-base-${this.id}">Baselines</button> |
| | </div> |
| | <div class="checkbox-grid" id="pmodels-${this.id}"></div> |
| | </div> |
| | </div> |
| | <div class="panel-chart-wrapper"> |
| | <div class="title-hover-zone" id="ptitle-hover-${this.id}" style="display:none"></div> |
| | <div class="panel-chart" id="pchart-${this.id}"></div> |
| | </div> |
| | <div class="panel-resize-handle" id="presize-${this.id}"></div> |
| | `; |
| | |
| | container.appendChild(panel); |
| | |
| | |
| | this.el.panel = panel; |
| | this.el.controls = panel.querySelector(`#pcontrols-${this.id}`); |
| | this.el.suite = panel.querySelector(`#psuite-${this.id}`); |
| | this.el.group = panel.querySelector(`#pgroup-${this.id}`); |
| | this.el.task = panel.querySelector(`#ptask-${this.id}`); |
| | this.el.metric = panel.querySelector(`#pmetric-${this.id}`); |
| | this.el.smooth = panel.querySelector(`#psmooth-${this.id}`); |
| | this.el.chartType = panel.querySelector(`#pchart-type-${this.id}`); |
| | this.el.models = panel.querySelector(`#pmodels-${this.id}`); |
| | this.el.chart = panel.querySelector(`#pchart-${this.id}`); |
| | this.el.titleHover = panel.querySelector(`#ptitle-hover-${this.id}`); |
| | this.el.resize = panel.querySelector(`#presize-${this.id}`); |
| | this.chartHeight = null; |
| | |
| | |
| | panel.querySelector(`#ptoggle-${this.id}`).addEventListener('click', () => this.toggleControls()); |
| | panel.querySelector(`#premove-${this.id}`).addEventListener('click', () => this.remove()); |
| | panel.querySelector(`#pexport-png-${this.id}`).addEventListener('click', () => this.export('png')); |
| | panel.querySelector(`#pexport-svg-${this.id}`).addEventListener('click', () => this.export('svg')); |
| | |
| | this.el.suite.addEventListener('change', () => this.onSuiteChange()); |
| | this.el.group.addEventListener('change', () => this.onGroupChange()); |
| | this.el.task.addEventListener('change', () => this.onTaskChange()); |
| | this.el.metric.addEventListener('change', () => this.renderChart()); |
| | this.el.smooth.addEventListener('change', () => this.renderChart()); |
| | this.el.chartType.addEventListener('change', () => this.renderChart()); |
| | |
| | panel.querySelector(`#pmodels-all-${this.id}`).addEventListener('click', () => this.setModels(true)); |
| | panel.querySelector(`#pmodels-none-${this.id}`).addEventListener('click', () => this.setModels(false)); |
| | panel.querySelector(`#pmodels-ckpt-${this.id}`).addEventListener('click', () => this.setModelsByType(true)); |
| | panel.querySelector(`#pmodels-base-${this.id}`).addEventListener('click', () => this.setModelsByType(false)); |
| | |
| | |
| | this.el.resize.addEventListener('mousedown', (e) => this.startResize(e)); |
| | |
| | this.buildModelCheckboxes(); |
| | } |
| | |
| | toggleControls() { |
| | this.collapsed = !this.collapsed; |
| | this.el.controls.classList.toggle('collapsed', this.collapsed); |
| | this.el.panel.querySelector(`#ptoggle-${this.id}`).textContent = |
| | this.collapsed ? 'Expand' : 'Collapse'; |
| | } |
| | |
| | remove() { |
| | this.el.panel.remove(); |
| | panels.delete(this.id); |
| | } |
| | |
| | buildModelCheckboxes() { |
| | const container = this.el.models; |
| | container.innerHTML = ''; |
| | let lastCkpt = null; |
| | |
| | for (const m of ALL_MODELS) { |
| | if (lastCkpt !== null && lastCkpt !== m.is_checkpoint) { |
| | const sep = document.createElement('div'); |
| | sep.className = 'model-separator'; |
| | container.appendChild(sep); |
| | } |
| | lastCkpt = m.is_checkpoint; |
| | |
| | const lbl = document.createElement('label'); |
| | lbl.className = 'checkbox-item'; |
| | |
| | const cb = document.createElement('input'); |
| | cb.type = 'checkbox'; |
| | cb.value = m.model_display_name; |
| | cb.checked = true; |
| | cb.dataset.isCheckpoint = m.is_checkpoint; |
| | cb.addEventListener('change', () => this.renderChart()); |
| | |
| | const dot = document.createElement('span'); |
| | dot.style.cssText = `display:inline-block;width:9px;height:9px;border-radius:50%;background:${MODEL_COLORS[m.model_display_name]}`; |
| | |
| | const name = document.createElement('span'); |
| | name.className = 'model-name'; |
| | name.dataset.modelName = m.model_display_name; |
| | name.textContent = ' ' + m.model_display_name; |
| | if (!m.is_checkpoint) { |
| | name.style.fontStyle = 'italic'; |
| | } |
| | |
| | name.addEventListener('mouseenter', (e) => { |
| | const tip = name.dataset.missingTip; |
| | if (!tip) return; |
| | const tooltip = document.getElementById('custom-tooltip'); |
| | if (tooltip.classList.contains('scrollable')) return; |
| | tooltip.innerHTML = tip; |
| | tooltip.style.display = 'block'; |
| | tooltip._modelTip = true; |
| | const rect = name.getBoundingClientRect(); |
| | tooltip.style.left = (rect.left) + 'px'; |
| | tooltip.style.top = (rect.bottom + 4) + 'px'; |
| | }); |
| | name.addEventListener('mouseleave', () => { |
| | const tooltip = document.getElementById('custom-tooltip'); |
| | if (tooltip._modelTip) { |
| | tooltip.style.display = 'none'; |
| | tooltip._modelTip = false; |
| | } |
| | }); |
| | |
| | lbl.append(cb, dot, name); |
| | container.appendChild(lbl); |
| | } |
| | } |
| | |
| | setModels(checked) { |
| | this.el.models.querySelectorAll('input').forEach(cb => cb.checked = checked); |
| | this.renderChart(); |
| | } |
| | |
| | setModelsByType(isCheckpoint) { |
| | this.el.models.querySelectorAll('input').forEach(cb => { |
| | cb.checked = (cb.dataset.isCheckpoint === String(isCheckpoint)); |
| | }); |
| | this.renderChart(); |
| | } |
| | |
| | getSelectedModels() { |
| | return Array.from(this.el.models.querySelectorAll('input:checked')).map(cb => cb.value); |
| | } |
| | |
| | getSmoothing() { |
| | return parseInt(this.el.smooth.value, 10) || 1; |
| | } |
| | |
| | getChartType() { |
| | return this.el.chartType.value; |
| | } |
| | |
| | getSelectedTask() { |
| | const v = this.el.task.value; |
| | return v === '__group__' ? this.el.group.value : v; |
| | } |
| | |
| | |
| | async populateSuites(defaults) { |
| | const rows = await query(` |
| | SELECT DISTINCT task AS value, task_display_name AS label |
| | FROM scores |
| | WHERE task_type = 'eval_suite' AND task != 'test_fix' |
| | ORDER BY task |
| | `); |
| | populateSelect(this.el.suite, rows, defaults?.suite); |
| | await this.onSuiteChange(defaults); |
| | } |
| | |
| | async onSuiteChange(defaults) { |
| | const suite = this.el.suite.value; |
| | if (!suite) return; |
| | |
| | const rows = await query(` |
| | SELECT DISTINCT task AS value, task_display_name AS label |
| | FROM scores |
| | WHERE parent_task = '${esc(suite)}' |
| | AND task_type = 'task_group' |
| | ORDER BY task |
| | `); |
| | const options = [ |
| | { value: suite, label: `${suite} (aggregate)` }, |
| | ...rows, |
| | ]; |
| | populateSelect(this.el.group, options, defaults?.group); |
| | await this.onGroupChange(defaults); |
| | } |
| | |
| | async onGroupChange(defaults) { |
| | const group = this.el.group.value; |
| | if (!group) return; |
| | |
| | const rows = await query(` |
| | SELECT DISTINCT task AS value, task_display_name AS label |
| | FROM scores |
| | WHERE parent_task = '${esc(group)}' |
| | AND task_type = 'benchmark' |
| | ORDER BY task |
| | `); |
| | |
| | if (rows.length === 0) { |
| | populateSelect(this.el.task, [{ value: '__group__', label: '(aggregate)' }]); |
| | } else { |
| | populateSelect(this.el.task, [ |
| | { value: '__group__', label: `(aggregate: ${group})` }, |
| | ...rows, |
| | ]); |
| | } |
| | if (defaults?.task) this.el.task.value = defaults.task; |
| | await this.onTaskChange(defaults); |
| | } |
| | |
| | async onTaskChange(defaults) { |
| | const task = this.getSelectedTask(); |
| | if (!task) return; |
| | |
| | const rows = await query(` |
| | SELECT DISTINCT metric FROM scores WHERE task = '${esc(task)}' ORDER BY metric |
| | `); |
| | const prev = defaults?.metric || this.el.metric.value; |
| | populateSelect(this.el.metric, rows.map(r => r.metric), prev); |
| | if (defaults?.chartType) this.el.chartType.value = defaults.chartType; |
| | await this.renderChart(); |
| | } |
| | |
| | async updateMissingModels(task, metric) { |
| | const nameEls = this.el.models.querySelectorAll('.model-name'); |
| | if (!task || !metric) { |
| | nameEls.forEach(el => { |
| | el.classList.remove('missing'); |
| | delete el.dataset.missingTip; |
| | }); |
| | return; |
| | } |
| | const available = await query(` |
| | SELECT DISTINCT model_display_name FROM scores |
| | WHERE task = '${esc(task)}' AND metric = '${esc(metric)}' |
| | AND tokens_trained IS NOT NULL |
| | `); |
| | const availableSet = new Set(available.map(r => r.model_display_name)); |
| | nameEls.forEach(el => { |
| | const modelName = el.dataset.modelName; |
| | if (!availableSet.has(modelName)) { |
| | el.classList.add('missing'); |
| | el.dataset.missingTip = `No scores for "${modelName}" on this task / metric`; |
| | } else { |
| | el.classList.remove('missing'); |
| | delete el.dataset.missingTip; |
| | } |
| | }); |
| | } |
| | |
| | |
| | async renderChart() { |
| | const task = this.getSelectedTask(); |
| | const metric = this.el.metric.value; |
| | const models = this.getSelectedModels(); |
| | |
| | await this.updateMissingModels(task, metric); |
| | |
| | if (!task || !metric || models.length === 0) { |
| | this.el.chart.innerHTML = ''; |
| | return; |
| | } |
| | |
| | const rows = await query(` |
| | SELECT model, model_display_name, tokens_trained, score, score_stderr, |
| | is_checkpoint, higher_is_better, step |
| | FROM scores |
| | WHERE task = '${esc(task)}' |
| | AND metric = '${esc(metric)}' |
| | AND model_display_name IN (${sqlIn(models)}) |
| | AND tokens_trained IS NOT NULL |
| | ORDER BY model_display_name, tokens_trained |
| | `); |
| | |
| | if (rows.length === 0) { |
| | this.el.chart.innerHTML = '<div class="loading">No data for this selection</div>'; |
| | return; |
| | } |
| | |
| | |
| | const mergedRows = this.mergeFinalCheckpoints(rows); |
| | |
| | |
| | const chartType = this.resolveChartType(mergedRows); |
| | const higherIsBetter = mergedRows[0]?.higher_is_better; |
| | |
| | |
| | let subtaskTree = null; |
| | try { |
| | const stRows = await query(` |
| | SELECT subtask_tree FROM scores |
| | WHERE task = '${esc(task)}' AND metric = '${esc(metric)}' |
| | AND subtask_tree IS NOT NULL |
| | LIMIT 1 |
| | `); |
| | if (stRows.length > 0 && stRows[0].subtask_tree) { |
| | subtaskTree = JSON.parse(stRows[0].subtask_tree); |
| | } |
| | } catch (e) { |
| | |
| | } |
| | |
| | if (chartType === 'bar') { |
| | this.drawBarChart(mergedRows, task, metric, higherIsBetter, subtaskTree); |
| | } else { |
| | this.drawLineChart(mergedRows, task, metric, higherIsBetter, subtaskTree); |
| | } |
| | } |
| | |
| | mergeFinalCheckpoints(rows) { |
| | |
| | |
| | const regular = []; |
| | const finals = []; |
| | for (const r of rows) { |
| | if (r.step === null || r.step === undefined) { |
| | finals.push(r); |
| | } else { |
| | regular.push(r); |
| | } |
| | } |
| | if (finals.length === 0) return rows; |
| | |
| | |
| | const modelToSeries = {}; |
| | for (const r of regular) { |
| | if (r.is_checkpoint) { |
| | modelToSeries[r.model] = r.model_display_name; |
| | } |
| | } |
| | |
| | const result = [...regular]; |
| | for (const fc of finals) { |
| | const seriesName = modelToSeries[fc.model]; |
| | if (seriesName) { |
| | |
| | result.push({ ...fc, model_display_name: seriesName, is_checkpoint: true }); |
| | } else { |
| | |
| | result.push(fc); |
| | } |
| | } |
| | return result; |
| | } |
| | |
| | resolveChartType(rows) { |
| | const pref = this.getChartType(); |
| | if (pref !== 'auto') return pref; |
| | |
| | |
| | const byModel = {}; |
| | for (const r of rows) { |
| | if (!byModel[r.model_display_name]) byModel[r.model_display_name] = new Set(); |
| | byModel[r.model_display_name].add(Number(r.tokens_trained)); |
| | } |
| | const allSingle = Object.values(byModel).every(s => s.size <= 1); |
| | return allSingle ? 'bar' : 'line'; |
| | } |
| | |
| | formatChartTitle(task, metric, higherIsBetter) { |
| | const arrow = higherIsBetter === true ? ' \u2191' : higherIsBetter === false ? ' \u2193' : ''; |
| | return `${task} \u2014 ${metric}${arrow}`; |
| | } |
| | |
| | renderSubtaskTree(map, keys, depth = 0) { |
| | if (!keys || keys.length === 0) return ''; |
| | const indent = depth * 16; |
| | return keys.map(key => { |
| | const children = map[key]; |
| | let html = `<div style="padding-left:${indent}px">${key}</div>`; |
| | if (children) { |
| | html += this.renderSubtaskTree(map, children, depth + 1); |
| | } |
| | return html; |
| | }).join(''); |
| | } |
| | |
| | setupTitleTooltip(subtaskTree) { |
| | const hoverZone = this.el.titleHover; |
| | hoverZone.innerHTML = ''; |
| | if (!subtaskTree || typeof subtaskTree !== 'object' || Object.keys(subtaskTree).length === 0) { |
| | hoverZone.style.display = 'none'; |
| | return; |
| | } |
| | hoverZone.style.display = ''; |
| | |
| | |
| | const icon = document.createElement('span'); |
| | icon.className = 'title-info-icon'; |
| | icon.textContent = 'i'; |
| | hoverZone.appendChild(icon); |
| | const titleEl = this.el.chart.querySelector('.gtitle'); |
| | if (titleEl) { |
| | const wrapperRect = this.el.chart.closest('.panel-chart-wrapper').getBoundingClientRect(); |
| | const titleRect = titleEl.getBoundingClientRect(); |
| | icon.style.left = (titleRect.right - wrapperRect.left - 50 + 6) + 'px'; |
| | } else { |
| | icon.style.right = '0px'; |
| | } |
| | const tooltip = document.getElementById('custom-tooltip'); |
| | |
| | const allChildren = new Set(Object.values(subtaskTree).flat()); |
| | const rootKeys = Object.keys(subtaskTree).filter(k => !allChildren.has(k)); |
| | const html = this.renderSubtaskTree(subtaskTree, rootKeys); |
| | |
| | const positionTooltip = () => { |
| | const titleEl = this.el.chart.querySelector('.gtitle'); |
| | const chartRect = this.el.chart.getBoundingClientRect(); |
| | const tw = tooltip.offsetWidth; |
| | let tipTop; |
| | if (titleEl) { |
| | const titleRect = titleEl.getBoundingClientRect(); |
| | const titleCenter = (titleRect.left + titleRect.right) / 2; |
| | tooltip.style.left = (titleCenter - tw / 2) + 'px'; |
| | tipTop = titleRect.bottom + 4; |
| | } else { |
| | tooltip.style.left = (chartRect.left + chartRect.width / 2 - tw / 2) + 'px'; |
| | tipTop = chartRect.top + 40; |
| | } |
| | tooltip.style.top = tipTop + 'px'; |
| | tooltip.style.maxHeight = Math.max(0, chartRect.bottom - tipTop) + 'px'; |
| | }; |
| | |
| | this._titleClick = (e) => { |
| | |
| | if (tooltip.style.display === 'block' && tooltip._panelId === this.id) { |
| | tooltip.style.display = 'none'; |
| | tooltip.classList.remove('scrollable'); |
| | tooltip._panelId = null; |
| | window.removeEventListener('scroll', this._titleScroll, true); |
| | return; |
| | } |
| | tooltip.innerHTML = html; |
| | tooltip.classList.add('scrollable'); |
| | tooltip.style.display = 'block'; |
| | tooltip._panelId = this.id; |
| | positionTooltip(); |
| | window.addEventListener('scroll', this._titleScroll, true); |
| | }; |
| | |
| | this._titleScroll = () => { |
| | if (tooltip.style.display === 'block' && tooltip._panelId === this.id) { |
| | positionTooltip(); |
| | } |
| | }; |
| | |
| | this._titleOutsideClick = (e) => { |
| | if (tooltip._panelId !== this.id) return; |
| | if (tooltip.contains(e.target) || hoverZone.contains(e.target)) return; |
| | tooltip.style.display = 'none'; |
| | tooltip.classList.remove('scrollable'); |
| | tooltip._panelId = null; |
| | window.removeEventListener('scroll', this._titleScroll, true); |
| | }; |
| | |
| | hoverZone.addEventListener('click', this._titleClick); |
| | document.addEventListener('mousedown', this._titleOutsideClick); |
| | } |
| | |
| | startResize(e) { |
| | e.preventDefault(); |
| | const startY = e.clientY; |
| | const startH = this.el.chart.offsetHeight; |
| | this.el.resize.classList.add('active'); |
| | |
| | const onMove = (ev) => { |
| | const delta = ev.clientY - startY; |
| | const newH = Math.max(200, startH + delta); |
| | this.chartHeight = newH; |
| | Plotly.relayout(this.el.chart, { height: newH }); |
| | }; |
| | |
| | const onUp = () => { |
| | this.el.resize.classList.remove('active'); |
| | document.removeEventListener('mousemove', onMove); |
| | document.removeEventListener('mouseup', onUp); |
| | }; |
| | |
| | document.addEventListener('mousemove', onMove); |
| | document.addEventListener('mouseup', onUp); |
| | } |
| | |
| | getChartHeight(fallback) { |
| | return this.chartHeight || fallback; |
| | } |
| | |
| | cleanupTooltip() { |
| | const tooltip = document.getElementById('custom-tooltip'); |
| | tooltip.style.display = 'none'; |
| | const chart = this.el.chart; |
| | chart.removeAllListeners?.('plotly_hover'); |
| | chart.removeAllListeners?.('plotly_unhover'); |
| | if (this._tooltipMouseMove) { |
| | chart.removeEventListener('mousemove', this._tooltipMouseMove); |
| | this._tooltipMouseMove = null; |
| | } |
| | if (this._tooltipMouseLeave) { |
| | chart.removeEventListener('mouseleave', this._tooltipMouseLeave); |
| | this._tooltipMouseLeave = null; |
| | } |
| | |
| | if (this._titleClick) { |
| | const hz = this.el.titleHover; |
| | hz.removeEventListener('click', this._titleClick); |
| | hz.style.display = 'none'; |
| | this._titleClick = null; |
| | } |
| | if (this._titleScroll) { |
| | window.removeEventListener('scroll', this._titleScroll, true); |
| | this._titleScroll = null; |
| | } |
| | if (this._titleOutsideClick) { |
| | document.removeEventListener('mousedown', this._titleOutsideClick); |
| | this._titleOutsideClick = null; |
| | } |
| | if (tooltip._panelId === this.id) { |
| | tooltip.classList.remove('scrollable'); |
| | tooltip._panelId = null; |
| | } |
| | } |
| | |
| | drawLineChart(rows, task, metric, higherIsBetter, subtasks) { |
| | this.cleanupTooltip(); |
| | const w = this.getSmoothing(); |
| | |
| | |
| | const byModel = {}; |
| | for (const r of rows) { |
| | const name = r.model_display_name; |
| | if (!byModel[name]) byModel[name] = { points: [], isCheckpoint: r.is_checkpoint }; |
| | byModel[name].points.push({ x: Number(r.tokens_trained), y: r.score }); |
| | } |
| | for (const d of Object.values(byModel)) d.points.sort((a, b) => a.x - b.x); |
| | |
| | |
| | let xMin = Infinity, xMax = -Infinity; |
| | for (const d of Object.values(byModel)) { |
| | if (d.isCheckpoint) { |
| | for (const p of d.points) { |
| | xMin = Math.min(xMin, p.x); |
| | xMax = Math.max(xMax, p.x); |
| | } |
| | } |
| | } |
| | if (!isFinite(xMin)) { xMin = 0; xMax = 1; } |
| | |
| | const traces = []; |
| | for (const [name, d] of Object.entries(byModel)) { |
| | const color = MODEL_COLORS[name] || '#999'; |
| | if (d.isCheckpoint && d.points.length > 1) { |
| | traces.push({ |
| | x: d.points.map(p => p.x), |
| | y: movingAverage(d.points.map(p => p.y), w), |
| | name, mode: 'lines+markers', |
| | line: { color, width: 2 }, marker: { size: 5 }, |
| | }); |
| | } else { |
| | const score = d.points[0]?.y; |
| | if (score != null) { |
| | traces.push({ |
| | x: [xMin, xMax], y: [score, score], |
| | name, mode: 'lines', |
| | line: { color, width: 2, dash: 'dash' }, |
| | }); |
| | } |
| | } |
| | } |
| | |
| | |
| | const tickVals = niceTicks(xMin, xMax); |
| | |
| | Plotly.react(this.el.chart, traces, { |
| | title: { text: this.formatChartTitle(task, metric, higherIsBetter), font: { size: 14, color: '#1a1a2e' } }, |
| | hoverlabel: { namelength: -1 }, |
| | xaxis: { |
| | title: { text: 'Tokens Trained', font: { size: 12 } }, |
| | tickfont: { size: 10 }, tickvals: tickVals, ticktext: tickVals.map(formatTokens), |
| | gridcolor: '#e9ecef', zeroline: false, |
| | }, |
| | yaxis: { |
| | title: { text: 'Score', font: { size: 12 } }, |
| | tickfont: { size: 10 }, gridcolor: '#e9ecef', zeroline: false, autorange: true, |
| | }, |
| | legend: { orientation: 'h', yanchor: 'bottom', y: 1.05, x: 0, font: { size: 11 } }, |
| | margin: { t: 80, r: 20, b: 70, l: 50 }, |
| | plot_bgcolor: '#fff', paper_bgcolor: '#fff', |
| | font: { family: '-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif' }, |
| | height: this.getChartHeight(600), |
| | }, { responsive: true }); |
| | |
| | this.setupTitleTooltip(subtasks); |
| | } |
| | |
| | drawBarChart(rows, task, metric, higherIsBetter, subtasks) { |
| | this.cleanupTooltip(); |
| | |
| | const byModel = {}; |
| | for (const r of rows) { |
| | const name = r.model_display_name; |
| | const tokens = Number(r.tokens_trained); |
| | if (!byModel[name] || tokens > byModel[name].tokens) { |
| | byModel[name] = { score: r.score, tokens, isCheckpoint: r.is_checkpoint }; |
| | } |
| | } |
| | |
| | |
| | const sorted = Object.entries(byModel) |
| | .sort((a, b) => higherIsBetter !== false ? b[1].score - a[1].score : a[1].score - b[1].score); |
| | |
| | const names = sorted.map(([n]) => n); |
| | const scores = sorted.map(([, d]) => d.score); |
| | const colors = sorted.map(([n]) => MODEL_COLORS[n] || '#999'); |
| | const tokens = sorted.map(([, d]) => formatTokens(d.tokens)); |
| | const hovertext = sorted.map(([n, d]) => |
| | `${n}<br>Score: ${d.score.toFixed(4)}<br>Tokens: ${formatTokens(d.tokens)}` |
| | ); |
| | |
| | |
| | const annotations = names.map((name, i) => ({ |
| | x: 0, |
| | y: name, |
| | text: tokens[i], |
| | hovertext: 'Tokens Trained', |
| | xanchor: 'left', |
| | yanchor: 'middle', |
| | showarrow: false, |
| | font: { size: 10, color: '#000' }, |
| | xshift: 4, |
| | })); |
| | |
| | Plotly.react(this.el.chart, [{ |
| | type: 'bar', |
| | orientation: 'h', |
| | y: names, |
| | x: scores, |
| | marker: { color: colors }, |
| | text: scores.map(s => s.toFixed(4)), |
| | textposition: 'outside', |
| | textfont: { size: 11 }, |
| | hoverinfo: 'none', |
| | customdata: hovertext, |
| | }], { |
| | title: { text: this.formatChartTitle(task, metric, higherIsBetter), font: { size: 14, color: '#1a1a2e' } }, |
| | hovermode: 'closest', |
| | annotations, |
| | xaxis: { |
| | title: { text: 'Score', font: { size: 12 } }, |
| | tickfont: { size: 10 }, gridcolor: '#e9ecef', zeroline: false, |
| | }, |
| | yaxis: { |
| | tickfont: { size: 11 }, automargin: true, |
| | categoryorder: 'array', categoryarray: names.slice().reverse(), |
| | }, |
| | margin: { t: 60, r: 80, b: 60, l: 10 }, |
| | plot_bgcolor: '#fff', paper_bgcolor: '#fff', |
| | font: { family: '-apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif' }, |
| | height: this.getChartHeight(Math.max(400, names.length * 40 + 100)), |
| | showlegend: false, |
| | }, { responsive: true }); |
| | |
| | |
| | const tooltip = document.getElementById('custom-tooltip'); |
| | const chart = this.el.chart; |
| | chart.on('plotly_hover', (data) => { |
| | if (tooltip.classList.contains('scrollable')) return; |
| | const pt = data.points[0]; |
| | tooltip.innerHTML = pt.customdata; |
| | tooltip.style.display = 'block'; |
| | }); |
| | chart.on('plotly_unhover', () => { |
| | if (tooltip.classList.contains('scrollable')) return; |
| | tooltip.style.display = 'none'; |
| | }); |
| | this._tooltipMouseMove = (e) => { |
| | if (tooltip.classList.contains('scrollable')) return; |
| | if (tooltip.style.display === 'block') { |
| | tooltip.style.left = (e.clientX + 12) + 'px'; |
| | tooltip.style.top = (e.clientY - 10) + 'px'; |
| | } |
| | }; |
| | this._tooltipMouseLeave = () => { |
| | if (tooltip.classList.contains('scrollable')) return; |
| | tooltip.style.display = 'none'; |
| | }; |
| | chart.addEventListener('mousemove', this._tooltipMouseMove); |
| | chart.addEventListener('mouseleave', this._tooltipMouseLeave); |
| | |
| | this.setupTitleTooltip(subtasks); |
| | } |
| | |
| | export(format) { |
| | const task = this.getSelectedTask(); |
| | const metric = this.el.metric.value; |
| | let filename = `${task}_${metric}`.replace(/[^a-zA-Z0-9_-]/g, '_'); |
| | Plotly.downloadImage(this.el.chart, { format, scale: 3, filename }); |
| | } |
| | } |
| | |
| | |
/**
 * Create a new Panel, register it in `panels` under a fresh id, and wait for
 * its suites to be populated.
 *
 * @param {Object} [defaults] - Passed through to `panel.populateSuites`.
 * @returns {Promise<Panel>} The newly created panel.
 */
async function addPanel(defaults) {
  // Claim the next id, then advance the shared counter.
  const panelId = panelCounter;
  panelCounter += 1;

  const created = new Panel(panelId);
  panels.set(panelId, created);

  await created.populateSuites(defaults);
  return created;
}
| | |
| | |
// Page-level elements used during startup: the loading/status message and
// the row that holds the "add panel" control.
const [elInitLoading, elAddPanelRow] = ['init-loading', 'add-panel-row']
  .map((elementId) => document.getElementById(elementId));
| | |
/**
 * Application startup: load config, initialise DuckDB, load the parquet data
 * and the models (updating the loading message before each step), then swap
 * the loading message for the add-panel row and create the default panels.
 *
 * Any failure is logged to the console and shown in the loading element. The
 * element is made visible again first, because it is hidden before the
 * default panels are created, and the message is inserted as text so markup
 * characters in an error message cannot be interpreted as HTML.
 *
 * @returns {Promise<void>} Resolves once startup finished or the error was
 *   reported; never rejects for errors raised inside the startup steps.
 */
async function init() {
  // Remember how the loading element is displayed so it can be restored if
  // an error occurs after it has been hidden.
  const loadingDisplay = elInitLoading.style.display;

  try {
    elInitLoading.textContent = 'Loading config...';
    await loadConfig();

    elInitLoading.textContent = 'Initializing DuckDB...';
    await initDuckDB();

    elInitLoading.textContent = 'Loading data from HuggingFace...';
    await loadParquet();

    elInitLoading.textContent = 'Loading models...';
    await loadModels();

    elInitLoading.style.display = 'none';
    elAddPanelRow.style.display = '';

    // Default dashboard: four bar-chart panels created in parallel.
    await Promise.all([
      addPanel({ suite: 'eng_base_easy', group: 'eng_base_easy_bpb', metric: 'bits_per_byte', chartType: 'bar' }),
      addPanel({ suite: 'deu_base_easy', group: 'deu_base_easy_bpb', metric: 'bits_per_byte', chartType: 'bar' }),
      addPanel({ suite: 'eng_base_easy', group: 'eng_base_easy_rc', metric: 'acc_norm', chartType: 'bar' }),
      addPanel({ suite: 'deu_base_easy', group: 'deu_base_easy_rc', metric: 'acc_norm', chartType: 'bar' }),
    ]);
  } catch (err) {
    console.error('Init failed:', err);

    const message = err?.message ?? String(err);

    // Build the error view from DOM nodes rather than an HTML string so the
    // message is always treated as plain text.
    const wrapper = document.createElement('span');
    wrapper.style.color = '#e63946';
    wrapper.appendChild(document.createTextNode(`Error: ${message}`));
    wrapper.appendChild(document.createElement('br'));
    const hint = document.createElement('small');
    hint.textContent = 'Check browser console for details.';
    wrapper.appendChild(hint);

    elInitLoading.textContent = '';
    elInitLoading.appendChild(wrapper);
    // The element may already have been hidden above; show it again so the
    // error is actually visible.
    elInitLoading.style.display = loadingDisplay;
  }
}
| | |
// Add a panel with no preset defaults when the button is clicked. addPanel()
// is async, so log a failure instead of leaving the rejection unhandled.
document.getElementById('btn-add-panel').addEventListener('click', () => {
  addPanel().catch((err) => console.error('Failed to add panel:', err));
});

// Start the application. init() catches and reports its own errors, so the
// returned promise does not need to be awaited here.
init();
| | </script> |
| | </body> |
| | </html> |
| |
|