Spaces:
Running
Running
/**
 * Browser-side controller for the LLM writing-benchmark leaderboard.
 *
 * Loads one score file per (judge, dataset type) pair, renders the "general"
 * and "complicated" score tables, and wires up the search box, judge toggle,
 * tab buttons and sortable column headers.
 *
 * Each score file is read as an object keyed by model name whose values are
 * objects keyed by metric name (that is how renderTable consumes it).
 */
class LLMBenchmarkDashboard {
  /**
   * Sets up default state (selected judge, empty search, sort order, metric
   * labels, data paths, model links) and kicks off asynchronous loading.
   */
  constructor() {
    this.currentJudge = 'gpt5.4';
    this.searchQuery = '';
    this.judgeData = {
      o3: { general: null, complicated: null },
      'gpt5.4': { general: null, complicated: null }
    };
    this.generalSort = { column: 'overall_score', direction: 'desc' };
    this.complicatedSort = { column: 'overall_score', direction: 'desc' };
    this.metricDisplayNames = {
      comprehension_score: 'Comprehension',
      structure_score: 'Coherence',
      prose_style_score: 'Style',
      creativity_score: 'Creativity',
      depth_score: 'Depth',
      helpfulness_score: 'Helpfulness',
      overall_score: 'Overall'
    };
    this.metricDescriptions = {
      comprehension_score: 'How well the response understands the prompt intent and stays on topic.',
      structure_score: 'How clear, logical, and well-organized the writing is.',
      prose_style_score: 'The quality of language, grammar, and adherence to the requested tone.',
      creativity_score: 'The novelty of ideas and uniqueness of perspective.',
      depth_score: 'The level of detail, analysis, and substance provided.',
      helpfulness_score: 'How effectively the response fulfills the user\'s overall goal.',
      overall_score: 'Average score across all six criteria.'
    };
    this.judgePaths = {
      o3: {
        general: 'data/all-scores.json',
        complicated: 'data/complicated-writing-scores.json'
      },
      'gpt5.4': {
        general: 'data/gpt5.4-judge-all-scores.json',
        complicated: 'data/gpt5.4-judge-complicated-writing-scores.json'
      }
    };
    this.modelLinks = {
      'Monomer-24B-Writer': 'https://huggingface.co/zake7749/Monomer-24B-Writer-Preview',
      'Monomer-8B-Writer': 'https://huggingface.co/zake7749/Monomer-8B-Writer-Preview'
    };
    // init() handles its own failures, so this promise never rejects.
    this.init();
  }

  /**
   * Fetches every configured score file in parallel, renders both tables and
   * attaches the UI handlers. If nothing could be loaded, or an unexpected
   * error occurs, an error message replaces the loading indicator instead of
   * leaving it spinning.
   *
   * @returns {Promise<void>}
   */
  async init() {
    const failureMessage = 'Failed to load benchmark data. Please try again later.';
    this.showLoading(true);
    try {
      const loads = [];
      for (const [judge, paths] of Object.entries(this.judgePaths)) {
        for (const [type, path] of Object.entries(paths)) {
          loads.push(this.loadData(path, judge, type));
        }
      }
      const results = await Promise.all(loads);

      this.renderTable('general');
      this.renderTable('complicated');
      this.setupSearch();
      this.setupJudgeToggle();
      this.setupTabs();

      if (!results.some(Boolean)) {
        // Keep the loading element visible: showError renders into it.
        this.showError(failureMessage);
        return;
      }
      this.showLoading(false);
    } catch (error) {
      console.error('Error initialising dashboard:', error);
      this.showError(failureMessage);
    }
  }

  /**
   * Fetches one score file and stores the parsed JSON under
   * judgeData[judge][type]. Failures are logged, not thrown, so one missing
   * file does not prevent the others from loading.
   *
   * @param {string} path - URL of the JSON file.
   * @param {string} judge - Key into judgeData.
   * @param {string} type - 'general' or 'complicated'.
   * @returns {Promise<boolean>} true when the file was loaded and stored.
   */
  async loadData(path, judge, type) {
    try {
      const response = await fetch(path);
      if (!response.ok) {
        throw new Error(`HTTP error! status: ${response.status}`);
      }
      this.judgeData[judge][type] = await response.json();
      return true;
    } catch (error) {
      console.error(`Error loading ${judge}/${type} data:`, error);
      return false;
    }
  }

  /** General-dataset scores for the currently selected judge (null if not loaded). */
  get generalData() {
    return this.judgeData[this.currentJudge].general;
  }

  /** Complicated-dataset scores for the currently selected judge (null if not loaded). */
  get complicatedData() {
    return this.judgeData[this.currentJudge].complicated;
  }

  /**
   * Makes each `.judge-btn` switch the active judge (read from its
   * `data-judge` attribute) and re-render both tables.
   */
  setupJudgeToggle() {
    const buttons = document.querySelectorAll('.judge-btn');
    buttons.forEach(btn => {
      btn.addEventListener('click', () => {
        const judge = btn.dataset.judge;
        if (judge === this.currentJudge) return;
        // Ignore buttons naming a judge we hold no data slot for; the data
        // getters would otherwise throw on the next render.
        if (!Object.prototype.hasOwnProperty.call(this.judgeData, judge)) return;
        this.currentJudge = judge;
        buttons.forEach(b => b.classList.remove('active'));
        btn.classList.add('active');
        this.renderTable('general');
        this.renderTable('complicated');
      });
    });
  }

  /**
   * Re-renders both tables whenever the `#globalSearch` input changes,
   * filtering by the lower-cased, trimmed query.
   */
  setupSearch() {
    const searchInput = document.getElementById('globalSearch');
    if (!searchInput) return;
    searchInput.addEventListener('input', (e) => {
      this.searchQuery = e.target.value.toLowerCase().trim();
      this.renderTable('general');
      this.renderTable('complicated');
    });
  }

  /**
   * Makes each `.tab-btn` activate the `.tab-content` element whose id
   * matches the button's `data-target` attribute.
   */
  setupTabs() {
    const tabBtns = document.querySelectorAll('.tab-btn');
    const tabContents = document.querySelectorAll('.tab-content');
    tabBtns.forEach(btn => {
      btn.addEventListener('click', () => {
        const targetId = btn.dataset.target;
        tabBtns.forEach(b => b.classList.remove('active'));
        btn.classList.add('active');
        tabContents.forEach(content => {
          content.classList.toggle('active', content.id === targetId);
        });
      });
    });
  }

  /**
   * Escapes the five HTML-significant characters so that text taken from the
   * fetched JSON cannot inject markup when written through innerHTML.
   *
   * @param {*} value - Any value; it is stringified first.
   * @returns {string} HTML-safe text.
   */
  escapeHtml(value) {
    const replacements = {
      '&': '&amp;',
      '<': '&lt;',
      '>': '&gt;',
      '"': '&quot;',
      '\'': '&#39;'
    };
    return String(value).replace(/[&<>"']/g, ch => replacements[ch]);
  }

  /**
   * Renders the score table for one dataset into its container
   * (`#generalTable` or `#complicatedTable`) and re-attaches header sorting.
   * Does nothing when the container is absent; shows a placeholder when the
   * selected judge has no data for this dataset.
   *
   * @param {string} type - 'general' selects the general dataset; any other
   *   value selects the complicated one.
   */
  renderTable(type) {
    const isGeneral = type === 'general';
    const data = isGeneral ? this.generalData : this.complicatedData;
    const sortState = isGeneral ? this.generalSort : this.complicatedSort;
    const tableContainer = document.getElementById(isGeneral ? 'generalTable' : 'complicatedTable');
    if (!tableContainer) return;
    if (!data) {
      // Replace any table left over from a previously selected judge.
      tableContainer.innerHTML = '<div class="no-data"><p>No data available for this judge.</p></div>';
      return;
    }

    const models = Object.keys(data);
    // Column set comes from the first model; an empty dataset has no metric columns.
    const metrics = models.length > 0 ? Object.keys(data[models[0]] || {}) : [];
    const rows = this.getSortedTableData(data, sortState, metrics);
    const safeType = this.escapeHtml(type);
    const sortClass = column => (sortState.column === column ? ` sort-${sortState.direction}` : '');

    const metricHeaders = metrics.map(metric => {
      const label = this.escapeHtml(this.metricDisplayNames[metric] || metric);
      return `<th class="sortable${sortClass(metric)}" data-type="${safeType}" data-column="${this.escapeHtml(metric)}">${label}</th>`;
    }).join('');
    const bodyRows = rows
      .map((row, index) => this.renderRow(row, index, rows.length, metrics, sortState, type))
      .join('');

    tableContainer.innerHTML = `
      <table>
        <thead>
          <tr>
            <th class="sortable${sortClass('model')}" data-type="${safeType}" data-column="model">Model</th>
            ${metricHeaders}
          </tr>
        </thead>
        <tbody>
          ${bodyRows}
        </tbody>
      </table>
    `;
    this.setupTableSorting(type);
  }

  /**
   * Builds the `<tr>` markup for one model.
   *
   * @param {Object} row - Row from getSortedTableData (`model` plus metric values).
   * @param {number} index - Position of the row among the displayed rows.
   * @param {number} rowCount - Number of displayed rows (after search filtering).
   * @param {string[]} metrics - Metric keys, in column order.
   * @param {{column: string, direction: string}} sortState - Active sort.
   * @param {string} type - Dataset type, forwarded to getHeatmapColor.
   * @returns {string} HTML for the row.
   */
  renderRow(row, index, rowCount, metrics, sortState, type) {
    // Own-property check so a model named e.g. "constructor" is not treated as linked.
    const hasLink = Object.prototype.hasOwnProperty.call(this.modelLinks, row.model);
    const isBaseline = row.model === 'gpt-4.1-2025-04-14';
    let rowClass = '';
    if (hasLink) rowClass = 'highlight-row';
    if (isBaseline) rowClass = 'baseline-row';

    // Medals mark the top three displayed rows for the sorted metric. The
    // rank is computed against the displayed row count so that ascending
    // order stays consistent with descending order while a search is active.
    let medalHtml = '';
    if (sortState.column !== 'model' && ['desc', 'asc'].includes(sortState.direction)) {
      const rank = sortState.direction === 'desc' ? index + 1 : rowCount - index;
      const medals = { 1: '🥇', 2: '🥈', 3: '🥉' };
      if (medals[rank]) medalHtml = `<span class="rank-medal">${medals[rank]}</span>`;
    }

    const safeName = this.escapeHtml(row.model);
    const nameHtml = hasLink
      ? `<a href="${this.escapeHtml(this.modelLinks[row.model])}" target="_blank" rel="noopener" class="model-link">${safeName}</a>`
      : safeName;
    const scoreCells = metrics.map(metric => {
      const color = this.getHeatmapColor(row[metric], type, metric);
      const text = this.escapeHtml(this.formatScore(row[metric]));
      return `<td class="score-cell" style="background-color: ${color}">${text}</td>`;
    }).join('');

    return `
      <tr class="${rowClass}">
        <td class="model-cell">
          ${medalHtml}
          ${nameHtml}
          ${isBaseline ? ' (Baseline)' : ''}
        </td>
        ${scoreCells}
      </tr>
    `;
  }

  /**
   * Converts the score map into an array of rows, filtered by the current
   * search query (substring match on the lower-cased model name) and sorted
   * by the active column.
   *
   * @param {Object} data - Scores keyed by model name.
   * @param {{column: string, direction: string}} sortState - Active sort;
   *   any direction other than 'asc' sorts descending.
   * @param {string[]} metrics - Metric keys (unused here; kept for callers).
   * @returns {Object[]} Rows of the form `{ model, ...metricValues }`.
   */
  getSortedTableData(data, sortState, metrics) {
    const tableData = Object.keys(data)
      .filter(model => !this.searchQuery || model.toLowerCase().includes(this.searchQuery))
      .map(model => ({
        model,
        ...data[model]
      }));

    const { column, direction } = sortState;
    if (column) {
      const sign = direction === 'asc' ? 1 : -1;
      tableData.sort((a, b) => {
        let aVal = a[column];
        let bVal = b[column];
        // Rows with no value for the sort column go last in either direction;
        // comparing undefined with < / > would make the ordering inconsistent.
        const aMissing = aVal === null || aVal === undefined;
        const bMissing = bVal === null || bVal === undefined;
        if (aMissing || bMissing) {
          if (aMissing && bMissing) return 0;
          return aMissing ? 1 : -1;
        }
        if (column === 'model') {
          aVal = aVal.toLowerCase();
          bVal = bVal.toLowerCase();
        }
        if (aVal < bVal) return -sign;
        if (aVal > bVal) return sign;
        return 0;
      });
    }
    return tableData;
  }

  /**
   * Formats a score for display.
   *
   * @param {*} value - Score value.
   * @returns {*} Numbers fixed to two decimals; anything else unchanged.
   */
  formatScore(value) {
    if (typeof value === 'number') {
      return value.toFixed(2);
    }
    return value;
  }

  /**
   * Maps a score to a faint background colour: red below 5.0, green above,
   * transparent at exactly 5.0 or when the value is missing.
   *
   * @param {number|null|undefined} val - Score value.
   * @param {string} type - Dataset type (currently unused).
   * @param {string} metric - Metric key (currently unused).
   * @returns {string} CSS colour value.
   */
  getHeatmapColor(val, type, metric) {
    if (val === null || val === undefined) return 'transparent';
    // 5.0 is the neutral midpoint; intensity saturates at 3.5 and 8.5.
    const minVal = 3.5;
    const maxVal = 8.5;
    const baseline = 5.0;
    // Alpha is capped at 0.2 so cell text stays readable.
    const maxAlpha = 0.2;
    if (val < baseline) {
      const intensity = Math.min(1, (baseline - val) / (baseline - minVal));
      return `rgba(239, 68, 68, ${intensity * maxAlpha})`;
    }
    if (val > baseline) {
      const intensity = Math.min(1, (val - baseline) / (maxVal - baseline));
      return `rgba(16, 185, 129, ${intensity * maxAlpha})`;
    }
    return 'transparent';
  }

  /**
   * Attaches click-to-sort handlers to the sortable headers of one table.
   * Called after every render because rendering replaces the header elements.
   *
   * @param {string} type - 'general' or 'complicated'.
   */
  setupTableSorting(type) {
    const tableContainer = document.getElementById(type === 'general' ? 'generalTable' : 'complicatedTable');
    if (!tableContainer) return;
    tableContainer.querySelectorAll('th.sortable').forEach(header => {
      header.addEventListener('click', () => {
        this.handleSort(type, header.dataset.column, header);
      });
    });
  }

  /**
   * Updates the sort state for a table and re-renders it. Clicking the
   * already-sorted column flips the direction; a new column starts ascending.
   *
   * @param {string} type - 'general' or 'complicated'.
   * @param {string} column - 'model' or a metric key.
   * @param {Element} [headerElement] - Clicked header. Unused: renderTable
   *   rebuilds every header with the right sort class. Kept so existing
   *   callers keep working.
   */
  handleSort(type, column, headerElement) {
    const sortState = type === 'general' ? this.generalSort : this.complicatedSort;
    if (sortState.column === column) {
      sortState.direction = sortState.direction === 'asc' ? 'desc' : 'asc';
    } else {
      sortState.column = column;
      sortState.direction = 'asc';
    }
    this.renderTable(type);
  }

  /**
   * Shows or hides the `#loading` element by toggling its `hidden` class.
   *
   * @param {boolean} show - true to show, false to hide.
   */
  showLoading(show) {
    const loading = document.getElementById('loading');
    if (!loading) return;
    loading.classList.toggle('hidden', !show);
  }

  /**
   * Replaces the contents of the `#loading` element with an error message
   * and makes sure the element is visible.
   *
   * @param {string} message - Text to display; it is HTML-escaped.
   */
  showError(message) {
    const loading = document.getElementById('loading');
    if (!loading) return;
    loading.classList.remove('hidden');
    loading.innerHTML = `
      <div class="no-data">
        <i class="fas fa-exclamation-triangle"></i>
        <p>${this.escapeHtml(message)}</p>
      </div>
    `;
  }
}
// Create the dashboard once the document has been parsed, so the elements
// it looks up by id and class are present.
document.addEventListener('DOMContentLoaded', function onDomReady() {
  new LLMBenchmarkDashboard();
});