File size: 11,868 Bytes
8eecd10
 
384e89d
 
8eecd10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384e89d
 
 
 
 
 
 
 
 
8eecd10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384e89d
8eecd10
384e89d
8eecd10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384e89d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8eecd10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384e89d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8eecd10
384e89d
8eecd10
 
 
384e89d
8eecd10
 
 
 
 
 
 
 
 
 
384e89d
 
 
 
 
 
 
 
 
8eecd10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384e89d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8eecd10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
/**
 * Leaderboard dashboard for LLM writing benchmarks.
 *
 * Loads one score file per (judge, benchmark type) pair, renders the
 * "general" and "complicated" tables for the currently selected judge, and
 * wires up search, judge toggle, tab switching and column sorting.
 *
 * Each score file is expected to be an object keyed by model name whose values
 * are objects keyed by metric name (e.g. `overall_score`) holding numbers.
 */
class LLMBenchmarkDashboard {
    /**
     * Set up the static configuration and kick off asynchronous loading.
     */
    constructor() {
        this.currentJudge = 'gpt5.4';
        this.searchQuery = '';
        this.judgeData = {
            o3: { general: null, complicated: null },
            'gpt5.4': { general: null, complicated: null }
        };
        this.generalSort = { column: 'overall_score', direction: 'desc' };
        this.complicatedSort = { column: 'overall_score', direction: 'desc' };
        this.metricDisplayNames = {
            comprehension_score: 'Comprehension',
            structure_score: 'Coherence',
            prose_style_score: 'Style',
            creativity_score: 'Creativity',
            depth_score: 'Depth',
            helpfulness_score: 'Helpfulness',
            overall_score: 'Overall'
        };
        this.metricDescriptions = {
            comprehension_score: 'How well the response understands the prompt intent and stays on topic.',
            structure_score: 'How clear, logical, and well-organized the writing is.',
            prose_style_score: 'The quality of language, grammar, and adherence to the requested tone.',
            creativity_score: 'The novelty of ideas and uniqueness of perspective.',
            depth_score: 'The level of detail, analysis, and substance provided.',
            helpfulness_score: 'How effectively the response fulfills the user\'s overall goal.',
            overall_score: 'Average score across all six criteria.'
        };
        this.judgePaths = {
            o3: {
                general: 'data/all-scores.json',
                complicated: 'data/complicated-writing-scores.json'
            },
            'gpt5.4': {
                general: 'data/gpt5.4-judge-all-scores.json',
                complicated: 'data/gpt5.4-judge-complicated-writing-scores.json'
            }
        };
        this.modelLinks = {
            'Monomer-24B-Writer': 'https://huggingface.co/zake7749/Monomer-24B-Writer-Preview',
            'Monomer-8B-Writer': 'https://huggingface.co/zake7749/Monomer-8B-Writer-Preview'
        };
        // A constructor cannot await, so make sure the promise never floats.
        this.init().catch(error => {
            console.error('Dashboard initialisation failed:', error);
        });
    }

    /**
     * Load every score file in parallel, render both tables and attach the
     * UI handlers. The loading indicator is hidden on success and replaced by
     * an error message when rendering fails or no file could be loaded.
     *
     * @returns {Promise<void>}
     */
    async init() {
        this.showLoading(true);
        const promises = [];
        for (const judge of ['o3', 'gpt5.4']) {
            for (const type of ['general', 'complicated']) {
                promises.push(this.loadData(this.judgePaths[judge][type], judge, type));
            }
        }
        // loadData() never rejects; failed files simply stay null.
        await Promise.all(promises);

        try {
            this.renderTable('general');
            this.renderTable('complicated');
            this.setupSearch();
            this.setupJudgeToggle();
            this.setupTabs();
        } catch (error) {
            console.error('Error rendering dashboard:', error);
            this.showError('Failed to render benchmark data.');
            return;
        }

        const loadedAnything = Object.values(this.judgeData).some(
            byType => Object.values(byType).some(Boolean)
        );
        if (!loadedAnything) {
            this.showError('Failed to load benchmark data.');
            return;
        }
        this.showLoading(false);
    }

    /**
     * Fetch one score file and store the parsed JSON under
     * `judgeData[judge][type]`. Failures are logged and leave the slot as-is
     * so that the remaining files can still be displayed.
     *
     * @param {string} path - URL of the JSON file.
     * @param {string} judge - Key into `judgeData`.
     * @param {string} type - 'general' or 'complicated'.
     * @returns {Promise<void>}
     */
    async loadData(path, judge, type) {
        try {
            const response = await fetch(path);
            if (!response.ok) {
                throw new Error(`HTTP error! status: ${response.status}`);
            }
            this.judgeData[judge][type] = await response.json();
        } catch (error) {
            console.error(`Error loading ${judge}/${type} data:`, error);
        }
    }

    /** Scores of the general benchmark for the selected judge (or null). */
    get generalData() {
        return this.judgeData[this.currentJudge].general;
    }

    /** Scores of the complicated benchmark for the selected judge (or null). */
    get complicatedData() {
        return this.judgeData[this.currentJudge].complicated;
    }

    /**
     * Make the `.judge-btn` buttons switch the active judge and re-render.
     */
    setupJudgeToggle() {
        const buttons = document.querySelectorAll('.judge-btn');
        buttons.forEach(btn => {
            btn.addEventListener('click', () => {
                const judge = btn.dataset.judge;
                if (judge === this.currentJudge) return;
                // A button naming an unknown judge would break the data getters.
                if (!Object.prototype.hasOwnProperty.call(this.judgeData, judge)) return;

                this.currentJudge = judge;
                buttons.forEach(b => b.classList.remove('active'));
                btn.classList.add('active');

                this.renderTable('general');
                this.renderTable('complicated');
            });
        });
    }

    /**
     * Filter both tables by model name as the user types in `#globalSearch`.
     */
    setupSearch() {
        const searchInput = document.getElementById('globalSearch');
        if (searchInput) {
            searchInput.addEventListener('input', (e) => {
                this.searchQuery = e.target.value.toLowerCase().trim();
                this.renderTable('general');
                this.renderTable('complicated');
            });
        }
    }

    /**
     * Make each `.tab-btn` activate the `.tab-content` whose id equals the
     * button's `data-target`.
     */
    setupTabs() {
        const tabBtns = document.querySelectorAll('.tab-btn');
        const tabContents = document.querySelectorAll('.tab-content');

        tabBtns.forEach(btn => {
            btn.addEventListener('click', () => {
                const targetId = btn.dataset.target;

                // Update active state of buttons
                tabBtns.forEach(b => b.classList.remove('active'));
                btn.classList.add('active');

                // Update active state of content
                tabContents.forEach(content => {
                    content.classList.toggle('active', content.id === targetId);
                });
            });
        });
    }

    /**
     * Rebuild the table for one benchmark type from the selected judge's data,
     * the current search query and the table's sort state.
     *
     * When the judge has no (or empty) data the container is replaced by a
     * placeholder, so that a previous judge's table is never left on screen.
     *
     * @param {string} type - 'general' or 'complicated'.
     */
    renderTable(type) {
        const isGeneral = type === 'general';
        const data = isGeneral ? this.generalData : this.complicatedData;
        const sortState = isGeneral ? this.generalSort : this.complicatedSort;
        const tableContainer = document.getElementById(isGeneral ? 'generalTable' : 'complicatedTable');
        if (!tableContainer) return;

        const models = data ? Object.keys(data) : [];
        if (models.length === 0) {
            tableContainer.innerHTML = '<div class="no-data"><p>No data available.</p></div>';
            return;
        }

        // Columns come from the first model; every model is assumed to share them.
        const metrics = Object.keys(data[models[0]] || {});
        const sortClass = column => (sortState.column === column ? ` sort-${sortState.direction}` : '');
        const safeType = this.escapeHtml(type);

        const headerHTML = metrics.map(metric => `
                        <th class="sortable${sortClass(metric)}" data-type="${safeType}" data-column="${this.escapeHtml(metric)}">${this.escapeHtml(this.metricDisplayNames[metric] || metric)}</th>
                    `).join('');

        // Medals are only meaningful when sorting by a score column.
        const showMedals = sortState.column !== 'model' && ['desc', 'asc'].includes(sortState.direction);
        const ranks = showMedals ? this.computeRanks(data, sortState) : null;

        const bodyHTML = this.getSortedTableData(data, sortState, metrics)
            .map(row => this.renderRow(row, metrics, type, ranks))
            .join('');

        tableContainer.innerHTML = `
            <table>
                <thead>
                    <tr>
                        <th class="sortable${sortClass('model')}" data-type="${safeType}" data-column="model">Model</th>
                        ${headerHTML}
                    </tr>
                </thead>
                <tbody>
                    ${bodyHTML}
                </tbody>
            </table>
        `;
        this.setupTableSorting(type);
    }

    /**
     * Build the `<tr>` markup for one model.
     *
     * @param {Object} row - `{ model, ...metricValues }` as produced by getSortedTableData.
     * @param {string[]} metrics - Metric keys, in column order.
     * @param {string} type - 'general' or 'complicated' (forwarded to getHeatmapColor).
     * @param {Map<string, number>|null} ranks - Model name to rank, or null for no medals.
     * @returns {string} HTML for the row.
     */
    renderRow(row, metrics, type, ranks) {
        // Own-property lookup so names like "constructor" are not treated as linked models.
        const link = Object.prototype.hasOwnProperty.call(this.modelLinks, row.model)
            ? this.modelLinks[row.model]
            : null;
        const isBaseline = row.model === 'gpt-4.1-2025-04-14';
        let rowClass = '';
        if (link) rowClass = 'highlight-row';
        if (isBaseline) rowClass = 'baseline-row';

        const medalHtml = ranks ? this.getMedalHtml(ranks.get(row.model)) : '';
        const safeName = this.escapeHtml(row.model);
        const nameHtml = link
            ? `<a href="${this.escapeHtml(link)}" target="_blank" rel="noopener" class="model-link">${safeName}</a>`
            : safeName;
        const scoreCells = metrics.map(metric => `
                                <td class="score-cell" style="background-color: ${this.getHeatmapColor(row[metric], type, metric)}">${this.escapeHtml(this.formatScore(row[metric]))}</td>
                            `).join('');

        return `
                        <tr class="${rowClass}">
                            <td class="model-cell">
                                ${medalHtml}
                                ${nameHtml}
                                ${isBaseline ? ' (Baseline)' : ''}
                            </td>
                            ${scoreCells}
                        </tr>
                    `;
    }

    /**
     * Medal markup for the top three ranks; empty string otherwise.
     *
     * @param {number|undefined} rank - 1-based rank.
     * @returns {string}
     */
    getMedalHtml(rank) {
        if (rank === 1) return '<span class="rank-medal">🥇</span>';
        if (rank === 2) return '<span class="rank-medal">🥈</span>';
        if (rank === 3) return '<span class="rank-medal">🥉</span>';
        return '';
    }

    /**
     * Rank every model (ignoring the search filter) on the sorted column so
     * that medals reflect the real standing, not the position among the rows
     * that happen to match the search. Rank 1 is the highest value: the first
     * row when sorting descending, the last row when sorting ascending.
     *
     * @param {Object} data - Scores keyed by model name.
     * @param {{column: string, direction: string}} sortState
     * @returns {Map<string, number>} Model name to 1-based rank.
     */
    computeRanks(data, sortState) {
        const allRows = this.sortRows(
            Object.keys(data).map(model => ({ model, ...data[model] })),
            sortState
        );
        const total = allRows.length;
        const ranks = new Map();
        allRows.forEach((row, index) => {
            ranks.set(row.model, sortState.direction === 'desc' ? index + 1 : total - index);
        });
        return ranks;
    }

    /**
     * Sort rows in place by `sortState.column`; any direction other than
     * 'asc' sorts descending. Model names compare case-insensitively.
     *
     * @param {Object[]} rows
     * @param {{column: string, direction: string}} sortState
     * @returns {Object[]} The same array, sorted.
     */
    sortRows(rows, sortState) {
        if (!sortState.column) return rows;
        const sign = sortState.direction === 'asc' ? 1 : -1;
        const byModel = sortState.column === 'model';
        return rows.sort((a, b) => {
            let aVal = a[sortState.column];
            let bVal = b[sortState.column];
            if (byModel) {
                aVal = aVal.toLowerCase();
                bVal = bVal.toLowerCase();
            }
            if (aVal < bVal) return -sign;
            if (aVal > bVal) return sign;
            return 0;
        });
    }

    /**
     * Rows to display: models matching the search query, sorted per sortState.
     *
     * @param {Object} data - Scores keyed by model name.
     * @param {{column: string, direction: string}} sortState
     * @param {string[]} metrics - Unused; kept for call compatibility.
     * @returns {Object[]} Array of `{ model, ...metricValues }`.
     */
    getSortedTableData(data, sortState, metrics) {
        const rows = Object.keys(data)
            .filter(model => !this.searchQuery || model.toLowerCase().includes(this.searchQuery))
            .map(model => ({
                model,
                ...data[model]
            }));
        return this.sortRows(rows, sortState);
    }

    /**
     * Format a score with two decimals; non-numbers are returned unchanged.
     *
     * @param {*} value
     * @returns {*}
     */
    formatScore(value) {
        if (typeof value === 'number') {
            return value.toFixed(2);
        }
        return value;
    }

    /**
     * Background colour for a score cell.
     *
     * 5.0 is the neutral midpoint: lower scores shade red, higher scores
     * shade green. Intensity grows linearly and saturates at 3.5 and 8.5.
     * Alpha is capped at 0.2 so the text stays readable.
     *
     * @param {number|null|undefined} val - The score.
     * @param {string} type - Unused; kept for call compatibility.
     * @param {string} metric - Unused; kept for call compatibility.
     * @returns {string} A CSS colour ('transparent' or an rgba() value).
     */
    getHeatmapColor(val, type, metric) {
        if (val === null || val === undefined) return 'transparent';

        const minVal = 3.5;
        const maxVal = 8.5;
        const baseline = 5.0;

        if (val < baseline) {
            const intensity = Math.min(1, (baseline - val) / (baseline - minVal));
            return `rgba(239, 68, 68, ${intensity * 0.2})`;
        }
        if (val > baseline) {
            const intensity = Math.min(1, (val - baseline) / (maxVal - baseline));
            return `rgba(16, 185, 129, ${intensity * 0.2})`;
        }
        return 'transparent';
    }

    /**
     * Attach click-to-sort handlers to the freshly rendered header cells.
     *
     * @param {string} type - 'general' or 'complicated'.
     */
    setupTableSorting(type) {
        const tableContainer = document.getElementById(type === 'general' ? 'generalTable' : 'complicatedTable');
        if (!tableContainer) return;
        tableContainer.querySelectorAll('th.sortable').forEach(header => {
            header.addEventListener('click', () => {
                this.handleSort(type, header.dataset.column, header);
            });
        });
    }

    /**
     * Update the sort state for a table and re-render it. Clicking the active
     * column flips the direction; clicking another column sorts it ascending.
     *
     * @param {string} type - 'general' or 'complicated'.
     * @param {string} column - 'model' or a metric key.
     * @param {Element} headerElement - Unused: renderTable rebuilds the header
     *     cells with the right sort class. Kept for call compatibility.
     */
    handleSort(type, column, headerElement) {
        const sortState = type === 'general' ? this.generalSort : this.complicatedSort;
        if (sortState.column === column) {
            sortState.direction = sortState.direction === 'asc' ? 'desc' : 'asc';
        } else {
            sortState.column = column;
            sortState.direction = 'asc';
        }
        this.renderTable(type);
    }

    /**
     * Show or hide the `#loading` element (no-op when it is absent).
     *
     * @param {boolean} show
     */
    showLoading(show) {
        const loading = document.getElementById('loading');
        if (!loading) return;
        loading.classList.toggle('hidden', !show);
    }

    /**
     * Replace the content of `#loading` with an error message.
     *
     * @param {string} message - Plain text; it is HTML-escaped before insertion.
     */
    showError(message) {
        const loading = document.getElementById('loading');
        if (!loading) return;
        loading.innerHTML = `
            <div class="no-data">
                <i class="fas fa-exclamation-triangle"></i>
                <p>${this.escapeHtml(message)}</p>
            </div>
        `;
    }

    /**
     * Escape a value for safe interpolation into HTML text or attributes.
     *
     * @param {*} value - Converted with String() first.
     * @returns {string}
     */
    escapeHtml(value) {
        const replacements = {
            '&': '&amp;',
            '<': '&lt;',
            '>': '&gt;',
            '"': '&quot;',
            '\'': '&#39;'
        };
        return String(value).replace(/[&<>"']/g, ch => replacements[ch]);
    }
}

// Boot the dashboard once the document has been parsed.
function startDashboard() {
    new LLMBenchmarkDashboard();
}

document.addEventListener('DOMContentLoaded', startDashboard);