Spaces:
Running
Running
<!doctype html>
<!-- The doctype keeps browsers in standards mode; without it the page renders in quirks mode. -->
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>LLM Benchmark Overview</title>
<!-- Open connections to the font origins early so the stylesheet and font files load sooner. -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600&display=swap" rel="stylesheet">
| <style> | |
| /* General Body and Font Styles */ | |
| body { | |
| font-family: 'Inter', sans-serif; | |
| background-color: #1a1a2e; /* Dark background */ | |
| color: #e0e0e0; /* Light text */ | |
| margin: 0; | |
| padding: 20px; | |
| line-height: 1.6; | |
| font-size: 14px; /* Reduced base font size */ | |
| } | |
| /* Header Styling */ | |
| h1 { | |
| text-align: center; | |
| color: #a766ff; /* Neo purple */ | |
| margin-bottom: 30px; | |
| font-weight: 600; | |
| font-size: 2.2em; | |
| text-shadow: 0 0 10px rgba(167, 102, 255, 0.4); | |
| } | |
| /* Introduction Text */ | |
| body > div:nth-of-type(1) { /* Targeting the intro div */ | |
| max-width: 900px; | |
| margin: 0 auto 30px auto; | |
| text-align: justify; | |
| background-color: #2a2a4a; /* Slightly lighter dark background */ | |
| padding: 20px; | |
| border-radius: 12px; | |
| box-shadow: 0 4px 15px rgba(0, 0, 0, 0.3); | |
| font-size: 0.95em; | |
| } | |
| /* Table Container and Shadow */ | |
| .table-container { | |
| overflow-x: auto; | |
| margin-top: 20px; | |
| position: relative; | |
| border-radius: 12px; | |
| box-shadow: 0 8px 20px rgba(0, 0, 0, 0.5); /* Stronger shadow */ | |
| } | |
| /* Table Styling */ | |
| table { | |
| width: auto; /* Changed from 100% to auto to allow min-width to force overflow */ | |
| border-collapse: collapse; | |
| margin: 0 auto; | |
| background-color: #2a2a4a; /* Darker table background */ | |
| border-radius: 12px; | |
| overflow: hidden; /* Ensures rounded corners apply */ | |
| min-width: 950px; /* Ensure a minimum width for the table itself (sum of column min-widths) */ | |
| table-layout: fixed; /* Keep fixed layout for column width control */ | |
| } | |
| /* Table Headers and Cells */ | |
| th, td { | |
| padding: 10px 15px; /* Reduced vertical padding from 12px to 10px */ | |
| text-align: left; | |
| border: 1px solid #3a3a5a; /* Darker border */ | |
| font-size: 0.9em; /* Smaller font for table content */ | |
| vertical-align: top; /* Align content to top */ | |
| white-space: normal; /* Ensure cells allow content to wrap */ | |
| word-wrap: break-word; /* Ensure long words break within cells */ | |
| } | |
| /* Table Header Specifics */ | |
| th { | |
| background-color: #3a3a5a; /* Dark header background */ | |
| color: #c0c0c0; /* Lighter header text */ | |
| font-weight: 600; | |
| position: relative; | |
| /* white-space: normal and word-wrap: break-word are now in th, td general rule */ | |
| } | |
| /* Resizable Column Handler */ | |
| th.resizable .resizer { | |
| position: absolute; | |
| top: 0; | |
| right: 0; | |
| width: 8px; /* Wider resizer for easier grabbing */ | |
| height: 100%; | |
| cursor: col-resize; | |
| background-color: rgba(167, 102, 255, 0.2); /* Semi-transparent purple */ | |
| transition: background-color 0.2s ease-in-out; | |
| } | |
| th.resizable .resizer:hover { | |
| background-color: rgba(167, 102, 255, 0.5); /* More visible on hover */ | |
| } | |
| /* Alternating Row Colors */ | |
| tr:nth-child(even) { | |
| background-color: #2f2f50; /* Slightly different shade for even rows */ | |
| } | |
| /* Specific Column Styling for wider columns */ | |
| /* Adjusted widths for better display */ | |
| th:nth-child(1), td:nth-child(1) { width: 15%; min-width: 120px; } /* Evaluated task */ | |
| th:nth-child(2), td:nth-child(2) { width: 15%; min-width: 120px; } /* Benchmark Name */ | |
| th:nth-child(3), td:nth-child(3) { width: 20%; min-width: 150px; } /* Metric often used */ | |
| th:nth-child(4), td:nth-child(4) { width: 20%; min-width: 150px; } /* Question + context example */ | |
| th:nth-child(5), td:nth-child(5) { width: 20%; min-width: 150px; } /* Answer examp */ | |
| th:nth-child(6), td:nth-child(6) { width: 15%; min-width: 120px; } /* Paper */ | |
| th:nth-child(7), td:nth-child(7) { width: 15%; min-width: 120px; } /* HF or Git link */ | |
/* Inner div for truncated content: shows at most 4 lines, the rest is cut with an ellipsis. */
.cell-content {
  cursor: pointer; /* Signals that the cell can be clicked to expand */
  overflow: hidden;
  text-overflow: ellipsis;
  /* Prefixed form: required by current engines for multi-line clamping */
  display: -webkit-box;
  -webkit-line-clamp: 4;
  -webkit-box-orient: vertical;
  /* Standard property; ignored by engines that do not know it yet */
  line-clamp: 4;
  /* Wrapping rules (white-space / word-wrap) are inherited from the th, td rule */
}
| /* Hover effect on the cell, not the inner content */ | |
| td:hover { | |
| background-color: #3a3a5a; /* Highlight on hover */ | |
| } | |
| /* Filter and Search Bar Styling */ | |
| .filter { | |
| margin-bottom: 25px; | |
| text-align: center; | |
| display: flex; | |
| flex-wrap: wrap; | |
| justify-content: center; | |
| align-items: center; | |
| gap: 15px; | |
| } | |
| .filter label { | |
| font-size: 1em; | |
| margin-right: 5px; | |
| color: #a766ff; /* Neo purple */ | |
| font-weight: 600; | |
| } | |
| .filter select, .filter input[type="text"] { | |
| padding: 8px 12px; | |
| font-size: 0.95em; | |
| border: 1px solid #5a5a7a; /* Darker border */ | |
| border-radius: 8px; | |
| background-color: #3a3a5a; /* Dark input background */ | |
| color: #e0e0e0; /* Light input text */ | |
| outline: none; | |
| transition: border-color 0.2s ease-in-out, box-shadow 0.2s ease-in-out; | |
| } | |
| .filter select:focus, .filter input[type="text"]:focus { | |
| border-color: #a766ff; | |
| box-shadow: 0 0 8px rgba(167, 102, 255, 0.5); | |
| } | |
| .filter input[type="text"] { | |
| flex-grow: 1; /* Allow search input to grow */ | |
| max-width: 400px; | |
| } | |
| /* Modal and Overlay Styling */ | |
| .modal { | |
| position: fixed; | |
| top: 50%; | |
| left: 50%; | |
| transform: translate(-50%, -50%); | |
| background-color: #2a2a4a; /* Dark modal background */ | |
| box-shadow: 0 8px 20px rgba(0, 0, 0, 0.6); /* Stronger shadow */ | |
| padding: 30px; | |
| z-index: 1000; | |
| border-radius: 12px; | |
| max-width: 90%; | |
| max-height: 90%; | |
| overflow: auto; | |
| color: #e0e0e0; /* Light text */ | |
| font-size: 1em; | |
| white-space: pre-wrap; /* Preserve formatting for modal content */ | |
| } | |
| .overlay { | |
| position: fixed; | |
| top: 0; | |
| left: 0; | |
| width: 100%; | |
| height: 100%; | |
| background: rgba(0, 0, 0, 0.8); /* Darker overlay */ | |
| z-index: 999; | |
| } | |
| /* Link Styling */ | |
| a { | |
| color: #a766ff; /* Neo purple for links */ | |
| text-decoration: none; | |
| transition: color 0.2s ease-in-out; | |
| } | |
| a:hover { | |
| color: #c08cff; /* Lighter purple on hover */ | |
| text-decoration: underline; | |
| } | |
| </style> | |
| </head> | |
| <body> | |
<h1>LLM Benchmark Overview (Update Ongoing)</h1>
<!-- Intro text. Must remain the first <div> child of <body>: the stylesheet targets it with body > div:nth-of-type(1). -->
<div>As the development and evaluation of large language models (LLMs) continue to evolve, I conducted an overview of the principal benchmarks commonly found in research papers. My goal is to create a clear and comprehensive resource that summarizes what is being tested in LLMs, with concrete examples, key metrics, and direct links to related papers and repositories. This document serves as a centralized matrix that will be continuously updated with insights from future papers I review.</div>
<div class="filter">
  <label for="metricFilter">Filter by Evaluated task:</label>
  <select id="metricFilter">
    <option value="">All</option>
  </select>
  <!-- A placeholder is not an accessible name, so the input is labelled explicitly. -->
  <input type="text" id="searchInput" placeholder="Search for benchmark names..." aria-label="Search for benchmark names" style="margin-bottom: 0;">
</div>
<div class="table-container">
  <table id="csvTable">
    <thead>
      <!-- Headers will be dynamically added -->
    </thead>
    <tbody>
      <!-- Rows will be dynamically added here -->
    </tbody>
  </table>
</div>
<!-- Overlay and modal start hidden; the script toggles their inline display value. -->
<div class="overlay" id="overlay" style="display: none;"></div>
<div class="modal" id="modal" style="display: none;"></div>
| <script> | |
/**
 * Parses CSV text into a header row and data rows.
 *
 * Supported syntax:
 *  - fields separated by commas, records separated by \n, \r\n or \r;
 *  - double-quoted fields that may contain commas and line breaks;
 *  - a doubled quote ("") inside a quoted field, which yields one literal quote.
 *
 * Every field is trimmed. Completely blank lines are skipped. A trailing empty
 * field on the last line (e.g. "1," without a final newline) is kept so that
 * all rows have the same number of columns.
 *
 * @param {string} content Raw CSV text; null/undefined is treated as empty.
 * @returns {{headers: string[], rows: string[][]}} The first record as
 *   `headers` (an empty array when there is no data) and the rest as `rows`.
 */
function parseCSV(content) {
  // Drop a leading byte-order mark so it does not end up in the first header.
  const text = String(content == null ? '' : content).replace(/^\uFEFF/, '');
  const rows = [];
  let currentRow = [];
  let currentField = '';
  let insideQuotes = false;
  // True once the current record has consumed at least one character; lets us
  // tell "blank line" apart from "record whose last field is empty".
  let recordStarted = false;

  const endField = () => {
    currentRow.push(currentField.trim());
    currentField = '';
  };
  const endRecord = () => {
    endField();
    rows.push(currentRow);
    currentRow = [];
    recordStarted = false;
  };

  for (let i = 0; i < text.length; i++) {
    const char = text[i];
    const isLineBreak = char === '\n' || char === '\r';

    if (isLineBreak && !insideQuotes) {
      // Treat \r\n as a single record terminator.
      if (char === '\r' && text[i + 1] === '\n') i++;
      if (recordStarted) endRecord();
      continue;
    }

    recordStarted = true;
    if (char === '"') {
      if (insideQuotes && text[i + 1] === '"') {
        // Escaped quote inside a quoted field.
        currentField += '"';
        i++;
      } else {
        insideQuotes = !insideQuotes;
      }
    } else if (char === ',' && !insideQuotes) {
      endField();
    } else {
      currentField += char;
    }
  }

  // Flush the last record when the text does not end with a line break.
  if (recordStarted) endRecord();

  const headers = rows.shift() || []; // First record is the header row
  return { headers, rows };
}
/**
 * Downloads a CSV file from a Hugging Face dataset repository and parses it.
 *
 * The file is fetched from
 * `https://huggingface.co/datasets/<dataset>/resolve/main/<filename>`.
 * Failures (network error or non-OK status) are logged and NOT rethrown:
 * the returned promise then resolves to an empty result.
 *
 * @param {string} dataset Dataset id, e.g. "owner/name".
 * @param {string} filename File name inside the dataset repository.
 * @param {string} [token] Access token. When given it is sent as a Bearer
 *   Authorization header; when empty/undefined no Authorization header is sent.
 * @returns {Promise<{headers: string[], rows: string[][]}>} Parsed CSV, or
 *   `{ headers: [], rows: [] }` when loading failed.
 */
async function loadCSVFromHuggingFace(dataset, filename, token) {
  const url = `https://huggingface.co/datasets/${dataset}/resolve/main/${filename}`;
  // Only authenticate when a token is actually available; otherwise the
  // request would carry the literal header value "Bearer undefined".
  const headers = {};
  if (token) {
    headers['Authorization'] = `Bearer ${token}`;
  }
  try {
    const response = await fetch(url, { headers });
    if (!response.ok) {
      throw new Error(`Failed to fetch file: ${response.statusText}`);
    }
    const content = await response.text();
    return parseCSV(content);
  } catch (error) {
    console.error("Error loading CSV from Hugging Face:", error);
    // Callers receive an empty data set instead of a rejected promise.
    return { headers: [], rows: [] };
  }
}
// Cached references to the page elements used by the script.
const metricFilter = document.getElementById('metricFilter');
const table = document.getElementById('csvTable');
const tableHead = table.querySelector('thead');
const tableBody = table.querySelector('tbody');
const overlay = document.getElementById('overlay');
const modal = document.getElementById('modal');
const searchInput = document.getElementById('searchInput');

// Live search: on every keystroke, show only the rows whose benchmark name
// (second column) contains the typed text AND that match the task currently
// selected in the dropdown. Rows without a second cell are left untouched.
searchInput.addEventListener('input', function () {
  const needle = searchInput.value.trim().toLowerCase();
  for (const row of table.querySelectorAll('tbody tr')) {
    const nameCell = row.cells[1]; // Benchmark Name is assumed to be column 2
    if (!nameCell) {
      continue;
    }
    const benchmarkName = nameCell.textContent.trim().toLowerCase();
    const selectedTask = metricFilter.value;
    // data-metric-type holds the row's original task value (set when the row is built)
    const taskMatches = !selectedTask || row.dataset.metricType === selectedTask;
    const visible = benchmarkName.includes(needle) && taskMatches;
    row.style.display = visible ? '' : 'none';
  }
});
/**
 * Adds a drag handle (".resizer") to every <th> in the document so the column
 * width can be changed with the mouse or by touch.
 *
 * Safe to call repeatedly: an existing handle is removed before a new one is
 * added. While dragging, the header's inline width follows the pointer but is
 * never set below MIN_COLUMN_WIDTH pixels, so the column cannot collapse and
 * no zero/negative (invalid) width is ever written.
 */
function makeResizable() {
  const MIN_COLUMN_WIDTH = 40; // px; lower bound applied while dragging

  // Horizontal page coordinate for both mouse and touch events.
  const pointerX = (e) => (e.touches && e.touches.length ? e.touches[0].pageX : e.pageX);

  document.querySelectorAll('th').forEach(th => {
    // Remove existing resizer to prevent duplicates on re-render
    const existingResizer = th.querySelector('.resizer');
    if (existingResizer) {
      existingResizer.remove();
    }
    const resizer = document.createElement('div');
    resizer.classList.add('resizer');
    th.appendChild(resizer);

    let startX = 0;
    let startWidth = 0;

    const resizeColumn = (e) => {
      const newWidth = startWidth + (pointerX(e) - startX);
      th.style.width = `${Math.max(MIN_COLUMN_WIDTH, newWidth)}px`;
      // Prevent text selection (mouse) and page scrolling (touch) during resize
      e.preventDefault();
    };

    const stopResize = () => {
      document.removeEventListener('mousemove', resizeColumn);
      document.removeEventListener('mouseup', stopResize);
      document.removeEventListener('touchmove', resizeColumn);
      document.removeEventListener('touchend', stopResize);
      document.removeEventListener('touchcancel', stopResize);
    };

    const initResize = (e) => {
      startX = pointerX(e);
      startWidth = th.offsetWidth;
      document.addEventListener('mousemove', resizeColumn);
      document.addEventListener('mouseup', stopResize);
      // passive: false is required so resizeColumn may call preventDefault()
      document.addEventListener('touchmove', resizeColumn, { passive: false });
      document.addEventListener('touchend', stopResize);
      // An interrupted touch never fires touchend; clean up on cancel as well
      document.addEventListener('touchcancel', stopResize);
    };

    resizer.addEventListener('mousedown', initResize);
    resizer.addEventListener('touchstart', initResize, { passive: false });
  });
}
/**
 * Rebuilds the task dropdown from the data.
 *
 * Collects the distinct values found in column `headerIndex`, sorts them with
 * the default string ordering, and appends one <option> per non-empty value
 * after the leading "All" option.
 *
 * @param {string[][]} data Data rows.
 * @param {number} headerIndex Index of the column to collect values from.
 */
function populateFilterOptions(data, headerIndex) {
  const distinctValues = new Set();
  for (const row of data) {
    distinctValues.add(row[headerIndex]);
  }
  const sortedValues = Array.from(distinctValues);
  sortedValues.sort();

  // Start over with only the catch-all entry
  metricFilter.innerHTML = '<option value="">All</option>';

  for (const value of sortedValues) {
    if (!value) {
      continue; // Empty values do not become options
    }
    const option = document.createElement('option');
    option.value = value;
    option.textContent = value;
    metricFilter.appendChild(option);
  }
}
// Link text shown for the columns whose values are rendered as hyperlinks.
const LINK_COLUMN_LABELS = {
  'Paper': 'Paper Link',
  'HF or Git link': 'Dataset Link',
};

// Returns true only for URLs that resolve to http: or https:. Cell values
// come from a remote CSV, so schemes such as javascript: must not reach href.
function isSafeHttpUrl(value) {
  try {
    const parsed = new URL(value, window.location.href);
    return parsed.protocol === 'http:' || parsed.protocol === 'https:';
  } catch (e) {
    return false;
  }
}

// Builds one body cell: truncated content (link or plain text) plus a click
// handler that shows the full raw value in the modal.
function createBodyCell(header, value) {
  const td = document.createElement('td');
  const contentDiv = document.createElement('div'); // Inner div carries the truncation styles
  contentDiv.classList.add('cell-content');

  const linkLabel = LINK_COLUMN_LABELS[header];
  if (linkLabel && value && isSafeHttpUrl(value)) {
    const link = document.createElement('a');
    link.href = value;
    link.textContent = linkLabel;
    link.target = '_blank';
    // The new tab must not get a handle on this page via window.opener
    link.rel = 'noopener noreferrer';
    contentDiv.appendChild(link);
  } else if (linkLabel && value) {
    // Not an http(s) URL: show the raw value as text instead of linking to it
    contentDiv.textContent = value;
  } else if (!linkLabel) {
    contentDiv.textContent = value;
  }
  td.appendChild(contentDiv);

  td.title = 'Click to expand';
  td.addEventListener('click', () => {
    overlay.style.display = 'block';
    modal.style.display = 'block';
    modal.textContent = value; // Display full content in modal
  });
  return td;
}

/**
 * Re-renders the whole table (header row and body rows).
 *
 * Rows are kept when column `headerIndex` equals `filterValue` (or
 * `filterValue` is empty) and the benchmark name (column index 1) contains
 * the current search text. Kept rows are sorted by their first column.
 * Column resize handles are re-attached afterwards.
 *
 * @param {string[]} headers Column titles.
 * @param {string[][]} rows Data rows; the array itself is not modified.
 * @param {string} filterValue Selected task, or '' for all tasks.
 * @param {number} headerIndex Index of the column `filterValue` applies to.
 */
function populateTable(headers, rows, filterValue, headerIndex) {
  tableHead.innerHTML = '';
  tableBody.innerHTML = '';

  const headerRow = document.createElement('tr');
  headers.forEach(header => {
    const th = document.createElement('th');
    th.textContent = header;
    th.scope = 'col'; // Associates the header with its column for assistive technology
    th.classList.add('resizable');
    headerRow.appendChild(th);
  });
  tableHead.appendChild(headerRow);

  const searchFilterText = searchInput.value.trim().toLowerCase();
  rows
    .filter(row => {
      const matchesMetric = !filterValue || row[headerIndex] === filterValue;
      const benchmarkName = row[1] ? row[1].toLowerCase() : ''; // Assuming Benchmark Name is at index 1
      const matchesSearch = benchmarkName.includes(searchFilterText);
      return matchesMetric && matchesSearch;
    })
    // Sort by the first column; a missing first cell sorts as an empty string
    .sort((a, b) => (a[0] || '').localeCompare(b[0] || ''))
    .forEach(row => {
      const tr = document.createElement('tr');
      tr.dataset.metricType = row[headerIndex]; // Read back by the search listener
      row.forEach((value, index) => {
        tr.appendChild(createBodyCell(headers[index], value));
      });
      tableBody.appendChild(tr);
    });

  makeResizable(); // Header cells were recreated, so the handles must be too
}
// Hides the modal and the overlay behind it.
function closeModal() {
  overlay.style.display = 'none';
  modal.style.display = 'none';
}
// Close modal on overlay click
overlay.addEventListener('click', closeModal);
// Also close it with the Escape key so it can be dismissed without a pointer
document.addEventListener('keydown', (event) => {
  if (event.key === 'Escape' && modal.style.display !== 'none') {
    closeModal();
  }
});
// Filter change listener: re-render the table for the newly selected task
metricFilter.addEventListener('change', () => {
  // Nothing to render until the CSV has finished loading
  if (!parsedCSV) {
    return;
  }
  const filterValue = metricFilter.value;
  populateTable(parsedCSV.headers, parsedCSV.rows, filterValue, 0);
});
// Parsed CSV ({ headers, rows }) kept for re-rendering when the filter changes.
let parsedCSV;

// Replaces the table body with a single error row.
function showLoadError() {
  tableBody.innerHTML = '<tr><td colspan="7" style="text-align: center; color: #ff6b6b;">Failed to load data. Please check the dataset link or your internet connection.</td></tr>';
}

// Read the token defensively: when the hosting page does not define
// window.huggingface.variables, a direct property access would throw before
// the request is even started.
// NOTE(review): the token is read and sent from browser code, so it is visible
// to anyone who opens the page — confirm it is a low-privilege read token.
const hfVariables = window.huggingface && window.huggingface.variables;
const hfToken = hfVariables ? hfVariables.HF_TOKEN : undefined;

loadCSVFromHuggingFace('UlrickBL/benchmark_overview', 'benchmark_overview.csv', hfToken)
  .then(({ headers, rows }) => {
    parsedCSV = { headers, rows };
    // The loader reports failures as an empty result rather than a rejection,
    // so an empty header row is treated as "loading failed".
    if (!headers || headers.length === 0) {
      showLoadError();
      return;
    }
    populateFilterOptions(rows, 0);
    populateTable(headers, rows, '', 0);
  })
  .catch(error => {
    console.error("Failed to load CSV data:", error);
    // Display a user-friendly message if data loading or rendering fails
    showLoadError();
  });
| </script> | |
| </body> | |
| </html> | |