| | <!DOCTYPE html> |
| | <html> |
| |
|
| | <head> |
| | <title>FeatureSelect Leaderboard</title> |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css"> |
| | <link rel="icon" href="https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/AlpacaFarm_small.png"> |
| | <link href="https://cdn.jsdelivr.net/css-toggle-switch/latest/toggle-switch.css" rel="stylesheet"/> |
| |
|
| | <style> |
| | body { /* Page shell: Arial text, black on white, 50px top/bottom and 20px side padding. */ |
| | font-family: Arial, sans-serif; |
| | margin: 0; |
| | padding: 50px 20px; |
| | background-color: #FFFFFF; |
| | color: #000000; |
| | } |
| | /* Content column: capped at 700px wide and centered horizontally. */ |
| | .container { |
| | max-width: 700px; |
| | margin: auto; |
| | } |
| | /* Header block (#branding): centered text with 20px of space below it. */ |
| | #branding { |
| | text-align: center; |
| | margin-bottom: 20px; |
| | } |
| | /* Main title inside the header block. */ |
| | #branding h1 { |
| | margin: 0; |
| | font-size: 2em; |
| | } |
| | /* Every h2 on the page: no margin, 1.2em, grey (#777). */ |
| | h2 { |
| | margin: 0; |
| | font-size: 1.2em; |
| | color: #777; |
| | } |
| | |
| | table { /* Tables: full width up to 700px, centered, fixed layout. */ |
| | max-width: 700px; |
| | width: 100%; |
| | table-layout: fixed; |
| | margin: auto; |
| | font-size: 1em; |
| | } |
| | /* All cells: 6px padding, long words may wrap, content vertically centered. */ |
| | table th, |
| | table td { |
| | padding: 6px; |
| | word-wrap: break-word; |
| | vertical-align: middle; |
| | } |
| | /* Header cells get a solid 2px black underline. */ |
| | table th { |
| | border-bottom: 2px solid #000; |
| | } |
| | /* Cells with class "rank": narrow (9%) and left-aligned. */ |
| | th.rank, td.rank { |
| | width: 9%; |
| | padding-left: 10px; |
| | text-align: left; |
| | } |
| | /* Cells with class "name": widest column (55%), left-aligned with 30px left padding. */ |
| | th.name, td.name { |
| | width: 55%; |
| | padding-left: 30px; |
| | text-align: left; |
| | } |
| | /* Every cell that is neither "rank" nor "name" is right-aligned. */ |
| | th:not(.rank):not(.name), |
| | td:not(.rank):not(.name) { |
| | text-align: right; |
| | padding-right: 10px; |
| | } |
| | /* Cells with class "winRate": 17% wide with 30px right padding. */ |
| | th.winRate, td.winRate { |
| | width: 17%; |
| | padding-right: 30px; |
| | } |
| | /* Header cells: right-aligned unless a more specific rule above applies; extra bottom padding. */ |
| | th { |
| | text-align: right; |
| | padding-bottom: 15px; |
| | } |
| | /* Body cells: 10px bottom padding. */ |
| | td { |
| | padding-bottom: 10px; |
| | } |
| | /* Inside #leaderboard, "winRate" cells are drawn in grey (#999999). */ |
| | #leaderboard tr th.winRate, |
| | #leaderboard tr td.winRate { |
| | color: #999999; |
| | } |
| | /* Inside #leaderboard, "rank" cells are drawn in grey (#999999). */ |
| | #leaderboard tr th.rank, |
| | #leaderboard tr td.rank { |
| | color: #999999; |
| | } |
| | /* Zebra striping: even-positioned rows use the darker grey. */ |
| | table tr:nth-child(even) { |
| | background-color: #E8E8E8; |
| | } |
| | /* Zebra striping: odd-positioned rows use the lighter grey. */ |
| | table tr:nth-child(odd) { |
| | background-color: #F8F8F8; |
| | } |
| | |
| | .switch-toggle { /* Toggle group: laid out inline and vertically centered with adjacent text. */ |
| | display: inline-block; |
| | vertical-align: middle; |
| | } |
| | /* Label that directly follows an input in a toggle group: light grey, clickable pill. */ |
| | .switch-toggle input + label { |
| | padding: 2px; |
| | padding-left: 7px; |
| | padding-right: 7px; |
| | cursor: pointer; |
| | background-color: lightgrey; |
| | border: 1px solid transparent; |
| | font-size: 16px; |
| | } |
| | /* Selected option: green text and green border. */ |
| | .switch-toggle input:checked + label { |
| | border-color: green; |
| | color: green; |
| | } |
| | /* Unselected option: black text, no box shadow, text not selectable. */ |
| | .switch-toggle input:not(:checked) + label { |
| | color: black; |
| | box-shadow: none !important; |
| | user-select: none; |
| | } |
| | |
| | /* Row holding the toggle groups: flex container centered on both axes. */ |
| | .toggle-line { |
| | display: flex; |
| | justify-content: center; |
| | align-items: center; |
| | margin-bottom: 20px; |
| | font-size: 17px; |
| | } |
| | /* Horizontal spacing around each toggle group inside that row. */ |
| | .toggle-line .switch-toggle { |
| | margin: 0 10px; |
| | } |
| | </style> |
| | |
| | </head> |
| |
|
| | <body> |
| | <div class="container"> |
| | <div id="branding"> |
| |
|
| | <h1>FeatureSelect |
| | |
| | |
| | |
| | Leaderboard |
| | </h1> |
| | <br> |
| | <h2>An Automatic Evaluator for FeatureSelect Methods</h2> |
| | |
| | |
| | |
| | |
| | <small id="caution" style="color: #8C1515;"> |
| | <b> Length-controlled</b> (LC) win rates alleviate length biases of GPT-4, but they may favor models finetuned on its outputs. |
| | </small> |
| | <br> |
| | <a href="https://github.com/Fss2652530458/AutoFS"> |
| | <img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub logo" style="height: 1.5em;/* margin-bottom: 0; */"> |
| | </a> |
| | </div> |
| |
|
| |
|
| | |
| | <div class="toggle-line"> |
| |
|
| | Version: |
| | <div class="switch-toggle switch-evaluator" style="margin-right: 4em"> |
| | <input id="alpaca_eval" name="version" type="radio"/> |
| | <label for="alpaca_eval" onclick="">AlpacaEval</label> |
| | <input id="alpaca_eval_2" name="version" type="radio" checked="checked"/> |
| | <label for="alpaca_eval_2" onclick="">AlpacaEval 2.0</label> |
| | </div> |
| |
|
| | Filter: |
| | <div class="switch-toggle switch-filter"> |
| | <input id="community" name="filter" type="radio"/> |
| | <label for="community" onclick="">Community</label> |
| | <input id="verified" name="filter" type="radio" checked="checked"/> |
| | <label for="verified" onclick="">Verified</label> |
| | |
| | |
| | </div> |
| |
|
| |
|
| |
|
| | </div> |
| | |
| | <div class="container" style="text-align: center; margin-bottom: 10px; margin-top: -10px;"> |
| | <small id="alpaca_eval_info" style="color: #777;"> |
| | Baseline: GPT-4 Preview (11/06) | Auto-annotator: GPT-4 Preview (11/06) |
| | </small> |
| | </div> |
| |
|
| |
|
| | <!-- Leaderboard table. Only the header row is static; the page script appends one row per CSV record below it. NOTE(review): this header declares six columns, while updateTable() appends four cells per row (rank, name, lenWinRate, winRate); confirm the intended column set. --> |
| | <table id="leaderboard"> |
| | <tr> |
| | <th class="rank">Rank</th> |
| | <th class="name" onclick="sortTable('algorithm')">Algorithm <span id="arrow-algorithm">↕</span></th> |
| | <th class="lenWinRate" onclick="sortTable('num_features')">#Features <span id="arrow-num_features">↕</span></th> |
| | <th class="winRate" onclick="sortTable('mean_f1')">Mean F1 <span id="arrow-mean_f1">↕</span></th> |
| | <th class="winRate" onclick="sortTable('mean_auc')">Mean AUC <span id="arrow-mean_auc">↕</span></th> |
| | <th class="winRate" onclick="sortTable('time')">Time (s) <span id="arrow-time">↕</span></th> |
| | </tr> |
| | </table> |
| |
|
| | |
| | <div id="documentation"> |
| | <div style="text-align: center;"> |
| | <a href="https://github.com/tatsu-lab/alpaca_eval" style="display: inline-block;"> |
| | <i class="fab fa-fw fa-github" aria-hidden="true"></i> GitHub |
| | </a> |
| | </div> |
| | <br> |
| | <h2>About AlpacaEval</h2> |
| | <p> |
| | <a href="https://github.com/tatsu-lab/alpaca_eval" target="_blank">AlpacaEval</a> |
| | is an LLM-based automatic evaluation that is fast, cheap, and reliable. |
| | It is based on the |
| | <a href="https://crfm.stanford.edu/2023/05/22/alpaca-farm.html">AlpacaFarm</a> |
| | evaluation set, |
| | which tests the ability of models to follow general user instructions. |
| | Model responses are then compared to reference responses (Davinci003 for AlpacaEval, GPT-4 Preview for AlpacaEval 2.0) by |
| | the provided GPT-4 based auto-annotators, |
| | which results in the win rates presented above. |
| | AlpacaEval displays a high agreement rate with ground truth human annotations, |
| | and leaderboard rankings on AlpacaEval are highly correlated with leaderboard rankings |
| | based on human annotators. |
| | Please see our |
| | <a href="https://github.com/tatsu-lab/alpaca_eval#analysis" target="_blank">documentation</a> |
| | for more details on our analysis. |
| | </p> |
| | <h2>Adding new models</h2> |
| | <p> |
| | We welcome new model contributions to the leaderboard from the community! |
| | To do so, please follow the steps in the |
| | <a href="https://github.com/tatsu-lab/alpaca_eval#contributing" target="_blank">contributions |
| | section</a>. |
| | Specifically, you'll need to run the model on the evaluation set, |
| | auto-annotate the outputs, and submit a PR with the model config and leaderboard results. |
| | We've also set up a |
| | <a href="https://discord.gg/GJMxJSVZZM" target="_blank">Discord</a> |
| | for community support and discussion. |
| | </p> |
| | <h2>Adding new evaluators or eval sets </h2> |
| | <p> |
| | We also welcome contributions for new evaluators or new eval sets! |
| | For making new evaluators, we release our ground-truth |
| | <a href="https://github.com/tatsu-lab/alpaca_eval#data-release" target="_blank">human annotations</a> |
| | and <a href="https://github.com/tatsu-lab/alpaca_eval#analyzing-an-evaluator" target="_blank">comparison |
| | metrics</a>. |
| | We also release a |
| | <a href="https://github.com/tatsu-lab/alpaca_eval#analyzing-an-eval-set" target="_blank">rough guide</a> |
| | to follow for making new eval sets. |
| | We specifically encourage contributions for harder instruction distributions and for safety testing of |
| | LLMs. |
| | </p> |
| | <h2>AlpacaEval limitations</h2> |
| | <p> |
| | 这里是简介 |
| | </p> |
| | </div> |
| |
|
| | </div> |
| |
|
| | <script> |
| | const alpacaEvalRadio = document.getElementById('alpaca_eval'); |
| | const alpacaEval2Radio = document.getElementById('alpaca_eval_2'); |
| | |
| | const communityRadio = document.getElementById('community'); |
| | const verifiedRadio = document.getElementById('verified'); |
| | |
| | |
| | const table = document.getElementById('leaderboard'); |
| | |
| | const urls = { |
| | 'alpaca_eval': 'https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/data_AlpacaEval/alpaca_eval_gpt4_leaderboard.csv', |
| | 'alpaca_eval_2': 'https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv', |
| | |
| | } |
| | |
| | let currentUrl = urls['alpaca_eval_2']; |
| | |
| | function updateTable(url) { |
| | while (table.rows.length > 1) { |
| | table.deleteRow(1); |
| | } |
| | |
| | Papa.parse(url, { |
| | download: true, |
| | header: true, |
| | complete: function (results) { |
| | console.log(results.data); |
| | let rank = 0; |
| | results.data.forEach(row => { |
| | if (row['name'] || row['win_rate'] || row['length_controlled_winrate']) { |
| | let filter = row['filter']; |
| | |
| | if ((communityRadio.checked && (filter === 'verified' || filter === 'minimal' || filter === 'community')) || |
| | (verifiedRadio.checked && (filter === 'verified' || filter === 'minimal'))) { |
| | |
| | const tr = document.createElement('tr'); |
| | const rankTd = document.createElement('td'); |
| | const nameTd = document.createElement('td'); |
| | const winRateTd = document.createElement('td'); |
| | |
| | const lenWinRateTd = document.createElement('td'); |
| | |
| | rankTd.classList.add('rank'); |
| | nameTd.classList.add('name'); |
| | winRateTd.classList.add('winRate'); |
| | lenWinRateTd.classList.add('lenWinRate'); |
| | |
| | |
| | rank++; |
| | rankTd.textContent = rank; |
| | |
| | if (row['link'] && row['link'].trim() !== '') { |
| | const a = document.createElement('a'); |
| | a.textContent = row['name']; |
| | a.href = row['link']; |
| | a.target = "_blank"; |
| | nameTd.appendChild(a); |
| | } else { |
| | nameTd.textContent = row['name']; |
| | } |
| | |
| | |
| | if (row['samples'] && row['samples'].trim() !== '') { |
| | const samplesLink = document.createElement('a'); |
| | samplesLink.textContent = " 📄"; |
| | samplesLink.href = row['samples']; |
| | samplesLink.target = "_blank"; |
| | samplesLink.style.textDecoration = "none"; |
| | nameTd.appendChild(samplesLink); |
| | } |
| | |
| | winRateTd.textContent = Number(row['win_rate']).toFixed(1) + '%'; |
| | |
| | if (row['length_controlled_winrate'] === '') { |
| | lenWinRateTd.textContent = 'N/A'; |
| | } else { |
| | lenWinRateTd.textContent = Number(row['length_controlled_winrate']).toFixed(1) + '%'; |
| | } |
| | |
| | |
| | |
| | |
| | tr.appendChild(rankTd); |
| | tr.appendChild(nameTd); |
| | tr.appendChild(lenWinRateTd); |
| | tr.appendChild(winRateTd); |
| | |
| | |
| | table.appendChild(tr); |
| | } |
| | } |
| | }); |
| | } |
| | }); |
| | } |
| | |
| | function updateInfoMessage(version) { |
| | let infoText; |
| | if (version === 'alpaca_eval_2') { |
| | infoText = 'Baseline: GPT-4 Preview (11/06) | Auto-annotator: GPT-4 Preview (11/06)'; |
| | } else if (version === 'alpaca_eval') { |
| | infoText = 'Baseline: Davinci003 | Auto-annotator: GPT-4'; |
| | } |
| | document.getElementById('alpaca_eval_info').innerHTML = infoText; |
| | } |
| | |
| | updateTable(urls['alpaca_eval_2']); |
| | |
| | alpacaEval2Radio.addEventListener('click', function () { |
| | currentUrl = urls['alpaca_eval_2']; |
| | updateTable(currentUrl); |
| | updateInfoMessage('alpaca_eval_2'); |
| | }); |
| | |
| | alpacaEvalRadio.addEventListener('click', function () { |
| | currentUrl = urls['alpaca_eval']; |
| | updateTable(currentUrl); |
| | updateInfoMessage('alpaca_eval'); |
| | }); |
| | |
| | communityRadio.addEventListener('click', function () { |
| | updateTable(currentUrl); |
| | }); |
| | |
| | verifiedRadio.addEventListener('click', function () { |
| | updateTable(currentUrl); |
| | }); |
| | |
| | |
| | |
| | |
| | |
| | updateCautionMessage('alpaca_eval_2'); |
| | </script> |
| |
|
| |
|
| | </body> |
| |
|
| | </html> |