Spaces:

Fancy-yousa
/

FeatureSelect-Methods-Leaderboard

Sleeping

File size: 16,010 Bytes

94e0fc9

<!DOCTYPE html>
<html lang="en">

<head>
    <!-- Declare the encoding before any text: the page contains non-ASCII characters (arrows, emoji, CJK). -->
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>FeatureSelect Leaderboard</title>

    <!-- Google tag (gtag.js) -->
    <!-- <script async src="https://www.googletagmanager.com/gtag/js?id=G-VWV023WWP4"></script> -->
    <!-- <script>
        window.dataLayer = window.dataLayer || [];

        function gtag() {
            dataLayer.push(arguments);
        }

        gtag('js', new Date());

        gtag('config', 'G-VWV023WWP4');
    </script> -->

    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css">
    <link rel="icon" href="https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/AlpacaFarm_small.png">
    <link href="https://cdn.jsdelivr.net/css-toggle-switch/latest/toggle-switch.css" rel="stylesheet"/>

    <style>
        /* Page-wide defaults: white background, black text, padding around the content. */
        body {
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 50px 20px;
            background-color: #FFFFFF;
            color: #000000;
        }

        /* Content column: capped at 700px and centred horizontally. */
        .container {
            max-width: 700px;
            margin: auto;
        }

        /* Title block at the top of the page. */
        #branding {
            text-align: center;
            margin-bottom: 20px;
        }

        #branding h1 {
            margin: 0;
            font-size: 2em;
        }

        /* All h2 elements (subtitle and documentation headings) are small and grey. */
        h2 {
            margin: 0;
            font-size: 1.2em;
            color: #777;
        }

        /* Leaderboard table: same 700px cap as .container, fixed layout so the
           percentage widths declared on the column classes below are honoured. */
        table {
            max-width: 700px;
            width: 100%;
            table-layout: fixed;
            margin: auto;
            font-size: 1em;
        }

        /* Shared cell padding; long words wrap instead of overflowing the fixed-width cell. */
        table th,
        table td {
            padding: 6px;
            word-wrap: break-word;
            vertical-align: middle;
        }

        /* Rule under the header row. */
        table th {
            border-bottom: 2px solid #000;
        }

        /* Rank column: narrow and left-aligned. */
        th.rank, td.rank {
            width: 9%; /* Adjust as needed */
            padding-left: 10px; /* Small margin */
            text-align: left;
        }

        /* Name column: takes most of the width, left-aligned. */
        th.name, td.name {
            width: 55%;
            padding-left: 30px;
            text-align: left;
        }

        /* Every column other than rank and name is right-aligned. */
        th:not(.rank):not(.name),
        td:not(.rank):not(.name) {
            text-align: right;
            padding-right: 10px;
        }

        /* Cells carrying the winRate class get a fixed width and a wider right padding. */
        th.winRate, td.winRate {
            width: 17%;
            padding-right: 30px;
        }

        /* Extra space below header text; the class rules above have higher specificity
           and therefore keep the rank/name headers left-aligned. */
        th {
            text-align: right;
            padding-bottom: 15px;
        }

        td {
            padding-bottom: 10px;
        }

        /* winRate and rank cells are de-emphasised in grey. */
        #leaderboard tr th.winRate,
        #leaderboard tr td.winRate {
            color: #999999;
        }

        #leaderboard tr th.rank,
        #leaderboard tr td.rank {
            color: #999999;
        }

        /* Zebra striping, based on each row's position among its siblings. */
        table tr:nth-child(even) {
            background-color: #E8E8E8;
        }

        table tr:nth-child(odd) {
            background-color: #F8F8F8;
        }

        /* Radio-based toggle groups (Version / Filter). */
        .switch-toggle {
            display: inline-block;
            vertical-align: middle;
        }

        /* Each label following a radio input is drawn as a grey pill. */
        .switch-toggle input + label {
            padding: 2px;
            padding-left: 7px;
            padding-right: 7px;
            cursor: pointer;
            background-color: lightgrey;
            border: 1px solid transparent;
            font-size: 16px;
        }

        /* Selected option: green border and text. */
        .switch-toggle input:checked + label {
            border-color: green;
            color: green;
        }

        /* Unselected options: plain black text, no shadow, label text not selectable. */
        .switch-toggle input:not(:checked) + label {
            color: black;
            box-shadow: none !important;
            user-select: none;
        }


        /* Row holding the toggle groups: centred flex line. */
        .toggle-line {
            display: flex;
            justify-content: center;
            align-items: center;
            margin-bottom: 20px;
            font-size: 17px;
        }

        .toggle-line .switch-toggle {
            margin: 0 10px;
        }
    </style>
    <!-- PapaParse defines the global `Papa` that the inline script at the end of <body> uses to
         download and parse the leaderboard CSV. It is loaded without `defer`/`async` on purpose:
         the inline script runs during parsing and needs `Papa` to exist by then. -->
    <script src="https://cdnjs.cloudflare.com/ajax/libs/PapaParse/5.3.0/papaparse.min.js"></script>
</head>

<body>
<div class="container">
    <!-- Branding block: page title, subtitle, caution note and a link to the project repository. -->
    <div id="branding">

        <!-- Title; the logo link inside it is currently disabled. -->
        <h1>FeatureSelect
            <!-- <a href="https://github.com/tatsu-lab/alpaca_eval/tree/main">
                <img src="https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/AlpacaFarm_small.png"
                     alt="Logo" style="height: 2em; vertical-align: middle;"></a> -->
            Leaderboard
        </h1>
        <br>
        <h2>An Automatic Evaluator for FeatureSelect Methods</h2>
        <!-- Disabled copy of the baseline caption; the live one sits below the toggle line. -->
<!--        <small id="alpaca_eval_info" style="color: #777;">-->
<!--            Baseline: GPT-4 Preview &nbsp; | &nbsp; Auto-annotator: GPT-4 Preview-->
<!--        </small>-->
<!--        <br>-->
        <!-- NOTE(review): this caution text talks about GPT-4 length-controlled win rates while the
             page is titled "FeatureSelect Leaderboard" - confirm the wording is still intended. -->
        <small id="caution" style="color: #8C1515;">
            <b> Length-controlled</b> (LC) win rates alleviate length biases of GPT-4, but it may favor models finetuned on its outputs.
        </small>
        <br>
        <!-- Link to the project repository on GitHub. -->
        <a href="https://github.com/Fss2652530458/AutoFS">
            <img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub logo" style="height: 1.5em;/* margin-bottom: 0; */">
        </a>
    </div>


<!-- Selectors: the "Version" and "Filter" radio toggles. -->
    <div class="toggle-line">

        <!-- Version toggle: the inline script attaches click listeners to both radios and
             reloads the table from the CSV URL that matches the clicked radio's id. -->
        Version:
        <div class="switch-toggle switch-evaluator" style="margin-right: 4em">
            <input id="alpaca_eval" name="version" type="radio"/>
            <label for="alpaca_eval" onclick="">AlpacaEval</label>
            <input id="alpaca_eval_2" name="version" type="radio" checked="checked"/>
            <label for="alpaca_eval_2" onclick="">AlpacaEval 2.0</label>
        </div>

        <!-- Filter toggle: updateTable() reads which radio is checked and compares it with
             each CSV row's "filter" column to decide whether the row is shown. -->
        Filter:
        <div class="switch-toggle switch-filter">
            <input id="community" name="filter" type="radio"/>
            <label for="community" onclick="">Community</label>
            <input id="verified" name="filter" type="radio" checked="checked"/>
            <label for="verified" onclick="">Verified</label>
            <!-- Disabled third option ("Minimal"). -->
<!--            <input id="minimal" name="compactness" type="radio"/>-->
<!--            <label for="minimal" onclick="">Minimal</label>-->
        </div>



    </div>
<!-- Baseline / auto-annotator caption in small grey text; updateInfoMessage() in the inline
     script replaces the contents of #alpaca_eval_info when the version toggle is clicked. -->
    <div class="container" style="text-align: center; margin-bottom: 10px; margin-top: -10px;">
        <small id="alpaca_eval_info" style="color: #777;">
            Baseline: GPT-4 Preview (11/06) &nbsp; | &nbsp; Auto-annotator: GPT-4 Preview (11/06)
        </small>
    </div>


<!-- Leaderboard table. Only the header row is static markup; the inline script's updateTable()
     deletes every row after the first and appends one <tr> per visible CSV entry.
     The header lives in <thead> with scope="col" so assistive technology can associate data
     cells with their column. No <caption>/<tbody> is added here on purpose: extra children of
     <table> would shift the nth-child based row striping of the appended rows. -->
    <table id="leaderboard">
        <thead>
        <tr>
            <th class="rank" scope="col">Rank</th>
            <th class="name" scope="col" onclick="sortTable('algorithm')">Algorithm <span id="arrow-algorithm">↕</span></th>
            <th class="lenWinRate" scope="col" onclick="sortTable('num_features')">#Features <span id="arrow-num_features">↕</span></th>
            <th class="winRate" scope="col" onclick="sortTable('mean_f1')">Mean F1 <span id="arrow-mean_f1">↕</span></th>
            <th class="winRate" scope="col" onclick="sortTable('mean_auc')">Mean AUC <span id="arrow-mean_auc">↕</span></th>
            <th class="winRate" scope="col" onclick="sortTable('time')">Time (s) <span id="arrow-time">↕</span></th>
        </tr>
        </thead>
    </table>

    <!--            Documentation overview-->
    <div id="documentation">
        <div style="text-align: center;">
            <a href="https://github.com/tatsu-lab/alpaca_eval" style="display: inline-block;">
                <i class="fab fa-fw fa-github" aria-hidden="true"></i> GitHub
            </a>
        </div>
        <br>
        <h2>About AlpacaEval</h2>
        <p>
            <a href="https://github.com/tatsu-lab/alpaca_eval" target="_blank">AlpacaEval</a>
            is an LLM-based automatic evaluation that is fast, cheap, and reliable.
            It is based on the
            <a href="https://crfm.stanford.edu/2023/05/22/alpaca-farm.html">AlpacaFarm</a>
            evaluation set,
            which tests the ability of models to follow general user instructions.
            These responses are then compared to reference responses (Davinci003 for AlpacaEval, GPT-4 Preview for AlpacaEval 2.0) by
            the provided GPT-4 based auto-annotators,
            which results in the win rates presented above.
            AlpacaEval displays a high agreement rate with ground truth human annotations,
            and leaderboard rankings on AlpacaEval are very correlated with leaderboard rankings
            based on human annotators.
            Please see our
            <a href="https://github.com/tatsu-lab/alpaca_eval#analysis" target="_blank">documentation</a>
            for more details on our analysis.
        </p>
        <h2>Adding new models</h2>
        <p>
            We welcome new model contributions to the leaderboard from the community!
            To do so, please follow the steps in the
            <a href="https://github.com/tatsu-lab/alpaca_eval#contributing" target="_blank">contributions
                section</a>.
            Specifically, you'll need to run the model on the evaluation set,
            auto-annotate the outputs, and submit a PR with the model config and leaderboard results.
            We've also set up a
            <a href="https://discord.gg/GJMxJSVZZM" target="_blank">Discord</a>
            for community support and discussion.
        </p>
        <h2>Adding new evaluators or eval sets </h2>
        <p>
            We also welcome contributions for new evaluators or new eval sets!
            For making new evaluators, we release our ground-truth
            <a href="https://github.com/tatsu-lab/alpaca_eval#data-release" target="_blank">human annotations</a>
            and <a href="https://github.com/tatsu-lab/alpaca_eval#analyzing-an-evaluator" target="_blank">comparison
            metrics</a>.
            We also release a
            <a href="https://github.com/tatsu-lab/alpaca_eval#analyzing-an-eval-set" target="_blank">rough guide</a>
            to follow for making new eval sets.
            We specifically encourage contributions for harder instructions distributions and for safety testing of
            LLMs.
        </p>
        <h2>AlpacaEval limitations</h2>
        <p>
           这里是简介
        </p>
    </div>

</div>

<script>
    // Radio inputs of the "Version" toggle.
    const alpacaEvalRadio = document.querySelector('#alpaca_eval');
    const alpacaEval2Radio = document.querySelector('#alpaca_eval_2');

    // Radio inputs of the "Filter" toggle (a third "minimal" option is disabled in the markup).
    const communityRadio = document.querySelector('#community');
    const verifiedRadio = document.querySelector('#verified');

    // Table that updateTable() fills with one row per visible CSV entry.
    const table = document.querySelector('#leaderboard');

    // CSV source of each leaderboard version, keyed by the id of the matching radio input.
    const urls = {
        alpaca_eval: 'https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/data_AlpacaEval/alpaca_eval_gpt4_leaderboard.csv',
        alpaca_eval_2: 'https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv',
    };

    // URL of the CSV currently shown; the page starts on the AlpacaEval 2.0 data.
    let currentUrl = urls.alpaca_eval_2;

    // Id of the most recent updateTable() request. A download that completes after a newer
    // request has been issued is ignored, so toggling quickly cannot leave duplicated rows.
    let latestTableRequest = 0;

    /**
     * Format a CSV field as a percentage with one decimal place.
     * Missing, blank or non-numeric values yield 'N/A' rather than 'NaN%'.
     */
    function formatPercent(value) {
        if (value === undefined || value === null || String(value).trim() === '') {
            return 'N/A';
        }
        const parsed = Number(value);
        return Number.isFinite(parsed) ? parsed.toFixed(1) + '%' : 'N/A';
    }

    /**
     * Build an <a> that opens `href` in a new tab.
     * Returns null when `href` does not resolve to an http(s) URL; the value comes from a
     * remotely hosted CSV, so schemes such as `javascript:` must not become clickable.
     */
    function createExternalLink(text, href) {
        let resolved;
        try {
            resolved = new URL(href, document.baseURI);
        } catch (e) {
            return null;
        }
        if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
            return null;
        }
        const anchor = document.createElement('a');
        anchor.textContent = text;
        anchor.href = href;
        anchor.target = '_blank';
        // Prevent the opened page from reaching back through window.opener.
        anchor.rel = 'noopener noreferrer';
        return anchor;
    }

    /**
     * Decide whether a row with the given `filter` column passes the selected Filter toggle:
     * "Community" shows verified, minimal and community rows; "Verified" shows verified and
     * minimal rows only.
     */
    function isRowVisible(filter) {
        const verifiedLike = filter === 'verified' || filter === 'minimal';
        if (communityRadio.checked) {
            return verifiedLike || filter === 'community';
        }
        return verifiedRadio.checked && verifiedLike;
    }

    /**
     * Reload the leaderboard from a CSV file.
     *
     * Removes every row except the header, downloads and parses `url` with PapaParse, and
     * appends one row (rank, name, length-controlled win rate, win rate) per entry that
     * passes the Filter toggle. Load failures are reported on the console and leave the
     * table empty.
     *
     * @param {string} url - Location of the CSV; the first line is used as the header.
     */
    function updateTable(url) {
        const requestId = ++latestTableRequest;

        // Keep row 0 (the header) and drop everything after it.
        while (table.rows.length > 1) {
            table.deleteRow(1);
        }

        if (typeof Papa === 'undefined') {
            console.error('PapaParse is not loaded; cannot load leaderboard data from ' + url);
            return;
        }

        Papa.parse(url, {
            download: true,
            header: true,
            complete: function (results) {
                // A newer request has been started since this one: discard the stale result.
                if (requestId !== latestTableRequest) {
                    return;
                }

                let rank = 0; // Counts only the rows that are actually displayed.
                results.data.forEach(row => {
                    // Skip blank records (e.g. the trailing empty line of the CSV).
                    if (!(row['name'] || row['win_rate'] || row['length_controlled_winrate'])) {
                        return;
                    }
                    if (!isRowVisible(row['filter'])) {
                        return;
                    }

                    const tr = document.createElement('tr');
                    const rankTd = document.createElement('td');
                    const nameTd = document.createElement('td');
                    const winRateTd = document.createElement('td');
                    const lenWinRateTd = document.createElement('td');

                    rankTd.classList.add('rank');
                    nameTd.classList.add('name');
                    winRateTd.classList.add('winRate');
                    lenWinRateTd.classList.add('lenWinRate');

                    rank++;
                    rankTd.textContent = rank;

                    // Name, linked when the row provides a usable URL.
                    const hasLink = row['link'] && row['link'].trim() !== '';
                    const nameLink = hasLink ? createExternalLink(row['name'], row['link']) : null;
                    if (nameLink) {
                        nameTd.appendChild(nameLink);
                    } else {
                        nameTd.textContent = row['name'];
                    }

                    // Optional link to the sample outputs, shown as a document emoji after the name.
                    if (row['samples'] && row['samples'].trim() !== '') {
                        // Leading space separates the emoji from the name.
                        const samplesLink = createExternalLink(" 📄", row['samples']);
                        if (samplesLink) {
                            samplesLink.style.textDecoration = "none";
                            nameTd.appendChild(samplesLink);
                        }
                    }

                    winRateTd.textContent = formatPercent(row['win_rate']);
                    lenWinRateTd.textContent = formatPercent(row['length_controlled_winrate']);

                    tr.appendChild(rankTd);
                    tr.appendChild(nameTd);
                    tr.appendChild(lenWinRateTd);
                    tr.appendChild(winRateTd);

                    table.appendChild(tr);
                });
            },
            error: function (err) {
                if (requestId !== latestTableRequest) {
                    return;
                }
                console.error('Failed to load leaderboard data from ' + url, err);
            }
        });
    }

    /**
     * Show the baseline / auto-annotator caption that belongs to a leaderboard version.
     *
     * @param {string} version - 'alpaca_eval' or 'alpaca_eval_2'. Any other value leaves the
     *     caption unchanged instead of writing the literal text "undefined" into the page.
     */
    function updateInfoMessage(version) {
        const messages = {
            'alpaca_eval_2': 'Baseline: GPT-4 Preview (11/06) &nbsp; | &nbsp; Auto-annotator: GPT-4 Preview (11/06)',
            'alpaca_eval': 'Baseline: Davinci003 &nbsp; | &nbsp; Auto-annotator: GPT-4',
        };
        if (!Object.prototype.hasOwnProperty.call(messages, version)) {
            return;
        }

        const infoElement = document.getElementById('alpaca_eval_info');
        if (!infoElement) {
            return;
        }
        // innerHTML rather than textContent: the static messages contain &nbsp; entities.
        infoElement.innerHTML = messages[version];
    }

    // Sort state of the leaderboard: the active column key and its direction.
    const sortState = {key: null, ascending: true};

    /**
     * Sort the displayed leaderboard rows by one column.
     *
     * Invoked by the inline onclick handlers of the table header cells. The column is found
     * through the header's arrow <span id="arrow-KEY">. Clicking the same column again
     * reverses the direction. The header carrying the "name" class is compared as text; every
     * other column is compared numerically (a trailing "%" is ignored), with non-numeric
     * cells such as "N/A" or missing cells placed last.
     *
     * @param {string} key - Column key as used in the header markup, e.g. 'mean_f1'.
     */
    function sortTable(key) {
        const arrow = document.getElementById('arrow-' + key);
        const headerCell = arrow ? arrow.closest('th') : null;
        if (!headerCell) {
            return;
        }
        const column = headerCell.cellIndex;
        const compareAsText = headerCell.classList.contains('name');

        sortState.ascending = sortState.key === key ? !sortState.ascending : true;
        sortState.key = key;
        const direction = sortState.ascending ? 1 : -1;

        const textOf = tr => (tr.cells[column] ? tr.cells[column].textContent.trim() : '');

        // Row 0 is the header; everything after it is data appended by updateTable().
        const dataRows = Array.from(table.rows).slice(1);
        dataRows.sort((rowA, rowB) => {
            const textA = textOf(rowA);
            const textB = textOf(rowB);
            if (compareAsText) {
                return textA.localeCompare(textB) * direction;
            }
            const numA = parseFloat(textA);
            const numB = parseFloat(textB);
            const hasA = !Number.isNaN(numA);
            const hasB = !Number.isNaN(numB);
            if (hasA && hasB) {
                return (numA - numB) * direction;
            }
            if (hasA !== hasB) {
                return hasA ? -1 : 1; // Non-numeric cells stay at the bottom in both directions.
            }
            return textA.localeCompare(textB) * direction;
        });
        // Re-appending an attached node moves it, so this reorders the rows in place.
        dataRows.forEach(tr => table.appendChild(tr));

        // Reset every header arrow, then mark the active column.
        table.querySelectorAll('span[id^="arrow-"]').forEach(span => {
            span.textContent = '↕';
        });
        arrow.textContent = sortState.ascending ? '↑' : '↓';
    }

    // Initial load: the AlpacaEval 2.0 radio is pre-checked in the markup.
    updateTable(urls['alpaca_eval_2']);

    // Version toggle: switch the data source, reload the table and refresh the caption.
    alpacaEval2Radio.addEventListener('click', function () {
        currentUrl = urls['alpaca_eval_2'];
        updateTable(currentUrl);
        updateInfoMessage('alpaca_eval_2');
    });

    alpacaEvalRadio.addEventListener('click', function () {
        currentUrl = urls['alpaca_eval'];
        updateTable(currentUrl);
        updateInfoMessage('alpaca_eval');
    });

    // Filter toggle: same data source, re-rendered with the newly selected filter.
    communityRadio.addEventListener('click', function () {
        updateTable(currentUrl);
    });

    verifiedRadio.addEventListener('click', function () {
        updateTable(currentUrl);
    });

    // minimalRadio.addEventListener('click', function () {
    //     updateTable(currentUrl);
    // });

    // Keep the caption in sync with the initially selected version. (The previous call to
    // updateCautionMessage() referenced a function that does not exist and threw on load.)
    updateInfoMessage('alpaca_eval_2');
</script>


</body>

</html>