Spaces:

Fancy-yousa
/

FeatureSelect-Methods-Leaderboard

Sleeping

App Files Files Community

FeatureSelect-Methods-Leaderboard / Webapp /templates /indexa.html

Fancy-yousa

Upload 17 files

0f332f8 verified 4 months ago

raw

history blame contribute delete

16 kB

	<!DOCTYPE html>
	<html lang="en">

	<head>
	<!-- Declare the encoding explicitly: this page embeds non-ASCII text
	     (CJK placeholder copy and an emoji link label created by the script). -->
	<meta charset="utf-8">
	<meta name="viewport" content="width=device-width, initial-scale=1">
	<title>FeatureSelect Leaderboard</title>

	<!-- Google tag (gtag.js) -->
	<!-- <script async src="https://www.googletagmanager.com/gtag/js?id=G-VWV023WWP4"></script> -->
	<!-- <script>
	window.dataLayer = window.dataLayer \|\| [];

	function gtag() {
	dataLayer.push(arguments);
	}

	gtag('js', new Date());

	gtag('config', 'G-VWV023WWP4');
	</script> -->

	<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css">
	<link rel="icon" href="https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/AlpacaFarm_small.png">
	<link href="https://cdn.jsdelivr.net/css-toggle-switch/latest/toggle-switch.css" rel="stylesheet"/>

	<style>
	/* Page-wide defaults: font, outer padding, black text on a white background. */
	body {
	font-family: Arial, sans-serif;
	margin: 0;
	padding: 50px 20px;
	background-color: #FFFFFF;
	color: #000000;
	}

	/* Centered content column, capped at the same 700px as the table below. */
	.container {
	max-width: 700px;
	margin: auto;
	}

	/* Title block at the top of the page. */
	#branding {
	text-align: center;
	margin-bottom: 20px;
	}

	#branding h1 {
	margin: 0;
	font-size: 2em;
	}

	/* All h2 elements (subtitle and documentation headings) share this grey style. */
	h2 {
	margin: 0;
	font-size: 1.2em;
	color: #777;
	}

	/* Leaderboard table: fixed layout, so column widths come from the width rules below. */
	table {
	max-width: 700px;
	width: 100%;
	table-layout: fixed;
	margin: auto;
	font-size: 1em;
	}

	table th,
	table td {
	padding: 6px;
	word-wrap: break-word;
	vertical-align: middle;
	}

	/* Rule under the header row. */
	table th {
	border-bottom: 2px solid #000;
	}

	/* Rank column: narrow and left-aligned. */
	th.rank, td.rank {
	width: 9%; /* Adjust as needed */
	padding-left: 10px; /* Small margin */
	text-align: left;
	}

	/* Name column: takes most of the width, left-aligned. */
	th.name, td.name {
	width: 55%;
	padding-left: 30px;
	text-align: left;
	}

	/* Every column other than rank and name is right-aligned. */
	th:not(.rank):not(.name),
	td:not(.rank):not(.name) {
	text-align: right;
	padding-right: 10px;
	}

	/* NOTE(review): the header row in this file has three .winRate cells (17% each)
	   in addition to the 9% rank and 55% name columns, which adds up to more than
	   100%; .lenWinRate has no explicit width. Confirm the intended column widths. */
	th.winRate, td.winRate {
	width: 17%;
	padding-right: 30px;
	}

	/* Default header alignment; the more specific .rank/.name rules above still win. */
	th {
	text-align: right;
	padding-bottom: 15px;
	}

	td {
	padding-bottom: 10px;
	}

	/* Secondary columns (win rate and rank) are rendered in grey. */
	#leaderboard tr th.winRate,
	#leaderboard tr td.winRate {
	color: #999999;
	}

	#leaderboard tr th.rank,
	#leaderboard tr td.rank {
	color: #999999;
	}

	/* Zebra striping for table rows. */
	table tr:nth-child(even) {
	background-color: #E8E8E8;
	}

	table tr:nth-child(odd) {
	background-color: #F8F8F8;
	}

	/* Radio-based toggle switches (Version / Filter selectors). */
	.switch-toggle {
	display: inline-block;
	vertical-align: middle;
	}

	/* Label that follows each radio input acts as the visible button. */
	.switch-toggle input + label {
	padding: 2px;
	padding-left: 7px;
	padding-right: 7px;
	cursor: pointer;
	background-color: lightgrey;
	border: 1px solid transparent;
	font-size: 16px;
	}

	/* Selected option: green border and text. */
	.switch-toggle input:checked + label {
	border-color: green;
	color: green;
	}

	/* Unselected option: black text, no shadow, text not selectable. */
	.switch-toggle input:not(:checked) + label {
	color: black;
	box-shadow: none !important;
	user-select: none;
	}


	/* Row that holds the toggle groups, centered horizontally and vertically. */
	.toggle-line {
	display: flex;
	justify-content: center;
	align-items: center;
	margin-bottom: 20px;
	font-size: 17px;
	}

	.toggle-line .switch-toggle {
	margin: 0 10px;
	}
	</style>
	<!-- PapaParse supplies the global `Papa` that the inline script at the bottom of
	     this page calls (Papa.parse) to download and parse the leaderboard CSV. -->
	<script src="https://cdnjs.cloudflare.com/ajax/libs/PapaParse/5.3.0/papaparse.min.js"></script>
	</head>

	<body>
	<div class="container">
	<div id="branding">

	<h1>FeatureSelect
	<!-- <a href="https://github.com/tatsu-lab/alpaca_eval/tree/main">
	<img src="https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/AlpacaFarm_small.png"
	alt="Logo" style="height: 2em; vertical-align: middle;"></a> -->
	Leaderboard
	</h1>
	<br>
	<h2>An Automatic Evaluator for FeatureSelect Methods</h2>
	<!-- <small id="alpaca_eval_info" style="color: #777;">-->
	<!-- Baseline: GPT-4 Preview   \|   Auto-annotator: GPT-4 Preview-->
	<!-- </small>-->
	<!-- <br>-->
	<small id="caution" style="color: #8C1515;">
	<b> Length-controlled</b> (LC) win rates alleviate length biases of GPT-4, but they may favor models finetuned on its outputs.
	</small>
	<br>
	<a href="https://github.com/Fss2652530458/AutoFS">
	<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" alt="GitHub logo" style="height: 1.5em;/* margin-bottom: 0; */">
	</a>
	</div>


	<!-- Selectors -->
	<div class="toggle-line">

	Version:
	<div class="switch-toggle switch-evaluator" style="margin-right: 4em">
	<input id="alpaca_eval" name="version" type="radio"/>
	<label for="alpaca_eval" onclick="">AlpacaEval</label>
	<input id="alpaca_eval_2" name="version" type="radio" checked="checked"/>
	<label for="alpaca_eval_2" onclick="">AlpacaEval 2.0</label>
	</div>

	Filter:
	<div class="switch-toggle switch-filter">
	<input id="community" name="filter" type="radio"/>
	<label for="community" onclick="">Community</label>
	<input id="verified" name="filter" type="radio" checked="checked"/>
	<label for="verified" onclick="">Verified</label>
	<!-- <input id="minimal" name="compactness" type="radio"/>-->
	<!-- <label for="minimal" onclick="">Minimal</label>-->
	</div>



	</div>
	<!-- Baseline note (small grey text) -->
	<div class="container" style="text-align: center; margin-bottom: 10px; margin-top: -10px;">
	<small id="alpaca_eval_info" style="color: #777;">
	Baseline: GPT-4 Preview (11/06) &nbsp; | &nbsp; Auto-annotator: GPT-4 Preview (11/06)
	</small>
	</div>


	<!-- Leaderboard table. Only the header row is static; the inline script at the
	     bottom of the page appends the data rows. -->
	<!-- NOTE(review): the header cells call sortTable(...), but no sortTable function is
	     defined in this file's inline script, so clicking a header raises a
	     ReferenceError unless it is provided elsewhere. -->
	<!-- NOTE(review): this header declares six columns, while the script builds rows
	     with four cells (rank, name, lenWinRate, winRate); confirm the intended schema. -->
	<table id="leaderboard">
	<tr>
	<th class="rank">Rank</th>
	<th class="name" onclick="sortTable('algorithm')">Algorithm <span id="arrow-algorithm">↕</span></th>
	<th class="lenWinRate" onclick="sortTable('num_features')">#Features <span id="arrow-num_features">↕</span></th>
	<th class="winRate" onclick="sortTable('mean_f1')">Mean F1 <span id="arrow-mean_f1">↕</span></th>
	<th class="winRate" onclick="sortTable('mean_auc')">Mean AUC <span id="arrow-mean_auc">↕</span></th>
	<th class="winRate" onclick="sortTable('time')">Time (s) <span id="arrow-time">↕</span></th>
	</tr>
	</table>

	<!-- Documentation overview -->
	<div id="documentation">
	<div style="text-align: center;">
	<a href="https://github.com/tatsu-lab/alpaca_eval" style="display: inline-block;">
	<i class="fab fa-fw fa-github" aria-hidden="true"></i> GitHub
	</a>
	</div>
	<br>
	<h2>About AlpacaEval</h2>
	<p>
	<a href="https://github.com/tatsu-lab/alpaca_eval" target="_blank">AlpacaEval</a>
	is an LLM-based automatic evaluation that is fast, cheap, and reliable.
	It is based on the
	<a href="https://crfm.stanford.edu/2023/05/22/alpaca-farm.html">AlpacaFarm</a>
	evaluation set,
	which tests the ability of models to follow general user instructions.
	These responses are then compared to reference responses (Davinci003 for AlpacaEval, GPT-4 Preview for AlpacaEval 2.0) by
	the provided GPT-4 based auto-annotators,
	which results in the win rates presented above.
	AlpacaEval displays a high agreement rate with ground truth human annotations,
	and leaderboard rankings on AlpacaEval are very correlated with leaderboard rankings
	based on human annotators.
	Please see our
	<a href="https://github.com/tatsu-lab/alpaca_eval#analysis" target="_blank">documentation</a>
	for more details on our analysis.
	</p>
	<h2>Adding new models</h2>
	<p>
	We welcome new model contributions to the leaderboard from the community!
	To do so, please follow the steps in the
	<a href="https://github.com/tatsu-lab/alpaca_eval#contributing" target="_blank">contributions
	section</a>.
	Specifically, you'll need to run the model on the evaluation set,
	auto-annotate the outputs, and submit a PR with the model config and leaderboard results.
	We've also set up a
	<a href="https://discord.gg/GJMxJSVZZM" target="_blank">Discord</a>
	for community support and discussion.
	</p>
	<h2>Adding new evaluators or eval sets </h2>
	<p>
	We also welcome contributions for new evaluators or new eval sets!
	For making new evaluators, we release our ground-truth
	<a href="https://github.com/tatsu-lab/alpaca_eval#data-release" target="_blank">human annotations</a>
	and <a href="https://github.com/tatsu-lab/alpaca_eval#analyzing-an-evaluator" target="_blank">comparison
	metrics</a>.
	We also release a
	<a href="https://github.com/tatsu-lab/alpaca_eval#analyzing-an-eval-set" target="_blank">rough guide</a>
	to follow for making new eval sets.
	We specifically encourage contributions for harder instruction distributions and for safety testing of
	LLMs.
	</p>
	<h2>AlpacaEval limitations</h2>
	<p>
	这里是简介
	</p>
	</div>

	</div>

	<script>
	// "Version" toggle: the two radio inputs that pick which leaderboard CSV to show.
	const alpacaEvalRadio = document.querySelector('#alpaca_eval');
	const alpacaEval2Radio = document.querySelector('#alpaca_eval_2');

	// "Filter" toggle: the radio inputs that pick which rows of the CSV are listed.
	const communityRadio = document.querySelector('#community');
	const verifiedRadio = document.querySelector('#verified');

	// Table element that receives the generated leaderboard rows.
	const table = document.querySelector('#leaderboard');

	// CSV location for each leaderboard version, keyed by the id of its radio input.
	const urls = {
	  alpaca_eval: 'https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/data_AlpacaEval/alpaca_eval_gpt4_leaderboard.csv',
	  alpaca_eval_2: 'https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv'
	};

	// CSV currently on display; starts on the version whose radio is checked in the markup.
	let currentUrl = urls.alpaca_eval_2;

	// Id of the most recent updateTable() call. A CSV response compares its own id
	// against this value so that a late response from a superseded request is
	// dropped instead of appending a second set of rows.
	let latestTableRequest = 0;

	/**
	 * Format a CSV cell as a percentage with one decimal place.
	 * @param {string|undefined} value - raw cell text.
	 * @returns {string} e.g. "12.3%", or "N/A" when the cell is missing, blank or not numeric.
	 */
	function formatPercent(value) {
	  if (value === undefined || value === null || String(value).trim() === '') {
	    return 'N/A';
	  }
	  const number = Number(value);
	  return Number.isFinite(number) ? number.toFixed(1) + '%' : 'N/A';
	}

	/**
	 * Create a link that opens in a new tab, accepting only http(s) targets.
	 * The URL comes from the downloaded CSV, so other schemes (e.g. javascript:) are rejected.
	 * @param {string} text - visible link text.
	 * @param {string} href - URL taken from the CSV.
	 * @returns {HTMLAnchorElement|null} the link, or null when the URL is not usable.
	 */
	function createExternalLink(text, href) {
	  let parsed;
	  try {
	    parsed = new URL(String(href).trim(), window.location.href);
	  } catch (e) {
	    return null;
	  }
	  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
	    return null;
	  }
	  const a = document.createElement('a');
	  a.textContent = text;
	  a.href = parsed.href;
	  a.target = '_blank';
	  a.rel = 'noopener noreferrer';
	  return a;
	}

	/**
	 * Build one leaderboard <tr> (rank, name, length-controlled win rate, win rate).
	 * @param {Object} row - one parsed CSV record.
	 * @param {number} rank - 1-based position among the rows being displayed.
	 * @returns {HTMLTableRowElement}
	 */
	function buildLeaderboardRow(row, rank) {
	  const tr = document.createElement('tr');
	  const rankTd = document.createElement('td');
	  const nameTd = document.createElement('td');
	  const lenWinRateTd = document.createElement('td');
	  const winRateTd = document.createElement('td');

	  rankTd.classList.add('rank');
	  nameTd.classList.add('name');
	  lenWinRateTd.classList.add('lenWinRate');
	  winRateTd.classList.add('winRate');

	  rankTd.textContent = rank;

	  const name = row['name'] || '';
	  const hasLink = row['link'] && row['link'].trim() !== '';
	  const nameLink = hasLink ? createExternalLink(name, row['link']) : null;
	  if (nameLink) {
	    nameTd.appendChild(nameLink);
	  } else {
	    nameTd.textContent = name;
	  }

	  if (row['samples'] && row['samples'].trim() !== '') {
	    // Leading space separates the icon from the name.
	    const samplesLink = createExternalLink(' 📄', row['samples']);
	    if (samplesLink) {
	      samplesLink.style.textDecoration = 'none';
	      nameTd.appendChild(samplesLink);
	    }
	  }

	  winRateTd.textContent = formatPercent(row['win_rate']);
	  lenWinRateTd.textContent = formatPercent(row['length_controlled_winrate']);

	  // Cell order: rank, name, length-controlled win rate, win rate.
	  tr.appendChild(rankTd);
	  tr.appendChild(nameTd);
	  tr.appendChild(lenWinRateTd);
	  tr.appendChild(winRateTd);
	  return tr;
	}

	/**
	 * Reload the leaderboard table from a CSV file.
	 *
	 * Removes every row except the header, downloads and parses `url` with
	 * PapaParse, then appends one row per record that passes the currently
	 * selected filter radio ("Community" shows verified, minimal and community
	 * entries; "Verified" shows verified and minimal entries only).
	 *
	 * @param {string} url - location of the leaderboard CSV.
	 */
	function updateTable(url) {
	  const requestId = ++latestTableRequest;

	  // Keep row 0 (the header) and drop everything below it.
	  while (table.rows.length > 1) {
	    table.deleteRow(1);
	  }

	  // Fail with a clear message instead of a ReferenceError when the library is absent.
	  if (typeof Papa === 'undefined') {
	    console.error('PapaParse is not loaded; cannot load leaderboard from ' + url);
	    return;
	  }

	  Papa.parse(url, {
	    download: true,
	    header: true,
	    complete: function (results) {
	      // A newer request was started while this one was in flight; discard it.
	      if (requestId !== latestTableRequest) {
	        return;
	      }

	      const showCommunity = communityRadio.checked;
	      const showVerified = verifiedRadio.checked;
	      const fragment = document.createDocumentFragment();
	      let rank = 0;

	      results.data.forEach(function (row) {
	        // Skip records with no content (e.g. the trailing blank CSV line).
	        if (!(row['name'] || row['win_rate'] || row['length_controlled_winrate'])) {
	          return;
	        }

	        const filter = row['filter'];
	        const isVerifiedTier = filter === 'verified' || filter === 'minimal';
	        const visible =
	          (showCommunity && (isVerifiedTier || filter === 'community')) ||
	          (showVerified && isVerifiedTier);
	        if (!visible) {
	          return;
	        }

	        rank++;
	        fragment.appendChild(buildLeaderboardRow(row, rank));
	      });

	      table.appendChild(fragment);
	    },
	    error: function (err) {
	      console.error('Failed to load leaderboard from ' + url, err);
	    }
	  });
	}

	/**
	 * Show which baseline and auto-annotator belong to the selected leaderboard version.
	 * Unknown versions are ignored so the existing text is never replaced by "undefined".
	 *
	 * @param {string} version - 'alpaca_eval' or 'alpaca_eval_2'.
	 */
	function updateInfoMessage(version) {
	  // \u00a0 is a non-breaking space; it keeps the padding around the "|" separator
	  // from collapsing when rendered.
	  const messages = {
	    'alpaca_eval_2': 'Baseline: GPT-4 Preview (11/06) \u00a0 | \u00a0 Auto-annotator: GPT-4 Preview (11/06)',
	    'alpaca_eval': 'Baseline: Davinci003 \u00a0 | \u00a0 Auto-annotator: GPT-4'
	  };
	  if (!Object.prototype.hasOwnProperty.call(messages, version)) {
	    return;
	  }

	  const info = document.getElementById('alpaca_eval_info');
	  if (info) {
	    // The message is plain text, so textContent is used rather than innerHTML.
	    info.textContent = messages[version];
	  }
	}

	/**
	 * Switch the page to another leaderboard version: remember its CSV URL,
	 * reload the table and refresh the baseline/annotator note.
	 * @param {string} version - a key of `urls` ('alpaca_eval' or 'alpaca_eval_2').
	 */
	function selectVersion(version) {
	  currentUrl = urls[version];
	  updateTable(currentUrl);
	  updateInfoMessage(version);
	}

	// Listeners are attached before the first render so the toggles stay wired up
	// even if the initial load fails. 'change' fires only when the selection
	// actually changes, which avoids re-downloading the CSV when the already
	// selected option is clicked again.
	alpacaEval2Radio.addEventListener('change', function () {
	  selectVersion('alpaca_eval_2');
	});

	alpacaEvalRadio.addEventListener('change', function () {
	  selectVersion('alpaca_eval');
	});

	// The filter toggles keep the current CSV and only change which rows are shown.
	communityRadio.addEventListener('change', function () {
	  updateTable(currentUrl);
	});

	verifiedRadio.addEventListener('change', function () {
	  updateTable(currentUrl);
	});

	// Initial render. The previous trailing call to updateCautionMessage() referenced
	// a function that does not exist in this file; the info note is refreshed instead.
	updateInfoMessage('alpaca_eval_2');
	updateTable(currentUrl);
	</script>


	</body>

	</html>