Spaces:

mangopy
/

ToolRet-leaderboard

Running

App Files Files Community

ToolRet-leaderboard / app.py

mangopy

Update app.py

f4ecde5 verified 10 months ago

raw

history blame contribute delete

8.91 kB

	import os
	from functools import reduce
	from collections import defaultdict
	from yaml import safe_load

	import pandas as pd
	import gradio as gr

	# Leaderboard configuration. The keys read elsewhere in this file are
	# 'settings', 'types' and 'settings_mapping'. The file is opened in a context
	# manager so the handle is closed deterministically, and decoded as UTF-8
	# rather than with the platform's locale encoding.
	with open("config.yaml", encoding="utf-8") as config_file:
	    CONFIG = safe_load(config_file)

	# Tab captions shown in the UI for each data-type key.
	label_map = {
	    'Avg': "All",
	    "API": "Web API",
	    "Code": "Code Function",
	    "Customized": "Customized App",
	}

	# Nested lookup filled in while the spreadsheets are loaded:
	# data[setting][data_type] -> prepared leaderboard DataFrame.
	data = defaultdict(dict)
	def _build_leaderboard(file_path):
	    """Read one results spreadsheet and return it ready for display.

	    Adds an "Average" column (row-wise mean of every column except the first
	    and the last two -- presumably the model name and two metadata columns;
	    confirm against the spreadsheets), ranks rows by that average, moves
	    Rank / Model / Average to the front and rounds the numeric columns.
	    """
	    table = pd.read_excel(file_path)

	    table["Average"] = table.iloc[:, 1:-2].mean(axis=1)
	    # Highest average gets rank 1; ties share the smallest rank of their group.
	    table["Rank"] = table["Average"].rank(ascending=False, method='min').astype(int)
	    table = table.sort_values("Rank", ascending=True)

	    all_names = table.columns.tolist()
	    leading = [name for name in ("Rank", "Model", "Average") if name in all_names]
	    trailing = [name for name in all_names if name not in leading]
	    table = table[leading + trailing]

	    # Number formatting: for every numeric column except Rank, a column whose
	    # maximum is <= 1 is treated as a ratio (scaled by 100, then rounded to two
	    # decimals); any other column is simply rounded to two decimals.
	    for name in table.select_dtypes(include=['float', 'int']).columns:
	        if name == "Rank":
	            continue
	        column = table[name]
	        scaled = column * 100 if column.max() <= 1 else column
	        table[name] = scaled.round(2)

	    return table


	# Load one spreadsheet per (setting, data type) pair named in the config.
	for setting in CONFIG['settings']:
	    for data_type in CONFIG['types']:
	        workbook_name = f"{CONFIG['settings_mapping'][setting]}-{data_type}.xlsx"
	        data[setting][data_type] = _build_leaderboard(os.path.join("data", workbook_name))

	# Custom stylesheet passed to gr.Blocks(css=...) below. It centres table
	# headers and cells, keeps the second table column on a single line, limits
	# the width of the filter checkbox groups, and styles the "outer-tabs" and
	# "inner-tabs" classes that the two tab containers are tagged with.
	css = """
	table thead th, table thead td {
	text-align: center !important;
	}
	table {
	--cell-width-1: 250px;
	}
	table > tbody > tr > td:nth-child(2) > div {
	overflow-x: auto;
	}
	.filter-checkbox-group {
	max-width: max-content;
	}
	table > tbody > tr > td:nth-child(2) {
	white-space: nowrap;
	width: auto;
	}
	table > tbody > tr > td:not(:nth-child(2)) {
	white-space: normal;
	width: 100px;
	text-align: center !important;
	vertical-align: middle;
	}

	.outer-tabs {
	border: 2px solid #ccc;
	border-radius: 8px;
	padding: 10px;
	margin-bottom: 20px;
	}
	.outer-tabs .tab {
	background-color: #e0e0e0;
	border: 1px solid #bfbfbf;
	border-radius: 4px 4px 0 0;
	margin-right: 10px;
	padding: 8px 16px;
	font-weight: bold;
	}
	.outer-tabs .tab.active {
	background-color: #ffffff;
	border-bottom: 2px solid #0078d7;
	}

	.inner-tabs {
	border: 2px solid #aaa;
	border-radius: 8px;
	padding: 5px;
	margin-top: 10px;
	}
	.inner-tabs .tab {
	background-color: #f5f5f5;
	border: 1px solid #ccc;
	border-radius: 4px 4px 0 0;
	margin-right: 8px;
	padding: 6px 12px;
	font-size: 0.9em;
	}
	.inner-tabs .tab.active {
	background-color: #ffffff;
	border-bottom: 2px solid #0078d7;
	}
	"""

	# Model categories offered by the "Model types" filter; filter_data() matches
	# them against the "Model Type" column.
	MODEL_TYPES = [
	    "sparse retrieval",
	    "dense retrieval",
	    "embedding model",
	    "re-ranking model",
	]

	# (label, lower bound, upper bound) of each size bucket, in millions of
	# parameters (filter_data() converts a "B" suffix by multiplying by 1000).
	_SIZE_BUCKET_BOUNDS = (
	    ("<100M", 0, 100),
	    ("100M to 250M", 100, 250),
	    ("250M to 500M", 250, 500),
	    ("500M to 1B", 500, 1000),
	    (">1B", 1000, 1_000_000),
	)

	# Size-filter label -> half-open interval (lower, upper]; a value sitting
	# exactly on a boundary therefore belongs to the smaller bucket.
	NUMERIC_INTERVALS = {
	    label: pd.Interval(lower, upper, closed='right')
	    for label, lower, upper in _SIZE_BUCKET_BOUNDS
	}

	def _parse_param_count(val):
	    """Convert a "Number of Parameters" cell to a float in millions.

	    Strings may end in ``M`` (millions) or ``B`` (billions, multiplied by
	    1000), in either letter case. ``"unknown"`` and anything that cannot be
	    parsed as a number yield ``None``.
	    """
	    scale = 1.0
	    if isinstance(val, str):
	        val = val.strip()
	        if val.lower() == "unknown":
	            return None
	        suffix = val[-1:].upper()
	        if suffix in ("M", "B"):
	            scale = 1000.0 if suffix == "B" else 1.0
	            val = val[:-1]
	    try:
	        return float(val) * scale
	    except (TypeError, ValueError):
	        # Unparseable cell: treat it like an unknown size instead of failing.
	        return None


	def _row_mask(flags, index):
	    """Build a boolean Series over *index* from an iterable of per-row flags.

	    The dtype is forced to bool so that an empty table is still indexed with
	    a boolean mask rather than an empty object Series.
	    """
	    return pd.Series(list(flags), index=index, dtype=bool)


	def _leading_columns_first(df):
	    """Return *df* with Rank, Model and Average (when present) moved to the front."""
	    cols = df.columns.tolist()
	    first_cols = [name for name in ("Rank", "Model", "Average") if name in cols]
	    return df[first_cols + [name for name in cols if name not in first_cols]]


	def filter_data(search_query, model_types, model_sizes):
	    """Filter every leaderboard table and re-rank the rows that remain.

	    Args:
	        search_query: Keywords separated by ';'. A row is kept when its
	            "Model" value contains any keyword, case-insensitively. An empty
	            query, or one made only of blanks and separators, keeps all rows.
	        model_types: Selected "Model Type" values. An empty selection, or one
	            covering every entry of MODEL_TYPES, disables this filter.
	        model_sizes: Selected NUMERIC_INTERVALS labels. An empty selection, or
	            one covering every label, disables this filter. When the filter is
	            active, rows whose parameter count is unknown or unparseable are
	            dropped.

	    Returns:
	        A list with one DataFrame per (setting, data type) pair, iterating
	        CONFIG['settings'] and then CONFIG['types']. "Rank" is recomputed
	        from "Average" within the filtered rows. The tables stored in
	        ``data`` are left unmodified.
	    """
	    queries = [q.strip().lower() for q in (search_query or "").split(";") if q.strip()]

	    # Resolve both checkbox selections once; they are the same for every table.
	    type_filter = None
	    if model_types and set(model_types) != set(MODEL_TYPES):
	        type_filter = list(model_types)
	    size_intervals = None
	    if model_sizes and set(model_sizes) != set(NUMERIC_INTERVALS):
	        size_intervals = [NUMERIC_INTERVALS[label] for label in model_sizes]

	    outputs = []
	    for setting in CONFIG['settings']:
	        for data_type in CONFIG['types']:
	            df = data[setting][data_type]

	            # Only filter when at least one real keyword was supplied; a
	            # blank-only query would otherwise match nothing and empty the table.
	            if queries:
	                # Missing model names become "" so the substring test cannot fail.
	                names = df["Model"].fillna("").astype(str).str.lower()
	                hits = (any(q in name for q in queries) for name in names)
	                df = df[_row_mask(hits, df.index)]

	            if type_filter is not None:
	                df = df[df["Model Type"].isin(type_filter)]

	            if size_intervals is not None:
	                sizes = [_parse_param_count(v) for v in df["Number of Parameters"]]
	                fits = (
	                    size is not None and any(size in interval for interval in size_intervals)
	                    for size in sizes
	                )
	                df = df[_row_mask(fits, df.index)]

	            # Work on an independent copy before writing the new ranks, so
	            # neither the cached table nor a filtered view of it is mutated.
	            df = df.copy()
	            df["Rank"] = df["Average"].rank(ascending=False, method='min').astype(int)
	            df = df.sort_values("Rank", ascending=True)

	            outputs.append(_leading_columns_first(df))
	    return outputs

	# Extra markup passed to gr.Blocks(head=...) below: a <link> tag that loads
	# the Tailwind CSS 2.2.19 stylesheet from the jsDelivr CDN.
	head = """
	<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
	"""


	# Builds the leaderboard page: an intro text, the filter controls, one table
	# per (setting, data type) pair arranged in two levels of tabs, and the
	# acknowledgement / citation text; the app is then launched.
	# NOTE(review): indentation is flattened in this copy of the file, so the
	# nesting of the `with` bodies below (e.g. whether the button sits inside the
	# Row) has to be confirmed against the original source.
	with gr.Blocks(css=css, fill_width=True, theme=gr.themes.Base(), head=head ) as demo:
	gr.Markdown("""
	## Tool-Retrieval benchmark leaderboard

	Welcome to the ToolRet benchmark leaderboard!

	- Search: Enter keywords for the model name in the search box. Use a semicolon (`;`) to separate multiple keywords.
	- Model Type: We provide a wide range of open-source models. Choose the model type(s) you're interested in.
	- Model Size: Select the parameter count range to filter models accordingly.

	Click the Filter Data button to update the display with the filtered data.
	""")

	# Filter controls: a free-text search box plus two checkbox groups that start
	# with every option selected.
	with gr.Row():
	search_box = gr.Textbox(
	label="Search Models (separate multiple keywords with ';')",
	placeholder="🔍 Enter model name..."
	)
	model_type_checkbox_group = gr.CheckboxGroup(
	label="Model types",
	choices=MODEL_TYPES,
	value=MODEL_TYPES,
	interactive=True,
	elem_classes=["filter-checkbox-group"],
	scale=3
	)
	model_size_checkbox_group = gr.CheckboxGroup(
	label="Model sizes (Parameter Count)",
	choices=list(NUMERIC_INTERVALS.keys()),
	value=list(NUMERIC_INTERVALS.keys()),
	interactive=True,
	elem_classes=["filter-checkbox-group"],
	scale=2,
	)

	submit_button = gr.Button("Filter Data")

	# One DataFrame component per (setting, data type) pair, collected in the same
	# settings-then-types order in which filter_data() builds its result list.
	output_dfs = []
	with gr.Tabs(elem_classes="outer-tabs") as result_tabs:
	for setting in CONFIG['settings']:
	with gr.Tab(label=setting):
	with gr.Tabs(elem_classes="inner-tabs") as inner_tabs:
	for data_type in CONFIG['types']:
	with gr.Tab(label=label_map[data_type]):
	df_component = gr.DataFrame(value=data[setting][data_type], type="pandas")
	output_dfs.append(df_component)

	# A click runs filter_data() with the three control values and writes the
	# returned frames to the components listed in output_dfs.
	submit_button.click(
	fn=filter_data,
	inputs=[search_box, model_type_checkbox_group, model_size_checkbox_group],
	outputs=output_dfs
	)

	gr.Markdown("""
	## Acknowledgement
	This work present the first diverse tool retrieval benchmark to evaluate the tool retrieval performance of a wide range of information retrieval models. We sincerely thank prior work, such as MAIR and ToolBench, which inspire this project or provide strong technique reference.

	## Citation
	```text
	@article{ToolRetrieval,
	title = {Retrieval Models Aren't Tool-Savvy: Benchmarking Tool Retrieval for Large Language Models},
	author = {Zhengliang Shi, Yuhan Wang, Lingyong Yan, Pengjie Ren, Shuaiqiang Wang, Dawei Yin, Zhaochun Ren},
	year = 2025,
	journal = {arXiv},
	}
	```
	This demo is created by [Gradio](https://gradio.app/)
	""")

	# NOTE(review): launch runs unconditionally at module level and requests a
	# public share link; confirm both are intended for the hosting environment.
	demo.launch(share=True)