import os
from functools import reduce
from collections import defaultdict
from yaml import safe_load
import pandas as pd
import gradio as gr
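# Gradio leaderboard app for the ToolRet tool-retrieval benchmark: it reads
# per-setting / per-type result tables from Excel files and renders them as
# filterable leaderboards.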
# Load the leaderboard configuration (settings, data types, file-name mapping).
with open("config.yaml") as f:
    CONFIG = safe_load(f)
label_map = {'Avg':"All", "API":"Web API", "Code": "Code Function", "Customized": "Customized App"}
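# Pre-compute one leaderboard table per (setting, data type) pair; these serve
# as the initial values of the DataFrame components below.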
data = defaultdict(dict)
for setting in CONFIG['settings']:
    for data_type in CONFIG['types']:
        file_path = os.path.join("data", f"{CONFIG['settings_mapping'][setting]}-{data_type}.xlsx")
        df = pd.read_excel(file_path)
        df["Average"] = df.iloc[:, 1:-2].mean(axis=1)
        df["Rank"] = df["Average"].rank(ascending=False, method='min').astype(int)
        df = df.sort_values("Rank", ascending=True)
        cols = df.columns.tolist()
        first_cols = []
        if "Rank" in cols:
            first_cols.append("Rank")
        if "Model" in cols:
            first_cols.append("Model")
        if "Average" in cols:
            first_cols.append("Average")
        remaining_cols = [col for col in cols if col not in first_cols]
        df = df[first_cols + remaining_cols]
        # Numeric formatting: for numeric columns (other than "Rank"), values whose
        # maximum is <= 1 are treated as ratios (multiplied by 100, then rounded to
        # two decimals); otherwise they are rounded to two decimals directly.
        numeric_cols = df.select_dtypes(include=['float', 'int']).columns
        for col in numeric_cols:
            if col != "Rank":
                if df[col].max() <= 1:
                    df[col] = (df[col] * 100).round(2)
                else:
                    df[col] = df[col].round(2)
        data[setting][data_type] = df
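# Custom CSS: center table headers, keep the model-name column on a single
# line, and style the outer/inner tab groups.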
css = """
table thead th, table thead td {
text-align: center !important;
}
table {
--cell-width-1: 250px;
}
table > tbody > tr > td:nth-child(2) > div {
overflow-x: auto;
}
.filter-checkbox-group {
max-width: max-content;
}
table > tbody > tr > td:nth-child(2) {
white-space: nowrap;
width: auto;
}
table > tbody > tr > td:not(:nth-child(2)) {
white-space: normal;
width: 100px;
text-align: center !important;
vertical-align: middle;
}
.outer-tabs {
border: 2px solid #ccc;
border-radius: 8px;
padding: 10px;
margin-bottom: 20px;
}
.outer-tabs .tab {
background-color: #e0e0e0;
border: 1px solid #bfbfbf;
border-radius: 4px 4px 0 0;
margin-right: 10px;
padding: 8px 16px;
font-weight: bold;
}
.outer-tabs .tab.active {
background-color: #ffffff;
border-bottom: 2px solid #0078d7;
}
.inner-tabs {
border: 2px solid #aaa;
border-radius: 8px;
padding: 5px;
margin-top: 10px;
}
.inner-tabs .tab {
background-color: #f5f5f5;
border: 1px solid #ccc;
border-radius: 4px 4px 0 0;
margin-right: 8px;
padding: 6px 12px;
font-size: 0.9em;
}
.inner-tabs .tab.active {
background-color: #ffffff;
border-bottom: 2px solid #0078d7;
}
"""
MODEL_TYPES = [
    "sparse retrieval",
    "dense retrieval",
    "embedding model",
    "re-ranking model"
]
NUMERIC_INTERVALS = {
    "<100M": pd.Interval(0, 100, closed='right'),
    "100M to 250M": pd.Interval(100, 250, closed='right'),
    "250M to 500M": pd.Interval(250, 500, closed='right'),
    "500M to 1B": pd.Interval(500, 1000, closed='right'),
    ">1B": pd.Interval(1000, 1_000_000, closed='right'),
}
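# Rebuild every leaderboard table according to the current search query,
# model-type selection, and model-size selection.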
def filter_data(search_query, model_types, model_sizes):
    outputs = []
    for setting in CONFIG['settings']:
        for data_type in CONFIG['types']:
            df = data[setting][data_type].copy()
            # Keyword search: keep a row if any ';'-separated keyword appears in the model name.
            if search_query:
                queries = [q.strip().lower() for q in search_query.split(";") if q.strip()]
                mask_search = df["Model"].str.lower().apply(lambda x: any(q in x for q in queries))
                df = df[mask_search]
            if model_types and set(model_types) != set(MODEL_TYPES):
                df = df[df["Model Type"].isin(model_types)]

            def parse_params(val):
                # Normalise the "Number of Parameters" column to millions; return None if unparseable.
                try:
                    if isinstance(val, str):
                        val = val.strip()
                        if val.lower() == "unknown":
                            return None
                        if val.endswith("M"):
                            return float(val[:-1])
                        elif val.endswith("B"):
                            return float(val[:-1]) * 1000
                        else:
                            return float(val)
                    else:
                        return float(val)
                except (TypeError, ValueError):
                    return None

            df["params_numeric"] = df["Number of Parameters"].apply(parse_params)
            if model_sizes and set(model_sizes) != set(NUMERIC_INTERVALS.keys()):
                mask_size = df["params_numeric"].apply(
                    lambda x: any(x is not None and x in NUMERIC_INTERVALS[label] for label in model_sizes)
                )
                df = df[mask_size]
            if "params_numeric" in df.columns:
                df = df.drop(columns=["params_numeric"])
            # Re-rank and re-order columns on the filtered subset.
            df["Rank"] = df["Average"].rank(ascending=False, method='min').astype(int)
            df = df.sort_values("Rank", ascending=True)
            cols = df.columns.tolist()
            first_cols = []
            if "Rank" in cols:
                first_cols.append("Rank")
            if "Model" in cols:
                first_cols.append("Model")
            if "Average" in cols:
                first_cols.append("Average")
            remaining_cols = [col for col in cols if col not in first_cols]
            df = df[first_cols + remaining_cols]
            outputs.append(df)
    return outputs
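# Extra markup injected into the page <head>: Tailwind CSS loaded from a CDN.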
head = """
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
"""
with gr.Blocks(css=css, fill_width=True, theme=gr.themes.Base(), head=head) as demo:
    gr.Markdown("""
## Tool Retrieval Benchmark Leaderboard
Welcome to the ToolRet benchmark leaderboard!
- **Search**: Enter keywords for the model name in the search box. Use a semicolon (`;`) to separate multiple keywords.
- **Model Type**: We provide a wide range of open-source models. Choose the model type(s) you're interested in.
- **Model Size**: Select the parameter count range to filter models accordingly.
**Click the Filter Data button to update the display with the filtered data.**
""")
    with gr.Row():
        search_box = gr.Textbox(
            label="Search Models (separate multiple keywords with ';')",
            placeholder="🔍 Enter model name..."
        )
        model_type_checkbox_group = gr.CheckboxGroup(
            label="Model types",
            choices=MODEL_TYPES,
            value=MODEL_TYPES,
            interactive=True,
            elem_classes=["filter-checkbox-group"],
            scale=3
        )
        model_size_checkbox_group = gr.CheckboxGroup(
            label="Model sizes (Parameter Count)",
            choices=list(NUMERIC_INTERVALS.keys()),
            value=list(NUMERIC_INTERVALS.keys()),
            interactive=True,
            elem_classes=["filter-checkbox-group"],
            scale=2,
        )
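    # "Filter Data" button and the nested tab layout; output_dfs keeps a handle
    # on every table so the click handler can refresh them all at once.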
    submit_button = gr.Button("Filter Data")
    output_dfs = []
    with gr.Tabs(elem_classes="outer-tabs") as result_tabs:
        for setting in CONFIG['settings']:
            with gr.Tab(label=setting):
                with gr.Tabs(elem_classes="inner-tabs") as inner_tabs:
                    for data_type in CONFIG['types']:
                        with gr.Tab(label=label_map[data_type]):
                            df_component = gr.DataFrame(value=data[setting][data_type], type="pandas")
                            output_dfs.append(df_component)
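    # filter_data returns the tables in the same nested-loop order in which the
    # DataFrame components were created, so outputs line up one-to-one.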
    submit_button.click(
        fn=filter_data,
        inputs=[search_box, model_type_checkbox_group, model_size_checkbox_group],
        outputs=output_dfs
    )
gr.Markdown("""
## Acknowledgement
This work present the first diverse tool retrieval benchmark to evaluate the tool retrieval performance of a wide range of information retrieval models. We sincerely thank prior work, such as MAIR and ToolBench, which inspire this project or provide strong technique reference.
## Citation
```text
@article{ToolRetrieval,
title = {Retrieval Models Aren't Tool-Savvy: Benchmarking Tool Retrieval for Large Language Models},
author = {Zhengliang Shi, Yuhan Wang, Lingyong Yan, Pengjie Ren, Shuaiqiang Wang, Dawei Yin, Zhaochun Ren},
year = 2025,
journal = {arXiv},
}
```
This demo is created by [Gradio](https://gradio.app/)
""")
demo.launch(share=True)