import os
from collections import defaultdict

from yaml import safe_load
import pandas as pd
import gradio as gr

with open("config.yaml") as f:
    CONFIG = safe_load(f)
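# config.yaml is expected to define the keys used below: 'settings' (leaderboard
# settings), 'settings_mapping' (setting name -> data file prefix), and 'types'
# (task types such as 'Avg', 'API', 'Code', 'Customized').
# Hypothetical layout for illustration only; the real names live in config.yaml:
#   settings: [<setting name>, ...]
#   settings_mapping: {<setting name>: <file prefix>, ...}
#   types: [Avg, API, Code, Customized]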
label_map = {'Avg':"All", "API":"Web API", "Code": "Code Function", "Customized": "Customized App"}
data = defaultdict(dict)
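# Pre-load one leaderboard table per (setting, type): read the Excel file, compute
# the per-model Average over the score columns, rank by Average, move
# Rank/Model/Average to the front, and format the numeric columns.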
for setting in CONFIG['settings']:
    for data_type in CONFIG['types']:
        file_path = os.path.join("data", f"{CONFIG['settings_mapping'][setting]}-{data_type}.xlsx")
        df = pd.read_excel(file_path)
        df["Average"] = df.iloc[:, 1:-2].mean(axis=1)
        df["Rank"] = df["Average"].rank(ascending=False, method='min').astype(int)
        df = df.sort_values("Rank", ascending=True)
        cols = df.columns.tolist()
        first_cols = []
        if "Rank" in cols:
            first_cols.append("Rank")
        if "Model" in cols:
            first_cols.append("Model")
        if "Average" in cols:
            first_cols.append("Average")
        remaining_cols = [col for col in cols if col not in first_cols]
        df = df[first_cols + remaining_cols]
        # Numeric formatting: for numeric columns other than "Rank", values whose maximum
        # is <= 1 are treated as ratios (multiplied by 100 and rounded to two decimals);
        # otherwise they are rounded to two decimals directly.
        numeric_cols = df.select_dtypes(include=['float', 'int']).columns
        for col in numeric_cols:
            if col != "Rank":
                if df[col].max() <= 1:
                    df[col] = (df[col] * 100).round(2)
                else:
                    df[col] = df[col].round(2)
        data[setting][data_type] = df
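# Custom CSS: center table headers, keep the Model column on a single line, and
# style the outer/inner tab containers.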
css = """
table thead th, table thead td {
text-align: center !important;
}
table {
--cell-width-1: 250px;
}
table > tbody > tr > td:nth-child(2) > div {
overflow-x: auto;
}
.filter-checkbox-group {
max-width: max-content;
}
table > tbody > tr > td:nth-child(2) {
white-space: nowrap;
width: auto;
}
table > tbody > tr > td:not(:nth-child(2)) {
white-space: normal;
width: 100px;
text-align: center !important;
vertical-align: middle;
}
.outer-tabs {
border: 2px solid #ccc;
border-radius: 8px;
padding: 10px;
margin-bottom: 20px;
}
.outer-tabs .tab {
background-color: #e0e0e0;
border: 1px solid #bfbfbf;
border-radius: 4px 4px 0 0;
margin-right: 10px;
padding: 8px 16px;
font-weight: bold;
}
.outer-tabs .tab.active {
background-color: #ffffff;
border-bottom: 2px solid #0078d7;
}
.inner-tabs {
border: 2px solid #aaa;
border-radius: 8px;
padding: 5px;
margin-top: 10px;
}
.inner-tabs .tab {
background-color: #f5f5f5;
border: 1px solid #ccc;
border-radius: 4px 4px 0 0;
margin-right: 8px;
padding: 6px 12px;
font-size: 0.9em;
}
.inner-tabs .tab.active {
background-color: #ffffff;
border-bottom: 2px solid #0078d7;
}
"""
MODEL_TYPES = [
    "sparse retrieval",
    "dense retrieval",
    "embedding model",
    "re-ranking model"
]
NUMERIC_INTERVALS = {
    "<100M": pd.Interval(0, 100, closed='right'),
    "100M to 250M": pd.Interval(100, 250, closed='right'),
    "250M to 500M": pd.Interval(250, 500, closed='right'),
    "500M to 1B": pd.Interval(500, 1000, closed='right'),
    ">1B": pd.Interval(1000, 1_000_000, closed='right'),
}
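# filter_data applies the search query, model-type, and model-size filters to every
# cached table and returns one filtered DataFrame per (setting, type) tab,
# re-ranked by Average after filtering.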
def filter_data(search_query, model_types, model_sizes):
    outputs = []
    for setting in CONFIG['settings']:
        for data_type in CONFIG['types']:
            df = data[setting][data_type].copy()
            # Keyword search: keep rows whose model name contains any ';'-separated keyword.
            if search_query:
                queries = [q.strip().lower() for q in search_query.split(";") if q.strip()]
                mask_search = df["Model"].str.lower().apply(lambda x: any(q in x for q in queries))
                df = df[mask_search]
            # Model-type filter (skipped when every type is selected).
            if model_types and set(model_types) != set(MODEL_TYPES):
                df = df[df["Model Type"].isin(model_types)]

            def parse_params(val):
                """Parse a parameter count such as '350M' or '7B' into millions; None if unknown."""
                try:
                    if isinstance(val, str):
                        val = val.strip()
                        if val.lower() == "unknown":
                            return None
                        if val.endswith("M"):
                            return float(val[:-1])
                        elif val.endswith("B"):
                            return float(val[:-1]) * 1000
                        else:
                            return float(val)
                    else:
                        return float(val)
                except (ValueError, TypeError):
                    return None

            df["params_numeric"] = df["Number of Parameters"].apply(parse_params)
            # Model-size filter (skipped when every size bucket is selected).
            if model_sizes and set(model_sizes) != set(NUMERIC_INTERVALS.keys()):
                mask_size = df["params_numeric"].apply(
                    lambda x: any(x is not None and x in NUMERIC_INTERVALS[label] for label in model_sizes)
                )
                df = df[mask_size]
            if "params_numeric" in df.columns:
                df = df.drop(columns=["params_numeric"])
            # Re-rank the remaining models and restore the column order.
            df["Rank"] = df["Average"].rank(ascending=False, method='min').astype(int)
            df = df.sort_values("Rank", ascending=True)
            cols = df.columns.tolist()
            first_cols = []
            if "Rank" in cols:
                first_cols.append("Rank")
            if "Model" in cols:
                first_cols.append("Model")
            if "Average" in cols:
                first_cols.append("Average")
            remaining_cols = [col for col in cols if col not in first_cols]
            df = df[first_cols + remaining_cols]
            outputs.append(df)
    return outputs
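# Extra <head> markup: pull in Tailwind CSS from a CDN for additional styling.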
head = """
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
"""
with gr.Blocks(css=css, fill_width=True, theme=gr.themes.Base(), head=head) as demo:
    gr.Markdown("""
    ## ToolRet Benchmark Leaderboard
    Welcome to the ToolRet benchmark leaderboard!

    - **Search**: Enter keywords for the model name in the search box. Use a semicolon (`;`) to separate multiple keywords.
    - **Model Type**: We provide a wide range of open-source models. Choose the model type(s) you're interested in.
    - **Model Size**: Select the parameter count range to filter models accordingly.

    **Click the Filter Data button to update the display with the filtered data.**
    """)
    with gr.Row():
        search_box = gr.Textbox(
            label="Search Models (separate multiple keywords with ';')",
            placeholder="🔍 Enter model name..."
        )
        model_type_checkbox_group = gr.CheckboxGroup(
            label="Model types",
            choices=MODEL_TYPES,
            value=MODEL_TYPES,
            interactive=True,
            elem_classes=["filter-checkbox-group"],
            scale=3
        )
        model_size_checkbox_group = gr.CheckboxGroup(
            label="Model sizes (Parameter Count)",
            choices=list(NUMERIC_INTERVALS.keys()),
            value=list(NUMERIC_INTERVALS.keys()),
            interactive=True,
            elem_classes=["filter-checkbox-group"],
            scale=2,
        )
    submit_button = gr.Button("Filter Data")
    output_dfs = []
    with gr.Tabs(elem_classes="outer-tabs") as result_tabs:
        for setting in CONFIG['settings']:
            with gr.Tab(label=setting):
                with gr.Tabs(elem_classes="inner-tabs") as inner_tabs:
                    for data_type in CONFIG['types']:
                        with gr.Tab(label=label_map[data_type]):
                            df_component = gr.DataFrame(value=data[setting][data_type], type="pandas")
                            output_dfs.append(df_component)
    submit_button.click(
        fn=filter_data,
        inputs=[search_box, model_type_checkbox_group, model_size_checkbox_group],
        outputs=output_dfs
    )
    gr.Markdown("""
    ## Acknowledgement
    This work presents the first diverse tool-retrieval benchmark for evaluating the tool retrieval performance of a wide range of information retrieval models. We sincerely thank prior work, such as MAIR and ToolBench, which inspired this project and provided strong technical references.
    ## Citation
    ```text
    @article{ToolRetrieval,
        title = {Retrieval Models Aren't Tool-Savvy: Benchmarking Tool Retrieval for Large Language Models},
        author = {Zhengliang Shi and Yuhan Wang and Lingyong Yan and Pengjie Ren and Shuaiqiang Wang and Dawei Yin and Zhaochun Ren},
        year = 2025,
        journal = {arXiv},
    }
    ```
    This demo is built with [Gradio](https://gradio.app/).
    """)
demo.launch(share=True)