File size: 8,913 Bytes
dffc8ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4ecde5
dffc8ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4ecde5
 
 
 
5fec7f9
f4ecde5
dffc8ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
import os
from functools import reduce
from collections import defaultdict
from yaml import safe_load

import pandas as pd
import gradio as gr

# Load the leaderboard configuration. A context manager closes the file handle
# deterministically instead of leaving it open until garbage collection, and
# the explicit encoding keeps parsing independent of the platform locale.
with open("config.yaml", encoding="utf-8") as config_file:
    CONFIG = safe_load(config_file)

# Display labels for the data-type tabs, keyed by the entries of CONFIG['types'].
label_map = {'Avg': "All", "API": "Web API", "Code": "Code Function", "Customized": "Customized App"}

# data[setting][data_type] -> prepared leaderboard table (pandas DataFrame).
data = defaultdict(dict)
for setting in CONFIG['settings']:
    for data_type in CONFIG['types']:
        # One spreadsheet per (setting, data type) pair, stored under ./data.
        file_path = os.path.join("data", f"{CONFIG['settings_mapping'][setting]}-{data_type}.xlsx")
        df = pd.read_excel(file_path)

        # Row-wise mean over every column except the first one and the last two.
        # NOTE(review): presumably the first column is the model name and the
        # last two are metadata columns -- confirm against the spreadsheets.
        df["Average"] = df.iloc[:, 1:-2].mean(axis=1)

        # Rank 1 is the highest average; tied rows share the smallest rank.
        df["Rank"] = df["Average"].rank(ascending=False, method='min').astype(int)
        df = df.sort_values("Rank", ascending=True)

        # Show Rank, Model and Average first (when present), then the
        # remaining columns in their original order.
        cols = df.columns.tolist()
        first_cols = [col for col in ("Rank", "Model", "Average") if col in cols]
        remaining_cols = [col for col in cols if col not in first_cols]
        df = df[first_cols + remaining_cols]

        # Number formatting: for every numeric column except Rank, a column
        # whose maximum is <= 1 is treated as a ratio (multiplied by 100, then
        # rounded to two decimals); any other column is just rounded to two
        # decimals.
        numeric_cols = df.select_dtypes(include=['float', 'int']).columns
        for col in numeric_cols:
            if col == "Rank":
                continue
            if df[col].max() <= 1:
                df[col] = (df[col] * 100).round(2)
            else:
                df[col] = df[col].round(2)

        data[setting][data_type] = df

# Custom stylesheet handed to gr.Blocks(css=...). It centers table headers and
# all body cells except the second column, keeps the second column on a single
# line with horizontal scrolling, and draws borders/backgrounds for the
# elements tagged with the "outer-tabs" and "inner-tabs" classes.
css = """
table thead th, table thead td {
    text-align: center !important;
}
table {
    --cell-width-1: 250px;
}
table > tbody > tr > td:nth-child(2) > div {
    overflow-x: auto;
}
.filter-checkbox-group {
    max-width: max-content;
}
table > tbody > tr > td:nth-child(2) {
    white-space: nowrap;
    width: auto;
}
table > tbody > tr > td:not(:nth-child(2)) {
    white-space: normal;
    width: 100px;
    text-align: center !important;
    vertical-align: middle;
}

.outer-tabs {
    border: 2px solid #ccc;
    border-radius: 8px;
    padding: 10px;
    margin-bottom: 20px;
}
.outer-tabs .tab {
    background-color: #e0e0e0;
    border: 1px solid #bfbfbf;
    border-radius: 4px 4px 0 0;
    margin-right: 10px;
    padding: 8px 16px;
    font-weight: bold;
}
.outer-tabs .tab.active {
    background-color: #ffffff;
    border-bottom: 2px solid #0078d7;
}

.inner-tabs {
    border: 2px solid #aaa;
    border-radius: 8px;
    padding: 5px;
    margin-top: 10px;
}
.inner-tabs .tab {
    background-color: #f5f5f5;
    border: 1px solid #ccc;
    border-radius: 4px 4px 0 0;
    margin-right: 8px;
    padding: 6px 12px;
    font-size: 0.9em;
}
.inner-tabs .tab.active {
    background-color: #ffffff;
    border-bottom: 2px solid #0078d7;
}
"""

# Model categories offered by the "Model types" checkbox group. filter_data
# matches the selected entries against the "Model Type" column, and treats a
# selection equal to this full list as "no type filter".
MODEL_TYPES = [
    "sparse retrieval",
    "dense retrieval",
    "embedding model",
    "re-ranking model"
]

# (label, lower bound, upper bound) for each model-size bucket. Bounds are in
# millions of parameters, the unit filter_data converts parameter counts to.
_SIZE_BUCKET_BOUNDS = (
    ("<100M", 0, 100),
    ("100M to 250M", 100, 250),
    ("250M to 500M", 250, 500),
    ("500M to 1B", 500, 1000),
    (">1B", 1000, 1_000_000),
)

# Checkbox label -> half-open interval (lower, upper]. Because every interval
# is closed on the right only, a boundary value such as 100 belongs to the
# bucket that ends there, and 0 belongs to no bucket.
NUMERIC_INTERVALS = {
    label: pd.Interval(lower, upper, closed='right')
    for label, lower, upper in _SIZE_BUCKET_BOUNDS
}

def _parse_param_count(val):
    """Convert a "Number of Parameters" cell to millions of parameters.

    Strings may carry an ``M`` (millions) or ``B`` (billions) suffix in either
    case; plain numbers are taken as already being in millions. Returns
    ``None`` for ``"unknown"`` and for anything that cannot be parsed.
    """
    multiplier = 1.0
    if isinstance(val, str):
        text = val.strip()
        if text.lower() == "unknown":
            return None
        suffix = text[-1:].upper()
        if suffix == "M":
            text = text[:-1]
        elif suffix == "B":
            text, multiplier = text[:-1], 1000.0
        val = text
    try:
        return float(val) * multiplier
    except (TypeError, ValueError, OverflowError):
        # Unparseable cell (None, empty string, free text, ...): no known size.
        return None


def _matches_size(val, intervals):
    """Return True when the parsed parameter count lies in any of *intervals*."""
    size = _parse_param_count(val)
    return size is not None and any(size in interval for interval in intervals)


def filter_data(search_query, model_types, model_sizes):
    """Filter every leaderboard table and re-rank the remaining rows.

    Args:
        search_query: Keywords separated by ``;``. A row is kept when its
            "Model" value contains any keyword (case-insensitive). A query
            with no actual keywords (empty, blank, only separators) applies
            no search filter.
        model_types: Selected model types. An empty selection, or one equal
            to all of ``MODEL_TYPES``, applies no type filter.
        model_sizes: Selected ``NUMERIC_INTERVALS`` labels. An empty
            selection, or one covering every label, applies no size filter;
            otherwise rows with an unknown parameter count are dropped.

    Returns:
        A list with one DataFrame per (setting, data type) pair, in the nested
        order of ``CONFIG['settings']`` and ``CONFIG['types']``. Each frame
        has "Rank" recomputed over the surviving rows and starts with the
        Rank, Model and Average columns. The cached frames in ``data`` are
        not modified.
    """
    # All of the following depend only on the arguments, so compute them once
    # rather than once per table.
    queries = [q.strip().lower() for q in (search_query or "").split(";") if q.strip()]
    filter_by_type = bool(model_types) and set(model_types) != set(MODEL_TYPES)
    filter_by_size = bool(model_sizes) and set(model_sizes) != set(NUMERIC_INTERVALS)
    wanted_intervals = [NUMERIC_INTERVALS[label] for label in model_sizes] if filter_by_size else []

    outputs = []
    for setting in CONFIG['settings']:
        for data_type in CONFIG['types']:
            source = data[setting][data_type]

            # Build one explicitly boolean row mask. An explicit bool dtype
            # matters: an empty non-bool mask would be interpreted by pandas
            # as an (empty) column selection instead of a row filter.
            keep = pd.Series(True, index=source.index, dtype=bool)

            if queries:
                # Missing model names are treated as empty strings.
                names = source["Model"].fillna("").astype(str).str.lower()
                keep = keep & pd.Series(
                    [any(q in name for q in queries) for name in names],
                    index=source.index,
                    dtype=bool,
                )

            if filter_by_type:
                keep = keep & source["Model Type"].isin(model_types)

            if filter_by_size:
                keep = keep & pd.Series(
                    [_matches_size(val, wanted_intervals) for val in source["Number of Parameters"]],
                    index=source.index,
                    dtype=bool,
                )

            # Copy so that re-ranking never writes into the cached table.
            df = source.loc[keep].copy()

            # Re-rank among the rows that survived the filters.
            df["Rank"] = df["Average"].rank(ascending=False, method='min').astype(int)
            df = df.sort_values("Rank", ascending=True)

            cols = df.columns.tolist()
            first_cols = [col for col in ("Rank", "Model", "Average") if col in cols]
            remaining_cols = [col for col in cols if col not in first_cols]
            df = df[first_cols + remaining_cols]

            outputs.append(df)
    return outputs

# Extra markup passed to gr.Blocks(head=...): pulls in the Tailwind CSS
# stylesheet from the jsDelivr CDN.
head = """
  <link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
"""


# Page layout: intro text, the three filter controls, the "Filter Data"
# button, one table per (setting, data type) pair in nested tabs, and the
# acknowledgement/citation footer.
with gr.Blocks(css=css, fill_width=True, theme=gr.themes.Base(), head=head) as demo:
    gr.Markdown("""
        ## Tool-Retrieval benchmark leaderboard 

        Welcome to the ToolRet benchmark leaderboard!

        - **Search**: Enter keywords for the model name in the search box. Use a semicolon (`;`) to separate multiple keywords.
        - **Model Type**: We provide a wide range of open-source models. Choose the model type(s) you're interested in.
        - **Model Size**: Select the parameter count range to filter models accordingly.

        **Click the Filter Data button to update the display with the filtered data.**
        """)

    with gr.Row():
        search_box = gr.Textbox(
            label="Search Models (separate multiple keywords with ';')",
            placeholder="🔍 Enter model name..."
        )
        # Both checkbox groups start with every option ticked, which
        # filter_data treats as "no filter".
        model_type_checkbox_group = gr.CheckboxGroup(
            label="Model types",
            choices=MODEL_TYPES,
            value=MODEL_TYPES,
            interactive=True,
            elem_classes=["filter-checkbox-group"],
            scale=3
        )
        model_size_checkbox_group = gr.CheckboxGroup(
            label="Model sizes (Parameter Count)",
            choices=list(NUMERIC_INTERVALS.keys()),
            value=list(NUMERIC_INTERVALS.keys()),
            interactive=True,
            elem_classes=["filter-checkbox-group"],
            scale=2,
        )

    submit_button = gr.Button("Filter Data")

    # Tables are collected in settings x types order. filter_data returns its
    # frames in the same nested order, so the click handler below maps each
    # returned frame onto the matching table.
    output_dfs = []
    with gr.Tabs(elem_classes="outer-tabs") as result_tabs:
        for setting in CONFIG['settings']:
            with gr.Tab(label=setting):
                with gr.Tabs(elem_classes="inner-tabs") as inner_tabs:
                    for data_type in CONFIG['types']:
                        # Fall back to the raw type name so that a configured
                        # type without a display label does not raise KeyError.
                        with gr.Tab(label=label_map.get(data_type, data_type)):
                            df_component = gr.DataFrame(value=data[setting][data_type], type="pandas")
                            output_dfs.append(df_component)

    submit_button.click(
        fn=filter_data,
        inputs=[search_box, model_type_checkbox_group, model_size_checkbox_group],
        outputs=output_dfs
    )

    # The BibTeX author list separates names with "and", the separator BibTeX
    # requires; a comma-separated list is not parsed as multiple authors.
    gr.Markdown("""
        ## Acknowledgement
        This work presents the first diverse tool retrieval benchmark to evaluate the tool retrieval performance of a wide range of information retrieval models.         We sincerely thank prior work, such as MAIR and ToolBench, which inspire this project or provide strong technique reference.
        
        ## Citation
        ```text
        @article{ToolRetrieval,
          title    = {Retrieval Models Aren't Tool-Savvy: Benchmarking Tool Retrieval for Large Language Models},
          author   = {Zhengliang Shi and Yuhan Wang and Lingyong Yan and Pengjie Ren and Shuaiqiang Wang and Dawei Yin and Zhaochun Ren},
          year     = 2025,
          journal  = {arXiv},
        }
        ```
        This demo is created by [Gradio](https://gradio.app/)
        """)

# Start the web server; share=True additionally requests a public share link.
demo.launch(share=True)