import os
from functools import reduce
from collections import defaultdict

from yaml import safe_load
import pandas as pd
import gradio as gr

# Load the leaderboard configuration (evaluation settings, data types, and the
# setting -> file-name mapping). Use a context manager so the handle is closed.
with open("config.yaml", encoding="utf-8") as _cfg_file:
    CONFIG = safe_load(_cfg_file)

# Display labels for the inner (data-type) tabs.
label_map = {'Avg': "All", "API": "Web API", "Code": "Code Function", "Customized": "Customized App"}


def _reorder_columns(df):
    """Return *df* with Rank, Model and Average (those that exist) moved to the front.

    Shared by the initial table build and by `filter_data` so both stay consistent.
    """
    cols = df.columns.tolist()
    first_cols = [c for c in ("Rank", "Model", "Average") if c in cols]
    remaining_cols = [c for c in cols if c not in first_cols]
    return df[first_cols + remaining_cols]


def _parse_params(val):
    """Parse a parameter-count cell into a float number of millions.

    Accepts strings like "137M" / "1.3B" / "unknown" as well as plain numbers.
    Returns None when the value is "unknown" or cannot be parsed.
    """
    try:
        if isinstance(val, str):
            val = val.strip()
            if val.lower() == "unknown":
                return None
            if val.endswith("M"):
                return float(val[:-1])
            if val.endswith("B"):
                return float(val[:-1]) * 1000
            return float(val)
        return float(val)
    except (TypeError, ValueError):
        return None


# Pre-load every (setting, data_type) leaderboard table from its Excel file.
data = defaultdict(dict)
for setting in CONFIG['settings']:
    for data_type in CONFIG['types']:
        file_path = os.path.join("data", f"{CONFIG['settings_mapping'][setting]}-{data_type}.xlsx")
        df = pd.read_excel(file_path)
        # Average over the metric columns: skip the first column (Model) and
        # the last two metadata columns (model type / parameter count).
        df["Average"] = df.iloc[:, 1:-2].mean(axis=1)
        df["Rank"] = df["Average"].rank(ascending=False, method='min').astype(int)
        df = df.sort_values("Rank", ascending=True)
        df = _reorder_columns(df)
        # Numeric formatting: for numeric columns (except Rank), if the maximum
        # value is <= 1 the column is treated as ratio data (multiplied by 100
        # and rounded to two decimals); otherwise it is rounded to two decimals.
        numeric_cols = df.select_dtypes(include=['float', 'int']).columns
        for col in numeric_cols:
            if col != "Rank":
                if df[col].max() <= 1:
                    df[col] = (df[col] * 100).round(2)
                else:
                    df[col] = df[col].round(2)
        data[setting][data_type] = df

css = """
table thead th, table thead td {
    text-align: center !important;
}
table {
    --cell-width-1: 250px;
}
table > tbody > tr > td:nth-child(2) > div {
    overflow-x: auto;
}
.filter-checkbox-group {
    max-width: max-content;
}
table > tbody > tr > td:nth-child(2) {
    white-space: nowrap;
    width: auto;
}
table > tbody > tr > td:not(:nth-child(2)) {
    white-space: normal;
    width: 100px;
    text-align: center !important;
    vertical-align: middle;
}
.outer-tabs {
    border: 2px solid #ccc;
    border-radius: 8px;
    padding: 10px;
    margin-bottom: 20px;
}
.outer-tabs .tab {
    background-color: #e0e0e0;
    border: 1px solid #bfbfbf;
    border-radius: 4px 4px 0 0;
    margin-right: 10px;
    padding: 8px 16px;
    font-weight: bold;
}
.outer-tabs .tab.active {
    background-color: #ffffff;
    border-bottom: 2px solid #0078d7;
}
.inner-tabs {
    border: 2px solid #aaa;
    border-radius: 8px;
    padding: 5px;
    margin-top: 10px;
}
.inner-tabs .tab {
    background-color: #f5f5f5;
    border: 1px solid #ccc;
    border-radius: 4px 4px 0 0;
    margin-right: 8px;
    padding: 6px 12px;
    font-size: 0.9em;
}
.inner-tabs .tab.active {
    background-color: #ffffff;
    border-bottom: 2px solid #0078d7;
}
"""

MODEL_TYPES = [
    "sparse retrieval",
    "dense retrieval",
    "embedding model",
    "re-ranking model"
]

# Parameter-count buckets, in millions (right-closed intervals).
NUMERIC_INTERVALS = {
    "<100M": pd.Interval(0, 100, closed='right'),
    "100M to 250M": pd.Interval(100, 250, closed='right'),
    "250M to 500M": pd.Interval(250, 500, closed='right'),
    "500M to 1B": pd.Interval(500, 1000, closed='right'),
    ">1B": pd.Interval(1000, 1_000_000, closed='right'),
}


def filter_data(search_query, model_types, model_sizes):
    """Filter every leaderboard table by name keywords, model type, and size.

    Args:
        search_query: semicolon-separated, case-insensitive substrings matched
            against the "Model" column; empty string disables the filter.
        model_types: selected entries of MODEL_TYPES; selecting all of them
            (or none) disables the type filter.
        model_sizes: selected keys of NUMERIC_INTERVALS; selecting all of them
            (or none) disables the size filter.

    Returns:
        One filtered, re-ranked DataFrame per (setting, data_type) pair, in
        the same order the gr.DataFrame components were created.
    """
    outputs = []
    for setting in CONFIG['settings']:
        for data_type in CONFIG['types']:
            df = data[setting][data_type].copy()

            # Keyword search: a row survives if ANY query is a substring.
            if search_query:
                queries = [q.strip().lower() for q in search_query.split(";") if q.strip()]
                mask_search = df["Model"].str.lower().apply(lambda x: any(q in x for q in queries))
                df = df[mask_search]

            # Only filter by type when a proper subset is selected.
            if model_types and set(model_types) != set(MODEL_TYPES):
                df = df[df["Model Type"].isin(model_types)]

            # Size filter on the parsed parameter count (None never matches).
            df["params_numeric"] = df["Number of Parameters"].apply(_parse_params)
            if model_sizes and set(model_sizes) != set(NUMERIC_INTERVALS.keys()):
                mask_size = df["params_numeric"].apply(
                    lambda x: any(x is not None and x in NUMERIC_INTERVALS[label] for label in model_sizes)
                )
                df = df[mask_size]
            # The helper column was just added unconditionally, so drop it directly.
            df = df.drop(columns=["params_numeric"])

            # Re-rank the surviving rows so the displayed ranks stay contiguous.
            df["Rank"] = df["Average"].rank(ascending=False, method='min').astype(int)
            df = df.sort_values("Rank", ascending=True)
            df = _reorder_columns(df)
            outputs.append(df)
    return outputs


head = """
"""

with gr.Blocks(css=css, fill_width=True, theme=gr.themes.Base(), head=head) as demo:
    gr.Markdown("""
    ## Tool-Retrieval benchmark leaderboard

    Welcome to the ToolRet benchmark leaderboard!

    - **Search**: Enter keywords for the model name in the search box. Use a semicolon (`;`) to separate multiple keywords.
    - **Model Type**: We provide a wide range of open-source models. Choose the model type(s) you're interested in.
    - **Model Size**: Select the parameter count range to filter models accordingly.

    **Click the Filter Data button to update the display with the filtered data.**
    """)

    with gr.Row():
        search_box = gr.Textbox(
            label="Search Models (separate multiple keywords with ';')",
            placeholder="🔍 Enter model name..."
        )
        model_type_checkbox_group = gr.CheckboxGroup(
            label="Model types",
            choices=MODEL_TYPES,
            value=MODEL_TYPES,
            interactive=True,
            elem_classes=["filter-checkbox-group"],
            scale=3
        )
        model_size_checkbox_group = gr.CheckboxGroup(
            label="Model sizes (Parameter Count)",
            choices=list(NUMERIC_INTERVALS.keys()),
            value=list(NUMERIC_INTERVALS.keys()),
            interactive=True,
            elem_classes=["filter-checkbox-group"],
            scale=2,
        )

    submit_button = gr.Button("Filter Data")

    # One gr.DataFrame per (setting, data_type); order must match filter_data's output.
    output_dfs = []
    with gr.Tabs(elem_classes="outer-tabs") as result_tabs:
        for setting in CONFIG['settings']:
            with gr.Tab(label=setting):
                with gr.Tabs(elem_classes="inner-tabs") as inner_tabs:
                    for data_type in CONFIG['types']:
                        with gr.Tab(label=label_map[data_type]):
                            df_component = gr.DataFrame(value=data[setting][data_type], type="pandas")
                            output_dfs.append(df_component)

    submit_button.click(
        fn=filter_data,
        inputs=[search_box, model_type_checkbox_group, model_size_checkbox_group],
        outputs=output_dfs
    )

    gr.Markdown("""
    ## Acknowledgement

    This work present the first diverse tool retrieval benchmark to evaluate the tool retrieval performance of a wide range of information retrieval models. We sincerely thank prior work, such as MAIR and ToolBench, which inspire this project or provide strong technique reference.

    ## Citation

    ```text
    @article{ToolRetrieval,
      title = {Retrieval Models Aren't Tool-Savvy: Benchmarking Tool Retrieval for Large Language Models},
      author = {Zhengliang Shi, Yuhan Wang, Lingyong Yan, Pengjie Ren, Shuaiqiang Wang, Dawei Yin, Zhaochun Ren},
      year = 2025,
      journal = {arXiv},
    }
    ```

    This demo is created by [Gradio](https://gradio.app/)
    """)

demo.launch(share=True)