Spaces:
Running
Running
| import os | |
| from functools import reduce | |
| from collections import defaultdict | |
| from yaml import safe_load | |
| import pandas as pd | |
| import gradio as gr | |
# Leaderboard configuration. The keys read by this file are 'settings',
# 'types' and 'settings_mapping'. The context manager makes sure the file
# handle is closed as soon as the YAML has been parsed.
with open("config.yaml", encoding="utf-8") as config_file:
    CONFIG = safe_load(config_file)

# Maps each entry of CONFIG['types'] to the label shown on its inner tab.
label_map = {'Avg':"All", "API":"Web API", "Code": "Code Function", "Customized": "Customized App"}
def _load_table(xlsx_path):
    """Read one leaderboard spreadsheet and return it ranked and formatted.

    Steps, in order:
      1. "Average" is the row-wise mean of the columns from the second one up
         to (but excluding) the last two.
         NOTE(review): presumably this skips the model-name column and two
         trailing metadata columns -- confirm against the spreadsheets.
      2. "Rank" is the descending rank of "Average" (ties share the lowest
         rank) and rows are sorted by it.
      3. "Rank", "Model" and "Average" (those that exist) are moved to the
         front; the other columns keep their relative order.
      4. Every float/int column except "Rank" is rounded to two decimals.
         A column whose maximum is <= 1 is treated as a ratio and multiplied
         by 100 first.
    """
    table = pd.read_excel(xlsx_path)
    table["Average"] = table.iloc[:, 1:-2].mean(axis=1)
    table["Rank"] = table["Average"].rank(ascending=False, method='min').astype(int)
    table = table.sort_values("Rank", ascending=True)

    all_names = table.columns.tolist()
    leading = [name for name in ("Rank", "Model", "Average") if name in all_names]
    trailing = [name for name in all_names if name not in leading]
    table = table[leading + trailing]

    for name in table.select_dtypes(include=['float', 'int']).columns:
        if name == "Rank":
            continue
        # Ratio-valued columns (max <= 1) are shown as percentages.
        values = table[name] * 100 if table[name].max() <= 1 else table[name]
        table[name] = values.round(2)
    return table


# data[setting][data_type] -> prepared DataFrame, one per spreadsheet in "data/".
data = defaultdict(dict)
for setting in CONFIG['settings']:
    for data_type in CONFIG['types']:
        workbook = os.path.join("data", f"{CONFIG['settings_mapping'][setting]}-{data_type}.xlsx")
        data[setting][data_type] = _load_table(workbook)
# Custom stylesheet handed to gr.Blocks(css=css).
# - table rules: centre header cells, keep the second column on one line with
#   horizontal scrolling, and centre every other body cell at a fixed width.
# - .filter-checkbox-group: applied to the two CheckboxGroup filters.
# - .outer-tabs / .inner-tabs: applied to the two levels of gr.Tabs.
css = """
table thead th, table thead td {
text-align: center !important;
}
table {
--cell-width-1: 250px;
}
table > tbody > tr > td:nth-child(2) > div {
overflow-x: auto;
}
.filter-checkbox-group {
max-width: max-content;
}
table > tbody > tr > td:nth-child(2) {
white-space: nowrap;
width: auto;
}
table > tbody > tr > td:not(:nth-child(2)) {
white-space: normal;
width: 100px;
text-align: center !important;
vertical-align: middle;
}
.outer-tabs {
border: 2px solid #ccc;
border-radius: 8px;
padding: 10px;
margin-bottom: 20px;
}
.outer-tabs .tab {
background-color: #e0e0e0;
border: 1px solid #bfbfbf;
border-radius: 4px 4px 0 0;
margin-right: 10px;
padding: 8px 16px;
font-weight: bold;
}
.outer-tabs .tab.active {
background-color: #ffffff;
border-bottom: 2px solid #0078d7;
}
.inner-tabs {
border: 2px solid #aaa;
border-radius: 8px;
padding: 5px;
margin-top: 10px;
}
.inner-tabs .tab {
background-color: #f5f5f5;
border: 1px solid #ccc;
border-radius: 4px 4px 0 0;
margin-right: 8px;
padding: 6px 12px;
font-size: 0.9em;
}
.inner-tabs .tab.active {
background-color: #ffffff;
border-bottom: 2px solid #0078d7;
}
"""
# Choices offered by the "Model types" filter. When all of them are selected,
# filter_data applies no model-type restriction.
MODEL_TYPES = [
    "sparse retrieval",
    "dense retrieval",
    "embedding model",
    "re-ranking model",
]

# (label, lower bound, upper bound) for each model-size bucket. Bounds are in
# millions of parameters, the unit produced by the parameter parser in
# filter_data ("M" values as-is, "B" values multiplied by 1000).
_SIZE_BUCKETS = (
    ("<100M", 0, 100),
    ("100M to 250M", 100, 250),
    ("250M to 500M", 250, 500),
    ("500M to 1B", 500, 1000),
    (">1B", 1000, 1_000_000),
)

# Label -> half-open interval (lower, upper]; the upper bound is inclusive.
NUMERIC_INTERVALS = {
    label: pd.Interval(lower, upper, closed='right')
    for label, lower, upper in _SIZE_BUCKETS
}
def _parse_params(val):
    """Convert a "Number of Parameters" cell to a count in millions.

    Strings ending in "M" are read as millions, strings ending in "B" as
    billions (multiplied by 1000), and anything else is passed to ``float``.
    Returns ``None`` for "unknown" (any letter case) and for values that
    cannot be converted.
    """
    factor = 1
    if isinstance(val, str):
        val = val.strip()
        if val.lower() == "unknown":
            return None
        if val.endswith("M"):
            val = val[:-1]
        elif val.endswith("B"):
            val, factor = val[:-1], 1000
    try:
        return float(val) * factor
    except (TypeError, ValueError, OverflowError):
        # Unparseable cells are treated as "size unknown" instead of failing.
        return None


def _in_selected_sizes(size, labels):
    """Return True when *size* (in millions) lies in any selected bucket."""
    return size is not None and any(size in NUMERIC_INTERVALS[label] for label in labels)


def _rank_and_reorder(df):
    """Return a re-ranked copy of *df* with the key columns first.

    "Rank" is recomputed from "Average" among the rows that are left
    (descending, ties share the lowest rank). Rows are sorted by it, and
    "Rank", "Model" and "Average" are moved to the front.
    """
    # Work on a copy so neither the cached table nor a filtered slice of it
    # is modified.
    df = df.copy()
    df["Rank"] = df["Average"].rank(ascending=False, method='min').astype(int)
    df = df.sort_values("Rank", ascending=True)
    cols = df.columns.tolist()
    first_cols = [col for col in ("Rank", "Model", "Average") if col in cols]
    remaining_cols = [col for col in cols if col not in first_cols]
    return df[first_cols + remaining_cols]


def filter_data(search_query, model_types, model_sizes):
    """Filter every leaderboard table and return them in display order.

    Args:
        search_query: Keywords separated by ';'. A row is kept when its
            "Model" value contains any keyword (case-insensitive). Empty,
            ``None`` or separator-only input disables the search.
        model_types: Selected "Model Type" values. An empty selection or a
            selection of all MODEL_TYPES applies no restriction.
        model_sizes: Selected NUMERIC_INTERVALS labels. An empty selection
            or a selection of all labels applies no restriction; otherwise
            rows whose size is unknown or unparseable are dropped.

    Returns:
        A list with one DataFrame per (setting, data type) pair, iterated as
        CONFIG['settings'] x CONFIG['types'].
    """
    # Only non-blank keywords count, so input such as " ; " means "no search"
    # instead of matching nothing.
    queries = [q.strip().lower() for q in (search_query or "").split(";") if q.strip()]
    restrict_types = bool(model_types) and set(model_types) != set(MODEL_TYPES)
    restrict_sizes = bool(model_sizes) and set(model_sizes) != set(NUMERIC_INTERVALS)

    outputs = []
    for setting in CONFIG['settings']:
        for data_type in CONFIG['types']:
            df = data[setting][data_type]

            if queries:
                names = df["Model"].str.lower()
                hits = names.map(
                    # Non-string cells (e.g. missing names) never match.
                    lambda name: isinstance(name, str) and any(q in name for q in queries)
                )
                # astype(bool) keeps the mask boolean even when no rows are
                # left, so it is never mistaken for a column selection.
                df = df[hits.astype(bool)]

            if restrict_types:
                df = df[df["Model Type"].isin(model_types)]

            if restrict_sizes:
                sizes = df["Number of Parameters"].map(_parse_params)
                keep = sizes.map(lambda size: _in_selected_sizes(size, model_sizes))
                df = df[keep.astype(bool)]

            outputs.append(_rank_and_reorder(df))
    return outputs
# Extra HTML handed to gr.Blocks(head=head): loads the Tailwind CSS 2.2.19
# stylesheet from the jsDelivr CDN.
head = """
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
"""
# Build the leaderboard page: intro text, filter controls, one table per
# (setting, data type) in nested tabs, and the acknowledgement/citation text.
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file -- confirm in particular which components belong inside gr.Row().
with gr.Blocks(css=css, fill_width=True, theme=gr.themes.Base(), head=head ) as demo:
    gr.Markdown("""
## Tool-Retrieval benchmark leaderboard
Welcome to the ToolRet benchmark leaderboard!
- **Search**: Enter keywords for the model name in the search box. Use a semicolon (`;`) to separate multiple keywords.
- **Model Type**: We provide a wide range of open-source models. Choose the model type(s) you're interested in.
- **Model Size**: Select the parameter count range to filter models accordingly.
**Click the Filter Data button to update the display with the filtered data.**
""")
    # Filter controls. Both checkbox groups start with every choice selected,
    # which filter_data treats as "no restriction".
    with gr.Row():
        search_box = gr.Textbox(
            label="Search Models (separate multiple keywords with ';')",
            placeholder="🔍 Enter model name..."
        )
        model_type_checkbox_group = gr.CheckboxGroup(
            label="Model types",
            choices=MODEL_TYPES,
            value=MODEL_TYPES,
            interactive=True,
            elem_classes=["filter-checkbox-group"],
            scale=3
        )
        model_size_checkbox_group = gr.CheckboxGroup(
            label="Model sizes (Parameter Count)",
            choices=list(NUMERIC_INTERVALS.keys()),
            value=list(NUMERIC_INTERVALS.keys()),
            interactive=True,
            elem_classes=["filter-checkbox-group"],
            scale=2,
        )
    submit_button = gr.Button("Filter Data")
    # One DataFrame component per (setting, data type). They are collected in
    # the same CONFIG['settings'] x CONFIG['types'] order in which filter_data
    # builds its result list, so each returned table lands in its own tab.
    output_dfs = []
    with gr.Tabs(elem_classes="outer-tabs") as result_tabs:
        for setting in CONFIG['settings']:
            with gr.Tab(label=setting):
                with gr.Tabs(elem_classes="inner-tabs") as inner_tabs:
                    for data_type in CONFIG['types']:
                        # label_map must contain every entry of CONFIG['types'].
                        with gr.Tab(label=label_map[data_type]):
                            df_component = gr.DataFrame(value=data[setting][data_type], type="pandas")
                            output_dfs.append(df_component)
    # Filtering runs only when the button is clicked and refreshes every table.
    submit_button.click(
        fn=filter_data,
        inputs=[search_box, model_type_checkbox_group, model_size_checkbox_group],
        outputs=output_dfs
    )
    gr.Markdown("""
## Acknowledgement
This work present the first diverse tool retrieval benchmark to evaluate the tool retrieval performance of a wide range of information retrieval models. We sincerely thank prior work, such as MAIR and ToolBench, which inspire this project or provide strong technique reference.
## Citation
```text
@article{ToolRetrieval,
title = {Retrieval Models Aren't Tool-Savvy: Benchmarking Tool Retrieval for Large Language Models},
author = {Zhengliang Shi, Yuhan Wang, Lingyong Yan, Pengjie Ren, Shuaiqiang Wang, Dawei Yin, Zhaochun Ren},
year = 2025,
journal = {arXiv},
}
```
This demo is created by [Gradio](https://gradio.app/)
""")
# Start the app; share=True additionally asks Gradio for a public share link.
demo.launch(share=True)