from __future__ import annotations import zipfile from dataclasses import dataclass from pathlib import Path import gradio as gr import pandas as pd import website_texts from apscheduler.schedulers.background import BackgroundScheduler from constants import Constants, model_type_emoji from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns from website_texts import ( ABOUT_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, INTRODUCTION_TEXT, TITLE, VERSION_HISTORY_BUTTON_TEXT, ) def get_model_family(model_name: str) -> str: prefixes_mapping = { Constants.reference: ["AutoGluon"], Constants.neural_network: ["REALMLP", "TabM", "FASTAI", "MNCA", "NN_TORCH"], Constants.tree: ["GBM", "CAT", "EBM", "XGB", "XT", "RF"], Constants.foundational: ["TABDPT", "TABICL", "TABPFN"], Constants.baseline: ["KNN", "LR"], } for method_type, prefixes in prefixes_mapping.items(): for prefix in prefixes: if prefix.lower() in model_name.lower(): return method_type return Constants.other @dataclass class LBContainer: name: str base_path_to_results: str blurb: str @property def _base_path(self): return Path(__file__).parent / "data" / self.base_path_to_results def load_df_leaderboard(self) -> pd.DataFrame: df = pd.read_csv(self._base_path / "website_leaderboard.csv") df = df.rename(columns={"1#": "#"}) return df def _handle_img_zip(self, img_name: str) -> str: _base_path = self._base_path / img_name zip_path = _base_path.with_suffix(".png.zip") img_path = _base_path.with_suffix(".png") with zipfile.ZipFile(zip_path, "r") as zipf: zipf.extractall(img_path.parent) return str(img_path) def get_path_to_tuning_impact_elo(self) -> str: return self._handle_img_zip("tuning-impact-elo") def get_path_to_pareto_front_improvability_vs_time_infer(self) -> str: return self._handle_img_zip("pareto_front_improvability_vs_time_infer") def get_path_to_pareto_n_configs_imp(self) -> str: return self._handle_img_zip("pareto_n_configs_imp") def get_path_to_winrate_matrix(self) -> str: return self._handle_img_zip("winrate_matrix") def make_overview_images(lb: LBContainer, subset_name): # Main Figure gr.Image( lb.get_path_to_tuning_impact_elo(), label=f"Leaderboard Overview [{subset_name}]", show_label=True, height=500, show_share_button=True, ) with gr.Row(): with gr.Column(scale=1): gr.Image( value=lb.get_path_to_pareto_front_improvability_vs_time_infer(), label=f"Inference Time Pareto Front [{subset_name}]", height=400, show_label=True, show_share_button=True, ) with gr.Column(scale=1): gr.Image( value=lb.get_path_to_pareto_n_configs_imp(), label=f"Tuning Trajectories [{subset_name}]", height=400, show_label=True, show_share_button=True, ) def make_overview_leaderboard(lbs: [LBContainer]): # Create column per LB all_models = { m.split("[")[0].strip() for lb in lbs for m in lb.df_leaderboard[ ~lb.df_leaderboard["TypeName"].isin(["Reference Pipeline"]) ]["Model"] .unique() .tolist() } full_df = None for lb in lbs: df = lb.df_leaderboard.copy() df = df[~df["TypeName"].isin(["Reference Pipeline"])] df[lb.name] = df["Elo [⬆️]"].rank(ascending=False, method="first").astype(int) df = df.sort_values(by=lb.name, ascending=True) # Adding indicators does not work as it makes it a string and then not sort # correctly. # df[lb.name] = df[lb.name].astype(str) # df[lb.name] = df[lb.name].replace({ # "1": "πŸ₯‡ 1", # "2": "πŸ₯ˆ 2", # "3": "πŸ₯‰ 3", # } # ) df = df[["Type", "Model", lb.name]] # Remove imputed message. df["Model"] = ( df["Model"].apply(lambda x: x.split("[")[0].strip()).astype("string") ) if full_df is None: # TODO: add support in case a model did not run on the full LB. assert all_models.difference(set(df["Model"].unique())) == set() full_df = df else: df = df[["Model", lb.name]] df_models = set(df["Model"].unique()) missing_models = all_models.difference(df_models) if missing_models: missing_models_df = pd.DataFrame( [[mm, "--"] for mm in missing_models], columns=["Model", lb.name], ) df = pd.concat([df, missing_models_df], ignore_index=True) df["Model"] = df["Model"].astype("string") # Merge full_df = full_df.merge(df, how="left", on="Model", validate="1:1") medal_colors = ["#998A00", "#808080", "#8C5520"] # Highlight function def highlight_top3(col): styles = [""] * len(col) for index_i in range(len(col)): if (not isinstance(col.iloc[index_i], str)) and col.iloc[index_i] <= 3: styles[index_i] = ( f"background-color: {medal_colors[col.iloc[index_i] - 1]};" ) return styles styler = full_df.style.apply(highlight_top3, axis=0, subset=[lb.name for lb in lbs]) return gr.DataFrame( styler, pinned_columns=2, interactive=False, show_search="search", label="The ranking of all models (with imputation) across various leaderboards.", ) def make_leaderboard(lb: LBContainer) -> Leaderboard: df_leaderboard = lb.load_df_leaderboard() # -- Add filters df_leaderboard["TypeFiler"] = df_leaderboard["TypeName"].apply( lambda m: f"{m} {model_type_emoji[m]}" ) df_leaderboard["Only Default"] = df_leaderboard["Model"].str.endswith("(default)") df_leaderboard["Only Tuned"] = df_leaderboard["Model"].str.endswith("(tuned)") df_leaderboard["Only Tuned + Ensemble"] = df_leaderboard["Model"].str.endswith( "(tuned + ensemble)" ) | df_leaderboard["Model"].str.endswith("(4h)") filter_columns = [ ColumnFilter("TypeFiler", type="checkboxgroup", label="πŸ€– Model Types"), ColumnFilter("Only Default", type="boolean", default=False), ColumnFilter("Only Tuned", type="boolean", default=False), ColumnFilter("Only Tuned + Ensemble", type="boolean", default=False), ] # Add Imputed count postfix if any(df_leaderboard["Imputed"]): df_leaderboard["Imputed"] = df_leaderboard["Imputed"].replace( { True: "Imputed", False: "Not Imputed", } ) filter_columns.append( ColumnFilter( "Imputed", type="checkboxgroup", label="(Not) Imputed Models", info="We impute the performance for models that cannot run on all" " datasets due to task or dataset size constraints. We impute with" " the performance of a default RandomForest." " We add a postfix [X% IMPUTED] to the model if any results were" " imputed. The X% shows the percentage of" " datasets that were imputed. In general, imputation negatively" " represents the model performance, punishing the model for not" " being able to run on all datasets.", ) ) return Leaderboard( value=df_leaderboard, select_columns=SelectColumns( default_selection=list(df_leaderboard.columns), cant_deselect=["Type", "Model"], label="Select Columns to Display:", ), hide_columns=[ "TypeName", "TypeFiler", "RefModel", "Only Default", "Only Tuned", "Only Tuned + Ensemble", "Imputed", ], search_columns=["Model", "TypeName"], filter_columns=filter_columns, bool_checkboxgroup_label="Custom Views (exclusive, only toggle one at a time):", height=800, ) @dataclass class LBMatrixElement: imputation: str splits: str tasks: str datasets: str def get_path_to_results(self) -> str: return ( f"imputation_{self.imputation}/" f"splits_{self.splits}/" f"tasks_{self.tasks}/" f"datasets_{self.datasets}/" ) @dataclass class LBMatrix: imputation = ["no", "yes"] splits = ["all", "lite"] tasks = ["all", "classification", "regression"] datasets = ["all", "small", "medium", "tabpfn"] # TODO: get correct numbers blurb_map_n_datasets = { "all": { "all": 51, "small": 35, "medium": 16, "tabpfn": 33, }, "classification": { "all": 30, "small": 20, "medium": 10, "tabpfn": 20, }, "regression": { "all": 21, "small": 15, "medium": 6, "tabpfn": 13, }, } @staticmethod def get_name_for_lb(lb_key, lb_value): if lb_key == "imputation": return "Models (w/o imputation)" if lb_value == "no" else "Models (with imputation)" if lb_key == "splits": return "All Repeats" if lb_value == "all" else "Lite" if lb_key == "tasks": match lb_value: case "all": return "All Tasks" case "classification": return "Classification" case "regression": return "Regression" case _: raise ValueError() if lb_key == "datasets": match lb_value: case "all": return "All Datasets" case "small": return "Small" case "medium": return "Medium" case "tabpfn": return "TabPFNv2-data" case _: raise ValueError() raise ValueError() def element_to_blurb(self, element: LBMatrixElement) -> str: n_datasets = self.blurb_map_n_datasets[element.tasks][element.datasets] datasets_name = ( element.datasets if element.datasets != "tabpfn" else "TabPFNv2-compatible" ) blurb = f"Leaderboard for {n_datasets} datasets ({datasets_name} datasets, {element.tasks} tasks) " if element.splits == "lite": blurb += "for one split (1st fold, 1st repeat) " blurb += "including all " if element.imputation == "yes": blurb += "(imputed) " blurb += f"models." return blurb def main(): css = """ .markdown-text-box { padding: 4px; border-radius: 2px; } """ js_func = """ function refresh() { const url = new URL(window.location); if (url.searchParams.get('__theme') !== 'dark') { url.searchParams.set('__theme', 'dark'); window.location.href = url.href; } } """ demo = gr.Blocks(css=css, js=js_func, title="TabArena") with demo: gr.HTML(TITLE) # -- Introduction gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") with gr.Row(): with gr.Column(), gr.Accordion("πŸ“Š Datasets", open=False): gr.Markdown( website_texts.OVERVIEW_DATASETS, elem_classes="markdown-text-box" ) with gr.Column(), gr.Accordion("πŸ€– Models", open=False): gr.Markdown( website_texts.OVERVIEW_MODELS, elem_classes="markdown-text-box" ) with gr.Row(): with gr.Column(), gr.Accordion("πŸ“ˆ Metrics", open=False): gr.Markdown( website_texts.OVERVIEW_METRICS, elem_classes="markdown-text-box" ) with gr.Column(), gr.Accordion("πŸ“Š Reference Pipeline", open=False): gr.Markdown( website_texts.OVERVIEW_REF_PIPE, elem_classes="markdown-text-box" ) with gr.Row(), gr.Accordion("πŸ“ More Details", open=False): gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text-box") with gr.Row(), gr.Accordion("πŸ“™ Citation", open=False): gr.Textbox( value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=7, elem_id="citation-button", show_copy_button=True, ) # -- Get all LBs we need: # all_lbs = _get_lbs() # # -- LB Overview # gr.Markdown("## πŸ—ΊοΈ TabArena Overview") # ordered_lbs = [ # ta, # ta_clf, # ta_reg, # ta_tabicl, # ta_tabpfn, # ta_tabpfn_tabicl, # ta_lite, # ] # make_overview_leaderboard(lbs=ordered_lbs) gr.Markdown("## πŸ† TabArena Leaderboards") lb_matrix = LBMatrix() # Imputation with gr.Tabs(elem_classes="tab-buttons"): for impute_id, impute_t in enumerate(lb_matrix.imputation): impute_t_name = lb_matrix.get_name_for_lb("imputation", impute_t) with gr.TabItem( impute_t_name, elem_id="llm-benchmark-tab-table", id=impute_id ): # Splits with gr.Tabs(elem_classes="tab-buttons"): for splits_id, splits_t in enumerate(lb_matrix.splits): splits_t = lb_matrix.get_name_for_lb("splits", splits_t) with gr.TabItem( splits_t, elem_id="llm-benchmark-tab-table", id=f"{impute_id}_{splits_id}", ): # Tasks with gr.Tabs(elem_classes="tab-buttons"): for tasks_id, tasks_t in enumerate(lb_matrix.tasks): tasks_t_name = lb_matrix.get_name_for_lb( "tasks", tasks_t ) with gr.TabItem( tasks_t_name, elem_id="llm-benchmark-tab-table", id=f"{impute_id}_{splits_id}_{tasks_id}", ): # Datasets with gr.Tabs(elem_classes="tab-buttons"): for ( datasets_id, datasets_t, ) in enumerate(lb_matrix.datasets): datasets_t_name = ( lb_matrix.get_name_for_lb( "datasets", datasets_t ) ) with gr.TabItem( datasets_t_name, elem_id="llm-benchmark-tab-table", id=f"{impute_id}_{splits_id}_{tasks_id}_{datasets_id}", ): # Load LB lb_element = LBMatrixElement( imputation=lb_matrix.imputation[ impute_id ], splits=lb_matrix.splits[ splits_id ], tasks=lb_matrix.tasks[ tasks_id ], datasets=lb_matrix.datasets[ datasets_id ], ) lb = LBContainer( name=f"{impute_t_name} | {splits_t} | {tasks_t_name} | {datasets_t_name}", base_path_to_results=lb_element.get_path_to_results(), blurb=lb_matrix.element_to_blurb( lb_element ), ) gr.Markdown( lb.blurb, elem_classes="markdown-text", ) make_overview_images( lb, subset_name=lb.name ) make_leaderboard(lb) gr.Image( lb.get_path_to_winrate_matrix(), label=f"Winmatrix Overview [{lb.name}]", show_label=True, height=800, show_share_button=True, ) with gr.Row(), gr.Accordion("πŸ“‚ Version History", open=False): gr.Markdown(VERSION_HISTORY_BUTTON_TEXT, elem_classes="markdown-text") scheduler = BackgroundScheduler() # scheduler.add_job(restart_space, "interval", seconds=1800) scheduler.start() demo.queue(default_concurrency_limit=40).launch() demo.launch() if __name__ == "__main__": main()