import gradio as gr
import pandas as pd


llm_judge_filename = "llm_judge_results.jsonl"
response_generation_filename = "report_generation.jsonl"
response_generation_w_docs_filename = "report_generation_w_docs.jsonl"


def load_filename_into_df(filename):
    df = pd.read_json(filename, lines=True)
    return df
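

# Note: each results file is assumed to hold one JSON object per line, one row per
# model, with at least the columns used below ("Model", "Category", "Overall") plus
# per-domain scores and token/cost statistics. Hypothetical row, for illustration:
#   {"Model": "example-model", "Category": "Open-weight Instruct", "Overall": 50.0}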


# Category label -> row color, used to shade the "Category" column in the tables.
color_map = {
    "Closed-source Instruct": "#4492F7",
    "Open-weight Instruct": "#0856f1",
    "Closed-source Reasoning": "#fac05d",
    "Open-weight Reasoning": "#f59c03",
}

CAPTION_V2 = f"""**ProfBench**: Human-annotated rubrics on addressing professional tasks across PhD STEM (Chemistry, Physics) and MBA Services (Finance, Consulting) domains. \n
[Blog](https://huggingface.co/blog/nvidia/profbench) | [Paper](https://arxiv.org/abs/2510.18941) | [Data](https://huggingface.co/datasets/nvidia/ProfBench) | [Code](https://github.com/NVlabs/ProfBench)\n
Want to see your favorite models added? Run it with our code, send us the scores, or ping us to run it for you!"""


def color_model_type_column(df, color_map):
    """
    Apply color to the 'Category' column of the DataFrame based on a given color mapping.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the 'Category' column.
        color_map (dict): A dictionary mapping category labels to colors.

    Returns:
        Styler: The styled DataFrame.
    """

    def apply_color(val):
        color = color_map.get(val)
        return f"background-color: {color}" if color else ""

    # One decimal place for score columns by default; counts render as integers and
    # cost with two decimal places. The unnamed key "" is the rank column added in
    # regex_table below.
    format_dict = {col: "{:.1f}" for col in df.columns if col not in ["Model", "Category", "Input Tokens", "Output Tokens", "Cost"]}
    format_dict["Response Characters"] = "{:d}"
    format_dict["Input Tokens"] = "{:d}"
    format_dict["Output Tokens"] = "{:d}"
    format_dict[""] = "{:d}"
    format_dict["Cost"] = "{:.2f}"

    return df.style.map(apply_color, subset=["Category"]).format(format_dict, na_rep="")


def regex_table(dataframe, regex, filter_button, style=True):
    """
    Filter rows to models matching any of the comma-separated regexes, restricted
    to the categories selected in filter_button, then rank by the 'Overall' score.
    """
    regex_list = [x.strip() for x in regex.split(",")]
    combined_regex = "|".join(regex_list)

    # Drop rows whose category mentions a family that is not selected.
    if isinstance(filter_button, (list, str)):
        if "Open-weight" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Open-weight", case=False, na=False)]
        if "Closed-source" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Closed-source", case=False, na=False)]
        if "Reasoning" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Reasoning", case=False, na=False)]
        if "Instruct" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Instruct", case=False, na=False)]

    data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]

    data = data.sort_values(by="Overall", ascending=False)
    data.reset_index(drop=True, inplace=True)

    # Prepend a 1-indexed rank column; its header is intentionally empty.
    data.insert(0, "", range(1, 1 + len(data)))

    if style:
        data = color_model_type_column(data, color_map)

    return data
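

# Example usage (hypothetical model names, for illustration): keep only rows whose
# model name matches "llama" or "gpt", across all four categories:
#   regex_table(df, "llama, gpt", ["Open-weight", "Closed-source", "Reasoning", "Instruct"])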

theme = gr.themes.Default(primary_hue="blue")

with gr.Blocks(theme=theme) as app:
    with gr.Row():
        with gr.Column(scale=6):
            gr.Markdown(CAPTION_V2)

    with gr.Tabs(elem_id="outer-tabs", elem_classes="tabs-big") as tabs_big:
        with gr.TabItem("Report Generation"):
            with gr.Row():
                with gr.Column(scale=7):
                    gr.Markdown("Report Generation Leaderboard")

            with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                with gr.TabItem("Leaderboard"):
                    with gr.Row():
                        search_1 = gr.Textbox(
                            label="Model Search (delimit with , )",
                            placeholder="Model Search (delimit with , )",
                            show_label=False,
                            scale=8,
                        )
                        model_types_1 = gr.CheckboxGroup(
                            ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            show_label=False,
                            scale=8,
                        )

                    with gr.Row():
                        # Column layout: rank ("number"), "Model" (markdown), "Category"
                        # ("str"), then the numeric score/statistics columns.
                        col_types_response_generation = ["number"] + ["markdown"] + ["str"] + ["number"] * 12
                        df_response_generation = load_filename_into_df(response_generation_filename)

                        # Hidden, unstyled copy of the full table: the stable source that
                        # regex_table re-filters whenever the search box or checkboxes change.
                        rewardbench_table_hidden = gr.Dataframe(
                            df_response_generation.values,
                            datatype=col_types_response_generation,
                            headers=df_response_generation.columns.tolist(),
                            visible=False,
                        )

                        rewardbench_table = gr.Dataframe(
                            regex_table(
                                df_response_generation.copy(),
                                "",
                                ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            ),
                            datatype=col_types_response_generation,
                            headers=df_response_generation.columns.tolist(),
                            elem_id="response_generation_dataframe",
                            row_count=(25, "dynamic"),
                        )

        with gr.TabItem("LLM Judge"):
            with gr.Row():
                gr.Markdown("LLM Judge Leaderboard")
            with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                with gr.TabItem("Leaderboard"):
                    with gr.Row():
                        search_1_v1 = gr.Textbox(
                            label="Model Search (delimit with , )",
                            placeholder="Model Search (delimit with , )",
                            show_label=False,
                        )
                        model_types_1_v1 = gr.CheckboxGroup(
                            ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            label="Model Types",
                            show_label=False,
                        )

                    with gr.Row():
                        # Same layout as above, with 16 numeric columns for the judge metrics.
                        col_types_llm_judge = ["number"] + ["markdown"] + ["str"] + ["number"] * 16
                        df_llm_judge = load_filename_into_df(llm_judge_filename)

                        rewardbench_table_hidden_v1 = gr.Dataframe(
                            df_llm_judge.values,
                            datatype=col_types_llm_judge,
                            headers=df_llm_judge.columns.tolist(),
                            visible=False,
                        )

                        rewardbench_table_v1 = gr.Dataframe(
                            regex_table(
                                df_llm_judge.copy(),
                                "",
                                ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            ),
                            datatype=col_types_llm_judge,
                            headers=df_llm_judge.columns.tolist(),
                            elem_id="llm_judge_dataframe",
                            row_count=(25, "dynamic"),
                        )

        with gr.TabItem("Report Generation w Docs"):
            with gr.Row():
                with gr.Column(scale=7):
                    gr.Markdown("Report Generation Leaderboard with Grounding Documents")

            with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                with gr.TabItem("Leaderboard"):
                    with gr.Row():
                        search_1_v2 = gr.Textbox(
                            label="Model Search (delimit with , )",
                            placeholder="Model Search (delimit with , )",
                            show_label=False,
                            scale=8,
                        )
                        model_types_1_v2 = gr.CheckboxGroup(
                            ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            show_label=False,
                            scale=8,
                        )

                    with gr.Row():
                        col_types_response_generation = ["number"] + ["markdown"] + ["str"] + ["number"] * 12
                        df_response_generation_w_docs = load_filename_into_df(response_generation_w_docs_filename)

                        rewardbench_table_hidden_v2 = gr.Dataframe(
                            df_response_generation_w_docs.values,
                            datatype=col_types_response_generation,
                            headers=df_response_generation_w_docs.columns.tolist(),
                            visible=False,
                        )

                        rewardbench_table_v2 = gr.Dataframe(
                            regex_table(
                                df_response_generation_w_docs.copy(),
                                "",
                                ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            ),
                            datatype=col_types_response_generation,
                            headers=df_response_generation_w_docs.columns.tolist(),
                            elem_id="response_generation_w_docs_dataframe",
                            row_count=(25, "dynamic"),
                        )

    # Re-filter each leaderboard from its hidden, unstyled source table whenever the
    # search box or the category checkboxes change.
    search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
    search_1_v1.change(
        regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
    )
    search_1_v2.change(
        regex_table, inputs=[rewardbench_table_hidden_v2, search_1_v2, model_types_1_v2], outputs=rewardbench_table_v2
    )

    model_types_1.change(
        regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
    )
    model_types_1_v1.change(
        regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
    )
    model_types_1_v2.change(
        regex_table, inputs=[rewardbench_table_hidden_v2, search_1_v2, model_types_1_v2], outputs=rewardbench_table_v2
    )
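
    # e.g. typing "llama" (hypothetical query) with only "Open-weight" checked
    # re-renders the visible table with matching open-weight rows, ranked by "Overall".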

    with gr.Row():
        with gr.Accordion("📚 Citation and Credits", open=False):
            citation_button = gr.Textbox(
                value=r"""@misc{wang2025profbenchmultidomainrubricsrequiring,
    title={ProfBench: Multi-Domain Rubrics requiring Professional Knowledge to Answer and Judge},
    author={Zhilin Wang and Jaehun Jung and Ximing Lu and Shizhe Diao and Ellie Evans and Jiaqi Zeng and Pavlo Molchanov and Yejin Choi and Jan Kautz and Yi Dong},
    year={2025},
    eprint={2510.18941},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2510.18941},
}""",
                lines=10,
                label="If you find the results helpful, please cite the following.",
                elem_id="citation-button",
                show_copy_button=True,
            )
            gr.Textbox("Leaderboard adapted from allenai/reward-bench", label="Leaderboard credits")


app.launch()