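"""Gradio leaderboard app for ProfBench: report-generation and LLM-judge results."""
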
import gradio as gr
import pandas as pd

###########################################
#                 Load Data               #
###########################################

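# Each file holds one JSON record per model (JSON Lines)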
llm_judge_filename = "llm_judge_results.jsonl"
response_generation_filename = "report_generation_w_docs.jsonl"

def load_filename_into_df(filename):
    df = pd.read_json(filename, lines=True)
    return df


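# Row-highlight colors for the "Category" column; keys must match the labels in the data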
color_map = {
    "Closed-source Instruct": "#B8D2F5" ,
    "Open-weight Instruct": "#6f96e5",
    "Closed-source Reasoning": "#fce8c5" ,
    "Open-weight Reasoning": "#ffcd75",
}

CAPTION_V2 = """**ProfBench**: Human-annotated rubrics for professional tasks across PhD STEM (Chemistry, Physics) and MBA Services (Finance, Consulting) domains. \n
[Blog](https://huggingface.co/blog/nvidia/profbench) | [Paper](https://arxiv.org/abs/2510.18941) | [Data](https://huggingface.co/datasets/nvidia/ProfBench) | [Code](https://github.com/NVlabs/ProfBench)\n
Want to see your favorite models added? Run it with our code, send us the scores, or ping us to run it for you!"""


def color_model_type_column(df, color_map):
    """
    Color the 'Category' column of the DataFrame based on a given color mapping,
    and apply per-column number formats.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the 'Category' column.
    color_map (dict): A dictionary mapping category labels to colors.

    Returns:
    Styler: The styled DataFrame.
    """

    # Return a CSS rule for known categories; leave unknown values unstyled
    def apply_color(val):
        color = color_map.get(val)
        return f"background-color: {color}" if color else ""

    # One decimal place for score columns; integers for counts and the rank
    # column (blank header); two decimal places for cost
    format_dict = {col: "{:.1f}" for col in df.columns if col not in ["Model", "Category", "Input Tokens", "Output Tokens", "Cost"]}
    format_dict["Response Characters"] = "{:d}"
    format_dict["Input Tokens"] = "{:d}"
    format_dict["Output Tokens"] = "{:d}"
    format_dict[""] = "{:d}"
    format_dict["Cost"] = "{:.2f}"

    # Styler.applymap was renamed to Styler.map in pandas >= 2.1
    return df.style.applymap(apply_color, subset=["Category"]).format(format_dict, na_rep="")


def regex_table(dataframe, regex, filter_button, style=True):
    """
    Keep only the rows whose model name matches any of the comma-separated
    regexes, restricted to the model categories selected in the checkbox group.
    """
    # Split the search string on commas and trim whitespace around each regex
    regex_list = [x.strip() for x in regex.split(",")]
    # Join the list into a single pattern, with '|' acting as OR
    # (an empty search box yields "" and matches every row)
    combined_regex = "|".join(regex_list)

    if isinstance(filter_button, (list, str)):
        # Drop rows whose category mentions a deselected model type
        if "Open-weight" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Open-weight", case=False, na=False)]
        if "Closed-source" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Closed-source", case=False, na=False)]
        if "Reasoning" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Reasoning", case=False, na=False)]
        if "Instruct" not in filter_button:
            dataframe = dataframe[~dataframe["Category"].str.contains("Instruct", case=False, na=False)]

    data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]

    # Rank by overall score, highest first
    data = data.sort_values(by="Overall", ascending=False)

    data.reset_index(drop=True, inplace=True)

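    # Prepend a 1-based rank column; the blank header renders it as a plain index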
    data.insert(0, "", range(1, 1 + len(data)))

    if style:
        # apply color
        data = color_model_type_column(data, color_map)

    return data


# primary_hue accepts a predefined color name as a string
theme = gr.themes.Default(primary_hue="blue")

#############################################
#                 Gradio App                #
#############################################

with gr.Blocks(theme=theme) as app:
    # Layout: caption at the top, then one outer tab per leaderboard
    with gr.Row():
        with gr.Column(scale=6):
            gr.Markdown(CAPTION_V2)

    with gr.Tabs(elem_id="outer-tabs", elem_classes="tabs-big") as tabs_big:
        with gr.TabItem("Report Generation w Docs"):
            with gr.Row():
                with gr.Column(scale=7):
                    gr.Markdown("Report Generation Leaderboard with Grounding Documents")
                
            with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                with gr.TabItem("Leaderboard"):
                    with gr.Row():
                        search_1 = gr.Textbox(
                            label="Model Search (delimit with , )",
                            placeholder="Model Search (delimit with , )",
                            show_label=False,
                            scale=8,
                        )
                        model_types_1 = gr.CheckboxGroup(
                            ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            show_label=False,
                            scale=8,
                        )

                    with gr.Row():
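                        # Column datatypes: rank, model name (markdown), category, then 12 numeric columns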
                        col_types_response_generation = ["number", "markdown", "str"] + ["number"] * 12
                        df_response_generation = load_filename_into_df(response_generation_filename)

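                        # Hidden copy of the full, unfiltered table; the filter callbacks
                        # read from it so filtering always starts from the complete data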
                        rewardbench_table_hidden = gr.Dataframe(
                            df_response_generation.values,
                            datatype=col_types_response_generation,
                            headers=df_response_generation.columns.tolist(),
                            visible=False,
                        )

                        rewardbench_table = gr.Dataframe(
                            regex_table(
                                df_response_generation.copy(),
                                "",
                                ["Open-weight", "Closed-source", "Reasoning", "Instruct"]
                            ),
                            datatype=col_types_response_generation,
                            headers=df_response_generation.columns.tolist(),
                            elem_id="response_generation_dataframe",
                            height=800,  # 800 px ≈ 25 rows at the default row height
                        )

        with gr.TabItem("LLM Judge"):
            with gr.Row():
                gr.Markdown("LLM Judge Leaderboard")
            with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                with gr.TabItem("Leaderboard"):
                    with gr.Row():
                        search_1_v1 = gr.Textbox(
                            label="Model Search (delimit with , )",
                            placeholder="Model Search (delimit with , )",
                            show_label=False,
                        )
                        model_types_1_v1 = gr.CheckboxGroup(
                            ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            label="Model Types",
                            show_label=False,
                        )
                        
                    with gr.Row():
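                        # Column datatypes: rank, model name (markdown), category, then 16 numeric columns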
                        col_types_llm_judge = ["number", "markdown", "str"] + ["number"] * 16
                        df_llm_judge = load_filename_into_df(llm_judge_filename)

                        rewardbench_table_hidden_v1 = gr.Dataframe(
                            df_llm_judge.values,
                            datatype=col_types_llm_judge,
                            headers=df_llm_judge.columns.tolist(),
                            visible=False,
                        )

                        rewardbench_table_v1 = gr.Dataframe(
                            regex_table(
                                df_llm_judge.copy(),
                                "",
                                ["Open-weight", "Closed-source", "Reasoning", "Instruct"],
                            ),
                            datatype=col_types_llm_judge,
                            headers=df_llm_judge.columns.tolist(),
                            elem_id="llm_judge_dataframe",
                            height=800,  # 800 px ≈ 25 rows at the default row height
                        )
            
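    # Wire up filtering: any change to a search box or its checkbox group re-runs
    # regex_table over the corresponding hidden table and refreshes the visible one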
    search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
    search_1_v1.change(
        regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
    )

    model_types_1.change(
        regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
    )
    model_types_1_v1.change(
        regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
    )

    with gr.Row():
        with gr.Accordion("πŸ“š Citation and Credits", open=False):
            citation_button = gr.Textbox(
                value=r"""@misc{wang2025profbenchmultidomainrubricsrequiring,
      title={ProfBench: Multi-Domain Rubrics requiring Professional Knowledge to Answer and Judge}, 
      author={Zhilin Wang and Jaehun Jung and Ximing Lu and Shizhe Diao and Ellie Evans and Jiaqi Zeng and Pavlo Molchanov and Yejin Choi and Jan Kautz and Yi Dong},
      year={2025},
      eprint={2510.18941},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2510.18941}, 
}""",
                lines=10,
                label="If you find the results helpful, please cite the following. ",
                elem_id="citation-button",
                show_copy_button=True,
            )
            gr.Textbox("Leaderboard adapted from allenai/reward-bench ", label="Leaderboard credits",)

app.launch()