import gradio as gr
import pandas as pd
###########################################
#                Load Data                #
###########################################
llm_judge_filename = "llm_judge_results.jsonl"
response_generation_filename = "report_generation_w_docs.jsonl"
def load_filename_into_df(filename):
    df = pd.read_json(filename, lines=True)
    return df
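
# NOTE: the exact JSONL schema is an assumption inferred from how the tables are built below.
# Each line is one JSON object whose keys become DataFrame columns, including at least
# "Model", "Category" (one of the color_map keys below), an "Overall" score, and numeric
# columns such as "Input Tokens", "Output Tokens" and "Cost".
# An illustrative (hypothetical) line might look like:
#   {"Model": "some-model", "Category": "Open-weight Instruct", "Overall": 51.2, "Cost": 0.42}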
color_map = {
"Closed-source Instruct": "#B8D2F5" ,
"Open-weight Instruct": "#6f96e5",
"Closed-source Reasoning": "#fce8c5" ,
"Open-weight Reasoning": "#ffcd75",
}
CAPTION_V2 = f"""**ProfBench**: Human-annotated rubrics on addressing professional tasks across PhD STEM (Chemistry, Physics) and MBA Services (Finance, Consulting) domains. \n
[Blog](https://huggingface.co/blog/nvidia/profbench) | [Paper](https://arxiv.org/abs/2510.18941) | [Data](https://huggingface.co/datasets/nvidia/ProfBench) | [Code](https://github.com/NVlabs/ProfBench)\n
Want to see your favorite models added? Run it with our code, send us the scores or ping us to run it for you!"""
def color_model_type_column(df, color_map):
"""
Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
Parameters:
df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
color_map (dict): A dictionary mapping model types to colors.
Returns:
pd.Styler: The styled DataFrame.
"""
# Function to apply color based on the model type
def apply_color(val):
color = color_map.get(val, "default") # Default color if not specified in color_map
return f"background-color: {color}"
# # Format for different columns
# format_dict = {col: "{:.1f}" for col in df.columns if col not in ["Average", "Model", "Model Type"]}
# format_dict["Average"] = "{:.2f}"
# format_dict[""] = "{:d}"
format_dict = {col: "{:.1f}" for col in df.columns if col not in ["Model", "Category", "Input Tokens", "Output Tokens", "Cost"]}
format_dict["Response Characters"] = "{:d}"
format_dict["Input Tokens"] = "{:d}"
format_dict["Output Tokens"] = "{:d}"
format_dict[""] = "{:d}"
format_dict["Cost"] = "{:.2f}"
return df.style.applymap(apply_color, subset=["Category"]).format(format_dict, na_rep="")
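
# Illustrative sketch (not executed), assuming the column names produced by the leaderboard JSONL files:
#   demo = pd.DataFrame({"Model": ["m"], "Category": ["Open-weight Instruct"], "Overall": [51.2]})
#   styled = color_model_type_column(demo, color_map)  # pandas Styler, passed straight to gr.Dataframe below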
def regex_table(dataframe, regex, filter_button, style=True):
"""
Takes a model name as a regex, then returns only the rows that has that in it.
"""
# Split regex statement by comma and trim whitespace around regexes
regex_list = [x.strip() for x in regex.split(",")]
# Join the list into a single regex pattern with '|' acting as OR
combined_regex = "|".join(regex_list)
if isinstance(filter_button, list) or isinstance(filter_button, str):
if "Open-weight" not in filter_button:
dataframe = dataframe[~dataframe["Category"].str.contains("Open-weight", case=False, na=False)]
if "Closed-source" not in filter_button:
dataframe = dataframe[~dataframe["Category"].str.contains("Closed-source", case=False, na=False)]
if "Reasoning" not in filter_button:
dataframe = dataframe[~dataframe["Category"].str.contains("Reasoning", case=False, na=False)]
if "Instruct" not in filter_button:
dataframe = dataframe[~dataframe["Category"].str.contains("Instruct", case=False, na=False)]
data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
# if update the score to not use prior sets, do so
data = data.sort_values(by="Overall", ascending=False)
data.reset_index(drop=True, inplace=True)
data.insert(0, "", range(1, 1 + len(data)))
if style:
# apply color
data = color_model_type_column(data, color_map)
return data
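
# Illustrative example (assuming "Model", "Category" and "Overall" columns in the data):
#   regex_table(df, "llama, gpt", ["Open-weight", "Reasoning"])
# keeps rows whose "Model" matches "llama" or "gpt" (case-insensitive), drops every
# "Closed-source" and "Instruct" category, and ranks what remains by "Overall".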
# Using a string for a predefined color
theme = gr.themes.Default(primary_hue="blue")
#############################################
#                Gradio App                 #
#############################################
with gr.Blocks(theme=theme) as app:
    # Header row with the benchmark description, followed by one tab per leaderboard
    with gr.Row():
        with gr.Column(scale=6):
            gr.Markdown(CAPTION_V2)
with gr.Tabs(elem_id="outer-tabs", elem_classes="tabs-big") as tabs_big:
with gr.TabItem("Report Generation w Docs"):
with gr.Row():
with gr.Column(scale=7):
gr.Markdown("Report Generation Leaderboard with Grounding Documents")
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
with gr.TabItem("Leaderboard"):
with gr.Row():
search_1 = gr.Textbox(
label="Model Search (delimit with , )",
placeholder="Model Search (delimit with , )",
show_label=False,
scale=8,
)
model_types_1 = gr.CheckboxGroup(
["Open-weight", "Closed-source", "Reasoning", "Instruct"],
value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
show_label=False,
scale=8,
)
with gr.Row():
col_types_response_generation = ["number"] + ["markdown"] + ["str"] + ["number"] * 12
df_response_generation = load_filename_into_df(response_generation_filename)
rewardbench_table_hidden = gr.Dataframe(
df_response_generation.values,
datatype=col_types_response_generation,
headers=df_response_generation.columns.tolist(),
visible=False,
)
rewardbench_table = gr.Dataframe(
regex_table(
df_response_generation.copy(),
"",
["Open-weight", "Closed-source", "Reasoning", "Instruct"]
),
datatype=col_types_response_generation,
headers=df_response_generation.columns.tolist(),
elem_id="response_generation_dataframe",
height=800, # 800 px β ~25 rows on default row-height
)
with gr.TabItem("LLM Judge"):
with gr.Row():
gr.Markdown("LLM Judge Leaderboard")
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
with gr.TabItem("Leaderboard"):
with gr.Row():
search_1_v1 = gr.Textbox(
label="Model Search (delimit with , )",
placeholder="Model Search (delimit with , )",
show_label=False,
)
model_types_1_v1 = gr.CheckboxGroup(
["Open-weight", "Closed-source", "Reasoning", "Instruct"],
value=["Open-weight", "Closed-source", "Reasoning", "Instruct"],
label="Model Types",
show_label=False,
# info="Which model types to include.",
)
with gr.Row():
col_types_llm_judge = ["number"] + ["markdown"] + ["str"] + ["number"] * 16
df_llm_judge = load_filename_into_df(llm_judge_filename)
rewardbench_table_hidden_v1 = gr.Dataframe(
df_llm_judge.values,
datatype=col_types_llm_judge,
headers=df_llm_judge.columns.tolist(),
visible=False,
)
rewardbench_table_v1 = gr.Dataframe(
regex_table(
df_llm_judge.copy(),
"",
["Open-weight", "Closed-source", "Reasoning", "Instruct"],
),
datatype=col_types_llm_judge,
headers=df_llm_judge.columns.tolist(),
elem_id="llm_judge_dataframe",
height=800, # 800 px β ~25 rows on default row-height
)
    # Re-filter the visible tables from the hidden (unfiltered) copies whenever the search box
    # or the model-type checkboxes change
    search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
    search_1_v1.change(
        regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
    )
    model_types_1.change(
        regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
    )
    model_types_1_v1.change(
        regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
    )
    with gr.Row():
        with gr.Accordion("Citation and Credits", open=False):
            citation_button = gr.Textbox(
                value=r"""@misc{wang2025profbenchmultidomainrubricsrequiring,
      title={ProfBench: Multi-Domain Rubrics requiring Professional Knowledge to Answer and Judge},
      author={Zhilin Wang and Jaehun Jung and Ximing Lu and Shizhe Diao and Ellie Evans and Jiaqi Zeng and Pavlo Molchanov and Yejin Choi and Jan Kautz and Yi Dong},
      year={2025},
      eprint={2510.18941},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2510.18941},
}""",
                lines=10,
                label="If you find the results helpful, please cite the following.",
                elem_id="citation-button",
                show_copy_button=True,
            )
            gr.Textbox("Leaderboard adapted from allenai/reward-bench", label="Leaderboard credits")
app.launch()  # .queue() used to be called before launch; it does not appear to be necessary here