# Deployed via github-actions[bot] — "Clean Push to Hugging Face" (commit a806362)
import gradio as gr
from gradio_leaderboard import Leaderboard
import pandas as pd
import json
import os
import filePaths
import nameMapping
import pr
##################### Leaderboard Paths + Variables #####################
pathLst = filePaths.PATHLIST
# PATHLIST is positional: indices 0-5 are the six leaderboard JSON files
# in the fixed order below — keep this in sync with filePaths.
pretrain_10K, pretrain_30K, finetune = pathLst[0], pathLst[1], pathLst[2]
toxicity_homogeneous, toxicity_heterogeneous, factual = pathLst[3], pathLst[4], pathLst[5]
# NOTE: redundant `import nameMapping` removed — it is already imported
# at the top of the file.
leaderboard_names = nameMapping.LEADERBOARD_NAMES
trainingNamesSet = nameMapping.TRAINING_LEADERBOARDS
########################## Data Loading ###########################
def load_leaderboard_data(file_path):
    """
    Load leaderboard entries from a JSON file.

    Returns the parsed JSON content (expected to be a list of row dicts),
    or an empty list when the file does not exist.
    """
    if os.path.exists(file_path):
        # Explicit encoding avoids platform-dependent default codecs.
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    return []
def add_ranking_column(data, id):
    """
    Attach a 1-based "Rank" key to every row, ordering rows by the
    metric aggregation associated with the given leaderboard id.
    Mutates the row dicts in place and returns them sorted.
    """
    # Per-leaderboard ranking key; higher is better in every case.
    if id == 'toxicity':
        def key_fn(row):  # AUPRC only
            return row["AUPRC"]
    elif id == 'factual':
        def key_fn(row):  # mean of Recall@50 and MRR
            return (row["Recall@50"] + row["MRR"]) / 2
    elif id == 'pretrain':
        def key_fn(row):  # precomputed average column
            return row["avg"]
    else:
        def key_fn(row):  # finetune: mean of the three benchmarks
            return (row["MMLU"] + row["GSM8K"] + row["BBH"]) / 3
    ranked = sorted(data, key=key_fn, reverse=True)
    for position, row in enumerate(ranked, start=1):
        row["Rank"] = position
    return ranked
def load_data(filePath, id):
    """
    Load a leaderboard JSON file and return it as a ranked DataFrame.
    """
    rows = load_leaderboard_data(filePath)
    ranked_rows = add_ranking_column(rows, id)
    return pd.DataFrame(ranked_rows)
# Load every leaderboard table once at module import time; the
# model-size dropdowns later re-load and re-rank from disk on demand
# (see rerank_leaderboard).
pretrain_10K_data = load_data(pretrain_10K, "pretrain")
pretrain_30K_data = load_data(pretrain_30K, "pretrain")
finetune_data = load_data(finetune, "finetune")
homogeneous_data = load_data(toxicity_homogeneous, "toxicity")
heterogeneous_data = load_data(toxicity_heterogeneous, "toxicity")
factual_data = load_data(factual, "factual")
########################## Leaderboard Columns + Helpers ###########################
def get_leaderboard_columns(leaderboard_name):
    """
    Return the column list expected for the named leaderboard.
    """
    name_map = nameMapping.DROPDOWN_NAME_MAPPING
    # Check the three explicit categories in order; anything else is
    # treated as a pre-training leaderboard.
    category_columns = (
        ("toxicity", nameMapping.TOXICITY_COLS),
        ("factual", nameMapping.FACTUAL_COLS),
        ("finetune", nameMapping.FINETUNE_COLS),
    )
    for category, columns in category_columns:
        if leaderboard_name in name_map[category]:
            return columns
    return nameMapping.PRETRAIN_COLS
def get_model_sizes(leaderboard_name):
    """
    Return the model-size filter choices for an applications
    leaderboard, with "All" always first.
    """
    name_file_mapping = {"Homogeneous": toxicity_homogeneous,
                         "Heterogeneous": toxicity_heterogeneous,
                         "Factual Attribution": factual}
    leaderboard_json = load_leaderboard_data(name_file_mapping[leaderboard_name])
    # Deduplicate, then sort so the dropdown ordering is deterministic
    # across runs (bare set iteration order is not).
    model_sizes = {row["Model Size"] for row in leaderboard_json}
    return ['All'] + sorted(model_sizes)
################### Submission Helper Functions #############################
def update_fields(leaderboard):
    """
    Toggle visibility of the per-leaderboard metric groups in the
    submission form, based on the selected leaderboard name.
    """
    name_map = nameMapping.DROPDOWN_NAME_MAPPING
    # Map each metric-group component to its category key in the
    # dropdown name mapping.
    group_categories = {
        pretrain_group: 'pretrain',
        finetune_group: 'finetune',
        toxicity_group: 'toxicity',
        factual_group: 'factual',
    }
    return {
        group: gr.update(visible=(leaderboard in name_map[category]))
        for group, category in group_categories.items()
    }
def validate_inputs(*inputFields):
    """
    Validate the submission form fields.

    Raises gr.Error with a user-facing message on the first problem
    found; returns None when everything required is present.
    """
    (leaderboard_dropdown, method_name, method_dropdown, model_name, model_size, paper_link, scores,
     pre_avg, pre_sciq, pre_arc_easy, pre_arc_chall, pre_logiqa,
     pre_boolq, pre_hellaswag, pre_piqa, pre_wino, pre_open,
     fine_mmlu, fine_gsm, fine_bbh,
     tox_toxicChat, tox_xsTest, tox_jbb, tox_auprc,
     fac_recall, fac_mrr) = inputFields
    # Identity fields are always required, regardless of leaderboard.
    if not all([leaderboard_dropdown, model_name, method_name, method_dropdown, model_size]):
        raise gr.Error("All fields must be filled out and with the correct type.")
    if not paper_link:
        raise gr.Error("Please fill out the Paper/Code/Contact Link info.")
    if not scores:
        raise gr.Error("Please upload data attribution scores in .pt file.")
    # Select the metric group matching the chosen leaderboard.
    nameMap = nameMapping.DROPDOWN_NAME_MAPPING
    if leaderboard_dropdown in nameMap['pretrain']:
        metricsList = [pre_avg, pre_sciq, pre_arc_easy, pre_arc_chall, pre_logiqa, pre_boolq, pre_hellaswag, pre_piqa, pre_wino, pre_open]
    elif leaderboard_dropdown in nameMap['finetune']:
        metricsList = [fine_mmlu, fine_gsm, fine_bbh]
    elif leaderboard_dropdown in nameMap['toxicity']:
        metricsList = [tox_toxicChat, tox_xsTest, tox_jbb, tox_auprc]
    elif leaderboard_dropdown in nameMap['factual']:
        metricsList = [fac_recall, fac_mrr]
    else:
        # Previously an unrecognized leaderboard name fell through and
        # crashed with UnboundLocalError on metricsList below.
        raise gr.Error("Unknown leaderboard selection.")
    # gr.Number yields None when left blank, which fails the all() check.
    if not all(metricsList):
        raise gr.Error("Metrics must be filled out.")
    if not all(metric > 0 for metric in metricsList):
        raise gr.Error("Metrics must be positive.")
######## Dynamically Update Ranking when Filtering on Model Size ###############
def update_rankings(filtered_df, id):
    """
    Re-sort a (possibly filtered) leaderboard frame by its ranking
    metric and rewrite the "Rank" column as 1..n.
    """
    ranked = filtered_df.copy()  # never mutate the caller's frame
    if id == 'toxicity':
        # Toxicity boards rank by AUPRC alone.
        ranked = ranked.sort_values(by="AUPRC", ascending=False)
    elif id == 'factual':
        # Factual boards rank by the mean of Recall@50 and MRR.
        order = (ranked[["Recall@50", "MRR"]]
                 .mean(axis=1)
                 .sort_values(ascending=False)
                 .index)
        ranked = ranked.loc[order]
    ranked["Rank"] = range(1, len(ranked) + 1)
    return ranked
def filter_and_rank(df, filter_value, id):
    """
    Optionally restrict the frame to one model size, then re-rank it.
    "All" means no filtering.
    """
    subset = df if filter_value == "All" else df[df["Model Size"] == filter_value]
    return update_rankings(subset, id)
def rerank_leaderboard(filter_value, dfPath, idNum):
    """
    Reload a leaderboard from disk, then filter and re-rank it.
    Wired as the change-handler for the model-size dropdowns.
    """
    full_df = load_data(dfPath, idNum)
    return filter_and_rank(full_df, filter_value, idNum)
#################### Leaderboards Code ##############################
# UI definition: three top-level tabs (training selection, applications,
# submission form). Hidden Textboxes are used to pass constant arguments
# into the dropdown change-handlers.
with gr.Blocks(css="""
body, .gradio-container {
    font-family: 'roboto';
}
""") as demo:
    gr.Markdown("""
    # Data Attribution Methods Leaderboards
    """)
    gr.Markdown(f"""
    Survey and ranking of data attribution methods on data selection and
    downstream application tasks for the Date-LM Evaluation paper.
    **Leaderboard Submission**:
    - To submit your team's scores, click on the "Submit Scores" tab.
    **Data Attribution Method Categories**:
    - Gradient (ex. GradDot, GradSim, LESS, DataInf, EKFAC)
    - Similarity (ex. RepSim)
    - Modeling (ex. MATES)
    - Lexical (ex. BM25)
    - Baseline (ex. GradSafe, OpenAI Moderation, LLM Classifiers)
    - Other
    **Search Feature**:
    - Input the name of the method you would like to search / filter for, and
    then press "Enter". The original row from the leaderboard table will be displayed.
    """
    )
    with gr.Tabs():
        # --- Tab 1: data selection leaderboards (no model-size filter) ---
        with gr.TabItem("Training Data Selection"):
            with gr.Tabs(): # Subtabs container
                with gr.TabItem("Pre-Training (10K)"): # Subtab
                    gr.Markdown("""DATE-LM Task Description: Trained pythia-1B model on Fineweb using
                    Lambada reference dataset. Testing results conducted on 10K step checkpoint.
                    Ranking Metric: highest score in **avg** column""") # description
                    l1 = Leaderboard(
                        value=pd.DataFrame(pretrain_10K_data),
                        select_columns=get_leaderboard_columns("Pre-Training (10K)"),
                        search_columns=['Method'],
                        filter_columns=["Attribution Method Type", "Method", "avg"],
                    )
                with gr.TabItem("Pre-Training (30K)"):
                    gr.Markdown("""DATE-LM Task Description: Trained pythia-1B model on Fineweb using
                    Lambada reference dataset. Testing results conducted on 30K step checkpoint.
                    Ranking Metric: highest score in **avg** column""")
                    l2 = Leaderboard(
                        value=pd.DataFrame(pretrain_30K_data),
                        select_columns=get_leaderboard_columns("Pre-Training (30K)"),
                        search_columns=["Method"],
                        filter_columns=["Attribution Method Type", "Method", "avg"],
                    )
                with gr.TabItem("Fine-Tuning"):
                    gr.Markdown("""DATE-LM Task Description: Targeted instruction tuning setting.
                    Given a diverse instruction set and a eval dataset, we select data that would yield
                    optimal performance on the eval data. For this task, the training data pool is
                    Tulu3 (unfiltered) and the eval data is MMLU, GSM8K, and BBH.
                    Ranking Metric: average of the **MMLU**, **GSM8K**, and **BBH** scores""")
                    l3 = Leaderboard(
                        value=pd.DataFrame(finetune_data),
                        select_columns=get_leaderboard_columns("Fine-Tuning"),
                        search_columns=["Method"],
                        filter_columns=["Attribution Method Type", "MMLU", "GSM8K", "BBH"],
                    )
        # --- Tab 2: application leaderboards, each with a model-size
        # dropdown that reloads and re-ranks the table on change ---
        with gr.TabItem("Applications"):
            with gr.Tabs():
                with gr.TabItem("Toxicity/Bias"):
                    with gr.Tabs():
                        with gr.TabItem("Homogeneous"):
                            gr.Markdown("""DATE-LM Task Description: This leaderboard presents detection AUPRC results of baseline methods and data attribution methods in the homogenous setting
                            (i.e., detecting small amount of toxic/biased data embedded into larger benign data).
                            Ranking Metric: **AUPRC** (an average of ToxicChat, XSTest-response, JailBreakBench)""")
                            category_filter4 = gr.Dropdown(
                                choices=get_model_sizes("Homogeneous"),
                                value="All",
                                label="Filter Model Size"
                            ) # ensures page placement above leaderboard
                            l4 = Leaderboard(
                                value=pd.DataFrame(homogeneous_data),
                                select_columns=get_leaderboard_columns("Homogeneous"),
                                search_columns=["Method"],
                                filter_columns=["Attribution Method Type", "Model", "AUPRC"],
                            )
                            # Hidden constants fed to rerank_leaderboard alongside
                            # the dropdown value.
                            data_path4 = gr.Textbox(value=toxicity_homogeneous, visible=False)
                            id_str4 = gr.Textbox(value="toxicity", visible=False)
                            category_filter4.change(
                                fn=rerank_leaderboard,
                                inputs=[category_filter4, data_path4, id_str4],
                                outputs=[l4]
                            )
                        with gr.TabItem("Heterogeneous"):
                            gr.Markdown("""DATE-LM Task Description: This leaderboard presents detection AUPRC results of baseline methods and data attribution methods in the heterogeneous setting
                            (i.e., safety-aligned examples that resemble unsafe data in format but contain safe responses).
                            Ranking Metric: **AUPRC** (an average of ToxicChat, XSTest-response, JailBreakBench)""")
                            category_filter5 = gr.Dropdown(
                                choices=get_model_sizes("Heterogeneous"),
                                value="All",
                                label="Filter Model Size"
                            )
                            l5 = Leaderboard(
                                value=pd.DataFrame(heterogeneous_data),
                                select_columns=get_leaderboard_columns("Heterogeneous"),
                                search_columns=["Method"],
                                filter_columns=["Attribution Method Type", "Model", "AUPRC"]
                            )
                            data_path5 = gr.Textbox(value=toxicity_heterogeneous, visible=False)
                            id_str5 = gr.Textbox(value="toxicity", visible=False)
                            category_filter5.change(
                                fn=rerank_leaderboard,
                                inputs=[category_filter5, data_path5, id_str5],
                                outputs=[l5]
                            )
                with gr.TabItem("Factual Attribution"):
                    gr.Markdown("""DATE-LM Task Description: Identifying the specific training examples that support a model's generated facts.
                    Ranking Metric: average of **Recall@50** and **MRR**""")
                    category_filter6 = gr.Dropdown(
                        choices=get_model_sizes("Factual Attribution"),
                        value="All",
                        label="Filter Model Size"
                    )
                    l6 = Leaderboard(
                        value=pd.DataFrame(factual_data),
                        select_columns=get_leaderboard_columns("Factual Attribution"),
                        search_columns=["Method"],
                        filter_columns=["Attribution Method Type", "Model", "Recall@50", "MRR"],
                    )
                    data_path6 = gr.Textbox(value=factual, visible=False)
                    id_str6 = gr.Textbox(value="factual", visible=False)
                    category_filter6.change(
                        fn=rerank_leaderboard,
                        inputs=[category_filter6, data_path6, id_str6],
                        outputs=[l6]
                    )
        # --- Tab 3: score submission form; metric fields are shown or
        # hidden depending on the selected leaderboard (update_fields) ---
        with gr.TabItem("Submit Scores 🚀"):
            with gr.Column():
                gr.Markdown("""### Submit Your Score to a Leaderboard
                Note: Please first select the leaderboard you would like to submit to. This will display the fields for the
                corresponding metrics that are needed.
                """)
                leaderboard_dropdown = gr.Dropdown(
                    label="Select Leaderboard",
                    choices=nameMapping.LEADERBOARD_NAMES,
                    value=None
                )
                method_name = gr.Textbox(label="Method Name")
                method_dropdown = gr.Dropdown(
                    label="Method Type",
                    choices=["Gradient", "Similarity", "Representation-Based", "Modeling", "Baseline", "Lexical", "Other"],
                    value=None
                )
                # model_size = gr.Dropdown(
                #     label="Model Size",
                #     choices=["400M", "1B", "3B", "7B"],
                #     value=None
                # )
                model_name = gr.Textbox(label="Model Name")
                model_size = gr.Textbox(label="Model Size (ex. 410M, 1B, 8B)")
                paper_link = gr.Textbox(label="Paper/Code/Contact Link")
                scores = gr.File(label='Upload Data Attribution Scores File (.pt)', height=150, file_types=[".pt"])
                # Dynamically Display Needed Fields for Each Leaderboard Type
                with gr.Column(visible=False) as pretrain_group:
                    pre_avg = gr.Number(label="Avg")
                    pre_sciq = gr.Number(label="sciq")
                    pre_arc_easy = gr.Number(label="arc_easy")
                    pre_arc_chall = gr.Number(label="arc_challenge")
                    pre_logiqa = gr.Number(label="logiqa")
                    pre_boolq = gr.Number(label="boolq")
                    pre_hellaswag = gr.Number(label="hellaswag")
                    pre_piqa = gr.Number(label="piqa")
                    pre_wino = gr.Number(label="winogrande")
                    pre_open = gr.Number(label="openbookqa")
                with gr.Column(visible=False) as finetune_group:
                    fine_mmlu = gr.Number(label="MMLU")
                    fine_gsm = gr.Number(label="GSM8K")
                    fine_bbh = gr.Number(label="BBH")
                with gr.Column(visible=False) as toxicity_group:
                    tox_toxicChat = gr.Number(label="ToxicChat")
                    tox_xsTest = gr.Number(label="XSTest-response")
                    tox_jbb = gr.Number(label="JailBreakBench")
                    tox_auprc = gr.Number(label="AUPRC")
                with gr.Column(visible=False) as factual_group:
                    fac_recall = gr.Number(label="Recall@50")
                    fac_mrr = gr.Number(label="MRR")
                # with gr.Group(visible=False) as training_group:
                #     acc = gr.Number(label="Accuracy")
                # applications_group = gr.Column(visible=False)
                # with applications_group:
                #     f1_score = gr.Number(label="F1")
                #     auprc_score = gr.Number(label="AUPRC")
                #     acc1 = gr.Number(label="Accuracy")
                # Submit button
                submit_button = gr.Button("Submit")
                leaderboard_dropdown.change(update_fields, inputs=[leaderboard_dropdown], outputs=[pretrain_group, finetune_group, toxicity_group, factual_group])
                # information lists
                # Order must match the tuple unpacking in validate_inputs
                # and the signature expected by pr.submit_and_open_PR.
                inputsList = [leaderboard_dropdown, method_name, method_dropdown, model_name, model_size, paper_link, scores, \
                              pre_avg, pre_sciq, pre_arc_easy, pre_arc_chall, pre_logiqa, pre_boolq, pre_hellaswag, pre_piqa, pre_wino, pre_open, \
                              fine_mmlu, fine_gsm, fine_bbh, \
                              tox_toxicChat, tox_xsTest, tox_jbb, tox_auprc, \
                              fac_recall, fac_mrr]
                # Validate first; only on success open the GitHub PR.
                submit_button.click(
                    validate_inputs, inputs=inputsList, outputs=[]
                ).success(fn=pr.submit_and_open_PR, inputs=inputsList, outputs=[gr.Textbox(label="Opened PR on Github")])
if __name__ == "__main__":
    # Launch the Gradio server; debug=True surfaces tracebacks in the console.
    demo.launch(debug=True)