import gradio as gr
from gradio_leaderboard import Leaderboard
import pandas as pd
import json
import os

import filePaths
import nameMapping
import pr  # PR-submission helper (see pr.submit_and_open_PR below)


# Resolve the leaderboard data file paths.
pathLst = filePaths.PATHLIST
pretrain_10K, pretrain_30K, finetune = pathLst[0], pathLst[1], pathLst[2]
toxicity_homogeneous, toxicity_heterogeneous, factual = pathLst[3], pathLst[4], pathLst[5]

leaderboard_names = nameMapping.LEADERBOARD_NAMES
trainingNamesSet = nameMapping.TRAINING_LEADERBOARDS


def load_leaderboard_data(file_path):
    """
    Load leaderboard data from a JSON file; return an empty list if the file is missing.
    """
    if os.path.exists(file_path):
        with open(file_path, 'r') as f:
            return json.load(f)
    return []
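
# A minimal sketch (hypothetical rows) of the JSON shape this loader expects:
# each leaderboard file holds a list of row dicts keyed by column names, e.g.
#
#     [{"Method": "BM25", "Attribution Method Type": "Lexical", "AUPRC": 0.87},
#      {"Method": "RepSim", "Attribution Method Type": "Similarity", "AUPRC": 0.62}]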


def add_ranking_column(data, board_id):
    """
    Add a "Rank" column based on the ranking metric of the selected leaderboard.
    """
    if board_id == 'toxicity':
        key_fn = lambda x: x["AUPRC"]
    elif board_id == 'factual':
        key_fn = lambda x: (x["Recall@50"] + x["MRR"]) / 2
    elif board_id == 'pretrain':
        key_fn = lambda x: x["avg"]
    else:
        key_fn = lambda x: (x["MMLU"] + x["GSM8K"] + x["BBH"]) / 3

    sorted_data = sorted(data, key=key_fn, reverse=True)
    for index, entry in enumerate(sorted_data):
        entry["Rank"] = index + 1
    return sorted_data
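
# Usage sketch (hypothetical rows): entries are sorted by the board's headline
# metric and ranked 1..n in place, e.g.
#
#     rows = [{"AUPRC": 0.62}, {"AUPRC": 0.87}]
#     add_ranking_column(rows, "toxicity")
#     # -> [{"AUPRC": 0.87, "Rank": 1}, {"AUPRC": 0.62, "Rank": 2}]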


def load_data(filePath, board_id):
    """
    Load a leaderboard JSON file into a ranked pandas DataFrame.
    """
    return pd.DataFrame(add_ranking_column(load_leaderboard_data(filePath), board_id))


# Load the initial data for each leaderboard.
pretrain_10K_data = load_data(pretrain_10K, "pretrain")
pretrain_30K_data = load_data(pretrain_30K, "pretrain")
finetune_data = load_data(finetune, "finetune")
homogeneous_data = load_data(toxicity_homogeneous, "toxicity")
heterogeneous_data = load_data(toxicity_heterogeneous, "toxicity")
factual_data = load_data(factual, "factual")


def get_leaderboard_columns(leaderboard_name):
    """
    Return the expected column set for the given leaderboard.
    """
    leaderboardNameMap = nameMapping.DROPDOWN_NAME_MAPPING
    if leaderboard_name in leaderboardNameMap["toxicity"]:
        return nameMapping.TOXICITY_COLS
    elif leaderboard_name in leaderboardNameMap["factual"]:
        return nameMapping.FACTUAL_COLS
    elif leaderboard_name in leaderboardNameMap["finetune"]:
        return nameMapping.FINETUNE_COLS
    else:
        return nameMapping.PRETRAIN_COLS


def get_model_sizes(leaderboard_name):
    """
    Return the model-size filter choices for the Applications leaderboards.
    """
    nameFileMapping = {"Homogeneous": toxicity_homogeneous,
                       "Heterogeneous": toxicity_heterogeneous,
                       "Factual Attribution": factual}
    leaderboardJson = load_leaderboard_data(nameFileMapping[leaderboard_name])

    modelSizes = set()
    for row in leaderboardJson:
        modelSizes.add(row["Model Size"])

    # Sort for a stable dropdown order; sets are unordered.
    return ['All'] + sorted(modelSizes)
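
# Example (hypothetical sizes): if the homogeneous toxicity file contains rows with
# "Model Size" values "1B" and "8B", this returns ['All', '1B', '8B'].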


def update_fields(leaderboard):
    """
    Show only the metric group matching the selected leaderboard in the submission area.
    """
    nameMap = nameMapping.DROPDOWN_NAME_MAPPING
    return {
        pretrain_group: gr.update(visible=(leaderboard in nameMap['pretrain'])),
        finetune_group: gr.update(visible=(leaderboard in nameMap['finetune'])),
        toxicity_group: gr.update(visible=(leaderboard in nameMap['toxicity'])),
        factual_group: gr.update(visible=(leaderboard in nameMap['factual']))
    }
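
# Note: returning a dict keyed by components is a standard Gradio pattern when the
# components are also listed in the event's `outputs`; each maps to a gr.update().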


def validate_inputs(*inputFields):
    """
    Validate the submission form and raise gr.Error on the first problem found.
    """
    (leaderboard_dropdown, method_name, method_dropdown, model_name, model_size, paper_link, scores,
     pre_avg, pre_sciq, pre_arc_easy, pre_arc_chall, pre_logiqa,
     pre_boolq, pre_hellaswag, pre_piqa, pre_wino, pre_open,
     fine_mmlu, fine_gsm, fine_bbh,
     tox_toxicChat, tox_xsTest, tox_jbb, tox_auprc,
     fac_recall, fac_mrr) = inputFields

    if not all([leaderboard_dropdown, model_name, method_name, method_dropdown, model_size]):
        raise gr.Error("All fields must be filled out and with the correct type.")

    if not paper_link:
        raise gr.Error("Please fill out the Paper/Code/Contact Link info.")

    if not scores:
        raise gr.Error("Please upload data attribution scores as a .pt file.")

    nameMap = nameMapping.DROPDOWN_NAME_MAPPING
    if leaderboard_dropdown in nameMap['pretrain']:
        metricsList = [pre_avg, pre_sciq, pre_arc_easy, pre_arc_chall, pre_logiqa, pre_boolq, pre_hellaswag, pre_piqa, pre_wino, pre_open]
    elif leaderboard_dropdown in nameMap['finetune']:
        metricsList = [fine_mmlu, fine_gsm, fine_bbh]
    elif leaderboard_dropdown in nameMap['toxicity']:
        metricsList = [tox_toxicChat, tox_xsTest, tox_jbb, tox_auprc]
    elif leaderboard_dropdown in nameMap['factual']:
        metricsList = [fac_recall, fac_mrr]
    else:
        # Guard against an unrecognized leaderboard; previously this fell
        # through and raised a NameError on metricsList below.
        raise gr.Error("Unknown leaderboard selection.")

    if not all(metricsList):
        raise gr.Error("Metrics must be filled out.")
    if not all(metric > 0 for metric in metricsList):
        raise gr.Error("Metrics must be positive.")


def update_rankings(filtered_df, board_id):
    """
    Recompute the "Rank" column after filtering.
    """
    df_with_rank = filtered_df.copy()

    if board_id == 'toxicity':
        df_with_rank = df_with_rank.sort_values(by="AUPRC", ascending=False)
    elif board_id == 'factual':
        average_scores = df_with_rank[["Recall@50", "MRR"]].mean(axis=1)
        sorted_index = average_scores.sort_values(ascending=False).index
        df_with_rank = df_with_rank.loc[sorted_index]

    df_with_rank["Rank"] = range(1, len(df_with_rank) + 1)
    return df_with_rank


def filter_and_rank(df, filter_value, board_id):
    """
    Filter rows by model size (or keep all) and re-rank the result.
    """
    if filter_value == "All":
        filtered_df = df
    else:
        filtered_df = df[df["Model Size"] == filter_value]
    return update_rankings(filtered_df, board_id)


def rerank_leaderboard(filter_value, dfPath, board_id):
    """
    Reload a leaderboard from disk, then filter and re-rank it.
    """
    df = load_data(dfPath, board_id)
    return filter_and_rank(df, filter_value, board_id)
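
# Usage sketch: rerank_leaderboard("1B", toxicity_homogeneous, "toxicity") reloads
# the homogeneous board and keeps only rows whose "Model Size" is "1B" (a
# hypothetical size), re-ranked 1..n by AUPRC.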


with gr.Blocks(css="""
    body, .gradio-container {
        font-family: 'Roboto', sans-serif;
    }
""") as demo:
    gr.Markdown("""
    # Data Attribution Methods Leaderboards
    """)
    gr.Markdown("""
    Survey and ranking of data attribution methods on data selection and
    downstream application tasks for the DATE-LM evaluation paper.

    **Leaderboard Submission**:
    - To submit your team's scores, click on the "Submit Scores" tab.

    **Data Attribution Method Categories**:
    - Gradient (ex. GradDot, GradSim, LESS, DataInf, EKFAC)
    - Similarity (ex. RepSim)
    - Modeling (ex. MATES)
    - Lexical (ex. BM25)
    - Baseline (ex. GradSafe, OpenAI Moderation, LLM Classifiers)
    - Other

    **Search Feature**:
    - Enter the name of the method you would like to search/filter for, then
      press "Enter". The matching rows from the leaderboard table will be displayed.
    """)

    with gr.Tabs():
        with gr.TabItem("Training Data Selection"):
            with gr.Tabs():
                with gr.TabItem("Pre-Training (10K)"):
                    gr.Markdown("""DATE-LM Task Description: Trained a pythia-1B model on FineWeb using
                    the LAMBADA reference dataset. Results are reported at the 10K-step checkpoint.

                    Ranking Metric: highest score in the **avg** column""")
                    l1 = Leaderboard(
                        value=pd.DataFrame(pretrain_10K_data),
                        select_columns=get_leaderboard_columns("Pre-Training (10K)"),
                        search_columns=["Method"],
                        filter_columns=["Attribution Method Type", "Method", "avg"],
                    )
                with gr.TabItem("Pre-Training (30K)"):
                    gr.Markdown("""DATE-LM Task Description: Trained a pythia-1B model on FineWeb using
                    the LAMBADA reference dataset. Results are reported at the 30K-step checkpoint.

                    Ranking Metric: highest score in the **avg** column""")
                    l2 = Leaderboard(
                        value=pd.DataFrame(pretrain_30K_data),
                        select_columns=get_leaderboard_columns("Pre-Training (30K)"),
                        search_columns=["Method"],
                        filter_columns=["Attribution Method Type", "Method", "avg"],
                    )
                with gr.TabItem("Fine-Tuning"):
                    gr.Markdown("""DATE-LM Task Description: Targeted instruction-tuning setting.
                    Given a diverse instruction set and an eval dataset, we select data that would yield
                    optimal performance on the eval data. For this task, the training data pool is
                    Tulu3 (unfiltered) and the eval data is MMLU, GSM8K, and BBH.

                    Ranking Metric: average of the **MMLU**, **GSM8K**, and **BBH** scores""")
                    l3 = Leaderboard(
                        value=pd.DataFrame(finetune_data),
                        select_columns=get_leaderboard_columns("Fine-Tuning"),
                        search_columns=["Method"],
                        filter_columns=["Attribution Method Type", "MMLU", "GSM8K", "BBH"],
                    )
        with gr.TabItem("Applications"):
            with gr.Tabs():
                with gr.TabItem("Toxicity/Bias"):
                    with gr.Tabs():
                        with gr.TabItem("Homogeneous"):
                            gr.Markdown("""DATE-LM Task Description: This leaderboard presents detection AUPRC results of baseline
                            methods and data attribution methods in the homogeneous setting (i.e., detecting a small amount of
                            toxic/biased data embedded in a larger pool of benign data).

                            Ranking Metric: **AUPRC** (an average over ToxicChat, XSTest-response, and JailBreakBench)""")
                            category_filter4 = gr.Dropdown(
                                choices=get_model_sizes("Homogeneous"),
                                value="All",
                                label="Filter Model Size"
                            )
                            l4 = Leaderboard(
                                value=pd.DataFrame(homogeneous_data),
                                select_columns=get_leaderboard_columns("Homogeneous"),
                                search_columns=["Method"],
                                filter_columns=["Attribution Method Type", "Model", "AUPRC"],
                            )
                            # Hidden textboxes pass constant arguments into the change callback.
                            data_path4 = gr.Textbox(value=toxicity_homogeneous, visible=False)
                            id_str4 = gr.Textbox(value="toxicity", visible=False)
                            category_filter4.change(
                                fn=rerank_leaderboard,
                                inputs=[category_filter4, data_path4, id_str4],
                                outputs=[l4]
                            )
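
                            # Note: the invisible gr.Textbox components above serve as constant
                            # inputs to the change callback; gr.State would be an equivalent,
                            # arguably more idiomatic carrier for these values.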
                        with gr.TabItem("Heterogeneous"):
                            gr.Markdown("""DATE-LM Task Description: This leaderboard presents detection AUPRC results of baseline
                            methods and data attribution methods in the heterogeneous setting (i.e., the benign pool includes
                            safety-aligned examples that resemble unsafe data in format but contain safe responses).

                            Ranking Metric: **AUPRC** (an average over ToxicChat, XSTest-response, and JailBreakBench)""")
                            category_filter5 = gr.Dropdown(
                                choices=get_model_sizes("Heterogeneous"),
                                value="All",
                                label="Filter Model Size"
                            )
                            l5 = Leaderboard(
                                value=pd.DataFrame(heterogeneous_data),
                                select_columns=get_leaderboard_columns("Heterogeneous"),
                                search_columns=["Method"],
                                filter_columns=["Attribution Method Type", "Model", "AUPRC"]
                            )
                            data_path5 = gr.Textbox(value=toxicity_heterogeneous, visible=False)
                            id_str5 = gr.Textbox(value="toxicity", visible=False)
                            category_filter5.change(
                                fn=rerank_leaderboard,
                                inputs=[category_filter5, data_path5, id_str5],
                                outputs=[l5]
                            )
                with gr.TabItem("Factual Attribution"):
                    gr.Markdown("""DATE-LM Task Description: Identifying the specific training examples that support a model's generated facts.

                    Ranking Metric: average of **Recall@50** and **MRR**""")
                    category_filter6 = gr.Dropdown(
                        choices=get_model_sizes("Factual Attribution"),
                        value="All",
                        label="Filter Model Size"
                    )
                    l6 = Leaderboard(
                        value=pd.DataFrame(factual_data),
                        select_columns=get_leaderboard_columns("Factual Attribution"),
                        search_columns=["Method"],
                        filter_columns=["Attribution Method Type", "Model", "Recall@50", "MRR"],
                    )
                    data_path6 = gr.Textbox(value=factual, visible=False)
                    id_str6 = gr.Textbox(value="factual", visible=False)
                    category_filter6.change(
                        fn=rerank_leaderboard,
                        inputs=[category_filter6, data_path6, id_str6],
                        outputs=[l6]
                    )
        with gr.TabItem("Submit Scores 🚀"):
            with gr.Column():
                gr.Markdown("""### Submit Your Score to a Leaderboard

                Note: Please first select the leaderboard you would like to submit to. This displays
                the input fields for the metrics that leaderboard requires.
                """)

                leaderboard_dropdown = gr.Dropdown(
                    label="Select Leaderboard",
                    choices=nameMapping.LEADERBOARD_NAMES,
                    value=None
                )

                method_name = gr.Textbox(label="Method Name")
                method_dropdown = gr.Dropdown(
                    label="Method Type",
                    choices=["Gradient", "Similarity", "Representation-Based", "Modeling", "Baseline", "Lexical", "Other"],
                    value=None
                )

                model_name = gr.Textbox(label="Model Name")
                model_size = gr.Textbox(label="Model Size (ex. 410M, 1B, 8B)")

                paper_link = gr.Textbox(label="Paper/Code/Contact Link")

                scores = gr.File(label="Upload Data Attribution Scores File (.pt)", height=150, file_types=[".pt"])

                # Per-leaderboard metric groups; only the group matching the
                # selected leaderboard is shown (see update_fields).
                with gr.Column(visible=False) as pretrain_group:
                    pre_avg = gr.Number(label="Avg")
                    pre_sciq = gr.Number(label="sciq")
                    pre_arc_easy = gr.Number(label="arc_easy")
                    pre_arc_chall = gr.Number(label="arc_challenge")
                    pre_logiqa = gr.Number(label="logiqa")
                    pre_boolq = gr.Number(label="boolq")
                    pre_hellaswag = gr.Number(label="hellaswag")
                    pre_piqa = gr.Number(label="piqa")
                    pre_wino = gr.Number(label="winogrande")
                    pre_open = gr.Number(label="openbookqa")

                with gr.Column(visible=False) as finetune_group:
                    fine_mmlu = gr.Number(label="MMLU")
                    fine_gsm = gr.Number(label="GSM8K")
                    fine_bbh = gr.Number(label="BBH")

                with gr.Column(visible=False) as toxicity_group:
                    tox_toxicChat = gr.Number(label="ToxicChat")
                    tox_xsTest = gr.Number(label="XSTest-response")
                    tox_jbb = gr.Number(label="JailBreakBench")
                    tox_auprc = gr.Number(label="AUPRC")

                with gr.Column(visible=False) as factual_group:
                    fac_recall = gr.Number(label="Recall@50")
                    fac_mrr = gr.Number(label="MRR")

                submit_button = gr.Button("Submit")

                leaderboard_dropdown.change(
                    update_fields,
                    inputs=[leaderboard_dropdown],
                    outputs=[pretrain_group, finetune_group, toxicity_group, factual_group]
                )

                inputsList = [leaderboard_dropdown, method_name, method_dropdown, model_name, model_size, paper_link, scores,
                              pre_avg, pre_sciq, pre_arc_easy, pre_arc_chall, pre_logiqa, pre_boolq, pre_hellaswag, pre_piqa, pre_wino, pre_open,
                              fine_mmlu, fine_gsm, fine_bbh,
                              tox_toxicChat, tox_xsTest, tox_jbb, tox_auprc,
                              fac_recall, fac_mrr]

                submit_button.click(
                    validate_inputs, inputs=inputsList, outputs=[]
                ).success(fn=pr.submit_and_open_PR, inputs=inputsList, outputs=[gr.Textbox(label="Opened PR on GitHub")])
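
                # Flow sketch: validate_inputs raises gr.Error on invalid input,
                # which aborts the chain; pr.submit_and_open_PR runs only via
                # .success() after validation passes.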


if __name__ == "__main__":
    demo.launch(debug=True)