import gradio as gr
from gradio_leaderboard import Leaderboard
import pandas as pd
import json
import os

import filePaths 
import nameMapping
import pr

##################### Leaderboard Paths + Variables #####################

# Paths of the six leaderboard JSON files. The unpacking assumes
# filePaths.PATHLIST lists them in exactly this order -- TODO confirm
# against filePaths.
pathLst = filePaths.PATHLIST
(
    pretrain_10K,
    pretrain_30K,
    finetune,
    toxicity_homogeneous,
    toxicity_heterogeneous,
    factual,
) = (pathLst[slot] for slot in range(6))

import nameMapping
# Module-level aliases for the leaderboard name collections in nameMapping.
leaderboard_names = nameMapping.LEADERBOARD_NAMES
trainingNamesSet = nameMapping.TRAINING_LEADERBOARDS

########################## Data Loading ###########################

def load_leaderboard_data(file_path):
    """
    Load leaderboard data from JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or an empty list when nothing exists at
        ``file_path``.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid JSON.
    """
    if not os.path.exists(file_path):
        return []
    # JSON text is UTF-8; decoding with the platform's locale encoding would
    # garble (or fail on) non-ASCII method and model names.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def add_ranking_column(data, id):
    """
    Order leaderboard rows by their ranking score and number them.

    The score depends on ``id``:
      - 'toxicity': the "AUPRC" value
      - 'factual':  mean of "Recall@50" and "MRR"
      - 'pretrain': the "avg" value
      - anything else (fine-tuning): mean of "MMLU", "GSM8K" and "BBH"

    Each row dict receives a 1-based "Rank" key (the dicts in ``data`` are
    modified in place). Returns a new list ordered from best to worst score.
    """
    def _finetune_score(row):
        return (row["MMLU"] + row["GSM8K"] + row["BBH"]) / 3

    scorers = (
        ("toxicity", lambda row: row["AUPRC"]),
        ("factual", lambda row: (row["Recall@50"] + row["MRR"]) / 2),
        ("pretrain", lambda row: row["avg"]),
    )
    # First matching leaderboard type wins; unmatched ids use the fine-tune score.
    score_of = next((fn for name, fn in scorers if id == name), _finetune_score)

    ranked = sorted(data, key=score_of, reverse=True)
    for position, row in enumerate(ranked, start=1):
        row["Rank"] = position
    return ranked

def load_data(filePath, id):
    """
    Build the ranked leaderboard table for one JSON file.

    Reads the rows stored at ``filePath``, ranks them for the leaderboard
    type ``id`` and returns the result as a pandas DataFrame.
    """
    rows = load_leaderboard_data(filePath)
    ranked_rows = add_ranking_column(rows, id)
    return pd.DataFrame(ranked_rows)

# Ranked tables shown when the page first loads, one per leaderboard file,
# loaded in the same order as the names on the left-hand side.
(
    pretrain_10K_data,
    pretrain_30K_data,
    finetune_data,
    homogeneous_data,
    heterogeneous_data,
    factual_data,
) = [
    load_data(board_path, board_type)
    for board_path, board_type in (
        (pretrain_10K, "pretrain"),
        (pretrain_30K, "pretrain"),
        (finetune, "finetune"),
        (toxicity_homogeneous, "toxicity"),
        (toxicity_heterogeneous, "toxicity"),
        (factual, "factual"),
    )
]

########################## Leaderboard Columns + Helpers ###########################

def get_leaderboard_columns(leaderboard_name):
    """
    Look up the column list that belongs to a leaderboard.

    The name is checked against the toxicity, factual and fine-tune groups of
    nameMapping.DROPDOWN_NAME_MAPPING, in that order; a name found in none of
    them gets the pretrain columns.
    """
    groups = nameMapping.DROPDOWN_NAME_MAPPING
    for group_key, columns_attr in (
        ("toxicity", "TOXICITY_COLS"),
        ("factual", "FACTUAL_COLS"),
        ("finetune", "FINETUNE_COLS"),
    ):
        if leaderboard_name in groups[group_key]:
            return getattr(nameMapping, columns_attr)
    # Fallback: everything else is treated as a pretrain leaderboard.
    return nameMapping.PRETRAIN_COLS

def get_model_sizes(leaderboard_name):
    """
    Returns Model Sizes for Applications Leaderboards

    Args:
        leaderboard_name: One of "Homogeneous", "Heterogeneous" or
            "Factual Attribution".

    Returns:
        A list starting with 'All', followed by each distinct "Model Size"
        value found in the leaderboard's JSON rows, in order of first
        appearance.

    Raises:
        KeyError: If ``leaderboard_name`` is not one of the names above, or a
            row has no "Model Size" entry.
    """
    nameFileMapping = {"Homogeneous": toxicity_homogeneous,
                       "Heterogeneous": toxicity_heterogeneous,
                       "Factual Attribution": factual}
    leaderboardJson = load_leaderboard_data(nameFileMapping[leaderboard_name])

    # dict.fromkeys de-duplicates while keeping first-seen order. A plain set
    # would hand back the sizes in an order that can change from one process
    # to the next, shuffling the dropdown choices between app launches.
    modelSizes = dict.fromkeys(row["Model Size"] for row in leaderboardJson)

    return ['All'] + list(modelSizes)

################### Submission Helper Functions #############################

def update_fields(leaderboard):
    """
    Show only the metric group that matches the selected leaderboard.

    Returns a dict mapping each metric group container to a gr.update whose
    ``visible`` flag tells whether ``leaderboard`` belongs to that group in
    nameMapping.DROPDOWN_NAME_MAPPING.
    """
    membership = nameMapping.DROPDOWN_NAME_MAPPING
    containers = (
        ('pretrain', pretrain_group),
        ('finetune', finetune_group),
        ('toxicity', toxicity_group),
        ('factual', factual_group),
    )
    return {
        container: gr.update(visible=(leaderboard in membership[group_key]))
        for group_key, container in containers
    }

def validate_inputs(*inputFields):
    """
    Validate the submission form values before a submission is processed.

    Args:
        *inputFields: The form values, positionally, in the order unpacked
            below: general submission info first, then the pretrain,
            fine-tune, toxicity and factual metric fields.

    Returns:
        None. Passing validation is signalled by returning without raising.

    Raises:
        gr.Error: If a required field is empty, no scores file was uploaded,
            the leaderboard is not recognised, or a metric of the selected
            leaderboard is missing or not strictly positive.
    """
    (leaderboard_dropdown, method_name, method_dropdown, model_name, model_size, paper_link, scores,
     pre_avg, pre_sciq, pre_arc_easy, pre_arc_chall, pre_logiqa,
     pre_boolq, pre_hellaswag, pre_piqa, pre_wino, pre_open,
     fine_mmlu, fine_gsm, fine_bbh,
     tox_toxicChat, tox_xsTest, tox_jbb, tox_auprc,
     fac_recall, fac_mrr) = inputFields

    if not all([leaderboard_dropdown, model_name, method_name, method_dropdown, model_size]):
        raise gr.Error("All fields must be filled out and with the correct type.")

    if not paper_link:
        raise gr.Error("Please fill out the Paper/Code/Contact Link info.")

    if not scores:
        raise gr.Error("Please upload data attribution scores in .pt file.")

    # Only the metrics of the selected leaderboard's group are validated;
    # the fields of the other groups are ignored.
    nameMap = nameMapping.DROPDOWN_NAME_MAPPING
    if leaderboard_dropdown in nameMap['pretrain']:
        metricsList = [pre_avg, pre_sciq, pre_arc_easy, pre_arc_chall, pre_logiqa, pre_boolq, pre_hellaswag, pre_piqa, pre_wino, pre_open]
    elif leaderboard_dropdown in nameMap['finetune']:
        metricsList = [fine_mmlu, fine_gsm, fine_bbh]
    elif leaderboard_dropdown in nameMap['toxicity']:
        metricsList = [tox_toxicChat, tox_xsTest, tox_jbb, tox_auprc]
    elif leaderboard_dropdown in nameMap['factual']:
        metricsList = [fac_recall, fac_mrr]
    else:
        # Without this branch metricsList would be unbound below and the user
        # would see an unrelated internal error instead of a form message.
        raise gr.Error("Please select a valid leaderboard.")

    # An empty number field is expected to arrive as None (TODO confirm for
    # the installed Gradio version). Testing for None explicitly keeps a
    # literal 0 from being reported as "not filled out".
    if any(metric is None for metric in metricsList):
        raise gr.Error("Metrics must be filled out.")
    if not all(metric > 0 for metric in metricsList):
        raise gr.Error("Metrics must be positive.")

    
######## Dynamically Update Ranking when Filtering on Model Size ###############

def update_rankings(filtered_df, id):
    """
    Re-rank an already filtered leaderboard table.

    Args:
        filtered_df: DataFrame holding the rows to rank. It is not modified.
        id: Leaderboard type. 'toxicity' sorts by "AUPRC"; 'factual' sorts by
            the row-wise mean of "Recall@50" and "MRR". For any other value
            the incoming row order is kept.

    Returns:
        A new DataFrame, best row first, whose "Rank" column runs from 1 to
        the number of rows. Rows with equal scores keep their incoming
        relative order.
    """
    df_with_rank = filtered_df.copy() # create copy to avoid modifying original

    # An empty frame may have no columns at all (e.g. the JSON file was
    # missing), in which case sorting by a metric would raise KeyError.
    if not df_with_rank.empty:
        if id == 'toxicity': # Toxicity: AUPRC
            # mergesort is stable, so ties are ordered the same way as in the
            # initial ranking, which uses Python's stable sorted().
            df_with_rank = df_with_rank.sort_values(
                by="AUPRC", ascending=False, kind="mergesort"
            )
        elif id == 'factual': # Factual: Avg of Recall@50 and MRR
            average_scores = df_with_rank[["Recall@50", "MRR"]].mean(axis=1)
            # Sort through a temporary column rather than re-selecting rows by
            # index label, which would duplicate rows on a non-unique index.
            score_column = "_rank_score"
            df_with_rank = (
                df_with_rank
                .assign(**{score_column: average_scores.to_numpy()})
                .sort_values(by=score_column, ascending=False, kind="mergesort")
                .drop(columns=score_column)
            )

    df_with_rank["Rank"] = range(1, len(df_with_rank) + 1) # Add rank column

    return df_with_rank

def filter_and_rank(df, filter_value, id):
    """
    Restrict a leaderboard table to one model size and re-rank the result.

    ``filter_value`` "All" keeps every row; any other value keeps only the
    rows whose "Model Size" equals it. The selection is then ranked for the
    leaderboard type ``id``.
    """
    selection = df if filter_value == "All" else df[df["Model Size"] == filter_value]
    return update_rankings(selection, id)

def rerank_leaderboard(filter_value, dfPath, idNum):
    """
    Reload a leaderboard file and return it filtered by model size and re-ranked.

    ``dfPath`` is the JSON file to load, ``idNum`` the leaderboard type used
    for ranking, and ``filter_value`` the model size to keep ("All" keeps
    every row).
    """
    return filter_and_rank(load_data(dfPath, idNum), filter_value, idNum)

#################### Leaderboards Code ##############################

# Page layout: an introduction, then three top-level tabs -- training data
# selection leaderboards, application leaderboards (with a model-size filter
# that re-ranks the table), and the score submission form.
with gr.Blocks(css="""
    body, .gradio-container {
        font-family: 'roboto';
    }
""") as demo:
    gr.Markdown("""
    # Data Attribution Methods Leaderboards
    """)
    gr.Markdown("""
    Survey and ranking of data attribution methods on data selection and 
                downstream application tasks for the DATE-LM Evaluation paper.

    **Leaderboard Submission**:                        
    - To submit your team's scores, click on the "Submit Scores" tab.
                
    **Data Attribution Method Categories**: 
    - Gradient (ex. GradDot, GradSim, LESS, DataInf, EKFAC)
    - Similarity (ex. RepSim)
    - Modeling (ex. MATES)
    - Lexical (ex. BM25)
    - Baseline (ex. GradSafe, OpenAI Moderation, LLM Classifiers)
    - Other                                

    **Search Feature**: 
    - Input the name of the method you would like to search / filter for, and
                then press "Enter". The original row from the leaderboard table will be displayed.   
    """
    )

    with gr.Tabs():
        with gr.TabItem("Training Data Selection"):
            with gr.Tabs():  # Subtabs container
                with gr.TabItem("Pre-Training (10K)"):  # Subtab
                    gr.Markdown("""DATE-LM Task Description: Trained pythia-1B model on Fineweb using 
                                Lambada reference dataset. Testing results conducted on 10K step checkpoint.
                                
                                Ranking Metric: highest score in **avg** column""") # description
                    l1 = Leaderboard(
                            value=pd.DataFrame(pretrain_10K_data),
                            select_columns=get_leaderboard_columns("Pre-Training (10K)"),
                            search_columns=['Method'],
                            filter_columns=["Attribution Method Type", "Method", "avg"],
                        )
                with gr.TabItem("Pre-Training (30K)"):
                    gr.Markdown("""DATE-LM Task Description: Trained pythia-1B model on Fineweb using 
                                Lambada reference dataset. Testing results conducted on 30K step checkpoint.
                                
                                Ranking Metric: highest score in **avg** column""")
                    l2 = Leaderboard(
                        value=pd.DataFrame(pretrain_30K_data),
                        select_columns=get_leaderboard_columns("Pre-Training (30K)"),
                        search_columns=["Method"],
                        filter_columns=["Attribution Method Type", "Method", "avg"],
                    )
                with gr.TabItem("Fine-Tuning"):
                    gr.Markdown("""DATE-LM Task Description: Targeted instruction tuning setting.
                                 Given a diverse instruction set and a eval dataset, we select data that would yield 
                                optimal performance on the eval data. For this task, the training data pool is 
                                Tulu3 (unfiltered) and the eval data is MMLU, GSM8K, and BBH. 
                                
                                Ranking Metric: average of the **MMLU**, **GSM8K**, and **BBH** scores""")
                    l3 = Leaderboard(
                        value=pd.DataFrame(finetune_data),
                        select_columns=get_leaderboard_columns("Fine-Tuning"),
                        search_columns=["Method"],
                        filter_columns=["Attribution Method Type", "MMLU", "GSM8K", "BBH"],
                    )
        with gr.TabItem("Applications"):
            with gr.Tabs():
                with gr.TabItem("Toxicity/Bias"):
                    with gr.Tabs(): 
                        with gr.TabItem("Homogeneous"):
                            gr.Markdown("""DATE-LM Task Description: This leaderboard presents detection AUPRC results of baseline methods and data attribution methods in the homogeneous setting 
                                        (i.e., detecting small amount of toxic/biased data embedded into larger benign data).
                                        
                                        Ranking Metric: **AUPRC** (an average of ToxicChat, XSTest-response, JailBreakBench)""")
                            category_filter4 = gr.Dropdown(
                                choices=get_model_sizes("Homogeneous"),
                                value="All",
                                label="Filter Model Size"
                            ) # ensures page placement above leaderboard
                            l4 = Leaderboard(
                                value=pd.DataFrame(homogeneous_data),
                                select_columns=get_leaderboard_columns("Homogeneous"),
                                search_columns=["Method"],
                                filter_columns=["Attribution Method Type", "Model", "AUPRC"],
                            )
                            # The file path and leaderboard type are bound here on the
                            # server instead of being round-tripped through hidden
                            # components, so a client cannot substitute another path.
                            category_filter4.change(
                                fn=lambda size: rerank_leaderboard(size, toxicity_homogeneous, "toxicity"),
                                inputs=[category_filter4],
                                outputs=[l4]
                            )
                        with gr.TabItem("Heterogeneous"):
                            gr.Markdown("""DATE-LM Task Description: This leaderboard presents detection AUPRC results of baseline methods and data attribution methods in the heterogeneous setting 
                                        (i.e., safety-aligned examples that resemble unsafe data in format but contain safe responses).
                                        
                                        Ranking Metric: **AUPRC** (an average of ToxicChat, XSTest-response, JailBreakBench)""")
                            category_filter5 = gr.Dropdown(
                                choices=get_model_sizes("Heterogeneous"),
                                value="All",
                                label="Filter Model Size"
                            )
                            l5 = Leaderboard(
                                value=pd.DataFrame(heterogeneous_data),
                                select_columns=get_leaderboard_columns("Heterogeneous"),
                                search_columns=["Method"],
                                filter_columns=["Attribution Method Type", "Model", "AUPRC"]
                            )
                            # Path and type bound server-side (see Homogeneous tab rationale:
                            # the client only supplies the selected model size).
                            category_filter5.change(
                                fn=lambda size: rerank_leaderboard(size, toxicity_heterogeneous, "toxicity"),
                                inputs=[category_filter5],
                                outputs=[l5]
                            )
                with gr.TabItem("Factual Attribution"):
                    gr.Markdown("""DATE-LM Task Description: Identifying the specific training examples that support a model's generated facts.
                                        
                                   Ranking Metric: average of **Recall@50** and **MRR**""")
                    category_filter6 = gr.Dropdown(
                        choices=get_model_sizes("Factual Attribution"),
                        value="All",
                        label="Filter Model Size"
                    )
                    l6 = Leaderboard(
                        value=pd.DataFrame(factual_data),
                        select_columns=get_leaderboard_columns("Factual Attribution"),
                        search_columns=["Method"],
                        filter_columns=["Attribution Method Type", "Model", "Recall@50", "MRR"],
                    )
                    # Only the selected model size comes from the client; the file
                    # path and leaderboard type stay on the server.
                    category_filter6.change(
                        fn=lambda size: rerank_leaderboard(size, factual, "factual"),
                        inputs=[category_filter6],
                        outputs=[l6]
                    )
        with gr.TabItem("Submit Scores 🚀"):
            with gr.Column():
                gr.Markdown("""### Submit Your Score to a Leaderboard
                            
                Note: Please first select the leaderboard you would like to submit to. This will display the fields for the 
                            corresponding metrics that are needed. 
                """)

                leaderboard_dropdown = gr.Dropdown(
                    label="Select Leaderboard",
                    choices=nameMapping.LEADERBOARD_NAMES,
                    value=None
                )

                method_name = gr.Textbox(label="Method Name")
                method_dropdown = gr.Dropdown(
                    label="Method Type",
                    choices=["Gradient", "Similarity", "Representation-Based", "Modeling", "Baseline", "Lexical", "Other"],
                    value=None
                )

                model_name = gr.Textbox(label="Model Name")
                model_size = gr.Textbox(label="Model Size (ex. 410M, 1B, 8B)")

                paper_link = gr.Textbox(label="Paper/Code/Contact Link") 
                
                scores = gr.File(label='Upload Data Attribution Scores File (.pt)', height=150, file_types=[".pt"])
                
                # Metric fields, one hidden group per leaderboard type; update_fields
                # reveals the group matching the selected leaderboard.

                with gr.Column(visible=False) as pretrain_group:
                    pre_avg = gr.Number(label="Avg")
                    pre_sciq = gr.Number(label="sciq")
                    pre_arc_easy = gr.Number(label="arc_easy")
                    pre_arc_chall = gr.Number(label="arc_challenge")
                    pre_logiqa = gr.Number(label="logiqa")
                    pre_boolq = gr.Number(label="boolq")
                    pre_hellaswag = gr.Number(label="hellaswag")
                    pre_piqa = gr.Number(label="piqa")
                    pre_wino = gr.Number(label="winogrande")
                    pre_open = gr.Number(label="openbookqa")

                with gr.Column(visible=False) as finetune_group:
                    fine_mmlu = gr.Number(label="MMLU")
                    fine_gsm = gr.Number(label="GSM8K")
                    fine_bbh = gr.Number(label="BBH")

                with gr.Column(visible=False) as toxicity_group: 
                    tox_toxicChat = gr.Number(label="ToxicChat")
                    tox_xsTest = gr.Number(label="XSTest-response")
                    tox_jbb = gr.Number(label="JailBreakBench")
                    tox_auprc = gr.Number(label="AUPRC") 

                with gr.Column(visible=False) as factual_group:
                    fac_recall = gr.Number(label="Recall@50")
                    fac_mrr = gr.Number(label="MRR")

                # Submit button
                submit_button = gr.Button("Submit")

                leaderboard_dropdown.change(update_fields, inputs=[leaderboard_dropdown], outputs=[pretrain_group, finetune_group, toxicity_group, factual_group])
                
                # Form values handed to both handlers below. The order must match
                # the positional unpacking in validate_inputs.
                inputsList = [leaderboard_dropdown, method_name, method_dropdown, model_name, model_size, paper_link, scores,
                              pre_avg, pre_sciq, pre_arc_easy, pre_arc_chall, pre_logiqa, pre_boolq, pre_hellaswag, pre_piqa, pre_wino, pre_open,
                              fine_mmlu, fine_gsm, fine_bbh,
                              tox_toxicChat, tox_xsTest, tox_jbb, tox_auprc,
                              fac_recall, fac_mrr]
                
                # Receives the value returned by pr.submit_and_open_PR.
                pr_status = gr.Textbox(label="Opened PR on Github")

                # The PR step is chained with .success(), so it runs only when
                # validate_inputs finished without raising.
                submit_button.click(
                    validate_inputs, inputs=inputsList, outputs=[]
                ).success(fn=pr.submit_and_open_PR, inputs=inputsList, outputs=[pr_status])
                
if __name__ == "__main__":
    demo.launch(debug=True)