'''
Adapted from https://github.com/OATML-Markslab/ProteinGym/blob/main/proteingym/performance_DMS_benchmarks.py
Changes:
- Added ability to restrict analysis to a subset of DMS IDs, when they are specified in main.py
- Evaluates only models that are specified in main.py
'''
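# Example invocation (illustrative sketch only: the script name, folder paths, model names
# and DMS ids below are placeholders; the flags correspond to the argparse options in main()):
#   python performance_DMS_benchmarks.py \
#       --input_scoring_files_folder ./outputs/zero_shot_scores \
#       --output_performance_file_folder ./outputs/tranception_performance \
#       --DMS_reference_file_path ./reference_files/DMS_substitutions.csv \
#       --selected_model_names model_A model_B \
#       --dms_ids DMS_id_1 DMS_id_2 \
#       --scoring_method masked_marginal \
#       --performance_by_depth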
import pandas as pd
import numpy as np
import os
import argparse
from scipy.stats import spearmanr
from sklearn.metrics import roc_auc_score, matthews_corrcoef
import warnings
import json
warnings.simplefilter(action='ignore', category=FutureWarning)
            
def minmax(x):
    return ( (x - np.min(x)) / (np.max(x) - np.min(x)) ) 

def calc_ndcg(y_true, y_score, **kwargs):
    '''
    Inputs:
        y_true: an array of the true scores where higher score is better
        y_score: an array of the predicted scores where higher score is better
    Options:
        quantile: If True, uses the top k quantile of the distribution
        top: under the quantile setting, this is the top quantile to
            keep in the gains calculation. This is a PERCENTAGE (i.e., input 10 for the top 10%)
    Notes:
        Currently we're calculating NDCG on the continuous value of the DMS
        I tried it on the binary value as well and the metrics seemed mostly
        the same.
    '''
    if 'quantile' not in kwargs:
        kwargs['quantile'] = True
    if 'top' not in kwargs:
        kwargs['top'] = 10
    if kwargs['quantile']:
        k = np.floor(y_true.shape[0]*(kwargs['top']/100)).astype(int)
    else:
        k = kwargs['top']
    if isinstance(y_true, pd.Series):
        y_true = y_true.values
    if isinstance(y_score, pd.Series):
        y_score = y_score.values
    gains = minmax(y_true)
    ranks = np.argsort(np.argsort(-y_score)) + 1
    
    if k == 'all':
        k = len(ranks)
    #sub to top k
    ranks_k = ranks[ranks <= k]
    gains_k = gains[ranks <= k]
    #all terms with a gain of 0 go to 0
    ranks_fil = ranks_k[gains_k != 0]
    gains_fil = gains_k[gains_k != 0]
    
    #if none of the ranks made it return 0
    if len(ranks_fil) == 0:
        return (0)
    
    #discounted cumulative gains
    dcg = np.sum([g/np.log2(r+1) for r,g in zip(ranks_fil, gains_fil)])
    
    #ideal dcg - calculated based on the top k actual gains
    ideal_ranks = np.argsort(np.argsort(-gains)) + 1
    ideal_ranks_k = ideal_ranks[ideal_ranks <= k]
    ideal_gains_k = gains[ideal_ranks <= k]
    ideal_ranks_fil = ideal_ranks_k[ideal_gains_k != 0]
    ideal_gains_fil = ideal_gains_k[ideal_gains_k != 0]
    idcg = np.sum([g/np.log2(r+1) for r,g in zip(ideal_ranks_fil, ideal_gains_fil)])
    
    #normalize
    ndcg = dcg/idcg
    
    return (ndcg)
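
# Illustrative sanity check (not part of the original script): with quantile=False and
# top=2, perfectly correlated true and predicted scores yield an NDCG of 1.0, e.g.
#   calc_ndcg(np.array([0.0, 1.0, 0.5, 0.25]),
#             np.array([0.0, 1.0, 0.5, 0.25]), quantile=False, top=2)  # -> 1.0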

def calc_toprecall(true_scores, model_scores, top_true=10, top_model=10):
    """
    Fraction of the true top `top_true`% variants that are recovered within the
    model's top `top_model`% predictions (both thresholds are percentages).
    """
    top_true_mask = (true_scores >= np.percentile(true_scores, 100-top_true))
    top_model_mask = (model_scores >= np.percentile(model_scores, 100-top_model))
    
    TP = (top_true_mask) & (top_model_mask)
    recall = TP.sum() / (top_true_mask.sum()) if top_true_mask.sum() > 0 else 0
    
    return (recall)

def standardization(x):
    """Assumes input is numpy array or pandas series"""
    return (x - x.mean()) / x.std()

def compute_bootstrap_standard_error(df, number_assay_reshuffle=10000):
    """
    Computes the non-parametric bootstrap standard error for the mean estimate of a given performance metric (eg., Spearman, AUC) across DMS assays (ie., the sample standard deviation of the mean across bootstrap samples)
    """
    model_names = df.columns
    mean_performance_across_samples = []
    for sample in range(number_assay_reshuffle):
        mean_performance_across_samples.append(df.sample(frac=1.0, replace=True).mean(axis=0)) #Resample a dataset of the same size (with replacement) then take the sample mean
    mean_performance_across_samples=pd.DataFrame(data=mean_performance_across_samples,columns=model_names)
    return mean_performance_across_samples.std(ddof=1)
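
# Illustrative use (assumption: `spearman_by_DMS` is a DataFrame with one row per DMS
# assay and one column of Spearman values per model); returns one standard error per model column:
#   bootstrap_se = compute_bootstrap_standard_error(spearman_by_DMS, number_assay_reshuffle=1000)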

def compute_bootstrap_standard_error_functional_categories(df, number_assay_reshuffle=10000):
    """
    Same as compute_bootstrap_standard_error, except that DMS assays are bootstrapped within each functional category ("Selection Type") separately; the per-category bootstrap means are then averaged, and the standard deviation of that average across bootstrap samples is returned
    """
    model_names = df.columns
    mean_performance_across_samples = {}
    for category, group in df.groupby("Selection Type"):
        mean_performance_across_samples[category] = []
        for sample in range(number_assay_reshuffle):
            mean_performance_across_samples[category].append(group.sample(frac=1.0, replace=True).mean(axis=0)) #Resample a dataset of the same size (with replacement) then take the sample mean
        mean_performance_across_samples[category]=pd.DataFrame(data=mean_performance_across_samples[category])
    categories = list(mean_performance_across_samples.keys())
    combined_averages = mean_performance_across_samples[categories[0]].copy()
    for category in categories[1:]:
        combined_averages += mean_performance_across_samples[category]
    combined_averages /= len(categories)
    return combined_averages.std(ddof=1)


proteingym_folder_path = os.path.dirname(os.path.realpath(__file__))

def main():
    parser = argparse.ArgumentParser(description='ProteinGym performance analysis')
    parser.add_argument('--input_scoring_files_folder', type=str, help='Name of folder where all input scores are present (expects one scoring file per DMS)')
    parser.add_argument('--output_performance_file_folder', default='./outputs/tranception_performance', type=str, help='Name of folder where to save performance analysis files')
    parser.add_argument('--DMS_reference_file_path', type=str, help='Reference file with list of DMSs to consider')
    parser.add_argument('--indel_mode', action='store_true', help='Whether to score sequences with insertions and deletions')
    parser.add_argument('--performance_by_depth', action='store_true', help='Whether to compute performance by mutation depth')
    parser.add_argument('--config_file', default=f'{os.path.dirname(proteingym_folder_path)}/config.json', type=str, help='Path to config file containing model information')
    parser.add_argument('--selected_model_names', nargs='+', default=None, help='Names of model score columns to evaluate; if omitted, all models listed in the config file are considered')
    parser.add_argument('--dms_ids', nargs='+', default=None, help='Subset of DMS ids to include; if omitted, all DMS in the reference are used')
    parser.add_argument('--scoring_method', choices=['masked_marginal','wildtype_marginal','mutant_marginal','pll','global_log_prob'], default='masked_marginal', help='Which zero-shot scoring file suffix to read')
    args = parser.parse_args()
    
    mapping_protein_seq_DMS = pd.read_csv(args.DMS_reference_file_path)
    # Optionally restrict analysis to a subset of DMS IDs
    if args.dms_ids is not None and len(args.dms_ids) > 0:
        requested_dms_ids = set(str(x) for x in args.dms_ids)
        mapping_protein_seq_DMS = mapping_protein_seq_DMS[mapping_protein_seq_DMS['DMS_id'].astype(str).isin(requested_dms_ids)]
        if mapping_protein_seq_DMS.empty:
            print("No matching DMS ids after filtering; nothing to compute.")
            return
    mapping_protein_seq_DMS["MSA_Neff_L_category"] = mapping_protein_seq_DMS["MSA_Neff_L_category"].apply(lambda x: x[0].upper() + x[1:] if type(x) == str else x)
    num_DMS=len(mapping_protein_seq_DMS)
    print("There are {} DMSs in mapping file".format(num_DMS))
    
    with open(args.config_file) as f:
        config = json.load(f)
    with open(f"{os.path.dirname(os.path.realpath(__file__))}/constants.json") as f:
        constants = json.load(f)
    uniprot_function_lookup = mapping_protein_seq_DMS[["UniProt_ID","coarse_selection_type"]].drop_duplicates()
    uniprot_function_lookup.columns = ["UniProt_ID", "Selection Type"]
    uniprot_Neff_lookup = mapping_protein_seq_DMS[['UniProt_ID','MSA_Neff_L_category']].drop_duplicates()
    uniprot_Neff_lookup.columns=['UniProt_ID','MSA_Neff_L_category']
    uniprot_taxon_lookup = mapping_protein_seq_DMS[['UniProt_ID','taxon']].drop_duplicates()
    uniprot_taxon_lookup.columns=['UniProt_ID','Taxon']
    if args.indel_mode:
        args.performance_by_depth = False

    if not args.indel_mode:
        score_variables = list(config.get("model_list_zero_shot_substitutions_DMS", {}).keys())
    else:
        score_variables = list(config.get("model_list_zero_shot_indels_DMS", {}).keys())
    if not os.path.isdir(args.output_performance_file_folder):
        os.makedirs(args.output_performance_file_folder, exist_ok=True)
    for metric in ['Spearman','AUC','MCC',"NDCG","Top_recall"]:
        if not os.path.isdir(args.output_performance_file_folder+os.sep+metric):
            os.mkdir(args.output_performance_file_folder+os.sep+metric)
    
    # Will populate model_types dynamically after scanning available model columns
    model_types = pd.DataFrame(columns=['Model type'])
    model_details = pd.DataFrame.from_dict(constants.get("model_details", {}), columns=['Model details'], orient='index')
    model_references = pd.DataFrame.from_dict(constants.get("model_references", {}), columns=['References'], orient='index')
    clean_names = constants.get("clean_names", {})
    performance_all_DMS={}
    output_filename={}
    for metric in ['Spearman','AUC','MCC', "NDCG", "Top_recall"]:
        performance_all_DMS[metric] = {}
        mutation_type = "substitutions" if not args.indel_mode else "indels"
        output_filename[metric] = "DMS_" + mutation_type + "_" + metric
        # Start with only metadata rows; model rows will be added dynamically via outer merges
        performance_all_DMS[metric]['number_mutants'] = -1
        performance_all_DMS[metric]["Selection Type"] = -1
        performance_all_DMS[metric]["UniProt_ID"] = -1
        performance_all_DMS[metric]['MSA_Neff_L_category'] = -1
        performance_all_DMS[metric]['Taxon'] = -1
        performance_all_DMS[metric] = pd.DataFrame.from_dict(performance_all_DMS[metric], orient='index').reset_index()
        performance_all_DMS[metric].columns = ['score', 'score_index']

    list_DMS = mapping_protein_seq_DMS["DMS_id"]
    all_models_found = set()
    for DMS_id in list_DMS:
        try:
            print(DMS_id)    
            UniProt_ID = mapping_protein_seq_DMS["UniProt_ID"][mapping_protein_seq_DMS["DMS_id"]==DMS_id].values[0]
            selection_type = mapping_protein_seq_DMS["coarse_selection_type"][mapping_protein_seq_DMS["DMS_id"]==DMS_id].values[0]
            MSA_Neff_L_category = mapping_protein_seq_DMS["MSA_Neff_L_category"][mapping_protein_seq_DMS["DMS_id"]==DMS_id].values[0]
            Taxon = mapping_protein_seq_DMS["taxon"][mapping_protein_seq_DMS["DMS_id"]==DMS_id].values[0]
            if args.indel_mode:
                suffix = 'pll'
            else:
                suffix = args.scoring_method
            score_path = os.path.join(args.input_scoring_files_folder, f"{DMS_id}_zs_{suffix}.csv")
            if not os.path.exists(score_path):
                print(f"Scoring file for {DMS_id} missing at {score_path}; skipping this DMS")
                continue
            merged_scores = pd.read_csv(score_path, dtype={'target_seq': str})
            if 'mutant' not in merged_scores: merged_scores['mutant'] = merged_scores['mutated_sequence'] #if mutant not in DMS file we default to mutated_sequence (eg., for indels)
            # Ensure binary labels exist for AUC/MCC if not provided
            if 'DMS_score_bin' not in merged_scores and 'DMS_score' in merged_scores:
                median_cutoff = merged_scores['DMS_score'].median()
                merged_scores['DMS_score_bin'] = (merged_scores['DMS_score'] >= median_cutoff).astype(int)
            if 'DMS_score' not in merged_scores:
                print(f"DMS_score column missing for {DMS_id}; skipping this DMS")
                continue
        except Exception as e:
            print(f"Could not load scoring file for {DMS_id}: {e}")
            continue

        if not args.indel_mode and args.performance_by_depth:
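            # Mutation depth = number of ':'-separated substitutions in the 'mutant' string (e.g. a double mutant "A1C:D2E" has depth 2); depths of 5 or more are pooled into the '5+' bucket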
            if 'mutant' in merged_scores:
                merged_scores['mutation_depth'] = merged_scores['mutant'].apply(lambda x: len(x.split(":")))
                merged_scores['mutation_depth_grouped'] = merged_scores['mutation_depth'].apply(lambda x: '5+' if x >= 5 else str(x))
            else:
                print("No 'mutant' or 'mutated_sequence' column to compute mutation depth for DMS {}; setting to nan".format(DMS_id))
                merged_scores['mutation_depth_grouped'] = np.nan
        performance_DMS = {}
        for metric in ['Spearman','AUC','MCC','NDCG','Top_recall']:
            performance_DMS[metric]={}
        # Determine available model score columns (fall back to all models in the config file if none were specified on the command line)
        candidate_model_names = args.selected_model_names if args.selected_model_names else score_variables
        score_columns_present = [name for name in candidate_model_names if name in merged_scores.columns]
        score_columns_present = list(dict.fromkeys(score_columns_present))
        all_models_found.update(score_columns_present)
        for score in score_columns_present:
            if score not in merged_scores:
                print("Model scores for {} not in merged scores for DMS {}".format(score,DMS_id))
                performance_DMS["Spearman"][score] = np.nan
                performance_DMS["AUC"][score] = np.nan
                performance_DMS["MCC"][score] = np.nan
                performance_DMS["NDCG"][score] = np.nan 
                performance_DMS["Top_recall"][score] = np.nan
                continue
            performance_DMS['Spearman'][score] = spearmanr(merged_scores['DMS_score'], merged_scores[score])[0]
            performance_DMS["NDCG"][score] = calc_ndcg(merged_scores['DMS_score'], merged_scores[score])
            performance_DMS["Top_recall"][score] = calc_toprecall(merged_scores['DMS_score'], merged_scores[score])
            try:
                performance_DMS['AUC'][score] = roc_auc_score(y_true=merged_scores['DMS_score_bin'], y_score=merged_scores[score])
            except:
                print("AUC issue with: {} for model: {}".format(DMS_id,score))
                performance_DMS['AUC'][score] = np.nan
            try:
                median_cutoff=merged_scores[score].median()
                merged_scores[score+"_bin"]=merged_scores[score].map(lambda x: 1 if x >= median_cutoff else 0)
                performance_DMS['MCC'][score] = matthews_corrcoef(y_true=merged_scores['DMS_score_bin'], y_pred=merged_scores[score+"_bin"])
            except:
                print("MCC issue with: {} for model: {}".format(DMS_id,score))
                performance_DMS['MCC'][score] = np.nan

        if not args.indel_mode and args.performance_by_depth:
            for score in score_columns_present:
                if score not in merged_scores:
                    print("Model scores for {} not in merged scores for DMS {}".format(score,DMS_id))
                    for depth in ['1','2','3','4','5+']:
                        performance_DMS["Spearman"][score+'_'+depth] = np.nan
                        performance_DMS["AUC"][score+'_'+depth] = np.nan
                        performance_DMS["MCC"][score+'_'+depth] = np.nan 
                        performance_DMS["NDCG"][score+'_'+depth] = np.nan
                        performance_DMS["Top_recall"][score+'_'+depth] = np.nan
                    continue
                for depth in ['1','2','3','4','5+']:
                    merged_scores_depth = merged_scores[merged_scores.mutation_depth_grouped==depth]
                    if len(merged_scores_depth) > 0:
                        performance_DMS['Spearman'][score+'_'+depth] = spearmanr(merged_scores_depth['DMS_score'], merged_scores_depth[score])[0]
                        performance_DMS["NDCG"][score+'_'+depth] = calc_ndcg(merged_scores_depth['DMS_score'], merged_scores_depth[score])
                        performance_DMS["Top_recall"][score+'_'+depth] = calc_toprecall(merged_scores_depth['DMS_score'], merged_scores_depth[score])
                        try:
                            performance_DMS['AUC'][score+'_'+depth] = roc_auc_score(y_true=merged_scores_depth['DMS_score_bin'], y_score=merged_scores_depth[score])
                        except:
                            performance_DMS['AUC'][score+'_'+depth] = np.nan
                        try:
                            performance_DMS['MCC'][score+'_'+depth] = matthews_corrcoef(y_true=merged_scores_depth['DMS_score_bin'], y_pred=merged_scores_depth[score+"_bin"])
                        except:
                            performance_DMS['MCC'][score+'_'+depth] = np.nan
                    else:
                        performance_DMS['Spearman'][score+'_'+depth] = np.nan
                        performance_DMS['AUC'][score+'_'+depth] = np.nan
                        performance_DMS['MCC'][score+'_'+depth] = np.nan
                        performance_DMS["NDCG"][score+'_'+depth] = np.nan
                        performance_DMS["Top_recall"][score+'_'+depth] = np.nan
        print("Number of mutants: {}".format(len(merged_scores['DMS_score'].values)))
        for metric in ['Spearman','AUC','MCC','NDCG','Top_recall']:
            performance_DMS[metric]['number_mutants']=len(merged_scores['DMS_score'].values)
            performance_DMS[metric]['UniProt_ID'] = UniProt_ID
            performance_DMS[metric]["Selection Type"] = selection_type
            performance_DMS[metric]['MSA_Neff_L_category'] = MSA_Neff_L_category
            performance_DMS[metric]['Taxon'] = Taxon
            performance_DMS[metric] = pd.DataFrame.from_dict(performance_DMS[metric],orient='index').reset_index()
            performance_DMS[metric].columns=['score',DMS_id]
            performance_all_DMS[metric]=pd.merge(performance_all_DMS[metric],performance_DMS[metric],on='score',how='outer')
    # Build model types dynamically
    try:
        if not args.indel_mode:
            cfg_models = config.get("model_list_zero_shot_substitutions_DMS", {})
        else:
            cfg_models = config.get("model_list_zero_shot_indels_DMS", {})
        model_types_dict = {}
        for model in sorted(all_models_found):
            model_types_dict[model] = cfg_models.get(model, {}).get("model_type", "Unknown")
        model_types = pd.DataFrame.from_dict(model_types_dict, columns=['Model type'], orient='index')
    except Exception:
        model_types = pd.DataFrame(columns=['Model type'])
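    # Pivot each metric table to one row per DMS assay (columns: metadata fields plus one column per model score), then round numeric values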
    for metric in ['Spearman','AUC','MCC','NDCG','Top_recall']:
        performance_all_DMS[metric]=performance_all_DMS[metric].set_index('score')
        del performance_all_DMS[metric]['score_index']
        performance_all_DMS[metric]=performance_all_DMS[metric].transpose()
        for var in performance_all_DMS[metric]:
            if var not in ['UniProt_ID','MSA_Neff_L_category','Taxon',"Selection Type"]:
                performance_all_DMS[metric][var]=performance_all_DMS[metric][var].astype(float).round(3)
            if var in ['number_mutants']:
                performance_all_DMS[metric][var]=performance_all_DMS[metric][var].astype(int)
        if not args.indel_mode and args.performance_by_depth:
            all_columns = performance_all_DMS[metric].columns
            performance_all_DMS_html=performance_all_DMS[metric].copy()
            performance_all_DMS_html.columns=performance_all_DMS_html.columns.map(lambda x: clean_names[x] if x in clean_names else x)
            all_not_depth_columns = all_columns[[all_columns[x].split("_")[-1] not in ['1','2','3','4','5+'] for x in range(len(all_columns))]]
            all_not_depth_columns_clean = all_not_depth_columns.map(lambda x: clean_names[x] if x in clean_names else x)
            performance_all_DMS_html[all_not_depth_columns_clean].to_html(args.output_performance_file_folder + os.sep + metric + os.sep + output_filename[metric] + '_DMS_level.html')
            DMS_perf_to_save = performance_all_DMS[metric].copy()[all_not_depth_columns]
            DMS_perf_to_save.columns = DMS_perf_to_save.columns.map(lambda x: clean_names[x] if x in clean_names else x)
            DMS_perf_to_save.to_csv(args.output_performance_file_folder + os.sep + metric + os.sep + output_filename[metric] + '_DMS_level.csv', index_label="DMS ID")
        else:
            performance_all_DMS_html=performance_all_DMS[metric].copy()
            performance_all_DMS_html.columns = performance_all_DMS_html.columns.map(lambda x: clean_names[x] if x in clean_names else x)
            performance_all_DMS_html.to_html(args.output_performance_file_folder + os.sep + metric + os.sep + output_filename[metric] + '_DMS_level.html')
            DMS_perf_to_save = performance_all_DMS[metric].copy()
            DMS_perf_to_save.columns = DMS_perf_to_save.columns.map(lambda x: clean_names[x] if x in clean_names else x)
            DMS_perf_to_save.to_csv(args.output_performance_file_folder + os.sep + metric + os.sep + output_filename[metric] + '_DMS_level.csv', index_label="DMS ID")
        
        if not args.indel_mode:
            uniprot_metric_performance = performance_all_DMS[metric].groupby(['UniProt_ID']).mean(numeric_only=True)
            uniprot_function_metric_performance = performance_all_DMS[metric].groupby(['UniProt_ID',"Selection Type"]).mean(numeric_only=True)
            uniprot_metric_performance = uniprot_metric_performance.reset_index()
            uniprot_metric_performance = pd.merge(uniprot_metric_performance,uniprot_Neff_lookup,on='UniProt_ID', how='left')
            uniprot_metric_performance = pd.merge(uniprot_metric_performance,uniprot_taxon_lookup,on='UniProt_ID', how='left')
            uniprot_metric_performance = pd.merge(uniprot_metric_performance,uniprot_function_lookup,on="UniProt_ID",how="left")
            del uniprot_metric_performance['number_mutants']
            del uniprot_function_metric_performance["number_mutants"]
            uniprot_level_average = uniprot_metric_performance.mean(numeric_only=True)
            uniprot_function_level_average = uniprot_function_metric_performance.groupby("Selection Type").mean(numeric_only=True)
            # bootstrap_standard_error = pd.DataFrame(compute_bootstrap_standard_error_functional_categories(uniprot_function_metric_performance.subtract(uniprot_function_metric_performance['TranceptEVE_L'],axis=0)),columns=["Bootstrap_standard_error_"+metric])
            uniprot_function_level_average = uniprot_function_level_average.reset_index()
            final_average = uniprot_function_level_average.mean(numeric_only=True) 
            if args.performance_by_depth:
                cols = [column for column in all_not_depth_columns if column not in ["number_mutants","Taxon","MSA_Neff_L_category","Selection Type","UniProt_ID"]]
                top_model = final_average.loc[cols].idxmax()
            else:
                top_model = final_average.idxmax()
            bootstrap_standard_error = pd.DataFrame(compute_bootstrap_standard_error_functional_categories(uniprot_function_metric_performance.subtract(uniprot_function_metric_performance[top_model],axis=0)),columns=["Bootstrap_standard_error_"+metric])
            uniprot_metric_performance.loc['Average'] = uniprot_level_average
            uniprot_function_level_average.loc['Average'] = final_average
            uniprot_metric_performance=uniprot_metric_performance.round(3)
            uniprot_function_level_average=uniprot_function_level_average.round(3)
            if args.performance_by_depth:
                uniprot_metric_performance[[column for column in all_not_depth_columns if column != "number_mutants"]].to_csv(args.output_performance_file_folder + os.sep + metric + os.sep + output_filename[metric] + '_Uniprot_level.csv', index=False)
                performance_by_depth = {}
                all_not_depth_columns = [x for x in all_not_depth_columns if x not in ['number_mutants',"UniProt_ID","MSA_Neff_L_category","Taxon"]]
                for depth in ['1','2','3','4','5+']:
                    depth_columns = all_columns[[all_columns[x].split("_")[-1]==depth for x in range(len(all_columns))]]
                    performance_by_depth[depth] = uniprot_function_metric_performance[depth_columns].mean(numeric_only=True).reset_index()
                    performance_by_depth[depth]['model_name'] = performance_by_depth[depth]['score'].map(lambda x: '_'.join(x.split('_')[:-1]))
                    performance_by_depth[depth]=performance_by_depth[depth][['model_name',0]]
                    performance_by_depth[depth].columns = ['model_name','Depth_'+depth]
                    performance_by_depth[depth].set_index('model_name', inplace=True)
                uniprot_function_level_average = uniprot_function_level_average[all_not_depth_columns]
            else:
                uniprot_metric_performance.to_csv(args.output_performance_file_folder + os.sep + metric + os.sep + output_filename[metric] + '_Uniprot_level.csv', index=False)
            uniprot_function_level_average.to_csv(args.output_performance_file_folder + os.sep + metric + os.sep + output_filename[metric] + "_Uniprot_Selection_Type_level.csv",index=False)
            if args.performance_by_depth:
                performance_by_MSA_depth = performance_all_DMS[metric].groupby(["UniProt_ID","MSA_Neff_L_category"]).mean(numeric_only=True).groupby(["MSA_Neff_L_category"]).mean(numeric_only=True)[[col for col in all_not_depth_columns if col != "Selection Type"]].transpose()
            else:
                performance_by_MSA_depth = performance_all_DMS[metric].groupby(["UniProt_ID","MSA_Neff_L_category"]).mean(numeric_only=True).groupby(["MSA_Neff_L_category"]).mean(numeric_only=True).transpose()
            performance_by_MSA_depth = performance_by_MSA_depth.reindex(columns=['Low','Medium','High'])
            performance_by_MSA_depth.columns = ['Low_MSA_depth','Medium_MSA_depth','High_MSA_depth']
            if args.performance_by_depth:
                performance_by_taxon = performance_all_DMS[metric].groupby(["UniProt_ID","Taxon"]).mean(numeric_only=True).groupby(["Taxon"]).mean(numeric_only=True)[[col for col in all_not_depth_columns if col != "Selection Type"]].transpose()
            else:
                performance_by_taxon = performance_all_DMS[metric].groupby(["UniProt_ID","Taxon"]).mean(numeric_only=True).groupby(["Taxon"]).mean(numeric_only=True).transpose()
            performance_by_taxon = performance_by_taxon.reindex(columns=['Human','Eukaryote','Prokaryote','Virus'])
            performance_by_taxon.columns = ['Taxa_Human','Taxa_Other_Eukaryote','Taxa_Prokaryote','Taxa_Virus']
            performance_by_function = uniprot_function_level_average.drop(labels="Average",axis=0).set_index("Selection Type").transpose()
            performance_by_function.columns = ["Function_"+x for x in performance_by_function.columns]
            
            summary_performance = pd.merge(pd.DataFrame(final_average,columns=['Average_'+metric]), performance_by_MSA_depth,left_index=True, right_index=True,how='inner')
            summary_performance = pd.merge(summary_performance, performance_by_taxon,left_index=True, right_index=True,how='inner')
            summary_performance = pd.merge(summary_performance, performance_by_function,left_index=True, right_index=True, how='inner')
            if args.performance_by_depth:
                for depth in ['1','2','3','4','5+']:
                    summary_performance = pd.merge(summary_performance, performance_by_depth[depth],left_index=True, right_index=True,how='inner')
            final_column_order = ['Model_name','Model type','Average_'+metric,'Bootstrap_standard_error_'+metric,'Function_Activity','Function_Binding','Function_Expression','Function_OrganismalFitness','Function_Stability','Low_MSA_depth','Medium_MSA_depth','High_MSA_depth','Taxa_Human','Taxa_Other_Eukaryote','Taxa_Prokaryote','Taxa_Virus','Depth_1','Depth_2','Depth_3','Depth_4','Depth_5+','Model details','References']

        else:
            performance_all_DMS[metric].loc["Average"] = performance_all_DMS[metric].mean(numeric_only=True)
            uniprot_metric_performance = performance_all_DMS[metric].groupby(['UniProt_ID']).mean(numeric_only=True)
            uniprot_function_metric_performance = performance_all_DMS[metric].groupby(['UniProt_ID',"Selection Type"]).mean(numeric_only=True)
            uniprot_metric_performance = pd.merge(uniprot_metric_performance,uniprot_function_lookup,on="UniProt_ID",how="left")
            del uniprot_metric_performance['number_mutants']
            uniprot_level_average = uniprot_metric_performance.mean(numeric_only=True)
            del uniprot_function_metric_performance["number_mutants"]
            uniprot_function_level_average = uniprot_function_metric_performance.groupby("Selection Type").mean(numeric_only=True)
            # bootstrap_standard_error = pd.DataFrame(compute_bootstrap_standard_error_functional_categories(uniprot_function_metric_performance.subtract(uniprot_function_metric_performance['TranceptEVE_M'],axis=0)),columns=["Bootstrap_standard_error_"+metric])
            uniprot_function_level_average = uniprot_function_level_average.reset_index()
            final_average = uniprot_function_level_average.mean(numeric_only=True) 
            top_model = final_average.idxmax()
            bootstrap_standard_error = pd.DataFrame(compute_bootstrap_standard_error_functional_categories(uniprot_function_metric_performance.subtract(uniprot_function_metric_performance[top_model],axis=0)),columns=["Bootstrap_standard_error_"+metric])
            uniprot_metric_performance.loc['Average'] = uniprot_level_average
            uniprot_function_level_average.loc['Average'] = final_average
            uniprot_metric_performance=uniprot_metric_performance.round(3)
            uniprot_function_level_average=uniprot_function_level_average.round(3)
            
            performance_by_MSA_depth = performance_all_DMS[metric].groupby(["UniProt_ID","MSA_Neff_L_category"]).mean(numeric_only=True).groupby(["MSA_Neff_L_category"]).mean(numeric_only=True).transpose()
            performance_by_MSA_depth = performance_by_MSA_depth.reindex(columns=['Low','Medium','High'])
            performance_by_MSA_depth.columns = ['Low_MSA_depth','Medium_MSA_depth','High_MSA_depth']
            performance_by_taxon = performance_all_DMS[metric].groupby(["UniProt_ID","Taxon"]).mean(numeric_only=True).groupby(["Taxon"]).mean(numeric_only=True).transpose()
            performance_by_taxon = performance_by_taxon.reindex(columns=['Human','Eukaryote','Prokaryote','Virus'])
            performance_by_taxon.columns = ['Taxa_Human','Taxa_Other_Eukaryote','Taxa_Prokaryote','Taxa_Virus']
            performance_by_function = uniprot_function_level_average.drop(labels="Average",axis=0).set_index("Selection Type").transpose()
            performance_by_function.columns = ["Function_"+x for x in performance_by_function.columns]
            
            summary_performance = pd.merge(pd.DataFrame(final_average,columns=['Average_'+metric]), performance_by_MSA_depth,left_index=True,right_index=True,how='inner')
            summary_performance = pd.merge(summary_performance, performance_by_taxon,left_index=True, right_index=True,how='inner')
            summary_performance = pd.merge(summary_performance, performance_by_function,left_index=True, right_index=True, how='inner')
            final_column_order = ['Model_name','Model type','Average_'+metric,'Bootstrap_standard_error_'+metric,'Function_Activity','Function_Binding','Function_Expression','Function_OrganismalFitness','Function_Stability','Low_MSA_depth','Medium_MSA_depth','High_MSA_depth','Taxa_Human','Taxa_Other_Eukaryote','Taxa_Prokaryote','Taxa_Virus','Model details','References']
        summary_performance.sort_values(by='Average_'+metric,ascending=False,inplace=True)
        summary_performance.index.name = 'Model_name'
        summary_performance.reset_index(inplace=True)
        summary_performance.index = range(1,len(summary_performance)+1)
        summary_performance.index.name = 'Model_rank'
        summary_performance = pd.merge(summary_performance, bootstrap_standard_error, left_on='Model_name', right_index=True, how='left')
        summary_performance = pd.merge(summary_performance, model_types, left_on='Model_name', right_index=True, how='left')
        summary_performance = pd.merge(summary_performance, model_details, left_on='Model_name', right_index=True, how='left')
        summary_performance = pd.merge(summary_performance, model_references, left_on='Model_name', right_index=True, how='left')
        summary_performance=summary_performance.round(3)
        summary_performance['Model_name']=summary_performance['Model_name'].map(lambda x: clean_names[x] if x in clean_names else x)
        summary_performance=summary_performance.reindex(columns=final_column_order)
        summary_performance.to_csv(args.output_performance_file_folder + os.sep + metric + os.sep + 'Summary_performance_'+output_filename[metric]+'.csv')
        summary_performance.to_html(args.output_performance_file_folder + os.sep + metric + os.sep + 'Summary_performance_'+output_filename[metric]+'.html')

if __name__ == '__main__':
    main()