# directionality_probe/protify/benchmarks/proteingym/DMS_benchmark_performance.py
# (Hugging Face upload artifact: folder uploaded via huggingface_hub by user
#  "nikraf", commit 714cf46 verified — not part of the program logic.)
'''
Adapted from https://github.com/OATML-Markslab/ProteinGym/blob/main/proteingym/performance_DMS_benchmarks.py
Changes:
- Added ability to restrict analysis to a subset of DMS IDs, when they are specified in main.py
- Evaluates only models that are specified in main.py
'''
import pandas as pd
import numpy as np
import os
import argparse
from scipy.stats import spearmanr
from sklearn.metrics import roc_auc_score, matthews_corrcoef
import warnings
import json
warnings.simplefilter(action='ignore', category=FutureWarning)
def minmax(x):
    """Linearly rescale *x* onto [0, 1] (minimum maps to 0, maximum to 1).

    Note: produces NaN/inf if all values of *x* are identical (zero range).
    """
    lo = np.min(x)
    hi = np.max(x)
    return (x - lo) / (hi - lo)
def calc_ndcg(y_true, y_score, **kwargs):
    """Normalized discounted cumulative gain of *y_score* against *y_true*.

    Inputs:
        y_true: array of true scores, higher is better
        y_score: array of predicted scores, higher is better
    Options (via kwargs):
        quantile: if True (default), keep the top `top` percent of variants
        top: percentage (default 10) when `quantile` is True, otherwise an
             absolute count (or the string 'all' for every variant)

    Gains are the min-max-rescaled true scores; variants with zero gain are
    excluded from both the DCG and the ideal DCG. Returns 0 when no variant
    with positive gain lands in the predicted top k.
    """
    kwargs.setdefault('quantile', True)
    kwargs.setdefault('top', 10)
    if kwargs['quantile']:
        # top-k as a fraction of the dataset size (e.g. top=10 -> top 10%)
        k = int(np.floor(y_true.shape[0] * (kwargs['top'] / 100)))
    else:
        k = kwargs['top']
    if isinstance(y_true, pd.Series):
        y_true = y_true.to_numpy()
    if isinstance(y_score, pd.Series):
        y_score = y_score.to_numpy()
    # Gains: min-max rescaled true scores (inlined minmax helper).
    lo, hi = np.min(y_true), np.max(y_true)
    gains = (y_true - lo) / (hi - lo)
    # Rank 1 = highest predicted score (double argsort yields the rank of each entry).
    ranks = np.argsort(np.argsort(-y_score)) + 1
    if k == 'all':
        k = len(ranks)
    # Restrict to the predicted top k, then drop zero-gain entries.
    in_top = ranks <= k
    ranks_k, gains_k = ranks[in_top], gains[in_top]
    keep = gains_k != 0
    ranks_fil, gains_fil = ranks_k[keep], gains_k[keep]
    if ranks_fil.size == 0:
        # nothing with positive gain made the cut
        return 0
    # Discounted cumulative gain over the surviving entries.
    dcg = np.sum(gains_fil / np.log2(ranks_fil + 1))
    # Ideal DCG: same computation with ranks taken from the true gains.
    ideal_ranks = np.argsort(np.argsort(-gains)) + 1
    ideal_top = ideal_ranks <= k
    ideal_ranks_k, ideal_gains_k = ideal_ranks[ideal_top], gains[ideal_top]
    ideal_keep = ideal_gains_k != 0
    idcg = np.sum(ideal_gains_k[ideal_keep] / np.log2(ideal_ranks_k[ideal_keep] + 1))
    return dcg / idcg
def calc_toprecall(true_scores, model_scores, top_true=10, top_model=10):
    """Recall of the true top-`top_true`% variants within the model's top-`top_model`%.

    Both cutoffs are percentages; membership is decided by comparing each score
    against the corresponding percentile threshold (ties at the threshold count
    as members). Returns 0 when the true top set is empty.
    """
    true_top_mask = true_scores >= np.percentile(true_scores, 100 - top_true)
    model_top_mask = model_scores >= np.percentile(model_scores, 100 - top_model)
    hits = true_top_mask & model_top_mask
    n_true_top = true_top_mask.sum()
    return hits.sum() / n_true_top if n_true_top > 0 else 0
def standardization(x):
    """Z-score *x*: subtract the mean and divide by the standard deviation.

    Assumes input is a numpy array or pandas Series; note numpy's ``std`` uses
    ddof=0 while pandas' uses ddof=1, so the scaling follows the input type.
    """
    mu = x.mean()
    sigma = x.std()
    return (x - mu) / sigma
def compute_bootstrap_standard_error(df, number_assay_reshuffle=10000):
    """Non-parametric bootstrap standard error of the column-wise mean of *df*.

    Resamples the rows of *df* with replacement `number_assay_reshuffle` times,
    takes the column means of each resample, and returns the sample standard
    deviation (ddof=1) of those means — one value per column (model).
    """
    resampled_means = [
        df.sample(frac=1.0, replace=True).mean(axis=0)
        for _ in range(number_assay_reshuffle)
    ]
    resampled_means = pd.DataFrame(data=resampled_means, columns=df.columns)
    return resampled_means.std(ddof=1)
def compute_bootstrap_standard_error_functional_categories(df, number_assay_reshuffle=10000):
    """Bootstrap standard error of the mean, stratified by functional category.

    Groups *df* by "Selection Type" (a column or index level), bootstraps the
    column-wise mean within each category, averages the per-category bootstrap
    means sample-by-sample, and returns the sample standard deviation (ddof=1)
    of that combined mean — one value per column (model).
    """
    per_category_means = {}
    for category, group in df.groupby("Selection Type"):
        samples = [
            group.sample(frac=1.0, replace=True).mean(axis=0)
            for _ in range(number_assay_reshuffle)
        ]
        per_category_means[category] = pd.DataFrame(data=samples)
    # Average the bootstrap means across categories, sample by sample.
    frames = list(per_category_means.values())
    combined_averages = frames[0].copy()
    for frame in frames[1:]:
        combined_averages += frame
    combined_averages /= len(frames)
    return combined_averages.std(ddof=1)
# Absolute directory containing this script; used to locate the default
# config.json one directory level up (see --config_file default in main()).
proteingym_folder_path = os.path.dirname(os.path.realpath(__file__))
def main() -> None:
    """
    Aggregate per-DMS zero-shot scoring files into benchmark performance tables.

    For every DMS assay listed in the reference file (optionally restricted via
    --dms_ids), reads the corresponding "{DMS_id}_zs_{suffix}.csv" scoring file,
    computes Spearman / AUC / MCC / NDCG / top-10%-recall for each requested
    model column, then writes per-DMS, per-UniProt, per-selection-type and
    summary CSV/HTML tables under --output_performance_file_folder (one
    sub-folder per metric).
    """
    # ---- CLI -------------------------------------------------------------
    parser = argparse.ArgumentParser(description='ProteinGym performance analysis')
    parser.add_argument('--input_scoring_files_folder', type=str, help='Name of folder where all input scores are present (expects one scoring file per DMS)')
    parser.add_argument('--output_performance_file_folder', default='./outputs/tranception_performance', type=str, help='Name of folder where to save performance analysis files')
    parser.add_argument('--DMS_reference_file_path', type=str, help='Reference file with list of DMSs to consider')
    parser.add_argument('--indel_mode', action='store_true', help='Whether to score sequences with insertions and deletions')
    parser.add_argument('--performance_by_depth', action='store_true', help='Whether to compute performance by mutation depth')
    parser.add_argument('--config_file', default=f'{os.path.dirname(proteingym_folder_path)}/config.json', type=str, help='Path to config file containing model information')
    parser.add_argument('--selected_model_names', nargs='+', default=None, help='Required to obtain column names from config file')
    parser.add_argument('--dms_ids', nargs='+', default=None, help='Subset of DMS ids to include; if omitted, all DMS in the reference are used')
    parser.add_argument('--scoring_method', choices=['masked_marginal','wildtype_marginal','mutant_marginal','pll','global_log_prob'], default='masked_marginal', help='Which zero-shot scoring file suffix to read')
    args = parser.parse_args()
    # NOTE(review): --selected_model_names is effectively required — if omitted,
    # iterating over it (list comprehension below) raises TypeError.

    # ---- Load and (optionally) filter the DMS reference file -------------
    mapping_protein_seq_DMS = pd.read_csv(args.DMS_reference_file_path)
    # Optionally restrict analysis to a subset of DMS IDs
    if args.dms_ids is not None and len(args.dms_ids) > 0:
        requested_dms_ids = set(str(x) for x in args.dms_ids)
        mapping_protein_seq_DMS = mapping_protein_seq_DMS[mapping_protein_seq_DMS['DMS_id'].astype(str).isin(requested_dms_ids)]
        if mapping_protein_seq_DMS.empty:
            print("No matching DMS ids after filtering; nothing to compute.")
            return
    # Capitalize the first letter of the MSA depth category (e.g. "low" -> "Low").
    mapping_protein_seq_DMS["MSA_Neff_L_category"] = mapping_protein_seq_DMS["MSA_Neff_L_category"].apply(lambda x: x[0].upper() + x[1:] if type(x) == str else x)
    num_DMS=len(mapping_protein_seq_DMS)
    print("There are {} DMSs in mapping file".format(num_DMS))

    # ---- Config / constants / per-UniProt lookup tables ------------------
    with open(args.config_file) as f:
        config = json.load(f)
    with open(f"{os.path.dirname(os.path.realpath(__file__))}/constants.json") as f:
        constants = json.load(f)
    # NOTE(review): unlike the two lookups below, this one is NOT de-duplicated,
    # so a UniProt ID with several assays appears multiple times — confirm the
    # downstream left-merges expect that.
    uniprot_function_lookup = mapping_protein_seq_DMS[["UniProt_ID","coarse_selection_type"]]
    uniprot_function_lookup.columns = ["UniProt_ID", "Selection Type"]
    uniprot_Neff_lookup = mapping_protein_seq_DMS[['UniProt_ID','MSA_Neff_L_category']].drop_duplicates()
    uniprot_Neff_lookup.columns=['UniProt_ID','MSA_Neff_L_category']
    uniprot_taxon_lookup = mapping_protein_seq_DMS[['UniProt_ID','taxon']].drop_duplicates()
    uniprot_taxon_lookup.columns=['UniProt_ID','Taxon']
    if args.indel_mode:
        # Mutation depth is undefined for indels.
        args.performance_by_depth = False
    # NOTE(review): score_variables is computed but never used below (model
    # columns are taken from --selected_model_names instead).
    if not args.indel_mode:
        score_variables = list(config.get("model_list_zero_shot_substitutions_DMS", {}).keys())
    else:
        score_variables = list(config.get("model_list_zero_shot_indels_DMS", {}).keys())

    # ---- Output folders (one sub-folder per metric) ----------------------
    if not os.path.isdir(args.output_performance_file_folder):
        os.mkdir(args.output_performance_file_folder)
    for metric in ['Spearman','AUC','MCC',"NDCG","Top_recall"]:
        if not os.path.isdir(args.output_performance_file_folder+os.sep+metric):
            os.mkdir(args.output_performance_file_folder+os.sep+metric)

    # Will populate model_types dynamically after scanning available model columns
    model_types = pd.DataFrame(columns=['Model type'])
    model_details = pd.DataFrame.from_dict(constants.get("model_details", {}), columns=['Model details'], orient='index')
    model_references = pd.DataFrame.from_dict(constants.get("model_references", {}), columns=['References'], orient='index')
    clean_names = constants.get("clean_names", {})

    # ---- Per-metric accumulators -----------------------------------------
    # Each accumulator starts as a two-column frame ('score' = row label,
    # 'score_index' = -1 placeholder) and gains one column per DMS via outer
    # merges inside the loop below.
    performance_all_DMS={}
    output_filename={}
    for metric in ['Spearman','AUC','MCC', "NDCG", "Top_recall"]:
        performance_all_DMS[metric] = {}
        mutation_type = "substitutions" if not args.indel_mode else "indels"
        output_filename[metric] = "DMS_" + mutation_type + "_" + metric
        # Start with only metadata rows; model rows will be added dynamically via outer merges
        performance_all_DMS[metric]['number_mutants'] = -1
        performance_all_DMS[metric]["Selection Type"] = -1
        performance_all_DMS[metric]["UniProt_ID"] = -1
        performance_all_DMS[metric]['MSA_Neff_L_category'] = -1
        performance_all_DMS[metric]['Taxon'] = -1
        performance_all_DMS[metric] = pd.DataFrame.from_dict(performance_all_DMS[metric], orient='index').reset_index()
        performance_all_DMS[metric].columns = ['score', 'score_index']

    # ---- Main loop: one scoring file per DMS -----------------------------
    list_DMS = mapping_protein_seq_DMS["DMS_id"]
    i = 0  # unused counter, kept from the upstream script
    all_models_found = set()  # union of model columns seen across all DMS files
    for DMS_id in list_DMS:
        try:
            print(DMS_id)
            # Per-assay metadata from the reference file.
            UniProt_ID = mapping_protein_seq_DMS["UniProt_ID"][mapping_protein_seq_DMS["DMS_id"]==DMS_id].values[0]
            selection_type = mapping_protein_seq_DMS["coarse_selection_type"][mapping_protein_seq_DMS["DMS_id"]==DMS_id].values[0]
            MSA_Neff_L_category = mapping_protein_seq_DMS["MSA_Neff_L_category"][mapping_protein_seq_DMS["DMS_id"]==DMS_id].values[0]
            Taxon = mapping_protein_seq_DMS["taxon"][mapping_protein_seq_DMS["DMS_id"]==DMS_id].values[0]
            if args.indel_mode:
                suffix = 'pll'
            else:
                suffix = args.scoring_method
            score_path = os.path.join(args.input_scoring_files_folder, f"{DMS_id}_zs_{suffix}.csv")
            # NOTE(review): if score_path does NOT exist, merged_scores silently
            # keeps its value from the previous iteration (or is undefined on
            # the first one) — confirm missing files cannot occur here.
            if os.path.exists(score_path):
                merged_scores = pd.read_csv(score_path, dtype={'target_seq': str})
                if 'mutant' not in merged_scores: merged_scores['mutant'] = merged_scores['mutated_sequence'] #if mutant not in DMS file we default to mutated_sequence (eg., for indels)
                # Ensure binary labels exist for AUC/MCC if not provided
                if 'DMS_score_bin' not in merged_scores and 'DMS_score' in merged_scores:
                    median_cutoff = merged_scores['DMS_score'].median()
                    merged_scores['DMS_score_bin'] = (merged_scores['DMS_score'] >= median_cutoff).astype(int)
                if 'DMS_score' not in merged_scores:
                    print(f"DMS_score column missing for {DMS_id}; skipping this DMS")
                    continue
        except:
            # NOTE(review): bare except — ANY error above (not only a missing
            # file) is reported as a missing scoring file and the DMS skipped.
            print(f"Scoring file for {DMS_id} missing")
            continue
        # Mutation depth = number of ':'-separated single mutations in 'mutant'
        # (depths >= 5 are pooled into the '5+' bucket).
        if not args.indel_mode and args.performance_by_depth:
            if 'mutant' in merged_scores:
                merged_scores['mutation_depth'] = merged_scores['mutant'].apply(lambda x: len(x.split(":")))
                merged_scores['mutation_depth_grouped'] = merged_scores['mutation_depth'].apply(lambda x: '5+' if x >= 5 else str(x))
            else:
                print("No 'mutant' or 'mutated_sequence' column to compute mutation depth for DMS {}; setting to nan".format(DMS_id))
                merged_scores['mutation_depth_grouped'] = np.nan
        performance_DMS = {}
        for metric in ['Spearman','AUC','MCC','NDCG','Top_recall']:
            performance_DMS[metric]={}
        # Determine available model score columns
        score_columns_present = [name for name in args.selected_model_names if name in merged_scores.columns]
        score_columns_present = list(dict.fromkeys(score_columns_present))  # de-duplicate, keep order
        all_models_found.update(score_columns_present)
        for score in score_columns_present:
            # Defensive guard: cannot trigger given the filtering just above.
            if score not in merged_scores:
                print("Model scores for {} not in merged scores for DMS {}".format(score,DMS_id))
                performance_DMS["Spearman"][score] = np.nan
                performance_DMS["AUC"][score] = np.nan
                performance_DMS["MCC"][score] = np.nan
                performance_DMS["NDCG"][score] = np.nan
                performance_DMS["Top_recall"][score] = np.nan
                continue
            performance_DMS['Spearman'][score] = spearmanr(merged_scores['DMS_score'], merged_scores[score])[0]
            performance_DMS["NDCG"][score] = calc_ndcg(merged_scores['DMS_score'], merged_scores[score])
            performance_DMS["Top_recall"][score] = calc_toprecall(merged_scores['DMS_score'], merged_scores[score])
            try:
                performance_DMS['AUC'][score] = roc_auc_score(y_true=merged_scores['DMS_score_bin'], y_score=merged_scores[score])
            except:
                print("AUC issue with: {} for model: {}".format(DMS_id,score))
                performance_DMS['AUC'][score] = np.nan
            try:
                # Binarize model scores at their median for MCC against the binary DMS labels.
                median_cutoff=merged_scores[score].median()
                merged_scores[score+"_bin"]=merged_scores[score].map(lambda x: 1 if x >= median_cutoff else 0)
                performance_DMS['MCC'][score] = matthews_corrcoef(y_true=merged_scores['DMS_score_bin'], y_pred=merged_scores[score+"_bin"])
            except:
                print("MCC issue with: {} for model: {}".format(DMS_id,score))
                performance_DMS['MCC'][score] = np.nan
        # Same metrics, stratified by mutation depth (rows labelled "<model>_<depth>").
        if not args.indel_mode and args.performance_by_depth:
            for score in score_columns_present:
                # Defensive guard: cannot trigger given the filtering above.
                if score not in merged_scores:
                    print("Model scores for {} not in merged scores for DMS {}".format(score,DMS_id))
                    for depth in ['1','2','3','4','5+']:
                        performance_DMS["Spearman"][score+'_'+depth] = np.nan
                        performance_DMS["AUC"][score+'_'+depth] = np.nan
                        performance_DMS["MCC"][score+'_'+depth] = np.nan
                        performance_DMS["NDCG"][score+'_'+depth] = np.nan
                        performance_DMS["Top_recall"][score+'_'+depth] = np.nan
                    continue
                for depth in ['1','2','3','4','5+']:
                    merged_scores_depth = merged_scores[merged_scores.mutation_depth_grouped==depth]
                    if len(merged_scores_depth) > 0:
                        performance_DMS['Spearman'][score+'_'+depth] = spearmanr(merged_scores_depth['DMS_score'], merged_scores_depth[score])[0]
                        performance_DMS["NDCG"][score+'_'+depth] = calc_ndcg(merged_scores_depth['DMS_score'], merged_scores_depth[score])
                        performance_DMS["Top_recall"][score+'_'+depth] = calc_toprecall(merged_scores_depth['DMS_score'], merged_scores_depth[score])
                        try:
                            performance_DMS['AUC'][score+'_'+depth] = roc_auc_score(y_true=merged_scores_depth['DMS_score_bin'], y_score=merged_scores_depth[score])
                        except:
                            performance_DMS['AUC'][score+'_'+depth] = np.nan
                        try:
                            performance_DMS['MCC'][score+'_'+depth] = matthews_corrcoef(y_true=merged_scores_depth['DMS_score_bin'], y_pred=merged_scores_depth[score+"_bin"])
                        except:
                            performance_DMS['MCC'][score+'_'+depth] = np.nan
                    else:
                        # no variants at this depth in this assay
                        performance_DMS['Spearman'][score+'_'+depth] = np.nan
                        performance_DMS['AUC'][score+'_'+depth] = np.nan
                        performance_DMS['MCC'][score+'_'+depth] = np.nan
                        performance_DMS["NDCG"][score+'_'+depth] = np.nan
                        performance_DMS["Top_recall"][score+'_'+depth] = np.nan
        print("Number of mutants: {}".format(len(merged_scores['DMS_score'].values)))
        # Append this DMS as a new column of each per-metric accumulator
        # (rows = model names + metadata, merged on the 'score' row label).
        for metric in ['Spearman','AUC','MCC','NDCG','Top_recall']:
            performance_DMS[metric]['number_mutants']=len(merged_scores['DMS_score'].values)
            performance_DMS[metric]['UniProt_ID'] = UniProt_ID
            performance_DMS[metric]["Selection Type"] = selection_type
            performance_DMS[metric]['MSA_Neff_L_category'] = MSA_Neff_L_category
            performance_DMS[metric]['Taxon'] = Taxon
            performance_DMS[metric] = pd.DataFrame.from_dict(performance_DMS[metric],orient='index').reset_index()
            performance_DMS[metric].columns=['score',DMS_id]
            performance_all_DMS[metric]=pd.merge(performance_all_DMS[metric],performance_DMS[metric],on='score',how='outer')
    # Build model types dynamically
    try:
        if not args.indel_mode:
            cfg_models = config.get("model_list_zero_shot_substitutions_DMS", {})
        else:
            cfg_models = config.get("model_list_zero_shot_indels_DMS", {})
        model_types_dict = {}
        for model in sorted(all_models_found):
            model_types_dict[model] = cfg_models.get(model, {}).get("model_type", "Unknown")
        model_types = pd.DataFrame.from_dict(model_types_dict, columns=['Model type'], orient='index')
    except Exception:
        model_types = pd.DataFrame(columns=['Model type'])
    # ---- Aggregation and report writing, one metric at a time ------------
    for metric in ['Spearman','AUC','MCC','NDCG','Top_recall']:
        # Reshape accumulator: rows = DMS ids, columns = model names + metadata.
        performance_all_DMS[metric]=performance_all_DMS[metric].set_index('score')
        del performance_all_DMS[metric]['score_index']
        performance_all_DMS[metric]=performance_all_DMS[metric].transpose()
        for var in performance_all_DMS[metric]:
            if var not in ['UniProt_ID','MSA_Neff_L_category','Taxon',"Selection Type"]:
                performance_all_DMS[metric][var]=performance_all_DMS[metric][var].astype(float).round(3)
            # NOTE(review): astype(int) fails if any DMS is missing a mutant
            # count (NaN from the outer merge) — assumed never to happen.
            if var in ['number_mutants']:
                performance_all_DMS[metric][var]=performance_all_DMS[metric][var].astype(int)
        # Per-DMS tables (HTML with cleaned-up model names + raw CSV); with
        # depth enabled, depth-stratified columns are excluded from this view.
        if not args.indel_mode and args.performance_by_depth:
            all_columns = performance_all_DMS[metric].columns
            performance_all_DMS_html=performance_all_DMS[metric].copy()
            performance_all_DMS_html.columns=performance_all_DMS_html.columns.map(lambda x: clean_names[x] if x in clean_names else x)
            # Columns whose name does not end in a depth suffix (_1.._5+).
            all_not_depth_columns = all_columns[[all_columns[x].split("_")[-1] not in ['1','2','3','4','5+'] for x in range(len(all_columns))]]
            all_not_depth_columns_clean = all_not_depth_columns.map(lambda x: clean_names[x] if x in clean_names else x)
            performance_all_DMS_html[all_not_depth_columns_clean].to_html(args.output_performance_file_folder + os.sep + metric + os.sep + output_filename[metric] + '_DMS_level.html')
            DMS_perf_to_save = performance_all_DMS[metric].copy()[all_not_depth_columns]
            DMS_perf_to_save.columns = DMS_perf_to_save.columns.map(lambda x: clean_names[x] if x in clean_names else x)
            DMS_perf_to_save.to_csv(args.output_performance_file_folder + os.sep + metric + os.sep + output_filename[metric] + '_DMS_level.csv', index_label="DMS ID")
        else:
            performance_all_DMS_html=performance_all_DMS[metric].copy()
            performance_all_DMS_html.columns = performance_all_DMS_html.columns.map(lambda x: clean_names[x] if x in clean_names else x)
            performance_all_DMS_html.to_html(args.output_performance_file_folder + os.sep + metric + os.sep + output_filename[metric] + '_DMS_level.html')
            DMS_perf_to_save = performance_all_DMS[metric].copy()
            DMS_perf_to_save.columns = DMS_perf_to_save.columns.map(lambda x: clean_names[x] if x in clean_names else x)
            DMS_perf_to_save.to_csv(args.output_performance_file_folder + os.sep + metric + os.sep + output_filename[metric] + '_DMS_level.csv', index_label="DMS ID")
        if not args.indel_mode:
            # -------- Substitutions: average per UniProt, then per category --
            uniprot_metric_performance = performance_all_DMS[metric].groupby(['UniProt_ID']).mean(numeric_only=True)
            uniprot_function_metric_performance = performance_all_DMS[metric].groupby(['UniProt_ID',"Selection Type"]).mean(numeric_only=True)
            uniprot_metric_performance = uniprot_metric_performance.reset_index()
            uniprot_metric_performance = pd.merge(uniprot_metric_performance,uniprot_Neff_lookup,on='UniProt_ID', how='left')
            uniprot_metric_performance = pd.merge(uniprot_metric_performance,uniprot_taxon_lookup,on='UniProt_ID', how='left')
            uniprot_metric_performance = pd.merge(uniprot_metric_performance,uniprot_function_lookup,on="UniProt_ID",how="left")
            del uniprot_metric_performance['number_mutants']
            del uniprot_function_metric_performance["number_mutants"]
            uniprot_level_average = uniprot_metric_performance.mean(numeric_only=True)
            uniprot_function_level_average = uniprot_function_metric_performance.groupby("Selection Type").mean(numeric_only=True)
            # bootstrap_standard_error = pd.DataFrame(compute_bootstrap_standard_error_functional_categories(uniprot_function_metric_performance.subtract(uniprot_function_metric_performance['TranceptEVE_L'],axis=0)),columns=["Bootstrap_standard_error_"+metric])
            uniprot_function_level_average = uniprot_function_level_average.reset_index()
            # Overall score = mean across functional categories (equal weight).
            final_average = uniprot_function_level_average.mean(numeric_only=True)
            if args.performance_by_depth:
                cols = [column for column in all_not_depth_columns if column not in ["number_mutants","Taxon","MSA_Neff_L_category","Selection Type","UniProt_ID"]]
                top_model = final_average.loc[cols].idxmax()
            else:
                top_model = final_average.idxmax()
            # Bootstrap SE of each model's performance relative to the best model.
            bootstrap_standard_error = pd.DataFrame(compute_bootstrap_standard_error_functional_categories(uniprot_function_metric_performance.subtract(uniprot_function_metric_performance[top_model],axis=0)),columns=["Bootstrap_standard_error_"+metric])
            uniprot_metric_performance.loc['Average'] = uniprot_level_average
            uniprot_function_level_average.loc['Average'] = final_average
            uniprot_metric_performance=uniprot_metric_performance.round(3)
            uniprot_function_level_average=uniprot_function_level_average.round(3)
            if args.performance_by_depth:
                uniprot_metric_performance[[column for column in all_not_depth_columns if column != "number_mutants"]].to_csv(args.output_performance_file_folder + os.sep + metric + os.sep + output_filename[metric] + '_Uniprot_level.csv', index=False)
                # One column per depth bucket: model averages over depth-suffixed rows.
                performance_by_depth = {}
                all_not_depth_columns = [x for x in all_not_depth_columns if x not in ['number_mutants',"UniProt_ID","MSA_Neff_L_category","Taxon"]]
                for depth in ['1','2','3','4','5+']:
                    depth_columns = all_columns[[all_columns[x].split("_")[-1]==depth for x in range(len(all_columns))]]
                    performance_by_depth[depth] = uniprot_function_metric_performance[depth_columns].mean(numeric_only=True).reset_index()
                    # Strip the "_<depth>" suffix to recover the model name.
                    performance_by_depth[depth]['model_name'] = performance_by_depth[depth]['score'].map(lambda x: '_'.join(x.split('_')[:-1]))
                    performance_by_depth[depth]=performance_by_depth[depth][['model_name',0]]
                    performance_by_depth[depth].columns = ['model_name','Depth_'+depth]
                    performance_by_depth[depth].set_index('model_name', inplace=True)
                uniprot_function_level_average = uniprot_function_level_average[all_not_depth_columns]
            else:
                uniprot_metric_performance.to_csv(args.output_performance_file_folder + os.sep + metric + os.sep + output_filename[metric] + '_Uniprot_level.csv', index=False)
            uniprot_function_level_average.to_csv(args.output_performance_file_folder + os.sep + metric + os.sep + output_filename[metric] + "_Uniprot_Selection_Type_level.csv",index=False)
            # -------- Breakdowns by MSA depth, taxon, and function ----------
            if args.performance_by_depth:
                performance_by_MSA_depth = performance_all_DMS[metric].groupby(["UniProt_ID","MSA_Neff_L_category"]).mean(numeric_only=True).groupby(["MSA_Neff_L_category"]).mean(numeric_only=True)[[col for col in all_not_depth_columns if col != "Selection Type"]].transpose()
            else:
                performance_by_MSA_depth = performance_all_DMS[metric].groupby(["UniProt_ID","MSA_Neff_L_category"]).mean(numeric_only=True).groupby(["MSA_Neff_L_category"]).mean(numeric_only=True).transpose()
            performance_by_MSA_depth = performance_by_MSA_depth.reindex(columns=['Low','Medium','High'])
            performance_by_MSA_depth.columns = ['Low_MSA_depth','Medium_MSA_depth','High_MSA_depth']
            if args.performance_by_depth:
                performance_by_taxon = performance_all_DMS[metric].groupby(["UniProt_ID","Taxon"]).mean(numeric_only=True).groupby(["Taxon"]).mean(numeric_only=True)[[col for col in all_not_depth_columns if col != "Selection Type"]].transpose()
            else:
                performance_by_taxon = performance_all_DMS[metric].groupby(["UniProt_ID","Taxon"]).mean(numeric_only=True).groupby(["Taxon"]).mean(numeric_only=True).transpose()
            performance_by_taxon = performance_by_taxon.reindex(columns=['Human','Eukaryote','Prokaryote','Virus'])
            performance_by_taxon.columns = ['Taxa_Human','Taxa_Other_Eukaryote','Taxa_Prokaryote','Taxa_Virus']
            performance_by_function = uniprot_function_level_average.drop(labels="Average",axis=0).set_index("Selection Type").transpose()
            performance_by_function.columns = ["Function_"+x for x in performance_by_function.columns]
            # -------- Assemble the summary table ----------------------------
            summary_performance = pd.merge(pd.DataFrame(final_average,columns=['Average_'+metric]), performance_by_MSA_depth,left_index=True, right_index=True,how='inner')
            summary_performance = pd.merge(summary_performance, performance_by_taxon,left_index=True, right_index=True,how='inner')
            summary_performance = pd.merge(summary_performance, performance_by_function,left_index=True, right_index=True, how='inner')
            if args.performance_by_depth:
                for depth in ['1','2','3','4','5+']:
                    summary_performance = pd.merge(summary_performance, performance_by_depth[depth],left_index=True, right_index=True,how='inner')
            # NOTE(review): indentation reconstructed — this is assumed to be
            # unconditional (mirroring the indel branch); without
            # --performance_by_depth the Depth_* columns come out empty via the
            # final reindex. Confirm against the upstream ProteinGym script.
            final_column_order = ['Model_name','Model type','Average_'+metric,'Bootstrap_standard_error_'+metric,'Function_Activity','Function_Binding','Function_Expression','Function_OrganismalFitness','Function_Stability','Low_MSA_depth','Medium_MSA_depth','High_MSA_depth','Taxa_Human','Taxa_Other_Eukaryote','Taxa_Prokaryote','Taxa_Virus','Depth_1','Depth_2','Depth_3','Depth_4','Depth_5+','Model details','References']
        else:
            # -------- Indels: same aggregation without depth breakdowns -----
            performance_all_DMS[metric].loc["Average"] = performance_all_DMS[metric].mean(numeric_only=True)
            uniprot_metric_performance = performance_all_DMS[metric].groupby(['UniProt_ID']).mean(numeric_only=True)
            uniprot_function_metric_performance = performance_all_DMS[metric].groupby(['UniProt_ID',"Selection Type"]).mean(numeric_only=True)
            uniprot_metric_performance = pd.merge(uniprot_metric_performance,uniprot_function_lookup,on="UniProt_ID",how="left")
            del uniprot_metric_performance['number_mutants']
            uniprot_level_average = uniprot_metric_performance.mean(numeric_only=True)
            del uniprot_function_metric_performance["number_mutants"]
            uniprot_function_level_average = uniprot_function_metric_performance.groupby("Selection Type").mean(numeric_only=True)
            # bootstrap_standard_error = pd.DataFrame(compute_bootstrap_standard_error_functional_categories(uniprot_function_metric_performance.subtract(uniprot_function_metric_performance['TranceptEVE_M'],axis=0)),columns=["Bootstrap_standard_error_"+metric])
            uniprot_function_level_average = uniprot_function_level_average.reset_index()
            final_average = uniprot_function_level_average.mean(numeric_only=True)
            top_model = final_average.idxmax()
            # Bootstrap SE of each model's performance relative to the best model.
            bootstrap_standard_error = pd.DataFrame(compute_bootstrap_standard_error_functional_categories(uniprot_function_metric_performance.subtract(uniprot_function_metric_performance[top_model],axis=0)),columns=["Bootstrap_standard_error_"+metric])
            uniprot_metric_performance.loc['Average'] = uniprot_level_average
            uniprot_function_level_average.loc['Average'] = final_average
            uniprot_metric_performance=uniprot_metric_performance.round(3)
            uniprot_function_level_average=uniprot_function_level_average.round(3)
            performance_by_MSA_depth = performance_all_DMS[metric].groupby(["UniProt_ID","MSA_Neff_L_category"]).mean(numeric_only=True).groupby(["MSA_Neff_L_category"]).mean(numeric_only=True).transpose()
            performance_by_MSA_depth = performance_by_MSA_depth.reindex(columns=['Low','Medium','High'])
            performance_by_MSA_depth.columns = ['Low_MSA_depth','Medium_MSA_depth','High_MSA_depth']
            performance_by_taxon = performance_all_DMS[metric].groupby(["UniProt_ID","Taxon"]).mean(numeric_only=True).groupby(["Taxon"]).mean(numeric_only=True).transpose()
            performance_by_taxon = performance_by_taxon.reindex(columns=['Human','Eukaryote','Prokaryote','Virus'])
            performance_by_taxon.columns = ['Taxa_Human','Taxa_Other_Eukaryote','Taxa_Prokaryote','Taxa_Virus']
            performance_by_function = uniprot_function_level_average.drop(labels="Average",axis=0).set_index("Selection Type").transpose()
            performance_by_function.columns = ["Function_"+x for x in performance_by_function.columns]
            summary_performance = pd.merge(pd.DataFrame(final_average,columns=['Average_'+metric]), performance_by_MSA_depth,left_index=True,right_index=True,how='inner')
            summary_performance = pd.merge(summary_performance, performance_by_taxon,left_index=True, right_index=True,how='inner')
            summary_performance = pd.merge(summary_performance, performance_by_function,left_index=True, right_index=True, how='inner')
            final_column_order = ['Model_name','Model type','Average_'+metric,'Bootstrap_standard_error_'+metric,'Function_Activity','Function_Binding','Function_Expression','Function_OrganismalFitness','Function_Stability','Low_MSA_depth','Medium_MSA_depth','High_MSA_depth','Taxa_Human','Taxa_Other_Eukaryote','Taxa_Prokaryote','Taxa_Virus','Model details','References']
        # ---- Rank models, attach metadata, write the summary files --------
        summary_performance.sort_values(by='Average_'+metric,ascending=False,inplace=True)
        summary_performance.index.name = 'Model_name'
        summary_performance.reset_index(inplace=True)
        summary_performance.index = range(1,len(summary_performance)+1)
        summary_performance.index.name = 'Model_rank'
        summary_performance = pd.merge(summary_performance, bootstrap_standard_error, left_on='Model_name', right_index=True, how='left')
        summary_performance = pd.merge(summary_performance, model_types, left_on='Model_name', right_index=True, how='left')
        summary_performance = pd.merge(summary_performance, model_details, left_on='Model_name', right_index=True, how='left')
        summary_performance = pd.merge(summary_performance, model_references, left_on='Model_name', right_index=True, how='left')
        summary_performance=summary_performance.round(3)
        summary_performance['Model_name']=summary_performance['Model_name'].map(lambda x: clean_names[x] if x in clean_names else x)
        summary_performance=summary_performance.reindex(columns=final_column_order)
        summary_performance.to_csv(args.output_performance_file_folder + os.sep + metric + os.sep + 'Summary_performance_'+output_filename[metric]+'.csv')
        summary_performance.to_html(args.output_performance_file_folder + os.sep + metric + os.sep + 'Summary_performance_'+output_filename[metric]+'.html')
# CLI entry point: run the full benchmark aggregation when executed as a script.
if __name__ == '__main__':
    main()