# Author: Juan Parras & Patricia A. Apellániz
# Email: patricia.alonsod@upm.es
# Date: 31/07/2025

# Package imports
import os
import sys
import pickle
import numpy as np
import scipy.stats as stats
import statsmodels.stats.multitest as multitest
from tabulate import tabulate

current_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
sys.path.append(parent_dir)
from src.models.model_utils import get_model


#####################################
### ENVIRONMENT AND CONFIGURATION ###
#####################################

def create_results_folder(results_folder, args):
    for dataset in args['datasets']:
        folder_path = os.path.join(results_folder, dataset)
        os.makedirs(folder_path, exist_ok=True)


def get_config(task):
    args = {}

    # Model selection
    # Options are 'all', 'mlp', 'lr', 'rf', 'nam', 'kan', 'kan_gam'
    models = 'all'
    if models == 'all':
        args['models'] = ['mlp', 'lr', 'rf', 'nam', 'kan', 'kan_gam']
    else:
        args['models'] = [models]

    # Dataset selection
    # Options are 'all', 'heart', 'diabetes_h', 'diabetes_130', 'obesity', 'obesity_bin', 'breast_cancer'
    datasets = 'all'
    if datasets == 'all':
        args['datasets'] = ['heart', 'diabetes_h', 'diabetes_130', 'obesity', 'obesity_bin', 'breast_cancer']
    else:
        args['datasets'] = [datasets]

    # Set the path to the results folder
    base_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    args['base_folder'] = base_folder
    args['data_folder'] = os.path.join(base_folder, 'data')
    results_folder = 'results_performance' if task == 'performance' else 'results_interpretability'
    args['results_folder'] = os.path.join(base_folder, results_folder)

    # Training parameters
    args['train'] = False  # If True, train the models. If False, load the models from disk
    args['n_folds'] = 3  # Number of folds for cross-validation
    args['n_jobs'] = 1  # Number of jobs to run in parallel

    # Interpretability parameters (representation info)
    args['n_dists'] = 5  # Representation parameter: threshold on the number of closest patients to show
    args['max_atribs_radar'] = 10  # Max # of attributes to show in the radar plot, filtered by variance
    args['max_plot_curves'] = 5  # Max # of curves to plot in the partial dependence plots
    args['max_pats_to_save'] = 5  # Max # of patients to save in the patients folder

    return args


#####################################
###    RESULTS REPRESENTATION     ###
#####################################

def get_results_table(args):
    table_col_names = ['Dataset', 'Model', 'Accuracy', 'ROC-AUC', 'F1-Score', 'Precision', 'Recall', 'Time']
    results_table = []
    results_table_no_ci = []
    for dataset in args['datasets']:
        models = args['models'].copy()
        if dataset not in ['heart', 'obesity_bin', 'breast_cancer']:
            models.remove('nam')
        for model_name in models:
            # Load the metrics
            with open(os.path.join(args['results_folder'], dataset, model_name + '.pkl'), 'rb') as f:
                metrics = pickle.load(f)
            # Format each metric as "mean (lower CI, upper CI)", keeping the raw means separately
            no_ci_metrics = {'accuracy': 0.0, 'roc_auc': 0.0, 'f1': 0.0, 'precision': 0.0, 'recall': 0.0}
            for key in ['accuracy', 'roc_auc', 'f1', 'precision', 'recall']:
                # metrics[key] = np.round(metrics[key], 2)
                no_ci_metrics[key] = metrics[key]['mean']
                metrics[key] = str(np.round(metrics[key]['mean'], 2)) + " (" + str(
                    np.round(metrics[key]['CI_95%'][0], 3)) + ", " + str(np.round(metrics[key]['CI_95%'][1], 3)) + ")"
            # Add the average time per configuration to the results (total time divided by the product of the
            # sizes of the parameter lists returned by get_model)
            _, p = get_model(model_name)
            param_counts = [len(p[key]) for key in p.keys()]
            avg_time = metrics['time'] / np.prod(param_counts)
            # print(f"Dataset: {dataset}, model: {model_name}, metrics: {metrics}")
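            # NOTE (assumption, for reference only): the rows appended below rely on the metrics pickle having
            # one entry per metric plus a total 'time', e.g. (illustrative values, not from this project):
            #   {'accuracy': {'mean': 0.87, 'CI_95%': (0.84, 0.90)}, ..., 'time': 123.4}
            # The actual schema is defined by the training scripts, not by this file.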
            results_table.append([dataset, model_name, metrics['accuracy'], metrics['roc_auc'], metrics['f1'],
                                  metrics['precision'], metrics['recall'], avg_time])
            results_table_no_ci.append([dataset, model_name, no_ci_metrics['accuracy'], no_ci_metrics['roc_auc'],
                                        no_ci_metrics['f1'], no_ci_metrics['precision'], no_ci_metrics['recall'],
                                        avg_time])
    print('\n\n---------------Performance results---------------\n')
    # print(tabulate(results_table, headers=table_col_names, tablefmt='latex', floatfmt=".2f"))
    print(tabulate(results_table, headers=table_col_names, floatfmt=".2f"))
    return results_table, table_col_names, results_table_no_ci


def friedman_test(all_data, comp_index, alpha, higher_is_better):
    """
    Perform the Friedman test on the provided data. Based on Demsar06.
    :param all_data: 2D numpy array of shape (n_methods, n_datasets) where each row is a method and each column is
        a dataset (or fold).
    :param comp_index: index of the method to use as baseline for the post-hoc tests, as in Demsar06. It must be the
        best performing method.
    :param alpha: significance level for the test.
    :param higher_is_better: if True, higher values are better; otherwise, lower values are better.
    :return: Friedman test p-value, Iman-Davenport p-value, and the post-hoc p-values of each method against the
        baseline (uncorrected and Holm-corrected).
    """
    # Check that comp_index gives the best performing method (double check just in case...)
    avg_performance = np.mean(all_data, axis=1)  # Average performance across datasets for each method
    if higher_is_better:
        assert comp_index == np.argmax(avg_performance), "comp_index must be the index of the best performing method."
    else:
        assert comp_index == np.argmin(avg_performance), "comp_index must be the index of the best performing method."
    # Manual implementation of the Friedman test--to compute post-hoc metrics later on
    n_methods, n_reps = all_data.shape
    ranking_matrix = np.zeros_like(all_data)
    for k in range(n_reps):  # Rank the methods for each dataset/fold
        if higher_is_better:
            ranking_matrix[:, k] = stats.rankdata(-all_data[:, k], method='average')  # Average ranks for ties
        else:
            ranking_matrix[:, k] = stats.rankdata(all_data[:, k], method='average')  # Average ranks for ties
    # Calculate the Friedman test statistic
    average_rank = np.mean(ranking_matrix, axis=1)
    friedman_stat = (12 * n_reps / (n_methods * (n_methods + 1))) * (
            np.sum(np.square(average_rank)) - (n_methods * (n_methods + 1) ** 2 / 4))  # Friedman test statistic
    friedman_p_value = stats.chi2.sf(friedman_stat, df=n_methods - 1)  # p-value for the Friedman test
    # Iman-Davenport correction of the Friedman statistic (Demsar06), F-distributed with
    # (n_methods - 1, (n_methods - 1) * (n_reps - 1)) degrees of freedom
    davenport_stat = (n_reps - 1) * friedman_stat / (n_reps * (n_methods - 1) - friedman_stat)
    davenport_p_value = stats.f.sf(davenport_stat, dfn=n_methods - 1, dfd=(n_methods - 1) * (n_reps - 1))
    # If we reject, we can perform post-hoc tests here.
    # TODO: Unsure if this is OK, need to account for higher is better in the p-values!!
    z_stat = np.zeros(n_methods)
    for j in range(n_methods):
        z_stat[j] = (average_rank[comp_index] - average_rank[j]) / np.sqrt(
            (n_methods * (n_methods + 1)) / (6 * n_reps))  # Z-statistic for post-hoc tests
    p_values_post_hoc = stats.norm.cdf(z_stat)
    _, p_values_adjusted_post_hoc, _, _ = multitest.multipletests(p_values_post_hoc, alpha=alpha,
                                                                  method='holm')  # Holm-Bonferroni correction
    return friedman_p_value, davenport_p_value, p_values_post_hoc, p_values_adjusted_post_hoc
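
# Illustrative usage of friedman_test (hypothetical scores, not taken from this project): rows are methods,
# columns are datasets/folds, and comp_index must point to the best method under the chosen direction
# (here method 0, since higher is better).
#   scores = np.array([[0.90, 0.85, 0.88, 0.91],
#                      [0.82, 0.79, 0.81, 0.84],
#                      [0.70, 0.72, 0.69, 0.71]])
#   friedman_p, davenport_p, post_hoc_unc, post_hoc_corr = friedman_test(
#       scores, comp_index=0, alpha=0.05, higher_is_better=True)
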
def get_p_values_from_table_data(data, alpha=0.05, higher_is_better=True, output_latex=False, list_of_methods=None,
                                 list_of_metrics=None):
    """
    Get p-values from a table of data in a structured way, automatically comparing each method with the best method
    for each metric.
    :param data: numpy array organized as methods_to_compare x metrics x datasets/folds. Note that all datasets/folds
        need to have the same ordering: we use paired tests!!
    :param alpha: float, significance level for the hypothesis tests.
    :param higher_is_better: bool or list of bool, if True, higher values are better; otherwise, lower values are better.
    :param output_latex: bool, if True, outputs the tables in LaTeX format, to copy and paste into a LaTeX document.
    :param list_of_methods: list of method names; if None, uses default names.
    :param list_of_metrics: list of metric names; if None, uses default names.
    :return: nothing is returned; p-value tables comparing each method to the best (baseline) method are printed.
    """
    assert isinstance(data, np.ndarray), "Data must be a numpy array."
    assert data.ndim == 3, "Data must be a 3D numpy array with shape (n_methods, n_metrics, n_reps)."
    n_methods, n_metrics, n_reps = data.shape
    average_results = np.nanmean(data, axis=2)  # Average over repetitions: array of shape (n_methods, n_metrics)
    if list_of_methods is None:
        list_of_methods = [f'Method {i + 1}' for i in range(data.shape[0])]
    if list_of_metrics is None:
        list_of_metrics = [f'Metric {i + 1}' for i in range(data.shape[1])]
    if not isinstance(higher_is_better, bool):
        assert len(higher_is_better) == n_metrics, \
            "If higher_is_better is a list, it must have the same length as the number of metrics."
    else:
        higher_is_better = [higher_is_better] * data.shape[1]  # If it's a single bool, replicate it for all metrics
    max_idxs = np.argmax(average_results, axis=0)
    min_idxs = np.argmin(average_results, axis=0)
    comp_index = [max_idxs[i] if higher_is_better[i] else min_idxs[i] for i in range(n_metrics)]
    for i in range(n_metrics):
        # Print the data for complete reference
        print(f'\n---------------Data for metric {list_of_metrics[i]}, '
              f'where higher_is_better is {higher_is_better[i]}---------------')
        for j in range(n_methods):
            vals_to_show = data[j, i, :]
            vals_to_show_str = ', '.join([f"{v:.3f}" if not np.isnan(v) else "nan" for v in vals_to_show])
            print(f'{list_of_methods[j]}: [{vals_to_show_str}] / avg: {np.nanmean(data[j, i, :]):.3f}')
        table_metrics = ['Average metric'] + [f"{np.nanmean(data[j, i, :]):.3f}" for j in range(n_methods)]
        # First method: use paired Wilcoxon signed-rank tests to obtain p-values, and correct them using the
        # Holm-Bonferroni method. This is done per metric, so if we have many metrics, we will have many p-values.
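        # Note on pairing (caveat, not enforced here): the Wilcoxon tests below are paired, so dropping nan values
        # independently for the baseline and for each method only preserves the pairing if the nans occur in the
        # same folds for every method; otherwise the fold-wise alignment assumed by the test is lost.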
        baseline_values = data[comp_index[i], i, :]  # Baseline values for the metric
        # Remove nan values from baseline_values
        baseline_values = baseline_values[~np.isnan(baseline_values)]
        p_values = []
        for j in range(n_methods):
            test_values = data[j, i, :]  # Test values for the metric
            # Remove nan values from test_values
            test_values = test_values[~np.isnan(test_values)]
            # If the lengths of baseline_values and test_values are different, we need to remove the corresponding
            # values from both
            # min_length = min(len(baseline_values), len(test_values))
            if comp_index[i] == j:
                # If we are comparing the baseline method with itself, we skip this comparison, as the Wilcoxon test
                # would throw an error
                p_values.append(1.0)  # No difference, p-value is 1
                continue
            if higher_is_better[i]:
                # If higher is better, we test whether the test values are significantly lower than the baseline
                # values (i.e., significantly worse)
                _, p_value = stats.wilcoxon(test_values, baseline_values, alternative='less')
            else:
                # If lower is better, we test whether the test values are significantly higher than the baseline
                # values (i.e., significantly worse)
                _, p_value = stats.wilcoxon(test_values, baseline_values, alternative='greater')
            p_values.append(p_value)
        # Apply Holm-Bonferroni correction
        print('\n')
        p_values = np.array(p_values)
        _, corrected_p_vals, _, _ = multitest.multipletests(p_values, alpha=alpha, method='holm')
        # Prepare a table to store all data for this metric
        table_wilcoxon_corr = ['Paired Wilcoxon tests (corrected)']
        table_wilcoxon_unc = ['Paired Wilcoxon tests (uncorrected)']
        for j in range(n_methods):
            p_val_str = f"{corrected_p_vals[j]:.3f}" if corrected_p_vals[j] >= 1e-3 else "<1e-3"  # Format small p-values
            if corrected_p_vals[j] >= alpha:
                p_val_str += '*'  # Mark values not significantly worse than the baseline
            if j == comp_index[i]:
                p_val_str += ' (baseline)'  # Mark the baseline method
            table_wilcoxon_corr.append(p_val_str)
            p_val_str = f"{p_values[j]:.3f}" if p_values[j] >= 1e-3 else "<1e-3"  # Format small p-values
            if p_values[j] >= alpha:
                p_val_str += '*'  # Mark values not significantly worse than the baseline
            if j == comp_index[i]:
                p_val_str += ' (baseline)'  # Mark the baseline method
            table_wilcoxon_unc.append(p_val_str)
        # Second method: the Friedman test, a non-parametric test for repeated measures, applied here per metric
        # (the version on all metrics at once is run at the end of this function). Blocks = datasets/folds,
        # treatments = methods. We rely on Demsar06 for this implementation.
        fr_data = data[:, i, :]
        # Remove columns (folds) with nan values
        fr_data = fr_data[:, ~np.isnan(fr_data).any(axis=0)]
        friedman_p_value, davenport_p_value, p_values_post_hoc_unc, p_values_post_hoc_corr = friedman_test(
            fr_data, comp_index[i], alpha, higher_is_better[i])
        # Prepare this for the table
        friedman_post_hoc_table_corr = ['Friedman post-hoc tests (corrected)']
        friedman_post_hoc_table_unc = ['Friedman post-hoc tests (uncorrected)']
        for j in range(n_methods):
            p_val_str = f"{p_values_post_hoc_corr[j]:.3f}" if p_values_post_hoc_corr[j] >= 1e-3 else "<1e-3"  # Format small p-values
            if p_values_post_hoc_corr[j] >= alpha:
                p_val_str += '*'  # Mark values not significantly worse than the baseline
            if j == comp_index[i]:
                p_val_str += ' (baseline)'  # Mark the baseline method
            friedman_post_hoc_table_corr.append(p_val_str)
            p_val_str = f"{p_values_post_hoc_unc[j]:.3f}" if p_values_post_hoc_unc[j] >= 1e-3 else "<1e-3"  # Format small p-values
            if p_values_post_hoc_unc[j] >= alpha:
                p_val_str += '*'  # Mark values not significantly worse than the baseline
            if j == comp_index[i]:
                p_val_str += ' (baseline)'  # Mark the baseline method
            friedman_post_hoc_table_unc.append(p_val_str)
        friedman_p_value_str = "<1e-3" if friedman_p_value < 1e-3 else f"{friedman_p_value:.3f}"  # Format small p-values
        davenport_p_value_str = "<1e-3" if davenport_p_value < 1e-3 else f"{davenport_p_value:.3f}"  # Format small p-values
        print(f'Friedman p-value: {friedman_p_value_str}, Davenport p-value: {davenport_p_value_str} '
              f'for metric {list_of_metrics[i]}')
        if n_reps <= 10 or n_methods <= 5:
            print('Since the number of data points is small, the Friedman test may not be reliable. '
                  'Consider using a larger dataset or a different test.')
        table_data = [table_metrics, table_wilcoxon_unc, table_wilcoxon_corr, friedman_post_hoc_table_unc,
                      friedman_post_hoc_table_corr]
        if output_latex:
            print(tabulate(table_data, headers=[f'Metric {list_of_metrics[i]}'] + list_of_methods, tablefmt='latex'))
        else:
            print(tabulate(table_data, headers=[f'Metric {list_of_metrics[i]}'] + list_of_methods, tablefmt='grid'))

    # Finally, run a Friedman test on all metrics at once
    all_data = data.copy()
    # For all metrics where lower is better, we need to invert the data so that higher is better always
    for j in range(n_metrics):
        if not higher_is_better[j]:
            all_data[:, j, :] = -all_data[:, j, :]  # Invert the data for lower-is-better metrics
    # Now, reshape the data to have shape (n_methods, n_metrics * n_reps)
    all_data = all_data.reshape(n_methods, n_metrics * n_reps)
    # Remove columns with nan values, mirroring the per-metric case above (otherwise the test breaks on nan)
    all_data = all_data[:, ~np.isnan(all_data).any(axis=0)]
    avg_metrics = np.nanmean(all_data, axis=1)  # Average over repetitions and metrics
    best_method = np.argmax(avg_metrics)  # Best method across all metrics (remember, higher is better now!)
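    # Note: since friedman_test ranks the methods independently within each column, every (metric, fold) pair acts
    # as one block and the different scales of the metrics do not affect the ranks; only the sign flip above is
    # needed so that "higher is better" holds for every column.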
    friedman_p_value, davenport_p_value, p_values_post_hoc_unc, p_values_post_hoc_corr = friedman_test(
        all_data, best_method, alpha, higher_is_better=True)
    print(f'\n\n-------Friedman test on all metrics-------\n'
          f'p-value: {friedman_p_value:.4f}, Davenport p-value: {davenport_p_value:.4f}')
    # Prepare this for the table
    friedman_post_hoc_table_unc = ['Friedman post-hoc tests (all metrics, uncorrected)']
    friedman_post_hoc_table_corr = ['Friedman post-hoc tests (all metrics, corrected)']
    for j in range(n_methods):
        p_val_str = f"{p_values_post_hoc_corr[j]:.3f}" if p_values_post_hoc_corr[j] >= 1e-3 else "<1e-3"  # Format small p-values
        if p_values_post_hoc_corr[j] >= alpha:
            p_val_str += '*'  # Mark values not significantly worse than the baseline
        if j == best_method:
            p_val_str += ' (baseline)'  # Mark the baseline method
        friedman_post_hoc_table_corr.append(p_val_str)
        p_val_str = f"{p_values_post_hoc_unc[j]:.3f}" if p_values_post_hoc_unc[j] >= 1e-3 else "<1e-3"  # Format small p-values
        if p_values_post_hoc_unc[j] >= alpha:
            p_val_str += '*'  # Mark values not significantly worse than the baseline
        if j == best_method:
            p_val_str += ' (baseline)'  # Mark the baseline method
        friedman_post_hoc_table_unc.append(p_val_str)
    table_data = [friedman_post_hoc_table_unc, friedman_post_hoc_table_corr]
    if output_latex:
        print(tabulate(table_data, headers=['All metrics'] + list_of_methods, tablefmt='latex'))
    else:
        print(tabulate(table_data, headers=['All metrics'] + list_of_methods, tablefmt='grid'))


#####################################
###        INTERPRETABILITY       ###
#####################################