# Author: Juan Parras & Patricia A. Apellániz
# Email: patricia.alonsod@upm.es
# Date: 05/08/2025

# Package imports
import numpy as np  # explicit import: np was previously only reachable via the wildcard import below
from kan import *
from scipy.stats import t
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from src.models.models import Mlp_model, LogisticRegressionModel, RandomForestModel, Kan_model, NAMModel


def get_metrics(y_true, y_pred, y_proba):
    """Compute standard classification metrics for binary or multiclass data.

    Args:
        y_true: Ground-truth labels (array-like; squeezed to 1-D).
        y_pred: Predicted labels (array-like; squeezed to 1-D).
        y_proba: Predicted probabilities. 1-D or a 2-column array is treated
            as binary (the positive-class column is kept); more than two
            columns is treated as multiclass.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'.
        Multiclass metrics use weighted averaging and one-vs-one ROC-AUC.
    """
    y_true = np.squeeze(y_true)
    y_pred = np.squeeze(y_pred)
    y_proba = np.squeeze(y_proba)

    # Decide binary vs multiclass from the shape of the probability array
    binary = False
    if len(y_proba.shape) == 1:
        binary = True
    elif y_proba.shape[1] <= 2:
        binary = True
        if y_proba.shape[1] == 2:
            # roc_auc_score expects only the positive-class probability in the binary case
            y_proba = y_proba[:, 1]

    if binary:  # Binary classification
        return {'accuracy': accuracy_score(y_true, y_pred),
                'precision': precision_score(y_true, y_pred, zero_division=0),
                'recall': recall_score(y_true, y_pred, zero_division=0),
                'f1': f1_score(y_true, y_pred, zero_division=0),
                'roc_auc': roc_auc_score(y_true, y_proba)}
    else:  # Multiclass classification
        return {'accuracy': accuracy_score(y_true, y_pred),
                'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
                'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
                'f1': f1_score(y_true, y_pred, average='weighted', zero_division=0),
                'roc_auc': roc_auc_score(y_true, y_proba, average='weighted', multi_class='ovo')}


def get_bootstrap_metrics(y_true, y_pred, y_proba, n_bootstrap=1000, ci=95):
    """Estimate metric means and t-based confidence intervals via bootstrap.

    Draws ``n_bootstrap`` resamples with replacement. Resamples that do not
    contain every class present in ``y_true`` are skipped (the metrics would
    be undefined), so the effective number of samples may be smaller than
    ``n_bootstrap``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        y_proba: Predicted probabilities, aligned with ``y_true``.
        n_bootstrap: Number of bootstrap resamples to attempt.
        ci: Confidence level in percent (e.g. 95).

    Returns:
        dict mapping each metric name to {'mean': float,
        'CI_<ci>%': (lower, upper)}.

    Raises:
        ValueError: if every bootstrap resample was skipped.
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_proba = np.array(y_proba)
    original_classes = np.unique(y_true)

    metrics_list = []
    for _ in range(n_bootstrap):
        indices = np.random.choice(len(y_true), len(y_true), replace=True)
        y_true_sample = y_true[indices]
        # Skip this resample if not all classes are present
        if not np.all(np.isin(original_classes, np.unique(y_true_sample))):
            continue
        metrics_list.append(get_metrics(y_true_sample, y_pred[indices], y_proba[indices]))

    if not metrics_list:
        # BUG FIX: previously this fell through to metrics_list[0] and raised
        # an opaque IndexError when every resample was skipped.
        raise ValueError('All bootstrap resamples were skipped: could not draw '
                         'a resample containing every class in y_true')

    # Group metric values by name
    all_metrics = {k: [] for k in metrics_list[0]}
    for m in metrics_list:
        for k, v in m.items():
            all_metrics[k].append(v)

    # Mean and t-based confidence interval per metric.
    # BUG FIX: use the number of resamples actually kept (len(metrics_list))
    # for the degrees of freedom and the standard error, not n_bootstrap --
    # skipped resamples reduce the sample size.
    n_eff = len(metrics_list)
    alpha = 1 - ci / 100
    t_val = t.ppf(1 - alpha / 2, df=n_eff - 1)
    metrics_with_ci = {}
    for k, values in all_metrics.items():
        values = np.array(values)
        mean = np.mean(values)
        std_err = np.std(values, ddof=1) / np.sqrt(n_eff)
        ci_range = t_val * std_err
        metrics_with_ci[k] = {
            'mean': mean,
            f'CI_{ci}%': (mean - ci_range, mean + ci_range)
        }
    return metrics_with_ci


def get_params(model_name, default):
    """Return the hyperparameter grid (or default values) for a model.

    Args:
        model_name: One of 'mlp', 'lr', 'rf', 'kan', 'kan_gam', 'nam'.
        default: When True, collapse each grid entry to its first value so the
            result is a single default configuration rather than a search grid.

    Returns:
        dict of hyperparameter name -> list of candidate values (or a single
        value per name when ``default`` is True).

    Raises:
        ValueError: if ``model_name`` is not recognized.
    """
    if model_name == 'mlp':
        # MLP model parameters
        params = {'hidden_layer_sizes': [(32,), (64,), (128,), (256,)],  # the number of neurons in the hidden layers
                  'max_iter': [10000],  # the maximum number of iterations
                  'early_stopping': [True],  # whether to use early stopping to terminate training when validation score is not improving
                  'alpha': [0.0001, 0.001],  # L2 penalty (regularization term) parameter
                  }
    elif model_name == 'lr':
        # LR model parameters
        params = {'C': [0.1],  # regularization strength; smaller values specify stronger regularization
                  'penalty': ['l2', 'l1'],  # type of regularization to use ('l1', 'l2', or none)
                  'solver': ['liblinear'],  # optimization solvers
                  'max_iter': [1000],  # maximum number of iterations for solvers
                  'class_weight': ['balanced', None],  # adjust weights inversely proportional to class frequencies
                  'random_state': [0],  # the seed used by the random number generator
                  }
    elif model_name == 'rf':
        # RF model parameters
        params = {'n_estimators': [20, 50],  # the number of trees in the forest
                  'criterion': ['gini'],  # the function to measure the quality of a split (default='gini')
                  'max_depth': [10, 20],  # the maximum depth of the tree
                  'min_samples_split': [2],  # the minimum number of samples required to split an internal node
                  'min_samples_leaf': [1, 5],  # the minimum number of samples required to be at a leaf node
                  'class_weight': ['balanced', None],
                  'max_features': ['log2'],  # the number of features to consider when looking for the best split
                  'bootstrap': [True],  # whether bootstrap samples are used when building trees (default=True)
                  'random_state': [0],  # the seed used by the random number generator (default=0)
                  'n_jobs': [1],  # the number of jobs to run in parallel for both fit and predict
                  }
    elif model_name in ['kan', 'kan_gam']:
        # KAN model parameters
        params = {'hidden_dim': [0, 5, [5, 5]],  # the dimension of the hidden layers
                  'batch_size': [-1],  # the number of samples to use for each training step (i.e., use all of them)
                  'grid': [1, 3, 5],  # the number of grid points in the input space
                  'k': [1, 3, 5],  # the polynomial order in the spline
                  'seed': [0],  # the seed used by the random number generator
                  'lr': [0.001],  # the learning rate
                  'early_stop': [True],  # whether to use early stopping to terminate training when validation score is not improving
                  'steps': [10000],  # the number of training steps
                  'lamb': [0.1, 0.01, 0.001],  # the regularization strength
                  'lamb_entropy': [0.1],  # the regularization strength for the entropy term
                  'weight': [True, False],  # whether to use the weight term (i.e., to balance the classes)
                  'sparse_init': [True, False],  # whether to use a sparse initialization
                  'mult_kan': [False],  # whether to use multiplication nodes in the KAN model
                  }
        if model_name == 'kan_gam':
            params['hidden_dim'] = [0]  # The hidden dimension is not used in the GAM version
            params['mult_kan'] = [False]  # The GAM version does not use multiplication nodes
    elif model_name == 'nam':
        # NAM model parameters
        params = {'num_epochs': [1000],
                  'num_learners': [10, 20],
                  'metric': ['aucroc'],
                  'early_stop_mode': ['max'],
                  'n_jobs': [1],
                  'random_state': [0],
                  'num_basis_functions': [32, 64, 128],
                  'hidden_size': [[64, 32], [128, 64]],
                  }
    else:
        raise ValueError(f"Model name {model_name} not recognized")

    # If default, select the first value of each parameter
    if default:
        for key in params.keys():
            params[key] = params[key][0]
    return params


def get_model(model_name, default=False):
    """Instantiate a model and return it with its hyperparameter grid.

    Args:
        model_name: One of 'mlp', 'lr', 'rf', 'kan', 'kan_gam', 'nam'.
        default: Forwarded to ``get_params``; when True the grid is collapsed
            to a single default configuration.

    Returns:
        (model_instance, params) tuple.

    Raises:
        ValueError: if ``model_name`` is not recognized.
    """
    params = get_params(model_name, default)
    if model_name == 'mlp':
        return Mlp_model(), params
    elif model_name == 'lr':
        return LogisticRegressionModel(), params
    elif model_name == 'rf':
        return RandomForestModel(), params
    elif model_name == 'kan' or model_name == 'kan_gam':
        return Kan_model(), params
    elif model_name == 'nam':
        return NAMModel(), params
    else:
        raise ValueError(f"Model name {model_name} not found")


def get_best_params(model_name, x_train, y_train, args):
    """Tune a model with grid-search cross-validation on the training data.

    Args:
        model_name: Model identifier accepted by ``get_model``.
        x_train: Training features.
        y_train: Training labels (squeezed before fitting).
        args: dict with 'n_jobs' (parallel workers for the search) and
            'n_folds' (inner CV splits).

    Returns:
        (best_params, best_estimator), or (None, None) when ``model_name`` is
        'nam' and the problem is multiclass (NAM does not support multiclass).
    """
    n_jobs = args['n_jobs']
    n_splits_cv = args['n_folds']
    n_classes = len(np.unique(y_train))

    if n_classes > 2 and model_name == 'nam':
        # NAM does not support multiclass problems
        return None, None

    model, hyperparameters = get_model(model_name, default=False)
    # Configure the cross-validation procedure for parameter tuning
    cv_inner = KFold(n_splits=n_splits_cv, shuffle=True, random_state=0)
    # Define search
    if n_classes > 2:  # Multiclass problem
        search = GridSearchCV(model, hyperparameters,
                              scoring=['f1_weighted', 'roc_auc_ovo_weighted', 'recall_weighted'],
                              refit='f1_weighted', cv=cv_inner, n_jobs=n_jobs,
                              error_score=0.0, verbose=4)  # Other option: roc_auc_ovo_weighted
    else:  # Binary problem
        search = GridSearchCV(model, hyperparameters,
                              scoring=['f1', 'roc_auc', 'recall'],
                              refit='f1', cv=cv_inner, n_jobs=n_jobs,
                              error_score=0.0, verbose=4)
    # Execute search
    result = search.fit(x_train, y_train.squeeze())
    return result.best_params_, result.best_estimator_