Spaces:

marElizo
/

classification_with_kans

Sleeping

File size: 9,688 Bytes

c52261f

# Author: Juan Parras & Patricia A. Apellániz
# Email: patricia.alonsod@upm.es
# Date: 05/08/2025

# Package imports
from kan import *
from scipy.stats import t
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from src.models.models import Mlp_model, LogisticRegressionModel, RandomForestModel, Kan_model, NAMModel


def get_metrics(y_true, y_pred, y_proba):
    """Compute accuracy, precision, recall, F1 and ROC AUC for a set of predictions.

    All three inputs are passed through ``np.squeeze`` first. The problem is
    treated as binary when the squeezed ``y_proba`` is one-dimensional or has
    at most two columns; otherwise the multiclass variants of the metrics
    (weighted averaging, one-vs-one ROC AUC) are used.

    :param y_true: ground-truth labels.
    :param y_pred: predicted labels.
    :param y_proba: predicted scores/probabilities, either one value per sample
        or one column per class.
    :return: dict with keys 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'.
    """
    y_true, y_pred, y_proba = (np.squeeze(arr) for arr in (y_true, y_pred, y_proba))

    # Decide between the binary and the multiclass metric variants
    if y_proba.ndim == 1:
        is_binary = True
    else:
        n_columns = y_proba.shape[1]
        is_binary = n_columns <= 2
        if n_columns == 2:
            # Keep only the second column (presumably the positive-class probability)
            y_proba = y_proba[:, 1]

    # Extra keyword arguments that only apply to the multiclass case
    if is_binary:
        averaging = {}
        auc_options = {}
    else:
        averaging = {'average': 'weighted'}
        auc_options = {'average': 'weighted', 'multi_class': 'ovo'}

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0, **averaging),
        'recall': recall_score(y_true, y_pred, zero_division=0, **averaging),
        'f1': f1_score(y_true, y_pred, zero_division=0, **averaging),
        'roc_auc': roc_auc_score(y_true, y_proba, **auc_options),
    }


def get_bootstrap_metrics(y_true, y_pred, y_proba, n_bootstrap=1000, ci=95):
    """Estimate classification metrics and confidence intervals by bootstrapping.

    The samples are resampled with replacement ``n_bootstrap`` times. A
    replicate is discarded when it does not contain every class present in
    ``y_true``; the remaining replicates are scored with ``get_metrics``.

    The reported interval is a Student-t interval built from the standard
    error of the mean of the replicate scores, i.e.
    ``mean +/- t * std / sqrt(n_valid)``, where ``n_valid`` is the number of
    replicates that were actually scored.

    :param y_true: ground-truth labels (array-like).
    :param y_pred: predicted labels (array-like).
    :param y_proba: predicted scores/probabilities (array-like).
    :param n_bootstrap: number of bootstrap replicates to draw.
    :param ci: confidence level in percent (e.g. 95).
    :return: dict mapping each metric name to
        ``{'mean': float, f'CI_{ci}%': (lower, upper)}``.
    :raises ValueError: if no replicate contained all the original classes.
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_proba = np.array(y_proba)
    original_classes = np.unique(y_true)
    n_samples = len(y_true)
    metrics_list = []

    for _ in range(n_bootstrap):
        indices = np.random.choice(n_samples, n_samples, replace=True)
        y_true_sample = y_true[indices]

        # Skip replicates where a class is missing from the resampled labels
        if not np.all(np.isin(original_classes, np.unique(y_true_sample))):
            continue

        metrics_list.append(get_metrics(y_true_sample, y_pred[indices], y_proba[indices]))

    # Statistics must be based on the replicates that were kept, not on the
    # number that were requested, since some may have been skipped above.
    n_valid = len(metrics_list)
    if n_valid == 0:
        raise ValueError(
            f"None of the {n_bootstrap} bootstrap samples contained all "
            f"{len(original_classes)} classes; cannot compute metrics"
        )

    # Calculate mean and confidence intervals
    alpha = 1 - ci / 100
    t_val = t.ppf(1 - alpha / 2, df=n_valid - 1)
    metrics_with_ci = {}

    for name in metrics_list[0]:
        values = np.array([replicate[name] for replicate in metrics_list])
        mean = np.mean(values)
        std_err = np.std(values, ddof=1) / np.sqrt(n_valid)
        ci_range = t_val * std_err
        metrics_with_ci[name] = {
            'mean': mean,
            f'CI_{ci}%': (mean - ci_range, mean + ci_range)
        }

    return metrics_with_ci


def get_params(model_name, default):
    """Return the hyperparameter grid associated with a model name.

    :param model_name: one of 'mlp', 'lr', 'rf', 'kan', 'kan_gam' or 'nam'.
    :param default: when truthy, every list of candidates is collapsed to its
        first element, yielding a single configuration instead of a grid.
    :return: dict mapping each hyperparameter name to its list of candidate
        values (or to a single value when ``default`` is truthy).
    :raises ValueError: if ``model_name`` is not one of the names above.
    """
    if model_name == 'mlp':
        # Multi-layer perceptron grid
        grid = dict(
            hidden_layer_sizes=[(32,), (64,), (128,), (256,)],  # neurons per hidden layer
            max_iter=[10000],  # upper bound on training iterations
            early_stopping=[True],  # stop when the validation score stops improving
            alpha=[0.0001, 0.001],  # L2 regularization strength
        )

    elif model_name == 'lr':
        # Logistic regression grid
        grid = dict(
            C=[0.1],  # inverse regularization strength (smaller = stronger)
            penalty=['l2', 'l1'],  # regularization type
            solver=['liblinear'],  # optimization solver
            max_iter=[1000],  # upper bound on solver iterations
            class_weight=['balanced', None],  # optional class re-weighting
            random_state=[0],  # random seed
        )

    elif model_name == 'rf':
        # Random forest grid
        grid = dict(
            n_estimators=[20, 50],  # number of trees
            criterion=['gini'],  # split quality measure
            max_depth=[10, 20],  # maximum tree depth
            min_samples_split=[2],  # samples needed to split an internal node
            min_samples_leaf=[1, 5],  # samples needed at a leaf
            class_weight=['balanced', None],  # optional class re-weighting
            max_features=['log2'],  # features considered per split
            bootstrap=[True],  # build trees on bootstrap samples
            random_state=[0],  # random seed
            n_jobs=[1],  # parallel jobs for fit and predict
        )

    elif model_name in ('kan', 'kan_gam'):
        # KAN grid; the GAM variant is restricted to no hidden layer
        gam_variant = model_name == 'kan_gam'
        grid = dict(
            hidden_dim=[0] if gam_variant else [0, 5, [5, 5]],  # hidden layer dimensions
            batch_size=[-1],  # -1 means use all samples in every step
            grid=[1, 3, 5],  # number of grid points in the input space
            k=[1, 3, 5],  # polynomial order of the splines
            seed=[0],  # random seed
            lr=[0.001],  # learning rate
            early_stop=[True],  # stop when the validation score stops improving
            steps=[10000],  # number of training steps
            lamb=[0.1, 0.01, 0.001],  # regularization strength
            lamb_entropy=[0.1],  # regularization strength of the entropy term
            weight=[True, False],  # whether to weight classes for balancing
            sparse_init=[True, False],  # whether to use sparse initialization
            mult_kan=[False],  # multiplication nodes are disabled for both variants
        )

    elif model_name == 'nam':
        # Neural additive model grid
        grid = dict(
            num_epochs=[1000],
            num_learners=[10, 20],
            metric=['aucroc'],
            early_stop_mode=['max'],
            n_jobs=[1],
            random_state=[0],
            num_basis_functions=[32, 64, 128],
            hidden_size=[[64, 32], [128, 64]],
        )

    else:
        raise ValueError(f"Model name {model_name} not recognized")

    # A default configuration is the first candidate of every hyperparameter
    if default:
        grid = {name: candidates[0] for name, candidates in grid.items()}

    return grid


def get_model(model_name, default=False):
    """Instantiate the model wrapper for ``model_name`` together with its parameters.

    :param model_name: one of 'mlp', 'lr', 'rf', 'kan', 'kan_gam' or 'nam'.
    :param default: forwarded to ``get_params``; selects a single default
        configuration instead of the full hyperparameter grid.
    :return: tuple ``(model, params)`` with a freshly constructed model object
        and the dict returned by ``get_params``.
    :raises ValueError: if ``model_name`` is not a known model.
    """
    # get_params validates the name first and raises for unknown models
    params = get_params(model_name, default)

    # Both KAN variants share the same wrapper class
    model_classes = {
        'mlp': Mlp_model,
        'lr': LogisticRegressionModel,
        'rf': RandomForestModel,
        'kan': Kan_model,
        'kan_gam': Kan_model,
        'nam': NAMModel,
    }
    model_class = model_classes.get(model_name)
    if model_class is None:
        raise ValueError(f"Model name {model_name} not found")

    return model_class(), params


def get_best_params(model_name, x_train, y_train, args):
    """Run a cross-validated grid search and return the best configuration.

    The number of distinct values in ``y_train`` selects the scoring set:
    weighted F1 / one-vs-one ROC AUC / weighted recall for more than two
    classes, plain F1 / ROC AUC / recall otherwise. The search is refit on the
    F1 variant. Failing fits are scored as 0.0 rather than raising.

    :param model_name: model identifier understood by ``get_model``.
    :param x_train: training features.
    :param y_train: training labels; must provide a ``squeeze()`` method.
    :param args: mapping with the keys 'n_jobs' (parallel jobs for the search)
        and 'n_folds' (number of cross-validation splits).
    :return: tuple ``(best_params, best_estimator)``, or ``(None, None)`` for
        the 'nam' model on a problem with more than two classes.
    """
    n_jobs = args['n_jobs']
    n_folds = args['n_folds']
    is_multiclass = len(np.unique(y_train)) > 2

    # NAM does not support multiclass problems
    if is_multiclass and model_name == 'nam':
        return None, None

    estimator, param_grid = get_model(model_name, default=False)

    # Shuffled K-fold with a fixed seed for the inner tuning loop
    inner_cv = KFold(n_splits=n_folds, shuffle=True, random_state=0)

    # The first scorer of each list is the one used to refit the best model
    if is_multiclass:
        scorers = ['f1_weighted', 'roc_auc_ovo_weighted', 'recall_weighted']
    else:
        scorers = ['f1', 'roc_auc', 'recall']

    search = GridSearchCV(estimator,
                          param_grid,
                          scoring=scorers,
                          refit=scorers[0],
                          cv=inner_cv,
                          n_jobs=n_jobs,
                          error_score=0.0,
                          verbose=4)

    # Execute search
    fitted_search = search.fit(x_train, y_train.squeeze())

    return fitted_search.best_params_, fitted_search.best_estimator_