### Modified version of lazy predict from https://github.com/shankarpandala/lazypredict
import time
import warnings

import numpy as np
import pandas as pd
import xgboost
import lightgbm
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.utils import all_estimators
from sklearn.base import RegressorMixin
from sklearn.base import ClassifierMixin
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    roc_auc_score,
    f1_score,
    r2_score,
    mean_squared_error,
)

try:
    from utils import print_message
    from seed_utils import get_global_seed
except ImportError:
    from ..utils import print_message
    from ..seed_utils import get_global_seed

warnings.filterwarnings("ignore")
pd.set_option("display.precision", 2)
pd.set_option("display.float_format", lambda x: "%.2f" % x)

# Estimator names excluded from the classifier sweep. Reasons: they require
# extra constructor arguments (meta-estimators), are multi-output wrappers,
# or are too slow / memory-hungry for the large datasets this module targets.
removed_classifiers = [
    "ClassifierChain",
    "ComplementNB",
    "GradientBoostingClassifier",
    "GaussianProcessClassifier",
    "HistGradientBoostingClassifier",
    "MLPClassifier",
    "LogisticRegressionCV",
    "MultiOutputClassifier",
    "MultinomialNB",
    "OneVsOneClassifier",
    "OneVsRestClassifier",
    "OutputCodeClassifier",
    "RadiusNeighborsClassifier",
    "VotingClassifier",
    "CalibratedClassifierCV",
    "RidgeClassifierCV",
    "LinearSVC",
    "Perceptron",
    "SGDClassifier",
    # O(n²) memory models - too slow for large datasets
    "LabelPropagation",
    "LabelSpreading",
    "SVC",
    "NuSVC",
    # Sequential ensemble models - slow for large datasets
    "AdaBoostClassifier",
    "BaggingClassifier",
    # O(n×m) prediction time - slow for large test sets
    "KNeighborsClassifier",
    # Unbounded tree depth - very slow on high-dim data
    "DecisionTreeClassifier",
    "ExtraTreeClassifier",
    "ExtraTreesClassifier",
    # Fails on negative values after StandardScaler
    "CategoricalNB",
    # O(d²) or O(d³) - slow on high-dimensional data (4608 features)
    "LinearDiscriminantAnalysis",
    "QuadraticDiscriminantAnalysis",
    # Requires estimator argument
    "FixedThresholdClassifier",
    "TunedThresholdClassifierCV",
]

# Estimator names excluded from the regressor sweep, for analogous reasons
# (meta-estimators, multi-task variants, CV wrappers, or poor scaling).
removed_regressors = [
    "TheilSenRegressor",
    "ARDRegression",
    "CCA",
    "IsotonicRegression",
    "StackingRegressor",
    "MultiOutputRegressor",
    "MultiTaskElasticNet",
    "MultiTaskElasticNetCV",
    "MultiTaskLasso",
    "MultiTaskLassoCV",
    "PLSCanonical",
    "PLSRegression",
    "RadiusNeighborsRegressor",
    "RegressorChain",
    "VotingRegressor",
    "OrthogonalMatchingPursuitCV",
    "LassoLars",
    "LarsCV",
    "LassoCV",
    "RidgeCV",
    "LassoLarsCV",
    "ElasticNetCV",
    "LinearSVR",
    "LassoLarsIC",
    # Sequential ensemble models - slow for large datasets
    "AdaBoostRegressor",
    "BaggingRegressor",
    # O(n×m) prediction time - slow for large test sets
    "KNeighborsRegressor",
    # Unbounded tree depth - very slow on high-dim data
    "DecisionTreeRegressor",
    "ExtraTreeRegressor",
    "ExtraTreesRegressor",
]

# Tuple of (name, class) for every sklearn classifier not on the exclusion list.
CLASSIFIERS = [
    est
    for est in all_estimators()
    if (issubclass(est[1], ClassifierMixin) and (est[0] not in removed_classifiers))
]

CLASSIFIER_DICT = {model[0]: model[1] for model in CLASSIFIERS}

"""
CLASSIFIERS = [
'LogisticRegression',
'SVC',
'PassiveAggressiveClassifier',
'LabelPropagation',
'LabelSpreading',
'RandomForestClassifier',
'GradientBoostingClassifier',
'QuadraticDiscriminantAnalysis',
'HistGradientBoostingClassifier',
'RidgeClassifier',
'AdaBoostClassifier',
'ExtraTreesClassifier',
'KNeighborsClassifier',
'BaggingClassifier',
'BernoulliNB',
'LinearDiscriminantAnalysis',
'GaussianNB',
'NuSVC',
'DecisionTreeClassifier',
'NearestCentroid',
'ExtraTreeClassifier',
'CheckingClassifier',
'DummyClassifier'
]
"""

REGRESSORS = [
    est
    for est in all_estimators()
    if (issubclass(est[1], RegressorMixin) and (est[0] not in removed_regressors))
]

REGRESSOR_DICT = {model[0]: model[1] for model in REGRESSORS}

ALL_MODELS = CLASSIFIERS + REGRESSORS
ALL_MODEL_DICT = {model[0]: model[1] for model in ALL_MODELS}

"""
REGRESSORS = [
'ExtraTreesRegressor',
'Lasso',
'PassiveAggressiveRegressor',
'SGDRegressor',
'Ridge',
'BayesianRidge',
'TransformedTargetRegressor',
'LinearRegression',
'Lars',
'HuberRegressor',
'RandomForestRegressor',
'AdaBoostRegressor',
'LGBMRegressor',
'HistGradientBoostingRegressor',
'PoissonRegressor',
'ElasticNet',
'KNeighborsRegressor',
'OrthogonalMatchingPursuit',
'BaggingRegressor',
'GradientBoostingRegressor',
'TweedieRegressor',
'XGBRegressor',
'GammaRegressor',
'RANSACRegressor',
'ExtraTreeRegressor',
'NuSVR',
'SVR',
'DummyRegressor',
'DecisionTreeRegressor',
'GaussianProcessRegressor',
'MLPRegressor',
'KernelRidge'
]
"""

REGRESSORS.append(("XGBRegressor", xgboost.XGBRegressor))
REGRESSORS.append(("LGBMRegressor", lightgbm.LGBMRegressor))
# REGRESSORS.append(('CatBoostRegressor',catboost.CatBoostRegressor))

CLASSIFIERS.append(("XGBClassifier", xgboost.XGBClassifier))
CLASSIFIERS.append(("LGBMClassifier", lightgbm.LGBMClassifier))
# CLASSIFIERS.append(('CatBoostClassifier',catboost.CatBoostClassifier))

# Update dicts with XGB and LGBM
CLASSIFIER_DICT["XGBClassifier"] = xgboost.XGBClassifier
CLASSIFIER_DICT["LGBMClassifier"] = lightgbm.LGBMClassifier
REGRESSOR_DICT["XGBRegressor"] = xgboost.XGBRegressor
REGRESSOR_DICT["LGBMRegressor"] = lightgbm.LGBMRegressor
ALL_MODEL_DICT["XGBClassifier"] = xgboost.XGBClassifier
ALL_MODEL_DICT["LGBMClassifier"] = lightgbm.LGBMClassifier
ALL_MODEL_DICT["XGBRegressor"] = xgboost.XGBRegressor
ALL_MODEL_DICT["LGBMRegressor"] = lightgbm.LGBMRegressor

# Mean-impute then standardize numeric columns.
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
)

# Low-cardinality categoricals: constant-impute then one-hot encode.
categorical_transformer_low = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("encoding", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# High-cardinality categoricals: constant-impute then ordinal encode.
categorical_transformer_high = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        # 'OrdinalEncoder' raises a ValueError when it encounters an unknown value.
        # Check https://github.com/scikit-learn/scikit-learn/pull/13423
        ("encoding", OrdinalEncoder()),
    ]
)


# Helper function
def get_card_split(df, cols, n=11):
    """
    Splits categorical columns into 2 lists based on cardinality (i.e # of unique values)

    Parameters
    ----------
    df : Pandas DataFrame
        DataFrame from which the cardinality of the columns is calculated.
    cols : list-like
        Categorical columns to list
    n : int, optional (default=11)
        The value of 'n' will be used to split columns.

    Returns
    -------
    card_low : list-like
        Columns with cardinality <= n
    card_high : list-like
        Columns with cardinality > n
    """
    cond = df[cols].nunique() > n
    card_high = cols[cond]
    card_low = cols[~cond]
    return card_low, card_high


# Helper class for performing classification
class LazyClassifier:
    """
    This module helps in fitting to all the classification algorithms that are
    available in Scikit-learn

    Parameters
    ----------
    verbose : int, optional (default=0)
        For the liblinear and lbfgs solvers set verbose to any positive
        number for verbosity.
    ignore_warnings : bool, optional (default=True)
        When set to True, the warning related to algorithms that are not able
        to run are ignored.
    custom_metric : function, optional (default=None)
        When function is provided, models are evaluated based on the custom
        evaluation metric provided.
    predictions : bool, optional (default=False)
        When set to True, the predictions of all the models are returned as a
        dataframe.
    random_state : int, optional (default=None)
        Seed forwarded to every estimator that accepts ``random_state``.
        Falls back to the project-wide seed when not given.
    classifiers : list, optional (default="all")
        When a list of classifier classes is provided, trains only those.
    """

    def __init__(
        self,
        verbose=0,
        ignore_warnings=True,
        custom_metric=None,
        predictions=False,
        random_state=None,
        classifiers="all",
    ):
        self.verbose = verbose
        self.ignore_warnings = ignore_warnings
        self.custom_metric = custom_metric
        self.predictions = predictions
        self.models = {}
        # NOTE: use an explicit None check so random_state=0 is honored
        # (a bare `or` would silently replace 0 with the global seed).
        self.random_state = (
            random_state if random_state is not None else get_global_seed()
        )
        self.classifiers = classifiers

    def fit(self, X_train, X_test, y_train, y_test):
        """Fit Classification algorithms to X_train and y_train, predict and
        score on X_test, y_test.

        Parameters
        ----------
        X_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.
        X_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.
        y_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.
        y_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        Returns
        -------
        scores : Pandas DataFrame
            Returns metrics of all the models in a Pandas DataFrame.
        predictions : Pandas DataFrame
            Returns predictions of all the models in a Pandas DataFrame.
        """
        Accuracy = []
        B_Accuracy = []
        ROC_AUC = []
        F1 = []
        names = []
        TIME = []
        predictions = {}

        if self.custom_metric is not None:
            CUSTOM_METRIC = []

        if isinstance(X_train, np.ndarray):
            X_train = pd.DataFrame(X_train)
            X_test = pd.DataFrame(X_test)

        numeric_features = X_train.select_dtypes(include=[np.number]).columns
        categorical_features = X_train.select_dtypes(include=["object"]).columns

        categorical_low, categorical_high = get_card_split(
            X_train, categorical_features
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ("numeric", numeric_transformer, numeric_features),
                ("categorical_low", categorical_transformer_low, categorical_low),
                ("categorical_high", categorical_transformer_high, categorical_high),
            ]
        )

        # Precompute preprocessing once for all models (major optimization for large datasets)
        print_message("Preprocessing data once for all models...")
        preprocess_start = time.time()
        X_train_transformed = preprocessor.fit_transform(X_train)
        X_test_transformed = preprocessor.transform(X_test)
        print_message(f"Preprocessing completed in {time.time() - preprocess_start:.1f}s")

        if self.classifiers == "all":
            self.classifiers = CLASSIFIERS
        else:
            try:
                temp_list = []
                for classifier in self.classifiers:
                    full_name = (classifier.__name__, classifier)
                    temp_list.append(full_name)
                self.classifiers = temp_list
            except Exception as exception:
                print_message(exception)
                print_message("Invalid Classifier(s)")

        # Track failed models
        failed_models = []
        total_start = time.time()

        for name, model in tqdm(self.classifiers, desc="Training classifiers"):
            print_message(f"Starting {name}...")
            start = time.time()
            try:
                # Probe the default constructor once to discover supported kwargs.
                probe_params = model().get_params()
                model_kwargs = {}
                if "random_state" in probe_params:
                    model_kwargs["random_state"] = self.random_state
                # Enable parallelization for models that support it
                if "n_jobs" in probe_params:
                    model_kwargs["n_jobs"] = -1
                # Enable verbose for boosting models to show iteration progress
                if name in ("XGBClassifier", "LGBMClassifier"):
                    model_kwargs["verbose"] = 1

                # Train directly on preprocessed data (no Pipeline needed)
                clf = model(**model_kwargs)
                clf.fit(X_train_transformed, y_train)
                self.models[name] = clf
                y_pred = clf.predict(X_test_transformed)

                accuracy = accuracy_score(y_test, y_pred, normalize=True)
                b_accuracy = balanced_accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred, average="weighted")
                try:
                    # NOTE(review): scored on hard labels, not probabilities, so
                    # this only works for binary targets — multiclass raises and
                    # the score is recorded as None.
                    roc_auc = roc_auc_score(y_test, y_pred)
                except Exception as exception:
                    roc_auc = None
                    if self.ignore_warnings is False:
                        print_message("ROC AUC couldn't be calculated for " + name)
                        print_message(exception)

                fit_time = time.time() - start
                names.append(name)
                Accuracy.append(accuracy)
                B_Accuracy.append(b_accuracy)
                ROC_AUC.append(roc_auc)
                F1.append(f1)
                TIME.append(fit_time)
                print_message(f" {name} completed in {fit_time:.1f}s | Acc: {accuracy:.3f} | F1: {f1:.3f}")

                if self.custom_metric is not None:
                    custom_metric = self.custom_metric(y_test, y_pred)
                    CUSTOM_METRIC.append(custom_metric)
                if self.predictions:
                    predictions[name] = y_pred
            except Exception as exception:
                # Record the failure and keep sweeping the remaining models.
                failed_models.append(name)
                if self.ignore_warnings is False:
                    print_message(f'\n{name} model failed to execute')
                    print_message(exception)

        if self.custom_metric is None:
            scores = pd.DataFrame(
                {
                    "Model": names,
                    "Accuracy": Accuracy,
                    "Balanced Accuracy": B_Accuracy,
                    "ROC AUC": ROC_AUC,
                    "F1 Score": F1,
                    "Time Taken": TIME,
                }
            )
        else:
            scores = pd.DataFrame(
                {
                    "Model": names,
                    "Accuracy": Accuracy,
                    "Balanced Accuracy": B_Accuracy,
                    "ROC AUC": ROC_AUC,
                    "F1 Score": F1,
                    "Custom Metric": CUSTOM_METRIC,
                    "Time Taken": TIME,
                }
            )
        scores = scores.sort_values(by="Balanced Accuracy", ascending=False).set_index(
            "Model"
        )

        # Print summary
        total_time = time.time() - total_start
        n_success = len(names)
        n_failed = len(failed_models)
        best_model = scores.index[0] if len(scores) > 0 else "N/A"
        best_score = scores["Balanced Accuracy"].iloc[0] if len(scores) > 0 else 0
        if self.verbose > 0:
            # Full table + failed models
            summary = f"\nLazyClassifier Results ({n_success} succeeded, {n_failed} failed, {total_time:.1f}s)\n"
            summary += scores.to_string()
            if failed_models:
                summary += f"\n\nFailed: {', '.join(failed_models)}"
            print_message(summary)
        else:
            # 1-line summary
            print_message(f"Completed {n_success + n_failed} classifiers in {total_time:.1f}s | {n_success} succeeded, {n_failed} failed | Best: {best_model} ({best_score:.2f})")

        if self.predictions:
            predictions_df = pd.DataFrame.from_dict(predictions)
            return scores, predictions_df
        return scores

    def provide_models(self, X_train, X_test, y_train, y_test):
        """
        This function returns all the model objects trained in fit function.
        If fit is not called already, then we call fit and then return the
        models.

        Parameters
        ----------
        X_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.
        X_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.
        y_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.
        y_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        Returns
        -------
        models: dict-object,
            Returns a dictionary with each model pipeline as value
            with key as name of models.
        """
        if len(self.models.keys()) == 0:
            self.fit(X_train, X_test, y_train, y_test)
        return self.models


def adjusted_rsquared(r2, n, p):
    """Return adjusted R² for n samples and p features.

    NOTE: undefined (ZeroDivisionError) when n == p + 1; callers wrap this in
    a try/except that marks the model as failed.
    """
    return 1 - (1 - r2) * ((n - 1) / (n - p - 1))


# Helper class for performing regression
class LazyRegressor:
    """
    This module helps in fitting regression models that are available in
    Scikit-learn

    Parameters
    ----------
    verbose : int, optional (default=0)
        For the liblinear and lbfgs solvers set verbose to any positive
        number for verbosity.
    ignore_warnings : bool, optional (default=True)
        When set to True, the warning related to algorithms that are not able
        to run are ignored.
    custom_metric : function, optional (default=None)
        When function is provided, models are evaluated based on the custom
        evaluation metric provided.
    predictions : bool, optional (default=False)
        When set to True, the predictions of all the models are returned as a
        dataframe.
    random_state : int, optional (default=None)
        Seed forwarded to every estimator that accepts ``random_state``.
        Falls back to the project-wide seed when not given.
    regressors : list, optional (default="all")
        When a list of regressor classes is provided, trains only those.
    """

    def __init__(
        self,
        verbose=0,
        ignore_warnings=True,
        custom_metric=None,
        predictions=False,
        random_state=None,
        regressors="all",
    ):
        self.verbose = verbose
        self.ignore_warnings = ignore_warnings
        self.custom_metric = custom_metric
        self.predictions = predictions
        self.models = {}
        # NOTE: use an explicit None check so random_state=0 is honored
        # (a bare `or` would silently replace 0 with the global seed).
        self.random_state = (
            random_state if random_state is not None else get_global_seed()
        )
        self.regressors = regressors

    def fit(self, X_train, X_test, y_train, y_test):
        """Fit Regression algorithms to X_train and y_train, predict and score
        on X_test, y_test.

        Parameters
        ----------
        X_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.
        X_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.
        y_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.
        y_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        Returns
        -------
        scores : Pandas DataFrame
            Returns metrics of all the models in a Pandas DataFrame.
        predictions : Pandas DataFrame
            Returns predictions of all the models in a Pandas DataFrame.
        """
        R2 = []
        ADJR2 = []
        RMSE = []
        # WIN = []
        names = []
        TIME = []
        predictions = {}

        if self.custom_metric is not None:
            CUSTOM_METRIC = []

        if isinstance(X_train, np.ndarray):
            X_train = pd.DataFrame(X_train)
            X_test = pd.DataFrame(X_test)

        numeric_features = X_train.select_dtypes(include=[np.number]).columns
        categorical_features = X_train.select_dtypes(include=["object"]).columns

        categorical_low, categorical_high = get_card_split(
            X_train, categorical_features
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ("numeric", numeric_transformer, numeric_features),
                ("categorical_low", categorical_transformer_low, categorical_low),
                ("categorical_high", categorical_transformer_high, categorical_high),
            ]
        )

        # Precompute preprocessing once for all models (major optimization for large datasets)
        print_message("Preprocessing data once for all models...")
        preprocess_start = time.time()
        X_train_transformed = preprocessor.fit_transform(X_train)
        X_test_transformed = preprocessor.transform(X_test)
        print_message(f"Preprocessing completed in {time.time() - preprocess_start:.1f}s")

        if self.regressors == "all":
            self.regressors = REGRESSORS
        else:
            try:
                temp_list = []
                for regressor in self.regressors:
                    full_name = (regressor.__name__, regressor)
                    temp_list.append(full_name)
                self.regressors = temp_list
            except Exception as exception:
                print_message(exception)
                print_message("Invalid Regressor(s)")

        # Track failed models
        failed_models = []
        total_start = time.time()

        for name, model in tqdm(self.regressors, desc="Training regressors"):
            print_message(f"Starting {name}...")
            start = time.time()
            try:
                # Probe the default constructor once to discover supported kwargs.
                probe_params = model().get_params()
                model_kwargs = {}
                if "random_state" in probe_params:
                    model_kwargs["random_state"] = self.random_state
                # Enable parallelization for models that support it
                if "n_jobs" in probe_params:
                    model_kwargs["n_jobs"] = -1
                # Enable verbose for boosting models to show iteration progress
                if name in ("XGBRegressor", "LGBMRegressor"):
                    model_kwargs["verbose"] = 1

                # Train directly on preprocessed data (no Pipeline needed)
                reg = model(**model_kwargs)
                reg.fit(X_train_transformed, y_train)
                self.models[name] = reg
                y_pred = reg.predict(X_test_transformed)

                r_squared = r2_score(y_test, y_pred)
                adj_rsquared = adjusted_rsquared(
                    r_squared, X_test.shape[0], X_test.shape[1]
                )
                rmse = np.sqrt(mean_squared_error(y_test, y_pred))

                fit_time = time.time() - start
                names.append(name)
                R2.append(r_squared)
                ADJR2.append(adj_rsquared)
                RMSE.append(rmse)
                TIME.append(fit_time)
                print_message(f" {name} completed in {fit_time:.1f}s | R²: {r_squared:.3f} | RMSE: {rmse:.3f}")

                if self.custom_metric is not None:
                    custom_metric = self.custom_metric(y_test, y_pred)
                    CUSTOM_METRIC.append(custom_metric)
                if self.predictions:
                    predictions[name] = y_pred
            except Exception as exception:
                # Record the failure and keep sweeping the remaining models.
                failed_models.append(name)
                if self.ignore_warnings is False:
                    print_message(f'\n{name} model failed to execute')
                    print_message(exception)

        scores = {
            "Model": names,
            "Adjusted R-Squared": ADJR2,
            "R-Squared": R2,
            "RMSE": RMSE,
            "Time Taken": TIME,
        }

        if self.custom_metric is not None:
            scores[self.custom_metric.__name__] = CUSTOM_METRIC

        scores = pd.DataFrame(scores)
        scores = scores.sort_values(by="Adjusted R-Squared", ascending=False).set_index(
            "Model"
        )

        # Print summary
        total_time = time.time() - total_start
        n_success = len(names)
        n_failed = len(failed_models)
        best_model = scores.index[0] if len(scores) > 0 else "N/A"
        best_score = scores["Adjusted R-Squared"].iloc[0] if len(scores) > 0 else 0
        if self.verbose > 0:
            # Full table + failed models
            summary = f"\nLazyRegressor Results ({n_success} succeeded, {n_failed} failed, {total_time:.1f}s)\n"
            summary += scores.to_string()
            if failed_models:
                summary += f"\n\nFailed: {', '.join(failed_models)}"
            print_message(summary)
        else:
            # 1-line summary
            print_message(f"Completed {n_success + n_failed} regressors in {total_time:.1f}s | {n_success} succeeded, {n_failed} failed | Best: {best_model} ({best_score:.2f})")

        if self.predictions:
            predictions_df = pd.DataFrame.from_dict(predictions)
            return scores, predictions_df
        return scores

    def provide_models(self, X_train, X_test, y_train, y_test):
        """
        This function returns all the model objects trained in fit function.
        If fit is not called already, then we call fit and then return the
        models.

        Parameters
        ----------
        X_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.
        X_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.
        y_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.
        y_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        Returns
        -------
        models: dict-object,
            Returns a dictionary with each model pipeline as value
            with key as name of models.
        """
        if len(self.models.keys()) == 0:
            self.fit(X_train, X_test, y_train, y_test)
        return self.models