### Modified version of lazy predict from https://github.com/shankarpandala/lazypredict
import time
import warnings

import numpy as np
import pandas as pd
import xgboost
import lightgbm
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.utils import all_estimators
from sklearn.base import RegressorMixin
from sklearn.base import ClassifierMixin
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    roc_auc_score,
    f1_score,
    r2_score,
    mean_squared_error,
)

try:
    from utils import print_message
    from seed_utils import get_global_seed
except ImportError:
    from ..utils import print_message
    from ..seed_utils import get_global_seed

warnings.filterwarnings("ignore")
pd.set_option("display.precision", 2)
pd.set_option("display.float_format", lambda x: "%.2f" % x)

# Estimator names excluded from the classifier sweep. Reasons: they require
# extra constructor arguments (meta-estimators), are multi-output wrappers,
# or are too slow / memory-hungry for the large datasets this module targets.
removed_classifiers = [
    "ClassifierChain",
    "ComplementNB",
    "GradientBoostingClassifier",
    "GaussianProcessClassifier",
    "HistGradientBoostingClassifier",
    "MLPClassifier",
    "LogisticRegressionCV",
    "MultiOutputClassifier",
    "MultinomialNB",
    "OneVsOneClassifier",
    "OneVsRestClassifier",
    "OutputCodeClassifier",
    "RadiusNeighborsClassifier",
    "VotingClassifier",
    "CalibratedClassifierCV",
    "RidgeClassifierCV",
    "LinearSVC",
    "Perceptron",
    "SGDClassifier",
    # O(n²) memory models - too slow for large datasets
    "LabelPropagation",
    "LabelSpreading",
    "SVC",
    "NuSVC",
    # Sequential ensemble models - slow for large datasets
    "AdaBoostClassifier",
    "BaggingClassifier",
    # O(n×m) prediction time - slow for large test sets
    "KNeighborsClassifier",
    # Unbounded tree depth - very slow on high-dim data
    "DecisionTreeClassifier",
    "ExtraTreeClassifier",
    "ExtraTreesClassifier",
    # Fails on negative values after StandardScaler
    "CategoricalNB",
    # O(d²) or O(d³) - slow on high-dimensional data (4608 features)
    "LinearDiscriminantAnalysis",
    "QuadraticDiscriminantAnalysis",
    # Requires estimator argument
    "FixedThresholdClassifier",
    "TunedThresholdClassifierCV",
]

# Estimator names excluded from the regressor sweep, for analogous reasons
# (meta-estimators, multi-task variants, CV wrappers, or poor scaling).
removed_regressors = [
    "TheilSenRegressor",
    "ARDRegression",
    "CCA",
    "IsotonicRegression",
    "StackingRegressor",
    "MultiOutputRegressor",
    "MultiTaskElasticNet",
    "MultiTaskElasticNetCV",
    "MultiTaskLasso",
    "MultiTaskLassoCV",
    "PLSCanonical",
    "PLSRegression",
    "RadiusNeighborsRegressor",
    "RegressorChain",
    "VotingRegressor",
    "OrthogonalMatchingPursuitCV",
    "LassoLars",
    "LarsCV",
    "LassoCV",
    "RidgeCV",
    "LassoLarsCV",
    "ElasticNetCV",
    "LinearSVR",
    "LassoLarsIC",
    # Sequential ensemble models - slow for large datasets
    "AdaBoostRegressor",
    "BaggingRegressor",
    # O(n×m) prediction time - slow for large test sets
    "KNeighborsRegressor",
    # Unbounded tree depth - very slow on high-dim data
    "DecisionTreeRegressor",
    "ExtraTreeRegressor",
    "ExtraTreesRegressor",
]

# Tuple of (name, class) for every sklearn classifier not on the exclusion list.
CLASSIFIERS = [
    est
    for est in all_estimators()
    if (issubclass(est[1], ClassifierMixin) and (est[0] not in removed_classifiers))
]

CLASSIFIER_DICT = {model[0]: model[1] for model in CLASSIFIERS}

"""
CLASSIFIERS = [
'LogisticRegression',
'SVC',
'PassiveAggressiveClassifier',
'LabelPropagation',
'LabelSpreading',
'RandomForestClassifier',
'GradientBoostingClassifier',
'QuadraticDiscriminantAnalysis',
'HistGradientBoostingClassifier',
'RidgeClassifier',
'AdaBoostClassifier',
'ExtraTreesClassifier',
'KNeighborsClassifier',
'BaggingClassifier',
'BernoulliNB',
'LinearDiscriminantAnalysis',
'GaussianNB',
'NuSVC',
'DecisionTreeClassifier',
'NearestCentroid',
'ExtraTreeClassifier',
'CheckingClassifier',
'DummyClassifier'
]
"""

REGRESSORS = [
    est
    for est in all_estimators()
    if (issubclass(est[1], RegressorMixin) and (est[0] not in removed_regressors))
]

REGRESSOR_DICT = {model[0]: model[1] for model in REGRESSORS}

ALL_MODELS = CLASSIFIERS + REGRESSORS
ALL_MODEL_DICT = {model[0]: model[1] for model in ALL_MODELS}

"""
REGRESSORS = [
'ExtraTreesRegressor',
'Lasso',
'PassiveAggressiveRegressor',
'SGDRegressor',
'Ridge',
'BayesianRidge',
'TransformedTargetRegressor',
'LinearRegression',
'Lars',
'HuberRegressor',
'RandomForestRegressor',
'AdaBoostRegressor',
'LGBMRegressor',
'HistGradientBoostingRegressor',
'PoissonRegressor',
'ElasticNet',
'KNeighborsRegressor',
'OrthogonalMatchingPursuit',
'BaggingRegressor',
'GradientBoostingRegressor',
'TweedieRegressor',
'XGBRegressor',
'GammaRegressor',
'RANSACRegressor',
'ExtraTreeRegressor',
'NuSVR',
'SVR',
'DummyRegressor',
'DecisionTreeRegressor',
'GaussianProcessRegressor',
'MLPRegressor',
'KernelRidge'
]
"""

REGRESSORS.append(("XGBRegressor", xgboost.XGBRegressor))
REGRESSORS.append(("LGBMRegressor", lightgbm.LGBMRegressor))
# REGRESSORS.append(('CatBoostRegressor',catboost.CatBoostRegressor))

CLASSIFIERS.append(("XGBClassifier", xgboost.XGBClassifier))
CLASSIFIERS.append(("LGBMClassifier", lightgbm.LGBMClassifier))
# CLASSIFIERS.append(('CatBoostClassifier',catboost.CatBoostClassifier))

# Update dicts with XGB and LGBM
CLASSIFIER_DICT["XGBClassifier"] = xgboost.XGBClassifier
CLASSIFIER_DICT["LGBMClassifier"] = lightgbm.LGBMClassifier
REGRESSOR_DICT["XGBRegressor"] = xgboost.XGBRegressor
REGRESSOR_DICT["LGBMRegressor"] = lightgbm.LGBMRegressor
ALL_MODEL_DICT["XGBClassifier"] = xgboost.XGBClassifier
ALL_MODEL_DICT["LGBMClassifier"] = lightgbm.LGBMClassifier
ALL_MODEL_DICT["XGBRegressor"] = xgboost.XGBRegressor
ALL_MODEL_DICT["LGBMRegressor"] = lightgbm.LGBMRegressor

# Mean-impute then standardize numeric columns.
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
)

# Low-cardinality categoricals: constant-impute then one-hot encode.
categorical_transformer_low = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("encoding", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# High-cardinality categoricals: constant-impute then ordinal encode.
categorical_transformer_high = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        # 'OrdinalEncoder' raises a ValueError when it encounters an unknown value.
        # Check https://github.com/scikit-learn/scikit-learn/pull/13423
        ("encoding", OrdinalEncoder()),
    ]
)


# Helper function
def get_card_split(df, cols, n=11):
    """
    Splits categorical columns into 2 lists based on cardinality (i.e # of unique values)

    Parameters
    ----------
    df : Pandas DataFrame
        DataFrame from which the cardinality of the columns is calculated.
    cols : list-like
        Categorical columns to list
    n : int, optional (default=11)
        The value of 'n' will be used to split columns.

    Returns
    -------
    card_low : list-like
        Columns with cardinality <= n
    card_high : list-like
        Columns with cardinality > n
    """
    cond = df[cols].nunique() > n
    card_high = cols[cond]
    card_low = cols[~cond]
    return card_low, card_high


# Helper class for performing classification
class LazyClassifier:
    """
    This module helps in fitting to all the classification algorithms that are
    available in Scikit-learn

    Parameters
    ----------
    verbose : int, optional (default=0)
        For the liblinear and lbfgs solvers set verbose to any positive
        number for verbosity.
    ignore_warnings : bool, optional (default=True)
        When set to True, the warning related to algorithms that are not able
        to run are ignored.
    custom_metric : function, optional (default=None)
        When function is provided, models are evaluated based on the custom
        evaluation metric provided.
    predictions : bool, optional (default=False)
        When set to True, the predictions of all the models are returned as a
        dataframe.
    random_state : int, optional (default=None)
        Seed forwarded to every estimator that accepts ``random_state``.
        Falls back to the project-wide seed when not given.
    classifiers : list, optional (default="all")
        When a list of classifier classes is provided, trains only those.
    """

    def __init__(
        self,
        verbose=0,
        ignore_warnings=True,
        custom_metric=None,
        predictions=False,
        random_state=None,
        classifiers="all",
    ):
        self.verbose = verbose
        self.ignore_warnings = ignore_warnings
        self.custom_metric = custom_metric
        self.predictions = predictions
        self.models = {}
        # NOTE: use an explicit None check so random_state=0 is honored
        # (a bare `or` would silently replace 0 with the global seed).
        self.random_state = (
            random_state if random_state is not None else get_global_seed()
        )
        self.classifiers = classifiers

    def fit(self, X_train, X_test, y_train, y_test):
        """Fit Classification algorithms to X_train and y_train, predict and
        score on X_test, y_test.

        Parameters
        ----------
        X_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.
        X_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.
        y_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.
        y_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        Returns
        -------
        scores : Pandas DataFrame
            Returns metrics of all the models in a Pandas DataFrame.
        predictions : Pandas DataFrame
            Returns predictions of all the models in a Pandas DataFrame.
        """
        Accuracy = []
        B_Accuracy = []
        ROC_AUC = []
        F1 = []
        names = []
        TIME = []
        predictions = {}

        if self.custom_metric is not None:
            CUSTOM_METRIC = []

        if isinstance(X_train, np.ndarray):
            X_train = pd.DataFrame(X_train)
            X_test = pd.DataFrame(X_test)

        numeric_features = X_train.select_dtypes(include=[np.number]).columns
        categorical_features = X_train.select_dtypes(include=["object"]).columns

        categorical_low, categorical_high = get_card_split(
            X_train, categorical_features
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ("numeric", numeric_transformer, numeric_features),
                ("categorical_low", categorical_transformer_low, categorical_low),
                ("categorical_high", categorical_transformer_high, categorical_high),
            ]
        )

        # Precompute preprocessing once for all models (major optimization for large datasets)
        print_message("Preprocessing data once for all models...")
        preprocess_start = time.time()
        X_train_transformed = preprocessor.fit_transform(X_train)
        X_test_transformed = preprocessor.transform(X_test)
        print_message(f"Preprocessing completed in {time.time() - preprocess_start:.1f}s")

        if self.classifiers == "all":
            self.classifiers = CLASSIFIERS
        else:
            try:
                temp_list = []
                for classifier in self.classifiers:
                    full_name = (classifier.__name__, classifier)
                    temp_list.append(full_name)
                self.classifiers = temp_list
            except Exception as exception:
                print_message(exception)
                print_message("Invalid Classifier(s)")

        # Track failed models
        failed_models = []
        total_start = time.time()

        for name, model in tqdm(self.classifiers, desc="Training classifiers"):
            print_message(f"Starting {name}...")
            start = time.time()
            try:
                # Probe the default constructor once to discover supported kwargs.
                probe_params = model().get_params()
                model_kwargs = {}
                if "random_state" in probe_params:
                    model_kwargs["random_state"] = self.random_state
                # Enable parallelization for models that support it
                if "n_jobs" in probe_params:
                    model_kwargs["n_jobs"] = -1
                # Enable verbose for boosting models to show iteration progress
                if name in ("XGBClassifier", "LGBMClassifier"):
                    model_kwargs["verbose"] = 1

                # Train directly on preprocessed data (no Pipeline needed)
                clf = model(**model_kwargs)
                clf.fit(X_train_transformed, y_train)
                self.models[name] = clf
                y_pred = clf.predict(X_test_transformed)

                accuracy = accuracy_score(y_test, y_pred, normalize=True)
                b_accuracy = balanced_accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred, average="weighted")
                try:
                    # NOTE(review): scored on hard labels, not probabilities, so
                    # this only works for binary targets — multiclass raises and
                    # the score is recorded as None.
                    roc_auc = roc_auc_score(y_test, y_pred)
                except Exception as exception:
                    roc_auc = None
                    if self.ignore_warnings is False:
                        print_message("ROC AUC couldn't be calculated for " + name)
                        print_message(exception)

                fit_time = time.time() - start
                names.append(name)
                Accuracy.append(accuracy)
                B_Accuracy.append(b_accuracy)
                ROC_AUC.append(roc_auc)
                F1.append(f1)
                TIME.append(fit_time)
                print_message(f" {name} completed in {fit_time:.1f}s | Acc: {accuracy:.3f} | F1: {f1:.3f}")

                if self.custom_metric is not None:
                    custom_metric = self.custom_metric(y_test, y_pred)
                    CUSTOM_METRIC.append(custom_metric)
                if self.predictions:
                    predictions[name] = y_pred
            except Exception as exception:
                # Record the failure and keep sweeping the remaining models.
                failed_models.append(name)
                if self.ignore_warnings is False:
                    print_message(f'\n{name} model failed to execute')
                    print_message(exception)

        if self.custom_metric is None:
            scores = pd.DataFrame(
                {
                    "Model": names,
                    "Accuracy": Accuracy,
                    "Balanced Accuracy": B_Accuracy,
                    "ROC AUC": ROC_AUC,
                    "F1 Score": F1,
                    "Time Taken": TIME,
                }
            )
        else:
            scores = pd.DataFrame(
                {
                    "Model": names,
                    "Accuracy": Accuracy,
                    "Balanced Accuracy": B_Accuracy,
                    "ROC AUC": ROC_AUC,
                    "F1 Score": F1,
                    "Custom Metric": CUSTOM_METRIC,
                    "Time Taken": TIME,
                }
            )
        scores = scores.sort_values(by="Balanced Accuracy", ascending=False).set_index(
            "Model"
        )

        # Print summary
        total_time = time.time() - total_start
        n_success = len(names)
        n_failed = len(failed_models)
        best_model = scores.index[0] if len(scores) > 0 else "N/A"
        best_score = scores["Balanced Accuracy"].iloc[0] if len(scores) > 0 else 0
        if self.verbose > 0:
            # Full table + failed models
            summary = f"\nLazyClassifier Results ({n_success} succeeded, {n_failed} failed, {total_time:.1f}s)\n"
            summary += scores.to_string()
            if failed_models:
                summary += f"\n\nFailed: {', '.join(failed_models)}"
            print_message(summary)
        else:
            # 1-line summary
            print_message(f"Completed {n_success + n_failed} classifiers in {total_time:.1f}s | {n_success} succeeded, {n_failed} failed | Best: {best_model} ({best_score:.2f})")

        if self.predictions:
            predictions_df = pd.DataFrame.from_dict(predictions)
            return scores, predictions_df
        return scores

    def provide_models(self, X_train, X_test, y_train, y_test):
        """
        This function returns all the model objects trained in fit function.
        If fit is not called already, then we call fit and then return the
        models.

        Parameters
        ----------
        X_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.
        X_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.
        y_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.
        y_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        Returns
        -------
        models: dict-object,
            Returns a dictionary with each model pipeline as value
            with key as name of models.
        """
        if len(self.models.keys()) == 0:
            self.fit(X_train, X_test, y_train, y_test)
        return self.models


def adjusted_rsquared(r2, n, p):
    """Return adjusted R² for n samples and p features.

    NOTE: undefined (ZeroDivisionError) when n == p + 1; callers wrap this in
    a try/except that marks the model as failed.
    """
    return 1 - (1 - r2) * ((n - 1) / (n - p - 1))


# Helper class for performing regression
class LazyRegressor:
    """
    This module helps in fitting regression models that are available in
    Scikit-learn

    Parameters
    ----------
    verbose : int, optional (default=0)
        For the liblinear and lbfgs solvers set verbose to any positive
        number for verbosity.
    ignore_warnings : bool, optional (default=True)
        When set to True, the warning related to algorithms that are not able
        to run are ignored.
    custom_metric : function, optional (default=None)
        When function is provided, models are evaluated based on the custom
        evaluation metric provided.
    predictions : bool, optional (default=False)
        When set to True, the predictions of all the models are returned as a
        dataframe.
    random_state : int, optional (default=None)
        Seed forwarded to every estimator that accepts ``random_state``.
        Falls back to the project-wide seed when not given.
    regressors : list, optional (default="all")
        When a list of regressor classes is provided, trains only those.
    """

    def __init__(
        self,
        verbose=0,
        ignore_warnings=True,
        custom_metric=None,
        predictions=False,
        random_state=None,
        regressors="all",
    ):
        self.verbose = verbose
        self.ignore_warnings = ignore_warnings
        self.custom_metric = custom_metric
        self.predictions = predictions
        self.models = {}
        # NOTE: use an explicit None check so random_state=0 is honored
        # (a bare `or` would silently replace 0 with the global seed).
        self.random_state = (
            random_state if random_state is not None else get_global_seed()
        )
        self.regressors = regressors

    def fit(self, X_train, X_test, y_train, y_test):
        """Fit Regression algorithms to X_train and y_train, predict and score
        on X_test, y_test.

        Parameters
        ----------
        X_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.
        X_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.
        y_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.
        y_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        Returns
        -------
        scores : Pandas DataFrame
            Returns metrics of all the models in a Pandas DataFrame.
        predictions : Pandas DataFrame
            Returns predictions of all the models in a Pandas DataFrame.
        """
        R2 = []
        ADJR2 = []
        RMSE = []
        # WIN = []
        names = []
        TIME = []
        predictions = {}

        if self.custom_metric is not None:
            CUSTOM_METRIC = []

        if isinstance(X_train, np.ndarray):
            X_train = pd.DataFrame(X_train)
            X_test = pd.DataFrame(X_test)

        numeric_features = X_train.select_dtypes(include=[np.number]).columns
        categorical_features = X_train.select_dtypes(include=["object"]).columns

        categorical_low, categorical_high = get_card_split(
            X_train, categorical_features
        )

        preprocessor = ColumnTransformer(
            transformers=[
                ("numeric", numeric_transformer, numeric_features),
                ("categorical_low", categorical_transformer_low, categorical_low),
                ("categorical_high", categorical_transformer_high, categorical_high),
            ]
        )

        # Precompute preprocessing once for all models (major optimization for large datasets)
        print_message("Preprocessing data once for all models...")
        preprocess_start = time.time()
        X_train_transformed = preprocessor.fit_transform(X_train)
        X_test_transformed = preprocessor.transform(X_test)
        print_message(f"Preprocessing completed in {time.time() - preprocess_start:.1f}s")

        if self.regressors == "all":
            self.regressors = REGRESSORS
        else:
            try:
                temp_list = []
                for regressor in self.regressors:
                    full_name = (regressor.__name__, regressor)
                    temp_list.append(full_name)
                self.regressors = temp_list
            except Exception as exception:
                print_message(exception)
                print_message("Invalid Regressor(s)")

        # Track failed models
        failed_models = []
        total_start = time.time()

        for name, model in tqdm(self.regressors, desc="Training regressors"):
            print_message(f"Starting {name}...")
            start = time.time()
            try:
                # Probe the default constructor once to discover supported kwargs.
                probe_params = model().get_params()
                model_kwargs = {}
                if "random_state" in probe_params:
                    model_kwargs["random_state"] = self.random_state
                # Enable parallelization for models that support it
                if "n_jobs" in probe_params:
                    model_kwargs["n_jobs"] = -1
                # Enable verbose for boosting models to show iteration progress
                if name in ("XGBRegressor", "LGBMRegressor"):
                    model_kwargs["verbose"] = 1

                # Train directly on preprocessed data (no Pipeline needed)
                reg = model(**model_kwargs)
                reg.fit(X_train_transformed, y_train)
                self.models[name] = reg
                y_pred = reg.predict(X_test_transformed)

                r_squared = r2_score(y_test, y_pred)
                adj_rsquared = adjusted_rsquared(
                    r_squared, X_test.shape[0], X_test.shape[1]
                )
                rmse = np.sqrt(mean_squared_error(y_test, y_pred))

                fit_time = time.time() - start
                names.append(name)
                R2.append(r_squared)
                ADJR2.append(adj_rsquared)
                RMSE.append(rmse)
                TIME.append(fit_time)
                print_message(f" {name} completed in {fit_time:.1f}s | R²: {r_squared:.3f} | RMSE: {rmse:.3f}")

                if self.custom_metric is not None:
                    custom_metric = self.custom_metric(y_test, y_pred)
                    CUSTOM_METRIC.append(custom_metric)
                if self.predictions:
                    predictions[name] = y_pred
            except Exception as exception:
                # Record the failure and keep sweeping the remaining models.
                failed_models.append(name)
                if self.ignore_warnings is False:
                    print_message(f'\n{name} model failed to execute')
                    print_message(exception)

        scores = {
            "Model": names,
            "Adjusted R-Squared": ADJR2,
            "R-Squared": R2,
            "RMSE": RMSE,
            "Time Taken": TIME,
        }

        if self.custom_metric is not None:
            scores[self.custom_metric.__name__] = CUSTOM_METRIC

        scores = pd.DataFrame(scores)
        scores = scores.sort_values(by="Adjusted R-Squared", ascending=False).set_index(
            "Model"
        )

        # Print summary
        total_time = time.time() - total_start
        n_success = len(names)
        n_failed = len(failed_models)
        best_model = scores.index[0] if len(scores) > 0 else "N/A"
        best_score = scores["Adjusted R-Squared"].iloc[0] if len(scores) > 0 else 0
        if self.verbose > 0:
            # Full table + failed models
            summary = f"\nLazyRegressor Results ({n_success} succeeded, {n_failed} failed, {total_time:.1f}s)\n"
            summary += scores.to_string()
            if failed_models:
                summary += f"\n\nFailed: {', '.join(failed_models)}"
            print_message(summary)
        else:
            # 1-line summary
            print_message(f"Completed {n_success + n_failed} regressors in {total_time:.1f}s | {n_success} succeeded, {n_failed} failed | Best: {best_model} ({best_score:.2f})")

        if self.predictions:
            predictions_df = pd.DataFrame.from_dict(predictions)
            return scores, predictions_df
        return scores

    def provide_models(self, X_train, X_test, y_train, y_test):
        """
        This function returns all the model objects trained in fit function.
        If fit is not called already, then we call fit and then return the
        models.

        Parameters
        ----------
        X_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.
        X_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.
        y_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.
        y_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        Returns
        -------
        models: dict-object,
            Returns a dictionary with each model pipeline as value
            with key as name of models.
        """
        if len(self.models.keys()) == 0:
            self.fit(X_train, X_test, y_train, y_test)
        return self.models