# +++
# IMPORT REQUIRED LIBRARIES:
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Data management
import numpy as np
import pandas as pd

# For File path operations
from pathlib import Path

# Machine Learning
# import sklearn
# from sklearn.datasets import fetch_openml  # contains a lot of useful datasets
# Importing preprocessing modules from sklearn
from sklearn.preprocessing import StandardScaler, OneHotEncoder   # StandardScaler: to standardize numerical values [-1, 1]; OneHotEncoder: to convert categorical values into numeric as needed for ML processing
from sklearn.impute import SimpleImputer                          # To handle missing values (if any)
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
# Train & Test partitioning for model estimation
from sklearn.model_selection import train_test_split   # train_test_split: for splitting datasets
# Regresion modeling
from sklearn.linear_model import LinearRegression     # For mumerical target variable prediction (charges)
# Metric scores for Regression use cases (Continuous target variable)
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score   # For regresion model evaluation

# Model Serialization 
import joblib   # for saving (or serializing) the model.   You can also use pickle.
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Train python script:
train_py_script = "train_ic.py"

# 1) DATA ACQUISITION AND PREPARATION:
print("\n1) DATA ACQUISITION AND PREPARATION:\n")

# Obtain current directory and data file path
current_directory = Path.cwd()
print(f"current_directory: {current_directory}\n")
# Use joinpath to add subdirectories and a filename
data_file_path = current_directory.joinpath("data", "insurance.csv")
print(f"data_file_path: {data_file_path}\n")

# Read data: Read the dataset from the CSV file into a DataFrame
data_df = pd.read_csv(filepath_or_buffer=data_file_path)

print(f"DataFrame:\n\tRows:    {data_df.shape[0]}\n\tColumns: {data_df.shape[1]}\n")
print(f"DataFrame data info:\n")
print(f"{data_df.info()}\n")

# Drop the columns which are not required for modelling
# Remove the "index" variable (column) from the DataFrame
# data_df.drop(columns='index', inplace=True)
data_df = data_df.drop(columns=['index'])

# Reset the indeces (re-indexing), without re-adding the index as a column in the new DataFrame
data_df.reset_index(inplace=True, drop=True)

# Split data in to numerical and categorical features used for modeling
# Define numerical and categorical feature columns
# Store the numerical features (variables) in a new varable called numerical_features
numerical_features = data_df.select_dtypes(include=['number']).columns.to_list()   # could have been done this, but this would include the "charges" column, which we don't want to include as a predictor (X) variable, because it is the response/target (y) variable
numerical_features.remove('charges')   # remove 'charges' column from numerical_features list to be used for ML processing
print(f"Numerical variables (predictors / independent variables): {numerical_features}\n")
print(f"DataFrame showing numerical variables only:\n{data_df[numerical_features]}\n")
# print(f"DataFrame showing numerical variables only:\n{data_df[numerical_features].head(5)}\n")

# Store the categorical features (variables) in a new varable called categorical_features
categorical_features = data_df.select_dtypes(include=['object', 'string', 'category']).columns.to_list()
print(f"Categorical variables (predictors / independent variables): {categorical_features}\n")
print(f"DataFrame showing categorical variables only:\n{data_df[categorical_features]}\n")
# print(f"DataFrame showing categorical variables only:\n{data_df[categorical_features].head(5)}\n")

# Store the boolean features (variables) in a new varable called boolean_features
boolean_features = data_df.select_dtypes(include=['bool']).columns.to_list()
print(f"Boolean variables (predictors / independent variables): {boolean_features}\n")

# target (dependent) variable to predict
target = ['charges']
print(f"Target (response / dependent) variable: {target}\n")
print(f"Target (dependent) variable:\nMean = {data_df[target].mean()}\nMedian = {data_df[target].median()}\nStandard_deviation = {data_df[target].std()}\nMinumum = {data_df[target].min()}\nMaximum = {data_df[target].max()}\n")


# 2) MODEL ESTIMATION:
print("\n2) MODEL ESTIMATION:\n")

print("Creating data subsets")
# Separate features (predictors or independent variables) {X} and target (response or dependent) variable {y}
X = data_df[numerical_features + categorical_features]
y = data_df[target]
print(f"Separation of:\n[X]: features (predictors or independent variables):\n{X}\n\n[y]: target (response or dependent) variable:\n{y}\n")

# Split the dataset into training and testing sets
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=None    
)
print(f"Data split (train & test)\nVerifying that Xtrain is 80% and Xtest is 20% of the total number of observations in X\nX.shape : {X.shape}\nXtrain.shape : {Xtrain.shape}\nXtest.shape : {Xtest.shape}\n")  # to verify that Xtrain is 80% and Xtest is 20% of the total number of observations in X

# Creating a pipeline for numerical feature processing, including imputation of missing values with median and standard scaling.
# This code creates a pipeline named numerical_pipeline to process numerical features. 
# It consists of two steps:
# 'imputer': Imputes missing values using the median strategy with SimpleImputer.
# 'scaler': Standardizes the numerical features using StandardScaler.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),  # in case of missing values replace NaN with the Median value of that column.
    ('scaler', StandardScaler(with_mean=True, with_std=True))              # apply StandarScaller on the numerical features
])

# Creating a pipeline for categorical feature processing, including imputation of missing values with the most frequent value and one-hot encoding with handling of unknown categories.
# This code constructs a pipeline named categorical_pipeline for processing categorical features, involving imputation of missing values using the most frequent value and one-hot encoding with handling of unknown categories.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=pd.NA, strategy='most_frequent')),   # missing_values can be set to either `np.nan` or `pd.NA`, and are replaced by the Mode of that column.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))                            # apply OneHotEncoder on the categorical features
])

# Data pre-processing step
# Note: make_column_transformer only works with pandas DataFrame
# Creating a column transformer named preprocessor to apply specific pipelines to numerical and categorical features separately.
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features),
    (categorical_pipeline, categorical_features),
    n_jobs=-1,                                      # use all available processors to run jobs in parallel
    verbose=False,
    verbose_feature_names_out=True
)

# Regression step
# Ordinary least squares Linear Regression.
# LinearRegression fits a linear model with coefficients w = (w1, ..., wp) to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation.
#   fit_intercept : bool, default=True
#                   Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).
#   n_jobs=-1 hyperparameter means: use all available processors to run all the jobs in parallel to speed up execution
model_linear_regression = LinearRegression(fit_intercept=True, n_jobs=-1) 

print("Estimating Model Pipeline")
# Creating a pipeline combining preprocessing steps (imputation, scaling, and encoding) with linear regression modeling.
# make_pipeline parameters are *steps : list of Estimator objects
#                                       List of the scikit-learn estimators that are chained together.

model_pipeline = make_pipeline(
    preprocessor,
    model_linear_regression
)

# Fit the model.
# Fit all the transformers one after the other and sequentially transform the data. Finally, fit the transformed data using the final estimator.
# X : iterable
#     Training data. Must fulfill input requirements of first step of the pipeline.
# y : iterable, default=None
#     Training targets. Must fulfill label requirements for all steps of the pipeline.
model_pipeline.fit(X=Xtrain, y=ytrain)

# Get the backend architecture of the model_pipeline
print(f"\nArchitecture of the model pipeline:\n{model_pipeline.named_steps}\n\n")

# Get parameters for this estimator.
# Returns the parameters given in the constructor as well as the estimators contained within the steps of the Pipeline.
print(f"\nParameters given in the constructor as well as the estimators contained within the steps of the model pipeline:\n{model_pipeline.get_params(deep=True)}\n\n")


# 3) MODEL EVALUATION:
print("\n3) MODEL EVALUATION:\n")

print("Model predictive power Metrics")

# Mean Squared Error
print(f"MSE: {mean_squared_error(y_true=ytest, y_pred=model_pipeline.predict(X=Xtest))}")   

# Root Mean Squared Error
# print(f"RMSE: {mean_squared_error(ytest, model_pipeline.predict(X=Xtest), squared=False)}")   # squared is deprecated; use root_mean_squared_error instead.
# print(f"RMSE: {np.sqrt(mean_squared_error(y_true=ytest, y_pred=model_pipeline.predict(X=Xtest)))}")   # RMSE is  sqrt(MSE)
print(f"RMSE: {root_mean_squared_error(y_true=ytest, y_pred=model_pipeline.predict(X=Xtest))}\n--> Meaning: on average the predicted insurance charge is off by this amount.")   
# Meaning: on average the predicted insurance charge is off by $5,796

# R-Squared
print(f"R-squared: {r2_score(y_true=ytest, y_pred=model_pipeline.predict(X=Xtest))}\n--> Meaning: The closer the R-squared is to 1, the more the independent variables explain the variability of the dependent variable. It provides an indication of the goodness of fit of the model.")
# Meaning: R-squared of 0.78 means that 78% of the variance in the dependent variable is explained by the independent variables. The closer the R-squared is to 1, the more the independent variables explain the variability of the dependent variable.  It provides an indication of the goodness of fit of the model.

# 4) MODEL SERIALIZATION:
print("\n4) MODEL SERIALIZATION:\n")

print("Serializing Model")

# Use joinpath to add subdirectories and a filename
saved_model_path = current_directory.joinpath("model_ic.joblib")
print(f"saved_model_path: {saved_model_path}\n")

# Persist an arbitrary Python object into one file.  The python object could be a class, a function, a trained neural network, a script for later executions, etc.
filename_of_stored_object = joblib.dump(value=model_pipeline, filename=saved_model_path)
print(f"\nThe training script is a python object stored in file: {train_py_script}\nExecute the python script as: !python {train_py_script}\n")

# +++