# +++ # IMPORT REQUIRED LIBRARIES: #---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Data management import numpy as np import pandas as pd # For File path operations from pathlib import Path # Machine Learning # import sklearn # from sklearn.datasets import fetch_openml # contains a lot of useful datasets # Importing preprocessing modules from sklearn from sklearn.preprocessing import StandardScaler, OneHotEncoder # StandardScaler: to standardize numerical values [-1, 1]; OneHotEncoder: to convert categorical values into numeric as needed for ML processing from sklearn.impute import SimpleImputer # To handle missing values (if any) from sklearn.compose import make_column_transformer from sklearn.pipeline import Pipeline from sklearn.pipeline import make_pipeline # Train & Test partitioning for model estimation from sklearn.model_selection import train_test_split # train_test_split: for splitting datasets # Regresion modeling from sklearn.linear_model import LinearRegression # For mumerical target variable prediction (charges) # Metric scores for Regression use cases (Continuous target variable) from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score # For regresion model evaluation # Model Serialization import joblib # for saving (or serializing) the model. You can also use pickle. 
# ----------------------------------------------------------------------------
# Name of this training script (echoed at the end for re-run instructions).
train_py_script = "train_ic.py"

# 1) DATA ACQUISITION AND PREPARATION:
print("\n1) DATA ACQUISITION AND PREPARATION:\n")

# Resolve the data file path relative to the current working directory.
current_directory = Path.cwd()
print(f"current_directory: {current_directory}\n")

# Use joinpath to add subdirectories and a filename.
data_file_path = current_directory.joinpath("data", "insurance.csv")
print(f"data_file_path: {data_file_path}\n")

# Read the dataset from the CSV file into a DataFrame.
data_df = pd.read_csv(filepath_or_buffer=data_file_path)
print(f"DataFrame:\n\tRows: {data_df.shape[0]}\n\tColumns: {data_df.shape[1]}\n")
print("DataFrame data info:\n")
# BUG FIX: DataFrame.info() prints to stdout and returns None, so embedding it
# in an f-string additionally printed a spurious "None" line. Call it directly.
data_df.info()
print()

# Drop the "index" column, which is not required for modelling, then
# re-index without re-adding the old index as a column.
data_df = data_df.drop(columns=['index'])
data_df.reset_index(inplace=True, drop=True)

# Numerical predictors: every numeric column except the target 'charges'
# (which is the response variable, not a feature).
numerical_features = data_df.select_dtypes(include=['number']).columns.to_list()
numerical_features.remove('charges')
print(f"Numerical variables (predictors / independent variables): {numerical_features}\n")
print(f"DataFrame showing numerical variables only:\n{data_df[numerical_features]}\n")

# Categorical predictors.
categorical_features = data_df.select_dtypes(include=['object', 'string', 'category']).columns.to_list()
print(f"Categorical variables (predictors / independent variables): {categorical_features}\n")
print(f"DataFrame showing categorical variables only:\n{data_df[categorical_features]}\n")

# Boolean predictors (kept for completeness; may be empty for this dataset).
boolean_features = data_df.select_dtypes(include=['bool']).columns.to_list()
print(f"Boolean variables (predictors / independent variables): {boolean_features}\n")

# Target (dependent) variable to predict.
target = ['charges']
print(f"Target (response / dependent) variable: {target}\n")
# FIX: corrected "Minumum" -> "Minimum" typo in the user-facing summary.
print(f"Target (dependent) variable:\nMean = {data_df[target].mean()}\nMedian = {data_df[target].median()}\nStandard_deviation = {data_df[target].std()}\nMinimum = {data_df[target].min()}\nMaximum = {data_df[target].max()}\n")

# 2) MODEL ESTIMATION:
print("\n2) MODEL ESTIMATION:\n")
print("Creating data subsets")

# Separate features (predictors, X) from the target (response, y).
X = data_df[numerical_features + categorical_features]
y = data_df[target]
print(f"Separation of:\n[X]: features (predictors or independent variables):\n{X}\n\n[y]: target (response or dependent) variable:\n{y}\n")

# 80/20 train/test split; fixed random_state for reproducibility.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True, stratify=None
)
print(f"Data split (train & test)\nVerifying that Xtrain is 80% and Xtest is 20% of the total number of observations in X\nX.shape : {X.shape}\nXtrain.shape : {Xtrain.shape}\nXtest.shape : {Xtest.shape}\n")

# Numerical feature processing: impute missing values with the column median,
# then standardize (zero mean, unit variance).
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('scaler', StandardScaler(with_mean=True, with_std=True))
])

# Categorical feature processing: impute missing values with the column mode,
# then one-hot encode. handle_unknown='ignore' keeps prediction working when
# unseen categories appear at inference time.
# NOTE(review): missing_values=pd.NA only matches pandas' NA sentinel; if the
# CSV produces np.nan in object columns this imputer would miss them — confirm.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=pd.NA, strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = make_column_transformer( (numerical_pipeline, numerical_features), (categorical_pipeline, categorical_features), n_jobs=-1, # use all available processors to run jobs in parallel verbose=False, verbose_feature_names_out=True ) # Regression step # Ordinary least squares Linear Regression. # LinearRegression fits a linear model with coefficients w = (w1, ..., wp) to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation. # fit_intercept : bool, default=True # Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered). # n_jobs=-1 hyperparameter means: use all available processors to run all the jobs in parallel to speed up execution model_linear_regression = LinearRegression(fit_intercept=True, n_jobs=-1) print("Estimating Model Pipeline") # Creating a pipeline combining preprocessing steps (imputation, scaling, and encoding) with linear regression modeling. # make_pipeline parameters are *steps : list of Estimator objects # List of the scikit-learn estimators that are chained together. model_pipeline = make_pipeline( preprocessor, model_linear_regression ) # Fit the model. # Fit all the transformers one after the other and sequentially transform the data. Finally, fit the transformed data using the final estimator. # X : iterable # Training data. Must fulfill input requirements of first step of the pipeline. # y : iterable, default=None # Training targets. Must fulfill label requirements for all steps of the pipeline. model_pipeline.fit(X=Xtrain, y=ytrain) # Get the backend architecture of the model_pipeline print(f"\nArchitecture of the model pipeline:\n{model_pipeline.named_steps}\n\n") # Get parameters for this estimator. # Returns the parameters given in the constructor as well as the estimators contained within the steps of the Pipeline. 
# Constructor parameters of the pipeline and of every nested estimator.
print(f"\nParameters given in the constructor as well as the estimators contained within the steps of the model pipeline:\n{model_pipeline.get_params(deep=True)}\n\n")

# 3) MODEL EVALUATION:
print("\n3) MODEL EVALUATION:\n")
print("Model predictive power Metrics")

# FIX: predict once on the held-out test set and reuse the result for every
# metric (the original recomputed model_pipeline.predict(Xtest) three times).
ypred = model_pipeline.predict(X=Xtest)

# Mean Squared Error.
print(f"MSE: {mean_squared_error(y_true=ytest, y_pred=ypred)}")
# Root Mean Squared Error: same units as the target, so directly interpretable.
print(f"RMSE: {root_mean_squared_error(y_true=ytest, y_pred=ypred)}\n--> Meaning: on average the predicted insurance charge is off by this amount.")
# R-squared: fraction of target variance explained by the predictors.
print(f"R-squared: {r2_score(y_true=ytest, y_pred=ypred)}\n--> Meaning: The closer the R-squared is to 1, the more the independent variables explain the variability of the dependent variable. It provides an indication of the goodness of fit of the model.")

# 4) MODEL SERIALIZATION:
print("\n4) MODEL SERIALIZATION:\n")
print("Serializing Model")

# Destination file for the persisted (fitted) pipeline.
saved_model_path = current_directory.joinpath("model_ic.joblib")
print(f"saved_model_path: {saved_model_path}\n")

# Persist the fitted pipeline; joblib.dump returns the list of file names
# it wrote (a single entry here).
filename_of_stored_object = joblib.dump(value=model_pipeline, filename=saved_model_path)

print(f"\nThe training script is a python object stored in file: {train_py_script}\nExecute the python script as: !python {train_py_script}\n")
# +++