# +++
# IMPORT REQUIRED LIBRARIES:
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Data management
import numpy as np
import pandas as pd
# For file path operations
from pathlib import Path
# Machine Learning
# import sklearn
# from sklearn.datasets import fetch_openml  # contains a lot of useful datasets
# Importing preprocessing modules from sklearn
from sklearn.preprocessing import StandardScaler, OneHotEncoder  # StandardScaler: to standardize numerical features to zero mean and unit variance; OneHotEncoder: to convert categorical values into numeric as needed for ML processing
from sklearn.impute import SimpleImputer  # To handle missing values (if any)
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
# Train & test partitioning for model estimation
from sklearn.model_selection import train_test_split  # train_test_split: for splitting datasets
# Regression modeling
from sklearn.linear_model import LinearRegression  # For numerical target variable prediction (charges)
# Metric scores for regression use cases (continuous target variable)
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score  # For regression model evaluation
# Model serialization
import joblib  # for saving (or serializing) the model. You can also use pickle.
#----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Train python script:
train_py_script = "train_ic.py"
# 1) DATA ACQUISITION AND PREPARATION:
print("\n1) DATA ACQUISITION AND PREPARATION:\n")
# Obtain current directory and data file path
current_directory = Path.cwd()
print(f"current_directory: {current_directory}\n")
# Use joinpath to add subdirectories and a filename
data_file_path = current_directory.joinpath("data", "insurance.csv")
print(f"data_file_path: {data_file_path}\n")
# Read data: Read the dataset from the CSV file into a DataFrame
data_df = pd.read_csv(filepath_or_buffer=data_file_path)
print(f"DataFrame:\n\tRows: {data_df.shape[0]}\n\tColumns: {data_df.shape[1]}\n")
print("DataFrame data info:")
data_df.info()  # info() prints directly and returns None, so don't wrap the call in an f-string (that would print "None")
print()
# Drop the columns which are not required for modelling
# Remove the "index" variable (column) from the DataFrame
# data_df.drop(columns='index', inplace=True)
data_df = data_df.drop(columns=['index'])
# Reset the indices (re-indexing), without re-adding the old index as a column in the new DataFrame
data_df.reset_index(inplace=True, drop=True)
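# Note: drop() raises a KeyError if the "index" column is absent. A minimal, more defensive
# sketch (an optional alternative, not part of the original script) tolerates its absence:
# data_df = data_df.drop(columns=['index'], errors='ignore')  # silently skips missing columns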
# Split data into numerical and categorical features used for modeling
# Define numerical and categorical feature columns
# Store the numerical features (variables) in a new variable called numerical_features
numerical_features = data_df.select_dtypes(include=['number']).columns.to_list()  # selects all numeric columns; note this also picks up "charges", which we don't want as a predictor (X) variable because it is the response/target (y) variable, so we remove it on the next line
numerical_features.remove('charges')  # remove the 'charges' column from the numerical_features list to be used for ML processing
print(f"Numerical variables (predictors / independent variables): {numerical_features}\n")
print(f"DataFrame showing numerical variables only:\n{data_df[numerical_features]}\n")
# print(f"DataFrame showing numerical variables only:\n{data_df[numerical_features].head(5)}\n")
# Store the categorical features (variables) in a new variable called categorical_features
categorical_features = data_df.select_dtypes(include=['object', 'string', 'category']).columns.to_list()
print(f"Categorical variables (predictors / independent variables): {categorical_features}\n")
print(f"DataFrame showing categorical variables only:\n{data_df[categorical_features]}\n")
# print(f"DataFrame showing categorical variables only:\n{data_df[categorical_features].head(5)}\n")
# Store the boolean features (variables) in a new variable called boolean_features
boolean_features = data_df.select_dtypes(include=['bool']).columns.to_list()
print(f"Boolean variables (predictors / independent variables): {boolean_features}\n")
# Target (dependent) variable to predict
target = ['charges']
print(f"Target (response / dependent) variable: {target}\n")
print(f"Target (dependent) variable:\nMean = {data_df[target].mean()}\nMedian = {data_df[target].median()}\nStandard_deviation = {data_df[target].std()}\nMinimum = {data_df[target].min()}\nMaximum = {data_df[target].max()}\n")
# 2) MODEL ESTIMATION:
print("\n2) MODEL ESTIMATION:\n")
print("Creating data subsets")
# Separate features (predictors or independent variables) {X} and target (response or dependent) variable {y}
X = data_df[numerical_features + categorical_features]
y = data_df[target]
print(f"Separation of:\n[X]: features (predictors or independent variables):\n{X}\n\n[y]: target (response or dependent) variable:\n{y}\n")
# Split the dataset into training and testing sets
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=None
)
print(f"Data split (train & test)\nVerifying that Xtrain is 80% and Xtest is 20% of the total number of observations in X\nX.shape : {X.shape}\nXtrain.shape : {Xtrain.shape}\nXtest.shape : {Xtest.shape}\n")
# Creating a pipeline for numerical feature processing, including imputation of missing values with the median and standard scaling.
# This code creates a pipeline named numerical_pipeline to process numerical features.
# It consists of two steps:
#   'imputer': Imputes missing values using the median strategy with SimpleImputer.
#   'scaler': Standardizes the numerical features using StandardScaler.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),  # in case of missing values, replace NaN with the median value of that column.
    ('scaler', StandardScaler(with_mean=True, with_std=True))  # apply StandardScaler on the numerical features
])
# Creating a pipeline named categorical_pipeline for processing categorical features, involving imputation of missing values using the most frequent value and one-hot encoding with handling of unknown categories.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(missing_values=pd.NA, strategy='most_frequent')),  # missing_values can be set to either `np.nan` or `pd.NA`; missing entries are replaced by the mode of that column. Note that columns loaded with read_csv's default dtypes carry np.nan, so np.nan is usually the safer choice here.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # apply OneHotEncoder on the categorical features
])
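# For reference, the same two pipelines can be built with make_pipeline, which names the
# steps automatically from the class names; a minimal equivalent sketch (not used below):
# numerical_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
# categorical_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))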
# Data pre-processing step
# Note: selecting columns by name in make_column_transformer requires a pandas DataFrame as input
# Creating a column transformer named preprocessor to apply specific pipelines to numerical and categorical features separately.
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features),
    (categorical_pipeline, categorical_features),
    n_jobs=-1,  # use all available processors to run jobs in parallel
    verbose=False,
    verbose_feature_names_out=True
)
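# Equivalently, ColumnTransformer lets you name each transformer explicitly instead of relying
# on auto-generated names; a minimal equivalent sketch (an alternative, not used below):
# from sklearn.compose import ColumnTransformer
# preprocessor = ColumnTransformer(transformers=[
#     ('num', numerical_pipeline, numerical_features),
#     ('cat', categorical_pipeline, categorical_features),
# ])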
# Regression step
# Ordinary least squares Linear Regression.
# LinearRegression fits a linear model with coefficients w = (w1, ..., wp) to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation.
# fit_intercept : bool, default=True
#   Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).
# n_jobs=-1 means: use all available processors to run the jobs in parallel to speed up execution (for LinearRegression this mainly helps with multi-target problems)
model_linear_regression = LinearRegression(fit_intercept=True, n_jobs=-1)
print("Estimating Model Pipeline")
# Creating a pipeline combining the preprocessing steps (imputation, scaling, and encoding) with linear regression modeling.
# make_pipeline parameters are *steps : list of Estimator objects
#   List of the scikit-learn estimators that are chained together.
model_pipeline = make_pipeline(
    preprocessor,
    model_linear_regression
)
# Fit the model.
# Fit all the transformers one after the other and sequentially transform the data. Finally, fit the transformed data using the final estimator.
# X : iterable
#   Training data. Must fulfill input requirements of the first step of the pipeline.
# y : iterable, default=None
#   Training targets. Must fulfill label requirements for all steps of the pipeline.
model_pipeline.fit(X=Xtrain, y=ytrain)
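# Note: ytrain is a single-column DataFrame (shape (n, 1)) because target is a list.
# LinearRegression accepts that, but a 1-D target avoids shape-related warnings with many
# other estimators; a minimal sketch of the alternative (not the original call):
# model_pipeline.fit(X=Xtrain, y=ytrain['charges'])  # pass the target as a 1-D Series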
# Get the backend architecture of the model_pipeline
print(f"\nArchitecture of the model pipeline:\n{model_pipeline.named_steps}\n\n")
# Get parameters for this estimator.
# Returns the parameters given in the constructor as well as the estimators contained within the steps of the Pipeline.
print(f"\nParameters given in the constructor as well as the estimators contained within the steps of the model pipeline:\n{model_pipeline.get_params(deep=True)}\n\n")
# 3) MODEL EVALUATION:
print("\n3) MODEL EVALUATION:\n")
print("Model predictive power metrics")
# Mean Squared Error
print(f"MSE: {mean_squared_error(y_true=ytest, y_pred=model_pipeline.predict(X=Xtest))}")
# Root Mean Squared Error
# print(f"RMSE: {mean_squared_error(ytest, model_pipeline.predict(X=Xtest), squared=False)}")  # the squared parameter is deprecated; use root_mean_squared_error instead.
# print(f"RMSE: {np.sqrt(mean_squared_error(y_true=ytest, y_pred=model_pipeline.predict(X=Xtest)))}")  # RMSE is sqrt(MSE)
print(f"RMSE: {root_mean_squared_error(y_true=ytest, y_pred=model_pipeline.predict(X=Xtest))}\n--> Meaning: on average the predicted insurance charge is off by this amount.")
# Meaning: on average the predicted insurance charge is off by $5,796
# R-Squared
print(f"R-squared: {r2_score(y_true=ytest, y_pred=model_pipeline.predict(X=Xtest))}\n--> Meaning: The closer the R-squared is to 1, the more the independent variables explain the variability of the dependent variable. It provides an indication of the goodness of fit of the model.")
# Meaning: an R-squared of 0.78 means that 78% of the variance in the dependent variable is explained by the independent variables.
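# MAE (mean absolute error) is a common complementary metric, less sensitive to outliers
# than RMSE; a minimal optional sketch (not part of the original evaluation):
# from sklearn.metrics import mean_absolute_error
# print(f"MAE: {mean_absolute_error(y_true=ytest, y_pred=model_pipeline.predict(X=Xtest))}")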
# 4) MODEL SERIALIZATION:
print("\n4) MODEL SERIALIZATION:\n")
print("Serializing Model")
# Use joinpath to add subdirectories and a filename
saved_model_path = current_directory.joinpath("model_ic.joblib")
print(f"saved_model_path: {saved_model_path}\n")
# Persist an arbitrary Python object into one file. The Python object could be a class, a function, a trained neural network, a script for later execution, etc.
filename_of_stored_object = joblib.dump(value=model_pipeline, filename=saved_model_path)
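# The serialized pipeline can later be restored and used for inference; a minimal sketch
# (assuming the .joblib file exists at saved_model_path):
# loaded_pipeline = joblib.load(filename=saved_model_path)
# print(loaded_pipeline.predict(X=Xtest.head(5)))  # predictions from the reloaded pipeline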
| print(f"\nThe training script is a python object stored in file: {train_py_script}\nExecute the python script as: !python {train_py_script}\n") | |
| # +++ | |