| |
| import pandas as pd |
| import joblib |
|
|
| from sklearn.preprocessing import StandardScaler, OneHotEncoder |
| from sklearn.compose import make_column_transformer |
| from sklearn.compose import ColumnTransformer |
| from sklearn.pipeline import make_pipeline |
| from sklearn.model_selection import train_test_split |
| from sklearn.linear_model import LinearRegression |
| from sklearn.metrics import mean_squared_error, r2_score |
|
|
| |
| data_df = pd.read_csv("insurance.csv") |
| |
| data_df = data_df.drop(columns=['index']) |
|
|
| X = data_df.drop(columns='charges') |
| y = data_df['charges'] |
|
|
| categorical_features = X.select_dtypes(include=['object']).columns |
| numeric_features = X.select_dtypes(include=['number']).columns |
|
|
| print("Creating data subsets") |
|
|
| preprocessor = make_column_transformer( |
| (StandardScaler(), numeric_features), |
| (OneHotEncoder(handle_unknown='ignore'), categorical_features) |
| ) |
|
|
| Xtrain, Xtest, ytrain, ytest = train_test_split( |
| X, y, |
| test_size=0.2, |
| random_state=42 |
| ) |
| model_linear_regression = LinearRegression(n_jobs=-1) |
|
|
| print("Estimating Model Pipeline") |
|
|
| model_pipeline = make_pipeline( |
| preprocessor, |
| model_linear_regression |
| ) |
|
|
| model_pipeline.fit(Xtrain, ytrain) |
|
|
| print("Logging Metrics") |
| print(f"R-squared: {r2_score(ytest, model_pipeline.predict(Xtest))}") |
|
|
| print("Serializing Model") |
|
|
| saved_model_path = "model.joblib" |
|
|
| joblib.dump(model_pipeline, saved_model_path) |
|
|