Spaces:
Sleeping
Sleeping
| # polynomial_regression_model.py | |
| import pandas as pd | |
| import numpy as np | |
| import joblib | |
| from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score | |
| from sklearn.linear_model import LinearRegression | |
| from sklearn.metrics import mean_absolute_error, r2_score | |
class PolynomialRegressionInsuranceModel:
    """
    A Polynomial Regression-based insurance claim prediction model with:
    1. Data loading & cleaning
    2. Preprocessing (categorical encoding, numerical scaling, polynomial features)
    3. Model training and evaluation
    4. Consistent API: preprocessing, predict, postprocessing

    The preprocessing steps are fitted on the training split only, so the
    reported hold-out and cross-validation metrics are not influenced by
    statistics of the rows they are evaluated on.
    """

    # Column roles shared by the preprocessing pipeline and get_coefficients().
    TARGET_COLUMN = 'claim'
    CATEGORICAL_COLUMNS = ['gender', 'smoker', 'region', 'diabetic']
    NUMERICAL_COLUMNS = ['bmi', 'bloodpressure', 'children', 'age']

    def __init__(self, csv_path):
        """
        Initializes the model by loading data, preprocessing, training, and evaluating.
        Parameters:
        csv_path (str): Path to the cleaned insurance data CSV file.
        """
        # -----------------------------------------------------
        # 1. Load and clean the data
        # -----------------------------------------------------
        df = pd.read_csv(csv_path)
        # Identifier columns carry no signal; errors='ignore' tolerates files without them.
        df = df.drop(columns=['index', 'PatientID'], errors='ignore').dropna()

        # -----------------------------------------------------
        # 2. Handle outliers in the target variable
        # -----------------------------------------------------
        df = self._drop_target_outliers(df, self.TARGET_COLUMN)

        # -----------------------------------------------------
        # 3. Define features and target
        # -----------------------------------------------------
        self.features = df.drop(columns=[self.TARGET_COLUMN])
        self.target = df[self.TARGET_COLUMN].values

        # -----------------------------------------------------
        # 4. Build the (still unfitted) preprocessing pipeline
        # -----------------------------------------------------
        self.full_preprocessor = self._build_feature_pipeline()
        # Expose the individual steps under their historical attribute names.
        self.preprocessor = self.full_preprocessor.named_steps['preprocessor']
        self.poly = self.full_preprocessor.named_steps['poly']

        # -----------------------------------------------------
        # 5. Train-test split on the RAW features
        # -----------------------------------------------------
        # Splitting before any fitting keeps test rows out of the scaler /
        # encoder statistics (fitting first would leak test information).
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            self.features,
            self.target,
            test_size=0.2,
            random_state=42
        )

        # -----------------------------------------------------
        # 6. 5-fold cross-validation on the training data
        # -----------------------------------------------------
        # The preprocessing lives inside the cross-validated estimator so it
        # is refitted on each fold's training portion only.
        cv_estimator = Pipeline([
            ('features', self._build_feature_pipeline()),
            ('regressor', LinearRegression())
        ])
        cv_scores = cross_val_score(cv_estimator, X_train_raw, y_train, cv=5, scoring='r2')
        print(f"[Polynomial Regression] Cross-Validation R2 Scores: {cv_scores}")
        print(f"[Polynomial Regression] Average CV R2 Score: {cv_scores.mean():.3f}")

        # -----------------------------------------------------
        # 7. Fit preprocessing and model on the full training split
        # -----------------------------------------------------
        X_train = self.full_preprocessor.fit_transform(X_train_raw)
        X_test = self.full_preprocessor.transform(X_test_raw)
        self.model = LinearRegression()
        self.model.fit(X_train, y_train)

        # -----------------------------------------------------
        # 8. Evaluate the model on the test set
        # -----------------------------------------------------
        y_pred = self.model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        self.__scores = {
            'MAE': mae,
            'R2': r2,
            'Cross-Validation R2 Scores': cv_scores,
            'Average CV R2': cv_scores.mean()
        }
        print(f"[Polynomial Regression] Test MAE: {mae:.3f}")
        print(f"[Polynomial Regression] Test R^2: {r2:.3f}")

    @staticmethod
    def _drop_target_outliers(df, target_column, n_std=3.5):
        """
        Removes rows whose target lies more than n_std standard deviations from the mean.
        Parameters:
        df (pd.DataFrame): Data containing the target column.
        target_column (str): Name of the target column.
        n_std (float): Half-width of the accepted band, in standard deviations.
        Returns:
        pd.DataFrame: Rows of df whose target is inside the band (bounds inclusive).
        """
        center = df[target_column].mean()
        spread = df[target_column].std()
        lower = center - n_std * spread
        upper = center + n_std * spread
        return df[(df[target_column] >= lower) & (df[target_column] <= upper)]

    def _build_feature_pipeline(self):
        """
        Builds a fresh, unfitted feature pipeline: one-hot encoding for the
        categorical columns and standard scaling for the numerical columns,
        followed by a degree-2 polynomial expansion.
        Returns:
        Pipeline: Steps 'preprocessor' (ColumnTransformer) and 'poly' (PolynomialFeatures).
        """
        categorical_pipeline = Pipeline([
            # handle_unknown='ignore' lets categories unseen during fitting pass through as all-zero.
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])
        numerical_pipeline = Pipeline([
            ('scaler', StandardScaler())
        ])
        # Output column order is categorical first, then numerical;
        # get_coefficients() relies on this order.
        column_transformer = ColumnTransformer([
            ('categorical', categorical_pipeline, self.CATEGORICAL_COLUMNS),
            ('numerical', numerical_pipeline, self.NUMERICAL_COLUMNS)
        ])
        return Pipeline([
            ('preprocessor', column_transformer),
            ('poly', PolynomialFeatures(degree=2, interaction_only=False, include_bias=False))
        ])

    def preprocessing(self, raw_df):
        """
        Preprocesses new raw data by applying the same transformations as training.
        Parameters:
        raw_df (pd.DataFrame): New data with the same feature columns as training (excluding 'claim').
        Returns:
        Transformed feature matrix ready for predict().
        """
        return self.full_preprocessor.transform(raw_df)

    def predict(self, preprocessed_data):
        """
        Makes predictions on preprocessed data.
        Parameters:
        preprocessed_data: Feature matrix as returned by preprocessing().
        Returns:
        np.ndarray: Predicted claim amounts, after postprocessing().
        """
        preds = self.model.predict(preprocessed_data)
        return self.postprocessing(preds)

    def postprocessing(self, preds):
        """
        Postprocesses predictions. Currently a pass-through, but can be extended.
        Parameters:
        preds (np.ndarray): Raw predictions from the model.
        Returns:
        np.ndarray: Final predictions (the same object that was passed in).
        """
        return preds

    def get_scores(self):
        """
        Retrieves the evaluation metrics computed during initialization.
        Returns:
        dict: Keys 'MAE', 'R2', 'Cross-Validation R2 Scores' and 'Average CV R2'.
        """
        return self.__scores

    def get_coefficients(self):
        """
        Retrieves the model's coefficients.
        Returns:
        pd.DataFrame: Columns 'Feature' and 'Coefficient', sorted by coefficient in descending order.
        """
        # Names of the columns produced by the ColumnTransformer, in output order.
        categorical_features = (
            self.preprocessor.named_transformers_['categorical']
            .named_steps['onehot']
            .get_feature_names_out(self.CATEGORICAL_COLUMNS)
        )
        numerical_features = (
            self.preprocessor.named_transformers_['numerical']
            .named_steps['scaler']
            .get_feature_names_out(self.NUMERICAL_COLUMNS)
        )
        all_features = np.concatenate([categorical_features, numerical_features])
        # Expand the names the same way PolynomialFeatures expanded the columns.
        poly_feature_names = self.poly.get_feature_names_out(all_features)
        coefficients = pd.DataFrame({
            'Feature': poly_feature_names,
            'Coefficient': self.model.coef_
        }).sort_values(by='Coefficient', ascending=False)
        return coefficients
if __name__ == "__main__":
    # Script entry point: train on the cleaned dataset, then persist the
    # whole wrapper object (preprocessing + regressor) in a single file.
    source_csv = "cleaned_insurance_data.csv"
    artifact_file = "PolynomialRegressionInsuranceModel.joblib"

    # Constructing the class performs loading, training and evaluation.
    model = PolynomialRegressionInsuranceModel(source_csv)

    # NOTE(review): when run as a script the class is defined in `__main__`;
    # loaders presumably need the class importable under a matching name — confirm.
    joblib.dump(model, artifact_file)
    print("Exported PolynomialRegressionInsuranceModel to PolynomialRegressionInsuranceModel.joblib")