# Spaces: 3nthusiast / Insurance-Claim-Predict (Sleeping)
# File size: 8,274 Bytes
# 72d9d59

# polynomial_regression_model.py

import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

class PolynomialRegressionInsuranceModel:
    """
    A Polynomial Regression-based insurance claim prediction model with:
      1. Data loading & cleaning
      2. Preprocessing (categorical encoding, numerical scaling, polynomial features)
      3. Model training and evaluation
      4. Consistent API: preprocessing, predict, postprocessing

    The preprocessing steps (one-hot encoder, scaler, polynomial expansion) are
    fitted on the training split only, so the held-out test rows never
    influence the fitted transformers or the reported metrics.
    """

    # Column schema shared by training (__init__) and get_coefficients().
    # Kept at class level so both places always agree.
    TARGET_COLUMN = 'claim'
    CATEGORICAL_COLUMNS = ('gender', 'smoker', 'region', 'diabetic')
    NUMERICAL_COLUMNS = ('bmi', 'bloodpressure', 'children', 'age')

    # Rows whose target lies further than this many standard deviations
    # from the target mean are discarded before training.
    OUTLIER_STD_MULTIPLIER = 3.5

    def __init__(self, csv_path):
        """
        Initializes the model by loading data, preprocessing, training, and evaluating.

        Parameters:
            csv_path (str): Path to the cleaned insurance data CSV file.
        """
        # -----------------------------------------------------
        # 1. Load and clean the data
        # -----------------------------------------------------
        df = pd.read_csv(csv_path)
        # Drop irrelevant columns (if present) and rows with missing values
        df = df.drop(columns=['index', 'PatientID'], errors='ignore').dropna()

        # -----------------------------------------------------
        # 2. Handle outliers in the target variable
        # -----------------------------------------------------
        target_column = self.TARGET_COLUMN
        mean_y = df[target_column].mean()
        std_y = df[target_column].std()
        spread = self.OUTLIER_STD_MULTIPLIER * std_y
        # `between` is inclusive on both ends by default.
        df = df[df[target_column].between(mean_y - spread, mean_y + spread)]

        # -----------------------------------------------------
        # 3. Define features and target
        # -----------------------------------------------------
        self.features = df.drop(columns=[target_column])
        self.target = df[target_column].values

        # -----------------------------------------------------
        # 4. Define preprocessing pipelines
        # -----------------------------------------------------
        categorical_columns = list(self.CATEGORICAL_COLUMNS)
        numerical_columns = list(self.NUMERICAL_COLUMNS)

        # Pipeline for categorical features; categories unseen during
        # fitting are encoded as all-zero rows instead of raising.
        categorical_pipeline = Pipeline([
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Pipeline for numerical features
        numerical_pipeline = Pipeline([
            ('scaler', StandardScaler())
        ])

        # Combine pipelines using ColumnTransformer
        self.preprocessor = ColumnTransformer([
            ('categorical', categorical_pipeline, categorical_columns),
            ('numerical', numerical_pipeline, numerical_columns)
        ])

        # Degree-2 polynomial expansion (squares and pairwise interactions)
        self.poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

        # -----------------------------------------------------
        # 5. Combine preprocessing and polynomial features
        # -----------------------------------------------------
        self.full_preprocessor = Pipeline([
            ('preprocessor', self.preprocessor),
            ('poly', self.poly)
        ])

        # -----------------------------------------------------
        # 6. Train-test split (on RAW features)
        # -----------------------------------------------------
        # Splitting before any transformer is fitted prevents the test rows
        # from leaking into the scaler statistics / encoder categories.
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            self.features,
            self.target,
            test_size=0.2,
            random_state=42
        )

        # -----------------------------------------------------
        # 7. Initialize and train the Linear Regression model
        # -----------------------------------------------------
        self.model = LinearRegression()

        # Perform 5-fold cross-validation on training data. The pipeline is
        # cloned and refitted for every fold by cross_val_score, so each
        # fold's preprocessing is learned from that fold's training part
        # only, and self.full_preprocessor / self.model stay unfitted here.
        cv_pipeline = Pipeline([
            ('features', self.full_preprocessor),
            ('model', self.model)
        ])
        cv_scores = cross_val_score(cv_pipeline, X_train_raw, y_train, cv=5, scoring='r2')
        print(f"[Polynomial Regression] Cross-Validation R2 Scores: {cv_scores}")
        print(f"[Polynomial Regression] Average CV R2 Score: {cv_scores.mean():.3f}")

        # Fit the preprocessing on the training rows only, then train the
        # model on the full (transformed) training data.
        X_train = self.full_preprocessor.fit_transform(X_train_raw)
        self.model.fit(X_train, y_train)

        # -----------------------------------------------------
        # 8. Evaluate the model on the test set
        # -----------------------------------------------------
        # The test rows are only transformed, never used for fitting.
        X_test = self.full_preprocessor.transform(X_test_raw)
        y_pred = self.model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        self.__scores = {
            'MAE': mae,
            'R2': r2,
            'Cross-Validation R2 Scores': cv_scores,
            'Average CV R2': cv_scores.mean()
        }

        print(f"[Polynomial Regression] Test MAE: {mae:.3f}")
        print(f"[Polynomial Regression] Test R^2: {r2:.3f}")

    def preprocessing(self, raw_df):
        """
        Preprocesses new raw data by applying the same transformations as training.

        Parameters:
            raw_df (pd.DataFrame): New data with the same feature columns as training (excluding 'claim').

        Returns:
            Transformed feature matrix ready for prediction.
        """
        return self.full_preprocessor.transform(raw_df)

    def predict(self, preprocessed_data):
        """
        Makes predictions on preprocessed data.

        Parameters:
            preprocessed_data: Transformed feature matrix, as returned by
                `preprocessing`.

        Returns:
            Predicted claim amounts, after `postprocessing`.
        """
        preds = self.model.predict(preprocessed_data)
        return self.postprocessing(preds)

    def postprocessing(self, preds):
        """
        Postprocesses predictions. Currently a pass-through, but can be extended.

        Parameters:
            preds: Raw predictions from the model.

        Returns:
            The same object that was passed in.
        """
        return preds

    def get_scores(self):
        """
        Retrieves the evaluation metrics.

        Returns:
            dict: Dictionary with the keys 'MAE', 'R2',
            'Cross-Validation R2 Scores' and 'Average CV R2'.
        """
        return self.__scores

    def get_coefficients(self):
        """
        Retrieves the model's coefficients.

        Returns:
            pd.DataFrame: One row per expanded feature with the columns
            'Feature' and 'Coefficient', sorted by coefficient descending.
        """
        # Fitted leaf transformers inside the ColumnTransformer
        transformers = self.preprocessor.named_transformers_
        onehot = transformers['categorical'].named_steps['onehot']
        scaler = transformers['numerical'].named_steps['scaler']

        # Feature names after encoding/scaling; the order (categorical first,
        # numerical second) mirrors the ColumnTransformer definition.
        categorical_features = onehot.get_feature_names_out(list(self.CATEGORICAL_COLUMNS))
        numerical_features = scaler.get_feature_names_out(list(self.NUMERICAL_COLUMNS))
        all_features = np.concatenate([categorical_features, numerical_features])

        # Get feature names after polynomial transformation
        poly_feature_names = self.poly.get_feature_names_out(all_features)

        # Create DataFrame of coefficients
        coefficients = pd.DataFrame({
            'Feature': poly_feature_names,
            'Coefficient': self.model.coef_
        }).sort_values(by='Coefficient', ascending=False)

        return coefficients

if __name__ == "__main__":
    # Script entry point: build the model from the cleaned CSV. The
    # constructor loads the data, trains, evaluates and prints the metrics.
    trained_model = PolynomialRegressionInsuranceModel("cleaned_insurance_data.csv")

    # Persist the whole wrapper instance (preprocessing + regressor + scores)
    # so it can be reloaded later with joblib.load.
    joblib.dump(trained_model, "PolynomialRegressionInsuranceModel.joblib")
    print("Exported PolynomialRegressionInsuranceModel to PolynomialRegressionInsuranceModel.joblib")