# Author: Mateusz Paszynski
# Commit: add polyreg and linreg (72d9d59)
# polynomial_regression_model.py
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
class PolynomialRegressionInsuranceModel:
    """
    A Polynomial Regression-based insurance claim prediction model with:
    1. Data loading & cleaning
    2. Preprocessing (categorical encoding, numerical scaling, polynomial features)
    3. Model training and evaluation
    4. Consistent API: preprocessing, predict, postprocessing
    """

    # Column groups consumed by the ColumnTransformer. Declared once at class
    # level so that __init__ and get_coefficients stay in sync (previously the
    # same lists were hard-coded in two places).
    CATEGORICAL_COLUMNS = ['gender', 'smoker', 'region', 'diabetic']
    NUMERICAL_COLUMNS = ['bmi', 'bloodpressure', 'children', 'age']

    def __init__(self, csv_path):
        """
        Initializes the model by loading data, preprocessing, training, and evaluating.

        Parameters:
            csv_path (str): Path to the cleaned insurance data CSV file.
        """
        # -----------------------------------------------------
        # 1. Load and clean the data
        # -----------------------------------------------------
        df = pd.read_csv(csv_path)
        # Drop irrelevant identifier columns (if present) and rows with missing values
        df = df.drop(columns=['index', 'PatientID'], errors='ignore').dropna()

        # -----------------------------------------------------
        # 2. Handle outliers in the target variable 'claim'
        # -----------------------------------------------------
        # Keep only rows whose claim lies within 3.5 standard deviations of the mean.
        target_column = 'claim'
        mean_y = df[target_column].mean()
        std_y = df[target_column].std()
        threshold_low = mean_y - 3.5 * std_y
        threshold_high = mean_y + 3.5 * std_y
        df = df[(df[target_column] >= threshold_low) & (df[target_column] <= threshold_high)]

        # -----------------------------------------------------
        # 3. Define features and target
        # -----------------------------------------------------
        self.features = df.drop(columns=[target_column])
        self.target = df[target_column].values

        # -----------------------------------------------------
        # 4. Define preprocessing pipelines
        # -----------------------------------------------------
        # Pipeline for categorical features: one-hot encode, ignoring unseen
        # categories at prediction time instead of raising.
        categorical_pipeline = Pipeline([
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])
        # Pipeline for numerical features: standardize to zero mean / unit variance
        numerical_pipeline = Pipeline([
            ('scaler', StandardScaler())
        ])
        # Combine pipelines using ColumnTransformer
        self.preprocessor = ColumnTransformer([
            ('categorical', categorical_pipeline, self.CATEGORICAL_COLUMNS),
            ('numerical', numerical_pipeline, self.NUMERICAL_COLUMNS)
        ])
        # Degree-2 polynomial expansion (squares + pairwise interactions);
        # no bias column because LinearRegression fits its own intercept.
        self.poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

        # -----------------------------------------------------
        # 5. Combine preprocessing and polynomial features
        # -----------------------------------------------------
        self.full_preprocessor = Pipeline([
            ('preprocessor', self.preprocessor),
            ('poly', self.poly)
        ])

        # -----------------------------------------------------
        # 6. Train-test split, then fit the preprocessor on TRAINING data only
        # -----------------------------------------------------
        # BUG FIX: the preprocessor was previously fit_transform-ed on the full
        # dataset before splitting, leaking test-set statistics (scaler mean/std,
        # one-hot category set) into training. Splitting the raw features first
        # with the same random_state yields the identical row split, but the
        # scaler/encoder are now fitted on the training portion only.
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            self.features,
            self.target,
            test_size=0.2,
            random_state=42
        )
        X_train = self.full_preprocessor.fit_transform(X_train_raw)
        X_test = self.full_preprocessor.transform(X_test_raw)

        # -----------------------------------------------------
        # 7. Initialize and train the Linear Regression model
        # -----------------------------------------------------
        self.model = LinearRegression()
        # Perform 5-fold cross-validation on training data
        cv_scores = cross_val_score(self.model, X_train, y_train, cv=5, scoring='r2')
        print(f"[Polynomial Regression] Cross-Validation R2 Scores: {cv_scores}")
        print(f"[Polynomial Regression] Average CV R2 Score: {cv_scores.mean():.3f}")
        # Train the model on the full training data
        self.model.fit(X_train, y_train)

        # -----------------------------------------------------
        # 8. Evaluate the model on the test set
        # -----------------------------------------------------
        y_pred = self.model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        self.__scores = {
            'MAE': mae,
            'R2': r2,
            'Cross-Validation R2 Scores': cv_scores,
            'Average CV R2': cv_scores.mean()
        }
        print(f"[Polynomial Regression] Test MAE: {mae:.3f}")
        print(f"[Polynomial Regression] Test R^2: {r2:.3f}")

    def preprocessing(self, raw_df):
        """
        Preprocesses new raw data by applying the same transformations as training.

        Parameters:
            raw_df (pd.DataFrame): New data with the same feature columns as training (excluding 'claim').

        Returns:
            np.ndarray: Transformed feature matrix ready for prediction.
        """
        return self.full_preprocessor.transform(raw_df)

    def predict(self, preprocessed_data):
        """
        Makes predictions on preprocessed data.

        Parameters:
            preprocessed_data (np.ndarray): Transformed feature matrix.

        Returns:
            np.ndarray: Predicted claim amounts.
        """
        preds = self.model.predict(preprocessed_data)
        return self.postprocessing(preds)

    def postprocessing(self, preds):
        """
        Postprocesses predictions. Currently a pass-through, but can be extended.

        Parameters:
            preds (np.ndarray): Raw predictions from the model.

        Returns:
            np.ndarray: Final predictions.
        """
        return preds

    def get_scores(self):
        """
        Retrieves the evaluation metrics.

        Returns:
            dict: Dictionary containing MAE, R2, and cross-validation scores.
        """
        return self.__scores

    def get_coefficients(self):
        """
        Retrieves the model's coefficients.

        Returns:
            pd.DataFrame: DataFrame of feature coefficients, sorted descending.
        """
        # Recover post-encoding feature names from the fitted transformers,
        # reusing the class-level column lists so they cannot drift from __init__.
        onehot = self.preprocessor.named_transformers_['categorical'].named_steps['onehot']
        scaler = self.preprocessor.named_transformers_['numerical'].named_steps['scaler']
        categorical_features = onehot.get_feature_names_out(self.CATEGORICAL_COLUMNS)
        numerical_features = scaler.get_feature_names_out(self.NUMERICAL_COLUMNS)
        all_features = np.concatenate([categorical_features, numerical_features])
        # Expand names through the polynomial transformation
        poly_feature_names = self.poly.get_feature_names_out(all_features)
        # Create DataFrame of coefficients
        coefficients = pd.DataFrame({
            'Feature': poly_feature_names,
            'Coefficient': self.model.coef_
        }).sort_values(by='Coefficient', ascending=False)
        return coefficients
if __name__ == "__main__":
    # Train on the cleaned dataset; the constructor performs the full
    # load / preprocess / fit / evaluate cycle.
    trained_model = PolynomialRegressionInsuranceModel("cleaned_insurance_data.csv")

    # Persist the entire fitted instance (preprocessor + regressor) so it can
    # be reloaded later for inference without retraining.
    joblib.dump(trained_model, "PolynomialRegressionInsuranceModel.joblib")
    print("Exported PolynomialRegressionInsuranceModel to PolynomialRegressionInsuranceModel.joblib")