# Author: Mateusz Paszynski
# Commit: add polyreg and linreg (72d9d59)
# polynomial_regression_model.py
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
class PolynomialRegressionInsuranceModel:
    """
    A Polynomial Regression-based insurance claim prediction model with:
    1. Data loading & cleaning
    2. Preprocessing (categorical encoding, numerical scaling, polynomial features)
    3. Model training and evaluation
    4. Consistent API: preprocessing, predict, postprocessing
    """

    # Column groups consumed by the ColumnTransformer. Declared once at class
    # level so that __init__ and get_coefficients stay in sync (previously the
    # same lists were hard-coded in two places).
    CATEGORICAL_COLUMNS = ['gender', 'smoker', 'region', 'diabetic']
    NUMERICAL_COLUMNS = ['bmi', 'bloodpressure', 'children', 'age']

    def __init__(self, csv_path):
        """
        Initializes the model by loading data, preprocessing, training, and evaluating.

        Parameters:
            csv_path (str): Path to the cleaned insurance data CSV file.
        """
        # -----------------------------------------------------
        # 1. Load and clean the data
        # -----------------------------------------------------
        df = pd.read_csv(csv_path)
        # Drop irrelevant identifier columns (if present) and rows with missing values
        df = df.drop(columns=['index', 'PatientID'], errors='ignore').dropna()

        # -----------------------------------------------------
        # 2. Handle outliers in the target variable 'claim'
        # -----------------------------------------------------
        # Keep only rows whose claim lies within 3.5 standard deviations of the mean.
        target_column = 'claim'
        mean_y = df[target_column].mean()
        std_y = df[target_column].std()
        threshold_low = mean_y - 3.5 * std_y
        threshold_high = mean_y + 3.5 * std_y
        df = df[(df[target_column] >= threshold_low) & (df[target_column] <= threshold_high)]

        # -----------------------------------------------------
        # 3. Define features and target
        # -----------------------------------------------------
        self.features = df.drop(columns=[target_column])
        self.target = df[target_column].values

        # -----------------------------------------------------
        # 4. Define preprocessing pipelines
        # -----------------------------------------------------
        # Pipeline for categorical features: one-hot encode, ignoring unseen
        # categories at prediction time instead of raising.
        categorical_pipeline = Pipeline([
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])
        # Pipeline for numerical features: standardize to zero mean / unit variance
        numerical_pipeline = Pipeline([
            ('scaler', StandardScaler())
        ])
        # Combine pipelines using ColumnTransformer
        self.preprocessor = ColumnTransformer([
            ('categorical', categorical_pipeline, self.CATEGORICAL_COLUMNS),
            ('numerical', numerical_pipeline, self.NUMERICAL_COLUMNS)
        ])
        # Degree-2 polynomial expansion (squares + pairwise interactions);
        # no bias column because LinearRegression fits its own intercept.
        self.poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

        # -----------------------------------------------------
        # 5. Combine preprocessing and polynomial features
        # -----------------------------------------------------
        self.full_preprocessor = Pipeline([
            ('preprocessor', self.preprocessor),
            ('poly', self.poly)
        ])

        # -----------------------------------------------------
        # 6. Train-test split, then fit the preprocessor on TRAINING data only
        # -----------------------------------------------------
        # BUG FIX: the preprocessor was previously fit_transform-ed on the full
        # dataset before splitting, leaking test-set statistics (scaler mean/std,
        # one-hot category set) into training. Splitting the raw features first
        # with the same random_state yields the identical row split, but the
        # scaler/encoder are now fitted on the training portion only.
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            self.features,
            self.target,
            test_size=0.2,
            random_state=42
        )
        X_train = self.full_preprocessor.fit_transform(X_train_raw)
        X_test = self.full_preprocessor.transform(X_test_raw)

        # -----------------------------------------------------
        # 7. Initialize and train the Linear Regression model
        # -----------------------------------------------------
        self.model = LinearRegression()
        # Perform 5-fold cross-validation on training data
        cv_scores = cross_val_score(self.model, X_train, y_train, cv=5, scoring='r2')
        print(f"[Polynomial Regression] Cross-Validation R2 Scores: {cv_scores}")
        print(f"[Polynomial Regression] Average CV R2 Score: {cv_scores.mean():.3f}")
        # Train the model on the full training data
        self.model.fit(X_train, y_train)

        # -----------------------------------------------------
        # 8. Evaluate the model on the test set
        # -----------------------------------------------------
        y_pred = self.model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        self.__scores = {
            'MAE': mae,
            'R2': r2,
            'Cross-Validation R2 Scores': cv_scores,
            'Average CV R2': cv_scores.mean()
        }
        print(f"[Polynomial Regression] Test MAE: {mae:.3f}")
        print(f"[Polynomial Regression] Test R^2: {r2:.3f}")

    def preprocessing(self, raw_df):
        """
        Preprocesses new raw data by applying the same transformations as training.

        Parameters:
            raw_df (pd.DataFrame): New data with the same feature columns as training (excluding 'claim').

        Returns:
            np.ndarray: Transformed feature matrix ready for prediction.
        """
        return self.full_preprocessor.transform(raw_df)

    def predict(self, preprocessed_data):
        """
        Makes predictions on preprocessed data.

        Parameters:
            preprocessed_data (np.ndarray): Transformed feature matrix.

        Returns:
            np.ndarray: Predicted claim amounts.
        """
        preds = self.model.predict(preprocessed_data)
        return self.postprocessing(preds)

    def postprocessing(self, preds):
        """
        Postprocesses predictions. Currently a pass-through, but can be extended.

        Parameters:
            preds (np.ndarray): Raw predictions from the model.

        Returns:
            np.ndarray: Final predictions.
        """
        return preds

    def get_scores(self):
        """
        Retrieves the evaluation metrics.

        Returns:
            dict: Dictionary containing MAE, R2, and cross-validation scores.
        """
        return self.__scores

    def get_coefficients(self):
        """
        Retrieves the model's coefficients.

        Returns:
            pd.DataFrame: DataFrame of feature coefficients, sorted descending.
        """
        # Recover post-encoding feature names from the fitted transformers,
        # reusing the class-level column lists so they cannot drift from __init__.
        onehot = self.preprocessor.named_transformers_['categorical'].named_steps['onehot']
        scaler = self.preprocessor.named_transformers_['numerical'].named_steps['scaler']
        categorical_features = onehot.get_feature_names_out(self.CATEGORICAL_COLUMNS)
        numerical_features = scaler.get_feature_names_out(self.NUMERICAL_COLUMNS)
        all_features = np.concatenate([categorical_features, numerical_features])
        # Expand names through the polynomial transformation
        poly_feature_names = self.poly.get_feature_names_out(all_features)
        # Create DataFrame of coefficients
        coefficients = pd.DataFrame({
            'Feature': poly_feature_names,
            'Coefficient': self.model.coef_
        }).sort_values(by='Coefficient', ascending=False)
        return coefficients
if __name__ == "__main__":
    # Train on the cleaned dataset; the constructor performs the full
    # load / preprocess / fit / evaluate cycle.
    trained_model = PolynomialRegressionInsuranceModel("cleaned_insurance_data.csv")

    # Persist the entire fitted instance (preprocessor + regressor) so it can
    # be reloaded later for inference without retraining.
    joblib.dump(trained_model, "PolynomialRegressionInsuranceModel.joblib")
    print("Exported PolynomialRegressionInsuranceModel to PolynomialRegressionInsuranceModel.joblib")