Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import joblib | |
| from sklearn.experimental import enable_iterative_imputer | |
| from sklearn.impute import IterativeImputer | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.preprocessing import OneHotEncoder, StandardScaler | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.ensemble import RandomForestRegressor | |
| from sklearn.metrics import mean_absolute_error, r2_score | |
class RandomForestInsuranceModel:
    """
    A Random Forest regressor class with:
    1. Data loading & cleaning (iterative imputation, outlier clipping)
    2. A fixed set of hyperparameters (n_estimators=100, max_depth=4, min_samples_split=15)
    3. A ColumnTransformer for numeric & categorical data
    4. Consistent API: preprocessing, predict, postprocessing
    """

    def __init__(self, csv_path):
        """
        Loads the CSV, cleans data, sets up the column transformer,
        trains a RandomForestRegressor with fixed hyperparameters,
        and evaluates on a test set.

        Parameters
        ----------
        csv_path : str or path-like
            CSV expected to contain at least the columns: diabetic, gender,
            region, smoker, children, age, bmi, bloodpressure, claim.
        """
        # -----------------------------------------------------
        # 1. Load and clean data
        # -----------------------------------------------------
        df = pd.read_csv(csv_path)
        # Drop irrelevant columns if present.
        df = df.drop(columns=["index", "PatientID"], errors="ignore")
        # BUG FIX: impute BEFORE dropping NaNs. The original called
        # dropna() first, which removed every incomplete row and turned
        # the iterative imputation below into a no-op.
        self._impute(df, columns=['age', 'bmi', 'bloodpressure'])
        # Remove any rows that still contain NaNs in other columns.
        df = df.dropna()
        # Clip outliers in 'claim' (keep rows between the 1st and 98th percentile).
        lower_percentile = df['claim'].quantile(0.01)
        upper_percentile = df['claim'].quantile(0.98)
        df = df[
            (df['claim'] >= lower_percentile) & (df['claim'] <= upper_percentile)
        ]
        # -----------------------------------------------------
        # 2. Separate features & target
        # -----------------------------------------------------
        features = df.drop(columns=['claim'])
        target = df['claim'].to_numpy()
        # -----------------------------------------------------
        # 3. Create ColumnTransformer
        # -----------------------------------------------------
        # handle_unknown='ignore' keeps `preprocessing` from raising on
        # categories unseen during fitting.
        text_pipeline = Pipeline([
            ('one-hot', OneHotEncoder(handle_unknown='ignore'))
        ])
        nums_pipeline = Pipeline([
            ('normalize', StandardScaler(with_mean=False))
        ])
        self.ct = ColumnTransformer([
            ('categorical', text_pipeline, ['diabetic', 'gender', 'region', 'smoker']),
            ('numerical', nums_pipeline, ['children', 'age', 'bmi', 'bloodpressure'])
        ])
        # NOTE(review): fitting the transformer on the full dataset before the
        # split leaks test-set statistics into the scaler. Kept as-is to
        # preserve existing results; consider fitting on X_train only.
        X_full_transformed = self.ct.fit_transform(features)
        # -----------------------------------------------------
        # 4. Train/test split
        # -----------------------------------------------------
        X_train, X_test, y_train, y_test = train_test_split(
            X_full_transformed,
            target,
            test_size=0.2,
            random_state=42
        )
        # -----------------------------------------------------
        # 5. RandomForest with fixed hyperparameters
        # -----------------------------------------------------
        self.model = RandomForestRegressor(
            n_estimators=100,
            max_depth=4,
            min_samples_split=15,
            random_state=42
        )
        self.model.fit(X_train, y_train)
        # -----------------------------------------------------
        # 6. Evaluate
        # -----------------------------------------------------
        mae, r2 = self._evaluate(X_test, y_test)
        print(f"[RANDOM FOREST] Test MAE: {mae:.3f}")
        print(f"[RANDOM FOREST] Test R^2: {r2:.3f}")

    # -------------------------------------------
    # Private: iterative imputation
    # -------------------------------------------
    def _impute(self, df, columns):
        """Iteratively impute missing values in `columns`, modifying df in place."""
        # verbose=0: the per-iteration log spam from verbose=2 added no value.
        imp = IterativeImputer(max_iter=5, verbose=0)
        df[columns] = imp.fit_transform(df[columns])

    # -------------------------------------------
    # Private: evaluation
    # -------------------------------------------
    def _evaluate(self, X_test, y_test):
        """Return (MAE, R^2) of self.model on the given test set."""
        y_pred = self.model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        return mae, r2

    # -------------------------------------------
    # Public: preprocessing
    # -------------------------------------------
    def preprocessing(self, raw_df):
        """
        Takes a new DataFrame with the columns the pipeline expects,
        and returns the transformed matrix.
        """
        return self.ct.transform(raw_df)

    # -------------------------------------------
    # Public: predict
    # -------------------------------------------
    def predict(self, preprocessed_data):
        """
        Takes feature data already processed by `preprocessing`,
        returns predictions in the original claim scale.
        """
        preds = self.model.predict(preprocessed_data)
        return self.postprocessing(preds)

    # -------------------------------------------
    # Public: postprocessing
    # -------------------------------------------
    def postprocessing(self, preds):
        """
        Currently a pass-through, as there's no target scaling to invert.
        """
        return preds
if __name__ == "__main__":
    # Train on the local insurance dataset, then persist the entire fitted
    # instance (RandomForest model plus its ColumnTransformer) via joblib.
    trained = RandomForestInsuranceModel("cleaned_insurance_data.csv")
    joblib.dump(trained, "RandomForestInsuranceModel.joblib")
    print("Exported RandomForestInsuranceModel to RandomForestInsuranceModel.joblib")