Spaces:

3nthusiast
/

Insurance-Claim-Predict

Sleeping

Insurance-Claim-Predict / models /knn.py

Mateusz Paszynski

publish website

5de1466 about 1 year ago

4.38 kB

	import pandas as pd
	import numpy as np
	import joblib

	from sklearn.model_selection import train_test_split, GridSearchCV
	from sklearn.neighbors import KNeighborsRegressor
	from sklearn.metrics import mean_squared_error, r2_score

	class KNNInsuranceModel:
	    """
	    KNN regressor for insurance claims, trained at construction time.

	    Pipeline:
	    1. Load the CSV, drop identifier columns and incomplete rows
	    2. Encode 'smoker' as 0/1
	    3. Grid-search KNeighborsRegressor hyperparameters (5-fold CV)
	    4. Expose a consistent API: preprocessing -> predict -> postprocessing
	    """

	    # Single source of truth for the columns used by both training and
	    # inference, so the two code paths cannot drift apart.
	    FEATURE_COLUMNS = ("bloodpressure", "bmi", "smoker")
	    TARGET_COLUMN = "claim"

	    def __init__(self, csv_path):
	        """
	        Train the model on the CSV at `csv_path` and store test-set scores.

	        The CSV must provide the columns 'bloodpressure', 'bmi', 'smoker'
	        and 'claim'; a missing one raises KeyError. Columns 'index' and
	        'PatientID' are dropped when present, and rows containing any
	        missing value are discarded.

	        Side effects: sets `self.model` to the fitted best estimator and
	        prints the test MSE and R^2.
	        """
	        # --------------------------------------------------
	        # 1. Load data
	        # --------------------------------------------------
	        insurance_df = pd.read_csv(csv_path)
	        # Drop identifier columns if they exist, then incomplete rows
	        insurance_df = insurance_df.drop(
	            columns=["index", "PatientID"], errors="ignore"
	        ).dropna()

	        # Same encoding as `preprocessing`, via the shared helper
	        insurance_df["smoker"] = self._encode_smoker(insurance_df["smoker"])

	        X = insurance_df[list(self.FEATURE_COLUMNS)]
	        y = insurance_df[self.TARGET_COLUMN]

	        # --------------------------------------------------
	        # 2. Train-test split (fixed seed for reproducible scores)
	        # --------------------------------------------------
	        X_train, X_test, y_train, y_test = train_test_split(
	            X, y, test_size=0.2, random_state=42
	        )

	        # --------------------------------------------------
	        # 3. Grid search for best KNN
	        # --------------------------------------------------
	        # NOTE: with the default p=2, 'minkowski' is the Euclidean distance,
	        # so it duplicates 'euclidean'; it is kept so the selected
	        # hyperparameters stay the same as before.
	        param_grid = {
	            'n_neighbors': range(1, 31),
	            'weights': ['uniform', 'distance'],
	            'metric': ['minkowski', 'euclidean', 'manhattan'],
	        }
	        grid_search = GridSearchCV(
	            KNeighborsRegressor(),
	            param_grid,
	            cv=5,
	        )
	        grid_search.fit(X_train, y_train)

	        # GridSearchCV defaults to refit=True, so best_estimator_ is already
	        # fitted on the whole of X_train; fitting it again is unnecessary.
	        self.model = grid_search.best_estimator_

	        # --------------------------------------------------
	        # 4. Evaluate on test set
	        # --------------------------------------------------
	        y_pred = self.model.predict(X_test)
	        mse = mean_squared_error(y_test, y_pred)
	        r2 = r2_score(y_test, y_pred)
	        self.__scores = [mse, r2]

	        print(f"[KNN] Test MSE: {mse:.3f}")
	        print(f"[KNN] Test R^2: {r2:.3f}")

	    @staticmethod
	    def _encode_smoker(column):
	        """
	        Encode a 'smoker' Series as an integer array of 0/1.

	        Text values are matched case-insensitively after trimming
	        whitespace, so 'Yes', 'yes' and ' YES ' all map to 1. Numeric or
	        boolean columns keep their 1/True entries, which makes encoding
	        already-preprocessed data a no-op instead of zeroing the feature.
	        Every other value (including missing ones) maps to 0.
	        """
	        if pd.api.types.is_numeric_dtype(column):
	            # Already encoded (0/1 or bool): keep the positive entries
	            is_smoker = column.eq(1)
	        else:
	            is_smoker = column.astype(str).str.strip().str.lower().eq("yes")
	        # fillna guards nullable dtypes whose comparison can yield <NA>
	        flags = is_smoker.fillna(False).to_numpy(dtype=bool)
	        return np.where(flags, 1, 0)

	    def preprocessing(self, raw_df: pd.DataFrame) -> pd.DataFrame:
	        """
	        Apply the training-time feature preparation to new data.

	        1) Encode 'smoker' as 0/1 (defaults to 0 when the column is absent)
	        2) Select the columns [bloodpressure, bmi, smoker]

	        The input frame is not modified. Raises KeyError if 'bloodpressure'
	        or 'bmi' is missing. Calling this on its own output returns the
	        same values.
	        """
	        # Copy to avoid mutating the caller's frame
	        df_copy = raw_df.copy()
	        if 'smoker' in df_copy.columns:
	            df_copy["smoker"] = self._encode_smoker(df_copy["smoker"])
	        else:
	            # Best-effort default when the caller has no smoker information
	            df_copy["smoker"] = 0

	        # Ensure we only use the same columns, in the same order, as training
	        return df_copy[list(self.FEATURE_COLUMNS)]

	    def predict(self, preprocessed_data):
	        """
	        Predict claims for feature data already produced by `preprocessing`.

	        Returns the model's predictions passed through `postprocessing`
	        (the target is not scaled during training, so they are on the
	        original claim scale).
	        """
	        preds = self.model.predict(preprocessed_data)
	        return self.postprocessing(preds)

	    def postprocessing(self, preds):
	        """
	        Return `preds` unchanged; there is no target scaling to invert.
	        """
	        return preds

	    def getScores(self) -> str:
	        """
	        Return the test-set MSE and R^2 recorded at training time as text.
	        """
	        return f"MSE: {self.__scores[0]} \nR2: {self.__scores[1]}"

	if __name__ == "__main__":
	    # Script entry point: train on the cleaned dataset, then persist the
	    # whole wrapper object (fitted model plus preprocessing API).
	    training_csv = "cleaned_insurance_data.csv"
	    artifact_path = "KNNInsuranceModel.joblib"

	    # Constructing the wrapper loads the data, runs the grid search and
	    # prints the test-set scores.
	    trained_wrapper = KNNInsuranceModel(training_csv)

	    # NOTE(review): the instance is pickled while its class lives in
	    # __main__; pickle stores classes by module path, so the loading code
	    # presumably has to expose KNNInsuranceModel under that same name.
	    # Confirm against the loader.
	    joblib.dump(trained_wrapper, artifact_path)
	    print(f"KNNInsuranceModel exported to {artifact_path}")