import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor


class KNNInsuranceModel:
    """
    KNN-based insurance model with:

    1. Data loading & cleaning
    2. Preprocessing for 'smoker' (Yes=1, No=0)
    3. Grid search for best hyperparameters
    4. A consistent API with preprocessing, predict, and postprocessing
    """

    # Single source of truth for the feature columns, shared by training
    # (__init__) and inference (preprocessing) so they can never drift apart.
    FEATURE_COLUMNS = ["bloodpressure", "bmi", "smoker"]

    def __init__(self, csv_path):
        """Load the CSV at *csv_path*, grid-search a KNN regressor, and
        print/store held-out test MSE and R^2.

        Raises
        ------
        KeyError
            If the CSV lacks any feature column or the 'claim' target.
        """
        # --------------------------------------------------
        # 1. Load data
        # --------------------------------------------------
        insurance_df = pd.read_csv(csv_path)

        # Drop identifier columns if they exist, ignore if not.
        insurance_df = insurance_df.drop(
            columns=["index", "PatientID"], errors="ignore"
        ).dropna()

        # Convert smoker: 'Yes' -> 1, anything else (incl. 'No') -> 0.
        insurance_df["smoker"] = np.where(insurance_df["smoker"] == 'Yes', 1, 0)

        X = insurance_df[self.FEATURE_COLUMNS]
        y = insurance_df["claim"]

        # --------------------------------------------------
        # 2. Train-test split
        # --------------------------------------------------
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # --------------------------------------------------
        # 3. Grid search for best KNN
        # --------------------------------------------------
        # 'euclidean' is omitted from the metric grid: it is identical to
        # the default 'minkowski' (p=2), so including both only duplicated
        # a third of the search without changing the resulting model.
        param_grid = {
            'n_neighbors': range(1, 31),
            'weights': ['uniform', 'distance'],
            'metric': ['minkowski', 'manhattan'],
        }
        grid_search = GridSearchCV(
            KNeighborsRegressor(),
            param_grid,
            cv=5,
            n_jobs=-1,  # parallelize CV folds/candidates; no behavior change
        )
        grid_search.fit(X_train, y_train)

        # With refit=True (GridSearchCV's default), best_estimator_ has
        # already been refit on the full training split — the previous
        # explicit second fit() was redundant and has been removed.
        self.model = grid_search.best_estimator_

        # --------------------------------------------------
        # 4. Evaluate on test set
        # --------------------------------------------------
        y_pred = self.model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        self.__scores = [mse, r2]
        print(f"[KNN] Test MSE: {mse:.3f}")
        print(f"[KNN] Test R^2: {r2:.3f}")

    def preprocessing(self, raw_df):
        """
        For new data, replicate the same steps as training:

        1) Convert 'smoker' to 0/1 ('Yes' -> 1, else 0)
        2) Extract columns [bloodpressure, bmi, smoker]

        Returns a DataFrame in the same format as the training X. The
        input frame is copied, never mutated.

        NOTE(review): if 'smoker' already holds numeric 0/1 values, this
        re-encoding maps everything to 0 — callers must supply the raw
        'Yes'/'No' strings. Confirm against upstream data producers.
        """
        df_copy = raw_df.copy()

        if 'smoker' in df_copy.columns:
            df_copy["smoker"] = np.where(df_copy["smoker"] == 'Yes', 1, 0)
        else:
            # If missing, default to non-smoker (0).
            df_copy["smoker"] = 0

        # Ensure we only use the same columns as training.
        return df_copy[self.FEATURE_COLUMNS]

    def predict(self, preprocessed_data):
        """
        Takes feature data already processed by `preprocessing`,
        returns predictions (in original claim scale, since we didn't
        scale), routed through `postprocessing`.
        """
        preds = self.model.predict(preprocessed_data)
        return self.postprocessing(preds)

    def postprocessing(self, preds):
        """
        No target scaling to invert, so just return `preds` unchanged.
        """
        return preds

    def getScores(self):
        """Return the stored test-set scores as a printable string."""
        return f"MSE: {self.__scores[0]} \nR2: {self.__scores[1]}"


if __name__ == "__main__":
    # --------------------------------------------------
    # 5. Instantiate and train on 'cleaned_insurance_data.csv'
    # --------------------------------------------------
    knn_wrapper = KNNInsuranceModel("cleaned_insurance_data.csv")

    # --------------------------------------------------
    # 6. Export the entire model class for later use
    # --------------------------------------------------
    joblib.dump(knn_wrapper, "KNNInsuranceModel.joblib")
    print("KNNInsuranceModel exported to KNNInsuranceModel.joblib")