Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import joblib | |
| from sklearn.model_selection import train_test_split, GridSearchCV | |
| from sklearn.neighbors import KNeighborsRegressor | |
| from sklearn.metrics import mean_squared_error, r2_score | |
class KNNInsuranceModel:
    """
    K-nearest-neighbours regressor that predicts the insurance `claim` amount.

    Constructing an instance reads a CSV, trains the model and prints the
    hold-out metrics:
    1. Data loading & cleaning (drops `index`/`PatientID`, drops rows with NaN)
    2. Encoding of 'smoker' (yes=1, anything else=0)
    3. Grid search (5-fold CV) for the best KNN hyperparameters
    4. A consistent API: preprocessing -> predict -> postprocessing
    """

    # Feature columns, in training order. Shared by training and inference so
    # the two code paths cannot drift apart.
    FEATURE_COLUMNS = ["bloodpressure", "bmi", "smoker"]
    TARGET_COLUMN = "claim"

    def __init__(self, csv_path):
        """
        Load the CSV at `csv_path`, tune and fit the KNN model, then print and
        store the test-set MSE and R^2.

        The CSV must provide the columns `bloodpressure`, `bmi`, `smoker`
        and `claim`; `index` and `PatientID` are dropped when present.
        """
        # --------------------------------------------------
        # 1. Load data
        # --------------------------------------------------
        insurance_df = pd.read_csv(csv_path)
        # Drop identifier columns if they exist, then any incomplete rows.
        insurance_df = insurance_df.drop(
            columns=["index", "PatientID"], errors="ignore"
        ).dropna()
        insurance_df["smoker"] = self._encode_smoker(insurance_df["smoker"])

        X = insurance_df[self.FEATURE_COLUMNS]
        y = insurance_df[self.TARGET_COLUMN]

        # --------------------------------------------------
        # 2. Train-test split (fixed seed for reproducible metrics)
        # --------------------------------------------------
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # --------------------------------------------------
        # 3. Grid search for best KNN
        # --------------------------------------------------
        # NOTE: with the default p=2, 'minkowski' is equivalent to
        # 'euclidean', so those two entries search the same models.
        param_grid = {
            'n_neighbors': range(1, 31),
            'weights': ['uniform', 'distance'],
            'metric': ['minkowski', 'euclidean', 'manhattan'],
        }
        grid_search = GridSearchCV(
            KNeighborsRegressor(),
            param_grid,
            cv=5,
        )
        grid_search.fit(X_train, y_train)

        # GridSearchCV defaults to refit=True, so the best estimator has
        # already been refitted on the whole training split; fitting it again
        # would only repeat that work.
        self.model = grid_search.best_estimator_

        # --------------------------------------------------
        # 4. Evaluate on test set
        # --------------------------------------------------
        y_pred = self.model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        # Attribute name kept as-is (name-mangled) so instances that were
        # pickled earlier still work with getScores().
        self.__scores = [mse, r2]
        print(f"[KNN] Test MSE: {mse:.3f}")
        print(f"[KNN] Test R^2: {r2:.3f}")

    @staticmethod
    def _encode_smoker(smoker):
        """
        Encode a 'smoker' column as a 0/1 integer array.

        Text values are compared case-insensitively and with surrounding
        whitespace ignored: 'yes' -> 1, anything else (including NaN) -> 0.
        Numeric or boolean columns are treated as already encoded
        (1/True -> 1, everything else -> 0), so encoding the same data twice
        does not silently zero the flag.
        """
        if pd.api.types.is_numeric_dtype(smoker):
            return np.where(smoker == 1, 1, 0)
        normalized = smoker.astype(str).str.strip().str.lower()
        return np.where(normalized == "yes", 1, 0)

    def preprocessing(self, raw_df):
        """
        Prepare new data exactly like the training data:
        1) Convert 'smoker' to 0/1 (a missing 'smoker' column defaults to 0)
        2) Keep only the columns [bloodpressure, bmi, smoker], in that order

        `raw_df` itself is left unmodified. Returns a DataFrame in the same
        format as the training X.
        """
        # Copy to avoid mutating the caller's DataFrame.
        df_copy = raw_df.copy()
        if 'smoker' in df_copy.columns:
            df_copy["smoker"] = self._encode_smoker(df_copy["smoker"])
        else:
            # Best-effort default when the caller has no smoker information.
            df_copy["smoker"] = 0
        # Ensure we only use the same columns as training.
        return df_copy[self.FEATURE_COLUMNS]

    def predict(self, preprocessed_data):
        """
        Takes feature data already processed by `preprocessing` and returns
        the model's predictions after `postprocessing` (original claim scale,
        since the target is never scaled).
        """
        preds = self.model.predict(preprocessed_data)
        return self.postprocessing(preds)

    def postprocessing(self, preds):
        """
        No target scaling to invert, so `preds` is returned unchanged.
        """
        return preds

    def getScores(self):
        """Return the stored test-set metrics as a two-line 'MSE/R2' string."""
        return f"MSE: {self.__scores[0]} \nR2: {self.__scores[1]}"
if __name__ == "__main__":
    # Script entry point: train on 'cleaned_insurance_data.csv'. The
    # constructor loads the data, runs the grid search and prints the
    # test-set metrics.
    trained_model = KNNInsuranceModel("cleaned_insurance_data.csv")

    # Persist the whole wrapper object (fitted model + stored scores) so it
    # can be reloaded later without retraining.
    # NOTE(review): pickles store the class by import path; run as a script,
    # that path is `__main__`, so a loader presumably needs
    # `KNNInsuranceModel` resolvable under the same name -- confirm against
    # the consumer of this file.
    joblib.dump(trained_model, "KNNInsuranceModel.joblib")
    print("KNNInsuranceModel exported to KNNInsuranceModel.joblib")