import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
class KNNInsuranceModel:
    """
    KNN-based regressor for insurance claim amounts.

    Responsibilities:
        1. Load & clean the CSV data (drop index-like columns, drop NaN rows)
        2. Encode 'smoker' as a binary feature (Yes=1, anything else=0)
        3. Grid-search KNN hyperparameters with 5-fold cross-validation
        4. Expose a consistent preprocessing / predict / postprocessing API
    """

    # Feature columns used for both training and inference — kept in one
    # place so preprocessing() cannot drift from the training schema.
    _FEATURES = ["bloodpressure", "bmi", "smoker"]

    def __init__(self, csv_path):
        """
        Load the dataset at `csv_path`, tune a KNeighborsRegressor via grid
        search, and evaluate it on a held-out 20% test split.

        Parameters
        ----------
        csv_path : str
            Path to a CSV with at least the columns
            'bloodpressure', 'bmi', 'smoker' ('Yes'/'No') and 'claim'.
        """
        # --------------------------------------------------
        # 1. Load & clean data
        # --------------------------------------------------
        insurance_df = pd.read_csv(csv_path)
        # Drop bookkeeping columns if present; errors="ignore" makes the
        # drop a no-op when they are absent.
        insurance_df = insurance_df.drop(
            columns=["index", "PatientID"], errors="ignore"
        ).dropna()
        # Binary-encode smoker: 'Yes' -> 1, anything else -> 0
        insurance_df["smoker"] = np.where(insurance_df["smoker"] == 'Yes', 1, 0)

        X = insurance_df[self._FEATURES]
        y = insurance_df["claim"]

        # --------------------------------------------------
        # 2. Train-test split (fixed seed for reproducibility)
        # --------------------------------------------------
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # --------------------------------------------------
        # 3. Grid search for the best KNN
        # --------------------------------------------------
        # NOTE: 'minkowski' with the default p=2 is identical to
        # 'euclidean'; the duplicate is kept so the search space (and
        # therefore tie-breaking / selected model) matches the original.
        param_grid = {
            'n_neighbors': range(1, 31),
            'weights': ['uniform', 'distance'],
            'metric': ['minkowski', 'euclidean', 'manhattan']
        }
        grid_search = GridSearchCV(
            KNeighborsRegressor(),
            param_grid,
            cv=5
        )
        grid_search.fit(X_train, y_train)
        # FIX: removed the redundant self.model.fit(X_train, y_train) —
        # GridSearchCV(refit=True, the default) already refits
        # best_estimator_ on the full training set.
        self.model = grid_search.best_estimator_

        # --------------------------------------------------
        # 4. Evaluate on the held-out test set
        # --------------------------------------------------
        y_pred = self.model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        # Stored name-mangled; exposed only through getScores().
        self.__scores = [mse, r2]
        print(f"[KNN] Test MSE: {mse:.3f}")
        print(f"[KNN] Test R^2: {r2:.3f}")

    def preprocessing(self, raw_df):
        """
        Transform new raw data into the training feature format.

        1) Binary-encode 'smoker' ('Yes' -> 1, else 0); if the column is
           missing it defaults to 0 (non-smoker).
        2) Select the training feature columns in training order.

        Parameters
        ----------
        raw_df : pandas.DataFrame
            Must contain 'bloodpressure' and 'bmi'; 'smoker' is optional.

        Returns
        -------
        pandas.DataFrame
            Columns [bloodpressure, bmi, smoker], ready for predict().
        """
        # Copy to avoid mutating the caller's DataFrame.
        df_copy = raw_df.copy()
        if 'smoker' in df_copy.columns:
            df_copy["smoker"] = np.where(df_copy["smoker"] == 'Yes', 1, 0)
        else:
            # Missing column: assume non-smoker rather than failing.
            df_copy["smoker"] = 0
        # Restrict to (and reorder into) the exact training schema.
        return df_copy[self._FEATURES]

    def predict(self, preprocessed_data):
        """
        Predict claim amounts for data already run through preprocessing().

        Returns predictions in the original claim scale (no target scaling
        was applied during training).
        """
        preds = self.model.predict(preprocessed_data)
        return self.postprocessing(preds)

    def postprocessing(self, preds):
        """
        Identity transform: there is no target scaling to invert, so the
        raw model output is returned unchanged. Kept for API symmetry.
        """
        return preds

    def getScores(self):
        """Return the held-out test MSE and R^2 as a formatted string."""
        return f"MSE: {self.__scores[0]} \nR2: {self.__scores[1]}"
if __name__ == "__main__":
    # Train the wrapper on the cleaned dataset (all fitting happens inside
    # the constructor), then persist the whole object for later reuse.
    export_path = "KNNInsuranceModel.joblib"
    trained_model = KNNInsuranceModel("cleaned_insurance_data.csv")
    joblib.dump(trained_model, export_path)
    print("KNNInsuranceModel exported to KNNInsuranceModel.joblib")