Spaces:
Sleeping
Sleeping
Mateusz Paszynski committed on
Commit ·
5de1466
1
Parent(s): 75b972f
publish website
Browse files- KNNInsuranceModel.joblib +3 -0
- NuSVRInsuranceModel.joblib +3 -0
- RandomForestInsuranceModel.joblib +3 -0
- XGBoostInsuranceModel.joblib +3 -0
- app.py +133 -0
- models/XGBoost.py +142 -0
- models/knn.py +115 -0
- models/randomforest.py +154 -0
- models/svr.py +161 -0
KNNInsuranceModel.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e42bf88f3ea8c987e04d270daeae1944e61ecbb51d655b74c15f667d6b1b65f7
|
| 3 |
+
size 75472
|
NuSVRInsuranceModel.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a21361ad305bcc706235a87f72f5f147f6dc2ceddbd51c9d36e12bb9e1fe65b2
|
| 3 |
+
size 90887
|
RandomForestInsuranceModel.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7091796dc3eb05b4fe48607cfb0f9419ffa847343e8baf3534965e7378d2c96d
|
| 3 |
+
size 225027
|
XGBoostInsuranceModel.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4a06f22700c735de1f450ca10404a71fca66509570b3c1aa10af8576b4feb61d
|
| 3 |
+
size 184349
|
app.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
import joblib
import numpy as np
import pandas as pd
# The model-class imports below look unused, but they are required:
# joblib.load() unpickles instances of these classes and needs the
# class definitions importable under the same module paths.
from models.svr import NuSVRInsuranceModel
from models.randomforest import RandomForestInsuranceModel
from models.XGBoost import XGBoostInsuranceModel
from models.knn import KNNInsuranceModel
import os

# -------------------------------------------------------
# Load the pre-trained model wrappers at import time.
# NOTE(review): this commit adds the .joblib files at the repository
# root, yet the path below points into a 'Website' subdirectory —
# verify the deployment layout, or these loads will fail at startup.
# -------------------------------------------------------
_MODEL_DIR = os.path.join('.', 'Website')

model_nusvr = joblib.load(os.path.join(_MODEL_DIR, 'NuSVRInsuranceModel.joblib'))
model_xgb = joblib.load(os.path.join(_MODEL_DIR, 'XGBoostInsuranceModel.joblib'))
model_knn = joblib.load(os.path.join(_MODEL_DIR, 'KNNInsuranceModel.joblib'))
model_rf = joblib.load(os.path.join(_MODEL_DIR, 'RandomForestInsuranceModel.joblib'))

# Maps the UI dropdown label to the loaded model wrapper.
models_dict = {
    "NuSVR": model_nusvr,
    "XGBoost": model_xgb,
    "KNN": model_knn,
    "Random Forest": model_rf
}

# -------------------------------------------------------
# Label encodings used to validate the categorical inputs:
# - gender: {"male":0, "female":1}
# - diabetic / smoker: {"No":0, "Yes":1}
# - region: {"southwest":0, "southeast":1, "northwest":2, "northeast":3}
# These must mirror the encodings used when the models were trained.
# -------------------------------------------------------
region_mapping = {"southwest": 0, "southeast": 1, "northwest": 2, "northeast": 3}
gender_mapping = {"male": 0, "female": 1}
yes_no_mapping = {"No": 0, "Yes": 1}
|
| 38 |
+
|
| 39 |
+
def predict_insurance_claim(
    model_choice,
    age,
    gender,
    bmi,
    blood_pressure,
    diabetic,
    children,
    smoker,
    region
):
    """Predict an insurance claim amount with the selected model.

    Parameters mirror the Gradio inputs: `model_choice` is one of the
    keys of `models_dict`; `age`, `bmi`, `blood_pressure`, `children`
    are numeric; `gender`, `diabetic`, `smoker`, `region` are the raw
    string choices from the UI.

    Returns the predicted claim as a float, or an error string when a
    categorical value is not recognized.
    """
    # Validate the categorical inputs against the training-time
    # encodings. The original code computed the encoded values and
    # discarded them; only the membership check matters here because
    # each model wrapper does its own preprocessing on the raw strings.
    if (gender.lower() not in gender_mapping
            or diabetic not in yes_no_mapping
            or smoker not in yes_no_mapping
            or region.lower() not in region_mapping):
        return "Invalid input for categorical field."

    # Single-row DataFrame with the raw column names each model's
    # preprocessing pipeline expects.
    user_df = pd.DataFrame({
        'age': [age],
        'gender': [gender],
        'bmi': [bmi],
        'bloodpressure': [blood_pressure],
        'diabetic': [diabetic],
        'children': [children],
        'smoker': [smoker],
        'region': [region]
    })

    chosen_model = models_dict[model_choice]

    # Each wrapper exposes the same preprocessing -> predict API.
    features = chosen_model.preprocessing(user_df)
    y_pred = chosen_model.predict(features)

    return float(y_pred[0])
|
| 80 |
+
|
| 81 |
+
# -------------------------------------------------------
|
| 82 |
+
# Gradio Interface
|
| 83 |
+
# -------------------------------------------------------
|
| 84 |
+
def build_interface():
    """Assemble and return the Gradio Interface for claim prediction."""
    # Row 1: which trained model to use.
    with gr.Row():
        model_dropdown = gr.Dropdown(
            choices=["NuSVR", "XGBoost", "KNN", "Random Forest"],
            value="NuSVR",
            label="Select Model",
        )

    # Row 2: patient features — numeric fields plus categorical dropdowns.
    with gr.Row():
        age_input = gr.Number(value=39.0, label="Age")
        bmi_input = gr.Number(value=23.2, label="BMI")
        bp_input = gr.Number(value=91.0, label="Blood Pressure")
        children_input = gr.Number(value=0, label="Children")

        gender_input = gr.Dropdown(choices=["male", "female"], value="male", label="Gender")
        diabetic_input = gr.Dropdown(choices=["No", "Yes"], value="Yes", label="Diabetic")
        smoker_input = gr.Dropdown(choices=["No", "Yes"], value="No", label="Smoker")
        region_input = gr.Dropdown(
            choices=["southwest", "southeast", "northwest", "northeast"],
            value="southeast",
            label="Region",
        )

    # Prediction result shown as text.
    output_label = gr.Textbox(label="Predicted Claim")

    # Input order here must match predict_insurance_claim's signature.
    app_iface = gr.Interface(
        fn=predict_insurance_claim,
        inputs=[
            model_dropdown,
            age_input,
            gender_input,
            bmi_input,
            bp_input,
            diabetic_input,
            children_input,
            smoker_input,
            region_input,
        ],
        outputs=output_label,
        title="Insurance Claim Prediction",
    )

    return app_iface
|
| 130 |
+
|
| 131 |
+
if __name__ == "__main__":
    # Build the UI and serve it on port 7861 with a public share link.
    demo = build_interface()
    demo.launch(server_name="localhost", server_port=7861, share=True)
|
models/XGBoost.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import joblib
|
| 4 |
+
|
| 5 |
+
from sklearn.model_selection import train_test_split
|
| 6 |
+
from sklearn.ensemble import GradientBoostingRegressor
|
| 7 |
+
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
|
| 8 |
+
from sklearn.preprocessing import PolynomialFeatures
|
| 9 |
+
from sklearn.impute import SimpleImputer
|
| 10 |
+
|
| 11 |
+
class XGBoostInsuranceModel:
    """
    A Gradient Boosting-based insurance model class with:
    1. Data loading & dropping unnecessary columns
    2. Dummy-encoding for categorical variables
    3. SimpleImputer for missing data
    4. PolynomialFeatures for interactions
    5. Train/test splits and final evaluation
    6. A consistent API with `preprocessing`, `predict`, `postprocessing`

    NOTE(review): despite the class name, this wraps scikit-learn's
    GradientBoostingRegressor, not the xgboost library.
    """

    def __init__(self, csv_path):
        """Load the CSV at `csv_path`, fit the transformers and the
        regressor, and print/record test-set metrics."""
        # -----------------------------------------------------
        # 1. Load & prepare the data
        # -----------------------------------------------------
        df = pd.read_csv(csv_path)
        # Drop these columns if present (ignore errors if they're missing)
        df = df.drop(columns=['index', 'PatientID'], errors='ignore')

        # Separate features & target ('claim' is the regression target)
        X = df.drop(columns=['claim'])
        y = df['claim'].values

        # Dummy-encode once over the full feature frame so that
        # `preprocessing` can later align new data to the exact same
        # dummy-column set.
        X_dummies = pd.get_dummies(X, drop_first=True)
        self.all_dummy_cols = X_dummies.columns.tolist()

        # Mean-imputation for any missing numeric values
        self.imputer = SimpleImputer(strategy='mean')
        # Pairwise interaction terms only (no squares, no bias column)
        self.poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

        # -----------------------------------------------------
        # 2. Split, then fit transformations on the training set only
        # -----------------------------------------------------
        X_train_dummies, X_test_dummies, y_train, y_test = train_test_split(
            X_dummies, y, test_size=0.2, random_state=42
        )
        # Second split carves a validation set out of the training data.
        # NOTE(review): X_val_dummies / y_val are never used below.
        X_train_dummies, X_val_dummies, y_train, y_val = train_test_split(
            X_train_dummies, y_train, test_size=0.25, random_state=42
        )
        # Fit the imputer on the training set
        X_train_imputed = self.imputer.fit_transform(X_train_dummies)
        # Fit polynomial on the training set
        X_train_poly = self.poly.fit_transform(X_train_imputed)

        # -----------------------------------------------------
        # 3. Initialize and train the GradientBoostingRegressor
        # -----------------------------------------------------
        self.model = GradientBoostingRegressor(
            n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
        )
        self.model.fit(X_train_poly, y_train)

        # -----------------------------------------------------
        # 4. Evaluate on the held-out test set (transform only, no refit)
        # -----------------------------------------------------
        X_test_imputed = self.imputer.transform(X_test_dummies)
        X_test_poly = self.poly.transform(X_test_imputed)
        y_test_pred = self.model.predict(X_test_poly)

        test_mse = mean_squared_error(y_test, y_test_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        # Stored in order [MAE, MSE, R^2] — getScores() relies on this.
        self.__scores = [test_mae, test_mse, test_r2]

        print(f"[XGBoostInsuranceModel] MAE: {test_mae:.3f} | MSE: {test_mse:.3f} | R^2: {test_r2:.3f}")

    def preprocessing(self, raw_df):
        """
        Takes a new DataFrame with the same columns as the original CSV minus 'claim'.
        Then:
        1. Turns categorical features into dummy columns
        2. Ensures the dummy columns match those used during training
        3. Imputes missing values
        4. Applies PolynomialFeatures transform
        Returns transformed data (numpy array).
        """

        # 1. Convert raw_df to dummies, potentially missing or extra columns
        temp_dummies = pd.get_dummies(raw_df, drop_first=True)

        # Ensure it has exactly the same dummy columns as the training data;
        # missing columns are filled with 0.
        for col in self.all_dummy_cols:
            if col not in temp_dummies.columns:
                temp_dummies[col] = 0
        # If there are extra columns not in self.all_dummy_cols, drop them
        # (reindexing to the training column order at the same time).
        temp_dummies = temp_dummies[self.all_dummy_cols]

        # 2. Imputation
        temp_imputed = self.imputer.transform(temp_dummies)

        # 3. Polynomial Features
        temp_poly = self.poly.transform(temp_imputed)

        return temp_poly

    def predict(self, preprocessed_data):
        """
        Receives data already output by `preprocessing`.
        Returns predictions in the original scale (no inverse transform needed).
        """
        preds = self.model.predict(preprocessed_data)
        return self.postprocessing(preds)

    def postprocessing(self, preds):
        """
        Currently a pass-through, because we haven't scaled 'claim'.
        In a different scenario, you might invert-scale predictions here.
        """
        return preds

    def getScores(self):
        """
        Returns a string with the test metrics.
        (MAE, MSE, R^2) from the last training process.
        """
        return f"MAE: {self.__scores[0]} | MSE: {self.__scores[1]} | R^2: {self.__scores[2]}"
|
| 135 |
+
|
| 136 |
+
if __name__ == "__main__":
    # Train on the cleaned dataset, then persist the whole wrapper
    # instance (model + fitted transformers) with joblib.
    trained = XGBoostInsuranceModel("cleaned_insurance_data.csv")
    joblib.dump(trained, "XGBoostInsuranceModel.joblib")
    print("Exported XGBoostInsuranceModel to XGBoostInsuranceModel.joblib")
|
models/knn.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import joblib
|
| 4 |
+
|
| 5 |
+
from sklearn.model_selection import train_test_split, GridSearchCV
|
| 6 |
+
from sklearn.neighbors import KNeighborsRegressor
|
| 7 |
+
from sklearn.metrics import mean_squared_error, r2_score
|
| 8 |
+
|
| 9 |
+
class KNNInsuranceModel:
    """
    KNN-based insurance model with:
    1. Data loading & cleaning
    2. Preprocessing for 'smoker' (Yes=1, No=0)
    3. Grid search for best hyperparameters
    4. A consistent API with preprocessing, predict, and postprocessing
    """
    def __init__(self, csv_path):
        """Load `csv_path`, grid-search a KNeighborsRegressor on
        [bloodpressure, bmi, smoker], and record test-set MSE/R^2."""
        # --------------------------------------------------
        # 1. Load data
        # --------------------------------------------------
        insurance_df = pd.read_csv(csv_path)
        # Drop columns if they exist, ignore if not; drop rows with NaNs
        insurance_df = insurance_df.drop(columns=["index", "PatientID"], errors="ignore").dropna()

        # Convert smoker: 'Yes' -> 1, 'No' (or anything else) -> 0
        insurance_df["smoker"] = np.where(insurance_df["smoker"] == 'Yes', 1, 0)

        # Only these three features are used by this model
        X = insurance_df[["bloodpressure", "bmi", "smoker"]]
        y = insurance_df["claim"]

        # --------------------------------------------------
        # 2. Train-test split
        # --------------------------------------------------
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # --------------------------------------------------
        # 3. Grid search for best KNN (5-fold CV over 180 combos)
        # --------------------------------------------------
        param_grid = {
            'n_neighbors': range(1, 31),
            'weights': ['uniform', 'distance'],
            'metric': ['minkowski', 'euclidean', 'manhattan']
        }
        grid_search = GridSearchCV(
            KNeighborsRegressor(),
            param_grid,
            cv=5
        )
        grid_search.fit(X_train, y_train)

        # The best estimator from the grid
        self.model = grid_search.best_estimator_
        # NOTE(review): GridSearchCV refits the best estimator on the
        # full training data by default, so this second fit is redundant
        # (though harmless — same data, same result).
        self.model.fit(X_train, y_train)

        # --------------------------------------------------
        # 4. Evaluate on test set
        # --------------------------------------------------
        y_pred = self.model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        # Stored in order [MSE, R^2] — getScores() relies on this.
        self.__scores = [mse, r2]

        print(f"[KNN] Test MSE: {mse:.3f}")
        print(f"[KNN] Test R^2: {r2:.3f}")

    def preprocessing(self, raw_df):
        """
        For new data, replicate the same steps:
        1) Convert 'smoker' to 0/1
        2) Extract columns [bloodpressure, bmi, smoker]
        Returns a DataFrame or numpy array in the same format as training X.
        """
        # Copy to avoid mutating original df
        df_copy = raw_df.copy()
        # Convert 'smoker'
        if 'smoker' in df_copy.columns:
            df_copy["smoker"] = np.where(df_copy["smoker"] == 'Yes', 1, 0)
        else:
            # If missing, default to 0 or handle as needed
            df_copy["smoker"] = 0

        # Ensure we only use the same columns as training
        return df_copy[["bloodpressure", "bmi", "smoker"]]

    def predict(self, preprocessed_data):
        """
        Takes feature data already processed by `preprocessing`,
        returns predictions (in original claim scale, since we didn't scale).
        """
        preds = self.model.predict(preprocessed_data)
        return self.postprocessing(preds)

    def postprocessing(self, preds):
        """
        No target scaling to invert, so just return `preds`.
        """
        return preds

    def getScores(self):
        # Human-readable summary of the stored [MSE, R^2] test metrics.
        return f"MSE: {self.__scores[0]} \nR2: {self.__scores[1]}"
|
| 104 |
+
|
| 105 |
+
if __name__ == "__main__":
    # --------------------------------------------------
    # Train on the cleaned dataset and persist the whole wrapper
    # (model + preprocessing logic) for the website to load.
    # --------------------------------------------------
    knn_model = KNNInsuranceModel("cleaned_insurance_data.csv")
    joblib.dump(knn_model, "KNNInsuranceModel.joblib")
    print("KNNInsuranceModel exported to KNNInsuranceModel.joblib")
|
models/randomforest.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import joblib
|
| 4 |
+
|
| 5 |
+
from sklearn.experimental import enable_iterative_imputer
|
| 6 |
+
from sklearn.impute import IterativeImputer
|
| 7 |
+
from sklearn.compose import ColumnTransformer
|
| 8 |
+
from sklearn.pipeline import Pipeline
|
| 9 |
+
from sklearn.preprocessing import OneHotEncoder, StandardScaler
|
| 10 |
+
from sklearn.model_selection import train_test_split
|
| 11 |
+
from sklearn.ensemble import RandomForestRegressor
|
| 12 |
+
from sklearn.metrics import mean_absolute_error, r2_score
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class RandomForestInsuranceModel:
    """
    A Random Forest regressor class with:
    1. Data loading & cleaning (iterative imputation, outlier clipping)
    2. A fixed set of hyperparameters (n_estimators=100, max_depth=4, min_samples_split=15)
    3. A ColumnTransformer for numeric & categorical data
    4. Consistent API: preprocessing, predict, postprocessing
    """

    def __init__(self, csv_path):
        """
        Loads the CSV, cleans data, sets up the column transformer,
        trains a RandomForestRegressor with fixed hyperparameters,
        and evaluates on a test set.
        """
        # -----------------------------------------------------
        # 1. Load and clean data
        # -----------------------------------------------------
        df = pd.read_csv(csv_path)
        # Drop irrelevant columns if present, remove any leftover NaNs
        df = df.drop(columns=["index", "PatientID"], errors="ignore").dropna()

        # Apply iterative imputation for the specified columns (in place)
        self._impute(df, columns=['age', 'bmi', 'bloodpressure'])

        # Remove outlier rows: keep 'claim' within the 1st-98th percentile
        lower_percentile = df['claim'].quantile(0.01)
        upper_percentile = df['claim'].quantile(0.98)
        df = df[
            (df['claim'] >= lower_percentile) & (df['claim'] <= upper_percentile)
        ]

        # -----------------------------------------------------
        # 2. Separate features & target
        # -----------------------------------------------------
        features = df.drop(columns=['claim'])
        target = df['claim'].values  # or df['claim'].to_numpy()

        # -----------------------------------------------------
        # 3. Create ColumnTransformer
        # -----------------------------------------------------
        # One-hot for categoricals; unknown categories at predict time
        # are ignored rather than raising.
        text_pipeline = Pipeline([
            ('one-hot', OneHotEncoder(handle_unknown='ignore'))
        ])

        nums_pipeline = Pipeline([
            ('normalize', StandardScaler(with_mean=False))
        ])

        self.ct = ColumnTransformer([
            ('categorical', text_pipeline, ['diabetic', 'gender', 'region', 'smoker']),
            ('numerical', nums_pipeline, ['children', 'age', 'bmi', 'bloodpressure'])
        ])

        # Fit the ColumnTransformer on the entire dataset.
        # NOTE(review): fitting the scaler/encoder before the split leaks
        # test-set statistics into the transform; fitting on the training
        # split only would be the stricter protocol.
        X_full_transformed = self.ct.fit_transform(features)

        # -----------------------------------------------------
        # 4. Train/test split
        # -----------------------------------------------------
        X_train, X_test, y_train, y_test = train_test_split(
            X_full_transformed,
            target,
            test_size=0.2,
            random_state=42
        )

        # -----------------------------------------------------
        # 5. RandomForest with fixed hyperparameters
        # -----------------------------------------------------
        self.model = RandomForestRegressor(
            n_estimators=100,
            max_depth=4,
            min_samples_split=15,
            random_state=42
        )
        self.model.fit(X_train, y_train)

        # -----------------------------------------------------
        # 6. Evaluate
        # -----------------------------------------------------
        mae, r2 = self._evaluate(X_test, y_test)
        print(f"[RANDOM FOREST] Test MAE: {mae:.3f}")
        print(f"[RANDOM FOREST] Test R^2: {r2:.3f}")

    # -------------------------------------------
    # Private: iterative imputation
    # -------------------------------------------
    def _impute(self, df, columns):
        # Mutates `df[columns]` in place with IterativeImputer estimates.
        imp = IterativeImputer(max_iter=5, verbose=2)
        arr = imp.fit_transform(df[columns])
        df[columns] = arr

    # -------------------------------------------
    # Private: evaluation
    # -------------------------------------------
    def _evaluate(self, X_test, y_test):
        # Returns (MAE, R^2) of self.model on the given test data.
        y_pred = self.model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        return mae, r2

    # -------------------------------------------
    # Public: preprocessing
    # -------------------------------------------
    def preprocessing(self, raw_df):
        """
        Takes a new DataFrame with the columns the pipeline expects,
        and returns the transformed matrix.
        """
        return self.ct.transform(raw_df)

    # -------------------------------------------
    # Public: predict
    # -------------------------------------------
    def predict(self, preprocessed_data):
        """
        Takes feature data already processed by `preprocessing`,
        returns predictions in the original claim scale.
        """
        preds = self.model.predict(preprocessed_data)
        return self.postprocessing(preds)

    # -------------------------------------------
    # Public: postprocessing
    # -------------------------------------------
    def postprocessing(self, preds):
        """
        Currently a pass-through, as there's no target scaling to invert.
        """
        return preds
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
if __name__ == "__main__":
    # Train on the cleaned dataset, then serialize the entire wrapper
    # (including the fitted ColumnTransformer) for the web app.
    forest = RandomForestInsuranceModel("cleaned_insurance_data.csv")
    joblib.dump(forest, "RandomForestInsuranceModel.joblib")
    print("Exported RandomForestInsuranceModel to RandomForestInsuranceModel.joblib")
|
models/svr.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
from sklearn.svm import NuSVR
|
| 5 |
+
from sklearn.compose import ColumnTransformer
|
| 6 |
+
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, MinMaxScaler
|
| 7 |
+
from sklearn.pipeline import Pipeline
|
| 8 |
+
from sklearn.base import BaseEstimator, TransformerMixin
|
| 9 |
+
from sklearn.model_selection import train_test_split
|
| 10 |
+
from sklearn.metrics import mean_absolute_error, r2_score
|
| 11 |
+
|
| 12 |
+
import joblib
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class NuSVRInsuranceModel:
|
| 16 |
+
"""
|
| 17 |
+
This class encapsulates:
|
| 18 |
+
1. Preprocessing: column transformations, scaling
|
| 19 |
+
2. Prediction: using NuSVR
|
| 20 |
+
3. Postprocessing: inverse-transform predictions to original scale
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
# --- Custom Transformer defined INSIDE the class ---
|
| 24 |
+
class MultiplyScaler(BaseEstimator, TransformerMixin):
|
| 25 |
+
def __init__(self, factor=2):
|
| 26 |
+
self.factor = factor
|
| 27 |
+
|
| 28 |
+
def fit(self, X, y=None):
|
| 29 |
+
return self
|
| 30 |
+
|
| 31 |
+
def transform(self, X):
|
| 32 |
+
return X * self.factor
|
| 33 |
+
|
| 34 |
+
def __init__(self):
|
| 35 |
+
"""
|
| 36 |
+
In the constructor, define the column pipelines, the main ColumnTransformer,
|
| 37 |
+
the target scaler, and the model.
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
# Example pipelines (adjust as needed)
|
| 41 |
+
text_pipeline = Pipeline([
|
| 42 |
+
('one-hot', OneHotEncoder())
|
| 43 |
+
])
|
| 44 |
+
|
| 45 |
+
nums_pipeline = Pipeline([
|
| 46 |
+
('normalize', StandardScaler(with_mean=True)),
|
| 47 |
+
])
|
| 48 |
+
|
| 49 |
+
nums_pipeline_strong = Pipeline([
|
| 50 |
+
('normalize', StandardScaler(with_mean=True)),
|
| 51 |
+
# Note we reference the nested class here
|
| 52 |
+
('scalarMultiply', NuSVRInsuranceModel.MultiplyScaler(factor=2))
|
| 53 |
+
])
|
| 54 |
+
|
| 55 |
+
smoke_pipeline = Pipeline([
|
| 56 |
+
('one-hot', OneHotEncoder()),
|
| 57 |
+
('normalize', StandardScaler(with_mean=False)),
|
| 58 |
+
('scalar-multiply', NuSVRInsuranceModel.MultiplyScaler(factor=5))
|
| 59 |
+
])
|
| 60 |
+
|
| 61 |
+
region_pipeline = Pipeline([
|
| 62 |
+
('categories', OrdinalEncoder())
|
| 63 |
+
])
|
| 64 |
+
|
| 65 |
+
# Create ColumnTransformer
|
| 66 |
+
# Adjust columns to match your dataset's actual column names
|
| 67 |
+
self.ct = ColumnTransformer([
|
| 68 |
+
('str_handler', text_pipeline, ['diabetic', 'gender']),
|
| 69 |
+
('smoke_handle', smoke_pipeline, ['smoker']),
|
| 70 |
+
('floats_ints_weak', nums_pipeline, ['children', 'age']),
|
| 71 |
+
('floats_ints_strong', nums_pipeline_strong, ['bmi', 'bloodpressure']),
|
| 72 |
+
])
|
| 73 |
+
|
| 74 |
+
# Target scaler (for the 'claim' column)
|
| 75 |
+
self.target_scaler = MinMaxScaler(feature_range=(-0.5, 0.5))
|
| 76 |
+
|
| 77 |
+
# NuSVR model with desired hyperparameters
|
| 78 |
+
self.model = NuSVR(C=10, gamma='scale', kernel='rbf', nu=0.80)
|
| 79 |
+
|
| 80 |
+
def preprocessing(self, df):
    """Apply the already-fitted ColumnTransformer to a raw dataframe.

    ``df`` must contain the same feature columns used at training time.
    Returns the transformed feature matrix.
    """
    transformed = self.ct.transform(df)
    return transformed
|
| 87 |
+
|
| 88 |
+
def predict(self, preprocessed_data):
    """Run the fitted model on already-preprocessed features and return
    predictions mapped back to the original target scale via
    ``postprocessing``.
    """
    scaled_predictions = self.model.predict(preprocessed_data)
    return self.postprocessing(scaled_predictions)
|
| 95 |
+
|
| 96 |
+
def postprocessing(self, y_pred_scaled):
    """Invert the target scaling, mapping scaled predictions back to the
    original target domain.

    ``y_pred_scaled`` is a 1-D array in the target_scaler domain; the
    result is returned as a flat 1-D array.
    """
    column = y_pred_scaled.reshape(-1, 1)
    restored = self.target_scaler.inverse_transform(column)
    return restored.ravel()
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
if __name__ == "__main__":
|
| 108 |
+
# -------------------------------------------------
|
| 109 |
+
# 1. Load data
|
| 110 |
+
# -------------------------------------------------
|
| 111 |
+
df = pd.read_csv('cleaned_insurance_data.csv')
|
| 112 |
+
|
| 113 |
+
# Separate features and target
|
| 114 |
+
features = df.drop(columns=['claim', 'PatientID', 'index'])
|
| 115 |
+
target = df['claim']
|
| 116 |
+
|
| 117 |
+
# -------------------------------------------------
|
| 118 |
+
# 2. Instantiate our NuSVRInsuranceModel
|
| 119 |
+
# -------------------------------------------------
|
| 120 |
+
nusvr_wrapper = NuSVRInsuranceModel()
|
| 121 |
+
|
| 122 |
+
# -------------------------------------------------
|
| 123 |
+
# 3. Train-test split
|
| 124 |
+
# -------------------------------------------------
|
| 125 |
+
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
|
| 126 |
+
features, target, test_size=0.25, random_state=42
|
| 127 |
+
)
|
| 128 |
+
|
| 129 |
+
# -------------------------------------------------
|
| 130 |
+
# 4. Fit ColumnTransformer & target scaler on TRAIN data
|
| 131 |
+
# -------------------------------------------------
|
| 132 |
+
# Fit the ColumnTransformer
|
| 133 |
+
X_train_t = nusvr_wrapper.ct.fit_transform(X_train_raw)
|
| 134 |
+
# Fit the target scaler
|
| 135 |
+
y_train_t = nusvr_wrapper.target_scaler.fit_transform(y_train.values.reshape(-1, 1)).ravel()
|
| 136 |
+
|
| 137 |
+
# -------------------------------------------------
|
| 138 |
+
# 5. Train the NuSVR model
|
| 139 |
+
# -------------------------------------------------
|
| 140 |
+
nusvr_wrapper.model.fit(X_train_t, y_train_t)
|
| 141 |
+
|
| 142 |
+
# -------------------------------------------------
|
| 143 |
+
# 6. Evaluate on test data
|
| 144 |
+
# -------------------------------------------------
|
| 145 |
+
# Preprocess the test features with the same pipeline
|
| 146 |
+
X_test_t = nusvr_wrapper.preprocessing(X_test_raw)
|
| 147 |
+
|
| 148 |
+
# Make predictions (in original scale)
|
| 149 |
+
y_pred = nusvr_wrapper.predict(X_test_t)
|
| 150 |
+
|
| 151 |
+
mae = mean_absolute_error(y_test, y_pred)
|
| 152 |
+
r2 = r2_score(y_test, y_pred)
|
| 153 |
+
|
| 154 |
+
print(f"Test MAE (original scale): {mae:.3f}")
|
| 155 |
+
print(f"Test R^2 (original scale): {r2:.3f}")
|
| 156 |
+
|
| 157 |
+
# -------------------------------------------------
|
| 158 |
+
# 7. Export the fitted model
|
| 159 |
+
# -------------------------------------------------
|
| 160 |
+
joblib.dump(nusvr_wrapper, "nusvr_insurance_model.joblib")
|
| 161 |
+
print("Fitted NuSVRInsuranceModel saved to nusvr_insurance_model.joblib")
|