Mateusz Paszynski committed on
Commit
5de1466
·
1 Parent(s): 75b972f

publish website

Browse files
KNNInsuranceModel.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e42bf88f3ea8c987e04d270daeae1944e61ecbb51d655b74c15f667d6b1b65f7
3
+ size 75472
NuSVRInsuranceModel.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a21361ad305bcc706235a87f72f5f147f6dc2ceddbd51c9d36e12bb9e1fe65b2
3
+ size 90887
RandomForestInsuranceModel.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7091796dc3eb05b4fe48607cfb0f9419ffa847343e8baf3534965e7378d2c96d
3
+ size 225027
XGBoostInsuranceModel.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a06f22700c735de1f450ca10404a71fca66509570b3c1aa10af8576b4feb61d
3
+ size 184349
app.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import joblib
3
+ import numpy as np
4
+ import pandas as pd
5
+ from models.svr import NuSVRInsuranceModel
6
+ from models.randomforest import RandomForestInsuranceModel
7
+ from models.XGBoost import XGBoostInsuranceModel
8
+ from models.knn import KNNInsuranceModel
9
+ import os
10
# -------------------------------------------------------
# Load the pre-trained model wrappers from disk.
# NOTE(review): the paths assume the .joblib files live under ./Website/,
# but this commit adds them at the repository root — confirm the intended
# working directory before deploying.
# -------------------------------------------------------
model_nusvr = joblib.load(os.path.join('.', 'Website', 'NuSVRInsuranceModel.joblib'))
model_xgb = joblib.load(os.path.join('.', 'Website', 'XGBoostInsuranceModel.joblib'))
model_knn = joblib.load(os.path.join('.', 'Website', 'KNNInsuranceModel.joblib'))
model_rf = joblib.load(os.path.join('.', 'Website', 'RandomForestInsuranceModel.joblib'))

# Maps the dropdown label shown in the UI to the loaded model wrapper.
models_dict = {
    "NuSVR": model_nusvr,
    "XGBoost": model_xgb,
    "KNN": model_knn,
    "Random Forest": model_rf,
}
26
+
27
# -------------------------------------------------------
# Categorical encodings, used here only for input validation.
# The model wrappers re-encode the raw strings themselves inside their
# own `preprocessing` methods, so the integer codes are never passed on.
# -------------------------------------------------------
region_mapping = {"southwest": 0, "southeast": 1, "northwest": 2, "northeast": 3}
gender_mapping = {"male": 0, "female": 1}
yes_no_mapping = {"No": 0, "Yes": 1}


def predict_insurance_claim(
    model_choice,
    age,
    gender,
    bmi,
    blood_pressure,
    diabetic,
    children,
    smoker,
    region
):
    """Predict an insurance claim amount with the selected model.

    Parameters mirror the Gradio inputs: `model_choice` is one of the
    keys of `models_dict`; the rest are the raw user inputs.

    Returns:
        float: the predicted claim, or
        str: an error message when a categorical value is unrecognised.
    """
    # Validate the categorical fields up front. (The original code
    # computed encoded values here but never used them — membership
    # checks give the same KeyError-equivalent behaviour without the
    # dead assignments.)
    if (
        gender.lower() not in gender_mapping
        or diabetic not in yes_no_mapping
        or smoker not in yes_no_mapping
        or region.lower() not in region_mapping
    ):
        return "Invalid input for categorical field."

    # Single-row DataFrame with the raw column names that every model
    # wrapper's `preprocessing` method expects.
    user_df = pd.DataFrame({
        'age': [age],
        'gender': [gender],
        'bmi': [bmi],
        'bloodpressure': [blood_pressure],
        'diabetic': [diabetic],
        'children': [children],
        'smoker': [smoker],
        'region': [region],
    })

    chosen_model = models_dict[model_choice]

    # Each wrapper exposes the same preprocessing -> predict pipeline.
    features = chosen_model.preprocessing(user_df)
    y_pred = chosen_model.predict(features)

    return float(y_pred[0])
80
+
81
# -------------------------------------------------------
# Gradio Interface
# -------------------------------------------------------
def build_interface():
    """Assemble and return the Gradio Interface for claim prediction."""
    # Model selector.
    with gr.Row():
        model_dropdown = gr.Dropdown(
            choices=["NuSVR", "XGBoost", "KNN", "Random Forest"],
            value="NuSVR",
            label="Select Model"
        )

    with gr.Row():
        # Numeric inputs.
        age_input = gr.Number(value=39.0, label="Age")
        bmi_input = gr.Number(value=23.2, label="BMI")
        bp_input = gr.Number(value=91.0, label="Blood Pressure")
        children_input = gr.Number(value=0, label="Children")

        # Categorical inputs.
        gender_input = gr.Dropdown(choices=["male", "female"], value="male", label="Gender")
        diabetic_input = gr.Dropdown(choices=["No", "Yes"], value="Yes", label="Diabetic")
        smoker_input = gr.Dropdown(choices=["No", "Yes"], value="No", label="Smoker")
        region_input = gr.Dropdown(
            choices=["southwest", "southeast", "northwest", "northeast"],
            value="southeast",
            label="Region"
        )

    # Prediction display.
    output_label = gr.Textbox(label="Predicted Claim")

    # The input order must match predict_insurance_claim's signature.
    return gr.Interface(
        fn=predict_insurance_claim,
        inputs=[
            model_dropdown,
            age_input,
            gender_input,
            bmi_input,
            bp_input,
            diabetic_input,
            children_input,
            smoker_input,
            region_input
        ],
        outputs=output_label,
        title="Insurance Claim Prediction"
    )


if __name__ == "__main__":
    interface = build_interface()
    # NOTE(review): share=True opens a public tunnel even though the
    # server binds to localhost — confirm that is intended.
    interface.launch(server_name="localhost", server_port=7861, share=True)
models/XGBoost.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import joblib
4
+
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.ensemble import GradientBoostingRegressor
7
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
8
+ from sklearn.preprocessing import PolynomialFeatures
9
+ from sklearn.impute import SimpleImputer
10
+
11
class XGBoostInsuranceModel:
    """
    Gradient-boosting insurance regressor wrapper providing:
      1. CSV loading and identifier-column pruning
      2. Dummy-encoding of categoricals (column set remembered for inference)
      3. Mean imputation of missing values
      4. Degree-2 interaction features
      5. Train/validation/test splitting with test-set evaluation
      6. The common API: `preprocessing`, `predict`, `postprocessing`

    NOTE(review): despite the class name, the estimator is sklearn's
    GradientBoostingRegressor, not the xgboost library.
    """

    def __init__(self, csv_path):
        """Load `csv_path`, fit transformers and regressor, print metrics."""
        # --- 1. Load & prepare the data ------------------------------
        df = pd.read_csv(csv_path)
        # Identifier columns are irrelevant; ignore them if absent.
        df = df.drop(columns=['index', 'PatientID'], errors='ignore')

        X = df.drop(columns=['claim'])
        y = df['claim'].values

        # Dummy-encode the full feature frame once so every category
        # level ever seen gets a column; remember those columns so new
        # data can be aligned in `preprocessing`.
        X_dummies = pd.get_dummies(X, drop_first=True)
        self.all_dummy_cols = X_dummies.columns.tolist()

        self.imputer = SimpleImputer(strategy='mean')
        self.poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

        # --- 2. Split: 20% test, then 25% of the remainder as val -----
        X_train_dummies, X_test_dummies, y_train, y_test = train_test_split(
            X_dummies, y, test_size=0.2, random_state=42
        )
        X_train_dummies, X_val_dummies, y_train, y_val = train_test_split(
            X_train_dummies, y_train, test_size=0.25, random_state=42
        )

        # Fit both transformers on the training partition only.
        X_train_poly = self.poly.fit_transform(
            self.imputer.fit_transform(X_train_dummies)
        )

        # --- 3. Train the regressor ----------------------------------
        self.model = GradientBoostingRegressor(
            n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
        )
        self.model.fit(X_train_poly, y_train)

        # --- 4. Test-set evaluation ----------------------------------
        X_test_poly = self.poly.transform(self.imputer.transform(X_test_dummies))
        y_test_pred = self.model.predict(X_test_poly)

        test_mse = mean_squared_error(y_test, y_test_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        self.__scores = [test_mae, test_mse, test_r2]

        print(f"[XGBoostInsuranceModel] MAE: {test_mae:.3f} | MSE: {test_mse:.3f} | R^2: {test_r2:.3f}")

    def preprocessing(self, raw_df):
        """Transform new raw data exactly as training data was.

        Steps: dummy-encode, align columns with the training layout,
        impute, then expand with interaction features.
        Returns a numpy array ready for `predict`.
        """
        encoded = pd.get_dummies(raw_df, drop_first=True)

        # Add any training-time dummy column the new data lacks ...
        for col in self.all_dummy_cols:
            if col not in encoded.columns:
                encoded[col] = 0
        # ... and drop extras / reorder so the layout matches training.
        encoded = encoded[self.all_dummy_cols]

        return self.poly.transform(self.imputer.transform(encoded))

    def predict(self, preprocessed_data):
        """Predict claims for data produced by `preprocessing`."""
        return self.postprocessing(self.model.predict(preprocessed_data))

    def postprocessing(self, preds):
        """Pass-through: the target was never scaled."""
        return preds

    def getScores(self):
        """Return the test metrics (MAE, MSE, R^2) as a string."""
        return f"MAE: {self.__scores[0]} | MSE: {self.__scores[1]} | R^2: {self.__scores[2]}"
135
+
136
if __name__ == "__main__":
    # Train and evaluate on the cleaned dataset, then persist the whole
    # wrapper (model + fitted transformers) for the web app.
    xgb_model = XGBoostInsuranceModel("cleaned_insurance_data.csv")

    joblib.dump(xgb_model, "XGBoostInsuranceModel.joblib")
    print("Exported XGBoostInsuranceModel to XGBoostInsuranceModel.joblib")
models/knn.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import joblib
4
+
5
+ from sklearn.model_selection import train_test_split, GridSearchCV
6
+ from sklearn.neighbors import KNeighborsRegressor
7
+ from sklearn.metrics import mean_squared_error, r2_score
8
+
9
class KNNInsuranceModel:
    """
    KNN-based insurance regressor wrapper:
      1. Loads & cleans the CSV
      2. Binarises 'smoker' ('Yes' -> 1, anything else -> 0)
      3. Grid-searches n_neighbors / weights / metric
      4. Exposes the common API: preprocessing, predict, postprocessing
    """

    def __init__(self, csv_path):
        # ----- 1. Load data ------------------------------------------
        insurance_df = pd.read_csv(csv_path)
        # Drop identifier columns if present, then remove NaN rows.
        insurance_df = insurance_df.drop(columns=["index", "PatientID"], errors="ignore").dropna()

        # 'Yes' becomes 1; every other value (including 'No') becomes 0.
        insurance_df["smoker"] = np.where(insurance_df["smoker"] == 'Yes', 1, 0)

        # Only these three features are used for training.
        X = insurance_df[["bloodpressure", "bmi", "smoker"]]
        y = insurance_df["claim"]

        # ----- 2. Hold out 20% for testing ---------------------------
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # ----- 3. Exhaustive 5-fold grid search ----------------------
        param_grid = {
            'n_neighbors': range(1, 31),
            'weights': ['uniform', 'distance'],
            'metric': ['minkowski', 'euclidean', 'manhattan']
        }
        grid_search = GridSearchCV(
            KNeighborsRegressor(),
            param_grid,
            cv=5
        )
        grid_search.fit(X_train, y_train)

        # Refit the winning estimator on the full training split.
        self.model = grid_search.best_estimator_
        self.model.fit(X_train, y_train)

        # ----- 4. Test-set metrics -----------------------------------
        y_pred = self.model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        self.__scores = [mse, r2]

        print(f"[KNN] Test MSE: {mse:.3f}")
        print(f"[KNN] Test R^2: {r2:.3f}")

    def preprocessing(self, raw_df):
        """Replicate training-time preprocessing on new data.

        Binarises 'smoker' (default 0 when the column is absent) and
        selects the [bloodpressure, bmi, smoker] feature columns.
        """
        # Work on a copy so the caller's frame is never mutated.
        df_copy = raw_df.copy()
        if 'smoker' in df_copy.columns:
            df_copy["smoker"] = np.where(df_copy["smoker"] == 'Yes', 1, 0)
        else:
            # Column absent: treat every row as a non-smoker.
            df_copy["smoker"] = 0

        return df_copy[["bloodpressure", "bmi", "smoker"]]

    def predict(self, preprocessed_data):
        """Predict claims for data produced by `preprocessing`."""
        return self.postprocessing(self.model.predict(preprocessed_data))

    def postprocessing(self, preds):
        """Pass-through: the target was never scaled."""
        return preds

    def getScores(self):
        """Return the test metrics (MSE, R^2) as a string."""
        return f"MSE: {self.__scores[0]} \nR2: {self.__scores[1]}"
104
+
105
if __name__ == "__main__":
    # Train on the cleaned dataset ...
    knn_wrapper = KNNInsuranceModel("cleaned_insurance_data.csv")

    # ... then persist the whole wrapper for later use by the web app.
    joblib.dump(knn_wrapper, "KNNInsuranceModel.joblib")
    print("KNNInsuranceModel exported to KNNInsuranceModel.joblib")
models/randomforest.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import joblib
4
+
5
+ from sklearn.experimental import enable_iterative_imputer
6
+ from sklearn.impute import IterativeImputer
7
+ from sklearn.compose import ColumnTransformer
8
+ from sklearn.pipeline import Pipeline
9
+ from sklearn.preprocessing import OneHotEncoder, StandardScaler
10
+ from sklearn.model_selection import train_test_split
11
+ from sklearn.ensemble import RandomForestRegressor
12
+ from sklearn.metrics import mean_absolute_error, r2_score
13
+
14
+
15
class RandomForestInsuranceModel:
    """
    Random-forest insurance regressor wrapper:
      1. Loads & cleans data (iterative imputation, claim-outlier clipping)
      2. Fixed hyperparameters (n_estimators=100, max_depth=4, min_samples_split=15)
      3. A ColumnTransformer over categorical & numeric features
      4. The common API: preprocessing, predict, postprocessing
    """

    def __init__(self, csv_path):
        """Load `csv_path`, clean it, fit the transformer and the forest,
        and print test-set metrics."""
        # ----- 1. Load and clean -------------------------------------
        df = pd.read_csv(csv_path)
        # Drop identifier columns if present, then remove NaN rows.
        df = df.drop(columns=["index", "PatientID"], errors="ignore").dropna()

        # Iteratively impute the numeric health columns in place.
        self._impute(df, columns=['age', 'bmi', 'bloodpressure'])

        # Keep only rows whose claim lies within the 1st-98th percentile.
        lower_percentile = df['claim'].quantile(0.01)
        upper_percentile = df['claim'].quantile(0.98)
        df = df[
            (df['claim'] >= lower_percentile) & (df['claim'] <= upper_percentile)
        ]

        # ----- 2. Separate features & target -------------------------
        features = df.drop(columns=['claim'])
        target = df['claim'].values

        # ----- 3. Build the ColumnTransformer ------------------------
        text_pipeline = Pipeline([
            ('one-hot', OneHotEncoder(handle_unknown='ignore'))
        ])
        nums_pipeline = Pipeline([
            ('normalize', StandardScaler(with_mean=False))
        ])
        self.ct = ColumnTransformer([
            ('categorical', text_pipeline, ['diabetic', 'gender', 'region', 'smoker']),
            ('numerical', nums_pipeline, ['children', 'age', 'bmi', 'bloodpressure'])
        ])

        # NOTE(review): the transformer is fitted on the FULL dataset
        # before splitting, so scaling statistics leak from the test
        # split into training — confirm whether that is intended.
        X_full_transformed = self.ct.fit_transform(features)

        # ----- 4. Train/test split -----------------------------------
        X_train, X_test, y_train, y_test = train_test_split(
            X_full_transformed,
            target,
            test_size=0.2,
            random_state=42
        )

        # ----- 5. Fixed-hyperparameter forest ------------------------
        self.model = RandomForestRegressor(
            n_estimators=100,
            max_depth=4,
            min_samples_split=15,
            random_state=42
        )
        self.model.fit(X_train, y_train)

        # ----- 6. Evaluate -------------------------------------------
        mae, r2 = self._evaluate(X_test, y_test)
        print(f"[RANDOM FOREST] Test MAE: {mae:.3f}")
        print(f"[RANDOM FOREST] Test R^2: {r2:.3f}")

    def _impute(self, df, columns):
        """Fill NaNs in `columns` of `df` in place via IterativeImputer."""
        imp = IterativeImputer(max_iter=5, verbose=2)
        df[columns] = imp.fit_transform(df[columns])

    def _evaluate(self, X_test, y_test):
        """Return (MAE, R^2) of the fitted model on the given test data."""
        y_pred = self.model.predict(X_test)
        return mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

    def preprocessing(self, raw_df):
        """Transform a new DataFrame with the fitted ColumnTransformer."""
        return self.ct.transform(raw_df)

    def predict(self, preprocessed_data):
        """Predict claims for data produced by `preprocessing`."""
        return self.postprocessing(self.model.predict(preprocessed_data))

    def postprocessing(self, preds):
        """Pass-through: the target was never scaled."""
        return preds
146
+
147
+
148
if __name__ == "__main__":
    # Train on the cleaned dataset and export the entire trained class
    # instance (including its fitted ColumnTransformer).
    rf_model = RandomForestInsuranceModel("cleaned_insurance_data.csv")

    joblib.dump(rf_model, "RandomForestInsuranceModel.joblib")
    print("Exported RandomForestInsuranceModel to RandomForestInsuranceModel.joblib")
models/svr.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+ from sklearn.svm import NuSVR
5
+ from sklearn.compose import ColumnTransformer
6
+ from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, MinMaxScaler
7
+ from sklearn.pipeline import Pipeline
8
+ from sklearn.base import BaseEstimator, TransformerMixin
9
+ from sklearn.model_selection import train_test_split
10
+ from sklearn.metrics import mean_absolute_error, r2_score
11
+
12
+ import joblib
13
+
14
+
15
class NuSVRInsuranceModel:
    """
    NuSVR insurance regressor wrapper bundling:
      1. Preprocessing: a ColumnTransformer over the raw columns
      2. Prediction: the NuSVR estimator
      3. Postprocessing: inverse-scaling predictions back to claim units

    The transformers and model are *constructed* here but fitted
    externally (see the __main__ training script in this module).
    """

    # --- Custom transformer, nested so pickling stays self-contained ---
    class MultiplyScaler(BaseEstimator, TransformerMixin):
        """Stateless transformer that multiplies features by `factor`,
        used to give selected columns extra weight for the SVR."""

        def __init__(self, factor=2):
            self.factor = factor

        def fit(self, X, y=None):
            # Nothing to learn.
            return self

        def transform(self, X):
            return X * self.factor

    def __init__(self):
        """Build the (unfitted) column pipelines, target scaler and model."""
        # One-hot encoding for plain string categoricals.
        text_pipeline = Pipeline([
            ('one-hot', OneHotEncoder())
        ])

        # Plain standardisation: weakly-weighted numerics.
        nums_pipeline = Pipeline([
            ('normalize', StandardScaler(with_mean=True)),
        ])

        # Standardise then double: strongly-weighted numerics.
        nums_pipeline_strong = Pipeline([
            ('normalize', StandardScaler(with_mean=True)),
            # Reference the nested class through the enclosing class name.
            ('scalarMultiply', NuSVRInsuranceModel.MultiplyScaler(factor=2))
        ])

        # The smoker flag gets the heaviest weighting (x5).
        smoke_pipeline = Pipeline([
            ('one-hot', OneHotEncoder()),
            ('normalize', StandardScaler(with_mean=False)),
            ('scalar-multiply', NuSVRInsuranceModel.MultiplyScaler(factor=5))
        ])

        region_pipeline = Pipeline([
            ('categories', OrdinalEncoder())
        ])

        # NOTE(review): region_pipeline is built but not wired into the
        # ColumnTransformer below, so the 'region' column is silently
        # dropped at transform time — confirm whether that is intended.
        self.ct = ColumnTransformer([
            ('str_handler', text_pipeline, ['diabetic', 'gender']),
            ('smoke_handle', smoke_pipeline, ['smoker']),
            ('floats_ints_weak', nums_pipeline, ['children', 'age']),
            ('floats_ints_strong', nums_pipeline_strong, ['bmi', 'bloodpressure']),
        ])

        # Scales the 'claim' target into [-0.5, 0.5] for the SVR.
        self.target_scaler = MinMaxScaler(feature_range=(-0.5, 0.5))

        # NuSVR with the chosen hyperparameters.
        self.model = NuSVR(C=10, gamma='scale', kernel='rbf', nu=0.80)

    def preprocessing(self, df):
        """Apply the fitted ColumnTransformer to a raw DataFrame and
        return the transformed feature matrix."""
        return self.ct.transform(df)

    def predict(self, preprocessed_data):
        """Predict scaled targets and convert them to the claim scale."""
        y_pred_scaled = self.model.predict(preprocessed_data)
        return self.postprocessing(y_pred_scaled)

    def postprocessing(self, y_pred_scaled):
        """Invert the target scaling; returns a flat array of claims."""
        y_pred_original = self.target_scaler.inverse_transform(
            y_pred_scaled.reshape(-1, 1)
        )
        return y_pred_original.ravel()
105
+
106
+
107
if __name__ == "__main__":
    # ----- 1. Load data and separate features from the target --------
    df = pd.read_csv('cleaned_insurance_data.csv')
    features = df.drop(columns=['claim', 'PatientID', 'index'])
    target = df['claim']

    # ----- 2. Build the (unfitted) wrapper ---------------------------
    nusvr_wrapper = NuSVRInsuranceModel()

    # ----- 3. Hold out 25% for testing -------------------------------
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=42
    )

    # ----- 4. Fit transformer & target scaler on TRAIN data only -----
    X_train_t = nusvr_wrapper.ct.fit_transform(X_train_raw)
    y_train_t = nusvr_wrapper.target_scaler.fit_transform(y_train.values.reshape(-1, 1)).ravel()

    # ----- 5. Train the NuSVR on the scaled target -------------------
    nusvr_wrapper.model.fit(X_train_t, y_train_t)

    # ----- 6. Evaluate on the held-out split, in original scale ------
    X_test_t = nusvr_wrapper.preprocessing(X_test_raw)
    y_pred = nusvr_wrapper.predict(X_test_t)

    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MAE (original scale): {mae:.3f}")
    print(f"Test R^2 (original scale): {r2:.3f}")

    # ----- 7. Persist the fully fitted wrapper -----------------------
    joblib.dump(nusvr_wrapper, "nusvr_insurance_model.joblib")
    print("Fitted NuSVRInsuranceModel saved to nusvr_insurance_model.joblib")