Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from sklearn.svm import NuSVR | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, MinMaxScaler | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.base import BaseEstimator, TransformerMixin | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.metrics import mean_absolute_error, r2_score | |
| import joblib | |
class NuSVRInsuranceModel:
    """Bundle feature preprocessing, a NuSVR regressor and target re-scaling.

    The object holds three collaborating pieces:

    * ``ct`` -- a ``ColumnTransformer`` that encodes/scales the raw feature
      columns,
    * ``target_scaler`` -- a ``MinMaxScaler`` applied to the target before
      training and inverted after prediction,
    * ``model`` -- the ``NuSVR`` regressor itself.

    The constructor only *builds* these objects; they must be fitted
    (``ct``, ``target_scaler`` and ``model``) before ``preprocessing``,
    ``predict`` or ``postprocessing`` are called.
    """

    # --- Custom transformer defined INSIDE the class ---
    class MultiplyScaler(BaseEstimator, TransformerMixin):
        """Stateless transformer that multiplies its input by ``factor``."""

        def __init__(self, factor=2):
            self.factor = factor

        def fit(self, X, y=None):
            """Nothing to learn; return ``self`` unchanged."""
            return self

        def transform(self, X):
            """Return ``X`` multiplied element-wise by ``self.factor``."""
            return X * self.factor

    def __init__(self):
        """Build the column pipelines, the ColumnTransformer, the target
        scaler and the (unfitted) NuSVR model.
        """
        # Plain one-hot encoding for the categorical text columns.
        text_pipeline = Pipeline([
            ('one-hot', OneHotEncoder()),
        ])

        # Standardized numeric columns with no extra weighting.
        nums_pipeline = Pipeline([
            ('normalize', StandardScaler(with_mean=True)),
        ])

        # Standardized numeric columns that are then multiplied by 2
        # (presumably to give them more weight in the RBF kernel -- confirm).
        nums_pipeline_strong = Pipeline([
            ('normalize', StandardScaler(with_mean=True)),
            # Note we reference the nested class here
            ('scalarMultiply', NuSVRInsuranceModel.MultiplyScaler(factor=2)),
        ])

        # One-hot encode, scale without centering, then multiply by 5.
        # NOTE(review): with_mean=False is presumably required because the
        # encoder's output is sparse by default -- confirm.
        smoke_pipeline = Pipeline([
            ('one-hot', OneHotEncoder()),
            ('normalize', StandardScaler(with_mean=False)),
            ('scalar-multiply', NuSVRInsuranceModel.MultiplyScaler(factor=5)),
        ])

        # Create ColumnTransformer.
        # Adjust columns to match your dataset's actual column names.
        # Only the columns listed here are used; with ColumnTransformer's
        # default ``remainder='drop'`` every other input column is discarded.
        self.ct = ColumnTransformer([
            ('str_handler', text_pipeline, ['diabetic', 'gender']),
            ('smoke_handle', smoke_pipeline, ['smoker']),
            ('floats_ints_weak', nums_pipeline, ['children', 'age']),
            ('floats_ints_strong', nums_pipeline_strong, ['bmi', 'bloodpressure']),
        ])

        # Target scaler (for the 'claim' column): maps the fitted target
        # range onto [-0.5, 0.5].
        self.target_scaler = MinMaxScaler(feature_range=(-0.5, 0.5))

        # NuSVR model with desired hyperparameters
        self.model = NuSVR(C=10, gamma='scale', kernel='rbf', nu=0.80)

    def preprocessing(self, df):
        """Transform a raw dataframe with the already-fitted ColumnTransformer.

        ``df`` must contain the columns the transformer was configured with.
        Returns the transformed feature matrix.
        """
        return self.ct.transform(df)

    def predict(self, preprocessed_data):
        """Predict from already-preprocessed features.

        ``preprocessed_data`` is the matrix produced by ``preprocessing``.
        Returns a 1-D array of predictions in the original target scale.
        """
        y_pred_scaled = self.model.predict(preprocessed_data)
        return self.postprocessing(y_pred_scaled)

    def postprocessing(self, y_pred_scaled):
        """Map scaled predictions back to the original target domain.

        ``y_pred_scaled`` may be any array-like of scaled values (ndarray,
        list, pandas Series, ...). Returns a 1-D ndarray.
        """
        # Coerce first so non-ndarray inputs (which lack ``reshape``) work too;
        # the scaler expects a single-column 2-D array.
        column = np.asarray(y_pred_scaled).reshape(-1, 1)
        y_pred_original = self.target_scaler.inverse_transform(column)
        return y_pred_original.ravel()
if __name__ == "__main__":
    # Training script: load the cleaned data, fit the wrapper's components on
    # a train split, report test metrics and persist the fitted wrapper.

    # --- Load the dataset and separate features from the target -----------
    frame = pd.read_csv('cleaned_insurance_data.csv')
    non_feature_columns = ['claim', 'PatientID', 'index']
    X_all = frame.drop(columns=non_feature_columns)
    y_all = frame['claim']

    # --- Hold out 25% of the rows for evaluation (fixed seed) -------------
    split = train_test_split(X_all, y_all, test_size=0.25, random_state=42)
    X_fit_raw, X_eval_raw, y_fit, y_eval = split

    # --- Build the (unfitted) wrapper --------------------------------------
    wrapper = NuSVRInsuranceModel()

    # --- Fit the feature transformer and target scaler on TRAIN data only -
    X_fit = wrapper.ct.fit_transform(X_fit_raw)
    y_fit_column = y_fit.values.reshape(-1, 1)
    y_fit_scaled = wrapper.target_scaler.fit_transform(y_fit_column).ravel()

    # --- Train the regressor on the scaled target --------------------------
    wrapper.model.fit(X_fit, y_fit_scaled)

    # --- Evaluate: same preprocessing, predictions in the original scale ---
    X_eval = wrapper.preprocessing(X_eval_raw)
    y_hat = wrapper.predict(X_eval)

    mae = mean_absolute_error(y_eval, y_hat)
    r2 = r2_score(y_eval, y_hat)
    print(f"Test MAE (original scale): {mae:.3f}")
    print(f"Test R^2 (original scale): {r2:.3f}")

    # --- Persist the fitted wrapper ----------------------------------------
    # NOTE(review): the wrapper class is defined in this script, so whatever
    # loads this file presumably needs the same class importable under the
    # same module path -- confirm before deploying.
    joblib.dump(wrapper, "nusvr_insurance_model.joblib")
    print("Fitted NuSVRInsuranceModel saved to nusvr_insurance_model.joblib")