Mateusz Paszynski committed on
Commit
5de1466
·
1 Parent(s): 75b972f

publish website

Browse files
KNNInsuranceModel.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e42bf88f3ea8c987e04d270daeae1944e61ecbb51d655b74c15f667d6b1b65f7
3
+ size 75472
NuSVRInsuranceModel.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a21361ad305bcc706235a87f72f5f147f6dc2ceddbd51c9d36e12bb9e1fe65b2
3
+ size 90887
RandomForestInsuranceModel.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7091796dc3eb05b4fe48607cfb0f9419ffa847343e8baf3534965e7378d2c96d
3
+ size 225027
XGBoostInsuranceModel.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a06f22700c735de1f450ca10404a71fca66509570b3c1aa10af8576b4feb61d
3
+ size 184349
app.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import joblib
3
+ import numpy as np
4
+ import pandas as pd
5
+ from models.svr import NuSVRInsuranceModel
6
+ from models.randomforest import RandomForestInsuranceModel
7
+ from models.XGBoost import XGBoostInsuranceModel
8
+ from models.knn import KNNInsuranceModel
9
+ import os
10
# -------------------------------------------------------
# Load the pre-trained model wrappers from disk.
# NOTE(review): the paths assume the .joblib files live under ./Website/,
# but this commit adds them at the repository root — confirm the intended
# working directory before deploying.
# -------------------------------------------------------
model_nusvr = joblib.load(os.path.join('.', 'Website', 'NuSVRInsuranceModel.joblib'))
model_xgb = joblib.load(os.path.join('.', 'Website', 'XGBoostInsuranceModel.joblib'))
model_knn = joblib.load(os.path.join('.', 'Website', 'KNNInsuranceModel.joblib'))
model_rf = joblib.load(os.path.join('.', 'Website', 'RandomForestInsuranceModel.joblib'))

# Maps the dropdown label shown in the UI to the loaded model wrapper.
models_dict = {
    "NuSVR": model_nusvr,
    "XGBoost": model_xgb,
    "KNN": model_knn,
    "Random Forest": model_rf,
}
26
+
27
# -------------------------------------------------------
# Categorical encodings, used here only for input validation.
# The model wrappers re-encode the raw strings themselves inside their
# own `preprocessing` methods, so the integer codes are never passed on.
# -------------------------------------------------------
region_mapping = {"southwest": 0, "southeast": 1, "northwest": 2, "northeast": 3}
gender_mapping = {"male": 0, "female": 1}
yes_no_mapping = {"No": 0, "Yes": 1}


def predict_insurance_claim(
    model_choice,
    age,
    gender,
    bmi,
    blood_pressure,
    diabetic,
    children,
    smoker,
    region
):
    """Predict an insurance claim amount with the selected model.

    Parameters mirror the Gradio inputs: `model_choice` is one of the
    keys of `models_dict`; the rest are the raw user inputs.

    Returns:
        float: the predicted claim, or
        str: an error message when a categorical value is unrecognised.
    """
    # Validate the categorical fields up front. (The original code
    # computed encoded values here but never used them — membership
    # checks give the same KeyError-equivalent behaviour without the
    # dead assignments.)
    if (
        gender.lower() not in gender_mapping
        or diabetic not in yes_no_mapping
        or smoker not in yes_no_mapping
        or region.lower() not in region_mapping
    ):
        return "Invalid input for categorical field."

    # Single-row DataFrame with the raw column names that every model
    # wrapper's `preprocessing` method expects.
    user_df = pd.DataFrame({
        'age': [age],
        'gender': [gender],
        'bmi': [bmi],
        'bloodpressure': [blood_pressure],
        'diabetic': [diabetic],
        'children': [children],
        'smoker': [smoker],
        'region': [region],
    })

    chosen_model = models_dict[model_choice]

    # Each wrapper exposes the same preprocessing -> predict pipeline.
    features = chosen_model.preprocessing(user_df)
    y_pred = chosen_model.predict(features)

    return float(y_pred[0])
80
+
81
# -------------------------------------------------------
# Gradio Interface
# -------------------------------------------------------
def build_interface():
    """Assemble and return the Gradio Interface for claim prediction."""
    # Model selector.
    with gr.Row():
        model_dropdown = gr.Dropdown(
            choices=["NuSVR", "XGBoost", "KNN", "Random Forest"],
            value="NuSVR",
            label="Select Model"
        )

    with gr.Row():
        # Numeric inputs.
        age_input = gr.Number(value=39.0, label="Age")
        bmi_input = gr.Number(value=23.2, label="BMI")
        bp_input = gr.Number(value=91.0, label="Blood Pressure")
        children_input = gr.Number(value=0, label="Children")

        # Categorical inputs.
        gender_input = gr.Dropdown(choices=["male", "female"], value="male", label="Gender")
        diabetic_input = gr.Dropdown(choices=["No", "Yes"], value="Yes", label="Diabetic")
        smoker_input = gr.Dropdown(choices=["No", "Yes"], value="No", label="Smoker")
        region_input = gr.Dropdown(
            choices=["southwest", "southeast", "northwest", "northeast"],
            value="southeast",
            label="Region"
        )

    # Prediction display.
    output_label = gr.Textbox(label="Predicted Claim")

    # The input order must match predict_insurance_claim's signature.
    return gr.Interface(
        fn=predict_insurance_claim,
        inputs=[
            model_dropdown,
            age_input,
            gender_input,
            bmi_input,
            bp_input,
            diabetic_input,
            children_input,
            smoker_input,
            region_input
        ],
        outputs=output_label,
        title="Insurance Claim Prediction"
    )


if __name__ == "__main__":
    interface = build_interface()
    # NOTE(review): share=True opens a public tunnel even though the
    # server binds to localhost — confirm that is intended.
    interface.launch(server_name="localhost", server_port=7861, share=True)
models/XGBoost.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import joblib
4
+
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.ensemble import GradientBoostingRegressor
7
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
8
+ from sklearn.preprocessing import PolynomialFeatures
9
+ from sklearn.impute import SimpleImputer
10
+
11
class XGBoostInsuranceModel:
    """
    Gradient-boosting insurance regressor wrapper providing:
      1. CSV loading and identifier-column pruning
      2. Dummy-encoding of categoricals (column set remembered for inference)
      3. Mean imputation of missing values
      4. Degree-2 interaction features
      5. Train/validation/test splitting with test-set evaluation
      6. The common API: `preprocessing`, `predict`, `postprocessing`

    NOTE(review): despite the class name, the estimator is sklearn's
    GradientBoostingRegressor, not the xgboost library.
    """

    def __init__(self, csv_path):
        """Load `csv_path`, fit transformers and regressor, print metrics."""
        # --- 1. Load & prepare the data ------------------------------
        df = pd.read_csv(csv_path)
        # Identifier columns are irrelevant; ignore them if absent.
        df = df.drop(columns=['index', 'PatientID'], errors='ignore')

        X = df.drop(columns=['claim'])
        y = df['claim'].values

        # Dummy-encode the full feature frame once so every category
        # level ever seen gets a column; remember those columns so new
        # data can be aligned in `preprocessing`.
        X_dummies = pd.get_dummies(X, drop_first=True)
        self.all_dummy_cols = X_dummies.columns.tolist()

        self.imputer = SimpleImputer(strategy='mean')
        self.poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

        # --- 2. Split: 20% test, then 25% of the remainder as val -----
        X_train_dummies, X_test_dummies, y_train, y_test = train_test_split(
            X_dummies, y, test_size=0.2, random_state=42
        )
        X_train_dummies, X_val_dummies, y_train, y_val = train_test_split(
            X_train_dummies, y_train, test_size=0.25, random_state=42
        )

        # Fit both transformers on the training partition only.
        X_train_poly = self.poly.fit_transform(
            self.imputer.fit_transform(X_train_dummies)
        )

        # --- 3. Train the regressor ----------------------------------
        self.model = GradientBoostingRegressor(
            n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
        )
        self.model.fit(X_train_poly, y_train)

        # --- 4. Test-set evaluation ----------------------------------
        X_test_poly = self.poly.transform(self.imputer.transform(X_test_dummies))
        y_test_pred = self.model.predict(X_test_poly)

        test_mse = mean_squared_error(y_test, y_test_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        self.__scores = [test_mae, test_mse, test_r2]

        print(f"[XGBoostInsuranceModel] MAE: {test_mae:.3f} | MSE: {test_mse:.3f} | R^2: {test_r2:.3f}")

    def preprocessing(self, raw_df):
        """Transform new raw data exactly as training data was.

        Steps: dummy-encode, align columns with the training layout,
        impute, then expand with interaction features.
        Returns a numpy array ready for `predict`.
        """
        encoded = pd.get_dummies(raw_df, drop_first=True)

        # Add any training-time dummy column the new data lacks ...
        for col in self.all_dummy_cols:
            if col not in encoded.columns:
                encoded[col] = 0
        # ... and drop extras / reorder so the layout matches training.
        encoded = encoded[self.all_dummy_cols]

        return self.poly.transform(self.imputer.transform(encoded))

    def predict(self, preprocessed_data):
        """Predict claims for data produced by `preprocessing`."""
        return self.postprocessing(self.model.predict(preprocessed_data))

    def postprocessing(self, preds):
        """Pass-through: the target was never scaled."""
        return preds

    def getScores(self):
        """Return the test metrics (MAE, MSE, R^2) as a string."""
        return f"MAE: {self.__scores[0]} | MSE: {self.__scores[1]} | R^2: {self.__scores[2]}"
135
+
136
if __name__ == "__main__":
    # Train and evaluate on the cleaned dataset, then persist the whole
    # wrapper (model + fitted transformers) for the web app.
    xgb_model = XGBoostInsuranceModel("cleaned_insurance_data.csv")

    joblib.dump(xgb_model, "XGBoostInsuranceModel.joblib")
    print("Exported XGBoostInsuranceModel to XGBoostInsuranceModel.joblib")
models/knn.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import joblib
4
+
5
+ from sklearn.model_selection import train_test_split, GridSearchCV
6
+ from sklearn.neighbors import KNeighborsRegressor
7
+ from sklearn.metrics import mean_squared_error, r2_score
8
+
9
class KNNInsuranceModel:
    """
    KNN-based insurance regressor wrapper:
      1. Loads & cleans the CSV
      2. Binarises 'smoker' ('Yes' -> 1, anything else -> 0)
      3. Grid-searches n_neighbors / weights / metric
      4. Exposes the common API: preprocessing, predict, postprocessing
    """

    def __init__(self, csv_path):
        # ----- 1. Load data ------------------------------------------
        insurance_df = pd.read_csv(csv_path)
        # Drop identifier columns if present, then remove NaN rows.
        insurance_df = insurance_df.drop(columns=["index", "PatientID"], errors="ignore").dropna()

        # 'Yes' becomes 1; every other value (including 'No') becomes 0.
        insurance_df["smoker"] = np.where(insurance_df["smoker"] == 'Yes', 1, 0)

        # Only these three features are used for training.
        X = insurance_df[["bloodpressure", "bmi", "smoker"]]
        y = insurance_df["claim"]

        # ----- 2. Hold out 20% for testing ---------------------------
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # ----- 3. Exhaustive 5-fold grid search ----------------------
        param_grid = {
            'n_neighbors': range(1, 31),
            'weights': ['uniform', 'distance'],
            'metric': ['minkowski', 'euclidean', 'manhattan']
        }
        grid_search = GridSearchCV(
            KNeighborsRegressor(),
            param_grid,
            cv=5
        )
        grid_search.fit(X_train, y_train)

        # Refit the winning estimator on the full training split.
        self.model = grid_search.best_estimator_
        self.model.fit(X_train, y_train)

        # ----- 4. Test-set metrics -----------------------------------
        y_pred = self.model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        self.__scores = [mse, r2]

        print(f"[KNN] Test MSE: {mse:.3f}")
        print(f"[KNN] Test R^2: {r2:.3f}")

    def preprocessing(self, raw_df):
        """Replicate training-time preprocessing on new data.

        Binarises 'smoker' (default 0 when the column is absent) and
        selects the [bloodpressure, bmi, smoker] feature columns.
        """
        # Work on a copy so the caller's frame is never mutated.
        df_copy = raw_df.copy()
        if 'smoker' in df_copy.columns:
            df_copy["smoker"] = np.where(df_copy["smoker"] == 'Yes', 1, 0)
        else:
            # Column absent: treat every row as a non-smoker.
            df_copy["smoker"] = 0

        return df_copy[["bloodpressure", "bmi", "smoker"]]

    def predict(self, preprocessed_data):
        """Predict claims for data produced by `preprocessing`."""
        return self.postprocessing(self.model.predict(preprocessed_data))

    def postprocessing(self, preds):
        """Pass-through: the target was never scaled."""
        return preds

    def getScores(self):
        """Return the test metrics (MSE, R^2) as a string."""
        return f"MSE: {self.__scores[0]} \nR2: {self.__scores[1]}"
104
+
105
if __name__ == "__main__":
    # Train on the cleaned dataset ...
    knn_wrapper = KNNInsuranceModel("cleaned_insurance_data.csv")

    # ... then persist the whole wrapper for later use by the web app.
    joblib.dump(knn_wrapper, "KNNInsuranceModel.joblib")
    print("KNNInsuranceModel exported to KNNInsuranceModel.joblib")
models/randomforest.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import joblib
4
+
5
+ from sklearn.experimental import enable_iterative_imputer
6
+ from sklearn.impute import IterativeImputer
7
+ from sklearn.compose import ColumnTransformer
8
+ from sklearn.pipeline import Pipeline
9
+ from sklearn.preprocessing import OneHotEncoder, StandardScaler
10
+ from sklearn.model_selection import train_test_split
11
+ from sklearn.ensemble import RandomForestRegressor
12
+ from sklearn.metrics import mean_absolute_error, r2_score
13
+
14
+
15
class RandomForestInsuranceModel:
    """
    Random-forest insurance regressor wrapper:
      1. Loads & cleans data (iterative imputation, claim-outlier clipping)
      2. Fixed hyperparameters (n_estimators=100, max_depth=4, min_samples_split=15)
      3. A ColumnTransformer over categorical & numeric features
      4. The common API: preprocessing, predict, postprocessing
    """

    def __init__(self, csv_path):
        """Load `csv_path`, clean it, fit the transformer and the forest,
        and print test-set metrics."""
        # ----- 1. Load and clean -------------------------------------
        df = pd.read_csv(csv_path)
        # Drop identifier columns if present, then remove NaN rows.
        df = df.drop(columns=["index", "PatientID"], errors="ignore").dropna()

        # Iteratively impute the numeric health columns in place.
        self._impute(df, columns=['age', 'bmi', 'bloodpressure'])

        # Keep only rows whose claim lies within the 1st-98th percentile.
        lower_percentile = df['claim'].quantile(0.01)
        upper_percentile = df['claim'].quantile(0.98)
        df = df[
            (df['claim'] >= lower_percentile) & (df['claim'] <= upper_percentile)
        ]

        # ----- 2. Separate features & target -------------------------
        features = df.drop(columns=['claim'])
        target = df['claim'].values

        # ----- 3. Build the ColumnTransformer ------------------------
        text_pipeline = Pipeline([
            ('one-hot', OneHotEncoder(handle_unknown='ignore'))
        ])
        nums_pipeline = Pipeline([
            ('normalize', StandardScaler(with_mean=False))
        ])
        self.ct = ColumnTransformer([
            ('categorical', text_pipeline, ['diabetic', 'gender', 'region', 'smoker']),
            ('numerical', nums_pipeline, ['children', 'age', 'bmi', 'bloodpressure'])
        ])

        # NOTE(review): the transformer is fitted on the FULL dataset
        # before splitting, so scaling statistics leak from the test
        # split into training — confirm whether that is intended.
        X_full_transformed = self.ct.fit_transform(features)

        # ----- 4. Train/test split -----------------------------------
        X_train, X_test, y_train, y_test = train_test_split(
            X_full_transformed,
            target,
            test_size=0.2,
            random_state=42
        )

        # ----- 5. Fixed-hyperparameter forest ------------------------
        self.model = RandomForestRegressor(
            n_estimators=100,
            max_depth=4,
            min_samples_split=15,
            random_state=42
        )
        self.model.fit(X_train, y_train)

        # ----- 6. Evaluate -------------------------------------------
        mae, r2 = self._evaluate(X_test, y_test)
        print(f"[RANDOM FOREST] Test MAE: {mae:.3f}")
        print(f"[RANDOM FOREST] Test R^2: {r2:.3f}")

    def _impute(self, df, columns):
        """Fill NaNs in `columns` of `df` in place via IterativeImputer."""
        imp = IterativeImputer(max_iter=5, verbose=2)
        df[columns] = imp.fit_transform(df[columns])

    def _evaluate(self, X_test, y_test):
        """Return (MAE, R^2) of the fitted model on the given test data."""
        y_pred = self.model.predict(X_test)
        return mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

    def preprocessing(self, raw_df):
        """Transform a new DataFrame with the fitted ColumnTransformer."""
        return self.ct.transform(raw_df)

    def predict(self, preprocessed_data):
        """Predict claims for data produced by `preprocessing`."""
        return self.postprocessing(self.model.predict(preprocessed_data))

    def postprocessing(self, preds):
        """Pass-through: the target was never scaled."""
        return preds
146
+
147
+
148
if __name__ == "__main__":
    # Train on the cleaned dataset and export the entire trained class
    # instance (including its fitted ColumnTransformer).
    rf_model = RandomForestInsuranceModel("cleaned_insurance_data.csv")

    joblib.dump(rf_model, "RandomForestInsuranceModel.joblib")
    print("Exported RandomForestInsuranceModel to RandomForestInsuranceModel.joblib")
models/svr.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+ from sklearn.svm import NuSVR
5
+ from sklearn.compose import ColumnTransformer
6
+ from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, MinMaxScaler
7
+ from sklearn.pipeline import Pipeline
8
+ from sklearn.base import BaseEstimator, TransformerMixin
9
+ from sklearn.model_selection import train_test_split
10
+ from sklearn.metrics import mean_absolute_error, r2_score
11
+
12
+ import joblib
13
+
14
+
15
class NuSVRInsuranceModel:
    """
    NuSVR insurance regressor wrapper bundling:
      1. Preprocessing: a ColumnTransformer over the raw columns
      2. Prediction: the NuSVR estimator
      3. Postprocessing: inverse-scaling predictions back to claim units

    The transformers and model are *constructed* here but fitted
    externally (see the __main__ training script in this module).
    """

    # --- Custom transformer, nested so pickling stays self-contained ---
    class MultiplyScaler(BaseEstimator, TransformerMixin):
        """Stateless transformer that multiplies features by `factor`,
        used to give selected columns extra weight for the SVR."""

        def __init__(self, factor=2):
            self.factor = factor

        def fit(self, X, y=None):
            # Nothing to learn.
            return self

        def transform(self, X):
            return X * self.factor

    def __init__(self):
        """Build the (unfitted) column pipelines, target scaler and model."""
        # One-hot encoding for plain string categoricals.
        text_pipeline = Pipeline([
            ('one-hot', OneHotEncoder())
        ])

        # Plain standardisation: weakly-weighted numerics.
        nums_pipeline = Pipeline([
            ('normalize', StandardScaler(with_mean=True)),
        ])

        # Standardise then double: strongly-weighted numerics.
        nums_pipeline_strong = Pipeline([
            ('normalize', StandardScaler(with_mean=True)),
            # Reference the nested class through the enclosing class name.
            ('scalarMultiply', NuSVRInsuranceModel.MultiplyScaler(factor=2))
        ])

        # The smoker flag gets the heaviest weighting (x5).
        smoke_pipeline = Pipeline([
            ('one-hot', OneHotEncoder()),
            ('normalize', StandardScaler(with_mean=False)),
            ('scalar-multiply', NuSVRInsuranceModel.MultiplyScaler(factor=5))
        ])

        region_pipeline = Pipeline([
            ('categories', OrdinalEncoder())
        ])

        # NOTE(review): region_pipeline is built but not wired into the
        # ColumnTransformer below, so the 'region' column is silently
        # dropped at transform time — confirm whether that is intended.
        self.ct = ColumnTransformer([
            ('str_handler', text_pipeline, ['diabetic', 'gender']),
            ('smoke_handle', smoke_pipeline, ['smoker']),
            ('floats_ints_weak', nums_pipeline, ['children', 'age']),
            ('floats_ints_strong', nums_pipeline_strong, ['bmi', 'bloodpressure']),
        ])

        # Scales the 'claim' target into [-0.5, 0.5] for the SVR.
        self.target_scaler = MinMaxScaler(feature_range=(-0.5, 0.5))

        # NuSVR with the chosen hyperparameters.
        self.model = NuSVR(C=10, gamma='scale', kernel='rbf', nu=0.80)

    def preprocessing(self, df):
        """Apply the fitted ColumnTransformer to a raw DataFrame and
        return the transformed feature matrix."""
        return self.ct.transform(df)

    def predict(self, preprocessed_data):
        """Predict scaled targets and convert them to the claim scale."""
        y_pred_scaled = self.model.predict(preprocessed_data)
        return self.postprocessing(y_pred_scaled)

    def postprocessing(self, y_pred_scaled):
        """Invert the target scaling; returns a flat array of claims."""
        y_pred_original = self.target_scaler.inverse_transform(
            y_pred_scaled.reshape(-1, 1)
        )
        return y_pred_original.ravel()
105
+
106
+
107
if __name__ == "__main__":
    # ----- 1. Load data and separate features from the target --------
    df = pd.read_csv('cleaned_insurance_data.csv')
    features = df.drop(columns=['claim', 'PatientID', 'index'])
    target = df['claim']

    # ----- 2. Build the (unfitted) wrapper ---------------------------
    nusvr_wrapper = NuSVRInsuranceModel()

    # ----- 3. Hold out 25% for testing -------------------------------
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=42
    )

    # ----- 4. Fit transformer & target scaler on TRAIN data only -----
    X_train_t = nusvr_wrapper.ct.fit_transform(X_train_raw)
    y_train_t = nusvr_wrapper.target_scaler.fit_transform(y_train.values.reshape(-1, 1)).ravel()

    # ----- 5. Train the NuSVR on the scaled target -------------------
    nusvr_wrapper.model.fit(X_train_t, y_train_t)

    # ----- 6. Evaluate on the held-out split, in original scale ------
    X_test_t = nusvr_wrapper.preprocessing(X_test_raw)
    y_pred = nusvr_wrapper.predict(X_test_t)

    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Test MAE (original scale): {mae:.3f}")
    print(f"Test R^2 (original scale): {r2:.3f}")

    # ----- 7. Persist the fully fitted wrapper -----------------------
    joblib.dump(nusvr_wrapper, "nusvr_insurance_model.joblib")
    print("Fitted NuSVRInsuranceModel saved to nusvr_insurance_model.joblib")