# Mateusz Paszynski
# publish website
# 5de1466
import pandas as pd
import numpy as np
from sklearn.svm import NuSVR
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import joblib
class NuSVRInsuranceModel:
    """
    Bundle feature preprocessing, a NuSVR regressor and target scaling.

    This class encapsulates:
      1. Preprocessing: column transformations and scaling (``self.ct``)
      2. Prediction: using NuSVR (``self.model``)
      3. Postprocessing: inverse-transforming predictions back to the
         original target scale (``self.target_scaler``)

    The constructor only builds *unfitted* components. The caller must fit
    ``self.ct``, ``self.target_scaler`` and ``self.model`` before calling
    ``preprocessing``, ``predict`` or ``postprocessing``.
    """

    # --- Custom Transformer defined INSIDE the class ---
    class MultiplyScaler(BaseEstimator, TransformerMixin):
        """Stateless transformer that multiplies its input by ``factor``."""

        def __init__(self, factor=2):
            self.factor = factor

        def fit(self, X, y=None):
            """Learn nothing; return ``self`` so the step can be used in a Pipeline."""
            return self

        def transform(self, X):
            """Return ``X * self.factor``."""
            return X * self.factor

    def __init__(self):
        """
        Define the per-column pipelines, the main ColumnTransformer,
        the target scaler, and the NuSVR model (all unfitted).
        """
        # Example pipelines (adjust as needed)
        text_pipeline = Pipeline([
            ('one-hot', OneHotEncoder()),
        ])

        nums_pipeline = Pipeline([
            ('normalize', StandardScaler(with_mean=True)),
        ])

        # NOTE(review): the extra multiplication presumably up-weights these
        # features in the RBF kernel's distance computation — confirm.
        nums_pipeline_strong = Pipeline([
            ('normalize', StandardScaler(with_mean=True)),
            # Note we reference the nested class here
            ('scalarMultiply', NuSVRInsuranceModel.MultiplyScaler(factor=2)),
        ])

        # OneHotEncoder returns a sparse matrix by default, and StandardScaler
        # cannot center sparse input, hence with_mean=False here.
        smoke_pipeline = Pipeline([
            ('one-hot', OneHotEncoder()),
            ('normalize', StandardScaler(with_mean=False)),
            ('scalar-multiply', NuSVRInsuranceModel.MultiplyScaler(factor=5)),
        ])

        # Create ColumnTransformer.
        # Adjust columns to match your dataset's actual column names.
        # Columns not listed here are dropped (ColumnTransformer's default
        # remainder='drop').
        self.ct = ColumnTransformer([
            ('str_handler', text_pipeline, ['diabetic', 'gender']),
            ('smoke_handle', smoke_pipeline, ['smoker']),
            ('floats_ints_weak', nums_pipeline, ['children', 'age']),
            ('floats_ints_strong', nums_pipeline_strong, ['bmi', 'bloodpressure']),
        ])

        # Target scaler (for the 'claim' column)
        self.target_scaler = MinMaxScaler(feature_range=(-0.5, 0.5))

        # NuSVR model with desired hyperparameters
        self.model = NuSVR(C=10, gamma='scale', kernel='rbf', nu=0.80)

    def preprocessing(self, df):
        """
        Apply the ColumnTransformer to a raw dataframe.

        ``df`` must contain the columns named in ``self.ct``, and ``self.ct``
        must already be fitted. Returns the transformed feature matrix.
        """
        return self.ct.transform(df)

    def predict(self, preprocessed_data):
        """
        Predict from already-preprocessed data (matrix/array).

        Runs the fitted NuSVR model and returns the predictions mapped back
        to the original target scale via ``postprocessing``.
        """
        y_pred_scaled = self.model.predict(preprocessed_data)
        return self.postprocessing(y_pred_scaled)

    def postprocessing(self, y_pred_scaled):
        """
        Map scaled predictions back to the original target domain.

        ``y_pred_scaled`` may be any array-like of predictions in the
        ``target_scaler`` domain (ndarray, list, tuple, scalar, ...).
        Returns a 1-D array in the original target scale.
        """
        # np.asarray makes non-ndarray inputs (lists, scalars, Series) work
        # too; ndarray inputs pass through unchanged.
        scaled_column = np.asarray(y_pred_scaled).reshape(-1, 1)
        y_pred_original = self.target_scaler.inverse_transform(scaled_column)
        return y_pred_original.ravel()
if __name__ == "__main__":
    # Training script: fit the wrapper's components on a train split,
    # report hold-out metrics, then persist the fitted wrapper to disk.

    # -------------------------------------------------
    # 1. Load the data and separate predictors from the target
    # -------------------------------------------------
    dataset = pd.read_csv('cleaned_insurance_data.csv')
    y_all = dataset['claim']
    X_all = dataset.drop(columns=['claim', 'PatientID', 'index'])

    # -------------------------------------------------
    # 2. Hold out 25% of the rows for evaluation (fixed seed)
    # -------------------------------------------------
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X_all,
        y_all,
        test_size=0.25,
        random_state=42,
    )

    # -------------------------------------------------
    # 3. Build the (still unfitted) wrapper
    # -------------------------------------------------
    wrapper = NuSVRInsuranceModel()

    # -------------------------------------------------
    # 4. Fit the feature transformer and the target scaler on TRAIN data only
    # -------------------------------------------------
    train_matrix = wrapper.ct.fit_transform(X_train_raw)
    # The scaler expects a 2-D column; flatten back to 1-D for the regressor
    train_target_column = y_train.values.reshape(-1, 1)
    train_target_scaled = wrapper.target_scaler.fit_transform(train_target_column).ravel()

    # -------------------------------------------------
    # 5. Train the NuSVR model on the scaled target
    # -------------------------------------------------
    wrapper.model.fit(train_matrix, train_target_scaled)

    # -------------------------------------------------
    # 6. Evaluate on the held-out rows
    # -------------------------------------------------
    # Reuse the transformer fitted above; predict() already returns values
    # in the original target scale.
    test_matrix = wrapper.preprocessing(X_test_raw)
    test_predictions = wrapper.predict(test_matrix)

    mae = mean_absolute_error(y_test, test_predictions)
    r2 = r2_score(y_test, test_predictions)
    print(f"Test MAE (original scale): {mae:.3f}")
    print(f"Test R^2 (original scale): {r2:.3f}")

    # -------------------------------------------------
    # 7. Export the fitted wrapper
    # -------------------------------------------------
    # NOTE(review): when this file is run as a script the class is pickled
    # under the `__main__` module, so whoever loads the file needs
    # NuSVRInsuranceModel resolvable there — confirm how the consumer loads it.
    joblib.dump(wrapper, "nusvr_insurance_model.joblib")
    print("Fitted NuSVRInsuranceModel saved to nusvr_insurance_model.joblib")