| """ |
| InsureOS — Insurance Pricing GLM + EBM |
| Trains a Tweedie GLM for pure premium estimation and an Explainable Boosting Machine (EBM) |
| for interpretable rating factor analysis. Uses motor claims tabular data. |
| """ |
|
|
| import os |
| import json |
| import argparse |
| import pickle |
| from pathlib import Path |
|
|
| import numpy as np |
| import pandas as pd |
| from sklearn.model_selection import train_test_split |
| from sklearn.metrics import mean_absolute_error, mean_squared_error |
| from sklearn.preprocessing import LabelEncoder |
|
|
| |
| from sklearn.linear_model import TweedieRegressor |
|
|
| |
| from interpret.glassbox import ExplainableBoostingRegressor |
| from interpret import show |
|
|
|
|
| |
|
|
# Default input/output locations (both overridable via CLI flags in main()).
DATA_DIR = "data/output"
OUTPUT_DIR = "models/pricing-glm"
# Hold-out fraction and seed for the train/test split.
TEST_SIZE = 0.2
RANDOM_STATE = 42


# Tweedie GLM hyperparameters. A power in (1, 2) selects a compound
# Poisson-gamma distribution — the standard choice for pure premium,
# which mixes a point mass at zero with a continuous severity tail.
TWEEDIE_POWER = 1.5
TWEEDIE_ALPHA = 1.0      # L2 regularization strength (sklearn `alpha`)
TWEEDIE_MAX_ITER = 300   # solver iteration cap


# Numeric rating factors expected as columns in the motor claims CSV.
PRICING_FEATURES = [
    "driver_age", "years_driving", "years_ncd", "vehicle_year", "vehicle_value",
    "annual_mileage", "voluntary_excess", "compulsory_excess",
    "previous_claims_3y", "policy_age_days",
]


# Categorical rating factors; label-encoded into "<name>_enc" columns
# by load_and_prepare() before training.
CAT_FEATURES = [
    "vehicle_make", "fuel_type", "occupation", "region",
]
|
|
|
|
def load_and_prepare(data_dir: str, reference_year: int = 2025) -> tuple:
    """Load a motor claims CSV and prepare features for the pricing models.

    Args:
        data_dir: Directory containing ``claims_motor_*.csv`` files; the
            first file in sorted order is used.
        reference_year: Year used to derive ``vehicle_age`` from
            ``vehicle_year``. Defaults to 2025, matching the previously
            hard-coded value.

    Returns:
        Tuple ``(df, encoders)``: the prepared DataFrame (positive-claim
        rows only, with encoded categoricals and engineered ratios) and a
        dict of fitted LabelEncoders keyed by categorical column name.

    Raises:
        FileNotFoundError: If no matching CSV exists in ``data_dir``.
    """
    # Sort so the chosen file is deterministic (glob order is OS-dependent).
    motor_files = sorted(Path(data_dir).glob("claims_motor_*.csv"))
    if not motor_files:
        raise FileNotFoundError(f"No motor claims CSV found in {data_dir}")

    df = pd.read_csv(motor_files[0])

    # Severity-style target: keep only rows with a positive claim amount.
    df = df[df["claim_amount"] > 0].copy()

    # Label-encode categoricals; keep the fitted encoders so the same
    # category -> integer mapping can be reapplied at scoring time.
    encoders = {}
    for col in CAT_FEATURES:
        if col in df.columns:
            le = LabelEncoder()
            df[col + "_enc"] = le.fit_transform(df[col].fillna("Unknown"))
            encoders[col] = le

    # Engineered rating features. The .clip calls guard against division
    # by zero and implausibly small driver ages / driving histories.
    df["vehicle_age"] = reference_year - df["vehicle_year"]
    df["driver_experience_ratio"] = df["years_driving"] / df["driver_age"].clip(lower=18)
    df["ncd_ratio"] = df["years_ncd"] / df["years_driving"].clip(lower=1)

    return df, encoders
|
|
|
|
def train_tweedie_glm(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    feature_names: list,
    output_dir: str,
) -> dict:
    """Train a log-link Tweedie GLM for pure premium and evaluate on the test split.

    Args:
        X_train / X_test: Feature frames; only ``feature_names`` columns are used,
            with NaNs imputed to 0.
        y_train / y_test: Claim amounts (positive, per load_and_prepare filtering).
        feature_names: Columns to train on, in order.
        output_dir: Directory where the pickled model is written.

    Returns:
        Metrics and coefficients as plain Python numeric types so the dict
        serializes to JSON numbers (numpy scalars previously leaked through
        and relied on the ``default=str`` fallback in main()).

    Side effects:
        Pickles the fitted model to ``<output_dir>/tweedie_glm.pkl`` and
        prints a metrics summary.
    """
    print("\n[GLM] Training Tweedie Regressor...")

    glm = TweedieRegressor(
        power=TWEEDIE_POWER,
        alpha=TWEEDIE_ALPHA,
        max_iter=TWEEDIE_MAX_ITER,
        link="log",
    )

    # Simple zero-imputation, applied identically to train and test.
    X_tr = X_train[feature_names].fillna(0)
    X_te = X_test[feature_names].fillna(0)

    glm.fit(X_tr, y_train)

    # Premium predictions cannot be negative; floor at zero.
    y_pred = np.clip(glm.predict(X_te), 0, None)

    mae = float(mean_absolute_error(y_test, y_pred))
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    # Denominator clipped at 1 so near-zero actuals don't explode the percentage.
    mape = float(np.mean(np.abs((y_test - y_pred) / y_test.clip(lower=1))) * 100)

    # Cast numpy scalars to float so the JSON summary stays numeric.
    coefs = {feat: float(coef) for feat, coef in zip(feature_names, glm.coef_)}

    results = {
        "model": "TweedieGLM",
        "tweedie_power": TWEEDIE_POWER,
        "mae": mae,
        "rmse": rmse,
        "mape_pct": mape,
        "coefficients": coefs,
        "intercept": float(glm.intercept_),
        "n_train": len(y_train),
        "n_test": len(y_test),
    }

    model_path = os.path.join(output_dir, "tweedie_glm.pkl")
    with open(model_path, "wb") as f:
        pickle.dump(glm, f)

    print(f" ✓ Tweedie GLM saved → {model_path}")
    print(f" MAE: £{mae:,.2f}")
    print(f" RMSE: £{rmse:,.2f}")
    print(f" MAPE: {mape:.1f}%")
    print(" Top coefficients:")
    # Rank by absolute magnitude: log-link coefficients are multiplicative
    # effects, so large |coef| in either direction matters.
    for feat, coef in sorted(coefs.items(), key=lambda x: abs(x[1]), reverse=True)[:5]:
        print(f" {feat}: {coef:+.4f}")

    return results
|
|
|
|
def train_ebm(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    feature_names: list,
    output_dir: str,
) -> dict:
    """Train an Explainable Boosting Machine for interpretable pricing.

    Args:
        X_train / X_test: Feature frames; only ``feature_names`` columns are used,
            with NaNs imputed to 0.
        y_train / y_test: Claim amounts.
        feature_names: Columns to train on, in order.
        output_dir: Directory where the pickled model is written.

    Returns:
        Metrics and top term importances as plain Python numeric types so
        the dict serializes to JSON numbers (numpy scalars previously relied
        on the ``default=str`` fallback in main()).

    Side effects:
        Pickles the fitted model to ``<output_dir>/pricing_ebm.pkl`` and
        prints a metrics summary.
    """
    print("\n[EBM] Training Explainable Boosting Machine...")

    ebm = ExplainableBoostingRegressor(
        max_bins=256,
        outer_bags=8,
        inner_bags=4,
        learning_rate=0.01,
        max_leaves=3,
        min_samples_leaf=10,
        interactions=10,
        random_state=RANDOM_STATE,
    )

    # Same zero-imputation as the GLM so the two models see identical inputs.
    X_tr = X_train[feature_names].fillna(0)
    X_te = X_test[feature_names].fillna(0)

    ebm.fit(X_tr, y_train)

    # Premium predictions cannot be negative; floor at zero.
    y_pred = np.clip(ebm.predict(X_te), 0, None)

    mae = float(mean_absolute_error(y_test, y_pred))
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    # Denominator clipped at 1 so near-zero actuals don't explode the percentage.
    mape = float(np.mean(np.abs((y_test - y_pred) / y_test.clip(lower=1))) * 100)

    # term_names_ includes interaction terms; cast importances to float
    # so the JSON summary stays numeric.
    importance = {
        term: float(score)
        for term, score in zip(ebm.term_names_, ebm.term_importances())
    }

    results = {
        "model": "EBM",
        "mae": mae,
        "rmse": rmse,
        "mape_pct": mape,
        "n_train": len(y_train),
        "n_test": len(y_test),
        "top_features": dict(sorted(importance.items(), key=lambda x: x[1], reverse=True)[:10]),
    }

    model_path = os.path.join(output_dir, "pricing_ebm.pkl")
    with open(model_path, "wb") as f:
        pickle.dump(ebm, f)

    print(f" ✓ EBM saved → {model_path}")
    print(f" MAE: £{mae:,.2f}")
    print(f" RMSE: £{rmse:,.2f}")
    print(f" MAPE: {mape:.1f}%")
    print(" Top features:")
    for feat, imp in sorted(importance.items(), key=lambda x: x[1], reverse=True)[:5]:
        print(f" {feat}: {imp:.4f}")

    return results
|
|
|
|
def main():
    """Entry point: parse CLI args, train GLM + EBM pricing models, persist artifacts."""
    arg_parser = argparse.ArgumentParser(description="Train pricing models")
    arg_parser.add_argument("--data-dir", default=DATA_DIR)
    arg_parser.add_argument("--output-dir", default=OUTPUT_DIR)
    args = arg_parser.parse_args()

    rule = "=" * 60
    print(rule)
    print(" InsureOS — Pricing Model Training")
    print(f" Data: {args.data_dir}")
    print(rule)

    os.makedirs(args.output_dir, exist_ok=True)

    # Load and summarize the claims data.
    print("\nLoading motor claims data...")
    df, encoders = load_and_prepare(args.data_dir)
    print(f" Records: {len(df)}")
    print(f" Mean claim amount: £{df['claim_amount'].mean():,.2f}")
    print(f" Median claim amount: £{df['claim_amount'].median():,.2f}")

    # Feature list = raw numerics + engineered ratios + whichever encoded
    # categoricals actually exist in the frame.
    engineered = ["vehicle_age", "driver_experience_ratio", "ncd_ratio"]
    numeric_features = PRICING_FEATURES + engineered
    cat_enc_features = [col + "_enc" for col in CAT_FEATURES if col + "_enc" in df.columns]
    all_features = numeric_features + cat_enc_features

    y = df["claim_amount"]
    X = df[all_features]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # Train both pricing models on the same split.
    glm_results = train_tweedie_glm(X_train, y_train, X_test, y_test, all_features, args.output_dir)
    ebm_results = train_ebm(X_train, y_train, X_test, y_test, all_features, args.output_dir)

    # Persist the label encoders so scoring can reuse the training mapping.
    encoder_path = os.path.join(args.output_dir, "label_encoders.pkl")
    with open(encoder_path, "wb") as f:
        pickle.dump(encoders, f)

    # Write a combined JSON summary of both models' metrics.
    summary_path = os.path.join(args.output_dir, "training_results.json")
    with open(summary_path, "w") as f:
        json.dump({"glm": glm_results, "ebm": ebm_results}, f, indent=2, default=str)

    print(f"\n{rule}")
    print(" ✓ Pricing model training complete!")
    print(f" Tweedie GLM MAE: £{glm_results['mae']:,.2f}")
    print(f" EBM MAE: £{ebm_results['mae']:,.2f}")
    print(f" Results → {summary_path}")
    print(rule)
|
|