# insureos-models / pricing_glm.py
# Uploaded by piyushptiwari via huggingface_hub (revision 2cc32a5, verified).
"""
InsureOS — Insurance Pricing GLM + EBM
Trains a Tweedie GLM for pure premium estimation and an Explainable Boosting Machine (EBM)
for interpretable rating factor analysis. Uses motor claims tabular data.
"""
import os
import json
import argparse
import pickle
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder
# Tweedie GLM
from sklearn.linear_model import TweedieRegressor
# Explainable Boosting Machine (glass-box model)
from interpret.glassbox import ExplainableBoostingRegressor
from interpret import show
# ── Defaults ──
DATA_DIR = "data/output"            # directory scanned for generated claims CSVs
OUTPUT_DIR = "models/pricing-glm"   # where trained models and result JSON are written
TEST_SIZE = 0.2                     # hold-out fraction for evaluation
RANDOM_STATE = 42                   # seed shared by the train/test split and the EBM
# GLM hyperparams
TWEEDIE_POWER = 1.5  # 1 < p < 2 → Compound Poisson-Gamma (standard for insurance)
TWEEDIE_ALPHA = 1.0  # regularization strength
TWEEDIE_MAX_ITER = 300  # solver iteration cap
# Features for pricing
# Numeric rating factors expected as columns in the motor claims CSV.
PRICING_FEATURES = [
    "driver_age", "years_driving", "years_ncd", "vehicle_year", "vehicle_value",
    "annual_mileage", "voluntary_excess", "compulsory_excess",
    "previous_claims_3y", "policy_age_days",
]
# Categorical columns; label-encoded into "<name>_enc" during data preparation.
CAT_FEATURES = [
    "vehicle_make", "fuel_type", "occupation", "region",
]
def load_and_prepare(data_dir: str, reference_year: int = 2025) -> tuple[pd.DataFrame, dict]:
    """Load the motor claims CSV and prepare features for the pricing models.

    Args:
        data_dir: Directory searched (non-recursively) for ``claims_motor_*.csv``.
        reference_year: Year used to derive ``vehicle_age`` from ``vehicle_year``.
            Defaults to 2025, matching the previously hard-coded value.

    Returns:
        Tuple ``(df, encoders)`` — the prepared DataFrame and a dict mapping
        each categorical column name to its fitted LabelEncoder.
        (Fixes the original annotation, which claimed a bare DataFrame.)

    Raises:
        FileNotFoundError: If no matching CSV exists in ``data_dir``.
    """
    # sorted() makes the chosen file deterministic; glob order is
    # filesystem-dependent and previously picked an arbitrary match.
    motor_files = sorted(Path(data_dir).glob("claims_motor_*.csv"))
    if not motor_files:
        raise FileNotFoundError(f"No motor claims CSV found in {data_dir}")
    df = pd.read_csv(str(motor_files[0]))
    # Target: claim_amount (pure premium proxy).
    # Only rows with a positive amount are usable as a regression target.
    df = df[df["claim_amount"] > 0].copy()
    # Encode categoricals; keep the fitted encoders so inference can reuse them.
    encoders = {}
    for col in CAT_FEATURES:
        if col in df.columns:
            le = LabelEncoder()
            df[col + "_enc"] = le.fit_transform(df[col].fillna("Unknown"))
            encoders[col] = le
    # Derived features
    df["vehicle_age"] = reference_year - df["vehicle_year"]
    # clip() guards the denominators against zero/implausibly small values.
    df["driver_experience_ratio"] = df["years_driving"] / df["driver_age"].clip(lower=18)
    df["ncd_ratio"] = df["years_ncd"] / df["years_driving"].clip(lower=1)
    return df, encoders
def train_tweedie_glm(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    feature_names: list,
    output_dir: str,
) -> dict:
    """Fit a log-link Tweedie GLM for pure premium and report hold-out metrics.

    Args:
        X_train, y_train: Training features and claim amounts.
        X_test, y_test: Hold-out features and claim amounts.
        feature_names: Columns selected from the feature frames.
        output_dir: Directory the pickled model is written into.

    Returns:
        Dict with metrics, coefficients and sample counts for the summary JSON.
    """
    print("\n[GLM] Training Tweedie Regressor...")
    model = TweedieRegressor(
        power=TWEEDIE_POWER,
        alpha=TWEEDIE_ALPHA,
        max_iter=TWEEDIE_MAX_ITER,
        link="log",
    )
    # Zero-impute missing values before fitting.
    design_train = X_train[feature_names].fillna(0)
    design_test = X_test[feature_names].fillna(0)
    model.fit(design_train, y_train)
    # Floor predictions at zero — negative premiums are meaningless.
    predictions = np.clip(model.predict(design_test), 0, None)
    # Hold-out metrics.
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    # Denominator clipped at 1 so tiny claims don't explode the percentage error.
    mape = np.mean(np.abs((y_test - predictions) / y_test.clip(lower=1))) * 100
    # Per-feature coefficients on the log scale.
    coefs = {name: weight for name, weight in zip(feature_names, model.coef_)}
    results = {
        "model": "TweedieGLM",
        "tweedie_power": TWEEDIE_POWER,
        "mae": mae,
        "rmse": rmse,
        "mape_pct": mape,
        "coefficients": coefs,
        "intercept": float(model.intercept_),
        "n_train": len(y_train),
        "n_test": len(y_test),
    }
    # Persist the fitted model for downstream scoring.
    model_path = os.path.join(output_dir, "tweedie_glm.pkl")
    with open(model_path, "wb") as fh:
        pickle.dump(model, fh)
    print(f" ✓ Tweedie GLM saved → {model_path}")
    print(f" MAE: £{mae:,.2f}")
    print(f" RMSE: £{rmse:,.2f}")
    print(f" MAPE: {mape:.1f}%")
    print(" Top coefficients:")
    ranked = sorted(coefs.items(), key=lambda item: abs(item[1]), reverse=True)
    for feat, coef in ranked[:5]:
        print(f" {feat}: {coef:+.4f}")
    return results
def train_ebm(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    feature_names: list,
    output_dir: str,
) -> dict:
    """Fit an Explainable Boosting Machine for glass-box rating-factor analysis.

    Args:
        X_train, y_train: Training features and claim amounts.
        X_test, y_test: Hold-out features and claim amounts.
        feature_names: Columns selected from the feature frames.
        output_dir: Directory the pickled model is written into.

    Returns:
        Dict with metrics, top term importances and sample counts.
    """
    print("\n[EBM] Training Explainable Boosting Machine...")
    booster = ExplainableBoostingRegressor(
        max_bins=256,
        outer_bags=8,
        inner_bags=4,
        learning_rate=0.01,
        max_leaves=3,
        min_samples_leaf=10,
        interactions=10,  # allow up to 10 pairwise interactions
        random_state=RANDOM_STATE,
    )
    # Zero-impute missing values before fitting.
    design_train = X_train[feature_names].fillna(0)
    design_test = X_test[feature_names].fillna(0)
    booster.fit(design_train, y_train)
    # Floor predictions at zero — negative premiums are meaningless.
    predictions = np.clip(booster.predict(design_test), 0, None)
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    # Denominator clipped at 1 so tiny claims don't explode the percentage error.
    mape = np.mean(np.abs((y_test - predictions) / y_test.clip(lower=1))) * 100
    # Global term importances: main effects plus any learned interactions.
    importance = dict(zip(booster.term_names_, booster.term_importances()))
    ranked = sorted(importance.items(), key=lambda item: item[1], reverse=True)
    results = {
        "model": "EBM",
        "mae": mae,
        "rmse": rmse,
        "mape_pct": mape,
        "n_train": len(y_train),
        "n_test": len(y_test),
        "top_features": dict(ranked[:10]),
    }
    # Persist the fitted model for downstream scoring.
    model_path = os.path.join(output_dir, "pricing_ebm.pkl")
    with open(model_path, "wb") as fh:
        pickle.dump(booster, fh)
    print(f" ✓ EBM saved → {model_path}")
    print(f" MAE: £{mae:,.2f}")
    print(f" RMSE: £{rmse:,.2f}")
    print(f" MAPE: {mape:.1f}%")
    print(" Top features:")
    for feat, imp in ranked[:5]:
        print(f" {feat}: {imp:.4f}")
    return results
def main():
    """CLI entry point: load claims data, train GLM + EBM, persist artifacts."""
    arg_parser = argparse.ArgumentParser(description="Train pricing models")
    arg_parser.add_argument("--data-dir", default=DATA_DIR)
    arg_parser.add_argument("--output-dir", default=OUTPUT_DIR)
    opts = arg_parser.parse_args()
    rule = "=" * 60
    print(rule)
    print(" InsureOS — Pricing Model Training")
    print(f" Data: {opts.data_dir}")
    print(rule)
    os.makedirs(opts.output_dir, exist_ok=True)
    # Load data
    print("\nLoading motor claims data...")
    df, encoders = load_and_prepare(opts.data_dir)
    print(f" Records: {len(df)}")
    print(f" Mean claim amount: £{df['claim_amount'].mean():,.2f}")
    print(f" Median claim amount: £{df['claim_amount'].median():,.2f}")
    # Feature set: raw numerics, derived ratios, then encoded categoricals.
    numeric_features = PRICING_FEATURES + ["vehicle_age", "driver_experience_ratio", "ncd_ratio"]
    cat_enc_features = [name + "_enc" for name in CAT_FEATURES if name + "_enc" in df.columns]
    all_features = numeric_features + cat_enc_features
    target = df["claim_amount"]
    features = df[all_features]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )
    # Train both models on the identical split for a fair comparison.
    glm_results = train_tweedie_glm(X_train, y_train, X_test, y_test, all_features, opts.output_dir)
    ebm_results = train_ebm(X_train, y_train, X_test, y_test, all_features, opts.output_dir)
    # Persist the label encoders so inference can reproduce the encoding.
    encoder_path = os.path.join(opts.output_dir, "label_encoders.pkl")
    with open(encoder_path, "wb") as fh:
        pickle.dump(encoders, fh)
    # Combined metrics summary; default=str stringifies numpy scalars.
    summary = {"glm": glm_results, "ebm": ebm_results}
    summary_path = os.path.join(opts.output_dir, "training_results.json")
    with open(summary_path, "w") as fh:
        json.dump(summary, fh, indent=2, default=str)
    print(f"\n{rule}")
    print(" ✓ Pricing model training complete!")
    print(f" Tweedie GLM MAE: £{glm_results['mae']:,.2f}")
    print(f" EBM MAE: £{ebm_results['mae']:,.2f}")
    print(f" Results → {summary_path}")
    print(rule)
if __name__ == "__main__":
    # Script entry point: train both pricing models and write artifacts.
    main()