# insureos-models / pricing_glm.py
# Uploaded by piyushptiwari via huggingface_hub (revision 2cc32a5, verified).
"""
InsureOS — Insurance Pricing GLM + EBM
Trains a Tweedie GLM for pure premium estimation and an Explainable Boosting Machine (EBM)
for interpretable rating factor analysis. Uses motor claims tabular data.
"""
import os
import json
import argparse
import pickle
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder
# Tweedie GLM
from sklearn.linear_model import TweedieRegressor
# Explainable Boosting Machine (glass-box model)
from interpret.glassbox import ExplainableBoostingRegressor
from interpret import show
# ── Defaults ──
DATA_DIR = "data/output"            # directory scanned for generated claims CSVs
OUTPUT_DIR = "models/pricing-glm"   # where trained models and result JSON are written
TEST_SIZE = 0.2                     # hold-out fraction for evaluation
RANDOM_STATE = 42                   # seed shared by the train/test split and the EBM
# GLM hyperparams
TWEEDIE_POWER = 1.5  # 1 < p < 2 → Compound Poisson-Gamma (standard for insurance)
TWEEDIE_ALPHA = 1.0  # regularization strength
TWEEDIE_MAX_ITER = 300  # solver iteration cap
# Features for pricing
# Numeric rating factors expected as columns in the motor claims CSV.
PRICING_FEATURES = [
    "driver_age", "years_driving", "years_ncd", "vehicle_year", "vehicle_value",
    "annual_mileage", "voluntary_excess", "compulsory_excess",
    "previous_claims_3y", "policy_age_days",
]
# Categorical columns; label-encoded into "<name>_enc" during data preparation.
CAT_FEATURES = [
    "vehicle_make", "fuel_type", "occupation", "region",
]
def load_and_prepare(data_dir: str, reference_year: int = 2025) -> tuple[pd.DataFrame, dict]:
    """Load the motor claims CSV and prepare features for the pricing models.

    Args:
        data_dir: Directory searched (non-recursively) for ``claims_motor_*.csv``.
        reference_year: Year used to derive ``vehicle_age`` from ``vehicle_year``.
            Defaults to 2025, matching the previously hard-coded value.

    Returns:
        Tuple ``(df, encoders)`` — the prepared DataFrame and a dict mapping
        each categorical column name to its fitted LabelEncoder.
        (Fixes the original annotation, which claimed a bare DataFrame.)

    Raises:
        FileNotFoundError: If no matching CSV exists in ``data_dir``.
    """
    # sorted() makes the chosen file deterministic; glob order is
    # filesystem-dependent and previously picked an arbitrary match.
    motor_files = sorted(Path(data_dir).glob("claims_motor_*.csv"))
    if not motor_files:
        raise FileNotFoundError(f"No motor claims CSV found in {data_dir}")
    df = pd.read_csv(str(motor_files[0]))
    # Target: claim_amount (pure premium proxy).
    # Only rows with a positive amount are usable as a regression target.
    df = df[df["claim_amount"] > 0].copy()
    # Encode categoricals; keep the fitted encoders so inference can reuse them.
    encoders = {}
    for col in CAT_FEATURES:
        if col in df.columns:
            le = LabelEncoder()
            df[col + "_enc"] = le.fit_transform(df[col].fillna("Unknown"))
            encoders[col] = le
    # Derived features
    df["vehicle_age"] = reference_year - df["vehicle_year"]
    # clip() guards the denominators against zero/implausibly small values.
    df["driver_experience_ratio"] = df["years_driving"] / df["driver_age"].clip(lower=18)
    df["ncd_ratio"] = df["years_ncd"] / df["years_driving"].clip(lower=1)
    return df, encoders
def train_tweedie_glm(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    feature_names: list,
    output_dir: str,
) -> dict:
    """Fit a log-link Tweedie GLM for pure premium and report hold-out metrics.

    Args:
        X_train, y_train: Training features and claim amounts.
        X_test, y_test: Hold-out features and claim amounts.
        feature_names: Columns selected from the feature frames.
        output_dir: Directory the pickled model is written into.

    Returns:
        Dict with metrics, coefficients and sample counts for the summary JSON.
    """
    print("\n[GLM] Training Tweedie Regressor...")
    model = TweedieRegressor(
        power=TWEEDIE_POWER,
        alpha=TWEEDIE_ALPHA,
        max_iter=TWEEDIE_MAX_ITER,
        link="log",
    )
    # Zero-impute missing values before fitting.
    design_train = X_train[feature_names].fillna(0)
    design_test = X_test[feature_names].fillna(0)
    model.fit(design_train, y_train)
    # Floor predictions at zero — negative premiums are meaningless.
    predictions = np.clip(model.predict(design_test), 0, None)
    # Hold-out metrics.
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    # Denominator clipped at 1 so tiny claims don't explode the percentage error.
    mape = np.mean(np.abs((y_test - predictions) / y_test.clip(lower=1))) * 100
    # Per-feature coefficients on the log scale.
    coefs = {name: weight for name, weight in zip(feature_names, model.coef_)}
    results = {
        "model": "TweedieGLM",
        "tweedie_power": TWEEDIE_POWER,
        "mae": mae,
        "rmse": rmse,
        "mape_pct": mape,
        "coefficients": coefs,
        "intercept": float(model.intercept_),
        "n_train": len(y_train),
        "n_test": len(y_test),
    }
    # Persist the fitted model for downstream scoring.
    model_path = os.path.join(output_dir, "tweedie_glm.pkl")
    with open(model_path, "wb") as fh:
        pickle.dump(model, fh)
    print(f" ✓ Tweedie GLM saved → {model_path}")
    print(f" MAE: £{mae:,.2f}")
    print(f" RMSE: £{rmse:,.2f}")
    print(f" MAPE: {mape:.1f}%")
    print(" Top coefficients:")
    ranked = sorted(coefs.items(), key=lambda item: abs(item[1]), reverse=True)
    for feat, coef in ranked[:5]:
        print(f" {feat}: {coef:+.4f}")
    return results
def train_ebm(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    feature_names: list,
    output_dir: str,
) -> dict:
    """Fit an Explainable Boosting Machine for glass-box rating-factor analysis.

    Args:
        X_train, y_train: Training features and claim amounts.
        X_test, y_test: Hold-out features and claim amounts.
        feature_names: Columns selected from the feature frames.
        output_dir: Directory the pickled model is written into.

    Returns:
        Dict with metrics, top term importances and sample counts.
    """
    print("\n[EBM] Training Explainable Boosting Machine...")
    booster = ExplainableBoostingRegressor(
        max_bins=256,
        outer_bags=8,
        inner_bags=4,
        learning_rate=0.01,
        max_leaves=3,
        min_samples_leaf=10,
        interactions=10,  # allow up to 10 pairwise interactions
        random_state=RANDOM_STATE,
    )
    # Zero-impute missing values before fitting.
    design_train = X_train[feature_names].fillna(0)
    design_test = X_test[feature_names].fillna(0)
    booster.fit(design_train, y_train)
    # Floor predictions at zero — negative premiums are meaningless.
    predictions = np.clip(booster.predict(design_test), 0, None)
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    # Denominator clipped at 1 so tiny claims don't explode the percentage error.
    mape = np.mean(np.abs((y_test - predictions) / y_test.clip(lower=1))) * 100
    # Global term importances: main effects plus any learned interactions.
    importance = dict(zip(booster.term_names_, booster.term_importances()))
    ranked = sorted(importance.items(), key=lambda item: item[1], reverse=True)
    results = {
        "model": "EBM",
        "mae": mae,
        "rmse": rmse,
        "mape_pct": mape,
        "n_train": len(y_train),
        "n_test": len(y_test),
        "top_features": dict(ranked[:10]),
    }
    # Persist the fitted model for downstream scoring.
    model_path = os.path.join(output_dir, "pricing_ebm.pkl")
    with open(model_path, "wb") as fh:
        pickle.dump(booster, fh)
    print(f" ✓ EBM saved → {model_path}")
    print(f" MAE: £{mae:,.2f}")
    print(f" RMSE: £{rmse:,.2f}")
    print(f" MAPE: {mape:.1f}%")
    print(" Top features:")
    for feat, imp in ranked[:5]:
        print(f" {feat}: {imp:.4f}")
    return results
def main():
    """CLI entry point: load claims data, train GLM + EBM, persist artifacts."""
    arg_parser = argparse.ArgumentParser(description="Train pricing models")
    arg_parser.add_argument("--data-dir", default=DATA_DIR)
    arg_parser.add_argument("--output-dir", default=OUTPUT_DIR)
    opts = arg_parser.parse_args()
    rule = "=" * 60
    print(rule)
    print(" InsureOS — Pricing Model Training")
    print(f" Data: {opts.data_dir}")
    print(rule)
    os.makedirs(opts.output_dir, exist_ok=True)
    # Load data
    print("\nLoading motor claims data...")
    df, encoders = load_and_prepare(opts.data_dir)
    print(f" Records: {len(df)}")
    print(f" Mean claim amount: £{df['claim_amount'].mean():,.2f}")
    print(f" Median claim amount: £{df['claim_amount'].median():,.2f}")
    # Feature set: raw numerics, derived ratios, then encoded categoricals.
    numeric_features = PRICING_FEATURES + ["vehicle_age", "driver_experience_ratio", "ncd_ratio"]
    cat_enc_features = [name + "_enc" for name in CAT_FEATURES if name + "_enc" in df.columns]
    all_features = numeric_features + cat_enc_features
    target = df["claim_amount"]
    features = df[all_features]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )
    # Train both models on the identical split for a fair comparison.
    glm_results = train_tweedie_glm(X_train, y_train, X_test, y_test, all_features, opts.output_dir)
    ebm_results = train_ebm(X_train, y_train, X_test, y_test, all_features, opts.output_dir)
    # Persist the label encoders so inference can reproduce the encoding.
    encoder_path = os.path.join(opts.output_dir, "label_encoders.pkl")
    with open(encoder_path, "wb") as fh:
        pickle.dump(encoders, fh)
    # Combined metrics summary; default=str stringifies numpy scalars.
    summary = {"glm": glm_results, "ebm": ebm_results}
    summary_path = os.path.join(opts.output_dir, "training_results.json")
    with open(summary_path, "w") as fh:
        json.dump(summary, fh, indent=2, default=str)
    print(f"\n{rule}")
    print(" ✓ Pricing model training complete!")
    print(f" Tweedie GLM MAE: £{glm_results['mae']:,.2f}")
    print(f" EBM MAE: £{ebm_results['mae']:,.2f}")
    print(f" Results → {summary_path}")
    print(rule)
if __name__ == "__main__":
    # Script entry point: train both pricing models and write artifacts.
    main()