towardsinnovationlab commited on
Commit
5993cd4
·
verified ·
1 Parent(s): dc33249

Upload LLM_Trial_2.py

Browse files
Files changed (1) hide show
  1. pages/LLM_Trial_2.py +1140 -0
pages/LLM_Trial_2.py ADDED
@@ -0,0 +1,1140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import random
4
+ import numpy as np
5
+ import pandas as pd
6
+ import matplotlib.pyplot as plt
7
+ import matplotlib.image as mpimg
8
+ import seaborn as sns
9
+ from matplotlib.pyplot import subplots
10
+ from sklearn.model_selection import train_test_split
11
+ from sklearn.model_selection import KFold
12
+ from sklearn.metrics import mean_poisson_deviance, mean_gamma_deviance, make_scorer
13
+ from scipy.stats import ks_2samp
14
+ from sklearn.decomposition import PCA
15
+ from sklearn.preprocessing import StandardScaler
16
+ from mpl_toolkits.mplot3d import Axes3D
17
+ from sklearn.linear_model import TweedieRegressor
18
+ import shap
19
+ from sklearn.mixture import GaussianMixture
20
+ from joblib import dump
21
+ from joblib import load
22
+ import streamlit as st
23
+
24
+ import warnings
25
+ warnings.filterwarnings('ignore')
26
+
27
+
28
# One seed shared by Python's `random` module and NumPy's global generator so
# that reruns of this page produce the same numbers.
DEFAULT_RANDOM_SEED = 0
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes - confirm if hash
# ordering matters for this page.
os.environ['PYTHONHASHSEED'] = str(DEFAULT_RANDOM_SEED)
for _seed_generator in (random.seed, np.random.seed):
    _seed_generator(DEFAULT_RANDOM_SEED)

# Page title
st.title("Large Language Model GPT-5.1: Synthetic Data Generation Analysis")
35
+
36
+
37
def compare_real_vs_synthetic(real_df, synthetic_df, columns=None, kind='hist', bins=30, figsize=(15, 10)):
    """
    Plot each selected column of the real and synthetic data together in Streamlit.

    One subplot per column is drawn on a two-column grid. The finished figure
    is rendered with ``st.pyplot`` and then closed so that repeated page runs
    do not accumulate open matplotlib figures.

    Parameters:
    - real_df: pd.DataFrame, the original dataset
    - synthetic_df: pd.DataFrame, the synthetic dataset; it is indexed with
      the same column names as real_df
    - columns: list of column names to compare; if None, every column of
      real_df whose dtype is not 'object' is used
    - kind: str, type of plot: 'hist', 'kde', or 'box'
    - bins: int, number of bins for histograms (used only when kind='hist')
    - figsize: tuple, size of the whole figure

    Returns:
    - None (the figure is displayed in the Streamlit page)

    Raises:
    - ValueError: if kind is not 'hist', 'kde' or 'box'
    """
    # Validate up front so an unsupported kind never leaves a half-drawn figure behind.
    if kind not in ('hist', 'kde', 'box'):
        raise ValueError("Unsupported plot kind. Choose from 'hist', 'kde', or 'box'.")

    if columns is None:
        columns = [col for col in real_df.columns if real_df[col].dtype != 'object']

    n_cols = 2
    n_rows = (len(columns) + n_cols - 1) // n_cols  # ceiling division

    fig = plt.figure(figsize=figsize)
    try:
        for idx, col in enumerate(columns, 1):
            ax = fig.add_subplot(n_rows, n_cols, idx)

            if kind == 'hist':
                sns.histplot(real_df[col], color='blue', label='Real', kde=False,
                             stat='density', bins=bins, alpha=0.6, ax=ax)
                sns.histplot(synthetic_df[col], color='red', label='Synthetic', kde=False,
                             stat='density', bins=bins, alpha=0.6, ax=ax)
            elif kind == 'kde':
                sns.kdeplot(real_df[col], color='blue', label='Real', ax=ax)
                sns.kdeplot(synthetic_df[col], color='red', label='Synthetic', ax=ax)
            else:  # 'box'
                sns.boxplot(data=[real_df[col], synthetic_df[col]], palette=['blue', 'red'], ax=ax)
                ax.set_xticks([0, 1])
                ax.set_xticklabels(['Real', 'Synthetic'])

            ax.set_title(f"Comparison for '{col}'")
            # Box plots are identified by their tick labels and carry no
            # labelled artists, so a legend is only meaningful for hist/kde.
            if kind != 'box':
                ax.legend()

        fig.tight_layout()
        st.pyplot(fig)
    finally:
        # pyplot keeps a reference to every figure until it is closed.
        plt.close(fig)
83
+
84
+
85
def run_glm_frequency_analysis(
        X_train, X_test, model=None, clip_exposure=False, random_state=0, label="Model", var=None):
    """
    Run GLM Poisson regression frequency analysis (ClaimNb ~ Features | Exposure).

    The model is fitted on the claim rate ``ClaimNb / Exposure`` with
    ``Exposure`` as sample weight. Rows whose exposure is not strictly
    positive are dropped from both sets. Every column other than 'Exposure',
    'ClaimNb' and 'ClaimAmount' is used as a feature.

    Parameters:
    - X_train: pd.DataFrame with ['Exposure', 'ClaimNb', ...]
    - X_test: pd.DataFrame with ['Exposure', 'ClaimNb', ...]
    - model: sklearn regressor, default is TweedieRegressor(power=1, link='log');
      a model passed in is fitted in place
    - clip_exposure: bool, if True, caps Exposure at 1 in training set
      (on a copy; the caller's frame is not modified)
    - random_state: int, seeds NumPy's global generator and the shuffled
      5-fold split used for cross-validation
    - label: str, name used in the displayed messages when var is None
    - var: str, name used in the displayed messages (e.g. 'Real', 'Synthetic')

    Returns:
    - trained_model: model fitted on the full (filtered) training set
    - results: dict with CV scores, deviance on train/test, and predictions
    """
    np.random.seed(random_state)

    # Optionally clip exposure in training data
    if clip_exposure:
        X_train = X_train.copy()
        X_train['Exposure'] = X_train['Exposure'].clip(upper=1)

    # A rate cannot be formed without positive exposure
    train = X_train[X_train['Exposure'] > 0]
    test = X_test[X_test['Exposure'] > 0]

    exposure_train = train['Exposure']
    exposure_test = test['Exposure']
    rate_train = train['ClaimNb'] / exposure_train
    rate_test = test['ClaimNb'] / exposure_test

    non_features = ['Exposure', 'ClaimNb', 'ClaimAmount']
    X_train_ = train.drop(columns=non_features, errors='ignore')
    X_test_ = test.drop(columns=non_features, errors='ignore')

    # Set model if not passed
    if model is None:
        model = TweedieRegressor(power=1, link='log')

    # Cross-validation; shuffled and seeded like the severity analysis so that
    # random_state actually controls the folds.
    cv = KFold(n_splits=5, shuffle=True, random_state=random_state)
    mpd_scores = []
    for train_idx, val_idx in cv.split(X_train_):
        model.fit(
            X_train_.iloc[train_idx],
            rate_train.iloc[train_idx],
            sample_weight=exposure_train.iloc[train_idx],
        )
        val_pred = model.predict(X_train_.iloc[val_idx])
        # NOTE(review): the deviance is unweighted although the fit is
        # exposure-weighted - confirm this is the intended metric.
        mpd_scores.append(mean_poisson_deviance(rate_train.iloc[val_idx], val_pred))

    # Final fit on full training set
    model.fit(X_train_, rate_train, sample_weight=exposure_train)

    pred_train = model.predict(X_train_)
    pred_test = model.predict(X_test_)

    mpd_train = mean_poisson_deviance(rate_train, pred_train)
    mpd_test = mean_poisson_deviance(rate_test, pred_test)

    # Fall back to the label so the message never reads "None"
    shown_name = label if var is None else var
    st.write(f"Train Poisson {shown_name} Deviance: {mpd_train:.4f}")
    st.write(f"Test Poisson {shown_name} Deviance: {mpd_test:.4f}")

    return model, {
        "cv_scores": mpd_scores,
        "mpd_train": mpd_train,
        "mpd_test": mpd_test,
        "train_predictions": pred_train,
        "test_predictions": pred_test
    }
166
+
167
+
168
def _severity_rows(frame):
    """Return a copy of the rows with a positive claim count and amount, with their average cost in 'Acost'."""
    has_claim = (frame['ClaimAmount'] > 0) & (frame['ClaimNb'] > 0)
    claims = frame[has_claim].copy()
    claims['Acost'] = claims['ClaimAmount'] / claims['ClaimNb']
    return claims


def run_glm_cost_analysis(X_train, X_test, is_sampled=False, verbose=True, var=None):
    """
    Perform GLM Cost Analysis using Tweedie Regressor (power=2, link='log').

    The target is the average cost per claim (ClaimAmount / ClaimNb) and the
    sample weight is ClaimNb. Only rows with ClaimAmount > 0 and ClaimNb > 0
    are used: a non-positive average cost is outside the Gamma domain, and
    records where the two fields disagree would otherwise make the fit raise.

    Parameters:
    - X_train: Training DataFrame (must include 'ClaimAmount', 'ClaimNb', 'Exposure')
    - X_test: Testing DataFrame (same required columns)
    - is_sampled: If True, cap 'Exposure' at 1 for training data (on a copy).
      'Exposure' is dropped before fitting, so this does not change the scores.
    - verbose: If True, write the train/test deviance to the Streamlit page
    - var: str, name used in the displayed messages (e.g. 'Real', 'Synthetic')

    Returns:
    - Dictionary containing the CV scores, train/test gamma deviance and predictions
    """
    np.random.seed(0)

    # Cap exposure if sampled
    if is_sampled:
        X_train = X_train.copy()
        X_train['Exposure'] = X_train['Exposure'].clip(upper=1)

    X_train_cost = _severity_rows(X_train)
    X_test_cost = _severity_rows(X_test)

    # Target and weights
    y_train = X_train_cost['Acost']
    claim_tr = X_train_cost['ClaimNb']
    y_test = X_test_cost['Acost']

    # Features
    drop_cols = ['Acost', 'Exposure', 'ClaimAmount', 'ClaimNb']
    X_train_ = X_train_cost.drop(columns=drop_cols)
    X_test_ = X_test_cost.drop(columns=drop_cols)

    # Initialize model
    glm_cl = TweedieRegressor(power=2, link='log')

    # Cross-validation
    cv = KFold(n_splits=5, shuffle=True, random_state=0)
    mgd_scores = []
    for train_idx, val_idx in cv.split(X_train_):
        glm_cl.fit(
            X_train_.iloc[train_idx],
            y_train.iloc[train_idx],
            sample_weight=claim_tr.iloc[train_idx],
        )
        val_pred = glm_cl.predict(X_train_.iloc[val_idx])
        mgd_scores.append(mean_gamma_deviance(y_train.iloc[val_idx], val_pred))

    # Train on full data
    glm_cl.fit(X_train_, y_train, sample_weight=claim_tr)

    # Predictions
    y_pred_train = glm_cl.predict(X_train_)
    y_pred_test = glm_cl.predict(X_test_)

    # Deviance on train and test
    mgd_train = mean_gamma_deviance(y_train, y_pred_train)
    mgd_test = mean_gamma_deviance(y_test, y_pred_test)

    if verbose:
        st.write(f"Train Gamma {var} Deviance: {mgd_train:.4f}")
        st.write(f"Test Gamma {var} Deviance: {mgd_test:.4f}")

    return {
        "cv_scores": mgd_scores,
        'mgd_train': mgd_train,
        'mgd_test': mgd_test,
        'y_pred_train': y_pred_train,
        'y_pred_test': y_pred_test
    }
259
+
260
+
261
def plot_glm_shap_importance(
        X_train, X_test, y_train, sample_weight,
        power: int, title: str, max_display: int = 10, figsize: tuple = (5, 5), seed: int = 0):
    """
    Compute and plot SHAP feature importance for GLMs using SHAP LinearExplainer.

    A fresh TweedieRegressor (log link) is fitted on the training data,
    explained on X_test with an Independent masker built from X_train, and the
    resulting bar summary plot is rendered in the Streamlit page.

    Parameters:
        X_train (pd.DataFrame): Training features
        X_test (pd.DataFrame): Test features (rows that are explained)
        y_train (pd.Series or np.array): Training target
        sample_weight (pd.Series or np.array): Sample weights
        power (int): Tweedie power (1 = Poisson for frequency, 2 = Gamma for severity)
        title (str): Title for the plot
        max_display (int): Max number of features to display
        figsize (tuple): Size of the figure
        seed (int): Random seed for reproducibility

    Returns:
        None (the figure is displayed in the Streamlit page)
    """
    np.random.seed(seed)

    model = TweedieRegressor(power=power, link='log')
    model.fit(X_train, y_train, sample_weight=sample_weight)

    masker = shap.maskers.Independent(X_train)
    explainer = shap.LinearExplainer(model, masker=masker)
    shap_values = explainer.shap_values(X_test)

    canvas = plt.figure(figsize=figsize)
    fig = canvas
    try:
        shap.summary_plot(
            shap_values, features=X_test,
            feature_names=X_test.columns,
            plot_type='bar',
            max_display=max_display,
            # Without this, summary_plot's default "auto" sizing replaces figsize.
            plot_size=figsize,
            show=False
        )
        plt.title(title, fontsize=12)
        plt.tight_layout()
        fig = plt.gcf()
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
        if canvas is not fig:
            plt.close(canvas)
300
+
301
+
302
# ### Load datasets

#-------------------
# DATASETS
#-------------------
# Real portfolios
df1, df2 = (
    pd.read_csv(_csv_path)
    for _csv_path in ('./data/ausprivauto0405.csv', './data/swmotorcycle.csv')
)
# LLM-generated counterparts
# NOTE(review): a previous revision dropped an "Unnamed: 0" column from both
# synthetic frames; presumably the CSVs no longer carry it - confirm.
df1_synth, df2_synth = (
    pd.read_csv(_csv_path)
    for _csv_path in (
        './LLM/synthetic_nonlife_53320_D1_60.csv',
        './LLM/synthetic_nonlife_51638_D2_60.csv',
    )
)
313
+
314
+
315
+
316
# ### Dataset 1: data handling, encoding and train/test split

st.header('Dataset 1: ausprivauto0405')

# Duplicated rows/columns are kept for inspection; only duplicated rows are removed.
df1_duplicated_rows = df1[df1.duplicated()]
df1 = df1.drop_duplicates()
df1_duplicated_col = df1.columns[df1.columns.duplicated()]


# ### Encoding
# Category label -> integer code (as a digit string), one table per column.
# The same tables encode the real and the synthetic frame.
VehAge_group = {'old cars':'1','young cars':'2','oldest cars':'3','youngest cars':'4'}
DrivAge_group = {'young people':'1','older work. people':'2','oldest people':'3','working people':'4','old people':'5','youngest people':'6'}
VehBody_group = {'Hatchback':'1','Utility':'2','Station wagon':'3','Hardtop':'4','Panel van':'5','Sedan':'6','Truck':'7',
                 'Coupe':'8', 'Minibus':'9', 'Motorized caravan':'10', 'Bus':'11', 'Convertible':'12','Roadster':'13'}
Gender_group = {'Female':'0','Male':'1'}

_DF1_CATEGORY_CODES = {
    'VehAge': VehAge_group,
    'DrivAge': DrivAge_group,
    'VehBody': VehBody_group,
    'Gender': Gender_group,
}


def _encode_dataset1_categories(frame, frame_label):
    """Return a copy of *frame* with the dataset-1 categorical columns replaced by integer codes.

    Raises ValueError naming the column and the offending labels when a value
    has no code, instead of failing later on a NaN-to-int cast.
    """
    encoded = frame.copy()
    for column, codes in _DF1_CATEGORY_CODES.items():
        mapped = encoded[column].map(codes)
        unmapped = mapped.isna()
        if unmapped.any():
            unknown = sorted(map(str, encoded.loc[unmapped, column].unique()))
            raise ValueError(
                f"{frame_label}: column '{column}' contains values without a code: {unknown}"
            )
        encoded[column] = mapped.astype(int)
    return encoded


df1_encod = _encode_dataset1_categories(df1, 'Dataset 1 (real)')


# ### Split dataset
# Split the dataset into train/test split
X_train, X_test = train_test_split(df1_encod, test_size=0.2, random_state=0)
st.markdown(f"**Train shape:** {X_train.shape} \n**Test shape:** {X_test.shape}")


# ### Use Generate Samples Dataframe
df1_synth_encod = _encode_dataset1_categories(df1_synth, 'Dataset 1 (synthetic)')

new_samples_df = df1_synth_encod.copy()
377
+
378
# Check consistency
st.subheader("Check consistency")
# A record is coherent when its three claim fields agree:
# either all of them are zero or all of them are positive.
_claim_fields = new_samples_df[["ClaimNb", "ClaimOcc", "ClaimAmount"]]
_all_zero = (_claim_fields == 0).all(axis=1)
_all_positive = (_claim_fields > 0).all(axis=1)
inconsistent_records = new_samples_df[~(_all_zero | _all_positive)]

st.write(f"Number of inconsistent records on synthetic data: {len(inconsistent_records)}")
st.write(inconsistent_records.head())  # Show a few inconsistent rows
st.write('Helps assess basic data fidelity by checking structural or logical violations.')
#st.write('The generative model successfully learned the essential business logic')
390
+
391
+
392
# ### Visual Comparison

# Compare selected variables using histograms
st.subheader("Univariate distribution comparison: real vs synthetic")
st.write('Shows how well each individual feature is mimicked by the synthetic data.')
# NOTE(review): the disabled interpretation below was written while the plot
# received the un-encoded synthetic frame; re-check it against the new plots.
#st.write('The model captures variables like Exposure, VehValue, ClaimAmount, ClaimOcc, and \
#ClaimNb reasonably well, showing similar overall shapes and ranges. Meanwhile for the others \
#show a poor replication.')

# X_train holds integer-encoded categories, so the synthetic side must be the
# encoded frame as well; the raw synthetic frame still has string labels for
# VehBody, DrivAge, VehAge and Gender and would be drawn on a different axis.
compare_real_vs_synthetic(
    real_df=X_train,
    synthetic_df=new_samples_df,
    columns=['Exposure','VehBody','VehValue','ClaimOcc','ClaimNb', 'ClaimAmount', 'DrivAge', 'VehAge','Gender'],
    kind='hist'
)
407
+
408
+
409
st.subheader("Correlation matrix comparison: real vs synthetic")
st.write('Evaluates preservation of feature-to-feature relationships.')
#st.write('Overall the correlation structure is well-preserved, indicating this synthetic data \
#generation method maintains feature relationships effectively')

# Compute correlation matrices
corr_matrix_X_train = X_train.corr()
corr_matrix_new_samples = new_samples_df.corr()

# One row, two columns: real data on the left, synthetic data on the right
fig, _corr_axes = plt.subplots(1, 2, figsize=(30, 15))
_corr_panels = (
    (corr_matrix_X_train, 'Correlation Heatmap of X_train'),
    (corr_matrix_new_samples, 'Correlation Heatmap of New Samples'),
)
for _corr_ax, (_corr_matrix, _corr_title) in zip(_corr_axes, _corr_panels):
    sns.heatmap(_corr_matrix, square=True, annot=True, cmap='coolwarm', fmt='.2f',
                annot_kws={"size": 15}, ax=_corr_ax)
    _corr_ax.set_title(_corr_title, size=15)
    plt.setp(_corr_ax.get_yticklabels(), rotation=0, fontsize=15)
    plt.setp(_corr_ax.get_xticklabels(), rotation=90, fontsize=15)

# Display the plot, then release it: pyplot keeps figures alive until closed
fig.tight_layout()
st.pyplot(fig)
plt.close(fig)
439
+
440
# ### Statistical Analysis
# Kolmogorov-Smirnov test
st.subheader("Kolmogorov–Smirnov Test Results")
st.write('Quantifies the statistical distance between real and synthetic distributions.')
#st.write('Five variables (VehAge, VehBody, Gender, ClaimOcc, ClaimNb) pass the KS test \
#with p ≥ 0.05, demonstrating good distributional similarity.')

# Two-sample KS test per column of the real training set against the same
# column of the synthetic data.
results = []
for column in X_train.columns:
    statistic, p_value = ks_2samp(X_train[column].values, new_samples_df[column].values)
    results.append({
        "Feature": column,
        "KS Statistic": statistic,
        "P-value": p_value
    })

results_df = pd.DataFrame(results)


def color_pval(val):
    """Return the CSS colour rule for a p-value cell: red below 0.05, green otherwise."""
    color = "red" if val < 0.05 else "green"
    return f"color: {color};"


# Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever exists.
_styler = results_df.style
_style_cells = getattr(_styler, "map", None) or _styler.applymap
styled_df = _style_cells(color_pval, subset=["P-value"]) \
    .format({"KS Statistic": "{:.4f}", "P-value": "{:.4f}"})

st.markdown("""
**Legend:**
- <span style='color:green;'>Green P-value</span>: distributions are **similar** (p ≥ 0.05)
- <span style='color:red;'>Red P-value</span>: distributions are **significantly different** (p < 0.05)
""", unsafe_allow_html=True)
st.dataframe(styled_df)
475
+
476
+
477
# ### PCA Analysis

st.subheader('PCA comparison')
st.write('Assesses similarity in global variance structure and major latent components.')
#st.write('The synthetic data points substantially overlap with the real data in the principal component space, \
#indicating the synthetic generation method successfully captures the main variance structure and multivariate \
#relationships present in the original dataset.')
# Show the pre-rendered PCA plot stored as an image
img = mpimg.imread('./LLM/pca_d1_60.png')
fig = plt.figure(figsize=(10, 8))
plt.imshow(img)
plt.axis('off')
st.pyplot(fig)
plt.close(fig)  # pyplot keeps figures alive until they are closed



# ### UMAP Analysis

st.subheader('UMAP comparison')
st.write('Examines nonlinear manifold structure and clustering behavior.')
#st.write('This visualization shows a strong co-location across all three dimensions \
#indicating the synthetic data successfully captures the complex, high-dimensional structure \
#of the real data, preserving both local neighborhoods and global manifold geometry essential \
#for downstream modeling tasks.')
# Show the pre-rendered UMAP plot stored as an image
img = mpimg.imread('./LLM/umap_d1_60.png')
fig = plt.figure(figsize=(10, 8))
plt.imshow(img)
plt.axis('off')
st.pyplot(fig)
plt.close(fig)
506
+
507
+
508
# ### GLM Frequency Analysis
st.subheader('Frequency GLM Analysis')
st.write('Tests how well synthetic data preserves predictive relationships for claim frequency.')
# Baseline: trained on the real training set
results_frequency_1 = run_glm_frequency_analysis(
    X_train=X_train, X_test=X_test, label="Baseline", var='Real')
# Trained on the synthetic records with exposure capped at 1; scored on the real test set
results_frequency_2 = run_glm_frequency_analysis(
    X_train=new_samples_df, X_test=X_test, clip_exposure=True,
    label="Synthetic Clipped", var='Synthetic')


# ### GLM Cost Analysis
st.subheader('Severity GLM Analysis')
st.write('Evaluates whether severity-related predictors behave similarly on real and synthetic data.')
# Same pairing as above: real training data first, then the synthetic records
results_cost_1 = run_glm_cost_analysis(X_train=X_train, X_test=X_test, var='Real')
results_cost_2 = run_glm_cost_analysis(
    X_train=new_samples_df, X_test=X_test, is_sampled=True, var='Synthetic')
522
+
523
+
524
# ### Feature Importance Analysis
# --- SHAP Feature Importance for Frequency ---
st.subheader('SHAP Feature Importance for Frequency Model')
st.write('Shows whether drivers of frequency predictions remain consistent across datasets.')
#st.write('This SHAP analysis reveals good model consistency: ClaimOcc (claim occurrence) dominates feature importance \
#in both real and synthetic datasets, suggesting the model has learned stable, meaningful patterns. However, the relative \
#importance of VehBody increases substantially in synthetic data compared to real data.')

# Columns that are the target, the weight or the severity amount, never predictors
_freq_non_features = ['Exposure', 'ClaimNb', 'ClaimAmount']

# Rows explained by both models: real test features with positive exposure
X_test_freq = X_test.drop(columns=_freq_non_features, errors='ignore')
mask_test_freq = X_test['Exposure'] > 0
X_test_freq_filtered = X_test_freq[mask_test_freq]

# Real training data; rows without positive exposure cannot yield a rate
sample_weight_freq = X_train['Exposure']
y_train_freq = X_train['ClaimNb']
X_train_freq = X_train.drop(columns=_freq_non_features, errors='ignore')

mask_train_freq = sample_weight_freq > 0
X_train_freq_filtered = X_train_freq[mask_train_freq]
y_train_freq_filtered = y_train_freq[mask_train_freq]
sample_weight_freq_filtered = sample_weight_freq[mask_train_freq]

# Plot SHAP for Frequency; the target is the rate ClaimNb / Exposure, power=1 is Poisson
plot_glm_shap_importance(
    X_train_freq_filtered,
    X_test_freq_filtered,
    y_train_freq_filtered / sample_weight_freq_filtered,
    sample_weight_freq_filtered,
    power=1,
    title="SHAP Feature Importance for Frequency Model (Real Data)",
    max_display=10,
)

# --- SHAP Feature Importance for Frequency (Synthetic Data) ---
# Same preparation on the synthetic records; the explained rows stay the real test set
sample_weight_freq_synth = new_samples_df['Exposure']
y_train_freq_synth = new_samples_df['ClaimNb']
X_train_freq_synth = new_samples_df.drop(columns=_freq_non_features, errors='ignore')

mask_train_freq_synth = sample_weight_freq_synth > 0
X_train_freq_synth_filtered = X_train_freq_synth[mask_train_freq_synth]
y_train_freq_synth_filtered = y_train_freq_synth[mask_train_freq_synth]
sample_weight_freq_synth_filtered = sample_weight_freq_synth[mask_train_freq_synth]

# Plot SHAP for Frequency (Synthetic Data)
plot_glm_shap_importance(
    X_train_freq_synth_filtered,
    X_test_freq_filtered,
    y_train_freq_synth_filtered / sample_weight_freq_synth_filtered,
    sample_weight_freq_synth_filtered,
    power=1,
    title="SHAP Feature Importance for Frequency Model (Synthetic Data)",
    max_display=10,
)
589
+
590
# --- SHAP Feature Importance for Severity ---
st.subheader('SHAP Feature Importance for Severity Model')
st.write('Assesses stability of model explanations for severity outcomes.')
#st.write('The severity model shows concerning instability between real and synthetic data: \
#the top features completely flip, with VehBody most important on real data but VehValue dominating synthetic data.')


def _rows_with_positive_claims(frame):
    """Return a copy of the rows whose claim amount and claim count are both positive.

    The severity target is ClaimAmount / ClaimNb; a record with an amount but
    no claim count would turn it into a division by zero.
    """
    return frame[(frame['ClaimAmount'] > 0) & (frame['ClaimNb'] > 0)].copy()


_sev_non_features = ['Acost', 'Exposure', 'ClaimAmount', 'ClaimNb']

# Prepare data for severity model SHAP
X_train_cost_prep = _rows_with_positive_claims(X_train)
X_test_cost_prep = _rows_with_positive_claims(X_test)

X_train_sev = X_train_cost_prep.drop(columns=_sev_non_features, errors='ignore')
y_train_sev = X_train_cost_prep['ClaimAmount'] / X_train_cost_prep['ClaimNb']
sample_weight_sev = X_train_cost_prep['ClaimNb']  # Number of claims is the weight for severity

X_test_sev = X_test_cost_prep.drop(columns=_sev_non_features, errors='ignore')

# Plot SHAP for Severity
plot_glm_shap_importance(
    X_train=X_train_sev,
    X_test=X_test_sev,
    y_train=y_train_sev,
    sample_weight=sample_weight_sev,
    power=2,  # Power=2 for Gamma (severity)
    title="SHAP Feature Importance for Severity Model (Real Data)",
    max_display=10
)


# --- SHAP Feature Importance for Severity (Synthetic Data) ---
# Prepare data for severity model SHAP using synthetic data
X_train_cost_prep_synth = _rows_with_positive_claims(new_samples_df)
X_test_cost_prep_synth = _rows_with_positive_claims(X_test)  # Keep using real test data for explanation

X_train_sev_synth = X_train_cost_prep_synth.drop(columns=_sev_non_features, errors='ignore')
y_train_sev_synth = X_train_cost_prep_synth['ClaimAmount'] / X_train_cost_prep_synth['ClaimNb']
sample_weight_sev_synth = X_train_cost_prep_synth['ClaimNb']  # Number of claims is the weight for severity

X_test_sev_synth = X_test_cost_prep_synth.drop(columns=_sev_non_features, errors='ignore')


# Plot SHAP for Severity (Synthetic Data)
plot_glm_shap_importance(
    X_train=X_train_sev_synth,
    X_test=X_test_sev_synth,
    y_train=y_train_sev_synth,
    sample_weight=sample_weight_sev_synth,
    power=2,  # Power=2 for Gamma (severity)
    title="SHAP Feature Importance for Severity Model (Synthetic Data)",
    max_display=10
)
639
+
640
+
641
# ### Dataset 2: data handling
st.header('Dataset 2: swmotorcycle')

# Duplicated rows/columns are kept for inspection; only duplicated rows are removed.
df2_duplicated_rows = df2.loc[df2.duplicated()]
df2 = df2.drop_duplicates()
df2_duplicated_col = df2.columns[df2.columns.duplicated()]


# Derived and transformed features, built on a new frame so df2 stays untouched:
# - ClaimOcc: 1 when at least one claim was recorded, else 0
# - Exposure: capped at 1
# - VehAge: capped at 20
df_2 = df2.assign(
    ClaimOcc=np.where(df2['ClaimNb'] > 0, 1, 0),
    Exposure=df2['Exposure'].clip(upper=1),
    VehAge=df2['VehAge'].clip(upper=20),
)
655
+
656
+
657
# ### Encoding
# Category label -> integer code (as a digit string), one table per column.
# The same tables encode the real and the synthetic frame.
RiskClass_group = {'EV ratio 13-15':'1','EV ratio 20-24':'2','EV ratio 9-12':'3','EV ratio <5':'4','EV ratio 6-8':'5',
                   'EV ratio 16-19':'6','EV ratio >25':'7'}
BonusClass_group = {'BM1':'1','BM2':'2','BM3':'3','BM4':'4','BM5':'5','BM6':'6','BM7':'7'}
Area_group = {"Central parts of Sweden's three largest cities":'1','Lesser towns except Gotland; Northern towns':'2',
              'Small towns; countryside except Gotland; Northern towns':'3','Suburbs; middle-sized cities':'4',
              'Northern countryside':'5','Northern towns':'6',"Gotland (Sweden's largest island)":'7'}
Gender_group = {'Female':'0','Male':'1'}

_DF2_CATEGORY_CODES = {
    'RiskClass': RiskClass_group,
    'BonusClass': BonusClass_group,
    'Area': Area_group,
    'Gender': Gender_group,
}


def _encode_dataset2_categories(frame, frame_label):
    """Return a copy of *frame* with the dataset-2 categorical columns replaced by integer codes.

    Raises ValueError naming the column and the offending labels when a value
    has no code, instead of failing later on a NaN-to-int cast.
    """
    encoded = frame.copy()
    for column, codes in _DF2_CATEGORY_CODES.items():
        mapped = encoded[column].map(codes)
        unmapped = mapped.isna()
        if unmapped.any():
            unknown = sorted(map(str, encoded.loc[unmapped, column].unique()))
            raise ValueError(
                f"{frame_label}: column '{column}' contains values without a code: {unknown}"
            )
        encoded[column] = mapped.astype(int)
    return encoded


df2_encod = _encode_dataset2_categories(df_2, 'Dataset 2 (real)')


# ### Split dataset
# Split the dataset into train/test split
X_train, X_test = train_test_split(df2_encod, test_size=0.2, random_state=0)
st.markdown(f"**Train shape:** {X_train.shape} \n**Test shape:** {X_test.shape}")


# ### Use Generate Samples Dataframe
df2_synth_encod = _encode_dataset2_categories(df2_synth, 'Dataset 2 (synthetic)')

new_samples_df = df2_synth_encod.copy()
711
+
712
# Check consistency
st.subheader("Check consistency")

# A synthetic row is consistent when the three claim fields agree: either all
# of them are zero (no claim) or all of them are strictly positive (a claim).
_claim_nb = new_samples_df["ClaimNb"]
_claim_occ = new_samples_df["ClaimOcc"]
_claim_amount = new_samples_df["ClaimAmount"]

_all_zero = (_claim_nb == 0) & (_claim_occ == 0) & (_claim_amount == 0)
_all_positive = (_claim_nb > 0) & (_claim_occ > 0) & (_claim_amount > 0)

# Find inconsistencies: everything that matches neither pattern.
inconsistent_records = new_samples_df[~(_all_zero | _all_positive)]

_n_inconsistent = len(inconsistent_records)
st.write(f"Number of inconsistent records on synthetic data: {_n_inconsistent}")
st.write(inconsistent_records.head())  # Show a few inconsistent rows
st.write('Helps assess basic data fidelity by checking structural or logical violations.')
#st.write('The generative model replaced the business patterns in a right way')
724
+
725
+
726
# ### Visual Comparison
st.subheader('Univariate distribution comparison: real vs synthetic')
st.write('Shows how well each individual feature is mimicked by the synthetic data.')
#st.write('The model captures variables like ClaimAmount, ClaimOcc, ClaimNb and Gender in a good manner. \
#Meanwhile fails for the others.')

# Compare selected variables using histograms.
# Both frames must use the same encoding: X_train comes from the integer-encoded
# real data, so the synthetic side has to be the encoded `new_samples_df` rather
# than the raw `df2_synth`, whose RiskClass / Area / BonusClass / Gender columns
# still hold the original string labels.
compare_real_vs_synthetic(
    real_df=X_train,
    synthetic_df=new_samples_df,
    columns=['Exposure','VehAge','ClaimOcc','ClaimNb', 'ClaimAmount', 'RiskClass', 'Area','BonusClass','Gender'],
    kind='hist'
)
739
+
740
st.subheader('Correlation matrix comparison: real vs synthetic')
st.write('Evaluates preservation of feature-to-feature relationships.')
#st.write('The synthetic data nearly perfectly replicates the correlation structure, with identical \
#values across almost all variable pairs.')

# Compute correlation matrices
corr_matrix_X_train = X_train.corr()
corr_matrix_new_samples = new_samples_df.corr()

# One wide figure holding both heatmaps side by side (1 row, 2 columns).
fig = plt.figure(figsize=(30, 15))

_heatmap_panels = (
    (corr_matrix_X_train, 'Correlation Heatmap of X_train'),
    (corr_matrix_new_samples, 'Correlation Heatmap of New Samples'),
)
for _slot, (_corr, _panel_title) in enumerate(_heatmap_panels, start=1):
    # Select the _slot-th cell of the 1x2 grid as the current axes.
    plt.subplot(1, 2, _slot)
    sns.heatmap(
        _corr,
        square=True,
        annot=True,
        cmap='coolwarm',
        fmt='.2f',
        annot_kws={"size": 15},
    )
    plt.title(_panel_title, size=15)
    plt.yticks(rotation=0, fontsize=15)
    plt.xticks(rotation=90, fontsize=15)

# Display the plot
plt.tight_layout()
st.pyplot(fig)
770
+
771
+
772
# ### Statistical Analysis
# Kolmogorov-Smirnov test
st.subheader('Kolmogorov–Smirnov Test Results')
st.write('Quantifies the statistical distance between real and synthetic distributions.')
#st.write('Only four variables (Gender, ClaimNb, ClaimAmount, ClaimOcc) pass the KS test achieving \
#a perfect p = 1.0000 or close to it, but these successes are primarily on claim-related variables \
#while demographic and policy features are poorly reproduced.')


def _ks_row(feature):
    """Run the two-sample KS test for one column and return its result row."""
    real_values = X_train[feature].values
    synthetic_values = new_samples_df[feature].values
    ks_stat, ks_pvalue = ks_2samp(real_values, synthetic_values)
    return {
        "Feature": feature,
        "KS Statistic": ks_stat,
        "P-value": ks_pvalue,
    }


# One row per column of the real training data, in column order.
results = [_ks_row(column) for column in X_train.columns]

results_df = pd.DataFrame(results)
795
+
796
def color_pval(val):
    """Return the CSS colour rule for a p-value cell.

    Values below 0.05 are rendered red; everything else is rendered green.
    """
    if val < 0.05:
        return "color: red;"
    return "color: green;"
799
+
800
# Colour the p-value column and fix the number formatting of the KS table.
# `Styler.applymap` is deprecated in favour of `Styler.map` since pandas 2.1;
# use the new name when the installed pandas provides it and fall back to the
# old one otherwise, so the page runs without a FutureWarning on either side.
_ks_styler = results_df.style
_style_cells = getattr(_ks_styler, "map", None) or _ks_styler.applymap

styled_df = _style_cells(color_pval, subset=["P-value"]) \
    .format({"KS Statistic": "{:.4f}", "P-value": "{:.4f}"})

st.markdown("""
**Legend:**
- <span style='color:green;'>Green P-value</span>: distributions are **similar** (p ≥ 0.05)
- <span style='color:red;'>Red P-value</span>: distributions are **significantly different** (p < 0.05)
""", unsafe_allow_html=True)
st.dataframe(styled_df)
809
+
810
+
811
# ### PCA Analysis
st.subheader('PCA comparison')
st.write('Assesses similarity in global variance structure and major latent components.')
#st.write('The synthetic points exhibit nearly identical spread, density, and boundary \
#characteristics as the real data, with minimal outliers and no visible systematic shifts.')
# Load the saved models
#scaler = load('./LLM/scaler_pca_model_d2_llm_60.pkl')
#pca = load('./LLM/pca_model_d2_llm_60.pkl')

# The PCA plot is shipped as an image file; read it and show it without axes.
img = mpimg.imread('./LLM/pca_d2_60.png')
fig = plt.figure(figsize=(10, 8))
_pca_axes = fig.gca()
_pca_axes.imshow(img)
_pca_axes.axis('off')
st.pyplot(fig)
824
+
825
+
826
# ### UMAP Analysis
st.subheader('UMAP comparison')
st.write('Examines nonlinear manifold structure and clustering behavior.')
#st.write('The plot shows that synthetic points (red) closely overlap the real data (blue), \
#indicating the generative process preserves the global structure of the feature space. \
#Minor deviations appear at the edges, but overall the synthetic dataset replicates key clusters well.')

# The UMAP plot is shipped as an image file; read it and show it without axes.
img = mpimg.imread('./LLM/umap_d2_60.png')
fig = plt.figure(figsize=(10, 8))
_umap_axes = fig.gca()
_umap_axes.imshow(img)
_umap_axes.axis('off')
st.pyplot(fig)
837
+
838
+
839
# ### GLM Frequency Analysis
st.subheader('Frequency GLM Analysis')
st.write('Tests how well synthetic data preserves predictive relationships for claim frequency.')

# Baseline frequency model: trained on the real training split.
results_frequency_3 = run_glm_frequency_analysis(
    X_train,
    X_test,
    label="Baseline",
    var='Real',
)
# Same analysis trained on the synthetic samples, with exposure clipping enabled;
# the real test split is passed as the second argument in both runs.
results_frequency_4 = run_glm_frequency_analysis(
    new_samples_df,
    X_test,
    clip_exposure=True,
    label="Synthetic Clipped",
    var='Synthetic',
)


# ### GLM Cost Analysis
st.subheader('Severity GLM Analysis')
st.write('Evaluates whether severity-related predictors behave similarly on real and synthetic data.')

# Real run first, then the synthetic run flagged with is_sampled=True.
results_cost_3 = run_glm_cost_analysis(
    X_train,
    X_test,
    var='Real',
)
results_cost_4 = run_glm_cost_analysis(
    new_samples_df,
    X_test,
    is_sampled=True,
    var='Synthetic',
)
853
+
854
+
855
# ### Feature Importance Analysis

# --- SHAP Feature Importance for Frequency ---
st.subheader('SHAP Feature Importance for Frequency Model')
st.write('Shows whether drivers of frequency predictions remain consistent across datasets.')
#st.write('The frequency model demonstrates excellent stability across real and synthetic datasets: \
#both show OwnerAge as the dominant predictor followed by VehAge, with nearly identical feature importance \
#rankings and similar magnitude patterns.')

# Columns that are target / weight inputs rather than explanatory features.
_freq_excluded = ['Exposure', 'ClaimNb', 'ClaimAmount']

# Prepare data for frequency model SHAP
y_train_freq = X_train['ClaimNb']
sample_weight_freq = X_train['Exposure']
X_train_freq = X_train.drop(columns=_freq_excluded, errors='ignore')
X_test_freq = X_test.drop(columns=_freq_excluded, errors='ignore')

# Keep only rows with Exposure > 0: the target below divides by exposure.
mask_train_freq = sample_weight_freq > 0
X_train_freq_filtered = X_train_freq.loc[mask_train_freq]
y_train_freq_filtered = y_train_freq.loc[mask_train_freq]
sample_weight_freq_filtered = sample_weight_freq.loc[mask_train_freq]

# Apply the same Exposure > 0 restriction to the test features.
mask_test_freq = X_test['Exposure'] > 0
X_test_freq_filtered = X_test_freq.loc[mask_test_freq]

# Target is the claim rate (ClaimNb / Exposure).
_claim_rate_real = y_train_freq_filtered / sample_weight_freq_filtered

# Plot SHAP for Frequency
plot_glm_shap_importance(
    X_train=X_train_freq_filtered,
    X_test=X_test_freq_filtered,
    y_train=_claim_rate_real,
    sample_weight=sample_weight_freq_filtered,
    power=1,  # Power=1 for Poisson (frequency)
    title="SHAP Feature Importance for Frequency Model (Real Data)",
    max_display=10,
)
891
+
892
# --- SHAP Feature Importance for Frequency (Synthetic Data) ---
# Columns that are target / weight inputs rather than explanatory features.
_freq_excluded_synth = ['Exposure', 'ClaimNb', 'ClaimAmount']

# Prepare data for frequency model SHAP using synthetic data
y_train_freq_synth = new_samples_df['ClaimNb']
sample_weight_freq_synth = new_samples_df['Exposure']
X_train_freq_synth = new_samples_df.drop(columns=_freq_excluded_synth, errors='ignore')

# X_test_freq is the same as before (real test data)
X_test_freq = X_test.drop(columns=_freq_excluded_synth, errors='ignore')

# Keep only rows with Exposure > 0: the target below divides by exposure.
mask_train_freq_synth = sample_weight_freq_synth > 0
X_train_freq_synth_filtered = X_train_freq_synth.loc[mask_train_freq_synth]
y_train_freq_synth_filtered = y_train_freq_synth.loc[mask_train_freq_synth]
sample_weight_freq_synth_filtered = sample_weight_freq_synth.loc[mask_train_freq_synth]

# Apply the same Exposure > 0 restriction to the test features.
mask_test_freq = X_test['Exposure'] > 0
X_test_freq_filtered = X_test_freq.loc[mask_test_freq]

# Target is the claim rate (ClaimNb / Exposure).
_claim_rate_synth = y_train_freq_synth_filtered / sample_weight_freq_synth_filtered

# Plot SHAP for Frequency (Synthetic Data)
plot_glm_shap_importance(
    X_train=X_train_freq_synth_filtered,
    X_test=X_test_freq_filtered,
    y_train=_claim_rate_synth,
    sample_weight=sample_weight_freq_synth_filtered,
    power=1,  # Power=1 for Poisson (frequency)
    title="SHAP Feature Importance for Frequency Model (Synthetic Data)",
    max_display=10,
)
921
+
922
# --- SHAP Feature Importance for Severity ---
st.subheader('SHAP Feature Importance for Severity Model')
st.write('Assesses stability of model explanations for severity outcomes')
#st.write('The severity model shows strong consistency between real and synthetic data: \
#VehAge clearly dominates as the primary driver in both datasets, followed by OwnerAge \
#as a distant second.')

# Columns that are target / weight inputs rather than explanatory features.
_sev_excluded = ['Acost', 'Exposure', 'ClaimAmount', 'ClaimNb']

# Prepare data for severity model SHAP: only rows with a non-zero claim amount.
_train_has_amount = X_train['ClaimAmount'] != 0
_test_has_amount = X_test['ClaimAmount'] != 0
X_train_cost_prep = X_train.loc[_train_has_amount].copy()
X_test_cost_prep = X_test.loc[_test_has_amount].copy()

X_train_sev = X_train_cost_prep.drop(columns=_sev_excluded, errors='ignore')
X_test_sev = X_test_cost_prep.drop(columns=_sev_excluded, errors='ignore')

# Severity target is the average amount per claim, weighted by the claim count.
sample_weight_sev = X_train_cost_prep['ClaimNb']  # Number of claims is the weight for severity
y_train_sev = X_train_cost_prep['ClaimAmount'] / X_train_cost_prep['ClaimNb']

# Plot SHAP for Severity
plot_glm_shap_importance(
    X_train=X_train_sev,
    X_test=X_test_sev,
    y_train=y_train_sev,
    sample_weight=sample_weight_sev,
    power=2,  # Power=2 for Gamma (severity)
    title="SHAP Feature Importance for Severity Model (Real Data)",
    max_display=10,
)
948
+
949
# --- SHAP Feature Importance for Severity (Synthetic Data) ---
# Prepare data for severity model SHAP using synthetic data.
# Synthetic rows are not guaranteed to be internally consistent (the consistency
# check earlier on this page counts rows where the claim fields disagree), so a
# row may carry a claim amount with ClaimNb == 0. Dividing by ClaimNb would then
# produce an infinite severity target with a zero weight. Keep only rows where
# both the amount and the claim count are strictly positive.
_synth_has_claim = (new_samples_df['ClaimAmount'] > 0) & (new_samples_df['ClaimNb'] > 0)
X_train_cost_prep_synth = new_samples_df[_synth_has_claim].copy()
X_test_cost_prep_synth = X_test[X_test['ClaimAmount'] != 0].copy()  # Keep using real test data for explanation

# Columns that are target / weight inputs rather than explanatory features.
_sev_excluded_synth = ['Acost', 'Exposure', 'ClaimAmount', 'ClaimNb']

X_train_sev_synth = X_train_cost_prep_synth.drop(columns=_sev_excluded_synth, errors='ignore')
# Severity target is the average amount per claim, weighted by the claim count.
y_train_sev_synth = X_train_cost_prep_synth['ClaimAmount'] / X_train_cost_prep_synth['ClaimNb']
sample_weight_sev_synth = X_train_cost_prep_synth['ClaimNb']  # Number of claims is the weight for severity

X_test_sev_synth = X_test_cost_prep_synth.drop(columns=_sev_excluded_synth, errors='ignore')


# Plot SHAP for Severity (Synthetic Data)
plot_glm_shap_importance(
    X_train=X_train_sev_synth,
    X_test=X_test_sev_synth,
    y_train=y_train_sev_synth,
    sample_weight=sample_weight_sev_synth,
    power=2,  # Power=2 for Gamma (severity)
    title="SHAP Feature Importance for Severity Model (Synthetic Data)",
    max_display=10
)
971
+
972
+
973
# ### Results
st.subheader('Overall results')

# One entry per table row: (row label, frequency result, cost result).
# The frequency results keep their metrics dictionary at position 1; the cost
# results are indexed by metric name directly.
_result_sources = (
    ('dataset 1', results_frequency_1, results_cost_1),
    ('synthetic dataset 1', results_frequency_2, results_cost_2),
    ('dataset 2', results_frequency_3, results_cost_3),
    ('synthetic dataset 2', results_frequency_4, results_cost_4),
)

_metric_columns = ['mpd_train', 'mpd_test', 'mgd_train', 'mgd_test']

_row_labels = []
_metric_rows = []
for _label, _frequency_result, _cost_result in _result_sources:
    _frequency_metrics = _frequency_result[1]
    _row_labels.append(_label)
    _metric_rows.append({
        'mpd_train': _frequency_metrics['mpd_train'],
        'mpd_test': _frequency_metrics['mpd_test'],
        'mgd_train': _cost_result['mgd_train'],
        'mgd_test': _cost_result['mgd_test'],
    })

# Create the DataFrame: one row per dataset, one column per deviance metric.
# Row and column order are given explicitly so the table layout is fixed.
df_tot = pd.DataFrame(_metric_rows, index=_row_labels, columns=_metric_columns)
st.dataframe(df_tot)
1057
+ #st.write('These results demonstrate excellent synthetic data quality: \
1058
+ #the mean poisson deviance (mpd) and mean gamma deviance (mgd) metrics are \
1059
+ #nearly identical between real and synthetic datasets for both dataset 1 and dataset 2. \
1060
+ #This suggests the synthetic data accurately preserves the statistical properties and \
1061
+ #predictive complexity of the original data')
1062
+
1063
+
1064
# barplot comparison: all four metrics grouped per dataset.
fig, ax = plt.subplots(figsize=(9, 5))
df_tot.plot(kind='bar', ax=ax)

ax.set_title('Comparison of MPD and MGD Metrics')
ax.set_ylabel('Value')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
ax.legend(title='Metric')

# Annotate every bar with its value, then shrink the annotation text.
for _bar_group in ax.containers:
    _bar_texts = ax.bar_label(_bar_group, fmt='%.2f', label_type='edge', padding=2)
    for _bar_text in _bar_texts:
        _bar_text.set_fontsize(8)

plt.tight_layout()
st.pyplot(fig)
1078
+ #st.write('This visualization confirms the strong fidelity of the synthetic data. \
1079
+ #The first synthetic dataset performs slightly better on frequency')
1080
+
1081
+
1082
# MPD / MGD: Train vs Test Comparison, one panel per deviance metric.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

mpd_data = df_tot[['mpd_train', 'mpd_test']]
mgd_data = df_tot[['mgd_train', 'mgd_test']]

# (axes, data, bar colours, panel title, y-axis label) for each panel.
_deviance_panels = (
    (axes[0], mpd_data, ['#2ecc71', '#e74c3c'],
     'Mean Poisson Deviance: Train vs Test', 'MPD Value'),
    (axes[1], mgd_data, ['#3498db', '#f39c12'],
     'Mean Gamma Deviance: Train vs Test', 'MGD Value'),
)

for _panel_ax, _panel_data, _panel_colors, _panel_title, _panel_ylabel in _deviance_panels:
    _panel_data.plot(kind='bar', ax=_panel_ax, color=_panel_colors)

    _panel_ax.set_title(_panel_title, fontsize=16, fontweight='bold')
    _panel_ax.set_ylabel(_panel_ylabel, fontsize=14)
    _panel_ax.set_xlabel('Dataset', fontsize=14)
    _panel_ax.legend(['Train', 'Test'], fontsize=10)

    # Larger tick labels
    _panel_ax.tick_params(axis='x', labelsize=12, rotation=45)
    _panel_ax.tick_params(axis='y', labelsize=12)

    _panel_ax.grid(axis='y', alpha=0.3)
    for _bar_group in _panel_ax.containers:
        _panel_ax.bar_label(_bar_group, fmt='%.3f', fontsize=15)

plt.tight_layout()
st.pyplot(fig)
1121
+ #st.write('This comparison reveals excellent synthetic data quality with minimal \
1122
+ #train-test gaps. The synthetic generation process maintains distributional properties, \
1123
+ #and also model generalization characteristics.')
1124
+
1125
# Create a heatmap of the full metrics table (datasets x metrics).
fig, ax = plt.subplots(figsize=(10, 6))

_heatmap_options = {
    'annot': True,
    'fmt': '.3f',
    'cmap': 'RdYlGn_r',
    'linewidths': 0.5,
    'ax': ax,
    'cbar_kws': {'label': 'Deviance Value'},
}
sns.heatmap(df_tot, **_heatmap_options)

ax.set_title(
    'Performance Heatmap: All Metrics Across Datasets',
    fontsize=15,
    fontweight='bold',
    pad=20,
)
ax.set_xlabel('Metrics')
ax.set_ylabel('Datasets')

plt.tight_layout()
st.pyplot(fig)
1136
+ #st.write('The heatmap with the near-identical color patterns between real and synthetic versions \
1137
+ #of each dataset confirm excellent replication fidelity. Dataset 2 shows dramatically \
1138
+ #lower MPD values (green, ~0.28-0.44) compared to dataset 1 (orange-red, ~1.43-1.75), while MGD \
1139
+ #values remain similarly high across both, suggesting dataset 2 represents a different \
1140
+ #modeling challenge that the synthetic generation process successfully preserves.')