Spaces:

towardsinnovationlab
/

Generative_Models_4_Insurance_Data

Running

App Files Files Community

towardsinnovationlab committed on Dec 15, 2025

Commit

5993cd4

verified ·

1 Parent(s): dc33249

Upload LLM_Trial_2.py

Browse files

Files changed (1) hide show

pages/LLM_Trial_2.py +1140 -0

pages/LLM_Trial_2.py ADDED Viewed

	@@ -0,0 +1,1140 @@

+import os
+import random
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import matplotlib.image as mpimg
+import seaborn as sns
+from matplotlib.pyplot import subplots
+from sklearn.model_selection import train_test_split
+from sklearn.model_selection import KFold
+from sklearn.metrics import mean_poisson_deviance, mean_gamma_deviance, make_scorer
+from scipy.stats import ks_2samp
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+from mpl_toolkits.mplot3d import Axes3D
+from sklearn.linear_model import TweedieRegressor
+import shap
+from sklearn.mixture import GaussianMixture
+from joblib import dump
+from joblib import load
+import streamlit as st
+import warnings
+# Suppress every Python warning for the rest of the script.
+warnings.filterwarnings('ignore')
+DEFAULT_RANDOM_SEED = 0 # Seed applied below to Python's `random`, PYTHONHASHSEED and NumPy (nothing else is seeded here)
+random.seed(DEFAULT_RANDOM_SEED)
+# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it here presumably only affects child processes - confirm this is intended.
+os.environ['PYTHONHASHSEED'] = str(DEFAULT_RANDOM_SEED)
+np.random.seed(DEFAULT_RANDOM_SEED)
+# Title
+st.title("Large Language Model GPT-5.1: Synthetic Data Generation Analysis")
+def compare_real_vs_synthetic(real_df, synthetic_df, columns=None, kind='hist', bins=30, figsize=(15, 10)):
+    """
+    Compare distributions between real and synthetic datasets.
+    One subplot per column is drawn on a two-column grid, with the real and
+    synthetic series overlaid, and the finished figure is rendered with st.pyplot.
+    Parameters:
+    - real_df: pd.DataFrame, the original dataset
+    - synthetic_df: pd.DataFrame, the synthetic dataset; must contain every compared column
+    - columns: list of column names to compare; if None, every non-object column of real_df is used
+    - kind: str, type of plot: 'hist', 'kde', or 'box'
+    - bins: int, number of bins for histograms
+    - figsize: tuple, size of the plot figure
+    Returns:
+    - None (displays plots)
+    Raises:
+    - ValueError: if kind is not 'hist', 'kde', or 'box'
+    """
+    # Reject an unknown plot kind before any figure exists, so nothing is left half-drawn.
+    if kind not in ('hist', 'kde', 'box'):
+        raise ValueError("Unsupported plot kind. Choose from 'hist', 'kde', or 'box'.")
+    if columns is None:
+        columns = [col for col in real_df.columns if real_df[col].dtype != 'object']
+    n_cols = 2
+    n_rows = (len(columns) + 1) // n_cols  # ceiling division for the two-column grid
+    fig = plt.figure(figsize=figsize)
+    for idx, col in enumerate(columns, 1):
+        # Draw on an explicit Axes rather than pyplot's implicit "current axes".
+        ax = fig.add_subplot(n_rows, n_cols, idx)
+        if kind == 'hist':
+            sns.histplot(real_df[col], color='blue', label='Real', kde=False, stat='density', bins=bins, alpha=0.6, ax=ax)
+            sns.histplot(synthetic_df[col], color='red', label='Synthetic', kde=False, stat='density', bins=bins, alpha=0.6, ax=ax)
+        elif kind == 'kde':
+            sns.kdeplot(real_df[col], color='blue', label='Real', ax=ax)
+            sns.kdeplot(synthetic_df[col], color='red', label='Synthetic', ax=ax)
+        else:  # kind == 'box'
+            sns.boxplot(data=[real_df[col], synthetic_df[col]], palette=['blue', 'red'], ax=ax)
+            ax.set_xticks([0, 1])
+            ax.set_xticklabels(['Real', 'Synthetic'])
+        ax.set_title(f"Comparison for '{col}'")
+        ax.legend()
+    fig.tight_layout()
+    st.pyplot(fig)
+    # The figure has been rendered; close it so pyplot does not keep it alive across reruns.
+    plt.close(fig)
+def run_glm_frequency_analysis(
+    X_train, X_test, model=None, clip_exposure=False, random_state=0, label="Model", var=None):
+    """
+    Run GLM Poisson regression frequency analysis (ClaimNb ~ Features | Exposure).
+    The regressor is fitted on the claim rate (ClaimNb / Exposure) with Exposure as
+    sample weight, scored with 5-fold cross-validation on the training rows, refitted
+    on all training rows, and the train/test deviances are written to the Streamlit page.
+    Parameters:
+    - X_train: pd.DataFrame with ['Exposure', 'ClaimNb', ...]
+    - X_test: pd.DataFrame with ['Exposure', 'ClaimNb', ...]
+    - model: sklearn regressor, default is TweedieRegressor(power=1, link='log')
+    - clip_exposure: bool, if True, caps Exposure at 1 in training set
+    - random_state: int, seed handed to np.random.seed for reproducibility
+    - label: str, name of the run; shown in the deviance lines when var is None
+    - var: str or None, data-source tag shown in the deviance lines (e.g. 'Real')
+    Returns:
+    - trained_model: fitted model
+    - results: dict with CV scores, deviance on train/test, and predictions
+    """
+    # Honour the caller's seed; it used to be hard-coded to 0 regardless of random_state.
+    np.random.seed(random_state)
+    # Optionally clip exposure in training data (on a copy, so the caller's frame is untouched)
+    if clip_exposure:
+        X_train = X_train.copy()
+        X_train['Exposure'] = np.where(X_train['Exposure'] > 1, 1, X_train['Exposure'])
+    # A claim rate is undefined for zero exposure, so keep only rows with Exposure > 0
+    X_train_f = X_train[X_train['Exposure'] > 0].copy()
+    X_test_f = X_test[X_test['Exposure'] > 0].copy()
+    y_train = X_train_f['ClaimNb']
+    y_test = X_test_f['ClaimNb']
+    exposure_train = X_train_f['Exposure']
+    exposure_test = X_test_f['Exposure']
+    # Targets and weights must not leak into the feature matrix
+    non_features = ['Exposure', 'ClaimNb', 'ClaimAmount']
+    X_train_ = X_train_f.drop(non_features, axis=1, errors='ignore')
+    X_test_ = X_test_f.drop(non_features, axis=1, errors='ignore')
+    # Set model if not passed
+    if model is None:
+        model = TweedieRegressor(power=1, link='log')
+    # Cross-validation on contiguous (unshuffled) folds
+    # NOTE(review): the deviances below are computed without sample_weight although the
+    # model is fitted with exposure weights - confirm the unweighted metric is intended.
+    cv = KFold(n_splits=5)
+    mpd_scores = []
+    for train_idx, val_idx in cv.split(X_train_):
+        X_tr, X_val = X_train_.iloc[train_idx], X_train_.iloc[val_idx]
+        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
+        w_tr, w_val = exposure_train.iloc[train_idx], exposure_train.iloc[val_idx]
+        model.fit(X_tr, y_tr / w_tr, sample_weight=w_tr)
+        mpd_scores.append(mean_poisson_deviance(y_val / w_val, model.predict(X_val)))
+    # Final fit on full training set
+    model.fit(X_train_, y_train / exposure_train, sample_weight=exposure_train)
+    pred_train = model.predict(X_train_)
+    pred_test = model.predict(X_test_)
+    mpd_train = mean_poisson_deviance(y_train / exposure_train, pred_train)
+    mpd_test = mean_poisson_deviance(y_test / exposure_test, pred_test)
+    # Fall back to `label` so the page never shows the literal text "None".
+    tag = label if var is None else var
+    st.write(f"Train Poisson {tag} Deviance: {mpd_train:.4f}")
+    st.write(f"Test Poisson {tag} Deviance: {mpd_test:.4f}")
+    return model, {
+        "cv_scores": mpd_scores,
+        "mpd_train": mpd_train,
+        "mpd_test": mpd_test,
+        "train_predictions": pred_train,
+        "test_predictions": pred_test
+    }
+def run_glm_cost_analysis(X_train, X_test, is_sampled=False, verbose=True, var=None):
+    """
+    Perform GLM Cost Analysis using Tweedie Regressor (power=2, link='log').
+    The target is the average cost per claim (ClaimAmount / ClaimNb) and the sample
+    weight is ClaimNb. Only rows with ClaimNb > 0 and ClaimAmount > 0 are modelled, so
+    inconsistent records (an amount without a claim count, or a negative amount) cannot
+    produce a zero, negative or infinite target.
+    Parameters:
+    - X_train: Training DataFrame (must include 'ClaimAmount', 'ClaimNb', 'Exposure')
+    - X_test: Testing DataFrame
+    - is_sampled: If True, cap 'Exposure' at 1 for training data
+    - verbose: If True, write the train/test gamma deviance to the Streamlit page
+    - var: data-source tag inserted into the displayed deviance lines (e.g. 'Real')
+    Returns:
+    - Dictionary containing CV scores, train/test gamma deviance and predictions
+    """
+    np.random.seed(0)
+    # Cap exposure if sampled (on a copy; Exposure is dropped from the features below)
+    if is_sampled:
+        X_train = X_train.copy()
+        X_train['Exposure'] = np.where(X_train['Exposure'] > 1, 1, X_train['Exposure'])
+    def _claims_with_average_cost(frame):
+        # Keep rows with a strictly positive claim count and amount, then add 'Acost'.
+        has_claims = (frame['ClaimNb'] > 0) & (frame['ClaimAmount'] > 0)
+        claims = frame[has_claims].copy()
+        claims['Acost'] = claims['ClaimAmount'] / claims['ClaimNb']
+        return claims
+    X_train_cost = _claims_with_average_cost(X_train)
+    X_test_cost = _claims_with_average_cost(X_test)
+    # Target and weights
+    y_train = X_train_cost['Acost']
+    claim_tr = X_train_cost['ClaimNb']
+    y_test = X_test_cost['Acost']
+    # Features
+    drop_cols = ['Acost', 'Exposure', 'ClaimAmount', 'ClaimNb']
+    X_train_ = X_train_cost.drop(columns=drop_cols)
+    X_test_ = X_test_cost.drop(columns=drop_cols)
+    # Initialize model
+    glm_cl = TweedieRegressor(power=2, link='log')
+    # Cross-validation on shuffled folds with a fixed seed
+    cv = KFold(n_splits=5, shuffle=True, random_state=0)
+    mgd_scores = []
+    for train_idx, val_idx in cv.split(X_train_):
+        X_tr, X_val = X_train_.iloc[train_idx], X_train_.iloc[val_idx]
+        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
+        glm_cl.fit(X_tr, y_tr, sample_weight=claim_tr.iloc[train_idx])
+        mgd_scores.append(mean_gamma_deviance(y_val, glm_cl.predict(X_val)))
+    # Train on full data
+    glm_cl.fit(X_train_, y_train, sample_weight=claim_tr)
+    # Predictions
+    y_pred_train = glm_cl.predict(X_train_)
+    y_pred_test = glm_cl.predict(X_test_)
+    # Deviance on train and test
+    mgd_train = mean_gamma_deviance(y_train, y_pred_train)
+    mgd_test = mean_gamma_deviance(y_test, y_pred_test)
+    if verbose:
+        st.write(f"Train Gamma {var} Deviance: {mgd_train:.4f}")
+        st.write(f"Test Gamma {var} Deviance: {mgd_test:.4f}")
+    return {
+        "cv_scores": mgd_scores,
+        'mgd_train': mgd_train,
+        'mgd_test': mgd_test,
+        'y_pred_train': y_pred_train,
+        'y_pred_test': y_pred_test
+    }
+def plot_glm_shap_importance(
+    X_train, X_test, y_train, sample_weight,
+    power: int, title: str, max_display: int = 10, figsize: tuple = (5, 5), seed: int = 0):
+    """
+    Compute and plot SHAP feature importance for GLMs using SHAP LinearExplainer.
+    A TweedieRegressor (log link) is fitted on the training data, SHAP values are
+    computed for X_test, and a bar summary plot is rendered with st.pyplot.
+    Parameters:
+        X_train (pd.DataFrame): Training features
+        X_test (pd.DataFrame): Test features
+        y_train (pd.Series or np.array): Training target
+        sample_weight (pd.Series or np.array): Sample weights
+        power (int): Tweedie power (1 = Poisson for frequency, 2 = Gamma for severity)
+        title (str): Title for the plot
+        max_display (int): Max number of features to display
+        figsize (tuple): Size of the figure
+        seed (int): Random seed for reproducibility
+    """
+    np.random.seed(seed)
+    model = TweedieRegressor(power=power, link='log')
+    model.fit(X_train, y_train, sample_weight=sample_weight)
+    # The training features serve as the background distribution for the explainer.
+    masker = shap.maskers.Independent(X_train)
+    explainer = shap.LinearExplainer(model, masker=masker)
+    shap_values = explainer.shap_values(X_test)
+    plt.figure(figsize=figsize)
+    shap.summary_plot(
+        shap_values, features=X_test,
+        feature_names=X_test.columns,
+        plot_type='bar',
+        max_display=max_display,
+        # plot_size=None stops summary_plot from resizing the figure, so `figsize` takes effect.
+        plot_size=None,
+        show=False
+    )
+    plt.title(title, fontsize=12)
+    plt.tight_layout()
+    fig = plt.gcf()
+    st.pyplot(fig)
+    # The figure has been rendered; close it so pyplot does not keep it alive across reruns.
+    plt.close(fig)
+# ### Upload datasets
+#-------------------
+# DATASETS
+#-------------------
+# Real datasets, read from paths relative to the working directory.
+df1=pd.read_csv('./data/ausprivauto0405.csv')
+df2=pd.read_csv('./data/swmotorcycle.csv')
+# Synthetic counterparts of dataset 1 and dataset 2 (presumably the LLM-generated samples - confirm).
+df1_synth=pd.read_csv('./LLM/synthetic_nonlife_53320_D1_60.csv')
+# Disabled: removal of a stray "Unnamed: 0" index column from the synthetic CSV.
+#df1_synth = df1_synth.drop(columns=["Unnamed: 0"])
+df2_synth=pd.read_csv('./LLM/synthetic_nonlife_51638_D2_60.csv')
+# Disabled: same index-column removal for the second synthetic CSV.
+#df2_synth = df2_synth.drop(columns=["Unnamed: 0"])
+# ### dataset 1 and data handling
+st.header('Dataset 1: ausprivauto0405')
+# Record duplicated rows/columns, then continue with the de-duplicated frame.
+df1_duplicated_rows = df1.loc[df1.duplicated()]
+df1 = df1.drop_duplicates()
+df1_duplicated_col = df1.columns[df1.columns.duplicated()]
+# ### Encoding
+# Category label -> code; codes are stored as text and cast to int after mapping.
+VehAge_group = {'old cars':'1','young cars':'2','oldest cars':'3','youngest cars':'4'}
+DrivAge_group = {'young people':'1','older work. people':'2','oldest people':'3','working people':'4','old people':'5','youngest people':'6'}
+VehBody_group = {'Hatchback':'1','Utility':'2','Station wagon':'3','Hardtop':'4','Panel van':'5','Sedan':'6','Truck':'7',
+                 'Coupe':'8', 'Minibus':'9', 'Motorized caravan':'10', 'Bus':'11', 'Convertible':'12','Roadster':'13'}
+Gender_group = {'Female':'0','Male':'1'}
+df1_encod = df1.copy()
+for feature, codes in (('VehAge', VehAge_group), ('DrivAge', DrivAge_group),
+                       ('VehBody', VehBody_group), ('Gender', Gender_group)):
+    df1_encod[feature] = df1_encod[feature].map(codes).astype(int)
+# ### Split dataset
+# 80/20 train/test split with a fixed seed
+X_train, X_test = train_test_split(df1_encod, random_state=0, test_size=0.2)
+st.markdown(f"**Train shape:** {X_train.shape}  \n**Test shape:** {X_test.shape}")
+# ### Use Generate Samples Dataframe
+# Encode the synthetic frame with the same category codes used for the real data.
+df1_synth_encod = df1_synth.copy()
+for feature, codes in (('VehAge', VehAge_group), ('DrivAge', DrivAge_group),
+                       ('VehBody', VehBody_group), ('Gender', Gender_group)):
+    df1_synth_encod[feature] = df1_synth_encod[feature].map(codes).astype(int)
+new_samples_df = df1_synth_encod.copy()
+# Check consistency
+st.subheader("Check consistency")
+# A record is coherent when its three claim fields are all zero or all strictly positive.
+no_claim = (
+    (new_samples_df["ClaimNb"] == 0) & (new_samples_df["ClaimOcc"] == 0) & (new_samples_df["ClaimAmount"] == 0)
+)
+with_claim = (
+    (new_samples_df["ClaimNb"] > 0) & (new_samples_df["ClaimOcc"] > 0) & (new_samples_df["ClaimAmount"] > 0)
+)
+# Find inconsistencies: everything that is neither "no claim" nor "with claim"
+inconsistent_records = new_samples_df[~(no_claim | with_claim)]
+st.write(f"Number of inconsistent records on synthetic data: {len(inconsistent_records)}")
+st.write(inconsistent_records.head())  # Show a few inconsistent rows
+st.write('Helps assess basic data fidelity by checking structural or logical violations.')
+#st.write('The generative model successfully learned the essential business logic')
+# ### Visual Comparison
+# Compare selected variables using histograms
+st.subheader("Univariate distribution comparison: real vs synthetic")
+st.write('Shows how well each individual feature is mimicked by the synthetic data.')
+#st.write('The model captures variables like Exposure, VehValue, ClaimAmount, ClaimOcc, and \
+#ClaimNb reasonably well, showing similar overall shapes and ranges. Meanwhile for the others \
+#show a poor replication.')
+# Both sides must share one encoding: X_train comes from the integer-encoded real data,
+# so the synthetic side is the encoded new_samples_df, not the raw df1_synth whose
+# categorical columns (VehBody, DrivAge, VehAge, Gender) still hold text labels.
+compare_real_vs_synthetic(
+    real_df=X_train,
+    synthetic_df=new_samples_df,
+    columns=['Exposure','VehBody','VehValue','ClaimOcc','ClaimNb', 'ClaimAmount', 'DrivAge', 'VehAge','Gender'],
+    kind='hist'
+)
+st.subheader("Correlation matrix comparison: real vs synthetic")
+st.write('Evaluates preservation of feature-to-feature relationships.')
+#st.write('Overall the correlation structure is well-preserved, indicating this synthetic data \
+#generation method maintains feature relationships effectively')
+# Compute correlation matrices
+corr_matrix_X_train = X_train.corr()
+corr_matrix_new_samples = new_samples_df.corr()
+# Set figure size
+fig=plt.figure(figsize=(30,15))
+# a subplot grid
+# Parameters (1, 2, 1) implies 1 row, 2 columns, and this plot is the 1st plot.
+plt.subplot(1, 2, 1) # Subplot 1
+sns.heatmap(corr_matrix_X_train, square=True, annot=True, cmap='coolwarm', fmt='.2f',annot_kws={"size": 15})
+plt.title('Correlation Heatmap of X_train', size=15)
+plt.yticks(rotation=0,fontsize=15)
+plt.xticks(rotation=90,fontsize=15)
+# another subplot for the second heatmap
+plt.subplot(1, 2, 2) # Subplot 2
+sns.heatmap(corr_matrix_new_samples, square=True, annot=True, cmap='coolwarm', fmt='.2f',annot_kws={"size": 15})
+plt.title('Correlation Heatmap of New Samples', size=15)
+plt.yticks(rotation=0,fontsize=15)
+plt.xticks(rotation=90,fontsize=15)
+# Display the plot
+plt.tight_layout()
+st.pyplot(fig)
+# Release the large figure once rendered so it does not accumulate across reruns.
+plt.close(fig)
+# ### Statistical Analysis
+# Kolmogorov-Smirnov test
+st.subheader("Kolmogorov–Smirnov Test Results")
+st.write('Quantifies the statistical distance between real and synthetic distributions.')
+#st.write('Five variables (VehAge, VehBody, Gender, ClaimOcc, ClaimNb) pass the KS test \
+#with p ≥ 0.05, demonstrating good distributional similarity.')
+# Two-sample KS test per column: real training values against synthetic values.
+ks_outcomes = [
+    (name, ks_2samp(X_train[name].values, new_samples_df[name].values))
+    for name in X_train.columns
+]
+results = [
+    {"Feature": name, "KS Statistic": outcome.statistic, "P-value": outcome.pvalue}
+    for name, outcome in ks_outcomes
+]
+results_df = pd.DataFrame(results)
+def color_pval(val):
+    """Return the CSS colour rule for a p-value: red below 0.05, green otherwise."""
+    if val < 0.05:
+        return "color: red;"
+    return "color: green;"
+# Styler.applymap is deprecated in favour of Styler.map (pandas 2.1+); use the new name
+# when it exists and fall back to applymap on older pandas.
+_pvalue_styler = results_df.style
+_style_elementwise = getattr(_pvalue_styler, "map", None) or _pvalue_styler.applymap
+styled_df = _style_elementwise(color_pval, subset=["P-value"]) \
+                            .format({"KS Statistic": "{:.4f}", "P-value": "{:.4f}"})
+st.markdown("""
+**Legend:**
+- <span style='color:green;'>Green P-value</span>: distributions are **similar** (p ≥ 0.05)
+- <span style='color:red;'>Red P-value</span>: distributions are **significantly different** (p < 0.05)
+""", unsafe_allow_html=True)
+st.dataframe(styled_df)
+# ### PCA Analysis
+st.subheader('PCA comparison')
+st.write('Assesses similarity in global variance structure and major latent components.')
+#st.write('The synthetic data points substantially overlap with the real data in the principal component space, \
+#indicating the synthetic generation method successfully captures the main variance structure and multivariate  \
+#relationships present in the original dataset.')
+# Show the pre-rendered PCA plot stored as an image (no model is loaded or fitted here)
+img = mpimg.imread('./LLM/pca_d1_60.png')
+fig=plt.figure(figsize=(10, 8))
+plt.imshow(img)
+plt.axis('off')
+st.pyplot(fig)
+plt.close(fig)  # release the figure once Streamlit has rendered it
+# ### UMAP Analysis
+st.subheader('UMAP comparison')
+st.write('Examines nonlinear manifold structure and clustering behavior.')
+#st.write('This visualization shows a strong co-location across all three dimensions \
+#indicating the synthetic data successfully captures the complex, high-dimensional structure \
+#of the real data, preserving both local neighborhoods and global manifold geometry essential \
+#for downstream modeling tasks.')
+# Show the pre-rendered UMAP plot stored as an image
+img = mpimg.imread('./LLM/umap_d1_60.png')
+fig=plt.figure(figsize=(10, 8))
+plt.imshow(img)
+plt.axis('off')
+st.pyplot(fig)
+plt.close(fig)  # release the figure once Streamlit has rendered it
+# ### GLM Frequency Analysis
+st.subheader('Frequency GLM Analysis')
+st.write('Tests how well synthetic data preserves predictive relationships for claim frequency.')
+# Baseline: train on the real training split, evaluate on the real test split
+results_frequency_1 = run_glm_frequency_analysis(
+    X_train=X_train, X_test=X_test, label="Baseline", var='Real')
+# Train on the synthetic samples (exposure capped at 1), evaluate on the same real test split
+results_frequency_2 = run_glm_frequency_analysis(
+    X_train=new_samples_df, X_test=X_test, clip_exposure=True, label="Synthetic Clipped", var='Synthetic')
+# ### GLM Cost Analysis
+st.subheader('Severity GLM Analysis')
+st.write('Evaluates whether severity-related predictors behave similarly on real and synthetic data.')
+results_cost_1 = run_glm_cost_analysis(X_train=X_train, X_test=X_test, var='Real')
+results_cost_2 = run_glm_cost_analysis(X_train=new_samples_df, X_test=X_test, is_sampled=True, var='Synthetic')
+# ### Feature Importance Analysis
+# --- SHAP Feature Importance for Frequency ---
+st.subheader('SHAP Feature Importance for Frequency Model')
+st.write('Shows whether drivers of frequency predictions remain consistent across datasets.')
+#st.write('This SHAP analysis reveals good model consistency: ClaimOcc (claim occurrence) dominates feature importance \
+#in both real and synthetic datasets, suggesting the model has learned stable, meaningful patterns. However, the relative \
+#importance of VehBody increases substantially in synthetic data compared to real data.')
+# Columns that act as target or weight, not as predictors of the frequency model
+freq_non_features = ['Exposure', 'ClaimNb', 'ClaimAmount']
+# Real training rows with Exposure > 0 (the rate ClaimNb / Exposure needs a positive exposure)
+mask_train_freq = X_train['Exposure'] > 0
+X_train_freq_filtered = X_train.loc[mask_train_freq].drop(freq_non_features, axis=1, errors='ignore')
+y_train_freq_filtered = X_train.loc[mask_train_freq, 'ClaimNb']
+sample_weight_freq_filtered = X_train.loc[mask_train_freq, 'Exposure']
+# Real test rows with Exposure > 0; the same rows are explained for both models below
+mask_test_freq = X_test['Exposure'] > 0
+X_test_freq_filtered = X_test.loc[mask_test_freq].drop(freq_non_features, axis=1, errors='ignore')
+# Plot SHAP for Frequency
+plot_glm_shap_importance(
+    X_train=X_train_freq_filtered,
+    X_test=X_test_freq_filtered,
+    y_train=y_train_freq_filtered / sample_weight_freq_filtered, # Target is rate (ClaimNb / Exposure)
+    sample_weight=sample_weight_freq_filtered,
+    power=1, # Power=1 for Poisson (frequency)
+    title="SHAP Feature Importance for Frequency Model (Real Data)",
+    max_display=10
+)
+# --- SHAP Feature Importance for Frequency (Synthetic Data) ---
+# Synthetic training rows with Exposure > 0; the explained rows stay the real test rows
+mask_train_freq_synth = new_samples_df['Exposure'] > 0
+X_train_freq_synth_filtered = new_samples_df.loc[mask_train_freq_synth].drop(freq_non_features, axis=1, errors='ignore')
+y_train_freq_synth_filtered = new_samples_df.loc[mask_train_freq_synth, 'ClaimNb']
+sample_weight_freq_synth_filtered = new_samples_df.loc[mask_train_freq_synth, 'Exposure']
+# Plot SHAP for Frequency (Synthetic Data)
+plot_glm_shap_importance(
+    X_train=X_train_freq_synth_filtered,
+    X_test=X_test_freq_filtered,
+    y_train=y_train_freq_synth_filtered / sample_weight_freq_synth_filtered, # Target is rate
+    sample_weight=sample_weight_freq_synth_filtered,
+    power=1, # Power=1 for Poisson (frequency)
+    title="SHAP Feature Importance for Frequency Model (Synthetic Data)",
+    max_display=10
+)
+# --- SHAP Feature Importance for Severity ---
+st.subheader('SHAP Feature Importance for Severity Model')
+st.write('Assesses stability of model explanations for severity outcomes.')
+#st.write('The severity model shows concerning instability between real and synthetic data: \
+#the top features completely flip, with VehBody most important on real data but VehValue dominating synthetic data.')
+# Prepare data for severity model SHAP.
+# Severity = ClaimAmount / ClaimNb, so keep only rows where both are strictly positive:
+# a non-zero amount with ClaimNb == 0 would give an infinite target, and a gamma model
+# cannot be fitted on zero or negative targets.
+X_train_cost_prep = X_train[(X_train['ClaimNb'] > 0) & (X_train['ClaimAmount'] > 0)].copy()
+X_test_cost_prep = X_test[(X_test['ClaimNb'] > 0) & (X_test['ClaimAmount'] > 0)].copy()
+X_train_sev = X_train_cost_prep.drop(columns=['Acost', 'Exposure', 'ClaimAmount', 'ClaimNb'], errors='ignore')
+y_train_sev = X_train_cost_prep['ClaimAmount'] / X_train_cost_prep['ClaimNb']
+sample_weight_sev = X_train_cost_prep['ClaimNb'] # Number of claims is the weight for severity
+X_test_sev = X_test_cost_prep.drop(columns=['Acost', 'Exposure', 'ClaimAmount', 'ClaimNb'], errors='ignore')
+# Plot SHAP for Severity
+plot_glm_shap_importance(
+    X_train=X_train_sev,
+    X_test=X_test_sev,
+    y_train=y_train_sev,
+    sample_weight=sample_weight_sev,
+    power=2, # Power=2 for Gamma (severity)
+    title="SHAP Feature Importance for Severity Model (Real Data)",
+    max_display=10
+)
+# --- SHAP Feature Importance for Severity (Synthetic Data) ---
+# Prepare data for severity model SHAP using synthetic data (same positivity filter;
+# synthetic records are the ones most likely to be inconsistent)
+X_train_cost_prep_synth = new_samples_df[(new_samples_df['ClaimNb'] > 0) & (new_samples_df['ClaimAmount'] > 0)].copy()
+X_test_cost_prep_synth = X_test[(X_test['ClaimNb'] > 0) & (X_test['ClaimAmount'] > 0)].copy() # Keep using real test data for explanation
+X_train_sev_synth = X_train_cost_prep_synth.drop(columns=['Acost', 'Exposure', 'ClaimAmount', 'ClaimNb'], errors='ignore')
+y_train_sev_synth = X_train_cost_prep_synth['ClaimAmount'] / X_train_cost_prep_synth['ClaimNb']
+sample_weight_sev_synth = X_train_cost_prep_synth['ClaimNb'] # Number of claims is the weight for severity
+X_test_sev_synth = X_test_cost_prep_synth.drop(columns=['Acost', 'Exposure', 'ClaimAmount', 'ClaimNb'], errors='ignore')
+# Plot SHAP for Severity (Synthetic Data)
+plot_glm_shap_importance(
+    X_train=X_train_sev_synth,
+    X_test=X_test_sev_synth,
+    y_train=y_train_sev_synth,
+    sample_weight=sample_weight_sev_synth,
+    power=2, # Power=2 for Gamma (severity)
+    title="SHAP Feature Importance for Severity Model (Synthetic Data)",
+    max_display=10
+)
+# ### dataset 2 and data handling
+st.header('Dataset 2: swmotorcycle')
+# Record duplicated rows/columns, then continue with the de-duplicated frame.
+df2_duplicated_rows = df2.loc[df2.duplicated()]
+df2 = df2.drop_duplicates()
+df2_duplicated_col = df2.columns[df2.columns.duplicated()]
+# add ClaimOcc feature (1 when at least one claim was recorded, else 0)
+df_2 = df2.copy()
+df_2['ClaimOcc'] = np.where(df_2['ClaimNb'] > 0, 1, 0)
+# Feature transformation: cap Exposure at 1 and VehAge at 20
+for capped_feature, cap in (('Exposure', 1), ('VehAge', 20)):
+    df_2[capped_feature] = df_2[capped_feature].clip(upper=cap)
+# ### Encoding
+# Category label -> code; codes are stored as text and cast to int after mapping.
+RiskClass_group = {'EV ratio 13-15':'1','EV ratio 20-24':'2','EV ratio 9-12':'3','EV ratio <5':'4','EV ratio 6-8':'5',
+                   'EV ratio 16-19':'6','EV ratio >25':'7'}
+BonusClass_group = {'BM1':'1','BM2':'2','BM3':'3','BM4':'4','BM5':'5','BM6':'6','BM7':'7'}
+Area_group = {"Central parts of Sweden's three largest cities":'1','Lesser towns except Gotland; Northern towns':'2',
+              'Small towns; countryside except Gotland; Northern towns':'3','Suburbs; middle-sized cities':'4',
+              'Northern countryside':'5','Northern towns':'6',"Gotland (Sweden's largest island)":'7'}
+Gender_group = {'Female':'0','Male':'1'}
+df2_encod = df_2.copy()
+for feature, codes in (('RiskClass', RiskClass_group), ('BonusClass', BonusClass_group),
+                       ('Area', Area_group), ('Gender', Gender_group)):
+    df2_encod[feature] = df2_encod[feature].map(codes).astype(int)
+# ### Split dataset
+# 80/20 train/test split with a fixed seed
+X_train, X_test = train_test_split(df2_encod, random_state=0, test_size=0.2)
+st.markdown(f"**Train shape:** {X_train.shape}  \n**Test shape:** {X_test.shape}")
+# ### Use Generate Samples Dataframe
+# Encode the synthetic frame with the same category codes used for the real data.
+df2_synth_encod = df2_synth.copy()
+for feature, codes in (('RiskClass', RiskClass_group), ('BonusClass', BonusClass_group),
+                       ('Area', Area_group), ('Gender', Gender_group)):
+    df2_synth_encod[feature] = df2_synth_encod[feature].map(codes).astype(int)
+new_samples_df = df2_synth_encod.copy()
+# Check consistency
+st.subheader("Check consistency")
+# A record is coherent when its three claim fields are all zero or all strictly positive.
+no_claim = (
+    (new_samples_df["ClaimNb"] == 0) & (new_samples_df["ClaimOcc"] == 0) & (new_samples_df["ClaimAmount"] == 0)
+)
+with_claim = (
+    (new_samples_df["ClaimNb"] > 0) & (new_samples_df["ClaimOcc"] > 0) & (new_samples_df["ClaimAmount"] > 0)
+)
+# Find inconsistencies: everything that is neither "no claim" nor "with claim"
+inconsistent_records = new_samples_df[~(no_claim | with_claim)]
+st.write(f"Number of inconsistent records on synthetic data: {len(inconsistent_records)}")
+st.write(inconsistent_records.head())  # Show a few inconsistent rows
+st.write('Helps assess basic data fidelity by checking structural or logical violations.')
+#st.write('The generative model replaced the business patterns in a right way')
+# ### Visual Comparison
+st.subheader('Univariate distribution comparison: real vs synthetic')
+st.write('Shows how well each individual feature is mimicked by the synthetic data.')
+#st.write('The model captures variables like ClaimAmount, ClaimOcc, ClaimNb and Gender in a good manner. \
+#Meanwhile fails for the others.')
+# Compare selected variables using histograms.
+# Both sides must share one encoding: X_train comes from the integer-encoded real data,
+# so the synthetic side is the encoded new_samples_df, not the raw df2_synth whose
+# categorical columns (RiskClass, Area, BonusClass, Gender) still hold text labels.
+compare_real_vs_synthetic(
+    real_df=X_train,
+    synthetic_df=new_samples_df,
+    columns=['Exposure','VehAge','ClaimOcc','ClaimNb', 'ClaimAmount', 'RiskClass', 'Area','BonusClass','Gender'],
+    kind='hist'
+)
+st.subheader('Correlation matrix comparison: real vs synthetic')
+st.write('Evaluates preservation of feature-to-feature relationships.')
+#st.write('The synthetic data nearly perfectly replicates the correlation structure, with identical \
+#values across almost all variable pairs.')
+# Compute correlation matrices
+corr_matrix_X_train = X_train.corr()
+corr_matrix_new_samples = new_samples_df.corr()
+# Set figure size
+fig=plt.figure(figsize=(30,15))
+# a subplot grid
+# Parameters (1, 2, 1) implies 1 row, 2 columns, and this plot is the 1st plot.
+plt.subplot(1, 2, 1) # Subplot 1
+sns.heatmap(corr_matrix_X_train, square=True, annot=True, cmap='coolwarm', fmt='.2f',annot_kws={"size": 15})
+plt.title('Correlation Heatmap of X_train', size=15)
+plt.yticks(rotation=0,fontsize=15)
+plt.xticks(rotation=90,fontsize=15)
+# another subplot for the second heatmap
+plt.subplot(1, 2, 2) # Subplot 2
+sns.heatmap(corr_matrix_new_samples, square=True, annot=True, cmap='coolwarm', fmt='.2f',annot_kws={"size": 15})
+plt.title('Correlation Heatmap of New Samples', size=15)
+plt.yticks(rotation=0,fontsize=15)
+plt.xticks(rotation=90,fontsize=15)
+# Display the plot
+plt.tight_layout()
+st.pyplot(fig)
+# Release the large figure once rendered so it does not accumulate across reruns.
+plt.close(fig)
+# ### Statistical Analysis
+# Kolmogorov-Smirnov test
+st.subheader('Kolmogorov–Smirnov Test Results')
+st.write('Quantifies the statistical distance between real and synthetic distributions.')
+#st.write('Only four variables (Gender, ClaimNb, ClaimAmount, ClaimOcc) pass the KS test achieving \
+#a perfect p = 1.0000 or close to it, but these successes are primarily on claim-related variables \
+#while demographic and policy features are poorly reproduced.')
+# Two-sample KS test per column: real training values against synthetic values.
+ks_outcomes = [
+    (name, ks_2samp(X_train[name].values, new_samples_df[name].values))
+    for name in X_train.columns
+]
+results = [
+    {"Feature": name, "KS Statistic": outcome.statistic, "P-value": outcome.pvalue}
+    for name, outcome in ks_outcomes
+]
+results_df = pd.DataFrame(results)
+def color_pval(val):
+    color = "red" if val < 0.05 else "green"
+    return f"color: {color};"
+styled_df = results_df.style.applymap(color_pval, subset=["P-value"]) \
+                            .format({"KS Statistic": "{:.4f}", "P-value": "{:.4f}"})
+st.markdown("""
+**Legend:**
+- <span style='color:green;'>Green P-value</span>: distributions are **similar** (p ≥ 0.05)
+- <span style='color:red;'>Red P-value</span>: distributions are **significantly different** (p < 0.05)
+""", unsafe_allow_html=True)
+st.dataframe(styled_df)
+# ### PCA Analysis
+st.subheader('PCA comparison')
+st.write('Assesses similarity in global variance structure and major latent components.')
+#st.write('The synthetic points exhibit nearly identical spread, density, and boundary \
+#characteristics as the real data, with minimal outliers and no visible systematic shifts.')
+# Load the saved models
+#scaler = load('./LLM/scaler_pca_model_d2_llm_60.pkl')
+#pca = load('./LLM/pca_model_d2_llm_60.pkl')
+img = mpimg.imread('./LLM/pca_d2_60.png')
+fig=plt.figure(figsize=(10, 8))
+plt.imshow(img)
+plt.axis('off')
+st.pyplot(fig)
+# ### UMAP Analysis
+st.subheader('UMAP comparison')
+st.write('Examines nonlinear manifold structure and clustering behavior.')
+#st.write('The plot shows that synthetic points (red) closely overlap the real data (blue), \
+#indicating the generative process preserves the global structure of the feature space. \
+#Minor deviations appear at the edges, but overall the synthetic dataset replicates key clusters well.')
+img = mpimg.imread('./LLM/umap_d2_60.png')
+fig=plt.figure(figsize=(10, 8))
+plt.imshow(img)
+plt.axis('off')
+st.pyplot(fig)
+# ### GLM Frequency Analysis
+st.subheader('Frequency GLM Analysis')
+st.write('Tests how well synthetic data preserves predictive relationships for claim frequency.')
+# Baseline frequency model
+results_frequency_3 = run_glm_frequency_analysis(X_train, X_test, label="Baseline", var='Real')
+# Using synthetic sample data with exposure clipping
+results_frequency_4 = run_glm_frequency_analysis(new_samples_df, X_test, clip_exposure=True, label="Synthetic Clipped", var='Synthetic')
+# ### GLM Cost Analysis
+st.subheader('Severity GLM Analysis')
+st.write('Evaluates whether severity-related predictors behave similarly on real and synthetic data.')
+results_cost_3 = run_glm_cost_analysis(X_train, X_test, var='Real')
+results_cost_4 = run_glm_cost_analysis(new_samples_df, X_test, is_sampled=True, var= 'Synthetic')
+# ### Feature Importance Analysis
+# --- SHAP Feature Importance for Frequency ---
+st.subheader('SHAP Feature Importance for Frequency Model')
+st.write('Shows whether drivers of frequency predictions remain consistent across datasets.')
+#st.write('The frequency model demonstrates excellent stability across real and synthetic datasets: \
+#both show OwnerAge as the dominant predictor followed by VehAge, with nearly identical feature importance \
+#rankings and similar magnitude patterns.')
+# Prepare data for frequency model SHAP
+X_train_freq = X_train.drop(['Exposure', 'ClaimNb', 'ClaimAmount'], axis=1, errors='ignore')
+y_train_freq = X_train['ClaimNb']
+sample_weight_freq = X_train['Exposure']
+X_test_freq = X_test.drop(['Exposure', 'ClaimNb', 'ClaimAmount'], axis=1, errors='ignore')
+# Filter out rows with Exposure = 0 for frequency model training and SHAP explanation
+mask_train_freq = sample_weight_freq > 0
+X_train_freq_filtered = X_train_freq[mask_train_freq]
+y_train_freq_filtered = y_train_freq[mask_train_freq]
+sample_weight_freq_filtered = sample_weight_freq[mask_train_freq]
+# Ensure X_test_freq also only contains rows where Exposure > 0
+mask_test_freq = X_test['Exposure'] > 0
+X_test_freq_filtered = X_test_freq[mask_test_freq]
+# Plot SHAP for Frequency
+plot_glm_shap_importance(
+    X_train=X_train_freq_filtered,
+    X_test=X_test_freq_filtered,
+    y_train=y_train_freq_filtered / sample_weight_freq_filtered, # Target is rate (ClaimNb / Exposure)
+    sample_weight=sample_weight_freq_filtered,
+    power=1, # Power=1 for Poisson (frequency)
+    title="SHAP Feature Importance for Frequency Model (Real Data)",
+    max_display=10
+)
+# --- SHAP Feature Importance for Frequency (Synthetic Data) ---
+# Prepare data for frequency model SHAP using synthetic data
+X_train_freq_synth = new_samples_df.drop(['Exposure', 'ClaimNb', 'ClaimAmount'], axis=1, errors='ignore')
+y_train_freq_synth = new_samples_df['ClaimNb']
+sample_weight_freq_synth = new_samples_df['Exposure']
+# X_test_freq is the same as before (real test data)
+X_test_freq = X_test.drop(['Exposure', 'ClaimNb', 'ClaimAmount'], axis=1, errors='ignore')
+# Filter out rows with Exposure = 0 for frequency model training and SHAP explanation
+mask_train_freq_synth = sample_weight_freq_synth > 0
+X_train_freq_synth_filtered = X_train_freq_synth[mask_train_freq_synth]
+y_train_freq_synth_filtered = y_train_freq_synth[mask_train_freq_synth]
+sample_weight_freq_synth_filtered = sample_weight_freq_synth[mask_train_freq_synth]
+# Ensure X_test_freq also only contains rows where Exposure > 0
+mask_test_freq = X_test['Exposure'] > 0
+X_test_freq_filtered = X_test_freq[mask_test_freq]
+# Plot SHAP for Frequency (Synthetic Data)
+plot_glm_shap_importance(
+    X_train=X_train_freq_synth_filtered,
+    X_test=X_test_freq_filtered,
+    y_train=y_train_freq_synth_filtered / sample_weight_freq_synth_filtered, # Target is rate
+    sample_weight=sample_weight_freq_synth_filtered,
+    power=1, # Power=1 for Poisson (frequency)
+    title="SHAP Feature Importance for Frequency Model (Synthetic Data)",
+    max_display=10
+)
+# --- SHAP Feature Importance for Severity ---
+st.subheader('SHAP Feature Importance for Severity Model')
+st.write('Assesses stability of model explanations for severity outcomes')
+#st.write('The severity model shows strong consistency between real and synthetic data: \
+#VehAge clearly dominates as the primary driver in both datasets, followed by OwnerAge \
+#as a distant second.')
+# Prepare data for severity model SHAP
+X_train_cost_prep = X_train[X_train['ClaimAmount'] != 0].copy()
+X_test_cost_prep = X_test[X_test['ClaimAmount'] != 0].copy()
+X_train_sev = X_train_cost_prep.drop(columns=['Acost', 'Exposure', 'ClaimAmount', 'ClaimNb'], errors='ignore')
+y_train_sev = X_train_cost_prep['ClaimAmount'] / X_train_cost_prep['ClaimNb']
+sample_weight_sev = X_train_cost_prep['ClaimNb'] # Number of claims is the weight for severity
+X_test_sev = X_test_cost_prep.drop(columns=['Acost', 'Exposure', 'ClaimAmount', 'ClaimNb'], errors='ignore')
+# Plot SHAP for Severity
+plot_glm_shap_importance(
+    X_train=X_train_sev,
+    X_test=X_test_sev,
+    y_train=y_train_sev,
+    sample_weight=sample_weight_sev,
+    power=2, # Power=2 for Gamma (severity)
+    title="SHAP Feature Importance for Severity Model (Real Data)",
+    max_display=10
+)
+# --- SHAP Feature Importance for Severity (Synthetic Data) ---
+# Prepare data for severity model SHAP using synthetic data
+X_train_cost_prep_synth = new_samples_df[new_samples_df['ClaimAmount'] != 0].copy()
+X_test_cost_prep_synth = X_test[X_test['ClaimAmount'] != 0].copy() # Keep using real test data for explanation
+X_train_sev_synth = X_train_cost_prep_synth.drop(columns=['Acost', 'Exposure', 'ClaimAmount', 'ClaimNb'], errors='ignore')
+y_train_sev_synth = X_train_cost_prep_synth['ClaimAmount'] / X_train_cost_prep_synth['ClaimNb']
+sample_weight_sev_synth = X_train_cost_prep_synth['ClaimNb'] # Number of claims is the weight for severity
+X_test_sev_synth = X_test_cost_prep_synth.drop(columns=['Acost', 'Exposure', 'ClaimAmount', 'ClaimNb'], errors='ignore')
+# Plot SHAP for Severity (Synthetic Data)
+plot_glm_shap_importance(
+    X_train=X_train_sev_synth,
+    X_test=X_test_sev_synth,
+    y_train=y_train_sev_synth,
+    sample_weight=sample_weight_sev_synth,
+    power=2, # Power=2 for Gamma (severity)
+    title="SHAP Feature Importance for Severity Model (Synthetic Data)",
+    max_display=10
+)
+# ### Results
+st.subheader('Overall results')
+# The dictionary dataset 1
+metrics_dict_1 = results_frequency_1[1]
+mpd_train_1 = metrics_dict_1['mpd_train']
+mpd_test_1 = metrics_dict_1['mpd_test']
+# The dictionary synthetic dataset 1
+metrics_dict_2 = results_frequency_2[1]
+mpd_train_2 = metrics_dict_2['mpd_train']
+mpd_test_2 = metrics_dict_2['mpd_test']
+# The dictionary dataset 2
+metrics_dict_3 = results_frequency_3[1]
+mpd_train_3 = metrics_dict_3['mpd_train']
+mpd_test_3 = metrics_dict_3['mpd_test']
+# The dictionary synthetic dataset 2
+metrics_dict_4 = results_frequency_4[1]
+mpd_train_4 = metrics_dict_4['mpd_train']
+mpd_test_4 = metrics_dict_4['mpd_test']
+# The dictionary dataset 1
+mgd_train_1 = results_cost_1['mgd_train']
+mgd_test_1 = results_cost_1['mgd_test']
+# The dictionary synthetic dataset 1
+mgd_train_2 = results_cost_2['mgd_train']
+mgd_test_2 = results_cost_2['mgd_test']
+# The dictionary dataset 2
+mgd_train_3 = results_cost_3['mgd_train']
+mgd_test_3 = results_cost_3['mgd_test']
+# The dictionary synthetic dataset 2
+mgd_train_4 = results_cost_4['mgd_train']
+mgd_test_4 = results_cost_4['mgd_test']
+# Create the DataFrame
+results_df1 = {
+    'mpd_train': mpd_train_1,
+    'mpd_test': mpd_test_1,
+    'mgd_train': mgd_train_1,
+    'mgd_test': mgd_test_1,
+}
+results_df2 = {
+    'mpd_train': mpd_train_2,
+    'mpd_test': mpd_test_2,
+    'mgd_train': mgd_train_2,
+    'mgd_test': mgd_test_2,
+}
+results_df3 = {
+    'mpd_train': mpd_train_3,
+    'mpd_test': mpd_test_3,
+    'mgd_train': mgd_train_3,
+    'mgd_test': mgd_test_3,
+}
+results_df4 = {
+    'mpd_train': mpd_train_4,
+    'mpd_test': mpd_test_4,
+    'mgd_train': mgd_train_4,
+    'mgd_test': mgd_test_4,
+}
+d1=pd.DataFrame(results_df1, index=['dataset 1'])
+d2=pd.DataFrame(results_df2, index=['synthetic dataset 1'])
+d3=pd.DataFrame(results_df3, index=['dataset 2'])
+d4=pd.DataFrame(results_df4, index=['synthetic dataset 2'])
+df_tot= pd.concat([d1,d2,d3,d4])
+st.dataframe(df_tot)
+#st.write('These results demonstrate excellent synthetic data quality: \
+#the mean poisson deviance (mpd) and mean gamma deviance (mgd) metrics are \
+#nearly identical between real and synthetic datasets for both dataset 1 and dataset 2. \
+#This suggests the synthetic data accurately preserves the statistical properties and \
+#predictive complexity of the original data')
+# barplot comparison
+fig, ax = plt.subplots(figsize=(9, 5))
+df_tot.plot(kind='bar', ax=ax)
+ax.set_title('Comparison of MPD and MGD Metrics')
+ax.set_ylabel('Value')
+ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
+ax.legend(title='Metric')
+for container in ax.containers:
+    labels = ax.bar_label(container, fmt='%.2f', label_type='edge', padding=2)
+    for label in labels:
+        label.set_fontsize(8)
+plt.tight_layout()
+st.pyplot(fig)
+#st.write('This visualization confirms the strong fidelity of the synthetic data. \
+#The first synthetic dataset performs slightly better on frequency')
+# MPD: Train vs Test Comparison
+fig, axes = plt.subplots(1, 2, figsize=(15, 6))
+# --- MPD Comparison ---
+mpd_data = df_tot[['mpd_train', 'mpd_test']]
+mpd_data.plot(kind='bar', ax=axes[0], color=['#2ecc71', '#e74c3c'])
+axes[0].set_title('Mean Poisson Deviance: Train vs Test', fontsize=16, fontweight='bold')
+axes[0].set_ylabel('MPD Value', fontsize=14)
+axes[0].set_xlabel('Dataset', fontsize=14)
+axes[0].legend(['Train', 'Test'], fontsize=10)
+# Larger tick labels
+axes[0].tick_params(axis='x', labelsize=12, rotation=45)
+axes[0].tick_params(axis='y', labelsize=12)
+axes[0].grid(axis='y', alpha=0.3)
+for container in axes[0].containers:
+    axes[0].bar_label(container, fmt='%.3f', fontsize=15)
+# --- MGD Comparison ---
+mgd_data = df_tot[['mgd_train', 'mgd_test']]
+mgd_data.plot(kind='bar', ax=axes[1], color=['#3498db', '#f39c12'])
+axes[1].set_title('Mean Gamma Deviance: Train vs Test', fontsize=16, fontweight='bold')
+axes[1].set_ylabel('MGD Value', fontsize=14)
+axes[1].set_xlabel('Dataset', fontsize=14)
+axes[1].legend(['Train', 'Test'], fontsize=10)
+# Larger tick labels
+axes[1].tick_params(axis='x', labelsize=12, rotation=45)
+axes[1].tick_params(axis='y', labelsize=12)
+axes[1].grid(axis='y', alpha=0.3)
+for container in axes[1].containers:
+    axes[1].bar_label(container, fmt='%.3f', fontsize=15)
+plt.tight_layout()
+st.pyplot(fig)
+#st.write('This comparison reveals excellent synthetic data quality with minimal \
+#train-test gaps. The synthetic generation process maintains distributional properties, \
+#and also model generalization characteristics.')
+# Create a heatmap
+fig, ax = plt.subplots(figsize=(10, 6))
+sns.heatmap(df_tot, annot=True, fmt='.3f', cmap='RdYlGn_r',
+            linewidths=0.5, ax=ax, cbar_kws={'label': 'Deviance Value'})
+ax.set_title('Performance Heatmap: All Metrics Across Datasets', fontsize=15, fontweight='bold', pad=20)
+ax.set_xlabel('Metrics')
+ax.set_ylabel('Datasets')
+plt.tight_layout()
+st.pyplot(fig)
+#st.write('The heatmap with the near-identical color patterns between real and synthetic versions \
+#of each dataset confirm excellent replication fidelity. Dataset 2 shows dramatically \
+#lower MPD values (green, ~0.28-0.44) compared to dataset 1 (orange-red, ~1.43-1.75), while MGD \
+#values remain similarly high across both, suggesting dataset 2 represents a different \
+#modeling challenge that the synthetic generation process successfully preserves.')