alidenewade committed on
Commit
8e9768e
·
verified ·
1 Parent(s): 005e14d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +336 -463
app.py CHANGED
@@ -2,15 +2,15 @@ import gradio as gr
2
  import numpy as np
3
  import pandas as pd
4
  from sklearn.cluster import KMeans
5
- from sklearn.metrics import pairwise_distances_argmin_min, r2_score
6
- import matplotlib.pyplot as plt
7
- import seaborn as sns
 
8
  import io
9
  import os
10
  from PIL import Image
11
 
12
  # Define the paths for example data
13
- # For Hugging Face Spaces, these paths will be relative to the app's root
14
  EXAMPLE_DATA_DIR = "eg_data"
15
  EXAMPLE_FILES = {
16
  "cashflow_base": os.path.join(EXAMPLE_DATA_DIR, "cashflows_seriatim_10K.xlsx"),
@@ -24,455 +24,306 @@ EXAMPLE_FILES = {
24
 
25
class Clusters:
    """K-means clustering of policies with representative-policy selection.

    Fits a 1000-cluster KMeans on the supplied calibration variables, picks
    the policy nearest each centroid as that cluster's representative, and
    provides helpers to aggregate per-policy values by cluster and to compare
    actual portfolio totals against estimates scaled up from the
    representatives.
    """

    def __init__(self, loc_vars):
        # loc_vars: calibration variables, one row per policy
        # (pd.DataFrame or array-like; rows assumed aligned with policy order
        # — TODO confirm against callers).
        if isinstance(loc_vars, pd.DataFrame):
            loc_vars_np = np.ascontiguousarray(loc_vars.values)
        else:
            loc_vars_np = np.ascontiguousarray(loc_vars)

        # NOTE(review): n_clusters is hard-coded to 1000; inputs with fewer
        # than 1000 rows would make KMeans raise — confirm callers guarantee
        # larger portfolios.
        self.kmeans = KMeans(n_clusters=1000, random_state=0, n_init=10).fit(loc_vars_np)
        # Row position of the policy closest to each centroid (one per cluster).
        closest, _ = pairwise_distances_argmin_min(self.kmeans.cluster_centers_, loc_vars_np)

        # +1 converts 0-based row positions to policy ids
        # (assumes policy_id == row position + 1 — TODO confirm).
        rep_ids = pd.Series(data=(closest + 1))
        rep_ids.name = 'policy_id'
        rep_ids.index.name = 'cluster_id'
        self.rep_ids = rep_ids

        # Number of policies in each cluster; used as the scaling weight when
        # estimating portfolio totals from the representatives.
        self.policy_count = self.agg_by_cluster(pd.DataFrame({'policy_count': [1] * len(loc_vars_np)}))['policy_count']

    def agg_by_cluster(self, df, agg=None):
        """Aggregate df (one row per policy) by cluster.

        agg maps column name -> aggregation function name; columns not listed
        default to 'sum'. With agg=None everything is summed.
        """
        temp = df.copy()
        temp['cluster_id'] = self.kmeans.labels_
        temp = temp.set_index('cluster_id')
        # Either a per-column aggregation dict or the plain string "sum".
        agg_dict = {c: (agg[c] if agg and c in agg else 'sum') for c in temp.columns if c != 'cluster_id'} if agg else "sum"
        if not agg_dict:
            return pd.DataFrame(index=temp.index.unique())
        return temp.groupby(level='cluster_id').agg(agg_dict)

    def extract_reps(self, df):
        """Return the representative policies' rows of df, indexed by cluster_id."""
        # NOTE(review): the df_to_merge assigned inside this branch is
        # immediately overwritten by the unconditional assignment below, so
        # only the ValueError side effect of this block survives.
        if 'policy_id' not in df.columns and df.index.name != 'policy_id':
            # Try to use the first part of the index if it's a MultiIndex and called 'policy_id'
            if isinstance(df.index, pd.MultiIndex) and 'policy_id' in df.index.names:
                df_to_merge = df.reset_index()  # Reset all levels, policy_id becomes a column
            else:
                raise ValueError("DataFrame for extract_reps must have 'policy_id' as a named index or a column.")

        df_to_merge = df.reset_index() if df.index.name == 'policy_id' or (isinstance(df.index, pd.MultiIndex) and 'policy_id' in df.index.names) else df.copy()

        if 'policy_id' not in df_to_merge.columns:
            # This is a fallback if policy_id was expected but still not a column.
            # This might happen if the index was unnamed and thought to be policy_id.
            # A robust solution depends on stricter input guarantees.
            gr.Warning("extract_reps: 'policy_id' column not found after attempting to reset index. Merging may fail or be incorrect.")

        # Left-join so every cluster keeps a row even if its representative
        # policy id is missing from df.
        temp = pd.merge(self.rep_ids.reset_index(), df_to_merge, how='left', on='policy_id')
        temp = temp.set_index('cluster_id')
        return temp.drop(columns=['policy_id'], errors='ignore')

    def extract_and_scale_reps(self, df, agg=None):
        """Representative rows of df scaled by cluster size.

        Columns aggregated with 'sum' are multiplied by the cluster's policy
        count; columns marked 'mean' in agg are left unscaled.
        """
        extracted_df = self.extract_reps(df)
        if agg:
            cols_to_multiply = [col for col in df.columns if col in extracted_df.columns]
            mult_data = {}
            for c in cols_to_multiply:
                # Ensure self.policy_count is aligned with extracted_df.index if it's a Series
                if isinstance(self.policy_count, pd.Series):
                    policy_count_for_col = self.policy_count.reindex(extracted_df.index).fillna(1)  # Default to 1 if cluster missing
                else:  # Should be a scalar or array-like usable directly
                    policy_count_for_col = self.policy_count

                # 'sum' columns get scaled by cluster size; 'mean' columns keep
                # the representative's raw value (multiplier 1).
                mult_data[c] = policy_count_for_col if (c not in agg or agg[c] == 'sum') else 1

            mult = pd.DataFrame(mult_data, index=extracted_df.index)

            result_df = extracted_df.copy()
            for col in cols_to_multiply:
                if col in mult.columns:  # Ensure column exists in multiplier
                    result_df[col] = extracted_df[col].mul(mult[col])
            return result_df
        else:
            # No agg spec: scale every numeric column by the cluster size.
            numeric_cols = extracted_df.select_dtypes(include=np.number).columns
            result_df = extracted_df.copy()
            for col in numeric_cols:
                if isinstance(self.policy_count, pd.Series):
                    policy_count_for_col = self.policy_count.reindex(extracted_df.index).fillna(0)  # Fill with 0 if not found
                    result_df[col] = extracted_df[col].mul(policy_count_for_col, axis=0)
                else:  # Assuming self.policy_count is a scalar or compatible array
                    result_df[col] = extracted_df[col].mul(self.policy_count, axis=0)
            return result_df

    def compare(self, df, agg=None):
        """Per-cluster actual vs. estimate, stacked to one row per
        (cluster_id, column) pair. Returns columns 'actual' and 'estimate'."""
        source = self.agg_by_cluster(df, agg)
        target = self.extract_and_scale_reps(df, agg)

        common_columns = source.columns.intersection(target.columns)
        if common_columns.empty and (not source.empty or not target.empty):
            gr.Warning("Compare function: No common columns between source and target. Result will be empty.")
            return pd.DataFrame({'actual': pd.Series(dtype=float), 'estimate': pd.Series(dtype=float)})

        source_stacked = source[common_columns].stack(dropna=False)  # keepna=True for older pandas
        target_stacked = target[common_columns].stack(dropna=False)

        return pd.DataFrame({'actual': source_stacked, 'estimate': target_stacked})

    def compare_total(self, df, agg=None):
        """Portfolio-level actual vs. estimate per column, plus relative error.

        With agg, 'mean' columns are compared as policy-count-weighted means;
        all other columns as sums. Error is (estimate / actual) - 1, forced to
        0 where actual is 0 or the ratio is NaN.
        """
        if agg:
            actual_values = {}
            for col in df.columns:
                if agg.get(col, 'sum') == 'mean':
                    actual_values[col] = df[col].mean()
                else:
                    actual_values[col] = df[col].sum()
            actual = pd.Series(actual_values)

            reps_unscaled = self.extract_reps(df)
            estimate_values = {}

            for col_orig_df in df.columns:
                if col_orig_df not in reps_unscaled.columns:
                    estimate_values[col_orig_df] = np.nan
                    continue

                current_col_data = reps_unscaled[col_orig_df].astype(float)  # Ensure numeric for calcs
                policy_counts_aligned = self.policy_count.reindex(current_col_data.index).astype(float)  # Align and ensure numeric

                if agg.get(col_orig_df, 'sum') == 'mean':
                    # Weighted mean: each representative stands in for its
                    # whole cluster, so weight by cluster size.
                    weighted_sum = (current_col_data * policy_counts_aligned).sum()
                    total_weight = policy_counts_aligned.sum()
                    estimate_values[col_orig_df] = weighted_sum / total_weight if total_weight > 0 else np.nan
                else:
                    estimate_values[col_orig_df] = (current_col_data * policy_counts_aligned).sum()
            estimate = pd.Series(estimate_values)
        else:
            actual = df.sum()
            estimate = self.extract_and_scale_reps(df).sum()  # This sum might need to be on numeric cols only

        # Align on column labels so missing entries compare as 0.
        actual, estimate = actual.align(estimate, fill_value=0)
        error = np.where(actual != 0, (estimate / actual) - 1, 0)
        error = np.nan_to_num(error, nan=0.0)

        return pd.DataFrame({'actual': actual, 'estimate': estimate, 'error': error})
155
 
156
- # Plotting Functions (Modified for Seaborn)
157
def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
    """Render a grid of actual-vs-estimate total cashflow line plots.

    One subplot per non-empty DataFrame in cfs_list (paired with titles);
    totals come from cluster_obj.compare_total. Returns a PIL Image, or a
    small placeholder image when there is nothing to plot.
    """
    sns.set_style("whitegrid")
    if not cfs_list or not cluster_obj or not titles or not any(cfs_list):  # Check if cfs_list contains any non-None df
        # Return a placeholder image indicating no data
        fig, ax = plt.subplots(figsize=(7.5, 2.5))  # Smaller placeholder
        ax.text(0.5, 0.5, "No cashflow data to plot", ha='center', va='center', fontsize=10)
        ax.set_xticks([])
        ax.set_yticks([])
        buf = io.BytesIO()
        plt.savefig(buf, format='png', dpi=100)
        buf.seek(0)
        img = Image.open(buf)
        plt.close(fig)
        return img

    # Filter out None DataFrames from cfs_list to prevent errors
    valid_cfs_data = [(df, title) for df, title in zip(cfs_list, titles) if df is not None and not df.empty]
    if not valid_cfs_data:  # If all DFs were None or empty
        return plot_cashflows_comparison([], None, [])  # Recurse to get placeholder

    num_plots = len(valid_cfs_data)
    cols = 2
    rows = (num_plots + cols - 1) // cols  # ceil division: enough rows for all plots

    fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
    axes = axes.flatten()
    plot_idx = 0  # Separate index for placing plots

    for df_orig, title in valid_cfs_data:
        if plot_idx < len(axes):
            ax = axes[plot_idx]
            comparison_df = cluster_obj.compare_total(df_orig)

            if comparison_df.empty:
                ax.text(0.5, 0.5, f"No comparison data for\n{title}", ha='center', va='center', fontsize=9)
                ax.set_title(title)
                plot_idx += 1
                continue

            plot_data = comparison_df[['actual', 'estimate']].copy()
            plot_data['Time'] = plot_data.index.astype(str)
            # Prefer a numeric x-axis when the index labels parse as numbers.
            try:
                plot_data['Time'] = pd.to_numeric(plot_data['Time'])
            except ValueError:
                pass

            # Long format so seaborn draws one line per actual/estimate series.
            plot_data_melted = plot_data.melt(id_vars='Time', var_name='Legend', value_name='Value')

            sns.lineplot(x='Time', y='Value', hue='Legend', data=plot_data_melted, ax=ax, errorbar=None)
            ax.set_title(title)
            ax.set_xlabel('Time')
            ax.set_ylabel('Value')
            plot_idx += 1

    for j in range(plot_idx, len(axes)):  # Hide any unused subplots
        fig.delaxes(axes[j])

    plt.tight_layout()
    buf = io.BytesIO()
    plt.savefig(buf, format='png', dpi=100)
    buf.seek(0)
    img = Image.open(buf)
    plt.close(fig)
    return img
221
 
222
def plot_scatter_comparison(df_compare_output, title):
    """Scatter-plot per-cluster 'actual' vs 'estimate' values.

    df_compare_output is expected to be the output of Clusters.compare (a
    two-level index, second level used as hue); a flat frame is plotted
    without hue as a fallback. Draws an x=y reference line over the data
    range and returns the figure as a PIL Image.
    """
    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=(12, 8))  # Define fig and ax here for all paths

    plot_data_available = False  # Flag to check if we have data to plot for limits

    if df_compare_output is None or df_compare_output.empty:
        ax.text(0.5, 0.5, "No data to display", ha='center', va='center', fontsize=15)
        ax.set_title(title)
    elif not isinstance(df_compare_output.index, pd.MultiIndex) or df_compare_output.index.nlevels < 2:
        # Fallback: not the expected (cluster_id, column) MultiIndex shape.
        gr.Warning("Scatter plot data is not in the expected multi-index format. Plotting raw actual vs estimate.")
        if not df_compare_output[['actual', 'estimate']].empty:
            sns.scatterplot(x='actual', y='estimate', data=df_compare_output, s=25, alpha=0.7, ax=ax, legend=False)
            plot_data_available = True
        else:
            ax.text(0.5, 0.5, "Data for scatter plot is empty.", ha='center', va='center', fontsize=15)
        ax.set_title(title)
    else:
        plot_data_internal = df_compare_output.reset_index()
        if plot_data_internal[['actual', 'estimate']].dropna().empty:
            ax.text(0.5, 0.5, "Comparison data (actual/estimate) is empty or all NaN.", ha='center', va='center', fontsize=15)
            ax.set_title(title)
        else:
            # Second index level (e.g. the cashflow column name) becomes the hue.
            hue_col_name = df_compare_output.index.names[1]
            plot_data_internal[hue_col_name] = plot_data_internal[hue_col_name].astype(str)

            unique_levels = plot_data_internal[hue_col_name].nunique()
            show_legend_flag = "auto"
            if unique_levels == 1:
                show_legend_flag = False
            elif unique_levels > 10:  # Max 10 items in legend for clarity
                show_legend_flag = False
                gr.Warning(f"Warning: Too many unique values ({unique_levels}) in '{hue_col_name}' for scatter plot legend. Legend hidden.")

            sns.scatterplot(x='actual', y='estimate', hue=hue_col_name, data=plot_data_internal,
                            s=25, alpha=0.7, ax=ax, legend=show_legend_flag)
            plot_data_available = True
            ax.set_title(title)

            if ax.get_legend() is not None:  # If legend is shown
                ax.get_legend().set_title(str(hue_col_name))

    ax.set_xlabel('Actual')
    ax.set_ylabel('Estimate')

    if plot_data_available:
        # Use 'plot_data_internal' if it exists, else 'df_compare_output' for non-multi-index case
        current_plot_df = plot_data_internal if 'plot_data_internal' in locals() and not plot_data_internal.empty else df_compare_output

        if current_plot_df is not None and not current_plot_df.empty:
            all_values = pd.concat([current_plot_df['actual'], current_plot_df['estimate']]).dropna()
            if not all_values.empty:
                min_val = all_values.min()
                max_val = all_values.max()

                # Fallback if min_val and max_val are the same (e.g. single point data)
                if min_val == max_val:
                    margin = abs(min_val * 0.1) if min_val != 0 else 0.1  # 10% margin or 0.1 if value is 0
                    plot_min, plot_max = min_val - margin, max_val + margin
                else:
                    plot_min, plot_max = min_val, max_val

                # Ensure plot_min and plot_max are finite and distinct
                if np.isfinite(plot_min) and np.isfinite(plot_max) and plot_min < plot_max:
                    # x=y reference line: points on it are perfect estimates.
                    ax.plot([plot_min, plot_max], [plot_min, plot_max], 'r-', linewidth=0.7, alpha=0.8, zorder=0)
                    ax.set_xlim(plot_min, plot_max)
                    ax.set_ylim(plot_min, plot_max)
                elif np.isfinite(plot_min) and np.isfinite(plot_max) and plot_min == plot_max:  # Handles single point case after margin
                    ax.plot([plot_min], [plot_min], 'ro', markersize=5)  # Mark the point
                    ax.set_xlim(plot_min - (abs(plot_min*0.1) if plot_min !=0 else 0.1), plot_min + (abs(plot_min*0.1) if plot_min !=0 else 0.1))
                    ax.set_ylim(plot_min - (abs(plot_min*0.1) if plot_min !=0 else 0.1), plot_min + (abs(plot_min*0.1) if plot_min !=0 else 0.1))

    buf = io.BytesIO()
    plt.savefig(buf, format='png', dpi=100)
    buf.seek(0)
    img = Image.open(buf)
    plt.close(fig)
    return img
302
 
303
- # Main Processing and Gradio UI
304
  def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
305
  policy_data_path, pv_base_path, pv_lapse_path, pv_mort_path):
 
306
  try:
307
        def read_and_prep_excel(file_path, is_policy_data=False):
            """Read one Excel file, index it by 'policy_id', and (unless it is
            the policy-data file) keep only numeric columns.

            Raises FileNotFoundError when file_path does not exist.
            """
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"File not found: {file_path}")

            # For Hugging Face, ensure files are readable.
            # The path provided by gr.File is usually to a temp copy.
            df = pd.read_excel(file_path)

            # Try to identify policy_id:
            # 1. Explicit 'policy_id' column (case-insensitive)
            # 2. First column if no explicit 'policy_id'
            pid_col_name = None
            for col in df.columns:
                if str(col).lower() == 'policy_id':
                    pid_col_name = col
                    break

            if pid_col_name:
                df = df.rename(columns={pid_col_name: 'policy_id'})
                df = df.set_index('policy_id')
            elif df.index.name and df.index.name.lower() == 'policy_id':  # Already indexed by policy_id
                pass  # Keep as is
            else:  # Assume first column is policy_id if no explicit one is found
                gr.Warning(f"No explicit 'policy_id' column/index in {os.path.basename(file_path)}. Assuming first column is policy_id.")
                df = df.rename(columns={df.columns[0]: 'policy_id'})
                df = df.set_index('policy_id')

            if is_policy_data:
                return df  # Return all columns for policy data, selection happens next
            return df.select_dtypes(include=np.number)
337
-
338
-
339
- cfs = read_and_prep_excel(cashflow_base_path)
340
- cfs_lapse50 = read_and_prep_excel(cashflow_lapse_path)
341
- cfs_mort15 = read_and_prep_excel(cashflow_mort_path)
342
-
343
- pol_data_full = read_and_prep_excel(policy_data_path, is_policy_data=True)
344
 
345
- required_cols_std = ['age_at_entry', 'policy_term', 'sum_assured', 'duration_mth']
346
-
347
- # Normalize available column names for matching
348
- available_cols_map = {col.lower().replace("_", "").replace(" ", ""): col for col in pol_data_full.columns}
349
- cols_to_select = []
350
- final_rename_map = {}
351
-
352
- for req_col_std in required_cols_std:
353
- req_col_norm = req_col_std.lower().replace("_", "").replace(" ", "")
354
- if req_col_norm in available_cols_map:
355
- original_name = available_cols_map[req_col_norm]
356
- cols_to_select.append(original_name)
357
- if original_name != req_col_std: # if original name was 'Age At Entry' map to 'age_at_entry'
358
- final_rename_map[original_name] = req_col_std
359
- else: # If after normalization, it's still not found.
360
- gr.Warning(f"Required policy data column '{req_col_std}' not found or could not be matched.")
361
-
362
-
363
- if len(cols_to_select) == len(required_cols_std):
364
- pol_data = pol_data_full[cols_to_select].rename(columns=final_rename_map)
365
- pol_data = pol_data.select_dtypes(include=np.number) # Ensure numeric after selection
366
  else:
367
- missing_cols_display = [rc for rc in required_cols_std if rc not in final_rename_map.values()]
368
- gr.Warning(f"Policy data is missing some required columns: {missing_cols_display}. Using all available numeric columns instead.")
369
- pol_data = pol_data_full.select_dtypes(include=np.number)
370
-
371
- if pol_data.index.name != 'policy_id': # safety check if index was lost
372
- gr.Error("Policy data lost 'policy_id' index during processing.")
373
- # Attempt to recover if 'policy_id' is a column
374
- if 'policy_id' in pol_data.columns:
375
- pol_data = pol_data.set_index('policy_id')
376
- else: # cannot proceed with pol_data
377
- pol_data = pd.DataFrame() # Make it empty to signal issues later
378
-
379
- pvs = read_and_prep_excel(pv_base_path)
380
- pvs_lapse50 = read_and_prep_excel(pv_lapse_path)
381
- pvs_mort15 = read_and_prep_excel(pv_mort_path)
382
 
383
  cfs_list = [cfs, cfs_lapse50, cfs_mort15]
384
  scen_titles = ['Base', 'Lapse+50%', 'Mort+15%']
385
 
386
  results = {}
 
387
  mean_attrs = {'age_at_entry':'mean', 'policy_term':'mean', 'duration_mth':'mean', 'sum_assured': 'sum'}
388
 
389
  # --- 1. Cashflow Calibration ---
390
- if cfs.empty: gr.Warning("Base cashflow data (cfs) is empty. CF Calib may fail or produce no results.")
391
- cluster_cfs = Clusters(cfs)
392
  results['cf_total_base_table'] = cluster_cfs.compare_total(cfs)
393
- results['cf_policy_attrs_total'] = cluster_cfs.compare_total(pol_data, agg=mean_attrs) if not pol_data.empty else pd.DataFrame()
394
- results['cf_pv_total_base'] = cluster_cfs.compare_total(pvs) if not pvs.empty else pd.DataFrame()
395
- results['cf_pv_total_lapse'] = cluster_cfs.compare_total(pvs_lapse50) if not pvs_lapse50.empty else pd.DataFrame()
396
- results['cf_pv_total_mort'] = cluster_cfs.compare_total(pvs_mort15) if not pvs_mort15.empty else pd.DataFrame()
 
 
397
  results['cf_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_cfs, scen_titles)
398
  results['cf_scatter_cashflows_base'] = plot_scatter_comparison(cluster_cfs.compare(cfs), 'Cashflow Calib. - Cashflows (Base)')
399
 
400
-
401
  # --- 2. Policy Attribute Calibration ---
402
- loc_vars_attrs_scaled = pd.DataFrame() # Initialize
403
- if not pol_data.empty:
404
- min_vals, max_vals = pol_data.min(), pol_data.max()
405
- range_vals = max_vals - min_vals
406
- if (range_vals.abs() < 1e-9).all(): # Check if all ranges are effectively zero
407
- gr.Warning("Policy data for attribute calibration has no variance. Using unscaled data (0s).")
408
- loc_vars_attrs_scaled = pd.DataFrame(0, index=pol_data.index, columns=pol_data.columns)
409
- else:
410
- loc_vars_attrs_scaled = pol_data.copy()
411
- for col in range_vals.index:
412
- if range_vals[col] > 1e-9:
413
- loc_vars_attrs_scaled[col] = (pol_data[col] - min_vals[col]) / range_vals[col]
414
- else:
415
- loc_vars_attrs_scaled[col] = 0.0 # Column with no variance becomes 0
416
- loc_vars_attrs_scaled = loc_vars_attrs_scaled.fillna(0) # Handle any NaNs from division by zero if range_vals was exactly 0
417
  else:
418
- gr.Warning("Policy data is empty. Skipping attribute calibration.")
419
-
420
- if not loc_vars_attrs_scaled.empty:
421
- cluster_attrs = Clusters(loc_vars_attrs_scaled)
422
- results['attr_total_cf_base'] = cluster_attrs.compare_total(cfs) if not cfs.empty else pd.DataFrame()
423
- results['attr_policy_attrs_total'] = cluster_attrs.compare_total(pol_data, agg=mean_attrs) # Compare with original pol_data
424
- results['attr_total_pv_base'] = cluster_attrs.compare_total(pvs) if not pvs.empty else pd.DataFrame()
 
425
  results['attr_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_attrs, scen_titles)
426
  results['attr_scatter_cashflows_base'] = plot_scatter_comparison(cluster_attrs.compare(cfs), 'Policy Attr. Calib. - Cashflows (Base)')
427
  else:
428
  results['attr_total_cf_base'] = pd.DataFrame()
429
  results['attr_policy_attrs_total'] = pd.DataFrame()
430
  results['attr_total_pv_base'] = pd.DataFrame()
431
- results['attr_cashflow_plot'] = plot_cashflows_comparison([None,None,None], None, scen_titles) # Pass None to get placeholder
432
- results['attr_scatter_cashflows_base'] = plot_scatter_comparison(pd.DataFrame(), 'Policy Attr. Calib. - No Data')
 
433
 
434
  # --- 3. Present Value Calibration ---
435
- if pvs.empty: gr.Warning("Base Present Value data (pvs) is empty. PV Calib may fail or produce no results.")
436
  cluster_pvs = Clusters(pvs)
437
- results['pv_total_cf_base'] = cluster_pvs.compare_total(cfs) if not cfs.empty else pd.DataFrame()
438
- results['pv_policy_attrs_total'] = cluster_pvs.compare_total(pol_data, agg=mean_attrs) if not pol_data.empty else pd.DataFrame()
 
 
439
  results['pv_total_pv_base'] = cluster_pvs.compare_total(pvs)
440
- results['pv_total_pv_lapse'] = cluster_pvs.compare_total(pvs_lapse50) if not pvs_lapse50.empty else pd.DataFrame()
441
- results['pv_total_pv_mort'] = cluster_pvs.compare_total(pvs_mort15) if not pvs_mort15.empty else pd.DataFrame()
 
442
  results['pv_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_pvs, scen_titles)
443
  results['pv_scatter_pvs_base'] = plot_scatter_comparison(cluster_pvs.compare(pvs), 'PV Calib. - PVs (Base)')
444
 
445
-
446
  # --- Summary Comparison Plot Data ---
447
  error_data = {}
448
- def get_error_safe(compare_result_df, col_name=None):
449
- if compare_result_df is None or compare_result_df.empty or 'error' not in compare_result_df.columns:
 
450
  return np.nan
451
- if col_name and col_name in compare_result_df.index:
452
- error_val = compare_result_df.loc[col_name, 'error']
453
- return abs(error_val) if pd.notna(error_val) else np.nan
 
454
  else:
455
- valid_errors = compare_result_df['error'].dropna()
456
- return abs(valid_errors).mean() if not valid_errors.empty else np.nan
457
 
458
  key_pv_col = None
459
- if not pvs.empty:
460
- for potential_col in ['PV_NetCF', 'pv_net_cf', 'net_cf_pv', 'PV_Net_CF']:
461
- if potential_col in pvs.columns:
462
- key_pv_col = potential_col
 
 
 
 
463
  break
464
-
 
 
 
 
 
 
465
  error_data['CF Calib.'] = [
466
  get_error_safe(results.get('cf_pv_total_base'), key_pv_col),
467
  get_error_safe(results.get('cf_pv_total_lapse'), key_pv_col),
468
  get_error_safe(results.get('cf_pv_total_mort'), key_pv_col)
469
  ]
470
 
471
- if not loc_vars_attrs_scaled.empty and 'cluster_attrs' in locals() : # Check if attribute calibration was performed
472
- error_data['Attr Calib.'] = [
473
- get_error_safe(results.get('attr_total_pv_base'), key_pv_col),
474
- get_error_safe(cluster_attrs.compare_total(pvs_lapse50) if not pvs_lapse50.empty else pd.DataFrame(), key_pv_col),
475
- get_error_safe(cluster_attrs.compare_total(pvs_mort15) if not pvs_mort15.empty else pd.DataFrame(), key_pv_col)
476
  ]
477
  else:
478
  error_data['Attr Calib.'] = [np.nan, np.nan, np.nan]
@@ -483,72 +334,74 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
483
  get_error_safe(results.get('pv_total_pv_mort'), key_pv_col)
484
  ]
485
 
486
- summary_df = pd.DataFrame(error_data, index=['Base', 'Lapse+50%', 'Mort+15%']).astype(float)
487
 
488
- fig_summary, ax_summary = plt.subplots(figsize=(10, 6))
489
  sns.set_style("whitegrid")
490
-
491
- summary_df_melted = summary_df.reset_index().rename(columns={'index': 'Scenario'})
492
- summary_df_melted = summary_df_melted.melt(id_vars='Scenario', var_name='Calibration Method', value_name='Absolute Error Rate')
493
-
494
- sns.barplot(x='Scenario', y='Absolute Error Rate', hue='Calibration Method', data=summary_df_melted, ax=ax_summary)
495
 
496
- ax_summary.set_ylabel('Absolute Error Rate')
497
- title_suffix = f' for {key_pv_col}' if key_pv_col else ' (Mean Absolute Error)'
 
 
 
 
 
 
 
498
  ax_summary.set_title(f'Calibration Method Comparison - Error in Total PV{title_suffix}')
 
499
  ax_summary.tick_params(axis='x', rotation=0)
500
- if ax_summary.get_legend(): ax_summary.get_legend().set_title('Calibration Method')
501
- ax_summary.grid(True, axis='y')
502
-
503
- plt.tight_layout()
504
  buf_summary = io.BytesIO()
505
- plt.savefig(buf_summary, format='png', dpi=100)
506
  buf_summary.seek(0)
507
  results['summary_plot'] = Image.open(buf_summary)
508
- plt.close(fig_summary)
509
 
510
  return results
511
 
512
  except FileNotFoundError as e:
513
- gr.Error(f"File not found: {e.filename}. Please ensure example files are in '{EXAMPLE_DATA_DIR}' or all files are uploaded and paths are correct for Hugging Face.")
514
  return {"error": f"File not found: {e.filename}"}
515
  except KeyError as e:
516
- gr.Error(f"A required column/index ('policy_id' or feature column) is missing or misnamed: {e}. Please check data format and Hugging Face file structure.")
517
- import traceback
518
- traceback.print_exc()
519
- return {"error": f"Missing column/index: {e}"}
520
  except ValueError as e:
521
- gr.Error(f"Data processing or plotting error: {str(e)}. Check data consistency and formats.")
522
- import traceback
523
- traceback.print_exc()
524
- return {"error": f"Data error: {str(e)}"}
525
  except Exception as e:
526
- gr.Error(f"An unexpected error occurred: {str(e)}. Check logs for details.")
527
  import traceback
528
- traceback.print_exc()
529
- return {"error": f"Unexpected error: {str(e)}"}
 
 
530
 
531
  def create_interface():
532
- with gr.Blocks(title="Cluster Model Points Analysis") as demo:
533
  gr.Markdown("""
534
- # Cluster Model Points Analysis
 
535
  This application applies cluster analysis to model point selection for insurance portfolios.
536
  Upload your Excel files or use the example data to analyze cashflows, policy attributes, and present values using different calibration methods.
 
537
  **Required Files (Excel .xlsx):**
538
- - Cashflows - Base Scenario (must contain a 'policy_id' column/index)
539
- - Cashflows - Lapse Stress (+50%) (similar structure)
540
- - Cashflows - Mortality Stress (+15%) (similar structure)
541
- - Policy Data (must contain 'policy_id', and ideally 'age_at_entry', 'policy_term', 'sum_assured', 'duration_mth')
542
- - Present Values - Base Scenario (must contain 'policy_id' and PV columns like 'PV_NetCF')
543
- - Present Values - Lapse Stress (similar structure)
544
- - Present Values - Mortality Stress (similar structure)
545
- *Note: Ensure your files are in the `eg_data` directory in your Hugging Face Space if using 'Load Example Data'.*
546
  """)
547
 
548
  with gr.Row():
549
  with gr.Column(scale=1):
550
- gr.Markdown("### Upload Files or Load Examples")
551
- load_example_btn = gr.Button("Load Example Data")
 
 
552
  with gr.Row():
553
  cashflow_base_input = gr.File(label="Cashflows - Base", file_types=[".xlsx"])
554
  cashflow_lapse_input = gr.File(label="Cashflows - Lapse Stress", file_types=[".xlsx"])
@@ -559,86 +412,115 @@ def create_interface():
559
  pv_lapse_input = gr.File(label="Present Values - Lapse Stress", file_types=[".xlsx"])
560
  with gr.Row():
561
  pv_mort_input = gr.File(label="Present Values - Mortality Stress", file_types=[".xlsx"])
562
- analyze_btn = gr.Button("Analyze Dataset", variant="primary", size="lg")
 
563
 
564
  with gr.Tabs():
565
  with gr.TabItem("📊 Summary"):
566
- summary_plot_output = gr.Image(label="Calibration Methods Comparison", type="pil")
 
567
  with gr.TabItem("💸 Cashflow Calibration"):
568
  gr.Markdown("### Results: Using Annual Cashflows as Calibration Variables")
569
  with gr.Row():
570
- cf_total_base_table_out = gr.DataFrame(label="Overall Comparison - Base Scenario (Cashflows)")
571
- cf_policy_attrs_total_out = gr.DataFrame(label="Overall Comparison - Policy Attributes")
572
- cf_cashflow_plot_out = gr.Image(label="Cashflow Value Comparisons (Actual vs. Estimate) Across Scenarios", type="pil")
573
- cf_scatter_cashflows_base_out = gr.Image(label="Scatter Plot - Per-Cluster Cashflows (Base Scenario)", type="pil")
574
  with gr.Accordion("Present Value Comparisons (Total)", open=False):
575
  with gr.Row():
576
- cf_pv_total_base_out = gr.DataFrame(label="PVs - Base Total")
577
- cf_pv_total_lapse_out = gr.DataFrame(label="PVs - Lapse Stress Total")
578
- cf_pv_total_mort_out = gr.DataFrame(label="PVs - Mortality Stress Total")
 
579
  with gr.TabItem("👤 Policy Attribute Calibration"):
580
  gr.Markdown("### Results: Using Policy Attributes as Calibration Variables")
581
  with gr.Row():
582
- attr_total_cf_base_out = gr.DataFrame(label="Overall Comparison - Base Scenario (Cashflows)")
583
- attr_policy_attrs_total_out = gr.DataFrame(label="Overall Comparison - Policy Attributes")
584
- attr_cashflow_plot_out = gr.Image(label="Cashflow Value Comparisons (Actual vs. Estimate) Across Scenarios", type="pil")
585
- attr_scatter_cashflows_base_out = gr.Image(label="Scatter Plot - Per-Cluster Cashflows (Base Scenario)", type="pil")
586
  with gr.Accordion("Present Value Comparisons (Total)", open=False):
587
- attr_total_pv_base_out = gr.DataFrame(label="PVs - Base Scenario Total")
 
588
  with gr.TabItem("💰 Present Value Calibration"):
589
  gr.Markdown("### Results: Using Present Values (Base Scenario) as Calibration Variables")
590
  with gr.Row():
591
- pv_total_cf_base_out = gr.DataFrame(label="Overall Comparison - Base Scenario (Cashflows)")
592
- pv_policy_attrs_total_out = gr.DataFrame(label="Overall Comparison - Policy Attributes")
593
- pv_cashflow_plot_out = gr.Image(label="Cashflow Value Comparisons (Actual vs. Estimate) Across Scenarios", type="pil")
594
- pv_scatter_pvs_base_out = gr.Image(label="Scatter Plot - Per-Cluster Present Values (Base Scenario)", type="pil")
595
  with gr.Accordion("Present Value Comparisons (Total)", open=False):
596
  with gr.Row():
597
- pv_total_pv_base_out = gr.DataFrame(label="PVs - Base Total")
598
- pv_total_pv_lapse_out = gr.DataFrame(label="PVs - Lapse Stress Total")
599
- pv_total_pv_mort_out = gr.DataFrame(label="PVs - Mortality Stress Total")
600
 
 
601
        def get_all_output_components():
            # Ordered list of every output widget; the order must match the
            # result list built by the analysis handler wired to analyze_btn.
            return [
                summary_plot_output, cf_total_base_table_out, cf_policy_attrs_total_out,
                cf_cashflow_plot_out, cf_scatter_cashflows_base_out, cf_pv_total_base_out,
                cf_pv_total_lapse_out, cf_pv_total_mort_out, attr_total_cf_base_out,
                attr_policy_attrs_total_out, attr_cashflow_plot_out, attr_scatter_cashflows_base_out,
                attr_total_pv_base_out, pv_total_cf_base_out, pv_policy_attrs_total_out,
                pv_cashflow_plot_out, pv_scatter_pvs_base_out, pv_total_pv_base_out,
                pv_total_pv_lapse_out, pv_total_pv_mort_out
            ]
611
 
612
        def handle_analysis(f1, f2, f3, f4, f5, f6, f7):
            """Validate the seven uploaded file paths, run process_files, and
            map the results dict onto the output components (same order as
            get_all_output_components). Returns all Nones on any failure."""
            files = [f1, f2, f3, f4, f5, f6, f7]
            file_paths = []
            # In Gradio 4+, File component value is a string path to a temp file or None
            for i, file_obj in enumerate(files):
                if file_obj is None:  # No file uploaded for this slot
                    gr.Error(f"Missing file for input {i+1}. Please upload all required files or use 'Load Example Data'.")
                    return [None] * len(get_all_output_components())
                # The object from gr.File is already the path string
                file_paths.append(file_obj)

            results = process_files(*file_paths)

            if "error" in results:  # If process_files indicated an error
                # Error message already shown by gr.Error in process_files
                # Return Nones to clear outputs
                return [None] * len(get_all_output_components())

            return [
                results.get('summary_plot'), results.get('cf_total_base_table'),
                results.get('cf_policy_attrs_total'), results.get('cf_cashflow_plot'),
                results.get('cf_scatter_cashflows_base'), results.get('cf_pv_total_base'),
                results.get('cf_pv_total_lapse'), results.get('cf_pv_total_mort'),
                results.get('attr_total_cf_base'), results.get('attr_policy_attrs_total'),
                results.get('attr_cashflow_plot'), results.get('attr_scatter_cashflows_base'),
                results.get('attr_total_pv_base'), results.get('pv_total_cf_base'),
                results.get('pv_policy_attrs_total'), results.get('pv_cashflow_plot'),
                results.get('pv_scatter_pvs_base'), results.get('pv_total_pv_base'),
                results.get('pv_total_pv_lapse'), results.get('pv_total_pv_mort')
            ]
643
 
644
  analyze_btn.click(
@@ -648,41 +530,31 @@ def create_interface():
648
  outputs=get_all_output_components()
649
  )
650
 
 
651
  def load_example_files():
652
- os.makedirs(EXAMPLE_DATA_DIR, exist_ok=True)
653
- # Check for existing files and create dummies if not present
654
- for key, target_path in EXAMPLE_FILES.items():
655
- if not os.path.exists(target_path):
656
- gr.Info(f"Example file {os.path.basename(target_path)} not found in '{EXAMPLE_DATA_DIR}'. Attempting to create a dummy file.")
657
- try:
658
- num_policies = 50
659
- df_data = {'policy_id': [f'P{j:03d}' for j in range(num_policies)]}
660
- if "cashflow" in key or "pv" in key:
661
- for i in range(10): df_data[f't{i}'] = np.random.uniform(100, 1000, num_policies)
662
- elif "policy_data" in key:
663
- df_data.update({
664
- 'age_at_entry': np.random.randint(20, 60, num_policies),
665
- 'policy_term': np.random.randint(5, 30, num_policies),
666
- 'sum_assured': np.random.randint(5000, 200000, num_policies),
667
- 'duration_mth': np.random.randint(1, 300, num_policies)
668
- })
669
- else: df_data['feature1'] = np.random.rand(num_policies)
670
- pd.DataFrame(df_data).to_excel(target_path, index=False)
671
- gr.Info(f"Dummy file '{os.path.basename(target_path)}' created.")
672
- except Exception as e:
673
- gr.Error(f"Failed to create dummy file {os.path.basename(target_path)}: {e}")
674
- return [None] * 7 # Abort if dummy creation fails
675
-
676
- # Verify all files exist after potential dummy creation
677
- if any(not os.path.exists(f) for f in EXAMPLE_FILES.values()):
678
- gr.Error(f"One or more example files are still missing from '{EXAMPLE_DATA_DIR}' after attempting to create dummies. Please check permissions or provide the files.")
679
- return [None] * 7
680
-
681
- gr.Info("Example data loaded. Click 'Analyze Dataset'.")
682
- return [
683
- EXAMPLE_FILES["cashflow_base"], EXAMPLE_FILES["cashflow_lapse"], EXAMPLE_FILES["cashflow_mort"],
684
- EXAMPLE_FILES["policy_data"], EXAMPLE_FILES["pv_base"], EXAMPLE_FILES["pv_lapse"],
685
- EXAMPLE_FILES["pv_mort"]
686
  ]
687
 
688
  load_example_btn.click(
@@ -695,11 +567,12 @@ def create_interface():
695
  return demo
696
 
697
  if __name__ == "__main__":
698
- # When running locally, ensure eg_data exists.
699
- # Dummy file creation is now handled by load_example_files if needed.
700
  if not os.path.exists(EXAMPLE_DATA_DIR):
701
  os.makedirs(EXAMPLE_DATA_DIR)
702
- print(f"Directory '{EXAMPLE_DATA_DIR}' created/ensured. Example files will be checked/created by 'Load Example Data' button if not present.")
 
 
 
703
 
704
  demo_app = create_interface()
705
  demo_app.launch()
 
2
  import numpy as np
3
  import pandas as pd
4
  from sklearn.cluster import KMeans
5
+ from sklearn.metrics import pairwise_distances_argmin_min
6
+ # import matplotlib.pyplot as plt # Replaced with seaborn
7
+ # import matplotlib.cm # Replaced with seaborn palettes
8
+ import seaborn as sns # Added Seaborn
9
  import io
10
  import os
11
  from PIL import Image
12
 
13
  # Define the paths for example data
 
14
  EXAMPLE_DATA_DIR = "eg_data"
15
  EXAMPLE_FILES = {
16
  "cashflow_base": os.path.join(EXAMPLE_DATA_DIR, "cashflows_seriatim_10K.xlsx"),
 
24
 
25
  class Clusters:
26
  def __init__(self, loc_vars):
27
+ self.kmeans = kmeans = KMeans(n_clusters=1000, random_state=0, n_init=10).fit(np.ascontiguousarray(loc_vars))
28
+ closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, np.ascontiguousarray(loc_vars))
 
 
 
 
 
29
 
30
+ rep_ids = pd.Series(data=(closest+1)) # 0-based to 1-based indexes
31
  rep_ids.name = 'policy_id'
32
  rep_ids.index.name = 'cluster_id'
33
  self.rep_ids = rep_ids
34
 
35
+ self.policy_count = self.agg_by_cluster(pd.DataFrame({'policy_count': [1] * len(loc_vars)}))['policy_count']
36
 
37
  def agg_by_cluster(self, df, agg=None):
38
+ """Aggregate columns by cluster"""
39
  temp = df.copy()
40
  temp['cluster_id'] = self.kmeans.labels_
41
  temp = temp.set_index('cluster_id')
42
+ agg = {c: (agg[c] if agg and c in agg else 'sum') for c in temp.columns} if agg else "sum"
43
+ return temp.groupby(temp.index).agg(agg)
 
 
44
 
45
  def extract_reps(self, df):
46
+ """Extract the rows of representative policies"""
47
+ temp = pd.merge(self.rep_ids, df.reset_index(), how='left', on='policy_id')
48
+ temp.index.name = 'cluster_id'
49
+ return temp.drop('policy_id', axis=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  def extract_and_scale_reps(self, df, agg=None):
52
+ """Extract and scale the rows of representative policies"""
53
  if agg:
54
+ cols = df.columns
55
+ mult = pd.DataFrame({c: (self.policy_count if (c not in agg or agg[c] == 'sum') else 1) for c in cols})
56
+ extracted_df = self.extract_reps(df)
57
+ mult.index = extracted_df.index
58
+ return extracted_df.mul(mult)
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  else:
60
+ return self.extract_reps(df).mul(self.policy_count, axis=0)
 
 
 
 
 
 
 
 
61
 
62
  def compare(self, df, agg=None):
63
+ """Returns a multi-indexed Dataframe comparing actual and estimate"""
64
  source = self.agg_by_cluster(df, agg)
65
  target = self.extract_and_scale_reps(df, agg)
66
+ return pd.DataFrame({'actual': source.stack(), 'estimate':target.stack()})
 
 
 
 
 
 
 
 
 
67
 
68
  def compare_total(self, df, agg=None):
69
+ """Aggregate df by columns"""
70
  if agg:
71
  actual_values = {}
72
  for col in df.columns:
73
  if agg.get(col, 'sum') == 'mean':
74
  actual_values[col] = df[col].mean()
75
+ else: # sum
76
  actual_values[col] = df[col].sum()
77
  actual = pd.Series(actual_values)
78
 
79
  reps_unscaled = self.extract_reps(df)
80
  estimate_values = {}
81
 
82
+ for col in df.columns:
83
+ if agg.get(col, 'sum') == 'mean':
84
+ weighted_sum = (reps_unscaled[col] * self.policy_count).sum()
85
+ total_weight = self.policy_count.sum()
86
+ estimate_values[col] = weighted_sum / total_weight if total_weight > 0 else 0
87
+ else: # sum
88
+ estimate_values[col] = (reps_unscaled[col] * self.policy_count).sum()
89
+
 
 
 
 
 
 
90
  estimate = pd.Series(estimate_values)
91
+
92
+ else: # Original logic if no agg is specified (all sum)
93
  actual = df.sum()
94
+ estimate = self.extract_and_scale_reps(df).sum()
95
 
96
+ error = np.where(actual != 0, estimate / actual - 1, 0)
 
 
97
 
98
  return pd.DataFrame({'actual': actual, 'estimate': estimate, 'error': error})
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
+ def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
102
+ """Create cashflow comparison plots using Seaborn"""
103
+ if not cfs_list or not cluster_obj or not titles:
104
+ return None
105
+ num_plots = len(cfs_list)
106
+ if num_plots == 0:
107
+ return None
108
 
 
109
  cols = 2
110
  rows = (num_plots + cols - 1) // cols
111
 
112
+ # Use matplotlib's subplots for layout, Seaborn will plot on these axes
113
+ fig, axes = sns.plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
114
  axes = axes.flatten()
115
+ sns.set_style("whitegrid") # Apply Seaborn style
116
+
117
+ for i, (df, title) in enumerate(zip(cfs_list, titles)):
118
+ if i < len(axes):
119
+ ax = axes[i]
120
+ comparison = cluster_obj.compare_total(df)
121
+ # Melt dataframe for Seaborn lineplot
122
+ plot_data = comparison[['actual', 'estimate']].reset_index().melt(
123
+ id_vars='index', var_name='Category', value_name='Value'
124
+ )
125
+ sns.lineplot(x='index', y='Value', hue='Category', data=plot_data, ax=ax, marker="o")
 
 
 
 
 
 
 
 
 
 
 
 
126
  ax.set_title(title)
127
  ax.set_xlabel('Time')
128
  ax.set_ylabel('Value')
129
+ if not plot_data.empty: # Add legend if data exists
130
+ ax.legend(title='Category')
131
+
132
+ for j in range(i + 1, len(axes)):
133
  fig.delaxes(axes[j])
134
 
135
+ sns.plt.tight_layout()
136
  buf = io.BytesIO()
137
+ sns.plt.savefig(buf, format='png', dpi=100)
138
  buf.seek(0)
139
  img = Image.open(buf)
140
+ sns.plt.close(fig) # Use sns.plt to close
141
  return img
142
 
143
  def plot_scatter_comparison(df_compare_output, title):
144
+ """Create scatter plot comparison from compare() output using Seaborn"""
 
 
 
 
145
  if df_compare_output is None or df_compare_output.empty:
146
+ fig, ax = sns.plt.subplots(figsize=(12, 8)) # Use sns.plt
147
+ sns.set_style("whitegrid")
148
  ax.text(0.5, 0.5, "No data to display", ha='center', va='center', fontsize=15)
149
  ax.set_title(title)
150
+ buf = io.BytesIO()
151
+ sns.plt.savefig(buf, format='png', dpi=100)
152
+ buf.seek(0)
153
+ img = Image.open(buf)
154
+ sns.plt.close(fig)
155
+ return img
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
+ fig, ax = sns.plt.subplots(figsize=(12, 8)) # Use sns.plt
158
+ sns.set_style("whitegrid")
159
+
160
+ hue_col = None
161
+ plot_data = df_compare_output.copy()
162
+
163
+ if isinstance(df_compare_output.index, pd.MultiIndex) and df_compare_output.index.nlevels >= 2:
164
+ gr.Info("Plotting with multiple item levels.")
165
+ # Prepare data for seaborn: reset index to use levels as columns
166
+ plot_data = df_compare_output.reset_index()
167
+ hue_col = df_compare_output.index.names[1] # Use the second level for hue
168
+ if hue_col is None or hue_col == "": # Handle unnamed index level
169
+ hue_col = "item_level_1"
170
+ plot_data.rename(columns={plot_data.columns[1]: hue_col}, inplace=True)
171
+
172
+ num_unique_hue = plot_data[hue_col].nunique()
173
+ palette = "viridis" # Default seaborn palette
174
+ if num_unique_hue > 10 : # If too many categories, don't use hue or use a simpler palette
175
+ palette = sns.color_palette("husl", num_unique_hue)
176
+
177
+
178
+ sns.scatterplot(x='actual', y='estimate', hue=hue_col if num_unique_hue <= 20 else None,
179
+ data=plot_data, ax=ax, s=20, alpha=0.7, palette=palette)
180
+ if hue_col and num_unique_hue > 1 and num_unique_hue <= 10:
181
+ ax.legend(title=hue_col)
182
+ elif num_unique_hue > 10:
183
+ ax.legend().set_visible(False) # Hide legend if too many items
184
+ else:
185
+ gr.Warning("Scatter plot data is not in the expected multi-index format or has fewer than 2 levels. Plotting raw actual vs estimate without hue.")
186
+ sns.scatterplot(x='actual', y='estimate', data=plot_data, ax=ax, s=20, alpha=0.7)
187
 
188
  ax.set_xlabel('Actual')
189
  ax.set_ylabel('Estimate')
190
+ ax.set_title(title)
191
+
192
+ # Draw identity line
193
+ lims = [
194
+ np.min([ax.get_xlim(), ax.get_ylim()]),
195
+ np.max([ax.get_xlim(), ax.get_ylim()]),
196
+ ]
197
+ if lims[0] != lims[1] and np.isfinite(lims[0]) and np.isfinite(lims[1]): # Check for valid limits
198
+ ax.plot(lims, lims, 'r-', linewidth=0.7, alpha=0.8, zorder=0)
199
+ ax.set_xlim(lims)
200
+ ax.set_ylim(lims)
201
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  buf = io.BytesIO()
203
+ sns.plt.savefig(buf, format='png', dpi=100)
204
  buf.seek(0)
205
  img = Image.open(buf)
206
+ sns.plt.close(fig)
207
  return img
208
 
209
+
210
  def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
211
  policy_data_path, pv_base_path, pv_lapse_path, pv_mort_path):
212
+ """Main processing function - now accepts file paths"""
213
  try:
214
+ cfs = pd.read_excel(cashflow_base_path, index_col=0)
215
+ cfs_lapse50 = pd.read_excel(cashflow_lapse_path, index_col=0)
216
+ cfs_mort15 = pd.read_excel(cashflow_mort_path, index_col=0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
 
218
+ pol_data_full = pd.read_excel(policy_data_path, index_col=0)
219
+ required_cols = ['age_at_entry', 'policy_term', 'sum_assured', 'duration_mth']
220
+ if all(col in pol_data_full.columns for col in required_cols):
221
+ pol_data = pol_data_full[required_cols]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  else:
223
+ gr.Warning(f"Policy data might be missing required columns. Found: {pol_data_full.columns.tolist()}")
224
+ pol_data = pol_data_full
225
+
226
+ pvs = pd.read_excel(pv_base_path, index_col=0)
227
+ pvs_lapse50 = pd.read_excel(pv_lapse_path, index_col=0)
228
+ pvs_mort15 = pd.read_excel(pv_mort_path, index_col=0)
 
 
 
 
 
 
 
 
 
229
 
230
  cfs_list = [cfs, cfs_lapse50, cfs_mort15]
231
  scen_titles = ['Base', 'Lapse+50%', 'Mort+15%']
232
 
233
  results = {}
234
+
235
  mean_attrs = {'age_at_entry':'mean', 'policy_term':'mean', 'duration_mth':'mean', 'sum_assured': 'sum'}
236
 
237
  # --- 1. Cashflow Calibration ---
238
+ cluster_cfs = Clusters(cfs)
239
+
240
  results['cf_total_base_table'] = cluster_cfs.compare_total(cfs)
241
+ results['cf_policy_attrs_total'] = cluster_cfs.compare_total(pol_data, agg=mean_attrs)
242
+
243
+ results['cf_pv_total_base'] = cluster_cfs.compare_total(pvs)
244
+ results['cf_pv_total_lapse'] = cluster_cfs.compare_total(pvs_lapse50)
245
+ results['cf_pv_total_mort'] = cluster_cfs.compare_total(pvs_mort15)
246
+
247
  results['cf_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_cfs, scen_titles)
248
  results['cf_scatter_cashflows_base'] = plot_scatter_comparison(cluster_cfs.compare(cfs), 'Cashflow Calib. - Cashflows (Base)')
249
 
 
250
  # --- 2. Policy Attribute Calibration ---
251
+ if not pol_data.empty and not pol_data.isnull().all().all() and (pol_data.max(numeric_only=True) - pol_data.min(numeric_only=True)).sum() != 0: # Check for actual variance
252
+ loc_vars_attrs = (pol_data - pol_data.min()) / (pol_data.max() - pol_data.min())
253
+ loc_vars_attrs = loc_vars_attrs.fillna(0) # Handle potential NaNs after division if a column is constant
 
 
 
 
 
 
 
 
 
 
 
 
254
  else:
255
+ gr.Warning("Policy data for attribute calibration is empty, all NaNs, or has no variance. Skipping attribute calibration plots.")
256
+ loc_vars_attrs = pol_data # or pd.DataFrame() if you want to ensure it's empty
257
+
258
+ if not loc_vars_attrs.empty:
259
+ cluster_attrs = Clusters(loc_vars_attrs)
260
+ results['attr_total_cf_base'] = cluster_attrs.compare_total(cfs)
261
+ results['attr_policy_attrs_total'] = cluster_attrs.compare_total(pol_data, agg=mean_attrs)
262
+ results['attr_total_pv_base'] = cluster_attrs.compare_total(pvs)
263
  results['attr_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_attrs, scen_titles)
264
  results['attr_scatter_cashflows_base'] = plot_scatter_comparison(cluster_attrs.compare(cfs), 'Policy Attr. Calib. - Cashflows (Base)')
265
  else:
266
  results['attr_total_cf_base'] = pd.DataFrame()
267
  results['attr_policy_attrs_total'] = pd.DataFrame()
268
  results['attr_total_pv_base'] = pd.DataFrame()
269
+ results['attr_cashflow_plot'] = plot_scatter_comparison(None, "Policy Attr. Calib. - Cashflows (Base) - No Data") # Generate blank plot
270
+ results['attr_scatter_cashflows_base'] = plot_scatter_comparison(None, "Policy Attr. Calib. - Scatter - No Data")
271
+
272
 
273
  # --- 3. Present Value Calibration ---
 
274
  cluster_pvs = Clusters(pvs)
275
+
276
+ results['pv_total_cf_base'] = cluster_pvs.compare_total(cfs)
277
+ results['pv_policy_attrs_total'] = cluster_pvs.compare_total(pol_data, agg=mean_attrs)
278
+
279
  results['pv_total_pv_base'] = cluster_pvs.compare_total(pvs)
280
+ results['pv_total_pv_lapse'] = cluster_pvs.compare_total(pvs_lapse50)
281
+ results['pv_total_pv_mort'] = cluster_pvs.compare_total(pvs_mort15)
282
+
283
  results['pv_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_pvs, scen_titles)
284
  results['pv_scatter_pvs_base'] = plot_scatter_comparison(cluster_pvs.compare(pvs), 'PV Calib. - PVs (Base)')
285
 
 
286
  # --- Summary Comparison Plot Data ---
287
  error_data = {}
288
+
289
+ def get_error_safe(compare_result, col_name=None):
290
+ if compare_result is None or compare_result.empty:
291
  return np.nan
292
+ if col_name and col_name in compare_result.index:
293
+ return abs(compare_result.loc[col_name, 'error'])
294
+ elif 'error' in compare_result.columns:
295
+ return abs(compare_result['error']).mean()
296
  else:
297
+ return np.nan # Should not happen if compare_result is valid
 
298
 
299
  key_pv_col = None
300
+ for potential_col in ['PV_NetCF', 'pv_net_cf', 'net_cf_pv', 'PV_Net_CF', 'PV NET CF']: # Added more common names
301
+ if potential_col in pvs.columns:
302
+ key_pv_col = potential_col
303
+ break
304
+ # Case insensitive check
305
+ for col in pvs.columns:
306
+ if col.lower() == potential_col.lower():
307
+ key_pv_col = col
308
  break
309
+ if key_pv_col:
310
+ break
311
+
312
+ if not key_pv_col and not pvs.empty:
313
+ gr.Warning(f"Could not find a standard PV Net CF column in PV data. Using mean absolute error for all PV columns for summary. Columns available: {pvs.columns.tolist()}")
314
+
315
+
316
  error_data['CF Calib.'] = [
317
  get_error_safe(results.get('cf_pv_total_base'), key_pv_col),
318
  get_error_safe(results.get('cf_pv_total_lapse'), key_pv_col),
319
  get_error_safe(results.get('cf_pv_total_mort'), key_pv_col)
320
  ]
321
 
322
+ if not loc_vars_attrs.empty:
323
+ error_data['Attr Calib.'] = [
324
+ get_error_safe(results.get('attr_total_pv_base'), key_pv_col), # Assuming pvs is the right df here
325
+ get_error_safe(cluster_attrs.compare_total(pvs_lapse50), key_pv_col), # Recalculate for lapse scenario with attr cluster
326
+ get_error_safe(cluster_attrs.compare_total(pvs_mort15), key_pv_col) # Recalculate for mort scenario with attr cluster
327
  ]
328
  else:
329
  error_data['Attr Calib.'] = [np.nan, np.nan, np.nan]
 
334
  get_error_safe(results.get('pv_total_pv_mort'), key_pv_col)
335
  ]
336
 
337
+ summary_df = pd.DataFrame(error_data, index=['Base', 'Lapse+50%', 'Mort+15%'])
338
 
339
+ fig_summary, ax_summary = sns.plt.subplots(figsize=(10, 6)) # Use sns.plt
340
  sns.set_style("whitegrid")
 
 
 
 
 
341
 
342
+ # Melt the DataFrame for Seaborn barplot
343
+ summary_plot_data = summary_df.reset_index().melt(
344
+ id_vars='index', var_name='Calibration Method', value_name='Absolute Error Rate'
345
+ )
346
+
347
+ sns.barplot(x='index', y='Absolute Error Rate', hue='Calibration Method', data=summary_plot_data, ax=ax_summary, palette="muted")
348
+
349
+ ax_summary.set_ylabel('Absolute Error Rate (0.1 = 10%)')
350
+ title_suffix = f' (Key PV Column: {key_pv_col})' if key_pv_col else ' (Mean Absolute Error of PVs)'
351
  ax_summary.set_title(f'Calibration Method Comparison - Error in Total PV{title_suffix}')
352
+ ax_summary.set_xlabel('Scenario')
353
  ax_summary.tick_params(axis='x', rotation=0)
354
+ ax_summary.legend(title='Calibration Method')
355
+ sns.plt.tight_layout()
356
+
 
357
  buf_summary = io.BytesIO()
358
+ sns.plt.savefig(buf_summary, format='png', dpi=100)
359
  buf_summary.seek(0)
360
  results['summary_plot'] = Image.open(buf_summary)
361
+ sns.plt.close(fig_summary)
362
 
363
  return results
364
 
365
  except FileNotFoundError as e:
366
+ gr.Error(f"File not found: {e.filename}. Please ensure example files are in '{EXAMPLE_DATA_DIR}' or all files are uploaded.")
367
  return {"error": f"File not found: {e.filename}"}
368
  except KeyError as e:
369
+ gr.Error(f"A required column is missing from one of the excel files: {e}. Please check data format.")
370
+ return {"error": f"Missing column: {e}"}
 
 
371
  except ValueError as e:
372
+ gr.Error(f"ValueError during processing: {str(e)}. This might be due to empty data or data format issues (e.g. non-numeric data for clustering).")
373
+ return {"error": f"ValueError: {str(e)}"}
 
 
374
  except Exception as e:
 
375
  import traceback
376
+ print(traceback.format_exc()) # Print full traceback to console for debugging
377
+ gr.Error(f"An unexpected error occurred: {str(e)}. Check console for details.")
378
+ return {"error": f"Error processing files: {str(e)}"}
379
+
380
 
381
  def create_interface():
382
+ with gr.Blocks(theme=gr.themes.Soft(), title="Cluster Model Points Analysis") as demo: # Added a theme
383
  gr.Markdown("""
384
+ # Cluster Model Points Analysis 📊
385
+
386
  This application applies cluster analysis to model point selection for insurance portfolios.
387
  Upload your Excel files or use the example data to analyze cashflows, policy attributes, and present values using different calibration methods.
388
+
389
  **Required Files (Excel .xlsx):**
390
+ - Cashflows - Base Scenario
391
+ - Cashflows - Lapse Stress (+50%)
392
+ - Cashflows - Mortality Stress (+15%)
393
+ - Policy Data (including 'age_at_entry', 'policy_term', 'sum_assured', 'duration_mth')
394
+ - Present Values - Base Scenario
395
+ - Present Values - Lapse Stress
396
+ - Present Values - Mortality Stress
 
397
  """)
398
 
399
  with gr.Row():
400
  with gr.Column(scale=1):
401
+ gr.Markdown("### 📁 Upload Files or Load Examples")
402
+
403
+ load_example_btn = gr.Button("Load Example Data ✨", variant="secondary")
404
+
405
  with gr.Row():
406
  cashflow_base_input = gr.File(label="Cashflows - Base", file_types=[".xlsx"])
407
  cashflow_lapse_input = gr.File(label="Cashflows - Lapse Stress", file_types=[".xlsx"])
 
412
  pv_lapse_input = gr.File(label="Present Values - Lapse Stress", file_types=[".xlsx"])
413
  with gr.Row():
414
  pv_mort_input = gr.File(label="Present Values - Mortality Stress", file_types=[".xlsx"])
415
+
416
+ analyze_btn = gr.Button("Analyze Dataset 🚀", variant="primary", size="lg")
417
 
418
  with gr.Tabs():
419
  with gr.TabItem("📊 Summary"):
420
+ summary_plot_output = gr.Image(label="Calibration Methods Comparison")
421
+
422
  with gr.TabItem("💸 Cashflow Calibration"):
423
  gr.Markdown("### Results: Using Annual Cashflows as Calibration Variables")
424
  with gr.Row():
425
+ cf_total_base_table_out = gr.Dataframe(label="Overall Comparison - Base Scenario (Cashflows)", wrap=True, height=300)
426
+ cf_policy_attrs_total_out = gr.Dataframe(label="Overall Comparison - Policy Attributes", wrap=True, height=300)
427
+ cf_cashflow_plot_out = gr.Image(label="Cashflow Value Comparisons (Actual vs. Estimate) Across Scenarios")
428
+ cf_scatter_cashflows_base_out = gr.Image(label="Scatter Plot - Per-Cluster Cashflows (Base Scenario)")
429
  with gr.Accordion("Present Value Comparisons (Total)", open=False):
430
  with gr.Row():
431
+ cf_pv_total_base_out = gr.Dataframe(label="PVs - Base Total", wrap=True)
432
+ cf_pv_total_lapse_out = gr.Dataframe(label="PVs - Lapse Stress Total", wrap=True)
433
+ cf_pv_total_mort_out = gr.Dataframe(label="PVs - Mortality Stress Total", wrap=True)
434
+
435
  with gr.TabItem("👤 Policy Attribute Calibration"):
436
  gr.Markdown("### Results: Using Policy Attributes as Calibration Variables")
437
  with gr.Row():
438
+ attr_total_cf_base_out = gr.Dataframe(label="Overall Comparison - Base Scenario (Cashflows)", wrap=True, height=300)
439
+ attr_policy_attrs_total_out = gr.Dataframe(label="Overall Comparison - Policy Attributes", wrap=True, height=300)
440
+ attr_cashflow_plot_out = gr.Image(label="Cashflow Value Comparisons (Actual vs. Estimate) Across Scenarios")
441
+ attr_scatter_cashflows_base_out = gr.Image(label="Scatter Plot - Per-Cluster Cashflows (Base Scenario)")
442
  with gr.Accordion("Present Value Comparisons (Total)", open=False):
443
+ attr_total_pv_base_out = gr.Dataframe(label="PVs - Base Scenario Total (All Shocks)", wrap=True) # Changed label for clarity
444
+
445
  with gr.TabItem("💰 Present Value Calibration"):
446
  gr.Markdown("### Results: Using Present Values (Base Scenario) as Calibration Variables")
447
  with gr.Row():
448
+ pv_total_cf_base_out = gr.Dataframe(label="Overall Comparison - Base Scenario (Cashflows)", wrap=True, height=300)
449
+ pv_policy_attrs_total_out = gr.Dataframe(label="Overall Comparison - Policy Attributes", wrap=True, height=300)
450
+ pv_cashflow_plot_out = gr.Image(label="Cashflow Value Comparisons (Actual vs. Estimate) Across Scenarios")
451
+ pv_scatter_pvs_base_out = gr.Image(label="Scatter Plot - Per-Cluster Present Values (Base Scenario)")
452
  with gr.Accordion("Present Value Comparisons (Total)", open=False):
453
  with gr.Row():
454
+ pv_total_pv_base_out = gr.Dataframe(label="PVs - Base Total", wrap=True)
455
+ pv_total_pv_lapse_out = gr.Dataframe(label="PVs - Lapse Stress Total", wrap=True)
456
+ pv_total_pv_mort_out = gr.Dataframe(label="PVs - Mortality Stress Total", wrap=True)
457
 
458
+ # --- Helper function to prepare outputs ---
459
  def get_all_output_components():
460
  return [
461
+ summary_plot_output,
462
+ # Cashflow Calib Outputs
463
+ cf_total_base_table_out, cf_policy_attrs_total_out,
464
+ cf_cashflow_plot_out, cf_scatter_cashflows_base_out,
465
+ cf_pv_total_base_out, cf_pv_total_lapse_out, cf_pv_total_mort_out,
466
+ # Attribute Calib Outputs
467
+ attr_total_cf_base_out, attr_policy_attrs_total_out,
468
+ attr_cashflow_plot_out, attr_scatter_cashflows_base_out, attr_total_pv_base_out,
469
+ # PV Calib Outputs
470
+ pv_total_cf_base_out, pv_policy_attrs_total_out,
471
+ pv_cashflow_plot_out, pv_scatter_pvs_base_out,
472
+ pv_total_pv_base_out, pv_total_pv_lapse_out, pv_total_pv_mort_out
473
  ]
474
 
475
+ # --- Action for Analyze Button ---
476
+ def handle_analysis(f1, f2, f3, f4, f5, f6, f7, progress=gr.Progress(track_tqdm=True)):
477
  files = [f1, f2, f3, f4, f5, f6, f7]
478
+
479
  file_paths = []
480
+ file_labels = ["Cashflows - Base", "Cashflows - Lapse", "Cashflows - Mort",
481
+ "Policy Data", "PVs - Base", "PVs - Lapse", "PVs - Mort"]
482
+
483
+ for i, f_obj in enumerate(files):
484
+ if f_obj is None:
485
+ gr.Error(f"Missing file input for: {file_labels[i]}. Please upload all files or load examples.")
486
+ # Return empty/None for all outputs
487
+ return [None] * len(get_all_output_components())
488
+
489
+ if hasattr(f_obj, 'name') and isinstance(f_obj.name, str):
490
+ file_paths.append(f_obj.name)
491
+ elif isinstance(f_obj, str): # Already a path (from example load)
492
+ file_paths.append(f_obj)
493
+ else:
494
+ gr.Error(f"Invalid file input for {file_labels[i]}. Type: {type(f_obj)}")
495
  return [None] * len(get_all_output_components())
 
 
496
 
497
+ progress(0, desc="Starting Analysis...")
498
+ # This is a placeholder for actual progress tracking if process_files were to support it.
499
+ # For now, it just shows activity.
500
+ # You could break down process_files and update progress more granularly if needed.
501
+ for i in range(1, 6):
502
+ progress(i/5, desc=f"Processing Data Step {i}/5...") # Simulate progress
503
+ # time.sleep(0.2) # if you want to see the progress bar update
504
 
505
  results = process_files(*file_paths)
506
+ progress(1, desc="Analysis Complete!")
507
 
508
+ if "error" in results: # Error handled by process_files with gr.Error
 
 
509
  return [None] * len(get_all_output_components())
510
 
511
  return [
512
+ results.get('summary_plot'),
513
+ # CF Calib
514
+ results.get('cf_total_base_table'), results.get('cf_policy_attrs_total'),
515
+ results.get('cf_cashflow_plot'), results.get('cf_scatter_cashflows_base'),
516
+ results.get('cf_pv_total_base'), results.get('cf_pv_total_lapse'), results.get('cf_pv_total_mort'),
517
+ # Attr Calib
518
  results.get('attr_total_cf_base'), results.get('attr_policy_attrs_total'),
519
+ results.get('attr_cashflow_plot'), results.get('attr_scatter_cashflows_base'), results.get('attr_total_pv_base'),
520
+ # PV Calib
521
+ results.get('pv_total_cf_base'), results.get('pv_policy_attrs_total'),
522
+ results.get('pv_cashflow_plot'), results.get('pv_scatter_pvs_base'),
523
+ results.get('pv_total_pv_base'), results.get('pv_total_pv_lapse'), results.get('pv_total_pv_mort')
524
  ]
525
 
526
  analyze_btn.click(
 
530
  outputs=get_all_output_components()
531
  )
532
 
533
+ # --- Action for Load Example Data Button ---
534
  def load_example_files():
535
+ # Create eg_data directory if it doesn't exist
536
+ if not os.path.exists(EXAMPLE_DATA_DIR):
537
+ os.makedirs(EXAMPLE_DATA_DIR)
538
+ gr.Warning(f"Created directory '{EXAMPLE_DATA_DIR}'. Please place example Excel files there. App will likely fail analysis if files are missing.")
539
+
540
+ missing_files_info = []
541
+ for key, fp in EXAMPLE_FILES.items():
542
+ if not os.path.exists(fp):
543
+ missing_files_info.append(f"'{key}' (expected at '{fp}')")
544
+
545
+ if missing_files_info:
546
+ gr.Error(f"Missing example data files in '{EXAMPLE_DATA_DIR}': {', '.join(missing_files_info)}. Please ensure they exist or upload files manually.")
547
+ return [None] * 7 # Return None for all file inputs
548
+
549
+ gr.Info("Example data paths loaded. Click 'Analyze Dataset'.")
550
+ return [ # Return the paths for the File components
551
+ gr.File(value=EXAMPLE_FILES["cashflow_base"]),
552
+ gr.File(value=EXAMPLE_FILES["cashflow_lapse"]),
553
+ gr.File(value=EXAMPLE_FILES["cashflow_mort"]),
554
+ gr.File(value=EXAMPLE_FILES["policy_data"]),
555
+ gr.File(value=EXAMPLE_FILES["pv_base"]),
556
+ gr.File(value=EXAMPLE_FILES["pv_lapse"]),
557
+ gr.File(value=EXAMPLE_FILES["pv_mort"])
 
 
 
 
 
 
 
 
 
 
 
558
  ]
559
 
560
  load_example_btn.click(
 
567
  return demo
568
 
569
  if __name__ == "__main__":
 
 
570
  if not os.path.exists(EXAMPLE_DATA_DIR):
571
  os.makedirs(EXAMPLE_DATA_DIR)
572
+ print(f"Created directory '{EXAMPLE_DATA_DIR}'. Please place example Excel files there.")
573
+ print(f"Expected files in '{EXAMPLE_DATA_DIR}':")
574
+ for key, path in EXAMPLE_FILES.items():
575
+ print(f" - {key}: {os.path.basename(path)}") # Print just file name for cleaner output
576
 
577
  demo_app = create_interface()
578
  demo_app.launch()