Spaces:

alidenewade
/

model-point-clustering

Sleeping

App Files Files Community

alidenewade committed on May 29, 2025

Commit

4f94e21

verified ·

1 Parent(s): 2f2f1dd

Update app.py

Browse files

Files changed (1) hide show

app.py +355 -328

app.py CHANGED Viewed

@@ -2,11 +2,12 @@ import gradio as gr
 import numpy as np
 import pandas as pd
 from sklearn.cluster import KMeans
-from sklearn.metrics import pairwise_distances_argmin_min # r2_score is not used in the provided snippet.
 import matplotlib.pyplot as plt
-import matplotlib.cm
 import io
-import os # Added for path joining
 from PIL import Image
 # Define the paths for example data
@@ -22,98 +23,98 @@ EXAMPLE_FILES = {
 }
 class Clusters:
-    def __init__(self, loc_vars_df): # Expecting a pandas DataFrame
-        # "Quantisize" by converting input DataFrame to float32 for KMeans.
-        # This reduces precision, potentially speeding up calculations and lowering memory.
-        # Results might have minor numerical differences compared to float64.
-        # Ensure data is a C-contiguous NumPy array.
-        if loc_vars_df.empty:
-            # Handle empty DataFrame case to avoid errors with .values or astype
-            # KMeans would fail anyway, but this prevents issues before that.
-            loc_vars_np_float32 = np.array([], dtype=np.float32).reshape(0, loc_vars_df.shape[1] if loc_vars_df.shape[1] > 0 else 0)
-        else:
-            loc_vars_np_float32 = np.ascontiguousarray(loc_vars_df.astype(np.float32).values)
-        # Initialize KMeans with algorithm="elkan" for potential speedup
-        # and fit on the float32 data.
-        self.kmeans = KMeans(
-            n_clusters=1000,
-            random_state=0,
-            n_init=10,
-            algorithm="elkan"  # Added for speed optimization
-        ).fit(loc_vars_np_float32)
-        # cluster_centers_ will be float32 if fitted on float32 data.
-        # Pass the same float32 NumPy array for distance calculations.
-        closest, _ = pairwise_distances_argmin_min(
-            self.kmeans.cluster_centers_,
-            loc_vars_np_float32
-        )
-        self.rep_ids = pd.Series(data=(closest + 1))  # 0-based to 1-based indexes
-        self.rep_ids.name = 'policy_id'
-        self.rep_ids.index.name = 'cluster_id'
-        # policy_count is based on the number of items in the input data.
-        # Use loc_vars_np_float32.shape[0] which is the number of rows.
-        self.policy_count = self.agg_by_cluster(pd.DataFrame({'policy_count': [1] * loc_vars_np_float32.shape[0]}))['policy_count']
     def agg_by_cluster(self, df, agg=None):
         """Aggregate columns by cluster"""
         temp = df.copy()
-        temp['cluster_id'] = self.kmeans.labels_
         temp = temp.set_index('cluster_id')
-        agg = {c: (agg[c] if agg and c in agg else 'sum') for c in temp.columns} if agg else "sum"
-        return temp.groupby(temp.index).agg(agg)
     def extract_reps(self, df):
         """Extract the rows of representative policies"""
-        # Ensure policy_id in df is of the same type as self.rep_ids if it's not already the index
-        # Typically, df here will have 'policy_id' as its index as per original data.
-        # If df's index is not 'policy_id', ensure 'policy_id' column exists and has compatible type.
-        current_df_index_name = df.index.name
-        # If 'policy_id' is not the index, reset it. Otherwise, use the index.
         if 'policy_id' not in df.columns and df.index.name != 'policy_id':
-            # This case should ideally not happen if inputs are consistent
-            # Forcing index to be named 'policy_id' if it's the policy identifier
-             df_indexed = df.copy()
-             if df_indexed.index.name is None: # Or some other logic to identify the policy_id column
-                 gr.Warning("DataFrame passed to extract_reps has no index name, assuming index is policy_id.")
-                 df_indexed.index.name = 'policy_id'
-             temp = pd.merge(self.rep_ids, df_indexed.reset_index(), how='left', on='policy_id')
-        elif 'policy_id' in df.columns and df.index.name == 'policy_id' and df.index.name in df.columns: # if policy_id is both index and a column
-            temp = pd.merge(self.rep_ids, df, how='left', on='policy_id') # Merge on column if available
-        elif df.index.name == 'policy_id':
-             temp = pd.merge(self.rep_ids, df.reset_index(), how='left', on='policy_id')
-        else: # 'policy_id' is a column, not the index
-             temp = pd.merge(self.rep_ids, df.reset_index(drop=df.index.name is None), how='left', on='policy_id')
-        temp.index.name = 'cluster_id' # The merge result's index is not cluster_id by default
-        temp = temp.set_index(self.rep_ids.index) # Set index to be cluster_id from self.rep_ids
-        return temp.drop('policy_id', axis=1, errors='ignore')
     def extract_and_scale_reps(self, df, agg=None):
         """Extract and scale the rows of representative policies"""
         extracted_df = self.extract_reps(df)
         if agg:
-            cols = extracted_df.columns # Use columns from extracted_df
-            mult = pd.DataFrame({c: (self.policy_count if (c not in agg or agg[c] == 'sum') else 1) for c in cols})
-            mult.index = extracted_df.index # Align index
-            return extracted_df.mul(mult)
         else:
-            return extracted_df.mul(self.policy_count, axis=0)
     def compare(self, df, agg=None):
         """Returns a multi-indexed Dataframe comparing actual and estimate"""
         source = self.agg_by_cluster(df, agg)
         target = self.extract_and_scale_reps(df, agg)
-        return pd.DataFrame({'actual': source.stack(), 'estimate':target.stack()})
     def compare_total(self, df, agg=None):
         """Aggregate df by columns"""
@@ -130,37 +131,35 @@ class Clusters:
             estimate_values = {}
             for col in df.columns: # Iterate over original df columns to ensure all are covered
-                if col not in reps_unscaled.columns: # Column might not be in reps_unscaled if it was dropped or not selected
-                    if agg.get(col, 'sum') == 'mean':
-                        estimate_values[col] = np.nan # Or some other placeholder like 0, or actual.get(col, 0)
-                    else:
-                        estimate_values[col] = 0
-                    gr.Warning(f"Column '{col}' not found in representative policies output for 'compare_total'. Estimate will be 0/NaN.")
                     continue
                 if agg.get(col, 'sum') == 'mean':
-                    weighted_sum = (reps_unscaled[col] * self.policy_count).sum()
-                    total_weight = self.policy_count.sum()
-                    estimate_values[col] = weighted_sum / total_weight if total_weight > 0 else 0
                 else:  # sum
-                    estimate_values[col] = (reps_unscaled[col] * self.policy_count).sum()
             estimate = pd.Series(estimate_values)
-        else:  # Original logic if no agg is specified (all sum)
             actual = df.sum()
             estimate = self.extract_and_scale_reps(df).sum()
-        # Ensure alignment for error calculation
-        actual, estimate = actual.align(estimate, fill_value=0)
-        error = np.where(actual != 0, estimate / actual - 1, 0)
         return pd.DataFrame({'actual': actual, 'estimate': estimate, 'error': error})
-# --- Plotting functions (plot_cashflows_comparison, plot_scatter_comparison) remain unchanged ---
 def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
-    """Create cashflow comparison plots"""
     if not cfs_list or not cluster_obj or not titles:
         return None
     num_plots = len(cfs_list)
@@ -173,20 +172,30 @@ def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
     fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
     axes = axes.flatten()
-    for i, (df, title) in enumerate(zip(cfs_list, titles)):
         if i < len(axes):
-            # Ensure df passed to compare_total is appropriate.
-            # If df has policy_id as index, it matches expectations of downstream functions in Clusters.
-            # If not, ensure policy_id is a column or handle appropriately.
-            if df.index.name != 'policy_id' and 'policy_id' not in df.columns:
-                 gr.Warning(f"DataFrame for plot '{title}' does not have 'policy_id' as index or column. Results may be incorrect.")
-            comparison = cluster_obj.compare_total(df.set_index('policy_id') if 'policy_id' in df.columns and df.index.name != 'policy_id' else df)
-            comparison[['actual', 'estimate']].plot(ax=axes[i], grid=True, title=title)
-            axes[i].set_xlabel('Time')
-            axes[i].set_ylabel('Value')
-    for j in range(i + 1, len(axes)):
         fig.delaxes(axes[j])
     plt.tight_layout()
@@ -198,47 +207,77 @@ def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
     return img
 def plot_scatter_comparison(df_compare_output, title):
-    """Create scatter plot comparison from compare() output"""
     if df_compare_output is None or df_compare_output.empty:
-        fig, ax = plt.subplots(figsize=(12, 8))
         ax.text(0.5, 0.5, "No data to display", ha='center', va='center', fontsize=15)
         ax.set_title(title)
-        buf = io.BytesIO()
-        plt.savefig(buf, format='png', dpi=100)
-        buf.seek(0)
-        img = Image.open(buf)
-        plt.close(fig)
-        return img
-    fig, ax = plt.subplots(figsize=(12, 8))
-    if not isinstance(df_compare_output.index, pd.MultiIndex) or df_compare_output.index.nlevels < 2:
         gr.Warning("Scatter plot data is not in the expected multi-index format. Plotting raw actual vs estimate.")
-        ax.scatter(df_compare_output['actual'], df_compare_output['estimate'], s=9, alpha=0.6)
     else:
-        unique_levels = df_compare_output.index.get_level_values(1).unique()
-        colors = matplotlib.cm.rainbow(np.linspace(0, 1, len(unique_levels)))
-        for item_level, color_val in zip(unique_levels, colors):
-            subset = df_compare_output.xs(item_level, level=1)
-            ax.scatter(subset['actual'], subset['estimate'], color=color_val, s=9, alpha=0.6, label=str(item_level)) # Ensure label is string
-        if len(unique_levels) > 1 and len(unique_levels) <= 20: # Increased legend item limit slightly
-            ax.legend(title=str(df_compare_output.index.names[1]))
     ax.set_xlabel('Actual')
     ax.set_ylabel('Estimate')
-    ax.set_title(title)
-    ax.grid(True)
-    lims = [
-        np.nanmin([ax.get_xlim(), ax.get_ylim()]), # Use nanmin/nanmax
-        np.nanmax([ax.get_xlim(), ax.get_ylim()]),
-    ]
-    if lims[0] != lims[1] and np.isfinite(lims[0]) and np.isfinite(lims[1]): # Check for valid limits
-      ax.plot(lims, lims, 'r-', linewidth=0.5)
-      ax.set_xlim(lims)
-      ax.set_ylim(lims)
     buf = io.BytesIO()
     plt.savefig(buf, format='png', dpi=100)
     buf.seek(0)
@@ -246,56 +285,74 @@ def plot_scatter_comparison(df_compare_output, title):
     plt.close(fig)
     return img
-# --- Main processing function (process_files) ---
-# Ensure DataFrames passed to Clusters methods have 'policy_id' as index if expected.
 def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
                   policy_data_path, pv_base_path, pv_lapse_path, pv_mort_path):
     """Main processing function - now accepts file paths"""
     try:
-        # Consider using engine='calamine' for faster Excel reading if available (pip install pandas[calamine])
-        # e.g., cfs = pd.read_excel(cashflow_base_path, index_col=0, engine='calamine')
-        cfs = pd.read_excel(cashflow_base_path, index_col=0)
-        cfs_lapse50 = pd.read_excel(cashflow_lapse_path, index_col=0)
-        cfs_mort15 = pd.read_excel(cashflow_mort_path, index_col=0)
-        pol_data_full = pd.read_excel(policy_data_path, index_col=0)
         required_cols = ['age_at_entry', 'policy_term', 'sum_assured', 'duration_mth']
-        # Ensure index is named 'policy_id' if it's not already named, assuming index is the policy identifier
-        for df in [cfs, cfs_lapse50, cfs_mort15, pol_data_full]:
-            if df.index.name is None:
-                df.index.name = 'policy_id'
-            if 'policy_id' not in df.columns and df.index.name == 'policy_id': # Add policy_id as column if its only an index
-                df.reset_index(inplace=True) # this makes policy_id a column
-                df.set_index('policy_id', inplace=True) # and keeps it as index
-        if all(col in pol_data_full.columns or col == pol_data_full.index.name for col in required_cols):
-            # If policy_id is index, it won't be in columns. Adjust selection.
-            cols_to_select = [col for col in required_cols if col in pol_data_full.columns]
-            if pol_data_full.index.name in required_cols and pol_data_full.index.name not in cols_to_select:
-                 # This case is tricky; if an ID is part of required_cols and is the index.
-                 # For simplicity, assume required_cols are actual data columns.
-                 pass # Let it proceed, it might be handled by selection or error later.
-            pol_data = pol_data_full[cols_to_select].copy() # Use .copy() to avoid SettingWithCopyWarning
-            # If 'policy_id' was the index and required, it's implicitly handled or needs specific logic.
-            # For K-Means, policy_id itself is usually not a feature.
         else:
-            missing_req_cols = [col for col in required_cols if col not in pol_data_full.columns and col != pol_data_full.index.name]
-            gr.Warning(f"Policy data might be missing required columns: {missing_req_cols}. Found: {pol_data_full.columns.tolist()}")
-            pol_data = pol_data_full # Fallback, but ensure it's numeric for clustering/scaling
-        pvs = pd.read_excel(pv_base_path, index_col=0)
-        pvs_lapse50 = pd.read_excel(pv_lapse_path, index_col=0)
-        pvs_mort15 = pd.read_excel(pv_mort_path, index_col=0)
-        for df in [pvs, pvs_lapse50, pvs_mort15]:
-            if df.index.name is None:
-                df.index.name = 'policy_id'
-            if 'policy_id' not in df.columns and df.index.name == 'policy_id':
-                df.reset_index(inplace=True)
-                df.set_index('policy_id', inplace=True)
         cfs_list = [cfs, cfs_lapse50, cfs_mort15]
         scen_titles = ['Base', 'Lapse+50%', 'Mort+15%']
@@ -303,44 +360,44 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
         mean_attrs = {'age_at_entry':'mean', 'policy_term':'mean', 'duration_mth':'mean', 'sum_assured': 'sum'}
-        # DataFrames passed to Clusters should be policy_id indexed for .values to exclude it.
-        # Or, select only feature columns before passing.
-        # The Clusters class now expects a DataFrame and will use .values, so pass only feature columns.
-        # If index is policy_id, df.values will not include it. This is good.
         # --- 1. Cashflow Calibration ---
-        # Ensure 'cfs' DataFrame does not include 'policy_id' when .values is called in Clusters
-        cluster_cfs = Clusters(cfs.reset_index().set_index('policy_id')) # Pass with policy_id as index
         results['cf_total_base_table'] = cluster_cfs.compare_total(cfs)
         results['cf_policy_attrs_total'] = cluster_cfs.compare_total(pol_data, agg=mean_attrs)
         results['cf_pv_total_base'] = cluster_cfs.compare_total(pvs)
         results['cf_pv_total_lapse'] = cluster_cfs.compare_total(pvs_lapse50)
         results['cf_pv_total_mort'] = cluster_cfs.compare_total(pvs_mort15)
         results['cf_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_cfs, scen_titles)
         results['cf_scatter_cashflows_base'] = plot_scatter_comparison(cluster_cfs.compare(cfs), 'Cashflow Calib. - Cashflows (Base)')
         # --- 2. Policy Attribute Calibration ---
-        loc_vars_attrs = pd.DataFrame() # Initialize
-        if not pol_data.empty:
-            # Ensure pol_data is purely numeric for scaling and KMeans
-            numeric_pol_data = pol_data.select_dtypes(include=np.number)
-            if not numeric_pol_data.empty and not (numeric_pol_data.max(numeric_only=True) - numeric_pol_data.min(numeric_only=True) == 0).all():
-                loc_vars_attrs = (numeric_pol_data - numeric_pol_data.min(numeric_only=True)) / \
-                                 (numeric_pol_data.max(numeric_only=True) - numeric_pol_data.min(numeric_only=True))
-                loc_vars_attrs.index = numeric_pol_data.index # Preserve index
             else:
-                gr.Warning("Policy data for attribute calibration is empty, non-numeric, or has no variance. Skipping attribute calibration content.")
-                loc_vars_attrs = numeric_pol_data # or an empty DataFrame with original index
         else:
-            gr.Warning("Policy data is empty. Skipping attribute calibration content.")
-        if not loc_vars_attrs.empty:
-            cluster_attrs = Clusters(loc_vars_attrs.reset_index().set_index('policy_id')) # Pass with policy_id as index
             results['attr_total_cf_base'] = cluster_attrs.compare_total(cfs)
-            results['attr_policy_attrs_total'] = cluster_attrs.compare_total(pol_data, agg=mean_attrs)
             results['attr_total_pv_base'] = cluster_attrs.compare_total(pvs)
             results['attr_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_attrs, scen_titles)
             results['attr_scatter_cashflows_base'] = plot_scatter_comparison(cluster_attrs.compare(cfs), 'Policy Attr. Calib. - Cashflows (Base)')
@@ -348,41 +405,39 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
             results['attr_total_cf_base'] = pd.DataFrame()
             results['attr_policy_attrs_total'] = pd.DataFrame()
             results['attr_total_pv_base'] = pd.DataFrame()
-            results['attr_cashflow_plot'] = plot_scatter_comparison(pd.DataFrame(), 'Policy Attr. Calib. - Cashflows (Base) - No Data') # Empty plot
-            results['attr_scatter_cashflows_base'] = plot_scatter_comparison(pd.DataFrame(), 'Policy Attr. Calib. - Cashflows (Base) - No Data')
         # --- 3. Present Value Calibration ---
-        cluster_pvs = Clusters(pvs.reset_index().set_index('policy_id')) # Pass with policy_id as index
         results['pv_total_cf_base'] = cluster_pvs.compare_total(cfs)
         results['pv_policy_attrs_total'] = cluster_pvs.compare_total(pol_data, agg=mean_attrs)
         results['pv_total_pv_base'] = cluster_pvs.compare_total(pvs)
         results['pv_total_pv_lapse'] = cluster_pvs.compare_total(pvs_lapse50)
         results['pv_total_pv_mort'] = cluster_pvs.compare_total(pvs_mort15)
         results['pv_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_pvs, scen_titles)
         results['pv_scatter_pvs_base'] = plot_scatter_comparison(cluster_pvs.compare(pvs), 'PV Calib. - PVs (Base)')
         # --- Summary Comparison Plot Data ---
         error_data = {}
-        def get_error_safe(compare_result, col_name=None):
-            if compare_result is None or compare_result.empty or 'error' not in compare_result.columns: # Check if None
                 return np.nan
-            if col_name and col_name in compare_result.index:
-                return abs(compare_result.loc[col_name, 'error'])
-            else:
-                return abs(compare_result['error']).mean()
         key_pv_col = None
-        # Use pvs.columns (which should be only feature columns after reset_index().set_index())
-        # Or, use the original pvs DataFrame if it's guaranteed to have the PV_NetCF column.
-        # For safety, check in the original pvs DataFrame which has not been stripped of columns.
-        original_pvs_cols = pd.read_excel(pv_base_path).columns # Quick read just for columns
         for potential_col in ['PV_NetCF', 'pv_net_cf', 'net_cf_pv', 'PV_Net_CF']:
-            if potential_col in original_pvs_cols: # Check against original columns
                 key_pv_col = potential_col
                 break
@@ -392,11 +447,12 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
             get_error_safe(results.get('cf_pv_total_mort'), key_pv_col)
         ]
-        if not loc_vars_attrs.empty:
-            error_data['Attr Calib.'] = [
-                get_error_safe(results.get('attr_total_pv_base'), key_pv_col), # This was pvs, should be fine
-                get_error_safe(cluster_attrs.compare_total(pvs_lapse50), key_pv_col), # Re-calculate for pvs_lapse50
-                get_error_safe(cluster_attrs.compare_total(pvs_mort15), key_pv_col)  # Re-calculate for pvs_mort15
             ]
         else:
             error_data['Attr Calib.'] = [np.nan, np.nan, np.nan]
@@ -407,17 +463,26 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
             get_error_safe(results.get('pv_total_pv_mort'), key_pv_col)
         ]
-        summary_df = pd.DataFrame(error_data, index=['Base', 'Lapse+50%', 'Mort+15%'])
         fig_summary, ax_summary = plt.subplots(figsize=(10, 6))
-        summary_df.plot(kind='bar', ax=ax_summary, grid=True)
         ax_summary.set_ylabel('Absolute Error Rate')
-        title_suffix = f' ({key_pv_col})' if key_pv_col else ' (Mean Absolute Error)'
         ax_summary.set_title(f'Calibration Method Comparison - Error in Total PV{title_suffix}')
         ax_summary.tick_params(axis='x', rotation=0)
-        ax_summary.legend(title='Calibration Method')
         plt.tight_layout()
         buf_summary = io.BytesIO()
         plt.savefig(buf_summary, format='png', dpi=100)
         buf_summary.seek(0)
@@ -430,15 +495,22 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
         gr.Error(f"File not found: {e.filename}. Please ensure example files are in '{EXAMPLE_DATA_DIR}' or all files are uploaded.")
         return {"error": f"File not found: {e.filename}"}
     except KeyError as e:
-        # Check if the KeyError is from trying to access a column that became an index
-        gr.Error(f"A required column or index is missing or misnamed: {e}. Please check data format and ensure 'policy_id' is correctly handled as index for feature dataframes.")
         return {"error": f"Missing column/index: {e}"}
     except Exception as e:
         import traceback
-        gr.Error(f"Error processing files: {str(e)}. Trace: {traceback.format_exc()}")
-        return {"error": f"Error processing files: {str(e)}"}
-# --- Gradio interface creation (create_interface, etc.) remains unchanged ---
 def create_interface():
     with gr.Blocks(title="Cluster Model Points Analysis") as demo:
         gr.Markdown("""
@@ -448,15 +520,13 @@ def create_interface():
         Upload your Excel files or use the example data to analyze cashflows, policy attributes, and present values using different calibration methods.
         **Required Files (Excel .xlsx):**
-        - Cashflows - Base Scenario (index = policy_id, columns = time periods)
-        - Cashflows - Lapse Stress (+50%) (index = policy_id)
-        - Cashflows - Mortality Stress (+15%) (index = policy_id)
-        - Policy Data (index = policy_id, including 'age_at_entry', 'policy_term', 'sum_assured', 'duration_mth' as columns)
-        - Present Values - Base Scenario (index = policy_id, columns = PV components like 'PV_NetCF')
-        - Present Values - Lapse Stress (index = policy_id)
-        - Present Values - Mortality Stress (index = policy_id)
-        *Note: Ensure 'policy_id' is the index for all input files for correct processing.*
         """)
         with gr.Row():
@@ -503,11 +573,7 @@ def create_interface():
                 attr_cashflow_plot_out = gr.Image(label="Cashflow Value Comparisons (Actual vs. Estimate) Across Scenarios")
                 attr_scatter_cashflows_base_out = gr.Image(label="Scatter Plot - Per-Cluster Cashflows (Base Scenario)")
                 with gr.Accordion("Present Value Comparisons (Total)", open=False):
-                    with gr.Row(): # Changed to Row for consistency
-                        attr_total_pv_base_out = gr.Dataframe(label="PVs - Base Scenario Total")
-                        # Added placeholders for other scenarios if they were intended
-                        # attr_total_pv_lapse_out = gr.Dataframe(label="PVs - Lapse Stress Total")
-                        # attr_total_pv_mort_out = gr.Dataframe(label="PVs - Mortality Stress Total")
             with gr.TabItem("💰 Present Value Calibration"):
                 gr.Markdown("### Results: Using Present Values (Base Scenario) as Calibration Variables")
@@ -522,62 +588,46 @@ def create_interface():
                         pv_total_pv_lapse_out = gr.Dataframe(label="PVs - Lapse Stress Total")
                         pv_total_pv_mort_out = gr.Dataframe(label="PVs - Mortality Stress Total")
-        # --- Helper function to prepare outputs ---
         def get_all_output_components():
             return [
                 summary_plot_output,
-                # Cashflow Calib Outputs
                 cf_total_base_table_out, cf_policy_attrs_total_out,
                 cf_cashflow_plot_out, cf_scatter_cashflows_base_out,
                 cf_pv_total_base_out, cf_pv_total_lapse_out, cf_pv_total_mort_out,
-                # Attribute Calib Outputs
                 attr_total_cf_base_out, attr_policy_attrs_total_out,
                 attr_cashflow_plot_out, attr_scatter_cashflows_base_out, attr_total_pv_base_out,
-                # PV Calib Outputs
                 pv_total_cf_base_out, pv_policy_attrs_total_out,
                 pv_cashflow_plot_out, pv_scatter_pvs_base_out,
                 pv_total_pv_base_out, pv_total_pv_lapse_out, pv_total_pv_mort_out
             ]
-        # --- Action for Analyze Button ---
         def handle_analysis(f1, f2, f3, f4, f5, f6, f7):
             files = [f1, f2, f3, f4, f5, f6, f7]
             file_paths = []
-            # Check if any FileData object is None (no file uploaded for a slot)
-            if any(f_obj is None for f_obj in files):
-                # Attempt to load from EXAMPLE_FILES if any input is missing
-                # This logic might be complex if mixing examples and uploads.
-                # For now, strict: all files must be present.
-                gr.Error("Missing file input for one or more fields. Please upload all required files or load the complete example dataset.")
-                return [None] * len(get_all_output_components())
-            for i, f_obj in enumerate(files):
-                # f_obj is TempFilePath (older Gradio) or FileData (newer) or str (from example load)
-                if hasattr(f_obj, 'name') and isinstance(f_obj.name, str): # Gradio FileData or similar
-                    file_paths.append(f_obj.name)
-                elif isinstance(f_obj, str): # Path from example load
-                    file_paths.append(f_obj)
-                else: # Should not happen if inputs are Files or paths
-                    gr.Error(f"Invalid file input for argument {i+1}. Type: {type(f_obj)}")
                     return [None] * len(get_all_output_components())
             results = process_files(*file_paths)
-            if "error" in results : # Check if process_files returned an error dict
-                # Error already shown by gr.Error in process_files
-                return [None] * len(get_all_output_components())
             return [
                 results.get('summary_plot'),
-                # CF Calib
                 results.get('cf_total_base_table'), results.get('cf_policy_attrs_total'),
                 results.get('cf_cashflow_plot'), results.get('cf_scatter_cashflows_base'),
                 results.get('cf_pv_total_base'), results.get('cf_pv_total_lapse'), results.get('cf_pv_total_mort'),
-                # Attr Calib
                 results.get('attr_total_cf_base'), results.get('attr_policy_attrs_total'),
                 results.get('attr_cashflow_plot'), results.get('attr_scatter_cashflows_base'), results.get('attr_total_pv_base'),
-                # PV Calib
                 results.get('pv_total_cf_base'), results.get('pv_policy_attrs_total'),
                 results.get('pv_cashflow_plot'), results.get('pv_scatter_pvs_base'),
                 results.get('pv_total_pv_base'), results.get('pv_total_pv_lapse'), results.get('pv_total_pv_mort')
@@ -590,50 +640,50 @@ def create_interface():
             outputs=get_all_output_components()
         )
-        # --- Action for Load Example Data Button ---
         def load_example_files():
-            # Create dummy example files if they don't exist for demonstration if needed
-            # For this exercise, we assume they exist or user is warned.
-            os.makedirs(EXAMPLE_DATA_DIR, exist_ok=True) # Ensure dir exists
-            missing_files = []
             for key, fp in EXAMPLE_FILES.items():
                 if not os.path.exists(fp):
-                    missing_files.append(fp)
-                    # Create a minimal dummy Excel file if it's missing
                     try:
-                        dummy_df_data = {'policy_id': [1,2,3], 'col1': [0.1,0.2,0.3], 'col2':[10,20,30]}
-                        if "cashflow" in key or "pv" in key: # Time series like
-                            dummy_df_data = {'policy_id': [1,2,3], '0': [1,2,3], '1': [4,5,6]}
                         elif "policy_data" in key:
-                             dummy_df_data = {'policy_id': [1,2,3], 'age_at_entry': [20,30,40], 'policy_term': [10,20,15],
-                                              'sum_assured': [1000,2000,1500], 'duration_mth': [5,10,7]}
-                        dummy_df = pd.DataFrame(dummy_df_data).set_index('policy_id')
-                        dummy_df.to_excel(fp)
-                        gr.Warning(f"Example file '{fp}' was missing and a dummy file has been created. Results may not be meaningful.")
                     except Exception as e:
-                        gr.Warning(f"Could not create dummy file for {fp}: {e}")
-            if missing_files and not all(os.path.exists(fp) for fp in EXAMPLE_FILES.values()): # Re-check after dummy creation attempt
-                 # If still missing after trying to create dummies
-                gr.Error(f"Critical example data files are missing from '{EXAMPLE_DATA_DIR}': {', '.join(missing_files)}. Please ensure they exist or check permissions.")
-                return [None] * 7 # Return None for all file inputs
             gr.Info("Example data paths loaded. Click 'Analyze Dataset'.")
-            # Return the string paths for the file components
             return [
-                gr.File(value=EXAMPLE_FILES["cashflow_base"], Labeled_input=cashflow_base_input.label),
-                gr.File(value=EXAMPLE_FILES["cashflow_lapse"], Labeled_input=cashflow_lapse_input.label),
-                gr.File(value=EXAMPLE_FILES["cashflow_mort"], Labeled_input=cashflow_mort_input.label),
-                gr.File(value=EXAMPLE_FILES["policy_data"], Labeled_input=policy_data_input.label),
-                gr.File(value=EXAMPLE_FILES["pv_base"], Labeled_input=pv_base_input.label),
-                gr.File(value=EXAMPLE_FILES["pv_lapse"], Labeled_input=pv_lapse_input.label),
-                gr.File(value=EXAMPLE_FILES["pv_mort"], Labeled_input=pv_mort_input.label)
             ]
         load_example_btn.click(
             load_example_files,
             inputs=[],
@@ -646,30 +696,7 @@ def create_interface():
 if __name__ == "__main__":
     # Script entry point. Create the example-data directory when it is missing.
     if not os.path.exists(EXAMPLE_DATA_DIR):
         os.makedirs(EXAMPLE_DATA_DIR)
-        print(f"Created directory '{EXAMPLE_DATA_DIR}'. Please place example Excel files there or they will be generated as dummies.")
-    # Simple check and dummy file creation for example data if not present
-    for key, fp in EXAMPLE_FILES.items():
-        if not os.path.exists(fp):
-            print(f"Example file {fp} not found. Attempting to create a dummy file.")
-            try:
-                dummy_df_data = {'policy_id': [1,2,3], 'col1': [0.1,0.2,0.3], 'col2':[10,20,30]}
-                if "cashflow" in key or "pv" in key:
-                    dummy_df_data = {f'{i}':np.random.rand(3) for i in range(10)} # 10 time periods
-                    dummy_df_data['policy_id'] = [f'P{j}' for j in range(3)]
-                elif "policy_data" in key:
-                    dummy_df_data = {'policy_id': [f'P{j}' for j in range(3)],
-                                     'age_at_entry': np.random.randint(20, 50, 3),
-                                     'policy_term': np.random.randint(10, 30, 3),
-                                     'sum_assured': np.random.randint(10000, 50000, 3),
-                                     'duration_mth': np.random.randint(1, 120, 3)}
-                dummy_df = pd.DataFrame(dummy_df_data).set_index('policy_id')
-                dummy_df.to_excel(fp)
-                print(f"Dummy file for '{fp}' created.")
-            except Exception as e:
-                print(f"Could not create dummy file for {fp}: {e}")
     # Build the interface and launch it.
     demo_app = create_interface()
     demo_app.launch()

 import numpy as np
 import pandas as pd
 from sklearn.cluster import KMeans
+from sklearn.metrics import pairwise_distances_argmin_min, r2_score # r2_score is not used but kept from original
 import matplotlib.pyplot as plt
+# import matplotlib.cm # No longer explicitly needed for rainbow
+import seaborn as sns # Added Seaborn
 import io
+import os
 from PIL import Image
 # Define the paths for example data
 }
 class Clusters:
+    def __init__(self, loc_vars, n_clusters=1000):
+        """Fit KMeans on the calibration variables and pick one representative per cluster.
+
+        Args:
+            loc_vars: Calibration variables, one row per policy, as a DataFrame
+                or anything ``np.ascontiguousarray`` accepts.
+            n_clusters: Maximum number of clusters. It is capped at the number
+                of rows because KMeans rejects ``n_clusters > n_samples``
+                (e.g. small example or dummy data sets).
+        """
+        if isinstance(loc_vars, pd.DataFrame):
+            loc_vars_np = np.ascontiguousarray(loc_vars.values)
+        else:
+            loc_vars_np = np.ascontiguousarray(loc_vars)
+        n_samples = len(loc_vars_np)
+        self.kmeans = KMeans(
+            n_clusters=max(1, min(n_clusters, n_samples)),
+            random_state=0,
+            n_init=10,
+        ).fit(loc_vars_np)
+        # Row position of the sample closest to each cluster centre.
+        closest, _ = pairwise_distances_argmin_min(self.kmeans.cluster_centers_, loc_vars_np)
+        if isinstance(loc_vars, pd.DataFrame) and loc_vars.index.name == 'policy_id':
+            # Use the real ids so that extract_reps can merge on 'policy_id'
+            # even when the ids are not the consecutive integers 1..N.
+            rep_ids = pd.Series(data=loc_vars.index.to_numpy()[closest])
+        else:
+            # No id information available: fall back to 1-based row numbers.
+            rep_ids = pd.Series(data=(closest + 1))
+        rep_ids.name = 'policy_id'
+        rep_ids.index.name = 'cluster_id'  # 0-based cluster number
+        self.rep_ids = rep_ids
+        # Number of policies assigned to each cluster.
+        self.policy_count = self.agg_by_cluster(pd.DataFrame({'policy_count': [1] * n_samples}))['policy_count']
     def agg_by_cluster(self, df, agg=None):
+        """Group the rows of ``df`` by cluster label and aggregate every column.
+
+        ``agg`` optionally maps column names to aggregation names; columns it
+        does not mention, and all columns when ``agg`` is empty or None, are
+        summed. Rows are matched to ``self.kmeans.labels_`` by position.
+        """
+        labelled = df.copy()
+        labelled['cluster_id'] = self.kmeans.labels_
+        labelled = labelled.set_index('cluster_id')
+        if not agg:
+            return labelled.groupby(level='cluster_id').agg("sum")
+        how = {}
+        for column in labelled.columns:
+            if column != 'cluster_id':
+                how[column] = agg[column] if column in agg else 'sum'
+        if not how:
+            # Nothing to aggregate: hand back an empty frame keyed by cluster.
+            return pd.DataFrame(index=labelled.index.unique())
+        return labelled.groupby(level='cluster_id').agg(how)
     def extract_reps(self, df):
+        """Return the rows of ``df`` that belong to each cluster's representative policy.
+
+        ``df`` must carry 'policy_id' either as its index name or as a column.
+        The result is indexed by 'cluster_id' and no longer contains 'policy_id'.
+
+        Raises:
+            ValueError: if 'policy_id' is neither the index name nor a column.
+        """
+        id_is_index = df.index.name == 'policy_id'
+        if not id_is_index and 'policy_id' not in df.columns:
+            raise ValueError("DataFrame for extract_reps must have 'policy_id' as index or column.")
+        # From here on 'policy_id' is always an ordinary column.
+        candidates = df.reset_index() if id_is_index else df.copy()
+        reps = self.rep_ids.reset_index().merge(candidates, how='left', on='policy_id')
+        reps = reps.set_index('cluster_id')
+        return reps.drop(columns=['policy_id'], errors='ignore')
     def extract_and_scale_reps(self, df, agg=None):
+        """Extract the representative rows and scale them up to cluster totals.
+
+        With ``agg``, every column of ``df`` that is aggregated by 'sum' (the
+        default for columns ``agg`` does not mention) is multiplied by the
+        number of policies in its cluster; other columns are returned
+        unscaled. Without ``agg``, every numeric column is scaled.
+        """
         extracted_df = self.extract_reps(df)
+        result_df = extracted_df.copy()
         if agg:
+            # Scale only 'sum' columns. Building a DataFrame of multipliers
+            # fails with "If using all scalar values, you must pass an index"
+            # when no column is summed, so multiply column by column instead.
+            cols_to_scale = [
+                col for col in df.columns
+                if col in extracted_df.columns and (col not in agg or agg[col] == 'sum')
+            ]
         else:
+            cols_to_scale = extracted_df.select_dtypes(include=np.number).columns
+        for col in cols_to_scale:
+            # Both operands are indexed by cluster_id, so this aligns per cluster.
+            result_df[col] = extracted_df[col].mul(self.policy_count)
+        return result_df
     def compare(self, df, agg=None):
+        """Pair per-cluster actual values with their representative-based estimates.
+
+        Returns a DataFrame with 'actual' and 'estimate' columns whose rows are
+        keyed by (cluster_id, column name); only columns present on both sides
+        are compared.
+        """
+        actual = self.agg_by_cluster(df, agg)
+        estimate = self.extract_and_scale_reps(df, agg)
+        shared = actual.columns.intersection(estimate.columns)
+        return pd.DataFrame({
+            'actual': actual[shared].stack(),
+            'estimate': estimate[shared].stack(),
+        })
     def compare_total(self, df, agg=None):
         """Aggregate df by columns"""
             estimate_values = {}
             for col in df.columns: # Iterate over original df columns to ensure all are covered
+                if col not in reps_unscaled.columns:
+                    estimate_values[col] = np.nan # Column not in representative policies
                     continue
                 if agg.get(col, 'sum') == 'mean':
+                    if self.policy_count.sum() > 0:
+                        weighted_sum = (reps_unscaled[col].astype(float) * self.policy_count.astype(float)).sum()
+                        total_weight = self.policy_count.sum()
+                        estimate_values[col] = weighted_sum / total_weight
+                    else:
+                        estimate_values[col] = np.nan # Avoid division by zero
                 else:  # sum
+                    estimate_values[col] = (reps_unscaled[col].astype(float) * self.policy_count.astype(float)).sum()
             estimate = pd.Series(estimate_values)
+        else:
             actual = df.sum()
             estimate = self.extract_and_scale_reps(df).sum()
+        actual, estimate = actual.align(estimate, fill_value=0) # Align before calculating error
+        error = np.where(actual != 0, (estimate / actual) - 1, 0) # estimate/actual can be NaN if actual is 0
+        error = np.nan_to_num(error, nan=0.0) # Replace NaNs from 0/0 with 0
         return pd.DataFrame({'actual': actual, 'estimate': estimate, 'error': error})
+# Plotting Functions (Modified for Seaborn)
+# ---
 def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
+    """Create cashflow comparison plots using Seaborn"""
+    sns.set_style("whitegrid") # Apply Seaborn styling
     if not cfs_list or not cluster_obj or not titles:
         return None
     num_plots = len(cfs_list)
     fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
     axes = axes.flatten()
+    for i, (df_orig, title) in enumerate(zip(cfs_list, titles)):
         if i < len(axes):
+            ax = axes[i]
+            # Assuming df_orig has policy_id as index, or it's handled before compare_total
+            comparison_df = cluster_obj.compare_total(df_orig)
+            # Prepare data for Seaborn lineplot (long format)
+            plot_data = comparison_df[['actual', 'estimate']].copy()
+            # Assuming the index of comparison_df represents 'Time'
+            plot_data['Time'] = plot_data.index.astype(str) # Ensure Time is string for categorical plotting if not truly numeric
+            try: # If Time can be numeric, use it as such.
+                plot_data['Time'] = pd.to_numeric(plot_data['Time'])
+            except ValueError:
+                pass # Keep as string if not convertible
+            plot_data_melted = plot_data.melt(id_vars='Time', var_name='Legend', value_name='Value')
+            sns.lineplot(x='Time', y='Value', hue='Legend', data=plot_data_melted, ax=ax, errorbar=None)
+            ax.set_title(title)
+            ax.set_xlabel('Time')
+            ax.set_ylabel('Value')
+            # ax.grid(True) # whitegrid style includes a grid
+    for j in range(i + 1, len(axes)): # Hide any unused subplots
         fig.delaxes(axes[j])
     plt.tight_layout()
     return img
 def plot_scatter_comparison(df_compare_output, title):
+    """Create scatter plot comparison from compare() output using Seaborn"""
+    sns.set_style("whitegrid")
+    fig, ax = plt.subplots(figsize=(12, 8))
     if df_compare_output is None or df_compare_output.empty:
         ax.text(0.5, 0.5, "No data to display", ha='center', va='center', fontsize=15)
         ax.set_title(title)
+    elif not isinstance(df_compare_output.index, pd.MultiIndex) or df_compare_output.index.nlevels < 2:
         gr.Warning("Scatter plot data is not in the expected multi-index format. Plotting raw actual vs estimate.")
+        sns.scatterplot(x='actual', y='estimate', data=df_compare_output, s=25, alpha=0.7, ax=ax, legend=False)
+        ax.set_title(title)
     else:
+        plot_data = df_compare_output.reset_index()
+        hue_col_name = df_compare_output.index.names[1]
+        # Ensure the hue column is treated as categorical by converting to string
+        plot_data[hue_col_name] = plot_data[hue_col_name].astype(str)
+        unique_levels = plot_data[hue_col_name].nunique()
+        show_legend_flag = "auto"
+        if unique_levels == 1:
+            show_legend_flag = False
+        elif unique_levels > 10:
+            show_legend_flag = False
+            gr.Warning(f"Warning: Too many unique values ({unique_levels}) in '{hue_col_name}' for scatter plot legend. Legend hidden.")
+        sns.scatterplot(x='actual', y='estimate', hue=hue_col_name, data=plot_data,
+                        s=25, alpha=0.7, ax=ax, legend=show_legend_flag)
+        ax.set_title(title)
+        if show_legend_flag == True and ax.get_legend() is not None:
+             ax.get_legend().set_title(str(hue_col_name))
+        elif show_legend_flag == "auto" and ax.get_legend() is not None: # Seaborn decided to show it
+             ax.get_legend().set_title(str(hue_col_name))
     ax.set_xlabel('Actual')
     ax.set_ylabel('Estimate')
+    # ax.grid(True) # whitegrid includes it
+    # Draw identity line
+    # Must draw after scatterplot to get correct limits
+    # Delay lims calculation until after plot, ensure data exists
+    if not (df_compare_output is None or df_compare_output.empty):
+        # Get limits from data if axes limits are too wide or default
+        # This ensures the identity line is relevant to the plotted data
+        all_values = pd.concat([plot_data['actual'], plot_data['estimate']]).dropna() if 'plot_data' in locals() else \
+                     pd.concat([df_compare_output['actual'], df_compare_output['estimate']]).dropna()
+        if not all_values.empty:
+            min_val = all_values.min()
+            max_val = all_values.max()
+            # Use current axis limits if they are tighter than data range (e.g., user zoomed)
+            # But if they are default (-0.05 to 0.05 for empty data), use data range.
+            ax_xlims = ax.get_xlim()
+            ax_ylims = ax.get_ylim()
+            plot_min = np.nanmin([min_val, ax_xlims[0], ax_ylims[0]])
+            plot_max = np.nanmax([max_val, ax_xlims[1], ax_ylims[1]])
+            # Handle cases where min and max might be too close or NaN
+            if np.isfinite(plot_min) and np.isfinite(plot_max) and plot_min < plot_max:
+                ax.plot([plot_min, plot_max], [plot_min, plot_max], 'r-', linewidth=0.7, alpha=0.8, zorder=0)
+                ax.set_xlim(plot_min, plot_max)
+                ax.set_ylim(plot_min, plot_max)
+            elif np.isfinite(plot_min) and np.isfinite(plot_max) and plot_min == plot_max: # Single point
+                margin = abs(plot_min * 0.1) if plot_min != 0 else 0.1
+                ax.plot([plot_min], [plot_min], 'ro') # Mark the point
+                ax.set_xlim(plot_min - margin, plot_min + margin)
+                ax.set_ylim(plot_min - margin, plot_min + margin)
     buf = io.BytesIO()
     plt.savefig(buf, format='png', dpi=100)
     buf.seek(0)
     plt.close(fig)
     return img
+# Main Processing and Gradio UI (Largely Unchanged)
+# ---
 def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
                   policy_data_path, pv_base_path, pv_lapse_path, pv_mort_path):
     """Main processing function - now accepts file paths"""
     try:
+        # Ensure 'policy_id' is the index for dataframes used in clustering/comparison
+        def read_and_prep_excel(path, set_policy_id_index=True):
+            """Read an Excel file and make sure it exposes a 'policy_id' column.
+
+            When the sheet has no 'policy_id' column, a warning is shown and the
+            first column is used as the policy id.
+
+            Args:
+                path: Path of the Excel file to read.
+                set_policy_id_index: When True, return the frame indexed by
+                    'policy_id'; otherwise keep 'policy_id' as a regular column.
+
+            Raises:
+                ValueError: if the sheet has no columns at all.
+            """
+            df = pd.read_excel(path)
+            if 'policy_id' not in df.columns:
+                if df.columns.empty:
+                    # Raise so the caller's error handling reports it;
+                    # constructing gr.Error without raising shows nothing.
+                    raise ValueError(f"Cannot reliably find 'policy_id' in {os.path.basename(path)}.")
+                gr.Warning(f"'policy_id' column not found in {os.path.basename(path)}. Attempting to use first column or existing index.")
+                # Covers both a differently spelled id header and an unnamed
+                # first column: the first column becomes 'policy_id'.
+                df = df.rename(columns={df.columns[0]: 'policy_id'})
+            if set_policy_id_index:
+                return df.set_index('policy_id')
+            return df
+        cfs = read_and_prep_excel(cashflow_base_path).select_dtypes(include=np.number)
+        cfs_lapse50 = read_and_prep_excel(cashflow_lapse_path).select_dtypes(include=np.number)
+        cfs_mort15 = read_and_prep_excel(cashflow_mort_path).select_dtypes(include=np.number)
+        pol_data_full_raw = read_and_prep_excel(policy_data_path, set_policy_id_index=False)
+        # Ensure the correct columns are selected for pol_data
         required_cols = ['age_at_entry', 'policy_term', 'sum_assured', 'duration_mth']
+        # Check if required_cols exist, case-insensitively, and normalize names
+        rename_map = {}
+        available_cols_lower = {col.lower(): col for col in pol_data_full_raw.columns}
+        for req_col in required_cols:
+            if req_col.lower() in available_cols_lower:
+                rename_map[available_cols_lower[req_col.lower()]] = req_col # Map original to standardized
+        pol_data_full_renamed = pol_data_full_raw.rename(columns=rename_map)
+        if all(col in pol_data_full_renamed.columns for col in required_cols):
+            pol_data = pol_data_full_renamed.set_index('policy_id')[required_cols].select_dtypes(include=np.number)
         else:
+            missing = [col for col in required_cols if col not in pol_data_full_renamed.columns]
+            gr.Warning(f"Policy data might be missing required columns: {missing}. Found: {pol_data_full_renamed.columns.tolist()}")
+            # Fallback: use all numeric columns if required are missing, set policy_id as index
+            pol_data = pol_data_full_renamed.set_index('policy_id').select_dtypes(include=np.number)
+            if pol_data.empty and not pol_data_full_renamed.select_dtypes(include=np.number).empty:
+                 gr.Warning("Policy data became empty after trying to select numeric types with policy_id index. Check input.")
+        pvs = read_and_prep_excel(pv_base_path).select_dtypes(include=np.number)
+        pvs_lapse50 = read_and_prep_excel(pv_lapse_path).select_dtypes(include=np.number)
+        pvs_mort15 = read_and_prep_excel(pv_mort_path).select_dtypes(include=np.number)
+        # DataFrames for Clusters class should not include the policy_id if it's an index
+        # The class constructor expects features only (typically a DataFrame where .values gives numeric data)
+        # The current read_and_prep_excel sets policy_id as index. This is fine.
+        # KMeans will be called on df.values implicitly.
         cfs_list = [cfs, cfs_lapse50, cfs_mort15]
         scen_titles = ['Base', 'Lapse+50%', 'Mort+15%']
         mean_attrs = {'age_at_entry':'mean', 'policy_term':'mean', 'duration_mth':'mean', 'sum_assured': 'sum'}
         # --- 1. Cashflow Calibration ---
+        # Pass DataFrame with features only. If policy_id is index, df.values is correct.
+        cluster_cfs = Clusters(cfs)
         results['cf_total_base_table'] = cluster_cfs.compare_total(cfs)
         results['cf_policy_attrs_total'] = cluster_cfs.compare_total(pol_data, agg=mean_attrs)
         results['cf_pv_total_base'] = cluster_cfs.compare_total(pvs)
         results['cf_pv_total_lapse'] = cluster_cfs.compare_total(pvs_lapse50)
         results['cf_pv_total_mort'] = cluster_cfs.compare_total(pvs_mort15)
         results['cf_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_cfs, scen_titles)
         results['cf_scatter_cashflows_base'] = plot_scatter_comparison(cluster_cfs.compare(cfs), 'Cashflow Calib. - Cashflows (Base)')
         # --- 2. Policy Attribute Calibration ---
+        loc_vars_attrs_input = pol_data # pol_data is already features with policy_id as index
+        if not loc_vars_attrs_input.empty:
+            # Standardize policy attributes if there's variance
+            min_vals = loc_vars_attrs_input.min()
+            max_vals = loc_vars_attrs_input.max()
+            range_vals = max_vals - min_vals
+            if (range_vals == 0).all(): # No variance
+                gr.Warning("Policy data for attribute calibration has no variance. Using original values (may lead to poor clustering if scales differ).")
+                loc_vars_attrs_scaled = loc_vars_attrs_input
             else:
+                # Scale only columns with variance, keep others as is (or handle as 0 if appropriate)
+                loc_vars_attrs_scaled = loc_vars_attrs_input.copy()
+                for col in range_vals.index:
+                    if range_vals[col] > 1e-9: # Check for non-zero range with tolerance
+                         loc_vars_attrs_scaled[col] = (loc_vars_attrs_input[col] - min_vals[col]) / range_vals[col]
+                    else: # if no variance, scaled value is 0 or 0.5 (or original)
+                         loc_vars_attrs_scaled[col] = 0.0 # Or np.nan, or keep original: loc_vars_attrs_input[col]
         else:
+            gr.Warning("Policy data for attribute calibration is empty. Skipping attribute calibration plots.")
+            loc_vars_attrs_scaled = pd.DataFrame(index=pol_data.index) # Empty DF with correct index
+        if not loc_vars_attrs_scaled.empty:
+            cluster_attrs = Clusters(loc_vars_attrs_scaled) # Pass the scaled data
             results['attr_total_cf_base'] = cluster_attrs.compare_total(cfs)
+            results['attr_policy_attrs_total'] = cluster_attrs.compare_total(pol_data, agg=mean_attrs) # Compare against original pol_data
             results['attr_total_pv_base'] = cluster_attrs.compare_total(pvs)
             results['attr_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_attrs, scen_titles)
             results['attr_scatter_cashflows_base'] = plot_scatter_comparison(cluster_attrs.compare(cfs), 'Policy Attr. Calib. - Cashflows (Base)')
             results['attr_total_cf_base'] = pd.DataFrame()
             results['attr_policy_attrs_total'] = pd.DataFrame()
             results['attr_total_pv_base'] = pd.DataFrame()
+            results['attr_cashflow_plot'] = plot_cashflows_comparison([], None, []) # Empty plot
+            results['attr_scatter_cashflows_base'] = plot_scatter_comparison(pd.DataFrame(), 'Policy Attr. Calib. - No Data')
         # --- 3. Present Value Calibration ---
+        cluster_pvs = Clusters(pvs)
         results['pv_total_cf_base'] = cluster_pvs.compare_total(cfs)
         results['pv_policy_attrs_total'] = cluster_pvs.compare_total(pol_data, agg=mean_attrs)
         results['pv_total_pv_base'] = cluster_pvs.compare_total(pvs)
         results['pv_total_pv_lapse'] = cluster_pvs.compare_total(pvs_lapse50)
         results['pv_total_pv_mort'] = cluster_pvs.compare_total(pvs_mort15)
         results['pv_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_pvs, scen_titles)
         results['pv_scatter_pvs_base'] = plot_scatter_comparison(cluster_pvs.compare(pvs), 'PV Calib. - PVs (Base)')
         # --- Summary Comparison Plot Data ---
         error_data = {}
+        def get_error_safe(compare_result_df, col_name=None):
+            """Return an absolute error figure from a comparison table, or NaN.
+
+            If ``col_name`` is one of the table's index labels, the absolute
+            value of that row's 'error' is returned; otherwise the mean of the
+            absolute non-null 'error' values. A missing, empty or 'error'-less
+            table yields NaN.
+            """
+            if compare_result_df is None:
+                return np.nan
+            if compare_result_df.empty or 'error' not in compare_result_df.columns:
                 return np.nan
+            if not col_name or col_name not in compare_result_df.index:
+                # No usable key column: average over every available error.
+                errors = compare_result_df['error'].dropna()
+                if errors.empty:
+                    return np.nan
+                return abs(errors).mean()
+            picked = compare_result_df.loc[col_name, 'error']
+            if pd.notna(picked):
+                return abs(picked)
+            return np.nan
         key_pv_col = None
+        # pvs dataframe here has policy_id as index, columns are features.
         for potential_col in ['PV_NetCF', 'pv_net_cf', 'net_cf_pv', 'PV_Net_CF']:
+            if potential_col in pvs.columns: # pvs is already loaded and indexed
                 key_pv_col = potential_col
                 break
             get_error_safe(results.get('cf_pv_total_mort'), key_pv_col)
         ]
+        if not loc_vars_attrs_scaled.empty: # Check if attribute calibration was performed
+             error_data['Attr Calib.'] = [
+                get_error_safe(results.get('attr_total_pv_base'), key_pv_col),
+                # For stressed PVs under Attr Calib, we need to call compare_total from cluster_attrs
+                get_error_safe(cluster_attrs.compare_total(pvs_lapse50), key_pv_col),
+                get_error_safe(cluster_attrs.compare_total(pvs_mort15), key_pv_col)
             ]
         else:
             error_data['Attr Calib.'] = [np.nan, np.nan, np.nan]
             get_error_safe(results.get('pv_total_pv_mort'), key_pv_col)
         ]
+        summary_df = pd.DataFrame(error_data, index=['Base', 'Lapse+50%', 'Mort+15%']).astype(float) # Ensure float for plotting
         fig_summary, ax_summary = plt.subplots(figsize=(10, 6))
+        sns.set_style("whitegrid")
+        # Melt for Seaborn barplot
+        summary_df_melted = summary_df.reset_index().rename(columns={'index': 'Scenario'})
+        summary_df_melted = summary_df_melted.melt(id_vars='Scenario', var_name='Calibration Method', value_name='Absolute Error Rate')
+        sns.barplot(x='Scenario', y='Absolute Error Rate', hue='Calibration Method', data=summary_df_melted, ax=ax_summary)
         ax_summary.set_ylabel('Absolute Error Rate')
+        title_suffix = f' for {key_pv_col}' if key_pv_col else ' (Mean Absolute Error)'
         ax_summary.set_title(f'Calibration Method Comparison - Error in Total PV{title_suffix}')
         ax_summary.tick_params(axis='x', rotation=0)
+        if ax_summary.get_legend():
+            ax_summary.get_legend().set_title('Calibration Method')
+        ax_summary.grid(True, axis='y') # Horizontal grid lines for bar plot
         plt.tight_layout()
         buf_summary = io.BytesIO()
         plt.savefig(buf_summary, format='png', dpi=100)
         buf_summary.seek(0)
         gr.Error(f"File not found: {e.filename}. Please ensure example files are in '{EXAMPLE_DATA_DIR}' or all files are uploaded.")
         return {"error": f"File not found: {e.filename}"}
     except KeyError as e:
+        gr.Error(f"A required column/index ('policy_id' or feature column) is missing or misnamed: {e}. Please check data format.")
         return {"error": f"Missing column/index: {e}"}
+    except ValueError as e: # Catch other value errors like from plotting or data prep
+        gr.Error(f"Data processing or plotting error: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return {"error": f"Data error: {str(e)}"}
     except Exception as e:
+        gr.Error(f"An unexpected error occurred: {str(e)}")
         import traceback
+        traceback.print_exc()
+        return {"error": f"Unexpected error: {str(e)}"}
+# --- Gradio interface creation (create_interface, etc.) ---
+# This part remains unchanged from your original script.
+# Ensure dummy file creation in if __name__ == "__main__": handles policy_id correctly.
 def create_interface():
     with gr.Blocks(title="Cluster Model Points Analysis") as demo:
         gr.Markdown("""
         Upload your Excel files or use the example data to analyze cashflows, policy attributes, and present values using different calibration methods.
         **Required Files (Excel .xlsx):**
+        - Cashflows - Base Scenario (should contain a 'policy_id' column, or it's the first column/index)
+        - Cashflows - Lapse Stress (+50%) (similar structure)
+        - Cashflows - Mortality Stress (+15%) (similar structure)
+        - Policy Data (should contain 'policy_id', 'age_at_entry', 'policy_term', 'sum_assured', 'duration_mth')
+        - Present Values - Base Scenario (should contain 'policy_id' and PV columns like 'PV_NetCF')
+        - Present Values - Lapse Stress (similar structure)
+        - Present Values - Mortality Stress (similar structure)
         """)
         with gr.Row():
                 attr_cashflow_plot_out = gr.Image(label="Cashflow Value Comparisons (Actual vs. Estimate) Across Scenarios")
                 attr_scatter_cashflows_base_out = gr.Image(label="Scatter Plot - Per-Cluster Cashflows (Base Scenario)")
                 with gr.Accordion("Present Value Comparisons (Total)", open=False):
+                    attr_total_pv_base_out = gr.Dataframe(label="PVs - Base Scenario Total") # Only one PV table shown in original UI for this tab
             with gr.TabItem("💰 Present Value Calibration"):
                 gr.Markdown("### Results: Using Present Values (Base Scenario) as Calibration Variables")
                         pv_total_pv_lapse_out = gr.Dataframe(label="PVs - Lapse Stress Total")
                         pv_total_pv_mort_out = gr.Dataframe(label="PVs - Mortality Stress Total")
         def get_all_output_components():
             """Return every output component of the interface as one flat list.

             The list is passed as ``outputs=`` to the analysis callback, and
             handle_analysis returns its values in this same order, so the two
             lists must be kept in step when a component is added or removed.
             """
             return [
                 summary_plot_output,
                 cf_total_base_table_out, cf_policy_attrs_total_out,
                 cf_cashflow_plot_out, cf_scatter_cashflows_base_out,
                 cf_pv_total_base_out, cf_pv_total_lapse_out, cf_pv_total_mort_out,
                 attr_total_cf_base_out, attr_policy_attrs_total_out,
                 attr_cashflow_plot_out, attr_scatter_cashflows_base_out, attr_total_pv_base_out,
                 pv_total_cf_base_out, pv_policy_attrs_total_out,
                 pv_cashflow_plot_out, pv_scatter_pvs_base_out,
                 pv_total_pv_base_out, pv_total_pv_lapse_out, pv_total_pv_mort_out
             ]
         def handle_analysis(f1, f2, f3, f4, f5, f6, f7):
             files = [f1, f2, f3, f4, f5, f6, f7]
             file_paths = []
+            # Gradio File component now passes full path for temporary files
+            for i, f_obj_path in enumerate(files): # f_obj is now a path string or None
+                if f_obj_path is None:
+                    gr.Error(f"Missing file input for argument {i+1}. Please upload all files or load examples.")
                     return [None] * len(get_all_output_components())
+                if not isinstance(f_obj_path, str): # Should be a path string
+                     gr.Error(f"Invalid file input for argument {i+1}. Expected path, got {type(f_obj_path)}")
+                     return [None] * len(get_all_output_components())
+                file_paths.append(f_obj_path)
             results = process_files(*file_paths)
+            if "error" in results :
+                return [gr.Plot.update(None)] * len(get_all_output_components()) # Clear plots on error
+            # Ensure DataFrames are converted to a format Gradio can display (e.g. List of Lists or pandas)
+            # For Dataframe components, pandas DataFrames are fine. For Image, PIL Image is fine.
             return [
                 results.get('summary_plot'),
                 results.get('cf_total_base_table'), results.get('cf_policy_attrs_total'),
                 results.get('cf_cashflow_plot'), results.get('cf_scatter_cashflows_base'),
                 results.get('cf_pv_total_base'), results.get('cf_pv_total_lapse'), results.get('cf_pv_total_mort'),
                 results.get('attr_total_cf_base'), results.get('attr_policy_attrs_total'),
                 results.get('attr_cashflow_plot'), results.get('attr_scatter_cashflows_base'), results.get('attr_total_pv_base'),
                 results.get('pv_total_cf_base'), results.get('pv_policy_attrs_total'),
                 results.get('pv_cashflow_plot'), results.get('pv_scatter_pvs_base'),
                 results.get('pv_total_pv_base'), results.get('pv_total_pv_lapse'), results.get('pv_total_pv_mort')
             outputs=get_all_output_components()
         )
         def load_example_files():
             """Return the example workbook paths, creating placeholders if needed.

             Every path in ``EXAMPLE_FILES`` that does not exist on disk is
             replaced by a small workbook of random placeholder values so the
             example flow can still be exercised.

             Returns:
                 list[str]: Seven paths, ordered cashflow base / lapse / mort,
                 policy data, PV base / lapse / mort.

             Raises:
                 gr.Error: If a placeholder workbook cannot be written, or if
                     any example file is still missing afterwards.
             """
             num_policies = 50  # Row count of each generated placeholder workbook.

             os.makedirs(EXAMPLE_DATA_DIR, exist_ok=True)
             for key, fp in EXAMPLE_FILES.items():
                 if os.path.exists(fp):
                     continue
                 gr.Info(f"Example file {fp} not found. Attempting to create a dummy file.")
                 try:
                     policy_ids = [f'P{j:03d}' for j in range(num_policies)]
                     if "cashflow" in key or "pv" in key:
                         # Ten numeric columns (time periods / PV components).
                         dummy_data = {'policy_id': policy_ids}
                         for i in range(10):
                             dummy_data[f't{i}'] = np.random.rand(num_policies) * 1000
                     elif "policy_data" in key:
                         dummy_data = {
                             'policy_id': policy_ids,
                             'age_at_entry': np.random.randint(20, 50, num_policies),
                             'policy_term': np.random.randint(10, 30, num_policies),
                             'sum_assured': np.random.randint(10000, 50000, num_policies),
                             'duration_mth': np.random.randint(1, 240, num_policies),
                         }
                     else:
                         # Fallback for keys matching neither naming pattern.
                         dummy_data = {
                             'policy_id': policy_ids,
                             'feature1': np.random.rand(num_policies),
                         }
                     # The pandas index is not written; 'policy_id' stays an
                     # ordinary column for the reader to interpret.
                     pd.DataFrame(dummy_data).to_excel(fp, index=False)
                 except Exception as e:
                     # gr.Error is an exception class: it only reaches the user
                     # when raised, so raise it instead of merely constructing it.
                     raise gr.Error(f"Could not create dummy file for {fp}: {e}") from e
                 gr.Info(f"Dummy file for '{os.path.basename(fp)}' created in '{EXAMPLE_DATA_DIR}'.")

             # Final guard: every example path must exist before it is returned.
             missing_files = [fp for fp in EXAMPLE_FILES.values() if not os.path.exists(fp)]
             if missing_files:
                 raise gr.Error(
                     f"Still missing example data files in '{EXAMPLE_DATA_DIR}': {', '.join(missing_files)}. Please ensure they exist."
                 )

             gr.Info("Example data paths loaded. Click 'Analyze Dataset'.")
             return [
                 EXAMPLE_FILES["cashflow_base"],
                 EXAMPLE_FILES["cashflow_lapse"],
                 EXAMPLE_FILES["cashflow_mort"],
                 EXAMPLE_FILES["policy_data"],
                 EXAMPLE_FILES["pv_base"],
                 EXAMPLE_FILES["pv_lapse"],
                 EXAMPLE_FILES["pv_mort"],
             ]
         load_example_btn.click(
             load_example_files,
             inputs=[],
 if __name__ == "__main__":
     # Script entry point: create the example-data folder when it is absent
     # (the notice is printed only in that case), then build and start the UI.
     if not os.path.exists(EXAMPLE_DATA_DIR):
         os.makedirs(EXAMPLE_DATA_DIR)
         print(
             f"Created directory '{EXAMPLE_DATA_DIR}'. Please place example Excel files there or dummy files will be generated on 'Load Example Data'."
         )

     demo_app = create_interface()
     demo_app.launch()