alidenewade commited on
Commit
541bbc3
·
verified ·
1 Parent(s): 6570096

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +152 -207
app.py CHANGED
@@ -2,12 +2,11 @@ import gradio as gr
2
  import numpy as np
3
  import pandas as pd
4
  from sklearn.cluster import KMeans
5
- from sklearn.metrics import pairwise_distances_argmin_min, r2_score # r2_score is not used but kept
6
- import plotly.graph_objects as go # ADDED
7
- import plotly.express as px # ADDED
8
- from plotly.subplots import make_subplots # ADDED
9
  import io
10
- import os
11
  from PIL import Image
12
 
13
  # Define the paths for example data
@@ -53,6 +52,7 @@ class Clusters:
53
  if agg:
54
  cols = df.columns
55
  mult = pd.DataFrame({c: (self.policy_count if (c not in agg or agg[c] == 'sum') else 1) for c in cols})
 
56
  extracted_df = self.extract_reps(df)
57
  mult.index = extracted_df.index
58
  return extracted_df.mul(mult)
@@ -68,199 +68,143 @@ class Clusters:
68
  def compare_total(self, df, agg=None):
69
  """Aggregate df by columns"""
70
  if agg:
 
71
  actual_values = {}
72
  for col in df.columns:
73
  if agg.get(col, 'sum') == 'mean':
74
  actual_values[col] = df[col].mean()
75
- else:
76
  actual_values[col] = df[col].sum()
77
  actual = pd.Series(actual_values)
78
 
 
79
  reps_unscaled = self.extract_reps(df)
80
  estimate_values = {}
81
 
82
  for col in df.columns:
83
  if agg.get(col, 'sum') == 'mean':
 
84
  weighted_sum = (reps_unscaled[col] * self.policy_count).sum()
85
  total_weight = self.policy_count.sum()
86
  estimate_values[col] = weighted_sum / total_weight if total_weight > 0 else 0
87
- else:
88
  estimate_values[col] = (reps_unscaled[col] * self.policy_count).sum()
 
89
  estimate = pd.Series(estimate_values)
90
- else:
 
91
  actual = df.sum()
92
  estimate = self.extract_and_scale_reps(df).sum()
93
 
 
94
  error = np.where(actual != 0, estimate / actual - 1, 0)
 
95
  return pd.DataFrame({'actual': actual, 'estimate': estimate, 'error': error})
96
 
97
 
98
  def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
99
- """Create cashflow comparison plots using Plotly"""
100
  if not cfs_list or not cluster_obj or not titles:
101
  return None
102
  num_plots = len(cfs_list)
103
  if num_plots == 0:
104
  return None
105
 
 
106
  cols = 2
107
  rows = (num_plots + cols - 1) // cols
108
 
109
- # Use subplot titles from the input 'titles'
110
- subplot_titles_full = titles[:num_plots] + [""] * (rows * cols - num_plots)
111
-
112
- fig = make_subplots(
113
- rows=rows, cols=cols,
114
- subplot_titles=subplot_titles_full
115
- )
116
-
117
- plot_idx = 0
118
- for i_df, (df, title) in enumerate(zip(cfs_list, titles)): # Use i_df to avoid conflict with internal loop i
119
- if plot_idx < rows * cols:
120
- r = plot_idx // cols + 1
121
- c = plot_idx % cols + 1
122
  comparison = cluster_obj.compare_total(df)
123
-
124
- fig.add_trace(go.Scatter(x=comparison.index, y=comparison['actual'], name='Actual',
125
- legendgroup='group1', showlegend=(plot_idx == 0)), row=r, col=c)
126
- fig.add_trace(go.Scatter(x=comparison.index, y=comparison['estimate'], name='Estimate',
127
- legendgroup='group2', showlegend=(plot_idx == 0)), row=r, col=c)
128
-
129
- fig.update_xaxes(title_text='Time', showgrid=True, row=r, col=c)
130
- fig.update_yaxes(title_text='Value', showgrid=True, row=r, col=c)
131
- plot_idx += 1
132
 
133
- # Hide unused subplots by making axes invisible and clearing titles
134
- for i in range(plot_idx, rows * cols):
135
- r = i // cols + 1
136
- c = i % cols + 1
137
- fig.update_xaxes(visible=False, row=r, col=c)
138
- fig.update_yaxes(visible=False, row=r, col=c)
139
- if fig.layout.annotations and i < len(fig.layout.annotations):
140
- fig.layout.annotations[i].update(text="")
141
-
142
-
143
- fig_width = 1500
144
- fig_height = 500 * rows
145
- fig.update_layout(
146
- width=fig_width,
147
- height=fig_height,
148
- margin=dict(l=60, r=30, t=60, b=60) # Adjusted margins
149
- )
150
 
151
- try:
152
- # Requires kaleido: pip install kaleido
153
- img_bytes = fig.to_image(format="png", width=fig_width, height=fig_height)
154
- buf = io.BytesIO(img_bytes)
 
 
 
 
 
 
155
  img = Image.open(buf)
 
156
  return img
157
- except Exception as e:
158
- print(f"Error generating cashflow plot image with Plotly/Kaleido: {e}. Ensure Kaleido is installed.")
159
- # Create a placeholder error image
160
- error_fig = go.Figure()
161
- error_fig.add_annotation(text=f"Plot Error: {e}", showarrow=False)
162
- error_fig.update_layout(width=fig_width, height=fig_height)
163
- img_bytes = error_fig.to_image(format="png", width=fig_width, height=fig_height)
164
- return Image.open(io.BytesIO(img_bytes))
165
 
166
-
167
- def plot_scatter_comparison(df_compare_output, title):
168
- """Create scatter plot comparison from compare() output using Plotly"""
169
- fig_width = 1200
170
- fig_height = 800
171
-
172
- if df_compare_output is None or df_compare_output.empty:
173
- fig = go.Figure()
174
- fig.add_annotation(
175
- text="No data to display",
176
- xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False,
177
- font=dict(size=15)
178
- )
179
- fig.update_layout(title_text=title, width=fig_width, height=fig_height)
180
  else:
181
- if not isinstance(df_compare_output.index, pd.MultiIndex) or df_compare_output.index.nlevels < 2:
182
- gr.Warning("Scatter plot data is not in the expected multi-index format. Plotting raw actual vs estimate.")
183
- fig = px.scatter(df_compare_output, x='actual', y='estimate', title=title)
184
- fig.update_traces(marker=dict(size=5, opacity=0.6)) # Set marker size and opacity
185
- else:
186
- df_reset = df_compare_output.reset_index()
187
- level_1_name = df_compare_output.index.names[1] if df_compare_output.index.names[1] else 'category'
188
- if level_1_name not in df_reset.columns: # Handle case where level name might not be in columns
189
- df_reset = df_reset.rename(columns={df_reset.columns[1]: level_1_name})
190
-
191
-
192
- fig = px.scatter(df_reset, x='actual', y='estimate', color=level_1_name,
193
- title=title,
194
- labels={'actual': 'Actual', 'estimate': 'Estimate', level_1_name: level_1_name})
195
- fig.update_traces(marker=dict(size=5, opacity=0.6)) # Set marker size and opacity
196
-
197
- num_unique_levels = df_reset[level_1_name].nunique()
198
- if num_unique_levels == 0 or num_unique_levels > 10:
199
- fig.update_layout(showlegend=False)
200
- elif num_unique_levels == 1: # Show legend even for one item if it's named
201
- fig.update_layout(showlegend=True)
202
-
203
-
204
- fig.update_xaxes(showgrid=True, title_text='Actual')
205
- fig.update_yaxes(showgrid=True, title_text='Estimate')
206
-
207
- # Draw identity line
208
- if not df_compare_output.empty:
209
- min_val_actual = df_compare_output['actual'].min()
210
- max_val_actual = df_compare_output['actual'].max()
211
- min_val_estimate = df_compare_output['estimate'].min()
212
- max_val_estimate = df_compare_output['estimate'].max()
213
-
214
- # Handle cases where min/max might be NaN (e.g. if all data is NaN)
215
- if pd.isna(min_val_actual) or pd.isna(min_val_estimate) or pd.isna(max_val_actual) or pd.isna(max_val_estimate):
216
- lims = [0,1] # Default if data is problematic
217
- else:
218
- overall_min = min(min_val_actual, min_val_estimate)
219
- overall_max = max(max_val_actual, max_val_estimate)
220
- lims = [overall_min, overall_max]
221
-
222
-
223
- if lims[0] != lims[1]: # Avoid issues if all data is single point or NaN
224
- fig.add_trace(go.Scatter(
225
- x=lims, y=lims, mode='lines', name='Identity',
226
- line=dict(color='red', width=1), # Adjusted width for Plotly
227
- showlegend=False
228
- ))
229
- fig.update_xaxes(range=lims)
230
- fig.update_yaxes(range=lims, scaleanchor="x", scaleratio=1) # Makes axes square based on data range
231
 
232
- fig.update_layout(width=fig_width, height=fig_height)
233
-
234
- try:
235
- # Requires kaleido: pip install kaleido
236
- img_bytes = fig.to_image(format="png", width=fig_width, height=fig_height)
237
- buf = io.BytesIO(img_bytes)
238
- img = Image.open(buf)
239
- return img
240
- except Exception as e:
241
- print(f"Error generating scatter plot image with Plotly/Kaleido: {e}. Ensure Kaleido is installed.")
242
- error_fig = go.Figure()
243
- error_fig.add_annotation(text=f"Plot Error: {e}", showarrow=False)
244
- error_fig.update_layout(width=fig_width, height=fig_height, title_text=title)
245
- img_bytes = error_fig.to_image(format="png", width=fig_width, height=fig_height)
246
- return Image.open(io.BytesIO(img_bytes))
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
 
249
  def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
250
  policy_data_path, pv_base_path, pv_lapse_path, pv_mort_path):
251
  """Main processing function - now accepts file paths"""
252
  try:
 
253
  cfs = pd.read_excel(cashflow_base_path, index_col=0)
254
  cfs_lapse50 = pd.read_excel(cashflow_lapse_path, index_col=0)
255
  cfs_mort15 = pd.read_excel(cashflow_mort_path, index_col=0)
256
 
257
  pol_data_full = pd.read_excel(policy_data_path, index_col=0)
 
258
  required_cols = ['age_at_entry', 'policy_term', 'sum_assured', 'duration_mth']
259
  if all(col in pol_data_full.columns for col in required_cols):
260
  pol_data = pol_data_full[required_cols]
261
  else:
262
  gr.Warning(f"Policy data might be missing required columns. Found: {pol_data_full.columns.tolist()}")
263
- pol_data = pol_data_full
264
 
265
  pvs = pd.read_excel(pv_base_path, index_col=0)
266
  pvs_lapse50 = pd.read_excel(pv_lapse_path, index_col=0)
@@ -270,43 +214,38 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
270
  scen_titles = ['Base', 'Lapse+50%', 'Mort+15%']
271
 
272
  results = {}
 
273
  mean_attrs = {'age_at_entry':'mean', 'policy_term':'mean', 'duration_mth':'mean', 'sum_assured': 'sum'}
274
 
275
  # --- 1. Cashflow Calibration ---
276
  cluster_cfs = Clusters(cfs)
 
277
  results['cf_total_base_table'] = cluster_cfs.compare_total(cfs)
278
  results['cf_policy_attrs_total'] = cluster_cfs.compare_total(pol_data, agg=mean_attrs)
 
279
  results['cf_pv_total_base'] = cluster_cfs.compare_total(pvs)
280
  results['cf_pv_total_lapse'] = cluster_cfs.compare_total(pvs_lapse50)
281
  results['cf_pv_total_mort'] = cluster_cfs.compare_total(pvs_mort15)
 
282
  results['cf_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_cfs, scen_titles)
283
  results['cf_scatter_cashflows_base'] = plot_scatter_comparison(cluster_cfs.compare(cfs), 'Cashflow Calib. - Cashflows (Base)')
284
 
285
  # --- 2. Policy Attribute Calibration ---
286
- if not pol_data.empty and (pol_data.max(numeric_only=True) - pol_data.min(numeric_only=True)).all() != 0:
287
- loc_vars_attrs = (pol_data - pol_data.min(numeric_only=True)) / (pol_data.max(numeric_only=True) - pol_data.min(numeric_only=True))
288
- loc_vars_attrs = loc_vars_attrs.fillna(0) # Fill NaNs that may result from division by zero if a column has no variance
289
  else:
290
  gr.Warning("Policy data for attribute calibration is empty or has no variance. Skipping attribute calibration plots.")
291
- loc_vars_attrs = pol_data.copy() # Use a copy
292
 
293
- if not loc_vars_attrs.empty and pd.api.types.is_numeric_dtype(loc_vars_attrs.values): # Check if data is numeric for KMeans
294
- try:
295
- cluster_attrs = Clusters(loc_vars_attrs)
296
- results['attr_total_cf_base'] = cluster_attrs.compare_total(cfs)
297
- results['attr_policy_attrs_total'] = cluster_attrs.compare_total(pol_data, agg=mean_attrs)
298
- results['attr_total_pv_base'] = cluster_attrs.compare_total(pvs)
299
- results['attr_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_attrs, scen_titles)
300
- results['attr_scatter_cashflows_base'] = plot_scatter_comparison(cluster_attrs.compare(cfs), 'Policy Attr. Calib. - Cashflows (Base)')
301
- except Exception as e_attr_clust: # Catch errors during clustering (e.g. if data is not suitable)
302
- gr.Error(f"Error during policy attribute clustering: {e_attr_clust}")
303
- results['attr_total_cf_base'] = pd.DataFrame()
304
- results['attr_policy_attrs_total'] = pd.DataFrame()
305
- results['attr_total_pv_base'] = pd.DataFrame()
306
- results['attr_cashflow_plot'] = None
307
- results['attr_scatter_cashflows_base'] = None
308
  else:
309
- gr.Warning("Skipping attribute calibration as data is empty or non-numeric after processing.")
310
  results['attr_total_cf_base'] = pd.DataFrame()
311
  results['attr_policy_attrs_total'] = pd.DataFrame()
312
  results['attr_total_pv_base'] = pd.DataFrame()
@@ -316,36 +255,48 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
316
 
317
  # --- 3. Present Value Calibration ---
318
  cluster_pvs = Clusters(pvs)
 
319
  results['pv_total_cf_base'] = cluster_pvs.compare_total(cfs)
320
  results['pv_policy_attrs_total'] = cluster_pvs.compare_total(pol_data, agg=mean_attrs)
 
321
  results['pv_total_pv_base'] = cluster_pvs.compare_total(pvs)
322
  results['pv_total_pv_lapse'] = cluster_pvs.compare_total(pvs_lapse50)
323
  results['pv_total_pv_mort'] = cluster_pvs.compare_total(pvs_mort15)
 
324
  results['pv_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_pvs, scen_titles)
325
  results['pv_scatter_pvs_base'] = plot_scatter_comparison(cluster_pvs.compare(pvs), 'PV Calib. - PVs (Base)')
326
 
327
  # --- Summary Comparison Plot Data ---
 
 
328
  error_data = {}
 
 
329
  def get_error_safe(compare_result, col_name=None):
330
  if compare_result.empty:
331
  return np.nan
332
  if col_name and col_name in compare_result.index:
333
  return abs(compare_result.loc[col_name, 'error'])
334
  else:
 
335
  return abs(compare_result['error']).mean()
336
 
 
337
  key_pv_col = None
338
- for potential_col in ['PV_NetCF', 'pv_net_cf', 'net_cf_pv', 'PV_Net_CF']:
339
  if potential_col in pvs.columns:
340
  key_pv_col = potential_col
341
  break
342
 
 
343
  error_data['CF Calib.'] = [
344
  get_error_safe(cluster_cfs.compare_total(pvs), key_pv_col),
345
  get_error_safe(cluster_cfs.compare_total(pvs_lapse50), key_pv_col),
346
  get_error_safe(cluster_cfs.compare_total(pvs_mort15), key_pv_col)
347
  ]
348
- if results.get('attr_total_pv_base') is not None and not results['attr_total_pv_base'].empty : # Check if Attr Calib was successful
 
 
349
  error_data['Attr Calib.'] = [
350
  get_error_safe(cluster_attrs.compare_total(pvs), key_pv_col),
351
  get_error_safe(cluster_attrs.compare_total(pvs_lapse50), key_pv_col),
@@ -354,51 +305,32 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
354
  else:
355
  error_data['Attr Calib.'] = [np.nan, np.nan, np.nan]
356
 
 
 
357
  error_data['PV Calib.'] = [
358
  get_error_safe(cluster_pvs.compare_total(pvs), key_pv_col),
359
  get_error_safe(cluster_pvs.compare_total(pvs_lapse50), key_pv_col),
360
  get_error_safe(cluster_pvs.compare_total(pvs_mort15), key_pv_col)
361
  ]
362
 
 
363
  summary_df = pd.DataFrame(error_data, index=['Base', 'Lapse+50%', 'Mort+15%'])
 
 
 
 
364
  title_suffix = f' ({key_pv_col})' if key_pv_col else ' (Mean Absolute Error)'
365
- plot_title = f'Calibration Method Comparison - Error in Total PV{title_suffix}'
366
- fig_width = 1000
367
- fig_height = 600
368
-
369
- summary_df_melted = summary_df.reset_index().melt(id_vars='index', var_name='Calibration Method', value_name='Absolute Error Rate')
370
- summary_df_melted.rename(columns={'index': 'Scenario'}, inplace=True)
371
-
372
-
373
- fig_summary = px.bar(
374
- summary_df_melted,
375
- x='Scenario',
376
- y='Absolute Error Rate',
377
- color='Calibration Method',
378
- barmode='group',
379
- title=plot_title
380
- )
381
- fig_summary.update_layout(
382
- width=fig_width, height=fig_height,
383
- xaxis_tickangle=0,
384
- yaxis_title='Absolute Error Rate',
385
- legend_title_text='Calibration Method'
386
- )
387
- fig_summary.update_yaxes(showgrid=True)
388
-
389
- try:
390
- # Requires kaleido: pip install kaleido
391
- buf_summary_bytes = fig_summary.to_image(format="png", width=fig_width, height=fig_height)
392
- buf_summary = io.BytesIO(buf_summary_bytes)
393
- results['summary_plot'] = Image.open(buf_summary)
394
- except Exception as e:
395
- print(f"Error generating summary plot image with Plotly/Kaleido: {e}. Ensure Kaleido is installed.")
396
- error_fig = go.Figure()
397
- error_fig.add_annotation(text=f"Plot Error: {e}", showarrow=False)
398
- error_fig.update_layout(width=fig_width, height=fig_height, title_text=plot_title)
399
- img_bytes = error_fig.to_image(format="png", width=fig_width, height=fig_height)
400
- results['summary_plot'] = Image.open(io.BytesIO(img_bytes))
401
-
402
  return results
403
 
404
  except FileNotFoundError as e:
@@ -409,9 +341,6 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
409
  return {"error": f"Missing column: {e}"}
410
  except Exception as e:
411
  gr.Error(f"Error processing files: {str(e)}")
412
- # Optionally log the full traceback for debugging
413
- import traceback
414
- traceback.print_exc()
415
  return {"error": f"Error processing files: {str(e)}"}
416
 
417
 
@@ -431,14 +360,14 @@ def create_interface():
431
  - Present Values - Base Scenario
432
  - Present Values - Lapse Stress
433
  - Present Values - Mortality Stress
434
-
435
- **Note:** Plot generation uses Plotly and Kaleido. If plots appear as errors, ensure Kaleido is installed (`pip install kaleido`).
436
  """)
437
 
438
  with gr.Row():
439
  with gr.Column(scale=1):
440
  gr.Markdown("### Upload Files or Load Examples")
 
441
  load_example_btn = gr.Button("Load Example Data")
 
442
  with gr.Row():
443
  cashflow_base_input = gr.File(label="Cashflows - Base", file_types=[".xlsx"])
444
  cashflow_lapse_input = gr.File(label="Cashflows - Lapse Stress", file_types=[".xlsx"])
@@ -449,11 +378,12 @@ def create_interface():
449
  pv_lapse_input = gr.File(label="Present Values - Lapse Stress", file_types=[".xlsx"])
450
  with gr.Row():
451
  pv_mort_input = gr.File(label="Present Values - Mortality Stress", file_types=[".xlsx"])
 
452
  analyze_btn = gr.Button("Analyze Dataset", variant="primary", size="lg")
453
 
454
  with gr.Tabs():
455
  with gr.TabItem("📊 Summary"):
456
- summary_plot_output = gr.Image(label="Calibration Methods Comparison") # Stays as gr.Image
457
 
458
  with gr.TabItem("💸 Cashflow Calibration"):
459
  gr.Markdown("### Results: Using Annual Cashflows as Calibration Variables")
@@ -478,6 +408,7 @@ def create_interface():
478
  with gr.Accordion("Present Value Comparisons (Total)", open=False):
479
  attr_total_pv_base_out = gr.Dataframe(label="PVs - Base Scenario Total")
480
 
 
481
  with gr.TabItem("💰 Present Value Calibration"):
482
  gr.Markdown("### Results: Using Present Values (Base Scenario) as Calibration Variables")
483
  with gr.Row():
@@ -491,46 +422,59 @@ def create_interface():
491
  pv_total_pv_lapse_out = gr.Dataframe(label="PVs - Lapse Stress Total")
492
  pv_total_pv_mort_out = gr.Dataframe(label="PVs - Mortality Stress Total")
493
 
 
494
  def get_all_output_components():
495
  return [
496
  summary_plot_output,
 
497
  cf_total_base_table_out, cf_policy_attrs_total_out,
498
  cf_cashflow_plot_out, cf_scatter_cashflows_base_out,
499
  cf_pv_total_base_out, cf_pv_total_lapse_out, cf_pv_total_mort_out,
 
500
  attr_total_cf_base_out, attr_policy_attrs_total_out,
501
  attr_cashflow_plot_out, attr_scatter_cashflows_base_out, attr_total_pv_base_out,
 
502
  pv_total_cf_base_out, pv_policy_attrs_total_out,
503
  pv_cashflow_plot_out, pv_scatter_pvs_base_out,
504
  pv_total_pv_base_out, pv_total_pv_lapse_out, pv_total_pv_mort_out
505
  ]
506
 
 
507
  def handle_analysis(f1, f2, f3, f4, f5, f6, f7):
508
  files = [f1, f2, f3, f4, f5, f6, f7]
 
509
  file_paths = []
510
  for i, f_obj in enumerate(files):
511
  if f_obj is None:
512
  gr.Error(f"Missing file input for argument {i+1}. Please upload all files or load examples.")
513
  return [None] * len(get_all_output_components())
 
 
514
  if hasattr(f_obj, 'name') and isinstance(f_obj.name, str):
515
  file_paths.append(f_obj.name)
 
516
  elif isinstance(f_obj, str):
517
  file_paths.append(f_obj)
518
  else:
519
  gr.Error(f"Invalid file input for argument {i+1}. Type: {type(f_obj)}")
520
  return [None] * len(get_all_output_components())
521
 
 
522
  results = process_files(*file_paths)
523
 
524
- if "error" in results:
525
  return [None] * len(get_all_output_components())
526
 
527
  return [
528
  results.get('summary_plot'),
 
529
  results.get('cf_total_base_table'), results.get('cf_policy_attrs_total'),
530
  results.get('cf_cashflow_plot'), results.get('cf_scatter_cashflows_base'),
531
  results.get('cf_pv_total_base'), results.get('cf_pv_total_lapse'), results.get('cf_pv_total_mort'),
 
532
  results.get('attr_total_cf_base'), results.get('attr_policy_attrs_total'),
533
  results.get('attr_cashflow_plot'), results.get('attr_scatter_cashflows_base'), results.get('attr_total_pv_base'),
 
534
  results.get('pv_total_cf_base'), results.get('pv_policy_attrs_total'),
535
  results.get('pv_cashflow_plot'), results.get('pv_scatter_pvs_base'),
536
  results.get('pv_total_pv_base'), results.get('pv_total_pv_lapse'), results.get('pv_total_pv_mort')
@@ -543,11 +487,12 @@ def create_interface():
543
  outputs=get_all_output_components()
544
  )
545
 
 
546
  def load_example_files():
547
  missing_files = [fp for fp in EXAMPLE_FILES.values() if not os.path.exists(fp)]
548
  if missing_files:
549
  gr.Error(f"Missing example data files in '{EXAMPLE_DATA_DIR}': {', '.join(missing_files)}. Please ensure they exist.")
550
- return [None] * 7
551
 
552
  gr.Info("Example data paths loaded. Click 'Analyze Dataset'.")
553
  return [
 
2
  import numpy as np
3
  import pandas as pd
4
  from sklearn.cluster import KMeans
5
+ from sklearn.metrics import pairwise_distances_argmin_min, r2_score
6
+ import matplotlib.pyplot as plt
7
+ import matplotlib.cm
 
8
  import io
9
+ import os # Added for path joining
10
  from PIL import Image
11
 
12
  # Define the paths for example data
 
52
  if agg:
53
  cols = df.columns
54
  mult = pd.DataFrame({c: (self.policy_count if (c not in agg or agg[c] == 'sum') else 1) for c in cols})
55
+ # Ensure mult has same index as extract_reps(df) for proper alignment
56
  extracted_df = self.extract_reps(df)
57
  mult.index = extracted_df.index
58
  return extracted_df.mul(mult)
 
68
  def compare_total(self, df, agg=None):
69
  """Aggregate df by columns"""
70
  if agg:
71
+ # Calculate actual values using specified aggregation
72
  actual_values = {}
73
  for col in df.columns:
74
  if agg.get(col, 'sum') == 'mean':
75
  actual_values[col] = df[col].mean()
76
+ else: # sum
77
  actual_values[col] = df[col].sum()
78
  actual = pd.Series(actual_values)
79
 
80
+ # Calculate estimate values
81
  reps_unscaled = self.extract_reps(df)
82
  estimate_values = {}
83
 
84
  for col in df.columns:
85
  if agg.get(col, 'sum') == 'mean':
86
+ # Weighted average for mean columns
87
  weighted_sum = (reps_unscaled[col] * self.policy_count).sum()
88
  total_weight = self.policy_count.sum()
89
  estimate_values[col] = weighted_sum / total_weight if total_weight > 0 else 0
90
+ else: # sum
91
  estimate_values[col] = (reps_unscaled[col] * self.policy_count).sum()
92
+
93
  estimate = pd.Series(estimate_values)
94
+
95
+ else: # Original logic if no agg is specified (all sum)
96
  actual = df.sum()
97
  estimate = self.extract_and_scale_reps(df).sum()
98
 
99
+ # Calculate error, handling division by zero
100
  error = np.where(actual != 0, estimate / actual - 1, 0)
101
+
102
  return pd.DataFrame({'actual': actual, 'estimate': estimate, 'error': error})
103
 
104
 
105
  def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
106
+ """Create cashflow comparison plots"""
107
  if not cfs_list or not cluster_obj or not titles:
108
  return None
109
  num_plots = len(cfs_list)
110
  if num_plots == 0:
111
  return None
112
 
113
+ # Determine subplot layout
114
  cols = 2
115
  rows = (num_plots + cols - 1) // cols
116
 
117
+ fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
118
+ axes = axes.flatten()
119
+
120
+ for i, (df, title) in enumerate(zip(cfs_list, titles)):
121
+ if i < len(axes):
 
 
 
 
 
 
 
 
122
  comparison = cluster_obj.compare_total(df)
123
+ comparison[['actual', 'estimate']].plot(ax=axes[i], grid=True, title=title)
124
+ axes[i].set_xlabel('Time')
125
+ axes[i].set_ylabel('Value')
 
 
 
 
 
 
126
 
127
+ # Hide any unused subplots
128
+ for j in range(i + 1, len(axes)):
129
+ fig.delaxes(axes[j])
130
+
131
+ plt.tight_layout()
132
+ buf = io.BytesIO()
133
+ plt.savefig(buf, format='png', dpi=100)
134
+ buf.seek(0)
135
+ img = Image.open(buf)
136
+ plt.close(fig)
137
+ return img
 
 
 
 
 
 
138
 
139
+ def plot_scatter_comparison(df_compare_output, title):
140
+ """Create scatter plot comparison from compare() output"""
141
+ if df_compare_output is None or df_compare_output.empty:
142
+ # Create a blank plot with a message
143
+ fig, ax = plt.subplots(figsize=(12, 8))
144
+ ax.text(0.5, 0.5, "No data to display", ha='center', va='center', fontsize=15)
145
+ ax.set_title(title)
146
+ buf = io.BytesIO()
147
+ plt.savefig(buf, format='png', dpi=100)
148
+ buf.seek(0)
149
  img = Image.open(buf)
150
+ plt.close(fig)
151
  return img
 
 
 
 
 
 
 
 
152
 
153
+ fig, ax = plt.subplots(figsize=(12, 8))
154
+
155
+ if not isinstance(df_compare_output.index, pd.MultiIndex) or df_compare_output.index.nlevels < 2:
156
+ gr.Warning("Scatter plot data is not in the expected multi-index format. Plotting raw actual vs estimate.")
157
+ ax.scatter(df_compare_output['actual'], df_compare_output['estimate'], s=9, alpha=0.6)
 
 
 
 
 
 
 
 
 
158
  else:
159
+ unique_levels = df_compare_output.index.get_level_values(1).unique()
160
+ colors = matplotlib.cm.rainbow(np.linspace(0, 1, len(unique_levels)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
+ for item_level, color_val in zip(unique_levels, colors):
163
+ subset = df_compare_output.xs(item_level, level=1)
164
+ ax.scatter(subset['actual'], subset['estimate'], color=color_val, s=9, alpha=0.6, label=item_level)
165
+ if len(unique_levels) > 1 and len(unique_levels) <= 10: # Add legend if reasonable number of items
166
+ ax.legend(title=df_compare_output.index.names[1])
167
+
168
+ ax.set_xlabel('Actual')
169
+ ax.set_ylabel('Estimate')
170
+ ax.set_title(title)
171
+ ax.grid(True)
172
+
173
+ # Draw identity line
174
+ lims = [
175
+ np.min([ax.get_xlim(), ax.get_ylim()]),
176
+ np.max([ax.get_xlim(), ax.get_ylim()]),
177
+ ]
178
+ if lims[0] != lims[1]: # Avoid issues if data is all zeros or single point
179
+ ax.plot(lims, lims, 'r-', linewidth=0.5)
180
+ ax.set_xlim(lims)
181
+ ax.set_ylim(lims)
182
+
183
+ buf = io.BytesIO()
184
+ plt.savefig(buf, format='png', dpi=100)
185
+ buf.seek(0)
186
+ img = Image.open(buf)
187
+ plt.close(fig)
188
+ return img
189
 
190
 
191
  def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
192
  policy_data_path, pv_base_path, pv_lapse_path, pv_mort_path):
193
  """Main processing function - now accepts file paths"""
194
  try:
195
+ # Read uploaded files using paths
196
  cfs = pd.read_excel(cashflow_base_path, index_col=0)
197
  cfs_lapse50 = pd.read_excel(cashflow_lapse_path, index_col=0)
198
  cfs_mort15 = pd.read_excel(cashflow_mort_path, index_col=0)
199
 
200
  pol_data_full = pd.read_excel(policy_data_path, index_col=0)
201
+ # Ensure the correct columns are selected for pol_data
202
  required_cols = ['age_at_entry', 'policy_term', 'sum_assured', 'duration_mth']
203
  if all(col in pol_data_full.columns for col in required_cols):
204
  pol_data = pol_data_full[required_cols]
205
  else:
206
  gr.Warning(f"Policy data might be missing required columns. Found: {pol_data_full.columns.tolist()}")
207
+ pol_data = pol_data_full # proceed with whatever columns are there
208
 
209
  pvs = pd.read_excel(pv_base_path, index_col=0)
210
  pvs_lapse50 = pd.read_excel(pv_lapse_path, index_col=0)
 
214
  scen_titles = ['Base', 'Lapse+50%', 'Mort+15%']
215
 
216
  results = {}
217
+
218
  mean_attrs = {'age_at_entry':'mean', 'policy_term':'mean', 'duration_mth':'mean', 'sum_assured': 'sum'}
219
 
220
  # --- 1. Cashflow Calibration ---
221
  cluster_cfs = Clusters(cfs)
222
+
223
  results['cf_total_base_table'] = cluster_cfs.compare_total(cfs)
224
  results['cf_policy_attrs_total'] = cluster_cfs.compare_total(pol_data, agg=mean_attrs)
225
+
226
  results['cf_pv_total_base'] = cluster_cfs.compare_total(pvs)
227
  results['cf_pv_total_lapse'] = cluster_cfs.compare_total(pvs_lapse50)
228
  results['cf_pv_total_mort'] = cluster_cfs.compare_total(pvs_mort15)
229
+
230
  results['cf_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_cfs, scen_titles)
231
  results['cf_scatter_cashflows_base'] = plot_scatter_comparison(cluster_cfs.compare(cfs), 'Cashflow Calib. - Cashflows (Base)')
232
 
233
  # --- 2. Policy Attribute Calibration ---
234
+ # Standardize policy attributes
235
+ if not pol_data.empty and (pol_data.max() - pol_data.min()).all() != 0: # check for variance
236
+ loc_vars_attrs = (pol_data - pol_data.min()) / (pol_data.max() - pol_data.min())
237
  else:
238
  gr.Warning("Policy data for attribute calibration is empty or has no variance. Skipping attribute calibration plots.")
239
+ loc_vars_attrs = pol_data # Use original if no variance, KMeans might handle it or fail gracefully
240
 
241
+ if not loc_vars_attrs.empty:
242
+ cluster_attrs = Clusters(loc_vars_attrs)
243
+ results['attr_total_cf_base'] = cluster_attrs.compare_total(cfs)
244
+ results['attr_policy_attrs_total'] = cluster_attrs.compare_total(pol_data, agg=mean_attrs)
245
+ results['attr_total_pv_base'] = cluster_attrs.compare_total(pvs)
246
+ results['attr_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_attrs, scen_titles)
247
+ results['attr_scatter_cashflows_base'] = plot_scatter_comparison(cluster_attrs.compare(cfs), 'Policy Attr. Calib. - Cashflows (Base)')
 
 
 
 
 
 
 
 
248
  else:
 
249
  results['attr_total_cf_base'] = pd.DataFrame()
250
  results['attr_policy_attrs_total'] = pd.DataFrame()
251
  results['attr_total_pv_base'] = pd.DataFrame()
 
255
 
256
  # --- 3. Present Value Calibration ---
257
  cluster_pvs = Clusters(pvs)
258
+
259
  results['pv_total_cf_base'] = cluster_pvs.compare_total(cfs)
260
  results['pv_policy_attrs_total'] = cluster_pvs.compare_total(pol_data, agg=mean_attrs)
261
+
262
  results['pv_total_pv_base'] = cluster_pvs.compare_total(pvs)
263
  results['pv_total_pv_lapse'] = cluster_pvs.compare_total(pvs_lapse50)
264
  results['pv_total_pv_mort'] = cluster_pvs.compare_total(pvs_mort15)
265
+
266
  results['pv_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_pvs, scen_titles)
267
  results['pv_scatter_pvs_base'] = plot_scatter_comparison(cluster_pvs.compare(pvs), 'PV Calib. - PVs (Base)')
268
 
269
  # --- Summary Comparison Plot Data ---
270
+ # Error metric for key PV column or mean absolute error
271
+
272
  error_data = {}
273
+
274
+ # Function to safely get error value
275
  def get_error_safe(compare_result, col_name=None):
276
  if compare_result.empty:
277
  return np.nan
278
  if col_name and col_name in compare_result.index:
279
  return abs(compare_result.loc[col_name, 'error'])
280
  else:
281
+ # Use mean absolute error if specific column not found or col_name is None
282
  return abs(compare_result['error']).mean()
283
 
284
+ # Determine key PV column (try common names)
285
  key_pv_col = None
286
+ for potential_col in ['PV_NetCF', 'pv_net_cf', 'net_cf_pv', 'PV_Net_CF']: # Add more common names if needed
287
  if potential_col in pvs.columns:
288
  key_pv_col = potential_col
289
  break
290
 
291
+ # Cashflow Calibration Errors
292
  error_data['CF Calib.'] = [
293
  get_error_safe(cluster_cfs.compare_total(pvs), key_pv_col),
294
  get_error_safe(cluster_cfs.compare_total(pvs_lapse50), key_pv_col),
295
  get_error_safe(cluster_cfs.compare_total(pvs_mort15), key_pv_col)
296
  ]
297
+
298
+ # Policy Attribute Calibration Errors
299
+ if not loc_vars_attrs.empty:
300
  error_data['Attr Calib.'] = [
301
  get_error_safe(cluster_attrs.compare_total(pvs), key_pv_col),
302
  get_error_safe(cluster_attrs.compare_total(pvs_lapse50), key_pv_col),
 
305
  else:
306
  error_data['Attr Calib.'] = [np.nan, np.nan, np.nan]
307
 
308
+
309
+ # Present Value Calibration Errors
310
  error_data['PV Calib.'] = [
311
  get_error_safe(cluster_pvs.compare_total(pvs), key_pv_col),
312
  get_error_safe(cluster_pvs.compare_total(pvs_lapse50), key_pv_col),
313
  get_error_safe(cluster_pvs.compare_total(pvs_mort15), key_pv_col)
314
  ]
315
 
316
+ # Create Summary Plot
317
  summary_df = pd.DataFrame(error_data, index=['Base', 'Lapse+50%', 'Mort+15%'])
318
+
319
+ fig_summary, ax_summary = plt.subplots(figsize=(10, 6))
320
+ summary_df.plot(kind='bar', ax=ax_summary, grid=True)
321
+ ax_summary.set_ylabel('Absolute Error Rate')
322
  title_suffix = f' ({key_pv_col})' if key_pv_col else ' (Mean Absolute Error)'
323
+ ax_summary.set_title(f'Calibration Method Comparison - Error in Total PV{title_suffix}')
324
+ ax_summary.tick_params(axis='x', rotation=0)
325
+ ax_summary.legend(title='Calibration Method')
326
+ plt.tight_layout()
327
+
328
+ buf_summary = io.BytesIO()
329
+ plt.savefig(buf_summary, format='png', dpi=100)
330
+ buf_summary.seek(0)
331
+ results['summary_plot'] = Image.open(buf_summary)
332
+ plt.close(fig_summary)
333
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
  return results
335
 
336
  except FileNotFoundError as e:
 
341
  return {"error": f"Missing column: {e}"}
342
  except Exception as e:
343
  gr.Error(f"Error processing files: {str(e)}")
 
 
 
344
  return {"error": f"Error processing files: {str(e)}"}
345
 
346
 
 
360
  - Present Values - Base Scenario
361
  - Present Values - Lapse Stress
362
  - Present Values - Mortality Stress
 
 
363
  """)
364
 
365
  with gr.Row():
366
  with gr.Column(scale=1):
367
  gr.Markdown("### Upload Files or Load Examples")
368
+
369
  load_example_btn = gr.Button("Load Example Data")
370
+
371
  with gr.Row():
372
  cashflow_base_input = gr.File(label="Cashflows - Base", file_types=[".xlsx"])
373
  cashflow_lapse_input = gr.File(label="Cashflows - Lapse Stress", file_types=[".xlsx"])
 
378
  pv_lapse_input = gr.File(label="Present Values - Lapse Stress", file_types=[".xlsx"])
379
  with gr.Row():
380
  pv_mort_input = gr.File(label="Present Values - Mortality Stress", file_types=[".xlsx"])
381
+
382
  analyze_btn = gr.Button("Analyze Dataset", variant="primary", size="lg")
383
 
384
  with gr.Tabs():
385
  with gr.TabItem("📊 Summary"):
386
+ summary_plot_output = gr.Image(label="Calibration Methods Comparison")
387
 
388
  with gr.TabItem("💸 Cashflow Calibration"):
389
  gr.Markdown("### Results: Using Annual Cashflows as Calibration Variables")
 
408
  with gr.Accordion("Present Value Comparisons (Total)", open=False):
409
  attr_total_pv_base_out = gr.Dataframe(label="PVs - Base Scenario Total")
410
 
411
+
412
  with gr.TabItem("💰 Present Value Calibration"):
413
  gr.Markdown("### Results: Using Present Values (Base Scenario) as Calibration Variables")
414
  with gr.Row():
 
422
  pv_total_pv_lapse_out = gr.Dataframe(label="PVs - Lapse Stress Total")
423
  pv_total_pv_mort_out = gr.Dataframe(label="PVs - Mortality Stress Total")
424
 
425
+ # --- Helper function to prepare outputs ---
426
def get_all_output_components():
    """Return every Gradio output component, in the exact order that
    ``handle_analysis`` emits its results list."""
    summary_outputs = [summary_plot_output]
    # Cashflow-calibration tab outputs
    cf_outputs = [
        cf_total_base_table_out, cf_policy_attrs_total_out,
        cf_cashflow_plot_out, cf_scatter_cashflows_base_out,
        cf_pv_total_base_out, cf_pv_total_lapse_out, cf_pv_total_mort_out,
    ]
    # Policy-attribute-calibration tab outputs
    attr_outputs = [
        attr_total_cf_base_out, attr_policy_attrs_total_out,
        attr_cashflow_plot_out, attr_scatter_cashflows_base_out,
        attr_total_pv_base_out,
    ]
    # Present-value-calibration tab outputs
    pv_outputs = [
        pv_total_cf_base_out, pv_policy_attrs_total_out,
        pv_cashflow_plot_out, pv_scatter_pvs_base_out,
        pv_total_pv_base_out, pv_total_pv_lapse_out, pv_total_pv_mort_out,
    ]
    return summary_outputs + cf_outputs + attr_outputs + pv_outputs
441
 
442
+ # --- Action for Analyze Button ---
443
  def handle_analysis(f1, f2, f3, f4, f5, f6, f7):
444
  files = [f1, f2, f3, f4, f5, f6, f7]
445
+
446
  file_paths = []
447
  for i, f_obj in enumerate(files):
448
  if f_obj is None:
449
  gr.Error(f"Missing file input for argument {i+1}. Please upload all files or load examples.")
450
  return [None] * len(get_all_output_components())
451
+
452
+ # If f_obj is a Gradio FileData object (from direct upload)
453
  if hasattr(f_obj, 'name') and isinstance(f_obj.name, str):
454
  file_paths.append(f_obj.name)
455
+ # If f_obj is already a string path (from example load)
456
  elif isinstance(f_obj, str):
457
  file_paths.append(f_obj)
458
  else:
459
  gr.Error(f"Invalid file input for argument {i+1}. Type: {type(f_obj)}")
460
  return [None] * len(get_all_output_components())
461
 
462
+
463
  results = process_files(*file_paths)
464
 
465
+ if "error" in results: # Check if process_files returned an error dictionary
466
  return [None] * len(get_all_output_components())
467
 
468
  return [
469
  results.get('summary_plot'),
470
+ # CF Calib
471
  results.get('cf_total_base_table'), results.get('cf_policy_attrs_total'),
472
  results.get('cf_cashflow_plot'), results.get('cf_scatter_cashflows_base'),
473
  results.get('cf_pv_total_base'), results.get('cf_pv_total_lapse'), results.get('cf_pv_total_mort'),
474
+ # Attr Calib
475
  results.get('attr_total_cf_base'), results.get('attr_policy_attrs_total'),
476
  results.get('attr_cashflow_plot'), results.get('attr_scatter_cashflows_base'), results.get('attr_total_pv_base'),
477
+ # PV Calib
478
  results.get('pv_total_cf_base'), results.get('pv_policy_attrs_total'),
479
  results.get('pv_cashflow_plot'), results.get('pv_scatter_pvs_base'),
480
  results.get('pv_total_pv_base'), results.get('pv_total_pv_lapse'), results.get('pv_total_pv_mort')
 
487
  outputs=get_all_output_components()
488
  )
489
 
490
+ # --- Action for Load Example Data Button ---
491
  def load_example_files():
492
  missing_files = [fp for fp in EXAMPLE_FILES.values() if not os.path.exists(fp)]
493
  if missing_files:
494
  gr.Error(f"Missing example data files in '{EXAMPLE_DATA_DIR}': {', '.join(missing_files)}. Please ensure they exist.")
495
+ return [None] * 7 # Return None for all file inputs
496
 
497
  gr.Info("Example data paths loaded. Click 'Analyze Dataset'.")
498
  return [