alidenewade commited on
Commit
6570096
·
verified ·
1 Parent(s): 3adb7ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +207 -152
app.py CHANGED
@@ -2,11 +2,12 @@ import gradio as gr
2
  import numpy as np
3
  import pandas as pd
4
  from sklearn.cluster import KMeans
5
- from sklearn.metrics import pairwise_distances_argmin_min, r2_score
6
- import matplotlib.pyplot as plt
7
- import matplotlib.cm
 
8
  import io
9
- import os # Added for path joining
10
  from PIL import Image
11
 
12
  # Define the paths for example data
@@ -52,7 +53,6 @@ class Clusters:
52
  if agg:
53
  cols = df.columns
54
  mult = pd.DataFrame({c: (self.policy_count if (c not in agg or agg[c] == 'sum') else 1) for c in cols})
55
- # Ensure mult has same index as extract_reps(df) for proper alignment
56
  extracted_df = self.extract_reps(df)
57
  mult.index = extracted_df.index
58
  return extracted_df.mul(mult)
@@ -68,143 +68,199 @@ class Clusters:
68
  def compare_total(self, df, agg=None):
69
  """Aggregate df by columns"""
70
  if agg:
71
- # Calculate actual values using specified aggregation
72
  actual_values = {}
73
  for col in df.columns:
74
  if agg.get(col, 'sum') == 'mean':
75
  actual_values[col] = df[col].mean()
76
- else: # sum
77
  actual_values[col] = df[col].sum()
78
  actual = pd.Series(actual_values)
79
 
80
- # Calculate estimate values
81
  reps_unscaled = self.extract_reps(df)
82
  estimate_values = {}
83
 
84
  for col in df.columns:
85
  if agg.get(col, 'sum') == 'mean':
86
- # Weighted average for mean columns
87
  weighted_sum = (reps_unscaled[col] * self.policy_count).sum()
88
  total_weight = self.policy_count.sum()
89
  estimate_values[col] = weighted_sum / total_weight if total_weight > 0 else 0
90
- else: # sum
91
  estimate_values[col] = (reps_unscaled[col] * self.policy_count).sum()
92
-
93
  estimate = pd.Series(estimate_values)
94
-
95
- else: # Original logic if no agg is specified (all sum)
96
  actual = df.sum()
97
  estimate = self.extract_and_scale_reps(df).sum()
98
 
99
- # Calculate error, handling division by zero
100
  error = np.where(actual != 0, estimate / actual - 1, 0)
101
-
102
  return pd.DataFrame({'actual': actual, 'estimate': estimate, 'error': error})
103
 
104
 
105
  def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
106
- """Create cashflow comparison plots"""
107
  if not cfs_list or not cluster_obj or not titles:
108
  return None
109
  num_plots = len(cfs_list)
110
  if num_plots == 0:
111
  return None
112
 
113
- # Determine subplot layout
114
  cols = 2
115
  rows = (num_plots + cols - 1) // cols
116
 
117
- fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
118
- axes = axes.flatten()
119
-
120
- for i, (df, title) in enumerate(zip(cfs_list, titles)):
121
- if i < len(axes):
 
 
 
 
 
 
 
 
122
  comparison = cluster_obj.compare_total(df)
123
- comparison[['actual', 'estimate']].plot(ax=axes[i], grid=True, title=title)
124
- axes[i].set_xlabel('Time')
125
- axes[i].set_ylabel('Value')
 
 
 
 
 
 
126
 
127
- # Hide any unused subplots
128
- for j in range(i + 1, len(axes)):
129
- fig.delaxes(axes[j])
130
-
131
- plt.tight_layout()
132
- buf = io.BytesIO()
133
- plt.savefig(buf, format='png', dpi=100)
134
- buf.seek(0)
135
- img = Image.open(buf)
136
- plt.close(fig)
137
- return img
 
 
 
 
 
 
138
 
139
- def plot_scatter_comparison(df_compare_output, title):
140
- """Create scatter plot comparison from compare() output"""
141
- if df_compare_output is None or df_compare_output.empty:
142
- # Create a blank plot with a message
143
- fig, ax = plt.subplots(figsize=(12, 8))
144
- ax.text(0.5, 0.5, "No data to display", ha='center', va='center', fontsize=15)
145
- ax.set_title(title)
146
- buf = io.BytesIO()
147
- plt.savefig(buf, format='png', dpi=100)
148
- buf.seek(0)
149
  img = Image.open(buf)
150
- plt.close(fig)
151
  return img
 
 
 
 
 
 
 
 
152
 
153
- fig, ax = plt.subplots(figsize=(12, 8))
154
-
155
- if not isinstance(df_compare_output.index, pd.MultiIndex) or df_compare_output.index.nlevels < 2:
156
- gr.Warning("Scatter plot data is not in the expected multi-index format. Plotting raw actual vs estimate.")
157
- ax.scatter(df_compare_output['actual'], df_compare_output['estimate'], s=9, alpha=0.6)
 
 
 
 
 
 
 
 
 
158
  else:
159
- unique_levels = df_compare_output.index.get_level_values(1).unique()
160
- colors = matplotlib.cm.rainbow(np.linspace(0, 1, len(unique_levels)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
- for item_level, color_val in zip(unique_levels, colors):
163
- subset = df_compare_output.xs(item_level, level=1)
164
- ax.scatter(subset['actual'], subset['estimate'], color=color_val, s=9, alpha=0.6, label=item_level)
165
- if len(unique_levels) > 1 and len(unique_levels) <= 10: # Add legend if reasonable number of items
166
- ax.legend(title=df_compare_output.index.names[1])
167
-
168
- ax.set_xlabel('Actual')
169
- ax.set_ylabel('Estimate')
170
- ax.set_title(title)
171
- ax.grid(True)
172
-
173
- # Draw identity line
174
- lims = [
175
- np.min([ax.get_xlim(), ax.get_ylim()]),
176
- np.max([ax.get_xlim(), ax.get_ylim()]),
177
- ]
178
- if lims[0] != lims[1]: # Avoid issues if data is all zeros or single point
179
- ax.plot(lims, lims, 'r-', linewidth=0.5)
180
- ax.set_xlim(lims)
181
- ax.set_ylim(lims)
182
-
183
- buf = io.BytesIO()
184
- plt.savefig(buf, format='png', dpi=100)
185
- buf.seek(0)
186
- img = Image.open(buf)
187
- plt.close(fig)
188
- return img
189
 
190
 
191
  def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
192
  policy_data_path, pv_base_path, pv_lapse_path, pv_mort_path):
193
  """Main processing function - now accepts file paths"""
194
  try:
195
- # Read uploaded files using paths
196
  cfs = pd.read_excel(cashflow_base_path, index_col=0)
197
  cfs_lapse50 = pd.read_excel(cashflow_lapse_path, index_col=0)
198
  cfs_mort15 = pd.read_excel(cashflow_mort_path, index_col=0)
199
 
200
  pol_data_full = pd.read_excel(policy_data_path, index_col=0)
201
- # Ensure the correct columns are selected for pol_data
202
  required_cols = ['age_at_entry', 'policy_term', 'sum_assured', 'duration_mth']
203
  if all(col in pol_data_full.columns for col in required_cols):
204
  pol_data = pol_data_full[required_cols]
205
  else:
206
  gr.Warning(f"Policy data might be missing required columns. Found: {pol_data_full.columns.tolist()}")
207
- pol_data = pol_data_full # proceed with whatever columns are there
208
 
209
  pvs = pd.read_excel(pv_base_path, index_col=0)
210
  pvs_lapse50 = pd.read_excel(pv_lapse_path, index_col=0)
@@ -214,38 +270,43 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
214
  scen_titles = ['Base', 'Lapse+50%', 'Mort+15%']
215
 
216
  results = {}
217
-
218
  mean_attrs = {'age_at_entry':'mean', 'policy_term':'mean', 'duration_mth':'mean', 'sum_assured': 'sum'}
219
 
220
  # --- 1. Cashflow Calibration ---
221
  cluster_cfs = Clusters(cfs)
222
-
223
  results['cf_total_base_table'] = cluster_cfs.compare_total(cfs)
224
  results['cf_policy_attrs_total'] = cluster_cfs.compare_total(pol_data, agg=mean_attrs)
225
-
226
  results['cf_pv_total_base'] = cluster_cfs.compare_total(pvs)
227
  results['cf_pv_total_lapse'] = cluster_cfs.compare_total(pvs_lapse50)
228
  results['cf_pv_total_mort'] = cluster_cfs.compare_total(pvs_mort15)
229
-
230
  results['cf_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_cfs, scen_titles)
231
  results['cf_scatter_cashflows_base'] = plot_scatter_comparison(cluster_cfs.compare(cfs), 'Cashflow Calib. - Cashflows (Base)')
232
 
233
  # --- 2. Policy Attribute Calibration ---
234
- # Standardize policy attributes
235
- if not pol_data.empty and (pol_data.max() - pol_data.min()).all() != 0: # check for variance
236
- loc_vars_attrs = (pol_data - pol_data.min()) / (pol_data.max() - pol_data.min())
237
  else:
238
  gr.Warning("Policy data for attribute calibration is empty or has no variance. Skipping attribute calibration plots.")
239
- loc_vars_attrs = pol_data # Use original if no variance, KMeans might handle it or fail gracefully
240
 
241
- if not loc_vars_attrs.empty:
242
- cluster_attrs = Clusters(loc_vars_attrs)
243
- results['attr_total_cf_base'] = cluster_attrs.compare_total(cfs)
244
- results['attr_policy_attrs_total'] = cluster_attrs.compare_total(pol_data, agg=mean_attrs)
245
- results['attr_total_pv_base'] = cluster_attrs.compare_total(pvs)
246
- results['attr_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_attrs, scen_titles)
247
- results['attr_scatter_cashflows_base'] = plot_scatter_comparison(cluster_attrs.compare(cfs), 'Policy Attr. Calib. - Cashflows (Base)')
 
 
 
 
 
 
 
 
248
  else:
 
249
  results['attr_total_cf_base'] = pd.DataFrame()
250
  results['attr_policy_attrs_total'] = pd.DataFrame()
251
  results['attr_total_pv_base'] = pd.DataFrame()
@@ -255,48 +316,36 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
255
 
256
  # --- 3. Present Value Calibration ---
257
  cluster_pvs = Clusters(pvs)
258
-
259
  results['pv_total_cf_base'] = cluster_pvs.compare_total(cfs)
260
  results['pv_policy_attrs_total'] = cluster_pvs.compare_total(pol_data, agg=mean_attrs)
261
-
262
  results['pv_total_pv_base'] = cluster_pvs.compare_total(pvs)
263
  results['pv_total_pv_lapse'] = cluster_pvs.compare_total(pvs_lapse50)
264
  results['pv_total_pv_mort'] = cluster_pvs.compare_total(pvs_mort15)
265
-
266
  results['pv_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_pvs, scen_titles)
267
  results['pv_scatter_pvs_base'] = plot_scatter_comparison(cluster_pvs.compare(pvs), 'PV Calib. - PVs (Base)')
268
 
269
  # --- Summary Comparison Plot Data ---
270
- # Error metric for key PV column or mean absolute error
271
-
272
  error_data = {}
273
-
274
- # Function to safely get error value
275
  def get_error_safe(compare_result, col_name=None):
276
  if compare_result.empty:
277
  return np.nan
278
  if col_name and col_name in compare_result.index:
279
  return abs(compare_result.loc[col_name, 'error'])
280
  else:
281
- # Use mean absolute error if specific column not found or col_name is None
282
  return abs(compare_result['error']).mean()
283
 
284
- # Determine key PV column (try common names)
285
  key_pv_col = None
286
- for potential_col in ['PV_NetCF', 'pv_net_cf', 'net_cf_pv', 'PV_Net_CF']: # Add more common names if needed
287
  if potential_col in pvs.columns:
288
  key_pv_col = potential_col
289
  break
290
 
291
- # Cashflow Calibration Errors
292
  error_data['CF Calib.'] = [
293
  get_error_safe(cluster_cfs.compare_total(pvs), key_pv_col),
294
  get_error_safe(cluster_cfs.compare_total(pvs_lapse50), key_pv_col),
295
  get_error_safe(cluster_cfs.compare_total(pvs_mort15), key_pv_col)
296
  ]
297
-
298
- # Policy Attribute Calibration Errors
299
- if not loc_vars_attrs.empty:
300
  error_data['Attr Calib.'] = [
301
  get_error_safe(cluster_attrs.compare_total(pvs), key_pv_col),
302
  get_error_safe(cluster_attrs.compare_total(pvs_lapse50), key_pv_col),
@@ -305,32 +354,51 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
305
  else:
306
  error_data['Attr Calib.'] = [np.nan, np.nan, np.nan]
307
 
308
-
309
- # Present Value Calibration Errors
310
  error_data['PV Calib.'] = [
311
  get_error_safe(cluster_pvs.compare_total(pvs), key_pv_col),
312
  get_error_safe(cluster_pvs.compare_total(pvs_lapse50), key_pv_col),
313
  get_error_safe(cluster_pvs.compare_total(pvs_mort15), key_pv_col)
314
  ]
315
 
316
- # Create Summary Plot
317
  summary_df = pd.DataFrame(error_data, index=['Base', 'Lapse+50%', 'Mort+15%'])
318
-
319
- fig_summary, ax_summary = plt.subplots(figsize=(10, 6))
320
- summary_df.plot(kind='bar', ax=ax_summary, grid=True)
321
- ax_summary.set_ylabel('Absolute Error Rate')
322
  title_suffix = f' ({key_pv_col})' if key_pv_col else ' (Mean Absolute Error)'
323
- ax_summary.set_title(f'Calibration Method Comparison - Error in Total PV{title_suffix}')
324
- ax_summary.tick_params(axis='x', rotation=0)
325
- ax_summary.legend(title='Calibration Method')
326
- plt.tight_layout()
327
-
328
- buf_summary = io.BytesIO()
329
- plt.savefig(buf_summary, format='png', dpi=100)
330
- buf_summary.seek(0)
331
- results['summary_plot'] = Image.open(buf_summary)
332
- plt.close(fig_summary)
333
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
  return results
335
 
336
  except FileNotFoundError as e:
@@ -341,6 +409,9 @@ def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
341
  return {"error": f"Missing column: {e}"}
342
  except Exception as e:
343
  gr.Error(f"Error processing files: {str(e)}")
 
 
 
344
  return {"error": f"Error processing files: {str(e)}"}
345
 
346
 
@@ -360,14 +431,14 @@ def create_interface():
360
  - Present Values - Base Scenario
361
  - Present Values - Lapse Stress
362
  - Present Values - Mortality Stress
 
 
363
  """)
364
 
365
  with gr.Row():
366
  with gr.Column(scale=1):
367
  gr.Markdown("### Upload Files or Load Examples")
368
-
369
  load_example_btn = gr.Button("Load Example Data")
370
-
371
  with gr.Row():
372
  cashflow_base_input = gr.File(label="Cashflows - Base", file_types=[".xlsx"])
373
  cashflow_lapse_input = gr.File(label="Cashflows - Lapse Stress", file_types=[".xlsx"])
@@ -378,12 +449,11 @@ def create_interface():
378
  pv_lapse_input = gr.File(label="Present Values - Lapse Stress", file_types=[".xlsx"])
379
  with gr.Row():
380
  pv_mort_input = gr.File(label="Present Values - Mortality Stress", file_types=[".xlsx"])
381
-
382
  analyze_btn = gr.Button("Analyze Dataset", variant="primary", size="lg")
383
 
384
  with gr.Tabs():
385
  with gr.TabItem("📊 Summary"):
386
- summary_plot_output = gr.Image(label="Calibration Methods Comparison")
387
 
388
  with gr.TabItem("💸 Cashflow Calibration"):
389
  gr.Markdown("### Results: Using Annual Cashflows as Calibration Variables")
@@ -408,7 +478,6 @@ def create_interface():
408
  with gr.Accordion("Present Value Comparisons (Total)", open=False):
409
  attr_total_pv_base_out = gr.Dataframe(label="PVs - Base Scenario Total")
410
 
411
-
412
  with gr.TabItem("💰 Present Value Calibration"):
413
  gr.Markdown("### Results: Using Present Values (Base Scenario) as Calibration Variables")
414
  with gr.Row():
@@ -422,59 +491,46 @@ def create_interface():
422
  pv_total_pv_lapse_out = gr.Dataframe(label="PVs - Lapse Stress Total")
423
  pv_total_pv_mort_out = gr.Dataframe(label="PVs - Mortality Stress Total")
424
 
425
- # --- Helper function to prepare outputs ---
426
  def get_all_output_components():
427
  return [
428
  summary_plot_output,
429
- # Cashflow Calib Outputs
430
  cf_total_base_table_out, cf_policy_attrs_total_out,
431
  cf_cashflow_plot_out, cf_scatter_cashflows_base_out,
432
  cf_pv_total_base_out, cf_pv_total_lapse_out, cf_pv_total_mort_out,
433
- # Attribute Calib Outputs
434
  attr_total_cf_base_out, attr_policy_attrs_total_out,
435
  attr_cashflow_plot_out, attr_scatter_cashflows_base_out, attr_total_pv_base_out,
436
- # PV Calib Outputs
437
  pv_total_cf_base_out, pv_policy_attrs_total_out,
438
  pv_cashflow_plot_out, pv_scatter_pvs_base_out,
439
  pv_total_pv_base_out, pv_total_pv_lapse_out, pv_total_pv_mort_out
440
  ]
441
 
442
- # --- Action for Analyze Button ---
443
  def handle_analysis(f1, f2, f3, f4, f5, f6, f7):
444
  files = [f1, f2, f3, f4, f5, f6, f7]
445
-
446
  file_paths = []
447
  for i, f_obj in enumerate(files):
448
  if f_obj is None:
449
  gr.Error(f"Missing file input for argument {i+1}. Please upload all files or load examples.")
450
  return [None] * len(get_all_output_components())
451
-
452
- # If f_obj is a Gradio FileData object (from direct upload)
453
  if hasattr(f_obj, 'name') and isinstance(f_obj.name, str):
454
  file_paths.append(f_obj.name)
455
- # If f_obj is already a string path (from example load)
456
  elif isinstance(f_obj, str):
457
  file_paths.append(f_obj)
458
  else:
459
  gr.Error(f"Invalid file input for argument {i+1}. Type: {type(f_obj)}")
460
  return [None] * len(get_all_output_components())
461
 
462
-
463
  results = process_files(*file_paths)
464
 
465
- if "error" in results: # Check if process_files returned an error dictionary
466
  return [None] * len(get_all_output_components())
467
 
468
  return [
469
  results.get('summary_plot'),
470
- # CF Calib
471
  results.get('cf_total_base_table'), results.get('cf_policy_attrs_total'),
472
  results.get('cf_cashflow_plot'), results.get('cf_scatter_cashflows_base'),
473
  results.get('cf_pv_total_base'), results.get('cf_pv_total_lapse'), results.get('cf_pv_total_mort'),
474
- # Attr Calib
475
  results.get('attr_total_cf_base'), results.get('attr_policy_attrs_total'),
476
  results.get('attr_cashflow_plot'), results.get('attr_scatter_cashflows_base'), results.get('attr_total_pv_base'),
477
- # PV Calib
478
  results.get('pv_total_cf_base'), results.get('pv_policy_attrs_total'),
479
  results.get('pv_cashflow_plot'), results.get('pv_scatter_pvs_base'),
480
  results.get('pv_total_pv_base'), results.get('pv_total_pv_lapse'), results.get('pv_total_pv_mort')
@@ -487,12 +543,11 @@ def create_interface():
487
  outputs=get_all_output_components()
488
  )
489
 
490
- # --- Action for Load Example Data Button ---
491
  def load_example_files():
492
  missing_files = [fp for fp in EXAMPLE_FILES.values() if not os.path.exists(fp)]
493
  if missing_files:
494
  gr.Error(f"Missing example data files in '{EXAMPLE_DATA_DIR}': {', '.join(missing_files)}. Please ensure they exist.")
495
- return [None] * 7 # Return None for all file inputs
496
 
497
  gr.Info("Example data paths loaded. Click 'Analyze Dataset'.")
498
  return [
 
2
  import numpy as np
3
  import pandas as pd
4
  from sklearn.cluster import KMeans
5
+ from sklearn.metrics import pairwise_distances_argmin_min, r2_score # r2_score is not used but kept
6
+ import plotly.graph_objects as go # ADDED
7
+ import plotly.express as px # ADDED
8
+ from plotly.subplots import make_subplots # ADDED
9
  import io
10
+ import os
11
  from PIL import Image
12
 
13
  # Define the paths for example data
 
53
  if agg:
54
  cols = df.columns
55
  mult = pd.DataFrame({c: (self.policy_count if (c not in agg or agg[c] == 'sum') else 1) for c in cols})
 
56
  extracted_df = self.extract_reps(df)
57
  mult.index = extracted_df.index
58
  return extracted_df.mul(mult)
 
68
  def compare_total(self, df, agg=None):
69
  """Aggregate df by columns"""
70
  if agg:
 
71
  actual_values = {}
72
  for col in df.columns:
73
  if agg.get(col, 'sum') == 'mean':
74
  actual_values[col] = df[col].mean()
75
+ else:
76
  actual_values[col] = df[col].sum()
77
  actual = pd.Series(actual_values)
78
 
 
79
  reps_unscaled = self.extract_reps(df)
80
  estimate_values = {}
81
 
82
  for col in df.columns:
83
  if agg.get(col, 'sum') == 'mean':
 
84
  weighted_sum = (reps_unscaled[col] * self.policy_count).sum()
85
  total_weight = self.policy_count.sum()
86
  estimate_values[col] = weighted_sum / total_weight if total_weight > 0 else 0
87
+ else:
88
  estimate_values[col] = (reps_unscaled[col] * self.policy_count).sum()
 
89
  estimate = pd.Series(estimate_values)
90
+ else:
 
91
  actual = df.sum()
92
  estimate = self.extract_and_scale_reps(df).sum()
93
 
 
94
  error = np.where(actual != 0, estimate / actual - 1, 0)
 
95
  return pd.DataFrame({'actual': actual, 'estimate': estimate, 'error': error})
96
 
97
 
98
  def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
99
+ """Create cashflow comparison plots using Plotly"""
100
  if not cfs_list or not cluster_obj or not titles:
101
  return None
102
  num_plots = len(cfs_list)
103
  if num_plots == 0:
104
  return None
105
 
 
106
  cols = 2
107
  rows = (num_plots + cols - 1) // cols
108
 
109
+ # Use subplot titles from the input 'titles'
110
+ subplot_titles_full = titles[:num_plots] + [""] * (rows * cols - num_plots)
111
+
112
+ fig = make_subplots(
113
+ rows=rows, cols=cols,
114
+ subplot_titles=subplot_titles_full
115
+ )
116
+
117
+ plot_idx = 0
118
+ for i_df, (df, title) in enumerate(zip(cfs_list, titles)): # Use i_df to avoid conflict with internal loop i
119
+ if plot_idx < rows * cols:
120
+ r = plot_idx // cols + 1
121
+ c = plot_idx % cols + 1
122
  comparison = cluster_obj.compare_total(df)
123
+
124
+ fig.add_trace(go.Scatter(x=comparison.index, y=comparison['actual'], name='Actual',
125
+ legendgroup='group1', showlegend=(plot_idx == 0)), row=r, col=c)
126
+ fig.add_trace(go.Scatter(x=comparison.index, y=comparison['estimate'], name='Estimate',
127
+ legendgroup='group2', showlegend=(plot_idx == 0)), row=r, col=c)
128
+
129
+ fig.update_xaxes(title_text='Time', showgrid=True, row=r, col=c)
130
+ fig.update_yaxes(title_text='Value', showgrid=True, row=r, col=c)
131
+ plot_idx += 1
132
 
133
+ # Hide unused subplots by making axes invisible and clearing titles
134
+ for i in range(plot_idx, rows * cols):
135
+ r = i // cols + 1
136
+ c = i % cols + 1
137
+ fig.update_xaxes(visible=False, row=r, col=c)
138
+ fig.update_yaxes(visible=False, row=r, col=c)
139
+ if fig.layout.annotations and i < len(fig.layout.annotations):
140
+ fig.layout.annotations[i].update(text="")
141
+
142
+
143
+ fig_width = 1500
144
+ fig_height = 500 * rows
145
+ fig.update_layout(
146
+ width=fig_width,
147
+ height=fig_height,
148
+ margin=dict(l=60, r=30, t=60, b=60) # Adjusted margins
149
+ )
150
 
151
+ try:
152
+ # Requires kaleido: pip install kaleido
153
+ img_bytes = fig.to_image(format="png", width=fig_width, height=fig_height)
154
+ buf = io.BytesIO(img_bytes)
 
 
 
 
 
 
155
  img = Image.open(buf)
 
156
  return img
157
+ except Exception as e:
158
+ print(f"Error generating cashflow plot image with Plotly/Kaleido: {e}. Ensure Kaleido is installed.")
159
+ # Create a placeholder error image
160
+ error_fig = go.Figure()
161
+ error_fig.add_annotation(text=f"Plot Error: {e}", showarrow=False)
162
+ error_fig.update_layout(width=fig_width, height=fig_height)
163
+ img_bytes = error_fig.to_image(format="png", width=fig_width, height=fig_height)
164
+ return Image.open(io.BytesIO(img_bytes))
165
 
166
+
167
+ def plot_scatter_comparison(df_compare_output, title):
168
+ """Create scatter plot comparison from compare() output using Plotly"""
169
+ fig_width = 1200
170
+ fig_height = 800
171
+
172
+ if df_compare_output is None or df_compare_output.empty:
173
+ fig = go.Figure()
174
+ fig.add_annotation(
175
+ text="No data to display",
176
+ xref="paper", yref="paper", x=0.5, y=0.5, showarrow=False,
177
+ font=dict(size=15)
178
+ )
179
+ fig.update_layout(title_text=title, width=fig_width, height=fig_height)
180
  else:
181
+ if not isinstance(df_compare_output.index, pd.MultiIndex) or df_compare_output.index.nlevels < 2:
182
+ gr.Warning("Scatter plot data is not in the expected multi-index format. Plotting raw actual vs estimate.")
183
+ fig = px.scatter(df_compare_output, x='actual', y='estimate', title=title)
184
+ fig.update_traces(marker=dict(size=5, opacity=0.6)) # Set marker size and opacity
185
+ else:
186
+ df_reset = df_compare_output.reset_index()
187
+ level_1_name = df_compare_output.index.names[1] if df_compare_output.index.names[1] else 'category'
188
+ if level_1_name not in df_reset.columns: # Handle case where level name might not be in columns
189
+ df_reset = df_reset.rename(columns={df_reset.columns[1]: level_1_name})
190
+
191
+
192
+ fig = px.scatter(df_reset, x='actual', y='estimate', color=level_1_name,
193
+ title=title,
194
+ labels={'actual': 'Actual', 'estimate': 'Estimate', level_1_name: level_1_name})
195
+ fig.update_traces(marker=dict(size=5, opacity=0.6)) # Set marker size and opacity
196
+
197
+ num_unique_levels = df_reset[level_1_name].nunique()
198
+ if num_unique_levels == 0 or num_unique_levels > 10:
199
+ fig.update_layout(showlegend=False)
200
+ elif num_unique_levels == 1: # Show legend even for one item if it's named
201
+ fig.update_layout(showlegend=True)
202
+
203
+
204
+ fig.update_xaxes(showgrid=True, title_text='Actual')
205
+ fig.update_yaxes(showgrid=True, title_text='Estimate')
206
+
207
+ # Draw identity line
208
+ if not df_compare_output.empty:
209
+ min_val_actual = df_compare_output['actual'].min()
210
+ max_val_actual = df_compare_output['actual'].max()
211
+ min_val_estimate = df_compare_output['estimate'].min()
212
+ max_val_estimate = df_compare_output['estimate'].max()
213
+
214
+ # Handle cases where min/max might be NaN (e.g. if all data is NaN)
215
+ if pd.isna(min_val_actual) or pd.isna(min_val_estimate) or pd.isna(max_val_actual) or pd.isna(max_val_estimate):
216
+ lims = [0,1] # Default if data is problematic
217
+ else:
218
+ overall_min = min(min_val_actual, min_val_estimate)
219
+ overall_max = max(max_val_actual, max_val_estimate)
220
+ lims = [overall_min, overall_max]
221
+
222
+
223
+ if lims[0] != lims[1]: # Avoid issues if all data is single point or NaN
224
+ fig.add_trace(go.Scatter(
225
+ x=lims, y=lims, mode='lines', name='Identity',
226
+ line=dict(color='red', width=1), # Adjusted width for Plotly
227
+ showlegend=False
228
+ ))
229
+ fig.update_xaxes(range=lims)
230
+ fig.update_yaxes(range=lims, scaleanchor="x", scaleratio=1) # Makes axes square based on data range
231
 
232
+ fig.update_layout(width=fig_width, height=fig_height)
233
+
234
+ try:
235
+ # Requires kaleido: pip install kaleido
236
+ img_bytes = fig.to_image(format="png", width=fig_width, height=fig_height)
237
+ buf = io.BytesIO(img_bytes)
238
+ img = Image.open(buf)
239
+ return img
240
+ except Exception as e:
241
+ print(f"Error generating scatter plot image with Plotly/Kaleido: {e}. Ensure Kaleido is installed.")
242
+ error_fig = go.Figure()
243
+ error_fig.add_annotation(text=f"Plot Error: {e}", showarrow=False)
244
+ error_fig.update_layout(width=fig_width, height=fig_height, title_text=title)
245
+ img_bytes = error_fig.to_image(format="png", width=fig_width, height=fig_height)
246
+ return Image.open(io.BytesIO(img_bytes))
 
 
 
 
 
 
 
 
 
 
 
 
247
 
248
 
249
  def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
250
  policy_data_path, pv_base_path, pv_lapse_path, pv_mort_path):
251
  """Main processing function - now accepts file paths"""
252
  try:
 
253
  cfs = pd.read_excel(cashflow_base_path, index_col=0)
254
  cfs_lapse50 = pd.read_excel(cashflow_lapse_path, index_col=0)
255
  cfs_mort15 = pd.read_excel(cashflow_mort_path, index_col=0)
256
 
257
  pol_data_full = pd.read_excel(policy_data_path, index_col=0)
 
258
  required_cols = ['age_at_entry', 'policy_term', 'sum_assured', 'duration_mth']
259
  if all(col in pol_data_full.columns for col in required_cols):
260
  pol_data = pol_data_full[required_cols]
261
  else:
262
  gr.Warning(f"Policy data might be missing required columns. Found: {pol_data_full.columns.tolist()}")
263
+ pol_data = pol_data_full
264
 
265
  pvs = pd.read_excel(pv_base_path, index_col=0)
266
  pvs_lapse50 = pd.read_excel(pv_lapse_path, index_col=0)
 
270
  scen_titles = ['Base', 'Lapse+50%', 'Mort+15%']
271
 
272
  results = {}
 
273
  mean_attrs = {'age_at_entry':'mean', 'policy_term':'mean', 'duration_mth':'mean', 'sum_assured': 'sum'}
274
 
275
  # --- 1. Cashflow Calibration ---
276
  cluster_cfs = Clusters(cfs)
 
277
  results['cf_total_base_table'] = cluster_cfs.compare_total(cfs)
278
  results['cf_policy_attrs_total'] = cluster_cfs.compare_total(pol_data, agg=mean_attrs)
 
279
  results['cf_pv_total_base'] = cluster_cfs.compare_total(pvs)
280
  results['cf_pv_total_lapse'] = cluster_cfs.compare_total(pvs_lapse50)
281
  results['cf_pv_total_mort'] = cluster_cfs.compare_total(pvs_mort15)
 
282
  results['cf_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_cfs, scen_titles)
283
  results['cf_scatter_cashflows_base'] = plot_scatter_comparison(cluster_cfs.compare(cfs), 'Cashflow Calib. - Cashflows (Base)')
284
 
285
  # --- 2. Policy Attribute Calibration ---
286
+ if not pol_data.empty and (pol_data.max(numeric_only=True) - pol_data.min(numeric_only=True)).all() != 0:
287
+ loc_vars_attrs = (pol_data - pol_data.min(numeric_only=True)) / (pol_data.max(numeric_only=True) - pol_data.min(numeric_only=True))
288
+ loc_vars_attrs = loc_vars_attrs.fillna(0) # Fill NaNs that may result from division by zero if a column has no variance
289
  else:
290
  gr.Warning("Policy data for attribute calibration is empty or has no variance. Skipping attribute calibration plots.")
291
+ loc_vars_attrs = pol_data.copy() # Use a copy
292
 
293
+ if not loc_vars_attrs.empty and pd.api.types.is_numeric_dtype(loc_vars_attrs.values): # Check if data is numeric for KMeans
294
+ try:
295
+ cluster_attrs = Clusters(loc_vars_attrs)
296
+ results['attr_total_cf_base'] = cluster_attrs.compare_total(cfs)
297
+ results['attr_policy_attrs_total'] = cluster_attrs.compare_total(pol_data, agg=mean_attrs)
298
+ results['attr_total_pv_base'] = cluster_attrs.compare_total(pvs)
299
+ results['attr_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_attrs, scen_titles)
300
+ results['attr_scatter_cashflows_base'] = plot_scatter_comparison(cluster_attrs.compare(cfs), 'Policy Attr. Calib. - Cashflows (Base)')
301
+ except Exception as e_attr_clust: # Catch errors during clustering (e.g. if data is not suitable)
302
+ gr.Error(f"Error during policy attribute clustering: {e_attr_clust}")
303
+ results['attr_total_cf_base'] = pd.DataFrame()
304
+ results['attr_policy_attrs_total'] = pd.DataFrame()
305
+ results['attr_total_pv_base'] = pd.DataFrame()
306
+ results['attr_cashflow_plot'] = None
307
+ results['attr_scatter_cashflows_base'] = None
308
  else:
309
+ gr.Warning("Skipping attribute calibration as data is empty or non-numeric after processing.")
310
  results['attr_total_cf_base'] = pd.DataFrame()
311
  results['attr_policy_attrs_total'] = pd.DataFrame()
312
  results['attr_total_pv_base'] = pd.DataFrame()
 
316
 
317
  # --- 3. Present Value Calibration ---
318
  cluster_pvs = Clusters(pvs)
 
319
  results['pv_total_cf_base'] = cluster_pvs.compare_total(cfs)
320
  results['pv_policy_attrs_total'] = cluster_pvs.compare_total(pol_data, agg=mean_attrs)
 
321
  results['pv_total_pv_base'] = cluster_pvs.compare_total(pvs)
322
  results['pv_total_pv_lapse'] = cluster_pvs.compare_total(pvs_lapse50)
323
  results['pv_total_pv_mort'] = cluster_pvs.compare_total(pvs_mort15)
 
324
  results['pv_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_pvs, scen_titles)
325
  results['pv_scatter_pvs_base'] = plot_scatter_comparison(cluster_pvs.compare(pvs), 'PV Calib. - PVs (Base)')
326
 
327
  # --- Summary Comparison Plot Data ---
 
 
328
  error_data = {}
 
 
329
  def get_error_safe(compare_result, col_name=None):
330
  if compare_result.empty:
331
  return np.nan
332
  if col_name and col_name in compare_result.index:
333
  return abs(compare_result.loc[col_name, 'error'])
334
  else:
 
335
  return abs(compare_result['error']).mean()
336
 
 
337
  key_pv_col = None
338
+ for potential_col in ['PV_NetCF', 'pv_net_cf', 'net_cf_pv', 'PV_Net_CF']:
339
  if potential_col in pvs.columns:
340
  key_pv_col = potential_col
341
  break
342
 
 
343
  error_data['CF Calib.'] = [
344
  get_error_safe(cluster_cfs.compare_total(pvs), key_pv_col),
345
  get_error_safe(cluster_cfs.compare_total(pvs_lapse50), key_pv_col),
346
  get_error_safe(cluster_cfs.compare_total(pvs_mort15), key_pv_col)
347
  ]
348
+ if results.get('attr_total_pv_base') is not None and not results['attr_total_pv_base'].empty : # Check if Attr Calib was successful
 
 
349
  error_data['Attr Calib.'] = [
350
  get_error_safe(cluster_attrs.compare_total(pvs), key_pv_col),
351
  get_error_safe(cluster_attrs.compare_total(pvs_lapse50), key_pv_col),
 
354
  else:
355
  error_data['Attr Calib.'] = [np.nan, np.nan, np.nan]
356
 
 
 
357
  error_data['PV Calib.'] = [
358
  get_error_safe(cluster_pvs.compare_total(pvs), key_pv_col),
359
  get_error_safe(cluster_pvs.compare_total(pvs_lapse50), key_pv_col),
360
  get_error_safe(cluster_pvs.compare_total(pvs_mort15), key_pv_col)
361
  ]
362
 
 
363
  summary_df = pd.DataFrame(error_data, index=['Base', 'Lapse+50%', 'Mort+15%'])
 
 
 
 
364
  title_suffix = f' ({key_pv_col})' if key_pv_col else ' (Mean Absolute Error)'
365
+ plot_title = f'Calibration Method Comparison - Error in Total PV{title_suffix}'
366
+ fig_width = 1000
367
+ fig_height = 600
368
+
369
+ summary_df_melted = summary_df.reset_index().melt(id_vars='index', var_name='Calibration Method', value_name='Absolute Error Rate')
370
+ summary_df_melted.rename(columns={'index': 'Scenario'}, inplace=True)
371
+
372
+
373
+ fig_summary = px.bar(
374
+ summary_df_melted,
375
+ x='Scenario',
376
+ y='Absolute Error Rate',
377
+ color='Calibration Method',
378
+ barmode='group',
379
+ title=plot_title
380
+ )
381
+ fig_summary.update_layout(
382
+ width=fig_width, height=fig_height,
383
+ xaxis_tickangle=0,
384
+ yaxis_title='Absolute Error Rate',
385
+ legend_title_text='Calibration Method'
386
+ )
387
+ fig_summary.update_yaxes(showgrid=True)
388
+
389
+ try:
390
+ # Requires kaleido: pip install kaleido
391
+ buf_summary_bytes = fig_summary.to_image(format="png", width=fig_width, height=fig_height)
392
+ buf_summary = io.BytesIO(buf_summary_bytes)
393
+ results['summary_plot'] = Image.open(buf_summary)
394
+ except Exception as e:
395
+ print(f"Error generating summary plot image with Plotly/Kaleido: {e}. Ensure Kaleido is installed.")
396
+ error_fig = go.Figure()
397
+ error_fig.add_annotation(text=f"Plot Error: {e}", showarrow=False)
398
+ error_fig.update_layout(width=fig_width, height=fig_height, title_text=plot_title)
399
+ img_bytes = error_fig.to_image(format="png", width=fig_width, height=fig_height)
400
+ results['summary_plot'] = Image.open(io.BytesIO(img_bytes))
401
+
402
  return results
403
 
404
  except FileNotFoundError as e:
 
409
  return {"error": f"Missing column: {e}"}
410
  except Exception as e:
411
  gr.Error(f"Error processing files: {str(e)}")
412
+ # Optionally log the full traceback for debugging
413
+ import traceback
414
+ traceback.print_exc()
415
  return {"error": f"Error processing files: {str(e)}"}
416
 
417
 
 
431
  - Present Values - Base Scenario
432
  - Present Values - Lapse Stress
433
  - Present Values - Mortality Stress
434
+
435
+ **Note:** Plot generation uses Plotly and Kaleido. If plots appear as errors, ensure Kaleido is installed (`pip install kaleido`).
436
  """)
437
 
438
  with gr.Row():
439
  with gr.Column(scale=1):
440
  gr.Markdown("### Upload Files or Load Examples")
 
441
  load_example_btn = gr.Button("Load Example Data")
 
442
  with gr.Row():
443
  cashflow_base_input = gr.File(label="Cashflows - Base", file_types=[".xlsx"])
444
  cashflow_lapse_input = gr.File(label="Cashflows - Lapse Stress", file_types=[".xlsx"])
 
449
  pv_lapse_input = gr.File(label="Present Values - Lapse Stress", file_types=[".xlsx"])
450
  with gr.Row():
451
  pv_mort_input = gr.File(label="Present Values - Mortality Stress", file_types=[".xlsx"])
 
452
  analyze_btn = gr.Button("Analyze Dataset", variant="primary", size="lg")
453
 
454
  with gr.Tabs():
455
  with gr.TabItem("📊 Summary"):
456
+ summary_plot_output = gr.Image(label="Calibration Methods Comparison") # Stays as gr.Image
457
 
458
  with gr.TabItem("💸 Cashflow Calibration"):
459
  gr.Markdown("### Results: Using Annual Cashflows as Calibration Variables")
 
478
  with gr.Accordion("Present Value Comparisons (Total)", open=False):
479
  attr_total_pv_base_out = gr.Dataframe(label="PVs - Base Scenario Total")
480
 
 
481
  with gr.TabItem("💰 Present Value Calibration"):
482
  gr.Markdown("### Results: Using Present Values (Base Scenario) as Calibration Variables")
483
  with gr.Row():
 
491
  pv_total_pv_lapse_out = gr.Dataframe(label="PVs - Lapse Stress Total")
492
  pv_total_pv_mort_out = gr.Dataframe(label="PVs - Mortality Stress Total")
493
 
 
494
def get_all_output_components():
    """Collect every result widget, in the exact order handle_analysis emits them."""
    summary = [summary_plot_output]
    cashflow_tab = [
        cf_total_base_table_out, cf_policy_attrs_total_out,
        cf_cashflow_plot_out, cf_scatter_cashflows_base_out,
        cf_pv_total_base_out, cf_pv_total_lapse_out, cf_pv_total_mort_out,
    ]
    attribute_tab = [
        attr_total_cf_base_out, attr_policy_attrs_total_out,
        attr_cashflow_plot_out, attr_scatter_cashflows_base_out, attr_total_pv_base_out,
    ]
    pv_tab = [
        pv_total_cf_base_out, pv_policy_attrs_total_out,
        pv_cashflow_plot_out, pv_scatter_pvs_base_out,
        pv_total_pv_base_out, pv_total_pv_lapse_out, pv_total_pv_mort_out,
    ]
    # Order matters: it must line up with the list returned by handle_analysis.
    return summary + cashflow_tab + attribute_tab + pv_tab
506
 
 
507
  def handle_analysis(f1, f2, f3, f4, f5, f6, f7):
508
  files = [f1, f2, f3, f4, f5, f6, f7]
 
509
  file_paths = []
510
  for i, f_obj in enumerate(files):
511
  if f_obj is None:
512
  gr.Error(f"Missing file input for argument {i+1}. Please upload all files or load examples.")
513
  return [None] * len(get_all_output_components())
 
 
514
  if hasattr(f_obj, 'name') and isinstance(f_obj.name, str):
515
  file_paths.append(f_obj.name)
 
516
  elif isinstance(f_obj, str):
517
  file_paths.append(f_obj)
518
  else:
519
  gr.Error(f"Invalid file input for argument {i+1}. Type: {type(f_obj)}")
520
  return [None] * len(get_all_output_components())
521
 
 
522
  results = process_files(*file_paths)
523
 
524
+ if "error" in results:
525
  return [None] * len(get_all_output_components())
526
 
527
  return [
528
  results.get('summary_plot'),
 
529
  results.get('cf_total_base_table'), results.get('cf_policy_attrs_total'),
530
  results.get('cf_cashflow_plot'), results.get('cf_scatter_cashflows_base'),
531
  results.get('cf_pv_total_base'), results.get('cf_pv_total_lapse'), results.get('cf_pv_total_mort'),
 
532
  results.get('attr_total_cf_base'), results.get('attr_policy_attrs_total'),
533
  results.get('attr_cashflow_plot'), results.get('attr_scatter_cashflows_base'), results.get('attr_total_pv_base'),
 
534
  results.get('pv_total_cf_base'), results.get('pv_policy_attrs_total'),
535
  results.get('pv_cashflow_plot'), results.get('pv_scatter_pvs_base'),
536
  results.get('pv_total_pv_base'), results.get('pv_total_pv_lapse'), results.get('pv_total_pv_mort')
 
543
  outputs=get_all_output_components()
544
  )
545
 
 
546
  def load_example_files():
547
  missing_files = [fp for fp in EXAMPLE_FILES.values() if not os.path.exists(fp)]
548
  if missing_files:
549
  gr.Error(f"Missing example data files in '{EXAMPLE_DATA_DIR}': {', '.join(missing_files)}. Please ensure they exist.")
550
+ return [None] * 7
551
 
552
  gr.Info("Example data paths loaded. Click 'Analyze Dataset'.")
553
  return [