Spaces:

alidenewade
/

model-point-clustering

Sleeping

App Files Files Community

alidenewade committed on May 23, 2025

Commit

e82ad24

verified ·

1 Parent(s): 46a2e7c

Update app.py

Browse files

Files changed (1) hide show

app.py +545 -583

app.py CHANGED Viewed

@@ -2,608 +2,570 @@ import gradio as gr
 import numpy as np
 import pandas as pd
 from sklearn.cluster import KMeans
-from sklearn.metrics import pairwise_distances_argmin_min
 import matplotlib.pyplot as plt
 import matplotlib.cm
 import io
-import os
 from PIL import Image
 # Relative directory that holds the bundled example workbooks.
 EXAMPLE_DATA_DIR = "eg_data"
 # Logical input name -> path of the example workbook for that input.
 EXAMPLE_FILES = {
     input_key: os.path.join(EXAMPLE_DATA_DIR, workbook_name)
     for input_key, workbook_name in (
         ("cashflow_base", "cashflows_seriatim_10K.xlsx"),
         ("cashflow_lapse", "cashflows_seriatim_10K_lapse50.xlsx"),
         ("cashflow_mort", "cashflows_seriatim_10K_mort15.xlsx"),
         ("policy_data", "model_point_table.xlsx"),
         ("pv_base", "pv_seriatim_10K.xlsx"),
         ("pv_lapse", "pv_seriatim_10K_lapse50.xlsx"),
         ("pv_mort", "pv_seriatim_10K_mort15.xlsx"),
     )
 }
 class Clusters:
     """K-means clusters over a set of policies, with one representative per cluster.

     Attributes assigned by ``__init__``:

     * ``kmeans`` - the fitted ``KMeans`` estimator; ``kmeans.labels_`` is used
       as the cluster assignment of every row, matched to rows by position.
     * ``rep_ids`` - ``pd.Series`` named ``policy_id`` and indexed by
       ``cluster_id``: the policy chosen to represent each cluster.
     * ``policy_count`` - ``pd.Series`` indexed by ``cluster_id`` holding the
       number of rows assigned to each cluster.
     """

     def __init__(self, loc_vars):
         """Fit the clusters and pick the representative policy of each one.

         Args:
             loc_vars: DataFrame of location variables, one row per policy.
                 Its index labels are used as the policy ids.

         Raises:
             ValueError: if ``loc_vars`` is empty or contains only NaN.
         """
         if loc_vars.empty:
             raise ValueError("Input data for KMeans (loc_vars) is empty.")
         if loc_vars.isnull().all().all():
             raise ValueError("Input data for KMeans (loc_vars) contains all NaN values.")

         points = np.ascontiguousarray(loc_vars)
         # Never ask for more clusters than there are rows.
         n_clusters_to_use = min(1000, len(loc_vars))
         self.kmeans = KMeans(n_clusters=n_clusters_to_use, random_state=0, n_init=10).fit(points)

         # Position of the row closest to each cluster centre.
         closest, _ = pairwise_distances_argmin_min(self.kmeans.cluster_centers_, points)
         # Translate positions into the actual index labels instead of assuming
         # that policy ids are exactly 1..N in row order.
         rep_ids = pd.Series(loc_vars.index.to_numpy()[closest], name='policy_id')
         rep_ids.index.name = 'cluster_id'
         self.rep_ids = rep_ids

         ones = pd.DataFrame({'policy_count': np.ones(len(loc_vars), dtype=int)})
         self.policy_count = self.agg_by_cluster(ones)['policy_count']

     @staticmethod
     def _stack_keep_na(frame):
         """Flatten *frame* row by row into a Series indexed by (row, column).

         Unlike ``DataFrame.stack`` no entry is dropped, and the result does not
         depend on which ``stack`` implementation the installed pandas uses.
         """
         index = pd.MultiIndex.from_product(
             [frame.index, frame.columns],
             names=[frame.index.name, frame.columns.name],
         )
         return pd.Series(frame.to_numpy().ravel(), index=index)

     def agg_by_cluster(self, df, agg=None):
         """Aggregate the numeric columns of *df* within each cluster.

         Args:
             df: DataFrame whose rows line up by position with the rows the
                 clusters were fitted on.
             agg: optional dict mapping column name to aggregation name;
                 columns not listed (or every column when *agg* is not a dict)
                 are summed.

         Returns:
             DataFrame indexed by ``cluster_id``. It has no columns when *df*
             has no numeric column, and is completely empty when *df* has more
             rows than there are cluster labels.
         """
         labels = self.kmeans.labels_
         if len(labels) != len(df):
             gr.Warning(f"Length mismatch in agg_by_cluster: kmeans.labels_ ({len(labels)}) vs df ({len(df)}).")
             if len(labels) < len(df):
                 # There is no label for the extra rows, so nothing can be aggregated.
                 return pd.DataFrame()
             # Best effort: treat df as the leading rows of the fitted data.
             labels = labels[:len(df)]

         temp = df.copy()
         temp['cluster_id'] = labels
         temp = temp.set_index('cluster_id')

         agg_map = agg if isinstance(agg, dict) else {}
         agg_ops = {
             col: agg_map.get(col, 'sum')
             for col in temp.columns
             if pd.api.types.is_numeric_dtype(temp[col])
         }
         if not agg_ops:
             return pd.DataFrame(index=temp.index.unique())
         return temp.groupby(level='cluster_id').agg(agg_ops)

     def extract_reps(self, df):
         """Return the row of *df* belonging to each cluster's representative.

         Args:
             df: DataFrame carrying the policy id either as a ``policy_id``
                 column or as its (single-level) index.

         Returns:
             DataFrame indexed by ``cluster_id`` with the columns of *df*.
             Representatives that are missing from *df* yield a row of NaN.

         Raises:
             KeyError: if no policy id can be located in *df*.
         """
         if self.rep_ids.empty:
             gr.Warning("Representative IDs (rep_ids) are empty in extract_reps.")
             return pd.DataFrame(columns=df.columns).rename_axis('cluster_id')

         df_reset = df.reset_index()
         if 'policy_id' not in df_reset.columns:
             # reset_index() names the former index after itself, or 'index'
             # when it had no name; adopt that column as the policy id.
             index_col = df.index.name if df.index.name else 'index'
             if index_col not in df_reset.columns:
                 raise KeyError(
                     f"Could not find 'policy_id' from index '{index_col}' for merging in extract_reps."
                 )
             df_reset = df_reset.rename(columns={index_col: 'policy_id'})

         # A column merge discards the index of both sides, so cluster_id has to
         # travel through the merge as an ordinary column.
         reps = self.rep_ids.rename('policy_id').rename_axis('cluster_id').reset_index()
         merged = reps.merge(df_reset, how='left', on='policy_id')
         return merged.set_index('cluster_id').drop(columns='policy_id')

     def extract_and_scale_reps(self, df, agg=None):
         """Return representative rows scaled up to stand for their whole cluster.

         Numeric columns aggregated by ``'sum'`` (the default for columns not
         named in *agg*) are multiplied by the cluster's policy count; all other
         columns keep the representative's own value.
         """
         reps = self.extract_reps(df)
         if reps.empty:
             return reps
         scaled = reps.copy()
         if self.policy_count.empty:
             gr.Warning("Policy count is empty in extract_and_scale_reps. Not scaling.")
             return scaled

         weights = self.policy_count.reindex(reps.index).fillna(0)
         agg_map = agg if isinstance(agg, dict) else {}
         for col in reps.columns:
             if pd.api.types.is_numeric_dtype(reps[col]) and agg_map.get(col, 'sum') == 'sum':
                 scaled[col] = reps[col].mul(weights, axis=0)
         return scaled

     def compare(self, df, agg=None):
         """Compare per-cluster actuals with estimates built from representatives.

         Returns:
             DataFrame with columns ``actual`` and ``estimate`` and one row per
             (cluster_id, column of *df*) pair.
         """
         source = self.agg_by_cluster(df, agg)
         # Scale with the same default ('sum') that agg_by_cluster applies, so
         # both sides of the comparison treat every column the same way.
         target = self.extract_and_scale_reps(df, agg)

         if source.empty and target.empty:
             return pd.DataFrame(columns=['actual', 'estimate'])
         if source.empty:
             source = pd.DataFrame(index=target.index, columns=target.columns)
         if target.empty:
             target = pd.DataFrame(index=source.index, columns=source.columns)

         # Outer-align on both clusters and columns before flattening.
         source, target = source.align(target, join='outer')
         return pd.DataFrame({
             'actual': self._stack_keep_na(source),
             'estimate': self._stack_keep_na(target),
         })

     def compare_total(self, df, agg=None):
         """Compare portfolio totals with totals estimated from representatives.

         Args:
             df: per-policy DataFrame.
             agg: optional dict mapping column name to ``'sum'`` or ``'mean'``;
                 columns not listed are summed. A ``'mean'`` estimate is the
                 policy-count-weighted mean of the representatives. Any other
                 aggregation has no estimate (NaN).

         Returns:
             DataFrame indexed by column name with columns ``actual``,
             ``estimate`` and ``error`` (``estimate / actual - 1``). The error
             is 0 when both values are 0 and NaN when it is undefined.
         """
         no_result = pd.DataFrame(columns=['actual', 'estimate', 'error'])
         if df.empty:
             return no_result

         agg_map = agg if isinstance(agg, dict) else {}
         ops = {col: agg_map.get(col, 'sum') for col in df.select_dtypes(include=np.number).columns}
         if not ops:
             return no_result
         actual = df.agg(ops).dropna()
         if actual.empty:
             return no_result

         estimate = pd.Series(np.nan, index=actual.index, dtype=float)
         reps = self.extract_reps(df)
         if not reps.empty and not self.policy_count.empty:
             weights = self.policy_count.reindex(reps.index).fillna(0)
             total_weight = weights.sum()
             for col in actual.index:
                 if col not in reps.columns or not pd.api.types.is_numeric_dtype(reps[col]):
                     continue
                 weighted_sum = (reps[col] * weights).sum()
                 if ops[col] == 'sum':
                     estimate.loc[col] = weighted_sum
                 elif ops[col] == 'mean' and total_weight != 0:
                     estimate.loc[col] = weighted_sum / total_weight

         error = pd.Series(np.nan, index=actual.index, dtype=float)
         nonzero = actual != 0
         error[nonzero] = estimate[nonzero] / actual[nonzero] - 1
         error[~nonzero & (estimate == 0)] = 0.0
         # Infinite ratios are reported as "undefined" rather than as inf.
         error = error.replace([np.inf, -np.inf], np.nan)
         return pd.DataFrame({'actual': actual, 'estimate': estimate, 'error': error})
 def _figure_to_image(fig, dpi=None):
     """Render *fig* to an in-memory PNG, close the figure, return a PIL image."""
     save_kwargs = {'format': 'png'}
     if dpi is not None:
         save_kwargs['dpi'] = dpi
     buf = io.BytesIO()
     try:
         # Save through the figure itself rather than pyplot's "current figure".
         fig.savefig(buf, **save_kwargs)
     finally:
         plt.close(fig)
     buf.seek(0)
     img = Image.open(buf)
     # Image.open only reads the header; decode now so the image is complete.
     img.load()
     return img


 def _message_image(message, title=None, figsize=None, pad=None, dpi=None):
     """Return an image of an otherwise empty figure showing *message* centred."""
     fig, ax = plt.subplots(figsize=figsize)
     ax.text(0.5, 0.5, message, ha='center', va='center')
     if title is not None:
         ax.set_title(title)
     if pad is not None:
         fig.tight_layout(pad=pad)
     return _figure_to_image(fig, dpi=dpi)


 def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
     """Plot actual vs. estimated totals for each cashflow table.

     Args:
         cfs_list: list of cashflow DataFrames, one per scenario.
         cluster_obj: object providing ``compare_total(df)``; ``None`` yields a
             placeholder image.
         titles: subplot titles, paired with ``cfs_list`` by position.

     Returns:
         PIL image with one subplot per scenario (at most two per row), or a
         placeholder image when nothing could be plotted.
     """
     if not cfs_list or cluster_obj is None or not titles:
         return _message_image("No data/cluster for cashflow plot.")

     num_plots = len(cfs_list)
     cols = min(2, num_plots)
     rows = (num_plots + cols - 1) // cols  # ceiling division
     fig, axes = plt.subplots(rows, cols, figsize=(7.5 * cols, 5 * rows), squeeze=False)
     axes = axes.flatten()

     plot_made = False
     for ax_curr, df_cf, title in zip(axes, cfs_list, titles):
         ax_curr.set_title(title)
         if df_cf is None or df_cf.empty:
             ax_curr.text(0.5, 0.5, f"No data for\n{title}", ha='center', va='center', wrap=True)
             continue
         try:
             comparison = cluster_obj.compare_total(df_cf)
             if comparison.empty or 'actual' not in comparison.columns or 'estimate' not in comparison.columns:
                 ax_curr.text(0.5, 0.5, f"Comparison failed\nfor {title}", ha='center', va='center', wrap=True)
                 continue
             plot_df = comparison[['actual', 'estimate']].dropna(how='all')
             if plot_df.empty:
                 ax_curr.text(0.5, 0.5, f"No comparable data\nfor {title}", ha='center', va='center', wrap=True)
                 continue
             plot_df.plot(ax=ax_curr, grid=True)
             ax_curr.set_xlabel('Time Period')
             ax_curr.set_ylabel('Cashflow Value')
             plot_made = True
         except Exception as e:
             # Best effort: one failing scenario is reported inside its own
             # subplot instead of aborting the whole image.
             ax_curr.text(0.5, 0.5, f"Error plotting {title}:\n{str(e)[:50]}...", ha='center', va='center', wrap=True)

     # Drop the grid cell left over when the number of plots is odd.
     for unused_ax in axes[num_plots:]:
         fig.delaxes(unused_ax)

     if not plot_made:
         plt.close(fig)
         return _message_image("No cashflow plots generated.", pad=2.0, dpi=90)

     fig.tight_layout(pad=2.0)
     return _figure_to_image(fig, dpi=90)


 def plot_scatter_comparison(df_compare_output, title):
     """Scatter estimated against actual values, with an identity line.

     Args:
         df_compare_output: DataFrame with ``actual`` and ``estimate`` columns.
             When its index is a MultiIndex, points are coloured by the values
             of the second index level.
         title: plot title.

     Returns:
         PIL image of the scatter plot, or a placeholder image when there is
         no data.
     """
     if df_compare_output is None or df_compare_output.empty:
         return _message_image("No data for scatter plot.", title=title, figsize=(8, 5))

     fig, ax = plt.subplots(figsize=(8, 5))
     ax.set_title(title, fontsize='medium')

     def scatter_everything():
         # Single-colour fallback that ignores any index structure.
         ax.scatter(
             df_compare_output.get('actual', pd.Series(dtype=float)),
             df_compare_output.get('estimate', pd.Series(dtype=float)),
             s=9, alpha=0.6,
         )

     index = df_compare_output.index
     if not isinstance(index, pd.MultiIndex) or index.nlevels < 2:
         scatter_everything()
     else:
         try:
             unique_levels = index.get_level_values(1).unique()
             if len(unique_levels) == 0:
                 ax.text(0.5, 0.5, "No data points for scatter.", ha='center', va='center')
             else:
                 colors = matplotlib.cm.rainbow(np.linspace(0, 1, len(unique_levels)))
                 for item_level, color_val in zip(unique_levels, colors):
                     try:
                         subset = df_compare_output.xs(item_level, level=1)
                     except KeyError:
                         continue
                     if not subset.empty:
                         ax.scatter(subset['actual'], subset['estimate'], color=color_val,
                                    s=9, alpha=0.6, label=str(item_level))
                 # A legend is only readable for a handful of series.
                 if 1 < len(unique_levels) <= 10:
                     ax.legend(title=str(index.names[1]), fontsize='small')
         except IndexError:
             scatter_everything()
             gr.Warning("Could not process levels for scatter plot, showing raw data.")

     ax.set_xlabel('Actual Value')
     ax.set_ylabel('Estimated Value')
     ax.grid(True, linestyle='--', alpha=0.7)

     # Identity line: points on it are estimated exactly.
     try:
         x_lo, x_hi = ax.get_xlim()
         y_lo, y_hi = ax.get_ylim()
         if np.isfinite([x_lo, x_hi, y_lo, y_hi]).all() and x_lo < x_hi and y_lo < y_hi:
             lims = [min(x_lo, y_lo), max(x_hi, y_hi)]
             ax.plot(lims, lims, 'r-', linewidth=1, alpha=0.8, dashes=(3, 3))
             ax.set_xlim(lims)
             ax.set_ylim(lims)
     except (TypeError, ValueError) as e:
         # The line is decoration only; report the problem and keep the plot.
         gr.Warning(f"Could not draw identity line for scatter plot: {e}")

     fig.tight_layout(pad=1.5)
     return _figure_to_image(fig, dpi=90)
 def _abs_total_pv_error(cluster_obj, pv_df, pv_col_name):
     """Return the absolute total error of *pv_col_name* for one scenario.

     Falls back to the mean error over all columns when the column is absent,
     and to NaN when there is no cluster object or nothing to compare.
     """
     if cluster_obj is None or pv_df.empty:
         return np.nan
     comp_df = cluster_obj.compare_total(pv_df)
     if comp_df.empty:
         return np.nan
     if pv_col_name in comp_df.index:
         return abs(comp_df.loc[pv_col_name, 'error'])
     if 'error' in comp_df.columns:
         return abs(comp_df['error'].mean())
     return np.nan


 def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
                   policy_data_path, pv_base_path, pv_lapse_path, pv_mort_path):
     """Run the three calibrations on the given workbooks and collect the results.

     Each argument is the path of an Excel workbook whose first column is used
     as the index. Clusters are fitted in turn on the base cashflows, on the
     min-max scaled policy attributes and on the base present values.

     Returns:
         dict of result tables (DataFrames rounded to 2 decimals) and plots
         (PIL images) keyed ``cf_*``, ``attr_*``, ``pv_*`` and
         ``summary_plot``. On failure the user is notified and
         ``{"error": <message>}`` is returned instead; no exception is raised.
     """
     import traceback  # only needed by the last-resort handler below

     results = {}
     try:
         cfs = pd.read_excel(cashflow_base_path, index_col=0)
         cfs_lapse50 = pd.read_excel(cashflow_lapse_path, index_col=0)
         cfs_mort15 = pd.read_excel(cashflow_mort_path, index_col=0)
         pol_data_full = pd.read_excel(policy_data_path, index_col=0)

         required_policy_cols = ['age_at_entry', 'policy_term', 'sum_assured', 'duration_mth']
         missing_policy_cols = [col for col in required_policy_cols if col not in pol_data_full.columns]
         if missing_policy_cols:
             gr.Warning(f"Policy data missing: {', '.join(missing_policy_cols)}.")
             pol_data = pol_data_full
         else:
             pol_data = pol_data_full[required_policy_cols]

         pvs = pd.read_excel(pv_base_path, index_col=0)
         pvs_lapse50 = pd.read_excel(pv_lapse_path, index_col=0)
         pvs_mort15 = pd.read_excel(pv_mort_path, index_col=0)

         cfs_list = [cfs, cfs_lapse50, cfs_mort15]
         pvs_list = [pvs, pvs_lapse50, pvs_mort15]
         scen_titles = ['Base', 'Lapse+50%', 'Mort+15%']
         mean_attrs_agg = {'age_at_entry': 'mean', 'policy_term': 'mean', 'duration_mth': 'mean', 'sum_assured': 'sum'}

         gr.Info("Processing calibrations...")
         cluster_cfs = cluster_attrs = cluster_pvs = None

         # --- Calibration on base cashflows ---
         if cfs.empty:
             gr.Warning("Base cashflow data is empty. CF Calib. might be affected or skipped.")
         else:
             cluster_cfs = Clusters(cfs)
         if cluster_cfs:
             results['cf_total_base_table'] = cluster_cfs.compare_total(cfs)
             results['cf_policy_attrs_total'] = cluster_cfs.compare_total(pol_data, agg=mean_attrs_agg)
             results['cf_pv_total_base'] = cluster_cfs.compare_total(pvs)
             results['cf_pv_total_lapse'] = cluster_cfs.compare_total(pvs_lapse50)
             results['cf_pv_total_mort'] = cluster_cfs.compare_total(pvs_mort15)
             results['cf_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_cfs, scen_titles)
             results['cf_scatter_cashflows_base'] = plot_scatter_comparison(cluster_cfs.compare(cfs), 'CF Calib. - Cashflows (Base)')

         # --- Calibration on policy attributes ---
         if pol_data.empty:
             gr.Warning("Policy data is empty. Attr Calib. skipped.")
         else:
             # Only numeric columns can be scaled; when required columns are
             # missing the full table is used and may contain other dtypes.
             numeric_pol = pol_data.select_dtypes(include=np.number)
             if numeric_pol.empty:
                 gr.Warning("Policy data has no numeric columns. Attr Calib. skipped.")
             else:
                 pol_min = numeric_pol.min()
                 # A constant column has range 0; divide by 1 instead.
                 pol_range = (numeric_pol.max() - pol_min).replace(0, 1)
                 loc_vars_attrs = ((numeric_pol - pol_min) / pol_range).fillna(0)  # min-max scaling
                 cluster_attrs = Clusters(loc_vars_attrs)
         if cluster_attrs:
             results['attr_total_cf_base'] = cluster_attrs.compare_total(cfs)
             results['attr_policy_attrs_total'] = cluster_attrs.compare_total(pol_data, agg=mean_attrs_agg)
             results['attr_total_pv_base'] = cluster_attrs.compare_total(pvs)
             results['attr_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_attrs, scen_titles)
             results['attr_scatter_cashflows_base'] = plot_scatter_comparison(cluster_attrs.compare(cfs), 'Attr Calib. - Cashflows (Base)')

         # --- Calibration on base present values ---
         if pvs.empty:
             gr.Warning("Base PV data is empty. PV Calib. might be affected or skipped.")
         else:
             cluster_pvs = Clusters(pvs)
         if cluster_pvs:
             results['pv_total_cf_base'] = cluster_pvs.compare_total(cfs)
             results['pv_policy_attrs_total'] = cluster_pvs.compare_total(pol_data, agg=mean_attrs_agg)
             results['pv_total_pv_base'] = cluster_pvs.compare_total(pvs)
             results['pv_total_pv_lapse'] = cluster_pvs.compare_total(pvs_lapse50)
             results['pv_total_pv_mort'] = cluster_pvs.compare_total(pvs_mort15)
             results['pv_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_pvs, scen_titles)
             results['pv_scatter_pvs_base'] = plot_scatter_comparison(cluster_pvs.compare(pvs), 'PV Calib. - PVs (Base)')

         # --- Summary: absolute PV error per calibration method and scenario ---
         gr.Info("Generating Summary Plot...")
         pv_col_name = 'PV_NetCF'
         calibration_objects_for_summary = [
             ("CF Calib.", cluster_cfs), ("Attr Calib.", cluster_attrs), ("PV Calib.", cluster_pvs)
         ]
         error_data = {
             calib_name: [_abs_total_pv_error(cl_obj, pv_df, pv_col_name) for pv_df in pvs_list]
             for calib_name, cl_obj in calibration_objects_for_summary
         }
         summary_df = pd.DataFrame(error_data, index=scen_titles).round(4)

         fig_summary, ax_summary = plt.subplots(figsize=(10, 6))
         if summary_df.isnull().all().all() or summary_df.empty:
             ax_summary.text(0.5, 0.5, f"Summary N/A.\nCheck PV files for '{pv_col_name}'.", ha='center', va='center', wrap=True)
         else:
             summary_df.plot(kind='bar', ax=ax_summary, grid=True, width=0.8, legend=True)
             ax_summary.set_ylabel(f'Absolute Error (of {pv_col_name} or fallback mean)')
             ax_summary.tick_params(axis='x', rotation=0)
             ax_summary.legend(title="Calibration Method")
         ax_summary.set_title(f'Abs. Error in Total {pv_col_name} by Calibration Method')
         # Work on the figure object itself rather than pyplot's current figure.
         fig_summary.tight_layout(pad=1.5)
         buf_summary = io.BytesIO()
         fig_summary.savefig(buf_summary, format='png', dpi=90)
         plt.close(fig_summary)
         buf_summary.seek(0)
         results['summary_plot'] = Image.open(buf_summary)

         for key, value in results.items():
             if isinstance(value, pd.DataFrame):
                 try:
                     results[key] = value.round(2)
                 except (TypeError, AttributeError):
                     pass  # leave tables that cannot be rounded untouched

         gr.Info("All processing complete. ✅")
         return results

     # gr.Error is an exception class and shows nothing unless it is raised.
     # This function reports failure through its return value, so the message
     # is surfaced with the non-raising gr.Warning instead.
     except FileNotFoundError as e:
         gr.Warning(f"File not found: {e.filename}.")
         return {"error": str(e)}
     except ValueError as e:
         gr.Warning(f"Data error: {str(e)}")
         return {"error": str(e)}
     except KeyError as e:
         gr.Warning(f"Missing column: {e}. Check data formats.")
         return {"error": str(e)}
     except Exception as e:
         gr.Warning(f"Unexpected error: {str(e)}")
         traceback.print_exc()
         return {"error": str(e)}
 def create_interface():
-    with gr.Blocks(title="Cluster Model Points Analysis", theme=gr.themes.Default()) as demo:
-        gr.Markdown("## Cluster Model Points Analysis 📈")
-        gr.Markdown("Applies k-means clustering for model point selection in insurance portfolios. Upload Excel files or use examples.")
-        with gr.Accordion("📚 File Requirements & Instructions", open=False):
-            gr.Markdown(
-            """
-            **Required Excel (.xlsx) Files (Index: `policy_id` for all):**
-            1.  **Cashflows (Base, Lapse Stress, Mort Stress)**: Net annual cashflows (cols: time periods).
-            2.  **Policy Data**: Attributes. Must include: `age_at_entry`, `policy_term`, `sum_assured`, `duration_mth`.
-            3.  **Present Values (Base, Lapse Stress, Mort Stress)**: PVs of cashflow components. Ideally include `PV_NetCF`.
-            All files must share a common `policy_id` (use `index_col=0` if it's the first column).
-            """
-            )
-        with gr.Row():
-            with gr.Column(scale=3):
-                gr.Markdown("#### 📂 Upload Files or Load Examples")
-                with gr.Row():
-                    cashflow_base_input = gr.File(label="CF Base", file_types=[".xlsx"], scale=1)
-                    cashflow_lapse_input = gr.File(label="CF Lapse Str.", file_types=[".xlsx"], scale=1)
-                    cashflow_mort_input = gr.File(label="CF Mort Str.", file_types=[".xlsx"], scale=1)
-                with gr.Row():
-                    policy_data_input = gr.File(label="Policy Data", file_types=[".xlsx"], scale=1)
-                    pv_base_input = gr.File(label="PV Base", file_types=[".xlsx"], scale=1)
-                    pv_lapse_input = gr.File(label="PV Lapse Str.", file_types=[".xlsx"], scale=1)
-                with gr.Row():
-                    pv_mort_input = gr.File(label="PV Mort Str.", file_types=[".xlsx"], scale=1)
-                    # Dummy invisible components for layout, if needed, or adjust column scales
-                    gr.HTML("", scale=1, visible=False)
-                    gr.HTML("", scale=1, visible=False)
-            with gr.Column(scale=1, min_width=180): # Adjusted min_width
-                 gr.Markdown("ㅤ") # Spacer for alignment
-                 load_example_btn = gr.Button("Load Example Data", icon="💾", full_width=True)
-                 analyze_btn = gr.Button("Analyze Dataset", variant="primary", icon="🚀", full_width=True)
-        with gr.Tabs():
-            with gr.TabItem("📊 Summary", id="summary_tab"):
-                summary_plot_output = gr.Image(label="Calibration Methods Comparison", type="pil") # Use type="pil"
-            tab_items_data = [
-                ("💸 CF Calib.", "cf", "Annual Cashflows (Base)"),
-                ("👤 Attr Calib.", "attr", "Policy Attributes"),
-                ("💰 PV Calib.", "pv", "Present Values (Base)")
-            ]
-            # Dynamically create output components and store them
-            output_component_map = {"summary_plot_output": summary_plot_output}
-            for tab_name, prefix, calib_vars_desc in tab_items_data:
-                with gr.TabItem(tab_name, id=f"{prefix}_calib_tab"):
-                    gr.Markdown(f"#### Results: Using {calib_vars_desc} as Calibration Variables")
-                    with gr.Row():
-                        # Removed height parameter
-                        output_component_map[f"{prefix}_total_base_table_out"] = gr.Dataframe(label="Overall Comparison - Base CF", wrap=True)
-                        output_component_map[f"{prefix}_policy_attrs_total_out"] = gr.Dataframe(label="Overall Comparison - Policy Attr.", wrap=True)
-                    output_component_map[f"{prefix}_cashflow_plot_out"] = gr.Image(label="Cashflow Value Comparisons", type="pil")
-                    scatter_label = "Scatter: Per-Cluster PVs (Base)" if prefix == "pv" else "Scatter: Per-Cluster CFs (Base)"
-                    output_component_map[f"{prefix}_scatter_display_out"] = gr.Image(label=scatter_label, type="pil")
-                    with gr.Accordion("Present Value Comparisons (Totals)", open=False):
-                        with gr.Row():
-                            # Removed height parameter
-                            output_component_map[f"{prefix}_pv_total_base_out"] = gr.Dataframe(label="PVs - Base", wrap=True)
-                            if prefix != "attr":
-                                output_component_map[f"{prefix}_pv_total_lapse_out"] = gr.Dataframe(label="PVs - Lapse Stress", wrap=True)
-                                output_component_map[f"{prefix}_pv_total_mort_out"] = gr.Dataframe(label="PVs - Mortality Stress", wrap=True)
-        # Define the list of all output components in the correct order for the click handler
-        ordered_output_keys = [
-            'summary_plot_output',
-            'cf_total_base_table_out', 'cf_policy_attrs_total_out', 'cf_cashflow_plot_out', 'cf_scatter_display_out',
-            'cf_pv_total_base_out', 'cf_pv_total_lapse_out', 'cf_pv_total_mort_out',
-            'attr_total_base_table_out', 'attr_policy_attrs_total_out', 'attr_cashflow_plot_out', 'attr_scatter_display_out',
-            'attr_pv_total_base_out',
-            'pv_total_base_table_out', 'pv_policy_attrs_total_out', 'pv_cashflow_plot_out', 'pv_scatter_display_out',
-            'pv_total_pv_base_out', 'pv_pv_total_lapse_out', 'pv_pv_total_mort_out'
-        ]
-        # Filter out keys that might not be created if a tab's structure changes (e.g., attr_pv_total_lapse)
-        final_output_components = [output_component_map[k] for k in ordered_output_keys if k in output_component_map]
-        input_file_components = [
-            cashflow_base_input, cashflow_lapse_input, cashflow_mort_input,
-            policy_data_input, pv_base_input, pv_lapse_input, pv_mort_input
-        ]
-        def handle_analysis_click(*files_input):
-            if not all(f is not None for f in files_input):
-                gr.Warning("Not all files provided. Please upload/load all 7 files.")
-                return [None] * len(final_output_components)
-            file_paths = []
-            for f_obj in files_input:
-                if hasattr(f_obj, 'name') and isinstance(f_obj.name, str): file_paths.append(f_obj.name)
-                elif isinstance(f_obj, str): file_paths.append(f_obj)
-                else: gr.Error(f"Invalid file input: {f_obj}."); return [None] * len(final_output_components)
-            analysis_results = process_files(*file_paths)
-            if "error" in analysis_results and analysis_results["error"]: # check if error is not None or empty
-                return [None] * len(final_output_components)
-            # Map results to output components based on the ordered_output_keys
-            output_values = []
-            # Keys used in process_files for results dict:
-            # summary_plot
-            # cf_total_base_table, cf_policy_attrs_total, cf_pv_total_base, cf_pv_total_lapse, cf_pv_total_mort, cf_cashflow_plot, cf_scatter_cashflows_base
-            # attr_total_cf_base, attr_policy_attrs_total, attr_total_pv_base, attr_cashflow_plot, attr_scatter_cashflows_base
-            # pv_total_cf_base, pv_policy_attrs_total, pv_total_pv_base, pv_total_pv_lapse, pv_total_pv_mort, pv_cashflow_plot, pv_scatter_pvs_base
-            key_map = { # Maps UI component key stem to result key stem
-                'total_base_table_out': 'total_base_table',
-                'policy_attrs_total_out': 'policy_attrs_total',
-                'cashflow_plot_out': 'cashflow_plot',
-                'scatter_display_out': lambda p: f'scatter_{"pvs" if p == "pv" else "cashflows"}_base', # Special handling for scatter key
-                'pv_total_base_out': 'pv_total_base',
-                'pv_total_lapse_out': 'pv_total_lapse',
-                'pv_total_mort_out': 'pv_total_mort'
-            }
-            for ui_key in ordered_output_keys:
-                if ui_key == "summary_plot_output":
-                    output_values.append(analysis_results.get('summary_plot'))
-                    continue
-                # Deconstruct ui_key: e.g., "cf_total_base_table_out" -> prefix="cf", stem="total_base_table_out"
-                parts = ui_key.split('_', 1)
-                prefix = parts[0]
-                stem_ui = parts[1]
-                result_stem_mapper = key_map.get(stem_ui)
-                if callable(result_stem_mapper): # For scatter plot key
-                    result_key_stem = result_stem_mapper(prefix)
-                else:
-                    result_key_stem = result_stem_mapper
-                if result_key_stem:
-                    result_data_key = f"{prefix}_{result_key_stem}"
-                    output_values.append(analysis_results.get(result_data_key))
-                else: # Should not happen if ordered_output_keys and key_map are correct
-                    output_values.append(None)
-                    gr.Debug(f"No mapping found for UI key {ui_key}")
-            return output_values
-        analyze_btn.click(handle_analysis_click, inputs=input_file_components, outputs=final_output_components)
-        def load_example_files_action():
-            # Check if all example files exist
-            # Ensure EXAMPLE_FILES dictionary keys match what's expected for list(EXAMPLE_FILES.values()) order
-            expected_order = ["cashflow_base", "cashflow_lapse", "cashflow_mort", "policy_data", "pv_base", "pv_lapse", "pv_mort"]
-            example_file_paths = []
-            missing_files_list = []
-            for key in expected_order:
-                f_path = EXAMPLE_FILES.get(key)
-                if f_path and os.path.exists(f_path):
-                    example_file_paths.append(f_path)
-                else:
-                    missing_files_list.append(f_path or f"'{key}' not configured")
-            if missing_files_list:
-                gr.Error(f"Missing example data files: {', '.join(missing_files_list)}. Please ensure they exist in '{EXAMPLE_DATA_DIR}'.")
-                return [None] * len(input_file_components)
-            gr.Info(f"Example data paths loaded from '{EXAMPLE_DATA_DIR}'. Click 'Analyze Dataset'.")
-            return example_file_paths
-        load_example_btn.click(load_example_files_action, inputs=None, outputs=input_file_components) # No inputs for this button
-    return demo
 if __name__ == "__main__":
-    if not os.path.exists(EXAMPLE_DATA_DIR):
-        try: os.makedirs(EXAMPLE_DATA_DIR); print(f"Created '{EXAMPLE_DATA_DIR}'. Place example Excel files there.")
-        except OSError as e: print(f"Error creating {EXAMPLE_DATA_DIR}: {e}. Please create manually.")
-    print(f"Starting Gradio application... Ensure example files are in '{os.path.abspath(EXAMPLE_DATA_DIR)}'")
-    demo_app = create_interface()
-    demo_app.launch()

 import numpy as np
 import pandas as pd
 from sklearn.cluster import KMeans
+from sklearn.metrics import pairwise_distances_argmin_min, r2_score
 import matplotlib.pyplot as plt
 import matplotlib.cm
 import io
+import os # Added for path joining
 from PIL import Image
 # Define the paths for example data
 EXAMPLE_DATA_DIR = "eg_data"
 EXAMPLE_FILES = {
+    "cashflow_base": os.path.join(EXAMPLE_DATA_DIR, "cashflows_seriatim_10K.xlsx"),
+    "cashflow_lapse": os.path.join(EXAMPLE_DATA_DIR, "cashflows_seriatim_10K_lapse50.xlsx"),
+    "cashflow_mort": os.path.join(EXAMPLE_DATA_DIR, "cashflows_seriatim_10K_mort15.xlsx"),
+    "policy_data": os.path.join(EXAMPLE_DATA_DIR, "model_point_table.xlsx"), # Assuming this is the correct path/name for the example
+    "pv_base": os.path.join(EXAMPLE_DATA_DIR, "pv_seriatim_10K.xlsx"),
+    "pv_lapse": os.path.join(EXAMPLE_DATA_DIR, "pv_seriatim_10K_lapse50.xlsx"),
+    "pv_mort": os.path.join(EXAMPLE_DATA_DIR, "pv_seriatim_10K_mort15.xlsx"),
 }
 class Clusters:
+    """K-means clustering of policies with one representative policy per cluster.
+    The representative of a cluster is the input row closest to the fitted cluster
+    centre. Values of the representatives, scaled by the number of policies in their
+    cluster, are used as estimates of the corresponding seriatim aggregates.
+    """
+    def __init__(self, loc_vars, n_clusters=1000):
+        """Fit k-means on ``loc_vars`` and select the row nearest to each centre.
+        Args:
+            loc_vars: 2-D table of calibration variables, one row per policy.
+            n_clusters: Upper bound on the number of clusters. It is capped at the
+                number of rows because KMeans rejects n_clusters > n_samples.
+        Raises:
+            ValueError: If ``loc_vars`` has no rows.
+        """
+        data = np.ascontiguousarray(loc_vars)
+        n_samples = len(data)
+        if n_samples == 0:
+            raise ValueError("Input data for KMeans (loc_vars) is empty.")
+        k = min(n_clusters, n_samples)
+        self.kmeans = kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(data)
+        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, data)
+        # `closest` holds 0-based row positions; extract_reps() merges these values on
+        # 'policy_id', so this assumes ids run 1..N in row order -- TODO confirm against the data.
+        rep_ids = pd.Series(data=(closest + 1))
+        rep_ids.name = 'policy_id'
+        rep_ids.index.name = 'cluster_id'
+        self.rep_ids = rep_ids
+        self.policy_count = self.agg_by_cluster(pd.DataFrame({'policy_count': [1] * n_samples}))['policy_count']
+    def agg_by_cluster(self, df, agg=None):
+        """Aggregate the rows of ``df`` by cluster label.
+        Labels are attached by position, so ``df`` must have one row per fitted sample,
+        in the same order. ``agg`` optionally maps column names to aggregation methods;
+        columns it does not mention are summed.
+        """
+        temp = df.copy()
+        temp['cluster_id'] = self.kmeans.labels_
+        temp = temp.set_index('cluster_id')
+        agg = {c: (agg[c] if c in agg else 'sum') for c in temp.columns} if agg else "sum"
+        return temp.groupby(temp.index).agg(agg)
+    def extract_reps(self, df):
+        """Return the rows of ``df`` belonging to the representative policies.
+        ``df`` must expose 'policy_id' after ``reset_index()``. The result has one row
+        per cluster, indexed by 'cluster_id'.
+        """
+        temp = pd.merge(self.rep_ids, df.reset_index(), how='left', on='policy_id')
+        temp.index.name = 'cluster_id'
+        return temp.drop('policy_id', axis=1)
+    def extract_and_scale_reps(self, df, agg=None):
+        """Return representative rows, scaling summed columns by cluster policy count.
+        Without ``agg`` every column is scaled. With ``agg``, only columns that are
+        absent from it or mapped to 'sum' are scaled; the rest keep the
+        representative's own value.
+        """
+        reps = self.extract_reps(df)
+        if not agg:
+            return reps.mul(self.policy_count, axis=0)
+        # Passing an explicit index keeps the constructor valid when every column maps to
+        # the scalar 1 (pandas refuses a dict of scalars without an index).
+        mult = pd.DataFrame(
+            {c: (self.policy_count if (c not in agg or agg[c] == 'sum') else 1) for c in df.columns},
+            index=self.policy_count.index,
+        )
+        # Align positionally with the representatives before the element-wise product.
+        mult.index = reps.index
+        return reps.mul(mult)
+    def compare(self, df, agg=None):
+        """Return a multi-indexed DataFrame of per-cluster 'actual' and 'estimate' values."""
+        source = self.agg_by_cluster(df, agg)
+        target = self.extract_and_scale_reps(df, agg)
+        return pd.DataFrame({'actual': source.stack(), 'estimate': target.stack()})
+    def compare_total(self, df, agg=None):
+        """Compare portfolio totals of ``df`` with the estimate from the representatives.
+        Args:
+            df: Seriatim table indexed by 'policy_id'.
+            agg: Optional mapping of column name to 'sum' or 'mean'; unmapped columns
+                are summed. Columns mapped to any other method get an actual value
+                from ``df.agg`` but no estimate (NaN).
+        Returns:
+            DataFrame indexed by column name with 'actual', 'estimate' and 'error'
+            (``estimate / actual - 1``) columns.
+        """
+        if not agg:
+            actual = df.sum()
+            estimate = self.extract_and_scale_reps(df).sum()
+            return pd.DataFrame({'actual': actual, 'estimate': estimate, 'error': estimate / actual - 1})
+        op = {c: (agg[c] if c in agg else 'sum') for c in df.columns}
+        actual = df.agg(op)
+        reps = self.extract_reps(df)
+        weights = self.policy_count
+        total_weight = weights.sum()
+        # Build every estimate in a single pass so the result cannot depend on column order.
+        estimate_values = {}
+        for col, method in op.items():
+            if method not in ('sum', 'mean'):
+                continue
+            weighted_sum = (reps[col] * weights).sum()
+            if method == 'sum':
+                estimate_values[col] = weighted_sum
+            else:
+                # Policy-count-weighted mean of the representatives' values.
+                estimate_values[col] = weighted_sum / total_weight if total_weight else 0
+        estimate = pd.Series(estimate_values)
+        return pd.DataFrame({'actual': actual, 'estimate': estimate, 'error': estimate / actual - 1})
 def plot_cashflows_comparison(cfs_list, cluster_obj, titles):
+    """Plot actual vs. estimated column totals, one subplot per scenario.
+    Args:
+        cfs_list: Cashflow tables, one per scenario.
+        cluster_obj: Object whose ``compare_total(df)`` returns a table with
+            'actual' and 'estimate' columns.
+        titles: Subplot titles, paired positionally with ``cfs_list``.
+    Returns:
+        A PIL image of the rendered figure, or None when any argument is empty or falsy.
+    """
+    if not cfs_list or not cluster_obj or not titles:
+        return None
+    cols = 2
+    rows = (len(cfs_list) + cols - 1) // cols
+    # squeeze=False keeps `axes` two-dimensional even for a single row.
+    fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), squeeze=False)
+    try:
+        axes = axes.flatten()
+        used = 0
+        for ax, df, title in zip(axes, cfs_list, titles):
+            comparison = cluster_obj.compare_total(df)
+            comparison[['actual', 'estimate']].plot(ax=ax, grid=True, title=title)
+            ax.set_xlabel('Time')
+            ax.set_ylabel('Value')
+            used += 1
+        # Remove grid cells that received no plot.
+        for ax in axes[used:]:
+            fig.delaxes(ax)
+        # Use the figure's own methods: pyplot's "current figure" is global state and may
+        # belong to another request when handlers run concurrently.
+        fig.tight_layout()
+        buf = io.BytesIO()
+        fig.savefig(buf, format='png', dpi=100)
+    finally:
+        # Always release the figure, including when compare_total or plotting raises.
+        plt.close(fig)
+    buf.seek(0)
+    return Image.open(buf)
 def plot_scatter_comparison(df_compare_output, title):
+    """Render an actual-vs-estimate scatter plot from ``Clusters.compare()`` output.
+    Args:
+        df_compare_output: DataFrame with 'actual' and 'estimate' columns. When its index
+            is a MultiIndex with at least two levels, points are coloured by the second level.
+        title: Plot title.
+    Returns:
+        A PIL image. When the input is None or empty the image is a placeholder
+        reading "No data to display".
+    """
+    fig, ax = plt.subplots(figsize=(12, 8))
+    try:
+        if df_compare_output is None or df_compare_output.empty:
+            ax.text(0.5, 0.5, "No data to display", ha='center', va='center', fontsize=15)
+            ax.set_title(title)
+        else:
+            index = df_compare_output.index
+            if not isinstance(index, pd.MultiIndex) or index.nlevels < 2:
+                gr.Warning("Scatter plot data is not in the expected multi-index format. Plotting raw actual vs estimate.")
+                ax.scatter(df_compare_output['actual'], df_compare_output['estimate'], s=9, alpha=0.6)
+            else:
+                unique_levels = index.get_level_values(1).unique()
+                colors = matplotlib.cm.rainbow(np.linspace(0, 1, len(unique_levels)))
+                for item_level, color_val in zip(unique_levels, colors):
+                    subset = df_compare_output.xs(item_level, level=1)
+                    ax.scatter(subset['actual'], subset['estimate'], color=color_val, s=9, alpha=0.6, label=item_level)
+                # A legend is only readable for a handful of series.
+                if 1 < len(unique_levels) <= 10:
+                    ax.legend(title=index.names[1])
+            ax.set_xlabel('Actual')
+            ax.set_ylabel('Estimate')
+            ax.set_title(title)
+            ax.grid(True)
+            # Identity line over the common extent of both axes.
+            lims = [
+                np.min([ax.get_xlim(), ax.get_ylim()]),
+                np.max([ax.get_xlim(), ax.get_ylim()]),
+            ]
+            if lims[0] != lims[1]:  # Degenerate when all points coincide.
+                ax.plot(lims, lims, 'r-', linewidth=0.5)
+                ax.set_xlim(lims)
+                ax.set_ylim(lims)
+        # Save through the figure itself rather than pyplot's global current figure, which
+        # may belong to another request when handlers run concurrently.
+        buf = io.BytesIO()
+        fig.savefig(buf, format='png', dpi=100)
+    finally:
+        # Always release the figure, including when plotting raises.
+        plt.close(fig)
+    buf.seek(0)
+    return Image.open(buf)
 def process_files(cashflow_base_path, cashflow_lapse_path, cashflow_mort_path,
+                  policy_data_path, pv_base_path, pv_lapse_path, pv_mort_path):
+    """Run cashflow-, attribute- and PV-calibrated clustering on seven Excel workbooks.
+    Every path is read with ``pd.read_excel(..., index_col=0)``. A ``Clusters`` object is
+    fitted on the base cashflows, on the min-max scaled policy attributes and on the base
+    present values, and each one's estimates are compared with the seriatim totals.
+    Returns:
+        dict: On success, tables and PIL images keyed 'summary_plot', 'cf_*', 'attr_*' and
+        'pv_*'. On failure, ``{"error": message}``; no exception propagates to the caller.
+    """
+    try:
+        cfs = pd.read_excel(cashflow_base_path, index_col=0)
+        cfs_lapse50 = pd.read_excel(cashflow_lapse_path, index_col=0)
+        cfs_mort15 = pd.read_excel(cashflow_mort_path, index_col=0)
+        pol_data_full = pd.read_excel(policy_data_path, index_col=0)
+        required_cols = ['age_at_entry', 'policy_term', 'sum_assured', 'duration_mth']
+        if all(col in pol_data_full.columns for col in required_cols):
+            pol_data = pol_data_full[required_cols]
+        else:
+            # Fall back to whatever columns are present rather than failing outright.
+            gr.Warning(f"Policy data might be missing required columns. Found: {pol_data_full.columns.tolist()}")
+            pol_data = pol_data_full
+        pvs = pd.read_excel(pv_base_path, index_col=0)
+        pvs_lapse50 = pd.read_excel(pv_lapse_path, index_col=0)
+        pvs_mort15 = pd.read_excel(pv_mort_path, index_col=0)
+        cfs_list = [cfs, cfs_lapse50, cfs_mort15]
+        scen_titles = ['Base', 'Lapse+50%', 'Mort+15%']
+        results = {}
+        # Attributes are averaged, except sum_assured which is totalled.
+        mean_attrs = {'age_at_entry':'mean', 'policy_term':'mean', 'duration_mth':'mean', 'sum_assured': 'sum'}
+        # --- 1. Cashflow Calibration ---
+        cluster_cfs = Clusters(cfs)
+        results['cf_total_base_table'] = cluster_cfs.compare_total(cfs)
+        results['cf_policy_attrs_total'] = cluster_cfs.compare_total(pol_data, agg=mean_attrs)
+        results['cf_pv_total_base'] = cluster_cfs.compare_total(pvs)
+        results['cf_pv_total_lapse'] = cluster_cfs.compare_total(pvs_lapse50)
+        results['cf_pv_total_mort'] = cluster_cfs.compare_total(pvs_mort15)
+        results['cf_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_cfs, scen_titles)
+        results['cf_scatter_cashflows_base'] = plot_scatter_comparison(cluster_cfs.compare(cfs), 'Cashflow Calib. - Cashflows (Base)')
+        # --- 2. Policy Attribute Calibration ---
+        if pol_data.empty:
+            gr.Warning("Policy data for attribute calibration is empty. Skipping attribute calibration plots.")
+            results['attr_total_cf_base'] = pd.DataFrame()
+            results['attr_policy_attrs_total'] = pd.DataFrame()
+            results['attr_total_pv_base'] = pd.DataFrame()
+            results['attr_cashflow_plot'] = None
+            results['attr_scatter_cashflows_base'] = None
+            attr_pv_tables = None
+        else:
+            # Min-max scale each attribute. A constant column has zero range; dividing by 1
+            # instead maps it to 0 so it neither breaks the division nor affects distances.
+            value_range = pol_data.max() - pol_data.min()
+            loc_vars_attrs = (pol_data - pol_data.min()) / value_range.replace(0, 1)
+            cluster_attrs = Clusters(loc_vars_attrs)
+            results['attr_total_cf_base'] = cluster_attrs.compare_total(cfs)
+            results['attr_policy_attrs_total'] = cluster_attrs.compare_total(pol_data, agg=mean_attrs)
+            results['attr_total_pv_base'] = cluster_attrs.compare_total(pvs)
+            results['attr_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_attrs, scen_titles)
+            results['attr_scatter_cashflows_base'] = plot_scatter_comparison(cluster_attrs.compare(cfs), 'Policy Attr. Calib. - Cashflows (Base)')
+            attr_pv_tables = [
+                results['attr_total_pv_base'],
+                cluster_attrs.compare_total(pvs_lapse50),
+                cluster_attrs.compare_total(pvs_mort15),
+            ]
+        # --- 3. Present Value Calibration ---
+        cluster_pvs = Clusters(pvs)
+        results['pv_total_cf_base'] = cluster_pvs.compare_total(cfs)
+        results['pv_policy_attrs_total'] = cluster_pvs.compare_total(pol_data, agg=mean_attrs)
+        results['pv_total_pv_base'] = cluster_pvs.compare_total(pvs)
+        results['pv_total_pv_lapse'] = cluster_pvs.compare_total(pvs_lapse50)
+        results['pv_total_pv_mort'] = cluster_pvs.compare_total(pvs_mort15)
+        results['pv_cashflow_plot'] = plot_cashflows_comparison(cfs_list, cluster_pvs, scen_titles)
+        results['pv_scatter_pvs_base'] = plot_scatter_comparison(cluster_pvs.compare(pvs), 'PV Calib. - PVs (Base)')
+        # --- Summary Comparison Plot Data ---
+        has_net_cf = 'PV_NetCF' in pvs.columns
+        def _abs_errors(tables):
+            """Absolute relative error per scenario: the 'PV_NetCF' row when the base PVs have
+            that column, otherwise the mean of the 'error' column."""
+            if has_net_cf:
+                return [abs(t.loc['PV_NetCF', 'error']) for t in tables]
+            return [abs(t['error'].mean()) for t in tables]
+        # The PV comparison tables computed above are reused instead of being recomputed.
+        error_data = {
+            'CF Calib. (PV NetCF)': _abs_errors(
+                [results['cf_pv_total_base'], results['cf_pv_total_lapse'], results['cf_pv_total_mort']]
+            ),
+            'Attr Calib. (PV NetCF)': _abs_errors(attr_pv_tables) if attr_pv_tables else [np.nan, np.nan, np.nan],
+            'PV Calib. (PV NetCF)': _abs_errors(
+                [results['pv_total_pv_base'], results['pv_total_pv_lapse'], results['pv_total_pv_mort']]
+            ),
+        }
+        summary_df = pd.DataFrame(error_data, index=['Base', 'Lapse+50%', 'Mort+15%'])
+        fig_summary, ax_summary = plt.subplots(figsize=(10, 6))
+        try:
+            summary_df.plot(kind='bar', ax=ax_summary, grid=True)
+            ax_summary.set_ylabel('Mean Absolute Error (of PV_NetCF)')
+            ax_summary.set_title('Calibration Method Comparison - Error in Total PV Net Cashflow')
+            ax_summary.tick_params(axis='x', rotation=0)
+            # Figure-level calls avoid pyplot's global current-figure state.
+            fig_summary.tight_layout()
+            buf_summary = io.BytesIO()
+            fig_summary.savefig(buf_summary, format='png', dpi=100)
+        finally:
+            plt.close(fig_summary)
+        buf_summary.seek(0)
+        results['summary_plot'] = Image.open(buf_summary)
+        return results
+    # gr.Error is an exception class: constructing it without `raise` shows nothing in the UI.
+    # The full message is therefore returned under 'error' for the caller to surface.
+    except FileNotFoundError as e:
+        return {"error": f"File not found: {e.filename}. Please ensure example files are in '{EXAMPLE_DATA_DIR}' or all files are uploaded."}
+    except KeyError as e:
+        return {"error": f"A required column is missing from one of the excel files: {e}. Please check data format."}
+    except Exception as e:
+        return {"error": f"Error processing files: {str(e)}"}
 def create_interface():
+    """Build the Gradio Blocks UI: seven file inputs, an example loader and result tabs.
+    Returns:
+        The ``gr.Blocks`` application, not yet launched.
+    """
+    with gr.Blocks(title="Cluster Model Points Analysis") as demo: # Removed theme
+        gr.Markdown("""
+        # Cluster Model Points Analysis
+        This application applies cluster analysis to model point selection for insurance portfolios.
+        Upload your Excel files or use the example data to analyze cashflows, policy attributes, and present values using different calibration methods.
+        **Required Files (Excel .xlsx):**
+        - Cashflows - Base Scenario
+        - Cashflows - Lapse Stress (+50%)
+        - Cashflows - Mortality Stress (+15%)
+        - Policy Data (including 'age_at_entry', 'policy_term', 'sum_assured', 'duration_mth')
+        - Present Values - Base Scenario
+        - Present Values - Lapse Stress
+        - Present Values - Mortality Stress
+        """)
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown("### Upload Files or Load Examples")
+                load_example_btn = gr.Button("Load Example Data")
+                with gr.Row():
+                    cashflow_base_input = gr.File(label="Cashflows - Base", file_types=[".xlsx"])
+                    cashflow_lapse_input = gr.File(label="Cashflows - Lapse Stress", file_types=[".xlsx"])
+                    cashflow_mort_input = gr.File(label="Cashflows - Mortality Stress", file_types=[".xlsx"])
+                with gr.Row():
+                    policy_data_input = gr.File(label="Policy Data", file_types=[".xlsx"])
+                    pv_base_input = gr.File(label="Present Values - Base", file_types=[".xlsx"])
+                    pv_lapse_input = gr.File(label="Present Values - Lapse Stress", file_types=[".xlsx"])
+                with gr.Row():
+                    pv_mort_input = gr.File(label="Present Values - Mortality Stress", file_types=[".xlsx"])
+                analyze_btn = gr.Button("Analyze Dataset", variant="primary", size="lg")
+        with gr.Tabs():
+            with gr.TabItem(" Summary"):
+                summary_plot_output = gr.Image(label="Calibration Methods Comparison (Error in Total PV Net Cashflow)")
+            with gr.TabItem(" Cashflow Calibration"):
+                gr.Markdown("### Results: Using Annual Cashflows as Calibration Variables")
+                with gr.Row():
+                    cf_total_base_table_out = gr.Dataframe(label="Overall Comparison - Base Scenario (Cashflows)")
+                    cf_policy_attrs_total_out = gr.Dataframe(label="Overall Comparison - Policy Attributes")
+                cf_cashflow_plot_out = gr.Image(label="Cashflow Value Comparisons (Actual vs. Estimate) Across Scenarios")
+                cf_scatter_cashflows_base_out = gr.Image(label="Scatter Plot - Per-Cluster Cashflows (Base Scenario)")
+                with gr.Accordion("Present Value Comparisons (Total)", open=False):
+                    with gr.Row():
+                        cf_pv_total_base_out = gr.Dataframe(label="PVs - Base Total")
+                        cf_pv_total_lapse_out = gr.Dataframe(label="PVs - Lapse Stress Total")
+                        cf_pv_total_mort_out = gr.Dataframe(label="PVs - Mortality Stress Total")
+            with gr.TabItem(" Policy Attribute Calibration"):
+                gr.Markdown("### Results: Using Policy Attributes as Calibration Variables")
+                with gr.Row():
+                    attr_total_cf_base_out = gr.Dataframe(label="Overall Comparison - Base Scenario (Cashflows)")
+                    attr_policy_attrs_total_out = gr.Dataframe(label="Overall Comparison - Policy Attributes")
+                attr_cashflow_plot_out = gr.Image(label="Cashflow Value Comparisons (Actual vs. Estimate) Across Scenarios")
+                attr_scatter_cashflows_base_out = gr.Image(label="Scatter Plot - Per-Cluster Cashflows (Base Scenario)")
+                with gr.Accordion("Present Value Comparisons (Total)", open=False):
+                     attr_total_pv_base_out = gr.Dataframe(label="PVs - Base Scenario Total")
+            with gr.TabItem(" Present Value Calibration"):
+                gr.Markdown("### Results: Using Present Values (Base Scenario) as Calibration Variables")
+                with gr.Row():
+                    pv_total_cf_base_out = gr.Dataframe(label="Overall Comparison - Base Scenario (Cashflows)")
+                    pv_policy_attrs_total_out = gr.Dataframe(label="Overall Comparison - Policy Attributes")
+                pv_cashflow_plot_out = gr.Image(label="Cashflow Value Comparisons (Actual vs. Estimate) Across Scenarios")
+                pv_scatter_pvs_base_out = gr.Image(label="Scatter Plot - Per-Cluster Present Values (Base Scenario)")
+                with gr.Accordion("Present Value Comparisons (Total)", open=False):
+                    with gr.Row():
+                        pv_total_pv_base_out = gr.Dataframe(label="PVs - Base Total")
+                        pv_total_pv_lapse_out = gr.Dataframe(label="PVs - Lapse Stress Total")
+                        pv_total_pv_mort_out = gr.Dataframe(label="PVs - Mortality Stress Total")
+        # Input and output components, in the positional order the handlers rely on.
+        file_inputs = [cashflow_base_input, cashflow_lapse_input, cashflow_mort_input,
+                       policy_data_input, pv_base_input, pv_lapse_input, pv_mort_input]
+        def get_all_output_components():
+            """Return every output component, in the order handle_analysis returns values."""
+            return [
+                summary_plot_output,
+                # Cashflow Calib Outputs
+                cf_total_base_table_out, cf_policy_attrs_total_out,
+                cf_cashflow_plot_out, cf_scatter_cashflows_base_out,
+                cf_pv_total_base_out, cf_pv_total_lapse_out, cf_pv_total_mort_out,
+                # Attribute Calib Outputs
+                attr_total_cf_base_out, attr_policy_attrs_total_out,
+                attr_cashflow_plot_out, attr_scatter_cashflows_base_out, attr_total_pv_base_out,
+                # PV Calib Outputs
+                pv_total_cf_base_out, pv_policy_attrs_total_out,
+                pv_cashflow_plot_out, pv_scatter_pvs_base_out,
+                pv_total_pv_base_out, pv_total_pv_lapse_out, pv_total_pv_mort_out
+            ]
+        # Result keys of process_files(), in the same order as get_all_output_components().
+        result_keys = (
+            'summary_plot',
+            # CF Calib
+            'cf_total_base_table', 'cf_policy_attrs_total',
+            'cf_cashflow_plot', 'cf_scatter_cashflows_base',
+            'cf_pv_total_base', 'cf_pv_total_lapse', 'cf_pv_total_mort',
+            # Attr Calib
+            'attr_total_cf_base', 'attr_policy_attrs_total',
+            'attr_cashflow_plot', 'attr_scatter_cashflows_base', 'attr_total_pv_base',
+            # PV Calib
+            'pv_total_cf_base', 'pv_policy_attrs_total',
+            'pv_cashflow_plot', 'pv_scatter_pvs_base',
+            'pv_total_pv_base', 'pv_total_pv_lapse', 'pv_total_pv_mort',
+        )
+        # --- Action for Analyze Button ---
+        def handle_analysis(f1, f2, f3, f4, f5, f6, f7):
+            """Resolve the seven inputs to paths, run process_files and map results to outputs.
+            Raises:
+                gr.Error: If an input is missing or of an unexpected type, or if
+                    process_files reports an error. gr.Error only reaches the UI when
+                    it is raised; constructing it without ``raise`` is a silent no-op.
+            """
+            file_paths = []
+            for position, f_obj in enumerate([f1, f2, f3, f4, f5, f6, f7], start=1):
+                if f_obj is None:
+                    raise gr.Error(f"Missing file input for argument {position}. Please upload all files or load examples.")
+                # Uploaded files expose their temp path as `.name`; example loads are plain strings.
+                if hasattr(f_obj, 'name') and isinstance(f_obj.name, str):
+                    file_paths.append(f_obj.name)
+                elif isinstance(f_obj, str):
+                    file_paths.append(f_obj)
+                else:
+                    raise gr.Error(f"Invalid file input for argument {position}. Type: {type(f_obj)}")
+            results = process_files(*file_paths)
+            if "error" in results:
+                raise gr.Error(results["error"])
+            return [results.get(key) for key in result_keys]
+        analyze_btn.click(
+            handle_analysis,
+            inputs=file_inputs,
+            outputs=get_all_output_components()
+        )
+        # --- Action for Load Example Data Button ---
+        def load_example_files():
+            """Return the seven example file paths, in file-input order.
+            Raises:
+                gr.Error: If any example file does not exist on disk.
+            """
+            missing_files = [fp for fp in EXAMPLE_FILES.values() if not os.path.exists(fp)]
+            if missing_files:
+                raise gr.Error(f"Missing example data files in '{EXAMPLE_DATA_DIR}': {', '.join(missing_files)}. Please ensure they exist.")
+            gr.Info("Example data paths loaded. Click 'Analyze Dataset'.")
+            return [
+                EXAMPLE_FILES["cashflow_base"], EXAMPLE_FILES["cashflow_lapse"], EXAMPLE_FILES["cashflow_mort"],
+                EXAMPLE_FILES["policy_data"], EXAMPLE_FILES["pv_base"], EXAMPLE_FILES["pv_lapse"],
+                EXAMPLE_FILES["pv_mort"]
+            ]
+        load_example_btn.click(
+            load_example_files,
+            inputs=[],
+            outputs=file_inputs
+        )
+    return demo
 if __name__ == "__main__":
+    # Make sure the example-data folder exists. It is created empty; the Excel files
+    # themselves must be supplied by the user, since empty placeholders would only
+    # fail later in pd.read_excel.
+    example_dir_present = os.path.exists(EXAMPLE_DATA_DIR)
+    if not example_dir_present:
+        os.makedirs(EXAMPLE_DATA_DIR)
+        print(
+            f"Created directory '{EXAMPLE_DATA_DIR}'. Please place example Excel files there.",
+            f"Expected files in '{EXAMPLE_DATA_DIR}': {list(EXAMPLE_FILES.values())}",
+            sep="\n",
+        )
+    # Build the UI and start serving it.
+    demo_app = create_interface()
+    demo_app.launch()