alidenewade committed on
Commit
f647840
·
verified ·
1 Parent(s): b9e8b82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -99
app.py CHANGED
@@ -2,117 +2,146 @@ import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
  from sklearn.cluster import KMeans
5
- from sklearn.metrics import r2_score
6
  import matplotlib.pyplot as plt
7
  import io
 
8
 
9
def cluster_analysis(policy_file, cashflow_file, pv_file, num_clusters):
    """Select representative model points from a seriatim policy file with KMeans.

    Args:
        policy_file: uploaded file object (exposes a ``.name`` path) with policy
            data; must contain IssueAge, PolicyTerm, SumAssured and Duration.
        cashflow_file: uploaded Excel workbook of per-policy cashflows; the
            first column is used as the index.
        pv_file: uploaded Excel workbook of per-policy present values; the
            first column is used as the index.
        num_clusters: number of KMeans clusters, i.e. model points to select.

    Returns:
        Tuple ``(csv_text, cashflow_png_bytes, pv_png_bytes, metrics_text)``.
        On failure the first three slots are None and the last carries the
        error message (errors are reported, never raised, so the UI stays up).
    """
    # Read the three workbooks up front; report any failure instead of raising.
    try:
        policy_df = pd.read_excel(policy_file.name)
        cashflow_df = pd.read_excel(cashflow_file.name, index_col=0)
        pv_df = pd.read_excel(pv_file.name, index_col=0)
    except Exception as e:
        return (None, None, None, f"Error reading files: {e}")

    # Cluster on basic policy attributes only.
    required_cols = ['IssueAge', 'PolicyTerm', 'SumAssured', 'Duration']
    if not all(col in policy_df.columns for col in required_cols):
        return (None, None, None, f"Policy data missing required columns: {required_cols}")

    X = policy_df[required_cols].fillna(0)
    # Standardise features. A constant column has std == 0, which yields NaNs
    # here and would crash KMeans — replace those NaNs with 0.
    X_scaled = ((X - X.mean()) / X.std()).fillna(0)

    try:
        kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
        kmeans.fit(X_scaled)
        policy_df['Cluster'] = kmeans.labels_
    except Exception as e:
        return (None, None, None, f"Clustering error: {e}")

    # Model points = the real policies nearest each cluster centre.
    from sklearn.metrics import pairwise_distances_argmin_min
    centers = kmeans.cluster_centers_
    closest, _ = pairwise_distances_argmin_min(centers, X_scaled)
    model_points = policy_df.iloc[closest].copy()

    # Weight each model point by the size of the cluster it represents.
    counts = policy_df['Cluster'].value_counts()
    model_points['Weight'] = model_points['Cluster'].map(counts)

    # CSV payload for download.
    csv_buffer = io.StringIO()
    model_points.to_csv(csv_buffer, index=False)
    csv_data = csv_buffer.getvalue()

    # Weighted proxy cashflows vs. full seriatim totals.
    # NOTE(review): assumes cashflow_df/pv_df rows share index labels with
    # policy_df rows — verify against the input workbooks.
    proxy_cashflows = cashflow_df.loc[model_points.index].multiply(model_points['Weight'], axis=0).sum()
    seriatim_cashflows = cashflow_df.sum()

    # Cashflow comparison plot (render explicitly from `fig`, not the
    # implicit current figure).
    fig, ax = plt.subplots(figsize=(8, 4))
    seriatim_cashflows.plot(ax=ax, label='Seriatim Cashflows')
    proxy_cashflows.plot(ax=ax, label='Proxy Cashflows', linestyle='--')
    ax.set_title('Aggregated Cashflows Comparison')
    ax.legend()
    ax.grid(True)

    buf = io.BytesIO()
    fig.savefig(buf, format='png')
    plt.close(fig)
    buf.seek(0)
    cashflow_plot = buf.read()

    # Weighted proxy present value vs. full seriatim total (first PV column).
    proxy_pv = pv_df.loc[model_points.index].multiply(model_points['Weight'], axis=0).sum().values[0]
    seriatim_pv = pv_df.sum().values[0]

    # Present-value comparison bar chart.
    fig2, ax2 = plt.subplots(figsize=(5, 4))
    ax2.bar(['Seriatim PV', 'Proxy PV'], [seriatim_pv, proxy_pv], color=['blue', 'orange'])
    ax2.set_title('Aggregated Present Values')
    ax2.grid(axis='y')

    buf2 = io.BytesIO()
    fig2.savefig(buf2, format='png')
    plt.close(fig2)
    buf2.seek(0)
    pv_plot = buf2.read()

    # Accuracy metrics: R^2 on the overlapping periods, and PV percentage
    # error guarded against a zero seriatim total.
    common_idx = seriatim_cashflows.index.intersection(proxy_cashflows.index)
    r2 = r2_score(seriatim_cashflows.loc[common_idx], proxy_cashflows.loc[common_idx])
    pv_error = abs(proxy_pv - seriatim_pv) / seriatim_pv * 100 if seriatim_pv != 0 else float('inf')

    metrics_text = (
        f"R-squared for aggregated cashflows: {r2:.4f}\n"
        f"Absolute percentage error in present value: {pv_error:.4f}%"
    )

    return csv_data, cashflow_plot, pv_plot, metrics_text
 
94
 
 
95
# Gradio UI: three Excel uploads + a cluster-count slider on the left,
# results (CSV text, two plots, metrics) on the right.
with gr.Blocks() as demo:
    gr.Markdown("# Actuarial Model Point Selection")

    with gr.Row():
        with gr.Column():
            # Inputs: the three workbooks cluster_analysis() expects.
            policy_input = gr.File(label="Upload Policy Data (Excel)")
            cashflow_input = gr.File(label="Upload Cashflow Data (Excel)")
            pv_input = gr.File(label="Upload Present Value Data (Excel)")
            clusters_input = gr.Slider(minimum=2, maximum=100, step=1, value=10, label="Number of Model Points")
            run_btn = gr.Button("Run Clustering")

        with gr.Column():
            # Outputs, in the same order cluster_analysis() returns them.
            output_csv = gr.Textbox(label="Model Points CSV Output", lines=10)
            cashflow_img = gr.Image(label="Aggregated Cashflows Comparison")
            pv_img = gr.Image(label="Aggregated Present Values Comparison")
            metrics_box = gr.Textbox(label="Accuracy Metrics", lines=4)

    # Wire the button straight to the analysis function.
    run_btn.click(
        cluster_analysis,
        inputs=[policy_input, cashflow_input, pv_input, clusters_input],
        outputs=[output_csv, cashflow_img, pv_img, metrics_box]
    )

demo.launch(debug=True)
 
2
  import pandas as pd
3
  import numpy as np
4
  from sklearn.cluster import KMeans
5
+ from sklearn.metrics import r2_score, pairwise_distances_argmin_min
6
  import matplotlib.pyplot as plt
7
  import io
8
+ import os
9
 
10
def run_cluster_analysis(policy_file, cashflow_file, pv_file, num_clusters, cluster_type):
    """Cluster seriatim policies into model points and compare proxy vs. seriatim totals.

    Args:
        policy_file: path to the policy Excel workbook.
        cashflow_file: path to the cashflow workbook (first column = index).
        pv_file: path to the present-value workbook (first column = index).
        num_clusters: number of KMeans clusters / model points (k).
        cluster_type: which variables to cluster on — "Policy Attributes",
            "Net Cashflows" or "Present Values".

    Returns:
        Tuple ``(csv_bytes, cashflow_png_buf, pv_png_buf, metrics_text)``; on
        failure the first three slots are None and the last is the error text.
    """
    try:
        # Load data
        policy_df = pd.read_excel(policy_file)
        cashflow_df = pd.read_excel(cashflow_file, index_col=0)
        pv_df = pd.read_excel(pv_file, index_col=0)

        # Normalize column names for robustness
        policy_df.columns = policy_df.columns.str.strip().str.lower()
        pv_df.columns = pv_df.columns.str.strip().str.lower()

        # Pick the clustering feature matrix.
        if cluster_type == "Policy Attributes":
            required_cols = ['issueage', 'policyterm', 'sumassured', 'duration']
            missing = [col for col in required_cols if col not in policy_df.columns]
            if missing:
                return (None, None, None, f"Policy data missing required columns: {missing}")
            X = policy_df[required_cols].fillna(0)
        elif cluster_type == "Net Cashflows":
            X = cashflow_df.fillna(0)
        elif cluster_type == "Present Values":
            if 'pv_net_cf' not in pv_df.columns:
                return (None, None, None, "Missing 'PV_Net_CF' column in PV file.")
            X = pv_df[['pv_net_cf']].fillna(0)
        else:
            return (None, None, None, "Invalid clustering variable choice.")

        # Scale; constant columns produce NaNs (std == 0), so zero them out.
        X_scaled = ((X - X.mean()) / X.std(ddof=0)).fillna(0)

        # Run KMeans
        kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init='auto')
        kmeans.fit(X_scaled)
        policy_df['cluster'] = kmeans.labels_

        # Model points = real policies nearest each cluster centre, weighted
        # by the number of policies in their cluster.
        # NOTE(review): assumes policy_df rows align positionally with X rows
        # for the cashflow/PV clustering variants — verify against the inputs.
        closest_idxs = pairwise_distances_argmin_min(kmeans.cluster_centers_, X_scaled)[0]
        model_points = policy_df.iloc[closest_idxs].copy()
        cluster_counts = policy_df['cluster'].value_counts()
        model_points['weight'] = model_points['cluster'].map(cluster_counts)

        # Aggregate comparisons (weighted proxy vs. full seriatim totals).
        total_seriatim_cf = cashflow_df.sum(axis=0)
        total_seriatim_pv = pv_df.sum(axis=0)
        proxy_cf = cashflow_df.loc[model_points.index].multiply(model_points['weight'], axis=0).sum(axis=0)
        proxy_pv = pv_df.loc[model_points.index].multiply(model_points['weight'], axis=0).sum(axis=0)

        # Output CSV
        csv_buf = io.StringIO()
        model_points.to_csv(csv_buf, index=False)
        csv_bytes = csv_buf.getvalue().encode()

        # Cashflow plot (render from the explicit figure, not plt's current one).
        fig1, ax1 = plt.subplots()
        total_seriatim_cf.plot(ax=ax1, label="Seriatim", color="blue")
        proxy_cf.plot(ax=ax1, label="Proxy", linestyle="--", color="orange")
        ax1.set_title("Aggregated Cashflows")
        ax1.legend()
        ax1.grid()
        buf1 = io.BytesIO()
        fig1.savefig(buf1, format='png')
        buf1.seek(0)
        plt.close(fig1)

        # PV plot
        fig2, ax2 = plt.subplots()
        pv_plot = pd.DataFrame({
            "Seriatim PV": [total_seriatim_pv.iloc[0]],
            "Proxy PV": [proxy_pv.iloc[0]]
        })
        pv_plot.plot(kind="bar", ax=ax2, color=["blue", "orange"])
        ax2.set_title("Aggregated Present Values")
        ax2.set_xticks([0])
        ax2.set_xticklabels(["Total PV"])
        ax2.grid(axis='y')
        buf2 = io.BytesIO()
        fig2.savefig(buf2, format='png')
        buf2.seek(0)
        plt.close(fig2)

        # Metrics: align seriatim/proxy on their common periods before R^2
        # (they can differ when the proxy subset lacks some columns), and
        # guard the PV error against a zero seriatim total.
        common_idx = total_seriatim_cf.index.intersection(proxy_cf.index)
        r2 = r2_score(total_seriatim_cf.loc[common_idx], proxy_cf.loc[common_idx])
        seriatim_pv_total = total_seriatim_pv.iloc[0]
        if seriatim_pv_total != 0:
            pv_err = abs((proxy_pv.iloc[0] - seriatim_pv_total) / seriatim_pv_total) * 100
        else:
            pv_err = float('inf')
        metrics = (
            f"--- Accuracy Metrics ---\n"
            f"R-squared (Cashflows): {r2:.4f}\n"
            f"Absolute % Error (Present Value): {pv_err:.2f}%"
        )

        return csv_bytes, buf1, buf2, metrics

    except Exception as e:
        return (None, None, None, f"An error occurred: {str(e)}")
103
 
104
# Build UI: uploads + clustering options on top, results row underneath.
with gr.Blocks() as demo:
    gr.Markdown("## Actuarial Model Point Selection via Cluster Analysis")

    with gr.Row():
        with gr.Column():
            policy_file = gr.File(label="Upload Policy Data (.xlsx)", file_types=[".xlsx", ".xls"])
            cashflow_file = gr.File(label="Upload Cashflow Data (.xlsx)", file_types=[".xlsx", ".xls"])
            pv_file = gr.File(label="Upload Present Value Data (.xlsx)", file_types=[".xlsx", ".xls"])

        with gr.Column():
            num_clusters = gr.Slider(10, 2000, value=1000, step=10, label="Number of Model Points (k)")
            cluster_type = gr.Dropdown(
                ["Policy Attributes", "Net Cashflows", "Present Values"],
                value="Present Values",
                label="Clustering Variable"
            )
            run_btn = gr.Button("Run Cluster Analysis")

    with gr.Row():
        output_csv = gr.File(label="Download Model Points (CSV)")
        output_cf_plot = gr.Image(label="Cashflow Comparison")
        output_pv_plot = gr.Image(label="PV Comparison")
        output_metrics = gr.Textbox(label="Accuracy Metrics", lines=5)

    def wrapper(policy_file, cashflow_file, pv_file, num_clusters, cluster_type):
        """Adapt uploaded-file objects to run_cluster_analysis and write the CSV to disk."""
        # A missing upload arrives as None; dereferencing .name would raise.
        if policy_file is None or cashflow_file is None or pv_file is None:
            return None, None, None, "Please upload all three input files."

        csv_bytes, img_cf, img_pv, metrics = run_cluster_analysis(
            policy_file.name, cashflow_file.name, pv_file.name, num_clusters, cluster_type
        )

        csv_path = None
        if csv_bytes is not None:
            import tempfile
            # Use a unique temp file rather than a hard-coded "/tmp/..." path:
            # portable to non-POSIX hosts and safe for concurrent users.
            with tempfile.NamedTemporaryFile(
                mode="wb", suffix=".csv", prefix="model_points_", delete=False
            ) as f:
                f.write(csv_bytes)
                csv_path = f.name

        return csv_path, img_cf, img_pv, metrics

    run_btn.click(
        fn=wrapper,
        inputs=[policy_file, cashflow_file, pv_file, num_clusters, cluster_type],
        outputs=[output_csv, output_cf_plot, output_pv_plot, output_metrics]
    )

demo.launch()