# Source: Hugging Face Space "speri420/agentic-aml-demo" (Space status: Paused).
# File size: 35,238 Bytes

import math
import  os
try:
    import boto3
except ImportError:
    boto3 = None  # only needed for show_ss_performance() (legacy AWS Lambda function)
try:
    import botocore
except ImportError:
    botocore = None  # only needed for legacy AWS Lambda functions
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
import json
#import kaleido
def segment_threshold_tuning(df, segment, threshold):
    """Chart how one segment's alert count reacts to moving its threshold.

    Args:
        df: DataFrame with the columns ``smart_segment_id``, ``alerts``,
            ``false_positives`` and the column named by ``threshold``.
        segment: Segment index. It is used both as the ``smart_segment_id``
            value to filter on and as the position in the per-segment lookup
            lists below (0 -> 'Business', 1 -> 'Individual').
        threshold: Name of the column compared against the cut-off values.

    Returns:
        A plotly Figure with one bar per scenario (current threshold,
        threshold lowered/raised by 10% and 20%, and the segment average
        threshold), annotated with the current totals.
    """
    # Per-segment constants, indexed by ``segment``.
    segment_current_thresholds = [100, 25]
    segment_threshold_averages = [730, 220]
    segment_names = ['Business', 'Individual']

    current = segment_current_thresholds[segment]
    average = segment_threshold_averages[segment]
    name = segment_names[segment]

    in_segment = df['smart_segment_id'] == segment
    is_alert = df['alerts'] == 1
    is_fp = df['false_positives'] == 1
    values = df[threshold]

    def count_at(cutoff, flag):
        # Rows of this segment at/above ``cutoff`` that carry ``flag``.
        return int((in_segment & (values >= cutoff) & flag).sum())

    # "BTL" bars lower the current threshold, "ATL" bars raise it.
    scenarios = [
        ('Total Alerts', count_at(current, is_alert)),
        ('Unproductive Alerts', count_at(current, is_fp)),
        ('Alerts BTL 10%', count_at(current - current * .1, is_alert)),
        ('Alerts BTL 20%', count_at(current - current * .2, is_alert)),
        ('Alerts ATL 10%', count_at(current + current * .1, is_alert)),
        ('Alerts ATL 20%', count_at(current + current * .2, is_alert)),
        ('Alerts using Segment Average', count_at(average, is_alert)),
    ]
    fig = go.Figure([go.Bar(name=label, x=[name], y=[count]) for label, count in scenarios])

    # Only one segment is computed per call, so the total is a single value.
    # (Indexing a one-element list by ``segment`` raised IndexError for
    # segment 1.)
    total_alerts = scenarios[0][1]
    fig.add_annotation(
        text=f"<b>Total Alerts:{total_alerts}<br><b>Current Threshold:{current}<br><b>Segment Threshold Mean:{average}",  # Text to display
        xref="paper",  # Reference the figure's paper coordinates
        yref="paper",  # Reference the figure's paper coordinates
        x=1,  # Position the text at the right edge of the figure
        y=1,  # Position the text at the top edge of the figure
        showarrow=False,  # No arrow pointing to the text
        align="right",  # Align the text to the right
        valign="top"  # Align the text to the top
    )
    # Adjust bar width and gap
    fig.update_traces(width=0.05)  # Make bars thinner
    fig.update_layout(bargroupgap=0.01, title=f"Threshold({threshold}) Tuning for {name} Segment")
    return fig
def alerts_distribution(df):
    """Build a grouped bar chart of alert and false-positive counts per segment.

    Args:
        df: DataFrame with ``smart_segment_id`` (0 and 1 are charted),
            ``alerts`` and ``false_positives`` columns.

    Returns:
        A plotly Figure with two bar series ('Total Alerts' and
        'False Positives') over the 'Business' and 'Individual' categories.
    """
    segment_labels = ['Business', 'Individual']
    segment_ids = (0, 1)

    def rows_flagged(segment_id, flag_column):
        # Number of rows of ``segment_id`` whose ``flag_column`` equals 1.
        in_segment = df['smart_segment_id'] == segment_id
        flagged = df[flag_column] == 1
        return df[in_segment & flagged].shape[0]

    alert_counts = [rows_flagged(sid, 'alerts') for sid in segment_ids]
    fp_counts = [rows_flagged(sid, 'false_positives') for sid in segment_ids]

    fig = go.Figure([
        go.Bar(name='Total Alerts', x=segment_labels, y=alert_counts),
        go.Bar(name='False Positives', x=segment_labels, y=fp_counts),
    ])
    fig.update_layout(barmode='group', title="Alerts distribution across Segments")
    return fig
def plot_thresholds_tuning(df_segment, threshold, bump_pct, segment):
    """Sweep a threshold column and plot false positives / negatives per cut-off.

    The sweep starts at the column minimum and advances in fixed steps of
    ``max(1, int(range / 100))`` until one step past the column maximum.
    At each cut-off, false positives are counted among rows at/above it and
    false negatives among rows below it. The sweep table is also written to
    ``/tmp/Segment_<segment>_<threshold>.csv``.

    Args:
        df_segment: DataFrame with the ``threshold`` column plus
            ``false_positives`` and ``false_negatives`` flag columns.
        threshold: Name of the column to sweep.
        bump_pct: Not used by this function; kept for signature compatibility.
        segment: Segment label used in the chart title and CSV file name.

    Returns:
        Tuple ``(fig, df_segment)``: the plotly Figure and the unmodified input.
    """
    lowest = df_segment[threshold].min()
    highest = df_segment[threshold].max()
    step = max(1, int((highest - lowest) / 100))
    upper_limit = highest + step

    values = df_segment[threshold]
    is_fp = df_segment['false_positives'] == 1
    is_fn = df_segment['false_negatives'] == 1

    # One (cut-off, FP count, FN count) record per sweep position.
    sweep = []
    cut = lowest
    while cut <= upper_limit:
        fp_count = int(((values >= cut) & is_fp).sum())
        fn_count = int(((values < cut) & is_fn).sum())
        sweep.append((round(cut, 2), fp_count, fn_count))
        cut = cut + step

    thresholds = [point[0] for point in sweep]
    false_positives = [point[1] for point in sweep]
    false_negatives = [point[2] for point in sweep]

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=thresholds, y=false_positives, mode='lines', name='False Positives',
        line=dict(color='#EF553B', width=2),
    ))
    fig.add_trace(go.Scatter(
        x=thresholds, y=false_negatives, mode='lines', name='False Negatives',
        line=dict(color='#636EFA', width=2),
    ))
    fig.update_layout(
        title=f'False Positives & False Negatives vs Threshold ({threshold}) — Segment: {segment}',
        xaxis_title=threshold,
        yaxis_title='Count',
        legend=dict(x=0.01, y=0.99),
    )
    fig.add_annotation(
        text=f"<b>Threshold Min: {round(lowest, 2)}<br><b>Threshold Max: {round(highest, 2)}",
        xref="paper", yref="paper",
        x=1, y=0.5,
        showarrow=False, align="right", valign="middle"
    )

    # Persist the sweep so it can be inspected or re-plotted later.
    sweep_table = pd.DataFrame({
        f'{threshold}': thresholds,
        'False Positives': false_positives,
        'False Negatives': false_negatives,
    })
    csv_path = os.path.join("/tmp", f"Segment_{segment}_{threshold}.csv")
    sweep_table.to_csv(csv_path, index=False)
    return fig, df_segment
def smartseg_tree():
    """Build a treemap of the pre-computed smart segments in ``smartsegments.csv``.

    The hierarchy is root -> SmartSegment -> customer_type -> acct_type. Each
    node carries the mean of the three ``*_MEAN`` columns and the sum of
    ``NUM_COUNT`` over the CSV rows it covers.

    Returns:
        Tuple ``(fig, tree_df)``: the plotly Treemap figure and the node table
        (one row per node) it was built from.
    """
    dtree = pd.read_csv('smartsegments.csv')
    dtree['SmartSegment'] = dtree['SmartSegment'].astype(int)

    metric_agg = {
        'amount_MEAN':        'mean',
        'avg_num_trxns_MEAN': 'mean',
        'avg_trxn_amt_MEAN':  'mean',
        'NUM_COUNT':          'sum',
    }

    def node(node_id, parent_id, label, stats):
        # One treemap node: identity fields followed by the aggregated metrics.
        record = {'id': node_id, 'parent': parent_id, 'label': label}
        for column in metric_agg:
            record[column] = stats[column]
        return record

    def grouped_rows(keys):
        # Aggregated rows for one hierarchy level, as (index, Series) pairs.
        return dtree.groupby(keys).agg(metric_agg).reset_index().iterrows()

    # Root node
    rows = [node('All', '', 'AML Smart Segments', dtree.agg(metric_agg))]

    # SmartSegment level
    for _, seg in grouped_rows('SmartSegment'):
        seg_id = f"SS_{int(seg['SmartSegment'])}"
        rows.append(node(seg_id, 'All', f"Segment {int(seg['SmartSegment'])}", seg))

    # SmartSegment x customer_type level
    for _, cust in grouped_rows(['SmartSegment', 'customer_type']):
        seg_id = f"SS_{int(cust['SmartSegment'])}"
        cust_id = f"{seg_id}_{cust['customer_type']}"
        rows.append(node(cust_id, seg_id, cust['customer_type'], cust))

    # Leaf: SmartSegment x customer_type x acct_type
    for _, leaf in grouped_rows(['SmartSegment', 'customer_type', 'acct_type']):
        seg_id = f"SS_{int(leaf['SmartSegment'])}"
        cust_id = f"{seg_id}_{leaf['customer_type']}"
        leaf_id = f"{cust_id}_{leaf['acct_type']}"
        rows.append(node(leaf_id, cust_id, leaf['acct_type'], leaf))

    tree_df = pd.DataFrame(rows)

    # customdata column order is referenced by index in the templates below.
    custom_columns = ('avg_num_trxns_MEAN', 'avg_trxn_amt_MEAN', 'NUM_COUNT', 'amount_MEAN')
    customdata = np.column_stack([tree_df[column].fillna(0) for column in custom_columns])

    hover = (
        '<b>%{label}</b><br>'
        'Count: %{customdata[2]:.0f}<br>'
        'Avg Trxns/Week: %{customdata[0]:.0f}<br>'
        'Avg Trxn Amt: $%{customdata[1]:.0f}<br>'
        'Avg Monthly Amt: $%{customdata[3]:.0f}<br>'
        '<extra></extra>'
    )
    text = (
        '<b>%{label}</b><br>'
        'n=%{customdata[2]:.0f}<br>'
        'trxns/wk=%{customdata[0]:.0f}<br>'
        'amt=$%{customdata[1]:.0f}'
    )
    treemap = go.Treemap(
        ids=tree_df['id'],
        labels=tree_df['label'],
        parents=tree_df['parent'],
        values=tree_df['NUM_COUNT'],
        customdata=customdata,
        hovertemplate=hover,
        texttemplate=text,
        marker=dict(
            colors=tree_df['avg_num_trxns_MEAN'].fillna(0),
            colorscale='RdBu',
            showscale=True,
            colorbar=dict(title='Avg Trxns/Wk'),
        ),
    )

    fig = go.Figure(treemap)
    fig.update_layout(
        title='AML Smart Segments',
        font_size=14,
        margin=dict(t=50, l=25, r=25, b=25),
    )
    return fig, tree_df
def remove_outliers_iqr(df, columns, lower_q=0.10, upper_q=0.90):
    """Keep only rows whose value lies inside a quantile band for every column.

    Despite the name, no ``1.5 * IQR`` fence is applied: rows are trimmed to
    the closed interval between the ``lower_q`` and ``upper_q`` quantiles
    (10th and 90th percentile by default). Columns are processed in order, so
    the quantiles of a later column are computed on the rows that survived
    the earlier ones. Rows with NaN in a processed column are dropped.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Iterable of column names to trim on.
        lower_q: Lower quantile of the band, between 0 and 1.
        upper_q: Upper quantile of the band, between 0 and 1.

    Returns:
        The filtered DataFrame (the input itself when ``columns`` is empty).
    """
    for col in columns:
        lower_bound = df[col].quantile(lower_q)
        upper_bound = df[col].quantile(upper_q)
        # ``between`` is inclusive on both ends, matching ``>=`` / ``<=``.
        df = df[df[col].between(lower_bound, upper_bound)]
    return df

def plot_pct_metric(df, metric):
    """Plot a classification metric over growing top-percentile slices of ``df``.

    For every percentile ``i`` in 0..100 the first ``i``% of rows (in the
    order given) are taken and the confusion counts are read from the
    ``true_positives``, ``false_positives``, ``true_negatives`` and
    ``false_negatives`` flag columns of that slice.
    NOTE(review): presumably the caller pre-sorts ``df`` by risk score; this
    function does not sort.

    Args:
        df: DataFrame with the four flag columns listed above.
        metric: ``'Jstat'`` (sensitivity + specificity - 1) or ``'F1'``.

    Returns:
        For ``'Jstat'``: a single plotly figure (percentile vs J).
        For ``'F1'``: a tuple ``(f1_fig, pr_fig)`` with percentile vs F1 and
        a recall vs precision curve. The maximum is starred in every figure.

    Raises:
        ValueError: if ``metric`` is not one of the supported names.
    """
    if metric not in ('Jstat', 'F1'):
        # Previously this fell through to ``max([])`` with an opaque message.
        raise ValueError(f"Unsupported metric {metric!r}: expected 'Jstat' or 'F1'")

    scores = []
    pcts = []
    precision = []
    recall = []
    for i in range(0, 101):
        df_pct = df.head(int(len(df) * (i / 100)))
        TP = int((df_pct['true_positives'] == 1).sum())
        FP = int((df_pct['false_positives'] == 1).sum())
        TN = int((df_pct['true_negatives'] == 1).sum())
        FN = int((df_pct['false_negatives'] == 1).sum())
        if metric == 'Jstat':
            # Undefined rates are scored as 0 instead of dividing by zero.
            if (TP + FN == 0) or (FP + TN == 0):
                scores.append(0)
            else:
                scores.append((TP / (TP + FN)) + (TN / (FP + TN)) - 1)
        else:
            P = TP / (TP + FP) if (TP + FP) != 0 else 0
            R = TP / (TP + FN) if (TP + FN) != 0 else 0
            precision.append(P)
            recall.append(R)
            scores.append(2 * (P * R) / (P + R) if (P + R) != 0 else 0)
        pcts.append(i / 100)

    # First occurrence of the best score.
    max_index = scores.index(max(scores))
    # Label the marker with the metric actually plotted (F1 plots used to
    # be labelled "Max J").
    max_label = 'Max J' if metric == 'Jstat' else 'Max F1'

    def mark_max(fig, x, y):
        # Star the best-scoring point on ``fig``.
        fig.add_scatter(x=[x], y=[y],
                        mode='markers', marker=dict(color='red', size=10),
                        marker_symbol=['star'],
                        name=f'{max_label}: ({scores[max_index]})')
        return fig

    if metric == 'Jstat':
        return mark_max(px.line(x=pcts, y=scores), pcts[max_index], scores[max_index])

    fig1 = mark_max(px.line(x=pcts, y=scores), pcts[max_index], scores[max_index])
    fig2 = mark_max(px.line(x=recall, y=precision), recall[max_index], precision[max_index])
    return fig1, fig2

def plot_thresholds_metric(df_segment, threshold, bump_pct, segment, metric):
    """Plot a classification metric against candidate cut-offs of one column.

    The column is first trimmed with ``remove_outliers_iqr``; the sweep then
    starts at the remaining minimum and grows the cut-off by ``bump_pct`` of
    its current value until it reaches the maximum. At each cut-off the
    confusion counts are read from the flag columns of the rows at/above it.
    NOTE(review): TN and FN are counted inside that same at/above subset,
    not among the rows below the cut-off - confirm this is intended.

    The sweep is also written to ``Jstats_segment_<segment>_<threshold>.csv``
    in the current working directory.

    Args:
        df_segment: DataFrame with the ``threshold`` column and the
            ``true_positives`` / ``false_positives`` / ``true_negatives`` /
            ``false_negatives`` flag columns.
        threshold: Name of the column to sweep.
        bump_pct: Relative step (e.g. ``0.1`` for +10% per step); must be > 0.
        segment: Label used in the CSV file name.
        metric: ``'Jstat'`` or ``'F1'``.

    Returns:
        A plotly figure of cut-off vs score, with the maximum starred when
        the sweep produced at least one point.

    Raises:
        ValueError: for an unsupported ``metric`` or a non-positive ``bump_pct``.
    """
    if metric not in ('Jstat', 'F1'):
        raise ValueError(f"Unsupported metric {metric!r}: expected 'Jstat' or 'F1'")
    if not bump_pct > 0:
        # A zero/negative bump can never reach the maximum (endless loop).
        raise ValueError(f"bump_pct must be positive, got {bump_pct!r}")

    scores = []
    thresholds = []
    df_segment = remove_outliers_iqr(df_segment, [threshold])
    threshold_min = df_segment[threshold].min()
    threshold_max = df_segment[threshold].max()
    # A multiplicative bump stalls at 0 and shrinks toward 0 for negative
    # values, so non-positive cut-offs advance linearly instead.
    fallback_step = (threshold_max - threshold_min) * bump_pct

    threshold_bump = threshold_min
    while threshold_bump < threshold_max:
        df_trxn_set = df_segment[df_segment[threshold] >= threshold_bump]
        TP = int((df_trxn_set['true_positives'] == 1).sum())
        FP = int((df_trxn_set['false_positives'] == 1).sum())
        TN = int((df_trxn_set['true_negatives'] == 1).sum())
        FN = int((df_trxn_set['false_negatives'] == 1).sum())
        if metric == 'Jstat':
            # Undefined rates are scored as 0 instead of dividing by zero.
            if (TP + FN == 0) or (FP + TN == 0):
                scores.append(0)
            else:
                scores.append((TP / (TP + FN)) + (TN / (FP + TN)) - 1)
        else:
            P = TP / (TP + FP) if (TP + FP) != 0 else 0
            R = TP / (TP + FN) if (TP + FN) != 0 else 0
            scores.append(2 * (P * R) / (P + R) if (P + R) != 0 else 0)
        thresholds.append(round(threshold_bump, 2))
        if threshold_bump > 0:
            threshold_bump = threshold_bump + (threshold_bump * bump_pct)
        else:
            threshold_bump = threshold_bump + fallback_step

    fig = px.line(x=thresholds, y=scores)
    if scores:
        # An empty sweep (no rows, or min == max) has no maximum to mark.
        max_index = scores.index(max(scores))
        max_label = 'Max J' if metric == 'Jstat' else 'Max F1'
        fig.add_scatter(x=[thresholds[max_index]], y=[scores[max_index]],
                        mode='markers', marker=dict(color='red', size=10),
                        marker_symbol=['star'],
                        name=f'{max_label}: ({scores[max_index]})')

    # Write the sweep out for this segment so it can be plotted later.
    df_Jstats = pd.DataFrame({f'YJ_{threshold}': thresholds, 'YJstats': scores})
    df_Jstats.to_csv(f"Jstats_segment_{segment}_{threshold}.csv", index=False)
    return fig

def tpr_fpr_plot(df):
    """Plot cumulative TPR against cumulative FPR over the alerted rows.

    Alerted rows are walked in their existing order. After each row the
    share of all true positives seen so far (TPR) and of all false positives
    seen so far (FPR) is recorded. A row flagged as both is counted as a
    true positive only. The starred point maximises
    ``TPR - position / number_of_alerts``, i.e. the gain over a baseline in
    which true positives are spread evenly across the alerts.

    Args:
        df: DataFrame with ``true_positives`` and ``false_positives`` flag
            columns and an alert flag column named ``alert`` (``alerts``,
            the name used elsewhere in this module, is accepted as a
            fallback).

    Returns:
        A plotly figure of FPR vs TPR; the best point is starred when there
        is at least one alerted row.
    """
    alert_col = 'alert' if 'alert' in df.columns else 'alerts'
    df_alerts = df[df[alert_col] == 1].reset_index()
    total_alerts = df_alerts.shape[0]
    if total_alerts == 0:
        # Nothing alerted: return an empty curve instead of failing.
        return px.line(x=[], y=[])

    tp_flag = (df_alerts['true_positives'] == 1).fillna(False).to_numpy(dtype=bool)
    fp_flag = (df_alerts['false_positives'] == 1).fillna(False).to_numpy(dtype=bool)
    tp_total = int(tp_flag.sum())
    fp_total = int(fp_flag.sum())

    tp_running = np.cumsum(tp_flag)
    # A row flagged as both TP and FP only advances the TP counter.
    fp_running = np.cumsum(fp_flag & ~tp_flag)

    # With no positives of a kind the rate is reported as 0 (no ZeroDivisionError).
    tpr = tp_running / tp_total if tp_total else np.zeros(total_alerts)
    fpr = fp_running / fp_total if fp_total else np.zeros(total_alerts)

    gain = tpr - np.arange(total_alerts) / total_alerts
    best = int(np.argmax(gain))  # first occurrence of the maximum
    if gain[best] > 0:
        Jstat = float(gain[best])
        max_index = best
    else:
        # No point beats the baseline: keep the initial (0, first row) pick.
        Jstat = 0
        max_index = 0

    tpr = tpr.tolist()
    fpr = fpr.tolist()
    fig = px.line(x=fpr, y=tpr)
    fig.add_scatter(x=[fpr[max_index]], y=[tpr[max_index]],
                    mode='markers', marker=dict(color='red', size=10),
                    marker_symbol=['star'],
                    name=f'Max J: ({Jstat})')
    return fig

def add_sub_plots(fig, subplot, row_id, col_id, x_title, y_title):
    """Copy every trace of ``subplot`` into one cell of ``fig`` and title its axes.

    Args:
        fig: Target figure; it is modified in place.
        subplot: Source figure whose ``data`` traces are copied.
        row_id: Row of the target cell.
        col_id: Column of the target cell.
        x_title: X-axis title for the cell.
        y_title: Y-axis title for the cell.

    Returns:
        ``fig``, to allow chained calls.
    """
    for trace in subplot.data:
        fig.add_trace(trace, row=row_id, col=col_id)
    # Set the titles once per cell (not once per trace), and also when the
    # source figure has no traces.
    fig.update_xaxes(title_text=x_title, row=row_id, col=col_id)
    fig.update_yaxes(title_text=y_title, row=row_id, col=col_id)
    return fig

def show_ss_performance():
    """Build per-segment threshold-tuning dashboards and upload them to S3.

    Legacy AWS path: downloads ``framl_ss_data_xl.xlsx`` from S3, reads its
    ``alerts`` sheet and, for each ``smart_segment_id``, assembles a 2x3
    grid of tuning plots. Each dashboard is written to
    ``threshold_tuning.html``, uploaded to the ``framl-agents`` bucket as
    ``threshold_tuning_segment_<segment>.html`` and dumped to
    ``tt_plot_<segment>.json``.

    Returns:
        The figure of the last segment processed, or ``None`` when the sheet
        has no segments or any step failed. Failures are printed, not raised.
    """
    try:
        if boto3 is None:
            # Surface the real cause instead of "'NoneType' has no attribute 'client'".
            raise RuntimeError("boto3 is not installed; show_ss_performance() needs it for S3 access")
        s3 = boto3.client('s3')
        bucket_name = 'sagemaker-us-east-1-143337186090'
        file_key = 'framl_ss_data_xl.xlsx'
        # Download the file from S3
        s3.download_file(bucket_name, file_key, 'framl_ss_data.xlsx')
        df_alerts = pd.read_excel("framl_ss_data.xlsx", sheet_name='alerts')

        fig = None  # stays None when the sheet contains no segments
        for segment in df_alerts['smart_segment_id'].unique():
            # Segment-level transactions, transaction aggregates and alerts.
            df_segment = df_alerts[df_alerts['smart_segment_id'] == segment]
            # NOTE(review): this is an array of unique values, not a scalar;
            # it is interpolated as-is into the plot title and, through
            # plot_thresholds_metric, into a CSV file name.
            segment_type = df_segment['customer_type'].unique()
            fig1 = plot_pct_metric(df_segment, 'Jstat')
            threshold = 'avg_trxn_amt'
            fig2 = plot_thresholds_metric(df_segment, threshold, .1, segment_type, 'Jstat')
            fig3 = tpr_fpr_plot(df_segment)
            fig4, fig6 = plot_pct_metric(df_segment, 'F1')
            fig5 = plot_thresholds_metric(df_segment, threshold, .1, segment_type, 'F1')

            fig = make_subplots(rows=2, cols=3)
            fig = add_sub_plots(fig, fig1, 1, 1, "Percentile", "J Statistic")
            fig = add_sub_plots(fig, fig2, 1, 2, f"{threshold}", "J Statistic")
            fig = add_sub_plots(fig, fig3, 1, 3, "FPR", "TPR")
            fig = add_sub_plots(fig, fig4, 2, 1, "Percentile", "F1")
            fig = add_sub_plots(fig, fig5, 2, 2, f"{threshold}", "F1")
            fig = add_sub_plots(fig, fig6, 2, 3, "Recall", "Precision")
            fig.update_layout(title_text=f'Threshold Tuning Plots for segment:{segment_type}', showlegend=False)

            fig.write_html("threshold_tuning.html")
            bucket_name = 'framl-agents'
            s3.upload_file("threshold_tuning.html", bucket_name, f"threshold_tuning_segment_{segment}.html")
            # NOTE(review): fig.to_json() already returns a JSON string, so
            # json.dump stores it as one quoted string - readers presumably
            # have to decode twice. Kept as-is for existing consumers.
            with open(f"tt_plot_{segment}.json", 'w') as f:
                json.dump(fig.to_json(), f)
        return fig
    except Exception as e:
        # Best-effort legacy behaviour: report the failure and return None.
        print (f"exception:{e}")
        return None

def perform_clustering(df, customer_type=None, n_clusters=4):
    """
    Cluster active customers (avg_num_trxns > 0) with K-Means on numeric +
    one-hot encoded categorical features.

    Accounts without transaction activity are excluded before clustering;
    they are not assigned to any cluster and do not appear in the result.

    Args:
        df: Customer DataFrame. ``smart_segment_id`` is required when
            ``customer_type`` is given; every feature column is optional and
            only used when present.
        customer_type: ``"Business"`` (segment 0), ``"Individual"``
            (segment 1), or anything else / ``None`` for all rows.
        n_clusters: Number of clusters; ``0`` picks K in 2..8 from the
            largest second difference of the K-Means inertia curve.

    Returns:
        (scatter_fig, stats_text, df_active): a 2-component PCA scatter
        coloured by cluster, a pre-formatted statistics block, and the active
        accounts with an added ``cluster`` column (0-based labels).

    Raises:
        ValueError: if no active account remains after filtering.
    """
    # Filter by segment
    if customer_type == "Business":
        df_work = df[df['smart_segment_id'] == 0].copy()
    elif customer_type == "Individual":
        df_work = df[df['smart_segment_id'] == 1].copy()
    else:
        df_work = df.copy()

    seg_label = customer_type or "All"

    # ── Keep only accounts with transaction history ─────────────────────
    if 'avg_num_trxns' in df_work.columns:
        df_active = df_work[df_work['avg_num_trxns'].fillna(0) > 0].copy()
    else:
        df_active = df_work.copy()

    if df_active.empty:
        # Fail fast with a clear message; scikit-learn would raise a less
        # specific ValueError on the empty feature matrix further down.
        raise ValueError(f"No active accounts to cluster for segment {seg_label!r}")

    # Imported here so the rest of the module works without scikit-learn.
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA

    # ── Feature set (avg_weekly_trxn_amt replaces avg_trxn_amt) ────────
    numeric_cols = [c for c in [
        'avg_num_trxns', 'avg_weekly_trxn_amt', 'trxn_amt_monthly',
        'INCOME', 'CURRENT_BALANCE', 'ACCT_AGE_YEARS', 'AGE'
    ] if c in df_active.columns]

    cat_cols = [c for c in [
        'ACCOUNT_TYPE', 'GENDER', 'AGE_CATEGORY', 'ACCT_OPEN_CHANNEL',
        'NNM', 'OFAC', '314b', 'CITIZENSHIP', 'RESIDENCY_COUNTRY'
    ] if c in df_active.columns]

    if cat_cols:
        df_encoded = pd.get_dummies(df_active[cat_cols], drop_first=True)
    else:
        df_encoded = pd.DataFrame(index=df_active.index)
    # Numeric gaps take the column median; anything still missing (e.g. an
    # all-NaN column) becomes 0 after the concat.
    X_num = df_active[numeric_cols].fillna(df_active[numeric_cols].median())
    X = pd.concat([X_num.reset_index(drop=True), df_encoded.reset_index(drop=True)], axis=1).fillna(0)

    X_scaled = StandardScaler().fit_transform(X)

    # ── Auto-select K via elbow ─────────────────────────────────────────
    if n_clusters == 0:
        k_range = range(2, 9)
        inertias = []
        for k in k_range:
            km = KMeans(n_clusters=k, random_state=42, n_init=10)
            km.fit(X_scaled)
            inertias.append(km.inertia_)
        # The elbow is where the inertia drop slows down the most, i.e. the
        # largest second difference of the curve.
        diffs  = [inertias[i] - inertias[i+1] for i in range(len(inertias)-1)]
        diffs2 = [diffs[i] - diffs[i+1] for i in range(len(diffs)-1)]
        n_clusters = list(k_range)[diffs2.index(max(diffs2)) + 1]
        print(f"Auto-selected K={n_clusters} clusters")

    # ── K-Means on active accounts only ────────────────────────────────
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)
    df_active['cluster'] = labels

    # ── PCA scatter ─────────────────────────────────────────────────────
    pca   = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)
    var1  = pca.explained_variance_ratio_[0] * 100
    var2  = pca.explained_variance_ratio_[1] * 100

    scatter_df = pd.DataFrame({
        'PC1':     X_pca[:, 0],
        'PC2':     X_pca[:, 1],
        'Cluster': [f'Cluster {l+1}' for l in labels],
    })

    fig = px.scatter(
        scatter_df, x='PC1', y='PC2', color='Cluster',
        title=f"Dynamic Segmentation Clustering — {seg_label} ({n_clusters} clusters, active accounts only)",
        labels={
            'PC1': f'PC1 ({var1:.1f}% variance)',
            'PC2': f'PC2 ({var2:.1f}% variance)',
        },
        opacity=0.5,
        color_discrete_sequence=px.colors.qualitative.Set1,
    )
    fig.update_traces(marker=dict(size=3))
    fig.update_layout(legend=dict(itemsizing='constant'), height=500, width=700)

    # ── Stats ────────────────────────────────────────────────────────────
    _COL_DISPLAY = {
        'avg_num_trxns':        'Avg Weekly Transactions',
        'avg_weekly_trxn_amt':  'Avg Weekly Txn Amount ($)',
        'trxn_amt_monthly':     'Monthly Txn Volume ($)',
        'INCOME':               'Income ($)',
        'CURRENT_BALANCE':      'Current Balance ($)',
        'ACCT_AGE_YEARS':       'Account Age (years)',
        'AGE':                  'Age',
    }

    n_num         = len(numeric_cols)
    n_cat_encoded = len(df_encoded.columns)
    stats_lines = [
        "=== PRE-COMPUTED CLUSTER STATS (copy verbatim, do not compute new numbers) ===",
        f"Segment: {seg_label} | Active accounts: {len(df_active):,} (excluded {len(df_work) - len(df_active):,} with no transactions)",
        f"Clusters: {n_clusters} | Features: {n_num} numeric + {n_cat_encoded} encoded categorical ({len(cat_cols)} original)",
        f"PCA variance explained: PC1={var1:.1f}%, PC2={var2:.1f}%",
        "",
    ]
    # Columns to skip in stats display per segment
    _skip_cols = set()
    if seg_label.upper() == "BUSINESS":
        _skip_cols.add("INCOME")   # income is individual-only
        _skip_cols.add("AGE")      # age not collected for businesses

    total_active = len(df_active)
    for i in range(n_clusters):
        c   = df_active[df_active['cluster'] == i]
        pct = 100 * len(c) / total_active if total_active > 0 else 0
        stats_lines.append(f"**Cluster {i+1}**")
        stats_lines.append(f"- Customers: **{len(c):,}** ({pct:.1f}% of active accounts)")
        for col in numeric_cols:
            if col in _skip_cols:
                continue
            val = c[col].median()
            if pd.notna(val):  # skip NaN medians
                label = _COL_DISPLAY.get(col, col)
                stats_lines.append(f"- {label}: **{val:,.1f}**")
        stats_lines.append("")  # blank line after each cluster block

    stats_lines.append("=== END PRE-COMPUTED CLUSTER STATS ===")
    return fig, "\n".join(stats_lines), df_active


def _cluster_title(trxns, amt, overall_trxns, overall_amt):
    """Generate a descriptive cluster title based on relative profile values."""
    freq  = "High Freq"  if trxns > overall_trxns * 1.15 else ("Low Freq"  if trxns < overall_trxns * 0.85 else "Mid Freq")
    value = "High Value" if amt   > overall_amt   * 1.15 else ("Low Value" if amt   < overall_amt   * 0.85 else "Mid Value")
    return f"{freq} / {value}"


def smartseg_tree_dynamic(df_clustered, seg_label="All", dims=None, df_rule_sweep=None):
    """
    Build a treemap from a cluster-labelled DataFrame (output of perform_clustering).

    df_clustered must contain a 'cluster' column; the input is copied, not modified.

    seg_label is used in the root node label and the figure title; the
    Income/Age hover lines are omitted only when it equals 'Business' exactly.

    dims can be:
      - None / list: same hierarchy path applied to all rows.
            e.g. ['customer_type', 'ACCOUNT_TYPE']
      - dict: customer_type is always the first level after Cluster;
            the dict maps each customer_type value to its own sub-dim path.
            e.g. {
                'BUSINESS':   ['ACCOUNT_TYPE', 'ACCOUNT_AGE_CATEGORY'],
                'INDIVIDUAL': ['ACCOUNT_TYPE', 'GENDER', 'AGE_CATEGORY', 'INCOME_BAND'],
            }

    df_rule_sweep (optional) must provide 'customer_id' and 'is_sar' columns;
    it is only used when df_clustered also has a 'customer_id' column,
    otherwise all SAR / alert / false-positive counts are 0.

    Only columns actually present in df_clustered are used.
    Each cluster gets its own distinct color; no heatmap colorscale.

    Returns the plotly treemap figure.
    """
    PALETTE = px.colors.qualitative.Set1

    if dims is None:
        dims = ['customer_type', 'ACCOUNT_TYPE']

    df = df_clustered.copy()

    # Enrich with SAR/alert info from rule sweep if provided
    if df_rule_sweep is not None and 'customer_id' in df.columns:
        # A customer counts as SAR if any of their sweep rows is flagged,
        # and as alerted if they appear in the sweep at all.
        sar_map   = df_rule_sweep.groupby('customer_id')['is_sar'].max()
        alerted   = set(df_rule_sweep['customer_id'].unique())
        df['is_sar']     = df['customer_id'].map(sar_map).fillna(0).astype(int)
        df['is_alerted'] = df['customer_id'].isin(alerted).astype(int)
        # False positive = alerted without a SAR.
        df['is_fp']      = ((df['is_alerted'] == 1) & (df['is_sar'] == 0)).astype(int)
    else:
        df['is_sar'] = 0; df['is_alerted'] = 0; df['is_fp'] = 0

    # Overall means over active accounts only for cluster title relative comparisons
    # (falls back to 1 when the column is missing or there are no active rows)
    _active_all = df[df['avg_num_trxns'].fillna(0) > 0] if 'avg_num_trxns' in df.columns else df
    overall_trxns = _active_all['avg_num_trxns'].mean()       if len(_active_all) > 0 and 'avg_num_trxns'       in _active_all.columns else 1
    overall_amt   = _active_all['avg_weekly_trxn_amt'].mean() if len(_active_all) > 0 and 'avg_weekly_trxn_amt' in _active_all.columns else 1

    # Build indicative title per cluster (all clusters are active — inactive excluded before clustering)
    # Keys are the raw 'cluster' values; the "C<n>:" prefix numbers clusters in groupby order.
    cluster_titles = {}
    for counter, (i, grp) in enumerate(df.groupby('cluster'), start=1):
        title = _cluster_title(
            grp['avg_num_trxns'].mean() if 'avg_num_trxns' in grp.columns else 0,
            grp['avg_weekly_trxn_amt'].mean() if 'avg_weekly_trxn_amt' in grp.columns else 0,
            overall_trxns, overall_amt,
        )
        cluster_titles[i] = f"C{counter}: {title}"

    df['cluster_label'] = df['cluster'].map(cluster_titles)

    # Accumulates one dict per treemap node; filled by add_row() below.
    rows = []

    def add_row(rid, parent, label, sub, cidx=None):
        """Append one treemap node summarising the rows in ``sub``."""
        # Filter to active accounts (with transactions) for transaction metrics
        active = sub[sub['avg_num_trxns'] > 0] if 'avg_num_trxns' in sub.columns else sub
        n_active = len(active)
        pct_active = round(100 * n_active / len(sub), 1) if len(sub) > 0 else 0
        rows.append({
            'id': rid, 'parent': parent, 'label': label,
            # Transaction frequency: median over active accounts (robust to outliers)
            'avg_num_trxns':       active['avg_num_trxns'].median()       if n_active > 0 and 'avg_num_trxns'       in active.columns else 0,
            # Transaction amounts: median to avoid single large-transaction accounts skewing results
            'avg_weekly_trxn_amt': active['avg_weekly_trxn_amt'].median() if n_active > 0 and 'avg_weekly_trxn_amt' in active.columns else 0,
            'trxn_amt_monthly':    active['trxn_amt_monthly'].median()    if n_active > 0 and 'trxn_amt_monthly'    in active.columns else 0,
            # Demographics: mean over all accounts in this node
            'INCOME':           sub['INCOME'].mean()              if 'INCOME' in sub.columns else 0,
            'AGE':              sub['AGE'].mean()                 if 'AGE'    in sub.columns else 0,
            'pct_active': pct_active,
            'NUM_COUNT': len(sub),
            # Raw cluster id used to pick the node colour (None for root / bucket nodes)
            'cidx': cidx,
            # AML risk counts
            'n_sar':     int(sub['is_sar'].sum()),
            'n_alerted': int(sub['is_alerted'].sum()),
            'n_fp':      int(sub['is_fp'].sum()),
        })

    def build_nodes(sub_df, parent_id, remaining_dims, cidx):
        """Recursively build treemap nodes for each dimension level."""
        if not remaining_dims:
            return
        dim = remaining_dims[0]
        if dim not in sub_df.columns:
            return
        # dropna=False keeps rows with a missing value; they are shown as 'Unknown'.
        for val, grp in sub_df.groupby(dim, dropna=False):
            val_str = str(val) if pd.notna(val) else 'Unknown'
            # Node ids embed the full path so equal values under different parents stay distinct.
            node_id = f"{parent_id}__{dim}_{val_str}"
            add_row(node_id, parent_id, val_str, grp, cidx=cidx)
            build_nodes(grp, node_id, remaining_dims[1:], cidx)

    SMALL_CLUSTER_THRESHOLD = 0.01  # clusters < 1% of total go into a "Small Clusters" group

    total_rows = len(df)
    small_clusters = {cl for cl, grp in df.groupby('cluster_label')
                      if len(grp) / total_rows < SMALL_CLUSTER_THRESHOLD} if total_rows > 0 else set()

    # Root
    add_row('All', '', f'Smart Segments - {seg_label}', df, cidx=None)

    # Add a "Small Clusters" bucket if any clusters are below threshold
    if small_clusters:
        df_small = df[df['cluster_label'].isin(small_clusters)]
        add_row('SMALL', 'All', f'Small Clusters (<1%) — {len(df_small):,} accounts', df_small, cidx=None)

    # Cluster level
    for cl, grp in df.groupby('cluster_label'):
        cid  = f"CL__{cl}"
        # Reverse lookup: recover the raw cluster id from its display title.
        cidx = next((k for k, v in cluster_titles.items() if v == cl), None)
        parent = 'SMALL' if cl in small_clusters else 'All'
        add_row(cid, parent, cl, grp, cidx=cidx)

        if isinstance(dims, dict):
            # customer_type is always the first level; each type gets its own sub-dims
            if 'customer_type' not in grp.columns:
                continue
            for ct, cgrp in grp.groupby('customer_type'):
                ctid = f"{cid}__ct_{ct}"
                add_row(ctid, cid, ct, cgrp, cidx=cidx)
                # Customer types without an entry in ``dims`` get no deeper levels.
                ct_sub_dims = [d for d in dims.get(ct, []) if d in cgrp.columns]
                build_nodes(cgrp, ctid, ct_sub_dims, cidx)
        else:
            # List mode: recurse through all dims uniformly
            active_dims = [d for d in dims if d in grp.columns]
            build_nodes(grp, cid, active_dims, cidx)

    tree_df = pd.DataFrame(rows)

    # Boost small cluster display values so they're visible in the treemap.
    # Use 5% of total as the minimum display size.
    # NOTE(review): NUM_COUNT also feeds customdata[2] below, so the hover/text
    # "Count" of boosted nodes appears to show the boosted value rather than the
    # real count — confirm whether that is intended.
    if small_clusters:
        min_display = int(max(total_rows * 0.05, 1))
        small_ids = {f"CL__{cl}" for cl in small_clusters} | {'SMALL'}
        tree_df.loc[tree_df['id'].isin(small_ids), 'NUM_COUNT'] = \
            tree_df.loc[tree_df['id'].isin(small_ids), 'NUM_COUNT'].clip(lower=min_display).astype(int)

    # Per-node colors: neutral grey for nodes without a cluster id (root and
    # the "Small Clusters" bucket), cluster color for all other nodes
    node_colors = []
    for _, r in tree_df.iterrows():
        if r['cidx'] is None or pd.isna(r['cidx']):
            node_colors.append('#CCCCCC')
        else:
            # Palette wraps around when there are more clusters than colours.
            node_colors.append(PALETTE[int(r['cidx']) % len(PALETTE)])

    # The trailing "# n" comments give the customdata index used in the templates.
    # NOTE(review): the templates label the transaction figures "Avg", but
    # add_row() stores medians over active accounts for them.
    # NOTE(review): the 'Business' comparison below is case-sensitive, unlike
    # the upper-cased check in perform_clustering.
    fig = go.Figure(go.Treemap(
        ids=tree_df['id'],
        labels=tree_df['label'],
        parents=tree_df['parent'],
        values=tree_df['NUM_COUNT'],
        customdata=np.column_stack([
            tree_df['avg_num_trxns'].fillna(0),       # 0
            tree_df['avg_weekly_trxn_amt'].fillna(0), # 1
            tree_df['NUM_COUNT'].fillna(0),            # 2
            tree_df['trxn_amt_monthly'].fillna(0),     # 3
            tree_df['INCOME'].fillna(0),               # 4
            tree_df['AGE'].fillna(0),                  # 5
            tree_df['pct_active'].fillna(0),           # 6
            tree_df['n_sar'].fillna(0),                # 7
            tree_df['n_alerted'].fillna(0),            # 8
            tree_df['n_fp'].fillna(0),                 # 9
        ]),
        hovertemplate=(
            '<b>%{label}</b><br>'
            'Count: %{customdata[2]:.0f}<br>'
            'Active (w/ txns): %{customdata[6]:.1f}%<br>'
            'Avg Trxns/Week: %{customdata[0]:.1f}<br>'
            'Avg Weekly Trxn Amt: $%{customdata[1]:.0f}<br>'
            'Avg Monthly Trxn Amt: $%{customdata[3]:.0f}<br>'
            + ('' if seg_label == 'Business' else
               'Avg Income: $%{customdata[4]:.0f}<br>'
               'Avg Age: %{customdata[5]:.0f}<br>')
            + '─────────────────<br>'
            'Alerts: %{customdata[8]:.0f} | SARs: %{customdata[7]:.0f} | FPs: %{customdata[9]:.0f}<br>'
            '<extra></extra>'
        ),
        texttemplate=(
            '<b>%{label}</b><br>'
            'n=%{customdata[2]:.0f}<br>'
            'SAR=%{customdata[7]:.0f} FP=%{customdata[9]:.0f}<br>'
            'wk=$%{customdata[1]:.0f}'
        ),
        marker=dict(colors=node_colors),
    ))
    fig.update_layout(
        title=f'AML Smart Segments (Dynamic Clustering) - {seg_label}',
        font_size=14,
        margin=dict(t=50, l=25, r=25, b=25),
        height=500, width=700,
    )
    return fig


def lambda_handler(event, context):
    """AWS Lambda entry point for the Bedrock agent action group.

    Runs ``show_ss_performance()`` and reports where the generated
    threshold-tuning files were uploaded.

    Args:
        event: Bedrock agent invocation event; ``actionGroup``, ``function``
            and ``messageVersion`` are read from it.
        context: Lambda context object (unused).

    Returns:
        The response dict echoing ``actionGroup``, ``function`` and
        ``messageVersion`` with a TEXT response body.
    """
    action_group = event['actionGroup']
    function = event['function']

    # show_ss_performance() returns the last figure on success and None on
    # failure - not a bucket name. The figure used to be interpolated into
    # the response, which is not JSON-serialisable.
    result = show_ss_performance()
    # Upload bucket used by show_ss_performance(); keep the two in sync.
    bucket_name = 'framl-agents'

    if result is None:
        body_text = 'segment level threshold tuning failed; no files were created (see the function logs)'
    else:
        body_text = f'segment level threshold tuning files are created in the S3 bucket:{bucket_name}'

    # Execute your business logic here. For more information, refer to: https://docs.aws.amazon.com/bedrock/latest/userguide/agents-lambda.html
    # NOTE(review): the Bedrock response format documents sessionAttributes
    # as a top-level response field, not as part of responseBody, and the S3
    # key below only mirrors the bucket name. Both are kept where they were
    # for existing consumers - confirm and relocate.
    responseBody = {
        "TEXT": {
            "body": body_text
        },
        "sessionAttributes": {
            "generatedFileS3Bucket": bucket_name,
            "generatedFileS3Key": bucket_name
        }
    }

    action_response = {
        'actionGroup': action_group,
        'function': function,
        'functionResponse': {
            'responseBody': responseBody
        }
    }
    response = {'response': action_response, 'messageVersion': event['messageVersion']}
    print("Response: {}".format(response))

    return response
if __name__ == "__main__":
    # Manual run of the legacy S3 dashboard pipeline; the result is the last
    # figure built, or None on failure.
    response = show_ss_performance()