Spaces:
Paused
Paused
| import math | |
| import os | |
| import pandas as pd | |
| import numpy as np | |
| import plotly.graph_objects as go | |
| from plotly.subplots import make_subplots | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| import json | |
| #import kaleido | |
def alerts_distribution(df):
    """Build a grouped bar chart of alert and false-positive counts per segment.

    Rows are counted per ``dynamic_segment`` code (0 is plotted as
    'Business', 1 as 'Individual') where the ``alerts`` or
    ``false_positives`` column equals 1.

    Returns
    -------
    plotly.graph_objects.Figure with one bar series per flag.
    """
    segment_names = ['Business', 'Individual']

    def counts_for(flag_column):
        # One row count per segment code, ordered like ``segment_names``.
        return [
            df[(df['dynamic_segment'] == code) & (df[flag_column] == 1)].shape[0]
            for code in (0, 1)
        ]

    bars = [
        go.Bar(name='Total Alerts', x=segment_names, y=counts_for('alerts')),
        go.Bar(name='False Positives', x=segment_names, y=counts_for('false_positives')),
    ]
    chart = go.Figure(bars)
    chart.update_layout(barmode='group', title="Alerts distribution across Segments")
    return chart
def plot_thresholds_tuning(df_segment, threshold, bump_pct, segment):
    """Plot false-positive / false-negative counts across a threshold sweep.

    The sweep starts at the column minimum and advances in fixed steps of
    ``max(1, int(range / 100))`` until it passes the column maximum by one
    step, so the last points lie above every observed value.

    Parameters
    ----------
    df_segment : DataFrame with the ``threshold`` column plus
                 ``false_positives`` and ``false_negatives`` flag columns.
    threshold  : name of the column being swept.
    bump_pct   : unused by this function; kept so the signature stays
                 compatible with existing callers.
    segment    : label used in the figure title and the CSV file name.

    Returns
    -------
    (fig, df_segment) - the plotly figure and the unmodified input frame.

    Side effects
    ------------
    Writes ``/tmp/Segment_{segment}_{threshold}.csv`` with the swept values.

    Raises
    ------
    ValueError if the threshold column has no non-null values.
    """
    threshold_min = df_segment[threshold].min()
    threshold_max = df_segment[threshold].max()
    if pd.isna(threshold_min) or pd.isna(threshold_max):
        # Previously this surfaced as an opaque "cannot convert float NaN to integer".
        raise ValueError(
            f"Column '{threshold}' has no usable values for segment {segment}"
        )
    step = max(1, int((threshold_max - threshold_min) / 100))

    # Select the flagged rows once instead of re-filtering the frame per step.
    fp_values = df_segment.loc[df_segment['false_positives'] == 1, threshold]
    fn_values = df_segment.loc[df_segment['false_negatives'] == 1, threshold]

    thresholds = []
    false_positives = []
    false_negatives = []
    cutoff = threshold_min
    while cutoff <= threshold_max + step:
        # FPs still alerting at/above the cutoff; FNs sitting below it.
        false_positives.append(int((fp_values >= cutoff).sum()))
        false_negatives.append(int((fn_values < cutoff).sum()))
        thresholds.append(round(cutoff, 2))
        cutoff = cutoff + step

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=thresholds, y=false_positives, mode='lines', name='False Positives',
                             line=dict(color='#EF553B', width=2)))
    fig.add_trace(go.Scatter(x=thresholds, y=false_negatives, mode='lines', name='False Negatives',
                             line=dict(color='#636EFA', width=2)))
    fig.update_layout(
        # The separator was a mis-encoded character; restored to an em dash.
        title=f'False Positives & False Negatives vs Threshold ({threshold}) — Segment: {segment}',
        xaxis_title=threshold,
        yaxis_title='Count',
        legend=dict(x=0.01, y=0.99),
    )
    fig.add_annotation(
        text=f"<b>Threshold Min: {round(threshold_min, 2)}<br><b>Threshold Max: {round(threshold_max, 2)}",
        xref="paper", yref="paper",
        x=1, y=0.5,
        showarrow=False, align="right", valign="middle"
    )

    df_thresholds = pd.DataFrame({
        f'{threshold}': thresholds,
        'False Positives': false_positives,
        'False Negatives': false_negatives,
    })
    df_thresholds.to_csv(os.path.join("/tmp", f"Segment_{segment}_{threshold}.csv"), index=False)
    return fig, df_segment
def smartseg_tree():
    """Build a treemap of the pre-computed smart segments.

    Reads ``smartsegments.csv`` from the working directory and aggregates it
    at four levels: everything, SmartSegment, SmartSegment x customer_type,
    and SmartSegment x customer_type x acct_type. The ``*_MEAN`` columns are
    averaged and ``NUM_COUNT`` is summed at each level.

    Returns
    -------
    (fig, tree_df) - the plotly treemap and the node table behind it.
    """
    segments = pd.read_csv('smartsegments.csv')
    segments['SmartSegment'] = segments['SmartSegment'].astype(int)

    metric_aggs = {
        'amount_MEAN': 'mean',
        'avg_num_trxns_MEAN': 'mean',
        'avg_trxn_amt_MEAN': 'mean',
        'NUM_COUNT': 'sum',
    }

    def node(node_id, parent_id, text, stats):
        # One treemap node: identity fields followed by the aggregated metrics.
        entry = {'id': node_id, 'parent': parent_id, 'label': text}
        for metric in metric_aggs:
            entry[metric] = stats[metric]
        return entry

    def aggregated(keys):
        # Yield one aggregated record per group for the given grouping key(s).
        table = segments.groupby(keys).agg(metric_aggs).reset_index()
        for _, record in table.iterrows():
            yield record

    # Root node
    nodes = [node('All', '', 'AML Dynamic Segments', segments.agg(metric_aggs))]

    # SmartSegment level
    for rec in aggregated('SmartSegment'):
        seg_no = int(rec['SmartSegment'])
        nodes.append(node(f"SS_{seg_no}", 'All', f"Segment {seg_no}", rec))

    # SmartSegment x customer_type level
    for rec in aggregated(['SmartSegment', 'customer_type']):
        seg_id = f"SS_{int(rec['SmartSegment'])}"
        nodes.append(node(f"{seg_id}_{rec['customer_type']}", seg_id, rec['customer_type'], rec))

    # Leaf: SmartSegment x customer_type x acct_type
    for rec in aggregated(['SmartSegment', 'customer_type', 'acct_type']):
        seg_id = f"SS_{int(rec['SmartSegment'])}"
        cust_id = f"{seg_id}_{rec['customer_type']}"
        nodes.append(node(f"{cust_id}_{rec['acct_type']}", cust_id, rec['acct_type'], rec))

    tree_df = pd.DataFrame(nodes)

    # customdata column order is referenced by index in the templates below.
    hover_columns = ['avg_num_trxns_MEAN', 'avg_trxn_amt_MEAN', 'NUM_COUNT', 'amount_MEAN']
    custom = np.column_stack([tree_df[name].fillna(0) for name in hover_columns])

    hover_text = (
        '<b>%{label}</b><br>'
        'Count: %{customdata[2]:.0f}<br>'
        'Avg Trxns/Week: %{customdata[0]:.0f}<br>'
        'Avg Trxn Amt: $%{customdata[1]:.0f}<br>'
        'Avg Monthly Amt: $%{customdata[3]:.0f}<br>'
        '<extra></extra>'
    )
    tile_text = (
        '<b>%{label}</b><br>'
        'n=%{customdata[2]:.0f}<br>'
        'trxns/wk=%{customdata[0]:.0f}<br>'
        'amt=$%{customdata[1]:.0f}'
    )
    treemap = go.Treemap(
        ids=tree_df['id'],
        labels=tree_df['label'],
        parents=tree_df['parent'],
        values=tree_df['NUM_COUNT'],
        customdata=custom,
        hovertemplate=hover_text,
        texttemplate=tile_text,
        marker=dict(
            colors=tree_df['avg_num_trxns_MEAN'].fillna(0),
            colorscale='RdBu',
            showscale=True,
            colorbar=dict(title='Avg Trxns/Wk'),
        ),
    )
    fig = go.Figure(treemap)
    fig.update_layout(
        title='AML Dynamic Segments',
        font_size=14,
        margin=dict(t=50, l=25, r=25, b=25),
    )
    return fig, tree_df
def remove_outliers_iqr(df, columns, lower_q=0.10, upper_q=0.90, whisker=0.0):
    """Drop rows whose value in any listed column falls outside a quantile band.

    Columns are processed in order and each filter is applied to the rows
    that survived the previous column, so later quantiles are computed on
    already-trimmed data. Rows with NaN in a listed column are dropped
    because the comparisons evaluate to False.

    With the defaults this keeps values between the 10th and 90th
    percentile inclusive, which is what the function has always done. The
    earlier body also computed ``quantile -/+ 1.5 * spread`` fences but never
    applied them; pass ``whisker=1.5`` to opt in to that wider band.

    Parameters
    ----------
    df       : input DataFrame (not modified; a filtered frame is returned).
    columns  : iterable of column names to trim on.
    lower_q  : lower quantile of the band (default 0.10).
    upper_q  : upper quantile of the band (default 0.90).
    whisker  : multiple of ``upper - lower`` added on each side of the band
               (default 0.0 = no extension).

    Raises
    ------
    ValueError if the quantiles are not ordered within [0, 1] or the
    whisker is negative.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError("lower_q and upper_q must satisfy 0 <= lower_q <= upper_q <= 1")
    if whisker < 0:
        raise ValueError("whisker must be non-negative")

    for col in columns:
        lower_bound = df[col].quantile(lower_q)
        upper_bound = df[col].quantile(upper_q)
        if whisker:
            # Only touch the bounds when asked, so the default stays exact
            # even for infinite quantiles (0 * inf would be NaN).
            margin = whisker * (upper_bound - lower_bound)
            lower_bound = lower_bound - margin
            upper_bound = upper_bound + margin
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
    return df
def plot_pct_metric(df, metric):
    """Plot Youden's J or F1 over growing head-fractions of ``df``.

    For each percentage 0..100 the first ``len(df) * pct // 100`` rows are
    taken (in the frame's existing order) and the confusion counts are read
    from the ``true_positives``, ``false_positives``, ``true_negatives`` and
    ``false_negatives`` flag columns (value 1 = member).

    Parameters
    ----------
    df     : DataFrame holding the four flag columns.
    metric : 'Jstat' or 'F1'.

    Returns
    -------
    'Jstat' -> a single figure (score vs. fraction of rows).
    'F1'    -> (score figure, precision-vs-recall figure).
    Each figure marks the point of maximum score with a red star.

    Raises
    ------
    ValueError for any other ``metric`` (previously an opaque ``max()`` of
    an empty list).
    """
    if metric not in ('Jstat', 'F1'):
        raise ValueError(f"Unsupported metric {metric!r}; expected 'Jstat' or 'F1'")

    n_rows = len(df)

    def running_count(column):
        # counts[k] = number of flagged rows among the first k rows.
        flags = (df[column] == 1).to_numpy()
        return np.concatenate(([0], np.cumsum(flags)))

    tp_cum = running_count('true_positives')
    fp_cum = running_count('false_positives')
    tn_cum = running_count('true_negatives')
    fn_cum = running_count('false_negatives')

    scores = []
    pcts = []
    Precision = []
    Recall = []
    for i in range(0, 101):
        # Integer arithmetic: int(len * (i / 100)) under-counts for some i
        # because of float rounding (e.g. 100 * 0.29 -> 28.999...).
        k = n_rows * i // 100
        TP, FP, TN, FN = int(tp_cum[k]), int(fp_cum[k]), int(tn_cum[k]), int(fn_cum[k])

        if metric == 'Jstat':
            if (TP + FN == 0) or (FP + TN == 0):
                score = 0
            else:
                score = (TP / (TP + FN)) + (TN / (FP + TN)) - 1
        else:
            P = TP / (TP + FP) if (TP + FP) != 0 else 0
            R = TP / (TP + FN) if (TP + FN) != 0 else 0
            Precision.append(P)
            Recall.append(R)
            score = 2 * (P * R) / (P + R) if (P + R) != 0 else 0
        scores.append(score)
        pcts.append(i / 100)

    max_index = scores.index(max(scores))
    # The marker used to read "Max J" for F1 as well.
    marker_name = f"Max {'J' if metric == 'Jstat' else 'F1'}: ({scores[max_index]})"

    def mark_best(figure, x_value, y_value):
        # Highlight the maximum point
        figure.add_scatter(x=[x_value], y=[y_value],
                           mode='markers', marker=dict(color='red', size=10),
                           marker_symbol=['star'],
                           name=marker_name)

    score_fig = px.line(x=pcts, y=scores)
    mark_best(score_fig, pcts[max_index], scores[max_index])
    if metric == 'Jstat':
        return score_fig

    pr_fig = px.line(x=Recall, y=Precision)
    mark_best(pr_fig, Recall[max_index], Precision[max_index])
    return score_fig, pr_fig
def plot_thresholds_metric(df_segment, threshold, bump_pct, segment, metric):
    """Plot Youden's J or F1 across a geometric sweep of a threshold column.

    The frame is first passed through ``remove_outliers_iqr`` for the
    threshold column. Starting at the remaining minimum, the cutoff grows by
    ``cutoff * bump_pct`` each step until it reaches the maximum. At every
    cutoff the confusion counts are taken from the rows whose value is
    at/above the cutoff, using the ``true_positives`` / ``false_positives`` /
    ``true_negatives`` / ``false_negatives`` flag columns.

    Parameters
    ----------
    df_segment : DataFrame with the threshold column and the four flags.
    threshold  : name of the column being swept.
    bump_pct   : fractional growth per step; must be > 0.
    segment    : label used in the CSV file name.
    metric     : 'Jstat' or 'F1'.

    Returns
    -------
    plotly figure of score vs. threshold; the maximum is marked with a red
    star when the sweep produced at least one point.

    Side effects
    ------------
    Writes ``Jstats_segment_{segment}_{threshold}.csv`` to the working
    directory (columns ``YJ_{threshold}`` and ``YJstats``, for both metrics).

    Raises
    ------
    ValueError for an unknown ``metric`` or a non-positive ``bump_pct``.
    """
    if metric not in ('Jstat', 'F1'):
        raise ValueError(f"Unsupported metric {metric!r}; expected 'Jstat' or 'F1'")
    if bump_pct <= 0:
        # A zero or negative bump never advances the cutoff (endless loop).
        raise ValueError("bump_pct must be greater than 0")

    df_segment = remove_outliers_iqr(df_segment, [threshold])
    threshold_min = df_segment[threshold].min()
    threshold_max = df_segment[threshold].max()
    span = threshold_max - threshold_min

    values = df_segment[threshold]
    is_tp = df_segment['true_positives'] == 1
    is_fp = df_segment['false_positives'] == 1
    is_tn = df_segment['true_negatives'] == 1
    is_fn = df_segment['false_negatives'] == 1

    def score_at(cutoff):
        # Confusion counts among rows at/above the cutoff.
        selected = values >= cutoff
        TP = int((selected & is_tp).sum())
        FP = int((selected & is_fp).sum())
        TN = int((selected & is_tn).sum())
        FN = int((selected & is_fn).sum())
        if metric == 'Jstat':
            if (TP + FN == 0) or (FP + TN == 0):
                return 0
            return (TP / (TP + FN)) + (TN / (FP + TN)) - 1
        P = TP / (TP + FP) if (TP + FP) != 0 else 0
        R = TP / (TP + FN) if (TP + FN) != 0 else 0
        return 2 * (P * R) / (P + R) if (P + R) != 0 else 0

    scores = []
    thresholds = []
    cutoff = threshold_min
    # NaN bounds (empty frame) compare False, so the loop is simply skipped.
    while cutoff < threshold_max:
        scores.append(score_at(cutoff))
        thresholds.append(round(cutoff, 2))
        step = cutoff * bump_pct
        if step <= 0:
            # Multiplicative growth stalls at zero and runs backwards below
            # it; fall back to an additive step so the sweep always ends.
            step = span * bump_pct
        cutoff = cutoff + step

    fig = px.line(x=thresholds, y=scores)
    if scores:
        max_index = scores.index(max(scores))
        label = 'J' if metric == 'Jstat' else 'F1'
        fig.add_scatter(x=[thresholds[max_index]], y=[scores[max_index]],
                        mode='markers', marker=dict(color='red', size=10),
                        marker_symbol=['star'],
                        name=f'Max {label}: ({scores[max_index]})')

    # Write this out to a file for this segment for plotting later.
    df_Jstats = pd.DataFrame({f'YJ_{threshold}': thresholds, 'YJstats': scores})
    df_Jstats.to_csv(f"Jstats_segment_{segment}_{threshold}.csv", index=False)
    return fig
def tpr_fpr_plot(df):
    """Plot cumulative TPR against cumulative FPR over the alerted rows.

    Rows with ``alert == 1`` are walked in their existing order. A row
    counts as a true positive when ``true_positives == 1``, otherwise as a
    false positive when ``false_positives == 1``. The starred point is the
    row that maximises ``TPR - position / total_alerts`` (position is
    0-based), i.e. the gain over the fraction of alerts reviewed so far.

    NOTE(review): this reads column ``alert`` while ``alerts_distribution``
    reads ``alerts`` - confirm which name the input frame really carries.

    Returns
    -------
    plotly figure. With no alerted rows an empty figure is returned; with
    no true (or false) positives the corresponding rate stays at 0 instead
    of raising ZeroDivisionError.
    """
    df_alerts = df[df['alert'] == 1].reset_index()
    total_alerts = df_alerts.shape[0]
    if total_alerts == 0:
        # Nothing to plot; the old code failed indexing an empty list here.
        return go.Figure()

    tp_flags = (df_alerts['true_positives'] == 1).to_numpy()
    fp_flags = (df_alerts['false_positives'] == 1).to_numpy()
    tp_total = int(tp_flags.sum())
    fp_total = int(fp_flags.sum())

    # A row flagged as both TP and FP is counted as TP only.
    tp_running = np.cumsum(tp_flags)
    fp_running = np.cumsum(fp_flags & ~tp_flags)

    tpr_values = tp_running / tp_total if tp_total else np.zeros(total_alerts)
    fpr_values = fp_running / fp_total if fp_total else np.zeros(total_alerts)

    # Second term is the random-guess baseline at each position.
    gain = tpr_values - np.arange(total_alerts) / total_alerts
    best = int(np.argmax(gain))  # first occurrence of the maximum
    if gain[best] > 0:
        Jstat = float(gain[best])
        max_index = best
    else:
        Jstat = 0
        max_index = 0

    tpr = tpr_values.tolist()
    fpr = fpr_values.tolist()
    fig = px.line(x=fpr, y=tpr)
    fig.add_scatter(x=[fpr[max_index]], y=[tpr[max_index]],
                    mode='markers', marker=dict(color='red', size=10),
                    marker_symbol=['star'],
                    name=f'Max J: ({Jstat})')
    return fig
def add_sub_plots(fig, subplot, row_id, col_id, x_title, y_title):
    """Copy every trace of ``subplot`` into one cell of ``fig`` and title its axes.

    ``fig`` is updated in place and also returned.
    """
    cell = {'row': row_id, 'col': col_id}
    for series in subplot.data:
        fig.add_trace(series, **cell)
    fig.update_xaxes(title_text=x_title, **cell)
    fig.update_yaxes(title_text=y_title, **cell)
    return fig
def perform_clustering(df, customer_type=None, n_clusters=4):
    """
    Cluster active customers with K-Means and summarise each cluster.

    Only accounts with ``avg_num_trxns > 0`` are clustered; accounts without
    transactions are excluded entirely (they are not given a cluster). If
    the column is absent, every row is treated as active.

    Parameters
    ----------
    df            : customer-level DataFrame. ``dynamic_segment`` (0 =
                    Business, 1 = Individual) is required when
                    ``customer_type`` is "Business" or "Individual".
    customer_type : "Business", "Individual", or anything else for all rows.
    n_clusters    : number of clusters; 0 selects K automatically from
                    2..8 by the largest second difference of the inertia.

    Returns
    -------
    (scatter_fig, stats_text, df_active)
        scatter_fig : PCA (2 components) scatter coloured by cluster.
        stats_text  : pre-formatted per-cluster medians as a single string.
        df_active   : the clustered rows with an added ``cluster`` column.

    Raises
    ------
    ValueError if no active rows or no usable feature columns remain.
    scikit-learn raises its own errors when ``n_clusters`` exceeds the
    number of rows or fewer than two features are available for the PCA.
    """
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA

    # Filter by segment
    if customer_type == "Business":
        df_work = df[df['dynamic_segment'] == 0].copy()
    elif customer_type == "Individual":
        df_work = df[df['dynamic_segment'] == 1].copy()
    else:
        df_work = df.copy()
    seg_label = customer_type or "All"

    # -- Keep only accounts with transaction history --
    if 'avg_num_trxns' in df_work.columns:
        df_active = df_work[df_work['avg_num_trxns'].fillna(0) > 0].copy()
    else:
        df_active = df_work.copy()
    if df_active.empty:
        raise ValueError(f"No active accounts to cluster for segment '{seg_label}'")

    # -- Feature set (avg_weekly_trxn_amt replaces avg_trxn_amt) --
    numeric_cols = [c for c in [
        'avg_num_trxns', 'avg_weekly_trxn_amt', 'trxn_amt_monthly',
        'INCOME', 'CURRENT_BALANCE', 'ACCT_AGE_YEARS', 'AGE'
    ] if c in df_active.columns]
    cat_cols = [c for c in [
        'ACCOUNT_TYPE', 'GENDER', 'AGE_CATEGORY', 'ACCT_OPEN_CHANNEL',
        'NNM', 'OFAC', '314b', 'CITIZENSHIP', 'RESIDENCY_COUNTRY'
    ] if c in df_active.columns]
    if not numeric_cols and not cat_cols:
        raise ValueError("None of the expected clustering feature columns are present")

    if cat_cols:
        df_encoded = pd.get_dummies(df_active[cat_cols], drop_first=True)
    else:
        df_encoded = pd.DataFrame(index=df_active.index)
    # Median-impute numerics; anything still missing (all-NaN column) becomes 0.
    X_num = df_active[numeric_cols].fillna(df_active[numeric_cols].median())
    X = pd.concat(
        [X_num.reset_index(drop=True), df_encoded.reset_index(drop=True)], axis=1
    ).fillna(0)
    X_scaled = StandardScaler().fit_transform(X)

    # -- Auto-select K via elbow --
    if n_clusters == 0:
        # K-Means cannot fit more clusters than rows, so cap the search.
        candidate_ks = list(range(2, min(8, len(X_scaled)) + 1))
        if len(candidate_ks) < 3:
            # Too few candidates for a second difference; take the smallest.
            n_clusters = candidate_ks[0] if candidate_ks else 1
        else:
            inertias = []
            for k in candidate_ks:
                km = KMeans(n_clusters=k, random_state=42, n_init=10)
                km.fit(X_scaled)
                inertias.append(km.inertia_)
            diffs = [a - b for a, b in zip(inertias, inertias[1:])]
            diffs2 = [a - b for a, b in zip(diffs, diffs[1:])]
            n_clusters = candidate_ks[diffs2.index(max(diffs2)) + 1]
        print(f"Auto-selected K={n_clusters} clusters")

    # -- K-Means on active accounts only --
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)
    df_active['cluster'] = labels

    # -- PCA scatter --
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)
    var1 = pca.explained_variance_ratio_[0] * 100
    var2 = pca.explained_variance_ratio_[1] * 100
    scatter_df = pd.DataFrame({
        'PC1': X_pca[:, 0],
        'PC2': X_pca[:, 1],
        'Cluster': [f'Cluster {l+1}' for l in labels],
    })
    # Sort so legend appears in numeric order (1, 2, 3, 4) regardless of KMeans label assignment
    cluster_order = [f'Cluster {i+1}' for i in range(n_clusters)]
    scatter_df['Cluster'] = pd.Categorical(scatter_df['Cluster'], categories=cluster_order, ordered=True)
    scatter_df = scatter_df.sort_values('Cluster')
    fig = px.scatter(
        scatter_df, x='PC1', y='PC2', color='Cluster',
        category_orders={'Cluster': cluster_order},
        # The separator was a mis-encoded character; restored to an em dash.
        title=f"Dynamic Segmentation Clustering — {seg_label} ({n_clusters} clusters, active accounts only)",
        labels={
            'PC1': f'PC1 ({var1:.1f}% variance)',
            'PC2': f'PC2 ({var2:.1f}% variance)',
        },
        opacity=0.5,
        color_discrete_sequence=px.colors.qualitative.Set1,
    )
    fig.update_traces(marker=dict(size=3))
    fig.update_layout(legend=dict(itemsizing='constant'))

    # -- Stats --
    _COL_DISPLAY = {
        'avg_num_trxns': 'Avg Weekly Transactions',
        'avg_weekly_trxn_amt': 'Avg Weekly Txn Amount',
        'trxn_amt_monthly': 'Monthly Txn Volume',
        'INCOME': 'Income',
        'CURRENT_BALANCE': 'Current Balance',
        'ACCT_AGE_YEARS': 'Account Age (years)',
        'AGE': 'Age',
    }
    _DOLLAR_COLS = {'avg_weekly_trxn_amt', 'trxn_amt_monthly', 'INCOME', 'CURRENT_BALANCE'}
    n_num = len(numeric_cols)
    n_cat_encoded = len(df_encoded.columns)
    stats_lines = [
        "=== PRE-COMPUTED CLUSTER STATS (copy verbatim, do not compute new numbers) ===",
        f"Segment: {seg_label} | Active accounts: {len(df_active):,} (excluded {len(df_work) - len(df_active):,} with no transactions)",
        f"Clusters: {n_clusters} | Features: {n_num} numeric + {n_cat_encoded} encoded categorical ({len(cat_cols)} original)",
        f"PCA variance explained: PC1={var1:.1f}%, PC2={var2:.1f}%",
        "",
    ]
    # Columns to skip in stats display per segment
    _skip_cols = set()
    if seg_label.upper() == "BUSINESS":
        _skip_cols.add("INCOME")  # income is individual-only
        _skip_cols.add("AGE")     # age not collected for businesses

    total_active = len(df_active)
    for i in range(n_clusters):
        c = df_active[df_active['cluster'] == i]
        pct = 100 * len(c) / total_active if total_active > 0 else 0
        stats_lines.append(f"**Cluster {i+1}**")
        stats_lines.append(f"- Customers: **{len(c):,}** ({pct:.1f}% of active accounts)")
        for col in numeric_cols:
            if col in _skip_cols:
                continue
            val = c[col].median()
            if pd.notna(val):  # skip NaN
                label = _COL_DISPLAY.get(col, col)
                fmt = f"${val:,.0f}" if col in _DOLLAR_COLS else f"{val:,.1f}"
                stats_lines.append(f"- {label}: **{fmt}**")
        stats_lines.append("")  # blank line after each cluster block
    stats_lines.append("=== END PRE-COMPUTED CLUSTER STATS ===")
    return fig, "\n".join(stats_lines), df_active
| def _cluster_title(trxns, amt, overall_trxns, overall_amt): | |
| """Generate a descriptive cluster title based on relative profile values.""" | |
| freq = "High Freq" if trxns > overall_trxns * 1.15 else ("Low Freq" if trxns < overall_trxns * 0.85 else "Mid Freq") | |
| value = "High Value" if amt > overall_amt * 1.15 else ("Low Value" if amt < overall_amt * 0.85 else "Mid Value") | |
| return f"{freq} / {value}" | |
# Columns excluded from treemap dimension discovery - IDs, numerics, internal flags.
# Compared against the lower-cased column label.
_DIM_EXCLUDE = {
    'customer_id', 'account_id', 'cluster', 'cluster_label', 'dynamic_segment',
    'is_sar', 'is_fp', 'is_alerted', 'is_fn', 'pct_active',
    'avg_num_trxns', 'avg_weekly_trxn_amt', 'trxn_amt_monthly', 'avg_trxn_amt',
    'income', 'current_balance', 'acct_age_years', 'age',
    'total_trxn_amt', 'cashout_count', 'sar_score', 'alert_count',
    'customer_type',  # used as the segment split level, not a sub-dimension
}


def discover_dims(df, segment=None, availability=0.70, max_cardinality=20):
    """
    Discover categorical columns suitable as treemap dimensions from df.

    A column qualifies when its lower-cased label is not in ``_DIM_EXCLUDE``,
    at least ``availability`` of its values are non-null, and it has more
    than one but at most ``max_cardinality`` distinct non-null values.

    Parameters
    ----------
    df             : segmentation DataFrame.
    segment        : e.g. 'BUSINESS' or 'INDIVIDUAL' - rows are filtered on
                     ``customer_type`` (case-insensitive) before scanning;
                     None, or a frame without that column, scans all rows.
    availability   : minimum fraction of non-null values required (default 0.70).
    max_cardinality: maximum number of unique values for a column to be
                     considered categorical.

    Returns
    -------
    List of column labels, ordered by availability descending; ties keep
    the frame's column order. Empty when no rows match.
    """
    if segment and 'customer_type' in df.columns:
        sub = df[df['customer_type'].str.upper() == segment.upper()]
    else:
        sub = df

    n = len(sub)
    if n == 0:
        return []

    scored = []
    for col in sub.columns:
        # str() so non-string labels (e.g. integer column names) do not
        # raise AttributeError on .lower().
        if str(col).lower() in _DIM_EXCLUDE:
            continue
        avail = sub[col].count() / n  # count() ignores nulls
        if avail < availability:
            continue
        n_unique = sub[col].nunique(dropna=True)
        if 1 < n_unique <= max_cardinality:
            scored.append((col, avail))

    # Sort by availability descending so highest-coverage dims come first
    # (the sort is stable, so equal scores keep column order).
    scored.sort(key=lambda pair: -pair[1])
    return [col for col, _ in scored]
def smartseg_tree_dynamic(df_clustered, seg_label="All", dims=None, df_rule_sweep=None):
    """
    Build a treemap from a cluster-labelled DataFrame (output of perform_clustering).

    dims can be:
      - None / list: same hierarchy path applied to all rows.
          e.g. ['customer_type', 'ACCOUNT_TYPE'] (the default when None)
      - dict: customer_type is always the first level after Cluster;
          the dict maps each customer_type value to its own sub-dim path.
          e.g. {
              'BUSINESS':   ['ACCOUNT_TYPE', 'ACCOUNT_AGE_CATEGORY'],
              'INDIVIDUAL': ['ACCOUNT_TYPE', 'GENDER', 'AGE_CATEGORY', 'INCOME_BAND'],
          }
    Only columns actually present in df_clustered are used.
    Each cluster gets its own distinct color; no heatmap colorscale.

    Parameters
    ----------
    df_clustered  : DataFrame with a ``cluster`` column; it is copied, not modified.
    seg_label     : segment name used in the titles; 'BUSINESS' (any case)
                    hides income and age in the hover text.
    dims          : hierarchy definition, see above.
    df_rule_sweep : optional frame with ``customer_id`` and ``is_sar``. When
                    given (and df has ``customer_id``) it supplies the
                    alert / SAR / false-positive counts; otherwise those
                    counts are all 0.

    Notes
    -----
    Transaction figures per node are medians over accounts with
    ``avg_num_trxns > 0`` even though the hover labels read "Avg"; income
    and age are means over all accounts in the node.

    Clusters below 1% of all rows are grouped under a "Small Clusters" node
    and drawn with a minimum tile size of 5% of the total. The boost only
    affects the tile size: hover and tile text show the real account count
    (previously they showed the boosted number).

    Returns
    -------
    plotly.graph_objects.Figure
    """
    PALETTE = px.colors.qualitative.Set1
    ROOT_COLOR = '#CCCCCC'
    SMALL_CLUSTER_THRESHOLD = 0.01  # clusters < 1% of total go into a "Small Clusters" group

    if dims is None:
        dims = ['customer_type', 'ACCOUNT_TYPE']
    df = df_clustered.copy()

    # Enrich with SAR/alert info from rule sweep if provided
    if df_rule_sweep is not None and 'customer_id' in df.columns:
        sar_map = df_rule_sweep.groupby('customer_id')['is_sar'].max()
        alerted = set(df_rule_sweep['customer_id'].unique())
        df['is_sar'] = df['customer_id'].map(sar_map).fillna(0).astype(int)
        df['is_alerted'] = df['customer_id'].isin(alerted).astype(int)
        df['is_fp'] = ((df['is_alerted'] == 1) & (df['is_sar'] == 0)).astype(int)
    else:
        df['is_sar'] = 0
        df['is_alerted'] = 0
        df['is_fp'] = 0

    # Overall means over active accounts only for cluster title relative comparisons
    _active_all = df[df['avg_num_trxns'].fillna(0) > 0] if 'avg_num_trxns' in df.columns else df
    overall_trxns = (_active_all['avg_num_trxns'].mean()
                     if len(_active_all) > 0 and 'avg_num_trxns' in _active_all.columns else 1)
    overall_amt = (_active_all['avg_weekly_trxn_amt'].mean()
                   if len(_active_all) > 0 and 'avg_weekly_trxn_amt' in _active_all.columns else 1)

    # Build indicative title per cluster (all clusters are active - inactive excluded before clustering)
    cluster_titles = {}
    for counter, (i, grp) in enumerate(df.groupby('cluster'), start=1):
        title = _cluster_title(
            grp['avg_num_trxns'].mean() if 'avg_num_trxns' in grp.columns else 0,
            grp['avg_weekly_trxn_amt'].mean() if 'avg_weekly_trxn_amt' in grp.columns else 0,
            overall_trxns, overall_amt,
        )
        cluster_titles[i] = f"C{counter}: {title}"
    df['cluster_label'] = df['cluster'].map(cluster_titles)
    # Titles are unique thanks to the "C<n>:" prefix, so the mapping inverts cleanly.
    title_to_cluster = {title: key for key, title in cluster_titles.items()}

    rows = []

    def add_row(rid, parent, label, sub, cidx=None):
        # Filter to active accounts (with transactions) for transaction metrics
        active = sub[sub['avg_num_trxns'] > 0] if 'avg_num_trxns' in sub.columns else sub
        n_active = len(active)
        pct_active = round(100 * n_active / len(sub), 1) if len(sub) > 0 else 0

        def active_median(col):
            # Median over active accounts (robust to single large accounts).
            return active[col].median() if n_active > 0 and col in active.columns else 0

        def node_mean(col):
            # Demographics: mean over all accounts in this node.
            return sub[col].mean() if col in sub.columns else 0

        rows.append({
            'id': rid, 'parent': parent, 'label': label,
            'avg_num_trxns': active_median('avg_num_trxns'),
            'avg_weekly_trxn_amt': active_median('avg_weekly_trxn_amt'),
            'trxn_amt_monthly': active_median('trxn_amt_monthly'),
            'INCOME': node_mean('INCOME'),
            'AGE': node_mean('AGE'),
            'pct_active': pct_active,
            'NUM_COUNT': len(sub),
            'cidx': cidx,
            # AML risk counts
            'n_sar': int(sub['is_sar'].sum()),
            'n_alerted': int(sub['is_alerted'].sum()),
            'n_fp': int(sub['is_fp'].sum()),
        })

    def build_nodes(sub_df, parent_id, remaining_dims, cidx):
        """Recursively build treemap nodes for each dimension level."""
        if not remaining_dims:
            return
        dim = remaining_dims[0]
        if dim not in sub_df.columns:
            return
        for val, grp in sub_df.groupby(dim, dropna=True):
            val_str = str(val)
            node_id = f"{parent_id}__{dim}_{val_str}"
            add_row(node_id, parent_id, val_str, grp, cidx=cidx)
            build_nodes(grp, node_id, remaining_dims[1:], cidx)

    total_rows = len(df)
    small_clusters = {cl for cl, grp in df.groupby('cluster_label')
                      if len(grp) / total_rows < SMALL_CLUSTER_THRESHOLD} if total_rows > 0 else set()

    # Root
    add_row('All', '', f'Dynamic Segments - {seg_label}', df, cidx=None)

    # Add a "Small Clusters" bucket if any clusters are below threshold
    if small_clusters:
        df_small = df[df['cluster_label'].isin(small_clusters)]
        # The separator was a mis-encoded character; restored to an em dash.
        add_row('SMALL', 'All', f'Small Clusters (<1%) — {len(df_small):,} accounts', df_small, cidx=None)

    # Cluster level
    for cl, grp in df.groupby('cluster_label'):
        cid = f"CL__{cl}"
        cidx = title_to_cluster.get(cl)
        parent = 'SMALL' if cl in small_clusters else 'All'
        add_row(cid, parent, cl, grp, cidx=cidx)
        if isinstance(dims, dict):
            # customer_type is always the first level; each type gets its own sub-dims
            if 'customer_type' not in grp.columns:
                continue
            for ct, cgrp in grp.groupby('customer_type'):
                ctid = f"{cid}__ct_{ct}"
                add_row(ctid, cid, ct, cgrp, cidx=cidx)
                ct_sub_dims = [d for d in dims.get(ct, []) if d in cgrp.columns]
                build_nodes(cgrp, ctid, ct_sub_dims, cidx)
        else:
            # List mode: recurse through all dims uniformly
            active_dims = [d for d in dims if d in grp.columns]
            build_nodes(grp, cid, active_dims, cidx)

    tree_df = pd.DataFrame(rows)

    # Boost small cluster tile sizes so they're visible in the treemap.
    # Use 5% of total as the minimum display size. The boost goes into its
    # own column so NUM_COUNT keeps the real count for hover / tile text.
    tree_df['display_value'] = tree_df['NUM_COUNT']
    if small_clusters:
        min_display = int(max(total_rows * 0.05, 1))
        small_ids = {f"CL__{cl}" for cl in small_clusters} | {'SMALL'}
        is_small = tree_df['id'].isin(small_ids)
        tree_df.loc[is_small, 'display_value'] = \
            tree_df.loc[is_small, 'NUM_COUNT'].clip(lower=min_display).astype(int)

    # Per-node colors: neutral grey for root / bucket nodes, cluster color otherwise
    node_colors = [
        ROOT_COLOR if (c is None or pd.isna(c)) else PALETTE[int(c) % len(PALETTE)]
        for c in tree_df['cidx']
    ]

    # customdata column order is referenced by index in the templates below.
    custom_columns = [
        'avg_num_trxns',        # 0
        'avg_weekly_trxn_amt',  # 1
        'NUM_COUNT',            # 2 (real count, never the boosted size)
        'trxn_amt_monthly',     # 3
        'INCOME',               # 4
        'AGE',                  # 5
        'pct_active',           # 6
        'n_sar',                # 7
        'n_alerted',            # 8
        'n_fp',                 # 9
    ]
    customdata = np.column_stack([tree_df[col].fillna(0) for col in custom_columns])

    demographics = '' if seg_label.upper() == 'BUSINESS' else (
        'Avg Income: $%{customdata[4]:.0f}<br>'
        'Avg Age: %{customdata[5]:.0f}<br>'
    )
    hovertemplate = (
        '<b>%{label}</b><br>'
        'Count: %{customdata[2]:.0f}<br>'
        'Active (w/ txns): %{customdata[6]:.1f}%<br>'
        'Avg Trxns/Week: %{customdata[0]:.1f}<br>'
        'Avg Weekly Trxn Amt: $%{customdata[1]:.0f}<br>'
        'Avg Monthly Trxn Amt: $%{customdata[3]:.0f}<br>'
        + demographics
        # Separator rule; the original characters were mis-encoded.
        + '─────────────────<br>'
        'Alerts: %{customdata[8]:.0f} | SARs: %{customdata[7]:.0f} | FPs: %{customdata[9]:.0f}<br>'
        '<extra></extra>'
    )
    texttemplate = (
        '<b>%{label}</b><br>'
        'n=%{customdata[2]:.0f}<br>'
        'SAR=%{customdata[7]:.0f} FP=%{customdata[9]:.0f}<br>'
        'wk=$%{customdata[1]:.0f}'
    )

    fig = go.Figure(go.Treemap(
        ids=tree_df['id'],
        labels=tree_df['label'],
        parents=tree_df['parent'],
        values=tree_df['display_value'],
        customdata=customdata,
        hovertemplate=hovertemplate,
        texttemplate=texttemplate,
        marker=dict(colors=node_colors),
    ))
    fig.update_layout(
        title=f'AML Dynamic Segments - {seg_label}',
        font_size=14,
        margin=dict(t=50, l=25, r=25, b=25),
    )
    return fig
# Running this file directly does nothing; the guard body is a bare `pass`.
if __name__ == "__main__":
    pass