Spaces:

speri420
/

agentic-aml-demo

Paused

App Files Files Community

agentic-aml-demo / lambda_ds_performance.py

shawn420

Sync aria-v6 full-app deployment: agents, app, startup rewrite

a610f79 21 days ago

raw

history blame contribute delete

30.5 kB

	import math
	import os
	import pandas as pd
	import numpy as np
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots
	import plotly.express as px
	import plotly.graph_objects as go
	import json
	#import kaleido
	def alerts_distribution(df):
	    """Build a grouped bar chart of alert and false-positive counts per segment.

	    Rows with ``dynamic_segment == 0`` are plotted under "Business" and rows
	    with ``dynamic_segment == 1`` under "Individual".

	    Parameters
	    ----------
	    df : DataFrame with ``dynamic_segment``, ``alerts`` and ``false_positives`` columns.

	    Returns
	    -------
	    plotly Figure with two bar traces ("Total Alerts", "False Positives") in grouped mode.
	    """
	    # NOTE(review): this reads an 'alerts' column while tpr_fpr_plot reads 'alert' —
	    # confirm both names exist in the data passed by callers.
	    segment_codes = (0, 1)

	    def flagged_rows(flag_column):
	        # One count per segment code: rows of that segment whose flag equals 1.
	        in_flag = df[flag_column] == 1
	        return [
	            int(((df['dynamic_segment'] == code) & in_flag).sum())
	            for code in segment_codes
	        ]

	    alert_counts = flagged_rows('alerts')
	    fp_counts = flagged_rows('false_positives')

	    bars = [
	        go.Bar(name='Total Alerts', x=['Business', 'Individual'], y=alert_counts),
	        go.Bar(name='False Positives', x=['Business', 'Individual'], y=fp_counts),
	    ]

	    chart = go.Figure(bars)
	    chart.update_layout(barmode='group', title="Alerts distribution across Segments")
	    return chart
	def plot_thresholds_tuning(df_segment, threshold, bump_pct, segment):
	    """Sweep a threshold column and chart false positives / negatives at each cut.

	    For every cut point, false positives are counted among rows at or above the
	    cut and false negatives among rows strictly below it. The sweep table is
	    also written to ``/tmp/Segment_<segment>_<threshold>.csv``.

	    Parameters
	    ----------
	    df_segment : DataFrame holding the ``threshold`` column plus
	                 ``false_positives`` and ``false_negatives`` flag columns.
	    threshold  : name of the column being swept.
	    bump_pct   : accepted for signature compatibility; not used by this function.
	    segment    : label used in the chart title and the CSV file name.

	    Returns
	    -------
	    (fig, df_segment) — the line chart and the input frame, unchanged.
	    """
	    values = df_segment[threshold]
	    lowest = values.min()
	    highest = values.max()
	    # Roughly one hundred cut points across the range, never finer than 1 unit.
	    step = max(1, int((highest - lowest) / 100))

	    is_fp = df_segment['false_positives'] == 1
	    is_fn = df_segment['false_negatives'] == 1

	    cut_points = []
	    fp_counts = []
	    fn_counts = []
	    cut = lowest
	    # The sweep deliberately runs one step past the maximum.
	    sweep_end = highest + step
	    while cut <= sweep_end:
	        fp_counts.append(int((is_fp & (values >= cut)).sum()))
	        fn_counts.append(int((is_fn & (values < cut)).sum()))
	        cut_points.append(round(cut, 2))
	        cut = cut + step

	    chart = go.Figure()
	    series = (
	        ('False Positives', fp_counts, '#EF553B'),
	        ('False Negatives', fn_counts, '#636EFA'),
	    )
	    for series_name, counts, colour in series:
	        chart.add_trace(go.Scatter(x=cut_points, y=counts, mode='lines', name=series_name,
	                                   line=dict(color=colour, width=2)))
	    chart.update_layout(
	        title=f'False Positives & False Negatives vs Threshold ({threshold}) — Segment: {segment}',
	        xaxis_title=threshold,
	        yaxis_title='Count',
	        legend=dict(x=0.01, y=0.99),
	    )
	    chart.add_annotation(
	        text=f"<b>Threshold Min: {round(lowest, 2)}<br><b>Threshold Max: {round(highest, 2)}",
	        xref="paper", yref="paper",
	        x=1, y=0.5,
	        showarrow=False, align="right", valign="middle"
	    )

	    sweep_table = pd.DataFrame({
	        f'{threshold}': cut_points,
	        'False Positives': fp_counts,
	        'False Negatives': fn_counts,
	    })
	    csv_path = os.path.join("/tmp", f"Segment_{segment}_{threshold}.csv")
	    sweep_table.to_csv(csv_path, index=False)
	    return chart, df_segment
	def smartseg_tree():
	    """Build a treemap of the segments stored in ``smartsegments.csv``.

	    The hierarchy is root -> SmartSegment -> customer_type -> acct_type. Each
	    node carries the mean of the three ``*_MEAN`` columns and the sum of
	    ``NUM_COUNT`` over the rows it covers.

	    Returns
	    -------
	    (fig, tree_df) — the plotly treemap and the node table it was built from.
	    """
	    segments = pd.read_csv('smartsegments.csv')
	    segments['SmartSegment'] = segments['SmartSegment'].astype(int)

	    metric_aggs = {
	        'amount_MEAN': 'mean',
	        'avg_num_trxns_MEAN': 'mean',
	        'avg_trxn_amt_MEAN': 'mean',
	        'NUM_COUNT': 'sum',
	    }

	    def make_node(node_id, parent_id, label, stats):
	        # Identity fields first, then the metrics in aggregation order.
	        node = {'id': node_id, 'parent': parent_id, 'label': label}
	        for metric in metric_aggs:
	            node[metric] = stats[metric]
	        return node

	    # Root node aggregates the whole file.
	    nodes = [make_node('All', '', 'AML Dynamic Segments', segments.agg(metric_aggs))]

	    # One pass per hierarchy depth: segment, segment x customer, segment x customer x account.
	    hierarchy = ['SmartSegment', 'customer_type', 'acct_type']
	    for depth in range(1, len(hierarchy) + 1):
	        keys = hierarchy[:depth]
	        group_by = keys if depth > 1 else keys[0]
	        level = segments.groupby(group_by).agg(metric_aggs).reset_index()
	        for _, stats in level.iterrows():
	            segment_no = int(stats['SmartSegment'])
	            path = [f"SS_{segment_no}"] + [f"{stats[key]}" for key in keys[1:]]
	            node_id = "_".join(path)
	            if depth == 1:
	                parent_id = 'All'
	                label = f"Segment {segment_no}"
	            else:
	                parent_id = "_".join(path[:-1])
	                label = stats[keys[-1]]
	            nodes.append(make_node(node_id, parent_id, label, stats))

	    tree_df = pd.DataFrame(nodes)

	    trxn_freq = tree_df['avg_num_trxns_MEAN'].fillna(0)
	    hover_columns = np.column_stack([
	        trxn_freq,
	        tree_df['avg_trxn_amt_MEAN'].fillna(0),
	        tree_df['NUM_COUNT'].fillna(0),
	        tree_df['amount_MEAN'].fillna(0),
	    ])
	    hover_text = (
	        '<b>%{label}</b><br>'
	        'Count: %{customdata[2]:.0f}<br>'
	        'Avg Trxns/Week: %{customdata[0]:.0f}<br>'
	        'Avg Trxn Amt: $%{customdata[1]:.0f}<br>'
	        'Avg Monthly Amt: $%{customdata[3]:.0f}<br>'
	        '<extra></extra>'
	    )
	    tile_text = (
	        '<b>%{label}</b><br>'
	        'n=%{customdata[2]:.0f}<br>'
	        'trxns/wk=%{customdata[0]:.0f}<br>'
	        'amt=$%{customdata[1]:.0f}'
	    )

	    treemap = go.Treemap(
	        ids=tree_df['id'],
	        labels=tree_df['label'],
	        parents=tree_df['parent'],
	        values=tree_df['NUM_COUNT'],
	        customdata=hover_columns,
	        hovertemplate=hover_text,
	        texttemplate=tile_text,
	        marker=dict(
	            # Tiles are coloured by transaction frequency.
	            colors=tree_df['avg_num_trxns_MEAN'].fillna(0),
	            colorscale='RdBu',
	            showscale=True,
	            colorbar=dict(title='Avg Trxns/Wk'),
	        ),
	    )
	    fig = go.Figure(treemap)
	    fig.update_layout(
	        title='AML Dynamic Segments',
	        font_size=14,
	        margin=dict(t=50, l=25, r=25, b=25),
	    )
	    return fig, tree_df
	# Trim rows whose value falls outside the 10th-90th percentile band of any listed column.
	def remove_outliers_iqr(df, columns):
	    """Keep only rows lying inside the 10th-90th percentile band of every column.

	    Despite the name, no 1.5*IQR fence is applied: each column is clipped to its
	    own [10th percentile, 90th percentile] range, bounds included. Columns are
	    processed in order, so later percentiles are computed on the rows that
	    survived the earlier columns.

	    Parameters
	    ----------
	    df      : DataFrame to filter; it is not modified in place.
	    columns : iterable of numeric column names to trim on.

	    Returns
	    -------
	    The filtered DataFrame (the input itself when ``columns`` is empty).
	    """
	    for col in columns:
	        low = df[col].quantile(0.10)
	        high = df[col].quantile(0.90)
	        # NaN values fail both comparisons and are therefore dropped as well.
	        df = df[(df[col] >= low) & (df[col] <= high)]
	    return df

	def plot_pct_metric(df, metric):
	    """Plot a metric computed over the top 0%..100% of rows, in row order.

	    For each percentage ``i`` the first ``int(len(df) * i / 100)`` rows are
	    scored from their confusion-flag columns.

	    Parameters
	    ----------
	    df     : DataFrame with ``true_positives``, ``false_positives``,
	             ``true_negatives`` and ``false_negatives`` flag columns. Row order
	             matters; presumably rows are pre-sorted by score — confirm with callers.
	    metric : ``'Jstat'`` (Youden's J = TPR + TNR - 1) or ``'F1'``.

	    Returns
	    -------
	    ``'Jstat'`` -> a single figure (J vs. fraction of rows).
	    ``'F1'``    -> ``(fig1, fig2)``: F1 vs. fraction of rows, and precision vs. recall.
	    Both mark the best-scoring point with a red star.

	    Raises
	    ------
	    ValueError if ``metric`` is not one of the two supported names.
	    """
	    if metric not in ('Jstat', 'F1'):
	        raise ValueError(f"Unsupported metric {metric!r}; expected 'Jstat' or 'F1'")

	    def running_total(column):
	        # running[k] == number of rows among the first k whose flag equals 1.
	        flags = (df[column] == 1).to_numpy()
	        return np.concatenate(([0], np.cumsum(flags)))

	    tp_run = running_total('true_positives')
	    fp_run = running_total('false_positives')
	    tn_run = running_total('true_negatives')
	    fn_run = running_total('false_negatives')

	    scores = []
	    pcts = []
	    Precision = []
	    Recall = []
	    n_rows = len(df)
	    for i in range(0, 101):
	        k = int(n_rows * (i / 100))
	        TP = int(tp_run[k])
	        FP = int(fp_run[k])
	        TN = int(tn_run[k])
	        FN = int(fn_run[k])
	        if metric == 'Jstat':
	            # J is undefined without both positives and negatives; score it as 0.
	            if (TP + FN == 0) or (FP + TN == 0):
	                score = 0
	            else:
	                score = (TP / (TP + FN)) + (TN / (FP + TN)) - 1
	        else:
	            P = TP / (TP + FP) if (TP + FP) != 0 else 0
	            R = TP / (TP + FN) if (TP + FN) != 0 else 0
	            Precision.append(P)
	            Recall.append(R)
	            score = 2 * (P * R) / (P + R) if (P + R) != 0 else 0
	        scores.append(score)
	        pcts.append(i / 100)

	    # First occurrence of the best score.
	    max_index = scores.index(max(scores))
	    best_label = 'Max J' if metric == 'Jstat' else 'Max F1'

	    def line_with_star(xs, ys):
	        # Line chart plus a red star on the best-scoring point.
	        chart = px.line(x=xs, y=ys)
	        chart.add_scatter(x=[xs[max_index]], y=[ys[max_index]],
	                          mode='markers', marker=dict(color='red', size=10),
	                          marker_symbol=['star'],
	                          name=f'{best_label}: ({scores[max_index]})')
	        return chart

	    if metric == 'Jstat':
	        return line_with_star(pcts, scores)
	    fig1 = line_with_star(pcts, scores)
	    fig2 = line_with_star(Recall, Precision)
	    return fig1, fig2

	def plot_thresholds_metric(df_segment, threshold, bump_pct, segment, metric):
	    """Sweep a threshold column and plot a metric of the rows at or above each cut.

	    The segment is first trimmed with ``remove_outliers_iqr``. The threshold then
	    grows geometrically from the trimmed minimum (``t += t * bump_pct``) until it
	    reaches the trimmed maximum. The sweep is also written to
	    ``Jstats_segment_<segment>_<threshold>.csv`` in the working directory.

	    Parameters
	    ----------
	    df_segment : DataFrame with the ``threshold`` column and the four
	                 confusion-flag columns (``true_positives`` etc.).
	    threshold  : name of the column being swept.
	    bump_pct   : relative step per iteration (e.g. 0.05 for 5%).
	    segment    : label used in the CSV file name.
	    metric     : ``'Jstat'`` or ``'F1'``.

	    Returns
	    -------
	    plotly Figure of metric vs. threshold with the best point starred.

	    Raises
	    ------
	    ValueError if ``metric`` is unsupported, or if the trimmed data has no
	    threshold range to sweep (empty, all-NaN, or min == max).
	    """
	    if metric not in ('Jstat', 'F1'):
	        raise ValueError(f"Unsupported metric {metric!r}; expected 'Jstat' or 'F1'")

	    scores = []
	    thresholds = []
	    df_segment = remove_outliers_iqr(df_segment, [threshold])
	    threshold_min = df_segment[threshold].min()
	    threshold_max = df_segment[threshold].max()
	    threshold_bump = threshold_min
	    while threshold_bump < threshold_max:
	        df_trxn_set = df_segment[df_segment[threshold] >= threshold_bump]
	        TP = df_trxn_set[df_trxn_set['true_positives'] == 1].shape[0]
	        FP = df_trxn_set[df_trxn_set['false_positives'] == 1].shape[0]
	        TN = df_trxn_set[df_trxn_set['true_negatives'] == 1].shape[0]
	        FN = df_trxn_set[df_trxn_set['false_negatives'] == 1].shape[0]
	        if metric == 'Jstat':
	            if (TP + FN == 0) or (FP + TN == 0):
	                score = 0
	            else:
	                score = (TP / (TP + FN)) + (TN / (FP + TN)) - 1
	        else:
	            P = TP / (TP + FP) if (TP + FP) != 0 else 0
	            R = TP / (TP + FN) if (TP + FN) != 0 else 0
	            score = 2 * (P * R) / (P + R) if (P + R) != 0 else 0
	        scores.append(score)
	        thresholds.append(round(threshold_bump, 2))

	        increment = threshold_bump * bump_pct
	        if not increment > 0:
	            # A zero/negative threshold (or non-positive bump_pct) would never
	            # advance geometrically; fall back to 1% of the range so the loop ends.
	            increment = (threshold_max - threshold_min) / 100
	        threshold_bump = threshold_bump + increment

	    if not scores:
	        raise ValueError(
	            f"No threshold range to sweep for {threshold!r} in segment {segment!r}"
	        )

	    fig = px.line(x=thresholds, y=scores)
	    max_index = scores.index(max(scores))
	    best_label = 'Max J' if metric == 'Jstat' else 'Max F1'
	    fig.add_scatter(x=[thresholds[max_index]], y=[scores[max_index]],
	                    mode='markers', marker=dict(color='red', size=10),
	                    marker_symbol=['star'],
	                    name=f'{best_label}: ({scores[max_index]})')

	    # Persist the sweep for this segment so it can be plotted later.
	    df_Jstats = pd.DataFrame({f'YJ_{threshold}': thresholds, 'YJstats': scores})
	    df_Jstats.to_csv(f"Jstats_segment_{segment}_{threshold}.csv", index=False)
	    return fig

	def tpr_fpr_plot(df):
	    """Plot cumulative true-positive rate against false-positive rate over alerts.

	    Alerted rows (``alert == 1``) are walked in their existing order. After each
	    row the running TP and FP counts are divided by their totals, and the row
	    where ``TPR - position / total_alerts`` is largest is marked with a red star.

	    Parameters
	    ----------
	    df : DataFrame with ``alert``, ``true_positives`` and ``false_positives`` columns.

	    Returns
	    -------
	    plotly Figure. When there are no alerted rows an empty figure is returned;
	    when there are no true (or no false) positives the matching rate stays at 0
	    instead of raising ``ZeroDivisionError``.
	    """
	    df_alerts = df[df['alert'] == 1].reset_index()
	    total_alerts = df_alerts.shape[0]
	    if total_alerts == 0:
	        # Nothing to rank; avoid indexing into empty rate lists.
	        return go.Figure()

	    tp_flags = (df_alerts['true_positives'] == 1).tolist()
	    fp_flags = (df_alerts['false_positives'] == 1).tolist()
	    tp_total = sum(tp_flags)
	    fp_total = sum(fp_flags)

	    tpr = []
	    fpr = []
	    tp_cnts = 0
	    fp_cnts = 0
	    Jstat = 0
	    max_index = 0
	    for index, (is_tp, is_fp) in enumerate(zip(tp_flags, fp_flags)):
	        # A row flagged as both counts as a true positive only.
	        if is_tp:
	            tp_cnts += 1
	        elif is_fp:
	            fp_cnts += 1
	        tp_rate = tp_cnts / tp_total if tp_total else 0.0
	        fp_rate = fp_cnts / fp_total if fp_total else 0.0
	        tpr.append(tp_rate)
	        fpr.append(fp_rate)
	        # J statistic: gain over the random-guess baseline index / total_alerts.
	        gain = tp_rate - (index / total_alerts)
	        if gain > Jstat:
	            Jstat = gain
	            max_index = index

	    fig = px.line(x=fpr, y=tpr)
	    fig.add_scatter(x=[fpr[max_index]], y=[tpr[max_index]],
	                    mode='markers', marker=dict(color='red', size=10),
	                    marker_symbol=['star'],
	                    name=f'Max J: ({Jstat})')
	    return fig

	def add_sub_plots(fig, subplot, row_id, col_id, x_title, y_title):
	    """Copy every trace of ``subplot`` into one cell of ``fig`` and title its axes.

	    Parameters
	    ----------
	    fig              : target figure, modified in place.
	    subplot          : figure whose ``data`` traces are copied over.
	    row_id, col_id   : grid cell of ``fig`` that receives the traces.
	    x_title, y_title : axis titles applied to that cell.

	    Returns
	    -------
	    The same ``fig`` object, to allow chaining.
	    """
	    cell = dict(row=row_id, col=col_id)
	    for series in subplot.data:
	        fig.add_trace(series, **cell)
	    fig.update_xaxes(title_text=x_title, **cell)
	    fig.update_yaxes(title_text=y_title, **cell)
	    return fig

	def perform_clustering(df, customer_type=None, n_clusters=4):
	    """
	    Cluster active customers (avg_num_trxns > 0) with K-Means and plot them in PCA space.

	    Accounts without transactions are dropped before clustering; they are not
	    assigned to any cluster and do not appear in the returned frame.

	    Parameters
	    ----------
	    df            : segmentation DataFrame.
	    customer_type : "Business" keeps rows with dynamic_segment == 0, "Individual"
	                    keeps dynamic_segment == 1; any other value keeps all rows.
	    n_clusters    : number of clusters. Pass 0 to auto-select K in 2..8 from the
	                    largest second difference of the K-Means inertia (elbow).

	    Returns
	    -------
	    (scatter_fig, stats_text, df_active)
	        scatter_fig : 2-component PCA scatter coloured by cluster.
	        stats_text  : pre-formatted per-cluster medians, one block per cluster.
	        df_active   : the active rows with an added 0-based ``cluster`` column.
	    """
	    from sklearn.cluster import KMeans
	    from sklearn.preprocessing import StandardScaler
	    from sklearn.decomposition import PCA

	    # Filter by segment
	    if customer_type == "Business":
	        df_work = df[df['dynamic_segment'] == 0].copy()
	    elif customer_type == "Individual":
	        df_work = df[df['dynamic_segment'] == 1].copy()
	    else:
	        df_work = df.copy()

	    seg_label = customer_type or "All"

	    # ── Keep only accounts with transaction history ─────────────────────
	    if 'avg_num_trxns' in df_work.columns:
	        df_active = df_work[df_work['avg_num_trxns'].fillna(0) > 0].copy()
	    else:
	        df_active = df_work.copy()

	    # ── Feature set: only the candidate columns actually present ────────
	    numeric_cols = [c for c in [
	        'avg_num_trxns', 'avg_weekly_trxn_amt', 'trxn_amt_monthly',
	        'INCOME', 'CURRENT_BALANCE', 'ACCT_AGE_YEARS', 'AGE'
	    ] if c in df_active.columns]

	    cat_cols = [c for c in [
	        'ACCOUNT_TYPE', 'GENDER', 'AGE_CATEGORY', 'ACCT_OPEN_CHANNEL',
	        'NNM', 'OFAC', '314b', 'CITIZENSHIP', 'RESIDENCY_COUNTRY'
	    ] if c in df_active.columns]

	    if cat_cols:
	        df_encoded = pd.get_dummies(df_active[cat_cols], drop_first=True)
	    else:
	        df_encoded = pd.DataFrame(index=df_active.index)
	    # Missing numerics are imputed with the column median before scaling.
	    X_num = df_active[numeric_cols].fillna(df_active[numeric_cols].median())
	    X = pd.concat(
	        [X_num.reset_index(drop=True), df_encoded.reset_index(drop=True)], axis=1
	    ).fillna(0)

	    scaler = StandardScaler()
	    X_scaled = scaler.fit_transform(X)

	    # ── Auto-select K via elbow ─────────────────────────────────────────
	    if n_clusters == 0:
	        inertias = []
	        k_range = range(2, 9)
	        for k in k_range:
	            km = KMeans(n_clusters=k, random_state=42, n_init=10)
	            km.fit(X_scaled)
	            inertias.append(km.inertia_)
	        diffs = [inertias[i] - inertias[i + 1] for i in range(len(inertias) - 1)]
	        diffs2 = [diffs[i] - diffs[i + 1] for i in range(len(diffs) - 1)]
	        # diffs2[i] measures the bend at the (i + 1)-th candidate K.
	        n_clusters = list(k_range)[diffs2.index(max(diffs2)) + 1]
	        print(f"Auto-selected K={n_clusters} clusters")

	    # ── K-Means on active accounts only ────────────────────────────────
	    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
	    labels = kmeans.fit_predict(X_scaled)
	    df_active['cluster'] = labels

	    # ── PCA scatter ─────────────────────────────────────────────────────
	    pca = PCA(n_components=2)
	    X_pca = pca.fit_transform(X_scaled)
	    var1 = pca.explained_variance_ratio_[0] * 100
	    var2 = pca.explained_variance_ratio_[1] * 100

	    scatter_df = pd.DataFrame({
	        'PC1': X_pca[:, 0],
	        'PC2': X_pca[:, 1],
	        'Cluster': [f'Cluster {l+1}' for l in labels],
	    })
	    # Sort so legend appears in numeric order (1, 2, 3, 4) regardless of KMeans label assignment
	    cluster_order = [f'Cluster {i+1}' for i in range(n_clusters)]
	    scatter_df['Cluster'] = pd.Categorical(scatter_df['Cluster'], categories=cluster_order, ordered=True)
	    scatter_df = scatter_df.sort_values('Cluster')

	    fig = px.scatter(
	        scatter_df, x='PC1', y='PC2', color='Cluster',
	        category_orders={'Cluster': cluster_order},
	        title=f"Dynamic Segmentation Clustering — {seg_label} ({n_clusters} clusters, active accounts only)",
	        labels={
	            'PC1': f'PC1 ({var1:.1f}% variance)',
	            'PC2': f'PC2 ({var2:.1f}% variance)',
	        },
	        opacity=0.5,
	        color_discrete_sequence=px.colors.qualitative.Set1,
	    )
	    fig.update_traces(marker=dict(size=3))
	    fig.update_layout(legend=dict(itemsizing='constant'))

	    # ── Stats ────────────────────────────────────────────────────────────
	    _COL_DISPLAY = {
	        'avg_num_trxns': 'Avg Weekly Transactions',
	        'avg_weekly_trxn_amt': 'Avg Weekly Txn Amount',
	        'trxn_amt_monthly': 'Monthly Txn Volume',
	        'INCOME': 'Income',
	        'CURRENT_BALANCE': 'Current Balance',
	        'ACCT_AGE_YEARS': 'Account Age (years)',
	        'AGE': 'Age',
	    }
	    _DOLLAR_COLS = {'avg_weekly_trxn_amt', 'trxn_amt_monthly', 'INCOME', 'CURRENT_BALANCE'}

	    n_num = len(numeric_cols)
	    n_cat_encoded = len(df_encoded.columns)
	    stats_lines = [
	        "=== PRE-COMPUTED CLUSTER STATS (copy verbatim, do not compute new numbers) ===",
	        f"Segment: {seg_label} | Active accounts: {len(df_active):,} (excluded {len(df_work) - len(df_active):,} with no transactions)",
	        f"Clusters: {n_clusters} | Features: {n_num} numeric + {n_cat_encoded} encoded categorical ({len(cat_cols)} original)",
	        f"PCA variance explained: PC1={var1:.1f}%, PC2={var2:.1f}%",
	        "",
	    ]
	    # Columns to skip in stats display per segment
	    _skip_cols = set()
	    if seg_label.upper() == "BUSINESS":
	        _skip_cols.add("INCOME")  # income is individual-only
	        _skip_cols.add("AGE")  # age not collected for businesses

	    total_active = len(df_active)
	    for i in range(n_clusters):
	        c = df_active[df_active['cluster'] == i]
	        pct = 100 * len(c) / total_active if total_active > 0 else 0
	        stats_lines.append(f"Cluster {i+1}")
	        stats_lines.append(f"- Customers: {len(c):,} ({pct:.1f}% of active accounts)")
	        for col in numeric_cols:
	            if col in _skip_cols:
	                continue
	            val = c[col].median()
	            if pd.isna(val):
	                # No usable values for this column in this cluster.
	                continue
	            label = _COL_DISPLAY.get(col, col)
	            fmt = f"${val:,.0f}" if col in _DOLLAR_COLS else f"{val:,.1f}"
	            stats_lines.append(f"- {label}: {fmt}")
	        stats_lines.append("")  # blank line after each cluster block

	    stats_lines.append("=== END PRE-COMPUTED CLUSTER STATS ===")
	    return fig, "\n".join(stats_lines), df_active


	def _cluster_title(trxns, amt, overall_trxns, overall_amt, high_ratio=1.15, low_ratio=0.85):
	    """Generate a descriptive cluster title based on relative profile values.

	    Parameters
	    ----------
	    trxns, amt                 : the cluster's transaction frequency and amount.
	    overall_trxns, overall_amt : the baselines they are compared against.
	    high_ratio                 : a value above ``baseline * high_ratio`` is "High".
	    low_ratio                  : a value below ``baseline * low_ratio`` is "Low".

	    Returns
	    -------
	    A string of the form ``"<High|Mid|Low> Freq / <High|Mid|Low> Value"``.
	    Anything that is neither above nor below its band (NaN included) is "Mid".
	    """
	    def band(value, baseline, noun):
	        # Classify one measure against its baseline band.
	        if value > baseline * high_ratio:
	            return f"High {noun}"
	        if value < baseline * low_ratio:
	            return f"Low {noun}"
	        return f"Mid {noun}"

	    return f"{band(trxns, overall_trxns, 'Freq')} / {band(amt, overall_amt, 'Value')}"


	# Columns excluded from treemap dimension discovery — IDs, numerics, internal flags.
	# Names are stored lower-case; lookups lower-case the column label first.
	_DIM_EXCLUDE = {
	    'customer_id', 'account_id', 'cluster', 'cluster_label', 'dynamic_segment',
	    'is_sar', 'is_fp', 'is_alerted', 'is_fn', 'pct_active',
	    'avg_num_trxns', 'avg_weekly_trxn_amt', 'trxn_amt_monthly', 'avg_trxn_amt',
	    'income', 'current_balance', 'acct_age_years', 'age',
	    'total_trxn_amt', 'cashout_count', 'sar_score', 'alert_count',
	    'customer_type',  # used as the segment split level, not a sub-dimension
	}


	def discover_dims(df, segment=None, availability=0.70, max_cardinality=20):
	    """
	    Discover categorical columns suitable as treemap dimensions from df.

	    Parameters
	    ----------
	    df : segmentation DataFrame (output of DS_CSV load)
	    segment : 'BUSINESS' or 'INDIVIDUAL' — filter df on customer_type
	              (case-insensitive) before scanning, or None for all rows
	    availability : minimum fraction of non-null values required (default 0.70)
	    max_cardinality: maximum number of unique values for a column to be considered categorical

	    Returns
	    -------
	    List of column labels suitable as treemap hierarchy dimensions, ordered by
	    availability desc (ties keep their column order). Columns with a single
	    distinct value are skipped, as are the names in ``_DIM_EXCLUDE``. An empty
	    list is returned when no rows remain after filtering.
	    """
	    if segment and 'customer_type' in df.columns:
	        sub = df[df['customer_type'].str.upper() == segment.upper()]
	    else:
	        sub = df

	    n = len(sub)
	    if n == 0:
	        return []

	    scored = []
	    for col in sub.columns:
	        # Column labels are not guaranteed to be strings (e.g. integer labels).
	        if str(col).lower() in _DIM_EXCLUDE:
	            continue
	        values = sub[col].dropna()
	        avail = len(values) / n
	        if avail < availability:
	            continue
	        if 1 < values.nunique() <= max_cardinality:
	            scored.append((col, avail))

	    # Sort by availability descending so highest-coverage dims come first
	    scored.sort(key=lambda item: -item[1])
	    return [col for col, _ in scored]


	def smartseg_tree_dynamic(df_clustered, seg_label="All", dims=None, df_rule_sweep=None):
	    """
	    Build a treemap from a cluster-labelled DataFrame (output of perform_clustering).

	    dims can be:
	    - None / list: same hierarchy path applied to all rows.
	      e.g. ['customer_type', 'ACCOUNT_TYPE']
	    - dict: customer_type is always the first level after Cluster;
	      the dict maps each customer_type value to its own sub-dim path.
	      e.g. {
	          'BUSINESS': ['ACCOUNT_TYPE', 'ACCOUNT_AGE_CATEGORY'],
	          'INDIVIDUAL': ['ACCOUNT_TYPE', 'GENDER', 'AGE_CATEGORY', 'INCOME_BAND'],
	      }

	    Only columns actually present in df_clustered are used.
	    Each cluster gets its own distinct color; no heatmap colorscale.

	    Parameters
	    ----------
	    df_clustered  : DataFrame with a ``cluster`` column; it is copied, not modified.
	    seg_label     : segment name used in titles; 'BUSINESS' (any case) hides the
	                    income/age lines in the hover text.
	    dims          : hierarchy specification, see above.
	    df_rule_sweep : optional DataFrame with ``customer_id`` and ``is_sar`` columns.
	                    When given (and df_clustered has ``customer_id``) each node
	                    also reports alert, SAR and false-positive counts; otherwise
	                    those counts are 0.

	    Returns
	    -------
	    plotly Figure containing the treemap. Clusters below 1% of all rows are
	    grouped under a "Small Clusters" node and drawn at a boosted size; the
	    counts shown in tile and hover text remain the actual counts.
	    """
	    PALETTE = px.colors.qualitative.Set1

	    if dims is None:
	        dims = ['customer_type', 'ACCOUNT_TYPE']

	    df = df_clustered.copy()

	    # Enrich with SAR/alert info from rule sweep if provided
	    if df_rule_sweep is not None and 'customer_id' in df.columns:
	        sar_map = df_rule_sweep.groupby('customer_id')['is_sar'].max()
	        alerted = set(df_rule_sweep['customer_id'].unique())
	        df['is_sar'] = df['customer_id'].map(sar_map).fillna(0).astype(int)
	        df['is_alerted'] = df['customer_id'].isin(alerted).astype(int)
	        # A false positive is an alerted customer with no SAR.
	        df['is_fp'] = ((df['is_alerted'] == 1) & (df['is_sar'] == 0)).astype(int)
	    else:
	        df['is_sar'] = 0
	        df['is_alerted'] = 0
	        df['is_fp'] = 0

	    # Overall means over active accounts only for cluster title relative comparisons
	    _active_all = df[df['avg_num_trxns'].fillna(0) > 0] if 'avg_num_trxns' in df.columns else df
	    has_active = len(_active_all) > 0
	    overall_trxns = (_active_all['avg_num_trxns'].mean()
	                     if has_active and 'avg_num_trxns' in _active_all.columns else 1)
	    overall_amt = (_active_all['avg_weekly_trxn_amt'].mean()
	                   if has_active and 'avg_weekly_trxn_amt' in _active_all.columns else 1)

	    # Build indicative title per cluster; the "C<n>:" prefix keeps titles unique.
	    cluster_titles = {}
	    for counter, (i, grp) in enumerate(df.groupby('cluster'), start=1):
	        title = _cluster_title(
	            grp['avg_num_trxns'].mean() if 'avg_num_trxns' in grp.columns else 0,
	            grp['avg_weekly_trxn_amt'].mean() if 'avg_weekly_trxn_amt' in grp.columns else 0,
	            overall_trxns, overall_amt,
	        )
	        cluster_titles[i] = f"C{counter}: {title}"
	    # Reverse lookup: title -> original cluster id (used to pick the palette colour).
	    cluster_of_title = {title: cluster for cluster, title in cluster_titles.items()}

	    df['cluster_label'] = df['cluster'].map(cluster_titles)

	    rows = []

	    def add_row(rid, parent, label, sub, cidx=None):
	        """Append one treemap node summarising the rows in ``sub``."""
	        # Filter to active accounts (with transactions) for transaction metrics
	        active = sub[sub['avg_num_trxns'] > 0] if 'avg_num_trxns' in sub.columns else sub
	        n_active = len(active)
	        pct_active = round(100 * n_active / len(sub), 1) if len(sub) > 0 else 0

	        def active_median(column):
	            # Median over active accounts (robust to outliers); 0 when unavailable.
	            if n_active > 0 and column in active.columns:
	                return active[column].median()
	            return 0

	        rows.append({
	            'id': rid, 'parent': parent, 'label': label,
	            'avg_num_trxns': active_median('avg_num_trxns'),
	            'avg_weekly_trxn_amt': active_median('avg_weekly_trxn_amt'),
	            'trxn_amt_monthly': active_median('trxn_amt_monthly'),
	            # Demographics: mean over all accounts in this node
	            'INCOME': sub['INCOME'].mean() if 'INCOME' in sub.columns else 0,
	            'AGE': sub['AGE'].mean() if 'AGE' in sub.columns else 0,
	            'pct_active': pct_active,
	            'NUM_COUNT': len(sub),
	            'cidx': cidx,
	            # AML risk counts
	            'n_sar': int(sub['is_sar'].sum()),
	            'n_alerted': int(sub['is_alerted'].sum()),
	            'n_fp': int(sub['is_fp'].sum()),
	        })

	    def build_nodes(sub_df, parent_id, remaining_dims, cidx):
	        """Recursively build treemap nodes for each dimension level."""
	        if not remaining_dims:
	            return
	        dim = remaining_dims[0]
	        if dim not in sub_df.columns:
	            return
	        for val, grp in sub_df.groupby(dim, dropna=True):
	            val_str = str(val)
	            node_id = f"{parent_id}__{dim}_{val_str}"
	            add_row(node_id, parent_id, val_str, grp, cidx=cidx)
	            build_nodes(grp, node_id, remaining_dims[1:], cidx)

	    SMALL_CLUSTER_THRESHOLD = 0.01  # clusters < 1% of total go into a "Small Clusters" group

	    total_rows = len(df)
	    if total_rows > 0:
	        cluster_sizes = df.groupby('cluster_label').size()
	        small_clusters = set(
	            cluster_sizes[cluster_sizes / total_rows < SMALL_CLUSTER_THRESHOLD].index
	        )
	    else:
	        small_clusters = set()

	    # Root
	    add_row('All', '', f'Dynamic Segments - {seg_label}', df, cidx=None)

	    # Add a "Small Clusters" bucket if any clusters are below threshold
	    if small_clusters:
	        df_small = df[df['cluster_label'].isin(small_clusters)]
	        add_row('SMALL', 'All', f'Small Clusters (<1%) — {len(df_small):,} accounts', df_small, cidx=None)

	    # Cluster level
	    for cl, grp in df.groupby('cluster_label'):
	        cid = f"CL__{cl}"
	        cidx = cluster_of_title.get(cl)
	        parent = 'SMALL' if cl in small_clusters else 'All'
	        add_row(cid, parent, cl, grp, cidx=cidx)

	        if isinstance(dims, dict):
	            # customer_type is always the first level; each type gets its own sub-dims
	            if 'customer_type' not in grp.columns:
	                continue
	            for ct, cgrp in grp.groupby('customer_type'):
	                ctid = f"{cid}__ct_{ct}"
	                add_row(ctid, cid, ct, cgrp, cidx=cidx)
	                ct_sub_dims = [d for d in dims.get(ct, []) if d in cgrp.columns]
	                build_nodes(cgrp, ctid, ct_sub_dims, cidx)
	        else:
	            # List mode: recurse through all dims uniformly
	            active_dims = [d for d in dims if d in grp.columns]
	            build_nodes(grp, cid, active_dims, cidx)

	    tree_df = pd.DataFrame(rows)

	    # Boost small cluster display sizes so they're visible in the treemap.
	    # Use 5% of total as the minimum tile size. The boost goes into a separate
	    # series so NUM_COUNT (shown in tile and hover text) stays the actual count.
	    display_values = tree_df['NUM_COUNT'].copy()
	    if small_clusters:
	        min_display = int(max(total_rows * 0.05, 1))
	        small_ids = {f"CL__{cl}" for cl in small_clusters} | {'SMALL'}
	        is_small = tree_df['id'].isin(small_ids)
	        display_values.loc[is_small] = display_values.loc[is_small].clip(lower=min_display).astype(int)

	    # Per-node colors: neutral grey for root/bucket nodes, cluster color for all others
	    node_colors = [
	        '#CCCCCC' if pd.isna(c) else PALETTE[int(c) % len(PALETTE)]
	        for c in tree_df['cidx']
	    ]

	    fig = go.Figure(go.Treemap(
	        ids=tree_df['id'],
	        labels=tree_df['label'],
	        parents=tree_df['parent'],
	        values=display_values,
	        customdata=np.column_stack([
	            tree_df['avg_num_trxns'].fillna(0),        # 0
	            tree_df['avg_weekly_trxn_amt'].fillna(0),  # 1
	            tree_df['NUM_COUNT'].fillna(0),            # 2
	            tree_df['trxn_amt_monthly'].fillna(0),     # 3
	            tree_df['INCOME'].fillna(0),               # 4
	            tree_df['AGE'].fillna(0),                  # 5
	            tree_df['pct_active'].fillna(0),           # 6
	            tree_df['n_sar'].fillna(0),                # 7
	            tree_df['n_alerted'].fillna(0),            # 8
	            tree_df['n_fp'].fillna(0),                 # 9
	        ]),
	        hovertemplate=(
	            '<b>%{label}</b><br>'
	            'Count: %{customdata[2]:.0f}<br>'
	            'Active (w/ txns): %{customdata[6]:.1f}%<br>'
	            'Avg Trxns/Week: %{customdata[0]:.1f}<br>'
	            'Avg Weekly Trxn Amt: $%{customdata[1]:.0f}<br>'
	            'Avg Monthly Trxn Amt: $%{customdata[3]:.0f}<br>'
	            + ('' if seg_label.upper() == 'BUSINESS' else
	               'Avg Income: $%{customdata[4]:.0f}<br>'
	               'Avg Age: %{customdata[5]:.0f}<br>')
	            + '─────────────────<br>'
	            'Alerts: %{customdata[8]:.0f} | SARs: %{customdata[7]:.0f} | FPs: %{customdata[9]:.0f}<br>'
	            '<extra></extra>'
	        ),
	        texttemplate=(
	            '<b>%{label}</b><br>'
	            'n=%{customdata[2]:.0f}<br>'
	            'SAR=%{customdata[7]:.0f} FP=%{customdata[9]:.0f}<br>'
	            'wk=$%{customdata[1]:.0f}'
	        ),
	        marker=dict(colors=node_colors),
	    ))
	    fig.update_layout(
	        title=f'AML Dynamic Segments - {seg_label}',
	        font_size=14,
	        margin=dict(t=50, l=25, r=25, b=25),
	    )
	    return fig


	# Script entry point: running this file directly intentionally does nothing.
	if __name__ == "__main__":
	pass