import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

# Per-customer columns (as produced by build_customer_profiles) that feed the
# clustering step in assign_kyc_tier.
_PROFILE_FEATURES = [
    'total_transactions',
    'total_volume',
    'avg_transaction_amount',
    'max_transaction_amount',
    'international_ratio',
    'flagged_ratio',
    'avg_risk_score',
    'unique_countries',
    'structuring_attempts',
]

# Tier labels keyed by the number of clusters actually found, listed from the
# lowest to the highest mean risk. Three clusters is the normal case; the
# smaller entries only apply when the data cannot support three clusters.
# NOTE(review): the labels chosen for the 1- and 2-cluster cases are a policy
# decision (a lone cluster carries no relative information, hence 'Medium');
# confirm with compliance.
_TIER_LABELS = {
    1: ('Medium',),
    2: ('Low', 'High'),
    3: ('Low', 'Medium', 'High'),
}


def build_customer_profiles(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate transaction rows into one KYC profile row per customer.

    Args:
        df: Transaction-level frame. Must contain the columns
            ``customer_id``, ``transaction_id``, ``amount``,
            ``is_international``, ``is_flagged``, ``risk_score``,
            ``origin_country`` and ``structuring_flag``.

    Returns:
        A frame with ``customer_id`` as a regular column plus:
        ``total_transactions`` (count of ``transaction_id``),
        ``total_volume`` / ``avg_transaction_amount`` /
        ``max_transaction_amount`` (sum / mean / max of ``amount``),
        ``international_ratio`` and ``flagged_ratio`` (means of the
        respective flags; a ratio only if those flags are boolean or 0/1,
        which is assumed here), ``avg_risk_score`` (mean of ``risk_score``),
        ``unique_countries`` (distinct ``origin_country`` values) and
        ``structuring_attempts`` (sum of ``structuring_flag``).
    """
    profile_df = df.groupby('customer_id').agg(
        total_transactions=('transaction_id', 'count'),
        total_volume=('amount', 'sum'),
        avg_transaction_amount=('amount', 'mean'),
        max_transaction_amount=('amount', 'max'),
        international_ratio=('is_international', 'mean'),
        flagged_ratio=('is_flagged', 'mean'),
        avg_risk_score=('risk_score', 'mean'),
        unique_countries=('origin_country', 'nunique'),
        structuring_attempts=('structuring_flag', 'sum'),
    ).reset_index()
    return profile_df


def assign_kyc_tier(profile_df: pd.DataFrame) -> pd.DataFrame:
    """Assign a clustering-based KYC tier and a 0-100 risk score per customer.

    The profile features are min-max scaled and clustered with k-means; the
    resulting clusters are ranked by their mean ``avg_risk_score`` and
    labelled from lowest to highest risk. ``kyc_risk_score`` is the
    customer's ``avg_risk_score`` min-max rescaled to the range 0-100 across
    the given frame.

    Args:
        profile_df: Per-customer profiles containing every column listed in
            ``_PROFILE_FEATURES``. The input frame is not modified.

    Returns:
        A copy of ``profile_df`` with two added columns: ``kyc_tier``
        (``'Low'``, ``'Medium'`` or ``'High'``) and ``kyc_risk_score``
        (float). An empty input yields an empty frame that still carries
        both columns.

    Raises:
        KeyError: If any feature column is missing from ``profile_df``.
    """
    profile_df = profile_df.copy()

    # Missing feature values are treated as 0 for clustering purposes only;
    # the profile columns themselves are left untouched.
    X = profile_df[_PROFILE_FEATURES].fillna(0)

    # The scaler and k-means both reject zero samples, so short-circuit while
    # still returning the expected output schema.
    if len(profile_df) == 0:
        profile_df['kyc_tier'] = pd.Series(index=profile_df.index, dtype=object)
        profile_df['kyc_risk_score'] = pd.Series(
            index=profile_df.index, dtype=float
        )
        return profile_df

    # Normalize so that no single feature dominates the distance metric.
    X_scaled = MinMaxScaler().fit_transform(X)

    # k-means cannot form more clusters than there are distinct points, so
    # cap the cluster count; with three or more distinct rows this is 3.
    n_distinct = np.unique(X_scaled, axis=0).shape[0]
    n_clusters = min(max(_TIER_LABELS), n_distinct)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    profile_df['cluster'] = kmeans.fit_predict(X_scaled)

    # Rank the clusters that actually occur by mean risk (ascending) and
    # label them accordingly, rather than assuming exactly three exist.
    cluster_risk = (
        profile_df.groupby('cluster')['avg_risk_score'].mean().sort_values()
    )
    tier_mapping = dict(zip(cluster_risk.index, _TIER_LABELS[len(cluster_risk)]))
    profile_df['kyc_tier'] = profile_df['cluster'].map(tier_mapping)

    # Simple kyc_risk_score: the customer's avg_risk_score rescaled to 0-100.
    risk_scaler = MinMaxScaler(feature_range=(0, 100))
    profile_df['kyc_risk_score'] = risk_scaler.fit_transform(
        profile_df[['avg_risk_score']]
    ).flatten()

    # The raw cluster id is an implementation detail; expose only the tier.
    return profile_df.drop(columns=['cluster'])