Spaces:
Running
Running
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
def build_customer_profiles(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate transaction rows into one KYC feature row per customer.

    Args:
        df: Transaction-level frame. Must contain the columns
            ``customer_id``, ``transaction_id``, ``amount``,
            ``is_international``, ``is_flagged``, ``risk_score``,
            ``origin_country`` and ``structuring_flag``.

    Returns:
        A frame with ``customer_id`` as a regular column followed by:

        * ``total_transactions``: count of ``transaction_id``
        * ``total_volume``: sum of ``amount``
        * ``avg_transaction_amount``: mean of ``amount``
        * ``max_transaction_amount``: max of ``amount``
        * ``international_ratio``: mean of ``is_international``
        * ``flagged_ratio``: mean of ``is_flagged``
        * ``avg_risk_score``: mean of ``risk_score``
        * ``unique_countries``: distinct ``origin_country`` values
        * ``structuring_attempts``: sum of ``structuring_flag``

    Raises:
        KeyError: If any of the required columns is missing (raised by
            pandas).
    """
    # NOTE(review): the two *_ratio columns are only true ratios if the flag
    # columns are 0/1 or boolean -- confirm against the upstream schema.
    per_customer = df.groupby('customer_id').agg(
        total_transactions=('transaction_id', 'count'),
        total_volume=('amount', 'sum'),
        avg_transaction_amount=('amount', 'mean'),
        max_transaction_amount=('amount', 'max'),
        international_ratio=('is_international', 'mean'),
        flagged_ratio=('is_flagged', 'mean'),
        avg_risk_score=('risk_score', 'mean'),
        unique_countries=('origin_country', 'nunique'),
        structuring_attempts=('structuring_flag', 'sum'),
    )
    # Turn the group key back into an ordinary column for downstream use.
    return per_customer.reset_index()
def assign_kyc_tier(profile_df: pd.DataFrame) -> pd.DataFrame:
    """Attach a clustering-based KYC tier and a 0-100 risk score to each profile.

    The nine profile features are min-max scaled and clustered with KMeans
    (up to three clusters, ``random_state=42``). Clusters are ranked by their
    mean ``avg_risk_score`` and labelled from lowest to highest:

    * three clusters -> ``'Low'``, ``'Medium'``, ``'High'``
    * two clusters   -> ``'Low'``, ``'High'``
    * one cluster    -> ``'Medium'`` (no relative ranking is possible)

    ``kyc_risk_score`` is ``avg_risk_score`` min-max scaled to ``[0, 100]``
    across the rows of ``profile_df``.

    Args:
        profile_df: One row per customer, containing at least the columns
            ``total_transactions``, ``total_volume``,
            ``avg_transaction_amount``, ``max_transaction_amount``,
            ``international_ratio``, ``flagged_ratio``, ``avg_risk_score``,
            ``unique_countries`` and ``structuring_attempts``. The input is
            not modified.

    Returns:
        A copy of ``profile_df`` with two extra columns, ``kyc_tier`` and
        ``kyc_risk_score``. All input columns are kept; no temporary
        ``cluster`` column is written, so a caller column with that name is
        left alone. A zero-row input yields a zero-row result with both
        columns present.

    Raises:
        KeyError: If any required feature column is missing.
    """
    profile_df = profile_df.copy()
    features = [
        'total_transactions', 'total_volume', 'avg_transaction_amount',
        'max_transaction_amount', 'international_ratio', 'flagged_ratio',
        'avg_risk_score', 'unique_countries', 'structuring_attempts',
    ]
    # Selecting first means a missing column still fails loudly, even for
    # an empty frame.
    X = profile_df[features].fillna(0)

    # The scalers and KMeans cannot be fitted on zero samples, so return the
    # expected schema directly instead of crashing.
    if len(profile_df) == 0:
        profile_df['kyc_tier'] = pd.Series(dtype='object')
        profile_df['kyc_risk_score'] = pd.Series(dtype='float64')
        return profile_df

    # Normalize so that no single feature dominates the distance metric.
    X_scaled = MinMaxScaler().fit_transform(X)

    # KMeans raises when asked for more clusters than samples and returns
    # fewer labels than requested when points coincide, so cap the cluster
    # count at the number of distinct scaled rows.
    n_distinct = np.unique(X_scaled, axis=0).shape[0]
    kmeans = KMeans(n_clusters=min(3, n_distinct), random_state=42, n_init=10)
    clusters = kmeans.fit_predict(X_scaled)

    # Rank the clusters that actually occurred by mean risk, lowest first.
    cluster_risk = (
        profile_df['avg_risk_score'].groupby(clusters).mean().sort_values()
    )
    labels_by_count = {1: ['Medium'], 2: ['Low', 'High']}
    tier_labels = labels_by_count.get(
        len(cluster_risk), ['Low', 'Medium', 'High']
    )
    tier_mapping = dict(zip(cluster_risk.index, tier_labels))
    # Assign positionally so a non-unique or non-default index cannot
    # misalign the labels.
    profile_df['kyc_tier'] = pd.Series(clusters).map(tier_mapping).to_numpy()

    # Simple per-customer score: avg_risk_score rescaled to the 0-100 range.
    risk_scaler = MinMaxScaler(feature_range=(0, 100))
    profile_df['kyc_risk_score'] = risk_scaler.fit_transform(
        profile_df[['avg_risk_score']]
    ).flatten()
    return profile_df