Spaces:
Running
Running
File size: 2,187 Bytes
7d391cb | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 | import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import numpy as np
def build_customer_profiles(df):
    """
    Collapse transaction-level rows into one KYC profile row per customer.

    Rows of ``df`` are grouped on ``customer_id`` and the following columns
    are derived for each group:

    - ``total_transactions``: count of ``transaction_id``
    - ``total_volume`` / ``avg_transaction_amount`` /
      ``max_transaction_amount``: sum, mean and max of ``amount``
    - ``international_ratio``: mean of ``is_international``
    - ``flagged_ratio``: mean of ``is_flagged``
    - ``avg_risk_score``: mean of ``risk_score``
    - ``unique_countries``: number of distinct ``origin_country`` values
    - ``structuring_attempts``: sum of ``structuring_flag``

    Returns a new DataFrame with ``customer_id`` as a regular column
    followed by the aggregated columns in the order listed above.
    """
    # Output column name -> (source column, aggregation). Dict order is the
    # resulting column order.
    aggregation_spec = {
        'total_transactions': ('transaction_id', 'count'),
        'total_volume': ('amount', 'sum'),
        'avg_transaction_amount': ('amount', 'mean'),
        'max_transaction_amount': ('amount', 'max'),
        'international_ratio': ('is_international', 'mean'),
        'flagged_ratio': ('is_flagged', 'mean'),
        'avg_risk_score': ('risk_score', 'mean'),
        'unique_countries': ('origin_country', 'nunique'),
        'structuring_attempts': ('structuring_flag', 'sum'),
    }

    per_customer = df.groupby('customer_id')
    summary = per_customer.agg(**aggregation_spec)

    # Move customer_id out of the index so it is an ordinary column.
    return summary.reset_index()
def assign_kyc_tier(profile_df):
    """
    Assign a clustering-based KYC tier and a 0-100 risk score per customer.

    The nine profile features are min-max normalised and clustered with
    KMeans (up to three clusters, ``random_state=42``). Clusters are then
    ranked by their mean ``avg_risk_score`` and labelled from lowest to
    highest risk.

    Args:
        profile_df: DataFrame holding the feature columns listed in
            ``features`` below (as produced by ``build_customer_profiles``).
            It is not modified; a copy is returned.

    Returns:
        A copy of ``profile_df`` with two extra columns:

        - ``kyc_tier``: ``'Low'``, ``'Medium'`` or ``'High'``.
        - ``kyc_risk_score``: ``avg_risk_score`` min-max scaled to 0-100
          across the customers in this frame.

    Raises:
        KeyError: if any of the feature columns is missing.

    Small inputs are handled instead of crashing: an empty frame is returned
    with the two output columns present, and when there are fewer than three
    distinct feature rows the number of clusters is reduced accordingly.
    """
    profile_df = profile_df.copy()
    features = ['total_transactions', 'total_volume', 'avg_transaction_amount',
                'max_transaction_amount', 'international_ratio', 'flagged_ratio',
                'avg_risk_score', 'unique_countries', 'structuring_attempts']

    # Tier labels ordered from lowest to highest mean risk, keyed by how many
    # clusters were actually found.
    # NOTE(review): the 1- and 2-cluster labels are a fallback for inputs that
    # previously raised; confirm they match the intended compliance policy.
    tiers_by_cluster_count = {
        1: ('Medium',),
        2: ('Low', 'High'),
        3: ('Low', 'Medium', 'High'),
    }
    max_clusters = max(tiers_by_cluster_count)

    # Selecting first keeps the KeyError for missing columns on every path.
    X = profile_df[features].fillna(0)

    # Nothing to scale or cluster: the scaler and KMeans both reject
    # zero-sample input, so return the expected schema directly.
    if len(profile_df.index) == 0:
        profile_df['kyc_tier'] = np.array([], dtype=object)
        profile_df['kyc_risk_score'] = np.array([], dtype=float)
        return profile_df

    # Normalize so every feature contributes on the same 0-1 scale.
    X_scaled = MinMaxScaler().fit_transform(X)

    # KMeans cannot form more clusters than there are distinct points, so cap
    # the cluster count for tiny or fully duplicated inputs.
    n_distinct_rows = len(np.unique(X_scaled, axis=0))
    n_clusters = min(max_clusters, n_distinct_rows)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    profile_df['cluster'] = kmeans.fit_predict(X_scaled)

    # Rank clusters by mean risk (ascending) and label them. The mapping is
    # built from the clusters that actually received members rather than from
    # fixed positions, so a short ranking cannot raise IndexError.
    cluster_risk = (
        profile_df.groupby('cluster')['avg_risk_score'].mean().sort_values()
    )
    tier_names = tiers_by_cluster_count[len(cluster_risk)]
    tier_mapping = dict(zip(cluster_risk.index, tier_names))
    profile_df['kyc_tier'] = profile_df['cluster'].map(tier_mapping)

    # Simple kyc_risk_score: the customer's avg_risk_score rescaled to 0-100
    # relative to the other customers in this frame.
    risk_scaler = MinMaxScaler(feature_range=(0, 100))
    profile_df['kyc_risk_score'] = risk_scaler.fit_transform(
        profile_df[['avg_risk_score']]
    ).flatten()

    # The raw cluster id is an implementation detail; expose only the tier.
    profile_df = profile_df.drop(columns=['cluster'])
    return profile_df
|