File size: 2,187 Bytes
7d391cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import numpy as np

def build_customer_profiles(df):
    """
    Collapse transaction rows into one KYC feature row per customer.

    Args:
        df: Transaction-level DataFrame. Must contain the columns
            customer_id, transaction_id, amount, is_international,
            is_flagged, risk_score, origin_country and structuring_flag.

    Returns:
        DataFrame with customer_id as a regular column followed by one
        column per aggregated feature, in the order listed below.
    """
    # output column -> (source column, aggregation); dict order fixes the
    # column order of the result.
    feature_spec = {
        'total_transactions': ('transaction_id', 'count'),
        'total_volume': ('amount', 'sum'),
        'avg_transaction_amount': ('amount', 'mean'),
        'max_transaction_amount': ('amount', 'max'),
        'international_ratio': ('is_international', 'mean'),
        'flagged_ratio': ('is_flagged', 'mean'),
        'avg_risk_score': ('risk_score', 'mean'),
        'unique_countries': ('origin_country', 'nunique'),
        'structuring_attempts': ('structuring_flag', 'sum'),
    }

    per_customer = df.groupby('customer_id')
    aggregated = per_customer.agg(**feature_spec)

    # Move customer_id out of the index so it is an ordinary column.
    return aggregated.reset_index()

def assign_kyc_tier(profile_df):
    """
    Assign a clustering-based KYC tier and a 0-100 risk score per customer.

    Profiles are min-max scaled, clustered with KMeans, and the clusters
    are ranked by their mean avg_risk_score. The ranked clusters are
    labelled 'Low' / 'Medium' / 'High' in ascending order of risk.

    Degenerate inputs are handled instead of raising:
      * an empty frame returns an empty copy with both output columns;
      * when fewer than three distinct profiles exist, fewer clusters are
        fitted (KMeans cannot fit more clusters than distinct points) and
        the tiers used are ['Low'] or ['Low', 'High'].

    Args:
        profile_df: One row per customer with the feature columns listed
            in `features` below (the columns build_customer_profiles
            produces). The input frame is not modified.

    Returns:
        A copy of profile_df with two extra columns: kyc_tier (str) and
        kyc_risk_score (float, avg_risk_score rescaled to 0-100).
    """
    profile_df = profile_df.copy()

    features = ['total_transactions', 'total_volume', 'avg_transaction_amount',
                'max_transaction_amount', 'international_ratio', 'flagged_ratio',
                'avg_risk_score', 'unique_countries', 'structuring_attempts']

    # Tier labels to use for each possible number of populated clusters.
    # NOTE(review): the 1- and 2-cluster labels are a policy choice for
    # inputs that previously crashed -- confirm with the KYC owners.
    tiers_by_cluster_count = {
        1: ['Low'],
        2: ['Low', 'High'],
        3: ['Low', 'Medium', 'High'],
    }

    # The scalers reject zero-row input, so short-circuit with the same
    # output schema callers get for non-empty data.
    if profile_df.empty:
        profile_df['kyc_tier'] = pd.Series(dtype='object', index=profile_df.index)
        profile_df['kyc_risk_score'] = pd.Series(dtype='float64', index=profile_df.index)
        return profile_df

    X = profile_df[features].fillna(0)

    # Normalize so no single feature dominates the distance metric.
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # KMeans needs n_samples >= n_clusters, and duplicate rows can leave
    # clusters empty, so never ask for more clusters than distinct points.
    n_distinct = len(np.unique(X_scaled, axis=0))
    n_clusters = min(3, n_distinct)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    profile_df['cluster'] = kmeans.fit_predict(X_scaled)

    # Rank clusters from lowest to highest mean risk. Only clusters that
    # actually received members appear here.
    cluster_risk = profile_df.groupby('cluster')['avg_risk_score'].mean().sort_values()

    tier_labels = tiers_by_cluster_count[len(cluster_risk)]
    tier_mapping = dict(zip(cluster_risk.index, tier_labels))

    profile_df['kyc_tier'] = profile_df['cluster'].map(tier_mapping)

    # Simple kyc_risk_score: the customer's avg_risk_score rescaled to
    # the 0-100 range across all profiles in this frame.
    risk_scaler = MinMaxScaler(feature_range=(0, 100))
    profile_df['kyc_risk_score'] = risk_scaler.fit_transform(
        profile_df[['avg_risk_score']]
    ).flatten()

    # The raw cluster id is an implementation detail; do not expose it.
    profile_df = profile_df.drop(columns=['cluster'])
    return profile_df