File size: 3,933 Bytes
7d391cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import datetime
import random
import sys
from pathlib import Path

import numpy as np
import pandas as pd

def _sample_unused(num_rows, used, count):
    """Pick ``count`` distinct row indices not yet in ``used`` and reserve them.

    ``used`` is updated in place so later pattern injections cannot overwrite
    the rows chosen here.
    """
    candidates = [i for i in range(num_rows) if i not in used]
    picked = random.sample(candidates, count)
    used.update(picked)
    return picked


def generate_sample_data(filepath="sample_transactions.csv", end_date=None):
    """Write a synthetic transaction CSV seeded with suspicious patterns.

    Builds 1000 transactions for 50 customers (``CUST_0001``..``CUST_0050``)
    with timestamps spread over the 30 days ending at ``end_date``, then
    overwrites a few disjoint groups of rows with specific patterns:

    1. 10 amounts between 9000 and 9999,
    2. 8 ``CASH_OUT`` rows above 50K from ``US`` to a non-US country,
    3. 6 rows with ``account_age_days`` below 30 and an amount above 10K,
    4. 5 exact round amounts (10000, 50000, 100000),
    5. for 5 customers, five transactions placed 1..5 minutes after that
       customer's first transaction.

    The rows are finally sorted by timestamp and written without the index.

    Args:
        filepath: Destination of the CSV. When it is a ``str`` or ``Path``,
            missing parent directories are created first.
        end_date: End of the 30-day timestamp window. Defaults to
            ``datetime.datetime.now()``; pass a fixed value to get identical
            output on every run (the random seeds are already fixed at 42).

    Side effects:
        Reseeds the global ``random`` and ``numpy.random`` generators with 42,
        writes the CSV and prints a confirmation line.
    """
    np.random.seed(42)
    random.seed(42)

    num_rows = 1000
    customer_ids = [f"CUST_{str(i).zfill(4)}" for i in range(1, 51)]

    if end_date is None:
        end_date = datetime.datetime.now()
    start_date = end_date - datetime.timedelta(days=30)
    # Hoisted: the window width is the same for every generated timestamp.
    window_seconds = int((end_date - start_date).total_seconds())

    timestamps = sorted(
        start_date + datetime.timedelta(seconds=random.randint(0, window_seconds))
        for _ in range(num_rows)
    )

    tx_types = (
        ['TRANSFER'] * 400
        + ['CASH_OUT'] * 250
        + ['PAYMENT'] * 200
        + ['DEBIT'] * 100
        + ['CASH_IN'] * 50
    )
    random.shuffle(tx_types)

    countries = ['US', 'GB', 'CN', 'NG', 'RU', 'DE', 'BR', 'MX']
    foreign_countries = [c for c in countries if c != 'US']

    # 5 customers with rapid sequential transactions
    rapid_customers = random.sample(customer_ids, 5)

    # NOTE: the order of the random draws below determines the generated
    # data; keep it stable so a given seed/end_date keeps producing the same file.
    data = []
    for i, (timestamp, tx_type) in enumerate(zip(timestamps, tx_types)):
        cid = random.choice(customer_ids)
        amount = round(random.uniform(100, 5000), 2)

        # Outliers up to 500K
        if random.random() < 0.05:
            amount = round(random.uniform(5000, 500000), 2)

        orig = 'US' if random.random() < 0.7 else random.choice(countries)
        dest = 'US' if random.random() < 0.7 else random.choice(countries)
        age = random.randint(7, 3650)

        data.append({
            'transaction_id': f"TXN_{str(i + 1).zfill(6)}",
            'customer_id': cid,
            'amount': amount,
            'timestamp': timestamp,
            'transaction_type': tx_type,
            'origin_country': orig,
            'dest_country': dest,
            'account_age_days': age,
        })

    df = pd.DataFrame(data)

    # Inject suspicious patterns. `used` tracks rows already claimed by a
    # pattern so that each pattern lands on its own, untouched rows.
    used = set()

    # 1. 10 structuring transactions ($9000-$9999)
    for idx in _sample_unused(num_rows, used, 10):
        df.at[idx, 'amount'] = round(random.uniform(9000, 9999), 2)

    # 2. 8 large international cash outs > $50K
    for idx in _sample_unused(num_rows, used, 8):
        df.at[idx, 'transaction_type'] = 'CASH_OUT'
        df.at[idx, 'amount'] = round(random.uniform(50001, 150000), 2)
        df.at[idx, 'origin_country'] = 'US'
        df.at[idx, 'dest_country'] = random.choice(foreign_countries)

    # 3. 6 dormant account spikes (age < 30 days, amount > $10K)
    for idx in _sample_unused(num_rows, used, 6):
        df.at[idx, 'account_age_days'] = random.randint(1, 29)
        df.at[idx, 'amount'] = round(random.uniform(10001, 50000), 2)

    # 4. 5 exact round amounts
    round_amounts = [10000, 50000, 100000, 10000, 50000]
    for idx, round_amount in zip(_sample_unused(num_rows, used, 5), round_amounts):
        df.at[idx, 'amount'] = float(round_amount)

    # 5. Rapid sequential transactions for the 5 customers
    # (cluster five of their timestamps right after their first transaction)
    for rc in rapid_customers:
        rc_indices = df[df['customer_id'] == rc].index.tolist()
        # Needs the anchor row plus five more rows to re-time.
        if len(rc_indices) > 5:
            base_time = df.at[rc_indices[0], 'timestamp']
            for j in range(1, 6):
                df.at[rc_indices[j], 'timestamp'] = base_time + datetime.timedelta(minutes=j)

    df = df.sort_values(by='timestamp').reset_index(drop=True)

    # to_csv does not create missing directories; do it here so a fresh
    # checkout (or a nested output path) does not fail with OSError.
    if isinstance(filepath, (str, Path)):
        Path(filepath).parent.mkdir(parents=True, exist_ok=True)

    df.to_csv(filepath, index=False)
    print(f"Generated {filepath} successfully with {len(df)} rows.")

if __name__ == "__main__":
    # The default destination is specific to the original author's machine, so
    # an optional first command-line argument overrides it; with no argument
    # the script writes to the same location as before.
    default_output = "/Users/ajaykasu/Downloads/AML Shield/aml-shield/sample_data/sample_transactions.csv"
    output_path = sys.argv[1] if len(sys.argv) > 1 else default_output
    generate_sample_data(output_path)