File size: 5,050 Bytes
6a2d0c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# Create sample CSV files for testing the fraud detection modules
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# 1. Sample Transaction Data
def create_transaction_sample():
    """Build 1,000 synthetic transactions and save them as a CSV file.

    The random generator is seeded with 42, so the output is reproducible.
    Fifty randomly chosen rows receive a large amount (15,000-50,000) to
    imitate fraud, and ten randomly chosen rows receive a negative amount to
    imitate data-entry errors.  The two selections are drawn independently,
    so a row may be picked by both; the negative amount is applied last.

    Side effects:
        Reseeds NumPy's global random state and writes
        'sample_transactions.csv' (relative to the current working directory).

    Returns:
        pandas.DataFrame: columns transaction_id, customer_id, amount,
        merchant_category, timestamp and currency.
    """
    np.random.seed(42)
    data = {
        'transaction_id': [f'TXN_{i:06d}' for i in range(1, 1001)],
        'customer_id': [f'CUST_{i:04d}' for i in np.random.randint(1, 501, 1000)],
        'amount': np.random.exponential(100, 1000),
        'merchant_category': np.random.choice(['GROCERY', 'GAS', 'RESTAURANT', 'ONLINE', 'HIGH_RISK', 'ATM'], 1000),
        # A timedelta step is used instead of the 'H' alias: 'H' is deprecated
        # in recent pandas releases, while a timedelta is accepted by all of
        # them and yields the same hourly timestamps.
        'timestamp': pd.date_range(start='2024-01-01', periods=1000, freq=timedelta(hours=1)),
        'currency': 'USD'
    }

    # Add some fraudulent patterns
    fraud_indices = np.random.choice(1000, 50, replace=False)
    data['amount'][fraud_indices] = np.random.uniform(15000, 50000, 50)  # Large amounts

    # Add negative amounts (errors)
    error_indices = np.random.choice(1000, 10, replace=False)
    data['amount'][error_indices] = -np.random.uniform(100, 1000, 10)

    df = pd.DataFrame(data)
    df.to_csv('sample_transactions.csv', index=False)
    return df

# 2. Sample KYC Data
def create_kyc_sample():
    """Build 100 synthetic KYC records with seeded data-quality issues.

    The random generator is seeded with 42, so the output is reproducible.
    The following problems are injected on purpose (0-based row positions):

    * rows 5-9 share the e-mail 'duplicate@email.com';
    * rows 15-17 share the phone number '+15551234567';
    * row 20 has a future date of birth, row 21 an unparseable one and
      row 22 a very old one.

    Side effects:
        Reseeds NumPy's global random state and writes 'sample_kyc.csv'
        (relative to the current working directory).

    Returns:
        pandas.DataFrame: columns customer_id, name, email, phone, dob and
        address, 100 rows.
    """
    np.random.seed(42)
    names = ['John Smith', 'Jane Doe', 'Michael Johnson', 'Sarah Wilson', 'David Brown',
             'JOHN SMITH', 'Test123', 'A B', 'Maria Garcia', 'James Miller']

    data = {
        'customer_id': [f'CUST_{i:04d}' for i in range(1, 101)],
        'name': np.random.choice(names, 100),
        'email': [f'user{i}@email.com' for i in range(1, 101)],
        'phone': [f'+1555{i:07d}' for i in range(1, 101)],
        # strftime() returns an immutable pandas Index; convert it to a list
        # so individual entries can be overwritten below.
        'dob': pd.date_range(start='1950-01-01', end='2005-12-31', periods=100).strftime('%Y-%m-%d').tolist(),
        'address': [f'{i} Main St, City, State' for i in range(1, 101)]
    }

    # Add duplicates.  The replacement must be a list of the same length as
    # the slice: assigning a bare string to a list slice would splice in the
    # string's individual characters and change the column length.
    data['email'][5:10] = ['duplicate@email.com'] * 5
    data['phone'][15:18] = ['+15551234567'] * 3

    # Add invalid DOBs
    data['dob'][20] = '2030-01-01'  # Future date
    data['dob'][21] = 'invalid-date'  # Invalid format
    data['dob'][22] = '1900-01-01'  # Very old

    df = pd.DataFrame(data)
    df.to_csv('sample_kyc.csv', index=False)
    return df

# 3. Sample Sanctions List
def create_sanctions_sample():
    """Write a ten-entry sanctions list to 'sample_sanctions.csv' and return it.

    Each entry has a name, the list it appears on (OFAC, EU or UN) and a
    date_added value; the dates start at 2020-01-01 and are 30 days apart.

    Returns:
        pandas.DataFrame: columns name, list_type and date_added, 10 rows.
    """
    # (name, list_type) pairs, kept together so each pairing is easy to read.
    entries = [
        ('John Doe', 'OFAC'),
        ('Jane Smith', 'EU'),
        ('Vladimir Putin', 'UN'),
        ('Kim Jong Un', 'UN'),
        ('Alexander Petrov', 'OFAC'),
        ('Maria Gonzalez', 'EU'),
        ('Ahmed Hassan', 'OFAC'),
        ('Natasha Volkov', 'EU'),
        ('Carlos Rodriguez', 'OFAC'),
        ('Wei Zhang', 'UN'),
    ]

    sanctions = pd.DataFrame(entries, columns=['name', 'list_type'])
    # One date per entry, spaced 30 days apart.
    sanctions['date_added'] = pd.date_range(start='2020-01-01', periods=10, freq='30D')

    sanctions.to_csv('sample_sanctions.csv', index=False)
    return sanctions

# 4. Sample Customer List for Sanctions Screening
def create_customer_sample():
    """Write 100 synthetic customers to 'sample_customers.csv' and return them.

    Names are drawn at random from a pool that includes several names also
    present on the sample sanctions list, so screening has matches to find.
    The random generator is seeded with 42, so the output is reproducible.

    Returns:
        pandas.DataFrame: columns customer_id, name, dob and country,
        100 rows.
    """
    name_pool = [
        'John Doe', 'Alice Johnson', 'Bob Wilson', 'Jane Smith',
        'Michael Brown', 'Sarah Davis', 'Vladimir Putin', 'Emily Chen',
        'David Miller', 'Lisa Anderson', 'Ahmed Hassan', 'Maria Lopez'
    ]
    country_pool = ['USA', 'UK', 'Canada', 'Germany', 'France']

    np.random.seed(42)
    # Draw order matters for reproducibility: names first, then countries.
    picked_names = np.random.choice(name_pool, 100)
    picked_countries = np.random.choice(country_pool, 100)

    # Evenly spaced birth dates across the range, rendered as ISO strings.
    birth_dates = pd.date_range(start='1950-01-01', end='2000-12-31', periods=100)
    dob_strings = birth_dates.strftime('%Y-%m-%d')

    customers = pd.DataFrame({
        'customer_id': [f'CUST_{i:04d}' for i in range(1, 101)],
        'name': picked_names,
        'dob': dob_strings,
        'country': picked_countries,
    })
    customers.to_csv('sample_customers.csv', index=False)
    return customers

# 5. Sample Credit Risk Data
def create_credit_sample():
    """Build 200 synthetic credit-risk profiles and save them as a CSV file.

    The random generator is seeded with 42, so the output is reproducible.
    Thirty randomly chosen customers are overwritten with a high-risk
    profile: a credit score of 300-599, a utilization rate of 0.8-1.0 and a
    debt-to-income ratio of 0.4-0.8.

    Credit scores are sampled from a normal distribution, which is
    unbounded, so they are clipped to the 300-850 range to keep every
    generated score valid.

    Side effects:
        Reseeds NumPy's global random state and writes 'sample_credit.csv'
        (relative to the current working directory).

    Returns:
        pandas.DataFrame: columns customer_id, credit_score,
        utilization_rate, debt_to_income, income, recent_defaults and
        credit_history_length, 200 rows.
    """
    np.random.seed(42)
    data = {
        'customer_id': [f'CUST_{i:04d}' for i in range(1, 201)],
        # Clip after sampling: the tails of the normal distribution would
        # otherwise produce scores outside the 300-850 range.
        'credit_score': np.clip(np.random.normal(700, 100, 200).astype(int), 300, 850),
        'utilization_rate': np.random.beta(2, 5, 200),
        'debt_to_income': np.random.beta(2, 3, 200),
        'income': np.random.lognormal(10.5, 0.5, 200).astype(int),
        'recent_defaults': np.random.poisson(0.1, 200),
        'credit_history_length': np.random.randint(1, 30, 200)
    }

    # Create some high-risk profiles
    high_risk_indices = np.random.choice(200, 30, replace=False)
    data['credit_score'][high_risk_indices] = np.random.randint(300, 600, 30)
    data['utilization_rate'][high_risk_indices] = np.random.uniform(0.8, 1.0, 30)
    data['debt_to_income'][high_risk_indices] = np.random.uniform(0.4, 0.8, 30)

    df = pd.DataFrame(data)
    df.to_csv('sample_credit.csv', index=False)
    return df

# Create all sample files
if __name__ == "__main__":
    # Run every generator in turn; each one writes its own CSV file.
    print("Creating sample CSV files...")
    sample_builders = (
        create_transaction_sample,
        create_kyc_sample,
        create_sanctions_sample,
        create_customer_sample,
        create_credit_sample,
    )
    for build_sample in sample_builders:
        build_sample()
    print("Sample files created successfully!")

    # Summarize the files that were written, one per line.
    print("\nFiles created:")
    created_lines = [
        "- sample_transactions.csv",
        "- sample_kyc.csv",
        "- sample_sanctions.csv",
        "- sample_customers.csv",
        "- sample_credit.csv",
    ]
    print("\n".join(created_lines))