soupstick committed on
Commit
6a2d0c6
·
1 Parent(s): 2132a21

Add sample data generator and CSV templates for fraud detection modules

Browse files

- Created create_samples.py script to generate test data
- Generated sample CSV files for all fraud detection modules:
* sample_transactions.csv - Transaction fraud data with anomalies
* sample_kyc.csv - KYC customer data with duplicate/invalid records
* sample_sanctions.csv - Sanctions watchlist for screening
* sample_customers.csv - Customer list for sanctions screening
* sample_credit.csv - Credit risk profiles with high-risk indicators
- Updated requirements.txt with necessary dependencies
- Ready for immediate testing and demonstration

Files changed (1) hide show
  1. create_samples.py +133 -0
create_samples.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Create sample CSV files for testing the fraud detection modules
2
+ import pandas as pd
3
+ import numpy as np
4
+ from datetime import datetime, timedelta
5
+
6
+ # 1. Sample Transaction Data
7
def create_transaction_sample():
    """Generate 1,000 synthetic transactions and write ``sample_transactions.csv``.

    NumPy's global RNG is seeded with 42, so every call produces the same
    data. Amounts are drawn from an exponential distribution, after which 50
    rows are overwritten with large values (15,000-50,000) and 10 rows are
    overwritten with negative values to act as data errors.

    Returns:
        pandas.DataFrame: The frame that was written to the CSV file.
    """
    np.random.seed(42)
    n_rows = 1000
    categories = ['GROCERY', 'GAS', 'RESTAURANT', 'ONLINE', 'HIGH_RISK', 'ATM']

    # The random draws below stay in a fixed order (customer ids, amounts,
    # categories) so the seeded output is reproducible.
    data = {
        'transaction_id': [f'TXN_{i:06d}' for i in range(1, n_rows + 1)],
        'customer_id': [f'CUST_{i:04d}' for i in np.random.randint(1, 501, n_rows)],
        'amount': np.random.exponential(100, n_rows),
        'merchant_category': np.random.choice(categories, n_rows),
        # An explicit Timedelta is used instead of the 'H' alias: the uppercase
        # alias is deprecated in recent pandas and the lowercase one is not
        # accepted by older releases, whereas a Timedelta works on both.
        'timestamp': pd.date_range(
            start='2024-01-01', periods=n_rows, freq=pd.Timedelta(hours=1)
        ),
        'currency': 'USD',
    }

    # Add some fraudulent patterns: unusually large amounts.
    fraud_indices = np.random.choice(n_rows, 50, replace=False)
    data['amount'][fraud_indices] = np.random.uniform(15000, 50000, 50)

    # Add negative amounts (errors). These rows are picked independently of
    # the fraud rows, so a row chosen by both ends up negative because this
    # assignment happens last.
    error_indices = np.random.choice(n_rows, 10, replace=False)
    data['amount'][error_indices] = -np.random.uniform(100, 1000, 10)

    df = pd.DataFrame(data)
    df.to_csv('sample_transactions.csv', index=False)
    return df
29
+
30
+ # 2. Sample KYC Data
31
def create_kyc_sample():
    """Generate 100 synthetic KYC records and write ``sample_kyc.csv``.

    NumPy's global RNG is seeded with 42, so the name column is reproducible.
    After the clean records are built, a few rows are deliberately corrupted:

    * rows 5-9 share the e-mail ``duplicate@email.com``;
    * rows 15-17 share the phone number ``+15551234567``;
    * rows 20, 21 and 22 get a future, a malformed and a very old date of
      birth respectively.

    Returns:
        pandas.DataFrame: The frame that was written to the CSV file.
    """
    np.random.seed(42)
    names = ['John Smith', 'Jane Doe', 'Michael Johnson', 'Sarah Wilson', 'David Brown',
             'JOHN SMITH', 'Test123', 'A B', 'Maria Garcia', 'James Miller']

    emails = [f'user{i}@email.com' for i in range(1, 101)]
    phones = [f'+1555{i:07d}' for i in range(1, 101)]
    # ``.strftime`` on a DatetimeIndex returns an immutable pandas Index;
    # copy it into a list so individual entries can be overwritten below.
    dobs = list(
        pd.date_range(start='1950-01-01', end='2005-12-31', periods=100).strftime('%Y-%m-%d')
    )

    # Add duplicates. The replacement must be a list of the same length as the
    # slice: assigning a bare string would splice in its characters one by one
    # and change the length of the column.
    emails[5:10] = ['duplicate@email.com'] * 5
    phones[15:18] = ['+15551234567'] * 3

    # Add invalid DOBs
    dobs[20] = '2030-01-01'    # Future date
    dobs[21] = 'invalid-date'  # Invalid format
    dobs[22] = '1900-01-01'    # Very old

    data = {
        'customer_id': [f'CUST_{i:04d}' for i in range(1, 101)],
        'name': np.random.choice(names, 100),
        'email': emails,
        'phone': phones,
        'dob': dobs,
        'address': [f'{i} Main St, City, State' for i in range(1, 101)],
    }

    df = pd.DataFrame(data)
    df.to_csv('sample_kyc.csv', index=False)
    return df
57
+
58
+ # 3. Sample Sanctions List
59
def create_sanctions_sample():
    """Write a fixed ten-entry sanctions watchlist to ``sample_sanctions.csv``.

    Each entry pairs a name with the list it appears on (``OFAC``, ``EU`` or
    ``UN``). The ``date_added`` values start at 2020-01-01 and advance in
    30-day steps, one per entry.

    Returns:
        pandas.DataFrame: The frame that was written to the CSV file.
    """
    # (name, list_type) pairs, kept in the order they appear in the output.
    watchlist = [
        ('John Doe', 'OFAC'),
        ('Jane Smith', 'EU'),
        ('Vladimir Putin', 'UN'),
        ('Kim Jong Un', 'UN'),
        ('Alexander Petrov', 'OFAC'),
        ('Maria Gonzalez', 'EU'),
        ('Ahmed Hassan', 'OFAC'),
        ('Natasha Volkov', 'EU'),
        ('Carlos Rodriguez', 'OFAC'),
        ('Wei Zhang', 'UN'),
    ]
    listed_names = [entry[0] for entry in watchlist]
    source_lists = [entry[1] for entry in watchlist]
    added_on = pd.date_range(start='2020-01-01', periods=10, freq='30D')

    frame = pd.DataFrame({
        'name': listed_names,
        'list_type': source_lists,
        'date_added': added_on,
    })
    frame.to_csv('sample_sanctions.csv', index=False)
    return frame
75
+
76
+ # 4. Sample Customer List for Sanctions Screening
77
def create_customer_sample():
    """Generate 100 customers for sanctions screening and write ``sample_customers.csv``.

    NumPy's global RNG is seeded with 42. Names are sampled with replacement
    from a fixed pool of twelve, and countries from a fixed pool of five.
    Dates of birth are evenly spaced between 1950-01-01 and 2000-12-31 and
    stored as ``YYYY-MM-DD`` strings.

    Returns:
        pandas.DataFrame: The frame that was written to the CSV file.
    """
    np.random.seed(42)
    name_pool = [
        'John Doe', 'Alice Johnson', 'Bob Wilson', 'Jane Smith',
        'Michael Brown', 'Sarah Davis', 'Vladimir Putin', 'Emily Chen',
        'David Miller', 'Lisa Anderson', 'Ahmed Hassan', 'Maria Lopez'
    ]
    country_pool = ['USA', 'UK', 'Canada', 'Germany', 'France']

    ids = [f'CUST_{i:04d}' for i in range(1, 101)]
    # Names are drawn before countries; the order of these two draws
    # determines the seeded output.
    sampled_names = np.random.choice(name_pool, 100)
    birth_dates = pd.date_range(start='1950-01-01', end='2000-12-31', periods=100)
    birth_dates = birth_dates.strftime('%Y-%m-%d')
    sampled_countries = np.random.choice(country_pool, 100)

    frame = pd.DataFrame({
        'customer_id': ids,
        'name': sampled_names,
        'dob': birth_dates,
        'country': sampled_countries,
    })
    frame.to_csv('sample_customers.csv', index=False)
    return frame
95
+
96
+ # 5. Sample Credit Risk Data
97
def create_credit_sample():
    """Generate 200 synthetic credit profiles and write ``sample_credit.csv``.

    NumPy's global RNG is seeded with 42. Baseline values are drawn per
    column, then 30 randomly chosen rows are overwritten with a low credit
    score (300-599), a utilization rate of 0.8-1.0 and a debt-to-income
    ratio of 0.4-0.8 to act as high-risk profiles.

    Returns:
        pandas.DataFrame: The frame that was written to the CSV file.
    """
    np.random.seed(42)

    ids = [f'CUST_{i:04d}' for i in range(1, 201)]
    # Baseline draws; their order is fixed so the seeded output is stable.
    scores = np.random.normal(700, 100, 200).astype(int)
    utilization = np.random.beta(2, 5, 200)
    dti = np.random.beta(2, 3, 200)
    incomes = np.random.lognormal(10.5, 0.5, 200).astype(int)
    defaults = np.random.poisson(0.1, 200)
    history_years = np.random.randint(1, 30, 200)

    # Create some high-risk profiles by overwriting a random subset of rows.
    risky_rows = np.random.choice(200, 30, replace=False)
    scores[risky_rows] = np.random.randint(300, 600, 30)
    utilization[risky_rows] = np.random.uniform(0.8, 1.0, 30)
    dti[risky_rows] = np.random.uniform(0.4, 0.8, 30)

    frame = pd.DataFrame({
        'customer_id': ids,
        'credit_score': scores,
        'utilization_rate': utilization,
        'debt_to_income': dti,
        'income': incomes,
        'recent_defaults': defaults,
        'credit_history_length': history_years,
    })
    frame.to_csv('sample_credit.csv', index=False)
    return frame
118
+
119
+ # Create all sample files
120
if __name__ == "__main__":
    # Script entry point: run every generator, then list the files written.
    print("Creating sample CSV files...")

    # Each generator writes its own CSV into the current working directory.
    for generate in (
        create_transaction_sample,
        create_kyc_sample,
        create_sanctions_sample,
        create_customer_sample,
        create_credit_sample,
    ):
        generate()

    print("Sample files created successfully!")
    print("\nFiles created:")
    for summary_line in (
        "- sample_transactions.csv",
        "- sample_kyc.csv",
        "- sample_sanctions.csv",
        "- sample_customers.csv",
        "- sample_credit.csv",
    ):
        print(summary_line)