Spaces:
Sleeping
Sleeping
Add sample data generator and CSV templates for fraud detection modules
Browse files
- Created create_samples.py script to generate test data
- Generated sample CSV files for all fraud detection modules:
* sample_transactions.csv - Transaction fraud data with anomalies
* sample_kyc.csv - KYC customer data with duplicate/invalid records
* sample_sanctions.csv - Sanctions watchlist for screening
* sample_customers.csv - Customer list for sanctions screening
* sample_credit.csv - Credit risk profiles with high-risk indicators
- Updated requirements.txt with necessary dependencies
- Ready for immediate testing and demonstration
- create_samples.py +133 -0
create_samples.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Create sample CSV files for testing the fraud detection modules
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
from datetime import datetime, timedelta
|
| 5 |
+
|
| 6 |
+
# 1. Sample Transaction Data
|
| 7 |
+
def create_transaction_sample():
    """Generate 1,000 synthetic transactions and write them to sample_transactions.csv.

    Columns: transaction_id, customer_id, amount, merchant_category,
    timestamp (hourly steps starting 2024-01-01) and currency (always 'USD').

    Two kinds of anomalies are injected into ``amount``:
      * 50 randomly chosen rows get a large value in [15000, 50000).
      * 10 randomly chosen rows get a negative value in (-1000, -100].
    The two index sets are drawn independently, so a row may be picked by
    both; the negative value is applied last and wins in that case.

    The NumPy global random state is seeded with 42, so repeated calls
    produce the same data.

    Returns:
        pandas.DataFrame: the generated rows, as written to the CSV file.
    """
    np.random.seed(42)
    n_rows = 1000

    data = {
        'transaction_id': [f'TXN_{i:06d}' for i in range(1, n_rows + 1)],
        'customer_id': [f'CUST_{i:04d}' for i in np.random.randint(1, 501, n_rows)],
        'amount': np.random.exponential(100, n_rows),
        'merchant_category': np.random.choice(
            ['GROCERY', 'GAS', 'RESTAURANT', 'ONLINE', 'HIGH_RISK', 'ATM'], n_rows
        ),
        # An explicit Timedelta step is used instead of the 'H' string alias,
        # which newer pandas releases deprecate.
        'timestamp': pd.date_range(
            start='2024-01-01', periods=n_rows, freq=pd.Timedelta(hours=1)
        ),
        'currency': 'USD',
    }

    # Add some fraudulent patterns: unusually large amounts
    fraud_indices = np.random.choice(n_rows, 50, replace=False)
    data['amount'][fraud_indices] = np.random.uniform(15000, 50000, 50)

    # Add negative amounts (data-entry style errors)
    error_indices = np.random.choice(n_rows, 10, replace=False)
    data['amount'][error_indices] = -np.random.uniform(100, 1000, 10)

    df = pd.DataFrame(data)
    df.to_csv('sample_transactions.csv', index=False)
    return df
|
| 29 |
+
|
| 30 |
+
# 2. Sample KYC Data
|
| 31 |
+
def create_kyc_sample():
    """Generate 100 synthetic KYC customer records and write them to sample_kyc.csv.

    Columns: customer_id, name, email, phone, dob, address.

    Deliberately bad records are injected so downstream checks have
    something to find:
      * rows 5-9 share the email 'duplicate@email.com';
      * rows 15-17 share the phone '+15551234567';
      * row 20 has a future DOB, row 21 a malformed DOB, row 22 a very old DOB.
    The name pool itself contains near-duplicates and junk values
    ('JOHN SMITH', 'Test123', 'A B').

    The NumPy global random state is seeded with 42, so repeated calls
    produce the same data.

    Returns:
        pandas.DataFrame: the generated rows, as written to the CSV file.
    """
    np.random.seed(42)
    names = ['John Smith', 'Jane Doe', 'Michael Johnson', 'Sarah Wilson', 'David Brown',
             'JOHN SMITH', 'Test123', 'A B', 'Maria Garcia', 'James Miller']

    emails = [f'user{i}@email.com' for i in range(1, 101)]
    phones = [f'+1555{i:07d}' for i in range(1, 101)]
    # strftime() returns an immutable pandas Index; copy it into a list so
    # individual entries can be overwritten below.
    dobs = list(
        pd.date_range(start='1950-01-01', end='2005-12-31', periods=100).strftime('%Y-%m-%d')
    )

    # Add duplicates. The right-hand side must be a list of the same length:
    # assigning a bare string to a list slice would splice in its characters
    # one by one and change the list length.
    emails[5:10] = ['duplicate@email.com'] * 5
    phones[15:18] = ['+15551234567'] * 3

    # Add invalid DOBs
    dobs[20] = '2030-01-01'    # Future date
    dobs[21] = 'invalid-date'  # Invalid format
    dobs[22] = '1900-01-01'    # Very old

    data = {
        'customer_id': [f'CUST_{i:04d}' for i in range(1, 101)],
        'name': np.random.choice(names, 100),
        'email': emails,
        'phone': phones,
        'dob': dobs,
        'address': [f'{i} Main St, City, State' for i in range(1, 101)],
    }

    df = pd.DataFrame(data)
    df.to_csv('sample_kyc.csv', index=False)
    return df
|
| 57 |
+
|
| 58 |
+
# 3. Sample Sanctions List
|
| 59 |
+
def create_sanctions_sample():
    """Build a 10-entry sanctions watchlist and write it to sample_sanctions.csv.

    Each entry has a name, the list it belongs to (OFAC, EU or UN) and a
    date_added value; dates start at 2020-01-01 and advance in 30-day steps.

    Returns:
        pandas.DataFrame: the watchlist rows, as written to the CSV file.
    """
    # (name, list_type) pairs, kept together so the pairing is easy to read.
    watchlist = [
        ('John Doe', 'OFAC'),
        ('Jane Smith', 'EU'),
        ('Vladimir Putin', 'UN'),
        ('Kim Jong Un', 'UN'),
        ('Alexander Petrov', 'OFAC'),
        ('Maria Gonzalez', 'EU'),
        ('Ahmed Hassan', 'OFAC'),
        ('Natasha Volkov', 'EU'),
        ('Carlos Rodriguez', 'OFAC'),
        ('Wei Zhang', 'UN'),
    ]
    listed_names = [entry[0] for entry in watchlist]
    source_lists = [entry[1] for entry in watchlist]
    added_on = pd.date_range(start='2020-01-01', periods=10, freq='30D')

    frame = pd.DataFrame({
        'name': listed_names,
        'list_type': source_lists,
        'date_added': added_on,
    })
    frame.to_csv('sample_sanctions.csv', index=False)
    return frame
|
| 75 |
+
|
| 76 |
+
# 4. Sample Customer List for Sanctions Screening
|
| 77 |
+
def create_customer_sample():
    """Generate 100 customers for sanctions screening and write sample_customers.csv.

    Columns: customer_id, name, dob, country. Names are drawn at random from
    a fixed pool, and countries from a fixed list of five. DOBs are evenly
    spaced between 1950-01-01 and 2000-12-31.

    The NumPy global random state is seeded with 42, so repeated calls
    produce the same data.

    Returns:
        pandas.DataFrame: the generated rows, as written to the CSV file.
    """
    np.random.seed(42)

    name_pool = [
        'John Doe', 'Alice Johnson', 'Bob Wilson', 'Jane Smith',
        'Michael Brown', 'Sarah Davis', 'Vladimir Putin', 'Emily Chen',
        'David Miller', 'Lisa Anderson', 'Ahmed Hassan', 'Maria Lopez'
    ]
    country_pool = ['USA', 'UK', 'Canada', 'Germany', 'France']

    ids = [f'CUST_{i:04d}' for i in range(1, 101)]
    # Names are drawn before countries to keep the random stream order fixed.
    drawn_names = np.random.choice(name_pool, 100)
    birth_dates = pd.date_range(
        start='1950-01-01', end='2000-12-31', periods=100
    ).strftime('%Y-%m-%d')
    drawn_countries = np.random.choice(country_pool, 100)

    frame = pd.DataFrame({
        'customer_id': ids,
        'name': drawn_names,
        'dob': birth_dates,
        'country': drawn_countries,
    })
    frame.to_csv('sample_customers.csv', index=False)
    return frame
|
| 95 |
+
|
| 96 |
+
# 5. Sample Credit Risk Data
|
| 97 |
+
def create_credit_sample():
    """Generate 200 synthetic credit-risk profiles and write them to sample_credit.csv.

    Columns: customer_id, credit_score, utilization_rate, debt_to_income,
    income, recent_defaults, credit_history_length.

    Base credit scores are drawn from a normal distribution (mean 700,
    sd 100) and clipped to 300-850 so the tails of the distribution cannot
    fall outside that range. Thirty randomly chosen rows are then
    overwritten with a high-risk profile: score in [300, 600), utilization
    in [0.8, 1.0) and debt-to-income in [0.4, 0.8).

    The NumPy global random state is seeded with 42, so repeated calls
    produce the same data.

    Returns:
        pandas.DataFrame: the generated rows, as written to the CSV file.
    """
    np.random.seed(42)
    data = {
        'customer_id': [f'CUST_{i:04d}' for i in range(1, 201)],
        # Clip before the integer cast; the draw itself is unchanged, so the
        # rest of the random stream is unaffected.
        'credit_score': np.clip(np.random.normal(700, 100, 200), 300, 850).astype(int),
        'utilization_rate': np.random.beta(2, 5, 200),
        'debt_to_income': np.random.beta(2, 3, 200),
        'income': np.random.lognormal(10.5, 0.5, 200).astype(int),
        'recent_defaults': np.random.poisson(0.1, 200),
        'credit_history_length': np.random.randint(1, 30, 200),
    }

    # Create some high-risk profiles
    high_risk_indices = np.random.choice(200, 30, replace=False)
    data['credit_score'][high_risk_indices] = np.random.randint(300, 600, 30)
    data['utilization_rate'][high_risk_indices] = np.random.uniform(0.8, 1.0, 30)
    data['debt_to_income'][high_risk_indices] = np.random.uniform(0.4, 0.8, 30)

    df = pd.DataFrame(data)
    df.to_csv('sample_credit.csv', index=False)
    return df
|
| 118 |
+
|
| 119 |
+
# Create all sample files
|
| 120 |
+
# Script entry point: build every sample file, then list what was written.
if __name__ == "__main__":
    print("Creating sample CSV files...")

    # Run each generator in turn; each writes its CSV to the working directory.
    for build_sample in (
        create_transaction_sample,
        create_kyc_sample,
        create_sanctions_sample,
        create_customer_sample,
        create_credit_sample,
    ):
        build_sample()

    print("Sample files created successfully!")
    print("\nFiles created:")
    for csv_name in (
        "sample_transactions.csv",
        "sample_kyc.csv",
        "sample_sanctions.csv",
        "sample_customers.csv",
        "sample_credit.csv",
    ):
        print(f"- {csv_name}")
|