Spaces:
Sleeping
Sleeping
File size: 5,050 Bytes
6a2d0c6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
# Create sample CSV files for testing the fraud detection modules
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
# 1. Sample Transaction Data
def create_transaction_sample():
    """Generate 1,000 synthetic card transactions and write them to CSV.

    The global NumPy RNG is seeded with 42, so the output is reproducible.
    Two kinds of anomalies are injected into the ``amount`` column:

    * 50 rows get a very large amount (15,000-50,000) to mimic fraud.
    * 10 rows get a negative amount to mimic data-entry errors. These are
      drawn independently of the fraud rows, so an error row may overwrite
      a fraud row.

    Side effects:
        Reseeds ``np.random`` and writes ``sample_transactions.csv`` to the
        current working directory.

    Returns:
        pandas.DataFrame: the generated transactions (same content as the CSV).
    """
    np.random.seed(42)
    data = {
        'transaction_id': [f'TXN_{i:06d}' for i in range(1, 1001)],
        'customer_id': [f'CUST_{i:04d}' for i in np.random.randint(1, 501, 1000)],
        'amount': np.random.exponential(100, 1000),
        'merchant_category': np.random.choice(
            ['GROCERY', 'GAS', 'RESTAURANT', 'ONLINE', 'HIGH_RISK', 'ATM'], 1000
        ),
        # One transaction per hour. An explicit Timedelta is used instead of
        # the 'H' alias, which newer pandas releases deprecate.
        'timestamp': pd.date_range(
            start='2024-01-01', periods=1000, freq=pd.Timedelta(hours=1)
        ),
        'currency': 'USD',
    }

    # Add some fraudulent patterns: unusually large amounts.
    fraud_indices = np.random.choice(1000, 50, replace=False)
    data['amount'][fraud_indices] = np.random.uniform(15000, 50000, 50)

    # Add negative amounts (errors).
    error_indices = np.random.choice(1000, 10, replace=False)
    data['amount'][error_indices] = -np.random.uniform(100, 1000, 10)

    df = pd.DataFrame(data)
    df.to_csv('sample_transactions.csv', index=False)
    return df
# 2. Sample KYC Data
def create_kyc_sample():
    """Generate 100 synthetic KYC records and write them to CSV.

    The records deliberately contain data-quality problems for the
    validation code to find:

    * rows 5-9 share the e-mail ``duplicate@email.com``;
    * rows 15-17 share the phone number ``+15551234567``;
    * row 20 has a future date of birth, row 21 an unparseable one and
      row 22 an implausibly old one.

    The name pool also contains suspicious entries ('JOHN SMITH',
    'Test123', 'A B') that are sampled at random.

    Side effects:
        Reseeds ``np.random`` with 42 and writes ``sample_kyc.csv`` to the
        current working directory.

    Returns:
        pandas.DataFrame: the generated KYC records (same content as the CSV).
    """
    np.random.seed(42)
    names = ['John Smith', 'Jane Doe', 'Michael Johnson', 'Sarah Wilson', 'David Brown',
             'JOHN SMITH', 'Test123', 'A B', 'Maria Garcia', 'James Miller']

    emails = [f'user{i}@email.com' for i in range(1, 101)]
    phones = [f'+1555{i:07d}' for i in range(1, 101)]
    # strftime() returns an immutable pandas Index; convert it to a list so
    # individual entries can be overwritten below.
    dobs = pd.date_range(
        start='1950-01-01', end='2005-12-31', periods=100
    ).strftime('%Y-%m-%d').tolist()

    # Add duplicates. The right-hand side must be a list of the same length
    # as the slice: assigning a bare string would splice it in character by
    # character and change the list length.
    emails[5:10] = ['duplicate@email.com'] * 5
    phones[15:18] = ['+15551234567'] * 3

    # Add invalid DOBs.
    dobs[20] = '2030-01-01'    # Future date
    dobs[21] = 'invalid-date'  # Invalid format
    dobs[22] = '1900-01-01'    # Very old

    data = {
        'customer_id': [f'CUST_{i:04d}' for i in range(1, 101)],
        'name': np.random.choice(names, 100),
        'email': emails,
        'phone': phones,
        'dob': dobs,
        'address': [f'{i} Main St, City, State' for i in range(1, 101)],
    }

    df = pd.DataFrame(data)
    df.to_csv('sample_kyc.csv', index=False)
    return df
# 3. Sample Sanctions List
def create_sanctions_sample():
    """Build a fixed ten-entry sanctions list and write it to CSV.

    Each entry pairs a name with the list it appears on (OFAC, EU or UN).
    ``date_added`` starts at 2020-01-01 and advances 30 days per entry.

    Side effects:
        Writes ``sample_sanctions.csv`` to the current working directory.

    Returns:
        pandas.DataFrame: columns ``name``, ``list_type``, ``date_added``.
    """
    # (name, issuing list) pairs, in output order.
    entries = [
        ('John Doe', 'OFAC'),
        ('Jane Smith', 'EU'),
        ('Vladimir Putin', 'UN'),
        ('Kim Jong Un', 'UN'),
        ('Alexander Petrov', 'OFAC'),
        ('Maria Gonzalez', 'EU'),
        ('Ahmed Hassan', 'OFAC'),
        ('Natasha Volkov', 'EU'),
        ('Carlos Rodriguez', 'OFAC'),
        ('Wei Zhang', 'UN'),
    ]
    listed_names = [person for person, _ in entries]
    issuing_lists = [source for _, source in entries]
    added_on = pd.date_range(start='2020-01-01', periods=10, freq='30D')

    frame = pd.DataFrame({
        'name': listed_names,
        'list_type': issuing_lists,
        'date_added': added_on,
    })
    frame.to_csv('sample_sanctions.csv', index=False)
    return frame
# 4. Sample Customer List for Sanctions Screening
def create_customer_sample():
    """Generate 100 synthetic customers for sanctions screening.

    Names are sampled (with replacement) from a pool of twelve names,
    countries from a pool of five. Dates of birth are evenly spaced from
    1950-01-01 to 2000-12-31.

    Side effects:
        Reseeds ``np.random`` with 42 and writes ``sample_customers.csv``
        to the current working directory.

    Returns:
        pandas.DataFrame: columns ``customer_id``, ``name``, ``dob``,
        ``country``.
    """
    np.random.seed(42)

    name_pool = [
        'John Doe', 'Alice Johnson', 'Bob Wilson', 'Jane Smith',
        'Michael Brown', 'Sarah Davis', 'Vladimir Putin', 'Emily Chen',
        'David Miller', 'Lisa Anderson', 'Ahmed Hassan', 'Maria Lopez'
    ]
    country_pool = ['USA', 'UK', 'Canada', 'Germany', 'France']

    ids = [f'CUST_{number:04d}' for number in range(1, 101)]
    # Draw order matters for reproducibility: names first, then countries.
    picked_names = np.random.choice(name_pool, 100)
    birth_days = pd.date_range(start='1950-01-01', end='2000-12-31', periods=100)
    birth_days = birth_days.strftime('%Y-%m-%d')
    picked_countries = np.random.choice(country_pool, 100)

    frame = pd.DataFrame({
        'customer_id': ids,
        'name': picked_names,
        'dob': birth_days,
        'country': picked_countries,
    })
    frame.to_csv('sample_customers.csv', index=False)
    return frame
# 5. Sample Credit Risk Data
def create_credit_sample():
    """Generate 200 synthetic credit-risk profiles and write them to CSV.

    Baseline values are drawn from simple distributions, then 30 randomly
    chosen rows are overwritten with a high-risk profile: low credit score
    (300-599), high utilisation (0.8-1.0) and high debt-to-income (0.4-0.8).

    Credit scores are clipped to 300-850. The normal draw around 700 would
    otherwise yield values outside that range, which is the range implied
    by the high-risk override below.

    Side effects:
        Reseeds ``np.random`` with 42 and writes ``sample_credit.csv`` to
        the current working directory.

    Returns:
        pandas.DataFrame: the generated credit profiles (same content as the CSV).
    """
    np.random.seed(42)
    # Keep the draws in this order; reordering them changes every column.
    data = {
        'customer_id': [f'CUST_{i:04d}' for i in range(1, 201)],
        'credit_score': np.random.normal(700, 100, 200).astype(int),
        'utilization_rate': np.random.beta(2, 5, 200),
        'debt_to_income': np.random.beta(2, 3, 200),
        'income': np.random.lognormal(10.5, 0.5, 200).astype(int),
        'recent_defaults': np.random.poisson(0.1, 200),
        'credit_history_length': np.random.randint(1, 30, 200)
    }

    # Clipping consumes no random numbers, so every other column is unchanged.
    data['credit_score'] = np.clip(data['credit_score'], 300, 850)

    # Create some high-risk profiles.
    high_risk_indices = np.random.choice(200, 30, replace=False)
    data['credit_score'][high_risk_indices] = np.random.randint(300, 600, 30)
    data['utilization_rate'][high_risk_indices] = np.random.uniform(0.8, 1.0, 30)
    data['debt_to_income'][high_risk_indices] = np.random.uniform(0.4, 0.8, 30)

    df = pd.DataFrame(data)
    df.to_csv('sample_credit.csv', index=False)
    return df
# Create all sample files
if __name__ == "__main__":
    # Run every generator in turn; each one writes its own CSV into the
    # current working directory.
    print("Creating sample CSV files...")
    for build_sample in (
        create_transaction_sample,
        create_kyc_sample,
        create_sanctions_sample,
        create_customer_sample,
        create_credit_sample,
    ):
        build_sample()
    print("Sample files created successfully!")

    # Summarise what was written.
    print("\nFiles created:")
    for summary_line in (
        "- sample_transactions.csv",
        "- sample_kyc.csv",
        "- sample_sanctions.csv",
        "- sample_customers.csv",
        "- sample_credit.csv",
    ):
        print(summary_line)
|