Spaces:
Sleeping
Sleeping
| # Create sample CSV files for testing the fraud detection modules | |
| import pandas as pd | |
| import numpy as np | |
| from datetime import datetime, timedelta | |
| # 1. Sample Transaction Data | |
def create_transaction_sample():
    """Generate 1,000 synthetic transactions and write them to CSV.

    The global NumPy RNG is seeded with 42, so repeated calls produce the
    same data. Amounts are drawn from an exponential distribution, then two
    kinds of anomalies are injected for detection modules to find:

    * 50 randomly chosen rows get a large amount in [15000, 50000).
    * 10 randomly chosen rows get a negative amount in (-1000, -100].

    The two selections are drawn independently, so a row may be picked by
    both; the negative amount is applied last and wins in that case.

    Returns:
        pandas.DataFrame: The generated rows. The same data is also written
        to ``sample_transactions.csv`` in the current working directory.
    """
    np.random.seed(42)
    data = {
        'transaction_id': [f'TXN_{i:06d}' for i in range(1, 1001)],
        'customer_id': [f'CUST_{i:04d}' for i in np.random.randint(1, 501, 1000)],
        'amount': np.random.exponential(100, 1000),
        'merchant_category': np.random.choice(
            ['GROCERY', 'GAS', 'RESTAURANT', 'ONLINE', 'HIGH_RISK', 'ATM'], 1000
        ),
        # One row per hour. A timedelta step is used instead of the 'H'
        # string alias, which newer pandas releases deprecate.
        'timestamp': pd.date_range(
            start='2024-01-01', periods=1000, freq=timedelta(hours=1)
        ),
        'currency': 'USD',
    }

    # Add some fraudulent patterns: unusually large amounts.
    fraud_indices = np.random.choice(1000, 50, replace=False)
    data['amount'][fraud_indices] = np.random.uniform(15000, 50000, 50)

    # Add negative amounts (data errors).
    error_indices = np.random.choice(1000, 10, replace=False)
    data['amount'][error_indices] = -np.random.uniform(100, 1000, 10)

    df = pd.DataFrame(data)
    df.to_csv('sample_transactions.csv', index=False)
    return df
| # 2. Sample KYC Data | |
def create_kyc_sample():
    """Generate 100 synthetic KYC records and write them to CSV.

    The global NumPy RNG is seeded with 42, so the randomly assigned names
    are reproducible. The name pool deliberately contains oddities
    ('JOHN SMITH', 'Test123', 'A B'), presumably to exercise name checks.
    Known-bad values are then planted at fixed positions:

    * rows 5-9 share the email ``duplicate@email.com``
    * rows 15-17 share the phone ``+15551234567``
    * row 20 has a future DOB, row 21 an unparseable DOB, row 22 a very
      old DOB

    Returns:
        pandas.DataFrame: The generated rows. The same data is also written
        to ``sample_kyc.csv`` in the current working directory.
    """
    np.random.seed(42)
    names = ['John Smith', 'Jane Doe', 'Michael Johnson', 'Sarah Wilson', 'David Brown',
             'JOHN SMITH', 'Test123', 'A B', 'Maria Garcia', 'James Miller']

    emails = [f'user{i}@email.com' for i in range(1, 101)]
    phones = [f'+1555{i:07d}' for i in range(1, 101)]
    # strftime() returns an immutable pandas Index; convert it to a list so
    # individual entries can be overwritten below.
    dobs = pd.date_range(
        start='1950-01-01', end='2005-12-31', periods=100
    ).strftime('%Y-%m-%d').tolist()

    # Add duplicates. The replacement must be a list of the same length as
    # the slice: assigning a bare string would splice in its characters and
    # change the list length.
    emails[5:10] = ['duplicate@email.com'] * 5
    phones[15:18] = ['+15551234567'] * 3

    # Add invalid DOBs.
    dobs[20] = '2030-01-01'    # Future date
    dobs[21] = 'invalid-date'  # Invalid format
    dobs[22] = '1900-01-01'    # Very old

    data = {
        'customer_id': [f'CUST_{i:04d}' for i in range(1, 101)],
        'name': np.random.choice(names, 100),
        'email': emails,
        'phone': phones,
        'dob': dobs,
        'address': [f'{i} Main St, City, State' for i in range(1, 101)],
    }
    df = pd.DataFrame(data)
    df.to_csv('sample_kyc.csv', index=False)
    return df
| # 3. Sample Sanctions List | |
def create_sanctions_sample():
    """Write a fixed ten-entry sanctions list to CSV and return it.

    Each entry has a name, the list it appears on (OFAC, EU or UN) and a
    date_added value; dates start at 2020-01-01 and step by 30 days.

    Returns:
        pandas.DataFrame: The ten rows, also saved as
        ``sample_sanctions.csv`` in the current working directory.
    """
    # (name, list_type) pairs, kept side by side so the pairing is visible.
    listed_parties = [
        ('John Doe', 'OFAC'),
        ('Jane Smith', 'EU'),
        ('Vladimir Putin', 'UN'),
        ('Kim Jong Un', 'UN'),
        ('Alexander Petrov', 'OFAC'),
        ('Maria Gonzalez', 'EU'),
        ('Ahmed Hassan', 'OFAC'),
        ('Natasha Volkov', 'EU'),
        ('Carlos Rodriguez', 'OFAC'),
        ('Wei Zhang', 'UN'),
    ]
    party_names = [party for party, _ in listed_parties]
    source_lists = [source for _, source in listed_parties]
    listing_dates = pd.date_range(start='2020-01-01', periods=10, freq='30D')

    sanctions = pd.DataFrame({
        'name': party_names,
        'list_type': source_lists,
        'date_added': listing_dates,
    })
    sanctions.to_csv('sample_sanctions.csv', index=False)
    return sanctions
| # 4. Sample Customer List for Sanctions Screening | |
def create_customer_sample():
    """Write 100 synthetic customers for sanctions screening to CSV.

    Names and countries are drawn at random (global NumPy RNG seeded with
    42) from small fixed pools; dates of birth are spaced evenly from
    1950-01-01 to 2000-12-31.

    Returns:
        pandas.DataFrame: The generated rows, also saved as
        ``sample_customers.csv`` in the current working directory.
    """
    np.random.seed(42)

    name_pool = [
        'John Doe', 'Alice Johnson', 'Bob Wilson', 'Jane Smith',
        'Michael Brown', 'Sarah Davis', 'Vladimir Putin', 'Emily Chen',
        'David Miller', 'Lisa Anderson', 'Ahmed Hassan', 'Maria Lopez'
    ]
    country_pool = ['USA', 'UK', 'Canada', 'Germany', 'France']

    ids = [f'CUST_{n:04d}' for n in range(1, 101)]
    # Names are drawn before countries so the seeded sequence is unchanged.
    picked_names = np.random.choice(name_pool, 100)
    birth_dates = pd.date_range(start='1950-01-01', end='2000-12-31', periods=100)
    picked_countries = np.random.choice(country_pool, 100)

    customers = pd.DataFrame({
        'customer_id': ids,
        'name': picked_names,
        'dob': birth_dates.strftime('%Y-%m-%d'),
        'country': picked_countries,
    })
    customers.to_csv('sample_customers.csv', index=False)
    return customers
| # 5. Sample Credit Risk Data | |
def create_credit_sample():
    """Write 200 synthetic credit-risk profiles to CSV and return them.

    Every column is drawn from the global NumPy RNG (seeded with 42).
    Afterwards 30 distinct rows are overwritten to look high-risk: credit
    score in [300, 600), utilization in [0.8, 1.0) and debt-to-income in
    [0.4, 0.8).

    Returns:
        pandas.DataFrame: The generated rows, also saved as
        ``sample_credit.csv`` in the current working directory.
    """
    np.random.seed(42)

    # The draw order below is significant: it fixes the seeded output.
    scores = np.random.normal(700, 100, 200).astype(int)
    utilization = np.random.beta(2, 5, 200)
    dti = np.random.beta(2, 3, 200)
    incomes = np.random.lognormal(10.5, 0.5, 200).astype(int)
    defaults = np.random.poisson(0.1, 200)
    history_years = np.random.randint(1, 30, 200)

    # Create some high-risk profiles.
    risky_rows = np.random.choice(200, 30, replace=False)
    scores[risky_rows] = np.random.randint(300, 600, 30)
    utilization[risky_rows] = np.random.uniform(0.8, 1.0, 30)
    dti[risky_rows] = np.random.uniform(0.4, 0.8, 30)

    profiles = pd.DataFrame({
        'customer_id': [f'CUST_{n:04d}' for n in range(1, 201)],
        'credit_score': scores,
        'utilization_rate': utilization,
        'debt_to_income': dti,
        'income': incomes,
        'recent_defaults': defaults,
        'credit_history_length': history_years,
    })
    profiles.to_csv('sample_credit.csv', index=False)
    return profiles
| # Create all sample files | |
if __name__ == "__main__":
    # Run every generator in turn, then list the files they wrote.
    print("Creating sample CSV files...")
    for build_sample in (
        create_transaction_sample,
        create_kyc_sample,
        create_sanctions_sample,
        create_customer_sample,
        create_credit_sample,
    ):
        build_sample()
    print("Sample files created successfully!")
    print("\nFiles created:")
    for summary_line in (
        "- sample_transactions.csv",
        "- sample_kyc.csv",
        "- sample_sanctions.csv",
        "- sample_customers.csv",
        "- sample_credit.csv",
    ):
        print(summary_line)