Spaces:

soupstick
/

fraud-detector-app

Sleeping

App Files Files Community

fraud-detector-app / create_samples.py

soupstick

Add sample data generator and CSV templates for fraud detection modules

6a2d0c6 6 months ago

raw

history blame contribute delete

5.05 kB

	# Create sample CSV files for testing the fraud detection modules
	import pandas as pd
	import numpy as np
	from datetime import datetime, timedelta

	# 1. Sample Transaction Data
	def create_transaction_sample():
	    """Generate 1000 synthetic transactions, save them as CSV and return them.

	    The global NumPy RNG is seeded with 42, so repeated calls produce the
	    same data. Timestamps are hourly, starting at 2024-01-01. Two kinds of
	    anomalies are injected into the ``amount`` column:

	    * 50 distinct rows get a large amount drawn from [15000, 50000);
	    * 10 distinct rows get a negative amount between -1000 and -100.

	    The two row selections are drawn independently, so a row picked for a
	    large amount may afterwards be overwritten with a negative one.

	    Returns:
	        pandas.DataFrame: the generated rows, which are also written to
	        ``sample_transactions.csv`` in the current working directory.
	    """
	    np.random.seed(42)
	    data = {
	        'transaction_id': [f'TXN_{i:06d}' for i in range(1, 1001)],
	        'customer_id': [f'CUST_{i:04d}' for i in np.random.randint(1, 501, 1000)],
	        'amount': np.random.exponential(100, 1000),
	        'merchant_category': np.random.choice(
	            ['GROCERY', 'GAS', 'RESTAURANT', 'ONLINE', 'HIGH_RISK', 'ATM'], 1000
	        ),
	        # Lowercase 'h' is the hourly alias; the uppercase 'H' spelling is
	        # deprecated in pandas 2.2+ and emits a FutureWarning there.
	        'timestamp': pd.date_range(start='2024-01-01', periods=1000, freq='h'),
	        'currency': 'USD'
	    }

	    # Add some fraudulent patterns: unusually large amounts.
	    fraud_indices = np.random.choice(1000, 50, replace=False)
	    data['amount'][fraud_indices] = np.random.uniform(15000, 50000, 50)

	    # Add negative amounts (data-entry style errors).
	    error_indices = np.random.choice(1000, 10, replace=False)
	    data['amount'][error_indices] = -np.random.uniform(100, 1000, 10)

	    df = pd.DataFrame(data)
	    df.to_csv('sample_transactions.csv', index=False)
	    return df

	# 2. Sample KYC Data
	def create_kyc_sample():
	    """Generate 100 synthetic KYC records, save them as CSV and return them.

	    The global NumPy RNG is seeded with 42 before the names are drawn, so
	    repeated calls produce the same data. Known data-quality problems are
	    planted at fixed row positions (0-based):

	    * rows 5-9 share the email ``duplicate@email.com``;
	    * rows 15-17 share the phone number ``+15551234567``;
	    * row 20 has a future date of birth, row 21 an unparseable one and
	      row 22 a very old one.

	    Returns:
	        pandas.DataFrame: the generated rows, which are also written to
	        ``sample_kyc.csv`` in the current working directory.
	    """
	    np.random.seed(42)
	    names = ['John Smith', 'Jane Doe', 'Michael Johnson', 'Sarah Wilson', 'David Brown',
	             'JOHN SMITH', 'Test123', 'A B', 'Maria Garcia', 'James Miller']

	    # The date strings are converted to a plain list because the Index
	    # returned by strftime() is immutable and cannot be patched by position.
	    dobs = pd.date_range(
	        start='1950-01-01', end='2005-12-31', periods=100
	    ).strftime('%Y-%m-%d').tolist()

	    data = {
	        'customer_id': [f'CUST_{i:04d}' for i in range(1, 101)],
	        'name': np.random.choice(names, 100),
	        'email': [f'user{i}@email.com' for i in range(1, 101)],
	        'phone': [f'+1555{i:07d}' for i in range(1, 101)],
	        'dob': dobs,
	        'address': [f'{i} Main St, City, State' for i in range(1, 101)]
	    }

	    # Add duplicates. The replacement must be a list of the same length as
	    # the slice: assigning a bare string would splice in its individual
	    # characters and change the column length.
	    data['email'][5:10] = ['duplicate@email.com'] * 5
	    data['phone'][15:18] = ['+15551234567'] * 3

	    # Add invalid DOBs
	    data['dob'][20] = '2030-01-01'    # Future date
	    data['dob'][21] = 'invalid-date'  # Invalid format
	    data['dob'][22] = '1900-01-01'    # Very old

	    df = pd.DataFrame(data)
	    df.to_csv('sample_kyc.csv', index=False)
	    return df

	# 3. Sample Sanctions List
	def create_sanctions_sample():
	    """Write a fixed ten-entry sanctions list to CSV and return it.

	    Each entry pairs a name with the list it appears on (OFAC, EU or UN).
	    The ``date_added`` values start at 2020-01-01 and advance in 30-day
	    steps, one per entry.

	    Returns:
	        pandas.DataFrame: columns ``name``, ``list_type`` and
	        ``date_added``; the same rows are saved to
	        ``sample_sanctions.csv`` in the current working directory.
	    """
	    # (name, list_type) pairs, kept together so each row reads as a unit.
	    listed_entries = [
	        ('John Doe', 'OFAC'),
	        ('Jane Smith', 'EU'),
	        ('Vladimir Putin', 'UN'),
	        ('Kim Jong Un', 'UN'),
	        ('Alexander Petrov', 'OFAC'),
	        ('Maria Gonzalez', 'EU'),
	        ('Ahmed Hassan', 'OFAC'),
	        ('Natasha Volkov', 'EU'),
	        ('Carlos Rodriguez', 'OFAC'),
	        ('Wei Zhang', 'UN'),
	    ]
	    listed_names = [entry[0] for entry in listed_entries]
	    source_lists = [entry[1] for entry in listed_entries]
	    added_on = pd.date_range(start='2020-01-01', periods=10, freq='30D')

	    frame = pd.DataFrame({
	        'name': listed_names,
	        'list_type': source_lists,
	        'date_added': added_on,
	    })
	    frame.to_csv('sample_sanctions.csv', index=False)
	    return frame

	# 4. Sample Customer List for Sanctions Screening
	def create_customer_sample():
	    """Generate 100 customers for sanctions screening, save and return them.

	    The global NumPy RNG is seeded with 42, so repeated calls produce the
	    same data. Names are drawn from a fixed pool and countries from a
	    fixed five-country pool. Dates of birth are evenly spaced between
	    1950-01-01 and 2000-12-31 and stored as ``YYYY-MM-DD`` strings.

	    Returns:
	        pandas.DataFrame: columns ``customer_id``, ``name``, ``dob`` and
	        ``country``; the same rows are saved to ``sample_customers.csv``
	        in the current working directory.
	    """
	    np.random.seed(42)
	    row_count = 100

	    name_pool = [
	        'John Doe', 'Alice Johnson', 'Bob Wilson', 'Jane Smith',
	        'Michael Brown', 'Sarah Davis', 'Vladimir Putin', 'Emily Chen',
	        'David Miller', 'Lisa Anderson', 'Ahmed Hassan', 'Maria Lopez'
	    ]
	    country_pool = ['USA', 'UK', 'Canada', 'Germany', 'France']

	    # Draw order matters for reproducibility: names first, then countries.
	    picked_names = np.random.choice(name_pool, row_count)
	    picked_countries = np.random.choice(country_pool, row_count)

	    birth_dates = pd.date_range(start='1950-01-01', end='2000-12-31', periods=row_count)

	    frame = pd.DataFrame({
	        'customer_id': [f'CUST_{seq:04d}' for seq in range(1, row_count + 1)],
	        'name': picked_names,
	        'dob': birth_dates.strftime('%Y-%m-%d'),
	        'country': picked_countries,
	    })
	    frame.to_csv('sample_customers.csv', index=False)
	    return frame

	# 5. Sample Credit Risk Data
	def create_credit_sample():
	    """Generate 200 synthetic credit-risk profiles, save and return them.

	    The global NumPy RNG is seeded with 42, so repeated calls produce the
	    same data. After the baseline columns are drawn, 30 distinct rows are
	    turned into high-risk profiles: credit score redrawn from [300, 600),
	    utilization rate from [0.8, 1.0) and debt-to-income from [0.4, 0.8).

	    Returns:
	        pandas.DataFrame: one row per customer; the same rows are saved to
	        ``sample_credit.csv`` in the current working directory.
	    """
	    np.random.seed(42)
	    profile_count = 200
	    risky_count = 30

	    # Baseline draws. The sequence of RNG calls is kept fixed so the
	    # seeded output stays reproducible.
	    scores = np.random.normal(700, 100, profile_count).astype(int)
	    utilization = np.random.beta(2, 5, profile_count)
	    debt_ratio = np.random.beta(2, 3, profile_count)
	    incomes = np.random.lognormal(10.5, 0.5, profile_count).astype(int)
	    default_counts = np.random.poisson(0.1, profile_count)
	    history_length = np.random.randint(1, 30, profile_count)

	    # Create some high-risk profiles by overwriting a random subset of rows.
	    risky_rows = np.random.choice(profile_count, risky_count, replace=False)
	    scores[risky_rows] = np.random.randint(300, 600, risky_count)
	    utilization[risky_rows] = np.random.uniform(0.8, 1.0, risky_count)
	    debt_ratio[risky_rows] = np.random.uniform(0.4, 0.8, risky_count)

	    frame = pd.DataFrame({
	        'customer_id': [f'CUST_{seq:04d}' for seq in range(1, profile_count + 1)],
	        'credit_score': scores,
	        'utilization_rate': utilization,
	        'debt_to_income': debt_ratio,
	        'income': incomes,
	        'recent_defaults': default_counts,
	        'credit_history_length': history_length,
	    })
	    frame.to_csv('sample_credit.csv', index=False)
	    return frame

	# Create all sample files
	if __name__ == "__main__":
	    # Script entry point: run every generator once, then list the files
	    # that the generators write into the current working directory.
	    print("Creating sample CSV files...")
	    sample_builders = (
	        create_transaction_sample,
	        create_kyc_sample,
	        create_sanctions_sample,
	        create_customer_sample,
	        create_credit_sample,
	    )
	    for build_sample in sample_builders:
	        build_sample()
	    print("Sample files created successfully!")
	    print("\nFiles created:")
	    summary_lines = (
	        "- sample_transactions.csv",
	        "- sample_kyc.csv",
	        "- sample_sanctions.csv",
	        "- sample_customers.csv",
	        "- sample_credit.csv",
	    )
	    for summary_line in summary_lines:
	        print(summary_line)