Spaces:

AJAYKASU
/

AML_Shield

Running

AML_Shield / generate_sample_data.py

AJAY KASU

Initial commit AML Shield

7d391cb 23 days ago

3.93 kB

	import pandas as pd
	import numpy as np
	import datetime
	import random

	def generate_sample_data(filepath="sample_transactions.csv"):
	    """Generate a synthetic transaction CSV seeded with suspicious patterns.

	    Builds 1000 transactions spread over 50 customer ids (``CUST_0001`` ..
	    ``CUST_0050``) with timestamps drawn from the 30 days preceding the
	    current time, then overwrites a handful of rows with patterns:

	    1. 10 rows with amounts between 9000 and 9999 ("structuring").
	    2. 8 ``CASH_OUT`` rows from ``US`` to a non-US country with amounts
	       above 50000.
	    3. 6 rows with ``account_age_days`` below 30 and amounts above 10000.
	    4. 5 rows with exact round amounts (10000, 50000 or 100000).
	    5. For each of 5 randomly chosen customers (when the customer has more
	       than 5 rows), five transactions re-timed to 1..5 minutes after that
	       customer's earliest transaction.

	    Patterns 1-4 are applied to disjoint rows. The result is sorted by
	    timestamp, written to ``filepath`` without the index column, and a
	    confirmation line is printed.

	    Both ``random`` and ``numpy.random`` are re-seeded with 42 (a global side
	    effect), so the random draws repeat between runs; the timestamps
	    themselves are anchored at ``datetime.datetime.now()`` and therefore
	    shift with the time of the call.

	    Args:
	        filepath: Destination CSV path. Missing parent directories are
	            created before writing.
	    """
	    # Local import: only needed to prepare the output directory.
	    import os

	    np.random.seed(42)
	    random.seed(42)

	    num_rows = 1000
	    customer_ids = [f"CUST_{str(i).zfill(4)}" for i in range(1, 51)]

	    end_date = datetime.datetime.now()
	    start_date = end_date - datetime.timedelta(days=30)

	    # The window length is loop-invariant, so compute it once.
	    span_seconds = int((end_date - start_date).total_seconds())
	    timestamps = sorted(
	        start_date + datetime.timedelta(seconds=random.randint(0, span_seconds))
	        for _ in range(num_rows)
	    )

	    # Fixed mix of transaction types; the counts add up to num_rows.
	    tx_types = (
	        ['TRANSFER'] * 400
	        + ['CASH_OUT'] * 250
	        + ['PAYMENT'] * 200
	        + ['DEBIT'] * 100
	        + ['CASH_IN'] * 50
	    )
	    random.shuffle(tx_types)

	    countries = ['US', 'GB', 'CN', 'NG', 'RU', 'DE', 'BR', 'MX']
	    foreign_countries = [c for c in countries if c != 'US']

	    # 5 customers with rapid sequential transactions (drawn here to keep the
	    # seeded sequence of random calls stable).
	    rapid_customers = random.sample(customer_ids, 5)

	    data = []
	    for row_number, (timestamp, tx_type) in enumerate(zip(timestamps, tx_types), start=1):
	        cid = random.choice(customer_ids)
	        amount = round(random.uniform(100, 5000), 2)

	        # Roughly 5% of rows are outliers of up to 500K.
	        if random.random() < 0.05:
	            amount = round(random.uniform(5000, 500000), 2)

	        # About 70% of origins/destinations are 'US'; the rest are drawn from
	        # the full country list (which itself still contains 'US').
	        orig = 'US' if random.random() < 0.7 else random.choice(countries)
	        dest = 'US' if random.random() < 0.7 else random.choice(countries)
	        age = random.randint(7, 3650)

	        data.append({
	            'transaction_id': f"TXN_{str(row_number).zfill(6)}",
	            'customer_id': cid,
	            'amount': amount,
	            'timestamp': timestamp,
	            'transaction_type': tx_type,
	            'origin_country': orig,
	            'dest_country': dest,
	            'account_age_days': age,
	        })

	    df = pd.DataFrame(data)

	    # Inject suspicious patterns. Rows already claimed by one pattern are
	    # tracked in a set so later patterns never overwrite them and each
	    # membership test is O(1).
	    claimed = set()

	    def pick_unclaimed(count):
	        """Sample ``count`` row indices not used by an earlier pattern."""
	        candidates = [i for i in range(num_rows) if i not in claimed]
	        chosen = random.sample(candidates, count)
	        claimed.update(chosen)
	        return chosen

	    # 1. 10 structuring transactions ($9000-$9999)
	    for idx in pick_unclaimed(10):
	        df.at[idx, 'amount'] = round(random.uniform(9000, 9999), 2)

	    # 2. 8 large international cash outs > $50K
	    for idx in pick_unclaimed(8):
	        df.at[idx, 'transaction_type'] = 'CASH_OUT'
	        df.at[idx, 'amount'] = round(random.uniform(50001, 150000), 2)
	        df.at[idx, 'origin_country'] = 'US'
	        df.at[idx, 'dest_country'] = random.choice(foreign_countries)

	    # 3. 6 dormant account spikes (age < 30 days, amount > $10K)
	    for idx in pick_unclaimed(6):
	        df.at[idx, 'account_age_days'] = random.randint(1, 29)
	        df.at[idx, 'amount'] = round(random.uniform(10001, 50000), 2)

	    # 4. 5 exact round amounts
	    round_amounts = [10000, 50000, 100000, 10000, 50000]
	    for idx, round_amount in zip(pick_unclaimed(5), round_amounts):
	        df.at[idx, 'amount'] = float(round_amount)

	    # 5. Rapid sequential transactions for the 5 customers: pull five of the
	    #    customer's transactions to within minutes of their earliest one.
	    for rc in rapid_customers:
	        rc_indices = df[df['customer_id'] == rc].index.tolist()
	        if len(rc_indices) > 5:
	            base_time = df.at[rc_indices[0], 'timestamp']
	            for j in range(1, 6):
	                df.at[rc_indices[j], 'timestamp'] = base_time + datetime.timedelta(minutes=j)

	    df = df.sort_values(by='timestamp').reset_index(drop=True)

	    # to_csv does not create directories; make sure the target folder exists.
	    output_dir = os.path.dirname(filepath)
	    if output_dir:
	        os.makedirs(output_dir, exist_ok=True)

	    df.to_csv(filepath, index=False)
	    print(f"Generated {filepath} successfully with {len(df)} rows.")

	if __name__ == "__main__":
	    # Script entry point. The output path used to be an absolute path on one
	    # developer's machine; default to sample_data/sample_transactions.csv next
	    # to this script so it runs anywhere. An explicit destination may be given
	    # as the first command-line argument.
	    import os
	    import sys

	    if len(sys.argv) > 1:
	        output_path = sys.argv[1]
	    else:
	        output_path = os.path.join(
	            os.path.dirname(os.path.abspath(__file__)),
	            "sample_data",
	            "sample_transactions.csv",
	        )

	    # Ensure the destination folder exists before the CSV is written.
	    os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
	    generate_sample_data(output_path)