import datetime
import os
import random
import sys

import numpy as np
import pandas as pd


def _sample_unused(num_rows, used, k):
    """Pick ``k`` row indices in ``range(num_rows)`` that are not in ``used``.

    The chosen indices are added to ``used`` so that later injections never
    overwrite a row that already carries an injected pattern.
    """
    # Candidates stay in ascending order so that, for a given RNG state, the
    # draw is the same as sampling from an equivalently filtered list.
    candidates = [i for i in range(num_rows) if i not in used]
    picked = random.sample(candidates, k)
    used.update(picked)
    return picked


def generate_sample_data(filepath="sample_transactions.csv", end_date=None):
    """Generate a synthetic transaction CSV with injected suspicious patterns.

    Builds 1000 transactions for 50 customers spread over the 30 days that
    precede ``end_date``, then overwrites selected rows to plant five kinds of
    patterns: structuring amounts, large international cash-outs, young
    accounts with large amounts, exact round amounts, and rapid sequential
    transactions. The rows are sorted by timestamp and written to ``filepath``.

    Args:
        filepath: Destination of the CSV. When it is a path (``str`` or
            ``os.PathLike``), missing parent directories are created.
        end_date: Naive ``datetime.datetime`` marking the end of the 30-day
            window. Defaults to ``datetime.datetime.now()``; pass a fixed
            value to get fully reproducible output (the RNG seeds are fixed).

    Returns:
        The generated ``pandas.DataFrame`` (same content as the CSV).
    """
    # Fixed seeds: every random draw below is reproducible run to run.
    np.random.seed(42)
    random.seed(42)

    num_rows = 1000
    customer_ids = [f"CUST_{i:04d}" for i in range(1, 51)]

    if end_date is None:
        end_date = datetime.datetime.now()
    start_date = end_date - datetime.timedelta(days=30)
    window_seconds = int((end_date - start_date).total_seconds())
    timestamps = sorted(
        start_date + datetime.timedelta(seconds=random.randint(0, window_seconds))
        for _ in range(num_rows)
    )

    # Fixed mix of transaction types (sums to num_rows), shuffled across rows.
    tx_types = (
        ['TRANSFER'] * 400
        + ['CASH_OUT'] * 250
        + ['PAYMENT'] * 200
        + ['DEBIT'] * 100
        + ['CASH_IN'] * 50
    )
    random.shuffle(tx_types)

    countries = ['US', 'GB', 'CN', 'NG', 'RU', 'DE', 'BR', 'MX']
    foreign_countries = [c for c in countries if c != 'US']

    # 5 customers with rapid sequential transactions
    rapid_customers = random.sample(customer_ids, 5)

    data = []
    for i, (timestamp, tx_type) in enumerate(zip(timestamps, tx_types)):
        cid = random.choice(customer_ids)
        amount = round(random.uniform(100, 5000), 2)
        # Outliers up to 500K
        if random.random() < 0.05:
            amount = round(random.uniform(5000, 500000), 2)

        orig = 'US' if random.random() < 0.7 else random.choice(countries)
        dest = 'US' if random.random() < 0.7 else random.choice(countries)
        age = random.randint(7, 3650)

        data.append({
            'transaction_id': f"TXN_{i + 1:06d}",
            'customer_id': cid,
            'amount': amount,
            'timestamp': timestamp,
            'transaction_type': tx_type,
            'origin_country': orig,
            'dest_country': dest,
            'account_age_days': age,
        })

    df = pd.DataFrame(data)

    # Inject suspicious patterns. Rows touched by patterns 1-4 are tracked in
    # `used` so the patterns never overwrite each other.
    used = set()

    # 1. 10 structuring transactions ($9000-$9999)
    for idx in _sample_unused(num_rows, used, 10):
        df.at[idx, 'amount'] = round(random.uniform(9000, 9999), 2)

    # 2. 8 large international cash outs > $50K
    for idx in _sample_unused(num_rows, used, 8):
        df.at[idx, 'transaction_type'] = 'CASH_OUT'
        df.at[idx, 'amount'] = round(random.uniform(50001, 150000), 2)
        df.at[idx, 'origin_country'] = 'US'
        df.at[idx, 'dest_country'] = random.choice(foreign_countries)

    # 3. 6 dormant account spikes (age < 30 days, amount > $10K)
    for idx in _sample_unused(num_rows, used, 6):
        df.at[idx, 'account_age_days'] = random.randint(1, 29)
        df.at[idx, 'amount'] = round(random.uniform(10001, 50000), 2)

    # 4. 5 exact round amounts
    round_amounts = [10000, 50000, 100000, 10000, 50000]
    for idx, round_amount in zip(_sample_unused(num_rows, used, 5), round_amounts):
        df.at[idx, 'amount'] = float(round_amount)

    # 5. Rapid sequential transactions for the 5 customers: place each
    #    customer's 2nd-6th rows one minute apart right after their first row.
    for rc in rapid_customers:
        rc_indices = df[df['customer_id'] == rc].index.tolist()
        if len(rc_indices) > 5:
            base_time = df.at[rc_indices[0], 'timestamp']
            for offset, row_idx in enumerate(rc_indices[1:6], start=1):
                df.at[row_idx, 'timestamp'] = base_time + datetime.timedelta(minutes=offset)

    df = df.sort_values(by='timestamp').reset_index(drop=True)

    # Create the destination directory when writing to a path; file-like
    # targets accepted by DataFrame.to_csv are passed through untouched.
    if isinstance(filepath, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filepath))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(filepath, index=False)
    print(f"Generated {filepath} successfully with {len(df)} rows.")
    return df


if __name__ == "__main__":
    # NOTE: the default target is a machine-specific absolute path; pass a
    # different destination as the first command-line argument to override it.
    default_path = "/Users/ajaykasu/Downloads/AML Shield/aml-shield/sample_data/sample_transactions.csv"
    generate_sample_data(sys.argv[1] if len(sys.argv) > 1 else default_path)