Spaces:
Running
Running
| import pandas as pd | |
| import numpy as np | |
| import datetime | |
| import random | |
def generate_sample_data(filepath="sample_transactions.csv", end_date=None):
    """Generate a synthetic transaction dataset and write it to CSV.

    Builds 1000 transactions spread over 50 customers and the 30 days
    preceding ``end_date``, then overwrites selected rows so the data
    contains known suspicious patterns:

    1. 10 "structuring" amounts between 9000 and 9999.
    2. 8 CASH_OUT transactions from the US to another country with
       amounts between 50001 and 150000.
    3. 6 rows with an account age of 1-29 days and amounts between
       10001 and 50000.
    4. 5 exact round amounts (10000, 50000 or 100000).
    5. For 5 randomly chosen customers that have more than five rows,
       five transactions re-timed to one-minute steps after their first.

    Patterns 1-4 use disjoint rows so they never overwrite each other.

    Args:
        filepath: Destination CSV path. Missing parent directories are
            created. Anything that is not a str / os.PathLike (e.g. an
            open buffer) is passed to ``DataFrame.to_csv`` untouched.
        end_date: Upper bound of the 30-day timestamp window. Defaults to
            ``datetime.datetime.now()``. Pass a fixed value to obtain a
            byte-for-byte reproducible file, since the RNGs are already
            seeded.

    Returns:
        The generated ``pandas.DataFrame``, sorted by ``timestamp``.
    """
    import os  # function-scope import keeps this block self-contained

    # Fixed seeds: every run draws the same random sequence.
    np.random.seed(42)
    random.seed(42)

    num_rows = 1000
    customer_ids = [f"CUST_{str(i).zfill(4)}" for i in range(1, 51)]

    if end_date is None:
        end_date = datetime.datetime.now()
    start_date = end_date - datetime.timedelta(days=30)
    window_seconds = int((end_date - start_date).total_seconds())
    timestamps = sorted(
        start_date + datetime.timedelta(seconds=random.randint(0, window_seconds))
        for _ in range(num_rows)
    )

    # The counts below must add up to num_rows (one type per transaction).
    tx_types = (
        ['TRANSFER'] * 400
        + ['CASH_OUT'] * 250
        + ['PAYMENT'] * 200
        + ['DEBIT'] * 100
        + ['CASH_IN'] * 50
    )
    random.shuffle(tx_types)

    countries = ['US', 'GB', 'CN', 'NG', 'RU', 'DE', 'BR', 'MX']

    # 5 customers with rapid sequential transactions (applied further down;
    # sampled here so the random draw order stays stable).
    rapid_customers = random.sample(customer_ids, 5)

    data = []
    for row_number, (timestamp, tx_type) in enumerate(zip(timestamps, tx_types), start=1):
        cid = random.choice(customer_ids)
        amount = round(random.uniform(100, 5000), 2)
        # Roughly 5% outliers, up to 500K.
        if random.random() < 0.05:
            amount = round(random.uniform(5000, 500000), 2)
        orig = 'US' if random.random() < 0.7 else random.choice(countries)
        dest = 'US' if random.random() < 0.7 else random.choice(countries)
        age = random.randint(7, 3650)
        data.append({
            'transaction_id': f"TXN_{str(row_number).zfill(6)}",
            'customer_id': cid,
            'amount': amount,
            'timestamp': timestamp,
            'transaction_type': tx_type,
            'origin_country': orig,
            'dest_country': dest,
            'account_age_days': age,
        })

    df = pd.DataFrame(data)

    # Row labels already claimed by an injected pattern.
    used = set()

    def pick_unused(count):
        # Sample `count` row labels that no earlier pattern has claimed.
        candidates = [i for i in range(num_rows) if i not in used]
        chosen = random.sample(candidates, count)
        used.update(chosen)
        return chosen

    # Inject suspicious patterns
    # 1. 10 structuring transactions ($9000-$9999)
    for idx in pick_unused(10):
        df.at[idx, 'amount'] = round(random.uniform(9000, 9999), 2)

    # 2. 8 large international cash outs > $50K
    foreign_countries = [c for c in countries if c != 'US']
    for idx in pick_unused(8):
        df.at[idx, 'transaction_type'] = 'CASH_OUT'
        df.at[idx, 'amount'] = round(random.uniform(50001, 150000), 2)
        df.at[idx, 'origin_country'] = 'US'
        df.at[idx, 'dest_country'] = random.choice(foreign_countries)

    # 3. 6 dormant account spikes (age < 30 days, amount > $10K)
    for idx in pick_unused(6):
        df.at[idx, 'account_age_days'] = random.randint(1, 29)
        df.at[idx, 'amount'] = round(random.uniform(10001, 50000), 2)

    # 4. 5 exact round amounts
    round_amounts = [10000, 50000, 100000, 10000, 50000]
    for idx, round_amount in zip(pick_unused(5), round_amounts):
        df.at[idx, 'amount'] = float(round_amount)

    # 5. Rapid sequential transactions for the 5 customers: re-time their
    #    2nd..6th rows to one-minute steps after their first row.
    for rc in rapid_customers:
        rc_indices = df.index[df['customer_id'] == rc].tolist()
        if len(rc_indices) > 5:
            base_time = df.at[rc_indices[0], 'timestamp']
            for minutes, idx in enumerate(rc_indices[1:6], start=1):
                df.at[idx, 'timestamp'] = base_time + datetime.timedelta(minutes=minutes)

    # Re-sort because step 5 moved some timestamps out of order.
    df = df.sort_values(by='timestamp').reset_index(drop=True)

    # Create the destination directory so to_csv does not fail on a fresh checkout.
    if isinstance(filepath, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filepath))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(filepath, index=False)
    print(f"Generated {filepath} successfully with {len(df)} rows.")
    return df
if __name__ == "__main__":
    # Script entry point: write the sample dataset to the project's
    # sample_data folder.
    # NOTE(review): this is an absolute, machine-specific path; confirm the
    # intended location before running on another machine.
    output_csv = "/Users/ajaykasu/Downloads/AML Shield/aml-shield/sample_data/sample_transactions.csv"
    generate_sample_data(output_csv)