"""Convert the Kaggle Credit Card Fraud Detection CSV to our transaction format."""

from datetime import datetime, timedelta
from typing import Optional

import numpy as np
import pandas as pd

# Columns of the input CSV that the transformation actually reads.
_REQUIRED_COLUMNS = ('Time', 'Amount')

# The 'Time' column is an offset in seconds from the first transaction;
# the first transaction is assumed to happen at this instant.
_START_TIME = datetime(2024, 1, 1, 0, 0, 0)

# Number of distinct simulated users (USER001 .. USER010).
_NUM_USERS = 10

# Exclusive upper bound of each amount bucket, in ascending order.  Amounts at
# or above the last bound fall into the final entry of _MERCHANT_CATEGORIES.
_AMOUNT_UPPER_BOUNDS = (10.0, 30.0, 50.0, 100.0, 200.0, 500.0, 1000.0)
_MERCHANT_CATEGORIES = (
    'Coffee Shop',
    'Fast Food',
    'Grocery',
    'Restaurant',
    'Department Store',
    'Electronics',
    'Jewelry',
    'Luxury Goods',
)

_LOCATIONS = [
    'New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix',
    'Miami', 'Seattle', 'Boston', 'Denver', 'Dallas',
]

_OUTPUT_COLUMNS = [
    'Transaction ID', 'User ID', 'Amount', 'Timestamp',
    'Merchant Category', 'Location',
]


def _merchant_categories(amounts: pd.Series) -> np.ndarray:
    """Return the merchant category label for every value in *amounts*."""
    bounds = np.asarray(_AMOUNT_UPPER_BOUNDS, dtype=float)
    labels = np.asarray(_MERCHANT_CATEGORIES, dtype=object)
    # side='right' sends a value equal to a bound into the next bucket, which
    # is what a chain of strict `amount < bound` tests does.  NaN sorts after
    # every bound, so it lands in the last category just as that chain's
    # final `else` would.
    buckets = np.searchsorted(bounds, amounts.to_numpy(dtype=float), side='right')
    return labels[buckets]


def transform_kaggle_dataset(
    input_path: str,
    output_path: str,
    sample_size: Optional[int] = None,
    *,
    random_state: Optional[int] = 42,
) -> pd.DataFrame:
    """
    Transform Kaggle Credit Card Fraud Detection dataset to our system format.

    Original format: Time, V1-V28, Amount, Class
    Target format: Transaction ID, User ID, Amount, Timestamp, Merchant Category, Location

    Only the 'Time' and 'Amount' input columns are read; 'User ID' and
    'Location' are synthetic random values, and 'Merchant Category' is derived
    from the amount bucket.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path the transformed CSV is written to (without index).
        sample_size: If truthy and smaller than the number of rows, a random
            sample of this many rows is transformed instead of the full file.
        random_state: Seed for the row sample and for the synthetic
            'User ID' / 'Location' columns, so repeated runs over the same
            input write the same output.  Pass None for unseeded draws.

    Returns:
        The transformed DataFrame that was written to *output_path*.

    Raises:
        KeyError: If the input file lacks the 'Time' or 'Amount' column.
    """
    # Load the dataset
    df = pd.read_csv(input_path)
    print(f"Loaded {len(df)} transactions from {input_path}")

    # Fail early with a readable message instead of a bare KeyError mid-way.
    missing = [name for name in _REQUIRED_COLUMNS if name not in df.columns]
    if missing:
        raise KeyError(f"Input file is missing required column(s): {missing}")

    # If sample_size is specified, take a sample
    if sample_size and sample_size < len(df):
        df = df.sample(n=sample_size, random_state=random_state)
        print(f"Sampled {sample_size} transactions")

    row_count = len(df)
    # A local generator keeps the synthetic columns reproducible without
    # touching (or depending on) NumPy's global random state.
    rng = np.random.default_rng(random_state)

    # Generate Transaction IDs (sequential in the current row order)
    df['Transaction ID'] = [f'TX{i:06d}' for i in range(row_count)]

    # Generate User IDs (simulate multiple users)
    user_numbers = rng.integers(1, _NUM_USERS + 1, size=row_count)
    df['User ID'] = [f'USER{i:03d}' for i in user_numbers]

    # Convert Time (seconds from first transaction) to an absolute Timestamp
    df['Timestamp'] = pd.Timestamp(_START_TIME) + pd.to_timedelta(df['Time'], unit='s')

    # Generate Merchant Categories based on amount patterns
    df['Merchant Category'] = _merchant_categories(df['Amount'])

    # Generate Locations
    df['Location'] = rng.choice(_LOCATIONS, size=row_count)

    # Select and reorder columns
    df_output = df[_OUTPUT_COLUMNS].copy()

    # Save transformed dataset
    df_output.to_csv(output_path, index=False)
    print(f"Saved transformed dataset to {output_path}")
    print(f"Output shape: {df_output.shape}")
    print("\nSample data:")
    print(df_output.head())

    return df_output


def main() -> None:
    """Transform the bundled Kaggle CSV into the sample dataset used for training."""
    input_path = 'data/creditcard_original.csv'
    output_path = 'data/sample.csv'

    # Use a sample of 5000 transactions for better training
    # Change to None to use the full dataset (284,807 transactions)
    transform_kaggle_dataset(input_path, output_path, sample_size=5000)


if __name__ == '__main__':
    main()