Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| from datetime import datetime, timedelta | |
# Exclusive upper bounds on ``Amount`` paired with the synthetic merchant
# category assigned to amounts below that bound. Checked in order.
_MERCHANT_TIERS = (
    (10, 'Coffee Shop'),
    (30, 'Fast Food'),
    (50, 'Grocery'),
    (100, 'Restaurant'),
    (200, 'Department Store'),
    (500, 'Electronics'),
    (1000, 'Jewelry'),
)
# Category used when the amount is not below any tier bound.
_TOP_MERCHANT_CATEGORY = 'Luxury Goods'


def _merchant_category(amount):
    """Return the synthetic merchant category for a transaction amount."""
    for upper_bound, category in _MERCHANT_TIERS:
        if amount < upper_bound:
            return category
    # Reached for amounts >= 1000 and for NaN (every comparison is False).
    return _TOP_MERCHANT_CATEGORY


def transform_kaggle_dataset(input_path: str, output_path: str,
                             sample_size: int = None, seed: int = 42):
    """
    Transform Kaggle Credit Card Fraud Detection dataset to our system format.

    Original format: Time, V1-V28, Amount, Class
    Target format: Transaction ID, User ID, Amount, Timestamp, Merchant Category, Location

    Only the ``Time`` and ``Amount`` input columns are read. ``Transaction ID``
    is a positional counter, ``Timestamp`` is ``Time`` seconds after
    2024-01-01 00:00:00, ``Merchant Category`` is derived from ``Amount``, and
    ``User ID`` / ``Location`` are randomly generated placeholders.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path of the CSV file to write (without the index).
        sample_size: If truthy and smaller than the number of rows, that many
            rows are randomly sampled before transforming. ``None`` (or 0)
            keeps every row.
        seed: Seed used for row sampling and for the generated ``User ID`` and
            ``Location`` columns, so repeated runs on the same input give the
            same output. Pass ``None`` for non-reproducible output.

    Returns:
        The transformed DataFrame that was written to ``output_path``.

    Raises:
        KeyError: If the input has no ``Time`` or ``Amount`` column.
    """
    # Load the dataset
    df = pd.read_csv(input_path)
    print(f"Loaded {len(df)} transactions from {input_path}")

    # If sample_size is specified, take a sample
    if sample_size and sample_size < len(df):
        df = df.sample(n=sample_size, random_state=seed)
        print(f"Sampled {sample_size} transactions")

    row_count = len(df)
    # Local generator: reproducible, and leaves numpy's global RNG state alone.
    rng = np.random.default_rng(seed)

    # Generate Transaction IDs (positional, independent of the sampled index)
    df['Transaction ID'] = [f'TX{i:06d}' for i in range(row_count)]

    # Generate User IDs (simulate multiple users): USER001 .. USER010
    num_users = 10
    user_numbers = rng.integers(1, num_users + 1, size=row_count)
    df['User ID'] = [f'USER{i:03d}' for i in user_numbers]

    # Convert Time to Timestamp (Time is in seconds from first transaction)
    # Assume first transaction is at 2024-01-01 00:00:00
    start_time = datetime(2024, 1, 1, 0, 0, 0)
    df['Timestamp'] = df['Time'].apply(
        lambda seconds: start_time + timedelta(seconds=seconds)
    )

    # Generate Merchant Categories based on amount patterns
    df['Merchant Category'] = df['Amount'].apply(_merchant_category)

    # Generate Locations
    locations = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix',
                 'Miami', 'Seattle', 'Boston', 'Denver', 'Dallas']
    df['Location'] = rng.choice(locations, size=row_count)

    # Select and reorder columns
    columns = ['Transaction ID', 'User ID', 'Amount', 'Timestamp',
               'Merchant Category', 'Location']
    df_output = df[columns].copy()

    # Save transformed dataset
    df_output.to_csv(output_path, index=False)
    print(f"Saved transformed dataset to {output_path}")
    print(f"Output shape: {df_output.shape}")
    print("\nSample data:")
    print(df_output.head())

    return df_output
if __name__ == '__main__':
    # Script entry point: read the original Kaggle CSV and write the
    # transformed file next to it.
    # A 5000-transaction sample is used for better training; pass
    # sample_size=None to transform the full dataset (284,807 transactions).
    transform_kaggle_dataset(
        'data/creditcard_original.csv',
        'data/sample.csv',
        sample_size=5000,
    )