Credit-Card-Anomaly / utils /transform_kaggle.py
Zayeemk's picture
Rename transform_kaggle.py to utils/transform_kaggle.py
a89b384 verified
from datetime import datetime, timedelta
from typing import Optional

import numpy as np
import pandas as pd
def transform_kaggle_dataset(
    input_path: str,
    output_path: str,
    sample_size: Optional[int] = None,
    *,
    num_users: int = 10,
    random_state: Optional[int] = 42,
) -> pd.DataFrame:
    """
    Transform Kaggle Credit Card Fraud Detection dataset to our system format.

    Original format: Time, V1-V28, Amount, Class
    Target format: Transaction ID, User ID, Amount, Timestamp, Merchant Category, Location

    Only the ``Time`` and ``Amount`` input columns are read. Transaction IDs
    are sequential, the merchant category is derived from the amount, and
    User ID / Location are synthetic values drawn at random.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path the transformed CSV is written to (no index column).
        sample_size: If truthy and smaller than the number of rows, only this
            many randomly chosen rows are kept; otherwise every row is used.
        num_users: Number of distinct synthetic users (``USER001`` ...).
        random_state: Seed for the row sample and for the synthetic User ID
            and Location columns, so repeated runs on the same input produce
            the same output. Pass ``None`` for non-reproducible draws.

    Returns:
        The transformed DataFrame that was written to ``output_path``.

    Raises:
        KeyError: If the input file has no ``Time`` or ``Amount`` column.
    """
    # Load the dataset
    df = pd.read_csv(input_path)
    print(f"Loaded {len(df)} transactions from {input_path}")

    # Fail before doing any work if the columns we depend on are absent.
    missing = [col for col in ('Time', 'Amount') if col not in df.columns]
    if missing:
        raise KeyError(
            f"{input_path} is missing required column(s): {', '.join(missing)}"
        )

    # One seeded generator drives every random choice below; previously only
    # the sampling step was seeded, so the output file changed on every run.
    rng = np.random.RandomState(random_state)

    # If sample_size is specified, take a sample
    if sample_size and sample_size < len(df):
        df = df.sample(n=sample_size, random_state=rng)
        print(f"Sampled {sample_size} transactions")

    row_count = len(df)

    # Generate Transaction IDs (sequential in the current row order)
    df['Transaction ID'] = [f'TX{i:06d}' for i in range(row_count)]

    # Generate User IDs: each transaction is assigned to one of `num_users`
    # simulated users uniformly at random.
    user_numbers = rng.randint(1, num_users + 1, size=row_count)
    df['User ID'] = [f'USER{i:03d}' for i in user_numbers]

    # Convert Time to Timestamp (Time is in seconds from first transaction)
    # Assume first transaction is at 2024-01-01 00:00:00
    start_time = datetime(2024, 1, 1, 0, 0, 0)
    df['Timestamp'] = pd.Timestamp(start_time) + pd.to_timedelta(df['Time'], unit='s')

    # Generate Merchant Categories based on amount patterns.
    # upper_bounds[k] is the exclusive upper limit of categories[k]; anything
    # at or above the last bound falls into the final category. With
    # side='right' an amount equal to a bound moves to the next bucket, which
    # matches a chain of `amount < bound` checks.
    upper_bounds = [10, 30, 50, 100, 200, 500, 1000]
    categories = np.array(
        [
            'Coffee Shop',
            'Fast Food',
            'Grocery',
            'Restaurant',
            'Department Store',
            'Electronics',
            'Jewelry',
            'Luxury Goods',
        ],
        dtype=object,
    )
    bucket = np.searchsorted(upper_bounds, df['Amount'].to_numpy(), side='right')
    df['Merchant Category'] = categories[bucket]

    # Generate Locations
    locations = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix',
                 'Miami', 'Seattle', 'Boston', 'Denver', 'Dallas']
    df['Location'] = rng.choice(locations, row_count)

    # Select and reorder columns
    columns = ['Transaction ID', 'User ID', 'Amount', 'Timestamp', 'Merchant Category', 'Location']
    df_output = df[columns].copy()

    # Save transformed dataset
    df_output.to_csv(output_path, index=False)
    print(f"Saved transformed dataset to {output_path}")
    print(f"Output shape: {df_output.shape}")
    print("\nSample data:")
    print(df_output.head())

    return df_output
if __name__ == '__main__':
    # Script entry point: convert the original Kaggle CSV into the sample
    # file in the target format.
    input_path, output_path = 'data/creditcard_original.csv', 'data/sample.csv'

    # 5000 rows are sampled here; pass sample_size=None instead to convert
    # the full dataset (284,807 transactions).
    transform_kaggle_dataset(input_path, output_path, sample_size=5000)