Spaces:

Zayeemk
/

Credit-Card-Anomaly

Sleeping

App Files Files Community

Credit-Card-Anomaly / utils /transform_kaggle.py

Zayeemk

Rename transform_kaggle.py to utils/transform_kaggle.py

a89b384 verified about 2 months ago

raw

history blame contribute delete

3.05 kB

from datetime import datetime, timedelta
from typing import Optional

import numpy as np
import pandas as pd


	def transform_kaggle_dataset(input_path: str, output_path: str, sample_size: int = None):
	"""
	Transform Kaggle Credit Card Fraud Detection dataset to our system format.

	Original format: Time, V1-V28, Amount, Class
	Target format: Transaction ID, User ID, Amount, Timestamp, Merchant Category, Location
	"""

	# Load the dataset
	df = pd.read_csv(input_path)

	print(f"Loaded {len(df)} transactions from {input_path}")

	# If sample_size is specified, take a sample
	if sample_size and sample_size < len(df):
	df = df.sample(n=sample_size, random_state=42)
	print(f"Sampled {sample_size} transactions")

	# Generate Transaction IDs
	df['Transaction ID'] = [f'TX{i:06d}' for i in range(len(df))]

	# Generate User IDs (simulate multiple users)
	# We'll assign users based on transaction patterns to create realistic behavior
	num_users = 10
	df['User ID'] = [f'USER{i:03d}' for i in np.random.randint(1, num_users + 1, len(df))]

	# Convert Time to Timestamp (Time is in seconds from first transaction)
	# Assume first transaction is at 2024-01-01 00:00:00
	start_time = datetime(2024, 1, 1, 0, 0, 0)
	df['Timestamp'] = df['Time'].apply(lambda x: start_time + timedelta(seconds=x))

	# Generate Merchant Categories based on amount patterns
	def get_merchant_category(amount):
	if amount < 10:
	return 'Coffee Shop'
	elif amount < 30:
	return 'Fast Food'
	elif amount < 50:
	return 'Grocery'
	elif amount < 100:
	return 'Restaurant'
	elif amount < 200:
	return 'Department Store'
	elif amount < 500:
	return 'Electronics'
	elif amount < 1000:
	return 'Jewelry'
	else:
	return 'Luxury Goods'

	df['Merchant Category'] = df['Amount'].apply(get_merchant_category)

	# Generate Locations
	locations = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix',
	'Miami', 'Seattle', 'Boston', 'Denver', 'Dallas']
	df['Location'] = np.random.choice(locations, len(df))

	# Select and reorder columns
	columns = ['Transaction ID', 'User ID', 'Amount', 'Timestamp', 'Merchant Category', 'Location']
	df_output = df[columns].copy()

	# Save transformed dataset
	df_output.to_csv(output_path, index=False)
	print(f"Saved transformed dataset to {output_path}")
	print(f"Output shape: {df_output.shape}")
	print(f"\nSample data:")
	print(df_output.head())

	return df_output


	if __name__ == '__main__':
	# Transform the full dataset or a sample
	input_path = 'data/creditcard_original.csv'
	output_path = 'data/sample.csv'

	# Use a sample of 5000 transactions for better training
	# Change to None to use the full dataset (284,807 transactions)
	transform_kaggle_dataset(input_path, output_path, sample_size=5000)