from datetime import datetime, timedelta
from typing import Optional

import numpy as np
import pandas as pd


# Exclusive upper bounds of the amount tiers. Bound i pairs with label i below;
# amounts at or above the last bound receive the final label.
_AMOUNT_TIER_BOUNDS = (10, 30, 50, 100, 200, 500, 1000)
_AMOUNT_TIER_LABELS = (
    'Coffee Shop',
    'Fast Food',
    'Grocery',
    'Restaurant',
    'Department Store',
    'Electronics',
    'Jewelry',
    'Luxury Goods',
)

# Pool of city names the synthetic Location column is drawn from.
_LOCATIONS = (
    'New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix',
    'Miami', 'Seattle', 'Boston', 'Denver', 'Dallas',
)

# Columns that must be present in the input file.
_REQUIRED_COLUMNS = ('Time', 'Amount')


def _merchant_categories(amounts):
    """Return one synthetic merchant-category label per transaction amount."""
    # np.digitize counts how many bounds are <= each amount, which is exactly
    # the index of the tier the amount belongs to (e.g. 10 -> 1 -> 'Fast Food').
    tier_index = np.digitize(amounts.to_numpy(), _AMOUNT_TIER_BOUNDS)
    return np.asarray(_AMOUNT_TIER_LABELS, dtype=object)[tier_index]


def transform_kaggle_dataset(
    input_path: str,
    output_path: str,
    sample_size: Optional[int] = None,
    *,
    random_state: Optional[int] = 42,
    num_users: int = 10,
) -> pd.DataFrame:
    """
    Transform Kaggle Credit Card Fraud Detection dataset to our system format.

    Original format: Time, V1-V28, Amount, Class
    Target format: Transaction ID, User ID, Amount, Timestamp, Merchant Category, Location

    Only the ``Time`` and ``Amount`` input columns are used. ``User ID`` and
    ``Location`` are drawn at random, and ``Merchant Category`` is derived
    from the amount tier, so those three columns are synthetic.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path the transformed CSV is written to (without index).
        sample_size: Number of rows to sample at random. ``None`` or ``0``
            keeps every row, as does a value not smaller than the row count.
            Sampled rows keep the order returned by ``DataFrame.sample``, so
            the output is not sorted by timestamp.
        random_state: Seed shared by the row sampling and the synthetic
            User ID / Location columns, so repeated runs on the same input
            produce the same file. Pass ``None`` for non-deterministic output.
        num_users: Number of distinct simulated users (``USER001`` ...).

    Returns:
        The transformed DataFrame that was written to ``output_path``.

    Raises:
        KeyError: If the input file lacks the ``Time`` or ``Amount`` column.
        ValueError: If ``num_users`` is smaller than 1.
    """
    if num_users < 1:
        raise ValueError(f"num_users must be at least 1, got {num_users}")

    # Load the dataset
    df = pd.read_csv(input_path)

    print(f"Loaded {len(df)} transactions from {input_path}")

    # Fail early with a readable message instead of a bare KeyError later on.
    missing = [name for name in _REQUIRED_COLUMNS if name not in df.columns]
    if missing:
        raise KeyError(
            f"Input file {input_path} is missing required column(s): {missing}"
        )

    # If sample_size is specified, take a sample
    if sample_size and sample_size < len(df):
        df = df.sample(n=sample_size, random_state=random_state)
        print(f"Sampled {sample_size} transactions")

    row_count = len(df)

    # A dedicated seeded generator keeps the synthetic columns reproducible
    # and leaves the global numpy random state untouched.
    rng = np.random.default_rng(random_state)

    # Generate Transaction IDs (sequential in output row order)
    df['Transaction ID'] = [f'TX{i:06d}' for i in range(row_count)]

    # Generate User IDs (simulate multiple users); the upper bound is exclusive
    user_numbers = rng.integers(1, num_users + 1, size=row_count)
    df['User ID'] = [f'USER{number:03d}' for number in user_numbers]

    # Convert Time to Timestamp (Time is in seconds from first transaction)
    # Assume first transaction is at 2024-01-01 00:00:00
    start_time = pd.Timestamp(datetime(2024, 1, 1, 0, 0, 0))
    df['Timestamp'] = pd.to_timedelta(df['Time'], unit='s') + start_time

    # Generate Merchant Categories based on amount patterns
    df['Merchant Category'] = _merchant_categories(df['Amount'])

    # Generate Locations
    df['Location'] = rng.choice(_LOCATIONS, size=row_count)

    # Select and reorder columns
    columns = ['Transaction ID', 'User ID', 'Amount', 'Timestamp', 'Merchant Category', 'Location']
    df_output = df[columns].copy()

    # Save transformed dataset
    df_output.to_csv(output_path, index=False)
    print(f"Saved transformed dataset to {output_path}")
    print(f"Output shape: {df_output.shape}")
    print("\nSample data:")
    print(df_output.head())

    return df_output


if __name__ == '__main__':
    # Script entry point: read the original Kaggle CSV and write the
    # transformed file next to it.
    source_csv = 'data/creditcard_original.csv'
    target_csv = 'data/sample.csv'

    # A 5000-transaction sample is written by default.
    # Pass sample_size=None to transform the full dataset
    # (284,807 transactions) instead.
    transform_kaggle_dataset(
        source_csv,
        target_csv,
        sample_size=5000,
    )