File size: 5,089 Bytes
c5ec08c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn import __version__ as sklearn_version
from packaging import version

class DataProcessor:
    """Prepare tabular fraud-detection data: loading, preprocessing, and feature engineering.

    Attributes:
        scaler: StandardScaler instance (kept for callers that use it directly).
        encoder: dense OneHotEncoder built version-compatibly via _make_encoder().
    """

    def __init__(self):
        self.scaler = StandardScaler()
        # The OneHotEncoder kwarg name changed across sklearn versions;
        # the version check lives in one place (_make_encoder).
        self.encoder = self._make_encoder()

    @staticmethod
    def _make_encoder():
        """Return a dense OneHotEncoder compatible with the installed scikit-learn.

        scikit-learn 1.2.0 renamed the ``sparse`` keyword to ``sparse_output``;
        using the wrong name raises TypeError, so we branch on the version.
        """
        if version.parse(sklearn_version) >= version.parse('1.2.0'):
            return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        return OneHotEncoder(sparse=False, handle_unknown='ignore')

    def load_data(self, file_path):
        """Load a dataset from a CSV file.

        Args:
            file_path: path to a CSV file readable by pandas.

        Returns:
            A DataFrame, or None if reading failed (best-effort loader:
            the error is printed rather than raised).
        """
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading data: {e}")
            return None

    def preprocess_data(self, df, target_col='Class'):
        """Split, scale, encode, and rebalance the data for model training.

        Args:
            df: input DataFrame containing features and the target column.
            target_col: name of the target column (default 'Class').

        Returns:
            Tuple of (X_train_resampled, X_test_processed,
            y_train_resampled, y_test, preprocessor) where the train split
            has been SMOTE-oversampled and the fitted ColumnTransformer is
            returned for reuse on new data.
        """
        # Impute missing values with column means. numeric_only=True is
        # required: under pandas >= 2.0, DataFrame.mean() raises TypeError
        # when object/categorical columns are present. Non-numeric NaNs are
        # left as-is (same behavior the original had on numeric frames).
        df = df.fillna(df.mean(numeric_only=True))

        # Separate features and target.
        X = df.drop(columns=[target_col])
        y = df[target_col]

        # Stratify so train/test keep the (typically heavily imbalanced)
        # class ratio; fixed random_state for reproducibility.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        num_features = X.select_dtypes(include=['int64', 'float64']).columns
        cat_features = X.select_dtypes(include=['object', 'category']).columns

        # Build the transformer list incrementally instead of duplicating
        # the whole ColumnTransformer per sklearn version: the version
        # difference is isolated inside _make_encoder().
        transformers = [('num', StandardScaler(), num_features)]
        if len(cat_features) > 0:
            transformers.append(('cat', self._make_encoder(), cat_features))
        preprocessor = ColumnTransformer(transformers=transformers)

        # Fit on train only; the test set is transformed with the
        # train-fitted statistics to avoid leakage.
        X_train_processed = preprocessor.fit_transform(X_train)
        X_test_processed = preprocessor.transform(X_test)

        # Oversample only the training split — the test split must keep
        # the true class distribution for honest evaluation.
        smote = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

        return X_train_resampled, X_test_processed, y_train_resampled, y_test, preprocessor

    def engineer_features(self, df):
        """Create new features for fraud detection.

        Args:
            df: input DataFrame; feature creation is conditional on which
                of 'Time', 'Amount', and 'card_id' columns are present.

        Returns:
            A new DataFrame (the input is not modified) with any of:
            Hour, Odd_Hour, Log_Amount, High_Value, Tx_Count, Avg_Amount,
            Amount_Deviation columns added.
        """
        # Work on a copy so the caller's frame is never mutated.
        df_new = df.copy()

        if 'Time' in df_new.columns:
            # Time is assumed to be seconds from a reference point;
            # fold it into an hour-of-day in [0, 24).
            df_new['Hour'] = (df_new['Time'] / 3600) % 24
            # Flag transactions during odd hours (midnight to 5 AM).
            df_new['Odd_Hour'] = ((df_new['Hour'] >= 0) & (df_new['Hour'] < 5)).astype(int)

        if 'Amount' in df_new.columns:
            # log1p handles Amount == 0 and compresses the skewed tail.
            df_new['Log_Amount'] = np.log1p(df_new['Amount'])
            # Flag high-value transactions (top 5% by amount).
            threshold = df_new['Amount'].quantile(0.95)
            df_new['High_Value'] = (df_new['Amount'] > threshold).astype(int)

        # Per-card frequency features (assumes card_id identifies an account).
        if 'card_id' in df_new.columns:
            tx_count = df_new.groupby('card_id').size().reset_index(name='Tx_Count')
            df_new = df_new.merge(tx_count, on='card_id', how='left')

            # Guard: the original indexed 'Amount' unconditionally here and
            # raised KeyError on frames with card_id but no Amount column.
            if 'Amount' in df_new.columns:
                avg_amount = df_new.groupby('card_id')['Amount'].mean().reset_index(name='Avg_Amount')
                df_new = df_new.merge(avg_amount, on='card_id', how='left')
                # Deviation of each transaction from the card's average.
                df_new['Amount_Deviation'] = df_new['Amount'] - df_new['Avg_Amount']

        return df_new