File size: 3,406 Bytes
41e2db1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from imblearn.over_sampling import SMOTE
import joblib

# Filesystem locations, relative to the current working directory.
RAW_PATH = os.path.join('data', 'raw', 'creditcard.csv')   # input CSV
PROCESSED_DIR = os.path.join('data', 'processed')          # pickled splits go here
SCALER_PATH = os.path.join('models', 'scaler.pkl')         # fitted scaler goes here

# Seed handed to train_test_split and SMOTE so runs are repeatable.
RANDOM_STATE = 42
# Fraction of rows held out as the test split.
TEST_SIZE = 0.2


def load_data(path: str = RAW_PATH) -> pd.DataFrame:
    """Read the CSV at ``path`` into a DataFrame and print its dimensions.

    Args:
        path: Location of the CSV file; defaults to ``RAW_PATH``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(path)
    n_rows, n_cols = frame.shape
    print(f"Loaded  : {n_rows:,} rows, {n_cols} columns")
    return frame


def scale_features(df: pd.DataFrame, fit: bool = True, scaler=None):
    """
    Scale Amount and Time with RobustScaler (resistant to outliers).
    V1-V28 are already PCA-scaled — leave them untouched.

    Args:
        df: Frame containing at least the ``Amount`` and ``Time`` columns.
            A copy is scaled; the caller's frame is not modified.
        fit: When True, fit a new RobustScaler on ``df``, save it to
            ``SCALER_PATH`` and apply it. When False, apply ``scaler``
            without refitting.
        scaler: Already-fitted scaler used when ``fit`` is False. Ignored
            (replaced by a fresh scaler) when ``fit`` is True.

    Returns:
        Tuple ``(scaled_df, scaler)``.

    Raises:
        ValueError: If ``fit`` is False and no ``scaler`` is supplied.
    """
    # Fail fast with a clear message instead of an AttributeError on None.
    if not fit and scaler is None:
        raise ValueError(
            "scale_features(fit=False) requires an already-fitted `scaler`"
        )

    columns = ['Amount', 'Time']
    df = df.copy()
    if fit:
        scaler = RobustScaler()
        df[columns] = scaler.fit_transform(df[columns])
        # dirname is '' for a bare filename, and os.makedirs('') would raise.
        scaler_dir = os.path.dirname(SCALER_PATH)
        if scaler_dir:
            os.makedirs(scaler_dir, exist_ok=True)
        joblib.dump(scaler, SCALER_PATH)
        print(f"Scaler  : fitted and saved to {SCALER_PATH}")
    else:
        df[columns] = scaler.transform(df[columns])
    return df, scaler


def split_data(df: pd.DataFrame):
    """Separate the ``Class`` target from the features and split train/test.

    The split is stratified on the target, holds out ``TEST_SIZE`` of the
    rows and is seeded with ``RANDOM_STATE``.

    Args:
        df: Frame with a ``Class`` column; every other column is a feature.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    target = df['Class']
    features = df.drop(columns='Class')
    parts = train_test_split(
        features,
        target,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=target,  # keep the class proportions equal in both splits
    )
    X_train, X_test, y_train, y_test = parts
    print(f"Train   : {X_train.shape[0]:,} rows  (fraud: {y_train.sum():,})")
    print(f"Test    : {X_test.shape[0]:,} rows  (fraud: {y_test.sum():,})")
    return X_train, X_test, y_train, y_test


def apply_smote(X_train: pd.DataFrame, y_train: pd.Series):
    """
    Oversample the training split with SMOTE and report class counts.

    Intended for training data only — the test split must never be resampled.
    Counts of label 0 ("normal") and label 1 ("fraud") are printed before and
    after resampling.

    Returns:
        Tuple ``(X_res, y_res)`` as returned by ``SMOTE.fit_resample``.
    """
    normal_before = (y_train == 0).sum()
    fraud_before = (y_train == 1).sum()
    print(f"Before SMOTE — normal: {normal_before:,}  fraud: {fraud_before:,}")

    sampler = SMOTE(random_state=RANDOM_STATE)
    X_res, y_res = sampler.fit_resample(X_train, y_train)

    normal_after = (y_res == 0).sum()
    fraud_after = (y_res == 1).sum()
    print(f"After  SMOTE — normal: {normal_after:,}  fraud: {fraud_after:,}")
    return X_res, y_res


def save_splits(X_train, X_test, y_train, y_test, X_train_sm, y_train_sm):
    """Pickle every split into ``PROCESSED_DIR`` with joblib.

    The directory is created if it does not exist. Files are written in the
    order listed below, one per argument.
    """
    os.makedirs(PROCESSED_DIR, exist_ok=True)
    # (file name, object) pairs; the tuple fixes the write order.
    artifacts = (
        ('X_train_raw.pkl', X_train),
        ('X_test.pkl', X_test),
        ('y_train_raw.pkl', y_train),
        ('y_test.pkl', y_test),
        ('X_train_smote.pkl', X_train_sm),
        ('y_train_smote.pkl', y_train_sm),
    )
    for filename, payload in artifacts:
        joblib.dump(payload, os.path.join(PROCESSED_DIR, filename))
    print(f"Saved   : all splits to {PROCESSED_DIR}/")


def run():
    """Run the full preprocessing pipeline and persist its outputs.

    Steps: load the raw CSV, split into train/test, fit the scaler on the
    training rows only, apply that scaler to the test rows, oversample the
    training split with SMOTE, and save every split to disk.

    The scaler is fitted after the split so that no statistics from the test
    rows leak into the features or into the saved scaler. The split is seeded
    and stratified on the label, so it does not depend on whether the
    features were scaled beforehand.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, X_train_sm, y_train_sm,
        scaler)`` where ``X_train`` and ``X_test`` are already scaled.
    """
    banner = "=" * 45
    print(banner)
    print("  Preprocessing Pipeline")
    print(banner)

    df = load_data()
    X_train, X_test, y_train, y_test = split_data(df)
    # Fit on training rows only, then reuse the fitted scaler for the test rows.
    X_train, scaler = scale_features(X_train, fit=True)
    X_test, _ = scale_features(X_test, fit=False, scaler=scaler)
    X_train_sm, y_train_sm = apply_smote(X_train, y_train)
    save_splits(X_train, X_test, y_train, y_test, X_train_sm, y_train_sm)

    print(banner)
    print("  Done — preprocessing complete")
    print(banner)
    return X_train, X_test, y_train, y_test, X_train_sm, y_train_sm, scaler


# Execute the pipeline only when this file is run as a script, not on import.
if __name__ == "__main__":
    run()