File size: 3,091 Bytes
22d73cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# -*- coding: utf-8 -*-
"""

Data processing für Fraud Detection:

- Datenbereinigung

- Encoding

- Skalierung

- SMOTE-Ausgleich

"""

import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

# ------------------------------
# Daten laden und bereinigen
# ------------------------------
def load_and_clean_data(file_path):
    """Load the transaction CSV and apply basic cleaning.

    Drops rows containing missing values, keeps only strictly positive
    transaction amounts and removes identifying / high-cardinality columns
    that are not useful as model features.

    Parameters
    ----------
    file_path : str
        Path to the raw transactions CSV (must contain an 'amt' column).

    Returns
    -------
    pd.DataFrame
        The cleaned frame.
    """
    frame = pd.read_csv(file_path).dropna()
    frame = frame.loc[frame['amt'] > 0]

    # Identifying / high-cardinality columns that would not generalize.
    unwanted = {'trans_date_trans_time', 'cc_num', 'first', 'last', 'street',
                'city', 'merchant', 'merch_lat', 'merch_long'}
    # Only drop the ones actually present so partial exports still load.
    return frame.drop(columns=sorted(unwanted.intersection(frame.columns)))

# ------------------------------
# Features encodieren
# ------------------------------
def encode_features(df, encoder_path="encoder.pkl"):
    """One-hot encode the small categorical features and label-encode 'job'.

    Parameters
    ----------
    df : pd.DataFrame
        Cleaned transaction data; must contain the columns
        'gender', 'category' and 'state'.
    encoder_path : str, optional
        File path where the fitted OneHotEncoder is persisted
        (default "encoder.pkl", matching the previous hard-coded value).

    Returns
    -------
    pd.DataFrame
        Frame with the one-hot columns appended and, when present,
        'job' replaced by integer labels.
    """
    # Small categorical features: one-hot (unknown categories at inference
    # time are ignored rather than raising).
    small_cats = ['gender', 'category', 'state']
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_data = encoder.fit_transform(df[small_cats])
    encoded_df = pd.DataFrame(encoded_data,
                              columns=encoder.get_feature_names_out(small_cats))
    df = df.drop(small_cats, axis=1)
    # Indices may be non-contiguous after cleaning; reset both sides so the
    # horizontal concat aligns row-for-row instead of by old index labels.
    df = pd.concat([df.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)
    joblib.dump(encoder, encoder_path)

    # High-cardinality categorical feature: label encoding instead of one-hot.
    # NOTE(review): the fitted LabelEncoder is not persisted, so the job→int
    # mapping cannot be reproduced at inference time — consider saving it too.
    if 'job' in df.columns:
        df['job'] = LabelEncoder().fit_transform(df['job'])
    return df

# ------------------------------
# SMOTE ausgleichen
# ------------------------------
def balance_data(df):
    """Oversample the minority class with SMOTE.

    Only numeric columns are used as SMOTE inputs; 'is_fraud' is the target.
    Selecting with the generic 'number' dtype (instead of the previous
    explicit ['int64', 'float64'] list) also covers int32/float32 columns
    produced by other loaders.

    Parameters
    ----------
    df : pd.DataFrame
        Encoded data containing a numeric 'is_fraud' target column.

    Returns
    -------
    pd.DataFrame
        Resampled frame: non-numeric columns first, then the resampled
        features, then 'is_fraud'.
        NOTE(review): synthetic rows created by SMOTE have no source row,
        so their non-numeric cells come out as NaN from this concat —
        verify downstream consumers only use the numeric columns.
    """
    # Only numeric columns for SMOTE; the target is excluded from X.
    X = df.select_dtypes(include='number').drop('is_fraud', axis=1)
    y = df['is_fraud']
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X, y)

    # Re-attach the non-numeric columns (aligned for the original rows only).
    non_numeric = df.select_dtypes(exclude='number')
    df_resampled = pd.concat([non_numeric.reset_index(drop=True),
                              pd.DataFrame(X_res, columns=X.columns),
                              pd.DataFrame(y_res, columns=['is_fraud'])], axis=1)
    return df_resampled

# ------------------------------
# Skalierung numerischer Features
# ------------------------------
def scale_features(df, scaler_path="scaler.pkl"):
    """Standard-scale the numeric feature columns (excluding the target).

    Parameters
    ----------
    df : pd.DataFrame
        Frame with numeric features and a numeric 'is_fraud' target column.
    scaler_path : str, optional
        File path where the fitted StandardScaler is persisted
        (default "scaler.pkl", matching the previous hard-coded value).

    Returns
    -------
    tuple[np.ndarray, StandardScaler]
        The scaled feature matrix (column names are lost) and the fitted
        scaler for reuse at inference time.
    """
    # 'number' covers int32/float32 etc., not just int64/float64.
    X = df.select_dtypes(include='number').drop('is_fraud', axis=1)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    joblib.dump(scaler, scaler_path)
    return X_scaled, scaler

# ------------------------------
# Hauptprogramm zum Testen
# ------------------------------
if __name__ == "__main__":
    input_file = "fraudTest.csv"
    df = load_and_clean_data(input_file)
    df_encoded = encode_features(df)
    df_balanced = balance_data(df_encoded)
    X_scaled, scaler = scale_features(df_balanced)

    df_balanced.to_csv("fraudTest_cleaned.csv", index=False)
    print("Datenaufbereitung abgeschlossen!")