Upload data_processing.py
Browse files- data_processing.py +85 -0
data_processing.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Data processing für Fraud Detection:
|
| 4 |
+
- Datenbereinigung
|
| 5 |
+
- Encoding
|
| 6 |
+
- Skalierung
|
| 7 |
+
- SMOTE-Ausgleich
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import pandas as pd
|
| 11 |
+
import joblib
|
| 12 |
+
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
|
| 13 |
+
from imblearn.over_sampling import SMOTE
|
| 14 |
+
|
| 15 |
+
# ------------------------------
|
| 16 |
+
# Daten laden und bereinigen
|
| 17 |
+
# ------------------------------
|
| 18 |
+
def load_and_clean_data(file_path):
    """Load the transaction CSV and return a cleaned DataFrame.

    Steps, in order:
      1. Read ``file_path`` with ``pandas.read_csv``.
      2. Remove columns that are not used as features (identifiers and
         high-dimensional free-text columns), skipping any that are absent.
      3. Drop rows that still contain a missing value.
      4. Keep only rows whose ``amt`` is strictly positive.

    The columns are removed *before* ``dropna`` so that a missing value in a
    column that is discarded anyway does not cost an otherwise complete row.

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        The cleaned ``pandas.DataFrame``. The surviving rows keep their
        original index labels (the index is not reset here).

    Raises:
        KeyError: If the CSV has no ``amt`` column.
    """
    # Not relevant / high-dimensional columns that must not reach the model.
    drop_cols = ['trans_date_trans_time', 'cc_num', 'first', 'last', 'street',
                 'city', 'merchant', 'merch_lat', 'merch_long']

    df = pd.read_csv(file_path)
    # errors='ignore' keeps the original tolerance for files lacking some of
    # these columns.
    df = df.drop(columns=drop_cols, errors='ignore')
    df = df.dropna()
    return df[df['amt'] > 0]
|
| 28 |
+
|
| 29 |
+
# ------------------------------
|
| 30 |
+
# Features encodieren
|
| 31 |
+
# ------------------------------
|
| 32 |
+
def encode_features(df, encoder_path="encoder.pkl",
                    job_encoder_path="job_encoder.pkl"):
    """Encode the categorical columns of ``df`` and persist the fitted encoders.

    * ``gender``, ``category`` and ``state`` are one-hot encoded; the original
      columns are replaced by the generated indicator columns, which are
      appended to the right of the frame.
    * ``job`` (if present) is replaced by integer codes from a
      ``LabelEncoder``.

    Both fitted encoders are written with ``joblib.dump`` so the same mapping
    can be re-applied later. Previously only the one-hot encoder was saved and
    the ``job`` mapping was lost once this function returned.

    Args:
        df: Input frame; must contain ``gender``, ``category`` and ``state``.
        encoder_path: Where the fitted ``OneHotEncoder`` is written.
        job_encoder_path: Where the fitted ``LabelEncoder`` for ``job`` is
            written. Pass ``None`` to skip persisting it.

    Returns:
        A new ``pandas.DataFrame`` with a fresh 0..n-1 index. The caller's
        frame is not modified.

    Raises:
        KeyError: If one of the one-hot columns is missing from ``df``.
    """
    # Low-cardinality categorical features: one-hot.
    small_cats = ['gender', 'category', 'state']
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_data = encoder.fit_transform(df[small_cats])
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(small_cats),
    )

    # encoded_df carries a default 0..n-1 index, so the remaining columns are
    # re-indexed the same way to make the column-wise concat line up row by row.
    remaining = df.drop(columns=small_cats).reset_index(drop=True)
    df = pd.concat([remaining, encoded_df], axis=1)
    joblib.dump(encoder, encoder_path)

    # High-cardinality categorical feature: label encoding.
    if 'job' in df.columns:
        job_encoder = LabelEncoder()
        df['job'] = job_encoder.fit_transform(df['job'])
        if job_encoder_path is not None:
            joblib.dump(job_encoder, job_encoder_path)
    return df
|
| 46 |
+
|
| 47 |
+
# ------------------------------
|
| 48 |
+
# SMOTE ausgleichen
|
| 49 |
+
# ------------------------------
|
| 50 |
+
def balance_data(df, random_state=42):
    """Oversample with SMOTE and return the balanced frame.

    Every numeric column except ``is_fraud`` is used as a SMOTE feature. The
    selection uses the generic ``'number'`` dtype family rather than the exact
    dtypes ``int64``/``float64``, so features stored as e.g. ``int32`` or
    ``float32`` are resampled too instead of being treated as non-numeric.

    Columns that are not SMOTE features are re-attached positionally on the
    left of the result. They have exactly as many rows as the input, so every
    additional row produced by resampling holds NaN in those columns.
    NOTE(review): this positional re-attachment assumes ``fit_resample``
    returns the original rows first and in their original order — confirm
    against the imbalanced-learn documentation.

    Args:
        df: Encoded frame containing the target column ``is_fraud``.
        random_state: Seed forwarded to ``SMOTE``.

    Returns:
        ``pandas.DataFrame`` with columns ordered as: non-feature columns,
        feature columns, ``is_fraud``.

    Raises:
        KeyError: If ``df`` has no ``is_fraud`` column.
    """
    y = df['is_fraud']
    # errors='ignore': the target is only among the numeric columns when it
    # is itself numeric (a boolean target, for example, is not).
    X = df.select_dtypes(include='number').drop(columns='is_fraud',
                                                errors='ignore')

    smote = SMOTE(random_state=random_state)
    X_res, y_res = smote.fit_resample(X, y)

    # Everything that was neither a feature nor the target.
    leftover = df.drop(columns=list(X.columns) + ['is_fraud'])
    df_resampled = pd.concat(
        [
            leftover.reset_index(drop=True),
            pd.DataFrame(X_res, columns=X.columns),
            pd.DataFrame(y_res, columns=['is_fraud']),
        ],
        axis=1,
    )
    return df_resampled
|
| 63 |
+
|
| 64 |
+
# ------------------------------
|
| 65 |
+
# Skalierung numerischer Features
|
| 66 |
+
# ------------------------------
|
| 67 |
+
def scale_features(df, scaler_path="scaler.pkl"):
    """Fit a ``StandardScaler`` on the numeric features and persist it.

    All numeric columns of ``df`` except ``is_fraud`` are scaled. The
    selection uses the generic ``'number'`` dtype family rather than the exact
    dtypes ``int64``/``float64``, so narrower numeric dtypes are scaled too.

    Args:
        df: Frame holding the feature columns and, optionally, ``is_fraud``.
        scaler_path: Where the fitted scaler is written with ``joblib.dump``.

    Returns:
        A tuple ``(X_scaled, scaler)``: the output of
        ``scaler.fit_transform`` on the feature columns, and the fitted
        scaler itself.
    """
    # errors='ignore': the target may be absent or stored with a non-numeric
    # dtype, in which case it is not among the selected columns.
    X = df.select_dtypes(include='number').drop(columns='is_fraud',
                                                errors='ignore')
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    joblib.dump(scaler, scaler_path)
    return X_scaled, scaler
|
| 73 |
+
|
| 74 |
+
# ------------------------------
|
| 75 |
+
# Hauptprogramm zum Testen
|
| 76 |
+
# ------------------------------
|
| 77 |
+
if __name__ == "__main__":
    # Script entry point: run the whole preparation pipeline on the test CSV.
    source_csv = "fraudTest.csv"

    # Clean -> encode -> balance, each stage feeding the next.
    cleaned = load_and_clean_data(source_csv)
    balanced = balance_data(encode_features(cleaned))

    # Called for its side effect of fitting and persisting the scaler; the
    # scaled matrix itself is not used any further in this script.
    scaled_matrix, fitted_scaler = scale_features(balanced)

    # The balanced (unscaled) frame is what gets written out.
    balanced.to_csv("fraudTest_cleaned.csv", index=False)
    print("Datenaufbereitung abgeschlossen!")
|