# -*- coding: utf-8 -*-
"""
Data processing for fraud detection:
- data cleaning
- encoding
- scaling
- SMOTE class balancing
"""
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
# ------------------------------
# Load and clean the data
# ------------------------------
def load_and_clean_data(file_path):
    """Read a CSV file and return a cleaned DataFrame.

    Rows containing any missing value are discarded, then only rows whose
    ``amt`` value is strictly positive are kept. Finally a fixed list of
    columns is removed; names from that list that are absent from the file
    are skipped silently.

    Args:
        file_path: Location of the CSV file, handed straight to
            ``pd.read_csv``.

    Returns:
        The filtered DataFrame. Its index is not reset, so it keeps the row
        labels of the rows that survived the filtering.

    Raises:
        KeyError: If the file has no ``amt`` column.
    """
    # Columns treated as irrelevant or too high-dimensional for modelling.
    unwanted = (
        'trans_date_trans_time', 'cc_num', 'first', 'last', 'street',
        'city', 'merchant', 'merch_lat', 'merch_long',
    )

    frame = pd.read_csv(file_path).dropna()
    frame = frame.loc[frame['amt'] > 0]

    # Only drop what actually exists so differing input schemas still work.
    present = [name for name in unwanted if name in frame.columns]
    return frame.drop(columns=present)
# ------------------------------
# Encode features
# ------------------------------
def encode_features(df):
    """Encode the categorical columns of ``df`` and return a new DataFrame.

    ``gender``, ``category`` and ``state`` are one-hot encoded; the resulting
    indicator columns are appended after the remaining columns and the
    original three columns are removed. The fitted ``OneHotEncoder`` is
    written to ``encoder.pkl`` (a relative path) via ``joblib.dump``.

    If a ``job`` column is present it is replaced by integer labels from a
    freshly fitted ``LabelEncoder``. That encoder is not saved.

    Args:
        df: DataFrame containing at least ``gender``, ``category`` and
            ``state``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. The input frame is not
        modified.
    """
    # Low-cardinality categoricals: one-hot encoding.
    one_hot_cols = ['gender', 'category', 'state']
    one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    indicators = pd.DataFrame(
        one_hot.fit_transform(df[one_hot_cols]),
        columns=one_hot.get_feature_names_out(one_hot_cols),
    )

    # Both sides get a fresh index so the column-wise concat lines up by
    # position rather than by the (possibly gappy) original row labels.
    remainder = df.drop(columns=one_hot_cols).reset_index(drop=True)
    result = pd.concat(
        [remainder, indicators.reset_index(drop=True)],
        axis=1,
    )
    joblib.dump(one_hot, "encoder.pkl")

    # High-cardinality categorical: label encoding.
    if 'job' not in result.columns:
        return result
    result['job'] = LabelEncoder().fit_transform(result['job'])
    return result
# ------------------------------
# Balance classes with SMOTE
# ------------------------------
def balance_data(df):
    """Oversample ``df`` with SMOTE and return the resampled DataFrame.

    Only ``int64`` / ``float64`` columns (minus ``is_fraud``) are passed to
    ``SMOTE(random_state=42).fit_resample`` as features, with ``is_fraud``
    as the target. The result is assembled column-wise in this order:
    the remaining (non ``int64`` / ``float64``) columns of ``df``, the
    resampled feature columns, then the resampled ``is_fraud`` column.

    NOTE(review): the non-numeric columns keep the original number of rows,
    while the resampled parts may be longer. The column-wise concat then
    leaves those columns as NaN in the additional rows -- confirm this is
    acceptable for downstream consumers.

    Args:
        df: DataFrame with an ``is_fraud`` column of dtype ``int64`` or
            ``float64``.

    Returns:
        A new DataFrame built as described above.
    """
    numeric_kinds = ['int64', 'float64']

    # SMOTE only receives the int64/float64 columns.
    features = df.select_dtypes(include=numeric_kinds).drop(columns='is_fraud')
    target = df['is_fraud']
    features_res, target_res = SMOTE(random_state=42).fit_resample(
        features, target
    )

    # Re-attach the columns that were left out of the resampling.
    pieces = [
        df.select_dtypes(exclude=numeric_kinds).reset_index(drop=True),
        pd.DataFrame(features_res, columns=features.columns),
        pd.DataFrame(target_res, columns=['is_fraud']),
    ]
    return pd.concat(pieces, axis=1)
# ------------------------------
# Scale numeric features
# ------------------------------
def scale_features(df):
    """Fit a ``StandardScaler`` on the numeric features of ``df``.

    The features are the ``int64`` / ``float64`` columns of ``df`` without
    ``is_fraud``. The fitted scaler is written to ``scaler.pkl`` (a relative
    path) via ``joblib.dump``.

    Args:
        df: DataFrame with an ``is_fraud`` column of dtype ``int64`` or
            ``float64``.

    Returns:
        A ``(scaled, scaler)`` tuple: the output of ``fit_transform`` on the
        selected features, and the fitted scaler itself.
    """
    numeric = df.select_dtypes(include=['int64', 'float64'])
    features = numeric.drop(columns='is_fraud')

    fitted = StandardScaler()
    scaled = fitted.fit_transform(features)

    # Persist the fitted scaler alongside returning it.
    joblib.dump(fitted, "scaler.pkl")
    return scaled, fitted
# ------------------------------
# Main program for testing
# ------------------------------
if __name__ == "__main__":
    # Run the full pipeline on the test CSV: clean -> encode -> balance.
    cleaned = load_and_clean_data("fraudTest.csv")
    balanced = balance_data(encode_features(cleaned))

    # Fitting the scaler writes scaler.pkl as a side effect. The scaled
    # matrix is not used further here; the CSV below holds unscaled values.
    _scaled, _scaler = scale_features(balanced)

    balanced.to_csv("fraudTest_cleaned.csv", index=False)
    print("Datenaufbereitung abgeschlossen!")
|