# -*- coding: utf-8 -*-
"""Data processing for fraud detection.

Pipeline steps:
- data cleaning
- categorical encoding (one-hot for small cardinality, label for large)
- SMOTE minority-class oversampling
- standard scaling of numeric features
"""
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE


# ------------------------------
# Load and clean data
# ------------------------------
def load_and_clean_data(file_path):
    """Read the CSV at *file_path* and return a cleaned DataFrame.

    Drops rows with missing values, keeps only transactions with a
    positive amount, and removes identifier / high-cardinality columns
    that are not useful as model features.
    """
    df = pd.read_csv(file_path)
    df = df.dropna()
    df = df[df['amt'] > 0]

    # Identifier-like or high-dimensional columns: not relevant as features.
    drop_cols = ['trans_date_trans_time', 'cc_num', 'first', 'last',
                 'street', 'city', 'merchant', 'merch_lat', 'merch_long']
    df = df.drop(columns=[col for col in drop_cols if col in df.columns])
    return df


# ------------------------------
# Encode features
# ------------------------------
def encode_features(df):
    """One-hot encode small categorical columns, label-encode large ones.

    The fitted OneHotEncoder is persisted to ``encoder.pkl`` so the same
    encoding can be applied at inference time. Returns the encoded frame.
    """
    # Small categorical features: one-hot. Guard against missing columns
    # so a differently-shaped input file does not raise a KeyError.
    small_cats = [c for c in ('gender', 'category', 'state') if c in df.columns]
    if small_cats:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded_data = encoder.fit_transform(df[small_cats])
        encoded_df = pd.DataFrame(
            encoded_data, columns=encoder.get_feature_names_out(small_cats))

        df = df.drop(small_cats, axis=1)
        df = pd.concat([df.reset_index(drop=True),
                        encoded_df.reset_index(drop=True)], axis=1)

        joblib.dump(encoder, "encoder.pkl")

    # Large categorical features: label encoding keeps dimensionality low.
    if 'job' in df.columns:
        df['job'] = LabelEncoder().fit_transform(df['job'])

    return df


# ------------------------------
# Balance classes with SMOTE
# ------------------------------
def balance_data(df):
    """Oversample the minority class with SMOTE and return the result.

    SMOTE can only synthesize numeric features, so the returned frame
    contains the resampled numeric columns plus the ``is_fraud`` target.
    Non-numeric columns are intentionally dropped: the previous
    implementation concatenated the original-length non-numeric frame
    with the longer resampled frame, which filled every synthetic row
    with NaN in those columns.
    """
    # 'number' also covers int32/float32, which ['int64', 'float64'] missed.
    X = df.select_dtypes(include='number').drop('is_fraud', axis=1)
    y = df['is_fraud']

    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X, y)

    df_resampled = pd.concat(
        [pd.DataFrame(X_res, columns=X.columns).reset_index(drop=True),
         pd.Series(y_res, name='is_fraud').reset_index(drop=True)],
        axis=1)
    return df_resampled


# ------------------------------
# Scale numeric features
# ------------------------------
def scale_features(df):
    """Standardize numeric features (excluding the target).

    Persists the fitted scaler to ``scaler.pkl``. Returns the scaled
    feature matrix and the fitted ``StandardScaler``.
    """
    X = df.select_dtypes(include='number').drop('is_fraud', axis=1)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    joblib.dump(scaler, "scaler.pkl")
    return X_scaled, scaler


# ------------------------------
# Script entry point for testing
# ------------------------------
if __name__ == "__main__":
    input_file = "fraudTest.csv"

    df = load_and_clean_data(input_file)
    df_encoded = encode_features(df)
    df_balanced = balance_data(df_encoded)
    # Scaled matrix is returned separately; the CSV keeps original units.
    X_scaled, scaler = scale_features(df_balanced)

    df_balanced.to_csv("fraudTest_cleaned.csv", index=False)
    print("Datenaufbereitung abgeschlossen!")