| |
|
| | """
|
| | Data processing für Fraud Detection:
|
| | - Datenbereinigung
|
| | - Encoding
|
| | - Skalierung
|
| | - SMOTE-Ausgleich
|
| | """
|
| |
|
| | import pandas as pd
|
| | import joblib
|
| | from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
|
| | from imblearn.over_sampling import SMOTE
|
| |
|
| |
|
| |
|
| |
|
def load_and_clean_data(file_path):
    """Load a transactions CSV and remove unusable rows and unused columns.

    Rows containing any missing value are discarded, then only rows whose
    ``amt`` is strictly greater than zero are kept. Finally a fixed set of
    columns is removed; columns from that set that are absent from the file
    are simply skipped.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        The filtered DataFrame. The row index is NOT reset, so it still
        carries the row positions from the CSV.

    Raises:
        KeyError: If the file has no ``amt`` column.
    """
    unused_columns = [
        'trans_date_trans_time', 'cc_num', 'first', 'last', 'street',
        'city', 'merchant', 'merch_lat', 'merch_long',
    ]

    frame = pd.read_csv(file_path).dropna()
    frame = frame.loc[frame['amt'] > 0]

    # errors='ignore' skips labels that are not present in the frame.
    return frame.drop(columns=unused_columns, errors='ignore')
|
| |
|
| |
|
| |
|
| |
|
def encode_features(df):
    """Encode the categorical columns of ``df`` as numbers.

    ``gender``, ``category`` and ``state`` are one-hot encoded with a
    ``OneHotEncoder`` fitted on ``df``; the fitted encoder is written to
    ``encoder.pkl`` (relative path). The three source columns are removed
    and the one-hot columns are appended on the right. If a ``job`` column
    exists it is replaced by the integer codes of a freshly fitted
    ``LabelEncoder``.

    Args:
        df: DataFrame containing at least ``gender``, ``category`` and
            ``state``.

    Returns:
        A new DataFrame with a fresh 0..n-1 row index. The input frame is
        not modified.

    Raises:
        KeyError: If one of the three one-hot columns is missing.
    """
    one_hot_columns = ['gender', 'category', 'state']

    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    dummies = pd.DataFrame(
        encoder.fit_transform(df[one_hot_columns]),
        columns=encoder.get_feature_names_out(one_hot_columns),
    )
    # Persist the fitted encoder so the same mapping can be reloaded later.
    joblib.dump(encoder, "encoder.pkl")

    # Both sides get a fresh positional index so they line up row by row.
    remaining = df.drop(columns=one_hot_columns).reset_index(drop=True)
    df = pd.concat([remaining, dummies.reset_index(drop=True)], axis=1)

    # NOTE(review): unlike the one-hot encoder, this LabelEncoder is not
    # saved anywhere, so its job -> code mapping cannot be reloaded later.
    if 'job' in df.columns:
        df['job'] = LabelEncoder().fit_transform(df['job'])
    return df
|
| |
|
| |
|
| |
|
| |
|
def balance_data(df, random_state=42):
    """Oversample the data with SMOTE using ``is_fraud`` as the target.

    The int64/float64 feature columns are resampled together with the
    target. Feature columns of any other dtype cannot be passed through
    SMOTE here; they are carried over by row position, so every row whose
    position lies beyond the original row count has missing values in
    those columns.

    Args:
        df: DataFrame with an ``is_fraud`` column.
        random_state: Seed passed to ``SMOTE``. Defaults to 42.

    Returns:
        DataFrame whose columns are, in order: the non-int64/float64
        feature columns, the resampled int64/float64 feature columns, and
        ``is_fraud``.

    Raises:
        KeyError: If ``df`` has no ``is_fraud`` column.
    """
    y = df['is_fraud']

    # Split the target off BEFORE filtering by dtype. Filtering first made
    # the later drop raise KeyError whenever is_fraud was not int64/float64
    # (e.g. bool), and would also have treated it as a non-numeric feature.
    features = df.drop(columns='is_fraud')
    X = features.select_dtypes(include=['int64', 'float64'])
    non_numeric = features.select_dtypes(exclude=['int64', 'float64'])

    X_res, y_res = SMOTE(random_state=random_state).fit_resample(X, y)

    # NOTE(review): aligning non_numeric by position assumes the resampler
    # returns the original rows first and appends new ones after them --
    # confirm against the imbalanced-learn documentation.
    df_resampled = pd.concat(
        [
            non_numeric.reset_index(drop=True),
            pd.DataFrame(X_res, columns=X.columns),
            pd.DataFrame(y_res, columns=['is_fraud']),
        ],
        axis=1,
    )
    return df_resampled
|
| |
|
| |
|
| |
|
| |
|
def scale_features(df):
    """Standardize the int64/float64 feature columns of ``df``.

    A ``StandardScaler`` is fitted on every int64/float64 column except
    ``is_fraud`` and the fitted scaler is written to ``scaler.pkl``
    (relative path).

    Args:
        df: DataFrame with an ``is_fraud`` column.

    Returns:
        Tuple ``(X_scaled, scaler)``: the result of
        ``StandardScaler.fit_transform`` on the selected columns, and the
        fitted scaler itself.

    Raises:
        KeyError: If ``df`` has no ``is_fraud`` column.
    """
    # Drop the target BEFORE filtering by dtype. Filtering first made the
    # drop raise KeyError whenever is_fraud was not int64/float64.
    features = df.drop(columns='is_fraud')
    X = features.select_dtypes(include=['int64', 'float64'])

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Persist the fitted scaler so the same scaling can be reloaded later.
    joblib.dump(scaler, "scaler.pkl")
    return X_scaled, scaler
|
| |
|
| |
|
| |
|
| |
|
if __name__ == "__main__":
    # Script entry point: clean -> encode -> balance -> fit scaler -> export.
    source_csv = "fraudTest.csv"
    balanced = balance_data(encode_features(load_and_clean_data(source_csv)))

    # Called for its side effect of writing scaler.pkl; the scaled matrix
    # and the scaler it returns are not used here.
    scale_features(balanced)

    # The exported CSV holds the balanced frame, not the scaled matrix.
    balanced.to_csv("fraudTest_cleaned.csv", index=False)
    print("Datenaufbereitung abgeschlossen!")
|
| |
|