anas83alrajeh committed on
Commit
22d73cc
·
verified ·
1 Parent(s): 3596721

Upload data_processing.py

Browse files
Files changed (1) hide show
  1. data_processing.py +85 -0
data_processing.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Data processing für Fraud Detection:
4
+ - Datenbereinigung
5
+ - Encoding
6
+ - Skalierung
7
+ - SMOTE-Ausgleich
8
+ """
9
+
10
+ import pandas as pd
11
+ import joblib
12
+ from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
13
+ from imblearn.over_sampling import SMOTE
14
+
15
+ # ------------------------------
16
+ # Daten laden und bereinigen
17
+ # ------------------------------
18
def load_and_clean_data(file_path):
    """Load the transaction CSV and return a cleaned DataFrame.

    Processing steps:

    1. Read ``file_path`` with ``pandas.read_csv``.
    2. Drop rows that have a missing value in any column that will be kept.
    3. Keep only rows whose ``amt`` is strictly positive.
    4. Remove the identifier / high-cardinality columns listed in
       ``drop_cols`` (only those actually present in the file).

    The missing-value filter deliberately ignores the columns removed in
    step 4, so a gap in a column that is thrown away anyway does not cost
    an otherwise usable row.

    Args:
        file_path: Path or buffer accepted by ``pandas.read_csv``.

    Returns:
        The filtered DataFrame. Surviving rows keep their original index
        labels (the index is not reset).

    Raises:
        KeyError: If the data has no ``amt`` column.
    """
    # Columns considered irrelevant or too high-dimensional for modelling.
    drop_cols = ['trans_date_trans_time', 'cc_num', 'first', 'last', 'street',
                 'city', 'merchant', 'merch_lat', 'merch_long']

    df = pd.read_csv(file_path)

    discarded = [col for col in drop_cols if col in df.columns]
    retained = [col for col in df.columns if col not in discarded]

    # Only columns that survive the cleanup may veto a row.
    df = df.dropna(subset=retained)
    df = df[df['amt'] > 0]

    return df.drop(columns=discarded)
28
+
29
+ # ------------------------------
30
+ # Features encodieren
31
+ # ------------------------------
32
def encode_features(df):
    """Encode the categorical columns of ``df`` and return a new DataFrame.

    ``gender``, ``category`` and ``state`` are one-hot encoded; the fitted
    encoder is written to ``encoder.pkl`` (relative path). If a ``job``
    column is present it is replaced by integer labels.

    Args:
        df: DataFrame containing at least ``gender``, ``category`` and
            ``state``.

    Returns:
        A DataFrame with a fresh 0..n-1 index in which the three one-hot
        source columns are replaced by their indicator columns (appended at
        the end) and ``job``, if present, is label encoded.
    """
    # Low-cardinality categoricals: one indicator column per category.
    onehot_cols = ['gender', 'category', 'state']
    onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    indicator_values = onehot.fit_transform(df[onehot_cols])
    indicators = pd.DataFrame(
        indicator_values,
        columns=onehot.get_feature_names_out(onehot_cols),
    )

    # Both frames get a positional index so the column-wise concat lines up
    # row by row.
    remaining = df.drop(columns=onehot_cols).reset_index(drop=True)
    result = pd.concat([remaining, indicators.reset_index(drop=True)], axis=1)
    joblib.dump(onehot, "encoder.pkl")

    # High-cardinality categorical: integer labels instead of one-hot.
    # NOTE(review): unlike the one-hot encoder, this fitted LabelEncoder is
    # not persisted anywhere.
    if 'job' in result.columns:
        result['job'] = LabelEncoder().fit_transform(result['job'])

    return result
46
+
47
+ # ------------------------------
48
+ # SMOTE ausgleichen
49
+ # ------------------------------
50
def balance_data(df):
    """Oversample the ``int64``/``float64`` features with SMOTE.

    The feature matrix is every ``int64``/``float64`` column except
    ``is_fraud``; ``is_fraud`` is the target. Columns of any other dtype
    are not passed to SMOTE and are re-attached positionally afterwards.

    NOTE(review): the non-numeric columns keep their original row count, so
    if resampling adds rows those extra rows have no counterpart in them
    and the column-wise concat leaves them as NaN. Confirm this is intended.

    Args:
        df: DataFrame with an ``is_fraud`` column of dtype ``int64`` or
            ``float64`` (otherwise removing it from the numeric selection
            raises ``KeyError``).

    Returns:
        A DataFrame laid out as: non-numeric columns, resampled numeric
        features, then ``is_fraud``.
    """
    numeric_dtypes = ['int64', 'float64']

    features = df.select_dtypes(include=numeric_dtypes).drop(columns='is_fraud')
    target = df['is_fraud']

    # Fixed seed keeps the synthetic samples reproducible between runs.
    sampler = SMOTE(random_state=42)
    features_res, target_res = sampler.fit_resample(features, target)

    # Everything SMOTE could not handle, re-indexed by position.
    leftovers = df.select_dtypes(exclude=numeric_dtypes).reset_index(drop=True)

    parts = [
        leftovers,
        pd.DataFrame(features_res, columns=features.columns),
        pd.DataFrame(target_res, columns=['is_fraud']),
    ]
    return pd.concat(parts, axis=1)
63
+
64
+ # ------------------------------
65
+ # Skalierung numerischer Features
66
+ # ------------------------------
67
def scale_features(df):
    """Standardize the numeric feature columns of ``df``.

    Selects every ``int64``/``float64`` column except ``is_fraud``, fits a
    ``StandardScaler`` on it, and writes the fitted scaler to
    ``scaler.pkl`` (relative path).

    Args:
        df: DataFrame with an ``is_fraud`` column of dtype ``int64`` or
            ``float64``.

    Returns:
        A ``(scaled_features, scaler)`` tuple: the transformed feature
        matrix and the fitted ``StandardScaler``.
    """
    numeric_part = df.select_dtypes(include=['int64', 'float64'])
    features = numeric_part.drop(columns='is_fraud')

    fitted_scaler = StandardScaler()
    scaled = fitted_scaler.fit_transform(features)

    # Persist the scaler so the same transformation can be re-applied later.
    joblib.dump(fitted_scaler, "scaler.pkl")
    return scaled, fitted_scaler
73
+
74
+ # ------------------------------
75
+ # Hauptprogramm zum Testen
76
+ # ------------------------------
77
if __name__ == "__main__":
    # Smoke-test driver: clean -> encode -> balance -> scale on one CSV.
    source_csv = "fraudTest.csv"
    cleaned = load_and_clean_data(source_csv)
    encoded = encode_features(cleaned)
    balanced = balance_data(encoded)

    # The scaled matrix is not written out here; the call still fits the
    # scaler and saves it to disk inside scale_features.
    X_scaled, scaler = scale_features(balanced)

    # The exported CSV holds the balanced, *unscaled* data.
    balanced.to_csv("fraudTest_cleaned.csv", index=False)
    print("Datenaufbereitung abgeschlossen!")