fraud-detection-streamlit / src /data_preprocessing.py
yannthur's picture
Upload folder using huggingface_hub
df17a19
"""Module de prétraitement du dataset de détection de fraude bancaire."""
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Categorical columns; load_and_prepare label-encodes each one that is present.
CATEGORICAL_COLS = [
    "category",
    "gender",
]

# Numeric columns that load_and_prepare passes through as features unchanged.
NUMERIC_COLS = [
    "amt",
    "lat",
    "long",
    "city_pop",
    "merch_lat",
    "merch_long",
]

# Name of the label column used as the prediction target.
TARGET_COL = "is_fraud"
def compute_age(dob_series: pd.Series, reference_date: str = "2019-01-01") -> pd.Series:
    """Compute age in years from a date of birth, relative to a fixed date.

    Args:
        dob_series: Dates of birth; anything ``pd.to_datetime`` can parse.
        reference_date: Date at which the age is measured. Any value accepted
            by ``pd.Timestamp`` works (date string or Timestamp). Defaults to
            ``"2019-01-01"``, the value previously hard-coded here, so
            existing callers get the same results.

    Returns:
        Series of ages in years, rounded to one decimal place.
    """
    birth_dates = pd.to_datetime(dob_series)
    reference = pd.Timestamp(reference_date)
    # Whole elapsed days divided by the mean year length (365.25 accounts for
    # leap years on average).
    elapsed_days = (reference - birth_dates).dt.days
    return (elapsed_days / 365.25).round(1)
def compute_distance(df: pd.DataFrame) -> pd.Series:
    """Return the straight-line gap between client and merchant coordinates.

    The result is the Euclidean norm of the (latitude, longitude) differences
    taken directly from the ``lat``/``long`` and ``merch_lat``/``merch_long``
    columns, so it is expressed in the same units as those columns.
    """
    lat_gap = df["lat"] - df["merch_lat"]
    long_gap = df["long"] - df["merch_long"]
    squared_norm = lat_gap ** 2 + long_gap ** 2
    return np.sqrt(squared_norm)
def load_and_prepare(filepath: str, test_size: float = 0.2, random_state: int = 42):
    """Read the fraud CSV, build the feature matrix and split it.

    Steps, in order:
      1. Load the CSV at ``filepath`` into a DataFrame.
      2. Add the derived ``age`` (from ``dob``) and ``distance`` columns.
      3. Label-encode every column of ``CATEGORICAL_COLS`` that is present.
      4. Keep the numeric, categorical and derived columns that exist in the
         frame as features, and ``TARGET_COL`` as the label.
      5. Split with ``train_test_split``, stratified on the label.

    Args:
        filepath: Path to the CSV file.
        test_size: Proportion of rows assigned to the test set.
        random_state: Seed passed to the split for reproducibility.

    Returns:
        The four-element result of ``train_test_split``, in the order
        (X_train, X_test, y_train, y_test).
    """
    data = pd.read_csv(filepath)

    # Derived features; "dob" and the coordinate columns are read unconditionally.
    data["age"] = compute_age(data["dob"])
    data["distance"] = compute_distance(data)

    # Integer-encode the categorical columns that the file actually provides.
    # Values are cast to str first so every entry is encoded as text.
    present_categoricals = [name for name in CATEGORICAL_COLS if name in data.columns]
    for name in present_categoricals:
        data[name] = LabelEncoder().fit_transform(data[name].astype(str))

    # Candidate features in a fixed order, restricted to existing columns.
    candidates = [*NUMERIC_COLS, *CATEGORICAL_COLS, "age", "distance"]
    selected = [name for name in candidates if name in data.columns]

    features = data[selected]
    target = data[TARGET_COL]

    # Stratifying on the target keeps the class ratio equal in both partitions.
    return train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
        stratify=target,
    )
if __name__ == "__main__":
    # Manual smoke run: prepare the default training file and report the
    # split shapes plus the share of positive labels in the training part.
    X_train, X_test, y_train, y_test = load_and_prepare(filepath="data/train.csv")

    shapes_line = f"Train: {X_train.shape}, Test: {X_test.shape}"
    print(shapes_line)

    fraud_rate_line = f"Taux de fraude (train): {y_train.mean():.4f}"
    print(fraud_rate_line)