Spaces:
Runtime error
Runtime error
| """Module de prétraitement du dataset de détection de fraude bancaire.""" | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import LabelEncoder | |
# Columns holding categorical values; they are label-encoded to integers
# before being used as features.
CATEGORICAL_COLS = [
    "category",
    "gender",
]

# Columns used as features without any transformation.
NUMERIC_COLS = [
    "amt",
    "lat",
    "long",
    "city_pop",
    "merch_lat",
    "merch_long",
]

# Name of the label column to predict.
TARGET_COL = "is_fraud"
def compute_age(
    dob_series: pd.Series,
    reference_date: "str | pd.Timestamp" = "2019-01-01",
) -> pd.Series:
    """Compute age in years from date of birth.

    Args:
        dob_series: Dates of birth, in any form ``pd.to_datetime`` can parse.
        reference_date: Date at which the age is evaluated; anything accepted
            by ``pd.Timestamp``. Defaults to ``"2019-01-01"``, the value that
            was previously hard-coded, so existing callers get the same ages.

    Returns:
        Age in years, computed as whole elapsed days divided by 365.25 and
        rounded to one decimal place.
    """
    dob = pd.to_datetime(dob_series)
    reference = pd.Timestamp(reference_date)
    # 365.25 days per year averages out leap years.
    return ((reference - dob).dt.days / 365.25).round(1)
def compute_distance(df: pd.DataFrame) -> pd.Series:
    """Compute Euclidean distance between client and merchant.

    The distance is taken directly on the raw ``lat``/``long`` and
    ``merch_lat``/``merch_long`` column values, so it is expressed in the
    units of those columns and is not a geodesic (great-circle) distance.

    Args:
        df: Frame containing the ``lat``, ``long``, ``merch_lat`` and
            ``merch_long`` columns.

    Returns:
        Row-wise distance, aligned on the index of ``df``.
    """
    # np.hypot evaluates sqrt(a**2 + b**2) without overflowing on the
    # intermediate squares.
    return np.hypot(df["lat"] - df["merch_lat"], df["long"] - df["merch_long"])
def load_and_prepare(
    filepath: str, test_size: float = 0.2, random_state: int = 42
):
    """Load, clean and prepare dataset for training.

    Reads the CSV, derives the ``age`` and ``distance`` features, label-encodes
    the categorical columns and performs a stratified train/test split.

    Columns that are absent from the CSV are skipped rather than raising: a
    derived feature is only computed when its input columns exist, and only
    feature columns actually present end up in ``X``.

    Args:
        filepath: Path to CSV file.
        test_size: Test set proportion.
        random_state: Random seed for reproducibility.

    Returns:
        The four-element result of ``train_test_split``, which unpacks as
        ``X_train, X_test, y_train, y_test``.

    Raises:
        KeyError: If the target column is missing from the CSV.
    """
    df = pd.read_csv(filepath)

    # Fail early with an explicit message instead of deep inside indexing.
    if TARGET_COL not in df.columns:
        raise KeyError(
            f"Target column {TARGET_COL!r} not found in {filepath!r}"
        )

    # Derived features follow the same "use it if it is there" policy as the
    # feature-column filter below.
    if "dob" in df.columns:
        df["age"] = compute_age(df["dob"])
    if {"lat", "long", "merch_lat", "merch_long"}.issubset(df.columns):
        df["distance"] = compute_distance(df)

    for col in CATEGORICAL_COLS:
        if col in df.columns:
            # A fresh encoder per column: each column has its own set of
            # classes, so sharing one fitted encoder would be misleading.
            # Values are cast to str first, so missing values become the
            # string "nan" and are encoded as their own class.
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    candidate_cols = NUMERIC_COLS + CATEGORICAL_COLS + ["age", "distance"]
    feature_cols = [c for c in candidate_cols if c in df.columns]

    X = df[feature_cols]
    y = df[TARGET_COL]

    # Stratify on the target so both splits keep the same class proportions.
    return train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
if __name__ == "__main__":
    # Manual run: split the default training CSV and report the resulting
    # shapes and the share of positive labels in the training split.
    splits = load_and_prepare("data/train.csv")
    X_train, X_test, y_train, y_test = splits
    print(f"Train: {X_train.shape}, Test: {X_test.shape}")
    fraud_rate = y_train.mean()
    print(f"Taux de fraude (train): {fraud_rate:.4f}")