Spaces:
Running
Running
File size: 5,541 Bytes
99bc19c d724279 99bc19c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 | """
Central configuration — paths, column groups, model defaults, business costs.
Import this everywhere instead of scattering magic strings.
"""
from pathlib import Path
# ── Paths ────────────────────────────────────────────────────────────────────
# Project root: the parent of the directory that contains this file. Every
# other location below is derived from it.
ROOT = Path(__file__).resolve().parents[1]

# Working directories
DATA_RAW = ROOT.joinpath("data", "raw")
DATA_PROC = ROOT.joinpath("data", "processed")
MODELS_DIR = ROOT.joinpath("models")
REPORTS_DIR = ROOT.joinpath("reports")
FIGURES_DIR = REPORTS_DIR.joinpath("figures")

# Sparkov dataset files (Kaggle: kartik2112/fraud-detection)
KAGGLE_DATASET = "kartik2112/fraud-detection"
RAW_TRAIN_CSV = DATA_RAW.joinpath("fraudTrain.csv")
RAW_TEST_CSV = DATA_RAW.joinpath("fraudTest.csv")

# Processed feature tables
FEATURES_TRAIN = DATA_PROC.joinpath("features_train.parquet")
FEATURES_TEST = DATA_PROC.joinpath("features_test.parquet")

# Model artifacts (all stored under MODELS_DIR)
LGBM_MODEL = MODELS_DIR.joinpath("lgbm_fraud.joblib")
XGB_MODEL = MODELS_DIR.joinpath("xgb_fraud.joblib")
AE_MODEL = MODELS_DIR.joinpath("autoencoder.pt")
GNN_MODEL = MODELS_DIR.joinpath("gnn_fraud.pt")
MODEL_META = MODELS_DIR.joinpath("model_meta.json")
SHAP_VALUES = MODELS_DIR.joinpath("shap_values.npy")
FEATURE_PIPE = MODELS_DIR.joinpath("feature_pipeline.joblib")
# ── Target / identifiers ─────────────────────────────────────────────────────
TARGET = "is_fraud"  # label column
CARD_COL = "cc_num"  # card identifier column
TIME_COL = "trans_date_trans_time"  # transaction timestamp column
MERCHANT_COL = "merchant"  # merchant column

# Raw columns we drop: PII and identifiers that are not used as features
# directly.
DROP_RAW = [
    "first",
    "last",
    "street",
    "trans_num",
    "unix_time",
]
# ── Engineered feature groups (populated by features.py) ─────────────────────
# The column names listed here are the ones produced by
# src/features.py::engineer_features.
NUMERIC_FEATURES = [
    "amt",
    "amt_log",
    "hour",
    "day_of_week",
    "is_night",
    "is_weekend",
    "age",
    "city_pop_log",
    # Geo
    "dist_home_merchant_km",
    "dist_from_prev_txn_km",
    # Velocity (per card)
    "txn_count_1h",
    "txn_count_24h",
    "txn_count_7d",
    "amt_sum_1h",
    "amt_sum_24h",
    "amt_sum_7d",
    "amt_mean_24h",
    "secs_since_prev_txn",
    # Behavioral
    "amt_dev_from_card_mean",
    "amt_ratio_to_card_mean",
    "distinct_merchants_24h",
]

CATEGORICAL_FEATURES = [
    "category",
    "gender",
    "state",
]

# Full model input: numeric columns first, categorical columns after.
ALL_FEATURES = [*NUMERIC_FEATURES, *CATEGORICAL_FEATURES]
# ── Train / test split ───────────────────────────────────────────────────────
# Sparkov ships already split by time: fraudTrain holds the earlier
# transactions and fraudTest the later ones. We keep that temporal ordering —
# no random shuffling, which avoids leakage.
RANDOM_SEED = 42

# Last 15% of train (by time) is held out for early stopping.
VALID_FRACTION = 0.15
# ── Business cost model ──────────────────────────────────────────────────────
# Inputs to threshold optimization; tune them to a company's own economics.
#
# The two error types are not equally expensive. A missed fraud (false
# negative) loses real money — the transaction amount, which
# evaluate.expected_cost scales by — whereas blocking a legitimate transaction
# (false positive) only costs customer friction / goodwill. FN must therefore
# dominate FP.
#
# Interpretation: a missed fraud of average value costs ~5× the friction of
# one wrongly-declined customer. That pushes the optimal threshold below 0.5 —
# catch more fraud, tolerate some extra false positives.

# Missed fraud — lose the txn amount (and amount-scaled in evaluate).
COST_FALSE_NEGATIVE = 5.0
# Blocking a legit txn — flat friction / goodwill cost.
COST_FALSE_POSITIVE = 1.0
# ── Gradient boosting defaults (overridden by Optuna) ────────────────────────
# Baseline LightGBM parameters; the Optuna search overrides these defaults.
LGBM_BASE_PARAMS = {
    "objective": "binary",
    # PR-AUC — the correct metric for an imbalanced target.
    "metric": "average_precision",
    "boosting_type": "gbdt",
    # Reuse the project-wide seed.
    "random_state": RANDOM_SEED,
    "n_jobs": -1,
    "verbose": -1,
}

# Number of Optuna trials.
OPTUNA_N_TRIALS = 40
# ── Autoencoder ──────────────────────────────────────────────────────────────
AE_HIDDEN = [32, 16, 8]  # hidden sizes
AE_EPOCHS = 30  # training epochs
AE_BATCH = 2048  # batch size
AE_LR = 0.001  # learning rate
# ── GNN ──────────────────────────────────────────────────────────────────────
GNN_HIDDEN = 64  # hidden size
GNN_LAYERS = 2  # number of layers
GNN_EPOCHS = 50  # training epochs
GNN_BATCH = 4096  # batch size
GNN_LR = 0.005  # learning rate
# Neighbor-sampling fan-out, one entry per layer.
GNN_NEIGHBORS = [15, 10]
# ── MLflow ───────────────────────────────────────────────────────────────────
# Experiment name used for MLflow.
MLFLOW_EXPERIMENT = "fraud-detection"
# ── Drift (PSI) ──────────────────────────────────────────────────────────────
PSI_BINS = 10
# PSI bands: 0.1–0.25 is a moderate shift, above 0.25 a significant shift.
PSI_THRESHOLD_WARN = 0.1
PSI_THRESHOLD_ALERT = 0.25
|