File size: 11,283 Bytes
8a08300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
"""
Model Training Script with MLflow Tracking.

CLI script to train the fraud detection model with comprehensive experiment tracking.
Logs hyperparameters, metrics, and artifacts to MLflow.

Usage:
    python src/models/train.py --data_path data/fraudTrain.csv
    python src/models/train.py --data_path data/fraudTrain.csv --experiment_name fraud_v2
"""

import argparse
import json
import os
from pathlib import Path
from typing import Dict, Tuple

import joblib
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import yaml
from sklearn.model_selection import train_test_split

from src.data.ingest import load_dataset
from src.models.metrics import calculate_metrics, find_optimal_threshold
from src.models.pipeline import create_fraud_pipeline


def parse_args():
    """Parse and return the command line arguments for a training run."""
    parser = argparse.ArgumentParser(description="Train fraud detection model")
    add = parser.add_argument

    # Required input
    add("--data_path", type=str, required=True, help="Path to input CSV/Parquet file")

    # Optional configuration
    add(
        "--params_path",
        type=str,
        default="configs/model_config.yaml",
        help="Path to model configuration YAML",
    )
    add("--experiment_name", type=str, default="fraud_detection", help="MLflow experiment name")
    add("--test_size", type=float, default=0.2, help="Test set proportion (0-1)")
    add(
        "--min_recall",
        type=float,
        default=0.80,
        help="Minimum recall target for threshold optimization (Notebook: 0.80)",
    )
    add("--output_dir", type=str, default="models", help="Directory to save model artifacts")

    return parser.parse_args()


def load_config(config_path: str) -> Dict:
    """Read the model configuration YAML at *config_path* and return it as a dict."""
    with open(config_path, "r") as fh:
        return yaml.safe_load(fh)


def prepare_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Prepare features and target from raw dataframe.

    Validates that the required columns are present, then engineers per-card
    behavioral features: 24h rolling transaction count and mean amount, an
    all-time expanding mean amount, and two ratio features. Each rolling /
    expanding statistic is shifted by one row so a transaction never includes
    its own value (avoids target/feature leakage at the row level).

    Args:
        df: Raw transaction data (Credit Card Transactions training set).
            Required columns:
            trans_date_trans_time - Transaction DateTime
            cc_num - Credit Card Number of Customer (grouping key)
            amt - Amount of Transaction
            lat / long - Latitude / Longitude of Card Holder
            merch_lat / merch_long - Latitude / Longitude of Merchant
            job - Job of Credit Card Holder
            category - Category of Merchant
            gender - Gender of Credit Card Holder
            dob - Date of Birth of Credit Card Holder
            is_fraud - Fraud Flag <--- Target Class
            Other columns (merchant, city, state, zip, city_pop, trans_num,
            unix_time, ...) are passed through untouched.

    Returns:
        Tuple of (X, y): X holds every column except is_fraud (plus the new
        engineered columns); y is the is_fraud series.

    Raises:
        ValueError: if any required column is missing from *df*.
    """
    # Required columns for training. Fail fast with a clear error instead of
    # an opaque KeyError deep inside the pandas calls below.
    required_cols = [
        "trans_date_trans_time",
        "cc_num",
        "amt",
        "lat",
        "long",
        "merch_lat",
        "merch_long",
        "job",
        "category",
        "gender",
        "dob",
        "is_fraud",
    ]
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Compute feature store features from raw data
    # Sort by user and timestamp for rolling window calculations
    print("  → Computing rolling window features (trans_count_24h, avg_amt_24h)...")

    # CRITICAL: Convert to datetime BEFORE using as index for time-based rolling windows
    df["trans_date_trans_time"] = pd.to_datetime(df["trans_date_trans_time"])

    df = df.sort_values(["cc_num", "trans_date_trans_time"])
    df = df.set_index("trans_date_trans_time")

    # 1. Transaction Velocity (Rolling Count)
    # Identifies sudden bursts in card usage.
    # NOTE(review): .shift(1) is applied AFTER the groupby result is flattened,
    # so the first row of each card may inherit the last value of the previous
    # card — confirm whether a per-group shift was intended.
    df["trans_count_24h"] = (
        df.groupby("cc_num")["amt"]
        .rolling("24h")
        .count()
        .shift(1)
        .reset_index(0, drop=True)
        .fillna(0)
    )

    # 2. Recent Spending Baseline (Rolling Mean)
    # Needed for the 24h ratio calculation; rows with no history fall back to
    # their own amount (ratio of 1.0).
    df["avg_amt_24h"] = (
        df.groupby("cc_num")["amt"]
        .rolling("24h")
        .mean()
        .shift(1)
        .reset_index(0, drop=True)
        .fillna(df["amt"])
    )

    # 3. All-time Spending Profile (Expanding Mean)
    # Captures long-term user behavior; transform keeps the original index so
    # the per-group shift here is leakage-safe.
    df["user_avg_amt_all_time"] = (
        df.groupby("cc_num")["amt"]
        .transform(lambda x: x.expanding().mean().shift(1))
        .fillna(df["amt"])
    )

    # Reset index to restore dataframe structure
    df = df.reset_index()

    # 4. Derived Ratio Features
    # Identifies spikes relative to recent 24-hour activity (Burst Detection)
    df["amt_to_avg_ratio_24h"] = df["amt"] / df["avg_amt_24h"]

    # Identifies spikes relative to long-term behavior (Anomaly Detection)
    df["amt_relative_to_all_time"] = df["amt"] / df["user_avg_amt_all_time"]

    # Extract target
    y = df["is_fraud"]

    # Features (pipeline will extract derived features)
    feature_cols = [c for c in df.columns if c != "is_fraud"]
    X = df[feature_cols]

    return X, y


def train_model(args):
    """Main training workflow.

    Orchestrates the full run: load YAML config, load raw data, engineer
    features via prepare_data, split chronologically (train strictly before
    test), then — inside a single MLflow run — fit the pipeline, optimize
    the decision threshold for the recall target, and persist the model and
    threshold both locally and as MLflow artifacts.

    Args:
        args: Parsed CLI namespace (see parse_args) providing data_path,
            params_path, experiment_name, test_size, min_recall, output_dir.
    """

    print("=" * 70)
    print("PayShield-ML: Fraud Detection Training Pipeline")
    print("=" * 70)

    # 1. Load Configuration
    print(f"\n[1/7] Loading configuration from {args.params_path}")
    config = load_config(args.params_path)
    # Hyperparameters live under the top-level "model" key; default to an
    # empty dict so a minimal config still runs.
    model_params = config.get("model", {})

    # 2. Load Data
    print(f"\n[2/7] Loading data from {args.data_path}")
    df = load_dataset(args.data_path, validate=False)  # Skip validation for speed
    print(f"  → Loaded {len(df):,} transactions")
    print(f"  → Fraud rate: {df['is_fraud'].mean() * 100:.2f}%")

    # 3. Prepare Features
    print(f"\n[3/7] Preparing features and target")
    X, y = prepare_data(df)
    print(f"  → Features shape: {X.shape}")
    print(f"  → Target shape: {y.shape}")

    # 4. Train/Test Split (TEMPORAL - No Data Leakage)
    # A chronological split mirrors production: the model is evaluated only
    # on transactions that occur after everything it trained on.
    print(f"\n[4/7] Splitting data temporally (test_size={args.test_size})")

    # A. Data is already sorted from prepare_data (for rolling window calculations)
    # But let's ensure it's sorted and reset index
    df_combined = pd.concat([X, y], axis=1)
    df_combined = df_combined.sort_values("trans_date_trans_time").reset_index(drop=True)

    # B. Calculate split index (strictly temporal)
    split_index = int(len(df_combined) * (1 - args.test_size))

    # C. Split strictly by index (No shuffling)
    train_df = df_combined.iloc[:split_index]
    test_df = df_combined.iloc[split_index:]

    # D. Separate Features and Target
    X_train = train_df.drop("is_fraud", axis=1)
    y_train = train_df["is_fraud"]
    X_test = test_df.drop("is_fraud", axis=1)
    y_test = test_df["is_fraud"]

    # E. Report temporal boundaries and fraud rates
    print(f"  → Train: {len(X_train):,} samples")
    print(f"    • Earliest: {train_df['trans_date_trans_time'].min()}")
    print(f"    • Latest:   {train_df['trans_date_trans_time'].max()}")
    print(f"    • Fraud Rate: {y_train.mean():.4%}")

    print(f"  → Test:  {len(X_test):,} samples")
    print(f"    • Earliest: {test_df['trans_date_trans_time'].min()}")
    print(f"    • Latest:   {test_df['trans_date_trans_time'].max()}")
    print(f"    • Fraud Rate: {y_test.mean():.4%}")

    # F. Sanity check: Ensure test is strictly after train
    # (equal boundary timestamps also trigger the warning — warn only, no abort)
    if train_df["trans_date_trans_time"].max() >= test_df["trans_date_trans_time"].min():
        print("  ⚠ WARNING: Temporal overlap detected between train and test sets!")

    # 5. Initialize MLflow
    print(f"\n[5/7] Initializing MLflow experiment: {args.experiment_name}")
    mlflow.set_experiment(args.experiment_name)

    with mlflow.start_run():
        # Calculate class imbalance ratio from actual training data:
        # negatives per positive, fed to the model as scale_pos_weight.
        imbalance_ratio = (y_train == 0).sum() / (y_train == 1).sum()
        print(f"\n  → Class Imbalance Ratio: {imbalance_ratio:.2f}:1 (negative:positive)")

        # Override scale_pos_weight with calculated ratio
        # NOTE(review): this silently replaces any scale_pos_weight set in the
        # YAML config — confirm that is intended.
        model_params["scale_pos_weight"] = imbalance_ratio

        # Log parameters (model hyperparameters plus run-level settings)
        mlflow.log_params(model_params)
        mlflow.log_param("test_size", args.test_size)
        mlflow.log_param("min_recall_target", args.min_recall)
        mlflow.log_param("n_train_samples", len(X_train))
        mlflow.log_param("n_test_samples", len(X_test))

        # 6. Train Pipeline
        print(f"\n[6/7] Training pipeline")
        pipeline = create_fraud_pipeline(model_params)

        print("  → Fitting model...")
        pipeline.fit(X_train, y_train)
        print("  ✓ Training complete")

        # Predict probabilities (positive-class column only)
        y_train_prob = pipeline.predict_proba(X_train)[:, 1]
        y_test_prob = pipeline.predict_proba(X_test)[:, 1]

        # 7. Optimize Threshold
        # Pick the decision threshold on the TEST probabilities that meets the
        # minimum-recall target (see find_optimal_threshold).
        print(f"\n[7/7] Optimizing decision threshold (target recall >= {args.min_recall:.2%})")
        optimal_threshold, threshold_metrics = find_optimal_threshold(
            y_test, y_test_prob, min_recall=args.min_recall
        )
        print(f"  → Optimal threshold: {optimal_threshold:.4f}")
        print(f"  → Precision: {threshold_metrics['precision']:.4f}")
        print(f"  → Recall:    {threshold_metrics['recall']:.4f}")
        print(f"  → F1 Score:  {threshold_metrics['f1']:.4f}")
        print(f"  → PR-AUC:    {threshold_metrics['pr_auc']:.4f}")

        # Log metrics to MLflow (train PR-AUC at the default 0.5 threshold,
        # test metrics at the optimized threshold)
        mlflow.log_metrics(
            {
                "train_pr_auc": float(calculate_metrics(y_train, y_train_prob, 0.5)["pr_auc"]),
                "test_precision": threshold_metrics["precision"],
                "test_recall": threshold_metrics["recall"],
                "test_f1": threshold_metrics["f1"],
                "test_pr_auc": threshold_metrics["pr_auc"],
                "optimal_threshold": optimal_threshold,
            }
        )

        # Save artifacts locally
        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Save model (entire sklearn pipeline, pickled via joblib)
        model_path = output_dir / "fraud_model.pkl"
        joblib.dump(pipeline, model_path)
        print(f"\n✓ Model saved to {model_path}")

        # Save threshold alongside its metrics for downstream serving
        threshold_path = output_dir / "threshold.json"
        with open(threshold_path, "w") as f:
            json.dump(
                {"optimal_threshold": optimal_threshold, "metrics": threshold_metrics}, f, indent=2
            )
        print(f"✓ Threshold saved to {threshold_path}")

        # Log artifacts to MLflow
        mlflow.sklearn.log_model(pipeline, "model")
        mlflow.log_artifact(str(threshold_path))

        print("\n" + "=" * 70)
        print("✅ Training Complete!")
        print(f"MLflow Run ID: {mlflow.active_run().info.run_id}")
        print("=" * 70)


if __name__ == "__main__":
    # Script entry point: parse CLI flags and run the training workflow.
    train_model(parse_args())