| from __future__ import annotations |
|
|
| import argparse |
| import json |
| from dataclasses import dataclass |
| from pathlib import Path |
| from typing import Any, Dict, List, Tuple |
|
|
| import joblib |
| import numpy as np |
| import pandas as pd |
| from sklearn.impute import SimpleImputer |
| from sklearn.metrics import average_precision_score, roc_auc_score |
| from sklearn.model_selection import train_test_split |
| from sklearn.pipeline import Pipeline |
| from xgboost import XGBClassifier |
|
|
|
|
# Column holding the label to predict; `train` drops rows where it is missing
# and casts the remainder to int before fitting the binary classifier.
TARGET: str = "isFraud"
|
|
|
|
| def _read_joined(raw_dir: Path, max_rows: int | None) -> pd.DataFrame: |
| tx = pd.read_csv(raw_dir / "train_transaction.csv", nrows=max_rows) |
| ident_path = raw_dir / "train_identity.csv" |
| if ident_path.exists(): |
| ident = pd.read_csv(ident_path, nrows=max_rows) |
| df = tx.merge(ident, on="TransactionID", how="left") |
| else: |
| df = tx |
| return df |
|
|
|
|
def _select_columns(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str], List[str]]:
    """Narrow *df* to the target plus the candidate features it actually has.

    Candidate columns that are absent from *df* are skipped rather than
    treated as an error.

    Returns:
        A tuple ``(subset, numeric, categorical)``. ``subset`` is a copy of
        *df* restricted to the target column (when present) followed by the
        numeric and then the categorical candidates. The two lists name the
        candidates that were found, in their declared order.
    """
    numeric_candidates = (
        "TransactionAmt", "dist1", "dist2",
        "C1", "C2", "C3", "C4", "C5", "C6", "C7",
        "C8", "C9", "C10", "C11", "C12", "C13", "C14",
        "D1", "D2", "D3", "D4", "D5", "D10", "D11", "D15",
    )
    categorical_candidates = (
        "ProductCD", "card1", "card2", "card3", "card5", "card6",
        "addr1", "addr2", "P_emaildomain", "R_emaildomain", "DeviceType",
    )

    present = df.columns
    numeric = [name for name in numeric_candidates if name in present]
    categorical = [name for name in categorical_candidates if name in present]

    selected = [*numeric, *categorical]
    if TARGET in present:
        # The target leads the selection, ahead of the feature columns.
        selected.insert(0, TARGET)

    return df[selected].copy(), numeric, categorical
|
|
|
|
| def _freq_encode(train: pd.DataFrame, test: pd.DataFrame, cols: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, Dict[Any, int]]]: |
| mappings: Dict[str, Dict[Any, int]] = {} |
| tr = train.copy() |
| te = test.copy() |
|
|
| for c in cols: |
| vc = tr[c].astype("string").fillna("NA").value_counts(dropna=False) |
| mapping = vc.to_dict() |
| mappings[c] = mapping |
|
|
| tr[c] = tr[c].astype("string").fillna("NA").map(mapping).fillna(0).astype(float) |
| te[c] = te[c].astype("string").fillna("NA").map(mapping).fillna(0).astype(float) |
|
|
| return tr, te, mappings |
|
|
|
|
@dataclass
class Artifacts:
    """Bundle of fitted objects and metadata produced by `train` and written by `save`."""

    # Classifier as returned by `train` after fitting.
    model: XGBClassifier
    # Median imputer fitted on the encoded training features.
    imputer: SimpleImputer
    # Column names of the encoded training frame, in column order.
    feature_names: List[str]
    # Per-column lookup from stringified category value to its training count.
    freq_mappings: Dict[str, Dict[Any, int]]
    # Names of the columns that were frequency-encoded.
    categorical_cols: List[str]
|
|
|
|
def train(df: pd.DataFrame, numeric: List[str], categorical: List[str], seed: int) -> Tuple[Artifacts, Dict[str, Any]]:
    """Fit the classifier on a stratified 75/25 split and score the hold-out.

    Args:
        df: Frame containing the ``TARGET`` column plus feature columns. Rows
            with a missing target are dropped; every remaining non-target
            column is used as a feature.
        numeric: Names of the numeric feature columns. Not read by this
            function; kept so the signature mirrors `_select_columns`.
        categorical: Names of the columns to frequency-encode.
        seed: Random seed for both the split and the model.

    Returns:
        ``(artifacts, metrics)`` where ``metrics`` has the keys ``roc_auc``,
        ``pr_auc``, ``n_test`` and ``seed``.
    """
    labelled = df.dropna(subset=[TARGET]).copy()
    y = labelled[TARGET].astype(int).to_numpy()
    X = labelled.drop(columns=[TARGET])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=seed, stratify=y
    )

    # Counts are learned on the training split only, so the hold-out stays unseen.
    X_train_enc, X_test_enc, mappings = _freq_encode(X_train, X_test, categorical)

    # A median SimpleImputer discards, on transform, any column that was
    # entirely missing at fit time. Drop such columns up front so that
    # `feature_names` always lines up with the matrix the model is trained on.
    all_missing = X_train_enc.isna().all(axis=0).to_numpy()
    if all_missing.any():
        X_train_enc = X_train_enc.loc[:, ~all_missing]
        X_test_enc = X_test_enc.loc[:, ~all_missing]

    feature_names = list(X_train_enc.columns)
    imputer = SimpleImputer(strategy="median")
    Xtr = imputer.fit_transform(X_train_enc)
    Xte = imputer.transform(X_test_enc)

    model = XGBClassifier(
        n_estimators=900,
        learning_rate=0.04,
        max_depth=5,
        subsample=0.9,
        colsample_bytree=0.8,
        reg_lambda=2.0,
        min_child_weight=5,
        objective="binary:logistic",
        eval_metric="logloss",
        n_jobs=4,
        random_state=seed,
    )
    model.fit(Xtr, y_train)

    # Score with the positive-class probability.
    proba = model.predict_proba(Xte)[:, 1]
    metrics = {
        "roc_auc": float(roc_auc_score(y_test, proba)),
        "pr_auc": float(average_precision_score(y_test, proba)),
        "n_test": int(len(y_test)),
        "seed": int(seed),
    }

    artifacts = Artifacts(
        model=model,
        imputer=imputer,
        feature_names=feature_names,
        freq_mappings=mappings,
        categorical_cols=categorical,
    )
    return artifacts, metrics
|
|
|
|
def save(artifacts: Artifacts, metrics: Dict[str, Any], out_dir: Path) -> None:
    """Write the trained bundle and its metrics under *out_dir*.

    Creates *out_dir* (and any missing parents) if needed, then writes
    ``model.joblib`` holding a plain dict of the artifact fields and
    ``metrics.json`` holding *metrics* with sorted keys.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    # Stored as a plain dict keyed by field name rather than as the dataclass.
    bundle_fields = (
        "model",
        "imputer",
        "feature_names",
        "freq_mappings",
        "categorical_cols",
    )
    bundle = {field: getattr(artifacts, field) for field in bundle_fields}
    joblib.dump(bundle, out_dir / "model.joblib")

    metrics_path = out_dir / "metrics.json"
    with metrics_path.open("w") as handle:
        json.dump(metrics, handle, indent=2, sort_keys=True)
|
|
|
|
def main() -> None:
    """Command-line entry point: load, select columns, train, save, print metrics."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--max_rows", type=int, default=200000)
    parser.add_argument("--raw_dir", type=str, default=str(Path("data/raw")))
    parser.add_argument("--out_dir", type=str, default=str(Path("artifacts")))
    opts = parser.parse_args()

    # A non-positive --max_rows means "read everything".
    row_cap = None if opts.max_rows <= 0 else opts.max_rows

    joined = _read_joined(Path(opts.raw_dir), max_rows=row_cap)
    features, numeric, categorical = _select_columns(joined)

    artifacts, metrics = train(features, numeric, categorical, seed=opts.seed)
    save(artifacts, metrics, Path(opts.out_dir))
    print(json.dumps(metrics, indent=2))


if __name__ == "__main__":
    main()
|
|