# Page-header text captured along with the file (not part of the program):
#   ravitejas1596's picture
#   Deploy Space
#   ae7c16d verified
from __future__ import annotations
import argparse
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Tuple
import joblib
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
# Name of the label column: kept by _select_columns() and split off from the
# feature frame as the prediction target in train().
TARGET = "isFraud"
def _read_joined(raw_dir: Path, max_rows: int | None) -> pd.DataFrame:
    """Load the transaction table and left-join the identity table if present.

    Args:
        raw_dir: Directory holding ``train_transaction.csv`` and, optionally,
            ``train_identity.csv``.
        max_rows: Passed as ``nrows`` to every CSV read; ``None`` reads the
            whole file.

    Returns:
        The transaction rows, extended with the identity columns matched on
        ``TransactionID`` when the identity file exists; otherwise the
        transaction rows alone.
    """
    transactions = pd.read_csv(raw_dir / "train_transaction.csv", nrows=max_rows)

    identity_file = raw_dir / "train_identity.csv"
    if not identity_file.exists():
        return transactions

    # The same row cap is applied to the identity file, independently of
    # which transactions were read above.
    identity = pd.read_csv(identity_file, nrows=max_rows)
    return transactions.merge(identity, on="TransactionID", how="left")
def _select_columns(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str], List[str]]:
    """Reduce ``df`` to the label plus a fixed shortlist of feature columns.

    Candidate columns that are absent from ``df`` are skipped silently, and
    so is the ``TARGET`` column.

    Returns:
        A tuple ``(subset, numeric, categorical)`` where ``subset`` is a copy
        of ``df`` restricted to ``TARGET`` (if present) followed by the
        numeric and then the categorical columns, and the two lists name the
        columns of each kind that were actually found.
    """
    # Compact, stable set (keeps model lightweight + portable)
    numeric_candidates = (
        "TransactionAmt", "dist1", "dist2",
        "C1", "C2", "C3", "C4", "C5", "C6", "C7",
        "C8", "C9", "C10", "C11", "C12", "C13", "C14",
        "D1", "D2", "D3", "D4", "D5", "D10", "D11", "D15",
    )
    categorical_candidates = (
        "ProductCD",
        "card1", "card2", "card3", "card5", "card6",
        "addr1", "addr2",
        "P_emaildomain", "R_emaildomain",
        "DeviceType",
    )

    def _present(names):
        # Preserve candidate order; drop anything the frame does not have.
        return [name for name in names if name in df.columns]

    numeric = _present(numeric_candidates)
    categorical = _present(categorical_candidates)
    label = _present([TARGET])

    subset = df[label + numeric + categorical].copy()
    return subset, numeric, categorical
def _freq_encode(train: pd.DataFrame, test: pd.DataFrame, cols: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame, Dict[str, Dict[Any, int]]]:
    """Replace each column in ``cols`` with its training-set frequency.

    Values are compared as strings, with missing values treated as the
    literal category ``"NA"``. Counts are taken from ``train`` only; a value
    that never occurs in ``train`` is encoded as ``0``.

    Args:
        train: Frame the counts are learned from.
        test: Frame encoded with the counts learned from ``train``.
        cols: Columns to encode; they must exist in both frames.

    Returns:
        ``(encoded_train, encoded_test, mappings)``. The two frames are
        copies of the inputs with the listed columns converted to float
        counts; ``mappings`` holds the value-to-count dict of each column.
    """
    encoded_train = train.copy()
    encoded_test = test.copy()
    mappings: Dict[str, Dict[Any, int]] = {}

    def _as_keys(series: pd.Series) -> pd.Series:
        # Normalise to string keys so missing values become their own bucket.
        return series.astype("string").fillna("NA")

    for col in cols:
        train_keys = _as_keys(encoded_train[col])
        counts = train_keys.value_counts(dropna=False).to_dict()
        mappings[col] = counts

        encoded_train[col] = train_keys.map(counts).fillna(0).astype(float)
        test_keys = _as_keys(encoded_test[col])
        encoded_test[col] = test_keys.map(counts).fillna(0).astype(float)

    return encoded_train, encoded_test, mappings
@dataclass
class Artifacts:
    """Bundle of fitted objects returned by ``train`` and persisted by ``save``."""

    # Fitted classifier.
    model: XGBClassifier
    # Median imputer fitted on the encoded training features.
    imputer: SimpleImputer
    # Column names of the encoded training frame.
    feature_names: List[str]
    # Per-column value -> training-count dicts from the frequency encoder.
    freq_mappings: Dict[str, Dict[Any, int]]
    # Names of the columns that were frequency-encoded.
    categorical_cols: List[str]
def train(df: pd.DataFrame, numeric: List[str], categorical: List[str], seed: int) -> Tuple[Artifacts, Dict[str, Any]]:
    """Fit the XGBoost classifier and score it on a stratified hold-out split.

    Rows with a missing label are dropped, 25% of the remainder is held out
    (stratified on the label), the categorical columns are frequency-encoded
    using counts from the training part only, and remaining gaps are filled
    with training medians before the model is fitted.

    Args:
        df: Frame containing the ``TARGET`` column plus every feature column.
        numeric: Names of the numeric feature columns. Not read by this
            function; accepted so callers can pass the ``_select_columns``
            result straight through.
        categorical: Columns to frequency-encode.
        seed: Random state used for both the split and the model.

    Returns:
        ``(artifacts, metrics)``: the fitted objects, and a dict with the
        keys ``roc_auc``, ``pr_auc``, ``n_test`` and ``seed``.
    """
    df = df.dropna(subset=[TARGET]).copy()
    y = df[TARGET].astype(int).to_numpy()
    X = df.drop(columns=[TARGET])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=seed, stratify=y
    )
    X_train_enc, X_test_enc, mappings = _freq_encode(X_train, X_test, categorical)

    # SimpleImputer (non-constant strategies) discards columns that are
    # entirely missing at fit time. Left alone, the imputed matrix would have
    # fewer columns than the recorded feature names and the saved names would
    # no longer line up with the model's inputs. Drop such columns up front
    # so the names, the imputer and the model all describe the same features.
    has_values = X_train_enc.notna().any(axis=0).to_numpy()
    if not has_values.all():
        X_train_enc = X_train_enc.loc[:, has_values]
        X_test_enc = X_test_enc.loc[:, has_values]
    feature_names = list(X_train_enc.columns)

    imputer = SimpleImputer(strategy="median")
    Xtr = imputer.fit_transform(X_train_enc)
    Xte = imputer.transform(X_test_enc)

    model = XGBClassifier(
        n_estimators=900,
        learning_rate=0.04,
        max_depth=5,
        subsample=0.9,
        colsample_bytree=0.8,
        reg_lambda=2.0,
        min_child_weight=5,
        objective="binary:logistic",
        eval_metric="logloss",
        n_jobs=4,
        random_state=seed,
    )
    model.fit(Xtr, y_train)

    # Probability of the positive class on the hold-out rows.
    proba = model.predict_proba(Xte)[:, 1]
    metrics = {
        "roc_auc": float(roc_auc_score(y_test, proba)),
        "pr_auc": float(average_precision_score(y_test, proba)),
        "n_test": int(len(y_test)),
        "seed": int(seed),
    }

    artifacts = Artifacts(
        model=model,
        imputer=imputer,
        feature_names=feature_names,
        freq_mappings=mappings,
        categorical_cols=categorical,
    )
    return artifacts, metrics
def save(artifacts: Artifacts, metrics: Dict[str, Any], out_dir: Path) -> None:
    """Write the fitted artifacts and their metrics into ``out_dir``.

    Creates ``out_dir`` (including parents) if needed, then writes
    ``model.joblib`` — a plain dict keyed by the ``Artifacts`` field names —
    and ``metrics.json`` with sorted keys and two-space indentation.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    field_names = (
        "model",
        "imputer",
        "feature_names",
        "freq_mappings",
        "categorical_cols",
    )
    bundle = {name: getattr(artifacts, name) for name in field_names}
    joblib.dump(bundle, out_dir / "model.joblib")

    metrics_path = out_dir / "metrics.json"
    with metrics_path.open("w") as handle:
        json.dump(metrics, handle, indent=2, sort_keys=True)
def main() -> None:
    """Command-line entry point: load the data, train, save, print metrics.

    Flags: ``--seed``, ``--max_rows``, ``--raw_dir`` and ``--out_dir``.
    A ``--max_rows`` value of zero or less removes the row cap.
    """
    parser = argparse.ArgumentParser()
    options = (
        ("--seed", int, 42),
        ("--max_rows", int, 200000),
        ("--raw_dir", str, str(Path("data/raw"))),
        ("--out_dir", str, str(Path("artifacts"))),
    )
    for flag, kind, default in options:
        parser.add_argument(flag, type=kind, default=default)
    args = parser.parse_args()

    # Non-positive values mean "read everything".
    row_cap = None if args.max_rows <= 0 else args.max_rows

    frame = _read_joined(Path(args.raw_dir), max_rows=row_cap)
    frame, numeric, categorical = _select_columns(frame)
    artifacts, metrics = train(frame, numeric, categorical, seed=args.seed)
    save(artifacts, metrics, Path(args.out_dir))
    print(json.dumps(metrics, indent=2))


if __name__ == "__main__":
    main()