Credid-Card-Fraud-Detection / src /models /train_all_models.py
Zalaid's picture
Step 6: model comparison charts, XGBoost GridSearchCV tuning, real benchmark scores in README
161d0ac
import os
import sys
import warnings
import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
RandomForestClassifier, ExtraTreesClassifier,
AdaBoostClassifier, GradientBoostingClassifier,
)
from sklearn.metrics import (
roc_auc_score, f1_score, precision_score,
recall_score, accuracy_score,
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# allow imports from project root
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
from src.mlflow_setup import init_mlflow, log_model_run
warnings.filterwarnings('ignore')
PROCESSED_DIR = os.path.join('data', 'processed')
MODELS_DIR = os.path.join('models')
# ---------------------------------------------------------------------------
# Model definitions — (name, estimator, params_to_log)
# ---------------------------------------------------------------------------
MODELS = [
(
"Logistic Regression",
LogisticRegression(max_iter=1000, random_state=42),
{"max_iter": 1000, "solver": "lbfgs"},
),
(
"Decision Tree",
DecisionTreeClassifier(max_depth=10, random_state=42),
{"max_depth": 10},
),
(
"Random Forest",
RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42),
{"n_estimators": 100},
),
(
"Extra Trees",
ExtraTreesClassifier(n_estimators=100, n_jobs=-1, random_state=42),
{"n_estimators": 100},
),
(
"AdaBoost",
AdaBoostClassifier(n_estimators=100, random_state=42),
{"n_estimators": 100},
),
(
"Gradient Boosting",
GradientBoostingClassifier(n_estimators=100, max_depth=4, random_state=42),
{"n_estimators": 100, "max_depth": 4},
),
(
"XGBoost",
XGBClassifier(
n_estimators=200, max_depth=6, learning_rate=0.1,
scale_pos_weight=1, eval_metric='logloss',
random_state=42, n_jobs=-1,
),
{"n_estimators": 200, "max_depth": 6, "learning_rate": 0.1},
),
(
"LightGBM",
LGBMClassifier(
n_estimators=200, max_depth=6, learning_rate=0.1,
random_state=42, n_jobs=-1, verbose=-1,
),
{"n_estimators": 200, "max_depth": 6, "learning_rate": 0.1},
),
(
"CatBoost",
CatBoostClassifier(
iterations=200, depth=6, learning_rate=0.1,
random_seed=42, verbose=0,
),
{"iterations": 200, "depth": 6, "learning_rate": 0.1},
),
]
def load_data():
X_train = joblib.load(os.path.join(PROCESSED_DIR, 'X_train_smote.pkl'))
y_train = joblib.load(os.path.join(PROCESSED_DIR, 'y_train_smote.pkl'))
X_test = joblib.load(os.path.join(PROCESSED_DIR, 'X_test.pkl'))
y_test = joblib.load(os.path.join(PROCESSED_DIR, 'y_test.pkl'))
return X_train, y_train, X_test, y_test
def evaluate(model, X_test, y_test):
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]
return {
"auc_roc" : round(roc_auc_score(y_test, y_proba), 6),
"f1" : round(f1_score(y_test, y_pred), 6),
"precision" : round(precision_score(y_test, y_pred), 6),
"recall" : round(recall_score(y_test, y_pred), 6),
"accuracy" : round(accuracy_score(y_test, y_pred), 6),
}
def run():
print("=" * 65)
print(" Training & Benchmarking 9 Models")
print("=" * 65)
init_mlflow()
X_train, y_train, X_test, y_test = load_data()
os.makedirs(MODELS_DIR, exist_ok=True)
results = []
for name, model, params in MODELS:
print(f"\n Training : {name}")
model.fit(X_train, y_train)
metrics = evaluate(model, X_test, y_test)
# log to MLflow
log_model_run(
model_name=name,
model=model,
params=params,
metrics=metrics,
X_sample=X_test.iloc[:5],
)
# save model locally
safe_name = name.lower().replace(' ', '_').replace('(', '').replace(')', '')
joblib.dump(model, os.path.join(MODELS_DIR, f'{safe_name}.pkl'))
results.append({"Model": name, **metrics})
# -----------------------------------------------------------------------
# Print comparison table ranked by AUC-ROC
# -----------------------------------------------------------------------
df = pd.DataFrame(results).sort_values("auc_roc", ascending=False).reset_index(drop=True)
df.index += 1
print("\n")
print("=" * 65)
print(" Model Benchmark Results (ranked by AUC-ROC)")
print("=" * 65)
print(df.to_string())
print("=" * 65)
print(f"\n Best model : {df.iloc[0]['Model']} (AUC-ROC = {df.iloc[0]['auc_roc']})")
print("=" * 65)
return df
if __name__ == '__main__':
run()