Spaces:

Zalaid
/

Credid-Card-Fraud-Detection

Sleeping

App Files Files Community

Credid-Card-Fraud-Detection / src /models /train_all_models.py

Zalaid

Step 6: model comparison charts, XGBoost GridSearchCV tuning, real benchmark scores in README

161d0ac 18 days ago

raw

history blame contribute delete

5.03 kB

	import os
	import sys
	import warnings
	import joblib
	import numpy as np
	import pandas as pd

	from sklearn.linear_model import LogisticRegression
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.ensemble import (
	RandomForestClassifier, ExtraTreesClassifier,
	AdaBoostClassifier, GradientBoostingClassifier,
	)
	from sklearn.metrics import (
	roc_auc_score, f1_score, precision_score,
	recall_score, accuracy_score,
	)
	from xgboost import XGBClassifier
	from lightgbm import LGBMClassifier
	from catboost import CatBoostClassifier

	# allow imports from project root
	sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
	from src.mlflow_setup import init_mlflow, log_model_run

	warnings.filterwarnings('ignore')

	# Relative locations of the prepared dataset pickles and of the saved models.
	PROCESSED_DIR = os.path.join('data', 'processed')
	MODELS_DIR = 'models'


	# ---------------------------------------------------------------------------
	# Benchmark roster. Each entry is a 3-tuple:
	#   (display name, unfitted estimator, hyper-parameters to log for the run)
	# ---------------------------------------------------------------------------
	MODELS = [
	    (
	        "Logistic Regression",
	        LogisticRegression(random_state=42, max_iter=1000),
	        dict(max_iter=1000, solver="lbfgs"),
	    ),
	    (
	        "Decision Tree",
	        DecisionTreeClassifier(random_state=42, max_depth=10),
	        dict(max_depth=10),
	    ),
	    (
	        "Random Forest",
	        RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1),
	        dict(n_estimators=100),
	    ),
	    (
	        "Extra Trees",
	        ExtraTreesClassifier(random_state=42, n_estimators=100, n_jobs=-1),
	        dict(n_estimators=100),
	    ),
	    (
	        "AdaBoost",
	        AdaBoostClassifier(random_state=42, n_estimators=100),
	        dict(n_estimators=100),
	    ),
	    (
	        "Gradient Boosting",
	        GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=4),
	        dict(n_estimators=100, max_depth=4),
	    ),
	    (
	        "XGBoost",
	        XGBClassifier(
	            random_state=42,
	            n_jobs=-1,
	            n_estimators=200,
	            max_depth=6,
	            learning_rate=0.1,
	            scale_pos_weight=1,
	            eval_metric='logloss',
	        ),
	        dict(n_estimators=200, max_depth=6, learning_rate=0.1),
	    ),
	    (
	        "LightGBM",
	        LGBMClassifier(
	            random_state=42,
	            n_jobs=-1,
	            verbose=-1,
	            n_estimators=200,
	            max_depth=6,
	            learning_rate=0.1,
	        ),
	        dict(n_estimators=200, max_depth=6, learning_rate=0.1),
	    ),
	    (
	        "CatBoost",
	        CatBoostClassifier(
	            random_seed=42,
	            verbose=0,
	            iterations=200,
	            depth=6,
	            learning_rate=0.1,
	        ),
	        dict(iterations=200, depth=6, learning_rate=0.1),
	    ),
	]


	def load_data():
	    """Load the four pickled dataset splits stored under ``PROCESSED_DIR``.

	    Returns:
	        tuple: ``(X_train, y_train, X_test, y_test)``, read in that order
	        from ``X_train_smote.pkl``, ``y_train_smote.pkl``, ``X_test.pkl``
	        and ``y_test.pkl``.
	    """
	    # Order matters: callers unpack the result positionally.
	    filenames = (
	        'X_train_smote.pkl',
	        'y_train_smote.pkl',
	        'X_test.pkl',
	        'y_test.pkl',
	    )
	    return tuple(
	        joblib.load(os.path.join(PROCESSED_DIR, filename))
	        for filename in filenames
	    )


	def evaluate(model, X_test, y_test):
	    """Score a fitted classifier on the given test split.

	    AUC-ROC is computed from column 1 of ``model.predict_proba``; the other
	    metrics are computed from the labels returned by ``model.predict``.
	    Every value is rounded to six decimal places.

	    Args:
	        model: fitted estimator exposing ``predict`` and ``predict_proba``.
	        X_test: feature matrix to score.
	        y_test: true labels for ``X_test``.

	    Returns:
	        dict: keys ``"auc_roc"``, ``"f1"``, ``"precision"``, ``"recall"``
	        and ``"accuracy"``, in that order.
	    """
	    predicted_labels = model.predict(X_test)
	    positive_scores = model.predict_proba(X_test)[:, 1]

	    # AUC-ROC is the only metric that consumes scores instead of hard labels.
	    scores = {"auc_roc": round(roc_auc_score(y_test, positive_scores), 6)}

	    label_metrics = (
	        ("f1", f1_score),
	        ("precision", precision_score),
	        ("recall", recall_score),
	        ("accuracy", accuracy_score),
	    )
	    for key, scorer in label_metrics:
	        scores[key] = round(scorer(y_test, predicted_labels), 6)
	    return scores


	def run():
	    """Train, evaluate, log and save every model in ``MODELS``, then rank them.

	    For each ``(name, estimator, params)`` entry the estimator is fitted in
	    place on the training split, scored on the test split via ``evaluate``,
	    recorded through ``log_model_run`` and dumped with joblib to
	    ``MODELS_DIR/<safe_name>.pkl``. A comparison table sorted by AUC-ROC
	    (best first) is printed at the end.

	    Returns:
	        pandas.DataFrame: one row per model with a ``"Model"`` column plus
	        the metric columns produced by ``evaluate``; the index starts at 1
	        and follows the AUC-ROC ranking.
	    """
	    print("=" * 65)
	    # Derive the count from the roster so the banner cannot go stale when
	    # models are added to or removed from MODELS.
	    print(f" Training & Benchmarking {len(MODELS)} Models")
	    print("=" * 65)

	    init_mlflow()
	    X_train, y_train, X_test, y_test = load_data()

	    os.makedirs(MODELS_DIR, exist_ok=True)
	    results = []

	    for name, model, params in MODELS:
	        print(f"\n Training : {name}")
	        model.fit(X_train, y_train)
	        metrics = evaluate(model, X_test, y_test)

	        # log to MLflow; the first five test rows are passed as the sample
	        # (X_test must support ``.iloc``)
	        log_model_run(
	            model_name=name,
	            model=model,
	            params=params,
	            metrics=metrics,
	            X_sample=X_test.iloc[:5],
	        )

	        # save model locally under a filename derived from the display name:
	        # lower-cased, spaces -> underscores, parentheses stripped
	        safe_name = name.lower().replace(' ', '_').replace('(', '').replace(')', '')
	        joblib.dump(model, os.path.join(MODELS_DIR, f'{safe_name}.pkl'))

	        results.append({"Model": name, **metrics})

	    # -----------------------------------------------------------------------
	    # Print comparison table ranked by AUC-ROC
	    # -----------------------------------------------------------------------
	    df = pd.DataFrame(results).sort_values("auc_roc", ascending=False).reset_index(drop=True)
	    df.index += 1  # 1-based index so it reads as a rank

	    print("\n")
	    print("=" * 65)
	    print(" Model Benchmark Results (ranked by AUC-ROC)")
	    print("=" * 65)
	    print(df.to_string())
	    print("=" * 65)
	    print(f"\n Best model : {df.iloc[0]['Model']} (AUC-ROC = {df.iloc[0]['auc_roc']})")
	    print("=" * 65)

	    return df


	# Script entry point: run the full benchmark when executed directly.
	if __name__ == "__main__":
	    run()