Spaces:
Sleeping
Sleeping
| import os | |
| import sys | |
| import warnings | |
| import joblib | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.tree import DecisionTreeClassifier | |
| from sklearn.ensemble import ( | |
| RandomForestClassifier, ExtraTreesClassifier, | |
| AdaBoostClassifier, GradientBoostingClassifier, | |
| ) | |
| from sklearn.metrics import ( | |
| roc_auc_score, f1_score, precision_score, | |
| recall_score, accuracy_score, | |
| ) | |
| from xgboost import XGBClassifier | |
| from lightgbm import LGBMClassifier | |
| from catboost import CatBoostClassifier | |
| # allow imports from project root | |
| sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) | |
| from src.mlflow_setup import init_mlflow, log_model_run | |
# Silence every warning category for the whole process.
warnings.filterwarnings("ignore")

# Both locations are relative paths, so they resolve against the current
# working directory at run time.
PROCESSED_DIR = os.path.join("data", "processed")
MODELS_DIR = "models"
# ---------------------------------------------------------------------------
# Model registry: one (display name, estimator, params_to_log) tuple per model.
#
# The params_to_log dicts are written by hand and are only a subset of the
# constructor arguments; keep them in sync when tuning an estimator.
# ---------------------------------------------------------------------------
MODELS = [
    # --- linear baseline ---------------------------------------------------
    (
        "Logistic Regression",
        LogisticRegression(
            max_iter=1000,
            random_state=42,
        ),
        # NOTE(review): "solver" is logged but never passed to the
        # constructor, so it relies on the library default being lbfgs.
        {"max_iter": 1000, "solver": "lbfgs"},
    ),
    # --- single tree -------------------------------------------------------
    (
        "Decision Tree",
        DecisionTreeClassifier(
            max_depth=10,
            random_state=42,
        ),
        {"max_depth": 10},
    ),
    # --- scikit-learn ensembles --------------------------------------------
    (
        "Random Forest",
        RandomForestClassifier(
            n_estimators=100,
            n_jobs=-1,
            random_state=42,
        ),
        {"n_estimators": 100},
    ),
    (
        "Extra Trees",
        ExtraTreesClassifier(
            n_estimators=100,
            n_jobs=-1,
            random_state=42,
        ),
        {"n_estimators": 100},
    ),
    (
        "AdaBoost",
        AdaBoostClassifier(
            n_estimators=100,
            random_state=42,
        ),
        {"n_estimators": 100},
    ),
    (
        "Gradient Boosting",
        GradientBoostingClassifier(
            n_estimators=100,
            max_depth=4,
            random_state=42,
        ),
        {"n_estimators": 100, "max_depth": 4},
    ),
    # --- external gradient-boosting libraries ------------------------------
    (
        "XGBoost",
        XGBClassifier(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.1,
            scale_pos_weight=1,
            eval_metric="logloss",
            random_state=42,
            n_jobs=-1,
        ),
        {"n_estimators": 200, "max_depth": 6, "learning_rate": 0.1},
    ),
    (
        "LightGBM",
        LGBMClassifier(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1,
            verbose=-1,
        ),
        {"n_estimators": 200, "max_depth": 6, "learning_rate": 0.1},
    ),
    (
        "CatBoost",
        CatBoostClassifier(
            iterations=200,
            depth=6,
            learning_rate=0.1,
            random_seed=42,
            verbose=0,
        ),
        {"iterations": 200, "depth": 6, "learning_rate": 0.1},
    ),
]
def load_data():
    """Load the four pickled dataset splits from ``PROCESSED_DIR``.

    The training pair is read from the ``*_smote.pkl`` files (presumably the
    SMOTE-resampled training set -- confirm against the preprocessing step);
    the test pair is read from ``X_test.pkl`` / ``y_test.pkl``.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` exactly as stored on
        disk by joblib.
    """
    # Order matters: it defines the order of the returned tuple.
    split_files = (
        'X_train_smote.pkl',
        'y_train_smote.pkl',
        'X_test.pkl',
        'y_test.pkl',
    )
    loaded = []
    for filename in split_files:
        loaded.append(joblib.load(os.path.join(PROCESSED_DIR, filename)))
    return tuple(loaded)
def evaluate(model, X_test, y_test):
    """Score a fitted classifier on a held-out set.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature matrix passed to both prediction methods.
        y_test: Ground-truth labels the predictions are compared with.

    Returns:
        dict: ``auc_roc``, ``f1``, ``precision``, ``recall`` and ``accuracy``
        (in that order), each rounded to 6 decimal places.
    """
    predicted_labels = model.predict(X_test)
    # Second probability column; presumably the positive class of a binary
    # problem -- confirm against the label encoding.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # AUC is the only metric computed from scores rather than hard labels.
    raw_scores = {"auc_roc": roc_auc_score(y_test, positive_scores)}
    label_metrics = (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("accuracy", accuracy_score),
    )
    for key, metric_fn in label_metrics:
        raw_scores[key] = metric_fn(y_test, predicted_labels)

    return {key: round(value, 6) for key, value in raw_scores.items()}
def run():
    """Train, log, persist and rank every model listed in ``MODELS``.

    For each ``(name, estimator, params)`` entry the estimator is fitted on
    the training split, scored on the test split with ``evaluate``, logged
    through ``log_model_run`` and dumped to ``MODELS_DIR`` as
    ``<safe_name>.pkl``. A comparison table is then printed.

    Returns:
        pandas.DataFrame: One row per model (``Model`` plus the metric
        columns), sorted by ``auc_roc`` descending with a 1-based index that
        serves as the rank.
    """
    rule = "=" * 65

    print(rule)
    # Count comes from the registry so the banner cannot go stale when a
    # model is added or removed.
    print(f" Training & Benchmarking {len(MODELS)} Models")
    print(rule)

    init_mlflow()
    X_train, y_train, X_test, y_test = load_data()
    os.makedirs(MODELS_DIR, exist_ok=True)

    # Single-pass file-name sanitiser: spaces -> underscores, parentheses
    # removed (same result as the chained str.replace calls it replaces).
    filename_table = str.maketrans(' ', '_', '()')

    results = []
    for name, model, params in MODELS:
        print(f"\n Training : {name}")
        model.fit(X_train, y_train)
        metrics = evaluate(model, X_test, y_test)

        # Log to MLflow. X_test is expected to be a pandas object here,
        # since the sample is taken with .iloc.
        log_model_run(
            model_name=name,
            model=model,
            params=params,
            metrics=metrics,
            X_sample=X_test.iloc[:5],
        )

        # Save the fitted model locally.
        safe_name = name.lower().translate(filename_table)
        joblib.dump(model, os.path.join(MODELS_DIR, f'{safe_name}.pkl'))

        results.append({"Model": name, **metrics})

    # -----------------------------------------------------------------------
    # Print comparison table ranked by AUC-ROC
    # -----------------------------------------------------------------------
    df = pd.DataFrame(results).sort_values("auc_roc", ascending=False).reset_index(drop=True)
    df.index += 1  # ranks start at 1 instead of 0

    print("\n")
    print(rule)
    print(" Model Benchmark Results (ranked by AUC-ROC)")
    print(rule)
    print(df.to_string())
    print(rule)
    print(f"\n Best model : {df.iloc[0]['Model']} (AUC-ROC = {df.iloc[0]['auc_roc']})")
    print(rule)
    return df
# Script entry point: run the full benchmark only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    run()