File size: 5,198 Bytes
161d0ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74e6a06
161d0ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import os
import sys
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.metrics import (
    roc_auc_score, f1_score, precision_score,
    recall_score, accuracy_score, roc_curve,
)

# Put the directory two levels above this file at the front of the import path.
sys.path.insert(
    0,
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)),
)

# Artifact locations. These are relative paths, so they resolve against the
# current working directory of the process.
PROCESSED_DIR = os.path.join('data', 'processed')
MODELS_DIR = 'models'
CHARTS_DIR = os.path.join(PROCESSED_DIR, 'charts')

# Display name -> pickle file name (looked up inside MODELS_DIR).
MODEL_FILES = {
    "Logistic Regression": "logistic_regression.pkl",
    "Decision Tree": "decision_tree.pkl",
    "Random Forest": "random_forest.pkl",
    "Extra Trees": "extra_trees.pkl",
    "AdaBoost": "adaboost.pkl",
    "Gradient Boosting": "gradient_boosting.pkl",
    "XGBoost": "xgboost.pkl",
    "LightGBM": "lightgbm.pkl",
    "CatBoost": "catboost.pkl",
}


def load_test_data():
    """Load the pickled test features and labels.

    Reads ``X_test.pkl`` and ``y_test.pkl`` from ``PROCESSED_DIR`` with joblib.

    Returns:
        A ``(X_test, y_test)`` tuple of the two deserialized objects.
    """
    features, labels = (
        joblib.load(os.path.join(PROCESSED_DIR, filename))
        for filename in ('X_test.pkl', 'y_test.pkl')
    )
    return features, labels


def score_all_models(X_test, y_test):
    """Score every model listed in ``MODEL_FILES`` on the test data.

    Each pickle is loaded from ``MODELS_DIR`` and asked for class predictions
    and probabilities; five metrics are computed and rounded to four decimals.
    A progress line is printed after each model.

    Args:
        X_test: Features passed to each model's ``predict`` / ``predict_proba``.
        y_test: True labels the predictions are compared against.

    Returns:
        A DataFrame with one row per model and the columns ``Model``,
        ``AUC-ROC``, ``F1``, ``Precision``, ``Recall`` and ``Accuracy``,
        sorted by ``AUC-ROC`` descending and indexed from 1.
    """
    # Metrics computed from class predictions, in output column order.
    label_metrics = (
        ("F1", f1_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("Accuracy", accuracy_score),
    )

    rows = []
    for display_name, filename in MODEL_FILES.items():
        estimator = joblib.load(os.path.join(MODELS_DIR, filename))
        predicted = estimator.predict(X_test)
        # Column 1 of predict_proba -- presumably the positive class; confirm
        # against how the models were trained.
        scores = estimator.predict_proba(X_test)[:, 1]

        row = {
            "Model": display_name,
            "AUC-ROC": round(roc_auc_score(y_test, scores), 4),
        }
        for column, metric_fn in label_metrics:
            row[column] = round(metric_fn(y_test, predicted), 4)
        rows.append(row)
        print(f"  Scored : {display_name}")

    ranked = pd.DataFrame(rows).sort_values("AUC-ROC", ascending=False)
    ranked = ranked.reset_index(drop=True)
    ranked.index += 1  # rank-style index: best model is row 1
    return ranked


def plot_bar_comparison(df):
    """Save a 2x2 grid of horizontal bar charts comparing the models.

    There is one panel each for AUC-ROC, F1, Precision and Recall. Within a
    panel the bars are ordered ascending by that metric and annotated with the
    value to four decimals. The figure is written to
    ``CHARTS_DIR/metric_comparison.png``; the directory is created if missing.

    Args:
        df: DataFrame with a ``Model`` column plus the four metric columns.
    """
    os.makedirs(CHARTS_DIR, exist_ok=True)
    fig, axes = plt.subplots(2, 2, figsize=(16, 10))
    palette = plt.cm.RdYlGn(np.linspace(0.3, 0.9, len(df)))

    panels = zip(axes.flatten(), ["AUC-ROC", "F1", "Precision", "Recall"])
    for ax, metric in panels:
        ordered = df.sort_values(metric, ascending=True)
        scores = ordered[metric]
        bars = ax.barh(ordered["Model"], scores, color=palette)
        ax.set_title(metric, fontsize=13, fontweight='bold')
        # Start the x-axis just below the weakest score, never below zero.
        ax.set_xlim(max(0, scores.min() - 0.05), 1.0)
        for bar, score in zip(bars, scores):
            label_y = bar.get_y() + bar.get_height() / 2
            ax.text(score + 0.002, label_y, f'{score:.4f}',
                    va='center', fontsize=8)

    fig.suptitle('Model Benchmark — All Metrics', fontsize=15, fontweight='bold')
    fig.tight_layout()
    out_path = os.path.join(CHARTS_DIR, 'metric_comparison.png')
    fig.savefig(out_path, bbox_inches='tight', dpi=130)
    plt.close(fig)
    print(f"  Saved  : {out_path}")

def plot_roc_curves(X_test, y_test):
    """Save one chart overlaying the ROC curve of every model.

    Each model in ``MODEL_FILES`` is loaded from ``MODELS_DIR`` and its
    ``predict_proba`` column 1 is used to draw a curve labelled with its AUC.
    A dashed diagonal marks the random baseline. The figure is written to
    ``CHARTS_DIR/roc_curves.png``; the directory is created if missing.

    Args:
        X_test: Features passed to each model's ``predict_proba``.
        y_test: True labels used for the ROC computation.
    """
    os.makedirs(CHARTS_DIR, exist_ok=True)
    fig = plt.figure(figsize=(10, 8))
    ax = fig.gca()
    palette = plt.cm.tab10(np.linspace(0, 1, len(MODEL_FILES)))

    for (display_name, filename), line_color in zip(MODEL_FILES.items(), palette):
        estimator = joblib.load(os.path.join(MODELS_DIR, filename))
        scores = estimator.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, scores)
        auc = roc_auc_score(y_test, scores)
        ax.plot(fpr, tpr, label=f"{display_name}  (AUC={auc:.4f})",
                color=line_color, linewidth=1.8)

    # Chance-level reference line.
    ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random (AUC=0.5)')
    ax.set_xlabel('False Positive Rate', fontsize=12)
    ax.set_ylabel('True Positive Rate', fontsize=12)
    ax.set_title('ROC Curves — All Models', fontsize=14, fontweight='bold')
    ax.legend(loc='lower right', fontsize=9)
    fig.tight_layout()
    out_path = os.path.join(CHARTS_DIR, 'roc_curves.png')
    fig.savefig(out_path, bbox_inches='tight', dpi=130)
    plt.close(fig)
    print(f"  Saved  : {out_path}")


def save_best_model(df):
    """Copy the top-ranked model to ``MODELS_DIR/best_model.pkl``.

    The first row of ``df`` is taken as the winner, so ``df`` is expected to
    be ordered best-first already. The winner's pickle is loaded via
    ``MODEL_FILES`` and dumped again under the fixed name ``best_model.pkl``.

    Args:
        df: Results table with ``Model`` and ``AUC-ROC`` columns.

    Returns:
        A ``(best_name, best_model)`` tuple: the display name from the first
        row and the loaded model object.
    """
    top_row = df.iloc[0]
    best_name = top_row["Model"]

    source_path = os.path.join(MODELS_DIR, MODEL_FILES[best_name])
    best_model = joblib.load(source_path)

    out_path = os.path.join(MODELS_DIR, 'best_model.pkl')
    joblib.dump(best_model, out_path)

    print(f"  Best   : {best_name}  (AUC-ROC={top_row['AUC-ROC']})")
    print(f"  Saved  : {out_path}")
    print(f"  Note   : Run tune_catboost.py to get the tuned version → catboost_final.pkl")
    return best_name, best_model


def run():
    """Run the full comparison pipeline and print the ranked results.

    Steps, in order: load the test data, score all models, save the metric
    bar charts, save the ROC chart, save the best model, then print the
    results table.

    Returns:
        The results DataFrame produced by ``score_all_models``.
    """
    rule = "=" * 55

    print(rule)
    print("  Model Comparison & Selection")
    print(rule)

    X_test, y_test = load_test_data()

    print("\n[1] Scoring all models ...")
    df = score_all_models(X_test, y_test)

    print("\n[2] Generating comparison bar charts ...")
    plot_bar_comparison(df)

    print("\n[3] Generating ROC curves ...")
    plot_roc_curves(X_test, y_test)

    print("\n[4] Saving best model ...")
    save_best_model(df)

    print("\n")
    print(rule)
    print("  Benchmark Results (ranked by AUC-ROC)")
    print(rule)
    print(df.to_string())
    print(rule)

    return df


# Script entry point: run the benchmark only when executed directly.
if __name__ == "__main__":
    run()