Spaces:
Sleeping
Sleeping
File size: 5,198 Bytes
161d0ac 74e6a06 161d0ac | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 | import os
import sys
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.metrics import (
roc_auc_score, f1_score, precision_score,
recall_score, accuracy_score, roc_curve,
)
# Put the directory two levels above this file at the front of the import
# search path.
sys.path.insert(
    0,
    os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')),
)

# Locations are relative paths, so they resolve against the current working
# directory at call time.
PROCESSED_DIR = os.path.join('data', 'processed')
MODELS_DIR = 'models'
CHARTS_DIR = os.path.join(PROCESSED_DIR, 'charts')

# Display name -> pickle file name under MODELS_DIR. Insertion order is the
# order in which models are scored and drawn.
MODEL_FILES = {
    "Logistic Regression": "logistic_regression.pkl",
    "Decision Tree": "decision_tree.pkl",
    "Random Forest": "random_forest.pkl",
    "Extra Trees": "extra_trees.pkl",
    "AdaBoost": "adaboost.pkl",
    "Gradient Boosting": "gradient_boosting.pkl",
    "XGBoost": "xgboost.pkl",
    "LightGBM": "lightgbm.pkl",
    "CatBoost": "catboost.pkl",
}
def load_test_data():
    """Load the held-out test split from PROCESSED_DIR.

    Returns:
        A ``(X_test, y_test)`` tuple holding whatever objects were pickled
        into ``X_test.pkl`` and ``y_test.pkl``.
    """
    features_path = os.path.join(PROCESSED_DIR, 'X_test.pkl')
    target_path = os.path.join(PROCESSED_DIR, 'y_test.pkl')
    # Features are loaded first, then the target.
    return joblib.load(features_path), joblib.load(target_path)
def score_all_models(X_test, y_test):
    """Score every model listed in MODEL_FILES on the test split.

    Each model is loaded from MODELS_DIR, asked for hard predictions and for
    class probabilities, and evaluated with AUC-ROC, F1, precision, recall
    and accuracy. Every metric is rounded to four decimals.

    Args:
        X_test: Test features passed to ``predict`` and ``predict_proba``.
        y_test: True labels for the test features.

    Returns:
        A DataFrame with one row per model, sorted by "AUC-ROC" descending,
        with an index that starts at 1.
    """
    # Metrics computed from hard predictions, in output column order.
    prediction_metrics = (
        ("F1", f1_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("Accuracy", accuracy_score),
    )

    rows = []
    for model_name, filename in MODEL_FILES.items():
        estimator = joblib.load(os.path.join(MODELS_DIR, filename))
        predicted = estimator.predict(X_test)
        # Column 1 of predict_proba is used as the ranking score
        # (presumably the positive class of a binary problem).
        scores = estimator.predict_proba(X_test)[:, 1]

        row = {
            "Model": model_name,
            "AUC-ROC": round(roc_auc_score(y_test, scores), 4),
        }
        for column, metric_fn in prediction_metrics:
            row[column] = round(metric_fn(y_test, predicted), 4)
        rows.append(row)
        print(f" Scored : {model_name}")

    table = pd.DataFrame(rows)
    table = table.sort_values("AUC-ROC", ascending=False)
    table = table.reset_index(drop=True)
    # Shift the index so the leaderboard is numbered from 1.
    table.index += 1
    return table
def plot_bar_comparison(df):
    """Save a 2x2 grid of horizontal bar charts comparing the models.

    One panel is drawn for each of AUC-ROC, F1, Precision and Recall. Within
    a panel the models are sorted ascending by that metric and each bar is
    labelled with its value. The figure is written to
    ``metric_comparison.png`` inside CHARTS_DIR, which is created if needed.

    Args:
        df: DataFrame with a "Model" column and one column per plotted
            metric.
    """
    os.makedirs(CHARTS_DIR, exist_ok=True)

    panel_metrics = ["AUC-ROC", "F1", "Precision", "Recall"]
    fig, axes = plt.subplots(2, 2, figsize=(16, 10))
    # One colour per bar position, so the gradient follows rank in a panel.
    palette = plt.cm.RdYlGn(np.linspace(0.3, 0.9, len(df)))

    for ax, metric in zip(axes.flatten(), panel_metrics):
        ranked = df.sort_values(metric, ascending=True)
        values = ranked[metric]

        bars = ax.barh(ranked["Model"], values, color=palette)
        ax.set_title(metric, fontsize=13, fontweight='bold')
        # Start the x axis just below the lowest score, never below zero.
        left_edge = max(0, values.min() - 0.05)
        ax.set_xlim(left_edge, 1.0)

        for bar, value in zip(bars, values):
            bar_centre = bar.get_y() + bar.get_height() / 2
            ax.text(value + 0.002, bar_centre,
                    f'{value:.4f}', va='center', fontsize=8)

    plt.suptitle('Model Benchmark — All Metrics', fontsize=15, fontweight='bold')
    plt.tight_layout()

    path = os.path.join(CHARTS_DIR, 'metric_comparison.png')
    plt.savefig(path, bbox_inches='tight', dpi=130)
    plt.close()
    print(f" Saved : {path}")
def plot_roc_curves(X_test, y_test):
    """Save one figure overlaying the ROC curve of every model.

    Each model in MODEL_FILES is loaded from MODELS_DIR and scored on the
    test split. Its curve is drawn with the AUC in the legend label, and a
    dashed diagonal marks the random baseline. The figure is written to
    ``roc_curves.png`` inside CHARTS_DIR, which is created if needed.

    Args:
        X_test: Test features passed to each model's ``predict_proba``.
        y_test: True labels for the test features.
    """
    os.makedirs(CHARTS_DIR, exist_ok=True)
    plt.figure(figsize=(10, 8))

    palette = plt.cm.tab10(np.linspace(0, 1, len(MODEL_FILES)))
    for position, (model_name, filename) in enumerate(MODEL_FILES.items()):
        estimator = joblib.load(os.path.join(MODELS_DIR, filename))
        # Column 1 of predict_proba is used as the ranking score.
        scores = estimator.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, scores)
        auc = roc_auc_score(y_test, scores)
        plt.plot(
            fpr,
            tpr,
            label=f"{model_name} (AUC={auc:.4f})",
            color=palette[position],
            linewidth=1.8,
        )

    # Chance-level reference line.
    plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random (AUC=0.5)')
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title('ROC Curves — All Models', fontsize=14, fontweight='bold')
    plt.legend(loc='lower right', fontsize=9)
    plt.tight_layout()

    path = os.path.join(CHARTS_DIR, 'roc_curves.png')
    plt.savefig(path, bbox_inches='tight', dpi=130)
    plt.close()
    print(f" Saved : {path}")
def save_best_model(df):
    """Copy the top-ranked model to ``best_model.pkl`` in MODELS_DIR.

    The first row of ``df`` is taken as the winner. Its pickle is loaded via
    MODEL_FILES and dumped again under the fixed name ``best_model.pkl``.

    Args:
        df: Leaderboard whose first row is the best model. It must have
            "Model" and "AUC-ROC" columns.

    Returns:
        A ``(best_name, best_model)`` tuple: the winner's display name and
        the loaded model object.
    """
    winner = df.iloc[0]
    best_name = winner["Model"]

    source_path = os.path.join(MODELS_DIR, MODEL_FILES[best_name])
    best_model = joblib.load(source_path)

    out_path = os.path.join(MODELS_DIR, 'best_model.pkl')
    joblib.dump(best_model, out_path)

    print(f" Best : {best_name} (AUC-ROC={winner['AUC-ROC']})")
    print(f" Saved : {out_path}")
    # This hint is printed whichever model won.
    print(" Note : Run tune_catboost.py to get the tuned version → catboost_final.pkl")
    return best_name, best_model
def run():
    """Run the full benchmark and return the leaderboard.

    Steps, in order: load the test split, score every model, save the
    metric bar charts, save the ROC curves, save the best model, then print
    the ranked results table.

    Returns:
        The DataFrame produced by ``score_all_models``.
    """
    rule = "=" * 55

    print(rule)
    print(" Model Comparison & Selection")
    print(rule)

    X_test, y_test = load_test_data()

    print("\n[1] Scoring all models ...")
    df = score_all_models(X_test, y_test)

    print("\n[2] Generating comparison bar charts ...")
    plot_bar_comparison(df)

    print("\n[3] Generating ROC curves ...")
    plot_roc_curves(X_test, y_test)

    print("\n[4] Saving best model ...")
    # The return value is not needed here; the call is made for its side effects.
    save_best_model(df)

    print("\n")
    print(rule)
    print(" Benchmark Results (ranked by AUC-ROC)")
    print(rule)
    print(df.to_string())
    print(rule)
    return df


if __name__ == '__main__':
    run()
|