# QuietML / monoMNB / QuietML_training.py
# drnull03's picture
# QuietML Version 1.0
# 31c93e2
import numpy as np
import pandas as pd
# Importing tools for visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Import evaluation metric libraries
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report
# Word Cloud library
from wordcloud import WordCloud, STOPWORDS
# Library used for data preprocessing
from sklearn.feature_extraction.text import CountVectorizer
# Import model selection libraries
from sklearn.model_selection import train_test_split
# Library used for ML Model implementation
from sklearn.naive_bayes import MultinomialNB
# Importing the Pipeline class from scikit-learn
from sklearn.pipeline import Pipeline
# Library used to suppress warnings
import warnings
warnings.filterwarnings('ignore')
# Load the cleaned spam dataset (file is read as ISO-8859-1) and show its
# first rows as a quick sanity check.
df = pd.read_csv(
    filepath_or_buffer="./data/clean_spam.csv",
    encoding="ISO-8859-1",
)
print(df.head(n=5))
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` and report its performance on the train and test splits.

    The model is fitted in place on the training data. The function then:

    * prints the train and test ROC AUC,
    * plots the train and test ROC curves,
    * draws the train and test confusion matrices as heatmaps,
    * prints the train and test classification reports as markdown tables.

    ROC AUC is computed from the predicted probability of the second class
    (column 1 of ``predict_proba``), i.e. from the same scores used to draw
    the ROC curves, so the values shown in the legend match the plotted
    curves. Passing hard class predictions to ``roc_auc_score`` would
    collapse the curve to a single operating point and misreport the AUC.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels. Assumed binary: the heatmaps use two tick
            labels and only probability column 1 is read.
        y_test: Test labels (same assumption as ``y_train``).

    Returns:
        A list of ten scores, in this order: precision_train,
        precision_test, recall_train, recall_test, acc_train, acc_test,
        roc_auc_train, roc_auc_test, F1_train, F1_test. Precision, recall
        and F1 are the 'weighted avg' entries of the classification report.
    """
    # Fit the model on the training data.
    model.fit(X_train, y_train)

    # Hard class predictions for both splits.
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Probability of the second class, used for every ROC-based metric.
    pred_prob_train = model.predict_proba(X_train)[:, 1]
    pred_prob_test = model.predict_proba(X_test)[:, 1]

    # ROC AUC must be computed from scores, not from thresholded labels.
    roc_auc_train = roc_auc_score(y_train, pred_prob_train)
    roc_auc_test = roc_auc_score(y_test, pred_prob_test)
    print("\nTrain ROC AUC:", roc_auc_train)
    print("Test ROC AUC:", roc_auc_test)

    # Plot the ROC curves; the thresholds returned by roc_curve are not needed.
    fpr_train, tpr_train, _ = roc_curve(y_train, pred_prob_train)
    fpr_test, tpr_test, _ = roc_curve(y_test, pred_prob_test)
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
    plt.plot(fpr_train, tpr_train, label="Train ROC AUC: {:.2f}".format(roc_auc_train))
    plt.plot(fpr_test, tpr_test, label="Test ROC AUC: {:.2f}".format(roc_auc_test))
    plt.legend()
    plt.title("ROC Curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.show()

    # Confusion matrices, drawn side by side (train on the left).
    cm_train = confusion_matrix(y_train, y_pred_train)
    cm_test = confusion_matrix(y_test, y_pred_test)
    fig, ax = plt.subplots(1, 2, figsize=(11, 4))
    print("\nConfusion Matrix:")
    sns.heatmap(cm_train, annot=True, xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'], cmap="Oranges", fmt='.4g', ax=ax[0])
    ax[0].set_xlabel("Predicted Label")
    ax[0].set_ylabel("True Label")
    ax[0].set_title("Train Confusion Matrix")
    sns.heatmap(cm_test, annot=True, xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'], cmap="Oranges", fmt='.4g', ax=ax[1])
    ax[1].set_xlabel("Predicted Label")
    ax[1].set_ylabel("True Label")
    ax[1].set_title("Test Confusion Matrix")
    plt.tight_layout()
    plt.show()

    # Classification reports as dicts, so individual scores can be read back.
    cr_train = classification_report(y_train, y_pred_train, output_dict=True)
    cr_test = classification_report(y_test, y_pred_test, output_dict=True)
    print("\nTrain Classification Report:")
    crt = pd.DataFrame(cr_train).T
    print(crt.to_markdown())
    print("\nTest Classification Report:")
    crt2 = pd.DataFrame(cr_test).T
    print(crt2.to_markdown())

    # Collect the summary scores from the weighted averages of each report.
    precision_train = cr_train['weighted avg']['precision']
    precision_test = cr_test['weighted avg']['precision']
    recall_train = cr_train['weighted avg']['recall']
    recall_test = cr_test['weighted avg']['recall']
    acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
    acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)
    F1_train = cr_train['weighted avg']['f1-score']
    F1_test = cr_test['weighted avg']['f1-score']

    model_score = [
        precision_train, precision_test,
        recall_train, recall_test,
        acc_train, acc_test,
        roc_auc_train, roc_auc_test,
        F1_train, F1_test,
    ]
    return model_score
# Hold out 25% of the messages for testing.
# NOTE: no random_state is passed, so the split (and therefore the reported
# scores and the exported model) can differ from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    df["Message"],
    df["Spam"],
    test_size=0.25,
)

#############################
# Spam-detection pipeline: raw text is turned into token counts by
# CountVectorizer, and those counts feed a Multinomial Naive Bayes classifier.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),  # Step 1: text -> token-count features
    ('nb', MultinomialNB()),            # Step 2: Naive Bayes classification
])

# Fit the pipeline, show the evaluation plots/reports and keep the scores.
MultinomialNB_score = evaluate_model(clf, X_train, X_test, y_train, y_test)
print(MultinomialNB_score)

# Metric trade-off to decide on:
#   precision - few false positives: when a message is flagged as spam,
#               we can be confident it really is spam.
#   recall    - few false negatives: a high percentage of the actual spam
#               emails is caught.

# Export the model. evaluate_model has already fitted clf in place, so the
# fitted pipeline is what gets written to disk.
from joblib import dump
dump(value=clf, filename='./QuietML.joblib')