| | import numpy as np |
| | import pandas as pd |
| |
|
| | |
| | import matplotlib.pyplot as plt |
| | import seaborn as sns |
| |
|
| | |
| | from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report |
| |
|
| | |
| | from wordcloud import WordCloud, STOPWORDS |
| |
|
| | |
| | from sklearn.feature_extraction.text import CountVectorizer |
| |
|
| | |
| | from sklearn.model_selection import train_test_split |
| |
|
| | |
| | from sklearn.naive_bayes import MultinomialNB |
| |
|
| | |
| | from sklearn.pipeline import Pipeline |
| | |
| | import warnings |
| | warnings.filterwarnings('ignore') |
| |
|
# Load the pre-cleaned spam dataset. ISO-8859-1 decodes the bytes in the
# raw SMS corpus that are not valid UTF-8.
df = pd.read_csv("./data/clean_spam.csv", encoding='ISO-8859-1')

# Quick sanity check on the first few rows.
preview = df.head()
print(preview)
| |
|
| |
|
| |
|
| |
|
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit *model* and report train/test performance for a binary classifier.

    Fits the model on (X_train, y_train), then:
      * prints train/test ROC AUC (computed from predicted probabilities),
      * plots both ROC curves against the chance diagonal,
      * plots train/test confusion matrices as heatmaps,
      * prints train/test classification reports as markdown tables.

    Parameters
    ----------
    model : estimator exposing fit / predict / predict_proba
        e.g. an sklearn Pipeline.
    X_train, X_test : feature data for the train / test splits.
    y_train, y_test : binary target labels for the train / test splits.

    Returns
    -------
    list of float
        [precision_train, precision_test, recall_train, recall_test,
         acc_train, acc_test, roc_auc_train, roc_auc_test, F1_train, F1_test]
        where precision / recall / F1 are the 'weighted avg' values from the
        classification report.
    """
    model.fit(X_train, y_train)

    # Hard label predictions and positive-class probabilities.
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    pred_prob_train = model.predict_proba(X_train)[:, 1]
    pred_prob_test = model.predict_proba(X_test)[:, 1]

    # BUG FIX: ROC AUC must be computed from probability scores, not hard
    # labels. Using labels collapses the ROC curve to a single threshold
    # point and systematically understates the AUC; it also disagreed with
    # the plotted curves below, which already used the probabilities.
    roc_auc_train = roc_auc_score(y_train, pred_prob_train)
    roc_auc_test = roc_auc_score(y_test, pred_prob_test)
    print("\nTrain ROC AUC:", roc_auc_train)
    print("Test ROC AUC:", roc_auc_test)

    # ROC curves for both splits plus the chance diagonal.
    fpr_train, tpr_train, _ = roc_curve(y_train, pred_prob_train)
    fpr_test, tpr_test, _ = roc_curve(y_test, pred_prob_test)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr_train, tpr_train, label="Train ROC AUC: {:.2f}".format(roc_auc_train))
    plt.plot(fpr_test, tpr_test, label="Test ROC AUC: {:.2f}".format(roc_auc_test))
    plt.legend()
    plt.title("ROC Curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.show()

    # Confusion matrices for both splits, side by side.
    cm_train = confusion_matrix(y_train, y_pred_train)
    cm_test = confusion_matrix(y_test, y_pred_test)

    fig, ax = plt.subplots(1, 2, figsize=(11, 4))

    print("\nConfusion Matrix:")
    sns.heatmap(cm_train, annot=True, xticklabels=['Negative', 'Positive'],
                yticklabels=['Negative', 'Positive'], cmap="Oranges",
                fmt='.4g', ax=ax[0])
    ax[0].set_xlabel("Predicted Label")
    ax[0].set_ylabel("True Label")
    ax[0].set_title("Train Confusion Matrix")

    sns.heatmap(cm_test, annot=True, xticklabels=['Negative', 'Positive'],
                yticklabels=['Negative', 'Positive'], cmap="Oranges",
                fmt='.4g', ax=ax[1])
    ax[1].set_xlabel("Predicted Label")
    ax[1].set_ylabel("True Label")
    ax[1].set_title("Test Confusion Matrix")

    plt.tight_layout()
    plt.show()

    # Classification reports in dict form so we can both print them as
    # tables and index the weighted averages below.
    cr_train = classification_report(y_train, y_pred_train, output_dict=True)
    cr_test = classification_report(y_test, y_pred_test, output_dict=True)
    print("\nTrain Classification Report:")
    print(pd.DataFrame(cr_train).T.to_markdown())

    print("\nTest Classification Report:")
    print(pd.DataFrame(cr_test).T.to_markdown())

    # Weighted-average summary metrics pulled from the reports.
    precision_train = cr_train['weighted avg']['precision']
    precision_test = cr_test['weighted avg']['precision']

    recall_train = cr_train['weighted avg']['recall']
    recall_test = cr_test['weighted avg']['recall']

    acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
    acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

    F1_train = cr_train['weighted avg']['f1-score']
    F1_test = cr_test['weighted avg']['f1-score']

    return [precision_train, precision_test, recall_train, recall_test,
            acc_train, acc_test, roc_auc_train, roc_auc_test,
            F1_train, F1_test]
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | |
| | X_train,X_test,y_train,y_test=train_test_split(df.Message,df.Spam,test_size=0.25) |
| |
|
| |
|
| | |
| |
|
| | |
| | |
# Bag-of-words features feeding a multinomial Naive Bayes classifier.
# Wrapping both in a single Pipeline ensures the vectorizer is fit only
# on the training split inside evaluate_model.
steps = [
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
]
clf = Pipeline(steps)

# Fit, report, and collect the summary metric list.
MultinomialNB_score = evaluate_model(clf, X_train, X_test, y_train, y_test)
print(MultinomialNB_score)
| |
|
| |
|
| |
|
| |
|
| |
|
| | |
| |
|
| | |
| |
|
| | |
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
| |
|
| |
|
| | |
| |
|
from joblib import dump

# Persist the fitted pipeline (vectorizer + classifier) for later reuse.
MODEL_PATH = './QuietML.joblib'
dump(clf, MODEL_PATH)
| |
|