QuietML / monoMNB /QuietML_training.py

QuietML Version 1.0

31c93e2 8 months ago

5.56 kB

	import numpy as np
	import pandas as pd

	# Importing tools for visualization
	import matplotlib.pyplot as plt
	import seaborn as sns

	# Import evaluation metric libraries
	from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report

	# Word Cloud library
	from wordcloud import WordCloud, STOPWORDS

	# Library used for data preprocessing
	from sklearn.feature_extraction.text import CountVectorizer

	# Import model selection libraries
	from sklearn.model_selection import train_test_split

	# Library used for ML Model implementation
	from sklearn.naive_bayes import MultinomialNB

	# Importing the Pipeline class from scikit-learn
	from sklearn.pipeline import Pipeline
	# Library used for ignore warnings
	import warnings
	warnings.filterwarnings('ignore')

	# Load the spam dataset; the file is decoded as ISO-8859-1.
	df = pd.read_csv(filepath_or_buffer="./data/clean_spam.csv", encoding="ISO-8859-1")

	# Print the first rows as a quick sanity check of what was loaded.
	preview = df.head()
	print(preview)




	def evaluate_model(model, X_train, X_test, y_train, y_test):
	    """Fit ``model`` and report its train/test classification metrics.

	    The function fits the model on the training data, predicts labels and
	    positive-class probabilities for both splits, and then:

	    * prints the train/test ROC AUC scores and plots both ROC curves,
	    * plots the train/test confusion matrices side by side,
	    * prints the train/test classification reports as markdown tables
	      (``DataFrame.to_markdown`` relies on the optional ``tabulate`` package).

	    Args:
	        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
	            It is fitted in place. The second ``predict_proba`` column is
	            used as the positive-class score, so a binary target is assumed.
	        X_train: Training features.
	        X_test: Test features.
	        y_train: Training labels.
	        y_test: Test labels.

	    Returns:
	        A list of ten scores in this order:
	        ``[precision_train, precision_test, recall_train, recall_test,
	        acc_train, acc_test, roc_auc_train, roc_auc_test, F1_train, F1_test]``.
	        Precision, recall and F1 are the "weighted avg" values from the
	        classification report.
	    """
	    # Fit the model on the training data.
	    model.fit(X_train, y_train)

	    # Hard label predictions feed the confusion matrices and the reports.
	    y_pred_train = model.predict(X_train)
	    y_pred_test = model.predict(X_test)

	    # Probability of the second class (model.classes_[1]) for each sample.
	    pred_prob_train = model.predict_proba(X_train)[:, 1]
	    pred_prob_test = model.predict_proba(X_test)[:, 1]

	    # ROC AUC is a ranking metric: it must be computed from the continuous
	    # scores, not from thresholded labels, so that it matches the ROC curve
	    # plotted below.
	    roc_auc_train = roc_auc_score(y_train, pred_prob_train)
	    roc_auc_test = roc_auc_score(y_test, pred_prob_test)
	    print("\nTrain ROC AUC:", roc_auc_train)
	    print("Test ROC AUC:", roc_auc_test)

	    # Plot the ROC curves; the dashed diagonal is the chance-level baseline.
	    fpr_train, tpr_train, _ = roc_curve(y_train, pred_prob_train)
	    fpr_test, tpr_test, _ = roc_curve(y_test, pred_prob_test)
	    plt.plot([0, 1], [0, 1], 'k--')
	    plt.plot(fpr_train, tpr_train, label="Train ROC AUC: {:.2f}".format(roc_auc_train))
	    plt.plot(fpr_test, tpr_test, label="Test ROC AUC: {:.2f}".format(roc_auc_test))
	    plt.legend()
	    plt.title("ROC Curve")
	    plt.xlabel("False Positive Rate")
	    plt.ylabel("True Positive Rate")
	    plt.show()

	    # Confusion matrices for both splits, drawn side by side.
	    cm_train = confusion_matrix(y_train, y_pred_train)
	    cm_test = confusion_matrix(y_test, y_pred_test)

	    _, axes = plt.subplots(1, 2, figsize=(11, 4))
	    tick_labels = ['Negative', 'Positive']

	    print("\nConfusion Matrix:")
	    for axis, matrix, title in (
	        (axes[0], cm_train, "Train Confusion Matrix"),
	        (axes[1], cm_test, "Test Confusion Matrix"),
	    ):
	        sns.heatmap(
	            matrix,
	            annot=True,
	            xticklabels=tick_labels,
	            yticklabels=tick_labels,
	            cmap="Oranges",
	            fmt='.4g',
	            ax=axis,
	        )
	        axis.set_xlabel("Predicted Label")
	        axis.set_ylabel("True Label")
	        axis.set_title(title)

	    plt.tight_layout()
	    plt.show()

	    # Classification reports as dicts so individual scores can be extracted.
	    cr_train = classification_report(y_train, y_pred_train, output_dict=True)
	    cr_test = classification_report(y_test, y_pred_test, output_dict=True)
	    print("\nTrain Classification Report:")
	    print(pd.DataFrame(cr_train).T.to_markdown())
	    print("\nTest Classification Report:")
	    print(pd.DataFrame(cr_test).T.to_markdown())

	    # Support-weighted averages across classes.
	    precision_train = cr_train['weighted avg']['precision']
	    precision_test = cr_test['weighted avg']['precision']

	    recall_train = cr_train['weighted avg']['recall']
	    recall_test = cr_test['weighted avg']['recall']

	    acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
	    acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

	    F1_train = cr_train['weighted avg']['f1-score']
	    F1_test = cr_test['weighted avg']['f1-score']

	    model_score = [
	        precision_train, precision_test,
	        recall_train, recall_test,
	        acc_train, acc_test,
	        roc_auc_train, roc_auc_test,
	        F1_train, F1_test,
	    ]
	    return model_score







	# Split the data into train (75%) and test (25%) sets.
	# A fixed random_state makes the split, the reported metrics and the exported
	# model reproducible between runs; stratify keeps the label proportions of
	# df.Spam the same in both splits.
	X_train, X_test, y_train, y_test = train_test_split(
	    df.Message,
	    df.Spam,
	    test_size=0.25,
	    random_state=42,
	    stratify=df.Spam,
	)


	#############################

	# Create a machine learning pipeline using scikit-learn, combining text vectorization (CountVectorizer)
	# and a Multinomial Naive Bayes classifier for email spam detection.
	clf = Pipeline([
	    ('vectorizer', CountVectorizer()),  # Step 1: Text data transformation
	    ('nb', MultinomialNB()),  # Step 2: Classification using Naive Bayes
	])

	# Fit the pipeline, print/plot its evaluation metrics and keep the score list.
	MultinomialNB_score = evaluate_model(clf, X_train, X_test, y_train, y_test)
	print(MultinomialNB_score)





	# We want to choose between optimizing for recall or for precision.

	# Precision: no false positives.
	# If we say something is spam, we are sure it is.

	# Recall: no false negatives.
	# We catch a high percentage of the actual spam emails.




	# Export the pipeline (already fitted by evaluate_model) so it can be reloaded later.

	from joblib import dump

	model_path = './QuietML.joblib'
	dump(clf, model_path)