QuietML Version 1.0
Browse files- ..gitignore.swp +0 -0
- .gitignore +7 -0
- Stacking/.gitattributes +1 -0
- Stacking/QuietML.ipynb +0 -0
- Stacking/QuietML.joblib +3 -0
- data/.gitattributes +2 -0
- data/clean_spam.csv +3 -0
- data/spam.csv +3 -0
- monoMNB/QuietML.ipynb +0 -0
- monoMNB/QuietML.joblib +3 -0
- monoMNB/QuietML_feature_engineering.py +96 -0
- monoMNB/QuietML_predict.py +44 -0
- monoMNB/QuietML_training.py +166 -0
- monoMNB/REQUEST +7 -0
- monoMNB/docker/Dockerfile +9 -0
- monoMNB/docker/QuietML.joblib +3 -0
- monoMNB/docker/flask_app.py +31 -0
- monoMNB/docker/requirements.txt +3 -0
- monoMNB/requirements.txt +4 -0
..gitignore.swp
ADDED
|
Binary file (1.02 kB). View file
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/bin/
|
| 2 |
+
/include/
|
| 3 |
+
/code/
|
| 4 |
+
/lib/
|
| 5 |
+
/lib64
|
| 6 |
+
/paper.pdf
|
| 7 |
+
/pyvenv.cfg
|
Stacking/.gitattributes
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
QuietML.joblib filter=lfs diff=lfs merge=lfs -text
|
Stacking/QuietML.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Stacking/QuietML.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ba347f2d181db2090a7ad0e1f8145964c9a12c143c16d4517f7482be1f742d8b
|
| 3 |
+
size 25836193
|
data/.gitattributes
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
clean_spam.csv filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
spam.csv filter=lfs diff=lfs merge=lfs -text
|
data/clean_spam.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0ecf315a25e0233d687a9c39cc2f4988880b5b27e61b8055909b4589aaa7b4a3
|
| 3 |
+
size 490414
|
data/spam.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:440e6ea9fa825578abfdd7b7932ef8393d72ef86c0c33f64676705ce40b1dfc2
|
| 3 |
+
size 503663
|
monoMNB/QuietML.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
monoMNB/QuietML.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a15e5695d59b5584771495663c912ea6d78eb35d346b706505847282be717163
|
| 3 |
+
size 330029
|
monoMNB/QuietML_feature_engineering.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# QuietML feature engineering: load the raw spam dataset, inspect it, and
# export a cleaned CSV (renamed columns, dropped unnamed columns, binary label).

# Importing Numpy & Pandas for data processing & data wrangling
import numpy as np
import pandas as pd

# Importing tools for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Import evaluation metric libraries
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report

# Word Cloud library
from wordcloud import WordCloud, STOPWORDS

# Library used for data preprocessing
from sklearn.feature_extraction.text import CountVectorizer

# Import model selection libraries
from sklearn.model_selection import train_test_split

# Library used for ML Model implementation
from sklearn.naive_bayes import MultinomialNB

# Importing the Pipeline class from scikit-learn
from sklearn.pipeline import Pipeline

# Library used to ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Knowing the current path.
# BUG FIX: os.system("pwd") returns the command's EXIT STATUS (0 on success),
# not the path, so the original printed "0". os.getcwd() returns the actual
# working directory.
import os
print(os.getcwd())

# Load Dataset
df = pd.read_csv("./data/spam.csv", encoding='ISO-8859-1')
print(df.head())

# Dataset Rows & Columns count
print("Number of rows are: ", df.shape[0])
print("Number of columns are: ", df.shape[1])

# Dataset Info
df.info()

# Dataset Duplicate Value Count
dup = df.duplicated().sum()
print(f'number of duplicated rows are {dup}')

# Missing Values/Null Values Count
print(df.isnull().sum())

# Dataset Columns
print(df.columns)

# Dataset Describe (all columns included)
print(df.describe(include='all').round(2))

# Check unique values for each variable.
for col in df.columns.tolist():
    print("No. of unique values in", col, "is", df[col].nunique())

# Rename the v1 & v2 columns to Category and Message
df.rename(columns={"v1": "Category", "v2": "Message"}, inplace=True)

# Remove the unnamed spill-over columns (they are mostly missing values).
# FIX: pass a list, not a set — `columns` expects a list-like of labels.
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

# Create a binary 'Spam' column: 1 for 'spam' and 0 for 'ham'.
df['Spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

# Updated new dataset
print(df.head())

# Exporting the clean data frame; `index=False` avoids saving row numbers.
df.to_csv('./data/clean_spam.csv', index=False)
monoMNB/QuietML_predict.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Command-line spam detector using the trained monoMNB pipeline."""
from joblib import load
import sys

# Load the trained CountVectorizer + MultinomialNB pipeline once at startup.
clf_loaded = load('./QuietML.joblib')


def detect_spam(email_text):
    """Classify *email_text* as Spam or Ham and report both class probabilities.

    Returns a human-readable multi-line message.
    """
    # Make a prediction using the loaded classifier; the pipeline expects a
    # list of documents, so wrap the single text in a list.
    prediction = clf_loaded.predict([email_text])
    probabilities = clf_loaded.predict_proba([email_text])

    # predict_proba returns [[P(ham), P(spam)]] for the single sample
    # (class order follows clf_loaded.classes_, i.e. [0, 1]).
    ham_probability = probabilities[0][0]
    spam_probability = probabilities[0][1]

    # BUG FIX: `prediction` is an array; index the first element instead of
    # comparing the whole array to a scalar (array-to-bool coercion is
    # fragile and deprecated for multi-element arrays).
    if prediction[0] == 0:
        output_message = "This is a Ham Email!"
    else:
        output_message = "This is a Spam Email!"

    output_message += f"\nProbability (Ham): {ham_probability:.4f}"
    output_message += f"\nProbability (Spam): {spam_probability:.4f}"

    return output_message


# Example of how to use the function:
#   python QuietML_predict.py 'hello want a free prize?'
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Please provide an email text to classify.")
        sys.exit(1)

    email_input = sys.argv[1]
    result = detect_spam(email_input)
    print(result)
monoMNB/QuietML_training.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# QuietML training: fit a CountVectorizer + MultinomialNB pipeline on the
# cleaned spam dataset, evaluate it, and export the model with joblib.

import numpy as np
import pandas as pd

# Importing tools for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Import evaluation metric libraries
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report

# Word Cloud library
from wordcloud import WordCloud, STOPWORDS

# Library used for data preprocessing
from sklearn.feature_extraction.text import CountVectorizer

# Import model selection libraries
from sklearn.model_selection import train_test_split

# Library used for ML Model implementation
from sklearn.naive_bayes import MultinomialNB

# Importing the Pipeline class from scikit-learn
from sklearn.pipeline import Pipeline

# Library used to ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Load the cleaned dataset produced by QuietML_feature_engineering.py
df = pd.read_csv("./data/clean_spam.csv", encoding='ISO-8859-1')
print(df.head())


def evaluate_model(model, X_train, X_test, y_train, y_test):
    '''Fit `model` on the training data and report evaluation metrics.

    Prints train/test ROC AUC, plots the ROC curves, shows train/test
    confusion matrices as heatmaps, and prints classification reports.

    Returns a list of scores:
    [precision_train, precision_test, recall_train, recall_test,
     acc_train, acc_test, roc_auc_train, roc_auc_test, F1_train, F1_test]
    '''
    # fit the model on the training data
    model.fit(X_train, y_train)

    # hard-label predictions and positive-class probabilities
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    pred_prob_train = model.predict_proba(X_train)[:, 1]
    pred_prob_test = model.predict_proba(X_test)[:, 1]

    # BUG FIX: ROC AUC must be computed from predicted probabilities
    # (scores), not hard class labels. With labels the "curve" collapses to
    # a single operating point and the reported AUC understates the model.
    roc_auc_train = roc_auc_score(y_train, pred_prob_train)
    roc_auc_test = roc_auc_score(y_test, pred_prob_test)
    print("\nTrain ROC AUC:", roc_auc_train)
    print("Test ROC AUC:", roc_auc_test)

    # plot the ROC curve
    fpr_train, tpr_train, thresholds_train = roc_curve(y_train, pred_prob_train)
    fpr_test, tpr_test, thresholds_test = roc_curve(y_test, pred_prob_test)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr_train, tpr_train, label="Train ROC AUC: {:.2f}".format(roc_auc_train))
    plt.plot(fpr_test, tpr_test, label="Test ROC AUC: {:.2f}".format(roc_auc_test))
    plt.legend()
    plt.title("ROC Curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.show()

    # confusion matrices for train and test
    cm_train = confusion_matrix(y_train, y_pred_train)
    cm_test = confusion_matrix(y_test, y_pred_test)

    fig, ax = plt.subplots(1, 2, figsize=(11, 4))

    print("\nConfusion Matrix:")
    sns.heatmap(cm_train, annot=True, xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'], cmap="Oranges", fmt='.4g', ax=ax[0])
    ax[0].set_xlabel("Predicted Label")
    ax[0].set_ylabel("True Label")
    ax[0].set_title("Train Confusion Matrix")

    sns.heatmap(cm_test, annot=True, xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'], cmap="Oranges", fmt='.4g', ax=ax[1])
    ax[1].set_xlabel("Predicted Label")
    ax[1].set_ylabel("True Label")
    ax[1].set_title("Test Confusion Matrix")

    plt.tight_layout()
    plt.show()

    # classification reports (as dicts so weighted averages can be extracted)
    cr_train = classification_report(y_train, y_pred_train, output_dict=True)
    cr_test = classification_report(y_test, y_pred_test, output_dict=True)
    print("\nTrain Classification Report:")
    print(pd.DataFrame(cr_train).T.to_markdown())
    print("\nTest Classification Report:")
    print(pd.DataFrame(cr_test).T.to_markdown())

    # aggregate weighted-average scores
    precision_train = cr_train['weighted avg']['precision']
    precision_test = cr_test['weighted avg']['precision']

    recall_train = cr_train['weighted avg']['recall']
    recall_test = cr_test['weighted avg']['recall']

    acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
    acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

    F1_train = cr_train['weighted avg']['f1-score']
    F1_test = cr_test['weighted avg']['f1-score']

    model_score = [precision_train, precision_test, recall_train, recall_test, acc_train, acc_test, roc_auc_train, roc_auc_test, F1_train, F1_test]
    return model_score


# Splitting the data into train and test.
# IMPROVEMENT: a fixed random_state makes the split — and therefore every
# reported metric — reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(df.Message, df.Spam, test_size=0.25, random_state=42)

# Create a machine learning pipeline combining text vectorization
# (CountVectorizer) and a Multinomial Naive Bayes classifier.
clf = Pipeline([
    ('vectorizer', CountVectorizer()),  # Step 1: Text data transformation
    ('nb', MultinomialNB())             # Step 2: Classification using Naive Bayes
])

# Visualizing evaluation metric score chart
MultinomialNB_score = evaluate_model(clf, X_train, X_test, y_train, y_test)
print(MultinomialNB_score)

# Choosing between recall and precision for a spam filter:
# - precision: no false positives — if we flag something as spam, we are sure.
# - recall: no false negatives — we catch a high percentage of actual spam.

# Exporting the model
from joblib import dump
dump(clf, './QuietML.joblib')
monoMNB/REQUEST
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
curl -X POST http://localhost:5000/predict \
|
| 5 |
+
-H "Content-Type: application/json" \
|
| 6 |
+
-d '{"email_text": "Congratulations! You have won a free lottery ticket!"}'
|
| 7 |
+
|
monoMNB/docker/Dockerfile
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
FROM python:3.10-slim

WORKDIR /app

# IMPROVEMENT: copy only the dependency manifest first so the pip-install
# layer is cached across builds; with `COPY . .` before the install, any
# source change invalidated the cache and re-installed everything.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Now copy the application code and the model artifact.
COPY . .

CMD ["python", "flask_app.py"]
monoMNB/docker/QuietML.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a15e5695d59b5584771495663c912ea6d78eb35d346b706505847282be717163
|
| 3 |
+
size 330029
|
monoMNB/docker/flask_app.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Flask service exposing the QuietML spam detector at POST /predict."""
from flask import Flask, request, jsonify
from joblib import load

# Initialize Flask app
app = Flask(__name__)

# Load the spam detection model (CountVectorizer + MultinomialNB pipeline)
# once at startup.
clf_loaded = load('./QuietML.joblib')


@app.route('/predict', methods=['POST'])
def predict():
    """Classify the JSON field `email_text`; respond with label + probability."""
    # Get the email text from the incoming request body.
    email_text = request.json.get('email_text')

    if not email_text:
        return jsonify({'error': 'No email_text provided'}), 400

    # The pipeline expects a list of documents, hence the single-element list.
    prediction = clf_loaded.predict([email_text])
    probabilities = clf_loaded.predict_proba([email_text])

    # BUG FIX: index the array element instead of comparing the whole array,
    # and cast the numpy scalar to a plain float so jsonify serializes it
    # reliably (numpy scalars are not JSON-native).
    is_spam = int(prediction[0]) == 1
    result = "Spam" if is_spam else "Ham"
    probability = float(probabilities[0][1] if is_spam else probabilities[0][0])

    # Return the result as JSON
    return jsonify({'prediction': result, 'probability': probability})


if __name__ == '__main__':
    # SECURITY NOTE(review): debug=True enables the interactive Werkzeug
    # debugger, which allows arbitrary code execution if the port is exposed.
    # Kept for parity with the original, but it must be disabled in production.
    app.run(debug=True, host='0.0.0.0', port=5000)
monoMNB/docker/requirements.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
joblib
|
| 2 |
+
scikit-learn
|
| 3 |
+
Flask
|
monoMNB/requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
joblib
|
| 2 |
+
scikit-learn
|
| 3 |
+
Flask
|
| 4 |
+
|