drnull03 committed on
Commit
31c93e2
·
1 Parent(s): 6390963

QuietML Version 1.0

Browse files
..gitignore.swp ADDED
Binary file (1.02 kB). View file
 
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ /bin/
2
+ /include/
3
+ /code/
4
+ /lib/
5
+ /lib64
6
+ /paper.pdf
7
+ /pyvenv.cfg
Stacking/.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ QuietML.joblib filter=lfs diff=lfs merge=lfs -text
Stacking/QuietML.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Stacking/QuietML.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba347f2d181db2090a7ad0e1f8145964c9a12c143c16d4517f7482be1f742d8b
3
+ size 25836193
data/.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ clean_spam.csv filter=lfs diff=lfs merge=lfs -text
2
+ spam.csv filter=lfs diff=lfs merge=lfs -text
data/clean_spam.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ecf315a25e0233d687a9c39cc2f4988880b5b27e61b8055909b4589aaa7b4a3
3
+ size 490414
data/spam.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:440e6ea9fa825578abfdd7b7932ef8393d72ef86c0c33f64676705ce40b1dfc2
3
+ size 503663
monoMNB/QuietML.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
monoMNB/QuietML.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a15e5695d59b5584771495663c912ea6d78eb35d346b706505847282be717163
3
+ size 330029
monoMNB/QuietML_feature_engineering.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# QuietML feature engineering script: load the raw spam CSV, explore it,
# clean it (rename/drop columns, add binary target), and export the result.

# Numpy & Pandas for data processing & data wrangling
import numpy as np
import pandas as pd

# Tools for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluation metric libraries
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report

# Word Cloud library
from wordcloud import WordCloud, STOPWORDS

# Data preprocessing
from sklearn.feature_extraction.text import CountVectorizer

# Model selection
from sklearn.model_selection import train_test_split

# ML model implementation
from sklearn.naive_bayes import MultinomialNB

# Pipeline class from scikit-learn
from sklearn.pipeline import Pipeline

# Suppress noisy library warnings
import warnings
warnings.filterwarnings('ignore')
#%matplotlib inline

# Print the current working directory.
# BUG FIX: the original `print(os.system("pwd"))` prints the command's exit
# status (0), not the path; os.getcwd() returns the actual cwd portably.
import os
print(os.getcwd())


# Load Dataset (encoding required: file is not UTF-8)
df = pd.read_csv("./data/spam.csv", encoding='ISO-8859-1')
print(df.head())


# Dataset rows & columns count
print("Number of rows are: ", df.shape[0])
print("Number of columns are: ", df.shape[1])


# Dataset info (dtypes, non-null counts)
df.info()


# Duplicate row count
dup = df.duplicated().sum()
print(f'number of duplicated rows are {dup}')


# Missing/null values count per column
print(df.isnull().sum())

# Dataset columns
print(df.columns)


# Describe all columns (numeric and categorical)
print(df.describe(include='all').round(2))


# Unique value counts for each column
for col in df.columns.tolist():
    print("No. of unique values in", col, "is", df[col].nunique())


# Rename the v1 & v2 columns to Category and Message
df.rename(columns={"v1": "Category", "v2": "Message"}, inplace=True)

# Drop the unnamed columns (they contain mostly missing values).
# FIX: pass a list, not a set, to `columns=` — the set literal happens to
# work but a list is the documented argument type and keeps ordering clear.
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

# Create a binary 'Spam' column: 1 for 'spam' and 0 for 'ham'
df['Spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

# Updated new dataset
print(df.head())

# Export the cleaned data frame; `index=False` avoids saving row numbers
df.to_csv('./data/clean_spam.csv', index=False)
monoMNB/QuietML_predict.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from joblib import load
import sys

# Load the trained spam-detection pipeline once at import time.
clf_loaded = load('./QuietML.joblib')


def detect_spam(email_text):
    """Classify a single email text as Ham or Spam.

    Args:
        email_text: raw email body to classify.

    Returns:
        Human-readable message with the predicted label and the
        probability of each class, formatted to 4 decimal places.
    """
    # The pipeline expects an iterable of documents; index [0] extracts the
    # single sample's result explicitly instead of relying on array
    # truthiness (the original compared a length-1 numpy array to 0).
    prediction = clf_loaded.predict([email_text])[0]
    probabilities = clf_loaded.predict_proba([email_text])[0]

    # Class order follows clf_loaded.classes_: index 0 = ham, index 1 = spam
    # — TODO confirm against the training labels.
    ham_probability = probabilities[0]
    spam_probability = probabilities[1]

    if prediction == 0:
        output_message = "This is a Ham Email!"
    else:
        output_message = "This is a Spam Email!"

    output_message += f"\nProbability (Ham): {ham_probability:.4f}"
    output_message += f"\nProbability (Spam): {spam_probability:.4f}"

    return output_message


# Example of how to use the function
#sample_email = 'hello want a free prize , you gonna have fun with this prize???'
#result = detect_spam(sample_email)
#print(result)
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Please provide an email text to classify.")
        sys.exit(1)

    email_input = sys.argv[1]
    result = detect_spam(email_input)
    print(result)
monoMNB/QuietML_training.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# QuietML training script: load the cleaned spam dataset, train a
# CountVectorizer + MultinomialNB pipeline, evaluate it, and export it.

import numpy as np
import pandas as pd

# Tools for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluation metric libraries
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report

# Word Cloud library
from wordcloud import WordCloud, STOPWORDS

# Data preprocessing
from sklearn.feature_extraction.text import CountVectorizer

# Model selection
from sklearn.model_selection import train_test_split

# ML model implementation
from sklearn.naive_bayes import MultinomialNB

# Pipeline class from scikit-learn
from sklearn.pipeline import Pipeline

# Suppress noisy library warnings
import warnings
warnings.filterwarnings('ignore')

# Cleaned dataset produced by QuietML_feature_engineering.py
df = pd.read_csv("./data/clean_spam.csv", encoding='ISO-8859-1')

print(df.head())


def evaluate_model(model, X_train, X_test, y_train, y_test):
    '''Fit *model* on the training split and report a full evaluation.

    Prints train/test ROC-AUC, plots the ROC curve, shows train/test
    confusion matrices, prints classification reports, and returns the
    scores as a list in this order:
    [precision_train, precision_test, recall_train, recall_test,
     acc_train, acc_test, roc_auc_train, roc_auc_test, F1_train, F1_test]
    '''
    # fit the model on the training data
    model.fit(X_train, y_train)

    # hard predictions and positive-class probabilities for both splits
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    pred_prob_train = model.predict_proba(X_train)[:, 1]
    pred_prob_test = model.predict_proba(X_test)[:, 1]

    # BUG FIX: ROC AUC must be computed from predicted probabilities, not
    # from hard 0/1 labels — the original passed y_pred_*, which degrades
    # AUC to a balanced-accuracy proxy and disagrees with the plotted curve.
    roc_auc_train = roc_auc_score(y_train, pred_prob_train)
    roc_auc_test = roc_auc_score(y_test, pred_prob_test)
    print("\nTrain ROC AUC:", roc_auc_train)
    print("Test ROC AUC:", roc_auc_test)

    # plot the ROC curve (diagonal = chance level)
    fpr_train, tpr_train, thresholds_train = roc_curve(y_train, pred_prob_train)
    fpr_test, tpr_test, thresholds_test = roc_curve(y_test, pred_prob_test)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr_train, tpr_train, label="Train ROC AUC: {:.2f}".format(roc_auc_train))
    plt.plot(fpr_test, tpr_test, label="Test ROC AUC: {:.2f}".format(roc_auc_test))
    plt.legend()
    plt.title("ROC Curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.show()

    # confusion matrices for both splits, side by side
    cm_train = confusion_matrix(y_train, y_pred_train)
    cm_test = confusion_matrix(y_test, y_pred_test)

    fig, ax = plt.subplots(1, 2, figsize=(11, 4))

    print("\nConfusion Matrix:")
    sns.heatmap(cm_train, annot=True, xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'], cmap="Oranges", fmt='.4g', ax=ax[0])
    ax[0].set_xlabel("Predicted Label")
    ax[0].set_ylabel("True Label")
    ax[0].set_title("Train Confusion Matrix")

    sns.heatmap(cm_test, annot=True, xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'], cmap="Oranges", fmt='.4g', ax=ax[1])
    ax[1].set_xlabel("Predicted Label")
    ax[1].set_ylabel("True Label")
    ax[1].set_title("Test Confusion Matrix")

    plt.tight_layout()
    plt.show()

    # classification reports as dicts so individual scores can be extracted
    cr_train = classification_report(y_train, y_pred_train, output_dict=True)
    cr_test = classification_report(y_test, y_pred_test, output_dict=True)
    print("\nTrain Classification Report:")
    crt = pd.DataFrame(cr_train).T
    print(crt.to_markdown())
    print("\nTest Classification Report:")
    crt2 = pd.DataFrame(cr_test).T
    print(crt2.to_markdown())

    precision_train = cr_train['weighted avg']['precision']
    precision_test = cr_test['weighted avg']['precision']

    recall_train = cr_train['weighted avg']['recall']
    recall_test = cr_test['weighted avg']['recall']

    acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
    acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

    F1_train = cr_train['weighted avg']['f1-score']
    F1_test = cr_test['weighted avg']['f1-score']

    model_score = [precision_train, precision_test, recall_train, recall_test, acc_train, acc_test, roc_auc_train, roc_auc_test, F1_train, F1_test]
    return model_score


# Splitting the data to train and test.
# FIX: random_state makes the split (and therefore the trained model and the
# reported scores) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(df.Message, df.Spam, test_size=0.25, random_state=42)


#############################

# Machine learning pipeline combining text vectorization (CountVectorizer)
# and a Multinomial Naive Bayes classifier for email spam detection.
clf = Pipeline([
    ('vectorizer', CountVectorizer()),  # Step 1: Text data transformation
    ('nb', MultinomialNB())             # Step 2: Classification using Naive Bayes
])

# Train, evaluate, and show the metric score chart
MultinomialNB_score = evaluate_model(clf, X_train, X_test, y_train, y_test)
print(MultinomialNB_score)


# Metric-choice notes — we want to choose between recall and precision:
# - precision: no false positives — if we say something is spam we are sure.
# - recall: no false negatives — we catch a high percentage of actual spam.


# Exporting the trained pipeline
from joblib import dump
dump(clf, './QuietML.joblib')
monoMNB/REQUEST ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Send a sample classification request to the local QuietML Flask API
# (see monoMNB/docker/flask_app.py, which listens on port 5000).

curl -X POST http://localhost:5000/predict \
  -H "Content-Type: application/json" \
  -d '{"email_text": "Congratulations! You have won a free lottery ticket!"}'
monoMNB/docker/Dockerfile ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
# Container image for the QuietML spam-detection Flask service.
FROM python:3.10-slim

WORKDIR /app

# Copy the application code, model (QuietML.joblib), and requirements.
COPY . .

RUN pip install --no-cache-dir -r requirements.txt

# FIX: document the service port — flask_app.py listens on 5000.
EXPOSE 5000

CMD ["python", "flask_app.py"]
monoMNB/docker/QuietML.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a15e5695d59b5584771495663c912ea6d78eb35d346b706505847282be717163
3
+ size 330029
monoMNB/docker/flask_app.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from flask import Flask, request, jsonify
from joblib import load

# Initialize Flask app
app = Flask(__name__)

# Load the spam detection model once at startup.
clf_loaded = load('./QuietML.joblib')


@app.route('/predict', methods=['POST'])
def predict():
    """Classify the posted email text and return the label + probability.

    Expects JSON body {"email_text": "..."}; returns
    {"prediction": "Spam"|"Ham", "probability": <float>} or a 400 error
    when email_text is missing/empty.
    """
    # Get the email text from the incoming request
    email_text = request.json.get('email_text')

    if not email_text:
        return jsonify({'error': 'No email_text provided'}), 400

    # FIX: extract the scalar label explicitly instead of comparing a
    # length-1 numpy array to an int via array truthiness.
    label = int(clf_loaded.predict([email_text])[0])
    proba = clf_loaded.predict_proba([email_text])[0]

    result = "Spam" if label == 1 else "Ham"
    # Probability of the predicted class; FIX: cast numpy.float64 to a plain
    # float so jsonify can serialize it reliably.
    probability = float(proba[label])

    return jsonify({'prediction': result, 'probability': probability})


if __name__ == '__main__':
    # NOTE(review): debug=True enables the interactive debugger — disable it
    # before exposing this container to untrusted networks.
    app.run(debug=True, host='0.0.0.0', port=5000)
monoMNB/docker/requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ joblib
2
+ scikit-learn
3
+ Flask
monoMNB/requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ joblib
2
+ scikit-learn
3
+ Flask
4
+