drnull03 committed on
Commit
31c93e2
·
1 Parent(s): 6390963

QuietML Version 1.0

Browse files
..gitignore.swp ADDED
Binary file (1.02 kB). View file
 
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ /bin/
2
+ /include/
3
+ /code/
4
+ /lib/
5
+ /lib64
6
+ /paper.pdf
7
+ /pyvenv.cfg
Stacking/.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ QuietML.joblib filter=lfs diff=lfs merge=lfs -text
Stacking/QuietML.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Stacking/QuietML.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba347f2d181db2090a7ad0e1f8145964c9a12c143c16d4517f7482be1f742d8b
3
+ size 25836193
data/.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ clean_spam.csv filter=lfs diff=lfs merge=lfs -text
2
+ spam.csv filter=lfs diff=lfs merge=lfs -text
data/clean_spam.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ecf315a25e0233d687a9c39cc2f4988880b5b27e61b8055909b4589aaa7b4a3
3
+ size 490414
data/spam.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:440e6ea9fa825578abfdd7b7932ef8393d72ef86c0c33f64676705ce40b1dfc2
3
+ size 503663
monoMNB/QuietML.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
monoMNB/QuietML.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a15e5695d59b5584771495663c912ea6d78eb35d346b706505847282be717163
3
+ size 330029
monoMNB/QuietML_feature_engineering.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# QuietML feature engineering script: load the raw spam CSV, explore it,
# clean it (rename/drop columns, add binary target), and export the result.

# Numpy & Pandas for data processing & data wrangling
import numpy as np
import pandas as pd

# Tools for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluation metric libraries
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report

# Word Cloud library
from wordcloud import WordCloud, STOPWORDS

# Data preprocessing
from sklearn.feature_extraction.text import CountVectorizer

# Model selection
from sklearn.model_selection import train_test_split

# ML model implementation
from sklearn.naive_bayes import MultinomialNB

# Pipeline class from scikit-learn
from sklearn.pipeline import Pipeline

# Suppress noisy library warnings
import warnings
warnings.filterwarnings('ignore')
#%matplotlib inline

# Print the current working directory.
# BUG FIX: the original `print(os.system("pwd"))` prints the command's exit
# status (0), not the path; os.getcwd() returns the actual cwd portably.
import os
print(os.getcwd())


# Load Dataset (encoding required: file is not UTF-8)
df = pd.read_csv("./data/spam.csv", encoding='ISO-8859-1')
print(df.head())


# Dataset rows & columns count
print("Number of rows are: ", df.shape[0])
print("Number of columns are: ", df.shape[1])


# Dataset info (dtypes, non-null counts)
df.info()


# Duplicate row count
dup = df.duplicated().sum()
print(f'number of duplicated rows are {dup}')


# Missing/null values count per column
print(df.isnull().sum())

# Dataset columns
print(df.columns)


# Describe all columns (numeric and categorical)
print(df.describe(include='all').round(2))


# Unique value counts for each column
for col in df.columns.tolist():
    print("No. of unique values in", col, "is", df[col].nunique())


# Rename the v1 & v2 columns to Category and Message
df.rename(columns={"v1": "Category", "v2": "Message"}, inplace=True)

# Drop the unnamed columns (they contain mostly missing values).
# FIX: pass a list, not a set, to `columns=` — the set literal happens to
# work but a list is the documented argument type and keeps ordering clear.
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

# Create a binary 'Spam' column: 1 for 'spam' and 0 for 'ham'
df['Spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

# Updated new dataset
print(df.head())

# Export the cleaned data frame; `index=False` avoids saving row numbers
df.to_csv('./data/clean_spam.csv', index=False)
monoMNB/QuietML_predict.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from joblib import load
import sys

# Load the trained spam-detection pipeline once at import time.
clf_loaded = load('./QuietML.joblib')


def detect_spam(email_text):
    """Classify a single email text as Ham or Spam.

    Args:
        email_text: raw email body to classify.

    Returns:
        Human-readable message with the predicted label and the
        probability of each class, formatted to 4 decimal places.
    """
    # The pipeline expects an iterable of documents; index [0] extracts the
    # single sample's result explicitly instead of relying on array
    # truthiness (the original compared a length-1 numpy array to 0).
    prediction = clf_loaded.predict([email_text])[0]
    probabilities = clf_loaded.predict_proba([email_text])[0]

    # Class order follows clf_loaded.classes_: index 0 = ham, index 1 = spam
    # — TODO confirm against the training labels.
    ham_probability = probabilities[0]
    spam_probability = probabilities[1]

    if prediction == 0:
        output_message = "This is a Ham Email!"
    else:
        output_message = "This is a Spam Email!"

    output_message += f"\nProbability (Ham): {ham_probability:.4f}"
    output_message += f"\nProbability (Spam): {spam_probability:.4f}"

    return output_message


# Example of how to use the function
#sample_email = 'hello want a free prize , you gonna have fun with this prize???'
#result = detect_spam(sample_email)
#print(result)
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Please provide an email text to classify.")
        sys.exit(1)

    email_input = sys.argv[1]
    result = detect_spam(email_input)
    print(result)
monoMNB/QuietML_training.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# QuietML training script: load the cleaned spam dataset, train a
# CountVectorizer + MultinomialNB pipeline, evaluate it, and export it.

import numpy as np
import pandas as pd

# Tools for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluation metric libraries
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report

# Word Cloud library
from wordcloud import WordCloud, STOPWORDS

# Data preprocessing
from sklearn.feature_extraction.text import CountVectorizer

# Model selection
from sklearn.model_selection import train_test_split

# ML model implementation
from sklearn.naive_bayes import MultinomialNB

# Pipeline class from scikit-learn
from sklearn.pipeline import Pipeline

# Suppress noisy library warnings
import warnings
warnings.filterwarnings('ignore')

# Cleaned dataset produced by QuietML_feature_engineering.py
df = pd.read_csv("./data/clean_spam.csv", encoding='ISO-8859-1')

print(df.head())


def evaluate_model(model, X_train, X_test, y_train, y_test):
    '''Fit *model* on the training split and report a full evaluation.

    Prints train/test ROC-AUC, plots the ROC curve, shows train/test
    confusion matrices, prints classification reports, and returns the
    scores as a list in this order:
    [precision_train, precision_test, recall_train, recall_test,
     acc_train, acc_test, roc_auc_train, roc_auc_test, F1_train, F1_test]
    '''
    # fit the model on the training data
    model.fit(X_train, y_train)

    # hard predictions and positive-class probabilities for both splits
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    pred_prob_train = model.predict_proba(X_train)[:, 1]
    pred_prob_test = model.predict_proba(X_test)[:, 1]

    # BUG FIX: ROC AUC must be computed from predicted probabilities, not
    # from hard 0/1 labels — the original passed y_pred_*, which degrades
    # AUC to a balanced-accuracy proxy and disagrees with the plotted curve.
    roc_auc_train = roc_auc_score(y_train, pred_prob_train)
    roc_auc_test = roc_auc_score(y_test, pred_prob_test)
    print("\nTrain ROC AUC:", roc_auc_train)
    print("Test ROC AUC:", roc_auc_test)

    # plot the ROC curve (diagonal = chance level)
    fpr_train, tpr_train, thresholds_train = roc_curve(y_train, pred_prob_train)
    fpr_test, tpr_test, thresholds_test = roc_curve(y_test, pred_prob_test)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr_train, tpr_train, label="Train ROC AUC: {:.2f}".format(roc_auc_train))
    plt.plot(fpr_test, tpr_test, label="Test ROC AUC: {:.2f}".format(roc_auc_test))
    plt.legend()
    plt.title("ROC Curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.show()

    # confusion matrices for both splits, side by side
    cm_train = confusion_matrix(y_train, y_pred_train)
    cm_test = confusion_matrix(y_test, y_pred_test)

    fig, ax = plt.subplots(1, 2, figsize=(11, 4))

    print("\nConfusion Matrix:")
    sns.heatmap(cm_train, annot=True, xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'], cmap="Oranges", fmt='.4g', ax=ax[0])
    ax[0].set_xlabel("Predicted Label")
    ax[0].set_ylabel("True Label")
    ax[0].set_title("Train Confusion Matrix")

    sns.heatmap(cm_test, annot=True, xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'], cmap="Oranges", fmt='.4g', ax=ax[1])
    ax[1].set_xlabel("Predicted Label")
    ax[1].set_ylabel("True Label")
    ax[1].set_title("Test Confusion Matrix")

    plt.tight_layout()
    plt.show()

    # classification reports as dicts so individual scores can be extracted
    cr_train = classification_report(y_train, y_pred_train, output_dict=True)
    cr_test = classification_report(y_test, y_pred_test, output_dict=True)
    print("\nTrain Classification Report:")
    crt = pd.DataFrame(cr_train).T
    print(crt.to_markdown())
    print("\nTest Classification Report:")
    crt2 = pd.DataFrame(cr_test).T
    print(crt2.to_markdown())

    precision_train = cr_train['weighted avg']['precision']
    precision_test = cr_test['weighted avg']['precision']

    recall_train = cr_train['weighted avg']['recall']
    recall_test = cr_test['weighted avg']['recall']

    acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
    acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

    F1_train = cr_train['weighted avg']['f1-score']
    F1_test = cr_test['weighted avg']['f1-score']

    model_score = [precision_train, precision_test, recall_train, recall_test, acc_train, acc_test, roc_auc_train, roc_auc_test, F1_train, F1_test]
    return model_score


# Splitting the data to train and test.
# FIX: random_state makes the split (and therefore the trained model and the
# reported scores) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(df.Message, df.Spam, test_size=0.25, random_state=42)


#############################

# Machine learning pipeline combining text vectorization (CountVectorizer)
# and a Multinomial Naive Bayes classifier for email spam detection.
clf = Pipeline([
    ('vectorizer', CountVectorizer()),  # Step 1: Text data transformation
    ('nb', MultinomialNB())             # Step 2: Classification using Naive Bayes
])

# Train, evaluate, and show the metric score chart
MultinomialNB_score = evaluate_model(clf, X_train, X_test, y_train, y_test)
print(MultinomialNB_score)


# Metric-choice notes — we want to choose between recall and precision:
# - precision: no false positives — if we say something is spam we are sure.
# - recall: no false negatives — we catch a high percentage of actual spam.


# Exporting the trained pipeline
from joblib import dump
dump(clf, './QuietML.joblib')
monoMNB/REQUEST ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Send a sample classification request to the local QuietML Flask API
# (see monoMNB/docker/flask_app.py, which listens on port 5000).

curl -X POST http://localhost:5000/predict \
  -H "Content-Type: application/json" \
  -d '{"email_text": "Congratulations! You have won a free lottery ticket!"}'
monoMNB/docker/Dockerfile ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
# Container image for the QuietML spam-detection Flask service.
FROM python:3.10-slim

WORKDIR /app

# Copy the application code, model (QuietML.joblib), and requirements.
COPY . .

RUN pip install --no-cache-dir -r requirements.txt

# FIX: document the service port — flask_app.py listens on 5000.
EXPOSE 5000

CMD ["python", "flask_app.py"]
monoMNB/docker/QuietML.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a15e5695d59b5584771495663c912ea6d78eb35d346b706505847282be717163
3
+ size 330029
monoMNB/docker/flask_app.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from flask import Flask, request, jsonify
from joblib import load

# Initialize Flask app
app = Flask(__name__)

# Load the spam detection model once at startup.
clf_loaded = load('./QuietML.joblib')


@app.route('/predict', methods=['POST'])
def predict():
    """Classify the posted email text and return the label + probability.

    Expects JSON body {"email_text": "..."}; returns
    {"prediction": "Spam"|"Ham", "probability": <float>} or a 400 error
    when email_text is missing/empty.
    """
    # Get the email text from the incoming request
    email_text = request.json.get('email_text')

    if not email_text:
        return jsonify({'error': 'No email_text provided'}), 400

    # FIX: extract the scalar label explicitly instead of comparing a
    # length-1 numpy array to an int via array truthiness.
    label = int(clf_loaded.predict([email_text])[0])
    proba = clf_loaded.predict_proba([email_text])[0]

    result = "Spam" if label == 1 else "Ham"
    # Probability of the predicted class; FIX: cast numpy.float64 to a plain
    # float so jsonify can serialize it reliably.
    probability = float(proba[label])

    return jsonify({'prediction': result, 'probability': probability})


if __name__ == '__main__':
    # NOTE(review): debug=True enables the interactive debugger — disable it
    # before exposing this container to untrusted networks.
    app.run(debug=True, host='0.0.0.0', port=5000)
monoMNB/docker/requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ joblib
2
+ scikit-learn
3
+ Flask
monoMNB/requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ joblib
2
+ scikit-learn
3
+ Flask
4
+