Spaces:
Runtime error
Runtime error
Email Spam Classifier using ML and UI in Gradio with Jupyter Notebook
Browse files
- app.py +69 -0
- datasets/emails.csv +0 -0
- notebook/spam_email_classification.ipynb +0 -0
- requirements.txt +3 -0
- saved_models/SVM_TF-IDF.pkl +3 -0
- saved_models/vectorizer_TF-IDF.pkl +3 -0
- utils/model_loader.py +10 -0
- utils/predict.py +8 -0
- utils/preprocessing.py +21 -0
app.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
from utils.model_loader import load_models
|
| 5 |
+
from utils.predict import predict
|
| 6 |
+
|
| 7 |
+
# Load the pickled vectorizer and model once at import time; classify_email
# reuses these two module-level objects on every call.
vectorizer, model = load_models()
|
| 8 |
+
|
| 9 |
+
def classify_email(text):
    """Classify an email and return a label-to-confidence mapping.

    Args:
        text: Raw email text. ``None``, an empty string, and a
            whitespace-only string are all treated as "nothing to classify".

    Returns:
        A single-entry dict: ``{"Spam": 1.0}`` or ``{"Not Spam": 1.0}`` for
        real input, or the placeholder ``{"__not_spam__": 0.5}`` when there
        is nothing to classify.
    """
    # Check for None before calling .strip(): a None value would otherwise
    # raise AttributeError instead of yielding the placeholder result.
    if not text or not text.strip():
        return {"__not_spam__": 0.5}

    result = predict(text, vectorizer, model)

    # predict() gives a hard decision with no probability, so the chosen
    # label is reported with full confidence.
    label = "Spam" if result == "Spam" else "Not Spam"
    return {label: 1.0}
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# Sample emails shown under the form; each inner list is one row of inputs.
sample_emails = [
    ["Win a free iPhone! Click here now!!! Limited time offer."],
    ["Earn money from home with this simple trick. Start today."],
    ["Hey, are we still meeting for lunch tomorrow?"],
    ["Meeting rescheduled to 3 PM. See you then!"],
]

# Build the interface: a title, an input/prediction row, the action buttons,
# and the clickable examples. The custom CSS hides the page footer.
with gr.Blocks(css="footer {display: none !important}", theme="soft") as demo:
    gr.Markdown(
        """
        # 🚨 Spam Email Classifier
        Classify emails as **Spam** or **Not Spam** using TF-IDF + SVM
        """
    )

    with gr.Row():
        # Wide column for the email text, narrow column for the verdict.
        with gr.Column(scale=4):
            input_text = gr.Textbox(
                label="Email Text",
                info="Include subject and body for better accuracy",
                placeholder="Paste the full email content here...",
                lines=10,
            )
        with gr.Column(scale=1, min_width=200):
            output_label = gr.Label(label="Prediction", num_top_classes=1)

    with gr.Row():
        submit_btn = gr.Button("Classify", variant="primary", size="lg")
        clear_btn = gr.ClearButton([input_text, output_label], value="Clear")

    # Run the classifier on the textbox content when "Classify" is clicked.
    submit_btn.click(fn=classify_email, inputs=input_text, outputs=output_label)

    gr.Markdown("### Examples (click to load)")
    examples = gr.Examples(
        examples=sample_emails,
        fn=classify_email,
        inputs=input_text,
        outputs=output_label,
        cache_examples=False,
    )


# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
datasets/emails.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebook/spam_email_classification.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
nltk
|
| 3 |
+
scikit-learn
|
saved_models/SVM_TF-IDF.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:08d47efb74837ab4280b983d375b85d1a21fd3ef5036fd8eb29448901a50a5e1
|
| 3 |
+
size 738740
|
saved_models/vectorizer_TF-IDF.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f021f2b6366aa0e24654bca2a802ed56471a47b60455e8f0a01853bef3b184b4
|
| 3 |
+
size 182801
|
utils/model_loader.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pickle
|
| 2 |
+
|
| 3 |
+
def load_models(vectorizer_path="saved_models/vectorizer_TF-IDF.pkl",
                model_path="saved_models/SVM_TF-IDF.pkl"):
    """Load the pickled vectorizer and model from disk.

    A relative path is used as given when it exists relative to the current
    working directory. Otherwise it is retried relative to the directory one
    level above this module's folder, so the default ``saved_models/...``
    paths still resolve when the process is started from another directory.
    Absolute paths are always used unchanged.

    Args:
        vectorizer_path: Path to the pickled vectorizer.
        model_path: Path to the pickled model.

    Returns:
        A ``(vectorizer, model)`` tuple, in that order.

    Raises:
        FileNotFoundError: If a file cannot be found at either location.
    """
    from pathlib import Path

    def _locate(path):
        # Resolve *path*, falling back to the project root for relative paths.
        try:
            candidate = Path(path)
        except TypeError:
            # Not a str/PathLike (e.g. bytes): let open() handle it as before.
            return path
        if candidate.is_absolute() or candidate.exists():
            return candidate
        fallback = Path(__file__).resolve().parent.parent / candidate
        # Keep the original path on a miss so the error names what was asked.
        return fallback if fallback.exists() else candidate

    def _unpickle(path):
        # Read one pickled object from *path*.
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this at model files from a trusted source.
        with open(_locate(path), "rb") as f:
            return pickle.load(f)

    vectorizer = _unpickle(vectorizer_path)
    model = _unpickle(model_path)
    return vectorizer, model
|
utils/predict.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .preprocessing import preprocess_text
|
| 2 |
+
|
| 3 |
+
def predict(text: str, vectorizer, model) -> str:
    """Classify one piece of text as "Spam" or "Not Spam".

    The text is cleaned with ``preprocess_text``, converted to features with
    ``vectorizer.transform``, and passed to ``model.predict``. A predicted
    label equal to ``1`` maps to "Spam"; any other label maps to "Not Spam".

    Args:
        text: Raw input text.
        vectorizer: Object exposing ``transform`` for a list of strings.
        model: Object exposing ``predict`` for the transformed features.

    Returns:
        The string "Spam" or "Not Spam".
    """
    cleaned = preprocess_text(text)
    # The vectorizer works on a batch, so wrap the single document in a list
    # and take the first (only) prediction back out.
    features = vectorizer.transform([cleaned])
    label = model.predict(features)[0]
    if label == 1:
        return "Spam"
    return "Not Spam"
|
utils/preprocessing.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import string
|
| 3 |
+
import nltk
|
| 4 |
+
from nltk.corpus import stopwords
|
| 5 |
+
from nltk.stem import WordNetLemmatizer
|
| 6 |
+
|
| 7 |
+
# Fetch the NLTK data used by this module. 'punkt_tab' is requested alongside
# 'punkt' because newer NLTK releases make word_tokenize load its tables from
# 'punkt_tab' and raise LookupError when only 'punkt' is installed.
# NOTE(review): on older NLTK releases whose index lacks 'punkt_tab', the
# download is expected to log an error and continue rather than raise -
# confirm against the pinned NLTK version.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# Built once at import time: a set for fast stop-word membership tests, and a
# single shared lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
|
| 13 |
+
|
| 14 |
+
def preprocess_text(text: str) -> str:
    """Normalize raw text into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase the text; replace every character that is not
    an ASCII letter, a digit, or whitespace with a space; tokenize with
    ``nltk.word_tokenize``; drop English stop words and punctuation tokens;
    lemmatize what remains.

    Args:
        text: Raw input text.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    lowered = text.lower()
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', ' ', lowered)

    kept = []
    for token in nltk.word_tokenize(cleaned):
        # Skip stop words and anything that is only punctuation.
        if token in stop_words or token in string.punctuation:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)
|