File size: 3,823 Bytes
9740f31
82fda6d
1dc2d2c
 
 
82fda6d
 
 
1dc2d2c
82fda6d
1dc2d2c
82fda6d
 
1dc2d2c
82fda6d
1dc2d2c
82fda6d
91f527b
 
82fda6d
 
 
 
91f527b
 
 
 
 
 
 
 
 
 
82fda6d
1dc2d2c
 
 
 
 
 
 
 
 
82fda6d
1dc2d2c
 
82fda6d
1dc2d2c
82fda6d
 
 
 
1dc2d2c
82fda6d
 
 
1dc2d2c
 
 
 
 
 
82fda6d
1dc2d2c
 
82fda6d
1dc2d2c
 
 
82fda6d
 
1dc2d2c
 
 
 
 
 
 
 
 
82fda6d
1dc2d2c
 
82fda6d
1dc2d2c
 
82fda6d
1dc2d2c
82fda6d
 
 
 
 
1dc2d2c
82fda6d
1dc2d2c
 
9740f31
1dc2d2c
82fda6d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import streamlit as st
import pandas as pd
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# ----------------- STREAMLIT CONFIG -----------------
# The title strings carry the real envelope emoji; a mis-decoded byte
# sequence would be shown verbatim in the browser tab and page heading.
st.set_page_config(page_title="📧 Email Spam Detector", layout="centered")
st.title("📧 Email Spam Detector")
st.markdown("This app uses **Machine Learning** (Naive Bayes + TF-IDF) to classify emails as **Spam** or **Ham (Not Spam)**.")

# ----------------- DATA LOADING -----------------
@st.cache_data
def load_data(file):
    """Load a spam dataset and return it as a two-column frame.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``. The
            CSV must contain a ``v1`` column (``ham``/``spam`` labels) and a
            ``v2`` column (message text); any other columns are ignored.

    Returns:
        DataFrame with an integer ``label`` column (0 = ham, 1 = spam) and a
        ``message`` column. Rows whose label is neither ham nor spam, or
        whose message is missing, are dropped.

    Raises:
        KeyError: If the CSV lacks the ``v1`` or ``v2`` column.
    """
    raw = pd.read_csv(file, encoding='latin-1')

    missing = [col for col in ('v1', 'v2') if col not in raw.columns]
    if missing:
        raise KeyError(f"Dataset is missing required column(s): {missing}")

    df = raw[['v1', 'v2']].copy()
    df.columns = ['label', 'message']

    # Tolerate stray whitespace and capitalisation in the label column.
    normalized = df['label'].astype(str).str.strip().str.lower()
    df['label'] = normalized.map({'ham': 0, 'spam': 1})

    # Unrecognised labels map to NaN and empty CSV cells are read as NaN;
    # either would break text cleaning or model fitting downstream.
    df = df.dropna(subset=['label', 'message']).reset_index(drop=True)
    df['label'] = df['label'].astype('int64')
    return df

# File uploader for user dataset
st.subheader("📂 Upload Dataset")
uploaded_file = st.file_uploader("Upload your spam dataset (CSV format)", type=["csv"])

# Use the uploaded file when there is one, otherwise fall back to spam.csv.
try:
    if uploaded_file is not None:
        df = load_data(uploaded_file)
        st.success("✅ Dataset loaded successfully from uploaded file.")
    else:
        st.info("ℹ️ No file uploaded. Using default dataset (spam.csv).")
        df = load_data("spam.csv")
except (OSError, KeyError, ValueError) as exc:
    # OSError: spam.csv missing/unreadable; KeyError: required columns absent;
    # ValueError: pandas could not parse the file. Show a readable message and
    # halt this run instead of surfacing a raw traceback further down.
    st.error(f"Could not load dataset: {exc}")
    st.stop()

# ----------------- PREPROCESS FUNCTION -----------------
def clean_text(text):
    """Normalize a message for vectorization.

    Lower-cases the text, trims surrounding whitespace, then removes every
    character listed in ``string.punctuation``.

    Args:
        text: The raw message. ``None`` and NaN (how pandas represents an
            empty CSV cell) are treated as an empty message; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The normalized string.
    """
    # NaN is the only float that does not compare equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    text = text.lower().strip()
    return text.translate(str.maketrans("", "", string.punctuation))

# Normalize the corpus with the same function used on prediction-time input.
df['message'] = df['message'].map(clean_text)

# ----------------- TRAIN / TEST SPLIT -----------------
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from one run to the next.
messages, labels = df['message'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.2,
    random_state=42,
)

# ----------------- TF-IDF VECTORIZATION -----------------
# The vectorizer is fitted on the training split only; the test split is
# transformed with that fitted state.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# ----------------- MODEL TRAINING -----------------
# fit() returns the estimator itself, so construction and training chain.
model = MultinomialNB().fit(X_train_tfidf, y_train)

# ----------------- METRICS -----------------
# Score the held-out split; each scorer takes (true labels, predictions).
y_pred = model.predict(X_test_tfidf)
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# ----------------- SIDEBAR METRICS -----------------
# Header uses the real magnifier emoji rather than its mis-decoded bytes.
st.sidebar.header("🔍 Model Performance")
# Each score is a fraction in [0, 1]; the `.2%` format renders it as a percentage.
st.sidebar.write(f"**Accuracy:** {accuracy:.2%}")
st.sidebar.write(f"**Precision:** {precision:.2%}")
st.sidebar.write(f"**Recall:** {recall:.2%}")
st.sidebar.write(f"**F1 Score:** {f1:.2%}")
st.sidebar.markdown("Model: `Multinomial Naive Bayes`  \nVectorizer: `TF-IDF`")

# Confusion Matrix
# Pin the label order so the matrix is always 2x2 and lines up with the
# Ham/Spam tick labels, even if one class is absent from the test split.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
fig, ax = plt.subplots()
# Draw on this figure's axes explicitly instead of relying on pyplot's
# implicit "current axes" state.
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=["Ham", "Spam"],
    yticklabels=["Ham", "Spam"],
    ax=ax,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
st.sidebar.pyplot(fig)
# Streamlit re-executes the script on every interaction; close the figure so
# pyplot does not keep an ever-growing list of open figures in memory.
plt.close(fig)

# ----------------- PREDICT FUNCTION -----------------
def predict_message(msg):
    """Classify a single message with the trained model.

    Args:
        msg: Raw message text; it is normalized with ``clean_text`` before
            being vectorized.

    Returns:
        Tuple ``(label_text, probability)``: the display string for the
        predicted class and the model's predicted probability for that class.
    """
    msg_clean = clean_text(msg)
    vect_msg = vectorizer.transform([msg_clean])
    pred = model.predict(vect_msg)[0]
    # predict_proba columns are ordered like model.classes_, so look up the
    # column for the predicted label instead of using the label value itself
    # as an index (which fails when classes_ is not exactly [0, 1]).
    class_index = list(model.classes_).index(pred)
    prob = model.predict_proba(vect_msg)[0][class_index]
    return ("🚫 Spam", prob) if pred == 1 else ("✅ Ham (Not Spam)", prob)

# ----------------- USER INPUT -----------------
st.subheader("✉️ Test Your Message")
user_input = st.text_area("Enter your email message here:")

if st.button("Detect"):
    # Whitespace-only input counts as empty.
    if not user_input.strip():
        st.warning("⚠️ Please enter a message to classify.")
    else:
        result, confidence = predict_message(user_input)
        st.success(f"Prediction: **{result}**  \nConfidence: **{confidence:.2%}**")

# ----------------- SAMPLE DATA -----------------
with st.expander("📂 View Sample Dataset"):
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (without replacement), so cap the sample at the dataset size.
    st.dataframe(df.sample(min(10, len(df))))