# Spam-Detector-App-Code / src/streamlit_app.py
# Author: meesamraza
# Commit: "Update src/streamlit_app.py" (91f527b, verified)
import streamlit as st
import pandas as pd
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# ----------------- STREAMLIT CONFIG -----------------
# Page metadata is configured before any other element is rendered.
# The emoji literals were mis-encoded (mojibake); they are restored to the
# intended "e-mail" glyph so the browser tab and heading render correctly.
st.set_page_config(page_title="📧 Email Spam Detector", layout="centered")
st.title("📧 Email Spam Detector")
st.markdown("This app uses **Machine Learning** (Naive Bayes + TF-IDF) to classify emails as **Spam** or **Ham (Not Spam)**.")
# ----------------- DATA LOADING -----------------
@st.cache_data
def load_data(file):
    """Load a spam dataset from CSV and return it in normalised form.

    Args:
        file: Path or file-like object of a CSV file (read as latin-1) that
            contains a ``v1`` column (label: ``ham`` / ``spam``) and a ``v2``
            column (message text). Any other columns are ignored.

    Returns:
        pandas.DataFrame with two columns: ``label`` (int, 0 = ham,
        1 = spam) and ``message`` (str). Rows whose label is not
        ham/spam or whose message is missing are dropped.

    Raises:
        KeyError: if the CSV has no ``v1`` or ``v2`` column.
    """
    df = pd.read_csv(file, encoding='latin-1')[['v1', 'v2']]
    df.columns = ['label', 'message']

    # Tolerate case/whitespace variations ("Ham", " spam "); anything that
    # is not ham/spam maps to NaN and is removed below.
    normalised_labels = df['label'].astype(str).str.strip().str.lower()
    df['label'] = normalised_labels.map({'ham': 0, 'spam': 1})

    # NaN labels would break model fitting and NaN messages would break
    # str-based text cleaning, so discard those rows here.
    df = df.dropna(subset=['label', 'message']).reset_index(drop=True)

    # map() yields floats as soon as one NaN appears; restore integer labels.
    df['label'] = df['label'].astype(int)
    df['message'] = df['message'].astype(str)
    return df
# File uploader for user dataset
st.subheader("📂 Upload Dataset")
uploaded_file = st.file_uploader("Upload your spam dataset (CSV format)", type=["csv"])

# Loading is wrapped so that a malformed upload or a missing default file
# produces a readable message instead of an unhandled traceback.
try:
    if uploaded_file is not None:
        df = load_data(uploaded_file)
        st.success("✅ Dataset loaded successfully from uploaded file.")
    else:
        st.info("ℹ️ No file uploaded. Using default dataset (spam.csv).")
        df = load_data("spam.csv")
except FileNotFoundError:
    st.error("Default dataset `spam.csv` was not found. Please upload a CSV file.")
    st.stop()
except (KeyError, ValueError):
    # KeyError: the 'v1'/'v2' columns are missing.
    # ValueError: covers pandas' ParserError / EmptyDataError subclasses.
    st.error("Could not read the dataset. The CSV must contain the columns `v1` (ham/spam label) and `v2` (message text).")
    st.stop()

# Nothing can be trained on an empty table; stop before the split fails.
if df.empty:
    st.error("The dataset contains no usable rows.")
    st.stop()
# ----------------- PREPROCESS FUNCTION -----------------
def clean_text(text):
    """Return ``text`` lower-cased, trimmed, and stripped of ASCII punctuation.

    Surrounding whitespace is trimmed *before* punctuation is removed, so
    whitespace that sat next to leading/trailing punctuation is kept.
    """
    # Translation table mapping every character of string.punctuation to None.
    drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
    lowered_and_trimmed = text.lower().strip()
    return lowered_and_trimmed.translate(drop_punctuation)
df['message'] = df['message'].apply(clean_text)


@st.cache_resource(show_spinner="Training model...")
def _fit_spam_classifier(messages, labels):
    """Split the data, fit the TF-IDF vectorizer and train the classifier.

    Streamlit re-executes the whole script on every widget interaction;
    caching this function means the model is trained once per distinct
    dataset instead of on every rerun (e.g. every click on "Detect").

    Args:
        messages: Cleaned message texts (pandas Series of str).
        labels: Matching class labels (0 = ham, 1 = spam).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, vectorizer,
        X_train_tfidf, X_test_tfidf, model)``. The objects are shared
        between reruns and must be treated as read-only.
    """
    # ----------------- TRAIN / TEST SPLIT -----------------
    X_train, X_test, y_train, y_test = train_test_split(
        messages, labels, test_size=0.2, random_state=42
    )
    # ----------------- TF-IDF VECTORIZATION -----------------
    # The vocabulary is learned from the training split only; the test
    # split is merely transformed with it.
    vectorizer = TfidfVectorizer(stop_words='english')
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    # ----------------- MODEL TRAINING -----------------
    model = MultinomialNB()
    model.fit(X_train_tfidf, y_train)
    return (
        X_train, X_test, y_train, y_test,
        vectorizer, X_train_tfidf, X_test_tfidf, model,
    )


# Keep every module-level name the rest of the script relies on.
(
    X_train, X_test, y_train, y_test,
    vectorizer, X_train_tfidf, X_test_tfidf, model,
) = _fit_spam_classifier(df['message'], df['label'])
# ----------------- METRICS -----------------
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
# zero_division=0 keeps the score at 0 without emitting an
# UndefinedMetricWarning when the model predicts no spam at all (or the
# test split contains none), which can happen with small uploaded datasets.
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
# ----------------- SIDEBAR METRICS -----------------
st.sidebar.header("🔍 Model Performance")
st.sidebar.write(f"**Accuracy:** {accuracy:.2%}")
st.sidebar.write(f"**Precision:** {precision:.2%}")
st.sidebar.write(f"**Recall:** {recall:.2%}")
st.sidebar.write(f"**F1 Score:** {f1:.2%}")
# Two spaces before the newline are required for a Markdown hard line break.
st.sidebar.markdown("Model: `Multinomial Naive Bayes`  \nVectorizer: `TF-IDF`")
# Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 (ham, spam) even if one class is
# absent from the test split, so it always matches the tick labels below.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
fig, ax = plt.subplots()
# Draw on the explicit Axes instead of relying on pyplot's implicit
# "current figure" state.
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=["Ham", "Spam"],
    yticklabels=["Ham", "Spam"],
    ax=ax,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
st.sidebar.pyplot(fig)
# The script runs again on every interaction; close the figure so that
# pyplot does not accumulate one open figure per rerun.
plt.close(fig)
# ----------------- PREDICT FUNCTION -----------------
def predict_message(msg):
    """Classify a single message with the trained model.

    Args:
        msg: Raw message text entered by the user.

    Returns:
        Tuple ``(label_text, confidence)`` where ``label_text`` is the
        display string for spam or ham and ``confidence`` is the predicted
        probability (float in [0, 1]) of the chosen class.
    """
    msg_clean = clean_text(msg)
    vect_msg = vectorizer.transform([msg_clean])
    # One inference pass: take the most probable column and translate it to
    # a class label via model.classes_. Indexing the probability row with
    # the label itself only works when classes_ happens to be [0, 1] and
    # fails, for example, when the training data held a single class.
    probabilities = model.predict_proba(vect_msg)[0]
    best_column = probabilities.argmax()
    pred = model.classes_[best_column]
    prob = float(probabilities[best_column])
    label_text = "🚫 Spam" if pred == 1 else "✅ Ham (Not Spam)"
    return label_text, prob
# ----------------- USER INPUT -----------------
st.subheader("✉️ Test Your Message")
user_input = st.text_area("Enter your email message here:")
if st.button("Detect"):
    # Whitespace-only input carries nothing to classify.
    if not user_input.strip():
        st.warning("⚠️ Please enter a message to classify.")
    else:
        result, confidence = predict_message(user_input)
        # Two spaces before the newline are required for a Markdown hard
        # line break; otherwise both parts render on one line.
        st.success(f"Prediction: **{result}**  \nConfidence: **{confidence:.2%}**")
# ----------------- SAMPLE DATA -----------------
with st.expander("📂 View Sample Dataset"):
    # DataFrame.sample raises ValueError when asked for more rows than the
    # frame holds, so cap the sample size for small datasets.
    st.dataframe(df.sample(min(10, len(df))))