import streamlit as st
import pandas as pd
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# ----------------- STREAMLIT CONFIG -----------------
st.set_page_config(page_title="📧 Email Spam Detector", layout="centered")
st.title("📧 Email Spam Detector")
st.markdown("This app uses **Machine Learning** (Naive Bayes + TF-IDF) to classify emails as **Spam** or **Ham (Not Spam)**.")


# ----------------- DATA LOADING -----------------
@st.cache_data
def load_data(file):
    """Load a spam dataset and return it with ``label`` and ``message`` columns.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``. The
            CSV must contain a ``v1`` column (``ham``/``spam``) and a ``v2``
            column (the message text).

    Returns:
        DataFrame with an integer ``label`` column (0 = ham, 1 = spam) and a
        ``message`` column. Rows whose label is neither ``ham`` nor ``spam``,
        or whose message is missing, are dropped.

    Raises:
        KeyError: If the CSV has no ``v1`` or ``v2`` column.
    """
    df = (
        pd.read_csv(file, encoding="latin-1")[["v1", "v2"]]
        .rename(columns={"v1": "label", "v2": "message"})
    )
    # Unknown labels map to NaN; drop them (and empty messages) so the
    # classifier never sees NaN targets and clean_text never sees a float.
    df["label"] = df["label"].map({"ham": 0, "spam": 1})
    return df.dropna(subset=["label", "message"]).astype({"label": int})


# File uploader for user dataset
st.subheader("📂 Upload Dataset")
uploaded_file = st.file_uploader("Upload your spam dataset (CSV format)", type=["csv"])

if uploaded_file is not None:
    try:
        df = load_data(uploaded_file)
    except (KeyError, ValueError) as exc:
        # Missing v1/v2 columns or an unparsable CSV: report it in the UI
        # instead of showing a raw traceback.
        st.error(f"Could not read the uploaded dataset: {exc}")
        st.stop()
    st.success("✅ Dataset loaded successfully from uploaded file.")
else:
    st.info("ℹ️ No file uploaded. Using default dataset (spam.csv).")
    df = load_data("spam.csv")

if len(df) < 2:
    # train_test_split needs at least one row on each side of the split.
    st.error("The dataset must contain at least two rows labelled 'ham' or 'spam'.")
    st.stop()

# ----------------- PREPROCESS FUNCTION -----------------
# Built once; maps every ASCII punctuation character to None for str.translate.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Return *text* lower-cased, stripped, and with ASCII punctuation removed.

    Non-string values are converted with ``str()`` first.
    """
    return str(text).lower().strip().translate(_PUNCTUATION_TABLE)


df['message'] = df['message'].apply(clean_text)

# ----------------- TRAIN / TEST SPLIT -----------------
X_train, X_test, y_train, y_test = train_test_split(
    df['message'], df['label'], test_size=0.2, random_state=42
)

# ----------------- TF-IDF VECTORIZATION -----------------
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# ----------------- MODEL TRAINING -----------------
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# ----------------- METRICS -----------------
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
# zero_division=0 keeps the score at 0 without a warning when a class is
# never predicted / never present in the test split.
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

# ----------------- SIDEBAR METRICS -----------------
st.sidebar.header("🔍 Model Performance")
st.sidebar.write(f"**Accuracy:** {accuracy:.2%}")
st.sidebar.write(f"**Precision:** {precision:.2%}")
st.sidebar.write(f"**Recall:** {recall:.2%}")
st.sidebar.write(f"**F1 Score:** {f1:.2%}")
st.sidebar.markdown("Model: `Multinomial Naive Bayes` \nVectorizer: `TF-IDF`")

# Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 so it always matches the tick labels.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=["Ham", "Spam"],
    yticklabels=["Ham", "Spam"],
    ax=ax,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
st.sidebar.pyplot(fig)


# ----------------- PREDICT FUNCTION -----------------
def predict_message(msg):
    """Classify *msg* with the fitted vectorizer and model.

    Args:
        msg: Raw message text.

    Returns:
        Tuple ``(label, probability)`` where ``label`` is the display string
        for the predicted class and ``probability`` is the model's predicted
        probability for that class.
    """
    vect_msg = vectorizer.transform([clean_text(msg)])
    proba = model.predict_proba(vect_msg)[0]
    # Columns of predict_proba follow model.classes_, so look the label up
    # there rather than assuming the label value equals the column index.
    best = proba.argmax()
    pred = model.classes_[best]
    label = "🚫 Spam" if pred == 1 else "✅ Ham (Not Spam)"
    return label, proba[best]


# ----------------- USER INPUT -----------------
st.subheader("✉️ Test Your Message")
user_input = st.text_area("Enter your email message here:")

if st.button("Detect"):
    if not user_input.strip():
        st.warning("⚠️ Please enter a message to classify.")
    else:
        result, confidence = predict_message(user_input)
        st.success(f"Prediction: **{result}** \nConfidence: **{confidence:.2%}**")

# ----------------- SAMPLE DATA -----------------
with st.expander("📂 View Sample Dataset"):
    # DataFrame.sample raises if asked for more rows than exist, so cap the
    # sample size at the dataset length.
    st.dataframe(df.sample(min(10, len(df))))