"""Streamlit app: train a Naive Bayes spam classifier and classify emails.

The app trains on a bundled CSV by default and can be retrained on a
user-uploaded CSV that has 'text' and 'label' columns.
"""

import pandas as pd
import streamlit as st
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Replace with your dataset path or URL.
DATASET_PATH = "spam.csv"
# Fraction of rows held out for evaluation, and the seed that makes the
# split reproducible between reruns.
TEST_SIZE = 0.2
RANDOM_STATE = 42


def _clean_dataset(frame):
    """Return a new frame with only usable 'text' and 'label' rows.

    Rows with a missing text or label are dropped, and text is coerced to
    ``str`` because ``CountVectorizer`` cannot tokenize non-string values.
    """
    usable = frame[["text", "label"]].dropna()
    return usable.assign(text=usable["text"].astype(str))


@st.cache_data
def load_data():
    """Load the default dataset and normalise it to 'text'/'label' columns.

    Returns:
        DataFrame with a 'text' column and a 'label' column whose values are
        'spam' or 'legit'. Rows whose original label is neither 'spam' nor
        'ham' map to NaN and are dropped.
    """
    raw = pd.read_csv(DATASET_PATH, encoding="latin-1")
    raw = raw.rename(columns={"v1": "label", "v2": "text"})
    # Standardize labels; anything unexpected becomes NaN and is removed by
    # _clean_dataset rather than reaching the classifier.
    raw["label"] = raw["label"].map({"spam": "spam", "ham": "legit"})
    return _clean_dataset(raw)


@st.cache_resource
def train_model(data):
    """Fit the spam pipeline on *data* and score it on a held-out split.

    Cached so that Streamlit reruns (triggered by every widget interaction)
    reuse the fitted pipeline instead of retraining on identical data.

    Args:
        data: DataFrame with 'text' and 'label' columns.

    Returns:
        Tuple ``(pipeline, accuracy)`` where *accuracy* is the mean accuracy
        on the held-out test split.

    Raises:
        ValueError: Propagated from scikit-learn, e.g. when *data* has too
            few rows to split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data["text"],
        data["label"],
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
    )
    pipeline = Pipeline(
        [
            ("vectorizer", CountVectorizer()),
            ("tfidf", TfidfTransformer()),
            ("classifier", MultinomialNB()),
        ]
    )
    pipeline.fit(X_train, y_train)
    return pipeline, pipeline.score(X_test, y_test)


# Load data and train the default model.
df = load_data()
model, accuracy = train_model(df)

# Streamlit UI
st.title("Spam Filter Email Classifier")
st.write("This app classifies emails as **spam** or **legit** based on trained data.")

# File uploader for a custom dataset
uploaded_file = st.file_uploader(
    "Upload your own spam dataset (CSV format)", type=["csv"]
)
if uploaded_file is not None:
    try:
        # Defensive rewind: the script re-executes on every interaction, so
        # the buffer may already have been consumed by an earlier run.
        uploaded_file.seek(0)
        uploaded_df = pd.read_csv(uploaded_file)
    except (
        pd.errors.ParserError,
        pd.errors.EmptyDataError,
        UnicodeDecodeError,
    ) as exc:
        st.error(f"Could not read the uploaded CSV: {exc}")
    else:
        if "text" in uploaded_df.columns and "label" in uploaded_df.columns:
            try:
                # A failed fit leaves the default model in place because the
                # tuple assignment never happens.
                model, accuracy = train_model(_clean_dataset(uploaded_df))
            except ValueError as exc:
                st.error(f"Could not train on the uploaded dataset: {exc}")
            else:
                st.success("Custom dataset loaded and model retrained!")
        else:
            st.error("CSV file must contain 'text' and 'label' columns.")

st.caption(f"Held-out accuracy of the current model: {accuracy:.1%}")

# Text input for email classification
email_input = st.text_area("Enter email content:")
if st.button("Classify Email"):
    # Whitespace-only input carries no content to classify.
    if email_input.strip():
        prediction = model.predict([email_input])[0]
        st.subheader(f"The email is classified as: **{prediction}**")
    else:
        st.write("Please enter an email to classify.")