import streamlit as st import joblib import nltk from nltk.corpus import stopwords from nltk.tokenize import word_tokenize import string import re # Download NLTK data nltk.download('punkt') nltk.download('stopwords') def preprocess_text(text): # Convert to lowercase text = text.lower() # Remove punctuation text = ''.join([char for char in text if char not in string.punctuation]) # Remove numbers text = re.sub(r'\d+', '', text) # Remove extra whitespace text = ' '.join(text.split()) # Tokenization tokens = word_tokenize(text) # Remove stopwords stop_words = set(stopwords.words('english')) tokens = [token for token in tokens if token not in stop_words] # Join tokens back into text return ' '.join(tokens) # Load the saved model and vectorizer model = joblib.load('spam_detector_model.joblib') vectorizer = joblib.load('tfidf_vectorizer.joblib') # Create the Streamlit interface st.title("📧 Spam Message Detector") st.write(""" This app detects whether a message is spam or not. Enter your message below and click 'Analyze' to check! """) # Create text input message = st.text_area("Enter your message:", height=100) if st.button("Analyze"): if message: # Preprocess the input processed_text = preprocess_text(message) # Vectorize the text text_vectorized = vectorizer.transform([processed_text]) # Make prediction prediction = model.predict(text_vectorized)[0] probability = model.predict_proba(text_vectorized)[0] # Display result st.markdown("### Analysis Result") if prediction == 1: st.error("🚨 This message is likely SPAM!") st.write(f"Confidence: {probability[1]:.2%}") else: st.success("✅ This message appears to be legitimate.") st.write(f"Confidence: {probability[0]:.2%}") # Show preprocessing details with st.expander("See preprocessing steps"): st.write("Original message:", message) st.write("Processed message:", processed_text) else: st.warning("Please enter a message to analyze.") # Add sidebar information with st.sidebar: st.header("About the Model") st.write(""" This spam detector uses an XGBoost classifier trained on a dataset of spam and legitimate messages. Model Performance: - Training Accuracy: 99.7% - Testing Accuracy: 98.9% """)