# spam_detection / app.py
# Author: mrciomnl — "major changes made" (commit 6ae6f57)
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
def load_data():
    """Read spam.csv (latin-1 encoded) and return a DataFrame with
    two columns: 'label' (ham/spam) and 'message' (raw SMS text)."""
    raw = pd.read_csv("spam.csv", encoding="latin-1")
    # Keep only the two meaningful columns and give them readable names.
    return raw[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})
# Preprocess text
def preprocess_text(text):
    """Lowercase *text* and remove all ASCII punctuation.

    Uses str.translate with a deletion table instead of the previous
    re.sub(f"[{string.punctuation}]", ...): string.punctuation contains
    regex metacharacters (']', '\\', '^', '-') that were unescaped inside
    the character class and only parsed correctly by coincidence of
    character ordering. translate is robust and does one C-level pass.
    """
    return text.lower().translate(str.maketrans("", "", string.punctuation))
def train_model(X_train, y_train):
    """Fit a TF-IDF + Multinomial Naive Bayes spam classifier.

    Returns a (model, vectorizer) pair; the vectorizer must be reused
    to transform any text given to the model later.
    """
    # Vectorize first: the classifier trains on TF-IDF weights, not raw text.
    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(X_train)
    # MultinomialNB.fit returns self, so this binds the fitted classifier.
    classifier = MultinomialNB().fit(features, y_train)
    return classifier, vectorizer
# Streamlit app navigation: the sidebar selection drives which branch renders.
st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to:", ["Data Exploration", "Model Training & Evaluation", "Message Prediction"])

# Load the dataset once per rerun and attach the normalized text column
# that both training and prediction rely on.
df = load_data()
df['message_clean'] = df['message'].apply(preprocess_text)

if page == "Data Exploration":
    # --- Dataset overview and visualizations ---
    st.title("📊 Data Exploration")
    st.write("This page provides an overview of the dataset, including distributions and key insights.")
    st.subheader("Dataset Overview")
    st.write(df.head())
    st.write("Total messages:", df.shape[0])
    st.write(df['label'].value_counts())

    # Class balance: counts of spam vs. ham messages.
    st.subheader("Spam vs. Ham Distribution")
    dist_fig, dist_ax = plt.subplots()
    sns.countplot(x=df['label'], palette='coolwarm', ax=dist_ax)
    st.pyplot(dist_fig)

    # Most frequent terms across all spam messages, as a word cloud.
    st.subheader("Word Cloud for Spam Messages")
    spam_words = " ".join(df[df['label'] == 'spam']['message_clean'])
    wordcloud = WordCloud(width=500, height=300, background_color='black').generate(spam_words)
    cloud_fig, cloud_ax = plt.subplots()
    cloud_ax.imshow(wordcloud, interpolation='bilinear')
    cloud_ax.axis("off")
    st.pyplot(cloud_fig)

elif page == "Model Training & Evaluation":
    # --- Fit the classifier and report held-out performance ---
    st.title("📈 Model Training & Evaluation")
    st.write("This page shows the model training process and performance evaluation.")

    # 80/20 split on the *cleaned* text; fixed seed keeps reruns reproducible.
    X_train, X_test, y_train, y_test = train_test_split(df['message_clean'], df['label'], test_size=0.2, random_state=42)
    model, vectorizer = train_model(X_train, y_train)

    # Persist the fitted artifacts so the prediction page can reuse them.
    st.session_state['model'] = model
    st.session_state['vectorizer'] = vectorizer

    # Evaluate on the held-out test set.
    X_test_tfidf = vectorizer.transform(X_test)
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)

    st.subheader("Model Performance")
    st.write("The model is evaluated using accuracy and a classification report.")
    st.write(f"**Accuracy:** {accuracy:.2f}")
    st.text("Classification Report:")
    st.text(classification_report(y_test, y_pred))
    st.write("**Explanation:** The accuracy score represents the proportion of correctly classified messages. The classification report provides precision, recall, and F1-score for spam and ham categories, helping us understand the model's performance in more detail.")

elif page == "Message Prediction":
    # --- Interactive classification of a user-supplied message ---
    st.title("✉ Message Prediction")
    st.write("Test the model by entering an SMS message to classify it as spam or ham.")

    # The model only exists after the training page has run in this session.
    if 'model' in st.session_state and 'vectorizer' in st.session_state:
        model = st.session_state['model']
        vectorizer = st.session_state['vectorizer']

        user_input = st.text_area("Enter an SMS message:")
        if st.button("Predict"):
            # BUG FIX: the model was trained on preprocess_text()-cleaned
            # messages, but raw input was previously vectorized as-is
            # (train/serve skew). Apply the same preprocessing here.
            user_input_tfidf = vectorizer.transform([preprocess_text(user_input)])
            prediction = model.predict(user_input_tfidf)[0]
            st.success(f"This message is classified as: **{prediction.upper()}**")
            st.write("**Explanation:** The model analyzes the text and classifies it as spam or ham based on learned patterns. Spam messages typically contain promotional content, urgent requests, or suspicious links, while ham messages are normal communications.")
    else:
        st.warning("⚠️ Please train the model first in the 'Model Training & Evaluation' page.")