Spaces:
Sleeping
Sleeping
initial commit
Browse files. Files changed in this commit:
- app.py (+97 lines, new file)
- requirements.txt (+1 line, new file)
- spam.csv (binary, new file)
app.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import seaborn as sns
|
| 5 |
+
import string
|
| 6 |
+
import re
|
| 7 |
+
from wordcloud import WordCloud
|
| 8 |
+
from sklearn.model_selection import train_test_split
|
| 9 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 10 |
+
from sklearn.naive_bayes import MultinomialNB
|
| 11 |
+
from sklearn.metrics import accuracy_score, classification_report
|
| 12 |
+
|
| 13 |
+
# Load dataset
|
| 14 |
+
def load_data():
|
| 15 |
+
df = pd.read_csv("spam.csv", encoding="latin-1")
|
| 16 |
+
df = df[['v1', 'v2']]
|
| 17 |
+
df.columns = ['label', 'message']
|
| 18 |
+
return df
|
| 19 |
+
|
| 20 |
+
# Preprocess text
|
| 21 |
+
def preprocess_text(text):
|
| 22 |
+
text = text.lower()
|
| 23 |
+
text = re.sub(f"[{string.punctuation}]", "", text)
|
| 24 |
+
return text
|
| 25 |
+
|
| 26 |
+
# Train model
|
| 27 |
+
def train_model(X_train, y_train):
|
| 28 |
+
vectorizer = TfidfVectorizer()
|
| 29 |
+
X_train_tfidf = vectorizer.fit_transform(X_train)
|
| 30 |
+
model = MultinomialNB()
|
| 31 |
+
model.fit(X_train_tfidf, y_train)
|
| 32 |
+
return model, vectorizer
|
| 33 |
+
|
| 34 |
+
# Streamlit app navigation
|
| 35 |
+
st.sidebar.title("Navigation")
|
| 36 |
+
page = st.sidebar.radio("Go to:", ["Data Exploration", "Model Training & Evaluation", "Message Prediction"])
|
| 37 |
+
|
| 38 |
+
# Load data
|
| 39 |
+
df = load_data()
|
| 40 |
+
df['message_clean'] = df['message'].apply(preprocess_text)
|
| 41 |
+
|
| 42 |
+
if page == "Data Exploration":
|
| 43 |
+
st.title("π Data Exploration")
|
| 44 |
+
st.write("This page provides an overview of the dataset, including distributions and key insights.")
|
| 45 |
+
st.subheader("Dataset Overview")
|
| 46 |
+
st.write(df.head())
|
| 47 |
+
st.write("Total messages:", df.shape[0])
|
| 48 |
+
st.write(df['label'].value_counts())
|
| 49 |
+
|
| 50 |
+
# Visualization
|
| 51 |
+
st.subheader("Spam vs. Ham Distribution")
|
| 52 |
+
fig, ax = plt.subplots()
|
| 53 |
+
sns.countplot(x=df['label'], palette='coolwarm', ax=ax)
|
| 54 |
+
st.pyplot(fig)
|
| 55 |
+
|
| 56 |
+
# Word Cloud
|
| 57 |
+
st.subheader("Word Cloud for Spam Messages")
|
| 58 |
+
spam_words = " ".join(df[df['label'] == 'spam']['message_clean'])
|
| 59 |
+
wordcloud = WordCloud(width=500, height=300, background_color='black').generate(spam_words)
|
| 60 |
+
fig, ax = plt.subplots()
|
| 61 |
+
ax.imshow(wordcloud, interpolation='bilinear')
|
| 62 |
+
ax.axis("off")
|
| 63 |
+
st.pyplot(fig)
|
| 64 |
+
|
| 65 |
+
elif page == "Model Training & Evaluation":
|
| 66 |
+
st.title("π Model Training & Evaluation")
|
| 67 |
+
st.write("This page shows the model training process and performance evaluation.")
|
| 68 |
+
|
| 69 |
+
# Train/test split
|
| 70 |
+
X_train, X_test, y_train, y_test = train_test_split(df['message_clean'], df['label'], test_size=0.2, random_state=42)
|
| 71 |
+
model, vectorizer = train_model(X_train, y_train)
|
| 72 |
+
|
| 73 |
+
# Model evaluation
|
| 74 |
+
X_test_tfidf = vectorizer.transform(X_test)
|
| 75 |
+
y_pred = model.predict(X_test_tfidf)
|
| 76 |
+
accuracy = accuracy_score(y_test, y_pred)
|
| 77 |
+
|
| 78 |
+
st.subheader("Model Performance")
|
| 79 |
+
st.write("The model is evaluated using accuracy and a classification report.")
|
| 80 |
+
st.write(f"**Accuracy:** {accuracy:.2f}")
|
| 81 |
+
st.text("Classification Report:")
|
| 82 |
+
st.text(classification_report(y_test, y_pred))
|
| 83 |
+
|
| 84 |
+
st.write("**Explanation:** The accuracy score represents the proportion of correctly classified messages. The classification report provides precision, recall, and F1-score for spam and ham categories, helping us understand the model's performance in more detail.")
|
| 85 |
+
|
| 86 |
+
elif page == "Message Prediction":
|
| 87 |
+
st.title("β Message Prediction")
|
| 88 |
+
st.write("Test the model by entering an SMS message to classify it as spam or ham.")
|
| 89 |
+
|
| 90 |
+
# Prediction interface
|
| 91 |
+
user_input = st.text_area("Enter an SMS message:")
|
| 92 |
+
if st.button("Predict"):
|
| 93 |
+
user_input_tfidf = vectorizer.transform([user_input])
|
| 94 |
+
prediction = model.predict(user_input_tfidf)[0]
|
| 95 |
+
st.success(f"This message is classified as: **{prediction.upper()}**")
|
| 96 |
+
|
| 97 |
+
st.write("**Explanation:** The model analyzes the text and classifies it as spam or ham based on learned patterns. Spam messages typically contain promotional content, urgent requests, or suspicious links, while ham messages are normal communications.")
|
requirements.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
streamlit
pandas
matplotlib
seaborn
wordcloud
scikit-learn
|
spam.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|