mrciomnl committed on
Commit
e1b4e83
·
1 Parent(s): 97edeb7

initial commit

Browse files
Files changed (3) hide show
  1. app.py +97 -0
  2. requirements.txt +1 -0
  3. spam.csv +0 -0
app.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import seaborn as sns
5
+ import string
6
+ import re
7
+ from wordcloud import WordCloud
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.feature_extraction.text import TfidfVectorizer
10
+ from sklearn.naive_bayes import MultinomialNB
11
+ from sklearn.metrics import accuracy_score, classification_report
12
+
13
+ # Load dataset
14
def load_data():
    """Read ``spam.csv`` and return its label and message columns.

    The file is opened relative to the current working directory and
    decoded as latin-1. Only the ``v1`` and ``v2`` columns are kept; they
    are exposed under the names ``label`` and ``message`` respectively.

    Returns:
        A DataFrame with exactly two columns, ``label`` and ``message``.
    """
    raw = pd.read_csv("spam.csv", encoding="latin-1")
    return raw[["v1", "v2"]].rename(columns={"v1": "label", "v2": "message"})
19
+
20
+ # Preprocess text
21
# Translation table that deletes every character in string.punctuation.
# Built once so the per-message call is a single C-level pass.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text(text):
    """Lower-case *text* and strip all ASCII punctuation from it.

    The previous implementation interpolated ``string.punctuation`` into a
    regex character class without escaping it. Inside the class the
    sequence ``\\]`` was parsed as an escaped ``]``, so the backslash itself
    was never part of the set and survived preprocessing. Using
    ``str.translate`` removes every punctuation character without any
    regex-escaping pitfalls.

    Args:
        text: The message to normalise (a ``str``).

    Returns:
        The lower-cased message with every ``string.punctuation`` character
        removed. Whitespace and non-ASCII characters are left untouched.
    """
    return text.lower().translate(_PUNCTUATION_TABLE)
25
+
26
+ # Train model
27
def train_model(X_train, y_train):
    """Fit a TF-IDF vectorizer and a multinomial Naive Bayes classifier.

    Args:
        X_train: Training documents, handed to
            ``TfidfVectorizer.fit_transform``.
        y_train: Target labels aligned with ``X_train``.

    Returns:
        A ``(model, vectorizer)`` tuple: the fitted ``MultinomialNB`` and
        the fitted ``TfidfVectorizer`` needed to transform new text before
        calling ``model.predict``.
    """
    tfidf = TfidfVectorizer()
    features = tfidf.fit_transform(X_train)
    # ``fit`` returns the estimator itself, so construction and fitting chain.
    classifier = MultinomialNB().fit(features, y_train)
    return classifier, tfidf
33
+
34
+ # Streamlit app navigation
35
# Streamlit re-executes this script from top to bottom on every interaction,
# so everything a page needs must be (re)built on the run that renders it.
st.sidebar.title("Navigation")
page = st.sidebar.radio(
    "Go to:",
    ["Data Exploration", "Model Training & Evaluation", "Message Prediction"],
)

# Load data and derive the cleaned text column used by every page.
df = load_data()
df['message_clean'] = df['message'].apply(preprocess_text)

if page == "Data Exploration":
    st.title("📊 Data Exploration")
    st.write("This page provides an overview of the dataset, including distributions and key insights.")
    st.subheader("Dataset Overview")
    st.write(df.head())
    st.write("Total messages:", df.shape[0])
    st.write(df['label'].value_counts())

    # Visualization: class balance.
    st.subheader("Spam vs. Ham Distribution")
    fig, ax = plt.subplots()
    sns.countplot(x=df['label'], palette='coolwarm', ax=ax)
    st.pyplot(fig)
    # Close the figure so pyplot does not keep one alive per rerun.
    plt.close(fig)

    # Word cloud built from the cleaned text of all spam messages.
    st.subheader("Word Cloud for Spam Messages")
    spam_words = " ".join(df[df['label'] == 'spam']['message_clean'])
    wordcloud = WordCloud(width=500, height=300, background_color='black').generate(spam_words)
    fig, ax = plt.subplots()
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    st.pyplot(fig)
    plt.close(fig)

else:
    # Both remaining pages need a fitted model. Previously the model was
    # only trained inside the evaluation branch, so the prediction page
    # raised NameError on ``vectorizer``/``model``. The fixed random_state
    # keeps the split (and therefore the model) identical across pages.
    X_train, X_test, y_train, y_test = train_test_split(
        df['message_clean'], df['label'], test_size=0.2, random_state=42
    )
    model, vectorizer = train_model(X_train, y_train)

    if page == "Model Training & Evaluation":
        st.title("📈 Model Training & Evaluation")
        st.write("This page shows the model training process and performance evaluation.")

        # Model evaluation on the held-out 20 %.
        X_test_tfidf = vectorizer.transform(X_test)
        y_pred = model.predict(X_test_tfidf)
        accuracy = accuracy_score(y_test, y_pred)

        st.subheader("Model Performance")
        st.write("The model is evaluated using accuracy and a classification report.")
        st.write(f"**Accuracy:** {accuracy:.2f}")
        st.text("Classification Report:")
        st.text(classification_report(y_test, y_pred))

        st.write("**Explanation:** The accuracy score represents the proportion of correctly classified messages. The classification report provides precision, recall, and F1-score for spam and ham categories, helping us understand the model's performance in more detail.")

    elif page == "Message Prediction":
        st.title("✉ Message Prediction")
        st.write("Test the model by entering an SMS message to classify it as spam or ham.")

        # Prediction interface
        user_input = st.text_area("Enter an SMS message:")
        if st.button("Predict"):
            if not user_input.strip():
                # Nothing to classify; avoid reporting a label for empty text.
                st.warning("Please enter a message to classify.")
            else:
                # Apply the same cleaning the training data received so the
                # vectorizer sees tokens in the form it was fitted on.
                cleaned_input = preprocess_text(user_input)
                user_input_tfidf = vectorizer.transform([cleaned_input])
                prediction = model.predict(user_input_tfidf)[0]
                st.success(f"This message is classified as: **{prediction.upper()}**")

        st.write("**Explanation:** The model analyzes the text and classifies it as spam or ham based on learned patterns. Spam messages typically contain promotional content, urgent requests, or suspicious links, while ham messages are normal communications.")
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ streamlit
spam.csv ADDED
The diff for this file is too large to render. See raw diff