3v324v23 committed on
Commit
136950c
·
1 Parent(s): 45bb8fa

final changes

Browse files
Files changed (5) hide show
  1. app.py +155 -0
  2. requirements.txt +7 -0
  3. spam.csv +0 -0
  4. svm_sms_spam.pkl +3 -0
  5. vectorizer.pkl +3 -0
app.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import joblib
3
+ import pandas as pd
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.svm import SVC
8
+ from sklearn.model_selection import train_test_split
9
+ from sklearn.metrics import accuracy_score, confusion_matrix
10
+ import numpy as np
11
+
12
# Set Streamlit page config (restored the envelope emoji that had been
# mis-encoded in the page icon and title).
st.set_page_config(page_title="SMS Spam Detector", page_icon="📩", layout="wide")

# Custom CSS for centering and styling.
# NOTE(review): the `img` rule below applies to every image on the page and
# forces a 600px height with a 150px max-width — confirm this is intended,
# since it presumably also affects rendered plots.
st.markdown("""
<style>
.centered-container {
    display: flex;
    justify-content: center;
    align-items: center;
    flex-direction: column;
    text-align: center;
    width: 80%;
}
.padded-container {
    padding: 20px;
}
.big-dataset {
    font-size: 12px;
    max-width: 100%;
    margin: auto;
}
.stDataFrame {
    display: flex;
    justify-content: center;
    align-items: center;
}
img {
    max-width: 150px;
    height: 600px;
}
</style>
""", unsafe_allow_html=True)

# Title
st.title("📩 SMS Spam Detector")
48
+
49
+ # Load dataset
50
@st.cache_data
def load_data():
    """Load the SMS spam dataset with numeric labels.

    Reads ``spam.csv`` from the directory that contains this script, keeps
    the ``v1`` and ``v2`` columns, renames them to ``label`` and ``message``,
    and maps the label values ``'ham'`` -> 0 and ``'spam'`` -> 1.

    Returns:
        pandas.DataFrame: Two columns, ``label`` and ``message``.
    """
    # Function-scope import so this block does not depend on file-level changes.
    from pathlib import Path

    # Resolve the CSV relative to the script instead of a developer-specific
    # absolute "D:/..." path, which does not exist on other machines.
    dataset_path = Path(__file__).resolve().parent / "spam.csv"
    df = pd.read_csv(dataset_path, encoding='latin-1')[['v1', 'v2']]
    df.columns = ['label', 'message']
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})
    return df


df = load_data()
59
+
60
+ # Train and save model
61
@st.cache_resource
def train_and_save_model():
    """Train the TF-IDF + linear SVM spam classifier and persist it.

    Splits the module-level ``df`` 80/20 (``random_state=42``), fits a
    ``TfidfVectorizer`` (English stop words, at most 5000 features) and a
    linear-kernel ``SVC`` on the training part, and scores the model on the
    held-out part. Both fitted objects are written with ``joblib`` to the
    directory that contains this script.

    Returns:
        tuple: ``(svm_model, vectorizer, accuracy)`` where ``accuracy`` is
        the accuracy score on the held-out split.
    """
    # Function-scope import so this block does not depend on file-level changes.
    from pathlib import Path

    X_train, X_test, y_train, y_test = train_test_split(
        df['message'], df['label'], test_size=0.2, random_state=42
    )
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    svm_model = SVC(kernel='linear')
    svm_model.fit(X_train_tfidf, y_train)

    y_pred = svm_model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)

    # Save next to the script instead of a developer-specific absolute
    # "D:/..." directory, which raises when that directory does not exist.
    output_dir = Path(__file__).resolve().parent
    joblib.dump(svm_model, output_dir / "svm_sms_spam.pkl")
    joblib.dump(vectorizer, output_dir / "vectorizer.pkl")

    return svm_model, vectorizer, accuracy


svm_model, vectorizer, accuracy = train_and_save_model()
80
+
81
# Create tabs (restored the emoji that had been mis-encoded in the labels)
tab1, tab2, tab3 = st.tabs(["📊 Data Overview", "📈 Data Visualization", "🔍 Spam Detector"])

# Tab 1: Data Overview
with tab1:
    st.subheader("Dataset Overview")
    # NOTE(review): each st.markdown call is presumably rendered as its own
    # element, so these opening/closing <div> wrappers likely do not enclose
    # the dataframe — confirm whether they have any visible effect.
    st.markdown('<div class="centered-container">', unsafe_allow_html=True)
    st.markdown('<div style="display: flex; justify-content: center;">', unsafe_allow_html=True)
    st.dataframe(df, height=300, width=1000)
    st.markdown('</div>', unsafe_allow_html=True)
    st.markdown('</div>', unsafe_allow_html=True)

    # Class distribution plot, deliberately small with reduced font sizes
    st.subheader("Class Distribution")
    fig, ax = plt.subplots(figsize=(2, 2))
    sns.countplot(
        x=df['label'].map({0: 'Not Spam', 1: 'Spam'}),
        palette='coolwarm',
        ax=ax,
        width=0.2,
    )
    ax.set_title("Distribution of Spam vs. Not Spam Messages", fontsize=8)
    ax.set_xlabel("Message Type", fontsize=5)
    ax.set_ylabel("Count", fontsize=5)
    ax.tick_params(axis='both', labelsize=5)
    st.pyplot(fig)
    # The script re-executes on every interaction; close the figure so
    # pyplot does not keep accumulating open figures across reruns.
    plt.close(fig)

    st.markdown(f"### 📊 Model Accuracy: **{accuracy * 100:.2f}%**")
109
+
110
# Tab 2: Data Visualization
with tab2:
    st.subheader("Data Visualizations")

    # Confusion Matrix
    st.markdown("### Confusion Matrix")
    # Re-create the held-out split with the same test_size/random_state that
    # train_and_save_model uses, so the matrix is computed on unseen data.
    X_train, X_test, y_train, y_test = train_test_split(
        df['message'], df['label'], test_size=0.2, random_state=42
    )
    X_test_tfidf = vectorizer.transform(X_test)
    y_pred = svm_model.predict(X_test_tfidf)

    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(5, 3))
    # Draw on the axes created above explicitly rather than relying on
    # pyplot's implicit "current axes".
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=['Not Spam', 'Spam'],
        yticklabels=['Not Spam', 'Spam'],
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")
    st.pyplot(fig)
    plt.close(fig)  # avoid accumulating open figures across reruns

    # Heatmap
    st.markdown("### Heatmap of Feature Correlations")
    # Build a separate frame instead of adding a column to the shared `df`.
    length_and_label = pd.DataFrame({
        'message_length': df['message'].apply(len),
        'label': df['label'],
    })
    correlation_matrix = length_and_label.corr()
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
    ax.set_title("Feature Correlation Heatmap")
    st.pyplot(fig)
    plt.close(fig)  # avoid accumulating open figures across reruns
138
+
139
# Tab 3: Spam Detector
with tab3:
    st.subheader("Check SMS Message")
    st.write("Enter an SMS message below to check if it's spam or not.")
    user_input = st.text_area("Enter SMS Message:")

    if st.button("Check Message"):
        # Treat whitespace-only input the same as empty input.
        if user_input.strip():
            input_features = vectorizer.transform([user_input])
            prediction = svm_model.predict(input_features)

            # Label 1 is spam, 0 is ham (see the mapping in load_data).
            if prediction[0] == 1:
                st.error("🚨 This message is Spam!")
            else:
                st.success("✅ This message is NOT Spam!")
        else:
            st.warning("Please enter a message before checking.")
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ joblib
3
+ pandas
4
+ matplotlib
5
+ seaborn
6
+ scikit-learn
7
+ numpy
spam.csv ADDED
The diff for this file is too large to render. See raw diff
 
svm_sms_spam.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e71a5173c59e56448ec3ffe20e45b1acef918074915f47ceaca4b0013f79ccaf
3
+ size 133483
vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2892dd114db5bb43346bd34b5d092cb9e83225d9f2b519513efae2d6443ec153
3
+ size 180007