Shiva7706 committed on
Commit
2245b2a
·
verified ·
1 Parent(s): fcb19ad

Upload 6 files

Browse files
README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Spam-message-detection
2
+ Building a model to detect spam messages, specifically for India, most likely using neural networks (NNs) and other machine-learning (ML) techniques.
main.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
spam_app.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import joblib
3
+ import nltk
4
+ from nltk.corpus import stopwords
5
+ from nltk.tokenize import word_tokenize
6
+ import string
7
+ import re
8
+
9
# Make sure the NLTK resources used below are available.
#
# * 'punkt' / 'punkt_tab': tokenizer data for word_tokenize. Newer NLTK
#   releases look up 'punkt_tab' instead of the pickled 'punkt' models, so
#   both are requested to work across versions.
# * 'stopwords': the English stopword list.
#
# Streamlit re-executes this script on every interaction, so each resource
# is only downloaded when it is not already present locally.
for _resource_path, _package in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package)
12
+
13
# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Matches runs of digits so they can be removed from the text.
_DIGITS_RE = re.compile(r'\d+')

# Lazily-filled cache for the English stopword set (see _english_stop_words).
_stop_words_cache = None


def _english_stop_words():
    """Return the NLTK English stopword set, reading the corpus only once."""
    global _stop_words_cache
    if _stop_words_cache is None:
        _stop_words_cache = frozenset(stopwords.words('english'))
    return _stop_words_cache


def preprocess_text(text):
    """Normalize a raw message into a space-separated string of tokens.

    Steps, in order: lowercase, delete ASCII punctuation, delete digit runs,
    collapse whitespace, tokenize with NLTK's ``word_tokenize``, and drop
    English stopwords.

    NOTE(review): presumably this mirrors the preprocessing applied when the
    vectorizer was fitted -- confirm against the training notebook before
    changing any step.

    Args:
        text: The raw message string.

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing is left after filtering.
    """
    # Lowercase first, then strip punctuation in a single C-level pass.
    text = text.lower().translate(_PUNCTUATION_TABLE)

    # Remove numbers.
    text = _DIGITS_RE.sub('', text)

    # Collapse any run of whitespace into a single space.
    text = ' '.join(text.split())

    tokens = word_tokenize(text)

    # The stopword set is loaded once and reused across calls.
    stop_words = _english_stop_words()
    return ' '.join(token for token in tokens if token not in stop_words)
35
+
36
@st.cache_resource
def _load_artifacts():
    """Load the saved classifier and TF-IDF vectorizer from disk.

    Streamlit re-runs the whole script on every widget interaction;
    ``st.cache_resource`` keeps the loaded objects in memory so the files
    are only deserialized once per server process.

    Returns:
        A ``(model, vectorizer)`` tuple.
    """
    # NOTE: joblib files are pickle-based; only load artifacts you trust.
    return (
        joblib.load('spam_detector_model.joblib'),
        joblib.load('tfidf_vectorizer.joblib'),
    )


# Load the saved model and vectorizer
model, vectorizer = _load_artifacts()
39
+
40
# Create the Streamlit interface
st.title("📧 Spam Message Detector")

st.write("""
This app detects whether a message is spam or not.
Enter your message below and click 'Analyze' to check!
""")

# Free-text input; nothing is analyzed until the "Analyze" button is clicked.
message = st.text_area("Enter your message:", height=100)

if st.button("Analyze"):
    # Treat whitespace-only input the same as no input at all.
    if not message or not message.strip():
        st.warning("Please enter a message to analyze.")
    else:
        # Preprocess the input
        processed_text = preprocess_text(message)

        if not processed_text:
            # Only punctuation, digits or stopwords were entered: there is
            # nothing for the vectorizer to work with, so a prediction (and
            # its confidence figure) would be meaningless.
            st.warning(
                "Nothing left to analyze after preprocessing. "
                "Please enter a message that contains some words."
            )
        else:
            # Vectorize the text
            text_vectorized = vectorizer.transform([processed_text])

            # Make prediction
            prediction = model.predict(text_vectorized)[0]
            # assumes predict_proba columns are ordered
            # [legitimate, spam] -- TODO confirm against model.classes_
            probability = model.predict_proba(text_vectorized)[0]

            # Display result
            st.markdown("### Analysis Result")

            if prediction == 1:
                st.error("🚨 This message is likely SPAM!")
                st.write(f"Confidence: {probability[1]:.2%}")
            else:
                st.success("✅ This message appears to be legitimate.")
                st.write(f"Confidence: {probability[0]:.2%}")

            # Show preprocessing details
            with st.expander("See preprocessing steps"):
                st.write("Original message:", message)
                st.write("Processed message:", processed_text)
79
+
80
# Sidebar: static description of the model and its reported accuracy.
about_text = """
This spam detector uses an XGBoost classifier trained on a dataset of spam and legitimate messages.

Model Performance:
- Training Accuracy: 99.7%
- Testing Accuracy: 98.9%
"""

st.sidebar.header("About the Model")
st.sidebar.write(about_text)
spam_detector_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b493180f86a79d71c0f8bb56f09c25627feb48b89e9bd20df1740295abd902d
3
+ size 339087
spam_ham_india.csv ADDED
The diff for this file is too large to render. See raw diff
 
tfidf_vectorizer.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae27e1b70184ff42809e70f62e4f58bd9342ed9a2cf438508dd9991b08e6969c
3
+ size 94260