ABHI010 commited on
Commit
34d4137
·
verified ·
1 Parent(s): 4008151

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +249 -0
app.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import pandas as pd
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.feature_extraction.text import CountVectorizer
5
+ from sklearn.naive_bayes import MultinomialNB
6
+ from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
7
+ import joblib
8
+ import matplotlib.pyplot as plt
9
+ from io import BytesIO
10
+ import base64
11
+ import gradio as gr
12
+ import re
13
+
14
# Load the dataset, skipping malformed rows (the python engine tolerates ragged lines).
# NOTE(review): hard-coded Colab path — consider making this configurable.
dataset = pd.read_csv('/content/email_spam (1).csv', on_bad_lines='skip', engine='python')

# Keep only rows that have both a label and text, then coerce the label to a
# clean integer: non-numeric labels become NaN via errors='coerce' and are dropped.
dataset.dropna(subset=['spam', 'text'], inplace=True)
dataset['spam'] = pd.to_numeric(dataset['spam'], errors='coerce')
dataset.dropna(subset=['spam'], inplace=True)
dataset['spam'] = dataset['spam'].astype(int)

# Bag-of-words features over the raw email text.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(dataset['text'])
y = dataset['spam']

# Split the data into training and testing sets.
# fix: a fixed random_state makes the split — and therefore the performance
# metrics shown in the UI — reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Naive Bayes model.
model = MultinomialNB()
model.fit(X_train, y_train)

# Persist the trained artifacts so they could be reused without retraining.
joblib.dump(model, 'spam_model.pkl')
joblib.dump(vectorizer, 'spam_vectorizer.pkl')

# Reload to confirm the persisted artifacts round-trip correctly.
model = joblib.load('spam_model.pkl')
vectorizer = joblib.load('spam_vectorizer.pkl')

# Keywords commonly seen in spam — used only for highlighting/explanation in
# the UI; the classifier itself is purely statistical.
spam_keywords = [
    "win", "free", "urgent", "money", "credit", "loan", "offer", "buy now",
    "limited time", "click here", "guaranteed", "congratulations", "winner"
]
50
+
51
# Helper function to highlight spammy keywords
def highlight_keywords(text, keywords=None):
    """Wrap occurrences of spam keywords in *text* with a highlight <span>.

    Matching is case-insensitive. The original casing of the matched text is
    preserved in the output (the old code substituted the lowercase keyword
    from the list, silently rewriting e.g. "FREE" as "free").

    Args:
        text: The email body to annotate.
        keywords: Optional iterable of keywords; defaults to the module-level
            ``spam_keywords`` list.

    Returns:
        The text with each keyword match wrapped in
        ``<span class='highlight'>...</span>``.
    """
    if keywords is None:
        keywords = spam_keywords
    highlighted = text
    for keyword in keywords:
        # re.escape guards against any regex metacharacters in a keyword.
        pattern = re.compile(re.escape(keyword), re.IGNORECASE)
        # Substitute via a callback so the matched text keeps its casing.
        highlighted = pattern.sub(
            lambda m: f"<span class='highlight'>{m.group(0)}</span>", highlighted
        )
    return highlighted
58
+
59
# Prediction function
def classify_email(email_text):
    """Classify one email as Spam or Ham and build the display artifacts.

    Returns a dict with an HTML verdict, a confidence percentage string, the
    email text with spam keywords highlighted, the comma-joined detected
    keywords, and a short HTML advice string.
    """
    features = vectorizer.transform([email_text])
    is_spam = model.predict(features)[0] == 1
    confidence = model.predict_proba(features).max() * 100

    if is_spam:
        result, color, emoji = "Spam", "red", "⚠️"
        advice = "<b>Be careful!</b> This might be a scam."
    else:
        result, color, emoji = "Ham", "green", "📧"
        advice = "<b>This email seems safe.</b>"

    # Substring scan mirrors the highlighting logic (case-insensitive).
    lowered = email_text.lower()
    detected = [kw for kw in spam_keywords if kw.lower() in lowered]

    return {
        "result": f"<span style='color: {color}; font-size: 1.5em;'>{emoji} {result}</span>",
        "confidence": f"{confidence:.2f}%",
        "highlighted": highlight_keywords(email_text),
        "spammy_keywords": ", ".join(detected),
        "advice": advice,
    }
80
+
81
# Generate performance metrics
def generate_performance_metrics():
    """Evaluate the trained model on the held-out test set.

    Returns:
        A dict of formatted accuracy/precision/recall/F1 strings plus a
        base64-encoded PNG of the confusion matrix for inline embedding.
    """
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    # Plot into an explicit figure/axes (no implicit pyplot state) so the
    # figure can be closed afterwards.
    fig, ax = plt.subplots(figsize=(6, 6))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax, cmap='Blues')
    ax.set_title("Confusion Matrix")
    fig.tight_layout()

    # Serialize the figure to a base64 string for inline HTML embedding.
    buf = BytesIO()
    fig.savefig(buf, format="png")
    plt.close(fig)  # fix: release the figure — the old code leaked one per call
    buf.seek(0)
    img_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")
    buf.close()

    # '1' is the positive (spam) class key in the classification-report dict.
    return {
        "accuracy": f"{accuracy:.2%}",
        "precision": f"{report['1']['precision']:.2%}",
        "recall": f"{report['1']['recall']:.2%}",
        "f1_score": f"{report['1']['f1-score']:.2%}",
        "confusion_matrix_plot": img_base64,
    }
107
+
108
# Custom CSS injected into the Gradio Blocks app (see create_interface):
# page background, card styling, button hover effect, the `.highlight` class
# used by highlight_keywords, and the `.metric` class used by the metric boxes.
custom_css = """
body {
    font-family: 'Arial', sans-serif;
    background-image: url('https://cdn.pixabay.com/photo/2016/11/19/15/26/email-1839873_1280.jpg');
    background-size: cover;
    background-position: center;
    background-attachment: fixed;
    color: #333;
}

h1, h2, h3 {
    text-align: center;
    color: #ffffff;
    text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.7);
}

.gradio-container {
    background-color: rgba(255, 255, 255, 0.8);
    border-radius: 10px;
    padding: 20px;
    box-shadow: 0px 4px 10px rgba(0, 0, 0, 0.3);
}

button {
    background-color: #1e90ff;
    color: white;
    padding: 10px 20px;
    border: none;
    border-radius: 5px;
    cursor: pointer;
    font-size: 1.2em;
    transition: transform 0.2s, background-color 0.3s;
}

button:hover {
    background-color: #1c86ee;
    transform: scale(1.05);
}

.highlight {
    background-color: #ffeb3b;
    font-weight: bold;
    padding: 0 3px;
    border-radius: 3px;
}

.metric {
    font-size: 1.2em;
    text-align: center;
    color: #ffffff;
    background-color: #4CAF50;
    border-radius: 8px;
    padding: 10px;
    margin: 10px 0;
    box-shadow: 2px 2px 5px rgba(0, 0, 0, 0.2);
}
"""
166
+
167
# Create Gradio Interface
def create_interface():
    """Build the Gradio Blocks UI for the spam classifier.

    Computes the model performance metrics once at build time, lays out the
    input/output panels, and wires the "Analyze Email" button to
    classify_email. Returns the (un-launched) Blocks app.
    """
    # Metrics are computed once here, not on every request.
    performance_metrics = generate_performance_metrics()

    # NOTE: inside the Blocks context, component-creation order defines the
    # on-screen layout — keep statement order intact when editing.
    with gr.Blocks(css=custom_css) as interface:
        gr.Markdown("# 📩 Advanced Email Spam Classifier")
        gr.Markdown(
            """
            ### Enter the content of an email below to classify it as Spam or Ham.
            The tool uses **machine learning** to analyze email content, highlights spammy keywords, and shows key performance analytics.
            """
        )

        with gr.Row():
            with gr.Column():
                # Input side: the raw email text.
                email_input = gr.Textbox(
                    lines=8, placeholder="Type or paste your email content here...", label="Email Content"
                )
            with gr.Column():
                # Output side: verdict, confidence, highlighted text, keywords, advice.
                result_output = gr.HTML(label="Classification Result")
                confidence_output = gr.Textbox(label="Confidence Score", interactive=False)
                highlighted_output = gr.HTML(label="Highlighted Text")
                keywords_output = gr.Textbox(label="Spam Keywords Detected", interactive=False)
                advice_output = gr.HTML(label="Advice")

        analyze_button = gr.Button("Analyze Email 🕵️‍♂️")

        def email_analysis_pipeline(email_text):
            # Adapt classify_email's dict result to the ordered outputs list below.
            results = classify_email(email_text)
            return (
                results["result"],
                results["confidence"],
                results["highlighted"],
                results["spammy_keywords"],
                results["advice"]
            )

        analyze_button.click(
            fn=email_analysis_pipeline,
            inputs=email_input,
            outputs=[
                result_output,
                confidence_output,
                highlighted_output,
                keywords_output,
                advice_output
            ]
        )

        gr.Markdown("## 📊 Model Performance Analytics")
        with gr.Row():
            with gr.Column():
                # Static metric read-outs, styled via the .metric CSS class.
                gr.Textbox(value=performance_metrics["accuracy"], label="Accuracy", interactive=False, elem_classes=["metric"])
                gr.Textbox(value=performance_metrics["precision"], label="Precision", interactive=False, elem_classes=["metric"])
                gr.Textbox(value=performance_metrics["recall"], label="Recall", interactive=False, elem_classes=["metric"])
                gr.Textbox(value=performance_metrics["f1_score"], label="F1 Score", interactive=False, elem_classes=["metric"])
            with gr.Column():
                gr.Markdown("### Confusion Matrix")
                # The confusion matrix is embedded inline as a base64 PNG.
                gr.HTML(f"<img src='data:image/png;base64,{performance_metrics['confusion_matrix_plot']}' style='max-width: 100%; height: auto;' />")

        gr.Markdown("## 📘 Glossary and Explanation of Labels")
        gr.Markdown(
            """
            ### Labels:
            - **Spam:** Unwanted or harmful emails flagged by the system.
            - **Ham:** Legitimate, safe emails.

            ### Metrics:
            - **Accuracy:** Percentage of correct classifications.
            - **Precision:** Out of predicted Spam, how many are actually Spam.
            - **Recall:** Out of all actual Spam emails, how many are predicted as Spam.
            - **F1 Score:** Harmonic mean of Precision and Recall.

            ### Confusion Matrix:
            Shows the distribution of true vs predicted labels.
            """
        )

    return interface
246
+
247
# Build the interface at import time (so hosting runtimes that import this
# module can still discover it), but only auto-launch with a public share
# link when the file is executed as a script.
interface = create_interface()

if __name__ == "__main__":
    interface.launch(share=True)