mimi111222 committed on
Commit
1f5ae17
·
verified ·
1 Parent(s): fdb1265

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +826 -799
app.py CHANGED
@@ -1,799 +1,826 @@
1
- """
2
- AI Phishing Email Detector - Premium Black & Gold UI
3
- TF-IDF + Logistic Regression trained on Kaggle Phishing Emails dataset.
4
- Author & Deployer: Umaima Qureshi
5
- """
6
-
7
- import streamlit as st
8
- import pandas as pd
9
- import numpy as np
10
- import re
11
- from sklearn.model_selection import train_test_split
12
- from sklearn.feature_extraction.text import TfidfVectorizer
13
- from sklearn.linear_model import LogisticRegression
14
- from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
15
- import matplotlib.pyplot as plt
16
- import seaborn as sns
17
- import io
18
- import os
19
-
20
- # Page Configuration
21
- st.set_page_config(
22
- page_title="AI Phishing Shield β€” by Umaima Qureshi",
23
- layout="wide",
24
- initial_sidebar_state="collapsed"
25
- )
26
-
27
- # Premium Black & Gold CSS Styling
28
- st.markdown("""
29
- <style>
30
- @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700;800&display=swap');
31
-
32
- * {
33
- font-family: 'Inter', sans-serif;
34
- }
35
-
36
- .stApp {
37
- background: linear-gradient(135deg, #0f0f0f 0%, #1a1a1a 50%, #0f0f0f 100%);
38
- }
39
-
40
- .main {
41
- background: transparent;
42
- padding: 0;
43
- }
44
-
45
- .block-container {
46
- padding: 2rem 3rem !important;
47
- max-width: 1400px;
48
- }
49
-
50
- section[data-testid="stSidebar"] {
51
- display: none;
52
- }
53
-
54
- .element-container {
55
- background: transparent !important;
56
- }
57
-
58
- /* Hero Section */
59
- .hero-container {
60
- background: linear-gradient(135deg, #1a1a1a 0%, #0f0f0f 100%);
61
- border-radius: 28px;
62
- padding: 3.5rem 3rem;
63
- margin-bottom: 2.5rem;
64
- box-shadow: 0 25px 70px rgba(0,0,0,0.5), 0 10px 30px rgba(218,165,32,0.2);
65
- position: relative;
66
- overflow: hidden;
67
- border: 2px solid rgba(218,165,32,0.3);
68
- }
69
-
70
- .hero-container::before {
71
- content: '';
72
- position: absolute;
73
- top: -50%;
74
- right: -20%;
75
- width: 500px;
76
- height: 500px;
77
- background: radial-gradient(circle, rgba(218,165,32,0.15) 0%, transparent 70%);
78
- border-radius: 50%;
79
- }
80
-
81
- .hero-container::after {
82
- content: '';
83
- position: absolute;
84
- bottom: -30%;
85
- left: -10%;
86
- width: 400px;
87
- height: 400px;
88
- background: radial-gradient(circle, rgba(255,215,0,0.1) 0%, transparent 70%);
89
- border-radius: 50%;
90
- }
91
-
92
- .hero-title {
93
- font-size: 4rem;
94
- font-weight: 900;
95
- background: linear-gradient(135deg, #FFD700 0%, #FFA500 50%, #FFD700 100%);
96
- -webkit-background-clip: text;
97
- -webkit-text-fill-color: transparent;
98
- margin-bottom: 0.75rem;
99
- position: relative;
100
- z-index: 1;
101
- letter-spacing: -0.02em;
102
- filter: drop-shadow(0 4px 20px rgba(255,215,0,0.3));
103
- }
104
-
105
- .hero-subtitle {
106
- font-size: 1.35rem;
107
- color: #e5e7eb;
108
- font-weight: 500;
109
- margin-bottom: 1.5rem;
110
- position: relative;
111
- z-index: 1;
112
- line-height: 1.6;
113
- }
114
-
115
- .hero-badge {
116
- display: inline-block;
117
- background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
118
- color: #0f0f0f;
119
- padding: 0.7rem 2rem;
120
- border-radius: 50px;
121
- font-size: 1rem;
122
- font-weight: 700;
123
- margin-top: 1.5rem;
124
- box-shadow: 0 8px 25px rgba(255,215,0,0.4);
125
- position: relative;
126
- z-index: 1;
127
- transition: all 0.3s ease;
128
- }
129
-
130
- .hero-badge:hover {
131
- transform: translateY(-3px);
132
- box-shadow: 0 12px 35px rgba(255,215,0,0.6);
133
- }
134
-
135
- /* Glass Cards */
136
- .glass-card {
137
- background: linear-gradient(135deg, #1a1a1a 0%, #0f0f0f 100%);
138
- backdrop-filter: blur(20px);
139
- border-radius: 24px;
140
- padding: 2.5rem;
141
- margin-bottom: 2rem;
142
- box-shadow: 0 15px 45px rgba(0,0,0,0.5), 0 5px 15px rgba(255,215,0,0.1);
143
- border: 2px solid rgba(218,165,32,0.2);
144
- transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
145
- position: relative;
146
- }
147
-
148
- .glass-card::before {
149
- content: '';
150
- position: absolute;
151
- top: 0;
152
- left: 0;
153
- right: 0;
154
- height: 4px;
155
- background: linear-gradient(90deg, #FFD700 0%, #FFA500 100%);
156
- border-radius: 24px 24px 0 0;
157
- opacity: 0;
158
- transition: opacity 0.3s ease;
159
- }
160
-
161
- .glass-card:hover {
162
- transform: translateY(-8px);
163
- box-shadow: 0 20px 60px rgba(0,0,0,0.6), 0 8px 20px rgba(255,215,0,0.2);
164
- border-color: rgba(218,165,32,0.4);
165
- }
166
-
167
- .glass-card:hover::before {
168
- opacity: 1;
169
- }
170
-
171
- /* Section Headers */
172
- .section-header {
173
- font-size: 1.8rem;
174
- font-weight: 700;
175
- color: #f5f5f5;
176
- margin-bottom: 1.5rem;
177
- display: flex;
178
- align-items: center;
179
- gap: 0.75rem;
180
- position: relative;
181
- z-index: 2;
182
- }
183
-
184
- .section-icon {
185
- width: 40px;
186
- height: 40px;
187
- background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
188
- border-radius: 12px;
189
- display: flex;
190
- align-items: center;
191
- justify-content: center;
192
- font-size: 1.5rem;
193
- box-shadow: 0 4px 15px rgba(255,215,0,0.3);
194
- flex-shrink: 0;
195
- }
196
-
197
- /* Stats Grid */
198
- .stats-grid {
199
- display: grid;
200
- grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
201
- gap: 1.5rem;
202
- margin: 2rem 0;
203
- }
204
-
205
- .stat-card {
206
- background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
207
- padding: 2rem 1.5rem;
208
- border-radius: 20px;
209
- text-align: center;
210
- color: #0f0f0f;
211
- box-shadow: 0 10px 30px rgba(255,215,0,0.3);
212
- transition: all 0.3s ease;
213
- position: relative;
214
- overflow: hidden;
215
- }
216
-
217
- .stat-card::before {
218
- content: '';
219
- position: absolute;
220
- top: -50%;
221
- right: -50%;
222
- width: 200%;
223
- height: 200%;
224
- background: radial-gradient(circle, rgba(255,255,255,0.2) 0%, transparent 70%);
225
- transition: all 0.5s ease;
226
- }
227
-
228
- .stat-card:hover {
229
- transform: translateY(-8px) scale(1.02);
230
- box-shadow: 0 15px 40px rgba(255,215,0,0.5);
231
- }
232
-
233
- .stat-card:hover::before {
234
- top: -30%;
235
- right: -30%;
236
- }
237
-
238
- .stat-value {
239
- font-size: 3rem;
240
- font-weight: 900;
241
- margin-bottom: 0.5rem;
242
- position: relative;
243
- z-index: 1;
244
- text-shadow: 0 2px 10px rgba(0,0,0,0.2);
245
- color: #0f0f0f;
246
- }
247
-
248
- .stat-label {
249
- font-size: 0.95rem;
250
- font-weight: 600;
251
- opacity: 0.9;
252
- text-transform: uppercase;
253
- letter-spacing: 1.5px;
254
- position: relative;
255
- z-index: 1;
256
- color: #0f0f0f;
257
- }
258
-
259
- /* Input Areas */
260
- .stTextArea textarea {
261
- border-radius: 16px;
262
- border: 2px solid rgba(218,165,32,0.3);
263
- font-size: 1rem;
264
- transition: all 0.3s ease;
265
- background: #1a1a1a;
266
- color: #e5e7eb;
267
- }
268
-
269
- .stTextArea textarea:focus {
270
- border-color: #FFD700;
271
- box-shadow: 0 0 0 3px rgba(255,215,0,0.2);
272
- }
273
-
274
- /* Buttons */
275
- .stButton > button {
276
- background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
277
- color: #0f0f0f;
278
- border: none;
279
- border-radius: 12px;
280
- padding: 0.75rem 2.5rem;
281
- font-size: 1.1rem;
282
- font-weight: 600;
283
- transition: all 0.3s ease;
284
- box-shadow: 0 4px 15px rgba(255,215,0,0.4);
285
- width: 100%;
286
- }
287
-
288
- .stButton > button:hover {
289
- transform: translateY(-2px);
290
- box-shadow: 0 6px 20px rgba(255,215,0,0.6);
291
- }
292
-
293
- /* Alert Boxes */
294
- .alert-danger {
295
- background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%);
296
- color: white;
297
- padding: 1.5rem;
298
- border-radius: 16px;
299
- font-size: 1.1rem;
300
- font-weight: 600;
301
- box-shadow: 0 8px 24px rgba(239,68,68,0.3);
302
- margin: 1rem 0;
303
- }
304
-
305
- .alert-success {
306
- background: linear-gradient(135deg, #10b981 0%, #059669 100%);
307
- color: white;
308
- padding: 1.5rem;
309
- border-radius: 16px;
310
- font-size: 1.1rem;
311
- font-weight: 600;
312
- box-shadow: 0 8px 24px rgba(16,185,129,0.3);
313
- margin: 1rem 0;
314
- }
315
-
316
- .confidence-bar {
317
- height: 12px;
318
- background: rgba(255,255,255,0.3);
319
- border-radius: 10px;
320
- overflow: hidden;
321
- margin-top: 0.75rem;
322
- }
323
-
324
- .confidence-fill {
325
- height: 100%;
326
- background: rgba(255,255,255,0.9);
327
- border-radius: 10px;
328
- transition: width 1s ease;
329
- }
330
-
331
- /* Hints Panel */
332
- .hints-panel {
333
- background: linear-gradient(135deg, #1a1a1a 0%, #0f0f0f 100%);
334
- border-radius: 16px;
335
- padding: 1.5rem;
336
- border-left: 4px solid #FFD700;
337
- box-shadow: 0 4px 15px rgba(0,0,0,0.3);
338
- }
339
-
340
- .hint-item {
341
- display: flex;
342
- align-items: start;
343
- gap: 0.75rem;
344
- margin-bottom: 1rem;
345
- font-size: 0.95rem;
346
- color: #d1d5db;
347
- }
348
-
349
- .hint-icon {
350
- min-width: 24px;
351
- height: 24px;
352
- background: #FFD700;
353
- color: #0f0f0f;
354
- border-radius: 50%;
355
- display: flex;
356
- align-items: center;
357
- justify-content: center;
358
- font-size: 0.75rem;
359
- font-weight: 700;
360
- }
361
-
362
- /* Expanders */
363
- .streamlit-expanderHeader {
364
- background: rgba(218,165,32,0.15);
365
- border-radius: 12px;
366
- font-weight: 600;
367
- color: #f5f5f5;
368
- }
369
-
370
- /* Footer */
371
- .footer {
372
- background: linear-gradient(135deg, #1a1a1a 0%, #0f0f0f 100%);
373
- border-radius: 16px;
374
- padding: 2rem;
375
- text-align: center;
376
- margin-top: 3rem;
377
- color: #9ca3af;
378
- box-shadow: 0 8px 24px rgba(0,0,0,0.3);
379
- border: 2px solid rgba(218,165,32,0.2);
380
- }
381
-
382
- .footer-name {
383
- font-weight: 700;
384
- background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
385
- -webkit-background-clip: text;
386
- -webkit-text-fill-color: transparent;
387
- }
388
-
389
- /* File Uploader */
390
- .stFileUploader {
391
- border: 2px dashed rgba(218,165,32,0.4);
392
- border-radius: 16px;
393
- padding: 1.5rem;
394
- background: rgba(26,26,26,0.5);
395
- transition: all 0.3s ease;
396
- }
397
-
398
- .stFileUploader:hover {
399
- border-color: #FFD700;
400
- background: rgba(218,165,32,0.1);
401
- }
402
-
403
- .stFileUploader label {
404
- color: #e5e7eb !important;
405
- }
406
-
407
- /* Metric Cards */
408
- .metric-container {
409
- background: linear-gradient(135deg, #1a1a1a 0%, #0f0f0f 100%);
410
- padding: 1.25rem;
411
- border-radius: 12px;
412
- border-left: 4px solid #FFD700;
413
- box-shadow: 0 2px 8px rgba(0,0,0,0.3);
414
- }
415
-
416
- .metric-container div {
417
- color: #e5e7eb;
418
- }
419
-
420
- /* Dataframe Styling */
421
- .dataframe {
422
- border-radius: 12px;
423
- overflow: hidden;
424
- }
425
-
426
- /* Animations */
427
- @keyframes fadeIn {
428
- from { opacity: 0; transform: translateY(20px); }
429
- to { opacity: 1; transform: translateY(0); }
430
- }
431
-
432
- .glass-card {
433
- animation: fadeIn 0.6s ease forwards;
434
- }
435
-
436
- /* Hide Streamlit Branding */
437
- #MainMenu {visibility: hidden;}
438
- footer {visibility: hidden;}
439
- </style>
440
- """, unsafe_allow_html=True)
441
-
442
- # Utility Functions
443
- @st.cache_data
444
- def load_csv_from_bytes(uploaded_bytes):
445
- return pd.read_csv(io.BytesIO(uploaded_bytes))
446
-
447
- def safe_read_csv(path):
448
- try:
449
- return pd.read_csv(path)
450
- except Exception as e:
451
- st.warning(f"Could not read {path}: {e}")
452
- return pd.DataFrame()
453
-
454
- def preprocess_text(text):
455
- if not isinstance(text, str):
456
- text = str(text)
457
- text = text.lower()
458
- text = re.sub(r'http\S+|www\S+|https\S+', ' url ', text)
459
- text = re.sub(r'\S+@\S+', ' email ', text)
460
- text = re.sub(r'[^a-z\s]', ' ', text)
461
- text = re.sub(r'\s+', ' ', text).strip()
462
- return text
463
-
464
- # Hero Header
465
- st.markdown("""
466
- <div class="hero-container">
467
- <div class="hero-title">πŸ›‘οΈ AI Phishing Shield</div>
468
- <div class="hero-subtitle">Advanced machine learning protection against email threats</div>
469
- <div style="color: #d1d5db; font-size: 1rem; line-height: 1.6;">
470
- Powered by TF-IDF vectorization and Logistic Regression, trained on thousands of real-world phishing examples.
471
- Get instant threat analysis with confidence scoring and explainable AI insights.
472
- </div>
473
- <div class="hero-badge">⚑ Developed by Umaima Qureshi</div>
474
- </div>
475
- """, unsafe_allow_html=True)
476
-
477
- # Load Dataset
478
- main_csv_path = "Phishing_Email.csv"
479
- sample_csv_path = "Phishing_Email_Sample.csv"
480
-
481
- with st.container():
482
- st.markdown('<div class="glass-card">', unsafe_allow_html=True)
483
- st.markdown('<div class="section-header"><div class="section-icon">πŸ“‚</div>Dataset Configuration</div>', unsafe_allow_html=True)
484
-
485
- uploaded_file = st.file_uploader("Upload your phishing dataset (optional)", type=["csv"], help="Upload Phishing_Email.csv for full training")
486
-
487
- if uploaded_file is not None:
488
- df = load_csv_from_bytes(uploaded_file.read())
489
- elif os.path.exists(main_csv_path):
490
- df = safe_read_csv(main_csv_path)
491
- elif os.path.exists(sample_csv_path):
492
- st.info("πŸ“Š Using sample dataset for demonstration")
493
- df = safe_read_csv(sample_csv_path)
494
- else:
495
- st.info("πŸ“Š Using built-in demo dataset")
496
- df = pd.DataFrame({
497
- "Email Text": [
498
- "Urgent! Your account has been suspended. Click http://fakebank.com to verify.",
499
- "Hi team, attached is the agenda for tomorrow's meeting. Regards.",
500
- "Dear user, update your password at http://phishingsite.com immediately to avoid suspension.",
501
- "Hello Omaima, congrats on your results. Let's celebrate this week!"
502
- ],
503
- "Email Type": ["Phishing Email", "Safe Email", "Phishing Email", "Safe Email"]
504
- })
505
-
506
- st.markdown('</div>', unsafe_allow_html=True)
507
-
508
- # Clean & Prepare Dataset
509
- if "Unnamed: 0" in df.columns:
510
- df = df.drop(columns=["Unnamed: 0"])
511
-
512
- text_col = "Email Text" if "Email Text" in df.columns else df.columns[0]
513
- label_col = "Email Type" if "Email Type" in df.columns else df.columns[-1]
514
-
515
- df[text_col] = df[text_col].fillna("").astype(str)
516
- df = df[df[text_col].str.strip() != ""].reset_index(drop=True)
517
- df = df.drop(index=0, errors="ignore").reset_index(drop=True)
518
-
519
- label_map = {"Phishing Email": 1, "Safe Email": 0}
520
- if df[label_col].dtype == object:
521
- df['label'] = df[label_col].map(label_map)
522
- df['label'] = df['label'].fillna(0).astype(int)
523
- else:
524
- df['label'] = df[label_col].astype(int)
525
-
526
- df['processed_text'] = df[text_col].apply(preprocess_text)
527
-
528
- # Dataset Stats
529
- phishing_count = (df['label'] == 1).sum()
530
- safe_count = (df['label'] == 0).sum()
531
- total_count = len(df)
532
-
533
- st.markdown('<div class="glass-card">', unsafe_allow_html=True)
534
- st.markdown('<div class="section-header"><div class="section-icon">πŸ“Š</div>Dataset Statistics</div>', unsafe_allow_html=True)
535
-
536
- st.markdown(f"""
537
- <div class="stats-grid">
538
- <div class="stat-card">
539
- <div class="stat-value">{total_count}</div>
540
- <div class="stat-label">Total Emails</div>
541
- </div>
542
- <div class="stat-card">
543
- <div class="stat-value">{phishing_count}</div>
544
- <div class="stat-label">Phishing Detected</div>
545
- </div>
546
- <div class="stat-card">
547
- <div class="stat-value">{safe_count}</div>
548
- <div class="stat-label">Safe Emails</div>
549
- </div>
550
- <div class="stat-card">
551
- <div class="stat-value">{(phishing_count/total_count*100):.1f}%</div>
552
- <div class="stat-label">Threat Rate</div>
553
- </div>
554
- </div>
555
- """, unsafe_allow_html=True)
556
-
557
- with st.expander("πŸ” View Dataset Preview", expanded=False):
558
- st.dataframe(df[[text_col, label_col]].head(10), use_container_width=True)
559
-
560
- st.markdown('</div>', unsafe_allow_html=True)
561
-
562
- # Model Training
563
- @st.cache_resource
564
- def train_model(processed_texts, labels, test_size=0.2, random_state=42):
565
- strat = labels if len(np.unique(labels)) > 1 else None
566
- X_train, X_test, y_train, y_test = train_test_split(
567
- processed_texts, labels, test_size=test_size, random_state=random_state, stratify=strat
568
- )
569
- vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
570
- X_train_vec = vectorizer.fit_transform(X_train)
571
- X_test_vec = vectorizer.transform(X_test)
572
-
573
- model = LogisticRegression(max_iter=1000, solver='liblinear')
574
- model.fit(X_train_vec, y_train)
575
-
576
- y_pred = model.predict(X_test_vec)
577
- acc = accuracy_score(y_test, y_pred)
578
- cm = confusion_matrix(y_test, y_pred)
579
- report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
580
-
581
- return {
582
- "vectorizer": vectorizer,
583
- "model": model,
584
- "accuracy": acc,
585
- "confusion_matrix": cm,
586
- "report": report
587
- }
588
-
589
- model_info = train_model(df['processed_text'].tolist(), df['label'].values)
590
- vectorizer, model, accuracy = model_info["vectorizer"], model_info["model"], model_info["accuracy"]
591
-
592
- # Model Performance
593
- st.markdown('<div class="glass-card">', unsafe_allow_html=True)
594
- st.markdown('<div class="section-header"><div class="section-icon">🎯</div>Model Performance</div>', unsafe_allow_html=True)
595
-
596
- col1, col2, col3 = st.columns(3)
597
-
598
- with col1:
599
- st.markdown(f"""
600
- <div class="metric-container">
601
- <div style="color: #9ca3af; font-size: 0.85rem; font-weight: 600; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 0.5rem;">Accuracy</div>
602
- <div style="font-size: 2rem; font-weight: 800; color: #FFD700;">{accuracy:.1%}</div>
603
- </div>
604
- """, unsafe_allow_html=True)
605
-
606
- with col2:
607
- precision = model_info["report"].get("1", {}).get("precision", 0)
608
- st.markdown(f"""
609
- <div class="metric-container">
610
- <div style="color: #9ca3af; font-size: 0.85rem; font-weight: 600; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 0.5rem;">Precision</div>
611
- <div style="font-size: 2rem; font-weight: 800; color: #FFD700;">{precision:.1%}</div>
612
- </div>
613
- """, unsafe_allow_html=True)
614
-
615
- with col3:
616
- recall = model_info["report"].get("1", {}).get("recall", 0)
617
- st.markdown(f"""
618
- <div class="metric-container">
619
- <div style="color: #9ca3af; font-size: 0.85rem; font-weight: 600; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 0.5rem;">Recall</div>
620
- <div style="font-size: 2rem; font-weight: 800; color: #FFD700;">{recall:.1%}</div>
621
- </div>
622
- """, unsafe_allow_html=True)
623
-
624
- with st.expander("πŸ“ˆ Detailed Metrics & Confusion Matrix"):
625
- col_matrix, col_spacer = st.columns([1, 1.5])
626
-
627
- with col_matrix:
628
- fig, ax = plt.subplots(figsize=(4,3.5))
629
- sns.heatmap(
630
- model_info["confusion_matrix"],
631
- annot=True,
632
- fmt="d",
633
- ax=ax,
634
- cmap="YlOrBr",
635
- cbar=False,
636
- square=True,
637
- annot_kws={"size": 14, "weight": "bold"}
638
- )
639
- ax.set_xlabel("Predicted", fontsize=10, fontweight='bold')
640
- ax.set_ylabel("Actual", fontsize=10, fontweight='bold')
641
- ax.set_xticklabels(["Safe", "Phishing"], fontsize=9)
642
- ax.set_yticklabels(["Safe", "Phishing"], fontsize=9, rotation=0)
643
- ax.set_title("Confusion Matrix", fontsize=11, fontweight='bold', pad=10)
644
- plt.tight_layout()
645
- st.pyplot(fig)
646
-
647
- st.write("**Classification Report:**")
648
- report_df = pd.DataFrame(model_info["report"]).transpose().round(3)
649
- st.dataframe(report_df, use_container_width=True, height=200)
650
-
651
- st.markdown('</div>', unsafe_allow_html=True)
652
-
653
- # Inference UI
654
- st.markdown('<div class="glass-card">', unsafe_allow_html=True)
655
- st.markdown('<div class="section-header"><div class="section-icon">βœ‰οΈ</div>Email Threat Scanner</div>', unsafe_allow_html=True)
656
-
657
- col_input, col_hints = st.columns([2, 1])
658
-
659
- with col_input:
660
- email_input = st.text_area(
661
- "Paste email content for analysis",
662
- height=250,
663
- placeholder="Example: Urgent! Your account has been compromised. Click here to verify your identity immediately...",
664
- help="Paste the full email content including subject and body"
665
- )
666
-
667
- uploaded_txt = st.file_uploader("Or upload a .txt file", type=["txt"], help="Upload a text file containing the email")
668
-
669
- if uploaded_txt is not None and not email_input:
670
- try:
671
- email_input = uploaded_txt.read().decode("utf-8", errors="ignore")
672
- except Exception:
673
- email_input = str(uploaded_txt.getvalue())
674
-
675
- if st.button("πŸ” Analyze Email Threat"):
676
- if not email_input.strip():
677
- st.warning("⚠️ Please paste or upload email content to analyze")
678
- else:
679
- processed_input = preprocess_text(email_input)
680
- input_vec = vectorizer.transform([processed_input])
681
-
682
- try:
683
- proba = model.predict_proba(input_vec)[0][1]
684
- except Exception:
685
- try:
686
- score = model.decision_function(input_vec)[0]
687
- proba = 1/(1+np.exp(-score))
688
- except Exception:
689
- proba = None
690
-
691
- pred = model.predict(input_vec)[0]
692
-
693
- if pred == 1:
694
- conf_pct = f"{proba:.1%}" if proba is not None else "N/A"
695
- st.markdown(f"""
696
- <div class="alert-danger">
697
- <div style="display: flex; align-items: center; gap: 1rem; margin-bottom: 0.75rem;">
698
- <div style="font-size: 2.5rem;">⚠️</div>
699
- <div>
700
- <div style="font-size: 1.4rem; font-weight: 800;">PHISHING DETECTED</div>
701
- <div style="font-size: 1rem; opacity: 0.95;">Threat Confidence: {conf_pct}</div>
702
- </div>
703
- </div>
704
- <div class="confidence-bar">
705
- <div class="confidence-fill" style="width: {proba*100 if proba else 0}%;"></div>
706
- </div>
707
- </div>
708
- """, unsafe_allow_html=True)
709
-
710
- st.markdown("**πŸ” Threat Indicators Detected:**")
711
- indicators = []
712
- if "url" in processed_input:
713
- indicators.append("πŸ”— Suspicious URL tokens detected")
714
- if re.search(r'\b(urgent|immediately|verify|password|suspended|click|act now)\b', processed_input):
715
- indicators.append("⚑ Urgency manipulation tactics")
716
- if re.search(r'\b(bank|account|verify|login|password|security|credential)\b', processed_input):
717
- indicators.append("🏦 Financial/security keywords present")
718
- if re.search(r'\b(winner|prize|congratulations|claim|free)\b', processed_input):
719
- indicators.append("🎁 Reward/prize baiting language")
720
-
721
- for indicator in indicators:
722
- st.markdown(f"- {indicator}")
723
-
724
- if not indicators:
725
- st.markdown("- ⚠️ Content pattern matches known phishing templates")
726
-
727
- else:
728
- conf_pct = f"{(1-proba):.1%}" if proba is not None else "N/A"
729
- st.markdown(f"""
730
- <div class="alert-success">
731
- <div style="display: flex; align-items: center; gap: 1rem; margin-bottom: 0.75rem;">
732
- <div style="font-size: 2.5rem;">βœ…</div>
733
- <div>
734
- <div style="font-size: 1.4rem; font-weight: 800;">EMAIL APPEARS SAFE</div>
735
- <div style="font-size: 1rem; opacity: 0.95;">Safety Confidence: {conf_pct}</div>
736
- </div>
737
- </div>
738
- <div class="confidence-bar">
739
- <div class="confidence-fill" style="width: {(1-proba)*100 if proba else 100}%;"></div>
740
- </div>
741
- </div>
742
- """, unsafe_allow_html=True)
743
- st.markdown("**βœ“ No obvious threat indicators found in content analysis**")
744
- st.info("πŸ’‘ Remember: Always verify sender identity and be cautious with unexpected emails, even if they appear safe.")
745
-
746
- with col_hints:
747
- st.markdown("""
748
- <div class="hints-panel">
749
- <div style="font-weight: 700; font-size: 1.1rem; margin-bottom: 1rem; color: #f5f5f5;">🧠 AI Detection Insights</div>
750
-
751
- <div class="hint-item">
752
- <div class="hint-icon">1</div>
753
- <div><strong>Urgency words</strong> like "urgent", "verify", "immediately" raise red flags</div>
754
- </div>
755
-
756
- <div class="hint-item">
757
- <div class="hint-icon">2</div>
758
- <div><strong>Suspicious links</strong> or email addresses are automatically flagged</div>
759
- </div>
760
-
761
- <div class="hint-item">
762
- <div class="hint-icon">3</div>
763
- <div><strong>Financial keywords</strong> combined with urgency indicate high risk</div>
764
- </div>
765
-
766
- <div class="hint-item">
767
- <div class="hint-icon">4</div>
768
- <div>Confidence <strong>>70%</strong> warrants immediate caution</div>
769
- </div>
770
-
771
- <div class="hint-item">
772
- <div class="hint-icon">⚠️</div>
773
- <div><strong>Limitations:</strong> This tool analyzes text content only. Always verify sender identity separately.</div>
774
- </div>
775
- </div>
776
- """, unsafe_allow_html=True)
777
-
778
- st.markdown('</div>', unsafe_allow_html=True)
779
-
780
- # Footer
781
- st.markdown("""
782
- <div class="footer">
783
- <div style="font-size: 1.1rem; margin-bottom: 0.5rem;">
784
- Developed and Deployed by <span class="footer-name">Umaima Qureshi</span>
785
- </div>
786
- <div style="font-size: 0.9rem; color: #94a3b8;">
787
- πŸŽ“ Educational demonstration of ML-powered email security<br>
788
- For production use: Implement additional verification layers, link scanning, attachment analysis, and human oversight
789
- </div>
790
- <div style="margin-top: 1rem; font-size: 0.85rem; color: #6b7280;">
791
- Powered by TF-IDF β€’ Logistic Regression β€’ Scikit-learn β€’ Streamlit
792
- </div>
793
- </div>
794
- """, unsafe_allow_html=True)
795
-
796
-
797
-
798
-
799
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AI Phishing Email Detector - Premium Black & Gold UI
3
+ TF-IDF + Logistic Regression trained on Kaggle Phishing Emails dataset.
4
+ Author & Deployer: Umaima Qureshi
5
+ """
6
+
7
+ import streamlit as st
8
+ import pandas as pd
9
+ import numpy as np
10
+ import re
11
+ from sklearn.model_selection import train_test_split
12
+ from sklearn.feature_extraction.text import TfidfVectorizer
13
+ from sklearn.linear_model import LogisticRegression
14
+ from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
15
+ import matplotlib.pyplot as plt
16
+ import seaborn as sns
17
+ import io
18
+ import os
19
+
20
+ # Page Configuration
21
+ st.set_page_config(
22
+ page_title="AI Phishing Shield – by Umaima Qureshi",
23
+ layout="wide",
24
+ initial_sidebar_state="collapsed"
25
+ )
26
+
27
+ # Premium Black & Gold CSS Styling
28
+ st.markdown("""
29
+ <style>
30
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700;800&display=swap');
31
+
32
+ * {
33
+ font-family: 'Inter', sans-serif;
34
+ }
35
+
36
+ .stApp {
37
+ background: linear-gradient(135deg, #0f0f0f 0%, #1a1a1a 50%, #0f0f0f 100%);
38
+ }
39
+
40
+ .main {
41
+ background: transparent;
42
+ padding: 0;
43
+ }
44
+
45
+ .block-container {
46
+ padding: 2rem 3rem !important;
47
+ max-width: 1400px;
48
+ }
49
+
50
+ section[data-testid="stSidebar"] {
51
+ display: none;
52
+ }
53
+
54
+ .element-container {
55
+ background: transparent !important;
56
+ }
57
+
58
+ /* Hero Section */
59
+ .hero-container {
60
+ background: linear-gradient(135deg, #1a1a1a 0%, #0f0f0f 100%);
61
+ border-radius: 28px;
62
+ padding: 3.5rem 3rem;
63
+ margin-bottom: 2.5rem;
64
+ box-shadow: 0 25px 70px rgba(0,0,0,0.5), 0 10px 30px rgba(218,165,32,0.2);
65
+ position: relative;
66
+ overflow: hidden;
67
+ border: 2px solid rgba(218,165,32,0.3);
68
+ }
69
+
70
+ .hero-container::before {
71
+ content: '';
72
+ position: absolute;
73
+ top: -50%;
74
+ right: -20%;
75
+ width: 500px;
76
+ height: 500px;
77
+ background: radial-gradient(circle, rgba(218,165,32,0.15) 0%, transparent 70%);
78
+ border-radius: 50%;
79
+ }
80
+
81
+ .hero-container::after {
82
+ content: '';
83
+ position: absolute;
84
+ bottom: -30%;
85
+ left: -10%;
86
+ width: 400px;
87
+ height: 400px;
88
+ background: radial-gradient(circle, rgba(255,215,0,0.1) 0%, transparent 70%);
89
+ border-radius: 50%;
90
+ }
91
+
92
+ .hero-title {
93
+ font-size: 4rem;
94
+ font-weight: 900;
95
+ background: linear-gradient(135deg, #FFD700 0%, #FFA500 50%, #FFD700 100%);
96
+ -webkit-background-clip: text;
97
+ -webkit-text-fill-color: transparent;
98
+ margin-bottom: 0.75rem;
99
+ position: relative;
100
+ z-index: 1;
101
+ letter-spacing: -0.02em;
102
+ filter: drop-shadow(0 4px 20px rgba(255,215,0,0.3));
103
+ }
104
+
105
+ .hero-subtitle {
106
+ font-size: 1.35rem;
107
+ color: #e5e7eb;
108
+ font-weight: 500;
109
+ margin-bottom: 1.5rem;
110
+ position: relative;
111
+ z-index: 1;
112
+ line-height: 1.6;
113
+ }
114
+
115
+ .hero-badge {
116
+ display: inline-block;
117
+ background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
118
+ color: #0f0f0f;
119
+ padding: 0.7rem 2rem;
120
+ border-radius: 50px;
121
+ font-size: 1rem;
122
+ font-weight: 700;
123
+ margin-top: 1.5rem;
124
+ box-shadow: 0 8px 25px rgba(255,215,0,0.4);
125
+ position: relative;
126
+ z-index: 1;
127
+ transition: all 0.3s ease;
128
+ }
129
+
130
+ .hero-badge:hover {
131
+ transform: translateY(-3px);
132
+ box-shadow: 0 12px 35px rgba(255,215,0,0.6);
133
+ }
134
+
135
+ /* Glass Cards */
136
+ .glass-card {
137
+ background: linear-gradient(135deg, #1a1a1a 0%, #0f0f0f 100%);
138
+ backdrop-filter: blur(20px);
139
+ border-radius: 24px;
140
+ padding: 2.5rem;
141
+ margin-bottom: 2rem;
142
+ box-shadow: 0 15px 45px rgba(0,0,0,0.5), 0 5px 15px rgba(255,215,0,0.1);
143
+ border: 2px solid rgba(218,165,32,0.2);
144
+ transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
145
+ position: relative;
146
+ }
147
+
148
+ .glass-card::before {
149
+ content: '';
150
+ position: absolute;
151
+ top: 0;
152
+ left: 0;
153
+ right: 0;
154
+ height: 4px;
155
+ background: linear-gradient(90deg, #FFD700 0%, #FFA500 100%);
156
+ border-radius: 24px 24px 0 0;
157
+ opacity: 0;
158
+ transition: opacity 0.3s ease;
159
+ }
160
+
161
+ .glass-card:hover {
162
+ transform: translateY(-8px);
163
+ box-shadow: 0 20px 60px rgba(0,0,0,0.6), 0 8px 20px rgba(255,215,0,0.2);
164
+ border-color: rgba(218,165,32,0.4);
165
+ }
166
+
167
+ .glass-card:hover::before {
168
+ opacity: 1;
169
+ }
170
+
171
+ /* Section Headers */
172
+ .section-header {
173
+ font-size: 1.8rem;
174
+ font-weight: 700;
175
+ color: #f5f5f5;
176
+ margin-bottom: 1.5rem;
177
+ display: flex;
178
+ align-items: center;
179
+ gap: 0.75rem;
180
+ position: relative;
181
+ z-index: 2;
182
+ }
183
+
184
+ .section-icon {
185
+ width: 40px;
186
+ height: 40px;
187
+ background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
188
+ border-radius: 12px;
189
+ display: flex;
190
+ align-items: center;
191
+ justify-content: center;
192
+ font-size: 1.5rem;
193
+ box-shadow: 0 4px 15px rgba(255,215,0,0.3);
194
+ flex-shrink: 0;
195
+ }
196
+
197
+ /* Stats Grid */
198
+ .stats-grid {
199
+ display: grid;
200
+ grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
201
+ gap: 1.5rem;
202
+ margin: 2rem 0;
203
+ }
204
+
205
+ .stat-card {
206
+ background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
207
+ padding: 2rem 1.5rem;
208
+ border-radius: 20px;
209
+ text-align: center;
210
+ color: #0f0f0f;
211
+ box-shadow: 0 10px 30px rgba(255,215,0,0.3);
212
+ transition: all 0.3s ease;
213
+ position: relative;
214
+ overflow: hidden;
215
+ }
216
+
217
+ .stat-card::before {
218
+ content: '';
219
+ position: absolute;
220
+ top: -50%;
221
+ right: -50%;
222
+ width: 200%;
223
+ height: 200%;
224
+ background: radial-gradient(circle, rgba(255,255,255,0.2) 0%, transparent 70%);
225
+ transition: all 0.5s ease;
226
+ }
227
+
228
+ .stat-card:hover {
229
+ transform: translateY(-8px) scale(1.02);
230
+ box-shadow: 0 15px 40px rgba(255,215,0,0.5);
231
+ }
232
+
233
+ .stat-card:hover::before {
234
+ top: -30%;
235
+ right: -30%;
236
+ }
237
+
238
+ .stat-value {
239
+ font-size: 3rem;
240
+ font-weight: 900;
241
+ margin-bottom: 0.5rem;
242
+ position: relative;
243
+ z-index: 1;
244
+ text-shadow: 0 2px 10px rgba(0,0,0,0.2);
245
+ color: #0f0f0f;
246
+ }
247
+
248
+ .stat-label {
249
+ font-size: 0.95rem;
250
+ font-weight: 600;
251
+ opacity: 0.9;
252
+ text-transform: uppercase;
253
+ letter-spacing: 1.5px;
254
+ position: relative;
255
+ z-index: 1;
256
+ color: #0f0f0f;
257
+ }
258
+
259
+ /* Input Areas */
260
+ .stTextArea textarea {
261
+ border-radius: 16px;
262
+ border: 2px solid rgba(218,165,32,0.3);
263
+ font-size: 1rem;
264
+ transition: all 0.3s ease;
265
+ background: #1a1a1a;
266
+ color: #e5e7eb;
267
+ }
268
+
269
+ .stTextArea textarea:focus {
270
+ border-color: #FFD700;
271
+ box-shadow: 0 0 0 3px rgba(255,215,0,0.2);
272
+ }
273
+
274
+ /* Buttons */
275
+ .stButton > button {
276
+ background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
277
+ color: #0f0f0f;
278
+ border: none;
279
+ border-radius: 12px;
280
+ padding: 0.75rem 2.5rem;
281
+ font-size: 1.1rem;
282
+ font-weight: 600;
283
+ transition: all 0.3s ease;
284
+ box-shadow: 0 4px 15px rgba(255,215,0,0.4);
285
+ width: 100%;
286
+ }
287
+
288
+ .stButton > button:hover {
289
+ transform: translateY(-2px);
290
+ box-shadow: 0 6px 20px rgba(255,215,0,0.6);
291
+ }
292
+
293
+ /* Alert Boxes */
294
+ .alert-danger {
295
+ background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%);
296
+ color: white;
297
+ padding: 1.5rem;
298
+ border-radius: 16px;
299
+ font-size: 1.1rem;
300
+ font-weight: 600;
301
+ box-shadow: 0 8px 24px rgba(239,68,68,0.3);
302
+ margin: 1rem 0;
303
+ }
304
+
305
+ .alert-success {
306
+ background: linear-gradient(135deg, #10b981 0%, #059669 100%);
307
+ color: white;
308
+ padding: 1.5rem;
309
+ border-radius: 16px;
310
+ font-size: 1.1rem;
311
+ font-weight: 600;
312
+ box-shadow: 0 8px 24px rgba(16,185,129,0.3);
313
+ margin: 1rem 0;
314
+ }
315
+
316
+ .confidence-bar {
317
+ height: 12px;
318
+ background: rgba(255,255,255,0.3);
319
+ border-radius: 10px;
320
+ overflow: hidden;
321
+ margin-top: 0.75rem;
322
+ }
323
+
324
+ .confidence-fill {
325
+ height: 100%;
326
+ background: rgba(255,255,255,0.9);
327
+ border-radius: 10px;
328
+ transition: width 1s ease;
329
+ }
330
+
331
+ /* Hints Panel */
332
+ .hints-panel {
333
+ background: linear-gradient(135deg, #1a1a1a 0%, #0f0f0f 100%);
334
+ border-radius: 16px;
335
+ padding: 1.5rem;
336
+ border-left: 4px solid #FFD700;
337
+ box-shadow: 0 4px 15px rgba(0,0,0,0.3);
338
+ }
339
+
340
+ .hint-item {
341
+ display: flex;
342
+ align-items: start;
343
+ gap: 0.75rem;
344
+ margin-bottom: 1rem;
345
+ font-size: 0.95rem;
346
+ color: #d1d5db;
347
+ }
348
+
349
+ .hint-icon {
350
+ min-width: 24px;
351
+ height: 24px;
352
+ background: #FFD700;
353
+ color: #0f0f0f;
354
+ border-radius: 50%;
355
+ display: flex;
356
+ align-items: center;
357
+ justify-content: center;
358
+ font-size: 0.75rem;
359
+ font-weight: 700;
360
+ }
361
+
362
+ /* Expanders */
363
+ .streamlit-expanderHeader {
364
+ background: rgba(218,165,32,0.15);
365
+ border-radius: 12px;
366
+ font-weight: 600;
367
+ color: #f5f5f5;
368
+ }
369
+
370
+ /* Footer */
371
+ .footer {
372
+ background: linear-gradient(135deg, #1a1a1a 0%, #0f0f0f 100%);
373
+ border-radius: 16px;
374
+ padding: 2rem;
375
+ text-align: center;
376
+ margin-top: 3rem;
377
+ color: #9ca3af;
378
+ box-shadow: 0 8px 24px rgba(0,0,0,0.3);
379
+ border: 2px solid rgba(218,165,32,0.2);
380
+ }
381
+
382
+ .footer-name {
383
+ font-weight: 700;
384
+ background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
385
+ -webkit-background-clip: text;
386
+ -webkit-text-fill-color: transparent;
387
+ }
388
+
389
+ /* File Uploader */
390
+ .stFileUploader {
391
+ border: 2px dashed rgba(218,165,32,0.4);
392
+ border-radius: 16px;
393
+ padding: 1.5rem;
394
+ background: rgba(26,26,26,0.5);
395
+ transition: all 0.3s ease;
396
+ }
397
+
398
+ .stFileUploader:hover {
399
+ border-color: #FFD700;
400
+ background: rgba(218,165,32,0.1);
401
+ }
402
+
403
+ .stFileUploader label {
404
+ color: #e5e7eb !important;
405
+ }
406
+
407
+ /* Metric Cards */
408
+ .metric-container {
409
+ background: linear-gradient(135deg, #1a1a1a 0%, #0f0f0f 100%);
410
+ padding: 1.25rem;
411
+ border-radius: 12px;
412
+ border-left: 4px solid #FFD700;
413
+ box-shadow: 0 2px 8px rgba(0,0,0,0.3);
414
+ }
415
+
416
+ .metric-container div {
417
+ color: #e5e7eb;
418
+ }
419
+
420
+ /* Dataframe Styling */
421
+ .dataframe {
422
+ border-radius: 12px;
423
+ overflow: hidden;
424
+ }
425
+
426
+ /* Animations */
427
+ @keyframes fadeIn {
428
+ from { opacity: 0; transform: translateY(20px); }
429
+ to { opacity: 1; transform: translateY(0); }
430
+ }
431
+
432
+ .glass-card {
433
+ animation: fadeIn 0.6s ease forwards;
434
+ }
435
+
436
+ /* Hide Streamlit Branding */
437
+ #MainMenu {visibility: hidden;}
438
+ footer {visibility: hidden;}
439
+ </style>
440
+ """, unsafe_allow_html=True)
441
+
442
+ # Utility Functions
443
@st.cache_data
def load_csv_from_bytes(uploaded_bytes):
    """Parse the raw bytes of a CSV file into a pandas DataFrame.

    The bytes are wrapped in an in-memory buffer and handed to
    ``pd.read_csv``. The function is decorated with ``st.cache_data``.
    """
    buffer = io.BytesIO(uploaded_bytes)
    return pd.read_csv(buffer)
446
+
447
def safe_read_csv(path):
    """Read the CSV at *path*, degrading to an empty DataFrame on failure.

    Any exception raised by ``pd.read_csv`` is reported to the user via
    ``st.warning`` instead of propagating to the caller.
    """
    try:
        frame = pd.read_csv(path)
    except Exception as e:
        st.warning(f"Could not read {path}: {e}")
        frame = pd.DataFrame()
    return frame
453
+
454
# Patterns used by preprocess_text, compiled once at import time.
# "http\S+" already covers "https...", so no separate https alternative is needed.
_URL_PATTERN = re.compile(r'http\S+|www\S+')
_EMAIL_PATTERN = re.compile(r'\S+@\S+')
_NON_LETTER_PATTERN = re.compile(r'[^a-z\s]')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def preprocess_text(text):
    """Normalize raw email text into lowercase, letters-only tokens.

    Steps, in order: lowercase the text, replace URLs with the token
    ``url``, replace e-mail addresses with the token ``email``, turn every
    character that is not ``a-z`` or whitespace into a space, then collapse
    whitespace runs and strip the ends.

    Args:
        text: The email content. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as missing text.

    Returns:
        The cleaned string; ``""`` when nothing usable remains.
    """
    # Missing values must not turn into the literal words "none" / "nan".
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    # URLs and addresses are replaced before punctuation is stripped,
    # otherwise "://" and "@" would already be gone.
    text = _URL_PATTERN.sub(' url ', text)
    text = _EMAIL_PATTERN.sub(' email ', text)
    text = _NON_LETTER_PATTERN.sub(' ', text)
    return _WHITESPACE_PATTERN.sub(' ', text).strip()
463
+
464
# Hero Header
# Renders the landing banner. The HTML is kept flush-left on purpose:
# Markdown treats lines indented by four or more spaces as code blocks.
st.markdown("""
<div class="hero-container">
<div class="hero-title">🛡️ AI Phishing Shield</div>
<div class="hero-subtitle">Advanced machine learning protection against email threats</div>
<div style="color: #d1d5db; font-size: 1rem; line-height: 1.6;">
Powered by TF-IDF vectorization and Logistic Regression, trained on thousands of real-world phishing examples.
Get instant threat analysis with confidence scoring and explainable AI insights.
</div>
<div class="hero-badge">⚡ Developed by Umaima Qureshi</div>
</div>
""", unsafe_allow_html=True)
476
+
477
# Load Dataset
# Source priority: uploaded CSV > full dataset on disk > sample dataset on
# disk > small built-in demo set.
main_csv_path = "Phishing_Email.csv"
sample_csv_path = "Phishing_Email_Sample.csv"

with st.container():
    st.markdown('<div class="glass-card">', unsafe_allow_html=True)
    st.markdown('<div class="section-header"><div class="section-icon">📂</div>Dataset Configuration</div>', unsafe_allow_html=True)

    uploaded_file = st.file_uploader("Upload your phishing dataset (optional)", type=["csv"], help="Upload Phishing_Email.csv for full training")

    if uploaded_file is not None:
        # getvalue() returns the complete payload; read() only returns what
        # lies after the current stream position, which may no longer be
        # the start of the file on a later script rerun.
        df = load_csv_from_bytes(uploaded_file.getvalue())
    elif os.path.exists(main_csv_path):
        df = safe_read_csv(main_csv_path)
    elif os.path.exists(sample_csv_path):
        st.info("📊 Using sample dataset for demonstration")
        df = safe_read_csv(sample_csv_path)
    else:
        st.info("📊 Using built-in demo dataset")
        # Balanced demo set (4 phishing / 4 safe) so that training still
        # has both classes available.
        df = pd.DataFrame({
            "Email Text": [
                "Urgent! Your account has been suspended. Click http://fakebank.com to verify.",
                "WINNER! Claim your $1000 prize now at http://scam.com before it expires!",
                "Hi team, attached is the agenda for tomorrow's meeting. Regards.",
                "Hello Umaima, congrats on your results. Let's celebrate this week!",
                "Action required: Update your bank password at http://phishingsite.com immediately.",
                "Reminder: Project deadline is next Monday. Please submit your updates.",
                "Your PayPal account needs verification. Click here: http://fake-paypal.com",
                "Thanks for your email. I'll review the document and get back to you tomorrow.",
            ],
            "Email Type": [
                "Phishing Email", "Phishing Email", "Safe Email", "Safe Email",
                "Phishing Email", "Safe Email", "Phishing Email", "Safe Email",
            ],
        })

    st.markdown('</div>', unsafe_allow_html=True)
515
+
516
# Clean & Prepare Dataset
# Produces two derived columns on df: 'label' (1 = phishing, 0 = safe) and
# 'processed_text' (output of preprocess_text).
if df.empty:
    # A failed CSV read yields an empty frame with no columns; the column
    # lookups below would raise IndexError, so end this run with a message.
    st.error("No email data could be loaded. Please upload a valid CSV dataset.")
    st.stop()

if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])

# Fall back to positional columns when the expected headers are missing.
text_col = "Email Text" if "Email Text" in df.columns else df.columns[0]
label_col = "Email Type" if "Email Type" in df.columns else df.columns[-1]

# Every non-blank email is kept; no row is discarded by position.
df[text_col] = df[text_col].fillna("").astype(str)
df = df[df[text_col].str.strip() != ""].reset_index(drop=True)

label_map = {"Phishing Email": 1, "Safe Email": 0}
if pd.api.types.is_numeric_dtype(df[label_col]):
    df['label'] = df[label_col].astype(int)
else:
    df['label'] = df[label_col].astype(str).str.strip().map(label_map)
    unknown_mask = df['label'].isna()
    if unknown_mask.any():
        # Rows with unrecognized labels are dropped rather than being
        # silently counted as "Safe", which would corrupt the training data.
        st.warning(f"Ignoring {int(unknown_mask.sum())} row(s) with unrecognized labels in '{label_col}'.")
        df = df[~unknown_mask].reset_index(drop=True)
    df['label'] = df['label'].astype(int)

if df.empty:
    st.error("No usable labeled emails were found in the dataset.")
    st.stop()

df['processed_text'] = df[text_col].apply(preprocess_text)
535
+
536
# Dataset Stats
phishing_count = (df['label'] == 1).sum()
safe_count = (df['label'] == 0).sum()
total_count = len(df)
# Guard the percentage so an empty dataset cannot raise ZeroDivisionError.
threat_rate = (phishing_count / total_count * 100) if total_count else 0.0

st.markdown('<div class="glass-card">', unsafe_allow_html=True)
st.markdown('<div class="section-header"><div class="section-icon">📊</div>Dataset Statistics</div>', unsafe_allow_html=True)

# HTML kept flush-left: Markdown renders 4-space-indented lines as code.
st.markdown(f"""
<div class="stats-grid">
<div class="stat-card">
<div class="stat-value">{total_count}</div>
<div class="stat-label">Total Emails</div>
</div>
<div class="stat-card">
<div class="stat-value">{phishing_count}</div>
<div class="stat-label">Phishing Detected</div>
</div>
<div class="stat-card">
<div class="stat-value">{safe_count}</div>
<div class="stat-label">Safe Emails</div>
</div>
<div class="stat-card">
<div class="stat-value">{threat_rate:.1f}%</div>
<div class="stat-label">Threat Rate</div>
</div>
</div>
""", unsafe_allow_html=True)

with st.expander("🔍 View Dataset Preview", expanded=False):
    st.dataframe(df[[text_col, label_col]].head(10), use_container_width=True)

st.markdown('</div>', unsafe_allow_html=True)
569
+
570
+ # Model Training - ULTIMATE FIX
571
@st.cache_resource
def train_model(processed_texts, labels, test_size=0.2, random_state=42):
    """Train and evaluate the TF-IDF + Logistic Regression classifier.

    Args:
        processed_texts: Sequence of preprocessed email strings.
        labels: Integer class labels aligned with ``processed_texts``.
        test_size: Fraction of the data held out for evaluation.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        dict with keys ``"vectorizer"`` (fitted TfidfVectorizer),
        ``"model"`` (fitted LogisticRegression), ``"accuracy"``,
        ``"confusion_matrix"`` and ``"report"`` (classification report as
        a dict), all computed on the held-out split.

    Raises:
        ValueError: If ``labels`` is empty, or if scikit-learn rejects the
            data (for example an unusable split size).
    """
    labels = np.asarray(labels)
    if labels.size == 0:
        raise ValueError("Cannot train the phishing model on an empty dataset.")

    unique_labels, counts = np.unique(labels, return_counts=True)
    min_samples = counts.min()

    # Stratification needs at least two classes and, for the rarest class,
    # at least one sample on each side of the split.
    min_test_samples = int(np.ceil(min_samples * test_size))
    min_train_samples = min_samples - min_test_samples
    use_stratify = (
        len(unique_labels) > 1
        and min_samples >= 2
        and min_train_samples >= 1
        and min_test_samples >= 1
    )

    def _split(stratify):
        return train_test_split(
            processed_texts, labels,
            test_size=test_size, random_state=random_state, stratify=stratify,
        )

    if use_stratify:
        try:
            X_train, X_test, y_train, y_test = _split(labels)
        except ValueError:
            # scikit-learn can still refuse to stratify very small
            # datasets; fall back to a plain random split.
            X_train, X_test, y_train, y_test = _split(None)
    else:
        X_train, X_test, y_train, y_test = _split(None)

    vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    model = LogisticRegression(max_iter=1000, solver='liblinear')
    model.fit(X_train_vec, y_train)

    y_pred = model.predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)
    # For 0/1 labels pin the matrix to 2x2 even when the test split happens
    # to contain a single class; other label sets keep sklearn's default.
    is_binary = set(unique_labels.tolist()) <= {0, 1}
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1] if is_binary else None)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    return {
        "vectorizer": vectorizer,
        "model": model,
        "accuracy": acc,
        "confusion_matrix": cm,
        "report": report,
    }
620
+
621
model_info = train_model(df['processed_text'].tolist(), df['label'].values)
vectorizer, model, accuracy = model_info["vectorizer"], model_info["model"], model_info["accuracy"]


def _metric_card_html(title, value):
    """Return the HTML for one metric card showing *value* as a percentage."""
    # Flush-left HTML: Markdown renders 4-space-indented lines as code.
    return f"""
<div class="metric-container">
<div style="color: #9ca3af; font-size: 0.85rem; font-weight: 600; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 0.5rem;">{title}</div>
<div style="font-size: 2rem; font-weight: 800; color: #FFD700;">{value:.1%}</div>
</div>
"""


# Model Performance
st.markdown('<div class="glass-card">', unsafe_allow_html=True)
st.markdown('<div class="section-header"><div class="section-icon">🎯</div>Model Performance</div>', unsafe_allow_html=True)

# Precision/recall are reported for the phishing class (report key "1");
# they default to 0 when that class is absent from the report.
phishing_report = model_info["report"].get("1", {})
precision = phishing_report.get("precision", 0)
recall = phishing_report.get("recall", 0)

col1, col2, col3 = st.columns(3)
metric_cards = (
    (col1, "Accuracy", accuracy),
    (col2, "Precision", precision),
    (col3, "Recall", recall),
)
for column, title, value in metric_cards:
    with column:
        st.markdown(_metric_card_html(title, value), unsafe_allow_html=True)

with st.expander("📈 Detailed Metrics & Confusion Matrix"):
    col_matrix, col_spacer = st.columns([1, 1.5])

    with col_matrix:
        cm = model_info["confusion_matrix"]
        fig, ax = plt.subplots(figsize=(4, 3.5))
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            ax=ax,
            cmap="YlOrBr",
            cbar=False,
            square=True,
            annot_kws={"size": 14, "weight": "bold"},
        )
        ax.set_xlabel("Predicted", fontsize=10, fontweight='bold')
        ax.set_ylabel("Actual", fontsize=10, fontweight='bold')
        # Only a 2x2 matrix has exactly two ticks per axis; forcing two
        # labels onto any other shape makes matplotlib raise.
        if cm.shape == (2, 2):
            ax.set_xticklabels(["Safe", "Phishing"], fontsize=9)
            ax.set_yticklabels(["Safe", "Phishing"], fontsize=9, rotation=0)
        ax.set_title("Confusion Matrix", fontsize=11, fontweight='bold', pad=10)
        plt.tight_layout()
        st.pyplot(fig)
        # Release the figure; otherwise one accumulates per script rerun.
        plt.close(fig)

    st.write("**Classification Report:**")
    report_df = pd.DataFrame(model_info["report"]).transpose().round(3)
    st.dataframe(report_df, use_container_width=True, height=200)

st.markdown('</div>', unsafe_allow_html=True)
684
+
685
# Inference UI
# (regex pattern searched in the preprocessed text, message shown on a match)
_THREAT_INDICATORS = (
    # Word boundaries: preprocess_text emits the standalone token "url", and
    # a plain substring test would also fire on words such as "curl".
    (r'\burl\b', "🔗 Suspicious URL tokens detected"),
    (r'\b(urgent|immediately|verify|password|suspended|click|act now)\b', "⚡ Urgency manipulation tactics"),
    (r'\b(bank|account|verify|login|password|security|credential)\b', "🏦 Financial/security keywords present"),
    (r'\b(winner|prize|congratulations|claim|free)\b', "🎁 Reward/prize baiting language"),
)


def _verdict_html(css_class, icon, headline, caption, confidence):
    """Return the HTML for a verdict banner with its confidence bar.

    *confidence* is a probability in [0, 1], or None when unavailable; in
    that case the text shows "N/A" and the bar stays empty.
    """
    conf_pct = f"{confidence:.1%}" if confidence is not None else "N/A"
    bar_width = confidence * 100 if confidence is not None else 0
    # Flush-left HTML: Markdown renders 4-space-indented lines as code.
    return f"""
<div class="{css_class}">
<div style="display: flex; align-items: center; gap: 1rem; margin-bottom: 0.75rem;">
<div style="font-size: 2.5rem;">{icon}</div>
<div>
<div style="font-size: 1.4rem; font-weight: 800;">{headline}</div>
<div style="font-size: 1rem; opacity: 0.95;">{caption}: {conf_pct}</div>
</div>
</div>
<div class="confidence-bar">
<div class="confidence-fill" style="width: {bar_width}%;"></div>
</div>
</div>
"""


st.markdown('<div class="glass-card">', unsafe_allow_html=True)
st.markdown('<div class="section-header"><div class="section-icon">✉️</div>Email Threat Scanner</div>', unsafe_allow_html=True)

col_input, col_hints = st.columns([2, 1])

with col_input:
    email_input = st.text_area(
        "Paste email content for analysis",
        height=250,
        placeholder="Example: Urgent! Your account has been compromised. Click here to verify your identity immediately...",
        help="Paste the full email content including subject and body",
    )

    uploaded_txt = st.file_uploader("Or upload a .txt file", type=["txt"], help="Upload a text file containing the email")

    # Pasted text wins; the uploaded file is only used when the box is empty.
    if uploaded_txt is not None and not email_input:
        # errors="ignore" drops undecodable bytes, so decoding cannot fail.
        email_input = uploaded_txt.getvalue().decode("utf-8", errors="ignore")

    if st.button("🔍 Analyze Email Threat"):
        if not email_input.strip():
            st.warning("⚠️ Please paste or upload email content to analyze")
        else:
            processed_input = preprocess_text(email_input)
            input_vec = vectorizer.transform([processed_input])

            # Best effort: prefer the calibrated probability, fall back to a
            # sigmoid of the decision score, else report no confidence.
            try:
                proba = model.predict_proba(input_vec)[0][1]
            except Exception:
                try:
                    score = model.decision_function(input_vec)[0]
                    proba = 1 / (1 + np.exp(-score))
                except Exception:
                    proba = None

            pred = model.predict(input_vec)[0]

            if pred == 1:
                st.markdown(
                    _verdict_html("alert-danger", "⚠️", "PHISHING DETECTED", "Threat Confidence", proba),
                    unsafe_allow_html=True,
                )

                st.markdown("**🔍 Threat Indicators Detected:**")
                indicators = [
                    message
                    for pattern, message in _THREAT_INDICATORS
                    if re.search(pattern, processed_input)
                ]

                for indicator in indicators:
                    st.markdown(f"- {indicator}")

                if not indicators:
                    st.markdown("- ⚠️ Content pattern matches known phishing templates")

            else:
                # Confidence in the "safe" verdict is the complement of the
                # phishing probability.
                safe_confidence = (1 - proba) if proba is not None else None
                st.markdown(
                    _verdict_html("alert-success", "✅", "EMAIL APPEARS SAFE", "Safety Confidence", safe_confidence),
                    unsafe_allow_html=True,
                )
                st.markdown("**✓ No obvious threat indicators found in content analysis**")
                st.info("💡 Remember: Always verify sender identity and be cautious with unexpected emails, even if they appear safe.")

with col_hints:
    st.markdown("""
<div class="hints-panel">
<div style="font-weight: 700; font-size: 1.1rem; margin-bottom: 1rem; color: #f5f5f5;">🧠 AI Detection Insights</div>

<div class="hint-item">
<div class="hint-icon">1</div>
<div><strong>Urgency words</strong> like "urgent", "verify", "immediately" raise red flags</div>
</div>

<div class="hint-item">
<div class="hint-icon">2</div>
<div><strong>Suspicious links</strong> or email addresses are automatically flagged</div>
</div>

<div class="hint-item">
<div class="hint-icon">3</div>
<div><strong>Financial keywords</strong> combined with urgency indicate high risk</div>
</div>

<div class="hint-item">
<div class="hint-icon">4</div>
<div>Confidence <strong>>70%</strong> warrants immediate caution</div>
</div>

<div class="hint-item">
<div class="hint-icon">⚠️</div>
<div><strong>Limitations:</strong> This tool analyzes text content only. Always verify sender identity separately.</div>
</div>
</div>
""", unsafe_allow_html=True)

st.markdown('</div>', unsafe_allow_html=True)
811
+
812
# Footer
# Static attribution and disclaimer block. HTML is kept flush-left because
# Markdown renders 4-space-indented lines as code.
st.markdown("""
<div class="footer">
<div style="font-size: 1.1rem; margin-bottom: 0.5rem;">
Developed and Deployed by <span class="footer-name">Umaima Qureshi</span>
</div>
<div style="font-size: 0.9rem; color: #94a3b8;">
🎓 Educational demonstration of ML-powered email security<br>
For production use: Implement additional verification layers, link scanning, attachment analysis, and human oversight
</div>
<div style="margin-top: 1rem; font-size: 0.85rem; color: #6b7280;">
Powered by TF-IDF • Logistic Regression • Scikit-learn • Streamlit
</div>
</div>
""", unsafe_allow_html=True)