Spaces:
Sleeping
Sleeping
| """ | |
| AI Phishing Email Detector - Premium Black & Gold UI | |
| TF-IDF + Logistic Regression trained on Kaggle Phishing Emails dataset from HuggingFace Files | |
| Author & Deployer: Umaima Qureshi | |
| Modified for HuggingFace Files Support | |
| """ | |
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import re | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.metrics import accuracy_score, confusion_matrix, classification_report | |
| import matplotlib | |
| matplotlib.use('Agg') | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import io | |
| import os | |
# Page configuration
st.set_page_config(
    page_title="AI Phishing Shield β by Umaima Qureshi",
    layout="wide",
    initial_sidebar_state="collapsed",
)

# Initialize session state: each key is seeded only when it is not already
# present, so values stored by an earlier run of the script are kept.
# Factories (rather than shared values) give every session its own list.
for _state_key, _default_factory in (
    ('model_trained', lambda: False),
    ('analysis_history', list),
    ('cm_plot_cached', lambda: None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _default_factory()
# Premium Black & Gold CSS styling.
# The whole stylesheet lives in one constant and is injected once as raw HTML.
_THEME_CSS = """
<style>
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700;800;900&display=swap');
* {
font-family: 'Inter', sans-serif;
}
.stApp {
background: linear-gradient(135deg, #0a0a0a 0%, #1a1a1a 50%, #0a0a0a 100%);
}
.main {
background: transparent;
padding: 0;
}
.block-container {
padding: 2rem 3rem !important;
max-width: 1400px;
}
section[data-testid="stSidebar"] {
display: none;
}
.hero-container {
background: linear-gradient(135deg, #1a1a1a 0%, #0f0f0f 100%);
border-radius: 32px;
padding: 4rem 3rem;
margin-bottom: 3rem;
box-shadow: 0 25px 70px rgba(0,0,0,0.6), 0 10px 30px rgba(218,165,32,0.25), inset 0 1px 0 rgba(255,255,255,0.1);
position: relative;
overflow: hidden;
border: 2px solid rgba(218,165,32,0.4);
}
.hero-container::before {
content: '';
position: absolute;
top: -50%;
right: -20%;
width: 600px;
height: 600px;
background: radial-gradient(circle, rgba(218,165,32,0.2) 0%, transparent 70%);
border-radius: 50%;
animation: pulse 8s ease-in-out infinite;
}
@keyframes pulse {
0%, 100% { transform: scale(1); opacity: 0.3; }
50% { transform: scale(1.1); opacity: 0.5; }
}
.hero-container::after {
content: '';
position: absolute;
bottom: -30%;
left: -10%;
width: 500px;
height: 500px;
background: radial-gradient(circle, rgba(255,215,0,0.15) 0%, transparent 70%);
border-radius: 50%;
}
.hero-title {
font-size: 4.5rem;
font-weight: 900;
background: linear-gradient(135deg, #FFD700 0%, #FFA500 50%, #FFD700 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 1rem;
position: relative;
z-index: 1;
letter-spacing: -0.03em;
filter: drop-shadow(0 4px 20px rgba(255,215,0,0.4));
}
.hero-subtitle {
font-size: 1.45rem;
color: #e5e7eb;
font-weight: 500;
margin-bottom: 1.5rem;
position: relative;
z-index: 1;
line-height: 1.6;
letter-spacing: 0.3px;
}
.hero-description {
color: #d1d5db;
font-size: 1.05rem;
line-height: 1.7;
position: relative;
z-index: 1;
max-width: 900px;
}
.hero-badge {
display: inline-block;
background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
color: #0f0f0f;
padding: 0.8rem 2.5rem;
border-radius: 50px;
font-size: 1.05rem;
font-weight: 700;
margin-top: 1.8rem;
box-shadow: 0 8px 25px rgba(255,215,0,0.5), 0 0 40px rgba(255,215,0,0.3);
position: relative;
z-index: 1;
transition: all 0.3s ease;
}
.hero-badge:hover {
transform: translateY(-2px);
box-shadow: 0 12px 35px rgba(255,215,0,0.6), 0 0 50px rgba(255,215,0,0.4);
}
.section-title {
font-size: 2.2rem;
font-weight: 800;
background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin: 3.5rem 0 2rem 0;
text-align: center;
letter-spacing: 0.5px;
position: relative;
padding-bottom: 1rem;
}
.section-title::after {
content: '';
position: absolute;
bottom: 0;
left: 50%;
transform: translateX(-50%);
width: 100px;
height: 4px;
background: linear-gradient(90deg, transparent, #FFD700, transparent);
border-radius: 2px;
}
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
gap: 1.8rem;
margin: 2.5rem 0;
}
.stat-card {
background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
padding: 2.5rem 1.8rem;
border-radius: 24px;
text-align: center;
color: #0f0f0f;
box-shadow: 0 10px 30px rgba(255,215,0,0.35), 0 0 40px rgba(255,215,0,0.2), inset 0 1px 0 rgba(255,255,255,0.3);
transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
position: relative;
overflow: hidden;
}
.stat-card:hover {
transform: translateY(-10px) scale(1.03);
box-shadow: 0 20px 50px rgba(255,215,0,0.5), 0 0 60px rgba(255,215,0,0.3), inset 0 1px 0 rgba(255,255,255,0.4);
}
.stat-value {
font-size: 3.5rem;
font-weight: 900;
margin-bottom: 0.5rem;
position: relative;
z-index: 1;
color: #0f0f0f;
text-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.stat-label {
font-size: 0.95rem;
font-weight: 700;
opacity: 0.9;
text-transform: uppercase;
letter-spacing: 1.8px;
position: relative;
z-index: 1;
color: #0f0f0f;
}
.stTextArea textarea {
border-radius: 18px;
border: 2px solid rgba(218,165,32,0.35);
font-size: 1.05rem;
transition: all 0.3s ease;
background: rgba(26,26,26,0.8) !important;
color: #e5e7eb !important;
padding: 1rem !important;
line-height: 1.6 !important;
}
.stTextArea textarea:focus {
border-color: #FFD700;
box-shadow: 0 0 0 4px rgba(255,215,0,0.15);
background: rgba(26,26,26,0.95) !important;
}
.stButton > button {
background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
color: #0f0f0f;
border: none;
border-radius: 14px;
padding: 0.9rem 2.8rem;
font-size: 1.15rem;
font-weight: 700;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
box-shadow: 0 4px 15px rgba(255,215,0,0.4), 0 0 30px rgba(255,215,0,0.2);
width: 100%;
letter-spacing: 0.5px;
position: relative;
overflow: hidden;
}
.stButton > button:hover {
transform: translateY(-3px);
box-shadow: 0 8px 25px rgba(255,215,0,0.6), 0 0 50px rgba(255,215,0,0.3);
}
.stButton > button:active {
transform: translateY(-1px);
}
.alert-box {
padding: 2rem;
border-radius: 20px;
font-size: 1.1rem;
font-weight: 600;
margin: 1.5rem 0;
border: 2px solid rgba(255,255,255,0.1);
color: white;
}
.confidence-bar {
height: 14px;
background: rgba(255,255,255,0.25);
border-radius: 12px;
overflow: hidden;
margin-top: 1rem;
box-shadow: inset 0 2px 4px rgba(0,0,0,0.2);
}
.confidence-fill {
height: 100%;
background: rgba(255,255,255,0.95);
border-radius: 12px;
transition: width 1.2s cubic-bezier(0.4, 0, 0.2, 1);
box-shadow: 0 0 10px rgba(255,255,255,0.5);
}
.hints-panel {
background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
border-radius: 20px;
padding: 2rem;
border-left: 5px solid #FFD700;
box-shadow: 0 4px 15px rgba(0,0,0,0.4), inset 0 1px 0 rgba(255,255,255,0.05);
backdrop-filter: blur(10px);
}
.hint-item {
display: flex;
align-items: start;
gap: 1rem;
margin-bottom: 1.2rem;
font-size: 0.98rem;
color: #d1d5db;
line-height: 1.6;
}
.hint-icon {
min-width: 28px;
height: 28px;
background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
color: #0f0f0f;
border-radius: 50%;
display: flex;
align-items: center;
justify-content: center;
font-size: 0.8rem;
font-weight: 800;
box-shadow: 0 2px 8px rgba(255,215,0,0.4);
}
.metric-container {
background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
padding: 1.8rem;
border-radius: 16px;
border-left: 5px solid #FFD700;
box-shadow: 0 4px 12px rgba(0,0,0,0.4), inset 0 1px 0 rgba(255,255,255,0.05);
transition: all 0.3s ease;
}
.metric-container:hover {
transform: translateY(-2px);
box-shadow: 0 6px 18px rgba(0,0,0,0.5), inset 0 1px 0 rgba(255,255,255,0.08);
}
.stFileUploader {
border: 2px dashed rgba(218,165,32,0.45);
border-radius: 18px;
padding: 2rem;
background: rgba(26,26,26,0.6);
transition: all 0.3s ease;
}
.stFileUploader:hover {
border-color: #FFD700;
background: rgba(218,165,32,0.12);
box-shadow: 0 0 20px rgba(255,215,0,0.15);
}
.streamlit-expanderHeader {
background: linear-gradient(135deg, rgba(218,165,32,0.2) 0%, rgba(218,165,32,0.1) 100%) !important;
border-radius: 14px !important;
font-weight: 700 !important;
color: #f5f5f5 !important;
border: 1px solid rgba(218,165,32,0.3) !important;
padding: 1rem 1.5rem !important;
transition: all 0.3s ease !important;
}
.streamlit-expanderHeader:hover {
background: linear-gradient(135deg, rgba(218,165,32,0.25) 0%, rgba(218,165,32,0.15) 100%) !important;
border-color: rgba(218,165,32,0.5) !important;
}
.stDataFrame {
background: rgba(26,26,26,0.95) !important;
border-radius: 12px !important;
overflow: hidden !important;
}
.stDataFrame table {
background: rgba(26,26,26,0.95) !important;
color: #e5e7eb !important;
}
.stDataFrame thead tr th {
background: rgba(218,165,32,0.2) !important;
color: #FFD700 !important;
font-weight: 700 !important;
border-bottom: 2px solid rgba(218,165,32,0.4) !important;
}
.stDataFrame tbody tr {
background: rgba(26,26,26,0.8) !important;
border-bottom: 1px solid rgba(255,255,255,0.05) !important;
}
.stDataFrame tbody tr:hover {
background: rgba(218,165,32,0.1) !important;
}
.stDataFrame tbody tr td {
color: #d1d5db !important;
}
.stAlert {
background: rgba(26,26,26,0.9) !important;
border-radius: 12px !important;
border-left: 4px solid #FFD700 !important;
color: #e5e7eb !important;
}
.footer {
background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
border-radius: 20px;
padding: 2.5rem;
text-align: center;
margin-top: 4rem;
color: #9ca3af;
box-shadow: 0 8px 24px rgba(0,0,0,0.4), inset 0 1px 0 rgba(255,255,255,0.05);
border: 2px solid rgba(218,165,32,0.3);
}
.footer-name {
font-weight: 800;
background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
font-size: 1.1rem;
}
.stPlotlyChart, .stPyplot {
background: rgba(26,26,26,0.6) !important;
border-radius: 12px !important;
padding: 1rem !important;
}
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
header {visibility: hidden;}
html {
scroll-behavior: smooth;
}
::-webkit-scrollbar {
width: 10px;
height: 10px;
}
::-webkit-scrollbar-track {
background: #1a1a1a;
}
::-webkit-scrollbar-thumb {
background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
border-radius: 5px;
}
::-webkit-scrollbar-thumb:hover {
background: linear-gradient(135deg, #FFA500 0%, #FFD700 100%);
}
</style>
"""
st.markdown(_THEME_CSS, unsafe_allow_html=True)
| # Utility Functions | |
def load_dataset_from_files():
    """Load the first readable CSV dataset found among a fixed list of paths.

    Candidates are probed in priority order. Each existing file is read as
    UTF-8 first and, if that fails with a decoding error, as Latin-1; malformed
    rows are skipped in both cases. Progress and failures are reported through
    Streamlit messages.

    Returns:
        tuple: ``(DataFrame, path)`` for the first file that loads, or
        ``(None, "")`` when no candidate exists or every read fails.
    """
    # Possible CSV locations, ordered by priority.
    candidate_paths = (
        "Phishing_Email.csv",
        "email_phishing_data.csv",
        "phishing_email.csv",
        "emails.csv",
        "phishing.csv",
        "./Phishing_Email.csv",
        "./email_phishing_data.csv",
        "./phishing_email.csv",
    )

    for candidate in candidate_paths:
        if not os.path.exists(candidate):
            continue

        try:
            st.info(f"π Found: {candidate} | Loading dataset...")
            frame = pd.read_csv(candidate, encoding='utf-8', on_bad_lines='skip')
        except UnicodeDecodeError:
            # Not valid UTF-8: retry once with the permissive Latin-1 codec.
            try:
                frame = pd.read_csv(candidate, encoding='latin-1', on_bad_lines='skip')
            except Exception as err:
                st.warning(f"β οΈ Failed to load {candidate}: {str(err)}")
                continue
        except Exception as err:
            st.warning(f"β οΈ Failed to load {candidate}: {str(err)}")
            continue

        st.success(f"β Successfully loaded dataset from: `{candidate}` ({len(frame)} rows)")
        return frame, candidate

    return None, ""
def safe_read_csv(path):
    """Read a CSV file without letting a read failure propagate.

    Args:
        path: Location handed straight to ``pandas.read_csv``.

    Returns:
        The parsed DataFrame, or an empty DataFrame when reading raises (the
        error is shown to the user via ``st.error``).
    """
    try:
        frame = pd.read_csv(path)
    except Exception as exc:
        st.error(f"Error reading CSV: {str(exc)}")
        frame = pd.DataFrame()
    return frame
def sanitize_input(text):
    """Strip HTML markup from user-supplied email text.

    ``<script>…</script>`` elements are removed together with their content
    (case-insensitively); every other tag is removed but its inner text is
    kept.

    Args:
        text: Raw email content as a string.

    Returns:
        The text with script elements and all remaining tags removed.
    """
    text = re.sub(r'<script.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
    # DOTALL lets a tag that wraps across lines (e.g. "<a\nhref=...>") match;
    # without it such tags were left in the text.
    text = re.sub(r'<.*?>', '', text, flags=re.DOTALL)
    return text
def validate_email_input(text):
    """Check that email text has an analysable length.

    The minimum is measured on the whitespace-stripped text, the maximum on
    the text as given. The minimum is checked first.

    Args:
        text: Email content to validate.

    Returns:
        tuple: ``(True, "")`` when acceptable, otherwise ``(False, reason)``.
    """
    too_short = len(text.strip()) < 10
    too_long = len(text) > 10000

    if too_short:
        problem = "Email content too short for analysis (minimum 10 characters)"
    elif too_long:
        problem = "Email content too long (maximum 10,000 characters)"
    else:
        problem = ""

    return (not problem), problem
def preprocess_text(text):
    """Normalise email text into lowercase alphabetic tokens for vectorising.

    URLs, email addresses, dollar amounts and 16-digit card-like numbers are
    replaced by placeholder words before all remaining non-letters are
    dropped, so those signals survive as tokens.

    Args:
        text: Email content; non-string values are converted with ``str()``.

    Returns:
        The cleaned text with single spaces between tokens and no leading or
        trailing whitespace.
    """
    # Applied strictly in this order: the placeholder rules must run before
    # the non-letter rule erases the digits and symbols they look for.
    substitutions = (
        (r'http\S+|www\S+|https\S+', ' suspiciousurl '),
        (r'\S+@\S+', ' emailaddress '),
        (r'\$\d+', ' moneymention '),
        (r'\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}', ' cardnumber '),
        (r'[^a-z\s]', ' '),
        (r'\s+', ' '),
    )

    cleaned = (text if isinstance(text, str) else str(text)).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def _mentions(text_lower, term):
    """Return True when *term* occurs in *text_lower* as a whole word or phrase.

    A plain ``in`` test also fires inside unrelated words ("pin" in
    "shopping", "won" in "wonderful", "now" in "know"). Lookarounds are used
    instead of ``\\b`` so terms starting with a symbol (e.g. "$1000") still
    match; a short optional suffix keeps simple inflections such as
    "expires"/"expired" matching.
    """
    pattern = r'(?<!\w)' + re.escape(term) + r'(?:s|d|ed|ing)?(?!\w)'
    return re.search(pattern, text_lower) is not None


def calculate_phishing_score(text):
    """Score email text for phishing traits using weighted keyword rules.

    Points are added for high-risk phrases (15 each), financial terms (12
    each), prize-scam terms (18 each), threat words (15 each), an
    urgency-plus-financial combination (25), URLs (20 each, capped at 40), a
    request to enter credentials (20) and a generic greeting (8).

    Args:
        text: Raw email content.

    Returns:
        float: ``score / 200`` capped at 0.99, so the result is in [0, 0.99].
    """
    text_lower = text.lower()

    def hits(terms):
        """Count how many of *terms* are mentioned in the text."""
        return sum(1 for term in terms if _mentions(text_lower, term))

    high_risk = ['verify', 'suspended', 'urgent', 'immediately', 'click here', 'act now',
                 'confirm identity', 'account locked', 'unusual activity', 'security alert',
                 'expire', 'limited time', 'action required', 'update payment', 'validate']
    financial = ['bank', 'credit card', 'password', 'ssn', 'social security', 'paypal',
                 'billing', 'payment', 'account number', 'pin', 'cvv', 'credential']
    prize_scam = ['won', 'winner', 'prize', 'claim now', 'congratulations', 'free money',
                  'inheritance', 'lottery', 'jackpot', 'cash prize', '$1000', '$10000']
    threats = ['locked', 'suspended', 'terminated', 'closed', 'blocked', 'restricted']

    score = 15 * hits(high_risk)
    score += 12 * hits(financial)
    score += 18 * hits(prize_scam)
    score += 15 * hits(threats)

    # Urgency combined with a financial subject is weighted on top of the
    # individual keyword hits.
    if hits(['urgent', 'immediately', 'now', 'expire']) and \
            hits(['account', 'bank', 'payment', 'card']):
        score += 25

    # Each URL adds 20 points, capped at 40; no URL adds nothing.
    url_count = len(re.findall(r'http\S+|www\S+', text, re.IGNORECASE))
    score += min(url_count * 20, 40)

    if re.search(r'\b(enter|provide|submit|update|confirm).{0,20}(password|credential|info|detail)', text_lower):
        score += 20

    if re.search(r'\b(dear customer|dear user|dear member|dear valued)\b', text_lower):
        score += 8

    max_score = 200
    return min(score / max_score, 0.99)
def generate_confusion_matrix_plot(_cm):
    """Render a confusion matrix as a dark-themed PNG heatmap.

    Args:
        _cm: Matrix of integer counts exposing ``.max()``, e.g. the array
            produced by ``confusion_matrix``. Rows are drawn as "Actual",
            columns as "Predicted".

    Returns:
        io.BytesIO: The PNG image, rewound to position 0.
    """
    # The fixed class names only fit a 2x2 matrix; assigning two tick labels
    # to any other shape makes matplotlib raise, so keep default ticks then.
    is_binary = getattr(_cm, "shape", None) == (2, 2)
    buf = io.BytesIO()

    # Scope the dark theme to this figure. plt.style.use() would rewrite the
    # process-wide rcParams and leak into every later plot.
    with plt.style.context('dark_background'):
        fig, ax = plt.subplots(figsize=(5, 4), facecolor='#1a1a1a', dpi=80)
        try:
            ax.set_facecolor('#1a1a1a')
            sns.heatmap(
                _cm,
                annot=True,
                fmt="d",
                ax=ax,
                cmap="YlOrBr",
                cbar=True,
                square=True,
                annot_kws={"size": 14, "weight": "bold", "color": "#0f0f0f"},
                linewidths=1,
                linecolor='#0f0f0f',
                cbar_kws={'label': 'Count', 'shrink': 0.8},
                vmin=0,
                vmax=_cm.max()
            )
            ax.set_xlabel("Predicted", fontsize=10, fontweight='bold', color='#FFD700')
            ax.set_ylabel("Actual", fontsize=10, fontweight='bold', color='#FFD700')
            if is_binary:
                ax.set_xticklabels(["Safe", "Phishing"], fontsize=9, color='#e5e7eb')
                ax.set_yticklabels(["Safe", "Phishing"], fontsize=9, rotation=0, color='#e5e7eb')
            ax.set_title("Confusion Matrix", fontsize=12, fontweight='bold', pad=10, color='#FFD700')

            # Colour-bar styling is cosmetic: skip it if the colour bar cannot
            # be located rather than failing the whole plot.
            try:
                cbar = ax.collections[0].colorbar
                if cbar:
                    cbar.ax.yaxis.set_tick_params(color='#e5e7eb')
                    plt.setp(cbar.ax.get_yticklabels(), color='#e5e7eb')
            except (IndexError, AttributeError):
                pass

            fig.tight_layout()
            fig.savefig(buf, format='png', facecolor='#1a1a1a', dpi=80, bbox_inches='tight')
        finally:
            # Close only this figure (also on failure); closing 'all' would
            # tear down figures owned by other code.
            plt.close(fig)

    buf.seek(0)
    return buf
# Hero Header
_HERO_HTML = """
<div class="hero-container">
<div class="hero-title">π‘οΈ AI Phishing Shield</div>
<div class="hero-subtitle">Advanced Machine Learning Protection Against Email Threats</div>
<div class="hero-description">
Powered by TF-IDF vectorization and Logistic Regression, trained on Kaggle phishing dataset.
80% Training | 20% Testing for maximum accuracy and robustness.
</div>
<div class="hero-badge">β‘ Developed by Umaima Qureshi</div>
</div>
"""
st.markdown(_HERO_HTML, unsafe_allow_html=True)

# Load Dataset from HuggingFace Files
st.markdown('<div class="section-title">π Dataset Configuration</div>', unsafe_allow_html=True)
with st.spinner("π Loading dataset from HuggingFace Files..."):
    df, source = load_dataset_from_files()

# Without a non-empty dataset nothing below can run, so explain and halt.
_dataset_missing = df is None or len(df) == 0
if _dataset_missing:
    st.error("β No dataset found! Please ensure Phishing_Email.csv is uploaded to HuggingFace Files.")
    st.info("π Expected file: 'Phishing_Email.csv' with columns for email text and labels")
    st.stop()

_row_count, _column_count = df.shape
st.info(f"β **Dataset Successfully Loaded** from: `{source}`")
st.write(f"π Dataset shape: {_row_count} rows Γ {_column_count} columns")
# Validate and Prepare Dataset
# Drop the unnamed index column first so that it is not counted as one of
# the two required data columns below.
if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])

required_columns = 2
if len(df.columns) < required_columns or len(df) == 0:
    st.error("β οΈ Invalid dataset format. Please ensure your CSV has email text and labels.")
    st.stop()

# Identify text and label columns: prefer the known names, otherwise fall
# back to the first column for text and the last column for labels.
text_col = "Email Text" if "Email Text" in df.columns else df.columns[0]
label_col = "Email Type" if "Email Type" in df.columns else df.columns[-1]
st.info(f"π Using columns: Text='{text_col}' | Label='{label_col}'")

# Clean dataset: rows whose text is missing or blank are removed.
df[text_col] = df[text_col].fillna("").astype(str)
df = df[df[text_col].str.strip() != ""].reset_index(drop=True)

# Handle labels
label_map = {"Phishing Email": 1, "Safe Email": 0, "Phishing": 1, "Safe": 0, 1: 1, 0: 0}
if pd.api.types.is_numeric_dtype(df[label_col]):
    _mapped_labels = pd.to_numeric(df[label_col], errors="coerce")
else:
    # Compare case- and whitespace-insensitively, and accept digits that
    # were read as text ("1", "0", "1.0", "0.0").
    _normalized_map = {str(key).strip().lower(): value for key, value in label_map.items()}
    _normalized_map.update({"1.0": 1, "0.0": 0})
    _mapped_labels = df[label_col].astype(str).str.strip().str.lower().map(_normalized_map)

# Rows with a missing or unrecognised label are dropped instead of being
# silently counted as "Safe", which would corrupt the training data.
_unlabeled = _mapped_labels.isna()
if _unlabeled.any():
    st.warning(f"Skipping {int(_unlabeled.sum())} rows with missing or unrecognised labels.")
    df = df[~_unlabeled].reset_index(drop=True)
    _mapped_labels = _mapped_labels[~_unlabeled].reset_index(drop=True)
df['label'] = _mapped_labels.astype(int)

if len(df) == 0:
    st.error("No usable rows remain after cleaning. Please check the email text and label columns.")
    st.stop()

# Preprocess text
df['processed_text'] = df[text_col].apply(preprocess_text)
# Dataset Stats
total_count = len(df)
phishing_count = (df['label'] == 1).sum()
safe_count = (df['label'] == 0).sum()
# With no rows the ratio would be 0/0 and render as "nan%"; show 0.0% instead.
threat_rate = (phishing_count / total_count * 100) if total_count else 0.0

st.markdown('<div class="section-title">π Dataset Statistics</div>', unsafe_allow_html=True)
st.markdown(f"""
<div class="stats-grid">
<div class="stat-card">
<div class="stat-value">{total_count}</div>
<div class="stat-label">Total Emails</div>
</div>
<div class="stat-card">
<div class="stat-value">{phishing_count}</div>
<div class="stat-label">Phishing Detected</div>
</div>
<div class="stat-card">
<div class="stat-value">{safe_count}</div>
<div class="stat-label">Safe Emails</div>
</div>
<div class="stat-card">
<div class="stat-value">{threat_rate:.1f}%</div>
<div class="stat-label">Threat Rate</div>
</div>
</div>
""", unsafe_allow_html=True)

# First ten rows of the raw text and label columns, collapsed by default.
with st.expander("π View Dataset Preview", expanded=False):
    st.dataframe(df[[text_col, label_col]].head(10), use_container_width=True)
# Model Training - 80/20 Split
def train_model(processed_texts, labels):
    """Fit the TF-IDF + Logistic Regression classifier and evaluate it.

    The data is split 80% / 20% with a fixed random seed. The vectorizer is
    fitted on the training part only and the metrics are computed on the
    held-out part. The split sizes are written to the Streamlit page.

    Args:
        processed_texts: Sequence of preprocessed email strings.
        labels: Array of integer class labels (1 = phishing, 0 = safe),
            aligned with ``processed_texts``.

    Returns:
        dict with keys ``vectorizer``, ``model``, ``accuracy``,
        ``confusion_matrix`` (always 2x2, classes ordered [0, 1]), ``report``
        (classification report as a dict), ``X_test``, ``y_test`` and
        ``y_pred``.
    """
    # Stratifying needs at least two classes with at least two samples each;
    # otherwise train_test_split raises, so fall back to a plain split.
    class_ids, class_counts = np.unique(labels, return_counts=True)
    can_stratify = len(class_ids) > 1 and class_counts.min() >= 2

    # 80% train, 20% test split
    X_train, X_test, y_train, y_test = train_test_split(
        processed_texts,
        labels,
        test_size=0.2,  # 20% for testing
        random_state=42,
        stratify=labels if can_stratify else None
    )
    st.write(f"π Training set: {len(X_train)} samples (80%)")
    st.write(f"π§ͺ Testing set: {len(X_test)} samples (20%)")

    # Enhanced TF-IDF: unigrams to trigrams, capped at 5000 features.
    vectorizer = TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 3),
        min_df=1,
        max_df=0.95,
        sublinear_tf=True
    )
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Logistic Regression with balanced class weights
    model = LogisticRegression(
        max_iter=2000,
        solver='liblinear',
        class_weight='balanced',
        C=1.0,
        random_state=42
    )
    model.fit(X_train_vec, y_train)

    # Predictions and metrics
    y_pred = model.predict(X_test_vec)
    acc = accuracy_score(y_test, y_pred)
    # Pin the label order so the matrix stays 2x2 even when one class is
    # absent from the test split; the plot labels its axes Safe/Phishing.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    return {
        "vectorizer": vectorizer,
        "model": model,
        "accuracy": acc,
        "confusion_matrix": cm,
        "report": report,
        "X_test": X_test,
        "y_test": y_test,
        "y_pred": y_pred
    }
# Train or retrieve cached model: training runs once per session, later
# reruns of the script reuse the result stored in session state.
if st.session_state.model_trained:
    model_info = st.session_state.model_info
else:
    with st.spinner("π€ Training model with 80/20 split..."):
        model_info = train_model(df['processed_text'].tolist(), df['label'].values)
        st.session_state.model_info = model_info
        st.session_state.model_trained = True
    st.success("β Model trained successfully!")

vectorizer = model_info["vectorizer"]
model = model_info["model"]
accuracy = model_info["accuracy"]

# Precision and recall are read from the report entry for class "1"
# (phishing); both default to 0 when that entry is absent.
_positive_report = model_info["report"].get("1", {})
precision = _positive_report.get("precision", 0)
recall = _positive_report.get("recall", 0)


def _metric_card(caption, value):
    """Return the HTML for one metric tile showing *value* as a percentage."""
    return f"""
<div class="metric-container">
<div style="color: #9ca3af; font-size: 0.85rem; font-weight: 600; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 0.5rem;">{caption}</div>
<div style="font-size: 2.5rem; font-weight: 900; color: #FFD700;">{value:.1%}</div>
</div>
"""


# Model Performance
st.markdown('<div class="section-title">π― Model Performance (20% Test Set)</div>', unsafe_allow_html=True)
_metric_tiles = (("Accuracy", accuracy), ("Precision", precision), ("Recall", recall))
for _column, (_caption, _value) in zip(st.columns(3), _metric_tiles):
    with _column:
        st.markdown(_metric_card(_caption, _value), unsafe_allow_html=True)

# Confusion Matrix Section
with st.expander("π Detailed Metrics & Confusion Matrix"):
    col_matrix, col_report = st.columns([1, 1.5])
    with col_matrix:
        # The rendered image is kept in session state so it is drawn only once.
        if st.session_state.cm_plot_cached is None:
            st.session_state.cm_plot_cached = generate_confusion_matrix_plot(model_info["confusion_matrix"])
        st.image(st.session_state.cm_plot_cached, use_container_width=True)
    with col_report:
        st.markdown("**π Classification Report:**")
        report_df = pd.DataFrame(model_info["report"]).transpose().round(3)
        st.dataframe(report_df, use_container_width=True, height=250)
# Inference UI
# Risk bands, checked from the top: (minimum hybrid probability, background
# gradient, shadow RGB, emoji, label). The last band is the fallback.
_RISK_BANDS = (
    (0.8, "linear-gradient(135deg, #dc2626 0%, #991b1b 100%)", "220, 38, 38", "π¨", "CRITICAL THREAT"),
    (0.6, "linear-gradient(135deg, #ef4444 0%, #dc2626 100%)", "239, 68, 68", "β οΈ", "HIGH RISK"),
    (0.4, "linear-gradient(135deg, #f97316 0%, #ea580c 100%)", "249, 115, 22", "β‘", "MEDIUM RISK"),
    (0.2, "linear-gradient(135deg, #eab308 0%, #ca8a04 100%)", "234, 179, 8", "β οΈ", "LOW RISK"),
    (0.0, "linear-gradient(135deg, #10b981 0%, #059669 100%)", "16, 185, 129", "β ", "SAFE"),
)


def _risk_band(probability):
    """Return ``(gradient, shadow_rgb, emoji, label)`` for a hybrid probability."""
    for floor, gradient, shadow_rgb, icon, label in _RISK_BANDS:
        if probability >= floor:
            return gradient, shadow_rgb, icon, label
    # Nothing matched (e.g. NaN): use the lowest band.
    return _RISK_BANDS[-1][1:]


def _positive_probability(features):
    """Return the model's probability that *features* belongs to class 1."""
    try:
        # Look the column up in classes_ instead of assuming class 1 is
        # stored at index 1.
        positive_col = list(model.classes_).index(1)
        return model.predict_proba(features)[0][positive_col]
    except AttributeError:
        # Estimator without predict_proba: squash the decision value with a
        # logistic function instead.
        decision = model.decision_function(features)[0]
        return 1 / (1 + np.exp(-decision))


def _threat_indicators(raw_text, processed_text):
    """List the human-readable phishing signals found in an email."""
    found = []
    if "suspiciousurl" in processed_text or re.search(r'http\S+|www\S+', raw_text, re.IGNORECASE):
        found.append("π Suspicious URL tokens detected")

    keyword_checks = (
        (r'\b(urgent|immediately|verify|password|suspended|click|act now|action required)\b',
         "β‘ Urgency manipulation tactics"),
        (r'\b(bank|account|verify|login|password|security|credential|paypal)\b',
         "π¦ Financial/security keywords present"),
        (r'\b(winner|prize|congratulations|claim|free|won)\b',
         "π Reward/prize baiting language"),
        (r'\b(confirm|update|validate|unlock|restore)\b',
         "π Account action requests"),
    )
    for pattern, message in keyword_checks:
        if re.search(pattern, raw_text, re.IGNORECASE):
            found.append(message)

    if "cardnumber" in processed_text:
        found.append("π³ Credit card pattern detected")
    if "moneymention" in processed_text:
        found.append("π° Money amount mentioned")
    return found


def _verdict_card(gradient, shadow_rgb, icon, headline, confidence_line, scores_line, bar_percent):
    """Return the HTML for the coloured verdict box and its confidence bar."""
    return f"""
<div class="alert-box" style="background: {gradient}; box-shadow: 0 10px 30px rgba({shadow_rgb}, 0.4), 0 0 50px rgba({shadow_rgb}, 0.2);">
<div style="display: flex; align-items: center; gap: 1rem; margin-bottom: 0.75rem;">
<div style="font-size: 2.5rem;">{icon}</div>
<div>
<div style="font-size: 1.5rem; font-weight: 800; letter-spacing: 0.5px;">{headline}</div>
<div style="font-size: 1.05rem; opacity: 0.95; margin-top: 0.25rem;">{confidence_line}</div>
<div style="font-size: 0.9rem; opacity: 0.85; margin-top: 0.25rem;">{scores_line}</div>
</div>
</div>
<div class="confidence-bar">
<div class="confidence-fill" style="width: {bar_percent}%;"></div>
</div>
</div>
"""


def _analyze_email(email_text):
    """Score one sanitised email, render the verdict and record it in history."""
    processed_input = preprocess_text(email_text)
    input_vec = vectorizer.transform([processed_input])
    ml_proba = _positive_probability(input_vec)
    rule_score = calculate_phishing_score(email_text)

    # Hybrid score: 60% model probability, 40% rule-based score.
    hybrid_proba = (0.6 * ml_proba) + (0.4 * rule_score)
    final_pred = 1 if hybrid_proba > 0.5 else 0
    gradient, shadow_rgb, icon, risk_level = _risk_band(hybrid_proba)

    if final_pred == 1:
        st.markdown(_verdict_card(
            gradient, shadow_rgb, icon,
            f"{risk_level} DETECTED",
            f"Threat Confidence: {hybrid_proba:.1%}",
            f"ML Score: {ml_proba:.1%} | Rule Score: {rule_score:.1%}",
            hybrid_proba * 100,
        ), unsafe_allow_html=True)
        st.markdown("**π Threat Indicators Detected:**")
        for indicator in _threat_indicators(email_text, processed_input):
            st.markdown(f"- {indicator}")
        st.error("π¨ **Recommendation:** Do NOT click any links. Delete this email immediately and report to your IT security team.")
    else:
        # The safe view reports the complement of each score.
        st.markdown(_verdict_card(
            gradient, shadow_rgb, icon,
            "EMAIL APPEARS SAFE",
            f"Safety Confidence: {(1 - hybrid_proba):.1%}",
            f"ML Score: {(1 - ml_proba):.1%} | Rule Score: {(1 - rule_score):.1%}",
            (1 - hybrid_proba) * 100,
        ), unsafe_allow_html=True)
        st.info("π‘ **Best Practice:** Always verify sender identity and be cautious with unexpected emails.")

    st.session_state.analysis_history.append({
        'timestamp': pd.Timestamp.now(),
        'result': 'Phishing' if final_pred == 1 else 'Safe',
        'confidence': f"{hybrid_proba:.2%}",
        # Mark the preview with an ellipsis only when text was actually cut.
        'preview': email_text[:50] + ("..." if len(email_text) > 50 else ""),
    })


st.markdown('<div class="section-title">βοΈ Email Threat Scanner</div>', unsafe_allow_html=True)
col_input, col_hints = st.columns([2, 1])
with col_input:
    email_input = st.text_area(
        "Paste email content for analysis",
        height=280,
        placeholder="Example: Urgent! Your account has been compromised. Click here to verify your identity immediately...",
        help="Paste the full email content including subject and body"
    )
    if st.button("π Analyze Email Threat"):
        if not email_input.strip():
            st.warning("β οΈ Please paste email content to analyze")
        else:
            email_input = sanitize_input(email_input)
            is_valid, error_msg = validate_email_input(email_input)
            if not is_valid:
                st.warning(f"β οΈ {error_msg}")
            else:
                with st.spinner("π Analyzing email threat..."):
                    # UI boundary: report any failure to the user instead of
                    # letting it abort the page.
                    try:
                        _analyze_email(email_input)
                    except Exception as e:
                        st.error(f"β οΈ Analysis failed: {str(e)}")
# Static guidance panel shown in the right-hand column next to the scanner.
_HINTS_HTML = """
<div class="hints-panel">
<div style="font-weight: 700; font-size: 1.15rem; margin-bottom: 1.2rem; color: #f5f5f5;">π§ AI Detection Insights</div>
<div class="hint-item">
<div class="hint-icon">1</div>
<div><strong>Urgency words</strong> like "urgent", "verify" raise red flags</div>
</div>
<div class="hint-item">
<div class="hint-icon">2</div>
<div><strong>Suspicious links</strong> are automatically flagged</div>
</div>
<div class="hint-item">
<div class="hint-icon">3</div>
<div><strong>Financial + urgency</strong> combo indicates high risk</div>
</div>
<div class="hint-item">
<div class="hint-icon">4</div>
<div>Confidence <strong>>70%</strong> warrants caution</div>
</div>
<div class="hint-item">
<div class="hint-icon">β‘</div>
<div><strong>80/20 Split:</strong> Trained on 80%, tested on 20% for accuracy</div>
</div>
</div>
"""
with col_hints:
    st.markdown(_HINTS_HTML, unsafe_allow_html=True)

# Footer
_FOOTER_HTML = """
<div class="footer">
<div style="font-size: 1.2rem; margin-bottom: 0.75rem; font-weight: 700;">
Developed and Deployed by <span class="footer-name">Umaima Qureshi</span>
</div>
<div style="font-size: 1rem; color: #94a3b8; margin-bottom: 1rem; line-height: 1.6;">
π Educational ML-powered email security with 80% training / 20% testing<br>
Trained on Kaggle Phishing Email Dataset from HuggingFace Files
</div>
<div style="margin-top: 1.5rem; padding-top: 1.5rem; border-top: 1px solid rgba(218,165,32,0.2); font-size: 0.9rem; color: #6b7280;">
TF-IDF β’ Logistic Regression β’ Hybrid Detection β’ Scikit-learn β’ Streamlit
</div>
</div>
"""
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)