Spaces:

mimi111222
/

_phisingdector

Sleeping

App Files Files Community

_phisingdector / app.py

mimi111222

Update app.py

6d0cf2b verified 2 months ago

raw

history blame contribute delete

36 kB

	"""
	AI Phishing Email Detector - Premium Black & Gold UI
	TF-IDF + Logistic Regression trained on Kaggle Phishing Emails dataset from HuggingFace Files
	Author & Deployer: Umaima Qureshi
	Modified for HuggingFace Files Support
	"""

	import streamlit as st
	import pandas as pd
	import numpy as np
	import re
	from sklearn.model_selection import train_test_split
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
	import matplotlib
	matplotlib.use('Agg')
	import matplotlib.pyplot as plt
	import seaborn as sns
	import io
	import os

	# Page configuration: wide layout, sidebar collapsed on load
	st.set_page_config(
	    page_title="AI Phishing Shield – by Umaima Qureshi",
	    layout="wide",
	    initial_sidebar_state="collapsed",
	)

	# Seed per-session defaults; keys that already exist are left untouched so
	# values survive reruns of the script.
	_SESSION_DEFAULTS = (
	    ("model_trained", False),
	    ("analysis_history", []),
	    ("cm_plot_cached", None),
	)
	for _state_key, _state_default in _SESSION_DEFAULTS:
	    if _state_key not in st.session_state:
	        st.session_state[_state_key] = _state_default

	# Premium Black & Gold CSS Styling
	st.markdown("""
	<style>
	@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700;800;900&display=swap');

	* {
	font-family: 'Inter', sans-serif;
	}

	.stApp {
	background: linear-gradient(135deg, #0a0a0a 0%, #1a1a1a 50%, #0a0a0a 100%);
	}

	.main {
	background: transparent;
	padding: 0;
	}

	.block-container {
	padding: 2rem 3rem !important;
	max-width: 1400px;
	}

	section[data-testid="stSidebar"] {
	display: none;
	}

	.hero-container {
	background: linear-gradient(135deg, #1a1a1a 0%, #0f0f0f 100%);
	border-radius: 32px;
	padding: 4rem 3rem;
	margin-bottom: 3rem;
	box-shadow: 0 25px 70px rgba(0,0,0,0.6), 0 10px 30px rgba(218,165,32,0.25), inset 0 1px 0 rgba(255,255,255,0.1);
	position: relative;
	overflow: hidden;
	border: 2px solid rgba(218,165,32,0.4);
	}

	.hero-container::before {
	content: '';
	position: absolute;
	top: -50%;
	right: -20%;
	width: 600px;
	height: 600px;
	background: radial-gradient(circle, rgba(218,165,32,0.2) 0%, transparent 70%);
	border-radius: 50%;
	animation: pulse 8s ease-in-out infinite;
	}

	@keyframes pulse {
	0%, 100% { transform: scale(1); opacity: 0.3; }
	50% { transform: scale(1.1); opacity: 0.5; }
	}

	.hero-container::after {
	content: '';
	position: absolute;
	bottom: -30%;
	left: -10%;
	width: 500px;
	height: 500px;
	background: radial-gradient(circle, rgba(255,215,0,0.15) 0%, transparent 70%);
	border-radius: 50%;
	}

	.hero-title {
	font-size: 4.5rem;
	font-weight: 900;
	background: linear-gradient(135deg, #FFD700 0%, #FFA500 50%, #FFD700 100%);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	margin-bottom: 1rem;
	position: relative;
	z-index: 1;
	letter-spacing: -0.03em;
	filter: drop-shadow(0 4px 20px rgba(255,215,0,0.4));
	}

	.hero-subtitle {
	font-size: 1.45rem;
	color: #e5e7eb;
	font-weight: 500;
	margin-bottom: 1.5rem;
	position: relative;
	z-index: 1;
	line-height: 1.6;
	letter-spacing: 0.3px;
	}

	.hero-description {
	color: #d1d5db;
	font-size: 1.05rem;
	line-height: 1.7;
	position: relative;
	z-index: 1;
	max-width: 900px;
	}

	.hero-badge {
	display: inline-block;
	background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
	color: #0f0f0f;
	padding: 0.8rem 2.5rem;
	border-radius: 50px;
	font-size: 1.05rem;
	font-weight: 700;
	margin-top: 1.8rem;
	box-shadow: 0 8px 25px rgba(255,215,0,0.5), 0 0 40px rgba(255,215,0,0.3);
	position: relative;
	z-index: 1;
	transition: all 0.3s ease;
	}

	.hero-badge:hover {
	transform: translateY(-2px);
	box-shadow: 0 12px 35px rgba(255,215,0,0.6), 0 0 50px rgba(255,215,0,0.4);
	}

	.section-title {
	font-size: 2.2rem;
	font-weight: 800;
	background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	margin: 3.5rem 0 2rem 0;
	text-align: center;
	letter-spacing: 0.5px;
	position: relative;
	padding-bottom: 1rem;
	}

	.section-title::after {
	content: '';
	position: absolute;
	bottom: 0;
	left: 50%;
	transform: translateX(-50%);
	width: 100px;
	height: 4px;
	background: linear-gradient(90deg, transparent, #FFD700, transparent);
	border-radius: 2px;
	}

	.stats-grid {
	display: grid;
	grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
	gap: 1.8rem;
	margin: 2.5rem 0;
	}

	.stat-card {
	background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
	padding: 2.5rem 1.8rem;
	border-radius: 24px;
	text-align: center;
	color: #0f0f0f;
	box-shadow: 0 10px 30px rgba(255,215,0,0.35), 0 0 40px rgba(255,215,0,0.2), inset 0 1px 0 rgba(255,255,255,0.3);
	transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
	position: relative;
	overflow: hidden;
	}

	.stat-card:hover {
	transform: translateY(-10px) scale(1.03);
	box-shadow: 0 20px 50px rgba(255,215,0,0.5), 0 0 60px rgba(255,215,0,0.3), inset 0 1px 0 rgba(255,255,255,0.4);
	}

	.stat-value {
	font-size: 3.5rem;
	font-weight: 900;
	margin-bottom: 0.5rem;
	position: relative;
	z-index: 1;
	color: #0f0f0f;
	text-shadow: 0 2px 4px rgba(0,0,0,0.1);
	}

	.stat-label {
	font-size: 0.95rem;
	font-weight: 700;
	opacity: 0.9;
	text-transform: uppercase;
	letter-spacing: 1.8px;
	position: relative;
	z-index: 1;
	color: #0f0f0f;
	}

	.stTextArea textarea {
	border-radius: 18px;
	border: 2px solid rgba(218,165,32,0.35);
	font-size: 1.05rem;
	transition: all 0.3s ease;
	background: rgba(26,26,26,0.8) !important;
	color: #e5e7eb !important;
	padding: 1rem !important;
	line-height: 1.6 !important;
	}

	.stTextArea textarea:focus {
	border-color: #FFD700;
	box-shadow: 0 0 0 4px rgba(255,215,0,0.15);
	background: rgba(26,26,26,0.95) !important;
	}

	.stButton > button {
	background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
	color: #0f0f0f;
	border: none;
	border-radius: 14px;
	padding: 0.9rem 2.8rem;
	font-size: 1.15rem;
	font-weight: 700;
	transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
	box-shadow: 0 4px 15px rgba(255,215,0,0.4), 0 0 30px rgba(255,215,0,0.2);
	width: 100%;
	letter-spacing: 0.5px;
	position: relative;
	overflow: hidden;
	}

	.stButton > button:hover {
	transform: translateY(-3px);
	box-shadow: 0 8px 25px rgba(255,215,0,0.6), 0 0 50px rgba(255,215,0,0.3);
	}

	.stButton > button:active {
	transform: translateY(-1px);
	}

	.alert-box {
	padding: 2rem;
	border-radius: 20px;
	font-size: 1.1rem;
	font-weight: 600;
	margin: 1.5rem 0;
	border: 2px solid rgba(255,255,255,0.1);
	color: white;
	}

	.confidence-bar {
	height: 14px;
	background: rgba(255,255,255,0.25);
	border-radius: 12px;
	overflow: hidden;
	margin-top: 1rem;
	box-shadow: inset 0 2px 4px rgba(0,0,0,0.2);
	}

	.confidence-fill {
	height: 100%;
	background: rgba(255,255,255,0.95);
	border-radius: 12px;
	transition: width 1.2s cubic-bezier(0.4, 0, 0.2, 1);
	box-shadow: 0 0 10px rgba(255,255,255,0.5);
	}

	.hints-panel {
	background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
	border-radius: 20px;
	padding: 2rem;
	border-left: 5px solid #FFD700;
	box-shadow: 0 4px 15px rgba(0,0,0,0.4), inset 0 1px 0 rgba(255,255,255,0.05);
	backdrop-filter: blur(10px);
	}

	.hint-item {
	display: flex;
	align-items: start;
	gap: 1rem;
	margin-bottom: 1.2rem;
	font-size: 0.98rem;
	color: #d1d5db;
	line-height: 1.6;
	}

	.hint-icon {
	min-width: 28px;
	height: 28px;
	background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
	color: #0f0f0f;
	border-radius: 50%;
	display: flex;
	align-items: center;
	justify-content: center;
	font-size: 0.8rem;
	font-weight: 800;
	box-shadow: 0 2px 8px rgba(255,215,0,0.4);
	}

	.metric-container {
	background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
	padding: 1.8rem;
	border-radius: 16px;
	border-left: 5px solid #FFD700;
	box-shadow: 0 4px 12px rgba(0,0,0,0.4), inset 0 1px 0 rgba(255,255,255,0.05);
	transition: all 0.3s ease;
	}

	.metric-container:hover {
	transform: translateY(-2px);
	box-shadow: 0 6px 18px rgba(0,0,0,0.5), inset 0 1px 0 rgba(255,255,255,0.08);
	}

	.stFileUploader {
	border: 2px dashed rgba(218,165,32,0.45);
	border-radius: 18px;
	padding: 2rem;
	background: rgba(26,26,26,0.6);
	transition: all 0.3s ease;
	}

	.stFileUploader:hover {
	border-color: #FFD700;
	background: rgba(218,165,32,0.12);
	box-shadow: 0 0 20px rgba(255,215,0,0.15);
	}

	.streamlit-expanderHeader {
	background: linear-gradient(135deg, rgba(218,165,32,0.2) 0%, rgba(218,165,32,0.1) 100%) !important;
	border-radius: 14px !important;
	font-weight: 700 !important;
	color: #f5f5f5 !important;
	border: 1px solid rgba(218,165,32,0.3) !important;
	padding: 1rem 1.5rem !important;
	transition: all 0.3s ease !important;
	}

	.streamlit-expanderHeader:hover {
	background: linear-gradient(135deg, rgba(218,165,32,0.25) 0%, rgba(218,165,32,0.15) 100%) !important;
	border-color: rgba(218,165,32,0.5) !important;
	}

	.stDataFrame {
	background: rgba(26,26,26,0.95) !important;
	border-radius: 12px !important;
	overflow: hidden !important;
	}

	.stDataFrame table {
	background: rgba(26,26,26,0.95) !important;
	color: #e5e7eb !important;
	}

	.stDataFrame thead tr th {
	background: rgba(218,165,32,0.2) !important;
	color: #FFD700 !important;
	font-weight: 700 !important;
	border-bottom: 2px solid rgba(218,165,32,0.4) !important;
	}

	.stDataFrame tbody tr {
	background: rgba(26,26,26,0.8) !important;
	border-bottom: 1px solid rgba(255,255,255,0.05) !important;
	}

	.stDataFrame tbody tr:hover {
	background: rgba(218,165,32,0.1) !important;
	}

	.stDataFrame tbody tr td {
	color: #d1d5db !important;
	}

	.stAlert {
	background: rgba(26,26,26,0.9) !important;
	border-radius: 12px !important;
	border-left: 4px solid #FFD700 !important;
	color: #e5e7eb !important;
	}

	.footer {
	background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
	border-radius: 20px;
	padding: 2.5rem;
	text-align: center;
	margin-top: 4rem;
	color: #9ca3af;
	box-shadow: 0 8px 24px rgba(0,0,0,0.4), inset 0 1px 0 rgba(255,255,255,0.05);
	border: 2px solid rgba(218,165,32,0.3);
	}

	.footer-name {
	font-weight: 800;
	background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	font-size: 1.1rem;
	}

	.stPlotlyChart, .stPyplot {
	background: rgba(26,26,26,0.6) !important;
	border-radius: 12px !important;
	padding: 1rem !important;
	}

	#MainMenu {visibility: hidden;}
	footer {visibility: hidden;}
	header {visibility: hidden;}

	html {
	scroll-behavior: smooth;
	}

	::-webkit-scrollbar {
	width: 10px;
	height: 10px;
	}

	::-webkit-scrollbar-track {
	background: #1a1a1a;
	}

	::-webkit-scrollbar-thumb {
	background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
	border-radius: 5px;
	}

	::-webkit-scrollbar-thumb:hover {
	background: linear-gradient(135deg, #FFA500 0%, #FFD700 100%);
	}
	</style>
	""", unsafe_allow_html=True)

	# Utility Functions
	def load_dataset_from_files():
	    """Load the first readable phishing CSV found in the working directory.

	    Candidate file names are tried in priority order. Each existing file is
	    read as UTF-8 first and as Latin-1 if UTF-8 decoding fails; malformed
	    rows are skipped. Progress and failures are reported through Streamlit
	    status widgets.

	    Returns:
	        tuple: ``(df, source)`` where ``df`` is the loaded DataFrame and
	        ``source`` the path it came from, or ``(None, "")`` when no
	        candidate could be loaded.
	    """
	    # Ordered by priority. "./name" variants are omitted on purpose: they
	    # resolve to the same files as the bare names and would only repeat a
	    # load that has already failed.
	    candidate_paths = [
	        "Phishing_Email.csv",
	        "email_phishing_data.csv",
	        "phishing_email.csv",
	        "emails.csv",
	        "phishing.csv",
	    ]

	    for path in candidate_paths:
	        if not os.path.exists(path):
	            continue

	        st.info(f"📂 Found: {path} | Loading dataset...")
	        for encoding in ("utf-8", "latin-1"):
	            try:
	                df = pd.read_csv(path, encoding=encoding, on_bad_lines='skip')
	            except UnicodeDecodeError:
	                # Wrong encoding guess: fall through to the next one.
	                continue
	            except Exception as e:
	                st.warning(f"⚠️ Failed to load {path}: {str(e)}")
	                break
	            st.success(f"✅ Successfully loaded dataset from: `{path}` ({len(df)} rows)")
	            return df, path
	        else:
	            # Every encoding was rejected; move on to the next candidate.
	            st.warning(f"⚠️ Failed to load {path}: could not decode file")

	    return None, ""

	def safe_read_csv(path):
	    """Read a CSV with pandas, returning an empty DataFrame on any failure.

	    The failure reason is shown to the user through ``st.error`` instead of
	    being raised.
	    """
	    try:
	        frame = pd.read_csv(path)
	    except Exception as e:
	        st.error(f"Error reading CSV: {str(e)}")
	        frame = pd.DataFrame()
	    return frame

	def sanitize_input(text):
	    """Strip ``<script>`` blocks and any remaining HTML tags from *text*.

	    Script elements are removed together with their content
	    (case-insensitively, across line breaks); every other tag is removed
	    while its inner text is kept.

	    Args:
	        text: Raw user-supplied string.

	    Returns:
	        The text with markup removed.
	    """
	    # Flags must be combined with the bitwise-or operator.
	    text = re.sub(r'<script.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
	    # DOTALL so that tags broken across lines are stripped as well.
	    text = re.sub(r'<.*?>', '', text, flags=re.DOTALL)
	    return text

	def validate_email_input(text):
	    """Check that the email text has a length suitable for analysis.

	    Returns:
	        tuple: ``(is_valid, message)``; ``message`` is empty when valid.
	        The minimum is measured on the stripped text, the maximum on the
	        text as given.
	    """
	    if len(text.strip()) < 10:
	        problem = "Email content too short for analysis (minimum 10 characters)"
	    elif len(text) > 10000:
	        problem = "Email content too long (maximum 10,000 characters)"
	    else:
	        problem = ""
	    return (not problem), problem

	def preprocess_text(text):
	    """Normalise email text into lowercase word tokens for vectorisation.

	    URLs, email addresses, dollar amounts and 16-digit card-like numbers are
	    replaced by the placeholder tokens ``suspiciousurl``, ``emailaddress``,
	    ``moneymention`` and ``cardnumber`` so they survive the removal of
	    non-letter characters that follows.

	    Args:
	        text: Email content; non-string values are converted with ``str``.

	    Returns:
	        Lowercase text containing only ``a-z`` words separated by single
	        spaces.
	    """
	    if not isinstance(text, str):
	        text = str(text)
	    text = text.lower()
	    # Unescaped "|" is regex alternation; an escaped one would only match a
	    # literal pipe character and no URL would ever be tokenised.
	    text = re.sub(r'http\S+|www\S+|https\S+', ' suspiciousurl ', text)
	    text = re.sub(r'\S+@\S+', ' emailaddress ', text)
	    text = re.sub(r'\$\d+', ' moneymention ', text)
	    text = re.sub(r'\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}', ' cardnumber ', text)
	    # Everything that is not a lowercase letter or whitespace becomes a space.
	    text = re.sub(r'[^a-z\s]', ' ', text)
	    text = re.sub(r'\s+', ' ', text).strip()
	    return text

	def calculate_phishing_score(text):
	    """Score an email with keyword and pattern heuristics.

	    Points are added for high-risk phrases, financial terms, prize-scam
	    wording, an urgency-plus-financial combination, URLs (capped), requests
	    to enter credentials, account-threat words and generic greetings. The
	    total is divided by 200 and capped at 0.99.

	    Args:
	        text: Email content as entered by the user.

	    Returns:
	        float: Rule-based phishing probability in ``[0, 0.99]``.
	    """
	    score = 0
	    text_lower = text.lower()

	    # NOTE(review): keyword checks are substring matches, so short entries
	    # such as 'pin', 'won' or 'now' also fire inside longer words.
	    high_risk = ['verify', 'suspended', 'urgent', 'immediately', 'click here', 'act now',
	                 'confirm identity', 'account locked', 'unusual activity', 'security alert',
	                 'expire', 'limited time', 'action required', 'update payment', 'validate']
	    score += sum(15 for word in high_risk if word in text_lower)

	    financial = ['bank', 'credit card', 'password', 'ssn', 'social security', 'paypal',
	                 'billing', 'payment', 'account number', 'pin', 'cvv', 'credential']
	    score += sum(12 for word in financial if word in text_lower)

	    prize_scam = ['won', 'winner', 'prize', 'claim now', 'congratulations', 'free money',
	                  'inheritance', 'lottery', 'jackpot', 'cash prize', '$1000', '$10000']
	    score += sum(18 for word in prize_scam if word in text_lower)

	    # Urgency combined with a financial term is weighted on top of the
	    # individual keyword hits.
	    has_urgency = any(urg in text_lower for urg in ['urgent', 'immediately', 'now', 'expire'])
	    has_financial = any(fin in text_lower for fin in ['account', 'bank', 'payment', 'card'])
	    if has_urgency and has_financial:
	        score += 25

	    # 20 points per URL, at most 40. The alternation must be an unescaped "|".
	    url_count = len(re.findall(r'http\S+|www\S+', text, re.IGNORECASE))
	    score += min(url_count * 20, 40)

	    # Request verb followed within 20 characters by a credential noun.
	    if re.search(r'\b(enter|provide|submit|update|confirm).{0,20}(password|credential|info|detail)', text_lower):
	        score += 20

	    threats = ['locked', 'suspended', 'terminated', 'closed', 'blocked', 'restricted']
	    score += sum(15 for word in threats if word in text_lower)

	    if re.search(r'\b(dear customer|dear user|dear member|dear valued)\b', text_lower):
	        score += 8

	    max_score = 200
	    probability = min(score / max_score, 0.99)

	    return probability

	def generate_confusion_matrix_plot(_cm):
	    """Render a confusion matrix heatmap as an in-memory PNG.

	    Args:
	        _cm: Confusion matrix of integer counts (annotated with the ``"d"``
	            format and queried with ``.max()``). The tick labels are fixed
	            to the two classes "Safe" and "Phishing".

	    Returns:
	        io.BytesIO: PNG image data, positioned at the start of the buffer.
	    """
	    # Apply the dark style only while drawing so the global matplotlib style
	    # is not changed for other figures.
	    with plt.style.context('dark_background'):
	        fig, ax = plt.subplots(figsize=(5, 4), facecolor='#1a1a1a', dpi=80)
	        try:
	            ax.set_facecolor('#1a1a1a')

	            sns.heatmap(
	                _cm,
	                annot=True,
	                fmt="d",
	                ax=ax,
	                cmap="YlOrBr",
	                cbar=True,
	                square=True,
	                annot_kws={"size": 14, "weight": "bold", "color": "#0f0f0f"},
	                linewidths=1,
	                linecolor='#0f0f0f',
	                cbar_kws={'label': 'Count', 'shrink': 0.8},
	                vmin=0,
	                vmax=_cm.max()
	            )

	            ax.set_xlabel("Predicted", fontsize=10, fontweight='bold', color='#FFD700')
	            ax.set_ylabel("Actual", fontsize=10, fontweight='bold', color='#FFD700')
	            ax.set_xticklabels(["Safe", "Phishing"], fontsize=9, color='#e5e7eb')
	            ax.set_yticklabels(["Safe", "Phishing"], fontsize=9, rotation=0, color='#e5e7eb')
	            ax.set_title("Confusion Matrix", fontsize=12, fontweight='bold', pad=10, color='#FFD700')

	            # Colour-bar tick styling is cosmetic and best-effort: a failure
	            # here must not prevent the plot from rendering. Catching
	            # Exception (not a bare except) lets KeyboardInterrupt and
	            # SystemExit propagate.
	            try:
	                cbar = ax.collections[0].colorbar
	                if cbar:
	                    cbar.ax.yaxis.set_tick_params(color='#e5e7eb')
	                    plt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color='#e5e7eb')
	            except Exception:
	                pass

	            fig.tight_layout()
	            buf = io.BytesIO()
	            fig.savefig(buf, format='png', facecolor='#1a1a1a', dpi=80, bbox_inches='tight')
	        finally:
	            # Close only this figure, even when rendering fails; closing
	            # 'all' would also destroy figures owned by other code.
	            plt.close(fig)

	    buf.seek(0)
	    return buf

	# Hero Header: static banner markup. The separator is a plain "|"; a
	# backslash before it is not a valid Python escape and would be shown as-is.
	st.markdown("""
	<div class="hero-container">
	<div class="hero-title">🛡️ AI Phishing Shield</div>
	<div class="hero-subtitle">Advanced Machine Learning Protection Against Email Threats</div>
	<div class="hero-description">
	Powered by TF-IDF vectorization and Logistic Regression, trained on Kaggle phishing dataset.
	80% Training | 20% Testing for maximum accuracy and robustness.
	</div>
	<div class="hero-badge">⚡ Developed by Umaima Qureshi</div>
	</div>
	""", unsafe_allow_html=True)

	# Load Dataset from HuggingFace Files
	st.markdown('<div class="section-title">📂 Dataset Configuration</div>', unsafe_allow_html=True)

	with st.spinner("🔄 Loading dataset from HuggingFace Files..."):
	    df, source = load_dataset_from_files()

	if df is None or len(df) == 0:
	    st.error("❌ No dataset found! Please ensure Phishing_Email.csv is uploaded to HuggingFace Files.")
	    st.info("📝 Expected file: 'Phishing_Email.csv' with columns for email text and labels")
	    st.stop()

	st.info(f"✅ Dataset Successfully Loaded from: `{source}`")
	st.write(f"📊 Dataset shape: {df.shape[0]} rows × {df.shape[1]} columns")

	# Drop the unnamed index column BEFORE counting columns, otherwise a file
	# with only an index and one real column would pass validation.
	if "Unnamed: 0" in df.columns:
	    df = df.drop(columns=["Unnamed: 0"])

	# Validate dataset shape: one text column and one label column are needed
	required_columns = 2
	if len(df.columns) < required_columns:
	    st.error("⚠️ Invalid dataset format. Please ensure your CSV has email text and labels.")
	    st.stop()

	# Identify text and label columns (fall back to first / last column)
	text_col = "Email Text" if "Email Text" in df.columns else df.columns[0]
	label_col = "Email Type" if "Email Type" in df.columns else df.columns[-1]

	st.info(f"📌 Using columns: Text='{text_col}' | Label='{label_col}'")

	# Clean dataset: remove rows whose text is missing or blank
	df[text_col] = df[text_col].fillna("").astype(str)
	df = df[df[text_col].str.strip() != ""].reset_index(drop=True)

	# Handle labels: map to 1 (phishing) / 0 (safe)
	label_map = {"Phishing Email": 1, "Safe Email": 0, "Phishing": 1, "Safe": 0, 1: 1, 0: 0}
	if df[label_col].dtype == object:
	    # Tolerate stray whitespace around textual labels before mapping.
	    df['label'] = df[label_col].map(lambda v: v.strip() if isinstance(v, str) else v).map(label_map)
	else:
	    df['label'] = pd.to_numeric(df[label_col], errors='coerce')

	# Rows with missing or unrecognised labels are dropped rather than silently
	# counted as "safe", which would teach the model wrong examples.
	unlabeled_rows = df['label'].isna()
	if unlabeled_rows.any():
	    st.warning(f"⚠️ Dropped {int(unlabeled_rows.sum())} rows with missing or unrecognized labels.")
	    df = df[~unlabeled_rows].reset_index(drop=True)
	df['label'] = df['label'].astype(int)

	# Nothing left to train on (also avoids a division by zero in the stats)
	if len(df) == 0:
	    st.error("⚠️ Invalid dataset format. Please ensure your CSV has email text and labels.")
	    st.stop()

	# Preprocess text
	df['processed_text'] = df[text_col].apply(preprocess_text)

	# Dataset Stats
	phishing_count = (df['label'] == 1).sum()
	safe_count = (df['label'] == 0).sum()
	total_count = len(df)

	st.markdown('<div class="section-title">📊 Dataset Statistics</div>', unsafe_allow_html=True)

	st.markdown(f"""
	<div class="stats-grid">
	<div class="stat-card">
	<div class="stat-value">{total_count}</div>
	<div class="stat-label">Total Emails</div>
	</div>
	<div class="stat-card">
	<div class="stat-value">{phishing_count}</div>
	<div class="stat-label">Phishing Detected</div>
	</div>
	<div class="stat-card">
	<div class="stat-value">{safe_count}</div>
	<div class="stat-label">Safe Emails</div>
	</div>
	<div class="stat-card">
	<div class="stat-value">{(phishing_count/total_count*100):.1f}%</div>
	<div class="stat-label">Threat Rate</div>
	</div>
	</div>
	""", unsafe_allow_html=True)

	with st.expander("🔍 View Dataset Preview", expanded=False):
	    st.dataframe(df[[text_col, label_col]].head(10), use_container_width=True)

	# Model Training - 80/20 Split
	@st.cache_resource
	def train_model(processed_texts, labels):
	    """Train a TF-IDF + Logistic Regression classifier on an 80/20 split.

	    Args:
	        processed_texts: Sequence of preprocessed email strings.
	        labels: Array of integer class labels aligned with
	            ``processed_texts``.

	    Returns:
	        dict: The fitted ``vectorizer`` and ``model``, the test-set
	        ``accuracy``, ``confusion_matrix`` and classification ``report``
	        (as a dict), plus ``X_test``, ``y_test`` and ``y_pred``.
	    """
	    # Stratify only when it can work: there must be more than one class and
	    # every class needs at least two samples, otherwise train_test_split
	    # raises instead of splitting.
	    _, class_counts = np.unique(labels, return_counts=True)
	    can_stratify = len(class_counts) > 1 and class_counts.min() >= 2

	    # 80% train, 20% test split
	    X_train, X_test, y_train, y_test = train_test_split(
	        processed_texts,
	        labels,
	        test_size=0.2,  # 20% for testing
	        random_state=42,
	        stratify=labels if can_stratify else None
	    )

	    st.write(f"📈 Training set: {len(X_train)} samples (80%)")
	    st.write(f"🧪 Testing set: {len(X_test)} samples (20%)")

	    # TF-IDF over word 1- to 3-grams; the vocabulary is fitted on the
	    # training split only and then applied to the test split.
	    vectorizer = TfidfVectorizer(
	        max_features=5000,
	        ngram_range=(1, 3),
	        min_df=1,
	        max_df=0.95,
	        sublinear_tf=True
	    )
	    X_train_vec = vectorizer.fit_transform(X_train)
	    X_test_vec = vectorizer.transform(X_test)

	    # Logistic Regression with balanced class weights
	    model = LogisticRegression(
	        max_iter=2000,
	        solver='liblinear',
	        class_weight='balanced',
	        C=1.0,
	        random_state=42
	    )
	    model.fit(X_train_vec, y_train)

	    # Predictions and metrics on the held-out 20%
	    y_pred = model.predict(X_test_vec)
	    acc = accuracy_score(y_test, y_pred)
	    cm = confusion_matrix(y_test, y_pred)
	    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

	    return {
	        "vectorizer": vectorizer,
	        "model": model,
	        "accuracy": acc,
	        "confusion_matrix": cm,
	        "report": report,
	        "X_test": X_test,
	        "y_test": y_test,
	        "y_pred": y_pred
	    }

	# Reuse the model stored for this session, or train it on first run
	if st.session_state.model_trained:
	    model_info = st.session_state.model_info
	else:
	    with st.spinner("🤖 Training model with 80/20 split..."):
	        model_info = train_model(df['processed_text'].tolist(), df['label'].values)
	        st.session_state.model_info = model_info
	        st.session_state.model_trained = True
	    st.success("✅ Model trained successfully!")

	# Pull out the pieces the rest of the page works with
	vectorizer, model, accuracy = (
	    model_info[_part] for _part in ("vectorizer", "model", "accuracy")
	)

	# Model Performance
	st.markdown('<div class="section-title">🎯 Model Performance (20% Test Set)</div>', unsafe_allow_html=True)

	col1, col2, col3 = st.columns(3)

	# Precision and recall are read from the report entry keyed "1"; both
	# default to 0 when that entry is absent.
	_class_one_report = model_info["report"].get("1", {})
	precision = _class_one_report.get("precision", 0)
	recall = _class_one_report.get("recall", 0)

	# One identical card per headline metric
	for _card_col, _card_title, _card_value in (
	    (col1, "Accuracy", accuracy),
	    (col2, "Precision", precision),
	    (col3, "Recall", recall),
	):
	    with _card_col:
	        st.markdown(f"""
	<div class="metric-container">
	<div style="color: #9ca3af; font-size: 0.85rem; font-weight: 600; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 0.5rem;">{_card_title}</div>
	<div style="font-size: 2.5rem; font-weight: 900; color: #FFD700;">{_card_value:.1%}</div>
	</div>
	""", unsafe_allow_html=True)

	# Confusion Matrix Section
	with st.expander("📈 Detailed Metrics & Confusion Matrix"):
	    col_matrix, col_report = st.columns([1, 1.5])

	    with col_matrix:
	        # Render the plot once per session and reuse the stored image after that
	        if st.session_state.cm_plot_cached is None:
	            _cm_image = generate_confusion_matrix_plot(model_info["confusion_matrix"])
	            st.session_state.cm_plot_cached = _cm_image
	        st.image(st.session_state.cm_plot_cached, use_container_width=True)

	    with col_report:
	        st.markdown("📊 Classification Report:")
	        _report_table = pd.DataFrame(model_info["report"])
	        report_df = _report_table.transpose().round(3)
	        st.dataframe(report_df, use_container_width=True, height=250)

	# Inference UI
	st.markdown('<div class="section-title">✉️ Email Threat Scanner</div>', unsafe_allow_html=True)

	col_input, col_hints = st.columns([2, 1])

	# Risk tiers, highest first: (minimum hybrid probability, background gradient,
	# shadow RGB triple, emoji, label). The last tier catches everything below 0.2.
	_RISK_TIERS = (
	    (0.8, "linear-gradient(135deg, #dc2626 0%, #991b1b 100%)", "220, 38, 38", "🚨", "CRITICAL THREAT"),
	    (0.6, "linear-gradient(135deg, #ef4444 0%, #dc2626 100%)", "239, 68, 68", "⚠️", "HIGH RISK"),
	    (0.4, "linear-gradient(135deg, #f97316 0%, #ea580c 100%)", "249, 115, 22", "⚡", "MEDIUM RISK"),
	    (0.2, "linear-gradient(135deg, #eab308 0%, #ca8a04 100%)", "234, 179, 8", "⚠️", "LOW RISK"),
	    (float("-inf"), "linear-gradient(135deg, #10b981 0%, #059669 100%)", "16, 185, 129", "✅", "SAFE"),
	)

	# Keyword families reported as indicators for a flagged email. The
	# alternations use an unescaped "|"; an escaped one only matches a literal
	# pipe and the indicator would never fire.
	_INDICATOR_PATTERNS = (
	    (r'\b(urgent|immediately|verify|password|suspended|click|act now|action required)\b',
	     "⚡ Urgency manipulation tactics"),
	    (r'\b(bank|account|verify|login|password|security|credential|paypal)\b',
	     "🏦 Financial/security keywords present"),
	    (r'\b(winner|prize|congratulations|claim|free|won)\b',
	     "🎁 Reward/prize baiting language"),
	    (r'\b(confirm|update|validate|unlock|restore)\b',
	     "🔐 Account action requests"),
	)

	with col_input:
	    email_input = st.text_area(
	        "Paste email content for analysis",
	        height=280,
	        placeholder="Example: Urgent! Your account has been compromised. Click here to verify your identity immediately...",
	        help="Paste the full email content including subject and body"
	    )

	    if st.button("🔍 Analyze Email Threat"):
	        if not email_input.strip():
	            st.warning("⚠️ Please paste email content to analyze")
	        else:
	            # NOTE(review): tags are stripped before scoring, so URLs that
	            # only appear inside href attributes are not seen by the model
	            # or the rules - confirm this is intended.
	            email_input = sanitize_input(email_input)
	            is_valid, error_msg = validate_email_input(email_input)

	            if not is_valid:
	                st.warning(f"⚠️ {error_msg}")
	            else:
	                with st.spinner("🔍 Analyzing email threat..."):
	                    try:
	                        processed_input = preprocess_text(email_input)
	                        input_vec = vectorizer.transform([processed_input])

	                        # Probability of class 1; fall back to a sigmoid of
	                        # the decision value if predict_proba is unavailable.
	                        try:
	                            ml_proba = model.predict_proba(input_vec)[0][1]
	                        except AttributeError:
	                            decision = model.decision_function(input_vec)[0]
	                            ml_proba = 1 / (1 + np.exp(-decision))

	                        # Hybrid verdict: 60% model probability, 40% rule score
	                        rule_score = calculate_phishing_score(email_input)
	                        hybrid_proba = (0.6 * ml_proba) + (0.4 * rule_score)
	                        final_pred = 1 if hybrid_proba > 0.5 else 0

	                        # Dynamic color coding: first tier whose threshold is met
	                        _, alert_gradient, shadow_color, emoji, risk_level = next(
	                            (tier for tier in _RISK_TIERS if hybrid_proba >= tier[0]),
	                            _RISK_TIERS[-1],
	                        )

	                        # Both verdicts share one alert box; the safe verdict
	                        # shows the complementary (safety) percentages.
	                        if final_pred == 1:
	                            headline = f"{risk_level} DETECTED"
	                            confidence_label = "Threat Confidence"
	                            shown_proba, shown_ml, shown_rule = hybrid_proba, ml_proba, rule_score
	                        else:
	                            headline = "EMAIL APPEARS SAFE"
	                            confidence_label = "Safety Confidence"
	                            shown_proba, shown_ml, shown_rule = 1 - hybrid_proba, 1 - ml_proba, 1 - rule_score
	                        conf_pct = f"{shown_proba:.1%}"

	                        st.markdown(f"""
	<div class="alert-box" style="background: {alert_gradient}; box-shadow: 0 10px 30px rgba({shadow_color}, 0.4), 0 0 50px rgba({shadow_color}, 0.2);">
	<div style="display: flex; align-items: center; gap: 1rem; margin-bottom: 0.75rem;">
	<div style="font-size: 2.5rem;">{emoji}</div>
	<div>
	<div style="font-size: 1.5rem; font-weight: 800; letter-spacing: 0.5px;">{headline}</div>
	<div style="font-size: 1.05rem; opacity: 0.95; margin-top: 0.25rem;">{confidence_label}: {conf_pct}</div>
	<div style="font-size: 0.9rem; opacity: 0.85; margin-top: 0.25rem;">ML Score: {shown_ml:.1%} | Rule Score: {shown_rule:.1%}</div>
	</div>
	</div>
	<div class="confidence-bar">
	<div class="confidence-fill" style="width: {shown_proba*100}%;"></div>
	</div>
	</div>
	""", unsafe_allow_html=True)

	                        if final_pred == 1:
	                            st.markdown("🔍 Threat Indicators Detected:")
	                            indicators = []
	                            if "suspiciousurl" in processed_input or re.search(r'http\S+|www\S+', email_input, re.IGNORECASE):
	                                indicators.append("🔗 Suspicious URL tokens detected")
	                            indicators.extend(
	                                message
	                                for pattern, message in _INDICATOR_PATTERNS
	                                if re.search(pattern, email_input, re.IGNORECASE)
	                            )
	                            if "cardnumber" in processed_input:
	                                indicators.append("💳 Credit card pattern detected")
	                            if "moneymention" in processed_input:
	                                indicators.append("💰 Money amount mentioned")

	                            for indicator in indicators:
	                                st.markdown(f"- {indicator}")

	                            st.error("🚨 Recommendation: Do NOT click any links. Delete this email immediately and report to your IT security team.")
	                        else:
	                            st.info("💡 Best Practice: Always verify sender identity and be cautious with unexpected emails.")

	                        # Keep a per-session record of the verdict
	                        st.session_state.analysis_history.append({
	                            'timestamp': pd.Timestamp.now(),
	                            'result': 'Phishing' if final_pred == 1 else 'Safe',
	                            'confidence': f"{hybrid_proba:.2%}",
	                            'preview': email_input[:50] + "..."
	                        })

	                    except Exception as e:
	                        # UI boundary: report the failure instead of crashing the page
	                        st.error(f"⚠️ Analysis failed: {str(e)}")

	# Static explainer panel rendered next to the scanner input
	_HINTS_PANEL_HTML = """
	<div class="hints-panel">
	<div style="font-weight: 700; font-size: 1.15rem; margin-bottom: 1.2rem; color: #f5f5f5;">🧠 AI Detection Insights</div>

	<div class="hint-item">
	<div class="hint-icon">1</div>
	<div><strong>Urgency words</strong> like "urgent", "verify" raise red flags</div>
	</div>

	<div class="hint-item">
	<div class="hint-icon">2</div>
	<div><strong>Suspicious links</strong> are automatically flagged</div>
	</div>

	<div class="hint-item">
	<div class="hint-icon">3</div>
	<div><strong>Financial + urgency</strong> combo indicates high risk</div>
	</div>

	<div class="hint-item">
	<div class="hint-icon">4</div>
	<div>Confidence <strong>>70%</strong> warrants caution</div>
	</div>

	<div class="hint-item">
	<div class="hint-icon">⚡</div>
	<div><strong>80/20 Split:</strong> Trained on 80%, tested on 20% for accuracy</div>
	</div>
	</div>
	"""

	with col_hints:
	    st.markdown(_HINTS_PANEL_HTML, unsafe_allow_html=True)

	# Footer: static credits and technology list shown at the bottom of the page
	_FOOTER_HTML = """
	<div class="footer">
	<div style="font-size: 1.2rem; margin-bottom: 0.75rem; font-weight: 700;">
	Developed and Deployed by <span class="footer-name">Umaima Qureshi</span>
	</div>
	<div style="font-size: 1rem; color: #94a3b8; margin-bottom: 1rem; line-height: 1.6;">
	🎓 Educational ML-powered email security with 80% training / 20% testing<br>
	Trained on Kaggle Phishing Email Dataset from HuggingFace Files
	</div>
	<div style="margin-top: 1.5rem; padding-top: 1.5rem; border-top: 1px solid rgba(218,165,32,0.2); font-size: 0.9rem; color: #6b7280;">
	TF-IDF • Logistic Regression • Hybrid Detection • Scikit-learn • Streamlit
	</div>
	</div>
	"""
	st.markdown(_FOOTER_HTML, unsafe_allow_html=True)