# _phisingdector / app.py
# mimi111222's picture
# Update app.py
# 6d0cf2b verified
"""
AI Phishing Email Detector - Premium Black & Gold UI
TF-IDF + Logistic Regression trained on Kaggle Phishing Emails dataset from HuggingFace Files
Author & Deployer: Umaima Qureshi
Modified for HuggingFace Files Support
"""
import streamlit as st
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
import io
import os
# Page configuration: wide layout, sidebar collapsed on load
_page_settings = {
    "page_title": "AI Phishing Shield – by Umaima Qureshi",
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**_page_settings)

# Seed session state once; keys that already exist keep their current value
_session_defaults = (
    ("model_trained", False),
    ("analysis_history", []),
    ("cm_plot_cached", None),
)
for _state_key, _state_default in _session_defaults:
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
# Premium Black & Gold CSS Styling
# Injects one <style> block into the page through st.markdown with
# unsafe_allow_html=True. The rules cover:
#   - Streamlit's own widget classes (.stApp, .stButton, .stTextArea,
#     .stFileUploader, .stDataFrame, .stAlert, ...),
#   - the custom classes referenced by the HTML snippets rendered later in this
#     script (hero-*, section-title, stats-grid / stat-*, alert-box,
#     confidence-*, hints-panel / hint-*, metric-container, footer*),
#   - hiding the sidebar, #MainMenu, and the header/footer elements.
st.markdown("""
<style>
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700;800;900&display=swap');
* {
font-family: 'Inter', sans-serif;
}
.stApp {
background: linear-gradient(135deg, #0a0a0a 0%, #1a1a1a 50%, #0a0a0a 100%);
}
.main {
background: transparent;
padding: 0;
}
.block-container {
padding: 2rem 3rem !important;
max-width: 1400px;
}
section[data-testid="stSidebar"] {
display: none;
}
.hero-container {
background: linear-gradient(135deg, #1a1a1a 0%, #0f0f0f 100%);
border-radius: 32px;
padding: 4rem 3rem;
margin-bottom: 3rem;
box-shadow: 0 25px 70px rgba(0,0,0,0.6), 0 10px 30px rgba(218,165,32,0.25), inset 0 1px 0 rgba(255,255,255,0.1);
position: relative;
overflow: hidden;
border: 2px solid rgba(218,165,32,0.4);
}
.hero-container::before {
content: '';
position: absolute;
top: -50%;
right: -20%;
width: 600px;
height: 600px;
background: radial-gradient(circle, rgba(218,165,32,0.2) 0%, transparent 70%);
border-radius: 50%;
animation: pulse 8s ease-in-out infinite;
}
@keyframes pulse {
0%, 100% { transform: scale(1); opacity: 0.3; }
50% { transform: scale(1.1); opacity: 0.5; }
}
.hero-container::after {
content: '';
position: absolute;
bottom: -30%;
left: -10%;
width: 500px;
height: 500px;
background: radial-gradient(circle, rgba(255,215,0,0.15) 0%, transparent 70%);
border-radius: 50%;
}
.hero-title {
font-size: 4.5rem;
font-weight: 900;
background: linear-gradient(135deg, #FFD700 0%, #FFA500 50%, #FFD700 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 1rem;
position: relative;
z-index: 1;
letter-spacing: -0.03em;
filter: drop-shadow(0 4px 20px rgba(255,215,0,0.4));
}
.hero-subtitle {
font-size: 1.45rem;
color: #e5e7eb;
font-weight: 500;
margin-bottom: 1.5rem;
position: relative;
z-index: 1;
line-height: 1.6;
letter-spacing: 0.3px;
}
.hero-description {
color: #d1d5db;
font-size: 1.05rem;
line-height: 1.7;
position: relative;
z-index: 1;
max-width: 900px;
}
.hero-badge {
display: inline-block;
background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
color: #0f0f0f;
padding: 0.8rem 2.5rem;
border-radius: 50px;
font-size: 1.05rem;
font-weight: 700;
margin-top: 1.8rem;
box-shadow: 0 8px 25px rgba(255,215,0,0.5), 0 0 40px rgba(255,215,0,0.3);
position: relative;
z-index: 1;
transition: all 0.3s ease;
}
.hero-badge:hover {
transform: translateY(-2px);
box-shadow: 0 12px 35px rgba(255,215,0,0.6), 0 0 50px rgba(255,215,0,0.4);
}
.section-title {
font-size: 2.2rem;
font-weight: 800;
background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin: 3.5rem 0 2rem 0;
text-align: center;
letter-spacing: 0.5px;
position: relative;
padding-bottom: 1rem;
}
.section-title::after {
content: '';
position: absolute;
bottom: 0;
left: 50%;
transform: translateX(-50%);
width: 100px;
height: 4px;
background: linear-gradient(90deg, transparent, #FFD700, transparent);
border-radius: 2px;
}
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
gap: 1.8rem;
margin: 2.5rem 0;
}
.stat-card {
background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
padding: 2.5rem 1.8rem;
border-radius: 24px;
text-align: center;
color: #0f0f0f;
box-shadow: 0 10px 30px rgba(255,215,0,0.35), 0 0 40px rgba(255,215,0,0.2), inset 0 1px 0 rgba(255,255,255,0.3);
transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
position: relative;
overflow: hidden;
}
.stat-card:hover {
transform: translateY(-10px) scale(1.03);
box-shadow: 0 20px 50px rgba(255,215,0,0.5), 0 0 60px rgba(255,215,0,0.3), inset 0 1px 0 rgba(255,255,255,0.4);
}
.stat-value {
font-size: 3.5rem;
font-weight: 900;
margin-bottom: 0.5rem;
position: relative;
z-index: 1;
color: #0f0f0f;
text-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.stat-label {
font-size: 0.95rem;
font-weight: 700;
opacity: 0.9;
text-transform: uppercase;
letter-spacing: 1.8px;
position: relative;
z-index: 1;
color: #0f0f0f;
}
.stTextArea textarea {
border-radius: 18px;
border: 2px solid rgba(218,165,32,0.35);
font-size: 1.05rem;
transition: all 0.3s ease;
background: rgba(26,26,26,0.8) !important;
color: #e5e7eb !important;
padding: 1rem !important;
line-height: 1.6 !important;
}
.stTextArea textarea:focus {
border-color: #FFD700;
box-shadow: 0 0 0 4px rgba(255,215,0,0.15);
background: rgba(26,26,26,0.95) !important;
}
.stButton > button {
background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
color: #0f0f0f;
border: none;
border-radius: 14px;
padding: 0.9rem 2.8rem;
font-size: 1.15rem;
font-weight: 700;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
box-shadow: 0 4px 15px rgba(255,215,0,0.4), 0 0 30px rgba(255,215,0,0.2);
width: 100%;
letter-spacing: 0.5px;
position: relative;
overflow: hidden;
}
.stButton > button:hover {
transform: translateY(-3px);
box-shadow: 0 8px 25px rgba(255,215,0,0.6), 0 0 50px rgba(255,215,0,0.3);
}
.stButton > button:active {
transform: translateY(-1px);
}
.alert-box {
padding: 2rem;
border-radius: 20px;
font-size: 1.1rem;
font-weight: 600;
margin: 1.5rem 0;
border: 2px solid rgba(255,255,255,0.1);
color: white;
}
.confidence-bar {
height: 14px;
background: rgba(255,255,255,0.25);
border-radius: 12px;
overflow: hidden;
margin-top: 1rem;
box-shadow: inset 0 2px 4px rgba(0,0,0,0.2);
}
.confidence-fill {
height: 100%;
background: rgba(255,255,255,0.95);
border-radius: 12px;
transition: width 1.2s cubic-bezier(0.4, 0, 0.2, 1);
box-shadow: 0 0 10px rgba(255,255,255,0.5);
}
.hints-panel {
background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
border-radius: 20px;
padding: 2rem;
border-left: 5px solid #FFD700;
box-shadow: 0 4px 15px rgba(0,0,0,0.4), inset 0 1px 0 rgba(255,255,255,0.05);
backdrop-filter: blur(10px);
}
.hint-item {
display: flex;
align-items: start;
gap: 1rem;
margin-bottom: 1.2rem;
font-size: 0.98rem;
color: #d1d5db;
line-height: 1.6;
}
.hint-icon {
min-width: 28px;
height: 28px;
background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
color: #0f0f0f;
border-radius: 50%;
display: flex;
align-items: center;
justify-content: center;
font-size: 0.8rem;
font-weight: 800;
box-shadow: 0 2px 8px rgba(255,215,0,0.4);
}
.metric-container {
background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
padding: 1.8rem;
border-radius: 16px;
border-left: 5px solid #FFD700;
box-shadow: 0 4px 12px rgba(0,0,0,0.4), inset 0 1px 0 rgba(255,255,255,0.05);
transition: all 0.3s ease;
}
.metric-container:hover {
transform: translateY(-2px);
box-shadow: 0 6px 18px rgba(0,0,0,0.5), inset 0 1px 0 rgba(255,255,255,0.08);
}
.stFileUploader {
border: 2px dashed rgba(218,165,32,0.45);
border-radius: 18px;
padding: 2rem;
background: rgba(26,26,26,0.6);
transition: all 0.3s ease;
}
.stFileUploader:hover {
border-color: #FFD700;
background: rgba(218,165,32,0.12);
box-shadow: 0 0 20px rgba(255,215,0,0.15);
}
.streamlit-expanderHeader {
background: linear-gradient(135deg, rgba(218,165,32,0.2) 0%, rgba(218,165,32,0.1) 100%) !important;
border-radius: 14px !important;
font-weight: 700 !important;
color: #f5f5f5 !important;
border: 1px solid rgba(218,165,32,0.3) !important;
padding: 1rem 1.5rem !important;
transition: all 0.3s ease !important;
}
.streamlit-expanderHeader:hover {
background: linear-gradient(135deg, rgba(218,165,32,0.25) 0%, rgba(218,165,32,0.15) 100%) !important;
border-color: rgba(218,165,32,0.5) !important;
}
.stDataFrame {
background: rgba(26,26,26,0.95) !important;
border-radius: 12px !important;
overflow: hidden !important;
}
.stDataFrame table {
background: rgba(26,26,26,0.95) !important;
color: #e5e7eb !important;
}
.stDataFrame thead tr th {
background: rgba(218,165,32,0.2) !important;
color: #FFD700 !important;
font-weight: 700 !important;
border-bottom: 2px solid rgba(218,165,32,0.4) !important;
}
.stDataFrame tbody tr {
background: rgba(26,26,26,0.8) !important;
border-bottom: 1px solid rgba(255,255,255,0.05) !important;
}
.stDataFrame tbody tr:hover {
background: rgba(218,165,32,0.1) !important;
}
.stDataFrame tbody tr td {
color: #d1d5db !important;
}
.stAlert {
background: rgba(26,26,26,0.9) !important;
border-radius: 12px !important;
border-left: 4px solid #FFD700 !important;
color: #e5e7eb !important;
}
.footer {
background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
border-radius: 20px;
padding: 2.5rem;
text-align: center;
margin-top: 4rem;
color: #9ca3af;
box-shadow: 0 8px 24px rgba(0,0,0,0.4), inset 0 1px 0 rgba(255,255,255,0.05);
border: 2px solid rgba(218,165,32,0.3);
}
.footer-name {
font-weight: 800;
background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
font-size: 1.1rem;
}
.stPlotlyChart, .stPyplot {
background: rgba(26,26,26,0.6) !important;
border-radius: 12px !important;
padding: 1rem !important;
}
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
header {visibility: hidden;}
html {
scroll-behavior: smooth;
}
::-webkit-scrollbar {
width: 10px;
height: 10px;
}
::-webkit-scrollbar-track {
background: #1a1a1a;
}
::-webkit-scrollbar-thumb {
background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
border-radius: 5px;
}
::-webkit-scrollbar-thumb:hover {
background: linear-gradient(135deg, #FFA500 0%, #FFD700 100%);
}
</style>
""", unsafe_allow_html=True)
# Utility Functions
def load_dataset_from_files():
    """Locate and read the phishing CSV from the working directory.

    Candidate file names are probed in priority order. The first one that
    exists is read as UTF-8, retrying as Latin-1 when decoding fails;
    malformed rows are skipped. A file that cannot be read produces a
    warning and the search moves on to the next candidate.

    Returns:
        tuple: ``(DataFrame, path)`` for the first file read successfully,
        or ``(None, "")`` when no candidate could be loaded.
    """
    # Candidate locations, highest priority first
    candidates = (
        "Phishing_Email.csv",
        "email_phishing_data.csv",
        "phishing_email.csv",
        "emails.csv",
        "phishing.csv",
        "./Phishing_Email.csv",
        "./email_phishing_data.csv",
        "./phishing_email.csv",
    )
    for candidate in candidates:
        if not os.path.exists(candidate):
            continue
        try:
            st.info(f"πŸ“‚ Found: {candidate} | Loading dataset...")
            try:
                frame = pd.read_csv(candidate, encoding='utf-8', on_bad_lines='skip')
            except UnicodeDecodeError:
                # Not valid UTF-8: Latin-1 accepts every byte value
                frame = pd.read_csv(candidate, encoding='latin-1', on_bad_lines='skip')
            st.success(f"βœ… Successfully loaded dataset from: `{candidate}` ({len(frame)} rows)")
            return frame, candidate
        except Exception as exc:
            st.warning(f"⚠️ Failed to load {candidate}: {str(exc)}")
    return None, ""
def safe_read_csv(path):
    """Read *path* with pandas, falling back to an empty DataFrame.

    Any exception raised while reading is reported through ``st.error``
    rather than propagated.
    """
    try:
        frame = pd.read_csv(path)
    except Exception as exc:
        st.error(f"Error reading CSV: {str(exc)}")
        frame = pd.DataFrame()
    return frame
def sanitize_input(text):
    """Strip HTML markup from user-supplied email text.

    ``<script>...</script>`` elements are removed together with their
    content (case-insensitive, may span lines). Every remaining tag is
    then removed while the text between tags is kept.

    Args:
        text: Raw text pasted by the user.

    Returns:
        The text with script blocks and tags removed.
    """
    text = re.sub(r'<script.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
    # "[^>]" also matches newlines, so tags wrapped across several lines
    # (common in HTML mail) are removed too; "<.*?>" stopped at the first
    # line break and left such tags in place.
    text = re.sub(r'<[^>]*>', '', text)
    return text
def validate_email_input(text):
    """Check that the email text has a length suitable for analysis.

    Returns:
        tuple: ``(True, "")`` when the text is acceptable, otherwise
        ``(False, reason)``. The minimum applies to the whitespace-stripped
        text, the maximum to the raw text.
    """
    too_short = len(text.strip()) < 10
    if too_short:
        return False, "Email content too short for analysis (minimum 10 characters)"
    too_long = len(text) > 10000
    if too_long:
        return False, "Email content too long (maximum 10,000 characters)"
    return True, ""
def preprocess_text(text):
    """Normalise raw email text into lowercase tokens for vectorisation.

    URLs, email addresses, dollar amounts and 16-digit card-like numbers
    are replaced by placeholder words before all remaining non-letter
    characters are dropped, so those signals survive as tokens.

    Args:
        text: Email text; non-string values are converted with ``str``.

    Returns:
        Lowercase, single-space-separated text containing only ``a-z``.
    """
    cleaned = (text if isinstance(text, str) else str(text)).lower()
    # Order matters: placeholders must be inserted before digits and
    # punctuation are stripped by the last two rules.
    substitutions = (
        (r'http\S+|www\S+|https\S+', ' suspiciousurl '),
        (r'\S+@\S+', ' emailaddress '),
        (r'\$\d+', ' moneymention '),
        (r'\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}', ' cardnumber '),
        (r'[^a-z\s]', ' '),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def _count_term_hits(terms, text):
    """Return how many of *terms* occur in *text* as whole words or phrases.

    A term must start at a word boundary and may carry a simple inflection
    (``s``, ``es``, ``d``, ``ed``, ``ing``), so "expire" still matches
    "expired" while "pin" no longer matches inside "shopping".
    """
    return sum(
        1
        for term in terms
        if re.search(r'(?<!\w)' + re.escape(term) + r'(?:s|es|d|ed|ing)?(?!\w)', text)
    )


def calculate_phishing_score(text):
    """Rule-based phishing score computed from keyword and pattern signals.

    Points are added for high-risk wording, financial terms, prize-scam
    wording, an urgency + financial combination, URLs (capped at two),
    requests to enter credentials, account threats and generic greetings.
    The total is divided by 200 and capped at 0.99.

    Keywords are matched as whole words/phrases rather than raw substrings;
    substring matching scored ordinary words such as "shopping" ('pin'),
    "wonderful" ('won') or "know" ('now') as phishing signals.

    Args:
        text: Email text to score.

    Returns:
        float: Score in the range ``[0, 0.99]``.
    """
    text_lower = text.lower()
    score = 0

    high_risk = ['verify', 'suspended', 'urgent', 'immediately', 'click here', 'act now',
                 'confirm identity', 'account locked', 'unusual activity', 'security alert',
                 'expire', 'limited time', 'action required', 'update payment', 'validate']
    score += 15 * _count_term_hits(high_risk, text_lower)

    financial = ['bank', 'credit card', 'password', 'ssn', 'social security', 'paypal',
                 'billing', 'payment', 'account number', 'pin', 'cvv', 'credential']
    score += 12 * _count_term_hits(financial, text_lower)

    prize_scam = ['won', 'winner', 'prize', 'claim now', 'congratulations', 'free money',
                  'inheritance', 'lottery', 'jackpot', 'cash prize', '$1000', '$10000']
    score += 18 * _count_term_hits(prize_scam, text_lower)

    # Urgency combined with a financial context is a strong signal on its own
    urgency_terms = ['urgent', 'immediately', 'now', 'expire']
    financial_context = ['account', 'bank', 'payment', 'card']
    if _count_term_hits(urgency_terms, text_lower) and _count_term_hits(financial_context, text_lower):
        score += 25

    # 20 points per URL, at most 40; zero URLs adds nothing
    url_count = len(re.findall(r'http\S+|www\S+', text, re.IGNORECASE))
    score += min(url_count * 20, 40)

    if re.search(r'\b(enter|provide|submit|update|confirm).{0,20}(password|credential|info|detail)', text_lower):
        score += 20

    threats = ['locked', 'suspended', 'terminated', 'closed', 'blocked', 'restricted']
    score += 15 * _count_term_hits(threats, text_lower)

    if re.search(r'\b(dear customer|dear user|dear member|dear valued)\b', text_lower):
        score += 8

    max_score = 200
    return min(score / max_score, 0.99)
def generate_confusion_matrix_plot(_cm):
    """Render a confusion matrix as a dark-themed PNG heatmap.

    Args:
        _cm: 2x2 confusion matrix (array-like exposing ``.max()``), rows
            being actual classes and columns predicted classes, in
            Safe/Phishing order as labelled on the axes.

    Returns:
        io.BytesIO: PNG image data, positioned at the start of the buffer.
    """
    plt.style.use('dark_background')
    fig, ax = plt.subplots(figsize=(5, 4), facecolor='#1a1a1a', dpi=80)
    try:
        ax.set_facecolor('#1a1a1a')
        sns.heatmap(
            _cm,
            annot=True,
            fmt="d",
            ax=ax,
            cmap="YlOrBr",
            cbar=True,
            square=True,
            annot_kws={"size": 14, "weight": "bold", "color": "#0f0f0f"},
            linewidths=1,
            linecolor='#0f0f0f',
            cbar_kws={'label': 'Count', 'shrink': 0.8},
            vmin=0,
            vmax=_cm.max()
        )
        ax.set_xlabel("Predicted", fontsize=10, fontweight='bold', color='#FFD700')
        ax.set_ylabel("Actual", fontsize=10, fontweight='bold', color='#FFD700')
        ax.set_xticklabels(["Safe", "Phishing"], fontsize=9, color='#e5e7eb')
        ax.set_yticklabels(["Safe", "Phishing"], fontsize=9, rotation=0, color='#e5e7eb')
        ax.set_title("Confusion Matrix", fontsize=12, fontweight='bold', pad=10, color='#FFD700')
        try:
            cbar = ax.collections[0].colorbar
            if cbar is not None:
                cbar.ax.yaxis.set_tick_params(color='#e5e7eb')
                plt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color='#e5e7eb')
        except Exception:
            # Colour-bar styling is cosmetic: keep it best-effort, but do not
            # swallow KeyboardInterrupt/SystemExit as a bare "except:" would.
            pass
        # Work on this figure explicitly instead of pyplot's "current figure"
        fig.tight_layout()
        buf = io.BytesIO()
        fig.savefig(buf, format='png', facecolor='#1a1a1a', dpi=80, bbox_inches='tight')
        buf.seek(0)
        return buf
    finally:
        # Always release the figure, even when plotting fails, and close only
        # this one rather than every open figure in the process.
        plt.close(fig)
# Hero Header
# Static banner (title, subtitle, description, author badge) rendered as raw
# HTML; it uses the hero-* classes from the stylesheet injected above.
st.markdown("""
<div class="hero-container">
<div class="hero-title">πŸ›‘οΈ AI Phishing Shield</div>
<div class="hero-subtitle">Advanced Machine Learning Protection Against Email Threats</div>
<div class="hero-description">
Powered by TF-IDF vectorization and Logistic Regression, trained on Kaggle phishing dataset.
80% Training | 20% Testing for maximum accuracy and robustness.
</div>
<div class="hero-badge">⚑ Developed by Umaima Qureshi</div>
</div>
""", unsafe_allow_html=True)
# Load Dataset from HuggingFace Files
st.markdown('<div class="section-title">πŸ“‚ Dataset Configuration</div>', unsafe_allow_html=True)
with st.spinner("πŸ”„ Loading dataset from HuggingFace Files..."):
    df, source = load_dataset_from_files()
if df is None or len(df) == 0:
    st.error("❌ No dataset found! Please ensure Phishing_Email.csv is uploaded to HuggingFace Files.")
    st.info("πŸ“ Expected file: 'Phishing_Email.csv' with columns for email text and labels")
    st.stop()
st.info(f"βœ… **Dataset Successfully Loaded** from: `{source}`")
st.write(f"πŸ“Š Dataset shape: {df.shape[0]} rows Γ— {df.shape[1]} columns")

# Drop the unnamed index column BEFORE validating the column count, so a file
# holding only an index plus one real column is rejected rather than ending up
# with the same column used as both text and label.
if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])

# Validate and Prepare Dataset
required_columns = 2
if len(df.columns) < required_columns or len(df) == 0:
    st.error("⚠️ Invalid dataset format. Please ensure your CSV has email text and labels.")
    st.stop()

# Identify text and label columns (named columns preferred, else first/last)
text_col = "Email Text" if "Email Text" in df.columns else df.columns[0]
label_col = "Email Type" if "Email Type" in df.columns else df.columns[-1]
st.info(f"πŸ“Œ Using columns: Text='{text_col}' | Label='{label_col}'")

# Clean dataset: drop rows whose text is missing or blank
df[text_col] = df[text_col].fillna("").astype(str)
df = df[df[text_col].str.strip() != ""].reset_index(drop=True)

# Handle labels. Text labels are matched case- and whitespace-insensitively.
label_map = {"phishing email": 1, "safe email": 0, "phishing": 1, "safe": 0, "1": 1, "0": 0}
if pd.api.types.is_numeric_dtype(df[label_col]):
    df['label'] = pd.to_numeric(df[label_col], errors='coerce')
else:
    df['label'] = df[label_col].astype(str).str.strip().str.lower().map(label_map)

# Rows with a missing or unrecognised label are dropped with a warning. They
# used to be silently relabelled as "Safe" (0), which corrupts training data.
unlabelled_rows = df['label'].isna()
if unlabelled_rows.any():
    st.warning(
        f"Dropped {int(unlabelled_rows.sum())} rows with missing or unrecognised "
        f"labels in column '{label_col}'."
    )
    df = df[~unlabelled_rows].reset_index(drop=True)
df['label'] = df['label'].astype(int)

# Nothing left to train on; stopping here also avoids dividing by zero below
if len(df) == 0:
    st.error("No usable rows remain after removing empty emails and unrecognised labels.")
    st.stop()

# Preprocess text
df['processed_text'] = df[text_col].apply(preprocess_text)

# Dataset Stats
phishing_count = (df['label'] == 1).sum()
safe_count = (df['label'] == 0).sum()
total_count = len(df)
st.markdown('<div class="section-title">πŸ“Š Dataset Statistics</div>', unsafe_allow_html=True)
st.markdown(f"""
<div class="stats-grid">
<div class="stat-card">
<div class="stat-value">{total_count}</div>
<div class="stat-label">Total Emails</div>
</div>
<div class="stat-card">
<div class="stat-value">{phishing_count}</div>
<div class="stat-label">Phishing Detected</div>
</div>
<div class="stat-card">
<div class="stat-value">{safe_count}</div>
<div class="stat-label">Safe Emails</div>
</div>
<div class="stat-card">
<div class="stat-value">{(phishing_count/total_count*100):.1f}%</div>
<div class="stat-label">Threat Rate</div>
</div>
</div>
""", unsafe_allow_html=True)
with st.expander("πŸ” View Dataset Preview", expanded=False):
    st.dataframe(df[[text_col, label_col]].head(10), use_container_width=True)
# Model Training - 80/20 Split
@st.cache_resource
def train_model(processed_texts, labels):
    """Fit the TF-IDF + Logistic Regression pipeline on an 80/20 split.

    Args:
        processed_texts: Preprocessed email texts.
        labels: Class labels aligned with ``processed_texts``.

    Returns:
        dict: The fitted ``vectorizer`` and ``model``, the test-set
        ``accuracy``, ``confusion_matrix`` and ``report`` (classification
        report as a dict), plus the held-out ``X_test``, ``y_test`` and the
        predictions ``y_pred``.
    """
    # Stratify only when more than one class is present in the labels
    stratify_on = labels if len(np.unique(labels)) > 1 else None
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        processed_texts,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=stratify_on,
    )
    st.write(f"πŸ“ˆ Training set: {len(train_texts)} samples (80%)")
    st.write(f"πŸ§ͺ Testing set: {len(test_texts)} samples (20%)")

    # TF-IDF over uni-, bi- and tri-grams, vocabulary limited to 5000 features
    tfidf = TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 3),
        min_df=1,
        max_df=0.95,
        sublinear_tf=True,
    )
    train_matrix = tfidf.fit_transform(train_texts)
    test_matrix = tfidf.transform(test_texts)

    # Logistic Regression with balanced class weights
    classifier = LogisticRegression(
        max_iter=2000,
        solver='liblinear',
        class_weight='balanced',
        C=1.0,
        random_state=42,
    )
    classifier.fit(train_matrix, train_labels)

    # Evaluate on the held-out 20%
    predicted = classifier.predict(test_matrix)
    results = {
        "vectorizer": tfidf,
        "model": classifier,
        "accuracy": accuracy_score(test_labels, predicted),
        "confusion_matrix": confusion_matrix(test_labels, predicted),
        "report": classification_report(test_labels, predicted, output_dict=True, zero_division=0),
        "X_test": test_texts,
        "y_test": test_labels,
        "y_pred": predicted,
    }
    return results
# Train once per session; later reruns reuse the result kept in session state
if st.session_state.model_trained:
    model_info = st.session_state.model_info
else:
    with st.spinner("πŸ€– Training model with 80/20 split..."):
        model_info = train_model(df['processed_text'].tolist(), df['label'].values)
        st.session_state.model_info = model_info
        st.session_state.model_trained = True
        st.success("βœ… Model trained successfully!")

# Unpack the pieces the rest of the script uses
vectorizer, model, accuracy = (
    model_info[part] for part in ("vectorizer", "model", "accuracy")
)
# Model Performance
st.markdown('<div class="section-title">🎯 Model Performance (20% Test Set)</div>', unsafe_allow_html=True)

# Precision/recall are read from the report entry for class "1"; a missing
# entry falls back to 0.
_class_one_report = model_info["report"].get("1", {})
precision = _class_one_report.get("precision", 0)
recall = _class_one_report.get("recall", 0)

# One identical card per headline metric, one card per column
col1, col2, col3 = st.columns(3)
_metric_cards = (
    (col1, "Accuracy", accuracy),
    (col2, "Precision", precision),
    (col3, "Recall", recall),
)
for _column, _metric_name, _metric_value in _metric_cards:
    with _column:
        st.markdown(f"""
<div class="metric-container">
<div style="color: #9ca3af; font-size: 0.85rem; font-weight: 600; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 0.5rem;">{_metric_name}</div>
<div style="font-size: 2.5rem; font-weight: 900; color: #FFD700;">{_metric_value:.1%}</div>
</div>
""", unsafe_allow_html=True)

# Confusion Matrix Section
with st.expander("πŸ“ˆ Detailed Metrics & Confusion Matrix"):
    col_matrix, col_report = st.columns([1, 1.5])
    with col_matrix:
        # Render the plot once and keep the image buffer in session state
        if st.session_state.cm_plot_cached is None:
            st.session_state.cm_plot_cached = generate_confusion_matrix_plot(
                model_info["confusion_matrix"]
            )
        st.image(st.session_state.cm_plot_cached, use_container_width=True)
    with col_report:
        st.markdown("**πŸ“Š Classification Report:**")
        report_df = pd.DataFrame(model_info["report"]).transpose().round(3)
        st.dataframe(report_df, use_container_width=True, height=250)
# Inference UI
st.markdown('<div class="section-title">βœ‰οΈ Email Threat Scanner</div>', unsafe_allow_html=True)

# Risk tiers checked from most to least severe:
# (minimum hybrid probability, colour, gradient, shadow RGB, emoji, label)
_risk_tiers = (
    (0.8, "#dc2626", "linear-gradient(135deg, #dc2626 0%, #991b1b 100%)", "220, 38, 38", "🚨", "CRITICAL THREAT"),
    (0.6, "#ef4444", "linear-gradient(135deg, #ef4444 0%, #dc2626 100%)", "239, 68, 68", "⚠️", "HIGH RISK"),
    (0.4, "#f97316", "linear-gradient(135deg, #f97316 0%, #ea580c 100%)", "249, 115, 22", "⚑", "MEDIUM RISK"),
    (0.2, "#eab308", "linear-gradient(135deg, #eab308 0%, #ca8a04 100%)", "234, 179, 8", "⚠️", "LOW RISK"),
)
# Used when the probability is below every tier threshold
_safe_tier = ("#10b981", "linear-gradient(135deg, #10b981 0%, #059669 100%)", "16, 185, 129", "βœ…", "SAFE")

col_input, col_hints = st.columns([2, 1])
with col_input:
    email_input = st.text_area(
        "Paste email content for analysis",
        height=280,
        placeholder="Example: Urgent! Your account has been compromised. Click here to verify your identity immediately...",
        help="Paste the full email content including subject and body"
    )
    if st.button("πŸ” Analyze Email Threat"):
        if not email_input.strip():
            st.warning("⚠️ Please paste email content to analyze")
        else:
            email_input = sanitize_input(email_input)
            is_valid, error_msg = validate_email_input(email_input)
            if not is_valid:
                st.warning(f"⚠️ {error_msg}")
            else:
                with st.spinner("πŸ” Analyzing email threat..."):
                    try:
                        processed_input = preprocess_text(email_input)
                        input_vec = vectorizer.transform([processed_input])
                        try:
                            ml_proba = model.predict_proba(input_vec)[0][1]
                        except AttributeError:
                            # No predict_proba: squash the decision value with a sigmoid
                            decision = model.decision_function(input_vec)[0]
                            ml_proba = 1 / (1 + np.exp(-decision))
                        ml_pred = model.predict(input_vec)[0]
                        rule_score = calculate_phishing_score(email_input)
                        # Blend: 60% model probability, 40% rule-based score
                        hybrid_proba = (0.6 * ml_proba) + (0.4 * rule_score)
                        final_pred = 1 if hybrid_proba > 0.5 else 0

                        # Dynamic color coding: first tier whose threshold is met
                        alert_color, alert_gradient, shadow_color, emoji, risk_level = next(
                            (tier[1:] for tier in _risk_tiers if hybrid_proba >= tier[0]),
                            _safe_tier,
                        )

                        if final_pred == 1:
                            conf_pct = f"{hybrid_proba:.1%}"
                            st.markdown(f"""
<div class="alert-box" style="background: {alert_gradient}; box-shadow: 0 10px 30px rgba({shadow_color}, 0.4), 0 0 50px rgba({shadow_color}, 0.2);">
<div style="display: flex; align-items: center; gap: 1rem; margin-bottom: 0.75rem;">
<div style="font-size: 2.5rem;">{emoji}</div>
<div>
<div style="font-size: 1.5rem; font-weight: 800; letter-spacing: 0.5px;">{risk_level} DETECTED</div>
<div style="font-size: 1.05rem; opacity: 0.95; margin-top: 0.25rem;">Threat Confidence: {conf_pct}</div>
<div style="font-size: 0.9rem; opacity: 0.85; margin-top: 0.25rem;">ML Score: {ml_proba:.1%} | Rule Score: {rule_score:.1%}</div>
</div>
</div>
<div class="confidence-bar">
<div class="confidence-fill" style="width: {hybrid_proba*100}%;"></div>
</div>
</div>
""", unsafe_allow_html=True)
                            st.markdown("**πŸ” Threat Indicators Detected:**")
                            # (condition, message) pairs, listed in display order
                            indicator_checks = (
                                ("suspiciousurl" in processed_input or re.search(r'http\S+|www\S+', email_input, re.IGNORECASE),
                                 "πŸ”— Suspicious URL tokens detected"),
                                (re.search(r'\b(urgent|immediately|verify|password|suspended|click|act now|action required)\b', email_input, re.IGNORECASE),
                                 "⚑ Urgency manipulation tactics"),
                                (re.search(r'\b(bank|account|verify|login|password|security|credential|paypal)\b', email_input, re.IGNORECASE),
                                 "🏦 Financial/security keywords present"),
                                (re.search(r'\b(winner|prize|congratulations|claim|free|won)\b', email_input, re.IGNORECASE),
                                 "🎁 Reward/prize baiting language"),
                                (re.search(r'\b(confirm|update|validate|unlock|restore)\b', email_input, re.IGNORECASE),
                                 "πŸ” Account action requests"),
                                ("cardnumber" in processed_input,
                                 "πŸ’³ Credit card pattern detected"),
                                ("moneymention" in processed_input,
                                 "πŸ’° Money amount mentioned"),
                            )
                            indicators = [message for triggered, message in indicator_checks if triggered]
                            for indicator in indicators:
                                st.markdown(f"- {indicator}")
                            st.error("🚨 **Recommendation:** Do NOT click any links. Delete this email immediately and report to your IT security team.")
                        else:
                            conf_pct = f"{(1-hybrid_proba):.1%}"
                            st.markdown(f"""
<div class="alert-box" style="background: {alert_gradient}; box-shadow: 0 10px 30px rgba({shadow_color}, 0.4), 0 0 50px rgba({shadow_color}, 0.2);">
<div style="display: flex; align-items: center; gap: 1rem; margin-bottom: 0.75rem;">
<div style="font-size: 2.5rem;">{emoji}</div>
<div>
<div style="font-size: 1.5rem; font-weight: 800; letter-spacing: 0.5px;">EMAIL APPEARS SAFE</div>
<div style="font-size: 1.05rem; opacity: 0.95; margin-top: 0.25rem;">Safety Confidence: {conf_pct}</div>
<div style="font-size: 0.9rem; opacity: 0.85; margin-top: 0.25rem;">ML Score: {(1-ml_proba):.1%} | Rule Score: {(1-rule_score):.1%}</div>
</div>
</div>
<div class="confidence-bar">
<div class="confidence-fill" style="width: {(1-hybrid_proba)*100}%;"></div>
</div>
</div>
""", unsafe_allow_html=True)
                            st.info("πŸ’‘ **Best Practice:** Always verify sender identity and be cautious with unexpected emails.")

                        # Record the verdict in this session's history
                        history_entry = {
                            'timestamp': pd.Timestamp.now(),
                            'result': 'Phishing' if final_pred == 1 else 'Safe',
                            'confidence': f"{hybrid_proba:.2%}",
                            'preview': email_input[:50] + "...",
                        }
                        st.session_state.analysis_history.append(history_entry)
                    except Exception as e:
                        st.error(f"⚠️ Analysis failed: {str(e)}")

# Static explainer panel shown beside the scanner
with col_hints:
    st.markdown("""
<div class="hints-panel">
<div style="font-weight: 700; font-size: 1.15rem; margin-bottom: 1.2rem; color: #f5f5f5;">🧠 AI Detection Insights</div>
<div class="hint-item">
<div class="hint-icon">1</div>
<div><strong>Urgency words</strong> like "urgent", "verify" raise red flags</div>
</div>
<div class="hint-item">
<div class="hint-icon">2</div>
<div><strong>Suspicious links</strong> are automatically flagged</div>
</div>
<div class="hint-item">
<div class="hint-icon">3</div>
<div><strong>Financial + urgency</strong> combo indicates high risk</div>
</div>
<div class="hint-item">
<div class="hint-icon">4</div>
<div>Confidence <strong>>70%</strong> warrants caution</div>
</div>
<div class="hint-item">
<div class="hint-icon">⚑</div>
<div><strong>80/20 Split:</strong> Trained on 80%, tested on 20% for accuracy</div>
</div>
</div>
""", unsafe_allow_html=True)
# Footer
# Static credits block rendered as raw HTML; it uses the footer / footer-name
# classes from the stylesheet injected at the top of the script.
st.markdown("""
<div class="footer">
<div style="font-size: 1.2rem; margin-bottom: 0.75rem; font-weight: 700;">
Developed and Deployed by <span class="footer-name">Umaima Qureshi</span>
</div>
<div style="font-size: 1rem; color: #94a3b8; margin-bottom: 1rem; line-height: 1.6;">
πŸŽ“ Educational ML-powered email security with 80% training / 20% testing<br>
Trained on Kaggle Phishing Email Dataset from HuggingFace Files
</div>
<div style="margin-top: 1.5rem; padding-top: 1.5rem; border-top: 1px solid rgba(218,165,32,0.2); font-size: 0.9rem; color: #6b7280;">
TF-IDF β€’ Logistic Regression β€’ Hybrid Detection β€’ Scikit-learn β€’ Streamlit
</div>
</div>
""", unsafe_allow_html=True)