Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,6 +12,8 @@ from sklearn.model_selection import train_test_split
|
|
| 12 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 13 |
from sklearn.linear_model import LogisticRegression
|
| 14 |
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
|
|
|
|
|
|
|
| 15 |
import matplotlib.pyplot as plt
|
| 16 |
import seaborn as sns
|
| 17 |
import io
|
|
@@ -19,12 +21,20 @@ import os
|
|
| 19 |
|
| 20 |
# Page Configuration
|
| 21 |
st.set_page_config(
|
| 22 |
-
page_title="AI Phishing Shield β by Umaima Qureshi",
|
| 23 |
layout="wide",
|
| 24 |
initial_sidebar_state="collapsed"
|
| 25 |
)
|
| 26 |
|
| 27 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
st.markdown("""
|
| 29 |
<style>
|
| 30 |
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700;800;900&display=swap');
|
|
@@ -51,16 +61,13 @@ section[data-testid="stSidebar"] {
|
|
| 51 |
display: none;
|
| 52 |
}
|
| 53 |
|
| 54 |
-
/* Hero Section
|
| 55 |
.hero-container {
|
| 56 |
background: linear-gradient(135deg, #1a1a1a 0%, #0f0f0f 100%);
|
| 57 |
border-radius: 32px;
|
| 58 |
padding: 4rem 3rem;
|
| 59 |
margin-bottom: 3rem;
|
| 60 |
-
box-shadow:
|
| 61 |
-
0 25px 70px rgba(0,0,0,0.6),
|
| 62 |
-
0 10px 30px rgba(218,165,32,0.25),
|
| 63 |
-
inset 0 1px 0 rgba(255,255,255,0.1);
|
| 64 |
position: relative;
|
| 65 |
overflow: hidden;
|
| 66 |
border: 2px solid rgba(218,165,32,0.4);
|
|
@@ -105,7 +112,6 @@ section[data-testid="stSidebar"] {
|
|
| 105 |
z-index: 1;
|
| 106 |
letter-spacing: -0.03em;
|
| 107 |
filter: drop-shadow(0 4px 20px rgba(255,215,0,0.4));
|
| 108 |
-
text-shadow: 0 0 80px rgba(255,215,0,0.3);
|
| 109 |
}
|
| 110 |
|
| 111 |
.hero-subtitle {
|
|
@@ -137,9 +143,7 @@ section[data-testid="stSidebar"] {
|
|
| 137 |
font-size: 1.05rem;
|
| 138 |
font-weight: 700;
|
| 139 |
margin-top: 1.8rem;
|
| 140 |
-
box-shadow:
|
| 141 |
-
0 8px 25px rgba(255,215,0,0.5),
|
| 142 |
-
0 0 40px rgba(255,215,0,0.3);
|
| 143 |
position: relative;
|
| 144 |
z-index: 1;
|
| 145 |
transition: all 0.3s ease;
|
|
@@ -147,9 +151,7 @@ section[data-testid="stSidebar"] {
|
|
| 147 |
|
| 148 |
.hero-badge:hover {
|
| 149 |
transform: translateY(-2px);
|
| 150 |
-
box-shadow:
|
| 151 |
-
0 12px 35px rgba(255,215,0,0.6),
|
| 152 |
-
0 0 50px rgba(255,215,0,0.4);
|
| 153 |
}
|
| 154 |
|
| 155 |
/* Section Headers */
|
|
@@ -178,7 +180,7 @@ section[data-testid="stSidebar"] {
|
|
| 178 |
border-radius: 2px;
|
| 179 |
}
|
| 180 |
|
| 181 |
-
/* Stats Grid
|
| 182 |
.stats-grid {
|
| 183 |
display: grid;
|
| 184 |
grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
|
|
@@ -192,10 +194,7 @@ section[data-testid="stSidebar"] {
|
|
| 192 |
border-radius: 24px;
|
| 193 |
text-align: center;
|
| 194 |
color: #0f0f0f;
|
| 195 |
-
box-shadow:
|
| 196 |
-
0 10px 30px rgba(255,215,0,0.35),
|
| 197 |
-
0 0 40px rgba(255,215,0,0.2),
|
| 198 |
-
inset 0 1px 0 rgba(255,255,255,0.3);
|
| 199 |
transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
|
| 200 |
position: relative;
|
| 201 |
overflow: hidden;
|
|
@@ -220,10 +219,7 @@ section[data-testid="stSidebar"] {
|
|
| 220 |
|
| 221 |
.stat-card:hover {
|
| 222 |
transform: translateY(-10px) scale(1.03);
|
| 223 |
-
box-shadow:
|
| 224 |
-
0 20px 50px rgba(255,215,0,0.5),
|
| 225 |
-
0 0 60px rgba(255,215,0,0.3),
|
| 226 |
-
inset 0 1px 0 rgba(255,255,255,0.4);
|
| 227 |
}
|
| 228 |
|
| 229 |
.stat-value {
|
|
@@ -247,7 +243,7 @@ section[data-testid="stSidebar"] {
|
|
| 247 |
color: #0f0f0f;
|
| 248 |
}
|
| 249 |
|
| 250 |
-
/* Input Areas
|
| 251 |
.stTextArea textarea {
|
| 252 |
border-radius: 18px;
|
| 253 |
border: 2px solid rgba(218,165,32,0.35);
|
|
@@ -265,7 +261,7 @@ section[data-testid="stSidebar"] {
|
|
| 265 |
background: rgba(26,26,26,0.95) !important;
|
| 266 |
}
|
| 267 |
|
| 268 |
-
/* Buttons
|
| 269 |
.stButton > button {
|
| 270 |
background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
|
| 271 |
color: #0f0f0f;
|
|
@@ -275,9 +271,7 @@ section[data-testid="stSidebar"] {
|
|
| 275 |
font-size: 1.15rem;
|
| 276 |
font-weight: 700;
|
| 277 |
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
|
| 278 |
-
box-shadow:
|
| 279 |
-
0 4px 15px rgba(255,215,0,0.4),
|
| 280 |
-
0 0 30px rgba(255,215,0,0.2);
|
| 281 |
width: 100%;
|
| 282 |
letter-spacing: 0.5px;
|
| 283 |
position: relative;
|
|
@@ -304,42 +298,22 @@ section[data-testid="stSidebar"] {
|
|
| 304 |
|
| 305 |
.stButton > button:hover {
|
| 306 |
transform: translateY(-3px);
|
| 307 |
-
box-shadow:
|
| 308 |
-
0 8px 25px rgba(255,215,0,0.6),
|
| 309 |
-
0 0 50px rgba(255,215,0,0.3);
|
| 310 |
}
|
| 311 |
|
| 312 |
.stButton > button:active {
|
| 313 |
transform: translateY(-1px);
|
| 314 |
}
|
| 315 |
|
| 316 |
-
/* Alert Boxes
|
| 317 |
-
.alert-
|
| 318 |
-
background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%);
|
| 319 |
-
color: white;
|
| 320 |
padding: 2rem;
|
| 321 |
border-radius: 20px;
|
| 322 |
font-size: 1.1rem;
|
| 323 |
font-weight: 600;
|
| 324 |
-
box-shadow:
|
| 325 |
-
0 10px 30px rgba(239,68,68,0.4),
|
| 326 |
-
0 0 50px rgba(239,68,68,0.2);
|
| 327 |
margin: 1.5rem 0;
|
| 328 |
border: 2px solid rgba(255,255,255,0.1);
|
| 329 |
-
}
|
| 330 |
-
|
| 331 |
-
.alert-success {
|
| 332 |
-
background: linear-gradient(135deg, #10b981 0%, #059669 100%);
|
| 333 |
color: white;
|
| 334 |
-
padding: 2rem;
|
| 335 |
-
border-radius: 20px;
|
| 336 |
-
font-size: 1.1rem;
|
| 337 |
-
font-weight: 600;
|
| 338 |
-
box-shadow:
|
| 339 |
-
0 10px 30px rgba(16,185,129,0.4),
|
| 340 |
-
0 0 50px rgba(16,185,129,0.2);
|
| 341 |
-
margin: 1.5rem 0;
|
| 342 |
-
border: 2px solid rgba(255,255,255,0.1);
|
| 343 |
}
|
| 344 |
|
| 345 |
.confidence-bar {
|
|
@@ -359,15 +333,13 @@ section[data-testid="stSidebar"] {
|
|
| 359 |
box-shadow: 0 0 10px rgba(255,255,255,0.5);
|
| 360 |
}
|
| 361 |
|
| 362 |
-
/* Hints Panel
|
| 363 |
.hints-panel {
|
| 364 |
background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
|
| 365 |
border-radius: 20px;
|
| 366 |
padding: 2rem;
|
| 367 |
border-left: 5px solid #FFD700;
|
| 368 |
-
box-shadow:
|
| 369 |
-
0 4px 15px rgba(0,0,0,0.4),
|
| 370 |
-
inset 0 1px 0 rgba(255,255,255,0.05);
|
| 371 |
backdrop-filter: blur(10px);
|
| 372 |
}
|
| 373 |
|
|
@@ -395,26 +367,22 @@ section[data-testid="stSidebar"] {
|
|
| 395 |
box-shadow: 0 2px 8px rgba(255,215,0,0.4);
|
| 396 |
}
|
| 397 |
|
| 398 |
-
/* Metric Cards
|
| 399 |
.metric-container {
|
| 400 |
background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
|
| 401 |
padding: 1.8rem;
|
| 402 |
border-radius: 16px;
|
| 403 |
border-left: 5px solid #FFD700;
|
| 404 |
-
box-shadow:
|
| 405 |
-
0 4px 12px rgba(0,0,0,0.4),
|
| 406 |
-
inset 0 1px 0 rgba(255,255,255,0.05);
|
| 407 |
transition: all 0.3s ease;
|
| 408 |
}
|
| 409 |
|
| 410 |
.metric-container:hover {
|
| 411 |
transform: translateY(-2px);
|
| 412 |
-
box-shadow:
|
| 413 |
-
0 6px 18px rgba(0,0,0,0.5),
|
| 414 |
-
inset 0 1px 0 rgba(255,255,255,0.08);
|
| 415 |
}
|
| 416 |
|
| 417 |
-
/* File Uploader
|
| 418 |
.stFileUploader {
|
| 419 |
border: 2px dashed rgba(218,165,32,0.45);
|
| 420 |
border-radius: 18px;
|
|
@@ -429,7 +397,7 @@ section[data-testid="stSidebar"] {
|
|
| 429 |
box-shadow: 0 0 20px rgba(255,215,0,0.15);
|
| 430 |
}
|
| 431 |
|
| 432 |
-
/* Expanders
|
| 433 |
.streamlit-expanderHeader {
|
| 434 |
background: linear-gradient(135deg, rgba(218,165,32,0.2) 0%, rgba(218,165,32,0.1) 100%) !important;
|
| 435 |
border-radius: 14px !important;
|
|
@@ -489,7 +457,7 @@ section[data-testid="stSidebar"] {
|
|
| 489 |
color: #e5e7eb !important;
|
| 490 |
}
|
| 491 |
|
| 492 |
-
/* Footer
|
| 493 |
.footer {
|
| 494 |
background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
|
| 495 |
border-radius: 20px;
|
|
@@ -497,9 +465,7 @@ section[data-testid="stSidebar"] {
|
|
| 497 |
text-align: center;
|
| 498 |
margin-top: 4rem;
|
| 499 |
color: #9ca3af;
|
| 500 |
-
box-shadow:
|
| 501 |
-
0 8px 24px rgba(0,0,0,0.4),
|
| 502 |
-
inset 0 1px 0 rgba(255,255,255,0.05);
|
| 503 |
border: 2px solid rgba(218,165,32,0.3);
|
| 504 |
}
|
| 505 |
|
|
@@ -560,23 +526,136 @@ def safe_read_csv(path):
|
|
| 560 |
except Exception as e:
|
| 561 |
return pd.DataFrame()
|
| 562 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 563 |
def preprocess_text(text):
|
|
|
|
| 564 |
if not isinstance(text, str):
|
| 565 |
text = str(text)
|
| 566 |
text = text.lower()
|
| 567 |
-
|
| 568 |
-
text = re.sub(r'\S+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 569 |
text = re.sub(r'[^a-z\s]', ' ', text)
|
| 570 |
text = re.sub(r'\s+', ' ', text).strip()
|
| 571 |
return text
|
| 572 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 573 |
# Hero Header
|
| 574 |
st.markdown("""
|
| 575 |
<div class="hero-container">
|
| 576 |
<div class="hero-title">π‘οΈ AI Phishing Shield</div>
|
| 577 |
<div class="hero-subtitle">Advanced Machine Learning Protection Against Email Threats</div>
|
| 578 |
<div class="hero-description">
|
| 579 |
-
Powered by TF-IDF vectorization and Logistic Regression, trained on thousands of real-world phishing examples.
|
| 580 |
Get instant threat analysis with confidence scoring and explainable AI insights.
|
| 581 |
</div>
|
| 582 |
<div class="hero-badge">β‘ Developed by Umaima Qureshi</div>
|
|
@@ -617,6 +696,12 @@ else:
|
|
| 617 |
]
|
| 618 |
})
|
| 619 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 620 |
# Clean & Prepare Dataset
|
| 621 |
if "Unnamed: 0" in df.columns:
|
| 622 |
df = df.drop(columns=["Unnamed: 0"])
|
|
@@ -671,6 +756,7 @@ with st.expander("π View Dataset Preview", expanded=False):
|
|
| 671 |
# Model Training
|
| 672 |
@st.cache_resource
|
| 673 |
def train_model(processed_texts, labels, test_size=0.2, random_state=42):
|
|
|
|
| 674 |
unique_labels, counts = np.unique(labels, return_counts=True)
|
| 675 |
min_samples = counts.min()
|
| 676 |
|
|
@@ -693,18 +779,31 @@ def train_model(processed_texts, labels, test_size=0.2, random_state=42):
|
|
| 693 |
processed_texts, labels, test_size=test_size, random_state=random_state, stratify=None
|
| 694 |
)
|
| 695 |
|
| 696 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 697 |
X_train_vec = vectorizer.fit_transform(X_train)
|
| 698 |
X_test_vec = vectorizer.transform(X_test)
|
| 699 |
-
|
| 700 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 701 |
model.fit(X_train_vec, y_train)
|
| 702 |
-
|
| 703 |
y_pred = model.predict(X_test_vec)
|
| 704 |
acc = accuracy_score(y_test, y_pred)
|
| 705 |
cm = confusion_matrix(y_test, y_pred)
|
| 706 |
report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
|
| 707 |
-
|
| 708 |
return {
|
| 709 |
"vectorizer": vectorizer,
|
| 710 |
"model": model,
|
|
@@ -713,8 +812,17 @@ def train_model(processed_texts, labels, test_size=0.2, random_state=42):
|
|
| 713 |
"report": report
|
| 714 |
}
|
| 715 |
|
| 716 |
-
|
| 717 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 718 |
|
| 719 |
# Model Performance
|
| 720 |
st.markdown('<div class="section-title">π― Model Performance</div>', unsafe_allow_html=True)
|
|
@@ -747,53 +855,23 @@ with col3:
|
|
| 747 |
</div>
|
| 748 |
""", unsafe_allow_html=True)
|
| 749 |
|
| 750 |
-
#
|
| 751 |
with st.expander("π Detailed Metrics & Confusion Matrix"):
|
| 752 |
col_matrix, col_report = st.columns([1, 1.5])
|
| 753 |
|
| 754 |
with col_matrix:
|
| 755 |
-
#
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
|
| 759 |
-
|
| 760 |
-
ax.set_facecolor('#1a1a1a')
|
| 761 |
-
|
| 762 |
-
sns.heatmap(
|
| 763 |
-
model_info["confusion_matrix"],
|
| 764 |
-
annot=True,
|
| 765 |
-
fmt="d",
|
| 766 |
-
ax=ax,
|
| 767 |
-
cmap="YlOrBr",
|
| 768 |
-
cbar=True,
|
| 769 |
-
square=True,
|
| 770 |
-
annot_kws={"size": 16, "weight": "bold", "color": "#0f0f0f"},
|
| 771 |
-
linewidths=2,
|
| 772 |
-
linecolor='#0f0f0f',
|
| 773 |
-
cbar_kws={'label': 'Count', 'shrink': 0.8}
|
| 774 |
-
)
|
| 775 |
-
|
| 776 |
-
ax.set_xlabel("Predicted", fontsize=11, fontweight='bold', color='#FFD700')
|
| 777 |
-
ax.set_ylabel("Actual", fontsize=11, fontweight='bold', color='#FFD700')
|
| 778 |
-
ax.set_xticklabels(["Safe", "Phishing"], fontsize=10, color='#e5e7eb')
|
| 779 |
-
ax.set_yticklabels(["Safe", "Phishing"], fontsize=10, rotation=0, color='#e5e7eb')
|
| 780 |
-
ax.set_title("Confusion Matrix", fontsize=13, fontweight='bold', pad=12, color='#FFD700')
|
| 781 |
-
|
| 782 |
-
# Style the colorbar
|
| 783 |
-
cbar = ax.collections[0].colorbar
|
| 784 |
-
cbar.ax.yaxis.set_tick_params(color='#e5e7eb')
|
| 785 |
-
plt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color='#e5e7eb')
|
| 786 |
-
|
| 787 |
-
plt.tight_layout()
|
| 788 |
-
st.pyplot(fig, use_container_width=True)
|
| 789 |
-
plt.close(fig) # Close figure to prevent memory leak
|
| 790 |
|
| 791 |
with col_report:
|
| 792 |
st.markdown("**π Classification Report:**")
|
| 793 |
report_df = pd.DataFrame(model_info["report"]).transpose().round(3)
|
| 794 |
st.dataframe(
|
| 795 |
-
report_df,
|
| 796 |
-
use_container_width=True,
|
| 797 |
height=250
|
| 798 |
)
|
| 799 |
|
|
@@ -817,82 +895,169 @@ with col_input:
|
|
| 817 |
email_input = uploaded_txt.read().decode("utf-8", errors="ignore")
|
| 818 |
except Exception:
|
| 819 |
email_input = str(uploaded_txt.getvalue())
|
| 820 |
-
|
| 821 |
if st.button("π Analyze Email Threat"):
|
| 822 |
if not email_input.strip():
|
| 823 |
st.warning("β οΈ Please paste or upload email content to analyze")
|
| 824 |
else:
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
|
|
|
|
|
|
| 832 |
try:
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 849 |
</div>
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
|
| 856 |
-
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
|
| 860 |
-
|
| 861 |
-
|
| 862 |
-
|
| 863 |
-
|
| 864 |
-
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
| 868 |
-
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 877 |
|
| 878 |
-
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
-
|
| 884 |
-
|
| 885 |
-
|
| 886 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 887 |
</div>
|
| 888 |
-
|
| 889 |
-
|
| 890 |
-
|
| 891 |
-
|
| 892 |
-
|
| 893 |
-
|
| 894 |
-
|
| 895 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 896 |
|
| 897 |
with col_hints:
|
| 898 |
st.markdown("""
|
|
@@ -924,6 +1089,11 @@ with col_hints:
|
|
| 924 |
<div><strong>Prize/reward</strong> language is a common phishing tactic</div>
|
| 925 |
</div>
|
| 926 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 927 |
<div class="hint-item">
|
| 928 |
<div class="hint-icon">β οΈ</div>
|
| 929 |
<div><strong>Limitations:</strong> This tool analyzes text content only. Always verify sender identity separately.</div>
|
|
@@ -931,6 +1101,18 @@ with col_hints:
|
|
| 931 |
</div>
|
| 932 |
""", unsafe_allow_html=True)
|
| 933 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 934 |
# Additional Tips Section
|
| 935 |
st.markdown('<div class="section-title">π‘ Phishing Protection Tips</div>', unsafe_allow_html=True)
|
| 936 |
|
|
@@ -938,8 +1120,8 @@ col_tip1, col_tip2, col_tip3 = st.columns(3)
|
|
| 938 |
|
| 939 |
with col_tip1:
|
| 940 |
st.markdown("""
|
| 941 |
-
<div style="background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
|
| 942 |
-
padding: 1.5rem; border-radius: 16px; border-left: 4px solid #FFD700;
|
| 943 |
box-shadow: 0 4px 15px rgba(0,0,0,0.3); height: 100%;">
|
| 944 |
<div style="font-size: 2rem; margin-bottom: 0.75rem;">π</div>
|
| 945 |
<div style="font-weight: 700; font-size: 1.1rem; color: #FFD700; margin-bottom: 0.75rem;">Verify Sender</div>
|
|
@@ -951,8 +1133,8 @@ with col_tip1:
|
|
| 951 |
|
| 952 |
with col_tip2:
|
| 953 |
st.markdown("""
|
| 954 |
-
<div style="background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
|
| 955 |
-
padding: 1.5rem; border-radius: 16px; border-left: 4px solid #FFD700;
|
| 956 |
box-shadow: 0 4px 15px rgba(0,0,0,0.3); height: 100%;">
|
| 957 |
<div style="font-size: 2rem; margin-bottom: 0.75rem;">π</div>
|
| 958 |
<div style="font-weight: 700; font-size: 1.1rem; color: #FFD700; margin-bottom: 0.75rem;">Hover Links</div>
|
|
@@ -964,8 +1146,8 @@ with col_tip2:
|
|
| 964 |
|
| 965 |
with col_tip3:
|
| 966 |
st.markdown("""
|
| 967 |
-
<div style="background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
|
| 968 |
-
padding: 1.5rem; border-radius: 16px; border-left: 4px solid #FFD700;
|
| 969 |
box-shadow: 0 4px 15px rgba(0,0,0,0.3); height: 100%;">
|
| 970 |
<div style="font-size: 2rem; margin-bottom: 0.75rem;">π</div>
|
| 971 |
<div style="font-weight: 700; font-size: 1.1rem; color: #FFD700; margin-bottom: 0.75rem;">Contact Directly</div>
|
|
@@ -986,7 +1168,7 @@ st.markdown("""
|
|
| 986 |
For production use: Implement additional verification layers, link scanning, attachment analysis, and human oversight
|
| 987 |
</div>
|
| 988 |
<div style="margin-top: 1.5rem; padding-top: 1.5rem; border-top: 1px solid rgba(218,165,32,0.2); font-size: 0.9rem; color: #6b7280;">
|
| 989 |
-
Powered by TF-IDF β’ Logistic Regression β’ Scikit-learn β’ Streamlit
|
| 990 |
</div>
|
| 991 |
<div style="margin-top: 1rem; font-size: 0.85rem; color: #6b7280;">
|
| 992 |
Β© 2024 AI Phishing Shield | All Rights Reserved
|
|
|
|
| 12 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 13 |
from sklearn.linear_model import LogisticRegression
|
| 14 |
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
|
| 15 |
+
import matplotlib
|
| 16 |
+
matplotlib.use('Agg') # Use non-interactive backend
|
| 17 |
import matplotlib.pyplot as plt
|
| 18 |
import seaborn as sns
|
| 19 |
import io
|
|
|
|
| 21 |
|
| 22 |
# Page Configuration
|
| 23 |
st.set_page_config(
|
| 24 |
+
page_title="AI Phishing Shield β by Umaima Qureshi",
|
| 25 |
layout="wide",
|
| 26 |
initial_sidebar_state="collapsed"
|
| 27 |
)
|
| 28 |
|
| 29 |
+
# Initialize Session State
# Each key is seeded only when absent, so values written during earlier
# reruns of this session are left untouched. Factories (rather than literal
# values) give every session its own fresh list for the history.
for _state_key, _make_default in (
    ('model_trained', lambda: False),
    ('analysis_history', list),
    ('cm_plot_cached', lambda: None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()
|
| 36 |
+
|
| 37 |
+
# Premium Black & Gold CSS Styling
|
| 38 |
st.markdown("""
|
| 39 |
<style>
|
| 40 |
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700;800;900&display=swap');
|
|
|
|
| 61 |
display: none;
|
| 62 |
}
|
| 63 |
|
| 64 |
+
/* Hero Section */
|
| 65 |
.hero-container {
|
| 66 |
background: linear-gradient(135deg, #1a1a1a 0%, #0f0f0f 100%);
|
| 67 |
border-radius: 32px;
|
| 68 |
padding: 4rem 3rem;
|
| 69 |
margin-bottom: 3rem;
|
| 70 |
+
box-shadow: 0 25px 70px rgba(0,0,0,0.6), 0 10px 30px rgba(218,165,32,0.25), inset 0 1px 0 rgba(255,255,255,0.1);
|
|
|
|
|
|
|
|
|
|
| 71 |
position: relative;
|
| 72 |
overflow: hidden;
|
| 73 |
border: 2px solid rgba(218,165,32,0.4);
|
|
|
|
| 112 |
z-index: 1;
|
| 113 |
letter-spacing: -0.03em;
|
| 114 |
filter: drop-shadow(0 4px 20px rgba(255,215,0,0.4));
|
|
|
|
| 115 |
}
|
| 116 |
|
| 117 |
.hero-subtitle {
|
|
|
|
| 143 |
font-size: 1.05rem;
|
| 144 |
font-weight: 700;
|
| 145 |
margin-top: 1.8rem;
|
| 146 |
+
box-shadow: 0 8px 25px rgba(255,215,0,0.5), 0 0 40px rgba(255,215,0,0.3);
|
|
|
|
|
|
|
| 147 |
position: relative;
|
| 148 |
z-index: 1;
|
| 149 |
transition: all 0.3s ease;
|
|
|
|
| 151 |
|
| 152 |
.hero-badge:hover {
|
| 153 |
transform: translateY(-2px);
|
| 154 |
+
box-shadow: 0 12px 35px rgba(255,215,0,0.6), 0 0 50px rgba(255,215,0,0.4);
|
|
|
|
|
|
|
| 155 |
}
|
| 156 |
|
| 157 |
/* Section Headers */
|
|
|
|
| 180 |
border-radius: 2px;
|
| 181 |
}
|
| 182 |
|
| 183 |
+
/* Stats Grid */
|
| 184 |
.stats-grid {
|
| 185 |
display: grid;
|
| 186 |
grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
|
|
|
|
| 194 |
border-radius: 24px;
|
| 195 |
text-align: center;
|
| 196 |
color: #0f0f0f;
|
| 197 |
+
box-shadow: 0 10px 30px rgba(255,215,0,0.35), 0 0 40px rgba(255,215,0,0.2), inset 0 1px 0 rgba(255,255,255,0.3);
|
|
|
|
|
|
|
|
|
|
| 198 |
transition: all 0.4s cubic-bezier(0.4, 0, 0.2, 1);
|
| 199 |
position: relative;
|
| 200 |
overflow: hidden;
|
|
|
|
| 219 |
|
| 220 |
.stat-card:hover {
|
| 221 |
transform: translateY(-10px) scale(1.03);
|
| 222 |
+
box-shadow: 0 20px 50px rgba(255,215,0,0.5), 0 0 60px rgba(255,215,0,0.3), inset 0 1px 0 rgba(255,255,255,0.4);
|
|
|
|
|
|
|
|
|
|
| 223 |
}
|
| 224 |
|
| 225 |
.stat-value {
|
|
|
|
| 243 |
color: #0f0f0f;
|
| 244 |
}
|
| 245 |
|
| 246 |
+
/* Input Areas */
|
| 247 |
.stTextArea textarea {
|
| 248 |
border-radius: 18px;
|
| 249 |
border: 2px solid rgba(218,165,32,0.35);
|
|
|
|
| 261 |
background: rgba(26,26,26,0.95) !important;
|
| 262 |
}
|
| 263 |
|
| 264 |
+
/* Buttons */
|
| 265 |
.stButton > button {
|
| 266 |
background: linear-gradient(135deg, #FFD700 0%, #FFA500 100%);
|
| 267 |
color: #0f0f0f;
|
|
|
|
| 271 |
font-size: 1.15rem;
|
| 272 |
font-weight: 700;
|
| 273 |
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
|
| 274 |
+
box-shadow: 0 4px 15px rgba(255,215,0,0.4), 0 0 30px rgba(255,215,0,0.2);
|
|
|
|
|
|
|
| 275 |
width: 100%;
|
| 276 |
letter-spacing: 0.5px;
|
| 277 |
position: relative;
|
|
|
|
| 298 |
|
| 299 |
.stButton > button:hover {
|
| 300 |
transform: translateY(-3px);
|
| 301 |
+
box-shadow: 0 8px 25px rgba(255,215,0,0.6), 0 0 50px rgba(255,215,0,0.3);
|
|
|
|
|
|
|
| 302 |
}
|
| 303 |
|
| 304 |
.stButton > button:active {
|
| 305 |
transform: translateY(-1px);
|
| 306 |
}
|
| 307 |
|
| 308 |
+
/* Dynamic Alert Boxes */
|
| 309 |
+
.alert-box {
|
|
|
|
|
|
|
| 310 |
padding: 2rem;
|
| 311 |
border-radius: 20px;
|
| 312 |
font-size: 1.1rem;
|
| 313 |
font-weight: 600;
|
|
|
|
|
|
|
|
|
|
| 314 |
margin: 1.5rem 0;
|
| 315 |
border: 2px solid rgba(255,255,255,0.1);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
color: white;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
}
|
| 318 |
|
| 319 |
.confidence-bar {
|
|
|
|
| 333 |
box-shadow: 0 0 10px rgba(255,255,255,0.5);
|
| 334 |
}
|
| 335 |
|
| 336 |
+
/* Hints Panel */
|
| 337 |
.hints-panel {
|
| 338 |
background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
|
| 339 |
border-radius: 20px;
|
| 340 |
padding: 2rem;
|
| 341 |
border-left: 5px solid #FFD700;
|
| 342 |
+
box-shadow: 0 4px 15px rgba(0,0,0,0.4), inset 0 1px 0 rgba(255,255,255,0.05);
|
|
|
|
|
|
|
| 343 |
backdrop-filter: blur(10px);
|
| 344 |
}
|
| 345 |
|
|
|
|
| 367 |
box-shadow: 0 2px 8px rgba(255,215,0,0.4);
|
| 368 |
}
|
| 369 |
|
| 370 |
+
/* Metric Cards */
|
| 371 |
.metric-container {
|
| 372 |
background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
|
| 373 |
padding: 1.8rem;
|
| 374 |
border-radius: 16px;
|
| 375 |
border-left: 5px solid #FFD700;
|
| 376 |
+
box-shadow: 0 4px 12px rgba(0,0,0,0.4), inset 0 1px 0 rgba(255,255,255,0.05);
|
|
|
|
|
|
|
| 377 |
transition: all 0.3s ease;
|
| 378 |
}
|
| 379 |
|
| 380 |
.metric-container:hover {
|
| 381 |
transform: translateY(-2px);
|
| 382 |
+
box-shadow: 0 6px 18px rgba(0,0,0,0.5), inset 0 1px 0 rgba(255,255,255,0.08);
|
|
|
|
|
|
|
| 383 |
}
|
| 384 |
|
| 385 |
+
/* File Uploader */
|
| 386 |
.stFileUploader {
|
| 387 |
border: 2px dashed rgba(218,165,32,0.45);
|
| 388 |
border-radius: 18px;
|
|
|
|
| 397 |
box-shadow: 0 0 20px rgba(255,215,0,0.15);
|
| 398 |
}
|
| 399 |
|
| 400 |
+
/* Expanders */
|
| 401 |
.streamlit-expanderHeader {
|
| 402 |
background: linear-gradient(135deg, rgba(218,165,32,0.2) 0%, rgba(218,165,32,0.1) 100%) !important;
|
| 403 |
border-radius: 14px !important;
|
|
|
|
| 457 |
color: #e5e7eb !important;
|
| 458 |
}
|
| 459 |
|
| 460 |
+
/* Footer */
|
| 461 |
.footer {
|
| 462 |
background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
|
| 463 |
border-radius: 20px;
|
|
|
|
| 465 |
text-align: center;
|
| 466 |
margin-top: 4rem;
|
| 467 |
color: #9ca3af;
|
| 468 |
+
box-shadow: 0 8px 24px rgba(0,0,0,0.4), inset 0 1px 0 rgba(255,255,255,0.05);
|
|
|
|
|
|
|
| 469 |
border: 2px solid rgba(218,165,32,0.3);
|
| 470 |
}
|
| 471 |
|
|
|
|
| 526 |
except Exception as e:
|
| 527 |
return pd.DataFrame()
|
| 528 |
|
| 529 |
+
def sanitize_input(text):
    """Strip HTML markup from user-supplied text.

    ``<script>`` elements are removed together with everything between the
    opening and closing tag (case-insensitive, across line breaks). Every
    other tag is removed while the text around it is kept.

    Args:
        text: Raw user input as a string.

    Returns:
        The input with script elements and tags removed.
    """
    # Drop whole script elements first so their source code does not survive
    # as plain text once the surrounding tags are stripped.
    text = re.sub(r'<script.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
    # A negated character class also matches newlines, so a tag whose
    # attributes wrap onto the next line (e.g. "<img\nsrc=x onerror=...>")
    # is removed too; the previous '<.*?>' pattern stopped at line breaks.
    text = re.sub(r'<[^>]*>', '', text)
    # NOTE(review): this is tag stripping, not escaping. If the result is
    # later interpolated into HTML, escape it with html.escape() as well.
    return text
|
| 534 |
+
|
| 535 |
+
def validate_email_input(text, min_length=10, max_length=10000):
    """Check that email content has a usable length.

    Both limits are measured on the content with leading and trailing
    whitespace removed, so padding can neither satisfy the minimum nor
    trip the maximum.

    Args:
        text: Email content as a string.
        min_length: Smallest accepted number of characters (default 10).
        max_length: Largest accepted number of characters (default 10000).

    Returns:
        A ``(is_valid, message)`` tuple. ``message`` is an empty string when
        the content is valid, otherwise a user-facing explanation.
    """
    content_length = len(text.strip())
    if content_length < min_length:
        return False, f"Email content too short for analysis (minimum {min_length} characters)"
    if content_length > max_length:
        # ':,' adds the thousands separator, e.g. 10000 -> "10,000".
        return False, f"Email content too long (maximum {max_length:,} characters)"
    return True, ""
|
| 542 |
+
|
| 543 |
+
@st.cache_data
def preprocess_text_cached(text):
    """Return ``preprocess_text(text)``, memoised through ``st.cache_data``."""
    cleaned = preprocess_text(text)
    return cleaned
|
| 547 |
+
|
| 548 |
def preprocess_text(text):
    """Normalise raw email text into lowercase alphabetic tokens.

    URLs, email addresses, dollar amounts and 16-digit card-like numbers are
    replaced with placeholder words *before* non-letters are stripped, so
    their presence is still visible in the cleaned text.

    Args:
        text: Raw email content. Non-string values are converted with
            ``str()``; ``None`` and NaN are treated as empty content.

    Returns:
        The cleaned text: lowercase letters separated by single spaces.
    """
    # Missing values must not be stringified: str(None) / str(nan) would
    # otherwise inject the words "none" / "nan" into the cleaned text.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    # 'http\S+' already covers https URLs, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', ' suspiciousurl ', text)
    text = re.sub(r'\S+@\S+', ' emailaddress ', text)
    # These replacements must run before digits and symbols are stripped below.
    text = re.sub(r'\$\d+', ' moneymention ', text)
    text = re.sub(r'\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}', ' cardnumber ', text)
    # Keep letters and whitespace only, then collapse runs of whitespace.
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
|
| 562 |
|
| 563 |
+
def _contains_term(haystack, term):
    """Return True when *term* occurs in *haystack* starting at a word boundary."""
    # Anchoring only the start keeps inflected forms ("expires", "urgently")
    # while rejecting mid-word hits ("shopping" -> "pin", "know" -> "now").
    # Terms that start with a symbol such as "$" cannot take a \b anchor.
    anchor = r'\b' if term[0].isalnum() else ''
    return re.search(anchor + re.escape(term), haystack) is not None


def calculate_phishing_score(text):
    """Score text for phishing indicators with a keyword/regex heuristic.

    Points are added for risky keywords, financial terms, prize-scam terms,
    an urgency-plus-financial combination, URLs, requests for credentials,
    threatening language and generic greetings. Each keyword counts at most
    once, however often it appears.

    Args:
        text: Email content. Non-string values are converted with ``str()``.

    Returns:
        A float in ``[0, 0.99]``: the raw point total divided by 200 and
        capped at 0.99.
    """
    if not isinstance(text, str):
        text = str(text)
    text_lower = text.lower()

    def count_hits(terms):
        # Number of distinct terms present in the text.
        return sum(1 for term in terms if _contains_term(text_lower, term))

    score = 0

    # High-risk phishing keywords (weight: 15 points each)
    high_risk = ['verify', 'suspended', 'urgent', 'immediately', 'click here', 'act now',
                 'confirm identity', 'account locked', 'unusual activity', 'security alert',
                 'expire', 'limited time', 'action required', 'update payment', 'validate']
    score += 15 * count_hits(high_risk)

    # Financial/security keywords (weight: 12 points each)
    financial = ['bank', 'credit card', 'password', 'ssn', 'social security', 'paypal',
                 'billing', 'payment', 'account number', 'pin', 'cvv', 'credential']
    score += 12 * count_hits(financial)

    # Prize/reward scam indicators (weight: 18 points each)
    prize_scam = ['won', 'winner', 'prize', 'claim now', 'congratulations', 'free money',
                  'inheritance', 'lottery', 'jackpot', 'cash prize', '$1000', '$10000']
    score += 18 * count_hits(prize_scam)

    # Urgency + financial combo (weight: 25 points)
    urgency_terms = ['urgent', 'immediately', 'now', 'expire']
    money_terms = ['account', 'bank', 'payment', 'card']
    if any(_contains_term(text_lower, term) for term in urgency_terms) and \
       any(_contains_term(text_lower, term) for term in money_terms):
        score += 25

    # URLs (weight: 20 points each, capped at 40 for multiple URLs).
    # A single scan suffices: zero matches simply adds zero.
    url_count = len(re.findall(r'http\S+|www\S+', text, re.IGNORECASE))
    score += min(url_count * 20, 40)

    # Request for credentials/info (weight: 20 points)
    if re.search(r'\b(enter|provide|submit|update|confirm).{0,20}(password|credential|info|detail)', text_lower):
        score += 20

    # Threatening language (weight: 15 points each)
    threats = ['locked', 'suspended', 'terminated', 'closed', 'blocked', 'restricted']
    score += 15 * count_hits(threats)

    # Generic, impersonal greeting (weight: 8 points)
    if re.search(r'\b(dear customer|dear user|dear member|dear valued)\b', text_lower):
        score += 8

    # Convert to probability (0-1 scale). 200 is a normalisation constant,
    # not the true maximum: higher totals saturate at the 0.99 cap.
    max_score = 200
    probability = min(score / max_score, 0.99)

    return probability
|
| 611 |
+
|
| 612 |
+
@st.cache_data
def generate_confusion_matrix_plot(_cm):
    """Render a confusion matrix as a dark-themed heatmap PNG.

    Args:
        _cm: Confusion matrix of integer counts (annotated with ``fmt="d"``).

    Returns:
        An ``io.BytesIO`` holding the PNG image, positioned at offset 0.
    """
    # NOTE(review): per the Streamlit caching docs, parameters whose names
    # start with an underscore are not hashed, so `_cm` would not be part of
    # the cache key and a changed matrix could return a stale image. Confirm
    # that this is intended before relying on the cache across datasets.

    # Scope the dark style to this figure instead of switching the global
    # matplotlib style for every later plot in the process.
    with plt.style.context('dark_background'):
        fig, ax = plt.subplots(figsize=(5, 4), facecolor='#1a1a1a')
        try:
            ax.set_facecolor('#1a1a1a')

            sns.heatmap(
                _cm,
                annot=True,
                fmt="d",
                ax=ax,
                cmap="YlOrBr",
                cbar=True,
                square=True,
                annot_kws={"size": 16, "weight": "bold", "color": "#0f0f0f"},
                linewidths=2,
                linecolor='#0f0f0f',
                cbar_kws={'label': 'Count', 'shrink': 0.8}
            )

            ax.set_xlabel("Predicted", fontsize=11, fontweight='bold', color='#FFD700')
            ax.set_ylabel("Actual", fontsize=11, fontweight='bold', color='#FFD700')
            # The two class names only fit a 2x2 matrix; on any other shape
            # matplotlib rejects a label count that differs from the tick
            # count, so the heatmap's default tick labels are kept instead.
            if np.shape(_cm) == (2, 2):
                ax.set_xticklabels(["Safe", "Phishing"], fontsize=10, color='#e5e7eb')
                ax.set_yticklabels(["Safe", "Phishing"], fontsize=10, rotation=0, color='#e5e7eb')
            ax.set_title("Confusion Matrix", fontsize=13, fontweight='bold', pad=12, color='#FFD700')

            # Style the colorbar
            cbar = ax.collections[0].colorbar
            cbar.ax.yaxis.set_tick_params(color='#e5e7eb')
            plt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color='#e5e7eb')

            fig.tight_layout()
            buf = io.BytesIO()
            # Save this figure explicitly rather than whichever figure
            # pyplot currently considers "current".
            fig.savefig(buf, format='png', facecolor='#1a1a1a', dpi=100)
        finally:
            # Always release the figure, even when plotting fails.
            plt.close(fig)

    buf.seek(0)
    return buf
|
| 651 |
+
|
| 652 |
# Hero Header
|
| 653 |
st.markdown("""
|
| 654 |
<div class="hero-container">
|
| 655 |
<div class="hero-title">π‘οΈ AI Phishing Shield</div>
|
| 656 |
<div class="hero-subtitle">Advanced Machine Learning Protection Against Email Threats</div>
|
| 657 |
<div class="hero-description">
|
| 658 |
+
Powered by TF-IDF vectorization and Logistic Regression, trained on thousands of real-world phishing examples.
|
| 659 |
Get instant threat analysis with confidence scoring and explainable AI insights.
|
| 660 |
</div>
|
| 661 |
<div class="hero-badge">β‘ Developed by Umaima Qureshi</div>
|
|
|
|
| 696 |
]
|
| 697 |
})
|
| 698 |
|
| 699 |
+
# Validate dataset: at least two columns (email text and label) and at least
# one row are required before any cleaning or training is attempted.
required_columns = 2
if len(df.columns) < required_columns or len(df) == 0:
    # The leading emoji was mojibake ("β οΈ") in this copy; restored to the
    # warning sign it encodes.
    st.error("⚠️ Invalid dataset format. Please ensure your CSV has email text and labels.")
    # Halt this script run so nothing below executes on an unusable dataset.
    st.stop()
|
| 704 |
+
|
| 705 |
# Clean & Prepare Dataset
|
| 706 |
if "Unnamed: 0" in df.columns:
|
| 707 |
df = df.drop(columns=["Unnamed: 0"])
|
|
|
|
| 756 |
# Model Training
|
| 757 |
@st.cache_resource
|
| 758 |
def train_model(processed_texts, labels, test_size=0.2, random_state=42):
|
| 759 |
+
"""Enhanced model training with better parameters"""
|
| 760 |
unique_labels, counts = np.unique(labels, return_counts=True)
|
| 761 |
min_samples = counts.min()
|
| 762 |
|
|
|
|
| 779 |
processed_texts, labels, test_size=test_size, random_state=random_state, stratify=None
|
| 780 |
)
|
| 781 |
|
| 782 |
+
# Enhanced TF-IDF with better parameters for phishing detection
|
| 783 |
+
vectorizer = TfidfVectorizer(
|
| 784 |
+
max_features=5000,
|
| 785 |
+
ngram_range=(1,3), # Include trigrams for better context
|
| 786 |
+
min_df=1,
|
| 787 |
+
max_df=0.95,
|
| 788 |
+
sublinear_tf=True
|
| 789 |
+
)
|
| 790 |
X_train_vec = vectorizer.fit_transform(X_train)
|
| 791 |
X_test_vec = vectorizer.transform(X_test)
|
| 792 |
+
|
| 793 |
+
# Use balanced class weights for better phishing detection
|
| 794 |
+
model = LogisticRegression(
|
| 795 |
+
max_iter=2000,
|
| 796 |
+
solver='liblinear',
|
| 797 |
+
class_weight='balanced', # Handle imbalanced data better
|
| 798 |
+
C=1.0
|
| 799 |
+
)
|
| 800 |
model.fit(X_train_vec, y_train)
|
| 801 |
+
|
| 802 |
y_pred = model.predict(X_test_vec)
|
| 803 |
acc = accuracy_score(y_test, y_pred)
|
| 804 |
cm = confusion_matrix(y_test, y_pred)
|
| 805 |
report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
|
| 806 |
+
|
| 807 |
return {
|
| 808 |
"vectorizer": vectorizer,
|
| 809 |
"model": model,
|
|
|
|
| 812 |
"report": report
|
| 813 |
}
|
| 814 |
|
| 815 |
+
# Train model with session state to prevent re-training
|
| 816 |
+
if not st.session_state.model_trained:
|
| 817 |
+
model_info = train_model(df['processed_text'].tolist(), df['label'].values)
|
| 818 |
+
st.session_state.model_info = model_info
|
| 819 |
+
st.session_state.model_trained = True
|
| 820 |
+
else:
|
| 821 |
+
model_info = st.session_state.model_info
|
| 822 |
+
|
| 823 |
+
vectorizer = model_info["vectorizer"]
|
| 824 |
+
model = model_info["model"]
|
| 825 |
+
accuracy = model_info["accuracy"]
|
| 826 |
|
| 827 |
# Model Performance
|
| 828 |
st.markdown('<div class="section-title">🎯 Model Performance</div>', unsafe_allow_html=True)
|
|
|
|
| 855 |
</div>
|
| 856 |
""", unsafe_allow_html=True)
|
| 857 |
|
| 858 |
+
# Confusion Matrix Section
|
| 859 |
with st.expander("π Detailed Metrics & Confusion Matrix"):
|
| 860 |
col_matrix, col_report = st.columns([1, 1.5])
|
| 861 |
|
| 862 |
with col_matrix:
|
| 863 |
+
# Generate confusion matrix plot once
|
| 864 |
+
if st.session_state.cm_plot_cached is None:
|
| 865 |
+
st.session_state.cm_plot_cached = generate_confusion_matrix_plot(model_info["confusion_matrix"])
|
| 866 |
+
|
| 867 |
+
st.image(st.session_state.cm_plot_cached, use_column_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 868 |
|
| 869 |
with col_report:
|
| 870 |
st.markdown("**π Classification Report:**")
|
| 871 |
report_df = pd.DataFrame(model_info["report"]).transpose().round(3)
|
| 872 |
st.dataframe(
|
| 873 |
+
report_df,
|
| 874 |
+
use_container_width=True,
|
| 875 |
height=250
|
| 876 |
)
|
| 877 |
|
|
|
|
| 895 |
email_input = uploaded_txt.read().decode("utf-8", errors="ignore")
|
| 896 |
except Exception:
|
| 897 |
email_input = str(uploaded_txt.getvalue())
|
| 898 |
+
|
| 899 |
if st.button("π Analyze Email Threat"):
|
| 900 |
if not email_input.strip():
|
| 901 |
st.warning("⚠️ Please paste or upload email content to analyze")
|
| 902 |
else:
|
| 903 |
+
# Sanitize input
|
| 904 |
+
email_input = sanitize_input(email_input)
|
| 905 |
+
|
| 906 |
+
# Validate input
|
| 907 |
+
is_valid, error_msg = validate_email_input(email_input)
|
| 908 |
+
if not is_valid:
|
| 909 |
+
st.warning(f"⚠️ {error_msg}")
|
| 910 |
+
else:
|
| 911 |
+
with st.spinner("π Analyzing email threat..."):
|
| 912 |
try:
|
| 913 |
+
# ML Model prediction
|
| 914 |
+
processed_input = preprocess_text_cached(email_input)
|
| 915 |
+
input_vec = vectorizer.transform([processed_input])
|
| 916 |
+
|
| 917 |
+
try:
|
| 918 |
+
ml_proba = model.predict_proba(input_vec)[0][1]
|
| 919 |
+
except AttributeError:
|
| 920 |
+
decision = model.decision_function(input_vec)[0]
|
| 921 |
+
ml_proba = 1 / (1 + np.exp(-decision))
|
| 922 |
+
|
| 923 |
+
ml_pred = model.predict(input_vec)[0]
|
| 924 |
+
|
| 925 |
+
# Rule-based scoring
|
| 926 |
+
rule_score = calculate_phishing_score(email_input)
|
| 927 |
+
|
| 928 |
+
# Hybrid approach: weighted combination
|
| 929 |
+
# 60% ML model + 40% rule-based (adjustable)
|
| 930 |
+
hybrid_proba = (0.6 * ml_proba) + (0.4 * rule_score)
|
| 931 |
+
|
| 932 |
+
# Final prediction based on hybrid score
|
| 933 |
+
final_pred = 1 if hybrid_proba > 0.5 else 0
|
| 934 |
+
|
| 935 |
+
# Dynamic color based on confidence
|
| 936 |
+
if hybrid_proba >= 0.8:
|
| 937 |
+
alert_color = "#dc2626" # Deep red - Critical
|
| 938 |
+
alert_gradient = "linear-gradient(135deg, #dc2626 0%, #991b1b 100%)"
|
| 939 |
+
shadow_color = "220, 38, 38"
|
| 940 |
+
emoji = "🚨"
|
| 941 |
+
risk_level = "CRITICAL THREAT"
|
| 942 |
+
elif hybrid_proba >= 0.6:
|
| 943 |
+
alert_color = "#ef4444" # Red - High risk
|
| 944 |
+
alert_gradient = "linear-gradient(135deg, #ef4444 0%, #dc2626 100%)"
|
| 945 |
+
shadow_color = "239, 68, 68"
|
| 946 |
+
emoji = "⚠️"
|
| 947 |
+
risk_level = "HIGH RISK"
|
| 948 |
+
elif hybrid_proba >= 0.4:
|
| 949 |
+
alert_color = "#f97316" # Orange - Medium risk
|
| 950 |
+
alert_gradient = "linear-gradient(135deg, #f97316 0%, #ea580c 100%)"
|
| 951 |
+
shadow_color = "249, 115, 22"
|
| 952 |
+
emoji = "⚡"
|
| 953 |
+
risk_level = "MEDIUM RISK"
|
| 954 |
+
elif hybrid_proba >= 0.2:
|
| 955 |
+
alert_color = "#eab308" # Yellow - Low risk
|
| 956 |
+
alert_gradient = "linear-gradient(135deg, #eab308 0%, #ca8a04 100%)"
|
| 957 |
+
shadow_color = "234, 179, 8"
|
| 958 |
+
emoji = "⚠️"
|
| 959 |
+
risk_level = "LOW RISK"
|
| 960 |
+
else:
|
| 961 |
+
alert_color = "#10b981" # Green - Safe
|
| 962 |
+
alert_gradient = "linear-gradient(135deg, #10b981 0%, #059669 100%)"
|
| 963 |
+
shadow_color = "16, 185, 129"
|
| 964 |
+
emoji = "✅"
|
| 965 |
+
risk_level = "SAFE"
|
| 966 |
+
|
| 967 |
+
if final_pred == 1:
|
| 968 |
+
conf_pct = f"{hybrid_proba:.1%}"
|
| 969 |
+
st.markdown(f"""
|
| 970 |
+
<div class="alert-box" style="background: {alert_gradient}; box-shadow: 0 10px 30px rgba({shadow_color}, 0.4), 0 0 50px rgba({shadow_color}, 0.2);">
|
| 971 |
+
<div style="display: flex; align-items: center; gap: 1rem; margin-bottom: 0.75rem;">
|
| 972 |
+
<div style="font-size: 2.5rem;">{emoji}</div>
|
| 973 |
+
<div>
|
| 974 |
+
<div style="font-size: 1.5rem; font-weight: 800; letter-spacing: 0.5px;">{risk_level} DETECTED</div>
|
| 975 |
+
<div style="font-size: 1.05rem; opacity: 0.95; margin-top: 0.25rem;">Threat Confidence: {conf_pct}</div>
|
| 976 |
+
<div style="font-size: 0.9rem; opacity: 0.85; margin-top: 0.25rem;">ML Score: {ml_proba:.1%} | Rule Score: {rule_score:.1%}</div>
|
| 977 |
+
</div>
|
| 978 |
+
</div>
|
| 979 |
+
<div class="confidence-bar">
|
| 980 |
+
<div class="confidence-fill" style="width: {hybrid_proba*100}%;"></div>
|
| 981 |
+
</div>
|
| 982 |
</div>
|
| 983 |
+
""", unsafe_allow_html=True)
|
| 984 |
+
|
| 985 |
+
st.markdown("**π Threat Indicators Detected:**")
|
| 986 |
+
indicators = []
|
| 987 |
+
if "suspiciousurl" in processed_input or re.search(r'http\S+|www\S+', email_input, re.IGNORECASE):
|
| 988 |
+
indicators.append("π Suspicious URL tokens detected")
|
| 989 |
+
if re.search(r'\b(urgent|immediately|verify|password|suspended|click|act now|action required)\b', email_input, re.IGNORECASE):
|
| 990 |
+
indicators.append("⚡ Urgency manipulation tactics")
|
| 991 |
+
if re.search(r'\b(bank|account|verify|login|password|security|credential|paypal)\b', email_input, re.IGNORECASE):
|
| 992 |
+
indicators.append("🏦 Financial/security keywords present")
|
| 993 |
+
if re.search(r'\b(winner|prize|congratulations|claim|free|won)\b', email_input, re.IGNORECASE):
|
| 994 |
+
indicators.append("π Reward/prize baiting language")
|
| 995 |
+
if re.search(r'\b(confirm|update|validate|unlock|restore)\b', email_input, re.IGNORECASE):
|
| 996 |
+
indicators.append("π Account action requests")
|
| 997 |
+
if "cardnumber" in processed_input:
|
| 998 |
+
indicators.append("💳 Credit card pattern detected")
|
| 999 |
+
if "moneymention" in processed_input:
|
| 1000 |
+
indicators.append("💰 Money amount mentioned")
|
| 1001 |
+
|
| 1002 |
+
for indicator in indicators:
|
| 1003 |
+
st.markdown(f"- {indicator}")
|
| 1004 |
+
|
| 1005 |
+
if not indicators:
|
| 1006 |
+
st.markdown("- ⚠️ Content pattern matches known phishing templates")
|
| 1007 |
+
|
| 1008 |
+
st.error("🚨 **Recommendation:** Do NOT click any links. Delete this email immediately and report to your IT security team.")
|
| 1009 |
+
|
| 1010 |
+
# Download analysis report
|
| 1011 |
+
result_data = {
|
| 1012 |
+
'timestamp': pd.Timestamp.now(),
|
| 1013 |
+
'prediction': 'Phishing',
|
| 1014 |
+
'confidence': f"{hybrid_proba:.2%}",
|
| 1015 |
+
'ml_score': f"{ml_proba:.2%}",
|
| 1016 |
+
'rule_score': f"{rule_score:.2%}",
|
| 1017 |
+
'risk_level': risk_level,
|
| 1018 |
+
'email_preview': email_input[:100] + "..."
|
| 1019 |
+
}
|
| 1020 |
+
result_df = pd.DataFrame([result_data])
|
| 1021 |
+
csv = result_df.to_csv(index=False)
|
| 1022 |
+
|
| 1023 |
+
st.download_button(
|
| 1024 |
+
label="📥 Download Analysis Report",
|
| 1025 |
+
data=csv,
|
| 1026 |
+
file_name=f"phishing_analysis_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}.csv",
|
| 1027 |
+
mime="text/csv"
|
| 1028 |
+
)
|
| 1029 |
|
| 1030 |
+
else:
|
| 1031 |
+
conf_pct = f"{(1-hybrid_proba):.1%}"
|
| 1032 |
+
st.markdown(f"""
|
| 1033 |
+
<div class="alert-box" style="background: {alert_gradient}; box-shadow: 0 10px 30px rgba({shadow_color}, 0.4), 0 0 50px rgba({shadow_color}, 0.2);">
|
| 1034 |
+
<div style="display: flex; align-items: center; gap: 1rem; margin-bottom: 0.75rem;">
|
| 1035 |
+
<div style="font-size: 2.5rem;">{emoji}</div>
|
| 1036 |
+
<div>
|
| 1037 |
+
<div style="font-size: 1.5rem; font-weight: 800; letter-spacing: 0.5px;">EMAIL APPEARS SAFE</div>
|
| 1038 |
+
<div style="font-size: 1.05rem; opacity: 0.95; margin-top: 0.25rem;">Safety Confidence: {conf_pct}</div>
|
| 1039 |
+
<div style="font-size: 0.9rem; opacity: 0.85; margin-top: 0.25rem;">ML Score: {(1-ml_proba):.1%} | Rule Score: {(1-rule_score):.1%}</div>
|
| 1040 |
+
</div>
|
| 1041 |
+
</div>
|
| 1042 |
+
<div class="confidence-bar">
|
| 1043 |
+
<div class="confidence-fill" style="width: {(1-hybrid_proba)*100}%;"></div>
|
| 1044 |
+
</div>
|
| 1045 |
</div>
|
| 1046 |
+
""", unsafe_allow_html=True)
|
| 1047 |
+
|
| 1048 |
+
st.markdown("**β No obvious threat indicators found in content analysis**")
|
| 1049 |
+
st.info("💡 **Best Practice:** Always verify sender identity through known contact methods and be cautious with unexpected emails, even if they appear safe.")
|
| 1050 |
+
|
| 1051 |
+
# Add to history
|
| 1052 |
+
st.session_state.analysis_history.append({
|
| 1053 |
+
'timestamp': pd.Timestamp.now(),
|
| 1054 |
+
'result': 'Phishing' if final_pred == 1 else 'Safe',
|
| 1055 |
+
'confidence': f"{hybrid_proba:.2%}",
|
| 1056 |
+
'preview': email_input[:50] + "..."
|
| 1057 |
+
})
|
| 1058 |
+
|
| 1059 |
+
except Exception as e:
|
| 1060 |
+
st.error(f"⚠️ Analysis failed: {str(e)}")
|
| 1061 |
|
| 1062 |
with col_hints:
|
| 1063 |
st.markdown("""
|
|
|
|
| 1089 |
<div><strong>Prize/reward</strong> language is a common phishing tactic</div>
|
| 1090 |
</div>
|
| 1091 |
|
| 1092 |
+
<div class="hint-item">
|
| 1093 |
+
<div class="hint-icon">⚡</div>
|
| 1094 |
+
<div><strong>Hybrid Detection:</strong> Combines ML model (60%) with rule-based scoring (40%)</div>
|
| 1095 |
+
</div>
|
| 1096 |
+
|
| 1097 |
<div class="hint-item">
|
| 1098 |
<div class="hint-icon">⚠️</div>
|
| 1099 |
<div><strong>Limitations:</strong> This tool analyzes text content only. Always verify sender identity separately.</div>
|
|
|
|
| 1101 |
</div>
|
| 1102 |
""", unsafe_allow_html=True)
|
| 1103 |
|
| 1104 |
+
# Recent Analyses History
|
| 1105 |
+
if len(st.session_state.analysis_history) > 0:
|
| 1106 |
+
st.markdown('<div class="section-title">π Recent Analyses</div>', unsafe_allow_html=True)
|
| 1107 |
+
with st.expander("View Recent Analysis History", expanded=False):
|
| 1108 |
+
hist_df = pd.DataFrame(st.session_state.analysis_history[-10:]) # Show last 10
|
| 1109 |
+
hist_df = hist_df.iloc[::-1] # Reverse to show most recent first
|
| 1110 |
+
st.dataframe(hist_df, use_container_width=True, height=300)
|
| 1111 |
+
|
| 1112 |
+
if st.button("🗑️ Clear History"):
|
| 1113 |
+
st.session_state.analysis_history = []
|
| 1114 |
+
st.rerun()
|
| 1115 |
+
|
| 1116 |
# Additional Tips Section
|
| 1117 |
st.markdown('<div class="section-title">💡 Phishing Protection Tips</div>', unsafe_allow_html=True)
|
| 1118 |
|
|
|
|
| 1120 |
|
| 1121 |
with col_tip1:
|
| 1122 |
st.markdown("""
|
| 1123 |
+
<div style="background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
|
| 1124 |
+
padding: 1.5rem; border-radius: 16px; border-left: 4px solid #FFD700;
|
| 1125 |
box-shadow: 0 4px 15px rgba(0,0,0,0.3); height: 100%;">
|
| 1126 |
<div style="font-size: 2rem; margin-bottom: 0.75rem;">π</div>
|
| 1127 |
<div style="font-weight: 700; font-size: 1.1rem; color: #FFD700; margin-bottom: 0.75rem;">Verify Sender</div>
|
|
|
|
| 1133 |
|
| 1134 |
with col_tip2:
|
| 1135 |
st.markdown("""
|
| 1136 |
+
<div style="background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
|
| 1137 |
+
padding: 1.5rem; border-radius: 16px; border-left: 4px solid #FFD700;
|
| 1138 |
box-shadow: 0 4px 15px rgba(0,0,0,0.3); height: 100%;">
|
| 1139 |
<div style="font-size: 2rem; margin-bottom: 0.75rem;">π</div>
|
| 1140 |
<div style="font-weight: 700; font-size: 1.1rem; color: #FFD700; margin-bottom: 0.75rem;">Hover Links</div>
|
|
|
|
| 1146 |
|
| 1147 |
with col_tip3:
|
| 1148 |
st.markdown("""
|
| 1149 |
+
<div style="background: linear-gradient(135deg, rgba(26,26,26,0.95) 0%, rgba(15,15,15,0.95) 100%);
|
| 1150 |
+
padding: 1.5rem; border-radius: 16px; border-left: 4px solid #FFD700;
|
| 1151 |
box-shadow: 0 4px 15px rgba(0,0,0,0.3); height: 100%;">
|
| 1152 |
<div style="font-size: 2rem; margin-bottom: 0.75rem;">π</div>
|
| 1153 |
<div style="font-weight: 700; font-size: 1.1rem; color: #FFD700; margin-bottom: 0.75rem;">Contact Directly</div>
|
|
|
|
| 1168 |
For production use: Implement additional verification layers, link scanning, attachment analysis, and human oversight
|
| 1169 |
</div>
|
| 1170 |
<div style="margin-top: 1.5rem; padding-top: 1.5rem; border-top: 1px solid rgba(218,165,32,0.2); font-size: 0.9rem; color: #6b7280;">
|
| 1171 |
+
Powered by TF-IDF • Logistic Regression • Hybrid Detection • Scikit-learn • Streamlit
|
| 1172 |
</div>
|
| 1173 |
<div style="margin-top: 1rem; font-size: 0.85rem; color: #6b7280;">
|
| 1174 |
© 2024 AI Phishing Shield | All Rights Reserved
|